diff mbox

[v13,1/4] iommu/arm-smmu: Add pm_runtime/sleep ops

Message ID 20180719101539.6104-2-vivek.gautam@codeaurora.org (mailing list archive)
State Superseded, archived
Headers show

Commit Message

Vivek Gautam July 19, 2018, 10:15 a.m. UTC
From: Sricharan R <sricharan@codeaurora.org>

The smmu needs to be functional only when the respective
master's using it are active. The device_link feature
helps to track such functional dependencies, so that the
iommu gets powered when the master device enables itself
using pm_runtime. So by adapting the smmu driver for
runtime pm, above said dependency can be addressed.

This patch adds the pm runtime/sleep callbacks to the
driver and also the functions to parse the smmu clocks
from DT and enable them in resume/suspend.

Also, while we enable the runtime pm add a pm sleep suspend
callback that pushes devices to low power state by turning
the clocks off in a system sleep.
Also add corresponding clock enable path in resume callback.

Signed-off-by: Sricharan R <sricharan@codeaurora.org>
Signed-off-by: Archit Taneja <architt@codeaurora.org>
[vivek: rework for clock and pm ops]
Signed-off-by: Vivek Gautam <vivek.gautam@codeaurora.org>
Reviewed-by: Tomasz Figa <tfiga@chromium.org>
---

Changes since v12:
 - Added pm sleep .suspend callback. This disables the clocks.
 - Added corresponding change to enable clocks in .resume
  pm sleep callback.

 drivers/iommu/arm-smmu.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 73 insertions(+), 2 deletions(-)

Comments

Robin Murphy July 24, 2018, 3:21 p.m. UTC | #1
On 19/07/18 11:15, Vivek Gautam wrote:
> From: Sricharan R <sricharan@codeaurora.org>
> 
> The smmu needs to be functional only when the respective
> master's using it are active. The device_link feature
> helps to track such functional dependencies, so that the
> iommu gets powered when the master device enables itself
> using pm_runtime. So by adapting the smmu driver for
> runtime pm, above said dependency can be addressed.
> 
> This patch adds the pm runtime/sleep callbacks to the
> driver and also the functions to parse the smmu clocks
> from DT and enable them in resume/suspend.
> 
> Also, while we enable the runtime pm add a pm sleep suspend
> callback that pushes devices to low power state by turning
> the clocks off in a system sleep.
> Also add corresponding clock enable path in resume callback.
> 
> Signed-off-by: Sricharan R <sricharan@codeaurora.org>
> Signed-off-by: Archit Taneja <architt@codeaurora.org>
> [vivek: rework for clock and pm ops]
> Signed-off-by: Vivek Gautam <vivek.gautam@codeaurora.org>
> Reviewed-by: Tomasz Figa <tfiga@chromium.org>
> ---
> 
> Changes since v12:
>   - Added pm sleep .suspend callback. This disables the clocks.
>   - Added corresponding change to enable clocks in .resume
>    pm sleep callback.
> 
>   drivers/iommu/arm-smmu.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++--
>   1 file changed, 73 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
> index c73cfce1ccc0..9138a6fffe04 100644
> --- a/drivers/iommu/arm-smmu.c
> +++ b/drivers/iommu/arm-smmu.c
> @@ -48,6 +48,7 @@
>   #include <linux/of_iommu.h>
>   #include <linux/pci.h>
>   #include <linux/platform_device.h>
> +#include <linux/pm_runtime.h>
>   #include <linux/slab.h>
>   #include <linux/spinlock.h>
>   
> @@ -205,6 +206,8 @@ struct arm_smmu_device {
>   	u32				num_global_irqs;
>   	u32				num_context_irqs;
>   	unsigned int			*irqs;
> +	struct clk_bulk_data		*clks;
> +	int				num_clks;
>   
>   	u32				cavium_id_base; /* Specific to Cavium */
>   
> @@ -1897,10 +1900,12 @@ static int arm_smmu_device_cfg_probe(struct arm_smmu_device *smmu)
>   struct arm_smmu_match_data {
>   	enum arm_smmu_arch_version version;
>   	enum arm_smmu_implementation model;
> +	const char * const *clks;
> +	int num_clks;
>   };
>   
>   #define ARM_SMMU_MATCH_DATA(name, ver, imp)	\
> -static struct arm_smmu_match_data name = { .version = ver, .model = imp }
> +static const struct arm_smmu_match_data name = { .version = ver, .model = imp }
>   
>   ARM_SMMU_MATCH_DATA(smmu_generic_v1, ARM_SMMU_V1, GENERIC_SMMU);
>   ARM_SMMU_MATCH_DATA(smmu_generic_v2, ARM_SMMU_V2, GENERIC_SMMU);
> @@ -1919,6 +1924,23 @@ static const struct of_device_id arm_smmu_of_match[] = {
>   };
>   MODULE_DEVICE_TABLE(of, arm_smmu_of_match);
>   
> +static void arm_smmu_fill_clk_data(struct arm_smmu_device *smmu,
> +				   const char * const *clks)
> +{
> +	int i;
> +
> +	if (smmu->num_clks < 1)
> +		return;
> +
> +	smmu->clks = devm_kcalloc(smmu->dev, smmu->num_clks,
> +				  sizeof(*smmu->clks), GFP_KERNEL);
> +	if (!smmu->clks)
> +		return;
> +
> +	for (i = 0; i < smmu->num_clks; i++)
> +		smmu->clks[i].id = clks[i];
> +}
> +
>   #ifdef CONFIG_ACPI
>   static int acpi_smmu_get_data(u32 model, struct arm_smmu_device *smmu)
>   {
> @@ -2001,6 +2023,9 @@ static int arm_smmu_device_dt_probe(struct platform_device *pdev,
>   	data = of_device_get_match_data(dev);
>   	smmu->version = data->version;
>   	smmu->model = data->model;
> +	smmu->num_clks = data->num_clks;
> +
> +	arm_smmu_fill_clk_data(smmu, data->clks);
>   
>   	parse_driver_options(smmu);
>   
> @@ -2099,6 +2124,14 @@ static int arm_smmu_device_probe(struct platform_device *pdev)
>   		smmu->irqs[i] = irq;
>   	}
>   
> +	err = devm_clk_bulk_get(smmu->dev, smmu->num_clks, smmu->clks);
> +	if (err)
> +		return err;
> +
> +	err = clk_bulk_prepare(smmu->num_clks, smmu->clks);
> +	if (err)
> +		return err;
> +
>   	err = arm_smmu_device_cfg_probe(smmu);
>   	if (err)
>   		return err;
> @@ -2181,6 +2214,9 @@ static int arm_smmu_device_remove(struct platform_device *pdev)
>   
>   	/* Turn the thing off */
>   	writel(sCR0_CLIENTPD, ARM_SMMU_GR0_NS(smmu) + ARM_SMMU_GR0_sCR0);
> +
> +	clk_bulk_unprepare(smmu->num_clks, smmu->clks);
> +
>   	return 0;
>   }
>   
> @@ -2189,15 +2225,50 @@ static void arm_smmu_device_shutdown(struct platform_device *pdev)
>   	arm_smmu_device_remove(pdev);
>   }
>   
> +static int __maybe_unused arm_smmu_runtime_resume(struct device *dev)
> +{
> +	struct arm_smmu_device *smmu = dev_get_drvdata(dev);
> +
> +	return clk_bulk_enable(smmu->num_clks, smmu->clks);

If there's a power domain being automatically switched by genpd then we 
need a reset here because we may have lost state entirely. Since I 
remembered the otherwise-useless GPU SMMU on Juno is in a separate power 
domain, I gave it a poking via sysfs with some debug stuff to dump sCR0 
in these callbacks, and the problem is clear:

...
[    4.625551] arm-smmu 2b400000.iommu: genpd_runtime_suspend()
[    4.631163] arm-smmu 2b400000.iommu: arm_smmu_runtime_suspend: 0x00201936
[    4.637897] arm-smmu 2b400000.iommu: suspend latency exceeded, 6733980 ns
[   21.566983] arm-smmu 2b400000.iommu: genpd_runtime_resume()
[   21.584796] arm-smmu 2b400000.iommu: arm_smmu_runtime_resume: 0x00220101
[   21.591452] arm-smmu 2b400000.iommu: resume latency exceeded, 6658020 ns
...

> +}
> +
> +static int __maybe_unused arm_smmu_runtime_suspend(struct device *dev)
> +{
> +	struct arm_smmu_device *smmu = dev_get_drvdata(dev);
> +
> +	clk_bulk_disable(smmu->num_clks, smmu->clks);
> +
> +	return 0;
> +}
> +
>   static int __maybe_unused arm_smmu_pm_resume(struct device *dev)
>   {
>   	struct arm_smmu_device *smmu = dev_get_drvdata(dev);
> +	int ret;
> +
> +	if (!pm_runtime_suspended(dev)) {
> +		ret = arm_smmu_runtime_resume(dev);
> +		if (ret)
> +			return ret;
> +	}
>   
>   	arm_smmu_device_reset(smmu);

This looks a bit off too - if we wake from sleep when the SMMU was also 
runtime-suspended, it appears we might end up trying to restore the 
register state without clocks enabled. Surely we need to always enable 
clocks for the reset, then restore the previous suspended state? 
Although given my previous point, it's probably not worth doing anything 
at all here for that case.

Robin.

>   	return 0;
>   }
>   
> -static SIMPLE_DEV_PM_OPS(arm_smmu_pm_ops, NULL, arm_smmu_pm_resume);
> +static int __maybe_unused arm_smmu_pm_suspend(struct device *dev)
> +{
> +	if (!pm_runtime_suspended(dev))
> +		return arm_smmu_runtime_suspend(dev);
> +
> +	return 0;
> +}
> +
> +static const struct dev_pm_ops arm_smmu_pm_ops = {
> +	SET_SYSTEM_SLEEP_PM_OPS(arm_smmu_pm_suspend, arm_smmu_pm_resume)
> +	SET_RUNTIME_PM_OPS(arm_smmu_runtime_suspend,
> +			   arm_smmu_runtime_resume, NULL)
> +};
>   
>   static struct platform_driver arm_smmu_driver = {
>   	.driver	= {
>
Tomasz Figa July 25, 2018, 6:28 a.m. UTC | #2
On Wed, Jul 25, 2018 at 12:21 AM Robin Murphy <robin.murphy@arm.com> wrote:
>
> On 19/07/18 11:15, Vivek Gautam wrote:
> > From: Sricharan R <sricharan@codeaurora.org>
> >
> > The smmu needs to be functional only when the respective
> > master's using it are active. The device_link feature
> > helps to track such functional dependencies, so that the
> > iommu gets powered when the master device enables itself
> > using pm_runtime. So by adapting the smmu driver for
> > runtime pm, above said dependency can be addressed.
> >
> > This patch adds the pm runtime/sleep callbacks to the
> > driver and also the functions to parse the smmu clocks
> > from DT and enable them in resume/suspend.
> >
> > Also, while we enable the runtime pm add a pm sleep suspend
> > callback that pushes devices to low power state by turning
> > the clocks off in a system sleep.
> > Also add corresponding clock enable path in resume callback.
> >
> > Signed-off-by: Sricharan R <sricharan@codeaurora.org>
> > Signed-off-by: Archit Taneja <architt@codeaurora.org>
> > [vivek: rework for clock and pm ops]
> > Signed-off-by: Vivek Gautam <vivek.gautam@codeaurora.org>
> > Reviewed-by: Tomasz Figa <tfiga@chromium.org>
> > ---
> >
> > Changes since v12:
> >   - Added pm sleep .suspend callback. This disables the clocks.
> >   - Added corresponding change to enable clocks in .resume
> >    pm sleep callback.
> >
> >   drivers/iommu/arm-smmu.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++--
> >   1 file changed, 73 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
> > index c73cfce1ccc0..9138a6fffe04 100644
> > --- a/drivers/iommu/arm-smmu.c
> > +++ b/drivers/iommu/arm-smmu.c
> > @@ -48,6 +48,7 @@
> >   #include <linux/of_iommu.h>
> >   #include <linux/pci.h>
> >   #include <linux/platform_device.h>
> > +#include <linux/pm_runtime.h>
> >   #include <linux/slab.h>
> >   #include <linux/spinlock.h>
> >
> > @@ -205,6 +206,8 @@ struct arm_smmu_device {
> >       u32                             num_global_irqs;
> >       u32                             num_context_irqs;
> >       unsigned int                    *irqs;
> > +     struct clk_bulk_data            *clks;
> > +     int                             num_clks;
> >
> >       u32                             cavium_id_base; /* Specific to Cavium */
> >
> > @@ -1897,10 +1900,12 @@ static int arm_smmu_device_cfg_probe(struct arm_smmu_device *smmu)
> >   struct arm_smmu_match_data {
> >       enum arm_smmu_arch_version version;
> >       enum arm_smmu_implementation model;
> > +     const char * const *clks;
> > +     int num_clks;
> >   };
> >
> >   #define ARM_SMMU_MATCH_DATA(name, ver, imp) \
> > -static struct arm_smmu_match_data name = { .version = ver, .model = imp }
> > +static const struct arm_smmu_match_data name = { .version = ver, .model = imp }
> >
> >   ARM_SMMU_MATCH_DATA(smmu_generic_v1, ARM_SMMU_V1, GENERIC_SMMU);
> >   ARM_SMMU_MATCH_DATA(smmu_generic_v2, ARM_SMMU_V2, GENERIC_SMMU);
> > @@ -1919,6 +1924,23 @@ static const struct of_device_id arm_smmu_of_match[] = {
> >   };
> >   MODULE_DEVICE_TABLE(of, arm_smmu_of_match);
> >
> > +static void arm_smmu_fill_clk_data(struct arm_smmu_device *smmu,
> > +                                const char * const *clks)
> > +{
> > +     int i;
> > +
> > +     if (smmu->num_clks < 1)
> > +             return;
> > +
> > +     smmu->clks = devm_kcalloc(smmu->dev, smmu->num_clks,
> > +                               sizeof(*smmu->clks), GFP_KERNEL);
> > +     if (!smmu->clks)
> > +             return;
> > +
> > +     for (i = 0; i < smmu->num_clks; i++)
> > +             smmu->clks[i].id = clks[i];
> > +}
> > +
> >   #ifdef CONFIG_ACPI
> >   static int acpi_smmu_get_data(u32 model, struct arm_smmu_device *smmu)
> >   {
> > @@ -2001,6 +2023,9 @@ static int arm_smmu_device_dt_probe(struct platform_device *pdev,
> >       data = of_device_get_match_data(dev);
> >       smmu->version = data->version;
> >       smmu->model = data->model;
> > +     smmu->num_clks = data->num_clks;
> > +
> > +     arm_smmu_fill_clk_data(smmu, data->clks);
> >
> >       parse_driver_options(smmu);
> >
> > @@ -2099,6 +2124,14 @@ static int arm_smmu_device_probe(struct platform_device *pdev)
> >               smmu->irqs[i] = irq;
> >       }
> >
> > +     err = devm_clk_bulk_get(smmu->dev, smmu->num_clks, smmu->clks);
> > +     if (err)
> > +             return err;
> > +
> > +     err = clk_bulk_prepare(smmu->num_clks, smmu->clks);
> > +     if (err)
> > +             return err;
> > +
> >       err = arm_smmu_device_cfg_probe(smmu);
> >       if (err)
> >               return err;
> > @@ -2181,6 +2214,9 @@ static int arm_smmu_device_remove(struct platform_device *pdev)
> >
> >       /* Turn the thing off */
> >       writel(sCR0_CLIENTPD, ARM_SMMU_GR0_NS(smmu) + ARM_SMMU_GR0_sCR0);
> > +
> > +     clk_bulk_unprepare(smmu->num_clks, smmu->clks);
> > +
> >       return 0;
> >   }
> >
> > @@ -2189,15 +2225,50 @@ static void arm_smmu_device_shutdown(struct platform_device *pdev)
> >       arm_smmu_device_remove(pdev);
> >   }
> >
> > +static int __maybe_unused arm_smmu_runtime_resume(struct device *dev)
> > +{
> > +     struct arm_smmu_device *smmu = dev_get_drvdata(dev);
> > +
> > +     return clk_bulk_enable(smmu->num_clks, smmu->clks);
>
> If there's a power domain being automatically switched by genpd then we
> need a reset here because we may have lost state entirely. Since I
> remembered the otherwise-useless GPU SMMU on Juno is in a separate power
> domain, I gave it a poking via sysfs with some debug stuff to dump sCR0
> in these callbacks, and the problem is clear:
>
> ...
> [    4.625551] arm-smmu 2b400000.iommu: genpd_runtime_suspend()
> [    4.631163] arm-smmu 2b400000.iommu: arm_smmu_runtime_suspend: 0x00201936
> [    4.637897] arm-smmu 2b400000.iommu: suspend latency exceeded, 6733980 ns
> [   21.566983] arm-smmu 2b400000.iommu: genpd_runtime_resume()
> [   21.584796] arm-smmu 2b400000.iommu: arm_smmu_runtime_resume: 0x00220101
> [   21.591452] arm-smmu 2b400000.iommu: resume latency exceeded, 6658020 ns
> ...
>
> > +}
> > +
> > +static int __maybe_unused arm_smmu_runtime_suspend(struct device *dev)
> > +{
> > +     struct arm_smmu_device *smmu = dev_get_drvdata(dev);
> > +
> > +     clk_bulk_disable(smmu->num_clks, smmu->clks);
> > +
> > +     return 0;
> > +}
> > +
> >   static int __maybe_unused arm_smmu_pm_resume(struct device *dev)
> >   {
> >       struct arm_smmu_device *smmu = dev_get_drvdata(dev);
> > +     int ret;
> > +
> > +     if (!pm_runtime_suspended(dev)) {
> > +             ret = arm_smmu_runtime_resume(dev);
> > +             if (ret)
> > +                     return ret;
> > +     }
> >
> >       arm_smmu_device_reset(smmu);
>
> This looks a bit off too - if we wake from sleep when the SMMU was also
> runtime-suspended, it appears we might end up trying to restore the
> register state without clocks enabled. Surely we need to always enable
> clocks for the reset, then restore the previous suspended state?
> Although given my previous point, it's probably not worth doing anything
> at all here for that case.

With a reset added to arm_smmu_runtime_resume(), we wouldn't need the
reset here anymore. With that, the code below should work.

    if (pm_runtime_suspended(dev))
        return 0;
    return arm_smmu_runtime_resume(dev);

Best regards,
Tomasz
Vivek Gautam July 25, 2018, 6:16 p.m. UTC | #3
On Tue, Jul 24, 2018 at 8:51 PM, Robin Murphy <robin.murphy@arm.com> wrote:
> On 19/07/18 11:15, Vivek Gautam wrote:
>>
>> From: Sricharan R <sricharan@codeaurora.org>
>>
>> The smmu needs to be functional only when the respective
>> master's using it are active. The device_link feature
>> helps to track such functional dependencies, so that the
>> iommu gets powered when the master device enables itself
>> using pm_runtime. So by adapting the smmu driver for
>> runtime pm, above said dependency can be addressed.
>>
>> This patch adds the pm runtime/sleep callbacks to the
>> driver and also the functions to parse the smmu clocks
>> from DT and enable them in resume/suspend.
>>
>> Also, while we enable the runtime pm add a pm sleep suspend
>> callback that pushes devices to low power state by turning
>> the clocks off in a system sleep.
>> Also add corresponding clock enable path in resume callback.
>>
>> Signed-off-by: Sricharan R <sricharan@codeaurora.org>
>> Signed-off-by: Archit Taneja <architt@codeaurora.org>
>> [vivek: rework for clock and pm ops]
>> Signed-off-by: Vivek Gautam <vivek.gautam@codeaurora.org>
>> Reviewed-by: Tomasz Figa <tfiga@chromium.org>
>> ---
>>
>> Changes since v12:
>>   - Added pm sleep .suspend callback. This disables the clocks.
>>   - Added corresponding change to enable clocks in .resume
>>    pm sleep callback.
>>
>>   drivers/iommu/arm-smmu.c | 75
>> ++++++++++++++++++++++++++++++++++++++++++++++--
>>   1 file changed, 73 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
>> index c73cfce1ccc0..9138a6fffe04 100644
>> --- a/drivers/iommu/arm-smmu.c
>> +++ b/drivers/iommu/arm-smmu.c
>> @@ -48,6 +48,7 @@
>>   #include <linux/of_iommu.h>
>>   #include <linux/pci.h>
>>   #include <linux/platform_device.h>
>> +#include <linux/pm_runtime.h>
>>   #include <linux/slab.h>
>>   #include <linux/spinlock.h>
>>   @@ -205,6 +206,8 @@ struct arm_smmu_device {
>>         u32                             num_global_irqs;
>>         u32                             num_context_irqs;
>>         unsigned int                    *irqs;
>> +       struct clk_bulk_data            *clks;
>> +       int                             num_clks;
>>         u32                             cavium_id_base; /* Specific to
>> Cavium */
>>   @@ -1897,10 +1900,12 @@ static int arm_smmu_device_cfg_probe(struct
>> arm_smmu_device *smmu)
>>   struct arm_smmu_match_data {
>>         enum arm_smmu_arch_version version;
>>         enum arm_smmu_implementation model;
>> +       const char * const *clks;
>> +       int num_clks;
>>   };
>>     #define ARM_SMMU_MATCH_DATA(name, ver, imp) \
>> -static struct arm_smmu_match_data name = { .version = ver, .model = imp }
>> +static const struct arm_smmu_match_data name = { .version = ver, .model =
>> imp }
>>     ARM_SMMU_MATCH_DATA(smmu_generic_v1, ARM_SMMU_V1, GENERIC_SMMU);
>>   ARM_SMMU_MATCH_DATA(smmu_generic_v2, ARM_SMMU_V2, GENERIC_SMMU);
>> @@ -1919,6 +1924,23 @@ static const struct of_device_id
>> arm_smmu_of_match[] = {
>>   };
>>   MODULE_DEVICE_TABLE(of, arm_smmu_of_match);
>>   +static void arm_smmu_fill_clk_data(struct arm_smmu_device *smmu,
>> +                                  const char * const *clks)
>> +{
>> +       int i;
>> +
>> +       if (smmu->num_clks < 1)
>> +               return;
>> +
>> +       smmu->clks = devm_kcalloc(smmu->dev, smmu->num_clks,
>> +                                 sizeof(*smmu->clks), GFP_KERNEL);
>> +       if (!smmu->clks)
>> +               return;
>> +
>> +       for (i = 0; i < smmu->num_clks; i++)
>> +               smmu->clks[i].id = clks[i];
>> +}
>> +
>>   #ifdef CONFIG_ACPI
>>   static int acpi_smmu_get_data(u32 model, struct arm_smmu_device *smmu)
>>   {
>> @@ -2001,6 +2023,9 @@ static int arm_smmu_device_dt_probe(struct
>> platform_device *pdev,
>>         data = of_device_get_match_data(dev);
>>         smmu->version = data->version;
>>         smmu->model = data->model;
>> +       smmu->num_clks = data->num_clks;
>> +
>> +       arm_smmu_fill_clk_data(smmu, data->clks);
>>         parse_driver_options(smmu);
>>   @@ -2099,6 +2124,14 @@ static int arm_smmu_device_probe(struct
>> platform_device *pdev)
>>                 smmu->irqs[i] = irq;
>>         }
>>   +     err = devm_clk_bulk_get(smmu->dev, smmu->num_clks, smmu->clks);
>> +       if (err)
>> +               return err;
>> +
>> +       err = clk_bulk_prepare(smmu->num_clks, smmu->clks);
>> +       if (err)
>> +               return err;
>> +
>>         err = arm_smmu_device_cfg_probe(smmu);
>>         if (err)
>>                 return err;
>> @@ -2181,6 +2214,9 @@ static int arm_smmu_device_remove(struct
>> platform_device *pdev)
>>         /* Turn the thing off */
>>         writel(sCR0_CLIENTPD, ARM_SMMU_GR0_NS(smmu) + ARM_SMMU_GR0_sCR0);
>> +
>> +       clk_bulk_unprepare(smmu->num_clks, smmu->clks);
>> +
>>         return 0;
>>   }
>>   @@ -2189,15 +2225,50 @@ static void arm_smmu_device_shutdown(struct
>> platform_device *pdev)
>>         arm_smmu_device_remove(pdev);
>>   }
>>   +static int __maybe_unused arm_smmu_runtime_resume(struct device *dev)
>> +{
>> +       struct arm_smmu_device *smmu = dev_get_drvdata(dev);
>> +
>> +       return clk_bulk_enable(smmu->num_clks, smmu->clks);
>
>
> If there's a power domain being automatically switched by genpd then we need
> a reset here because we may have lost state entirely. Since I remembered the
> otherwise-useless GPU SMMU on Juno is in a separate power domain, I gave it
> a poking via sysfs with some debug stuff to dump sCR0 in these callbacks,
> and the problem is clear:
>
> ...
> [    4.625551] arm-smmu 2b400000.iommu: genpd_runtime_suspend()
> [    4.631163] arm-smmu 2b400000.iommu: arm_smmu_runtime_suspend: 0x00201936
> [    4.637897] arm-smmu 2b400000.iommu: suspend latency exceeded, 6733980 ns
> [   21.566983] arm-smmu 2b400000.iommu: genpd_runtime_resume()
> [   21.584796] arm-smmu 2b400000.iommu: arm_smmu_runtime_resume: 0x00220101
> [   21.591452] arm-smmu 2b400000.iommu: resume latency exceeded, 6658020 ns
> ...

Qualcomm SoCs have retention enabled for SMMU registers so they don't
lose state.
...
[  256.013367] arm-smmu b40000.arm,smmu: arm_smmu_runtime_suspend
SCR0 = 0x201e36
[  256.013367]
[  256.019160] arm-smmu b40000.arm,smmu: arm_smmu_runtime_resume
SCR0 = 0x201e36
[  256.019160]
[  256.027368] arm-smmu b40000.arm,smmu: arm_smmu_runtime_suspend
SCR0 = 0x201e36
[  256.027368]
[  256.036786] arm-smmu b40000.arm,smmu: arm_smmu_runtime_resume
SCR0 = 0x201e36
...

However after adding arm_smmu_device_reset() in runtime_resume() I observe
some performance degradation when kill an instance of 'kmscube' and
start it again.
The launch time with arm_smmu_device_reset() in runtime_resume() change is
more.
Could this be because of frequent TLB invalidation and sync?

Best regards
Vivek
>
>> +}
>> +
>> +static int __maybe_unused arm_smmu_runtime_suspend(struct device *dev)
>> +{
>> +       struct arm_smmu_device *smmu = dev_get_drvdata(dev);
>> +
>> +       clk_bulk_disable(smmu->num_clks, smmu->clks);
>> +
>> +       return 0;
>> +}
>> +
>>   static int __maybe_unused arm_smmu_pm_resume(struct device *dev)
>>   {
>>         struct arm_smmu_device *smmu = dev_get_drvdata(dev);
>> +       int ret;
>> +
>> +       if (!pm_runtime_suspended(dev)) {
>> +               ret = arm_smmu_runtime_resume(dev);
>> +               if (ret)
>> +                       return ret;
>> +       }
>>         arm_smmu_device_reset(smmu);
>
>
> This looks a bit off too - if we wake from sleep when the SMMU was also
> runtime-suspended, it appears we might end up trying to restore the register
> state without clocks enabled. Surely we need to always enable clocks for the
> reset, then restore the previous suspended state? Although given my previous
> point, it's probably not worth doing anything at all here for that case.
>
> Robin.
>
>>         return 0;
>>   }
>>   -static SIMPLE_DEV_PM_OPS(arm_smmu_pm_ops, NULL, arm_smmu_pm_resume);
>> +static int __maybe_unused arm_smmu_pm_suspend(struct device *dev)
>> +{
>> +       if (!pm_runtime_suspended(dev))
>> +               return arm_smmu_runtime_suspend(dev);
>> +
>> +       return 0;
>> +}
>> +
>> +static const struct dev_pm_ops arm_smmu_pm_ops = {
>> +       SET_SYSTEM_SLEEP_PM_OPS(arm_smmu_pm_suspend, arm_smmu_pm_resume)
>> +       SET_RUNTIME_PM_OPS(arm_smmu_runtime_suspend,
>> +                          arm_smmu_runtime_resume, NULL)
>> +};
>>     static struct platform_driver arm_smmu_driver = {
>>         .driver = {
>>
> _______________________________________________
> iommu mailing list
> iommu@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/iommu
Vivek Gautam July 26, 2018, 7:12 a.m. UTC | #4
On Wed, Jul 25, 2018 at 11:46 PM, Vivek Gautam
<vivek.gautam@codeaurora.org> wrote:
> On Tue, Jul 24, 2018 at 8:51 PM, Robin Murphy <robin.murphy@arm.com> wrote:
>> On 19/07/18 11:15, Vivek Gautam wrote:
>>>
>>> From: Sricharan R <sricharan@codeaurora.org>
>>>
>>> The smmu needs to be functional only when the respective
>>> master's using it are active. The device_link feature
>>> helps to track such functional dependencies, so that the
>>> iommu gets powered when the master device enables itself
>>> using pm_runtime. So by adapting the smmu driver for
>>> runtime pm, above said dependency can be addressed.
>>>
>>> This patch adds the pm runtime/sleep callbacks to the
>>> driver and also the functions to parse the smmu clocks
>>> from DT and enable them in resume/suspend.
>>>
>>> Also, while we enable the runtime pm add a pm sleep suspend
>>> callback that pushes devices to low power state by turning
>>> the clocks off in a system sleep.
>>> Also add corresponding clock enable path in resume callback.
>>>
>>> Signed-off-by: Sricharan R <sricharan@codeaurora.org>
>>> Signed-off-by: Archit Taneja <architt@codeaurora.org>
>>> [vivek: rework for clock and pm ops]
>>> Signed-off-by: Vivek Gautam <vivek.gautam@codeaurora.org>
>>> Reviewed-by: Tomasz Figa <tfiga@chromium.org>
>>> ---
>>>
>>> Changes since v12:
>>>   - Added pm sleep .suspend callback. This disables the clocks.
>>>   - Added corresponding change to enable clocks in .resume
>>>    pm sleep callback.
>>>
>>>   drivers/iommu/arm-smmu.c | 75
>>> ++++++++++++++++++++++++++++++++++++++++++++++--
>>>   1 file changed, 73 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
>>> index c73cfce1ccc0..9138a6fffe04 100644
>>> --- a/drivers/iommu/arm-smmu.c
>>> +++ b/drivers/iommu/arm-smmu.c

[snip]

>>> platform_device *pdev)
>>>         arm_smmu_device_remove(pdev);
>>>   }
>>>   +static int __maybe_unused arm_smmu_runtime_resume(struct device *dev)
>>> +{
>>> +       struct arm_smmu_device *smmu = dev_get_drvdata(dev);
>>> +
>>> +       return clk_bulk_enable(smmu->num_clks, smmu->clks);
>>
>>
>> If there's a power domain being automatically switched by genpd then we need
>> a reset here because we may have lost state entirely. Since I remembered the
>> otherwise-useless GPU SMMU on Juno is in a separate power domain, I gave it
>> a poking via sysfs with some debug stuff to dump sCR0 in these callbacks,
>> and the problem is clear:
>>
>> ...
>> [    4.625551] arm-smmu 2b400000.iommu: genpd_runtime_suspend()
>> [    4.631163] arm-smmu 2b400000.iommu: arm_smmu_runtime_suspend: 0x00201936
>> [    4.637897] arm-smmu 2b400000.iommu: suspend latency exceeded, 6733980 ns
>> [   21.566983] arm-smmu 2b400000.iommu: genpd_runtime_resume()
>> [   21.584796] arm-smmu 2b400000.iommu: arm_smmu_runtime_resume: 0x00220101
>> [   21.591452] arm-smmu 2b400000.iommu: resume latency exceeded, 6658020 ns
>> ...
>
> Qualcomm SoCs have retention enabled for SMMU registers so they don't
> lose state.
> ...
> [  256.013367] arm-smmu b40000.arm,smmu: arm_smmu_runtime_suspend
> SCR0 = 0x201e36
> [  256.013367]
> [  256.019160] arm-smmu b40000.arm,smmu: arm_smmu_runtime_resume
> SCR0 = 0x201e36
> [  256.019160]
> [  256.027368] arm-smmu b40000.arm,smmu: arm_smmu_runtime_suspend
> SCR0 = 0x201e36
> [  256.027368]
> [  256.036786] arm-smmu b40000.arm,smmu: arm_smmu_runtime_resume
> SCR0 = 0x201e36
> ...
>
> However after adding arm_smmu_device_reset() in runtime_resume() I observe
> some performance degradation when kill an instance of 'kmscube' and
> start it again.
> The launch time with arm_smmu_device_reset() in runtime_resume() change is
> more.
> Could this be because of frequent TLB invalidation and sync?

Some more information that i gathered.
On Qcom SoCs besides the registers retention, TCU invalidates TLB cache on
a CX power collapse exit, which is the system wide suspend case.
The arm-smmu software is not aware of this CX power collapse /
auto-invalidation.

So wouldn't doing an explicit TLB invalidations during runtime resume be
detrimental to performance?

I have one more doubt here -
We do runtime power cycle around arm_smmu_map/unmap() too.
Now during map/unmap we selectively do TLB maintenance (either
tlb_sync or tlb_add_flush).
But with runtime pm we want to do TLBIALL*. Is that a problem?

Best regards
Vivek

>
> Best regards
> Vivek

[snip]
Robin Murphy July 26, 2018, 3:30 p.m. UTC | #5
On 26/07/18 08:12, Vivek Gautam wrote:
> On Wed, Jul 25, 2018 at 11:46 PM, Vivek Gautam
> <vivek.gautam@codeaurora.org> wrote:
>> On Tue, Jul 24, 2018 at 8:51 PM, Robin Murphy <robin.murphy@arm.com> wrote:
>>> On 19/07/18 11:15, Vivek Gautam wrote:
>>>>
>>>> From: Sricharan R <sricharan@codeaurora.org>
>>>>
>>>> The smmu needs to be functional only when the respective
>>>> master's using it are active. The device_link feature
>>>> helps to track such functional dependencies, so that the
>>>> iommu gets powered when the master device enables itself
>>>> using pm_runtime. So by adapting the smmu driver for
>>>> runtime pm, above said dependency can be addressed.
>>>>
>>>> This patch adds the pm runtime/sleep callbacks to the
>>>> driver and also the functions to parse the smmu clocks
>>>> from DT and enable them in resume/suspend.
>>>>
>>>> Also, while we enable the runtime pm add a pm sleep suspend
>>>> callback that pushes devices to low power state by turning
>>>> the clocks off in a system sleep.
>>>> Also add corresponding clock enable path in resume callback.
>>>>
>>>> Signed-off-by: Sricharan R <sricharan@codeaurora.org>
>>>> Signed-off-by: Archit Taneja <architt@codeaurora.org>
>>>> [vivek: rework for clock and pm ops]
>>>> Signed-off-by: Vivek Gautam <vivek.gautam@codeaurora.org>
>>>> Reviewed-by: Tomasz Figa <tfiga@chromium.org>
>>>> ---
>>>>
>>>> Changes since v12:
>>>>    - Added pm sleep .suspend callback. This disables the clocks.
>>>>    - Added corresponding change to enable clocks in .resume
>>>>     pm sleep callback.
>>>>
>>>>    drivers/iommu/arm-smmu.c | 75
>>>> ++++++++++++++++++++++++++++++++++++++++++++++--
>>>>    1 file changed, 73 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
>>>> index c73cfce1ccc0..9138a6fffe04 100644
>>>> --- a/drivers/iommu/arm-smmu.c
>>>> +++ b/drivers/iommu/arm-smmu.c
> 
> [snip]
> 
>>>> platform_device *pdev)
>>>>          arm_smmu_device_remove(pdev);
>>>>    }
>>>>    +static int __maybe_unused arm_smmu_runtime_resume(struct device *dev)
>>>> +{
>>>> +       struct arm_smmu_device *smmu = dev_get_drvdata(dev);
>>>> +
>>>> +       return clk_bulk_enable(smmu->num_clks, smmu->clks);
>>>
>>>
>>> If there's a power domain being automatically switched by genpd then we need
>>> a reset here because we may have lost state entirely. Since I remembered the
>>> otherwise-useless GPU SMMU on Juno is in a separate power domain, I gave it
>>> a poking via sysfs with some debug stuff to dump sCR0 in these callbacks,
>>> and the problem is clear:
>>>
>>> ...
>>> [    4.625551] arm-smmu 2b400000.iommu: genpd_runtime_suspend()
>>> [    4.631163] arm-smmu 2b400000.iommu: arm_smmu_runtime_suspend: 0x00201936
>>> [    4.637897] arm-smmu 2b400000.iommu: suspend latency exceeded, 6733980 ns
>>> [   21.566983] arm-smmu 2b400000.iommu: genpd_runtime_resume()
>>> [   21.584796] arm-smmu 2b400000.iommu: arm_smmu_runtime_resume: 0x00220101
>>> [   21.591452] arm-smmu 2b400000.iommu: resume latency exceeded, 6658020 ns
>>> ...
>>
>> Qualcomm SoCs have retention enabled for SMMU registers so they don't
>> lose state.
>> ...
>> [  256.013367] arm-smmu b40000.arm,smmu: arm_smmu_runtime_suspend
>> SCR0 = 0x201e36
>> [  256.013367]
>> [  256.019160] arm-smmu b40000.arm,smmu: arm_smmu_runtime_resume
>> SCR0 = 0x201e36
>> [  256.019160]
>> [  256.027368] arm-smmu b40000.arm,smmu: arm_smmu_runtime_suspend
>> SCR0 = 0x201e36
>> [  256.027368]
>> [  256.036786] arm-smmu b40000.arm,smmu: arm_smmu_runtime_resume
>> SCR0 = 0x201e36
>> ...
>>
>> However after adding arm_smmu_device_reset() in runtime_resume() I observe
>> some performance degradation when kill an instance of 'kmscube' and
>> start it again.
>> The launch time with arm_smmu_device_reset() in runtime_resume() change is
>> more.
>> Could this be because of frequent TLB invalidation and sync?

Probably. Plus the reset procedure is a big chunk of MMIO accesses, 
which for a non-trivial SMMU configuration probably isn't negligible in 
itself. Unfortunately, unless you know for absolute certain that you 
don't need to do that, you do.

> Some more information that i gathered.
> On Qcom SoCs besides the registers retention, TCU invalidates TLB cache on
> a CX power collapse exit, which is the system wide suspend case.
> The arm-smmu software is not aware of this CX power collapse /
> auto-invalidation.
> 
> So wouldn't doing an explicit TLB invalidations during runtime resume be
> detrimental to performance?

Indeed it would be, but resuming with TLBs full of random valid-looking 
junk is even more so.

> I have one more doubt here -
> We do runtime power cycle around arm_smmu_map/unmap() too.
> Now during map/unmap we selectively do TLB maintenance (either
> tlb_sync or tlb_add_flush).
> But with runtime pm we want to do TLBIALL*. Is that a problem?

It's technically redundant to do both, true, but as we've covered in 
previous rounds of discussion it's very difficult to know *which* one is 
sufficient at any given time, so in order to make progress for now I 
think we have to settle with doing both.

Robin.
Vivek Gautam July 27, 2018, 5:05 a.m. UTC | #6
On 7/26/2018 9:00 PM, Robin Murphy wrote:
> On 26/07/18 08:12, Vivek Gautam wrote:
>> On Wed, Jul 25, 2018 at 11:46 PM, Vivek Gautam
>> <vivek.gautam@codeaurora.org> wrote:
>>> On Tue, Jul 24, 2018 at 8:51 PM, Robin Murphy <robin.murphy@arm.com> 
>>> wrote:
>>>> On 19/07/18 11:15, Vivek Gautam wrote:
>>>>>
>>>>> From: Sricharan R <sricharan@codeaurora.org>
>>>>>
>>>>> The smmu needs to be functional only when the respective
>>>>> master's using it are active. The device_link feature
>>>>> helps to track such functional dependencies, so that the
>>>>> iommu gets powered when the master device enables itself
>>>>> using pm_runtime. So by adapting the smmu driver for
>>>>> runtime pm, above said dependency can be addressed.
>>>>>
>>>>> This patch adds the pm runtime/sleep callbacks to the
>>>>> driver and also the functions to parse the smmu clocks
>>>>> from DT and enable them in resume/suspend.
>>>>>
>>>>> Also, while we enable the runtime pm add a pm sleep suspend
>>>>> callback that pushes devices to low power state by turning
>>>>> the clocks off in a system sleep.
>>>>> Also add corresponding clock enable path in resume callback.
>>>>>
>>>>> Signed-off-by: Sricharan R <sricharan@codeaurora.org>
>>>>> Signed-off-by: Archit Taneja <architt@codeaurora.org>
>>>>> [vivek: rework for clock and pm ops]
>>>>> Signed-off-by: Vivek Gautam <vivek.gautam@codeaurora.org>
>>>>> Reviewed-by: Tomasz Figa <tfiga@chromium.org>
>>>>> ---
>>>>>
>>>>> Changes since v12:
>>>>>    - Added pm sleep .suspend callback. This disables the clocks.
>>>>>    - Added corresponding change to enable clocks in .resume
>>>>>     pm sleep callback.
>>>>>
>>>>>    drivers/iommu/arm-smmu.c | 75
>>>>> ++++++++++++++++++++++++++++++++++++++++++++++--
>>>>>    1 file changed, 73 insertions(+), 2 deletions(-)
>>>>>
>>>>> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
>>>>> index c73cfce1ccc0..9138a6fffe04 100644
>>>>> --- a/drivers/iommu/arm-smmu.c
>>>>> +++ b/drivers/iommu/arm-smmu.c
>>
>> [snip]
>>
>>>>> platform_device *pdev)
>>>>>          arm_smmu_device_remove(pdev);
>>>>>    }
>>>>>    +static int __maybe_unused arm_smmu_runtime_resume(struct 
>>>>> device *dev)
>>>>> +{
>>>>> +       struct arm_smmu_device *smmu = dev_get_drvdata(dev);
>>>>> +
>>>>> +       return clk_bulk_enable(smmu->num_clks, smmu->clks);
>>>>
>>>>
>>>> If there's a power domain being automatically switched by genpd 
>>>> then we need
>>>> a reset here because we may have lost state entirely. Since I 
>>>> remembered the
>>>> otherwise-useless GPU SMMU on Juno is in a separate power domain, I 
>>>> gave it
>>>> a poking via sysfs with some debug stuff to dump sCR0 in these 
>>>> callbacks,
>>>> and the problem is clear:
>>>>
>>>> ...
>>>> [    4.625551] arm-smmu 2b400000.iommu: genpd_runtime_suspend()
>>>> [    4.631163] arm-smmu 2b400000.iommu: arm_smmu_runtime_suspend: 
>>>> 0x00201936
>>>> [    4.637897] arm-smmu 2b400000.iommu: suspend latency exceeded, 
>>>> 6733980 ns
>>>> [   21.566983] arm-smmu 2b400000.iommu: genpd_runtime_resume()
>>>> [   21.584796] arm-smmu 2b400000.iommu: arm_smmu_runtime_resume: 
>>>> 0x00220101
>>>> [   21.591452] arm-smmu 2b400000.iommu: resume latency exceeded, 
>>>> 6658020 ns
>>>> ...
>>>
>>> Qualcomm SoCs have retention enabled for SMMU registers so they don't
>>> lose state.
>>> ...
>>> [  256.013367] arm-smmu b40000.arm,smmu: arm_smmu_runtime_suspend
>>> SCR0 = 0x201e36
>>> [  256.013367]
>>> [  256.019160] arm-smmu b40000.arm,smmu: arm_smmu_runtime_resume
>>> SCR0 = 0x201e36
>>> [  256.019160]
>>> [  256.027368] arm-smmu b40000.arm,smmu: arm_smmu_runtime_suspend
>>> SCR0 = 0x201e36
>>> [  256.027368]
>>> [  256.036786] arm-smmu b40000.arm,smmu: arm_smmu_runtime_resume
>>> SCR0 = 0x201e36
>>> ...
>>>
>>> However after adding arm_smmu_device_reset() in runtime_resume() I 
>>> observe
>>> some performance degradation when kill an instance of 'kmscube' and
>>> start it again.
>>> The launch time with arm_smmu_device_reset() in runtime_resume() 
>>> change is
>>> more.
>>> Could this be because of frequent TLB invalidation and sync?
>
> Probably. Plus the reset procedure is a big chunk of MMIO accesses, 
> which for a non-trivial SMMU configuration probably isn't negligible 
> in itself. Unfortunately, unless you know for absolute certain that 
> you don't need to do that, you do.
>
>> Some more information that i gathered.
>> On Qcom SoCs besides the registers retention, TCU invalidates TLB 
>> cache on
>> a CX power collapse exit, which is the system wide suspend case.
>> The arm-smmu software is not aware of this CX power collapse /
>> auto-invalidation.
>>
>> So wouldn't doing an explicit TLB invalidations during runtime resume be
>> detrimental to performance?
>
> Indeed it would be, but resuming with TLBs full of random 
> valid-looking junk is even more so.
>
>> I have one more doubt here -
>> We do runtime power cycle around arm_smmu_map/unmap() too.
>> Now during map/unmap we selectively do TLB maintenance (either
>> tlb_sync or tlb_add_flush).
>> But with runtime pm we want to do TLBIALL*. Is that a problem?
>
> It's technically redundant to do both, true, but as we've covered in 
> previous rounds of discussion it's very difficult to know *which* one 
> is sufficient at any given time, so in order to make progress for now 
> I think we have to settle with doing both.

Thanks Robin. I will respin the patches as Tomasz also suggested;

arm_smmu_runtime_resume() will look like:

     if (pm_runtime_suspended(dev))
         return 0;
     return arm_smmu_runtime_resume(dev);

and,
arm_smmu_runtime_resume() will have arm_smmu_device_reset().

Best regards
Vivek
>
> Robin.
> -- 
> To unsubscribe from this list: send the line "unsubscribe 
> linux-arm-msm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
<html>
  <head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
  </head>
  <body text="#000000" bgcolor="#FFFFFF">
    <p><br>
    </p>
    <br>
    <div class="moz-cite-prefix">On 7/26/2018 9:00 PM, Robin Murphy
      wrote:<br>
    </div>
    <blockquote type="cite"
      cite="mid:401d8509-b9ad-913b-334e-f4ac853472e3@arm.com">On
      26/07/18 08:12, Vivek Gautam wrote:
      <br>
      <blockquote type="cite">On Wed, Jul 25, 2018 at 11:46 PM, Vivek
        Gautam
        <br>
        <a class="moz-txt-link-rfc2396E" href="mailto:vivek.gautam@codeaurora.org">&lt;vivek.gautam@codeaurora.org&gt;</a> wrote:
        <br>
        <blockquote type="cite">On Tue, Jul 24, 2018 at 8:51 PM, Robin
          Murphy <a class="moz-txt-link-rfc2396E" href="mailto:robin.murphy@arm.com">&lt;robin.murphy@arm.com&gt;</a> wrote:
          <br>
          <blockquote type="cite">On 19/07/18 11:15, Vivek Gautam wrote:
            <br>
            <blockquote type="cite">
              <br>
              From: Sricharan R <a class="moz-txt-link-rfc2396E" href="mailto:sricharan@codeaurora.org">&lt;sricharan@codeaurora.org&gt;</a>
              <br>
              <br>
              The smmu needs to be functional only when the respective
              <br>
              master's using it are active. The device_link feature
              <br>
              helps to track such functional dependencies, so that the
              <br>
              iommu gets powered when the master device enables itself
              <br>
              using pm_runtime. So by adapting the smmu driver for
              <br>
              runtime pm, above said dependency can be addressed.
              <br>
              <br>
              This patch adds the pm runtime/sleep callbacks to the
              <br>
              driver and also the functions to parse the smmu clocks
              <br>
              from DT and enable them in resume/suspend.
              <br>
              <br>
              Also, while we enable the runtime pm add a pm sleep
              suspend
              <br>
              callback that pushes devices to low power state by turning
              <br>
              the clocks off in a system sleep.
              <br>
              Also add corresponding clock enable path in resume
              callback.
              <br>
              <br>
              Signed-off-by: Sricharan R
              <a class="moz-txt-link-rfc2396E" href="mailto:sricharan@codeaurora.org">&lt;sricharan@codeaurora.org&gt;</a>
              <br>
              Signed-off-by: Archit Taneja
              <a class="moz-txt-link-rfc2396E" href="mailto:architt@codeaurora.org">&lt;architt@codeaurora.org&gt;</a>
              <br>
              [vivek: rework for clock and pm ops]
              <br>
              Signed-off-by: Vivek Gautam
              <a class="moz-txt-link-rfc2396E" href="mailto:vivek.gautam@codeaurora.org">&lt;vivek.gautam@codeaurora.org&gt;</a>
              <br>
              Reviewed-by: Tomasz Figa <a class="moz-txt-link-rfc2396E" href="mailto:tfiga@chromium.org">&lt;tfiga@chromium.org&gt;</a>
              <br>
              ---
              <br>
              <br>
              Changes since v12:
              <br>
                 - Added pm sleep .suspend callback. This disables the
              clocks.
              <br>
                 - Added corresponding change to enable clocks in
              .resume
              <br>
                  pm sleep callback.
              <br>
              <br>
                 drivers/iommu/arm-smmu.c | 75
              <br>
              ++++++++++++++++++++++++++++++++++++++++++++++--
              <br>
                 1 file changed, 73 insertions(+), 2 deletions(-)
              <br>
              <br>
              diff --git a/drivers/iommu/arm-smmu.c
              b/drivers/iommu/arm-smmu.c
              <br>
              index c73cfce1ccc0..9138a6fffe04 100644
              <br>
              --- a/drivers/iommu/arm-smmu.c
              <br>
              +++ b/drivers/iommu/arm-smmu.c
              <br>
            </blockquote>
          </blockquote>
        </blockquote>
        <br>
        [snip]
        <br>
        <br>
        <blockquote type="cite">
          <blockquote type="cite">
            <blockquote type="cite">platform_device *pdev)
              <br>
                       arm_smmu_device_remove(pdev);
              <br>
                 }
              <br>
                 +static int __maybe_unused
              arm_smmu_runtime_resume(struct device *dev)
              <br>
              +{
              <br>
              +       struct arm_smmu_device *smmu =
              dev_get_drvdata(dev);
              <br>
              +
              <br>
              +       return clk_bulk_enable(smmu-&gt;num_clks,
              smmu-&gt;clks);
              <br>
            </blockquote>
            <br>
            <br>
            If there's a power domain being automatically switched by
            genpd then we need
            <br>
            a reset here because we may have lost state entirely. Since
            I remembered the
            <br>
            otherwise-useless GPU SMMU on Juno is in a separate power
            domain, I gave it
            <br>
            a poking via sysfs with some debug stuff to dump sCR0 in
            these callbacks,
            <br>
            and the problem is clear:
            <br>
            <br>
            ...
            <br>
            [    4.625551] arm-smmu 2b400000.iommu:
            genpd_runtime_suspend()
            <br>
            [    4.631163] arm-smmu 2b400000.iommu:
            arm_smmu_runtime_suspend: 0x00201936
            <br>
            [    4.637897] arm-smmu 2b400000.iommu: suspend latency
            exceeded, 6733980 ns
            <br>
            [   21.566983] arm-smmu 2b400000.iommu:
            genpd_runtime_resume()
            <br>
            [   21.584796] arm-smmu 2b400000.iommu:
            arm_smmu_runtime_resume: 0x00220101
            <br>
            [   21.591452] arm-smmu 2b400000.iommu: resume latency
            exceeded, 6658020 ns
            <br>
            ...
            <br>
          </blockquote>
          <br>
          Qualcomm SoCs have retention enabled for SMMU registers so
          they don't
          <br>
          lose state.
          <br>
          ...
          <br>
          [  256.013367] arm-smmu b40000.arm,smmu:
          arm_smmu_runtime_suspend
          <br>
          SCR0 = 0x201e36
          <br>
          [  256.013367]
          <br>
          [  256.019160] arm-smmu b40000.arm,smmu:
          arm_smmu_runtime_resume
          <br>
          SCR0 = 0x201e36
          <br>
          [  256.019160]
          <br>
          [  256.027368] arm-smmu b40000.arm,smmu:
          arm_smmu_runtime_suspend
          <br>
          SCR0 = 0x201e36
          <br>
          [  256.027368]
          <br>
          [  256.036786] arm-smmu b40000.arm,smmu:
          arm_smmu_runtime_resume
          <br>
          SCR0 = 0x201e36
          <br>
          ...
          <br>
          <br>
          However after adding arm_smmu_device_reset() in
          runtime_resume() I observe
          <br>
          some performance degradation when kill an instance of
          'kmscube' and
          <br>
          start it again.
          <br>
          The launch time with arm_smmu_device_reset() in
          runtime_resume() change is
          <br>
          more.
          <br>
          Could this be because of frequent TLB invalidation and sync?
          <br>
        </blockquote>
      </blockquote>
      <br>
      Probably. Plus the reset procedure is a big chunk of MMIO
      accesses, which for a non-trivial SMMU configuration probably
      isn't negligible in itself. Unfortunately, unless you know for
      absolute certain that you don't need to do that, you do.
      <br>
      <br>
      <blockquote type="cite">Some more information that i gathered.
        <br>
        On Qcom SoCs besides the registers retention, TCU invalidates
        TLB cache on
        <br>
        a CX power collapse exit, which is the system wide suspend case.
        <br>
        The arm-smmu software is not aware of this CX power collapse /
        <br>
        auto-invalidation.
        <br>
        <br>
        So wouldn't doing an explicit TLB invalidations during runtime
        resume be
        <br>
        detrimental to performance?
        <br>
      </blockquote>
      <br>
      Indeed it would be, but resuming with TLBs full of random
      valid-looking junk is even more so.
      <br>
      <br>
      <blockquote type="cite">I have one more doubt here -
        <br>
        We do runtime power cycle around arm_smmu_map/unmap() too.
        <br>
        Now during map/unmap we selectively do TLB maintenance (either
        <br>
        tlb_sync or tlb_add_flush).
        <br>
        But with runtime pm we want to do TLBIALL*. Is that a problem?
        <br>
      </blockquote>
      <br>
      It's technically redundant to do both, true, but as we've covered
      in previous rounds of discussion it's very difficult to know
      *which* one is sufficient at any given time, so in order to make
      progress for now I think we have to settle with doing both.
      <br>
    </blockquote>
    <br>
    Thanks Robin. I will respin the patches as Tomasz also suggested;<span
      style="color: rgb(34, 34, 34); font-family: arial, sans-serif;
      font-size: 12.8px; font-style: normal; font-variant-ligatures:
      normal; font-variant-caps: normal; font-weight: 400;
      letter-spacing: normal; orphans: 2; text-align: start;
      text-indent: 0px; text-transform: none; white-space: normal;
      widows: 2; word-spacing: 0px; -webkit-text-stroke-width: 0px;
      background-color: rgb(255, 255, 255); text-decoration-style:
      initial; text-decoration-color: initial; display: inline
      !important; float: none;"><br>
      <br>
      arm_smmu_runtime_resume() will look like:</span><br style="color:
      rgb(34, 34, 34); font-family: arial, sans-serif; font-size:
      12.8px; font-style: normal; font-variant-ligatures: normal;
      font-variant-caps: normal; font-weight: 400; letter-spacing:
      normal; orphans: 2; text-align: start; text-indent: 0px;
      text-transform: none; white-space: normal; widows: 2;
      word-spacing: 0px; -webkit-text-stroke-width: 0px;
      background-color: rgb(255, 255, 255); text-decoration-style:
      initial; text-decoration-color: initial;">
    <br style="color: rgb(34, 34, 34); font-family: arial, sans-serif;
      font-size: 12.8px; font-style: normal; font-variant-ligatures:
      normal; font-variant-caps: normal; font-weight: 400;
      letter-spacing: normal; orphans: 2; text-align: start;
      text-indent: 0px; text-transform: none; white-space: normal;
      widows: 2; word-spacing: 0px; -webkit-text-stroke-width: 0px;
      background-color: rgb(255, 255, 255); text-decoration-style:
      initial; text-decoration-color: initial;">
    <span style="color: rgb(34, 34, 34); font-family: arial, sans-serif;
      font-size: 12.8px; font-style: normal; font-variant-ligatures:
      normal; font-variant-caps: normal; font-weight: 400;
      letter-spacing: normal; orphans: 2; text-align: start;
      text-indent: 0px; text-transform: none; white-space: normal;
      widows: 2; word-spacing: 0px; -webkit-text-stroke-width: 0px;
      background-color: rgb(255, 255, 255); text-decoration-style:
      initial; text-decoration-color: initial; display: inline
      !important; float: none;">    if (pm_runtime_suspended(dev))</span><br
      style="color: rgb(34, 34, 34); font-family: arial, sans-serif;
      font-size: 12.8px; font-style: normal; font-variant-ligatures:
      normal; font-variant-caps: normal; font-weight: 400;
      letter-spacing: normal; orphans: 2; text-align: start;
      text-indent: 0px; text-transform: none; white-space: normal;
      widows: 2; word-spacing: 0px; -webkit-text-stroke-width: 0px;
      background-color: rgb(255, 255, 255); text-decoration-style:
      initial; text-decoration-color: initial;">
    <span style="color: rgb(34, 34, 34); font-family: arial, sans-serif;
      font-size: 12.8px; font-style: normal; font-variant-ligatures:
      normal; font-variant-caps: normal; font-weight: 400;
      letter-spacing: normal; orphans: 2; text-align: start;
      text-indent: 0px; text-transform: none; white-space: normal;
      widows: 2; word-spacing: 0px; -webkit-text-stroke-width: 0px;
      background-color: rgb(255, 255, 255); text-decoration-style:
      initial; text-decoration-color: initial; display: inline
      !important; float: none;">        return 0;</span><br
      style="color: rgb(34, 34, 34); font-family: arial, sans-serif;
      font-size: 12.8px; font-style: normal; font-variant-ligatures:
      normal; font-variant-caps: normal; font-weight: 400;
      letter-spacing: normal; orphans: 2; text-align: start;
      text-indent: 0px; text-transform: none; white-space: normal;
      widows: 2; word-spacing: 0px; -webkit-text-stroke-width: 0px;
      background-color: rgb(255, 255, 255); text-decoration-style:
      initial; text-decoration-color: initial;">
    <span style="color: rgb(34, 34, 34); font-family: arial, sans-serif;
      font-size: 12.8px; font-style: normal; font-variant-ligatures:
      normal; font-variant-caps: normal; font-weight: 400;
      letter-spacing: normal; orphans: 2; text-align: start;
      text-indent: 0px; text-transform: none; white-space: normal;
      widows: 2; word-spacing: 0px; -webkit-text-stroke-width: 0px;
      background-color: rgb(255, 255, 255); text-decoration-style:
      initial; text-decoration-color: initial; display: inline
      !important; float: none;">    return arm_smmu_runtime_resume(dev);<br>
      <br>
      and,<br>
    </span><span style="color: rgb(34, 34, 34); font-family: arial,
      sans-serif; font-size: 12.8px; font-style: normal;
      font-variant-ligatures: normal; font-variant-caps: normal;
      font-weight: 400; letter-spacing: normal; orphans: 2; text-align:
      start; text-indent: 0px; text-transform: none; white-space:
      normal; widows: 2; word-spacing: 0px; -webkit-text-stroke-width:
      0px; background-color: rgb(255, 255, 255); text-decoration-style:
      initial; text-decoration-color: initial; display: inline
      !important; float: none;"><span style="color: rgb(34, 34, 34);
        font-family: arial, sans-serif; font-size: 12.8px; font-style:
        normal; font-variant-ligatures: normal; font-variant-caps:
        normal; font-weight: 400; letter-spacing: normal; orphans: 2;
        text-align: start; text-indent: 0px; text-transform: none;
        white-space: normal; widows: 2; word-spacing: 0px;
        -webkit-text-stroke-width: 0px; background-color: rgb(255, 255,
        255); text-decoration-style: initial; text-decoration-color:
        initial; display: inline !important; float: none;">arm_smmu_runtime_resume()
        will have arm_smmu_device_reset().<br>
      </span></span><br>
    Best regards<br>
    Vivek<br>
    <blockquote type="cite"
      cite="mid:401d8509-b9ad-913b-334e-f4ac853472e3@arm.com">
      <br>
      Robin.
      <br>
      --
      <br>
      To unsubscribe from this list: send the line "unsubscribe
      linux-arm-msm" in
      <br>
      the body of a message to <a class="moz-txt-link-abbreviated" href="mailto:majordomo@vger.kernel.org">majordomo@vger.kernel.org</a>
      <br>
      More majordomo info at  <a class="moz-txt-link-freetext" href="http://vger.kernel.org/majordomo-info.html">http://vger.kernel.org/majordomo-info.html</a>
      <br>
    </blockquote>
    <br>
  </body>
</html>
diff mbox

Patch

diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index c73cfce1ccc0..9138a6fffe04 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -48,6 +48,7 @@ 
 #include <linux/of_iommu.h>
 #include <linux/pci.h>
 #include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 
@@ -205,6 +206,8 @@  struct arm_smmu_device {
 	u32				num_global_irqs;
 	u32				num_context_irqs;
 	unsigned int			*irqs;
+	struct clk_bulk_data		*clks;
+	int				num_clks;
 
 	u32				cavium_id_base; /* Specific to Cavium */
 
@@ -1897,10 +1900,12 @@  static int arm_smmu_device_cfg_probe(struct arm_smmu_device *smmu)
 struct arm_smmu_match_data {
 	enum arm_smmu_arch_version version;
 	enum arm_smmu_implementation model;
+	const char * const *clks;
+	int num_clks;
 };
 
 #define ARM_SMMU_MATCH_DATA(name, ver, imp)	\
-static struct arm_smmu_match_data name = { .version = ver, .model = imp }
+static const struct arm_smmu_match_data name = { .version = ver, .model = imp }
 
 ARM_SMMU_MATCH_DATA(smmu_generic_v1, ARM_SMMU_V1, GENERIC_SMMU);
 ARM_SMMU_MATCH_DATA(smmu_generic_v2, ARM_SMMU_V2, GENERIC_SMMU);
@@ -1919,6 +1924,23 @@  static const struct of_device_id arm_smmu_of_match[] = {
 };
 MODULE_DEVICE_TABLE(of, arm_smmu_of_match);
 
+static void arm_smmu_fill_clk_data(struct arm_smmu_device *smmu,
+				   const char * const *clks)
+{
+	int i;
+
+	if (smmu->num_clks < 1)
+		return;
+
+	smmu->clks = devm_kcalloc(smmu->dev, smmu->num_clks,
+				  sizeof(*smmu->clks), GFP_KERNEL);
+	if (!smmu->clks)
+		return;
+
+	for (i = 0; i < smmu->num_clks; i++)
+		smmu->clks[i].id = clks[i];
+}
+
 #ifdef CONFIG_ACPI
 static int acpi_smmu_get_data(u32 model, struct arm_smmu_device *smmu)
 {
@@ -2001,6 +2023,9 @@  static int arm_smmu_device_dt_probe(struct platform_device *pdev,
 	data = of_device_get_match_data(dev);
 	smmu->version = data->version;
 	smmu->model = data->model;
+	smmu->num_clks = data->num_clks;
+
+	arm_smmu_fill_clk_data(smmu, data->clks);
 
 	parse_driver_options(smmu);
 
@@ -2099,6 +2124,14 @@  static int arm_smmu_device_probe(struct platform_device *pdev)
 		smmu->irqs[i] = irq;
 	}
 
+	err = devm_clk_bulk_get(smmu->dev, smmu->num_clks, smmu->clks);
+	if (err)
+		return err;
+
+	err = clk_bulk_prepare(smmu->num_clks, smmu->clks);
+	if (err)
+		return err;
+
 	err = arm_smmu_device_cfg_probe(smmu);
 	if (err)
 		return err;
@@ -2181,6 +2214,9 @@  static int arm_smmu_device_remove(struct platform_device *pdev)
 
 	/* Turn the thing off */
 	writel(sCR0_CLIENTPD, ARM_SMMU_GR0_NS(smmu) + ARM_SMMU_GR0_sCR0);
+
+	clk_bulk_unprepare(smmu->num_clks, smmu->clks);
+
 	return 0;
 }
 
@@ -2189,15 +2225,50 @@  static void arm_smmu_device_shutdown(struct platform_device *pdev)
 	arm_smmu_device_remove(pdev);
 }
 
+static int __maybe_unused arm_smmu_runtime_resume(struct device *dev)
+{
+	struct arm_smmu_device *smmu = dev_get_drvdata(dev);
+
+	return clk_bulk_enable(smmu->num_clks, smmu->clks);
+}
+
+static int __maybe_unused arm_smmu_runtime_suspend(struct device *dev)
+{
+	struct arm_smmu_device *smmu = dev_get_drvdata(dev);
+
+	clk_bulk_disable(smmu->num_clks, smmu->clks);
+
+	return 0;
+}
+
 static int __maybe_unused arm_smmu_pm_resume(struct device *dev)
 {
 	struct arm_smmu_device *smmu = dev_get_drvdata(dev);
+	int ret;
+
+	if (!pm_runtime_suspended(dev)) {
+		ret = arm_smmu_runtime_resume(dev);
+		if (ret)
+			return ret;
+	}
 
 	arm_smmu_device_reset(smmu);
 	return 0;
 }
 
-static SIMPLE_DEV_PM_OPS(arm_smmu_pm_ops, NULL, arm_smmu_pm_resume);
+static int __maybe_unused arm_smmu_pm_suspend(struct device *dev)
+{
+	if (!pm_runtime_suspended(dev))
+		return arm_smmu_runtime_suspend(dev);
+
+	return 0;
+}
+
+static const struct dev_pm_ops arm_smmu_pm_ops = {
+	SET_SYSTEM_SLEEP_PM_OPS(arm_smmu_pm_suspend, arm_smmu_pm_resume)
+	SET_RUNTIME_PM_OPS(arm_smmu_runtime_suspend,
+			   arm_smmu_runtime_resume, NULL)
+};
 
 static struct platform_driver arm_smmu_driver = {
 	.driver	= {