diff mbox series

[3/4] soundwire: intel: exit clock stop mode on system suspend

Message ID 20210727055608.30247-4-yung-chuan.liao@linux.intel.com (mailing list archive)
State Superseded
Headers show
Series soundwire: intel: exit clock-stop mode before system suspend | expand

Commit Message

Bard Liao July 27, 2021, 5:56 a.m. UTC
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>

Intel validation reported an issue where the HW_RST self-clearing bit
is not cleared in hardware, which as a ripple effect creates issues
with the clock stop mode.

This happens in a specific sequence where the Intel manager is
pm_runtime suspended with the clock-stop mode enabled. During the
system suspend, we currently do nothing, which can lead to potential
issues on system resume and the following pm_runtime suspend,
depending on the hardware state.

This patch suggests a full resume (parent+child devices) if the
clock-stop mode is used. This may require extra time but will make the
suspend/resume flows completely symmetric. This also removes a race
condition where we could not access SHIM registers if the parent was
suspended as well. Resuming the link also resumes the parent by
construction.

BugLink: https://github.com/thesofproject/linux/issues/2606
Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
---
 drivers/soundwire/intel.c | 65 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

Comments

Vinod Koul Aug. 2, 2021, 4:31 a.m. UTC | #1
On 27-07-21, 13:56, Bard Liao wrote:
> From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
> 
> Intel validation reported an issue where the HW_RST self-clearing bit
> is not cleared in hardware, which as a ripple effect creates issues
> with the clock stop mode.
> 
> This happens is a specific sequence where the Intel manager is
> pm_runtime suspended with the clock-stop mode enabled. During the
> system suspend, we currently do nothing, which can lead to potential
> issues on system resume and the following pm_runtime suspend,
> depending on the hardware state.
> 
> This patch suggests a full resume (parent+child devices) if the
> clock-stop mode is used. This may require extra time but will make the
> suspend/resume flows completely symmetric. This also removes a race
> condition where we could not access SHIM registers if the parent was
> suspended as well. Resuming the link also resumes the parent by
> construction.
> 
> BugLink: https://github.com/thesofproject/linux/issues/2606
> Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
> Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
> Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
> ---
>  drivers/soundwire/intel.c | 65 +++++++++++++++++++++++++++++++++++++++
>  1 file changed, 65 insertions(+)
> 
> diff --git a/drivers/soundwire/intel.c b/drivers/soundwire/intel.c
> index 46d1645cb7fe..9d05e158fe0e 100644
> --- a/drivers/soundwire/intel.c
> +++ b/drivers/soundwire/intel.c
> @@ -1527,6 +1527,70 @@ int intel_link_process_wakeen_event(struct auxiliary_device *auxdev)
>   * PM calls
>   */
>  
> +static int intel_resume_child_device(struct device *dev, void *data)
> +{
> +	int ret;
> +	struct sdw_slave *slave = dev_to_sdw_dev(dev);
> +
> +	if (!slave->probed) {
> +		dev_dbg(dev, "%s: skipping device, no probed driver\n", __func__);
> +		return 0;
> +	}
> +	if (!slave->dev_num_sticky) {
> +		dev_dbg(dev, "%s: skipping device, never detected on bus\n", __func__);
> +		return 0;
> +	}
> +
> +	ret = pm_request_resume(dev);
> +	if (ret < 0)
> +		dev_err(dev, "%s: pm_request_resume failed: %d\n", __func__, ret);
> +
> +	return ret;
> +}
> +
> +static int __maybe_unused intel_pm_prepare(struct device *dev)
> +{
> +	struct sdw_cdns *cdns = dev_get_drvdata(dev);
> +	struct sdw_intel *sdw = cdns_to_intel(cdns);
> +	struct sdw_bus *bus = &cdns->bus;
> +	u32 clock_stop_quirks;
> +	int ret = 0;
> +
> +	if (bus->prop.hw_disabled || !sdw->startup_done) {
> +		dev_dbg(dev, "SoundWire master %d is disabled or not-started, ignoring\n",
> +			bus->link_id);
> +		return 0;
> +	}
> +
> +	clock_stop_quirks = sdw->link_res->clock_stop_quirks;
> +
> +	if ((clock_stop_quirks & SDW_INTEL_CLK_STOP_BUS_RESET) ||
> +	    !clock_stop_quirks) {
> +		/*
> +		 * Try to resume the entire bus (parent + child devices) to exit
> +		 * the clock stop mode. If this fails, we keep going since we don't want
> +		 * to prevent system suspend from happening and errors should be recoverable
> +		 * on resume.
> +		 */
> +		ret = device_for_each_child(bus->dev, NULL, intel_resume_child_device);
> +
> +		if (ret < 0)
> +			dev_err(dev, "%s: intel_resume_child_device failed: %d\n", __func__, ret);
> +
> +		/*
> +		 * in the case where a link was started but does not have anything connected,
> +		 * we still need to resume to keep link power up/down sequences balanced.
> +		 * This is a no-op if a child device was present, since resuming the child
> +		 * device would also resume the parent
> +		 */
> +		ret = pm_request_resume(dev);

I am not sure about this patch yet; maybe I am not comprehending it.

1. Above, you resume the child devices first and then the Intel
device, which looks reversed. Should you not resume the Intel device first
and then the children (codec devices)?

2. What about when resume is invoked by the core for the child devices?
That would be called in the PM resume flow, so why do it here?

> +		if (ret < 0)
> +			dev_err(dev, "%s: pm_request_resume failed: %d\n", __func__, ret);
> +	}
> +
> +	return 0;
> +}
> +
>  static int __maybe_unused intel_suspend(struct device *dev)
>  {
>  	struct sdw_cdns *cdns = dev_get_drvdata(dev);
> @@ -1923,6 +1987,7 @@ static int __maybe_unused intel_resume_runtime(struct device *dev)
>  }
>  
>  static const struct dev_pm_ops intel_pm = {
> +	.prepare = intel_pm_prepare,
>  	SET_SYSTEM_SLEEP_PM_OPS(intel_suspend, intel_resume)
>  	SET_RUNTIME_PM_OPS(intel_suspend_runtime, intel_resume_runtime, NULL)
>  };
> -- 
> 2.17.1
Pierre-Louis Bossart Aug. 2, 2021, 2:24 p.m. UTC | #2
>> +static int __maybe_unused intel_pm_prepare(struct device *dev)
>> +{
>> +	struct sdw_cdns *cdns = dev_get_drvdata(dev);
>> +	struct sdw_intel *sdw = cdns_to_intel(cdns);
>> +	struct sdw_bus *bus = &cdns->bus;
>> +	u32 clock_stop_quirks;
>> +	int ret = 0;
>> +
>> +	if (bus->prop.hw_disabled || !sdw->startup_done) {
>> +		dev_dbg(dev, "SoundWire master %d is disabled or not-started, ignoring\n",
>> +			bus->link_id);
>> +		return 0;
>> +	}
>> +
>> +	clock_stop_quirks = sdw->link_res->clock_stop_quirks;
>> +
>> +	if ((clock_stop_quirks & SDW_INTEL_CLK_STOP_BUS_RESET) ||
>> +	    !clock_stop_quirks) {
>> +		/*
>> +		 * Try to resume the entire bus (parent + child devices) to exit
>> +		 * the clock stop mode. If this fails, we keep going since we don't want
>> +		 * to prevent system suspend from happening and errors should be recoverable
>> +		 * on resume.
>> +		 */
>> +		ret = device_for_each_child(bus->dev, NULL, intel_resume_child_device);
>> +
>> +		if (ret < 0)
>> +			dev_err(dev, "%s: intel_resume_child_device failed: %d\n", __func__, ret);
>> +
>> +		/*
>> +		 * in the case where a link was started but does not have anything connected,
>> +		 * we still need to resume to keep link power up/down sequences balanced.
>> +		 * This is a no-op if a child device was present, since resuming the child
>> +		 * device would also resume the parent
>> +		 */
>> +		ret = pm_request_resume(dev);
> 
> I am not sure of this patch yet, maybe I am comprehending it..
> 
> 1. In above you are calling resume of child devices first and then intel
> device, which sounds reverse, should you not resume intel device first
> and then child (codec devices) ?
> 
> 2. What about when resume is invoked by the core for the child devices.
> That would be called in the PM resume flow, so why do it here?

I realize it's a complicated sequence, it took us multiple phases to get
it right. There are multiple layers between power domain, bus and driver.

The .prepare phase happens before the system suspend phase. Unlike
suspend, which progresses from children to parents, the .prepare is
handled parent first.

When we do a request_resume of the child device, by construction that
also resumes the parent. In other words, if we have multiple codecs on a
link, the first iteration of device_for_each_child() will already resume
the parent and the first device, and the second iteration will only
resume the second device.

What this step does is make sure that when the codec .suspend routine is
invoked, the entire bus is already back to full power. I did check
privately with Rafael (CC:ed) if this sequence was legit.

We did consider modifying the system suspend callback in codec drivers,
so that we would do a pm_runtime resume(). This is functionally
equivalent to what we are suggesting here, but we decided not to do so
for two main reasons

a) lots of code changes across all codecs for an Intel-specific issue

b) we would need to add a flag so that codec drivers would know in which
Intel-specific clock-stop mode the bus was configured. That's not so
good either.

It seemed simpler to add this .prepare step and test on the Intel
clock stop mode before doing a pm_runtime_resume for all codecs.

> 
>> +		if (ret < 0)
>> +			dev_err(dev, "%s: pm_request_resume failed: %d\n", __func__, ret);
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>>  static int __maybe_unused intel_suspend(struct device *dev)
>>  {
>>  	struct sdw_cdns *cdns = dev_get_drvdata(dev);
>> @@ -1923,6 +1987,7 @@ static int __maybe_unused intel_resume_runtime(struct device *dev)
>>  }
>>  
>>  static const struct dev_pm_ops intel_pm = {
>> +	.prepare = intel_pm_prepare,
>>  	SET_SYSTEM_SLEEP_PM_OPS(intel_suspend, intel_resume)
>>  	SET_RUNTIME_PM_OPS(intel_suspend_runtime, intel_resume_runtime, NULL)
>>  };
>> -- 
>> 2.17.1
>
Pierre-Louis Bossart Aug. 2, 2021, 4:28 p.m. UTC | #3
>> 1. In above you are calling resume of child devices first and then intel
>> device, which sounds reverse, should you not resume intel device first
>> and then child (codec devices) ?
>>
>> 2. What about when resume is invoked by the core for the child devices.
>> That would be called in the PM resume flow, so why do it here?
> 
> I realize it's a complicated sequence, it took us multiple phases to get
> it right. There are multiple layers between power domain, bus and driver.
> 
> The .prepare phase happens before the system suspend phase. Unlike
> suspend, which progresses from children to parents, the .prepare is
> handled parent first.
> 
> When we do a request_resume of the child device, by construction that
> also resumes the parent. In other words, if we have multiple codecs on a
> link, the first iteration of device_for_each_child() will already resume
> the parent and the first device, and the second iteration will only
> resume the second device.
> 
> What this step does is make sure than when the codec .suspend routine is
> invoked, the entire bus is already back to full power. I did check
> privately with Rafael (CC:ed) if this sequence was legit.
> 
> We did consider modifying the system suspend callback in codec drivers,
> so that we would do a pm_runtime resume(). This is functionally
> equivalent to what we are suggesting here, but we decided not to do so
> for two main reasons
> 
> a) lots of code changes across all codecs for an Intel-specific issue
> 
> b) we would need to add a flag so that codec drivers would know in which
> Intel-specific clock-stop mode the bus was configured. That's not so
> good either.
> 
> It seemed simpler to use to add this .prepare step and test on the Intel
> clock stop mode before doing a pm_runtime_resume for all codecs.

Note that we could invert the two parts and do a parent resume first,
and a loop for all children second. It's completely equivalent, but
might be less convoluted to understand without any implicit behavior
assumed.

	if ((clock_stop_quirks & SDW_INTEL_CLK_STOP_BUS_RESET) ||
	    !clock_stop_quirks) {

		/* resume parent first */
		ret = pm_request_resume(dev);
		if (ret < 0)
			dev_err(dev, "%s: pm_request_resume failed: %d\n", __func__, ret);

		/*
		 * resume all children next.
		 * if there are no children on this link,
		 * this is a no-op
		 */
		ret = device_for_each_child(bus->dev, NULL, intel_resume_child_device);

		if (ret < 0)
			dev_err(dev, "%s: intel_resume_child_device failed: %d\n", __func__,
ret);
	}
Vinod Koul Aug. 6, 2021, 1:31 p.m. UTC | #4
On 02-08-21, 11:28, Pierre-Louis Bossart wrote:
> 
> 
> 
> >> 1. In above you are calling resume of child devices first and then intel
> >> device, which sounds reverse, should you not resume intel device first
> >> and then child (codec devices) ?
> >>
> >> 2. What about when resume is invoked by the core for the child devices.
> >> That would be called in the PM resume flow, so why do it here?
> > 
> > I realize it's a complicated sequence, it took us multiple phases to get
> > it right. There are multiple layers between power domain, bus and driver.
> > 
> > The .prepare phase happens before the system suspend phase. Unlike
> > suspend, which progresses from children to parents, the .prepare is
> > handled parent first.
> > 
> > When we do a request_resume of the child device, by construction that
> > also resumes the parent. In other words, if we have multiple codecs on a
> > link, the first iteration of device_for_each_child() will already resume
> > the parent and the first device, and the second iteration will only
> > resume the second device.
> > 
> > What this step does is make sure than when the codec .suspend routine is
> > invoked, the entire bus is already back to full power. I did check
> > privately with Rafael (CC:ed) if this sequence was legit.
> > 
> > We did consider modifying the system suspend callback in codec drivers,
> > so that we would do a pm_runtime resume(). This is functionally
> > equivalent to what we are suggesting here, but we decided not to do so
> > for two main reasons
> > 
> > a) lots of code changes across all codecs for an Intel-specific issue
> > 
> > b) we would need to add a flag so that codec drivers would know in which
> > Intel-specific clock-stop mode the bus was configured. That's not so
> > good either.
> > 
> > It seemed simpler to use to add this .prepare step and test on the Intel
> > clock stop mode before doing a pm_runtime_resume for all codecs.

Ack, the code looks neat. But at a glance, a reader might get confused
about the sequencing done here. It is not very obvious, so consider
adding this explanation to the changelog or driver comments. It would be helpful.

> 
> Note that we could invert the two parts and do a parent resume first,
> and a loop for all children second. It's completely equivalent, but
> might be less convoluted to understand without any implicit behavior
> assumed.

Agreed, it would be redundant as the PM core would take care of it. Maybe
add a comment so that it is explicit.
Pierre-Louis Bossart Aug. 6, 2021, 4:03 p.m. UTC | #5
On 8/6/21 8:31 AM, Vinod Koul wrote:
> On 02-08-21, 11:28, Pierre-Louis Bossart wrote:
>>
>>
>>
>>>> 1. In above you are calling resume of child devices first and then intel
>>>> device, which sounds reverse, should you not resume intel device first
>>>> and then child (codec devices) ?
>>>>
>>>> 2. What about when resume is invoked by the core for the child devices.
>>>> That would be called in the PM resume flow, so why do it here?
>>>
>>> I realize it's a complicated sequence, it took us multiple phases to get
>>> it right. There are multiple layers between power domain, bus and driver.
>>>
>>> The .prepare phase happens before the system suspend phase. Unlike
>>> suspend, which progresses from children to parents, the .prepare is
>>> handled parent first.
>>>
>>> When we do a request_resume of the child device, by construction that
>>> also resumes the parent. In other words, if we have multiple codecs on a
>>> link, the first iteration of device_for_each_child() will already resume
>>> the parent and the first device, and the second iteration will only
>>> resume the second device.
>>>
>>> What this step does is make sure than when the codec .suspend routine is
>>> invoked, the entire bus is already back to full power. I did check
>>> privately with Rafael (CC:ed) if this sequence was legit.
>>>
>>> We did consider modifying the system suspend callback in codec drivers,
>>> so that we would do a pm_runtime resume(). This is functionally
>>> equivalent to what we are suggesting here, but we decided not to do so
>>> for two main reasons
>>>
>>> a) lots of code changes across all codecs for an Intel-specific issue
>>>
>>> b) we would need to add a flag so that codec drivers would know in which
>>> Intel-specific clock-stop mode the bus was configured. That's not so
>>> good either.
>>>
>>> It seemed simpler to use to add this .prepare step and test on the Intel
>>> clock stop mode before doing a pm_runtime_resume for all codecs.
> 
> Ack, the code looks neat. But glancing at it, reader might get confused
> about the sequencing done here.. It is not very obvious, so consider
> adding this to changelog or driver comments. It will be helpful

Yep, even in internal reviews this was far from straightforward to
explain. I added comments but I can certainly try to explain more.

>>
>> Note that we could invert the two parts and do a parent resume first,
>> and a loop for all children second. It's completely equivalent, but
>> might be less convoluted to understand without any implicit behavior
>> assumed.
> 
> Agree, it would be redundant as PM core would take care of it. maybe
> add a comment so that it is explicit

Will add comments as well.

Note that I have another lead to further improve suspend-resume, running
stress tests on thousands of cycles atm. I'll wait until we have more
results to resubmit this series.

Thanks for the reviews!
diff mbox series

Patch

diff --git a/drivers/soundwire/intel.c b/drivers/soundwire/intel.c
index 46d1645cb7fe..9d05e158fe0e 100644
--- a/drivers/soundwire/intel.c
+++ b/drivers/soundwire/intel.c
@@ -1527,6 +1527,70 @@  int intel_link_process_wakeen_event(struct auxiliary_device *auxdev)
  * PM calls
  */
 
+static int intel_resume_child_device(struct device *dev, void *data)
+{
+	int ret;
+	struct sdw_slave *slave = dev_to_sdw_dev(dev);
+
+	if (!slave->probed) {
+		dev_dbg(dev, "%s: skipping device, no probed driver\n", __func__);
+		return 0;
+	}
+	if (!slave->dev_num_sticky) {
+		dev_dbg(dev, "%s: skipping device, never detected on bus\n", __func__);
+		return 0;
+	}
+
+	ret = pm_request_resume(dev);
+	if (ret < 0)
+		dev_err(dev, "%s: pm_request_resume failed: %d\n", __func__, ret);
+
+	return ret;
+}
+
+static int __maybe_unused intel_pm_prepare(struct device *dev)
+{
+	struct sdw_cdns *cdns = dev_get_drvdata(dev);
+	struct sdw_intel *sdw = cdns_to_intel(cdns);
+	struct sdw_bus *bus = &cdns->bus;
+	u32 clock_stop_quirks;
+	int ret = 0;
+
+	if (bus->prop.hw_disabled || !sdw->startup_done) {
+		dev_dbg(dev, "SoundWire master %d is disabled or not-started, ignoring\n",
+			bus->link_id);
+		return 0;
+	}
+
+	clock_stop_quirks = sdw->link_res->clock_stop_quirks;
+
+	if ((clock_stop_quirks & SDW_INTEL_CLK_STOP_BUS_RESET) ||
+	    !clock_stop_quirks) {
+		/*
+		 * Try to resume the entire bus (parent + child devices) to exit
+		 * the clock stop mode. If this fails, we keep going since we don't want
+		 * to prevent system suspend from happening and errors should be recoverable
+		 * on resume.
+		 */
+		ret = device_for_each_child(bus->dev, NULL, intel_resume_child_device);
+
+		if (ret < 0)
+			dev_err(dev, "%s: intel_resume_child_device failed: %d\n", __func__, ret);
+
+		/*
+		 * in the case where a link was started but does not have anything connected,
+		 * we still need to resume to keep link power up/down sequences balanced.
+		 * This is a no-op if a child device was present, since resuming the child
+		 * device would also resume the parent
+		 */
+		ret = pm_request_resume(dev);
+		if (ret < 0)
+			dev_err(dev, "%s: pm_request_resume failed: %d\n", __func__, ret);
+	}
+
+	return 0;
+}
+
 static int __maybe_unused intel_suspend(struct device *dev)
 {
 	struct sdw_cdns *cdns = dev_get_drvdata(dev);
@@ -1923,6 +1987,7 @@  static int __maybe_unused intel_resume_runtime(struct device *dev)
 }
 
 static const struct dev_pm_ops intel_pm = {
+	.prepare = intel_pm_prepare,
 	SET_SYSTEM_SLEEP_PM_OPS(intel_suspend, intel_resume)
 	SET_RUNTIME_PM_OPS(intel_suspend_runtime, intel_resume_runtime, NULL)
 };