diff mbox series

[v4,3/4] scmi-cpufreq: get opp_shared_cpus from opp-v2 for EM

Message ID 20201202172356.10508-4-nicola.mazzucato@arm.com (mailing list archive)
State New, archived
Delegated to: viresh kumar
Headers show
Series CPUFreq: Add support for opp-sharing cpus | expand

Commit Message

Nicola Mazzucato Dec. 2, 2020, 5:23 p.m. UTC
By design, SCMI performance domains define the granularity of
performance controls, they do not describe any underlying hardware
dependencies (although they may match in many cases).

It is therefore possible to have some platforms where hardware may have
the ability to control CPU performance at different granularity and choose
to describe fine-grained performance control through SCMI.

In such situations, the energy model would be provided with inaccurate
information based on controls, while it still needs to know the
performance boundaries.

To restore correct functionality, retrieve information of CPUs under the
same v/f domain from operating-points-v2 in DT, and pass it on to EM.

Signed-off-by: Nicola Mazzucato <nicola.mazzucato@arm.com>
---
 drivers/cpufreq/scmi-cpufreq.c | 51 +++++++++++++++++++++++-----------
 1 file changed, 35 insertions(+), 16 deletions(-)

Comments

Viresh Kumar Dec. 8, 2020, 5:50 a.m. UTC | #1
On 02-12-20, 17:23, Nicola Mazzucato wrote:
> By design, SCMI performance domains define the granularity of
> performance controls, they do not describe any underlying hardware
> dependencies (although they may match in many cases).
> 
> It is therefore possible to have some platforms where hardware may have
> the ability to control CPU performance at different granularity and choose
> to describe fine-grained performance control through SCMI.
> 
> In such situations, the energy model would be provided with inaccurate
> information based on controls, while it still needs to know the
> performance boundaries.
> 
> To restore correct functionality, retrieve information of CPUs under the
> same v/f domain from operating-points-v2 in DT, and pass it on to EM.
> 
> Signed-off-by: Nicola Mazzucato <nicola.mazzucato@arm.com>
> ---
>  drivers/cpufreq/scmi-cpufreq.c | 51 +++++++++++++++++++++++-----------
>  1 file changed, 35 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c
> index 491a0a24fb1e..f505efcc62b1 100644
> --- a/drivers/cpufreq/scmi-cpufreq.c
> +++ b/drivers/cpufreq/scmi-cpufreq.c
> @@ -127,6 +127,7 @@ static int scmi_cpufreq_init(struct cpufreq_policy *policy)
>  	struct cpufreq_frequency_table *freq_table;
>  	struct em_data_callback em_cb = EM_DATA_CB(scmi_get_cpu_power);
>  	bool power_scale_mw;
> +	cpumask_var_t opp_shared_cpus;
>  
>  	cpu_dev = get_cpu_device(policy->cpu);
>  	if (!cpu_dev) {
> @@ -134,30 +135,45 @@ static int scmi_cpufreq_init(struct cpufreq_policy *policy)
>  		return -ENODEV;
>  	}
>  
> -	ret = handle->perf_ops->device_opps_add(handle, cpu_dev);
> -	if (ret) {
> -		dev_warn(cpu_dev, "failed to add opps to the device\n");
> -		return ret;
> -	}
> +	if (!zalloc_cpumask_var(&opp_shared_cpus, GFP_KERNEL))
> +		return -ENOMEM;
>  
>  	ret = scmi_get_sharing_cpus(cpu_dev, policy->cpus);
>  	if (ret) {
>  		dev_warn(cpu_dev, "failed to get sharing cpumask\n");
> -		return ret;
> +		goto out_free_cpumask;
>  	}
>  
> -	ret = dev_pm_opp_set_sharing_cpus(cpu_dev, policy->cpus);
> -	if (ret) {
> -		dev_err(cpu_dev, "%s: failed to mark OPPs as shared: %d\n",
> -			__func__, ret);
> -		return ret;
> +	/*
> +	 * The OPP 'sharing cpus' info may come from dt through an empty opp
> +	 * table and opp-shared. If found, it takes precedence over the SCMI
> +	 * domain IDs info.
> +	 */
> +	ret = dev_pm_opp_of_get_sharing_cpus(cpu_dev, opp_shared_cpus);
> +	if (ret || !cpumask_weight(opp_shared_cpus)) {
> +		/*
> +		 * Either opp-table is not set or no opp-shared was found,
> +		 * use the information from SCMI domain IDs.
> +		 */
> +		cpumask_copy(opp_shared_cpus, policy->cpus);
>  	}
>  
>  	nr_opp = dev_pm_opp_get_opp_count(cpu_dev);
>  	if (nr_opp <= 0) {
> -		dev_dbg(cpu_dev, "OPP table is not ready, deferring probe\n");
> -		ret = -EPROBE_DEFER;
> -		goto out_free_opp;
> +		ret = handle->perf_ops->device_opps_add(handle, cpu_dev);
> +		if (ret) {
> +			dev_warn(cpu_dev, "failed to add opps to the device\n");
> +			goto out_free_cpumask;
> +		}
> +
> +		ret = dev_pm_opp_set_sharing_cpus(cpu_dev, opp_shared_cpus);
> +		if (ret) {
> +			dev_err(cpu_dev, "%s: failed to mark OPPs as shared: %d\n",
> +				__func__, ret);
> +			goto out_free_cpumask;
> +		}
> +

Why do we need to call above two after calling
dev_pm_opp_get_opp_count() ? And we don't check the return value of
the below call anymore, moreover we have to call it twice now.

> +		nr_opp = dev_pm_opp_get_opp_count(cpu_dev);
>  	}
>  
>  	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
> @@ -191,15 +207,18 @@ static int scmi_cpufreq_init(struct cpufreq_policy *policy)
>  		handle->perf_ops->fast_switch_possible(handle, cpu_dev);
>  
>  	power_scale_mw = handle->perf_ops->power_scale_mw_get(handle);
> -	em_dev_register_perf_domain(cpu_dev, nr_opp, &em_cb, policy->cpus,
> +	em_dev_register_perf_domain(cpu_dev, nr_opp, &em_cb, opp_shared_cpus,
>  				    power_scale_mw);
>  
> -	return 0;
> +	ret = 0;

ret is already 0 here.

> +	goto out_free_cpumask;
>  
>  out_free_priv:
>  	kfree(priv);
>  out_free_opp:
>  	dev_pm_opp_remove_all_dynamic(cpu_dev);
> +out_free_cpumask:
> +	free_cpumask_var(opp_shared_cpus);
>  
>  	return ret;
>  }
> -- 
> 2.27.0
Nicola Mazzucato Dec. 8, 2020, 7:22 a.m. UTC | #2
Hi Viresh,

thanks for looking into this. Please find below

On 12/8/20 5:50 AM, Viresh Kumar wrote:
> On 02-12-20, 17:23, Nicola Mazzucato wrote:
>> By design, SCMI performance domains define the granularity of
>> performance controls, they do not describe any underlying hardware
>> dependencies (although they may match in many cases).
>>
>> It is therefore possible to have some platforms where hardware may have
>> the ability to control CPU performance at different granularity and choose
>> to describe fine-grained performance control through SCMI.
>>
>> In such situations, the energy model would be provided with inaccurate
>> information based on controls, while it still needs to know the
>> performance boundaries.
>>
>> To restore correct functionality, retrieve information of CPUs under the
>> same v/f domain from operating-points-v2 in DT, and pass it on to EM.
>>
>> Signed-off-by: Nicola Mazzucato <nicola.mazzucato@arm.com>
>> ---
>>  drivers/cpufreq/scmi-cpufreq.c | 51 +++++++++++++++++++++++-----------
>>  1 file changed, 35 insertions(+), 16 deletions(-)
>>
>> diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c
>> index 491a0a24fb1e..f505efcc62b1 100644
>> --- a/drivers/cpufreq/scmi-cpufreq.c
>> +++ b/drivers/cpufreq/scmi-cpufreq.c
>> @@ -127,6 +127,7 @@ static int scmi_cpufreq_init(struct cpufreq_policy *policy)
>>  	struct cpufreq_frequency_table *freq_table;
>>  	struct em_data_callback em_cb = EM_DATA_CB(scmi_get_cpu_power);
>>  	bool power_scale_mw;
>> +	cpumask_var_t opp_shared_cpus;
>>  
>>  	cpu_dev = get_cpu_device(policy->cpu);
>>  	if (!cpu_dev) {
>> @@ -134,30 +135,45 @@ static int scmi_cpufreq_init(struct cpufreq_policy *policy)
>>  		return -ENODEV;
>>  	}
>>  
>> -	ret = handle->perf_ops->device_opps_add(handle, cpu_dev);
>> -	if (ret) {
>> -		dev_warn(cpu_dev, "failed to add opps to the device\n");
>> -		return ret;
>> -	}
>> +	if (!zalloc_cpumask_var(&opp_shared_cpus, GFP_KERNEL))
>> +		return -ENOMEM;
>>  
>>  	ret = scmi_get_sharing_cpus(cpu_dev, policy->cpus);
>>  	if (ret) {
>>  		dev_warn(cpu_dev, "failed to get sharing cpumask\n");
>> -		return ret;
>> +		goto out_free_cpumask;
>>  	}
>>  
>> -	ret = dev_pm_opp_set_sharing_cpus(cpu_dev, policy->cpus);
>> -	if (ret) {
>> -		dev_err(cpu_dev, "%s: failed to mark OPPs as shared: %d\n",
>> -			__func__, ret);
>> -		return ret;
>> +	/*
>> +	 * The OPP 'sharing cpus' info may come from dt through an empty opp
>> +	 * table and opp-shared. If found, it takes precedence over the SCMI
>> +	 * domain IDs info.
>> +	 */
>> +	ret = dev_pm_opp_of_get_sharing_cpus(cpu_dev, opp_shared_cpus);
>> +	if (ret || !cpumask_weight(opp_shared_cpus)) {
>> +		/*
>> +		 * Either opp-table is not set or no opp-shared was found,
>> +		 * use the information from SCMI domain IDs.
>> +		 */
>> +		cpumask_copy(opp_shared_cpus, policy->cpus);
>>  	}
>>  
>>  	nr_opp = dev_pm_opp_get_opp_count(cpu_dev);
>>  	if (nr_opp <= 0) {
>> -		dev_dbg(cpu_dev, "OPP table is not ready, deferring probe\n");
>> -		ret = -EPROBE_DEFER;
>> -		goto out_free_opp;
>> +		ret = handle->perf_ops->device_opps_add(handle, cpu_dev);
>> +		if (ret) {
>> +			dev_warn(cpu_dev, "failed to add opps to the device\n");
>> +			goto out_free_cpumask;
>> +		}
>> +
>> +		ret = dev_pm_opp_set_sharing_cpus(cpu_dev, opp_shared_cpus);
>> +		if (ret) {
>> +			dev_err(cpu_dev, "%s: failed to mark OPPs as shared: %d\n",
>> +				__func__, ret);
>> +			goto out_free_cpumask;
>> +		}
>> +
> 
> Why do we need to call above two after calling
> dev_pm_opp_get_opp_count() ?

Sorry, I am not sure I understand your question here. If there are no opps for
a device we want to add them to it; otherwise there is no need, as they would be duplicated.

> And we don't check the return value of
> the below call anymore, moreover we have to call it twice now.

This second get_opp_count is required such that we register em with the correct
opp number after having added them. Without this the opp_count would not be correct.

Hope I have answered your questions.
> 
>> +		nr_opp = dev_pm_opp_get_opp_count(cpu_dev);
>>  	}
>>  
>>  	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
>> @@ -191,15 +207,18 @@ static int scmi_cpufreq_init(struct cpufreq_policy *policy)
>>  		handle->perf_ops->fast_switch_possible(handle, cpu_dev);
>>  
>>  	power_scale_mw = handle->perf_ops->power_scale_mw_get(handle);
>> -	em_dev_register_perf_domain(cpu_dev, nr_opp, &em_cb, policy->cpus,
>> +	em_dev_register_perf_domain(cpu_dev, nr_opp, &em_cb, opp_shared_cpus,
>>  				    power_scale_mw);
>>  
>> -	return 0;
>> +	ret = 0;
> 
> ret is already 0 here.

true, nice spot, thanks

> 
>> +	goto out_free_cpumask;
>>  
>>  out_free_priv:
>>  	kfree(priv);
>>  out_free_opp:
>>  	dev_pm_opp_remove_all_dynamic(cpu_dev);
>> +out_free_cpumask:
>> +	free_cpumask_var(opp_shared_cpus);
>>  
>>  	return ret;
>>  }
>> -- 
>> 2.27.0
>
Viresh Kumar Dec. 8, 2020, 7:26 a.m. UTC | #3
On 08-12-20, 07:22, Nicola Mazzucato wrote:
> On 12/8/20 5:50 AM, Viresh Kumar wrote:
> > On 02-12-20, 17:23, Nicola Mazzucato wrote:
> >>  	nr_opp = dev_pm_opp_get_opp_count(cpu_dev);
> >>  	if (nr_opp <= 0) {
> >> -		dev_dbg(cpu_dev, "OPP table is not ready, deferring probe\n");
> >> -		ret = -EPROBE_DEFER;
> >> -		goto out_free_opp;
> >> +		ret = handle->perf_ops->device_opps_add(handle, cpu_dev);
> >> +		if (ret) {
> >> +			dev_warn(cpu_dev, "failed to add opps to the device\n");
> >> +			goto out_free_cpumask;
> >> +		}
> >> +
> >> +		ret = dev_pm_opp_set_sharing_cpus(cpu_dev, opp_shared_cpus);
> >> +		if (ret) {
> >> +			dev_err(cpu_dev, "%s: failed to mark OPPs as shared: %d\n",
> >> +				__func__, ret);
> >> +			goto out_free_cpumask;
> >> +		}
> >> +
> > 
> > Why do we need to call above two after calling
> > dev_pm_opp_get_opp_count() ?
> 
> Sorry, I am not sure to understand your question here. If there are no opps for
> a device we want to add them to it

Earlier we used to call handle->perf_ops->device_opps_add() and
dev_pm_opp_set_sharing_cpus() before calling dev_pm_opp_get_opp_count(), why is
the order changed now ?

> otherwise no need as they would be duplicated.

I am not sure why they would be duplicated in your case. I thought
device_opps_add() is responsible for dynamically adding the OPPs here.

> > And we don't check the return value of
> > the below call anymore, moreover we have to call it twice now.
> 
> This second get_opp_count is required such that we register em with the correct
> opp number after having added them. Without this the opp_count would not be correct.

What if the count is still 0 ? What about deferred probe we were doing earlier ?
Nicola Mazzucato Dec. 8, 2020, 10:58 a.m. UTC | #4
On 12/8/20 7:26 AM, Viresh Kumar wrote:
> On 08-12-20, 07:22, Nicola Mazzucato wrote:
>> On 12/8/20 5:50 AM, Viresh Kumar wrote:
>>> On 02-12-20, 17:23, Nicola Mazzucato wrote:
>>>>  	nr_opp = dev_pm_opp_get_opp_count(cpu_dev);
>>>>  	if (nr_opp <= 0) {
>>>> -		dev_dbg(cpu_dev, "OPP table is not ready, deferring probe\n");
>>>> -		ret = -EPROBE_DEFER;
>>>> -		goto out_free_opp;
>>>> +		ret = handle->perf_ops->device_opps_add(handle, cpu_dev);
>>>> +		if (ret) {
>>>> +			dev_warn(cpu_dev, "failed to add opps to the device\n");
>>>> +			goto out_free_cpumask;
>>>> +		}
>>>> +
>>>> +		ret = dev_pm_opp_set_sharing_cpus(cpu_dev, opp_shared_cpus);
>>>> +		if (ret) {
>>>> +			dev_err(cpu_dev, "%s: failed to mark OPPs as shared: %d\n",
>>>> +				__func__, ret);
>>>> +			goto out_free_cpumask;
>>>> +		}
>>>> +
>>>
>>> Why do we need to call above two after calling
>>> dev_pm_opp_get_opp_count() ?
>>
>> Sorry, I am not sure to understand your question here. If there are no opps for
>> a device we want to add them to it
> 
> Earlier we used to call handle->perf_ops->device_opps_add() and
> dev_pm_opp_set_sharing_cpus() before calling dev_pm_opp_get_opp_count(), why is
> the order changed now ?

True. The order has changed to take into account the fact that when we have
per-cpu + opp-shared, we don't need to add opps for devices which already have them.

> 
>> otherwise no need as they would be duplicated.
> 
> I am not sure why they would be duplicated in your case. I though
> device_opps_add() is responsible for dynamically adding the OPPs here.

In case of per-cpu + opp-shared, with the "previous order" we would try to add
opps to a device which already has them, in fact attempting to add duplicates.
Nothing wrong with it, but a lot of warnings are thrown.

> 
>>> And we don't check the return value of
>>> the below call anymore, moreover we have to call it twice now.
>>
>> This second get_opp_count is required such that we register em with the correct
>> opp number after having added them. Without this the opp_count would not be correct.
> 
> What if the count is still 0 ? What about deferred probe we were doing earlier ?

My assumption is to rely on the two above to fail if there was something wrong.
For the deferred probe, I am not sure it is still a useful case to have, but I
will let Sudeep have his view also on this.

> 

Thanks Viresh, hope it's a bit more clear now.
Nicola
Viresh Kumar Dec. 8, 2020, 11:01 a.m. UTC | #5
On 08-12-20, 10:58, Nicola Mazzucato wrote:
> 
> 
> On 12/8/20 7:26 AM, Viresh Kumar wrote:
> > On 08-12-20, 07:22, Nicola Mazzucato wrote:
> >> On 12/8/20 5:50 AM, Viresh Kumar wrote:
> >>> On 02-12-20, 17:23, Nicola Mazzucato wrote:
> >>>>  	nr_opp = dev_pm_opp_get_opp_count(cpu_dev);
> >>>>  	if (nr_opp <= 0) {
> >>>> -		dev_dbg(cpu_dev, "OPP table is not ready, deferring probe\n");
> >>>> -		ret = -EPROBE_DEFER;
> >>>> -		goto out_free_opp;
> >>>> +		ret = handle->perf_ops->device_opps_add(handle, cpu_dev);
> >>>> +		if (ret) {
> >>>> +			dev_warn(cpu_dev, "failed to add opps to the device\n");
> >>>> +			goto out_free_cpumask;
> >>>> +		}
> >>>> +
> >>>> +		ret = dev_pm_opp_set_sharing_cpus(cpu_dev, opp_shared_cpus);
> >>>> +		if (ret) {
> >>>> +			dev_err(cpu_dev, "%s: failed to mark OPPs as shared: %d\n",
> >>>> +				__func__, ret);
> >>>> +			goto out_free_cpumask;
> >>>> +		}
> >>>> +
> >>>
> >>> Why do we need to call above two after calling
> >>> dev_pm_opp_get_opp_count() ?
> >>
> >> Sorry, I am not sure to understand your question here. If there are no opps for
> >> a device we want to add them to it
> > 
> > Earlier we used to call handle->perf_ops->device_opps_add() and
> > dev_pm_opp_set_sharing_cpus() before calling dev_pm_opp_get_opp_count(), why is
> > the order changed now ?
> 
> True. The order has changed to take into account the fact that when we have
> per-cpu + opp-shared, we don't need to add opps for devices which already have them.

The opp-shared thing is mostly a dummy thing to get you some information here.
What else has changed here ? I still don't understand why the OPPs would get
added and so the duplicate OPPs messages. Does this already happen ?
Sudeep Holla Dec. 8, 2020, 11:20 a.m. UTC | #6
On Tue, Dec 08, 2020 at 12:56:11PM +0530, Viresh Kumar wrote:
> On 08-12-20, 07:22, Nicola Mazzucato wrote:
> > On 12/8/20 5:50 AM, Viresh Kumar wrote:
> > > On 02-12-20, 17:23, Nicola Mazzucato wrote:
> > >>  	nr_opp = dev_pm_opp_get_opp_count(cpu_dev);
> > >>  	if (nr_opp <= 0) {
> > >> -		dev_dbg(cpu_dev, "OPP table is not ready, deferring probe\n");
> > >> -		ret = -EPROBE_DEFER;
> > >> -		goto out_free_opp;
> > >> +		ret = handle->perf_ops->device_opps_add(handle, cpu_dev);
> > >> +		if (ret) {
> > >> +			dev_warn(cpu_dev, "failed to add opps to the device\n");
> > >> +			goto out_free_cpumask;
> > >> +		}
> > >> +
> > >> +		ret = dev_pm_opp_set_sharing_cpus(cpu_dev, opp_shared_cpus);
> > >> +		if (ret) {
> > >> +			dev_err(cpu_dev, "%s: failed to mark OPPs as shared: %d\n",
> > >> +				__func__, ret);
> > >> +			goto out_free_cpumask;
> > >> +		}
> > >> +
> > >
> > > Why do we need to call above two after calling
> > > dev_pm_opp_get_opp_count() ?
> >
> > Sorry, I am not sure to understand your question here. If there are no opps for
> > a device we want to add them to it
>
> Earlier we used to call handle->perf_ops->device_opps_add() and
> dev_pm_opp_set_sharing_cpus() before calling dev_pm_opp_get_opp_count(), why is
> the order changed now ?
>
> 
> I am not sure why they would be duplicated in your case. I though
> device_opps_add() is responsible for dynamically adding the OPPs here.
> 

It is because of per-CPU vs per domain drama here. Imagine a system with
4 CPUs which the firmware puts in individual domains while they all are
in the same perf domain and hence OPP is marked shared in DT.

Since this probe gets called for all the cpus, we need to skip adding
OPPs for the last 3 (add only for the 1st one and mark the others as shared).
If we attempt to add OPPs on second cpu probe, it *will* shout as duplicate
OPP as we would have already marked it as shared table with the first cpu.
Am I missing anything ? I suggested this as Nicola saw OPP duplicate
warnings when he was hacking up this patch.

> > otherwise no need as they would be duplicated.
> > > And we don't check the return value of
> > > the below call anymore, moreover we have to call it twice now.

Yes, that looks wrong, we need to add the check for non zero values, but ....

> > 
> > This second get_opp_count is required such that we register em with the correct
> > opp number after having added them. Without this the opp_count would not be correct.
>

... I have a question here. Why do you need to call

em_dev_register_perf_domain(cpu_dev, nr_opp, &em_cb, opp_shared_cpus..)

on each CPU ? Why can't that be done once for unique opp_shared_cpus ?

The whole drama of per-CPU vs perf domain is to have energy model and
if feeding it opp_shared_cpus once is not sufficient, then something is
wrong or simply duplicated or just not necessary IMO.

> What if the count is still 0 ? What about deferred probe we were doing earlier ?

OK, you made me think with that question. I think the check was originally
added for deferred probe but then scmi core was changed to add the cpufreq
device only after everything needed is ready. So the condition must never
occur now.
Sudeep Holla Dec. 8, 2020, 11:21 a.m. UTC | #7
On Tue, Dec 08, 2020 at 04:31:48PM +0530, Viresh Kumar wrote:
> On 08-12-20, 10:58, Nicola Mazzucato wrote:
> > 
> > 
> > On 12/8/20 7:26 AM, Viresh Kumar wrote:
> > > On 08-12-20, 07:22, Nicola Mazzucato wrote:
> > >> On 12/8/20 5:50 AM, Viresh Kumar wrote:
> > >>> On 02-12-20, 17:23, Nicola Mazzucato wrote:
> > >>>>  	nr_opp = dev_pm_opp_get_opp_count(cpu_dev);
> > >>>>  	if (nr_opp <= 0) {
> > >>>> -		dev_dbg(cpu_dev, "OPP table is not ready, deferring probe\n");
> > >>>> -		ret = -EPROBE_DEFER;
> > >>>> -		goto out_free_opp;
> > >>>> +		ret = handle->perf_ops->device_opps_add(handle, cpu_dev);
> > >>>> +		if (ret) {
> > >>>> +			dev_warn(cpu_dev, "failed to add opps to the device\n");
> > >>>> +			goto out_free_cpumask;
> > >>>> +		}
> > >>>> +
> > >>>> +		ret = dev_pm_opp_set_sharing_cpus(cpu_dev, opp_shared_cpus);
> > >>>> +		if (ret) {
> > >>>> +			dev_err(cpu_dev, "%s: failed to mark OPPs as shared: %d\n",
> > >>>> +				__func__, ret);
> > >>>> +			goto out_free_cpumask;
> > >>>> +		}
> > >>>> +
> > >>>
> > >>> Why do we need to call above two after calling
> > >>> dev_pm_opp_get_opp_count() ?
> > >>
> > >> Sorry, I am not sure to understand your question here. If there are no opps for
> > >> a device we want to add them to it
> > > 
> > > Earlier we used to call handle->perf_ops->device_opps_add() and
> > > dev_pm_opp_set_sharing_cpus() before calling dev_pm_opp_get_opp_count(), why is
> > > the order changed now ?
> > 
> > True. The order has changed to take into account the fact that when we have
> > per-cpu + opp-shared, we don't need to add opps for devices which already have them.
> 
> The opp-shared thing is mostly a dummy thing to get you some information here.
> What else has changed here ? I still don't understand why the OPPs would get
> added and so the duplicate OPPs messages. Does this already happen ?
> 

Yes, details in my earlier response.
Lukasz Luba Dec. 8, 2020, 11:34 a.m. UTC | #8
On 12/8/20 11:20 AM, Sudeep Holla wrote:
> On Tue, Dec 08, 2020 at 12:56:11PM +0530, Viresh Kumar wrote:
>> On 08-12-20, 07:22, Nicola Mazzucato wrote:
>>> On 12/8/20 5:50 AM, Viresh Kumar wrote:
>>>> On 02-12-20, 17:23, Nicola Mazzucato wrote:
>>>>>   	nr_opp = dev_pm_opp_get_opp_count(cpu_dev);
>>>>>   	if (nr_opp <= 0) {
>>>>> -		dev_dbg(cpu_dev, "OPP table is not ready, deferring probe\n");
>>>>> -		ret = -EPROBE_DEFER;
>>>>> -		goto out_free_opp;
>>>>> +		ret = handle->perf_ops->device_opps_add(handle, cpu_dev);
>>>>> +		if (ret) {
>>>>> +			dev_warn(cpu_dev, "failed to add opps to the device\n");
>>>>> +			goto out_free_cpumask;
>>>>> +		}
>>>>> +
>>>>> +		ret = dev_pm_opp_set_sharing_cpus(cpu_dev, opp_shared_cpus);
>>>>> +		if (ret) {
>>>>> +			dev_err(cpu_dev, "%s: failed to mark OPPs as shared: %d\n",
>>>>> +				__func__, ret);
>>>>> +			goto out_free_cpumask;
>>>>> +		}
>>>>> +
>>>>
>>>> Why do we need to call above two after calling
>>>> dev_pm_opp_get_opp_count() ?
>>>
>>> Sorry, I am not sure to understand your question here. If there are no opps for
>>> a device we want to add them to it
>>
>> Earlier we used to call handle->perf_ops->device_opps_add() and
>> dev_pm_opp_set_sharing_cpus() before calling dev_pm_opp_get_opp_count(), why is
>> the order changed now ?
>>
>>
>> I am not sure why they would be duplicated in your case. I though
>> device_opps_add() is responsible for dynamically adding the OPPs here.
>>
> 
> It is because of per-CPU vs per domain drama here. Imagine a system with
> 4 CPUs which the firmware puts in individual domains while they all are
> in the same perf domain and hence OPP is marked shared in DT.
> 
> Since this probe gets called for all the cpus, we need to skip adding
> OPPs for the last 3(add only for 1st one and mark others as shared).
> If we attempt to add OPPs on second cpu probe, it *will* shout as duplicate
> OPP as we would have already marked it as shared table with the first cpu.
> Am I missing anything ? I suggested this as Nicola saw OPP duplicate
> warnings when he was hacking up this patch.
> 
>>> otherwise no need as they would be duplicated.
>>>> And we don't check the return value of
>>>> the below call anymore, moreover we have to call it twice now.
> 
> Yes, that looks wrong, we need to add the check for non zero values, but ....
> 
>>>
>>> This second get_opp_count is required such that we register em with the correct
>>> opp number after having added them. Without this the opp_count would not be correct.
>>
> 
> ... I have a question here. Why do you need to call
> 
> em_dev_register_perf_domain(cpu_dev, nr_opp, &em_cb, opp_shared_cpus..)
> 
> on each CPU ? Why can't that be done once for unique opp_shared_cpus ?

It just has to be called once, for one CPU from the mask. Otherwise for
the next CPUs you should see error:
"EM: exists for CPU%d"
It can happen that this print is not seen when the get_cpu_device(cpu)
failed, but that would lead to investigation why CPU devices are not
there yet.

Nicola: have you seen that print?
Sudeep Holla Dec. 8, 2020, 12:22 p.m. UTC | #9
On Tue, Dec 08, 2020 at 11:34:36AM +0000, Lukasz Luba wrote:
> 
> 
> On 12/8/20 11:20 AM, Sudeep Holla wrote:
> > On Tue, Dec 08, 2020 at 12:56:11PM +0530, Viresh Kumar wrote:
> > > On 08-12-20, 07:22, Nicola Mazzucato wrote:
> > > > On 12/8/20 5:50 AM, Viresh Kumar wrote:
> > > > > On 02-12-20, 17:23, Nicola Mazzucato wrote:
> > > > > >   	nr_opp = dev_pm_opp_get_opp_count(cpu_dev);
> > > > > >   	if (nr_opp <= 0) {
> > > > > > -		dev_dbg(cpu_dev, "OPP table is not ready, deferring probe\n");
> > > > > > -		ret = -EPROBE_DEFER;
> > > > > > -		goto out_free_opp;
> > > > > > +		ret = handle->perf_ops->device_opps_add(handle, cpu_dev);
> > > > > > +		if (ret) {
> > > > > > +			dev_warn(cpu_dev, "failed to add opps to the device\n");
> > > > > > +			goto out_free_cpumask;
> > > > > > +		}
> > > > > > +
> > > > > > +		ret = dev_pm_opp_set_sharing_cpus(cpu_dev, opp_shared_cpus);
> > > > > > +		if (ret) {
> > > > > > +			dev_err(cpu_dev, "%s: failed to mark OPPs as shared: %d\n",
> > > > > > +				__func__, ret);
> > > > > > +			goto out_free_cpumask;
> > > > > > +		}
> > > > > > +
> > > > > 
> > > > > Why do we need to call above two after calling
> > > > > dev_pm_opp_get_opp_count() ?
> > > > 
> > > > Sorry, I am not sure to understand your question here. If there are no opps for
> > > > a device we want to add them to it
> > > 
> > > Earlier we used to call handle->perf_ops->device_opps_add() and
> > > dev_pm_opp_set_sharing_cpus() before calling dev_pm_opp_get_opp_count(), why is
> > > the order changed now ?
> > > 
> > > 
> > > I am not sure why they would be duplicated in your case. I though
> > > device_opps_add() is responsible for dynamically adding the OPPs here.
> > > 
> > 
> > It is because of per-CPU vs per domain drama here. Imagine a system with
> > 4 CPUs which the firmware puts in individual domains while they all are
> > in the same perf domain and hence OPP is marked shared in DT.
> > 
> > Since this probe gets called for all the cpus, we need to skip adding
> > OPPs for the last 3(add only for 1st one and mark others as shared).
> > If we attempt to add OPPs on second cpu probe, it *will* shout as duplicate
> > OPP as we would have already marked it as shared table with the first cpu.
> > Am I missing anything ? I suggested this as Nicola saw OPP duplicate
> > warnings when he was hacking up this patch.
> > 
> > > > otherwise no need as they would be duplicated.
> > > > > And we don't check the return value of
> > > > > the below call anymore, moreover we have to call it twice now.
> > 
> > Yes, that looks wrong, we need to add the check for non zero values, but ....
> > 
> > > > 
> > > > This second get_opp_count is required such that we register em with the correct
> > > > opp number after having added them. Without this the opp_count would not be correct.
> > > 
> > 
> > ... I have a question here. Why do you need to call
> > 
> > em_dev_register_perf_domain(cpu_dev, nr_opp, &em_cb, opp_shared_cpus..)
> > 
> > on each CPU ? Why can't that be done once for unique opp_shared_cpus ?
> 
> It just have to be called once, for one CPU from the mask. Otherwise for
> the next CPUs you should see error:
> "EM: exists for CPU%d"

OK cool, at least it is designed and expected to be used like I thought.
Ah, I might have seen those, but never thought it was an error message
Nicola Mazzucato Dec. 8, 2020, 1:17 p.m. UTC | #10
Hi All, thanks for your feedback, please see below

On 12/8/20 12:22 PM, Sudeep Holla wrote:
> On Tue, Dec 08, 2020 at 11:34:36AM +0000, Lukasz Luba wrote:
>>
>>
>> On 12/8/20 11:20 AM, Sudeep Holla wrote:
>>> On Tue, Dec 08, 2020 at 12:56:11PM +0530, Viresh Kumar wrote:
>>>> On 08-12-20, 07:22, Nicola Mazzucato wrote:
>>>>> On 12/8/20 5:50 AM, Viresh Kumar wrote:
>>>>>> On 02-12-20, 17:23, Nicola Mazzucato wrote:
>>>>>>>   	nr_opp = dev_pm_opp_get_opp_count(cpu_dev);
>>>>>>>   	if (nr_opp <= 0) {
>>>>>>> -		dev_dbg(cpu_dev, "OPP table is not ready, deferring probe\n");
>>>>>>> -		ret = -EPROBE_DEFER;
>>>>>>> -		goto out_free_opp;
>>>>>>> +		ret = handle->perf_ops->device_opps_add(handle, cpu_dev);
>>>>>>> +		if (ret) {
>>>>>>> +			dev_warn(cpu_dev, "failed to add opps to the device\n");
>>>>>>> +			goto out_free_cpumask;
>>>>>>> +		}
>>>>>>> +
>>>>>>> +		ret = dev_pm_opp_set_sharing_cpus(cpu_dev, opp_shared_cpus);
>>>>>>> +		if (ret) {
>>>>>>> +			dev_err(cpu_dev, "%s: failed to mark OPPs as shared: %d\n",
>>>>>>> +				__func__, ret);
>>>>>>> +			goto out_free_cpumask;
>>>>>>> +		}
>>>>>>> +
>>>>>>
>>>>>> Why do we need to call above two after calling
>>>>>> dev_pm_opp_get_opp_count() ?
>>>>>
>>>>> Sorry, I am not sure to understand your question here. If there are no opps for
>>>>> a device we want to add them to it
>>>>
>>>> Earlier we used to call handle->perf_ops->device_opps_add() and
>>>> dev_pm_opp_set_sharing_cpus() before calling dev_pm_opp_get_opp_count(), why is
>>>> the order changed now ?
>>>>
>>>>
>>>> I am not sure why they would be duplicated in your case. I though
>>>> device_opps_add() is responsible for dynamically adding the OPPs here.
>>>>
>>>
>>> It is because of per-CPU vs per domain drama here. Imagine a system with
>>> 4 CPUs which the firmware puts in individual domains while they all are
>>> in the same perf domain and hence OPP is marked shared in DT.
>>>
>>> Since this probe gets called for all the cpus, we need to skip adding
>>> OPPs for the last 3(add only for 1st one and mark others as shared).
>>> If we attempt to add OPPs on second cpu probe, it *will* shout as duplicate
>>> OPP as we would have already marked it as shared table with the first cpu.
>>> Am I missing anything ? I suggested this as Nicola saw OPP duplicate
>>> warnings when he was hacking up this patch.
>>>
>>>>> otherwise no need as they would be duplicated.
>>>>>> And we don't check the return value of
>>>>>> the below call anymore, moreover we have to call it twice now.
>>>
>>> Yes, that looks wrong, we need to add the check for non zero values, but ....

will add the check, thanks

>>>
>>>>>
>>>>> This second get_opp_count is required such that we register em with the correct
>>>>> opp number after having added them. Without this the opp_count would not be correct.
>>>>
>>>
>>> ... I have a question here. Why do you need to call
>>>
>>> em_dev_register_perf_domain(cpu_dev, nr_opp, &em_cb, opp_shared_cpus..)
>>>
>>> on each CPU ? Why can't that be done once for unique opp_shared_cpus ?

I left it untouched to reduce changes, but I see your point.

>>
>> It just have to be called once, for one CPU from the mask. Otherwise for
>> the next CPUs you should see error:
>> "EM: exists for CPU%d"
> 
> OK cool, at least it is designed and expected to be used like I thought.
> Ah, I might have seen those, but never thought it was error message 
Viresh Kumar Dec. 9, 2020, 5:45 a.m. UTC | #11
On 08-12-20, 11:20, Sudeep Holla wrote:
> It is because of per-CPU vs per domain drama here. Imagine a system with
> 4 CPUs which the firmware puts in individual domains while they all are
> in the same perf domain and hence OPP is marked shared in DT.
> 
> Since this probe gets called for all the cpus, we need to skip adding
> OPPs for the last 3(add only for 1st one and mark others as shared).

Okay and this wasn't happening before this series because the firmware
was only returning the current CPU from scmi_get_sharing_cpus() ?

Is this driver also used for the cases where we have multiple CPUs in
a policy ? Otherwise we won't be required to call
dev_pm_opp_set_sharing_cpus().

So I assume that we want to support both the cases here ?

> If we attempt to add OPPs on second cpu probe, it *will* shout as duplicate
> OPP as we would have already marked it as shared table with the first cpu.
> Am I missing anything ? I suggested this as Nicola saw OPP duplicate
> warnings when he was hacking up this patch.

The common stuff (for all the CPUs) is better moved to probe() in this
case, instead of the ->init() callback. Otherwise it will always be
messy. You can initialize the OPP and cpufreq tables in probe()
itself, save the pointer somewhere and then just use it here in
->init().

Also do EM registration from there.

> > > otherwise no need as they would be duplicated.
> > > > And we don't check the return value of
> > > > the below call anymore, moreover we have to call it twice now.
> 
> Yes, that looks wrong, we need to add the check for non zero values, but ....
> 
> > > 
> > > This second get_opp_count is required such that we register em with the correct
> > > opp number after having added them. Without this the opp_count would not be correct.
> >
> 
> ... I have a question here. Why do you need to call
> 
> em_dev_register_perf_domain(cpu_dev, nr_opp, &em_cb, opp_shared_cpus..)
> 
> on each CPU ? Why can't that be done once for unique opp_shared_cpus ?
> 
> The whole drama of per-CPU vs perf domain is to have energy model and
> if feeding it opp_shared_cpus once is not sufficient, then something is
> wrong or simply duplicated or just not necessary IMO.
> 
> > What if the count is still 0 ? What about deferred probe we were doing earlier ?
> 
> OK, you made me think with that question. I think the check was original
> added for deferred probe but then scmi core was changed to add the cpufreq
> device only after everything needed is ready. So the condition must never
> occur now.

The deferred probe shall be handled in a different patch in that case.

Nicola, please break the patch into multiple patches, with one patch
dealing only with one task.
Nicola Mazzucato Dec. 9, 2020, 9:20 a.m. UTC | #12
Hi both,

thanks for looking into this.

On 12/9/20 5:45 AM, Viresh Kumar wrote:
> On 08-12-20, 11:20, Sudeep Holla wrote:
>> It is because of per-CPU vs per domain drama here. Imagine a system with
>> 4 CPUs which the firmware puts in individual domains while they all are
>> in the same perf domain and hence OPP is marked shared in DT.
>>
>> Since this probe gets called for all the cpus, we need to skip adding
>> OPPs for the last 3(add only for 1st one and mark others as shared).
> 
> Okay and this wasn't happening before this series because the firmware
> was only returning the current CPU from scmi_get_sharing_cpus() ?

yes

> 
> Is this driver also used for the cases where we have multiple CPUs in
> a policy ? Otherwise we won't be required to call
> dev_pm_opp_set_sharing_cpus().
> 
> So I assume that we want to support both the cases here ?

yes, we want to support existing platforms (n cpus in a policy) + the per-cpu case.

> 
>> If we attempt to add OPPs on second cpu probe, it *will* shout as duplicate
>> OPP as we would have already marked it as shared table with the first cpu.
>> Am I missing anything ? I suggested this as Nicola saw OPP duplicate
>> warnings when he was hacking up this patch.
> 
> The common stuff (for all the CPUs) is better moved to probe() in this
> case, instead of the ->init() callback. Otherwise it will always be
> messy. You can initialize the OPP and cpufreq tables in probe()
> itself, save the pointer somewhere and then just use it here in
> ->init().
> 
> Also do EM registration from there.
>

ok, will rework

>>>> otherwise no need as they would be duplicated.
>>>>> And we don't check the return value of
>>>>> the below call anymore, moreover we have to call it twice now.
>>
>> Yes, that looks wrong, we need to add the check for non zero values, but ....
>>
>>>>
>>>> This second get_opp_count is required such that we register em with the correct
>>>> opp number after having added them. Without this the opp_count would not be correct.
>>>
>>
>> ... I have a question here. Why do you need to call
>>
>> em_dev_register_perf_domain(cpu_dev, nr_opp, &em_cb, opp_shared_cpus..)
>>
>> on each CPU ? Why can't that be done once for unique opp_shared_cpus ?
>>
>> The whole drama of per-CPU vs perf domain is to have energy model and
>> if feeding it opp_shared_cpus once is not sufficient, then something is
>> wrong or simply duplicated or just not necessary IMO.
>>
>>> What if the count is still 0 ? What about deferred probe we were doing earlier ?
>>
>> OK, you made me think with that question. I think the check was original
>> added for deferred probe but then scmi core was changed to add the cpufreq
>> device only after everything needed is ready. So the condition must never
>> occur now.
> 
> The deferred probe shall be handled in a different patch in that case.
> 
> Nicola, please break the patch into multiple patches, with one patch
> dealing only with one task.

Sure, I had that doubt; thanks for confirming. Will do, thanks.

> 

Cheers,
Nicola
Sudeep Holla Dec. 9, 2020, 9:41 a.m. UTC | #13
On Wed, Dec 09, 2020 at 11:15:02AM +0530, Viresh Kumar wrote:
> On 08-12-20, 11:20, Sudeep Holla wrote:
> > It is because of per-CPU vs per domain drama here. Imagine a system with
> > 4 CPUs which the firmware puts in individual domains while they all are
> > in the same perf domain and hence OPP is marked shared in DT.
> >
> > Since this probe gets called for all the cpus, we need to skip adding
> > OPPs for the last 3(add only for 1st one and mark others as shared).
>
> Okay and this wasn't happening before this series because the firmware
> was only returning the current CPU from scmi_get_sharing_cpus() ?
>
> Is this driver also used for the cases where we have multiple CPUs in
> a policy ? Otherwise we won't be required to call
> dev_pm_opp_set_sharing_cpus().
>
> So I assume that we want to support both the cases here ?
>

Yes indeed, it completely depends on the granularity at which the firmware
provides performance control. It could be individual CPUs, a pair of CPUs
(or all the threads in a core), or a subset of CPUs in the performance
domain. The subset could be the full set.

> > If we attempt to add OPPs on second cpu probe, it *will* shout as duplicate
> > OPP as we would have already marked it as shared table with the first cpu.
> > Am I missing anything ? I suggested this as Nicola saw OPP duplicate
> > warnings when he was hacking up this patch.
>
> The common stuff (for all the CPUs) is better moved to probe() in this
> case, instead of the ->init() callback. Otherwise it will always be
> messy. You can initialize the OPP and cpufreq tables in probe()
> itself, save the pointer somewhere and then just use it here in
> ->init().
>
> Also do EM registration from there.
>

Makes sense.

--
Regards,
Sudeep
diff mbox series

Patch

diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c
index 491a0a24fb1e..f505efcc62b1 100644
--- a/drivers/cpufreq/scmi-cpufreq.c
+++ b/drivers/cpufreq/scmi-cpufreq.c
@@ -127,6 +127,7 @@  static int scmi_cpufreq_init(struct cpufreq_policy *policy)
 	struct cpufreq_frequency_table *freq_table;
 	struct em_data_callback em_cb = EM_DATA_CB(scmi_get_cpu_power);
 	bool power_scale_mw;
+	cpumask_var_t opp_shared_cpus;
 
 	cpu_dev = get_cpu_device(policy->cpu);
 	if (!cpu_dev) {
@@ -134,30 +135,45 @@  static int scmi_cpufreq_init(struct cpufreq_policy *policy)
 		return -ENODEV;
 	}
 
-	ret = handle->perf_ops->device_opps_add(handle, cpu_dev);
-	if (ret) {
-		dev_warn(cpu_dev, "failed to add opps to the device\n");
-		return ret;
-	}
+	if (!zalloc_cpumask_var(&opp_shared_cpus, GFP_KERNEL))
+		return -ENOMEM;
 
 	ret = scmi_get_sharing_cpus(cpu_dev, policy->cpus);
 	if (ret) {
 		dev_warn(cpu_dev, "failed to get sharing cpumask\n");
-		return ret;
+		goto out_free_cpumask;
 	}
 
-	ret = dev_pm_opp_set_sharing_cpus(cpu_dev, policy->cpus);
-	if (ret) {
-		dev_err(cpu_dev, "%s: failed to mark OPPs as shared: %d\n",
-			__func__, ret);
-		return ret;
+	/*
+	 * The OPP 'sharing cpus' info may come from dt through an empty opp
+	 * table and opp-shared. If found, it takes precedence over the SCMI
+	 * domain IDs info.
+	 */
+	ret = dev_pm_opp_of_get_sharing_cpus(cpu_dev, opp_shared_cpus);
+	if (ret || !cpumask_weight(opp_shared_cpus)) {
+		/*
+		 * Either opp-table is not set or no opp-shared was found,
+		 * use the information from SCMI domain IDs.
+		 */
+		cpumask_copy(opp_shared_cpus, policy->cpus);
 	}
 
 	nr_opp = dev_pm_opp_get_opp_count(cpu_dev);
 	if (nr_opp <= 0) {
-		dev_dbg(cpu_dev, "OPP table is not ready, deferring probe\n");
-		ret = -EPROBE_DEFER;
-		goto out_free_opp;
+		ret = handle->perf_ops->device_opps_add(handle, cpu_dev);
+		if (ret) {
+			dev_warn(cpu_dev, "failed to add opps to the device\n");
+			goto out_free_cpumask;
+		}
+
+		ret = dev_pm_opp_set_sharing_cpus(cpu_dev, opp_shared_cpus);
+		if (ret) {
+			dev_err(cpu_dev, "%s: failed to mark OPPs as shared: %d\n",
+				__func__, ret);
+			goto out_free_cpumask;
+		}
+
+		nr_opp = dev_pm_opp_get_opp_count(cpu_dev);
 	}
 
 	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
@@ -191,15 +207,18 @@  static int scmi_cpufreq_init(struct cpufreq_policy *policy)
 		handle->perf_ops->fast_switch_possible(handle, cpu_dev);
 
 	power_scale_mw = handle->perf_ops->power_scale_mw_get(handle);
-	em_dev_register_perf_domain(cpu_dev, nr_opp, &em_cb, policy->cpus,
+	em_dev_register_perf_domain(cpu_dev, nr_opp, &em_cb, opp_shared_cpus,
 				    power_scale_mw);
 
-	return 0;
+	ret = 0;
+	goto out_free_cpumask;
 
 out_free_priv:
 	kfree(priv);
 out_free_opp:
 	dev_pm_opp_remove_all_dynamic(cpu_dev);
+out_free_cpumask:
+	free_cpumask_var(opp_shared_cpus);
 
 	return ret;
 }