diff mbox

[v3,1/2] cpufreq: Don't destroy/realloc policy/sysfs on hotplug/suspend

Message ID 1405464473-3916-2-git-send-email-skannan@codeaurora.org (mailing list archive)
State Not Applicable, archived
Headers show

Commit Message

Saravana Kannan July 15, 2014, 10:47 p.m. UTC
The CPUfreq core moves the cpufreq policy ownership between CPUs when CPUs
within a cluster (CPUs sharing same policy) go ONLINE/OFFLINE. When moving
policy ownership between CPUs, it also moves the cpufreq sysfs directory
between CPUs and also fixes up the symlinks of the other CPUs in the
cluster.

Also, when all the CPUs in a cluster go OFFLINE, all the sysfs nodes and
directories are deleted, the kobject is released and the policy is freed.
And when the first CPU in a cluster comes up, the policy is reallocated and
initialized, kobject is acquired, the sysfs nodes are created or symlinked,
etc.

All these steps end up creating unnecessarily complicated code and locking.
There's no real benefit to adding/removing/moving the sysfs nodes and the
policy between CPUs. Other per CPU sysfs directories like power and cpuidle
are left alone during hotplug. So there's some precedence to what this
patch is trying to do.

This patch simplifies a lot of the code and locking by removing the
adding/removing/moving of policy/sysfs/kobj and just leaves the cpufreq
directory and policy in place irrespective of whether the CPUs are
ONLINE/OFFLINE.

Leaving the policy, sysfs and kobject in place also brings these additional
benefits:
* Faster suspend/resume
* Faster hotplug
* Sysfs file permissions maintained across hotplug
* Policy settings and governor tunables maintained across hotplug
* Cpufreq stats would be maintained across hotplug for all CPUs and can be
  queried even after CPU goes OFFLINE

Tested-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Saravana Kannan <skannan@codeaurora.org>
---
 drivers/cpufreq/cpufreq.c | 388 +++++++++++++---------------------------------
 1 file changed, 107 insertions(+), 281 deletions(-)

Comments

Saravana Kannan July 16, 2014, 12:28 a.m. UTC | #1
One preemptive comment.

On 07/15/2014 03:47 PM, Saravana Kannan wrote:
> The CPUfreq core moves the cpufreq policy ownership between CPUs when CPUs
> within a cluster (CPUs sharing same policy) go ONLINE/OFFLINE. When moving
> policy ownership between CPUs, it also moves the cpufreq sysfs directory
> between CPUs and also fixes up the symlinks of the other CPUs in the
> cluster.
>
> Also, when all the CPUs in a cluster go OFFLINE, all the sysfs nodes and
> directories are deleted, the kobject is released and the policy is freed.
> And when the first CPU in a cluster comes up, the policy is reallocated and
> initialized, kobject is acquired, the sysfs nodes are created or symlinked,
> etc.
>
> All these steps end up creating unnecessarily complicated code and locking.
> There's no real benefit to adding/removing/moving the sysfs nodes and the
> policy between CPUs. Other per CPU sysfs directories like power and cpuidle
> are left alone during hotplug. So there's some precedence to what this
> patch is trying to do.
>
> This patch simplifies a lot of the code and locking by removing the
> adding/removing/moving of policy/sysfs/kobj and just leaves the cpufreq
> directory and policy in place irrespective of whether the CPUs are
> ONLINE/OFFLINE.
>
> Leaving the policy, sysfs and kobject in place also brings these additional
> benefits:
> * Faster suspend/resume
> * Faster hotplug
> * Sysfs file permissions maintained across hotplug
> * Policy settings and governor tunables maintained across hotplug
> * Cpufreq stats would be maintained across hotplug for all CPUs and can be
>    queried even after CPU goes OFFLINE
>
> Tested-by: Stephen Boyd <sboyd@codeaurora.org>
> Signed-off-by: Saravana Kannan <skannan@codeaurora.org>
> ---
>   drivers/cpufreq/cpufreq.c | 388 +++++++++++++---------------------------------
>   1 file changed, 107 insertions(+), 281 deletions(-)
>
> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> index 62259d2..a0a2ec2 100644
> --- a/drivers/cpufreq/cpufreq.c
> +++ b/drivers/cpufreq/cpufreq.c

<SNIP>

> @@ -961,60 +967,58 @@ static void cpufreq_init_policy(struct cpufreq_policy *policy)
>   }
>
>   #ifdef CONFIG_HOTPLUG_CPU
> -static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy,
> -				  unsigned int cpu, struct device *dev)
> +static int cpufreq_change_policy_cpus(struct cpufreq_policy *policy,
> +				  unsigned int cpu, bool add)
>   {
>   	int ret = 0;
> -	unsigned long flags;
> +	unsigned int cpus, pcpu;
>
> -	if (has_target()) {
> +	down_write(&policy->rwsem);
> +
> +	cpus = !cpumask_empty(policy->cpus);
> +	if (has_target() && cpus) {
>   		ret = __cpufreq_governor(policy, CPUFREQ_GOV_STOP);
>   		if (ret) {
>   			pr_err("%s: Failed to stop governor\n", __func__);
> -			return ret;
> +			goto unlock;
>   		}
>   	}
>

<SNIP>

> +	if (add)
> +		cpumask_set_cpu(cpu, policy->cpus);
> +	else
> +		cpumask_clear_cpu(cpu, policy->cpus);
>
> -	up_write(&policy->rwsem);
> +	pcpu = cpumask_first(policy->cpus);
> +	if (pcpu < nr_cpu_ids && policy->cpu != pcpu) {
> +		policy->last_cpu = policy->cpu;
> +		policy->cpu = pcpu;
> +		blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
> +					CPUFREQ_UPDATE_POLICY_CPU, policy);
> +	}
>
> -	if (has_target()) {
> +	cpus = !cpumask_empty(policy->cpus);
> +	if (has_target() && cpus) {
>   		ret = __cpufreq_governor(policy, CPUFREQ_GOV_START);
>   		if (!ret)
>   			ret = __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
>
>   		if (ret) {
>   			pr_err("%s: Failed to start governor\n", __func__);
> -			return ret;
> +			goto unlock;
>   		}
>   	}
>

<SNIP>

> +	if (!cpus && cpufreq_driver->stop_cpu && cpufreq_driver->setpolicy) {
> +		cpufreq_driver->stop_cpu(policy);
> +	}
>

Viresh, I tried your suggestion (and my initial thought too) to combine 
this as an if/else with the previous if. But the indentation got nasty 
and made it hard to read. I'm sure the compiler will optimize it. So, I 
would prefer to leave it this way.


> -	policy->governor = NULL;
> +unlock:
> +	up_write(&policy->rwsem);
>
> -	return policy;
> +	return ret;
>   }
> +#endif
>

-Saravana
Viresh Kumar July 16, 2014, 8:24 a.m. UTC | #2
On 16 July 2014 04:17, Saravana Kannan <skannan@codeaurora.org> wrote:
> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c

> +/* symlink related CPUs */
> +static int cpufreq_dev_symlink(struct cpufreq_policy *policy, bool add)
>  {
> -       unsigned int j;
> +       unsigned int j, first_cpu = cpumask_first(policy->related_cpus);

The CPU which came first should get the ownership by default, instead
of the first one in the mask.

Normally at boot, all CPUs come up first and then only cpufreq init starts.
But in case all other CPUs fail to come up, then policy->cpu *might* point
to a failed cpu.

And so, we should simply use policy->cpu here instead of finding the
first one in the mask.

Also, its not the duty of this routine to find which one is the policy cpu as
that is done by __cpufreq_add_dev(). And so in case we need to make
first cpu of a mask as policy->cpu, it should be done in __cpufreq_add_dev()
and not here. This one should just follow the orders :)

@Srivatsa: What happens to the sysfs directory if a CPU fails to come up?
Is it exactly similar to how it happens in hotplug? i.e. we do have a directory
there?

>         int ret = 0;
>
> -       for_each_cpu(j, policy->cpus) {
> +       for_each_cpu(j, policy->related_cpus) {
>                 struct device *cpu_dev;
>
> -               if (j == policy->cpu)
> +               if (j == first_cpu)
>                         continue;
>
> -               pr_debug("Adding link for CPU: %u\n", j);

Keep this please, it might be useful while debugging.

>                 cpu_dev = get_cpu_device(j);
> -               ret = sysfs_create_link(&cpu_dev->kobj, &policy->kobj,
> -                                       "cpufreq");
> +               if (add)
> +                       ret = sysfs_create_link(&cpu_dev->kobj, &policy->kobj,
> +                                               "cpufreq");
> +               else
> +                       sysfs_remove_link(&cpu_dev->kobj, "cpufreq");
> +
>                 if (ret)
>                         break;
>         }
>         return ret;
>  }
>
> -static int cpufreq_add_dev_interface(struct cpufreq_policy *policy,
> -                                    struct device *dev)
> +static int cpufreq_add_dev_interface(struct cpufreq_policy *policy)
>  {
>         struct freq_attr **drv_attr;
> +       struct device *dev;
>         int ret = 0;
>
> +       dev = get_cpu_device(cpumask_first(policy->related_cpus));
> +       if (!dev)
> +               return -EINVAL;

Again, deciding which cpu is policy->cpu here is wrong. Just follow
orders of __cpufreq_add_dev().

>         /* prepare interface data */
>         ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq,
>                                    &dev->kobj, "cpufreq");
> @@ -917,7 +923,7 @@ static int cpufreq_add_dev_interface(struct cpufreq_policy *policy,
>                         goto err_out_kobj_put;
>         }
>
> -       ret = cpufreq_add_dev_symlink(policy);
> +       ret = cpufreq_dev_symlink(policy, true);
>         if (ret)
>                 goto err_out_kobj_put;
>
> @@ -961,60 +967,58 @@ static void cpufreq_init_policy(struct cpufreq_policy *policy)
>  }
>
>  #ifdef CONFIG_HOTPLUG_CPU

@Srivatsa: I will try this but you also take care of this. These
ifdefs might go wrong,
i.e. we are surely using it in the current patch without HOTPLUG as well. See
cpufreq_add_dev()..

Also, how does suspend/resume work without CONFIG_HOTPLUG_CPU ?
What's the sequence of events?

> -static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy,
> -                                 unsigned int cpu, struct device *dev)
> +static int cpufreq_change_policy_cpus(struct cpufreq_policy *policy,
> +                                 unsigned int cpu, bool add)
>  {
>         int ret = 0;
> -       unsigned long flags;
> +       unsigned int cpus, pcpu;
>
> -       if (has_target()) {
> +       down_write(&policy->rwsem);
> +
> +       cpus = !cpumask_empty(policy->cpus);

We aren't using cpus at multiple places and so probably it would
be better to using cpumask_empty() directly.

> +       if (has_target() && cpus) {

I may get the answer later in reviews, but when will cpus be 0 here?
Probably for non-boot cluster during suspend/resume, or forceful
hotplugging off all CPUs of a cluster. Right?

>                 ret = __cpufreq_governor(policy, CPUFREQ_GOV_STOP);
>                 if (ret) {
>                         pr_err("%s: Failed to stop governor\n", __func__);
> -                       return ret;
> +                       goto unlock;
>                 }
>         }
>
> -       down_write(&policy->rwsem);
> -
> -       write_lock_irqsave(&cpufreq_driver_lock, flags);
> -
> -       cpumask_set_cpu(cpu, policy->cpus);
> -       per_cpu(cpufreq_cpu_data, cpu) = policy;
> -       write_unlock_irqrestore(&cpufreq_driver_lock, flags);
> +       if (add)
> +               cpumask_set_cpu(cpu, policy->cpus);
> +       else
> +               cpumask_clear_cpu(cpu, policy->cpus);
>
> -       up_write(&policy->rwsem);
> +       pcpu = cpumask_first(policy->cpus);
> +       if (pcpu < nr_cpu_ids && policy->cpu != pcpu) {

No, we don't have to consider changing policy->cpu for every change
in policy->cpus. We need to do that only when policy->cpu goes down.

Also pcpu can't be < nr_cpu_ids, right?

> +               policy->last_cpu = policy->cpu;
> +               policy->cpu = pcpu;
> +               blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
> +                                       CPUFREQ_UPDATE_POLICY_CPU, policy);
> +       }
>
> -       if (has_target()) {
> +       cpus = !cpumask_empty(policy->cpus);
> +       if (has_target() && cpus) {
>                 ret = __cpufreq_governor(policy, CPUFREQ_GOV_START);
>                 if (!ret)
>                         ret = __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
>
>                 if (ret) {
>                         pr_err("%s: Failed to start governor\n", __func__);
> -                       return ret;
> +                       goto unlock;
>                 }
>         }
>
> -       return sysfs_create_link(&dev->kobj, &policy->kobj, "cpufreq");
> -}
> -#endif
> -
> -static struct cpufreq_policy *cpufreq_policy_restore(unsigned int cpu)
> -{
> -       struct cpufreq_policy *policy;
> -       unsigned long flags;
> -
> -       read_lock_irqsave(&cpufreq_driver_lock, flags);
> -
> -       policy = per_cpu(cpufreq_cpu_data_fallback, cpu);
> -
> -       read_unlock_irqrestore(&cpufreq_driver_lock, flags);
> +       if (!cpus && cpufreq_driver->stop_cpu && cpufreq_driver->setpolicy) {

As I commented on V1, please make it else part of above if..

> +               cpufreq_driver->stop_cpu(policy);
> +       }
>
> -       policy->governor = NULL;
> +unlock:
> +       up_write(&policy->rwsem);
>
> -       return policy;
> +       return ret;
>  }
> +#endif
>
>  static struct cpufreq_policy *cpufreq_policy_alloc(void)
>  {
> @@ -1053,10 +1057,8 @@ static void cpufreq_policy_put_kobj(struct cpufreq_policy *policy)
>         blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
>                         CPUFREQ_REMOVE_POLICY, policy);
>
> -       down_read(&policy->rwsem);
>         kobj = &policy->kobj;
>         cmp = &policy->kobj_unregister;
> -       up_read(&policy->rwsem);

Why? And also, these are unrelated changes and must be added as separate
commits.

>         kobject_put(kobj);
>
>         /*
> @@ -1076,32 +1078,12 @@ static void cpufreq_policy_free(struct cpufreq_policy *policy)
>         kfree(policy);
>  }
>
> -static void update_policy_cpu(struct cpufreq_policy *policy, unsigned int cpu)
> -{
> -       if (WARN_ON(cpu == policy->cpu))
> -               return;
> -
> -       down_write(&policy->rwsem);
> -
> -       policy->last_cpu = policy->cpu;
> -       policy->cpu = cpu;
> -
> -       up_write(&policy->rwsem);
> -
> -       blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
> -                       CPUFREQ_UPDATE_POLICY_CPU, policy);
> -}
> -
>  static int __cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
>  {
>         unsigned int j, cpu = dev->id;
>         int ret = -ENOMEM;
>         struct cpufreq_policy *policy;
>         unsigned long flags;
> -       bool recover_policy = cpufreq_suspended;
> -#ifdef CONFIG_HOTPLUG_CPU
> -       struct cpufreq_policy *tpolicy;
> -#endif
>
>         if (cpu_is_offline(cpu))
>                 return 0;
> @@ -1110,9 +1092,10 @@ static int __cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
>
>  #ifdef CONFIG_SMP
>         /* check whether a different CPU already registered this
> -        * CPU because it is in the same boat. */
> +        * CPU because it is one of the related CPUs. */
>         policy = cpufreq_cpu_get(cpu);
> -       if (unlikely(policy)) {
> +       if (policy) {
> +               cpufreq_change_policy_cpus(policy, cpu, true);

This is just a waste of time at boot as ... (see below)

>                 cpufreq_cpu_put(policy);
>                 return 0;
>         }
> @@ -1121,45 +1104,14 @@ static int __cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
>         if (!down_read_trylock(&cpufreq_rwsem))
>                 return 0;
>
> -#ifdef CONFIG_HOTPLUG_CPU
> -       /* Check if this cpu was hot-unplugged earlier and has siblings */
> -       read_lock_irqsave(&cpufreq_driver_lock, flags);
> -       list_for_each_entry(tpolicy, &cpufreq_policy_list, policy_list) {
> -               if (cpumask_test_cpu(cpu, tpolicy->related_cpus)) {
> -                       read_unlock_irqrestore(&cpufreq_driver_lock, flags);
> -                       ret = cpufreq_add_policy_cpu(tpolicy, cpu, dev);
> -                       up_read(&cpufreq_rwsem);
> -                       return ret;
> -               }
> -       }
> -       read_unlock_irqrestore(&cpufreq_driver_lock, flags);
> -#endif
> -
> -       /*
> -        * Restore the saved policy when doing light-weight init and fall back
> -        * to the full init if that fails.
> -        */
> -       policy = recover_policy ? cpufreq_policy_restore(cpu) : NULL;
> -       if (!policy) {
> -               recover_policy = false;
> -               policy = cpufreq_policy_alloc();
> -               if (!policy)
> -                       goto nomem_out;
> -       }
> -
> -       /*
> -        * In the resume path, since we restore a saved policy, the assignment
> -        * to policy->cpu is like an update of the existing policy, rather than
> -        * the creation of a brand new one. So we need to perform this update
> -        * by invoking update_policy_cpu().
> -        */
> -       if (recover_policy && cpu != policy->cpu)
> -               update_policy_cpu(policy, cpu);
> -       else
> -               policy->cpu = cpu;
> +       /* If we get this far, this is the first time we are adding the
> +        * policy */
> +       policy = cpufreq_policy_alloc();
> +       if (!policy)
> +               goto nomem_out;
> +       policy->cpu = cpu;
>
>         cpumask_copy(policy->cpus, cpumask_of(cpu));
> -
>         init_completion(&policy->kobj_unregister);
>         INIT_WORK(&policy->update, handle_update);
>
> @@ -1169,26 +1121,25 @@ static int __cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
>         ret = cpufreq_driver->init(policy);
>         if (ret) {
>                 pr_debug("initialization failed\n");
> -               goto err_set_policy_cpu;
> +               goto err_init;
>         }
>
>         /* related cpus should atleast have policy->cpus */
>         cpumask_or(policy->related_cpus, policy->related_cpus, policy->cpus);

policy->cpus is already updated here.

> +       /* Weed out impossible CPUs. */
> +       cpumask_and(policy->related_cpus, policy->related_cpus,
> +                       cpu_possible_mask);

This has to be in a separate commit..

>         /*
>          * affected cpus must always be the one, which are online. We aren't
>          * managing offline cpus here.
>          */
>         cpumask_and(policy->cpus, policy->cpus, cpu_online_mask);
>
> -       if (!recover_policy) {
> -               policy->user_policy.min = policy->min;
> -               policy->user_policy.max = policy->max;
> -       }
> -

Where did these go? There weren't there for fun.

>         down_write(&policy->rwsem);
>         write_lock_irqsave(&cpufreq_driver_lock, flags);
> -       for_each_cpu(j, policy->cpus)
> +       for_each_cpu(j, policy->related_cpus)
>                 per_cpu(cpufreq_cpu_data, j) = policy;
>         write_unlock_irqrestore(&cpufreq_driver_lock, flags);
>
> @@ -1243,13 +1194,11 @@ static int __cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
>         blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
>                                      CPUFREQ_START, policy);
>
> -       if (!recover_policy) {
> -               ret = cpufreq_add_dev_interface(policy, dev);
> -               if (ret)
> -                       goto err_out_unregister;
> -               blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
> -                               CPUFREQ_CREATE_POLICY, policy);
> -       }
> +       ret = cpufreq_add_dev_interface(policy);
> +       if (ret)
> +               goto err_out_unregister;
> +       blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
> +                       CPUFREQ_CREATE_POLICY, policy);
>
>         write_lock_irqsave(&cpufreq_driver_lock, flags);
>         list_add(&policy->policy_list, &cpufreq_policy_list);
> @@ -1257,10 +1206,6 @@ static int __cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
>
>         cpufreq_init_policy(policy);
>
> -       if (!recover_policy) {
> -               policy->user_policy.policy = policy->policy;
> -               policy->user_policy.governor = policy->governor;
> -       }

Same here.

>         up_write(&policy->rwsem);
>
>         kobject_uevent(&policy->kobj, KOBJ_ADD);
> @@ -1273,20 +1218,14 @@ static int __cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
>  err_out_unregister:
>  err_get_freq:
>         write_lock_irqsave(&cpufreq_driver_lock, flags);
> -       for_each_cpu(j, policy->cpus)
> +       for_each_cpu(j, policy->related_cpus)
>                 per_cpu(cpufreq_cpu_data, j) = NULL;
>         write_unlock_irqrestore(&cpufreq_driver_lock, flags);
> -
> +       up_write(&policy->rwsem);
>         if (cpufreq_driver->exit)
>                 cpufreq_driver->exit(policy);
> -err_set_policy_cpu:
> -       if (recover_policy) {
> -               /* Do not leave stale fallback data behind. */
> -               per_cpu(cpufreq_cpu_data_fallback, cpu) = NULL;
> -               cpufreq_policy_put_kobj(policy);
> -       }
> +err_init:
>         cpufreq_policy_free(policy);
> -
>  nomem_out:
>         up_read(&cpufreq_rwsem);
>

Just to mention, I am not looking at the validity of error fallback paths
in this version. Just make sure they are all good :)

> @@ -1307,100 +1246,16 @@ static int cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
>         return __cpufreq_add_dev(dev, sif);
>  }
>
> -static int cpufreq_nominate_new_policy_cpu(struct cpufreq_policy *policy,
> -                                          unsigned int old_cpu)
> -{
> -       struct device *cpu_dev;
> -       int ret;
> -
> -       /* first sibling now owns the new sysfs dir */
> -       cpu_dev = get_cpu_device(cpumask_any_but(policy->cpus, old_cpu));
> -
> -       sysfs_remove_link(&cpu_dev->kobj, "cpufreq");
> -       ret = kobject_move(&policy->kobj, &cpu_dev->kobj);
> -       if (ret) {
> -               pr_err("%s: Failed to move kobj: %d\n", __func__, ret);
> -
> -               down_write(&policy->rwsem);
> -               cpumask_set_cpu(old_cpu, policy->cpus);
> -               up_write(&policy->rwsem);
> -
> -               ret = sysfs_create_link(&cpu_dev->kobj, &policy->kobj,
> -                                       "cpufreq");
> -
> -               return -EINVAL;
> -       }
> -
> -       return cpu_dev->id;
> -}
> -
> -static int __cpufreq_remove_dev_prepare(struct device *dev,
> -                                       struct subsys_interface *sif)
> +static int __cpufreq_remove_dev(struct device *dev,
> +                               struct subsys_interface *sif)
>  {
> -       unsigned int cpu = dev->id, cpus;
> -       int new_cpu, ret;
> +       unsigned int cpu = dev->id, j;
> +       int ret = 0;
>         unsigned long flags;
>         struct cpufreq_policy *policy;
>
>         pr_debug("%s: unregistering CPU %u\n", __func__, cpu);
>
> -       write_lock_irqsave(&cpufreq_driver_lock, flags);
> -
> -       policy = per_cpu(cpufreq_cpu_data, cpu);
> -
> -       /* Save the policy somewhere when doing a light-weight tear-down */
> -       if (cpufreq_suspended)
> -               per_cpu(cpufreq_cpu_data_fallback, cpu) = policy;
> -
> -       write_unlock_irqrestore(&cpufreq_driver_lock, flags);
> -
> -       if (!policy) {
> -               pr_debug("%s: No cpu_data found\n", __func__);
> -               return -EINVAL;
> -       }
> -
> -       if (has_target()) {
> -               ret = __cpufreq_governor(policy, CPUFREQ_GOV_STOP);
> -               if (ret) {
> -                       pr_err("%s: Failed to stop governor\n", __func__);
> -                       return ret;
> -               }
> -       }
> -
> -       if (!cpufreq_driver->setpolicy)
> -               strncpy(per_cpu(cpufreq_cpu_governor, cpu),
> -                       policy->governor->name, CPUFREQ_NAME_LEN);

Where is this gone? There are several instances of code just being
removed, this is the third one. Its really really tough to catch these
in this big of a patch. Believe me.

You have to break this patch into multiple ones, see this on how to
break even simplest of the changes into multiple patches:
https://lkml.org/lkml/2013/9/6/400

Its just impossible to catch bugs that you might have introduced here due
to the size of this patch. And its taking a LOT of time for me to review this.
As I have to keep diff in one tab, new cpufreq.c in one and the old cpufreq.c
in one and then compare..

> -       down_read(&policy->rwsem);
> -       cpus = cpumask_weight(policy->cpus);
> -       up_read(&policy->rwsem);
> -
> -       if (cpu != policy->cpu) {
> -               sysfs_remove_link(&dev->kobj, "cpufreq");
> -       } else if (cpus > 1) {
> -               new_cpu = cpufreq_nominate_new_policy_cpu(policy, cpu);
> -               if (new_cpu >= 0) {
> -                       update_policy_cpu(policy, new_cpu);
> -
> -                       if (!cpufreq_suspended)
> -                               pr_debug("%s: policy Kobject moved to cpu: %d from: %d\n",
> -                                        __func__, new_cpu, cpu);
> -               }
> -       } else if (cpufreq_driver->stop_cpu && cpufreq_driver->setpolicy) {
> -               cpufreq_driver->stop_cpu(policy);
> -       }
> -
> -       return 0;
> -}
> -
> -static int __cpufreq_remove_dev_finish(struct device *dev,
> -                                      struct subsys_interface *sif)
> -{
> -       unsigned int cpu = dev->id, cpus;
> -       int ret;
> -       unsigned long flags;
> -       struct cpufreq_policy *policy;
> -
>         read_lock_irqsave(&cpufreq_driver_lock, flags);
>         policy = per_cpu(cpufreq_cpu_data, cpu);
>         read_unlock_irqrestore(&cpufreq_driver_lock, flags);
> @@ -1410,56 +1265,45 @@ static int __cpufreq_remove_dev_finish(struct device *dev,
>                 return -EINVAL;
>         }
>
> -       down_write(&policy->rwsem);
> -       cpus = cpumask_weight(policy->cpus);
> -
> -       if (cpus > 1)
> -               cpumask_clear_cpu(cpu, policy->cpus);
> -       up_write(&policy->rwsem);
> -
> -       /* If cpu is last user of policy, free policy */
> -       if (cpus == 1) {
> -               if (has_target()) {
> -                       ret = __cpufreq_governor(policy,
> -                                       CPUFREQ_GOV_POLICY_EXIT);
> -                       if (ret) {
> -                               pr_err("%s: Failed to exit governor\n",
> -                                      __func__);
> -                               return ret;
> -                       }
> -               }
> -
> -               if (!cpufreq_suspended)
> -                       cpufreq_policy_put_kobj(policy);
> +#ifdef CONFIG_HOTPLUG_CPU
> +       ret = cpufreq_change_policy_cpus(policy, cpu, false);
> +#endif
> +       if (ret)
> +               return ret;

Why is the if block kept outside of #ifdef? And should we really call
change_*() from inside a #ifdef here?

>
> -               /*
> -                * Perform the ->exit() even during light-weight tear-down,
> -                * since this is a core component, and is essential for the
> -                * subsequent light-weight ->init() to succeed.
> -                */
> -               if (cpufreq_driver->exit)
> -                       cpufreq_driver->exit(policy);
> +       if (!sif)
> +               return 0;

Why? I know that, but we should have comments to describe this ...

>
> -               /* Remove policy from list of active policies */
> -               write_lock_irqsave(&cpufreq_driver_lock, flags);
> -               list_del(&policy->policy_list);
> -               write_unlock_irqrestore(&cpufreq_driver_lock, flags);
> +       if (!cpumask_empty(policy->cpus)) {
> +               return 0;
> +       }

You might still call this attempt a showcase of idea, but I am reviewing it
at my full capacity. And these small things just break my flow.

- Don't add {} for single liner blocks
- Add comments with proper comment style
- Run checkpatch --strict before sending patches.

>
> -               if (!cpufreq_suspended)
> -                       cpufreq_policy_free(policy);
> -       } else if (has_target()) {
> -               ret = __cpufreq_governor(policy, CPUFREQ_GOV_START);
> -               if (!ret)
> -                       ret = __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
> +       cpufreq_dev_symlink(policy, false);
>
> +       if (has_target()) {
> +               ret = __cpufreq_governor(policy,
> +                               CPUFREQ_GOV_POLICY_EXIT);

Can come in single line

>                 if (ret) {
> -                       pr_err("%s: Failed to start governor\n", __func__);
> +                       pr_err("%s: Failed to exit governor\n",
> +                              __func__);

This too..

>                         return ret;
>                 }
>         }
>
> -       per_cpu(cpufreq_cpu_data, cpu) = NULL;
> -       return 0;
> +       cpufreq_policy_put_kobj(policy);
> +       if (cpufreq_driver->exit)
> +               cpufreq_driver->exit(policy);
> +
> +       /* Remove policy from list of active policies */
> +       write_lock_irqsave(&cpufreq_driver_lock, flags);
> +       for_each_cpu(j, policy->related_cpus)
> +               per_cpu(cpufreq_cpu_data, j) = NULL;
> +       list_del(&policy->policy_list);
> +       write_unlock_irqrestore(&cpufreq_driver_lock, flags);
> +
> +       cpufreq_policy_free(policy);
> +
> +       return ret;
>  }
>
>  /**
> @@ -1469,18 +1313,7 @@ static int __cpufreq_remove_dev_finish(struct device *dev,
>   */
>  static int cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif)
>  {
> -       unsigned int cpu = dev->id;
> -       int ret;
> -
> -       if (cpu_is_offline(cpu))
> -               return 0;

Why is it part of this commit?

> -       ret = __cpufreq_remove_dev_prepare(dev, sif);
> -
> -       if (!ret)
> -               ret = __cpufreq_remove_dev_finish(dev, sif);
> -
> -       return ret;
> +       return __cpufreq_remove_dev(dev, sif);
>  }
>
>  static void handle_update(struct work_struct *work)
> @@ -2295,19 +2128,12 @@ static int cpufreq_cpu_callback(struct notifier_block *nfb,
>         if (dev) {
>                 switch (action & ~CPU_TASKS_FROZEN) {
>                 case CPU_ONLINE:
> +               case CPU_DOWN_FAILED:

For example. This change doesn't have anything to do with this patch
and would have been so easy to review it, if it was kept separate.

Also, this would even require to wait for this complete series to make
sense and can be merged very early.

>                         __cpufreq_add_dev(dev, NULL);
>                         break;
>
>                 case CPU_DOWN_PREPARE:
> -                       __cpufreq_remove_dev_prepare(dev, NULL);
> -                       break;
> -
> -               case CPU_POST_DEAD:
> -                       __cpufreq_remove_dev_finish(dev, NULL);
> -                       break;
> -
> -               case CPU_DOWN_FAILED:
> -                       __cpufreq_add_dev(dev, NULL);
> +                       __cpufreq_remove_dev(dev, NULL);

@Srivatsa: You might want to have a look at this, remove sequence was
separated for some purpose and I am just not able to concentrate enough
to think of that, just too many cases running in my mind :)

>                         break;
>                 }
>         }

I am still not sure if everything will work as expected as I seriously doubt
my reviewing capabilities. There might be corner cases which I am still
missing.
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Srivatsa S. Bhat July 16, 2014, 11:16 a.m. UTC | #3
On 07/16/2014 01:54 PM, Viresh Kumar wrote:
> On 16 July 2014 04:17, Saravana Kannan <skannan@codeaurora.org> wrote:
>> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> 
>> +/* symlink related CPUs */
>> +static int cpufreq_dev_symlink(struct cpufreq_policy *policy, bool add)
>>  {
>> -       unsigned int j;
>> +       unsigned int j, first_cpu = cpumask_first(policy->related_cpus);
> 
> The CPU which came first should get the ownership by default, instead
> of the first one in the mask.
> 
> Normally at boot, all CPUs come up first and then only cpufreq init starts.
> But in case all other CPUs fail to come up, then policy->cpu *might* point
> to a failed cpu.
> 
> And so, we should simply use policy->cpu here instead of finding the
> first one in the mask.
> 
> Also, its not the duty of this routine to find which one is the policy cpu as
> that is done by __cpufreq_add_dev(). And so in case we need to make
> first cpu of a mask as policy->cpu, it should be done in __cpufreq_add_dev()
> and not here. This one should just follow the orders :)
> 
> @Srivatsa: What happens to the sysfs directory if a CPU fails to come up?
> Is it exactly similar to how it happens in hotplug? i.e. we do have a directory
> there?
> 

Short answer: If the sysfs directory has already been created by cpufreq,
then yes, it will remain as it is. However, if the online operation failed
before that, then cpufreq won't know about that CPU at all, and no file will
be created.

Long answer:
The existing cpufreq code does all its work (including creating the sysfs
directories etc) at the CPU_ONLINE stage. This stage is not expected to fail
(in fact even the core CPU hotplug code in kernel/cpu.c doesn't care for
error returns at this point). So if a CPU fails to come up in earlier stages
itself (such as CPU_UP_PREPARE), then cpufreq won't even hear about that CPU,
and hence no sysfs files will be created/linked. However, if the CPU bringup
operation fails during the CPU_ONLINE stage after the cpufreq's notifier has
been invoked, then we do nothing about it and the cpufreq sysfs files will
remain.

>>         int ret = 0;
>>
>> -       for_each_cpu(j, policy->cpus) {
>> +       for_each_cpu(j, policy->related_cpus) {
>>                 struct device *cpu_dev;
>>

[...]

>> @@ -961,60 +967,58 @@ static void cpufreq_init_policy(struct cpufreq_policy *policy)
>>  }
>>
>>  #ifdef CONFIG_HOTPLUG_CPU
> 
> @Srivatsa: I will try this but you also take care of this. These
> ifdefs might go wrong,
> i.e. we are surely using it in the current patch without HOTPLUG as well. See
> cpufreq_add_dev()..
> 

Yeah, looks suspicious.

> Also, how does suspend/resume work without CONFIG_HOTPLUG_CPU ?
> What's the sequence of events?
> 

Well, CONFIG_SUSPEND doesn't have an explicit dependency on HOTPLUG_CPU, but
SMP systems usually use CONFIG_PM_SLEEP_SMP, which sets CONFIG_HOTPLUG_CPU.
(I guess the reason why CONFIG_SUSPEND doesn't depend on HOTPLUG_CPU is
because suspend is possible even on uniprocessor systems and hence the
Kconfig dependency wasn't really justified).

>> -static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy,
>> -                                 unsigned int cpu, struct device *dev)
>> +static int cpufreq_change_policy_cpus(struct cpufreq_policy *policy,
>> +                                 unsigned int cpu, bool add)

[...]

>> -
>> -       if (!cpufreq_driver->setpolicy)
>> -               strncpy(per_cpu(cpufreq_cpu_governor, cpu),
>> -                       policy->governor->name, CPUFREQ_NAME_LEN);
> 
> Where is this gone? There are several instances of code just being
> removed, this is the third one. Its really really tough to catch these
> in this big of a patch. Believe me.
> 
> You have to break this patch into multiple ones, see this on how to
> break even simplest of the changes into multiple patches:
> https://lkml.org/lkml/2013/9/6/400
> 
> Its just impossible to catch bugs that you might have introduced here due
> to the size of this patch. And its taking a LOT of time for me to review this.
> As I have to keep diff in one tab, new cpufreq.c in one and the old cpufreq.c
> in one and then compare..
> 

True, this is still a pretty huge chunk. Saravana, at this stage, don't worry
about making cpufreq work properly in each and every patch. Just ensure that
every patch builds fine; that should be good enough. I hope this will help you
in splitting up the patches further.

One other thing: your changelog contains what we usually write in a cover-
letter - *very* high-level goals of the patch. Ideally, you should explain
the subtle details and the non-obvious decisions or trade-offs that you have
made at various places in the code. Otherwise it becomes very hard to follow
your thought-flow just by looking at the patch. So please split up the patch
further and also make the changelogs useful to review the patch :-)

The link that Viresh gave above also did a lot of code reorganization in
cpufreq, so it should give you a good example of how to proceed.

[...]

>>                         __cpufreq_add_dev(dev, NULL);
>>                         break;
>>
>>                 case CPU_DOWN_PREPARE:
>> -                       __cpufreq_remove_dev_prepare(dev, NULL);
>> -                       break;
>> -
>> -               case CPU_POST_DEAD:
>> -                       __cpufreq_remove_dev_finish(dev, NULL);
>> -                       break;
>> -
>> -               case CPU_DOWN_FAILED:
>> -                       __cpufreq_add_dev(dev, NULL);
>> +                       __cpufreq_remove_dev(dev, NULL);
> 
> @Srivatsa: You might want to have a look at this, remove sequence was
> separated for some purpose and I am just not able to concentrate enough
> to think of that, just too many cases running in my mind :)
> 

Yeah, we had split it into _remove_dev_prepare() and _remove_dev_finish()
to avoid a few potential deadlocks. We wanted to call _remove_dev_prepare()
in the DOWN_PREPARE stage and then call _remove_dev_finish() (which waits
for the kobject refcount to drop) in the POST_DEAD stage. That is, we wanted
to do the kobject cleanup after releasing the hotplug lock, and POST_DEAD stage
was well-suited for that.

Commit 1aee40ac9c8 (cpufreq: Invoke __cpufreq_remove_dev_finish() after
releasing cpu_hotplug.lock) explains this in detail. Saravana, please take a
look at that reasoning and ensure that your patch doesn't re-introduce those
deadlock possibilities!

>>                         break;
>>                 }
>>         }
> 
> I am still not sure if everything will work as expected as I seriously doubt
> my reviewing capabilities. There might be corner cases which I am still
> missing.
> 

Regards,
Srivatsa S. Bhat
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Viresh Kumar July 16, 2014, 1:13 p.m. UTC | #4
On 16 July 2014 16:46, Srivatsa S. Bhat <srivatsa@mit.edu> wrote:
> Short answer: If the sysfs directory has already been created by cpufreq,
> then yes, it will remain as it is. However, if the online operation failed
> before that, then cpufreq won't know about that CPU at all, and no file will
> be created.
>
> Long answer:
> The existing cpufreq code does all its work (including creating the sysfs
> directories etc) at the CPU_ONLINE stage. This stage is not expected to fail
> (in fact even the core CPU hotplug code in kernel/cpu.c doesn't care for
> error returns at this point). So if a CPU fails to come up in earlier stages
> itself (such as CPU_UP_PREPARE), then cpufreq won't even hear about that CPU,
> and hence no sysfs files will be created/linked. However, if the CPU bringup
> operation fails during the CPU_ONLINE stage after the cpufreq's notifier has
> been invoked, then we do nothing about it and the cpufreq sysfs files will
> remain.

In short, the problem I mentioned before this para is genuine. And setting
policy->cpu to the first cpu of a mask is indeed a bad idea.

>> Also, how does suspend/resume work without CONFIG_HOTPLUG_CPU ?
>> What's the sequence of events?
>>
>
> Well, CONFIG_SUSPEND doesn't have an explicit dependency on HOTPLUG_CPU, but
> SMP systems usually use CONFIG_PM_SLEEP_SMP, which sets CONFIG_HOTPLUG_CPU.

I read usually as *optional*

> (I guess the reason why CONFIG_SUSPEND doesn't depend on HOTPLUG_CPU is
> because suspend is possible even on uniprocessor systems and hence the
> Kconfig dependency wasn't really justified).

Again the same question, how do we suspend when HOTPLUG is disabled?
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
dirk.brandewie@gmail.com July 16, 2014, 2:29 p.m. UTC | #5
On 07/15/2014 03:47 PM, Saravana Kannan wrote:
> The CPUfreq core moves the cpufreq policy ownership between CPUs when CPUs
> within a cluster (CPUs sharing same policy) go ONLINE/OFFLINE. When moving
> policy ownership between CPUs, it also moves the cpufreq sysfs directory
> between CPUs and also fixes up the symlinks of the other CPUs in the
> cluster.
>
> Also, when all the CPUs in a cluster go OFFLINE, all the sysfs nodes and
> directories are deleted, the kobject is released and the policy is freed.
> And when the first CPU in a cluster comes up, the policy is reallocated and
> initialized, kobject is acquired, the sysfs nodes are created or symlinked,
> etc.
>
> All these steps end up creating unnecessarily complicated code and locking.
> There's no real benefit to adding/removing/moving the sysfs nodes and the
> policy between CPUs. Other per CPU sysfs directories like power and cpuidle
> are left alone during hotplug. So there's some precedence to what this
> patch is trying to do.
>
> This patch simplifies a lot of the code and locking by removing the
> adding/removing/moving of policy/sysfs/kobj and just leaves the cpufreq
> directory and policy in place irrespective of whether the CPUs are
> ONLINE/OFFLINE.
>
> Leaving the policy, sysfs and kobject in place also brings these additional
> benefits:
> * Faster suspend/resume
> * Faster hotplug
> * Sysfs file permissions maintained across hotplug
> * Policy settings and governor tunables maintained across hotplug
> * Cpufreq stats would be maintained across hotplug for all CPUs and can be
>    queried even after CPU goes OFFLINE
>
> Tested-by: Stephen Boyd <sboyd@codeaurora.org>
> Signed-off-by: Saravana Kannan <skannan@codeaurora.org>
> ---
>   drivers/cpufreq/cpufreq.c | 388 +++++++++++++---------------------------------
>   1 file changed, 107 insertions(+), 281 deletions(-)
>
> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> index 62259d2..a0a2ec2 100644
> --- a/drivers/cpufreq/cpufreq.c
> +++ b/drivers/cpufreq/cpufreq.c
> @@ -37,7 +37,6 @@
>    */
>   static struct cpufreq_driver *cpufreq_driver;
>   static DEFINE_PER_CPU(struct cpufreq_policy *, cpufreq_cpu_data);
> -static DEFINE_PER_CPU(struct cpufreq_policy *, cpufreq_cpu_data_fallback);
>   static DEFINE_RWLOCK(cpufreq_driver_lock);
>   DEFINE_MUTEX(cpufreq_governor_lock);
>   static LIST_HEAD(cpufreq_policy_list);
> @@ -859,34 +858,41 @@ void cpufreq_sysfs_remove_file(const struct attribute *attr)
>   }
>   EXPORT_SYMBOL(cpufreq_sysfs_remove_file);
>
> -/* symlink affected CPUs */
> -static int cpufreq_add_dev_symlink(struct cpufreq_policy *policy)
> +/* symlink related CPUs */
> +static int cpufreq_dev_symlink(struct cpufreq_policy *policy, bool add)
>   {
> -	unsigned int j;
> +	unsigned int j, first_cpu = cpumask_first(policy->related_cpus);
>   	int ret = 0;
>
> -	for_each_cpu(j, policy->cpus) {
> +	for_each_cpu(j, policy->related_cpus) {
>   		struct device *cpu_dev;
>
> -		if (j == policy->cpu)
> +		if (j == first_cpu)
>   			continue;
>
> -		pr_debug("Adding link for CPU: %u\n", j);
>   		cpu_dev = get_cpu_device(j);
> -		ret = sysfs_create_link(&cpu_dev->kobj, &policy->kobj,
> -					"cpufreq");
> +		if (add)
> +			ret = sysfs_create_link(&cpu_dev->kobj, &policy->kobj,
> +						"cpufreq");
> +		else
> +			sysfs_remove_link(&cpu_dev->kobj, "cpufreq");
> +
>   		if (ret)
>   			break;
>   	}
>   	return ret;
>   }
>
> -static int cpufreq_add_dev_interface(struct cpufreq_policy *policy,
> -				     struct device *dev)
> +static int cpufreq_add_dev_interface(struct cpufreq_policy *policy)
>   {
>   	struct freq_attr **drv_attr;
> +	struct device *dev;
>   	int ret = 0;
>
> +	dev = get_cpu_device(cpumask_first(policy->related_cpus));
> +	if (!dev)
> +		return -EINVAL;
> +
>   	/* prepare interface data */
>   	ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq,
>   				   &dev->kobj, "cpufreq");
> @@ -917,7 +923,7 @@ static int cpufreq_add_dev_interface(struct cpufreq_policy *policy,
>   			goto err_out_kobj_put;
>   	}
>
> -	ret = cpufreq_add_dev_symlink(policy);
> +	ret = cpufreq_dev_symlink(policy, true);
>   	if (ret)
>   		goto err_out_kobj_put;
>
> @@ -961,60 +967,58 @@ static void cpufreq_init_policy(struct cpufreq_policy *policy)
>   }
>
>   #ifdef CONFIG_HOTPLUG_CPU
> -static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy,
> -				  unsigned int cpu, struct device *dev)
> +static int cpufreq_change_policy_cpus(struct cpufreq_policy *policy,
> +				  unsigned int cpu, bool add)
>   {
>   	int ret = 0;
> -	unsigned long flags;
> +	unsigned int cpus, pcpu;
>
> -	if (has_target()) {
> +	down_write(&policy->rwsem);
> +
> +	cpus = !cpumask_empty(policy->cpus);
> +	if (has_target() && cpus) {
>   		ret = __cpufreq_governor(policy, CPUFREQ_GOV_STOP);
>   		if (ret) {
>   			pr_err("%s: Failed to stop governor\n", __func__);
> -			return ret;
> +			goto unlock;
>   		}
>   	}
>
> -	down_write(&policy->rwsem);
> -
> -	write_lock_irqsave(&cpufreq_driver_lock, flags);
> -
> -	cpumask_set_cpu(cpu, policy->cpus);
> -	per_cpu(cpufreq_cpu_data, cpu) = policy;
> -	write_unlock_irqrestore(&cpufreq_driver_lock, flags);
> +	if (add)
> +		cpumask_set_cpu(cpu, policy->cpus);
> +	else
> +		cpumask_clear_cpu(cpu, policy->cpus);
>
> -	up_write(&policy->rwsem);
> +	pcpu = cpumask_first(policy->cpus);
> +	if (pcpu < nr_cpu_ids && policy->cpu != pcpu) {
> +		policy->last_cpu = policy->cpu;
> +		policy->cpu = pcpu;
> +		blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
> +					CPUFREQ_UPDATE_POLICY_CPU, policy);
> +	}
>
> -	if (has_target()) {
> +	cpus = !cpumask_empty(policy->cpus);
> +	if (has_target() && cpus) {
>   		ret = __cpufreq_governor(policy, CPUFREQ_GOV_START);
>   		if (!ret)
>   			ret = __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
>
>   		if (ret) {
>   			pr_err("%s: Failed to start governor\n", __func__);
> -			return ret;
> +			goto unlock;
>   		}
>   	}
>
> -	return sysfs_create_link(&dev->kobj, &policy->kobj, "cpufreq");
> -}
> -#endif
> -
> -static struct cpufreq_policy *cpufreq_policy_restore(unsigned int cpu)
> -{
> -	struct cpufreq_policy *policy;
> -	unsigned long flags;
> -
> -	read_lock_irqsave(&cpufreq_driver_lock, flags);
> -
> -	policy = per_cpu(cpufreq_cpu_data_fallback, cpu);
> -
> -	read_unlock_irqrestore(&cpufreq_driver_lock, flags);
> +	if (!cpus && cpufreq_driver->stop_cpu && cpufreq_driver->setpolicy) {
> +		cpufreq_driver->stop_cpu(policy);
> +	}

stop_cpu() only needs to be called during __cpufreq_remove_dev_prepare() no
where else.

>
> -	policy->governor = NULL;
> +unlock:
> +	up_write(&policy->rwsem);
>
> -	return policy;
> +	return ret;
>   }
> +#endif
>
>   static struct cpufreq_policy *cpufreq_policy_alloc(void)
>   {
> @@ -1053,10 +1057,8 @@ static void cpufreq_policy_put_kobj(struct cpufreq_policy *policy)
>   	blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
>   			CPUFREQ_REMOVE_POLICY, policy);
>
> -	down_read(&policy->rwsem);
>   	kobj = &policy->kobj;
>   	cmp = &policy->kobj_unregister;
> -	up_read(&policy->rwsem);
>   	kobject_put(kobj);
>
>   	/*
> @@ -1076,32 +1078,12 @@ static void cpufreq_policy_free(struct cpufreq_policy *policy)
>   	kfree(policy);
>   }
>
> -static void update_policy_cpu(struct cpufreq_policy *policy, unsigned int cpu)
> -{
> -	if (WARN_ON(cpu == policy->cpu))
> -		return;
> -
> -	down_write(&policy->rwsem);
> -
> -	policy->last_cpu = policy->cpu;
> -	policy->cpu = cpu;
> -
> -	up_write(&policy->rwsem);
> -
> -	blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
> -			CPUFREQ_UPDATE_POLICY_CPU, policy);
> -}
> -
>   static int __cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
>   {
>   	unsigned int j, cpu = dev->id;
>   	int ret = -ENOMEM;
>   	struct cpufreq_policy *policy;
>   	unsigned long flags;
> -	bool recover_policy = cpufreq_suspended;
> -#ifdef CONFIG_HOTPLUG_CPU
> -	struct cpufreq_policy *tpolicy;
> -#endif
>
>   	if (cpu_is_offline(cpu))
>   		return 0;
> @@ -1110,9 +1092,10 @@ static int __cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
>
>   #ifdef CONFIG_SMP
>   	/* check whether a different CPU already registered this
> -	 * CPU because it is in the same boat. */
> +	 * CPU because it is one of the related CPUs. */
>   	policy = cpufreq_cpu_get(cpu);
> -	if (unlikely(policy)) {
> +	if (policy) {
> +		cpufreq_change_policy_cpus(policy, cpu, true);
>   		cpufreq_cpu_put(policy);
>   		return 0;
>   	}
> @@ -1121,45 +1104,14 @@ static int __cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
>   	if (!down_read_trylock(&cpufreq_rwsem))
>   		return 0;
>
> -#ifdef CONFIG_HOTPLUG_CPU
> -	/* Check if this cpu was hot-unplugged earlier and has siblings */
> -	read_lock_irqsave(&cpufreq_driver_lock, flags);
> -	list_for_each_entry(tpolicy, &cpufreq_policy_list, policy_list) {
> -		if (cpumask_test_cpu(cpu, tpolicy->related_cpus)) {
> -			read_unlock_irqrestore(&cpufreq_driver_lock, flags);
> -			ret = cpufreq_add_policy_cpu(tpolicy, cpu, dev);
> -			up_read(&cpufreq_rwsem);
> -			return ret;
> -		}
> -	}
> -	read_unlock_irqrestore(&cpufreq_driver_lock, flags);
> -#endif
> -
> -	/*
> -	 * Restore the saved policy when doing light-weight init and fall back
> -	 * to the full init if that fails.
> -	 */
> -	policy = recover_policy ? cpufreq_policy_restore(cpu) : NULL;
> -	if (!policy) {
> -		recover_policy = false;
> -		policy = cpufreq_policy_alloc();
> -		if (!policy)
> -			goto nomem_out;
> -	}
> -
> -	/*
> -	 * In the resume path, since we restore a saved policy, the assignment
> -	 * to policy->cpu is like an update of the existing policy, rather than
> -	 * the creation of a brand new one. So we need to perform this update
> -	 * by invoking update_policy_cpu().
> -	 */
> -	if (recover_policy && cpu != policy->cpu)
> -		update_policy_cpu(policy, cpu);
> -	else
> -		policy->cpu = cpu;
> +	/* If we get this far, this is the first time we are adding the
> +	 * policy */
> +	policy = cpufreq_policy_alloc();
> +	if (!policy)
> +		goto nomem_out;
> +	policy->cpu = cpu;
>
>   	cpumask_copy(policy->cpus, cpumask_of(cpu));
> -
>   	init_completion(&policy->kobj_unregister);
>   	INIT_WORK(&policy->update, handle_update);
>
> @@ -1169,26 +1121,25 @@ static int __cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
>   	ret = cpufreq_driver->init(policy);
>   	if (ret) {
>   		pr_debug("initialization failed\n");
> -		goto err_set_policy_cpu;
> +		goto err_init;
>   	}
>
>   	/* related cpus should atleast have policy->cpus */
>   	cpumask_or(policy->related_cpus, policy->related_cpus, policy->cpus);
>
> +	/* Weed out impossible CPUs. */
> +	cpumask_and(policy->related_cpus, policy->related_cpus,
> +			cpu_possible_mask);
> +
>   	/*
>   	 * affected cpus must always be the one, which are online. We aren't
>   	 * managing offline cpus here.
>   	 */
>   	cpumask_and(policy->cpus, policy->cpus, cpu_online_mask);
>
> -	if (!recover_policy) {
> -		policy->user_policy.min = policy->min;
> -		policy->user_policy.max = policy->max;
> -	}
> -
>   	down_write(&policy->rwsem);
>   	write_lock_irqsave(&cpufreq_driver_lock, flags);
> -	for_each_cpu(j, policy->cpus)
> +	for_each_cpu(j, policy->related_cpus)
>   		per_cpu(cpufreq_cpu_data, j) = policy;
>   	write_unlock_irqrestore(&cpufreq_driver_lock, flags);
>
> @@ -1243,13 +1194,11 @@ static int __cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
>   	blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
>   				     CPUFREQ_START, policy);
>
> -	if (!recover_policy) {
> -		ret = cpufreq_add_dev_interface(policy, dev);
> -		if (ret)
> -			goto err_out_unregister;
> -		blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
> -				CPUFREQ_CREATE_POLICY, policy);
> -	}
> +	ret = cpufreq_add_dev_interface(policy);
> +	if (ret)
> +		goto err_out_unregister;
> +	blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
> +			CPUFREQ_CREATE_POLICY, policy);
>
>   	write_lock_irqsave(&cpufreq_driver_lock, flags);
>   	list_add(&policy->policy_list, &cpufreq_policy_list);
> @@ -1257,10 +1206,6 @@ static int __cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
>
>   	cpufreq_init_policy(policy);
>
> -	if (!recover_policy) {
> -		policy->user_policy.policy = policy->policy;
> -		policy->user_policy.governor = policy->governor;
> -	}
>   	up_write(&policy->rwsem);
>
>   	kobject_uevent(&policy->kobj, KOBJ_ADD);
> @@ -1273,20 +1218,14 @@ static int __cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
>   err_out_unregister:
>   err_get_freq:
>   	write_lock_irqsave(&cpufreq_driver_lock, flags);
> -	for_each_cpu(j, policy->cpus)
> +	for_each_cpu(j, policy->related_cpus)
>   		per_cpu(cpufreq_cpu_data, j) = NULL;
>   	write_unlock_irqrestore(&cpufreq_driver_lock, flags);
> -
> +	up_write(&policy->rwsem);
>   	if (cpufreq_driver->exit)
>   		cpufreq_driver->exit(policy);
> -err_set_policy_cpu:
> -	if (recover_policy) {
> -		/* Do not leave stale fallback data behind. */
> -		per_cpu(cpufreq_cpu_data_fallback, cpu) = NULL;
> -		cpufreq_policy_put_kobj(policy);
> -	}
> +err_init:
>   	cpufreq_policy_free(policy);
> -
>   nomem_out:
>   	up_read(&cpufreq_rwsem);
>
> @@ -1307,100 +1246,16 @@ static int cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
>   	return __cpufreq_add_dev(dev, sif);
>   }
>
> -static int cpufreq_nominate_new_policy_cpu(struct cpufreq_policy *policy,
> -					   unsigned int old_cpu)
> -{
> -	struct device *cpu_dev;
> -	int ret;
> -
> -	/* first sibling now owns the new sysfs dir */
> -	cpu_dev = get_cpu_device(cpumask_any_but(policy->cpus, old_cpu));
> -
> -	sysfs_remove_link(&cpu_dev->kobj, "cpufreq");
> -	ret = kobject_move(&policy->kobj, &cpu_dev->kobj);
> -	if (ret) {
> -		pr_err("%s: Failed to move kobj: %d\n", __func__, ret);
> -
> -		down_write(&policy->rwsem);
> -		cpumask_set_cpu(old_cpu, policy->cpus);
> -		up_write(&policy->rwsem);
> -
> -		ret = sysfs_create_link(&cpu_dev->kobj, &policy->kobj,
> -					"cpufreq");
> -
> -		return -EINVAL;
> -	}
> -
> -	return cpu_dev->id;
> -}
> -
> -static int __cpufreq_remove_dev_prepare(struct device *dev,
> -					struct subsys_interface *sif)
> +static int __cpufreq_remove_dev(struct device *dev,
> +				struct subsys_interface *sif)
>   {
> -	unsigned int cpu = dev->id, cpus;
> -	int new_cpu, ret;
> +	unsigned int cpu = dev->id, j;
> +	int ret = 0;
>   	unsigned long flags;
>   	struct cpufreq_policy *policy;
>
>   	pr_debug("%s: unregistering CPU %u\n", __func__, cpu);
>
> -	write_lock_irqsave(&cpufreq_driver_lock, flags);
> -
> -	policy = per_cpu(cpufreq_cpu_data, cpu);
> -
> -	/* Save the policy somewhere when doing a light-weight tear-down */
> -	if (cpufreq_suspended)
> -		per_cpu(cpufreq_cpu_data_fallback, cpu) = policy;
> -
> -	write_unlock_irqrestore(&cpufreq_driver_lock, flags);
> -
> -	if (!policy) {
> -		pr_debug("%s: No cpu_data found\n", __func__);
> -		return -EINVAL;
> -	}
> -
> -	if (has_target()) {
> -		ret = __cpufreq_governor(policy, CPUFREQ_GOV_STOP);
> -		if (ret) {
> -			pr_err("%s: Failed to stop governor\n", __func__);
> -			return ret;
> -		}
> -	}
> -
> -	if (!cpufreq_driver->setpolicy)
> -		strncpy(per_cpu(cpufreq_cpu_governor, cpu),
> -			policy->governor->name, CPUFREQ_NAME_LEN);
> -
> -	down_read(&policy->rwsem);
> -	cpus = cpumask_weight(policy->cpus);
> -	up_read(&policy->rwsem);
> -
> -	if (cpu != policy->cpu) {
> -		sysfs_remove_link(&dev->kobj, "cpufreq");
> -	} else if (cpus > 1) {
> -		new_cpu = cpufreq_nominate_new_policy_cpu(policy, cpu);
> -		if (new_cpu >= 0) {
> -			update_policy_cpu(policy, new_cpu);
> -
> -			if (!cpufreq_suspended)
> -				pr_debug("%s: policy Kobject moved to cpu: %d from: %d\n",
> -					 __func__, new_cpu, cpu);
> -		}
> -	} else if (cpufreq_driver->stop_cpu && cpufreq_driver->setpolicy) {
> -		cpufreq_driver->stop_cpu(policy);
> -	}
> -
> -	return 0;
> -}
> -
> -static int __cpufreq_remove_dev_finish(struct device *dev,
> -				       struct subsys_interface *sif)
> -{
> -	unsigned int cpu = dev->id, cpus;
> -	int ret;
> -	unsigned long flags;
> -	struct cpufreq_policy *policy;
> -
>   	read_lock_irqsave(&cpufreq_driver_lock, flags);
>   	policy = per_cpu(cpufreq_cpu_data, cpu);
>   	read_unlock_irqrestore(&cpufreq_driver_lock, flags);
> @@ -1410,56 +1265,45 @@ static int __cpufreq_remove_dev_finish(struct device *dev,
>   		return -EINVAL;
>   	}
>
> -	down_write(&policy->rwsem);
> -	cpus = cpumask_weight(policy->cpus);
> -
> -	if (cpus > 1)
> -		cpumask_clear_cpu(cpu, policy->cpus);
> -	up_write(&policy->rwsem);
> -
> -	/* If cpu is last user of policy, free policy */
> -	if (cpus == 1) {
> -		if (has_target()) {
> -			ret = __cpufreq_governor(policy,
> -					CPUFREQ_GOV_POLICY_EXIT);
> -			if (ret) {
> -				pr_err("%s: Failed to exit governor\n",
> -				       __func__);
> -				return ret;
> -			}
> -		}
> -
> -		if (!cpufreq_suspended)
> -			cpufreq_policy_put_kobj(policy);
> +#ifdef CONFIG_HOTPLUG_CPU
> +	ret = cpufreq_change_policy_cpus(policy, cpu, false);
> +#endif
> +	if (ret)
> +		return ret;
>
> -		/*
> -		 * Perform the ->exit() even during light-weight tear-down,
> -		 * since this is a core component, and is essential for the
> -		 * subsequent light-weight ->init() to succeed.
> -		 */
> -		if (cpufreq_driver->exit)
> -			cpufreq_driver->exit(policy);
> +	if (!sif)
> +		return 0;
>
> -		/* Remove policy from list of active policies */
> -		write_lock_irqsave(&cpufreq_driver_lock, flags);
> -		list_del(&policy->policy_list);
> -		write_unlock_irqrestore(&cpufreq_driver_lock, flags);
> +	if (!cpumask_empty(policy->cpus)) {
> +		return 0;
> +	}
>
> -		if (!cpufreq_suspended)
> -			cpufreq_policy_free(policy);
> -	} else if (has_target()) {
> -		ret = __cpufreq_governor(policy, CPUFREQ_GOV_START);
> -		if (!ret)
> -			ret = __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
> +	cpufreq_dev_symlink(policy, false);
>
> +	if (has_target()) {
> +		ret = __cpufreq_governor(policy,
> +				CPUFREQ_GOV_POLICY_EXIT);
>   		if (ret) {
> -			pr_err("%s: Failed to start governor\n", __func__);
> +			pr_err("%s: Failed to exit governor\n",
> +			       __func__);
>   			return ret;
>   		}
>   	}
>
> -	per_cpu(cpufreq_cpu_data, cpu) = NULL;
> -	return 0;
> +	cpufreq_policy_put_kobj(policy);
> +	if (cpufreq_driver->exit)
> +		cpufreq_driver->exit(policy);
> +
> +	/* Remove policy from list of active policies */
> +	write_lock_irqsave(&cpufreq_driver_lock, flags);
> +	for_each_cpu(j, policy->related_cpus)
> +		per_cpu(cpufreq_cpu_data, j) = NULL;
> +	list_del(&policy->policy_list);
> +	write_unlock_irqrestore(&cpufreq_driver_lock, flags);
> +
> +	cpufreq_policy_free(policy);
> +
> +	return ret;
>   }
>
>   /**
> @@ -1469,18 +1313,7 @@ static int __cpufreq_remove_dev_finish(struct device *dev,
>    */
>   static int cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif)
>   {
> -	unsigned int cpu = dev->id;
> -	int ret;
> -
> -	if (cpu_is_offline(cpu))
> -		return 0;
> -
> -	ret = __cpufreq_remove_dev_prepare(dev, sif);
> -
> -	if (!ret)
> -		ret = __cpufreq_remove_dev_finish(dev, sif);
> -
> -	return ret;
> +	return __cpufreq_remove_dev(dev, sif);
>   }
>
>   static void handle_update(struct work_struct *work)
> @@ -2295,19 +2128,12 @@ static int cpufreq_cpu_callback(struct notifier_block *nfb,
>   	if (dev) {
>   		switch (action & ~CPU_TASKS_FROZEN) {
>   		case CPU_ONLINE:
> +		case CPU_DOWN_FAILED:
>   			__cpufreq_add_dev(dev, NULL);
>   			break;
>
>   		case CPU_DOWN_PREPARE:
> -			__cpufreq_remove_dev_prepare(dev, NULL);
> -			break;
> -
> -		case CPU_POST_DEAD:
> -			__cpufreq_remove_dev_finish(dev, NULL);
> -			break;
> -
> -		case CPU_DOWN_FAILED:
> -			__cpufreq_add_dev(dev, NULL);
> +			__cpufreq_remove_dev(dev, NULL);
>   			break;
>   		}
>   	}
>

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Viresh Kumar July 16, 2014, 3:28 p.m. UTC | #6
On 16 July 2014 19:59, Dirk Brandewie <dirk.brandewie@gmail.com> wrote:
> stop_cpu() only needs to be called during __cpufreq_remove_dev_prepare() no
> where else.

Oh, thanks for reminding us..

Look at this Saravana:
367dc4a cpufreq: Add stop CPU callback to cpufreq_driver interface
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Srivatsa S. Bhat July 16, 2014, 6:04 p.m. UTC | #7
On 07/16/2014 06:43 PM, Viresh Kumar wrote:
> On 16 July 2014 16:46, Srivatsa S. Bhat <srivatsa@mit.edu> wrote:
>> Short answer: If the sysfs directory has already been created by cpufreq,
>> then yes, it will remain as it is. However, if the online operation failed
>> before that, then cpufreq won't know about that CPU at all, and no file will
>> be created.
>>
>> Long answer:
>> The existing cpufreq code does all its work (including creating the sysfs
>> directories etc) at the CPU_ONLINE stage. This stage is not expected to fail
>> (in fact even the core CPU hotplug code in kernel/cpu.c doesn't care for
>> error returns at this point). So if a CPU fails to come up in earlier stages
>> itself (such as CPU_UP_PREPARE), then cpufreq won't even hear about that CPU,
>> and hence no sysfs files will be created/linked. However, if the CPU bringup
>> operation fails during the CPU_ONLINE stage after the cpufreq's notifier has
>> been invoked, then we do nothing about it and the cpufreq sysfs files will
>> remain.
> 
> In short, the problem I mentioned before this para is genuine. And setting
> policy->cpu to the first cpu of a mask is indeed a bad idea.
> 
>>> Also, how does suspend/resume work without CONFIG_HOTPLUG_CPU ?
>>> What's the sequence of events?
>>>
>>
>> Well, CONFIG_SUSPEND doesn't have an explicit dependency on HOTPLUG_CPU, but
>> SMP systems usually use CONFIG_PM_SLEEP_SMP, which sets CONFIG_HOTPLUG_CPU.
> 
> I read usually as *optional*
> 
>> (I guess the reason why CONFIG_SUSPEND doesn't depend on HOTPLUG_CPU is
>> because suspend is possible even on uniprocessor systems and hence the
>> Kconfig dependency wasn't really justified).
> 
> Again the same question, how do we suspend when HOTPLUG is disabled?
> 

From what I understand, if you disable HOTPLUG_CPU and enable CONFIG_SUSPEND
and try suspend/resume on an SMP system, the disable_nonboot_cpus() call will
return silently without doing anything. Thus, suspend will fail silently and
the system might have trouble resuming.

But surprisingly we have never had such bug reports so far! Most probably this
is because PM_SLEEP_SMP has a default of y (which in turn selects HOTPLUG_CPU):

config PM_SLEEP_SMP
        def_bool y
        depends on SMP
        depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
        depends on PM_SLEEP
        select HOTPLUG_CPU

So I guess nobody really tried turning this off on SMP systems and then trying
suspend. Then I started looking at the git history and wondered where this
Kconfig dependency between SUSPEND and SMP<->HOTPLUG_CPU got messed up. But
instead I found that the initial commit itself didn't get the dependency right.

Commit 296699de6bdc (Introduce CONFIG_SUSPEND for suspend-to-Ram and standby)
introduced all the Kconfig options, and it indeed mentions this in the
changelog: "Make HOTPLUG_CPU be selected automatically if SUSPEND or
HIBERNATION has been chosen and the kernel is intended for SMP systems". But
unfortunately, the code didn't get it right because it made CONFIG_SUSPEND
depend on SUSPEND_SMP_POSSIBLE instead of SUSPEND_SMP.

In other words, we have had this incorrect dependency all the time!

Regards,
Srivatsa S. Bhat
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Saravana Kannan July 16, 2014, 7:42 p.m. UTC | #8
On 07/16/2014 08:28 AM, Viresh Kumar wrote:
> On 16 July 2014 19:59, Dirk Brandewie <dirk.brandewie@gmail.com> wrote:
>> stop_cpu() only needs to be called during __cpufreq_remove_dev_prepare() no
>> where else.
>
> Oh, thanks for reminding us..
>
> Look at this Saravana:
> 367dc4a cpufreq: Add stop CPU callback to cpufreq_driver interface
>

I'll only get called at the same time as it is called today. 
__cpufreq_remove_dev_prepare is now renamed to __cpufreq_remove_dev. And 
this function is called from there.

The only time stop does get called is when __cpufreq_remove_dev is 
called on the last CPU of a policy. So, functionally it's identical.

Btw, I already added logs to all cpufreq driver ops and checked the 
calls come in the same order with and without my changes.

-Saravana
Saravana Kannan July 16, 2014, 7:56 p.m. UTC | #9
On 07/16/2014 04:16 AM, Srivatsa S. Bhat wrote:
> On 07/16/2014 01:54 PM, Viresh Kumar wrote:
>> On 16 July 2014 04:17, Saravana Kannan <skannan@codeaurora.org> wrote:
>>> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
>>

<SNIP>

>>> -static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy,
>>> -                                 unsigned int cpu, struct device *dev)
>>> +static int cpufreq_change_policy_cpus(struct cpufreq_policy *policy,
>>> +                                 unsigned int cpu, bool add)
>
> [...]
>
>>> -
>>> -       if (!cpufreq_driver->setpolicy)
>>> -               strncpy(per_cpu(cpufreq_cpu_governor, cpu),
>>> -                       policy->governor->name, CPUFREQ_NAME_LEN);
>>
>> Where is this gone? There are several instances of code just being
>> removed, this is the third one. Its really really tough to catch these
>> in this big of a patch. Believe me.
>>
>> You have to break this patch into multiple ones, see this on how to
>> break even simplest of the changes into multiple patches:
>> https://lkml.org/lkml/2013/9/6/400
>>
>> Its just impossible to catch bugs that you might have introduced here due
>> to the size of this patch. And its taking a LOT of time for me to review this.
>> As I have to keep diff in one tab, new cpufreq.c in one and the old cpufreq.c
>> in one and then compare..
>>
>
> True, this is still a pretty huge chunk. Saravana, at this stage, don't worry
> about making cpufreq work properly in each and every patch. Just ensure that
> every patch builds fine; that should be good enough. I hope this will help you
> in splitting up the patches further.

Thanks Srivatsa. This will definitely help split them up into smaller 
chunks.

> One other thing: your changelog contains what we usually write in a cover-
> letter - *very* high-level goals of the patch. Ideally, you should explain
> the subtle details and the non-obvious decisions or trade-offs that you have
> made at various places in the code. Otherwise it becomes very hard to follow
> your thought-flow just by looking at the patch. So please split up the patch
> further and also make the changelogs useful to review the patch :-)

Thanks. Will do.

> The link that Viresh gave above also did a lot of code reorganization in
> cpufreq, so it should give you a good example of how to proceed.
>
> [...]
>
>>>                          __cpufreq_add_dev(dev, NULL);
>>>                          break;
>>>
>>>                  case CPU_DOWN_PREPARE:
>>> -                       __cpufreq_remove_dev_prepare(dev, NULL);
>>> -                       break;
>>> -
>>> -               case CPU_POST_DEAD:
>>> -                       __cpufreq_remove_dev_finish(dev, NULL);
>>> -                       break;
>>> -
>>> -               case CPU_DOWN_FAILED:
>>> -                       __cpufreq_add_dev(dev, NULL);
>>> +                       __cpufreq_remove_dev(dev, NULL);
>>
>> @Srivatsa: You might want to have a look at this, remove sequence was
>> separated for some purpose and I am just not able to concentrate enough
>> to think of that, just too many cases running in my mind :)
>>
>
> Yeah, we had split it into _remove_dev_prepare() and _remove_dev_finish()
> to avoid a few potential deadlocks. We wanted to call _remove_dev_prepare()
> in the DOWN_PREPARE stage and then call _remove_dev_finish() (which waits
> for the kobject refcount to drop) in the POST_DEAD stage. That is, we wanted
> to do the kobject cleanup after releasing the hotplug lock, and POST_DEAD stage
> was well-suited for that.
>
> Commit 1aee40ac9c8 (cpufreq: Invoke __cpufreq_remove_dev_finish() after
> releasing cpu_hotplug.lock) explains this in detail. Saravana, please take a
> look at that reasoning and ensure that your patch doesn't re-introduce those
> deadlock possibilities!

But all of that was needed _because_ we were creating and destroying 
policies and kobjs all the time. We don't do that anymore. So, I don't 
think any of that applies. We only destroy when the cpufreq driver is 
unregistered. That's kinda of the point of this patchset.

Thoughts?

-Saravana
Saravana Kannan July 16, 2014, 7:56 p.m. UTC | #10
On 07/16/2014 06:13 AM, Viresh Kumar wrote:
> On 16 July 2014 16:46, Srivatsa S. Bhat <srivatsa@mit.edu> wrote:
>> Short answer: If the sysfs directory has already been created by cpufreq,
>> then yes, it will remain as it is. However, if the online operation failed
>> before that, then cpufreq won't know about that CPU at all, and no file will
>> be created.
>>
>> Long answer:
>> The existing cpufreq code does all its work (including creating the sysfs
>> directories etc) at the CPU_ONLINE stage. This stage is not expected to fail
>> (in fact even the core CPU hotplug code in kernel/cpu.c doesn't care for
>> error returns at this point). So if a CPU fails to come up in earlier stages
>> itself (such as CPU_UP_PREPARE), then cpufreq won't even hear about that CPU,
>> and hence no sysfs files will be created/linked. However, if the CPU bringup
>> operation fails during the CPU_ONLINE stage after the cpufreq's notifier has
>> been invoked, then we do nothing about it and the cpufreq sysfs files will
>> remain.
>
> In short, the problem I mentioned before this para is genuine. And setting
> policy->cpu to the first cpu of a mask is indeed a bad idea.

No it's not. All the cpu*/ directories for all possible CPUs will be 
there whether a CPU is online/offline. Which is why I also weed out 
impossible CPUs, but you said the driver shouldn't be passing impossible 
CPUs anyway. I'm just picking one directory to put the real cpufreq 
directory under. So, the code as-is is definitely not broken.

Sure, I can pick the first cpu that comes online to decide where to put 
the real sysfs cpufreq directory, but then I have to keep track of that 
in a separate field when it's time to remove it when the cpufreq driver 
is unregistered. It's yet another pointless thing to keep track. And no, 
we shouldn't be moving sysfs directory to stay with only an online 
directory. That's the thing this patch is trying to simplify.

So, I think using the first cpu in related CPUs will always work. If 
there's any disagreement, I think it's purely a personal preference over 
adding another field vs calling cpumask_first()

-Saravana
Saravana Kannan July 16, 2014, 8:25 p.m. UTC | #11
On 07/16/2014 01:24 AM, Viresh Kumar wrote:
> On 16 July 2014 04:17, Saravana Kannan <skannan@codeaurora.org> wrote:
>> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
>
>> +/* symlink related CPUs */
>> +static int cpufreq_dev_symlink(struct cpufreq_policy *policy, bool add)
>>   {
>> -       unsigned int j;
>> +       unsigned int j, first_cpu = cpumask_first(policy->related_cpus);
>
> The CPU which came first should get the ownership by default, instead
> of the first one in the mask.
>
> Normally at boot, all CPUs come up first and then only cpufreq init starts.
> But in case all other CPUs fail to come up, then policy->cpu *might* point
> to a failed cpu.
>
> And so, we should simply use policy->cpu here instead of finding the
> first one in the mask.
>

Replied to this in a different email.

> Also, its not the duty of this routine to find which one is the policy cpu as
> that is done by __cpufreq_add_dev(). And so in case we need to make
> first cpu of a mask as policy->cpu, it should be done in __cpufreq_add_dev()
> and not here. This one should just follow the orders :)

This is a new function. And that split up might have made sense earlier. 
But not so much anymore since I'm sharing a lot of code between 
__cpufreq_add_dev() and __cpufreq_remove_dev(). There's not reason to 
stick with the previous split of up work if it doesn't apply well anymore.

Please give this a second thought. Maybe it'll make more sense after I 
split this up into smaller patches.

>
> @Srivatsa: What happens to the sysfs directory if a CPU fails to come up?
> Is it exactly similar to how it happens in hotplug? i.e. we do have a directory
> there?
>
>>          int ret = 0;
>>
>> -       for_each_cpu(j, policy->cpus) {
>> +       for_each_cpu(j, policy->related_cpus) {
>>                  struct device *cpu_dev;
>>
>> -               if (j == policy->cpu)
>> +               if (j == first_cpu)
>>                          continue;
>>
>> -               pr_debug("Adding link for CPU: %u\n", j);
>
> Keep this please, it might be useful while debugging.

Reluctant ok. We don't add/remove these files anymore in a common 
scenario. So, it's not going to be very helpful. I'll also have to do a 
add ? Add : Remove blurb for the print.

>
>>                  cpu_dev = get_cpu_device(j);
>> -               ret = sysfs_create_link(&cpu_dev->kobj, &policy->kobj,
>> -                                       "cpufreq");
>> +               if (add)
>> +                       ret = sysfs_create_link(&cpu_dev->kobj, &policy->kobj,
>> +                                               "cpufreq");
>> +               else
>> +                       sysfs_remove_link(&cpu_dev->kobj, "cpufreq");
>> +
>>                  if (ret)
>>                          break;
>>          }
>>          return ret;
>>   }
>>
>> -static int cpufreq_add_dev_interface(struct cpufreq_policy *policy,
>> -                                    struct device *dev)
>> +static int cpufreq_add_dev_interface(struct cpufreq_policy *policy)
>>   {
>>          struct freq_attr **drv_attr;
>> +       struct device *dev;
>>          int ret = 0;
>>
>> +       dev = get_cpu_device(cpumask_first(policy->related_cpus));
>> +       if (!dev)
>> +               return -EINVAL;
>
> Again, deciding which cpu is policy->cpu here is wrong. Just follow
> orders of __cpufreq_add_dev().

But that's not what I'm doing here?

>>          /* prepare interface data */
>>          ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq,
>>                                     &dev->kobj, "cpufreq");
>> @@ -917,7 +923,7 @@ static int cpufreq_add_dev_interface(struct cpufreq_policy *policy,
>>                          goto err_out_kobj_put;
>>          }
>>
>> -       ret = cpufreq_add_dev_symlink(policy);
>> +       ret = cpufreq_dev_symlink(policy, true);
>>          if (ret)
>>                  goto err_out_kobj_put;
>>
>> @@ -961,60 +967,58 @@ static void cpufreq_init_policy(struct cpufreq_policy *policy)
>>   }
>>
>>   #ifdef CONFIG_HOTPLUG_CPU
>
> @Srivatsa: I will try this but you also take care of this. These
> ifdefs might go wrong,
> i.e. we are surely using it in the current patch without HOTPLUG as well. See
> cpufreq_add_dev()..
>
> Also, how does suspend/resume work without CONFIG_HOTPLUG_CPU ?
> What's the sequence of events?
>
>> -static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy,
>> -                                 unsigned int cpu, struct device *dev)
>> +static int cpufreq_change_policy_cpus(struct cpufreq_policy *policy,
>> +                                 unsigned int cpu, bool add)
>>   {
>>          int ret = 0;
>> -       unsigned long flags;
>> +       unsigned int cpus, pcpu;
>>
>> -       if (has_target()) {
>> +       down_write(&policy->rwsem);
>> +
>> +       cpus = !cpumask_empty(policy->cpus);
>
> We aren't using cpus at multiple places and so probably it would
> be better to using cpumask_empty() directly.
>
>> +       if (has_target() && cpus) {
>
> I may get the answer later in reviews, but when will cpus be 0 here?
> Probably for non-boot cluster during suspend/resume, or forceful
> hotplugging off all CPUs of a cluster. Right?

Yup!

>
>>                  ret = __cpufreq_governor(policy, CPUFREQ_GOV_STOP);
>>                  if (ret) {
>>                          pr_err("%s: Failed to stop governor\n", __func__);
>> -                       return ret;
>> +                       goto unlock;
>>                  }
>>          }
>>
>> -       down_write(&policy->rwsem);
>> -
>> -       write_lock_irqsave(&cpufreq_driver_lock, flags);
>> -
>> -       cpumask_set_cpu(cpu, policy->cpus);
>> -       per_cpu(cpufreq_cpu_data, cpu) = policy;
>> -       write_unlock_irqrestore(&cpufreq_driver_lock, flags);
>> +       if (add)
>> +               cpumask_set_cpu(cpu, policy->cpus);
>> +       else
>> +               cpumask_clear_cpu(cpu, policy->cpus);
>>
>> -       up_write(&policy->rwsem);
>> +       pcpu = cpumask_first(policy->cpus);
>> +       if (pcpu < nr_cpu_ids && policy->cpu != pcpu) {
>
> No, we don't have to consider changing policy->cpu for every change
> in policy->cpus. We need to do that only when policy->cpu goes down.

Ok, I agree I could improve the check to reduce the unnecessary 
notification even more.

> Also pcpu can't be < nr_cpu_ids, right?

This is for the case when all CPUs in a cluster have been taken down. We 
don't want to send the notifier at that point. When the mask is empty, 
the function returns a value >= nr_cpu_ids to indicate an error.

>
>> +               policy->last_cpu = policy->cpu;
>> +               policy->cpu = pcpu;
>> +               blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
>> +                                       CPUFREQ_UPDATE_POLICY_CPU, policy);
>> +       }
>>
>> -       if (has_target()) {
>> +       cpus = !cpumask_empty(policy->cpus);
>> +       if (has_target() && cpus) {
>>                  ret = __cpufreq_governor(policy, CPUFREQ_GOV_START);
>>                  if (!ret)
>>                          ret = __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
>>
>>                  if (ret) {
>>                          pr_err("%s: Failed to start governor\n", __func__);
>> -                       return ret;
>> +                       goto unlock;
>>                  }
>>          }
>>
>> -       return sysfs_create_link(&dev->kobj, &policy->kobj, "cpufreq");
>> -}
>> -#endif
>> -
>> -static struct cpufreq_policy *cpufreq_policy_restore(unsigned int cpu)
>> -{
>> -       struct cpufreq_policy *policy;
>> -       unsigned long flags;
>> -
>> -       read_lock_irqsave(&cpufreq_driver_lock, flags);
>> -
>> -       policy = per_cpu(cpufreq_cpu_data_fallback, cpu);
>> -
>> -       read_unlock_irqrestore(&cpufreq_driver_lock, flags);
>> +       if (!cpus && cpufreq_driver->stop_cpu && cpufreq_driver->setpolicy) {
>
> As I commented on V1, please make it else part of above if..
>
>> +               cpufreq_driver->stop_cpu(policy);
>> +       }
>>
>> -       policy->governor = NULL;
>> +unlock:
>> +       up_write(&policy->rwsem);
>>
>> -       return policy;
>> +       return ret;
>>   }
>> +#endif
>>
>>   static struct cpufreq_policy *cpufreq_policy_alloc(void)
>>   {
>> @@ -1053,10 +1057,8 @@ static void cpufreq_policy_put_kobj(struct cpufreq_policy *policy)
>>          blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
>>                          CPUFREQ_REMOVE_POLICY, policy);
>>
>> -       down_read(&policy->rwsem);
>>          kobj = &policy->kobj;
>>          cmp = &policy->kobj_unregister;
>> -       up_read(&policy->rwsem);
>
> Why? And also, these are unrelated changes and must be added as separate
> commits.

This is because we call this with policy rwsem read lock held and 
lockdep throws a warning. So, it's related to this patch.

>
>>          kobject_put(kobj);
>>
>>          /*
>> @@ -1076,32 +1078,12 @@ static void cpufreq_policy_free(struct cpufreq_policy *policy)
>>          kfree(policy);
>>   }
>>
>> -static void update_policy_cpu(struct cpufreq_policy *policy, unsigned int cpu)
>> -{
>> -       if (WARN_ON(cpu == policy->cpu))
>> -               return;
>> -
>> -       down_write(&policy->rwsem);
>> -
>> -       policy->last_cpu = policy->cpu;
>> -       policy->cpu = cpu;
>> -
>> -       up_write(&policy->rwsem);
>> -
>> -       blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
>> -                       CPUFREQ_UPDATE_POLICY_CPU, policy);
>> -}
>> -
>>   static int __cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
>>   {
>>          unsigned int j, cpu = dev->id;
>>          int ret = -ENOMEM;
>>          struct cpufreq_policy *policy;
>>          unsigned long flags;
>> -       bool recover_policy = cpufreq_suspended;
>> -#ifdef CONFIG_HOTPLUG_CPU
>> -       struct cpufreq_policy *tpolicy;
>> -#endif
>>
>>          if (cpu_is_offline(cpu))
>>                  return 0;
>> @@ -1110,9 +1092,10 @@ static int __cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
>>
>>   #ifdef CONFIG_SMP
>>          /* check whether a different CPU already registered this
>> -        * CPU because it is in the same boat. */
>> +        * CPU because it is one of the related CPUs. */
>>          policy = cpufreq_cpu_get(cpu);
>> -       if (unlikely(policy)) {
>> +       if (policy) {
>> +               cpufreq_change_policy_cpus(policy, cpu, true);
>
> This is just a waste of time at boot as ... (see below)

Why? Please explain.

>
>>                  cpufreq_cpu_put(policy);
>>                  return 0;
>>          }
>> @@ -1121,45 +1104,14 @@ static int __cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
>>          if (!down_read_trylock(&cpufreq_rwsem))
>>                  return 0;
>>
>> -#ifdef CONFIG_HOTPLUG_CPU
>> -       /* Check if this cpu was hot-unplugged earlier and has siblings */
>> -       read_lock_irqsave(&cpufreq_driver_lock, flags);
>> -       list_for_each_entry(tpolicy, &cpufreq_policy_list, policy_list) {
>> -               if (cpumask_test_cpu(cpu, tpolicy->related_cpus)) {
>> -                       read_unlock_irqrestore(&cpufreq_driver_lock, flags);
>> -                       ret = cpufreq_add_policy_cpu(tpolicy, cpu, dev);
>> -                       up_read(&cpufreq_rwsem);
>> -                       return ret;
>> -               }
>> -       }
>> -       read_unlock_irqrestore(&cpufreq_driver_lock, flags);
>> -#endif
>> -
>> -       /*
>> -        * Restore the saved policy when doing light-weight init and fall back
>> -        * to the full init if that fails.
>> -        */
>> -       policy = recover_policy ? cpufreq_policy_restore(cpu) : NULL;
>> -       if (!policy) {
>> -               recover_policy = false;
>> -               policy = cpufreq_policy_alloc();
>> -               if (!policy)
>> -                       goto nomem_out;
>> -       }
>> -
>> -       /*
>> -        * In the resume path, since we restore a saved policy, the assignment
>> -        * to policy->cpu is like an update of the existing policy, rather than
>> -        * the creation of a brand new one. So we need to perform this update
>> -        * by invoking update_policy_cpu().
>> -        */
>> -       if (recover_policy && cpu != policy->cpu)
>> -               update_policy_cpu(policy, cpu);
>> -       else
>> -               policy->cpu = cpu;
>> +       /* If we get this far, this is the first time we are adding the
>> +        * policy */
>> +       policy = cpufreq_policy_alloc();
>> +       if (!policy)
>> +               goto nomem_out;
>> +       policy->cpu = cpu;
>>
>>          cpumask_copy(policy->cpus, cpumask_of(cpu));
>> -
>>          init_completion(&policy->kobj_unregister);
>>          INIT_WORK(&policy->update, handle_update);
>>
>> @@ -1169,26 +1121,25 @@ static int __cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
>>          ret = cpufreq_driver->init(policy);
>>          if (ret) {
>>                  pr_debug("initialization failed\n");
>> -               goto err_set_policy_cpu;
>> +               goto err_init;
>>          }
>>
>>          /* related cpus should atleast have policy->cpus */
>>          cpumask_or(policy->related_cpus, policy->related_cpus, policy->cpus);
>
> policy->cpus is already updated here.
>
>> +       /* Weed out impossible CPUs. */
>> +       cpumask_and(policy->related_cpus, policy->related_cpus,
>> +                       cpu_possible_mask);
>
> This has to be in a separate commit..

I meant to remove this based on your previous comment that it's the 
responsibility of the driver to pass only possible CPUs. Forgot. Will do.

>
>>          /*
>>           * affected cpus must always be the one, which are online. We aren't
>>           * managing offline cpus here.
>>           */
>>          cpumask_and(policy->cpus, policy->cpus, cpu_online_mask);
>>
>> -       if (!recover_policy) {
>> -               policy->user_policy.min = policy->min;
>> -               policy->user_policy.max = policy->max;
>> -       }
>> -
>
> Where did these go? There weren't there for fun.

We are keeping the policy intact. So, why would this be needed anymore?

>
>>          down_write(&policy->rwsem);
>>          write_lock_irqsave(&cpufreq_driver_lock, flags);
>> -       for_each_cpu(j, policy->cpus)
>> +       for_each_cpu(j, policy->related_cpus)
>>                  per_cpu(cpufreq_cpu_data, j) = policy;
>>          write_unlock_irqrestore(&cpufreq_driver_lock, flags);
>>
>> @@ -1243,13 +1194,11 @@ static int __cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
>>          blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
>>                                       CPUFREQ_START, policy);
>>
>> -       if (!recover_policy) {
>> -               ret = cpufreq_add_dev_interface(policy, dev);
>> -               if (ret)
>> -                       goto err_out_unregister;
>> -               blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
>> -                               CPUFREQ_CREATE_POLICY, policy);
>> -       }
>> +       ret = cpufreq_add_dev_interface(policy);
>> +       if (ret)
>> +               goto err_out_unregister;
>> +       blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
>> +                       CPUFREQ_CREATE_POLICY, policy);
>>
>>          write_lock_irqsave(&cpufreq_driver_lock, flags);
>>          list_add(&policy->policy_list, &cpufreq_policy_list);
>> @@ -1257,10 +1206,6 @@ static int __cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
>>
>>          cpufreq_init_policy(policy);
>>
>> -       if (!recover_policy) {
>> -               policy->user_policy.policy = policy->policy;
>> -               policy->user_policy.governor = policy->governor;
>> -       }
>
> Same here.

Same here. We are keeping the policy intact. So, not needed?

>
>>          up_write(&policy->rwsem);
>>
>>          kobject_uevent(&policy->kobj, KOBJ_ADD);
>> @@ -1273,20 +1218,14 @@ static int __cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
>>   err_out_unregister:
>>   err_get_freq:
>>          write_lock_irqsave(&cpufreq_driver_lock, flags);
>> -       for_each_cpu(j, policy->cpus)
>> +       for_each_cpu(j, policy->related_cpus)
>>                  per_cpu(cpufreq_cpu_data, j) = NULL;
>>          write_unlock_irqrestore(&cpufreq_driver_lock, flags);
>> -
>> +       up_write(&policy->rwsem);
>>          if (cpufreq_driver->exit)
>>                  cpufreq_driver->exit(policy);
>> -err_set_policy_cpu:
>> -       if (recover_policy) {
>> -               /* Do not leave stale fallback data behind. */
>> -               per_cpu(cpufreq_cpu_data_fallback, cpu) = NULL;
>> -               cpufreq_policy_put_kobj(policy);
>> -       }
>> +err_init:
>>          cpufreq_policy_free(policy);
>> -
>>   nomem_out:
>>          up_read(&cpufreq_rwsem);
>>
>
> Just to mention, I am not looking at the validity of error fallback paths
> in this version. Just make sure they are all good :)

Will do. And even if it's broken, I'll fix it in a separate patch :)


>> @@ -1307,100 +1246,16 @@ static int cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
>>          return __cpufreq_add_dev(dev, sif);
>>   }
>>
>> -static int cpufreq_nominate_new_policy_cpu(struct cpufreq_policy *policy,
>> -                                          unsigned int old_cpu)
>> -{
>> -       struct device *cpu_dev;
>> -       int ret;
>> -
>> -       /* first sibling now owns the new sysfs dir */
>> -       cpu_dev = get_cpu_device(cpumask_any_but(policy->cpus, old_cpu));
>> -
>> -       sysfs_remove_link(&cpu_dev->kobj, "cpufreq");
>> -       ret = kobject_move(&policy->kobj, &cpu_dev->kobj);
>> -       if (ret) {
>> -               pr_err("%s: Failed to move kobj: %d\n", __func__, ret);
>> -
>> -               down_write(&policy->rwsem);
>> -               cpumask_set_cpu(old_cpu, policy->cpus);
>> -               up_write(&policy->rwsem);
>> -
>> -               ret = sysfs_create_link(&cpu_dev->kobj, &policy->kobj,
>> -                                       "cpufreq");
>> -
>> -               return -EINVAL;
>> -       }
>> -
>> -       return cpu_dev->id;
>> -}
>> -
>> -static int __cpufreq_remove_dev_prepare(struct device *dev,
>> -                                       struct subsys_interface *sif)
>> +static int __cpufreq_remove_dev(struct device *dev,
>> +                               struct subsys_interface *sif)
>>   {
>> -       unsigned int cpu = dev->id, cpus;
>> -       int new_cpu, ret;
>> +       unsigned int cpu = dev->id, j;
>> +       int ret = 0;
>>          unsigned long flags;
>>          struct cpufreq_policy *policy;
>>
>>          pr_debug("%s: unregistering CPU %u\n", __func__, cpu);
>>
>> -       write_lock_irqsave(&cpufreq_driver_lock, flags);
>> -
>> -       policy = per_cpu(cpufreq_cpu_data, cpu);
>> -
>> -       /* Save the policy somewhere when doing a light-weight tear-down */
>> -       if (cpufreq_suspended)
>> -               per_cpu(cpufreq_cpu_data_fallback, cpu) = policy;
>> -
>> -       write_unlock_irqrestore(&cpufreq_driver_lock, flags);
>> -
>> -       if (!policy) {
>> -               pr_debug("%s: No cpu_data found\n", __func__);
>> -               return -EINVAL;
>> -       }
>> -
>> -       if (has_target()) {
>> -               ret = __cpufreq_governor(policy, CPUFREQ_GOV_STOP);
>> -               if (ret) {
>> -                       pr_err("%s: Failed to stop governor\n", __func__);
>> -                       return ret;
>> -               }
>> -       }
>> -
>> -       if (!cpufreq_driver->setpolicy)
>> -               strncpy(per_cpu(cpufreq_cpu_governor, cpu),
>> -                       policy->governor->name, CPUFREQ_NAME_LEN);
>
> Where is this gone? There are several instances of code just being
> removed, this is the third one. Its really really tough to catch these
> in this big of a patch. Believe me.
>
> You have to break this patch into multiple ones, see this on how to
> break even simplest of the changes into multiple patches:
> https://lkml.org/lkml/2013/9/6/400
>
> Its just impossible to catch bugs that you might have introduced here due
> to the size of this patch. And its taking a LOT of time for me to review this.
> As I have to keep diff in one tab, new cpufreq.c in one and the old cpufreq.c
> in one and then compare..

Will do. With Srivatsa point about just making sure every patch 
compiles, it should be easy to break it up. But to answer your original 
question, it's again not needed to save/restore since we don't destroy it.

>
>> -       down_read(&policy->rwsem);
>> -       cpus = cpumask_weight(policy->cpus);
>> -       up_read(&policy->rwsem);
>> -
>> -       if (cpu != policy->cpu) {
>> -               sysfs_remove_link(&dev->kobj, "cpufreq");
>> -       } else if (cpus > 1) {
>> -               new_cpu = cpufreq_nominate_new_policy_cpu(policy, cpu);
>> -               if (new_cpu >= 0) {
>> -                       update_policy_cpu(policy, new_cpu);
>> -
>> -                       if (!cpufreq_suspended)
>> -                               pr_debug("%s: policy Kobject moved to cpu: %d from: %d\n",
>> -                                        __func__, new_cpu, cpu);
>> -               }
>> -       } else if (cpufreq_driver->stop_cpu && cpufreq_driver->setpolicy) {
>> -               cpufreq_driver->stop_cpu(policy);
>> -       }
>> -
>> -       return 0;
>> -}
>> -
>> -static int __cpufreq_remove_dev_finish(struct device *dev,
>> -                                      struct subsys_interface *sif)
>> -{
>> -       unsigned int cpu = dev->id, cpus;
>> -       int ret;
>> -       unsigned long flags;
>> -       struct cpufreq_policy *policy;
>> -
>>          read_lock_irqsave(&cpufreq_driver_lock, flags);
>>          policy = per_cpu(cpufreq_cpu_data, cpu);
>>          read_unlock_irqrestore(&cpufreq_driver_lock, flags);
>> @@ -1410,56 +1265,45 @@ static int __cpufreq_remove_dev_finish(struct device *dev,
>>                  return -EINVAL;
>>          }
>>
>> -       down_write(&policy->rwsem);
>> -       cpus = cpumask_weight(policy->cpus);
>> -
>> -       if (cpus > 1)
>> -               cpumask_clear_cpu(cpu, policy->cpus);
>> -       up_write(&policy->rwsem);
>> -
>> -       /* If cpu is last user of policy, free policy */
>> -       if (cpus == 1) {
>> -               if (has_target()) {
>> -                       ret = __cpufreq_governor(policy,
>> -                                       CPUFREQ_GOV_POLICY_EXIT);
>> -                       if (ret) {
>> -                               pr_err("%s: Failed to exit governor\n",
>> -                                      __func__);
>> -                               return ret;
>> -                       }
>> -               }
>> -
>> -               if (!cpufreq_suspended)
>> -                       cpufreq_policy_put_kobj(policy);
>> +#ifdef CONFIG_HOTPLUG_CPU
>> +       ret = cpufreq_change_policy_cpus(policy, cpu, false);
>> +#endif
>> +       if (ret)
>> +               return ret;
>
> Why is the if block kept outside of #ifdef? And should we really call
> change_*() from inside a #ifdef here?

Yeah, it can be inside.

>
>>
>> -               /*
>> -                * Perform the ->exit() even during light-weight tear-down,
>> -                * since this is a core component, and is essential for the
>> -                * subsequent light-weight ->init() to succeed.
>> -                */
>> -               if (cpufreq_driver->exit)
>> -                       cpufreq_driver->exit(policy);
>> +       if (!sif)
>> +               return 0;
>
> Why? I know that, but we should have comments to describe this ...

Will do.

>
>>
>> -               /* Remove policy from list of active policies */
>> -               write_lock_irqsave(&cpufreq_driver_lock, flags);
>> -               list_del(&policy->policy_list);
>> -               write_unlock_irqrestore(&cpufreq_driver_lock, flags);
>> +       if (!cpumask_empty(policy->cpus)) {
>> +               return 0;
>> +       }
>
> You might still call this attempt a showcase of idea, but I am reviewing it
> at my full capacity.

Oh, at this point this is not an RFC at all. I want it merged. So, 
that's for the thorough review.

> And these small things just break my flow.
>
> - Don't add {} for single liner blocks
> - Add comments with proper comment style
> - Run checkpatch --strict before sending patches.

Will do.

>
>>
>> -               if (!cpufreq_suspended)
>> -                       cpufreq_policy_free(policy);
>> -       } else if (has_target()) {
>> -               ret = __cpufreq_governor(policy, CPUFREQ_GOV_START);
>> -               if (!ret)
>> -                       ret = __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
>> +       cpufreq_dev_symlink(policy, false);
>>
>> +       if (has_target()) {
>> +               ret = __cpufreq_governor(policy,
>> +                               CPUFREQ_GOV_POLICY_EXIT);
>
> Can come in single line
>
>>                  if (ret) {
>> -                       pr_err("%s: Failed to start governor\n", __func__);
>> +                       pr_err("%s: Failed to exit governor\n",
>> +                              __func__);
>
> This too..
>
>>                          return ret;
>>                  }
>>          }
>>
>> -       per_cpu(cpufreq_cpu_data, cpu) = NULL;
>> -       return 0;
>> +       cpufreq_policy_put_kobj(policy);
>> +       if (cpufreq_driver->exit)
>> +               cpufreq_driver->exit(policy);
>> +
>> +       /* Remove policy from list of active policies */
>> +       write_lock_irqsave(&cpufreq_driver_lock, flags);
>> +       for_each_cpu(j, policy->related_cpus)
>> +               per_cpu(cpufreq_cpu_data, j) = NULL;
>> +       list_del(&policy->policy_list);
>> +       write_unlock_irqrestore(&cpufreq_driver_lock, flags);
>> +
>> +       cpufreq_policy_free(policy);
>> +
>> +       return ret;
>>   }
>>
>>   /**
>> @@ -1469,18 +1313,7 @@ static int __cpufreq_remove_dev_finish(struct device *dev,
>>    */
>>   static int cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif)
>>   {
>> -       unsigned int cpu = dev->id;
>> -       int ret;
>> -
>> -       if (cpu_is_offline(cpu))
>> -               return 0;
>
> Why is it part of this commit?

Which part? Just removing the offline check? I should move it to 2/2 (or 
the split ups of it) probably.

>
>> -       ret = __cpufreq_remove_dev_prepare(dev, sif);
>> -
>> -       if (!ret)
>> -               ret = __cpufreq_remove_dev_finish(dev, sif);
>> -
>> -       return ret;
>> +       return __cpufreq_remove_dev(dev, sif);
>>   }
>>
>>   static void handle_update(struct work_struct *work)
>> @@ -2295,19 +2128,12 @@ static int cpufreq_cpu_callback(struct notifier_block *nfb,
>>          if (dev) {
>>                  switch (action & ~CPU_TASKS_FROZEN) {
>>                  case CPU_ONLINE:
>> +               case CPU_DOWN_FAILED:
>
> For example. This change doesn't have anything to do with this patch
> and would have been so easy to review it, if it was kept separate.
>
> Also, this would even require to wait for this complete series to make
> sense and can be merged very early.

Ok

>
>>                          __cpufreq_add_dev(dev, NULL);
>>                          break;
>>
>>                  case CPU_DOWN_PREPARE:
>> -                       __cpufreq_remove_dev_prepare(dev, NULL);
>> -                       break;
>> -
>> -               case CPU_POST_DEAD:
>> -                       __cpufreq_remove_dev_finish(dev, NULL);
>> -                       break;
>> -
>> -               case CPU_DOWN_FAILED:
>> -                       __cpufreq_add_dev(dev, NULL);
>> +                       __cpufreq_remove_dev(dev, NULL);
>
> @Srivatsa: You might want to have a look at this, remove sequence was
> separated for some purpose and I am just not able to concentrate enough
> to think of that, just too many cases running in my mind :)
>
>>                          break;
>>                  }
>>          }
>
> I am still not sure if everything will work as expected as I seriously doubt
> my reviewing capabilities. There might be corner cases which I am still
> missing.
>

-Saravana
Saravana Kannan July 16, 2014, 9:45 p.m. UTC | #12
On 07/16/2014 01:25 PM, Saravana Kannan wrote:
> On 07/16/2014 01:24 AM, Viresh Kumar wrote:
>> On 16 July 2014 04:17, Saravana Kannan <skannan@codeaurora.org> wrote:
>>> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
>>
>>> @@ -1110,9 +1092,10 @@ static int __cpufreq_add_dev(struct device
>>> *dev, struct subsys_interface *sif)
>>>
>>>   #ifdef CONFIG_SMP
>>>          /* check whether a different CPU already registered this
>>> -        * CPU because it is in the same boat. */
>>> +        * CPU because it is one of the related CPUs. */
>>>          policy = cpufreq_cpu_get(cpu);
>>> -       if (unlikely(policy)) {
>>> +       if (policy) {
>>> +               cpufreq_change_policy_cpus(policy, cpu, true);
>>
>> This is just a waste of time at boot as ... (see below)
>
> Why? Please explain.
>

Nevermind. Figured what you meant. I just need to improve the "if" check.

-Saravana
Viresh Kumar July 17, 2014, 5:35 a.m. UTC | #13
On 17 July 2014 01:26, Saravana Kannan <skannan@codeaurora.org> wrote:
> On 07/16/2014 04:16 AM, Srivatsa S. Bhat wrote:

>> That is, we wanted
>> to do the kobject cleanup after releasing the hotplug lock, and POST_DEAD
>> stage was well-suited for that.

I think, this has changed in Saravana's patch, we do it in the PREPARE stage
now.

>> Commit 1aee40ac9c8 (cpufreq: Invoke __cpufreq_remove_dev_finish() after
>> releasing cpu_hotplug.lock) explains this in detail. Saravana, please take
>> a
>> look at that reasoning and ensure that your patch doesn't re-introduce
>> those
>> deadlock possibilities!
>
>
> But all of that was needed _because_ we were creating and destroying
> policies and kobjs all the time. We don't do that anymore. So, I don't think
> any of that applies. We only destroy when the cpufreq driver is
> unregistered. That's kinda of the point of this patchset.
>
> Thoughts?

See above.
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Viresh Kumar July 17, 2014, 5:51 a.m. UTC | #14
On 17 July 2014 01:26, Saravana Kannan <skannan@codeaurora.org> wrote:
> No it's not. All the cpu*/ directories for all possible CPUs will be there
> whether a CPU is online/offline. Which is why I also weed out impossible
> CPUs, but you said the driver shouldn't be passing impossible CPUs anyway.
> I'm just picking one directory to put the real cpufreq directory under. So,
> the code as-is is definitely not broken.

I may be wrong, and that's why I checked it with Srivatsa. He is quite familiar
with hotplug code.

Let me show the example again, its a bit tricky.

I agree with you that sysfs nodes for CPUs stay as is with offline events, but
we aren't talking about that here. On boot when we are trying to bring CPUs
online, some of them may fail to come. And in that case, as confirmed by
Srivatsa, there are no sysfs directories. This doesn't happen normally and
is a very corner case.

Still think we are wrong?

> Sure, I can pick the first cpu that comes online to decide where to put the
> real sysfs cpufreq directory, but then I have to keep track of that in a
> separate field when it's time to remove it when the cpufreq driver is
> unregistered.

It works this way right now and I don't think we maintain any separate field
here. Subsys-interface takes care of the order in which CPUs are added/
removed. And we don't have to handle that here. Just fix policy->cpu
at first cpufreq_add_dev().

> And no, we
> shouldn't be moving sysfs directory to stay with only an online directory.
> That's the thing this patch is trying to simplify.

Ahh, I really missed it in reviews. So, that's why you are looking at first
cpu of related_cpus.. Hmm, so it is quite possible that we would end up
in a case where policy->cpu wouldn't have sysfs directory created for it.

Not sure if that might cause some hickups.

@Srivatsa: ??

> So, I think using the first cpu in related CPUs will always work. If there's
> any disagreement, I think it's purely a personal preference over adding
> another field vs calling cpumask_first()

That's what the problem with this patch was, just too big to miss important
things :)

I now understood why you had these extra cpumask_first() calls.

But having said that, I still don't see a need to change the current behavior.
The important point is kobject and links are added just once, no movement.
And so, I would still like to add it to policy->cpu, i.e. the cpu which comes
first. And this happens only once while we register a driver, so no side
effects probably.

Not every platform is going through hotplug/suspend/resume and keeping
policy->cpu and sysfs node aligned atleast for them might not be that bad.
Though it will work for any cpu.
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Viresh Kumar July 17, 2014, 6:24 a.m. UTC | #15
On 17 July 2014 01:55, Saravana Kannan <skannan@codeaurora.org> wrote:
> On 07/16/2014 01:24 AM, Viresh Kumar wrote:
>> Also, its not the duty of this routine to find which one is the policy cpu
>> as
>> that is done by __cpufreq_add_dev(). And so in case we need to make
>> first cpu of a mask as policy->cpu, it should be done in
>> __cpufreq_add_dev()
>> and not here. This one should just follow the orders :)
>
>
> This is a new function. And that split up might have made sense earlier. But
> not so much anymore since I'm sharing a lot of code between
> __cpufreq_add_dev() and __cpufreq_remove_dev(). There's not reason to stick
> with the previous split of up work if it doesn't apply well anymore.
>
> Please give this a second thought. Maybe it'll make more sense after I split
> this up into smaller patches.

Don't worry I am all open to good suggestions :)

So, this is why I see your idea of first cpu in related_cpus is good:
- cpufreq_dev_symlink() is called while adding/removing links now
- And we must know which CPU owns kobj in the first place, as there
is no symlink there.
- To be guaranteed about that, first-cpu logic makes sense and I can see
why you did it this way.
- If we want to do it myway, i.e. using policy->cpu there is a problem.
policy->cpu may change and we have to keep another field like:
policy->sysfs_cpu to track sysfs master. That's bad..

This is what you wanted to hear, isn't it ? :)

But, as usual I have few concerns:
- As we talked in another thread consider this scenario:
- Dual cluster system, 4 CPUs per cluster. Cluser0: cpu0-3, Cluster1: 4-7.
- CPU 4 failed to come online on boot, but it is still the first cpu of
related_cpus. It can still come online later on if we fix things somehow.
- We CAN'T guarantee that first CPU of related_cpus will have a sysfs
directory for itself, as it may have failed to comeup in the first place..
- What can we do now? Go to next CPU? Maybe yes, but then we *have*
to track policy->sysfs_cpu. Isn't it?

If that's the case, lets track sysfs_cpu and lets make it equal to policy->cpu
instead of the first-cpu logic :)

I know you don't like me much by now :) Just kidding.

>> Keep this please, it might be useful while debugging.
>
>
> Reluctant ok. We don't add/remove these files anymore in a common scenario.
> So, it's not going to be very helpful. I'll also have to do a add ? Add :
> Remove blurb for the print.

May still be useful :). For example:
- In IKS (In kernel switcher: which you may use for you b.L implementation), we
can turn IKS on/off at runtime and the only way it works is by
unregistering/registering
cpufreq driver. And this will be useful there. Also we might want to know what
went wrong while porting a cpufreq driver for a platform initially.

>>> +       dev = get_cpu_device(cpumask_first(policy->related_cpus));
>>> +       if (!dev)
>>> +               return -EINVAL;
>>
>>
>> Again, deciding which cpu is policy->cpu here is wrong. Just follow
>> orders of __cpufreq_add_dev().
>
> But that's not what I'm doing here?

Yeah, I misread that earlier. So take my comment for sysfs-cpu here :)

>> Also pcpu can't be < nr_cpu_ids, right?
>
> This is for the case when all CPUs in a cluster have been taken down. We
> don't want to send the notifier at that point. When the mask is empty, the
> function returns a value >= nr_cpu_ids to indicate an error.

I see, probably you can use cpumask_weight() earlier in the code and
reuse it here instead of checking for cpumask_first() to find if we need to do
something. Confusing ? Look at this routine again in your code and you will
come to know what I refer to. :)

>>> @@ -1053,10 +1057,8 @@ static void cpufreq_policy_put_kobj(struct
>>> cpufreq_policy *policy)
>>>          blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
>>>                          CPUFREQ_REMOVE_POLICY, policy);
>>>
>>> -       down_read(&policy->rwsem);
>>>          kobj = &policy->kobj;
>>>          cmp = &policy->kobj_unregister;
>>> -       up_read(&policy->rwsem);
>>
>>
>> Why? And also, these are unrelated changes and must be added as separate
>> commits.
>
>
> This is because we call this with policy rwsem read lock held and lockdep
> throws a warning. So, it's related to this patch.

I fail to see that in the final code, Can you please enlighten me with line
numbers please?

>>> @@ -1110,9 +1092,10 @@ static int __cpufreq_add_dev(struct device *dev,
>>> +       if (policy) {
>>> +               cpufreq_change_policy_cpus(policy, cpu, true);
>>
>>
>> This is just a waste of time at boot as ... (see below)
>
> Why? Please explain.

As I said, see below :)

>>>          /* related cpus should atleast have policy->cpus */
>>>          cpumask_or(policy->related_cpus, policy->related_cpus,
>>> policy->cpus);
>>
>>
>> policy->cpus is already updated here.

--------------- HERE -------------------------

>>> -       if (!recover_policy) {
>>> -               policy->user_policy.min = policy->min;
>>> -               policy->user_policy.max = policy->max;
>>> -       }
>>
>> Where did these go? There weren't there for fun.
>
> We are keeping the policy intact. So, why would this be needed anymore?

This code would execute on !recover_policy, i.e. when we aren't recovering
policy. Also at boot.. I forgot exact details, please try 'git blame' ..

> Will do. With Srivatsa point about just making sure every patch compiles, it
> should be easy to break it up. But to answer your original question, it's
> again not needed to save/restore since we don't destroy it.

Again, check why it was required with git bisect.



Okay, another thing which I just figured out. You changed something really
really important.

We don't call ->init()/exit() anymore on suspend/resume or when we move
all CPUs out. I am quite sure this will break platforms, and actually those
which Rafael care about most :)

Again. I still feel this patch was a lot over-engineered. I agree that there
are things which we want to solve, but the first thing to solve is not moving
sysfs nodes. Which can be solved with very basic changes.

Get that right first and send patches for that. Nothing else.

You can send out improvements later once we have your really really
important fix in.

Otherwise, you will just make it tougher for this patchset to get merged.

Look at this (I don't have a link yet, but you are cc'd):
[PATCH V1 Resend 0/4] CPUFreq: Bug fixes & cleanups

A perfect example of how to get the fix in first and then improvements.

Everybody have to follow these, even Maintainers.
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Saravana Kannan July 18, 2014, 3:25 a.m. UTC | #16
On 07/16/2014 10:35 PM, Viresh Kumar wrote:
> On 17 July 2014 01:26, Saravana Kannan <skannan@codeaurora.org> wrote:
>> On 07/16/2014 04:16 AM, Srivatsa S. Bhat wrote:
>
>>> That is, we wanted
>>> to do the kobject cleanup after releasing the hotplug lock, and POST_DEAD
>>> stage was well-suited for that.
>
> I think, this has changed in Saravana's patch, we do it in the PREPARE stage
> now.

Not really. We much never do it during hotplug. We only do it when the 
cpufreq driver unregisters.

This should be easier to see in v4, where I'm breaking up the patches 
into easier diffs.

>>> Commit 1aee40ac9c8 (cpufreq: Invoke __cpufreq_remove_dev_finish() after
>>> releasing cpu_hotplug.lock) explains this in detail. Saravana, please take
>>> a
>>> look at that reasoning and ensure that your patch doesn't re-introduce
>>> those
>>> deadlock possibilities!
>>
>>
>> But all of that was needed _because_ we were creating and destroying
>> policies and kobjs all the time. We don't do that anymore. So, I don't think
>> any of that applies. We only destroy when the cpufreq driver is
>> unregistered. That's kinda of the point of this patchset.
>>
>> Thoughts?
>
> See above.
>

-Saravana
Viresh Kumar July 18, 2014, 4:19 a.m. UTC | #17
On 18 July 2014 08:55, Saravana Kannan <skannan@codeaurora.org> wrote:
> Not really. We much never do it during hotplug. We only do it when the
> cpufreq driver unregisters.

Oh yes.
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 62259d2..a0a2ec2 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -37,7 +37,6 @@ 
  */
 static struct cpufreq_driver *cpufreq_driver;
 static DEFINE_PER_CPU(struct cpufreq_policy *, cpufreq_cpu_data);
-static DEFINE_PER_CPU(struct cpufreq_policy *, cpufreq_cpu_data_fallback);
 static DEFINE_RWLOCK(cpufreq_driver_lock);
 DEFINE_MUTEX(cpufreq_governor_lock);
 static LIST_HEAD(cpufreq_policy_list);
@@ -859,34 +858,41 @@  void cpufreq_sysfs_remove_file(const struct attribute *attr)
 }
 EXPORT_SYMBOL(cpufreq_sysfs_remove_file);
 
-/* symlink affected CPUs */
-static int cpufreq_add_dev_symlink(struct cpufreq_policy *policy)
+/* symlink related CPUs */
+static int cpufreq_dev_symlink(struct cpufreq_policy *policy, bool add)
 {
-	unsigned int j;
+	unsigned int j, first_cpu = cpumask_first(policy->related_cpus);
 	int ret = 0;
 
-	for_each_cpu(j, policy->cpus) {
+	for_each_cpu(j, policy->related_cpus) {
 		struct device *cpu_dev;
 
-		if (j == policy->cpu)
+		if (j == first_cpu)
 			continue;
 
-		pr_debug("Adding link for CPU: %u\n", j);
 		cpu_dev = get_cpu_device(j);
-		ret = sysfs_create_link(&cpu_dev->kobj, &policy->kobj,
-					"cpufreq");
+		if (add)
+			ret = sysfs_create_link(&cpu_dev->kobj, &policy->kobj,
+						"cpufreq");
+		else
+			sysfs_remove_link(&cpu_dev->kobj, "cpufreq");
+
 		if (ret)
 			break;
 	}
 	return ret;
 }
 
-static int cpufreq_add_dev_interface(struct cpufreq_policy *policy,
-				     struct device *dev)
+static int cpufreq_add_dev_interface(struct cpufreq_policy *policy)
 {
 	struct freq_attr **drv_attr;
+	struct device *dev;
 	int ret = 0;
 
+	dev = get_cpu_device(cpumask_first(policy->related_cpus));
+	if (!dev)
+		return -EINVAL;
+
 	/* prepare interface data */
 	ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq,
 				   &dev->kobj, "cpufreq");
@@ -917,7 +923,7 @@  static int cpufreq_add_dev_interface(struct cpufreq_policy *policy,
 			goto err_out_kobj_put;
 	}
 
-	ret = cpufreq_add_dev_symlink(policy);
+	ret = cpufreq_dev_symlink(policy, true);
 	if (ret)
 		goto err_out_kobj_put;
 
@@ -961,60 +967,58 @@  static void cpufreq_init_policy(struct cpufreq_policy *policy)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy,
-				  unsigned int cpu, struct device *dev)
+static int cpufreq_change_policy_cpus(struct cpufreq_policy *policy,
+				  unsigned int cpu, bool add)
 {
 	int ret = 0;
-	unsigned long flags;
+	unsigned int cpus, pcpu;
 
-	if (has_target()) {
+	down_write(&policy->rwsem);
+
+	cpus = !cpumask_empty(policy->cpus);
+	if (has_target() && cpus) {
 		ret = __cpufreq_governor(policy, CPUFREQ_GOV_STOP);
 		if (ret) {
 			pr_err("%s: Failed to stop governor\n", __func__);
-			return ret;
+			goto unlock;
 		}
 	}
 
-	down_write(&policy->rwsem);
-
-	write_lock_irqsave(&cpufreq_driver_lock, flags);
-
-	cpumask_set_cpu(cpu, policy->cpus);
-	per_cpu(cpufreq_cpu_data, cpu) = policy;
-	write_unlock_irqrestore(&cpufreq_driver_lock, flags);
+	if (add)
+		cpumask_set_cpu(cpu, policy->cpus);
+	else
+		cpumask_clear_cpu(cpu, policy->cpus);
 
-	up_write(&policy->rwsem);
+	pcpu = cpumask_first(policy->cpus);
+	if (pcpu < nr_cpu_ids && policy->cpu != pcpu) {
+		policy->last_cpu = policy->cpu;
+		policy->cpu = pcpu;
+		blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
+					CPUFREQ_UPDATE_POLICY_CPU, policy);
+	}
 
-	if (has_target()) {
+	cpus = !cpumask_empty(policy->cpus);
+	if (has_target() && cpus) {
 		ret = __cpufreq_governor(policy, CPUFREQ_GOV_START);
 		if (!ret)
 			ret = __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
 
 		if (ret) {
 			pr_err("%s: Failed to start governor\n", __func__);
-			return ret;
+			goto unlock;
 		}
 	}
 
-	return sysfs_create_link(&dev->kobj, &policy->kobj, "cpufreq");
-}
-#endif
-
-static struct cpufreq_policy *cpufreq_policy_restore(unsigned int cpu)
-{
-	struct cpufreq_policy *policy;
-	unsigned long flags;
-
-	read_lock_irqsave(&cpufreq_driver_lock, flags);
-
-	policy = per_cpu(cpufreq_cpu_data_fallback, cpu);
-
-	read_unlock_irqrestore(&cpufreq_driver_lock, flags);
+	if (!cpus && cpufreq_driver->stop_cpu && cpufreq_driver->setpolicy) {
+		cpufreq_driver->stop_cpu(policy);
+	}
 
-	policy->governor = NULL;
+unlock:
+	up_write(&policy->rwsem);
 
-	return policy;
+	return ret;
 }
+#endif
 
 static struct cpufreq_policy *cpufreq_policy_alloc(void)
 {
@@ -1053,10 +1057,8 @@  static void cpufreq_policy_put_kobj(struct cpufreq_policy *policy)
 	blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
 			CPUFREQ_REMOVE_POLICY, policy);
 
-	down_read(&policy->rwsem);
 	kobj = &policy->kobj;
 	cmp = &policy->kobj_unregister;
-	up_read(&policy->rwsem);
 	kobject_put(kobj);
 
 	/*
@@ -1076,32 +1078,12 @@  static void cpufreq_policy_free(struct cpufreq_policy *policy)
 	kfree(policy);
 }
 
-static void update_policy_cpu(struct cpufreq_policy *policy, unsigned int cpu)
-{
-	if (WARN_ON(cpu == policy->cpu))
-		return;
-
-	down_write(&policy->rwsem);
-
-	policy->last_cpu = policy->cpu;
-	policy->cpu = cpu;
-
-	up_write(&policy->rwsem);
-
-	blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
-			CPUFREQ_UPDATE_POLICY_CPU, policy);
-}
-
 static int __cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
 {
 	unsigned int j, cpu = dev->id;
 	int ret = -ENOMEM;
 	struct cpufreq_policy *policy;
 	unsigned long flags;
-	bool recover_policy = cpufreq_suspended;
-#ifdef CONFIG_HOTPLUG_CPU
-	struct cpufreq_policy *tpolicy;
-#endif
 
 	if (cpu_is_offline(cpu))
 		return 0;
@@ -1110,9 +1092,10 @@  static int __cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
 
 #ifdef CONFIG_SMP
 	/* check whether a different CPU already registered this
-	 * CPU because it is in the same boat. */
+	 * CPU because it is one of the related CPUs. */
 	policy = cpufreq_cpu_get(cpu);
-	if (unlikely(policy)) {
+	if (policy) {
+		cpufreq_change_policy_cpus(policy, cpu, true);
 		cpufreq_cpu_put(policy);
 		return 0;
 	}
@@ -1121,45 +1104,14 @@  static int __cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
 	if (!down_read_trylock(&cpufreq_rwsem))
 		return 0;
 
-#ifdef CONFIG_HOTPLUG_CPU
-	/* Check if this cpu was hot-unplugged earlier and has siblings */
-	read_lock_irqsave(&cpufreq_driver_lock, flags);
-	list_for_each_entry(tpolicy, &cpufreq_policy_list, policy_list) {
-		if (cpumask_test_cpu(cpu, tpolicy->related_cpus)) {
-			read_unlock_irqrestore(&cpufreq_driver_lock, flags);
-			ret = cpufreq_add_policy_cpu(tpolicy, cpu, dev);
-			up_read(&cpufreq_rwsem);
-			return ret;
-		}
-	}
-	read_unlock_irqrestore(&cpufreq_driver_lock, flags);
-#endif
-
-	/*
-	 * Restore the saved policy when doing light-weight init and fall back
-	 * to the full init if that fails.
-	 */
-	policy = recover_policy ? cpufreq_policy_restore(cpu) : NULL;
-	if (!policy) {
-		recover_policy = false;
-		policy = cpufreq_policy_alloc();
-		if (!policy)
-			goto nomem_out;
-	}
-
-	/*
-	 * In the resume path, since we restore a saved policy, the assignment
-	 * to policy->cpu is like an update of the existing policy, rather than
-	 * the creation of a brand new one. So we need to perform this update
-	 * by invoking update_policy_cpu().
-	 */
-	if (recover_policy && cpu != policy->cpu)
-		update_policy_cpu(policy, cpu);
-	else
-		policy->cpu = cpu;
+	/* If we get this far, this is the first time we are adding the
+	 * policy */
+	policy = cpufreq_policy_alloc();
+	if (!policy)
+		goto nomem_out;
+	policy->cpu = cpu;
 
 	cpumask_copy(policy->cpus, cpumask_of(cpu));
-
 	init_completion(&policy->kobj_unregister);
 	INIT_WORK(&policy->update, handle_update);
 
@@ -1169,26 +1121,25 @@  static int __cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
 	ret = cpufreq_driver->init(policy);
 	if (ret) {
 		pr_debug("initialization failed\n");
-		goto err_set_policy_cpu;
+		goto err_init;
 	}
 
 	/* related cpus should atleast have policy->cpus */
 	cpumask_or(policy->related_cpus, policy->related_cpus, policy->cpus);
 
+	/* Weed out impossible CPUs. */
+	cpumask_and(policy->related_cpus, policy->related_cpus,
+			cpu_possible_mask);
+
 	/*
 	 * affected cpus must always be the one, which are online. We aren't
 	 * managing offline cpus here.
 	 */
 	cpumask_and(policy->cpus, policy->cpus, cpu_online_mask);
 
-	if (!recover_policy) {
-		policy->user_policy.min = policy->min;
-		policy->user_policy.max = policy->max;
-	}
-
 	down_write(&policy->rwsem);
 	write_lock_irqsave(&cpufreq_driver_lock, flags);
-	for_each_cpu(j, policy->cpus)
+	for_each_cpu(j, policy->related_cpus)
 		per_cpu(cpufreq_cpu_data, j) = policy;
 	write_unlock_irqrestore(&cpufreq_driver_lock, flags);
 
@@ -1243,13 +1194,11 @@  static int __cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
 	blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
 				     CPUFREQ_START, policy);
 
-	if (!recover_policy) {
-		ret = cpufreq_add_dev_interface(policy, dev);
-		if (ret)
-			goto err_out_unregister;
-		blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
-				CPUFREQ_CREATE_POLICY, policy);
-	}
+	ret = cpufreq_add_dev_interface(policy);
+	if (ret)
+		goto err_out_unregister;
+	blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
+			CPUFREQ_CREATE_POLICY, policy);
 
 	write_lock_irqsave(&cpufreq_driver_lock, flags);
 	list_add(&policy->policy_list, &cpufreq_policy_list);
@@ -1257,10 +1206,6 @@  static int __cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
 
 	cpufreq_init_policy(policy);
 
-	if (!recover_policy) {
-		policy->user_policy.policy = policy->policy;
-		policy->user_policy.governor = policy->governor;
-	}
 	up_write(&policy->rwsem);
 
 	kobject_uevent(&policy->kobj, KOBJ_ADD);
@@ -1273,20 +1218,14 @@  static int __cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
 err_out_unregister:
 err_get_freq:
 	write_lock_irqsave(&cpufreq_driver_lock, flags);
-	for_each_cpu(j, policy->cpus)
+	for_each_cpu(j, policy->related_cpus)
 		per_cpu(cpufreq_cpu_data, j) = NULL;
 	write_unlock_irqrestore(&cpufreq_driver_lock, flags);
-
+	up_write(&policy->rwsem);
 	if (cpufreq_driver->exit)
 		cpufreq_driver->exit(policy);
-err_set_policy_cpu:
-	if (recover_policy) {
-		/* Do not leave stale fallback data behind. */
-		per_cpu(cpufreq_cpu_data_fallback, cpu) = NULL;
-		cpufreq_policy_put_kobj(policy);
-	}
+err_init:
 	cpufreq_policy_free(policy);
-
 nomem_out:
 	up_read(&cpufreq_rwsem);
 
@@ -1307,100 +1246,16 @@  static int cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
 	return __cpufreq_add_dev(dev, sif);
 }
 
-static int cpufreq_nominate_new_policy_cpu(struct cpufreq_policy *policy,
-					   unsigned int old_cpu)
-{
-	struct device *cpu_dev;
-	int ret;
-
-	/* first sibling now owns the new sysfs dir */
-	cpu_dev = get_cpu_device(cpumask_any_but(policy->cpus, old_cpu));
-
-	sysfs_remove_link(&cpu_dev->kobj, "cpufreq");
-	ret = kobject_move(&policy->kobj, &cpu_dev->kobj);
-	if (ret) {
-		pr_err("%s: Failed to move kobj: %d\n", __func__, ret);
-
-		down_write(&policy->rwsem);
-		cpumask_set_cpu(old_cpu, policy->cpus);
-		up_write(&policy->rwsem);
-
-		ret = sysfs_create_link(&cpu_dev->kobj, &policy->kobj,
-					"cpufreq");
-
-		return -EINVAL;
-	}
-
-	return cpu_dev->id;
-}
-
-static int __cpufreq_remove_dev_prepare(struct device *dev,
-					struct subsys_interface *sif)
+static int __cpufreq_remove_dev(struct device *dev,
+				struct subsys_interface *sif)
 {
-	unsigned int cpu = dev->id, cpus;
-	int new_cpu, ret;
+	unsigned int cpu = dev->id, j;
+	int ret = 0;
 	unsigned long flags;
 	struct cpufreq_policy *policy;
 
 	pr_debug("%s: unregistering CPU %u\n", __func__, cpu);
 
-	write_lock_irqsave(&cpufreq_driver_lock, flags);
-
-	policy = per_cpu(cpufreq_cpu_data, cpu);
-
-	/* Save the policy somewhere when doing a light-weight tear-down */
-	if (cpufreq_suspended)
-		per_cpu(cpufreq_cpu_data_fallback, cpu) = policy;
-
-	write_unlock_irqrestore(&cpufreq_driver_lock, flags);
-
-	if (!policy) {
-		pr_debug("%s: No cpu_data found\n", __func__);
-		return -EINVAL;
-	}
-
-	if (has_target()) {
-		ret = __cpufreq_governor(policy, CPUFREQ_GOV_STOP);
-		if (ret) {
-			pr_err("%s: Failed to stop governor\n", __func__);
-			return ret;
-		}
-	}
-
-	if (!cpufreq_driver->setpolicy)
-		strncpy(per_cpu(cpufreq_cpu_governor, cpu),
-			policy->governor->name, CPUFREQ_NAME_LEN);
-
-	down_read(&policy->rwsem);
-	cpus = cpumask_weight(policy->cpus);
-	up_read(&policy->rwsem);
-
-	if (cpu != policy->cpu) {
-		sysfs_remove_link(&dev->kobj, "cpufreq");
-	} else if (cpus > 1) {
-		new_cpu = cpufreq_nominate_new_policy_cpu(policy, cpu);
-		if (new_cpu >= 0) {
-			update_policy_cpu(policy, new_cpu);
-
-			if (!cpufreq_suspended)
-				pr_debug("%s: policy Kobject moved to cpu: %d from: %d\n",
-					 __func__, new_cpu, cpu);
-		}
-	} else if (cpufreq_driver->stop_cpu && cpufreq_driver->setpolicy) {
-		cpufreq_driver->stop_cpu(policy);
-	}
-
-	return 0;
-}
-
-static int __cpufreq_remove_dev_finish(struct device *dev,
-				       struct subsys_interface *sif)
-{
-	unsigned int cpu = dev->id, cpus;
-	int ret;
-	unsigned long flags;
-	struct cpufreq_policy *policy;
-
 	read_lock_irqsave(&cpufreq_driver_lock, flags);
 	policy = per_cpu(cpufreq_cpu_data, cpu);
 	read_unlock_irqrestore(&cpufreq_driver_lock, flags);
@@ -1410,56 +1265,45 @@  static int __cpufreq_remove_dev_finish(struct device *dev,
 		return -EINVAL;
 	}
 
-	down_write(&policy->rwsem);
-	cpus = cpumask_weight(policy->cpus);
-
-	if (cpus > 1)
-		cpumask_clear_cpu(cpu, policy->cpus);
-	up_write(&policy->rwsem);
-
-	/* If cpu is last user of policy, free policy */
-	if (cpus == 1) {
-		if (has_target()) {
-			ret = __cpufreq_governor(policy,
-					CPUFREQ_GOV_POLICY_EXIT);
-			if (ret) {
-				pr_err("%s: Failed to exit governor\n",
-				       __func__);
-				return ret;
-			}
-		}
-
-		if (!cpufreq_suspended)
-			cpufreq_policy_put_kobj(policy);
+#ifdef CONFIG_HOTPLUG_CPU
+	ret = cpufreq_change_policy_cpus(policy, cpu, false);
+#endif
+	if (ret)
+		return ret;
 
-		/*
-		 * Perform the ->exit() even during light-weight tear-down,
-		 * since this is a core component, and is essential for the
-		 * subsequent light-weight ->init() to succeed.
-		 */
-		if (cpufreq_driver->exit)
-			cpufreq_driver->exit(policy);
+	if (!sif)
+		return 0;
 
-		/* Remove policy from list of active policies */
-		write_lock_irqsave(&cpufreq_driver_lock, flags);
-		list_del(&policy->policy_list);
-		write_unlock_irqrestore(&cpufreq_driver_lock, flags);
+	if (!cpumask_empty(policy->cpus)) {
+		return 0;
+	}
 
-		if (!cpufreq_suspended)
-			cpufreq_policy_free(policy);
-	} else if (has_target()) {
-		ret = __cpufreq_governor(policy, CPUFREQ_GOV_START);
-		if (!ret)
-			ret = __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
+	cpufreq_dev_symlink(policy, false);
 
+	if (has_target()) {
+		ret = __cpufreq_governor(policy,
+				CPUFREQ_GOV_POLICY_EXIT);
 		if (ret) {
-			pr_err("%s: Failed to start governor\n", __func__);
+			pr_err("%s: Failed to exit governor\n",
+			       __func__);
 			return ret;
 		}
 	}
 
-	per_cpu(cpufreq_cpu_data, cpu) = NULL;
-	return 0;
+	cpufreq_policy_put_kobj(policy);
+	if (cpufreq_driver->exit)
+		cpufreq_driver->exit(policy);
+
+	/* Remove policy from list of active policies */
+	write_lock_irqsave(&cpufreq_driver_lock, flags);
+	for_each_cpu(j, policy->related_cpus)
+		per_cpu(cpufreq_cpu_data, j) = NULL;
+	list_del(&policy->policy_list);
+	write_unlock_irqrestore(&cpufreq_driver_lock, flags);
+
+	cpufreq_policy_free(policy);
+
+	return ret;
 }
 
 /**
@@ -1469,18 +1313,7 @@  static int __cpufreq_remove_dev_finish(struct device *dev,
  */
 static int cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif)
 {
-	unsigned int cpu = dev->id;
-	int ret;
-
-	if (cpu_is_offline(cpu))
-		return 0;
-
-	ret = __cpufreq_remove_dev_prepare(dev, sif);
-
-	if (!ret)
-		ret = __cpufreq_remove_dev_finish(dev, sif);
-
-	return ret;
+	return __cpufreq_remove_dev(dev, sif);
 }
 
 static void handle_update(struct work_struct *work)
@@ -2295,19 +2128,12 @@  static int cpufreq_cpu_callback(struct notifier_block *nfb,
 	if (dev) {
 		switch (action & ~CPU_TASKS_FROZEN) {
 		case CPU_ONLINE:
+		case CPU_DOWN_FAILED:
 			__cpufreq_add_dev(dev, NULL);
 			break;
 
 		case CPU_DOWN_PREPARE:
-			__cpufreq_remove_dev_prepare(dev, NULL);
-			break;
-
-		case CPU_POST_DEAD:
-			__cpufreq_remove_dev_finish(dev, NULL);
-			break;
-
-		case CPU_DOWN_FAILED:
-			__cpufreq_add_dev(dev, NULL);
+			__cpufreq_remove_dev(dev, NULL);
 			break;
 		}
 	}