
[v6,05/16] sched/core: uclamp: Update CPU's refcount on clamp changes

Message ID 20190115101513.2822-6-patrick.bellasi@arm.com (mailing list archive)
State Not Applicable, archived
Series: Add utilization clamping support

Commit Message

Patrick Bellasi Jan. 15, 2019, 10:15 a.m. UTC
Utilization clamp values enforced on a CPU by a task can be updated, for
example via a sched_setattr() syscall, while a task is RUNNABLE on that
CPU. A clamp value change always implies a clamp bucket refcount update
to ensure the new constraints are enforced.

Hook into uclamp_bucket_inc() to trigger a CPU refcount sync-up, via
uclamp_cpu_{inc,dec}_id(), whenever a task is RUNNABLE.

Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>

---
Changes in v6:
 Other:
 - wholesale s/group/bucket/
 - wholesale s/_{get,put}/_{inc,dec}/ to match refcount APIs
 - small documentation updates
---
 kernel/sched/core.c | 48 +++++++++++++++++++++++++++++++++++++++------
 1 file changed, 42 insertions(+), 6 deletions(-)

Comments

Peter Zijlstra Jan. 21, 2019, 3:33 p.m. UTC | #1
On Tue, Jan 15, 2019 at 10:15:02AM +0000, Patrick Bellasi wrote:

> +static inline void
> +uclamp_task_update_active(struct task_struct *p, unsigned int clamp_id)
> +{
> +	struct rq_flags rf;
> +	struct rq *rq;
> +
> +	/*
> +	 * Lock the task and the CPU where the task is (or was) queued.
> +	 *
> +	 * We might lock the (previous) rq of a !RUNNABLE task, but that's the
> +	 * price to pay to safely serialize util_{min,max} updates with
> +	 * enqueues, dequeues and migration operations.
> +	 * This is the same locking schema used by __set_cpus_allowed_ptr().
> +	 */
> +	rq = task_rq_lock(p, &rf);
> +
> +	/*
> +	 * Setting the clamp bucket is serialized by task_rq_lock().
> +	 * If the task is not yet RUNNABLE and its task_struct is not
> +	 * affecting a valid clamp bucket, the next time it's enqueued,
> +	 * it will already see the updated clamp bucket value.
> +	 */
> +	if (!p->uclamp[clamp_id].active)
> +		goto done;
> +
> +	uclamp_cpu_dec_id(p, rq, clamp_id);
> +	uclamp_cpu_inc_id(p, rq, clamp_id);
> +
> +done:
> +	task_rq_unlock(rq, p, &rf);
> +}

> @@ -1008,11 +1043,11 @@ static int __setscheduler_uclamp(struct task_struct *p,
>  
>  	mutex_lock(&uclamp_mutex);
>  	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
> -		uclamp_bucket_inc(&p->uclamp[UCLAMP_MIN],
> +		uclamp_bucket_inc(p, &p->uclamp[UCLAMP_MIN],
>  				  UCLAMP_MIN, lower_bound);
>  	}
>  	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
> -		uclamp_bucket_inc(&p->uclamp[UCLAMP_MAX],
> +		uclamp_bucket_inc(p, &p->uclamp[UCLAMP_MAX],
>  				  UCLAMP_MAX, upper_bound);
>  	}
>  	mutex_unlock(&uclamp_mutex);


But.... __sched_setscheduler() actually does the whole dequeue + enqueue
thing already ?!? See where it does __setscheduler().
Patrick Bellasi Jan. 21, 2019, 3:44 p.m. UTC | #2
On 21-Jan 16:33, Peter Zijlstra wrote:
> On Tue, Jan 15, 2019 at 10:15:02AM +0000, Patrick Bellasi wrote:
> 
> > +static inline void
> > +uclamp_task_update_active(struct task_struct *p, unsigned int clamp_id)
> > +{
> > +	struct rq_flags rf;
> > +	struct rq *rq;
> > +
> > +	/*
> > +	 * Lock the task and the CPU where the task is (or was) queued.
> > +	 *
> > +	 * We might lock the (previous) rq of a !RUNNABLE task, but that's the
> > +	 * price to pay to safely serialize util_{min,max} updates with
> > +	 * enqueues, dequeues and migration operations.
> > +	 * This is the same locking schema used by __set_cpus_allowed_ptr().
> > +	 */
> > +	rq = task_rq_lock(p, &rf);
> > +
> > +	/*
> > +	 * Setting the clamp bucket is serialized by task_rq_lock().
> > +	 * If the task is not yet RUNNABLE and its task_struct is not
> > +	 * affecting a valid clamp bucket, the next time it's enqueued,
> > +	 * it will already see the updated clamp bucket value.
> > +	 */
> > +	if (!p->uclamp[clamp_id].active)
> > +		goto done;
> > +
> > +	uclamp_cpu_dec_id(p, rq, clamp_id);
> > +	uclamp_cpu_inc_id(p, rq, clamp_id);
> > +
> > +done:
> > +	task_rq_unlock(rq, p, &rf);
> > +}
> 
> > @@ -1008,11 +1043,11 @@ static int __setscheduler_uclamp(struct task_struct *p,
> >  
> >  	mutex_lock(&uclamp_mutex);
> >  	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
> > -		uclamp_bucket_inc(&p->uclamp[UCLAMP_MIN],
> > +		uclamp_bucket_inc(p, &p->uclamp[UCLAMP_MIN],
> >  				  UCLAMP_MIN, lower_bound);
> >  	}
> >  	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
> > -		uclamp_bucket_inc(&p->uclamp[UCLAMP_MAX],
> > +		uclamp_bucket_inc(p, &p->uclamp[UCLAMP_MAX],
> >  				  UCLAMP_MAX, upper_bound);
> >  	}
> >  	mutex_unlock(&uclamp_mutex);
> 
> 
> But.... __sched_setscheduler() actually does the whole dequeue + enqueue
> thing already ?!? See where it does __setscheduler().

This is slow-path accounting, not fast path.

There are two kinds of refcounting going on here:

  1) mapped buckets:

     clamp_value <--(M1)--> bucket_id

  2) RUNNABLE tasks:

     bucket_id <--(M2)--> RUNNABLE tasks in a bucket

What we fix here is the refcounting for the buckets mapping. If a task
does not have a task-specific clamp value it does not refcount any
bucket. The moment we assign a task-specific clamp value, we need to
refcount the task in the bucket corresponding to that clamp value.

This will keep the bucket in use for at least as long as the task needs
that clamp value.
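
As a rough illustration of the two mappings (just a user-space model;
names, scales and the value->bucket mapping are simplified and are not
the actual kernel code):

---8<---
/* User-space model of the two refcounts described above. */
#include <stdio.h>

#define NR_BUCKETS 5

/* M1: clamp_value <--> bucket_id, refcounted by how many tasks currently
 * use that bucket for their task-specific clamp value. */
static struct {
	unsigned int value;     /* clamp value this bucket maps */
	unsigned int se_count;  /* tasks refcounting this mapping */
} bucket_map[NR_BUCKETS];

/* M2: bucket_id <--> RUNNABLE tasks, refcounted on a (single, here) CPU. */
static unsigned int cpu_bucket_tasks[NR_BUCKETS];

static unsigned int bucket_inc(unsigned int clamp_value)    /* M1 */
{
	unsigned int bucket_id = clamp_value * NR_BUCKETS / 101; /* toy 0..100 scale */

	bucket_map[bucket_id].value = clamp_value;
	bucket_map[bucket_id].se_count++;
	return bucket_id;
}

static void cpu_inc(unsigned int bucket_id)                 /* M2 */
{
	cpu_bucket_tasks[bucket_id]++;
}

int main(void)
{
	/* sched_setattr(): the task gets a task-specific clamp -> M1 only. */
	unsigned int bucket_id = bucket_inc(80);

	/* enqueue_task(): the task becomes RUNNABLE -> M2. */
	cpu_inc(bucket_id);

	printf("bucket %u: se_count=%u runnable=%u\n", bucket_id,
	       bucket_map[bucket_id].se_count, cpu_bucket_tasks[bucket_id]);
	return 0;
}
---8<---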
Peter Zijlstra Jan. 22, 2019, 9:37 a.m. UTC | #3
On Mon, Jan 21, 2019 at 03:44:12PM +0000, Patrick Bellasi wrote:
> On 21-Jan 16:33, Peter Zijlstra wrote:
> > On Tue, Jan 15, 2019 at 10:15:02AM +0000, Patrick Bellasi wrote:
> > 
> > > +static inline void
> > > +uclamp_task_update_active(struct task_struct *p, unsigned int clamp_id)
> > > +{
> > > +	struct rq_flags rf;
> > > +	struct rq *rq;
> > > +
> > > +	/*
> > > +	 * Lock the task and the CPU where the task is (or was) queued.
> > > +	 *
> > > +	 * We might lock the (previous) rq of a !RUNNABLE task, but that's the
> > > +	 * price to pay to safely serialize util_{min,max} updates with
> > > +	 * enqueues, dequeues and migration operations.
> > > +	 * This is the same locking schema used by __set_cpus_allowed_ptr().
> > > +	 */
> > > +	rq = task_rq_lock(p, &rf);
> > > +
> > > +	/*
> > > +	 * Setting the clamp bucket is serialized by task_rq_lock().
> > > +	 * If the task is not yet RUNNABLE and its task_struct is not
> > > +	 * affecting a valid clamp bucket, the next time it's enqueued,
> > > +	 * it will already see the updated clamp bucket value.
> > > +	 */
> > > +	if (!p->uclamp[clamp_id].active)
> > > +		goto done;
> > > +
> > > +	uclamp_cpu_dec_id(p, rq, clamp_id);
> > > +	uclamp_cpu_inc_id(p, rq, clamp_id);
> > > +
> > > +done:
> > > +	task_rq_unlock(rq, p, &rf);
> > > +}
> > 
> > > @@ -1008,11 +1043,11 @@ static int __setscheduler_uclamp(struct task_struct *p,
> > >  
> > >  	mutex_lock(&uclamp_mutex);
> > >  	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
> > > -		uclamp_bucket_inc(&p->uclamp[UCLAMP_MIN],
> > > +		uclamp_bucket_inc(p, &p->uclamp[UCLAMP_MIN],
> > >  				  UCLAMP_MIN, lower_bound);
> > >  	}
> > >  	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
> > > -		uclamp_bucket_inc(&p->uclamp[UCLAMP_MAX],
> > > +		uclamp_bucket_inc(p, &p->uclamp[UCLAMP_MAX],
> > >  				  UCLAMP_MAX, upper_bound);
> > >  	}
> > >  	mutex_unlock(&uclamp_mutex);
> > 
> > 
> > But.... __sched_setscheduler() actually does the whole dequeue + enqueue
> > thing already ?!? See where it does __setscheduler().
> 
> This is slow-path accounting, not fast path.

Sure; but that's still no reason for duplicate or unneeded code.

> There are two refcounting going on here:
> 
>   1) mapped buckets:
> 
>      clamp_value <--(M1)--> bucket_id
> 
>   2) RUNNABLE tasks:
> 
>      bucket_id <--(M2)--> RUNNABLE tasks in a bucket
> 
> What we fix here is the refcounting for the buckets mapping. If a task
> does not have a task specific clamp value it does not refcount any
> bucket. The moment we assign a task specific clamp value, we need to
> refcount the task in the bucket corresponding to that clamp value.
> 
> This will keep the bucket in use at least as long as the task will
> need that clamp value.

Sure, I get that. What I don't get is why you're adding that (2) here.
Like said, __sched_setscheduler() already does a dequeue/enqueue under
rq->lock, which should already take care of that.
Patrick Bellasi Jan. 22, 2019, 10:43 a.m. UTC | #4
On 22-Jan 10:37, Peter Zijlstra wrote:
> On Mon, Jan 21, 2019 at 03:44:12PM +0000, Patrick Bellasi wrote:
> > On 21-Jan 16:33, Peter Zijlstra wrote:
> > > On Tue, Jan 15, 2019 at 10:15:02AM +0000, Patrick Bellasi wrote:
> > > 
> > > > +static inline void
> > > > +uclamp_task_update_active(struct task_struct *p, unsigned int clamp_id)
> > > > +{
> > > > +	struct rq_flags rf;
> > > > +	struct rq *rq;
> > > > +
> > > > +	/*
> > > > +	 * Lock the task and the CPU where the task is (or was) queued.
> > > > +	 *
> > > > +	 * We might lock the (previous) rq of a !RUNNABLE task, but that's the
> > > > +	 * price to pay to safely serialize util_{min,max} updates with
> > > > +	 * enqueues, dequeues and migration operations.
> > > > +	 * This is the same locking schema used by __set_cpus_allowed_ptr().
> > > > +	 */
> > > > +	rq = task_rq_lock(p, &rf);
> > > > +
> > > > +	/*
> > > > +	 * Setting the clamp bucket is serialized by task_rq_lock().
> > > > +	 * If the task is not yet RUNNABLE and its task_struct is not
> > > > +	 * affecting a valid clamp bucket, the next time it's enqueued,
> > > > +	 * it will already see the updated clamp bucket value.
> > > > +	 */
> > > > +	if (!p->uclamp[clamp_id].active)
> > > > +		goto done;
> > > > +
> > > > +	uclamp_cpu_dec_id(p, rq, clamp_id);
> > > > +	uclamp_cpu_inc_id(p, rq, clamp_id);
> > > > +
> > > > +done:
> > > > +	task_rq_unlock(rq, p, &rf);
> > > > +}
> > > 
> > > > @@ -1008,11 +1043,11 @@ static int __setscheduler_uclamp(struct task_struct *p,
> > > >  
> > > >  	mutex_lock(&uclamp_mutex);
> > > >  	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
> > > > -		uclamp_bucket_inc(&p->uclamp[UCLAMP_MIN],
> > > > +		uclamp_bucket_inc(p, &p->uclamp[UCLAMP_MIN],
> > > >  				  UCLAMP_MIN, lower_bound);
> > > >  	}
> > > >  	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
> > > > -		uclamp_bucket_inc(&p->uclamp[UCLAMP_MAX],
> > > > +		uclamp_bucket_inc(p, &p->uclamp[UCLAMP_MAX],
> > > >  				  UCLAMP_MAX, upper_bound);
> > > >  	}
> > > >  	mutex_unlock(&uclamp_mutex);
> > > 
> > > 
> > > But.... __sched_setscheduler() actually does the whole dequeue + enqueue
> > > thing already ?!? See where it does __setscheduler().
> > 
> > This is slow-path accounting, not fast path.
> 
> Sure; but that's still no reason for duplicate or unneeded code.
> 
> > There are two refcounting going on here:
> > 
> >   1) mapped buckets:
> > 
> >      clamp_value <--(M1)--> bucket_id
> > 
> >   2) RUNNABLE tasks:
> > 
> >      bucket_id <--(M2)--> RUNNABLE tasks in a bucket
> > 
> > What we fix here is the refcounting for the buckets mapping. If a task
> > does not have a task specific clamp value it does not refcount any
> > bucket. The moment we assign a task specific clamp value, we need to
> > refcount the task in the bucket corresponding to that clamp value.
> > 
> > This will keep the bucket in use at least as long as the task will
> > need that clamp value.
> 
> Sure, I get that. What I don't get is why you're adding that (2) here.
> Like said, __sched_setscheduler() already does a dequeue/enqueue under
> rq->lock, which should already take care of that.

Oh, ok... now I get what you mean.

With:

   [PATCH v6 01/16] sched/core: Allow sched_setattr() to use the current policy
   <20190115101513.2822-2-patrick.bellasi@arm.com>

we can call __sched_setscheduler() with:

   attr->sched_flags & SCHED_FLAG_KEEP_POLICY

whenever we want just to change the clamp values of a task without
changing its class. Thus, we can end up returning from
__sched_setscheduler() without doing an actual dequeue/enqueue.

This is likely the most common use-case.

I'd better check if I can propagate this info and avoid M2 if we
actually did a dequeue/enqueue.

Cheers Patrick
Peter Zijlstra Jan. 22, 2019, 1:28 p.m. UTC | #5
On Tue, Jan 22, 2019 at 10:43:05AM +0000, Patrick Bellasi wrote:
> On 22-Jan 10:37, Peter Zijlstra wrote:

> > Sure, I get that. What I don't get is why you're adding that (2) here.
> > Like said, __sched_setscheduler() already does a dequeue/enqueue under
> > rq->lock, which should already take care of that.
> 
> Oh, ok... got it what you mean now.
> 
> With:
> 
>    [PATCH v6 01/16] sched/core: Allow sched_setattr() to use the current policy
>    <20190115101513.2822-2-patrick.bellasi@arm.com>
> 
> we can call __sched_setscheduler() with:
> 
>    attr->sched_flags & SCHED_FLAG_KEEP_POLICY
> 
> whenever we want just to change the clamp values of a task without
> changing its class. Thus, we can end up returning from
> __sched_setscheduler() without doing an actual dequeue/enqueue.

I don't see that happening.. when KEEP_POLICY we set attr.sched_policy =
SETPARAM_POLICY. That is then checked again in __setscheduler_param(),
which is in the middle of that dequeue/enqueue.

Also, and this might be 'broken', SETPARAM_POLICY _does_ reset all the
other attributes, it only preserves policy, but it will (re)set nice
level for example (see that same function).

So maybe we want to introduce another (few?) FLAG_KEEP flag(s) that
preserve the other bits; I'm thinking at least KEEP_PARAM and KEEP_UTIL
or something.
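
A toy model of the effect described above, i.e. the policy is preserved
but the other parameters are still overwritten (this is not the kernel
code; the SETPARAM_POLICY value is only assumed to match the internal
sentinel):

---8<---
/* Toy model of the SETPARAM_POLICY behaviour described above. */
#include <stdio.h>

#define SETPARAM_POLICY  (-1)   /* sentinel; -1 assumed from the kernel */

struct fake_task { int policy; int nice; };
struct fake_attr { int sched_policy; int sched_nice; };

static void setscheduler_params(struct fake_task *p, const struct fake_attr *a)
{
	int policy = a->sched_policy;

	if (policy == SETPARAM_POLICY)
		policy = p->policy;     /* keep the current policy... */

	p->policy = policy;
	p->nice = a->sched_nice;        /* ...but nice is (re)set anyway */
}

int main(void)
{
	struct fake_task p = { .policy = 0 /* SCHED_NORMAL */, .nice = -10 };
	struct fake_attr a = { .sched_policy = SETPARAM_POLICY, .sched_nice = 0 };

	setscheduler_params(&p, &a);
	printf("policy=%d nice=%d\n", p.policy, p.nice); /* nice lost its -10 */
	return 0;
}
---8<---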
Patrick Bellasi Jan. 22, 2019, 2:01 p.m. UTC | #6
On 22-Jan 14:28, Peter Zijlstra wrote:
> On Tue, Jan 22, 2019 at 10:43:05AM +0000, Patrick Bellasi wrote:
> > On 22-Jan 10:37, Peter Zijlstra wrote:
> 
> > > Sure, I get that. What I don't get is why you're adding that (2) here.
> > > Like said, __sched_setscheduler() already does a dequeue/enqueue under
> > > rq->lock, which should already take care of that.
> > 
> > Oh, ok... got it what you mean now.
> > 
> > With:
> > 
> >    [PATCH v6 01/16] sched/core: Allow sched_setattr() to use the current policy
> >    <20190115101513.2822-2-patrick.bellasi@arm.com>
> > 
> > we can call __sched_setscheduler() with:
> > 
> >    attr->sched_flags & SCHED_FLAG_KEEP_POLICY
> > 
> > whenever we want just to change the clamp values of a task without
> > changing its class. Thus, we can end up returning from
> > __sched_setscheduler() without doing an actual dequeue/enqueue.
> 
> I don't see that happening.. when KEEP_POLICY we set attr.sched_policy =
> SETPARAM_POLICY. That is then checked again in __setscheduler_param(),
> which is in the middle of that dequeue/enqueue.

Yes, I think I've forgotten a check before we actually dequeue the task.

The current code does:

---8<---
   SYSCALL_DEFINE3(sched_setattr)

        // A) request to keep the same policy
        if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
            attr.sched_policy = SETPARAM_POLICY;

        sched_setattr()
            // B) actually enforce the same policy
            if (policy < 0)
                policy = oldpolicy = p->policy;

            // C) tune the clamp values
            if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
                retval = __setscheduler_uclamp(p, attr);

            // D) tune attributes if policy is the same
            if (unlikely(policy == p->policy))
                if (fair_policy(policy) && attr->sched_nice != task_nice(p))
                    goto change;
                if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
                    goto change;
                if (dl_policy(policy) && dl_param_changed(p, attr))
                    goto change;
                return 0;
        change:

            // E) dequeue/enqueue task
---8<---

So, probably in D) I've missed a check on SCHED_FLAG_KEEP_POLICY to
enforce a return in that case...

> Also, and this might be 'broken', SETPARAM_POLICY _does_ reset all the
> other attributes, it only preserves policy, but it will (re)set nice
> level for example (see that same function).

Mmm... right... my bad! :/

> So maybe we want to introduce another (few?) FLAG_KEEP flag(s) that
> preserve the other bits; I'm thinking at least KEEP_PARAM and KEEP_UTIL
> or something.

Yes, I would say we have two options:

 1) SCHED_FLAG_KEEP_POLICY keeps all the scheduling-class-specific
    attributes, but not the cross-class attributes (e.g. uclamp)

 2) add SCHED_KEEP_NICE, SCHED_KEEP_PRIO, and SCHED_KEEP_PARAMS
    and use them in the if conditions in D) (roughly sketched below)

In both cases the goal should be to return from code block D).

What do you prefer?
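
For reference, a minimal user-space model of option 2); all the
SCHED_FLAG_KEEP_* names and values below are hypothetical, they are not
existing UAPI flags:

---8<---
/* User-space model of option 2): per-attribute KEEP flags consulted by
 * the "block D" checks. All names and values here are hypothetical. */
#include <stdbool.h>
#include <stdio.h>

#define SCHED_FLAG_KEEP_NICE    0x10    /* hypothetical */
#define SCHED_FLAG_KEEP_PRIO    0x20    /* hypothetical */
#define SCHED_FLAG_UTIL_CLAMP   0x40    /* hypothetical value */

struct fake_attr { unsigned long sched_flags; int sched_nice; unsigned int sched_priority; };
struct fake_task { int nice; unsigned int rt_priority; };

/* Only take the dequeue/enqueue path when something not KEEP'd changes. */
static bool needs_change_path(const struct fake_task *p, const struct fake_attr *a)
{
	if (!(a->sched_flags & SCHED_FLAG_KEEP_NICE) && a->sched_nice != p->nice)
		return true;
	if (!(a->sched_flags & SCHED_FLAG_KEEP_PRIO) && a->sched_priority != p->rt_priority)
		return true;
	/* Whether a uclamp-only change should force this path is exactly
	 * the open question in this thread. */
	if (a->sched_flags & SCHED_FLAG_UTIL_CLAMP)
		return true;
	return false;
}

int main(void)
{
	struct fake_task p = { .nice = 0, .rt_priority = 0 };
	struct fake_attr a = { .sched_flags = SCHED_FLAG_KEEP_NICE |
					      SCHED_FLAG_KEEP_PRIO |
					      SCHED_FLAG_UTIL_CLAMP };

	printf("change path needed: %d\n", needs_change_path(&p, &a));
	return 0;
}
---8<---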
Peter Zijlstra Jan. 22, 2019, 2:57 p.m. UTC | #7
On Tue, Jan 22, 2019 at 02:01:15PM +0000, Patrick Bellasi wrote:
> On 22-Jan 14:28, Peter Zijlstra wrote:
> > On Tue, Jan 22, 2019 at 10:43:05AM +0000, Patrick Bellasi wrote:
> > > On 22-Jan 10:37, Peter Zijlstra wrote:
> > 
> > > > Sure, I get that. What I don't get is why you're adding that (2) here.
> > > > Like said, __sched_setscheduler() already does a dequeue/enqueue under
> > > > rq->lock, which should already take care of that.
> > > 
> > > Oh, ok... got it what you mean now.
> > > 
> > > With:
> > > 
> > >    [PATCH v6 01/16] sched/core: Allow sched_setattr() to use the current policy
> > >    <20190115101513.2822-2-patrick.bellasi@arm.com>
> > > 
> > > we can call __sched_setscheduler() with:
> > > 
> > >    attr->sched_flags & SCHED_FLAG_KEEP_POLICY
> > > 
> > > whenever we want just to change the clamp values of a task without
> > > changing its class. Thus, we can end up returning from
> > > __sched_setscheduler() without doing an actual dequeue/enqueue.
> > 
> > I don't see that happening.. when KEEP_POLICY we set attr.sched_policy =
> > SETPARAM_POLICY. That is then checked again in __setscheduler_param(),
> > which is in the middle of that dequeue/enqueue.
> 
> Yes, I think I've forgot a check before we actually dequeue the task.
> 
> The current code does:
> 
> ---8<---
>    SYSCALL_DEFINE3(sched_setattr)
> 
>         // A) request to keep the same policy
>         if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
>             attr.sched_policy = SETPARAM_POLICY;
> 
>         sched_setattr()
>             // B) actually enforce the same policy
>             if (policy < 0)
>                 policy = oldpolicy = p->policy;
> 
>             // C) tune the clamp values
>             if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
>                 retval = __setscheduler_uclamp(p, attr);
> 
>             // D) tune attributes if policy is the same
>             if (unlikely(policy == p->policy))
>                 if (fair_policy(policy) && attr->sched_nice != task_nice(p))
>                     goto change;
>                 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
>                     goto change;
>                 if (dl_policy(policy) && dl_param_changed(p, attr))
>                     goto change;

		  if (util_changed)
		      goto change;

?

>                 return 0;
>         change:
> 
>             // E) dequeue/enqueue task
> ---8<---
> 
> So, probably in D) I've missed a check on SCHED_FLAG_KEEP_POLICY to
> enforce a return in that case...
> 
> > Also, and this might be 'broken', SETPARAM_POLICY _does_ reset all the
> > other attributes, it only preserves policy, but it will (re)set nice
> > level for example (see that same function).
> 
> Mmm... right... my bad! :/
> 
> > So maybe we want to introduce another (few?) FLAG_KEEP flag(s) that
> > preserve the other bits; I'm thinking at least KEEP_PARAM and KEEP_UTIL
> > or something.
> 
> Yes, I would say we have two options:
> 
>  1) SCHED_FLAG_KEEP_POLICY enforces all the scheduling class specific
>     attributes, but cross class attributes (e.g. uclamp)
>
>  2) add SCHED_KEEP_NICE, SCHED_KEEP_PRIO, and SCED_KEEP_PARAMS
>     and use them in the if conditions in D)

So the current KEEP_POLICY basically provides sched_setparam(), and
given we have that as a syscall, that is supposedly a useful
functionality.

Also, NICE/PRIO/DL* are all the same thing and depend on the policy;
KEEP_PARAM should cover the lot.

And I suppose the UTIL_CLAMP is !KEEP_UTIL; we could go either way
around with that flag.

> In both cases the goal should be to return from code block D).

I don't think so; we really do want to 'goto change' for util changes
too I think. Why duplicate part of that logic?
Patrick Bellasi Jan. 22, 2019, 3:33 p.m. UTC | #8
On 22-Jan 15:57, Peter Zijlstra wrote:
> On Tue, Jan 22, 2019 at 02:01:15PM +0000, Patrick Bellasi wrote:
> > On 22-Jan 14:28, Peter Zijlstra wrote:
> > > On Tue, Jan 22, 2019 at 10:43:05AM +0000, Patrick Bellasi wrote:
> > > > On 22-Jan 10:37, Peter Zijlstra wrote:
> > > 
> > > > > Sure, I get that. What I don't get is why you're adding that (2) here.
> > > > > Like said, __sched_setscheduler() already does a dequeue/enqueue under
> > > > > rq->lock, which should already take care of that.
> > > > 
> > > > Oh, ok... got it what you mean now.
> > > > 
> > > > With:
> > > > 
> > > >    [PATCH v6 01/16] sched/core: Allow sched_setattr() to use the current policy
> > > >    <20190115101513.2822-2-patrick.bellasi@arm.com>
> > > > 
> > > > we can call __sched_setscheduler() with:
> > > > 
> > > >    attr->sched_flags & SCHED_FLAG_KEEP_POLICY
> > > > 
> > > > whenever we want just to change the clamp values of a task without
> > > > changing its class. Thus, we can end up returning from
> > > > __sched_setscheduler() without doing an actual dequeue/enqueue.
> > > 
> > > I don't see that happening.. when KEEP_POLICY we set attr.sched_policy =
> > > SETPARAM_POLICY. That is then checked again in __setscheduler_param(),
> > > which is in the middle of that dequeue/enqueue.
> > 
> > Yes, I think I've forgot a check before we actually dequeue the task.
> > 
> > The current code does:
> > 
> > ---8<---
> >    SYSCALL_DEFINE3(sched_setattr)
> > 
> >         // A) request to keep the same policy
> >         if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
> >             attr.sched_policy = SETPARAM_POLICY;
> > 
> >         sched_setattr()
> >             // B) actually enforce the same policy
> >             if (policy < 0)
> >                 policy = oldpolicy = p->policy;
> > 
> >             // C) tune the clamp values
> >             if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
> >                 retval = __setscheduler_uclamp(p, attr);
> > 
> >             // D) tune attributes if policy is the same
> >             if (unlikely(policy == p->policy))
> >                 if (fair_policy(policy) && attr->sched_nice != task_nice(p))
> >                     goto change;
> >                 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
> >                     goto change;
> >                 if (dl_policy(policy) && dl_param_changed(p, attr))
> >                     goto change;
> 
> 		  if (util_changed)
> 		      goto change;
> 
> ?
> 
> >                 return 0;
> >         change:
> > 
> >             // E) dequeue/enqueue task
> > ---8<---
> > 
> > So, probably in D) I've missed a check on SCHED_FLAG_KEEP_POLICY to
> > enforce a return in that case...
> > 
> > > Also, and this might be 'broken', SETPARAM_POLICY _does_ reset all the
> > > other attributes, it only preserves policy, but it will (re)set nice
> > > level for example (see that same function).
> > 
> > Mmm... right... my bad! :/
> > 
> > > So maybe we want to introduce another (few?) FLAG_KEEP flag(s) that
> > > preserve the other bits; I'm thinking at least KEEP_PARAM and KEEP_UTIL
> > > or something.
> > 
> > Yes, I would say we have two options:
> > 
> >  1) SCHED_FLAG_KEEP_POLICY enforces all the scheduling class specific
> >     attributes, but cross class attributes (e.g. uclamp)
> >
> >  2) add SCHED_KEEP_NICE, SCHED_KEEP_PRIO, and SCED_KEEP_PARAMS
> >     and use them in the if conditions in D)
> 
> So the current KEEP_POLICY basically provides sched_setparam(), and

But it's not exposed to user-space.

> given we have that as a syscall, that is supposedly a useful
> functionality.

For uclamp it's definitely useful to change clamps without having to
read the current policy params beforehand and use them in a following
set syscall... which is a racy pattern.

> Also, NICE/PRIO/DL* is all the same thing and depends on the policy,
> KEEP_PARAM should cover the lot

Right, that makes sense.

> And I suppose the UTIL_CLAMP is !KEEP_UTIL; we could go either way
> around with that flag.

What about getting rid of the racy case above by exposing to userspace
only the new UTIL_CLAMP and, on:

  sched_setscheduler(flags: UTIL_CLAMP)

we enforce the other two flags from the syscall:

---8<---
        SYSCALL_DEFINE3(sched_setattr)
            if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) {
                attr.sched_policy = SETPARAM_POLICY;
                attr.sched_flags |= (KEEP_POLICY|KEEP_PARAMS);
            }
---8<---

This will not make it possible to change class and set flags in one go,
but honestly that's likely a very limited use-case, isn't it?

> > In both cases the goal should be to return from code block D).
> 
> I don't think so; we really do want to 'goto change' for util changes
> too I think. Why duplicate part of that logic?

But that will force a dequeue/enqueue... isn't too much overhead just
to change a clamp value? Perhaps we can also end up with some wired
side-effects like the task being preempted ?

Consider also that the uclamp_task_update_active() added by this patch
not only has lower overhead but it will also be used by cgroups, where
we want to force update all the tasks on a cgroup's clamp change.
Peter Zijlstra Jan. 23, 2019, 9:16 a.m. UTC | #9
On Tue, Jan 22, 2019 at 03:33:15PM +0000, Patrick Bellasi wrote:
> On 22-Jan 15:57, Peter Zijlstra wrote:
> > On Tue, Jan 22, 2019 at 02:01:15PM +0000, Patrick Bellasi wrote:

> > > Yes, I would say we have two options:
> > > 
> > >  1) SCHED_FLAG_KEEP_POLICY enforces all the scheduling class specific
> > >     attributes, but cross class attributes (e.g. uclamp)
> > >
> > >  2) add SCHED_KEEP_NICE, SCHED_KEEP_PRIO, and SCED_KEEP_PARAMS
> > >     and use them in the if conditions in D)
> > 
> > So the current KEEP_POLICY basically provides sched_setparam(), and
> 
> But it's not exposed user-space.

Correct; not until your first patch indeed.

> > given we have that as a syscall, that is supposedly a useful
> > functionality.
> 
> For uclamp is definitively useful to change clamps without the need to
> read beforehand the current policy params and use them in a following
> set syscall... which is racy pattern.

Right; but my argument was mostly that if sched_setparam() is a useful
interface, a 'pure' KEEP_POLICY would be too, and your (1) loses that.

> > And I suppose the UTIL_CLAMP is !KEEP_UTIL; we could go either way
> > around with that flag.
> 
> What about getting rid of the racy case above by exposing userspace
> only the new UTIL_CLAMP and, on:
> 
>   sched_setscheduler(flags: UTIL_CLAMP)
> 
> we enforce the other two flags from the syscall:
> 
> ---8<---
>         SYSCALL_DEFINE3(sched_setattr)
>             if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) {
>                 attr.sched_policy = SETPARAM_POLICY;
>                 attr.sched_flags |= (KEEP_POLICY|KEEP_PARAMS);
>             }
> ---8<---
> 
> This will not make possible to change class and set flags in one go,
> but honestly that's likely a very limited use-case, isn't it ?

So I must admit to not seeing much use for sched_setparam() (and its
equivalents) myself, but given it is an existing interface, I also think
it would be nice to cover that functionality in the sched_setattr()
call.

That is; I know of userspace priority-ceiling implementations using
sched_setparam(), but I don't see any reason why that wouldn't also work
with sched_setscheduler() (IOW always also set the policy).

> > > In both cases the goal should be to return from code block D).
> > 
> > I don't think so; we really do want to 'goto change' for util changes
> > too I think. Why duplicate part of that logic?
> 
> But that will force a dequeue/enqueue... isn't too much overhead just
> to change a clamp value?

These syscalls aren't what I consider fast paths anyway. However, there
are people that rely on the scheduler syscalls not to schedule
themselves, or rather be non-blocking (see for example that prio-ceiling
implementation).

And in that respect the newly introduced uclamp_mutex does appear to be
a problem.

Also; do you expect these clamp values to be changed often?

> Perhaps we can also end up with some wired

s/wired/weird/, right?

> side-effects like the task being preempted ?

Nothing worse than any other random reschedule would cause.

> Consider also that the uclamp_task_update_active() added by this patch
> not only has lower overhead but it will be use also by cgroups where
> we want to force update all the tasks on a cgroup's clamp change.

I haven't gotten that far; but I would prefer not to have two different
'change' paths in __sched_setscheduler().
Patrick Bellasi Jan. 23, 2019, 2:14 p.m. UTC | #10
On 23-Jan 10:16, Peter Zijlstra wrote:
> On Tue, Jan 22, 2019 at 03:33:15PM +0000, Patrick Bellasi wrote:
> > On 22-Jan 15:57, Peter Zijlstra wrote:
> > > On Tue, Jan 22, 2019 at 02:01:15PM +0000, Patrick Bellasi wrote:
> 
> > > > Yes, I would say we have two options:
> > > > 
> > > >  1) SCHED_FLAG_KEEP_POLICY enforces all the scheduling class specific
> > > >     attributes, but cross class attributes (e.g. uclamp)
> > > >
> > > >  2) add SCHED_KEEP_NICE, SCHED_KEEP_PRIO, and SCED_KEEP_PARAMS
> > > >     and use them in the if conditions in D)
> > > 
> > > So the current KEEP_POLICY basically provides sched_setparam(), and
> > 
> > But it's not exposed user-space.
> 
> Correct; not until your first patch indeed.
> 
> > > given we have that as a syscall, that is supposedly a useful
> > > functionality.
> > 
> > For uclamp is definitively useful to change clamps without the need to
> > read beforehand the current policy params and use them in a following
> > set syscall... which is racy pattern.
> 
> Right; but my argument was mostly that if sched_setparam() is a useful
> interface, a 'pure' KEEP_POLICY would be too and your (1) looses that.

Ok, that's an argument in favour of option (2).

> > > And I suppose the UTIL_CLAMP is !KEEP_UTIL; we could go either way
> > > around with that flag.
> > 
> > What about getting rid of the racy case above by exposing userspace
> > only the new UTIL_CLAMP and, on:
> > 
> >   sched_setscheduler(flags: UTIL_CLAMP)
> > 
> > we enforce the other two flags from the syscall:
> > 
> > ---8<---
> >         SYSCALL_DEFINE3(sched_setattr)
> >             if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) {
> >                 attr.sched_policy = SETPARAM_POLICY;
> >                 attr.sched_flags |= (KEEP_POLICY|KEEP_PARAMS);
> >             }
> > ---8<---
> > 
> > This will not make possible to change class and set flags in one go,
> > but honestly that's likely a very limited use-case, isn't it ?
> 
> So I must admit to not seeing much use for sched_setparam() (and its
> equivalents) myself, but given it is an existing interface, I also think
> it would be nice to cover that functionality in the sched_setattr()
> call.

Which will make them sort-of equivalent... meaning: both the POSIX
sched_setparam() and the !POSIX sched_setattr() will allow changing
params/attributes without changing the policy.

> That is; I know of userspace priority-ceiling implementations using
> sched_setparam(), but I don't see any reason why that wouldn't also work
> with sched_setscheduler() (IOW always also set the policy).

sched_setscheduler() requires the policy to be explicitly defined: it's
a mandatory parameter and has to be specified.

Unless an RT task could be blocked by a FAIR one and you need
sched_setscheduler() to boost both prio and class (which looks like a
poor RT design to begin with), why would you use sched_setscheduler()
instead of sched_setparam()?

They are both POSIX calls and, AFAIU, sched_setparam() seems to be
designed exactly for those kinds of use cases.

> > > > In both cases the goal should be to return from code block D).
> > > 
> > > I don't think so; we really do want to 'goto change' for util changes
> > > too I think. Why duplicate part of that logic?
> > 
> > But that will force a dequeue/enqueue... isn't too much overhead just
> > to change a clamp value?
> 
> These syscalls aren't what I consider fast paths anyway. However, there
> are people that rely on the scheduler syscalls not to schedule
> themselves, or rather be non-blocking (see for example that prio-ceiling
> implementation).
> 
> And in that respect the newly introduced uclamp_mutex does appear to be
> a problem.

Mmm... could be... I'll look into it more carefully. It could be that
the mutex is not really required: we don't need to serialize
task-specific clamp changes and, anyway, the protected code never sleeps
and uses atomic instructions.
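
Roughly, the kind of non-sleeping update I mean; this is just a
user-space model using C11 atomics to show the retry-loop pattern, the
kernel code uses its own atomic helpers instead:

---8<---
/* User-space model of a lock-free bucket refcount update. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Pack (value, se_count) in one word so both can move in a single CAS. */
union clamp_map {
	struct { uint32_t value; uint32_t se_count; };
	uint64_t data;
};

static _Atomic uint64_t clamp_bucket;   /* a single bucket, for the model */

static void bucket_inc(uint32_t clamp_value)
{
	union clamp_map old, new;

	old.data = atomic_load(&clamp_bucket);
	do {
		new = old;
		new.value = clamp_value;
		new.se_count++;
		/* on failure, old.data is refreshed with the current value */
	} while (!atomic_compare_exchange_weak(&clamp_bucket, &old.data, new.data));
}

int main(void)
{
	union clamp_map snap;

	bucket_inc(80);
	bucket_inc(80);

	snap.data = atomic_load(&clamp_bucket);
	printf("value=%u se_count=%u\n", (unsigned)snap.value, (unsigned)snap.se_count);
	return 0;
}
---8<---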

> Also; do you expect these clamp values to be changed often?

Not really, the most common use cases are:
 a) a resource manager (e.g. the Android run-time) sets clamps for a
    bunch of tasks whenever you switch, for example, from one app to
    another... but that will be done via cgroups (i.e. a different path)
 b) a task can relax its constraints to save energy (something
    conceptually similar to using deferrable timers)

In both cases I expect a limited call frequency.

> > Perhaps we can also end up with some wired
> 
> s/wired/weird/, right?

Right :)

> > side-effects like the task being preempted ?
> 
> Nothing worse than any other random reschedule would cause.
> 
> > Consider also that the uclamp_task_update_active() added by this patch
> > not only has lower overhead but it will be use also by cgroups where
> > we want to force update all the tasks on a cgroup's clamp change.
> 
> I haven't gotten that far; but I would prefer not to have two different
> 'change' paths in __sched_setscheduler().

Yes, I agree that two paths in __sched_setscheduler() could be
confusing. Still we have to consider that here we are adding
"not class specific" attributes.

What if we keep "not class specific" code completely outside of
__sched_setscheduler() and do something like:

---8<---
    int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
    {
            retval = __sched_setattr(p, attr);
            if (retval)
                return retval;
            return __sched_setscheduler(p, attr, true, true);
    }
    EXPORT_SYMBOL_GPL(sched_setattr);
---8<---

where __sched_setattr() will collect all the tunings which do not
require an enqueue/dequeue, so far only the new uclamp settings, while
the rest remains under __sched_setscheduler().

Thoughts ?
Peter Zijlstra Jan. 23, 2019, 6:59 p.m. UTC | #11
On Wed, Jan 23, 2019 at 02:14:26PM +0000, Patrick Bellasi wrote:

> > > Consider also that the uclamp_task_update_active() added by this patch
> > > not only has lower overhead but it will be use also by cgroups where
> > > we want to force update all the tasks on a cgroup's clamp change.
> > 
> > I haven't gotten that far; but I would prefer not to have two different
> > 'change' paths in __sched_setscheduler().
> 
> Yes, I agree that two paths in __sched_setscheduler() could be
> confusing. Still we have to consider that here we are adding
> "not class specific" attributes.

But that change thing is not class specific; the whole:


	rq = task_rq_lock(p, &rf);
	queued = task_on_rq_queued(p);
	running = task_current(rq, p);
	if (queued)
		dequeue_task(rq, p, queue_flags);
	if (running)
		put_prev_task(rq, p);


	/* @p is in its invariant state; frob its state */


	if (queued)
		enqueue_task(rq, p, queue_flags);
	if (running)
		set_curr_task(rq, p);
	task_rq_unlock(rq, p, &rf);


pattern is all over the place; it is just because C sucks that that
isn't more explicitly shared (do_set_cpus_allowed(), rt_mutex_setprio(),
set_user_nice(), __sched_setscheduler(), sched_setnuma(),
sched_move_task()).

This is _the_ pattern for changing state and is not class specific at
all.
Patrick Bellasi Jan. 24, 2019, 11:21 a.m. UTC | #12
On 23-Jan 19:59, Peter Zijlstra wrote:
> On Wed, Jan 23, 2019 at 02:14:26PM +0000, Patrick Bellasi wrote:
> 
> > > > Consider also that the uclamp_task_update_active() added by this patch
> > > > not only has lower overhead but it will be use also by cgroups where
> > > > we want to force update all the tasks on a cgroup's clamp change.
> > > 
> > > I haven't gotten that far; but I would prefer not to have two different
> > > 'change' paths in __sched_setscheduler().
> > 
> > Yes, I agree that two paths in __sched_setscheduler() could be
> > confusing. Still we have to consider that here we are adding
> > "not class specific" attributes.
> 
> But that change thing is not class specific; the whole:
> 
> 
> 	rq = task_rq_lock(p, &rf);
> 	queued = task_on_rq_queued(p);
> 	running = task_current(rq, p);
> 	if (queued)
> 		dequeue_task(rq, p, queue_flags);
> 	if (running)
> 		put_prev_task(rq, p);
> 
> 
> 	/* @p is in it's invariant state; frob it's state */
> 
> 
> 	if (queued)
> 		enqueue_task(rq, p, queue_flags);
> 	if (running)
> 		set_curr_task(rq, p);
> 	task_rq_unlock(rq, p, &rf);
> 
> 
> pattern is all over the place; it is just because C sucks that that

Yes, understood, I don't want to enter a language war :)

> isn't more explicitly shared (do_set_cpus_allowed(), rt_mutex_setprio(),
> set_user_nice(), __sched_setscheduler(), sched_setnuma(),
> sched_move_task()).
> 
> This is _the_ pattern for changing state and is not class specific at
> all.

Right, that pattern is not "class specific", true, and I should not have
used that term to begin with.

What I was trying to point out is that all the calls above directly
affect the current scheduling decision and "require" a
dequeue/enqueue pattern.

When a task-specific uclamp value is changed for a task, instead, a
dequeue/enqueue is not needed. As long as we are doing a lazy update,
that just sounds like unnecessary overhead.

However, there could still be value in keeping the code consistent, and
if you prefer it this way, what should I do?

---8<---
    __sched_setscheduler()
        ...
        if (policy < 0)
            policy = oldpolicy = p->policy;
        ...
        if (unlikely(policy == p->policy)) {
            ...
            if (uclamp_changed())         // Force dequeue/enqueue
                goto change;
        }
    change:
        ...

        if (queued)
	    dequeue_task(rq, p, queue_flags);
	if (running)
	    put_prev_task(rq, p);

        __setscheduler_uclamp();
	__setscheduler(rq, p, attr, pi);

	if (queued)
	    enqueue_task(rq, p, queue_flags);
	if (running)
	    set_curr_task(rq, p);
        ...
---8<---

Could something like that be ok with you?

Not sure about side-effects on p->prio(): for CFS it seems to be reset
to NORMAL in this case :/
Peter Zijlstra Jan. 24, 2019, 12:38 p.m. UTC | #13
On Thu, Jan 24, 2019 at 11:21:53AM +0000, Patrick Bellasi wrote:

> When a task-specific uclamp value is changed for a task, instead, a
> dequeue/enqueue is not needed. As long as we are doing a lazy update,
> that sounds just like not necessary overhead.

When that overhead is shown to be a problem, is when we'll look at that
:-)

> However, there could still be value in keeping code consistent and if
> you prefer it this way what should I do?
> 
> ---8<---
>     __sched_setscheduler()
>         ...
>         if (policy < 0)
>             policy = oldpolicy = p->policy;
>         ...
>         if (unlikely(policy == p->policy)) {
>             ...
>             if (uclamp_changed())         // Force dequeue/enqueue
>                 goto change;
>         }
>     change:
>         ...
> 
>         if (queued)
> 	    dequeue_task(rq, p, queue_flags);
> 	if (running)
> 	    put_prev_task(rq, p);
> 
>         __setscheduler_uclamp();
> 	__setscheduler(rq, p, attr, pi);
> 
> 	if (queued)
> 	    enqueue_task(rq, p, queue_flags);
> 	if (running)
> 	    set_curr_task(rq, p);
>         ...
> ---8<---
> 
> Could be something like that ok with you?

Yes, that's about what I was expecting.

> Not sure about side-effects on p->prio(): for CFS seems to be reset to
> NORMAL in this case :/

That's what we need KEEP_PARAM for, right?

Patch

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 190137cd7b3b..67f059ee0a05 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -884,6 +884,38 @@  static inline void uclamp_cpu_dec(struct rq *rq, struct task_struct *p)
 		uclamp_cpu_dec_id(p, rq, clamp_id);
 }
 
+static inline void
+uclamp_task_update_active(struct task_struct *p, unsigned int clamp_id)
+{
+	struct rq_flags rf;
+	struct rq *rq;
+
+	/*
+	 * Lock the task and the CPU where the task is (or was) queued.
+	 *
+	 * We might lock the (previous) rq of a !RUNNABLE task, but that's the
+	 * price to pay to safely serialize util_{min,max} updates with
+	 * enqueues, dequeues and migration operations.
+	 * This is the same locking schema used by __set_cpus_allowed_ptr().
+	 */
+	rq = task_rq_lock(p, &rf);
+
+	/*
+	 * Setting the clamp bucket is serialized by task_rq_lock().
+	 * If the task is not yet RUNNABLE and its task_struct is not
+	 * affecting a valid clamp bucket, the next time it's enqueued,
+	 * it will already see the updated clamp bucket value.
+	 */
+	if (!p->uclamp[clamp_id].active)
+		goto done;
+
+	uclamp_cpu_dec_id(p, rq, clamp_id);
+	uclamp_cpu_inc_id(p, rq, clamp_id);
+
+done:
+	task_rq_unlock(rq, p, &rf);
+}
+
 static void uclamp_bucket_dec(unsigned int clamp_id, unsigned int bucket_id)
 {
 	union uclamp_map *uc_maps = &uclamp_maps[clamp_id][0];
@@ -907,8 +939,8 @@  static void uclamp_bucket_dec(unsigned int clamp_id, unsigned int bucket_id)
 					  &uc_map_old.data, uc_map_new.data));
 }
 
-static void uclamp_bucket_inc(struct uclamp_se *uc_se, unsigned int clamp_id,
-			      unsigned int clamp_value)
+static void uclamp_bucket_inc(struct task_struct *p, struct uclamp_se *uc_se,
+			      unsigned int clamp_id, unsigned int clamp_value)
 {
 	union uclamp_map *uc_maps = &uclamp_maps[clamp_id][0];
 	unsigned int prev_bucket_id = uc_se->bucket_id;
@@ -979,6 +1011,9 @@  static void uclamp_bucket_inc(struct uclamp_se *uc_se, unsigned int clamp_id,
 	uc_se->value = clamp_value;
 	uc_se->bucket_id = bucket_id;
 
+	if (p)
+		uclamp_task_update_active(p, clamp_id);
+
 	if (uc_se->mapped)
 		uclamp_bucket_dec(clamp_id, prev_bucket_id);
 
@@ -1008,11 +1043,11 @@  static int __setscheduler_uclamp(struct task_struct *p,
 
 	mutex_lock(&uclamp_mutex);
 	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
-		uclamp_bucket_inc(&p->uclamp[UCLAMP_MIN],
+		uclamp_bucket_inc(p, &p->uclamp[UCLAMP_MIN],
 				  UCLAMP_MIN, lower_bound);
 	}
 	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
-		uclamp_bucket_inc(&p->uclamp[UCLAMP_MAX],
+		uclamp_bucket_inc(p, &p->uclamp[UCLAMP_MAX],
 				  UCLAMP_MAX, upper_bound);
 	}
 	mutex_unlock(&uclamp_mutex);
@@ -1049,7 +1084,8 @@  static void uclamp_fork(struct task_struct *p, bool reset)
 
 		p->uclamp[clamp_id].mapped = false;
 		p->uclamp[clamp_id].active = false;
-		uclamp_bucket_inc(&p->uclamp[clamp_id], clamp_id, clamp_value);
+		uclamp_bucket_inc(NULL, &p->uclamp[clamp_id],
+				  clamp_id, clamp_value);
 	}
 }
 
@@ -1069,7 +1105,7 @@  static void __init init_uclamp(void)
 	memset(uclamp_maps, 0, sizeof(uclamp_maps));
 	for (clamp_id = 0; clamp_id < UCLAMP_CNT; ++clamp_id) {
 		uc_se = &init_task.uclamp[clamp_id];
-		uclamp_bucket_inc(uc_se, clamp_id, uclamp_none(clamp_id));
+		uclamp_bucket_inc(NULL, uc_se, clamp_id, uclamp_none(clamp_id));
 	}
 }