diff mbox

[v2,04/10] xen: credit2: only reset credit on reset condition

Message ID 147520402650.22544.2188671255927985759.stgit@Solace.fritz.box (mailing list archive)
State New, archived
Headers show

Commit Message

Dario Faggioli Sept. 30, 2016, 2:53 a.m. UTC
The condition for a Credit2 scheduling epoch coming to an
end is that the vcpu at the front of the runqueue has negative
credits. However, it is possible that runq_candidate() does
not actually return to the scheduler the first vcpu in the
runqueue (e.g., because such vcpu can't run on the cpu that
is going through the scheduler, because of hard-affinity).

If that happens, we should not trigger a credit reset, or we
risk altering the length of a scheduler epoch, wrt what the
original idea of the algorithm was.

Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
---
Cc: George Dunlap <george.dunlap@eu.citrix.com>
Cc: Anshul Makkar <anshul.makkar@citrix.com>
---
Changes from v1:
 * new patch, containing part of what was in patch 5;
 * (wrt v1 patch 5) 'pos' parameter to runq_candidate renamed 'skipped', as
   requested during review.
---
 xen/common/sched_credit2.c |   33 ++++++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)

Comments

George Dunlap Sept. 30, 2016, 11:28 a.m. UTC | #1
On 30/09/16 03:53, Dario Faggioli wrote:
> The condition for a Credit2 scheduling epoch coming to an
> end is that the vcpu at the front of the runqueue has negative
> credits. However, it is possible, that runq_candidate() does
> not actually return to the scheduler the first vcpu in the
> runqueue (e.g., because such vcpu can't run on the cpu that
> is going through the scheduler, because of hard-affinity).
> 
> If that happens, we should not trigger a credit reset, or we
> risk altering the lenght of a scheduler epoch, wrt what the
> original idea of the algorithm was.
> 
> Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>

Reviewed-by: George Dunlap <george.dunlap@citrix.com>

> ---
> Cc: George Dunlap <george.dunlap@eu.citrix.com>
> Cc: Anshul Makkar <anshul.makkar@citrix.com>
> ---
> Changes from v1:
>  * new patch, containing part of what was in patch 5;
>  * (wrt v1 patch 5) 'pos' parameter to runq_candidate renamed 'skipped', as
>    requested during review.
> ---
>  xen/common/sched_credit2.c |   33 ++++++++++++++++++++++++++++-----
>  1 file changed, 28 insertions(+), 5 deletions(-)
> 
> diff --git a/xen/common/sched_credit2.c b/xen/common/sched_credit2.c
> index 3986441..72e31b5 100644
> --- a/xen/common/sched_credit2.c
> +++ b/xen/common/sched_credit2.c
> @@ -2244,12 +2244,15 @@ void __dump_execstate(void *unused);
>  static struct csched2_vcpu *
>  runq_candidate(struct csched2_runqueue_data *rqd,
>                 struct csched2_vcpu *scurr,
> -               int cpu, s_time_t now)
> +               int cpu, s_time_t now,
> +               unsigned int *skipped)
>  {
>      struct list_head *iter;
>      struct csched2_vcpu *snext = NULL;
>      struct csched2_private *prv = CSCHED2_PRIV(per_cpu(scheduler, cpu));
>  
> +    *skipped = 0;
> +
>      /* Default to current if runnable, idle otherwise */
>      if ( vcpu_runnable(scurr->vcpu) )
>          snext = scurr;
> @@ -2273,7 +2276,10 @@ runq_candidate(struct csched2_runqueue_data *rqd,
>  
>          /* Only consider vcpus that are allowed to run on this processor. */
>          if ( !cpumask_test_cpu(cpu, svc->vcpu->cpu_hard_affinity) )
> +        {
> +            (*skipped)++;
>              continue;
> +        }
>  
>          /*
>           * If a vcpu is meant to be picked up by another processor, and such
> @@ -2282,6 +2288,7 @@ runq_candidate(struct csched2_runqueue_data *rqd,
>          if ( svc->tickled_cpu != -1 && svc->tickled_cpu != cpu &&
>               cpumask_test_cpu(svc->tickled_cpu, &rqd->tickled) )
>          {
> +            (*skipped)++;
>              SCHED_STAT_CRANK(deferred_to_tickled_cpu);
>              continue;
>          }
> @@ -2291,6 +2298,7 @@ runq_candidate(struct csched2_runqueue_data *rqd,
>          if ( svc->vcpu->processor != cpu
>               && snext->credit + CSCHED2_MIGRATE_RESIST > svc->credit )
>          {
> +            (*skipped)++;
>              SCHED_STAT_CRANK(migrate_resisted);
>              continue;
>          }
> @@ -2308,11 +2316,12 @@ runq_candidate(struct csched2_runqueue_data *rqd,
>      {
>          struct {
>              unsigned vcpu:16, dom:16;
> -            unsigned tickled_cpu;
> +            unsigned tickled_cpu, skipped;
>          } d;
>          d.dom = snext->vcpu->domain->domain_id;
>          d.vcpu = snext->vcpu->vcpu_id;
>          d.tickled_cpu = snext->tickled_cpu;
> +        d.skipped = *skipped;
>          __trace_var(TRC_CSCHED2_RUNQ_CANDIDATE, 1,
>                      sizeof(d),
>                      (unsigned char *)&d);
> @@ -2336,6 +2345,7 @@ csched2_schedule(
>      struct csched2_runqueue_data *rqd;
>      struct csched2_vcpu * const scurr = CSCHED2_VCPU(current);
>      struct csched2_vcpu *snext = NULL;
> +    unsigned int skipped_vcpus = 0;
>      struct task_slice ret;
>  
>      SCHED_STAT_CRANK(schedule);
> @@ -2385,7 +2395,7 @@ csched2_schedule(
>          snext = CSCHED2_VCPU(idle_vcpu[cpu]);
>      }
>      else
> -        snext = runq_candidate(rqd, scurr, cpu, now);
> +        snext = runq_candidate(rqd, scurr, cpu, now, &skipped_vcpus);
>  
>      /* If switching from a non-idle runnable vcpu, put it
>       * back on the runqueue. */
> @@ -2409,8 +2419,21 @@ csched2_schedule(
>              __set_bit(__CSFLAG_scheduled, &snext->flags);
>          }
>  
> -        /* Check for the reset condition */
> -        if ( snext->credit <= CSCHED2_CREDIT_RESET )
> +        /*
> +         * The reset condition is "has a scheduler epoch come to an end?".
> +         * The way this is enforced is checking whether the vcpu at the top
> +         * of the runqueue has negative credits. This means the epochs have
> +         * variable lenght, as in one epoch expores when:
> +         *  1) the vcpu at the top of the runqueue has executed for
> +         *     around 10 ms (with default parameters);
> +         *  2) no other vcpu with higher credits wants to run.
> +         *
> +         * Here, where we want to check for reset, we need to make sure the
> +         * proper vcpu is being used. In fact, runqueue_candidate() may have
> +         * not returned the first vcpu in the runqueue, for various reasons
> +         * (e.g., affinity). Only trigger a reset when it does.
> +         */
> +        if ( skipped_vcpus == 0 && snext->credit <= CSCHED2_CREDIT_RESET )
>          {
>              reset_credit(ops, cpu, now, snext);
>              balance_load(ops, cpu, now);
>
Anshul Makkar Sept. 30, 2016, 12:25 p.m. UTC | #2
On 30/09/16 03:53, Dario Faggioli wrote:
> The condition for a Credit2 scheduling epoch coming to an
> end is that the vcpu at the front of the runqueue has negative
> credits. However, it is possible, that runq_candidate() does
> not actually return to the scheduler the first vcpu in the
> runqueue (e.g., because such vcpu can't run on the cpu that
> is going through the scheduler, because of hard-affinity).
>
> If that happens, we should not trigger a credit reset, or we
> risk altering the lenght of a scheduler epoch, wrt what the
> original idea of the algorithm was.
>
> Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
> ---
> Cc: George Dunlap <george.dunlap@eu.citrix.com>
> Cc: Anshul Makkar <anshul.makkar@citrix.com>
> ---
>
>          /*
>           * If a vcpu is meant to be picked up by another processor, and such
> @@ -2282,6 +2288,7 @@ runq_candidate(struct csched2_runqueue_data *rqd,
>          if ( svc->tickled_cpu != -1 && svc->tickled_cpu != cpu &&
>               cpumask_test_cpu(svc->tickled_cpu, &rqd->tickled) )
>          {
> +            (*skipped)++;
>              SCHED_STAT_CRANK(deferred_to_tickled_cpu);
>              continue;
>          }
> @@ -2291,6 +2298,7 @@ runq_candidate(struct csched2_runqueue_data *rqd,
>          if ( svc->vcpu->processor != cpu
>               && snext->credit + CSCHED2_MIGRATE_RESIST > svc->credit )
>          {
> +            (*skipped)++;
>              SCHED_STAT_CRANK(migrate_resisted);
>              continue;
>          }
> @@ -2308,11 +2316,12 @@ runq_candidate(struct csched2_runqueue_data *rqd,
>      {
>          struct {
>              unsigned vcpu:16, dom:16;
> -            unsigned tickled_cpu;
> +            unsigned tickled_cpu, skipped;
>          } d;
>          d.dom = snext->vcpu->domain->domain_id;
>          d.vcpu = snext->vcpu->vcpu_id;
>          d.tickled_cpu = snext->tickled_cpu;
> +        d.skipped = *skipped;
>          __trace_var(TRC_CSCHED2_RUNQ_CANDIDATE, 1,
>                      sizeof(d),
>                      (unsigned char *)&d);
> @@ -2336,6 +2345,7 @@ csched2_schedule(
>      struct csched2_runqueue_data *rqd;
>      struct csched2_vcpu * const scurr = CSCHED2_VCPU(current);
>      struct csched2_vcpu *snext = NULL;
> +    unsigned int skipped_vcpus = 0;
>      struct task_slice ret;
>
>      SCHED_STAT_CRANK(schedule);
> @@ -2385,7 +2395,7 @@ csched2_schedule(
>          snext = CSCHED2_VCPU(idle_vcpu[cpu]);
>      }
>      else
> -        snext = runq_candidate(rqd, scurr, cpu, now);
> +        snext = runq_candidate(rqd, scurr, cpu, now, &skipped_vcpus);
>
>      /* If switching from a non-idle runnable vcpu, put it
>       * back on the runqueue. */
> @@ -2409,8 +2419,21 @@ csched2_schedule(
>              __set_bit(__CSFLAG_scheduled, &snext->flags);
>          }
>
> -        /* Check for the reset condition */
> -        if ( snext->credit <= CSCHED2_CREDIT_RESET )
> +        /*
> +         * The reset condition is "has a scheduler epoch come to an end?".
> +         * The way this is enforced is checking whether the vcpu at the top
> +         * of the runqueue has negative credits. This means the epochs have
> +         * variable lenght, as in one epoch expores when:
> +         *  1) the vcpu at the top of the runqueue has executed for
> +         *     around 10 ms (with default parameters);
> +         *  2) no other vcpu with higher credits wants to run.
> +         *
> +         * Here, where we want to check for reset, we need to make sure the
> +         * proper vcpu is being used. In fact, runqueue_candidate() may have
> +         * not returned the first vcpu in the runqueue, for various reasons
> +         * (e.g., affinity). Only trigger a reset when it does.
> +         */
> +        if ( skipped_vcpus == 0 && snext->credit <= CSCHED2_CREDIT_RESET )
>          {
Is there no other way of checking that the returned vcpu from 
runqueue_candidate is the first one apart from using "skipped_vcpu" 
extra variable. Looks a bit ugly.
>              reset_credit(ops, cpu, now, snext);
>              balance_load(ops, cpu, now);
>
Dario Faggioli Sept. 30, 2016, 12:57 p.m. UTC | #3
On Fri, 2016-09-30 at 13:25 +0100, anshul makkar wrote:
> On 30/09/16 03:53, Dario Faggioli wrote:
> > @@ -2336,6 +2345,7 @@ csched2_schedule(
> >      struct csched2_runqueue_data *rqd;
> >      struct csched2_vcpu * const scurr = CSCHED2_VCPU(current);
> >      struct csched2_vcpu *snext = NULL;
> > +    unsigned int skipped_vcpus = 0;
> >      struct task_slice ret;
> > 
> >      SCHED_STAT_CRANK(schedule);
> > @@ -2385,7 +2395,7 @@ csched2_schedule(
> >          snext = CSCHED2_VCPU(idle_vcpu[cpu]);
> >      }
> >      else
> > -        snext = runq_candidate(rqd, scurr, cpu, now);
> > +        snext = runq_candidate(rqd, scurr, cpu, now,
> > &skipped_vcpus);
> > 
> >      /* If switching from a non-idle runnable vcpu, put it
> >       * back on the runqueue. */
> > @@ -2409,8 +2419,21 @@ csched2_schedule(
> >              __set_bit(__CSFLAG_scheduled, &snext->flags);
> >          }
> > 
> > -        /* Check for the reset condition */
> > -        if ( snext->credit <= CSCHED2_CREDIT_RESET )
> > +        /*
> > +         * The reset condition is "has a scheduler epoch come to
> > an end?".
> > +         * The way this is enforced is checking whether the vcpu
> > at the top
> > +         * of the runqueue has negative credits. This means the
> > epochs have
> > +         * variable lenght, as in one epoch expores when:
> > +         *  1) the vcpu at the top of the runqueue has executed
> > for
> > +         *     around 10 ms (with default parameters);
> > +         *  2) no other vcpu with higher credits wants to run.
> > +         *
> > +         * Here, where we want to check for reset, we need to make
> > sure the
> > +         * proper vcpu is being used. In fact,
> > runqueue_candidate() may have
> > +         * not returned the first vcpu in the runqueue, for
> > various reasons
> > +         * (e.g., affinity). Only trigger a reset when it does.
> > +         */
> > +        if ( skipped_vcpus == 0 && snext->credit <=
> > CSCHED2_CREDIT_RESET )
> >          {
> Is there no other way of checking that the returned vcpu from 
> runqueue_candidate is the first one apart from using "skipped_vcpu" 
> extra variable. Looks a bit ugly.
>
Well, I guess we can try to use list_first_entry() on the runqueue, and
compare it with snext.

However, that "leaks" the bit of information that the runqueue is
implemented as a list, and would break if at some point we decide to
use something different. TBF, this is not a big problem, as we're
inside the scheduler implementation anyway (so not really too big of a
leak), and we'd most likely notice the breakage at build time... but
still.

More important, I don't think that "just" doing that would work. In
fact, if scurr is running, and there is no-one in the runqueue, and
scurr's credits are negative, right now, we reset. So, to catch that
case the condition would need to become something like this:

 if ( (list_empty(&RQD(ops, cpu)->runq) ||
       list_first_entry(&RQD(ops, cpu)->runq) == snext) &&
      snext->credit <= CSCHED2_CREDIT_RESET )

I agree that counting is a bit ugly, but this one here above does not
look much prettier to me.

Also, the number of skipped vcpus, if reported via tracing, does convey
some information on the status of the runqueue. Nothing crucial, but
something that could be useful (at least, it was to me, during
developing and debugging).

So, for these reasons, I'd be inclined to leave this as it is.

Thanks and Regards,
Dario
diff mbox

Patch

diff --git a/xen/common/sched_credit2.c b/xen/common/sched_credit2.c
index 3986441..72e31b5 100644
--- a/xen/common/sched_credit2.c
+++ b/xen/common/sched_credit2.c
@@ -2244,12 +2244,15 @@  void __dump_execstate(void *unused);
 static struct csched2_vcpu *
 runq_candidate(struct csched2_runqueue_data *rqd,
                struct csched2_vcpu *scurr,
-               int cpu, s_time_t now)
+               int cpu, s_time_t now,
+               unsigned int *skipped)
 {
     struct list_head *iter;
     struct csched2_vcpu *snext = NULL;
     struct csched2_private *prv = CSCHED2_PRIV(per_cpu(scheduler, cpu));
 
+    *skipped = 0;
+
     /* Default to current if runnable, idle otherwise */
     if ( vcpu_runnable(scurr->vcpu) )
         snext = scurr;
@@ -2273,7 +2276,10 @@  runq_candidate(struct csched2_runqueue_data *rqd,
 
         /* Only consider vcpus that are allowed to run on this processor. */
         if ( !cpumask_test_cpu(cpu, svc->vcpu->cpu_hard_affinity) )
+        {
+            (*skipped)++;
             continue;
+        }
 
         /*
          * If a vcpu is meant to be picked up by another processor, and such
@@ -2282,6 +2288,7 @@  runq_candidate(struct csched2_runqueue_data *rqd,
         if ( svc->tickled_cpu != -1 && svc->tickled_cpu != cpu &&
              cpumask_test_cpu(svc->tickled_cpu, &rqd->tickled) )
         {
+            (*skipped)++;
             SCHED_STAT_CRANK(deferred_to_tickled_cpu);
             continue;
         }
@@ -2291,6 +2298,7 @@  runq_candidate(struct csched2_runqueue_data *rqd,
         if ( svc->vcpu->processor != cpu
              && snext->credit + CSCHED2_MIGRATE_RESIST > svc->credit )
         {
+            (*skipped)++;
             SCHED_STAT_CRANK(migrate_resisted);
             continue;
         }
@@ -2308,11 +2316,12 @@  runq_candidate(struct csched2_runqueue_data *rqd,
     {
         struct {
             unsigned vcpu:16, dom:16;
-            unsigned tickled_cpu;
+            unsigned tickled_cpu, skipped;
         } d;
         d.dom = snext->vcpu->domain->domain_id;
         d.vcpu = snext->vcpu->vcpu_id;
         d.tickled_cpu = snext->tickled_cpu;
+        d.skipped = *skipped;
         __trace_var(TRC_CSCHED2_RUNQ_CANDIDATE, 1,
                     sizeof(d),
                     (unsigned char *)&d);
@@ -2336,6 +2345,7 @@  csched2_schedule(
     struct csched2_runqueue_data *rqd;
     struct csched2_vcpu * const scurr = CSCHED2_VCPU(current);
     struct csched2_vcpu *snext = NULL;
+    unsigned int skipped_vcpus = 0;
     struct task_slice ret;
 
     SCHED_STAT_CRANK(schedule);
@@ -2385,7 +2395,7 @@  csched2_schedule(
         snext = CSCHED2_VCPU(idle_vcpu[cpu]);
     }
     else
-        snext = runq_candidate(rqd, scurr, cpu, now);
+        snext = runq_candidate(rqd, scurr, cpu, now, &skipped_vcpus);
 
     /* If switching from a non-idle runnable vcpu, put it
      * back on the runqueue. */
@@ -2409,8 +2419,21 @@  csched2_schedule(
             __set_bit(__CSFLAG_scheduled, &snext->flags);
         }
 
-        /* Check for the reset condition */
-        if ( snext->credit <= CSCHED2_CREDIT_RESET )
+        /*
+         * The reset condition is "has a scheduler epoch come to an end?".
+         * The way this is enforced is checking whether the vcpu at the top
+         * of the runqueue has negative credits. This means the epochs have
+         * variable length, as in one epoch expires when:
+         *  1) the vcpu at the top of the runqueue has executed for
+         *     around 10 ms (with default parameters);
+         *  2) no other vcpu with higher credits wants to run.
+         *
+         * Here, where we want to check for reset, we need to make sure the
+         * proper vcpu is being used. In fact, runq_candidate() may not have
+         * returned the first vcpu in the runqueue, for various reasons
+         * (e.g., affinity). Only trigger a reset when it does.
+         */
+        if ( skipped_vcpus == 0 && snext->credit <= CSCHED2_CREDIT_RESET )
         {
             reset_credit(ops, cpu, now, snext);
             balance_load(ops, cpu, now);