@@ -296,14 +296,10 @@ __runq_insert(struct csched_unit *svc)
/* If the unit yielded, try to put it behind one lower-priority
* runnable unit if we can. The next runq_sort will bring it forward
* within 30ms if the queue is too long. */
if ( test_bit(CSCHED_FLAG_UNIT_YIELD, &svc->flags)
- && __runq_elem(iter)->pri > CSCHED_PRI_IDLE )
- {
+ && __runq_elem(iter)->pri > CSCHED_PRI_IDLE
+ && iter->next != runq )
iter=iter->next;
- /* Some sanity checks */
- BUG_ON(iter == runq);
- }
-
list_add_tail(&svc->runq_elem, iter);
}
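
For context, this is roughly how the insertion path reads once the hunk
above is applied. A hand-assembled sketch, not part of the patch; the
priority scan that positions iter is abridged:

    /* iter now points at the first entry with lower priority than svc,
     * or back at the list head (runq) if no such entry exists. */
    if ( test_bit(CSCHED_FLAG_UNIT_YIELD, &svc->flags)
         && __runq_elem(iter)->pri > CSCHED_PRI_IDLE
         && iter->next != runq )
        iter = iter->next;

    list_add_tail(&svc->runq_elem, iter);

The iter->next != runq test takes over from the old BUG_ON(): because
the YIELD flag now survives until the unit leaves the runqueue, a
yielded unit can be inserted into an empty or short queue where no
lower-priority entry follows, and the extra step simply has to be
skipped rather than treated as impossible.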
@@ -319,6 +315,11 @@ __runq_remove(struct csched_unit *svc)
{
BUG_ON( !__unit_on_runq(svc) );
list_del_init(&svc->runq_elem);
+
+ /*
+ * Clear YIELD flag when scheduling back in
+ */
+ clear_bit(CSCHED_FLAG_UNIT_YIELD, &svc->flags);
}
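
For reference, the flag cleared here is set on the yield path. A sketch
of that pre-existing hook, reconstructed from credit.c and unchanged by
this patch; the exact prototype may differ between Xen versions:

    static void cf_check
    csched_unit_yield(const struct scheduler *ops, struct sched_unit *unit)
    {
        struct csched_unit * const svc = CSCHED_UNIT(unit);

        /* Let the scheduler know that this unit is trying to yield */
        set_bit(CSCHED_FLAG_UNIT_YIELD, &svc->flags);
    }

With this patch the flag therefore stays set for the whole time the
unit sits on a runqueue, instead of being cleared on the next pass
through csched_schedule().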
static inline void
@@ -1638,6 +1639,13 @@ csched_runq_steal(int peer_cpu, int cpu, int pri, int balance_step)
if ( speer->pri <= pri )
break;
+ /*
+ * Don't steal a UNIT which has yielded; it's waiting for a
+ * reason
+ */
+ if ( test_bit(CSCHED_FLAG_UNIT_YIELD, &speer->flags) )
+ continue;
+
/* Is this UNIT runnable on our PCPU? */
unit = speer->unit;
BUG_ON( is_idle_unit(unit) );
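
Abridged sketch of the scan this check lives in; the loop shape is
reconstructed from the csched_runq_steal() context above, and details
such as the migratability checks are elided:

    list_for_each( iter, &peer_pcpu->runq )
    {
        struct csched_unit *speer = __runq_elem(iter);

        /* The runq is kept in priority order: once entries are no
         * better than what we already have, stop looking. */
        if ( speer->pri <= pri )
            break;

        /* New: leave yielded units where they are. */
        if ( test_bit(CSCHED_FLAG_UNIT_YIELD, &speer->flags) )
            continue;

        /* ... check the unit can run on this pcpu, then steal it ... */
    }

Note the continue rather than break: a yielded unit may be followed by
other units whose priority still beats ours, and those remain fair game
to steal.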
@@ -1955,11 +1963,6 @@ static void cf_check csched_schedule(
dec_nr_runnable(sched_cpu);
}
- /*
- * Clear YIELD flag before scheduling out
- */
- clear_bit(CSCHED_FLAG_UNIT_YIELD, &scurr->flags);
-
do {
snext = __runq_elem(runq->next);
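
Taken together with the __runq_remove() hunk above, this moves the
point at which the flag is dropped. Summary (descriptive comment only,
not patch code):

    /*
     * CSCHED_FLAG_UNIT_YIELD lifetime:
     *
     *   before: set in csched_unit_yield(); cleared in
     *           csched_schedule(), just before the next unit is picked
     *           from the runq, i.e. before the yielding unit is even
     *           re-inserted.
     *
     *   after:  set in csched_unit_yield(); stays set while the unit
     *           is queued, so __runq_insert() and csched_runq_steal()
     *           can honour it; cleared in __runq_remove() when the
     *           unit leaves the queue.
     */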
@@ -1974,10 +1977,11 @@ static void cf_check csched_schedule(
/*
* SMP Load balance:
*
- * If the next highest priority local runnable UNIT has already eaten
- * through its credits, look on other PCPUs to see if we have more
- * urgent work... If not, csched_load_balance() will return snext, but
- * already removed from the runq.
+ * If the next highest priority local runnable UNIT has
+ * already eaten through its credits (and we're below the
+ * balancing ratelimit), look on other PCPUs to see if we have
+ * more urgent work... If we don't, csched_load_balance() will
+ * return snext, but already removed from the runq.
*/
if ( snext->pri <= CSCHED_PRI_TS_OVER
&& now - spc->last_load_balance > prv->load_balance_ratelimit ) {
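
For completeness, how this condition gates the balancing path. A hedged
sketch: the last_load_balance / load_balance_ratelimit fields come from
an earlier patch in this series, and the out-parameter of
csched_load_balance() is illustrative:

    if ( snext->pri <= CSCHED_PRI_TS_OVER
         && now - spc->last_load_balance > prv->load_balance_ratelimit )
    {
        /* May return snext itself, already removed from the runq,
         * if no pcpu has more urgent work for us. */
        snext = csched_load_balance(prv, sched_cpu, snext, &migrated);

        /* Remember when we last went looking, for the ratelimit. */
        spc->last_load_balance = now;
    }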
On large systems with many vcpus yielding due to spinlock priority
inversion, it's not uncommon for a vcpu to yield its timeslice, only to
be immediately stolen by another pcpu looking for higher-priority work.

To prevent this:

* Keep the YIELD flag until a vcpu is removed from a runqueue

* When looking for work to steal, skip vcpus which have yielded

NB that this does mean that sometimes a VM is inserted into an empty
runqueue; handle that case.

Signed-off-by: George Dunlap <george.dunlap@cloud.com>
---
CC: Dario Faggioli <dfaggioli@suse.com>
---
 xen/common/sched/credit.c | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)