From patchwork Sat Sep 14 08:52:48 2019
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: =?utf-8?b?SsO8cmdlbiBHcm/Dnw==?= <jgross@suse.com>
X-Patchwork-Id: 11145657
Return-Path: <SRS0=7ja4=XJ=lists.xenproject.org=xen-devel-bounces@kernel.org>
Received: from mail.kernel.org (pdx-korg-mail-1.web.codeaurora.org
 [172.30.200.123])
	by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 308051800
	for <patchwork-xen-devel@patchwork.kernel.org>;
 Sat, 14 Sep 2019 08:56:14 +0000 (UTC)
Received: from lists.xenproject.org (lists.xenproject.org [192.237.175.120])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by mail.kernel.org (Postfix) with ESMTPS id 0B67520717
	for <patchwork-xen-devel@patchwork.kernel.org>;
 Sat, 14 Sep 2019 08:56:14 +0000 (UTC)
DMARC-Filter: OpenDMARC Filter v1.3.2 mail.kernel.org 0B67520717
Authentication-Results: mail.kernel.org;
 dmarc=none (p=none dis=none) header.from=suse.com
Authentication-Results: mail.kernel.org;
 spf=none smtp.mailfrom=xen-devel-bounces@lists.xenproject.org
Received: from localhost ([127.0.0.1] helo=lists.xenproject.org)
	by lists.xenproject.org with esmtp (Exim 4.89)
	(envelope-from <xen-devel-bounces@lists.xenproject.org>)
	id 1i93ph-0002FV-Ey; Sat, 14 Sep 2019 08:55:05 +0000
Received: from all-amaz-eas1.inumbo.com ([34.197.232.57]
 helo=us1-amaz-eas2.inumbo.com)
 by lists.xenproject.org with esmtp (Exim 4.89)
 (envelope-from <SRS0=rDpt=XJ=suse.com=jgross@srs-us1.protection.inumbo.net>)
 id 1i93pf-0002Ci-Sa
 for xen-devel@lists.xenproject.org; Sat, 14 Sep 2019 08:55:03 +0000
X-Inumbo-ID: 1320bd48-d6cd-11e9-95c1-12813bfff9fa
Received: from mx1.suse.de (unknown [195.135.220.15])
 by us1-amaz-eas2.inumbo.com (Halon) with ESMTPS
 id 1320bd48-d6cd-11e9-95c1-12813bfff9fa;
 Sat, 14 Sep 2019 08:53:09 +0000 (UTC)
X-Virus-Scanned: by amavisd-new at test-mx.suse.de
Received: from relay2.suse.de (unknown [195.135.220.254])
 by mx1.suse.de (Postfix) with ESMTP id 9D32AB66A;
 Sat, 14 Sep 2019 08:53:08 +0000 (UTC)
From: Juergen Gross <jgross@suse.com>
To: xen-devel@lists.xenproject.org
Date: Sat, 14 Sep 2019 10:52:48 +0200
Message-Id: <20190914085251.18816-45-jgross@suse.com>
X-Mailer: git-send-email 2.16.4
In-Reply-To: <20190914085251.18816-1-jgross@suse.com>
References: <20190914085251.18816-1-jgross@suse.com>
Subject: [Xen-devel] [PATCH v3 44/47] xen/sched: support differing
 granularity in schedule_cpu_[add/rm]()
X-BeenThere: xen-devel@lists.xenproject.org
X-Mailman-Version: 2.1.23
Precedence: list
List-Id: Xen developer discussion <xen-devel.lists.xenproject.org>
List-Unsubscribe: <https://lists.xenproject.org/mailman/options/xen-devel>,
 <mailto:xen-devel-request@lists.xenproject.org?subject=unsubscribe>
List-Post: <mailto:xen-devel@lists.xenproject.org>
List-Help: <mailto:xen-devel-request@lists.xenproject.org?subject=help>
List-Subscribe: <https://lists.xenproject.org/mailman/listinfo/xen-devel>,
 <mailto:xen-devel-request@lists.xenproject.org?subject=subscribe>
Cc: Juergen Gross <jgross@suse.com>,
 George Dunlap <george.dunlap@eu.citrix.com>,
 Dario Faggioli <dfaggioli@suse.com>
MIME-Version: 1.0
Errors-To: xen-devel-bounces@lists.xenproject.org
Sender: "Xen-devel" <xen-devel-bounces@lists.xenproject.org>

With core scheduling active schedule_cpu_[add/rm]() has to cope with
different scheduling granularity: a cpu not in any cpupool is subject
to granularity 1 (cpu scheduling), while a cpu in a cpupool might be
in a scheduling resource with more than one cpu.

Handle that by having arrays of old/new pdata and vdata and loop over
those where appropriate.

Additionally the scheduling resource(s) must either be merged or
split.

Signed-off-by: Juergen Gross <jgross@suse.com>
---
 xen/common/cpupool.c  |  18 ++--
 xen/common/schedule.c | 226 +++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 204 insertions(+), 40 deletions(-)

diff --git a/xen/common/cpupool.c b/xen/common/cpupool.c
index f53ceec3ab..ada3a7e824 100644
--- a/xen/common/cpupool.c
+++ b/xen/common/cpupool.c
@@ -531,6 +531,7 @@ static void cpupool_cpu_remove(unsigned int cpu)
         ret = cpupool_unassign_cpu_finish(cpupool0);
         BUG_ON(ret);
     }
+    cpumask_clear_cpu(cpu, &cpupool_free_cpus);
 }
 
 /*
@@ -580,20 +581,19 @@ static void cpupool_cpu_remove_forced(unsigned int cpu)
     struct cpupool **c;
     int ret;
 
-    if ( cpumask_test_cpu(cpu, &cpupool_free_cpus) )
-        cpumask_clear_cpu(cpu, &cpupool_free_cpus);
-    else
+    for_each_cpupool ( c )
     {
-        for_each_cpupool(c)
+        if ( cpumask_test_cpu(cpu, (*c)->cpu_valid) )
         {
-            if ( cpumask_test_cpu(cpu, (*c)->cpu_valid) )
-            {
-                ret = cpupool_unassign_cpu(*c, cpu);
-                BUG_ON(ret);
-            }
+            ret = cpupool_unassign_cpu_start(*c, cpu);
+            BUG_ON(ret);
+            ret = cpupool_unassign_cpu_finish(*c);
+            BUG_ON(ret);
         }
     }
 
+    cpumask_clear_cpu(cpu, &cpupool_free_cpus);
+
     rcu_read_lock(&sched_res_rculock);
     sched_rm_cpu(cpu);
     rcu_read_unlock(&sched_res_rculock);
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index 882b3baf42..4d56cd907f 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -404,27 +404,30 @@ static void sched_unit_add_vcpu(struct sched_unit *unit, struct vcpu *v)
     unit->runstate_cnt[v->runstate.state]++;
 }
 
-static struct sched_unit *sched_alloc_unit(struct vcpu *v)
+static struct sched_unit *sched_alloc_unit_mem(void)
 {
-    struct sched_unit *unit, **prev_unit;
-    struct domain *d = v->domain;
-    unsigned int gran = d->cpupool ? d->cpupool->granularity : 1;
+    struct sched_unit *unit;
 
-    for_each_sched_unit ( d, unit )
-        if ( unit->vcpu_list->vcpu_id / gran == v->vcpu_id / gran )
-            break;
+    unit = xzalloc(struct sched_unit);
+    if ( !unit )
+        return NULL;
 
-    if ( unit )
+    if ( !zalloc_cpumask_var(&unit->cpu_hard_affinity) ||
+         !zalloc_cpumask_var(&unit->cpu_hard_affinity_saved) ||
+         !zalloc_cpumask_var(&unit->cpu_soft_affinity) )
     {
-        sched_unit_add_vcpu(unit, v);
-        return unit;
+        sched_free_unit_mem(unit);
+        unit = NULL;
     }
 
-    if ( (unit = xzalloc(struct sched_unit)) == NULL )
-        return NULL;
+    return unit;
+}
+
+static void sched_domain_insert_unit(struct sched_unit *unit, struct domain *d)
+{
+    struct sched_unit **prev_unit;
 
     unit->domain = d;
-    sched_unit_add_vcpu(unit, v);
 
     for ( prev_unit = &d->sched_unit_list; *prev_unit;
           prev_unit = &(*prev_unit)->next_in_list )
@@ -434,17 +437,31 @@ static struct sched_unit *sched_alloc_unit(struct vcpu *v)
 
     unit->next_in_list = *prev_unit;
     *prev_unit = unit;
+}
 
-    if ( !zalloc_cpumask_var(&unit->cpu_hard_affinity) ||
-         !zalloc_cpumask_var(&unit->cpu_hard_affinity_saved) ||
-         !zalloc_cpumask_var(&unit->cpu_soft_affinity) )
-        goto fail;
+static struct sched_unit *sched_alloc_unit(struct vcpu *v)
+{
+    struct sched_unit *unit;
+    struct domain *d = v->domain;
+    unsigned int gran = d->cpupool ? d->cpupool->granularity : 1;
 
-    return unit;
+    for_each_sched_unit ( d, unit )
+        if ( unit->vcpu_list->vcpu_id / gran == v->vcpu_id / gran )
+            break;
 
- fail:
-    sched_free_unit(unit, v);
-    return NULL;
+    if ( unit )
+    {
+        sched_unit_add_vcpu(unit, v);
+        return unit;
+    }
+
+    if ( (unit = sched_alloc_unit_mem()) == NULL )
+        return NULL;
+
+    sched_unit_add_vcpu(unit, v);
+    sched_domain_insert_unit(unit, d);
+
+    return unit;
 }
 
 static unsigned int sched_select_initial_cpu(const struct vcpu *v)
@@ -2381,18 +2398,28 @@ static void poll_timer_fn(void *data)
         vcpu_unblock(v);
 }
 
-static int cpu_schedule_up(unsigned int cpu)
+static struct sched_resource *sched_alloc_res(void)
 {
     struct sched_resource *sd;
 
     sd = xzalloc(struct sched_resource);
     if ( sd == NULL )
-        return -ENOMEM;
+        return NULL;
     if ( !zalloc_cpumask_var(&sd->cpus) )
     {
         xfree(sd);
-        return -ENOMEM;
+        return NULL;
     }
+    return sd;
+}
+
+static int cpu_schedule_up(unsigned int cpu)
+{
+    struct sched_resource *sd;
+
+    sd = sched_alloc_res();
+    if ( sd == NULL )
+        return -ENOMEM;
 
     sd->master_cpu = cpu;
     cpumask_copy(sd->cpus, cpumask_of(cpu));
@@ -2442,6 +2469,8 @@ static void sched_res_free(struct rcu_head *head)
     struct sched_resource *sd = container_of(head, struct sched_resource, rcu);
 
     free_cpumask_var(sd->cpus);
+    if ( sd->sched_unit_idle )
+        sched_free_unit_mem(sd->sched_unit_idle);
     xfree(sd);
 }
 
@@ -2456,6 +2485,8 @@ static void cpu_schedule_down(unsigned int cpu)
     kill_timer(&sd->s_timer);
 
     set_sched_res(cpu, NULL);
+    /* Keep idle unit. */
+    sd->sched_unit_idle = NULL;
     call_rcu(&sd->rcu, sched_res_free);
 
     rcu_read_unlock(&sched_res_rculock);
@@ -2535,6 +2566,30 @@ static struct notifier_block cpu_schedule_nfb = {
     .notifier_call = cpu_schedule_callback
 };
 
+static const cpumask_t *sched_get_opt_cpumask(enum sched_gran opt,
+                                              unsigned int cpu)
+{
+    const cpumask_t *mask;
+
+    switch ( opt )
+    {
+    case SCHED_GRAN_cpu:
+        mask = cpumask_of(cpu);
+        break;
+    case SCHED_GRAN_core:
+        mask = per_cpu(cpu_sibling_mask, cpu);
+        break;
+    case SCHED_GRAN_socket:
+        mask = per_cpu(cpu_core_mask, cpu);
+        break;
+    default:
+        ASSERT_UNREACHABLE();
+        return NULL;
+    }
+
+    return mask;
+}
+
 /* Initialise the data structures. */
 void __init scheduler_init(void)
 {
@@ -2693,6 +2748,46 @@ int schedule_cpu_add(unsigned int cpu, struct cpupool *c)
      */
     old_lock = pcpu_schedule_lock_irqsave(cpu, &flags);
 
+    if ( c->granularity > 1 )
+    {
+        const cpumask_t *mask;
+        unsigned int cpu_iter, idx = 0;
+        struct sched_unit *old_unit, *master_unit;
+        struct sched_resource *sd_old;
+
+        /*
+         * We need to merge multiple idle_vcpu units and sched_resource structs
+         * into one. As the free cpus all share the same lock we are fine doing
+         * that now. The worst which could happen would be someone waiting for
+         * the lock, thus dereferencing sched_res->schedule_lock. This is the
+         * reason we are freeing struct sched_res via call_rcu() to avoid the
+         * lock pointer suddenly disappearing.
+         */
+        mask = sched_get_opt_cpumask(c->opt_granularity, cpu);
+        master_unit = idle_vcpu[cpu]->sched_unit;
+
+        for_each_cpu ( cpu_iter, mask )
+        {
+            if ( idx )
+                cpumask_clear_cpu(cpu_iter, sched_res_mask);
+
+            per_cpu(sched_res_idx, cpu_iter) = idx++;
+
+            if ( cpu == cpu_iter )
+                continue;
+
+            old_unit = idle_vcpu[cpu_iter]->sched_unit;
+            sd_old = get_sched_res(cpu_iter);
+            kill_timer(&sd_old->s_timer);
+            idle_vcpu[cpu_iter]->sched_unit = master_unit;
+            master_unit->runstate_cnt[RUNSTATE_running]++;
+            set_sched_res(cpu_iter, sd);
+            cpumask_set_cpu(cpu_iter, sd->cpus);
+
+            call_rcu(&sd_old->rcu, sched_res_free);
+        }
+    }
+
     new_lock = sched_switch_sched(new_ops, cpu, ppriv, vpriv);
 
     sd->scheduler = new_ops;
@@ -2730,33 +2825,100 @@ out:
  */
 int schedule_cpu_rm(unsigned int cpu)
 {
-    struct vcpu *idle;
     void *ppriv_old, *vpriv_old;
-    struct sched_resource *sd;
+    struct sched_resource *sd, **sd_new = NULL;
+    struct sched_unit *unit;
     struct scheduler *old_ops;
     spinlock_t *old_lock;
     unsigned long flags;
+    int idx, ret = -ENOMEM;
+    unsigned int cpu_iter;
 
     rcu_read_lock(&sched_res_rculock);
 
     sd = get_sched_res(cpu);
     old_ops = sd->scheduler;
 
+    if ( sd->granularity > 1 )
+    {
+        sd_new = xmalloc_array(struct sched_resource *, sd->granularity - 1);
+        if ( !sd_new )
+            goto out;
+        for ( idx = 0; idx < sd->granularity - 1; idx++ )
+        {
+            sd_new[idx] = sched_alloc_res();
+            if ( sd_new[idx] )
+            {
+                sd_new[idx]->sched_unit_idle = sched_alloc_unit_mem();
+                if ( !sd_new[idx]->sched_unit_idle )
+                {
+                    sched_res_free(&sd_new[idx]->rcu);
+                    sd_new[idx] = NULL;
+                }
+            }
+            if ( !sd_new[idx] )
+            {
+                for ( idx--; idx >= 0; idx-- )
+                    sched_res_free(&sd_new[idx]->rcu);
+                goto out;
+            }
+            sd_new[idx]->curr = sd_new[idx]->sched_unit_idle;
+            sd_new[idx]->scheduler = &sched_idle_ops;
+            sd_new[idx]->granularity = 1;
+
+            /* We want the lock not to change when replacing the resource. */
+            sd_new[idx]->schedule_lock = sd->schedule_lock;
+        }
+    }
+
+    ret = 0;
     ASSERT(sd->cpupool != NULL);
     ASSERT(cpumask_test_cpu(cpu, &cpupool_free_cpus));
     ASSERT(!cpumask_test_cpu(cpu, sd->cpupool->cpu_valid));
 
-    idle = idle_vcpu[cpu];
-
     sched_do_tick_suspend(old_ops, cpu);
 
     /* See comment in schedule_cpu_add() regarding lock switching. */
     old_lock = pcpu_schedule_lock_irqsave(cpu, &flags);
 
-    vpriv_old = idle->sched_unit->priv;
+    vpriv_old = idle_vcpu[cpu]->sched_unit->priv;
     ppriv_old = sd->sched_priv;
 
-    idle->sched_unit->priv = NULL;
+    idx = 0;
+    for_each_cpu ( cpu_iter, sd->cpus )
+    {
+        per_cpu(sched_res_idx, cpu_iter) = 0;
+        if ( cpu_iter == cpu )
+        {
+            idle_vcpu[cpu_iter]->sched_unit->priv = NULL;
+        }
+        else
+        {
+            /* Initialize unit. */
+            unit = sd_new[idx]->sched_unit_idle;
+            unit->res = sd_new[idx];
+            unit->is_running = true;
+            sched_unit_add_vcpu(unit, idle_vcpu[cpu_iter]);
+            sched_domain_insert_unit(unit, idle_vcpu[cpu_iter]->domain);
+
+            /* Adjust cpu masks of resources (old and new). */
+            cpumask_clear_cpu(cpu_iter, sd->cpus);
+            cpumask_set_cpu(cpu_iter, sd_new[idx]->cpus);
+
+            /* Init timer. */
+            init_timer(&sd_new[idx]->s_timer, s_timer_fn, NULL, cpu_iter);
+
+            /* Last resource initializations and insert resource pointer. */
+            sd_new[idx]->master_cpu = cpu_iter;
+            set_sched_res(cpu_iter, sd_new[idx]);
+
+            /* Last action: set the new lock pointer. */
+            smp_mb();
+            sd_new[idx]->schedule_lock = &sched_free_cpu_lock;
+
+            idx++;
+        }
+    }
     sd->scheduler = &sched_idle_ops;
     sd->sched_priv = NULL;
 
@@ -2774,9 +2936,11 @@ int schedule_cpu_rm(unsigned int cpu)
     sd->granularity = 1;
     sd->cpupool = NULL;
 
+out:
     rcu_read_unlock(&sched_res_rculock);
+    xfree(sd_new);
 
-    return 0;
+    return ret;
 }
 
 struct scheduler *scheduler_get_default(void)