@@ -1701,6 +1701,8 @@ void __init noreturn __start_xen(unsigned long mbi_p)
printk(XENLOG_INFO "Parked %u CPUs\n", num_parked);
smp_cpus_done();
+ scheduler_smp_init();
+
do_initcalls();
if ( opt_watchdog )
@@ -55,9 +55,32 @@ boolean_param("sched_smt_power_savings", sched_smt_power_savings);
int sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US;
integer_param("sched_ratelimit_us", sched_ratelimit_us);
+static enum {
+ SCHED_GRAN_thread,
+ SCHED_GRAN_core,
+ SCHED_GRAN_socket
+} opt_sched_granularity = SCHED_GRAN_thread;
+
+#ifdef CONFIG_X86
+static int __init sched_select_granularity(const char *str)
+{
+ if (strcmp("thread", str) == 0)
+ opt_sched_granularity = SCHED_GRAN_thread;
+ else if (strcmp("core", str) == 0)
+ opt_sched_granularity = SCHED_GRAN_core;
+ else if (strcmp("socket", str) == 0)
+ opt_sched_granularity = SCHED_GRAN_socket;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+custom_param("sched_granularity", sched_select_granularity);
+#endif
+
/* Number of vcpus per struct sched_item. */
unsigned int sched_granularity = 1;
-const cpumask_t *sched_res_mask = &cpumask_all;
+cpumask_var_t sched_res_mask;
/* Various timer handlers. */
static void s_timer_fn(void *unused);
@@ -68,6 +91,7 @@ static void poll_timer_fn(void *data);
/* This is global for now so that private implementations can reach it */
DEFINE_PER_CPU(struct scheduler *, scheduler);
DEFINE_PER_CPU(struct sched_resource *, sched_res);
+static DEFINE_PER_CPU(unsigned int, sched_res_idx);
/* Scratch space for cpumasks. */
DEFINE_PER_CPU(cpumask_t, cpumask_scratch);
@@ -82,6 +106,12 @@ static struct scheduler __read_mostly ops;
(( (opsptr)->fn != NULL ) ? (opsptr)->fn(opsptr, ##__VA_ARGS__ ) \
: (typeof((opsptr)->fn(opsptr, ##__VA_ARGS__)))0 )
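+/* Return the vcpu of @item to be run on @cpu. */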
+static inline struct vcpu *sched_item2vcpu_cpu(const struct sched_item *item,
+ unsigned int cpu)
+{
+ return item->domain->vcpu[item->item_id + per_cpu(sched_res_idx, cpu)];
+}
+
static inline struct scheduler *dom_scheduler(const struct domain *d)
{
if ( likely(d->cpupool != NULL) )
@@ -300,25 +330,10 @@ static void sched_spin_unlock_double(spinlock_t *lock1, spinlock_t *lock2,
spin_unlock_irqrestore(lock1, flags);
}
-static void sched_free_item(struct sched_item *item, struct vcpu *v)
+static void sched_free_item_mem(struct sched_item *item)
{
- struct sched_item *prev_item;
struct domain *d = item->domain;
- struct vcpu *vitem;
- unsigned int cnt = 0;
-
- /* Don't count to be released vcpu, might be not in vcpu list yet. */
- for_each_sched_item_vcpu ( item, vitem )
- if ( vitem != v )
- cnt++;
-
- v->sched_item = NULL;
-
- if ( cnt )
- return;
-
- if ( item->vcpu == v )
- item->vcpu = v->next_in_list;
+ struct sched_item *prev_item;
if ( d->sched_item_list == item )
d->sched_item_list = item->next_in_list;
@@ -342,6 +357,30 @@ static void sched_free_item(struct sched_item *item, struct vcpu *v)
xfree(item);
}
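+/* Remove @v from @item and free the item once its last vcpu is gone. */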
+static void sched_free_item(struct sched_item *item, struct vcpu *v)
+{
+ struct vcpu *vitem;
+ unsigned int cnt = 0;
+
+ /* Don't count the vcpu to be released; it might not be in the vcpu list yet. */
+ for_each_sched_item_vcpu ( item, vitem )
+ if ( vitem != v )
+ cnt++;
+
+ v->sched_item = NULL;
+
+ if ( item->vcpu == v )
+ item->vcpu = v->next_in_list;
+
+ if ( is_idle_domain(item->domain) )
+ item->run_cnt--;
+ else
+ item->idle_cnt--;
+
+ if ( !cnt )
+ sched_free_item_mem(item);
+}
+
static void sched_item_add_vcpu(struct sched_item *item, struct vcpu *v)
{
v->sched_item = item;
@@ -1847,7 +1886,7 @@ static void sched_slave(void)
pcpu_schedule_unlock_irq(lock, cpu);
- sched_context_switch(vprev, next->vcpu, now);
+ sched_context_switch(vprev, sched_item2vcpu_cpu(next, cpu), now);
}
/*
@@ -1906,7 +1945,7 @@ static void schedule(void)
pcpu_schedule_unlock_irq(lock, cpu);
- vnext = next->vcpu;
+ vnext = sched_item2vcpu_cpu(next, cpu);
sched_context_switch(vprev, vnext, now);
}
@@ -1964,8 +2003,14 @@ static int cpu_schedule_up(unsigned int cpu)
sd = xmalloc(struct sched_resource);
if ( sd == NULL )
return -ENOMEM;
+ if ( !zalloc_cpumask_var(&sd->cpus) )
+ {
+ xfree(sd);
+ return -ENOMEM;
+ }
+
sd->processor = cpu;
- sd->cpus = cpumask_of(cpu);
+ cpumask_copy(sd->cpus, cpumask_of(cpu));
per_cpu(sched_res, cpu) = sd;
per_cpu(scheduler, cpu) = &ops;
@@ -2025,6 +2070,12 @@ static void cpu_schedule_down(unsigned int cpu)
struct sched_resource *sd = per_cpu(sched_res, cpu);
struct scheduler *sched = per_cpu(scheduler, cpu);
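+ /* Free the scheduling resource only when its last cpu goes away. */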
+ cpumask_clear_cpu(cpu, sd->cpus);
+ per_cpu(sched_res, cpu) = NULL;
+
+ if ( cpumask_weight(sd->cpus) )
+ return;
+
SCHED_OP(sched, free_pdata, sd->sched_priv, cpu);
SCHED_OP(sched, free_vdata, idle_vcpu[cpu]->sched_item->priv);
@@ -2032,18 +2083,67 @@ static void cpu_schedule_down(unsigned int cpu)
sd->sched_priv = NULL;
kill_timer(&sd->s_timer);
+ free_cpumask_var(sd->cpus);
+ cpumask_clear_cpu(cpu, sched_res_mask);
- xfree(per_cpu(sched_res, cpu));
- per_cpu(sched_res, cpu) = NULL;
+ xfree(sd);
}
void scheduler_percpu_init(unsigned int cpu)
{
struct scheduler *sched = per_cpu(scheduler, cpu);
struct sched_resource *sd = per_cpu(sched_res, cpu);
+ const cpumask_t *mask;
+ unsigned int master_cpu;
+ spinlock_t *lock;
+ struct sched_item *old_item, *master_item;
+
+ if ( system_state == SYS_STATE_resume )
+ return;
+
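+ /* Select the cpus sharing a scheduling resource with this cpu. */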
+ switch ( opt_sched_granularity )
+ {
+ case SCHED_GRAN_thread:
+ mask = cpumask_of(cpu);
+ break;
+ case SCHED_GRAN_core:
+ mask = per_cpu(cpu_sibling_mask, cpu);
+ break;
+ case SCHED_GRAN_socket:
+ mask = per_cpu(cpu_core_mask, cpu);
+ break;
+ default:
+ ASSERT_UNREACHABLE();
+ return;
+ }
- if ( system_state != SYS_STATE_resume )
+ if ( cpu == 0 || cpumask_weight(mask) == 1 )
+ {
+ cpumask_set_cpu(cpu, sched_res_mask);
SCHED_OP(sched, init_pdata, sd->sched_priv, cpu);
+ return;
+ }
+
+ master_cpu = cpumask_first(mask);
+ master_item = idle_vcpu[master_cpu]->sched_item;
+ lock = pcpu_schedule_lock(master_cpu);
+
+ /* Merge this cpu's idle_vcpu item and sched_resource into the master cpu's. */
+ old_item = idle_vcpu[cpu]->sched_item;
+ idle_vcpu[cpu]->sched_item = master_item;
+ per_cpu(sched_res, cpu) = per_cpu(sched_res, master_cpu);
+ per_cpu(sched_res_idx, cpu) = cpumask_weight(per_cpu(sched_res, cpu)->cpus);
+ cpumask_set_cpu(cpu, per_cpu(sched_res, cpu)->cpus);
+ master_item->run_cnt += old_item->run_cnt;
+ master_item->idle_cnt += old_item->idle_cnt;
+
+ pcpu_schedule_unlock(lock, master_cpu);
+
+ SCHED_OP(sched, free_pdata, sd->sched_priv, cpu);
+ SCHED_OP(sched, free_vdata, old_item->priv);
+
+ xfree(sd);
+ sched_free_item_mem(old_item);
}
static int cpu_schedule_callback(
@@ -2123,6 +2223,51 @@ static struct notifier_block cpu_schedule_nfb = {
.notifier_call = cpu_schedule_callback
};
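+/*
+ * Return the number of cpus per scheduling resource for the chosen
+ * granularity, or 0 if the online cpus are not configured symmetrically.
+ */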
+static unsigned int __init sched_check_granularity(void)
+{
+ unsigned int cpu;
+ unsigned int siblings, gran = 0;
+
+ for_each_online_cpu( cpu )
+ {
+ switch ( opt_sched_granularity )
+ {
+ case SCHED_GRAN_thread:
+ /* If granularity is "thread" we are fine already. */
+ return 1;
+ case SCHED_GRAN_core:
+ siblings = cpumask_weight(per_cpu(cpu_sibling_mask, cpu));
+ break;
+ case SCHED_GRAN_socket:
+ siblings = cpumask_weight(per_cpu(cpu_core_mask, cpu));
+ break;
+ default:
+ ASSERT_UNREACHABLE();
+ return 0;
+ }
+
+ if ( gran == 0 )
+ gran = siblings;
+ else if ( gran != siblings )
+ return 0;
+ }
+
+ return gran;
+}
+
+/* Set up data for the selected scheduling granularity. */
+void __init scheduler_smp_init(void)
+{
+ unsigned int gran;
+
+ gran = sched_check_granularity();
+ if ( gran == 0 )
+ panic("Illegal cpu configuration for scheduling granularity!\n"
+ "Please use thread scheduling.\n");
+
+ sched_granularity = gran;
+}
+
/* Initialise the data structures. */
void __init scheduler_init(void)
{
@@ -2154,6 +2299,9 @@ void __init scheduler_init(void)
printk("Using '%s' (%s)\n", ops.name, ops.opt_name);
}
+ if ( !zalloc_cpumask_var(&sched_res_mask) )
+ BUG();
+
if ( cpu_schedule_up(0) )
BUG();
register_cpu_notifier(&cpu_schedule_nfb);
@@ -23,7 +23,7 @@ extern cpumask_t cpupool_free_cpus;
extern int sched_ratelimit_us;
/* Scheduling resource mask. */
-extern const cpumask_t *sched_res_mask;
+extern cpumask_var_t sched_res_mask;
/*
* In order to allow a scheduler to remap the lock->cpu mapping,
@@ -43,7 +43,7 @@ struct sched_resource {
struct timer s_timer; /* scheduling timer */
atomic_t urgent_count; /* how many urgent vcpus */
unsigned processor;
- const cpumask_t *cpus; /* cpus covered by this struct */
+ cpumask_var_t cpus; /* cpus covered by this struct */
};
#define curr_on_cpu(c) (per_cpu(sched_res, c)->curr)
@@ -636,6 +636,7 @@ void noreturn asm_domain_crash_synchronous(unsigned long addr);
void scheduler_init(void);
void scheduler_percpu_init(unsigned int cpu);
+void scheduler_smp_init(void);
int sched_init_vcpu(struct vcpu *v);
void sched_destroy_vcpu(struct vcpu *v);
int sched_init_domain(struct domain *d, int poolid);
Add a scheduling granularity enum ("thread", "core", "socket") for
specifying the scheduling granularity. Initially it is set to "thread";
it can be changed via the new boot parameter "sched_granularity" (x86
only).

sched_granularity is set according to the selected granularity once all
cpus are online. In case sched_granularity > 1 the sched items of the
idle vcpus and the sched resources of the physical cpus need to be
combined; this happens before the init_pdata hook of the active
scheduler is called.

A check is added that all sched resources hold the same number of cpus.
For now panic if this is not the case.

Signed-off-by: Juergen Gross <jgross@suse.com>
---
 xen/arch/x86/setup.c       |   2 +
 xen/common/schedule.c      | 196 +++++++++++++++++++++++++++++++++++++++------
 xen/include/xen/sched-if.h |   4 +-
 xen/include/xen/sched.h    |   1 +
 4 files changed, 177 insertions(+), 26 deletions(-)
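A usage sketch (illustration only, not part of the patch): on x86 the
granularity is selected on the Xen command line, e.g.

    sched_granularity=core

With "core" selected each sched_resource covers one core's sibling mask
and sched_granularity becomes the common number of threads per core;
unknown option values are rejected with -EINVAL by
sched_select_granularity(). If the online cpus do not all expose the
same number of siblings, sched_check_granularity() returns 0 and
scheduler_smp_init() panics, so such systems have to stay with the
default "thread" granularity for now.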