@@ -870,6 +870,7 @@ enum cpu_idle_type {
#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */
#define SD_NUMA 0x4000 /* cross-node balancing */
+#define SD_WORKLOAD_CONSOLIDATION 0x8000 /* consolidate CPU workload */
#ifdef CONFIG_SCHED_SMT
static inline const int cpu_smt_flags(void)
@@ -881,7 +882,7 @@ static inline const int cpu_smt_flags(void)
#ifdef CONFIG_SCHED_MC
static inline const int cpu_core_flags(void)
{
- return SD_SHARE_PKG_RESOURCES;
+ return SD_SHARE_PKG_RESOURCES | SD_WORKLOAD_CONSOLIDATION;
}
#endif
@@ -973,6 +974,11 @@ struct sched_domain {
struct rcu_head rcu; /* used during destruction */
};
+ unsigned int total_groups; /* total number of groups in this domain */
+ unsigned int group_number; /* this CPU's group sequence number */
+ unsigned int consolidating_coeff; /* consolidating coefficient */
+ struct sched_group *first_group; /* ordered by CPU number */
+
unsigned int span_weight;
/*
* Span of all CPUs in this domain.
@@ -4941,7 +4941,7 @@ set_table_entry(struct ctl_table *entry,
static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
- struct ctl_table *table = sd_alloc_ctl_entry(14);
+ struct ctl_table *table = sd_alloc_ctl_entry(15);
if (table == NULL)
return NULL;
@@ -4974,7 +4974,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[12], "name", sd->name,
CORENAME_MAX_SIZE, 0444, proc_dostring, false);
- /* &table[13] is terminator */
+ set_table_entry(&table[13], "consolidating_coeff", &sd->consolidating_coeff,
+ sizeof(int), 0644, proc_dointvec, false);
+ /* &table[14] is terminator */
return table;
}
@@ -5586,7 +5588,7 @@ static void update_top_cache_domain(int cpu)
int id = cpu;
int size = 1;
- sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
+ sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES, 1);
if (sd) {
id = cpumask_first(sched_domain_span(sd));
size = cpumask_weight(sched_domain_span(sd));
@@ -5601,10 +5603,41 @@ static void update_top_cache_domain(int cpu)
sd = lowest_flag_domain(cpu, SD_NUMA);
rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
- sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
+ sd = highest_flag_domain(cpu, SD_ASYM_PACKING, 1);
rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
}
+
+DEFINE_PER_CPU(struct sched_domain *, sd_wc);
+
+static void update_wc_domain(struct sched_domain *sd, int cpu)
+{
+ while (sd) {
+ int i = 0, j = 0, first, min = INT_MAX;
+ struct sched_group *group;
+
+ group = sd->groups;
+ first = group_first_cpu(group);
+ do {
+ int k = group_first_cpu(group);
+ i += 1;
+ if (k < first)
+ j += 1;
+ if (k < min) {
+ sd->first_group = group;
+ min = k;
+ }
+ } while (group = group->next, group != sd->groups);
+
+ sd->total_groups = i;
+ sd->group_number = j;
+ sd = sd->parent;
+ }
+
+ sd = highest_flag_domain(cpu, SD_WORKLOAD_CONSOLIDATION, 0);
+ rcu_assign_pointer(per_cpu(sd_wc, cpu), sd);
+}
+
/*
* Attach the domain 'sd' to 'cpu' as its base domain. Callers must
* hold the hotplug lock.
@@ -5653,6 +5686,8 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
destroy_sched_domains(tmp, cpu);
update_top_cache_domain(cpu);
+
+ update_wc_domain(sd, cpu);
}
/* cpus with isolated domains */
@@ -6069,6 +6104,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
#ifdef CONFIG_SCHED_DEBUG
.name = tl->name,
#endif
+ .consolidating_coeff = 0,
};
/*
@@ -6098,6 +6134,8 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
}
#endif
+ } else if (sd->flags & SD_WORKLOAD_CONSOLIDATION) {
+ sd->consolidating_coeff = 160;
} else {
sd->flags |= SD_PREFER_SIBLING;
sd->cache_nice_tries = 1;
@@ -695,16 +695,22 @@ extern void sched_ttwu_pending(void);
* be returned.
* @flag: The flag to check for the highest sched_domain
* for the given cpu.
+ * @all: the flag must be contained by all sched_domains from the highest down
*
* Returns the highest sched_domain of a cpu which contains the given flag.
*/
-static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
+static inline struct sched_domain *
+highest_flag_domain(int cpu, int flag, int all)
{
struct sched_domain *sd, *hsd = NULL;
for_each_domain(cpu, sd) {
- if (!(sd->flags & flag))
- break;
+ if (!(sd->flags & flag)) {
+ if (all)
+ break;
+ else
+ continue;
+ }
hsd = sd;
}
@@ -729,6 +735,7 @@ DECLARE_PER_CPU(int, sd_llc_id);
DECLARE_PER_CPU(struct sched_domain *, sd_numa);
DECLARE_PER_CPU(struct sched_domain *, sd_busy);
DECLARE_PER_CPU(struct sched_domain *, sd_asym);
+DECLARE_PER_CPU(struct sched_domain *, sd_wc);
struct sched_group_capacity {
atomic_t ref;
Workload Consolidation is completely CPU topology and policy driven. To support it, we define SD_WORKLOAD_CONSOLIDATION and add several fields to struct sched_domain:

1) total_groups is the total number of groups in this domain
2) group_number is this CPU's group sequence number
3) consolidating_coeff is the coefficient for consolidating CPUs; it can be changed via sysctl to make consolidation more or less aggressive
4) first_group is a pointer to this domain's first group, ordered by CPU number

This patchset enables SD_WORKLOAD_CONSOLIDATION in the MC domain by default, but we still need a better way to determine on which architectures this flag should or should not be enabled. Thanks to PeterZ and Dietmar for pointing this out and helping me finally understand it.

Signed-off-by: Yuyang Du <yuyang.du@intel.com>
---
 include/linux/sched.h |    8 +++++++-
 kernel/sched/core.c   |   46 ++++++++++++++++++++++++++++++++++++++++++----
 kernel/sched/sched.h  |   13 ++++++++++---
 3 files changed, 59 insertions(+), 8 deletions(-)
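
For illustration only, and not part of this patch: a minimal sketch of how the new per-domain fields could be consumed together by a consolidation policy, assuming consolidating_coeff is read as a percentage-style threshold (under that assumption the default of 160 would mean "keep enough groups that the domain's load fits within 160% of one group's capacity") and assuming the kept groups are taken in CPU order starting from first_group. The helper name, its parameters, and the formula are hypothetical; the actual policy is introduced by later patches in this series.

/*
 * Hypothetical sketch, not from the patchset: collect the CPUs of the
 * first "nr" groups of a workload-consolidation domain, where "nr" is
 * derived from consolidating_coeff treated as a percentage threshold.
 * Would sit next to the other sched_domain consumers in kernel/sched/.
 */
static void wc_pick_consolidated_cpus(struct sched_domain *sd,
				      unsigned long domain_load,
				      unsigned long group_capacity,
				      struct cpumask *consolidated)
{
	struct sched_group *sg = sd->first_group;
	unsigned int i, nr = sd->total_groups;

	if (sd->consolidating_coeff && group_capacity) {
		/* groups needed so the load fits under coeff% of their capacity */
		nr = DIV_ROUND_UP(domain_load * 100,
				  group_capacity * sd->consolidating_coeff);
		nr = clamp(nr, 1U, sd->total_groups);
	}

	cpumask_clear(consolidated);
	/* walk groups in CPU order, starting from the lowest-numbered group */
	for (i = 0; i < nr; i++, sg = sg->next)
		cpumask_or(consolidated, consolidated, sched_group_cpus(sg));
}

On configurations that build the sched_domain sysctl tree, the entry added to sd_alloc_ctl_domain_table() above exposes the coefficient next to the existing per-domain knobs (under /proc/sys/kernel/sched_domain/cpu*/domain*/), which is the "changeable via sysctl" tuning point mentioned in the changelog.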