@@ -72,6 +72,7 @@ struct sched_domain_shared {
atomic_t ref;
atomic_t nr_busy_cpus;
int has_idle_cores;
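+	/* Any CPU in the domain running above its capacity margin? */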
+ int overutilized;
};
struct sched_domain {
@@ -5197,6 +5197,28 @@ static inline void hrtick_update(struct rq *rq)
}
#endif
+#ifdef CONFIG_SMP
+static inline int cpu_overutilized(int cpu);
+
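+/*
+ * The overutilized flag lives in sched_domain_shared so all CPUs spanned
+ * by the domain observe the same value; it is accessed locklessly via
+ * READ_ONCE()/WRITE_ONCE().
+ */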
+static inline int sd_overutilized(struct sched_domain *sd)
+{
+ return READ_ONCE(sd->shared->overutilized);
+}
+
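+/*
+ * Set the overutilized flag on the base domain of @rq's CPU if the CPU no
+ * longer has spare capacity for its current utilization. The flag is only
+ * set here; it is cleared from the load-balance path once
+ * update_sd_lb_stats() sees no overutilized CPU in the domain.
+ */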
+static inline void update_overutilized_status(struct rq *rq)
+{
+ struct sched_domain *sd;
+
+ rcu_read_lock();
+ sd = rcu_dereference(rq->sd);
+ if (sd && !sd_overutilized(sd) && cpu_overutilized(rq->cpu))
+ WRITE_ONCE(sd->shared->overutilized, 1);
+ rcu_read_unlock();
+}
+#else
+static inline void update_overutilized_status(struct rq *rq) {}
+#endif /* CONFIG_SMP */
+
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
@@ -5246,8 +5268,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_cfs_group(se);
}
- if (!se)
+ if (!se) {
add_nr_running(rq, 1);
+ update_overutilized_status(rq);
+ }
hrtick_update(rq);
}
@@ -6379,6 +6403,11 @@ static inline int util_fits_capacity(unsigned long util, unsigned long capacity)
return capacity * 1024 > util * capacity_margin;
}
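+/*
+ * A CPU is overutilized when its utilization no longer fits its capacity
+ * with margin: with capacity_margin at its default of 1280, that is util
+ * above ~80% of capacity (e.g. util > 819 on a 1024-capacity CPU).
+ */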
+static inline int cpu_overutilized(int cpu)
+{
+ return !util_fits_capacity(cpu_util(cpu), capacity_of(cpu));
+}
+
/*
* Disable WAKE_AFFINE in the case where task @p doesn't fit in the
* capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
@@ -7617,6 +7646,7 @@ struct sd_lb_stats {
unsigned long total_running;
unsigned long total_load; /* Total load of all groups in sd */
unsigned long total_capacity; /* Total capacity of all groups in sd */
+ unsigned long total_util; /* Total util of all groups in sd */
unsigned long avg_load; /* Average load across all groups in sd */
struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
@@ -7637,6 +7667,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
.total_running = 0UL,
.total_load = 0UL,
.total_capacity = 0UL,
+ .total_util = 0UL,
.busiest_stat = {
.avg_load = 0UL,
.sum_nr_running = 0,
@@ -7933,11 +7964,12 @@ static bool update_nohz_stats(struct rq *rq, bool force)
* @local_group: Does group contain this_cpu.
* @sgs: variable to hold the statistics for this group.
* @overload: Indicate more than one runnable task for any CPU.
+ * @overutilized: Indicate overutilization for any CPU.
*/
static inline void update_sg_lb_stats(struct lb_env *env,
struct sched_group *group, int load_idx,
int local_group, struct sg_lb_stats *sgs,
- bool *overload)
+ bool *overload, int *overutilized)
{
unsigned long load;
int i, nr_running;
@@ -7974,6 +8006,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
*/
if (!nr_running && idle_cpu(i))
sgs->idle_cpus++;
+
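+		/* Note if any CPU in the group runs above its capacity margin. */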
+ if (cpu_overutilized(i))
+ *overutilized = 1;
}
/* Adjust by relative CPU capacity of the group */
@@ -8101,6 +8136,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sg_lb_stats tmp_sgs;
int load_idx, prefer_sibling = 0;
bool overload = false;
+ int overutilized = 0;
if (child && child->flags & SD_PREFER_SIBLING)
prefer_sibling = 1;
@@ -8127,7 +8163,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
}
update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
- &overload);
+ &overload, &overutilized);
if (local_group)
goto next_group;
@@ -8159,6 +8195,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
sds->total_running += sgs->sum_nr_running;
sds->total_load += sgs->group_load;
sds->total_capacity += sgs->group_capacity;
+ sds->total_util += sgs->group_util;
sg = sg->next;
} while (sg != env->sd->groups);
@@ -8180,6 +8217,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
if (env->dst_rq->rd->overload != overload)
env->dst_rq->rd->overload = overload;
}
+
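+	/*
+	 * Propagate the freshly computed overutilized state to the domain;
+	 * this is also the only place the flag gets cleared again.
+	 */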
+ if (READ_ONCE(env->sd->shared->overutilized) != overutilized)
+ WRITE_ONCE(env->sd->shared->overutilized, overutilized);
+
+ /*
+	 * If the domain utilization is greater than the domain capacity, load
+	 * balancing needs to be done at the next sched domain level as well.
+ */
+ if (env->sd->parent &&
+ !util_fits_capacity(sds->total_util, sds->total_capacity))
+ WRITE_ONCE(env->sd->parent->shared->overutilized, 1);
}
/**
@@ -9055,6 +9103,10 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
}
max_cost += sd->max_newidle_lb_cost;
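+		/*
+		 * Under energy-aware scheduling, skip periodic load balancing
+		 * on this domain until it becomes overutilized; below the
+		 * tipping point, placement is left to the wakeup path.
+		 */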
+ if (static_branch_unlikely(&sched_energy_present) &&
+ !sd_overutilized(sd))
+ continue;
+
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
@@ -9622,6 +9674,10 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
break;
}
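+		/*
+		 * Likewise, under energy-aware scheduling, defer newidle
+		 * balancing until the domain is overutilized.
+		 */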
+ if (static_branch_unlikely(&sched_energy_present) &&
+ !sd_overutilized(sd))
+ continue;
+
if (sd->flags & SD_BALANCE_NEWIDLE) {
t0 = sched_clock_cpu(this_cpu);
@@ -9755,6 +9811,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
+
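+	/*
+	 * Utilization can keep growing while a task runs without being
+	 * enqueued again, so re-check overutilization on every tick.
+	 */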
+ update_overutilized_status(rq);
}
/*
@@ -11,6 +11,7 @@
#include <linux/sched/cputime.h>
#include <linux/sched/deadline.h>
#include <linux/sched/debug.h>
+#include <linux/sched/energy.h>
#include <linux/sched/hotplug.h>
#include <linux/sched/idle.h>
#include <linux/sched/init.h>
@@ -1184,15 +1184,11 @@ sd_init(struct sched_domain_topology_level *tl,
sd->idle_idx = 1;
}
- /*
- * For all levels sharing cache; connect a sched_domain_shared
- * instance.
- */
- if (sd->flags & SD_SHARE_PKG_RESOURCES) {
- sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
- atomic_inc(&sd->shared->ref);
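+	/*
+	 * Connect a sched_domain_shared instance at every level now that the
+	 * overutilized flag lives there; nr_busy_cpus still only applies to
+	 * cache-sharing levels.
+	 */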
+ sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
+ atomic_inc(&sd->shared->ref);
+
+ if (sd->flags & SD_SHARE_PKG_RESOURCES)
atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
- }
sd->private = sdd;