
[4/4] KVM guest implementation

Message ID 20210831015919.13006-4-skyele@sjtu.edu.cn (mailing list archive)
State New, archived
Series [1/4] KVM: x86: Introduce .pcpu_is_idle() stub infrastructure

Commit Message

Tianqiang Xu Aug. 31, 2021, 1:59 a.m. UTC
The guest OS uses the 'is_idle' field of kvm_steal_time to know whether
a pCPU is idle and to decide whether to schedule a task onto a preempted
vCPU. If the pCPU is idle, scheduling a task onto it improves CPU
utilization. If not, avoiding that preempted vCPU saves a host/guest
switch and thus improves performance.

The guest OS invokes available_idle_cpu_sched() to read the 'is_idle'
field of kvm_steal_time.

Callers of available_idle_cpu() outside kernel/sched/fair.c are left
unchanged, because they rely on the semantics provided by the 'preempted'
field of kvm_steal_time.
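
For context, here is a minimal sketch of what the guest-side helper could
look like. It is not taken from this patch: the real helper and the
'is_idle' plumbing are introduced in the earlier patches of this series,
and the pv_pcpu_is_idle() accessor and the per-CPU steal_time variable
below are illustrative assumptions only.

#include <linux/sched.h>	/* idle_cpu(), vcpu_is_preempted() */
#include <linux/percpu.h>	/* per_cpu() */
#include <asm/kvm_para.h>	/* struct kvm_steal_time */

/* Hypothetical accessor for the host-written 'is_idle' hint. */
static bool pv_pcpu_is_idle(int cpu)
{
	struct kvm_steal_time *st = &per_cpu(steal_time, cpu);

	return READ_ONCE(st->is_idle);
}

int available_idle_cpu_sched(int cpu)
{
	if (!idle_cpu(cpu))
		return 0;

	/*
	 * Unlike available_idle_cpu(), a preempted vCPU is still treated
	 * as a usable target when the pCPU backing it is idle, since the
	 * host can run the vCPU immediately.
	 */
	if (vcpu_is_preempted(cpu) && !pv_pcpu_is_idle(cpu))
		return 0;

	return 1;
}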

--
Authors: Tianqiang Xu, Dingji Li, Zeyu Mi
	 Shanghai Jiao Tong University

Signed-off-by: Tianqiang Xu <skyele@sjtu.edu.cn>

---
 kernel/sched/fair.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

Comments

Peter Zijlstra Aug. 31, 2021, 7:21 a.m. UTC | #1
On Tue, Aug 31, 2021 at 09:59:19AM +0800, Tianqiang Xu wrote:
> The guest OS uses the 'is_idle' field of kvm_steal_time to know whether
> a pCPU is idle and to decide whether to schedule a task onto a preempted
> vCPU. If the pCPU is idle, scheduling a task onto it improves CPU
> utilization. If not, avoiding that preempted vCPU saves a host/guest
> switch and thus improves performance.
> 
> The guest OS invokes available_idle_cpu_sched() to read the 'is_idle'
> field of kvm_steal_time.
> 
> Callers of available_idle_cpu() outside kernel/sched/fair.c are left
> unchanged, because they rely on the semantics provided by the 'preempted'
> field of kvm_steal_time.

> ---
>  kernel/sched/fair.c | 24 ++++++++++++------------
>  1 file changed, 12 insertions(+), 12 deletions(-)

Goes and replaces every single available_idle_cpu() in fair with the new
function that doesn't consider vCPU preemption.

So what do you reckon now happens in the oversubscribed virt scenario
where each CPU has multiple vCPUs?

Patch

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 44c452072a1b..f69f0a8d2abe 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5871,13 +5871,13 @@  wake_affine_idle(int this_cpu, int prev_cpu, int sync)
 	 * a cpufreq perspective, it's better to have higher utilisation
 	 * on one CPU.
 	 */
-	if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
-		return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
+	if (available_idle_cpu_sched(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
+		return available_idle_cpu_sched(prev_cpu) ? prev_cpu : this_cpu;
 
 	if (sync && cpu_rq(this_cpu)->nr_running == 1)
 		return this_cpu;
 
-	if (available_idle_cpu(prev_cpu))
+	if (available_idle_cpu_sched(prev_cpu))
 		return prev_cpu;
 
 	return nr_cpumask_bits;
@@ -5976,7 +5976,7 @@  find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
 		if (sched_idle_cpu(i))
 			return i;
 
-		if (available_idle_cpu(i)) {
+		if (available_idle_cpu_sched(i)) {
 			struct cpuidle_state *idle = idle_get_state(rq);
 			if (idle && idle->exit_latency < min_exit_latency) {
 				/*
@@ -6064,7 +6064,7 @@  static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
 
 static inline int __select_idle_cpu(int cpu, struct task_struct *p)
 {
-	if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
+	if ((available_idle_cpu_sched(cpu) || sched_idle_cpu(cpu)) &&
 	    sched_cpu_cookie_match(cpu_rq(cpu), p))
 		return cpu;
 
@@ -6115,7 +6115,7 @@  void __update_idle_core(struct rq *rq)
 		if (cpu == core)
 			continue;
 
-		if (!available_idle_cpu(cpu))
+		if (!available_idle_cpu_sched(cpu))
 			goto unlock;
 	}
 
@@ -6138,7 +6138,7 @@  static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
 		return __select_idle_cpu(core, p);
 
 	for_each_cpu(cpu, cpu_smt_mask(core)) {
-		if (!available_idle_cpu(cpu)) {
+		if (!available_idle_cpu_sched(cpu)) {
 			idle = false;
 			if (*idle_cpu == -1) {
 				if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->cpus_ptr)) {
@@ -6171,7 +6171,7 @@  static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
 		if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
 		    !cpumask_test_cpu(cpu, sched_domain_span(sd)))
 			continue;
-		if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
+		if (available_idle_cpu_sched(cpu) || sched_idle_cpu(cpu))
 			return cpu;
 	}
 
@@ -6302,7 +6302,7 @@  select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 	for_each_cpu_wrap(cpu, cpus, target) {
 		unsigned long cpu_cap = capacity_of(cpu);
 
-		if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
+		if (!available_idle_cpu_sched(cpu) && !sched_idle_cpu(cpu))
 			continue;
 		if (fits_capacity(task_util, cpu_cap))
 			return cpu;
@@ -6348,7 +6348,7 @@  static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	 */
 	lockdep_assert_irqs_disabled();
 
-	if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
+	if ((available_idle_cpu_sched(target) || sched_idle_cpu(target)) &&
 	    asym_fits_capacity(task_util, target))
 		return target;
 
@@ -6356,7 +6356,7 @@  static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	 * If the previous CPU is cache affine and idle, don't be stupid:
 	 */
 	if (prev != target && cpus_share_cache(prev, target) &&
-	    (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
+	    (available_idle_cpu_sched(prev) || sched_idle_cpu(prev)) &&
 	    asym_fits_capacity(task_util, prev))
 		return prev;
 
@@ -6379,7 +6379,7 @@  static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	if (recent_used_cpu != prev &&
 	    recent_used_cpu != target &&
 	    cpus_share_cache(recent_used_cpu, target) &&
-	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
+	    (available_idle_cpu_sched(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
 	    cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
 	    asym_fits_capacity(task_util, recent_used_cpu)) {
 		/*