[RFC,2/3] kvm/x86: Add support for gtime halted

Message ID 20250218202618.567363-3-sieberf@amazon.com (mailing list archive)
State New
Series kvm,sched: Add gtime halted

Commit Message

Fernand Sieber Feb. 18, 2025, 8:26 p.m. UTC
The previous commit introduced the concept of guest time halted, to allow
the hypervisor to track real guest CPU activity (halted cycles) with
mwait/hlt/pause pass-through enabled.

This commit implements it for the x86 architecture. We track the number of
cycles actually executed by the guest by taking two reads of MSR_IA32_MPERF,
one before vcpu entry and the other after vcpu exit. These two reads happen
immediately before and after guest_timing_enter/exit_irqoff(), which are the
architecture-independent points for gtime accounting. The difference between
the two reads corresponds to the number of unhalted cycles. We obtain the
number of halted cycles by subtracting the unhalted cycles from the TSC delta
over the same window, tolerating slight approximations.
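
For reference, the accounting boils down to the arithmetic below. This is a
minimal sketch, not the patch itself: read_tsc(), read_mperf() and
cycles_to_ns() are hypothetical stand-ins for get_cycles(),
__rdmsr(MSR_IA32_MPERF) and the cycles2ns() helper added by this patch.

#include <stdint.h>

/* Hypothetical counter reads, standing in for the kernel helpers. */
extern uint64_t read_tsc(void);    /* TSC: counts all cycles, halted or not */
extern uint64_t read_mperf(void);  /* MPERF: counts only unhalted cycles */
extern uint64_t cycles_to_ns(uint64_t cycles);

uint64_t guest_halted_ns(void (*enter_guest)(void))
{
	uint64_t tsc0, mperf0, cycles, unhalted;

	tsc0 = read_tsc();
	mperf0 = read_mperf();

	enter_guest();	/* guest may hlt/mwait with pass-through enabled */

	cycles = read_tsc() - tsc0;
	unhalted = read_mperf() - mperf0;

	/*
	 * TSC and MPERF tick at the same invariant rate, so the gap
	 * between them over the window is the time spent halted.
	 * Tolerate slight skew between the two pairs of reads.
	 */
	return cycles > unhalted ? cycles_to_ns(cycles - unhalted) : 0;
}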
---
 arch/x86/include/asm/tsc.h |  1 +
 arch/x86/kernel/tsc.c      | 13 +++++++++++++
 arch/x86/kvm/x86.c         | 26 ++++++++++++++++++++++++++
 3 files changed, 40 insertions(+)

--
2.43.0


Patch

diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 94408a784c8e..00ad09e7268e 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -37,6 +37,7 @@  extern void mark_tsc_async_resets(char *reason);
 extern unsigned long native_calibrate_cpu_early(void);
 extern unsigned long native_calibrate_tsc(void);
 extern unsigned long long native_sched_clock_from_tsc(u64 tsc);
+extern unsigned long long cycles2ns(unsigned long long cycles);

 extern int tsc_clocksource_reliable;
 #ifdef CONFIG_X86_TSC
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 34dec0b72ea8..80bb12357148 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -144,6 +144,19 @@  static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc)
 	return ns;
 }

+unsigned long long cycles2ns(unsigned long long cyc)
+{
+	struct cyc2ns_data data;
+	unsigned long long ns;
+
+	cyc2ns_read_begin(&data);
+	ns = mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);
+	cyc2ns_read_end();
+
+	return ns;
+}
+EXPORT_SYMBOL(cycles2ns);
+
 static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
 {
 	unsigned long long ns_now;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 02159c967d29..46975b0a63a5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10688,6 +10688,19 @@  static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
 	kvm_x86_call(set_apic_access_page_addr)(vcpu);
 }

+static bool needs_halted_accounting(struct kvm_vcpu *vcpu)
+{
+	return (vcpu->kvm->arch.mwait_in_guest ||
+			vcpu->kvm->arch.hlt_in_guest ||
+			vcpu->kvm->arch.pause_in_guest) &&
+		boot_cpu_has(X86_FEATURE_APERFMPERF);
+}
+
+static unsigned long long get_unhalted_cycles(void)
+{
+	return __rdmsr(MSR_IA32_MPERF);
+}
+
 /*
  * Called within kvm->srcu read side.
  * Returns 1 to let vcpu_run() continue the guest execution loop without
@@ -10697,6 +10710,8 @@  static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 {
 	int r;
+	unsigned long long cycles, cycles_start = 0;
+	unsigned long long unhalted_cycles, unhalted_cycles_start = 0;
 	bool req_int_win =
 		dm_request_for_irq_injection(vcpu) &&
 		kvm_cpu_accept_dm_intr(vcpu);
@@ -10968,6 +10983,10 @@  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		set_debugreg(0, 7);
 	}

+	if (needs_halted_accounting(vcpu)) {
+		cycles_start = get_cycles();
+		unhalted_cycles_start = get_unhalted_cycles();
+	}
 	guest_timing_enter_irqoff();

 	for (;;) {
@@ -11060,6 +11079,13 @@  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	 * acceptable for all known use cases.
 	 */
 	guest_timing_exit_irqoff();
+	if (needs_halted_accounting(vcpu)) {
+		cycles = get_cycles() - cycles_start;
+		unhalted_cycles = get_unhalted_cycles() -
+			unhalted_cycles_start;
+		if (likely(cycles > unhalted_cycles))
+			current->gtime_halted += cycles2ns(cycles - unhalted_cycles);
+	}

 	local_irq_enable();
 	preempt_enable();