
[TSC trapping / migration 1/2] Add TSC trapping for SVM and VMX

Message ID 1294308645-31113-2-git-send-email-zamsden@redhat.com (mailing list archive)
State New, archived

Commit Message

Zachary Amsden Jan. 6, 2011, 10:10 a.m. UTC

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ff651b7..6cce67a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -452,6 +452,8 @@  struct kvm_arch {
 	u32 virtual_tsc_khz;
 	u32 virtual_tsc_mult;
 	s8 virtual_tsc_shift;
+	bool tsc_trapping;
+	u32 tsc_flags;
 
 	struct kvm_xen_hvm_config xen_hvm_config;
 
@@ -575,6 +577,8 @@  struct kvm_x86_ops {
 	int (*get_lpage_level)(void);
 	bool (*rdtscp_supported)(void);
 	void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment);
+	void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
+	void (*set_tsc_trapping)(struct kvm_vcpu *vcpu, bool trap);
 
 	void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
 
@@ -582,8 +586,6 @@  struct kvm_x86_ops {
 
 	bool (*has_wbinvd_exit)(void);
 
-	void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
-
 	const struct trace_print_flags *exit_reasons_str;
 };
 
diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h
index 35f2d19..315ead5 100644
--- a/arch/x86/include/asm/pvclock-abi.h
+++ b/arch/x86/include/asm/pvclock-abi.h
@@ -40,5 +40,6 @@  struct pvclock_wall_clock {
 } __attribute__((__packed__));
 
 #define PVCLOCK_TSC_STABLE_BIT	(1 << 0)
+#define PVCLOCK_TSC_TRAPPED_BIT (1 << 1)
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_X86_PVCLOCK_ABI_H */
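
The new pvclock flag advertises trapping to the guest, so a paravirtual guest can prefer kvmclock over raw RDTSC when every RDTSC costs a VM exit. A minimal guest-side sketch, assuming the usual pvclock_vcpu_time_info mapping (illustrative, not part of the patch):

#include <asm/pvclock-abi.h>

/* True when the host traps RDTSC for this VCPU; the bit is set in
 * update_pvclock() below whenever kvm->arch.tsc_trapping is enabled. */
static bool guest_tsc_is_trapped(struct pvclock_vcpu_time_info *src)
{
	return src->flags & PVCLOCK_TSC_TRAPPED_BIT;
}
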
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index c929d00..af48be9 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -806,6 +806,8 @@  static void init_vmcb(struct vcpu_svm *svm)
 				(1ULL << INTERCEPT_MONITOR) |
 				(1ULL << INTERCEPT_MWAIT);
 
+	kvm_setup_tsc_trapping(&svm->vcpu);
+
 	control->iopm_base_pa = iopm_base;
 	control->msrpm_base_pa = __pa(svm->msrpm);
 	control->int_ctl = V_INTR_MASKING_MASK;
@@ -1038,6 +1040,15 @@  static void svm_clear_vintr(struct vcpu_svm *svm)
 	svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
 }
 
+static void svm_set_tsc_trapping(struct kvm_vcpu *vcpu, bool trap)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	if (trap)
+		svm->vmcb->control.intercept |= 1ULL << INTERCEPT_RDTSC;
+	else
+		svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_RDTSC);
+}
+
 static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
 {
 	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
@@ -2497,6 +2508,13 @@  static int task_switch_interception(struct vcpu_svm *svm)
 	return 1;
 }
 
+static int rdtsc_interception(struct vcpu_svm *svm)
+{
+	svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
+	kvm_read_tsc(&svm->vcpu);
+	return 1;
+}
+
 static int cpuid_interception(struct vcpu_svm *svm)
 {
 	svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
@@ -2833,6 +2851,7 @@  static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
 	[SVM_EXIT_SMI]				= nop_on_interception,
 	[SVM_EXIT_INIT]				= nop_on_interception,
 	[SVM_EXIT_VINTR]			= interrupt_window_interception,
+	[SVM_EXIT_RDTSC]			= rdtsc_interception,
 	[SVM_EXIT_CPUID]			= cpuid_interception,
 	[SVM_EXIT_IRET]                         = iret_interception,
 	[SVM_EXIT_INVD]                         = emulate_on_interception,
@@ -3676,6 +3695,7 @@  static struct kvm_x86_ops svm_x86_ops = {
 
 	.write_tsc_offset = svm_write_tsc_offset,
 	.adjust_tsc_offset = svm_adjust_tsc_offset,
+	.set_tsc_trapping = svm_set_tsc_trapping,
 
 	.set_tdp_cr3 = set_tdp_cr3,
 };
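
On SVM the RDTSC opcode is two bytes, so rdtsc_interception() sets next_rip to rip + 2 before kvm_read_tsc() fills in the result and skips the instruction. From the guest's point of view nothing changes except the value it reads; a standard guest-side helper, shown only for illustration:

/* With trapping enabled the guest still executes a plain RDTSC but
 * receives the synthesized virtual TSC from compute_guest_tsc()
 * in EDX:EAX rather than the hardware counter. */
static inline u64 guest_rdtsc(void)
{
	u32 lo, hi;

	asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
	return ((u64)hi << 32) | lo;
}
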
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 28c72da..3516d18 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2631,6 +2631,7 @@  static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
 
 	kvm_write_tsc(&vmx->vcpu, 0);
+	kvm_setup_tsc_trapping(&vmx->vcpu);
 
 	return 0;
 }
@@ -2770,6 +2771,18 @@  out:
 	return ret;
 }
 
+static void vmx_set_tsc_trapping(struct kvm_vcpu *vcpu, bool trap)
+{
+	u32 cpu_based_vm_exec_control;
+
+	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+	if (trap)
+		cpu_based_vm_exec_control |= CPU_BASED_RDTSC_EXITING;
+	else
+		cpu_based_vm_exec_control &= ~CPU_BASED_RDTSC_EXITING;
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+}
+
 static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
 	u32 cpu_based_vm_exec_control;
@@ -3359,6 +3372,12 @@  static int handle_invlpg(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static int handle_rdtsc(struct kvm_vcpu *vcpu)
+{
+	kvm_read_tsc(vcpu);
+	return 1;
+}
+
 static int handle_wbinvd(struct kvm_vcpu *vcpu)
 {
 	skip_emulated_instruction(vcpu);
@@ -3651,6 +3670,7 @@  static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
 	[EXIT_REASON_HLT]                     = handle_halt,
 	[EXIT_REASON_INVLPG]		      = handle_invlpg,
+	[EXIT_REASON_RDTSC]		      = handle_rdtsc,
 	[EXIT_REASON_VMCALL]                  = handle_vmcall,
 	[EXIT_REASON_VMCLEAR]	              = handle_vmx_insn,
 	[EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
@@ -4339,6 +4359,7 @@  static struct kvm_x86_ops vmx_x86_ops = {
 
 	.write_tsc_offset = vmx_write_tsc_offset,
 	.adjust_tsc_offset = vmx_adjust_tsc_offset,
+	.set_tsc_trapping = vmx_set_tsc_trapping,
 
 	.set_tdp_cr3 = vmx_set_cr3,
 };
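
Unlike the SVM handler, handle_rdtsc() does no RIP bookkeeping: on VMX, the skip_emulated_instruction() call made inside kvm_read_tsc() advances RIP by the hardware-reported exit instruction length. A simplified sketch of that existing VMX path, for contrast with SVM's explicit next_rip = rip + 2:

/* Simplified from vmx.c's skip_emulated_instruction(); the VMCS
 * records the length of the instruction that caused the exit. */
static void vmx_skip_rdtsc(struct kvm_vcpu *vcpu)
{
	unsigned long rip = kvm_rip_read(vcpu);

	rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
	kvm_rip_write(vcpu, rip);
}
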
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a339e50..bbcd582 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -95,6 +95,12 @@  static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
 struct kvm_x86_ops *kvm_x86_ops;
 EXPORT_SYMBOL_GPL(kvm_x86_ops);
 
+static int __read_mostly tsc_trap = 1;
+module_param(tsc_trap, int, S_IRUGO);
+
+static bool __read_mostly tsc_auto = 1;
+module_param(tsc_auto, bool, S_IRUGO);
+
 int ignore_msrs = 0;
 module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
 
@@ -1058,6 +1064,8 @@  static void update_pvclock(struct kvm_vcpu *v,
 	pvclock->tsc_timestamp = tsc_timestamp;
 	pvclock->system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
 	pvclock->flags = 0;
+	if (v->kvm->arch.tsc_trapping)
+		pvclock->flags |= PVCLOCK_TSC_TRAPPED_BIT;
 }
 
 static void update_user_kvmclock(struct kvm_vcpu *v,
@@ -1072,6 +1080,18 @@  static void update_user_kvmclock(struct kvm_vcpu *v,
 	mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
 }
 
+void kvm_read_tsc(struct kvm_vcpu *vcpu)
+{
+	u64 tsc;
+	s64 kernel_ns = get_kernel_ns();
+
+	tsc = compute_guest_tsc(vcpu, kernel_ns);
+	kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)tsc);
+	kvm_register_write(vcpu, VCPU_REGS_RDX, tsc >> 32);
+	kvm_x86_ops->skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_read_tsc);
+
 static int kvm_guest_time_update(struct kvm_vcpu *v)
 {
 	unsigned long flags;
@@ -1198,6 +1218,55 @@  static int kvm_guest_time_update(struct kvm_vcpu *v)
 	return 0;
 }
 
+void kvm_setup_tsc_trapping(struct kvm_vcpu *vcpu)
+{
+	struct kvm_arch *arch = &vcpu->kvm->arch;
+	int trap;
+	bool tsc_underrun, tsc_overrun;
+
+	/*
+	 * First, establish rate differences outside NTP correction boundary.
+	 * N.B. - virtual_tsc_khz may not yet be known, in which case it is
+	 * assumed the host rate will be used; guard against this in overrun.
+	 */
+	u64 max_tsc_ull = max_tsc_khz * 1000000ULL;
+	tsc_overrun = (arch->virtual_tsc_khz &&
+		       arch->virtual_tsc_khz * 1000500ULL < max_tsc_ull);
+	tsc_underrun = (arch->virtual_tsc_khz * 999500ULL > max_tsc_ull);
+
+	/*
+	 * We must trap if we have unstable TSC and a hint from userspace that
+	 * SMP is required; also, if we want a fixed rate and the max TSC rate
+	 * exceeds the VM rate by over 500 ppm (the maximum NTP slew rate).
+	 */
+	trap =
+	  (check_tsc_unstable() &&
+	      (arch->tsc_flags & KVM_TSC_FLAG_SMP_COHERENCY)) ||
+	  ((arch->tsc_flags & KVM_TSC_FLAG_FIXED_RATE) &&
+	      (tsc_overrun || tsc_underrun));
+
+	/*
+	 * Auto-selection: if we have no guidance from userspace, we can't
+	 * know if VCPUs will be added, so assume SMP, as it is difficult to
+	 * switch other CPUs into trapping mode after they have started
+	 */
+	if (tsc_auto)
+		trap |= (tsc_overrun || check_tsc_unstable());
+
+	/* tsc_trap (module parameter) overrides explicit choice */
+	if (tsc_trap != 0)
+		trap = (tsc_trap > 0);
+
+	/* Correct untrapped underrun with catchup */
+	if (!trap && tsc_underrun)
+		vcpu->arch.tsc_catchup = 1;
+
+	vcpu->kvm->arch.tsc_trapping = trap;
+	kvm_x86_ops->set_tsc_trapping(vcpu, trap);
+	pr_debug("kvm: set trap mode %d on vcpu %d\n", trap, vcpu->vcpu_id);
+}
+EXPORT_SYMBOL_GPL(kvm_setup_tsc_trapping);
+
 static bool msr_mtrr_valid(unsigned msr)
 {
 	switch (msr) {
@@ -1962,6 +2031,7 @@  int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_DEBUGREGS:
 	case KVM_CAP_X86_ROBUST_SINGLESTEP:
 	case KVM_CAP_XSAVE:
+	case KVM_CAP_TSC_CONTROL:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
@@ -3535,7 +3605,30 @@  long kvm_arch_vm_ioctl(struct file *filp,
 		r = 0;
 		break;
 	}
+	case KVM_TSC_CONTROL: {
+		struct kvm_tsc_control user_tsc;
+
+		r = -EFAULT;
+		if (copy_from_user(&user_tsc, argp, sizeof(user_tsc)))
+			goto out;
+
+		r = -EINVAL;
+		if (user_tsc.flags &
+		    ~(KVM_TSC_FLAG_FIXED_RATE |
+		      KVM_TSC_FLAG_SMP_COHERENCY))
+			goto out;
 
+		if (user_tsc.tsc_khz &&
+			(user_tsc.tsc_khz > KVM_TSC_MAX_KHZ ||
+			 user_tsc.tsc_khz < KVM_TSC_MIN_KHZ))
+			goto out;
+
+		if (user_tsc.tsc_khz)
+			kvm_arch_set_tsc_khz(kvm, user_tsc.tsc_khz);
+
+		r = 0;
+		break;
+	}
 	default:
 		;
 	}
@@ -5222,7 +5315,14 @@  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (hw_breakpoint_active())
 		hw_breakpoint_restore();
 
-	kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
+	/*
+	 * We only need to record this for unstable, passthrough TSC.
+	 * Since the host clocksource will not be TSC in that case, we
+	 * risk going backwards during recalibration of kvmclock due to
+	 * differing clock resolution.
+	 */
+	if (!vcpu->kvm->arch.tsc_trapping && check_tsc_unstable())
+		kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
 
 	atomic_set(&vcpu->guest_mode, 0);
 	smp_wmb();
@@ -5777,14 +5877,11 @@  void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->vcpu_free(vcpu);
 }
 
-struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
-						unsigned int id)
+struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
 {
-	if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
-		printk_once(KERN_WARNING
-		"kvm: SMP vm created on host with unstable TSC; "
-		"guest TSC will not be reliable\n");
-	return kvm_x86_ops->vcpu_create(kvm, id);
+	struct kvm_vcpu *vcpu;
+	vcpu = kvm_x86_ops->vcpu_create(kvm, id);
+	return vcpu;
 }
 
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
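
Userspace reaches the TSC rate control through the new KVM_TSC_CONTROL VM ioctl handled above. A hypothetical caller, assuming a vm_fd obtained from KVM_CREATE_VM (sketch only, not part of the patch):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Pin the guest TSC to a fixed rate with SMP-coherency semantics.
 * A tsc_khz of 0 keeps the current rate; nonzero values must lie
 * within [KVM_TSC_MIN_KHZ, KVM_TSC_MAX_KHZ] or the ioctl fails
 * with -EINVAL. */
static int set_guest_tsc(int vm_fd, __u32 khz)
{
	struct kvm_tsc_control tsc = {
		.flags = KVM_TSC_FLAG_FIXED_RATE | KVM_TSC_FLAG_SMP_COHERENCY,
		.tsc_khz = khz,
	};

	return ioctl(vm_fd, KVM_TSC_CONTROL, &tsc);
}
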
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 2cea414..6afa64f 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -75,5 +75,7 @@  void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
 int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq);
 
 void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data);
+void kvm_read_tsc(struct kvm_vcpu *vcpu);
+void kvm_setup_tsc_trapping(struct kvm_vcpu *vcpu);
 
 #endif
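
The overrun and underrun tests in kvm_setup_tsc_trapping() compare rates against the 500 ppm NTP slew limit using scaled integer arithmetic. A worked example with hypothetical rates (kernel u32/u64 types assumed):

/* Host TSC at 3.0 GHz, requested guest rate 2.99 GHz: the host runs
 * ~3344 ppm fast, past the 500 ppm boundary, so a fixed-rate guest
 * must either trap RDTSC or fall back to catchup. Sample values. */
static bool example_tsc_overrun(void)
{
	u64 max_tsc_khz = 3000000;	/* fastest observed host rate */
	u32 virtual_tsc_khz = 2990000;	/* requested guest rate */

	/* virtual * 1.0005 < max, kept in integers to avoid floats:
	 * 2,991,495,000,000 < 3,000,000,000,000 -> true */
	return virtual_tsc_khz * 1000500ULL < max_tsc_khz * 1000000ULL;
}
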
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 919ae53..cb97e53 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -540,6 +540,8 @@  struct kvm_ppc_pvinfo {
 #endif
 #define KVM_CAP_PPC_GET_PVINFO 57
 #define KVM_CAP_PPC_IRQ_LEVEL 58
+#define KVM_CAP_TSC_CONTROL 59
+
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -619,6 +621,17 @@  struct kvm_clock_data {
 	__u32 pad[9];
 };
 
+struct kvm_tsc_control {
+	__u32 flags;
+	__u32 tsc_khz;
+};
+
+#define KVM_TSC_FLAG_FIXED_RATE		(1 << 0)
+#define KVM_TSC_FLAG_SMP_COHERENCY	(1 << 1)
+
+#define KVM_TSC_MIN_KHZ 16000		/* 16 MHz, slower than first Pentium */
+#define KVM_TSC_MAX_KHZ 100000000	/* 100 GHz, good for a few years */
+
 /*
  * ioctls for VM fds
  */
@@ -676,6 +689,8 @@  struct kvm_clock_data {
 #define KVM_SET_PIT2              _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
 /* Available with KVM_CAP_PPC_GET_PVINFO */
 #define KVM_PPC_GET_PVINFO	  _IOW(KVMIO,  0xa1, struct kvm_ppc_pvinfo)
+/* Available with KVM_CAP_TSC_CONTROL */
+#define KVM_TSC_CONTROL		  _IOW(KVMIO,  0xa2, struct kvm_tsc_control)
 
 /*
  * ioctls for vcpu fds