@@ -452,6 +452,8 @@ struct kvm_arch {
u32 virtual_tsc_khz;
u32 virtual_tsc_mult;
s8 virtual_tsc_shift;
+ bool tsc_trapping;
+ u32 tsc_flags;
struct kvm_xen_hvm_config xen_hvm_config;
@@ -575,6 +577,8 @@ struct kvm_x86_ops {
int (*get_lpage_level)(void);
bool (*rdtscp_supported)(void);
void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment);
+ void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
+ void (*set_tsc_trapping)(struct kvm_vcpu *vcpu, bool trap);
void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
@@ -582,8 +586,6 @@ struct kvm_x86_ops {
bool (*has_wbinvd_exit)(void);
- void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
-
const struct trace_print_flags *exit_reasons_str;
};
@@ -40,5 +40,6 @@ struct pvclock_wall_clock {
} __attribute__((__packed__));
#define PVCLOCK_TSC_STABLE_BIT (1 << 0)
+#define PVCLOCK_TSC_TRAPPED_BIT (1 << 1)
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_PVCLOCK_ABI_H */
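For illustration only (not part of this patch): a guest can consult the new
flag when choosing a clocksource, since every RDTSC costs a full VM exit
while trapping is active. A minimal guest-side sketch, assuming a mapped
pvclock_vcpu_time_info named hv_clock (a placeholder name, not taken from
this patch):

	#include <linux/types.h>
	#include <asm/pvclock-abi.h>

	/* True when the hypervisor reports that RDTSC is intercepted. */
	static bool tsc_is_trapped(struct pvclock_vcpu_time_info *hv_clock)
	{
		return hv_clock->flags & PVCLOCK_TSC_TRAPPED_BIT;
	}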
@@ -806,6 +806,8 @@ static void init_vmcb(struct vcpu_svm *svm)
(1ULL << INTERCEPT_MONITOR) |
(1ULL << INTERCEPT_MWAIT);
+ kvm_setup_tsc_trapping(&svm->vcpu);
+
control->iopm_base_pa = iopm_base;
control->msrpm_base_pa = __pa(svm->msrpm);
control->int_ctl = V_INTR_MASKING_MASK;
@@ -1038,6 +1040,15 @@ static void svm_clear_vintr(struct vcpu_svm *svm)
svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
}
+static void svm_set_tsc_trapping(struct kvm_vcpu *vcpu, bool trap)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (trap)
+ svm->vmcb->control.intercept |= 1ULL << INTERCEPT_RDTSC;
+ else
+ svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_RDTSC);
+}
+
static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
{
struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
@@ -2497,6 +2508,13 @@ static int task_switch_interception(struct vcpu_svm *svm)
return 1;
}
+static int rdtsc_interception(struct vcpu_svm *svm)
+{
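+ /* RDTSC is a two-byte instruction (0f 31), hence rip + 2 */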
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
+ kvm_read_tsc(&svm->vcpu);
+ return 1;
+}
+
static int cpuid_interception(struct vcpu_svm *svm)
{
svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
@@ -2833,6 +2851,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_SMI] = nop_on_interception,
[SVM_EXIT_INIT] = nop_on_interception,
[SVM_EXIT_VINTR] = interrupt_window_interception,
+ [SVM_EXIT_RDTSC] = rdtsc_interception,
[SVM_EXIT_CPUID] = cpuid_interception,
[SVM_EXIT_IRET] = iret_interception,
[SVM_EXIT_INVD] = emulate_on_interception,
@@ -3676,6 +3695,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.write_tsc_offset = svm_write_tsc_offset,
.adjust_tsc_offset = svm_adjust_tsc_offset,
+ .set_tsc_trapping = svm_set_tsc_trapping,
.set_tdp_cr3 = set_tdp_cr3,
};
@@ -2631,6 +2631,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
kvm_write_tsc(&vmx->vcpu, 0);
+ kvm_setup_tsc_trapping(&vmx->vcpu);
return 0;
}
@@ -2770,6 +2771,18 @@ out:
return ret;
}
+static void vmx_set_tsc_trapping(struct kvm_vcpu *vcpu, bool trap)
+{
+ u32 cpu_based_vm_exec_control;
+
+ cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ if (trap)
+ cpu_based_vm_exec_control |= CPU_BASED_RDTSC_EXITING;
+ else
+ cpu_based_vm_exec_control &= ~CPU_BASED_RDTSC_EXITING;
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+}
+
static void enable_irq_window(struct kvm_vcpu *vcpu)
{
u32 cpu_based_vm_exec_control;
@@ -3359,6 +3372,12 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_rdtsc(struct kvm_vcpu *vcpu)
+{
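+ /* kvm_read_tsc() loads EDX:EAX and skips the instruction for us */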
+ kvm_read_tsc(vcpu);
+ return 1;
+}
+
static int handle_wbinvd(struct kvm_vcpu *vcpu)
{
skip_emulated_instruction(vcpu);
@@ -3651,6 +3670,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
[EXIT_REASON_HLT] = handle_halt,
[EXIT_REASON_INVLPG] = handle_invlpg,
+ [EXIT_REASON_RDTSC] = handle_rdtsc,
[EXIT_REASON_VMCALL] = handle_vmcall,
[EXIT_REASON_VMCLEAR] = handle_vmx_insn,
[EXIT_REASON_VMLAUNCH] = handle_vmx_insn,
@@ -4339,6 +4359,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.write_tsc_offset = vmx_write_tsc_offset,
.adjust_tsc_offset = vmx_adjust_tsc_offset,
+ .set_tsc_trapping = vmx_set_tsc_trapping,
.set_tdp_cr3 = vmx_set_cr3,
};
@@ -95,6 +95,12 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
struct kvm_x86_ops *kvm_x86_ops;
EXPORT_SYMBOL_GPL(kvm_x86_ops);
+/* -1: never trap, 0: choose automatically, 1: always trap */
+static int __read_mostly tsc_trap;
+module_param(tsc_trap, int, S_IRUGO);
+
+/* pick trapping heuristically when userspace gives no hints */
+static bool __read_mostly tsc_auto = true;
+module_param(tsc_auto, bool, S_IRUGO);
+
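With the default tsc_trap=0, module load defers to the heuristics in
kvm_setup_tsc_trapping() below; loading with "modprobe kvm tsc_trap=1"
forces trapping on for every VM, and "tsc_trap=-1" disables it
unconditionally.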
int ignore_msrs = 0;
module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
@@ -1058,6 +1064,8 @@ static void update_pvclock(struct kvm_vcpu *v,
pvclock->tsc_timestamp = tsc_timestamp;
pvclock->system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
pvclock->flags = 0;
+ if (v->kvm->arch.tsc_trapping)
+ pvclock->flags |= PVCLOCK_TSC_TRAPPED_BIT;
}
static void update_user_kvmclock(struct kvm_vcpu *v,
@@ -1072,6 +1080,18 @@ static void update_user_kvmclock(struct kvm_vcpu *v,
mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
}
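+/*
+ * Synthesize the result of a trapped RDTSC: compute the guest TSC from
+ * the host clock, return it in EDX:EAX, and skip past the instruction.
+ */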
+void kvm_read_tsc(struct kvm_vcpu *vcpu)
+{
+ u64 tsc;
+ s64 kernel_ns = get_kernel_ns();
+
+ tsc = compute_guest_tsc(vcpu, kernel_ns);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)tsc);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, tsc >> 32);
+ kvm_x86_ops->skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_read_tsc);
+
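For context, the synthesized value scales elapsed host nanoseconds to the
VM's fixed frequency. A hedged sketch of the arithmetic behind
compute_guest_tsc() — the baseline fields named here (last_tsc_nsec,
last_tsc_write) are illustrative assumptions, not quoted from this patch:

	#include <linux/math64.h>	/* div_u64 */

	static u64 guest_tsc_sketch(struct kvm_vcpu *vcpu, s64 kernel_ns)
	{
		/* kHz is cycles per 10^6 ns, so cycles = ns * khz / 10^6 */
		s64 delta_ns = kernel_ns - vcpu->kvm->arch.last_tsc_nsec;
		u64 cycles = div_u64((u64)delta_ns *
				     vcpu->kvm->arch.virtual_tsc_khz,
				     NSEC_PER_MSEC);

		/*
		 * The real code scales with the precomputed
		 * virtual_tsc_mult/shift pair to avoid the multiply
		 * overflowing for long deltas.
		 */
		return vcpu->kvm->arch.last_tsc_write + cycles;
	}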
static int kvm_guest_time_update(struct kvm_vcpu *v)
{
unsigned long flags;
@@ -1198,6 +1218,55 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
return 0;
}
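+/*
+ * Decide whether this VM's RDTSC must be intercepted, weighing host TSC
+ * stability, rate mismatch beyond the NTP slew limit, userspace hints
+ * (tsc_flags), and the tsc_trap/tsc_auto module parameters.
+ */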
+void kvm_setup_tsc_trapping(struct kvm_vcpu *vcpu)
+{
+ struct kvm_arch *arch = &vcpu->kvm->arch;
+ int trap;
+ bool tsc_underrun, tsc_overrun;
+
+ /*
+ * First, establish any rate difference outside the NTP correction
+ * boundary. N.B.: virtual_tsc_khz may not yet be known, in which case
+ * the host rate is assumed to be used; guard against this in the
+ * overrun check.
+ */
+ u64 max_tsc_ull = max_tsc_khz * 1000000ULL;
+ tsc_overrun = (arch->virtual_tsc_khz &&
+ arch->virtual_tsc_khz * 1000500ULL < max_tsc_ull);
+ tsc_underrun = (arch->virtual_tsc_khz * 999500ULL > max_tsc_ull);
+
+ /*
+ * We must trap if the TSC is unstable and userspace hints that SMP
+ * coherency is required; likewise if a fixed rate is requested and
+ * the host's maximum TSC rate differs from the VM rate by more than
+ * 500 ppm (the maximum NTP slew rate).
+ */
+ trap =
+ (check_tsc_unstable() &&
+ (arch->tsc_flags & KVM_TSC_FLAG_SMP_COHERENCY)) ||
+ ((arch->tsc_flags & KVM_TSC_FLAG_FIXED_RATE) &&
+ (tsc_overrun || tsc_underrun));
+
+ /*
+ * Auto-selection: with no guidance from userspace, we can't know if
+ * VCPUs will be added, so assume SMP; it is difficult to switch
+ * other VCPUs into trapping mode after they have started running.
+ */
+ if (tsc_auto)
+ trap |= (tsc_overrun || check_tsc_unstable());
+
+ /* The tsc_trap module parameter overrides any automatic or explicit choice */
+ if (tsc_trap != 0)
+ trap = (tsc_trap > 0);
+
+ /* Correct untrapped underrun with catchup */
+ if (!trap && tsc_underrun)
+ vcpu->arch.tsc_catchup = 1;
+
+ vcpu->kvm->arch.tsc_trapping = trap;
+ kvm_x86_ops->set_tsc_trapping(vcpu, trap);
+ pr_debug("kvm: set trap mode %d on vcpu %d\n", trap, vcpu->vcpu_id);
+}
+EXPORT_SYMBOL_GPL(kvm_setup_tsc_trapping);
+
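To make the 500 ppm bound concrete: with virtual_tsc_khz = 2,000,000 (a
2 GHz guest), tsc_overrun fires once max_tsc_khz exceeds 2,001,000
(2,000,000 * 1000500 < max_tsc_khz * 1000000), and tsc_underrun fires once
it drops below 1,999,000; inside that +/-500 ppm window, NTP-style slewing
can absorb the difference without trapping.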
static bool msr_mtrr_valid(unsigned msr)
{
switch (msr) {
@@ -1962,6 +2031,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_DEBUGREGS:
case KVM_CAP_X86_ROBUST_SINGLESTEP:
case KVM_CAP_XSAVE:
+ case KVM_CAP_TSC_CONTROL:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -3535,7 +3605,30 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = 0;
break;
}
+ case KVM_TSC_CONTROL: {
+ struct kvm_tsc_control user_tsc;
+
+ r = -EFAULT;
+ if (copy_from_user(&user_tsc, argp, sizeof(user_tsc)))
+ goto out;
+
+ r = -EINVAL;
+ if (user_tsc.flags &
+ ~(KVM_TSC_FLAG_FIXED_RATE |
+ KVM_TSC_FLAG_SMP_COHERENCY))
+ goto out;
+ if (user_tsc.tsc_khz &&
+ (user_tsc.tsc_khz > KVM_TSC_MAX_KHZ ||
+ user_tsc.tsc_khz < KVM_TSC_MIN_KHZ))
+ goto out;
+
+ kvm->arch.tsc_flags = user_tsc.flags;
+ if (user_tsc.tsc_khz)
+ kvm_arch_set_tsc_khz(kvm, user_tsc.tsc_khz);
+
+ r = 0;
+ break;
+ }
default:
;
}
@@ -5222,7 +5315,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (hw_breakpoint_active())
hw_breakpoint_restore();
- kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
+ /*
+ * We only need to record this for an unstable, passthrough TSC.
+ * Since the host clocksource will not be the TSC in that case, we
+ * risk kvmclock going backwards during recalibration due to
+ * differing clock resolution.
+ */
+ if (!vcpu->kvm->arch.tsc_trapping && check_tsc_unstable())
+ kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
atomic_set(&vcpu->guest_mode, 0);
smp_wmb();
@@ -5777,14 +5877,11 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
kvm_x86_ops->vcpu_free(vcpu);
}
-struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
- unsigned int id)
+struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
{
- if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
- printk_once(KERN_WARNING
- "kvm: SMP vm created on host with unstable TSC; "
- "guest TSC will not be reliable\n");
- return kvm_x86_ops->vcpu_create(kvm, id);
+ struct kvm_vcpu *vcpu;
+
+ vcpu = kvm_x86_ops->vcpu_create(kvm, id);
+ return vcpu;
}
int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
@@ -75,5 +75,7 @@ void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq);
void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data);
+void kvm_read_tsc(struct kvm_vcpu *vcpu);
+void kvm_setup_tsc_trapping(struct kvm_vcpu *vcpu);
#endif
@@ -540,6 +540,8 @@ struct kvm_ppc_pvinfo {
#endif
#define KVM_CAP_PPC_GET_PVINFO 57
#define KVM_CAP_PPC_IRQ_LEVEL 58
+#define KVM_CAP_TSC_CONTROL 59
+
#ifdef KVM_CAP_IRQ_ROUTING
@@ -619,6 +621,17 @@ struct kvm_clock_data {
__u32 pad[9];
};
+struct kvm_tsc_control {
+ __u32 flags;
+ __u32 tsc_khz;
+};
+
+#define KVM_TSC_FLAG_FIXED_RATE (1 << 0)
+#define KVM_TSC_FLAG_SMP_COHERENCY (1 << 1)
+
+#define KVM_TSC_MIN_KHZ 16000 /* 16 MHz, slower than the first Pentium */
+#define KVM_TSC_MAX_KHZ 100000000 /* 100 GHz, good for a few years */
+
/*
* ioctls for VM fds
*/
@@ -676,6 +689,8 @@ struct kvm_clock_data {
#define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2)
/* Available with KVM_CAP_PPC_GET_PVINFO */
#define KVM_PPC_GET_PVINFO _IOW(KVMIO, 0xa1, struct kvm_ppc_pvinfo)
+/* Available with KVM_CAP_TSC_CONTROL */
+#define KVM_TSC_CONTROL _IOW(KVMIO, 0xa2, struct kvm_tsc_control)
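A hedged userspace sketch of the new VM ioctl (vm_fd is assumed to have been
obtained through the usual /dev/kvm + KVM_CREATE_VM sequence, with
KVM_CAP_TSC_CONTROL checked via KVM_CHECK_EXTENSION beforehand):

	#include <linux/kvm.h>
	#include <sys/ioctl.h>

	/* Pin the guest TSC at 2 GHz and request SMP-coherent readings. */
	static int pin_guest_tsc(int vm_fd)
	{
		struct kvm_tsc_control ctl = {
			.flags = KVM_TSC_FLAG_FIXED_RATE |
				 KVM_TSC_FLAG_SMP_COHERENCY,
			.tsc_khz = 2000000,
		};

		return ioctl(vm_fd, KVM_TSC_CONTROL, &ctl);
	}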
/*
* ioctls for vcpu fds