@@ -741,9 +741,15 @@ static DEFINE_PER_CPU(unsigned long, cpu_tsc_multiplier);
static DEFINE_PER_CPU(int, cpu_tsc_shift);
static DEFINE_PER_CPU(s64, cpu_tsc_offset);
static DEFINE_PER_CPU(u64, cpu_tsc_measure_base);
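+/*
+ * Set to 1 once this CPU's reference-TSC parameters (offset, multiplier,
+ * shift) are valid; cleared by cpufreq changes and CPU hotplug.
+ */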
+static DEFINE_PER_CPU(atomic_t, cpu_tsc_synchronized);
static int tsc_base_cpu = -1;
static unsigned long ref_tsc_khz;
+static inline int cpu_is_tsc_synchronized(int cpu)
+{
+ return (atomic_read(&per_cpu(cpu_tsc_synchronized, cpu)) != 0);
+}
+
static inline unsigned long div_precise(unsigned long hi, unsigned long lo,
unsigned long divisor, unsigned long *rptr)
{
@@ -923,6 +929,7 @@ static void kvm_sync_tsc(void *cpup)
accumulator -= delta[i+SYNC_TRIES];
accumulator = accumulator / (SYNC_TRIES*2-12);
per_cpu(cpu_tsc_offset, new_cpu) = accumulator;
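+	/* Parameters for new_cpu are complete; mark it synchronized. */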
+ atomic_set(&per_cpu(cpu_tsc_synchronized, new_cpu), 1);
pr_debug("%s: OUT, cpu = %d, cpu_tsc_offset = %lld, cpu_tsc_multiplier=%ld, cpu_tsc_shift=%d\n", __func__, raw_smp_processor_id(), per_cpu(cpu_tsc_offset, new_cpu), per_cpu(cpu_tsc_multiplier, new_cpu), per_cpu(cpu_tsc_shift, new_cpu));
}
local_irq_restore(flags);
@@ -931,6 +938,11 @@ static void kvm_sync_tsc(void *cpup)
static void kvm_do_sync_tsc(int cpu)
{
spin_lock(&kvm_tsc_lock);
+
+	/* tsc_base_cpu may have changed before we took kvm_tsc_lock, so recheck */
+ if (unlikely(cpu == tsc_base_cpu))
+ goto out_unlock;
+
if (raw_smp_processor_id() != tsc_base_cpu) {
smp_call_function_single(tsc_base_cpu, kvm_sync_tsc,
(void *)&cpu, 0);
@@ -940,6 +952,8 @@ static void kvm_do_sync_tsc(int cpu)
smp_call_function_single(tsc_base_cpu, kvm_sync_tsc,
(void *)&cpu, 1);
}
+
+out_unlock:
spin_unlock(&kvm_tsc_lock);
}
@@ -1656,7 +1670,6 @@ out:
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
kvm_x86_ops->vcpu_load(vcpu, cpu);
- BUG_ON(per_cpu(cpu_tsc_khz, cpu) == 0);
kvm_request_guest_time_update(vcpu);
}
@@ -3461,11 +3474,46 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
}
EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
-static void bounce_off(void *info)
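+/* Empty IPI handler: the interrupt itself kicks the target CPU out of guest mode. */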
+static void evict(void *info)
{
/* nothing */
}
+static struct execute_work resync_work;
+static int work_scheduled;
+
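+/*
+ * Process-context worker: resynchronize every online CPU against
+ * tsc_base_cpu.
+ */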
+static void resync_user(struct work_struct *work)
+{
+ int cpu;
+
+ work_scheduled = 0;
+ for_each_online_cpu(cpu)
+ if (cpu != tsc_base_cpu)
+ kvm_do_sync_tsc(cpu);
+}
+
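+/*
+ * Runs on tsc_base_cpu (at init and after a frequency change): recompute
+ * the reference parameters locally, then schedule a process-context pass
+ * to bring the other CPUs back in sync.
+ */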
+static void resync(void *info)
+{
+ int cpu;
+ u64 tsc;
+
+ /* Fixup our own values to stay in sync with the reference */
+ cpu = raw_smp_processor_id();
+ tsc = compute_ref_tsc(cpu);
+ per_cpu(cpu_tsc_measure_base, cpu) = native_read_tsc();
+ per_cpu(cpu_tsc_offset, cpu) = tsc;
+ compute_best_multiplier(ref_tsc_khz, per_cpu(cpu_tsc_khz, cpu),
+ &per_cpu(cpu_tsc_multiplier, cpu),
+ &per_cpu(cpu_tsc_shift, cpu));
+ atomic_set(&per_cpu(cpu_tsc_synchronized, cpu), 1);
+
+ /* Then, get everybody else on board */
+ if (!work_scheduled) {
+ work_scheduled = 1;
+ execute_in_process_context(resync_user, &resync_work);
+ }
+}
+
static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
@@ -3474,39 +3522,68 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
struct kvm_vcpu *vcpu;
int i, send_ipi = 0;
- if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
- return 0;
- if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
- return 0;
- per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;
-
- spin_lock(&kvm_lock);
- list_for_each_entry(kvm, &vm_list, vm_list) {
- kvm_for_each_vcpu(i, vcpu, kvm) {
- if (vcpu->cpu != freq->cpu)
- continue;
- if (!kvm_request_guest_time_update(vcpu))
- continue;
- if (vcpu->cpu != smp_processor_id())
- send_ipi++;
+ /*
+	 * There is no way to precisely know the TSC value at which the
+	 * CPU frequency actually changed, and these callbacks may happen at
+ * different times, thus there will be drift in the reference TSC
+ * clock across all CPUs. To avoid this problem, we forcibly evict
+ * any CPUs which may be running in hardware virtualization.
+ *
+ * We do this by setting cpu_tsc_synchronized to zero and polling for
+ * this value to change when entering hardware virtualization.
+ */
+ if (val == CPUFREQ_PRECHANGE) {
+ get_online_cpus();
+ atomic_set(&per_cpu(cpu_tsc_synchronized, freq->cpu), 0);
+ spin_lock(&kvm_lock);
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (vcpu->cpu != freq->cpu)
+ continue;
+ if (vcpu->cpu != smp_processor_id())
+ send_ipi++;
+ kvm_request_guest_time_update(vcpu);
+ }
+ }
+ spin_unlock(&kvm_lock);
+ if (send_ipi) {
+ /*
+ * In case we update the frequency for another cpu
+ * (which might be in guest context) send an interrupt
+ * to kick the cpu out of guest context. Next time
+ * guest context is entered kvmclock will be updated,
+ * so the guest will not see stale values.
+ */
+ smp_call_function_single(freq->cpu, evict, NULL, 1);
}
- }
- spin_unlock(&kvm_lock);
- if (freq->old < freq->new && send_ipi) {
/*
- * We upscale the frequency. Must make the guest
- * doesn't see old kvmclock values while running with
- * the new frequency, otherwise we risk the guest sees
- * time go backwards.
- *
- * In case we update the frequency for another cpu
- * (which might be in guest context) send an interrupt
- * to kick the cpu out of guest context. Next time
- * guest context is entered kvmclock will be updated,
- * so the guest will not see stale values.
+		 * The update of the frequency can't happen while a VM
+		 * is running, nor can it happen during init when we can
+		 * race against the init code setting the first known freq.
+		 * Just use the kvm_tsc_lock for a mutex.
*/
- smp_call_function_single(freq->cpu, bounce_off, NULL, 1);
+ spin_lock(&kvm_tsc_lock);
+ per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;
+ spin_unlock(&kvm_tsc_lock);
+
+ return 0;
+ }
+
+ /*
+ * After the change, we must resynchronize affected CPUs.
+ * If the master reference changes, that means all CPUs.
+ * Note that some CPUs may be resynchronized twice, once
+ * by the master, and once by themselves, depending on the
+ * order in which this notifier is called; this is harmless.
+ */
+ if (val == CPUFREQ_POSTCHANGE) {
+ if (freq->cpu == tsc_base_cpu)
+ smp_call_function_single(freq->cpu, resync, NULL, 1);
+ else if (cpu_is_tsc_synchronized(tsc_base_cpu))
+ /* Can't do this if base is not yet updated */
+ kvm_do_sync_tsc(freq->cpu);
+ put_online_cpus();
}
return 0;
}
@@ -3536,8 +3613,7 @@ static int kvm_x86_cpu_hotplug(struct notifier_block *notifier,
case CPU_DYING:
case CPU_UP_CANCELED:
- if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
- per_cpu(cpu_tsc_khz, cpu) = 0;
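+		/* The CPU must be resynchronized before guests may run on it again. */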
+ atomic_set(&per_cpu(cpu_tsc_synchronized, cpu), 0);
break;
case CPU_ONLINE:
@@ -3590,6 +3666,12 @@ static void kvm_timer_init(void)
}
}
#endif
+
+ /*
+ * Register notifiers for both CPU add / remove and CPU frequency
+ * change. Must be careful to avoid subtle races, as frequency
+ * can change at any time.
+ */
register_cpu_notifier(&kvm_x86_cpu_notifier);
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
@@ -3598,7 +3680,10 @@ static void kvm_timer_init(void)
unsigned long khz = cpufreq_get(cpu);
if (!khz)
khz = tsc_khz;
- per_cpu(cpu_tsc_khz, cpu) = khz;
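+			/* The cpufreq notifier may already have set this; don't overwrite it. */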
+ spin_lock(&kvm_tsc_lock);
+ if (!per_cpu(cpu_tsc_khz, cpu))
+ per_cpu(cpu_tsc_khz, cpu) = khz;
+ spin_unlock(&kvm_tsc_lock);
}
} else {
for_each_possible_cpu(cpu)
@@ -3606,12 +3691,7 @@ static void kvm_timer_init(void)
}
tsc_base_cpu = get_cpu();
ref_tsc_khz = per_cpu(cpu_tsc_khz, tsc_base_cpu);
- per_cpu(cpu_tsc_multiplier, tsc_base_cpu) = 1;
- per_cpu(cpu_tsc_shift, tsc_base_cpu) = 0;
- per_cpu(cpu_tsc_offset, tsc_base_cpu) = 0;
- for_each_online_cpu(cpu)
- if (cpu != tsc_base_cpu)
- kvm_do_sync_tsc(cpu);
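+	/* Initialize the reference parameters on this CPU; the rest are synced from process context. */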
+ resync(NULL);
put_cpu();
}
@@ -4097,7 +4177,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
clear_bit(KVM_REQ_KICK, &vcpu->requests);
smp_mb__after_clear_bit();
- if (vcpu->requests || need_resched() || signal_pending(current)) {
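+	/*
+	 * Also bail out if this CPU's reference TSC is stale (e.g. during a
+	 * cpufreq transition); we must not enter the guest until it has been
+	 * resynchronized.
+	 */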
+ if (vcpu->requests || need_resched() || signal_pending(current) ||
+ !cpu_is_tsc_synchronized(smp_processor_id())) {
set_bit(KVM_REQ_KICK, &vcpu->requests);
local_irq_enable();
preempt_enable();