different TSC offsets from each other. The TSC delta is measured
(in both cache-line transfer directions, for highest accuracy) and stored.
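
Schematically (illustrative notation; this assumes the cache-line
transfer latency is roughly symmetric in the two directions):

    base measures:  d1[i] = tsc_base - tsc_new  ~=  offset + latency
    new measures:   d2[i] = tsc_new - tsc_base  ~= -offset + latency
    sum(d1) - sum(d2)                           ~=  2 * N * offset

The latency term cancels, and the averaged result is the offset to add
to the new CPU's reference TSC so that it matches the base CPU.
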
Signed-off-by: Zachary Amsden <zamsden@redhat.com>
---
arch/x86/kvm/x86.c | 202 ++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 202 insertions(+), 0 deletions(-)
@@ -730,6 +730,196 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *
}
static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
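+/*
+ * Per-cpu state for converting the local TSC into a common reference
+ * clock:
+ *   ref_tsc = ((tsc - cpu_tsc_measure_base) * cpu_tsc_multiplier
+ *              >> cpu_tsc_shift) + cpu_tsc_offset
+ * tsc_base_cpu is the reference CPU; ref_tsc_khz is its TSC frequency.
+ */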
+static DEFINE_PER_CPU(unsigned long, cpu_tsc_multiplier);
+static DEFINE_PER_CPU(int, cpu_tsc_shift);
+static DEFINE_PER_CPU(s64, cpu_tsc_offset);
+static DEFINE_PER_CPU(u64, cpu_tsc_measure_base);
+static int tsc_base_cpu = -1;
+static unsigned long ref_tsc_khz;
+
+static inline unsigned long div_precise(unsigned long hi, unsigned long lo,
+ unsigned long divisor, unsigned long *rptr)
+{
+ unsigned long quotient, remainder;
+ __asm__ ( "div %4"
+ : "=a" (quotient), "=d" (remainder)
+ : "0" (lo), "1" (hi), "rm" (divisor));
+ *rptr = remainder;
+ return quotient;
+}
+
+/*
+ * compute the best multiplier and shift pair m,s, such that for any n,
+ * n * a / b = (n * m) >> s
+ */
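+ *
+ * e.g. a = 3, b = 2 reduces to m = 3, s = 1, so n * 3 / 2 == (n * 3) >> 1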
+static void compute_best_multiplier(unsigned long a, unsigned long b,
+ unsigned long *m, int *s)
+{
+ int shift, bit;
+ unsigned long lo, hi, remainder, mult;
+
+ /*
+ * By pre-shifting and using maximum machine width, we get the most
+ * bits of precision.
+ */
+ shift = BITS_PER_LONG + fls(b) - fls(a) - 1;
+ if (shift > BITS_PER_LONG)
+ shift = BITS_PER_LONG;
+ if (shift < 0)
+ shift = 0;
+ lo = a << shift;
+ hi = a >> (BITS_PER_LONG - shift);
+ mult = div_precise(hi, lo, b, &remainder);
+
+ /* See if it can be further simplified */
+ bit = __ffs(mult);
+ if (bit > shift)
+ bit = shift;
+ mult >>= bit;
+ shift -= bit;
+ *m = mult;
+ *s = shift;
+}
+
+static inline unsigned long mult_precise(unsigned long val, unsigned long mult,
+ int shift)
+{
+ unsigned long top, bot;
+
+ __asm__ ( "mul %3; shrd %1, %0" :
+ "=&a" (bot), "=&d" (top) :
+ "0" (mult), "rm" (val), "c" (shift));
+ return bot;
+}
+
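+/* Read the local TSC and rescale it into reference clock units using
+ * @cpu's per-cpu multiplier, shift and offset. */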
+static inline u64 compute_ref_tsc(int cpu)
+{
+ u64 tsc = native_read_tsc() - per_cpu(cpu_tsc_measure_base, cpu);
+ tsc = mult_precise(tsc, per_cpu(cpu_tsc_multiplier, cpu),
+ per_cpu(cpu_tsc_shift, cpu));
+ return tsc + per_cpu(cpu_tsc_offset, cpu);
+}
+
+#define SYNC_TRIES 64
+
+/*
+ * sync_tsc_helper is a dual-entry coroutine meant to be run by only
+ * two CPUs at a time, one of which is a measuring CPU. Both CPUs
+ * synchronize entry and exit as well as the central recording loop
+ * using only memory barriers and atomic variables to avoid lock latency.
+ *
+ * To discount cache latency effects, this routine will be called
+ * twice, once with the measure / recording CPUs reversed. In addition,
+ * the first 4 and last 2 results will be discarded to allow branch
+ * prediction to become established (and to discount effects from
+ * a potentially early predicted loop exit).
+ *
+ * Because of this, we must be extra careful to guard the entrance
+ * and exit against the CPU switch. I chose to use atomic instructions
+ * only at the end of the measure loop and use the same routine for
+ * both CPUs, with symmetric comparisons, and a statically placed
+ * recording array, hopefully maximizing branch prediction and
+ * cache locality. The results appear quite good; on CPUs known to be
+ * synchronized, I typically measure a TSC delta of < 10 cycles, with
+ * maximum observed error on the order of 100 cycles.
+ *
+ * This doesn't account for NUMA cache effects, and could potentially
+ * be improved there by moving the delta[] array to the stack of the
+ * measuring CPU. In fact, this modification might be worth trying
+ * for non-NUMA systems as well, but this appears to be fine for now.
+ */
+static void sync_tsc_helper(int measure_cpu, s64 *delta, atomic_t *ready)
+{
+ int tries;
+ static u64 tsc_other;
+ unsigned int junk = 0;
+ u64 tsc;
+ int cpu = raw_smp_processor_id();
+
+ if (cpu == measure_cpu) {
+ atomic_set(ready, 0);
+ while (!atomic_read(ready))
+ /* wait */;
+ } else {
+ while (atomic_read(ready))
+ /* wait */;
+ atomic_set(ready, 1);
+ }
+ for (tries = 0; tries < SYNC_TRIES; tries++) {
+ mb();
+ if (cpu == measure_cpu) {
+ atomic_set(ready, 0);
+ } else {
+ while (atomic_read(ready))
+ /* wait */;
+ }
+ native_cpuid(&junk, &junk, &junk, &junk);
+ tsc = compute_ref_tsc(cpu);
+ rdtsc_barrier();
+ if (cpu == measure_cpu) {
+ while (!atomic_read(ready))
+ /* wait */;
+ rmb();
+ delta[tries] = tsc - tsc_other;
+ } else {
+ tsc_other = tsc;
+ wmb();
+ atomic_set(ready, 1);
+ }
+ }
+ if (cpu == measure_cpu)
+ atomic_dec(ready);
+ else
+ atomic_inc(ready);
+ while (atomic_read(ready) != 1)
+ /* wait */;
+ mb();
+}
+
+static void kvm_sync_tsc(void *cpup)
+{
+ int new_cpu = *(int *)cpup;
+ unsigned long flags;
+ static s64 delta[SYNC_TRIES*2];
+ static atomic_t ready = ATOMIC_INIT(1);
+
+ BUG_ON(tsc_base_cpu == -1);
+ pr_debug("%s: IN, cpu = %d, freq = %ldkHz, tsc_base_cpu = %d\n", __func__, raw_smp_processor_id(), per_cpu(cpu_tsc_khz, raw_smp_processor_id()) , tsc_base_cpu);
+ local_irq_save(flags);
+ if (raw_smp_processor_id() == new_cpu) {
+ per_cpu(cpu_tsc_measure_base, new_cpu) = native_read_tsc();
+ per_cpu(cpu_tsc_offset, new_cpu) = 0;
+ compute_best_multiplier(ref_tsc_khz,
+ per_cpu(cpu_tsc_khz, new_cpu),
+ &per_cpu(cpu_tsc_multiplier, new_cpu),
+ &per_cpu(cpu_tsc_shift, new_cpu));
+ }
+ sync_tsc_helper(tsc_base_cpu, delta, &ready);
+ sync_tsc_helper(new_cpu, &delta[SYNC_TRIES], &ready);
+ if (raw_smp_processor_id() == new_cpu) {
+ int i;
+ s64 accumulator = 0;
+
+ /*
+ * accumulate delta[4, SYNC_TRIES-2) of tsc{base} - tsc{new},
+ * subtract delta[SYNC_TRIES+4, 2*SYNC_TRIES-2) of tsc{new} - tsc{base}
+ *
+ * this allows instruction cycle and cache differences to
+ * cancel each other out and drops warm up / cool down variation
+ *
+ * note the arithmetic must be signed because of the divide
+ */
+
+ for (i = 4; i < SYNC_TRIES - 2; i++)
+ accumulator += delta[i];
+ for (i = 4; i < SYNC_TRIES - 2; i++)
+ accumulator -= delta[i+SYNC_TRIES];
+ accumulator = accumulator / (SYNC_TRIES*2-12);
+ per_cpu(cpu_tsc_offset, new_cpu) = accumulator;
+ pr_debug("%s: OUT, cpu = %d, cpu_tsc_offset = %lld, cpu_tsc_multiplier=%ld, cpu_tsc_shift=%d\n", __func__, raw_smp_processor_id(), per_cpu(cpu_tsc_offset, new_cpu), per_cpu(cpu_tsc_multiplier, new_cpu), per_cpu(cpu_tsc_shift, new_cpu));
+ }
+ local_irq_restore(flags);
+}
static void kvm_write_guest_time(struct kvm_vcpu *v)
{
@@ -3352,6 +3542,18 @@ static void kvm_timer_init(void)
for_each_possible_cpu(cpu)
per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
}
+ tsc_base_cpu = get_cpu();
+ ref_tsc_khz = per_cpu(cpu_tsc_khz, tsc_base_cpu);
+ per_cpu(cpu_tsc_multiplier, tsc_base_cpu) = 1;
+ per_cpu(cpu_tsc_shift, tsc_base_cpu) = 0;
+ per_cpu(cpu_tsc_offset, tsc_base_cpu) = 0;
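+ /*
+ * Run kvm_sync_tsc concurrently on the base CPU (here, locally) and
+ * on each other online CPU (via a non-waiting IPI); the two CPUs
+ * rendezvous in sync_tsc_helper to measure their relative offset.
+ */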
+ for_each_online_cpu(cpu)
+ if (cpu != tsc_base_cpu) {
+ smp_call_function_single(cpu, kvm_sync_tsc,
+ (void *)&cpu, 0);
+ kvm_sync_tsc((void *)&cpu);
+ }
+ put_cpu();
}
int kvm_arch_init(void *opaque)