[RFC: tsc virtualization 12/20] Higher accuracy TSC offset computation

Message ID 1260850127-9766-13-git-send-email-zamsden@redhat.com
State New, archived

Commit Message

Zachary Amsden Dec. 15, 2009, 4:08 a.m. UTC

Patch

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3c4266f..c66dede 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -840,6 +840,11 @@  again:
 EXPORT_SYMBOL_GPL(kvm_get_ref_tsc);
 
 #define SYNC_TRIES 64
+struct cpu_delta_array {
+	s64 delta[SYNC_TRIES];
+};
+static DEFINE_PER_CPU(struct cpu_delta_array, delta_array);
+
 
 /*
  * sync_tsc_helper is a dual-entry coroutine meant to be run by only
@@ -849,25 +854,16 @@  EXPORT_SYMBOL_GPL(kvm_get_ref_tsc);
  *
  * To discount cache latency effects, this routine will be called
 * twice, once with the measure / recording CPUs reversed.  In addition,
- * the first 4 and last 2 results will be discarded to allow branch
- * predicition to become established (and to discount effects from
- * a potentially early predicted loop exit).
+ * the first two and the last results will be discarded to allow branch
+ * prediction to become established (and to discount effects from a
+ * potentially early predicted loop exit).
  *
- * Because of this, we must be extra careful to guard the entrance
- * and exit against the CPU switch.  I chose to use atomic instructions
- * only at the end of the measure loop and use the same routine for
- * both CPUs, with symmetric comparisons, and a statically placed
- * recording array, hopefully maximizing the branch predicition and
- * cache locality.  The results appear quite good; on known to be
- * synchronized CPUs, I typically get < 10 TSC delta measured, with
- * maximum observed error on the order of 100 cycles.
- *
- * This doesn't account for NUMA cache effects, and could potentially
- * be improved there by moving the delta[] array to the stack of the
- * measuring CPU.  In fact, this modification might be worth trying
- * for non-NUMA systems as well, but this appears to be fine for now.
+ * We allocate the delta array as a per-cpu variable to get local cache
+ * allocation, and also take steps to statistically ignore outliers that
+ * might be caused by SMIs.  It may seem like overkill, but it is very
+ * accurate, which is what we're aiming for.
  */
-static void sync_tsc_helper(int measure_cpu, u64 *delta, atomic_t *ready)
+static void sync_tsc_helper(int measure_cpu, s64 *delta, atomic_t *ready)
 {
 	int tries;
 	static u64 tsc_other;
@@ -915,12 +911,59 @@  static void sync_tsc_helper(int measure_cpu, u64 *delta, atomic_t *ready)
 	mb();
 }
 
+/*
+ * Average the samples, trimming outliers that fall more than 2 x sigma
+ */
+static s64 average_samples(s64 *samples, unsigned num_samples)
+{
+	unsigned i, j;
+	s64 mean;
+	u64 stddev;
+	s64 average;
+
+	/* First get the mean */
+	mean = 0;
+	for (i = 0; i < num_samples; i++)
+		mean += samples[i];
+	mean = mean / num_samples;
+
+	/* Now the deviation */
+	stddev = 0;
+	for (i = 0; i < num_samples; i++) {
+		s64 dev = samples[i] - mean;
+		stddev += dev * dev;
+	}
+	stddev = stddev / (num_samples - 1);
+	stddev = int_sqrt(stddev);
+
+	/* Throw out anything outside 2x stddev */
+	stddev <<= 1;
+	average = 0, j = 0;
+	for (i = 0; i < num_samples; i++) {
+		s64 dev = samples[i] - mean;
+		if (dev < 0)
+			dev = -dev;
+		if (dev <= stddev) {
+			average += samples[i];
+			j++;
+		}
+	}
+	if (j > 0)
+		average = average / j;
+	else
+		printk(KERN_ERR "kvm: TSC samples failed to converge!\n");
+	pr_debug("%s: mean = %lld, stddev = %llu, average = %lld\n",
+		 __func__, mean, stddev, average);
+
+	return average;
+}
+
 static void kvm_sync_tsc(void *cpup)
 {
 	int new_cpu = *(int *)cpup;
 	unsigned long flags;
-	static s64 delta[SYNC_TRIES*2];
-	static atomic_t ready = ATOMIC_INIT(1);
+	s64 *delta1, *delta2;
+	static atomic_t ready ____cacheline_aligned = ATOMIC_INIT(1);
 
 	BUG_ON(tsc_base_cpu == -1);
 	pr_debug("%s: IN, cpu = %d, freq = %ldkHz, tsc_base_cpu = %d\n", __func__, raw_smp_processor_id(), per_cpu(cpu_tsc_khz, raw_smp_processor_id()) , tsc_base_cpu);
@@ -933,27 +976,26 @@  static void kvm_sync_tsc(void *cpup)
 			&per_cpu(cpu_tsc_multiplier, new_cpu),
 			&per_cpu(cpu_tsc_shift, new_cpu));
 	}
-	sync_tsc_helper(tsc_base_cpu, delta, &ready);
-	sync_tsc_helper(new_cpu, &delta[SYNC_TRIES], &ready);
+	delta1 = per_cpu(delta_array, tsc_base_cpu).delta;
+	delta2 = per_cpu(delta_array, new_cpu).delta;
+	sync_tsc_helper(tsc_base_cpu, delta1, &ready);
+	sync_tsc_helper(new_cpu, delta2, &ready);
 	if (raw_smp_processor_id() == new_cpu) {
-		int i;
 		s64 accumulator = 0;
 
 		/*
-		 * accumulate [SYNC_TRIES+4,-2) of tsc{base} - tsc{new}
-		 * subtract   [SYNC_TRIES+4,-2) of tsc{new} - tsc{base}
+		 * accumulate [2,SYNC_TRIES-1) of tsc{base} - tsc{new}
+		 * subtract   [2,SYNC_TRIES-1) of tsc{new} - tsc{base}
 		 *
 		 * this allows instruction cycle and cache differences to
 		 * cancel each other out and drops warm up/cool down variation
 		 *
 		 * Note the arithmetic must be signed because of the divide
 		 */
+		accumulator += average_samples(&delta1[2], SYNC_TRIES-3);
+		accumulator -= average_samples(&delta2[2], SYNC_TRIES-3);
+		accumulator /= 2;
 
-		for (i = 4; i < SYNC_TRIES - 2; i++)
-			accumulator += delta[i];
-		for (i = 4; i < SYNC_TRIES - 2; i++)
-			accumulator -= delta[i+SYNC_TRIES];
-		accumulator = accumulator / (SYNC_TRIES*2-12);
 		per_cpu(cpu_tsc_offset, new_cpu) = accumulator;
 		++per_cpu(cpu_tsc_generation, new_cpu);
 		atomic_set(&per_cpu(cpu_tsc_synchronized, new_cpu), 1);
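
A side note for reviewers: the trimmed mean is easy to sanity-check outside
the kernel. Below is a minimal userspace sketch (a hypothetical harness, not
part of this patch) that mirrors the logic of average_samples(), with libm's
sqrt() standing in for the kernel's int_sqrt(). It injects one SMI-sized
outlier to show the 2 x sigma cutoff discarding it.

/* Hypothetical userspace harness for the trimmed mean in average_samples().
 * Not kernel code: sqrt() from libm replaces int_sqrt().
 * Build: cc -O2 trim.c -lm
 */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define SYNC_TRIES 64

static long long average_samples(const long long *samples, unsigned num)
{
	long long mean = 0, average = 0;
	unsigned long long stddev;
	double var = 0.0;
	unsigned i, j = 0;

	for (i = 0; i < num; i++)		/* plain mean first */
		mean += samples[i];
	mean /= num;

	for (i = 0; i < num; i++) {		/* sample variance */
		double dev = (double)(samples[i] - mean);
		var += dev * dev;
	}
	stddev = (unsigned long long)sqrt(var / (num - 1));
	stddev <<= 1;				/* cutoff at 2 x sigma */

	for (i = 0; i < num; i++) {		/* re-average the survivors */
		long long dev = samples[i] - mean;
		if (dev < 0)
			dev = -dev;
		if ((unsigned long long)dev <= stddev) {
			average += samples[i];
			j++;
		}
	}
	if (j)
		average /= j;
	else
		fprintf(stderr, "samples failed to converge\n");
	return average;
}

int main(void)
{
	long long samples[SYNC_TRIES];
	unsigned i;

	srand(1);
	for (i = 0; i < SYNC_TRIES; i++)
		samples[i] = 5000 + rand() % 40;	/* tight cluster */
	samples[17] += 100000;			/* simulated SMI hit */

	/* Same windowing as the patch: skip the first two and last samples */
	printf("trimmed average = %lld\n",
	       average_samples(&samples[2], SYNC_TRIES - 3));
	return 0;
}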
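
For intuition on the final combination in kvm_sync_tsc(), a worked example
with illustrative numbers (not measurements from the patch): let the true
offset be O = tsc{base} - tsc{new} = 5000 cycles, and let each cross-CPU
measurement incur a symmetric latency of L = 300 cycles. The delta1 samples
then cluster around O + L = 5300 and the delta2 samples around -O + L =
-4700, so

	accumulator = 5300 - (-4700) = 10000
	accumulator / 2 = 5000 = O

and the latency term cancels exactly. Any asymmetry between the two
directions survives as residual error, which is why the outlier trimming
matters: a single SMI-lengthened sample would otherwise skew one direction's
average and break the cancellation.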