diff mbox series

[3/3] KVM: selftests: Print summary stats of memory latency distribution

Message ID 20221115173258.2530923-4-coltonlewis@google.com (mailing list archive)
State New, archived
Headers show
Series Calculate memory access latency stats | expand

Commit Message

Colton Lewis Nov. 15, 2022, 5:32 p.m. UTC
Print summary stats of the memory latency distribution in
nanoseconds. For every iteration, this prints the minimum, the
maximum, and the 50th, 90th, and 99th percentiles.

Stats are calculated by sorting the samples taken from all vcpus and
picking from the index corresponding with each percentile.

The conversion to nanoseconds needs the frequency of the Intel
timestamp counter, which is estimated by reading the counter before
and after sleeping for 1 second. This is not a pretty trick, but it
also exists in vmx_nested_tsc_scaling_test.c

Signed-off-by: Colton Lewis <coltonlewis@google.com>
---
 .../selftests/kvm/dirty_log_perf_test.c       |  2 +
 .../selftests/kvm/include/perf_test_util.h    |  2 +
 .../selftests/kvm/lib/perf_test_util.c        | 62 +++++++++++++++++++
 3 files changed, 66 insertions(+)

Comments

Ricardo Koller Jan. 17, 2023, 8:45 p.m. UTC | #1
On Tue, Nov 15, 2022 at 05:32:58PM +0000, Colton Lewis wrote:
> Print summary stats of the memory latency distribution in
> nanoseconds. For every iteration, this prints the minimum, the
> maximum, and the 50th, 90th, and 99th percentiles.
> 
> Stats are calculated by sorting the samples taken from all vcpus and
> picking from the index corresponding with each percentile.
> 
> The conversion to nanoseconds needs the frequency of the Intel
> timestamp counter, which is estimated by reading the counter before
> and after sleeping for 1 second. This is not a pretty trick, but it
> also exists in vmx_nested_tsc_scaling_test.c
> 
> Signed-off-by: Colton Lewis <coltonlewis@google.com>
> ---
>  .../selftests/kvm/dirty_log_perf_test.c       |  2 +
>  .../selftests/kvm/include/perf_test_util.h    |  2 +
>  .../selftests/kvm/lib/perf_test_util.c        | 62 +++++++++++++++++++
>  3 files changed, 66 insertions(+)
> 
> diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c
> index 202f38a72851..2bc066bba460 100644
> --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c
> +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c
> @@ -274,6 +274,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
>  	ts_diff = timespec_elapsed(start);
>  	pr_info("Populate memory time: %ld.%.9lds\n",
>  		ts_diff.tv_sec, ts_diff.tv_nsec);
> +	perf_test_print_percentiles(vm, nr_vcpus);
>  
>  	/* Enable dirty logging */
>  	clock_gettime(CLOCK_MONOTONIC, &start);
> @@ -304,6 +305,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
>  		vcpu_dirty_total = timespec_add(vcpu_dirty_total, ts_diff);
>  		pr_info("Iteration %d dirty memory time: %ld.%.9lds\n",
>  			iteration, ts_diff.tv_sec, ts_diff.tv_nsec);
> +		perf_test_print_percentiles(vm, nr_vcpus);
>  
>  		clock_gettime(CLOCK_MONOTONIC, &start);
>  		get_dirty_log(vm, bitmaps, p->slots);
> diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h
> index 3d0b75ea866a..ca378c262f12 100644
> --- a/tools/testing/selftests/kvm/include/perf_test_util.h
> +++ b/tools/testing/selftests/kvm/include/perf_test_util.h
> @@ -47,6 +47,8 @@ struct perf_test_args {
>  
>  extern struct perf_test_args perf_test_args;
>  
> +void perf_test_print_percentiles(struct kvm_vm *vm, int nr_vcpus);
> +
>  struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int nr_vcpus,
>  				   uint64_t vcpu_memory_bytes, int slots,
>  				   enum vm_mem_backing_src_type backing_src,
> diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c
> index 0311da76bae0..927d22421f7c 100644
> --- a/tools/testing/selftests/kvm/lib/perf_test_util.c
> +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c
> @@ -115,6 +115,68 @@ void perf_test_guest_code(uint32_t vcpu_idx)
>  	}
>  }
>  
> +#if defined(__x86_64__)
> +/* This could be determined with the right sequence of cpuid
> + * instructions, but that's oddly complicated.
> + */
> +static uint64_t perf_test_intel_timer_frequency(void)
> +{
> +	uint64_t count_before;
> +	uint64_t count_after;
> +	uint64_t measured_freq;
> +	uint64_t adjusted_freq;
> +
> +	count_before = perf_test_timer_read();
> +	sleep(1);
> +	count_after = perf_test_timer_read();
> +
> +	/* Using 1 second implies our units are in Hz already. */
> +	measured_freq = count_after - count_before;
> +	/* Truncate to the nearest MHz. Clock frequencies are round numbers. */
> +	adjusted_freq = measured_freq / 1000000 * 1000000;
> +
> +	return adjusted_freq;
> +}
> +#endif
> +
> +static double perf_test_cycles_to_ns(double cycles)
> +{
> +#if defined(__aarch64__)
> +	return cycles * (1e9 / timer_get_cntfrq());
> +#elif defined(__x86_64__)
> +	static uint64_t timer_frequency;
> +
> +	if (timer_frequency == 0)
> +		timer_frequency = perf_test_intel_timer_frequency();
> +
> +	return cycles * (1e9 / timer_frequency);
> +#else
> +#warn __func__ " is not implemented for this architecture, will return 0"
> +	return 0.0;
> +#endif
> +}
> +
> +/* compare function for qsort */
> +static int perf_test_qcmp(const void *a, const void *b)
> +{
> +	return *(int *)a - *(int *)b;
> +}
> +
> +void perf_test_print_percentiles(struct kvm_vm *vm, int nr_vcpus)
> +{
> +	uint64_t n_samples = nr_vcpus * SAMPLES_PER_VCPU;
> +
> +	sync_global_from_guest(vm, latency_samples);
> +	qsort(latency_samples, n_samples, sizeof(uint64_t), &perf_test_qcmp);
> +
> +	pr_info("Latency distribution (ns) = min:%6.0lf, 50th:%6.0lf, 90th:%6.0lf, 99th:%6.0lf, max:%6.0lf\n",
> +		perf_test_cycles_to_ns((double)latency_samples[0]),
> +		perf_test_cycles_to_ns((double)latency_samples[n_samples / 2]),
> +		perf_test_cycles_to_ns((double)latency_samples[n_samples * 9 / 10]),
> +		perf_test_cycles_to_ns((double)latency_samples[n_samples * 99 / 100]),
> +		perf_test_cycles_to_ns((double)latency_samples[n_samples - 1]));
> +}

Latency distribution (ns) = min:   732, 50th:   792, 90th:   901, 99th:
                                ^^^
nit: would prefer to avoid the spaces 

> +
>  void perf_test_setup_vcpus(struct kvm_vm *vm, int nr_vcpus,
>  			   struct kvm_vcpu *vcpus[],
>  			   uint64_t vcpu_memory_bytes,
> -- 
> 2.38.1.431.g37b22c650d-goog
>
Sean Christopherson Jan. 18, 2023, 4:43 p.m. UTC | #2
On Tue, Nov 15, 2022, Colton Lewis wrote:
> Print summary stats of the memory latency distribution in
> nanoseconds. For every iteration, this prints the minimum, the
> maximum, and the 50th, 90th, and 99th percentiles.
> 
> Stats are calculated by sorting the samples taken from all vcpus and
> picking from the index corresponding with each percentile.
> 
> The conversion to nanoseconds needs the frequency of the Intel
> timestamp counter, which is estimated by reading the counter before
> and after sleeping for 1 second. This is not a pretty trick, but it
> also exists in vmx_nested_tsc_scaling_test.c

This test shouldn't need to guesstimate the frequency, just use a VM-scoped
KVM_GET_TSC_KHZ, which will provide KVM's default TSC frequency, i.e. the host
frequency.  For hardware with a constant TSC, which is everything modern, that
will be as accurate as we can get.  For hardware without a constant TSC, well, buy
new hardware :-)

vmx_nested_tsc_scaling_test.c does the weird sleep() behavior because it's trying
to validate that the guest TSC counts at the correct rate, i.e. it is validating
KVM_GET_TSC_KHZ to some extent, and so obviously doesn't fully trust its result.
   
For tests that just want to measure time, there's no reason not to trust KVM.
diff mbox series

Patch

diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c
index 202f38a72851..2bc066bba460 100644
--- a/tools/testing/selftests/kvm/dirty_log_perf_test.c
+++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c
@@ -274,6 +274,7 @@  static void run_test(enum vm_guest_mode mode, void *arg)
 	ts_diff = timespec_elapsed(start);
 	pr_info("Populate memory time: %ld.%.9lds\n",
 		ts_diff.tv_sec, ts_diff.tv_nsec);
+	perf_test_print_percentiles(vm, nr_vcpus);
 
 	/* Enable dirty logging */
 	clock_gettime(CLOCK_MONOTONIC, &start);
@@ -304,6 +305,7 @@  static void run_test(enum vm_guest_mode mode, void *arg)
 		vcpu_dirty_total = timespec_add(vcpu_dirty_total, ts_diff);
 		pr_info("Iteration %d dirty memory time: %ld.%.9lds\n",
 			iteration, ts_diff.tv_sec, ts_diff.tv_nsec);
+		perf_test_print_percentiles(vm, nr_vcpus);
 
 		clock_gettime(CLOCK_MONOTONIC, &start);
 		get_dirty_log(vm, bitmaps, p->slots);
diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h
index 3d0b75ea866a..ca378c262f12 100644
--- a/tools/testing/selftests/kvm/include/perf_test_util.h
+++ b/tools/testing/selftests/kvm/include/perf_test_util.h
@@ -47,6 +47,8 @@  struct perf_test_args {
 
 extern struct perf_test_args perf_test_args;
 
+void perf_test_print_percentiles(struct kvm_vm *vm, int nr_vcpus);
+
 struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int nr_vcpus,
 				   uint64_t vcpu_memory_bytes, int slots,
 				   enum vm_mem_backing_src_type backing_src,
diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c
index 0311da76bae0..927d22421f7c 100644
--- a/tools/testing/selftests/kvm/lib/perf_test_util.c
+++ b/tools/testing/selftests/kvm/lib/perf_test_util.c
@@ -115,6 +115,68 @@  void perf_test_guest_code(uint32_t vcpu_idx)
 	}
 }
 
+#if defined(__x86_64__)
+/* This could be determined with the right sequence of cpuid
+ * instructions, but that's oddly complicated.
+ */
+static uint64_t perf_test_intel_timer_frequency(void)
+{
+	uint64_t count_before;
+	uint64_t count_after;
+	uint64_t measured_freq;
+	uint64_t adjusted_freq;
+
+	count_before = perf_test_timer_read();
+	sleep(1);
+	count_after = perf_test_timer_read();
+
+	/* Using 1 second implies our units are in Hz already. */
+	measured_freq = count_after - count_before;
+	/* Truncate to the nearest MHz. Clock frequencies are round numbers. */
+	adjusted_freq = measured_freq / 1000000 * 1000000;
+
+	return adjusted_freq;
+}
+#endif
+
+static double perf_test_cycles_to_ns(double cycles)
+{
+#if defined(__aarch64__)
+	return cycles * (1e9 / timer_get_cntfrq());
+#elif defined(__x86_64__)
+	static uint64_t timer_frequency;
+
+	if (timer_frequency == 0)
+		timer_frequency = perf_test_intel_timer_frequency();
+
+	return cycles * (1e9 / timer_frequency);
+#else
+#warn __func__ " is not implemented for this architecture, will return 0"
+	return 0.0;
+#endif
+}
+
+/* compare function for qsort */
+static int perf_test_qcmp(const void *a, const void *b)
+{
+	return *(int *)a - *(int *)b;
+}
+
+void perf_test_print_percentiles(struct kvm_vm *vm, int nr_vcpus)
+{
+	uint64_t n_samples = nr_vcpus * SAMPLES_PER_VCPU;
+
+	sync_global_from_guest(vm, latency_samples);
+	qsort(latency_samples, n_samples, sizeof(uint64_t), &perf_test_qcmp);
+
+	pr_info("Latency distribution (ns) = min:%6.0lf, 50th:%6.0lf, 90th:%6.0lf, 99th:%6.0lf, max:%6.0lf\n",
+		perf_test_cycles_to_ns((double)latency_samples[0]),
+		perf_test_cycles_to_ns((double)latency_samples[n_samples / 2]),
+		perf_test_cycles_to_ns((double)latency_samples[n_samples * 9 / 10]),
+		perf_test_cycles_to_ns((double)latency_samples[n_samples * 99 / 100]),
+		perf_test_cycles_to_ns((double)latency_samples[n_samples - 1]));
+}
+
 void perf_test_setup_vcpus(struct kvm_vm *vm, int nr_vcpus,
 			   struct kvm_vcpu *vcpus[],
 			   uint64_t vcpu_memory_bytes,