Message ID | 20220217034947.180935-2-reijiw@google.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [1/2] KVM: selftests: kvm_vm_elf_load() and elfhdr_get() should close fd | expand |
Hi Reiji, First off, thanks for looking into this! Seems like a very useful thing to test :-) On Wed, Feb 16, 2022 at 07:49:47PM -0800, Reiji Watanabe wrote: > Introduce a simple performance test of KVM_GET_ONE_REG/KVM_SET_ONE_REG > for registers that are returned by KVM_GET_REG_LIST. This is a pseudo > process of saving/restoring registers during live migration, and this > test quantifies the performance of the process. > > Signed-off-by: Reiji Watanabe <reijiw@google.com> > --- > tools/testing/selftests/kvm/.gitignore | 1 + > tools/testing/selftests/kvm/Makefile | 1 + > .../selftests/kvm/aarch64/get_set_regs_perf.c | 456 ++++++++++++++++++ Without having looked at the diff yet, I wonder if this can be done for multiple architectures, as the whole get/set reg UAPI is used by several other architectures. > 3 files changed, 458 insertions(+) > create mode 100644 tools/testing/selftests/kvm/aarch64/get_set_regs_perf.c > > diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore > index dce7de7755e6..b6b18aaf9322 100644 > --- a/tools/testing/selftests/kvm/.gitignore > +++ b/tools/testing/selftests/kvm/.gitignore > @@ -2,6 +2,7 @@ > /aarch64/arch_timer > /aarch64/debug-exceptions > /aarch64/get-reg-list > +/aarch64/get_set_regs_perf > /aarch64/psci_cpu_on_test > /aarch64/vgic_init > /aarch64/vgic_irq > diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile > index 0e4926bc9a58..46a28ce89002 100644 > --- a/tools/testing/selftests/kvm/Makefile > +++ b/tools/testing/selftests/kvm/Makefile > @@ -103,6 +103,7 @@ TEST_GEN_PROGS_x86_64 += system_counter_offset_test > TEST_GEN_PROGS_aarch64 += aarch64/arch_timer > TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions > TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list > +TEST_GEN_PROGS_aarch64 += aarch64/get_set_regs_perf > TEST_GEN_PROGS_aarch64 += aarch64/psci_cpu_on_test > TEST_GEN_PROGS_aarch64 += aarch64/vgic_init > TEST_GEN_PROGS_aarch64 += aarch64/vgic_irq > diff --git a/tools/testing/selftests/kvm/aarch64/get_set_regs_perf.c b/tools/testing/selftests/kvm/aarch64/get_set_regs_perf.c > new file mode 100644 > index 000000000000..27944ccce4a1 > --- /dev/null > +++ b/tools/testing/selftests/kvm/aarch64/get_set_regs_perf.c > @@ -0,0 +1,456 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* > + * get_set_regs_perf - KVM_GET_ONE_REG/KVM_SET_ONE_REG performance test > + * > + * The test runs performance test of saving registers that are returned > + * by KVM_GET_REG_LIST for vCPUs in a VM and restoring them for vCPUs > + * in another VM (a part of behavior during blackout on live migration). > + * > + * Copyright (c) 2022, Google LLC. > + */ > + > +#define _GNU_SOURCE > + > +#include <stdio.h> > +#include <stdlib.h> > +#include <stdint.h> > +#include <unistd.h> > +#include <time.h> > +#include <pthread.h> > + > +#include "kvm_util.h" > +#include "processor.h" > + > +#define N2USEC(nsec) ((nsec)/1000) How about defining: #define NSEC_PER_USEC 1000L and explicitly doing division inline? This is consistent with the kernel as well. > +#define REG_STR_LEN 64 > + > +struct reg_val { > + /* Big enough buffer even for the longest vector register */ > + uint64_t val[32]; > +}; > + > +struct thread_data { > + uint32_t vcpuid; > + bool is_set_reg; > + volatile bool *start; > + struct kvm_vm *vm; > + struct timespec time; > + struct kvm_reg_list *reg_list; > + > + /* > + * Pointer to an array of reg_val, which will hold a value of each > + * register. The number of elements in it is indicated in reg_list.n. > + */ > + struct reg_val *reg_vals; > + > + /* > + * Pointer to an array of timespec, which will hold per register > + * latency of KVM_GET_ONE_REG or KVM_SET_ONE_REG. The number of > + * elements in it is indicated in reg_list.n. > + */ > + struct timespec *reg_time; > +}; > + > +pthread_t threads[KVM_MAX_VCPUS]; > +struct thread_data thread_args[KVM_MAX_VCPUS]; > + > +struct sys_reg_params { > + u8 Op0; > + u8 Op1; > + u8 CRn; > + u8 CRm; > + u8 Op2; > +}; > + > +static int regid_to_params(u64 id, struct sys_reg_params *params) > +{ > + switch (id & KVM_REG_SIZE_MASK) { > + case KVM_REG_SIZE_U64: > + /* Any unused index bits means it's not valid. */ > + if (id & ~(KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK > + | KVM_REG_ARM_COPROC_MASK > + | KVM_REG_ARM64_SYSREG_OP0_MASK > + | KVM_REG_ARM64_SYSREG_OP1_MASK > + | KVM_REG_ARM64_SYSREG_CRN_MASK > + | KVM_REG_ARM64_SYSREG_CRM_MASK > + | KVM_REG_ARM64_SYSREG_OP2_MASK)) > + return -1; > + params->Op0 = ((id & KVM_REG_ARM64_SYSREG_OP0_MASK) > + >> KVM_REG_ARM64_SYSREG_OP0_SHIFT); > + params->Op1 = ((id & KVM_REG_ARM64_SYSREG_OP1_MASK) > + >> KVM_REG_ARM64_SYSREG_OP1_SHIFT); > + params->CRn = ((id & KVM_REG_ARM64_SYSREG_CRN_MASK) > + >> KVM_REG_ARM64_SYSREG_CRN_SHIFT); > + params->CRm = ((id & KVM_REG_ARM64_SYSREG_CRM_MASK) > + >> KVM_REG_ARM64_SYSREG_CRM_SHIFT); > + params->Op2 = ((id & KVM_REG_ARM64_SYSREG_OP2_MASK) > + >> KVM_REG_ARM64_SYSREG_OP2_SHIFT); > + return 0; > + default: > + return -1; > + } > +} > + > +static void reg_id_to_str(uint64_t id, char *str, int len) > +{ > + struct sys_reg_params p; > + > + TEST_ASSERT((id & KVM_REG_ARCH_MASK) == KVM_REG_ARM64, > + "Not KVM_REG_ARM64 register (0x%lx)", id); > + > + switch (id & KVM_REG_ARM_COPROC_MASK) { > + case KVM_REG_ARM_CORE: > + snprintf(str, len, "CORE(id:0x%lx)", id); > + break; > + case KVM_REG_ARM64_SVE: > + snprintf(str, len, "SVE(id:0x%lx)", id); > + break; > + case KVM_REG_ARM_DEMUX: > + snprintf(str, len, "DEMUX(id:0x%lx)", id); > + break; > + case KVM_REG_ARM_FW: > + snprintf(str, len, "FW_REG_%ld", id & 0xffff); > + break; > + case KVM_REG_ARM64_SYSREG: > + if (regid_to_params(id, &p) == 0) > + snprintf(str, len, "SYSREG(%d,%d,%d,%d,%d)", > + p.Op0, p.Op1, p.CRn, p.CRm, p.Op2); > + else > + snprintf(str, len, "SYSREG(id:0x%lx)", id); > + break; > + default: > + snprintf(str, len, "KVM_REG_ARM64(id:0x%lx)", id); > + } > +} I see that print_reg() in aarch64/get-reg-list.c implements most of this. Would it be possible to hoist that into a shared helper function? > +/* > + * Save or restore registers that were saved by get_regs_vcpu_thread with > + * KVM_GET_ONE_REG or KVM_SET_ONE_REG. Measure time taken by this process. > + */ > +static void *vcpu_thread(void *data) > +{ > + int i, ret; > + struct thread_data *arg = data; > + struct kvm_reg_list *reg_list = arg->reg_list; > + bool is_set_reg = arg->is_set_reg; > + unsigned int cmd = is_set_reg ? KVM_SET_ONE_REG : KVM_GET_ONE_REG; > + char *cmd_str = is_set_reg ? "SET_ONE_REG" : "GET_ONE_REG"; Why not use vcpu_{get,set}_reg() instead? They already have informative assertions that print the name of the ioctl and the name of the errno. > + struct timespec ts_reg_diff, ts_reg_start; > + struct kvm_one_reg reg; > + > + memset(&arg->time, 0, sizeof(arg->time)); > + > + /* Wait until start is set */ > + while (!(*arg->start)) > + ; Have you considered using a pthread_barrier for this? > + /* Warm up the cache. */ > + reg.id = reg_list->reg[0]; > + reg.addr = (uint64_t)&arg->reg_vals[0]; > + ret = _vcpu_ioctl(arg->vm, arg->vcpuid, cmd, ®); > + TEST_ASSERT(!ret, "%s(id:0x%llx) failed\n", cmd_str, reg.id); I wouldn't expect VMMs to do this before starting migration. Do you think it would be better to replicate the behavior of typical userspace? > + pr_debug("%s[%d] start\n", __func__, arg->vcpuid); > + > + /* Run KVM_SET_ONE_REG or KVM_GET_ONE_REG for registers in reg_list */ > + for (i = 0; i < reg_list->n; i++) { > + clock_gettime(CLOCK_MONOTONIC, &ts_reg_start); > + > + reg.id = reg_list->reg[i]; > + reg.addr = (uint64_t)&arg->reg_vals[i]; > + ret = _vcpu_ioctl(arg->vm, arg->vcpuid, cmd, ®); > + > + ts_reg_diff = timespec_elapsed(ts_reg_start); > + arg->time = timespec_add(arg->time, ts_reg_diff); > + if (arg->reg_time) > + /* Need per register latency separately */ > + arg->reg_time[i] = ts_reg_diff; > + > + TEST_ASSERT(!ret, "%s(id:0x%llx) failed\n", cmd_str, reg.id); > + pr_debug("%s: id:0x%lx, val:0x%lx\n", cmd_str, (uint64_t)reg.id, > + arg->reg_vals[i].val[0]); > + } > + > + return NULL; > +} > + > +void aggregate_time(uint64_t cnt, struct timespec *total, struct timespec *data) > +{ > + uint64_t i; > + > + for (i = 0; i < cnt; i++) > + total[i] = timespec_add(total[i], data[i]); > +} > + > +struct result_timespec { > + struct timespec src; > + struct timespec dst; > + struct timespec *src_per_reg; > + struct timespec *dst_per_reg; > +}; > + > +/* > + * Run KVM_GET_ONE_REG/KVM_SET_ONE_REG performance test. > + * Create a VM with the given number of vCPUs, create a thread for each > + * vCPU in the VM, and wait until all the threads complete KVM_GET_ONE_REG > + * for all the registers KVM_GET_REG_LIST returns. Then repeat the same thing > + * with KVM_SET_ONE_REG. Aggregate the time spent by each thread on > + * KVM_GET_ONE_REG and KVM_SET_ONE_REG, and return the vCPU average value to > + * the caller via @res. > + */ > +void run_test_one(int nvcpus, struct kvm_reg_list *reg_list, > + struct result_timespec *res) > +{ > + bool per_reg_data = res->src_per_reg ? true : false; > + uint64_t nregs = reg_list->n; > + volatile bool start = false; > + int i, ret; > + struct kvm_vm *src_vm, *dst_vm; > + struct thread_data *targ; > + struct timespec src_sum = (struct timespec){0}; > + struct timespec dst_sum = (struct timespec){0}; Is the cast required? > + if (per_reg_data) { > + /* Reset the buffers to save the per register result */ > + memset(res->src_per_reg, 0, nregs * sizeof(struct timespec)); > + memset(res->dst_per_reg, 0, nregs * sizeof(struct timespec)); > + } > + > + /* Create VMs to save/restore registers */ > + src_vm = vm_create_default_with_vcpus(nvcpus, 0, 0, NULL, NULL); > + dst_vm = vm_create_default_with_vcpus(nvcpus, 0, 0, NULL, NULL); > + > + /* Start saving registers process for src_vm */ > + > + /* Create source VM's vCPU threads */ > + for (i = 0; i < nvcpus; i++) { > + targ = &thread_args[i]; > + targ->vm = src_vm; > + targ->vcpuid = i; > + targ->is_set_reg = false; > + targ->start = &start; > + targ->reg_list = reg_list; > + targ->reg_vals = calloc(nregs, sizeof(struct reg_val)); > + TEST_ASSERT(targ->reg_vals, "Failed to allocate reg_vals"); > + if (per_reg_data) { > + /* Per register result buffer for the vCPU */ > + targ->reg_time = calloc(nregs, sizeof(struct timespec)); > + TEST_ASSERT(targ->reg_time, "Failed to allocate reg_time"); > + } > + ret = pthread_create(&threads[i], NULL, vcpu_thread, targ); > + TEST_ASSERT(!ret, "pthread_create failed: %d\n", ret); > + } > + > + /* > + * Let threads start saving registers for vCPUs, and wait for all > + * threads to complete restoring registers. > + */ > + start = true; > + for (i = 0; i < nvcpus; i++) { > + targ = &thread_args[i]; > + ret = pthread_join(threads[i], NULL); > + TEST_ASSERT(!ret, "pthread_join failed: %d\n", ret); > + src_sum = timespec_add(src_sum, targ->time); I think it would make more sense to measure wall time instead of CPU time here. It may not be immediately obvious when running this test why the reported time scales as the number of vCPUs increases. > + if (per_reg_data) > + aggregate_time(nregs, res->src_per_reg, targ->reg_time); > + } > + > + start = false; > + pr_debug("%s Saving registers completed.\n", __func__); > + > + /* Start restoring registers process for dst_vm */ > + > + /* Create destination VM's vCPU threads */ > + for (i = 0; i < nvcpus; i++) { > + targ = &thread_args[i]; > + /* Update fields that are different from the src case */ > + targ->vm = dst_vm; > + targ->is_set_reg = true; > + ret = pthread_create(&threads[i], NULL, vcpu_thread, targ); > + TEST_ASSERT(!ret, "pthread_create failed: %d\n", ret); > + } > + > + /* > + * Let threads start saving registers for vCPUs, and wait for all > + * threads to complete restoring registers. > + */ > + start = true; > + for (i = 0; i < nvcpus; i++) { > + targ = &thread_args[i]; > + ret = pthread_join(threads[i], NULL); > + TEST_ASSERT(!ret, "pthread_join failed: %d\n", ret); > + > + free(targ->reg_vals); > + dst_sum = timespec_add(dst_sum, targ->time); > + if (per_reg_data) { > + aggregate_time(nregs, res->dst_per_reg, targ->reg_time); > + free(targ->reg_time); > + } > + } > + > + kvm_vm_free(src_vm); > + kvm_vm_free(dst_vm); > + > + pr_debug("%s Restoring registers completed.\n", __func__); > + > + /* Calculate the vCPU average */ > + res->src = timespec_div(src_sum, nvcpus); > + res->dst = timespec_div(dst_sum, nvcpus); > + if (!per_reg_data) > + return; > + > + /* Calculate the vCPU average for each register */ > + for (i = 0; i < nregs; i++) { > + res->src_per_reg[i] = timespec_div(res->src_per_reg[i], nvcpus); > + res->dst_per_reg[i] = timespec_div(res->dst_per_reg[i], nvcpus); > + } > +} > + > +/* > + * Run saving/restoring vCPU registers (KVM_GET_ONE_REG/KVM_SET_ONE_REG) > + * performance test with the given number of vCPUs (@nvcpus) for the given > + * number (@iterations) of times. > + * Each iteration of the test returns the latency of saving/restoring all > + * the registers that KVM_GET_REG_LIST returns (See comments for > + * run_test_one() for more details of each iteration of the test). > + * Print the average latency of each iteration, and if @per_reg_result is set, > + * print the average latency for each register as well. > + */ > +void run_test(int nvcpus, int iterations, bool per_reg_result) > +{ > + int i, j; > + uint64_t nregs; > + struct result_timespec res = {0}; > + struct timespec src_sum = {0}; > + struct timespec dst_sum = {0}; > + struct timespec avg, src_avg, dst_avg; > + struct timespec *src_per_reg_sum, *dst_per_reg_sum; > + struct kvm_vm *vm; > + struct kvm_reg_list *reg_list; > + char reg_str[REG_STR_LEN]; > + > + /* Get kvm_reg_list */ > + vm = vm_create_default(0, 0, NULL); > + reg_list = vcpu_get_reg_list(vm, 0); Would it make sense to test some opt-in capabilities that expose additional registers (PMU, SVE, etc.)? > + kvm_vm_free(vm); > + nregs = reg_list->n; > + > + if (per_reg_result) { > + /* > + * Allocate buffers to save latency of KVM_GET_ONE_REG > + * and KVM_SET_ONE_REG respectively. > + */ > + > + /* Buffers for each iteration */ > + res.src_per_reg = calloc(nregs, sizeof(struct timespec)); > + res.dst_per_reg = calloc(nregs, sizeof(struct timespec)); > + > + /* Buffers for the sum */ > + src_per_reg_sum = calloc(nregs, sizeof(struct timespec)); > + dst_per_reg_sum = calloc(nregs, sizeof(struct timespec)); > + > + TEST_ASSERT(res.src_per_reg && res.dst_per_reg && > + src_per_reg_sum && dst_per_reg_sum, > + "Failed to allocate per register time buffer"); > + } > + > + pr_info("Iterations %d (vCPUs %d, regs %lld): ", > + iterations, nvcpus, reg_list->n); > + > + for (i = 0; i < iterations; i++) { > + /* Run the test */ > + run_test_one(nvcpus, reg_list, &res); > + > + src_sum = timespec_add(src_sum, res.src); > + dst_sum = timespec_add(dst_sum, res.dst); > + > + if (!per_reg_result) > + continue; > + > + /* Aggregate per register result */ > + for (j = 0; j < nregs; j++) { > + src_per_reg_sum[j] = timespec_add(src_per_reg_sum[j], > + res.src_per_reg[j]); > + dst_per_reg_sum[j] = timespec_add(dst_per_reg_sum[j], > + res.dst_per_reg[j]); > + } > + } > + > + /* Calculate the iteration average */ > + src_avg = timespec_div(src_sum, iterations); > + dst_avg = timespec_div(dst_sum, iterations); > + avg = timespec_add(src_avg, dst_avg); > + > + /* Print the average */ > + if (avg.tv_sec > 0) { > + pr_info("get+set %ld.%.9lds (get %ld.%.9lds, set %ld.%.9lds)\n", > + avg.tv_sec, avg.tv_nsec, > + src_avg.tv_sec, src_avg.tv_nsec, > + dst_avg.tv_sec, dst_avg.tv_nsec); > + } else { > + pr_info("get+set %ldus (get %ldus, set %ldus)\n", > + N2USEC(avg.tv_nsec), N2USEC(src_avg.tv_nsec), > + N2USEC(dst_avg.tv_nsec)); > + } > + > + /* Print per register result when requested */ > + if (per_reg_result) { > + pr_info("Per Register Result:\n"); > + pr_info(" %-32s %13s %9s %9s\n", "Register", "get+set(ns)", "get(ns)", "set(ns)"); > + for (i = 0; i < nregs; i++) { > + /* Calculate the average for the register */ > + src_avg = timespec_div(src_per_reg_sum[i], iterations); > + dst_avg = timespec_div(dst_per_reg_sum[i], iterations); > + avg = timespec_add(src_avg, dst_avg); > + > + assert(avg.tv_sec == 0); > + reg_id_to_str(reg_list->reg[i], reg_str, REG_STR_LEN); > + pr_info(" %-32s %13ld %9ld %9ld\n", reg_str, > + avg.tv_nsec, src_avg.tv_nsec, dst_avg.tv_nsec); > + } Do you think it would be useful to optionally dump this granular data into a machine-parseable format (like CSV)? It could be helpful for analysis. We have several hundred exposed registers (and always growing!) and grokking stdout might be a bit difficult. > + free(res.src_per_reg); > + free(res.dst_per_reg); > + free(src_per_reg_sum); > + free(dst_per_reg_sum); > + } > + > + free(reg_list); > +} > + > +int main(int argc, char *argv[]) > +{ > + int opt; > + int nvcpus = 1; > + int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); > + int iterations = 1000; > + bool per_reg_result = false; > + > + while ((opt = getopt(argc, argv, "hri:v:")) != -1) { > + switch (opt) { > + case 'i': > + iterations = atoi(optarg); > + break; > + case 'v': > + nvcpus = atoi(optarg); > + TEST_ASSERT(nvcpus > 0 && nvcpus <= max_vcpus, > + "Number of vcpus, must be between 1 and %d", > + max_vcpus); > + break; > + case 'r': > + per_reg_result = true; > + break; > + case 'h': > + default: > + printf("usage: %s [-h] [-v vcpus] [-i iterations]\n", > + argv[0]); > + exit(0); > + } > + } > + > + setbuf(stdout, NULL); > + > + run_test(nvcpus, iterations, per_reg_result); > + > + return 0; > +} > -- > 2.35.1.473.g83b2b277ed-goog > -- Best, Oliver
On Thu, 17 Feb 2022 04:52:10 +0000, Oliver Upton <oupton@google.com> wrote: > > Hi Reiji, > > First off, thanks for looking into this! Seems like a very useful thing > to test :-) > > On Wed, Feb 16, 2022 at 07:49:47PM -0800, Reiji Watanabe wrote: > > Introduce a simple performance test of KVM_GET_ONE_REG/KVM_SET_ONE_REG > > for registers that are returned by KVM_GET_REG_LIST. This is a pseudo > > process of saving/restoring registers during live migration, and this > > test quantifies the performance of the process. > > > > Signed-off-by: Reiji Watanabe <reijiw@google.com> > > --- > > tools/testing/selftests/kvm/.gitignore | 1 + > > tools/testing/selftests/kvm/Makefile | 1 + > > .../selftests/kvm/aarch64/get_set_regs_perf.c | 456 ++++++++++++++++++ > [...] > Would it make sense to test some opt-in capabilities that expose > additional registers (PMU, SVE, etc.)? I think this is important. System registers are usually saved/restored in groups, and due to the way we walk the sysreg array, timings are unlikely to be uniform. Getting a grip on that could help restructure the walking if required (either per-group arrays, or maybe a tree structure). Note that all of this could equally apply to the guest trapping (the walk is the same). And yes, there are a lot of commonalities with get-reg-list, so reusing some of the existing infrastructure would be a good thing. M.
Hi Oliver, On Wed, Feb 16, 2022 at 8:52 PM Oliver Upton <oupton@google.com> wrote: > > Hi Reiji, > > First off, thanks for looking into this! Seems like a very useful thing > to test :-) > > On Wed, Feb 16, 2022 at 07:49:47PM -0800, Reiji Watanabe wrote: > > Introduce a simple performance test of KVM_GET_ONE_REG/KVM_SET_ONE_REG > > for registers that are returned by KVM_GET_REG_LIST. This is a pseudo > > process of saving/restoring registers during live migration, and this > > test quantifies the performance of the process. > > > > Signed-off-by: Reiji Watanabe <reijiw@google.com> > > --- > > tools/testing/selftests/kvm/.gitignore | 1 + > > tools/testing/selftests/kvm/Makefile | 1 + > > .../selftests/kvm/aarch64/get_set_regs_perf.c | 456 ++++++++++++++++++ > > Without having looked at the diff yet, I wonder if this can be done for > multiple architectures, as the whole get/set reg UAPI is used by several > other architectures. > > > 3 files changed, 458 insertions(+) > > create mode 100644 tools/testing/selftests/kvm/aarch64/get_set_regs_perf.c > > > > diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore > > index dce7de7755e6..b6b18aaf9322 100644 > > --- a/tools/testing/selftests/kvm/.gitignore > > +++ b/tools/testing/selftests/kvm/.gitignore > > @@ -2,6 +2,7 @@ > > /aarch64/arch_timer > > /aarch64/debug-exceptions > > /aarch64/get-reg-list > > +/aarch64/get_set_regs_perf > > /aarch64/psci_cpu_on_test > > /aarch64/vgic_init > > /aarch64/vgic_irq > > diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile > > index 0e4926bc9a58..46a28ce89002 100644 > > --- a/tools/testing/selftests/kvm/Makefile > > +++ b/tools/testing/selftests/kvm/Makefile > > @@ -103,6 +103,7 @@ TEST_GEN_PROGS_x86_64 += system_counter_offset_test > > TEST_GEN_PROGS_aarch64 += aarch64/arch_timer > > TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions > > TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list > > +TEST_GEN_PROGS_aarch64 += aarch64/get_set_regs_perf > > TEST_GEN_PROGS_aarch64 += aarch64/psci_cpu_on_test > > TEST_GEN_PROGS_aarch64 += aarch64/vgic_init > > TEST_GEN_PROGS_aarch64 += aarch64/vgic_irq > > diff --git a/tools/testing/selftests/kvm/aarch64/get_set_regs_perf.c b/tools/testing/selftests/kvm/aarch64/get_set_regs_perf.c > > new file mode 100644 > > index 000000000000..27944ccce4a1 > > --- /dev/null > > +++ b/tools/testing/selftests/kvm/aarch64/get_set_regs_perf.c > > @@ -0,0 +1,456 @@ > > +// SPDX-License-Identifier: GPL-2.0-only > > +/* > > + * get_set_regs_perf - KVM_GET_ONE_REG/KVM_SET_ONE_REG performance test > > + * > > + * The test runs performance test of saving registers that are returned > > + * by KVM_GET_REG_LIST for vCPUs in a VM and restoring them for vCPUs > > + * in another VM (a part of behavior during blackout on live migration). > > + * > > + * Copyright (c) 2022, Google LLC. > > + */ > > + > > +#define _GNU_SOURCE > > + > > +#include <stdio.h> > > +#include <stdlib.h> > > +#include <stdint.h> > > +#include <unistd.h> > > +#include <time.h> > > +#include <pthread.h> > > + > > +#include "kvm_util.h" > > +#include "processor.h" > > + > > +#define N2USEC(nsec) ((nsec)/1000) > > How about defining: > > #define NSEC_PER_USEC 1000L > > and explicitly doing division inline? This is consistent with the kernel > as well. Sure, that sounds good. > > +#define REG_STR_LEN 64 > > + > > +struct reg_val { > > + /* Big enough buffer even for the longest vector register */ > > + uint64_t val[32]; > > +}; > > + > > +struct thread_data { > > + uint32_t vcpuid; > > + bool is_set_reg; > > + volatile bool *start; > > + struct kvm_vm *vm; > > + struct timespec time; > > + struct kvm_reg_list *reg_list; > > + > > + /* > > + * Pointer to an array of reg_val, which will hold a value of each > > + * register. The number of elements in it is indicated in reg_list.n. > > + */ > > + struct reg_val *reg_vals; > > + > > + /* > > + * Pointer to an array of timespec, which will hold per register > > + * latency of KVM_GET_ONE_REG or KVM_SET_ONE_REG. The number of > > + * elements in it is indicated in reg_list.n. > > + */ > > + struct timespec *reg_time; > > +}; > > + > > +pthread_t threads[KVM_MAX_VCPUS]; > > +struct thread_data thread_args[KVM_MAX_VCPUS]; > > + > > +struct sys_reg_params { > > + u8 Op0; > > + u8 Op1; > > + u8 CRn; > > + u8 CRm; > > + u8 Op2; > > +}; > > + > > +static int regid_to_params(u64 id, struct sys_reg_params *params) > > +{ > > + switch (id & KVM_REG_SIZE_MASK) { > > + case KVM_REG_SIZE_U64: > > + /* Any unused index bits means it's not valid. */ > > + if (id & ~(KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK > > + | KVM_REG_ARM_COPROC_MASK > > + | KVM_REG_ARM64_SYSREG_OP0_MASK > > + | KVM_REG_ARM64_SYSREG_OP1_MASK > > + | KVM_REG_ARM64_SYSREG_CRN_MASK > > + | KVM_REG_ARM64_SYSREG_CRM_MASK > > + | KVM_REG_ARM64_SYSREG_OP2_MASK)) > > + return -1; > > + params->Op0 = ((id & KVM_REG_ARM64_SYSREG_OP0_MASK) > > + >> KVM_REG_ARM64_SYSREG_OP0_SHIFT); > > + params->Op1 = ((id & KVM_REG_ARM64_SYSREG_OP1_MASK) > > + >> KVM_REG_ARM64_SYSREG_OP1_SHIFT); > > + params->CRn = ((id & KVM_REG_ARM64_SYSREG_CRN_MASK) > > + >> KVM_REG_ARM64_SYSREG_CRN_SHIFT); > > + params->CRm = ((id & KVM_REG_ARM64_SYSREG_CRM_MASK) > > + >> KVM_REG_ARM64_SYSREG_CRM_SHIFT); > > + params->Op2 = ((id & KVM_REG_ARM64_SYSREG_OP2_MASK) > > + >> KVM_REG_ARM64_SYSREG_OP2_SHIFT); > > + return 0; > > + default: > > + return -1; > > + } > > +} > > + > > +static void reg_id_to_str(uint64_t id, char *str, int len) > > +{ > > + struct sys_reg_params p; > > + > > + TEST_ASSERT((id & KVM_REG_ARCH_MASK) == KVM_REG_ARM64, > > + "Not KVM_REG_ARM64 register (0x%lx)", id); > > + > > + switch (id & KVM_REG_ARM_COPROC_MASK) { > > + case KVM_REG_ARM_CORE: > > + snprintf(str, len, "CORE(id:0x%lx)", id); > > + break; > > + case KVM_REG_ARM64_SVE: > > + snprintf(str, len, "SVE(id:0x%lx)", id); > > + break; > > + case KVM_REG_ARM_DEMUX: > > + snprintf(str, len, "DEMUX(id:0x%lx)", id); > > + break; > > + case KVM_REG_ARM_FW: > > + snprintf(str, len, "FW_REG_%ld", id & 0xffff); > > + break; > > + case KVM_REG_ARM64_SYSREG: > > + if (regid_to_params(id, &p) == 0) > > + snprintf(str, len, "SYSREG(%d,%d,%d,%d,%d)", > > + p.Op0, p.Op1, p.CRn, p.CRm, p.Op2); > > + else > > + snprintf(str, len, "SYSREG(id:0x%lx)", id); > > + break; > > + default: > > + snprintf(str, len, "KVM_REG_ARM64(id:0x%lx)", id); > > + } > > +} > > I see that print_reg() in aarch64/get-reg-list.c implements most of > this. Would it be possible to hoist that into a shared helper function? Register name formats that are printed by get-reg-list.c are unnecessarily long for some registers like below for this testing, which I don't think is convenient for this testing. KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[0]), KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.sp), Although I don't want to use this long format for this testing, I will look into creating a library function on this so that it can be used by both (or any other testings). > > > +/* > > + * Save or restore registers that were saved by get_regs_vcpu_thread with > > + * KVM_GET_ONE_REG or KVM_SET_ONE_REG. Measure time taken by this process. > > + */ > > +static void *vcpu_thread(void *data) > > +{ > > + int i, ret; > > + struct thread_data *arg = data; > > + struct kvm_reg_list *reg_list = arg->reg_list; > > + bool is_set_reg = arg->is_set_reg; > > + unsigned int cmd = is_set_reg ? KVM_SET_ONE_REG : KVM_GET_ONE_REG; > > + char *cmd_str = is_set_reg ? "SET_ONE_REG" : "GET_ONE_REG"; > > Why not use vcpu_{get,set}_reg() instead? They already have informative > assertions that print the name of the ioctl and the name of the errno. Thank you for the comment ! This comment reminded me of another problem that I wanted to fix:) (Since TEST_ASSERT unconditionally evaluates its arguments, strerror(), which is internally serialized, is called from vcpu_{get,set}_reg()) > > + struct timespec ts_reg_diff, ts_reg_start; > > + struct kvm_one_reg reg; > > + > > + memset(&arg->time, 0, sizeof(arg->time)); > > + > > + /* Wait until start is set */ > > + while (!(*arg->start)) > > + ; > > Have you considered using a pthread_barrier for this? Thank you for the proposal. I will use that. > > > + /* Warm up the cache. */ > > + reg.id = reg_list->reg[0]; > > + reg.addr = (uint64_t)&arg->reg_vals[0]; > > + ret = _vcpu_ioctl(arg->vm, arg->vcpuid, cmd, ®); > > + TEST_ASSERT(!ret, "%s(id:0x%llx) failed\n", cmd_str, reg.id); > > I wouldn't expect VMMs to do this before starting migration. Do you > think it would be better to replicate the behavior of typical userspace? The reason why I added that was because it makes a bit of noise for per register result (and the impact on the whole is not significant). ytbcp26:/tmp# ./get_set_regs_perf -v8 -r| tail -n +4| sort -nk2 |tail -n 3 SYSREG(3,0,0,1,0) 1084 617 467 SYSREG(3,3,14,3,2) 2678 397 2281 CORE(id:0x6030000000100000) 2855 1459 1396 ytbcp26:/tmp# ./get_set_regs_perf -v8 -r| tail -n +4| sort -nk2 |tail -n 3 SYSREG(3,0,0,4,0) 994 461 533 SYSREG(3,0,0,1,0) 1052 582 470 SYSREG(3,3,14,3,2) 2675 396 2279 So, I would prefer to keep that. > > > + pr_debug("%s[%d] start\n", __func__, arg->vcpuid); > > + > > + /* Run KVM_SET_ONE_REG or KVM_GET_ONE_REG for registers in reg_list */ > > + for (i = 0; i < reg_list->n; i++) { > > + clock_gettime(CLOCK_MONOTONIC, &ts_reg_start); > > + > > + reg.id = reg_list->reg[i]; > > + reg.addr = (uint64_t)&arg->reg_vals[i]; > > + ret = _vcpu_ioctl(arg->vm, arg->vcpuid, cmd, ®); > > + > > + ts_reg_diff = timespec_elapsed(ts_reg_start); > > + arg->time = timespec_add(arg->time, ts_reg_diff); > > + if (arg->reg_time) > > + /* Need per register latency separately */ > > + arg->reg_time[i] = ts_reg_diff; > > + > > + TEST_ASSERT(!ret, "%s(id:0x%llx) failed\n", cmd_str, reg.id); > > + pr_debug("%s: id:0x%lx, val:0x%lx\n", cmd_str, (uint64_t)reg.id, > > + arg->reg_vals[i].val[0]); > > + } > > + > > + return NULL; > > +} > > + > > +void aggregate_time(uint64_t cnt, struct timespec *total, struct timespec *data) > > +{ > > + uint64_t i; > > + > > + for (i = 0; i < cnt; i++) > > + total[i] = timespec_add(total[i], data[i]); > > +} > > + > > +struct result_timespec { > > + struct timespec src; > > + struct timespec dst; > > + struct timespec *src_per_reg; > > + struct timespec *dst_per_reg; > > +}; > > + > > +/* > > + * Run KVM_GET_ONE_REG/KVM_SET_ONE_REG performance test. > > + * Create a VM with the given number of vCPUs, create a thread for each > > + * vCPU in the VM, and wait until all the threads complete KVM_GET_ONE_REG > > + * for all the registers KVM_GET_REG_LIST returns. Then repeat the same thing > > + * with KVM_SET_ONE_REG. Aggregate the time spent by each thread on > > + * KVM_GET_ONE_REG and KVM_SET_ONE_REG, and return the vCPU average value to > > + * the caller via @res. > > + */ > > +void run_test_one(int nvcpus, struct kvm_reg_list *reg_list, > > + struct result_timespec *res) > > +{ > > + bool per_reg_data = res->src_per_reg ? true : false; > > + uint64_t nregs = reg_list->n; > > + volatile bool start = false; > > + int i, ret; > > + struct kvm_vm *src_vm, *dst_vm; > > + struct thread_data *targ; > > + struct timespec src_sum = (struct timespec){0}; > > + struct timespec dst_sum = (struct timespec){0}; > > Is the cast required? No, it isn't... I will fix that. > > > + if (per_reg_data) { > > + /* Reset the buffers to save the per register result */ > > + memset(res->src_per_reg, 0, nregs * sizeof(struct timespec)); > > + memset(res->dst_per_reg, 0, nregs * sizeof(struct timespec)); > > + } > > + > > + /* Create VMs to save/restore registers */ > > + src_vm = vm_create_default_with_vcpus(nvcpus, 0, 0, NULL, NULL); > > + dst_vm = vm_create_default_with_vcpus(nvcpus, 0, 0, NULL, NULL); > > + > > + /* Start saving registers process for src_vm */ > > + > > + /* Create source VM's vCPU threads */ > > + for (i = 0; i < nvcpus; i++) { > > + targ = &thread_args[i]; > > + targ->vm = src_vm; > > + targ->vcpuid = i; > > + targ->is_set_reg = false; > > + targ->start = &start; > > + targ->reg_list = reg_list; > > + targ->reg_vals = calloc(nregs, sizeof(struct reg_val)); > > + TEST_ASSERT(targ->reg_vals, "Failed to allocate reg_vals"); > > + if (per_reg_data) { > > + /* Per register result buffer for the vCPU */ > > + targ->reg_time = calloc(nregs, sizeof(struct timespec)); > > + TEST_ASSERT(targ->reg_time, "Failed to allocate reg_time"); > > + } > > + ret = pthread_create(&threads[i], NULL, vcpu_thread, targ); > > + TEST_ASSERT(!ret, "pthread_create failed: %d\n", ret); > > + } > > + > > + /* > > + * Let threads start saving registers for vCPUs, and wait for all > > + * threads to complete restoring registers. > > + */ > > + start = true; > > + for (i = 0; i < nvcpus; i++) { > > + targ = &thread_args[i]; > > + ret = pthread_join(threads[i], NULL); > > + TEST_ASSERT(!ret, "pthread_join failed: %d\n", ret); > > + src_sum = timespec_add(src_sum, targ->time); > > I think it would make more sense to measure wall time instead of CPU > time here. It may not be immediately obvious when running this test why > the reported time scales as the number of vCPUs increases. The result that is going to be shown is vCPU average (I don't want to include the costs incurred by the test code as much as possible). I will consider adding an optional behavior to show the worst result among vCPUs. > > > + if (per_reg_data) > > + aggregate_time(nregs, res->src_per_reg, targ->reg_time); > > + } > > + > > + start = false; > > + pr_debug("%s Saving registers completed.\n", __func__); > > + > > + /* Start restoring registers process for dst_vm */ > > + > > + /* Create destination VM's vCPU threads */ > > + for (i = 0; i < nvcpus; i++) { > > + targ = &thread_args[i]; > > + /* Update fields that are different from the src case */ > > + targ->vm = dst_vm; > > + targ->is_set_reg = true; > > + ret = pthread_create(&threads[i], NULL, vcpu_thread, targ); > > + TEST_ASSERT(!ret, "pthread_create failed: %d\n", ret); > > + } > > + > > + /* > > + * Let threads start saving registers for vCPUs, and wait for all > > + * threads to complete restoring registers. > > + */ > > + start = true; > > + for (i = 0; i < nvcpus; i++) { > > + targ = &thread_args[i]; > > + ret = pthread_join(threads[i], NULL); > > + TEST_ASSERT(!ret, "pthread_join failed: %d\n", ret); > > + > > + free(targ->reg_vals); > > + dst_sum = timespec_add(dst_sum, targ->time); > > + if (per_reg_data) { > > + aggregate_time(nregs, res->dst_per_reg, targ->reg_time); > > + free(targ->reg_time); > > + } > > + } > > + > > + kvm_vm_free(src_vm); > > + kvm_vm_free(dst_vm); > > + > > + pr_debug("%s Restoring registers completed.\n", __func__); > > + > > + /* Calculate the vCPU average */ > > + res->src = timespec_div(src_sum, nvcpus); > > + res->dst = timespec_div(dst_sum, nvcpus); > > + if (!per_reg_data) > > + return; > > + > > + /* Calculate the vCPU average for each register */ > > + for (i = 0; i < nregs; i++) { > > + res->src_per_reg[i] = timespec_div(res->src_per_reg[i], nvcpus); > > + res->dst_per_reg[i] = timespec_div(res->dst_per_reg[i], nvcpus); > > + } > > +} > > + > > +/* > > + * Run saving/restoring vCPU registers (KVM_GET_ONE_REG/KVM_SET_ONE_REG) > > + * performance test with the given number of vCPUs (@nvcpus) for the given > > + * number (@iterations) of times. > > + * Each iteration of the test returns the latency of saving/restoring all > > + * the registers that KVM_GET_REG_LIST returns (See comments for > > + * run_test_one() for more details of each iteration of the test). > > + * Print the average latency of each iteration, and if @per_reg_result is set, > > + * print the average latency for each register as well. > > + */ > > +void run_test(int nvcpus, int iterations, bool per_reg_result) > > +{ > > + int i, j; > > + uint64_t nregs; > > + struct result_timespec res = {0}; > > + struct timespec src_sum = {0}; > > + struct timespec dst_sum = {0}; > > + struct timespec avg, src_avg, dst_avg; > > + struct timespec *src_per_reg_sum, *dst_per_reg_sum; > > + struct kvm_vm *vm; > > + struct kvm_reg_list *reg_list; > > + char reg_str[REG_STR_LEN]; > > + > > + /* Get kvm_reg_list */ > > + vm = vm_create_default(0, 0, NULL); > > + reg_list = vcpu_get_reg_list(vm, 0); > > Would it make sense to test some opt-in capabilities that expose > additional registers (PMU, SVE, etc.)? Yes, I will implement that. > > > + kvm_vm_free(vm); > > + nregs = reg_list->n; > > + > > + if (per_reg_result) { > > + /* > > + * Allocate buffers to save latency of KVM_GET_ONE_REG > > + * and KVM_SET_ONE_REG respectively. > > + */ > > + > > + /* Buffers for each iteration */ > > + res.src_per_reg = calloc(nregs, sizeof(struct timespec)); > > + res.dst_per_reg = calloc(nregs, sizeof(struct timespec)); > > + > > + /* Buffers for the sum */ > > + src_per_reg_sum = calloc(nregs, sizeof(struct timespec)); > > + dst_per_reg_sum = calloc(nregs, sizeof(struct timespec)); > > + > > + TEST_ASSERT(res.src_per_reg && res.dst_per_reg && > > + src_per_reg_sum && dst_per_reg_sum, > > + "Failed to allocate per register time buffer"); > > + } > > + > > + pr_info("Iterations %d (vCPUs %d, regs %lld): ", > > + iterations, nvcpus, reg_list->n); > > + > > + for (i = 0; i < iterations; i++) { > > + /* Run the test */ > > + run_test_one(nvcpus, reg_list, &res); > > + > > + src_sum = timespec_add(src_sum, res.src); > > + dst_sum = timespec_add(dst_sum, res.dst); > > + > > + if (!per_reg_result) > > + continue; > > + > > + /* Aggregate per register result */ > > + for (j = 0; j < nregs; j++) { > > + src_per_reg_sum[j] = timespec_add(src_per_reg_sum[j], > > + res.src_per_reg[j]); > > + dst_per_reg_sum[j] = timespec_add(dst_per_reg_sum[j], > > + res.dst_per_reg[j]); > > + } > > + } > > + > > + /* Calculate the iteration average */ > > + src_avg = timespec_div(src_sum, iterations); > > + dst_avg = timespec_div(dst_sum, iterations); > > + avg = timespec_add(src_avg, dst_avg); > > + > > + /* Print the average */ > > + if (avg.tv_sec > 0) { > > + pr_info("get+set %ld.%.9lds (get %ld.%.9lds, set %ld.%.9lds)\n", > > + avg.tv_sec, avg.tv_nsec, > > + src_avg.tv_sec, src_avg.tv_nsec, > > + dst_avg.tv_sec, dst_avg.tv_nsec); > > + } else { > > + pr_info("get+set %ldus (get %ldus, set %ldus)\n", > > + N2USEC(avg.tv_nsec), N2USEC(src_avg.tv_nsec), > > + N2USEC(dst_avg.tv_nsec)); > > + } > > + > > + /* Print per register result when requested */ > > + if (per_reg_result) { > > + pr_info("Per Register Result:\n"); > > + pr_info(" %-32s %13s %9s %9s\n", "Register", "get+set(ns)", "get(ns)", "set(ns)"); > > + for (i = 0; i < nregs; i++) { > > + /* Calculate the average for the register */ > > + src_avg = timespec_div(src_per_reg_sum[i], iterations); > > + dst_avg = timespec_div(dst_per_reg_sum[i], iterations); > > + avg = timespec_add(src_avg, dst_avg); > > + > > + assert(avg.tv_sec == 0); > > + reg_id_to_str(reg_list->reg[i], reg_str, REG_STR_LEN); > > + pr_info(" %-32s %13ld %9ld %9ld\n", reg_str, > > + avg.tv_nsec, src_avg.tv_nsec, dst_avg.tv_nsec); > > + } > > Do you think it would be useful to optionally dump this granular data > into a machine-parseable format (like CSV)? It could be helpful for > analysis. We have several hundred exposed registers (and always > growing!) and grokking stdout might be a bit difficult. I am thinking of changing spaces in the output into tabs so that the output can be parsed easily (the stdout should be redirected to the file as needed). Then, since I am checking the result with sort+tail/head, one output format should be enough for both rather than adding extra options. BTW, some register names includes ',' because they are printed like "SYSREG(op0, op1, ...). Thank you so much for the review! Regards, Reiji > > > + free(res.src_per_reg); > > + free(res.dst_per_reg); > > + free(src_per_reg_sum); > > + free(dst_per_reg_sum); > > + } > > + > > + free(reg_list); > > +} > > + > > +int main(int argc, char *argv[]) > > +{ > > + int opt; > > + int nvcpus = 1; > > + int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); > > + int iterations = 1000; > > + bool per_reg_result = false; > > + > > + while ((opt = getopt(argc, argv, "hri:v:")) != -1) { > > + switch (opt) { > > + case 'i': > > + iterations = atoi(optarg); > > + break; > > + case 'v': > > + nvcpus = atoi(optarg); > > + TEST_ASSERT(nvcpus > 0 && nvcpus <= max_vcpus, > > + "Number of vcpus, must be between 1 and %d", > > + max_vcpus); > > + break; > > + case 'r': > > + per_reg_result = true; > > + break; > > + case 'h': > > + default: > > + printf("usage: %s [-h] [-v vcpus] [-i iterations]\n", > > + argv[0]); > > + exit(0); > > + } > > + } > > + > > + setbuf(stdout, NULL); > > + > > + run_test(nvcpus, iterations, per_reg_result); > > + > > + return 0; > > +} > > -- > > 2.35.1.473.g83b2b277ed-goog > > > > -- > Best, > Oliver
Hi Marc, On Thu, Feb 17, 2022 at 1:12 AM Marc Zyngier <maz@kernel.org> wrote: > > On Thu, 17 Feb 2022 04:52:10 +0000, > Oliver Upton <oupton@google.com> wrote: > > > > Hi Reiji, > > > > First off, thanks for looking into this! Seems like a very useful thing > > to test :-) > > > > On Wed, Feb 16, 2022 at 07:49:47PM -0800, Reiji Watanabe wrote: > > > Introduce a simple performance test of KVM_GET_ONE_REG/KVM_SET_ONE_REG > > > for registers that are returned by KVM_GET_REG_LIST. This is a pseudo > > > process of saving/restoring registers during live migration, and this > > > test quantifies the performance of the process. > > > > > > Signed-off-by: Reiji Watanabe <reijiw@google.com> > > > --- > > > tools/testing/selftests/kvm/.gitignore | 1 + > > > tools/testing/selftests/kvm/Makefile | 1 + > > > .../selftests/kvm/aarch64/get_set_regs_perf.c | 456 ++++++++++++++++++ > > > > [...] > > > Would it make sense to test some opt-in capabilities that expose > > additional registers (PMU, SVE, etc.)? > > I think this is important. System registers are usually saved/restored Yes, I will fix the test to include registers for opt-in features when supported. > in groups, and due to the way we walk the sysreg array, timings are > unlikely to be uniform. Getting a grip on that could help restructure > the walking if required (either per-group arrays, or maybe a tree > structure). The biggest system register table that I know is sys_reg_descs[], and KVM_SET_ONE_REG/KVM_GET_ONE_REG/emulation code already uses binary search to find the target entry. So, the search itself isn't that bad. The difference between the min and the max latency of KVM_GET_ONE_REG for the registers is always around 200nsec on Ampere Altra machine as far as I checked. > Note that all of this could equally apply to the guest trapping (the > walk is the same). > > And yes, there are a lot of commonalities with get-reg-list, so > reusing some of the existing infrastructure would be a good thing. I will look into that. Thanks, Reiji
Hi Reiji, On 2022-02-19 04:50, Reiji Watanabe wrote: > Hi Marc, > > On Thu, Feb 17, 2022 at 1:12 AM Marc Zyngier <maz@kernel.org> wrote: >> >> On Thu, 17 Feb 2022 04:52:10 +0000, >> Oliver Upton <oupton@google.com> wrote: >> >> > Would it make sense to test some opt-in capabilities that expose >> > additional registers (PMU, SVE, etc.)? >> >> I think this is important. System registers are usually saved/restored > > Yes, I will fix the test to include registers for opt-in features > when supported. > >> in groups, and due to the way we walk the sysreg array, timings are >> unlikely to be uniform. Getting a grip on that could help restructure >> the walking if required (either per-group arrays, or maybe a tree >> structure). > > The biggest system register table that I know is sys_reg_descs[], > and KVM_SET_ONE_REG/KVM_GET_ONE_REG/emulation code already uses > binary search to find the target entry. So, the search itself > isn't that bad. The difference between the min and the max > latency of KVM_GET_ONE_REG for the registers is always around > 200nsec on Ampere Altra machine as far as I checked. Even if it is OK so far, it is bound to get worse over time, as the architecture keeps adding all sort of things that we'll eventually have to save/restore. I see this test as a way to monitor this trend and work out when we need to invest in something better. Thanks, M.
diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index dce7de7755e6..b6b18aaf9322 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -2,6 +2,7 @@ /aarch64/arch_timer /aarch64/debug-exceptions /aarch64/get-reg-list +/aarch64/get_set_regs_perf /aarch64/psci_cpu_on_test /aarch64/vgic_init /aarch64/vgic_irq diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 0e4926bc9a58..46a28ce89002 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -103,6 +103,7 @@ TEST_GEN_PROGS_x86_64 += system_counter_offset_test TEST_GEN_PROGS_aarch64 += aarch64/arch_timer TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list +TEST_GEN_PROGS_aarch64 += aarch64/get_set_regs_perf TEST_GEN_PROGS_aarch64 += aarch64/psci_cpu_on_test TEST_GEN_PROGS_aarch64 += aarch64/vgic_init TEST_GEN_PROGS_aarch64 += aarch64/vgic_irq diff --git a/tools/testing/selftests/kvm/aarch64/get_set_regs_perf.c b/tools/testing/selftests/kvm/aarch64/get_set_regs_perf.c new file mode 100644 index 000000000000..27944ccce4a1 --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/get_set_regs_perf.c @@ -0,0 +1,456 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * get_set_regs_perf - KVM_GET_ONE_REG/KVM_SET_ONE_REG performance test + * + * The test runs performance test of saving registers that are returned + * by KVM_GET_REG_LIST for vCPUs in a VM and restoring them for vCPUs + * in another VM (a part of behavior during blackout on live migration). + * + * Copyright (c) 2022, Google LLC. + */ + +#define _GNU_SOURCE + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <unistd.h> +#include <time.h> +#include <pthread.h> + +#include "kvm_util.h" +#include "processor.h" + +#define N2USEC(nsec) ((nsec)/1000) +#define REG_STR_LEN 64 + +struct reg_val { + /* Big enough buffer even for the longest vector register */ + uint64_t val[32]; +}; + +struct thread_data { + uint32_t vcpuid; + bool is_set_reg; + volatile bool *start; + struct kvm_vm *vm; + struct timespec time; + struct kvm_reg_list *reg_list; + + /* + * Pointer to an array of reg_val, which will hold a value of each + * register. The number of elements in it is indicated in reg_list.n. + */ + struct reg_val *reg_vals; + + /* + * Pointer to an array of timespec, which will hold per register + * latency of KVM_GET_ONE_REG or KVM_SET_ONE_REG. The number of + * elements in it is indicated in reg_list.n. + */ + struct timespec *reg_time; +}; + +pthread_t threads[KVM_MAX_VCPUS]; +struct thread_data thread_args[KVM_MAX_VCPUS]; + +struct sys_reg_params { + u8 Op0; + u8 Op1; + u8 CRn; + u8 CRm; + u8 Op2; +}; + +static int regid_to_params(u64 id, struct sys_reg_params *params) +{ + switch (id & KVM_REG_SIZE_MASK) { + case KVM_REG_SIZE_U64: + /* Any unused index bits means it's not valid. */ + if (id & ~(KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK + | KVM_REG_ARM_COPROC_MASK + | KVM_REG_ARM64_SYSREG_OP0_MASK + | KVM_REG_ARM64_SYSREG_OP1_MASK + | KVM_REG_ARM64_SYSREG_CRN_MASK + | KVM_REG_ARM64_SYSREG_CRM_MASK + | KVM_REG_ARM64_SYSREG_OP2_MASK)) + return -1; + params->Op0 = ((id & KVM_REG_ARM64_SYSREG_OP0_MASK) + >> KVM_REG_ARM64_SYSREG_OP0_SHIFT); + params->Op1 = ((id & KVM_REG_ARM64_SYSREG_OP1_MASK) + >> KVM_REG_ARM64_SYSREG_OP1_SHIFT); + params->CRn = ((id & KVM_REG_ARM64_SYSREG_CRN_MASK) + >> KVM_REG_ARM64_SYSREG_CRN_SHIFT); + params->CRm = ((id & KVM_REG_ARM64_SYSREG_CRM_MASK) + >> KVM_REG_ARM64_SYSREG_CRM_SHIFT); + params->Op2 = ((id & KVM_REG_ARM64_SYSREG_OP2_MASK) + >> KVM_REG_ARM64_SYSREG_OP2_SHIFT); + return 0; + default: + return -1; + } +} + +static void reg_id_to_str(uint64_t id, char *str, int len) +{ + struct sys_reg_params p; + + TEST_ASSERT((id & KVM_REG_ARCH_MASK) == KVM_REG_ARM64, + "Not KVM_REG_ARM64 register (0x%lx)", id); + + switch (id & KVM_REG_ARM_COPROC_MASK) { + case KVM_REG_ARM_CORE: + snprintf(str, len, "CORE(id:0x%lx)", id); + break; + case KVM_REG_ARM64_SVE: + snprintf(str, len, "SVE(id:0x%lx)", id); + break; + case KVM_REG_ARM_DEMUX: + snprintf(str, len, "DEMUX(id:0x%lx)", id); + break; + case KVM_REG_ARM_FW: + snprintf(str, len, "FW_REG_%ld", id & 0xffff); + break; + case KVM_REG_ARM64_SYSREG: + if (regid_to_params(id, &p) == 0) + snprintf(str, len, "SYSREG(%d,%d,%d,%d,%d)", + p.Op0, p.Op1, p.CRn, p.CRm, p.Op2); + else + snprintf(str, len, "SYSREG(id:0x%lx)", id); + break; + default: + snprintf(str, len, "KVM_REG_ARM64(id:0x%lx)", id); + } +} + +/* + * Save or restore registers that were saved by get_regs_vcpu_thread with + * KVM_GET_ONE_REG or KVM_SET_ONE_REG. Measure time taken by this process. + */ +static void *vcpu_thread(void *data) +{ + int i, ret; + struct thread_data *arg = data; + struct kvm_reg_list *reg_list = arg->reg_list; + bool is_set_reg = arg->is_set_reg; + unsigned int cmd = is_set_reg ? KVM_SET_ONE_REG : KVM_GET_ONE_REG; + char *cmd_str = is_set_reg ? "SET_ONE_REG" : "GET_ONE_REG"; + struct timespec ts_reg_diff, ts_reg_start; + struct kvm_one_reg reg; + + memset(&arg->time, 0, sizeof(arg->time)); + + /* Wait until start is set */ + while (!(*arg->start)) + ; + + /* Warm up the cache. */ + reg.id = reg_list->reg[0]; + reg.addr = (uint64_t)&arg->reg_vals[0]; + ret = _vcpu_ioctl(arg->vm, arg->vcpuid, cmd, ®); + TEST_ASSERT(!ret, "%s(id:0x%llx) failed\n", cmd_str, reg.id); + + pr_debug("%s[%d] start\n", __func__, arg->vcpuid); + + /* Run KVM_SET_ONE_REG or KVM_GET_ONE_REG for registers in reg_list */ + for (i = 0; i < reg_list->n; i++) { + clock_gettime(CLOCK_MONOTONIC, &ts_reg_start); + + reg.id = reg_list->reg[i]; + reg.addr = (uint64_t)&arg->reg_vals[i]; + ret = _vcpu_ioctl(arg->vm, arg->vcpuid, cmd, ®); + + ts_reg_diff = timespec_elapsed(ts_reg_start); + arg->time = timespec_add(arg->time, ts_reg_diff); + if (arg->reg_time) + /* Need per register latency separately */ + arg->reg_time[i] = ts_reg_diff; + + TEST_ASSERT(!ret, "%s(id:0x%llx) failed\n", cmd_str, reg.id); + pr_debug("%s: id:0x%lx, val:0x%lx\n", cmd_str, (uint64_t)reg.id, + arg->reg_vals[i].val[0]); + } + + return NULL; +} + +void aggregate_time(uint64_t cnt, struct timespec *total, struct timespec *data) +{ + uint64_t i; + + for (i = 0; i < cnt; i++) + total[i] = timespec_add(total[i], data[i]); +} + +struct result_timespec { + struct timespec src; + struct timespec dst; + struct timespec *src_per_reg; + struct timespec *dst_per_reg; +}; + +/* + * Run KVM_GET_ONE_REG/KVM_SET_ONE_REG performance test. + * Create a VM with the given number of vCPUs, create a thread for each + * vCPU in the VM, and wait until all the threads complete KVM_GET_ONE_REG + * for all the registers KVM_GET_REG_LIST returns. Then repeat the same thing + * with KVM_SET_ONE_REG. Aggregate the time spent by each thread on + * KVM_GET_ONE_REG and KVM_SET_ONE_REG, and return the vCPU average value to + * the caller via @res. + */ +void run_test_one(int nvcpus, struct kvm_reg_list *reg_list, + struct result_timespec *res) +{ + bool per_reg_data = res->src_per_reg ? true : false; + uint64_t nregs = reg_list->n; + volatile bool start = false; + int i, ret; + struct kvm_vm *src_vm, *dst_vm; + struct thread_data *targ; + struct timespec src_sum = (struct timespec){0}; + struct timespec dst_sum = (struct timespec){0}; + + if (per_reg_data) { + /* Reset the buffers to save the per register result */ + memset(res->src_per_reg, 0, nregs * sizeof(struct timespec)); + memset(res->dst_per_reg, 0, nregs * sizeof(struct timespec)); + } + + /* Create VMs to save/restore registers */ + src_vm = vm_create_default_with_vcpus(nvcpus, 0, 0, NULL, NULL); + dst_vm = vm_create_default_with_vcpus(nvcpus, 0, 0, NULL, NULL); + + /* Start saving registers process for src_vm */ + + /* Create source VM's vCPU threads */ + for (i = 0; i < nvcpus; i++) { + targ = &thread_args[i]; + targ->vm = src_vm; + targ->vcpuid = i; + targ->is_set_reg = false; + targ->start = &start; + targ->reg_list = reg_list; + targ->reg_vals = calloc(nregs, sizeof(struct reg_val)); + TEST_ASSERT(targ->reg_vals, "Failed to allocate reg_vals"); + if (per_reg_data) { + /* Per register result buffer for the vCPU */ + targ->reg_time = calloc(nregs, sizeof(struct timespec)); + TEST_ASSERT(targ->reg_time, "Failed to allocate reg_time"); + } + ret = pthread_create(&threads[i], NULL, vcpu_thread, targ); + TEST_ASSERT(!ret, "pthread_create failed: %d\n", ret); + } + + /* + * Let threads start saving registers for vCPUs, and wait for all + * threads to complete restoring registers. + */ + start = true; + for (i = 0; i < nvcpus; i++) { + targ = &thread_args[i]; + ret = pthread_join(threads[i], NULL); + TEST_ASSERT(!ret, "pthread_join failed: %d\n", ret); + src_sum = timespec_add(src_sum, targ->time); + if (per_reg_data) + aggregate_time(nregs, res->src_per_reg, targ->reg_time); + } + + start = false; + pr_debug("%s Saving registers completed.\n", __func__); + + /* Start restoring registers process for dst_vm */ + + /* Create destination VM's vCPU threads */ + for (i = 0; i < nvcpus; i++) { + targ = &thread_args[i]; + /* Update fields that are different from the src case */ + targ->vm = dst_vm; + targ->is_set_reg = true; + ret = pthread_create(&threads[i], NULL, vcpu_thread, targ); + TEST_ASSERT(!ret, "pthread_create failed: %d\n", ret); + } + + /* + * Let threads start saving registers for vCPUs, and wait for all + * threads to complete restoring registers. + */ + start = true; + for (i = 0; i < nvcpus; i++) { + targ = &thread_args[i]; + ret = pthread_join(threads[i], NULL); + TEST_ASSERT(!ret, "pthread_join failed: %d\n", ret); + + free(targ->reg_vals); + dst_sum = timespec_add(dst_sum, targ->time); + if (per_reg_data) { + aggregate_time(nregs, res->dst_per_reg, targ->reg_time); + free(targ->reg_time); + } + } + + kvm_vm_free(src_vm); + kvm_vm_free(dst_vm); + + pr_debug("%s Restoring registers completed.\n", __func__); + + /* Calculate the vCPU average */ + res->src = timespec_div(src_sum, nvcpus); + res->dst = timespec_div(dst_sum, nvcpus); + if (!per_reg_data) + return; + + /* Calculate the vCPU average for each register */ + for (i = 0; i < nregs; i++) { + res->src_per_reg[i] = timespec_div(res->src_per_reg[i], nvcpus); + res->dst_per_reg[i] = timespec_div(res->dst_per_reg[i], nvcpus); + } +} + +/* + * Run saving/restoring vCPU registers (KVM_GET_ONE_REG/KVM_SET_ONE_REG) + * performance test with the given number of vCPUs (@nvcpus) for the given + * number (@iterations) of times. + * Each iteration of the test returns the latency of saving/restoring all + * the registers that KVM_GET_REG_LIST returns (See comments for + * run_test_one() for more details of each iteration of the test). + * Print the average latency of each iteration, and if @per_reg_result is set, + * print the average latency for each register as well. + */ +void run_test(int nvcpus, int iterations, bool per_reg_result) +{ + int i, j; + uint64_t nregs; + struct result_timespec res = {0}; + struct timespec src_sum = {0}; + struct timespec dst_sum = {0}; + struct timespec avg, src_avg, dst_avg; + struct timespec *src_per_reg_sum, *dst_per_reg_sum; + struct kvm_vm *vm; + struct kvm_reg_list *reg_list; + char reg_str[REG_STR_LEN]; + + /* Get kvm_reg_list */ + vm = vm_create_default(0, 0, NULL); + reg_list = vcpu_get_reg_list(vm, 0); + kvm_vm_free(vm); + nregs = reg_list->n; + + if (per_reg_result) { + /* + * Allocate buffers to save latency of KVM_GET_ONE_REG + * and KVM_SET_ONE_REG respectively. + */ + + /* Buffers for each iteration */ + res.src_per_reg = calloc(nregs, sizeof(struct timespec)); + res.dst_per_reg = calloc(nregs, sizeof(struct timespec)); + + /* Buffers for the sum */ + src_per_reg_sum = calloc(nregs, sizeof(struct timespec)); + dst_per_reg_sum = calloc(nregs, sizeof(struct timespec)); + + TEST_ASSERT(res.src_per_reg && res.dst_per_reg && + src_per_reg_sum && dst_per_reg_sum, + "Failed to allocate per register time buffer"); + } + + pr_info("Iterations %d (vCPUs %d, regs %lld): ", + iterations, nvcpus, reg_list->n); + + for (i = 0; i < iterations; i++) { + /* Run the test */ + run_test_one(nvcpus, reg_list, &res); + + src_sum = timespec_add(src_sum, res.src); + dst_sum = timespec_add(dst_sum, res.dst); + + if (!per_reg_result) + continue; + + /* Aggregate per register result */ + for (j = 0; j < nregs; j++) { + src_per_reg_sum[j] = timespec_add(src_per_reg_sum[j], + res.src_per_reg[j]); + dst_per_reg_sum[j] = timespec_add(dst_per_reg_sum[j], + res.dst_per_reg[j]); + } + } + + /* Calculate the iteration average */ + src_avg = timespec_div(src_sum, iterations); + dst_avg = timespec_div(dst_sum, iterations); + avg = timespec_add(src_avg, dst_avg); + + /* Print the average */ + if (avg.tv_sec > 0) { + pr_info("get+set %ld.%.9lds (get %ld.%.9lds, set %ld.%.9lds)\n", + avg.tv_sec, avg.tv_nsec, + src_avg.tv_sec, src_avg.tv_nsec, + dst_avg.tv_sec, dst_avg.tv_nsec); + } else { + pr_info("get+set %ldus (get %ldus, set %ldus)\n", + N2USEC(avg.tv_nsec), N2USEC(src_avg.tv_nsec), + N2USEC(dst_avg.tv_nsec)); + } + + /* Print per register result when requested */ + if (per_reg_result) { + pr_info("Per Register Result:\n"); + pr_info(" %-32s %13s %9s %9s\n", "Register", "get+set(ns)", "get(ns)", "set(ns)"); + for (i = 0; i < nregs; i++) { + /* Calculate the average for the register */ + src_avg = timespec_div(src_per_reg_sum[i], iterations); + dst_avg = timespec_div(dst_per_reg_sum[i], iterations); + avg = timespec_add(src_avg, dst_avg); + + assert(avg.tv_sec == 0); + reg_id_to_str(reg_list->reg[i], reg_str, REG_STR_LEN); + pr_info(" %-32s %13ld %9ld %9ld\n", reg_str, + avg.tv_nsec, src_avg.tv_nsec, dst_avg.tv_nsec); + } + + free(res.src_per_reg); + free(res.dst_per_reg); + free(src_per_reg_sum); + free(dst_per_reg_sum); + } + + free(reg_list); +} + +int main(int argc, char *argv[]) +{ + int opt; + int nvcpus = 1; + int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); + int iterations = 1000; + bool per_reg_result = false; + + while ((opt = getopt(argc, argv, "hri:v:")) != -1) { + switch (opt) { + case 'i': + iterations = atoi(optarg); + break; + case 'v': + nvcpus = atoi(optarg); + TEST_ASSERT(nvcpus > 0 && nvcpus <= max_vcpus, + "Number of vcpus, must be between 1 and %d", + max_vcpus); + break; + case 'r': + per_reg_result = true; + break; + case 'h': + default: + printf("usage: %s [-h] [-v vcpus] [-i iterations]\n", + argv[0]); + exit(0); + } + } + + setbuf(stdout, NULL); + + run_test(nvcpus, iterations, per_reg_result); + + return 0; +}
Introduce a simple performance test of KVM_GET_ONE_REG/KVM_SET_ONE_REG for registers that are returned by KVM_GET_REG_LIST. This is a pseudo process of saving/restoring registers during live migration, and this test quantifies the performance of the process. Signed-off-by: Reiji Watanabe <reijiw@google.com> --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 1 + .../selftests/kvm/aarch64/get_set_regs_perf.c | 456 ++++++++++++++++++ 3 files changed, 458 insertions(+) create mode 100644 tools/testing/selftests/kvm/aarch64/get_set_regs_perf.c