| Message ID | 20170110113856.7183-7-punit.agrawal@arm.com (mailing list archive) |
|---|---|
| State | New, archived |
+Mark

On 10/01/17 11:38, Punit Agrawal wrote:
> Both AArch32 and AArch64 mode of the ARMv8 architecture support trapping
> certain VM operations, e.g, TLB and cache maintenance
> operations. Selective trapping of these operations for specific VMs can
> be used to track the frequency with which these occur during execution.
>
> Add a software PMU on the host that can support tracking VM
> operations (in the form of PMU events). Supporting new events requires
> providing callbacks to configure the VM to enable/disable the trapping
> and read a count of the frequency.
>
> The host PMU events can be controlled by tools like perf that use
> standard kernel perf interfaces.
>
> Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
> Cc: Christoffer Dall <christoffer.dall@linaro.org>
> Cc: Marc Zyngier <marc.zyngier@arm.com>
> Cc: Will Deacon <will.deacon@arm.com>
> ---
>  arch/arm/include/asm/kvm_host.h   |   8 ++
>  arch/arm/kvm/arm.c                |   2 +
>  arch/arm64/include/asm/kvm_host.h |   8 ++
>  virt/kvm/arm/host_pmu.c           | 272 ++++++++++++++++++++++++++++++++++++++
>  4 files changed, 290 insertions(+)
>  create mode 100644 virt/kvm/arm/host_pmu.c

[...]

> +#define VM_MASK		GENMASK_ULL(31, 0)
> +#define EVENT_MASK	GENMASK_ULL(32, 39)
> +#define EVENT_SHIFT	(32)
> +
> +#define to_pid(cfg)	((cfg) & VM_MASK)
> +#define to_event(cfg)	(((cfg) & EVENT_MASK) >> EVENT_SHIFT)
> +
> +PMU_FORMAT_ATTR(vm, "config:0-31");
> +PMU_FORMAT_ATTR(event, "config:32-39");

I'm a bit confused by these. Can't you get the PID of the VM you're
tracing directly from perf, without having to encode things? And if you
can't, surely this should be a function of the size of pid_t?

Mark, can you shine some light here?

[...]

This patch really makes me think that there is nothing arm-specific in
here at all. Why can't it be a generic feature through which
architectures can expose events in a generic way (or as close as
possible to being generic)?

Thanks,

	M.
On Wed, Jan 18, 2017 at 11:21:21AM +0000, Marc Zyngier wrote:
> On 10/01/17 11:38, Punit Agrawal wrote:
> > +#define VM_MASK		GENMASK_ULL(31, 0)
> > +#define EVENT_MASK	GENMASK_ULL(32, 39)
> > +#define EVENT_SHIFT	(32)
> > +
> > +#define to_pid(cfg)	((cfg) & VM_MASK)
> > +#define to_event(cfg)	(((cfg) & EVENT_MASK) >> EVENT_SHIFT)
> > +
> > +PMU_FORMAT_ATTR(vm, "config:0-31");
> > +PMU_FORMAT_ATTR(event, "config:32-39");
>
> I'm a bit confused by these. Can't you get the PID of the VM you're
> tracing directly from perf, without having to encode things? And if you
> can't, surely this should be a function of the size of pid_t?
>
> Mark, can you shine some light here?

AFAICT, this is not necessary.

The perf_event_open() syscall takes a PID separately from the
perf_event_attr. i.e. we should be able to do:

	// monitor a particular vCPU
	perf_event_open(attr, vcpupid, -1, -1, 0)

... or ...

	// monitor a particular vCPU on a pCPU
	perf_event_open(attr, vcpupid, cpu, -1, 0)

... or ...

	// monitor all vCPUs on a pCPU
	perf_event_open(attr, -1, cpu, -1, 0)

... so this shouldn't be necessary. AFAICT, this is a SW PMU, so there
should be no issue with using the perf_sw_context.

If this is a bodge to avoid opening a perf_event per vCPU thread, then I
completely disagree with the approach. This would be better handled in
userspace by discovering the set of threads and opening events for each.

Thanks,
Mark.
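To make Mark's first two variants concrete, here is a minimal userspace
sketch. It assumes the PMU registers itself as "kvm" (as this patch does),
so its dynamic type would be read from
/sys/bus/event_source/devices/kvm/type, and that the caller has already
discovered the vCPU thread ID:

```c
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

/* perf_event_open() has no glibc wrapper, so go through syscall(2) */
static int open_vcpu_event(unsigned int pmu_type,
			   unsigned long long event_id,
			   pid_t vcpu_tid, int cpu)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = pmu_type;	/* dynamic type of the "kvm" PMU */
	attr.config = event_id;	/* just the event; no pid encoded */

	/* cpu == -1: count the event wherever this vCPU thread runs */
	return (int)syscall(__NR_perf_event_open, &attr, vcpu_tid, cpu, -1, 0);
}
```

With this shape, the pid/cpu binding is entirely perf's, and the 32-bit
"vm" field in attr.config becomes unnecessary.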
Mark Rutland <mark.rutland@arm.com> writes:

> On Wed, Jan 18, 2017 at 11:21:21AM +0000, Marc Zyngier wrote:
>> On 10/01/17 11:38, Punit Agrawal wrote:
>> > +#define VM_MASK		GENMASK_ULL(31, 0)
>> > +#define EVENT_MASK	GENMASK_ULL(32, 39)
>> > +#define EVENT_SHIFT	(32)
>> > +
>> > +#define to_pid(cfg)	((cfg) & VM_MASK)
>> > +#define to_event(cfg)	(((cfg) & EVENT_MASK) >> EVENT_SHIFT)
>> > +
>> > +PMU_FORMAT_ATTR(vm, "config:0-31");
>> > +PMU_FORMAT_ATTR(event, "config:32-39");
>>
>> I'm a bit confused by these. Can't you get the PID of the VM you're
>> tracing directly from perf, without having to encode things?

With perf attached to a PID, the event gets scheduled out when the task
is context switched. As the PID of the controlling process was used,
none of the vCPU events were counted.

>> And if you can't, surely this should be a function of the size of pid_t?

Agreed. I'll update the above if we decide to carry on with this
approach. More below...

>> Mark, can you shine some light here?
>
> AFAICT, this is not necessary.
>
> The perf_event_open() syscall takes a PID separately from the
> perf_event_attr. i.e. we should be able to do:
>
> 	// monitor a particular vCPU
> 	perf_event_open(attr, vcpupid, -1, -1, 0)
>
> ... or ...
>
> 	// monitor a particular vCPU on a pCPU
> 	perf_event_open(attr, vcpupid, cpu, -1, 0)
>
> ... or ...
>
> 	// monitor all vCPUs on a pCPU
> 	perf_event_open(attr, -1, cpu, -1, 0)
>
> ... so this shouldn't be necessary. AFAICT, this is a SW PMU, so there
> should be no issue with using the perf_sw_context.

I might have missed it but none of the modes of invoking perf_event_open
allow monitoring a set of processes, i.e., all vcpus belonging to a
particular VM, which was one of the aims and a feature I was carrying
over from the previous version. If we do not care about this...

> If this is a bodge to avoid opening a perf_event per vCPU thread, then I
> completely disagree with the approach. This would be better handled in
> userspace by discovering the set of threads and opening events for
> each.

... then requiring userspace to invoke perf_event_open per vCPU will
simplify this patch.

Marc, any objections?

> Thanks,
> Mark.
Hi Marc,

Marc Zyngier <marc.zyngier@arm.com> writes:

> +Mark
>
> On 10/01/17 11:38, Punit Agrawal wrote:
>> Both AArch32 and AArch64 mode of the ARMv8 architecture support trapping
>> certain VM operations, e.g, TLB and cache maintenance
>> operations. Selective trapping of these operations for specific VMs can
>> be used to track the frequency with which these occur during execution.
>>
>> Add a software PMU on the host that can support tracking VM
>> operations (in the form of PMU events). Supporting new events requires
>> providing callbacks to configure the VM to enable/disable the trapping
>> and read a count of the frequency.
>>
>> The host PMU events can be controlled by tools like perf that use
>> standard kernel perf interfaces.
>>
>> Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
>> Cc: Christoffer Dall <christoffer.dall@linaro.org>
>> Cc: Marc Zyngier <marc.zyngier@arm.com>
>> Cc: Will Deacon <will.deacon@arm.com>
>> ---
>>  arch/arm/include/asm/kvm_host.h   |   8 ++
>>  arch/arm/kvm/arm.c                |   2 +
>>  arch/arm64/include/asm/kvm_host.h |   8 ++
>>  virt/kvm/arm/host_pmu.c           | 272 ++++++++++++++++++++++++++++++++++++++
>>  4 files changed, 290 insertions(+)
>>  create mode 100644 virt/kvm/arm/host_pmu.c
>>
[...]
>
> This patch really makes me think that there is nothing arm-specific in
> here at all. Why can't it be a generic feature through which
> architectures can expose events in a generic way (or as close as
> possible to being generic)?

I wasn't sure if other architectures are interested in the
functionality - that's the only reason. Having said that, it can be
turned off in the config so there shouldn't be any complaints.

I'll move this to virt/kvm in the next incarnation.

Thanks for taking a look.

> Thanks,
>
> M.
On 18/01/17 13:01, Punit Agrawal wrote:
> Mark Rutland <mark.rutland@arm.com> writes:
>
>> On Wed, Jan 18, 2017 at 11:21:21AM +0000, Marc Zyngier wrote:
>>> On 10/01/17 11:38, Punit Agrawal wrote:
>>>> +#define VM_MASK		GENMASK_ULL(31, 0)
>>>> +#define EVENT_MASK	GENMASK_ULL(32, 39)
>>>> +#define EVENT_SHIFT	(32)
>>>> +
>>>> +#define to_pid(cfg)	((cfg) & VM_MASK)
>>>> +#define to_event(cfg)	(((cfg) & EVENT_MASK) >> EVENT_SHIFT)
>>>> +
>>>> +PMU_FORMAT_ATTR(vm, "config:0-31");
>>>> +PMU_FORMAT_ATTR(event, "config:32-39");
>>>
>>> I'm a bit confused by these. Can't you get the PID of the VM you're
>>> tracing directly from perf, without having to encode things?
>
> With perf attached to a PID, the event gets scheduled out when the task
> is context switched. As the PID of the controlling process was used,
> none of the vCPU events were counted.
>
>>> And if you can't, surely this should be a function of the size of pid_t?
>
> Agreed. I'll update the above if we decide to carry on with this
> approach. More below...
>
>>> Mark, can you shine some light here?
>>
>> AFAICT, this is not necessary.
>>
>> The perf_event_open() syscall takes a PID separately from the
>> perf_event_attr. i.e. we should be able to do:
>>
>> 	// monitor a particular vCPU
>> 	perf_event_open(attr, vcpupid, -1, -1, 0)
>>
>> ... or ...
>>
>> 	// monitor a particular vCPU on a pCPU
>> 	perf_event_open(attr, vcpupid, cpu, -1, 0)
>>
>> ... or ...
>>
>> 	// monitor all vCPUs on a pCPU
>> 	perf_event_open(attr, -1, cpu, -1, 0)
>>
>> ... so this shouldn't be necessary. AFAICT, this is a SW PMU, so there
>> should be no issue with using the perf_sw_context.
>
> I might have missed it but none of the modes of invoking perf_event_open
> allow monitoring a set of processes, i.e., all vcpus belonging to a
> particular VM, which was one of the aims and a feature I was carrying
> over from the previous version. If we do not care about this...
>
>> If this is a bodge to avoid opening a perf_event per vCPU thread, then I
>> completely disagree with the approach. This would be better handled in
>> userspace by discovering the set of threads and opening events for
>> each.
>
> ... then requiring userspace to invoke perf_event_open per vCPU will
> simplify this patch.
>
> Marc, any objections?

Not so far, but I'm curious to find out how you determine which thread
is a vcpu, let alone a given vcpu.

Thanks,

	M.
On Wed, Jan 18, 2017 at 01:01:40PM +0000, Punit Agrawal wrote:
> Mark Rutland <mark.rutland@arm.com> writes:
>
> > On Wed, Jan 18, 2017 at 11:21:21AM +0000, Marc Zyngier wrote:
> > > On 10/01/17 11:38, Punit Agrawal wrote:
> > > > +#define VM_MASK		GENMASK_ULL(31, 0)
> > > > +#define EVENT_MASK	GENMASK_ULL(32, 39)
> > > > +#define EVENT_SHIFT	(32)
> > > > +
> > > > +#define to_pid(cfg)	((cfg) & VM_MASK)
> > > > +#define to_event(cfg)	(((cfg) & EVENT_MASK) >> EVENT_SHIFT)
> > > > +
> > > > +PMU_FORMAT_ATTR(vm, "config:0-31");
> > > > +PMU_FORMAT_ATTR(event, "config:32-39");
> > >
> > > I'm a bit confused by these. Can't you get the PID of the VM you're
> > > tracing directly from perf, without having to encode things?
>
> With perf attached to a PID, the event gets scheduled out when the task
> is context switched. As the PID of the controlling process was used,
> none of the vCPU events were counted.

So it sounds like userspace needs to deal with this by attaching to the
PIDs of the vCPUs. Given that perf kvm seems to have knowledge of vCPUs,
it would be nice to know why that logic isn't reusable here. Take a look
in tools/perf/builtin-kvm.c and if it's not up to the job, then perhaps
it can be improved.

Will
Marc Zyngier <marc.zyngier@arm.com> writes:

> On 18/01/17 13:01, Punit Agrawal wrote:
>> Mark Rutland <mark.rutland@arm.com> writes:
>>
>>> On Wed, Jan 18, 2017 at 11:21:21AM +0000, Marc Zyngier wrote:
>>>> On 10/01/17 11:38, Punit Agrawal wrote:
>>>>> +#define VM_MASK		GENMASK_ULL(31, 0)
>>>>> +#define EVENT_MASK	GENMASK_ULL(32, 39)
>>>>> +#define EVENT_SHIFT	(32)
>>>>> +
>>>>> +#define to_pid(cfg)	((cfg) & VM_MASK)
>>>>> +#define to_event(cfg)	(((cfg) & EVENT_MASK) >> EVENT_SHIFT)
>>>>> +
>>>>> +PMU_FORMAT_ATTR(vm, "config:0-31");
>>>>> +PMU_FORMAT_ATTR(event, "config:32-39");
>>>>
>>>> I'm a bit confused by these. Can't you get the PID of the VM you're
>>>> tracing directly from perf, without having to encode things?
>>
>> With perf attached to a PID, the event gets scheduled out when the task
>> is context switched. As the PID of the controlling process was used,
>> none of the vCPU events were counted.
>>
>>>> And if you can't, surely this should be a function of the size of pid_t?
>>
>> Agreed. I'll update the above if we decide to carry on with this
>> approach. More below...
>>
>>>> Mark, can you shine some light here?
>>>
>>> AFAICT, this is not necessary.
>>>
>>> The perf_event_open() syscall takes a PID separately from the
>>> perf_event_attr. i.e. we should be able to do:
>>>
>>> 	// monitor a particular vCPU
>>> 	perf_event_open(attr, vcpupid, -1, -1, 0)
>>>
>>> 	... or ...
>>>
>>> 	// monitor a particular vCPU on a pCPU
>>> 	perf_event_open(attr, vcpupid, cpu, -1, 0)
>>>
>>> 	... or ...
>>>
>>> 	// monitor all vCPUs on a pCPU
>>> 	perf_event_open(attr, -1, cpu, -1, 0)
>>>
>>> ... so this shouldn't be necessary. AFAICT, this is a SW PMU, so there
>>> should be no issue with using the perf_sw_context.
>>
>> I might have missed it but none of the modes of invoking perf_event_open
>> allow monitoring a set of processes, i.e., all vcpus belonging to a
>> particular VM, which was one of the aims and a feature I was carrying
>> over from the previous version. If we do not care about this...
>>
>>> If this is a bodge to avoid opening a perf_event per vCPU thread, then I
>>> completely disagree with the approach. This would be better handled in
>>> userspace by discovering the set of threads and opening events for
>>> each.
>>
>> ... then requiring userspace to invoke perf_event_open per vCPU will
>> simplify this patch.
>>
>> Marc, any objections?
>
> Not so far, but I'm curious to find out how you determine which thread
> is a vcpu, let alone a given vcpu.

I should've clarified in my reply that I wasn't looking to support the
third instance from Mark's examples above - "monitor all vCPUs on a
pCPU". I think it'll be quite expensive to figure out which threads from
a given pool are vCPUs.

For the other instances, we only need to find the vCPU for a given
pid. Userspace will hand us a pid that needs to be checked against vCPUs
to establish that it is a valid vCPU pid (here I was looking to use
kvm_vcpu->pid and kvm->pid introduced in Patch 2). This will happen when
setting up the event and the vCPU can be cached for later use.

> Thanks,
>
> M.
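A sketch of the lookup Punit describes, assuming the kvm_vcpu->pid field
introduced in patch 2 of the series; kvm_for_each_vcpu() is the existing
iterator, while the helper name find_vcpu_by_pid is made up for
illustration:

```c
/* hypothetical helper: map a user-supplied pid to a vCPU of this VM */
static struct kvm_vcpu *find_vcpu_by_pid(struct kvm *kvm, struct pid *pid)
{
	struct kvm_vcpu *vcpu;
	int i;

	kvm_for_each_vcpu(i, vcpu, kvm) {
		if (vcpu->pid == pid)	/* pid taken when the vCPU first ran */
			return vcpu;
	}

	return NULL;
}
```

The result could be cached in event->pmu_private at event_init time, as
the existing code already does for the struct kvm pointer.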
Will Deacon <will.deacon@arm.com> writes:

> On Wed, Jan 18, 2017 at 01:01:40PM +0000, Punit Agrawal wrote:
>> Mark Rutland <mark.rutland@arm.com> writes:
>>
>> > On Wed, Jan 18, 2017 at 11:21:21AM +0000, Marc Zyngier wrote:
>> >> On 10/01/17 11:38, Punit Agrawal wrote:
>> >> > +#define VM_MASK		GENMASK_ULL(31, 0)
>> >> > +#define EVENT_MASK	GENMASK_ULL(32, 39)
>> >> > +#define EVENT_SHIFT	(32)
>> >> > +
>> >> > +#define to_pid(cfg)	((cfg) & VM_MASK)
>> >> > +#define to_event(cfg)	(((cfg) & EVENT_MASK) >> EVENT_SHIFT)
>> >> > +
>> >> > +PMU_FORMAT_ATTR(vm, "config:0-31");
>> >> > +PMU_FORMAT_ATTR(event, "config:32-39");
>> >>
>> >> I'm a bit confused by these. Can't you get the PID of the VM you're
>> >> tracing directly from perf, without having to encode things?
>>
>> With perf attached to a PID, the event gets scheduled out when the task
>> is context switched. As the PID of the controlling process was used,
>> none of the vCPU events were counted.
>
> So it sounds like userspace needs to deal with this by attaching to the
> PIDs of the vCPUs. Given that perf kvm seems to have knowledge of vCPUs,
> it would be nice to know why that logic isn't reusable here. Take a look
> in tools/perf/builtin-kvm.c and if it's not up to the job, then perhaps
> it can be improved.

Thanks for the pointer. I'll have a play with perf kvm to see if it can
be leveraged here.

> Will
On Wed, Jan 18, 2017 at 02:51:31PM +0000, Punit Agrawal wrote:
> I should've clarified in my reply that I wasn't looking to support the
> third instance from Mark's examples above - "monitor all vCPUs on a
> pCPU". I think it'll be quite expensive to figure out which threads from
> a given pool are vCPUs.

I'm not sure I follow why you would need to do that?

In that case, we'd open a CPU-bound perf event for the pCPU, which would
get installed in the CPU context immediately. It would be present for
all tasks.

Given it's present for all tasks, we don't need to figure out which
happen to have vCPUs. The !vCPU tasks simply shouldn't trigger events.

Am I missing something?

> For the other instances, we only need to find the vCPU for a given
> pid. Userspace will hand us a pid that needs to be checked against vCPUs
> to establish that it is a valid vCPU pid (here I was looking to use
> kvm_vcpu->pid and kvm->pid introduced in Patch 2).

Thinking about this further, a pid is not a unique identifier for either
a vCPU or a VM.

A single task (which has a single pid), could own multiple VMs, each
with multiple vCPUs. A thread pool (with several pids) could share those
arbitrarily. So we need VM and vCPU IDs which are distinct from pids or
tids.

I see that struct kvm_vcpu has a vcpu_id (which from a glance appears to
be local to the kvm instance). It's not clear to me if a kvm instance
could be shared by multiple processes, or if we can get away with a
process-local ID.

Thanks,
Mark.
Mark Rutland <mark.rutland@arm.com> writes:

> On Wed, Jan 18, 2017 at 02:51:31PM +0000, Punit Agrawal wrote:
>> I should've clarified in my reply that I wasn't looking to support the
>> third instance from Mark's examples above - "monitor all vCPUs on a
>> pCPU". I think it'll be quite expensive to figure out which threads from
>> a given pool are vCPUs.
>
> I'm not sure I follow why you would need to do that?
>
> In that case, we'd open a CPU-bound perf event for the pCPU, which would
> get installed in the CPU context immediately. It would be present for
> all tasks.
>
> Given it's present for all tasks, we don't need to figure out which
> happen to have vCPUs. The !vCPU tasks simply shouldn't trigger events.
>
> Am I missing something?

When enabling a CPU-bound event for pCPU, we'd have to enable trapping
of TLB operations for the vCPUs running on pCPU. Have a look at Patch
7/9.

Also, we'd have to enable/disable trapping when tasks are migrated
between pCPUs.

>> For the other instances, we only need to find the vCPU for a given
>> pid. Userspace will hand us a pid that needs to be checked against vCPUs
>> to establish that it is a valid vCPU pid (here I was looking to use
>> kvm_vcpu->pid and kvm->pid introduced in Patch 2).
>
> Thinking about this further, a pid is not a unique identifier for either
> a vCPU or a VM.
>
> A single task (which has a single pid), could own multiple VMs, each
> with multiple vCPUs. A thread pool (with several pids) could share those
> arbitrarily. So we need VM and vCPU IDs which are distinct from pids or
> tids.
>
> I see that struct kvm_vcpu has a vcpu_id (which from a glance appears to
> be local to the kvm instance). It's not clear to me if a kvm instance
> could be shared by multiple processes, or if we can get away with a
> process-local ID.

So far I've assumed that a VM pid is immutable. If that doesn't hold
then we need to think of another mechanism to refer to a VM from
userspace.

> Thanks,
> Mark.
On Wed, Jan 18, 2017 at 04:17:18PM +0000, Punit Agrawal wrote:
> Mark Rutland <mark.rutland@arm.com> writes:
>
> > On Wed, Jan 18, 2017 at 02:51:31PM +0000, Punit Agrawal wrote:
> >> I should've clarified in my reply that I wasn't looking to support the
> >> third instance from Mark's examples above - "monitor all vCPUs on a
> >> pCPU". I think it'll be quite expensive to figure out which threads from
> >> a given pool are vCPUs.
> >
> > I'm not sure I follow why you would need to do that?
> >
> > In that case, we'd open a CPU-bound perf event for the pCPU, which would
> > get installed in the CPU context immediately. It would be present for
> > all tasks.
> >
> > Given it's present for all tasks, we don't need to figure out which
> > happen to have vCPUs. The !vCPU tasks simply shouldn't trigger events.
> >
> > Am I missing something?
>
> When enabling a CPU-bound event for pCPU, we'd have to enable trapping
> of TLB operations for the vCPUs running on pCPU. Have a look at Patch
> 7/9.
>
> Also, we'd have to enable/disable trapping when tasks are migrated
> between pCPUs.

Ah, so we can't configure the trap and leave it active, since it'll
affect the host.

We could have a per-cpu flag, and a hook into vcpu_run, but that's also
gnarly.

I'll have a think.

> So far I've assumed that a VM pid is immutable. If that doesn't hold
> then we need to think of another mechanism to refer to a VM from
> userspace.

Even if we can't migrate the VM between processes (i.e. it's immutable),
it's still not unique within a process, so I'm fairly sure we need
another mechanism (even if we get away with the common case today).

Thanks,
Mark.
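One possible shape of the per-cpu flag idea Mark mentions, purely as a
sketch: nothing below is in the posted series, the flag name and hook
placement are assumptions, and HCR_TTLB is the existing HCR_EL2 bit that
traps guest TLB maintenance:

```c
/* set/cleared when a CPU-bound event is started/stopped on this pCPU */
static DEFINE_PER_CPU(bool, kvm_trap_tlb_ops);	/* hypothetical */

/*
 * Hypothetical hook on the guest entry path (e.g. called from
 * kvm_arch_vcpu_ioctl_run() with preemption disabled), so a vCPU
 * migrated to another pCPU picks up that pCPU's trap configuration.
 */
static void vcpu_sync_trap_config(struct kvm_vcpu *vcpu)
{
	if (__this_cpu_read(kvm_trap_tlb_ops))
		vcpu->arch.hcr_el2 |= HCR_TTLB;
	else
		vcpu->arch.hcr_el2 &= ~HCR_TTLB;
}
```

Re-evaluating the flag on every entry keeps the host unaffected while
still following vCPU threads across pCPU migrations.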
On Wed, Jan 18, 2017 at 06:05:46PM +0000, Mark Rutland wrote:
> On Wed, Jan 18, 2017 at 04:17:18PM +0000, Punit Agrawal wrote:
> > Mark Rutland <mark.rutland@arm.com> writes:
> >
> > > On Wed, Jan 18, 2017 at 02:51:31PM +0000, Punit Agrawal wrote:
> > >> I should've clarified in my reply that I wasn't looking to support the
> > >> third instance from Mark's examples above - "monitor all vCPUs on a
> > >> pCPU". I think it'll be quite expensive to figure out which threads from
> > >> a given pool are vCPUs.
> > >
> > > I'm not sure I follow why you would need to do that?
> > >
> > > In that case, we'd open a CPU-bound perf event for the pCPU, which would
> > > get installed in the CPU context immediately. It would be present for
> > > all tasks.
> > >
> > > Given it's present for all tasks, we don't need to figure out which
> > > happen to have vCPUs. The !vCPU tasks simply shouldn't trigger events.
> > >
> > > Am I missing something?
> >
> > When enabling a CPU-bound event for pCPU, we'd have to enable trapping
> > of TLB operations for the vCPUs running on pCPU. Have a look at Patch
> > 7/9.
> >
> > Also, we'd have to enable/disable trapping when tasks are migrated
> > between pCPUs.
>
> Ah, so we can't configure the trap and leave it active, since it'll
> affect the host.
>
> We could have a per-cpu flag, and a hook into vcpu_run, but that's also
> gnarly.
>
> I'll have a think.
>
> > So far I've assumed that a VM pid is immutable. If that doesn't hold
> > then we need to think of another mechanism to refer to a VM from
> > userspace.
>
> Even if we can't migrate the VM between processes (i.e. it's immutable),
> it's still not unique within a process, so I'm fairly sure we need
> another mechanism (even if we get away with the common case today).

I don't understand what the requirements here are exactly but the KVM
API documentation says:

    In general file descriptors can be migrated among processes by means
    of fork() and the SCM_RIGHTS facility of unix domain socket. These
    kinds of tricks are explicitly not supported by kvm. While they will
    not cause harm to the host, their actual behavior is not guaranteed by
    the API. The only supported use is one virtual machine per process,
    and one vcpu per thread.

So this code should maintain those semantics and it's fair to assume the
thread group leader of a given VM stays the same, but the code must not
rely on this fact for safe operations.

I also don't see why a process couldn't open multiple VMs; however
messy that may be, it appears possible to me.

-Christoffer
Hi Christoffer,

Christoffer Dall <christoffer.dall@linaro.org> writes:

> On Wed, Jan 18, 2017 at 06:05:46PM +0000, Mark Rutland wrote:
>> On Wed, Jan 18, 2017 at 04:17:18PM +0000, Punit Agrawal wrote:
>> > Mark Rutland <mark.rutland@arm.com> writes:
>> >
>> > > On Wed, Jan 18, 2017 at 02:51:31PM +0000, Punit Agrawal wrote:
>> > >> I should've clarified in my reply that I wasn't looking to support the
>> > >> third instance from Mark's examples above - "monitor all vCPUs on a
>> > >> pCPU". I think it'll be quite expensive to figure out which threads from
>> > >> a given pool are vCPUs.
>> > >
>> > > I'm not sure I follow why you would need to do that?
>> > >
>> > > In that case, we'd open a CPU-bound perf event for the pCPU, which would
>> > > get installed in the CPU context immediately. It would be present for
>> > > all tasks.
>> > >
>> > > Given it's present for all tasks, we don't need to figure out which
>> > > happen to have vCPUs. The !vCPU tasks simply shouldn't trigger events.
>> > >
>> > > Am I missing something?
>> >
>> > When enabling a CPU-bound event for pCPU, we'd have to enable trapping
>> > of TLB operations for the vCPUs running on pCPU. Have a look at Patch
>> > 7/9.
>> >
>> > Also, we'd have to enable/disable trapping when tasks are migrated
>> > between pCPUs.
>>
>> Ah, so we can't configure the trap and leave it active, since it'll
>> affect the host.
>>
>> We could have a per-cpu flag, and a hook into vcpu_run, but that's also
>> gnarly.
>>
>> I'll have a think.
>>
>> > So far I've assumed that a VM pid is immutable. If that doesn't hold
>> > then we need to think of another mechanism to refer to a VM from
>> > userspace.
>>
>> Even if we can't migrate the VM between processes (i.e. it's immutable),
>> it's still not unique within a process, so I'm fairly sure we need
>> another mechanism (even if we get away with the common case today).
>
> I don't understand what the requirements here are exactly but the KVM
> API documentation says:
>
>     In general file descriptors can be migrated among processes by means
>     of fork() and the SCM_RIGHTS facility of unix domain socket. These
>     kinds of tricks are explicitly not supported by kvm. While they will
>     not cause harm to the host, their actual behavior is not guaranteed by
>     the API. The only supported use is one virtual machine per process,
>     and one vcpu per thread.
>
> So this code should maintain those semantics and it's fair to assume the
> thread group leader of a given VM stays the same, but the code must not
> rely on this fact for safe operations.

Thanks for clarifying. The current version passes muster on these
assumptions, but I'll have to take a closer look to convince myself of
the safety. By moving to vCPU pids in the next version, things should
further improve in this regard.

> I also don't see why a process couldn't open multiple VMs; however
> messy that may be, it appears possible to me.

I imagine there is an implicit reliance on the VMM to handle any
resulting fallout if it chooses to do this.

> -Christoffer
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 26f0c8a0b790..b988f8801b86 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -289,6 +289,14 @@ static inline int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
 int kvm_perf_init(void);
 int kvm_perf_teardown(void);
 
+#if !defined(CONFIG_KVM_HOST_PMU)
+static inline int kvm_host_pmu_init(void) { return 0; }
+static inline void kvm_host_pmu_teardown(void) { }
+#else
+int kvm_host_pmu_init(void);
+void kvm_host_pmu_teardown(void);
+#endif
+
 void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
 
 struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 11676787ad49..058626b65b8d 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -1263,6 +1263,7 @@ static int init_subsystems(void)
 		goto out;
 
 	kvm_perf_init();
+	kvm_host_pmu_init();
 	kvm_coproc_table_init();
 
 out:
@@ -1453,6 +1454,7 @@ int kvm_arch_init(void *opaque)
 void kvm_arch_exit(void)
 {
 	kvm_perf_teardown();
+	kvm_host_pmu_teardown();
 }
 
 static int arm_init(void)
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 1e83b707f14c..018f887e8964 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -349,6 +349,14 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
 int kvm_perf_init(void);
 int kvm_perf_teardown(void);
 
+#if !defined(CONFIG_KVM_HOST_PMU)
+static inline int kvm_host_pmu_init(void) { return 0; }
+static inline void kvm_host_pmu_teardown(void) { }
+#else
+int kvm_host_pmu_init(void);
+void kvm_host_pmu_teardown(void);
+#endif
+
 struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
 
 static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr,
diff --git a/virt/kvm/arm/host_pmu.c b/virt/kvm/arm/host_pmu.c
new file mode 100644
index 000000000000..fc610ccc169a
--- /dev/null
+++ b/virt/kvm/arm/host_pmu.c
@@ -0,0 +1,272 @@
+#include <linux/cpumask.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/list.h>
+#include <linux/perf_event.h>
+#include <linux/pid.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock_types.h>
+#include <linux/sysfs.h>
+
+#include <asm/kvm_emulate.h>
+
+enum host_pmu_events {
+	KVM_HOST_MAX_EVENTS,
+};
+
+struct host_pmu {
+	struct pmu pmu;
+	spinlock_t event_list_lock;
+	struct list_head event_list_head;
+} host_pmu;
+#define to_host_pmu(p)	(container_of(p, struct host_pmu, pmu))
+
+typedef void (*configure_event_fn)(struct kvm *kvm, bool enable);
+typedef u64 (*get_event_count_fn)(struct kvm *kvm);
+
+struct kvm_event_cb {
+	enum host_pmu_events event;
+	get_event_count_fn get_event_count;
+	configure_event_fn configure_event;
+};
+
+struct event_data {
+	bool enable;
+	struct kvm *kvm;
+	struct kvm_event_cb *cb;
+	struct work_struct work;
+	struct list_head event_list;
+};
+
+static struct kvm_event_cb event_callbacks[] = {
+};
+
+static struct attribute *event_attrs[] = {
+	NULL,
+};
+
+static struct attribute_group events_attr_group = {
+	.name = "events",
+	.attrs = event_attrs,
+};
+
+
+#define VM_MASK		GENMASK_ULL(31, 0)
+#define EVENT_MASK	GENMASK_ULL(32, 39)
+#define EVENT_SHIFT	(32)
+
+#define to_pid(cfg)	((cfg) & VM_MASK)
+#define to_event(cfg)	(((cfg) & EVENT_MASK) >> EVENT_SHIFT)
+
+PMU_FORMAT_ATTR(vm, "config:0-31");
+PMU_FORMAT_ATTR(event, "config:32-39");
+
+static struct attribute *format_attrs[] = {
+	&format_attr_vm.attr,
+	&format_attr_event.attr,
+	NULL,
+};
+
+static struct attribute_group format_attr_group = {
+	.name = "format",
+	.attrs = format_attrs,
+};
+
+static const struct attribute_group *attr_groups[] = {
+	&events_attr_group,
+	&format_attr_group,
+	NULL,
+};
+
+static void host_event_destroy(struct perf_event *event)
+{
+	struct host_pmu *host_pmu = to_host_pmu(event->pmu);
+	struct event_data *e_data = event->pmu_private;
+
+	/*
+	 * Ensure outstanding work items related to this event are
+	 * completed before freeing resources.
+	 */
+	flush_work(&e_data->work);
+
+	kvm_put_kvm(e_data->kvm);
+
+	spin_lock(&host_pmu->event_list_lock);
+	list_del(&e_data->event_list);
+	spin_unlock(&host_pmu->event_list_lock);
+	kfree(e_data);
+}
+
+void host_event_work(struct work_struct *work)
+{
+	struct event_data *e_data = container_of(work, struct event_data, work);
+	struct kvm *kvm = e_data->kvm;
+
+	e_data->cb->configure_event(kvm, e_data->enable);
+}
+
+static int host_event_init(struct perf_event *event)
+{
+	struct host_pmu *host_pmu = to_host_pmu(event->pmu);
+	int event_id = to_event(event->attr.config);
+	pid_t task_pid = to_pid(event->attr.config);
+	struct event_data *e_data, *pos;
+	bool found = false;
+	struct pid *pid;
+	struct kvm *kvm;
+	int ret = 0;
+
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+
+	if (has_branch_stack(event) ||
+	    is_sampling_event(event) ||
+	    event->attr.exclude_user ||
+	    event->attr.exclude_kernel ||
+	    event->attr.exclude_hv ||
+	    event->attr.exclude_idle ||
+	    event->attr.exclude_guest) {
+		return -EINVAL;
+	}
+
+	if (event->attach_state == PERF_ATTACH_TASK)
+		return -EOPNOTSUPP;
+
+	if (event->cpu < 0)
+		return -EINVAL;
+
+	if (event_id >= KVM_HOST_MAX_EVENTS)
+		return -EINVAL;
+
+	pid = find_get_pid(task_pid);
+	spin_lock(&kvm_lock);
+	list_for_each_entry(kvm, &vm_list, vm_list) {
+		if (kvm->pid == pid) {
+			kvm_get_kvm(kvm);
+			found = true;
+			break;
+		}
+	}
+	spin_unlock(&kvm_lock);
+	put_pid(pid);
+
+	if (!found)
+		return -EINVAL;
+
+	spin_lock(&host_pmu->event_list_lock);
+	/* Make sure we don't already have the (event_id, kvm) pair */
+	list_for_each_entry(pos, &host_pmu->event_list_head, event_list) {
+		if (pos->cb->event == event_id &&
+		    pos->kvm->pid == pid) {
+			kvm_put_kvm(kvm);
+			ret = -EOPNOTSUPP;
+			goto unlock;
+		}
+	}
+
+	e_data = kzalloc(sizeof(*e_data), GFP_KERNEL);
+	e_data->kvm = kvm;
+	e_data->cb = &event_callbacks[event_id];
+	INIT_WORK(&e_data->work, host_event_work);
+	event->pmu_private = e_data;
+	event->cpu = cpumask_first(cpu_online_mask);
+	event->destroy = host_event_destroy;
+
+	list_add_tail(&e_data->event_list, &host_pmu->event_list_head);
+
+unlock:
+	spin_unlock(&host_pmu->event_list_lock);
+
+	return ret;
+}
+
+static void host_event_update(struct perf_event *event)
+{
+	struct event_data *e_data = event->pmu_private;
+	struct kvm_event_cb *cb = e_data->cb;
+	struct kvm *kvm = e_data->kvm;
+	struct hw_perf_event *hw = &event->hw;
+	u64 prev_count, new_count;
+
+	do {
+		prev_count = local64_read(&hw->prev_count);
+		new_count = cb->get_event_count(kvm);
+	} while (local64_xchg(&hw->prev_count, new_count) != prev_count);
+
+	local64_add(new_count - prev_count, &event->count);
+}
+
+static void host_event_start(struct perf_event *event, int flags)
+{
+	struct event_data *e_data = event->pmu_private;
+	struct kvm_event_cb *cb = e_data->cb;
+	struct kvm *kvm = e_data->kvm;
+	u64 val;
+
+	val = cb->get_event_count(kvm);
+	local64_set(&event->hw.prev_count, val);
+
+	e_data->enable = true;
+	schedule_work(&e_data->work);
+}
+
+static void host_event_stop(struct perf_event *event, int flags)
+{
+	struct event_data *e_data = event->pmu_private;
+
+	e_data->enable = false;
+	schedule_work(&e_data->work);
+
+	if (flags & PERF_EF_UPDATE)
+		host_event_update(event);
+}
+
+static int host_event_add(struct perf_event *event, int flags)
+{
+	if (flags & PERF_EF_START)
+		host_event_start(event, flags);
+
+	return 0;
+}
+
+static void host_event_del(struct perf_event *event, int flags)
+{
+	host_event_stop(event, PERF_EF_UPDATE);
+}
+
+static void host_event_read(struct perf_event *event)
+{
+	host_event_update(event);
+}
+
+static void init_host_pmu(struct host_pmu *host_pmu)
+{
+	host_pmu->pmu = (struct pmu) {
+		.task_ctx_nr	= perf_sw_context,
+		.attr_groups	= attr_groups,
+		.event_init	= host_event_init,
+		.add		= host_event_add,
+		.del		= host_event_del,
+		.start		= host_event_start,
+		.stop		= host_event_stop,
+		.read		= host_event_read,
+		.capabilities	= PERF_PMU_CAP_NO_INTERRUPT,
+	};
+
+	INIT_LIST_HEAD(&host_pmu->event_list_head);
+	spin_lock_init(&host_pmu->event_list_lock);
+}
+
+int kvm_host_pmu_init(void)
+{
+	init_host_pmu(&host_pmu);
+
+	return perf_pmu_register(&host_pmu.pmu, "kvm", -1);
+}
+
+void kvm_host_pmu_teardown(void)
+{
+	perf_pmu_unregister(&host_pmu.pmu);
+}
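For reference, this is how userspace would drive the interface as posted,
encoding the target VM and event into attr.config per the "vm"
(config:0-31) and "event" (config:32-39) format attributes; a minimal
sketch, assuming the caller reads the PMU's dynamic type from
/sys/bus/event_source/devices/kvm/type and supplies event_id and vm_pid:

```c
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

static int open_vm_event(unsigned int pmu_type,
			 unsigned long long event_id,
			 pid_t vm_pid, int cpu)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = pmu_type;
	/* VM pid in config[31:0], event number in config[39:32] */
	attr.config = (event_id << 32) |
		      (unsigned long long)(unsigned int)vm_pid;

	/*
	 * pid == -1 with an explicit cpu: host_event_init() rejects
	 * task-bound events (PERF_ATTACH_TASK) and event->cpu < 0.
	 */
	return (int)syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
}
```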
Both AArch32 and AArch64 mode of the ARMv8 architecture support trapping
certain VM operations, e.g, TLB and cache maintenance
operations. Selective trapping of these operations for specific VMs can
be used to track the frequency with which these occur during execution.

Add a software PMU on the host that can support tracking VM
operations (in the form of PMU events). Supporting new events requires
providing callbacks to configure the VM to enable/disable the trapping
and read a count of the frequency.

The host PMU events can be controlled by tools like perf that use
standard kernel perf interfaces.

Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
Cc: Christoffer Dall <christoffer.dall@linaro.org>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
---
 arch/arm/include/asm/kvm_host.h   |   8 ++
 arch/arm/kvm/arm.c                |   2 +
 arch/arm64/include/asm/kvm_host.h |   8 ++
 virt/kvm/arm/host_pmu.c           | 272 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 290 insertions(+)
 create mode 100644 virt/kvm/arm/host_pmu.c
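The patch deliberately leaves event_callbacks[] empty; as a hedged sketch
of how a later patch in the series (e.g. the TLB-ops trapping in 7/9)
would plug into the framework, with every name below a hypothetical
placeholder rather than anything the series defines:

```c
/* hypothetical count/configure pair supplied by a follow-up patch */
static u64 kvm_get_tlb_flush_count(struct kvm *kvm);
static void kvm_configure_tlb_trap(struct kvm *kvm, bool enable);

enum host_pmu_events {
	KVM_HOST_TLB_FLUSH,	/* new event IDs slot in above the max */
	KVM_HOST_MAX_EVENTS,
};

static struct kvm_event_cb event_callbacks[] = {
	[KVM_HOST_TLB_FLUSH] = {
		.event		 = KVM_HOST_TLB_FLUSH,
		.get_event_count = kvm_get_tlb_flush_count,
		.configure_event = kvm_configure_tlb_trap,
	},
};
```

With an entry in place, host_event_init()'s bounds check against
KVM_HOST_MAX_EVENTS admits the new event ID, and start/stop/read route to
the callbacks via the per-event work item and host_event_update().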