[v3,6/9] kvm: arm/arm64: Add host pmu to support VM introspection

Message ID 20170110113856.7183-7-punit.agrawal@arm.com (mailing list archive)
State New, archived

Commit Message

Punit Agrawal Jan. 10, 2017, 11:38 a.m. UTC
Both AArch32 and AArch64 modes of the ARMv8 architecture support trapping
certain VM operations, e.g., TLB and cache maintenance
operations. Selective trapping of these operations for specific VMs can
be used to track the frequency with which these occur during execution.

Add a software PMU on the host that can support tracking VM
operations (in the form of PMU events). Supporting new events requires
providing callbacks to configure the VM to enable/disable the trapping
and to read a count of the trapped operations.

The host PMU events can be controlled by tools like perf that use
standard kernel perf interfaces.
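
For illustration, a monitoring tool could program this interface along the
following lines (a sketch only: the helper name and event ID are hypothetical,
and the PMU type must be read from /sys/bus/event_source/devices/kvm/type):

#include <linux/perf_event.h>
#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

/* Count a (hypothetical) event for the VM owned by vm_pid. */
static int open_vm_event(int pmu_type, pid_t vm_pid, uint64_t event_id)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = pmu_type;
	/* the VM pid lives in config:0-31, the event ID in config:32-39 */
	attr.config = ((uint64_t)vm_pid & 0xffffffffULL) | (event_id << 32);

	/* task-bound events are rejected, so open a CPU-bound event */
	return syscall(__NR_perf_event_open, &attr, -1 /* pid */,
		       0 /* cpu */, -1 /* group_fd */, 0 /* flags */);
}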

Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
Cc: Christoffer Dall <christoffer.dall@linaro.org>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
---
 arch/arm/include/asm/kvm_host.h   |   8 ++
 arch/arm/kvm/arm.c                |   2 +
 arch/arm64/include/asm/kvm_host.h |   8 ++
 virt/kvm/arm/host_pmu.c           | 272 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 290 insertions(+)
 create mode 100644 virt/kvm/arm/host_pmu.c

Comments

Marc Zyngier Jan. 18, 2017, 11:21 a.m. UTC | #1
+Mark

On 10/01/17 11:38, Punit Agrawal wrote:
> Both AArch32 and AArch64 modes of the ARMv8 architecture support trapping
> certain VM operations, e.g., TLB and cache maintenance
> operations. Selective trapping of these operations for specific VMs can
> be used to track the frequency with which these occur during execution.
> 
> Add a software PMU on the host that can support tracking VM
> operations (in the form of PMU events). Supporting new events requires
> providing callbacks to configure the VM to enable/disable the trapping
> and to read a count of the trapped operations.
> 
> The host PMU events can be controlled by tools like perf that use
> standard kernel perf interfaces.
> 
> Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
> Cc: Christoffer Dall <christoffer.dall@linaro.org>
> Cc: Marc Zyngier <marc.zyngier@arm.com>
> Cc: Will Deacon <will.deacon@arm.com>
> ---
>  arch/arm/include/asm/kvm_host.h   |   8 ++
>  arch/arm/kvm/arm.c                |   2 +
>  arch/arm64/include/asm/kvm_host.h |   8 ++
>  virt/kvm/arm/host_pmu.c           | 272 ++++++++++++++++++++++++++++++++++++++
>  4 files changed, 290 insertions(+)
>  create mode 100644 virt/kvm/arm/host_pmu.c
> 
> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
> index 26f0c8a0b790..b988f8801b86 100644
> --- a/arch/arm/include/asm/kvm_host.h
> +++ b/arch/arm/include/asm/kvm_host.h
> @@ -289,6 +289,14 @@ static inline int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
>  int kvm_perf_init(void);
>  int kvm_perf_teardown(void);
>  
> +#if !defined(CONFIG_KVM_HOST_PMU)
> +static inline int kvm_host_pmu_init(void) { return 0; }
> +static inline void kvm_host_pmu_teardown(void) { }
> +#else
> +int kvm_host_pmu_init(void);
> +void kvm_host_pmu_teardown(void);
> +#endif
> +
>  void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
>  
>  struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
> index 11676787ad49..058626b65b8d 100644
> --- a/arch/arm/kvm/arm.c
> +++ b/arch/arm/kvm/arm.c
> @@ -1263,6 +1263,7 @@ static int init_subsystems(void)
>  		goto out;
>  
>  	kvm_perf_init();
> +	kvm_host_pmu_init();
>  	kvm_coproc_table_init();
>  
>  out:
> @@ -1453,6 +1454,7 @@ int kvm_arch_init(void *opaque)
>  void kvm_arch_exit(void)
>  {
>  	kvm_perf_teardown();
> +	kvm_host_pmu_teardown();
>  }
>  
>  static int arm_init(void)
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 1e83b707f14c..018f887e8964 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -349,6 +349,14 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
>  int kvm_perf_init(void);
>  int kvm_perf_teardown(void);
>  
> +#if !defined(CONFIG_KVM_HOST_PMU)
> +static inline int kvm_host_pmu_init(void) { return 0; }
> +static inline void kvm_host_pmu_teardown(void) { }
> +#else
> +int kvm_host_pmu_init(void);
> +void kvm_host_pmu_teardown(void);
> +#endif
> +
>  struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
>  
>  static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr,
> diff --git a/virt/kvm/arm/host_pmu.c b/virt/kvm/arm/host_pmu.c
> new file mode 100644
> index 000000000000..fc610ccc169a
> --- /dev/null
> +++ b/virt/kvm/arm/host_pmu.c
> @@ -0,0 +1,272 @@
> +#include <linux/cpumask.h>
> +#include <linux/device.h>
> +#include <linux/kernel.h>
> +#include <linux/kvm_host.h>
> +#include <linux/list.h>
> +#include <linux/perf_event.h>
> +#include <linux/pid.h>
> +#include <linux/sched.h>
> +#include <linux/slab.h>
> +#include <linux/spinlock_types.h>
> +#include <linux/sysfs.h>
> +
> +#include <asm/kvm_emulate.h>
> +
> +enum host_pmu_events {
> +	KVM_HOST_MAX_EVENTS,
> +};
> +
> +struct host_pmu {
> +	struct pmu pmu;
> +	spinlock_t event_list_lock;
> +	struct list_head event_list_head;
> +} host_pmu;
> +#define to_host_pmu(p) (container_of(p, struct host_pmu, pmu))
> +
> +typedef void (*configure_event_fn)(struct kvm *kvm, bool enable);
> +typedef u64 (*get_event_count_fn)(struct kvm *kvm);
> +
> +struct kvm_event_cb {
> +	enum host_pmu_events event;
> +	get_event_count_fn get_event_count;
> +	configure_event_fn configure_event;
> +};
> +
> +struct event_data {
> +	bool enable;
> +	struct kvm *kvm;
> +	struct kvm_event_cb *cb;
> +	struct work_struct work;
> +	struct list_head event_list;
> +};
> +
> +static struct kvm_event_cb event_callbacks[] = {
> +};
> +
> +static struct attribute *event_attrs[] = {
> +	NULL,
> +};
> +
> +static struct attribute_group events_attr_group = {
> +	.name	= "events",
> +	.attrs	= event_attrs,
> +};
> +
> +
> +#define VM_MASK	GENMASK_ULL(31, 0)
> +#define EVENT_MASK	GENMASK_ULL(39, 32)
> +#define EVENT_SHIFT	(32)
> +
> +#define to_pid(cfg)	((cfg) & VM_MASK)
> +#define to_event(cfg)	(((cfg) & EVENT_MASK) >> EVENT_SHIFT)
> +
> +PMU_FORMAT_ATTR(vm, "config:0-31");
> +PMU_FORMAT_ATTR(event, "config:32-39");

I'm a bit confused by these. Can't you get the PID of the VM you're
tracing directly from perf, without having to encode things? And if you
can't, surely this should be a function of the size of pid_t?

Mark, can you shine some light here?

> +
> +static struct attribute *format_attrs[] = {
> +	&format_attr_vm.attr,
> +	&format_attr_event.attr,
> +	NULL,
> +};
> +
> +static struct attribute_group format_attr_group = {
> +	.name	= "format",
> +	.attrs	= format_attrs,
> +};
> +
> +static const struct attribute_group *attr_groups[] = {
> +	&events_attr_group,
> +	&format_attr_group,
> +	NULL,
> +};
> +
> +static void host_event_destroy(struct perf_event *event)
> +{
> +	struct host_pmu *host_pmu = to_host_pmu(event->pmu);
> +	struct event_data *e_data = event->pmu_private;
> +
> +	/*
> +	 * Ensure outstanding work items related to this event are
> +	 * completed before freeing resources.
> +	 */
> +	flush_work(&e_data->work);
> +
> +	kvm_put_kvm(e_data->kvm);
> +
> +	spin_lock(&host_pmu->event_list_lock);
> +	list_del(&e_data->event_list);
> +	spin_unlock(&host_pmu->event_list_lock);
> +	kfree(e_data);
> +}
> +
> +static void host_event_work(struct work_struct *work)
> +{
> +	struct event_data *e_data = container_of(work, struct event_data, work);
> +	struct kvm *kvm = e_data->kvm;
> +
> +	e_data->cb->configure_event(kvm, e_data->enable);
> +}
> +
> +static int host_event_init(struct perf_event *event)
> +{
> +	struct host_pmu *host_pmu = to_host_pmu(event->pmu);
> +	int event_id = to_event(event->attr.config);
> +	pid_t task_pid = to_pid(event->attr.config);
> +	struct event_data *e_data, *pos;
> +	bool found = false;
> +	struct pid *pid;
> +	struct kvm *kvm;
> +	int ret = 0;
> +
> +	if (event->attr.type != event->pmu->type)
> +		return -ENOENT;
> +
> +	if (has_branch_stack(event)	||
> +	    is_sampling_event(event)	||
> +	    event->attr.exclude_user	||
> +	    event->attr.exclude_kernel	||
> +	    event->attr.exclude_hv	||
> +	    event->attr.exclude_idle	||
> +	    event->attr.exclude_guest) {
> +		return -EINVAL;
> +	}
> +
> +	if (event->attach_state == PERF_ATTACH_TASK)
> +		return -EOPNOTSUPP;
> +
> +	if (event->cpu < 0)
> +		return -EINVAL;
> +
> +	if (event_id >= KVM_HOST_MAX_EVENTS)
> +		return -EINVAL;
> +
> +	pid = find_get_pid(task_pid);
> +	spin_lock(&kvm_lock);
> +	list_for_each_entry(kvm, &vm_list, vm_list) {
> +		if (kvm->pid == pid) {
> +			kvm_get_kvm(kvm);
> +			found = true;
> +			break;
> +		}
> +	}
> +	spin_unlock(&kvm_lock);
> +	put_pid(pid);
> +
> +	if (!found)
> +		return -EINVAL;
> +
> +	spin_lock(&host_pmu->event_list_lock);
> +	/* Make sure we don't already have the (event_id, kvm) pair */
> +	list_for_each_entry(pos, &host_pmu->event_list_head, event_list) {
> +		if (pos->cb->event == event_id &&
> +		    pos->kvm->pid == pid) {
> +			kvm_put_kvm(kvm);
> +			ret = -EOPNOTSUPP;
> +			goto unlock;
> +		}
> +	}
> +
> +	e_data = kzalloc(sizeof(*e_data), GFP_KERNEL);
> +	e_data->kvm = kvm;
> +	e_data->cb = &event_callbacks[event_id];
> +	INIT_WORK(&e_data->work, host_event_work);
> +	event->pmu_private = e_data;
> +	event->cpu = cpumask_first(cpu_online_mask);
> +	event->destroy = host_event_destroy;
> +
> +	list_add_tail(&e_data->event_list, &host_pmu->event_list_head);
> +
> +unlock:
> +	spin_unlock(&host_pmu->event_list_lock);
> +
> +	return ret;
> +}
> +
> +static void host_event_update(struct perf_event *event)
> +{
> +	struct event_data *e_data = event->pmu_private;
> +	struct kvm_event_cb *cb = e_data->cb;
> +	struct kvm *kvm = e_data->kvm;
> +	struct hw_perf_event *hw = &event->hw;
> +	u64 prev_count, new_count;
> +
> +	do {
> +		prev_count = local64_read(&hw->prev_count);
> +		new_count = cb->get_event_count(kvm);
> +	} while (local64_xchg(&hw->prev_count, new_count) != prev_count);
> +
> +	local64_add(new_count - prev_count, &event->count);
> +}
> +
> +static void host_event_start(struct perf_event *event, int flags)
> +{
> +	struct event_data *e_data = event->pmu_private;
> +	struct kvm_event_cb *cb = e_data->cb;
> +	struct kvm *kvm = e_data->kvm;
> +	u64 val;
> +
> +	val = cb->get_event_count(kvm);
> +	local64_set(&event->hw.prev_count, val);
> +
> +	e_data->enable = true;
> +	schedule_work(&e_data->work);
> +}
> +
> +static void host_event_stop(struct perf_event *event, int flags)
> +{
> +	struct event_data *e_data = event->pmu_private;
> +
> +	e_data->enable = false;
> +	schedule_work(&e_data->work);
> +
> +	if (flags & PERF_EF_UPDATE)
> +		host_event_update(event);
> +}
> +
> +static int host_event_add(struct perf_event *event, int flags)
> +{
> +	if (flags & PERF_EF_START)
> +		host_event_start(event, flags);
> +
> +	return 0;
> +}
> +
> +static void host_event_del(struct perf_event *event, int flags)
> +{
> +	host_event_stop(event, PERF_EF_UPDATE);
> +}
> +
> +static void host_event_read(struct perf_event *event)
> +{
> +	host_event_update(event);
> +}
> +
> +static void init_host_pmu(struct host_pmu *host_pmu)
> +{
> +	host_pmu->pmu = (struct pmu) {
> +		.task_ctx_nr	= perf_sw_context,
> +		.attr_groups	= attr_groups,
> +		.event_init	= host_event_init,
> +		.add		= host_event_add,
> +		.del		= host_event_del,
> +		.start		= host_event_start,
> +		.stop		= host_event_stop,
> +		.read		= host_event_read,
> +		.capabilities	= PERF_PMU_CAP_NO_INTERRUPT,
> +	};
> +
> +	INIT_LIST_HEAD(&host_pmu->event_list_head);
> +	spin_lock_init(&host_pmu->event_list_lock);
> +}
> +
> +int kvm_host_pmu_init(void)
> +{
> +	init_host_pmu(&host_pmu);
> +
> +	return perf_pmu_register(&host_pmu.pmu, "kvm", -1);
> +}
> +
> +void kvm_host_pmu_teardown(void)
> +{
> +	perf_pmu_unregister(&host_pmu.pmu);
> +}
> 

This patch really makes me think that there is nothing arm-specific in
here at all. Why can't it be a generic feature through which
architectures can expose events in a generic way (or as close as
possible to being generic)?

Thanks,

	M.
Mark Rutland Jan. 18, 2017, 11:35 a.m. UTC | #2
On Wed, Jan 18, 2017 at 11:21:21AM +0000, Marc Zyngier wrote:
> On 10/01/17 11:38, Punit Agrawal wrote:
> > +#define VM_MASK	GENMASK_ULL(31, 0)
> > +#define EVENT_MASK	GENMASK_ULL(39, 32)
> > +#define EVENT_SHIFT	(32)
> > +
> > +#define to_pid(cfg)	((cfg) & VM_MASK)
> > +#define to_event(cfg)	(((cfg) & EVENT_MASK) >> EVENT_SHIFT)
> > +
> > +PMU_FORMAT_ATTR(vm, "config:0-31");
> > +PMU_FORMAT_ATTR(event, "config:32-39");
> 
> I'm a bit confused by these. Can't you get the PID of the VM you're
> tracing directly from perf, without having to encode things? And if you
> can't, surely this should be a function of the size of pid_t?
>
> Mark, can you shine some light here?

AFAICT, this is not necessary.

The perf_event_open() syscall takes a PID separately from the
perf_event_attr. i.e. we should be able to do:

// monitor a particular vCPU
perf_event_open(attr, vcpupid, -1, -1, 0)

... or ...

// monitor a particular vCPU on a pCPU
perf_event_open(attr, vcpupid, cpu, -1, 0)

... or ...

// monitor all vCPUs on a pCPU
perf_event_open(attr, -1, cpu, -1, 0)

... so this shouldn't be necessary. AFAICT, this is a SW PMU, so there
should be no issue with using the perf_sw_context.

If this is a bodge to avoid opening a perf_event per vCPU thread, then I
completely disagree with the approach. This would be better handled in
userspace by discovering the set of threads and opening events for each.
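
A rough sketch of that discovery step (open_vcpu_event() here is a
hypothetical wrapper that calls perf_event_open() for a single tid):

#include <dirent.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>

/* Open one event per thread of the VMM process. */
static int open_per_thread_events(pid_t vmm_pid, int *fds, int max)
{
	struct dirent *de;
	char path[64];
	DIR *dir;
	int n = 0;

	snprintf(path, sizeof(path), "/proc/%d/task", (int)vmm_pid);
	dir = opendir(path);
	if (!dir)
		return -1;

	while ((de = readdir(dir)) != NULL && n < max) {
		pid_t tid = (pid_t)atoi(de->d_name);

		if (tid <= 0)	/* skip "." and ".." */
			continue;
		fds[n++] = open_vcpu_event(tid);	/* hypothetical helper */
	}
	closedir(dir);

	return n;
}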

Thanks,
Mark.
Punit Agrawal Jan. 18, 2017, 1:01 p.m. UTC | #3
Mark Rutland <mark.rutland@arm.com> writes:

> On Wed, Jan 18, 2017 at 11:21:21AM +0000, Marc Zyngier wrote:
>> On 10/01/17 11:38, Punit Agrawal wrote:
>> > +#define VM_MASK	GENMASK_ULL(31, 0)
>> > +#define EVENT_MASK	GENMASK_ULL(39, 32)
>> > +#define EVENT_SHIFT	(32)
>> > +
>> > +#define to_pid(cfg)	((cfg) & VM_MASK)
>> > +#define to_event(cfg)	(((cfg) & EVENT_MASK) >> EVENT_SHIFT)
>> > +
>> > +PMU_FORMAT_ATTR(vm, "config:0-31");
>> > +PMU_FORMAT_ATTR(event, "config:32-39");
>> 
>> I'm a bit confused by these. Can't you get the PID of the VM you're
>> tracing directly from perf, without having to encode things?

With perf attached to a PID, the event gets scheduled out when the task
is context switched. As the PID of the controlling process was used,
none of the vCPU events were counted.

>> And if you
>> can't, surely this should be a function of the size of pid_t?

Agreed. I'll update above if we decide to carry on with this
approach. More below...

>>
>> Mark, can you shine some light here?
>
> AFAICT, this is not necessary.
>
> The perf_event_open() syscall takes a PID separately from the
> perf_event_attr. i.e. we should be able to do:
>
> // monitor a particular vCPU
> perf_event_open(attr, vcpupid, -1, -1, 0)
>
> ... or ...
>
> // monitor a particular vCPU on a pCPU
> perf_event_open(attr, vcpupid, cpu, -1, 0)
>
> ... or ...
>
> // monitor all vCPUs on a pCPU
> perf_event_open(attr, -1, cpu, -1, 0)
>
> ... so this shouldn't be necessary. AFAICT, this is a SW PMU, so there
> should be no issue with using the perf_sw_context.

I might have missed it but none of the modes of invoking perf_event_open
allow monitoring a set of processes, i.e., all vCPUs belonging to a
particular VM, which was one of the aims and a feature I was carrying
over from the previous version. If we do not care about this...

>
> If this is a bodge to avoid opening a perf_event per vCPU thread, then I
> completely disagree with the approach. This would be better handled in
> userspace by discovering the set of threads and opening events for
> each.

... then requiring userspace to invoke perf_event_open per vCPU will
simplify this patch.

Marc, any objections?

>
> Thanks,
> Mark.
Punit Agrawal Jan. 18, 2017, 1:06 p.m. UTC | #4
Hi Marc,

Marc Zyngier <marc.zyngier@arm.com> writes:

> +Mark
>
> On 10/01/17 11:38, Punit Agrawal wrote:
>> Both AArch32 and AArch64 modes of the ARMv8 architecture support trapping
>> certain VM operations, e.g., TLB and cache maintenance
>> operations. Selective trapping of these operations for specific VMs can
>> be used to track the frequency with which these occur during execution.
>> 
>> Add a software PMU on the host that can support tracking VM
>> operations (in the form of PMU events). Supporting new events requires
>> providing callbacks to configure the VM to enable/disable the trapping
>> and to read a count of the trapped operations.
>> 
>> The host PMU events can be controlled by tools like perf that use
>> standard kernel perf interfaces.
>> 
>> Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
>> Cc: Christoffer Dall <christoffer.dall@linaro.org>
>> Cc: Marc Zyngier <marc.zyngier@arm.com>
>> Cc: Will Deacon <will.deacon@arm.com>
>> ---
>>  arch/arm/include/asm/kvm_host.h   |   8 ++
>>  arch/arm/kvm/arm.c                |   2 +
>>  arch/arm64/include/asm/kvm_host.h |   8 ++
>>  virt/kvm/arm/host_pmu.c           | 272 ++++++++++++++++++++++++++++++++++++++
>>  4 files changed, 290 insertions(+)
>>  create mode 100644 virt/kvm/arm/host_pmu.c
>> 

[...]

>> 
>
> This patch really makes me think that there is nothing arm-specific in
> here at all. Why can't it be a generic feature through which
> architectures can expose events in a generic way (or as close as
> possible to being generic)?

I wasn't sure if other architectures are interested in the functionality
- that's the only reason. Having said that, it can be turned off in the
config so there shouldn't be any complaints.

I'll move this to virt/kvm in the next incarnation.

Thanks for taking a look.

>
> Thanks,
>
> 	M.
Marc Zyngier Jan. 18, 2017, 1:18 p.m. UTC | #5
On 18/01/17 13:01, Punit Agrawal wrote:
> Mark Rutland <mark.rutland@arm.com> writes:
> 
>> On Wed, Jan 18, 2017 at 11:21:21AM +0000, Marc Zyngier wrote:
>>> On 10/01/17 11:38, Punit Agrawal wrote:
>>>> +#define VM_MASK	GENMASK_ULL(31, 0)
>>>> +#define EVENT_MASK	GENMASK_ULL(39, 32)
>>>> +#define EVENT_SHIFT	(32)
>>>> +
>>>> +#define to_pid(cfg)	((cfg) & VM_MASK)
>>>> +#define to_event(cfg)	(((cfg) & EVENT_MASK) >> EVENT_SHIFT)
>>>> +
>>>> +PMU_FORMAT_ATTR(vm, "config:0-31");
>>>> +PMU_FORMAT_ATTR(event, "config:32-39");
>>>
>>> I'm a bit confused by these. Can't you get the PID of the VM you're
>>> tracing directly from perf, without having to encode things?
> 
> With perf attached to a PID, the event gets scheduled out when the task
> is context switched. As the PID of the controlling process was used,
> none of the vCPU events were counted.
> 
>>> And if you
>>> can't, surely this should be a function of the size of pid_t?
> 
> Agreed. I'll update above if we decide to carry on with this
> approach. More below...
> 
>>>
>>> Mark, can you shine some light here?
>>
>> AFAICT, this is not necessary.
>>
>> The perf_event_open() syscall takes a PID separately from the
>> perf_event_attr. i.e. we should be able to do:
>>
>> // monitor a particular vCPU
>> perf_event_open(attr, vcpupid, -1, -1, 0)
>>
>> ... or ...
>>
>> // monitor a particular vCPU on a pCPU
>> perf_event_open(attr, vcpupid, cpu, -1, 0)
>>
>> ... or ...
>>
>> // monitor all vCPUs on a pCPU
>> perf_event_open(attr, -1, cpu, -1, 0)
>>
>> ... so this shouldn't be necessary. AFAICT, this is a SW PMU, so there
>> should be no issue with using the perf_sw_context.
> 
> I might have missed it but none of the modes of invoking perf_event_open
> allow monitoring a set of processes, i.e., all vCPUs belonging to a
> particular VM, which was one of the aims and a feature I was carrying
> over from the previous version. If we do not care about this...
> 
>>
>> If this is a bodge to avoid opening a perf_event per vCPU thread, then I
>> completely disagree with the approach. This would be better handled in
>> userspace by discovering the set of threads and opening events for
>> each.
> 
> ... then requiring userspace to invoke perf_event_open per vCPU will
> simplify this patch.
> 
> Marc, any objections?

Not so far, but I'm curious to find out how you determine which thread
is a vcpu, let alone a given vcpu.

Thanks,

	M.
Will Deacon Jan. 18, 2017, 1:45 p.m. UTC | #6
On Wed, Jan 18, 2017 at 01:01:40PM +0000, Punit Agrawal wrote:
> Mark Rutland <mark.rutland@arm.com> writes:
> 
> > On Wed, Jan 18, 2017 at 11:21:21AM +0000, Marc Zyngier wrote:
> >> On 10/01/17 11:38, Punit Agrawal wrote:
> >> > +#define VM_MASK	GENMASK_ULL(31, 0)
> >> > +#define EVENT_MASK	GENMASK_ULL(39, 32)
> >> > +#define EVENT_SHIFT	(32)
> >> > +
> >> > +#define to_pid(cfg)	((cfg) & VM_MASK)
> >> > +#define to_event(cfg)	(((cfg) & EVENT_MASK) >> EVENT_SHIFT)
> >> > +
> >> > +PMU_FORMAT_ATTR(vm, "config:0-31");
> >> > +PMU_FORMAT_ATTR(event, "config:32-39");
> >> 
> >> I'm a bit confused by these. Can't you get the PID of the VM you're
> >> tracing directly from perf, without having to encode things?
> 
> With perf attached to a PID, the event gets scheduled out when the task
> is context switched. As the PID of the controlling process was used,
> none of the vCPU events were counted.

So it sounds like userspace needs to deal with this by attaching to the PIDs
of the vCPUs. Given that perf kvm seems to have knowledge of vCPUs, it would
be nice to know why that logic isn't reusable here. Take a look in
tools/perf/builtin-kvm.c and if it's not up to the job, then perhaps it can
be improved.

Will
Punit Agrawal Jan. 18, 2017, 2:51 p.m. UTC | #7
Marc Zyngier <marc.zyngier@arm.com> writes:

> On 18/01/17 13:01, Punit Agrawal wrote:
>> Mark Rutland <mark.rutland@arm.com> writes:
>> 
>>> On Wed, Jan 18, 2017 at 11:21:21AM +0000, Marc Zyngier wrote:
>>>> On 10/01/17 11:38, Punit Agrawal wrote:
>>>>> +#define VM_MASK	GENMASK_ULL(31, 0)
>>>>> +#define EVENT_MASK	GENMASK_ULL(39, 32)
>>>>> +#define EVENT_SHIFT	(32)
>>>>> +
>>>>> +#define to_pid(cfg)	((cfg) & VM_MASK)
>>>>> +#define to_event(cfg)	(((cfg) & EVENT_MASK) >> EVENT_SHIFT)
>>>>> +
>>>>> +PMU_FORMAT_ATTR(vm, "config:0-31");
>>>>> +PMU_FORMAT_ATTR(event, "config:32-39");
>>>>
>>>> I'm a bit confused by these. Can't you get the PID of the VM you're
>>>> tracing directly from perf, without having to encode things?
>> 
>> With perf attached to a PID, the event gets scheduled out when the task
>> is context switched. As the PID of the controlling process was used,
>> none of the vCPU events were counted.
>> 
>>>> And if you
>>>> can't, surely this should be a function of the size of pid_t?
>> 
>> Agreed. I'll update above if we decide to carry on with this
>> approach. More below...
>> 
>>>>
>>>> Mark, can you shine some light here?
>>>
>>> AFAICT, this is not necessary.
>>>
>>> The perf_event_open() syscall takes a PID separately from the
>>> perf_event_attr. i.e. we should be able to do:
>>>
>>> // monitor a particular vCPU
>>> perf_event_open(attr, vcpupid, -1, -1, 0)
>>>
>>> ... or ...
>>>
>>> // monitor a particular vCPU on a pCPU
>>> perf_event_open(attr, vcpupid, cpu, -1, 0)
>>>
>>> ... or ...
>>>
>>> // monitor all vCPUs on a pCPU
>>> perf_event_open(attr, -1, cpu, -1, 0)
>>>
>>> ... so this shouldn't be necessary. AFAICT, this is a SW PMU, so there
>>> should be no issue with using the perf_sw_context.
>> 
>> I might have missed it but none of the modes of invoking perf_event_open
>> allow monitoring a set of processes, i.e., all vCPUs belonging to a
>> particular VM, which was one of the aims and a feature I was carrying
>> over from the previous version. If we do not care about this...
>> 
>>>
>>> If this is a bodge to avoid opening a perf_event per vCPU thread, then I
>>> completely disagree with the approach. This would be better handled in
>>> userspace by discovering the set of threads and opening events for
>>> each.
>> 
>> ... then requiring userspace to invoke perf_event_open per vCPU will
>> simplify this patch.
>> 
>> Marc, any objections?
>
> Not so far, but I'm curious to find out how you determine which thread
> is a vcpu, let alone a given vcpu.

I should've clarified in my reply that I wasn't looking to support the
third instance from Mark's examples above - "monitor all vCPUs on a
pCPU". I think it'll be quite expensive to figure out which threads from
a given pool are vCPUs.

For the other instances, we only need to find the vCPU for a given
pid. Userspace will hand us a pid that needs to be checked against vCPUs
to establish that it is a valid vCPU pid (here I was looking to use
kvm_vcpu->pid and kvm->pid introduced in Patch 2).

This will happen when setting up the event and the vCPU can be cached
for later use.
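
Roughly along these lines, as a sketch (assuming the kvm_vcpu->pid
introduced in Patch 2 and the existing kvm_for_each_vcpu() iterator):

/* Resolve a task pid to the vCPU it backs, or NULL if there is none. */
static struct kvm_vcpu *find_vcpu_by_pid(struct kvm *kvm, struct pid *pid)
{
	struct kvm_vcpu *vcpu;
	int i;

	kvm_for_each_vcpu(i, vcpu, kvm)
		if (vcpu->pid == pid)
			return vcpu;

	return NULL;
}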

>
> Thanks,
>
> 	M.
Punit Agrawal Jan. 18, 2017, 2:58 p.m. UTC | #8
Will Deacon <will.deacon@arm.com> writes:

> On Wed, Jan 18, 2017 at 01:01:40PM +0000, Punit Agrawal wrote:
>> Mark Rutland <mark.rutland@arm.com> writes:
>> 
>> > On Wed, Jan 18, 2017 at 11:21:21AM +0000, Marc Zyngier wrote:
>> >> On 10/01/17 11:38, Punit Agrawal wrote:
>> >> > +#define VM_MASK	GENMASK_ULL(31, 0)
>> >> > +#define EVENT_MASK	GENMASK_ULL(39, 32)
>> >> > +#define EVENT_SHIFT	(32)
>> >> > +
>> >> > +#define to_pid(cfg)	((cfg) & VM_MASK)
>> >> > +#define to_event(cfg)	(((cfg) & EVENT_MASK) >> EVENT_SHIFT)
>> >> > +
>> >> > +PMU_FORMAT_ATTR(vm, "config:0-31");
>> >> > +PMU_FORMAT_ATTR(event, "config:32-39");
>> >> 
>> >> I'm a bit confused by these. Can't you get the PID of the VM you're
>> >> tracing directly from perf, without having to encode things?
>> 
>> With perf attached to a PID, the event gets scheduled out when the task
>> is context switched. As the PID of the controlling process was used,
>> none of the vCPU events were counted.
>
> So it sounds like userspace needs to deal with this by attaching to the PIDs
> of the vCPUs. Given that perf kvm seems to have knowledge of vCPUs, it would
> be nice to know why that logic isn't reusable here. Take a look in
> tools/perf/builtin-kvm.c and if it's not up to the job, then perhaps it can
> be improved.

Thanks for the pointer. I'll have a play with perf kvm to see if it can
be leveraged here.

>
> Will
Mark Rutland Jan. 18, 2017, 3:17 p.m. UTC | #9
On Wed, Jan 18, 2017 at 02:51:31PM +0000, Punit Agrawal wrote:
> I should've clarified in my reply that I wasn't looking to support the
> third instance from Mark's examples above - "monitor all vCPUs on a
> pCPU". I think it'll be quite expensive to figure out which threads from
> a given pool are vCPUs.

I'm not sure I follow why you would need to do that?

In that case, we'd open a CPU-bound perf event for the pCPU, which would
get installed in the CPU context immediately. It would be present for
all tasks.

Given it's present for all tasks, we don't need to figure out which
happen to have vCPUs. The !vCPU tasks simply shouldn't trigger events.

Am I missing something?

> For the other instances, we only need to find the vCPU for a given
> pid. Userspace will hand us a pid that needs to be checked against vCPUs
> to establish that it is a valid vCPU pid (here I was looking to use
> kvm_vcpu->pid and kvm->pid introduced in Patch 2).

Thinking about this further, a pid is not a unique identifier for either
a vCPU or a VM.

A single task (which has a single pid), could own multiple VMs, each
with multiple vCPUs. A thread pool (with several pids) could share those
arbitrarily. So we need VM and vCPU IDs which are distinct from pids or
tids.

I see that struct kvm_vcpu has a vcpu_id (which from a glance appears to
be local to the kvm instance). It's not clear to me if a kvm instance
could be shared by multiple processes, or if we can get away with a
process-local ID.

Thanks,
Mark.
Punit Agrawal Jan. 18, 2017, 4:17 p.m. UTC | #10
Mark Rutland <mark.rutland@arm.com> writes:

> On Wed, Jan 18, 2017 at 02:51:31PM +0000, Punit Agrawal wrote:
>> I should've clarified in my reply that I wasn't looking to support the
>> third instance from Mark's examples above - "monitor all vCPUs on a
>> pCPU". I think it'll be quite expensive to figure out which threads from
>> a given pool are vCPUs.
>
> I'm not sure I follow why you would need to do that?
>
> In that case, we'd open a CPU-bound perf event for the pCPU, which would
> get installed in the CPU context immediately. It would be present for
> all tasks.
>
> Given it's present for all tasks, we don't need to figure out which
> happen to have vCPUs. The !vCPU tasks simply shouldn't trigger events.
>
> Am I missing something?

When enabling a CPU-bound event for a pCPU, we'd have to enable trapping
of TLB operations for the vCPUs running on that pCPU. Have a look at Patch
7/9.

Also, we'd have to enable/disable trapping when tasks are migrated
between pCPUs.

>
>> For the other instances, we only need to find the vCPU for a given
>> pid. Userspace will hand us a pid that needs to be checked against vCPUs
>> to establish that it is a valid vCPU pid (here I was looking to use
>> kvm_vcpu->pid and kvm->pid introduced in Patch 2).
>
> Thinking about this further, a pid is not a unique identifier for either
> a vCPU or a VM.
>
> A single task (which has a single pid), could own multiple VMs, each
> with multiple vCPUs. A thread pool (with several pids) could share those
> arbitrarily. So we need VM and vCPU IDs which are distinct from pids or
> tids.
>
> I see that struct kvm_vcpu has a vcpu_id (which from a glance appears to
> be local to the kvm instance). It's not clear to me if a kvm instance
> could be shared by multiple processes, or if we can get away with a
> process-local ID.

So far I've assumed that a VM pid is immutable. If that doesn't hold
then we need to think of another mechanism to refer to a VM from
userspace.

>
> Thanks,
> Mark.
Mark Rutland Jan. 18, 2017, 6:05 p.m. UTC | #11
On Wed, Jan 18, 2017 at 04:17:18PM +0000, Punit Agrawal wrote:
> Mark Rutland <mark.rutland@arm.com> writes:
> 
> > On Wed, Jan 18, 2017 at 02:51:31PM +0000, Punit Agrawal wrote:
> >> I should've clarified in my reply that I wasn't looking to support the
> >> third instance from Mark's examples above - "monitor all vCPUs on a
> >> pCPU". I think it'll be quite expensive to figure out which threads from
> >> a given pool are vCPUs.
> >
> > I'm not sure I follow why you would need to do that?
> >
> > In that case, we'd open a CPU-bound perf event for the pCPU, which would
> > get installed in the CPU context immediately. It would be present for
> > all tasks.
> >
> > Given it's present for all tasks, we don't need to figure out which
> > happen to have vCPUs. The !vCPU tasks simply shouldn't trigger events.
> >
> > Am I missing something?
> 
> When enabling a CPU-bound event for a pCPU, we'd have to enable trapping
> of TLB operations for the vCPUs running on that pCPU. Have a look at Patch
> 7/9.
> 
> Also, we'd have to enable/disable trapping when tasks are migrated
> between pCPUs.

Ah, so we can't configure the trap and leave it active, since it'll
affect the host.

We could have a per-cpu flag, and a hook into vcpu_run, but that's also
gnarly.
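
Something like this, as a very rough sketch (the per-cpu flag and the hook
are made-up names; HCR_TTLB is the existing arm64 trap bit for TLB
maintenance):

/* Toggled by the PMU start/stop callbacks for the target pCPU. */
static DEFINE_PER_CPU(bool, trap_tlb_ops);

/* Hypothetical hook on the guest entry path in kvm_arch_vcpu_ioctl_run(). */
static void sync_tlb_trap(struct kvm_vcpu *vcpu)
{
	if (__this_cpu_read(trap_tlb_ops))
		vcpu->arch.hcr_el2 |= HCR_TTLB;
	else
		vcpu->arch.hcr_el2 &= ~HCR_TTLB;
}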

I'll have a think.

> So far I've assumed that a VM pid is immutable. If that doesn't hold
> then we need to think of another mechanism to refer to a VM from
> userspace.

Even if we can't migrate the VM between processes (i.e. it's immutable),
it's still not unique within a process, so I'm fairly sure we need
another mechanism (even if we get away with the common case today).

Thanks,
Mark.
Christoffer Dall Jan. 19, 2017, 4:42 p.m. UTC | #12
On Wed, Jan 18, 2017 at 06:05:46PM +0000, Mark Rutland wrote:
> On Wed, Jan 18, 2017 at 04:17:18PM +0000, Punit Agrawal wrote:
> > Mark Rutland <mark.rutland@arm.com> writes:
> > 
> > > On Wed, Jan 18, 2017 at 02:51:31PM +0000, Punit Agrawal wrote:
> > >> I should've clarified in my reply that I wasn't looking to support the
> > >> third instance from Mark's examples above - "monitor all vCPUs on a
> > >> pCPU". I think it'll be quite expensive to figure out which threads from
> > >> a given pool are vCPUs.
> > >
> > > I'm not sure I follow why you would need to do that?
> > >
> > > In that case, we'd open a CPU-bound perf event for the pCPU, which would
> > > get installed in the CPU context immediately. It would be present for
> > > all tasks.
> > >
> > > Given it's present for all tasks, we don't need to figure out which
> > > happen to have vCPUs. The !vCPU tasks simply shouldn't trigger events.
> > >
> > > Am I missing something?
> > 
> > When enabling a CPU-bound event for a pCPU, we'd have to enable trapping
> > of TLB operations for the vCPUs running on that pCPU. Have a look at Patch
> > 7/9.
> > 
> > Also, we'd have to enable/disable trapping when tasks are migrated
> > between pCPUs.
> 
> Ah, so we can't configure the trap and leave it active, since it'll
> affect the host.
> 
> We could have a per-cpu flag, and a hook into vcpu_run, but that's also
> gnarly.
> 
> I'll have a think.
> 
> > So far I've assumed that a VM pid is immutable. If that doesn't hold
> > then we need to think of another mechanism to refer to a VM from
> > userspace.
> 
> Even if we can't migrate the VM between processes (i.e. it's immutable),
> it's still not unique within a process, so I'm fairly sure we need
> another mechanism (even if we get away with the common case today).
> 
I don't understand what the requirements here are exactly but the KVM
API documentation says:

  In general file descriptors can be migrated among processes by means
  of fork() and the SCM_RIGHTS facility of unix domain socket.  These
  kinds of tricks are explicitly not supported by kvm.  While they will
  not cause harm to the host, their actual behavior is not guaranteed by
  the API.  The only supported use is one virtual machine per process,
  and one vcpu per thread.

So this code should maintain those semantics and it's fair to assume the
thread group leader of a given VM stays the same, but the code must not
rely on this fact for safe operations.

I also don't see why a process couldn't open multiple VMs; however
messy that may be, it appears possible to me.

-Christoffer
Punit Agrawal Jan. 23, 2017, 11:21 a.m. UTC | #13
Hi Christoffer,

Christoffer Dall <christoffer.dall@linaro.org> writes:

> On Wed, Jan 18, 2017 at 06:05:46PM +0000, Mark Rutland wrote:
>> On Wed, Jan 18, 2017 at 04:17:18PM +0000, Punit Agrawal wrote:
>> > Mark Rutland <mark.rutland@arm.com> writes:
>> > 
>> > > On Wed, Jan 18, 2017 at 02:51:31PM +0000, Punit Agrawal wrote:
>> > >> I should've clarified in my reply that I wasn't looking to support the
>> > >> third instance from Mark's examples above - "monitor all vCPUs on a
>> > >> pCPU". I think it'll be quite expensive to figure out which threads from
>> > >> a given pool are vCPUs.
>> > >
>> > > I'm not sure I follow why you would need to do that?
>> > >
>> > > In that case, we'd open a CPU-bound perf event for the pCPU, which would
>> > > get installed in the CPU context immediately. It would be present for
>> > > all tasks.
>> > >
>> > > Given it's present for all tasks, we don't need to figure out which
>> > > happen to have vCPUs. The !vCPU tasks simply shouldn't trigger events.
>> > >
>> > > Am I missing something?
>> > 
>> > When enabling a CPU-bound event for a pCPU, we'd have to enable trapping
>> > of TLB operations for the vCPUs running on that pCPU. Have a look at Patch
>> > 7/9.
>> > 
>> > Also, we'd have to enable/disable trapping when tasks are migrated
>> > between pCPUs.
>> 
>> Ah, so we can't configure the trap and leave it active, since it'll
>> affect the host.
>> 
>> We could have a per-cpu flag, and a hook into vcpu_run, but that's also
>> gnarly.
>> 
>> I'll have a think.
>> 
>> > So far I've assumed that a VM pid is immutable. If that doesn't hold
>> > then we need to think of another mechanism to refer to a VM from
>> > userspace.
>> 
>> Even if we can't migrate the VM between processes (i.e. it's immutable),
>> it's still not unique within a process, so I'm fairly sure we need
>> another mechanism (even if we get away with the common case today).
>> 
> I don't understand what the requirements here are exactly but the KVM
> API documentation says:
>
>   In general file descriptors can be migrated among processes by means
>   of fork() and the SCM_RIGHTS facility of unix domain socket.  These
>   kinds of tricks are explicitly not supported by kvm.  While they will
>   not cause harm to the host, their actual behavior is not guaranteed by
>   the API.  The only supported use is one virtual machine per process,
>   and one vcpu per thread.
>
> So this code should maintain those semantics and it's fair to assume the
> thread group leader of a given VM stays the same, but the code must not
> rely on this fact for safe operations.

Thanks for clarifying. The current version passes muster on these
assumptions, but I'll have to take a closer look to convince myself of
the safety.

By moving to vCPU pids in the next version, things should further
improve in this regard.

>
> I also don't see why a process couldn't open multiple VMs; however
> messy that may be, it appears possible to me.

I imagine there is an implicit reliance on the VMM to handle any
resulting fallout if it chooses to do this.

>
> -Christoffer