diff mbox

[07/18] KVM: ARM64: PMU: Add perf event map and introduce perf event creating function

Message ID 1436149068-3784-8-git-send-email-shannon.zhao@linaro.org (mailing list archive)
State New, archived
Headers show

Commit Message

Shannon Zhao July 6, 2015, 2:17 a.m. UTC
From: Shannon Zhao <shannon.zhao@linaro.org>

When we use tools like perf on host, perf passes the event type and the
id in this type category to kernel, then kernel will map them to event
number and write this number to PMU PMEVTYPER<n>_EL0 register. While
we're trapping and emulating guest accesses to PMU registers, we get the
event number and map it to the event type and the id reversely.

Check whether the event type is same with the one to be set. If not,
stop counter to monitor current event and find the event type map id.
According to the bits of data to configure this perf_event attr and
set exclude_host to 1 for guest. Then call perf_event API to create the
corresponding event and save the event pointer.

Signed-off-by: Shannon Zhao <shannon.zhao@linaro.org>
---
 include/kvm/arm_pmu.h |   4 ++
 virt/kvm/arm/pmu.c    | 173 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 177 insertions(+)

Comments

Christoffer Dall July 17, 2015, 2:30 p.m. UTC | #1
On Mon, Jul 06, 2015 at 10:17:37AM +0800, shannon.zhao@linaro.org wrote:
> From: Shannon Zhao <shannon.zhao@linaro.org>
> 
> When we use tools like perf on host, perf passes the event type and the
> id in this type category to kernel, then kernel will map them to event
> number and write this number to PMU PMEVTYPER<n>_EL0 register. While
> we're trapping and emulating guest accesses to PMU registers, we get the
> event number and map it to the event type and the id reversely.

There's something with the nomenclature that makes this really hard to
read.

you mention here: "event type", "the id", and "event number".  The
former two I think are perf identifiers, and the latter is the hardware
event number, is this right?

> 
> Check whether the event type is same with the one to be set.

when?

> If not,
> stop counter to monitor current event and find the event type map id.
> According to the bits of data to configure this perf_event attr and
> set exclude_host to 1 for guest. Then call perf_event API to create the
> corresponding event and save the event pointer.
> 
> Signed-off-by: Shannon Zhao <shannon.zhao@linaro.org>
> ---
>  include/kvm/arm_pmu.h |   4 ++
>  virt/kvm/arm/pmu.c    | 173 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 177 insertions(+)
> 
> diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
> index 27d14ca..1050b24 100644
> --- a/include/kvm/arm_pmu.h
> +++ b/include/kvm/arm_pmu.h
> @@ -45,9 +45,13 @@ struct kvm_pmu {
>  
>  #ifdef CONFIG_KVM_ARM_PMU
>  void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu);
> +void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, unsigned long data,
> +				    unsigned long select_idx);
>  void kvm_pmu_init(struct kvm_vcpu *vcpu);
>  #else
>  static inline void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) {}
> +void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, unsigned long data,
> +				    unsigned long select_idx) {}
>  static inline void kvm_pmu_init(struct kvm_vcpu *vcpu) {}
>  #endif
>  
> diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
> index dc252d0..50a3c82 100644
> --- a/virt/kvm/arm/pmu.c
> +++ b/virt/kvm/arm/pmu.c
> @@ -18,8 +18,68 @@
>  #include <linux/cpu.h>
>  #include <linux/kvm.h>
>  #include <linux/kvm_host.h>
> +#include <linux/perf_event.h>
>  #include <kvm/arm_pmu.h>
>  
> +/* PMU HW events mapping. */
> +static struct kvm_pmu_hw_event_map {
> +	unsigned eventsel;
> +	unsigned event_type;
> +} kvm_pmu_hw_events[] = {
> +	[0] = { 0x11, PERF_COUNT_HW_CPU_CYCLES },
> +	[1] = { 0x08, PERF_COUNT_HW_INSTRUCTIONS },
> +	[2] = { 0x04, PERF_COUNT_HW_CACHE_REFERENCES },
> +	[3] = { 0x03, PERF_COUNT_HW_CACHE_MISSES },
> +	[4] = { 0x10, PERF_COUNT_HW_BRANCH_MISSES },
> +};
> +
> +/* PMU HW cache events mapping. */
> +static struct kvm_pmu_hw_cache_event_map {
> +	unsigned eventsel;
> +	unsigned cache_type;
> +	unsigned cache_op;
> +	unsigned cache_result;
> +} kvm_pmu_hw_cache_events[] = {
> +	[0] = { 0x04, PERF_COUNT_HW_CACHE_L1D, PERF_COUNT_HW_CACHE_OP_READ,
> +		      PERF_COUNT_HW_CACHE_RESULT_ACCESS },
> +	[1] = { 0x03, PERF_COUNT_HW_CACHE_L1D, PERF_COUNT_HW_CACHE_OP_READ,
> +		      PERF_COUNT_HW_CACHE_RESULT_MISS },
> +	[2] = { 0x04, PERF_COUNT_HW_CACHE_L1D, PERF_COUNT_HW_CACHE_OP_WRITE,
> +		      PERF_COUNT_HW_CACHE_RESULT_ACCESS },
> +	[3] = { 0x03, PERF_COUNT_HW_CACHE_L1D, PERF_COUNT_HW_CACHE_OP_WRITE,
> +		      PERF_COUNT_HW_CACHE_RESULT_MISS },

seems to me that the four entries above will never be used, because
you'll always match them in kvm_pmu_hw_events above?

Is this because perf map multiple generic perf events to the same
hardware event?  Does it matter if we register this with perf as one or
the other then?

> +	[4] = { 0x12, PERF_COUNT_HW_CACHE_BPU, PERF_COUNT_HW_CACHE_OP_READ,
> +		      PERF_COUNT_HW_CACHE_RESULT_ACCESS },
> +	[5] = { 0x10, PERF_COUNT_HW_CACHE_BPU, PERF_COUNT_HW_CACHE_OP_READ,
> +		      PERF_COUNT_HW_CACHE_RESULT_MISS },
> +	[6] = { 0x12, PERF_COUNT_HW_CACHE_BPU, PERF_COUNT_HW_CACHE_OP_WRITE,
> +		      PERF_COUNT_HW_CACHE_RESULT_ACCESS },
> +	[7] = { 0x10, PERF_COUNT_HW_CACHE_BPU, PERF_COUNT_HW_CACHE_OP_WRITE,
> +		      PERF_COUNT_HW_CACHE_RESULT_MISS },
> +};
> +
> +/**
> + * kvm_pmu_stop_counter - stop PMU counter for the selected counter
> + * @vcpu: The vcpu pointer
> + * @select_idx: The counter index
> + *
> + * If this counter has been configured to monitor some event, disable and
> + * release it.
> + */
> +static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu,
> +				 unsigned long select_idx)
> +{
> +	struct kvm_pmu *pmu = &vcpu->arch.pmu;
> +	struct kvm_pmc *pmc = &pmu->pmc[select_idx];
> +
> +	if (pmc->perf_event) {
> +		perf_event_disable(pmc->perf_event);
> +		perf_event_release_kernel(pmc->perf_event);
> +	}
> +	pmc->perf_event = NULL;
> +	pmc->eventsel = 0xff;

why is 0xff 'unused' or reserved?  If we're choosing this arbitrarily,
why is it not 0x3ff?  Should we create a define for this?

> +}
> +
>  /**
>   * kvm_pmu_vcpu_reset - reset pmu state for cpu
>   * @vcpu: The vcpu pointer
> @@ -27,12 +87,125 @@
>   */
>  void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu)
>  {
> +	int i;
>  	struct kvm_pmu *pmu = &vcpu->arch.pmu;
>  
> +	for (i = 0; i < ARMV8_MAX_COUNTERS; i++)
> +		kvm_pmu_stop_counter(vcpu, i);
> +	pmu->overflow_status = 0;
>  	pmu->irq_pending = false;
>  }
>  
>  /**
> + * kvm_pmu_find_hw_event - find hardware event
> + * @pmu: The pmu pointer
> + * @event_select: The number of selected event type
> + *
> + * Based on the number of selected event type, find out whether it belongs to
> + * PERF_TYPE_HARDWARE. If so, return the corresponding event id.
> + */
> +static unsigned kvm_pmu_find_hw_event(struct kvm_pmu *pmu,
> +				      unsigned long event_select)
> +{
> +	int i;
> +
> +	for (i = 0; i < ARRAY_SIZE(kvm_pmu_hw_events); i++)
> +		if (kvm_pmu_hw_events[i].eventsel == event_select)
> +			break;
> +
> +	if (i == ARRAY_SIZE(kvm_pmu_hw_events))
> +		return PERF_COUNT_HW_MAX;
> +
> +	return kvm_pmu_hw_events[i].event_type;

you can just return this directly in the loop if you have a match and
unconditionally return PERF_COUNT_HW_MAX outside the loop without having
to check the loop condition.

> +}
> +
> +/**
> + * kvm_pmu_find_hw_cache_event - find hardware cache event
> + * @pmu: The pmu pointer
> + * @event_select: The number of selected event type
> + *
> + * Based on the number of selected event type, find out whether it belongs to
> + * PERF_TYPE_HW_CACHE. If so, return the corresponding event id.
> + */
> +static unsigned kvm_pmu_find_hw_cache_event(struct kvm_pmu *pmu,
> +					    unsigned long event_select)
> +{
> +	int i;
> +	unsigned config;
> +
> +	for (i = 0; i < ARRAY_SIZE(kvm_pmu_hw_cache_events); i++)
> +		if (kvm_pmu_hw_cache_events[i].eventsel == event_select)
> +			break;
> +
> +	if (i == ARRAY_SIZE(kvm_pmu_hw_cache_events))
> +		return PERF_COUNT_HW_CACHE_MAX;

I feel like I just read this code, can we reuse it with a pointer to the
array?

> +
> +	config = (kvm_pmu_hw_cache_events[i].cache_type & 0xff)
> +		 | ((kvm_pmu_hw_cache_events[i].cache_op & 0xff) << 8)
> +		 | ((kvm_pmu_hw_cache_events[i].cache_result & 0xff) << 16);
> +
> +	return config;
> +}
> +
> +/**
> + * kvm_pmu_set_counter_event_type - set selected counter to monitor some event
> + * @vcpu: The vcpu pointer
> + * @data: The data guest writes to PMXEVTYPER_EL0
> + * @select_idx: The number of selected counter
> + *
> + * Firstly check whether the event type is same with the one to be set.
> + * If not, stop counter to monitor current event and find the event type map id.
> + * According to the bits of data to configure this perf_event attr and set
> + * exclude_host to 1 for guest. Then call perf_event API to create the
> + * corresponding event and save the event pointer.

This text seems to be describing more how the function does something,
as opposed to what it does and why.  I found it a little hard to read.

> + */
> +void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, unsigned long data,
> +				    unsigned long select_idx)
> +{
> +	struct kvm_pmu *pmu = &vcpu->arch.pmu;
> +	struct kvm_pmc *pmc = &pmu->pmc[select_idx];
> +	struct perf_event *event;
> +	struct perf_event_attr attr;
> +	unsigned config, type = PERF_TYPE_RAW;
> +
> +	if ((data & ARMV8_EVTYPE_EVENT) == pmc->eventsel)
> +		return;
> +
> +	kvm_pmu_stop_counter(vcpu, select_idx);
> +	pmc->eventsel = data & ARMV8_EVTYPE_EVENT;
> +
> +	config = kvm_pmu_find_hw_event(pmu, pmc->eventsel);
> +	if (config != PERF_COUNT_HW_MAX) {
> +		type = PERF_TYPE_HARDWARE;
> +	} else {
> +		config = kvm_pmu_find_hw_cache_event(pmu, pmc->eventsel);
> +		if (config != PERF_COUNT_HW_CACHE_MAX)
> +			type = PERF_TYPE_HW_CACHE;
> +	}
> +
> +	if (type == PERF_TYPE_RAW)
> +		config = pmc->eventsel;

don't you need to memset attr to 0 first?

otherwise, how do you ensure that for example exclude_guest is always
clear?

> +
> +	attr.type = type;
> +	attr.size = sizeof(attr);
> +	attr.pinned = true;
> +	attr.exclude_user = data & ARMV8_EXCLUDE_EL0 ? 1 : 0;
> +	attr.exclude_kernel = data & ARMV8_EXCLUDE_EL1 ? 1 : 0;
> +	attr.exclude_hv = data & ARMV8_INCLUDE_EL2 ? 0 : 1;

should the guest be able to see something counted in the hypervisor ever
or should that only be the host being able to see that?

my gut feeling is that the hypervisor should be hidden from the guest
and that exclude_hv = 0, is the right choice.  But this is a question
about the semantics of perf, I suppose.

> +	attr.exclude_host = 1;
> +	attr.config = config;
> +	attr.sample_period = (-pmc->counter) & (((u64)1 << 32) - 1);

whoa, what is this scary calculation?

definitely needs an explanation?

> +
> +	event = perf_event_create_kernel_counter(&attr, -1, current, NULL, pmc);
> +	if (IS_ERR(event)) {
> +		kvm_err("kvm: pmu event creation failed %ld\n",
> +			  PTR_ERR(event));

doesn't this mean we'll spam the kernel log if the guest supplies
bogus/unsupported event numbers?

In that case it shoudl be kvm_debug and the guest should be able to see
this somehow (e.g. events don't count).

> +		return;
> +	}
> +	pmc->perf_event = event;
> +}
> +
> +/**
>   * kvm_pmu_init - Initialize global PMU state for per vcpu
>   * @vcpu: The vcpu pointer
>   *
> -- 
> 2.1.0
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Shannon Zhao July 21, 2015, 1:35 a.m. UTC | #2
On 2015/7/17 22:30, Christoffer Dall wrote:
> On Mon, Jul 06, 2015 at 10:17:37AM +0800, shannon.zhao@linaro.org wrote:
>> From: Shannon Zhao <shannon.zhao@linaro.org>
>>
>> When we use tools like perf on host, perf passes the event type and the
>> id in this type category to kernel, then kernel will map them to event
>> number and write this number to PMU PMEVTYPER<n>_EL0 register. While
>> we're trapping and emulating guest accesses to PMU registers, we get the
>> event number and map it to the event type and the id reversely.
> 
> There's something with the nomenclature that makes this really hard to
> read.
> 
> you mention here: "event type", "the id", and "event number".  The
> former two I think are perf identifiers, and the latter is the hardware
> event number, is this right?
> 

Yeah, right.

>>
>> Check whether the event type is same with the one to be set.
> 
> when?
> 
In function kvm_pmu_set_counter_event_type before create a new perf event.

>> +	if ((data & ARMV8_EVTYPE_EVENT) == pmc->eventsel)
>> +		return;


>> If not,
>> stop counter to monitor current event and find the event type map id.
>> According to the bits of data to configure this perf_event attr and
>> set exclude_host to 1 for guest. Then call perf_event API to create the
>> corresponding event and save the event pointer.
>>
>> Signed-off-by: Shannon Zhao <shannon.zhao@linaro.org>
>> ---
>>  include/kvm/arm_pmu.h |   4 ++
>>  virt/kvm/arm/pmu.c    | 173 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>  2 files changed, 177 insertions(+)
>>
>> diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
>> index 27d14ca..1050b24 100644
>> --- a/include/kvm/arm_pmu.h
>> +++ b/include/kvm/arm_pmu.h
>> @@ -45,9 +45,13 @@ struct kvm_pmu {
>>  
>>  #ifdef CONFIG_KVM_ARM_PMU
>>  void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu);
>> +void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, unsigned long data,
>> +				    unsigned long select_idx);
>>  void kvm_pmu_init(struct kvm_vcpu *vcpu);
>>  #else
>>  static inline void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) {}
>> +void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, unsigned long data,
>> +				    unsigned long select_idx) {}
>>  static inline void kvm_pmu_init(struct kvm_vcpu *vcpu) {}
>>  #endif
>>  
>> diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
>> index dc252d0..50a3c82 100644
>> --- a/virt/kvm/arm/pmu.c
>> +++ b/virt/kvm/arm/pmu.c
>> @@ -18,8 +18,68 @@
>>  #include <linux/cpu.h>
>>  #include <linux/kvm.h>
>>  #include <linux/kvm_host.h>
>> +#include <linux/perf_event.h>
>>  #include <kvm/arm_pmu.h>
>>  
>> +/* PMU HW events mapping. */
>> +static struct kvm_pmu_hw_event_map {
>> +	unsigned eventsel;
>> +	unsigned event_type;
>> +} kvm_pmu_hw_events[] = {
>> +	[0] = { 0x11, PERF_COUNT_HW_CPU_CYCLES },
>> +	[1] = { 0x08, PERF_COUNT_HW_INSTRUCTIONS },
>> +	[2] = { 0x04, PERF_COUNT_HW_CACHE_REFERENCES },
>> +	[3] = { 0x03, PERF_COUNT_HW_CACHE_MISSES },
>> +	[4] = { 0x10, PERF_COUNT_HW_BRANCH_MISSES },
>> +};
>> +
>> +/* PMU HW cache events mapping. */
>> +static struct kvm_pmu_hw_cache_event_map {
>> +	unsigned eventsel;
>> +	unsigned cache_type;
>> +	unsigned cache_op;
>> +	unsigned cache_result;
>> +} kvm_pmu_hw_cache_events[] = {
>> +	[0] = { 0x04, PERF_COUNT_HW_CACHE_L1D, PERF_COUNT_HW_CACHE_OP_READ,
>> +		      PERF_COUNT_HW_CACHE_RESULT_ACCESS },
>> +	[1] = { 0x03, PERF_COUNT_HW_CACHE_L1D, PERF_COUNT_HW_CACHE_OP_READ,
>> +		      PERF_COUNT_HW_CACHE_RESULT_MISS },
>> +	[2] = { 0x04, PERF_COUNT_HW_CACHE_L1D, PERF_COUNT_HW_CACHE_OP_WRITE,
>> +		      PERF_COUNT_HW_CACHE_RESULT_ACCESS },
>> +	[3] = { 0x03, PERF_COUNT_HW_CACHE_L1D, PERF_COUNT_HW_CACHE_OP_WRITE,
>> +		      PERF_COUNT_HW_CACHE_RESULT_MISS },
> 
> seems to me that the four entries above will never be used, because
> you'll always match them in kvm_pmu_hw_events above?
> 

Yes, I found this before, but for the completeness I list them.

> Is this because perf map multiple generic perf events to the same
> hardware event? 
I think so.

> Does it matter if we register this with perf as one or
> the other then?
> 
I think it's ok because the hardware event numbers are same.


>> +	[4] = { 0x12, PERF_COUNT_HW_CACHE_BPU, PERF_COUNT_HW_CACHE_OP_READ,
>> +		      PERF_COUNT_HW_CACHE_RESULT_ACCESS },
>> +	[5] = { 0x10, PERF_COUNT_HW_CACHE_BPU, PERF_COUNT_HW_CACHE_OP_READ,
>> +		      PERF_COUNT_HW_CACHE_RESULT_MISS },
>> +	[6] = { 0x12, PERF_COUNT_HW_CACHE_BPU, PERF_COUNT_HW_CACHE_OP_WRITE,
>> +		      PERF_COUNT_HW_CACHE_RESULT_ACCESS },
>> +	[7] = { 0x10, PERF_COUNT_HW_CACHE_BPU, PERF_COUNT_HW_CACHE_OP_WRITE,
>> +		      PERF_COUNT_HW_CACHE_RESULT_MISS },
>> +};
>> +
>> +/**
>> + * kvm_pmu_stop_counter - stop PMU counter for the selected counter
>> + * @vcpu: The vcpu pointer
>> + * @select_idx: The counter index
>> + *
>> + * If this counter has been configured to monitor some event, disable and
>> + * release it.
>> + */
>> +static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu,
>> +				 unsigned long select_idx)
>> +{
>> +	struct kvm_pmu *pmu = &vcpu->arch.pmu;
>> +	struct kvm_pmc *pmc = &pmu->pmc[select_idx];
>> +
>> +	if (pmc->perf_event) {
>> +		perf_event_disable(pmc->perf_event);
>> +		perf_event_release_kernel(pmc->perf_event);
>> +	}
>> +	pmc->perf_event = NULL;
>> +	pmc->eventsel = 0xff;
> 
> why is 0xff 'unused' or reserved?  If we're choosing this arbitrarily,
> why is it not 0x3ff?  Should we create a define for this?
> 

Yeah, 0x3ff may be better.

>> +}
>> +
>>  /**
>>   * kvm_pmu_vcpu_reset - reset pmu state for cpu
>>   * @vcpu: The vcpu pointer
>> @@ -27,12 +87,125 @@
>>   */
>>  void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu)
>>  {
>> +	int i;
>>  	struct kvm_pmu *pmu = &vcpu->arch.pmu;
>>  
>> +	for (i = 0; i < ARMV8_MAX_COUNTERS; i++)
>> +		kvm_pmu_stop_counter(vcpu, i);
>> +	pmu->overflow_status = 0;
>>  	pmu->irq_pending = false;
>>  }
>>  
>>  /**
>> + * kvm_pmu_find_hw_event - find hardware event
>> + * @pmu: The pmu pointer
>> + * @event_select: The number of selected event type
>> + *
>> + * Based on the number of selected event type, find out whether it belongs to
>> + * PERF_TYPE_HARDWARE. If so, return the corresponding event id.
>> + */
>> +static unsigned kvm_pmu_find_hw_event(struct kvm_pmu *pmu,
>> +				      unsigned long event_select)
>> +{
>> +	int i;
>> +
>> +	for (i = 0; i < ARRAY_SIZE(kvm_pmu_hw_events); i++)
>> +		if (kvm_pmu_hw_events[i].eventsel == event_select)
>> +			break;
>> +
>> +	if (i == ARRAY_SIZE(kvm_pmu_hw_events))
>> +		return PERF_COUNT_HW_MAX;
>> +
>> +	return kvm_pmu_hw_events[i].event_type;
> 
> you can just return this directly in the loop if you have a match and
> unconditionally return PERF_COUNT_HW_MAX outside the loop without having
> to check the loop condition.
> 
ok
>> +}
>> +
>> +/**
>> + * kvm_pmu_find_hw_cache_event - find hardware cache event
>> + * @pmu: The pmu pointer
>> + * @event_select: The number of selected event type
>> + *
>> + * Based on the number of selected event type, find out whether it belongs to
>> + * PERF_TYPE_HW_CACHE. If so, return the corresponding event id.
>> + */
>> +static unsigned kvm_pmu_find_hw_cache_event(struct kvm_pmu *pmu,
>> +					    unsigned long event_select)
>> +{
>> +	int i;
>> +	unsigned config;
>> +
>> +	for (i = 0; i < ARRAY_SIZE(kvm_pmu_hw_cache_events); i++)
>> +		if (kvm_pmu_hw_cache_events[i].eventsel == event_select)
>> +			break;
>> +
>> +	if (i == ARRAY_SIZE(kvm_pmu_hw_cache_events))
>> +		return PERF_COUNT_HW_CACHE_MAX;
> 
> I feel like I just read this code, can we reuse it with a pointer to the
> array?
> 
>> +
>> +	config = (kvm_pmu_hw_cache_events[i].cache_type & 0xff)
>> +		 | ((kvm_pmu_hw_cache_events[i].cache_op & 0xff) << 8)
>> +		 | ((kvm_pmu_hw_cache_events[i].cache_result & 0xff) << 16);
>> +
>> +	return config;
>> +}
>> +
>> +/**
>> + * kvm_pmu_set_counter_event_type - set selected counter to monitor some event
>> + * @vcpu: The vcpu pointer
>> + * @data: The data guest writes to PMXEVTYPER_EL0
>> + * @select_idx: The number of selected counter
>> + *
>> + * Firstly check whether the event type is same with the one to be set.
>> + * If not, stop counter to monitor current event and find the event type map id.
>> + * According to the bits of data to configure this perf_event attr and set
>> + * exclude_host to 1 for guest. Then call perf_event API to create the
>> + * corresponding event and save the event pointer.
> 
> This text seems to be describing more how the function does something,
> as opposed to what it does and why.  I found it a little hard to read.
> 
>> + */
>> +void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, unsigned long data,
>> +				    unsigned long select_idx)
>> +{
>> +	struct kvm_pmu *pmu = &vcpu->arch.pmu;
>> +	struct kvm_pmc *pmc = &pmu->pmc[select_idx];
>> +	struct perf_event *event;
>> +	struct perf_event_attr attr;
>> +	unsigned config, type = PERF_TYPE_RAW;
>> +
>> +	if ((data & ARMV8_EVTYPE_EVENT) == pmc->eventsel)
>> +		return;
>> +
>> +	kvm_pmu_stop_counter(vcpu, select_idx);
>> +	pmc->eventsel = data & ARMV8_EVTYPE_EVENT;
>> +
>> +	config = kvm_pmu_find_hw_event(pmu, pmc->eventsel);
>> +	if (config != PERF_COUNT_HW_MAX) {
>> +		type = PERF_TYPE_HARDWARE;
>> +	} else {
>> +		config = kvm_pmu_find_hw_cache_event(pmu, pmc->eventsel);
>> +		if (config != PERF_COUNT_HW_CACHE_MAX)
>> +			type = PERF_TYPE_HW_CACHE;
>> +	}
>> +
>> +	if (type == PERF_TYPE_RAW)
>> +		config = pmc->eventsel;
> 
> don't you need to memset attr to 0 first?
> 
ok
> otherwise, how do you ensure that for example exclude_guest is always
> clear?
> 
>> +
>> +	attr.type = type;
>> +	attr.size = sizeof(attr);
>> +	attr.pinned = true;
>> +	attr.exclude_user = data & ARMV8_EXCLUDE_EL0 ? 1 : 0;
>> +	attr.exclude_kernel = data & ARMV8_EXCLUDE_EL1 ? 1 : 0;
>> +	attr.exclude_hv = data & ARMV8_INCLUDE_EL2 ? 0 : 1;
> 
> should the guest be able to see something counted in the hypervisor ever
> or should that only be the host being able to see that?
> 
> my gut feeling is that the hypervisor should be hidden from the guest
> and that exclude_hv = 0, is the right choice.  But this is a question
> about the semantics of perf, I suppose.
> 

This is what I'm not sure. I just thought about if guest has complete
ELs(EL3-EL0) and how to deal with nested virtualization.

>> +	attr.exclude_host = 1;
>> +	attr.config = config;
>> +	attr.sample_period = (-pmc->counter) & (((u64)1 << 32) - 1);
> 
> whoa, what is this scary calculation?
> 
> definitely needs an explanation?
> 
>> +
>> +	event = perf_event_create_kernel_counter(&attr, -1, current, NULL, pmc);
>> +	if (IS_ERR(event)) {
>> +		kvm_err("kvm: pmu event creation failed %ld\n",
>> +			  PTR_ERR(event));
> 
> doesn't this mean we'll spam the kernel log if the guest supplies
> bogus/unsupported event numbers?
> 

X86 uses printk_once, is that ok to us here?

> In that case it shoudl be kvm_debug and the guest should be able to see
> this somehow (e.g. events don't count).
> 

Yes, will think about this.

>> +		return;
>> +	}
>> +	pmc->perf_event = event;
>> +}
>> +
>> +/**
>>   * kvm_pmu_init - Initialize global PMU state for per vcpu
>>   * @vcpu: The vcpu pointer
>>   *
>> -- 
>> 2.1.0
>>
Christoffer Dall Aug. 3, 2015, 7:55 p.m. UTC | #3
On Tue, Jul 21, 2015 at 09:35:03AM +0800, Shannon Zhao wrote:
> 
> 
> On 2015/7/17 22:30, Christoffer Dall wrote:
> > On Mon, Jul 06, 2015 at 10:17:37AM +0800, shannon.zhao@linaro.org wrote:
> >> From: Shannon Zhao <shannon.zhao@linaro.org>
> >>
> >> When we use tools like perf on host, perf passes the event type and the
> >> id in this type category to kernel, then kernel will map them to event
> >> number and write this number to PMU PMEVTYPER<n>_EL0 register. While
> >> we're trapping and emulating guest accesses to PMU registers, we get the
> >> event number and map it to the event type and the id reversely.
> > 
> > There's something with the nomenclature that makes this really hard to
> > read.
> > 
> > you mention here: "event type", "the id", and "event number".  The
> > former two I think are perf identifiers, and the latter is the hardware
> > event number, is this right?
> > 
> 
> Yeah, right.
> 

ok, if we can clarify our nomenclature throughout here I think that
would make the patches easier to understand/read.

> >>
> >> Check whether the event type is same with the one to be set.
> > 
> > when?
> > 
> In function kvm_pmu_set_counter_event_type before create a new perf event.

do you mean when the guest configures some counter and we we create a
perf event accordingly?

If so, I think this could be clarified in the commit text.

> 
> >> +	if ((data & ARMV8_EVTYPE_EVENT) == pmc->eventsel)
> >> +		return;
> 
> 
> >> If not,
> >> stop counter to monitor current event and find the event type map id.
> >> According to the bits of data to configure this perf_event attr and
> >> set exclude_host to 1 for guest. Then call perf_event API to create the
> >> corresponding event and save the event pointer.
> >>
> >> Signed-off-by: Shannon Zhao <shannon.zhao@linaro.org>
> >> ---
> >>  include/kvm/arm_pmu.h |   4 ++
> >>  virt/kvm/arm/pmu.c    | 173 ++++++++++++++++++++++++++++++++++++++++++++++++++
> >>  2 files changed, 177 insertions(+)
> >>
> >> diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
> >> index 27d14ca..1050b24 100644
> >> --- a/include/kvm/arm_pmu.h
> >> +++ b/include/kvm/arm_pmu.h
> >> @@ -45,9 +45,13 @@ struct kvm_pmu {
> >>  
> >>  #ifdef CONFIG_KVM_ARM_PMU
> >>  void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu);
> >> +void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, unsigned long data,
> >> +				    unsigned long select_idx);
> >>  void kvm_pmu_init(struct kvm_vcpu *vcpu);
> >>  #else
> >>  static inline void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) {}
> >> +void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, unsigned long data,
> >> +				    unsigned long select_idx) {}
> >>  static inline void kvm_pmu_init(struct kvm_vcpu *vcpu) {}
> >>  #endif
> >>  
> >> diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
> >> index dc252d0..50a3c82 100644
> >> --- a/virt/kvm/arm/pmu.c
> >> +++ b/virt/kvm/arm/pmu.c
> >> @@ -18,8 +18,68 @@
> >>  #include <linux/cpu.h>
> >>  #include <linux/kvm.h>
> >>  #include <linux/kvm_host.h>
> >> +#include <linux/perf_event.h>
> >>  #include <kvm/arm_pmu.h>
> >>  
> >> +/* PMU HW events mapping. */
> >> +static struct kvm_pmu_hw_event_map {
> >> +	unsigned eventsel;
> >> +	unsigned event_type;
> >> +} kvm_pmu_hw_events[] = {
> >> +	[0] = { 0x11, PERF_COUNT_HW_CPU_CYCLES },
> >> +	[1] = { 0x08, PERF_COUNT_HW_INSTRUCTIONS },
> >> +	[2] = { 0x04, PERF_COUNT_HW_CACHE_REFERENCES },
> >> +	[3] = { 0x03, PERF_COUNT_HW_CACHE_MISSES },
> >> +	[4] = { 0x10, PERF_COUNT_HW_BRANCH_MISSES },
> >> +};
> >> +
> >> +/* PMU HW cache events mapping. */
> >> +static struct kvm_pmu_hw_cache_event_map {
> >> +	unsigned eventsel;
> >> +	unsigned cache_type;
> >> +	unsigned cache_op;
> >> +	unsigned cache_result;
> >> +} kvm_pmu_hw_cache_events[] = {
> >> +	[0] = { 0x04, PERF_COUNT_HW_CACHE_L1D, PERF_COUNT_HW_CACHE_OP_READ,
> >> +		      PERF_COUNT_HW_CACHE_RESULT_ACCESS },
> >> +	[1] = { 0x03, PERF_COUNT_HW_CACHE_L1D, PERF_COUNT_HW_CACHE_OP_READ,
> >> +		      PERF_COUNT_HW_CACHE_RESULT_MISS },
> >> +	[2] = { 0x04, PERF_COUNT_HW_CACHE_L1D, PERF_COUNT_HW_CACHE_OP_WRITE,
> >> +		      PERF_COUNT_HW_CACHE_RESULT_ACCESS },
> >> +	[3] = { 0x03, PERF_COUNT_HW_CACHE_L1D, PERF_COUNT_HW_CACHE_OP_WRITE,
> >> +		      PERF_COUNT_HW_CACHE_RESULT_MISS },
> > 
> > seems to me that the four entries above will never be used, because
> > you'll always match them in kvm_pmu_hw_events above?
> > 
> 
> Yes, I found this before, but for the completeness I list them.
> 

hmm, having unused entries for completeness is a bit weird, IMHO.
Either way, a comment explaining why they're missing or that some are
never used would be helpful.

> > Is this because perf map multiple generic perf events to the same
> > hardware event? 
> I think so.
> 
> > Does it matter if we register this with perf as one or
> > the other then?
> > 
> I think it's ok because the hardware event numbers are same.
> 

ok

> 
> >> +	[4] = { 0x12, PERF_COUNT_HW_CACHE_BPU, PERF_COUNT_HW_CACHE_OP_READ,
> >> +		      PERF_COUNT_HW_CACHE_RESULT_ACCESS },
> >> +	[5] = { 0x10, PERF_COUNT_HW_CACHE_BPU, PERF_COUNT_HW_CACHE_OP_READ,
> >> +		      PERF_COUNT_HW_CACHE_RESULT_MISS },
> >> +	[6] = { 0x12, PERF_COUNT_HW_CACHE_BPU, PERF_COUNT_HW_CACHE_OP_WRITE,
> >> +		      PERF_COUNT_HW_CACHE_RESULT_ACCESS },
> >> +	[7] = { 0x10, PERF_COUNT_HW_CACHE_BPU, PERF_COUNT_HW_CACHE_OP_WRITE,
> >> +		      PERF_COUNT_HW_CACHE_RESULT_MISS },
> >> +};
> >> +
> >> +/**
> >> + * kvm_pmu_stop_counter - stop PMU counter for the selected counter
> >> + * @vcpu: The vcpu pointer
> >> + * @select_idx: The counter index
> >> + *
> >> + * If this counter has been configured to monitor some event, disable and
> >> + * release it.
> >> + */
> >> +static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu,
> >> +				 unsigned long select_idx)
> >> +{
> >> +	struct kvm_pmu *pmu = &vcpu->arch.pmu;
> >> +	struct kvm_pmc *pmc = &pmu->pmc[select_idx];
> >> +
> >> +	if (pmc->perf_event) {
> >> +		perf_event_disable(pmc->perf_event);
> >> +		perf_event_release_kernel(pmc->perf_event);
> >> +	}
> >> +	pmc->perf_event = NULL;
> >> +	pmc->eventsel = 0xff;
> > 
> > why is 0xff 'unused' or reserved?  If we're choosing this arbitrarily,
> > why is it not 0x3ff?  Should we create a define for this?
> > 
> 
> Yeah, 0x3ff may be better.
> 
> >> +}
> >> +
> >>  /**
> >>   * kvm_pmu_vcpu_reset - reset pmu state for cpu
> >>   * @vcpu: The vcpu pointer
> >> @@ -27,12 +87,125 @@
> >>   */
> >>  void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu)
> >>  {
> >> +	int i;
> >>  	struct kvm_pmu *pmu = &vcpu->arch.pmu;
> >>  
> >> +	for (i = 0; i < ARMV8_MAX_COUNTERS; i++)
> >> +		kvm_pmu_stop_counter(vcpu, i);
> >> +	pmu->overflow_status = 0;
> >>  	pmu->irq_pending = false;
> >>  }
> >>  
> >>  /**
> >> + * kvm_pmu_find_hw_event - find hardware event
> >> + * @pmu: The pmu pointer
> >> + * @event_select: The number of selected event type
> >> + *
> >> + * Based on the number of selected event type, find out whether it belongs to
> >> + * PERF_TYPE_HARDWARE. If so, return the corresponding event id.
> >> + */
> >> +static unsigned kvm_pmu_find_hw_event(struct kvm_pmu *pmu,
> >> +				      unsigned long event_select)
> >> +{
> >> +	int i;
> >> +
> >> +	for (i = 0; i < ARRAY_SIZE(kvm_pmu_hw_events); i++)
> >> +		if (kvm_pmu_hw_events[i].eventsel == event_select)
> >> +			break;
> >> +
> >> +	if (i == ARRAY_SIZE(kvm_pmu_hw_events))
> >> +		return PERF_COUNT_HW_MAX;
> >> +
> >> +	return kvm_pmu_hw_events[i].event_type;
> > 
> > you can just return this directly in the loop if you have a match and
> > unconditionally return PERF_COUNT_HW_MAX outside the loop without having
> > to check the loop condition.
> > 
> ok
> >> +}
> >> +
> >> +/**
> >> + * kvm_pmu_find_hw_cache_event - find hardware cache event
> >> + * @pmu: The pmu pointer
> >> + * @event_select: The number of selected event type
> >> + *
> >> + * Based on the number of selected event type, find out whether it belongs to
> >> + * PERF_TYPE_HW_CACHE. If so, return the corresponding event id.
> >> + */
> >> +static unsigned kvm_pmu_find_hw_cache_event(struct kvm_pmu *pmu,
> >> +					    unsigned long event_select)
> >> +{
> >> +	int i;
> >> +	unsigned config;
> >> +
> >> +	for (i = 0; i < ARRAY_SIZE(kvm_pmu_hw_cache_events); i++)
> >> +		if (kvm_pmu_hw_cache_events[i].eventsel == event_select)
> >> +			break;
> >> +
> >> +	if (i == ARRAY_SIZE(kvm_pmu_hw_cache_events))
> >> +		return PERF_COUNT_HW_CACHE_MAX;
> > 
> > I feel like I just read this code, can we reuse it with a pointer to the
> > array?
> > 
> >> +
> >> +	config = (kvm_pmu_hw_cache_events[i].cache_type & 0xff)
> >> +		 | ((kvm_pmu_hw_cache_events[i].cache_op & 0xff) << 8)
> >> +		 | ((kvm_pmu_hw_cache_events[i].cache_result & 0xff) << 16);
> >> +
> >> +	return config;
> >> +}
> >> +
> >> +/**
> >> + * kvm_pmu_set_counter_event_type - set selected counter to monitor some event
> >> + * @vcpu: The vcpu pointer
> >> + * @data: The data guest writes to PMXEVTYPER_EL0
> >> + * @select_idx: The number of selected counter
> >> + *
> >> + * Firstly check whether the event type is same with the one to be set.
> >> + * If not, stop counter to monitor current event and find the event type map id.
> >> + * According to the bits of data to configure this perf_event attr and set
> >> + * exclude_host to 1 for guest. Then call perf_event API to create the
> >> + * corresponding event and save the event pointer.
> > 
> > This text seems to be describing more how the function does something,
> > as opposed to what it does and why.  I found it a little hard to read.
> > 
> >> + */
> >> +void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, unsigned long data,
> >> +				    unsigned long select_idx)
> >> +{
> >> +	struct kvm_pmu *pmu = &vcpu->arch.pmu;
> >> +	struct kvm_pmc *pmc = &pmu->pmc[select_idx];
> >> +	struct perf_event *event;
> >> +	struct perf_event_attr attr;
> >> +	unsigned config, type = PERF_TYPE_RAW;
> >> +
> >> +	if ((data & ARMV8_EVTYPE_EVENT) == pmc->eventsel)
> >> +		return;
> >> +
> >> +	kvm_pmu_stop_counter(vcpu, select_idx);
> >> +	pmc->eventsel = data & ARMV8_EVTYPE_EVENT;
> >> +
> >> +	config = kvm_pmu_find_hw_event(pmu, pmc->eventsel);
> >> +	if (config != PERF_COUNT_HW_MAX) {
> >> +		type = PERF_TYPE_HARDWARE;
> >> +	} else {
> >> +		config = kvm_pmu_find_hw_cache_event(pmu, pmc->eventsel);
> >> +		if (config != PERF_COUNT_HW_CACHE_MAX)
> >> +			type = PERF_TYPE_HW_CACHE;
> >> +	}
> >> +
> >> +	if (type == PERF_TYPE_RAW)
> >> +		config = pmc->eventsel;
> > 
> > don't you need to memset attr to 0 first?
> > 
> ok
> > otherwise, how do you ensure that for example exclude_guest is always
> > clear?
> > 
> >> +
> >> +	attr.type = type;
> >> +	attr.size = sizeof(attr);
> >> +	attr.pinned = true;
> >> +	attr.exclude_user = data & ARMV8_EXCLUDE_EL0 ? 1 : 0;
> >> +	attr.exclude_kernel = data & ARMV8_EXCLUDE_EL1 ? 1 : 0;
> >> +	attr.exclude_hv = data & ARMV8_INCLUDE_EL2 ? 0 : 1;
> > 
> > should the guest be able to see something counted in the hypervisor ever
> > or should that only be the host being able to see that?
> > 
> > my gut feeling is that the hypervisor should be hidden from the guest
> > and that exclude_hv = 0, is the right choice.  But this is a question
> > about the semantics of perf, I suppose.
> > 
> 
> This is what I'm not sure. I just thought about if guest has complete
> ELs(EL3-EL0) and how to deal with nested virtualization.
> 

nested virtualization is really not very likely to happen on arm64.  I
think the right way to look at this is that we're emulating a platform
without EL2 and therefore the guest should not see any events from EL2.

> >> +	attr.exclude_host = 1;
> >> +	attr.config = config;
> >> +	attr.sample_period = (-pmc->counter) & (((u64)1 << 32) - 1);
> > 
> > whoa, what is this scary calculation?
> > 
> > definitely needs an explanation?
> > 
> >> +
> >> +	event = perf_event_create_kernel_counter(&attr, -1, current, NULL, pmc);
> >> +	if (IS_ERR(event)) {
> >> +		kvm_err("kvm: pmu event creation failed %ld\n",
> >> +			  PTR_ERR(event));
> > 
> > doesn't this mean we'll spam the kernel log if the guest supplies
> > bogus/unsupported event numbers?
> > 
> 
> X86 uses printk_once, is that ok to us here?

yeah, that should be fine.

> 
> > In that case it shoudl be kvm_debug and the guest should be able to see
> > this somehow (e.g. events don't count).
> > 
> 
> Yes, will think about this.
> 

printk_once of kvm_debug would be fine.

Thanks,
-Christoffer
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
index 27d14ca..1050b24 100644
--- a/include/kvm/arm_pmu.h
+++ b/include/kvm/arm_pmu.h
@@ -45,9 +45,13 @@  struct kvm_pmu {
 
 #ifdef CONFIG_KVM_ARM_PMU
 void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu);
+void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, unsigned long data,
+				    unsigned long select_idx);
 void kvm_pmu_init(struct kvm_vcpu *vcpu);
 #else
 static inline void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) {}
+void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, unsigned long data,
+				    unsigned long select_idx) {}
 static inline void kvm_pmu_init(struct kvm_vcpu *vcpu) {}
 #endif
 
diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
index dc252d0..50a3c82 100644
--- a/virt/kvm/arm/pmu.c
+++ b/virt/kvm/arm/pmu.c
@@ -18,8 +18,68 @@ 
 #include <linux/cpu.h>
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
+#include <linux/perf_event.h>
 #include <kvm/arm_pmu.h>
 
+/* PMU HW events mapping. */
+static struct kvm_pmu_hw_event_map {
+	unsigned eventsel;
+	unsigned event_type;
+} kvm_pmu_hw_events[] = {
+	[0] = { 0x11, PERF_COUNT_HW_CPU_CYCLES },
+	[1] = { 0x08, PERF_COUNT_HW_INSTRUCTIONS },
+	[2] = { 0x04, PERF_COUNT_HW_CACHE_REFERENCES },
+	[3] = { 0x03, PERF_COUNT_HW_CACHE_MISSES },
+	[4] = { 0x10, PERF_COUNT_HW_BRANCH_MISSES },
+};
+
+/* PMU HW cache events mapping. */
+static struct kvm_pmu_hw_cache_event_map {
+	unsigned eventsel;
+	unsigned cache_type;
+	unsigned cache_op;
+	unsigned cache_result;
+} kvm_pmu_hw_cache_events[] = {
+	[0] = { 0x04, PERF_COUNT_HW_CACHE_L1D, PERF_COUNT_HW_CACHE_OP_READ,
+		      PERF_COUNT_HW_CACHE_RESULT_ACCESS },
+	[1] = { 0x03, PERF_COUNT_HW_CACHE_L1D, PERF_COUNT_HW_CACHE_OP_READ,
+		      PERF_COUNT_HW_CACHE_RESULT_MISS },
+	[2] = { 0x04, PERF_COUNT_HW_CACHE_L1D, PERF_COUNT_HW_CACHE_OP_WRITE,
+		      PERF_COUNT_HW_CACHE_RESULT_ACCESS },
+	[3] = { 0x03, PERF_COUNT_HW_CACHE_L1D, PERF_COUNT_HW_CACHE_OP_WRITE,
+		      PERF_COUNT_HW_CACHE_RESULT_MISS },
+	[4] = { 0x12, PERF_COUNT_HW_CACHE_BPU, PERF_COUNT_HW_CACHE_OP_READ,
+		      PERF_COUNT_HW_CACHE_RESULT_ACCESS },
+	[5] = { 0x10, PERF_COUNT_HW_CACHE_BPU, PERF_COUNT_HW_CACHE_OP_READ,
+		      PERF_COUNT_HW_CACHE_RESULT_MISS },
+	[6] = { 0x12, PERF_COUNT_HW_CACHE_BPU, PERF_COUNT_HW_CACHE_OP_WRITE,
+		      PERF_COUNT_HW_CACHE_RESULT_ACCESS },
+	[7] = { 0x10, PERF_COUNT_HW_CACHE_BPU, PERF_COUNT_HW_CACHE_OP_WRITE,
+		      PERF_COUNT_HW_CACHE_RESULT_MISS },
+};
+
+/**
+ * kvm_pmu_stop_counter - stop PMU counter for the selected counter
+ * @vcpu: The vcpu pointer
+ * @select_idx: The counter index
+ *
+ * If this counter has been configured to monitor some event, disable and
+ * release it.
+ */
+static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu,
+				 unsigned long select_idx)
+{
+	struct kvm_pmu *pmu = &vcpu->arch.pmu;
+	struct kvm_pmc *pmc = &pmu->pmc[select_idx];
+
+	if (pmc->perf_event) {
+		perf_event_disable(pmc->perf_event);
+		perf_event_release_kernel(pmc->perf_event);
+	}
+	pmc->perf_event = NULL;
+	pmc->eventsel = 0xff;
+}
+
 /**
  * kvm_pmu_vcpu_reset - reset pmu state for cpu
  * @vcpu: The vcpu pointer
@@ -27,12 +87,125 @@ 
  */
 void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu)
 {
+	int i;
 	struct kvm_pmu *pmu = &vcpu->arch.pmu;
 
+	for (i = 0; i < ARMV8_MAX_COUNTERS; i++)
+		kvm_pmu_stop_counter(vcpu, i);
+	pmu->overflow_status = 0;
 	pmu->irq_pending = false;
 }
 
 /**
+ * kvm_pmu_find_hw_event - find hardware event
+ * @pmu: The pmu pointer
+ * @event_select: The number of selected event type
+ *
+ * Based on the number of selected event type, find out whether it belongs to
+ * PERF_TYPE_HARDWARE. If so, return the corresponding event id.
+ */
+static unsigned kvm_pmu_find_hw_event(struct kvm_pmu *pmu,
+				      unsigned long event_select)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(kvm_pmu_hw_events); i++)
+		if (kvm_pmu_hw_events[i].eventsel == event_select)
+			break;
+
+	if (i == ARRAY_SIZE(kvm_pmu_hw_events))
+		return PERF_COUNT_HW_MAX;
+
+	return kvm_pmu_hw_events[i].event_type;
+}
+
+/**
+ * kvm_pmu_find_hw_cache_event - find hardware cache event
+ * @pmu: The pmu pointer
+ * @event_select: The number of selected event type
+ *
+ * Based on the number of selected event type, find out whether it belongs to
+ * PERF_TYPE_HW_CACHE. If so, return the corresponding event id.
+ */
+static unsigned kvm_pmu_find_hw_cache_event(struct kvm_pmu *pmu,
+					    unsigned long event_select)
+{
+	int i;
+	unsigned config;
+
+	for (i = 0; i < ARRAY_SIZE(kvm_pmu_hw_cache_events); i++)
+		if (kvm_pmu_hw_cache_events[i].eventsel == event_select)
+			break;
+
+	if (i == ARRAY_SIZE(kvm_pmu_hw_cache_events))
+		return PERF_COUNT_HW_CACHE_MAX;
+
+	config = (kvm_pmu_hw_cache_events[i].cache_type & 0xff)
+		 | ((kvm_pmu_hw_cache_events[i].cache_op & 0xff) << 8)
+		 | ((kvm_pmu_hw_cache_events[i].cache_result & 0xff) << 16);
+
+	return config;
+}
+
+/**
+ * kvm_pmu_set_counter_event_type - set selected counter to monitor some event
+ * @vcpu: The vcpu pointer
+ * @data: The data guest writes to PMXEVTYPER_EL0
+ * @select_idx: The number of selected counter
+ *
+ * Firstly check whether the event type is same with the one to be set.
+ * If not, stop counter to monitor current event and find the event type map id.
+ * According to the bits of data to configure this perf_event attr and set
+ * exclude_host to 1 for guest. Then call perf_event API to create the
+ * corresponding event and save the event pointer.
+ */
+void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, unsigned long data,
+				    unsigned long select_idx)
+{
+	struct kvm_pmu *pmu = &vcpu->arch.pmu;
+	struct kvm_pmc *pmc = &pmu->pmc[select_idx];
+	struct perf_event *event;
+	struct perf_event_attr attr;
+	unsigned config, type = PERF_TYPE_RAW;
+
+	if ((data & ARMV8_EVTYPE_EVENT) == pmc->eventsel)
+		return;
+
+	kvm_pmu_stop_counter(vcpu, select_idx);
+	pmc->eventsel = data & ARMV8_EVTYPE_EVENT;
+
+	config = kvm_pmu_find_hw_event(pmu, pmc->eventsel);
+	if (config != PERF_COUNT_HW_MAX) {
+		type = PERF_TYPE_HARDWARE;
+	} else {
+		config = kvm_pmu_find_hw_cache_event(pmu, pmc->eventsel);
+		if (config != PERF_COUNT_HW_CACHE_MAX)
+			type = PERF_TYPE_HW_CACHE;
+	}
+
+	if (type == PERF_TYPE_RAW)
+		config = pmc->eventsel;
+
+	attr.type = type;
+	attr.size = sizeof(attr);
+	attr.pinned = true;
+	attr.exclude_user = data & ARMV8_EXCLUDE_EL0 ? 1 : 0;
+	attr.exclude_kernel = data & ARMV8_EXCLUDE_EL1 ? 1 : 0;
+	attr.exclude_hv = data & ARMV8_INCLUDE_EL2 ? 0 : 1;
+	attr.exclude_host = 1;
+	attr.config = config;
+	attr.sample_period = (-pmc->counter) & (((u64)1 << 32) - 1);
+
+	event = perf_event_create_kernel_counter(&attr, -1, current, NULL, pmc);
+	if (IS_ERR(event)) {
+		kvm_err("kvm: pmu event creation failed %ld\n",
+			  PTR_ERR(event));
+		return;
+	}
+	pmc->perf_event = event;
+}
+
+/**
  * kvm_pmu_init - Initialize global PMU state for per vcpu
  * @vcpu: The vcpu pointer
  *