diff mbox

[v8,20/20] KVM: ARM64: Add a new kvm ARM PMU device

Message ID 1450771695-11948-21-git-send-email-zhaoshenglong@huawei.com (mailing list archive)
State New, archived
Headers show

Commit Message

Shannon Zhao Dec. 22, 2015, 8:08 a.m. UTC
From: Shannon Zhao <shannon.zhao@linaro.org>

Add a new kvm device type KVM_DEV_TYPE_ARM_PMU_V3 for ARM PMU. Implement
the kvm_device_ops for it.

Signed-off-by: Shannon Zhao <shannon.zhao@linaro.org>
---
 Documentation/virtual/kvm/devices/arm-pmu.txt |  24 +++++
 arch/arm64/include/uapi/asm/kvm.h             |   4 +
 include/linux/kvm_host.h                      |   1 +
 include/uapi/linux/kvm.h                      |   2 +
 virt/kvm/arm/pmu.c                            | 128 ++++++++++++++++++++++++++
 virt/kvm/kvm_main.c                           |   4 +
 6 files changed, 163 insertions(+)
 create mode 100644 Documentation/virtual/kvm/devices/arm-pmu.txt

Comments

Marc Zyngier Jan. 7, 2016, 1:56 p.m. UTC | #1
On 22/12/15 08:08, Shannon Zhao wrote:
> From: Shannon Zhao <shannon.zhao@linaro.org>
> 
> Add a new kvm device type KVM_DEV_TYPE_ARM_PMU_V3 for ARM PMU. Implement
> the kvm_device_ops for it.
> 
> Signed-off-by: Shannon Zhao <shannon.zhao@linaro.org>
> ---
>  Documentation/virtual/kvm/devices/arm-pmu.txt |  24 +++++
>  arch/arm64/include/uapi/asm/kvm.h             |   4 +
>  include/linux/kvm_host.h                      |   1 +
>  include/uapi/linux/kvm.h                      |   2 +
>  virt/kvm/arm/pmu.c                            | 128 ++++++++++++++++++++++++++
>  virt/kvm/kvm_main.c                           |   4 +
>  6 files changed, 163 insertions(+)
>  create mode 100644 Documentation/virtual/kvm/devices/arm-pmu.txt
> 
> diff --git a/Documentation/virtual/kvm/devices/arm-pmu.txt b/Documentation/virtual/kvm/devices/arm-pmu.txt
> new file mode 100644
> index 0000000..dda864e
> --- /dev/null
> +++ b/Documentation/virtual/kvm/devices/arm-pmu.txt
> @@ -0,0 +1,24 @@
> +ARM Virtual Performance Monitor Unit (vPMU)
> +===========================================
> +
> +Device types supported:
> +  KVM_DEV_TYPE_ARM_PMU_V3         ARM Performance Monitor Unit v3
> +
> +Instantiate one PMU instance for per VCPU through this API.
> +
> +Groups:
> +  KVM_DEV_ARM_PMU_GRP_IRQ
> +  Attributes:
> +    The attr field of kvm_device_attr encodes one value:
> +    bits:     | 63 .... 32 | 31 ....  0 |
> +    values:   |  reserved  | vcpu_index |
> +    A value describing the PMU overflow interrupt number for the specified
> +    vcpu_index vcpu. This interrupt could be a PPI or SPI, but for one VM the
> +    interrupt type must be same for each vcpu. As a PPI, the interrupt number is
> +    same for all vcpus, while as a SPI it must be different for each vcpu.

I don't see anything enforcing these restrictions in the code (you can
program different PPIs on each vcpu, or the same SPI everywhere, and
nothing will generate an error).

Is that something we want to enforce? Or we're happy just to leave it as
an unsupported corner case?

> +
> +  Errors:
> +    -ENXIO: Unsupported attribute group
> +    -EBUSY: The PMU overflow interrupt is already set
> +    -ENODEV: Getting the PMU overflow interrupt number while it's not set
> +    -EINVAL: Invalid vcpu_index or PMU overflow interrupt number supplied
> diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
> index 2d4ca4b..cbb9022 100644
> --- a/arch/arm64/include/uapi/asm/kvm.h
> +++ b/arch/arm64/include/uapi/asm/kvm.h
> @@ -204,6 +204,10 @@ struct kvm_arch_memory_slot {
>  #define KVM_DEV_ARM_VGIC_GRP_CTRL	4
>  #define   KVM_DEV_ARM_VGIC_CTRL_INIT	0
>  
> +/* Device Control API: ARM PMU */
> +#define KVM_DEV_ARM_PMU_GRP_IRQ		0
> +#define   KVM_DEV_ARM_PMU_CPUID_MASK	0xffffffffULL
> +
>  /* KVM_IRQ_LINE irq field index values */
>  #define KVM_ARM_IRQ_TYPE_SHIFT		24
>  #define KVM_ARM_IRQ_TYPE_MASK		0xff
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index c923350..608dea6 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1161,6 +1161,7 @@ extern struct kvm_device_ops kvm_mpic_ops;
>  extern struct kvm_device_ops kvm_xics_ops;
>  extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
>  extern struct kvm_device_ops kvm_arm_vgic_v3_ops;
> +extern struct kvm_device_ops kvm_arm_pmu_ops;
>  
>  #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
>  
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 03f3618..4ba6fdd 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -1032,6 +1032,8 @@ enum kvm_device_type {
>  #define KVM_DEV_TYPE_FLIC		KVM_DEV_TYPE_FLIC
>  	KVM_DEV_TYPE_ARM_VGIC_V3,
>  #define KVM_DEV_TYPE_ARM_VGIC_V3	KVM_DEV_TYPE_ARM_VGIC_V3
> +	KVM_DEV_TYPE_ARM_PMU_V3,
> +#define	KVM_DEV_TYPE_ARM_PMU_V3		KVM_DEV_TYPE_ARM_PMU_V3
>  	KVM_DEV_TYPE_MAX,
>  };
>  
> diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
> index 3ec3cdd..5518308 100644
> --- a/virt/kvm/arm/pmu.c
> +++ b/virt/kvm/arm/pmu.c
> @@ -19,6 +19,7 @@
>  #include <linux/kvm.h>
>  #include <linux/kvm_host.h>
>  #include <linux/perf_event.h>
> +#include <linux/uaccess.h>
>  #include <asm/kvm_emulate.h>
>  #include <kvm/arm_pmu.h>
>  #include <kvm/arm_vgic.h>
> @@ -374,3 +375,130 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
>  
>  	pmc->perf_event = event;
>  }
> +
> +static inline bool kvm_arm_pmu_initialized(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.pmu.irq_num != -1;
> +}
> +
> +static int kvm_arm_pmu_irq_access(struct kvm *kvm, struct kvm_device_attr *attr,
> +				  int *irq, bool is_set)
> +{
> +	int cpuid;
> +	struct kvm_vcpu *vcpu;
> +	struct kvm_pmu *pmu;
> +
> +	cpuid = attr->attr & KVM_DEV_ARM_PMU_CPUID_MASK;
> +	if (cpuid >= atomic_read(&kvm->online_vcpus))
> +		return -EINVAL;
> +
> +	vcpu = kvm_get_vcpu(kvm, cpuid);
> +	if (!vcpu)
> +		return -EINVAL;
> +
> +	pmu = &vcpu->arch.pmu;
> +	if (!is_set) {
> +		if (!kvm_arm_pmu_initialized(vcpu))
> +			return -ENODEV;
> +
> +		*irq = pmu->irq_num;
> +	} else {
> +		if (kvm_arm_pmu_initialized(vcpu))
> +			return -EBUSY;
> +
> +		kvm_debug("Set kvm ARM PMU irq: %d\n", *irq);
> +		pmu->irq_num = *irq;
> +	}
> +
> +	return 0;
> +}
> +
> +static int kvm_arm_pmu_create(struct kvm_device *dev, u32 type)
> +{
> +	int i;
> +	struct kvm_vcpu *vcpu;
> +	struct kvm *kvm = dev->kvm;
> +
> +	kvm_for_each_vcpu(i, vcpu, kvm) {
> +		struct kvm_pmu *pmu = &vcpu->arch.pmu;
> +
> +		memset(pmu, 0, sizeof(*pmu));
> +		kvm_pmu_vcpu_reset(vcpu);
> +		pmu->irq_num = -1;
> +	}
> +
> +	return 0;
> +}
> +
> +static void kvm_arm_pmu_destroy(struct kvm_device *dev)
> +{
> +	kfree(dev);
> +}
> +
> +static int kvm_arm_pmu_set_attr(struct kvm_device *dev,
> +				struct kvm_device_attr *attr)
> +{
> +	switch (attr->group) {
> +	case KVM_DEV_ARM_PMU_GRP_IRQ: {
> +		int __user *uaddr = (int __user *)(long)attr->addr;
> +		int reg;
> +
> +		if (get_user(reg, uaddr))
> +			return -EFAULT;
> +
> +		/*
> +		 * The PMU overflow interrupt could be a PPI or SPI, but for one
> +		 * VM the interrupt type must be same for each vcpu. As a PPI,
> +		 * the interrupt number is same for all vcpus, while as a SPI it
> +		 * must be different for each vcpu.
> +		 */
> +		if (reg < VGIC_NR_SGIS || reg >= dev->kvm->arch.vgic.nr_irqs)
> +			return -EINVAL;
> +
> +		return kvm_arm_pmu_irq_access(dev->kvm, attr, &reg, true);
> +	}
> +	}
> +
> +	return -ENXIO;
> +}
> +
> +static int kvm_arm_pmu_get_attr(struct kvm_device *dev,
> +				struct kvm_device_attr *attr)
> +{
> +	int ret;
> +
> +	switch (attr->group) {
> +	case KVM_DEV_ARM_PMU_GRP_IRQ: {
> +		int __user *uaddr = (int __user *)(long)attr->addr;
> +		int reg = -1;
> +
> +
> +		ret = kvm_arm_pmu_irq_access(dev->kvm, attr, &reg, false);
> +		if (ret)
> +			return ret;
> +		return put_user(reg, uaddr);
> +	}
> +	}
> +
> +	return -ENXIO;
> +}
> +
> +static int kvm_arm_pmu_has_attr(struct kvm_device *dev,
> +				struct kvm_device_attr *attr)
> +{
> +	switch (attr->group) {
> +	case KVM_DEV_ARM_PMU_GRP_IRQ:
> +		return 0;
> +	}
> +
> +	return -ENXIO;
> +}
> +
> +struct kvm_device_ops kvm_arm_pmu_ops = {
> +	.name = "kvm-arm-pmu",
> +	.create = kvm_arm_pmu_create,
> +	.destroy = kvm_arm_pmu_destroy,
> +	.set_attr = kvm_arm_pmu_set_attr,
> +	.get_attr = kvm_arm_pmu_get_attr,
> +	.has_attr = kvm_arm_pmu_has_attr,
> +};
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 484079e..81a42cc 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2647,6 +2647,10 @@ static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
>  #ifdef CONFIG_KVM_XICS
>  	[KVM_DEV_TYPE_XICS]		= &kvm_xics_ops,
>  #endif
> +
> +#ifdef CONFIG_KVM_ARM_PMU
> +	[KVM_DEV_TYPE_ARM_PMU_V3]	= &kvm_arm_pmu_ops,
> +#endif
>  };
>  
>  int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type)
> 

Thanks,

	M.
Peter Maydell Jan. 7, 2016, 2:36 p.m. UTC | #2
On 22 December 2015 at 08:08, Shannon Zhao <zhaoshenglong@huawei.com> wrote:
> From: Shannon Zhao <shannon.zhao@linaro.org>
>
> Add a new kvm device type KVM_DEV_TYPE_ARM_PMU_V3 for ARM PMU. Implement
> the kvm_device_ops for it.
>
> Signed-off-by: Shannon Zhao <shannon.zhao@linaro.org>
> ---
>  Documentation/virtual/kvm/devices/arm-pmu.txt |  24 +++++
>  arch/arm64/include/uapi/asm/kvm.h             |   4 +
>  include/linux/kvm_host.h                      |   1 +
>  include/uapi/linux/kvm.h                      |   2 +
>  virt/kvm/arm/pmu.c                            | 128 ++++++++++++++++++++++++++
>  virt/kvm/kvm_main.c                           |   4 +
>  6 files changed, 163 insertions(+)
>  create mode 100644 Documentation/virtual/kvm/devices/arm-pmu.txt
>
> diff --git a/Documentation/virtual/kvm/devices/arm-pmu.txt b/Documentation/virtual/kvm/devices/arm-pmu.txt
> new file mode 100644
> index 0000000..dda864e
> --- /dev/null
> +++ b/Documentation/virtual/kvm/devices/arm-pmu.txt
> @@ -0,0 +1,24 @@
> +ARM Virtual Performance Monitor Unit (vPMU)
> +===========================================
> +
> +Device types supported:
> +  KVM_DEV_TYPE_ARM_PMU_V3         ARM Performance Monitor Unit v3
> +
> +Instantiate one PMU instance for per VCPU through this API.

Do you mean that userspace has to call this API once per vCPU to
create each PMU, or that calling the device create ioctl once makes
the kernel instantiate a PMU for each vCPU?

(It's a little bit confusing that we say "this API" to mean
"not the API documented in this file at all but actually
the KVM_CREATE_DEVICE ioctl", but I see we do that in the GIC
API docs too.)

> +
> +Groups:
> +  KVM_DEV_ARM_PMU_GRP_IRQ
> +  Attributes:
> +    The attr field of kvm_device_attr encodes one value:
> +    bits:     | 63 .... 32 | 31 ....  0 |
> +    values:   |  reserved  | vcpu_index |
> +    A value describing the PMU overflow interrupt number for the specified
> +    vcpu_index vcpu. This interrupt could be a PPI or SPI, but for one VM the
> +    interrupt type must be same for each vcpu. As a PPI, the interrupt number is
> +    same for all vcpus, while as a SPI it must be different for each vcpu.

I see we're using vcpu_index rather than MPIDR affinity value
for specifying which CPU we're configuring. Is this in line with
our planned API for GICv3 configuration?

> +  Errors:
> +    -ENXIO: Unsupported attribute group
> +    -EBUSY: The PMU overflow interrupt is already set
> +    -ENODEV: Getting the PMU overflow interrupt number while it's not set
> +    -EINVAL: Invalid vcpu_index or PMU overflow interrupt number supplied

What happens if you create a PMU but then never set the IRQ number?
Is there a default or does the VM refuse to run or something?

Do we want an attribute like KVM_DEV_ARM_VGIC_CTRL_INIT so userspace
can say "I have definitely completed configuration of this device now" ?

We wound up with a fair amount of legacy mess to deal with in the
GIC code because we didn't put one of those in from the start.
(Perhaps we should aim to standardize on all kvm devices having one?)

thanks
-- PMM
Shannon Zhao Jan. 7, 2016, 2:49 p.m. UTC | #3
On 2016/1/7 22:36, Peter Maydell wrote:
> On 22 December 2015 at 08:08, Shannon Zhao <zhaoshenglong@huawei.com> wrote:
>> From: Shannon Zhao <shannon.zhao@linaro.org>
>>
>> Add a new kvm device type KVM_DEV_TYPE_ARM_PMU_V3 for ARM PMU. Implement
>> the kvm_device_ops for it.
>>
>> Signed-off-by: Shannon Zhao <shannon.zhao@linaro.org>
>> ---
>>  Documentation/virtual/kvm/devices/arm-pmu.txt |  24 +++++
>>  arch/arm64/include/uapi/asm/kvm.h             |   4 +
>>  include/linux/kvm_host.h                      |   1 +
>>  include/uapi/linux/kvm.h                      |   2 +
>>  virt/kvm/arm/pmu.c                            | 128 ++++++++++++++++++++++++++
>>  virt/kvm/kvm_main.c                           |   4 +
>>  6 files changed, 163 insertions(+)
>>  create mode 100644 Documentation/virtual/kvm/devices/arm-pmu.txt
>>
>> diff --git a/Documentation/virtual/kvm/devices/arm-pmu.txt b/Documentation/virtual/kvm/devices/arm-pmu.txt
>> new file mode 100644
>> index 0000000..dda864e
>> --- /dev/null
>> +++ b/Documentation/virtual/kvm/devices/arm-pmu.txt
>> @@ -0,0 +1,24 @@
>> +ARM Virtual Performance Monitor Unit (vPMU)
>> +===========================================
>> +
>> +Device types supported:
>> +  KVM_DEV_TYPE_ARM_PMU_V3         ARM Performance Monitor Unit v3
>> +
>> +Instantiate one PMU instance for per VCPU through this API.
> 
> Do you mean that userspace has to call this API once per vCPU to
> create each PMU, or that calling the device create ioctl once makes
> the kernel instantiate a PMU for each vCPU?
> 
Call the device create ioctl once and kvm will create a PMU for each
vCPU. But userspace should set the irqs for each PMU since for SPI they
are different.

> (It's a little bit confusing that we say "this API" to mean
> "not the API documented in this file at all but actually
> the KVM_CREATE_DEVICE ioctl", but I see we do that in the GIC
> API docs too.)
> 
>> +
>> +Groups:
>> +  KVM_DEV_ARM_PMU_GRP_IRQ
>> +  Attributes:
>> +    The attr field of kvm_device_attr encodes one value:
>> +    bits:     | 63 .... 32 | 31 ....  0 |
>> +    values:   |  reserved  | vcpu_index |
>> +    A value describing the PMU overflow interrupt number for the specified
>> +    vcpu_index vcpu. This interrupt could be a PPI or SPI, but for one VM the
>> +    interrupt type must be same for each vcpu. As a PPI, the interrupt number is
>> +    same for all vcpus, while as a SPI it must be different for each vcpu.
> 
> I see we're using vcpu_index rather than MPIDR affinity value
> for specifying which CPU we're configuring. Is this in line with
> our planned API for GICv3 configuration?
> 
Here vcpu_index is used to indexing the vCPU, no special use.

>> +  Errors:
>> +    -ENXIO: Unsupported attribute group
>> +    -EBUSY: The PMU overflow interrupt is already set
>> +    -ENODEV: Getting the PMU overflow interrupt number while it's not set
>> +    -EINVAL: Invalid vcpu_index or PMU overflow interrupt number supplied
> 
> What happens if you create a PMU but then never set the IRQ number?
> Is there a default or does the VM refuse to run or something?
> 
If userspace doesn't specify the irq number, the guest will not receive
the PMU interrupt because we check if the irq is initialized when we
inject the interrupt. But guest could still use the vPMU if QEMU
generates a proper DTB or ACPI.

> Do we want an attribute like KVM_DEV_ARM_VGIC_CTRL_INIT so userspace
> can say "I have definitely completed configuration of this device now" ?
> 
> We wound up with a fair amount of legacy mess to deal with in the
> GIC code because we didn't put one of those in from the start.
> (Perhaps we should aim to standardize on all kvm devices having one?)
> 
> thanks
> -- PMM
> 
> .
> 

Thanks,
Peter Maydell Jan. 7, 2016, 2:56 p.m. UTC | #4
On 7 January 2016 at 14:49, Shannon Zhao <zhaoshenglong@huawei.com> wrote:
>
>
> On 2016/1/7 22:36, Peter Maydell wrote:
>> On 22 December 2015 at 08:08, Shannon Zhao <zhaoshenglong@huawei.com> wrote:
>>> From: Shannon Zhao <shannon.zhao@linaro.org>
>>>
>>> Add a new kvm device type KVM_DEV_TYPE_ARM_PMU_V3 for ARM PMU. Implement
>>> the kvm_device_ops for it.
>>>
>>> Signed-off-by: Shannon Zhao <shannon.zhao@linaro.org>
>>> ---
>>>  Documentation/virtual/kvm/devices/arm-pmu.txt |  24 +++++
>>>  arch/arm64/include/uapi/asm/kvm.h             |   4 +
>>>  include/linux/kvm_host.h                      |   1 +
>>>  include/uapi/linux/kvm.h                      |   2 +
>>>  virt/kvm/arm/pmu.c                            | 128 ++++++++++++++++++++++++++
>>>  virt/kvm/kvm_main.c                           |   4 +
>>>  6 files changed, 163 insertions(+)
>>>  create mode 100644 Documentation/virtual/kvm/devices/arm-pmu.txt
>>>
>>> diff --git a/Documentation/virtual/kvm/devices/arm-pmu.txt b/Documentation/virtual/kvm/devices/arm-pmu.txt
>>> new file mode 100644
>>> index 0000000..dda864e
>>> --- /dev/null
>>> +++ b/Documentation/virtual/kvm/devices/arm-pmu.txt
>>> @@ -0,0 +1,24 @@
>>> +ARM Virtual Performance Monitor Unit (vPMU)
>>> +===========================================
>>> +
>>> +Device types supported:
>>> +  KVM_DEV_TYPE_ARM_PMU_V3         ARM Performance Monitor Unit v3
>>> +
>>> +Instantiate one PMU instance for per VCPU through this API.
>>
>> Do you mean that userspace has to call this API once per vCPU to
>> create each PMU, or that calling the device create ioctl once makes
>> the kernel instantiate a PMU for each vCPU?
>>
> Call the device create ioctl once and kvm will create a PMU for each
> vCPU. But userspace should set the irqs for each PMU since for SPI they
> are different.
>
>> (It's a little bit confusing that we say "this API" to mean
>> "not the API documented in this file at all but actually
>> the KVM_CREATE_DEVICE ioctl", but I see we do that in the GIC
>> API docs too.)
>>
>>> +
>>> +Groups:
>>> +  KVM_DEV_ARM_PMU_GRP_IRQ
>>> +  Attributes:
>>> +    The attr field of kvm_device_attr encodes one value:
>>> +    bits:     | 63 .... 32 | 31 ....  0 |
>>> +    values:   |  reserved  | vcpu_index |
>>> +    A value describing the PMU overflow interrupt number for the specified
>>> +    vcpu_index vcpu. This interrupt could be a PPI or SPI, but for one VM the
>>> +    interrupt type must be same for each vcpu. As a PPI, the interrupt number is
>>> +    same for all vcpus, while as a SPI it must be different for each vcpu.
>>
>> I see we're using vcpu_index rather than MPIDR affinity value
>> for specifying which CPU we're configuring. Is this in line with
>> our planned API for GICv3 configuration?
>>
> Here vcpu_index is used to indexing the vCPU, no special use.

Yes, but you can identify the CPU by index, or by its MPIDR.
We had a discussion about which was the best way for doing
the VGIC API, and I can't remember which way round we ended up
going for. Whichever we chose, we should do the same thing here.

>>> +  Errors:
>>> +    -ENXIO: Unsupported attribute group
>>> +    -EBUSY: The PMU overflow interrupt is already set
>>> +    -ENODEV: Getting the PMU overflow interrupt number while it's not set
>>> +    -EINVAL: Invalid vcpu_index or PMU overflow interrupt number supplied
>>
>> What happens if you create a PMU but then never set the IRQ number?
>> Is there a default or does the VM refuse to run or something?
>>
> If userspace doesn't specify the irq number, the guest will not receive
> the PMU interrupt because we check if the irq is initialized when we
> inject the interrupt. But guest could still use the vPMU if QEMU
> generates a proper DTB or ACPI.

So is it a valid use case to create a PMU with the interrupt not wired
up to anything? (If it's never valid it would be nice to diagnose it
rather than just silently letting the guest run but not work right.)

thanks
-- PMM
Andrew Jones Jan. 7, 2016, 8:18 p.m. UTC | #5
On Tue, Dec 22, 2015 at 04:08:15PM +0800, Shannon Zhao wrote:
> From: Shannon Zhao <shannon.zhao@linaro.org>
> 
> Add a new kvm device type KVM_DEV_TYPE_ARM_PMU_V3 for ARM PMU. Implement
> the kvm_device_ops for it.
> 
> Signed-off-by: Shannon Zhao <shannon.zhao@linaro.org>
> ---
>  Documentation/virtual/kvm/devices/arm-pmu.txt |  24 +++++
>  arch/arm64/include/uapi/asm/kvm.h             |   4 +
>  include/linux/kvm_host.h                      |   1 +
>  include/uapi/linux/kvm.h                      |   2 +
>  virt/kvm/arm/pmu.c                            | 128 ++++++++++++++++++++++++++
>  virt/kvm/kvm_main.c                           |   4 +
>  6 files changed, 163 insertions(+)
>  create mode 100644 Documentation/virtual/kvm/devices/arm-pmu.txt
> 
> diff --git a/Documentation/virtual/kvm/devices/arm-pmu.txt b/Documentation/virtual/kvm/devices/arm-pmu.txt
> new file mode 100644
> index 0000000..dda864e
> --- /dev/null
> +++ b/Documentation/virtual/kvm/devices/arm-pmu.txt
> @@ -0,0 +1,24 @@
> +ARM Virtual Performance Monitor Unit (vPMU)
> +===========================================
> +
> +Device types supported:
> +  KVM_DEV_TYPE_ARM_PMU_V3         ARM Performance Monitor Unit v3
> +
> +Instantiate one PMU instance for per VCPU through this API.
                                ^^ don't need this 'for' here

> +
> +Groups:
> +  KVM_DEV_ARM_PMU_GRP_IRQ
> +  Attributes:
> +    The attr field of kvm_device_attr encodes one value:
> +    bits:     | 63 .... 32 | 31 ....  0 |
> +    values:   |  reserved  | vcpu_index |

Everywhere else in kvm documentation a vcpu_index is 8 bits. I'm not
saying that that's good, but expanding it to 32 bits here is
inconsistent. (A side note is that I think we should s/vcpu_index/vcpu_id/
in order to keep the parameter name consistent with what's used by
KVM_CREATE_VCPU)

> +    A value describing the PMU overflow interrupt number for the specified
> +    vcpu_index vcpu. This interrupt could be a PPI or SPI, but for one VM the
> +    interrupt type must be same for each vcpu. As a PPI, the interrupt number is
> +    same for all vcpus, while as a SPI it must be different for each vcpu.
> +
> +  Errors:
> +    -ENXIO: Unsupported attribute group

Do we need to specify ENXIO here? It's already specified in
Documentation/virtual/kvm/api.txt for SET/GET_DEVICE_ATTR

> +    -EBUSY: The PMU overflow interrupt is already set
> +    -ENODEV: Getting the PMU overflow interrupt number while it's not set
> +    -EINVAL: Invalid vcpu_index or PMU overflow interrupt number supplied
> diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
> index 2d4ca4b..cbb9022 100644
> --- a/arch/arm64/include/uapi/asm/kvm.h
> +++ b/arch/arm64/include/uapi/asm/kvm.h
> @@ -204,6 +204,10 @@ struct kvm_arch_memory_slot {
>  #define KVM_DEV_ARM_VGIC_GRP_CTRL	4
>  #define   KVM_DEV_ARM_VGIC_CTRL_INIT	0
>  
> +/* Device Control API: ARM PMU */
> +#define KVM_DEV_ARM_PMU_GRP_IRQ		0
> +#define   KVM_DEV_ARM_PMU_CPUID_MASK	0xffffffffULL

I don't think we should need a mask like this, at least not in the uapi.
The documentation says to use a vcpu-index [really a vcpu-id], and users
should know how to convert their vcpu handle (whatever that may be) to
a vcpu-id using whatever transformation is necessary. They may already
have a "vcpu id mask" defined (this is one reason why I don't think we
should use a 32-bit vcpu-index here, instead of the 8-bit vcpu-index used
everywhere else). Likewise, kvm should know how to do it's transformation.
Maybe, to aid in the attr field construction, we should supply a shift,
allowing both sides to do something like

int pmu_attr_extract_vcpu_id(u64 attr)
{
    return (attr >> KVM_DEV_ARM_PMU_VCPUID_SHIFT) & VCPU_ID_MASK;
}

u64 pmu_attr_create(int vcpu_id)
{
    return vcpu_id << KVM_DEV_ARM_PMU_VCPUID_SHIFT;
}

But, in this case the shift is zero, so it's not really necessary. In
any case, please add the 'V' for VCPU.

> +
>  /* KVM_IRQ_LINE irq field index values */
>  #define KVM_ARM_IRQ_TYPE_SHIFT		24
>  #define KVM_ARM_IRQ_TYPE_MASK		0xff
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index c923350..608dea6 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1161,6 +1161,7 @@ extern struct kvm_device_ops kvm_mpic_ops;
>  extern struct kvm_device_ops kvm_xics_ops;
>  extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
>  extern struct kvm_device_ops kvm_arm_vgic_v3_ops;
> +extern struct kvm_device_ops kvm_arm_pmu_ops;
>  
>  #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
>  
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 03f3618..4ba6fdd 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -1032,6 +1032,8 @@ enum kvm_device_type {
>  #define KVM_DEV_TYPE_FLIC		KVM_DEV_TYPE_FLIC
>  	KVM_DEV_TYPE_ARM_VGIC_V3,
>  #define KVM_DEV_TYPE_ARM_VGIC_V3	KVM_DEV_TYPE_ARM_VGIC_V3
> +	KVM_DEV_TYPE_ARM_PMU_V3,
> +#define	KVM_DEV_TYPE_ARM_PMU_V3		KVM_DEV_TYPE_ARM_PMU_V3
>  	KVM_DEV_TYPE_MAX,
>  };
>  
> diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
> index 3ec3cdd..5518308 100644
> --- a/virt/kvm/arm/pmu.c
> +++ b/virt/kvm/arm/pmu.c
> @@ -19,6 +19,7 @@
>  #include <linux/kvm.h>
>  #include <linux/kvm_host.h>
>  #include <linux/perf_event.h>
> +#include <linux/uaccess.h>
>  #include <asm/kvm_emulate.h>
>  #include <kvm/arm_pmu.h>
>  #include <kvm/arm_vgic.h>
> @@ -374,3 +375,130 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
>  
>  	pmc->perf_event = event;
>  }
> +
> +static inline bool kvm_arm_pmu_initialized(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.pmu.irq_num != -1;
> +}
> +
> +static int kvm_arm_pmu_irq_access(struct kvm *kvm, struct kvm_device_attr *attr,
> +				  int *irq, bool is_set)
> +{
> +	int cpuid;

please call this vcpu_id

> +	struct kvm_vcpu *vcpu;
> +	struct kvm_pmu *pmu;
> +
> +	cpuid = attr->attr & KVM_DEV_ARM_PMU_CPUID_MASK;
> +	if (cpuid >= atomic_read(&kvm->online_vcpus))
> +		return -EINVAL;
> +
> +	vcpu = kvm_get_vcpu(kvm, cpuid);
> +	if (!vcpu)
> +		return -EINVAL;
> +
> +	pmu = &vcpu->arch.pmu;
> +	if (!is_set) {
> +		if (!kvm_arm_pmu_initialized(vcpu))
> +			return -ENODEV;
> +
> +		*irq = pmu->irq_num;
> +	} else {
> +		if (kvm_arm_pmu_initialized(vcpu))
> +			return -EBUSY;
> +
> +		kvm_debug("Set kvm ARM PMU irq: %d\n", *irq);
> +		pmu->irq_num = *irq;
> +	}
> +
> +	return 0;
> +}
> +
> +static int kvm_arm_pmu_create(struct kvm_device *dev, u32 type)
> +{
> +	int i;
> +	struct kvm_vcpu *vcpu;
> +	struct kvm *kvm = dev->kvm;
> +
> +	kvm_for_each_vcpu(i, vcpu, kvm) {
> +		struct kvm_pmu *pmu = &vcpu->arch.pmu;
> +
> +		memset(pmu, 0, sizeof(*pmu));

I don't think we want this memset. If we can only create the pmu
once, then it's unnecessary (we zalloc vcpus). And, if we can
recreate pmus with this call, then it'll create a memory leak, as
we'll be zero-ing out all the perf_event pointers, and then won't
be able to free them on the call to kvm_pmu_vcpu_reset. Naturally
we need to make sure we're NULL-ing them after each free instead.

> +		kvm_pmu_vcpu_reset(vcpu);
> +		pmu->irq_num = -1;
> +	}
> +
> +	return 0;
> +}
> +
> +static void kvm_arm_pmu_destroy(struct kvm_device *dev)
> +{
> +	kfree(dev);
> +}
> +
> +static int kvm_arm_pmu_set_attr(struct kvm_device *dev,
> +				struct kvm_device_attr *attr)
> +{
> +	switch (attr->group) {
> +	case KVM_DEV_ARM_PMU_GRP_IRQ: {
> +		int __user *uaddr = (int __user *)(long)attr->addr;
> +		int reg;
> +
> +		if (get_user(reg, uaddr))
> +			return -EFAULT;
> +
> +		/*
> +		 * The PMU overflow interrupt could be a PPI or SPI, but for one
> +		 * VM the interrupt type must be same for each vcpu. As a PPI,
> +		 * the interrupt number is same for all vcpus, while as a SPI it
> +		 * must be different for each vcpu.
> +		 */
> +		if (reg < VGIC_NR_SGIS || reg >= dev->kvm->arch.vgic.nr_irqs)
> +			return -EINVAL;
> +
> +		return kvm_arm_pmu_irq_access(dev->kvm, attr, &reg, true);
> +	}
> +	}
> +
> +	return -ENXIO;
> +}
> +
> +static int kvm_arm_pmu_get_attr(struct kvm_device *dev,
> +				struct kvm_device_attr *attr)
> +{
> +	int ret;
> +
> +	switch (attr->group) {
> +	case KVM_DEV_ARM_PMU_GRP_IRQ: {
> +		int __user *uaddr = (int __user *)(long)attr->addr;
> +		int reg = -1;
> +
> +
> +		ret = kvm_arm_pmu_irq_access(dev->kvm, attr, &reg, false);
> +		if (ret)
> +			return ret;
> +		return put_user(reg, uaddr);
> +	}
> +	}
> +
> +	return -ENXIO;
> +}
> +
> +static int kvm_arm_pmu_has_attr(struct kvm_device *dev,
> +				struct kvm_device_attr *attr)
> +{
> +	switch (attr->group) {
> +	case KVM_DEV_ARM_PMU_GRP_IRQ:
> +		return 0;
> +	}
> +
> +	return -ENXIO;
> +}
> +
> +struct kvm_device_ops kvm_arm_pmu_ops = {
> +	.name = "kvm-arm-pmu",
> +	.create = kvm_arm_pmu_create,
> +	.destroy = kvm_arm_pmu_destroy,
> +	.set_attr = kvm_arm_pmu_set_attr,
> +	.get_attr = kvm_arm_pmu_get_attr,
> +	.has_attr = kvm_arm_pmu_has_attr,
> +};
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 484079e..81a42cc 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2647,6 +2647,10 @@ static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
>  #ifdef CONFIG_KVM_XICS
>  	[KVM_DEV_TYPE_XICS]		= &kvm_xics_ops,
>  #endif
> +
> +#ifdef CONFIG_KVM_ARM_PMU
> +	[KVM_DEV_TYPE_ARM_PMU_V3]	= &kvm_arm_pmu_ops,

Shouldn't we specify 'v3' in the kvm_arm_pmu_ops name, as we do with the
device type name?

> +#endif
>  };
>  
>  int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type)
> -- 
> 2.0.4
> 

Thanks,
drew
Andrew Jones Jan. 7, 2016, 8:36 p.m. UTC | #6
On Thu, Jan 07, 2016 at 02:56:15PM +0000, Peter Maydell wrote:
> On 7 January 2016 at 14:49, Shannon Zhao <zhaoshenglong@huawei.com> wrote:
> >>> +
> >>> +Groups:
> >>> +  KVM_DEV_ARM_PMU_GRP_IRQ
> >>> +  Attributes:
> >>> +    The attr field of kvm_device_attr encodes one value:
> >>> +    bits:     | 63 .... 32 | 31 ....  0 |
> >>> +    values:   |  reserved  | vcpu_index |
> >>> +    A value describing the PMU overflow interrupt number for the specified
> >>> +    vcpu_index vcpu. This interrupt could be a PPI or SPI, but for one VM the
> >>> +    interrupt type must be same for each vcpu. As a PPI, the interrupt number is
> >>> +    same for all vcpus, while as a SPI it must be different for each vcpu.
> >>
> >> I see we're using vcpu_index rather than MPIDR affinity value
> >> for specifying which CPU we're configuring. Is this in line with
> >> our planned API for GICv3 configuration?
> >>
> > Here vcpu_index is used to indexing the vCPU, no special use.
> 
> Yes, but you can identify the CPU by index, or by its MPIDR.
> We had a discussion about which was the best way for doing
> the VGIC API, and I can't remember which way round we ended up
> going for. Whichever we chose, we should do the same thing here.

I think we should start up a new discussion on this. My understanding,
after a chat with Igor, who was involved in the untangling of vcpu-id and
apic-id for x86, is that using vcpu-id is preferred, unless of course
the device expects an apic-id/mpidr, in which case there's no reason to
translate it on both sides.

Other things to discuss are
1) Consistent use of the term vcpu-id vs. vcpu-index in the documentation.
   I just brought that up in my last reply to this patch.
2) Extending a vcpu-index [id] beyond 8-bits.
3) Should vcpu-ids be reusable after hotunplugging them?
4) Devices like this pmu and the vgic have some global (all vcpus wide)
   properties, but then also vcpu specific registers, which require the
   vcpu-index parameter. Should we instead create a "vcpu device ioctl"
   for these? I.e. extend SET/GET_DEVICE_ATTR to also be of type
   'vcpu ioctl', allowing the vcpu-index parameter to be dropped?

Thanks,
drew
Shannon Zhao Jan. 8, 2016, 2:53 a.m. UTC | #7
On 2016/1/8 4:18, Andrew Jones wrote:
> On Tue, Dec 22, 2015 at 04:08:15PM +0800, Shannon Zhao wrote:
>> From: Shannon Zhao <shannon.zhao@linaro.org>
>>
>> Add a new kvm device type KVM_DEV_TYPE_ARM_PMU_V3 for ARM PMU. Implement
>> the kvm_device_ops for it.
>>
>> Signed-off-by: Shannon Zhao <shannon.zhao@linaro.org>
>> ---
>>  Documentation/virtual/kvm/devices/arm-pmu.txt |  24 +++++
>>  arch/arm64/include/uapi/asm/kvm.h             |   4 +
>>  include/linux/kvm_host.h                      |   1 +
>>  include/uapi/linux/kvm.h                      |   2 +
>>  virt/kvm/arm/pmu.c                            | 128 ++++++++++++++++++++++++++
>>  virt/kvm/kvm_main.c                           |   4 +
>>  6 files changed, 163 insertions(+)
>>  create mode 100644 Documentation/virtual/kvm/devices/arm-pmu.txt
>>
>> diff --git a/Documentation/virtual/kvm/devices/arm-pmu.txt b/Documentation/virtual/kvm/devices/arm-pmu.txt
>> new file mode 100644
>> index 0000000..dda864e
>> --- /dev/null
>> +++ b/Documentation/virtual/kvm/devices/arm-pmu.txt
>> @@ -0,0 +1,24 @@
>> +ARM Virtual Performance Monitor Unit (vPMU)
>> +===========================================
>> +
>> +Device types supported:
>> +  KVM_DEV_TYPE_ARM_PMU_V3         ARM Performance Monitor Unit v3
>> +
>> +Instantiate one PMU instance for per VCPU through this API.
>                                 ^^ don't need this 'for' here
> 
>> +
>> +Groups:
>> +  KVM_DEV_ARM_PMU_GRP_IRQ
>> +  Attributes:
>> +    The attr field of kvm_device_attr encodes one value:
>> +    bits:     | 63 .... 32 | 31 ....  0 |
>> +    values:   |  reserved  | vcpu_index |
> 
> Everywhere else in kvm documentation a vcpu_index is 8 bits. I'm not
> saying that that's good, but expanding it to 32 bits here is
> inconsistent. 
Expand it to 32 bits just in case it needs to support more than 256
vcpus in the future. If you think this is not good, I'll change it to 8
bits.

(A side note is that I think we should s/vcpu_index/vcpu_id/
> in order to keep the parameter name consistent with what's used by
> KVM_CREATE_VCPU)
> 
Eh, I make it consistent with vGIC which is a kvm device that's also
created via KVM_CREATE_DEVICE API. So they are different names but
express same thing.

>> +    A value describing the PMU overflow interrupt number for the specified
>> +    vcpu_index vcpu. This interrupt could be a PPI or SPI, but for one VM the
>> +    interrupt type must be same for each vcpu. As a PPI, the interrupt number is
>> +    same for all vcpus, while as a SPI it must be different for each vcpu.
>> +
>> +  Errors:
>> +    -ENXIO: Unsupported attribute group
> 
> Do we need to specify ENXIO here? It's already specified in
> Documentation/virtual/kvm/api.txt for SET/GET_DEVICE_ATTR
> 
But specifying it here is not bad, right? If you insist on this, I'll
drop it.

>> +    -EBUSY: The PMU overflow interrupt is already set
>> +    -ENODEV: Getting the PMU overflow interrupt number while it's not set
>> +    -EINVAL: Invalid vcpu_index or PMU overflow interrupt number supplied
>> diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
>> index 2d4ca4b..cbb9022 100644
>> --- a/arch/arm64/include/uapi/asm/kvm.h
>> +++ b/arch/arm64/include/uapi/asm/kvm.h
>> @@ -204,6 +204,10 @@ struct kvm_arch_memory_slot {
>>  #define KVM_DEV_ARM_VGIC_GRP_CTRL	4
>>  #define   KVM_DEV_ARM_VGIC_CTRL_INIT	0
>>  
>> +/* Device Control API: ARM PMU */
>> +#define KVM_DEV_ARM_PMU_GRP_IRQ		0
>> +#define   KVM_DEV_ARM_PMU_CPUID_MASK	0xffffffffULL
> 
> I don't think we should need a mask like this, at least not in the uapi.
> The documentation says to use a vcpu-index [really a vcpu-id], and users
> should know how to convert their vcpu handle (whatever that may be) to
> a vcpu-id using whatever transformation is necessary. They may already
> have a "vcpu id mask" defined (this is one reason why I don't think we
> should use a 32-bit vcpu-index here, instead of the 8-bit vcpu-index used
> everywhere else). Likewise, kvm should know how to do it's transformation.
> Maybe, to aid in the attr field construction, we should supply a shift,
> allowing both sides to do something like
> 
> int pmu_attr_extract_vcpu_id(u64 attr)
> {
>     return (attr >> KVM_DEV_ARM_PMU_VCPUID_SHIFT) & VCPU_ID_MASK;
So what's the value of the VCPU_ID_MASK? I didn't see any definitions
about it. It looks like KVM_DEV_ARM_PMU_CPUID_MASK(just has a difference
between 32 bits and 8 bits)

> }
> 
> u64 pmu_attr_create(int vcpu_id)
> {
>     return vcpu_id << KVM_DEV_ARM_PMU_VCPUID_SHIFT;
> }
> 
> But, in this case the shift is zero, so it's not really necessary. In
> any case, please add the 'V' for VCPU.
> 
>> +
>>  /* KVM_IRQ_LINE irq field index values */
>>  #define KVM_ARM_IRQ_TYPE_SHIFT		24
>>  #define KVM_ARM_IRQ_TYPE_MASK		0xff
>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>> index c923350..608dea6 100644
>> --- a/include/linux/kvm_host.h
>> +++ b/include/linux/kvm_host.h
>> @@ -1161,6 +1161,7 @@ extern struct kvm_device_ops kvm_mpic_ops;
>>  extern struct kvm_device_ops kvm_xics_ops;
>>  extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
>>  extern struct kvm_device_ops kvm_arm_vgic_v3_ops;
>> +extern struct kvm_device_ops kvm_arm_pmu_ops;
>>  
>>  #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
>>  
>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>> index 03f3618..4ba6fdd 100644
>> --- a/include/uapi/linux/kvm.h
>> +++ b/include/uapi/linux/kvm.h
>> @@ -1032,6 +1032,8 @@ enum kvm_device_type {
>>  #define KVM_DEV_TYPE_FLIC		KVM_DEV_TYPE_FLIC
>>  	KVM_DEV_TYPE_ARM_VGIC_V3,
>>  #define KVM_DEV_TYPE_ARM_VGIC_V3	KVM_DEV_TYPE_ARM_VGIC_V3
>> +	KVM_DEV_TYPE_ARM_PMU_V3,
>> +#define	KVM_DEV_TYPE_ARM_PMU_V3		KVM_DEV_TYPE_ARM_PMU_V3
>>  	KVM_DEV_TYPE_MAX,
>>  };
>>  
>> diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
>> index 3ec3cdd..5518308 100644
>> --- a/virt/kvm/arm/pmu.c
>> +++ b/virt/kvm/arm/pmu.c
>> @@ -19,6 +19,7 @@
>>  #include <linux/kvm.h>
>>  #include <linux/kvm_host.h>
>>  #include <linux/perf_event.h>
>> +#include <linux/uaccess.h>
>>  #include <asm/kvm_emulate.h>
>>  #include <kvm/arm_pmu.h>
>>  #include <kvm/arm_vgic.h>
>> @@ -374,3 +375,130 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
>>  
>>  	pmc->perf_event = event;
>>  }
>> +
>> +static inline bool kvm_arm_pmu_initialized(struct kvm_vcpu *vcpu)
>> +{
>> +	return vcpu->arch.pmu.irq_num != -1;
>> +}
>> +
>> +static int kvm_arm_pmu_irq_access(struct kvm *kvm, struct kvm_device_attr *attr,
>> +				  int *irq, bool is_set)
>> +{
>> +	int cpuid;
> 
> please call this vcpu_id
> 
>> +	struct kvm_vcpu *vcpu;
>> +	struct kvm_pmu *pmu;
>> +
>> +	cpuid = attr->attr & KVM_DEV_ARM_PMU_CPUID_MASK;
>> +	if (cpuid >= atomic_read(&kvm->online_vcpus))
>> +		return -EINVAL;
>> +
>> +	vcpu = kvm_get_vcpu(kvm, cpuid);
>> +	if (!vcpu)
>> +		return -EINVAL;
>> +
>> +	pmu = &vcpu->arch.pmu;
>> +	if (!is_set) {
>> +		if (!kvm_arm_pmu_initialized(vcpu))
>> +			return -ENODEV;
>> +
>> +		*irq = pmu->irq_num;
>> +	} else {
>> +		if (kvm_arm_pmu_initialized(vcpu))
>> +			return -EBUSY;
>> +
>> +		kvm_debug("Set kvm ARM PMU irq: %d\n", *irq);
>> +		pmu->irq_num = *irq;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static int kvm_arm_pmu_create(struct kvm_device *dev, u32 type)
>> +{
>> +	int i;
>> +	struct kvm_vcpu *vcpu;
>> +	struct kvm *kvm = dev->kvm;
>> +
>> +	kvm_for_each_vcpu(i, vcpu, kvm) {
>> +		struct kvm_pmu *pmu = &vcpu->arch.pmu;
>> +
>> +		memset(pmu, 0, sizeof(*pmu));
> 
> I don't think we want this memset. If we can only create the pmu
> once, then it's unnecessary (we zalloc vcpus). And, if we can
> recreate pmus with this call, then it'll create a memory leak, as
> we'll be zero-ing out all the perf_event pointers, and then won't
> be able to free them on the call to kvm_pmu_vcpu_reset. Naturally
> we need to make sure we're NULL-ing them after each free instead.
> 
Ok, will drop this.

>> +		kvm_pmu_vcpu_reset(vcpu);
>> +		pmu->irq_num = -1;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static void kvm_arm_pmu_destroy(struct kvm_device *dev)
>> +{
>> +	kfree(dev);
>> +}
>> +
>> +static int kvm_arm_pmu_set_attr(struct kvm_device *dev,
>> +				struct kvm_device_attr *attr)
>> +{
>> +	switch (attr->group) {
>> +	case KVM_DEV_ARM_PMU_GRP_IRQ: {
>> +		int __user *uaddr = (int __user *)(long)attr->addr;
>> +		int reg;
>> +
>> +		if (get_user(reg, uaddr))
>> +			return -EFAULT;
>> +
>> +		/*
>> +		 * The PMU overflow interrupt could be a PPI or SPI, but for one
>> +		 * VM the interrupt type must be same for each vcpu. As a PPI,
>> +		 * the interrupt number is same for all vcpus, while as a SPI it
>> +		 * must be different for each vcpu.
>> +		 */
>> +		if (reg < VGIC_NR_SGIS || reg >= dev->kvm->arch.vgic.nr_irqs)
>> +			return -EINVAL;
>> +
>> +		return kvm_arm_pmu_irq_access(dev->kvm, attr, &reg, true);
>> +	}
>> +	}
>> +
>> +	return -ENXIO;
>> +}
>> +
>> +static int kvm_arm_pmu_get_attr(struct kvm_device *dev,
>> +				struct kvm_device_attr *attr)
>> +{
>> +	int ret;
>> +
>> +	switch (attr->group) {
>> +	case KVM_DEV_ARM_PMU_GRP_IRQ: {
>> +		int __user *uaddr = (int __user *)(long)attr->addr;
>> +		int reg = -1;
>> +
>> +
>> +		ret = kvm_arm_pmu_irq_access(dev->kvm, attr, &reg, false);
>> +		if (ret)
>> +			return ret;
>> +		return put_user(reg, uaddr);
>> +	}
>> +	}
>> +
>> +	return -ENXIO;
>> +}
>> +
>> +static int kvm_arm_pmu_has_attr(struct kvm_device *dev,
>> +				struct kvm_device_attr *attr)
>> +{
>> +	switch (attr->group) {
>> +	case KVM_DEV_ARM_PMU_GRP_IRQ:
>> +		return 0;
>> +	}
>> +
>> +	return -ENXIO;
>> +}
>> +
>> +struct kvm_device_ops kvm_arm_pmu_ops = {
>> +	.name = "kvm-arm-pmu",
>> +	.create = kvm_arm_pmu_create,
>> +	.destroy = kvm_arm_pmu_destroy,
>> +	.set_attr = kvm_arm_pmu_set_attr,
>> +	.get_attr = kvm_arm_pmu_get_attr,
>> +	.has_attr = kvm_arm_pmu_has_attr,
>> +};
>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>> index 484079e..81a42cc 100644
>> --- a/virt/kvm/kvm_main.c
>> +++ b/virt/kvm/kvm_main.c
>> @@ -2647,6 +2647,10 @@ static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
>>  #ifdef CONFIG_KVM_XICS
>>  	[KVM_DEV_TYPE_XICS]		= &kvm_xics_ops,
>>  #endif
>> +
>> +#ifdef CONFIG_KVM_ARM_PMU
>> +	[KVM_DEV_TYPE_ARM_PMU_V3]	= &kvm_arm_pmu_ops,
> 
> Shouldn't we specify 'v3' in the kvm_arm_pmu_ops name, as we do with the
> device type name?
> 
Sure, will add.

>> +#endif
>>  };
>>  
>>  int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type)
>> -- 
>> 2.0.4
>>
> 
> Thanks,
> drew
> 
> .
>
Shannon Zhao Jan. 8, 2016, 3:06 a.m. UTC | #8
On 2016/1/7 22:56, Peter Maydell wrote:
>>>> +  Errors:
>>>> >>> +    -ENXIO: Unsupported attribute group
>>>> >>> +    -EBUSY: The PMU overflow interrupt is already set
>>>> >>> +    -ENODEV: Getting the PMU overflow interrupt number while it's not set
>>>> >>> +    -EINVAL: Invalid vcpu_index or PMU overflow interrupt number supplied
>>> >>
>>> >> What happens if you create a PMU but then never set the IRQ number?
>>> >> Is there a default or does the VM refuse to run or something?
>>> >>
>> > If userspace doesn't specify the irq number, the guest will not receive
>> > the PMU interrupt because we check if the irq is initialized when we
>> > inject the interrupt. But guest could still use the vPMU if QEMU
>> > generates a proper DTB or ACPI.
> So is it a valid use case to create a PMU with the interrupt not wired
> up to anything? (If it's never valid it would be nice to diagnose it
> rather than just silently letting the guest run but not work right.)
So how about adding a helper to check if the PMU is completely
initialized and if not, return trap_raz_wi when guest access PMU registers?

Thanks,
Peter Maydell Jan. 8, 2016, 10:24 a.m. UTC | #9
On 8 January 2016 at 03:06, Shannon Zhao <zhaoshenglong@huawei.com> wrote:
>
>
> On 2016/1/7 22:56, Peter Maydell wrote:
>>>>> +  Errors:
>>>>> >>> +    -ENXIO: Unsupported attribute group
>>>>> >>> +    -EBUSY: The PMU overflow interrupt is already set
>>>>> >>> +    -ENODEV: Getting the PMU overflow interrupt number while it's not set
>>>>> >>> +    -EINVAL: Invalid vcpu_index or PMU overflow interrupt number supplied
>>>> >>
>>>> >> What happens if you create a PMU but then never set the IRQ number?
>>>> >> Is there a default or does the VM refuse to run or something?
>>>> >>
>>> > If userspace doesn't specify the irq number, the guest will not receive
>>> > the PMU interrupt because we check if the irq is initialized when we
>>> > inject the interrupt. But guest could still use the vPMU if QEMU
>>> > generates a proper DTB or ACPI.
>> So is it a valid use case to create a PMU with the interrupt not wired
>> up to anything? (If it's never valid it would be nice to diagnose it
>> rather than just silently letting the guest run but not work right.)
> So how about adding a helper to check if the PMU is completely
> initialized and if not, return trap_raz_wi when guest access PMU registers?

That would still be letting the guest run and not work right.

If we have an "I have now completed config" ioctl like the GIC does,
then you could make that ioctl return an error if the config is not
valid.

thanks
-- PMM
Andrew Jones Jan. 8, 2016, 11:22 a.m. UTC | #10
On Fri, Jan 08, 2016 at 10:53:03AM +0800, Shannon Zhao wrote:
> 
> 
> On 2016/1/8 4:18, Andrew Jones wrote:
> > On Tue, Dec 22, 2015 at 04:08:15PM +0800, Shannon Zhao wrote:
> >> From: Shannon Zhao <shannon.zhao@linaro.org>
> >>
> >> Add a new kvm device type KVM_DEV_TYPE_ARM_PMU_V3 for ARM PMU. Implement
> >> the kvm_device_ops for it.
> >>
> >> Signed-off-by: Shannon Zhao <shannon.zhao@linaro.org>
> >> ---
> >>  Documentation/virtual/kvm/devices/arm-pmu.txt |  24 +++++
> >>  arch/arm64/include/uapi/asm/kvm.h             |   4 +
> >>  include/linux/kvm_host.h                      |   1 +
> >>  include/uapi/linux/kvm.h                      |   2 +
> >>  virt/kvm/arm/pmu.c                            | 128 ++++++++++++++++++++++++++
> >>  virt/kvm/kvm_main.c                           |   4 +
> >>  6 files changed, 163 insertions(+)
> >>  create mode 100644 Documentation/virtual/kvm/devices/arm-pmu.txt
> >>
> >> diff --git a/Documentation/virtual/kvm/devices/arm-pmu.txt b/Documentation/virtual/kvm/devices/arm-pmu.txt
> >> new file mode 100644
> >> index 0000000..dda864e
> >> --- /dev/null
> >> +++ b/Documentation/virtual/kvm/devices/arm-pmu.txt
> >> @@ -0,0 +1,24 @@
> >> +ARM Virtual Performance Monitor Unit (vPMU)
> >> +===========================================
> >> +
> >> +Device types supported:
> >> +  KVM_DEV_TYPE_ARM_PMU_V3         ARM Performance Monitor Unit v3
> >> +
> >> +Instantiate one PMU instance for per VCPU through this API.
> >                                 ^^ don't need this 'for' here
> > 
> >> +
> >> +Groups:
> >> +  KVM_DEV_ARM_PMU_GRP_IRQ
> >> +  Attributes:
> >> +    The attr field of kvm_device_attr encodes one value:
> >> +    bits:     | 63 .... 32 | 31 ....  0 |
> >> +    values:   |  reserved  | vcpu_index |
> > 
> > Everywhere else in kvm documentation a vcpu_index is 8 bits. I'm not
> > saying that that's good, but expanding it to 32 bits here is
> > inconsistent. 
> Expand it to 32 bits just in case it needs to support more than 256
> vcpus in the future. If you think this is not good, I'll change it to 8
> bits.

I agree that eventually we'll want more than 256 vcpus. However the kvm
api already has the concept of a vcpu-index, which shows up in two other
places, KVM_IRQ_LINE and KVM_DEV_ARM_VGIC_*. In both those other places
it's only 8 bits. While there may not be a KVM_VCPU_INDEX_MASK=0xff
somewhere, I still think we should keep things consistent.

Hmm, or each device should define its own index. Actually, that sounds
better, as each device may have a different max-vcpus limit; KVM_IRQ_LINE
has 8 bits due to the x86 apic supporting 8 bits, gicv2 could have gotten
by with only 3 bits, and gicv3 can use much, much more. When we want more
than 256 vcpus we'll need a new kvm-irq-line ioctl, but, as for the arm
devices, we could avoid the problem completely by extending the
SET/GET_DEVICE_ATTR ioctl to also be a vcpu ioctl.

> 
> (A side note is that I think we should s/vcpu_index/vcpu_id/
> > in order to keep the parameter name consistent with what's used by
> > KVM_CREATE_VCPU)
> > 
> Eh, I make it consistent with vGIC which is a kvm device that's also
> created via KVM_CREATE_DEVICE API. So they are different names but
> express same thing.

I knew you are consistent with arm-vgic.txt, and also with KVM_IRQ_LINE.
I was just thinking that it'd be nice for everything to be consistent
with KVM_CREATE_VCPU. However, now I'm of the mind that the vcpu-id
(32 bits) should always be independent of device vcpu-indexes, as we
don't want to limit the number of vcpus (by limiting vcpu-ids) to the
maximum of the most-limited device we support.

> 
> >> +    A value describing the PMU overflow interrupt number for the specified
> >> +    vcpu_index vcpu. This interrupt could be a PPI or SPI, but for one VM the
> >> +    interrupt type must be same for each vcpu. As a PPI, the interrupt number is
> >> +    same for all vcpus, while as a SPI it must be different for each vcpu.
> >> +
> >> +  Errors:
> >> +    -ENXIO: Unsupported attribute group
> > 
> > Do we need to specify ENXIO here? It's already specified in
> > Documentation/virtual/kvm/api.txt for SET/GET_DEVICE_ATTR
> > 
> But specifying it here is not bad, right? If you insist on this, I'll
> drop it.

It's fine, as the specification in SET/GET_DEVICE_ATTR section isn't
going to change, but we require users of this api to read that section
already, so I feel it's redundant and prefer to avoid redundancy. I won't
insist.

> 
> >> +    -EBUSY: The PMU overflow interrupt is already set
> >> +    -ENODEV: Getting the PMU overflow interrupt number while it's not set
> >> +    -EINVAL: Invalid vcpu_index or PMU overflow interrupt number supplied
> >> diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
> >> index 2d4ca4b..cbb9022 100644
> >> --- a/arch/arm64/include/uapi/asm/kvm.h
> >> +++ b/arch/arm64/include/uapi/asm/kvm.h
> >> @@ -204,6 +204,10 @@ struct kvm_arch_memory_slot {
> >>  #define KVM_DEV_ARM_VGIC_GRP_CTRL	4
> >>  #define   KVM_DEV_ARM_VGIC_CTRL_INIT	0
> >>  
> >> +/* Device Control API: ARM PMU */
> >> +#define KVM_DEV_ARM_PMU_GRP_IRQ		0
> >> +#define   KVM_DEV_ARM_PMU_CPUID_MASK	0xffffffffULL
> > 
> > I don't think we should need a mask like this, at least not in the uapi.
> > The documentation says to use a vcpu-index [really a vcpu-id], and users
> > should know how to convert their vcpu handle (whatever that may be) to
> > a vcpu-id using whatever transformation is necessary. They may already
> > have a "vcpu id mask" defined (this is one reason why I don't think we
> > should use a 32-bit vcpu-index here, instead of the 8-bit vcpu-index used
> > everywhere else). Likewise, kvm should know how to do it's transformation.
> > Maybe, to aid in the attr field construction, we should supply a shift,
> > allowing both sides to do something like
> > 
> > int pmu_attr_extract_vcpu_id(u64 attr)
> > {
> >     return (attr >> KVM_DEV_ARM_PMU_VCPUID_SHIFT) & VCPU_ID_MASK;
> So what's the value of the VCPU_ID_MASK? I didn't see any definitions
> about it. It looks like KVM_DEV_ARM_PMU_CPUID_MASK(just has a difference
> between 32 bits and 8 bits)

Whether or not a user would define a VCPU_ID_MASK == 0xff depends on
whether or not they understand 'vcpu-index' to be a consistent concept
everywhere they see it in the documentation. I now would prefer
vcpu-index being device specific. Going that way, then I think the
documentation could be updated to help make that clearer, e.g. calling
the gic vcpu-index gic_vcpu_index or something.

> 
> > }
> > 
> > u64 pmu_attr_create(int vcpu_id)
> > {
> >     return vcpu_id << KVM_DEV_ARM_PMU_VCPUID_SHIFT;
> > }
> > 
> > But, in this case the shift is zero, so it's not really necessary. In
> > any case, please add the 'V' for VCPU.
> > 
> >> +
> >>  /* KVM_IRQ_LINE irq field index values */
> >>  #define KVM_ARM_IRQ_TYPE_SHIFT		24
> >>  #define KVM_ARM_IRQ_TYPE_MASK		0xff
> >> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> >> index c923350..608dea6 100644
> >> --- a/include/linux/kvm_host.h
> >> +++ b/include/linux/kvm_host.h
> >> @@ -1161,6 +1161,7 @@ extern struct kvm_device_ops kvm_mpic_ops;
> >>  extern struct kvm_device_ops kvm_xics_ops;
> >>  extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
> >>  extern struct kvm_device_ops kvm_arm_vgic_v3_ops;
> >> +extern struct kvm_device_ops kvm_arm_pmu_ops;
> >>  
> >>  #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
> >>  
> >> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> >> index 03f3618..4ba6fdd 100644
> >> --- a/include/uapi/linux/kvm.h
> >> +++ b/include/uapi/linux/kvm.h
> >> @@ -1032,6 +1032,8 @@ enum kvm_device_type {
> >>  #define KVM_DEV_TYPE_FLIC		KVM_DEV_TYPE_FLIC
> >>  	KVM_DEV_TYPE_ARM_VGIC_V3,
> >>  #define KVM_DEV_TYPE_ARM_VGIC_V3	KVM_DEV_TYPE_ARM_VGIC_V3
> >> +	KVM_DEV_TYPE_ARM_PMU_V3,
> >> +#define	KVM_DEV_TYPE_ARM_PMU_V3		KVM_DEV_TYPE_ARM_PMU_V3
> >>  	KVM_DEV_TYPE_MAX,
> >>  };
> >>  
> >> diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
> >> index 3ec3cdd..5518308 100644
> >> --- a/virt/kvm/arm/pmu.c
> >> +++ b/virt/kvm/arm/pmu.c
> >> @@ -19,6 +19,7 @@
> >>  #include <linux/kvm.h>
> >>  #include <linux/kvm_host.h>
> >>  #include <linux/perf_event.h>
> >> +#include <linux/uaccess.h>
> >>  #include <asm/kvm_emulate.h>
> >>  #include <kvm/arm_pmu.h>
> >>  #include <kvm/arm_vgic.h>
> >> @@ -374,3 +375,130 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
> >>  
> >>  	pmc->perf_event = event;
> >>  }
> >> +
> >> +static inline bool kvm_arm_pmu_initialized(struct kvm_vcpu *vcpu)
> >> +{
> >> +	return vcpu->arch.pmu.irq_num != -1;
> >> +}
> >> +
> >> +static int kvm_arm_pmu_irq_access(struct kvm *kvm, struct kvm_device_attr *attr,
> >> +				  int *irq, bool is_set)
> >> +{
> >> +	int cpuid;
> > 
> > please call this vcpu_id
> > 
> >> +	struct kvm_vcpu *vcpu;
> >> +	struct kvm_pmu *pmu;
> >> +
> >> +	cpuid = attr->attr & KVM_DEV_ARM_PMU_CPUID_MASK;
> >> +	if (cpuid >= atomic_read(&kvm->online_vcpus))
> >> +		return -EINVAL;
> >> +
> >> +	vcpu = kvm_get_vcpu(kvm, cpuid);
> >> +	if (!vcpu)
> >> +		return -EINVAL;
> >> +
> >> +	pmu = &vcpu->arch.pmu;
> >> +	if (!is_set) {
> >> +		if (!kvm_arm_pmu_initialized(vcpu))
> >> +			return -ENODEV;
> >> +
> >> +		*irq = pmu->irq_num;
> >> +	} else {
> >> +		if (kvm_arm_pmu_initialized(vcpu))
> >> +			return -EBUSY;
> >> +
> >> +		kvm_debug("Set kvm ARM PMU irq: %d\n", *irq);
> >> +		pmu->irq_num = *irq;
> >> +	}
> >> +
> >> +	return 0;
> >> +}
> >> +
> >> +static int kvm_arm_pmu_create(struct kvm_device *dev, u32 type)
> >> +{
> >> +	int i;
> >> +	struct kvm_vcpu *vcpu;
> >> +	struct kvm *kvm = dev->kvm;
> >> +
> >> +	kvm_for_each_vcpu(i, vcpu, kvm) {
> >> +		struct kvm_pmu *pmu = &vcpu->arch.pmu;
> >> +
> >> +		memset(pmu, 0, sizeof(*pmu));
> > 
> > I don't think we want this memset. If we can only create the pmu
> > once, then it's unnecessary (we zalloc vcpus). And, if we can
> > recreate pmus with this call, then it'll create a memory leak, as
> > we'll be zero-ing out all the perf_event pointers, and then won't
> > be able to free them on the call to kvm_pmu_vcpu_reset. Naturally
> > we need to make sure we're NULL-ing them after each free instead.
> > 
> Ok, will drop this.
> 
> >> +		kvm_pmu_vcpu_reset(vcpu);
> >> +		pmu->irq_num = -1;
> >> +	}
> >> +
> >> +	return 0;
> >> +}
> >> +
> >> +static void kvm_arm_pmu_destroy(struct kvm_device *dev)
> >> +{
> >> +	kfree(dev);
> >> +}
> >> +
> >> +static int kvm_arm_pmu_set_attr(struct kvm_device *dev,
> >> +				struct kvm_device_attr *attr)
> >> +{
> >> +	switch (attr->group) {
> >> +	case KVM_DEV_ARM_PMU_GRP_IRQ: {
> >> +		int __user *uaddr = (int __user *)(long)attr->addr;
> >> +		int reg;
> >> +
> >> +		if (get_user(reg, uaddr))
> >> +			return -EFAULT;
> >> +
> >> +		/*
> >> +		 * The PMU overflow interrupt could be a PPI or SPI, but for one
> >> +		 * VM the interrupt type must be same for each vcpu. As a PPI,
> >> +		 * the interrupt number is same for all vcpus, while as a SPI it
> >> +		 * must be different for each vcpu.
> >> +		 */
> >> +		if (reg < VGIC_NR_SGIS || reg >= dev->kvm->arch.vgic.nr_irqs)
> >> +			return -EINVAL;
> >> +
> >> +		return kvm_arm_pmu_irq_access(dev->kvm, attr, &reg, true);
> >> +	}
> >> +	}
> >> +
> >> +	return -ENXIO;
> >> +}
> >> +
> >> +static int kvm_arm_pmu_get_attr(struct kvm_device *dev,
> >> +				struct kvm_device_attr *attr)
> >> +{
> >> +	int ret;
> >> +
> >> +	switch (attr->group) {
> >> +	case KVM_DEV_ARM_PMU_GRP_IRQ: {
> >> +		int __user *uaddr = (int __user *)(long)attr->addr;
> >> +		int reg = -1;
> >> +
> >> +
> >> +		ret = kvm_arm_pmu_irq_access(dev->kvm, attr, &reg, false);
> >> +		if (ret)
> >> +			return ret;
> >> +		return put_user(reg, uaddr);
> >> +	}
> >> +	}
> >> +
> >> +	return -ENXIO;
> >> +}
> >> +
> >> +static int kvm_arm_pmu_has_attr(struct kvm_device *dev,
> >> +				struct kvm_device_attr *attr)
> >> +{
> >> +	switch (attr->group) {
> >> +	case KVM_DEV_ARM_PMU_GRP_IRQ:
> >> +		return 0;
> >> +	}
> >> +
> >> +	return -ENXIO;
> >> +}
> >> +
> >> +struct kvm_device_ops kvm_arm_pmu_ops = {
> >> +	.name = "kvm-arm-pmu",
> >> +	.create = kvm_arm_pmu_create,
> >> +	.destroy = kvm_arm_pmu_destroy,
> >> +	.set_attr = kvm_arm_pmu_set_attr,
> >> +	.get_attr = kvm_arm_pmu_get_attr,
> >> +	.has_attr = kvm_arm_pmu_has_attr,
> >> +};
> >> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> >> index 484079e..81a42cc 100644
> >> --- a/virt/kvm/kvm_main.c
> >> +++ b/virt/kvm/kvm_main.c
> >> @@ -2647,6 +2647,10 @@ static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
> >>  #ifdef CONFIG_KVM_XICS
> >>  	[KVM_DEV_TYPE_XICS]		= &kvm_xics_ops,
> >>  #endif
> >> +
> >> +#ifdef CONFIG_KVM_ARM_PMU
> >> +	[KVM_DEV_TYPE_ARM_PMU_V3]	= &kvm_arm_pmu_ops,
> > 
> > Shouldn't we specify 'v3' in the kvm_arm_pmu_ops name, as we do with the
> > device type name?
> > 
> Sure, will add.
> 
> >> +#endif
> >>  };
> >>  
> >>  int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type)
> >> -- 
> >> 2.0.4
> >>

Thanks,
drew
Shannon Zhao Jan. 8, 2016, 12:15 p.m. UTC | #11
On 2016/1/8 18:24, Peter Maydell wrote:
> On 8 January 2016 at 03:06, Shannon Zhao<zhaoshenglong@huawei.com>  wrote:
>> >
>> >
>> >On 2016/1/7 22:56, Peter Maydell wrote:
>>>>>> >>>>>+  Errors:
>>>>>>>>> >>>>> >>>+    -ENXIO: Unsupported attribute group
>>>>>>>>> >>>>> >>>+    -EBUSY: The PMU overflow interrupt is already set
>>>>>>>>> >>>>> >>>+    -ENODEV: Getting the PMU overflow interrupt number while it's not set
>>>>>>>>> >>>>> >>>+    -EINVAL: Invalid vcpu_index or PMU overflow interrupt number supplied
>>>>>>> >>>> >>
>>>>>>> >>>> >>What happens if you create a PMU but then never set the IRQ number?
>>>>>>> >>>> >>Is there a default or does the VM refuse to run or something?
>>>>>>> >>>> >>
>>>>> >>> >If userspace doesn't specify the irq number, the guest will not receive
>>>>> >>> >the PMU interrupt because we check if the irq is initialized when we
>>>>> >>> >inject the interrupt. But guest could still use the vPMU if QEMU
>>>>> >>> >generates a proper DTB or ACPI.
>>> >>So is it a valid use case to create a PMU with the interrupt not wired
>>> >>up to anything? (If it's never valid it would be nice to diagnose it
>>> >>rather than just silently letting the guest run but not work right.)
>> >So how about adding a helper to check if the PMU is completely
>> >initialized and if not, return trap_raz_wi when guest access PMU registers?
> That would still be letting the guest run and not work right.
>
> If we have an "I have now completed config" ioctl like the GIC does,
> then you could make that ioctl return an error if the config is not
> valid.

Firstly, userspace will call kvm device ioctl to create the PMU device, 
if it fails it will return an error.
Secondly, userspace will call SET_DEVICE_ATTR ioctl to configure the PMU 
irq number. If the irq is invalid, this will return an error and the 
userspace will stop if it takes care the error. If the irq is valid, the 
PMU is well initialized.

If userspace doesn't call SET_ATTR ioctl to configure the PMU irq, the 
PMU is not well initialized and guest can't use the PMU.

I didn't see anywhere that needs to call something like 
KVM_DEV_ARM_VGIC_CTRL_INIT to let kvm initialize PMU because it will be 
initialized when userspace configure the irq correctly. This is a little 
different with the vGIC I think.

Thanks,
Peter Maydell Jan. 8, 2016, 12:56 p.m. UTC | #12
On 8 January 2016 at 12:15, Shannon Zhao <shannon.zhao@linaro.org> wrote:
> Firstly, userspace will call kvm device ioctl to create the PMU device, if
> it fails it will return an error.
> Secondly, userspace will call SET_DEVICE_ATTR ioctl to configure the PMU irq
> number. If the irq is invalid, this will return an error and the userspace
> will stop if it takes care the error. If the irq is valid, the PMU is well
> initialized.
>
> If userspace doesn't call SET_ATTR ioctl to configure the PMU irq, the PMU
> is not well initialized and guest can't use the PMU.

Yes, this is my point. If you require userspace to do:
 * create PMU device
 * configure PMU device via SET_DEVICE_ATTR
 * complete init of PMU device via a KVM_DEV_ARM_PMU_CTRL_INIT attr

then any bugs on the userspace side where it isn't configuring
correctly can be easily caught, and will turn into a "VM fails
to start with helpful error message", rather than "VM starts but
doesn't work in an obscure way".

This is particularly important for devices which have userspace
visible registers, because userspace will also want to be able
to read and write state of those registers before first vcpu run.
If there is no "init is complete" ioctl then the kernel code ends
up having to guess when init has completed for various entry points
where the guest does something else (register access, vcpu start, etc).

It might happen to not be so important for the PMU, but I think it
would be very useful if we have a pattern where every device that
is created via this API works in the same way (and it means that if
we end up extending the PMU in future we will already have the API
in place and don't need to worry about legacy userspace).

Speaking of userspace access to registers, what is the intended API
for migrating the PMU register state ?

thanks
-- PMM
Shannon Zhao Jan. 8, 2016, 1:31 p.m. UTC | #13
On 2016/1/8 20:56, Peter Maydell wrote:
> On 8 January 2016 at 12:15, Shannon Zhao <shannon.zhao@linaro.org> wrote:
>> Firstly, userspace will call kvm device ioctl to create the PMU device, if
>> it fails it will return an error.
>> Secondly, userspace will call SET_DEVICE_ATTR ioctl to configure the PMU irq
>> number. If the irq is invalid, this will return an error and the userspace
>> will stop if it takes care the error. If the irq is valid, the PMU is well
>> initialized.
>>
>> If userspace doesn't call SET_ATTR ioctl to configure the PMU irq, the PMU
>> is not well initialized and guest can't use the PMU.
>
> Yes, this is my point. If you require userspace to do:
>   * create PMU device
>   * configure PMU device via SET_DEVICE_ATTR
>   * complete init of PMU device via a KVM_DEV_ARM_PMU_CTRL_INIT attr
>
> then any bugs on the userspace side where it isn't configuring
> correctly can be easily caught, and will turn into a "VM fails
> to start with helpful error message", rather than "VM starts but
> doesn't work in an obscure way".
>
> This is particularly important for devices which have userspace
> visible registers, because userspace will also want to be able
> to read and write state of those registers before first vcpu run.
> If there is no "init is complete" ioctl then the kernel code ends
> up having to guess when init has completed for various entry points
> where the guest does something else (register access, vcpu start, etc).
>
> It might happen to not be so important for the PMU, but I think it
> would be very useful if we have a pattern where every device that
> is created via this API works in the same way (and it means that if
> we end up extending the PMU in future we will already have the API
> in place and don't need to worry about legacy userspace).
>
> Speaking of userspace access to registers, what is the intended API
> for migrating the PMU register state ?
>
Since the PMU register states are stored in sys_regs[NR_SYS_REGS], they 
could be migrated with other sys registers. There is only the "struct 
perf_event *perf_event" we should take care of I think. I didn't figure 
out a good way for this especially when guest is using the perf.

Thanks,
Andrew Jones Jan. 8, 2016, 3:20 p.m. UTC | #14
On Fri, Jan 08, 2016 at 12:22:13PM +0100, Andrew Jones wrote:
> When we want more than 256 vcpus we'll need a new kvm-irq-line ioctl,
> but, as for the arm devices, we could avoid the problem completely by
> extending the SET/GET_DEVICE_ATTR ioctl to also be a vcpu ioctl.

Replying to myself on this comment. The arm-gic device is fine as is, as
it really is a vm device. However I'm convincing myself more and more
(and with the help of Radim Krcmar being my sounding board) that using
the device api for the pmu is wrong (at least as a per-vm device). The
pmu is a per-vcpu device.

I think we should consider adding KVM_CREATE_DEVICE, KVM_SET/GET_DEVICE_ATTR
to the vcpu ioctl to allow per-vcpu devices. Then, instead of looping vcpus
in the kernel to init the pmu per vcpu on the call to KVM_CREATE_DEVICE,
we'd have the more natural looping in userspace. Also, the set-attr for the
irq would be part of that loop, and no longer need a vcpu-index parameter.

Thanks,
drew
Andrew Jones Jan. 8, 2016, 3:59 p.m. UTC | #15
On Fri, Jan 08, 2016 at 04:20:28PM +0100, Andrew Jones wrote:
> On Fri, Jan 08, 2016 at 12:22:13PM +0100, Andrew Jones wrote:
> > When we want more than 256 vcpus we'll need a new kvm-irq-line ioctl,
> > but, as for the arm devices, we could avoid the problem completely by
> > extending the SET/GET_DEVICE_ATTR ioctl to also be a vcpu ioctl.
> 
> Replying to myself on this comment. The arm-gic device is fine as is, as
> it really is a vm device. However I'm convincing myself more and more
> (and with the help of Radim Krcmar being my sounding board) that using
> the device api for the pmu is wrong (at least as a per-vm device). The
> pmu is a per-vcpu device.
> 
> I think we should consider adding KVM_CREATE_DEVICE, KVM_SET/GET_DEVICE_ATTR
> to the vcpu ioctl to allow per-vcpu devices. Then, instead of looping vcpus
> in the kernel to init the pmu per vcpu on the call to KVM_CREATE_DEVICE,
> we'd have the more natural looping in userspace. Also, the set-attr for the
> irq would be part of that loop, and no longer need a vcpu-index parameter.
>

Another note, for this pmu device we don't actually need KVM_CREATE_DEVICE
to be a vcpu ioctl. We can just add another vcpu-init feature flag that we
set with KVM_ARM_VCPU_INIT in order to "create" the pmu. We still need to
add KVM_SET/GET_DEVICE_ATTR to the vcpu ioctl though, allowing the irq to
be set. KVM_ARM_PMU_GRP_IRQ will just become a vcpu attribute.

Thanks,
drew
Christoffer Dall Jan. 9, 2016, 12:29 p.m. UTC | #16
On Thu, Jan 07, 2016 at 09:36:47PM +0100, Andrew Jones wrote:
> On Thu, Jan 07, 2016 at 02:56:15PM +0000, Peter Maydell wrote:
> > On 7 January 2016 at 14:49, Shannon Zhao <zhaoshenglong@huawei.com> wrote:
> > >>> +
> > >>> +Groups:
> > >>> +  KVM_DEV_ARM_PMU_GRP_IRQ
> > >>> +  Attributes:
> > >>> +    The attr field of kvm_device_attr encodes one value:
> > >>> +    bits:     | 63 .... 32 | 31 ....  0 |
> > >>> +    values:   |  reserved  | vcpu_index |
> > >>> +    A value describing the PMU overflow interrupt number for the specified
> > >>> +    vcpu_index vcpu. This interrupt could be a PPI or SPI, but for one VM the
> > >>> +    interrupt type must be same for each vcpu. As a PPI, the interrupt number is
> > >>> +    same for all vcpus, while as a SPI it must be different for each vcpu.
> > >>
> > >> I see we're using vcpu_index rather than MPIDR affinity value
> > >> for specifying which CPU we're configuring. Is this in line with
> > >> our planned API for GICv3 configuration?
> > >>
> > > Here vcpu_index is used to indexing the vCPU, no special use.
> > 
> > Yes, but you can identify the CPU by index, or by its MPIDR.
> > We had a discussion about which was the best way for doing
> > the VGIC API, and I can't remember which way round we ended up
> > going for. Whichever we chose, we should do the same thing here.
> 
> I think we should start up a new discussion on this. My understanding,
> after a chat with Igor, who was involved in the untangling of vcpu-id and
> apic-id for x86, is that using vcpu-id is preferred, unless of course
> the device expects an apic-id/mpidr, in which case there's no reason to
> translate it on both sides.
> 

I'm fairly strongly convinced that we should use the full 32-bit
compressed MPIDR for everything ARM related going forward, as this will
cover any case required and leverages and architecturally defined way of
uniquely identifying a (v)CPU.

-Christoffer
Marc Zyngier Jan. 9, 2016, 3:03 p.m. UTC | #17
On Sat, 9 Jan 2016 13:29:56 +0100
Christoffer Dall <christoffer.dall@linaro.org> wrote:

> On Thu, Jan 07, 2016 at 09:36:47PM +0100, Andrew Jones wrote:
> > On Thu, Jan 07, 2016 at 02:56:15PM +0000, Peter Maydell wrote:
> > > On 7 January 2016 at 14:49, Shannon Zhao <zhaoshenglong@huawei.com> wrote:
> > > >>> +
> > > >>> +Groups:
> > > >>> +  KVM_DEV_ARM_PMU_GRP_IRQ
> > > >>> +  Attributes:
> > > >>> +    The attr field of kvm_device_attr encodes one value:
> > > >>> +    bits:     | 63 .... 32 | 31 ....  0 |
> > > >>> +    values:   |  reserved  | vcpu_index |
> > > >>> +    A value describing the PMU overflow interrupt number for the specified
> > > >>> +    vcpu_index vcpu. This interrupt could be a PPI or SPI, but for one VM the
> > > >>> +    interrupt type must be same for each vcpu. As a PPI, the interrupt number is
> > > >>> +    same for all vcpus, while as a SPI it must be different for each vcpu.
> > > >>
> > > >> I see we're using vcpu_index rather than MPIDR affinity value
> > > >> for specifying which CPU we're configuring. Is this in line with
> > > >> our planned API for GICv3 configuration?
> > > >>
> > > > Here vcpu_index is used to indexing the vCPU, no special use.
> > > 
> > > Yes, but you can identify the CPU by index, or by its MPIDR.
> > > We had a discussion about which was the best way for doing
> > > the VGIC API, and I can't remember which way round we ended up
> > > going for. Whichever we chose, we should do the same thing here.
> > 
> > I think we should start up a new discussion on this. My understanding,
> > after a chat with Igor, who was involved in the untangling of vcpu-id and
> > apic-id for x86, is that using vcpu-id is preferred, unless of course
> > the device expects an apic-id/mpidr, in which case there's no reason to
> > translate it on both sides.
> > 
> 
> I'm fairly strongly convinced that we should use the full 32-bit
> compressed MPIDR for everything ARM related going forward, as this will
> cover any case required and leverages and architecturally defined way of
> uniquely identifying a (v)CPU.

+1.

vcpu_ids, indexes or any other constructs are just a bunch
of KVM-specific definitions that do not describe the VM from an
architecture PoV. In contrast, the MPIDR is guaranteed to be unique
stable, and identifies a given (v)CPU.

As for the PMU: either 1) we instantiate it together with the CPU
(with a new capability/feature), or 2) we map it to a MPIDR, and
associate it with its vcpu at runtime.

I'm slightly in favor of 1), as it simplifies things a bit, but 2) is
doable as well (GICv3 has similar requirements).

	M.
Shannon Zhao Jan. 11, 2016, 8:45 a.m. UTC | #18
On 2016/1/9 23:03, Marc Zyngier wrote:
> On Sat, 9 Jan 2016 13:29:56 +0100
> Christoffer Dall <christoffer.dall@linaro.org> wrote:
> 
>> > On Thu, Jan 07, 2016 at 09:36:47PM +0100, Andrew Jones wrote:
>>> > > On Thu, Jan 07, 2016 at 02:56:15PM +0000, Peter Maydell wrote:
>>>> > > > On 7 January 2016 at 14:49, Shannon Zhao <zhaoshenglong@huawei.com> wrote:
>>>>>>> > > > >>> +
>>>>>>> > > > >>> +Groups:
>>>>>>> > > > >>> +  KVM_DEV_ARM_PMU_GRP_IRQ
>>>>>>> > > > >>> +  Attributes:
>>>>>>> > > > >>> +    The attr field of kvm_device_attr encodes one value:
>>>>>>> > > > >>> +    bits:     | 63 .... 32 | 31 ....  0 |
>>>>>>> > > > >>> +    values:   |  reserved  | vcpu_index |
>>>>>>> > > > >>> +    A value describing the PMU overflow interrupt number for the specified
>>>>>>> > > > >>> +    vcpu_index vcpu. This interrupt could be a PPI or SPI, but for one VM the
>>>>>>> > > > >>> +    interrupt type must be same for each vcpu. As a PPI, the interrupt number is
>>>>>>> > > > >>> +    same for all vcpus, while as a SPI it must be different for each vcpu.
>>>>>> > > > >>
>>>>>> > > > >> I see we're using vcpu_index rather than MPIDR affinity value
>>>>>> > > > >> for specifying which CPU we're configuring. Is this in line with
>>>>>> > > > >> our planned API for GICv3 configuration?
>>>>>> > > > >>
>>>>> > > > > Here vcpu_index is used to indexing the vCPU, no special use.
>>>> > > > 
>>>> > > > Yes, but you can identify the CPU by index, or by its MPIDR.
>>>> > > > We had a discussion about which was the best way for doing
>>>> > > > the VGIC API, and I can't remember which way round we ended up
>>>> > > > going for. Whichever we chose, we should do the same thing here.
>>> > > 
>>> > > I think we should start up a new discussion on this. My understanding,
>>> > > after a chat with Igor, who was involved in the untangling of vcpu-id and
>>> > > apic-id for x86, is that using vcpu-id is preferred, unless of course
>>> > > the device expects an apic-id/mpidr, in which case there's no reason to
>>> > > translate it on both sides.
>>> > > 
>> > 
>> > I'm fairly strongly convinced that we should use the full 32-bit
>> > compressed MPIDR for everything ARM related going forward, as this will
>> > cover any case required and leverages and architecturally defined way of
>> > uniquely identifying a (v)CPU.
> +1.
> 
> vcpu_ids, indexes or any other constructs are just a bunch
> of KVM-specific definitions that do not describe the VM from an
> architecture PoV. In contrast, the MPIDR is guaranteed to be unique
> stable, and identifies a given (v)CPU.
> 
> As for the PMU: either 1) we instantiate it together with the CPU
> (with a new capability/feature), 
So spare some bits(e.g. 10 bits) of the features array to pass the PMU
irq number or add KVM_SET/GET_DEVICE_ATTR for vcpu ioctl?

Thanks,
Marc Zyngier Jan. 11, 2016, 8:59 a.m. UTC | #19
On 11/01/16 08:45, Shannon Zhao wrote:
> 
> 
> On 2016/1/9 23:03, Marc Zyngier wrote:
>> On Sat, 9 Jan 2016 13:29:56 +0100
>> Christoffer Dall <christoffer.dall@linaro.org> wrote:
>>
>>>> On Thu, Jan 07, 2016 at 09:36:47PM +0100, Andrew Jones wrote:
>>>>>> On Thu, Jan 07, 2016 at 02:56:15PM +0000, Peter Maydell wrote:
>>>>>>>> On 7 January 2016 at 14:49, Shannon Zhao <zhaoshenglong@huawei.com> wrote:
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> +Groups:
>>>>>>>>>>>>>> +  KVM_DEV_ARM_PMU_GRP_IRQ
>>>>>>>>>>>>>> +  Attributes:
>>>>>>>>>>>>>> +    The attr field of kvm_device_attr encodes one value:
>>>>>>>>>>>>>> +    bits:     | 63 .... 32 | 31 ....  0 |
>>>>>>>>>>>>>> +    values:   |  reserved  | vcpu_index |
>>>>>>>>>>>>>> +    A value describing the PMU overflow interrupt number for the specified
>>>>>>>>>>>>>> +    vcpu_index vcpu. This interrupt could be a PPI or SPI, but for one VM the
>>>>>>>>>>>>>> +    interrupt type must be same for each vcpu. As a PPI, the interrupt number is
>>>>>>>>>>>>>> +    same for all vcpus, while as a SPI it must be different for each vcpu.
>>>>>>>>>>>>
>>>>>>>>>>>> I see we're using vcpu_index rather than MPIDR affinity value
>>>>>>>>>>>> for specifying which CPU we're configuring. Is this in line with
>>>>>>>>>>>> our planned API for GICv3 configuration?
>>>>>>>>>>>>
>>>>>>>>>> Here vcpu_index is used to indexing the vCPU, no special use.
>>>>>>>>
>>>>>>>> Yes, but you can identify the CPU by index, or by its MPIDR.
>>>>>>>> We had a discussion about which was the best way for doing
>>>>>>>> the VGIC API, and I can't remember which way round we ended up
>>>>>>>> going for. Whichever we chose, we should do the same thing here.
>>>>>>
>>>>>> I think we should start up a new discussion on this. My understanding,
>>>>>> after a chat with Igor, who was involved in the untangling of vcpu-id and
>>>>>> apic-id for x86, is that using vcpu-id is preferred, unless of course
>>>>>> the device expects an apic-id/mpidr, in which case there's no reason to
>>>>>> translate it on both sides.
>>>>>>
>>>>
>>>> I'm fairly strongly convinced that we should use the full 32-bit
>>>> compressed MPIDR for everything ARM related going forward, as this will
>>>> cover any case required and leverages and architecturally defined way of
>>>> uniquely identifying a (v)CPU.
>> +1.
>>
>> vcpu_ids, indexes or any other constructs are just a bunch
>> of KVM-specific definitions that do not describe the VM from an
>> architecture PoV. In contrast, the MPIDR is guaranteed to be unique
>> stable, and identifies a given (v)CPU.
>>
>> As for the PMU: either 1) we instantiate it together with the CPU
>> (with a new capability/feature), 
> So spare some bits(e.g. 10 bits) of the features array to pass the PMU
> irq number or add KVM_SET/GET_DEVICE_ATTR for vcpu ioctl?

Using the device attributes seems more suitable, but I don't know if
using GET/SET_DEVICE_ATTR without the actual creation of a device is
acceptable...

	M.
Andrew Jones Jan. 11, 2016, 11:52 a.m. UTC | #20
On Mon, Jan 11, 2016 at 08:59:38AM +0000, Marc Zyngier wrote:
> On 11/01/16 08:45, Shannon Zhao wrote:
> > 
> > 
> > On 2016/1/9 23:03, Marc Zyngier wrote:
> >> On Sat, 9 Jan 2016 13:29:56 +0100
> >> Christoffer Dall <christoffer.dall@linaro.org> wrote:
> >>
> >>>> On Thu, Jan 07, 2016 at 09:36:47PM +0100, Andrew Jones wrote:
> >>>>>> On Thu, Jan 07, 2016 at 02:56:15PM +0000, Peter Maydell wrote:
> >>>>>>>> On 7 January 2016 at 14:49, Shannon Zhao <zhaoshenglong@huawei.com> wrote:
> >>>>>>>>>>>>>> +
> >>>>>>>>>>>>>> +Groups:
> >>>>>>>>>>>>>> +  KVM_DEV_ARM_PMU_GRP_IRQ
> >>>>>>>>>>>>>> +  Attributes:
> >>>>>>>>>>>>>> +    The attr field of kvm_device_attr encodes one value:
> >>>>>>>>>>>>>> +    bits:     | 63 .... 32 | 31 ....  0 |
> >>>>>>>>>>>>>> +    values:   |  reserved  | vcpu_index |
> >>>>>>>>>>>>>> +    A value describing the PMU overflow interrupt number for the specified
> >>>>>>>>>>>>>> +    vcpu_index vcpu. This interrupt could be a PPI or SPI, but for one VM the
> >>>>>>>>>>>>>> +    interrupt type must be same for each vcpu. As a PPI, the interrupt number is
> >>>>>>>>>>>>>> +    same for all vcpus, while as a SPI it must be different for each vcpu.
> >>>>>>>>>>>>
> >>>>>>>>>>>> I see we're using vcpu_index rather than MPIDR affinity value
> >>>>>>>>>>>> for specifying which CPU we're configuring. Is this in line with
> >>>>>>>>>>>> our planned API for GICv3 configuration?
> >>>>>>>>>>>>
> >>>>>>>>>> Here vcpu_index is used to indexing the vCPU, no special use.
> >>>>>>>>
> >>>>>>>> Yes, but you can identify the CPU by index, or by its MPIDR.
> >>>>>>>> We had a discussion about which was the best way for doing
> >>>>>>>> the VGIC API, and I can't remember which way round we ended up
> >>>>>>>> going for. Whichever we chose, we should do the same thing here.
> >>>>>>
> >>>>>> I think we should start up a new discussion on this. My understanding,
> >>>>>> after a chat with Igor, who was involved in the untangling of vcpu-id and
> >>>>>> apic-id for x86, is that using vcpu-id is preferred, unless of course
> >>>>>> the device expects an apic-id/mpidr, in which case there's no reason to
> >>>>>> translate it on both sides.
> >>>>>>
> >>>>
> >>>> I'm fairly strongly convinced that we should use the full 32-bit
> >>>> compressed MPIDR for everything ARM related going forward, as this will
> >>>> cover any case required and leverages and architecturally defined way of
> >>>> uniquely identifying a (v)CPU.
> >> +1.
> >>
> >> vcpu_ids, indexes or any other constructs are just a bunch
> >> of KVM-specific definitions that do not describe the VM from an
> >> architecture PoV. In contrast, the MPIDR is guaranteed to be unique
> >> stable, and identifies a given (v)CPU.
> >>
> >> As for the PMU: either 1) we instantiate it together with the CPU
> >> (with a new capability/feature), 
> > So spare some bits(e.g. 10 bits) of the features array to pass the PMU
> > irq number or add KVM_SET/GET_DEVICE_ATTR for vcpu ioctl?
> 
> Using the device attributes seems more suitable, but I don't know if
> using GET/SET_DEVICE_ATTR without the actual creation of a device is
> acceptable...

There's precedent set by s390 to take only the set/get/has api into a
new context, commit f206165620.

Thanks,
drew
Shannon Zhao Jan. 11, 2016, 12:03 p.m. UTC | #21
On 2016/1/11 19:52, Andrew Jones wrote:
> On Mon, Jan 11, 2016 at 08:59:38AM +0000, Marc Zyngier wrote:
>> >On 11/01/16 08:45, Shannon Zhao wrote:
>>> > >
>>> > >
>>> > >On 2016/1/9 23:03, Marc Zyngier wrote:
>>>> > >>On Sat, 9 Jan 2016 13:29:56 +0100
>>>> > >>Christoffer Dall<christoffer.dall@linaro.org>  wrote:
>>>> > >>
>>>>>> > >>>>On Thu, Jan 07, 2016 at 09:36:47PM +0100, Andrew Jones wrote:
>>>>>>>> > >>>>>>On Thu, Jan 07, 2016 at 02:56:15PM +0000, Peter Maydell wrote:
>>>>>>>>>> > >>>>>>>>On 7 January 2016 at 14:49, Shannon Zhao<zhaoshenglong@huawei.com>  wrote:
>>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>+
>>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>+Groups:
>>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>+  KVM_DEV_ARM_PMU_GRP_IRQ
>>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>+  Attributes:
>>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>+    The attr field of kvm_device_attr encodes one value:
>>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>+    bits:     | 63 .... 32 | 31 ....  0 |
>>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>+    values:   |  reserved  | vcpu_index |
>>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>+    A value describing the PMU overflow interrupt number for the specified
>>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>+    vcpu_index vcpu. This interrupt could be a PPI or SPI, but for one VM the
>>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>+    interrupt type must be same for each vcpu. As a PPI, the interrupt number is
>>>>>>>>>>>>>>>> > >>>>>>>>>>>>>>+    same for all vcpus, while as a SPI it must be different for each vcpu.
>>>>>>>>>>>>>> > >>>>>>>>>>>>
>>>>>>>>>>>>>> > >>>>>>>>>>>>I see we're using vcpu_index rather than MPIDR affinity value
>>>>>>>>>>>>>> > >>>>>>>>>>>>for specifying which CPU we're configuring. Is this in line with
>>>>>>>>>>>>>> > >>>>>>>>>>>>our planned API for GICv3 configuration?
>>>>>>>>>>>>>> > >>>>>>>>>>>>
>>>>>>>>>>>> > >>>>>>>>>>Here vcpu_index is used to indexing the vCPU, no special use.
>>>>>>>>>> > >>>>>>>>
>>>>>>>>>> > >>>>>>>>Yes, but you can identify the CPU by index, or by its MPIDR.
>>>>>>>>>> > >>>>>>>>We had a discussion about which was the best way for doing
>>>>>>>>>> > >>>>>>>>the VGIC API, and I can't remember which way round we ended up
>>>>>>>>>> > >>>>>>>>going for. Whichever we chose, we should do the same thing here.
>>>>>>>> > >>>>>>
>>>>>>>> > >>>>>>I think we should start up a new discussion on this. My understanding,
>>>>>>>> > >>>>>>after a chat with Igor, who was involved in the untangling of vcpu-id and
>>>>>>>> > >>>>>>apic-id for x86, is that using vcpu-id is preferred, unless of course
>>>>>>>> > >>>>>>the device expects an apic-id/mpidr, in which case there's no reason to
>>>>>>>> > >>>>>>translate it on both sides.
>>>>>>>> > >>>>>>
>>>>>> > >>>>
>>>>>> > >>>>I'm fairly strongly convinced that we should use the full 32-bit
>>>>>> > >>>>compressed MPIDR for everything ARM related going forward, as this will
>>>>>> > >>>>cover any case required and leverages and architecturally defined way of
>>>>>> > >>>>uniquely identifying a (v)CPU.
>>>> > >>+1.
>>>> > >>
>>>> > >>vcpu_ids, indexes or any other constructs are just a bunch
>>>> > >>of KVM-specific definitions that do not describe the VM from an
>>>> > >>architecture PoV. In contrast, the MPIDR is guaranteed to be unique
>>>> > >>stable, and identifies a given (v)CPU.
>>>> > >>
>>>> > >>As for the PMU: either 1) we instantiate it together with the CPU
>>>> > >>(with a new capability/feature),
>>> > >So spare some bits(e.g. 10 bits) of the features array to pass the PMU
>>> > >irq number or add KVM_SET/GET_DEVICE_ATTR for vcpu ioctl?
>> >
>> >Using the device attributes seems more suitable, but I don't know if
>> >using GET/SET_DEVICE_ATTR without the actual creation of a device is
>> >acceptable...
> There's precedent set by s390 to take only the set/get/has api into a
> new context, commit f206165620.
Thanks Andrew. So adding the set/get/has api for only ARM VCPU is 
acceptable? If so, I'll rewrite my patch.

Thanks,
Andrew Jones Jan. 11, 2016, 2:07 p.m. UTC | #22
On Sat, Jan 09, 2016 at 03:03:39PM +0000, Marc Zyngier wrote:
> On Sat, 9 Jan 2016 13:29:56 +0100
> Christoffer Dall <christoffer.dall@linaro.org> wrote:
> 
> > On Thu, Jan 07, 2016 at 09:36:47PM +0100, Andrew Jones wrote:
> > > On Thu, Jan 07, 2016 at 02:56:15PM +0000, Peter Maydell wrote:
> > > > On 7 January 2016 at 14:49, Shannon Zhao <zhaoshenglong@huawei.com> wrote:
> > > > >>> +
> > > > >>> +Groups:
> > > > >>> +  KVM_DEV_ARM_PMU_GRP_IRQ
> > > > >>> +  Attributes:
> > > > >>> +    The attr field of kvm_device_attr encodes one value:
> > > > >>> +    bits:     | 63 .... 32 | 31 ....  0 |
> > > > >>> +    values:   |  reserved  | vcpu_index |
> > > > >>> +    A value describing the PMU overflow interrupt number for the specified
> > > > >>> +    vcpu_index vcpu. This interrupt could be a PPI or SPI, but for one VM the
> > > > >>> +    interrupt type must be same for each vcpu. As a PPI, the interrupt number is
> > > > >>> +    same for all vcpus, while as a SPI it must be different for each vcpu.
> > > > >>
> > > > >> I see we're using vcpu_index rather than MPIDR affinity value
> > > > >> for specifying which CPU we're configuring. Is this in line with
> > > > >> our planned API for GICv3 configuration?
> > > > >>
> > > > > Here vcpu_index is used to indexing the vCPU, no special use.
> > > > 
> > > > Yes, but you can identify the CPU by index, or by its MPIDR.
> > > > We had a discussion about which was the best way for doing
> > > > the VGIC API, and I can't remember which way round we ended up
> > > > going for. Whichever we chose, we should do the same thing here.
> > > 
> > > I think we should start up a new discussion on this. My understanding,
> > > after a chat with Igor, who was involved in the untangling of vcpu-id and
> > > apic-id for x86, is that using vcpu-id is preferred, unless of course
> > > the device expects an apic-id/mpidr, in which case there's no reason to
> > > translate it on both sides.
> > > 
> > 
> > I'm fairly strongly convinced that we should use the full 32-bit
> > compressed MPIDR for everything ARM related going forward, as this will
> > cover any case required and leverages and architecturally defined way of
> > uniquely identifying a (v)CPU.
> 
> +1.
> 
> vcpu_ids, indexes or any other constructs are just a bunch
> of KVM-specific definitions that do not describe the VM from an
> architecture PoV. In contrast, the MPIDR is guaranteed to be unique
> stable, and identifies a given (v)CPU.
>

cpu-cpu and cpu-device interfaces should certainly use MPIDR, if they do
in real hardware, to allow us to match emulation code to specs and keep
sanity. But I assume those are the only places of "everything" you guys
are referring to, as everywhere else we should stick to using the concept
of vcpu-ids/indices. Since vcpu-indices are just counters they keep us
from needing all the data structures to be large, complex, sparse things.
Identifiers separate from MPIDR also allow hotunplug/plug to more easily
reuse resources, i.e. remap indices to other vcpus as necessary.

In the PMU case above it seems better to use a vcpu-index. KVM and KVM's
userspace both have unique vcpu-indices and unique MPIDRs per vcpu. The
use here isn't based on a hardware spec, so there's nowhere to look for
how MPIDR should/shouldn't be used. This is just a KVM spec. Here we might
as well use the easiest to use unique identifier.

That said, I like the vcpu ioctl method much better. With that we avoid
the need for vcpu identifiers all together. I'm even having third
thoughts about the gic per vcpu registers. If we go with extending
GET/SET_DEVICE_ATTR here, then I think we should do the same there as
well. That would then leave only KVM_IRQ_LINE using a vcpu-index, which,
with its 8-bit vcpu-index, we've outgrown for gicv3 machine types already
anyway.

Thanks,
drew
Christoffer Dall Jan. 11, 2016, 3:09 p.m. UTC | #23
On Mon, Jan 11, 2016 at 03:07:17PM +0100, Andrew Jones wrote:
> On Sat, Jan 09, 2016 at 03:03:39PM +0000, Marc Zyngier wrote:
> > On Sat, 9 Jan 2016 13:29:56 +0100
> > Christoffer Dall <christoffer.dall@linaro.org> wrote:
> > 
> > > On Thu, Jan 07, 2016 at 09:36:47PM +0100, Andrew Jones wrote:
> > > > On Thu, Jan 07, 2016 at 02:56:15PM +0000, Peter Maydell wrote:
> > > > > On 7 January 2016 at 14:49, Shannon Zhao <zhaoshenglong@huawei.com> wrote:
> > > > > >>> +
> > > > > >>> +Groups:
> > > > > >>> +  KVM_DEV_ARM_PMU_GRP_IRQ
> > > > > >>> +  Attributes:
> > > > > >>> +    The attr field of kvm_device_attr encodes one value:
> > > > > >>> +    bits:     | 63 .... 32 | 31 ....  0 |
> > > > > >>> +    values:   |  reserved  | vcpu_index |
> > > > > >>> +    A value describing the PMU overflow interrupt number for the specified
> > > > > >>> +    vcpu_index vcpu. This interrupt could be a PPI or SPI, but for one VM the
> > > > > >>> +    interrupt type must be same for each vcpu. As a PPI, the interrupt number is
> > > > > >>> +    same for all vcpus, while as a SPI it must be different for each vcpu.
> > > > > >>
> > > > > >> I see we're using vcpu_index rather than MPIDR affinity value
> > > > > >> for specifying which CPU we're configuring. Is this in line with
> > > > > >> our planned API for GICv3 configuration?
> > > > > >>
> > > > > > Here vcpu_index is used to indexing the vCPU, no special use.
> > > > > 
> > > > > Yes, but you can identify the CPU by index, or by its MPIDR.
> > > > > We had a discussion about which was the best way for doing
> > > > > the VGIC API, and I can't remember which way round we ended up
> > > > > going for. Whichever we chose, we should do the same thing here.
> > > > 
> > > > I think we should start up a new discussion on this. My understanding,
> > > > after a chat with Igor, who was involved in the untangling of vcpu-id and
> > > > apic-id for x86, is that using vcpu-id is preferred, unless of course
> > > > the device expects an apic-id/mpidr, in which case there's no reason to
> > > > translate it on both sides.
> > > > 
> > > 
> > > I'm fairly strongly convinced that we should use the full 32-bit
> > > compressed MPIDR for everything ARM related going forward, as this will
> > > cover any case required and leverages and architecturally defined way of
> > > uniquely identifying a (v)CPU.
> > 
> > +1.
> > 
> > vcpu_ids, indexes or any other constructs are just a bunch
> > of KVM-specific definitions that do not describe the VM from an
> > architecture PoV. In contrast, the MPIDR is guaranteed to be unique
> > stable, and identifies a given (v)CPU.
> >
> 
> cpu-cpu and cpu-device interfaces should certainly use MPIDR, if they do
> in real hardware, to allow us to match emulation code to specs and keep
> sanity. But I assume those are the only places of "everything" you guys
> are referring to, as everywhere else we should stick to using the concept
> of vcpu-ids/indices. Since vcpu-indices are just counters they keep us
> from needing all the data structures to be large, complex, sparse things.
> Identifiers separate from MPIDR also allow hotunplug/plug to more easily
> reuse resources, i.e. remap indices to other vcpus as necessary.

Are vcpu ids already exposed to userspace (beyond the stupid
KVM_IRQ_LINE) ioctl and as such we're bound to whatever upper limit and
format they have?

If not, I think decoupling an internal ID and uniquely identifying a CPU
is a good idea.

> 
> In the PMU case above it seems better to use a vcpu-index. KVM and KVM's
> userspace both have unique vcpu-indices and unique MPIDRs per vcpu. The
> use here isn't based on a hardware spec, so there's nowhere to look for
> how MPIDR should/shouldn't be used. This is just a KVM spec. Here we might
> as well use the easiest to use unique identifier.

I think the only things that should matter here are (in no particular
order):
 - Userspace convenience
 - Clarity
 - Avoiding ambiguity

> 
> That said, I like the vcpu ioctl method much better. With that we avoid
> the need for vcpu identifiers all together. I'm even having third
> thoughts about the gic per vcpu registers. If we go with extending
> GET/SET_DEVICE_ATTR here, then I think we should do the same there as
> well. That would then leave only KVM_IRQ_LINE using a vcpu-index, which,
> with its 8-bit vcpu-index, we've outgrown for gicv3 machine types already
> anyway.
> 
For the GIC, I think we've discussed this in the past.  It really
depends whether you think about the GIC as one device (the distributor)
separate from the CPUs, with a bunch of separate devices attached to
each CPU and wired together somehow, or if you think of this as one big
coherent thing where parts of the device are specific to each CPU.

I tend to interpret the GIC as the latter and I think the kernel and
userspace implementations are also done that way, suggesting we should
stick with the device API for all GIC-related state (as was also the
suggestion for the GICv3 save/restore API).

Similarly, because the GIC architecture refers to CPUs using the MPIDR,
we should do the same in this interface.  Otherwise, I think you have to
define stricly how the exposed VCPU ID maps to an MPIDR somehow.

-Christoffer
Andrew Jones Jan. 11, 2016, 4:09 p.m. UTC | #24
On Mon, Jan 11, 2016 at 04:09:29PM +0100, Christoffer Dall wrote:
> On Mon, Jan 11, 2016 at 03:07:17PM +0100, Andrew Jones wrote:
> > On Sat, Jan 09, 2016 at 03:03:39PM +0000, Marc Zyngier wrote:
> > > On Sat, 9 Jan 2016 13:29:56 +0100
> > > Christoffer Dall <christoffer.dall@linaro.org> wrote:
> > > 
> > > > On Thu, Jan 07, 2016 at 09:36:47PM +0100, Andrew Jones wrote:
> > > > > On Thu, Jan 07, 2016 at 02:56:15PM +0000, Peter Maydell wrote:
> > > > > > On 7 January 2016 at 14:49, Shannon Zhao <zhaoshenglong@huawei.com> wrote:
> > > > > > >>> +
> > > > > > >>> +Groups:
> > > > > > >>> +  KVM_DEV_ARM_PMU_GRP_IRQ
> > > > > > >>> +  Attributes:
> > > > > > >>> +    The attr field of kvm_device_attr encodes one value:
> > > > > > >>> +    bits:     | 63 .... 32 | 31 ....  0 |
> > > > > > >>> +    values:   |  reserved  | vcpu_index |
> > > > > > >>> +    A value describing the PMU overflow interrupt number for the specified
> > > > > > >>> +    vcpu_index vcpu. This interrupt could be a PPI or SPI, but for one VM the
> > > > > > >>> +    interrupt type must be same for each vcpu. As a PPI, the interrupt number is
> > > > > > >>> +    same for all vcpus, while as a SPI it must be different for each vcpu.
> > > > > > >>
> > > > > > >> I see we're using vcpu_index rather than MPIDR affinity value
> > > > > > >> for specifying which CPU we're configuring. Is this in line with
> > > > > > >> our planned API for GICv3 configuration?
> > > > > > >>
> > > > > > > Here vcpu_index is used to indexing the vCPU, no special use.
> > > > > > 
> > > > > > Yes, but you can identify the CPU by index, or by its MPIDR.
> > > > > > We had a discussion about which was the best way for doing
> > > > > > the VGIC API, and I can't remember which way round we ended up
> > > > > > going for. Whichever we chose, we should do the same thing here.
> > > > > 
> > > > > I think we should start up a new discussion on this. My understanding,
> > > > > after a chat with Igor, who was involved in the untangling of vcpu-id and
> > > > > apic-id for x86, is that using vcpu-id is preferred, unless of course
> > > > > the device expects an apic-id/mpidr, in which case there's no reason to
> > > > > translate it on both sides.
> > > > > 
> > > > 
> > > > I'm fairly strongly convinced that we should use the full 32-bit
> > > > compressed MPIDR for everything ARM related going forward, as this will
> > > > cover any case required and leverages and architecturally defined way of
> > > > uniquely identifying a (v)CPU.
> > > 
> > > +1.
> > > 
> > > vcpu_ids, indexes or any other constructs are just a bunch
> > > of KVM-specific definitions that do not describe the VM from an
> > > architecture PoV. In contrast, the MPIDR is guaranteed to be unique
> > > stable, and identifies a given (v)CPU.
> > >
> > 
> > cpu-cpu and cpu-device interfaces should certainly use MPIDR, if they do
> > in real hardware, to allow us to match emulation code to specs and keep
> > sanity. But I assume those are the only places of "everything" you guys
> > are referring to, as everywhere else we should stick to using the concept
> > of vcpu-ids/indices. Since vcpu-indices are just counters they keep us
> > from needing all the data structures to be large, complex, sparse things.
> > Identifiers separate from MPIDR also allow hotunplug/plug to more easily
> > reuse resources, i.e. remap indices to other vcpus as necessary.
> 
> Are vcpu ids already exposed to userspace (beyond the stupid
> KVM_IRQ_LINE) ioctl and as such we're bound to whatever upper limit and
> format they have?

The only other place I found is KVM_CREATE_VCPU. I suppose we could move
to MPIDR for that, and it would be a nice way to handle the "userspace
determines MPIDR" work that I plan to do. Both KVM and its userspaces
would still use some counter-based vcpu identifiers internally, to avoid
large, sparse structures, but I guess the advantage is that they don't
have to agree on how they do that. The 'vcpu id' used by KVM_CREATE_VCPU
is already 32-bits, and is supposed to be an arbitrary identifier. That
all looks good for converting to MPIDR.

> 
> If not, I think decoupling an internal ID and uniquely identifying a CPU
> is a good idea.
> 
> > 
> > In the PMU case above it seems better to use a vcpu-index. KVM and KVM's
> > userspace both have unique vcpu-indices and unique MPIDRs per vcpu. The
> > use here isn't based on a hardware spec, so there's nowhere to look for
> > how MPIDR should/shouldn't be used. This is just a KVM spec. Here we might
> > as well use the easiest to use unique identifier.
> 
> I think the only things that should matter here are (in no particular
> order):
>  - Userspace convenience
>  - Clarity
>  - Avoiding ambiguity

Agreed. I believe the counter-based vcpu-id brings the most convenience
for any non-MPIDR based APIs, i.e. no spec saying MPIDR should be used.
However, picking which counter-based vcpu-id (if more than one exist)
costs both convenience and clarity. I guess that's another argument for
switching to MPIDR...

> 
> > 
> > That said, I like the vcpu ioctl method much better. With that we avoid
> > the need for vcpu identifiers all together. I'm even having third
> > thoughts about the gic per vcpu registers. If we go with extending
> > GET/SET_DEVICE_ATTR here, then I think we should do the same there as
> > well. That would then leave only KVM_IRQ_LINE using a vcpu-index, which,
> > with its 8-bit vcpu-index, we've outgrown for gicv3 machine types already
> > anyway.
> > 
> For the GIC, I think we've discussed this in the past.  It really
> depends whether you think about the GIC as one device (the distributor)
> separate from the CPUs, with a bunch of separate devices attached to
> each CPU and wired together somehow, or if you think of this as one big
> coherent thing where parts of the device are specific to each CPU.

I keep flip-flopping my view, which is why I keep flip-flopping my
opinion on how to deal with its register API :-)

> 
> I tend to interpret the GIC as the latter and I think the kernel and
> userspace implementations are also done that way, suggesting we should
> stick with the device API for all GIC-related state (as was also the
> suggestion for the GICv3 save/restore API).

I think the save/restore case is where I always flip to seeing it as a
bunch of separate per cpu devices. It would feel better to me to
save/restore the cpu-gic registers the same way we do all other cpu
registers.

I think we can get the best of both worlds by extending the
SET/GET_DEVICE_ATTR to vcpus, and then using both the device ioctl
and the vcpu ioctls.

> 
> Similarly, because the GIC architecture refers to CPUs using the MPIDR,
> we should do the same in this interface.  Otherwise, I think you have to
> define stricly how the exposed VCPU ID maps to an MPIDR somehow.

Well, we're creating our own interface to a gic model, so there's
nothing in the spec that says MPIDR is needed here, but if there's
no good reason not to change the KVM_CREATE_VCPU vcpu-id to MPIDR,
then we certainly wouldn't want to invent a new vcpu-id for this use.


My current feelings are:
  1) avoid needing vcpu identifiers (use vcpu ioctl whenever possible)
  2) if we need them, they should match the same one used by
     KVM_CREATE_VCPU (whenever possible)
  3) any device api that doesn't provide a 32-bit identifier field
     should just be special-cased, and have plenty of documentation
     explaining how to handle it - explaining what other limitations
     it puts on the guest, etc.

None of those feelings exclude using MPIDR as _the_ vcpu-id (I don't
think). It appears the main benefit of switching to MPIDR would be to
decouple KVM from its userspaces wrt to choosing the non-MPIDR vcpu
identification used for managing data structures and looping over
vcpus internally. We also gain a clean way for userspace to choose
the MPIDR for each vcpu, which we need to do anyway.

Thanks,
drew
Peter Maydell Jan. 11, 2016, 4:13 p.m. UTC | #25
On 11 January 2016 at 16:09, Andrew Jones <drjones@redhat.com> wrote:
> I think the save/restore case is where I always flip to seeing it as a
> bunch of separate per cpu devices. It would feel better to me to
> save/restore the cpu-gic registers the same way we do all other cpu
> registers.

From QEMU's point of view this would be extremely awkward, which
is one reason I'm happy with the GICv3 API we came up with
and reviewed last year...

thanks
-- PMM
Andrew Jones Jan. 11, 2016, 4:21 p.m. UTC | #26
On Mon, Jan 11, 2016 at 05:09:27PM +0100, Andrew Jones wrote:
> On Mon, Jan 11, 2016 at 04:09:29PM +0100, Christoffer Dall wrote:
> > Are vcpu ids already exposed to userspace (beyond the stupid
> > KVM_IRQ_LINE) ioctl and as such we're bound to whatever upper limit and
> > format they have?
> 
> The only other place I found is KVM_CREATE_VCPU. I suppose we could move
> to MPIDR for that, and it would be a nice way to handle the "userspace
> determines MPIDR" work that I plan to do. Both KVM and its userspaces
> would still use some counter-based vcpu identifiers internally, to avoid
> large, sparse structures, but I guess the advantage is that they don't
> have to agree on how they do that. The 'vcpu id' used by KVM_CREATE_VCPU
> is already 32-bits, and is supposed to be an arbitrary identifier. That
> all looks good for converting to MPIDR.
>

Correction. I understand that vcpu-id is "supposed" to be an arbitrary
identifier now, but it doesn't appear that all the assumptions that it's
a counter are gone yet... virt/kvm/kvm_main.c has

static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
...
        if (id >= KVM_MAX_VCPUS)
                return -EINVAL;



More to do there I guess...

drew
Peter Maydell Jan. 11, 2016, 4:29 p.m. UTC | #27
On 11 January 2016 at 16:21, Andrew Jones <drjones@redhat.com> wrote:
> On Mon, Jan 11, 2016 at 05:09:27PM +0100, Andrew Jones wrote:
>> On Mon, Jan 11, 2016 at 04:09:29PM +0100, Christoffer Dall wrote:
>> > Are vcpu ids already exposed to userspace (beyond the stupid
>> > KVM_IRQ_LINE) ioctl and as such we're bound to whatever upper limit and
>> > format they have?
>>
>> The only other place I found is KVM_CREATE_VCPU. I suppose we could move
>> to MPIDR for that, and it would be a nice way to handle the "userspace
>> determines MPIDR" work that I plan to do. Both KVM and its userspaces
>> would still use some counter-based vcpu identifiers internally, to avoid
>> large, sparse structures, but I guess the advantage is that they don't
>> have to agree on how they do that. The 'vcpu id' used by KVM_CREATE_VCPU
>> is already 32-bits, and is supposed to be an arbitrary identifier. That
>> all looks good for converting to MPIDR.
>>
>
> Correction. I understand that vcpu-id is "supposed" to be an arbitrary
> identifier now, but it doesn't appear that all the assumptions that it's
> a counter are gone yet... virt/kvm/kvm_main.c has
>
> static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
> ...
>         if (id >= KVM_MAX_VCPUS)
>                 return -EINVAL;

I think the last time we talked about supporting "userspace
determines MPIDR" the idea was to do it by allowing userspace to
write to the MPIDR register with KVM_SET_ONE_REG. So you'd
create a bunch of CPUs with vcpu-ids as usual, and then the
MPIDRs would be set for them later as appropriate (or not
at all, if userspace was an older qemu).

thanks
-- PMM
Andrew Jones Jan. 11, 2016, 4:44 p.m. UTC | #28
On Mon, Jan 11, 2016 at 04:29:03PM +0000, Peter Maydell wrote:
> On 11 January 2016 at 16:21, Andrew Jones <drjones@redhat.com> wrote:
> > On Mon, Jan 11, 2016 at 05:09:27PM +0100, Andrew Jones wrote:
> >> On Mon, Jan 11, 2016 at 04:09:29PM +0100, Christoffer Dall wrote:
> >> > Are vcpu ids already exposed to userspace (beyond the stupid
> >> > KVM_IRQ_LINE) ioctl and as such we're bound to whatever upper limit and
> >> > format they have?
> >>
> >> The only other place I found is KVM_CREATE_VCPU. I suppose we could move
> >> to MPIDR for that, and it would be a nice way to handle the "userspace
> >> determines MPIDR" work that I plan to do. Both KVM and its userspaces
> >> would still use some counter-based vcpu identifiers internally, to avoid
> >> large, sparse structures, but I guess the advantage is that they don't
> >> have to agree on how they do that. The 'vcpu id' used by KVM_CREATE_VCPU
> >> is already 32-bits, and is supposed to be an arbitrary identifier. That
> >> all looks good for converting to MPIDR.
> >>
> >
> > Correction. I understand that vcpu-id is "supposed" to be an arbitrary
> > identifier now, but it doesn't appear that all the assumptions that it's
> > a counter are gone yet... virt/kvm/kvm_main.c has
> >
> > static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
> > ...
> >         if (id >= KVM_MAX_VCPUS)
> >                 return -EINVAL;
> 
> I think the last time we talked about supporting "userspace
> determines MPIDR" the idea was to do it by allowing userspace to
> write to the MPIDR register with KVM_SET_ONE_REG. So you'd
> create a bunch of CPUs with vcpu-ids as usual, and then the
> MPIDRs would be set for them later as appropriate (or not
> at all, if userspace was an older qemu).

Yup, I recall that. I'm just expanding this discussion into that one.
If we wanted to single vcpu identifier type, and we wanted it to be
MPIDR, then I guess we'd want to pass it in to KVM_CREATE_VCPU too, at
which point we no longer need to set it later with KVM_SET_ONE_REG.

Thanks,
drew
Andrew Jones Jan. 11, 2016, 4:48 p.m. UTC | #29
On Mon, Jan 11, 2016 at 04:13:12PM +0000, Peter Maydell wrote:
> On 11 January 2016 at 16:09, Andrew Jones <drjones@redhat.com> wrote:
> > I think the save/restore case is where I always flip to seeing it as a
> > bunch of separate per cpu devices. It would feel better to me to
> > save/restore the cpu-gic registers the same way we do all other cpu
> > registers.
> 
> From QEMU's point of view this would be extremely awkward, which
> is one reason I'm happy with the GICv3 API we came up with
> and reviewed last year...

OK, I'll shut up about this. As I didn't speak then, I shall now forever
hold my peace :)

drew
diff mbox

Patch

diff --git a/Documentation/virtual/kvm/devices/arm-pmu.txt b/Documentation/virtual/kvm/devices/arm-pmu.txt
new file mode 100644
index 0000000..dda864e
--- /dev/null
+++ b/Documentation/virtual/kvm/devices/arm-pmu.txt
@@ -0,0 +1,24 @@ 
+ARM Virtual Performance Monitor Unit (vPMU)
+===========================================
+
+Device types supported:
+  KVM_DEV_TYPE_ARM_PMU_V3         ARM Performance Monitor Unit v3
+
+Instantiate one PMU instance for per VCPU through this API.
+
+Groups:
+  KVM_DEV_ARM_PMU_GRP_IRQ
+  Attributes:
+    The attr field of kvm_device_attr encodes one value:
+    bits:     | 63 .... 32 | 31 ....  0 |
+    values:   |  reserved  | vcpu_index |
+    A value describing the PMU overflow interrupt number for the specified
+    vcpu_index vcpu. This interrupt could be a PPI or SPI, but for one VM the
+    interrupt type must be same for each vcpu. As a PPI, the interrupt number is
+    same for all vcpus, while as a SPI it must be different for each vcpu.
+
+  Errors:
+    -ENXIO: Unsupported attribute group
+    -EBUSY: The PMU overflow interrupt is already set
+    -ENODEV: Getting the PMU overflow interrupt number while it's not set
+    -EINVAL: Invalid vcpu_index or PMU overflow interrupt number supplied
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index 2d4ca4b..cbb9022 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -204,6 +204,10 @@  struct kvm_arch_memory_slot {
 #define KVM_DEV_ARM_VGIC_GRP_CTRL	4
 #define   KVM_DEV_ARM_VGIC_CTRL_INIT	0
 
+/* Device Control API: ARM PMU */
+#define KVM_DEV_ARM_PMU_GRP_IRQ		0
+#define   KVM_DEV_ARM_PMU_CPUID_MASK	0xffffffffULL
+
 /* KVM_IRQ_LINE irq field index values */
 #define KVM_ARM_IRQ_TYPE_SHIFT		24
 #define KVM_ARM_IRQ_TYPE_MASK		0xff
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c923350..608dea6 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1161,6 +1161,7 @@  extern struct kvm_device_ops kvm_mpic_ops;
 extern struct kvm_device_ops kvm_xics_ops;
 extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
 extern struct kvm_device_ops kvm_arm_vgic_v3_ops;
+extern struct kvm_device_ops kvm_arm_pmu_ops;
 
 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
 
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 03f3618..4ba6fdd 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1032,6 +1032,8 @@  enum kvm_device_type {
 #define KVM_DEV_TYPE_FLIC		KVM_DEV_TYPE_FLIC
 	KVM_DEV_TYPE_ARM_VGIC_V3,
 #define KVM_DEV_TYPE_ARM_VGIC_V3	KVM_DEV_TYPE_ARM_VGIC_V3
+	KVM_DEV_TYPE_ARM_PMU_V3,
+#define	KVM_DEV_TYPE_ARM_PMU_V3		KVM_DEV_TYPE_ARM_PMU_V3
 	KVM_DEV_TYPE_MAX,
 };
 
diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
index 3ec3cdd..5518308 100644
--- a/virt/kvm/arm/pmu.c
+++ b/virt/kvm/arm/pmu.c
@@ -19,6 +19,7 @@ 
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
 #include <linux/perf_event.h>
+#include <linux/uaccess.h>
 #include <asm/kvm_emulate.h>
 #include <kvm/arm_pmu.h>
 #include <kvm/arm_vgic.h>
@@ -374,3 +375,130 @@  void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
 
 	pmc->perf_event = event;
 }
+
+static inline bool kvm_arm_pmu_initialized(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.pmu.irq_num != -1;
+}
+
+static int kvm_arm_pmu_irq_access(struct kvm *kvm, struct kvm_device_attr *attr,
+				  int *irq, bool is_set)
+{
+	int cpuid;
+	struct kvm_vcpu *vcpu;
+	struct kvm_pmu *pmu;
+
+	cpuid = attr->attr & KVM_DEV_ARM_PMU_CPUID_MASK;
+	if (cpuid >= atomic_read(&kvm->online_vcpus))
+		return -EINVAL;
+
+	vcpu = kvm_get_vcpu(kvm, cpuid);
+	if (!vcpu)
+		return -EINVAL;
+
+	pmu = &vcpu->arch.pmu;
+	if (!is_set) {
+		if (!kvm_arm_pmu_initialized(vcpu))
+			return -ENODEV;
+
+		*irq = pmu->irq_num;
+	} else {
+		if (kvm_arm_pmu_initialized(vcpu))
+			return -EBUSY;
+
+		kvm_debug("Set kvm ARM PMU irq: %d\n", *irq);
+		pmu->irq_num = *irq;
+	}
+
+	return 0;
+}
+
+static int kvm_arm_pmu_create(struct kvm_device *dev, u32 type)
+{
+	int i;
+	struct kvm_vcpu *vcpu;
+	struct kvm *kvm = dev->kvm;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		struct kvm_pmu *pmu = &vcpu->arch.pmu;
+
+		memset(pmu, 0, sizeof(*pmu));
+		kvm_pmu_vcpu_reset(vcpu);
+		pmu->irq_num = -1;
+	}
+
+	return 0;
+}
+
+static void kvm_arm_pmu_destroy(struct kvm_device *dev)
+{
+	kfree(dev);
+}
+
+static int kvm_arm_pmu_set_attr(struct kvm_device *dev,
+				struct kvm_device_attr *attr)
+{
+	switch (attr->group) {
+	case KVM_DEV_ARM_PMU_GRP_IRQ: {
+		int __user *uaddr = (int __user *)(long)attr->addr;
+		int reg;
+
+		if (get_user(reg, uaddr))
+			return -EFAULT;
+
+		/*
+		 * The PMU overflow interrupt could be a PPI or SPI, but for one
+		 * VM the interrupt type must be same for each vcpu. As a PPI,
+		 * the interrupt number is same for all vcpus, while as a SPI it
+		 * must be different for each vcpu.
+		 */
+		if (reg < VGIC_NR_SGIS || reg >= dev->kvm->arch.vgic.nr_irqs)
+			return -EINVAL;
+
+		return kvm_arm_pmu_irq_access(dev->kvm, attr, &reg, true);
+	}
+	}
+
+	return -ENXIO;
+}
+
+static int kvm_arm_pmu_get_attr(struct kvm_device *dev,
+				struct kvm_device_attr *attr)
+{
+	int ret;
+
+	switch (attr->group) {
+	case KVM_DEV_ARM_PMU_GRP_IRQ: {
+		int __user *uaddr = (int __user *)(long)attr->addr;
+		int reg = -1;
+
+
+		ret = kvm_arm_pmu_irq_access(dev->kvm, attr, &reg, false);
+		if (ret)
+			return ret;
+		return put_user(reg, uaddr);
+	}
+	}
+
+	return -ENXIO;
+}
+
+static int kvm_arm_pmu_has_attr(struct kvm_device *dev,
+				struct kvm_device_attr *attr)
+{
+	switch (attr->group) {
+	case KVM_DEV_ARM_PMU_GRP_IRQ:
+		return 0;
+	}
+
+	return -ENXIO;
+}
+
+struct kvm_device_ops kvm_arm_pmu_ops = {
+	.name = "kvm-arm-pmu",
+	.create = kvm_arm_pmu_create,
+	.destroy = kvm_arm_pmu_destroy,
+	.set_attr = kvm_arm_pmu_set_attr,
+	.get_attr = kvm_arm_pmu_get_attr,
+	.has_attr = kvm_arm_pmu_has_attr,
+};
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 484079e..81a42cc 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2647,6 +2647,10 @@  static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
 #ifdef CONFIG_KVM_XICS
 	[KVM_DEV_TYPE_XICS]		= &kvm_xics_ops,
 #endif
+
+#ifdef CONFIG_KVM_ARM_PMU
+	[KVM_DEV_TYPE_ARM_PMU_V3]	= &kvm_arm_pmu_ops,
+#endif
 };
 
 int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type)