
[4/4] KVM: arm64: Refuse to run VCPU if the PMU doesn't match the physical CPU

Message ID 20211115165041.194884-5-alexandru.elisei@arm.com
State New, archived
Series KVM: arm64: Improve PMU support on heterogeneous systems

Commit Message

Alexandru Elisei Nov. 15, 2021, 4:50 p.m. UTC
Userspace can assign a PMU to a VCPU with the KVM_ARM_VCPU_PMU_V3_SET_PMU
device ioctl. If the VCPU is scheduled on a physical CPU which has a
different PMU, the perf events needed to emulate a guest PMU won't be
scheduled in and the guest performance counters will stop counting. Treat
this as a userspace error and refuse to run the VCPU in this situation.

The VCPU is flagged as being scheduled on the wrong CPU in vcpu_load(), but
the flag is cleared when KVM_RUN enters the non-preemptible section instead
of in vcpu_put(); this is done on purpose so the error condition is
communicated to userspace as soon as possible, otherwise a vcpu_load() on
the wrong CPU followed by a vcpu_put() would clear the flag before KVM_RUN
could report it.

Suggested-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Alexandru Elisei <alexandru.elisei@arm.com>
---
 Documentation/virt/kvm/api.rst          |  5 +++--
 Documentation/virt/kvm/devices/vcpu.rst |  3 ++-
 arch/arm64/include/asm/kvm_host.h       |  3 +++
 arch/arm64/kvm/arm.c                    | 15 +++++++++++++++
 arch/arm64/kvm/pmu-emul.c               |  1 +
 5 files changed, 24 insertions(+), 3 deletions(-)

Comments

Marc Zyngier Nov. 21, 2021, 7:35 p.m. UTC | #1
On Mon, 15 Nov 2021 16:50:41 +0000,
Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> 
> Userspace can assign a PMU to a VCPU with the KVM_ARM_VCPU_PMU_V3_SET_PMU
> device ioctl. If the VCPU is scheduled on a physical CPU which has a
> different PMU, the perf events needed to emulate a guest PMU won't be
> scheduled in and the guest performance counters will stop counting. Treat
> this as a userspace error and refuse to run the VCPU in this situation.
> 
> The VCPU is flagged as being scheduled on the wrong CPU in vcpu_load(), but
> the flag is cleared when KVM_RUN enters the non-preemptible section instead
> of in vcpu_put(); this is done on purpose so the error condition is
> communicated to userspace as soon as possible, otherwise a vcpu_load() on
> the wrong CPU followed by a vcpu_put() would clear the flag before KVM_RUN
> could report it.

Can we make this something orthogonal to the PMU, and get userspace to
pick an affinity mask independently of instantiating a PMU? I can
imagine this would also be useful for SPE on asymmetric systems.

> Suggested-by: Marc Zyngier <maz@kernel.org>
> Signed-off-by: Alexandru Elisei <alexandru.elisei@arm.com>
> ---
>  Documentation/virt/kvm/api.rst          |  5 +++--
>  Documentation/virt/kvm/devices/vcpu.rst |  3 ++-
>  arch/arm64/include/asm/kvm_host.h       |  3 +++
>  arch/arm64/kvm/arm.c                    | 15 +++++++++++++++
>  arch/arm64/kvm/pmu-emul.c               |  1 +
>  5 files changed, 24 insertions(+), 3 deletions(-)
> 
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index aeeb071c7688..5bbad8318ea5 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -396,8 +396,9 @@ Errors:
>  
>    =======    ==============================================================
>    EINTR      an unmasked signal is pending
> -  ENOEXEC    the vcpu hasn't been initialized or the guest tried to execute
> -             instructions from device memory (arm64)
> +  ENOEXEC    the vcpu hasn't been initialized, the guest tried to execute
> +             instructions from device memory (arm64) or the vcpu PMU is
> +             different from the physical cpu PMU (arm64).
>    ENOSYS     data abort outside memslots with no syndrome info and
>               KVM_CAP_ARM_NISV_TO_USER not enabled (arm64)
>    EPERM      SVE feature set but not finalized (arm64)
> diff --git a/Documentation/virt/kvm/devices/vcpu.rst b/Documentation/virt/kvm/devices/vcpu.rst
> index 59ac382af59a..ca0da34da889 100644
> --- a/Documentation/virt/kvm/devices/vcpu.rst
> +++ b/Documentation/virt/kvm/devices/vcpu.rst
> @@ -128,7 +128,8 @@ systems where there are at least two PMUs on the system.
>  
>  Note that KVM will not make any attempts to run the VCPU on the physical CPUs
>  associated with the PMU specified by this attribute. This is entirely left to
> -userspace.
> +userspace. However, if the VCPU is scheduled on a CPU which has a different PMU,
> +then KVM_RUN will return with the error code ENOEXEC.
>  
>  2. GROUP: KVM_ARM_VCPU_TIMER_CTRL
>  =================================
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 2a5f7f38006f..ae2083b41d8a 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -385,6 +385,9 @@ struct kvm_vcpu_arch {
>  		u64 last_steal;
>  		gpa_t base;
>  	} steal;
> +
> +	cpumask_var_t supported_cpus;
> +	bool cpu_not_supported;

Can this just be made a vcpu flag instead?
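
Something along these lines (completely untested, and assuming bit 15 is
still free in vcpu->arch.flags at the time):

#define KVM_ARM64_ON_UNSUPPORTED_CPU	(1 << 15) /* hypothetical free bit */

	if (!cpumask_test_cpu(smp_processor_id(), vcpu->arch.supported_cpus))
		vcpu->arch.flags |= KVM_ARM64_ON_UNSUPPORTED_CPU;

with the KVM_RUN path testing and clearing the bit instead of the bool.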

>  };
>  
>  /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */
> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> index 2f03cbfefe67..5dbfd18c4e37 100644
> --- a/arch/arm64/kvm/arm.c
> +++ b/arch/arm64/kvm/arm.c
> @@ -320,6 +320,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
>  
>  	vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO;
>  
> +	if (!zalloc_cpumask_var(&vcpu->arch.supported_cpus, GFP_KERNEL))
> +		return -ENOMEM;
> +
>  	/* Set up the timer */
>  	kvm_timer_vcpu_init(vcpu);
>  
> @@ -347,6 +350,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
>  	if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm)))
>  		static_branch_dec(&userspace_irqchip_in_use);
>  
> +	free_cpumask_var(vcpu->arch.supported_cpus);
>  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
>  	kvm_timer_vcpu_terminate(vcpu);
>  	kvm_pmu_vcpu_destroy(vcpu);
> @@ -425,6 +429,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>  	if (vcpu_has_ptrauth(vcpu))
>  		vcpu_ptrauth_disable(vcpu);
>  	kvm_arch_vcpu_load_debug_state_flags(vcpu);
> +
> +	if (!cpumask_empty(vcpu->arch.supported_cpus) &&

How about initialising the cpumask to cpu_possible_mask, avoiding the
cpumask_empty check?
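
i.e. something like this in kvm_arch_vcpu_create() (untested):

	if (!zalloc_cpumask_var(&vcpu->arch.supported_cpus, GFP_KERNEL))
		return -ENOMEM;
	cpumask_copy(vcpu->arch.supported_cpus, cpu_possible_mask);

at which point the check below collapses to a single cpumask_test_cpu().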

> +	    !cpumask_test_cpu(smp_processor_id(), vcpu->arch.supported_cpus))
> +		vcpu->arch.cpu_not_supported = true;

I have the feeling this would actually better be implemented as a
request, but there may be some surgery required for this.
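
i.e. a vcpu request made from kvm_arch_vcpu_load() and consumed before
entering the guest -- a rough sketch (untested, and the request number is
whatever happens to be free):

	#define KVM_REQ_CPU_UNSUPPORTED	KVM_ARCH_REQ(6)

	/* in kvm_arch_vcpu_load() */
	if (!cpumask_test_cpu(cpu, vcpu->arch.supported_cpus))
		kvm_make_request(KVM_REQ_CPU_UNSUPPORTED, vcpu);

with a matching kvm_check_request() in the run loop returning -ENOEXEC.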

>  }
>  
>  void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
> @@ -815,6 +823,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
>  		 */
>  		preempt_disable();
>  
> +		if (unlikely(vcpu->arch.cpu_not_supported)) {
> +			vcpu->arch.cpu_not_supported = false;
> +			ret = -ENOEXEC;
> +			preempt_enable();

How about populating run->fail_entry with some information? I bet this
would be useful, if only as a debugging tool.
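
Something like this before the continue (untested; there is no
arm64-specific failure reason code yet, hence the 0):

	run->exit_reason = KVM_EXIT_FAIL_ENTRY;
	run->fail_entry.hardware_entry_failure_reason = 0;
	run->fail_entry.cpu = smp_processor_id();

so that userspace can at least see which physical CPU the vcpu was
scheduled on.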

> +			continue;
> +		}
> +
>  		kvm_pmu_flush_hwstate(vcpu);
>  
>  		local_irq_disable();
> diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
> index 53cedeb5dbf6..957a6d0cfa56 100644
> --- a/arch/arm64/kvm/pmu-emul.c
> +++ b/arch/arm64/kvm/pmu-emul.c
> @@ -951,6 +951,7 @@ static int kvm_arm_pmu_v3_set_pmu(struct kvm_vcpu *vcpu, int pmu_id)
>  		arm_pmu = entry->arm_pmu;
>  		if (arm_pmu->pmu.type == pmu_id) {
>  			kvm_pmu->arm_pmu = arm_pmu;
> +			cpumask_copy(vcpu->arch.supported_cpus, &arm_pmu->supported_cpus);
>  			return 0;
>  		}
>  	}

Thanks,

	M.
Alexandru Elisei Nov. 22, 2021, 12:12 p.m. UTC | #2
Hi Marc,

On Sun, Nov 21, 2021 at 07:35:13PM +0000, Marc Zyngier wrote:
> On Mon, 15 Nov 2021 16:50:41 +0000,
> Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> > 
> > Userspace can assign a PMU to a VCPU with the KVM_ARM_VCPU_PMU_V3_SET_PMU
> > device ioctl. If the VCPU is scheduled on a physical CPU which has a
> > different PMU, the perf events needed to emulate a guest PMU won't be
> > scheduled in and the guest performance counters will stop counting. Treat
> > this as a userspace error and refuse to run the VCPU in this situation.
> > 
> > The VCPU is flagged as being scheduled on the wrong CPU in vcpu_load(), but
> > the flag is cleared when KVM_RUN enters the non-preemptible section instead
> > of in vcpu_put(); this is done on purpose so the error condition is
> > communicated to userspace as soon as possible, otherwise a vcpu_load() on
> > the wrong CPU followed by a vcpu_put() would clear the flag before KVM_RUN
> > could report it.
> 
> Can we make this something orthogonal to the PMU, and get userspace to
> pick an affinity mask independently of instantiating a PMU? I can
> imagine this would also be useful for SPE on asymmetric systems.

I actually went this way for the latest version of the SPE series [1] and
dropped the explicit userspace ioctl in favor of this mechanism.

The expectation is that userspace already knows which CPUs are associated
with the chosen PMU (or SPE) when setting the PMU for the VCPU, and having
userspace set it explicitly via an ioctl looks like an unnecessary step to
me. I don't see other use cases for an explicit ioctl outside of the above
two situations (if userspace wants a VCPU to run only on specific CPUs, it
can use thread affinity for that), so I decided to drop it.
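
(Pinning a VCPU thread from userspace is just the usual affinity call --
rough sketch using glibc with _GNU_SOURCE, error handling omitted and the
CPU number made up:

	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(cpu_with_pmu, &set);	/* a CPU associated with the VCPU's PMU */
	pthread_setaffinity_np(vcpu_thread, sizeof(set), &set);

where vcpu_thread is the pthread that calls KVM_RUN.)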

For reference, this is how the ioctl looked in the previous version of
the SPE series [2], in case you want to see what it would look like.

[1] https://www.spinics.net/lists/arm-kernel/msg934211.html
[2] https://www.spinics.net/lists/arm-kernel/msg917229.html

> 
> > Suggested-by: Marc Zyngier <maz@kernel.org>
> > Signed-off-by: Alexandru Elisei <alexandru.elisei@arm.com>
> > ---
> >  Documentation/virt/kvm/api.rst          |  5 +++--
> >  Documentation/virt/kvm/devices/vcpu.rst |  3 ++-
> >  arch/arm64/include/asm/kvm_host.h       |  3 +++
> >  arch/arm64/kvm/arm.c                    | 15 +++++++++++++++
> >  arch/arm64/kvm/pmu-emul.c               |  1 +
> >  5 files changed, 24 insertions(+), 3 deletions(-)
> > 
> > diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> > index aeeb071c7688..5bbad8318ea5 100644
> > --- a/Documentation/virt/kvm/api.rst
> > +++ b/Documentation/virt/kvm/api.rst
> > @@ -396,8 +396,9 @@ Errors:
> >  
> >    =======    ==============================================================
> >    EINTR      an unmasked signal is pending
> > -  ENOEXEC    the vcpu hasn't been initialized or the guest tried to execute
> > -             instructions from device memory (arm64)
> > +  ENOEXEC    the vcpu hasn't been initialized, the guest tried to execute
> > +             instructions from device memory (arm64) or the vcpu PMU is
> > +             different from the physical cpu PMU (arm64).
> >    ENOSYS     data abort outside memslots with no syndrome info and
> >               KVM_CAP_ARM_NISV_TO_USER not enabled (arm64)
> >    EPERM      SVE feature set but not finalized (arm64)
> > diff --git a/Documentation/virt/kvm/devices/vcpu.rst b/Documentation/virt/kvm/devices/vcpu.rst
> > index 59ac382af59a..ca0da34da889 100644
> > --- a/Documentation/virt/kvm/devices/vcpu.rst
> > +++ b/Documentation/virt/kvm/devices/vcpu.rst
> > @@ -128,7 +128,8 @@ systems where there are at least two PMUs on the system.
> >  
> >  Note that KVM will not make any attempts to run the VCPU on the physical CPUs
> >  associated with the PMU specified by this attribute. This is entirely left to
> > -userspace.
> > +userspace. However, if the VCPU is scheduled on a CPU which has a different PMU,
> > +then KVM_RUN will return with the error code ENOEXEC.
> >  
> >  2. GROUP: KVM_ARM_VCPU_TIMER_CTRL
> >  =================================
> > diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> > index 2a5f7f38006f..ae2083b41d8a 100644
> > --- a/arch/arm64/include/asm/kvm_host.h
> > +++ b/arch/arm64/include/asm/kvm_host.h
> > @@ -385,6 +385,9 @@ struct kvm_vcpu_arch {
> >  		u64 last_steal;
> >  		gpa_t base;
> >  	} steal;
> > +
> > +	cpumask_var_t supported_cpus;
> > +	bool cpu_not_supported;
> 
> Can this just be made a vcpu flag instead?

Sure, that can also work.

> 
> >  };
> >  
> >  /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */
> > diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> > index 2f03cbfefe67..5dbfd18c4e37 100644
> > --- a/arch/arm64/kvm/arm.c
> > +++ b/arch/arm64/kvm/arm.c
> > @@ -320,6 +320,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
> >  
> >  	vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO;
> >  
> > +	if (!zalloc_cpumask_var(&vcpu->arch.supported_cpus, GFP_KERNEL))
> > +		return -ENOMEM;
> > +
> >  	/* Set up the timer */
> >  	kvm_timer_vcpu_init(vcpu);
> >  
> > @@ -347,6 +350,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
> >  	if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm)))
> >  		static_branch_dec(&userspace_irqchip_in_use);
> >  
> > +	free_cpumask_var(vcpu->arch.supported_cpus);
> >  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
> >  	kvm_timer_vcpu_terminate(vcpu);
> >  	kvm_pmu_vcpu_destroy(vcpu);
> > @@ -425,6 +429,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> >  	if (vcpu_has_ptrauth(vcpu))
> >  		vcpu_ptrauth_disable(vcpu);
> >  	kvm_arch_vcpu_load_debug_state_flags(vcpu);
> > +
> > +	if (!cpumask_empty(vcpu->arch.supported_cpus) &&
> 
> How about initialising the cpumask to cpu_possible_mask, avoiding the
> cpumask_empty check?

That's a great idea, I will change it.

> 
> > +	    !cpumask_test_cpu(smp_processor_id(), vcpu->arch.supported_cpus))
> > +		vcpu->arch.cpu_not_supported = true;
> 
> I have the feeling this would actually better be implemented as a
> request, but there may be some surgery required for this.

I can give it a go, see how it looks.

> 
> >  }
> >  
> >  void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
> > @@ -815,6 +823,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
> >  		 */
> >  		preempt_disable();
> >  
> > +		if (unlikely(vcpu->arch.cpu_not_supported)) {
> > +			vcpu->arch.cpu_not_supported = false;
> > +			ret = -ENOEXEC;
> > +			preempt_enable();
> 
> How about populating run->fail_entry with some information? I bet this
> would be useful, if only as a debugging tool.

That's another great idea, will definitely add it.

Thanks,
Alex

> 
> > +			continue;
> > +		}
> > +
> >  		kvm_pmu_flush_hwstate(vcpu);
> >  
> >  		local_irq_disable();
> > diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
> > index 53cedeb5dbf6..957a6d0cfa56 100644
> > --- a/arch/arm64/kvm/pmu-emul.c
> > +++ b/arch/arm64/kvm/pmu-emul.c
> > @@ -951,6 +951,7 @@ static int kvm_arm_pmu_v3_set_pmu(struct kvm_vcpu *vcpu, int pmu_id)
> >  		arm_pmu = entry->arm_pmu;
> >  		if (arm_pmu->pmu.type == pmu_id) {
> >  			kvm_pmu->arm_pmu = arm_pmu;
> > +			cpumask_copy(vcpu->arch.supported_cpus, &arm_pmu->supported_cpus);
> >  			return 0;
> >  		}
> >  	}
> 
> Thanks,
> 
> 	M.
> 
> -- 
> Without deviation from the norm, progress is not possible.
Marc Zyngier Nov. 22, 2021, 2:21 p.m. UTC | #3
On Mon, 22 Nov 2021 12:12:17 +0000,
Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> 
> Hi Marc,
> 
> On Sun, Nov 21, 2021 at 07:35:13PM +0000, Marc Zyngier wrote:
> > On Mon, 15 Nov 2021 16:50:41 +0000,
> > Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> > > 
> > > Userspace can assign a PMU to a VCPU with the KVM_ARM_VCPU_PMU_V3_SET_PMU
> > > device ioctl. If the VCPU is scheduled on a physical CPU which has a
> > > different PMU, the perf events needed to emulate a guest PMU won't be
> > > scheduled in and the guest performance counters will stop counting. Treat
> > > this as a userspace error and refuse to run the VCPU in this situation.
> > > 
> > > The VCPU is flagged as being scheduled on the wrong CPU in vcpu_load(), but
> > > the flag is cleared when KVM_RUN enters the non-preemptible section instead
> > > of in vcpu_put(); this is done on purpose so the error condition is
> > > communicated to userspace as soon as possible, otherwise a vcpu_load() on
> > > the wrong CPU followed by a vcpu_put() would clear the flag before KVM_RUN
> > > could report it.
> > 
> > Can we make this something orthogonal to the PMU, and get userspace to
> > pick an affinity mask independently of instantiating a PMU? I can
> > imagine this would also be useful for SPE on asymmetric systems.
> 
> I actually went this way for the latest version of the SPE series [1] and
> dropped the explicit userspace ioctl in favor of this mechanism.
> 
> The expectation is that userspace already knows which CPUs are associated
> with the chosen PMU (or SPE) when setting the PMU for the VCPU, and having
> userspace set it explicitly via an ioctl looks like an unnecessary step to
> me. I don't see other use cases for an explicit ioctl outside of the above
> two situations (if userspace wants a VCPU to run only on specific CPUs, it
> can use thread affinity for that), so I decided to drop it.

My problem with that is that if you have (for whatever reason) a set
of affinities that are not strictly identical for both PMU and SPE,
and expose both of these to a guest, what do you choose?

As long as you have a single affinity set to take care of, you're
good. It is when you have several that it becomes ugly (as with
anything involving asymmetric CPUs).

	M.
Alexandru Elisei Nov. 22, 2021, 2:43 p.m. UTC | #4
Hi Marc,

On Mon, Nov 22, 2021 at 02:21:00PM +0000, Marc Zyngier wrote:
> On Mon, 22 Nov 2021 12:12:17 +0000,
> Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> > 
> > Hi Marc,
> > 
> > On Sun, Nov 21, 2021 at 07:35:13PM +0000, Marc Zyngier wrote:
> > > On Mon, 15 Nov 2021 16:50:41 +0000,
> > > Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> > > > 
> > > > Userspace can assign a PMU to a VCPU with the KVM_ARM_VCPU_PMU_V3_SET_PMU
> > > > device ioctl. If the VCPU is scheduled on a physical CPU which has a
> > > > different PMU, the perf events needed to emulate a guest PMU won't be
> > > > scheduled in and the guest performance counters will stop counting. Treat
> > > > this as a userspace error and refuse to run the VCPU in this situation.
> > > > 
> > > > The VCPU is flagged as being scheduled on the wrong CPU in vcpu_load(), but
> > > > the flag is cleared when KVM_RUN enters the non-preemptible section instead
> > > > of in vcpu_put(); this is done on purpose so the error condition is
> > > > communicated to userspace as soon as possible, otherwise a vcpu_load() on
> > > > the wrong CPU followed by a vcpu_put() would clear the flag before KVM_RUN
> > > > could report it.
> > > 
> > > Can we make this something orthogonal to the PMU, and get userspace to
> > > pick an affinity mask independently of instantiating a PMU? I can
> > > imagine this would also be useful for SPE on asymmetric systems.
> > 
> > I actually went this way for the latest version of the SPE series [1] and
> > dropped the explicit userspace ioctl in favor of this mechanism.
> > 
> > The expectation is that userspace already knows which CPUs are associated
> > with the chosen PMU (or SPE) when setting the PMU for the VCPU, and having
> > userspace set it explicitly via an ioctl looks like an unnecessary step to
> > me. I don't see other use cases for an explicit ioctl outside of the above
> > two situations (if userspace wants a VCPU to run only on specific CPUs, it
> > can use thread affinity for that), so I decided to drop it.
> 
> My problem with that is that if you have (for whatever reason) a set
> of affinities that are not strictly identical for both PMU and SPE,
> and expose both of these to a guest, what do you choose?
> 
> As long as you have a single affinity set to take care of, you're
> good. It is when you have several that it becomes ugly (as with
> anything involving asymmetric CPUs).

I thought about it when I decided to do it this way; my solution was to do
a cpumask_and() with the existing VCPU cpumask when setting a VCPU feature
that requires it, and to return an error if the result is an empty cpumask,
because userspace would be requesting a combination of VCPU features that
is not supported by the hardware.
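
Something like this when a feature with an affinity restriction is set
(untested, and the helper name is made up):

	static int kvm_vcpu_restrict_cpus(struct kvm_vcpu *vcpu,
					  const struct cpumask *mask)
	{
		cpumask_and(vcpu->arch.supported_cpus,
			    vcpu->arch.supported_cpus, mask);
		if (cpumask_empty(vcpu->arch.supported_cpus))
			return -EINVAL;
		return 0;
	}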

Going with the other solution (the user sets the cpumask via an ioctl),
KVM would still have to check it against certain combinations of VCPU
features (for SPE the check is mandatory, so KVM doesn't trigger an
undefined exception; we could skip the check for the PMU, but then what
do we gain from the ioctl if KVM doesn't check that it matches the PMU?),
so I don't think we lose anything by going with the implicit cpumask.

What do you think?

Thanks,
Alex

> 
> 	M.
> 
> -- 
> Without deviation from the norm, progress is not possible.
Marc Zyngier Dec. 6, 2021, 10:15 a.m. UTC | #5
On Mon, 22 Nov 2021 14:43:17 +0000,
Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> 
> Hi Marc,
> 
> On Mon, Nov 22, 2021 at 02:21:00PM +0000, Marc Zyngier wrote:
> > On Mon, 22 Nov 2021 12:12:17 +0000,
> > Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> > > 
> > > Hi Marc,
> > > 
> > > On Sun, Nov 21, 2021 at 07:35:13PM +0000, Marc Zyngier wrote:
> > > > On Mon, 15 Nov 2021 16:50:41 +0000,
> > > > Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> > > > > 
> > > > > Userspace can assign a PMU to a VCPU with the KVM_ARM_VCPU_PMU_V3_SET_PMU
> > > > > device ioctl. If the VCPU is scheduled on a physical CPU which has a
> > > > > different PMU, the perf events needed to emulate a guest PMU won't be
> > > > > scheduled in and the guest performance counters will stop counting. Treat
> > > > > this as a userspace error and refuse to run the VCPU in this situation.
> > > > > 
> > > > > The VCPU is flagged as being scheduled on the wrong CPU in vcpu_load(), but
> > > > > the flag is cleared when KVM_RUN enters the non-preemptible section instead
> > > > > of in vcpu_put(); this is done on purpose so the error condition is
> > > > > communicated to userspace as soon as possible, otherwise a vcpu_load() on
> > > > > the wrong CPU followed by a vcpu_put() would clear the flag before KVM_RUN
> > > > > could report it.
> > > > 
> > > > Can we make this something orthogonal to the PMU, and get userspace to
> > > > pick an affinity mask independently of instantiating a PMU? I can
> > > > imagine this would also be useful for SPE on asymmetric systems.
> > > 
> > > I actually went this way for the latest version of the SPE series [1] and
> > > dropped the explicit userspace ioctl in favor of this mechanism.
> > > 
> > > The expectation is that userspace already knows which CPUs are associated
> > > with the chosen PMU (or SPE) when setting the PMU for the VCPU, and having
> > > userspace set it explicitly via an ioctl looks like an unnecessary step to
> > > me. I don't see other use cases for an explicit ioctl outside of the above
> > > two situations (if userspace wants a VCPU to run only on specific CPUs, it
> > > can use thread affinity for that), so I decided to drop it.
> > 
> > My problem with that is that if you have (for whatever reason) a set
> > of affinities that are not strictly identical for both PMU and SPE,
> > and expose both of these to a guest, what do you choose?
> > 
> > As long as you have a single affinity set to take care of, you're
> > good. It is when you have several that it becomes ugly (as with
> > anything involving asymmetric CPUs).
> 
> I thought about it when I decided to do it this way; my solution was to do
> a cpumask_and() with the existing VCPU cpumask when setting a VCPU feature
> that requires it, and to return an error if the result is an empty cpumask,
> because userspace would be requesting a combination of VCPU features that
> is not supported by the hardware.

So every new asymmetric feature would come with its own potential
affinity mask, and KVM would track the restriction of that affinity. I
guess that because it can only converge to zero, this is safe by
design...

One thing I want to make sure of is that we can evaluate the mask very
early on, and reduce the overhead of that evaluation.

> Going with the other solution (the user sets the cpumask via an ioctl),
> KVM would still have to check it against certain combinations of VCPU
> features (for SPE the check is mandatory, so KVM doesn't trigger an
> undefined exception; we could skip the check for the PMU, but then what
> do we gain from the ioctl if KVM doesn't check that it matches the PMU?),
> so I don't think we lose anything by going with the implicit cpumask.
> 
> What do you think?

OK, fair enough. Please respin the series (I had a bunch of minor
comments), and I'll have another look.

Thanks,

	M.
Alexandru Elisei Dec. 6, 2021, 10:26 a.m. UTC | #6
Hi Marc,

On Mon, Dec 06, 2021 at 10:15:31AM +0000, Marc Zyngier wrote:
> On Mon, 22 Nov 2021 14:43:17 +0000,
> Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> > 
> > Hi Marc,
> > 
> > On Mon, Nov 22, 2021 at 02:21:00PM +0000, Marc Zyngier wrote:
> > > On Mon, 22 Nov 2021 12:12:17 +0000,
> > > Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> > > > 
> > > > Hi Marc,
> > > > 
> > > > On Sun, Nov 21, 2021 at 07:35:13PM +0000, Marc Zyngier wrote:
> > > > > On Mon, 15 Nov 2021 16:50:41 +0000,
> > > > > Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> > > > > > 
> > > > > > Userspace can assign a PMU to a VCPU with the KVM_ARM_VCPU_PMU_V3_SET_PMU
> > > > > > device ioctl. If the VCPU is scheduled on a physical CPU which has a
> > > > > > different PMU, the perf events needed to emulate a guest PMU won't be
> > > > > > scheduled in and the guest performance counters will stop counting. Treat
> > > > > > this as a userspace error and refuse to run the VCPU in this situation.
> > > > > > 
> > > > > > The VCPU is flagged as being scheduled on the wrong CPU in vcpu_load(), but
> > > > > > the flag is cleared when KVM_RUN enters the non-preemptible section instead
> > > > > > of in vcpu_put(); this is done on purpose so the error condition is
> > > > > > communicated to userspace as soon as possible, otherwise a vcpu_load() on
> > > > > > the wrong CPU followed by a vcpu_put() would clear the flag before KVM_RUN
> > > > > > could report it.
> > > > > 
> > > > > Can we make this something orthogonal to the PMU, and get userspace to
> > > > > pick an affinity mask independently of instantiating a PMU? I can
> > > > > imagine this would also be useful for SPE on asymmetric systems.
> > > > 
> > > > I actually went this way for the latest version of the SPE series [1] and
> > > > dropped the explicit userspace ioctl in favor of this mechanism.
> > > > 
> > > > The expectation is that userspace already knows which CPUs are associated
> > > > with the chosen PMU (or SPE) when setting the PMU for the VCPU, and having
> > > > userspace set it explicitly via an ioctl looks like an unnecessary step to
> > > > me. I don't see other use cases for an explicit ioctl outside of the above
> > > > two situations (if userspace wants a VCPU to run only on specific CPUs, it
> > > > can use thread affinity for that), so I decided to drop it.
> > > 
> > > My problem with that is that if you have (for whatever reason) a set
> > > of affinities that are not strictly identical for both PMU and SPE,
> > > and expose both of these to a guest, what do you choose?
> > > 
> > > As long as you have a single affinity set to take care of, you're
> > > good. It is when you have several that it becomes ugly (as with
> > > anything involving asymmetric CPUs).
> > 
> > I thought about it when I decided to do it this way; my solution was to do
> > a cpumask_and() with the existing VCPU cpumask when setting a VCPU feature
> > that requires it, and to return an error if the result is an empty cpumask,
> > because userspace would be requesting a combination of VCPU features that
> > is not supported by the hardware.
> 
> > So every new asymmetric feature would come with its own potential
> affinity mask, and KVM would track the restriction of that affinity. I
> guess that because it can only converge to zero, this is safe by
> design...
> 
> One thing I want to make sure of is that we can evaluate the mask very
> early on, and reduce the overhead of that evaluation.

I don't think the check can be made any sooner than when the feature bit is set,
which is what I am proposing :)

> 
> > Going with the other solution (the user sets the cpumask via an ioctl),
> > KVM would still have to check it against certain combinations of VCPU
> > features (for SPE the check is mandatory, so KVM doesn't trigger an
> > undefined exception; we could skip the check for the PMU, but then what
> > do we gain from the ioctl if KVM doesn't check that it matches the PMU?),
> > so I don't think we lose anything by going with the implicit cpumask.
> > 
> > What do you think?
> 
> OK, fair enough. Please respin the series (I had a bunch of minor
> comments), and I'll have another look.

Great, thanks!

Alex

> 
> Thanks,
> 
> 	M.
> 
> -- 
> Without deviation from the norm, progress is not possible.

Patch

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index aeeb071c7688..5bbad8318ea5 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -396,8 +396,9 @@  Errors:
 
   =======    ==============================================================
   EINTR      an unmasked signal is pending
-  ENOEXEC    the vcpu hasn't been initialized or the guest tried to execute
-             instructions from device memory (arm64)
+  ENOEXEC    the vcpu hasn't been initialized, the guest tried to execute
+             instructions from device memory (arm64) or the vcpu PMU is
+             different from the physical cpu PMU (arm64).
   ENOSYS     data abort outside memslots with no syndrome info and
              KVM_CAP_ARM_NISV_TO_USER not enabled (arm64)
   EPERM      SVE feature set but not finalized (arm64)
diff --git a/Documentation/virt/kvm/devices/vcpu.rst b/Documentation/virt/kvm/devices/vcpu.rst
index 59ac382af59a..ca0da34da889 100644
--- a/Documentation/virt/kvm/devices/vcpu.rst
+++ b/Documentation/virt/kvm/devices/vcpu.rst
@@ -128,7 +128,8 @@  systems where there are at least two PMUs on the system.
 
 Note that KVM will not make any attempts to run the VCPU on the physical CPUs
 associated with the PMU specified by this attribute. This is entirely left to
-userspace.
+userspace. However, if the VCPU is scheduled on a CPU which has a different PMU,
+then KVM_RUN will return with the error code ENOEXEC.
 
 2. GROUP: KVM_ARM_VCPU_TIMER_CTRL
 =================================
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 2a5f7f38006f..ae2083b41d8a 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -385,6 +385,9 @@  struct kvm_vcpu_arch {
 		u64 last_steal;
 		gpa_t base;
 	} steal;
+
+	cpumask_var_t supported_cpus;
+	bool cpu_not_supported;
 };
 
 /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 2f03cbfefe67..5dbfd18c4e37 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -320,6 +320,9 @@  int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO;
 
+	if (!zalloc_cpumask_var(&vcpu->arch.supported_cpus, GFP_KERNEL))
+		return -ENOMEM;
+
 	/* Set up the timer */
 	kvm_timer_vcpu_init(vcpu);
 
@@ -347,6 +350,7 @@  void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 	if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm)))
 		static_branch_dec(&userspace_irqchip_in_use);
 
+	free_cpumask_var(vcpu->arch.supported_cpus);
 	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
 	kvm_timer_vcpu_terminate(vcpu);
 	kvm_pmu_vcpu_destroy(vcpu);
@@ -425,6 +429,10 @@  void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	if (vcpu_has_ptrauth(vcpu))
 		vcpu_ptrauth_disable(vcpu);
 	kvm_arch_vcpu_load_debug_state_flags(vcpu);
+
+	if (!cpumask_empty(vcpu->arch.supported_cpus) &&
+	    !cpumask_test_cpu(smp_processor_id(), vcpu->arch.supported_cpus))
+		vcpu->arch.cpu_not_supported = true;
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -815,6 +823,13 @@  int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 		 */
 		preempt_disable();
 
+		if (unlikely(vcpu->arch.cpu_not_supported)) {
+			vcpu->arch.cpu_not_supported = false;
+			ret = -ENOEXEC;
+			preempt_enable();
+			continue;
+		}
+
 		kvm_pmu_flush_hwstate(vcpu);
 
 		local_irq_disable();
diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index 53cedeb5dbf6..957a6d0cfa56 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -951,6 +951,7 @@  static int kvm_arm_pmu_v3_set_pmu(struct kvm_vcpu *vcpu, int pmu_id)
 		arm_pmu = entry->arm_pmu;
 		if (arm_pmu->pmu.type == pmu_id) {
 			kvm_pmu->arm_pmu = arm_pmu;
+			cpumask_copy(vcpu->arch.supported_cpus, &arm_pmu->supported_cpus);
 			return 0;
 		}
 	}