
[RFC,02/41] perf: Support guest enter/exit interfaces

Message ID 20240126085444.324918-3-xiong.y.zhang@linux.intel.com (mailing list archive)
State New, archived
Series KVM: x86/pmu: Introduce passthrough vPMU

Commit Message

Xiong Zhang Jan. 26, 2024, 8:54 a.m. UTC
From: Kan Liang <kan.liang@linux.intel.com>

Currently, the guest and host share the PMU resources when a guest is
running. KVM has to create an extra virtual event to simulate the
guest's event, which brings several issues, e.g., high overhead and
poor accuracy.

A new pass-through method is proposed to address the issue. It requires
that the PMU resources can be fully occupied by the guest while it's
running. Two new interfaces are implemented to fulfill the requirement.
The hypervisor should invoke the interfaces when entering/exiting a
guest that wants the pass-through PMU capability.

The PMU resources should only be temporarily occupied while a guest is
running. When the guest exits, the PMU resources are again shared among
different users.

The exclude_guest event modifier is used to guarantee the exclusive
occupation of the PMU resources. When a guest enters, perf forces the
exclude_guest capability: pre-existing !exclude_guest events are moved
to the error state, and creation of new !exclude_guest events errors
out during the period. So the PMU resources can be safely accessed by
the guest directly.
https://lore.kernel.org/lkml/20231002204017.GB27267@noisy.programming.kicks-ass.net/

Not all PMUs support exclude_guest and vPMU pass-through, e.g., uncore
PMU and SW PMU. The guest enter/exit interfaces should only impact the
supported PMUs. Add a new PERF_PMU_CAP_VPMU_PASSTHROUGH flag to indicate
the PMUs that support the feature.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Mingwei Zhang <mizhang@google.com>
---
 include/linux/perf_event.h |   9 ++
 kernel/events/core.c       | 174 +++++++++++++++++++++++++++++++++++++
 2 files changed, 183 insertions(+)
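
For orientation, here is a minimal sketch of how a hypervisor might use the
new interfaces around VM entry/exit. It is illustrative only: the example_*
function names are not part of this series; only perf_guest_enter(),
perf_guest_exit() and PERF_PMU_CAP_VPMU_PASSTHROUGH come from this patch.

/* Illustrative only; interrupts must be disabled around both calls. */
static void example_switch_pmu_to_guest(void)
{
	/* Sched out host events on PMUs with PERF_PMU_CAP_VPMU_PASSTHROUGH. */
	perf_guest_enter();

	/* ... load guest PMU state and do VM-Enter ... */
}

static void example_switch_pmu_to_host(void)
{
	/* ... do VM-Exit and save guest PMU state ... */

	/* Re-schedule the host's events on the passthrough PMU. */
	perf_guest_exit();
}

A PMU driver would opt in by advertising the capability, e.g. (illustrative):

	pmu->capabilities |= PERF_PMU_CAP_VPMU_PASSTHROUGH;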

Comments

Raghavendra Rao Ananta March 20, 2024, 4:40 p.m. UTC | #1
Hi Kan,

>
> +static void __perf_force_exclude_guest_pmu(struct perf_event_pmu_context *pmu_ctx,
> +                                          struct perf_event *event)
> +{
> +       struct perf_event_context *ctx = pmu_ctx->ctx;
> +       struct perf_event *sibling;
> +       bool include_guest = false;
> +
> +       event_sched_out(event, ctx);
> +       if (!event->attr.exclude_guest)
> +               include_guest = true;
> +       for_each_sibling_event(sibling, event) {
> +               event_sched_out(sibling, ctx);
> +               if (!sibling->attr.exclude_guest)
> +                       include_guest = true;
> +       }
> +       if (include_guest) {
> +               perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
> +               for_each_sibling_event(sibling, event)
> +                       perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
> +       }
Does the perf core revert the PERF_EVENT_STATE_ERROR state somewhere
from the perf_guest_exit() path, or is it expected to remain in this
state?
IIUC, in the perf_guest_exit() path, when we land into
merge_sched_in(), we never schedule the event back if event->state <=
PERF_EVENT_STATE_OFF.

Thank you.
Raghavendra
Liang, Kan March 20, 2024, 5:12 p.m. UTC | #2
On 2024-03-20 12:40 p.m., Raghavendra Rao Ananta wrote:
> Hi Kan,
> 
>>
>> +static void __perf_force_exclude_guest_pmu(struct perf_event_pmu_context *pmu_ctx,
>> +                                          struct perf_event *event)
>> +{
>> +       struct perf_event_context *ctx = pmu_ctx->ctx;
>> +       struct perf_event *sibling;
>> +       bool include_guest = false;
>> +
>> +       event_sched_out(event, ctx);
>> +       if (!event->attr.exclude_guest)
>> +               include_guest = true;
>> +       for_each_sibling_event(sibling, event) {
>> +               event_sched_out(sibling, ctx);
>> +               if (!sibling->attr.exclude_guest)
>> +                       include_guest = true;
>> +       }
>> +       if (include_guest) {
>> +               perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
>> +               for_each_sibling_event(sibling, event)
>> +                       perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
>> +       }
> Does the perf core revert the PERF_EVENT_STATE_ERROR state somewhere
> from the perf_guest_exit() path, or is it expected to remain in this
> state?
> IIUC, in the perf_guest_exit() path, when we land into
> merge_sched_in(), we never schedule the event back if event->state <=
> PERF_EVENT_STATE_OFF.
> 

Perf doesn't revert an event in the ERROR state. The user asked to
profile both guest and host, but the pass-through mode doesn't allow
profiling the guest, so the event has to error out and remain in the ERROR state.

Thanks,
Kan
Sean Christopherson April 11, 2024, 6:06 p.m. UTC | #3
On Fri, Jan 26, 2024, Xiong Zhang wrote:
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 683dc086ef10..59471eeec7e4 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -3803,6 +3803,8 @@ static inline void group_update_userpage(struct perf_event *group_event)
>  		event_update_userpage(event);
>  }
>  
> +static DEFINE_PER_CPU(bool, __perf_force_exclude_guest);
> +
>  static int merge_sched_in(struct perf_event *event, void *data)
>  {
>  	struct perf_event_context *ctx = event->ctx;
> @@ -3814,6 +3816,14 @@ static int merge_sched_in(struct perf_event *event, void *data)
>  	if (!event_filter_match(event))
>  		return 0;
>  
> +	/*
> +	 * The __perf_force_exclude_guest indicates entering the guest.
> +	 * No events of the passthrough PMU should be scheduled.
> +	 */
> +	if (__this_cpu_read(__perf_force_exclude_guest) &&
> +	    has_vpmu_passthrough_cap(event->pmu))

As mentioned in the previous reply, I think perf should WARN and reject any attempt
to trigger a "passthrough" context switch if such a switch isn't supported by
perf, not silently let it go through and then skip things later.

> +		return 0;
> +
>  	if (group_can_go_on(event, *can_add_hw)) {
>  		if (!group_sched_in(event, ctx))
>  			list_add_tail(&event->active_list, get_event_list(event));

...

> +/*
> + * When a guest enters, force all active events of the PMU, which supports
> + * the VPMU_PASSTHROUGH feature, to be scheduled out. The events of other
> + * PMUs, such as uncore PMU, should not be impacted. The guest can
> + * temporarily own all counters of the PMU.
> + * During the period, all the creation of the new event of the PMU with
> + * !exclude_guest are error out.
> + */
> +void perf_guest_enter(void)
> +{
> +	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
> +
> +	lockdep_assert_irqs_disabled();
> +
> +	if (__this_cpu_read(__perf_force_exclude_guest))

This should be a WARN_ON_ONCE, no?

> +		return;
> +
> +	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> +
> +	perf_force_exclude_guest_enter(&cpuctx->ctx);
> +	if (cpuctx->task_ctx)
> +		perf_force_exclude_guest_enter(cpuctx->task_ctx);
> +
> +	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> +
> +	__this_cpu_write(__perf_force_exclude_guest, true);
> +}
> +EXPORT_SYMBOL_GPL(perf_guest_enter);
> +
> +static void perf_force_exclude_guest_exit(struct perf_event_context *ctx)
> +{
> +	struct perf_event_pmu_context *pmu_ctx;
> +	struct pmu *pmu;
> +
> +	update_context_time(ctx);
> +	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> +		pmu = pmu_ctx->pmu;
> +		if (!has_vpmu_passthrough_cap(pmu))
> +			continue;

I don't see how we can sanely support a CPU that doesn't support writable
PERF_GLOBAL_STATUS across all PMUs.

> +
> +		perf_pmu_disable(pmu);
> +		pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu);
> +		pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
> +		perf_pmu_enable(pmu);
> +	}
> +}
> +
> +void perf_guest_exit(void)
> +{
> +	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
> +
> +	lockdep_assert_irqs_disabled();
> +
> +	if (!__this_cpu_read(__perf_force_exclude_guest))

WARN_ON_ONCE here too?

> +		return;
> +
> +	__this_cpu_write(__perf_force_exclude_guest, false);
> +
> +	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> +
> +	perf_force_exclude_guest_exit(&cpuctx->ctx);
> +	if (cpuctx->task_ctx)
> +		perf_force_exclude_guest_exit(cpuctx->task_ctx);
> +
> +	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> +}
> +EXPORT_SYMBOL_GPL(perf_guest_exit);
> +
> +static inline int perf_force_exclude_guest_check(struct perf_event *event,
> +						 int cpu, struct task_struct *task)
> +{
> +	bool *force_exclude_guest = NULL;
> +
> +	if (!has_vpmu_passthrough_cap(event->pmu))
> +		return 0;
> +
> +	if (event->attr.exclude_guest)
> +		return 0;
> +
> +	if (cpu != -1) {
> +		force_exclude_guest = per_cpu_ptr(&__perf_force_exclude_guest, cpu);
> +	} else if (task && (task->flags & PF_VCPU)) {
> +		/*
> +		 * Just need to check the running CPU in the event creation. If the
> +		 * task is moved to another CPU which supports the force_exclude_guest.
> +		 * The event will filtered out and be moved to the error stage. See
> +		 * merge_sched_in().
> +		 */
> +		force_exclude_guest = per_cpu_ptr(&__perf_force_exclude_guest, task_cpu(task));
> +	}

These checks are extremely racy, I don't see how this can possibly do the
right thing.  PF_VCPU isn't a "this is a vCPU task", it's a "this task is about
to do VM-Enter, or just took a VM-Exit" (the "I'm a virtual CPU" comment in
include/linux/sched.h is wildly misleading, as it's _only_ valid when accounting
time slices).

Digging deeper, I think __perf_force_exclude_guest has similar problems, e.g.
perf_event_create_kernel_counter() calls perf_event_alloc() before acquiring the
per-CPU context mutex.

> +	if (force_exclude_guest && *force_exclude_guest)
> +		return -EBUSY;
> +	return 0;
> +}
> +
>  /*
>   * Holding the top-level event's child_mutex means that any
>   * descendant process that has inherited this event will block
> @@ -11973,6 +12142,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
>  		goto err_ns;
>  	}
>  
> +	if (perf_force_exclude_guest_check(event, cpu, task)) {

This should be:

	err = perf_force_exclude_guest_check(event, cpu, task);
	if (err)
		goto err_pmu;

i.e. shouldn't effectively ignore/override the return result.

> +		err = -EBUSY;
> +		goto err_pmu;
> +	}
> +
>  	/*
>  	 * Disallow uncore-task events. Similarly, disallow uncore-cgroup
>  	 * events (they don't make sense as the cgroup will be different
> -- 
> 2.34.1
>
Liang, Kan April 11, 2024, 7:53 p.m. UTC | #4
On 2024-04-11 2:06 p.m., Sean Christopherson wrote:
> On Fri, Jan 26, 2024, Xiong Zhang wrote:
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 683dc086ef10..59471eeec7e4 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -3803,6 +3803,8 @@ static inline void group_update_userpage(struct perf_event *group_event)
>>  		event_update_userpage(event);
>>  }
>>  
>> +static DEFINE_PER_CPU(bool, __perf_force_exclude_guest);
>> +
>>  static int merge_sched_in(struct perf_event *event, void *data)
>>  {
>>  	struct perf_event_context *ctx = event->ctx;
>> @@ -3814,6 +3816,14 @@ static int merge_sched_in(struct perf_event *event, void *data)
>>  	if (!event_filter_match(event))
>>  		return 0;
>>  
>> +	/*
>> +	 * The __perf_force_exclude_guest indicates entering the guest.
>> +	 * No events of the passthrough PMU should be scheduled.
>> +	 */
>> +	if (__this_cpu_read(__perf_force_exclude_guest) &&
>> +	    has_vpmu_passthrough_cap(event->pmu))
> 
> As mentioned in the previous reply, I think perf should WARN and reject any attempt
> to trigger a "passthrough" context switch if such a switch isn't supported by
> perf, not silently let it go through and then skip things later.

perf supports many PMUs. The core PMU is one of them. Only the core PMU
supports "passthrough", and it will do the "passthrough" context switch if
there are active events.
Other PMUs should not be impacted; the "passthrough" context
switch should be transparent to them.

The check here is to reject an existing host event in the schedule stage. If a
"passthrough" guest is running, perf should reject any existing host
events of the PMUs that support "passthrough".

> 
>> +		return 0;
>> +
>>  	if (group_can_go_on(event, *can_add_hw)) {
>>  		if (!group_sched_in(event, ctx))
>>  			list_add_tail(&event->active_list, get_event_list(event));
> 
> ...
> 
>> +/*
>> + * When a guest enters, force all active events of the PMU, which supports
>> + * the VPMU_PASSTHROUGH feature, to be scheduled out. The events of other
>> + * PMUs, such as uncore PMU, should not be impacted. The guest can
>> + * temporarily own all counters of the PMU.
>> + * During the period, all the creation of the new event of the PMU with
>> + * !exclude_guest are error out.
>> + */
>> +void perf_guest_enter(void)
>> +{
>> +	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
>> +
>> +	lockdep_assert_irqs_disabled();
>> +
>> +	if (__this_cpu_read(__perf_force_exclude_guest))
> 
> This should be a WARN_ON_ONCE, no?

To debug the improper behavior of KVM?
I guess yes.

> 
>> +		return;
>> +
>> +	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>> +
>> +	perf_force_exclude_guest_enter(&cpuctx->ctx);
>> +	if (cpuctx->task_ctx)
>> +		perf_force_exclude_guest_enter(cpuctx->task_ctx);
>> +
>> +	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>> +
>> +	__this_cpu_write(__perf_force_exclude_guest, true);
>> +}
>> +EXPORT_SYMBOL_GPL(perf_guest_enter);
>> +
>> +static void perf_force_exclude_guest_exit(struct perf_event_context *ctx)
>> +{
>> +	struct perf_event_pmu_context *pmu_ctx;
>> +	struct pmu *pmu;
>> +
>> +	update_context_time(ctx);
>> +	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
>> +		pmu = pmu_ctx->pmu;
>> +		if (!has_vpmu_passthrough_cap(pmu))
>> +			continue;
> 
> I don't see how we can sanely support a CPU that doesn't support writable
> PERF_GLOBAL_STATUS across all PMUs.

Only the core PMU has PERF_GLOBAL_STATUS. Other PMUs, e.g., the uncore PMU,
aren't impacted by that MSR, so it can be ignored for them.

> 
>> +
>> +		perf_pmu_disable(pmu);
>> +		pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu);
>> +		pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
>> +		perf_pmu_enable(pmu);
>> +	}
>> +}
>> +
>> +void perf_guest_exit(void)
>> +{
>> +	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
>> +
>> +	lockdep_assert_irqs_disabled();
>> +
>> +	if (!__this_cpu_read(__perf_force_exclude_guest))
> 
> WARN_ON_ONCE here too?
> 
>> +		return;
>> +
>> +	__this_cpu_write(__perf_force_exclude_guest, false);
>> +
>> +	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>> +
>> +	perf_force_exclude_guest_exit(&cpuctx->ctx);
>> +	if (cpuctx->task_ctx)
>> +		perf_force_exclude_guest_exit(cpuctx->task_ctx);
>> +
>> +	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>> +}
>> +EXPORT_SYMBOL_GPL(perf_guest_exit);
>> +
>> +static inline int perf_force_exclude_guest_check(struct perf_event *event,
>> +						 int cpu, struct task_struct *task)
>> +{
>> +	bool *force_exclude_guest = NULL;
>> +
>> +	if (!has_vpmu_passthrough_cap(event->pmu))
>> +		return 0;
>> +
>> +	if (event->attr.exclude_guest)
>> +		return 0;
>> +
>> +	if (cpu != -1) {
>> +		force_exclude_guest = per_cpu_ptr(&__perf_force_exclude_guest, cpu);
>> +	} else if (task && (task->flags & PF_VCPU)) {
>> +		/*
>> +		 * Just need to check the running CPU in the event creation. If the
>> +		 * task is moved to another CPU which supports the force_exclude_guest.
>> +		 * The event will filtered out and be moved to the error stage. See
>> +		 * merge_sched_in().
>> +		 */
>> +		force_exclude_guest = per_cpu_ptr(&__perf_force_exclude_guest, task_cpu(task));
>> +	}
> 
> These checks are extremely racy, I don't see how this can possibly do the
> right thing.  PF_VCPU isn't a "this is a vCPU task", it's a "this task is about
> to do VM-Enter, or just took a VM-Exit" (the "I'm a virtual CPU" comment in
> include/linux/sched.h is wildly misleading, as it's _only_ valid when accounting
> time slices).
>

This is to reject an !exclude_guest event creation for a running
"passthrough" guest from host perf tool.
Could you please suggest a way to detect it via the struct task_struct?


> Digging deeper, I think __perf_force_exclude_guest has similar problems, e.g.
> perf_event_create_kernel_counter() calls perf_event_alloc() before acquiring the
> per-CPU context mutex.

Do you mean that the perf_guest_enter() check could happen right
after the perf_force_exclude_guest_check()?
It's possible. In that case, the event can still be created. It will be
treated as an existing event and handled in merge_sched_in(). It will
never be scheduled while a guest is running.

The perf_force_exclude_guest_check() is to make sure most of the cases
can be rejected at the creation place. For the corner cases, they will
be rejected in the schedule stage.

> 
>> +	if (force_exclude_guest && *force_exclude_guest)
>> +		return -EBUSY;
>> +	return 0;
>> +}
>> +
>>  /*
>>   * Holding the top-level event's child_mutex means that any
>>   * descendant process that has inherited this event will block
>> @@ -11973,6 +12142,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
>>  		goto err_ns;
>>  	}
>>  
>> +	if (perf_force_exclude_guest_check(event, cpu, task)) {
> 
> This should be:
> 
> 	err = perf_force_exclude_guest_check(event, cpu, task);
> 	if (err)
> 		goto err_pmu;
> 
> i.e. shouldn't effectively ignore/override the return result.
>

Sure.

Thanks,
Kan

>> +		err = -EBUSY;
>> +		goto err_pmu;
>> +	}
>> +
>>  	/*
>>  	 * Disallow uncore-task events. Similarly, disallow uncore-cgroup
>>  	 * events (they don't make sense as the cgroup will be different
>> -- 
>> 2.34.1
>>
Sean Christopherson April 12, 2024, 7:17 p.m. UTC | #5
On Thu, Apr 11, 2024, Kan Liang wrote:
> >> +/*
> >> + * When a guest enters, force all active events of the PMU, which supports
> >> + * the VPMU_PASSTHROUGH feature, to be scheduled out. The events of other
> >> + * PMUs, such as uncore PMU, should not be impacted. The guest can
> >> + * temporarily own all counters of the PMU.
> >> + * During the period, all the creation of the new event of the PMU with
> >> + * !exclude_guest are error out.
> >> + */
> >> +void perf_guest_enter(void)
> >> +{
> >> +	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
> >> +
> >> +	lockdep_assert_irqs_disabled();
> >> +
> >> +	if (__this_cpu_read(__perf_force_exclude_guest))
> > 
> > This should be a WARN_ON_ONCE, no?
> 
> To debug the improper behavior of KVM?

Not so much "debug" as ensure that the platform owner notices that KVM is buggy.
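
Concretely, the suggested check at the top of perf_guest_enter() would be
something like (a sketch):

	if (WARN_ON_ONCE(__this_cpu_read(__perf_force_exclude_guest)))
		return;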

> >> +static inline int perf_force_exclude_guest_check(struct perf_event *event,
> >> +						 int cpu, struct task_struct *task)
> >> +{
> >> +	bool *force_exclude_guest = NULL;
> >> +
> >> +	if (!has_vpmu_passthrough_cap(event->pmu))
> >> +		return 0;
> >> +
> >> +	if (event->attr.exclude_guest)
> >> +		return 0;
> >> +
> >> +	if (cpu != -1) {
> >> +		force_exclude_guest = per_cpu_ptr(&__perf_force_exclude_guest, cpu);
> >> +	} else if (task && (task->flags & PF_VCPU)) {
> >> +		/*
> >> +		 * Just need to check the running CPU in the event creation. If the
> >> +		 * task is moved to another CPU which supports the force_exclude_guest.
> >> +		 * The event will filtered out and be moved to the error stage. See
> >> +		 * merge_sched_in().
> >> +		 */
> >> +		force_exclude_guest = per_cpu_ptr(&__perf_force_exclude_guest, task_cpu(task));
> >> +	}
> > 
> > These checks are extremely racy, I don't see how this can possibly do the
> > right thing.  PF_VCPU isn't a "this is a vCPU task", it's a "this task is about
> > to do VM-Enter, or just took a VM-Exit" (the "I'm a virtual CPU" comment in
> > include/linux/sched.h is wildly misleading, as it's _only_ valid when accounting
> > time slices).
> >
> 
> This is to reject an !exclude_guest event creation for a running
> "passthrough" guest from host perf tool.
> Could you please suggest a way to detect it via the struct task_struct?
> 
> > Digging deeper, I think __perf_force_exclude_guest has similar problems, e.g.
> > perf_event_create_kernel_counter() calls perf_event_alloc() before acquiring the
> > per-CPU context mutex.
> 
> Do you mean that the perf_guest_enter() check could be happened right
> after the perf_force_exclude_guest_check()?
> It's possible. For this case, the event can still be created. It will be
> treated as an existing event and handled in merge_sched_in(). It will
> never be scheduled when a guest is running.
> 
> The perf_force_exclude_guest_check() is to make sure most of the cases
> can be rejected at the creation place. For the corner cases, they will
> be rejected in the schedule stage.

Ah, the "rejected in the schedule stage" is what I'm missing.  But that creates
a gross ABI, because IIUC, event creation will "randomly" succeed based on whether
or not a CPU happens to be running in a KVM guest.  I.e. it's not just the kernel
code that has races, the entire event creation is one big race.

What if perf had a global knob to enable/disable mediate PMU support?  Then when
KVM is loaded with enable_mediated_true, call into perf to (a) check that there
are no existing !exclude_guest events (this part could be optional), and (b) set
the global knob to reject all new !exclude_guest events (for the core PMU?).

Hmm, or probably better, do it at VM creation.  That has the advantage of playing
nice with CONFIG_KVM=y (perf could reject the enabling without completely breaking
KVM), and not causing problems if KVM is auto-probed but the user doesn't actually
want to run VMs.

E.g. (very roughly)

int x86_perf_get_mediated_pmu(void)
{
	if (refcount_inc_not_zero(...))
		return 0;

	if (<system wide events>)
		return -EBUSY;

	<slow path with locking>
}

void x86_perf_put_mediated_pmu(void)
{
	if (!refcount_dec_and_test(...))
		return;

	<slow path with locking>
}

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1bbf312cbd73..f2994377ef44 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -12467,6 +12467,12 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        if (type)
                return -EINVAL;
 
+       if (enable_mediated_pmu) {
+               ret = x86_perf_get_mediated_pmu();
+               if (ret)
+                       return ret;
+       }
+
        ret = kvm_page_track_init(kvm);
        if (ret)
                goto out;
@@ -12518,6 +12524,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        kvm_mmu_uninit_vm(kvm);
        kvm_page_track_cleanup(kvm);
 out:
+       x86_perf_put_mediated_pmu();
        return ret;
 }
 
@@ -12659,6 +12666,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
        kvm_page_track_cleanup(kvm);
        kvm_xen_destroy_vm(kvm);
        kvm_hv_destroy_vm(kvm);
+       x86_perf_put_mediated_pmu();
 }
 
 static void memslot_rmap_free(struct kvm_memory_slot *slot)
Liang, Kan April 12, 2024, 8:56 p.m. UTC | #6
On 2024-04-12 3:17 p.m., Sean Christopherson wrote:
> On Thu, Apr 11, 2024, Kan Liang wrote:
>>>> +/*
>>>> + * When a guest enters, force all active events of the PMU, which supports
>>>> + * the VPMU_PASSTHROUGH feature, to be scheduled out. The events of other
>>>> + * PMUs, such as uncore PMU, should not be impacted. The guest can
>>>> + * temporarily own all counters of the PMU.
>>>> + * During the period, all the creation of the new event of the PMU with
>>>> + * !exclude_guest are error out.
>>>> + */
>>>> +void perf_guest_enter(void)
>>>> +{
>>>> +	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
>>>> +
>>>> +	lockdep_assert_irqs_disabled();
>>>> +
>>>> +	if (__this_cpu_read(__perf_force_exclude_guest))
>>>
>>> This should be a WARN_ON_ONCE, no?
>>
>> To debug the improper behavior of KVM?
> 
> Not so much "debug" as ensure that the platform owner noticies that KVM is buggy.
> 
>>>> +static inline int perf_force_exclude_guest_check(struct perf_event *event,
>>>> +						 int cpu, struct task_struct *task)
>>>> +{
>>>> +	bool *force_exclude_guest = NULL;
>>>> +
>>>> +	if (!has_vpmu_passthrough_cap(event->pmu))
>>>> +		return 0;
>>>> +
>>>> +	if (event->attr.exclude_guest)
>>>> +		return 0;
>>>> +
>>>> +	if (cpu != -1) {
>>>> +		force_exclude_guest = per_cpu_ptr(&__perf_force_exclude_guest, cpu);
>>>> +	} else if (task && (task->flags & PF_VCPU)) {
>>>> +		/*
>>>> +		 * Just need to check the running CPU in the event creation. If the
>>>> +		 * task is moved to another CPU which supports the force_exclude_guest.
>>>> +		 * The event will filtered out and be moved to the error stage. See
>>>> +		 * merge_sched_in().
>>>> +		 */
>>>> +		force_exclude_guest = per_cpu_ptr(&__perf_force_exclude_guest, task_cpu(task));
>>>> +	}
>>>
>>> These checks are extremely racy, I don't see how this can possibly do the
>>> right thing.  PF_VCPU isn't a "this is a vCPU task", it's a "this task is about
>>> to do VM-Enter, or just took a VM-Exit" (the "I'm a virtual CPU" comment in
>>> include/linux/sched.h is wildly misleading, as it's _only_ valid when accounting
>>> time slices).
>>>
>>
>> This is to reject an !exclude_guest event creation for a running
>> "passthrough" guest from host perf tool.
>> Could you please suggest a way to detect it via the struct task_struct?
>>
>>> Digging deeper, I think __perf_force_exclude_guest has similar problems, e.g.
>>> perf_event_create_kernel_counter() calls perf_event_alloc() before acquiring the
>>> per-CPU context mutex.
>>
>> Do you mean that the perf_guest_enter() check could be happened right
>> after the perf_force_exclude_guest_check()?
>> It's possible. For this case, the event can still be created. It will be
>> treated as an existing event and handled in merge_sched_in(). It will
>> never be scheduled when a guest is running.
>>
>> The perf_force_exclude_guest_check() is to make sure most of the cases
>> can be rejected at the creation place. For the corner cases, they will
>> be rejected in the schedule stage.
> 
> Ah, the "rejected in the schedule stage" is what I'm missing.  But that creates
> a gross ABI, because IIUC, event creation will "randomly" succeed based on whether
> or not a CPU happens to be running in a KVM guest.  I.e. it's not just the kernel
> code that has races, the entire event creation is one big race.
> 
> What if perf had a global knob to enable/disable mediate PMU support?  Then when
> KVM is loaded with enable_mediated_true, call into perf to (a) check that there
> are no existing !exclude_guest events (this part could be optional), and (b) set
> the global knob to reject all new !exclude_guest events (for the core PMU?).
> 
> Hmm, or probably better, do it at VM creation.  That has the advantage of playing
> nice with CONFIG_KVM=y (perf could reject the enabling without completely breaking
> KVM), and not causing problems if KVM is auto-probed but the user doesn't actually
> want to run VMs.

I think it should be doable, and may simplify the perf implementation.
(The check in the schedule stage should not be necessary anymore.)

With this, something like NMI watchdog should fail the VM creation. The
user should either disable the NMI watchdog or use a replacement.

Thanks,
Kan
> 
> E.g. (very roughly)
> 
> int x86_perf_get_mediated_pmu(void)
> {
> 	if (refcount_inc_not_zero(...))
> 		return 0;
> 
> 	if (<system wide events>)
> 		return -EBUSY;
> 
> 	<slow path with locking>
> }
> 
> void x86_perf_put_mediated_pmu(void)
> {
> 	if (!refcount_dec_and_test(...))
> 		return;
> 
> 	<slow path with locking>
> }
> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 1bbf312cbd73..f2994377ef44 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -12467,6 +12467,12 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
>         if (type)
>                 return -EINVAL;
>  
> +       if (enable_mediated_pmu)
> +               ret = x86_perf_get_mediated_pmu();
> +               if (ret)
> +                       return ret;
> +       }
> +
>         ret = kvm_page_track_init(kvm);
>         if (ret)
>                 goto out;
> @@ -12518,6 +12524,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
>         kvm_mmu_uninit_vm(kvm);
>         kvm_page_track_cleanup(kvm);
>  out:
> +       x86_perf_put_mediated_pmu();
>         return ret;
>  }
>  
> @@ -12659,6 +12666,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
>         kvm_page_track_cleanup(kvm);
>         kvm_xen_destroy_vm(kvm);
>         kvm_hv_destroy_vm(kvm);
> +       x86_perf_put_mediated_pmu();
>  }
>  
>  static void memslot_rmap_free(struct kvm_memory_slot *slot)
> 
>
Liang, Kan April 15, 2024, 4:03 p.m. UTC | #7
On 2024-04-12 4:56 p.m., Liang, Kan wrote:
>> What if perf had a global knob to enable/disable mediate PMU support?  Then when
>> KVM is loaded with enable_mediated_true, call into perf to (a) check that there
>> are no existing !exclude_guest events (this part could be optional), and (b) set
>> the global knob to reject all new !exclude_guest events (for the core PMU?).
>>
>> Hmm, or probably better, do it at VM creation.  That has the advantage of playing
>> nice with CONFIG_KVM=y (perf could reject the enabling without completely breaking
>> KVM), and not causing problems if KVM is auto-probed but the user doesn't actually
>> want to run VMs.
> I think it should be doable, and may simplify the perf implementation.
> (The check in the schedule stage should not be necessary anymore.)
> 
> With this, something like NMI watchdog should fail the VM creation. The
> user should either disable the NMI watchdog or use a replacement.
> 
> Thanks,
> Kan
>> E.g. (very roughly)
>>
>> int x86_perf_get_mediated_pmu(void)
>> {
>> 	if (refcount_inc_not_zero(...))
>> 		return 0;
>>
>> 	if (<system wide events>)
>> 		return -EBUSY;
>>
>> 	<slow path with locking>
>> }
>>
>> void x86_perf_put_mediated_pmu(void)
>> {
>> 	if (!refcount_dec_and_test(...))
>> 		return;
>>
>> 	<slow path with locking>
>> }


I think the locking should include the refcount check and system wide
event check as well.
It should be possible that two VMs are created very close.
The second creation may mistakenly return 0 if there is no lock.

I plan to do something as below (not test yet).

+/*
+ * Currently invoked at VM creation to
+ * - Check whether there are existing !exclude_guest system wide events
+ *   of PMU with PERF_PMU_CAP_MEDIATED_VPMU
+ * - Set nr_mediated_pmu to prevent !exclude_guest event creation on
+ *   PMUs with PERF_PMU_CAP_MEDIATED_VPMU
+ *
+ * No impact for the PMU without PERF_PMU_CAP_MEDIATED_VPMU. The perf
+ * still owns all the PMU resources.
+ */
+int x86_perf_get_mediated_pmu(void)
+{
+	int ret = 0;
+	mutex_lock(&perf_mediated_pmu_mutex);
+	if (refcount_inc_not_zero(&nr_mediated_pmu_vms))
+		goto end;
+
+	if (atomic_read(&nr_include_guest_events)) {
+		ret = -EBUSY;
+		goto end;
+	}
+	refcount_inc(&nr_mediated_pmu_vms);
+end:
+	mutex_unlock(&perf_mediated_pmu_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(x86_perf_get_mediated_pmu);
+
+void x86_perf_put_mediated_pmu(void)
+{
+	mutex_lock(&perf_mediated_pmu_mutex);
+	refcount_dec(&nr_mediated_pmu_vms);
+	mutex_unlock(&perf_mediated_pmu_mutex);
+}
+EXPORT_SYMBOL_GPL(x86_perf_put_mediated_pmu);
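
For illustration, the event-creation side could then consult the same state
along these lines. This is only a sketch: the helper name and its exact call
site in perf_event_alloc() are assumptions, nr_include_guest_events is assumed
to be maintained elsewhere, and the per-task vs. system-wide distinction is
omitted.

static int perf_mediated_pmu_account_event(struct perf_event *event)
{
	int ret = 0;

	if (!(event->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU))
		return 0;
	if (event->attr.exclude_guest)
		return 0;

	mutex_lock(&perf_mediated_pmu_mutex);
	/* Reject !exclude_guest events while any mediated-PMU VM exists. */
	if (refcount_read(&nr_mediated_pmu_vms))
		ret = -EBUSY;
	else
		atomic_inc(&nr_include_guest_events);
	mutex_unlock(&perf_mediated_pmu_mutex);

	return ret;
}

The matching decrement of nr_include_guest_events would go in the event
teardown path.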


Thanks,
Kan
Xiong Zhang April 16, 2024, 5:34 a.m. UTC | #8
On 4/16/2024 12:03 AM, Liang, Kan wrote:
> 
> 
> On 2024-04-12 4:56 p.m., Liang, Kan wrote:
>>> What if perf had a global knob to enable/disable mediate PMU support?  Then when
>>> KVM is loaded with enable_mediated_true, call into perf to (a) check that there
>>> are no existing !exclude_guest events (this part could be optional), and (b) set
>>> the global knob to reject all new !exclude_guest events (for the core PMU?).
>>>
>>> Hmm, or probably better, do it at VM creation.  That has the advantage of playing
>>> nice with CONFIG_KVM=y (perf could reject the enabling without completely breaking
>>> KVM), and not causing problems if KVM is auto-probed but the user doesn't actually
>>> want to run VMs.
>> I think it should be doable, and may simplify the perf implementation.
>> (The check in the schedule stage should not be necessary anymore.)
>>
>> With this, something like NMI watchdog should fail the VM creation. The
>> user should either disable the NMI watchdog or use a replacement.
>>
>> Thanks,
>> Kan
>>> E.g. (very roughly)
>>>
>>> int x86_perf_get_mediated_pmu(void)
>>> {
>>> 	if (refcount_inc_not_zero(...))
>>> 		return 0;
>>>
>>> 	if (<system wide events>)
>>> 		return -EBUSY;
>>>
>>> 	<slow path with locking>
>>> }
>>>
>>> void x86_perf_put_mediated_pmu(void)
>>> {
>>> 	if (!refcount_dec_and_test(...))
>>> 		return;
>>>
>>> 	<slow path with locking>
>>> }
> 
> 
> I think the locking should include the refcount check and system wide
> event check as well.
> It should be possible that two VMs are created very close.
> The second creation may mistakenly return 0 if there is no lock.
> 
> I plan to do something as below (not test yet).
> 
> +/*
> + * Currently invoked at VM creation to
> + * - Check whether there are existing !exclude_guest system wide events
> + *   of PMU with PERF_PMU_CAP_MEDIATED_VPMU
> + * - Set nr_mediated_pmu to prevent !exclude_guest event creation on
> + *   PMUs with PERF_PMU_CAP_MEDIATED_VPMU
> + *
> + * No impact for the PMU without PERF_PMU_CAP_MEDIATED_VPMU. The perf
> + * still owns all the PMU resources.
> + */
> +int x86_perf_get_mediated_pmu(void)
> +{
> +	int ret = 0;
> +	mutex_lock(&perf_mediated_pmu_mutex);
> +	if (refcount_inc_not_zero(&nr_mediated_pmu_vms))
> +		goto end;
> +
> +	if (atomic_read(&nr_include_guest_events)) {
> +		ret = -EBUSY;
> +		goto end;
> +	}
> +	refcount_inc(&nr_mediated_pmu_vms);
> +end:
> +	mutex_unlock(&perf_mediated_pmu_mutex);
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(x86_perf_get_mediated_pmu);
> +
> +void x86_perf_put_mediated_pmu(void)
> +{
> +	mutex_lock(&perf_mediated_pmu_mutex);
> +	refcount_dec(&nr_mediated_pmu_vms);
> +	mutex_unlock(&perf_mediated_pmu_mutex);
> +}
> +EXPORT_SYMBOL_GPL(x86_perf_put_mediated_pmu);
> 
> 
> Thanks,
> Kan
x86_perf_get_mediated_pmu() is called at vm_create(), x86_perf_put_mediated_pmu() is called at vm_destroy(), then system wide perf events without exclude_guest=1 can not be created during the whole vm life cycle (where nr_mediated_pmu_vms > 0 always), do I understand and use the interface correctly ?

thanks
Liang, Kan April 16, 2024, 12:48 p.m. UTC | #9
On 2024-04-16 1:34 a.m., Zhang, Xiong Y wrote:
> 
> 
> On 4/16/2024 12:03 AM, Liang, Kan wrote:
>>
>>
>> On 2024-04-12 4:56 p.m., Liang, Kan wrote:
>>>> What if perf had a global knob to enable/disable mediate PMU support?  Then when
>>>> KVM is loaded with enable_mediated_true, call into perf to (a) check that there
>>>> are no existing !exclude_guest events (this part could be optional), and (b) set
>>>> the global knob to reject all new !exclude_guest events (for the core PMU?).
>>>>
>>>> Hmm, or probably better, do it at VM creation.  That has the advantage of playing
>>>> nice with CONFIG_KVM=y (perf could reject the enabling without completely breaking
>>>> KVM), and not causing problems if KVM is auto-probed but the user doesn't actually
>>>> want to run VMs.
>>> I think it should be doable, and may simplify the perf implementation.
>>> (The check in the schedule stage should not be necessary anymore.)
>>>
>>> With this, something like NMI watchdog should fail the VM creation. The
>>> user should either disable the NMI watchdog or use a replacement.
>>>
>>> Thanks,
>>> Kan
>>>> E.g. (very roughly)
>>>>
>>>> int x86_perf_get_mediated_pmu(void)
>>>> {
>>>> 	if (refcount_inc_not_zero(...))
>>>> 		return 0;
>>>>
>>>> 	if (<system wide events>)
>>>> 		return -EBUSY;
>>>>
>>>> 	<slow path with locking>
>>>> }
>>>>
>>>> void x86_perf_put_mediated_pmu(void)
>>>> {
>>>> 	if (!refcount_dec_and_test(...))
>>>> 		return;
>>>>
>>>> 	<slow path with locking>
>>>> }
>>
>>
>> I think the locking should include the refcount check and system wide
>> event check as well.
>> It should be possible that two VMs are created very close.
>> The second creation may mistakenly return 0 if there is no lock.
>>
>> I plan to do something as below (not test yet).
>>
>> +/*
>> + * Currently invoked at VM creation to
>> + * - Check whether there are existing !exclude_guest system wide events
>> + *   of PMU with PERF_PMU_CAP_MEDIATED_VPMU
>> + * - Set nr_mediated_pmu to prevent !exclude_guest event creation on
>> + *   PMUs with PERF_PMU_CAP_MEDIATED_VPMU
>> + *
>> + * No impact for the PMU without PERF_PMU_CAP_MEDIATED_VPMU. The perf
>> + * still owns all the PMU resources.
>> + */
>> +int x86_perf_get_mediated_pmu(void)
>> +{
>> +	int ret = 0;
>> +	mutex_lock(&perf_mediated_pmu_mutex);
>> +	if (refcount_inc_not_zero(&nr_mediated_pmu_vms))
>> +		goto end;
>> +
>> +	if (atomic_read(&nr_include_guest_events)) {
>> +		ret = -EBUSY;
>> +		goto end;
>> +	}
>> +	refcount_inc(&nr_mediated_pmu_vms);
>> +end:
>> +	mutex_unlock(&perf_mediated_pmu_mutex);
>> +	return ret;
>> +}
>> +EXPORT_SYMBOL_GPL(x86_perf_get_mediated_pmu);
>> +
>> +void x86_perf_put_mediated_pmu(void)
>> +{
>> +	mutex_lock(&perf_mediated_pmu_mutex);
>> +	refcount_dec(&nr_mediated_pmu_vms);
>> +	mutex_unlock(&perf_mediated_pmu_mutex);
>> +}
>> +EXPORT_SYMBOL_GPL(x86_perf_put_mediated_pmu);
>>
>>
>> Thanks,
>> Kan
> x86_perf_get_mediated_pmu() is called at vm_create(), x86_perf_put_mediated_pmu() is called at vm_destroy(), then system wide perf events without exclude_guest=1 can not be created during the whole vm life cycle (where nr_mediated_pmu_vms > 0 always), do I understand and use the interface correctly ?

Right, but it only impacts the events of PMU with the
PERF_PMU_CAP_MEDIATED_VPMU.
For other PMUs, the event with exclude_guest=1 can still be created.
KVM should not touch the counters of the PMU without
PERF_PMU_CAP_MEDIATED_VPMU.

BTW: I will also remove the prefix x86, since the functions are in the
generic code.

Thanks,
Kan
Xiong Zhang April 17, 2024, 9:42 a.m. UTC | #10
On 4/16/2024 8:48 PM, Liang, Kan wrote:
> 
> 
> On 2024-04-16 1:34 a.m., Zhang, Xiong Y wrote:
>>
>>
>> On 4/16/2024 12:03 AM, Liang, Kan wrote:
>>>
>>>
>>> On 2024-04-12 4:56 p.m., Liang, Kan wrote:
>>>>> What if perf had a global knob to enable/disable mediate PMU support?  Then when
>>>>> KVM is loaded with enable_mediated_true, call into perf to (a) check that there
>>>>> are no existing !exclude_guest events (this part could be optional), and (b) set
>>>>> the global knob to reject all new !exclude_guest events (for the core PMU?).
>>>>>
>>>>> Hmm, or probably better, do it at VM creation.  That has the advantage of playing
>>>>> nice with CONFIG_KVM=y (perf could reject the enabling without completely breaking
>>>>> KVM), and not causing problems if KVM is auto-probed but the user doesn't actually
>>>>> want to run VMs.
>>>> I think it should be doable, and may simplify the perf implementation.
>>>> (The check in the schedule stage should not be necessary anymore.)
>>>>
>>>> With this, something like NMI watchdog should fail the VM creation. The
>>>> user should either disable the NMI watchdog or use a replacement.
>>>>
>>>> Thanks,
>>>> Kan
>>>>> E.g. (very roughly)
>>>>>
>>>>> int x86_perf_get_mediated_pmu(void)
>>>>> {
>>>>> 	if (refcount_inc_not_zero(...))
>>>>> 		return 0;
>>>>>
>>>>> 	if (<system wide events>)
>>>>> 		return -EBUSY;
>>>>>
>>>>> 	<slow path with locking>
>>>>> }
>>>>>
>>>>> void x86_perf_put_mediated_pmu(void)
>>>>> {
>>>>> 	if (!refcount_dec_and_test(...))
>>>>> 		return;
>>>>>
>>>>> 	<slow path with locking>
>>>>> }
>>>
>>>
>>> I think the locking should include the refcount check and system wide
>>> event check as well.
>>> It should be possible that two VMs are created very close.
>>> The second creation may mistakenly return 0 if there is no lock.
>>>
>>> I plan to do something as below (not test yet).
>>>
>>> +/*
>>> + * Currently invoked at VM creation to
>>> + * - Check whether there are existing !exclude_guest system wide events
>>> + *   of PMU with PERF_PMU_CAP_MEDIATED_VPMU
>>> + * - Set nr_mediated_pmu to prevent !exclude_guest event creation on
>>> + *   PMUs with PERF_PMU_CAP_MEDIATED_VPMU
>>> + *
>>> + * No impact for the PMU without PERF_PMU_CAP_MEDIATED_VPMU. The perf
>>> + * still owns all the PMU resources.
>>> + */
>>> +int x86_perf_get_mediated_pmu(void)
>>> +{
>>> +	int ret = 0;
>>> +	mutex_lock(&perf_mediated_pmu_mutex);
>>> +	if (refcount_inc_not_zero(&nr_mediated_pmu_vms))
>>> +		goto end;
>>> +
>>> +	if (atomic_read(&nr_include_guest_events)) {
>>> +		ret = -EBUSY;
>>> +		goto end;
>>> +	}
>>> +	refcount_inc(&nr_mediated_pmu_vms);
>>> +end:
>>> +	mutex_unlock(&perf_mediated_pmu_mutex);
>>> +	return ret;
>>> +}
>>> +EXPORT_SYMBOL_GPL(x86_perf_get_mediated_pmu);
>>> +
>>> +void x86_perf_put_mediated_pmu(void)
>>> +{
>>> +	mutex_lock(&perf_mediated_pmu_mutex);
>>> +	refcount_dec(&nr_mediated_pmu_vms);
>>> +	mutex_unlock(&perf_mediated_pmu_mutex);
>>> +}
>>> +EXPORT_SYMBOL_GPL(x86_perf_put_mediated_pmu);
>>>
>>>
>>> Thanks,
>>> Kan
>> x86_perf_get_mediated_pmu() is called at vm_create(), x86_perf_put_mediated_pmu() is called at vm_destroy(), then system wide perf events without exclude_guest=1 can not be created during the whole vm life cycle (where nr_mediated_pmu_vms > 0 always), do I understand and use the interface correctly ?
> 
> Right, but it only impacts the events of PMU with the
> PERF_PMU_CAP_MEDIATED_VPMU.
> For other PMUs, the event with exclude_guest=1 can still be created.
> KVM should not touch the counters of the PMU without
> PERF_PMU_CAP_MEDIATED_VPMU.
> 
> BTW: I will also remove the prefix x86, since the functions are in the
> generic code.
> 
> Thanks,
> Kan
After the userspace VMM calls the vCPU SET_CPUID() ioctl, KVM knows whether the vPMU is enabled or not. If perf_get_mediated_pmu() is called at VM creation, it is too early.
It would be better to let perf_get_mediated_pmu() track per-CPU PMU state, so that perf_get_mediated_pmu() can be called by KVM after vcpu_cpuid_set(). Note the userspace VMM may call SET_CPUID() on one vCPU multiple times, so a refcount may not be suitable here. What's a better solution?

thanks
Sean Christopherson April 18, 2024, 4:11 p.m. UTC | #11
On Wed, Apr 17, 2024, Xiong Y Zhang wrote:
> On 4/16/2024 8:48 PM, Liang, Kan wrote:
> >> x86_perf_get_mediated_pmu() is called at vm_create(),
> >> x86_perf_put_mediated_pmu() is called at vm_destroy(), then system wide
> >> perf events without exclude_guest=1 can not be created during the whole vm
> >> life cycle (where nr_mediated_pmu_vms > 0 always), do I understand and use
> >> the interface correctly ?
> > 
> > Right, but it only impacts the events of PMU with the
> > PERF_PMU_CAP_MEDIATED_VPMU.  For other PMUs, the event with exclude_guest=1
> > can still be created.  KVM should not touch the counters of the PMU without
> > PERF_PMU_CAP_MEDIATED_VPMU.
> > 
> > BTW: I will also remove the prefix x86, since the functions are in the
> > generic code.
> > 
> > Thanks,
> > Kan
> After userspace VMM call VCPU SET_CPUID() ioctl, KVM knows whether vPMU is
> enabled or not. If perf_get_mediated_pmu() is called at vm create, it is too
> early.

Eh, if someone wants to create _only_ VMs without vPMUs, then they should load
KVM with enable_pmu=false.  I can see people complaining about not being able to
create VMs if they don't want to have *any* vPMU usage, but I doubt anyone
has a use case where they want a mix of PMU-enabled and PMU-disabled VMs, *and*
they are ok with VM creation failing for some VMs but not others.

> it is better to let perf_get_mediated_pmu() track per cpu PMU state,
> so perf_get_mediated_pmu() can be called by kvm after vcpu_cpuid_set(). Note
> user space vmm may call SET_CPUID() on one vcpu multi times, then here
> refcount maybe isn't suitable. 

Yeah, waiting until KVM_SET_CPUID2 would be unpleasant for both KVM and userspace.
E.g. failing KVM_SET_CPUID2 because KVM can't enable mediated PMU support would
be rather confusing for userspace.

> what's a better solution ?

If doing the checks at VM creation is a stick point for folks, then the best
approach is probably to use KVM_CAP_PMU_CAPABILITY, i.e. require userspace to
explicitly opt-in to enabling mediated PMU usage.  Ha!  We can even do that
without additional uAPI, because KVM interprets cap->args[0]==0 as "enable vPMU".

The big problem with this is that enabling mediated PMU support by default would
break userspace.  Hmm, but that's arguably the case no matter what, as a setup
that worked before would suddenly start failing if the host was configured to use
the PMU-based NMI watchdog.

E.g. this, if we're ok committing to never enabling mediated PMU by default.

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 47d9f03b7778..01d9ee2114c8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6664,9 +6664,21 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
                        break;
 
                mutex_lock(&kvm->lock);
-               if (!kvm->created_vcpus) {
-                       kvm->arch.enable_pmu = !(cap->args[0] & KVM_PMU_CAP_DISABLE);
-                       r = 0;
+               /*
+                * To keep PMU configuration "simple", setting vPMU support is
+                * disallowed if vCPUs are created, or if mediated PMU support
+                * was already enabled for the VM.
+                */
+               if (!kvm->created_vcpus &&
+                   (!enable_mediated_pmu || !kvm->arch.enable_pmu)) {
+                       if (enable_mediated_pmu &&
+                           !(cap->args[0] & KVM_PMU_CAP_DISABLE))
+                               r = x86_perf_get_mediated_pmu();
+                       else
+                               r = 0;
+
+                       if (!r)
+                               kvm->arch.enable_pmu = !(cap->args[0] & KVM_PMU_CAP_DISABLE);
                }
                mutex_unlock(&kvm->lock);
                break;
@@ -12563,7 +12575,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
        kvm->arch.default_tsc_khz = max_tsc_khz ? : tsc_khz;
        kvm->arch.guest_can_read_msr_platform_info = true;
-       kvm->arch.enable_pmu = enable_pmu;
+
+       /* PMU virtualization is opt-in when mediated PMU support is enabled. */
+       kvm->arch.enable_pmu = enable_pmu && !enable_mediated_pmu;
 
 #if IS_ENABLED(CONFIG_HYPERV)
        spin_lock_init(&kvm->arch.hv_root_tdp_lock);
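
For completeness, the userspace side of that opt-in would look roughly like
the following sketch. With this proposal, args[0] == 0 keeps vPMU enabled and
thereby opts the VM into the mediated PMU; the example_* name is illustrative.

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Illustrative only: opt a VM into vPMU before any vCPU is created. */
static int example_enable_vpmu(int vm_fd)
{
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_PMU_CAPABILITY,
		.args[0] = 0,	/* 0 == enable vPMU; KVM_PMU_CAP_DISABLE opts out */
	};

	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}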
Xiong Zhang April 19, 2024, 1:37 a.m. UTC | #12
On 4/19/2024 12:11 AM, Sean Christopherson wrote:
> On Wed, Apr 17, 2024, Xiong Y Zhang wrote:
>> On 4/16/2024 8:48 PM, Liang, Kan wrote:
>>>> x86_perf_get_mediated_pmu() is called at vm_create(),
>>>> x86_perf_put_mediated_pmu() is called at vm_destroy(), then system wide
>>>> perf events without exclude_guest=1 can not be created during the whole vm
>>>> life cycle (where nr_mediated_pmu_vms > 0 always), do I understand and use
>>>> the interface correctly ?
>>>
>>> Right, but it only impacts the events of PMU with the
>>> PERF_PMU_CAP_MEDIATED_VPMU.  For other PMUs, the event with exclude_guest=1
>>> can still be created.  KVM should not touch the counters of the PMU without
>>> PERF_PMU_CAP_MEDIATED_VPMU.
>>>
>>> BTW: I will also remove the prefix x86, since the functions are in the
>>> generic code.
>>>
>>> Thanks,
>>> Kan
>> After userspace VMM call VCPU SET_CPUID() ioctl, KVM knows whether vPMU is
>> enabled or not. If perf_get_mediated_pmu() is called at vm create, it is too
>> early.
> 
> Eh, if someone wants to create _only_ VMs without vPMUs, then they should load
> KVM with enable_pmu=false.  I can see people complaining about not being able to
> create VMs if they don't want to use have *any* vPMU usage, but I doubt anyone
> has a use cases where they want a mix of PMU-enabled and PMU- disabled VMs, *and*
> they are ok with VM creation failing for some VMs but not others.
enable_mediated_pmu and the PMU-based nmi_watchdog are enabled by default on my Ubuntu system, and some Ubuntu services create VMs during boot. These services fail after I add perf_get_mediated_pmu() to kvm_arch_init_vm(), so doing this check at VM creation may break some boot-time services.
  
> 
>> it is better to let perf_get_mediated_pmu() track per cpu PMU state,
>> so perf_get_mediated_pmu() can be called by kvm after vcpu_cpuid_set(). Note
>> user space vmm may call SET_CPUID() on one vcpu multi times, then here
>> refcount maybe isn't suitable. 
> 
> Yeah, waiting until KVM_SET_CPUID2 would be unpleasant for both KVM and userspace.
> E.g. failing KVM_SET_CPUID2 because KVM can't enable mediated PMU support would
> be rather confusing for userspace.
> 
>> what's a better solution ?
> 
> If doing the checks at VM creation is a stick point for folks, then the best
> approach is probably to use KVM_CAP_PMU_CAPABILITY, i.e. require userspace to
> explicitly opt-in to enabling mediated PMU usage.  Ha!  We can even do that
> without additional uAPI, because KVM interprets cap->args[0]==0 as "enable vPMU".
> 
QEMU doesn't use KVM_CAP_PMU_CAPABILITY to enable/disable the PMU today; enable_cap(KVM_CAP_PMU_CAPABILITY) will have to be added to QEMU for the mediated PMU.
With old QEMU, the guest PMU will always use the emulated vPMU and the mediated PMU won't be enabled; if the emulated vPMU is replaced later, guests running on old QEMU will be broken.
> The big problem with this is that enabling mediated PMU support by default would
> break userspace.  Hmm, but that's arguably the case no matter what, as a setup
> that worked before would suddenly start failing if the host was configured to use
> the PMU-based NMI watchdog.
With the perf_get_mediated_pmu() interface, the admin needs to disable all system-wide perf events before VM creation, regardless of whether perf_get_mediated_pmu() is called in vm_create() or in the enable_cap(KVM_CAP_PMU_CAPABILITY) ioctl.
> 
> E.g. this, if we're ok committing to never enabling mediated PMU by default.
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 47d9f03b7778..01d9ee2114c8 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -6664,9 +6664,21 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
>                         break;
>  
>                 mutex_lock(&kvm->lock);
> -               if (!kvm->created_vcpus) {
> -                       kvm->arch.enable_pmu = !(cap->args[0] & KVM_PMU_CAP_DISABLE);
> -                       r = 0;
> +               /*
> +                * To keep PMU configuration "simple", setting vPMU support is
> +                * disallowed if vCPUs are created, or if mediated PMU support
> +                * was already enabled for the VM.
> +                */
> +               if (!kvm->created_vcpus &&
> +                   (!enable_mediated_pmu || !kvm->arch.enable_pmu)) {
> +                       if (enable_mediated_pmu &&
> +                           !(cap->args[0] & KVM_PMU_CAP_DISABLE))
> +                               r = x86_perf_get_mediated_pmu();
> +                       else
> +                               r = 0;
> +
> +                       if (!r)
> +                               kvm->arch.enable_pmu = !(cap->args[0] & KVM_PMU_CAP_DISABLE);
>                 }
>                 mutex_unlock(&kvm->lock);
>                 break;
> @@ -12563,7 +12575,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
>  
>         kvm->arch.default_tsc_khz = max_tsc_khz ? : tsc_khz;
>         kvm->arch.guest_can_read_msr_platform_info = true;
> -       kvm->arch.enable_pmu = enable_pmu;
> +
> +       /* PMU virtualization is opt-in when mediated PMU support is enabled. */
> +       kvm->arch.enable_pmu = enable_pmu && !enable_mediated_pmu;
>  
>  #if IS_ENABLED(CONFIG_HYPERV)
>         spin_lock_init(&kvm->arch.hv_root_tdp_lock);
> 
>
Xiong Zhang April 26, 2024, 4:09 a.m. UTC | #13
>>> +static inline int perf_force_exclude_guest_check(struct perf_event *event,
>>> +						 int cpu, struct task_struct *task)
>>> +{
>>> +	bool *force_exclude_guest = NULL;
>>> +
>>> +	if (!has_vpmu_passthrough_cap(event->pmu))
>>> +		return 0;
>>> +
>>> +	if (event->attr.exclude_guest)
>>> +		return 0;
>>> +
>>> +	if (cpu != -1) {
>>> +		force_exclude_guest = per_cpu_ptr(&__perf_force_exclude_guest, cpu);
>>> +	} else if (task && (task->flags & PF_VCPU)) {
>>> +		/*
>>> +		 * Just need to check the running CPU in the event creation. If the
>>> +		 * task is moved to another CPU which supports the force_exclude_guest.
>>> +		 * The event will filtered out and be moved to the error stage. See
>>> +		 * merge_sched_in().
>>> +		 */
>>> +		force_exclude_guest = per_cpu_ptr(&__perf_force_exclude_guest, task_cpu(task));
>>> +	}
>>
>> These checks are extremely racy, I don't see how this can possibly do the
>> right thing.  PF_VCPU isn't a "this is a vCPU task", it's a "this task is about
>> to do VM-Enter, or just took a VM-Exit" (the "I'm a virtual CPU" comment in
>> include/linux/sched.h is wildly misleading, as it's _only_ valid when accounting
>> time slices).
>>
> 
> This is to reject an !exclude_guest event creation for a running
> "passthrough" guest from host perf tool.
> Could you please suggest a way to detect it via the struct task_struct?
Here PF_VCPU is used to recognize a perf event that profiles a userspace VMM
process, like perf record -e {} -p $QEMU_PID. A lot of emails have
discussed how to handle system-wide perf events, which have
perf_event.attr.task == NULL. But a perf event targeting a userspace VMM should be
handled the same way as a system-wide perf event, so perf needs a method to identify
that a per-process perf event is for a userspace VMM. PF_VCPU isn't the right one,
so it remains an open question how to handle this.

thanks
> 
> 
>> Digging deeper, I think __perf_force_exclude_guest has similar problems, e.g.
>> perf_event_create_kernel_counter() calls perf_event_alloc() before acquiring the
>> per-CPU context mutex.
> 
> Do you mean that the perf_guest_enter() check could be happened right
> after the perf_force_exclude_guest_check()?
> It's possible. For this case, the event can still be created. It will be
> treated as an existing event and handled in merge_sched_in(). It will
> never be scheduled when a guest is running.
> 
> The perf_force_exclude_guest_check() is to make sure most of the cases
> can be rejected at the creation place. For the corner cases, they will
> be rejected in the schedule stage.
> 
>>
>>> +	if (force_exclude_guest && *force_exclude_guest)
>>> +		return -EBUSY;
>>> +	return 0;
>>> +}
>>> +
>>>  /*
>>>   * Holding the top-level event's child_mutex means that any
>>>   * descendant process that has inherited this event will block
>>> @@ -11973,6 +12142,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
>>>  		goto err_ns;
>>>  	}
>>>  
>>> +	if (perf_force_exclude_guest_check(event, cpu, task)) {
>>
>> This should be:
>>
>> 	err = perf_force_exclude_guest_check(event, cpu, task);
>> 	if (err)
>> 		goto err_pmu;
>>
>> i.e. shouldn't effectively ignore/override the return result.
>>
> 
> Sure.
> 
> Thanks,
> Kan
> 
>>> +		err = -EBUSY;
>>> +		goto err_pmu;
>>> +	}
>>> +
>>>  	/*
>>>  	 * Disallow uncore-task events. Similarly, disallow uncore-cgroup
>>>  	 * events (they don't make sense as the cgroup will be different
>>> -- 
>>> 2.34.1
>>>
>

Patch

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 60eff413dbba..9912d1112371 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1392,6 +1392,11 @@  static inline int is_exclusive_pmu(struct pmu *pmu)
 	return pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE;
 }
 
+static inline int has_vpmu_passthrough_cap(struct pmu *pmu)
+{
+	return pmu->capabilities & PERF_PMU_CAP_VPMU_PASSTHROUGH;
+}
+
 extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
 
 extern void ___perf_sw_event(u32, u64, struct pt_regs *, u64);
@@ -1709,6 +1714,8 @@  extern void perf_event_task_tick(void);
 extern int perf_event_account_interrupt(struct perf_event *event);
 extern int perf_event_period(struct perf_event *event, u64 value);
 extern u64 perf_event_pause(struct perf_event *event, bool reset);
+extern void perf_guest_enter(void);
+extern void perf_guest_exit(void);
 #else /* !CONFIG_PERF_EVENTS: */
 static inline void *
 perf_aux_output_begin(struct perf_output_handle *handle,
@@ -1795,6 +1802,8 @@  static inline u64 perf_event_pause(struct perf_event *event, bool reset)
 {
 	return 0;
 }
+static inline void perf_guest_enter(void)				{ }
+static inline void perf_guest_exit(void)				{ }
 #endif
 
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 683dc086ef10..59471eeec7e4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3803,6 +3803,8 @@  static inline void group_update_userpage(struct perf_event *group_event)
 		event_update_userpage(event);
 }
 
+static DEFINE_PER_CPU(bool, __perf_force_exclude_guest);
+
 static int merge_sched_in(struct perf_event *event, void *data)
 {
 	struct perf_event_context *ctx = event->ctx;
@@ -3814,6 +3816,14 @@  static int merge_sched_in(struct perf_event *event, void *data)
 	if (!event_filter_match(event))
 		return 0;
 
+	/*
+	 * The __perf_force_exclude_guest indicates entering the guest.
+	 * No events of the passthrough PMU should be scheduled.
+	 */
+	if (__this_cpu_read(__perf_force_exclude_guest) &&
+	    has_vpmu_passthrough_cap(event->pmu))
+		return 0;
+
 	if (group_can_go_on(event, *can_add_hw)) {
 		if (!group_sched_in(event, ctx))
 			list_add_tail(&event->active_list, get_event_list(event));
@@ -5707,6 +5717,165 @@  u64 perf_event_pause(struct perf_event *event, bool reset)
 }
 EXPORT_SYMBOL_GPL(perf_event_pause);
 
+static void __perf_force_exclude_guest_pmu(struct perf_event_pmu_context *pmu_ctx,
+					   struct perf_event *event)
+{
+	struct perf_event_context *ctx = pmu_ctx->ctx;
+	struct perf_event *sibling;
+	bool include_guest = false;
+
+	event_sched_out(event, ctx);
+	if (!event->attr.exclude_guest)
+		include_guest = true;
+	for_each_sibling_event(sibling, event) {
+		event_sched_out(sibling, ctx);
+		if (!sibling->attr.exclude_guest)
+			include_guest = true;
+	}
+	if (include_guest) {
+		perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+		for_each_sibling_event(sibling, event)
+			perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+	}
+}
+
+static void perf_force_exclude_guest_pmu(struct perf_event_pmu_context *pmu_ctx)
+{
+	struct perf_event *event, *tmp;
+	struct pmu *pmu = pmu_ctx->pmu;
+
+	perf_pmu_disable(pmu);
+
+	/*
+	 * Sched out all active events.
+	 * For the !exclude_guest events, they are forced to be sched out and
+	 * moved to the error state.
+	 * For the exclude_guest events, they should be scheduled out anyway
+	 * when the guest is running.
+	 */
+	list_for_each_entry_safe(event, tmp, &pmu_ctx->pinned_active, active_list)
+		__perf_force_exclude_guest_pmu(pmu_ctx, event);
+
+	list_for_each_entry_safe(event, tmp, &pmu_ctx->flexible_active, active_list)
+		__perf_force_exclude_guest_pmu(pmu_ctx, event);
+
+	pmu_ctx->rotate_necessary = 0;
+
+	perf_pmu_enable(pmu);
+}
+
+static void perf_force_exclude_guest_enter(struct perf_event_context *ctx)
+{
+	struct perf_event_pmu_context *pmu_ctx;
+
+	update_context_time(ctx);
+	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+		/*
+		 * The PMU, which doesn't have the capability of excluding guest
+		 * e.g., uncore PMU, is not impacted.
+		 */
+		if (!has_vpmu_passthrough_cap(pmu_ctx->pmu))
+			continue;
+		perf_force_exclude_guest_pmu(pmu_ctx);
+	}
+}
+
+/*
+ * When a guest enters, force all active events of the PMU, which supports
+ * the VPMU_PASSTHROUGH feature, to be scheduled out. The events of other
+ * PMUs, such as uncore PMU, should not be impacted. The guest can
+ * temporarily own all counters of the PMU.
+ * During the period, all the creation of the new event of the PMU with
+ * !exclude_guest are error out.
+ */
+void perf_guest_enter(void)
+{
+	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+
+	lockdep_assert_irqs_disabled();
+
+	if (__this_cpu_read(__perf_force_exclude_guest))
+		return;
+
+	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+	perf_force_exclude_guest_enter(&cpuctx->ctx);
+	if (cpuctx->task_ctx)
+		perf_force_exclude_guest_enter(cpuctx->task_ctx);
+
+	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+
+	__this_cpu_write(__perf_force_exclude_guest, true);
+}
+EXPORT_SYMBOL_GPL(perf_guest_enter);
+
+static void perf_force_exclude_guest_exit(struct perf_event_context *ctx)
+{
+	struct perf_event_pmu_context *pmu_ctx;
+	struct pmu *pmu;
+
+	update_context_time(ctx);
+	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+		pmu = pmu_ctx->pmu;
+		if (!has_vpmu_passthrough_cap(pmu))
+			continue;
+
+		perf_pmu_disable(pmu);
+		pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu);
+		pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
+		perf_pmu_enable(pmu);
+	}
+}
+
+void perf_guest_exit(void)
+{
+	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+
+	lockdep_assert_irqs_disabled();
+
+	if (!__this_cpu_read(__perf_force_exclude_guest))
+		return;
+
+	__this_cpu_write(__perf_force_exclude_guest, false);
+
+	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+	perf_force_exclude_guest_exit(&cpuctx->ctx);
+	if (cpuctx->task_ctx)
+		perf_force_exclude_guest_exit(cpuctx->task_ctx);
+
+	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+}
+EXPORT_SYMBOL_GPL(perf_guest_exit);
+
+static inline int perf_force_exclude_guest_check(struct perf_event *event,
+						 int cpu, struct task_struct *task)
+{
+	bool *force_exclude_guest = NULL;
+
+	if (!has_vpmu_passthrough_cap(event->pmu))
+		return 0;
+
+	if (event->attr.exclude_guest)
+		return 0;
+
+	if (cpu != -1) {
+		force_exclude_guest = per_cpu_ptr(&__perf_force_exclude_guest, cpu);
+	} else if (task && (task->flags & PF_VCPU)) {
+		/*
+		 * Just need to check the running CPU in the event creation. If the
+		 * task is moved to another CPU which supports the force_exclude_guest.
+		 * The event will filtered out and be moved to the error stage. See
+		 * merge_sched_in().
+		 */
+		force_exclude_guest = per_cpu_ptr(&__perf_force_exclude_guest, task_cpu(task));
+	}
+
+	if (force_exclude_guest && *force_exclude_guest)
+		return -EBUSY;
+	return 0;
+}
+
 /*
  * Holding the top-level event's child_mutex means that any
  * descendant process that has inherited this event will block
@@ -11973,6 +12142,11 @@  perf_event_alloc(struct perf_event_attr *attr, int cpu,
 		goto err_ns;
 	}
 
+	if (perf_force_exclude_guest_check(event, cpu, task)) {
+		err = -EBUSY;
+		goto err_pmu;
+	}
+
 	/*
 	 * Disallow uncore-task events. Similarly, disallow uncore-cgroup
 	 * events (they don't make sense as the cgroup will be different