
[v2,07/54] perf: Add generic exclude_guest support

Message ID 20240506053020.3911940-8-mizhang@google.com (mailing list archive)
State New
Series Mediated Passthrough vPMU 2.0 for x86

Commit Message

Mingwei Zhang May 6, 2024, 5:29 a.m. UTC
From: Kan Liang <kan.liang@linux.intel.com>

Currently, perf doesn't explicitly schedule out all exclude_guest events
while a guest is running. That is not a problem for the existing emulated
vPMU, because perf owns all the PMU counters: it can mask the counter
assigned to an exclude_guest event while a guest is running (the Intel
way), or set the corresponding HOSTONLY bit in the EVENTSEL (the AMD
way). Either way, the counter doesn't count while a guest is running.

However, neither approach works with the passthrough vPMU being
introduced. A guest owns all the PMU counters while it is running, so
the host must not mask any counter: it may be in use by the guest, and
its EVENTSEL may be overwritten.

Perf should explicitly schedule out all exclude_guest events to release
the PMU resources when entering a guest, and resume counting when
exiting the guest.

Expose two interfaces to KVM. KVM should call them to notify perf when
entering and exiting a guest (see the usage sketch below).

It's also possible that an exclude_guest event is created while a guest
is running. Such a new event should not be scheduled in either.
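
For illustration only (not part of this patch), the expected call sites
in a hypervisor's run loop would look roughly like the sketch below.
Only perf_guest_enter()/perf_guest_exit() come from this patch; the
surrounding function and enter_guest() are placeholders.

/*
 * Illustrative sketch, not real KVM code: only perf_guest_enter() and
 * perf_guest_exit() are introduced by this patch.
 */
static void vcpu_run_once(struct kvm_vcpu *vcpu)
{
	local_irq_disable();

	/* Schedule out exclude_guest events, freeing the PMU for the guest. */
	perf_guest_enter();

	enter_guest(vcpu);	/* placeholder for the actual VM entry */

	/* Reclaim the PMU and resume host exclude_guest counting. */
	perf_guest_exit();

	local_irq_enable();
}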

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
 include/linux/perf_event.h |   4 ++
 kernel/events/core.c       | 104 +++++++++++++++++++++++++++++++++++++
 2 files changed, 108 insertions(+)

Comments

Peter Zijlstra May 7, 2024, 8:58 a.m. UTC | #1
On Mon, May 06, 2024 at 05:29:32AM +0000, Mingwei Zhang wrote:

> @@ -5791,6 +5801,100 @@ void perf_put_mediated_pmu(void)
>  }
>  EXPORT_SYMBOL_GPL(perf_put_mediated_pmu);
>  
> +static void perf_sched_out_exclude_guest(struct perf_event_context *ctx)
> +{
> +	struct perf_event_pmu_context *pmu_ctx;
> +
> +	update_context_time(ctx);
> +	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> +		struct perf_event *event, *tmp;
> +		struct pmu *pmu = pmu_ctx->pmu;
> +
> +		if (!(pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU))
> +			continue;
> +
> +		perf_pmu_disable(pmu);
> +
> +		/*
> +		 * All active events must be exclude_guest events.
> +		 * See perf_get_mediated_pmu().
> +		 * Unconditionally remove all active events.
> +		 */
> +		list_for_each_entry_safe(event, tmp, &pmu_ctx->pinned_active, active_list)
> +			group_sched_out(event, pmu_ctx->ctx);
> +
> +		list_for_each_entry_safe(event, tmp, &pmu_ctx->flexible_active, active_list)
> +			group_sched_out(event, pmu_ctx->ctx);
> +
> +		pmu_ctx->rotate_necessary = 0;
> +
> +		perf_pmu_enable(pmu);
> +	}
> +}
> +
> +/* When entering a guest, schedule out all exclude_guest events. */
> +void perf_guest_enter(void)
> +{
> +	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
> +
> +	lockdep_assert_irqs_disabled();
> +
> +	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> +
> +	if (WARN_ON_ONCE(__this_cpu_read(perf_in_guest))) {
> +		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> +		return;
> +	}
> +
> +	perf_sched_out_exclude_guest(&cpuctx->ctx);
> +	if (cpuctx->task_ctx)
> +		perf_sched_out_exclude_guest(cpuctx->task_ctx);
> +
> +	__this_cpu_write(perf_in_guest, true);
> +
> +	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> +}
> +
> +static void perf_sched_in_exclude_guest(struct perf_event_context *ctx)
> +{
> +	struct perf_event_pmu_context *pmu_ctx;
> +
> +	update_context_time(ctx);
> +	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> +		struct pmu *pmu = pmu_ctx->pmu;
> +
> +		if (!(pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU))
> +			continue;
> +
> +		perf_pmu_disable(pmu);
> +		pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu);
> +		pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
> +		perf_pmu_enable(pmu);
> +	}
> +}
> +
> +void perf_guest_exit(void)
> +{
> +	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
> +
> +	lockdep_assert_irqs_disabled();
> +
> +	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> +
> +	if (WARN_ON_ONCE(!__this_cpu_read(perf_in_guest))) {
> +		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> +		return;
> +	}
> +
> +	__this_cpu_write(perf_in_guest, false);
> +
> +	perf_sched_in_exclude_guest(&cpuctx->ctx);
> +	if (cpuctx->task_ctx)
> +		perf_sched_in_exclude_guest(cpuctx->task_ctx);
> +
> +	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> +}

Bah, this is a ton of copy-paste from the normal scheduling code with
random changes. Why ?

Why can't this use ctx_sched_{in,out}() ? Surely the whole
CAP_PASSTHROUGH thing is but a flag away.
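
To make that suggestion concrete, a rough and entirely hypothetical
sketch of the flag-based direction could look like the following, where
EVENT_GUEST is an assumed new event_type flag and the ctx_sched_out()
signature is simplified; perf_guest_exit() would mirror it with
ctx_sched_in():

/*
 * Hypothetical sketch: assumes ctx_sched_out() is taught an EVENT_GUEST
 * flag that selects only events of PMUs with
 * PERF_PMU_CAP_PASSTHROUGH_VPMU.
 */
void perf_guest_enter(void)
{
	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);

	lockdep_assert_irqs_disabled();

	perf_ctx_lock(cpuctx, cpuctx->task_ctx);

	/* Reuse the common scheduling path instead of open-coding it. */
	ctx_sched_out(&cpuctx->ctx, EVENT_GUEST);
	if (cpuctx->task_ctx)
		ctx_sched_out(cpuctx->task_ctx, EVENT_GUEST);

	__this_cpu_write(perf_in_guest, true);

	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}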

Patch

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index dd4920bf3d1b..acf16676401a 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1734,6 +1734,8 @@  extern int perf_event_period(struct perf_event *event, u64 value);
 extern u64 perf_event_pause(struct perf_event *event, bool reset);
 extern int perf_get_mediated_pmu(void);
 extern void perf_put_mediated_pmu(void);
+void perf_guest_enter(void);
+void perf_guest_exit(void);
 #else /* !CONFIG_PERF_EVENTS: */
 static inline void *
 perf_aux_output_begin(struct perf_output_handle *handle,
@@ -1826,6 +1828,8 @@  static inline int perf_get_mediated_pmu(void)
 }
 
 static inline void perf_put_mediated_pmu(void)			{ }
+static inline void perf_guest_enter(void)			{ }
+static inline void perf_guest_exit(void)			{ }
 #endif
 
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 701b622c670e..4c6daf5cc923 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -406,6 +406,7 @@  static atomic_t nr_include_guest_events __read_mostly;
 
 static refcount_t nr_mediated_pmu_vms = REFCOUNT_INIT(0);
 static DEFINE_MUTEX(perf_mediated_pmu_mutex);
+static DEFINE_PER_CPU(bool, perf_in_guest);
 
 /* !exclude_guest system wide event of PMU with PERF_PMU_CAP_PASSTHROUGH_VPMU */
 static inline bool is_include_guest_event(struct perf_event *event)
@@ -3854,6 +3855,15 @@  static int merge_sched_in(struct perf_event *event, void *data)
 	if (!event_filter_match(event))
 		return 0;
 
+	/*
+	 * Don't schedule in any exclude_guest events of PMU with
+	 * PERF_PMU_CAP_PASSTHROUGH_VPMU, while a guest is running.
+	 */
+	if (__this_cpu_read(perf_in_guest) &&
+	    event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU &&
+	    event->attr.exclude_guest)
+		return 0;
+
 	if (group_can_go_on(event, *can_add_hw)) {
 		if (!group_sched_in(event, ctx))
 			list_add_tail(&event->active_list, get_event_list(event));
@@ -5791,6 +5801,100 @@  void perf_put_mediated_pmu(void)
 }
 EXPORT_SYMBOL_GPL(perf_put_mediated_pmu);
 
+static void perf_sched_out_exclude_guest(struct perf_event_context *ctx)
+{
+	struct perf_event_pmu_context *pmu_ctx;
+
+	update_context_time(ctx);
+	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+		struct perf_event *event, *tmp;
+		struct pmu *pmu = pmu_ctx->pmu;
+
+		if (!(pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU))
+			continue;
+
+		perf_pmu_disable(pmu);
+
+		/*
+		 * All active events must be exclude_guest events.
+		 * See perf_get_mediated_pmu().
+		 * Unconditionally remove all active events.
+		 */
+		list_for_each_entry_safe(event, tmp, &pmu_ctx->pinned_active, active_list)
+			group_sched_out(event, pmu_ctx->ctx);
+
+		list_for_each_entry_safe(event, tmp, &pmu_ctx->flexible_active, active_list)
+			group_sched_out(event, pmu_ctx->ctx);
+
+		pmu_ctx->rotate_necessary = 0;
+
+		perf_pmu_enable(pmu);
+	}
+}
+
+/* When entering a guest, schedule out all exclude_guest events. */
+void perf_guest_enter(void)
+{
+	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+
+	lockdep_assert_irqs_disabled();
+
+	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+	if (WARN_ON_ONCE(__this_cpu_read(perf_in_guest))) {
+		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+		return;
+	}
+
+	perf_sched_out_exclude_guest(&cpuctx->ctx);
+	if (cpuctx->task_ctx)
+		perf_sched_out_exclude_guest(cpuctx->task_ctx);
+
+	__this_cpu_write(perf_in_guest, true);
+
+	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+}
+
+static void perf_sched_in_exclude_guest(struct perf_event_context *ctx)
+{
+	struct perf_event_pmu_context *pmu_ctx;
+
+	update_context_time(ctx);
+	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+		struct pmu *pmu = pmu_ctx->pmu;
+
+		if (!(pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU))
+			continue;
+
+		perf_pmu_disable(pmu);
+		pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu);
+		pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
+		perf_pmu_enable(pmu);
+	}
+}
+
+void perf_guest_exit(void)
+{
+	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+
+	lockdep_assert_irqs_disabled();
+
+	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+	if (WARN_ON_ONCE(!__this_cpu_read(perf_in_guest))) {
+		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+		return;
+	}
+
+	__this_cpu_write(perf_in_guest, false);
+
+	perf_sched_in_exclude_guest(&cpuctx->ctx);
+	if (cpuctx->task_ctx)
+		perf_sched_in_exclude_guest(cpuctx->task_ctx);
+
+	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+}
+
 /*
  * Holding the top-level event's child_mutex means that any
  * descendant process that has inherited this event will block