@@ -462,6 +462,14 @@ void intel_pmu_lbr_add(struct perf_event *event)
if (!x86_pmu.lbr_nr)
return;
+ /*
+ * An lbr event without an assigned counter indicates it is used for the
+ * vcpu lbr emulation, so set the vcpu_lbr flag when such an event gets
+ * scheduled onto the lbr.
+ */
+ if (is_no_counter_event(event))
+ cpuc->vcpu_lbr = 1;
+
cpuc->br_sel = event->hw.branch_reg.reg;
if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) {
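/*
 * Not defined in this hunk: a minimal sketch of what is_no_counter_event()
 * presumably looks like, based on the PERF_EV_CAP_NO_COUNTER /
 * perf_event_set_no_counter() definitions in the perf_event.h hunk below
 * (the helper's exact name and location are an assumption):
 */
static inline bool is_no_counter_event(struct perf_event *event)
{
	return !!(event->event_caps & PERF_EV_CAP_NO_COUNTER);
}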
@@ -509,6 +517,14 @@ void intel_pmu_lbr_del(struct perf_event *event)
task_ctx->lbr_callstack_users--;
}
+ /*
+ * An lbr event without an assigned counter indicates it is used for the
+ * vcpu lbr emulation, so clear the vcpu_lbr flag when such an event gets
+ * scheduled out from the lbr.
+ */
+ if (is_no_counter_event(event))
+ cpuc->vcpu_lbr = 0;
+
if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0)
cpuc->lbr_pebs_users--;
cpuc->lbr_users--;
@@ -521,7 +537,12 @@ void intel_pmu_lbr_enable_all(bool pmi)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
- if (cpuc->lbr_users)
+ /*
+ * The vcpu lbr emulation doesn't need the host to enable the lbr at
+ * this point, because the guest enables it itself at the appropriate
+ * time.
+ */
+ if (cpuc->lbr_users && !cpuc->vcpu_lbr)
__intel_pmu_lbr_enable(pmi);
}
@@ -529,7 +550,11 @@ void intel_pmu_lbr_disable_all(void)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
- if (cpuc->lbr_users)
+ /*
+ * As in intel_pmu_lbr_enable_all(), the guest is responsible for
+ * disabling the lbr itself.
+ */
+ if (cpuc->lbr_users && !cpuc->vcpu_lbr)
__intel_pmu_lbr_disable();
}
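/*
 * For reference (a sketch, not part of this patch): the skipped
 * __intel_pmu_lbr_enable()/__intel_pmu_lbr_disable() calls essentially
 * toggle the LBR enable bit in IA32_DEBUGCTL (plus PMI-freeze handling
 * not shown here). With cpuc->vcpu_lbr set, the guest performs the
 * equivalent MSR writes itself:
 */
u64 debugctl;

rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl | DEBUGCTLMSR_LBR);	/* enable */
wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl & ~DEBUGCTLMSR_LBR);	/* disable */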
@@ -668,8 +693,12 @@ void intel_pmu_lbr_read(void)
*
* This could be smarter and actually check the event,
* but this simple approach seems to work for now.
+ *
+ * Also, there is no need to read the lbr msrs here if the vcpu lbr
+ * event is using the lbr, as the guest reads them itself.
*/
- if (!cpuc->lbr_users || cpuc->lbr_users == cpuc->lbr_pebs_users)
+ if (!cpuc->lbr_users || cpuc->vcpu_lbr ||
+ cpuc->lbr_users == cpuc->lbr_pebs_users)
return;
if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
@@ -802,6 +831,9 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event)
if (!x86_pmu.lbr_nr)
return -EOPNOTSUPP;
+ if (event->attr.exclude_host && is_kernel_event(event))
+ perf_event_set_no_counter(event);
+
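/*
 * How the two conditions above arise (based on the other hunks in this
 * patch): attr.exclude_host is set by kvm in intel_pmu_create_lbr_event(),
 * and perf_event_create_kernel_counter() marks the event's owner as
 * TASK_TOMBSTONE, which is exactly what is_kernel_event() checks. The
 * combination therefore identifies the kvm-created vcpu lbr event and
 * marks it as needing no pmu counter.
 */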
/*
* setup SW LBR filter
*/
@@ -220,6 +220,7 @@ struct cpu_hw_events {
/*
* Intel LBR bits
*/
+ u8 vcpu_lbr;
int lbr_users;
int lbr_pebs_users;
struct perf_branch_stack lbr_stack;
@@ -474,6 +474,7 @@ struct kvm_pmu {
struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED];
struct irq_work irq_work;
u64 reprogram_pmi;
+ struct perf_event *lbr_event;
};
struct kvm_pmu_ops;
@@ -164,6 +164,70 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
return ret;
}
+int intel_pmu_create_lbr_event(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct perf_event *event;
+
+ /*
+ * The perf event is created for the following purposes:
+ * - have the host perf subsystem manage (prioritize) the guest's use of
+ * the lbr against other host lbr events, if any. The pinned field is
+ * set to true to make this a task pinned event. If a cpu pinned lbr
+ * event reclaims the lbr, this event's oncpu field is set to -1. That
+ * field is checked right before vm-entry, and if the vcpu's lbr event
+ * no longer owns the lbr, the lbr feature is not passed through to the
+ * guest for direct accesses; the guest's lbr accesses then trap to
+ * kvm's handler, which rejects them.
+ * - have the host perf subsystem save/restore the guest lbr stack on
+ * vcpu switches. Since host perf only performs this save/restore for
+ * user callstack mode lbr events, the sample_type and
+ * branch_sample_type fields are configured accordingly to make this a
+ * user callstack mode lbr event.
+ *
+ * This perf event only emulates the lbr feature and does not use a pmu
+ * counter, so the related attr fields, such as config and
+ * sample_period, don't need to be set here. exclude_host is set to
+ * tell the perf lbr driver that the event is for the guest lbr
+ * emulation.
+ */
+ struct perf_event_attr attr = {
+ .type = PERF_TYPE_RAW,
+ .size = sizeof(attr),
+ .pinned = true,
+ .exclude_host = true,
+ .sample_type = PERF_SAMPLE_BRANCH_STACK,
+ .branch_sample_type = PERF_SAMPLE_BRANCH_CALL_STACK |
+ PERF_SAMPLE_BRANCH_USER,
+ };
+
+ if (pmu->lbr_event)
+ return 0;
+
+ event = perf_event_create_kernel_counter(&attr, -1, current, NULL,
+ NULL);
+ if (IS_ERR(event)) {
+ pr_err("%s: failed %ld\n", __func__, PTR_ERR(event));
+ return -ENOENT;
+ }
+ pmu->lbr_event = event;
+
+ return 0;
+}
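/*
 * A minimal sketch of the pre-vm-entry check described in the comment
 * above; the helper name here is hypothetical, not part of this diff:
 */
static bool vcpu_lbr_event_owns_lbr(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	/* oncpu is -1 once a cpu pinned host lbr event reclaims the lbr */
	return pmu->lbr_event && READ_ONCE(pmu->lbr_event->oncpu) != -1;
}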
+
+void intel_pmu_free_lbr_event(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct perf_event *event = pmu->lbr_event;
+
+ if (!event)
+ return;
+
+ perf_event_release_kernel(event);
+ pmu->lbr_event = NULL;
+}
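/*
 * Hypothetical usage sketch (the call sites are not part of this diff):
 * the event would typically be created lazily when the guest first
 * enables the lbr, and released when the vcpu's pmu is reset or
 * destroyed, e.g.:
 *
 *	if (intel_pmu_create_lbr_event(vcpu))
 *		return 1;	(fall back to trapping the lbr msr accesses)
 *	...
 *	intel_pmu_free_lbr_event(vcpu);
 */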
+
static bool intel_pmu_lbr_enable(struct kvm_vcpu *vcpu)
{
struct kvm *kvm = vcpu->kvm;
@@ -1050,6 +1050,13 @@ static inline void perf_event_set_no_counter(struct perf_event *event)
event->event_caps |= PERF_EV_CAP_NO_COUNTER;
}
+#define TASK_TOMBSTONE ((void *)-1L)
+
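+/*
+ * perf_event_create_kernel_counter() sets event->owner to TASK_TOMBSTONE,
+ * so this identifies events created by in-kernel users (such as kvm).
+ */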
+static inline bool is_kernel_event(struct perf_event *event)
+{
+ return READ_ONCE(event->owner) == TASK_TOMBSTONE;
+}
+
/*
* Return 1 for a software event, 0 for a hardware event
*/
@@ -164,13 +164,6 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
raw_spin_unlock(&cpuctx->ctx.lock);
}
-#define TASK_TOMBSTONE ((void *)-1L)
-
-static bool is_kernel_event(struct perf_event *event)
-{
- return READ_ONCE(event->owner) == TASK_TOMBSTONE;
-}
-
/*
* On task ctx scheduling...
*