Message ID | 20240801045907.4010984-34-mizhang@google.com (mailing list archive) |
---|---|
State | New, archived |
Series | Mediated Passthrough vPMU 3.0 for x86 |
On 8/1/2024 12:58 PM, Mingwei Zhang wrote:
> Implement the save/restore of PMU state for the passthrough PMU on Intel.
> In passthrough mode, KVM exclusively owns the PMU hardware while control
> flow is inside the scope of the passthrough PMU. Thus, KVM needs to save
> the host PMU state and take full ownership of the PMU hardware. Conversely,
> the host regains ownership of the PMU hardware from KVM when control flow
> leaves the scope of the passthrough PMU.
>
> Implement PMU context switches for Intel CPUs and opportunistically use
> rdpmcl() instead of rdmsrl() when reading counters, since the former has
> lower latency on Intel CPUs.
>
> Co-developed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
> Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
> Signed-off-by: Mingwei Zhang <mizhang@google.com>
> Tested-by: Yongwei Ma <yongwei.ma@intel.com>
> ---
>  arch/x86/kvm/pmu.c           | 46 ++++++++++++++++++++++++++++++++++++
>  arch/x86/kvm/vmx/pmu_intel.c | 41 +++++++++++++++++++++++++++++++-
>  2 files changed, 86 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
> index 782b564bdf96..9bb733384069 100644
> --- a/arch/x86/kvm/pmu.c
> +++ b/arch/x86/kvm/pmu.c
> @@ -1068,14 +1068,60 @@ void kvm_pmu_passthrough_pmu_msrs(struct kvm_vcpu *vcpu)
>
>  void kvm_pmu_save_pmu_context(struct kvm_vcpu *vcpu)
>  {
> +        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> +        struct kvm_pmc *pmc;
> +        u32 i;
> +
>          lockdep_assert_irqs_disabled();
>
>          static_call_cond(kvm_x86_pmu_save_pmu_context)(vcpu);
> +
> +        /*
> +         * Clear the hardware selector MSRs and their counters to avoid
> +         * leakage and to avoid these guest GP counters getting accidentally
> +         * enabled when the host enables the global ctrl while running.
> +         */
> +        for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
> +                pmc = &pmu->gp_counters[i];
> +                rdpmcl(i, pmc->counter);
> +                rdmsrl(pmc->msr_eventsel, pmc->eventsel);
> +                if (pmc->counter)
> +                        wrmsrl(pmc->msr_counter, 0);
> +                if (pmc->eventsel)
> +                        wrmsrl(pmc->msr_eventsel, 0);
> +        }
> +
> +        for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
> +                pmc = &pmu->fixed_counters[i];
> +                rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i, pmc->counter);
> +                if (pmc->counter)
> +                        wrmsrl(pmc->msr_counter, 0);
> +        }
>  }
>
>  void kvm_pmu_restore_pmu_context(struct kvm_vcpu *vcpu)
>  {
> +        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> +        struct kvm_pmc *pmc;
> +        int i;
> +
>          lockdep_assert_irqs_disabled();
>
>          static_call_cond(kvm_x86_pmu_restore_pmu_context)(vcpu);
> +
> +        /*
> +         * No need to zero out unexposed GP/fixed counters/selectors since
> +         * RDPMC will be intercepted in that case. Accessing these counters
> +         * and selectors will cause a #GP in the guest.
> +         */
> +        for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
> +                pmc = &pmu->gp_counters[i];
> +                wrmsrl(pmc->msr_counter, pmc->counter);
> +                wrmsrl(pmc->msr_eventsel, pmu->gp_counters[i].eventsel);

pmu->gp_counters[i].eventsel -> pmc->eventsel

> +        }
> +
> +        for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
> +                pmc = &pmu->fixed_counters[i];
> +                wrmsrl(pmc->msr_counter, pmc->counter);
> +        }
>  }
> diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
> index 0de918dc14ea..89c8f73a48c8 100644
> --- a/arch/x86/kvm/vmx/pmu_intel.c
> +++ b/arch/x86/kvm/vmx/pmu_intel.c
> @@ -572,7 +572,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
>          }
>
>          for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
> -                pmu->fixed_counters[i].msr_eventsel = MSR_CORE_PERF_FIXED_CTR_CTRL;
> +                pmu->fixed_counters[i].msr_eventsel = 0;

Seems unnecessary to clear msr_eventsel for fixed counters. Why?

>                  pmu->fixed_counters[i].msr_counter = MSR_CORE_PERF_FIXED_CTR0 + i;
>          }
>  }
> @@ -799,6 +799,43 @@ static void intel_passthrough_pmu_msrs(struct kvm_vcpu *vcpu)
>          vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL, MSR_TYPE_RW, msr_intercept);
>  }
>
> +static void intel_save_guest_pmu_context(struct kvm_vcpu *vcpu)
> +{
> +        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> +
> +        /* Global ctrl register is already saved at VM-exit. */
> +        rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, pmu->global_status);
> +        /* Clear hardware MSR_CORE_PERF_GLOBAL_STATUS MSR, if non-zero. */
> +        if (pmu->global_status)
> +                wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, pmu->global_status);
> +
> +        rdmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl);
> +        /*
> +         * Clear the hardware FIXED_CTR_CTRL MSR to avoid information leakage
> +         * and to avoid these guest fixed counters getting accidentally
> +         * enabled when the host enables the global ctrl while running.
> +         */
> +        if (pmu->fixed_ctr_ctrl)
> +                wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
> +}
> +
> +static void intel_restore_guest_pmu_context(struct kvm_vcpu *vcpu)
> +{
> +        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> +        u64 global_status, toggle;
> +
> +        /* Clear the host global_ctrl MSR if non-zero. */
> +        wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
> +        rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, global_status);
> +        toggle = pmu->global_status ^ global_status;
> +        if (global_status & toggle)
> +                wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status & toggle);
> +        if (pmu->global_status & toggle)
> +                wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status & toggle);
> +
> +        wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl);
> +}
> +
>  struct kvm_pmu_ops intel_pmu_ops __initdata = {
>          .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc,
>          .msr_idx_to_pmc = intel_msr_idx_to_pmc,
> @@ -812,6 +849,8 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = {
>          .cleanup = intel_pmu_cleanup,
>          .is_rdpmc_passthru_allowed = intel_is_rdpmc_passthru_allowed,
>          .passthrough_pmu_msrs = intel_passthrough_pmu_msrs,
> +        .save_pmu_context = intel_save_guest_pmu_context,
> +        .restore_pmu_context = intel_restore_guest_pmu_context,
>          .EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT,
>          .MAX_NR_GP_COUNTERS = KVM_INTEL_PMC_MAX_GENERIC,
>          .MIN_NR_GP_COUNTERS = 1,
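
For reference, a minimal sketch of the GP-counter restore loop with the inline
"pmu->gp_counters[i].eventsel -> pmc->eventsel" suggestion applied. It only
restates the quoted patch with the pmc pointer used consistently; it is
illustrative, not the authors' final code:

        for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
                pmc = &pmu->gp_counters[i];
                wrmsrl(pmc->msr_counter, pmc->counter);
                /* Use the pmc pointer instead of re-indexing gp_counters[]. */
                wrmsrl(pmc->msr_eventsel, pmc->eventsel);
        }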
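
The overflow-status reconciliation in the quoted intel_restore_guest_pmu_context()
is easy to misread, so here is a short annotated sketch of the same logic with an
illustrative bit pattern worked through in the comments (the values are made up for
the example; the code itself mirrors the quoted patch):

        /*
         * Example: hardware GLOBAL_STATUS left over from the host = 0x1
         * (GP0 bit), guest's saved pmu->global_status = 0x2 (GP1 bit).
         * toggle = 0x2 ^ 0x1 = 0x3, i.e. the bits that differ.
         */
        rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, global_status);
        toggle = pmu->global_status ^ global_status;
        /* Bits set in hardware but not in the guest view (0x1) are cleared. */
        if (global_status & toggle)
                wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status & toggle);
        /* Bits set in the guest view but not in hardware (0x2) are re-asserted. */
        if (pmu->global_status & toggle)
                wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status & toggle);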