
[v3] KVM: x86/pmu: Manipulate FIXED_CTR_CTRL MSR with macros

Message ID 20230824020546.1108516-1-dapeng1.mi@linux.intel.com (mailing list archive)
State New, archived
Series [v3] KVM: x86/pmu: Manipulate FIXED_CTR_CTRL MSR with macros

Commit Message

Mi, Dapeng Aug. 24, 2023, 2:05 a.m. UTC
Magic numbers are used to manipulate the bit fields of the
FIXED_CTR_CTRL MSR, which hurts readability. Replace these magic
numbers with macros to make the code easier to read.
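
The macros come from arch/x86/include/asm/perf_event.h. For reference,
a minimal sketch of those definitions (not part of this patch, and
reproduced from memory, so double check against perf_event.h):

/* Assumed IA32_FIXED_CTR_CTRL helpers (perf_event.h naming). */
#define INTEL_FIXED_BITS_STRIDE		4		/* 4 control bits per fixed counter */
#define INTEL_FIXED_0_KERNEL		(1ULL << 0)	/* count in ring 0 */
#define INTEL_FIXED_0_USER		(1ULL << 1)	/* count in ring 3 */
#define INTEL_FIXED_0_ANYTHREAD		(1ULL << 2)
#define INTEL_FIXED_0_ENABLE_PMI	(1ULL << 3)
#define INTEL_FIXED_BITS_MASK		0xFULL

/* Shift the per-counter bits into the field for fixed counter @_idx. */
#define intel_fixed_bits_by_idx(_idx, _bits)	\
	((_bits) << ((_idx) * INTEL_FIXED_BITS_STRIDE))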

Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
---
 arch/x86/kvm/pmu.c           | 10 +++++-----
 arch/x86/kvm/pmu.h           |  6 ++++--
 arch/x86/kvm/vmx/pmu_intel.c | 11 ++++++++---
 3 files changed, 17 insertions(+), 10 deletions(-)


base-commit: fff2e47e6c3b8050ca26656693caa857e3a8b740

Comments

Sean Christopherson March 5, 2024, 11:22 p.m. UTC | #1
+Mingwei

On Thu, Aug 24, 2023, Dapeng Mi wrote:
> diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
> index 7d9ba301c090..ffda2ecc3a22 100644
> --- a/arch/x86/kvm/pmu.h
> +++ b/arch/x86/kvm/pmu.h
> @@ -12,7 +12,8 @@
>  					  MSR_IA32_MISC_ENABLE_BTS_UNAVAIL)
>  
>  /* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */
> -#define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf)
> +#define fixed_ctrl_field(ctrl_reg, idx) \
> +	(((ctrl_reg) >> ((idx) * INTEL_FIXED_BITS_STRIDE)) & INTEL_FIXED_BITS_MASK)
>  
>  #define VMWARE_BACKDOOR_PMC_HOST_TSC		0x10000
>  #define VMWARE_BACKDOOR_PMC_REAL_TIME		0x10001
> @@ -165,7 +166,8 @@ static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc)
>  
>  	if (pmc_is_fixed(pmc))
>  		return fixed_ctrl_field(pmu->fixed_ctr_ctrl,
> -					pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3;
> +					pmc->idx - INTEL_PMC_IDX_FIXED) &
> +					(INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER);
>  
>  	return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE;
>  }
> diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
> index f2efa0bf7ae8..b0ac55891cb7 100644
> --- a/arch/x86/kvm/vmx/pmu_intel.c
> +++ b/arch/x86/kvm/vmx/pmu_intel.c
> @@ -548,8 +548,13 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
>  		setup_fixed_pmc_eventsel(pmu);
>  	}
>  
> -	for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
> -		pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4));
> +	for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
> +		pmu->fixed_ctr_ctrl_mask &=
> +			 ~intel_fixed_bits_by_idx(i,
> +						  INTEL_FIXED_0_KERNEL |
> +						  INTEL_FIXED_0_USER |
> +						  INTEL_FIXED_0_ENABLE_PMI);
> +	}
>  	counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) |
>  		(((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED));
>  	pmu->global_ctrl_mask = counter_mask;
> @@ -595,7 +600,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
>  			pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE;
>  			for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
>  				pmu->fixed_ctr_ctrl_mask &=
> -					~(1ULL << (INTEL_PMC_IDX_FIXED + i * 4));

OMG, this might just win the award for most obfuscated PMU code in KVM, which is
saying something.  The fact that INTEL_PMC_IDX_FIXED happens to be 32, the same
bit number as ICL_FIXED_0_ADAPTIVE, is 100% coincidence.  Good riddance.

Argh, and this goofy code helped introduce a real bug.  reprogram_fixed_counters()
doesn't account for the upper 32 bits of IA32_FIXED_CTR_CTRL.
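
For anyone else untangling this, a quick sketch of how I read the
IA32_FIXED_CTR_CTRL layout (the INTEL_FIXED_* and ICL_FIXED_0_ADAPTIVE
names are the perf_event.h ones; FIXED_ADAPTIVE_BIT() below is purely
hypothetical, for illustration):

/*
 * Per fixed counter i (my reading of the SDM and perf_event.h):
 *   bits 4*i+3 .. 4*i : PMI | AnyThread | EN.ring3 | EN.ring0
 *                       (INTEL_FIXED_0_ENABLE_PMI/ANYTHREAD/USER/KERNEL)
 *   bit  32 + 4*i     : Adaptive_Record (ICL_FIXED_0_ADAPTIVE shifted by i*4)
 */
#define FIXED_ADAPTIVE_BIT(i)	intel_fixed_bits_by_idx(i, ICL_FIXED_0_ADAPTIVE)

/*
 * The old KVM code cleared (1ULL << (INTEL_PMC_IDX_FIXED + i * 4)), which
 * only equals FIXED_ADAPTIVE_BIT(i) because INTEL_PMC_IDX_FIXED is 32.
 */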

Wait, WTF?  Nothing in KVM accounts for the upper bits.  This can't possibly work.

IIUC, because KVM _always_ sets precise_ip to a non-zero bit for PEBS events,
perf will _always_ generate an adaptive record, even if the guest requested a
basic record.  Ugh, and KVM will always generate adaptive records even if the
guest doesn't support them.  This is all completely broken.  It probably kinda
sorta works because the Basic info is always stored in the record, and generating
more info requires a non-zero MSR_PEBS_DATA_CFG, but ugh.

Oh great, and it gets worse.  intel_pmu_disable_fixed() doesn't clear the upper
bits either, i.e. leaves ICL_FIXED_0_ADAPTIVE set.  Unless I'm misreading the code,
intel_pmu_enable_fixed() effectively doesn't clear ICL_FIXED_0_ADAPTIVE either,
as it only modifies the bit when it wants to set ICL_FIXED_0_ADAPTIVE.

*sigh*

I'm _very_ tempted to disable KVM PEBS support for the current PMU, and make it
available only when the so-called passthrough PMU is available[*].  Because I
don't see how this can possibly be functionally correct, nor do I see a way
to make it functionally correct without a rather large and invasive series.

Ouch.  And after chatting with Mingwei, who asked the very good question of
"can this leak host state?", I am pretty sure that yes, this can leak host state.

When PERF_CAP_PEBS_BASELINE is enabled for the guest, i.e. when the guest has
access to adaptive records, KVM gives the guest full access to MSR_PEBS_DATA_CFG

	pmu->pebs_data_cfg_mask = ~0xff00000full;

which makes sense in a vacuum, because AFAICT the architecture doesn't allow
exposing a subset of the four adaptive controls.
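
For reference, the adaptive controls behind that mask, as I read the SDM
and the host perf definitions (names are from arch/x86/events/perf_event.h
as I remember them, not quoted from KVM):

/* MSR_PEBS_DATA_CFG adaptive-record controls (assumed definitions): */
#define PEBS_DATACFG_MEMINFO	BIT_ULL(0)	/* Memory Info group */
#define PEBS_DATACFG_GP		BIT_ULL(1)	/* GPRs */
#define PEBS_DATACFG_XMMS	BIT_ULL(2)	/* XMM registers */
#define PEBS_DATACFG_LBRS	BIT_ULL(3)	/* LBR entries */
#define PEBS_DATACFG_LBR_SHIFT	24		/* bits 31:24: number of LBR entries */

/* pebs_data_cfg_mask = ~0xff00000full leaves exactly bits 3:0 and 31:24 writable. */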

GPRs and XMMs are always context switched and thus benign, but IIUC, Memory Info
provides data that might not otherwise be available to the guest, e.g. if host
userspace has disallowed equivalent events via KVM_SET_PMU_EVENT_FILTER.

And unless I'm missing something, LBRs are a full leak of host state.  Nothing
in the SDM suggests that PEBS records honor MSR intercepts, so unless KVM is
also passing through LBRs, i.e. is context switching all LBR MSRs, the guest can
use PEBS to read host LBRs at will.

Unless someone chimes in to point out how PEBS virtualization isn't a broken mess,
I will post a patch to effectively disable PEBS virtualization.

diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 41a4533f9989..a2f827fa0ca1 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -392,7 +392,7 @@ static inline bool vmx_pt_mode_is_host_guest(void)
 
 static inline bool vmx_pebs_supported(void)
 {
-       return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept;
+       return false;
 }
 
 static inline bool cpu_has_notify_vmexit(void)
Mi, Dapeng March 6, 2024, 7:55 a.m. UTC | #2
On 3/6/2024 7:22 AM, Sean Christopherson wrote:
> +Mingwei
>
> On Thu, Aug 24, 2023, Dapeng Mi wrote:
>   diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
>> index 7d9ba301c090..ffda2ecc3a22 100644
>> --- a/arch/x86/kvm/pmu.h
>> +++ b/arch/x86/kvm/pmu.h
>> @@ -12,7 +12,8 @@
>>   					  MSR_IA32_MISC_ENABLE_BTS_UNAVAIL)
>>   
>>   /* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */
>> -#define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf)
>> +#define fixed_ctrl_field(ctrl_reg, idx) \
>> +	(((ctrl_reg) >> ((idx) * INTEL_FIXED_BITS_STRIDE)) & INTEL_FIXED_BITS_MASK)
>>   
>>   #define VMWARE_BACKDOOR_PMC_HOST_TSC		0x10000
>>   #define VMWARE_BACKDOOR_PMC_REAL_TIME		0x10001
>> @@ -165,7 +166,8 @@ static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc)
>>   
>>   	if (pmc_is_fixed(pmc))
>>   		return fixed_ctrl_field(pmu->fixed_ctr_ctrl,
>> -					pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3;
>> +					pmc->idx - INTEL_PMC_IDX_FIXED) &
>> +					(INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER);
>>   
>>   	return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE;
>>   }
>> diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
>> index f2efa0bf7ae8..b0ac55891cb7 100644
>> --- a/arch/x86/kvm/vmx/pmu_intel.c
>> +++ b/arch/x86/kvm/vmx/pmu_intel.c
>> @@ -548,8 +548,13 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
>>   		setup_fixed_pmc_eventsel(pmu);
>>   	}
>>   
>> -	for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
>> -		pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4));
>> +	for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
>> +		pmu->fixed_ctr_ctrl_mask &=
>> +			 ~intel_fixed_bits_by_idx(i,
>> +						  INTEL_FIXED_0_KERNEL |
>> +						  INTEL_FIXED_0_USER |
>> +						  INTEL_FIXED_0_ENABLE_PMI);
>> +	}
>>   	counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) |
>>   		(((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED));
>>   	pmu->global_ctrl_mask = counter_mask;
>> @@ -595,7 +600,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
>>   			pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE;
>>   			for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
>>   				pmu->fixed_ctr_ctrl_mask &=
>> -					~(1ULL << (INTEL_PMC_IDX_FIXED + i * 4));
> OMG, this might just win the award for most obfuscated PMU code in KVM, which is
> saying something.  The fact that INTEL_PMC_IDX_FIXED happens to be 32, the same
> bit number as ICL_FIXED_0_ADAPTIVE, is 100% coincidence.  Good riddance.
>
> Argh, and this goofy code helped introduce a real bug.  reprogram_fixed_counters()
> doesn't account for the upper 32 bits of IA32_FIXED_CTR_CTRL.
>
> Wait, WTF?  Nothing in KVM accounts for the upper bits.  This can't possibly work.
>
> IIUC, because KVM _always_ sets precise_ip to a non-zero bit for PEBS events,
> perf will _always_ generate an adaptive record, even if the guest requested a
> basic record.  Ugh, and KVM will always generate adaptive records even if the
> guest doesn't support them.  This is all completely broken.  It probably kinda
> sorta works because the Basic info is always stored in the record, and generating
> more info requires a non-zero MSR_PEBS_DATA_CFG, but ugh.
>
> Oh great, and it gets worse.  intel_pmu_disable_fixed() doesn't clear the upper
> bits either, i.e. leaves ICL_FIXED_0_ADAPTIVE set.  Unless I'm misreading the code,
> intel_pmu_enable_fixed() effectively doesn't clear ICL_FIXED_0_ADAPTIVE either,
> as it only modifies the bit when it wants to set ICL_FIXED_0_ADAPTIVE.


Currently the host PMU driver always sets the "Adaptive_Record" bit in
the PERFEVTSELx and FIXED_CTR_CTRL MSRs as long as the HW supports the
adaptive PEBS feature (see the helpers intel_pmu_pebs_enable() and
intel_pmu_enable_fixed() for details).

It looks like the perf subsystem doesn't export an interface to
enable/disable adaptive PEBS. I suppose that's why KVM doesn't handle
the "Adaptive_Record" bit in the PERFEVTSELx and FIXED_CTR_CTRL MSRs.


>
> *sigh*
>
> I'm _very_ tempted to disable KVM PEBS support for the current PMU, and make it
> available only when the so-called passthrough PMU is available[*].  Because I
> don't see how this is can possibly be functionally correct, nor do I see a way
> to make it functionally correct without a rather large and invasive series.
>
> Ouch.  And after chatting with Mingwei, who asked the very good question of
> "can this leak host state?", I am pretty sure that yes, this can leak host state.
>
> When PERF_CAP_PEBS_BASELINE is enabled for the guest, i.e. when the guest has
> access to adaptive records, KVM gives the guest full access to MSR_PEBS_DATA_CFG
>
> 	pmu->pebs_data_cfg_mask = ~0xff00000full;
>
> which makes sense in a vacuum, because AFAICT the architecture doesn't allow
> exposing a subset of the four adaptive controls.
>
> GPRs and XMMs are always context switched and thus benign, but IIUC, Memory Info
> provides data that might now otherwise be available to the guest, e.g. if host
> userspace has disallowed equivalent events via KVM_SET_PMU_EVENT_FILTER.
>
> And unless I'm missing something, LBRs are a full leak of host state.  Nothing
> in the SDM suggests that PEBS records honor MSR intercepts, so unless KVM is
> also passing through LBRs, i.e. is context switching all LBR MSRs, the guest can
> use PEBS to read host LBRs at will.

Not sure if I missed something, but I don't see a leak of host state.
All perf events created by KVM set the "exclude_host" attribute, so all
guest counters, including the PEBS-enabled counters, are disabled by
VMX immediately on VM-exit, and the PEBS engine stops as well. I don't
see how a PEBS record containing host state could be written into the
guest DS area.
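
Roughly what I mean, a trimmed sketch of the perf_event_attr that KVM
builds in pmc_reprogram_counter() (paraphrased from memory, not an
exact copy of the current code):

	struct perf_event_attr attr = {
		.type		= type,
		.size		= sizeof(attr),
		.pinned		= true,
		.exclude_idle	= true,
		.exclude_host	= 1,	/* stop counting once we're back in VMX root mode */
		.exclude_user	= exclude_user,
		.exclude_kernel	= exclude_kernel,
		.config		= config,
	};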


>
> Unless someone chimes in to point out how PEBS virtualization isn't a broken mess,
> I will post a patch to effectively disable PEBS virtualization.
>
> diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
> index 41a4533f9989..a2f827fa0ca1 100644
> --- a/arch/x86/kvm/vmx/capabilities.h
> +++ b/arch/x86/kvm/vmx/capabilities.h
> @@ -392,7 +392,7 @@ static inline bool vmx_pt_mode_is_host_guest(void)
>   
>   static inline bool vmx_pebs_supported(void)
>   {
> -       return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept;
> +       return false;
>   }
>   
>   static inline bool cpu_has_notify_vmexit(void)
Like Xu March 6, 2024, 9:03 a.m. UTC | #3
On 6/3/2024 7:22 am, Sean Christopherson wrote:
> +Mingwei
> 
> On Thu, Aug 24, 2023, Dapeng Mi wrote:
>   diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
>> index 7d9ba301c090..ffda2ecc3a22 100644
>> --- a/arch/x86/kvm/pmu.h
>> +++ b/arch/x86/kvm/pmu.h
>> @@ -12,7 +12,8 @@
>>   					  MSR_IA32_MISC_ENABLE_BTS_UNAVAIL)
>>   
>>   /* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */
>> -#define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf)
>> +#define fixed_ctrl_field(ctrl_reg, idx) \
>> +	(((ctrl_reg) >> ((idx) * INTEL_FIXED_BITS_STRIDE)) & INTEL_FIXED_BITS_MASK)
>>   
>>   #define VMWARE_BACKDOOR_PMC_HOST_TSC		0x10000
>>   #define VMWARE_BACKDOOR_PMC_REAL_TIME		0x10001
>> @@ -165,7 +166,8 @@ static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc)
>>   
>>   	if (pmc_is_fixed(pmc))
>>   		return fixed_ctrl_field(pmu->fixed_ctr_ctrl,
>> -					pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3;
>> +					pmc->idx - INTEL_PMC_IDX_FIXED) &
>> +					(INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER);
>>   
>>   	return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE;
>>   }
>> diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
>> index f2efa0bf7ae8..b0ac55891cb7 100644
>> --- a/arch/x86/kvm/vmx/pmu_intel.c
>> +++ b/arch/x86/kvm/vmx/pmu_intel.c
>> @@ -548,8 +548,13 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
>>   		setup_fixed_pmc_eventsel(pmu);
>>   	}
>>   
>> -	for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
>> -		pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4));
>> +	for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
>> +		pmu->fixed_ctr_ctrl_mask &=
>> +			 ~intel_fixed_bits_by_idx(i,
>> +						  INTEL_FIXED_0_KERNEL |
>> +						  INTEL_FIXED_0_USER |
>> +						  INTEL_FIXED_0_ENABLE_PMI);
>> +	}
>>   	counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) |
>>   		(((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED));
>>   	pmu->global_ctrl_mask = counter_mask;
>> @@ -595,7 +600,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
>>   			pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE;
>>   			for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
>>   				pmu->fixed_ctr_ctrl_mask &=
>> -					~(1ULL << (INTEL_PMC_IDX_FIXED + i * 4));
> 
> OMG, this might just win the award for most obfuscated PMU code in KVM, which is
> saying something.  The fact that INTEL_PMC_IDX_FIXED happens to be 32, the same
> bit number as ICL_FIXED_0_ADAPTIVE, is 100% coincidence.  Good riddance.
> 
> Argh, and this goofy code helped introduce a real bug.  reprogram_fixed_counters()
> doesn't account for the upper 32 bits of IA32_FIXED_CTR_CTRL.
> 
> Wait, WTF?  Nothing in KVM accounts for the upper bits.  This can't possibly work.
> 
> IIUC, because KVM _always_ sets precise_ip to a non-zero bit for PEBS events,
> perf will _always_ generate an adaptive record, even if the guest requested a
> basic record.  Ugh, and KVM will always generate adaptive records even if the
> guest doesn't support them.  This is all completely broken.  It probably kinda
> sorta works because the Basic info is always stored in the record, and generating
> more info requires a non-zero MSR_PEBS_DATA_CFG, but ugh.

Yep, it works at least on machines with both adaptive and pebs_full features.

I remember one generation of Atom core (GOLDMONT?) that didn't have both of
the above PEBS sub-features, so we didn't set x86_pmu.pebs_ept on that platform.

Mingwei or others are encouraged to construct use cases in KUT::pmu_pebs.flat
that violate guest-pebs rules (e.g., leak host state), as we all recognize
that testing is the right way to condemn legacy code, not just lengthy emails.

> 
> Oh great, and it gets worse.  intel_pmu_disable_fixed() doesn't clear the upper
> bits either, i.e. leaves ICL_FIXED_0_ADAPTIVE set.  Unless I'm misreading the code,
> intel_pmu_enable_fixed() effectively doesn't clear ICL_FIXED_0_ADAPTIVE either,
> as it only modifies the bit when it wants to set ICL_FIXED_0_ADAPTIVE.
> 
> *sigh*
> 
> I'm _very_ tempted to disable KVM PEBS support for the current PMU, and make it
> available only when the so-called passthrough PMU is available[*].  Because I
> don't see how this is can possibly be functionally correct, nor do I see a way
> to make it functionally correct without a rather large and invasive series.

Considering that I've tried the idea myself, I have no inclination towards
a "passthrough PMU", and I'd like to be able to take the time to review that
patchset while we all wait for a clear statement from the perf-core
maintainer, who doesn't really care about virtualization and doesn't want
to lose control of global hardware resources.

Before we actually get to that ideal state you want, we have to deal with
some intermediate state and face any users that rely on the current code.
You had urged merging a KVM document for vPMU; I'm not sure how far along
that part of the work is.

> 
> Ouch.  And after chatting with Mingwei, who asked the very good question of
> "can this leak host state?", I am pretty sure that yes, this can leak host state.

The Basic Info group has a tsc field; I suspect it's the host-side TSC.

> 
> When PERF_CAP_PEBS_BASELINE is enabled for the guest, i.e. when the guest has
> access to adaptive records, KVM gives the guest full access to MSR_PEBS_DATA_CFG
> 
> 	pmu->pebs_data_cfg_mask = ~0xff00000full;
> 
> which makes sense in a vacuum, because AFAICT the architecture doesn't allow
> exposing a subset of the four adaptive controls.
> 
> GPRs and XMMs are always context switched and thus benign, but IIUC, Memory Info
> provides data that might now otherwise be available to the guest, e.g. if host
> userspace has disallowed equivalent events via KVM_SET_PMU_EVENT_FILTER.

Indeed, KVM_SET_PMU_EVENT_FILTER doesn't work in harmony with
guest-pebs, and I believe there is a big problem here, especially with the
lack of targeted testing.

One reason for this is that we don't use this cockamamie API in our
large-scale production environments, and users of vPMU want to get real
runtime information about physical cpus, not just virtualised hardware
architecture interfaces.

> 
> And unless I'm missing something, LBRs are a full leak of host state.  Nothing
> in the SDM suggests that PEBS records honor MSR intercepts, so unless KVM is
> also passing through LBRs, i.e. is context switching all LBR MSRs, the guest can
> use PEBS to read host LBRs at will.

KVM is also passing through LBRs when the guest uses LBR, but not at the
granularity of vm-exit/entry. I'm not sure whether the LBR_EN bit is
required to get LBR information via PEBS, and I haven't confirmed whether
PEBS-LBR can be enabled at the same time as standalone LBR.

I recall that the PEBS assist, per the cpu-arch, would clean up this part
of the record when crossing root/non-root boundaries, or not generate a
record at all.

We're looking forward to the tests that will undermine this perception.

There are some devilish details during the processing of vm-exit and
the generation of host/guest pebs, and those interested can delve into
the short description in this SDM section "20.9.5 EPT-Friendly PEBS".

> 
> Unless someone chimes in to point out how PEBS virtualization isn't a broken mess,
> I will post a patch to effectively disable PEBS virtualization.

There are two factors that affect the availability of guest-pebs:

1. the technical need to use the core PMU in both the host and guest worlds;
(I don't think Googlers are paying attention to this part of users' needs)

2. guest-pebs is temporarily disabled when a counter is cross-mapped,
which reduces the number of performance samples collected by the guest;

> 
> diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
> index 41a4533f9989..a2f827fa0ca1 100644
> --- a/arch/x86/kvm/vmx/capabilities.h
> +++ b/arch/x86/kvm/vmx/capabilities.h
> @@ -392,7 +392,7 @@ static inline bool vmx_pt_mode_is_host_guest(void)
>   
>   static inline bool vmx_pebs_supported(void)
>   {
> -       return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept;
> +       return false;

As you know, a user-space VMM may disable guest-pebs by filtering out
MSR_IA32_PERF_CAPABILITIES.PERF_CAP_PEBS_FORMAT or CPUID.PDCM.

In the end, if our great KVM maintainers insist on doing this,
there is obviously nothing I can do about it.

Hope you have a good day.

>   }
>   
>   static inline bool cpu_has_notify_vmexit(void)
>
Jim Mattson March 6, 2024, 3:09 p.m. UTC | #4
On Wed, Mar 6, 2024 at 1:11 AM Like Xu <like.xu.linux@gmail.com> wrote:
>
> On 6/3/2024 7:22 am, Sean Christopherson wrote:
> > +Mingwei
> >
> > On Thu, Aug 24, 2023, Dapeng Mi wrote:
> >   diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
> >> index 7d9ba301c090..ffda2ecc3a22 100644
> >> --- a/arch/x86/kvm/pmu.h
> >> +++ b/arch/x86/kvm/pmu.h
> >> @@ -12,7 +12,8 @@
> >>                                        MSR_IA32_MISC_ENABLE_BTS_UNAVAIL)
> >>
> >>   /* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */
> >> -#define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf)
> >> +#define fixed_ctrl_field(ctrl_reg, idx) \
> >> +    (((ctrl_reg) >> ((idx) * INTEL_FIXED_BITS_STRIDE)) & INTEL_FIXED_BITS_MASK)
> >>
> >>   #define VMWARE_BACKDOOR_PMC_HOST_TSC               0x10000
> >>   #define VMWARE_BACKDOOR_PMC_REAL_TIME              0x10001
> >> @@ -165,7 +166,8 @@ static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc)
> >>
> >>      if (pmc_is_fixed(pmc))
> >>              return fixed_ctrl_field(pmu->fixed_ctr_ctrl,
> >> -                                    pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3;
> >> +                                    pmc->idx - INTEL_PMC_IDX_FIXED) &
> >> +                                    (INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER);
> >>
> >>      return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE;
> >>   }
> >> diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
> >> index f2efa0bf7ae8..b0ac55891cb7 100644
> >> --- a/arch/x86/kvm/vmx/pmu_intel.c
> >> +++ b/arch/x86/kvm/vmx/pmu_intel.c
> >> @@ -548,8 +548,13 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
> >>              setup_fixed_pmc_eventsel(pmu);
> >>      }
> >>
> >> -    for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
> >> -            pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4));
> >> +    for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
> >> +            pmu->fixed_ctr_ctrl_mask &=
> >> +                     ~intel_fixed_bits_by_idx(i,
> >> +                                              INTEL_FIXED_0_KERNEL |
> >> +                                              INTEL_FIXED_0_USER |
> >> +                                              INTEL_FIXED_0_ENABLE_PMI);
> >> +    }
> >>      counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) |
> >>              (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED));
> >>      pmu->global_ctrl_mask = counter_mask;
> >> @@ -595,7 +600,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
> >>                      pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE;
> >>                      for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
> >>                              pmu->fixed_ctr_ctrl_mask &=
> >> -                                    ~(1ULL << (INTEL_PMC_IDX_FIXED + i * 4));
> >
> > OMG, this might just win the award for most obfuscated PMU code in KVM, which is
> > saying something.  The fact that INTEL_PMC_IDX_FIXED happens to be 32, the same
> > bit number as ICL_FIXED_0_ADAPTIVE, is 100% coincidence.  Good riddance.
> >
> > Argh, and this goofy code helped introduce a real bug.  reprogram_fixed_counters()
> > doesn't account for the upper 32 bits of IA32_FIXED_CTR_CTRL.
> >
> > Wait, WTF?  Nothing in KVM accounts for the upper bits.  This can't possibly work.
> >
> > IIUC, because KVM _always_ sets precise_ip to a non-zero bit for PEBS events,
> > perf will _always_ generate an adaptive record, even if the guest requested a
> > basic record.  Ugh, and KVM will always generate adaptive records even if the
> > guest doesn't support them.  This is all completely broken.  It probably kinda
> > sorta works because the Basic info is always stored in the record, and generating
> > more info requires a non-zero MSR_PEBS_DATA_CFG, but ugh.
>
> Yep, it works at least on machines with both adaptive and pebs_full features.
>
> I remember one generation of Atom core (? GOLDMONT) that didn't have both
> above PEBS sub-features, so we didn't set x86_pmu.pebs_ept on that platform.
>
> Mingwei or others are encouraged to construct use cases in KUT::pmu_pebs.flat
> that violate guest-pebs rules (e.g., leak host state), as we all recognize that
> testing
> is the right way to condemn legacy code, not just lengthy emails.
>
> >
> > Oh great, and it gets worse.  intel_pmu_disable_fixed() doesn't clear the upper
> > bits either, i.e. leaves ICL_FIXED_0_ADAPTIVE set.  Unless I'm misreading the code,
> > intel_pmu_enable_fixed() effectively doesn't clear ICL_FIXED_0_ADAPTIVE either,
> > as it only modifies the bit when it wants to set ICL_FIXED_0_ADAPTIVE.
> >
> > *sigh*
> >
> > I'm _very_ tempted to disable KVM PEBS support for the current PMU, and make it
> > available only when the so-called passthrough PMU is available[*].  Because I
> > don't see how this is can possibly be functionally correct, nor do I see a way
> > to make it functionally correct without a rather large and invasive series.
>
> Considering that I've tried the idea myself, I have no inclination towards
> "passthrough PMU", and I'd like to be able to take the time to review that
> patchset while we all wait for a clear statement from that perf-core man,
> who don't really care about virtualization and don't want to lose control
> of global hardware resources.
>
> Before we actually get to that ideal state you want, we have to deal with
> some intermediate state and face to any users that rely on the current code,
> you had urged to merge in a KVM document for vPMU, not sure how far
> along that part of the work is.
>
> >
> > Ouch.  And after chatting with Mingwei, who asked the very good question of
> > "can this leak host state?", I am pretty sure that yes, this can leak host state.
>
> The Basic Info has a tsc field, I suspect it's the host-state-tsc.
>
> >
> > When PERF_CAP_PEBS_BASELINE is enabled for the guest, i.e. when the guest has
> > access to adaptive records, KVM gives the guest full access to MSR_PEBS_DATA_CFG
> >
> >       pmu->pebs_data_cfg_mask = ~0xff00000full;
> >
> > which makes sense in a vacuum, because AFAICT the architecture doesn't allow
> > exposing a subset of the four adaptive controls.
> >
> > GPRs and XMMs are always context switched and thus benign, but IIUC, Memory Info
> > provides data that might now otherwise be available to the guest, e.g. if host
> > userspace has disallowed equivalent events via KVM_SET_PMU_EVENT_FILTER.
>
> Indeed, KVM_SET_PMU_EVENT_FILTER doesn't work in harmony with
> guest-pebs, and I believe there is a big problem here, especially with the
> lack of targeted testing.
>
> One reason for this is that we don't use this cockamamie API in our
> large-scale production environments, and users of vPMU want to get real
> runtime information about physical cpus, not just virtualised hardware
> architecture interfaces.
>
> >
> > And unless I'm missing something, LBRs are a full leak of host state.  Nothing
> > in the SDM suggests that PEBS records honor MSR intercepts, so unless KVM is
> > also passing through LBRs, i.e. is context switching all LBR MSRs, the guest can
> > use PEBS to read host LBRs at will.
>
> KVM is also passing through LBRs when guest uses LBR but not at the
> granularity of vm-exit/entry. I'm not sure if the LBR_EN bit is required
> to get LBR information via PEBS, also not confirmed whether PEBS-lbr
> can be enabled at the same time as independent LBR;
>
> I recall that PEBS-assist, per cpu-arch, would clean up this part of the
> record when crossing root/non-root boundaries, or not generate record.
>
> We're looking forward to the tests that will undermine this perception.
>
> There are some devilish details during the processing of vm-exit and
> the generation of host/guest pebs, and those interested can delve into
> the short description in this SDM section "20.9.5 EPT-Friendly PEBS".
>
> >
> > Unless someone chimes in to point out how PEBS virtualization isn't a broken mess,
> > I will post a patch to effectively disable PEBS virtualization.
>
> There are two factors that affect the availability of guest-pebs:
>
> 1. the technical need to use core-PMU in both host/guest worlds;
> (I don't think Googlers are paying attention to this part of users' needs)

Let me clear up any misperceptions you might have that Google alone is
foisting the pass-through PMU on the world. The work so far has been a
collaboration between Google and Intel. Now, AMD has joined the
collaboration as well. Mingwei is taking the lead on the project, but
Googlers are outnumbered by the x86 CPU vendors ten to one.

The pass-through PMU allows both the host and guest worlds to use the
core PMU, more so than the existing vPMU implementation. I assume your
complaint is about the desire for host software to monitor guest
behavior with core PMU events while the guest is running. Today,
Google Cloud does this for fleet management, and losing this
capability is not something we are looking forward to. However, the
writing is on the wall: Coco is going to take this capability away
from us anyway.

> 2. guest-pebs is temporarily disabled in the case of cross-mapping counter,
> which reduces the number of performance samples collected by guest;
>
> >
> > diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
> > index 41a4533f9989..a2f827fa0ca1 100644
> > --- a/arch/x86/kvm/vmx/capabilities.h
> > +++ b/arch/x86/kvm/vmx/capabilities.h
> > @@ -392,7 +392,7 @@ static inline bool vmx_pt_mode_is_host_guest(void)
> >
> >   static inline bool vmx_pebs_supported(void)
> >   {
> > -       return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept;
> > +       return false;
>
> As you know, user-space VMM may disable guest-pebs by filtering out the
> MSR_IA32_PERF_CAPABILITIE.PERF_CAP_PEBS_FORMAT or CPUID.PDCM.
>
> In the end, if our great KVM maintainers insist on doing this,
> there is obviously nothing I can do about it.
>
> Hope you have a good day.
>
> >   }
> >
> >   static inline bool cpu_has_notify_vmexit(void)
> >
>
Sean Christopherson March 6, 2024, 8:17 p.m. UTC | #5
On Wed, Mar 06, 2024, Like Xu wrote:
> > > @@ -595,7 +600,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
> > >   			pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE;
> > >   			for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
> > >   				pmu->fixed_ctr_ctrl_mask &=
> > > -					~(1ULL << (INTEL_PMC_IDX_FIXED + i * 4));
> > 
> > OMG, this might just win the award for most obfuscated PMU code in KVM, which is
> > saying something.  The fact that INTEL_PMC_IDX_FIXED happens to be 32, the same
> > bit number as ICL_FIXED_0_ADAPTIVE, is 100% coincidence.  Good riddance.
> > 
> > Argh, and this goofy code helped introduce a real bug.  reprogram_fixed_counters()
> > doesn't account for the upper 32 bits of IA32_FIXED_CTR_CTRL.
> > 
> > Wait, WTF?  Nothing in KVM accounts for the upper bits.  This can't possibly work.
> > 
> > IIUC, because KVM _always_ sets precise_ip to a non-zero bit for PEBS events,
> > perf will _always_ generate an adaptive record, even if the guest requested a
> > basic record.  Ugh, and KVM will always generate adaptive records even if the
> > guest doesn't support them.  This is all completely broken.  It probably kinda
> > sorta works because the Basic info is always stored in the record, and generating
> > more info requires a non-zero MSR_PEBS_DATA_CFG, but ugh.
> 
> Yep, it works at least on machines with both adaptive and pebs_full features.

*AND* if the guest uses PEBS exactly the same way that the Linux kernel uses PEBS.

> Mingwei or others are encouraged to construct use cases in KUT::pmu_pebs.flat
> that violate guest-pebs rules (e.g., leak host state), as we all recognize
> that testing is the right way to condemn legacy code, not just lengthy emails.

*sigh*

diff --git a/x86/pmu_pebs.c b/x86/pmu_pebs.c
index f7b52b90..43e7a207 100644
--- a/x86/pmu_pebs.c
+++ b/x86/pmu_pebs.c
@@ -212,8 +212,12 @@ static void pebs_enable(u64 bitmask, u64 pebs_data_cfg)
        u64 baseline_extra_ctrl = 0, fixed_ctr_ctrl = 0;
        unsigned int idx;
 
-       if (has_baseline)
-               wrmsr(MSR_PEBS_DATA_CFG, pebs_data_cfg);
+       if (has_baseline) {
+               if (pebs_data_cfg)
+                       wrmsr(MSR_PEBS_DATA_CFG, pebs_data_cfg);
+               else
+                       wrmsr(MSR_PEBS_DATA_CFG, 0xf);
+       }
 
        ds = (struct debug_store *)ds_bufer;
        ds->pebs_index = ds->pebs_buffer_base = (unsigned long)pebs_buffer;
@@ -224,7 +228,7 @@ static void pebs_enable(u64 bitmask, u64 pebs_data_cfg)
        for (idx = 0; idx < pmu.nr_fixed_counters; idx++) {
                if (!(BIT_ULL(FIXED_CNT_INDEX + idx) & bitmask))
                        continue;
-               if (has_baseline)
+               if (has_baseline && pebs_data_cfg)
                        baseline_extra_ctrl = BIT(FIXED_CNT_INDEX + idx * 4);
                wrmsr(MSR_PERF_FIXED_CTRx(idx), ctr_start_val);
                fixed_ctr_ctrl |= (0xbULL << (idx * 4) | baseline_extra_ctrl);
@@ -235,7 +239,7 @@ static void pebs_enable(u64 bitmask, u64 pebs_data_cfg)
        for (idx = 0; idx < max_nr_gp_events; idx++) {
                if (!(BIT_ULL(idx) & bitmask))
                        continue;
-               if (has_baseline)
+               if (has_baseline && pebs_data_cfg)
                        baseline_extra_ctrl = ICL_EVENTSEL_ADAPTIVE;
                wrmsr(MSR_GP_EVENT_SELECTx(idx), EVNTSEL_EN | EVNTSEL_OS | EVNTSEL_USR |
                                                 intel_arch_events[idx] | baseline_extra_ctrl);

FAIL: Multiple (0x700000055): PEBS record (written seq 0) is verified (including size, counters and cfg).
FAIL: The pebs_record_size (488) doesn't match with MSR_PEBS_DATA_CFG (32).
FAIL: The pebs_data_cfg (0xf) doesn't match with MSR_PEBS_DATA_CFG (0x0).
FAIL: GP counter 0 (0xfffffffffffe): PEBS record (written seq 0) is verified (including size, counters and cfg).
FAIL: The pebs_record_size (488) doesn't match with MSR_PEBS_DATA_CFG (32).
FAIL: The pebs_data_cfg (0xf) doesn't match with MSR_PEBS_DATA_CFG (0x0).
FAIL: Multiple (0x700000055): PEBS record (written seq 0) is verified (including size, counters and cfg).
FAIL: The pebs_record_size (488) doesn't match with MSR_PEBS_DATA_CFG (32).
FAIL: The pebs_data_cfg (0xf) doesn't match with MSR_PEBS_DATA_CFG (0x0).

> > Oh great, and it gets worse.  intel_pmu_disable_fixed() doesn't clear the upper
> > bits either, i.e. leaves ICL_FIXED_0_ADAPTIVE set.  Unless I'm misreading the code,
> > intel_pmu_enable_fixed() effectively doesn't clear ICL_FIXED_0_ADAPTIVE either,
> > as it only modifies the bit when it wants to set ICL_FIXED_0_ADAPTIVE.
> > 
> > *sigh*
> > 
> > I'm _very_ tempted to disable KVM PEBS support for the current PMU, and make it
> > available only when the so-called passthrough PMU is available[*].  Because I
> > don't see how this is can possibly be functionally correct, nor do I see a way
> > to make it functionally correct without a rather large and invasive series.
> 
> Considering that I've tried the idea myself, I have no inclination towards
> "passthrough PMU", and I'd like to be able to take the time to review that
> patchset while we all wait for a clear statement from that perf-core man,
> who don't really care about virtualization and don't want to lose control
> of global hardware resources.
> 
> Before we actually get to that ideal state you want, we have to deal with
> some intermediate state and face to any users that rely on the current code,

It's not an ideal state, it's simply the only way I see to get things like adaptive
PEBS to work safely, reliably, and correctly without taking on an absurd amount of
complexity.

> you had urged to merge in a KVM document for vPMU, not sure how far
> along that part of the work is.

> > Ouch.  And after chatting with Mingwei, who asked the very good question of
> > "can this leak host state?", I am pretty sure that yes, this can leak host state.
> 
> The Basic Info has a tsc field, I suspect it's the host-state-tsc.

It's not; the CPU offsets it correctly, at least on ICX (I haven't checked scaling).

> > When PERF_CAP_PEBS_BASELINE is enabled for the guest, i.e. when the guest has
> > access to adaptive records, KVM gives the guest full access to MSR_PEBS_DATA_CFG
> > 
> > 	pmu->pebs_data_cfg_mask = ~0xff00000full;
> > 
> > which makes sense in a vacuum, because AFAICT the architecture doesn't allow
> > exposing a subset of the four adaptive controls.
> > 
> > GPRs and XMMs are always context switched and thus benign, but IIUC, Memory Info
> > provides data that might now otherwise be available to the guest, e.g. if host
> > userspace has disallowed equivalent events via KVM_SET_PMU_EVENT_FILTER.
> 
> Indeed, KVM_SET_PMU_EVENT_FILTER doesn't work in harmony with
> guest-pebs, and I believe there is a big problem here, especially with the
> lack of targeted testing.
> 
> One reason for this is that we don't use this cockamamie API in our

Libeling APIs because they aren't useful for _your_ security goals doesn't mean
you get to ignore their existence when contributing upstream.  New features don't
necessarily have to fully support existing capabilities, e.g. I would be a-ok
making KVM_SET_PMU_EVENT_FILTER mutually exclusive with exposing adaptive PEBS
to the guest.  That way the user can at least know that filtering won't work if
adaptive PEBS is exposed to the guest.

> large-scale production environments, and users of vPMU want to get real
> runtime information about physical cpus, not just virtualised hardware
> architecture interfaces.
> 
> > 
> > And unless I'm missing something, LBRs are a full leak of host state.  Nothing
> > in the SDM suggests that PEBS records honor MSR intercepts, so unless KVM is
> > also passing through LBRs, i.e. is context switching all LBR MSRs, the guest can
> > use PEBS to read host LBRs at will.
> 
> KVM is also passing through LBRs when guest uses LBR but not at the
> granularity of vm-exit/entry. I'm not sure if the LBR_EN bit is required
> to get LBR information via PEBS, also not confirmed whether PEBS-lbr
> can be enabled at the same time as independent LBR;
> 
> I recall that PEBS-assist, per cpu-arch, would clean up this part of the
> record when crossing root/non-root boundaries, or not generate record.

Nope.  The MSRs definitely leak to the guest.  The only hard part was figuring
out how to get perf to utilize LBRs without consuming every counter (`perf top`
was the extent of my knowledge, until now...).

E.g.

  perf record -b -e instructions

and

  FAIL: PEBS LBR record 0 isn't empty, got from = 'ffffffffc0a00ccb', to = 'ffffffffc0a010af', info = '2'
  FAIL: PEBS LBR record 1 isn't empty, got from = 'ffffffffc0a010aa', to = 'ffffffffc0a00cb0', info = '6'
  FAIL: PEBS LBR record 2 isn't empty, got from = 'ffffffffc0a00e06', to = 'ffffffffc0a01090', info = '1'
  FAIL: PEBS LBR record 3 isn't empty, got from = 'ffffffffc0a00df4', to = 'ffffffffc0a00e00', info = '2'
  FAIL: PEBS LBR record 4 isn't empty, got from = 'ffffffffc0a00dbc', to = 'ffffffffc0a00de0', info = '1'
  FAIL: PEBS LBR record 5 isn't empty, got from = 'ffffffffc0a00f63', to = 'ffffffffc0a00db5', info = '1'
  FAIL: PEBS LBR record 6 isn't empty, got from = 'ffffffffc0903f23', to = 'ffffffffc0a00f61', info = '11'
  FAIL: PEBS LBR record 7 isn't empty, got from = 'ffffffffc0a00f5c', to = 'ffffffffc0903f10', info = '1'
  FAIL: PEBS LBR record 8 isn't empty, got from = 'ffffffffc0a00db0', to = 'ffffffffc0a00f55', info = '1'
  FAIL: PEBS LBR record 9 isn't empty, got from = 'ffffffff8f6b2c23', to = 'ffffffffc0a00da6', info = 'a'
  FAIL: PEBS LBR record 10 isn't empty, got from = 'ffffffffc0a00da1', to = 'ffffffff8f6b2b60', info = '7'
  FAIL: PEBS LBR record 11 isn't empty, got from = 'ffffffff8eba1b85', to = 'ffffffffc0a00d9a', info = '6'
  FAIL: PEBS LBR record 12 isn't empty, got from = 'ffffffff8eba1b8c', to = 'ffffffff8eba1b3f', info = '1'
  FAIL: PEBS LBR record 13 isn't empty, got from = 'ffffffff8eba1b2d', to = 'ffffffff8eba1b87', info = 'e'
  FAIL: PEBS LBR record 14 isn't empty, got from = 'ffffffff8eba1aff', to = 'ffffffff8eba1b18', info = '7'
  FAIL: PEBS LBR record 15 isn't empty, got from = 'ffffffff8eb996e0', to = 'ffffffff8eba1aeb', info = '2'
  FAIL: PEBS LBR record 16 isn't empty, got from = 'ffffffff8eb9963a', to = 'ffffffff8eb996cc', info = '1'
  FAIL: PEBS LBR record 17 isn't empty, got from = 'ffffffff8eb995ef', to = 'ffffffff8eb9962f', info = '1'
  FAIL: PEBS LBR record 18 isn't empty, got from = 'ffffffff8f6b30a1', to = 'ffffffff8eb995df', info = '4'
  FAIL: PEBS LBR record 19 isn't empty, got from = 'ffffffff8eb995da', to = 'ffffffff8f6b3070', info = '2'
  FAIL: PEBS LBR record 20 isn't empty, got from = 'ffffffff8eba1ae6', to = 'ffffffff8eb995c0', info = '6'
  FAIL: PEBS LBR record 21 isn't empty, got from = 'ffffffffc0a00d95', to = 'ffffffff8eba1a90', info = '2'
  FAIL: PEBS LBR record 22 isn't empty, got from = 'ffffffff8eb69135', to = 'ffffffffc0a00d89', info = '2'
  FAIL: PEBS LBR record 23 isn't empty, got from = 'ffffffff8eb690d6', to = 'ffffffff8eb69104', info = '2'
  FAIL: PEBS LBR record 24 isn't empty, got from = 'ffffffff8eb69102', to = 'ffffffff8eb690c3', info = '1'
  FAIL: PEBS LBR record 25 isn't empty, got from = 'ffffffff8eb6f442', to = 'ffffffff8eb69100', info = '2'
  FAIL: PEBS LBR record 26 isn't empty, got from = 'ffffffff8eb6f3f3', to = 'ffffffff8eb6f429', info = '5'
  FAIL: PEBS LBR record 27 isn't empty, got from = 'ffffffff8eb6f3a6', to = 'ffffffff8eb6f3c3', info = '2'
  FAIL: PEBS LBR record 28 isn't empty, got from = 'ffffffff8eb690fb', to = 'ffffffff8eb6f390', info = '2'
  FAIL: PEBS LBR record 29 isn't empty, got from = 'ffffffff8eb690c1', to = 'ffffffff8eb690d8', info = '2'
  FAIL: PEBS LBR record 30 isn't empty, got from = 'ffffffff8eb6907b', to = 'ffffffff8eb690b1', info = '1'
  FAIL: PEBS LBR record 31 isn't empty, got from = 'ffffffff8eb690af', to = 'ffffffff8eb6906e', info = '1'

> > diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
> > index 41a4533f9989..a2f827fa0ca1 100644
> > --- a/arch/x86/kvm/vmx/capabilities.h
> > +++ b/arch/x86/kvm/vmx/capabilities.h
> > @@ -392,7 +392,7 @@ static inline bool vmx_pt_mode_is_host_guest(void)
> >   static inline bool vmx_pebs_supported(void)
> >   {
> > -       return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept;
> > +       return false;
> 
> As you know, user-space VMM may disable guest-pebs by filtering out the
> MSR_IA32_PERF_CAPABILITIE.PERF_CAP_PEBS_FORMAT or CPUID.PDCM.

Relying on userspace to fudge around KVM bugs is not acceptable for upstream.
PMU virtualization is already a bit dicey in that KVM relies on userspace to
filter out sensitive events, but leaking host addresses and failing to correctly
virtualize the architecture is an entirely different level of wrong.

> In the end, if our great KVM maintainers insist on doing this,
> there is obviously nothing I can do about it.

Bullshit.  There is plenty you could have done to prevent this.  It took me what,
6 hours total?  To (a) realize the code is buggy, (b) type up an email, and (c)
modify tests to confirm the bugs.  And at least half of that time was due to me
floundering around trying to generate LBR records because I know nothing about
the perf tool.

If you were more interested in ensuring KVM is maintainble, secure, and functionally
correct, we wouldn't be where we are today.
Like Xu March 7, 2024, 3:27 a.m. UTC | #6
On 6/3/2024 11:09 pm, Jim Mattson wrote:
> On Wed, Mar 6, 2024 at 1:11 AM Like Xu <like.xu.linux@gmail.com> wrote:
>>
>> On 6/3/2024 7:22 am, Sean Christopherson wrote:
>>> +Mingwei
>>>
>>> On Thu, Aug 24, 2023, Dapeng Mi wrote:
>>>    diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
>>>> index 7d9ba301c090..ffda2ecc3a22 100644
>>>> --- a/arch/x86/kvm/pmu.h
>>>> +++ b/arch/x86/kvm/pmu.h
>>>> @@ -12,7 +12,8 @@
>>>>                                         MSR_IA32_MISC_ENABLE_BTS_UNAVAIL)
>>>>
>>>>    /* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */
>>>> -#define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf)
>>>> +#define fixed_ctrl_field(ctrl_reg, idx) \
>>>> +    (((ctrl_reg) >> ((idx) * INTEL_FIXED_BITS_STRIDE)) & INTEL_FIXED_BITS_MASK)
>>>>
>>>>    #define VMWARE_BACKDOOR_PMC_HOST_TSC               0x10000
>>>>    #define VMWARE_BACKDOOR_PMC_REAL_TIME              0x10001
>>>> @@ -165,7 +166,8 @@ static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc)
>>>>
>>>>       if (pmc_is_fixed(pmc))
>>>>               return fixed_ctrl_field(pmu->fixed_ctr_ctrl,
>>>> -                                    pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3;
>>>> +                                    pmc->idx - INTEL_PMC_IDX_FIXED) &
>>>> +                                    (INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER);
>>>>
>>>>       return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE;
>>>>    }
>>>> diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
>>>> index f2efa0bf7ae8..b0ac55891cb7 100644
>>>> --- a/arch/x86/kvm/vmx/pmu_intel.c
>>>> +++ b/arch/x86/kvm/vmx/pmu_intel.c
>>>> @@ -548,8 +548,13 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
>>>>               setup_fixed_pmc_eventsel(pmu);
>>>>       }
>>>>
>>>> -    for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
>>>> -            pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4));
>>>> +    for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
>>>> +            pmu->fixed_ctr_ctrl_mask &=
>>>> +                     ~intel_fixed_bits_by_idx(i,
>>>> +                                              INTEL_FIXED_0_KERNEL |
>>>> +                                              INTEL_FIXED_0_USER |
>>>> +                                              INTEL_FIXED_0_ENABLE_PMI);
>>>> +    }
>>>>       counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) |
>>>>               (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED));
>>>>       pmu->global_ctrl_mask = counter_mask;
>>>> @@ -595,7 +600,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
>>>>                       pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE;
>>>>                       for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
>>>>                               pmu->fixed_ctr_ctrl_mask &=
>>>> -                                    ~(1ULL << (INTEL_PMC_IDX_FIXED + i * 4));
>>>
>>> OMG, this might just win the award for most obfuscated PMU code in KVM, which is
>>> saying something.  The fact that INTEL_PMC_IDX_FIXED happens to be 32, the same
>>> bit number as ICL_FIXED_0_ADAPTIVE, is 100% coincidence.  Good riddance.
>>>
>>> Argh, and this goofy code helped introduce a real bug.  reprogram_fixed_counters()
>>> doesn't account for the upper 32 bits of IA32_FIXED_CTR_CTRL.
>>>
>>> Wait, WTF?  Nothing in KVM accounts for the upper bits.  This can't possibly work.
>>>
>>> IIUC, because KVM _always_ sets precise_ip to a non-zero bit for PEBS events,
>>> perf will _always_ generate an adaptive record, even if the guest requested a
>>> basic record.  Ugh, and KVM will always generate adaptive records even if the
>>> guest doesn't support them.  This is all completely broken.  It probably kinda
>>> sorta works because the Basic info is always stored in the record, and generating
>>> more info requires a non-zero MSR_PEBS_DATA_CFG, but ugh.
>>
>> Yep, it works at least on machines with both adaptive and pebs_full features.
>>
>> I remember one generation of Atom core (? GOLDMONT) that didn't have both
>> above PEBS sub-features, so we didn't set x86_pmu.pebs_ept on that platform.
>>
>> Mingwei or others are encouraged to construct use cases in KUT::pmu_pebs.flat
>> that violate guest-pebs rules (e.g., leak host state), as we all recognize that
>> testing
>> is the right way to condemn legacy code, not just lengthy emails.
>>
>>>
>>> Oh great, and it gets worse.  intel_pmu_disable_fixed() doesn't clear the upper
>>> bits either, i.e. leaves ICL_FIXED_0_ADAPTIVE set.  Unless I'm misreading the code,
>>> intel_pmu_enable_fixed() effectively doesn't clear ICL_FIXED_0_ADAPTIVE either,
>>> as it only modifies the bit when it wants to set ICL_FIXED_0_ADAPTIVE.
>>>
>>> *sigh*
>>>
>>> I'm _very_ tempted to disable KVM PEBS support for the current PMU, and make it
>>> available only when the so-called passthrough PMU is available[*].  Because I
>>> don't see how this is can possibly be functionally correct, nor do I see a way
>>> to make it functionally correct without a rather large and invasive series.
>>
>> Considering that I've tried the idea myself, I have no inclination towards
>> "passthrough PMU", and I'd like to be able to take the time to review that
>> patchset while we all wait for a clear statement from that perf-core man,
>> who don't really care about virtualization and don't want to lose control
>> of global hardware resources.
>>
>> Before we actually get to that ideal state you want, we have to deal with
>> some intermediate state and face to any users that rely on the current code,
>> you had urged to merge in a KVM document for vPMU, not sure how far
>> along that part of the work is.
>>
>>>
>>> Ouch.  And after chatting with Mingwei, who asked the very good question of
>>> "can this leak host state?", I am pretty sure that yes, this can leak host state.
>>
>> The Basic Info has a tsc field, I suspect it's the host-state-tsc.
>>
>>>
>>> When PERF_CAP_PEBS_BASELINE is enabled for the guest, i.e. when the guest has
>>> access to adaptive records, KVM gives the guest full access to MSR_PEBS_DATA_CFG
>>>
>>>        pmu->pebs_data_cfg_mask = ~0xff00000full;
>>>
>>> which makes sense in a vacuum, because AFAICT the architecture doesn't allow
>>> exposing a subset of the four adaptive controls.
>>>
>>> GPRs and XMMs are always context switched and thus benign, but IIUC, Memory Info
>>> provides data that might now otherwise be available to the guest, e.g. if host
>>> userspace has disallowed equivalent events via KVM_SET_PMU_EVENT_FILTER.
>>
>> Indeed, KVM_SET_PMU_EVENT_FILTER doesn't work in harmony with
>> guest-pebs, and I believe there is a big problem here, especially with the
>> lack of targeted testing.
>>
>> One reason for this is that we don't use this cockamamie API in our
>> large-scale production environments, and users of vPMU want to get real
>> runtime information about physical cpus, not just virtualised hardware
>> architecture interfaces.
>>
>>>
>>> And unless I'm missing something, LBRs are a full leak of host state.  Nothing
>>> in the SDM suggests that PEBS records honor MSR intercepts, so unless KVM is
>>> also passing through LBRs, i.e. is context switching all LBR MSRs, the guest can
>>> use PEBS to read host LBRs at will.
>>
>> KVM is also passing through LBRs when guest uses LBR but not at the
>> granularity of vm-exit/entry. I'm not sure if the LBR_EN bit is required
>> to get LBR information via PEBS, also not confirmed whether PEBS-lbr
>> can be enabled at the same time as independent LBR;
>>
>> I recall that PEBS-assist, per cpu-arch, would clean up this part of the
>> record when crossing root/non-root boundaries, or not generate record.
>>
>> We're looking forward to the tests that will undermine this perception.
>>
>> There are some devilish details during the processing of vm-exit and
>> the generation of host/guest pebs, and those interested can delve into
>> the short description in this SDM section "20.9.5 EPT-Friendly PEBS".
>>
>>>
>>> Unless someone chimes in to point out how PEBS virtualization isn't a broken mess,
>>> I will post a patch to effectively disable PEBS virtualization.
>>
>> There are two factors that affect the availability of guest-pebs:
>>
>> 1. the technical need to use core-PMU in both host/guest worlds;
>> (I don't think Googlers are paying attention to this part of users' needs)
> 
> Let me clear up any misperceptions you might have that Google alone is
> foisting the pass-through PMU on the world. The work so far has been a
> collaboration between Google and Intel. Now, AMD has joined the
> collaboration as well. Mingwei is taking the lead on the project, but
> Googlers are outnumbered by the x86 CPU vendors ten to one.

This is such great news.

> 
> The pass-through PMU allows both the host and guest worlds to use the
> core PMU, more so than the existing vPMU implementation. I assume your

Can I further confirm that, in any case, both the host and the guest can
use PMU resources, such as some of the more specialized, more accurate
counters? Is that static partitioning scheme the end of the story?

> complaint is about the desire for host software to monitor guest
> behavior with core PMU events while the guest is running. Today,
> Google Cloud does this for fleet management, and losing this
> capability is not something we are looking forward to. However, the
> writing is on the wall: Coco is going to take this capability away
> from us anyway.

Coco pays a corresponding performance cost, and it's a paradox to hide
any performance trace of coco-guests from the host's point of view.

Thanks for the input, Jim. Let me try to help.

> 
>> 2. guest-pebs is temporarily disabled in the case of cross-mapping counter,
>> which reduces the number of performance samples collected by guest;
>>
>>>
>>> diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
>>> index 41a4533f9989..a2f827fa0ca1 100644
>>> --- a/arch/x86/kvm/vmx/capabilities.h
>>> +++ b/arch/x86/kvm/vmx/capabilities.h
>>> @@ -392,7 +392,7 @@ static inline bool vmx_pt_mode_is_host_guest(void)
>>>
>>>    static inline bool vmx_pebs_supported(void)
>>>    {
>>> -       return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept;
>>> +       return false;
>>
>> As you know, user-space VMM may disable guest-pebs by filtering out the
>> MSR_IA32_PERF_CAPABILITIE.PERF_CAP_PEBS_FORMAT or CPUID.PDCM.
>>
>> In the end, if our great KVM maintainers insist on doing this,
>> there is obviously nothing I can do about it.
>>
>> Hope you have a good day.
>>
>>>    }
>>>
>>>    static inline bool cpu_has_notify_vmexit(void)
>>>
>>
Mi, Dapeng March 7, 2024, 11:54 a.m. UTC | #7
On 3/6/2024 3:55 PM, Mi, Dapeng wrote:
>
> On 3/6/2024 7:22 AM, Sean Christopherson wrote:
>> +Mingwei
>>
>> On Thu, Aug 24, 2023, Dapeng Mi wrote:
>>   diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
>>> index 7d9ba301c090..ffda2ecc3a22 100644
>>> --- a/arch/x86/kvm/pmu.h
>>> +++ b/arch/x86/kvm/pmu.h
>>> @@ -12,7 +12,8 @@
>>>                         MSR_IA32_MISC_ENABLE_BTS_UNAVAIL)
>>>     /* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */
>>> -#define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) 
>>> & 0xf)
>>> +#define fixed_ctrl_field(ctrl_reg, idx) \
>>> +    (((ctrl_reg) >> ((idx) * INTEL_FIXED_BITS_STRIDE)) & 
>>> INTEL_FIXED_BITS_MASK)
>>>     #define VMWARE_BACKDOOR_PMC_HOST_TSC        0x10000
>>>   #define VMWARE_BACKDOOR_PMC_REAL_TIME        0x10001
>>> @@ -165,7 +166,8 @@ static inline bool pmc_speculative_in_use(struct 
>>> kvm_pmc *pmc)
>>>         if (pmc_is_fixed(pmc))
>>>           return fixed_ctrl_field(pmu->fixed_ctr_ctrl,
>>> -                    pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3;
>>> +                    pmc->idx - INTEL_PMC_IDX_FIXED) &
>>> +                    (INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER);
>>>         return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE;
>>>   }
>>> diff --git a/arch/x86/kvm/vmx/pmu_intel.c 
>>> b/arch/x86/kvm/vmx/pmu_intel.c
>>> index f2efa0bf7ae8..b0ac55891cb7 100644
>>> --- a/arch/x86/kvm/vmx/pmu_intel.c
>>> +++ b/arch/x86/kvm/vmx/pmu_intel.c
>>> @@ -548,8 +548,13 @@ static void intel_pmu_refresh(struct kvm_vcpu 
>>> *vcpu)
>>>           setup_fixed_pmc_eventsel(pmu);
>>>       }
>>>   -    for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
>>> -        pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4));
>>> +    for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
>>> +        pmu->fixed_ctr_ctrl_mask &=
>>> +             ~intel_fixed_bits_by_idx(i,
>>> +                          INTEL_FIXED_0_KERNEL |
>>> +                          INTEL_FIXED_0_USER |
>>> +                          INTEL_FIXED_0_ENABLE_PMI);
>>> +    }
>>>       counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) |
>>>           (((1ull << pmu->nr_arch_fixed_counters) - 1) << 
>>> INTEL_PMC_IDX_FIXED));
>>>       pmu->global_ctrl_mask = counter_mask;
>>> @@ -595,7 +600,7 @@ static void intel_pmu_refresh(struct kvm_vcpu 
>>> *vcpu)
>>>               pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE;
>>>               for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
>>>                   pmu->fixed_ctr_ctrl_mask &=
>>> -                    ~(1ULL << (INTEL_PMC_IDX_FIXED + i * 4));
>> OMG, this might just win the award for most obfuscated PMU code in 
>> KVM, which is
>> saying something.  The fact that INTEL_PMC_IDX_FIXED happens to be 
>> 32, the same
>> bit number as ICL_FIXED_0_ADAPTIVE, is 100% coincidence.  Good riddance.
>>
>> Argh, and this goofy code helped introduce a real bug. 
>> reprogram_fixed_counters()
>> doesn't account for the upper 32 bits of IA32_FIXED_CTR_CTRL.
>>
>> Wait, WTF?  Nothing in KVM accounts for the upper bits.  This can't 
>> possibly work.
>>
>> IIUC, because KVM _always_ sets precise_ip to a non-zero bit for PEBS 
>> events,
>> perf will _always_ generate an adaptive record, even if the guest 
>> requested a
>> basic record.  Ugh, and KVM will always generate adaptive records 
>> even if the
>> guest doesn't support them.  This is all completely broken.  It 
>> probably kinda
>> sorta works because the Basic info is always stored in the record, 
>> and generating
>> more info requires a non-zero MSR_PEBS_DATA_CFG, but ugh.
>>
>> Oh great, and it gets worse.  intel_pmu_disable_fixed() doesn't clear 
>> the upper
>> bits either, i.e. leaves ICL_FIXED_0_ADAPTIVE set.  Unless I'm 
>> misreading the code,
>> intel_pmu_enable_fixed() effectively doesn't clear 
>> ICL_FIXED_0_ADAPTIVE either,
>> as it only modifies the bit when it wants to set ICL_FIXED_0_ADAPTIVE.
>
>
> Currently the host PMU driver always sets the "Adaptive_Record" bit in the
> PERFEVTSELx and FIXED_CTR_CTRL MSRs as long as the HW supports the adaptive
> PEBS feature (see the helpers intel_pmu_pebs_enable() and
> intel_pmu_enable_fixed() for details).
>
> It looks like the perf subsystem doesn't export an interface to
> enable/disable adaptive PEBS.  I suppose that's why KVM doesn't handle the
> "Adaptive_Record" bit in the PERFEVTSELx and FIXED_CTR_CTRL MSRs.
>
>
>>
>> *sigh*
>>
>> I'm _very_ tempted to disable KVM PEBS support for the current PMU, 
>> and make it
>> available only when the so-called passthrough PMU is available[*].  
>> Because I
>> don't see how this is can possibly be functionally correct, nor do I 
>> see a way
>> to make it functionally correct without a rather large and invasive 
>> series.
>>
>> Ouch.  And after chatting with Mingwei, who asked the very good 
>> question of
>> "can this leak host state?", I am pretty sure that yes, this can leak 
>> host state.
>>
>> When PERF_CAP_PEBS_BASELINE is enabled for the guest, i.e. when the 
>> guest has
>> access to adaptive records, KVM gives the guest full access to 
>> MSR_PEBS_DATA_CFG
>>
>>     pmu->pebs_data_cfg_mask = ~0xff00000full;
>>
>> which makes sense in a vacuum, because AFAICT the architecture 
>> doesn't allow
>> exposing a subset of the four adaptive controls.
>>
>> GPRs and XMMs are always context switched and thus benign, but IIUC, 
>> Memory Info
>> provides data that might now otherwise be available to the guest, 
>> e.g. if host
>> userspace has disallowed equivalent events via KVM_SET_PMU_EVENT_FILTER.
>>
>> And unless I'm missing something, LBRs are a full leak of host 
>> state.  Nothing
>> in the SDM suggests that PEBS records honor MSR intercepts, so unless 
>> KVM is
>> also passing through LBRs, i.e. is context switching all LBR MSRs, 
>> the guest can
>> use PEBS to read host LBRs at will.
>
> Not sure if I missed something, but I don't see a leak of host state.  All
> perf events created by KVM set the "exclude_host" attribute, which means all
> guest counters, including the PEBS-enabled ones, are disabled by VMX
> immediately on VM-exit, and so the PEBS engine stops as well.  I don't see
> how a PEBS record containing host state could be written into the guest DS
> area.
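
To make that concrete, the attribute KVM builds for a guest counter looks
roughly like the stripped-down sketch below. The field selection is condensed
and the helper name is mine, not the exact upstream code; the relevant part is
exclude_host=1.

#include <stdbool.h>
#include <string.h>
#include <linux/perf_event.h>

/*
 * Stripped-down sketch of the perf_event_attr KVM creates for a guest
 * counter: exclude_host=1 tells perf to disable the hardware counter
 * outside guest mode, so it only counts (and PEBS only arms) while the
 * guest is running.
 */
static struct perf_event_attr guest_counter_attr(__u64 config,
						 bool allow_user,
						 bool allow_kernel)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_RAW;
	attr.size = sizeof(attr);
	attr.config = config;
	attr.pinned = 1;
	attr.exclude_host = 1;		/* count only while in guest mode */
	attr.exclude_user = !allow_user;
	attr.exclude_kernel = !allow_kernel;

	return attr;
}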


On second thought, it does look like the host LBR stack could leak into the
guest, since the LBR stack is neither cleared nor switched on VM-entry, even
though captured guest LBR records will gradually overwrite the stack after
VM-entry.
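
Just to frame one possible mitigation (purely hypothetical on my side, not
something KVM does today): the legacy LBR FROM/TO stacks could be wiped before
resuming the guest, along the lines below. The 0x680/0x6c0 MSR bases are the
legacy LBR stacks; the stack depth and the hook point are assumptions.

/*
 * Hypothetical mitigation sketch only: zero the legacy LBR FROM/TO
 * stacks before VM-entry so a guest PEBS+LBR record cannot capture
 * stale host branches.
 */
void wrmsrl(unsigned int msr, unsigned long long val);	/* stand-in declaration */

static void clear_legacy_lbr_stack(int nr_lbr_entries)
{
	int i;

	for (i = 0; i < nr_lbr_entries; i++) {
		wrmsrl(0x680 + i, 0);	/* MSR_LASTBRANCH_i_FROM_IP */
		wrmsrl(0x6c0 + i, 0);	/* MSR_LASTBRANCH_i_TO_IP   */
	}
}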


>
>
>>
>> Unless someone chimes in to point out how PEBS virtualization isn't a 
>> broken mess,
>> I will post a patch to effectively disable PEBS virtualization.
>>
>> diff --git a/arch/x86/kvm/vmx/capabilities.h 
>> b/arch/x86/kvm/vmx/capabilities.h
>> index 41a4533f9989..a2f827fa0ca1 100644
>> --- a/arch/x86/kvm/vmx/capabilities.h
>> +++ b/arch/x86/kvm/vmx/capabilities.h
>> @@ -392,7 +392,7 @@ static inline bool vmx_pt_mode_is_host_guest(void)
>>     static inline bool vmx_pebs_supported(void)
>>   {
>> -       return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept;
>> +       return false;
>>   }
>>     static inline bool cpu_has_notify_vmexit(void)
Like Xu March 7, 2024, 12:02 p.m. UTC | #8
On 7/3/2024 4:17 am, Sean Christopherson wrote:
> On Wed, Mar 06, 2024, Like Xu wrote:
>>>> @@ -595,7 +600,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
>>>>    			pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE;
>>>>    			for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
>>>>    				pmu->fixed_ctr_ctrl_mask &=
>>>> -					~(1ULL << (INTEL_PMC_IDX_FIXED + i * 4));
>>>
>>> OMG, this might just win the award for most obfuscated PMU code in KVM, which is
>>> saying something.  The fact that INTEL_PMC_IDX_FIXED happens to be 32, the same
>>> bit number as ICL_FIXED_0_ADAPTIVE, is 100% coincidence.  Good riddance.
>>>
>>> Argh, and this goofy code helped introduce a real bug.  reprogram_fixed_counters()
>>> doesn't account for the upper 32 bits of IA32_FIXED_CTR_CTRL.
>>>
>>> Wait, WTF?  Nothing in KVM accounts for the upper bits.  This can't possibly work.
>>>
>>> IIUC, because KVM _always_ sets precise_ip to a non-zero bit for PEBS events,
>>> perf will _always_ generate an adaptive record, even if the guest requested a
>>> basic record.  Ugh, and KVM will always generate adaptive records even if the
>>> guest doesn't support them.  This is all completely broken.  It probably kinda
>>> sorta works because the Basic info is always stored in the record, and generating
>>> more info requires a non-zero MSR_PEBS_DATA_CFG, but ugh.
>>
>> Yep, it works at least on machines with both adaptive and pebs_full features.
> 
> *AND* if the guest uses PEBS exactly the same way that the Linux kernel uses PEBS.
> 
>> Mingwei or others are encouraged to construct use cases in KUT::pmu_pebs.flat
>> that violate guest-pebs rules (e.g., leak host state), as we all recognize
>> that testing is the right way to condemn legacy code, not just lengthy emails.
> 
> *sigh*
> 
> diff --git a/x86/pmu_pebs.c b/x86/pmu_pebs.c
> index f7b52b90..43e7a207 100644
> --- a/x86/pmu_pebs.c
> +++ b/x86/pmu_pebs.c
> @@ -212,8 +212,12 @@ static void pebs_enable(u64 bitmask, u64 pebs_data_cfg)
>          u64 baseline_extra_ctrl = 0, fixed_ctr_ctrl = 0;
>          unsigned int idx;
>   
> -       if (has_baseline)
> -               wrmsr(MSR_PEBS_DATA_CFG, pebs_data_cfg);
> +       if (has_baseline) {
> +               if (pebs_data_cfg)
> +                       wrmsr(MSR_PEBS_DATA_CFG, pebs_data_cfg);
> +               else
> +                       wrmsr(MSR_PEBS_DATA_CFG, 0xf);
> +       }
>   
>          ds = (struct debug_store *)ds_bufer;
>          ds->pebs_index = ds->pebs_buffer_base = (unsigned long)pebs_buffer;
> @@ -224,7 +228,7 @@ static void pebs_enable(u64 bitmask, u64 pebs_data_cfg)
>          for (idx = 0; idx < pmu.nr_fixed_counters; idx++) {
>                  if (!(BIT_ULL(FIXED_CNT_INDEX + idx) & bitmask))
>                          continue;
> -               if (has_baseline)
> +               if (has_baseline && pebs_data_cfg)
>                          baseline_extra_ctrl = BIT(FIXED_CNT_INDEX + idx * 4);
>                  wrmsr(MSR_PERF_FIXED_CTRx(idx), ctr_start_val);
>                  fixed_ctr_ctrl |= (0xbULL << (idx * 4) | baseline_extra_ctrl);
> @@ -235,7 +239,7 @@ static void pebs_enable(u64 bitmask, u64 pebs_data_cfg)
>          for (idx = 0; idx < max_nr_gp_events; idx++) {
>                  if (!(BIT_ULL(idx) & bitmask))
>                          continue;
> -               if (has_baseline)
> +               if (has_baseline && pebs_data_cfg)
>                          baseline_extra_ctrl = ICL_EVENTSEL_ADAPTIVE;
>                  wrmsr(MSR_GP_EVENT_SELECTx(idx), EVNTSEL_EN | EVNTSEL_OS | EVNTSEL_USR |
>                                                   intel_arch_events[idx] | baseline_extra_ctrl);
> 
> FAIL: Multiple (0x700000055): PEBS record (written seq 0) is verified (including size, counters and cfg).
> FAIL: The pebs_record_size (488) doesn't match with MSR_PEBS_DATA_CFG (32).
> FAIL: The pebs_data_cfg (0xf) doesn't match with MSR_PEBS_DATA_CFG (0x0).
> FAIL: GP counter 0 (0xfffffffffffe): PEBS record (written seq 0) is verified (including size, counters and cfg).
> FAIL: The pebs_record_size (488) doesn't match with MSR_PEBS_DATA_CFG (32).
> FAIL: The pebs_data_cfg (0xf) doesn't match with MSR_PEBS_DATA_CFG (0x0).
> FAIL: Multiple (0x700000055): PEBS record (written seq 0) is verified (including size, counters and cfg).
> FAIL: The pebs_record_size (488) doesn't match with MSR_PEBS_DATA_CFG (32).
> FAIL: The pebs_data_cfg (0xf) doesn't match with MSR_PEBS_DATA_CFG (0x0).
> 
>>> Oh great, and it gets worse.  intel_pmu_disable_fixed() doesn't clear the upper
>>> bits either, i.e. leaves ICL_FIXED_0_ADAPTIVE set.  Unless I'm misreading the code,
>>> intel_pmu_enable_fixed() effectively doesn't clear ICL_FIXED_0_ADAPTIVE either,
>>> as it only modifies the bit when it wants to set ICL_FIXED_0_ADAPTIVE.
>>>
>>> *sigh*
>>>
>>> I'm _very_ tempted to disable KVM PEBS support for the current PMU, and make it
>>> available only when the so-called passthrough PMU is available[*].  Because I
>>> don't see how this is can possibly be functionally correct, nor do I see a way
>>> to make it functionally correct without a rather large and invasive series.
>>
>> Considering that I've tried the idea myself, I have no inclination towards
>> "passthrough PMU", and I'd like to be able to take the time to review that
>> patchset while we all wait for a clear statement from that perf-core man,
>> who don't really care about virtualization and don't want to lose control
>> of global hardware resources.
>>
>> Before we actually get to that ideal state you want, we have to deal with
>> some intermediate state and face to any users that rely on the current code,
> 
> It's not an ideal state, it's simply the only way I see to get things like adaptive
> PEBS to work safely, reliably, and correctly without taking on an absurd amount of
> complexity.

The upstream guest_pebs feature has always been broken from my perspective,
but unfortunately I didn't get enough time to polish it (slow path, host/guest
multiplexing, live migration support, security defenses), so I'm really glad
that Google is seriously looking into these PMU features.

> 
>> you had urged to merge in a KVM document for vPMU, not sure how far
>> along that part of the work is.
> 
>>> Ouch.  And after chatting with Mingwei, who asked the very good question of
>>> "can this leak host state?", I am pretty sure that yes, this can leak host state.
>>
>> The Basic Info has a tsc field, I suspect it's the host-state-tsc.
> 
> It's not, the CPU offsets it correctly, at least on ICX (I haven't check scaling).

And one more thing to check: the tsc fields from guest Intel PT or Timed LBR
records, if any.

> 
>>> When PERF_CAP_PEBS_BASELINE is enabled for the guest, i.e. when the guest has
>>> access to adaptive records, KVM gives the guest full access to MSR_PEBS_DATA_CFG
>>>
>>> 	pmu->pebs_data_cfg_mask = ~0xff00000full;
>>>
>>> which makes sense in a vacuum, because AFAICT the architecture doesn't allow
>>> exposing a subset of the four adaptive controls.
>>>
>>> GPRs and XMMs are always context switched and thus benign, but IIUC, Memory Info
>>> provides data that might now otherwise be available to the guest, e.g. if host
>>> userspace has disallowed equivalent events via KVM_SET_PMU_EVENT_FILTER.
>>
>> Indeed, KVM_SET_PMU_EVENT_FILTER doesn't work in harmony with
>> guest-pebs, and I believe there is a big problem here, especially with the
>> lack of targeted testing.
>>
>> One reason for this is that we don't use this cockamamie API in our
> 
> Libeling APIs because they aren't useful for _your_ security goals doesn't mean
> you get to ignore their existence when contributing upstream.  New features don't
> necessarilly have to fully support existing capabilities, e.g. I would be a-ok
> making KVM_SET_PMU_EVENT_FILTER mututally exclusive with exposing adapative PEBS
> to the guest.  That way the user can at least know that filtering won't work if
> adapative PEBS is exposed to the guest.

Yes, that should be considered.
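
One possible shape for that, just as an illustration (the helpers named here
are placeholders, not existing KVM code): refuse to install a PMU event filter
once adaptive PEBS has been exposed to the guest, so userspace at least knows
that filtering cannot be enforced in that configuration.

#include <stdbool.h>
#include <errno.h>

struct kvm;				/* opaque stand-ins for the real KVM types */
struct kvm_pmu_event_filter;

/* Placeholder helpers -- stand-ins for whatever KVM would actually use. */
bool kvm_guest_has_adaptive_pebs(struct kvm *kvm);
int kvm_install_pmu_filter(struct kvm *kvm, struct kvm_pmu_event_filter *filter);

/* Illustrative only: make the filter ioctl and adaptive PEBS mutually exclusive. */
static int set_pmu_event_filter(struct kvm *kvm,
				struct kvm_pmu_event_filter *filter)
{
	if (kvm_guest_has_adaptive_pebs(kvm))
		return -EINVAL;

	return kvm_install_pmu_filter(kvm, filter);
}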

> 
>> large-scale production environments, and users of vPMU want to get real
>> runtime information about physical cpus, not just virtualised hardware
>> architecture interfaces.
>>
>>>
>>> And unless I'm missing something, LBRs are a full leak of host state.  Nothing
>>> in the SDM suggests that PEBS records honor MSR intercepts, so unless KVM is
>>> also passing through LBRs, i.e. is context switching all LBR MSRs, the guest can
>>> use PEBS to read host LBRs at will.
>>
>> KVM is also passing through LBRs when guest uses LBR but not at the
>> granularity of vm-exit/entry. I'm not sure if the LBR_EN bit is required
>> to get LBR information via PEBS, also not confirmed whether PEBS-lbr
>> can be enabled at the same time as independent LBR;
>>
>> I recall that PEBS-assist, per cpu-arch, would clean up this part of the
>> record when crossing root/non-root boundaries, or not generate record.
> 
> Nope.  The MSRs definitely leak to the guest.  The only hard part was figuring
> out how to get perf to utilize LBRs without consuming every counter (`perf top`
> was the extent of my knowledge, until now...).
> 
> E.g.
> 
>    perf record -b -e instructions

Consulting "Ian Rogers" is the fastest route.

> 
> and
> 
>    FAIL: PEBS LBR record 0 isn't empty, got from = 'ffffffffc0a00ccb', to = 'ffffffffc0a010af', info = '2'
>    FAIL: PEBS LBR record 1 isn't empty, got from = 'ffffffffc0a010aa', to = 'ffffffffc0a00cb0', info = '6'
>    FAIL: PEBS LBR record 2 isn't empty, got from = 'ffffffffc0a00e06', to = 'ffffffffc0a01090', info = '1'
>    FAIL: PEBS LBR record 3 isn't empty, got from = 'ffffffffc0a00df4', to = 'ffffffffc0a00e00', info = '2'
>    FAIL: PEBS LBR record 4 isn't empty, got from = 'ffffffffc0a00dbc', to = 'ffffffffc0a00de0', info = '1'
>    FAIL: PEBS LBR record 5 isn't empty, got from = 'ffffffffc0a00f63', to = 'ffffffffc0a00db5', info = '1'
>    FAIL: PEBS LBR record 6 isn't empty, got from = 'ffffffffc0903f23', to = 'ffffffffc0a00f61', info = '11'
>    FAIL: PEBS LBR record 7 isn't empty, got from = 'ffffffffc0a00f5c', to = 'ffffffffc0903f10', info = '1'
>    FAIL: PEBS LBR record 8 isn't empty, got from = 'ffffffffc0a00db0', to = 'ffffffffc0a00f55', info = '1'
>    FAIL: PEBS LBR record 9 isn't empty, got from = 'ffffffff8f6b2c23', to = 'ffffffffc0a00da6', info = 'a'
>    FAIL: PEBS LBR record 10 isn't empty, got from = 'ffffffffc0a00da1', to = 'ffffffff8f6b2b60', info = '7'
>    FAIL: PEBS LBR record 11 isn't empty, got from = 'ffffffff8eba1b85', to = 'ffffffffc0a00d9a', info = '6'
>    FAIL: PEBS LBR record 12 isn't empty, got from = 'ffffffff8eba1b8c', to = 'ffffffff8eba1b3f', info = '1'
>    FAIL: PEBS LBR record 13 isn't empty, got from = 'ffffffff8eba1b2d', to = 'ffffffff8eba1b87', info = 'e'
>    FAIL: PEBS LBR record 14 isn't empty, got from = 'ffffffff8eba1aff', to = 'ffffffff8eba1b18', info = '7'
>    FAIL: PEBS LBR record 15 isn't empty, got from = 'ffffffff8eb996e0', to = 'ffffffff8eba1aeb', info = '2'
>    FAIL: PEBS LBR record 16 isn't empty, got from = 'ffffffff8eb9963a', to = 'ffffffff8eb996cc', info = '1'
>    FAIL: PEBS LBR record 17 isn't empty, got from = 'ffffffff8eb995ef', to = 'ffffffff8eb9962f', info = '1'
>    FAIL: PEBS LBR record 18 isn't empty, got from = 'ffffffff8f6b30a1', to = 'ffffffff8eb995df', info = '4'
>    FAIL: PEBS LBR record 19 isn't empty, got from = 'ffffffff8eb995da', to = 'ffffffff8f6b3070', info = '2'
>    FAIL: PEBS LBR record 20 isn't empty, got from = 'ffffffff8eba1ae6', to = 'ffffffff8eb995c0', info = '6'
>    FAIL: PEBS LBR record 21 isn't empty, got from = 'ffffffffc0a00d95', to = 'ffffffff8eba1a90', info = '2'
>    FAIL: PEBS LBR record 22 isn't empty, got from = 'ffffffff8eb69135', to = 'ffffffffc0a00d89', info = '2'
>    FAIL: PEBS LBR record 23 isn't empty, got from = 'ffffffff8eb690d6', to = 'ffffffff8eb69104', info = '2'
>    FAIL: PEBS LBR record 24 isn't empty, got from = 'ffffffff8eb69102', to = 'ffffffff8eb690c3', info = '1'
>    FAIL: PEBS LBR record 25 isn't empty, got from = 'ffffffff8eb6f442', to = 'ffffffff8eb69100', info = '2'
>    FAIL: PEBS LBR record 26 isn't empty, got from = 'ffffffff8eb6f3f3', to = 'ffffffff8eb6f429', info = '5'
>    FAIL: PEBS LBR record 27 isn't empty, got from = 'ffffffff8eb6f3a6', to = 'ffffffff8eb6f3c3', info = '2'
>    FAIL: PEBS LBR record 28 isn't empty, got from = 'ffffffff8eb690fb', to = 'ffffffff8eb6f390', info = '2'
>    FAIL: PEBS LBR record 29 isn't empty, got from = 'ffffffff8eb690c1', to = 'ffffffff8eb690d8', info = '2'
>    FAIL: PEBS LBR record 30 isn't empty, got from = 'ffffffff8eb6907b', to = 'ffffffff8eb690b1', info = '1'
>    FAIL: PEBS LBR record 31 isn't empty, got from = 'ffffffff8eb690af', to = 'ffffffff8eb6906e', info = '1'

Guilty Guilty Guilty

> 
>>> diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
>>> index 41a4533f9989..a2f827fa0ca1 100644
>>> --- a/arch/x86/kvm/vmx/capabilities.h
>>> +++ b/arch/x86/kvm/vmx/capabilities.h
>>> @@ -392,7 +392,7 @@ static inline bool vmx_pt_mode_is_host_guest(void)
>>>    static inline bool vmx_pebs_supported(void)
>>>    {
>>> -       return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept;
>>> +       return false;
>>
>> As you know, user-space VMM may disable guest-pebs by filtering out the
>> MSR_IA32_PERF_CAPABILITIE.PERF_CAP_PEBS_FORMAT or CPUID.PDCM.
> 
> Relying on userspace to fudge around KVM bugs is not acceptable for upstream.
> PMU virtualization is already a bit dicey in that KVM relies on userspace to
> filter out sensitive events, but leaking host addresses and failing to correctly
> virtualize the architecture is an entirely different level of wrong.

The leak is real, and I would do the same thing (drop ADAPTIVE) on the kernel side.
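
A minimal sketch of what dropping it could look like, assuming the goal is
simply to stop advertising baseline/adaptive PEBS to guests. The bit position
follows the IA32_PERF_CAPABILITIES layout and is defined locally here rather
than pulled from the real headers.

/*
 * Minimal sketch: strip the baseline (adaptive) PEBS capability from the
 * IA32_PERF_CAPABILITIES value advertised to the guest, so the guest
 * never programs MSR_PEBS_DATA_CFG or the Adaptive_Record bits at all.
 */
#define PERF_CAP_PEBS_BASELINE	(1ULL << 14)	/* IA32_PERF_CAPABILITIES[14] */

static unsigned long long sanitize_guest_perf_capabilities(unsigned long long host_caps)
{
	return host_caps & ~PERF_CAP_PEBS_BASELINE;
}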

> 
>> In the end, if our great KVM maintainers insist on doing this,
>> there is obviously nothing I can do about it.
> 
> Bullshit.  There is plenty you could have done to prevent this.  It took me what,
> 6 hours total?  To (a) realize the code is buggy, (b) type up an email, and (c)
> modify tests to confirm the bugs.  And at least half of that time was due to me
> floundering around trying to generate LBR records because I know nothing about
> the perf tool.
> 
> If you were more interested in ensuring KVM is maintainble, secure, and functionally
> correct, we wouldn't be where we are today.

I was actually really trying to communicate my todo list to my successors
in this direction, but it took a long time for them to warm up.

Very pleased to know that you have so much manpower invested in vPMU,
which was unimaginable a few years ago when I was the only one doing
these things.

We have benefited from your efforts here or there, and hope to still have
the opportunity to contribute back. Thank you, Sean.
diff mbox series

Patch

diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index edb89b51b383..fb4ef2da3e32 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -420,11 +420,11 @@  static void reprogram_counter(struct kvm_pmc *pmc)
 	if (pmc_is_fixed(pmc)) {
 		fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl,
 						  pmc->idx - INTEL_PMC_IDX_FIXED);
-		if (fixed_ctr_ctrl & 0x1)
+		if (fixed_ctr_ctrl & INTEL_FIXED_0_KERNEL)
 			eventsel |= ARCH_PERFMON_EVENTSEL_OS;
-		if (fixed_ctr_ctrl & 0x2)
+		if (fixed_ctr_ctrl & INTEL_FIXED_0_USER)
 			eventsel |= ARCH_PERFMON_EVENTSEL_USR;
-		if (fixed_ctr_ctrl & 0x8)
+		if (fixed_ctr_ctrl & INTEL_FIXED_0_ENABLE_PMI)
 			eventsel |= ARCH_PERFMON_EVENTSEL_INT;
 		new_config = (u64)fixed_ctr_ctrl;
 	}
@@ -749,8 +749,8 @@  static inline bool cpl_is_matched(struct kvm_pmc *pmc)
 	} else {
 		config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl,
 					  pmc->idx - INTEL_PMC_IDX_FIXED);
-		select_os = config & 0x1;
-		select_user = config & 0x2;
+		select_os = config & INTEL_FIXED_0_KERNEL;
+		select_user = config & INTEL_FIXED_0_USER;
 	}
 
 	return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? select_os : select_user;
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 7d9ba301c090..ffda2ecc3a22 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -12,7 +12,8 @@ 
 					  MSR_IA32_MISC_ENABLE_BTS_UNAVAIL)
 
 /* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */
-#define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf)
+#define fixed_ctrl_field(ctrl_reg, idx) \
+	(((ctrl_reg) >> ((idx) * INTEL_FIXED_BITS_STRIDE)) & INTEL_FIXED_BITS_MASK)
 
 #define VMWARE_BACKDOOR_PMC_HOST_TSC		0x10000
 #define VMWARE_BACKDOOR_PMC_REAL_TIME		0x10001
@@ -165,7 +166,8 @@  static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc)
 
 	if (pmc_is_fixed(pmc))
 		return fixed_ctrl_field(pmu->fixed_ctr_ctrl,
-					pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3;
+					pmc->idx - INTEL_PMC_IDX_FIXED) &
+					(INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER);
 
 	return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE;
 }
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index f2efa0bf7ae8..b0ac55891cb7 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -548,8 +548,13 @@  static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
 		setup_fixed_pmc_eventsel(pmu);
 	}
 
-	for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
-		pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4));
+	for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
+		pmu->fixed_ctr_ctrl_mask &=
+			 ~intel_fixed_bits_by_idx(i,
+						  INTEL_FIXED_0_KERNEL |
+						  INTEL_FIXED_0_USER |
+						  INTEL_FIXED_0_ENABLE_PMI);
+	}
 	counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) |
 		(((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED));
 	pmu->global_ctrl_mask = counter_mask;
@@ -595,7 +600,7 @@  static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
 			pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE;
 			for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
 				pmu->fixed_ctr_ctrl_mask &=
-					~(1ULL << (INTEL_PMC_IDX_FIXED + i * 4));
+					~intel_fixed_bits_by_idx(i, ICL_FIXED_0_ADAPTIVE);
 			}
 			pmu->pebs_data_cfg_mask = ~0xff00000full;
 		} else {