diff mbox

[v2,15/30] xen/x86: Improvements to in-hypervisor cpuid sanity checks

Message ID 1454679743-18133-16-git-send-email-andrew.cooper3@citrix.com (mailing list archive)
State New, archived
Headers show

Commit Message

Andrew Cooper Feb. 5, 2016, 1:42 p.m. UTC
* Use the boot-generated pv and hvm featureset to clamp the visible features,
  rather than picking and choosing at individual features.  This subsumes the
  static feature manipulation.
* More use of compiler-visible &'s and |'s, rather than clear,set bit.
* Remove logic which hides PSE36 out of PAE mode.  This is not how real
  hardware behaves.
* Improve logic to set OSXSAVE.  The bit is cleared by virtue of not being
  valid in a featureset, and should be a strict fast-forward from %cr4.
  Provide a very big health warning for OXSAVE for PV guests, which is
  non-architectural.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>

v2:
 * Reinstate some of the dynamic checks for now.  Future development work will
   instate a complete per-domain policy.
 * Fix OSXSAVE handling for PV guests.
---
 xen/arch/x86/hvm/hvm.c |  56 +++++++++---------
 xen/arch/x86/traps.c   | 151 ++++++++++++++++++++++++-------------------------
 2 files changed, 100 insertions(+), 107 deletions(-)

Comments

Jan Beulich Feb. 15, 2016, 3:43 p.m. UTC | #1
>>> On 05.02.16 at 14:42, <andrew.cooper3@citrix.com> wrote:
> @@ -4617,50 +4618,39 @@ void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
>          /* Fix up VLAPIC details. */
>          *ebx &= 0x00FFFFFFu;
>          *ebx |= (v->vcpu_id * 2) << 24;
> +
> +        *ecx &= hvm_featureset[FEATURESET_1c];
> +        *edx &= hvm_featureset[FEATURESET_1d];

Looks like I've overlooked an issue in patch 11, which becomes
apparent here: How can you use a domain-independent featureset
here, when features vary between HAP and shadow mode guests?
I.e. in the earlier patch I suppose you need to calculate two
hvm_*_featureset[]s, with the HAP one perhaps empty when
!hvm_funcs.hap_supported.

> @@ -4694,6 +4687,9 @@ void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
>          break;
>  
>      case 0x80000001:
> +        *ecx &= hvm_featureset[FEATURESET_e1c];
> +        *edx &= hvm_featureset[FEATURESET_e1d];

Looking at the uses here, wouldn't it be better (less ambiguous) for
these to be named FEATURESET_x1c and FEATURESET_x1d
respectively, since x is not a hex digit, but e is?

> @@ -841,69 +842,70 @@ void pv_cpuid(struct cpu_user_regs *regs)
>      else
>          cpuid_count(leaf, subleaf, &a, &b, &c, &d);
>  
> -    if ( (leaf & 0x7fffffff) == 0x00000001 )
> -    {
> -        /* Modify Feature Information. */
> -        if ( !cpu_has_apic )
> -            __clear_bit(X86_FEATURE_APIC, &d);
> -
> -        if ( !is_pvh_domain(currd) )
> -        {
> -            __clear_bit(X86_FEATURE_PSE, &d);
> -            __clear_bit(X86_FEATURE_PGE, &d);
> -            __clear_bit(X86_FEATURE_PSE36, &d);
> -            __clear_bit(X86_FEATURE_VME, &d);
> -        }
> -    }
> -
>      switch ( leaf )
>      {
>      case 0x00000001:
> -        /* Modify Feature Information. */
> -        if ( !cpu_has_sep )
> -            __clear_bit(X86_FEATURE_SEP, &d);
> -        __clear_bit(X86_FEATURE_DS, &d);
> -        __clear_bit(X86_FEATURE_ACC, &d);
> -        __clear_bit(X86_FEATURE_PBE, &d);
> -        if ( is_pvh_domain(currd) )
> -            __clear_bit(X86_FEATURE_MTRR, &d);
> -
> -        __clear_bit(X86_FEATURE_DTES64 % 32, &c);
> -        __clear_bit(X86_FEATURE_MWAIT % 32, &c);
> -        __clear_bit(X86_FEATURE_DSCPL % 32, &c);
> -        __clear_bit(X86_FEATURE_VMXE % 32, &c);
> -        __clear_bit(X86_FEATURE_SMXE % 32, &c);
> -        __clear_bit(X86_FEATURE_TM2 % 32, &c);
> +        c &= pv_featureset[FEATURESET_1c];
> +        d &= pv_featureset[FEATURESET_1d];
> +
>          if ( is_pv_32bit_domain(currd) )
> -            __clear_bit(X86_FEATURE_CX16 % 32, &c);
> -        __clear_bit(X86_FEATURE_XTPR % 32, &c);
> -        __clear_bit(X86_FEATURE_PDCM % 32, &c);
> -        __clear_bit(X86_FEATURE_PCID % 32, &c);
> -        __clear_bit(X86_FEATURE_DCA % 32, &c);
> -        if ( !cpu_has_xsave )
> -        {
> -            __clear_bit(X86_FEATURE_XSAVE % 32, &c);
> -            __clear_bit(X86_FEATURE_AVX % 32, &c);
> -        }
> -        if ( !cpu_has_apic )
> -           __clear_bit(X86_FEATURE_X2APIC % 32, &c);
> -        __set_bit(X86_FEATURE_HYPERVISOR % 32, &c);
> +            c &= ~cpufeat_mask(X86_FEATURE_CX16);

Shouldn't this be taken care of by clearing LM and then applying
your dependencies correction? Or is that meant to only get
enforced later? Is it maybe still worth having both pv64_featureset[]
and pv32_featureset[]?

> +        /*
> +         * !!! Warning - OSXSAVE handling for PV guests is non-architectural !!!
> +         *
> +         * Architecturally, the correct code here is simply:
> +         *
> +         *   if ( curr->arch.pv_vcpu.ctrlreg[4] & X86_CR4_OSXSAVE )
> +         *       c |= cpufeat_mask(X86_FEATURE_OSXSAVE);
> +         *
> +         * However because of bugs in Xen (before c/s bd19080b, Nov 2010, the
> +         * XSAVE cpuid flag leaked into guests despite the feature not being
> +         * avilable for use), buggy workarounds where introduced to Linux (c/s
> +         * 947ccf9c, also Nov 2010) which relied on the fact that Xen also
> +         * incorrectly leaked OSXSAVE into the guest.
> +         *
> +         * Furthermore, providing architectural OSXSAVE behaviour to a many
> +         * Linux PV guests triggered a further kernel bug when the fpu code
> +         * observes that XSAVEOPT is available, assumes that xsave state had
> +         * been set up for the task, and follows a wild pointer.
> +         *
> +         * Therefore, the leaking of Xen's OSXSAVE setting has become a
> +         * defacto part of the PV ABI and can't reasonably be corrected.
> +         *
> +         * The following situations and logic now applies:
> +         *
> +         * - Hardware without CPUID faulting support and native CPUID:
> +         *    There is nothing Xen can do here.  The hosts XSAVE flag will
> +         *    leak through and Xen's OSXSAVE choice will leak through.
> +         *
> +         *    In the case that the guest kernel has not set up OSXSAVE, only
> +         *    SSE will be set in xcr0, and guest userspace can't do too much
> +         *    damage itself.
> +         *
> +         * - Enlightened CPUID or CPUID faulting available:
> +         *    Xen can fully control what is seen here.  Guest kernels need to
> +         *    see the leaked OSXSAVE, but guest userspace is given
> +         *    architectural behaviour, to reflect the guest kernels
> +         *    intentions.
> +         */

Well, I think all of this is too harsh: In a hypervisor-guest
relationship of PV kind I don't view it as entirely wrong to let the
guest kernel know of whether the hypervisor enabled XSAVE
support by inspecting the OSXSAVE bit. From guest kernel
perspective, the hypervisor is kind of the OS. While I won't
make weakening the above a little a requirement, I'd appreciate
you doing so.

> +    case 0x80000007:
> +        d &= pv_featureset[FEATURESET_e7d];
> +        break;

By not clearing eax and ebx (not sure about ecx) here you would
again expose flags to guests without proper white listing.

> +    case 0x80000008:
> +        b &= pv_featureset[FEATURESET_e8b];
>          break;

Same here for ecx and edx and perhaps the upper 8 bits of eax.

Jan
Andrew Cooper Feb. 15, 2016, 5:12 p.m. UTC | #2
On 15/02/16 15:43, Jan Beulich wrote:
>>>> On 05.02.16 at 14:42, <andrew.cooper3@citrix.com> wrote:
>> @@ -4617,50 +4618,39 @@ void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
>>          /* Fix up VLAPIC details. */
>>          *ebx &= 0x00FFFFFFu;
>>          *ebx |= (v->vcpu_id * 2) << 24;
>> +
>> +        *ecx &= hvm_featureset[FEATURESET_1c];
>> +        *edx &= hvm_featureset[FEATURESET_1d];
> Looks like I've overlooked an issue in patch 11, which becomes
> apparent here: How can you use a domain-independent featureset
> here, when features vary between HAP and shadow mode guests?
> I.e. in the earlier patch I suppose you need to calculate two
> hvm_*_featureset[]s, with the HAP one perhaps empty when
> !hvm_funcs.hap_supported.

Their use here is a halfway house between nothing and the planned full
per-domain policies.

In this case, the "don't expose $X to a non-hap domain" checks have been
retained, to cover the difference.

>
>> @@ -4694,6 +4687,9 @@ void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
>>          break;
>>  
>>      case 0x80000001:
>> +        *ecx &= hvm_featureset[FEATURESET_e1c];
>> +        *edx &= hvm_featureset[FEATURESET_e1d];
> Looking at the uses here, wouldn't it be better (less ambiguous) for
> these to be named FEATURESET_x1c and FEATURESET_x1d
> respectively, since x is not a hex digit, but e is?

I originally chose e because these are commonly known as the extended
cpuid leaves.  'e' being a hex digit is why the number is specified as
an upper case hex number, as shown with FEATURESET_Da1.

I suppose x could work here, but I don't see being less ambiguous. 
(Perhaps I am clouded by having this syntax firmly embedded in my
expectation).

>
>> @@ -841,69 +842,70 @@ void pv_cpuid(struct cpu_user_regs *regs)
>>      else
>>          cpuid_count(leaf, subleaf, &a, &b, &c, &d);
>>  
>> -    if ( (leaf & 0x7fffffff) == 0x00000001 )
>> -    {
>> -        /* Modify Feature Information. */
>> -        if ( !cpu_has_apic )
>> -            __clear_bit(X86_FEATURE_APIC, &d);
>> -
>> -        if ( !is_pvh_domain(currd) )
>> -        {
>> -            __clear_bit(X86_FEATURE_PSE, &d);
>> -            __clear_bit(X86_FEATURE_PGE, &d);
>> -            __clear_bit(X86_FEATURE_PSE36, &d);
>> -            __clear_bit(X86_FEATURE_VME, &d);
>> -        }
>> -    }
>> -
>>      switch ( leaf )
>>      {
>>      case 0x00000001:
>> -        /* Modify Feature Information. */
>> -        if ( !cpu_has_sep )
>> -            __clear_bit(X86_FEATURE_SEP, &d);
>> -        __clear_bit(X86_FEATURE_DS, &d);
>> -        __clear_bit(X86_FEATURE_ACC, &d);
>> -        __clear_bit(X86_FEATURE_PBE, &d);
>> -        if ( is_pvh_domain(currd) )
>> -            __clear_bit(X86_FEATURE_MTRR, &d);
>> -
>> -        __clear_bit(X86_FEATURE_DTES64 % 32, &c);
>> -        __clear_bit(X86_FEATURE_MWAIT % 32, &c);
>> -        __clear_bit(X86_FEATURE_DSCPL % 32, &c);
>> -        __clear_bit(X86_FEATURE_VMXE % 32, &c);
>> -        __clear_bit(X86_FEATURE_SMXE % 32, &c);
>> -        __clear_bit(X86_FEATURE_TM2 % 32, &c);
>> +        c &= pv_featureset[FEATURESET_1c];
>> +        d &= pv_featureset[FEATURESET_1d];
>> +
>>          if ( is_pv_32bit_domain(currd) )
>> -            __clear_bit(X86_FEATURE_CX16 % 32, &c);
>> -        __clear_bit(X86_FEATURE_XTPR % 32, &c);
>> -        __clear_bit(X86_FEATURE_PDCM % 32, &c);
>> -        __clear_bit(X86_FEATURE_PCID % 32, &c);
>> -        __clear_bit(X86_FEATURE_DCA % 32, &c);
>> -        if ( !cpu_has_xsave )
>> -        {
>> -            __clear_bit(X86_FEATURE_XSAVE % 32, &c);
>> -            __clear_bit(X86_FEATURE_AVX % 32, &c);
>> -        }
>> -        if ( !cpu_has_apic )
>> -           __clear_bit(X86_FEATURE_X2APIC % 32, &c);
>> -        __set_bit(X86_FEATURE_HYPERVISOR % 32, &c);
>> +            c &= ~cpufeat_mask(X86_FEATURE_CX16);
> Shouldn't this be taken care of by clearing LM and then applying
> your dependencies correction? Or is that meant to only get
> enforced later? Is it maybe still worth having both pv64_featureset[]
> and pv32_featureset[]?

Again, this is just used as a halfway house.  Longterm, I plan to read
the featureset straight from the per-domain policy, which won't be
stored as an adhoc array.

>
>> +        /*
>> +         * !!! Warning - OSXSAVE handling for PV guests is non-architectural !!!
>> +         *
>> +         * Architecturally, the correct code here is simply:
>> +         *
>> +         *   if ( curr->arch.pv_vcpu.ctrlreg[4] & X86_CR4_OSXSAVE )
>> +         *       c |= cpufeat_mask(X86_FEATURE_OSXSAVE);
>> +         *
>> +         * However because of bugs in Xen (before c/s bd19080b, Nov 2010, the
>> +         * XSAVE cpuid flag leaked into guests despite the feature not being
>> +         * avilable for use), buggy workarounds where introduced to Linux (c/s
>> +         * 947ccf9c, also Nov 2010) which relied on the fact that Xen also
>> +         * incorrectly leaked OSXSAVE into the guest.
>> +         *
>> +         * Furthermore, providing architectural OSXSAVE behaviour to a many
>> +         * Linux PV guests triggered a further kernel bug when the fpu code
>> +         * observes that XSAVEOPT is available, assumes that xsave state had
>> +         * been set up for the task, and follows a wild pointer.
>> +         *
>> +         * Therefore, the leaking of Xen's OSXSAVE setting has become a
>> +         * defacto part of the PV ABI and can't reasonably be corrected.
>> +         *
>> +         * The following situations and logic now applies:
>> +         *
>> +         * - Hardware without CPUID faulting support and native CPUID:
>> +         *    There is nothing Xen can do here.  The hosts XSAVE flag will
>> +         *    leak through and Xen's OSXSAVE choice will leak through.
>> +         *
>> +         *    In the case that the guest kernel has not set up OSXSAVE, only
>> +         *    SSE will be set in xcr0, and guest userspace can't do too much
>> +         *    damage itself.
>> +         *
>> +         * - Enlightened CPUID or CPUID faulting available:
>> +         *    Xen can fully control what is seen here.  Guest kernels need to
>> +         *    see the leaked OSXSAVE, but guest userspace is given
>> +         *    architectural behaviour, to reflect the guest kernels
>> +         *    intentions.
>> +         */
> Well, I think all of this is too harsh: In a hypervisor-guest
> relationship of PV kind I don't view it as entirely wrong to let the
> guest kernel know of whether the hypervisor enabled XSAVE
> support by inspecting the OSXSAVE bit. From guest kernel
> perspective, the hypervisor is kind of the OS.

Xen shadows the guests %cr4.  That alone means that the emulated cpuid
should have matched.

If you are going for the OS argument, that means "Xen has XSAVE turned
on and the guest can the provided features" and not "Xen supports the
guest configuring its own XCR0", which is what it is taken to mean.  In
both of these cases, deviating from the architectural behaviour confuses
the situation, and adds adds needless divergence from the native
implementation in guests.

I admit that all of this stems from the complete fiasco which was the
original XSAVE support, but all of it would be unnecessary had buggy
bugfixes not been stack up on each other.

>  While I won't
> make weakening the above a little a requirement, I'd appreciate
> you doing so.

I don't which to be deliberately over the top, but I really don't think
I am in this case.  None of this should be necessary.

>
>> +    case 0x80000007:
>> +        d &= pv_featureset[FEATURESET_e7d];
>> +        break;
> By not clearing eax and ebx (not sure about ecx) here you would
> again expose flags to guests without proper white listing.
>
>> +    case 0x80000008:
>> +        b &= pv_featureset[FEATURESET_e8b];
>>          break;
> Same here for ecx and edx and perhaps the upper 8 bits of eax.

Both of these would be changes to how these things are currently
handled, whereby a guest gets to see whatever the toolstack managed to
find in said leaf.  I was hoping to put off some of these decisions, but
they probably need making now.  On the PV side they definitely can't be
fully hidden, as these leaves are not maskable.

~Andrew
Jan Beulich Feb. 16, 2016, 10:06 a.m. UTC | #3
>>> On 15.02.16 at 18:12, <andrew.cooper3@citrix.com> wrote:
> On 15/02/16 15:43, Jan Beulich wrote:
>>>>> On 05.02.16 at 14:42, <andrew.cooper3@citrix.com> wrote:
>>> @@ -4617,50 +4618,39 @@ void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
>>>          /* Fix up VLAPIC details. */
>>>          *ebx &= 0x00FFFFFFu;
>>>          *ebx |= (v->vcpu_id * 2) << 24;
>>> +
>>> +        *ecx &= hvm_featureset[FEATURESET_1c];
>>> +        *edx &= hvm_featureset[FEATURESET_1d];
>> Looks like I've overlooked an issue in patch 11, which becomes
>> apparent here: How can you use a domain-independent featureset
>> here, when features vary between HAP and shadow mode guests?
>> I.e. in the earlier patch I suppose you need to calculate two
>> hvm_*_featureset[]s, with the HAP one perhaps empty when
>> !hvm_funcs.hap_supported.
> 
> Their use here is a halfway house between nothing and the planned full
> per-domain policies.
> 
> In this case, the "don't expose $X to a non-hap domain" checks have been
> retained, to cover the difference.

Well, doesn't it seem to you that doing only half of the HAP/shadow
separation is odd/confusing? I.e. could I talk you into not doing any
such separation (enforcing the non-HAP overrides as is done now)
or finishing the separation to become visible/usable here?

>>> @@ -4694,6 +4687,9 @@ void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
>>>          break;
>>>  
>>>      case 0x80000001:
>>> +        *ecx &= hvm_featureset[FEATURESET_e1c];
>>> +        *edx &= hvm_featureset[FEATURESET_e1d];
>> Looking at the uses here, wouldn't it be better (less ambiguous) for
>> these to be named FEATURESET_x1c and FEATURESET_x1d
>> respectively, since x is not a hex digit, but e is?
> 
> I originally chose e because these are commonly known as the extended
> cpuid leaves.  'e' being a hex digit is why the number is specified as
> an upper case hex number, as shown with FEATURESET_Da1.
> 
> I suppose x could work here, but I don't see being less ambiguous. 
> (Perhaps I am clouded by having this syntax firmly embedded in my
> expectation).

Okay then, I just wanted to point out that the current naming may
be confusing to others (and to a degree it is to me).

>>>      switch ( leaf )
>>>      {
>>>      case 0x00000001:
>>> -        /* Modify Feature Information. */
>>> -        if ( !cpu_has_sep )
>>> -            __clear_bit(X86_FEATURE_SEP, &d);
>>> -        __clear_bit(X86_FEATURE_DS, &d);
>>> -        __clear_bit(X86_FEATURE_ACC, &d);
>>> -        __clear_bit(X86_FEATURE_PBE, &d);
>>> -        if ( is_pvh_domain(currd) )
>>> -            __clear_bit(X86_FEATURE_MTRR, &d);
>>> -
>>> -        __clear_bit(X86_FEATURE_DTES64 % 32, &c);
>>> -        __clear_bit(X86_FEATURE_MWAIT % 32, &c);
>>> -        __clear_bit(X86_FEATURE_DSCPL % 32, &c);
>>> -        __clear_bit(X86_FEATURE_VMXE % 32, &c);
>>> -        __clear_bit(X86_FEATURE_SMXE % 32, &c);
>>> -        __clear_bit(X86_FEATURE_TM2 % 32, &c);
>>> +        c &= pv_featureset[FEATURESET_1c];
>>> +        d &= pv_featureset[FEATURESET_1d];
>>> +
>>>          if ( is_pv_32bit_domain(currd) )
>>> -            __clear_bit(X86_FEATURE_CX16 % 32, &c);
>>> -        __clear_bit(X86_FEATURE_XTPR % 32, &c);
>>> -        __clear_bit(X86_FEATURE_PDCM % 32, &c);
>>> -        __clear_bit(X86_FEATURE_PCID % 32, &c);
>>> -        __clear_bit(X86_FEATURE_DCA % 32, &c);
>>> -        if ( !cpu_has_xsave )
>>> -        {
>>> -            __clear_bit(X86_FEATURE_XSAVE % 32, &c);
>>> -            __clear_bit(X86_FEATURE_AVX % 32, &c);
>>> -        }
>>> -        if ( !cpu_has_apic )
>>> -           __clear_bit(X86_FEATURE_X2APIC % 32, &c);
>>> -        __set_bit(X86_FEATURE_HYPERVISOR % 32, &c);
>>> +            c &= ~cpufeat_mask(X86_FEATURE_CX16);
>> Shouldn't this be taken care of by clearing LM and then applying
>> your dependencies correction? Or is that meant to only get
>> enforced later? Is it maybe still worth having both pv64_featureset[]
>> and pv32_featureset[]?
> 
> Again, this is just used as a halfway house.  Longterm, I plan to read
> the featureset straight from the per-domain policy, which won't be
> stored as an adhoc array.

Okay, that's certainly fine in this case I suppose.

>>> +    case 0x80000007:
>>> +        d &= pv_featureset[FEATURESET_e7d];
>>> +        break;
>> By not clearing eax and ebx (not sure about ecx) here you would
>> again expose flags to guests without proper white listing.
>>
>>> +    case 0x80000008:
>>> +        b &= pv_featureset[FEATURESET_e8b];
>>>          break;
>> Same here for ecx and edx and perhaps the upper 8 bits of eax.
> 
> Both of these would be changes to how these things are currently
> handled, whereby a guest gets to see whatever the toolstack managed to
> find in said leaf.  I was hoping to put off some of these decisions, but
> they probably need making now.  On the PV side they definitely can't be
> fully hidden, as these leaves are not maskable.

Right, but many are meaningful primarily to kernels, and there we
can hide them.

Since you're switching from black to white listing here, I also think
we need a default label alongside the "unsupported" one here.
Similarly I would think XSTATE sub-leaves beyond 63 need hiding
now.

Jan
Andrew Cooper Feb. 17, 2016, 10:43 a.m. UTC | #4
On 16/02/16 10:06, Jan Beulich wrote:
>>>> On 15.02.16 at 18:12, <andrew.cooper3@citrix.com> wrote:
>> On 15/02/16 15:43, Jan Beulich wrote:
>>>>>> On 05.02.16 at 14:42, <andrew.cooper3@citrix.com> wrote:
>>>> @@ -4617,50 +4618,39 @@ void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
>>>>          /* Fix up VLAPIC details. */
>>>>          *ebx &= 0x00FFFFFFu;
>>>>          *ebx |= (v->vcpu_id * 2) << 24;
>>>> +
>>>> +        *ecx &= hvm_featureset[FEATURESET_1c];
>>>> +        *edx &= hvm_featureset[FEATURESET_1d];
>>> Looks like I've overlooked an issue in patch 11, which becomes
>>> apparent here: How can you use a domain-independent featureset
>>> here, when features vary between HAP and shadow mode guests?
>>> I.e. in the earlier patch I suppose you need to calculate two
>>> hvm_*_featureset[]s, with the HAP one perhaps empty when
>>> !hvm_funcs.hap_supported.
>> Their use here is a halfway house between nothing and the planned full
>> per-domain policies.
>>
>> In this case, the "don't expose $X to a non-hap domain" checks have been
>> retained, to cover the difference.
> Well, doesn't it seem to you that doing only half of the HAP/shadow
> separation is odd/confusing? I.e. could I talk you into not doing any
> such separation (enforcing the non-HAP overrides as is done now)
> or finishing the separation to become visible/usable here?

The HAP/shadow distinction is needed in the toolstack to account for the
hap=<bool> option.

The distinction will disappear when per-domain policies are introduced. 
If you notice, the distinction is private to the data generated by the
autogen script, and does not form a part of any API/ABI.  The sysctl
only has a single hvm featureset.

>>>> +    case 0x80000007:
>>>> +        d &= pv_featureset[FEATURESET_e7d];
>>>> +        break;
>>> By not clearing eax and ebx (not sure about ecx) here you would
>>> again expose flags to guests without proper white listing.
>>>
>>>> +    case 0x80000008:
>>>> +        b &= pv_featureset[FEATURESET_e8b];
>>>>          break;
>>> Same here for ecx and edx and perhaps the upper 8 bits of eax.
>> Both of these would be changes to how these things are currently
>> handled, whereby a guest gets to see whatever the toolstack managed to
>> find in said leaf.  I was hoping to put off some of these decisions, but
>> they probably need making now.  On the PV side they definitely can't be
>> fully hidden, as these leaves are not maskable.
> Right, but many are meaningful primarily to kernels, and there we
> can hide them.
>
> Since you're switching from black to white listing here, I also think
> we need a default label alongside the "unsupported" one here.
> Similarly I would think XSTATE sub-leaves beyond 63 need hiding
> now.

I would prefer not to do that now.  It is conflated with the future
work, deliberately deferred to make this work a manageable size.

At the moment, the toolstack won't ever generate subleaves beyond 63.

~Andrew
Jan Beulich Feb. 17, 2016, 10:55 a.m. UTC | #5
>>> On 17.02.16 at 11:43, <andrew.cooper3@citrix.com> wrote:
> On 16/02/16 10:06, Jan Beulich wrote:
>>>>> On 15.02.16 at 18:12, <andrew.cooper3@citrix.com> wrote:
>>> On 15/02/16 15:43, Jan Beulich wrote:
>>>>>>> On 05.02.16 at 14:42, <andrew.cooper3@citrix.com> wrote:
>>>>> @@ -4617,50 +4618,39 @@ void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
>>>>>          /* Fix up VLAPIC details. */
>>>>>          *ebx &= 0x00FFFFFFu;
>>>>>          *ebx |= (v->vcpu_id * 2) << 24;
>>>>> +
>>>>> +        *ecx &= hvm_featureset[FEATURESET_1c];
>>>>> +        *edx &= hvm_featureset[FEATURESET_1d];
>>>> Looks like I've overlooked an issue in patch 11, which becomes
>>>> apparent here: How can you use a domain-independent featureset
>>>> here, when features vary between HAP and shadow mode guests?
>>>> I.e. in the earlier patch I suppose you need to calculate two
>>>> hvm_*_featureset[]s, with the HAP one perhaps empty when
>>>> !hvm_funcs.hap_supported.
>>> Their use here is a halfway house between nothing and the planned full
>>> per-domain policies.
>>>
>>> In this case, the "don't expose $X to a non-hap domain" checks have been
>>> retained, to cover the difference.
>> Well, doesn't it seem to you that doing only half of the HAP/shadow
>> separation is odd/confusing? I.e. could I talk you into not doing any
>> such separation (enforcing the non-HAP overrides as is done now)
>> or finishing the separation to become visible/usable here?
> 
> The HAP/shadow distinction is needed in the toolstack to account for the
> hap=<bool> option.
> 
> The distinction will disappear when per-domain policies are introduced. 
> If you notice, the distinction is private to the data generated by the
> autogen script, and does not form a part of any API/ABI.  The sysctl
> only has a single hvm featureset.

I don't see this as being in line with

    hvm_featuremask = hvm_funcs.hap_supported ?
        hvm_hap_featuremask : hvm_shadow_featuremask;

in patch 11. A shadow mode guest should see exactly the same
set of features, regardless of whether HAP was available (and
enabled) on a host.

>>>>> +    case 0x80000007:
>>>>> +        d &= pv_featureset[FEATURESET_e7d];
>>>>> +        break;
>>>> By not clearing eax and ebx (not sure about ecx) here you would
>>>> again expose flags to guests without proper white listing.
>>>>
>>>>> +    case 0x80000008:
>>>>> +        b &= pv_featureset[FEATURESET_e8b];
>>>>>          break;
>>>> Same here for ecx and edx and perhaps the upper 8 bits of eax.
>>> Both of these would be changes to how these things are currently
>>> handled, whereby a guest gets to see whatever the toolstack managed to
>>> find in said leaf.  I was hoping to put off some of these decisions, but
>>> they probably need making now.  On the PV side they definitely can't be
>>> fully hidden, as these leaves are not maskable.
>> Right, but many are meaningful primarily to kernels, and there we
>> can hide them.
>>
>> Since you're switching from black to white listing here, I also think
>> we need a default label alongside the "unsupported" one here.
>> Similarly I would think XSTATE sub-leaves beyond 63 need hiding
>> now.
> 
> I would prefer not to do that now.  It is conflated with the future
> work, deliberately deferred to make this work a manageable size.

I understand that you need to set some boundaries for this
first step. But I also think that we shouldn't stop in the middle
of switching from black listing to white listing here.

Jan
Andrew Cooper Feb. 17, 2016, 2:02 p.m. UTC | #6
On 17/02/16 10:55, Jan Beulich wrote:
>>>> On 17.02.16 at 11:43, <andrew.cooper3@citrix.com> wrote:
>> On 16/02/16 10:06, Jan Beulich wrote:
>>>>>> On 15.02.16 at 18:12, <andrew.cooper3@citrix.com> wrote:
>>>> On 15/02/16 15:43, Jan Beulich wrote:
>>>>>>>> On 05.02.16 at 14:42, <andrew.cooper3@citrix.com> wrote:
>>>>>> @@ -4617,50 +4618,39 @@ void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
>>>>>>          /* Fix up VLAPIC details. */
>>>>>>          *ebx &= 0x00FFFFFFu;
>>>>>>          *ebx |= (v->vcpu_id * 2) << 24;
>>>>>> +
>>>>>> +        *ecx &= hvm_featureset[FEATURESET_1c];
>>>>>> +        *edx &= hvm_featureset[FEATURESET_1d];
>>>>> Looks like I've overlooked an issue in patch 11, which becomes
>>>>> apparent here: How can you use a domain-independent featureset
>>>>> here, when features vary between HAP and shadow mode guests?
>>>>> I.e. in the earlier patch I suppose you need to calculate two
>>>>> hvm_*_featureset[]s, with the HAP one perhaps empty when
>>>>> !hvm_funcs.hap_supported.
>>>> Their use here is a halfway house between nothing and the planned full
>>>> per-domain policies.
>>>>
>>>> In this case, the "don't expose $X to a non-hap domain" checks have been
>>>> retained, to cover the difference.
>>> Well, doesn't it seem to you that doing only half of the HAP/shadow
>>> separation is odd/confusing? I.e. could I talk you into not doing any
>>> such separation (enforcing the non-HAP overrides as is done now)
>>> or finishing the separation to become visible/usable here?
>> The HAP/shadow distinction is needed in the toolstack to account for the
>> hap=<bool> option.
>>
>> The distinction will disappear when per-domain policies are introduced. 
>> If you notice, the distinction is private to the data generated by the
>> autogen script, and does not form a part of any API/ABI.  The sysctl
>> only has a single hvm featureset.
> I don't see this as being in line with
>
>     hvm_featuremask = hvm_funcs.hap_supported ?
>         hvm_hap_featuremask : hvm_shadow_featuremask;
>
> in patch 11. A shadow mode guest should see exactly the same
> set of features, regardless of whether HAP was available (and
> enabled) on a host.

A shadow mode guest will see the same features, independently of whether
HAP was available.

This example could in principle be replaced with a clause similar to the
vmx side, but it doesn't remove the need for the shadow mask.

There is currently no way for the toolstack to query the cpuid policy,
which means it must have a correct idea of the eventual policy handed to
the guest.  The pv32/64 split isn't interesting in this regard as it
won't change.  HAP vs shadow however is very important to permit
migrating between hap and non-hap capable hosts.

>
>>>>>> +    case 0x80000007:
>>>>>> +        d &= pv_featureset[FEATURESET_e7d];
>>>>>> +        break;
>>>>> By not clearing eax and ebx (not sure about ecx) here you would
>>>>> again expose flags to guests without proper white listing.
>>>>>
>>>>>> +    case 0x80000008:
>>>>>> +        b &= pv_featureset[FEATURESET_e8b];
>>>>>>          break;
>>>>> Same here for ecx and edx and perhaps the upper 8 bits of eax.
>>>> Both of these would be changes to how these things are currently
>>>> handled, whereby a guest gets to see whatever the toolstack managed to
>>>> find in said leaf.  I was hoping to put off some of these decisions, but
>>>> they probably need making now.  On the PV side they definitely can't be
>>>> fully hidden, as these leaves are not maskable.
>>> Right, but many are meaningful primarily to kernels, and there we
>>> can hide them.
>>>
>>> Since you're switching from black to white listing here, I also think
>>> we need a default label alongside the "unsupported" one here.
>>> Similarly I would think XSTATE sub-leaves beyond 63 need hiding
>>> now.
>> I would prefer not to do that now.  It is conflated with the future
>> work, deliberately deferred to make this work a manageable size.
> I understand that you need to set some boundaries for this
> first step. But I also think that we shouldn't stop in the middle
> of switching from black listing to white listing here.

It was always a mixed black/white list which has turned a little less
like a blacklist.  There are still elements of both.

All this series is intended to do is turn the levellable feature bits
themselves to being strictly controlled.

~Andrew
Jan Beulich Feb. 17, 2016, 2:45 p.m. UTC | #7
>>> On 17.02.16 at 15:02, <andrew.cooper3@citrix.com> wrote:
> On 17/02/16 10:55, Jan Beulich wrote:
>>>>> On 17.02.16 at 11:43, <andrew.cooper3@citrix.com> wrote:
>>> On 16/02/16 10:06, Jan Beulich wrote:
>>>>>>> On 15.02.16 at 18:12, <andrew.cooper3@citrix.com> wrote:
>>>>> On 15/02/16 15:43, Jan Beulich wrote:
>>>>>>>>> On 05.02.16 at 14:42, <andrew.cooper3@citrix.com> wrote:
>>>>>>> @@ -4617,50 +4618,39 @@ void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
>>>>>>>          /* Fix up VLAPIC details. */
>>>>>>>          *ebx &= 0x00FFFFFFu;
>>>>>>>          *ebx |= (v->vcpu_id * 2) << 24;
>>>>>>> +
>>>>>>> +        *ecx &= hvm_featureset[FEATURESET_1c];
>>>>>>> +        *edx &= hvm_featureset[FEATURESET_1d];
>>>>>> Looks like I've overlooked an issue in patch 11, which becomes
>>>>>> apparent here: How can you use a domain-independent featureset
>>>>>> here, when features vary between HAP and shadow mode guests?
>>>>>> I.e. in the earlier patch I suppose you need to calculate two
>>>>>> hvm_*_featureset[]s, with the HAP one perhaps empty when
>>>>>> !hvm_funcs.hap_supported.
>>>>> Their use here is a halfway house between nothing and the planned full
>>>>> per-domain policies.
>>>>>
>>>>> In this case, the "don't expose $X to a non-hap domain" checks have been
>>>>> retained, to cover the difference.
>>>> Well, doesn't it seem to you that doing only half of the HAP/shadow
>>>> separation is odd/confusing? I.e. could I talk you into not doing any
>>>> such separation (enforcing the non-HAP overrides as is done now)
>>>> or finishing the separation to become visible/usable here?
>>> The HAP/shadow distinction is needed in the toolstack to account for the
>>> hap=<bool> option.
>>>
>>> The distinction will disappear when per-domain policies are introduced. 
>>> If you notice, the distinction is private to the data generated by the
>>> autogen script, and does not form a part of any API/ABI.  The sysctl
>>> only has a single hvm featureset.
>> I don't see this as being in line with
>>
>>     hvm_featuremask = hvm_funcs.hap_supported ?
>>         hvm_hap_featuremask : hvm_shadow_featuremask;
>>
>> in patch 11. A shadow mode guest should see exactly the same
>> set of features, regardless of whether HAP was available (and
>> enabled) on a host.
> 
> A shadow mode guest will see the same features, independently of whether
> HAP was available.

I'm afraid I'm being dense: Either the guest sees the same features,
which to me implies both of hvm_{hap,shadow}_featuremask are
identical, or the two masks are different, resulting in different guest
feature masks (and hence different guest features) depending on
HAP availability. What am I missing?

Jan
Andrew Cooper Feb. 18, 2016, 12:17 p.m. UTC | #8
On 17/02/16 14:45, Jan Beulich wrote:
>>>> On 17.02.16 at 15:02, <andrew.cooper3@citrix.com> wrote:
>> On 17/02/16 10:55, Jan Beulich wrote:
>>>>>> On 17.02.16 at 11:43, <andrew.cooper3@citrix.com> wrote:
>>>> On 16/02/16 10:06, Jan Beulich wrote:
>>>>>>>> On 15.02.16 at 18:12, <andrew.cooper3@citrix.com> wrote:
>>>>>> On 15/02/16 15:43, Jan Beulich wrote:
>>>>>>>>>> On 05.02.16 at 14:42, <andrew.cooper3@citrix.com> wrote:
>>>>>>>> @@ -4617,50 +4618,39 @@ void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
>>>>>>>>          /* Fix up VLAPIC details. */
>>>>>>>>          *ebx &= 0x00FFFFFFu;
>>>>>>>>          *ebx |= (v->vcpu_id * 2) << 24;
>>>>>>>> +
>>>>>>>> +        *ecx &= hvm_featureset[FEATURESET_1c];
>>>>>>>> +        *edx &= hvm_featureset[FEATURESET_1d];
>>>>>>> Looks like I've overlooked an issue in patch 11, which becomes
>>>>>>> apparent here: How can you use a domain-independent featureset
>>>>>>> here, when features vary between HAP and shadow mode guests?
>>>>>>> I.e. in the earlier patch I suppose you need to calculate two
>>>>>>> hvm_*_featureset[]s, with the HAP one perhaps empty when
>>>>>>> !hvm_funcs.hap_supported.
>>>>>> Their use here is a halfway house between nothing and the planned full
>>>>>> per-domain policies.
>>>>>>
>>>>>> In this case, the "don't expose $X to a non-hap domain" checks have been
>>>>>> retained, to cover the difference.
>>>>> Well, doesn't it seem to you that doing only half of the HAP/shadow
>>>>> separation is odd/confusing? I.e. could I talk you into not doing any
>>>>> such separation (enforcing the non-HAP overrides as is done now)
>>>>> or finishing the separation to become visible/usable here?
>>>> The HAP/shadow distinction is needed in the toolstack to account for the
>>>> hap=<bool> option.
>>>>
>>>> The distinction will disappear when per-domain policies are introduced. 
>>>> If you notice, the distinction is private to the data generated by the
>>>> autogen script, and does not form a part of any API/ABI.  The sysctl
>>>> only has a single hvm featureset.
>>> I don't see this as being in line with
>>>
>>>     hvm_featuremask = hvm_funcs.hap_supported ?
>>>         hvm_hap_featuremask : hvm_shadow_featuremask;
>>>
>>> in patch 11. A shadow mode guest should see exactly the same
>>> set of features, regardless of whether HAP was available (and
>>> enabled) on a host.
>> A shadow mode guest will see the same features, independently of whether
>> HAP was available.
> I'm afraid I'm being dense: Either the guest sees the same features,
> which to me implies both of hvm_{hap,shadow}_featuremask are
> identical, or the two masks are different, resulting in different guest
> feature masks (and hence different guest features) depending on
> HAP availability. What am I missing?

A guest booted with hap and a guest booted with shadow will see
different features when booted on the same host.  Hap includes 1GB
superpages, PCID, etc.

The problem comes with a shadow guest booted on a hap-capable host. 
Such a guest can safely be migrated to a non-hap capable host, but only
if the toolstack knows that the guest saw a reduced featureset.

As there is still no interface to query what a guest can actually see
(depends on full per-domain policys and no dynamic hiding), the shadow
featuremask is used by the toolstack as a promise of what the Xen
dynamic checks will do.

~Andrew
Jan Beulich Feb. 18, 2016, 1:23 p.m. UTC | #9
>>> On 18.02.16 at 13:17, <andrew.cooper3@citrix.com> wrote:
> On 17/02/16 14:45, Jan Beulich wrote:
>>>>> On 17.02.16 at 15:02, <andrew.cooper3@citrix.com> wrote:
>>> On 17/02/16 10:55, Jan Beulich wrote:
>>>>>>> On 17.02.16 at 11:43, <andrew.cooper3@citrix.com> wrote:
>>>>> On 16/02/16 10:06, Jan Beulich wrote:
>>>>>>>>> On 15.02.16 at 18:12, <andrew.cooper3@citrix.com> wrote:
>>>>>>> On 15/02/16 15:43, Jan Beulich wrote:
>>>>>>>>>>> On 05.02.16 at 14:42, <andrew.cooper3@citrix.com> wrote:
>>>>>>>>> @@ -4617,50 +4618,39 @@ void hvm_cpuid(unsigned int input, unsigned int *eax, 
> unsigned int *ebx,
>>>>>>>>>          /* Fix up VLAPIC details. */
>>>>>>>>>          *ebx &= 0x00FFFFFFu;
>>>>>>>>>          *ebx |= (v->vcpu_id * 2) << 24;
>>>>>>>>> +
>>>>>>>>> +        *ecx &= hvm_featureset[FEATURESET_1c];
>>>>>>>>> +        *edx &= hvm_featureset[FEATURESET_1d];
>>>>>>>> Looks like I've overlooked an issue in patch 11, which becomes
>>>>>>>> apparent here: How can you use a domain-independent featureset
>>>>>>>> here, when features vary between HAP and shadow mode guests?
>>>>>>>> I.e. in the earlier patch I suppose you need to calculate two
>>>>>>>> hvm_*_featureset[]s, with the HAP one perhaps empty when
>>>>>>>> !hvm_funcs.hap_supported.
>>>>>>> Their use here is a halfway house between nothing and the planned full
>>>>>>> per-domain policies.
>>>>>>>
>>>>>>> In this case, the "don't expose $X to a non-hap domain" checks have been
>>>>>>> retained, to cover the difference.
>>>>>> Well, doesn't it seem to you that doing only half of the HAP/shadow
>>>>>> separation is odd/confusing? I.e. could I talk you into not doing any
>>>>>> such separation (enforcing the non-HAP overrides as is done now)
>>>>>> or finishing the separation to become visible/usable here?
>>>>> The HAP/shadow distinction is needed in the toolstack to account for the
>>>>> hap=<bool> option.
>>>>>
>>>>> The distinction will disappear when per-domain policies are introduced. 
>>>>> If you notice, the distinction is private to the data generated by the
>>>>> autogen script, and does not form a part of any API/ABI.  The sysctl
>>>>> only has a single hvm featureset.
>>>> I don't see this as being in line with
>>>>
>>>>     hvm_featuremask = hvm_funcs.hap_supported ?
>>>>         hvm_hap_featuremask : hvm_shadow_featuremask;
>>>>
>>>> in patch 11. A shadow mode guest should see exactly the same
>>>> set of features, regardless of whether HAP was available (and
>>>> enabled) on a host.
>>> A shadow mode guest will see the same features, independently of whether
>>> HAP was available.
>> I'm afraid I'm being dense: Either the guest sees the same features,
>> which to me implies both of hvm_{hap,shadow}_featuremask are
>> identical, or the two masks are different, resulting in different guest
>> feature masks (and hence different guest features) depending on
>> HAP availability. What am I missing?
> 
> A guest booted with hap and a guest booted with shadow will see
> different features when booted on the same host.  Hap includes 1GB
> superpages, PCID, etc.

Okay, now I (think I) at least understand the background. What
I then still don't understand is why you don't - in the code still
visible above - use a HAP/shadow dependent mask, thus taking
care of those differences instead of elsewhere doing dynamic
adjustments.

Jan

> The problem comes with a shadow guest booted on a hap-capable host. 
> Such a guest can safely be migrated to a non-hap capable host, but only
> if the toolstack knows that the guest saw a reduced featureset.
> 
> As there is still no interface to query what a guest can actually see
> (depends on full per-domain policys and no dynamic hiding), the shadow
> featuremask is used by the toolstack as a promise of what the Xen
> dynamic checks will do.
> 
> ~Andrew
diff mbox

Patch

diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 35ec6c9..03b3868 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -71,6 +71,7 @@ 
 #include <public/memory.h>
 #include <public/vm_event.h>
 #include <public/arch-x86/cpuid.h>
+#include <asm/cpuid.h>
 
 bool_t __read_mostly hvm_enabled;
 
@@ -4617,50 +4618,39 @@  void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
         /* Fix up VLAPIC details. */
         *ebx &= 0x00FFFFFFu;
         *ebx |= (v->vcpu_id * 2) << 24;
+
+        *ecx &= hvm_featureset[FEATURESET_1c];
+        *edx &= hvm_featureset[FEATURESET_1d];
+
         if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
-            __clear_bit(X86_FEATURE_APIC & 31, edx);
+            *edx &= ~cpufeat_bit(X86_FEATURE_APIC);
 
         /* Fix up OSXSAVE. */
-        if ( cpu_has_xsave )
-            *ecx |= (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_OSXSAVE) ?
-                     cpufeat_mask(X86_FEATURE_OSXSAVE) : 0;
+        if ( v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_OSXSAVE )
+            *ecx |= cpufeat_mask(X86_FEATURE_OSXSAVE);
 
         /* Don't expose PCID to non-hap hvm. */
         if ( !hap_enabled(d) )
             *ecx &= ~cpufeat_mask(X86_FEATURE_PCID);
-
-        /* Only provide PSE36 when guest runs in 32bit PAE or in long mode */
-        if ( !(hvm_pae_enabled(v) || hvm_long_mode_enabled(v)) )
-            *edx &= ~cpufeat_mask(X86_FEATURE_PSE36);
         break;
+
     case 0x7:
         if ( count == 0 )
         {
-            if ( !cpu_has_smep )
-                *ebx &= ~cpufeat_mask(X86_FEATURE_SMEP);
-
-            if ( !cpu_has_smap )
-                *ebx &= ~cpufeat_mask(X86_FEATURE_SMAP);
-
-            /* Don't expose MPX to hvm when VMX support is not available */
-            if ( !(vmx_vmexit_control & VM_EXIT_CLEAR_BNDCFGS) ||
-                 !(vmx_vmentry_control & VM_ENTRY_LOAD_BNDCFGS) )
-                *ebx &= ~cpufeat_mask(X86_FEATURE_MPX);
+            *ebx &= hvm_featureset[FEATURESET_7b0];
+            *ecx &= hvm_featureset[FEATURESET_7c0];
 
             /* Don't expose INVPCID to non-hap hvm. */
             if ( !hap_enabled(d) )
                 *ebx &= ~cpufeat_mask(X86_FEATURE_INVPCID);
-
-            /* Don't expose PCOMMIT to hvm when VMX support is not available */
-            if ( !cpu_has_vmx_pcommit )
-                *ebx &= ~cpufeat_mask(X86_FEATURE_PCOMMIT);
         }
-
         break;
+
     case 0xb:
         /* Fix the x2APIC identifier. */
         *edx = v->vcpu_id * 2;
         break;
+
     case 0xd:
         /* EBX value of main leaf 0 depends on enabled xsave features */
         if ( count == 0 && v->arch.xcr0 ) 
@@ -4677,9 +4667,12 @@  void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
                     *ebx = _eax + _ebx;
             }
         }
+
         if ( count == 1 )
         {
-            if ( cpu_has_xsaves && cpu_has_vmx_xsaves )
+            *eax &= hvm_featureset[FEATURESET_Da1];
+
+            if ( *eax & cpufeat_mask(X86_FEATURE_XSAVES) )
             {
                 *ebx = XSTATE_AREA_MIN_SIZE;
                 if ( v->arch.xcr0 | v->arch.hvm_vcpu.msr_xss )
@@ -4694,6 +4687,9 @@  void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
         break;
 
     case 0x80000001:
+        *ecx &= hvm_featureset[FEATURESET_e1c];
+        *edx &= hvm_featureset[FEATURESET_e1d];
+
         /* We expose RDTSCP feature to guest only when
            tsc_mode == TSC_MODE_DEFAULT and host_tsc_is_safe() returns 1 */
         if ( d->arch.tsc_mode != TSC_MODE_DEFAULT ||
@@ -4702,12 +4698,10 @@  void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
         /* Hide 1GB-superpage feature if we can't emulate it. */
         if (!hvm_pse1gb_supported(d))
             *edx &= ~cpufeat_mask(X86_FEATURE_PAGE1GB);
-        /* Only provide PSE36 when guest runs in 32bit PAE or in long mode */
-        if ( !(hvm_pae_enabled(v) || hvm_long_mode_enabled(v)) )
-            *edx &= ~cpufeat_mask(X86_FEATURE_PSE36);
-        /* Hide data breakpoint extensions if the hardware has no support. */
-        if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
-            *ecx &= ~cpufeat_mask(X86_FEATURE_DBEXT);
+        break;
+
+    case 0x80000007:
+        *edx &= hvm_featureset[FEATURESET_e7d];
         break;
 
     case 0x80000008:
@@ -4725,6 +4719,8 @@  void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
         hvm_cpuid(0x80000001, NULL, NULL, NULL, &_edx);
         *eax = (*eax & ~0xffff00) | (_edx & cpufeat_mask(X86_FEATURE_LM)
                                      ? 0x3000 : 0x2000);
+
+        *ebx &= hvm_featureset[FEATURESET_e8b];
         break;
     }
 }
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index 6a181bb..d0f836c 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -73,6 +73,7 @@ 
 #include <asm/hpet.h>
 #include <asm/vpmu.h>
 #include <public/arch-x86/cpuid.h>
+#include <asm/cpuid.h>
 #include <xsm/xsm.h>
 
 /*
@@ -841,69 +842,70 @@  void pv_cpuid(struct cpu_user_regs *regs)
     else
         cpuid_count(leaf, subleaf, &a, &b, &c, &d);
 
-    if ( (leaf & 0x7fffffff) == 0x00000001 )
-    {
-        /* Modify Feature Information. */
-        if ( !cpu_has_apic )
-            __clear_bit(X86_FEATURE_APIC, &d);
-
-        if ( !is_pvh_domain(currd) )
-        {
-            __clear_bit(X86_FEATURE_PSE, &d);
-            __clear_bit(X86_FEATURE_PGE, &d);
-            __clear_bit(X86_FEATURE_PSE36, &d);
-            __clear_bit(X86_FEATURE_VME, &d);
-        }
-    }
-
     switch ( leaf )
     {
     case 0x00000001:
-        /* Modify Feature Information. */
-        if ( !cpu_has_sep )
-            __clear_bit(X86_FEATURE_SEP, &d);
-        __clear_bit(X86_FEATURE_DS, &d);
-        __clear_bit(X86_FEATURE_ACC, &d);
-        __clear_bit(X86_FEATURE_PBE, &d);
-        if ( is_pvh_domain(currd) )
-            __clear_bit(X86_FEATURE_MTRR, &d);
-
-        __clear_bit(X86_FEATURE_DTES64 % 32, &c);
-        __clear_bit(X86_FEATURE_MWAIT % 32, &c);
-        __clear_bit(X86_FEATURE_DSCPL % 32, &c);
-        __clear_bit(X86_FEATURE_VMXE % 32, &c);
-        __clear_bit(X86_FEATURE_SMXE % 32, &c);
-        __clear_bit(X86_FEATURE_TM2 % 32, &c);
+        c &= pv_featureset[FEATURESET_1c];
+        d &= pv_featureset[FEATURESET_1d];
+
         if ( is_pv_32bit_domain(currd) )
-            __clear_bit(X86_FEATURE_CX16 % 32, &c);
-        __clear_bit(X86_FEATURE_XTPR % 32, &c);
-        __clear_bit(X86_FEATURE_PDCM % 32, &c);
-        __clear_bit(X86_FEATURE_PCID % 32, &c);
-        __clear_bit(X86_FEATURE_DCA % 32, &c);
-        if ( !cpu_has_xsave )
-        {
-            __clear_bit(X86_FEATURE_XSAVE % 32, &c);
-            __clear_bit(X86_FEATURE_AVX % 32, &c);
-        }
-        if ( !cpu_has_apic )
-           __clear_bit(X86_FEATURE_X2APIC % 32, &c);
-        __set_bit(X86_FEATURE_HYPERVISOR % 32, &c);
+            c &= ~cpufeat_mask(X86_FEATURE_CX16);
+
+        /*
+         * !!! Warning - OSXSAVE handling for PV guests is non-architectural !!!
+         *
+         * Architecturally, the correct code here is simply:
+         *
+         *   if ( curr->arch.pv_vcpu.ctrlreg[4] & X86_CR4_OSXSAVE )
+         *       c |= cpufeat_mask(X86_FEATURE_OSXSAVE);
+         *
+         * However because of bugs in Xen (before c/s bd19080b, Nov 2010, the
+         * XSAVE cpuid flag leaked into guests despite the feature not being
+         * avilable for use), buggy workarounds where introduced to Linux (c/s
+         * 947ccf9c, also Nov 2010) which relied on the fact that Xen also
+         * incorrectly leaked OSXSAVE into the guest.
+         *
+         * Furthermore, providing architectural OSXSAVE behaviour to a many
+         * Linux PV guests triggered a further kernel bug when the fpu code
+         * observes that XSAVEOPT is available, assumes that xsave state had
+         * been set up for the task, and follows a wild pointer.
+         *
+         * Therefore, the leaking of Xen's OSXSAVE setting has become a
+         * defacto part of the PV ABI and can't reasonably be corrected.
+         *
+         * The following situations and logic now applies:
+         *
+         * - Hardware without CPUID faulting support and native CPUID:
+         *    There is nothing Xen can do here.  The hosts XSAVE flag will
+         *    leak through and Xen's OSXSAVE choice will leak through.
+         *
+         *    In the case that the guest kernel has not set up OSXSAVE, only
+         *    SSE will be set in xcr0, and guest userspace can't do too much
+         *    damage itself.
+         *
+         * - Enlightened CPUID or CPUID faulting available:
+         *    Xen can fully control what is seen here.  Guest kernels need to
+         *    see the leaked OSXSAVE, but guest userspace is given
+         *    architectural behaviour, to reflect the guest kernels
+         *    intentions.
+         */
+        if ( (is_pv_domain(currd) && guest_kernel_mode(curr, regs) &&
+              (this_cpu(cr4) & X86_CR4_OSXSAVE)) ||
+             (curr->arch.pv_vcpu.ctrlreg[4] & X86_CR4_OSXSAVE) )
+            c |= cpufeat_mask(X86_FEATURE_OSXSAVE);
+
+        c |= cpufeat_mask(X86_FEATURE_HYPERVISOR);
         break;
 
     case 0x00000007:
         if ( subleaf == 0 )
-            b &= (cpufeat_mask(X86_FEATURE_BMI1) |
-                  cpufeat_mask(X86_FEATURE_HLE)  |
-                  cpufeat_mask(X86_FEATURE_AVX2) |
-                  cpufeat_mask(X86_FEATURE_BMI2) |
-                  cpufeat_mask(X86_FEATURE_ERMS) |
-                  cpufeat_mask(X86_FEATURE_RTM)  |
-                  cpufeat_mask(X86_FEATURE_RDSEED)  |
-                  cpufeat_mask(X86_FEATURE_ADX)  |
-                  cpufeat_mask(X86_FEATURE_FSGSBASE));
+        {
+            b &= pv_featureset[FEATURESET_7b0];
+            c &= pv_featureset[FEATURESET_7c0];
+        }
         else
-            b = 0;
-        a = c = d = 0;
+            b = c = 0;
+        a = d = 0;
         break;
 
     case XSTATE_CPUID:
@@ -926,37 +928,32 @@  void pv_cpuid(struct cpu_user_regs *regs)
         }
 
         case 1:
-            a &= (boot_cpu_data.x86_capability[cpufeat_word(X86_FEATURE_XSAVEOPT)] &
-                  ~cpufeat_mask(X86_FEATURE_XSAVES));
+            a &= pv_featureset[FEATURESET_Da1];
             b = c = d = 0;
             break;
         }
         break;
 
     case 0x80000001:
-        /* Modify Feature Information. */
+        c &= pv_featureset[FEATURESET_e1c];
+        d &= pv_featureset[FEATURESET_e1d];
+
         if ( is_pv_32bit_domain(currd) )
         {
-            __clear_bit(X86_FEATURE_LM % 32, &d);
-            __clear_bit(X86_FEATURE_LAHF_LM % 32, &c);
-        }
-        if ( is_pv_32bit_domain(currd) &&
-             boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
-            __clear_bit(X86_FEATURE_SYSCALL % 32, &d);
-        __clear_bit(X86_FEATURE_PAGE1GB % 32, &d);
-        __clear_bit(X86_FEATURE_RDTSCP % 32, &d);
-
-        __clear_bit(X86_FEATURE_SVM % 32, &c);
-        if ( !cpu_has_apic )
-           __clear_bit(X86_FEATURE_EXTAPIC % 32, &c);
-        __clear_bit(X86_FEATURE_OSVW % 32, &c);
-        __clear_bit(X86_FEATURE_IBS % 32, &c);
-        __clear_bit(X86_FEATURE_SKINIT % 32, &c);
-        __clear_bit(X86_FEATURE_WDT % 32, &c);
-        __clear_bit(X86_FEATURE_LWP % 32, &c);
-        __clear_bit(X86_FEATURE_NODEID_MSR % 32, &c);
-        __clear_bit(X86_FEATURE_TOPOEXT % 32, &c);
-        __clear_bit(X86_FEATURE_MWAITX % 32, &c);
+            d &= ~cpufeat_mask(X86_FEATURE_LM);
+            c &= ~cpufeat_mask(X86_FEATURE_LAHF_LM);
+
+            if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
+                d &= ~cpufeat_mask(X86_FEATURE_SYSCALL);
+        }
+        break;
+
+    case 0x80000007:
+        d &= pv_featureset[FEATURESET_e7d];
+        break;
+
+    case 0x80000008:
+        b &= pv_featureset[FEATURESET_e8b];
         break;
 
     case 0x0000000a: /* Architectural Performance Monitor Features (Intel) */