Message ID | 1379861095-628-2-git-send-email-bp@alien8.de (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Sun, Sep 22, 2013 at 04:44:50PM +0200, Borislav Petkov wrote: > From: Borislav Petkov <bp@suse.de> > > Add a kvm ioctl which states which system functionality kvm emulates. > The format used is that of CPUID and we return the corresponding CPUID > bits set for which we do emulate functionality. Let me check if I understood the purpose of the new ioctl correctly: the only reason for GET_EMULATED_CPUID to exist is to allow userspace to differentiate features that are native or that are emulated efficiently (GET_SUPPORTED_CPUID) and features that are emulated not very efficiently (GET_EMULATED_CPUID)? If that's the case, how do we decide how efficient emulation should be, to deserve inclusion in GET_SUPPORTED_CPUID? I am guessing that the criterion will be: if enabling it doesn't risk making performance worse, it can get in GET_SUPPORTED_CPUID. > > Make sure ->padding is being passed on clean from userspace so that we > can use it for something in the future, after the ioctl gets cast in > stone. > > s/kvm_dev_ioctl_get_supported_cpuid/kvm_dev_ioctl_get_cpuid/ while at > it. 
> > Signed-off-by: Borislav Petkov <bp@suse.de> > --- > Documentation/virtual/kvm/api.txt | 77 +++++++++++++++++++++++++++++++++++++-- > arch/x86/include/uapi/asm/kvm.h | 6 +-- > arch/x86/kvm/cpuid.c | 57 ++++++++++++++++++++++++++--- > arch/x86/kvm/cpuid.h | 5 ++- > arch/x86/kvm/x86.c | 9 +++-- > include/uapi/linux/kvm.h | 2 + > 6 files changed, 139 insertions(+), 17 deletions(-) > > diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt > index 858aecf21db2..7b70d670bb28 100644 > --- a/Documentation/virtual/kvm/api.txt > +++ b/Documentation/virtual/kvm/api.txt > @@ -1122,9 +1122,9 @@ struct kvm_cpuid2 { > struct kvm_cpuid_entry2 entries[0]; > }; > > -#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1 > -#define KVM_CPUID_FLAG_STATEFUL_FUNC 2 > -#define KVM_CPUID_FLAG_STATE_READ_NEXT 4 > +#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX BIT(0) > +#define KVM_CPUID_FLAG_STATEFUL_FUNC BIT(1) > +#define KVM_CPUID_FLAG_STATE_READ_NEXT BIT(2) > > struct kvm_cpuid_entry2 { > __u32 function; > @@ -2661,6 +2661,77 @@ and usually define the validity of a groups of registers. (e.g. one bit > }; > > > +4.81 KVM_GET_EMULATED_CPUID > + > +Capability: KVM_CAP_EXT_EMUL_CPUID > +Architectures: x86 > +Type: system ioctl > +Parameters: struct kvm_cpuid2 (in/out) > +Returns: 0 on success, -1 on error > + > +struct kvm_cpuid2 { > + __u32 nent; > + __u32 flags; > + struct kvm_cpuid_entry2 entries[0]; > +}; > + > +The member 'flags' is used for passing flags from userspace. 
> + > +#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX BIT(0) > +#define KVM_CPUID_FLAG_STATEFUL_FUNC BIT(1) > +#define KVM_CPUID_FLAG_STATE_READ_NEXT BIT(2) > + > +struct kvm_cpuid_entry2 { > + __u32 function; > + __u32 index; > + __u32 flags; > + __u32 eax; > + __u32 ebx; > + __u32 ecx; > + __u32 edx; > + __u32 padding[3]; > +}; > + > +This ioctl returns x86 cpuid features which are emulated by > +kvm.Userspace can use the information returned by this ioctl to query > +which features are emulated by kvm instead of being present natively. > + > +Userspace invokes KVM_GET_EMULATED_CPUID by passing a kvm_cpuid2 > +structure with the 'nent' field indicating the number of entries in > +the variable-size array 'entries'. If the number of entries is too low > +to describe the cpu capabilities, an error (E2BIG) is returned. If the > +number is too high, the 'nent' field is adjusted and an error (ENOMEM) > +is returned. If the number is just right, the 'nent' field is adjusted > +to the number of valid entries in the 'entries' array, which is then > +filled. > + > +The entries returned are the set CPUID bits of the respective features > +which kvm emulates, as returned by the CPUID instruction, with unknown > +or unsupported feature bits cleared. > + > +Features like x2apic, for example, may not be present in the host cpu > +but are exposed by kvm in KVM_GET_SUPPORTED_CPUID because they can be > +emulated efficiently and thus not included here. 
> + > +The fields in each entry are defined as follows: > + > + function: the eax value used to obtain the entry > + index: the ecx value used to obtain the entry (for entries that are > + affected by ecx) > + flags: an OR of zero or more of the following: > + KVM_CPUID_FLAG_SIGNIFCANT_INDEX: > + if the index field is valid > + KVM_CPUID_FLAG_STATEFUL_FUNC: > + if cpuid for this function returns different values for successive > + invocations; there will be several entries with the same function, > + all with this flag set > + KVM_CPUID_FLAG_STATE_READ_NEXT: > + for KVM_CPUID_FLAG_STATEFUL_FUNC entries, set if this entry is > + the first entry to be read by a cpu > + eax, ebx, ecx, edx: the values returned by the cpuid instruction for > + this function/index combination > + > + > 6. Capabilities that can be enabled > ----------------------------------- > > diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h > index 5d9a3033b3d7..d3a87780c70b 100644 > --- a/arch/x86/include/uapi/asm/kvm.h > +++ b/arch/x86/include/uapi/asm/kvm.h > @@ -211,9 +211,9 @@ struct kvm_cpuid_entry2 { > __u32 padding[3]; > }; > > -#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1 > -#define KVM_CPUID_FLAG_STATEFUL_FUNC 2 > -#define KVM_CPUID_FLAG_STATE_READ_NEXT 4 > +#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX BIT(0) > +#define KVM_CPUID_FLAG_STATEFUL_FUNC BIT(1) > +#define KVM_CPUID_FLAG_STATE_READ_NEXT BIT(2) > > /* for KVM_SET_CPUID2 */ > struct kvm_cpuid2 { > diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c > index b110fe6c03d4..eca357095a49 100644 > --- a/arch/x86/kvm/cpuid.c > +++ b/arch/x86/kvm/cpuid.c > @@ -187,8 +187,14 @@ static bool supported_xcr0_bit(unsigned bit) > > #define F(x) bit(X86_FEATURE_##x) > > -static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, > - u32 index, int *nent, int maxnent) > +static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry, > + u32 func, u32 index, int *nent, int maxnent) > +{ > + return 0; > +} > + 
> +static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, > + u32 index, int *nent, int maxnent) > { > int r; > unsigned f_nx = is_efer_nx() ? F(NX) : 0; > @@ -481,6 +487,15 @@ out: > return r; > } > > +static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 func, > + u32 idx, int *nent, int maxnent, unsigned int type) > +{ > + if (type == KVM_GET_EMULATED_CPUID) > + return __do_cpuid_ent_emulated(entry, func, idx, nent, maxnent); > + > + return __do_cpuid_ent(entry, func, idx, nent, maxnent); > +} > + > #undef F > > struct kvm_cpuid_param { > @@ -495,8 +510,34 @@ static bool is_centaur_cpu(const struct kvm_cpuid_param *param) > return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR; > } > > -int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, > - struct kvm_cpuid_entry2 __user *entries) > +static bool sanity_check_entries(struct kvm_cpuid_entry2 __user *entries, > + __u32 num_entries, unsigned int ioctl_type) > +{ > + int i; > + > + if (ioctl_type != KVM_GET_EMULATED_CPUID) > + return false; > + > + /* > + * We want to make sure that ->padding is being passed clean from > + * userspace in case we want to use it for something in the future. > + * > + * Sadly, this wasn't enforced for KVM_GET_SUPPORTED_CPUID and so we > + * have to give ourselves satisfied only with the emulated side. /me > + * sheds a tear. 
> + */ > + for (i = 0; i < num_entries; i++) { > + if (entries[i].padding[0] || > + entries[i].padding[1] || > + entries[i].padding[2]) > + return true; > + } > + return false; > +} > + > +int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, > + struct kvm_cpuid_entry2 __user *entries, > + unsigned int type) > { > struct kvm_cpuid_entry2 *cpuid_entries; > int limit, nent = 0, r = -E2BIG, i; > @@ -513,6 +554,10 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, > goto out; > if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) > cpuid->nent = KVM_MAX_CPUID_ENTRIES; > + > + if (sanity_check_entries(entries, cpuid->nent, type)) > + return -EINVAL; > + > r = -ENOMEM; > cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); > if (!cpuid_entries) > @@ -526,7 +571,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, > continue; > > r = do_cpuid_ent(&cpuid_entries[nent], ent->func, ent->idx, > - &nent, cpuid->nent); > + &nent, cpuid->nent, type); > > if (r) > goto out_free; > @@ -537,7 +582,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, > limit = cpuid_entries[nent - 1].eax; > for (func = ent->func + 1; func <= limit && nent < cpuid->nent && r == 0; ++func) > r = do_cpuid_ent(&cpuid_entries[nent], func, ent->idx, > - &nent, cpuid->nent); > + &nent, cpuid->nent, type); > > if (r) > goto out_free; > diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h > index b7fd07984888..f1e4895174b2 100644 > --- a/arch/x86/kvm/cpuid.h > +++ b/arch/x86/kvm/cpuid.h > @@ -6,8 +6,9 @@ > void kvm_update_cpuid(struct kvm_vcpu *vcpu); > struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, > u32 function, u32 index); > -int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, > - struct kvm_cpuid_entry2 __user *entries); > +int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, > + struct kvm_cpuid_entry2 __user *entries, > + unsigned int type); > int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, > struct 
kvm_cpuid *cpuid, > struct kvm_cpuid_entry __user *entries); > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index e5ca72a5cdb6..8dfde7a52dab 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -2564,6 +2564,7 @@ int kvm_dev_ioctl_check_extension(long ext) > case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: > case KVM_CAP_SET_TSS_ADDR: > case KVM_CAP_EXT_CPUID: > + case KVM_CAP_EXT_EMUL_CPUID: > case KVM_CAP_CLOCKSOURCE: > case KVM_CAP_PIT: > case KVM_CAP_NOP_IO_DELAY: > @@ -2673,15 +2674,17 @@ long kvm_arch_dev_ioctl(struct file *filp, > r = 0; > break; > } > - case KVM_GET_SUPPORTED_CPUID: { > + case KVM_GET_SUPPORTED_CPUID: > + case KVM_GET_EMULATED_CPUID: { > struct kvm_cpuid2 __user *cpuid_arg = argp; > struct kvm_cpuid2 cpuid; > > r = -EFAULT; > if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) > goto out; > - r = kvm_dev_ioctl_get_supported_cpuid(&cpuid, > - cpuid_arg->entries); > + > + r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries, > + ioctl); > if (r) > goto out; > > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h > index 99c25338ede8..bb986a1674a3 100644 > --- a/include/uapi/linux/kvm.h > +++ b/include/uapi/linux/kvm.h > @@ -541,6 +541,7 @@ struct kvm_ppc_smmu_info { > #define KVM_TRACE_ENABLE __KVM_DEPRECATED_MAIN_W_0x06 > #define KVM_TRACE_PAUSE __KVM_DEPRECATED_MAIN_0x07 > #define KVM_TRACE_DISABLE __KVM_DEPRECATED_MAIN_0x08 > +#define KVM_GET_EMULATED_CPUID _IOWR(KVMIO, 0x09, struct kvm_cpuid2) > > /* > * Extension capability list. > @@ -668,6 +669,7 @@ struct kvm_ppc_smmu_info { > #define KVM_CAP_IRQ_XICS 92 > #define KVM_CAP_ARM_EL1_32BIT 93 > #define KVM_CAP_SPAPR_MULTITCE 94 > +#define KVM_CAP_EXT_EMUL_CPUID 95 > > #ifdef KVM_CAP_IRQ_ROUTING > > -- > 1.8.4 > > -- > To unsubscribe from this list: send the line "unsubscribe kvm" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, September 23, 2013 6:28 pm, Eduardo Habkost wrote: > On Sun, Sep 22, 2013 at 04:44:50PM +0200, Borislav Petkov wrote: >> From: Borislav Petkov <bp@suse.de> >> >> Add a kvm ioctl which states which system functionality kvm emulates. >> The format used is that of CPUID and we return the corresponding CPUID >> bits set for which we do emulate functionality. > > Let me check if I understood the purpose of the new ioctl correctly: the > only reason for GET_EMULATED_CPUID to exist is to allow userspace to > differentiate features that are native or that are emulated efficiently > (GET_SUPPORTED_CPUID) and features that are emulated not very > efficiently (GET_EMULATED_CPUID)? Not only that - emulated features are not reported in CPUID so they can be enabled only when specifically and explicitly requested, i.e. "+movbe". Basically, you want to emulate that feature for the guest but only for this specific guest - the others shouldn't see it. > If that's the case, how do we decide how efficient emulation should be, > to deserve inclusion in GET_SUPPORTED_CPUID? I am guessing that the > criterion will be: if enabling it doesn't risk making performance worse, > it can get in GET_SUPPORTED_CPUID. Well, in the MOVBE case, supported means, the host can execute this instruction natively. Now, you guys say you can emulate x2apic very efficiently and I'm guessing emulating x2apic doesn't bring any emulation overhead, thus SUPPORTED_CPUID. But for single instructions or group of instructions, the distinction should be very clear. At least this is how I see it but Gleb probably can comment too. Thanks.
On Tue, Sep 24, 2013 at 11:57:00AM +0200, Borislav Petkov wrote: > On Mon, September 23, 2013 6:28 pm, Eduardo Habkost wrote: > > On Sun, Sep 22, 2013 at 04:44:50PM +0200, Borislav Petkov wrote: > >> From: Borislav Petkov <bp@suse.de> > >> > >> Add a kvm ioctl which states which system functionality kvm emulates. > >> The format used is that of CPUID and we return the corresponding CPUID > >> bits set for which we do emulate functionality. > > > > Let me check if I understood the purpose of the new ioctl correctly: the > > only reason for GET_EMULATED_CPUID to exist is to allow userspace to > > differentiate features that are native or that are emulated efficiently > > (GET_SUPPORTED_CPUID) and features that are emulated not very > > efficiently (GET_EMULATED_CPUID)? > > Not only that - emulated features are not reported in CPUID so they > can be enabled only when specifically and explicitly requested, i.e. > "+movbe". Basically, you want to emulate that feature for the guest but > only for this specific guest - the others shouldn't see it. > > > If that's the case, how do we decide how efficient emulation should be, > > to deserve inclusion in GET_SUPPORTED_CPUID? I am guessing that the > > criterion will be: if enabling it doesn't risk making performance worse, > > it can get in GET_SUPPORTED_CPUID. > > Well, in the MOVBE case, supported means, the host can execute this > instruction natively. Now, you guys say you can emulate x2apic very > efficiently and I'm guessing emulating x2apic doesn't bring any > emulation overhead, thus SUPPORTED_CPUID. x2apic emulation has nothing to do with x2apic in a host. It is emulated the same way no matter if the host has it or not. x2apic is not really a cpu feature, but an apic one, and the apic is fully emulated by KVM anyway. > > But for single instructions or group of instructions, the distinction > should be very clear. > > At least this is how I see it but Gleb probably can comment too. > That's how I see it too. 
Basically you want to use movbe emulation (as opposed to virtualization) only if you have a binary kernel that was compiled for a CPU with movbe (Borislav's use case), or you want to migrate temporarily from a movbe-enabled host to a non-movbe host because downtime is not an option. We should avoid enabling it "by mistake". -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Sep 24, 2013 at 01:04:14PM +0300, Gleb Natapov wrote: > On Tue, Sep 24, 2013 at 11:57:00AM +0200, Borislav Petkov wrote: > > On Mon, September 23, 2013 6:28 pm, Eduardo Habkost wrote: > > > On Sun, Sep 22, 2013 at 04:44:50PM +0200, Borislav Petkov wrote: > > >> From: Borislav Petkov <bp@suse.de> > > >> > > >> Add a kvm ioctl which states which system functionality kvm emulates. > > >> The format used is that of CPUID and we return the corresponding CPUID > > >> bits set for which we do emulate functionality. > > > > > > Let me check if I understood the purpose of the new ioctl correctly: the > > > only reason for GET_EMULATED_CPUID to exist is to allow userspace to > > > differentiate features that are native or that are emulated efficiently > > > (GET_SUPPORTED_CPUID) and features that are emulated not very > > > efficiently (GET_EMULATED_CPUID)? > > > > Not only that - emulated features are not reported in CPUID so they > > can be enabled only when specifically and explicitly requested, i.e. > > "+movbe". Basically, you want to emulate that feature for the guest but > > only for this specific guest - the others shouldn't see it. Then we may have a problem: some CPU models already have "movbe" included (e.g. Haswell), and patch 6/6 will make "-cpu Haswell" get movbe enabled even if it is being emulated. So if we really want to avoid enabling emulated features by mistake, we may need a new CPU flag in addition to "enforce" to tell QEMU that it is OK to enable emulated features (maybe "-cpu ...,emulate"?). > > > > > If that's the case, how do we decide how efficient emulation should be, > > > to deserve inclusion in GET_SUPPORTED_CPUID? I am guessing that the > > > criterion will be: if enabling it doesn't risk making performance worse, > > > it can get in GET_SUPPORTED_CPUID. > > > > Well, in the MOVBE case, supported means, the host can execute this > > instruction natively. 
Now, you guys say you can emulate x2apic very > > efficiently and I'm guessing emulating x2apic doesn't bring any > > emulation overhead, thus SUPPORTED_CPUID. > x2apic emulation has nothing to do with x2apic in a host. It is emulated > same way no matter if host has it or not. x2apic is not really cpu > feature, but apic one and apic is fully emulated by KVM anyway. But my question still stands: suppose we had x2apic emulation implemented but for some reason it was painfully slow, we wouldn't want to enable it by mistake. In this case, it would end up on EMULATED_CPUID and not on SUPPORTED_CPUID, right? > > > > > But for single instructions or group of instructions, the distinction > > should be very clear. > > > > At least this is how I see it but Gleb probably can comment too. > > > That's how I see it two. Basically you want to use movbe emulation (as > opposite of virtualization) only if you have binary kernel that compiled > for CPU with movbe (Borislav's use case), or you want to migrate > temporarily from movbe enabled host to non movbe host because downtime > is not an option. We should avoid enabling it "by mistake". "we should avoid enabling it 'by mistake'" sounds like a good criterion for including something on GET_EMULATED_CPUID instead of GET_SUPPORTED_CPUID. In that case, I believe QEMU should use GET_EMULATED_CPUID only if explicitly requested in the configuration/command-line (that's not what patch 6/6 does).
On Thu, Sep 26, 2013 at 11:19:15AM -0300, Eduardo Habkost wrote: > Then we may have a problem: some CPU models already have "movbe" > included (e.g. Haswell), and patch 6/6 will make "-cpu Haswell" get > movbe enabled even if it is being emulated. Huh? HSW has MOVBE so we won't #UD on it and MOVBE will get executed in hardware when executing the guest. IOW, we'll never get to the emulation path of piggybacking on the #UD. > So if we really want to avoid enabling emulated features by mistake, > we may need a new CPU flag in addition to "enforce" to tell QEMU that > it is OK to enable emulated features (maybe "-cpu ...,emulate"?). EMULATED_CPUID are off by default and only if you request them specifically, they get enabled. If you start with "-cpu Haswell", MOVBE will be already set in the host CPUID. Or am I missing something? > But my question still stands: suppose we had x2apic emulation > implemented but for some reason it was painfully slow, we wouldn't > want to enable it by mistake. In this case, it would end up on > EMULATED_CPUID and not on SUPPORTED_CPUID, right? IMHO we want to enable emulation only when explicitly requested... regardless of the emulation performance. Thanks.
On Thu, Sep 26, 2013 at 08:55:24PM +0200, Borislav Petkov wrote: > On Thu, Sep 26, 2013 at 11:19:15AM -0300, Eduardo Habkost wrote: > > Then we may have a problem: some CPU models already have "movbe" > > included (e.g. Haswell), and patch 6/6 will make "-cpu Haswell" get > > movbe enabled even if it is being emulated. > > Huh? HSW has MOVBE so we won't #UD on it and MOVBE will get executed in > hardware when executing the guest. IOW, we'll never get to the emulation > path of piggybacking on the #UD. > > > So if we really want to avoid enabling emulated features by mistake, > > we may need a new CPU flag in addition to "enforce" to tell QEMU that > > it is OK to enable emulated features (maybe "-cpu ...,emulate"?). > > EMULATED_CPUID are off by default and only if you request them > specifically, they get enabled. Please point me to the code that does this, because I don't see it on patch 6/6. > If you start with "-cpu Haswell", MOVBE > will be already set in the host CPUID. > > Or am I missing something? In the Haswell example, it is unlikely but possible in theory: you would need a CPU that supported all features from Haswell except movbe. But what will happen if you are using "-cpu n270,enforce" on a SandyBridge host? Also, we don't know anything about future CPUs or future features that will end up on EMULATED_CPUID. The current code doesn't have anything to differentiate features that were already included in the CPU definition and ones explicitly enabled in the command-line (and I would like to keep it that way). And just because a feature was explicitly enabled in the command-line, that doesn't mean the user believe it is acceptable to get it running in emulated mode. That's why I propose a new "emulate" flag, to allow features to be enabled in emulated mode. > > > But my question still stands: suppose we had x2apic emulation > > implemented but for some reason it was painfully slow, we wouldn't > > want to enable it by mistake. 
In this case, it would end up on > > EMULATED_CPUID and not on SUPPORTED_CPUID, right? > > IMHO we want to enable emulation only when explicitly requested... > regardless of the emulation performance. Well, x2apic is emulated by KVM, and it is on SUPPORTED_CPUID. Ditto for tsc-deadline. Or are you talking specifically about instruction emulation?
On Thu, Sep 26, 2013 at 04:20:59PM -0300, Eduardo Habkost wrote: > Please point me to the code that does this, because I don't see it on > patch 6/6. @@ -1850,7 +1850,14 @@ static void filter_features_for_kvm(X86CPU *cpu) wi->cpuid_ecx, wi->cpuid_reg); uint32_t requested_features = env->features[w]; + + uint32_t emul_features = kvm_arch_get_emulated_cpuid(s, wi->cpuid_eax, + wi->cpuid_ecx, + wi->cpuid_reg); + env->features[w] &= host_feat; + env->features[w] |= (requested_features & emul_features); Basically we give the requested_features a second chance here. If we don't request an emulated feature, it won't get enabled. > > If you start with "-cpu Haswell", MOVBE > > will be already set in the host CPUID. > > > > Or am I missing something? > > In the Haswell example, it is unlikely but possible in theory: you would > need a CPU that supported all features from Haswell except movbe. But > what will happen if you are using "-cpu n270,enforce" on a SandyBridge > host? That's an interesting question: AFAICT, it will fail because MOVBE is not available on the host, right? And if so, then this is correct behavior IMHO, or how exactly is the "enforce" thing supposed to work? Enforce host CPUID? > Also, we don't know anything about future CPUs or future features > that will end up on EMULATED_CPUID. The current code doesn't have > anything to differentiate features that were already included in the > CPU definition and ones explicitly enabled in the command-line (and I > would like to keep it that way). Ok. > And just because a feature was explicitly enabled in the command-line, > that doesn't mean the user believe it is acceptable to get it running > in emulated mode. That's why I propose a new "emulate" flag, to allow > features to be enabled in emulated mode. And I think, saying "-cpu ...,+movbe" is an explicit statement enough to say that yes, I am starting this guest and I want MOVBE emulation. > Well, x2apic is emulated by KVM, and it is on SUPPORTED_CPUID. 
Ditto > for tsc-deadline. Or are you talking specifically about instruction > emulation? Basically, I'm viewing this from a very practical standpoint - if I build a kernel which requires MOVBE support but I cannot boot it in kvm because it doesn't emulate MOVBE (TCG does now but it didn't before) I'd like to be able to address that shortcoming by emulating that instruction, if possible. And the whole discussion grew out from the standpoint of being able to emulate stuff so that you can do quick and dirty booting of kernels but not show that emulation capability to the wide audience since it is slow and it shouldn't be used and then migration has issues, etc, etc. But hey, I don't really care all that much if I have to also say -emulate in order to get my functionality. Thanks.
On Thu, Sep 26, 2013 at 10:32:06PM +0200, Borislav Petkov wrote: > On Thu, Sep 26, 2013 at 04:20:59PM -0300, Eduardo Habkost wrote: > > Please point me to the code that does this, because I don't see it on > > patch 6/6. > > @@ -1850,7 +1850,14 @@ static void filter_features_for_kvm(X86CPU *cpu) > wi->cpuid_ecx, > wi->cpuid_reg); > uint32_t requested_features = env->features[w]; > + > + uint32_t emul_features = kvm_arch_get_emulated_cpuid(s, wi->cpuid_eax, > + wi->cpuid_ecx, > + wi->cpuid_reg); > + > env->features[w] &= host_feat; > + env->features[w] |= (requested_features & emul_features); > > Basically we give the requested_features a second chance here. > > If we don't request an emulated feature, it won't get enabled. The problem here is that "requested_features" doesn't include just the explicit "+flag" flags, but any flag included in the CPU model definition. See the "-cpu n270" example below. > > > > If you start with "-cpu Haswell", MOVBE > > > will be already set in the host CPUID. > > > > > > Or am I missing something? > > > > In the Haswell example, it is unlikely but possible in theory: you would > > need a CPU that supported all features from Haswell except movbe. But > > what will happen if you are using "-cpu n270,enforce" on a SandyBridge > > host? > > That's an interesting question: AFAICT, it will fail because MOVBE is > not available on the host, right? It should, but your patch will make it stop failing because of MOVBE, as now it can be emulated[1]. > > And if so, then this is correct behavior IMHO, or how exactly is the > "enforce" thing supposed to work? Enforce host CPUID? "enforce" makes sure all features are really being enabled. It makes QEMU abort if there's any feature that can't be enabled on that host. [1] Maybe one source of confusion is that the existing code have two feature-filtering functions doing basically the same thing: filter_features_for_kvm() and kvm_check_features_against_host(). 
That's something we must clean up, and they should be unified. "enforce" should become synonymous to "make sure filtered_features is all zeroes". This way, libvirt can emulate what 'enforce" does while being able to collect detailed error information (which is not easy to do if QEMU simply aborts). > > > Also, we don't know anything about future CPUs or future features > > that will end up on EMULATED_CPUID. The current code doesn't have > > anything to differentiate features that were already included in the > > CPU definition and ones explicitly enabled in the command-line (and I > > would like to keep it that way). > > Ok. > > > And just because a feature was explicitly enabled in the command-line, > > that doesn't mean the user believe it is acceptable to get it running > > in emulated mode. That's why I propose a new "emulate" flag, to allow > > features to be enabled in emulated mode. > > And I think, saying "-cpu ...,+movbe" is an explicit statement enough to > say that yes, I am starting this guest and I want MOVBE emulation. Not necessarily. libvirt has some code that will translate its own CPU model definition to a "-cpu Model,+flag,+flag,+flag,-flag" command-line when necessary. It is by design that there is no difference between explicit "+flag" options and existing flags from the CPU model definition. > > > Well, x2apic is emulated by KVM, and it is on SUPPORTED_CPUID. Ditto > > for tsc-deadline. Or are you talking specifically about instruction > > emulation? > > Basically, I'm viewing this from a very practical standpoint - if I > build a kernel which requires MOVBE support but I cannot boot it in kvm > because it doesn't emulate MOVBE (TCG does now but it didn't before) > I'd like to be able to address that shortcoming by emulating that > instruction, if possible. 
> > And the whole discussion grew out from the standpoint of being able to > emulate stuff so that you can do quick and dirty booting of kernels but > not show that emulation capability to the wide audience since it is slow > and it shouldn't be used and then migration has issues, etc, etc. > > But hey, I don't really care all that much if I have to also say > -emulate in order to get my functionality. OK, I understand your use case, now. Thanks for your explanation.
On Fri, Sep 27, 2013 at 11:21:34AM -0300, Eduardo Habkost wrote: > The problem here is that "requested_features" doesn't include just > the explicit "+flag" flags, but any flag included in the CPU model > definition. See the "-cpu n270" example below. Oh, you mean if requested_features would contain a flag included from the CPU model definition - a flag which we haven't requested explicitly - and if kvm emulates that flag, then it will get enabled? Hmm. > It should, but your patch will make it stop failing because of MOVBE, as > now it can be emulated[1]. Right. > "enforce" makes sure all features are really being enabled. It makes > QEMU abort if there's any feature that can't be enabled on that host. Ok. > [1] Maybe one source of confusion is that the existing code have two > feature-filtering functions doing basically the same thing: > filter_features_for_kvm() and kvm_check_features_against_host(). That's Yes, and the first gets executed unconditionally and does the feature filtering, right after the second has run in the kvm_enabled() branch. > something we must clean up, and they should be unified. "enforce" should > become synonymous to "make sure filtered_features is all zeroes". This > way, libvirt can emulate what 'enforce" does while being able to collect > detailed error information (which is not easy to do if QEMU simply > aborts). Ok, maybe someone who's more knowledgeable with this code should do it - not me :) Also, there's another aspect, while we're here: now that QEMU emulates MOVBE with TCG too, how do we specify on the command line, which emulation should be used - kvm.ko or QEMU? Thanks.
On Sat, Sep 28, 2013 at 12:49:04PM +0200, Borislav Petkov wrote: > On Fri, Sep 27, 2013 at 11:21:34AM -0300, Eduardo Habkost wrote: > > The problem here is that "requested_features" doesn't include just > > the explicit "+flag" flags, but any flag included in the CPU model > > definition. See the "-cpu n270" example below. > > Oh, you mean if requested_features would contain a flag included from > the CPU model definition - a flag which we haven't requested explicitly > - and if kvm emulates that flag, then it will get enabled? Exactly. The code needs to filter/check all feature bits on the CPU, not just the ones requested explicitly in the command-line. [...] > > [1] Maybe one source of confusion is that the existing code have two > > feature-filtering functions doing basically the same thing: > > filter_features_for_kvm() and kvm_check_features_against_host(). That's > > Yes, and the first gets executed unconditionally and does the feature > filtering, right after the second has run in the kvm_enabled() branch. This should be fixed, too: eventually "enforce" should work on TCG mode as well. > > > something we must clean up, and they should be unified. "enforce" should > > become synonymous to "make sure filtered_features is all zeroes". This > > way, libvirt can emulate what 'enforce" does while being able to collect > > detailed error information (which is not easy to do if QEMU simply > > aborts). > > Ok, maybe someone who's more knowledgeable with this code should do it - > not me :) I have added it to my TODO-list. :-) > > Also, there's another aspect, while we're here: now that QEMU emulates > MOVBE with TCG too, how do we specify on the command line, which > emulation should be used - kvm.ko or QEMU? You can use accel={tcg,kvm} option on the "-machine" argument, e.g. "-machine pc,accel=kvm". Or the "-enable-kvm" option.
On Mon, Sep 30, 2013 at 01:13:34PM -0300, Eduardo Habkost wrote: > I have added it to my TODO-list. :-) Cool, thanks. Let me know if I can test stuff and help out somehow. > > > > Also, there's another aspect, while we're here: now that QEMU emulates > > MOVBE with TCG too, how do we specify on the command line, which > > emulation should be used - kvm.ko or QEMU? > > You can use accel={tcg,kvm} option on the "-machine" argument, e.g. > "-machine pc,accel=kvm". Or the "-enable-kvm" option. Ah, ok. Thanks.
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 858aecf21db2..7b70d670bb28 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1122,9 +1122,9 @@ struct kvm_cpuid2 { struct kvm_cpuid_entry2 entries[0]; }; -#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1 -#define KVM_CPUID_FLAG_STATEFUL_FUNC 2 -#define KVM_CPUID_FLAG_STATE_READ_NEXT 4 +#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX BIT(0) +#define KVM_CPUID_FLAG_STATEFUL_FUNC BIT(1) +#define KVM_CPUID_FLAG_STATE_READ_NEXT BIT(2) struct kvm_cpuid_entry2 { __u32 function; @@ -2661,6 +2661,77 @@ and usually define the validity of a groups of registers. (e.g. one bit }; +4.81 KVM_GET_EMULATED_CPUID + +Capability: KVM_CAP_EXT_EMUL_CPUID +Architectures: x86 +Type: system ioctl +Parameters: struct kvm_cpuid2 (in/out) +Returns: 0 on success, -1 on error + +struct kvm_cpuid2 { + __u32 nent; + __u32 flags; + struct kvm_cpuid_entry2 entries[0]; +}; + +The member 'flags' is used for passing flags from userspace. + +#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX BIT(0) +#define KVM_CPUID_FLAG_STATEFUL_FUNC BIT(1) +#define KVM_CPUID_FLAG_STATE_READ_NEXT BIT(2) + +struct kvm_cpuid_entry2 { + __u32 function; + __u32 index; + __u32 flags; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding[3]; +}; + +This ioctl returns x86 cpuid features which are emulated by +kvm. Userspace can use the information returned by this ioctl to query +which features are emulated by kvm instead of being present natively. + +Userspace invokes KVM_GET_EMULATED_CPUID by passing a kvm_cpuid2 +structure with the 'nent' field indicating the number of entries in +the variable-size array 'entries'. If the number of entries is too low +to describe the cpu capabilities, an error (E2BIG) is returned. If the +number is too high, the 'nent' field is adjusted and an error (ENOMEM) +is returned. 
If the number is just right, the 'nent' field is adjusted +to the number of valid entries in the 'entries' array, which is then +filled. + +The entries returned are the set CPUID bits of the respective features +which kvm emulates, as returned by the CPUID instruction, with unknown +or unsupported feature bits cleared. + +Features like x2apic, for example, may not be present in the host cpu +but are exposed by kvm in KVM_GET_SUPPORTED_CPUID because they can be +emulated efficiently and thus not included here. + +The fields in each entry are defined as follows: + + function: the eax value used to obtain the entry + index: the ecx value used to obtain the entry (for entries that are + affected by ecx) + flags: an OR of zero or more of the following: + KVM_CPUID_FLAG_SIGNIFCANT_INDEX: + if the index field is valid + KVM_CPUID_FLAG_STATEFUL_FUNC: + if cpuid for this function returns different values for successive + invocations; there will be several entries with the same function, + all with this flag set + KVM_CPUID_FLAG_STATE_READ_NEXT: + for KVM_CPUID_FLAG_STATEFUL_FUNC entries, set if this entry is + the first entry to be read by a cpu + eax, ebx, ecx, edx: the values returned by the cpuid instruction for + this function/index combination + + 6. 
Capabilities that can be enabled ----------------------------------- diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 5d9a3033b3d7..d3a87780c70b 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -211,9 +211,9 @@ struct kvm_cpuid_entry2 { __u32 padding[3]; }; -#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1 -#define KVM_CPUID_FLAG_STATEFUL_FUNC 2 -#define KVM_CPUID_FLAG_STATE_READ_NEXT 4 +#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX BIT(0) +#define KVM_CPUID_FLAG_STATEFUL_FUNC BIT(1) +#define KVM_CPUID_FLAG_STATE_READ_NEXT BIT(2) /* for KVM_SET_CPUID2 */ struct kvm_cpuid2 { diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index b110fe6c03d4..eca357095a49 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -187,8 +187,14 @@ static bool supported_xcr0_bit(unsigned bit) #define F(x) bit(X86_FEATURE_##x) -static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, - u32 index, int *nent, int maxnent) +static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry, + u32 func, u32 index, int *nent, int maxnent) +{ + return 0; +} + +static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, + u32 index, int *nent, int maxnent) { int r; unsigned f_nx = is_efer_nx() ? 
F(NX) : 0; @@ -481,6 +487,15 @@ out: return r; } +static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 func, + u32 idx, int *nent, int maxnent, unsigned int type) +{ + if (type == KVM_GET_EMULATED_CPUID) + return __do_cpuid_ent_emulated(entry, func, idx, nent, maxnent); + + return __do_cpuid_ent(entry, func, idx, nent, maxnent); +} + #undef F struct kvm_cpuid_param { @@ -495,8 +510,34 @@ static bool is_centaur_cpu(const struct kvm_cpuid_param *param) return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR; } -int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) +static bool sanity_check_entries(struct kvm_cpuid_entry2 __user *entries, + __u32 num_entries, unsigned int ioctl_type) +{ + int i; + + if (ioctl_type != KVM_GET_EMULATED_CPUID) + return false; + + /* + * We want to make sure that ->padding is being passed clean from + * userspace in case we want to use it for something in the future. + * + * Sadly, this wasn't enforced for KVM_GET_SUPPORTED_CPUID and so we + * have to give ourselves satisfied only with the emulated side. /me + * sheds a tear. 
+ */ + for (i = 0; i < num_entries; i++) { + if (entries[i].padding[0] || + entries[i].padding[1] || + entries[i].padding[2]) + return true; + } + return false; +} + +int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries, + unsigned int type) { struct kvm_cpuid_entry2 *cpuid_entries; int limit, nent = 0, r = -E2BIG, i; @@ -513,6 +554,10 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, goto out; if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) cpuid->nent = KVM_MAX_CPUID_ENTRIES; + + if (sanity_check_entries(entries, cpuid->nent, type)) + return -EINVAL; + r = -ENOMEM; cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); if (!cpuid_entries) @@ -526,7 +571,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, continue; r = do_cpuid_ent(&cpuid_entries[nent], ent->func, ent->idx, - &nent, cpuid->nent); + &nent, cpuid->nent, type); if (r) goto out_free; @@ -537,7 +582,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, limit = cpuid_entries[nent - 1].eax; for (func = ent->func + 1; func <= limit && nent < cpuid->nent && r == 0; ++func) r = do_cpuid_ent(&cpuid_entries[nent], func, ent->idx, - &nent, cpuid->nent); + &nent, cpuid->nent, type); if (r) goto out_free; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index b7fd07984888..f1e4895174b2 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -6,8 +6,9 @@ void kvm_update_cpuid(struct kvm_vcpu *vcpu); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function, u32 index); -int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries); +int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries, + unsigned int type); int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid *cpuid, struct kvm_cpuid_entry __user *entries); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 
e5ca72a5cdb6..8dfde7a52dab 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2564,6 +2564,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: case KVM_CAP_SET_TSS_ADDR: case KVM_CAP_EXT_CPUID: + case KVM_CAP_EXT_EMUL_CPUID: case KVM_CAP_CLOCKSOURCE: case KVM_CAP_PIT: case KVM_CAP_NOP_IO_DELAY: @@ -2673,15 +2674,17 @@ long kvm_arch_dev_ioctl(struct file *filp, r = 0; break; } - case KVM_GET_SUPPORTED_CPUID: { + case KVM_GET_SUPPORTED_CPUID: + case KVM_GET_EMULATED_CPUID: { struct kvm_cpuid2 __user *cpuid_arg = argp; struct kvm_cpuid2 cpuid; r = -EFAULT; if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) goto out; - r = kvm_dev_ioctl_get_supported_cpuid(&cpuid, - cpuid_arg->entries); + + r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries, + ioctl); if (r) goto out; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 99c25338ede8..bb986a1674a3 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -541,6 +541,7 @@ struct kvm_ppc_smmu_info { #define KVM_TRACE_ENABLE __KVM_DEPRECATED_MAIN_W_0x06 #define KVM_TRACE_PAUSE __KVM_DEPRECATED_MAIN_0x07 #define KVM_TRACE_DISABLE __KVM_DEPRECATED_MAIN_0x08 +#define KVM_GET_EMULATED_CPUID _IOWR(KVMIO, 0x09, struct kvm_cpuid2) /* * Extension capability list. @@ -668,6 +669,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_IRQ_XICS 92 #define KVM_CAP_ARM_EL1_32BIT 93 #define KVM_CAP_SPAPR_MULTITCE 94 +#define KVM_CAP_EXT_EMUL_CPUID 95 #ifdef KVM_CAP_IRQ_ROUTING