Message ID: 20211011223611.069324121@linutronix.de (mailing list archive)
Series: x86/fpu: Preparatory cleanups for AMX support (part 1)
Just typos: On Tue, Oct 12, 2021 at 02:00:17AM +0200, Thomas Gleixner wrote: > Swapping the host/guest FPU is directly fiddling with FPU internals which > requires 5 exports. The upcoming support of dymanically enabled states "dynamically" > /* > * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It > * disables preemption so be careful if you intend to use it for long periods > @@ -108,4 +110,10 @@ extern int cpu_has_xfeatures(u64 xfeatur > > static inline void update_pasid(void) { } > > +/* FPSTATE related functions which are exported to KVM */ fpstate-related > +extern void fpu_init_fpstate_user(struct fpu *fpu); > + > +/* KVM specific functions */ > +extern void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask); > + > #endif /* _ASM_X86_FPU_API_H */ ... > /* Swap (qemu) user FPU context for the guest FPU context. */ > static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) > { > - fpregs_lock(); > - > - kvm_save_current_fpu(vcpu->arch.user_fpu); > - > /* > - * Guests with protected state can't have it set by the hypervisor, > - * so skip trying to set it. > + * Guest with protected state have guest_fpu == NULL which makes "Guests ... " > + * the swap only safe the host state. Exclude PKRU from restore as "save" > + * it is restored separately in kvm_x86_ops.run(). > */ > - if (vcpu->arch.guest_fpu) > - /* PKRU is separately restored in kvm_x86_ops.run. */ > - __restore_fpregs_from_fpstate(&vcpu->arch.guest_fpu->state, > - ~XFEATURE_MASK_PKRU); > - > - fpregs_mark_activate(); > - fpregs_unlock(); > - > + fpu_swap_kvm_fpu(vcpu->arch.user_fpu, vcpu->arch.guest_fpu, > + ~XFEATURE_MASK_PKRU); > trace_kvm_fpu(1); > } > > /* When vcpu_run ends, restore user space FPU context. */ > static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) > { > - fpregs_lock(); > - > /* > - * Guests with protected state can't have it read by the hypervisor, > - * so skip trying to save it. > + * Guest with protected state have guest_fpu == NULL which makes "Guests ... " > + * swap only restore the host state. > */ > - if (vcpu->arch.guest_fpu) > - kvm_save_current_fpu(vcpu->arch.guest_fpu); > - > - restore_fpregs_from_fpstate(&vcpu->arch.user_fpu->state); > - > - fpregs_mark_activate(); > - fpregs_unlock(); > - > + fpu_swap_kvm_fpu(vcpu->arch.guest_fpu, vcpu->arch.user_fpu, ~0ULL); > ++vcpu->stat.fpu_reload; > trace_kvm_fpu(0); > } > --- a/arch/x86/mm/extable.c > +++ b/arch/x86/mm/extable.c > @@ -47,7 +47,7 @@ static bool ex_handler_fprestore(const s > WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.", > (void *)instruction_pointer(regs)); > > - __restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate()); > + restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate()); > return true; > } > >
On 12/10/21 02:00, Thomas Gleixner wrote: > Swapping the host/guest FPU is directly fiddling with FPU internals which > requires 5 exports. The upcoming support of dymanically enabled states > would even need more. > > Implement a swap function in the FPU core code and export that instead. > > Signed-off-by: Thomas Gleixner <tglx@linutronix.de> > Cc: kvm@vger.kernel.org > Cc: Paolo Bonzini <pbonzini@redhat.com> > --- > arch/x86/include/asm/fpu/api.h | 8 +++++ > arch/x86/include/asm/fpu/internal.h | 15 +--------- > arch/x86/kernel/fpu/core.c | 30 ++++++++++++++++++--- > arch/x86/kernel/fpu/init.c | 1 > arch/x86/kernel/fpu/xstate.c | 1 > arch/x86/kvm/x86.c | 51 +++++++----------------------------- > arch/x86/mm/extable.c | 2 - > 7 files changed, 48 insertions(+), 60 deletions(-) > > --- a/arch/x86/include/asm/fpu/api.h > +++ b/arch/x86/include/asm/fpu/api.h > @@ -12,6 +12,8 @@ > #define _ASM_X86_FPU_API_H > #include <linux/bottom_half.h> > > +#include <asm/fpu/types.h> > + > /* > * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It > * disables preemption so be careful if you intend to use it for long periods > @@ -108,4 +110,10 @@ extern int cpu_has_xfeatures(u64 xfeatur > > static inline void update_pasid(void) { } > > +/* FPSTATE related functions which are exported to KVM */ > +extern void fpu_init_fpstate_user(struct fpu *fpu); > + > +/* KVM specific functions */ > +extern void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask); > + > #endif /* _ASM_X86_FPU_API_H */ > --- a/arch/x86/include/asm/fpu/internal.h > +++ b/arch/x86/include/asm/fpu/internal.h > @@ -74,14 +74,8 @@ static __always_inline __pure bool use_f > return static_cpu_has(X86_FEATURE_FXSR); > } > > -/* > - * fpstate handling functions: > - */ > - > extern union fpregs_state init_fpstate; > - > extern void fpstate_init_user(union fpregs_state *state); > -extern void fpu_init_fpstate_user(struct fpu *fpu); > > #ifdef CONFIG_MATH_EMULATION > extern void fpstate_init_soft(struct swregs_state *soft); > @@ -381,12 +375,7 @@ static inline int os_xrstor_safe(struct > return err; > } > > -extern void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask); > - > -static inline void restore_fpregs_from_fpstate(union fpregs_state *fpstate) > -{ > - __restore_fpregs_from_fpstate(fpstate, xfeatures_mask_fpstate()); > -} > +extern void restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask); > > extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); > > @@ -467,7 +456,7 @@ static inline void fpregs_restore_userre > */ > mask = xfeatures_mask_restore_user() | > xfeatures_mask_supervisor(); > - __restore_fpregs_from_fpstate(&fpu->state, mask); > + restore_fpregs_from_fpstate(&fpu->state, mask); > > fpregs_activate(fpu); > fpu->last_cpu = cpu; > --- a/arch/x86/kernel/fpu/core.c > +++ b/arch/x86/kernel/fpu/core.c > @@ -124,9 +124,8 @@ void save_fpregs_to_fpstate(struct fpu * > asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave)); > frstor(&fpu->state.fsave); > } > -EXPORT_SYMBOL(save_fpregs_to_fpstate); > > -void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask) > +void restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask) > { > /* > * AMD K7/K8 and later CPUs up to Zen don't save/restore > @@ -151,7 +150,31 @@ void __restore_fpregs_from_fpstate(union > frstor(&fpstate->fsave); > } > } > -EXPORT_SYMBOL_GPL(__restore_fpregs_from_fpstate); > + > +#if IS_ENABLED(CONFIG_KVM) > +void 
fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask) > +{ > + fpregs_lock(); > + > + if (save) { > + if (test_thread_flag(TIF_NEED_FPU_LOAD)) { > + memcpy(&save->state, ¤t->thread.fpu.state, > + fpu_kernel_xstate_size); > + } else { > + save_fpregs_to_fpstate(save); > + } > + } > + > + if (rstor) { > + restore_mask &= xfeatures_mask_fpstate(); > + restore_fpregs_from_fpstate(&rstor->state, restore_mask); > + } > + > + fpregs_mark_activate(); > + fpregs_unlock(); > +} > +EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpu); > +#endif > > void kernel_fpu_begin_mask(unsigned int kfpu_mask) > { > @@ -459,7 +482,6 @@ void fpregs_mark_activate(void) > fpu->last_cpu = smp_processor_id(); > clear_thread_flag(TIF_NEED_FPU_LOAD); > } > -EXPORT_SYMBOL_GPL(fpregs_mark_activate); > > /* > * x87 math exception handling: > --- a/arch/x86/kernel/fpu/init.c > +++ b/arch/x86/kernel/fpu/init.c > @@ -136,7 +136,6 @@ static void __init fpu__init_system_gene > * components into a single, continuous memory block: > */ > unsigned int fpu_kernel_xstate_size __ro_after_init; > -EXPORT_SYMBOL_GPL(fpu_kernel_xstate_size); > > /* Get alignment of the TYPE. */ > #define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test) > --- a/arch/x86/kernel/fpu/xstate.c > +++ b/arch/x86/kernel/fpu/xstate.c > @@ -65,7 +65,6 @@ static short xsave_cpuid_features[] __in > * XSAVE buffer, both supervisor and user xstates. > */ > u64 xfeatures_mask_all __ro_after_init; > -EXPORT_SYMBOL_GPL(xfeatures_mask_all); > > static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = > { [ 0 ... XFEATURE_MAX - 1] = -1}; > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -68,7 +68,9 @@ > #include <asm/mce.h> > #include <asm/pkru.h> > #include <linux/kernel_stat.h> > -#include <asm/fpu/internal.h> /* Ugh! */ > +#include <asm/fpu/api.h> > +#include <asm/fpu/xcr.h> > +#include <asm/fpu/xstate.h> > #include <asm/pvclock.h> > #include <asm/div64.h> > #include <asm/irq_remapping.h> > @@ -9899,58 +9901,27 @@ static int complete_emulated_mmio(struct > return 0; > } > > -static void kvm_save_current_fpu(struct fpu *fpu) > -{ > - /* > - * If the target FPU state is not resident in the CPU registers, just > - * memcpy() from current, else save CPU state directly to the target. > - */ > - if (test_thread_flag(TIF_NEED_FPU_LOAD)) > - memcpy(&fpu->state, ¤t->thread.fpu.state, > - fpu_kernel_xstate_size); > - else > - save_fpregs_to_fpstate(fpu); > -} > - > /* Swap (qemu) user FPU context for the guest FPU context. */ > static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) > { > - fpregs_lock(); > - > - kvm_save_current_fpu(vcpu->arch.user_fpu); > - > /* > - * Guests with protected state can't have it set by the hypervisor, > - * so skip trying to set it. > + * Guest with protected state have guest_fpu == NULL which makes > + * the swap only safe the host state. Exclude PKRU from restore as > + * it is restored separately in kvm_x86_ops.run(). > */ > - if (vcpu->arch.guest_fpu) > - /* PKRU is separately restored in kvm_x86_ops.run. */ > - __restore_fpregs_from_fpstate(&vcpu->arch.guest_fpu->state, > - ~XFEATURE_MASK_PKRU); > - > - fpregs_mark_activate(); > - fpregs_unlock(); > - > + fpu_swap_kvm_fpu(vcpu->arch.user_fpu, vcpu->arch.guest_fpu, > + ~XFEATURE_MASK_PKRU); > trace_kvm_fpu(1); > } > > /* When vcpu_run ends, restore user space FPU context. 
*/ > static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) > { > - fpregs_lock(); > - > /* > - * Guests with protected state can't have it read by the hypervisor, > - * so skip trying to save it. > + * Guest with protected state have guest_fpu == NULL which makes > + * swap only restore the host state. > */ > - if (vcpu->arch.guest_fpu) > - kvm_save_current_fpu(vcpu->arch.guest_fpu); > - > - restore_fpregs_from_fpstate(&vcpu->arch.user_fpu->state); > - > - fpregs_mark_activate(); > - fpregs_unlock(); > - > + fpu_swap_kvm_fpu(vcpu->arch.guest_fpu, vcpu->arch.user_fpu, ~0ULL); > ++vcpu->stat.fpu_reload; > trace_kvm_fpu(0); > } > --- a/arch/x86/mm/extable.c > +++ b/arch/x86/mm/extable.c > @@ -47,7 +47,7 @@ static bool ex_handler_fprestore(const s > WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.", > (void *)instruction_pointer(regs)); > > - __restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate()); > + restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate()); > return true; > } > > Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
On Tue, Oct 12 2021 at 18:53, Borislav Petkov wrote: > On Tue, Oct 12, 2021 at 02:00:17AM +0200, Thomas Gleixner wrote: >> /* >> - * Guests with protected state can't have it set by the hypervisor, >> - * so skip trying to set it. >> + * Guest with protected state have guest_fpu == NULL which makes > > "Guests ... " > >> + * the swap only safe the host state. Exclude PKRU from restore as > > "save" No I meant safe, but let me rephrase it, Swap does both save and restore. But it's not safe to dereference a NULL pointer :) .... makes the swap only handle the host state. Exclude PKRU from restore as Thanks, tglx
On Tue, Oct 12 2021 at 20:25, Thomas Gleixner wrote: > On Tue, Oct 12 2021 at 18:53, Borislav Petkov wrote: >> On Tue, Oct 12, 2021 at 02:00:17AM +0200, Thomas Gleixner wrote: >>> /* >>> - * Guests with protected state can't have it set by the hypervisor, >>> - * so skip trying to set it. >>> + * Guest with protected state have guest_fpu == NULL which makes >> >> "Guests ... " >> >>> + * the swap only safe the host state. Exclude PKRU from restore as >> >> "save" > > No I meant safe, but let me rephrase it, Swap does both save and > restore. But it's not safe to dereference a NULL pointer :) > > .... makes the swap only handle the host state. Exclude PKRU from restore as Gah. I should have looked at the context first. "save" is correct here. Oh well...
> On 12/10/21 02:00, Thomas Gleixner wrote: > > Swapping the host/guest FPU is directly fiddling with FPU internals > > which requires 5 exports. The upcoming support of dymanically enabled > > states would even need more. > > > > Implement a swap function in the FPU core code and export that instead. > > > > Signed-off-by: Thomas Gleixner <tglx@linutronix.de> > > Cc: kvm@vger.kernel.org > > Cc: Paolo Bonzini <pbonzini@redhat.com> > > --- > > arch/x86/include/asm/fpu/api.h | 8 +++++ > > arch/x86/include/asm/fpu/internal.h | 15 +--------- > > arch/x86/kernel/fpu/core.c | 30 ++++++++++++++++++--- > > arch/x86/kernel/fpu/init.c | 1 > > arch/x86/kernel/fpu/xstate.c | 1 > > arch/x86/kvm/x86.c | 51 +++++++----------------------------- > > arch/x86/mm/extable.c | 2 - > > 7 files changed, 48 insertions(+), 60 deletions(-) > > When looking into the tglx/devel.git x86/fpu for the full #1-#4 series and the KVM AMX support, I'd like to discuss two things: 1. KVM dynamic allocation API: Since KVM also uses dynamic allocation, after KVM detects the guest requesting AMX via an #NM trap, KVM needs to allocate extra buffers for this vcpu's current->thread.fpu.fpstate and the related guest_fpu. So far, the kernel itself has such an API, fpstate_realloc(), but it's static. How about making a common function usable for KVM? 2. There is a case where *guest AMX state can be lost*: After KVM passthrough XFD to guest, when vmexit opening irq window and KVM is interrupted, kernel softirq path can call kernel_fpu_begin() to touch xsave state. This function does XSAVES. If guest XFD[18] is 1, and with guest AMX state in register, then guest AMX state is lost by XSAVES. A detailed example call trace is in commit 2620fe268e80d667a94553cd37a94ccaa2cb8c83 Author: Sean Christopherson <seanjc@google.com> Date: Fri Jan 17 11:30:51 2020 -0800 KVM: x86: Revert "KVM: X86: Fix fpu state crash in kvm guest" Reload the current thread's FPU state, which contains the guest's FPU state, to the CPU registers if necessary during vcpu_enter_guest(). TIF_NEED_FPU_LOAD can be set any time control is transferred out of KVM, e.g. if I/O is triggered during a KVM call to get_user_pages() or if a softirq occurs while KVM is scheduled in. ... A sample trace triggered by warning if TIF_NEED_FPU_LOAD is set while vcpu state is loaded: <IRQ> gcmaes_crypt_by_sg.constprop.12+0x26e/0x660 ? 0xffffffffc024547d ? __qdisc_run+0x83/0x510 ? __dev_queue_xmit+0x45e/0x990 ... ? do_IRQ+0x7f/0xd0 ? common_interrupt+0xf/0xf </IRQ> ? irq_entries_start+0x20/0x660 ? vmx_get_interrupt_shadow+0x2f0/0x710 [kvm_intel] ? kvm_set_msr_common+0xfc7/0x2380 [kvm] ? recalibrate_cpu_khz+0x10/0x10 ? ktime_get+0x3a/0xa0 ? kvm_arch_vcpu_ioctl_run+0x107/0x560 [kvm] ? kvm_init+0x6bf/0xd00 [kvm] For this case, I think one way is for the kernel to do something before XSAVES for a KVM thread; another way is to let KVM fix it: maintain a zero XFD value (via current->state.fpu.fpstate->xfd = 0) after the vcpu fpu state is loaded, and restore the real guest XFD value before vmenter. The logic is as follows. after vmexit: if XFD is passthrough then sync guest XFD to vmx->xfd; set XFD to current->state.fpu.fpstate->xfd (= 0) __this_cpu_write(xfd_state, 0); before vmenter (irq is disabled): if passthrough then restore to real guest XFD by vmx->xfd; vcpu_run: (if XFD is passthrough) load: swap from qemu's to a zero XFD put: swap zero to qemu's Thanks, Jing [...]
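A minimal C sketch of the zero-XFD-while-in-host logic outlined above; vmx->xfd_passthrough, vmx->guest_xfd and the per-CPU xfd_state variable are illustrative names for this sketch, not code from the series:

#include <linux/percpu.h>
#include <asm/msr.h>

#define MSR_IA32_XFD	0x000001c4	/* architectural, per the Intel ISE */

static DEFINE_PER_CPU(u64, xfd_state);	/* per-CPU shadow of the XFD MSR */

struct vcpu_xfd {			/* illustrative container */
	bool xfd_passthrough;
	u64 guest_xfd;
};

/* After vmexit: park XFD at 0 so a softirq's XSAVES cannot drop guest AMX state. */
static void xfd_sync_after_vmexit(struct vcpu_xfd *vmx)
{
	if (!vmx->xfd_passthrough)
		return;
	rdmsrl(MSR_IA32_XFD, vmx->guest_xfd);	/* sync guest XFD to vmx->guest_xfd */
	wrmsrl(MSR_IA32_XFD, 0);
	__this_cpu_write(xfd_state, 0);
}

/* Before vmenter (irqs disabled): reinstate the real guest XFD value. */
static void xfd_restore_before_vmenter(struct vcpu_xfd *vmx)
{
	if (vmx->xfd_passthrough)
		wrmsrl(MSR_IA32_XFD, vmx->guest_xfd);
}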
On 13/10/21 08:15, Liu, Jing2 wrote: > After KVM passthrough XFD to guest, when vmexit opening > irq window and KVM is interrupted, kernel softirq path can call > kernel_fpu_begin() to touch xsave state. This function does > XSAVES. If guest XFD[18] is 1, and with guest AMX state in register, > then guest AMX state is lost by XSAVES. Yes, the host value of XFD (which is zero) has to be restored after vmexit. See how KVM already handles SPEC_CTRL. Passthrough of XFD is only enabled after the guest has caused an #NM vmexit and the full XSAVE state has been dynamically allocated, therefore it is always possible to do an XSAVES even from atomic context. Paolo
> On 13/10/21 08:15, Liu, Jing2 wrote: > > After KVM passthrough XFD to guest, when vmexit opening irq window and > > KVM is interrupted, kernel softirq path can call > > kernel_fpu_begin() to touch xsave state. This function does XSAVES. If > > guest XFD[18] is 1, and with guest AMX state in register, then guest > > AMX state is lost by XSAVES. > > Yes, the host value of XFD (which is zero) has to be restored after vmexit. > See how KVM already handles SPEC_CTRL. > I'm trying to understand why qemu's XFD is zero after kernel supports AMX. Do you mean that in the guest #NM trap KVM also allocates an extra user_fpu buffer and clears qemu's XFD? But why do we need to do that? I think only when qemu userspace requests AMX permission and executes an AMX instruction generating a host #NM does the host kernel clear qemu's XFD[18]. If a guest #NM is trapped, KVM *doesn't* need to clear the host's XFD; it only needs to allocate guest_fpu's buffer and current->thread.fpu's buffer, and clear the guest's XFD. > Passthrough of XFD is only enabled after the guest has caused an #NM > vmexit Yes, passthrough is done by two cases: one is guest #NM trapped; another is guest clearing XFD before it generates #NM (this is possible for guest), then passthrough. For the two cases, we passthrough and allocate buffer for guest_fpu, and current->thread.fpu. Thanks, Jing > and the full XSAVE state has been dynamically allocated, therefore it > is always possible to do an XSAVES even from atomic context. > > Paolo
On 13/10/21 09:46, Liu, Jing2 wrote: > >> On 13/10/21 08:15, Liu, Jing2 wrote: >>> After KVM passthrough XFD to guest, when vmexit opening irq window and >>> KVM is interrupted, kernel softirq path can call >>> kernel_fpu_begin() to touch xsave state. This function does XSAVES. If >>> guest XFD[18] is 1, and with guest AMX state in register, then guest >>> AMX state is lost by XSAVES. >> >> Yes, the host value of XFD (which is zero) has to be restored after vmexit. >> See how KVM already handles SPEC_CTRL. > > I'm trying to understand why qemu's XFD is zero after kernel supports AMX. There are three copies of XFD: - the guest value stored in vcpu->arch. - the "QEMU" value attached to host_fpu. This one only becomes zero if QEMU requires AMX (which shouldn't happen). - the internal KVM value attached to guest_fpu. When #NM happens, this one becomes zero. The CPU value is: - the host_fpu value before kvm_load_guest_fpu and after kvm_put_guest_fpu. This ensures that QEMU context switch is as cheap as possible. - the guest_fpu value between kvm_load_guest_fpu and kvm_put_guest_fpu. This ensures that no state is lost in the case you are describing. - the OR of the guest value and the guest_fpu value while the guest runs (using either MSR load/save lists, or manual wrmsr like pt_guest_enter/pt_guest_exit). This ensures that the host has the opportunity to get a #NM exception, and allocate AMX state in the guest_fpu and in current->thread.fpu. > Yes, passthrough is done by two cases: one is guest #NM trapped; > another is guest clearing XFD before it generates #NM (this is possible for > guest), then passthrough. > For the two cases, we passthrough and allocate buffer for guest_fpu, and > current->thread.fpu. I think it's simpler to always wait for #NM, it will only happen once per vCPU. In other words, even if the guest clears XFD before it generates #NM, the guest_fpu's XFD remains nonzero and an #NM vmexit is possible. After #NM the guest_fpu's XFD is zero; then passthrough can happen and the #NM vmexit trap can be disabled. Paolo
On Wed, Oct 13, 2021, at 1:42 AM, Paolo Bonzini wrote: > On 13/10/21 09:46, Liu, Jing2 wrote: >> >>> On 13/10/21 08:15, Liu, Jing2 wrote: >>>> After KVM passthrough XFD to guest, when vmexit opening irq window and >>>> KVM is interrupted, kernel softirq path can call >>>> kernel_fpu_begin() to touch xsave state. This function does XSAVES. If >>>> guest XFD[18] is 1, and with guest AMX state in register, then guest >>>> AMX state is lost by XSAVES. >>> >>> Yes, the host value of XFD (which is zero) has to be restored after vmexit. >>> See how KVM already handles SPEC_CTRL. >> >> I'm trying to understand why qemu's XFD is zero after kernel supports AMX. > > There are three copies of XFD: > > - the guest value stored in vcpu->arch. > > - the "QEMU" value attached to host_fpu. This one only becomes zero if > QEMU requires AMX (which shouldn't happen). > > - the internal KVM value attached to guest_fpu. When #NM happens, this > one becomes zero. > > > The CPU value is: > > - the host_fpu value before kvm_load_guest_fpu and after > kvm_put_guest_fpu. This ensures that QEMU context switch is as cheap as > possible. > > - the guest_fpu value between kvm_load_guest_fpu and kvm_put_guest_fpu. > This ensures that no state is lost in the case you are describing. > > - the OR of the guest value and the guest_fpu value while the guest runs > (using either MSR load/save lists, or manual wrmsr like > pt_guest_enter/pt_guest_exit). This ensures that the host has the > opportunity to get a #NM exception, and allocate AMX state in the > guest_fpu and in current->thread.fpu. > >> Yes, passthrough is done by two cases: one is guest #NM trapped; >> another is guest clearing XFD before it generates #NM (this is possible for >> guest), then passthrough. >> For the two cases, we passthrough and allocate buffer for guest_fpu, and >> current->thread.fpu. > > I think it's simpler to always wait for #NM, it will only happen once > per vCPU. In other words, even if the guest clears XFD before it > generates #NM, the guest_fpu's XFD remains nonzero and an #NM vmexit is > possible. After #NM the guest_fpu's XFD is zero; then passthrough can > happen and the #NM vmexit trap can be disabled. This will stop being at all optimal when Intel inevitably adds another feature that uses XFD. In the potentially infinite window in which the guest manages XFD and #NM on behalf of its userspace and when the guest allocates the other hypothetical feature, all the #NMs will have to be trapped by KVM. Is it really worthwhile for KVM to use XFD at all instead of preallocating the state and being done with it? KVM would still have to avoid data loss if the guest sets XFD with non-init state, but #NM could always pass through.
On 13/10/21 10:42, Paolo Bonzini wrote: > On 13/10/21 09:46, Liu, Jing2 wrote: > > > >> On 13/10/21 08:15, Liu, Jing2 wrote: > >>> After KVM passthrough XFD to guest, when vmexit opening irq window > >>> and KVM is interrupted, kernel softirq path can call > >>> kernel_fpu_begin() to touch xsave state. This function does XSAVES. > >>> If guest XFD[18] is 1, and with guest AMX state in register, then > >>> guest AMX state is lost by XSAVES. > >> > >> Yes, the host value of XFD (which is zero) has to be restored after vmexit. > >> See how KVM already handles SPEC_CTRL. > > > > I'm trying to understand why qemu's XFD is zero after kernel supports AMX. > > There are three copies of XFD: > > - the guest value stored in vcpu->arch. OK, let's call it e.g. vcpu->arch.xfd [...] > - the internal KVM value attached to guest_fpu. When #NM happens, this > one becomes zero. > The CPU value is: > > - the guest_fpu value between kvm_load_guest_fpu and kvm_put_guest_fpu. > This ensures that no state is lost in the case you are describing. > OK, you mean using guest_fpu as a KVM value. Let me describe the flow to see if anything is missing. When the #NM trap makes it passthrough, guest_fpu XFD is set to 0 and stays there forever. (Don't change HW XFD, which is still 1.) In the #NM trap, KVM allocates the buffer and re-injects a #NM exception into the guest to make the guest kernel allocate its thread buffer. Then on the next vmexit, KVM syncs vcpu->arch.xfd, loads the guest_fpu value (=0) and updates current->thread.fpu XFD to 0 for kernel reference. > - the OR of the guest value and the guest_fpu value while the guest runs > (using either MSR load/save lists, or manual wrmsr like > pt_guest_enter/pt_guest_exit). This ensures that the host has the > opportunity to get a #NM exception, and allocate AMX state in the > guest_fpu and in current->thread.fpu. > > Yes, passthrough is done by two cases: one is guest #NM trapped; > another is guest clearing XFD before it generates #NM (this is possible for > guest), then passthrough. > For the two cases, we passthrough and allocate buffer for guest_fpu, and > current->thread.fpu. > I think it's simpler to always wait for #NM, it will only happen once > per vCPU. In other words, even if the guest clears XFD before it > generates #NM, the guest_fpu's XFD remains nonzero You mean a wrmsr trap doesn't do anything and just returns? In this case, on the next vmenter, the OR of the guest value (vcpu->arch.xfd) and the guest_fpu value is still 1, so doesn't this violate the guest's HW assumption? (The guest finds the wrmsr didn't work.) Thanks, Jing > and an #NM vmexit is > possible. After #NM the guest_fpu's XFD is zero; then passthrough can > happen and the #NM vmexit trap can be disabled. > > Paolo
On 13/10/21 12:14, Andy Lutomirski wrote: >> I think it's simpler to always wait for #NM, it will only happen >> once per vCPU. In other words, even if the guest clears XFD before >> it generates #NM, the guest_fpu's XFD remains nonzero and an #NM >> vmexit is possible. After #NM the guest_fpu's XFD is zero; then >> passthrough can happen and the #NM vmexit trap can be disabled. > > This will stop being at all optimal when Intel inevitably adds > another feature that uses XFD. In the potentially infinite window in > which the guest manages XFD and #NM on behalf of its userspace and > when the guest allocates the other hypothetical feature, all the #NMs > will have to be trapped by KVM. The reason is that it's quite common to simply let the guest see all CPUID bits that KVM knows about. But it's not unlikely that most guests will not ever use any XFD feature, and therefore will not ever see an #NM. I wouldn't have any problem with allocating _all_ of the dynamic state space on the first #NM. Thinking more about it, #NM only has to be trapped if XCR0 enables a dynamic feature. In other words, the guest value of XFD can be limited to (host_XFD|guest_XFD) & guest_XCR0. This avoids that KVM unnecessarily traps for old guests that use CR0.TS. Paolo > Is it really worthwhile for KVM to use XFD at all instead of > preallocating the state and being done with it? KVM would still have > to avoid data loss if the guest sets XFD with non-init state, but #NM > could always pass through. >
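As a sketch, the masking Paolo describes above; it assumes vcpu->arch.xfd holds the guest-written XFD value (the field and helper names are assumptions, not existing KVM code):

static u64 kvm_effective_guest_xfd(struct kvm_vcpu *vcpu, u64 host_xfd)
{
	/*
	 * A guest that never enables a dynamic feature in XCR0 gets an
	 * effective XFD of 0 for those bits, so its legacy CR0.TS-based
	 * #NM handling never requires interception.
	 */
	return (host_xfd | vcpu->arch.xfd) & vcpu->arch.xcr0;
}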
On 13/10/21 12:25, Liu, Jing2 wrote: > [...] >> - the internal KVM value attached to guest_fpu. When #NM happens, this >> one becomes zero. > >> The CPU value is: >> >> - the guest_fpu value between kvm_load_guest_fpu and kvm_put_guest_fpu. >> This ensures that no state is lost in the case you are describing. >> > > OK, you mean using guest_fpu as a KVM value. Let me describe the > flow to see if anything is missing. > > When the #NM trap makes it passthrough, guest_fpu XFD is set to 0 and stays > there forever. (Don't change HW XFD, which is still 1.) > In the #NM trap, KVM allocates the buffer and re-injects a #NM exception into > the guest to make the guest kernel allocate its thread buffer. > Then on the next vmexit, KVM syncs vcpu->arch.xfd, loads the guest_fpu value (=0) and > updates current->thread.fpu XFD to 0 for kernel reference. In the #NM handler, KVM allocates the buffer and the guest_fpu XFD becomes zero. Also because the guest_fpu XFD is zero: - #NM vmexits are disabled. More precisely, trapping #NM is only necessary if guest_fpu->xfd & ~vcpu->arch.xfd & vcpu->arch.xcr0 is nonzero (i.e. only if there is a state that is guest_fpu-disabled, but enabled according to both XFD and XCR0). - On the next vmentry XFD is set to just vcpu->arch.xfd and the instruction is retried. If the instruction causes an #NM in the guest, it is not trapped and delivered normally to the guest. >> I think it's simpler to always wait for #NM, it will only happen once >> per vCPU. In other words, even if the guest clears XFD before it >> generates #NM, the guest_fpu's XFD remains nonzero > > You mean a wrmsr trap doesn't do anything and just returns? The guest might run with the same XFD value as before (which is guest_fpu->xfd | vcpu->arch.xfd), but vcpu->arch.xfd is changed. The value in vcpu->arch.xfd will be read back by an RDMSR, because passthrough is not enabled and the RDMSR will cause a vmexit. Once an #NM is received and guest_fpu->xfd becomes zero, passthrough can be enabled. Paolo > In this case, on the next vmenter, the OR of the guest value > (vcpu->arch.xfd) and the guest_fpu value is still 1, so doesn't this > violate the guest's HW assumption? (The guest finds the wrmsr > didn't work.)
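The interception rule above written out as a sketch; guest_fpu_xfd stands in for the XFD value attached to guest_fpu, and the helper name is invented:

static bool kvm_needs_nm_intercept(struct kvm_vcpu *vcpu, u64 guest_fpu_xfd)
{
	/*
	 * Trap #NM only while some state is still guest_fpu-disabled
	 * (bit set in guest_fpu_xfd) yet enabled by the guest in both
	 * XFD (bit clear) and XCR0 (bit set). Once guest_fpu_xfd is 0,
	 * this is always false and the #NM vmexit can be disabled.
	 */
	return (guest_fpu_xfd & ~vcpu->arch.xfd & vcpu->arch.xcr0) != 0;
}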
Paolo, On Wed, Oct 13 2021 at 10:42, Paolo Bonzini wrote: > On 13/10/21 09:46, Liu, Jing2 wrote: >>> Yes, the host value of XFD (which is zero) has to be restored after vmexit. >>> See how KVM already handles SPEC_CTRL. >> >> I'm trying to understand why qemu's XFD is zero after kernel supports AMX. > > There are three copies of XFD: > > - the guest value stored in vcpu->arch. > > - the "QEMU" value attached to host_fpu. This one only becomes zero if > QEMU requires AMX (which shouldn't happen). I don't think that makes sense. First of all, if QEMU wants to expose AMX to guests, then it has to ask for permission to do so as any other user space process. We're not going to make that special just because. The guest configuration will have to have a 'needs AMX' flag set. So QEMU knows that it is required upfront. Which also means that a guest configuration which does not have it set will never get AMX passed through. That tells me that we should not bother at all with on-demand buffer reallocations for that case and just keep things simple. The on-demand buffer allocation from the general OS point of view makes sense because there it really matters whether we allocate $N kilobytes per thread or not. But does it matter for the QEMU process and its vCPU threads when the guest is allowed to use AMX? I don't think so. It's an academic exercise IMO and just makes the handling of this way more complex than required. So the logic should be: qemu() read_config() if (dynamic_features_passthrough()) request_permission(feature) create_vcpu_threads() .... vcpu_thread() kvm_ioctl(ENABLE_DYN_FEATURE, feature) reallocate_buffers() realloc(tsk->fpu.fpstate, feature) realloc(guest_fpu.fpstate, feature) realloc(host_fpu.fpstate, feature) All of them will have fpstate.xfd = default_xfd & ~feature That also makes resume and migration simple because that's going to use exactly the same mechanism. Yes, it _allows_ QEMU user space to use AMX, but that's not the end of the world, really, and avoids a ton of special cases to worry about. Also the extra memory consumption per vCPU thread is probably just noise compared to the rest of the vCPU state. With that the only thing you have to take care of is in vmx_vcpu_run(): local_irq_disable(); ... vmx_vcpu_run() wrmsrl(XFD, guest->xfd) vmenter() guest->xfd = rdmsrl(XFD) wrmsrl(XFD, host->xfd) It does not matter when some day there is an XFD-controlled bit 19 and you want to selectively allow access to guests, because we have two mechanisms here: 1) XCR0 XSETBV in the guest is intercepted and checked against the allowed bits. If it tries to set one which is not allowed, then this is not any different from what KVM is doing today. I.e. Guest1 is allowed to set bit 18, but not 19 Guest2 is allowed to set bit 19, but not 18 Guest3 is allowed to set both 18 and 19 2) XFD Intercepting XFD is optional I think. It does not matter what the guest writes into it, because if XCR0[i] = 0 then the state of XFD[i] is irrelevant according to the ISE: "(IA32_XFD[i] does not affect processor operations if XCR0[i] = 0.)" The only thing different vs. bare metal is that when the guest writes XFD[i]=1 it won't get #GP despite the fact that virtualized CPUID suggests that it should get one: "Bit i of either MSR can be set to 1 only if CPUID.(EAX=0DH,ECX=i):ECX[2] is enumerated as 1. An execution of WRMSR that attempts to set an unsupported bit in either MSR causes a general-protection fault (#GP)." Does it matter? 
Probably not; all it can figure out is that component[i] is supported in hardware, but it can't do anything with that information because the VMM will not allow it to set the corresponding XCR0 bit... Sure you can intercept XFD, check the write against the allowed guest bits and inject #GP if not. But keep in mind that the guest kernel will context switch it and that will not be any better than context switching XCR0 in the guest kernel... The thing we need to think about is the case where the guest has XCR0[i] = XFD[i] = 1 and the host has XFD[i] = 0, because setting XFD[i] = 1 does not bring the component[i] into init state. In that case we have the following situation after a vmexit: guest->xfd = rdmsrl(XFD) [i] = 1 wrmsrl(XFD, host->xfd) [i] = 0 If the component[i] is _not_ in init state then the next XSAVES on the host will save it and therefore have xsave.header.XSAVE_BV[i] = 1 in the buffer. A subsequent XRSTORS of that buffer on the host will restore the saved data into component[i]. But the subsequent vmenter() will restore the guest XFD which will just bring the guest into exactly the same state as before the VMEXIT. Ergo it does not matter at all. That also makes #NM handling trivial. Any #NM generated in the guest is completely uninteresting for the host with that scheme and it's the guest's problem to deal with it. But that brings me to another issue: XFD_ERR. Assume the guest takes #NM and, before the handler can run and read/clear XFD_ERR, a VMEXIT happens, which means XFD_ERR will have the guest error bit set and nothing will clear it. So XFD_ERR has to be handled properly otherwise a subsequent #NM on the host will see a stale bit from the guest. vmx_vcpu_run() wrmsrl(XFD, guest->xfd) wrmsrl(XFD_ERR, guest->xfd_err) vmenter() guest->xfd_err = rdmsrl(XFD_ERR) guest->xfd = rdmsrl(XFD) wrmsrl(XFD_ERR, 0) wrmsrl(XFD, host->xfd) Of course that wants to be conditional on the guest configuration and you probably want all of that to be in the auto-load/store area, but you get the idea. Anything else will just create more problems than it solves. Especially #NM handling (think nested guest) and the XFD_ERR additive behaviour will be a nasty playground and easy to get wrong. Not having that at all makes life way simpler, right? Thanks, tglx
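A C rendering of the pseudocode above, as a sketch only: the struct is invented for illustration, the MSR numbers are the architectural ones from the Intel ISE, and as noted a real implementation would likely use the VMX MSR auto-load/store area instead of explicit wrmsrl()/rdmsrl() calls.

#include <asm/msr.h>

#define MSR_IA32_XFD		0x000001c4	/* architectural, per Intel ISE */
#define MSR_IA32_XFD_ERR	0x000001c5

struct guest_xfd_state {		/* illustrative container */
	u64 xfd;
	u64 xfd_err;
};

static void xfd_enter_guest(struct guest_xfd_state *guest)
{
	/* Load the guest's XFD and its accumulated XFD_ERR bits. */
	wrmsrl(MSR_IA32_XFD, guest->xfd);
	wrmsrl(MSR_IA32_XFD_ERR, guest->xfd_err);
}

static void xfd_exit_guest(struct guest_xfd_state *guest, u64 host_xfd)
{
	/* Preserve whatever the guest accumulated while it ran. */
	rdmsrl(MSR_IA32_XFD_ERR, guest->xfd_err);
	rdmsrl(MSR_IA32_XFD, guest->xfd);
	/* Clear XFD_ERR so a later host #NM never sees a stale guest bit. */
	wrmsrl(MSR_IA32_XFD_ERR, 0);
	wrmsrl(MSR_IA32_XFD, host_xfd);
}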
On Wed, Oct 13 2021 at 14:26, Paolo Bonzini wrote: > On 13/10/21 12:14, Andy Lutomirski wrote: >>> I think it's simpler to always wait for #NM, it will only happen >>> once per vCPU. In other words, even if the guest clears XFD before >>> it generates #NM, the guest_fpu's XFD remains nonzero and an #NM >>> vmexit is possible. After #NM the guest_fpu's XFD is zero; then >>> passthrough can happen and the #NM vmexit trap can be disabled. >> >> This will stop being at all optimal when Intel inevitably adds >> another feature that uses XFD. In the potentially infinite window in >> which the guest manages XFD and #NM on behalf of its userspace and >> when the guest allocates the other hypothetical feature, all the #NMs >> will have to be trapped by KVM. > > The reason is that it's quite common to simply let the guest see all > CPUID bits that KVM knows about. On fleets the cpu features exposed to guests matter a lot to ensure migratability and I would be surprised when such a feature would just be universally available to anyone. Thanks, tglx
On Wed, Oct 13 2021 at 16:14, Thomas Gleixner wrote: > On Wed, Oct 13 2021 at 14:26, Paolo Bonzini wrote: > >> On 13/10/21 12:14, Andy Lutomirski wrote: >>>> I think it's simpler to always wait for #NM, it will only happen >>>> once per vCPU. In other words, even if the guest clears XFD before >>>> it generates #NM, the guest_fpu's XFD remains nonzero and an #NM >>>> vmexit is possible. After #NM the guest_fpu's XFD is zero; then >>>> passthrough can happen and the #NM vmexit trap can be disabled. >>> >>> This will stop being at all optimal when Intel inevitably adds >>> another feature that uses XFD. In the potentially infinite window in >>> which the guest manages XFD and #NM on behalf of its userspace and >>> when the guest allocates the other hypothetical feature, all the #NMs >>> will have to be trapped by KVM. >> >> The reason is that it's quite common to simply let the guest see all >> CPUID bits that KVM knows about. > > On fleets the cpu features exposed to guests matter a lot to ensure > migratability and I would be surprised when such a feature would just > be universally available to anyone. As a VM customer you get charged for RAM, CPUs, storage and whatever extra features you need. So why would AMX be any different? So far Intel ignored the fact that these accelerators are managed resources even if they are accessible via instructions and do not require to open(/dev/magic_accelerator). But that's just wrong and XFD should already have happened with AVX512. Trying to expose AMX unconditionally is just wrong and overengineered and proliferating the mess we already have to suffer from. As I said in the other mail. QEMU has to get permissions to use AMX first and not doing it by circumventing the permission part via a KVM hack. Thanks, tglx
On Wed, Oct 13, 2021, at 5:26 AM, Paolo Bonzini wrote: > On 13/10/21 12:14, Andy Lutomirski wrote: >>> I think it's simpler to always wait for #NM, it will only happen >>> once per vCPU. In other words, even if the guest clears XFD before >>> it generates #NM, the guest_fpu's XFD remains nonzero and an #NM >>> vmexit is possible. After #NM the guest_fpu's XFD is zero; then >>> passthrough can happen and the #NM vmexit trap can be disabled. >> >> This will stop being at all optimal when Intel inevitably adds >> another feature that uses XFD. In the potentially infinite window in >> which the guest manages XFD and #NM on behalf of its userspace and >> when the guest allocates the other hypothetical feature, all the #NMs >> will have to be trapped by KVM. > > The reason is that it's quite common to simply let the guest see all > CPUID bits that KVM knows about. But it's not unlikely that most guests > will not ever use any XFD feature, and therefore will not ever see an > #NM. I wouldn't have any problem with allocating _all_ of the dynamic > state space on the first #NM. > > Thinking more about it, #NM only has to be trapped if XCR0 enables a > dynamic feature. In other words, the guest value of XFD can be limited > to (host_XFD|guest_XFD) & guest_XCR0. This avoids that KVM > unnecessarily traps for old guests that use CR0.TS. > You could simplify this by allocating the state the first time XCR0 enables the feature in question. (This is how regular non-virt userspace *should* work too, but it looks like I’ve probably been outvoted on that front…) > Paolo > >> Is it really worthwhile for KVM to use XFD at all instead of >> preallocating the state and being done with it? KVM would still have >> to avoid data loss if the guest sets XFD with non-init state, but #NM >> could always pass through. >>
On 13/10/21 16:59, Andy Lutomirski wrote: >> >> Thinking more about it, #NM only has to be trapped if XCR0 enables >> a dynamic feature. In other words, the guest value of XFD can be >> limited to (host_XFD|guest_XFD) & guest_XCR0. This avoids that >> KVM unnecessarily traps for old guests that use CR0.TS. >> > You could simplify this by allocating the state the first time XCR0 > enables the feature in question. > > (This is how regular non-virt userspace*should* work too, but it > looks like I’ve probably been outvoted on that front…) Good point, you could do that too and do the work on the XCR0 vmexit instead of #NM. Paolo
Jing, On Wed, Oct 13 2021 at 06:15, Jing2 Liu wrote: >> On 12/10/21 02:00, Thomas Gleixner wrote: > When looking into the tglx/devel.git x86/fpu for the full #1-#4 > series and the KVM AMX support, I'd like to discuss two things: > > 1. KVM dynamic allocation API: > Since KVM also uses dynamic allocation, after KVM detects the guest > requesting AMX via an #NM trap, KVM needs to allocate extra buffers for > this vcpu's current->thread.fpu.fpstate and the related guest_fpu. > So far, the kernel itself has such an API, fpstate_realloc(), but it's > static. How about making a common function usable for KVM? Just making that function usable without a proper design for how this should work at all does not solve anything. We first need a conclusion vs. buffer reallocation. Once that is sorted then we can create proper infrastructure for that in the FPU core code and not just expose a random function to KVM and hack it into submission. Thanks, tglx
On 13/10/21 16:06, Thomas Gleixner wrote: >> - the guest value stored in vcpu->arch. >> >> - the "QEMU" value attached to host_fpu. This one only becomes zero if >> QEMU requires AMX (which shouldn't happen). > > I don't think that makes sense. > > First of all, if QEMU wants to expose AMX to guests, then it has to ask > for permission to do so as any other user space process. We're not going > to make that special just because. Hmm, I would have preferred if there was no need to enable AMX for the QEMU FPU. But you're saying that guest_fpu needs to swap out to current->thread.fpu if the guest is preempted, which would require XFD=0; and affect QEMU operation as well. In principle I don't like it very much; it would be nicer to say "you enable it for QEMU itself via arch_prctl(ARCH_SET_STATE_ENABLE), and for the guests via ioctl(KVM_SET_CPUID2)". But I can see why you want to keep things simple, so it's not a strong objection at all. > Anything else will just create more problems than it solves. Especially > #NM handling (think nested guest) and the XFD_ERR additive behaviour > will be a nasty playground and easy to get wrong. > > Not having that at all makes life way simpler, right? It is simpler indeed, and it makes sense to start simple. I am not sure if it will hold, but I agree it's better for the first implementation. Paolo
On 10/14/2021 2:50 PM, Paolo Bonzini wrote: > On 13/10/21 16:06, Thomas Gleixner wrote: > >> - the guest value stored in vcpu->arch. > >> > >> - the "QEMU" value attached to host_fpu. This one only becomes zero > >> if QEMU requires AMX (which shouldn't happen). > > > > I don't think that makes sense. > > > > First of all, if QEMU wants to expose AMX to guests, then it has to > > ask for permission to do so as any other user space process. We're not > > going to make that special just because. > > Hmm, I would have preferred if there was no need to enable AMX for the > QEMU FPU. But you're saying that guest_fpu needs to swap out to > current->thread.fpu if the guest is preempted, which would require > XFD=0; and affect QEMU operation as well. For preemption, if guest_fpu XFD is used as the KVM internal value, then we can simply set current->thread.fpu XFD to the same KVM internal value on vmexit so kernel preemption can refer to it. Thus, I think this issue doesn't much affect whether AMX is enabled for the Qemu FPU or not. > > In principle I don't like it very much; it would be nicer to say "you > enable it for QEMU itself via arch_prctl(ARCH_SET_STATE_ENABLE), and for > the guests via ioctl(KVM_SET_CPUID2)". But I can see why you want to > keep things simple, so it's not a strong objection at all. > Does this mean that KVM allocates 3 buffers via 1) Qemu's request, instead of via 2) guest XCR0 trap? For the two ways, I think what we need to care about is the same: a) allocation time; b) lazy passthrough time, which is related to XFD handling and values. Because we don't want to always rdmsr and clear XFD on vmexit, and don't want to trap every XFD switch in the guest. For 1), Qemu needs prctl() and ioctl(ENABLE_DYN_FEATURE). But *when* does Qemu do ioctl(ENABLE_DYN_FEATURE)? I mean, if guest XCR0 doesn't set bit 18, then KVM doesn't need to allocate the 3 buffers at all. Thus, the XCR0 trap is a simpler way? Meanwhile, for lazy passthrough, do we want to do it on the guest wrmsr trap (i.e. the guest changes XFD, not inits XFD) if using 1) Qemu's request? Or, using 2), directly passthrough on the XCR0 trap? > > Anything else will just create more problems than it solves. Especially > > #NM handling (think nested guest) and the XFD_ERR additive behaviour > > will be a nasty playground and easy to get wrong. > > > > Not having that at all makes life way simpler, right? > > It is simpler indeed, and it makes sense to start simple. I'd like to confirm which is the simpler way we'd like :) Thanks, Jing > I am not sure > if it will hold, but I agree it's better for the first implementation. > > Paolo
> > 1. KVM dynamic allocation API: > > Since KVM also uses dynamic allocation, after KVM detects the guest > > requesting AMX via an #NM trap, KVM needs to allocate extra buffers for this > > vcpu's current->thread.fpu.fpstate and the related guest_fpu. > > So far, the kernel itself has such an API, fpstate_realloc(), but > > it's static. How about making a common function usable for KVM? > > Just making that function usable without a proper design for how this should > work at all does not solve anything. > > We first need a conclusion vs. buffer reallocation. > > Once that is sorted then we can create proper infrastructure for that in the > FPU core code and not just expose a random function to KVM and hack it into > submission. Yes, we need a consensus on the way we choose, and then see whether a kernel function is needed for KVM usage. Thanks, Jing > > Thanks, > > tglx
On 14/10/21 10:02, Liu, Jing2 wrote: >> In principle I don't like it very much; it would be nicer to say "you >> enable it for QEMU itself via arch_prctl(ARCH_SET_STATE_ENABLE), and for >> the guests via ioctl(KVM_SET_CPUID2)". But I can see why you want to >> keep things simple, so it's not a strong objection at all. > > Does this mean that KVM allocates 3 buffers via > 1) Qemu's request, instead of via 2) guest XCR0 trap? Based on the input from Andy and Thomas, the new way would be like this: 1) host_fpu must always be checked for reallocation in kvm_load_guest_fpu (or in the FPU functions that it calls, that depends on the rest of Thomas's patches). That's because arch_prctl can enable AMX for QEMU at any point after KVM_CREATE_VCPU. 2) every use of vcpu->arch.guest_supported_xcr0 is changed to only include those dynamic-feature bits that were enabled via arch_prctl. That is, something like: static u64 kvm_guest_supported_xcr0(struct kvm_vcpu *vcpu) { return vcpu->arch.guest_supported_xcr0 & (~xfeatures_mask_user_dynamic | \ current->thread.fpu.dynamic_state_perm); } 3) Even with passthrough disabled, the guest can run with XFD set to vcpu->arch.guest_xfd (and likewise for XFD_ERR) which is much simpler than trapping #NM. The traps for writing XCR0 and XFD are used to allocate dynamic state for guest_fpu, and start the passthrough of XFD and XFD_ERR. What we need is: - if a dynamic state has XCR0[n]=0, bit n will never be set in XFD_ERR and the state will never be dirtied by the guest. - if a dynamic state has XCR0[n]=1, but all enabled dynamic states have XFD[n]=1, the guest is not able to dirty any dynamic XSAVE state, because they all have either XCR0[n]=0 or XFD[n]=1. An attempt to do so will cause an #NM trap and set the bit in XFD_ERR. - if a dynamic state has XCR0[n]=1 and XFD[n]=0, the state for bit n is allocated in guest_fpu, and it can also disable the vmexits for XFD and XFD_ERR. Therefore: - if passthrough is disabled, the XCR0 and XFD write traps can check guest_xcr0 & ~guest_xfd. If it includes a dynamic state bit, dynamic state is allocated for all bits enabled in guest_xcr0 and passthrough is started; this should happen shortly after the guest gets its first #NM trap for AMX. - if passthrough is enabled, the XCR0 write trap must still ensure that dynamic state is allocated for all bits enabled in guest_xcr0. So something like this pseudocode is called by both XCR0 and XFD writes: int kvm_alloc_fpu_dynamic_features(struct kvm_vcpu *vcpu) { u64 allowed_dynamic = current->thread.fpu.dynamic_state_perm; u64 enabled_dynamic = vcpu->arch.xcr0 & xfeatures_mask_user_dynamic; /* All dynamic features have to be arch_prctl'd first. */ WARN_ON_ONCE(enabled_dynamic & ~allowed_dynamic); if (!vcpu->arch.xfd_passthrough) { /* All dynamic states will #NM? Wait and see. */ if ((enabled_dynamic & ~vcpu->arch.xfd) == 0) return 0; kvm_x86_ops.enable_xfd_passthrough(vcpu); } /* current->thread.fpu was already handled by arch_prctl. */ return fpu_alloc_features(vcpu->guest_fpu, vcpu->guest_fpu.dynamic_state_perm | enabled_dynamic); } Paolo
On 10/14/2021 5:01 PM, Paolo Bonzini wrote: > On 14/10/21 10:02, Liu, Jing2 wrote: > >> In principle I don't like it very much; it would be nicer to say "you > >> enable it for QEMU itself via arch_prctl(ARCH_SET_STATE_ENABLE), and > >> for the guests via ioctl(KVM_SET_CPUID2)". But I can see why you > >> want to keep things simple, so it's not a strong objection at all. > > > > Does this mean that KVM allocates 3 buffers via > > 1) Qemu's request, instead of via 2) guest XCR0 trap? > > Based on the input from Andy and Thomas, the new way would be like this: > > 1) host_fpu must always be checked for reallocation in kvm_load_guest_fpu > (or in the FPU functions that it calls, that depends on the rest of Thomas's > patches). That's because arch_prctl can enable AMX for QEMU at any point > after KVM_CREATE_VCPU. For Qemu's XFD, I'd like to confirm: since arch_prctl() only adds to current->group_leader->thread.fpu's state_perm and __state_size (current->thread.fpu.* is not changed), in kvm_load_guest_fpu host_fpu->xfd is always 1. That is to say, Qemu's arch_prctl() doesn't change any copies of XFD. > > 2) every use of vcpu->arch.guest_supported_xcr0 is changed to only include > those dynamic-feature bits that were enabled via arch_prctl. > That is, something like: > > static u64 kvm_guest_supported_xcr0(struct kvm_vcpu *vcpu) { > return vcpu->arch.guest_supported_xcr0 & > (~xfeatures_mask_user_dynamic | \ > current->thread.fpu.dynamic_state_perm); > } > > 3) Even with passthrough disabled, the guest can run with XFD set to > vcpu->arch.guest_xfd (and likewise for XFD_ERR) which is much simpler > than trapping #NM. The traps for writing XCR0 and XFD are used to allocate > dynamic state for guest_fpu, and start the passthrough of XFD and XFD_ERR. > What we need is: > > - if a dynamic state has XCR0[n]=0, bit n will never be set in XFD_ERR and the > state will never be dirtied by the guest. > > - if a dynamic state has XCR0[n]=1, but all enabled dynamic states have > XFD[n]=1, the guest is not able to dirty any dynamic XSAVE state, because > they all have either XCR0[n]=0 or XFD[n]=1. An attempt to do so will cause an > #NM trap and set the bit in XFD_ERR. > > - if a dynamic state has XCR0[n]=1 and XFD[n]=0, the state for bit n is > allocated in guest_fpu, and it can also disable the vmexits for XFD and > XFD_ERR. > Got it, the principle is: once XCR0[n]=1 and XFD[n]=0, the guest is allowed to use the dynamic XSAVE state, thus KVM must have everything prepared beforehand. This probably happens shortly after the guest #NM. Only one thing: it seems we assume that vcpu->arch.xfd is the guest runtime value. And before the guest initializes XFD, KVM provides vcpu->arch.xfd[18]=1, right? But the spec asks for an XFD reset value of zero. If so, between the guest setting XCR0[n]=1 and setting XFD[n]=1, we have XCR0[n]=1 and XFD[n]=0. What if a guest never initializes XFD and directly uses dynamic state... Or do we want to provide the guest an XFD[18]=1 value at the very beginning? > Therefore: > > - if passthrough is disabled, the XCR0 and XFD write traps can check > guest_xcr0 & ~guest_xfd. If it includes a dynamic state bit, dynamic state is > allocated for all bits enabled in guest_xcr0 and passthrough is started; this > should happen shortly after the guest gets its first #NM trap for AMX. > > - if passthrough is enabled, the XCR0 write trap must still ensure that > dynamic state is allocated for all bits enabled in guest_xcr0. 
> > So something like this pseudocode is called by both XCR0 and XFD writes: > > int kvm_alloc_fpu_dynamic_features(struct kvm_vcpu *vcpu) { > u64 allowed_dynamic = current->thread.fpu.dynamic_state_perm; > u64 enabled_dynamic = > vcpu->arch.xcr0 & xfeatures_mask_user_dynamic; > > /* All dynamic features have to be arch_prctl'd first. */ > WARN_ON_ONCE(enabled_dynamic & ~allowed_dynamic); > > if (!vcpu->arch.xfd_passthrough) { > /* All dynamic states will #NM? Wait and see. */ > if ((enabled_dynamic & ~vcpu->arch.xfd) == 0) Here, when the guest sets XCR0[n]=1, vcpu->arch.xfd should be 1; otherwise the XCR0 trap enables passthrough and allocates buffers, which is not what we want. > return 0; > > kvm_x86_ops.enable_xfd_passthrough(vcpu); > } > > /* current->thread.fpu was already handled by arch_prctl. */ It seems that, so far, arch_prctl does not change current->thread.fpu; only the #NM handler itself does. We would need to allocate for current here too. Thanks, Jing > return fpu_alloc_features(vcpu->guest_fpu, > vcpu->guest_fpu.dynamic_state_perm | enabled_dynamic); } > > Paolo
On 10/14/2021 5:01 PM, Paolo Bonzini wrote: > On 14/10/21 10:02, Liu, Jing2 wrote: > >> In principle I don't like it very much; it would be nicer to say "you > >> enable it for QEMU itself via arch_prctl(ARCH_SET_STATE_ENABLE), and > >> for the guests via ioctl(KVM_SET_CPUID2)". But I can see why you > >> want to keep things simple, so it's not a strong objection at all. > > > > Does this mean that KVM allocates 3 buffers via > > 1) Qemu's request, instead of via 2) guest XCR0 trap? > > Based on the input from Andy and Thomas, the new way would be like this: > > 1) host_fpu must always be checked for reallocation in kvm_load_guest_fpu > (or in the FPU functions that it calls, that depends on the rest of Thomas's > patches). That's because arch_prctl can enable AMX for QEMU at any point > after KVM_CREATE_VCPU. > > 2) every use of vcpu->arch.guest_supported_xcr0 is changed to only include > those dynamic-feature bits that were enabled via arch_prctl. > That is, something like: > > static u64 kvm_guest_supported_xcr0(struct kvm_vcpu *vcpu) { > return vcpu->arch.guest_supported_xcr0 & > (~xfeatures_mask_user_dynamic | \ > current->thread.fpu.dynamic_state_perm); > } > > 3) Even with passthrough disabled, the guest can run with XFD set to > vcpu->arch.guest_xfd (and likewise for XFD_ERR) which is much simpler > than trapping #NM. The traps for writing XCR0 and XFD are used to allocate > dynamic state for guest_fpu, and start the passthrough of XFD and XFD_ERR. For XFD_ERR, since it can be changed by HW automatically, write protection is not needed, I think. KVM also doesn't need to trap rdmsr of it because there is no use for that. What I guess we're worrying about is that when KVM is sched out, a nonzero XFD_ERR can be lost to another host thread. We can save the guest XFD_ERR on sched_out and restore it before the next vmenter. The kernel is assumed not to use AMX, thus a softirq won't lose it. I think this solves the problem. So we can directly passthrough RW of it, with no need to rdmsr(XFD_ERR) on vmexit. Thanks, Jing
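A sketch of that save/restore, hung off KVM's existing sched-out/sched-in points (kvm_arch_vcpu_put()/kvm_arch_vcpu_load()); vcpu->arch.xfd_err is an assumed field, not existing KVM code:

static void kvm_save_guest_xfd_err(struct kvm_vcpu *vcpu)
{
	/* On sched-out: preserve the guest's sticky error bits ... */
	rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.xfd_err);
	/* ... and clear the MSR so another host thread cannot clobber or misread them. */
	wrmsrl(MSR_IA32_XFD_ERR, 0);
}

static void kvm_restore_guest_xfd_err(struct kvm_vcpu *vcpu)
{
	/* Before the next vmenter: put the guest value back. */
	wrmsrl(MSR_IA32_XFD_ERR, vcpu->arch.xfd_err);
}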
On 14/10/21 13:21, Liu, Jing2 wrote: > Got it, the principle is: once XCR0[n]=1 and XFD[n]=0, the guest is allowed > to use the dynamic XSAVE state, thus KVM must have everything prepared > beforehand. This probably happens shortly after the guest #NM. > > Only one thing: it seems we assume that vcpu->arch.xfd is the guest runtime > value. And before the guest initializes XFD, KVM provides > vcpu->arch.xfd[18]=1, right? But the spec asks for an XFD reset value of zero. > If so, between the guest setting XCR0[n]=1 and setting XFD[n]=1, we have > XCR0[n]=1 and XFD[n]=0. What if a guest never initializes XFD and directly > uses dynamic state... > > Or do we want to provide the guest an XFD[18]=1 value at the very beginning? On reset the guest value has to be zero. For Linux, which we control, we probably want to write the bit in XFD before XSETBV. For other OSes there's nothing we can do, but hopefully they make similar considerations. Paolo
On 14/10/21 13:30, Liu, Jing2 wrote: > What I guess we're worrying about is that when KVM is sched out, a nonzero XFD_ERR > can be lost to another host thread. We can save the guest XFD_ERR on sched_out > and restore it before the next vmenter. The kernel is assumed not to use AMX, thus > a softirq won't lose it. > I think this solves the problem. So we can directly passthrough RW of it, > with no need to rdmsr(XFD_ERR) on vmexit. Correct; you can also use the "user-return MSRs" machinery (until Linux starts using AMX in the kernel, but that shouldn't happen too soon). Paolo
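A sketch of that user-return MSR variant: kvm_add_user_return_msr() and kvm_set_user_return_msr() are the interfaces KVM already has at this point, while the slot variable, helper names and call sites are assumptions; error handling is omitted.

static int xfd_err_uret_slot __read_mostly;

static void kvm_register_xfd_err_uret(void)
{
	/* Once at module init; the machinery snapshots the host value (0). */
	xfd_err_uret_slot = kvm_add_user_return_msr(MSR_IA32_XFD_ERR);
}

static void kvm_load_guest_xfd_err(u64 guest_xfd_err)
{
	/*
	 * Write the guest value now; the host value is restored lazily,
	 * only when the vCPU thread really returns to userspace rather
	 * than on every vmexit.
	 */
	kvm_set_user_return_msr(xfd_err_uret_slot, guest_xfd_err, -1ull);
}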
Paolo, On Thu, Oct 14 2021 at 08:50, Paolo Bonzini wrote: > On 13/10/21 16:06, Thomas Gleixner wrote: >>> - the guest value stored in vcpu->arch. >>> >>> - the "QEMU" value attached to host_fpu. This one only becomes zero if >>> QEMU requires AMX (which shouldn't happen). >> >> I don't think that makes sense. >> >> First of all, if QEMU wants to expose AMX to guests, then it has to ask >> for permission to do so as any other user space process. We're not going >> to make that special just because. > > Hmm, I would have preferred if there was no need to enable AMX for the > QEMU FPU. But you're saying that guest_fpu needs to swap out to > current->thread.fpu if the guest is preempted, which would require > XFD=0; and affect QEMU operation as well. Exactly. If we don't enable it for QEMU itself, then this is creating just a horrible inconsistency which requires nasty hacks. I'm not at all interested in those as I just got rid of quite a few and made the code consistent. > In principle I don't like it very much; it would be nicer to say "you > enable it for QEMU itself via arch_prctl(ARCH_SET_STATE_ENABLE), and for > the guests via ioctl(KVM_SET_CPUID2)". But I can see why you want to > keep things simple, so it's not a strong objection at all. Errm. qemu() read_config() if (dynamic_features_passthrough()) request_permission(feature) <- prctl(ARCH_SET_STATE_ENABLE) create_vcpu_threads() .... vcpu_thread() kvm_ioctl(ENABLE_DYN_FEATURE, feature) <- KVM ioctl That's what I lined out, right? >> Anything else will just create more problems than it solves. Especially >> #NM handling (think nested guest) and the XFD_ERR additive behaviour >> will be a nasty playground and easy to get wrong. >> >> Not having that at all makes life way simpler, right? > > It is simpler indeed, and it makes sense to start simple. I am not sure > if it will hold, but I agree it's better for the first implementation. KISS is a very reasonable engineering principle :) If there is a real world use case and a proper technical justification for doing the dynamic buffer allocation then I'm happy to discuss that. Thanks, tglx
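The userspace side of that flow as a sketch; ARCH_SET_STATE_ENABLE and KVM_ENABLE_DYN_FEATURE are placeholder names from this discussion, not merged ABI, and the numeric values below are invented for illustration:

#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

#define ARCH_SET_STATE_ENABLE	0x1021	/* placeholder arch_prctl code */
#define KVM_ENABLE_DYN_FEATURE	0xaeb0	/* placeholder ioctl number */

static void qemu_request_feature_permission(unsigned long feature)
{
	/* Done once at config parsing, before create_vcpu_threads(). */
	syscall(SYS_arch_prctl, ARCH_SET_STATE_ENABLE, feature);
}

static void vcpu_thread_enable_feature(int vcpu_fd, unsigned long feature)
{
	/* Done in each vCPU thread; triggers the three fpstate reallocations. */
	ioctl(vcpu_fd, KVM_ENABLE_DYN_FEATURE, feature);
}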
On 14/10/21 14:23, Thomas Gleixner wrote:
>> In principle I don't like it very much; it would be nicer to say "you
>> enable it for QEMU itself via arch_prctl(ARCH_SET_STATE_ENABLE), and for
>> the guests via ioctl(KVM_SET_CPUID2)". But I can see why you want to
>> keep things simple, so it's not a strong objection at all.
> Errm.
>
>     qemu()
>       read_config()
>       if (dynamic_features_passthrough())
>          request_permission(feature)       <- prctl(ARCH_SET_STATE_ENABLE)
>
>       create_vcpu_threads()
>       ....
>
>     vcpu_thread()
>       kvm_ioctl(ENABLE_DYN_FEATURE, feature)  <- KVM ioctl
>
> That's what I lined out, right?
>

I meant prctl for QEMU-in-user-mode vs. ioctl for QEMU-in-guest-mode
(i.e. no prctl if only the guest uses it). But anyway it's just
abstract "beauty", let's stick to simple. :)

Paolo
On Thu, Oct 14 2021 at 08:21, Jing2 Liu wrote:
>>
>> Once that is sorted then we can create proper infrastructure for that
>> in the FPU core code and not just expose a random function to KVM and
>> hack it into submission.
> Yes, we need a consensus on the approach we choose and then to see
> whether we need a kernel function for KVM's usage.

The question is not 'if'. The question is 'which' functionality we need.

Thanks,

        tglx
On Thu, Oct 14 2021 at 11:01, Paolo Bonzini wrote:
> On 14/10/21 10:02, Liu, Jing2 wrote:
> Based on the input from Andy and Thomas, the new way would be like this:
>
> 1) host_fpu must always be checked for reallocation in
> kvm_load_guest_fpu (or in the FPU functions that it calls, that depends
> on the rest of Thomas's patches). That's because arch_prctl can enable
> AMX for QEMU at any point after KVM_CREATE_VCPU.

No.

    1) QEMU starts
    2) QEMU requests permissions via prctl()
    3) QEMU creates vCPU threads

Doing it the other way around makes no sense at all and won't work.

> 2) every use of vcpu->arch.guest_supported_xcr0 is changed to only
> include those dynamic-feature bits that were enabled via arch_prctl.
> That is, something like:
>
> static u64 kvm_guest_supported_xcr0(struct kvm_vcpu *vcpu)
> {
>     return vcpu->arch.guest_supported_xcr0 &
>         (~xfeatures_mask_user_dynamic | \
>          current->thread.fpu.dynamic_state_perm);

Bah. You can't get enough from poking in internals, right?

    vcpu_create()
        fpu_init_fpstate_user(guest_fpu, supported_xcr0)

That will (it does not today) do:

    guest_fpu::__state_perm = supported_xcr0 & xstate_get_group_perm();

for you. Once. Then you have the information you need right in the
guest FPU. See?

> So something like this pseudocode is called by both XCR0 and XFD writes:
>
> int kvm_alloc_fpu_dynamic_features(struct kvm_vcpu *vcpu)
> {
>     u64 allowed_dynamic = current->thread.fpu.dynamic_state_perm;

That's not a valid assumption.

>     u64 enabled_dynamic =
>         vcpu->arch.xcr0 & xfeatures_mask_user_dynamic;
>
>     /* All dynamic features have to be arch_prctl'd first. */
>     WARN_ON_ONCE(enabled_dynamic & ~allowed_dynamic);
>
>     if (!vcpu->arch.xfd_passthrough) {
>         /* All dynamic states will #NM? Wait and see. */
>         if ((enabled_dynamic & ~vcpu->arch.xfd) == 0)
>             return 0;
>
>         kvm_x86_ops.enable_xfd_passthrough(vcpu);
>     }
>
>     /* current->thread.fpu was already handled by arch_prctl. */

No. current->thread.fpu has the default buffer unless QEMU used AMX or
something forced it to allocate a larger buffer.

>     return fpu_alloc_features(vcpu->guest_fpu,
>         vcpu->guest_fpu.dynamic_state_perm | enabled_dynamic);

This unconditionally calls into that allocation for every XCR0/XFD
trap?

> }

Also you really should not wait until _all_ dynamic states are cleared
in guest XFD. Because a guest which has bits 18 and 19 available but
only uses one of them is going to trap on every other context switch
due to XFD writes.

So you check for

    (guest_xfd & guest_perm) != guest_perm

and

    (guest_xcr0 & guest_perm) != 0

If both are true, then you reallocate the buffers for _all_ permitted
states _and_ set XFD to pass through.

Thanks,

        tglx
On Thu, Oct 14 2021 at 14:26, Paolo Bonzini wrote:
> On 14/10/21 14:23, Thomas Gleixner wrote:
>>> In principle I don't like it very much; it would be nicer to say "you
>>> enable it for QEMU itself via arch_prctl(ARCH_SET_STATE_ENABLE), and for
>>> the guests via ioctl(KVM_SET_CPUID2)". But I can see why you want to
>>> keep things simple, so it's not a strong objection at all.
>> Errm.
>>
>>     qemu()
>>       read_config()
>>       if (dynamic_features_passthrough())
>>          request_permission(feature)       <- prctl(ARCH_SET_STATE_ENABLE)
>>
>>       create_vcpu_threads()
>>       ....
>>
>>     vcpu_thread()
>>       kvm_ioctl(ENABLE_DYN_FEATURE, feature)  <- KVM ioctl
>>
>> That's what I lined out, right?
>>
> I meant prctl for QEMU-in-user-mode vs. ioctl for QEMU-in-guest-mode
> (i.e. no prctl if only the guest uses it). But anyway it's just
> abstract "beauty", let's stick to simple. :)

It's not about simple. It's about correctness in the first place.

The prctl() is process wide and grants permission. If that permission
is not granted, e.g. by a seccomp rule, then the vCPU threads cannot
use it either. We are _not_ making exceptions for KVM just because it's
KVM.

Trying to pretend that the usermode thread does not need it is just an
illusion. The kernel representation of that very usermode vCPU thread
must have a large fpstate. It still can have XFD set, but that's a
detail.

So what you are trying to sell me has nothing to do with beauty at all,
except when your definition of beauty originates from a tunnel of
horrors.

Thanks,

        tglx
On Thu, Oct 14 2021 at 16:09, Thomas Gleixner wrote:
> On Thu, Oct 14 2021 at 11:01, Paolo Bonzini wrote:
>
> Also you really should not wait until _all_ dynamic states are cleared
> in guest XFD. Because a guest which has bits 18 and 19 available but
> only uses one of them is going to trap on every other context switch
> due to XFD writes.
>
> So you check for
>
>     (guest_xfd & guest_perm) != guest_perm
>
> and
>
>     (guest_xcr0 & guest_perm) != 0
>
> If both are true, then you reallocate the buffers for _all_ permitted
> states _and_ set XFD to pass through.

And for that to work we must write XFD _before_ XSETBV in the guest
boot phase.

Thanks,

        tglx
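Putting the two checks together with the pass-through rule, the
trap-side logic could look roughly like the sketch below; the field
names (guest_fpu_perm, xfd, xcr0, xfd_passthrough,
fpstate_realloc_request) are hypothetical stand-ins for whatever KVM
ends up storing, not existing API:

    /* Called from both the XFD write and the XSETBV intercepts (sketch). */
    static void kvm_check_dynamic_state_realloc(struct kvm_vcpu *vcpu)
    {
    	u64 perm = vcpu->arch.guest_fpu_perm;	/* granted via prctl/ioctl */
    	u64 xfd  = vcpu->arch.xfd;		/* last trapped XFD value */
    	u64 xcr0 = vcpu->arch.xcr0;		/* last trapped XCR0 value */

    	/*
    	 * As soon as the guest clears XFD for *any* permitted dynamic
    	 * state while XCR0 enables at least one of them, reallocate
    	 * for *all* permitted states and stop intercepting XFD, so a
    	 * guest using only a subset of the states does not trap on
    	 * every context switch.
    	 */
    	if ((xfd & perm) != perm && (xcr0 & perm) != 0) {
    		vcpu->arch.fpstate_realloc_request = true;
    		vcpu->arch.xfd_passthrough = true;
    	}
    }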
On 14/10/21 16:09, Thomas Gleixner wrote:
> On Thu, Oct 14 2021 at 11:01, Paolo Bonzini wrote:
>> On 14/10/21 10:02, Liu, Jing2 wrote:
>> Based on the input from Andy and Thomas, the new way would be like this:
>>
>> 1) host_fpu must always be checked for reallocation in
>> kvm_load_guest_fpu (or in the FPU functions that it calls, that depends
>> on the rest of Thomas's patches). That's because arch_prctl can enable
>> AMX for QEMU at any point after KVM_CREATE_VCPU.
>
> No.
>
>     1) QEMU starts
>     2) QEMU requests permissions via prctl()
>     3) QEMU creates vCPU threads
>
> Doing it the other way around makes no sense at all and won't work.

Sure, but KVM needs to do something that makes sense even for
userspaces that are not QEMU.

For example, there could be a program that uses AMX *itself* and does
not expose it to the guest. In that case, the arch_prctl can come at
the point AMX is needed, which can be after the program creates vCPU
threads. That's for host_fpu.

For the guest_fpu, I agree that the arch_prctl must come before
creating vCPUs.

>> 2) every use of vcpu->arch.guest_supported_xcr0 is changed to only
>> include those dynamic-feature bits that were enabled via arch_prctl.
>> That is, something like:
>>
>> static u64 kvm_guest_supported_xcr0(struct kvm_vcpu *vcpu)
>> {
>>     return vcpu->arch.guest_supported_xcr0 &
>>         (~xfeatures_mask_user_dynamic | \
>>          current->thread.fpu.dynamic_state_perm);
>
> Bah. You can't get enough from poking in internals, right?
>
>     vcpu_create()
>         fpu_init_fpstate_user(guest_fpu, supported_xcr0)
>
> That will (it does not today) do:
>
>     guest_fpu::__state_perm = supported_xcr0 & xstate_get_group_perm();
>
> Then you have the information you need right in the guest FPU.

Good, I wasn't aware of the APIs that will be there.

>> int kvm_alloc_fpu_dynamic_features(struct kvm_vcpu *vcpu)
>> {
>>     u64 allowed_dynamic = current->thread.fpu.dynamic_state_perm;
>
> That's not a valid assumption.
>
>>     u64 enabled_dynamic =
>>         vcpu->arch.xcr0 & xfeatures_mask_user_dynamic;
>>
>>     /* All dynamic features have to be arch_prctl'd first. */
>>     WARN_ON_ONCE(enabled_dynamic & ~allowed_dynamic);
>>
>>     if (!vcpu->arch.xfd_passthrough) {
>>         /* All dynamic states will #NM? Wait and see. */
>>         if ((enabled_dynamic & ~vcpu->arch.xfd) == 0)
>>             return 0;
>>
>>         kvm_x86_ops.enable_xfd_passthrough(vcpu);
>>     }
>>
>>     /* current->thread.fpu was already handled by arch_prctl. */
>
> No. current->thread.fpu has the default buffer unless QEMU used AMX or
> something forced it to allocate a larger buffer.
>
>>     return fpu_alloc_features(vcpu->guest_fpu,
>>         vcpu->guest_fpu.dynamic_state_perm | enabled_dynamic);
>
> This unconditionally calls into that allocation for every XCR0/XFD
> trap?

Calls into the function, but doesn't necessarily allocate anything.

What you wrote below looks correct to me, thanks.

Paolo

> Also you really should not wait until _all_ dynamic states are cleared
> in guest XFD. Because a guest which has bits 18 and 19 available but only
> uses one of them is going to trap on every other context switch due to
> XFD writes.
Paolo,

On Thu, Oct 14 2021 at 17:01, Paolo Bonzini wrote:
> On 14/10/21 16:09, Thomas Gleixner wrote:
>> On Thu, Oct 14 2021 at 11:01, Paolo Bonzini wrote:
>>> On 14/10/21 10:02, Liu, Jing2 wrote:
>>> Based on the input from Andy and Thomas, the new way would be like this:
>>>
>>> 1) host_fpu must always be checked for reallocation in
>>> kvm_load_guest_fpu (or in the FPU functions that it calls, that depends
>>> on the rest of Thomas's patches). That's because arch_prctl can enable
>>> AMX for QEMU at any point after KVM_CREATE_VCPU.
>>
>> No.
>>
>>     1) QEMU starts
>>     2) QEMU requests permissions via prctl()
>>     3) QEMU creates vCPU threads
>>
>> Doing it the other way around makes no sense at all and won't work.
>
> Sure, but KVM needs to do something that makes sense even for userspaces
> that are not QEMU.
>
> For example, there could be a program that uses AMX *itself* and does
> not expose it to the guest. In that case, the arch_prctl can come at
> the point AMX is needed, which can be after the program creates vCPU
> threads. That's for host_fpu.

That won't affect the vCPU threads unless they start to use AMX in user
space themselves. Which means they have the default buffer, and their
vCPU user/guest FPUs too.

The prctl() sets the permission, nothing else. As long as they don't
use AMX their XFD[18] stays set. Only when they start using AMX in user
space themselves do they trigger #NM, which allocates a larger buffer
for the thread.

So the point where it matters is fpu_swap_kvm_fpu(), and that's
preemptible context, so we can do allocations before fiddling with the
buffers. Not rocket science.

And that has nothing to do with the whole XCR0/XFD/XFD_ERR/#NM guest
mess.

> For the guest_fpu, I agree that the arch_prctl must come before creating
> vCPUs.

Good :)

>> vcpu_create()
>>
>>     fpu_init_fpstate_user(guest_fpu, supported_xcr0)
>>
>> That will (it does not today) do:
>>
>>     guest_fpu::__state_perm = supported_xcr0 & xstate_get_group_perm();
>>
>> Then you have the information you need right in the guest FPU.
>
> Good, I wasn't aware of the APIs that will be there.

Me neither, but that's a pretty obvious consequence of the work I'm
doing for AMX. So I made it up for you. :)

>> This unconditionally calls into that allocation for every XCR0/XFD
>> trap?
>
> Calls into the function, but doesn't necessarily allocate anything.

Sure.

> What you wrote below looks correct to me, thanks.
>
> Paolo
>

Properly quoting mail is hard, right?

>> Also you really should not wait until _all_ dynamic states are cleared
>> in guest XFD. Because a guest which has bits 18 and 19 available but only
> uses one of them is going to trap on every other context switch due to
>> XFD writes.

Thanks,

        tglx
On 10/14/2021 11:01 PM, Paolo Bonzini wrote:
[...]
> Calls into the function, but doesn't necessarily allocate anything.
> What you wrote below looks correct to me, thanks.
>

For the guest dynamic state support, based on the latest discussion,
four copies of XFD need to be tracked and switched; I'd like to list
them as follows.

- vcpu->arch.xfd: this is the real guest value for running. Since the
  kernel initializes XFD before XCR0, I think KVM can initialize it
  with bit[n]=0 as the guest start value. Otherwise,
  kvm_arch_vcpu_create() needs to initialize
  vcpu->arch.xfd=guest_fpu->xfd=user_fpu->xfd=1. A trapped guest WRMSR
  to XFD updates it.

- user_fpu->fpstate->xfd: for QEMU itself, not for the guest; this is
  probably always set.

- guest_fpu->fpstate->xfd: this is the KVM-internal value between the
  times marked [*]. KVM reinitializes it with bit[n]=0 (not the same as
  user_fpu), and it is updated on a trapped guest WRMSR. Thus, before
  passthrough it is the same as vcpu->arch.xfd, so vmenter/vmexit need
  not rewrite the MSR. After passthrough, it keeps bit[n]=0 forever.

- current_fpu->fpstate->xfd: it should be the same as the KVM-internal
  value between the times marked [*].

[*] this means between kvm_load_guest_fpu and kvm_put_guest_fpu.

From the guest boot timeline, the values are:

    Booting start...
        # At this time, vcpu->arch.xfd[n]=guest_fpu->xfd[n]=0
    Init XFD by WRMSR(XFD[n], 1)
        # Then, vcpu->arch.xfd[n]=guest_fpu->xfd[n]=1
    Init XCR0 by XSETBV
    ...
    #NM
    WRMSR(XFD[n], 0)
        # Then, guest_fpu->xfd[n]=0, vcpu->arch.xfd[n]=0.
        # vcpu->arch.xfd will be updated in later vmexits.

BTW, we only need to lazily pass through XFD writes; reads can be
passed through directly.

Thanks,
Jing

> Paolo
>
>> Also you really should not wait until _all_ dynamic states are cleared
>> in guest XFD. Because a guest which has bits 18 and 19 available but
>> only uses one of them is going to trap on every other context switch
>> due to XFD writes.
Hi Thomas,

On 10/15/2021 3:14 AM, Thomas Gleixner wrote:
> Paolo,
[...]
>>> vcpu_create()
>>>
>>>     fpu_init_fpstate_user(guest_fpu, supported_xcr0)
>>>
>>> That will (it does not today) do:
>>>
>>>     guest_fpu::__state_perm = supported_xcr0 &
>>>                               xstate_get_group_perm();
>>>
>>> Then you have the information you need right in the guest FPU.
>>
>> Good, I wasn't aware of the APIs that will be there.
>
> Me neither, but that's a pretty obvious consequence of the work I'm
> doing for AMX. So I made it up for you. :)

Do you mean that fpu_init_fpstate_user() will be updated later to take
supported_xcr0? :)

I'm wondering whether it is good to initialize guest_fpu::xfd directly
from the user init_fpstate.xfd. Before the guest initializes XFD,
"hardware" holds the reset value, so making guest_fpu::xfd the same
means there is no need to reload zero before vmenter during that
window.

Thanks,
Jing
Paolo,

On Thu, Oct 14 2021 at 21:14, Thomas Gleixner wrote:
> On Thu, Oct 14 2021 at 17:01, Paolo Bonzini wrote:
>>> vcpu_create()
>>>
>>>     fpu_init_fpstate_user(guest_fpu, supported_xcr0)
>>>
>>> That will (it does not today) do:
>>>
>>>     guest_fpu::__state_perm = supported_xcr0 & xstate_get_group_perm();
>>>
>>> Then you have the information you need right in the guest FPU.
>>
>> Good, I wasn't aware of the APIs that will be there.
>
> Me neither, but that's a pretty obvious consequence of the work I'm
> doing for AMX. So I made it up for you. :)

let me make some more up for you!

If you carefully look at part 2 of the rework, then you might notice
that there is a fundamental change which allows a real simplification
for KVM FPU handling:

    current->thread.fpu.fpstate

is now a pointer. So you can spare one FPU allocation because we can
now do:

    fpu_attach_guest_fpu(supported_xcr0)
    {
        guest_fpstate = alloc_fpstate(supported_xcr0);
        fpu_init_fpstate_user(guest_fpstate, supported_xcr0);
        current->thread.fpu.guest_fpstate = guest_fpstate;
    }

fpu_swap_kvm_fpu() becomes in the first step:

    fpu_swap_kvm_fpu(bool enter_guest)
    {
        save_fpregs_to_fpstate(current->thread.fpu.fpstate);

        swap(current->thread.fpu.fpstate, current->thread.fpu.guest_fpstate);

        restore_fpregs_from_fpstate(current->thread.fpu.fpstate);
    }

@enter_guest will allow to do some sanity checks.

In a second step:

    fpu_swap_kvm_fpu(bool enter_guest, u64 guest_needs_features)
    {
        possibly_reallocate(enter_guest, guest_needs_features);

        save_fpregs_to_fpstate(current->thread.fpu.fpstate);

        swap(current->thread.fpu.fpstate, current->thread.fpu.guest_fpstate);

        restore_fpregs_from_fpstate(current->thread.fpu.fpstate);

        possibly_reallocate(enter_guest, guest_needs_features);
    }

@guest_needs_features is the information which you gather via guest
XCR0 and guest XFD.

So fpu_swap_kvm_fpu() is going to be the place where reallocation
happens, and that's good enough for both cases:

    vcpu_run()

        fpu_swap_kvm_fpu();     <- 1

        while (...)
            vmenter();

        fpu_swap_kvm_fpu();     <- 2

#1 QEMU user space used the feature and already has a large fpstate

#2 Guest requires the feature but has not used it yet (XCR0/XFD trapping)

See?

It's not only correct, it's also simple and truly beautiful.

Thanks,

        tglx
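possibly_reallocate() is left abstract in the sketch above. One way to
read the guest-entry half of it, under the assumption that fpstate
carries its feature mask in an xfeatures field and that a hypothetical
fpstate_realloc() helper exists, is:

    /*
     * Sketch of the abstract possibly_reallocate() step; the
     * guest_fpstate member and fpstate_realloc() are assumptions
     * taken from the sketch above. Runs in preemptible context,
     * before fpregs_lock(), so a sleeping allocation is fine.
     */
    static int possibly_reallocate(bool enter_guest, u64 guest_needs_features)
    {
    	struct fpu *fpu = &current->thread.fpu;

    	if (!enter_guest)
    		return 0;

    	/* Fast path: the guest buffer already covers the features */
    	if (!(guest_needs_features & ~fpu->guest_fpstate->xfeatures))
    		return 0;

    	/* One-off slow path: replace the guest fpstate with a larger one */
    	return fpstate_realloc(&fpu->guest_fpstate, guest_needs_features);
    }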
Jing,

On Fri, Oct 15 2021 at 09:00, Jing2 Liu wrote:
> On 10/14/2021 11:01 PM, Paolo Bonzini wrote:
> For the guest dynamic state support, based on the latest discussion,
> four copies of XFD need to be tracked and switched, I'd like to list
> them as follows.

There will not be 4 copies. Read my last mail and think about the
consequences.

I'm really tired of this tinkering frenzy. There is only one correct
approach to this:

  1) Define the requirements

  2) Define the best trapping mechanism

  3) Sit down, look at the existing code including the FPU rework for
     AMX. Come up with a proper integration plan

  4) Clean up the existing KVM FPU mess further so the integration
     can be done smoothly

  5) Add the required infrastructure in FPU core and KVM

  6) Add the trapping mechanics

  7) Enable the feature

What you are doing is looking for the quickest way to duct tape this
into the existing mess.

That might be matching the KVM expectations, but it's not going to
happen.

KVM already violates all well known rules of encapsulation, just
fiddles in the guts of the FPU mechanism, and duplicates code in buggy
ways.

This has to stop now!

You are free to ignore me, but all you are going to achieve is to delay
AMX integration further. Seriously, I'm not even going to reply to
anything which is not based on the above approach.

I'm sure you can figure out at which point we are at the moment.

Thanks,

        tglx
On 15/10/21 12:50, Thomas Gleixner wrote:
> That might be matching the KVM expectations, but it's not going to
> happen.
>
> KVM already violates all well known rules of encapsulation, just
> fiddles in the guts of the FPU mechanism, and duplicates code in buggy
> ways.
>
> This has to stop now!

FWIW, I totally agree about that. Over the years we've gotten more
well-thought-out hooks in the kernel for KVM and fewer hacks, and I'll
only be happy if that extends to the FPU code, which I'm quite wary of
touching. Most of it has been unchanged since Ingo's last rewrite.

Paolo

> You are free to ignore me, but all you are going to achieve is to delay
> AMX integration further. Seriously, I'm not even going to reply to
> anything which is not based on the above approach.
Hi Thomas,

On 10/15/2021 6:50 PM, Thomas Gleixner wrote:
> Jing,
>
> On Fri, Oct 15 2021 at 09:00, Jing2 Liu wrote:
>> On 10/14/2021 11:01 PM, Paolo Bonzini wrote:
>> For the guest dynamic state support, based on the latest discussion,
>> four copies of XFD need to be tracked and switched, I'd like to list
>> them as follows.
>
> There will not be 4 copies. Read my last mail and think about the
> consequences.

Actually I saw there are fpu_init_fpstate_user(vcpu->arch.user_fpu) and
fpu_init_fpstate_user(vcpu->arch.guest_fpu) in the full series, so I
understood that we'd keep it this way. (Your last mail corrects me.)

But yes, these xstate copies really make things complex and bad, and
I'm glad to work toward a good clean way. I'll reply with my thinking
(based on your approach below) on that thread later.

> I'm really tired of this tinkering frenzy. There is only one correct
> approach to this:
>
>  1) Define the requirements
>
>  2) Define the best trapping mechanism
>
>  3) Sit down, look at the existing code including the FPU rework for
>     AMX. Come up with a proper integration plan
>
>  4) Clean up the existing KVM FPU mess further so the integration
>     can be done smoothly
>
>  5) Add the required infrastructure in FPU core and KVM
>
>  6) Add the trapping mechanics
>
>  7) Enable the feature
>
> What you are doing is looking for the quickest way to duct tape this
> into the existing mess.
>
> That might be matching the KVM expectations, but it's not going to
> happen.
>
> KVM already violates all well known rules of encapsulation, just
> fiddles in the guts of the FPU mechanism, and duplicates code in buggy
> ways.
>
> This has to stop now!

Yes, this is an opportunity to make the current KVM FPU code better.

> You are free to ignore me,

Of course I won't, because I also want to find a good way that both KVM
and the kernel are glad to use.

Thanks,
Jing

> but all you are going to achieve is to delay AMX integration further.
> Seriously, I'm not even going to reply to anything which is not based
> on the above approach.
>
> I'm sure you can figure out at which point we are at the moment.
>
> Thanks,
>
>         tglx
Hi Thomas,

On 10/15/2021 5:36 PM, Thomas Gleixner wrote:
> Paolo,
>
> On Thu, Oct 14 2021 at 21:14, Thomas Gleixner wrote:
>> On Thu, Oct 14 2021 at 17:01, Paolo Bonzini wrote:
>>>> vcpu_create()
>>>>
>>>>     fpu_init_fpstate_user(guest_fpu, supported_xcr0)
>>>>
>>>> That will (it does not today) do:
>>>>
>>>>     guest_fpu::__state_perm = supported_xcr0 &
>>>>                               xstate_get_group_perm();
>>>>
>>>> Then you have the information you need right in the guest FPU.
>>>
>>> Good, I wasn't aware of the APIs that will be there.
>>
>> Me neither, but that's a pretty obvious consequence of the work I'm
>> doing for AMX. So I made it up for you. :)
>
> let me make some more up for you!
>
> If you carefully look at part 2 of the rework, then you might notice
> that there is a fundamental change which allows a real simplification
> for KVM FPU handling:
>
>     current->thread.fpu.fpstate
>
> is now a pointer. So you can spare one FPU allocation because we can
> now do:

Trying to understand your point: it seems struct fpu will gain a
guest_fpstate pointer, which is allocated at vcpu_create() time by the
following function, and load/put swap between the two pointers. What I
was thinking is that the vcpu keeps guest_fpu and deletes user_fpu.

>     fpu_attach_guest_fpu(supported_xcr0)
>     {
>         guest_fpstate = alloc_fpstate(supported_xcr0);

I suppose this is called at vcpu_create(). I'm not sure about the
reason for the supported_xcr0 argument here. supported_xcr0[n]=1 and
guest __state_perm[n]=1, which is requested before vcpu_create(), so
would this allocate a full-size buffer already at the vcpu_create()
stage? Or do you mean vcpu->arch.guest_supported_xcr0? Please correct
me if I misunderstood. Thanks.

>         fpu_init_fpstate_user(guest_fpstate, supported_xcr0);
>         current->thread.fpu.guest_fpstate = guest_fpstate;
>     }
>
> fpu_swap_kvm_fpu() becomes in the first step:
>
>     fpu_swap_kvm_fpu(bool enter_guest)
>     {
>         save_fpregs_to_fpstate(current->thread.fpu.fpstate);
>
>         swap(current->thread.fpu.fpstate, current->thread.fpu.guest_fpstate);
>
>         restore_fpregs_from_fpstate(current->thread.fpu.fpstate);
>     }
>
> @enter_guest will allow to do some sanity checks.
>
> In a second step:
>
>     fpu_swap_kvm_fpu(bool enter_guest, u64 guest_needs_features)
>     {
>         possibly_reallocate(enter_guest, guest_needs_features);

When KVM traps the guest's WRMSR to XFD in the guest #NM handler, I
think KVM needs to allocate an fpstate buffer for the full feature set.
Because on the next vmexit the guest might already have dynamic state,
and KVM can be preempted before running fpu_swap_kvm_fpu(). Thus, by
this point current->thread.fpu.fpstate must already have enough space
for saving the guest.

Thanks,
Jing

>         save_fpregs_to_fpstate(current->thread.fpu.fpstate);
>
>         swap(current->thread.fpu.fpstate, current->thread.fpu.guest_fpstate);
>
>         restore_fpregs_from_fpstate(current->thread.fpu.fpstate);
>
>         possibly_reallocate(enter_guest, guest_needs_features);
>     }
>
> @guest_needs_features is the information which you gather via guest
> XCR0 and guest XFD.
>
> So fpu_swap_kvm_fpu() is going to be the place where reallocation
> happens, and that's good enough for both cases:
>
>     vcpu_run()
>
>         fpu_swap_kvm_fpu();     <- 1
>
>         while (...)
>             vmenter();
>
>         fpu_swap_kvm_fpu();     <- 2
>
> #1 QEMU user space used the feature and already has a large fpstate
>
> #2 Guest requires the feature but has not used it yet (XCR0/XFD trapping)
>
> See?
>
> It's not only correct, it's also simple and truly beautiful.
>
> Thanks,
>
>         tglx
On 15/10/21 16:24, Liu, Jing2 wrote:
>> fpu_swap_kvm_fpu(bool enter_guest, u64 guest_needs_features)
>> {
>>     possibly_reallocate(enter_guest, guest_needs_features);
> When KVM traps the guest's WRMSR to XFD in the guest #NM handler, I
> think KVM needs to allocate an fpstate buffer for the full feature set.

You mean XCR0 and XFD (not XFD in #NM), but yeah, at the point of
fpu_swap_kvm_fpu we are in atomic context.

Still, for now the first pass of the AMX implementation doesn't need to
do anything but swap the pointers, and it can simply allocate the full
buffer at vCPU creation.

Paolo

> Because on the next vmexit the guest might already have dynamic state,
> and KVM can be preempted before running fpu_swap_kvm_fpu().
> Thus, by this point current->thread.fpu.fpstate must already have
> enough space for saving the guest.
Jing,

On Fri, Oct 15 2021 at 14:24, Jing2 Liu wrote:
> On 10/15/2021 5:36 PM, Thomas Gleixner wrote:
>> If you carefully look at part 2 of the rework, then you might notice
>> that there is a fundamental change which allows a real simplification
>> for KVM FPU handling:
>>
>>     current->thread.fpu.fpstate
>>
>> is now a pointer. So you can spare one FPU allocation because we can
>> now do:
>
> Trying to understand your point: it seems struct fpu will gain a
> guest_fpstate pointer, which is allocated at vcpu_create() time by the
> following function, and load/put swap between the two pointers. What I
> was thinking is that the vcpu keeps guest_fpu and deletes user_fpu.

Unfortunately we can't do that in vcpu_create() because the thread
doing that is not necessarily the vCPU thread which invokes vcpu_run()
later. But that does not matter much.

So vcpu_create() will do:

    vcpu->arch.guest_fpstate = fpu_alloc_guest_fpstate();

and vcpu_run() will invoke fpu_swap_kvm_fpstate(guest_fpstate, ...)
which in turn does:

    int fpu_swap_kvm_fpstate(struct fpstate *guest_fps, bool enter_guest,
                             u64 restore_mask)
    {
            struct fpu *fpu = &current->thread.fpu;
            struct fpstate *fps = fpu->fpstate;

            fpregs_lock();
            if (!test_thread_flag(TIF_NEED_FPU_LOAD))
                    save_fpregs_to_fpstate(fpu);

            /* Swap fpstate */
            if (enter_guest) {
                    fpu->__task_fpstate = fps;
                    fpu->fpstate = guest_fps;
            } else {
                    fpu->fpstate = fpu->__task_fpstate;
                    fpu->__task_fpstate = NULL;
            }

            fps = fpu->fpstate;

            /*
             * Once XFD support is added, XFD switching happens here
             * right before the restore.
             */

            restore_mask &= XFEATURE_MASK_FPSTATE;
            restore_fpregs_from_fpstate(fps, restore_mask);

            fpregs_mark_activate();
            fpregs_unlock();
            return 0;
    }

That's a simplified version of what I have already running on top of
the FPU rework part 3, but you get the idea.

If you are curious:

  https://git.kernel.org/pub/scm/linux/kernel/git/tglx/devel.git/log/?h=x86/fpu-3-kvm

If you compare that to the current KVM FPU swap handling then you'll
notice that there is only _one_ buffer required, and in case
TIF_NEED_FPU_LOAD is set there is no memcpy() required either, because
the state is already saved in the to-be-swapped-out buffer. That's a
valuable cleanup and improvement independent of AMX.

See?

This also makes the confidential computing case less awkward because we
can do:

        if (!fpstate->is_scratch && !test_thread_flag(TIF_NEED_FPU_LOAD))
                save_fpregs_to_fpstate(fpu);

instead of the current hack of freeing guest_fpu. See the git tree.

Though I'm not sure whether the logic for this "is_scratch"
optimization is correct as I implemented it, but I'm neither sure that
the current logic in KVM is correct. But that's just an implementation
detail which needs to be looked at.

XFD support will also be fully consistently integrated: XFD will be
switched before the restore, and this will be fully consistent with
everything we are doing vs. the host side support, because
current->thread.fpu.fpstate->xfd will always be the authoritative
answer. No need to copy any information from one place to another.

Ergo: No 4 copies of XFD.

>> In a second step:
>>
>>     fpu_swap_kvm_fpu(bool enter_guest, u64 guest_needs_features)
>>     {
>>         possibly_reallocate(enter_guest, guest_needs_features);
>
> When KVM traps the guest's WRMSR to XFD in the guest #NM handler, I
> think KVM needs to allocate an fpstate buffer for the full feature set.
> Because on the next vmexit the guest might already have dynamic state,
> and KVM can be preempted before running fpu_swap_kvm_fpu().
> Thus, by this point current->thread.fpu.fpstate must already have
> enough space for saving the guest.
I think we are talking past each other. You are looking at this from
the point of view of what you have been doing so far, and I am looking
at it from a design and abstraction point of view. That explains why we
have different expectations vs. XCR0/XFD/#NM.

So the regular boring case will be:

  H  vcpu_run()
  H    fpu_swap_kvm_fpstate()      <- Sets guest_fpstate->xfd
  H
  H    while (..) {
  H       vmenter()
  G         ....
  G         ....
  G            -> vmexit (unrelated to XCR0/XFD)
  H       ...
  H    }
  H
  H    fpu_swap_kvm_fpstate()      <- Sets user (task) XFD

Now let's look at the XFD/XCR0 intercept case:

  H  vcpu_run()
  H    fpu_swap_kvm_fpstate()      <- Sets guest_fpstate->xfd
  H
  H    while (..) {
  H       vmenter()
  G         ....
  G         write to XFD/XCR0;  -> intercept
  H       ...
  H       if (reason == write to XFD/XCR0) {
  H          if (needs_action(guest_fpstate, $XFDVAL, $XCR0VAL)) {
  H             fpstate_set_realloc_request(guest_fpstate);
  H             break;
  H          }
  H       }
  H       .....
  H    }
  H
  H    fpu_swap_kvm_fpstate()

fpu_swap_kvm_fpstate() will see the reallocation request in
guest_fpstate and act accordingly. Both user and guest state are fully
consistent at that point. Why?

It does not matter at all whether the wrmsrl(XFD) or the XSETBV
affecting XCR0 in the guest happens because the guest decided it is
cool to enable it just for fun, or because the guest took a #NM and
wrote to XFD. In both cases the XFD controlled component is in init
state at that point. So there is _nothing_ to save and _nothing_ which
can be lost and no buffer size problem at all.

Therefore it also does not matter whether the vCPU thread gets
preempted or not on the way out to fpu_swap_kvm_fpstate(). It's all
still consistent.

So fpu_swap_kvm_fpstate() will do in case of a reallocation request:

  1) Allocate a new guest fpstate

     If that fails, it does a save and restore with the existing
     fpstates and returns an error code which makes KVM drop out to
     user space to decide what to do.

     On success, initialize the state properly, including the new XFD
     value.

  2) Save guest state into the new guest fpstate

  3) Restore host (user) state from the existing fpstate

See?

It does not even have to allocate a new host (user) fpstate to do that.
Why? Because once the fpstate pointer is flipped, the state is
consistent in both directions including XFD. See?

Now if you think about the other way round, then the same principle
applies: If the host (user) side of the vCPU thread used a dynamic
state it has a large buffer, but that does not require the guest side
buffer to be large as well.

So this is what Paolo wanted, right? I was fighting that because with
the existing three buffer scheme this cannot work. See?

The point is that:

  - the simple state switching was impossible because the proposed host
    side infrastructure had the wrong abstraction: It just reallocated
    the register buffer, but did not give it a container which carries
    the other relevant information, i.e. features, sizes and,
    importantly, xfd.

  - the simple state switching was impossible because the user/guest
    FPU concept of KVM was preventing that.

  - it was tried to plug the reallocation into the wrong place: There
    is no point to do that from inside the vcpu_run() loop. It's a one
    off event and the extra overhead of going out to the place where
    this can be handled sanely does not matter at all.

Just to be clear: I'm not blaming you for any of this at all. There
have been enough senior people involved who should have seen the
obvious instead of misguiding you. So please just forget the horrors
which you went through due to lack of proper guidance, sit back and
think about it.
The code in that git branch is just a first step and requires a few
tweaks to get the reallocation handled correctly, but if you look at
the above then you might realize that there are two related but largely
independent problems to solve:

  1) What needs to be intercepted and analyzed in the intercept
     handlers of XCR0 and XFD

  2) Handling the reallocation problem

#1 is a KVM problem and #2 is a host FPU problem.

As you are a KVM wizard, I let you sort out #1 with the KVM folks, and
I'm looking at the #2 part together with Chang and Dave. Why? #1 is not
my area of expertise, but I surely might have opinions. #2 is not any
different from handling the host side lazy reallocation.

Can you spot the difference between the original approach and the
approach I have taken?

Maybe you understand now why I was explaining over and over that we
need consistent state and asked everyone to look at the AMX host
series.

Just for the record: When I looked at that KVM FPU switching exactly
two weeks ago, while I was thinking about the right abstraction for
AMX, it was bloody obvious that just reallocating the register state
buffer is wrong. And it was bloody obvious that the user/guest FPU
concept of KVM is nonsense to begin with and going to be in the way of
doing a clean integration. Why?

Because when you switch buffers, you need to switch the state
information which belongs to the buffer, i.e. features, sizes and xfd,
as well, because otherwise you create inconsistent state. Sure, you can
copy tons of stuff back and forth, but why would you do that if you can
just switch the full state by swapping the pointer to a container which
contains all the information which is needed, and which makes
everything else (KVM trap bits aside) just work.

So you can rightfully ask why I did not tell you that plan right away.
The reason is that I wanted all of you to look at the AMX host series,
and I desperately hoped that my insisting on state consistency would
make at least one of the involved people come back and say:

  "Thomas, why don't you do the obvious in fpu_swap_kvm_fpu() and
   switch the fpstate pointers? That FPU switching with the three
   buffers you kept around is bloody stupid."

My answer would have been: "Doh, of course, it's obvious. Stupid me."

But that did not happen. Even when I brought up the

    vcpu_create()
        -> alloc_and_attach()
    vcpu_run()
        -> swap()
        -> vmenter_loop()
        -> swap()
    vcpu_destroy()
        -> detach_and_free()

proposal, nobody told me: "Thomas, this can't work because the thread
which creates the vCPU is not necessarily the same as the one which
runs it."

No, I had to ask the question myself because I had second thoughts when
I was starting to implement that scheme. I had not thought about that
when I wrote it up in mail, simply because I'm not a KVM expert. But it
did not matter because the concept stayed the same, just the
implementation details changed:

    vcpu_create()
        -> alloc()
    vcpu_run()
        -> attach()
        -> vmenter_loop()
        -> detach()
    vcpu_destroy()
        -> free()

Why? Because everyone was busy trying to cram their hacks into the code
I just changed, instead of rethinking the situation. See?

Jing, as I said before, I'm not blaming you personally. What I blame is
the general approach to adding new features to the kernel: Hack it into
the existing mess until it works by some definition of works. That
simply cannot go anywhere because it makes the code slow and
unmaintainable in the long run.
If a new feature does not fit nicely into the existing code, then the
only viable approach is to sit back, look at the requirements of the
new feature, and come up with proper abstractions and a plan how to
refactor the code, so that the feature falls into place at the very end
and does not create mess and overhead all over the place.

If you look at the three series I posted, then you see not a single bit
which does not make sense on its own, except for the last series which
adds the fpu config data structure with the pairs of default_* and
max_*.

Even the fpstate pointer makes sense on its own, because the resulting
cleanup of the KVM FPU switch code is already worthwhile even w/o AMX
and XFD, in terms of memory consumption, performance and robustness.
See?

The real AMX stuff which still needs to be posted is just building upon
this refactoring. It adds the necessary infrastructure for AMX, which
is all slow path code. In the hotpath it adds only the XFD update at
exactly 4 places where state is saved or restored. IOW, all hotpath
operations are exactly the same.

If XFD is not available on a CPU, then the overhead of the XFD update
code is a few extra NOPs due to the patched out static branch. If
enabled, then yes, it has an extra conditional, and when the XFD value
really changes then a wrmsrl, but that's inevitable.

See?

Now if you sit back and look at the KVM concepts I explained above,
then you surely can see that the overhead for the KVM case is going to
be exactly a few extra NOPs in the hotpath when XFD is not available.
When XFD is enabled, then yes, it needs the extra setup for XFD, but
the common case in the vmenter_loop() will have only a minimalistic
overhead, if at all. The common case in fpu_swap_kvm_fpstate() will
only grow a single conditional in the hotpath:

    if (unlikely(guest_fpstate->need_realloc)) {
    }

but that's not even measurable.

See?

Thanks,

        Thomas
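On top of fpu_swap_kvm_fpstate() as sketched earlier in this mail, the
KVM entry/exit hooks would collapse to something like the following;
this is an extrapolation from the thread, not the code in the
referenced branch:

    /*
     * Sketch: swap in the guest fpstate; PKRU is handled separately
     * in kvm_x86_ops.run(), hence it is masked out of the restore.
     */
    static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
    {
    	fpu_swap_kvm_fpstate(vcpu->arch.guest_fpstate, true,
    			     ~XFEATURE_MASK_PKRU);
    	trace_kvm_fpu(1);
    }

    /* Sketch: swap back to the task fpstate with a full restore mask. */
    static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
    {
    	fpu_swap_kvm_fpstate(vcpu->arch.guest_fpstate, false, ~0ULL);
    	++vcpu->stat.fpu_reload;
    	trace_kvm_fpu(0);
    }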
Hi Paolo,

> On 10/14/2021 7:39 PM, Paolo Bonzini wrote:
>> On 14/10/21 13:30, Liu, Jing2 wrote:
>> I guess what we're worrying about is that when KVM is scheduled out, a
>> nonzero XFD_ERR can be clobbered by another host thread. We can save
>> the guest XFD_ERR on sched_out and restore it before the next vmenter.
>> The kernel is assumed not to use AMX, so a softirq won't lose it.
>> I think this solves the problem. So we can pass through both reads and
>> writes of the MSR directly, with no need for an rdmsr(XFD_ERR) on
>> every vmexit.
>
> Correct; you can also use the "user-return MSRs" machinery (until Linux
> starts using AMX in the kernel, but that shouldn't happen too soon).
>

Thanks for the suggestion. The user-return MSR mechanism, as used by
emulated MSRs, calls kvm_set_user_return_msr() to wrmsr the guest
value, updates the current value, and switches back to the host value
once the kernel exits to userspace.

XFD_ERR, however, is changed by hardware while in the guest, so KVM
needs to capture the guest XFD_ERR value at the right times, which is
different from the other user-return MSRs: e.g. at KVM preemption time
and at kvm_put_guest_fpu() time, and in both cases no wrmsr is needed.
And kvm_put_guest_fpu() does happen on the return to userspace. Also,
XFD_ERR cannot rely on vmx->guest_uret_msrs_loaded to be updated before
vmenter, since the current value may not be up to date. My feeling is
that this mechanism is not well suited to this case and would need
special handling.

Since a nonzero guest XFD_ERR at vmexit is a rare case, how about
saving XFD_ERR on preemption, marking flag=true, and restoring it if
non-zero before the vcpu enters the guest again? This seems a simple
and direct way. The drawback is that if XFD_ERR did not change while
scheduled out, KVM still needs a wrmsr, but this only happens when it's
non-zero && flag==true.

Thanks,
Jing

> Paolo
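A sketch of the preemption-based scheme proposed above; the vcpu->arch
fields (guest_xfd_err, xfd_err_valid) are hypothetical, rdmsrl()/wrmsrl()
are the usual kernel MSR accessors, and MSR_IA32_XFD_ERR is assumed to
be defined:

    /* Called from KVM's sched_out path (e.g. the preempt notifier). */
    static void kvm_save_guest_xfd_err(struct kvm_vcpu *vcpu)
    {
    	rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_xfd_err);
    	vcpu->arch.xfd_err_valid = true;
    }

    /* Called right before the next vmenter. */
    static void kvm_restore_guest_xfd_err(struct kvm_vcpu *vcpu)
    {
    	/* The wrmsr only happens in the rare non-zero && flag case */
    	if (vcpu->arch.xfd_err_valid && vcpu->arch.guest_xfd_err)
    		wrmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_xfd_err);
    	vcpu->arch.xfd_err_valid = false;
    }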
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -12,6 +12,8 @@
 #define _ASM_X86_FPU_API_H
 #include <linux/bottom_half.h>
 
+#include <asm/fpu/types.h>
+
 /*
  * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It
  * disables preemption so be careful if you intend to use it for long periods
@@ -108,4 +110,10 @@ extern int cpu_has_xfeatures(u64 xfeatur
 
 static inline void update_pasid(void) { }
 
+/* fpstate-related functions which are exported to KVM */
+extern void fpu_init_fpstate_user(struct fpu *fpu);
+
+/* KVM specific functions */
+extern void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask);
+
 #endif /* _ASM_X86_FPU_API_H */
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -74,14 +74,8 @@ static __always_inline __pure bool use_f
 	return static_cpu_has(X86_FEATURE_FXSR);
 }
 
-/*
- * fpstate handling functions:
- */
-
 extern union fpregs_state init_fpstate;
-
 extern void fpstate_init_user(union fpregs_state *state);
-extern void fpu_init_fpstate_user(struct fpu *fpu);
 
 #ifdef CONFIG_MATH_EMULATION
 extern void fpstate_init_soft(struct swregs_state *soft);
@@ -381,12 +375,7 @@ static inline int os_xrstor_safe(struct
 	return err;
 }
 
-extern void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask);
-
-static inline void restore_fpregs_from_fpstate(union fpregs_state *fpstate)
-{
-	__restore_fpregs_from_fpstate(fpstate, xfeatures_mask_fpstate());
-}
+extern void restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask);
 
 extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size);
 
@@ -467,7 +456,7 @@ static inline void fpregs_restore_userre
 	 */
 	mask = xfeatures_mask_restore_user() |
 	       xfeatures_mask_supervisor();
-	__restore_fpregs_from_fpstate(&fpu->state, mask);
+	restore_fpregs_from_fpstate(&fpu->state, mask);
 
 	fpregs_activate(fpu);
 	fpu->last_cpu = cpu;
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -124,9 +124,8 @@ void save_fpregs_to_fpstate(struct fpu *
 	asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave));
 	frstor(&fpu->state.fsave);
 }
-EXPORT_SYMBOL(save_fpregs_to_fpstate);
 
-void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask)
+void restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask)
 {
 	/*
 	 * AMD K7/K8 and later CPUs up to Zen don't save/restore
@@ -151,7 +150,31 @@ void __restore_fpregs_from_fpstate(union
 		frstor(&fpstate->fsave);
 	}
 }
-EXPORT_SYMBOL_GPL(__restore_fpregs_from_fpstate);
+
+#if IS_ENABLED(CONFIG_KVM)
+void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask)
+{
+	fpregs_lock();
+
+	if (save) {
+		if (test_thread_flag(TIF_NEED_FPU_LOAD)) {
+			memcpy(&save->state, &current->thread.fpu.state,
+			       fpu_kernel_xstate_size);
+		} else {
+			save_fpregs_to_fpstate(save);
+		}
+	}
+
+	if (rstor) {
+		restore_mask &= xfeatures_mask_fpstate();
+		restore_fpregs_from_fpstate(&rstor->state, restore_mask);
+	}
+
+	fpregs_mark_activate();
+	fpregs_unlock();
+}
+EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpu);
+#endif
 
 void kernel_fpu_begin_mask(unsigned int kfpu_mask)
 {
@@ -459,7 +482,6 @@ void fpregs_mark_activate(void)
 	fpu->last_cpu = smp_processor_id();
 	clear_thread_flag(TIF_NEED_FPU_LOAD);
 }
-EXPORT_SYMBOL_GPL(fpregs_mark_activate);
 
 /*
  * x87 math exception handling:
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -136,7 +136,6 @@ static void __init fpu__init_system_gene
  * components into a single, continuous memory block:
  */
 unsigned int fpu_kernel_xstate_size __ro_after_init;
-EXPORT_SYMBOL_GPL(fpu_kernel_xstate_size);
 
 /* Get alignment of the TYPE. */
 #define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test)
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -65,7 +65,6 @@ static short xsave_cpuid_features[] __in
  * XSAVE buffer, both supervisor and user xstates.
  */
 u64 xfeatures_mask_all __ro_after_init;
-EXPORT_SYMBOL_GPL(xfeatures_mask_all);
 
 static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init =
 	{ [ 0 ... XFEATURE_MAX - 1] = -1};
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -68,7 +68,9 @@
 #include <asm/mce.h>
 #include <asm/pkru.h>
 #include <linux/kernel_stat.h>
-#include <asm/fpu/internal.h> /* Ugh! */
+#include <asm/fpu/api.h>
+#include <asm/fpu/xcr.h>
+#include <asm/fpu/xstate.h>
 #include <asm/pvclock.h>
 #include <asm/div64.h>
 #include <asm/irq_remapping.h>
@@ -9899,58 +9901,27 @@ static int complete_emulated_mmio(struct
 	return 0;
 }
 
-static void kvm_save_current_fpu(struct fpu *fpu)
-{
-	/*
-	 * If the target FPU state is not resident in the CPU registers, just
-	 * memcpy() from current, else save CPU state directly to the target.
-	 */
-	if (test_thread_flag(TIF_NEED_FPU_LOAD))
-		memcpy(&fpu->state, &current->thread.fpu.state,
-		       fpu_kernel_xstate_size);
-	else
-		save_fpregs_to_fpstate(fpu);
-}
-
 /* Swap (qemu) user FPU context for the guest FPU context. */
 static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 {
-	fpregs_lock();
-
-	kvm_save_current_fpu(vcpu->arch.user_fpu);
-
 	/*
-	 * Guests with protected state can't have it set by the hypervisor,
-	 * so skip trying to set it.
+	 * Guests with protected state have guest_fpu == NULL which makes
+	 * the swap only save the host state. Exclude PKRU from restore as
+	 * it is restored separately in kvm_x86_ops.run().
 	 */
-	if (vcpu->arch.guest_fpu)
-		/* PKRU is separately restored in kvm_x86_ops.run. */
-		__restore_fpregs_from_fpstate(&vcpu->arch.guest_fpu->state,
-					      ~XFEATURE_MASK_PKRU);
-
-	fpregs_mark_activate();
-	fpregs_unlock();
-
+	fpu_swap_kvm_fpu(vcpu->arch.user_fpu, vcpu->arch.guest_fpu,
+			 ~XFEATURE_MASK_PKRU);
 	trace_kvm_fpu(1);
 }
 
 /* When vcpu_run ends, restore user space FPU context. */
 static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 {
-	fpregs_lock();
-
 	/*
-	 * Guests with protected state can't have it read by the hypervisor,
-	 * so skip trying to save it.
+	 * Guests with protected state have guest_fpu == NULL which makes
+	 * the swap only restore the host state.
 	 */
-	if (vcpu->arch.guest_fpu)
-		kvm_save_current_fpu(vcpu->arch.guest_fpu);
-
-	restore_fpregs_from_fpstate(&vcpu->arch.user_fpu->state);
-
-	fpregs_mark_activate();
-	fpregs_unlock();
-
+	fpu_swap_kvm_fpu(vcpu->arch.guest_fpu, vcpu->arch.user_fpu, ~0ULL);
 	++vcpu->stat.fpu_reload;
 	trace_kvm_fpu(0);
 }
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -47,7 +47,7 @@ static bool ex_handler_fprestore(const s
 	WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.",
 		  (void *)instruction_pointer(regs));
 
-	__restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate());
+	restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate());
 	return true;
 }
Swapping the host/guest FPU is directly fiddling with FPU internals
which requires 5 exports. The upcoming support of dynamically enabled
states would even need more.

Implement a swap function in the FPU core code and export that instead.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: kvm@vger.kernel.org
Cc: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/fpu/api.h      |    8 +++++
 arch/x86/include/asm/fpu/internal.h |   15 +---------
 arch/x86/kernel/fpu/core.c          |   30 ++++++++++++++++++---
 arch/x86/kernel/fpu/init.c          |    1 
 arch/x86/kernel/fpu/xstate.c        |    1 
 arch/x86/kvm/x86.c                  |   51 +++++++-----------------------------
 arch/x86/mm/extable.c               |    2 -
 7 files changed, 48 insertions(+), 60 deletions(-)