Message ID | 20171114215424.32214-2-riel@redhat.com (mailing list archive) |
---|---|
State | New, archived |
2017-11-15 5:54 GMT+08:00 <riel@redhat.com>:
> From: Rik van Riel <riel@redhat.com>
>
> Currently, every time a VCPU is scheduled out, the host kernel will
> first save the guest FPU/xstate context, then load the qemu userspace
> FPU context, only to then immediately save the qemu userspace FPU
> context back to memory. When scheduling in a VCPU, the same extraneous
> FPU loads and saves are done.
>
> This could be avoided by moving from a model where the guest FPU is
> loaded and stored with preemption disabled, to a model where the
> qemu userspace FPU is swapped out for the guest FPU context for
> the duration of the KVM_RUN ioctl.

What will happen if CONFIG_PREEMPT is enabled?

Regards,
Wanpeng Li

> This is done under the VCPU mutex, which is also taken when other
> tasks inspect the VCPU FPU context, so the code should already be
> safe for this change. That should come as no surprise, given that
> s390 already has this optimization.
>
> No performance changes were detected in quick ping-pong tests on
> my 4 socket system, which is expected since an FPU+xstate load is
> on the order of 0.1us, while ping-ponging between CPUs is on the
> order of 20us, and somewhat noisy.
>
> There may be other tests where performance changes are noticeable.
>
> Signed-off-by: Rik van Riel <riel@redhat.com>
> Suggested-by: Christian Borntraeger <borntraeger@de.ibm.com>
> ---
>  arch/x86/include/asm/kvm_host.h | 13 +++++++++++++
>  arch/x86/kvm/x86.c              | 34 +++++++++++++---------------------
>  include/linux/kvm_host.h        |  2 +-
>  3 files changed, 27 insertions(+), 22 deletions(-)
>
> [...]
On Wed, 2017-11-15 at 08:47 +0800, Wanpeng Li wrote:
> 2017-11-15 5:54 GMT+08:00 <riel@redhat.com>:
> > This could be avoided by moving from a model where the guest FPU is
> > loaded and stored with preemption disabled, to a model where the
> > qemu userspace FPU is swapped out for the guest FPU context for
> > the duration of the KVM_RUN ioctl.
>
> What will happen if CONFIG_PREEMPT is enabled?

The scheduler will save the guest FPU context when a VCPU thread is
preempted, and restore it when it is scheduled back in.

While inside kvm_arch_vcpu_ioctl_run, the task FPU context will be
the guest FPU context, instead of the qemu userspace FPU context.
2017-11-15 11:03 GMT+08:00 Rik van Riel <riel@redhat.com>:
> On Wed, 2017-11-15 at 08:47 +0800, Wanpeng Li wrote:
>> What will happen if CONFIG_PREEMPT is enabled?
>
> The scheduler will save the guest FPU context when a VCPU thread is
> preempted, and restore it when it is scheduled back in.

I mean that all the involved contexts use the fpu. Before the patch,
if a kernel preemption occurs:

context_switch
  -> prepare_task_switch
     -> fire_sched_out_preempt_notifiers
        -> kvm_sched_out
           -> kvm_arch_vcpu_put
              -> kvm_put_guest_fpu
                 -> copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu)
                    => save xsave area to guest fpu buffer
                 -> __kernel_fpu_end
                    -> copy_kernel_to_fpregs(&current->thread.fpu.state)
                       => restore prev vCPU qemu userspace FPU to the xsave area
  -> switch_to
     -> __switch_to
        -> switch_fpu_prepare
           -> copy_fpregs_to_fpstate
              => save xsave area to prev vCPU qemu userspace FPU
        -> switch_fpu_finish
           -> copy_kernel_to_fpregs
              => restore next task FPU to xsave area

After the patch:

context_switch
  -> prepare_task_switch
     -> fire_sched_out_preempt_notifiers
        -> kvm_sched_out
  -> switch_to
     -> __switch_to
        -> switch_fpu_prepare
           -> copy_fpregs_to_fpstate
              => Oops: this saves the xsave area to the prev vCPU's qemu
                 userspace FPU, but the guest FPU buffer is what is
                 actually loaded in the xsave area, so you transmit the
                 guest FPU in the xsave area into the prev vCPU's qemu
                 userspace FPU

Regards,
Wanpeng Li
On 2017/11/15 05:54, riel@redhat.com wrote:
> From: Rik van Riel <riel@redhat.com>
>
> Currently, every time a VCPU is scheduled out, the host kernel will
> first save the guest FPU/xstate context, then load the qemu userspace
> FPU context, only to then immediately save the qemu userspace FPU
> context back to memory. When scheduling in a VCPU, the same extraneous
> FPU loads and saves are done.

Rik, be careful with VM migration. With your patch, I don't think you
can load the fpu/xstate context accurately after VM migration.

Quan
Alibaba Cloud

> This could be avoided by moving from a model where the guest FPU is
> loaded and stored with preemption disabled, to a model where the
> qemu userspace FPU is swapped out for the guest FPU context for
> the duration of the KVM_RUN ioctl.
>
> [...]
On Wed, 2017-11-15 at 12:33 +0800, Wanpeng Li wrote:
> After the patch:
>
> context_switch
>   -> prepare_task_switch
>      -> fire_sched_out_preempt_notifiers
>         -> kvm_sched_out
>   -> switch_to
>      -> __switch_to
>         -> switch_fpu_prepare
>            -> copy_fpregs_to_fpstate
>               => Oops: this saves the xsave area to the prev vCPU's
>                  qemu userspace FPU, but the guest FPU buffer is what
>                  is actually loaded in the xsave area [...]

When entering kvm_arch_vcpu_ioctl_run we save the qemu userspace
FPU context in &vcpu->arch.user_fpu, and we restore that before
leaving kvm_arch_vcpu_ioctl_run.

Userspace should always see the userspace FPU context, no?

Am I overlooking anything?
On Wed, 2017-11-15 at 14:53 +0800, quan.xu04@gmail.com wrote:
> Rik, be careful with VM migration. With your patch, I don't think you
> can load the fpu/xstate context accurately after VM migration.

Can you explain why you believe that?

Getting the guest FPU or XSTATE is done under the vcpu->mutex.

This patch switches out guest and userspace FPU/XSTATE under the
vcpu->mutex, and switches it back before releasing the vcpu->mutex.

By the time a KVM_GET_FPU has obtained the vcpu->mutex, the guest
FPU state will be in vcpu->arch.guest_fpu.state, where you expect
it to be.

What am I missing?
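For reference, here is a minimal sketch of the serialization Rik describes, loosely based on the KVM ioctl dispatch of that era (virt/kvm/kvm_main.c; the exact structure varied between releases, and error handling and the copy-out to userspace are trimmed). The point is that KVM_RUN and KVM_GET_FPU both execute with vcpu->mutex held:

```c
/*
 * Editor's sketch, not the verbatim ~v4.14 source: vcpu_load() takes
 * vcpu->mutex and vcpu_put() releases it, so the FPU swap done inside
 * KVM_RUN can never be observed half-finished by KVM_GET_FPU.
 */
static long kvm_vcpu_ioctl(struct file *filp,
                           unsigned int ioctl, unsigned long arg)
{
        struct kvm_vcpu *vcpu = filp->private_data;
        long r;

        r = vcpu_load(vcpu);                    /* takes vcpu->mutex */
        if (r)
                return r;

        switch (ioctl) {
        case KVM_RUN:
                /* guest_fpu is live in the CPU registers only in here... */
                r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
                break;
        case KVM_GET_FPU: {
                struct kvm_fpu *fpu = kzalloc(sizeof(*fpu), GFP_KERNEL);

                /* ...so vcpu->arch.guest_fpu.state is up to date here. */
                r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
                /* NULL check, copy_to_user() and kfree() omitted */
                break;
        }
        default:
                r = -EINVAL;
        }

        vcpu_put(vcpu);                         /* drops vcpu->mutex */
        return r;
}
```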
On 15/11/2017 15:43, Rik van Riel wrote:
>> Rik, be careful with VM migration. With your patch, I don't think
>> you can load the fpu/xstate context accurately after VM migration.
>
> Can you explain why you believe that?
>
> [...]
>
> What am I missing?

Nothing as far as I can see. :)

Paolo
On 2017-11-15 22:43, Rik van Riel wrote:
> Can you explain why you believe that?

For example, a vcpu thread is running in kvm mode when a critical
condition requires it to stop. QEMU sends an IPI to cause a VM-exit
to happen immediately, and this IPI does not make the vcpu return to
QEMU. IIUC this vcpu thread will still continue to run in kvm mode
when it is woken up on the target machine. With your patch, I don't
see a chance to load the guest FPU or XSTATE until it returns to QEMU
and runs kvm mode again.

Then the FPU or XSTATE status is inconsistent for a small window, and
what's even worse is that the vcpu is running.

Did I misunderstand?

Quan
Alibaba Cloud

> Getting the guest FPU or XSTATE is done under the vcpu->mutex.
>
> This patch switches out guest and userspace FPU/XSTATE under the
> vcpu->mutex, and switches it back before releasing the vcpu->mutex.
>
> By the time a KVM_GET_FPU has obtained the vcpu->mutex, the guest
> FPU state will be in vcpu->arch.guest_fpu.state, where you expect
> it to be.
>
> What am I missing?
On Thu, 2017-11-16 at 10:50 +0800, Quan Xu wrote:
> On 2017-11-15 22:43, Rik van Riel wrote:
> > Can you explain why you believe that?
>
> For example, a vcpu thread is running in kvm mode when a critical
> condition requires it to stop. QEMU sends an IPI to cause a VM-exit
> to happen immediately, and this IPI does not make the vcpu return
> to QEMU. [...]
>
> Did I misunderstand?

At context switch time, the context switch code will save the guest
FPU state to current->thread.fpu when the VCPU thread is scheduled
out.

When the VCPU thread is scheduled back in, the context switch code
will restore current->thread.fpu to the FPU registers.

The VCPU thread will never run with anything else than the guest FPU
state, while inside the KVM_RUN code.
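To make the mechanism concrete, here is a heavily simplified sketch of the x86 context-switch FPU hooks of that era (arch/x86/include/asm/fpu/internal.h; the lazy-restore bookkeeping, feature checks, and exact field names are dropped). __switch_to() calls the first on the outgoing task and the second on the incoming one:

```c
/* Editor's sketch, simplified from the ~v4.14 x86 FPU code. */
static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu)
{
        /*
         * Save the outgoing task's live registers into its task struct.
         * For a VCPU thread preempted inside KVM_RUN, this saves the
         * guest FPU image into current->thread.fpu.
         */
        copy_fpregs_to_fpstate(old_fpu);
}

static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu)
{
        /* Reload the incoming task's saved image into the registers. */
        copy_kernel_to_fpregs(&new_fpu->state);
}
```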
On 2017-11-16 12:21, Rik van Riel wrote:
> At context switch time, the context switch code will save the guest
> FPU state to current->thread.fpu when the VCPU thread is scheduled
> out.
>
> When the VCPU thread is scheduled back in, the context switch code
> will restore current->thread.fpu to the FPU registers.

Good catch!

Also, as your comment says, PKRU is switched out separately at VMENTER
and VMEXIT time, but behind a lot of IF conditions. The pkru may be
restored with the host pkru after VMEXIT. When the vcpu thread is
scheduled out, the pkru value in current->thread.fpu.state may then be
the host pkru value, instead of the guest pkru value (of course, this
_assumes_ that the pkru is in current->thread.fpu.state as well). In
this way, the pkru may be a corner case.

VM migration again: in case

    source_host_pkru_value != guest_pkru_value,
    target_host_pkru_value == guest_pkru_value,

the pkru status would be inconsistent.

Quan
Alibaba Cloud

> The VCPU thread will never run with anything else than the guest FPU
> state, while inside the KVM_RUN code.
2017-11-15 22:40 GMT+08:00 Rik van Riel <riel@redhat.com>:
> On Wed, 2017-11-15 at 12:33 +0800, Wanpeng Li wrote:
>> After the patch:
>>
>> [...]
>>         -> switch_fpu_prepare
>>            -> copy_fpregs_to_fpstate => Oops
>
> When entering kvm_arch_vcpu_ioctl_run we save the qemu userspace
> FPU context in &vcpu->arch.user_fpu, and we restore that before
> leaving kvm_arch_vcpu_ioctl_run.
>
> Userspace should always see the userspace FPU context, no?

You are right.

Regards,
Wanpeng Li
On 16/11/2017 06:06, Quan Xu wrote:
> when the vcpu thread is scheduled out, the pkru value in
> current->thread.fpu.state may then be the host pkru value, instead
> of the guest pkru value (of course, this _assumes_ that the pkru is
> in current->thread.fpu.state as well). In this way, the pkru may be
> a corner case.

Rik may correct me, but I think this is not possible. Preemption is
disabled all the time while PKRU = guest_pkru (which is only during
vmx_vcpu_run).

Context switching will only happen in vcpu_enter_guest() after
preempt_enable() for a preemptible kernel, or in vcpu_run via
cond_resched() for a non-preemptible kernel.

Thanks,

Paolo

> VM migration again: in case
>
>     source_host_pkru_value != guest_pkru_value,
>     target_host_pkru_value == guest_pkru_value,
>
> the pkru status would be inconsistent.
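The preemption-disabled region Paolo refers to is the PKRU swap in vmx_vcpu_run(); abridged from arch/x86/kvm/vmx.c of that era (the VMLAUNCH/VMRESUME asm and everything else elided), it looks roughly like this:

```c
        /* Entry side: load the guest PKRU; preemption is already off. */
        if (static_cpu_has(X86_FEATURE_PKU) &&
            kvm_read_cr4_bits(vcpu, X86_CR4_PKE) &&
            vcpu->arch.pkru != vmx->host_pkru)
                __write_pkru(vcpu->arch.pkru);

        /* ... VMLAUNCH/VMRESUME and the VM exit happen here ... */

        /* Exit side: stash the guest PKRU, switch back to host PKRU. */
        if (static_cpu_has(X86_FEATURE_PKU) &&
            kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) {
                vcpu->arch.pkru = __read_pkru();
                if (vcpu->arch.pkru != vmx->host_pkru)
                        __write_pkru(vmx->host_pkru);
        }
```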
On 2017-11-16 18:21, Paolo Bonzini wrote:
> On 16/11/2017 06:06, Quan Xu wrote:
>> when the vcpu thread is scheduled out, the pkru value in
>> current->thread.fpu.state may then be the host pkru value, instead
>> of the guest pkru value [...]
>
> Rik may correct me, but I think this is not possible. Preemption is
> disabled all the time while PKRU = guest_pkru (which is only during
> vmx_vcpu_run).

Refer to the following code:

vcpu_enter_guest()
{
    ...
    preempt_disable();
    ...
    kvm_x86_ops->run(vcpu);    /* actually vmx_vcpu_run() */
    ...
    preempt_enable();
}

preempt_disable() runs before kvm_x86_ops->run. In kvm_x86_ops->run,
the pkru is restored with host_pkru (IF guest_pkru != host_pkru), and
all of this happens under preempt_disable(). It's not true that
preemption is disabled all the time while PKRU = guest_pkru.

However, there still seems to be a gap. As Rik said, "at context
switch time, the context switch code will save the guest FPU state to
current->thread.fpu when the VCPU thread is scheduled out."

After preempt_enable() in vcpu_enter_guest(), the vcpu thread can be
scheduled out. In kvm_x86_ops->run, the PKRU has already been restored
with host_pkru (IF guest_pkru != host_pkru), instead of guest_pkru.
Then the PKRU is host_pkru; how can guest_pkru be saved to
current->thread.fpu?

As mentioned, all of this _assumes_ that the pkru is in
current->thread.fpu.state as well.

thanks,

Quan
Alibaba Cloud

> Context switching will only happen in vcpu_enter_guest() after
> preempt_enable() for a preemptible kernel, or in vcpu_run via
> cond_resched() for a non-preemptible kernel.
>
> Thanks,
>
> Paolo
On 16/11/2017 13:12, Quan Xu wrote:
> However, there still seems to be a gap.
>
> As Rik said, "at context switch time, the context switch code will
> save the guest FPU state to current->thread.fpu when the VCPU thread
> is scheduled out."

By "guest FPU state" Rik means "guest FPU with host PKRU".

Guest PKRU is always stored in vcpu->arch.pkru rather than in the guest
FPU state, so guest PKRU will never be in current->thread.fpu.state
either.

KVM_GET_XSAVE will merge the guest FPU state with vcpu->arch.pkru, and
migration will work properly.

Thanks,

Paolo

> After preempt_enable() in vcpu_enter_guest(), the vcpu thread can be
> scheduled out. In kvm_x86_ops->run, the PKRU has already been
> restored with host_pkru (IF guest_pkru != host_pkru), instead of
> guest_pkru.
>
> Then the PKRU is host_pkru; how can guest_pkru be saved to
> current->thread.fpu?
>
> As mentioned, all of this _assumes_ that the pkru is in
> current->thread.fpu.state as well.
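The merge Paolo mentions happens in the KVM_GET_XSAVE path; abridged from fill_xsave() in arch/x86/kvm/x86.c of that era, the per-feature copy loop special-cases PKRU so the exported image carries vcpu->arch.pkru rather than whatever sits in the guest_fpu buffer:

```c
        /* Abridged: copy each enabled xstate feature into the export
         * buffer at its non-compacted offset. */
        valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
        while (valid) {
                u64 feature = valid & -valid;
                int index = fls64(feature) - 1;
                void *src = get_xsave_addr(xsave, feature);

                if (src) {
                        u32 size, offset, ecx, edx;

                        cpuid_count(XSTATE_CPUID, index,
                                    &size, &offset, &ecx, &edx);
                        if (feature == XFEATURE_MASK_PKRU)
                                /* PKRU comes from vcpu->arch.pkru... */
                                memcpy(dest + offset, &vcpu->arch.pkru,
                                       sizeof(vcpu->arch.pkru));
                        else
                                /* ...everything else from guest_fpu. */
                                memcpy(dest + offset, src, size);
                }

                valid -= feature;
        }
```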
On 2017-11-16 20:18, Paolo Bonzini wrote:
> On 16/11/2017 13:12, Quan Xu wrote:
>> As Rik said, "at context switch time, the context switch code will
>> save the guest FPU state to current->thread.fpu when the VCPU
>> thread is scheduled out."
>
> By "guest FPU state" Rik means "guest FPU with host PKRU".

:( actually it is host_pkru, just with different names..

> Guest PKRU is always stored in vcpu->arch.pkru rather than in the
> guest FPU state, so guest PKRU will never be in
> current->thread.fpu.state either.
>
> KVM_GET_XSAVE will merge the guest FPU state with vcpu->arch.pkru,
> and migration will work properly.

Agreed, I stand corrected. That's why I was concerned: there are so
many ways PKRU gets written with host_pkru/guest_pkru. After
migration, KVM_GET|SET_XSAVE restores the right pkru.

But we introduce another method:

 -- When the VCPU thread is scheduled back in, the context switch
    code will restore current->thread.fpu to the FPU registers.

There is still a window in which current->thread.fpu is restored to
the FPU registers before entering guest mode and preempt_disable().
On the target machine, after migration, the pkru value in
current->thread.fpu is source_host_pkru. In case

    source_host_pkru_value != guest_pkru_value,
    target_host_pkru_value == guest_pkru_value,

source_host_pkru_value may be restored to PKRU, making the pkru
status inconsistent.

thanks

Quan
Alibaba Cloud
On 16/11/2017 14:35, Quan Xu wrote:
> but we introduce another method:
>
>  -- When the VCPU thread is scheduled back in, the context switch
>     code will restore current->thread.fpu to the FPU registers.
>
> There is still a window in which current->thread.fpu is restored to
> the FPU registers before entering guest mode and preempt_disable().

That will always use the host PKRU. The guest PKRU is _never_ visible
to the context switch code, because it's only ever used in a section
that runs with preemption disabled.

It's actually much simpler than before.

Paolo

> On the target machine, after migration, the pkru value in
> current->thread.fpu is source_host_pkru. In case
>
>     source_host_pkru_value != guest_pkru_value,
>     target_host_pkru_value == guest_pkru_value,
>
> source_host_pkru_value may be restored to PKRU, making the pkru
> status inconsistent.
On 16/11/2017 15:28, Quan Xu wrote:
>     vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
>
> +   kvm_load_guest_fpu(vcpu);
> +
>     for (;;) {
>         if (kvm_vcpu_running(vcpu)) {
>             r = vcpu_enter_guest(vcpu);
>
> As Rik dropped kvm_load_guest_fpu(vcpu) in vcpu_enter_guest(): in
> case we are still in kvm mode, how do we make sure the pkru is
> always the right one before entering guest mode, and not preempted
> before preempt_disable(), after migration? :(

As you know:

1) kvm_load_guest_fpu doesn't load the guest PKRU, it keeps the
   userspace PKRU.

2) the guest PKRU is only ever set in a preemption-disabled area

Thus, context switch always sees the userspace PKRU. The guest PKRU
is only set the next time vmx_vcpu_run executes.

Paolo
On 2017-11-17 01:50, Paolo Bonzini wrote:
> As you know:
>
> 1) kvm_load_guest_fpu doesn't load the guest PKRU, it keeps the
>    userspace PKRU.
>
> 2) the guest PKRU is only ever set in a preemption-disabled area
>
> Thus, context switch always sees the userspace PKRU. The guest PKRU
> is only set the next time vmx_vcpu_run executes.

Paolo, thanks for your explanation!! :-)

Rik, could you cc me on v2?

Quan
```diff
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9d7d856b2d89..ffe54958491f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -536,7 +536,20 @@ struct kvm_vcpu_arch {
 	struct kvm_mmu_memory_cache mmu_page_cache;
 	struct kvm_mmu_memory_cache mmu_page_header_cache;
 
+	/*
+	 * QEMU userspace and the guest each have their own FPU state.
+	 * In vcpu_run, we switch between the user and guest FPU contexts.
+	 * While running a VCPU, the VCPU thread will have the guest FPU
+	 * context.
+	 *
+	 * Note that while the PKRU state lives inside the fpu registers,
+	 * it is switched out separately at VMENTER and VMEXIT time. The
+	 * "guest_fpu" state here contains the guest FPU context, with the
+	 * host PKRU bits.
+	 */
+	struct fpu user_fpu;
 	struct fpu guest_fpu;
+
 	u64 xcr0;
 	u64 guest_supported_xcr0;
 	u32 guest_xstate_size;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 03869eb7fcd6..aad5181ed4e9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2917,7 +2917,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
 	pagefault_enable();
 	kvm_x86_ops->vcpu_put(vcpu);
-	kvm_put_guest_fpu(vcpu);
 	vcpu->arch.last_host_tsc = rdtsc();
 }
 
@@ -5228,13 +5227,10 @@ static void emulator_halt(struct x86_emulate_ctxt *ctxt)
 
 static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt)
 {
-	preempt_disable();
-	kvm_load_guest_fpu(emul_to_vcpu(ctxt));
 }
 
 static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt)
 {
-	preempt_enable();
 }
 
 static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
@@ -6908,7 +6904,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	preempt_disable();
 
 	kvm_x86_ops->prepare_guest_switch(vcpu);
-	kvm_load_guest_fpu(vcpu);
 
 	/*
 	 * Disable IRQs before setting IN_GUEST_MODE.  Posted interrupt
@@ -7255,12 +7250,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		}
 	}
 
+	kvm_load_guest_fpu(vcpu);
+
 	if (unlikely(vcpu->arch.complete_userspace_io)) {
 		int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
 		vcpu->arch.complete_userspace_io = NULL;
 		r = cui(vcpu);
 		if (r <= 0)
-			goto out;
+			goto out_fpu;
 	} else
 		WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
 
@@ -7269,6 +7266,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	else
 		r = vcpu_run(vcpu);
 
+out_fpu:
+	kvm_put_guest_fpu(vcpu);
 out:
 	post_kvm_run_save(vcpu);
 	if (vcpu->sigset_active)
@@ -7663,32 +7662,25 @@ static void fx_init(struct kvm_vcpu *vcpu)
 	vcpu->arch.cr0 |= X86_CR0_ET;
 }
 
+/* Swap (qemu) user FPU context for the guest FPU context. */
 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 {
-	if (vcpu->guest_fpu_loaded)
-		return;
-
-	/*
-	 * Restore all possible states in the guest,
-	 * and assume host would use all available bits.
-	 * Guest xcr0 would be loaded later.
-	 */
-	vcpu->guest_fpu_loaded = 1;
-	__kernel_fpu_begin();
+	preempt_disable();
+	copy_fpregs_to_fpstate(&vcpu->arch.user_fpu);
 	/* PKRU is separately restored in kvm_x86_ops->run. */
 	__copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state,
 				~XFEATURE_MASK_PKRU);
+	preempt_enable();
 	trace_kvm_fpu(1);
 }
 
+/* When vcpu_run ends, restore user space FPU context. */
 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 {
-	if (!vcpu->guest_fpu_loaded)
-		return;
-
-	vcpu->guest_fpu_loaded = 0;
+	preempt_disable();
 	copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu);
-	__kernel_fpu_end();
+	copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state);
+	preempt_enable();
 	++vcpu->stat.fpu_reload;
 	trace_kvm_fpu(0);
 }
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 6882538eda32..354608487b8d 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -232,7 +232,7 @@ struct kvm_vcpu {
 	struct mutex mutex;
 	struct kvm_run *run;
 
-	int guest_fpu_loaded, guest_xcr0_loaded;
+	int guest_xcr0_loaded;
 	struct swait_queue_head wq;
 	struct pid __rcu *pid;
 	int sigset_active;
```