
[3/3] KVM/arm64: enable enhanced armv8 fp/simd lazy switch

Message ID 1446242193-8424-4-git-send-email-m.smarduch@samsung.com (mailing list archive)
State New, archived

Commit Message

Mario Smarduch Oct. 30, 2015, 9:56 p.m. UTC
This patch enables arm64 lazy fp/simd switch, similar to arm described in
second patch. Change from previous version - restore function is moved to
host. 

Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
---
 arch/arm64/include/asm/kvm_host.h |  2 +-
 arch/arm64/kernel/asm-offsets.c   |  1 +
 arch/arm64/kvm/hyp.S              | 37 +++++++++++++++++++++++++++++++------
 3 files changed, 33 insertions(+), 7 deletions(-)

Comments

Christoffer Dall Nov. 5, 2015, 3:02 p.m. UTC | #1
On Fri, Oct 30, 2015 at 02:56:33PM -0700, Mario Smarduch wrote:
> This patch enables arm64 lazy fp/simd switch, similar to arm described in
> second patch. Change from previous version - restore function is moved to
> host. 
> 
> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
> ---
>  arch/arm64/include/asm/kvm_host.h |  2 +-
>  arch/arm64/kernel/asm-offsets.c   |  1 +
>  arch/arm64/kvm/hyp.S              | 37 +++++++++++++++++++++++++++++++------
>  3 files changed, 33 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 26a2347..dcecf92 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -251,11 +251,11 @@ static inline void kvm_arch_hardware_unsetup(void) {}
>  static inline void kvm_arch_sync_events(struct kvm *kvm) {}
>  static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
>  static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
> -static inline void kvm_restore_host_vfp_state(struct kvm_vcpu *vcpu) {}
>  
>  void kvm_arm_init_debug(void);
>  void kvm_arm_setup_debug(struct kvm_vcpu *vcpu);
>  void kvm_arm_clear_debug(struct kvm_vcpu *vcpu);
>  void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu);
> +void kvm_restore_host_vfp_state(struct kvm_vcpu *vcpu);
>  
>  #endif /* __ARM64_KVM_HOST_H__ */
> diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
> index 8d89cf8..c9c5242 100644
> --- a/arch/arm64/kernel/asm-offsets.c
> +++ b/arch/arm64/kernel/asm-offsets.c
> @@ -124,6 +124,7 @@ int main(void)
>    DEFINE(VCPU_HCR_EL2,		offsetof(struct kvm_vcpu, arch.hcr_el2));
>    DEFINE(VCPU_MDCR_EL2,	offsetof(struct kvm_vcpu, arch.mdcr_el2));
>    DEFINE(VCPU_IRQ_LINES,	offsetof(struct kvm_vcpu, arch.irq_lines));
> +  DEFINE(VCPU_VFP_DIRTY,	offsetof(struct kvm_vcpu, arch.vfp_dirty));
>    DEFINE(VCPU_HOST_CONTEXT,	offsetof(struct kvm_vcpu, arch.host_cpu_context));
>    DEFINE(VCPU_HOST_DEBUG_STATE, offsetof(struct kvm_vcpu, arch.host_debug_state));
>    DEFINE(VCPU_TIMER_CNTV_CTL,	offsetof(struct kvm_vcpu, arch.timer_cpu.cntv_ctl));
> diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
> index e583613..ed2c4cf 100644
> --- a/arch/arm64/kvm/hyp.S
> +++ b/arch/arm64/kvm/hyp.S
> @@ -36,6 +36,28 @@
>  #define CPU_SYSREG_OFFSET(x)	(CPU_SYSREGS + 8*x)
>  
>  	.text
> +
> +/**
> + * void kvm_restore_host_vfp_state(struct vcpu *vcpu) - Executes lazy
> + *	fp/simd switch, saves the guest, restores host. Called from host
> + *	mode, placed outside of hyp section.

same comments on style as previous patch

> + */
> +ENTRY(kvm_restore_host_vfp_state)
> +	push    xzr, lr
> +
> +	add     x2, x0, #VCPU_CONTEXT
> +	mov     w3, #0
> +	strb    w3, [x0, #VCPU_VFP_DIRTY]

I've been discussing with myself if it would make more sense to clear
the dirty flag in the C-code...

> +
> +	bl __save_fpsimd
> +
> +	ldr     x2, [x0, #VCPU_HOST_CONTEXT]
> +	bl __restore_fpsimd
> +
> +	pop     xzr, lr
> +	ret
> +ENDPROC(kvm_restore_host_vfp_state)
> +
>  	.pushsection	.hyp.text, "ax"
>  	.align	PAGE_SHIFT
>  
> @@ -482,7 +504,11 @@
>  99:
>  	msr     hcr_el2, x2
>  	mov	x2, #CPTR_EL2_TTA
> +
> +	ldrb	w3, [x0, #VCPU_VFP_DIRTY]
> +	tbnz    w3, #0, 98f
>  	orr     x2, x2, #CPTR_EL2_TFP
> +98:

mmm, don't you need to only set the fpexc32 when you're actually going
to trap the guest accesses?

also, you can consider only setting this in vcpu_load (jumping quickly
to EL2 to do so) if we're running a 32-bit guest.  Probably worth
measuring the difference between the extra EL2 jump on vcpu_load
compared to hitting this register on every entry/exit.

Code-wise, it will be nicer to do it on vcpu_load.

>  	msr	cptr_el2, x2
>  
>  	mov	x2, #(1 << 15)	// Trap CP15 Cr=15
> @@ -669,14 +695,12 @@ __restore_debug:
>  	ret
>  
>  __save_fpsimd:
> -	skip_fpsimd_state x3, 1f
>  	save_fpsimd
> -1:	ret
> +	ret
>  
>  __restore_fpsimd:
> -	skip_fpsimd_state x3, 1f
>  	restore_fpsimd
> -1:	ret
> +	ret
>  
>  switch_to_guest_fpsimd:
>  	push	x4, lr
> @@ -688,6 +712,9 @@ switch_to_guest_fpsimd:
>  
>  	mrs	x0, tpidr_el2
>  
> +	mov     w2, #1
> +	strb    w2, [x0, #VCPU_VFP_DIRTY]

hmm, just noticing this.  Are you not writing a 32-bit value to a
potentially 8-bit field (ignoring padding in the struct), as the dirty
flag is declared a bool.

Are you also doing this on the 32-bit side?

> +
>  	ldr	x2, [x0, #VCPU_HOST_CONTEXT]
>  	kern_hyp_va x2
>  	bl __save_fpsimd
> @@ -763,7 +790,6 @@ __kvm_vcpu_return:
>  	add	x2, x0, #VCPU_CONTEXT
>  
>  	save_guest_regs
> -	bl __save_fpsimd
>  	bl __save_sysregs
>  
>  	skip_debug_state x3, 1f
> @@ -784,7 +810,6 @@ __kvm_vcpu_return:
>  	kern_hyp_va x2
>  
>  	bl __restore_sysregs
> -	bl __restore_fpsimd
>  	/* Clear FPSIMD and Trace trapping */
>  	msr     cptr_el2, xzr
>  
> -- 
> 1.9.1
> 

Thanks,
-Christoffer
Mario Smarduch Nov. 6, 2015, 12:57 a.m. UTC | #2
On 11/5/2015 7:02 AM, Christoffer Dall wrote:
> On Fri, Oct 30, 2015 at 02:56:33PM -0700, Mario Smarduch wrote:
>> This patch enables arm64 lazy fp/simd switch, similar to arm described in
>> second patch. Change from previous version - restore function is moved to
>> host. 
>>
>> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
>> ---
>>  arch/arm64/include/asm/kvm_host.h |  2 +-
>>  arch/arm64/kernel/asm-offsets.c   |  1 +
>>  arch/arm64/kvm/hyp.S              | 37 +++++++++++++++++++++++++++++++------
>>  3 files changed, 33 insertions(+), 7 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
>> index 26a2347..dcecf92 100644
>> --- a/arch/arm64/include/asm/kvm_host.h
>> +++ b/arch/arm64/include/asm/kvm_host.h
>> @@ -251,11 +251,11 @@ static inline void kvm_arch_hardware_unsetup(void) {}
>>  static inline void kvm_arch_sync_events(struct kvm *kvm) {}
>>  static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
>>  static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
>> -static inline void kvm_restore_host_vfp_state(struct kvm_vcpu *vcpu) {}
>>  
>>  void kvm_arm_init_debug(void);
>>  void kvm_arm_setup_debug(struct kvm_vcpu *vcpu);
>>  void kvm_arm_clear_debug(struct kvm_vcpu *vcpu);
>>  void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu);
>> +void kvm_restore_host_vfp_state(struct kvm_vcpu *vcpu);
>>  
>>  #endif /* __ARM64_KVM_HOST_H__ */
>> diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
>> index 8d89cf8..c9c5242 100644
>> --- a/arch/arm64/kernel/asm-offsets.c
>> +++ b/arch/arm64/kernel/asm-offsets.c
>> @@ -124,6 +124,7 @@ int main(void)
>>    DEFINE(VCPU_HCR_EL2,		offsetof(struct kvm_vcpu, arch.hcr_el2));
>>    DEFINE(VCPU_MDCR_EL2,	offsetof(struct kvm_vcpu, arch.mdcr_el2));
>>    DEFINE(VCPU_IRQ_LINES,	offsetof(struct kvm_vcpu, arch.irq_lines));
>> +  DEFINE(VCPU_VFP_DIRTY,	offsetof(struct kvm_vcpu, arch.vfp_dirty));
>>    DEFINE(VCPU_HOST_CONTEXT,	offsetof(struct kvm_vcpu, arch.host_cpu_context));
>>    DEFINE(VCPU_HOST_DEBUG_STATE, offsetof(struct kvm_vcpu, arch.host_debug_state));
>>    DEFINE(VCPU_TIMER_CNTV_CTL,	offsetof(struct kvm_vcpu, arch.timer_cpu.cntv_ctl));
>> diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
>> index e583613..ed2c4cf 100644
>> --- a/arch/arm64/kvm/hyp.S
>> +++ b/arch/arm64/kvm/hyp.S
>> @@ -36,6 +36,28 @@
>>  #define CPU_SYSREG_OFFSET(x)	(CPU_SYSREGS + 8*x)
>>  
>>  	.text
>> +
>> +/**
>> + * void kvm_restore_host_vfp_state(struct vcpu *vcpu) - Executes lazy
>> + *	fp/simd switch, saves the guest, restores host. Called from host
>> + *	mode, placed outside of hyp section.
> 
> same comments on style as previous patch
Got it.
> 
>> + */
>> +ENTRY(kvm_restore_host_vfp_state)
>> +	push    xzr, lr
>> +
>> +	add     x2, x0, #VCPU_CONTEXT
>> +	mov     w3, #0
>> +	strb    w3, [x0, #VCPU_VFP_DIRTY]
> 
> I've been discussing with myself if it would make more sense to clear
> the dirty flag in the C-code...
Since all the work is done here I placed it here.
> 
>> +
>> +	bl __save_fpsimd
>> +
>> +	ldr     x2, [x0, #VCPU_HOST_CONTEXT]
>> +	bl __restore_fpsimd
>> +
>> +	pop     xzr, lr
>> +	ret
>> +ENDPROC(kvm_restore_host_vfp_state)
>> +
>>  	.pushsection	.hyp.text, "ax"
>>  	.align	PAGE_SHIFT
>>  
>> @@ -482,7 +504,11 @@
>>  99:
>>  	msr     hcr_el2, x2
>>  	mov	x2, #CPTR_EL2_TTA
>> +
>> +	ldrb	w3, [x0, #VCPU_VFP_DIRTY]
>> +	tbnz    w3, #0, 98f
>>  	orr     x2, x2, #CPTR_EL2_TFP
>> +98:
> 
> mmm, don't you need to only set the fpexc32 when you're actually going
> to trap the guest accesses?

My understanding is you always need to set the enable bit in fpexc32 for 32-bit
guests, otherwise EL1 would get the trap instead of EL2. Not sure if that's the
point you're making.

> 
> also, you can consider only setting this in vcpu_load (jumping quickly
> to EL2 to do so) if we're running a 32-bit guest.  Probably worth
> measuring the difference between the extra EL2 jump on vcpu_load
> compared to hitting this register on every entry/exit.

Sure, makes sense since this is a hot code path.
> 
> Code-wise, it will be nicer to do it on vcpu_load.
> 
>>  	msr	cptr_el2, x2
>>  
>>  	mov	x2, #(1 << 15)	// Trap CP15 Cr=15
>> @@ -669,14 +695,12 @@ __restore_debug:
>>  	ret
>>  
>>  __save_fpsimd:
>> -	skip_fpsimd_state x3, 1f
>>  	save_fpsimd
>> -1:	ret
>> +	ret
>>  
>>  __restore_fpsimd:
>> -	skip_fpsimd_state x3, 1f
>>  	restore_fpsimd
>> -1:	ret
>> +	ret
>>  
>>  switch_to_guest_fpsimd:
>>  	push	x4, lr
>> @@ -688,6 +712,9 @@ switch_to_guest_fpsimd:
>>  
>>  	mrs	x0, tpidr_el2
>>  
>> +	mov     w2, #1
>> +	strb    w2, [x0, #VCPU_VFP_DIRTY]
> 
> hmm, just noticing this.  Are you not writing a 32-bit value to a
> potentially 8-bit field (ignoring padding in the struct), as the dirty
> flag is declared a bool.

From the manuals, byte stores take a word-size register on arm64/arm32.
The interconnect probably drops the remaining bytes.
Also double-checked that the compiler uses the same instructions.


> 
> Are you also doing this on the 32-bit side?
strb is used on both v7/v8
> 
>> +
>>  	ldr	x2, [x0, #VCPU_HOST_CONTEXT]
>>  	kern_hyp_va x2
>>  	bl __save_fpsimd
>> @@ -763,7 +790,6 @@ __kvm_vcpu_return:
>>  	add	x2, x0, #VCPU_CONTEXT
>>  
>>  	save_guest_regs
>> -	bl __save_fpsimd
>>  	bl __save_sysregs
>>  
>>  	skip_debug_state x3, 1f
>> @@ -784,7 +810,6 @@ __kvm_vcpu_return:
>>  	kern_hyp_va x2
>>  
>>  	bl __restore_sysregs
>> -	bl __restore_fpsimd
>>  	/* Clear FPSIMD and Trace trapping */
>>  	msr     cptr_el2, xzr
>>  
>> -- 
>> 1.9.1
>>
> 
> Thanks,
> -Christoffer
>
Christoffer Dall Nov. 6, 2015, 11:29 a.m. UTC | #3
On Thu, Nov 05, 2015 at 04:57:12PM -0800, Mario Smarduch wrote:
> 
> 
> On 11/5/2015 7:02 AM, Christoffer Dall wrote:
> > On Fri, Oct 30, 2015 at 02:56:33PM -0700, Mario Smarduch wrote:
> >> This patch enables arm64 lazy fp/simd switch, similar to arm described in
> >> second patch. Change from previous version - restore function is moved to
> >> host. 
> >>
> >> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
> >> ---
> >>  arch/arm64/include/asm/kvm_host.h |  2 +-
> >>  arch/arm64/kernel/asm-offsets.c   |  1 +
> >>  arch/arm64/kvm/hyp.S              | 37 +++++++++++++++++++++++++++++++------
> >>  3 files changed, 33 insertions(+), 7 deletions(-)
> >>
> >> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> >> index 26a2347..dcecf92 100644
> >> --- a/arch/arm64/include/asm/kvm_host.h
> >> +++ b/arch/arm64/include/asm/kvm_host.h
> >> @@ -251,11 +251,11 @@ static inline void kvm_arch_hardware_unsetup(void) {}
> >>  static inline void kvm_arch_sync_events(struct kvm *kvm) {}
> >>  static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
> >>  static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
> >> -static inline void kvm_restore_host_vfp_state(struct kvm_vcpu *vcpu) {}
> >>  
> >>  void kvm_arm_init_debug(void);
> >>  void kvm_arm_setup_debug(struct kvm_vcpu *vcpu);
> >>  void kvm_arm_clear_debug(struct kvm_vcpu *vcpu);
> >>  void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu);
> >> +void kvm_restore_host_vfp_state(struct kvm_vcpu *vcpu);
> >>  
> >>  #endif /* __ARM64_KVM_HOST_H__ */
> >> diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
> >> index 8d89cf8..c9c5242 100644
> >> --- a/arch/arm64/kernel/asm-offsets.c
> >> +++ b/arch/arm64/kernel/asm-offsets.c
> >> @@ -124,6 +124,7 @@ int main(void)
> >>    DEFINE(VCPU_HCR_EL2,		offsetof(struct kvm_vcpu, arch.hcr_el2));
> >>    DEFINE(VCPU_MDCR_EL2,	offsetof(struct kvm_vcpu, arch.mdcr_el2));
> >>    DEFINE(VCPU_IRQ_LINES,	offsetof(struct kvm_vcpu, arch.irq_lines));
> >> +  DEFINE(VCPU_VFP_DIRTY,	offsetof(struct kvm_vcpu, arch.vfp_dirty));
> >>    DEFINE(VCPU_HOST_CONTEXT,	offsetof(struct kvm_vcpu, arch.host_cpu_context));
> >>    DEFINE(VCPU_HOST_DEBUG_STATE, offsetof(struct kvm_vcpu, arch.host_debug_state));
> >>    DEFINE(VCPU_TIMER_CNTV_CTL,	offsetof(struct kvm_vcpu, arch.timer_cpu.cntv_ctl));
> >> diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
> >> index e583613..ed2c4cf 100644
> >> --- a/arch/arm64/kvm/hyp.S
> >> +++ b/arch/arm64/kvm/hyp.S
> >> @@ -36,6 +36,28 @@
> >>  #define CPU_SYSREG_OFFSET(x)	(CPU_SYSREGS + 8*x)
> >>  
> >>  	.text
> >> +
> >> +/**
> >> + * void kvm_restore_host_vfp_state(struct vcpu *vcpu) - Executes lazy
> >> + *	fp/simd switch, saves the guest, restores host. Called from host
> >> + *	mode, placed outside of hyp section.
> > 
> > same comments on style as previous patch
> Got it.
> > 
> >> + */
> >> +ENTRY(kvm_restore_host_vfp_state)
> >> +	push    xzr, lr
> >> +
> >> +	add     x2, x0, #VCPU_CONTEXT
> >> +	mov     w3, #0
> >> +	strb    w3, [x0, #VCPU_VFP_DIRTY]
> > 
> > I've been discussing with myself if it would make more sense to clear
> > the dirty flag in the C-code...
> Since all the work is done here I placed it here.

yeah, that's what I thought first, but then I thought it's typically
easier to understand the logic in the C-code, and this is technically a
side effect given the name of the function "kvm_restore_host_vfp_state",
which should then be "kvm_restore_host_vfp_state_and_clear_dirty_flag"
:)
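
For illustration, clearing the flag from the caller could look something
like this (a sketch only; the call site, presumably kvm_arch_vcpu_put(),
is not part of this patch):

	/* Sketch: clear the dirty flag in the C caller so that
	 * kvm_restore_host_vfp_state() has no hidden side effect.
	 */
	if (vcpu->arch.vfp_dirty) {
		kvm_restore_host_vfp_state(vcpu);
		vcpu->arch.vfp_dirty = false;
	}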

> > 
> >> +
> >> +	bl __save_fpsimd
> >> +
> >> +	ldr     x2, [x0, #VCPU_HOST_CONTEXT]
> >> +	bl __restore_fpsimd
> >> +
> >> +	pop     xzr, lr
> >> +	ret
> >> +ENDPROC(kvm_restore_host_vfp_state)
> >> +
> >>  	.pushsection	.hyp.text, "ax"
> >>  	.align	PAGE_SHIFT
> >>  
> >> @@ -482,7 +504,11 @@
> >>  99:
> >>  	msr     hcr_el2, x2
> >>  	mov	x2, #CPTR_EL2_TTA
> >> +
> >> +	ldrb	w3, [x0, #VCPU_VFP_DIRTY]
> >> +	tbnz    w3, #0, 98f
> >>  	orr     x2, x2, #CPTR_EL2_TFP
> >> +98:
> > 
> > mmm, don't you need to only set the fpexc32 when you're actually going
> > to trap the guest accesses?
> 
> My understanding is you always need to set enable in fpexec32 for 32 bit guests,
> otherwise EL1 would get the trap instead of EL2. Not sure if that's the point
> you're making.
> 

If you're expecting to trap all accesses by setting CPTR_EL2_TFP and
you're running a 32-bit guest, you must also set the enable bit in fpexc32,
because otherwise the trap is not taken to EL2, but to EL1 instead.

Before these patches, we *always* expected to trap VFP accesses after
entering the guest, but since that has now changed, you should only
fiddle with fpexc32 if you are indeed trapping (first entry after
vcpu_load) and if it's a 32-bit VM.

Does that help?
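
To make that concrete, the vcpu_load side could look roughly like the
sketch below (__kvm_enable_fpexc32 is a made-up hyp helper name, not
something this series defines):

	/* Sketch: on vcpu_load, if the guest's EL1 is AArch32 (HCR_EL2.RW
	 * clear), jump to EL2 once to set the FPEXC32_EL2 enable bit so a
	 * later CPTR_EL2.TFP trap is taken to EL2 rather than to EL1.
	 */
	if (!(vcpu->arch.hcr_el2 & HCR_RW))
		kvm_call_hyp(__kvm_enable_fpexc32);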

> > 
> > also, you can consider only setting this in vcpu_load (jumping quickly
> > to EL2 to do so) if we're running a 32-bit guest.  Probably worth
> > measuring the difference between the extra EL2 jump on vcpu_load
> > compared to hitting this register on every entry/exit.
> 
> Sure, makes sense since this is a hot code path.
> > 
> > Code-wise, it will be nicer to do it on vcpu_load.
> > 
> >>  	msr	cptr_el2, x2
> >>  
> >>  	mov	x2, #(1 << 15)	// Trap CP15 Cr=15
> >> @@ -669,14 +695,12 @@ __restore_debug:
> >>  	ret
> >>  
> >>  __save_fpsimd:
> >> -	skip_fpsimd_state x3, 1f
> >>  	save_fpsimd
> >> -1:	ret
> >> +	ret
> >>  
> >>  __restore_fpsimd:
> >> -	skip_fpsimd_state x3, 1f
> >>  	restore_fpsimd
> >> -1:	ret
> >> +	ret
> >>  
> >>  switch_to_guest_fpsimd:
> >>  	push	x4, lr
> >> @@ -688,6 +712,9 @@ switch_to_guest_fpsimd:
> >>  
> >>  	mrs	x0, tpidr_el2
> >>  
> >> +	mov     w2, #1
> >> +	strb    w2, [x0, #VCPU_VFP_DIRTY]
> > 
> > hmm, just noticing this.  Are you not writing a 32-bit value to a
> > potentially 8-bit field (ignoring padding in the struct), as the dirty
> > flag is declared a bool.
> 
> From the manuals byte stores require a word size registers on arm64/arm32.
> The interconnect probably drops the remaining bytes.
> Also double checked and the compiler uses same instructions.
> 

duh, I didn't see the 'b' in strb - I guess I was too tired to review
patches.

-Christoffer
Mario Smarduch Nov. 6, 2015, 4:10 p.m. UTC | #4
On 11/6/2015 3:29 AM, Christoffer Dall wrote:
> On Thu, Nov 05, 2015 at 04:57:12PM -0800, Mario Smarduch wrote:
>>
>>
>> On 11/5/2015 7:02 AM, Christoffer Dall wrote:
>>> On Fri, Oct 30, 2015 at 02:56:33PM -0700, Mario Smarduch wrote:
>>>> This patch enables arm64 lazy fp/simd switch, similar to arm described in
>>>> second patch. Change from previous version - restore function is moved to
>>>> host. 
>>>>
>>>> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
>>>> ---
>>>>  arch/arm64/include/asm/kvm_host.h |  2 +-
>>>>  arch/arm64/kernel/asm-offsets.c   |  1 +
>>>>  arch/arm64/kvm/hyp.S              | 37 +++++++++++++++++++++++++++++++------
>>>>  3 files changed, 33 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
>>>> index 26a2347..dcecf92 100644
>>>> --- a/arch/arm64/include/asm/kvm_host.h
>>>> +++ b/arch/arm64/include/asm/kvm_host.h
>>>> @@ -251,11 +251,11 @@ static inline void kvm_arch_hardware_unsetup(void) {}
>>>>  static inline void kvm_arch_sync_events(struct kvm *kvm) {}
>>>>  static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
>>>>  static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
>>>> -static inline void kvm_restore_host_vfp_state(struct kvm_vcpu *vcpu) {}
>>>>  
>>>>  void kvm_arm_init_debug(void);
>>>>  void kvm_arm_setup_debug(struct kvm_vcpu *vcpu);
>>>>  void kvm_arm_clear_debug(struct kvm_vcpu *vcpu);
>>>>  void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu);
>>>> +void kvm_restore_host_vfp_state(struct kvm_vcpu *vcpu);
>>>>  
>>>>  #endif /* __ARM64_KVM_HOST_H__ */
>>>> diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
>>>> index 8d89cf8..c9c5242 100644
>>>> --- a/arch/arm64/kernel/asm-offsets.c
>>>> +++ b/arch/arm64/kernel/asm-offsets.c
>>>> @@ -124,6 +124,7 @@ int main(void)
>>>>    DEFINE(VCPU_HCR_EL2,		offsetof(struct kvm_vcpu, arch.hcr_el2));
>>>>    DEFINE(VCPU_MDCR_EL2,	offsetof(struct kvm_vcpu, arch.mdcr_el2));
>>>>    DEFINE(VCPU_IRQ_LINES,	offsetof(struct kvm_vcpu, arch.irq_lines));
>>>> +  DEFINE(VCPU_VFP_DIRTY,	offsetof(struct kvm_vcpu, arch.vfp_dirty));
>>>>    DEFINE(VCPU_HOST_CONTEXT,	offsetof(struct kvm_vcpu, arch.host_cpu_context));
>>>>    DEFINE(VCPU_HOST_DEBUG_STATE, offsetof(struct kvm_vcpu, arch.host_debug_state));
>>>>    DEFINE(VCPU_TIMER_CNTV_CTL,	offsetof(struct kvm_vcpu, arch.timer_cpu.cntv_ctl));
>>>> diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
>>>> index e583613..ed2c4cf 100644
>>>> --- a/arch/arm64/kvm/hyp.S
>>>> +++ b/arch/arm64/kvm/hyp.S
>>>> @@ -36,6 +36,28 @@
>>>>  #define CPU_SYSREG_OFFSET(x)	(CPU_SYSREGS + 8*x)
>>>>  
>>>>  	.text
>>>> +
>>>> +/**
>>>> + * void kvm_restore_host_vfp_state(struct vcpu *vcpu) - Executes lazy
>>>> + *	fp/simd switch, saves the guest, restores host. Called from host
>>>> + *	mode, placed outside of hyp section.
>>>
>>> same comments on style as previous patch
>> Got it.
>>>
>>>> + */
>>>> +ENTRY(kvm_restore_host_vfp_state)
>>>> +	push    xzr, lr
>>>> +
>>>> +	add     x2, x0, #VCPU_CONTEXT
>>>> +	mov     w3, #0
>>>> +	strb    w3, [x0, #VCPU_VFP_DIRTY]
>>>
>>> I've been discussing with myself if it would make more sense to clear
>>> the dirty flag in the C-code...
>> Since all the work is done here I placed it here.
> 
> yeah, that's what I thought first, but then I thought it's typically
> easier to understand the logic in the C-code and this is technically a
> side-effect from the name of the function "kvm_restore_host_vfp_state"
> which should then be "kvm_restore_host_vfp_state_and_clear_dirty_flag"
> :)
> 

Ok, I'll set it in C.
>>>
>>>> +
>>>> +	bl __save_fpsimd
>>>> +
>>>> +	ldr     x2, [x0, #VCPU_HOST_CONTEXT]
>>>> +	bl __restore_fpsimd
>>>> +
>>>> +	pop     xzr, lr
>>>> +	ret
>>>> +ENDPROC(kvm_restore_host_vfp_state)
>>>> +
>>>>  	.pushsection	.hyp.text, "ax"
>>>>  	.align	PAGE_SHIFT
>>>>  
>>>> @@ -482,7 +504,11 @@
>>>>  99:
>>>>  	msr     hcr_el2, x2
>>>>  	mov	x2, #CPTR_EL2_TTA
>>>> +
>>>> +	ldrb	w3, [x0, #VCPU_VFP_DIRTY]
>>>> +	tbnz    w3, #0, 98f
>>>>  	orr     x2, x2, #CPTR_EL2_TFP
>>>> +98:
>>>
>>> mmm, don't you need to only set the fpexc32 when you're actually going
>>> to trap the guest accesses?
>>
>> My understanding is you always need to set enable in fpexec32 for 32 bit guests,
>> otherwise EL1 would get the trap instead of EL2. Not sure if that's the point
>> you're making.
>>
> 
> If you're expecting to trap all accesses by setting CPTR_EL2_TFP and
> you're running a 32-bit guest, you must also enable in fpexec32, because
> otherwise the trap is not taken to EL2, but to EL1 instead.
> 
> Before these patches, we *always* expected to trap VFP accesses after
> entering the guest, but since that has now changed, you should only
> fiddle with fpexec32 if you are indeed trapping (first entry after
> vcpu_load) and if it's a 32-bit VM.
> 
> Does that help?
Yes, it sure does! It puts the vcpu_load jump to EL2 vs. doing it on every entry into perspective.
> 
>>>
>>> also, you can consider only setting this in vcpu_load (jumping quickly
>>> to EL2 to do so) if we're running a 32-bit guest.  Probably worth
>>> measuring the difference between the extra EL2 jump on vcpu_load
>>> compared to hitting this register on every entry/exit.
>>
>> Sure, makes sense since this is a hot code path.
>>>
>>> Code-wise, it will be nicer to do it on vcpu_load.
>>>
>>>>  	msr	cptr_el2, x2
>>>>  
>>>>  	mov	x2, #(1 << 15)	// Trap CP15 Cr=15
>>>> @@ -669,14 +695,12 @@ __restore_debug:
>>>>  	ret
>>>>  
>>>>  __save_fpsimd:
>>>> -	skip_fpsimd_state x3, 1f
>>>>  	save_fpsimd
>>>> -1:	ret
>>>> +	ret
>>>>  
>>>>  __restore_fpsimd:
>>>> -	skip_fpsimd_state x3, 1f
>>>>  	restore_fpsimd
>>>> -1:	ret
>>>> +	ret
>>>>  
>>>>  switch_to_guest_fpsimd:
>>>>  	push	x4, lr
>>>> @@ -688,6 +712,9 @@ switch_to_guest_fpsimd:
>>>>  
>>>>  	mrs	x0, tpidr_el2
>>>>  
>>>> +	mov     w2, #1
>>>> +	strb    w2, [x0, #VCPU_VFP_DIRTY]
>>>
>>> hmm, just noticing this.  Are you not writing a 32-bit value to a
>>> potentially 8-bit field (ignoring padding in the struct), as the dirty
>>> flag is declared a bool.
>>
>> From the manuals byte stores require a word size registers on arm64/arm32.
>> The interconnect probably drops the remaining bytes.
>> Also double checked and the compiler uses same instructions.
>>
> 
> duh, I didn't see the 'b' in strb - I guess I was too tired to review
> patches.
> 
> -Christoffer
>
Mario Smarduch Nov. 9, 2015, 11:13 p.m. UTC | #5
On 11/5/2015 7:02 AM, Christoffer Dall wrote:
> On Fri, Oct 30, 2015 at 02:56:33PM -0700, Mario Smarduch wrote:
>> This patch enables arm64 lazy fp/simd switch, similar to arm described in
>> second patch. Change from previous version - restore function is moved to
>> host. 
>>
>> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
>> ---
>>  arch/arm64/include/asm/kvm_host.h |  2 +-
>>  arch/arm64/kernel/asm-offsets.c   |  1 +
>>  arch/arm64/kvm/hyp.S              | 37 +++++++++++++++++++++++++++++++------
>>  3 files changed, 33 insertions(+), 7 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
>> index 26a2347..dcecf92 100644
>> --- a/arch/arm64/include/asm/kvm_host.h
>> +++ b/arch/arm64/include/asm/kvm_host.h
>> @@ -251,11 +251,11 @@ static inline void kvm_arch_hardware_unsetup(void) {}
>>  static inline void kvm_arch_sync_events(struct kvm *kvm) {}
>>  static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
>>  static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
>> -static inline void kvm_restore_host_vfp_state(struct kvm_vcpu *vcpu) {}
>>  
>>  void kvm_arm_init_debug(void);
>>  void kvm_arm_setup_debug(struct kvm_vcpu *vcpu);
>>  void kvm_arm_clear_debug(struct kvm_vcpu *vcpu);
>>  void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu);
>> +void kvm_restore_host_vfp_state(struct kvm_vcpu *vcpu);
>>  
>>  #endif /* __ARM64_KVM_HOST_H__ */
>> diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
>> index 8d89cf8..c9c5242 100644
>> --- a/arch/arm64/kernel/asm-offsets.c
>> +++ b/arch/arm64/kernel/asm-offsets.c
>> @@ -124,6 +124,7 @@ int main(void)
>>    DEFINE(VCPU_HCR_EL2,		offsetof(struct kvm_vcpu, arch.hcr_el2));
>>    DEFINE(VCPU_MDCR_EL2,	offsetof(struct kvm_vcpu, arch.mdcr_el2));
>>    DEFINE(VCPU_IRQ_LINES,	offsetof(struct kvm_vcpu, arch.irq_lines));
>> +  DEFINE(VCPU_VFP_DIRTY,	offsetof(struct kvm_vcpu, arch.vfp_dirty));
>>    DEFINE(VCPU_HOST_CONTEXT,	offsetof(struct kvm_vcpu, arch.host_cpu_context));
>>    DEFINE(VCPU_HOST_DEBUG_STATE, offsetof(struct kvm_vcpu, arch.host_debug_state));
>>    DEFINE(VCPU_TIMER_CNTV_CTL,	offsetof(struct kvm_vcpu, arch.timer_cpu.cntv_ctl));
>> diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
>> index e583613..ed2c4cf 100644
>> --- a/arch/arm64/kvm/hyp.S
>> +++ b/arch/arm64/kvm/hyp.S
>> @@ -36,6 +36,28 @@
>>  #define CPU_SYSREG_OFFSET(x)	(CPU_SYSREGS + 8*x)
>>  
>>  	.text
>> +
>> +/**
>> + * void kvm_restore_host_vfp_state(struct vcpu *vcpu) - Executes lazy
>> + *	fp/simd switch, saves the guest, restores host. Called from host
>> + *	mode, placed outside of hyp section.
> 
> same comments on style as previous patch
> 
>> + */
>> +ENTRY(kvm_restore_host_vfp_state)
>> +	push    xzr, lr
>> +
>> +	add     x2, x0, #VCPU_CONTEXT
>> +	mov     w3, #0
>> +	strb    w3, [x0, #VCPU_VFP_DIRTY]
> 
> I've been discussing with myself if it would make more sense to clear
> the dirty flag in the C-code...
> 
>> +
>> +	bl __save_fpsimd
>> +
>> +	ldr     x2, [x0, #VCPU_HOST_CONTEXT]
>> +	bl __restore_fpsimd
>> +
>> +	pop     xzr, lr
>> +	ret
>> +ENDPROC(kvm_restore_host_vfp_state)
>> +
>>  	.pushsection	.hyp.text, "ax"
>>  	.align	PAGE_SHIFT
>>  
>> @@ -482,7 +504,11 @@
>>  99:
>>  	msr     hcr_el2, x2
>>  	mov	x2, #CPTR_EL2_TTA
>> +
>> +	ldrb	w3, [x0, #VCPU_VFP_DIRTY]
>> +	tbnz    w3, #0, 98f
>>  	orr     x2, x2, #CPTR_EL2_TFP
>> +98:
> 
> mmm, don't you need to only set the fpexc32 when you're actually going
> to trap the guest accesses?
> 
> also, you can consider only setting this in vcpu_load (jumping quickly
> to EL2 to do so) if we're running a 32-bit guest.  Probably worth
> measuring the difference between the extra EL2 jump on vcpu_load
> compared to hitting this register on every entry/exit.
> 
> Code-wise, it will be nicer to do it on vcpu_load.
Hi Christoffer, Marc -
  just want to run this by you. I ran a test with a typical number of
fp threads and a couple of lmbench benchmarks (the stride and bandwidth ones).
The ratio of exits to vcpu_puts is high, 50:1 or so, but of course that's
subject to the loads you run.

I substituted:
tbnz x2, #HCR_RW_SHIFT, 99f
mov x3, #(1 << 30)
msr fpexc32_el2, x3
isb

with a vcpu_load hyp call and a check for a 32-bit guest in C:
mov x1, #(1 << 30)
msr fpexc32_el2, x1
ret

And then
skip_fpsimd_state x8, 2f
mrs	x6, fpexc32_el2
str	x6, [x3, #16]

with a vcpu_put hyp call with a check for a 32-bit guest in C; this is called
substantially less often than vcpu_load since the fp/simd registers are not
always dirty on vcpu_put:

kern_hyp_va x0
add x2, x0, #VCPU_CONTEXT
mrs x1, fpexc32_el2
str x1, [x2, #CPU_SYSREG_OFFSET(FPEXC32_EL2)]
ret

Of course each hyp call has additional overhead; at a high exit-to-vcpu_put
ratio the hyp call appears better. But all this is highly dependent on exit
rate and fp/simd usage. IMO the hyp call works better under extreme loads and
should be pretty close for general loads.

Any thoughts?

- Mario

> 
>>  	msr	cptr_el2, x2
>>  
>>  	mov	x2, #(1 << 15)	// Trap CP15 Cr=15
>> @@ -669,14 +695,12 @@ __restore_debug:
>>  	ret
>>  
>>  __save_fpsimd:
>> -	skip_fpsimd_state x3, 1f
>>  	save_fpsimd
>> -1:	ret
>> +	ret
>>  
>>  __restore_fpsimd:
>> -	skip_fpsimd_state x3, 1f
>>  	restore_fpsimd
>> -1:	ret
>> +	ret
>>  
>>  switch_to_guest_fpsimd:
>>  	push	x4, lr
>> @@ -688,6 +712,9 @@ switch_to_guest_fpsimd:
>>  
>>  	mrs	x0, tpidr_el2
>>  
>> +	mov     w2, #1
>> +	strb    w2, [x0, #VCPU_VFP_DIRTY]
> 
> hmm, just noticing this.  Are you not writing a 32-bit value to a
> potentially 8-bit field (ignoring padding in the struct), as the dirty
> flag is declared a bool.
> 
> Are you also doing this on the 32-bit side?
> 
>> +
>>  	ldr	x2, [x0, #VCPU_HOST_CONTEXT]
>>  	kern_hyp_va x2
>>  	bl __save_fpsimd
>> @@ -763,7 +790,6 @@ __kvm_vcpu_return:
>>  	add	x2, x0, #VCPU_CONTEXT
>>  
>>  	save_guest_regs
>> -	bl __save_fpsimd
>>  	bl __save_sysregs
>>  
>>  	skip_debug_state x3, 1f
>> @@ -784,7 +810,6 @@ __kvm_vcpu_return:
>>  	kern_hyp_va x2
>>  
>>  	bl __restore_sysregs
>> -	bl __restore_fpsimd
>>  	/* Clear FPSIMD and Trace trapping */
>>  	msr     cptr_el2, xzr
>>  
>> -- 
>> 1.9.1
>>
> 
> Thanks,
> -Christoffer
>
Christoffer Dall Nov. 10, 2015, 11:18 a.m. UTC | #6
On Mon, Nov 09, 2015 at 03:13:15PM -0800, Mario Smarduch wrote:
> 
> 
> On 11/5/2015 7:02 AM, Christoffer Dall wrote:
> > On Fri, Oct 30, 2015 at 02:56:33PM -0700, Mario Smarduch wrote:
> >> This patch enables arm64 lazy fp/simd switch, similar to arm described in
> >> second patch. Change from previous version - restore function is moved to
> >> host. 
> >>
> >> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
> >> ---
> >>  arch/arm64/include/asm/kvm_host.h |  2 +-
> >>  arch/arm64/kernel/asm-offsets.c   |  1 +
> >>  arch/arm64/kvm/hyp.S              | 37 +++++++++++++++++++++++++++++++------
> >>  3 files changed, 33 insertions(+), 7 deletions(-)
> >>
> >> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> >> index 26a2347..dcecf92 100644
> >> --- a/arch/arm64/include/asm/kvm_host.h
> >> +++ b/arch/arm64/include/asm/kvm_host.h
> >> @@ -251,11 +251,11 @@ static inline void kvm_arch_hardware_unsetup(void) {}
> >>  static inline void kvm_arch_sync_events(struct kvm *kvm) {}
> >>  static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
> >>  static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
> >> -static inline void kvm_restore_host_vfp_state(struct kvm_vcpu *vcpu) {}
> >>  
> >>  void kvm_arm_init_debug(void);
> >>  void kvm_arm_setup_debug(struct kvm_vcpu *vcpu);
> >>  void kvm_arm_clear_debug(struct kvm_vcpu *vcpu);
> >>  void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu);
> >> +void kvm_restore_host_vfp_state(struct kvm_vcpu *vcpu);
> >>  
> >>  #endif /* __ARM64_KVM_HOST_H__ */
> >> diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
> >> index 8d89cf8..c9c5242 100644
> >> --- a/arch/arm64/kernel/asm-offsets.c
> >> +++ b/arch/arm64/kernel/asm-offsets.c
> >> @@ -124,6 +124,7 @@ int main(void)
> >>    DEFINE(VCPU_HCR_EL2,		offsetof(struct kvm_vcpu, arch.hcr_el2));
> >>    DEFINE(VCPU_MDCR_EL2,	offsetof(struct kvm_vcpu, arch.mdcr_el2));
> >>    DEFINE(VCPU_IRQ_LINES,	offsetof(struct kvm_vcpu, arch.irq_lines));
> >> +  DEFINE(VCPU_VFP_DIRTY,	offsetof(struct kvm_vcpu, arch.vfp_dirty));
> >>    DEFINE(VCPU_HOST_CONTEXT,	offsetof(struct kvm_vcpu, arch.host_cpu_context));
> >>    DEFINE(VCPU_HOST_DEBUG_STATE, offsetof(struct kvm_vcpu, arch.host_debug_state));
> >>    DEFINE(VCPU_TIMER_CNTV_CTL,	offsetof(struct kvm_vcpu, arch.timer_cpu.cntv_ctl));
> >> diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
> >> index e583613..ed2c4cf 100644
> >> --- a/arch/arm64/kvm/hyp.S
> >> +++ b/arch/arm64/kvm/hyp.S
> >> @@ -36,6 +36,28 @@
> >>  #define CPU_SYSREG_OFFSET(x)	(CPU_SYSREGS + 8*x)
> >>  
> >>  	.text
> >> +
> >> +/**
> >> + * void kvm_restore_host_vfp_state(struct vcpu *vcpu) - Executes lazy
> >> + *	fp/simd switch, saves the guest, restores host. Called from host
> >> + *	mode, placed outside of hyp section.
> > 
> > same comments on style as previous patch
> > 
> >> + */
> >> +ENTRY(kvm_restore_host_vfp_state)
> >> +	push    xzr, lr
> >> +
> >> +	add     x2, x0, #VCPU_CONTEXT
> >> +	mov     w3, #0
> >> +	strb    w3, [x0, #VCPU_VFP_DIRTY]
> > 
> > I've been discussing with myself if it would make more sense to clear
> > the dirty flag in the C-code...
> > 
> >> +
> >> +	bl __save_fpsimd
> >> +
> >> +	ldr     x2, [x0, #VCPU_HOST_CONTEXT]
> >> +	bl __restore_fpsimd
> >> +
> >> +	pop     xzr, lr
> >> +	ret
> >> +ENDPROC(kvm_restore_host_vfp_state)
> >> +
> >>  	.pushsection	.hyp.text, "ax"
> >>  	.align	PAGE_SHIFT
> >>  
> >> @@ -482,7 +504,11 @@
> >>  99:
> >>  	msr     hcr_el2, x2
> >>  	mov	x2, #CPTR_EL2_TTA
> >> +
> >> +	ldrb	w3, [x0, #VCPU_VFP_DIRTY]
> >> +	tbnz    w3, #0, 98f
> >>  	orr     x2, x2, #CPTR_EL2_TFP
> >> +98:
> > 
> > mmm, don't you need to only set the fpexc32 when you're actually going
> > to trap the guest accesses?
> > 
> > also, you can consider only setting this in vcpu_load (jumping quickly
> > to EL2 to do so) if we're running a 32-bit guest.  Probably worth
> > measuring the difference between the extra EL2 jump on vcpu_load
> > compared to hitting this register on every entry/exit.
> > 
> > Code-wise, it will be nicer to do it on vcpu_load.
> Hi Christoffer, Marc -
>   just want to run this by you, I ran a test with typical number of
> fp threads and couple lmbench benchmarks  the stride and bandwidth ones. The
> ratio of exits to vcpu puts is high 50:1 or so. But of course that's subject
> to the loads you run.
> 
> I substituted:
> tbnz x2, #HCR_RW_SHIFT, 99f
> mov x3, #(1 << 30)
> msr fpexc32_el2, x3
> isb
> 
> with vcpu_load hyp call and check for 32 bit guest in C
> mov x1, #(1 << 30)
> msr fpexc32_el2, x3
> ret
> 
> And then
> skip_fpsimd_state x8, 2f
> mrs	x6, fpexec_el2
> str	x6, [x3, #16]
> 
> with vcpu_put hyp call with check for 32 bit guest in C this is called
> substantially less often then vcpu_load since fp/simd registers are not
> always dirty on vcpu_put
> 
> kern_hyp_va x0
> add x2, x0, #VCPU_CONTEXT
> mrs x1, fpexec32_el2
> str x1, [x2, #CPU_SYSREG_OFFSET(FPEXC32_EL2)]
> ret
> 
> Of course each hyp call has additional overhead, at a high exit to
> vcpu_put ratio hyp call appears better. But all this is very
> highly dependent on exit rate and fp/simd usage. IMO hyp call
> works better under extreme loads should be pretty close
> for general loads.
> 
> Any thoughts?
> 
I think the typical case will be lots of exits and few
vcpu_load/vcpu_put, and I think it's reasonable to write the code that
way.

That should also be much better for VHE.

So I would go that direction.
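
For completeness, the matching vcpu_put side might then look roughly like
this (again only a sketch; __kvm_save_fpexc32 is a hypothetical helper, and
the exact split between C and the hyp routine is up to the next version):

	/* Sketch: only save FPEXC32_EL2 back into the guest context when the
	 * guest actually dirtied fp/simd state and its EL1 is AArch32, then
	 * do the lazy guest-save/host-restore.
	 */
	if (vcpu->arch.vfp_dirty) {
		if (!(vcpu->arch.hcr_el2 & HCR_RW))
			kvm_call_hyp(__kvm_save_fpexc32, vcpu);
		kvm_restore_host_vfp_state(vcpu);
	}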

Thanks,
-Christoffer
Mario Smarduch Nov. 14, 2015, 11:04 p.m. UTC | #7
On 11/10/2015 3:18 AM, Christoffer Dall wrote:
> On Mon, Nov 09, 2015 at 03:13:15PM -0800, Mario Smarduch wrote:
>>
>>
>> On 11/5/2015 7:02 AM, Christoffer Dall wrote:
>>> On Fri, Oct 30, 2015 at 02:56:33PM -0700, Mario Smarduch wrote:
[....]
>> kern_hyp_va x0
>> add x2, x0, #VCPU_CONTEXT
>> mrs x1, fpexec32_el2
>> str x1, [x2, #CPU_SYSREG_OFFSET(FPEXC32_EL2)]
>> ret
>>
>> Of course each hyp call has additional overhead, at a high exit to
>> vcpu_put ratio hyp call appears better. But all this is very
>> highly dependent on exit rate and fp/simd usage. IMO hyp call
>> works better under extreme loads should be pretty close
>> for general loads.
>>
>> Any thoughts?
>>
> I think the typical case will be lots of exits and few
> vcpu_load/vcpu_put, and I think it's reasonable to write the code that
> way.

Yes, especially for RT guests where the vCPU is pinned.

Thanks.
> 
> That should also be much better for VHE.
> 
> So I would go that direction.
> 
> Thanks,
> -Christoffer
>

Patch

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 26a2347..dcecf92 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -251,11 +251,11 @@  static inline void kvm_arch_hardware_unsetup(void) {}
 static inline void kvm_arch_sync_events(struct kvm *kvm) {}
 static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
-static inline void kvm_restore_host_vfp_state(struct kvm_vcpu *vcpu) {}
 
 void kvm_arm_init_debug(void);
 void kvm_arm_setup_debug(struct kvm_vcpu *vcpu);
 void kvm_arm_clear_debug(struct kvm_vcpu *vcpu);
 void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu);
+void kvm_restore_host_vfp_state(struct kvm_vcpu *vcpu);
 
 #endif /* __ARM64_KVM_HOST_H__ */
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index 8d89cf8..c9c5242 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -124,6 +124,7 @@  int main(void)
   DEFINE(VCPU_HCR_EL2,		offsetof(struct kvm_vcpu, arch.hcr_el2));
   DEFINE(VCPU_MDCR_EL2,	offsetof(struct kvm_vcpu, arch.mdcr_el2));
   DEFINE(VCPU_IRQ_LINES,	offsetof(struct kvm_vcpu, arch.irq_lines));
+  DEFINE(VCPU_VFP_DIRTY,	offsetof(struct kvm_vcpu, arch.vfp_dirty));
   DEFINE(VCPU_HOST_CONTEXT,	offsetof(struct kvm_vcpu, arch.host_cpu_context));
   DEFINE(VCPU_HOST_DEBUG_STATE, offsetof(struct kvm_vcpu, arch.host_debug_state));
   DEFINE(VCPU_TIMER_CNTV_CTL,	offsetof(struct kvm_vcpu, arch.timer_cpu.cntv_ctl));
diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
index e583613..ed2c4cf 100644
--- a/arch/arm64/kvm/hyp.S
+++ b/arch/arm64/kvm/hyp.S
@@ -36,6 +36,28 @@ 
 #define CPU_SYSREG_OFFSET(x)	(CPU_SYSREGS + 8*x)
 
 	.text
+
+/**
+ * void kvm_restore_host_vfp_state(struct vcpu *vcpu) - Executes lazy
+ *	fp/simd switch, saves the guest, restores host. Called from host
+ *	mode, placed outside of hyp section.
+ */
+ENTRY(kvm_restore_host_vfp_state)
+	push    xzr, lr
+
+	add     x2, x0, #VCPU_CONTEXT
+	mov     w3, #0
+	strb    w3, [x0, #VCPU_VFP_DIRTY]
+
+	bl __save_fpsimd
+
+	ldr     x2, [x0, #VCPU_HOST_CONTEXT]
+	bl __restore_fpsimd
+
+	pop     xzr, lr
+	ret
+ENDPROC(kvm_restore_host_vfp_state)
+
 	.pushsection	.hyp.text, "ax"
 	.align	PAGE_SHIFT
 
@@ -482,7 +504,11 @@ 
 99:
 	msr     hcr_el2, x2
 	mov	x2, #CPTR_EL2_TTA
+
+	ldrb	w3, [x0, #VCPU_VFP_DIRTY]
+	tbnz    w3, #0, 98f
 	orr     x2, x2, #CPTR_EL2_TFP
+98:
 	msr	cptr_el2, x2
 
 	mov	x2, #(1 << 15)	// Trap CP15 Cr=15
@@ -669,14 +695,12 @@  __restore_debug:
 	ret
 
 __save_fpsimd:
-	skip_fpsimd_state x3, 1f
 	save_fpsimd
-1:	ret
+	ret
 
 __restore_fpsimd:
-	skip_fpsimd_state x3, 1f
 	restore_fpsimd
-1:	ret
+	ret
 
 switch_to_guest_fpsimd:
 	push	x4, lr
@@ -688,6 +712,9 @@  switch_to_guest_fpsimd:
 
 	mrs	x0, tpidr_el2
 
+	mov     w2, #1
+	strb    w2, [x0, #VCPU_VFP_DIRTY]
+
 	ldr	x2, [x0, #VCPU_HOST_CONTEXT]
 	kern_hyp_va x2
 	bl __save_fpsimd
@@ -763,7 +790,6 @@  __kvm_vcpu_return:
 	add	x2, x0, #VCPU_CONTEXT
 
 	save_guest_regs
-	bl __save_fpsimd
 	bl __save_sysregs
 
 	skip_debug_state x3, 1f
@@ -784,7 +810,6 @@  __kvm_vcpu_return:
 	kern_hyp_va x2
 
 	bl __restore_sysregs
-	bl __restore_fpsimd
 	/* Clear FPSIMD and Trace trapping */
 	msr     cptr_el2, xzr