Message ID | 20190621093843.220980-17-marc.zyngier@arm.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | KVM: arm64: ARMv8.3 Nested Virtualization support | expand |
On 06/21/2019 10:38 AM, Marc Zyngier wrote: > From: Andre Przywara <andre.przywara@arm.com> > > Whenever we need to restore the guest's system registers to the CPU, we > now need to take care of the EL2 system registers as well. Most of them > are accessed via traps only, but some have an immediate effect and also > a guest running in VHE mode would expect them to be accessible via their > EL1 encoding, which we do not trap. > > Split the current __sysreg_{save,restore}_el1_state() functions into > handling common sysregs, then differentiate between the guest running in > vEL2 and vEL1. > > For vEL2 we write the virtual EL2 registers with an identical format directly > into their EL1 counterpart, and translate the few registers that have a > different format for the same effect on the execution when running a > non-VHE guest guest hypervisor. > > [ Commit message reworked and many bug fixes applied by Marc Zyngier > and Christoffer Dall. ] > > Signed-off-by: Andre Przywara <andre.przywara@arm.com> > Signed-off-by: Marc Zyngier <marc.zyngier@arm.com> > Signed-off-by: Christoffer Dall <christoffer.dall@arm.com> > --- > arch/arm64/kvm/hyp/sysreg-sr.c | 160 +++++++++++++++++++++++++++++++-- > 1 file changed, 153 insertions(+), 7 deletions(-) > > diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c > index 62866a68e852..2abb9c3ff24f 100644 > --- a/arch/arm64/kvm/hyp/sysreg-sr.c > +++ b/arch/arm64/kvm/hyp/sysreg-sr.c [...] 
> @@ -124,10 +167,91 @@ static void __hyp_text __sysreg_restore_user_state(struct kvm_cpu_context *ctxt) > write_sysreg(ctxt->sys_regs[TPIDRRO_EL0], tpidrro_el0); > } > > -static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt) > +static void __sysreg_restore_vel2_state(struct kvm_cpu_context *ctxt) > { > + u64 val; > + > + write_sysreg(read_cpuid_id(), vpidr_el2); > write_sysreg(ctxt->sys_regs[MPIDR_EL1], vmpidr_el2); > - write_sysreg(ctxt->sys_regs[CSSELR_EL1], csselr_el1); > + write_sysreg_el1(ctxt->sys_regs[MAIR_EL2], SYS_MAIR); > + write_sysreg_el1(ctxt->sys_regs[VBAR_EL2], SYS_VBAR); > + write_sysreg_el1(ctxt->sys_regs[CONTEXTIDR_EL2],SYS_CONTEXTIDR); > + write_sysreg_el1(ctxt->sys_regs[AMAIR_EL2], SYS_AMAIR); > + > + if (__vcpu_el2_e2h_is_set(ctxt)) { > + /* > + * In VHE mode those registers are compatible between > + * EL1 and EL2. > + */ > + write_sysreg_el1(ctxt->sys_regs[SCTLR_EL2], SYS_SCTLR); > + write_sysreg_el1(ctxt->sys_regs[CPTR_EL2], SYS_CPACR); > + write_sysreg_el1(ctxt->sys_regs[TTBR0_EL2], SYS_TTBR0); > + write_sysreg_el1(ctxt->sys_regs[TTBR1_EL2], SYS_TTBR1); > + write_sysreg_el1(ctxt->sys_regs[TCR_EL2], SYS_TCR); > + write_sysreg_el1(ctxt->sys_regs[CNTHCTL_EL2], SYS_CNTKCTL); > + } else { > + write_sysreg_el1(translate_sctlr(ctxt->sys_regs[SCTLR_EL2]), > + SYS_SCTLR); > + write_sysreg_el1(translate_cptr(ctxt->sys_regs[CPTR_EL2]), > + SYS_CPACR); > + write_sysreg_el1(translate_ttbr0(ctxt->sys_regs[TTBR0_EL2]), > + SYS_TTBR0); > + write_sysreg_el1(translate_tcr(ctxt->sys_regs[TCR_EL2]), > + SYS_TCR); > + write_sysreg_el1(translate_cnthctl(ctxt->sys_regs[CNTHCTL_EL2]), > + SYS_CNTKCTL); > + } > + > + /* > + * These registers can be modified behind our back by a fault > + * taken inside vEL2. Save them, always. 
> + */ > + write_sysreg_el1(ctxt->sys_regs[ESR_EL2], SYS_ESR); > + write_sysreg_el1(ctxt->sys_regs[AFSR0_EL2], SYS_AFSR0); > + write_sysreg_el1(ctxt->sys_regs[AFSR1_EL2], SYS_AFSR1); > + write_sysreg_el1(ctxt->sys_regs[FAR_EL2], SYS_FAR); > + write_sysreg(ctxt->sys_regs[SP_EL2], sp_el1); > + write_sysreg_el1(ctxt->sys_regs[ELR_EL2], SYS_ELR); > + > + val = __fixup_spsr_el2_write(ctxt, ctxt->sys_regs[SPSR_EL2]); > + write_sysreg_el1(val, SYS_SPSR); > +} > + > +static void __hyp_text __sysreg_restore_vel1_state(struct kvm_cpu_context *ctxt) > +{ > + u64 mpidr; > + > + if (has_vhe()) { > + struct kvm_vcpu *vcpu; > + > + /* > + * Warning: this hack only works on VHE, because we only > + * call this with the *guest* context, which is part of > + * struct kvm_vcpu. On a host context, you'd get pure junk. > + */ > + vcpu = container_of(ctxt, struct kvm_vcpu, arch.ctxt); This seems very fragile, just to find out whether the guest has hyp capabilities. It would be at least nice to make sure this is indeed a guest context. The *clean* way to do it could be to have a pointer to kvm_vcpu in the kvm_cpu_context which would be NULL for host contexts. Otherwise, I'm under the impression that for a host context, ctxt->sys_reg[HCR_EL2] == 0 and that this would also be true for a guest without nested virt capability. Could we use something like that here? Cheers, Julien
On 6/21/19 10:38 AM, Marc Zyngier wrote: > From: Andre Przywara <andre.przywara@arm.com> > > Whenever we need to restore the guest's system registers to the CPU, we > now need to take care of the EL2 system registers as well. Most of them > are accessed via traps only, but some have an immediate effect and also > a guest running in VHE mode would expect them to be accessible via their > EL1 encoding, which we do not trap. > > Split the current __sysreg_{save,restore}_el1_state() functions into > handling common sysregs, then differentiate between the guest running in > vEL2 and vEL1. > > For vEL2 we write the virtual EL2 registers with an identical format directly > into their EL1 counterpart, and translate the few registers that have a > different format for the same effect on the execution when running a > non-VHE guest guest hypervisor. > > [ Commit message reworked and many bug fixes applied by Marc Zyngier > and Christoffer Dall. ] > > Signed-off-by: Andre Przywara <andre.przywara@arm.com> > Signed-off-by: Marc Zyngier <marc.zyngier@arm.com> > Signed-off-by: Christoffer Dall <christoffer.dall@arm.com> > --- > arch/arm64/kvm/hyp/sysreg-sr.c | 160 +++++++++++++++++++++++++++++++-- > 1 file changed, 153 insertions(+), 7 deletions(-) > > diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c > index 62866a68e852..2abb9c3ff24f 100644 > --- a/arch/arm64/kvm/hyp/sysreg-sr.c > +++ b/arch/arm64/kvm/hyp/sysreg-sr.c > @@ -22,6 +22,7 @@ > #include <asm/kvm_asm.h> > #include <asm/kvm_emulate.h> > #include <asm/kvm_hyp.h> > +#include <asm/kvm_nested.h> > > /* > * Non-VHE: Both host and guest must save everything. 
> @@ -51,11 +52,9 @@ static void __hyp_text __sysreg_save_user_state(struct kvm_cpu_context *ctxt) > ctxt->sys_regs[TPIDRRO_EL0] = read_sysreg(tpidrro_el0); > } > > -static void __hyp_text __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) > +static void __hyp_text __sysreg_save_vel1_state(struct kvm_cpu_context *ctxt) > { > - ctxt->sys_regs[CSSELR_EL1] = read_sysreg(csselr_el1); > ctxt->sys_regs[SCTLR_EL1] = read_sysreg_el1(SYS_SCTLR); > - ctxt->sys_regs[ACTLR_EL1] = read_sysreg(actlr_el1); > ctxt->sys_regs[CPACR_EL1] = read_sysreg_el1(SYS_CPACR); > ctxt->sys_regs[TTBR0_EL1] = read_sysreg_el1(SYS_TTBR0); > ctxt->sys_regs[TTBR1_EL1] = read_sysreg_el1(SYS_TTBR1); > @@ -69,14 +68,58 @@ static void __hyp_text __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) > ctxt->sys_regs[CONTEXTIDR_EL1] = read_sysreg_el1(SYS_CONTEXTIDR); > ctxt->sys_regs[AMAIR_EL1] = read_sysreg_el1(SYS_AMAIR); > ctxt->sys_regs[CNTKCTL_EL1] = read_sysreg_el1(SYS_CNTKCTL); > - ctxt->sys_regs[PAR_EL1] = read_sysreg(par_el1); > - ctxt->sys_regs[TPIDR_EL1] = read_sysreg(tpidr_el1); > > ctxt->gp_regs.sp_el1 = read_sysreg(sp_el1); > ctxt->gp_regs.elr_el1 = read_sysreg_el1(SYS_ELR); > ctxt->gp_regs.spsr[KVM_SPSR_EL1]= read_sysreg_el1(SYS_SPSR); > } > > +static void __sysreg_save_vel2_state(struct kvm_cpu_context *ctxt) > +{ > + ctxt->sys_regs[ESR_EL2] = read_sysreg_el1(SYS_ESR); > + ctxt->sys_regs[AFSR0_EL2] = read_sysreg_el1(SYS_AFSR0); > + ctxt->sys_regs[AFSR1_EL2] = read_sysreg_el1(SYS_AFSR1); > + ctxt->sys_regs[FAR_EL2] = read_sysreg_el1(SYS_FAR); > + ctxt->sys_regs[MAIR_EL2] = read_sysreg_el1(SYS_MAIR); > + ctxt->sys_regs[VBAR_EL2] = read_sysreg_el1(SYS_VBAR); > + ctxt->sys_regs[CONTEXTIDR_EL2] = read_sysreg_el1(SYS_CONTEXTIDR); > + ctxt->sys_regs[AMAIR_EL2] = read_sysreg_el1(SYS_AMAIR); > + > + /* > + * In VHE mode those registers are compatible between EL1 and EL2, > + * and the guest uses the _EL1 versions on the CPU naturally. > + * So we save them into their _EL2 versions here. 
> + * For nVHE mode we trap accesses to those registers, so our > + * _EL2 copy in sys_regs[] is always up-to-date and we don't need > + * to save anything here. > + */ > + if (__vcpu_el2_e2h_is_set(ctxt)) { > + ctxt->sys_regs[SCTLR_EL2] = read_sysreg_el1(SYS_SCTLR); > + ctxt->sys_regs[CPTR_EL2] = read_sysreg_el1(SYS_CPACR); > + ctxt->sys_regs[TTBR0_EL2] = read_sysreg_el1(SYS_TTBR0); > + ctxt->sys_regs[TTBR1_EL2] = read_sysreg_el1(SYS_TTBR1); > + ctxt->sys_regs[TCR_EL2] = read_sysreg_el1(SYS_TCR); > + ctxt->sys_regs[CNTHCTL_EL2] = read_sysreg_el1(SYS_CNTKCTL); This goes against how the register is declared in arch/arm64/kvm/sys_regs.c (added by patch 13), where it's declared as a "pure" EL2 register with no EL1 counterpart. I think this is correct, and having it as a pure register is not the right approach, I'll explain why in patch 13. > + } > + > + ctxt->sys_regs[SP_EL2] = read_sysreg(sp_el1); > + ctxt->sys_regs[ELR_EL2] = read_sysreg_el1(SYS_ELR); > + ctxt->sys_regs[SPSR_EL2] = __fixup_spsr_el2_read(ctxt, read_sysreg_el1(SYS_SPSR)); > +} > + > +static void __hyp_text __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) > +{ > + ctxt->sys_regs[CSSELR_EL1] = read_sysreg(csselr_el1); > + ctxt->sys_regs[ACTLR_EL1] = read_sysreg(actlr_el1); > + ctxt->sys_regs[PAR_EL1] = read_sysreg(par_el1); > + ctxt->sys_regs[TPIDR_EL1] = read_sysreg(tpidr_el1); > + > + if (unlikely(__is_hyp_ctxt(ctxt))) > + __sysreg_save_vel2_state(ctxt); > + else > + __sysreg_save_vel1_state(ctxt); > +} > + > static void __hyp_text __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt) > { > ctxt->gp_regs.regs.pc = read_sysreg_el2(SYS_ELR); > @@ -124,10 +167,91 @@ static void __hyp_text __sysreg_restore_user_state(struct kvm_cpu_context *ctxt) > write_sysreg(ctxt->sys_regs[TPIDRRO_EL0], tpidrro_el0); > } > > -static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt) > +static void __sysreg_restore_vel2_state(struct kvm_cpu_context *ctxt) > { > + u64 val; > + > + 
write_sysreg(read_cpuid_id(), vpidr_el2); > write_sysreg(ctxt->sys_regs[MPIDR_EL1], vmpidr_el2); > - write_sysreg(ctxt->sys_regs[CSSELR_EL1], csselr_el1); > + write_sysreg_el1(ctxt->sys_regs[MAIR_EL2], SYS_MAIR); > + write_sysreg_el1(ctxt->sys_regs[VBAR_EL2], SYS_VBAR); > + write_sysreg_el1(ctxt->sys_regs[CONTEXTIDR_EL2],SYS_CONTEXTIDR); > + write_sysreg_el1(ctxt->sys_regs[AMAIR_EL2], SYS_AMAIR); > + > + if (__vcpu_el2_e2h_is_set(ctxt)) { > + /* > + * In VHE mode those registers are compatible between > + * EL1 and EL2. > + */ > + write_sysreg_el1(ctxt->sys_regs[SCTLR_EL2], SYS_SCTLR); > + write_sysreg_el1(ctxt->sys_regs[CPTR_EL2], SYS_CPACR); > + write_sysreg_el1(ctxt->sys_regs[TTBR0_EL2], SYS_TTBR0); > + write_sysreg_el1(ctxt->sys_regs[TTBR1_EL2], SYS_TTBR1); > + write_sysreg_el1(ctxt->sys_regs[TCR_EL2], SYS_TCR); > + write_sysreg_el1(ctxt->sys_regs[CNTHCTL_EL2], SYS_CNTKCTL); > + } else { > + write_sysreg_el1(translate_sctlr(ctxt->sys_regs[SCTLR_EL2]), > + SYS_SCTLR); > + write_sysreg_el1(translate_cptr(ctxt->sys_regs[CPTR_EL2]), > + SYS_CPACR); > + write_sysreg_el1(translate_ttbr0(ctxt->sys_regs[TTBR0_EL2]), > + SYS_TTBR0); > + write_sysreg_el1(translate_tcr(ctxt->sys_regs[TCR_EL2]), > + SYS_TCR); > + write_sysreg_el1(translate_cnthctl(ctxt->sys_regs[CNTHCTL_EL2]), > + SYS_CNTKCTL); > + } > + > + /* > + * These registers can be modified behind our back by a fault > + * taken inside vEL2. Save them, always. 
> + */ > + write_sysreg_el1(ctxt->sys_regs[ESR_EL2], SYS_ESR); > + write_sysreg_el1(ctxt->sys_regs[AFSR0_EL2], SYS_AFSR0); > + write_sysreg_el1(ctxt->sys_regs[AFSR1_EL2], SYS_AFSR1); > + write_sysreg_el1(ctxt->sys_regs[FAR_EL2], SYS_FAR); > + write_sysreg(ctxt->sys_regs[SP_EL2], sp_el1); > + write_sysreg_el1(ctxt->sys_regs[ELR_EL2], SYS_ELR); > + > + val = __fixup_spsr_el2_write(ctxt, ctxt->sys_regs[SPSR_EL2]); > + write_sysreg_el1(val, SYS_SPSR); > +} > + > +static void __hyp_text __sysreg_restore_vel1_state(struct kvm_cpu_context *ctxt) > +{ > + u64 mpidr; > + > + if (has_vhe()) { > + struct kvm_vcpu *vcpu; > + > + /* > + * Warning: this hack only works on VHE, because we only > + * call this with the *guest* context, which is part of > + * struct kvm_vcpu. On a host context, you'd get pure junk. > + */ > + vcpu = container_of(ctxt, struct kvm_vcpu, arch.ctxt); > + > + if (nested_virt_in_use(vcpu)) { > + /* > + * Only set VPIDR_EL2 for nested VMs, as this is the > + * only time it changes. We'll restore the MIDR_EL1 > + * view on put. > + */ > + write_sysreg(ctxt->sys_regs[VPIDR_EL2], vpidr_el2); > + > + /* > + * As we're restoring a nested guest, set the value > + * provided by the guest hypervisor. 
> + */ > + mpidr = ctxt->sys_regs[VMPIDR_EL2]; > + } else { > + mpidr = ctxt->sys_regs[MPIDR_EL1]; > + } > + } else { > + mpidr = ctxt->sys_regs[MPIDR_EL1]; > + } > + > + write_sysreg(mpidr, vmpidr_el2); > write_sysreg_el1(ctxt->sys_regs[SCTLR_EL1], SYS_SCTLR); > write_sysreg(ctxt->sys_regs[ACTLR_EL1], actlr_el1); > write_sysreg_el1(ctxt->sys_regs[CPACR_EL1], SYS_CPACR); > @@ -151,6 +275,19 @@ static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt) > write_sysreg_el1(ctxt->gp_regs.spsr[KVM_SPSR_EL1],SYS_SPSR); > } > > +static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt) > +{ > + write_sysreg(ctxt->sys_regs[CSSELR_EL1], csselr_el1); > + write_sysreg(ctxt->sys_regs[ACTLR_EL1], actlr_el1); > + write_sysreg(ctxt->sys_regs[PAR_EL1], par_el1); > + write_sysreg(ctxt->sys_regs[TPIDR_EL1], tpidr_el1); > + > + if (__is_hyp_ctxt(ctxt)) > + __sysreg_restore_vel2_state(ctxt); > + else > + __sysreg_restore_vel1_state(ctxt); > +} > + > static void __hyp_text > __sysreg_restore_el2_return_state(struct kvm_cpu_context *ctxt) > { > @@ -307,6 +444,15 @@ void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu) > /* Restore host user state */ > __sysreg_restore_user_state(host_ctxt); > > + /* > + * If leaving a nesting guest, restore MPIDR_EL1 default view. It is > + * slightly ugly to do it here, but the alternative is to penalize > + * all non-nesting guests by forcing this on every load. Instead, we > + * choose to only penalize nesting VMs. > + */ > + if (nested_virt_in_use(vcpu)) > + write_sysreg(read_cpuid_id(), vpidr_el2); > + > vcpu->arch.sysregs_loaded_on_cpu = false; > } >
On 25/06/2019 09:48, Julien Thierry wrote: > > > On 06/21/2019 10:38 AM, Marc Zyngier wrote: >> From: Andre Przywara <andre.przywara@arm.com> >> >> Whenever we need to restore the guest's system registers to the CPU, we >> now need to take care of the EL2 system registers as well. Most of them >> are accessed via traps only, but some have an immediate effect and also >> a guest running in VHE mode would expect them to be accessible via their >> EL1 encoding, which we do not trap. >> >> Split the current __sysreg_{save,restore}_el1_state() functions into >> handling common sysregs, then differentiate between the guest running in >> vEL2 and vEL1. >> >> For vEL2 we write the virtual EL2 registers with an identical format directly >> into their EL1 counterpart, and translate the few registers that have a >> different format for the same effect on the execution when running a >> non-VHE guest guest hypervisor. >> >> [ Commit message reworked and many bug fixes applied by Marc Zyngier >> and Christoffer Dall. ] >> >> Signed-off-by: Andre Przywara <andre.przywara@arm.com> >> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com> >> Signed-off-by: Christoffer Dall <christoffer.dall@arm.com> >> --- >> arch/arm64/kvm/hyp/sysreg-sr.c | 160 +++++++++++++++++++++++++++++++-- >> 1 file changed, 153 insertions(+), 7 deletions(-) >> >> diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c >> index 62866a68e852..2abb9c3ff24f 100644 >> --- a/arch/arm64/kvm/hyp/sysreg-sr.c >> +++ b/arch/arm64/kvm/hyp/sysreg-sr.c > > [...] 
> >> @@ -124,10 +167,91 @@ static void __hyp_text __sysreg_restore_user_state(struct kvm_cpu_context *ctxt) >> write_sysreg(ctxt->sys_regs[TPIDRRO_EL0], tpidrro_el0); >> } >> >> -static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt) >> +static void __sysreg_restore_vel2_state(struct kvm_cpu_context *ctxt) >> { >> + u64 val; >> + >> + write_sysreg(read_cpuid_id(), vpidr_el2); >> write_sysreg(ctxt->sys_regs[MPIDR_EL1], vmpidr_el2); >> - write_sysreg(ctxt->sys_regs[CSSELR_EL1], csselr_el1); >> + write_sysreg_el1(ctxt->sys_regs[MAIR_EL2], SYS_MAIR); >> + write_sysreg_el1(ctxt->sys_regs[VBAR_EL2], SYS_VBAR); >> + write_sysreg_el1(ctxt->sys_regs[CONTEXTIDR_EL2],SYS_CONTEXTIDR); >> + write_sysreg_el1(ctxt->sys_regs[AMAIR_EL2], SYS_AMAIR); >> + >> + if (__vcpu_el2_e2h_is_set(ctxt)) { >> + /* >> + * In VHE mode those registers are compatible between >> + * EL1 and EL2. >> + */ >> + write_sysreg_el1(ctxt->sys_regs[SCTLR_EL2], SYS_SCTLR); >> + write_sysreg_el1(ctxt->sys_regs[CPTR_EL2], SYS_CPACR); >> + write_sysreg_el1(ctxt->sys_regs[TTBR0_EL2], SYS_TTBR0); >> + write_sysreg_el1(ctxt->sys_regs[TTBR1_EL2], SYS_TTBR1); >> + write_sysreg_el1(ctxt->sys_regs[TCR_EL2], SYS_TCR); >> + write_sysreg_el1(ctxt->sys_regs[CNTHCTL_EL2], SYS_CNTKCTL); >> + } else { >> + write_sysreg_el1(translate_sctlr(ctxt->sys_regs[SCTLR_EL2]), >> + SYS_SCTLR); >> + write_sysreg_el1(translate_cptr(ctxt->sys_regs[CPTR_EL2]), >> + SYS_CPACR); >> + write_sysreg_el1(translate_ttbr0(ctxt->sys_regs[TTBR0_EL2]), >> + SYS_TTBR0); >> + write_sysreg_el1(translate_tcr(ctxt->sys_regs[TCR_EL2]), >> + SYS_TCR); >> + write_sysreg_el1(translate_cnthctl(ctxt->sys_regs[CNTHCTL_EL2]), >> + SYS_CNTKCTL); >> + } >> + >> + /* >> + * These registers can be modified behind our back by a fault >> + * taken inside vEL2. Save them, always. 
>> + */ >> + write_sysreg_el1(ctxt->sys_regs[ESR_EL2], SYS_ESR); >> + write_sysreg_el1(ctxt->sys_regs[AFSR0_EL2], SYS_AFSR0); >> + write_sysreg_el1(ctxt->sys_regs[AFSR1_EL2], SYS_AFSR1); >> + write_sysreg_el1(ctxt->sys_regs[FAR_EL2], SYS_FAR); >> + write_sysreg(ctxt->sys_regs[SP_EL2], sp_el1); >> + write_sysreg_el1(ctxt->sys_regs[ELR_EL2], SYS_ELR); >> + >> + val = __fixup_spsr_el2_write(ctxt, ctxt->sys_regs[SPSR_EL2]); >> + write_sysreg_el1(val, SYS_SPSR); >> +} >> + >> +static void __hyp_text __sysreg_restore_vel1_state(struct kvm_cpu_context *ctxt) >> +{ >> + u64 mpidr; >> + >> + if (has_vhe()) { >> + struct kvm_vcpu *vcpu; >> + >> + /* >> + * Warning: this hack only works on VHE, because we only >> + * call this with the *guest* context, which is part of >> + * struct kvm_vcpu. On a host context, you'd get pure junk. >> + */ >> + vcpu = container_of(ctxt, struct kvm_vcpu, arch.ctxt); > > This seems very fragile, just to find out whether the guest has hyp > capabilities. It would be at least nice to make sure this is indeed a > guest context. Oh come on! It is such a nice hack! ;-) I distinctly remember Christoffer being >that< close to vomiting when he saw that first. More seriously, we know what the context is by construction. > The *clean* way to do it could be to have a pointer to kvm_vcpu in the > kvm_cpu_context which would be NULL for host contexts. Funny you mention that. We have the exact opposite (host context pointing to the running vcpu, and NULL in the guest context). Maybe we can come up with something that always point to the vcpu, assuming nothing yet checks for NULL to identify a guest context! ;-) > Otherwise, I'm under the impression that for a host context, > ctxt->sys_reg[HCR_EL2] == 0 and that this would also be true for a guest > without nested virt capability. Could we use something like that here? Urgh. I think I'd prefer the above suggestion. Or even my hack. Thanks, M.
On 6/21/19 10:38 AM, Marc Zyngier wrote: > From: Andre Przywara <andre.przywara@arm.com> > > Whenever we need to restore the guest's system registers to the CPU, we > now need to take care of the EL2 system registers as well. Most of them > are accessed via traps only, but some have an immediate effect and also > a guest running in VHE mode would expect them to be accessible via their > EL1 encoding, which we do not trap. > > Split the current __sysreg_{save,restore}_el1_state() functions into > handling common sysregs, then differentiate between the guest running in > vEL2 and vEL1. > > For vEL2 we write the virtual EL2 registers with an identical format directly > into their EL1 counterpart, and translate the few registers that have a > different format for the same effect on the execution when running a > non-VHE guest guest hypervisor. > > [ Commit message reworked and many bug fixes applied by Marc Zyngier > and Christoffer Dall. ] > > Signed-off-by: Andre Przywara <andre.przywara@arm.com> > Signed-off-by: Marc Zyngier <marc.zyngier@arm.com> > Signed-off-by: Christoffer Dall <christoffer.dall@arm.com> > --- > arch/arm64/kvm/hyp/sysreg-sr.c | 160 +++++++++++++++++++++++++++++++-- > 1 file changed, 153 insertions(+), 7 deletions(-) > > diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c > index 62866a68e852..2abb9c3ff24f 100644 > --- a/arch/arm64/kvm/hyp/sysreg-sr.c > +++ b/arch/arm64/kvm/hyp/sysreg-sr.c > @@ -22,6 +22,7 @@ > #include <asm/kvm_asm.h> > #include <asm/kvm_emulate.h> > #include <asm/kvm_hyp.h> > +#include <asm/kvm_nested.h> > > /* > * Non-VHE: Both host and guest must save everything. 
> @@ -51,11 +52,9 @@ static void __hyp_text __sysreg_save_user_state(struct kvm_cpu_context *ctxt) > ctxt->sys_regs[TPIDRRO_EL0] = read_sysreg(tpidrro_el0); > } > > -static void __hyp_text __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) > +static void __hyp_text __sysreg_save_vel1_state(struct kvm_cpu_context *ctxt) > { > - ctxt->sys_regs[CSSELR_EL1] = read_sysreg(csselr_el1); > ctxt->sys_regs[SCTLR_EL1] = read_sysreg_el1(SYS_SCTLR); > - ctxt->sys_regs[ACTLR_EL1] = read_sysreg(actlr_el1); > ctxt->sys_regs[CPACR_EL1] = read_sysreg_el1(SYS_CPACR); > ctxt->sys_regs[TTBR0_EL1] = read_sysreg_el1(SYS_TTBR0); > ctxt->sys_regs[TTBR1_EL1] = read_sysreg_el1(SYS_TTBR1); > @@ -69,14 +68,58 @@ static void __hyp_text __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) > ctxt->sys_regs[CONTEXTIDR_EL1] = read_sysreg_el1(SYS_CONTEXTIDR); > ctxt->sys_regs[AMAIR_EL1] = read_sysreg_el1(SYS_AMAIR); > ctxt->sys_regs[CNTKCTL_EL1] = read_sysreg_el1(SYS_CNTKCTL); > - ctxt->sys_regs[PAR_EL1] = read_sysreg(par_el1); > - ctxt->sys_regs[TPIDR_EL1] = read_sysreg(tpidr_el1); > > ctxt->gp_regs.sp_el1 = read_sysreg(sp_el1); > ctxt->gp_regs.elr_el1 = read_sysreg_el1(SYS_ELR); > ctxt->gp_regs.spsr[KVM_SPSR_EL1]= read_sysreg_el1(SYS_SPSR); > } > > +static void __sysreg_save_vel2_state(struct kvm_cpu_context *ctxt) > +{ > + ctxt->sys_regs[ESR_EL2] = read_sysreg_el1(SYS_ESR); > + ctxt->sys_regs[AFSR0_EL2] = read_sysreg_el1(SYS_AFSR0); > + ctxt->sys_regs[AFSR1_EL2] = read_sysreg_el1(SYS_AFSR1); > + ctxt->sys_regs[FAR_EL2] = read_sysreg_el1(SYS_FAR); > + ctxt->sys_regs[MAIR_EL2] = read_sysreg_el1(SYS_MAIR); > + ctxt->sys_regs[VBAR_EL2] = read_sysreg_el1(SYS_VBAR); > + ctxt->sys_regs[CONTEXTIDR_EL2] = read_sysreg_el1(SYS_CONTEXTIDR); > + ctxt->sys_regs[AMAIR_EL2] = read_sysreg_el1(SYS_AMAIR); > + > + /* > + * In VHE mode those registers are compatible between EL1 and EL2, > + * and the guest uses the _EL1 versions on the CPU naturally. > + * So we save them into their _EL2 versions here. 
> + * For nVHE mode we trap accesses to those registers, so our > + * _EL2 copy in sys_regs[] is always up-to-date and we don't need > + * to save anything here. > + */ > + if (__vcpu_el2_e2h_is_set(ctxt)) { > + ctxt->sys_regs[SCTLR_EL2] = read_sysreg_el1(SYS_SCTLR); > + ctxt->sys_regs[CPTR_EL2] = read_sysreg_el1(SYS_CPACR); > + ctxt->sys_regs[TTBR0_EL2] = read_sysreg_el1(SYS_TTBR0); > + ctxt->sys_regs[TTBR1_EL2] = read_sysreg_el1(SYS_TTBR1); > + ctxt->sys_regs[TCR_EL2] = read_sysreg_el1(SYS_TCR); > + ctxt->sys_regs[CNTHCTL_EL2] = read_sysreg_el1(SYS_CNTKCTL); > + } This can break guests that run with VHE on, then disable it. I stumbled into this while working on kvm-unit-tests, which uses TTBR0 for the translation tables. Let's consider the following scenario: 1. Guest sets HCR_EL2.E2H 2. Guest programs translation tables in TTBR0_EL1, which should reflect in TTBR0_EL2. 3. Guest enabled MMU and does stuff. 4. Guest disables MMU and clears HCR_EL2.E2H 5. Guest turns MMU on. It doesn't change TTBR0_EL2, because it will use the same translation tables as when running with E2H set. 6. The vcpu gets scheduled out. E2H is not set, so the value that the guest programmed in hardware TTBR0_EL1 won't be copied to virtual TTBR0_EL2. 7. The vcpu gets scheduled back in. KVM will write the reset value for virtual TTBR0_EL2 (which is 0x0). 8. The guest hangs. I think this is actually a symptom of a deeper issue. When E2H is cleared, the values that the guest wrote to the EL1 registers aren't immediately reflected in the virtual EL2 registers, as it happens on real hardware. Instead, some of the hardware values from the EL1 registers are copied to the corresponding EL2 registers on the next vcpu_put, which happens at a later time. 
I am thinking that something like this will fix the issues (it did fix disabling VHE in kvm-unit-tests): diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 1d896113f1f8..f2b5a39762d0 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -333,7 +333,8 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) * to reverse-translate virtual EL2 system registers for a * non-VHE guest hypervisor. */ - __vcpu_sys_reg(vcpu, reg) = val; + if (reg != HCR_EL2) + __vcpu_sys_reg(vcpu, reg) = val; switch (reg) { case ELR_EL2: @@ -370,7 +371,17 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) return; memory_write: - __vcpu_sys_reg(vcpu, reg) = val; + if (reg == HCR_EL2 && vcpu_el2_e2h_is_set(vcpu) && !(val & HCR_E2H)) { + preempt_disable(); + kvm_arch_vcpu_put(vcpu); + + __vcpu_sys_reg(vcpu, reg) = val; + + kvm_arch_vcpu_load(vcpu, smp_processor_id()); + preempt_enable(); + } else { + __vcpu_sys_reg(vcpu, reg) = val; + } } /* 3 bits per cache level, as per CLIDR, but non-existent caches always 0 */ I don't think there's any need to convert EL1 registers to their non-vhe EL2 format because of how RES0/RES1 is defined in the architecture glossary (ARM DDI 0487E.a, page 7893 for RES0 and 7894 for RES1): "If a bit is RES0 only in some contexts: A read of the bit must return the value last successfully written to the bit, by either a direct or an indirect write, regardless of the use of the register when the bit was written [..] While the use of the register is such that the bit is described as RES0, the value of the bit must have no effect on the operation of the PE, other than determining the value read back from that bit, unless this Manual explicitly defines additional properties for the bit" We have the translate functions that should take care of converting the non-vhe EL2 format to the hardware EL1 format. 
As an aside, the diff looks weird because the vcpu_write_sys_reg is very complex, there are a LOT of exit points from the function, and the register value is written twice for some registers. I think it's worth considering making the function simpler, maybe splitting it into two separate functions, one for EL2 registers, one for regular registers. Here's the kvm-unit-tests diff that I used to spot the bug. It's very far from being correct, but the test is able to finish with the fix (it hangs otherwise). You can apply it on top of 2130fd4154ad ("tscdeadline_latency: Check condition first before loop"): diff --git a/arm/cstart64.S b/arm/cstart64.S index b0e8baa1a23a..01357b3b116b 100644 --- a/arm/cstart64.S +++ b/arm/cstart64.S @@ -51,6 +51,18 @@ start: b 1b 1: + mrs x4, CurrentEL + cmp x4, CurrentEL_EL2 + b.ne 1f + mrs x4, mpidr_el1 + msr vmpidr_el2, x4 + mrs x4, midr_el1 + msr vpidr_el2, x4 + ldr x4, =(HCR_EL2_TGE | HCR_EL2_E2H) + msr hcr_el2, x4 + isb + +1: /* set up stack */ mov x4, #1 msr spsel, x4 @@ -101,6 +113,18 @@ get_mmu_off: .globl secondary_entry secondary_entry: + mrs x0, CurrentEL + cmp x0, CurrentEL_EL2 + b.ne 1f + mrs x0, mpidr_el1 + msr vmpidr_el2, x0 + mrs x0, midr_el1 + msr vpidr_el2, x0 + ldr x0, =(HCR_EL2_TGE | HCR_EL2_E2H) + msr hcr_el2, x0 + isb + +1: /* Enable FP/ASIMD */ mov x0, #(3 << 20) msr cpacr_el1, x0 @@ -194,6 +218,33 @@ asm_mmu_enable: ret +asm_mmu_enable_hyp: + ic iallu + tlbi alle2is + dsb ish + + /* TCR */ + ldr x1, =TCR_EL2_RES1 | \ + TCR_T0SZ(VA_BITS) | \ + TCR_TG0_64K | \ + TCR_IRGN0_WBWA | TCR_ORGN0_WBWA | \ + TCR_SH0_IS + mrs x2, id_aa64mmfr0_el1 + bfi x1, x2, #TCR_EL2_PS_SHIFT, #3 + msr tcr_el2, x1 + + /* Same MAIR and TTBR0 as in VHE mode */ + + /* SCTLR */ + ldr x1, =SCTLR_EL2_RES1 | \ + SCTLR_EL2_C | \ + SCTLR_EL2_I | \ + SCTLR_EL2_M + msr sctlr_el2, x1 + isb + + ret + .globl asm_mmu_disable asm_mmu_disable: mrs x0, sctlr_el1 @@ -202,6 +253,18 @@ asm_mmu_disable: isb ret +.globl asm_disable_vhe +asm_disable_vhe: + str 
x30, [sp, #-16]! + + bl asm_mmu_disable + msr hcr_el2, xzr + isb + bl asm_mmu_enable_hyp + + ldr x30, [sp], #16 + ret + /* * Vectors * Adapted from arch/arm64/kernel/entry.S diff --git a/arm/selftest.c b/arm/selftest.c index 28a17f7a7531..68a18036221b 100644 --- a/arm/selftest.c +++ b/arm/selftest.c @@ -287,6 +287,12 @@ static void user_psci_system_off(struct pt_regs *regs, unsigned int esr) { __user_psci_system_off(); } + +extern void asm_disable_vhe(void); +static void check_el2(void) +{ + asm_disable_vhe(); +} #endif static void check_vectors(void *arg __unused) @@ -369,6 +375,10 @@ int main(int argc, char **argv) report("PSCI version", psci_check()); on_cpus(cpu_report, NULL); + } else if (strcmp(argv[1], "el2") == 0) { + + check_el2(); + } else { printf("Unknown subtest\n"); abort(); diff --git a/lib/arm/psci.c b/lib/arm/psci.c index c3d399064ae3..5ef1c0386ce1 100644 --- a/lib/arm/psci.c +++ b/lib/arm/psci.c @@ -16,7 +16,7 @@ int psci_invoke(unsigned long function_id, unsigned long arg0, unsigned long arg1, unsigned long arg2) { asm volatile( - "hvc #0" + "smc #0" : "+r" (function_id) : "r" (arg0), "r" (arg1), "r" (arg2)); return function_id; diff --git a/lib/arm64/asm/pgtable-hwdef.h b/lib/arm64/asm/pgtable-hwdef.h index 045a3ce12645..6b80e34dda0c 100644 --- a/lib/arm64/asm/pgtable-hwdef.h +++ b/lib/arm64/asm/pgtable-hwdef.h @@ -95,18 +95,42 @@ /* * TCR flags. 
*/ -#define TCR_TxSZ(x) (((UL(64) - (x)) << 16) | ((UL(64) - (x)) << 0)) -#define TCR_IRGN_NC ((UL(0) << 8) | (UL(0) << 24)) -#define TCR_IRGN_WBWA ((UL(1) << 8) | (UL(1) << 24)) -#define TCR_IRGN_WT ((UL(2) << 8) | (UL(2) << 24)) -#define TCR_IRGN_WBnWA ((UL(3) << 8) | (UL(3) << 24)) -#define TCR_IRGN_MASK ((UL(3) << 8) | (UL(3) << 24)) -#define TCR_ORGN_NC ((UL(0) << 10) | (UL(0) << 26)) -#define TCR_ORGN_WBWA ((UL(1) << 10) | (UL(1) << 26)) -#define TCR_ORGN_WT ((UL(2) << 10) | (UL(2) << 26)) -#define TCR_ORGN_WBnWA ((UL(3) << 10) | (UL(3) << 26)) -#define TCR_ORGN_MASK ((UL(3) << 10) | (UL(3) << 26)) -#define TCR_SHARED ((UL(3) << 12) | (UL(3) << 28)) +#define TCR_T0SZ(x) ((UL(64) - (x)) << 0) +#define TCR_T1SZ(x) ((UL(64) - (x)) << 16) +#define TCR_TxSZ(x) (TCR_T0SZ(x) | TCR_T1SZ(x)) +#define TCR_IRGN0_NC (UL(0) << 8) +#define TCR_IRGN1_NC (UL(0) << 24) +#define TCR_IRGN_NC (TCR_IRGN0_NC | TCR_IRGN1_NC) +#define TCR_IRGN0_WBWA (UL(1) << 8) +#define TCR_IRGN1_WBWA (UL(1) << 24) +#define TCR_IRGN_WBWA (TCR_IRGN0_WBWA | TCR_IRGN1_WBWA) +#define TCR_IRGN0_WT (UL(2) << 8) +#define TCR_IRGN1_WT (UL(2) << 24) +#define TCR_IRGN_WT (TCR_IRGN0_WT | TCR_IRGN1_WT) +#define TCR_IRGN0_WBnWA (UL(3) << 8) +#define TCR_IRGN1_WBnWA (UL(3) << 24) +#define TCR_IRGN_WBnWA (TCR_IRGN0_WBnWA | TCR_IRGN1_WBnWA) +#define TCR_IRGN0_MASK (UL(3) << 8) +#define TCR_IRGN1_MASK (UL(3) << 24) +#define TCR_IRGN_MASK (TCR_IRGN0_MASK | TCR_IRGN1_MASK) +#define TCR_ORGN0_NC (UL(0) << 10) +#define TCR_ORGN1_NC (UL(0) << 26) +#define TCR_ORGN_NC (TCR_ORGN0_NC | TCR_ORGN1_NC) +#define TCR_ORGN0_WBWA (UL(1) << 10) +#define TCR_ORGN1_WBWA (UL(1) << 26) +#define TCR_ORGN_WBWA (TCR_ORGN0_WBWA | TCR_ORGN1_WBWA) +#define TCR_ORGN0_WT (UL(2) << 10) +#define TCR_ORGN1_WT (UL(2) << 26) +#define TCR_ORGN_WT (TCR_ORGN0_WT | TCR_ORGN1_WT) +#define TCR_ORGN0_WBnWA (UL(3) << 8) +#define TCR_ORGN1_WBnWA (UL(3) << 24) +#define TCR_ORGN_WBnWA (TCR_ORGN0_WBnWA | TCR_ORGN1_WBnWA) +#define TCR_ORGN0_MASK (UL(3) << 10) 
+#define TCR_ORGN1_MASK (UL(3) << 26) +#define TCR_ORGN_MASK (TCR_ORGN0_MASK | TCR_ORGN1_MASK) +#define TCR_SH0_IS (UL(3) << 12) +#define TCR_SH1_IS (UL(3) << 28) +#define TCR_SHARED (TCR_SH0_IS | TCR_SH1_IS) #define TCR_TG0_4K (UL(0) << 14) #define TCR_TG0_64K (UL(1) << 14) #define TCR_TG0_16K (UL(2) << 14) @@ -116,6 +140,9 @@ #define TCR_ASID16 (UL(1) << 36) #define TCR_TBI0 (UL(1) << 37) +#define TCR_EL2_RES1 ((UL(1) << 31) | (UL(1) << 23)) +#define TCR_EL2_PS_SHIFT 16 + /* * Memory types available. */ diff --git a/lib/arm64/asm/processor.h b/lib/arm64/asm/processor.h index 1d9223f728a5..b2136acda743 100644 --- a/lib/arm64/asm/processor.h +++ b/lib/arm64/asm/processor.h @@ -16,6 +16,16 @@ #define SCTLR_EL1_A (1 << 1) #define SCTLR_EL1_M (1 << 0) +#define HCR_EL2_TGE (1 << 27) +#define HCR_EL2_E2H (UL(1) << 34) + +#define SCTLR_EL2_RES1 ((UL(3) << 28) | (UL(3) << 22) | \ + (UL(1) << 18) | (UL(1) << 16) | \ + (UL(1) << 11) | (UL(3) << 4)) +#define SCTLR_EL2_I SCTLR_EL1_I +#define SCTLR_EL2_C SCTLR_EL1_C +#define SCTLR_EL2_M SCTLR_EL1_M + #ifndef __ASSEMBLY__ #include <asm/ptrace.h> #include <asm/esr.h> To run it: lkvm run -f selftest.flat -c 1 -m 128 -p el2 --nested --irqchip gicv3 --console serial Thanks, Alex
diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c index 62866a68e852..2abb9c3ff24f 100644 --- a/arch/arm64/kvm/hyp/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/sysreg-sr.c @@ -22,6 +22,7 @@ #include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> #include <asm/kvm_hyp.h> +#include <asm/kvm_nested.h> /* * Non-VHE: Both host and guest must save everything. @@ -51,11 +52,9 @@ static void __hyp_text __sysreg_save_user_state(struct kvm_cpu_context *ctxt) ctxt->sys_regs[TPIDRRO_EL0] = read_sysreg(tpidrro_el0); } -static void __hyp_text __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) +static void __hyp_text __sysreg_save_vel1_state(struct kvm_cpu_context *ctxt) { - ctxt->sys_regs[CSSELR_EL1] = read_sysreg(csselr_el1); ctxt->sys_regs[SCTLR_EL1] = read_sysreg_el1(SYS_SCTLR); - ctxt->sys_regs[ACTLR_EL1] = read_sysreg(actlr_el1); ctxt->sys_regs[CPACR_EL1] = read_sysreg_el1(SYS_CPACR); ctxt->sys_regs[TTBR0_EL1] = read_sysreg_el1(SYS_TTBR0); ctxt->sys_regs[TTBR1_EL1] = read_sysreg_el1(SYS_TTBR1); @@ -69,14 +68,58 @@ static void __hyp_text __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) ctxt->sys_regs[CONTEXTIDR_EL1] = read_sysreg_el1(SYS_CONTEXTIDR); ctxt->sys_regs[AMAIR_EL1] = read_sysreg_el1(SYS_AMAIR); ctxt->sys_regs[CNTKCTL_EL1] = read_sysreg_el1(SYS_CNTKCTL); - ctxt->sys_regs[PAR_EL1] = read_sysreg(par_el1); - ctxt->sys_regs[TPIDR_EL1] = read_sysreg(tpidr_el1); ctxt->gp_regs.sp_el1 = read_sysreg(sp_el1); ctxt->gp_regs.elr_el1 = read_sysreg_el1(SYS_ELR); ctxt->gp_regs.spsr[KVM_SPSR_EL1]= read_sysreg_el1(SYS_SPSR); } +static void __sysreg_save_vel2_state(struct kvm_cpu_context *ctxt) +{ + ctxt->sys_regs[ESR_EL2] = read_sysreg_el1(SYS_ESR); + ctxt->sys_regs[AFSR0_EL2] = read_sysreg_el1(SYS_AFSR0); + ctxt->sys_regs[AFSR1_EL2] = read_sysreg_el1(SYS_AFSR1); + ctxt->sys_regs[FAR_EL2] = read_sysreg_el1(SYS_FAR); + ctxt->sys_regs[MAIR_EL2] = read_sysreg_el1(SYS_MAIR); + ctxt->sys_regs[VBAR_EL2] = read_sysreg_el1(SYS_VBAR); + 
ctxt->sys_regs[CONTEXTIDR_EL2] = read_sysreg_el1(SYS_CONTEXTIDR); + ctxt->sys_regs[AMAIR_EL2] = read_sysreg_el1(SYS_AMAIR); + + /* + * In VHE mode those registers are compatible between EL1 and EL2, + * and the guest uses the _EL1 versions on the CPU naturally. + * So we save them into their _EL2 versions here. + * For nVHE mode we trap accesses to those registers, so our + * _EL2 copy in sys_regs[] is always up-to-date and we don't need + * to save anything here. + */ + if (__vcpu_el2_e2h_is_set(ctxt)) { + ctxt->sys_regs[SCTLR_EL2] = read_sysreg_el1(SYS_SCTLR); + ctxt->sys_regs[CPTR_EL2] = read_sysreg_el1(SYS_CPACR); + ctxt->sys_regs[TTBR0_EL2] = read_sysreg_el1(SYS_TTBR0); + ctxt->sys_regs[TTBR1_EL2] = read_sysreg_el1(SYS_TTBR1); + ctxt->sys_regs[TCR_EL2] = read_sysreg_el1(SYS_TCR); + ctxt->sys_regs[CNTHCTL_EL2] = read_sysreg_el1(SYS_CNTKCTL); + } + + ctxt->sys_regs[SP_EL2] = read_sysreg(sp_el1); + ctxt->sys_regs[ELR_EL2] = read_sysreg_el1(SYS_ELR); + ctxt->sys_regs[SPSR_EL2] = __fixup_spsr_el2_read(ctxt, read_sysreg_el1(SYS_SPSR)); +} + +static void __hyp_text __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) +{ + ctxt->sys_regs[CSSELR_EL1] = read_sysreg(csselr_el1); + ctxt->sys_regs[ACTLR_EL1] = read_sysreg(actlr_el1); + ctxt->sys_regs[PAR_EL1] = read_sysreg(par_el1); + ctxt->sys_regs[TPIDR_EL1] = read_sysreg(tpidr_el1); + + if (unlikely(__is_hyp_ctxt(ctxt))) + __sysreg_save_vel2_state(ctxt); + else + __sysreg_save_vel1_state(ctxt); +} + static void __hyp_text __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt) { ctxt->gp_regs.regs.pc = read_sysreg_el2(SYS_ELR); @@ -124,10 +167,91 @@ static void __hyp_text __sysreg_restore_user_state(struct kvm_cpu_context *ctxt) write_sysreg(ctxt->sys_regs[TPIDRRO_EL0], tpidrro_el0); } -static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt) +static void __sysreg_restore_vel2_state(struct kvm_cpu_context *ctxt) { + u64 val; + + write_sysreg(read_cpuid_id(), vpidr_el2); 
write_sysreg(ctxt->sys_regs[MPIDR_EL1], vmpidr_el2); - write_sysreg(ctxt->sys_regs[CSSELR_EL1], csselr_el1); + write_sysreg_el1(ctxt->sys_regs[MAIR_EL2], SYS_MAIR); + write_sysreg_el1(ctxt->sys_regs[VBAR_EL2], SYS_VBAR); + write_sysreg_el1(ctxt->sys_regs[CONTEXTIDR_EL2],SYS_CONTEXTIDR); + write_sysreg_el1(ctxt->sys_regs[AMAIR_EL2], SYS_AMAIR); + + if (__vcpu_el2_e2h_is_set(ctxt)) { + /* + * In VHE mode those registers are compatible between + * EL1 and EL2. + */ + write_sysreg_el1(ctxt->sys_regs[SCTLR_EL2], SYS_SCTLR); + write_sysreg_el1(ctxt->sys_regs[CPTR_EL2], SYS_CPACR); + write_sysreg_el1(ctxt->sys_regs[TTBR0_EL2], SYS_TTBR0); + write_sysreg_el1(ctxt->sys_regs[TTBR1_EL2], SYS_TTBR1); + write_sysreg_el1(ctxt->sys_regs[TCR_EL2], SYS_TCR); + write_sysreg_el1(ctxt->sys_regs[CNTHCTL_EL2], SYS_CNTKCTL); + } else { + write_sysreg_el1(translate_sctlr(ctxt->sys_regs[SCTLR_EL2]), + SYS_SCTLR); + write_sysreg_el1(translate_cptr(ctxt->sys_regs[CPTR_EL2]), + SYS_CPACR); + write_sysreg_el1(translate_ttbr0(ctxt->sys_regs[TTBR0_EL2]), + SYS_TTBR0); + write_sysreg_el1(translate_tcr(ctxt->sys_regs[TCR_EL2]), + SYS_TCR); + write_sysreg_el1(translate_cnthctl(ctxt->sys_regs[CNTHCTL_EL2]), + SYS_CNTKCTL); + } + + /* + * These registers can be modified behind our back by a fault + * taken inside vEL2. Restore them, always.
+ */ + write_sysreg_el1(ctxt->sys_regs[ESR_EL2], SYS_ESR); + write_sysreg_el1(ctxt->sys_regs[AFSR0_EL2], SYS_AFSR0); + write_sysreg_el1(ctxt->sys_regs[AFSR1_EL2], SYS_AFSR1); + write_sysreg_el1(ctxt->sys_regs[FAR_EL2], SYS_FAR); + write_sysreg(ctxt->sys_regs[SP_EL2], sp_el1); + write_sysreg_el1(ctxt->sys_regs[ELR_EL2], SYS_ELR); + + val = __fixup_spsr_el2_write(ctxt, ctxt->sys_regs[SPSR_EL2]); + write_sysreg_el1(val, SYS_SPSR); +} + +static void __hyp_text __sysreg_restore_vel1_state(struct kvm_cpu_context *ctxt) +{ + u64 mpidr; + + if (has_vhe()) { + struct kvm_vcpu *vcpu; + + /* + * Warning: this hack only works on VHE, because we only + * call this with the *guest* context, which is part of + * struct kvm_vcpu. On a host context, you'd get pure junk. + */ + vcpu = container_of(ctxt, struct kvm_vcpu, arch.ctxt); + + if (nested_virt_in_use(vcpu)) { + /* + * Only set VPIDR_EL2 for nested VMs, as this is the + * only time it changes. We'll restore the MIDR_EL1 + * view on put. + */ + write_sysreg(ctxt->sys_regs[VPIDR_EL2], vpidr_el2); + + /* + * As we're restoring a nested guest, set the value + * provided by the guest hypervisor. 
+ */ + mpidr = ctxt->sys_regs[VMPIDR_EL2]; + } else { + mpidr = ctxt->sys_regs[MPIDR_EL1]; + } + } else { + mpidr = ctxt->sys_regs[MPIDR_EL1]; + } + + write_sysreg(mpidr, vmpidr_el2); write_sysreg_el1(ctxt->sys_regs[SCTLR_EL1], SYS_SCTLR); write_sysreg(ctxt->sys_regs[ACTLR_EL1], actlr_el1); write_sysreg_el1(ctxt->sys_regs[CPACR_EL1], SYS_CPACR); @@ -151,6 +275,19 @@ static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt) write_sysreg_el1(ctxt->gp_regs.spsr[KVM_SPSR_EL1],SYS_SPSR); } +static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt) +{ + write_sysreg(ctxt->sys_regs[CSSELR_EL1], csselr_el1); + write_sysreg(ctxt->sys_regs[ACTLR_EL1], actlr_el1); + write_sysreg(ctxt->sys_regs[PAR_EL1], par_el1); + write_sysreg(ctxt->sys_regs[TPIDR_EL1], tpidr_el1); + + if (__is_hyp_ctxt(ctxt)) + __sysreg_restore_vel2_state(ctxt); + else + __sysreg_restore_vel1_state(ctxt); +} + static void __hyp_text __sysreg_restore_el2_return_state(struct kvm_cpu_context *ctxt) { @@ -307,6 +444,15 @@ void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu) /* Restore host user state */ __sysreg_restore_user_state(host_ctxt); + /* + * If leaving a nesting guest, restore MPIDR_EL1 default view. It is + * slightly ugly to do it here, but the alternative is to penalize + * all non-nesting guests by forcing this on every load. Instead, we + * choose to only penalize nesting VMs. + */ + if (nested_virt_in_use(vcpu)) + write_sysreg(read_cpuid_id(), vpidr_el2); + vcpu->arch.sysregs_loaded_on_cpu = false; }