diff mbox

KVM: PPC: Book3S HV: Fix bug causing host SLB to be restored incorrectly

Message ID 20170912040712.haizyajckxi76z25@oak.ozlabs.ibm.com (mailing list archive)
State New, archived
Headers show

Commit Message

Paul Mackerras Sept. 12, 2017, 4:07 a.m. UTC
Aneesh Kumar reported seeing host crashes when running recent kernels
on POWER8.  The symptom was an oops like this:

Unable to handle kernel paging request for data at address 0xf00000000786c620
Faulting instruction address: 0xc00000000030e1e4
Oops: Kernel access of bad area, sig: 11 [#1]
LE SMP NR_CPUS=2048 NUMA PowerNV
Modules linked in: powernv_op_panel
CPU: 24 PID: 6663 Comm: qemu-system-ppc Tainted: G        W 4.13.0-rc7-43932-gfc36c59 #2
task: c000000fdeadfe80 task.stack: c000000fdeb68000
NIP:  c00000000030e1e4 LR: c00000000030de6c CTR: c000000000103620
REGS: c000000fdeb6b450 TRAP: 0300   Tainted: G        W        (4.13.0-rc7-43932-gfc36c59)
MSR:  9000000000009033 <SF,HV,EE,ME,IR,DR,RI,LE>  CR: 24044428  XER: 20000000
CFAR: c00000000030e134 DAR: f00000000786c620 DSISR: 40000000 SOFTE: 0
GPR00: 0000000000000000 c000000fdeb6b6d0 c0000000010bd000 000000000000e1b0
GPR04: c00000000115e168 c000001fffa6e4b0 c00000000115d000 c000001e1b180386
GPR08: f000000000000000 c000000f9a8913e0 f00000000786c600 00007fff587d0000
GPR12: c000000fdeb68000 c00000000fb0f000 0000000000000001 00007fff587cffff
GPR16: 0000000000000000 c000000000000000 00000000003fffff c000000fdebfe1f8
GPR20: 0000000000000004 c000000fdeb6b8a8 0000000000000001 0008000000000040
GPR24: 07000000000000c0 00007fff587cffff c000000fdec20bf8 00007fff587d0000
GPR28: c000000fdeca9ac0 00007fff587d0000 00007fff587c0000 00007fff587d0000
NIP [c00000000030e1e4] __get_user_pages_fast+0x434/0x1070
LR [c00000000030de6c] __get_user_pages_fast+0xbc/0x1070
Call Trace:
[c000000fdeb6b6d0] [c00000000139dab8] lock_classes+0x0/0x35fe50 (unreliable)
[c000000fdeb6b7e0] [c00000000030ef38] get_user_pages_fast+0xf8/0x120
[c000000fdeb6b830] [c000000000112318] kvmppc_book3s_hv_page_fault+0x308/0xf30
[c000000fdeb6b960] [c00000000010e10c] kvmppc_vcpu_run_hv+0xfdc/0x1f00
[c000000fdeb6bb20] [c0000000000e915c] kvmppc_vcpu_run+0x2c/0x40
[c000000fdeb6bb40] [c0000000000e5650] kvm_arch_vcpu_ioctl_run+0x110/0x300
[c000000fdeb6bbe0] [c0000000000d6468] kvm_vcpu_ioctl+0x528/0x900
[c000000fdeb6bd40] [c0000000003bc04c] do_vfs_ioctl+0xcc/0x950
[c000000fdeb6bde0] [c0000000003bc930] SyS_ioctl+0x60/0x100
[c000000fdeb6be30] [c00000000000b96c] system_call+0x58/0x6c
Instruction dump:
7ca81a14 2fa50000 41de0010 7cc8182a 68c60002 78c6ffe2 0b060000 3cc2000a
794a3664 390610d8 e9080000 7d485214 <e90a0020> 7d435378 790507e1 408202f0
---[ end trace fad4a342d0414aa2 ]---

It turns out that what has happened is that the SLB entry for the
vmmemap region hasn't been reloaded on exit from a guest, and it has
the wrong page size.  Then, when the host next accesses the vmemmap
region, it gets a page fault.

Commit a25bd72badfa ("powerpc/mm/radix: Workaround prefetch issue with
KVM", 2017-07-24) modified the guest exit code so that it now only clears
out the SLB for hash guest.  The code tests the radix flag and puts the
result in a non-volatile CR field, CR2, and later branches based on CR2.

Unfortunately, the kvmppc_save_tm function, which gets called between
those two points, modifies all the user-visible registers in the case
where the guest was in transactional or suspended state, except for a
few which it restores (namely r1, r2, r9 and r13).  Thus the hash/radix indication in CR2 gets corrupted.

This fixes the problem by re-doing the comparison just before the
result is needed.  For good measure, this also adds comments next to
the call sites of kvmppc_save_tm and kvmppc_restore_tm pointing out
that non-volatile register state will be lost.

Cc: stable@vger.kernel.org # v4.13
Fixes: a25bd72badfa ("powerpc/mm/radix: Workaround prefetch issue with KVM")
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

Comments

Aneesh Kumar K.V Sept. 12, 2017, 5:56 a.m. UTC | #1
Paul Mackerras <paulus@ozlabs.org> writes:

> Aneesh Kumar reported seeing host crashes when running recent kernels
> on POWER8.  The symptom was an oops like this:
>
> Unable to handle kernel paging request for data at address 0xf00000000786c620
> Faulting instruction address: 0xc00000000030e1e4
> Oops: Kernel access of bad area, sig: 11 [#1]
> LE SMP NR_CPUS=2048 NUMA PowerNV
> Modules linked in: powernv_op_panel
> CPU: 24 PID: 6663 Comm: qemu-system-ppc Tainted: G        W 4.13.0-rc7-43932-gfc36c59 #2
> task: c000000fdeadfe80 task.stack: c000000fdeb68000
> NIP:  c00000000030e1e4 LR: c00000000030de6c CTR: c000000000103620
> REGS: c000000fdeb6b450 TRAP: 0300   Tainted: G        W        (4.13.0-rc7-43932-gfc36c59)
> MSR:  9000000000009033 <SF,HV,EE,ME,IR,DR,RI,LE>  CR: 24044428  XER: 20000000
> CFAR: c00000000030e134 DAR: f00000000786c620 DSISR: 40000000 SOFTE: 0
> GPR00: 0000000000000000 c000000fdeb6b6d0 c0000000010bd000 000000000000e1b0
> GPR04: c00000000115e168 c000001fffa6e4b0 c00000000115d000 c000001e1b180386
> GPR08: f000000000000000 c000000f9a8913e0 f00000000786c600 00007fff587d0000
> GPR12: c000000fdeb68000 c00000000fb0f000 0000000000000001 00007fff587cffff
> GPR16: 0000000000000000 c000000000000000 00000000003fffff c000000fdebfe1f8
> GPR20: 0000000000000004 c000000fdeb6b8a8 0000000000000001 0008000000000040
> GPR24: 07000000000000c0 00007fff587cffff c000000fdec20bf8 00007fff587d0000
> GPR28: c000000fdeca9ac0 00007fff587d0000 00007fff587c0000 00007fff587d0000
> NIP [c00000000030e1e4] __get_user_pages_fast+0x434/0x1070
> LR [c00000000030de6c] __get_user_pages_fast+0xbc/0x1070
> Call Trace:
> [c000000fdeb6b6d0] [c00000000139dab8] lock_classes+0x0/0x35fe50 (unreliable)
> [c000000fdeb6b7e0] [c00000000030ef38] get_user_pages_fast+0xf8/0x120
> [c000000fdeb6b830] [c000000000112318] kvmppc_book3s_hv_page_fault+0x308/0xf30
> [c000000fdeb6b960] [c00000000010e10c] kvmppc_vcpu_run_hv+0xfdc/0x1f00
> [c000000fdeb6bb20] [c0000000000e915c] kvmppc_vcpu_run+0x2c/0x40
> [c000000fdeb6bb40] [c0000000000e5650] kvm_arch_vcpu_ioctl_run+0x110/0x300
> [c000000fdeb6bbe0] [c0000000000d6468] kvm_vcpu_ioctl+0x528/0x900
> [c000000fdeb6bd40] [c0000000003bc04c] do_vfs_ioctl+0xcc/0x950
> [c000000fdeb6bde0] [c0000000003bc930] SyS_ioctl+0x60/0x100
> [c000000fdeb6be30] [c00000000000b96c] system_call+0x58/0x6c
> Instruction dump:
> 7ca81a14 2fa50000 41de0010 7cc8182a 68c60002 78c6ffe2 0b060000 3cc2000a
> 794a3664 390610d8 e9080000 7d485214 <e90a0020> 7d435378 790507e1 408202f0
> ---[ end trace fad4a342d0414aa2 ]---
>
> It turns out that what has happened is that the SLB entry for the
> vmmemap region hasn't been reloaded on exit from a guest, and it has
> the wrong page size.  Then, when the host next accesses the vmemmap
> region, it gets a page fault.
>
> Commit a25bd72badfa ("powerpc/mm/radix: Workaround prefetch issue with
> KVM", 2017-07-24) modified the guest exit code so that it now only clears
> out the SLB for hash guest.  The code tests the radix flag and puts the
> result in a non-volatile CR field, CR2, and later branches based on CR2.
>
> Unfortunately, the kvmppc_save_tm function, which gets called between
> those two points, modifies all the user-visible registers in the case
> where the guest was in transactional or suspended state, except for a
> few which it restores (namely r1, r2, r9 and r13).  Thus the hash/radix indication in CR2 gets corrupted.
>
> This fixes the problem by re-doing the comparison just before the
> result is needed.  For good measure, this also adds comments next to
> the call sites of kvmppc_save_tm and kvmppc_restore_tm pointing out
> that non-volatile register state will be lost.
>

Tested-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>

> Cc: stable@vger.kernel.org # v4.13
> Fixes: a25bd72badfa ("powerpc/mm/radix: Workaround prefetch issue with KVM")
> Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
> ---
>  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 17 ++++++++++++++++-
>  1 file changed, 16 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index 663a4a861e7f..17936f82d3c7 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -771,6 +771,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
>
>  #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
>  BEGIN_FTR_SECTION
> +	/*
> +	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
> +	 */
>  	bl	kvmppc_restore_tm
>  END_FTR_SECTION_IFSET(CPU_FTR_TM)
>  #endif
> @@ -1630,6 +1633,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
>
>  #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
>  BEGIN_FTR_SECTION
> +	/*
> +	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
> +	 */
>  	bl	kvmppc_save_tm
>  END_FTR_SECTION_IFSET(CPU_FTR_TM)
>  #endif
> @@ -1749,7 +1755,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
>  	/*
>  	 * Are we running hash or radix ?
>  	 */
> -	beq	cr2,3f
> +	ld	r5, VCPU_KVM(r9)
> +	lbz	r0, KVM_RADIX(r5)
> +	cmpwi	cr2, r0, 0
> +	beq	cr2, 3f
>
>  	/* Radix: Handle the case where the guest used an illegal PID */
>  	LOAD_REG_ADDR(r4, mmu_base_pid)
> @@ -2466,6 +2475,9 @@ _GLOBAL(kvmppc_h_cede)		/* r3 = vcpu pointer, r11 = msr, r13 = paca */
>
>  #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
>  BEGIN_FTR_SECTION
> +	/*
> +	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
> +	 */
>  	ld	r9, HSTATE_KVM_VCPU(r13)
>  	bl	kvmppc_save_tm
>  END_FTR_SECTION_IFSET(CPU_FTR_TM)
> @@ -2578,6 +2590,9 @@ kvm_end_cede:
>
>  #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
>  BEGIN_FTR_SECTION
> +	/*
> +	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
> +	 */
>  	bl	kvmppc_restore_tm
>  END_FTR_SECTION_IFSET(CPU_FTR_TM)
>  #endif
> -- 
> 2.11.0
diff mbox

Patch

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 663a4a861e7f..17936f82d3c7 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -771,6 +771,9 @@  END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 BEGIN_FTR_SECTION
+	/*
+	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
+	 */
 	bl	kvmppc_restore_tm
 END_FTR_SECTION_IFSET(CPU_FTR_TM)
 #endif
@@ -1630,6 +1633,9 @@  END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 BEGIN_FTR_SECTION
+	/*
+	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
+	 */
 	bl	kvmppc_save_tm
 END_FTR_SECTION_IFSET(CPU_FTR_TM)
 #endif
@@ -1749,7 +1755,10 @@  END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 	/*
 	 * Are we running hash or radix ?
 	 */
-	beq	cr2,3f
+	ld	r5, VCPU_KVM(r9)
+	lbz	r0, KVM_RADIX(r5)
+	cmpwi	cr2, r0, 0
+	beq	cr2, 3f
 
 	/* Radix: Handle the case where the guest used an illegal PID */
 	LOAD_REG_ADDR(r4, mmu_base_pid)
@@ -2466,6 +2475,9 @@  _GLOBAL(kvmppc_h_cede)		/* r3 = vcpu pointer, r11 = msr, r13 = paca */
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 BEGIN_FTR_SECTION
+	/*
+	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
+	 */
 	ld	r9, HSTATE_KVM_VCPU(r13)
 	bl	kvmppc_save_tm
 END_FTR_SECTION_IFSET(CPU_FTR_TM)
@@ -2578,6 +2590,9 @@  kvm_end_cede:
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 BEGIN_FTR_SECTION
+	/*
+	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
+	 */
 	bl	kvmppc_restore_tm
 END_FTR_SECTION_IFSET(CPU_FTR_TM)
 #endif