diff mbox series

[-next,v2] riscv: add VMAP_STACK overflow detection

Message ID 20210621032855.130650-1-tongtiangen@huawei.com (mailing list archive)
State New, archived
Headers show
Series [-next,v2] riscv: add VMAP_STACK overflow detection | expand

Commit Message

Tong Tiangen June 21, 2021, 3:28 a.m. UTC
This patch adds stack overflow detection to riscv, usable when
CONFIG_VMAP_STACK=y.

Overflow is detected in the kernel exception entry path (kernel/entry.S). If a
kernel stack overflow is detected, the overflow handler is invoked on a
per-cpu overflow stack. This approach preserves GPRs and the original exception
information.

The overflow check is performed before any attempt is made to access the stack.
The principle of the detection is as follows: kernel stacks are aligned to
double their size, enabling overflow to be detected with a single bit test. For
example, a 16K stack is aligned to 32K, ensuring that bit 14 of the SP must be
zero. On an overflow (or underflow), this bit is flipped. Thus, overflow (of
less than the size of the stack) can be detected by testing whether this bit is
set.

This gives us a useful error message on stack overflow, as can be triggered with
the LKDTM overflow test:

[  388.053267] lkdtm: Performing direct entry EXHAUST_STACK
[  388.053663] lkdtm: Calling function with 1024 frame size to depth 32 ...
[  388.054016] lkdtm: loop 32/32 ...
[  388.054186] lkdtm: loop 31/32 ...
[  388.054491] lkdtm: loop 30/32 ...
[  388.054672] lkdtm: loop 29/32 ...
[  388.054859] lkdtm: loop 28/32 ...
[  388.055010] lkdtm: loop 27/32 ...
[  388.055163] lkdtm: loop 26/32 ...
[  388.055309] lkdtm: loop 25/32 ...
[  388.055481] lkdtm: loop 24/32 ...
[  388.055653] lkdtm: loop 23/32 ...
[  388.055837] lkdtm: loop 22/32 ...
[  388.056015] lkdtm: loop 21/32 ...
[  388.056188] lkdtm: loop 20/32 ...
[  388.058145] Insufficient stack space to handle exception!
[  388.058153] Task stack:     [0xffffffd014260000..0xffffffd014264000]
[  388.058160] Overflow stack: [0xffffffe1f8d2c220..0xffffffe1f8d2d220]
[  388.058168] CPU: 0 PID: 89 Comm: bash Not tainted 5.12.0-rc8-dirty #90
[  388.058175] Hardware name: riscv-virtio,qemu (DT)
[  388.058187] epc : number+0x32/0x2c0
[  388.058247]  ra : vsnprintf+0x2ae/0x3f0
[  388.058255] epc : ffffffe0002d38f6 ra : ffffffe0002d814e sp : ffffffd01425ffc0
[  388.058263]  gp : ffffffe0012e4010 tp : ffffffe08014da00 t0 : ffffffd0142606e8
[  388.058271]  t1 : 0000000000000000 t2 : 0000000000000000 s0 : ffffffd014260070
[  388.058303]  s1 : ffffffd014260158 a0 : ffffffd01426015e a1 : ffffffd014260158
[  388.058311]  a2 : 0000000000000013 a3 : ffff0a01ffffff10 a4 : ffffffe000c398e0
[  388.058319]  a5 : 511b02ec65f3e300 a6 : 0000000000a1749a a7 : 0000000000000000
[  388.058327]  s2 : ffffffff000000ff s3 : 00000000ffff0a01 s4 : ffffffe0012e50a8
[  388.058335]  s5 : 0000000000ffff0a s6 : ffffffe0012e50a8 s7 : ffffffe000da1cc0
[  388.058343]  s8 : ffffffffffffffff s9 : ffffffd0142602b0 s10: ffffffd0142602a8
[  388.058351]  s11: ffffffd01426015e t3 : 00000000000f0000 t4 : ffffffffffffffff
[  388.058359]  t5 : 000000000000002f t6 : ffffffd014260158
[  388.058366] status: 0000000000000100 badaddr: ffffffd01425fff8 cause: 000000000000000f
[  388.058374] Kernel panic - not syncing: Kernel stack overflow
[  388.058381] CPU: 0 PID: 89 Comm: bash Not tainted 5.12.0-rc8-dirty #90
[  388.058387] Hardware name: riscv-virtio,qemu (DT)
[  388.058393] Call Trace:
[  388.058400] [<ffffffe000004944>] walk_stackframe+0x0/0xce
[  388.058406] [<ffffffe0006f0b28>] dump_backtrace+0x38/0x46
[  388.058412] [<ffffffe0006f0b46>] show_stack+0x10/0x18
[  388.058418] [<ffffffe0006f3690>] dump_stack+0x74/0x8e
[  388.058424] [<ffffffe0006f0d52>] panic+0xfc/0x2b2
[  388.058430] [<ffffffe0006f0acc>] print_trace_address+0x0/0x24
[  388.058436] [<ffffffe0002d814e>] vsnprintf+0x2ae/0x3f0
[  388.058956] SMP: stopping secondary CPUs

Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>
---

v2:
* 1. Fix test failures when STRICT_KERNEL_RWX=n.
  2. Fix W=1 build warning.

 arch/riscv/Kconfig                      |   1 +
 arch/riscv/include/asm/asm-prototypes.h |   3 +
 arch/riscv/include/asm/thread_info.h    |  15 ++++
 arch/riscv/kernel/entry.S               | 108 ++++++++++++++++++++++++
 arch/riscv/kernel/traps.c               |  35 ++++++++
 arch/riscv/kernel/vmlinux.lds.S         |   2 +-
 6 files changed, 163 insertions(+), 1 deletion(-)

Comments

Palmer Dabbelt July 6, 2021, 8:24 p.m. UTC | #1
On Sun, 20 Jun 2021 20:28:55 PDT (-0700), tongtiangen@huawei.com wrote:
> This patch adds stack overflow detection to riscv, usable when
> CONFIG_VMAP_STACK=y.
>
> Overflow is detected in kernel exception entry(kernel/entry.S), if the kernel
> stack is overflow and been detected, the overflow handler is invoked on a
> per-cpu overflow stack. This approach preserves GPRs and the original exception
> information.
>
> The overflow detect is performed before any attempt is made to access the stack
> and the principle of stack overflow detection: kernel stacks are aligned to
> double their size, enabling overflow to be detected with a single bit test. For
> example, a 16K stack is aligned to 32K, ensuring that bit 14 of the SP must be
> zero. On an overflow (or underflow), this bit is flipped. Thus, overflow (of
> less than the size of the stack) can be detected by testing whether this bit is
> set.
>
> This gives us a useful error message on stack overflow, as can be trigger with
> the LKDTM overflow test:
>
> [  388.053267] lkdtm: Performing direct entry EXHAUST_STACK
> [  388.053663] lkdtm: Calling function with 1024 frame size to depth 32 ...
> [  388.054016] lkdtm: loop 32/32 ...
> [  388.054186] lkdtm: loop 31/32 ...
> [  388.054491] lkdtm: loop 30/32 ...
> [  388.054672] lkdtm: loop 29/32 ...
> [  388.054859] lkdtm: loop 28/32 ...
> [  388.055010] lkdtm: loop 27/32 ...
> [  388.055163] lkdtm: loop 26/32 ...
> [  388.055309] lkdtm: loop 25/32 ...
> [  388.055481] lkdtm: loop 24/32 ...
> [  388.055653] lkdtm: loop 23/32 ...
> [  388.055837] lkdtm: loop 22/32 ...
> [  388.056015] lkdtm: loop 21/32 ...
> [  388.056188] lkdtm: loop 20/32 ...
> [  388.058145] Insufficient stack space to handle exception!
> [  388.058153] Task stack:     [0xffffffd014260000..0xffffffd014264000]
> [  388.058160] Overflow stack: [0xffffffe1f8d2c220..0xffffffe1f8d2d220]
> [  388.058168] CPU: 0 PID: 89 Comm: bash Not tainted 5.12.0-rc8-dirty #90
> [  388.058175] Hardware name: riscv-virtio,qemu (DT)
> [  388.058187] epc : number+0x32/0x2c0
> [  388.058247]  ra : vsnprintf+0x2ae/0x3f0
> [  388.058255] epc : ffffffe0002d38f6 ra : ffffffe0002d814e sp : ffffffd01425ffc0
> [  388.058263]  gp : ffffffe0012e4010 tp : ffffffe08014da00 t0 : ffffffd0142606e8
> [  388.058271]  t1 : 0000000000000000 t2 : 0000000000000000 s0 : ffffffd014260070
> [  388.058303]  s1 : ffffffd014260158 a0 : ffffffd01426015e a1 : ffffffd014260158
> [  388.058311]  a2 : 0000000000000013 a3 : ffff0a01ffffff10 a4 : ffffffe000c398e0
> [  388.058319]  a5 : 511b02ec65f3e300 a6 : 0000000000a1749a a7 : 0000000000000000
> [  388.058327]  s2 : ffffffff000000ff s3 : 00000000ffff0a01 s4 : ffffffe0012e50a8
> [  388.058335]  s5 : 0000000000ffff0a s6 : ffffffe0012e50a8 s7 : ffffffe000da1cc0
> [  388.058343]  s8 : ffffffffffffffff s9 : ffffffd0142602b0 s10: ffffffd0142602a8
> [  388.058351]  s11: ffffffd01426015e t3 : 00000000000f0000 t4 : ffffffffffffffff
> [  388.058359]  t5 : 000000000000002f t6 : ffffffd014260158
> [  388.058366] status: 0000000000000100 badaddr: ffffffd01425fff8 cause: 000000000000000f
> [  388.058374] Kernel panic - not syncing: Kernel stack overflow
> [  388.058381] CPU: 0 PID: 89 Comm: bash Not tainted 5.12.0-rc8-dirty #90
> [  388.058387] Hardware name: riscv-virtio,qemu (DT)
> [  388.058393] Call Trace:
> [  388.058400] [<ffffffe000004944>] walk_stackframe+0x0/0xce
> [  388.058406] [<ffffffe0006f0b28>] dump_backtrace+0x38/0x46
> [  388.058412] [<ffffffe0006f0b46>] show_stack+0x10/0x18
> [  388.058418] [<ffffffe0006f3690>] dump_stack+0x74/0x8e
> [  388.058424] [<ffffffe0006f0d52>] panic+0xfc/0x2b2
> [  388.058430] [<ffffffe0006f0acc>] print_trace_address+0x0/0x24
> [  388.058436] [<ffffffe0002d814e>] vsnprintf+0x2ae/0x3f0
> [  388.058956] SMP: stopping secondary CPUs
>
> Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
> Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
> Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>
> ---
>
> v2:
> * 1. fix tests fail if STRICT_KERNEL_RWX=n.
>   2. fix W=1 build warning.
>
>  arch/riscv/Kconfig                      |   1 +
>  arch/riscv/include/asm/asm-prototypes.h |   3 +
>  arch/riscv/include/asm/thread_info.h    |  15 ++++
>  arch/riscv/kernel/entry.S               | 108 ++++++++++++++++++++++++
>  arch/riscv/kernel/traps.c               |  35 ++++++++
>  arch/riscv/kernel/vmlinux.lds.S         |   2 +-
>  6 files changed, 163 insertions(+), 1 deletion(-)
>
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index a97b03164080..c28284f45434 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -70,6 +70,7 @@ config RISCV
>  	select HAVE_ARCH_MMAP_RND_BITS if MMU
>  	select HAVE_ARCH_SECCOMP_FILTER
>  	select HAVE_ARCH_TRACEHOOK
> +	select HAVE_ARCH_VMAP_STACK if MMU && 64BIT
>  	select HAVE_ASM_MODVERSIONS
>  	select HAVE_CONTEXT_TRACKING
>  	select HAVE_DEBUG_KMEMLEAK
> diff --git a/arch/riscv/include/asm/asm-prototypes.h b/arch/riscv/include/asm/asm-prototypes.h
> index 2a652b0c987d..ef386fcf3939 100644
> --- a/arch/riscv/include/asm/asm-prototypes.h
> +++ b/arch/riscv/include/asm/asm-prototypes.h
> @@ -25,4 +25,7 @@ DECLARE_DO_ERROR_INFO(do_trap_ecall_s);
>  DECLARE_DO_ERROR_INFO(do_trap_ecall_m);
>  DECLARE_DO_ERROR_INFO(do_trap_break);
>
> +asmlinkage unsigned long get_overflow_stack(void);
> +asmlinkage void handle_bad_stack(struct pt_regs *regs);
> +
>  #endif /* _ASM_RISCV_PROTOTYPES_H */
> diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h
> index 0e549a3089b3..60da0dcacf14 100644
> --- a/arch/riscv/include/asm/thread_info.h
> +++ b/arch/riscv/include/asm/thread_info.h
> @@ -19,6 +19,21 @@
>  #endif
>  #define THREAD_SIZE		(PAGE_SIZE << THREAD_SIZE_ORDER)
>
> +/*
> + * By aligning VMAP'd stacks to 2 * THREAD_SIZE, we can detect overflow by
> + * checking sp & (1 << THREAD_SHIFT), which we can do cheaply in the entry
> + * assembly.
> + */
> +#ifdef CONFIG_VMAP_STACK
> +#define THREAD_ALIGN            (2 * THREAD_SIZE)
> +#else
> +#define THREAD_ALIGN            THREAD_SIZE
> +#endif
> +
> +#define THREAD_SHIFT            (PAGE_SHIFT + THREAD_SIZE_ORDER)
> +#define OVERFLOW_STACK_SIZE     SZ_4K
> +#define SHADOW_OVERFLOW_STACK_SIZE (1024)
> +
>  #ifndef __ASSEMBLY__
>
>  #include <asm/processor.h>
> diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
> index 80d5a9e017b0..98f502654edd 100644
> --- a/arch/riscv/kernel/entry.S
> +++ b/arch/riscv/kernel/entry.S
> @@ -30,6 +30,15 @@ ENTRY(handle_exception)
>  _restore_kernel_tpsp:
>  	csrr tp, CSR_SCRATCH
>  	REG_S sp, TASK_TI_KERNEL_SP(tp)
> +
> +#ifdef CONFIG_VMAP_STACK
> +	addi sp, sp, -(PT_SIZE_ON_STACK)
> +	srli sp, sp, THREAD_SHIFT
> +	andi sp, sp, 0x1
> +	bnez sp, handle_kernel_stack_overflow
> +	REG_L sp, TASK_TI_KERNEL_SP(tp)
> +#endif
> +
>  _save_context:
>  	REG_S sp, TASK_TI_USER_SP(tp)
>  	REG_L sp, TASK_TI_KERNEL_SP(tp)
> @@ -376,6 +385,105 @@ handle_syscall_trace_exit:
>  	call do_syscall_trace_exit
>  	j ret_from_exception
>
> +#ifdef CONFIG_VMAP_STACK
> +handle_kernel_stack_overflow:
> +	la sp, shadow_stack
> +	addi sp, sp, SHADOW_OVERFLOW_STACK_SIZE
> +
> +	//save caller register to shadow stack
> +	addi sp, sp, -(PT_SIZE_ON_STACK)
> +	REG_S x1,  PT_RA(sp)
> +	REG_S x5,  PT_T0(sp)
> +	REG_S x6,  PT_T1(sp)
> +	REG_S x7,  PT_T2(sp)
> +	REG_S x10, PT_A0(sp)
> +	REG_S x11, PT_A1(sp)
> +	REG_S x12, PT_A2(sp)
> +	REG_S x13, PT_A3(sp)
> +	REG_S x14, PT_A4(sp)
> +	REG_S x15, PT_A5(sp)
> +	REG_S x16, PT_A6(sp)
> +	REG_S x17, PT_A7(sp)
> +	REG_S x28, PT_T3(sp)
> +	REG_S x29, PT_T4(sp)
> +	REG_S x30, PT_T5(sp)
> +	REG_S x31, PT_T6(sp)
> +
> +	la ra, restore_caller_reg
> +	tail get_overflow_stack
> +
> +restore_caller_reg:
> +	//save per-cpu overflow stack
> +	REG_S a0, -8(sp)
> +	//restore caller register from shadow_stack
> +	REG_L x1,  PT_RA(sp)
> +	REG_L x5,  PT_T0(sp)
> +	REG_L x6,  PT_T1(sp)
> +	REG_L x7,  PT_T2(sp)
> +	REG_L x10, PT_A0(sp)
> +	REG_L x11, PT_A1(sp)
> +	REG_L x12, PT_A2(sp)
> +	REG_L x13, PT_A3(sp)
> +	REG_L x14, PT_A4(sp)
> +	REG_L x15, PT_A5(sp)
> +	REG_L x16, PT_A6(sp)
> +	REG_L x17, PT_A7(sp)
> +	REG_L x28, PT_T3(sp)
> +	REG_L x29, PT_T4(sp)
> +	REG_L x30, PT_T5(sp)
> +	REG_L x31, PT_T6(sp)
> +
> +	//load per-cpu overflow stack
> +	REG_L sp, -8(sp)
> +	addi sp, sp, -(PT_SIZE_ON_STACK)
> +
> +	//save context to overflow stack
> +	REG_S x1,  PT_RA(sp)
> +	REG_S x3,  PT_GP(sp)
> +	REG_S x5,  PT_T0(sp)
> +	REG_S x6,  PT_T1(sp)
> +	REG_S x7,  PT_T2(sp)
> +	REG_S x8,  PT_S0(sp)
> +	REG_S x9,  PT_S1(sp)
> +	REG_S x10, PT_A0(sp)
> +	REG_S x11, PT_A1(sp)
> +	REG_S x12, PT_A2(sp)
> +	REG_S x13, PT_A3(sp)
> +	REG_S x14, PT_A4(sp)
> +	REG_S x15, PT_A5(sp)
> +	REG_S x16, PT_A6(sp)
> +	REG_S x17, PT_A7(sp)
> +	REG_S x18, PT_S2(sp)
> +	REG_S x19, PT_S3(sp)
> +	REG_S x20, PT_S4(sp)
> +	REG_S x21, PT_S5(sp)
> +	REG_S x22, PT_S6(sp)
> +	REG_S x23, PT_S7(sp)
> +	REG_S x24, PT_S8(sp)
> +	REG_S x25, PT_S9(sp)
> +	REG_S x26, PT_S10(sp)
> +	REG_S x27, PT_S11(sp)
> +	REG_S x28, PT_T3(sp)
> +	REG_S x29, PT_T4(sp)
> +	REG_S x30, PT_T5(sp)
> +	REG_S x31, PT_T6(sp)
> +
> +	REG_L s0, TASK_TI_KERNEL_SP(tp)
> +	csrr s1, CSR_STATUS
> +	csrr s2, CSR_EPC
> +	csrr s3, CSR_TVAL
> +	csrr s4, CSR_CAUSE
> +	csrr s5, CSR_SCRATCH
> +	REG_S s0, PT_SP(sp)
> +	REG_S s1, PT_STATUS(sp)
> +	REG_S s2, PT_EPC(sp)
> +	REG_S s3, PT_BADADDR(sp)
> +	REG_S s4, PT_CAUSE(sp)
> +	REG_S s5, PT_TP(sp)
> +	move a0, sp
> +	tail handle_bad_stack
> +#endif
> +
>  END(handle_exception)
>
>  ENTRY(ret_from_fork)
> diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c
> index 7bc88d8aab97..0a98fd0ddfe9 100644
> --- a/arch/riscv/kernel/traps.c
> +++ b/arch/riscv/kernel/traps.c
> @@ -203,3 +203,38 @@ int is_valid_bugaddr(unsigned long pc)
>  void __init trap_init(void)
>  {
>  }
> +
> +#ifdef CONFIG_VMAP_STACK
> +static DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)],
> +		overflow_stack)__aligned(16);
> +/*
> + * shadow stack: used by handle_kernel_stack_overflow (in kernel/entry.S)
> + * while it calls get_overflow_stack() to obtain the per-cpu overflow stack.
> + */
> +long shadow_stack[SHADOW_OVERFLOW_STACK_SIZE/sizeof(long)];
> +asmlinkage unsigned long get_overflow_stack(void)
> +{
> +	return (unsigned long)this_cpu_ptr(overflow_stack) +
> +		OVERFLOW_STACK_SIZE;
> +}
> +
> +asmlinkage void handle_bad_stack(struct pt_regs *regs)
> +{
> +	unsigned long tsk_stk = (unsigned long)current->stack;
> +	unsigned long ovf_stk = (unsigned long)this_cpu_ptr(overflow_stack);
> +
> +	console_verbose();
> +
> +	pr_emerg("Insufficient stack space to handle exception!\n");
> +	pr_emerg("Task stack:     [0x%016lx..0x%016lx]\n",
> +			tsk_stk, tsk_stk + THREAD_SIZE);
> +	pr_emerg("Overflow stack: [0x%016lx..0x%016lx]\n",
> +			ovf_stk, ovf_stk + OVERFLOW_STACK_SIZE);
> +
> +	__show_regs(regs);
> +	panic("Kernel stack overflow");
> +
> +	for (;;)
> +		wait_for_interrupt();
> +}
> +#endif
> diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S
> index 891742ff75a7..502d0826ecb1 100644
> --- a/arch/riscv/kernel/vmlinux.lds.S
> +++ b/arch/riscv/kernel/vmlinux.lds.S
> @@ -117,7 +117,7 @@ SECTIONS
>  	. = ALIGN(SECTION_ALIGN);
>  	_data = .;
>
> -	RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE)
> +	RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN)
>  	.sdata : {
>  		__global_pointer$ = . + 0x800;
>  		*(.sdata*)

Thanks, this is on for-next.
Andreas Schwab July 15, 2021, 4:22 p.m. UTC | #2
On Jun 21 2021, Tong Tiangen wrote:

> This patch adds stack overflow detection to riscv, usable when
> CONFIG_VMAP_STACK=y.

This breaks get_wchan:

[   65.609696] Unable to handle kernel paging request at virtual address ffffffd0003bbde8
[   65.610460] Oops [#1]
[   65.610626] Modules linked in: virtio_blk virtio_mmio rtc_goldfish btrfs blake2b_generic libcrc32c xor raid6_pq sg dm_multipath dm_mod scsi_dh_rdac scsi_dh_emc scsi_dh_alua efivarfs
[   65.611670] CPU: 2 PID: 1 Comm: systemd Not tainted 5.14.0-rc1-1.g34fe32a-default #1 openSUSE Tumbleweed (unreleased) c62f7109153e5a0897ee58ba52393ad99b070fd2
[   65.612334] Hardware name: riscv-virtio,qemu (DT)
[   65.613008] epc : get_wchan+0x5c/0x88
[   65.613334]  ra : get_wchan+0x42/0x88
[   65.613625] epc : ffffffff800048a4 ra : ffffffff8000488a sp : ffffffd00021bb90
[   65.614008]  gp : ffffffff817709f8 tp : ffffffe07fe91b80 t0 : 00000000000001f8
[   65.614411]  t1 : 0000000000020000 t2 : 0000000000000000 s0 : ffffffd00021bbd0
[   65.614818]  s1 : ffffffd0003bbdf0 a0 : 0000000000000001 a1 : 0000000000000002
[   65.615237]  a2 : ffffffff81618008 a3 : 0000000000000000 a4 : 0000000000000000
[   65.615637]  a5 : ffffffd0003bc000 a6 : 0000000000000002 a7 : ffffffe27d370000
[   65.616022]  s2 : ffffffd0003bbd90 s3 : ffffffff8071a81e s4 : 0000000000003fff
[   65.616407]  s5 : ffffffffffffc000 s6 : 0000000000000000 s7 : ffffffff81618008
[   65.616845]  s8 : 0000000000000001 s9 : 0000000180000040 s10: 0000000000000000
[   65.617248]  s11: 000000000000016b t3 : 000000ff00000000 t4 : 0c6aec92de5e3fd7
[   65.617672]  t5 : fff78f60608fcfff t6 : 0000000000000078
[   65.618088] status: 0000000000000120 badaddr: ffffffd0003bbde8 cause: 000000000000000d
[   65.618621] [<ffffffff800048a4>] get_wchan+0x5c/0x88
[   65.619008] [<ffffffff8022da88>] do_task_stat+0x7a2/0xa46
[   65.619325] [<ffffffff8022e87e>] proc_tgid_stat+0xe/0x16
[   65.619637] [<ffffffff80227dd6>] proc_single_show+0x46/0x96
[   65.619979] [<ffffffff801ccb1e>] seq_read_iter+0x190/0x31e
[   65.620341] [<ffffffff801ccd70>] seq_read+0xc4/0x104
[   65.620633] [<ffffffff801a6bfe>] vfs_read+0x6a/0x112
[   65.620922] [<ffffffff801a701c>] ksys_read+0x54/0xbe
[   65.621206] [<ffffffff801a7094>] sys_read+0xe/0x16
[   65.621474] [<ffffffff8000303e>] ret_from_syscall+0x0/0x2
[   65.622169] ---[ end trace f24856ed2b8789c5 ]---
[   65.622832] Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b

Andreas.
Jisheng Zhang July 16, 2021, 12:53 p.m. UTC | #3
Hi Andreas,

On Thu, 15 Jul 2021 18:22:19 +0200
Andreas Schwab <schwab@linux-m68k.org> wrote:

> On Jun 21 2021, Tong Tiangen wrote:
> 
> > This patch adds stack overflow detection to riscv, usable when
> > CONFIG_VMAP_STACK=y.  
> 
> This breaks get_wchan:

I can't reproduce the panic error. Mind to share your .config?

Thanks in advance

> 
> [   65.609696] Unable to handle kernel paging request at virtual address ffffffd0003bbde8
> [   65.610460] Oops [#1]
> [   65.610626] Modules linked in: virtio_blk virtio_mmio rtc_goldfish btrfs blake2b_generic libcrc32c xor raid6_pq sg dm_multipath dm_mod scsi_dh_rdac scsi_dh_emc scsi_dh_alua efivarfs
> [   65.611670] CPU: 2 PID: 1 Comm: systemd Not tainted 5.14.0-rc1-1.g34fe32a-default #1 openSUSE Tumbleweed (unreleased) c62f7109153e5a0897ee58ba52393ad99b070fd2
> [   65.612334] Hardware name: riscv-virtio,qemu (DT)
> [   65.613008] epc : get_wchan+0x5c/0x88
> [   65.613334]  ra : get_wchan+0x42/0x88
> [   65.613625] epc : ffffffff800048a4 ra : ffffffff8000488a sp : ffffffd00021bb90
> [   65.614008]  gp : ffffffff817709f8 tp : ffffffe07fe91b80 t0 : 00000000000001f8
> [   65.614411]  t1 : 0000000000020000 t2 : 0000000000000000 s0 : ffffffd00021bbd0
> [   65.614818]  s1 : ffffffd0003bbdf0 a0 : 0000000000000001 a1 : 0000000000000002
> [   65.615237]  a2 : ffffffff81618008 a3 : 0000000000000000 a4 : 0000000000000000
> [   65.615637]  a5 : ffffffd0003bc000 a6 : 0000000000000002 a7 : ffffffe27d370000
> [   65.616022]  s2 : ffffffd0003bbd90 s3 : ffffffff8071a81e s4 : 0000000000003fff
> [   65.616407]  s5 : ffffffffffffc000 s6 : 0000000000000000 s7 : ffffffff81618008
> [   65.616845]  s8 : 0000000000000001 s9 : 0000000180000040 s10: 0000000000000000
> [   65.617248]  s11: 000000000000016b t3 : 000000ff00000000 t4 : 0c6aec92de5e3fd7
> [   65.617672]  t5 : fff78f60608fcfff t6 : 0000000000000078
> [   65.618088] status: 0000000000000120 badaddr: ffffffd0003bbde8 cause: 000000000000000d
> [   65.618621] [<ffffffff800048a4>] get_wchan+0x5c/0x88
> [   65.619008] [<ffffffff8022da88>] do_task_stat+0x7a2/0xa46
> [   65.619325] [<ffffffff8022e87e>] proc_tgid_stat+0xe/0x16
> [   65.619637] [<ffffffff80227dd6>] proc_single_show+0x46/0x96
> [   65.619979] [<ffffffff801ccb1e>] seq_read_iter+0x190/0x31e
> [   65.620341] [<ffffffff801ccd70>] seq_read+0xc4/0x104
> [   65.620633] [<ffffffff801a6bfe>] vfs_read+0x6a/0x112
> [   65.620922] [<ffffffff801a701c>] ksys_read+0x54/0xbe
> [   65.621206] [<ffffffff801a7094>] sys_read+0xe/0x16
> [   65.621474] [<ffffffff8000303e>] ret_from_syscall+0x0/0x2
> [   65.622169] ---[ end trace f24856ed2b8789c5 ]---
> [   65.622832] Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b
> 
> Andreas.
>
Andreas Schwab July 16, 2021, 3:53 p.m. UTC | #4
On Jul 16 2021, Jisheng Zhang wrote:

> I can't reproduce the panic error. Mind to share your .config?

https://github.com/openSUSE/kernel-source/blob/master/config/riscv64/default

Andreas.
Jisheng Zhang July 17, 2021, 2:18 a.m. UTC | #5
On Fri, 16 Jul 2021 17:53:50 +0200
Andreas Schwab <schwab@linux-m68k.org> wrote:

> On Jul 16 2021, Jisheng Zhang wrote:
> 
> > I can't reproduce the panic error. Mind to share your .config?  
> 
> https://github.com/openSUSE/kernel-source/blob/master/config/riscv64/default
> 
> Andreas.
> 

Looks a bit strange. With the above .config, I still can't reproduce the panic
either by

cat /proc/PID/wchan

or

cat /proc/PID/stat

I used the latest Linus tree without any patches. I noticed some SUSE_* config
options in the .config file; is it possible that you applied some patches? If so,
could you please try the latest Linus tree without any patches?

Thanks
Andreas Schwab July 17, 2021, 6:55 a.m. UTC | #6
Please use
https://download.opensuse.org/repositories/home:/Andreas_Schwab:/riscv:/jeos/images/openSUSE-Tumbleweed-RISC-V-JeOS-efi.riscv64.raw.xz
and run it in qemu with u-boot as kernel.

Andreas.
Tong Tiangen July 19, 2021, 3:27 a.m. UTC | #7
On 2021/7/17 14:55, Andreas Schwab wrote:
> Please use
> https://download.opensuse.org/repositories/home:/Andreas_Schwab:/riscv:/jeos/images/openSUSE-Tumbleweed-RISC-V-JeOS-efi.riscv64.raw.xz
> and run it in qemu with u-boot as kernel.
>
> Andreas.
>

Hi andreas:
I used today's latest mainline code and the .config you provided, and I
can't reproduce this panic.
Could you provide your code branch? I'll test and analyze it.

Thanks.
Andreas Schwab July 19, 2021, 7:23 a.m. UTC | #8
On Jul 19 2021, tongtiangen wrote:

> On 2021/7/17 14:55, Andreas Schwab wrote:
>> Please use
>> https://download.opensuse.org/repositories/home:/Andreas_Schwab:/riscv:/jeos/images/openSUSE-Tumbleweed-RISC-V-JeOS-efi.riscv64.raw.xz
>> and run it in qemu with u-boot as kernel.
>>
>> Andreas.
>>
>
> Hi andreas:
> I used today's latest mainline code and .config provided by you, and I
> can't reproduce this panic.

Did you test it like I said above?

Andreas.
Palmer Dabbelt July 22, 2021, 6:12 a.m. UTC | #9
On Mon, 19 Jul 2021 00:23:06 PDT (-0700), schwab@linux-m68k.org wrote:
> On Jul 19 2021, tongtiangen wrote:
>
>> On 2021/7/17 14:55, Andreas Schwab wrote:
>>> Please use
>>> https://download.opensuse.org/repositories/home:/Andreas_Schwab:/riscv:/jeos/images/openSUSE-Tumbleweed-RISC-V-JeOS-efi.riscv64.raw.xz
>>> and run it in qemu with u-boot as kernel.
>>>
>>> Andreas.
>>>
>>
>> Hi andreas:
>> I used today's latest mainline code and .config provided by you, and I
>> can't reproduce this panic.
>
> Did you test it like I said above?
>
> Andreas.

I'm getting this on and off, with just 

CONFIG_VMAP_STACK=y

on top of defconfig, when running on QEMU.  It's not showing up right 
now: I'd thought it was an issue with that initrd patch, but it went 
away when I re-ran the tests so I'm guessing it's something 
non-deterministic.  I'll try to take a look if it comes back.
Atish Patra July 22, 2021, 8:35 a.m. UTC | #10
On Wed, Jul 21, 2021 at 11:12 PM Palmer Dabbelt <palmer@dabbelt.com> wrote:
>
> On Mon, 19 Jul 2021 00:23:06 PDT (-0700), schwab@linux-m68k.org wrote:
> > On Jul 19 2021, tongtiangen wrote:
> >
> >> On 2021/7/17 14:55, Andreas Schwab wrote:
> >>> Please use
> >>> https://download.opensuse.org/repositories/home:/Andreas_Schwab:/riscv:/jeos/images/openSUSE-Tumbleweed-RISC-V-JeOS-efi.riscv64.raw.xz
> >>> and run it in qemu with u-boot as kernel.
> >>>
> >>> Andreas.
> >>>
> >>
> >> Hi andreas:
> >> I used today's latest mainline code and .config provided by you, and I
> >> can't reproduce this panic.
> >
> > Did you test it like I said above?
> >
> > Andreas.
>
> I'm getting this on and off, with just
>
> CONFIG_VMAP_STACK=y
>
> on top of defconfig, when running on QEMU.  It's not showing up right
> now: I'd thought it was an issue with that initrd patch, but it went
> away when I re-ran the tests so I'm guessing it's something
> non-deterministic.  I'll try to take a look if it comes back.
>

I got it very frequently on beagleV with the following branch & config.
https://github.com/esmil/linux/commits/beaglev

beaglev_defconfig

Disabling CONFIG_VMAP_STACK avoids the crash.

> _______________________________________________
> linux-riscv mailing list
> linux-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-riscv
Andreas Schwab July 22, 2021, 9:02 a.m. UTC | #11
On Jul 21 2021, Palmer Dabbelt wrote:

> On Mon, 19 Jul 2021 00:23:06 PDT (-0700), schwab@linux-m68k.org wrote:
>> On Jul 19 2021, tongtiangen wrote:
>>
>>> On 2021/7/17 14:55, Andreas Schwab wrote:
>>>> Please use
>>>> https://download.opensuse.org/repositories/home:/Andreas_Schwab:/riscv:/jeos/images/openSUSE-Tumbleweed-RISC-V-JeOS-efi.riscv64.raw.xz
>>>> and run it in qemu with u-boot as kernel.
>>>>
>>>> Andreas.
>>>>
>>>
>>> Hi andreas:
>>> I used today's latest mainline code and .config provided by you, and I
>>> can't reproduce this panic.
>>
>> Did you test it like I said above?
>>
>> Andreas.
>
> I'm getting this on and off, with just 
> CONFIG_VMAP_STACK=y
>
> on top of defconfig, when running on QEMU.  It's not showing up right now:
> I'd thought it was an issue with that initrd patch, but it went away when
> I re-ran the tests so I'm guessing it's something non-deterministic.  I'll
> try to take a look if it comes back.

The crash happens reliably with the image above.

Andreas.
Jisheng Zhang July 22, 2021, 1:37 p.m. UTC | #12
On Thu, 22 Jul 2021 01:35:23 -0700
Atish Patra <atishp@atishpatra.org> wrote:

> On Wed, Jul 21, 2021 at 11:12 PM Palmer Dabbelt <palmer@dabbelt.com> wrote:
> >
> > On Mon, 19 Jul 2021 00:23:06 PDT (-0700), schwab@linux-m68k.org wrote:  
> > > On Jul 19 2021, tongtiangen wrote:
> > >  
> > >> On 2021/7/17 14:55, Andreas Schwab wrote:  
> > >>> Please use
> > >>> https://download.opensuse.org/repositories/home:/Andreas_Schwab:/riscv:/jeos/images/openSUSE-Tumbleweed-RISC-V-JeOS-efi.riscv64.raw.xz
> > >>> and run it in qemu with u-boot as kernel.
> > >>>
> > >>> Andreas.
> > >>>  
> > >>
> > >> Hi andreas:
> > >> I used today's latest mainline code and .config provided by you, and I
> > >> can't reproduce this panic.  
> > >
> > > Did you test it like I said above?
> > >
> > > Andreas.  
> >
> > I'm getting this on and off, with just
> >
> > CONFIG_VMAP_STACK=y
> >
> > on top of defconfig, when running on QEMU.  It's not showing up right
> > now: I'd thought it was an issue with that initrd patch, but it went
> > away when I re-ran the tests so I'm guessing it's something
> > non-deterministic.  I'll try to take a look if it comes back.
> >  
> 
> I got it very frequently on beagleV with the following branch & config.
> https://github.com/esmil/linux/commits/beaglev
> 
> beaglev_defconfig
> 
> Disabling CONFIG_VMAP_STACK avoids the crash.

Hi all,

I think we need to pin the task's stack before walking it in get_wchan(). Could
you please try the patch below?

Thanks

diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c
index ff467b98c3e3..ac7593607fa6 100644
--- a/arch/riscv/kernel/stacktrace.c
+++ b/arch/riscv/kernel/stacktrace.c
@@ -132,8 +132,12 @@ unsigned long get_wchan(struct task_struct *task)
 {
 	unsigned long pc = 0;
 
-	if (likely(task && task != current && !task_is_running(task)))
+	if (likely(task && task != current && !task_is_running(task))) {
+		if (!try_get_task_stack(task))
+			return 0;
 		walk_stackframe(task, NULL, save_wchan, &pc);
+		put_task_stack(task);
+	}
 	return pc;
 }
 


> 
> > _______________________________________________
> > linux-riscv mailing list
> > linux-riscv@lists.infradead.org
> > http://lists.infradead.org/mailman/listinfo/linux-riscv  
> 
> 
>
Jisheng Zhang July 22, 2021, 2:24 p.m. UTC | #13
On Thu, 22 Jul 2021 21:37:24 +0800
Jisheng Zhang  wrote:

> On Thu, 22 Jul 2021 01:35:23 -0700
> Atish Patra <atishp@atishpatra.org> wrote:
> 
> > On Wed, Jul 21, 2021 at 11:12 PM Palmer Dabbelt <palmer@dabbelt.com> wrote:  
> > >
> > > On Mon, 19 Jul 2021 00:23:06 PDT (-0700), schwab@linux-m68k.org wrote:    
> > > > On Jul 19 2021, tongtiangen wrote:
> > > >    
> > > >> On 2021/7/17 14:55, Andreas Schwab wrote:    
> > > >>> Please use
> > > >>> https://download.opensuse.org/repositories/home:/Andreas_Schwab:/riscv:/jeos/images/openSUSE-Tumbleweed-RISC-V-JeOS-efi.riscv64.raw.xz
> > > >>> and run it in qemu with u-boot as kernel.
> > > >>>
> > > >>> Andreas.
> > > >>>    
> > > >>
> > > >> Hi andreas:
> > > >> I used today's latest mainline code and .config provided by you, and I
> > > >> can't reproduce this panic.    
> > > >
> > > > Did you test it like I said above?
> > > >
> > > > Andreas.    
> > >
> > > I'm getting this on and off, with just
> > >
> > > CONFIG_VMAP_STACK=y
> > >
> > > on top of defconfig, when running on QEMU.  It's not showing up right
> > > now: I'd thought it was an issue with that initrd patch, but it went
> > > away when I re-ran the tests so I'm guessing it's something
> > > non-deterministic.  I'll try to take a look if it comes back.
> > >    
> > 
> > I got it very frequently on beagleV with the following branch & config.
> > https://github.com/esmil/linux/commits/beaglev
> > 
> > beaglev_defconfig
> > 
> > Disabling CONFIG_VMAP_STACK avoids the crash.  
> 
> Hi all,
> 
> I think we need to pin the stack before calling get_wchan(), could you please

Typo: s/get_wchan/walk_stackframe

> try below patch?
> 
> Thanks
> 
> diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c
> index ff467b98c3e3..ac7593607fa6 100644
> --- a/arch/riscv/kernel/stacktrace.c
> +++ b/arch/riscv/kernel/stacktrace.c
> @@ -132,8 +132,12 @@ unsigned long get_wchan(struct task_struct *task)
>  {
>  	unsigned long pc = 0;
>  
> -	if (likely(task && task != current && !task_is_running(task)))
> +	if (likely(task && task != current && !task_is_running(task))) {
> +		if (!try_get_task_stack(task))
> +			return 0;
>  		walk_stackframe(task, NULL, save_wchan, &pc);
> +		put_task_stack(task);
> +	}
>  	return pc;
>  }
>  
> 
> 
> >   
> > > _______________________________________________
> > > linux-riscv mailing list
> > > linux-riscv@lists.infradead.org
> > > http://lists.infradead.org/mailman/listinfo/linux-riscv    
> > 
> > 
> >   
> 
> 
> 
> _______________________________________________
> linux-riscv mailing list
> linux-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-riscv
Andreas Schwab July 22, 2021, 3:42 p.m. UTC | #14
On Jul 22 2021, Jisheng Zhang wrote:

> I think we need to pin the stack before calling get_wchan(), could you please
> try below patch?

Thanks, this fixes the crash for me.

Andreas.
Jisheng Zhang July 22, 2021, 11:54 p.m. UTC | #15
On Thu, 22 Jul 2021 17:42:52 +0200
Andreas Schwab <schwab@linux-m68k.org> wrote:

> On Jul 22 2021, Jisheng Zhang wrote:
> 
> > I think we need to pin the stack before calling get_wchan(), could you please
> > try below patch?  
> 
> Thanks, this fixes the crash for me.
> 
> Andreas.
> 

Thanks for testing. I will send out a formal patch later.

Thanks
Tong Tiangen July 23, 2021, 1:36 a.m. UTC | #16
On 2021/7/23 7:54, Jisheng Zhang wrote:
> On Thu, 22 Jul 2021 17:42:52 +0200
> Andreas Schwab <schwab@linux-m68k.org> wrote:
>
>> On Jul 22 2021, Jisheng Zhang wrote:
>>
>>> I think we need to pin the stack before calling get_wchan(), could you please
>>> try below patch?
>>
>> Thanks, this fixes the crash for me.
>>
>> Andreas.
>>
>
> Thanks for testing. I will send out formal patch later
>
> Thanks
>
> .
>

Hi all:
I tried to reproduce this crash with the openSUSE kernel repo
( https://github.com/opensuse/kernel ), but could not reproduce it.

Judging from the fix, the crash happens because task->stack is released
before get_wchan() is called; the task state may be TASK_DEAD.

VMAP_STACK is used to detect kernel stack overflow; there is no obvious
connection between the two, which makes me a little confused.
Jisheng Zhang July 23, 2021, 4:29 a.m. UTC | #17
On Fri, 23 Jul 2021 09:36:47 +0800
tongtiangen <tongtiangen@huawei.com> wrote:

> On 2021/7/23 7:54, Jisheng Zhang wrote:
> > On Thu, 22 Jul 2021 17:42:52 +0200
> > Andreas Schwab <schwab@linux-m68k.org> wrote:
> >  
> >> On Jul 22 2021, Jisheng Zhang wrote:
> >>  
> >>> I think we need to pin the stack before calling get_wchan(), could you please
> >>> try below patch?  
> >>
> >> Thanks, this fixes the crash for me.
> >>
> >> Andreas.
> >>  
> >
> > Thanks for testing. I will send out formal patch later
> >
> > Thanks
> >
> > .
> >  
> 
> Hi all:
> I tried to reproduced this crash in openSUSE code repo( 
> https://github.com/opensuse/kernel ), but not reproduced successfully.
> 
>  From the patch of problem repair, the crash is due to task->stack is 
> released before calling get_wchan, the task state of maybe TASK_DEAD.
> 
> VMAP_STACK is used to detect kernel stack overflow, there is no 
> connection between the two, it makes me a little confused.

I believe the bug has existed since the first day riscv was mainlined.

Since THREAD_INFO_IN_TASK=y on riscv, the task stack can be freed
before the task is destroyed.

When VMAP_STACK=n, the task's stack is allocated from the linear mapping.
When the task stack is freed, the corresponding mapping still exists, and
since get_wchan() only reads, no harm has been observed so far.

When VMAP_STACK=y, the task's stack is allocated from the vmalloc area.
When the task stack is freed, the corresponding mapping may no longer exist,
so I expect an MMU fault here, and thus the kernel panic.

In summary, the bug isn't related to VMAP_STACK, but VMAP_STACK makes
the bug observable.

Thanks
Jisheng Zhang July 23, 2021, 4:40 a.m. UTC | #18
On Fri, 23 Jul 2021 12:29:25 +0800
Jisheng Zhang <jszhang3@mail.ustc.edu.cn> wrote:

> On Fri, 23 Jul 2021 09:36:47 +0800
> tongtiangen <tongtiangen@huawei.com> wrote:
> 
> > On 2021/7/23 7:54, Jisheng Zhang wrote:  
> > > On Thu, 22 Jul 2021 17:42:52 +0200
> > > Andreas Schwab <schwab@linux-m68k.org> wrote:
> > >    
> > >> On Jul 22 2021, Jisheng Zhang wrote:
> > >>    
> > >>> I think we need to pin the stack before calling get_wchan(), could you please
> > >>> try below patch?    
> > >>
> > >> Thanks, this fixes the crash for me.
> > >>
> > >> Andreas.
> > >>    
> > >
> > > Thanks for testing. I will send out formal patch later
> > >
> > > Thanks
> > >
> > > .
> > >    
> > 
> > Hi all:
> > I tried to reproduced this crash in openSUSE code repo( 
> > https://github.com/opensuse/kernel ), but not reproduced successfully.
> > 
> >  From the patch of problem repair, the crash is due to task->stack is 
> > released before calling get_wchan, the task state of maybe TASK_DEAD.
> > 
> > VMAP_STACK is used to detect kernel stack overflow, there is no 
> > connection between the two, it makes me a little confused.  
> 
> I believe the bug exists from the first day of riscv mainlined.
> 
> Since THREAD_INFO_IN_TASK=y in riscv, so when task stack can be freed
> before being destroyed.

typo: task stack can be freed before task is destroyed

> 
> When VMAP_STACK=n, task's stack is allocated from linear mapping. When
> task stack is freed, the corresponding mapping still exists, and since
> get_wchan() only read, no harm is observed so far.
> 
> When VMAP_STACK=y, task's stack is allocated from vmalloc area. When
> task stack is freed, the corresponding mapping may not exist, so I expect
> MMU fault here, thus the kernel panic.
> 
> In summary, the bug isn't related with VMAP_STACK, but VMAP_STACK makes
> the bug observable.
> 
> Thanks
> 
> 
> 
> _______________________________________________
> linux-riscv mailing list
> linux-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-riscv
Tong Tiangen July 23, 2021, 6:49 a.m. UTC | #19
On 2021/7/23 12:40, Jisheng Zhang wrote:
> On Fri, 23 Jul 2021 12:29:25 +0800
> Jisheng Zhang <jszhang3@mail.ustc.edu.cn> wrote:
>
>> On Fri, 23 Jul 2021 09:36:47 +0800
>> tongtiangen <tongtiangen@huawei.com> wrote:
>>
>>> On 2021/7/23 7:54, Jisheng Zhang wrote:
>>>> On Thu, 22 Jul 2021 17:42:52 +0200
>>>> Andreas Schwab <schwab@linux-m68k.org> wrote:
>>>>
>>>>> On Jul 22 2021, Jisheng Zhang wrote:
>>>>>
>>>>>> I think we need to pin the stack before calling get_wchan(), could you please
>>>>>> try below patch?
>>>>>
>>>>> Thanks, this fixes the crash for me.
>>>>>
>>>>> Andreas.
>>>>>
>>>>
>>>> Thanks for testing. I will send out formal patch later
>>>>
>>>> Thanks
>>>>
>>>> .
>>>>
>>>
>>> Hi all:
>>> I tried to reproduced this crash in openSUSE code repo(
>>> https://github.com/opensuse/kernel ), but not reproduced successfully.
>>>
>>>  From the patch of problem repair, the crash is due to task->stack is
>>> released before calling get_wchan, the task state of maybe TASK_DEAD.
>>>
>>> VMAP_STACK is used to detect kernel stack overflow, there is no
>>> connection between the two, it makes me a little confused.
>>
>> I believe the bug exists from the first day of riscv mainlined.
>>
>> Since THREAD_INFO_IN_TASK=y in riscv, so when task stack can be freed
>> before being destroyed.
>
> typo: task stack can be freed before task is destroyed
>
>>
>> When VMAP_STACK=n, task's stack is allocated from linear mapping. When
>> task stack is freed, the corresponding mapping still exists, and since
>> get_wchan() only read, no harm is observed so far.
>>
>> When VMAP_STACK=y, task's stack is allocated from vmalloc area. When
>> task stack is freed, the corresponding mapping may not exist, so I expect
>> MMU fault here, thus the kernel panic.
>>
>> In summary, the bug isn't related with VMAP_STACK, but VMAP_STACK makes
>> the bug observable.
>>
>> Thanks

This explanation makes sense; it is necessary to perform a stack
validity check before calling walk_stackframe().

Thanks

>>
>>
>>
>> _______________________________________________
>> linux-riscv mailing list
>> linux-riscv@lists.infradead.org
>> http://lists.infradead.org/mailman/listinfo/linux-riscv
>
>
> .
>
diff mbox series

Patch

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index a97b03164080..c28284f45434 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -70,6 +70,7 @@  config RISCV
 	select HAVE_ARCH_MMAP_RND_BITS if MMU
 	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_ARCH_TRACEHOOK
+	select HAVE_ARCH_VMAP_STACK if MMU && 64BIT
 	select HAVE_ASM_MODVERSIONS
 	select HAVE_CONTEXT_TRACKING
 	select HAVE_DEBUG_KMEMLEAK
diff --git a/arch/riscv/include/asm/asm-prototypes.h b/arch/riscv/include/asm/asm-prototypes.h
index 2a652b0c987d..ef386fcf3939 100644
--- a/arch/riscv/include/asm/asm-prototypes.h
+++ b/arch/riscv/include/asm/asm-prototypes.h
@@ -25,4 +25,7 @@  DECLARE_DO_ERROR_INFO(do_trap_ecall_s);
 DECLARE_DO_ERROR_INFO(do_trap_ecall_m);
 DECLARE_DO_ERROR_INFO(do_trap_break);
 
+asmlinkage unsigned long get_overflow_stack(void);
+asmlinkage void handle_bad_stack(struct pt_regs *regs);
+
 #endif /* _ASM_RISCV_PROTOTYPES_H */
diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h
index 0e549a3089b3..60da0dcacf14 100644
--- a/arch/riscv/include/asm/thread_info.h
+++ b/arch/riscv/include/asm/thread_info.h
@@ -19,6 +19,21 @@ 
 #endif
 #define THREAD_SIZE		(PAGE_SIZE << THREAD_SIZE_ORDER)
 
+/*
+ * By aligning VMAP'd stacks to 2 * THREAD_SIZE, we can detect overflow by
+ * checking sp & (1 << THREAD_SHIFT), which we can do cheaply in the entry
+ * assembly.
+ */
+#ifdef CONFIG_VMAP_STACK
+#define THREAD_ALIGN            (2 * THREAD_SIZE)
+#else
+#define THREAD_ALIGN            THREAD_SIZE
+#endif
+
+#define THREAD_SHIFT            (PAGE_SHIFT + THREAD_SIZE_ORDER)
+#define OVERFLOW_STACK_SIZE     SZ_4K
+#define SHADOW_OVERFLOW_STACK_SIZE (1024)
+
 #ifndef __ASSEMBLY__
 
 #include <asm/processor.h>
diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index 80d5a9e017b0..98f502654edd 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -30,6 +30,15 @@  ENTRY(handle_exception)
 _restore_kernel_tpsp:
 	csrr tp, CSR_SCRATCH
 	REG_S sp, TASK_TI_KERNEL_SP(tp)
+
+#ifdef CONFIG_VMAP_STACK
+	addi sp, sp, -(PT_SIZE_ON_STACK)
+	srli sp, sp, THREAD_SHIFT
+	andi sp, sp, 0x1
+	bnez sp, handle_kernel_stack_overflow
+	REG_L sp, TASK_TI_KERNEL_SP(tp)
+#endif
+
 _save_context:
 	REG_S sp, TASK_TI_USER_SP(tp)
 	REG_L sp, TASK_TI_KERNEL_SP(tp)
@@ -376,6 +385,105 @@  handle_syscall_trace_exit:
 	call do_syscall_trace_exit
 	j ret_from_exception
 
+#ifdef CONFIG_VMAP_STACK
+handle_kernel_stack_overflow:
+	la sp, shadow_stack
+	addi sp, sp, SHADOW_OVERFLOW_STACK_SIZE
+
+	//save caller register to shadow stack
+	addi sp, sp, -(PT_SIZE_ON_STACK)
+	REG_S x1,  PT_RA(sp)
+	REG_S x5,  PT_T0(sp)
+	REG_S x6,  PT_T1(sp)
+	REG_S x7,  PT_T2(sp)
+	REG_S x10, PT_A0(sp)
+	REG_S x11, PT_A1(sp)
+	REG_S x12, PT_A2(sp)
+	REG_S x13, PT_A3(sp)
+	REG_S x14, PT_A4(sp)
+	REG_S x15, PT_A5(sp)
+	REG_S x16, PT_A6(sp)
+	REG_S x17, PT_A7(sp)
+	REG_S x28, PT_T3(sp)
+	REG_S x29, PT_T4(sp)
+	REG_S x30, PT_T5(sp)
+	REG_S x31, PT_T6(sp)
+
+	la ra, restore_caller_reg
+	tail get_overflow_stack
+
+restore_caller_reg:
+	//save per-cpu overflow stack
+	REG_S a0, -8(sp)
+	//restore caller register from shadow_stack
+	REG_L x1,  PT_RA(sp)
+	REG_L x5,  PT_T0(sp)
+	REG_L x6,  PT_T1(sp)
+	REG_L x7,  PT_T2(sp)
+	REG_L x10, PT_A0(sp)
+	REG_L x11, PT_A1(sp)
+	REG_L x12, PT_A2(sp)
+	REG_L x13, PT_A3(sp)
+	REG_L x14, PT_A4(sp)
+	REG_L x15, PT_A5(sp)
+	REG_L x16, PT_A6(sp)
+	REG_L x17, PT_A7(sp)
+	REG_L x28, PT_T3(sp)
+	REG_L x29, PT_T4(sp)
+	REG_L x30, PT_T5(sp)
+	REG_L x31, PT_T6(sp)
+
+	//load per-cpu overflow stack
+	REG_L sp, -8(sp)
+	addi sp, sp, -(PT_SIZE_ON_STACK)
+
+	//save context to overflow stack
+	REG_S x1,  PT_RA(sp)
+	REG_S x3,  PT_GP(sp)
+	REG_S x5,  PT_T0(sp)
+	REG_S x6,  PT_T1(sp)
+	REG_S x7,  PT_T2(sp)
+	REG_S x8,  PT_S0(sp)
+	REG_S x9,  PT_S1(sp)
+	REG_S x10, PT_A0(sp)
+	REG_S x11, PT_A1(sp)
+	REG_S x12, PT_A2(sp)
+	REG_S x13, PT_A3(sp)
+	REG_S x14, PT_A4(sp)
+	REG_S x15, PT_A5(sp)
+	REG_S x16, PT_A6(sp)
+	REG_S x17, PT_A7(sp)
+	REG_S x18, PT_S2(sp)
+	REG_S x19, PT_S3(sp)
+	REG_S x20, PT_S4(sp)
+	REG_S x21, PT_S5(sp)
+	REG_S x22, PT_S6(sp)
+	REG_S x23, PT_S7(sp)
+	REG_S x24, PT_S8(sp)
+	REG_S x25, PT_S9(sp)
+	REG_S x26, PT_S10(sp)
+	REG_S x27, PT_S11(sp)
+	REG_S x28, PT_T3(sp)
+	REG_S x29, PT_T4(sp)
+	REG_S x30, PT_T5(sp)
+	REG_S x31, PT_T6(sp)
+
+	REG_L s0, TASK_TI_KERNEL_SP(tp)
+	csrr s1, CSR_STATUS
+	csrr s2, CSR_EPC
+	csrr s3, CSR_TVAL
+	csrr s4, CSR_CAUSE
+	csrr s5, CSR_SCRATCH
+	REG_S s0, PT_SP(sp)
+	REG_S s1, PT_STATUS(sp)
+	REG_S s2, PT_EPC(sp)
+	REG_S s3, PT_BADADDR(sp)
+	REG_S s4, PT_CAUSE(sp)
+	REG_S s5, PT_TP(sp)
+	move a0, sp
+	tail handle_bad_stack
+#endif
+
 END(handle_exception)
 
 ENTRY(ret_from_fork)
diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c
index 7bc88d8aab97..0a98fd0ddfe9 100644
--- a/arch/riscv/kernel/traps.c
+++ b/arch/riscv/kernel/traps.c
@@ -203,3 +203,38 @@  int is_valid_bugaddr(unsigned long pc)
 void __init trap_init(void)
 {
 }
+
+#ifdef CONFIG_VMAP_STACK
+static DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)],
+		overflow_stack)__aligned(16);
+/*
+ * Shadow stack: used by handle_kernel_stack_overflow (in kernel/entry.S)
+ * while it calls get_overflow_stack() to obtain the per-cpu overflow stack.
+ */
+long shadow_stack[SHADOW_OVERFLOW_STACK_SIZE/sizeof(long)];
+asmlinkage unsigned long get_overflow_stack(void)
+{
+	return (unsigned long)this_cpu_ptr(overflow_stack) +
+		OVERFLOW_STACK_SIZE;
+}
+
+asmlinkage void handle_bad_stack(struct pt_regs *regs)
+{
+	unsigned long tsk_stk = (unsigned long)current->stack;
+	unsigned long ovf_stk = (unsigned long)this_cpu_ptr(overflow_stack);
+
+	console_verbose();
+
+	pr_emerg("Insufficient stack space to handle exception!\n");
+	pr_emerg("Task stack:     [0x%016lx..0x%016lx]\n",
+			tsk_stk, tsk_stk + THREAD_SIZE);
+	pr_emerg("Overflow stack: [0x%016lx..0x%016lx]\n",
+			ovf_stk, ovf_stk + OVERFLOW_STACK_SIZE);
+
+	__show_regs(regs);
+	panic("Kernel stack overflow");
+
+	for (;;)
+		wait_for_interrupt();
+}
+#endif
diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S
index 891742ff75a7..502d0826ecb1 100644
--- a/arch/riscv/kernel/vmlinux.lds.S
+++ b/arch/riscv/kernel/vmlinux.lds.S
@@ -117,7 +117,7 @@  SECTIONS
 	. = ALIGN(SECTION_ALIGN);
 	_data = .;
 
-	RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE)
+	RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN)
 	.sdata : {
 		__global_pointer$ = . + 0x800;
 		*(.sdata*)