diff mbox series

[v12,2/6] arm64: add support for ARCH_HAS_COPY_MC

Message ID 20240528085915.1955987-3-tongtiangen@huawei.com (mailing list archive)
State New
Headers show
Series arm64: add ARCH_HAS_COPY_MC support | expand

Commit Message

Tong Tiangen May 28, 2024, 8:59 a.m. UTC
For the arm64 kernel, when it processes hardware memory errors for
synchronize notifications(do_sea()), if the errors is consumed within the
kernel, the current processing is panic. However, it is not optimal.

Take copy_from/to_user for example, If ld* triggers a memory error, even in
kernel mode, only the associated process is affected. Killing the user
process and isolating the corrupt page is a better choice.

New fixup type EX_TYPE_KACCESS_ERR_ZERO_ME_SAFE is added to identify insn
that can recover from memory errors triggered by access to kernel memory.

Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
---
 arch/arm64/Kconfig                   |  1 +
 arch/arm64/include/asm/asm-extable.h | 31 +++++++++++++++++++++++-----
 arch/arm64/include/asm/asm-uaccess.h |  4 ++++
 arch/arm64/include/asm/extable.h     |  1 +
 arch/arm64/lib/copy_to_user.S        | 10 ++++-----
 arch/arm64/mm/extable.c              | 19 +++++++++++++++++
 arch/arm64/mm/fault.c                | 27 +++++++++++++++++-------
 7 files changed, 75 insertions(+), 18 deletions(-)

Comments

Jonathan Cameron Aug. 19, 2024, 10:30 a.m. UTC | #1
On Tue, 28 May 2024 16:59:11 +0800
Tong Tiangen <tongtiangen@huawei.com> wrote:

> For the arm64 kernel, when it processes hardware memory errors for
> synchronize notifications(do_sea()), if the errors is consumed within the
> kernel, the current processing is panic. However, it is not optimal.
> 
> Take copy_from/to_user for example, If ld* triggers a memory error, even in
> kernel mode, only the associated process is affected. Killing the user
> process and isolating the corrupt page is a better choice.
> 
> New fixup type EX_TYPE_KACCESS_ERR_ZERO_ME_SAFE is added to identify insn
> that can recover from memory errors triggered by access to kernel memory.
> 
> Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>

Hi - this is going slow :(

A few comments inline in the meantime but this really needs ARM maintainers
to take a (hopefully final) look.

Jonathan


> diff --git a/arch/arm64/include/asm/asm-extable.h b/arch/arm64/include/asm/asm-extable.h
> index 980d1dd8e1a3..9c0664fe1eb1 100644
> --- a/arch/arm64/include/asm/asm-extable.h
> +++ b/arch/arm64/include/asm/asm-extable.h
> @@ -5,11 +5,13 @@
>  #include <linux/bits.h>
>  #include <asm/gpr-num.h>
>  
> -#define EX_TYPE_NONE			0
> -#define EX_TYPE_BPF			1
> -#define EX_TYPE_UACCESS_ERR_ZERO	2
> -#define EX_TYPE_KACCESS_ERR_ZERO	3
> -#define EX_TYPE_LOAD_UNALIGNED_ZEROPAD	4
> +#define EX_TYPE_NONE				0
> +#define EX_TYPE_BPF				1
> +#define EX_TYPE_UACCESS_ERR_ZERO		2
> +#define EX_TYPE_KACCESS_ERR_ZERO		3
> +#define EX_TYPE_LOAD_UNALIGNED_ZEROPAD		4
> +/* kernel access memory error safe */
> +#define EX_TYPE_KACCESS_ERR_ZERO_ME_SAFE	5

Does anyone care enough about the alignment to bother realigning for one
long line? I'd be tempted not to bother, but up to maintainers.


> diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
> index 802231772608..2ac716c0d6d8 100644
> --- a/arch/arm64/lib/copy_to_user.S
> +++ b/arch/arm64/lib/copy_to_user.S
> @@ -20,7 +20,7 @@
>   *	x0 - bytes not copied
>   */
>  	.macro ldrb1 reg, ptr, val
> -	ldrb  \reg, [\ptr], \val
> +	KERNEL_ME_SAFE(9998f, ldrb  \reg, [\ptr], \val)
>  	.endm
>  
>  	.macro strb1 reg, ptr, val
> @@ -28,7 +28,7 @@
>  	.endm
>  
>  	.macro ldrh1 reg, ptr, val
> -	ldrh  \reg, [\ptr], \val
> +	KERNEL_ME_SAFE(9998f, ldrh  \reg, [\ptr], \val)
>  	.endm
>  
>  	.macro strh1 reg, ptr, val
> @@ -36,7 +36,7 @@
>  	.endm
>  
>  	.macro ldr1 reg, ptr, val
> -	ldr \reg, [\ptr], \val
> +	KERNEL_ME_SAFE(9998f, ldr \reg, [\ptr], \val)
>  	.endm
>  
>  	.macro str1 reg, ptr, val
> @@ -44,7 +44,7 @@
>  	.endm
>  
>  	.macro ldp1 reg1, reg2, ptr, val
> -	ldp \reg1, \reg2, [\ptr], \val
> +	KERNEL_ME_SAFE(9998f, ldp \reg1, \reg2, [\ptr], \val)
>  	.endm
>  
>  	.macro stp1 reg1, reg2, ptr, val
> @@ -64,7 +64,7 @@ SYM_FUNC_START(__arch_copy_to_user)
>  9997:	cmp	dst, dstin
>  	b.ne	9998f
>  	// Before being absolutely sure we couldn't copy anything, try harder
> -	ldrb	tmp1w, [srcin]
> +KERNEL_ME_SAFE(9998f, ldrb	tmp1w, [srcin])

Alignment looks off?

>  USER(9998f, sttrb tmp1w, [dst])
>  	add	dst, dst, #1
>  9998:	sub	x0, end, dst			// bytes not copied



> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
> index 451ba7cbd5ad..2dc65f99d389 100644
> --- a/arch/arm64/mm/fault.c
> +++ b/arch/arm64/mm/fault.c
> @@ -708,21 +708,32 @@ static int do_bad(unsigned long far, unsigned long esr, struct pt_regs *regs)
>  	return 1; /* "fault" */
>  }
>  
> +/*
> + * APEI claimed this as a firmware-first notification.
> + * Some processing deferred to task_work before ret_to_user().
> + */
> +static bool do_apei_claim_sea(struct pt_regs *regs)
> +{
> +	if (user_mode(regs)) {
> +		if (!apei_claim_sea(regs))

I'd keep to the the (apei_claim_sea(regs) == 0)
used in the original code. That hints to the reader that we are
interested here in an 'error' code rather than apei_claim_sea() returning
a bool.   I initially wondered why we return true when the code
fails to claim it.

Also, perhaps if you return 0 for success and an error code if not
you could just make this

	if (user_mode(regs))
		return apei_claim_sea(regs);

	if (IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC)) {
		if (fixup_exception_me(regs)) {
			return apei_claim_sea(regs);
		}
	}

	return false;

or maybe even (I may have messed this up, but I think this logic
works).

	if (!user_mode(regs) && IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC)) {
		if (!fixup_exception_me(regs))
			return false;
	}
	return apei_claim_sea(regs);


> +			return true;
> +	} else if (IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC)) {
> +		if (fixup_exception_me(regs) && !apei_claim_sea(regs))

Same here with using apei_claim_sea(regs) == 0 so it's obvious we
are checking for an error, not a boolean.

> +			return true;
> +	}
> +
> +	return false;
> +}
> +
>  static int do_sea(unsigned long far, unsigned long esr, struct pt_regs *regs)
>  {
>  	const struct fault_info *inf;
>  	unsigned long siaddr;
>  
> -	inf = esr_to_fault_info(esr);
> -
> -	if (user_mode(regs) && apei_claim_sea(regs) == 0) {
> -		/*
> -		 * APEI claimed this as a firmware-first notification.
> -		 * Some processing deferred to task_work before ret_to_user().
> -		 */
> +	if (do_apei_claim_sea(regs))

It might be made sense to factor this out first, then could be reviewed
as a noop before the new stuff is added.  Still it's not much code, so doesn't
really matter.
Might be worth keeping to returning 0 for success, error code
otherwise as per apei_claim_sea(regs)

The bool returning functions in the nearby code tend to be is_xxxx
not things that succeed or not.

If you change it to return int make this
	if (do_apei_claim_sea(regs) == 0)
so it's obvious this is the no error case.

>  		return 0;
> -	}
>  
> +	inf = esr_to_fault_info(esr);
>  	if (esr & ESR_ELx_FnV) {
>  		siaddr = 0;
>  	} else {
Mark Rutland Aug. 19, 2024, 5:29 p.m. UTC | #2
Hi Tong,

On Tue, May 28, 2024 at 04:59:11PM +0800, Tong Tiangen wrote:
> For the arm64 kernel, when it processes hardware memory errors for
> synchronize notifications(do_sea()), if the errors is consumed within the
> kernel, the current processing is panic. However, it is not optimal.
> 
> Take copy_from/to_user for example, If ld* triggers a memory error, even in
> kernel mode, only the associated process is affected. Killing the user
> process and isolating the corrupt page is a better choice.
> 
> New fixup type EX_TYPE_KACCESS_ERR_ZERO_ME_SAFE is added to identify insn
> that can recover from memory errors triggered by access to kernel memory.
> 
> Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>

Generally this looks ok, but I have a couple of comments below.

> ---
>  arch/arm64/Kconfig                   |  1 +
>  arch/arm64/include/asm/asm-extable.h | 31 +++++++++++++++++++++++-----
>  arch/arm64/include/asm/asm-uaccess.h |  4 ++++
>  arch/arm64/include/asm/extable.h     |  1 +
>  arch/arm64/lib/copy_to_user.S        | 10 ++++-----
>  arch/arm64/mm/extable.c              | 19 +++++++++++++++++
>  arch/arm64/mm/fault.c                | 27 +++++++++++++++++-------
>  7 files changed, 75 insertions(+), 18 deletions(-)
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 5d91259ee7b5..13ca06ddf3dd 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -20,6 +20,7 @@ config ARM64
>  	select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2
>  	select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
>  	select ARCH_HAS_CACHE_LINE_SIZE
> +	select ARCH_HAS_COPY_MC if ACPI_APEI_GHES
>  	select ARCH_HAS_CURRENT_STACK_POINTER
>  	select ARCH_HAS_DEBUG_VIRTUAL
>  	select ARCH_HAS_DEBUG_VM_PGTABLE
> diff --git a/arch/arm64/include/asm/asm-extable.h b/arch/arm64/include/asm/asm-extable.h
> index 980d1dd8e1a3..9c0664fe1eb1 100644
> --- a/arch/arm64/include/asm/asm-extable.h
> +++ b/arch/arm64/include/asm/asm-extable.h
> @@ -5,11 +5,13 @@
>  #include <linux/bits.h>
>  #include <asm/gpr-num.h>
>  
> -#define EX_TYPE_NONE			0
> -#define EX_TYPE_BPF			1
> -#define EX_TYPE_UACCESS_ERR_ZERO	2
> -#define EX_TYPE_KACCESS_ERR_ZERO	3
> -#define EX_TYPE_LOAD_UNALIGNED_ZEROPAD	4
> +#define EX_TYPE_NONE				0
> +#define EX_TYPE_BPF				1
> +#define EX_TYPE_UACCESS_ERR_ZERO		2
> +#define EX_TYPE_KACCESS_ERR_ZERO		3
> +#define EX_TYPE_LOAD_UNALIGNED_ZEROPAD		4
> +/* kernel access memory error safe */
> +#define EX_TYPE_KACCESS_ERR_ZERO_ME_SAFE	5

Could we please use 'MEM_ERR', and likewise for the macros below? That's
more obvious than 'ME_SAFE', and we wouldn't need the comment here.
Likewise elsewhere in this patch and the series.

To Jonathan's comment, I do prefer these numbers are aligned, so aside
from the naming, the diff above looks good.

>  
>  /* Data fields for EX_TYPE_UACCESS_ERR_ZERO */
>  #define EX_DATA_REG_ERR_SHIFT	0
> @@ -51,6 +53,17 @@
>  #define _ASM_EXTABLE_UACCESS(insn, fixup)				\
>  	_ASM_EXTABLE_UACCESS_ERR_ZERO(insn, fixup, wzr, wzr)
>  
> +#define _ASM_EXTABLE_KACCESS_ERR_ZERO_ME_SAFE(insn, fixup, err, zero)	\
> +	__ASM_EXTABLE_RAW(insn, fixup, 					\
> +			  EX_TYPE_KACCESS_ERR_ZERO_ME_SAFE,		\
> +			  (						\
> +			    EX_DATA_REG(ERR, err) |			\
> +			    EX_DATA_REG(ZERO, zero)			\
> +			  ))
> +
> +#define _ASM_EXTABLE_KACCESS_ME_SAFE(insn, fixup)			\
> +	_ASM_EXTABLE_KACCESS_ERR_ZERO_ME_SAFE(insn, fixup, wzr, wzr)
> +
>  /*
>   * Create an exception table entry for uaccess `insn`, which will branch to `fixup`
>   * when an unhandled fault is taken.
> @@ -69,6 +82,14 @@
>  	.endif
>  	.endm
>  
> +/*
> + * Create an exception table entry for kaccess me(memory error) safe `insn`, which
> + * will branch to `fixup` when an unhandled fault is taken.
> + */
> +	.macro          _asm_extable_kaccess_me_safe, insn, fixup
> +	_ASM_EXTABLE_KACCESS_ME_SAFE(\insn, \fixup)
> +	.endm
> +

With the naming above, I think this can be:

| /*
|  * Create an exception table entry for kaccess `insn`, which will branch to
|  * `fixup` when a memory error is taken
|  */
| 	.macro		_asm_extable_kaccess_mem_err, insn, fixup
| 	_ASM_EXTABLE_KACCESS_MEM_ERR(\insn, \fixup)
| 	.endm

>  #else /* __ASSEMBLY__ */
>  
>  #include <linux/stringify.h>
> diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h
> index 5b6efe8abeeb..7bbebfa5b710 100644
> --- a/arch/arm64/include/asm/asm-uaccess.h
> +++ b/arch/arm64/include/asm/asm-uaccess.h
> @@ -57,6 +57,10 @@ alternative_else_nop_endif
>  	.endm
>  #endif
>  
> +#define KERNEL_ME_SAFE(l, x...)			\
> +9999:	x;					\
> +	_asm_extable_kaccess_me_safe	9999b, l
> +
>  #define USER(l, x...)				\
>  9999:	x;					\
>  	_asm_extable_uaccess	9999b, l
> diff --git a/arch/arm64/include/asm/extable.h b/arch/arm64/include/asm/extable.h
> index 72b0e71cc3de..bc49443bc502 100644
> --- a/arch/arm64/include/asm/extable.h
> +++ b/arch/arm64/include/asm/extable.h
> @@ -46,4 +46,5 @@ bool ex_handler_bpf(const struct exception_table_entry *ex,
>  #endif /* !CONFIG_BPF_JIT */
>  
>  bool fixup_exception(struct pt_regs *regs);
> +bool fixup_exception_me(struct pt_regs *regs);
>  #endif
> diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
> index 802231772608..2ac716c0d6d8 100644
> --- a/arch/arm64/lib/copy_to_user.S
> +++ b/arch/arm64/lib/copy_to_user.S
> @@ -20,7 +20,7 @@
>   *	x0 - bytes not copied
>   */
>  	.macro ldrb1 reg, ptr, val
> -	ldrb  \reg, [\ptr], \val
> +	KERNEL_ME_SAFE(9998f, ldrb  \reg, [\ptr], \val)
>  	.endm
>  
>  	.macro strb1 reg, ptr, val
> @@ -28,7 +28,7 @@
>  	.endm
>  
>  	.macro ldrh1 reg, ptr, val
> -	ldrh  \reg, [\ptr], \val
> +	KERNEL_ME_SAFE(9998f, ldrh  \reg, [\ptr], \val)
>  	.endm
>  
>  	.macro strh1 reg, ptr, val
> @@ -36,7 +36,7 @@
>  	.endm
>  
>  	.macro ldr1 reg, ptr, val
> -	ldr \reg, [\ptr], \val
> +	KERNEL_ME_SAFE(9998f, ldr \reg, [\ptr], \val)
>  	.endm
>  
>  	.macro str1 reg, ptr, val
> @@ -44,7 +44,7 @@
>  	.endm
>  
>  	.macro ldp1 reg1, reg2, ptr, val
> -	ldp \reg1, \reg2, [\ptr], \val
> +	KERNEL_ME_SAFE(9998f, ldp \reg1, \reg2, [\ptr], \val)
>  	.endm
>  
>  	.macro stp1 reg1, reg2, ptr, val

These changes mean that regular copy_to_user() will handle kernel memory
errors, rather than only doing that in copy_mc_to_user(). If that's
intentional, please call that out explicitly in the commit message.

> @@ -64,7 +64,7 @@ SYM_FUNC_START(__arch_copy_to_user)
>  9997:	cmp	dst, dstin
>  	b.ne	9998f
>  	// Before being absolutely sure we couldn't copy anything, try harder
> -	ldrb	tmp1w, [srcin]
> +KERNEL_ME_SAFE(9998f, ldrb	tmp1w, [srcin])
>  USER(9998f, sttrb tmp1w, [dst])
>  	add	dst, dst, #1
>  9998:	sub	x0, end, dst			// bytes not copied

Same comment as above.

> diff --git a/arch/arm64/mm/extable.c b/arch/arm64/mm/extable.c
> index 228d681a8715..8c690ae61944 100644
> --- a/arch/arm64/mm/extable.c
> +++ b/arch/arm64/mm/extable.c
> @@ -72,7 +72,26 @@ bool fixup_exception(struct pt_regs *regs)
>  		return ex_handler_uaccess_err_zero(ex, regs);
>  	case EX_TYPE_LOAD_UNALIGNED_ZEROPAD:
>  		return ex_handler_load_unaligned_zeropad(ex, regs);
> +	case EX_TYPE_KACCESS_ERR_ZERO_ME_SAFE:
> +		return false;
>  	}
>  
>  	BUG();
>  }
> +
> +bool fixup_exception_me(struct pt_regs *regs)
> +{
> +	const struct exception_table_entry *ex;
> +
> +	ex = search_exception_tables(instruction_pointer(regs));
> +	if (!ex)
> +		return false;
> +
> +	switch (ex->type) {
> +	case EX_TYPE_UACCESS_ERR_ZERO:
> +	case EX_TYPE_KACCESS_ERR_ZERO_ME_SAFE:
> +		return ex_handler_uaccess_err_zero(ex, regs);
> +	}
> +
> +	return false;
> +}
> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
> index 451ba7cbd5ad..2dc65f99d389 100644
> --- a/arch/arm64/mm/fault.c
> +++ b/arch/arm64/mm/fault.c
> @@ -708,21 +708,32 @@ static int do_bad(unsigned long far, unsigned long esr, struct pt_regs *regs)
>  	return 1; /* "fault" */
>  }
>  
> +/*
> + * APEI claimed this as a firmware-first notification.
> + * Some processing deferred to task_work before ret_to_user().
> + */
> +static bool do_apei_claim_sea(struct pt_regs *regs)
> +{
> +	if (user_mode(regs)) {
> +		if (!apei_claim_sea(regs))
> +			return true;
> +	} else if (IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC)) {
> +		if (fixup_exception_me(regs) && !apei_claim_sea(regs))
> +			return true;
> +	}
> +
> +	return false;
> +}

Hmm... that'll fixup the exception even if we don't manage to claim a
the SEA. I suspect this should probably be:

static bool do_apei_claim_sea(struct pt_regs *regs)
{
	if (apei_claim_sea(regs))
		return false;
	if (user_mode(regs))
		return true;
	if (IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC))
		return !fixup_excepton_mem_err(regs);
	
	return false;
}

... unless we *don't* want to claim the SEA in the case we don't have a
fixup?

Mark.

> +
>  static int do_sea(unsigned long far, unsigned long esr, struct pt_regs *regs)
>  {
>  	const struct fault_info *inf;
>  	unsigned long siaddr;
>  
> -	inf = esr_to_fault_info(esr);
> -
> -	if (user_mode(regs) && apei_claim_sea(regs) == 0) {
> -		/*
> -		 * APEI claimed this as a firmware-first notification.
> -		 * Some processing deferred to task_work before ret_to_user().
> -		 */
> +	if (do_apei_claim_sea(regs))
>  		return 0;
> -	}
>  
> +	inf = esr_to_fault_info(esr);
>  	if (esr & ESR_ELx_FnV) {
>  		siaddr = 0;
>  	} else {
> -- 
> 2.25.1
> 
>
Tong Tiangen Aug. 20, 2024, 2:11 a.m. UTC | #3
在 2024/8/20 1:29, Mark Rutland 写道:
> Hi Tong,
> 
> On Tue, May 28, 2024 at 04:59:11PM +0800, Tong Tiangen wrote:
>> For the arm64 kernel, when it processes hardware memory errors for
>> synchronize notifications(do_sea()), if the errors is consumed within the
>> kernel, the current processing is panic. However, it is not optimal.
>>
>> Take copy_from/to_user for example, If ld* triggers a memory error, even in
>> kernel mode, only the associated process is affected. Killing the user
>> process and isolating the corrupt page is a better choice.
>>
>> New fixup type EX_TYPE_KACCESS_ERR_ZERO_ME_SAFE is added to identify insn
>> that can recover from memory errors triggered by access to kernel memory.
>>
>> Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
> 
> Generally this looks ok, but I have a couple of comments below.
> 
>> ---
>>   arch/arm64/Kconfig                   |  1 +
>>   arch/arm64/include/asm/asm-extable.h | 31 +++++++++++++++++++++++-----
>>   arch/arm64/include/asm/asm-uaccess.h |  4 ++++
>>   arch/arm64/include/asm/extable.h     |  1 +
>>   arch/arm64/lib/copy_to_user.S        | 10 ++++-----
>>   arch/arm64/mm/extable.c              | 19 +++++++++++++++++
>>   arch/arm64/mm/fault.c                | 27 +++++++++++++++++-------
>>   7 files changed, 75 insertions(+), 18 deletions(-)
>>
>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>> index 5d91259ee7b5..13ca06ddf3dd 100644
>> --- a/arch/arm64/Kconfig
>> +++ b/arch/arm64/Kconfig
>> @@ -20,6 +20,7 @@ config ARM64
>>   	select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2
>>   	select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
>>   	select ARCH_HAS_CACHE_LINE_SIZE
>> +	select ARCH_HAS_COPY_MC if ACPI_APEI_GHES
>>   	select ARCH_HAS_CURRENT_STACK_POINTER
>>   	select ARCH_HAS_DEBUG_VIRTUAL
>>   	select ARCH_HAS_DEBUG_VM_PGTABLE
>> diff --git a/arch/arm64/include/asm/asm-extable.h b/arch/arm64/include/asm/asm-extable.h
>> index 980d1dd8e1a3..9c0664fe1eb1 100644
>> --- a/arch/arm64/include/asm/asm-extable.h
>> +++ b/arch/arm64/include/asm/asm-extable.h
>> @@ -5,11 +5,13 @@
>>   #include <linux/bits.h>
>>   #include <asm/gpr-num.h>
>>   
>> -#define EX_TYPE_NONE			0
>> -#define EX_TYPE_BPF			1
>> -#define EX_TYPE_UACCESS_ERR_ZERO	2
>> -#define EX_TYPE_KACCESS_ERR_ZERO	3
>> -#define EX_TYPE_LOAD_UNALIGNED_ZEROPAD	4
>> +#define EX_TYPE_NONE				0
>> +#define EX_TYPE_BPF				1
>> +#define EX_TYPE_UACCESS_ERR_ZERO		2
>> +#define EX_TYPE_KACCESS_ERR_ZERO		3
>> +#define EX_TYPE_LOAD_UNALIGNED_ZEROPAD		4
>> +/* kernel access memory error safe */
>> +#define EX_TYPE_KACCESS_ERR_ZERO_ME_SAFE	5
> 
> Could we please use 'MEM_ERR', and likewise for the macros below? That's
> more obvious than 'ME_SAFE', and we wouldn't need the comment here.
> Likewise elsewhere in this patch and the series.
> 
> To Jonathan's comment, I do prefer these numbers are aligned, so aside
> from the naming, the diff above looks good.

OK, I also modified other locations to use 'MEM_ERR'.

> 
>>   
>>   /* Data fields for EX_TYPE_UACCESS_ERR_ZERO */
>>   #define EX_DATA_REG_ERR_SHIFT	0
>> @@ -51,6 +53,17 @@
>>   #define _ASM_EXTABLE_UACCESS(insn, fixup)				\
>>   	_ASM_EXTABLE_UACCESS_ERR_ZERO(insn, fixup, wzr, wzr)
>>   
>> +#define _ASM_EXTABLE_KACCESS_ERR_ZERO_ME_SAFE(insn, fixup, err, zero)	\
>> +	__ASM_EXTABLE_RAW(insn, fixup, 					\
>> +			  EX_TYPE_KACCESS_ERR_ZERO_ME_SAFE,		\
>> +			  (						\
>> +			    EX_DATA_REG(ERR, err) |			\
>> +			    EX_DATA_REG(ZERO, zero)			\
>> +			  ))
>> +
>> +#define _ASM_EXTABLE_KACCESS_ME_SAFE(insn, fixup)			\
>> +	_ASM_EXTABLE_KACCESS_ERR_ZERO_ME_SAFE(insn, fixup, wzr, wzr)
>> +
>>   /*
>>    * Create an exception table entry for uaccess `insn`, which will branch to `fixup`
>>    * when an unhandled fault is taken.
>> @@ -69,6 +82,14 @@
>>   	.endif
>>   	.endm
>>   
>> +/*
>> + * Create an exception table entry for kaccess me(memory error) safe `insn`, which
>> + * will branch to `fixup` when an unhandled fault is taken.
>> + */
>> +	.macro          _asm_extable_kaccess_me_safe, insn, fixup
>> +	_ASM_EXTABLE_KACCESS_ME_SAFE(\insn, \fixup)
>> +	.endm
>> +
> 
> With the naming above, I think this can be:
> 
> | /*
> |  * Create an exception table entry for kaccess `insn`, which will branch to
> |  * `fixup` when a memory error is taken
> |  */
> | 	.macro		_asm_extable_kaccess_mem_err, insn, fixup
> | 	_ASM_EXTABLE_KACCESS_MEM_ERR(\insn, \fixup)
> | 	.endm
> 

OK, will be fixed next version.

>>   #else /* __ASSEMBLY__ */
>>   
>>   #include <linux/stringify.h>
>> diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h
>> index 5b6efe8abeeb..7bbebfa5b710 100644
>> --- a/arch/arm64/include/asm/asm-uaccess.h
>> +++ b/arch/arm64/include/asm/asm-uaccess.h
>> @@ -57,6 +57,10 @@ alternative_else_nop_endif
>>   	.endm
>>   #endif
>>   
>> +#define KERNEL_ME_SAFE(l, x...)			\
>> +9999:	x;					\
>> +	_asm_extable_kaccess_me_safe	9999b, l
>> +
>>   #define USER(l, x...)				\
>>   9999:	x;					\
>>   	_asm_extable_uaccess	9999b, l
>> diff --git a/arch/arm64/include/asm/extable.h b/arch/arm64/include/asm/extable.h
>> index 72b0e71cc3de..bc49443bc502 100644
>> --- a/arch/arm64/include/asm/extable.h
>> +++ b/arch/arm64/include/asm/extable.h
>> @@ -46,4 +46,5 @@ bool ex_handler_bpf(const struct exception_table_entry *ex,
>>   #endif /* !CONFIG_BPF_JIT */
>>   
>>   bool fixup_exception(struct pt_regs *regs);
>> +bool fixup_exception_me(struct pt_regs *regs);
>>   #endif
>> diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
>> index 802231772608..2ac716c0d6d8 100644
>> --- a/arch/arm64/lib/copy_to_user.S
>> +++ b/arch/arm64/lib/copy_to_user.S
>> @@ -20,7 +20,7 @@
>>    *	x0 - bytes not copied
>>    */
>>   	.macro ldrb1 reg, ptr, val
>> -	ldrb  \reg, [\ptr], \val
>> +	KERNEL_ME_SAFE(9998f, ldrb  \reg, [\ptr], \val)
>>   	.endm
>>   
>>   	.macro strb1 reg, ptr, val
>> @@ -28,7 +28,7 @@
>>   	.endm
>>   
>>   	.macro ldrh1 reg, ptr, val
>> -	ldrh  \reg, [\ptr], \val
>> +	KERNEL_ME_SAFE(9998f, ldrh  \reg, [\ptr], \val)
>>   	.endm
>>   
>>   	.macro strh1 reg, ptr, val
>> @@ -36,7 +36,7 @@
>>   	.endm
>>   
>>   	.macro ldr1 reg, ptr, val
>> -	ldr \reg, [\ptr], \val
>> +	KERNEL_ME_SAFE(9998f, ldr \reg, [\ptr], \val)
>>   	.endm
>>   
>>   	.macro str1 reg, ptr, val
>> @@ -44,7 +44,7 @@
>>   	.endm
>>   
>>   	.macro ldp1 reg1, reg2, ptr, val
>> -	ldp \reg1, \reg2, [\ptr], \val
>> +	KERNEL_ME_SAFE(9998f, ldp \reg1, \reg2, [\ptr], \val)
>>   	.endm
>>   
>>   	.macro stp1 reg1, reg2, ptr, val
> 
> These changes mean that regular copy_to_user() will handle kernel memory
> errors, rather than only doing that in copy_mc_to_user(). If that's
> intentional, please call that out explicitly in the commit message.

Yes. This is the purpose of the modification. If the copy_to_user()
function encounters a memory error, this uaccess affects only the
current process. and only need to kill the current process instead of
the entire kernel panic. Do not add copy_mc_to_user() so that
copy_to_user() can process memory errors.

I'll add a description in the commit msg next version.


> 
>> @@ -64,7 +64,7 @@ SYM_FUNC_START(__arch_copy_to_user)
>>   9997:	cmp	dst, dstin
>>   	b.ne	9998f
>>   	// Before being absolutely sure we couldn't copy anything, try harder
>> -	ldrb	tmp1w, [srcin]
>> +KERNEL_ME_SAFE(9998f, ldrb	tmp1w, [srcin])
>>   USER(9998f, sttrb tmp1w, [dst])
>>   	add	dst, dst, #1
>>   9998:	sub	x0, end, dst			// bytes not copied
> 
> Same comment as above.
> 
>> diff --git a/arch/arm64/mm/extable.c b/arch/arm64/mm/extable.c
>> index 228d681a8715..8c690ae61944 100644
>> --- a/arch/arm64/mm/extable.c
>> +++ b/arch/arm64/mm/extable.c
>> @@ -72,7 +72,26 @@ bool fixup_exception(struct pt_regs *regs)
>>   		return ex_handler_uaccess_err_zero(ex, regs);
>>   	case EX_TYPE_LOAD_UNALIGNED_ZEROPAD:
>>   		return ex_handler_load_unaligned_zeropad(ex, regs);
>> +	case EX_TYPE_KACCESS_ERR_ZERO_ME_SAFE:
>> +		return false;
>>   	}
>>   
>>   	BUG();
>>   }
>> +
>> +bool fixup_exception_me(struct pt_regs *regs)
>> +{
>> +	const struct exception_table_entry *ex;
>> +
>> +	ex = search_exception_tables(instruction_pointer(regs));
>> +	if (!ex)
>> +		return false;
>> +
>> +	switch (ex->type) {
>> +	case EX_TYPE_UACCESS_ERR_ZERO:
>> +	case EX_TYPE_KACCESS_ERR_ZERO_ME_SAFE:
>> +		return ex_handler_uaccess_err_zero(ex, regs);
>> +	}
>> +
>> +	return false;
>> +}
>> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
>> index 451ba7cbd5ad..2dc65f99d389 100644
>> --- a/arch/arm64/mm/fault.c
>> +++ b/arch/arm64/mm/fault.c
>> @@ -708,21 +708,32 @@ static int do_bad(unsigned long far, unsigned long esr, struct pt_regs *regs)
>>   	return 1; /* "fault" */
>>   }
>>   
>> +/*
>> + * APEI claimed this as a firmware-first notification.
>> + * Some processing deferred to task_work before ret_to_user().
>> + */
>> +static bool do_apei_claim_sea(struct pt_regs *regs)
>> +{
>> +	if (user_mode(regs)) {
>> +		if (!apei_claim_sea(regs))
>> +			return true;
>> +	} else if (IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC)) {
>> +		if (fixup_exception_me(regs) && !apei_claim_sea(regs))
>> +			return true;
>> +	}
>> +
>> +	return false;
>> +}
> 
> Hmm... that'll fixup the exception even if we don't manage to claim a
> the SEA. I suspect this should probably be:
> 
> static bool do_apei_claim_sea(struct pt_regs *regs)
> {
> 	if (apei_claim_sea(regs))
> 		return false;
> 	if (user_mode(regs))
> 		return true;
> 	if (IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC))
> 		return !fixup_excepton_mem_err(regs);
> 	
> 	return false;
> }
> 
> ... unless we *don't* want to claim the SEA in the case we don't have a
> fixup?
> 
> Mark.
> 

Yes. My original meaning here is that if not have fixup, panic is
performed in do_sea() according to the original logic, and claim sea is
not required.

Thanks,
Tong.

>> +
>>   static int do_sea(unsigned long far, unsigned long esr, struct pt_regs *regs)
>>   {
>>   	const struct fault_info *inf;
>>   	unsigned long siaddr;
>>   
>> -	inf = esr_to_fault_info(esr);
>> -
>> -	if (user_mode(regs) && apei_claim_sea(regs) == 0) {
>> -		/*
>> -		 * APEI claimed this as a firmware-first notification.
>> -		 * Some processing deferred to task_work before ret_to_user().
>> -		 */
>> +	if (do_apei_claim_sea(regs))
>>   		return 0;
>> -	}
>>   
>> +	inf = esr_to_fault_info(esr);
>>   	if (esr & ESR_ELx_FnV) {
>>   		siaddr = 0;
>>   	} else {
>> -- 
>> 2.25.1
>>
>>
> 
> .
Tong Tiangen Aug. 20, 2024, 2:43 a.m. UTC | #4
在 2024/8/19 18:30, Jonathan Cameron 写道:
> On Tue, 28 May 2024 16:59:11 +0800
> Tong Tiangen <tongtiangen@huawei.com> wrote:
> 
>> For the arm64 kernel, when it processes hardware memory errors for
>> synchronize notifications(do_sea()), if the errors is consumed within the
>> kernel, the current processing is panic. However, it is not optimal.
>>
>> Take copy_from/to_user for example, If ld* triggers a memory error, even in
>> kernel mode, only the associated process is affected. Killing the user
>> process and isolating the corrupt page is a better choice.
>>
>> New fixup type EX_TYPE_KACCESS_ERR_ZERO_ME_SAFE is added to identify insn
>> that can recover from memory errors triggered by access to kernel memory.
>>
>> Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
> 
> Hi - this is going slow :(
> 
> A few comments inline in the meantime but this really needs ARM maintainers
> to take a (hopefully final) look.
> 
> Jonathan
> 
> 
>> diff --git a/arch/arm64/include/asm/asm-extable.h b/arch/arm64/include/asm/asm-extable.h
>> index 980d1dd8e1a3..9c0664fe1eb1 100644
>> --- a/arch/arm64/include/asm/asm-extable.h
>> +++ b/arch/arm64/include/asm/asm-extable.h
>> @@ -5,11 +5,13 @@
>>   #include <linux/bits.h>
>>   #include <asm/gpr-num.h>
>>   
>> -#define EX_TYPE_NONE			0
>> -#define EX_TYPE_BPF			1
>> -#define EX_TYPE_UACCESS_ERR_ZERO	2
>> -#define EX_TYPE_KACCESS_ERR_ZERO	3
>> -#define EX_TYPE_LOAD_UNALIGNED_ZEROPAD	4
>> +#define EX_TYPE_NONE				0
>> +#define EX_TYPE_BPF				1
>> +#define EX_TYPE_UACCESS_ERR_ZERO		2
>> +#define EX_TYPE_KACCESS_ERR_ZERO		3
>> +#define EX_TYPE_LOAD_UNALIGNED_ZEROPAD		4
>> +/* kernel access memory error safe */
>> +#define EX_TYPE_KACCESS_ERR_ZERO_ME_SAFE	5
> 
> Does anyone care enough about the alignment to bother realigning for one
> long line? I'd be tempted not to bother, but up to maintainers.
> 
> 
>> diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
>> index 802231772608..2ac716c0d6d8 100644
>> --- a/arch/arm64/lib/copy_to_user.S
>> +++ b/arch/arm64/lib/copy_to_user.S
>> @@ -20,7 +20,7 @@
>>    *	x0 - bytes not copied
>>    */
>>   	.macro ldrb1 reg, ptr, val
>> -	ldrb  \reg, [\ptr], \val
>> +	KERNEL_ME_SAFE(9998f, ldrb  \reg, [\ptr], \val)
>>   	.endm
>>   
>>   	.macro strb1 reg, ptr, val
>> @@ -28,7 +28,7 @@
>>   	.endm
>>   
>>   	.macro ldrh1 reg, ptr, val
>> -	ldrh  \reg, [\ptr], \val
>> +	KERNEL_ME_SAFE(9998f, ldrh  \reg, [\ptr], \val)
>>   	.endm
>>   
>>   	.macro strh1 reg, ptr, val
>> @@ -36,7 +36,7 @@
>>   	.endm
>>   
>>   	.macro ldr1 reg, ptr, val
>> -	ldr \reg, [\ptr], \val
>> +	KERNEL_ME_SAFE(9998f, ldr \reg, [\ptr], \val)
>>   	.endm
>>   
>>   	.macro str1 reg, ptr, val
>> @@ -44,7 +44,7 @@
>>   	.endm
>>   
>>   	.macro ldp1 reg1, reg2, ptr, val
>> -	ldp \reg1, \reg2, [\ptr], \val
>> +	KERNEL_ME_SAFE(9998f, ldp \reg1, \reg2, [\ptr], \val)
>>   	.endm
>>   
>>   	.macro stp1 reg1, reg2, ptr, val
>> @@ -64,7 +64,7 @@ SYM_FUNC_START(__arch_copy_to_user)
>>   9997:	cmp	dst, dstin
>>   	b.ne	9998f
>>   	// Before being absolutely sure we couldn't copy anything, try harder
>> -	ldrb	tmp1w, [srcin]
>> +KERNEL_ME_SAFE(9998f, ldrb	tmp1w, [srcin])
> 
> Alignment looks off?

Hi, Jonathan:

How about we change this in conjunction with mark's suggestion? :)

> 
>>   USER(9998f, sttrb tmp1w, [dst])
>>   	add	dst, dst, #1
>>   9998:	sub	x0, end, dst			// bytes not copied
> 
> 
> 
>> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
>> index 451ba7cbd5ad..2dc65f99d389 100644
>> --- a/arch/arm64/mm/fault.c
>> +++ b/arch/arm64/mm/fault.c
>> @@ -708,21 +708,32 @@ static int do_bad(unsigned long far, unsigned long esr, struct pt_regs *regs)
>>   	return 1; /* "fault" */
>>   }
>>   
>> +/*
>> + * APEI claimed this as a firmware-first notification.
>> + * Some processing deferred to task_work before ret_to_user().
>> + */
>> +static bool do_apei_claim_sea(struct pt_regs *regs)
>> +{
>> +	if (user_mode(regs)) {
>> +		if (!apei_claim_sea(regs))
> 
> I'd keep to the the (apei_claim_sea(regs) == 0)
> used in the original code. That hints to the reader that we are
> interested here in an 'error' code rather than apei_claim_sea() returning
> a bool.   I initially wondered why we return true when the code
> fails to claim it.
> 
> Also, perhaps if you return 0 for success and an error code if not
> you could just make this
> 
> 	if (user_mode(regs))
> 		return apei_claim_sea(regs);
> 
> 	if (IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC)) {
> 		if (fixup_exception_me(regs)) {
> 			return apei_claim_sea(regs);
> 		}
> 	}
> 
> 	return false;
> 
> or maybe even (I may have messed this up, but I think this logic
> works).
> 
> 	if (!user_mode(regs) && IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC)) {
> 		if (!fixup_exception_me(regs))
> 			return false;
> 	}
> 	return apei_claim_sea(regs);
> 
> 
>> +			return true;
>> +	} else if (IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC)) {
>> +		if (fixup_exception_me(regs) && !apei_claim_sea(regs))
> 
> Same here with using apei_claim_sea(regs) == 0 so it's obvious we
> are checking for an error, not a boolean.
> 
>> +			return true;
>> +	}
>> +
>> +	return false;
>> +}
>> +
>>   static int do_sea(unsigned long far, unsigned long esr, struct pt_regs *regs)
>>   {
>>   	const struct fault_info *inf;
>>   	unsigned long siaddr;
>>   
>> -	inf = esr_to_fault_info(esr);
>> -
>> -	if (user_mode(regs) && apei_claim_sea(regs) == 0) {
>> -		/*
>> -		 * APEI claimed this as a firmware-first notification.
>> -		 * Some processing deferred to task_work before ret_to_user().
>> -		 */
>> +	if (do_apei_claim_sea(regs))
> 
> It might be made sense to factor this out first, then could be reviewed
> as a noop before the new stuff is added.  Still it's not much code, so doesn't
> really matter.
> Might be worth keeping to returning 0 for success, error code
> otherwise as per apei_claim_sea(regs)
> 
> The bool returning functions in the nearby code tend to be is_xxxx
> not things that succeed or not.
> 
> If you change it to return int make this
> 	if (do_apei_claim_sea(regs) == 0)
> so it's obvious this is the no error case.
> 

My fault, treating the return value of apei_claim_sea() as bool has
caused some confusion. Perhaps using "== 0" can reduce this confuse.

Here's the change:

	static int do_apei_claim_sea(struct pt_regs *regs)
	{
		if (!user_mode(regs) && IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC)) {
			if (!fixup_exception_me(regs)))
				return -ENOENT;
		}
		
		return apei_claim_sea(regs);
	}
	
	static int do_sea(...)
	{
		[...]
		if (do_apei_claim_sea(regs) == 0)
			return 0;
		[...]
	}

I'll modify it later with the comments of mark.

Thanks,
Tong.


>>   		return 0;
>> -	}
>>   
>> +	inf = esr_to_fault_info(esr);
>>   	if (esr & ESR_ELx_FnV) {
>>   		siaddr = 0;
>>   	} else {
> 
> .
Mark Rutland Aug. 20, 2024, 9:12 a.m. UTC | #5
On Tue, Aug 20, 2024 at 10:11:45AM +0800, Tong Tiangen wrote:
> 在 2024/8/20 1:29, Mark Rutland 写道:
> > Hi Tong,
> > 
> > On Tue, May 28, 2024 at 04:59:11PM +0800, Tong Tiangen wrote:
> > > For the arm64 kernel, when it processes hardware memory errors for
> > > synchronize notifications(do_sea()), if the errors is consumed within the
> > > kernel, the current processing is panic. However, it is not optimal.
> > > 
> > > Take copy_from/to_user for example, If ld* triggers a memory error, even in
> > > kernel mode, only the associated process is affected. Killing the user
> > > process and isolating the corrupt page is a better choice.
> > > 
> > > New fixup type EX_TYPE_KACCESS_ERR_ZERO_ME_SAFE is added to identify insn
> > > that can recover from memory errors triggered by access to kernel memory.
> > > 
> > > Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>

[...]

> > > diff --git a/arch/arm64/include/asm/asm-extable.h b/arch/arm64/include/asm/asm-extable.h
> > > index 980d1dd8e1a3..9c0664fe1eb1 100644
> > > --- a/arch/arm64/include/asm/asm-extable.h
> > > +++ b/arch/arm64/include/asm/asm-extable.h
> > > @@ -5,11 +5,13 @@
> > >   #include <linux/bits.h>
> > >   #include <asm/gpr-num.h>
> > > -#define EX_TYPE_NONE			0
> > > -#define EX_TYPE_BPF			1
> > > -#define EX_TYPE_UACCESS_ERR_ZERO	2
> > > -#define EX_TYPE_KACCESS_ERR_ZERO	3
> > > -#define EX_TYPE_LOAD_UNALIGNED_ZEROPAD	4
> > > +#define EX_TYPE_NONE				0
> > > +#define EX_TYPE_BPF				1
> > > +#define EX_TYPE_UACCESS_ERR_ZERO		2
> > > +#define EX_TYPE_KACCESS_ERR_ZERO		3
> > > +#define EX_TYPE_LOAD_UNALIGNED_ZEROPAD		4
> > > +/* kernel access memory error safe */
> > > +#define EX_TYPE_KACCESS_ERR_ZERO_ME_SAFE	5
> > 
> > Could we please use 'MEM_ERR', and likewise for the macros below? That's
> > more obvious than 'ME_SAFE', and we wouldn't need the comment here.
> > Likewise elsewhere in this patch and the series.
> > 
> > To Jonathan's comment, I do prefer these numbers are aligned, so aside
> > from the naming, the diff above looks good.
> 
> OK, I also modified other locations to use 'MEM_ERR'.

Thanks!

[...]

> > > diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
> > > index 802231772608..2ac716c0d6d8 100644
> > > --- a/arch/arm64/lib/copy_to_user.S
> > > +++ b/arch/arm64/lib/copy_to_user.S
> > > @@ -20,7 +20,7 @@
> > >    *	x0 - bytes not copied
> > >    */
> > >   	.macro ldrb1 reg, ptr, val
> > > -	ldrb  \reg, [\ptr], \val
> > > +	KERNEL_ME_SAFE(9998f, ldrb  \reg, [\ptr], \val)
> > >   	.endm
> > >   	.macro strb1 reg, ptr, val
> > > @@ -28,7 +28,7 @@
> > >   	.endm
> > >   	.macro ldrh1 reg, ptr, val
> > > -	ldrh  \reg, [\ptr], \val
> > > +	KERNEL_ME_SAFE(9998f, ldrh  \reg, [\ptr], \val)
> > >   	.endm
> > >   	.macro strh1 reg, ptr, val
> > > @@ -36,7 +36,7 @@
> > >   	.endm
> > >   	.macro ldr1 reg, ptr, val
> > > -	ldr \reg, [\ptr], \val
> > > +	KERNEL_ME_SAFE(9998f, ldr \reg, [\ptr], \val)
> > >   	.endm
> > >   	.macro str1 reg, ptr, val
> > > @@ -44,7 +44,7 @@
> > >   	.endm
> > >   	.macro ldp1 reg1, reg2, ptr, val
> > > -	ldp \reg1, \reg2, [\ptr], \val
> > > +	KERNEL_ME_SAFE(9998f, ldp \reg1, \reg2, [\ptr], \val)
> > >   	.endm
> > >   	.macro stp1 reg1, reg2, ptr, val
> > 
> > These changes mean that regular copy_to_user() will handle kernel memory
> > errors, rather than only doing that in copy_mc_to_user(). If that's
> > intentional, please call that out explicitly in the commit message.
> 
> Yes. This is the purpose of the modification. If the copy_to_user()
> function encounters a memory error, this uaccess affects only the
> current process. and only need to kill the current process instead of
> the entire kernel panic. Do not add copy_mc_to_user() so that
> copy_to_user() can process memory errors.
> 
> I'll add a description in the commit msg next version.

Ok; why do powerpc and x86 have separate copy_mc_to_user()
implementations, then?

[...]

> > > +/*
> > > + * APEI claimed this as a firmware-first notification.
> > > + * Some processing deferred to task_work before ret_to_user().
> > > + */
> > > +static bool do_apei_claim_sea(struct pt_regs *regs)
> > > +{
> > > +	if (user_mode(regs)) {
> > > +		if (!apei_claim_sea(regs))
> > > +			return true;
> > > +	} else if (IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC)) {
> > > +		if (fixup_exception_me(regs) && !apei_claim_sea(regs))
> > > +			return true;
> > > +	}
> > > +
> > > +	return false;
> > > +}
> > 
> > Hmm... that'll fixup the exception even if we don't manage to claim a
> > the SEA. I suspect this should probably be:
> > 
> > static bool do_apei_claim_sea(struct pt_regs *regs)
> > {
> > 	if (apei_claim_sea(regs))
> > 		return false;
> > 	if (user_mode(regs))
> > 		return true;
> > 	if (IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC))
> > 		return !fixup_excepton_mem_err(regs);
> > 	
> > 	return false;
> > }
> > 
> > ... unless we *don't* want to claim the SEA in the case we don't have a
> > fixup?
> > 
> > Mark.
> > 
> 
> Yes. My original meaning here is that if not have fixup, panic is
> performed in do_sea() according to the original logic, and claim sea is
> not required.

AFAICT my suggestion doesn't change that; if we don't have a fixup the
proprosed do_apei_claim_sea() would return false, and so do_sea() would
caryy on to arm64_notify_die(...).

I'm specifically asking if we need to avoid calling apei_claim_sea()
when we don't have a fixup handler, or if calling that would be fine.

One important thing is that if apei_claim_sea() fails to claim the SEA,
we'd like to panic(), and in that case it'd be good to have not applied
the fixup handler, so that the pt_regs::pc shows where the fault was
taken from.

Mark.
Tong Tiangen Aug. 20, 2024, 1:26 p.m. UTC | #6
在 2024/8/20 17:12, Mark Rutland 写道:
> On Tue, Aug 20, 2024 at 10:11:45AM +0800, Tong Tiangen wrote:
>> 在 2024/8/20 1:29, Mark Rutland 写道:
>>> Hi Tong,
>>>
>>> On Tue, May 28, 2024 at 04:59:11PM +0800, Tong Tiangen wrote:
>>>> For the arm64 kernel, when it processes hardware memory errors for
>>>> synchronize notifications(do_sea()), if the errors is consumed within the
>>>> kernel, the current processing is panic. However, it is not optimal.
>>>>
>>>> Take copy_from/to_user for example, If ld* triggers a memory error, even in
>>>> kernel mode, only the associated process is affected. Killing the user
>>>> process and isolating the corrupt page is a better choice.
>>>>
>>>> New fixup type EX_TYPE_KACCESS_ERR_ZERO_ME_SAFE is added to identify insn
>>>> that can recover from memory errors triggered by access to kernel memory.
>>>>
>>>> Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
> 
> [...]
> 
>>>> diff --git a/arch/arm64/include/asm/asm-extable.h b/arch/arm64/include/asm/asm-extable.h
>>>> index 980d1dd8e1a3..9c0664fe1eb1 100644
>>>> --- a/arch/arm64/include/asm/asm-extable.h
>>>> +++ b/arch/arm64/include/asm/asm-extable.h
>>>> @@ -5,11 +5,13 @@
>>>>    #include <linux/bits.h>
>>>>    #include <asm/gpr-num.h>
>>>> -#define EX_TYPE_NONE			0
>>>> -#define EX_TYPE_BPF			1
>>>> -#define EX_TYPE_UACCESS_ERR_ZERO	2
>>>> -#define EX_TYPE_KACCESS_ERR_ZERO	3
>>>> -#define EX_TYPE_LOAD_UNALIGNED_ZEROPAD	4
>>>> +#define EX_TYPE_NONE				0
>>>> +#define EX_TYPE_BPF				1
>>>> +#define EX_TYPE_UACCESS_ERR_ZERO		2
>>>> +#define EX_TYPE_KACCESS_ERR_ZERO		3
>>>> +#define EX_TYPE_LOAD_UNALIGNED_ZEROPAD		4
>>>> +/* kernel access memory error safe */
>>>> +#define EX_TYPE_KACCESS_ERR_ZERO_ME_SAFE	5
>>>
>>> Could we please use 'MEM_ERR', and likewise for the macros below? That's
>>> more obvious than 'ME_SAFE', and we wouldn't need the comment here.
>>> Likewise elsewhere in this patch and the series.
>>>
>>> To Jonathan's comment, I do prefer these numbers are aligned, so aside
>>> from the naming, the diff above looks good.
>>
>> OK, I also modified other locations to use 'MEM_ERR'.
> 
> Thanks!
> 
> [...]
> 
>>>> diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
>>>> index 802231772608..2ac716c0d6d8 100644
>>>> --- a/arch/arm64/lib/copy_to_user.S
>>>> +++ b/arch/arm64/lib/copy_to_user.S
>>>> @@ -20,7 +20,7 @@
>>>>     *	x0 - bytes not copied
>>>>     */
>>>>    	.macro ldrb1 reg, ptr, val
>>>> -	ldrb  \reg, [\ptr], \val
>>>> +	KERNEL_ME_SAFE(9998f, ldrb  \reg, [\ptr], \val)
>>>>    	.endm
>>>>    	.macro strb1 reg, ptr, val
>>>> @@ -28,7 +28,7 @@
>>>>    	.endm
>>>>    	.macro ldrh1 reg, ptr, val
>>>> -	ldrh  \reg, [\ptr], \val
>>>> +	KERNEL_ME_SAFE(9998f, ldrh  \reg, [\ptr], \val)
>>>>    	.endm
>>>>    	.macro strh1 reg, ptr, val
>>>> @@ -36,7 +36,7 @@
>>>>    	.endm
>>>>    	.macro ldr1 reg, ptr, val
>>>> -	ldr \reg, [\ptr], \val
>>>> +	KERNEL_ME_SAFE(9998f, ldr \reg, [\ptr], \val)
>>>>    	.endm
>>>>    	.macro str1 reg, ptr, val
>>>> @@ -44,7 +44,7 @@
>>>>    	.endm
>>>>    	.macro ldp1 reg1, reg2, ptr, val
>>>> -	ldp \reg1, \reg2, [\ptr], \val
>>>> +	KERNEL_ME_SAFE(9998f, ldp \reg1, \reg2, [\ptr], \val)
>>>>    	.endm
>>>>    	.macro stp1 reg1, reg2, ptr, val
>>>
>>> These changes mean that regular copy_to_user() will handle kernel memory
>>> errors, rather than only doing that in copy_mc_to_user(). If that's
>>> intentional, please call that out explicitly in the commit message.
>>
>> Yes. This is the purpose of the modification. If the copy_to_user()
>> function encounters a memory error, this uaccess affects only the
>> current process. and only need to kill the current process instead of
>> the entire kernel panic. Do not add copy_mc_to_user() so that
>> copy_to_user() can process memory errors.
>>
>> I'll add a description in the commit msg next version.
> 
> Ok; why do powerpc and x86 have separate copy_mc_to_user()
> implementations, then?

Taking x86 as an example:

unsigned long __must_check copy_mc_to_user(...)
{
	unsigned long ret;

	if (copy_mc_fragile_enabled) {
		instrument_copy_to_user(dst, src, len);
		__uaccess_begin();
		ret = copy_mc_fragile((__force void *)dst, src, len);
		__uaccess_end();
		return ret;
	}

	if (static_cpu_has(X86_FEATURE_ERMS)) {
		instrument_copy_to_user(dst, src, len);
		__uaccess_begin();
		ret = copy_mc_enhanced_fast_string((__force void *)dst, src, len);
		__uaccess_end();
		return ret;
	}

	return copy_user_generic((__force void *)dst, src, len);
}

Through checking the source code, I found that "copy_mc_fragile_enabled"
and "X86_FEATURE_ERMS" both rely on the hardware features of x86. I
cannot explain the reasons for the details, but I feel that these are
related to the hardware implementation.


Dan Williams should be able to explain the reason.

Hi Dan:

We need your help:)

Compared to copy_to_user(), copy_mc_to_user() added memory error
handling. My question is why the error handling is not directly
implemented on copy_to_user(), but instead the copy_mc_to_user()
function is added?  Related to hardware features or performance
considerations ?


Thanks,
Tong.

> 
> [...]
> 
>>>> +/*
>>>> + * APEI claimed this as a firmware-first notification.
>>>> + * Some processing deferred to task_work before ret_to_user().
>>>> + */
>>>> +static bool do_apei_claim_sea(struct pt_regs *regs)
>>>> +{
>>>> +	if (user_mode(regs)) {
>>>> +		if (!apei_claim_sea(regs))
>>>> +			return true;
>>>> +	} else if (IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC)) {
>>>> +		if (fixup_exception_me(regs) && !apei_claim_sea(regs))
>>>> +			return true;
>>>> +	}
>>>> +
>>>> +	return false;
>>>> +}
>>>
>>> Hmm... that'll fixup the exception even if we don't manage to claim a
>>> the SEA. I suspect this should probably be:
>>>
>>> static bool do_apei_claim_sea(struct pt_regs *regs)
>>> {
>>> 	if (apei_claim_sea(regs))
>>> 		return false;
>>> 	if (user_mode(regs))
>>> 		return true;
>>> 	if (IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC))
>>> 		return !fixup_excepton_mem_err(regs);
>>> 	
>>> 	return false;
>>> }
>>>
>>> ... unless we *don't* want to claim the SEA in the case we don't have a
>>> fixup?
>>>
>>> Mark.
>>>
>>
>> Yes. My original meaning here is that if not have fixup, panic is
>> performed in do_sea() according to the original logic, and claim sea is
>> not required.
> 
> AFAICT my suggestion doesn't change that; if we don't have a fixup the
> proprosed do_apei_claim_sea() would return false, and so do_sea() would
> caryy on to arm64_notify_die(...).
> 
> I'm specifically asking if we need to avoid calling apei_claim_sea()
> when we don't have a fixup handler, or if calling that would be fine.
> 
> One important thing is that if apei_claim_sea() fails to claim the SEA,
> we'd like to panic(), and in that case it'd be good to have not applied
> the fixup handler, so that the pt_regs::pc shows where the fault was
> taken from.
> 
> Mark.

I roughly understand what you mean. The prerequisite of fixup is sea 
claimed succeed. But the fixup here actually just set the regs->pc, and 
not applied the fixup handler here. If claim sea fails, it will directly 
panic() here without applying the fixup handler.

Thanks,
Tong.

> 
> .
diff mbox series

Patch

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 5d91259ee7b5..13ca06ddf3dd 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -20,6 +20,7 @@  config ARM64
 	select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2
 	select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
 	select ARCH_HAS_CACHE_LINE_SIZE
+	select ARCH_HAS_COPY_MC if ACPI_APEI_GHES
 	select ARCH_HAS_CURRENT_STACK_POINTER
 	select ARCH_HAS_DEBUG_VIRTUAL
 	select ARCH_HAS_DEBUG_VM_PGTABLE
diff --git a/arch/arm64/include/asm/asm-extable.h b/arch/arm64/include/asm/asm-extable.h
index 980d1dd8e1a3..9c0664fe1eb1 100644
--- a/arch/arm64/include/asm/asm-extable.h
+++ b/arch/arm64/include/asm/asm-extable.h
@@ -5,11 +5,13 @@ 
 #include <linux/bits.h>
 #include <asm/gpr-num.h>
 
-#define EX_TYPE_NONE			0
-#define EX_TYPE_BPF			1
-#define EX_TYPE_UACCESS_ERR_ZERO	2
-#define EX_TYPE_KACCESS_ERR_ZERO	3
-#define EX_TYPE_LOAD_UNALIGNED_ZEROPAD	4
+#define EX_TYPE_NONE				0
+#define EX_TYPE_BPF				1
+#define EX_TYPE_UACCESS_ERR_ZERO		2
+#define EX_TYPE_KACCESS_ERR_ZERO		3
+#define EX_TYPE_LOAD_UNALIGNED_ZEROPAD		4
+/* kernel access memory error safe */
+#define EX_TYPE_KACCESS_ERR_ZERO_ME_SAFE	5
 
 /* Data fields for EX_TYPE_UACCESS_ERR_ZERO */
 #define EX_DATA_REG_ERR_SHIFT	0
@@ -51,6 +53,17 @@ 
 #define _ASM_EXTABLE_UACCESS(insn, fixup)				\
 	_ASM_EXTABLE_UACCESS_ERR_ZERO(insn, fixup, wzr, wzr)
 
+#define _ASM_EXTABLE_KACCESS_ERR_ZERO_ME_SAFE(insn, fixup, err, zero)	\
+	__ASM_EXTABLE_RAW(insn, fixup, 					\
+			  EX_TYPE_KACCESS_ERR_ZERO_ME_SAFE,		\
+			  (						\
+			    EX_DATA_REG(ERR, err) |			\
+			    EX_DATA_REG(ZERO, zero)			\
+			  ))
+
+#define _ASM_EXTABLE_KACCESS_ME_SAFE(insn, fixup)			\
+	_ASM_EXTABLE_KACCESS_ERR_ZERO_ME_SAFE(insn, fixup, wzr, wzr)
+
 /*
  * Create an exception table entry for uaccess `insn`, which will branch to `fixup`
  * when an unhandled fault is taken.
@@ -69,6 +82,14 @@ 
 	.endif
 	.endm
 
+/*
+ * Create an exception table entry for kaccess me(memory error) safe `insn`, which
+ * will branch to `fixup` when an unhandled fault is taken.
+ */
+	.macro          _asm_extable_kaccess_me_safe, insn, fixup
+	_ASM_EXTABLE_KACCESS_ME_SAFE(\insn, \fixup)
+	.endm
+
 #else /* __ASSEMBLY__ */
 
 #include <linux/stringify.h>
diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h
index 5b6efe8abeeb..7bbebfa5b710 100644
--- a/arch/arm64/include/asm/asm-uaccess.h
+++ b/arch/arm64/include/asm/asm-uaccess.h
@@ -57,6 +57,10 @@  alternative_else_nop_endif
 	.endm
 #endif
 
+#define KERNEL_ME_SAFE(l, x...)			\
+9999:	x;					\
+	_asm_extable_kaccess_me_safe	9999b, l
+
 #define USER(l, x...)				\
 9999:	x;					\
 	_asm_extable_uaccess	9999b, l
diff --git a/arch/arm64/include/asm/extable.h b/arch/arm64/include/asm/extable.h
index 72b0e71cc3de..bc49443bc502 100644
--- a/arch/arm64/include/asm/extable.h
+++ b/arch/arm64/include/asm/extable.h
@@ -46,4 +46,5 @@  bool ex_handler_bpf(const struct exception_table_entry *ex,
 #endif /* !CONFIG_BPF_JIT */
 
 bool fixup_exception(struct pt_regs *regs);
+bool fixup_exception_me(struct pt_regs *regs);
 #endif
diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
index 802231772608..2ac716c0d6d8 100644
--- a/arch/arm64/lib/copy_to_user.S
+++ b/arch/arm64/lib/copy_to_user.S
@@ -20,7 +20,7 @@ 
  *	x0 - bytes not copied
  */
 	.macro ldrb1 reg, ptr, val
-	ldrb  \reg, [\ptr], \val
+	KERNEL_ME_SAFE(9998f, ldrb  \reg, [\ptr], \val)
 	.endm
 
 	.macro strb1 reg, ptr, val
@@ -28,7 +28,7 @@ 
 	.endm
 
 	.macro ldrh1 reg, ptr, val
-	ldrh  \reg, [\ptr], \val
+	KERNEL_ME_SAFE(9998f, ldrh  \reg, [\ptr], \val)
 	.endm
 
 	.macro strh1 reg, ptr, val
@@ -36,7 +36,7 @@ 
 	.endm
 
 	.macro ldr1 reg, ptr, val
-	ldr \reg, [\ptr], \val
+	KERNEL_ME_SAFE(9998f, ldr \reg, [\ptr], \val)
 	.endm
 
 	.macro str1 reg, ptr, val
@@ -44,7 +44,7 @@ 
 	.endm
 
 	.macro ldp1 reg1, reg2, ptr, val
-	ldp \reg1, \reg2, [\ptr], \val
+	KERNEL_ME_SAFE(9998f, ldp \reg1, \reg2, [\ptr], \val)
 	.endm
 
 	.macro stp1 reg1, reg2, ptr, val
@@ -64,7 +64,7 @@  SYM_FUNC_START(__arch_copy_to_user)
 9997:	cmp	dst, dstin
 	b.ne	9998f
 	// Before being absolutely sure we couldn't copy anything, try harder
-	ldrb	tmp1w, [srcin]
+KERNEL_ME_SAFE(9998f, ldrb	tmp1w, [srcin])
 USER(9998f, sttrb tmp1w, [dst])
 	add	dst, dst, #1
 9998:	sub	x0, end, dst			// bytes not copied
diff --git a/arch/arm64/mm/extable.c b/arch/arm64/mm/extable.c
index 228d681a8715..8c690ae61944 100644
--- a/arch/arm64/mm/extable.c
+++ b/arch/arm64/mm/extable.c
@@ -72,7 +72,26 @@  bool fixup_exception(struct pt_regs *regs)
 		return ex_handler_uaccess_err_zero(ex, regs);
 	case EX_TYPE_LOAD_UNALIGNED_ZEROPAD:
 		return ex_handler_load_unaligned_zeropad(ex, regs);
+	case EX_TYPE_KACCESS_ERR_ZERO_ME_SAFE:
+		return false;
 	}
 
 	BUG();
 }
+
+bool fixup_exception_me(struct pt_regs *regs)
+{
+	const struct exception_table_entry *ex;
+
+	ex = search_exception_tables(instruction_pointer(regs));
+	if (!ex)
+		return false;
+
+	switch (ex->type) {
+	case EX_TYPE_UACCESS_ERR_ZERO:
+	case EX_TYPE_KACCESS_ERR_ZERO_ME_SAFE:
+		return ex_handler_uaccess_err_zero(ex, regs);
+	}
+
+	return false;
+}
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 451ba7cbd5ad..2dc65f99d389 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -708,21 +708,32 @@  static int do_bad(unsigned long far, unsigned long esr, struct pt_regs *regs)
 	return 1; /* "fault" */
 }
 
+/*
+ * APEI claimed this as a firmware-first notification.
+ * Some processing deferred to task_work before ret_to_user().
+ */
+static bool do_apei_claim_sea(struct pt_regs *regs)
+{
+	if (user_mode(regs)) {
+		if (!apei_claim_sea(regs))
+			return true;
+	} else if (IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC)) {
+		if (fixup_exception_me(regs) && !apei_claim_sea(regs))
+			return true;
+	}
+
+	return false;
+}
+
 static int do_sea(unsigned long far, unsigned long esr, struct pt_regs *regs)
 {
 	const struct fault_info *inf;
 	unsigned long siaddr;
 
-	inf = esr_to_fault_info(esr);
-
-	if (user_mode(regs) && apei_claim_sea(regs) == 0) {
-		/*
-		 * APEI claimed this as a firmware-first notification.
-		 * Some processing deferred to task_work before ret_to_user().
-		 */
+	if (do_apei_claim_sea(regs))
 		return 0;
-	}
 
+	inf = esr_to_fault_info(esr);
 	if (esr & ESR_ELx_FnV) {
 		siaddr = 0;
 	} else {