arm64: kprobe: Enable OPTPROBE for arm64

Message ID 20210719122417.10355-1-liuqi115@huawei.com (mailing list archive)
State New, archived
Series: arm64: kprobe: Enable OPTPROBE for arm64

Commit Message

liuqi (BA) July 19, 2021, 12:24 p.m. UTC
This patch introduces optprobe for ARM64. In optprobe, the probed
instruction is replaced by a branch instruction to a detour
buffer. The detour buffer contains trampoline code and a call to
optimized_callback(), which calls opt_pre_handler()
to execute the kprobe handler.

Limitations:
- We only support the !CONFIG_RANDOMIZE_MODULE_REGION_FULL case, to
guarantee that the offset between the probe point and the kprobe
pre_handler is not larger than 128MiB.

Performance of optprobe on the Hip08 platform is tested using the
kprobe example module[1] to analyze the latency of a kernel function,
and here is the result:

[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/samples/kprobes/kretprobe_example.c

kprobe before optimized:
[280709.846380] do_empty returned 0 and took 1530 ns to execute
[280709.852057] do_empty returned 0 and took 550 ns to execute
[280709.857631] do_empty returned 0 and took 440 ns to execute
[280709.863215] do_empty returned 0 and took 380 ns to execute
[280709.868787] do_empty returned 0 and took 360 ns to execute
[280709.874362] do_empty returned 0 and took 340 ns to execute
[280709.879936] do_empty returned 0 and took 320 ns to execute
[280709.885505] do_empty returned 0 and took 300 ns to execute
[280709.891075] do_empty returned 0 and took 280 ns to execute
[280709.896646] do_empty returned 0 and took 290 ns to execute
[280709.902220] do_empty returned 0 and took 290 ns to execute
[280709.907807] do_empty returned 0 and took 290 ns to execute

optprobe:
[ 2965.964572] do_empty returned 0 and took 90 ns to execute
[ 2965.969952] do_empty returned 0 and took 80 ns to execute
[ 2965.975332] do_empty returned 0 and took 70 ns to execute
[ 2965.980714] do_empty returned 0 and took 60 ns to execute
[ 2965.986128] do_empty returned 0 and took 80 ns to execute
[ 2965.991507] do_empty returned 0 and took 70 ns to execute
[ 2965.996884] do_empty returned 0 and took 70 ns to execute
[ 2966.002262] do_empty returned 0 and took 80 ns to execute
[ 2966.007642] do_empty returned 0 and took 70 ns to execute
[ 2966.013020] do_empty returned 0 and took 70 ns to execute
[ 2966.018400] do_empty returned 0 and took 70 ns to execute
[ 2966.023779] do_empty returned 0 and took 70 ns to execute
[ 2966.029158] do_empty returned 0 and took 70 ns to execute

Signed-off-by: Qi Liu <liuqi115@huawei.com>
---
 arch/arm64/Kconfig                            |   1 +
 arch/arm64/include/asm/kprobes.h              |  23 ++
 arch/arm64/kernel/probes/Makefile             |   2 +
 arch/arm64/kernel/probes/kprobes.c            |  19 +-
 arch/arm64/kernel/probes/opt-arm64.c          | 217 ++++++++++++++++++
 .../arm64/kernel/probes/optprobe_trampoline.S |  80 +++++++
 6 files changed, 339 insertions(+), 3 deletions(-)
 create mode 100644 arch/arm64/kernel/probes/opt-arm64.c
 create mode 100644 arch/arm64/kernel/probes/optprobe_trampoline.S

Comments

Masami Hiramatsu (Google) July 21, 2021, 8:41 a.m. UTC | #1
Hi Qi,

Thanks for your effort!

On Mon, 19 Jul 2021 20:24:17 +0800
Qi Liu <liuqi115@huawei.com> wrote:

> This patch introduce optprobe for ARM64. In optprobe, probed
> instruction is replaced by a branch instruction to detour
> buffer. Detour buffer contains trampoline code and a call to
> optimized_callback(). optimized_callback() calls opt_pre_handler()
> to execute kprobe handler.

OK so this will replace only one instruction.

> 
> Limitations:
> - We only support !CONFIG_RANDOMIZE_MODULE_REGION_FULL case to
> guarantee the offset between probe point and kprobe pre_handler
> is not larger than 128MiB.

Hmm, shouldn't we depend on !CONFIG_ARM64_MODULE_PLTS? Or
allocate an intermediate trampoline area, similar to what the arm
optprobe does.

> 
> Performance of optprobe on Hip08 platform is test using kprobe
> example module[1] to analyze the latency of a kernel function,
> and here is the result:
> 
> [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/samples/kprobes/kretprobe_example.c
> 
> kprobe before optimized:
> [280709.846380] do_empty returned 0 and took 1530 ns to execute
> [280709.852057] do_empty returned 0 and took 550 ns to execute
> [280709.857631] do_empty returned 0 and took 440 ns to execute
> [280709.863215] do_empty returned 0 and took 380 ns to execute
> [280709.868787] do_empty returned 0 and took 360 ns to execute
> [280709.874362] do_empty returned 0 and took 340 ns to execute
> [280709.879936] do_empty returned 0 and took 320 ns to execute
> [280709.885505] do_empty returned 0 and took 300 ns to execute
> [280709.891075] do_empty returned 0 and took 280 ns to execute
> [280709.896646] do_empty returned 0 and took 290 ns to execute
> [280709.902220] do_empty returned 0 and took 290 ns to execute
> [280709.907807] do_empty returned 0 and took 290 ns to execute
> 
> optprobe:
> [ 2965.964572] do_empty returned 0 and took 90 ns to execute
> [ 2965.969952] do_empty returned 0 and took 80 ns to execute
> [ 2965.975332] do_empty returned 0 and took 70 ns to execute
> [ 2965.980714] do_empty returned 0 and took 60 ns to execute
> [ 2965.986128] do_empty returned 0 and took 80 ns to execute
> [ 2965.991507] do_empty returned 0 and took 70 ns to execute
> [ 2965.996884] do_empty returned 0 and took 70 ns to execute
> [ 2966.002262] do_empty returned 0 and took 80 ns to execute
> [ 2966.007642] do_empty returned 0 and took 70 ns to execute
> [ 2966.013020] do_empty returned 0 and took 70 ns to execute
> [ 2966.018400] do_empty returned 0 and took 70 ns to execute
> [ 2966.023779] do_empty returned 0 and took 70 ns to execute
> [ 2966.029158] do_empty returned 0 and took 70 ns to execute

Great result!
I have other comments on the code below.

[...]
> diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c
> index 6dbcc89f6662..83755ad62abe 100644
> --- a/arch/arm64/kernel/probes/kprobes.c
> +++ b/arch/arm64/kernel/probes/kprobes.c
> @@ -11,6 +11,7 @@
>  #include <linux/kasan.h>
>  #include <linux/kernel.h>
>  #include <linux/kprobes.h>
> +#include <linux/moduleloader.h>
>  #include <linux/sched/debug.h>
>  #include <linux/set_memory.h>
>  #include <linux/slab.h>
> @@ -113,9 +114,21 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)
>  
>  void *alloc_insn_page(void)
>  {
> -	return __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, VMALLOC_END,
> -			GFP_KERNEL, PAGE_KERNEL_ROX, VM_FLUSH_RESET_PERMS,
> -			NUMA_NO_NODE, __builtin_return_address(0));
> +	void *page;
> +
> +	page = module_alloc(PAGE_SIZE);
> +	if (!page)
> +		return NULL;
> +
> +	set_vm_flush_reset_perms(page);
> +	/*
> +	 * First make the page read-only, and only then make it executable to
> +	 * prevent it from being W+X in between.
> +	 */
> +	set_memory_ro((unsigned long)page, 1);
> +	set_memory_x((unsigned long)page, 1);
> +
> +	return page;

Isn't this a separate change? Or is there any reason why you have to
change this function?

>  }
>  
>  /* arm kprobe: install breakpoint in text */
> diff --git a/arch/arm64/kernel/probes/opt-arm64.c b/arch/arm64/kernel/probes/opt-arm64.c
> new file mode 100644
> index 000000000000..ff72f6275e71
> --- /dev/null
> +++ b/arch/arm64/kernel/probes/opt-arm64.c
> @@ -0,0 +1,217 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Code for Kernel probes Jump optimization.
> + *
> + * Copyright (C) 2021 Hisilicon Limited
> + */
> +
> +#include <linux/jump_label.h>
> +#include <linux/kprobes.h>
> +
> +#include <asm/cacheflush.h>
> +#include <asm/insn.h>
> +#include <asm/kprobes.h>
> +#include <asm/patching.h>
> +
> +#define TMPL_VAL_IDX \
> +	(optprobe_template_val - optprobe_template_entry)
> +#define TMPL_CALL_BACK \
> +	(optprobe_template_call - optprobe_template_entry)
> +#define TMPL_END_IDX \
> +	(optprobe_template_end - optprobe_template_entry)
> +#define TMPL_RESTORE_ORIGN_INSN \
> +	(optprobe_template_restore_orig_insn - optprobe_template_entry)
> +#define TMPL_RESTORE_END \
> +	(optprobe_template_restore_end - optprobe_template_entry)
> +
> +int arch_check_optimized_kprobe(struct optimized_kprobe *op)
> +{
> +	return 0;
> +}
> +
> +int arch_prepared_optinsn(struct arch_optimized_insn *optinsn)
> +{
> +	return optinsn->insn != NULL;
> +}
> +
> +int arch_within_optimized_kprobe(struct optimized_kprobe *op,
> +				unsigned long addr)
> +{
> +	return ((unsigned long)op->kp.addr <= addr &&
> +		(unsigned long)op->kp.addr + RELATIVEJUMP_SIZE > addr);
> +}
> +
> +static void
> +optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
> +{
> +	/* This is possible if op is under delayed unoptimizing */
> +	if (kprobe_disabled(&op->kp))
> +		return;
> +
> +	preempt_disable();
> +
> +	if (kprobe_running()) {
> +		kprobes_inc_nmissed_count(&op->kp);
> +	} else {
> +		__this_cpu_write(current_kprobe, &op->kp);
> +		regs->pc = (unsigned long)op->kp.addr;
> +		get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
> +		opt_pre_handler(&op->kp, regs);
> +		__this_cpu_write(current_kprobe, NULL);
> +	}
> +
> +	preempt_enable_no_resched();
> +}
> +NOKPROBE_SYMBOL(optimized_callback)
> +
> +static bool is_offset_in_branch_range(long offset)
> +{
> +	return (offset >= -0x08000000 && offset <= 0x07fffffc && !(offset & 0x3));
> +}
> +
> +int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *orig)
> +{
> +	kprobe_opcode_t *code;
> +	long rel_chk;
> +	u32 insn, size;
> +	int ret, i;
> +	void *addr;
> +
> +	code = get_optinsn_slot();
> +	if (!code)
> +		return -ENOMEM;
> +
> +	/*
> +	 * Verify if the address gap is in 128MiB range, because this uses
> +	 * a relative jump.
> +	 *
> +	 * kprobe opt use a 'b' instruction to branch to optinsn.insn.
> +	 * According to ARM manual, branch instruction is:
> +	 *
> +	 *   31  30                  25              0
> +	 *  +----+---+---+---+---+---+---------------+
> +	 *  |cond| 0 | 0 | 1 | 0 | 1 |     imm26     |
> +	 *  +----+---+---+---+---+---+---------------+
> +	 *
> +	 * imm26 is a signed 26 bits integer. The real branch offset is computed
> +	 * by: imm64 = SignExtend(imm26:'00', 64);
> +	 *
> +	 * So the maximum forward branch should be:
> +	 *   (0x01ffffff << 2) = 0x07fffffc
> +	 * The maximum backward branch should be:
> +	 *   (0xfe000000 << 2) = 0xFFFFFFFFF8000000 = -0x08000000
> +	 *
> +	 * We can simply check (rel & 0xf8000003):
> +	 *  if rel is positive, (rel & 0xf8000003) should be 0
> +	 *  if rel is negative, (rel & 0xf8000003) should be 0xf8000000
> +	 *  the last '3' is used for alignment checking.
> +	 */
> +	rel_chk = (unsigned long)code -
> +			(unsigned long)orig->addr + 8;
> +	if (!is_offset_in_branch_range(rel_chk)) {
> +		pr_err("%s is out of branch range.\n", orig->symbol_name);

Because the optprobe is an optional optimization (it can fall back to
a normal kprobe), you don't need to show this message as an error.
pr_debug() or pr_info() will be enough.
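
For example, the failure path could simply read (same arguments as in
the patch, only the log level changed):

	pr_debug("%s is out of branch range.\n", orig->symbol_name);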

> +		free_optinsn_slot(code, 0);
> +		return -ERANGE;
> +	}
> +
> +	/* Setup template */
> +	size = (TMPL_END_IDX * sizeof(kprobe_opcode_t)) / sizeof(int);

Why would you use 'int' instead of 'kprobe_opcode_t' here?
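
On arm64, kprobe_opcode_t is a u32, so the division is effectively a
no-op; a simpler form of that line could arguably be:

	/* template length, counted in kprobe_opcode_t (u32) units */
	size = TMPL_END_IDX;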

> +	for (i = 0; i < size; i++) {
> +		addr = code + i;
> +		insn = *(optprobe_template_entry + i);
> +		ret = aarch64_insn_patch_text(&addr, &insn, 1);
> +		if (ret < 0) {
> +			free_optinsn_slot(code, 0);
> +			return -ERANGE;
> +		}
> +	}

This calls stop_machine() too many times in the loop.
Please try to allocate an array of addresses and call
aarch64_insn_patch_text() once.
Or, the same as x86, allocate a temporary trampoline buffer, modify the
code as you like, and patch it in once (with the following
aarch64_insn_patch_text() calls).
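
A minimal sketch of the batched variant, reusing the names from the
patch above (the array bound is whatever limit the arch kprobes.h
provides, written here as a hypothetical MAX_OPTINSN_SIZE):

	void *addrs[MAX_OPTINSN_SIZE];
	u32 insns[MAX_OPTINSN_SIZE];

	for (i = 0; i < TMPL_END_IDX; i++) {
		addrs[i] = code + i;
		insns[i] = *(optprobe_template_entry + i);
	}
	/* one stop_machine() for the whole template copy */
	ret = aarch64_insn_patch_text(addrs, insns, TMPL_END_IDX);
	if (ret < 0) {
		free_optinsn_slot(code, 0);
		return -ERANGE;
	}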

> +
> +	/* Set probe information */
> +	addr = code + TMPL_VAL_IDX;
> +	insn =  (unsigned long long)op & 0xffffffff;
> +	aarch64_insn_patch_text(&addr, &insn, 1);
> +
> +	addr = addr + 4;
> +	insn = ((unsigned long long)op & GENMASK_ULL(63, 32)) >> 32;
> +	aarch64_insn_patch_text(&addr, &insn, 1);
> +
> +	addr = code + TMPL_CALL_BACK;
> +	insn =  aarch64_insn_gen_branch_imm((unsigned long)addr,
> +				(unsigned long)optimized_callback,
> +				AARCH64_INSN_BRANCH_LINK);

If you use the branch here (and later), you may need to
do the branch_range check here too.
(trampoline -> optimized_callback())
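
A hedged sketch of that extra check, using the helper already defined
in this file (illustrative only):

	/* the trampoline must be able to reach optimized_callback() */
	if (!is_offset_in_branch_range((long)optimized_callback -
				       (long)(code + TMPL_CALL_BACK))) {
		free_optinsn_slot(code, 0);
		return -ERANGE;
	}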

> +	aarch64_insn_patch_text(&addr, &insn, 1);
> +
> +	/* The original probed instruction */
> +	addr = code + TMPL_RESTORE_ORIGN_INSN;
> +	insn =  orig->opcode;
> +	aarch64_insn_patch_text(&addr, &insn, 1);
> +
> +	/* Jump back to next instruction */
> +	addr = code + TMPL_RESTORE_END;
> +	insn = aarch64_insn_gen_branch_imm(
> +				(unsigned long)(&code[TMPL_RESTORE_END]),
> +				(unsigned long)(op->kp.addr) + 4,
> +				AARCH64_INSN_BRANCH_NOLINK);
> +	aarch64_insn_patch_text(&addr, &insn, 1);

Ditto.

> +
> +	flush_icache_range((unsigned long)code,
> +			   (unsigned long)(&code[TMPL_END_IDX]));
> +	/* Set op->optinsn.insn means prepared. */
> +	op->optinsn.insn = code;
> +	return 0;
> +}
> +
> +void arch_optimize_kprobes(struct list_head *oplist)
> +{
> +	struct optimized_kprobe *op, *tmp;
> +
> +	list_for_each_entry_safe(op, tmp, oplist, list) {
> +		u32 insn;
> +
> +		WARN_ON(kprobe_disabled(&op->kp));
> +
> +		/*
> +		 * Backup instructions which will be replaced
> +		 * by jump address
> +		 */
> +		memcpy(op->optinsn.copied_insn, op->kp.addr,
> +			RELATIVEJUMP_SIZE);
> +		insn = aarch64_insn_gen_branch_imm((unsigned long)op->kp.addr,
> +				(unsigned long)op->optinsn.insn,
> +				AARCH64_INSN_BRANCH_NOLINK);
> +
> +		WARN_ON(insn == 0);
> +
> +		aarch64_insn_patch_text((void *)&(op->kp.addr), &insn, 1);

Hmm, there is room for improvement.
Since aarch64_insn_patch_text() is a batch patching API, this should
optimize probes in batch instead of calling it on each probe.
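
A rough sketch of what a batched arch_optimize_kprobes() could look
like (OPTPROBE_BATCH is a hypothetical cap on how many probes are
patched per call; overflow handling is elided):

	void *addrs[OPTPROBE_BATCH];
	u32 insns[OPTPROBE_BATCH];
	int i = 0;

	list_for_each_entry_safe(op, tmp, oplist, list) {
		WARN_ON(kprobe_disabled(&op->kp));
		/* back up the instruction that the branch will replace */
		memcpy(op->optinsn.copied_insn, op->kp.addr,
		       RELATIVEJUMP_SIZE);
		addrs[i] = op->kp.addr;
		insns[i] = aarch64_insn_gen_branch_imm(
				(unsigned long)op->kp.addr,
				(unsigned long)op->optinsn.insn,
				AARCH64_INSN_BRANCH_NOLINK);
		i++;
		list_del_init(&op->list);
	}
	/* one stop_machine() for the whole batch */
	if (i)
		aarch64_insn_patch_text(addrs, insns, i);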

Thank you,

> +
> +		list_del_init(&op->list);
> +	}
> +}
> +
> +void arch_unoptimize_kprobe(struct optimized_kprobe *op)
> +{
> +	arch_arm_kprobe(&op->kp);
> +}
> +
> +/*
> + * Recover original instructions and breakpoints from relative jumps.
> + * Caller must call with locking kprobe_mutex.
> + */
> +void arch_unoptimize_kprobes(struct list_head *oplist,
> +			    struct list_head *done_list)
> +{
> +	struct optimized_kprobe *op, *tmp;
> +
> +	list_for_each_entry_safe(op, tmp, oplist, list) {
> +		arch_unoptimize_kprobe(op);
> +		list_move(&op->list, done_list);
> +	}
> +}
> +
> +void arch_remove_optimized_kprobe(struct optimized_kprobe *op)
> +{
> +	if (op->optinsn.insn) {
> +		free_optinsn_slot(op->optinsn.insn, 1);
> +		op->optinsn.insn = NULL;
> +	}
> +}
> diff --git a/arch/arm64/kernel/probes/optprobe_trampoline.S b/arch/arm64/kernel/probes/optprobe_trampoline.S
> new file mode 100644
> index 000000000000..13729cb279b8
> --- /dev/null
> +++ b/arch/arm64/kernel/probes/optprobe_trampoline.S
> @@ -0,0 +1,80 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * trampoline entry and return code for optprobes.
> + */
> +
> +#include <linux/linkage.h>
> +#include <asm/asm-offsets.h>
> +#include <asm/assembler.h>
> +
> +	.global optprobe_template_entry
> +optprobe_template_entry:
> +	sub sp, sp, #PT_REGS_SIZE
> +	stp x0, x1, [sp, #S_X0]
> +	stp x2, x3, [sp, #S_X2]
> +	stp x4, x5, [sp, #S_X4]
> +	stp x6, x7, [sp, #S_X6]
> +	stp x8, x9, [sp, #S_X8]
> +	stp x10, x11, [sp, #S_X10]
> +	stp x12, x13, [sp, #S_X12]
> +	stp x14, x15, [sp, #S_X14]
> +	stp x16, x17, [sp, #S_X16]
> +	stp x18, x19, [sp, #S_X18]
> +	stp x20, x21, [sp, #S_X20]
> +	stp x22, x23, [sp, #S_X22]
> +	stp x24, x25, [sp, #S_X24]
> +	stp x26, x27, [sp, #S_X26]
> +	stp x28, x29, [sp, #S_X28]
> +	add x0, sp, #PT_REGS_SIZE
> +	stp lr, x0, [sp, #S_LR]
> +	/*
> +	 * Construct a useful saved PSTATE
> +	 */
> +	mrs x0, nzcv
> +	mrs x1, daif
> +	orr x0, x0, x1
> +	mrs x1, CurrentEL
> +	orr x0, x0, x1
> +	mrs x1, SPSel
> +	orr x0, x0, x1
> +	stp xzr, x0, [sp, #S_PC]
> +	/* Get parameters to optimized_callback() */
> +	ldr	x0, 1f
> +	mov	x1, sp
> +	/* Branch to optimized_callback() */
> +	.global optprobe_template_call
> +optprobe_template_call:
> +	nop
> +        /* Restore registers */
> +	ldr x0, [sp, #S_PSTATE]
> +	and x0, x0, #(PSR_N_BIT | PSR_Z_BIT | PSR_C_BIT | PSR_V_BIT)
> +	msr nzcv, x0
> +	ldp x0, x1, [sp, #S_X0]
> +	ldp x2, x3, [sp, #S_X2]
> +	ldp x4, x5, [sp, #S_X4]
> +	ldp x6, x7, [sp, #S_X6]
> +	ldp x8, x9, [sp, #S_X8]
> +	ldp x10, x11, [sp, #S_X10]
> +	ldp x12, x13, [sp, #S_X12]
> +	ldp x14, x15, [sp, #S_X14]
> +	ldp x16, x17, [sp, #S_X16]
> +	ldp x18, x19, [sp, #S_X18]
> +	ldp x20, x21, [sp, #S_X20]
> +	ldp x22, x23, [sp, #S_X22]
> +	ldp x24, x25, [sp, #S_X24]
> +	ldp x26, x27, [sp, #S_X26]
> +	ldp x28, x29, [sp, #S_X28]
> +	ldr lr, [sp, #S_LR]
> +        add sp, sp, #PT_REGS_SIZE
> +	.global optprobe_template_restore_orig_insn
> +optprobe_template_restore_orig_insn:
> +	nop
> +	.global optprobe_template_restore_end
> +optprobe_template_restore_end:
> +	nop
> +	.global optprobe_template_end
> +optprobe_template_end:
> +	.global optprobe_template_val
> +optprobe_template_val:
> +	1:	.long 0
> +		.long 0



> -- 
> 2.17.1
>
Song Bao Hua (Barry Song) July 22, 2021, 10:24 a.m. UTC | #2
> -----Original Message-----
> From: Masami Hiramatsu [mailto:mhiramat@kernel.org]
> Sent: Wednesday, July 21, 2021 8:42 PM
> To: liuqi (BA) <liuqi115@huawei.com>
> Cc: catalin.marinas@arm.com; will@kernel.org; naveen.n.rao@linux.ibm.com;
> anil.s.keshavamurthy@intel.com; davem@davemloft.net;
> linux-arm-kernel@lists.infradead.org; Song Bao Hua (Barry Song)
> <song.bao.hua@hisilicon.com>; Zengtao (B) <prime.zeng@hisilicon.com>;
> robin.murphy@arm.com; Linuxarm <linuxarm@huawei.com>;
> linux-kernel@vger.kernel.org
> Subject: Re: [PATCH] arm64: kprobe: Enable OPTPROBE for arm64
> 
> Hi Qi,
> 
> Thanks for your effort!
> 
> On Mon, 19 Jul 2021 20:24:17 +0800
> Qi Liu <liuqi115@huawei.com> wrote:
> 
> > This patch introduce optprobe for ARM64. In optprobe, probed
> > instruction is replaced by a branch instruction to detour
> > buffer. Detour buffer contains trampoline code and a call to
> > optimized_callback(). optimized_callback() calls opt_pre_handler()
> > to execute kprobe handler.
> 
> OK so this will replace only one instruction.
> 
> >
> > Limitations:
> > - We only support !CONFIG_RANDOMIZE_MODULE_REGION_FULL case to
> > guarantee the offset between probe point and kprobe pre_handler
> > is not larger than 128MiB.
> 
> Hmm, shouldn't we depends on !CONFIG_ARM64_MODULE_PLTS? Or,
> allocate an intermediate trampoline area similar to arm optprobe
> does.

Depending on !CONFIG_ARM64_MODULE_PLTS will totally disable
RANDOMIZE_BASE according to arch/arm64/Kconfig:
config RANDOMIZE_BASE
	bool "Randomize the address of the kernel image"
	select ARM64_MODULE_PLTS if MODULES
	select RELOCATABLE

Depending on !RANDOMIZE_MODULE_REGION_FULL still seems to
allow RANDOMIZE_BASE while avoiding long jumps, according to:
arch/arm64/Kconfig:

config RANDOMIZE_MODULE_REGION_FULL
	bool "Randomize the module region over a 4 GB range"
	depends on RANDOMIZE_BASE
	default y
	help
	  Randomizes the location of the module region inside a 4 GB window
	  covering the core kernel. This way, it is less likely for modules
	  to leak information about the location of core kernel data structures
	  but it does imply that function calls between modules and the core
	  kernel will need to be resolved via veneers in the module PLT.

	  When this option is not set, the module region will be randomized over
	  a limited range that contains the [_stext, _etext] interval of the
	  core kernel, so branch relocations are always in range.

and

arch/arm64/kernel/kaslr.c:
	if (IS_ENABLED(CONFIG_RANDOMIZE_MODULE_REGION_FULL)) {
		/*
		 * Randomize the module region over a 2 GB window covering the
		 * kernel. This reduces the risk of modules leaking information
		 * about the address of the kernel itself, but results in
		 * branches between modules and the core kernel that are
		 * resolved via PLTs. (Branches between modules will be
		 * resolved normally.)
		 */
		module_range = SZ_2G - (u64)(_end - _stext);
		module_alloc_base = max((u64)_end + offset - SZ_2G,
					(u64)MODULES_VADDR);
	} else {
		/*
		 * Randomize the module region by setting module_alloc_base to
		 * a PAGE_SIZE multiple in the range [_etext - MODULES_VSIZE,
		 * _stext) . This guarantees that the resulting region still
		 * covers [_stext, _etext], and that all relative branches can
		 * be resolved without veneers.
		 */
		module_range = MODULES_VSIZE - (u64)(_etext - _stext);
		module_alloc_base = (u64)_etext + offset - MODULES_VSIZE;
	}

So depending on !ARM64_MODULE_PLTS seems to narrow the scenarios,
while depending on !RANDOMIZE_MODULE_REGION_FULL permits more
machines to use optprobe.

I am not quite sure I am 100% right, but tests seem to back this up.
Hopefully Catalin and Will can correct me.

> 
> >
> > Performance of optprobe on Hip08 platform is test using kprobe
> > example module[1] to analyze the latency of a kernel function,
> > and here is the result:
> >
> > [1]
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/sa
> mples/kprobes/kretprobe_example.c
> >
> > kprobe before optimized:
> > [280709.846380] do_empty returned 0 and took 1530 ns to execute
> > [280709.852057] do_empty returned 0 and took 550 ns to execute
> > [280709.857631] do_empty returned 0 and took 440 ns to execute
> > [280709.863215] do_empty returned 0 and took 380 ns to execute
> > [280709.868787] do_empty returned 0 and took 360 ns to execute
> > [280709.874362] do_empty returned 0 and took 340 ns to execute
> > [280709.879936] do_empty returned 0 and took 320 ns to execute
> > [280709.885505] do_empty returned 0 and took 300 ns to execute
> > [280709.891075] do_empty returned 0 and took 280 ns to execute
> > [280709.896646] do_empty returned 0 and took 290 ns to execute
> > [280709.902220] do_empty returned 0 and took 290 ns to execute
> > [280709.907807] do_empty returned 0 and took 290 ns to execute
> >
> > optprobe:
> > [ 2965.964572] do_empty returned 0 and took 90 ns to execute
> > [ 2965.969952] do_empty returned 0 and took 80 ns to execute
> > [ 2965.975332] do_empty returned 0 and took 70 ns to execute
> > [ 2965.980714] do_empty returned 0 and took 60 ns to execute
> > [ 2965.986128] do_empty returned 0 and took 80 ns to execute
> > [ 2965.991507] do_empty returned 0 and took 70 ns to execute
> > [ 2965.996884] do_empty returned 0 and took 70 ns to execute
> > [ 2966.002262] do_empty returned 0 and took 80 ns to execute
> > [ 2966.007642] do_empty returned 0 and took 70 ns to execute
> > [ 2966.013020] do_empty returned 0 and took 70 ns to execute
> > [ 2966.018400] do_empty returned 0 and took 70 ns to execute
> > [ 2966.023779] do_empty returned 0 and took 70 ns to execute
> > [ 2966.029158] do_empty returned 0 and took 70 ns to execute
> 
> Great result!
> I have other comments on the code below.
> 
> [...]
> > diff --git a/arch/arm64/kernel/probes/kprobes.c
> b/arch/arm64/kernel/probes/kprobes.c
> > index 6dbcc89f6662..83755ad62abe 100644
> > --- a/arch/arm64/kernel/probes/kprobes.c
> > +++ b/arch/arm64/kernel/probes/kprobes.c
> > @@ -11,6 +11,7 @@
> >  #include <linux/kasan.h>
> >  #include <linux/kernel.h>
> >  #include <linux/kprobes.h>
> > +#include <linux/moduleloader.h>
> >  #include <linux/sched/debug.h>
> >  #include <linux/set_memory.h>
> >  #include <linux/slab.h>
> > @@ -113,9 +114,21 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)
> >
> >  void *alloc_insn_page(void)
> >  {
> > -	return __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, VMALLOC_END,
> > -			GFP_KERNEL, PAGE_KERNEL_ROX, VM_FLUSH_RESET_PERMS,
> > -			NUMA_NO_NODE, __builtin_return_address(0));
> > +	void *page;
> > +
> > +	page = module_alloc(PAGE_SIZE);
> > +	if (!page)
> > +		return NULL;
> > +
> > +	set_vm_flush_reset_perms(page);
> > +	/*
> > +	 * First make the page read-only, and only then make it executable to
> > +	 * prevent it from being W+X in between.
> > +	 */
> > +	set_memory_ro((unsigned long)page, 1);
> > +	set_memory_x((unsigned long)page, 1);
> > +
> > +	return page;
> 
> Isn't this a separated change? Or any reason why you have to
> change this function?

As far as I can tell, this is still related to the 128MB
short jump limitation.
[VMALLOC_START, VMALLOC_END] is a fixed virtual address range
which is not necessarily where modules will be put.
So this patch moves to module_alloc(), which will get
memory between module_alloc_base and module_alloc_end.

Together with depending on !RANDOMIZE_MODULE_REGION_FULL,
this puts the kernel, modules and trampolines all within
short-jump range.

As long as we can figure out a way to support long jmp
for optprobe, the change in alloc_insn_page() can be
dropped.

Masami, is there any reference code from any platform that supports a
long jump for optprobe? For a long jump, we need to put the jump
address in memory and then somehow load the target address
into the PC. Right now, we are only able to replace a single
instruction. That is the problem.

Thanks
Barry
Masami Hiramatsu (Google) July 22, 2021, 3:03 p.m. UTC | #3
Hi Song,

On Thu, 22 Jul 2021 10:24:54 +0000
"Song Bao Hua (Barry Song)" <song.bao.hua@hisilicon.com> wrote:

> 
> 
> > -----Original Message-----
> > From: Masami Hiramatsu [mailto:mhiramat@kernel.org]
> > Sent: Wednesday, July 21, 2021 8:42 PM
> > To: liuqi (BA) <liuqi115@huawei.com>
> > Cc: catalin.marinas@arm.com; will@kernel.org; naveen.n.rao@linux.ibm.com;
> > anil.s.keshavamurthy@intel.com; davem@davemloft.net;
> > linux-arm-kernel@lists.infradead.org; Song Bao Hua (Barry Song)
> > <song.bao.hua@hisilicon.com>; Zengtao (B) <prime.zeng@hisilicon.com>;
> > robin.murphy@arm.com; Linuxarm <linuxarm@huawei.com>;
> > linux-kernel@vger.kernel.org
> > Subject: Re: [PATCH] arm64: kprobe: Enable OPTPROBE for arm64
> > 
> > Hi Qi,
> > 
> > Thanks for your effort!
> > 
> > On Mon, 19 Jul 2021 20:24:17 +0800
> > Qi Liu <liuqi115@huawei.com> wrote:
> > 
> > > This patch introduce optprobe for ARM64. In optprobe, probed
> > > instruction is replaced by a branch instruction to detour
> > > buffer. Detour buffer contains trampoline code and a call to
> > > optimized_callback(). optimized_callback() calls opt_pre_handler()
> > > to execute kprobe handler.
> > 
> > OK so this will replace only one instruction.
> > 
> > >
> > > Limitations:
> > > - We only support !CONFIG_RANDOMIZE_MODULE_REGION_FULL case to
> > > guarantee the offset between probe point and kprobe pre_handler
> > > is not larger than 128MiB.
> > 
> > Hmm, shouldn't we depends on !CONFIG_ARM64_MODULE_PLTS? Or,
> > allocate an intermediate trampoline area similar to arm optprobe
> > does.
> 
> Depending on !CONFIG_ARM64_MODULE_PLTS will totally disable
> RANDOMIZE_BASE according to arch/arm64/Kconfig:
> config RANDOMIZE_BASE
> 	bool "Randomize the address of the kernel image"
> 	select ARM64_MODULE_PLTS if MODULES
> 	select RELOCATABLE

Yes, but why is it required for "RANDOMIZE_BASE"?
Does that imply that module calls might need to use the PLT in
some cases?

> 
> Depending on !RANDOMIZE_MODULE_REGION_FULL seems to be still
> allowing RANDOMIZE_BASE via avoiding long jump according to:
> arch/arm64/Kconfig:
> 
> config RANDOMIZE_MODULE_REGION_FULL
> 	bool "Randomize the module region over a 4 GB range"
> 	depends on RANDOMIZE_BASE
> 	default y
> 	help
> 	  Randomizes the location of the module region inside a 4 GB window
> 	  covering the core kernel. This way, it is less likely for modules
> 	  to leak information about the location of core kernel data structures
> 	  but it does imply that function calls between modules and the core
> 	  kernel will need to be resolved via veneers in the module PLT.
> 
> 	  When this option is not set, the module region will be randomized over
> 	  a limited range that contains the [_stext, _etext] interval of the
> 	  core kernel, so branch relocations are always in range.

Hmm, this dependency looks strange. If it is always in range, do we
really need the PLT for modules?

Catalin, would you know why?
Maybe it's a KASLR Kconfig issue?

> 
> and
> 
> arch/arm64/kernel/kaslr.c:
> 	if (IS_ENABLED(CONFIG_RANDOMIZE_MODULE_REGION_FULL)) {
> 		/*
> 		 * Randomize the module region over a 2 GB window covering the
> 		 * kernel. This reduces the risk of modules leaking information
> 		 * about the address of the kernel itself, but results in
> 		 * branches between modules and the core kernel that are
> 		 * resolved via PLTs. (Branches between modules will be
> 		 * resolved normally.)
> 		 */
> 		module_range = SZ_2G - (u64)(_end - _stext);
> 		module_alloc_base = max((u64)_end + offset - SZ_2G,
> 					(u64)MODULES_VADDR);
> 	} else {
> 		/*
> 		 * Randomize the module region by setting module_alloc_base to
> 		 * a PAGE_SIZE multiple in the range [_etext - MODULES_VSIZE,
> 		 * _stext) . This guarantees that the resulting region still
> 		 * covers [_stext, _etext], and that all relative branches can
> 		 * be resolved without veneers.
> 		 */
> 		module_range = MODULES_VSIZE - (u64)(_etext - _stext);
> 		module_alloc_base = (u64)_etext + offset - MODULES_VSIZE;
> 	}
> 
> So depending on ! ARM64_MODULE_PLTS seems to narrow the scenarios
> while depending on ! RANDOMIZE_MODULE_REGION_FULL  permit more
> machines to use optprobe.

OK, I see that the code ensures the range will be in the MODULE_VSIZE (=128MB).

> 
> I am not quite sure I am 100% right but tests seem to back this.
> hopefully Catalin and Will can correct me.
> 
> > 
> > >
> > > Performance of optprobe on Hip08 platform is test using kprobe
> > > example module[1] to analyze the latency of a kernel function,
> > > and here is the result:
> > >
> > > [1]
> > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/sa
> > mples/kprobes/kretprobe_example.c
> > >
> > > kprobe before optimized:
> > > [280709.846380] do_empty returned 0 and took 1530 ns to execute
> > > [280709.852057] do_empty returned 0 and took 550 ns to execute
> > > [280709.857631] do_empty returned 0 and took 440 ns to execute
> > > [280709.863215] do_empty returned 0 and took 380 ns to execute
> > > [280709.868787] do_empty returned 0 and took 360 ns to execute
> > > [280709.874362] do_empty returned 0 and took 340 ns to execute
> > > [280709.879936] do_empty returned 0 and took 320 ns to execute
> > > [280709.885505] do_empty returned 0 and took 300 ns to execute
> > > [280709.891075] do_empty returned 0 and took 280 ns to execute
> > > [280709.896646] do_empty returned 0 and took 290 ns to execute
> > > [280709.902220] do_empty returned 0 and took 290 ns to execute
> > > [280709.907807] do_empty returned 0 and took 290 ns to execute
> > >
> > > optprobe:
> > > [ 2965.964572] do_empty returned 0 and took 90 ns to execute
> > > [ 2965.969952] do_empty returned 0 and took 80 ns to execute
> > > [ 2965.975332] do_empty returned 0 and took 70 ns to execute
> > > [ 2965.980714] do_empty returned 0 and took 60 ns to execute
> > > [ 2965.986128] do_empty returned 0 and took 80 ns to execute
> > > [ 2965.991507] do_empty returned 0 and took 70 ns to execute
> > > [ 2965.996884] do_empty returned 0 and took 70 ns to execute
> > > [ 2966.002262] do_empty returned 0 and took 80 ns to execute
> > > [ 2966.007642] do_empty returned 0 and took 70 ns to execute
> > > [ 2966.013020] do_empty returned 0 and took 70 ns to execute
> > > [ 2966.018400] do_empty returned 0 and took 70 ns to execute
> > > [ 2966.023779] do_empty returned 0 and took 70 ns to execute
> > > [ 2966.029158] do_empty returned 0 and took 70 ns to execute
> > 
> > Great result!
> > I have other comments on the code below.
> > 
> > [...]
> > > diff --git a/arch/arm64/kernel/probes/kprobes.c
> > b/arch/arm64/kernel/probes/kprobes.c
> > > index 6dbcc89f6662..83755ad62abe 100644
> > > --- a/arch/arm64/kernel/probes/kprobes.c
> > > +++ b/arch/arm64/kernel/probes/kprobes.c
> > > @@ -11,6 +11,7 @@
> > >  #include <linux/kasan.h>
> > >  #include <linux/kernel.h>
> > >  #include <linux/kprobes.h>
> > > +#include <linux/moduleloader.h>
> > >  #include <linux/sched/debug.h>
> > >  #include <linux/set_memory.h>
> > >  #include <linux/slab.h>
> > > @@ -113,9 +114,21 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)
> > >
> > >  void *alloc_insn_page(void)
> > >  {
> > > -	return __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, VMALLOC_END,
> > > -			GFP_KERNEL, PAGE_KERNEL_ROX, VM_FLUSH_RESET_PERMS,
> > > -			NUMA_NO_NODE, __builtin_return_address(0));
> > > +	void *page;
> > > +
> > > +	page = module_alloc(PAGE_SIZE);
> > > +	if (!page)
> > > +		return NULL;
> > > +
> > > +	set_vm_flush_reset_perms(page);
> > > +	/*
> > > +	 * First make the page read-only, and only then make it executable to
> > > +	 * prevent it from being W+X in between.
> > > +	 */
> > > +	set_memory_ro((unsigned long)page, 1);
> > > +	set_memory_x((unsigned long)page, 1);
> > > +
> > > +	return page;
> > 
> > Isn't this a separated change? Or any reason why you have to
> > change this function?
> 
> As far as I can tell, this is still related with the 128MB
> short jump limitation.
> VMALLOC_START, VMALLOC_END is an fixed virtual address area
> which isn't necessarily modules will be put.
> So this patch is moving to module_alloc() which will get
> memory between module_alloc_base and module_alloc_end.

Ah, I missed that point. Yes, VMALLOC_START and VMALLOC_END
are not the correct range.

> 
> Together with depending on !RANDOMIZE_MODULE_REGION_FULL,
> this makes all kernel, module and trampoline in short
> jmp area.
> 
> As long as we can figure out a way to support long jmp
> for optprobe, the change in alloc_insn_page() can be
> dropped.

No, I think above change is rather readable, so it is OK.

> 
> Masami, any reference code from any platform to support long
> jump for optprobe? For long jmp, we need to put jmp address
> to a memory and then somehow load the target address
> to PC. Right now, we are able to replace an instruction
> only. That is the problem.

Hmm, I read a paper about a 2-stage jump idea 15 years ago. That
paper allocated an intermediate trampoline (like a PLT) which did a long
jump to the real trampoline on SPARC.
(Something like "push x0; ldr x0, [pc+8]; br x0; <immediate-addr>" for
a slot of the intermediate trampoline.)
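
One slot of such an intermediate trampoline might look roughly like the
sketch below (purely illustrative and untested; the real detour
trampoline would also have to know that x0 was spilled here):

	str	x0, [sp, #-16]!		// spill x0 so it can carry the target
	ldr	x0, 2f			// load the 64-bit detour address
	br	x0			// long jump to the real trampoline
2:	.quad	0			// <immediate-addr>, filled in at arm time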

Another (simpler) example is the optprobe in powerpc
(arch/powerpc/kernel/optprobes_head.S), which reserves a buffer page
in the text section and uses it.

But I think your current implementation is good enough for the
first step. If someone needs CONFIG_RANDOMIZE_MODULE_REGION_FULL
and optprobe, we can revisit this point.

Thank you,
Song Bao Hua (Barry Song) July 23, 2021, 2:43 a.m. UTC | #4
> -----Original Message-----
> From: Masami Hiramatsu [mailto:mhiramat@kernel.org]
> Sent: Friday, July 23, 2021 3:03 AM
> To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>
> Cc: liuqi (BA) <liuqi115@huawei.com>; catalin.marinas@arm.com;
> will@kernel.org; naveen.n.rao@linux.ibm.com; anil.s.keshavamurthy@intel.com;
> davem@davemloft.net; linux-arm-kernel@lists.infradead.org; Zengtao (B)
> <prime.zeng@hisilicon.com>; robin.murphy@arm.com; Linuxarm
> <linuxarm@huawei.com>; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH] arm64: kprobe: Enable OPTPROBE for arm64
> 
> Hi Song,
> 
> On Thu, 22 Jul 2021 10:24:54 +0000
> "Song Bao Hua (Barry Song)" <song.bao.hua@hisilicon.com> wrote:
> 
> >
> >
> > > -----Original Message-----
> > > From: Masami Hiramatsu [mailto:mhiramat@kernel.org]
> > > Sent: Wednesday, July 21, 2021 8:42 PM
> > > To: liuqi (BA) <liuqi115@huawei.com>
> > > Cc: catalin.marinas@arm.com; will@kernel.org; naveen.n.rao@linux.ibm.com;
> > > anil.s.keshavamurthy@intel.com; davem@davemloft.net;
> > > linux-arm-kernel@lists.infradead.org; Song Bao Hua (Barry Song)
> > > <song.bao.hua@hisilicon.com>; Zengtao (B) <prime.zeng@hisilicon.com>;
> > > robin.murphy@arm.com; Linuxarm <linuxarm@huawei.com>;
> > > linux-kernel@vger.kernel.org
> > > Subject: Re: [PATCH] arm64: kprobe: Enable OPTPROBE for arm64
> > >
> > > Hi Qi,
> > >
> > > Thanks for your effort!
> > >
> > > On Mon, 19 Jul 2021 20:24:17 +0800
> > > Qi Liu <liuqi115@huawei.com> wrote:
> > >
> > > > This patch introduce optprobe for ARM64. In optprobe, probed
> > > > instruction is replaced by a branch instruction to detour
> > > > buffer. Detour buffer contains trampoline code and a call to
> > > > optimized_callback(). optimized_callback() calls opt_pre_handler()
> > > > to execute kprobe handler.
> > >
> > > OK so this will replace only one instruction.
> > >
> > > >
> > > > Limitations:
> > > > - We only support !CONFIG_RANDOMIZE_MODULE_REGION_FULL case to
> > > > guarantee the offset between probe point and kprobe pre_handler
> > > > is not larger than 128MiB.
> > >
> > > Hmm, shouldn't we depends on !CONFIG_ARM64_MODULE_PLTS? Or,
> > > allocate an intermediate trampoline area similar to arm optprobe
> > > does.
> >
> > Depending on !CONFIG_ARM64_MODULE_PLTS will totally disable
> > RANDOMIZE_BASE according to arch/arm64/Kconfig:
> > config RANDOMIZE_BASE
> > 	bool "Randomize the address of the kernel image"
> > 	select ARM64_MODULE_PLTS if MODULES
> > 	select RELOCATABLE
> 
> Yes, but why it is required for "RANDOMIZE_BASE"?
> Does that imply the module call might need to use PLT in
> some cases?
> 
> >
> > Depending on !RANDOMIZE_MODULE_REGION_FULL seems to be still
> > allowing RANDOMIZE_BASE via avoiding long jump according to:
> > arch/arm64/Kconfig:
> >
> > config RANDOMIZE_MODULE_REGION_FULL
> > 	bool "Randomize the module region over a 4 GB range"
> > 	depends on RANDOMIZE_BASE
> > 	default y
> > 	help
> > 	  Randomizes the location of the module region inside a 4 GB window
> > 	  covering the core kernel. This way, it is less likely for modules
> > 	  to leak information about the location of core kernel data structures
> > 	  but it does imply that function calls between modules and the core
> > 	  kernel will need to be resolved via veneers in the module PLT.
> >
> > 	  When this option is not set, the module region will be randomized over
> > 	  a limited range that contains the [_stext, _etext] interval of the
> > 	  core kernel, so branch relocations are always in range.
> 
> Hmm, this dependency looks strange. If it always in range, don't we need
> PLT for modules?
> 
> Cataline, would you know why?
> Maybe it's a KASLR's Kconfig issue?

I actually didn't see any problem after making this change:

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index e07e7de9ac49..6440671b72e0 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1781,7 +1781,6 @@ config RELOCATABLE

 config RANDOMIZE_BASE
        bool "Randomize the address of the kernel image"
-       select ARM64_MODULE_PLTS if MODULES
        select RELOCATABLE
        help
          Randomizes the virtual address at which the kernel image is
@@ -1801,6 +1800,7 @@ config RANDOMIZE_BASE
 config RANDOMIZE_MODULE_REGION_FULL
        bool "Randomize the module region over a 4 GB range"
        depends on RANDOMIZE_BASE
+       select ARM64_MODULE_PLTS if MODULES
        default y
        help
          Randomizes the location of the module region inside a 4 GB window

and having this config:
# zcat /proc/config.gz | grep RANDOMIZE_BASE
CONFIG_RANDOMIZE_BASE=y

# zcat /proc/config.gz | grep RANDOMIZE_MODULE_REGION_FULL
# CONFIG_RANDOMIZE_MODULE_REGION_FULL is not set

# zcat /proc/config.gz | grep ARM64_MODULE_PLTS
# CONFIG_ARM64_MODULE_PLTS is not set

Modules work all good:
# lsmod
Module                  Size  Used by
btrfs                1355776  0
blake2b_generic        20480  0
libcrc32c              16384  1 btrfs
xor                    20480  1 btrfs
xor_neon               16384  1 xor
zstd_compress         163840  1 btrfs
raid6_pq              110592  1 btrfs
ctr                    16384  0
md5                    16384  0
ip_tunnel              32768  0
ipv6                  442368  28


I am not quite sure if there is a corner case. If not,
I would think the Kconfig might be somewhat improper.


> 
> >
> > and
> >
> > arch/arm64/kernel/kaslr.c:
> > 	if (IS_ENABLED(CONFIG_RANDOMIZE_MODULE_REGION_FULL)) {
> > 		/*
> > 		 * Randomize the module region over a 2 GB window covering the
> > 		 * kernel. This reduces the risk of modules leaking information
> > 		 * about the address of the kernel itself, but results in
> > 		 * branches between modules and the core kernel that are
> > 		 * resolved via PLTs. (Branches between modules will be
> > 		 * resolved normally.)
> > 		 */
> > 		module_range = SZ_2G - (u64)(_end - _stext);
> > 		module_alloc_base = max((u64)_end + offset - SZ_2G,
> > 					(u64)MODULES_VADDR);
> > 	} else {
> > 		/*
> > 		 * Randomize the module region by setting module_alloc_base to
> > 		 * a PAGE_SIZE multiple in the range [_etext - MODULES_VSIZE,
> > 		 * _stext) . This guarantees that the resulting region still
> > 		 * covers [_stext, _etext], and that all relative branches can
> > 		 * be resolved without veneers.
> > 		 */
> > 		module_range = MODULES_VSIZE - (u64)(_etext - _stext);
> > 		module_alloc_base = (u64)_etext + offset - MODULES_VSIZE;
> > 	}
> >
> > So depending on ! ARM64_MODULE_PLTS seems to narrow the scenarios
> > while depending on ! RANDOMIZE_MODULE_REGION_FULL  permit more
> > machines to use optprobe.
> 
> OK, I see that the code ensures the range will be in the MODULE_VSIZE (=128MB).
> 
> >
> > I am not quite sure I am 100% right but tests seem to back this.
> > hopefully Catalin and Will can correct me.
> >
> > >
> > > >
> > > > Performance of optprobe on Hip08 platform is test using kprobe
> > > > example module[1] to analyze the latency of a kernel function,
> > > > and here is the result:
> > > >
> > > > [1]
> > >
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/sa
> > > mples/kprobes/kretprobe_example.c
> > > >
> > > > kprobe before optimized:
> > > > [280709.846380] do_empty returned 0 and took 1530 ns to execute
> > > > [280709.852057] do_empty returned 0 and took 550 ns to execute
> > > > [280709.857631] do_empty returned 0 and took 440 ns to execute
> > > > [280709.863215] do_empty returned 0 and took 380 ns to execute
> > > > [280709.868787] do_empty returned 0 and took 360 ns to execute
> > > > [280709.874362] do_empty returned 0 and took 340 ns to execute
> > > > [280709.879936] do_empty returned 0 and took 320 ns to execute
> > > > [280709.885505] do_empty returned 0 and took 300 ns to execute
> > > > [280709.891075] do_empty returned 0 and took 280 ns to execute
> > > > [280709.896646] do_empty returned 0 and took 290 ns to execute
> > > > [280709.902220] do_empty returned 0 and took 290 ns to execute
> > > > [280709.907807] do_empty returned 0 and took 290 ns to execute
> > > >
> > > > optprobe:
> > > > [ 2965.964572] do_empty returned 0 and took 90 ns to execute
> > > > [ 2965.969952] do_empty returned 0 and took 80 ns to execute
> > > > [ 2965.975332] do_empty returned 0 and took 70 ns to execute
> > > > [ 2965.980714] do_empty returned 0 and took 60 ns to execute
> > > > [ 2965.986128] do_empty returned 0 and took 80 ns to execute
> > > > [ 2965.991507] do_empty returned 0 and took 70 ns to execute
> > > > [ 2965.996884] do_empty returned 0 and took 70 ns to execute
> > > > [ 2966.002262] do_empty returned 0 and took 80 ns to execute
> > > > [ 2966.007642] do_empty returned 0 and took 70 ns to execute
> > > > [ 2966.013020] do_empty returned 0 and took 70 ns to execute
> > > > [ 2966.018400] do_empty returned 0 and took 70 ns to execute
> > > > [ 2966.023779] do_empty returned 0 and took 70 ns to execute
> > > > [ 2966.029158] do_empty returned 0 and took 70 ns to execute
> > >
> > > Great result!
> > > I have other comments on the code below.
> > >
> > > [...]
> > > > diff --git a/arch/arm64/kernel/probes/kprobes.c
> > > b/arch/arm64/kernel/probes/kprobes.c
> > > > index 6dbcc89f6662..83755ad62abe 100644
> > > > --- a/arch/arm64/kernel/probes/kprobes.c
> > > > +++ b/arch/arm64/kernel/probes/kprobes.c
> > > > @@ -11,6 +11,7 @@
> > > >  #include <linux/kasan.h>
> > > >  #include <linux/kernel.h>
> > > >  #include <linux/kprobes.h>
> > > > +#include <linux/moduleloader.h>
> > > >  #include <linux/sched/debug.h>
> > > >  #include <linux/set_memory.h>
> > > >  #include <linux/slab.h>
> > > > @@ -113,9 +114,21 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)
> > > >
> > > >  void *alloc_insn_page(void)
> > > >  {
> > > > -	return __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START,
> VMALLOC_END,
> > > > -			GFP_KERNEL, PAGE_KERNEL_ROX, VM_FLUSH_RESET_PERMS,
> > > > -			NUMA_NO_NODE, __builtin_return_address(0));
> > > > +	void *page;
> > > > +
> > > > +	page = module_alloc(PAGE_SIZE);
> > > > +	if (!page)
> > > > +		return NULL;
> > > > +
> > > > +	set_vm_flush_reset_perms(page);
> > > > +	/*
> > > > +	 * First make the page read-only, and only then make it executable
> to
> > > > +	 * prevent it from being W+X in between.
> > > > +	 */
> > > > +	set_memory_ro((unsigned long)page, 1);
> > > > +	set_memory_x((unsigned long)page, 1);
> > > > +
> > > > +	return page;
> > >
> > > Isn't this a separated change? Or any reason why you have to
> > > change this function?
> >
> > As far as I can tell, this is still related with the 128MB
> > short jump limitation.
> > VMALLOC_START, VMALLOC_END is an fixed virtual address area
> > which isn't necessarily modules will be put.
> > So this patch is moving to module_alloc() which will get
> > memory between module_alloc_base and module_alloc_end.
> 
> Ah, I missed that point. Yes, VMALLOC_START and VMALLOC_END
> are not correct range.
> 
> >
> > Together with depending on !RANDOMIZE_MODULE_REGION_FULL,
> > this makes all kernel, module and trampoline in short
> > jmp area.
> >
> > As long as we can figure out a way to support long jmp
> > for optprobe, the change in alloc_insn_page() can be
> > dropped.
> 
> No, I think above change is rather readable, so it is OK.
> 
> >
> > Masami, any reference code from any platform to support long
> > jump for optprobe? For long jmp, we need to put jmp address
> > to a memory and then somehow load the target address
> > to PC. Right now, we are able to replace an instruction
> > only. That is the problem.
> 
> Hmm, I had read a paper about 2-stage jump idea 15years ago. That
> paper allocated an intermediate trampoline (like PLT) which did a long
> jump to the real trampoline on SPARC.
> (something like, "push x0; ldr x0, [pc+8]; br x0; <immediate-addr>" for
> a slot of the intermediate trampoline.)
> 
> For the other (simpler) solution example is optprobe in powerpc
> (arch/powerpc/kernel/optprobes_head.S). That reserves a buffer page
> in the text section, and use it.
> 
> But I think your current implementation is good enough for the
> first step. If someone needs CONFIG_RANDOMIZE_MODULE_REGION_FULL
> and optprobe, we can revisit this point.
> 
> Thank you,
> 
> 

Thanks
Barry
liuqi (BA) July 30, 2021, 3:32 a.m. UTC | #5
Hi Masami.
Thanks for reviewing this patch.
On 2021/7/21 16:41, Masami Hiramatsu wrote:
> Hi Qi,
> 
> Thanks for your effort!
> 
> On Mon, 19 Jul 2021 20:24:17 +0800
> Qi Liu <liuqi115@huawei.com> wrote:
> 
>> This patch introduce optprobe for ARM64. In optprobe, probed
>> instruction is replaced by a branch instruction to detour
>> buffer. Detour buffer contains trampoline code and a call to
>> optimized_callback(). optimized_callback() calls opt_pre_handler()
>> to execute kprobe handler.
> 
> OK so this will replace only one instruction.
> 
>>
>> Limitations:
>> - We only support !CONFIG_RANDOMIZE_MODULE_REGION_FULL case to
>> guarantee the offset between probe point and kprobe pre_handler
>> is not larger than 128MiB.
> 
> Hmm, shouldn't we depends on !CONFIG_ARM64_MODULE_PLTS? Or,
> allocate an intermediate trampoline area similar to arm optprobe
> does.
> 
>>
>> Performance of optprobe on Hip08 platform is test using kprobe
>> example module[1] to analyze the latency of a kernel function,
>> and here is the result:
>>
>> [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/samples/kprobes/kretprobe_example.c
>>
>> kprobe before optimized:
>> [280709.846380] do_empty returned 0 and took 1530 ns to execute
>> [280709.852057] do_empty returned 0 and took 550 ns to execute
>> [280709.857631] do_empty returned 0 and took 440 ns to execute
>> [280709.863215] do_empty returned 0 and took 380 ns to execute
>> [280709.868787] do_empty returned 0 and took 360 ns to execute
>> [280709.874362] do_empty returned 0 and took 340 ns to execute
>> [280709.879936] do_empty returned 0 and took 320 ns to execute
>> [280709.885505] do_empty returned 0 and took 300 ns to execute
>> [280709.891075] do_empty returned 0 and took 280 ns to execute
>> [280709.896646] do_empty returned 0 and took 290 ns to execute
>> [280709.902220] do_empty returned 0 and took 290 ns to execute
>> [280709.907807] do_empty returned 0 and took 290 ns to execute
>>
>> optprobe:
>> [ 2965.964572] do_empty returned 0 and took 90 ns to execute
>> [ 2965.969952] do_empty returned 0 and took 80 ns to execute
>> [ 2965.975332] do_empty returned 0 and took 70 ns to execute
>> [ 2965.980714] do_empty returned 0 and took 60 ns to execute
>> [ 2965.986128] do_empty returned 0 and took 80 ns to execute
>> [ 2965.991507] do_empty returned 0 and took 70 ns to execute
>> [ 2965.996884] do_empty returned 0 and took 70 ns to execute
>> [ 2966.002262] do_empty returned 0 and took 80 ns to execute
>> [ 2966.007642] do_empty returned 0 and took 70 ns to execute
>> [ 2966.013020] do_empty returned 0 and took 70 ns to execute
>> [ 2966.018400] do_empty returned 0 and took 70 ns to execute
>> [ 2966.023779] do_empty returned 0 and took 70 ns to execute
>> [ 2966.029158] do_empty returned 0 and took 70 ns to execute
> 
> Great result!
> I have other comments on the code below.
> 
> [...]
>> diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c
>> index 6dbcc89f6662..83755ad62abe 100644
>> --- a/arch/arm64/kernel/probes/kprobes.c
>> +++ b/arch/arm64/kernel/probes/kprobes.c
>> @@ -11,6 +11,7 @@
>>   #include <linux/kasan.h>
>>   #include <linux/kernel.h>
>>   #include <linux/kprobes.h>
>> +#include <linux/moduleloader.h>
>>   #include <linux/sched/debug.h>
>>   #include <linux/set_memory.h>
>>   #include <linux/slab.h>
>> @@ -113,9 +114,21 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)
>>   
>>   void *alloc_insn_page(void)
>>   {
>> -	return __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, VMALLOC_END,
>> -			GFP_KERNEL, PAGE_KERNEL_ROX, VM_FLUSH_RESET_PERMS,
>> -			NUMA_NO_NODE, __builtin_return_address(0));
>> +	void *page;
>> +
>> +	page = module_alloc(PAGE_SIZE);
>> +	if (!page)
>> +		return NULL;
>> +
>> +	set_vm_flush_reset_perms(page);
>> +	/*
>> +	 * First make the page read-only, and only then make it executable to
>> +	 * prevent it from being W+X in between.
>> +	 */
>> +	set_memory_ro((unsigned long)page, 1);
>> +	set_memory_x((unsigned long)page, 1);
>> +
>> +	return page;
> 
> Isn't this a separated change? Or any reason why you have to
> change this function?
> 

As in Barry's reply, this is for the 128MB short jump limitation.

>>   }
>>   

[...]

>> +int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *orig)
>> +{
>> +	kprobe_opcode_t *code;
>> +	long rel_chk;
>> +	u32 insn, size;
>> +	int ret, i;
>> +	void *addr;
>> +
>> +	code = get_optinsn_slot();
>> +	if (!code)
>> +		return -ENOMEM;
>> +
>> +	/*
>> +	 * Verify if the address gap is in 128MiB range, because this uses
>> +	 * a relative jump.
>> +	 *
>> +	 * kprobe opt use a 'b' instruction to branch to optinsn.insn.
>> +	 * According to ARM manual, branch instruction is:
>> +	 *
>> +	 *   31  30                  25              0
>> +	 *  +----+---+---+---+---+---+---------------+
>> +	 *  |cond| 0 | 0 | 1 | 0 | 1 |     imm26     |
>> +	 *  +----+---+---+---+---+---+---------------+
>> +	 *
>> +	 * imm26 is a signed 26 bits integer. The real branch offset is computed
>> +	 * by: imm64 = SignExtend(imm26:'00', 64);
>> +	 *
>> +	 * So the maximum forward branch should be:
>> +	 *   (0x01ffffff << 2) = 0x07fffffc
>> +	 * The maximum backward branch should be:
>> +	 *   (0xfe000000 << 2) = 0xFFFFFFFFF8000000 = -0x08000000
>> +	 *
>> +	 * We can simply check (rel & 0xf8000003):
>> +	 *  if rel is positive, (rel & 0xf8000003) should be 0
>> +	 *  if rel is negative, (rel & 0xf8000003) should be 0xf8000000
>> +	 *  the last '3' is used for alignment checking.
>> +	 */
>> +	rel_chk = (unsigned long)code -
>> +			(unsigned long)orig->addr + 8;
>> +	if (!is_offset_in_branch_range(rel_chk)) {
>> +		pr_err("%s is out of branch range.\n", orig->symbol_name);
> 
> Because the optprobe is an optional optimization (it can fail back to
> normal kprobe), you don't need to show this message as an error.
> pr_debug() or pr_info() will be enough.
> 
OK, I'll change it to pr_debug(). Thanks.

>> +		free_optinsn_slot(code, 0);
>> +		return -ERANGE;
>> +	}
>> +
>> +	/* Setup template */
>> +	size = (TMPL_END_IDX * sizeof(kprobe_opcode_t)) / sizeof(int);
> 
> Why would you use 'int' instead of 'kprobe_opcode_t' here?
> 
>> +	for (i = 0; i < size; i++) {
>> +		addr = code + i;
>> +		insn = *(optprobe_template_entry + i);
>> +		ret = aarch64_insn_patch_text(&addr, &insn, 1);
>> +		if (ret < 0) {
>> +			free_optinsn_slot(code, 0);
>> +			return -ERANGE;
>> +		}
>> +	}
> 
> This is too much calling stop_machine() in the loop.
> Please try to allocate an array of addresses and call
> aarch64_insn_patch_text() once.
> Or, as same as x86, allocate a temporary trampoline buffer and modify code
> as you like, and patch it once (with following aarch64_insn_patch_text()
> calls.)
OK, I'll allocate an array of addresses and call aarch64_insn_patch_text()
once. Thanks.

> 
>> +
>> +	/* Set probe information */
>> +	addr = code + TMPL_VAL_IDX;
>> +	insn =  (unsigned long long)op & 0xffffffff;
>> +	aarch64_insn_patch_text(&addr, &insn, 1);
>> +
>> +	addr = addr + 4;
>> +	insn = ((unsigned long long)op & GENMASK_ULL(63, 32)) >> 32;
>> +	aarch64_insn_patch_text(&addr, &insn, 1);
>> +
>> +	addr = code + TMPL_CALL_BACK;
>> +	insn =  aarch64_insn_gen_branch_imm((unsigned long)addr,
>> +				(unsigned long)optimized_callback,
>> +				AARCH64_INSN_BRANCH_LINK);
> 
> If you use the branch here (and later), you may also need to
> do the branch_range check here too.
> (trampoline -> optimized_callback())
> 
Got it, will add the check here, thanks.

>> +	aarch64_insn_patch_text(&addr, &insn, 1);
>> +
>> +	/* The original probed instruction */
>> +	addr = code + TMPL_RESTORE_ORIGN_INSN;
>> +	insn =  orig->opcode;
>> +	aarch64_insn_patch_text(&addr, &insn, 1);
>> +
>> +	/* Jump back to next instruction */
>> +	addr = code + TMPL_RESTORE_END;
>> +	insn = aarch64_insn_gen_branch_imm(
>> +				(unsigned long)(&code[TMPL_RESTORE_END]),
>> +				(unsigned long)(op->kp.addr) + 4,
>> +				AARCH64_INSN_BRANCH_NOLINK);
>> +	aarch64_insn_patch_text(&addr, &insn, 1);
> 
> Ditto.
> 
got it, thanks.
>> +
>> +	flush_icache_range((unsigned long)code,
>> +			   (unsigned long)(&code[TMPL_END_IDX]));
>> +	/* Set op->optinsn.insn means prepared. */
>> +	op->optinsn.insn = code;
>> +	return 0;
>> +}
>> +
>> +void arch_optimize_kprobes(struct list_head *oplist)
>> +{
>> +	struct optimized_kprobe *op, *tmp;
>> +
>> +	list_for_each_entry_safe(op, tmp, oplist, list) {
>> +		u32 insn;
>> +
>> +		WARN_ON(kprobe_disabled(&op->kp));
>> +
>> +		/*
>> +		 * Backup instructions which will be replaced
>> +		 * by jump address
>> +		 */
>> +		memcpy(op->optinsn.copied_insn, op->kp.addr,
>> +			RELATIVEJUMP_SIZE);
>> +		insn = aarch64_insn_gen_branch_imm((unsigned long)op->kp.addr,
>> +				(unsigned long)op->optinsn.insn,
>> +				AARCH64_INSN_BRANCH_NOLINK);
>> +
>> +		WARN_ON(insn == 0);
>> +
>> +		aarch64_insn_patch_text((void *)&(op->kp.addr), &insn, 1);
> 
> Hmm, there is room for improvement.
> Since aarch64_insn_patch_text() is a batch patching API, this should
> optimize probes in batch instead of calling it on each probe.
> 

I'm not sure how to replace aarch64_insn_patch_text(). Do you mean using
aarch64_insn_patch_text_nosync() instead?

But in arch_arm_kprobe(), aarch64_insn_patch_text() is also used on each
probe to add the BRK64_OPCODE_KPROBES instruction...

Thanks,
Qi

> Thank you,
> 
>> +
>> +		list_del_init(&op->list);
>> +	}
>> +}
>> +
>> +void arch_unoptimize_kprobe(struct optimized_kprobe *op)
>> +{
>> +	arch_arm_kprobe(&op->kp);
>> +}
>> +
>> +/*
>> + * Recover original instructions and breakpoints from relative jumps.
>> + * Caller must call with locking kprobe_mutex.
>> + */
>> +void arch_unoptimize_kprobes(struct list_head *oplist,
>> +			    struct list_head *done_list)
>> +{
>> +	struct optimized_kprobe *op, *tmp;
>> +
>> +	list_for_each_entry_safe(op, tmp, oplist, list) {
>> +		arch_unoptimize_kprobe(op);
>> +		list_move(&op->list, done_list);
>> +	}
>> +}
>> +
>> +void arch_remove_optimized_kprobe(struct optimized_kprobe *op)
>> +{
>> +	if (op->optinsn.insn) {
>> +		free_optinsn_slot(op->optinsn.insn, 1);
>> +		op->optinsn.insn = NULL;
>> +	}
>> +}
>> diff --git a/arch/arm64/kernel/probes/optprobe_trampoline.S b/arch/arm64/kernel/probes/optprobe_trampoline.S
>> new file mode 100644
>> index 000000000000..13729cb279b8
>> --- /dev/null
>> +++ b/arch/arm64/kernel/probes/optprobe_trampoline.S
>> @@ -0,0 +1,80 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +/*
>> + * trampoline entry and return code for optprobes.
>> + */
>> +
>> +#include <linux/linkage.h>
>> +#include <asm/asm-offsets.h>
>> +#include <asm/assembler.h>
>> +
>> +	.global optprobe_template_entry
>> +optprobe_template_entry:
>> +	sub sp, sp, #PT_REGS_SIZE
>> +	stp x0, x1, [sp, #S_X0]
>> +	stp x2, x3, [sp, #S_X2]
>> +	stp x4, x5, [sp, #S_X4]
>> +	stp x6, x7, [sp, #S_X6]
>> +	stp x8, x9, [sp, #S_X8]
>> +	stp x10, x11, [sp, #S_X10]
>> +	stp x12, x13, [sp, #S_X12]
>> +	stp x14, x15, [sp, #S_X14]
>> +	stp x16, x17, [sp, #S_X16]
>> +	stp x18, x19, [sp, #S_X18]
>> +	stp x20, x21, [sp, #S_X20]
>> +	stp x22, x23, [sp, #S_X22]
>> +	stp x24, x25, [sp, #S_X24]
>> +	stp x26, x27, [sp, #S_X26]
>> +	stp x28, x29, [sp, #S_X28]
>> +	add x0, sp, #PT_REGS_SIZE
>> +	stp lr, x0, [sp, #S_LR]
>> +	/*
>> +	 * Construct a useful saved PSTATE
>> +	 */
>> +	mrs x0, nzcv
>> +	mrs x1, daif
>> +	orr x0, x0, x1
>> +	mrs x1, CurrentEL
>> +	orr x0, x0, x1
>> +	mrs x1, SPSel
>> +	orr x0, x0, x1
>> +	stp xzr, x0, [sp, #S_PC]
>> +	/* Get parameters to optimized_callback() */
>> +	ldr	x0, 1f
>> +	mov	x1, sp
>> +	/* Branch to optimized_callback() */
>> +	.global optprobe_template_call
>> +optprobe_template_call:
>> +	nop
>> +        /* Restore registers */
>> +	ldr x0, [sp, #S_PSTATE]
>> +	and x0, x0, #(PSR_N_BIT | PSR_Z_BIT | PSR_C_BIT | PSR_V_BIT)
>> +	msr nzcv, x0
>> +	ldp x0, x1, [sp, #S_X0]
>> +	ldp x2, x3, [sp, #S_X2]
>> +	ldp x4, x5, [sp, #S_X4]
>> +	ldp x6, x7, [sp, #S_X6]
>> +	ldp x8, x9, [sp, #S_X8]
>> +	ldp x10, x11, [sp, #S_X10]
>> +	ldp x12, x13, [sp, #S_X12]
>> +	ldp x14, x15, [sp, #S_X14]
>> +	ldp x16, x17, [sp, #S_X16]
>> +	ldp x18, x19, [sp, #S_X18]
>> +	ldp x20, x21, [sp, #S_X20]
>> +	ldp x22, x23, [sp, #S_X22]
>> +	ldp x24, x25, [sp, #S_X24]
>> +	ldp x26, x27, [sp, #S_X26]
>> +	ldp x28, x29, [sp, #S_X28]
>> +	ldr lr, [sp, #S_LR]
>> +        add sp, sp, #PT_REGS_SIZE
>> +	.global optprobe_template_restore_orig_insn
>> +optprobe_template_restore_orig_insn:
>> +	nop
>> +	.global optprobe_template_restore_end
>> +optprobe_template_restore_end:
>> +	nop
>> +	.global optprobe_template_end
>> +optprobe_template_end:
>> +	.global optprobe_template_val
>> +optprobe_template_val:
>> +	1:	.long 0
>> +		.long 0
> 
> 
> 
>> -- 
>> 2.17.1
>>
> 
>
Song Bao Hua (Barry Song) July 30, 2021, 10:04 a.m. UTC | #6
> -----Original Message-----
> From: Song Bao Hua (Barry Song)
> Sent: Friday, July 23, 2021 2:43 PM
> To: 'Masami Hiramatsu' <mhiramat@kernel.org>
> Cc: liuqi (BA) <liuqi115@huawei.com>; catalin.marinas@arm.com;
> will@kernel.org; naveen.n.rao@linux.ibm.com; anil.s.keshavamurthy@intel.com;
> davem@davemloft.net; linux-arm-kernel@lists.infradead.org; Zengtao (B)
> <prime.zeng@hisilicon.com>; robin.murphy@arm.com; Linuxarm
> <linuxarm@huawei.com>; linux-kernel@vger.kernel.org
> Subject: RE: [PATCH] arm64: kprobe: Enable OPTPROBE for arm64
> 
> 
> 
> > -----Original Message-----
> > From: Masami Hiramatsu [mailto:mhiramat@kernel.org]
> > Sent: Friday, July 23, 2021 3:03 AM
> > To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>
> > Cc: liuqi (BA) <liuqi115@huawei.com>; catalin.marinas@arm.com;
> > will@kernel.org; naveen.n.rao@linux.ibm.com;
> anil.s.keshavamurthy@intel.com;
> > davem@davemloft.net; linux-arm-kernel@lists.infradead.org; Zengtao (B)
> > <prime.zeng@hisilicon.com>; robin.murphy@arm.com; Linuxarm
> > <linuxarm@huawei.com>; linux-kernel@vger.kernel.org
> > Subject: Re: [PATCH] arm64: kprobe: Enable OPTPROBE for arm64
> >
> > Hi Song,
> >
> > On Thu, 22 Jul 2021 10:24:54 +0000
> > "Song Bao Hua (Barry Song)" <song.bao.hua@hisilicon.com> wrote:
> >
> > >
> > >
> > > > -----Original Message-----
> > > > From: Masami Hiramatsu [mailto:mhiramat@kernel.org]
> > > > Sent: Wednesday, July 21, 2021 8:42 PM
> > > > To: liuqi (BA) <liuqi115@huawei.com>
> > > > Cc: catalin.marinas@arm.com; will@kernel.org;
> naveen.n.rao@linux.ibm.com;
> > > > anil.s.keshavamurthy@intel.com; davem@davemloft.net;
> > > > linux-arm-kernel@lists.infradead.org; Song Bao Hua (Barry Song)
> > > > <song.bao.hua@hisilicon.com>; Zengtao (B) <prime.zeng@hisilicon.com>;
> > > > robin.murphy@arm.com; Linuxarm <linuxarm@huawei.com>;
> > > > linux-kernel@vger.kernel.org
> > > > Subject: Re: [PATCH] arm64: kprobe: Enable OPTPROBE for arm64
> > > >
> > > > Hi Qi,
> > > >
> > > > Thanks for your effort!
> > > >
> > > > On Mon, 19 Jul 2021 20:24:17 +0800
> > > > Qi Liu <liuqi115@huawei.com> wrote:
> > > >
> > > > > This patch introduce optprobe for ARM64. In optprobe, probed
> > > > > instruction is replaced by a branch instruction to detour
> > > > > buffer. Detour buffer contains trampoline code and a call to
> > > > > optimized_callback(). optimized_callback() calls opt_pre_handler()
> > > > > to execute kprobe handler.
> > > >
> > > > OK so this will replace only one instruction.
> > > >
> > > > >
> > > > > Limitations:
> > > > > - We only support !CONFIG_RANDOMIZE_MODULE_REGION_FULL case to
> > > > > guarantee the offset between probe point and kprobe pre_handler
> > > > > is not larger than 128MiB.
> > > >
> > > > Hmm, shouldn't we depends on !CONFIG_ARM64_MODULE_PLTS? Or,
> > > > allocate an intermediate trampoline area similar to arm optprobe
> > > > does.
> > >
> > > Depending on !CONFIG_ARM64_MODULE_PLTS will totally disable
> > > RANDOMIZE_BASE according to arch/arm64/Kconfig:
> > > config RANDOMIZE_BASE
> > > 	bool "Randomize the address of the kernel image"
> > > 	select ARM64_MODULE_PLTS if MODULES
> > > 	select RELOCATABLE
> >
> > Yes, but why it is required for "RANDOMIZE_BASE"?
> > Does that imply the module call might need to use PLT in
> > some cases?
> >
> > >
> > > Depending on !RANDOMIZE_MODULE_REGION_FULL seems to be still
> > > allowing RANDOMIZE_BASE via avoiding long jump according to:
> > > arch/arm64/Kconfig:
> > >
> > > config RANDOMIZE_MODULE_REGION_FULL
> > > 	bool "Randomize the module region over a 4 GB range"
> > > 	depends on RANDOMIZE_BASE
> > > 	default y
> > > 	help
> > > 	  Randomizes the location of the module region inside a 4 GB window
> > > 	  covering the core kernel. This way, it is less likely for modules
> > > 	  to leak information about the location of core kernel data structures
> > > 	  but it does imply that function calls between modules and the core
> > > 	  kernel will need to be resolved via veneers in the module PLT.
> > >
> > > 	  When this option is not set, the module region will be randomized over
> > > 	  a limited range that contains the [_stext, _etext] interval of the
> > > 	  core kernel, so branch relocations are always in range.
> >
> > Hmm, this dependency looks strange. If it always in range, don't we need
> > PLT for modules?
> >
> > Cataline, would you know why?
> > Maybe it's a KASLR's Kconfig issue?
> 
> I actually didn't see any problem after making this change:
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index e07e7de9ac49..6440671b72e0 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -1781,7 +1781,6 @@ config RELOCATABLE
> 
>  config RANDOMIZE_BASE
>         bool "Randomize the address of the kernel image"
> -       select ARM64_MODULE_PLTS if MODULES
>         select RELOCATABLE
>         help
>           Randomizes the virtual address at which the kernel image is
> @@ -1801,6 +1800,7 @@ config RANDOMIZE_BASE
>  config RANDOMIZE_MODULE_REGION_FULL
>         bool "Randomize the module region over a 4 GB range"
>         depends on RANDOMIZE_BASE
> +       select ARM64_MODULE_PLTS if MODULES
>         default y
>         help
>           Randomizes the location of the module region inside a 4 GB window
> 
> and having this config:
> # zcat /proc/config.gz | grep RANDOMIZE_BASE
> CONFIG_RANDOMIZE_BASE=y
> 
> # zcat /proc/config.gz | grep RANDOMIZE_MODULE_REGION_FULL
> # CONFIG_RANDOMIZE_MODULE_REGION_FULL is not set
> 
> # zcat /proc/config.gz | grep ARM64_MODULE_PLTS
> # CONFIG_ARM64_MODULE_PLTS is not set
> 
> Modules work all good:
> # lsmod
> Module                  Size  Used by
> btrfs                1355776  0
> blake2b_generic        20480  0
> libcrc32c              16384  1 btrfs
> xor                    20480  1 btrfs
> xor_neon               16384  1 xor
> zstd_compress         163840  1 btrfs
> raid6_pq              110592  1 btrfs
> ctr                    16384  0
> md5                    16384  0
> ip_tunnel              32768  0
> ipv6                  442368  28
> 
> 
> I am not quite sure if there is a corner case. If no,
> I would think the kconfig might be some improper.

The corner case is that even when CONFIG_RANDOMIZE_MODULE_REGION_FULL
is not enabled, if CONFIG_ARM64_MODULE_PLTS is enabled and we can't get
memory from the 128MB area because that area is exhausted, module_alloc()
will fall back to a 2GB area as long as either of the two conditions
below is met:

1. KASAN is not enabled
2. KASAN is enabled and CONFIG_KASAN_VMALLOC is also enabled.

void *module_alloc(unsigned long size)
{
	u64 module_alloc_end = module_alloc_base + MODULES_VSIZE;
	gfp_t gfp_mask = GFP_KERNEL;
	void *p;

	/* Silence the initial allocation */
	if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
		gfp_mask |= __GFP_NOWARN;

	if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
	    IS_ENABLED(CONFIG_KASAN_SW_TAGS))
		/* don't exceed the static module region - see below */
		module_alloc_end = MODULES_END;

	p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
				module_alloc_end, gfp_mask, PAGE_KERNEL, 0,
				NUMA_NO_NODE, __builtin_return_address(0));

	if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) &&
	    (IS_ENABLED(CONFIG_KASAN_VMALLOC) ||
	     (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
	      !IS_ENABLED(CONFIG_KASAN_SW_TAGS))))
		/*
		 * KASAN without KASAN_VMALLOC can only deal with module
		 * allocations being served from the reserved module region,
		 * since the remainder of the vmalloc region is already
		 * backed by zero shadow pages, and punching holes into it
		 * is non-trivial. Since the module region is not randomized
		 * when KASAN is enabled without KASAN_VMALLOC, it is even
		 * less likely that the module region gets exhausted, so we
		 * can simply omit this fallback in that case.
		 */
		p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
				module_alloc_base + SZ_2G, GFP_KERNEL,
				PAGE_KERNEL, 0, NUMA_NO_NODE,
				__builtin_return_address(0));

	if (p && (kasan_module_alloc(p, size) < 0)) {
		vfree(p);
		return NULL;
	}

	return p;
}

This should happen quite rarely. But maybe arm64's documentation needs
some minor fixup; otherwise, it is quite confusing.
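
For this optprobe patch the practical consequence is that
arch_prepare_optimized_kprobe() has to filter out a detour buffer served
from that fallback window. A sketch of such a check (my assumption only;
'orig' is the original kprobe and SZ_128M comes from <linux/sizes.h>):

	kprobe_opcode_t *code;
	long offset;

	code = get_optinsn_slot();
	if (!code)
		return -ENOMEM;

	/* The probed 'b' instruction can only reach +/-128MiB */
	offset = (long)code - (long)orig->addr;
	if (offset < -SZ_128M || offset >= SZ_128M) {
		free_optinsn_slot(code, 0);
		return -ERANGE;
	}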

> >
> > >
> > > and
> > >
> > > arch/arm64/kernel/kaslr.c:
> > > 	if (IS_ENABLED(CONFIG_RANDOMIZE_MODULE_REGION_FULL)) {
> > > 		/*
> > > 		 * Randomize the module region over a 2 GB window covering the
> > > 		 * kernel. This reduces the risk of modules leaking information
> > > 		 * about the address of the kernel itself, but results in
> > > 		 * branches between modules and the core kernel that are
> > > 		 * resolved via PLTs. (Branches between modules will be
> > > 		 * resolved normally.)
> > > 		 */
> > > 		module_range = SZ_2G - (u64)(_end - _stext);
> > > 		module_alloc_base = max((u64)_end + offset - SZ_2G,
> > > 					(u64)MODULES_VADDR);
> > > 	} else {
> > > 		/*
> > > 		 * Randomize the module region by setting module_alloc_base to
> > > 		 * a PAGE_SIZE multiple in the range [_etext - MODULES_VSIZE,
> > > 		 * _stext) . This guarantees that the resulting region still
> > > 		 * covers [_stext, _etext], and that all relative branches can
> > > 		 * be resolved without veneers.
> > > 		 */
> > > 		module_range = MODULES_VSIZE - (u64)(_etext - _stext);
> > > 		module_alloc_base = (u64)_etext + offset - MODULES_VSIZE;
> > > 	}
> > >
> > > So depending on ! ARM64_MODULE_PLTS seems to narrow the scenarios
> > > while depending on ! RANDOMIZE_MODULE_REGION_FULL  permit more
> > > machines to use optprobe.
> >
> > OK, I see that the code ensures the range will be in the MODULE_VSIZE (=128MB).
> >
> > >
> > > I am not quite sure I am 100% right but tests seem to back this.
> > > hopefully Catalin and Will can correct me.
> > >
> > > >
> > > > >
> > > > > Performance of optprobe on Hip08 platform is test using kprobe
> > > > > example module[1] to analyze the latency of a kernel function,
> > > > > and here is the result:
> > > > >
> > > > > [1]
> > > >
> >
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/sa
> > > > mples/kprobes/kretprobe_example.c
> > > > >
> > > > > kprobe before optimized:
> > > > > [280709.846380] do_empty returned 0 and took 1530 ns to execute
> > > > > [280709.852057] do_empty returned 0 and took 550 ns to execute
> > > > > [280709.857631] do_empty returned 0 and took 440 ns to execute
> > > > > [280709.863215] do_empty returned 0 and took 380 ns to execute
> > > > > [280709.868787] do_empty returned 0 and took 360 ns to execute
> > > > > [280709.874362] do_empty returned 0 and took 340 ns to execute
> > > > > [280709.879936] do_empty returned 0 and took 320 ns to execute
> > > > > [280709.885505] do_empty returned 0 and took 300 ns to execute
> > > > > [280709.891075] do_empty returned 0 and took 280 ns to execute
> > > > > [280709.896646] do_empty returned 0 and took 290 ns to execute
> > > > > [280709.902220] do_empty returned 0 and took 290 ns to execute
> > > > > [280709.907807] do_empty returned 0 and took 290 ns to execute
> > > > >
> > > > > optprobe:
> > > > > [ 2965.964572] do_empty returned 0 and took 90 ns to execute
> > > > > [ 2965.969952] do_empty returned 0 and took 80 ns to execute
> > > > > [ 2965.975332] do_empty returned 0 and took 70 ns to execute
> > > > > [ 2965.980714] do_empty returned 0 and took 60 ns to execute
> > > > > [ 2965.986128] do_empty returned 0 and took 80 ns to execute
> > > > > [ 2965.991507] do_empty returned 0 and took 70 ns to execute
> > > > > [ 2965.996884] do_empty returned 0 and took 70 ns to execute
> > > > > [ 2966.002262] do_empty returned 0 and took 80 ns to execute
> > > > > [ 2966.007642] do_empty returned 0 and took 70 ns to execute
> > > > > [ 2966.013020] do_empty returned 0 and took 70 ns to execute
> > > > > [ 2966.018400] do_empty returned 0 and took 70 ns to execute
> > > > > [ 2966.023779] do_empty returned 0 and took 70 ns to execute
> > > > > [ 2966.029158] do_empty returned 0 and took 70 ns to execute
> > > >
> > > > Great result!
> > > > I have other comments on the code below.
> > > >
> > > > [...]
> > > > > diff --git a/arch/arm64/kernel/probes/kprobes.c
> > > > b/arch/arm64/kernel/probes/kprobes.c
> > > > > index 6dbcc89f6662..83755ad62abe 100644
> > > > > --- a/arch/arm64/kernel/probes/kprobes.c
> > > > > +++ b/arch/arm64/kernel/probes/kprobes.c
> > > > > @@ -11,6 +11,7 @@
> > > > >  #include <linux/kasan.h>
> > > > >  #include <linux/kernel.h>
> > > > >  #include <linux/kprobes.h>
> > > > > +#include <linux/moduleloader.h>
> > > > >  #include <linux/sched/debug.h>
> > > > >  #include <linux/set_memory.h>
> > > > >  #include <linux/slab.h>
> > > > > @@ -113,9 +114,21 @@ int __kprobes arch_prepare_kprobe(struct kprobe
> *p)
> > > > >
> > > > >  void *alloc_insn_page(void)
> > > > >  {
> > > > > -	return __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START,
> > VMALLOC_END,
> > > > > -			GFP_KERNEL, PAGE_KERNEL_ROX, VM_FLUSH_RESET_PERMS,
> > > > > -			NUMA_NO_NODE, __builtin_return_address(0));
> > > > > +	void *page;
> > > > > +
> > > > > +	page = module_alloc(PAGE_SIZE);
> > > > > +	if (!page)
> > > > > +		return NULL;
> > > > > +
> > > > > +	set_vm_flush_reset_perms(page);
> > > > > +	/*
> > > > > +	 * First make the page read-only, and only then make it executable
> > to
> > > > > +	 * prevent it from being W+X in between.
> > > > > +	 */
> > > > > +	set_memory_ro((unsigned long)page, 1);
> > > > > +	set_memory_x((unsigned long)page, 1);
> > > > > +
> > > > > +	return page;
> > > >
> > > > Isn't this a separated change? Or any reason why you have to
> > > > change this function?
> > >
> > > As far as I can tell, this is still related with the 128MB
> > > short jump limitation.
> > > VMALLOC_START, VMALLOC_END is an fixed virtual address area
> > > which isn't necessarily modules will be put.
> > > So this patch is moving to module_alloc() which will get
> > > memory between module_alloc_base and module_alloc_end.
> >
> > Ah, I missed that point. Yes, VMALLOC_START and VMALLOC_END
> > are not correct range.
> >
> > >
> > > Together with depending on !RANDOMIZE_MODULE_REGION_FULL,
> > > this makes all kernel, module and trampoline in short
> > > jmp area.
> > >
> > > As long as we can figure out a way to support long jmp
> > > for optprobe, the change in alloc_insn_page() can be
> > > dropped.
> >
> > No, I think above change is rather readable, so it is OK.
> >
> > >
> > > Masami, any reference code from any platform to support long
> > > jump for optprobe? For long jmp, we need to put jmp address
> > > to a memory and then somehow load the target address
> > > to PC. Right now, we are able to replace an instruction
> > > only. That is the problem.
> >
> > Hmm, I had read a paper about 2-stage jump idea 15years ago. That
> > paper allocated an intermediate trampoline (like PLT) which did a long
> > jump to the real trampoline on SPARC.
> > (something like, "push x0; ldr x0, [pc+8]; br x0; <immediate-addr>" for
> > a slot of the intermediate trampoline.)
> >
> > For the other (simpler) solution example is optprobe in powerpc
> > (arch/powerpc/kernel/optprobes_head.S). That reserves a buffer page
> > in the text section, and use it.
> >
> > But I think your current implementation is good enough for the
> > first step. If someone needs CONFIG_RANDOMIZE_MODULE_REGION_FULL
> > and optprobe, we can revisit this point.
> >
> > Thank you,


Thanks
Barry
Masami Hiramatsu (Google) July 31, 2021, 1:15 a.m. UTC | #7
On Fri, 30 Jul 2021 10:04:06 +0000
"Song Bao Hua (Barry Song)" <song.bao.hua@hisilicon.com> wrote:

> > > > >
> > > > > Hi Qi,
> > > > >
> > > > > Thanks for your effort!
> > > > >
> > > > > On Mon, 19 Jul 2021 20:24:17 +0800
> > > > > Qi Liu <liuqi115@huawei.com> wrote:
> > > > >
> > > > > > This patch introduce optprobe for ARM64. In optprobe, probed
> > > > > > instruction is replaced by a branch instruction to detour
> > > > > > buffer. Detour buffer contains trampoline code and a call to
> > > > > > optimized_callback(). optimized_callback() calls opt_pre_handler()
> > > > > > to execute kprobe handler.
> > > > >
> > > > > OK so this will replace only one instruction.
> > > > >
> > > > > >
> > > > > > Limitations:
> > > > > > - We only support !CONFIG_RANDOMIZE_MODULE_REGION_FULL case to
> > > > > > guarantee the offset between probe point and kprobe pre_handler
> > > > > > is not larger than 128MiB.
> > > > >
> > > > > Hmm, shouldn't we depends on !CONFIG_ARM64_MODULE_PLTS? Or,
> > > > > allocate an intermediate trampoline area similar to arm optprobe
> > > > > does.
> > > >
> > > > Depending on !CONFIG_ARM64_MODULE_PLTS will totally disable
> > > > RANDOMIZE_BASE according to arch/arm64/Kconfig:
> > > > config RANDOMIZE_BASE
> > > > 	bool "Randomize the address of the kernel image"
> > > > 	select ARM64_MODULE_PLTS if MODULES
> > > > 	select RELOCATABLE
> > >
> > > Yes, but why it is required for "RANDOMIZE_BASE"?
> > > Does that imply the module call might need to use PLT in
> > > some cases?
> > >
> > > >
> > > > Depending on !RANDOMIZE_MODULE_REGION_FULL seems to be still
> > > > allowing RANDOMIZE_BASE via avoiding long jump according to:
> > > > arch/arm64/Kconfig:
> > > >
> > > > config RANDOMIZE_MODULE_REGION_FULL
> > > > 	bool "Randomize the module region over a 4 GB range"
> > > > 	depends on RANDOMIZE_BASE
> > > > 	default y
> > > > 	help
> > > > 	  Randomizes the location of the module region inside a 4 GB window
> > > > 	  covering the core kernel. This way, it is less likely for modules
> > > > 	  to leak information about the location of core kernel data structures
> > > > 	  but it does imply that function calls between modules and the core
> > > > 	  kernel will need to be resolved via veneers in the module PLT.
> > > >
> > > > 	  When this option is not set, the module region will be randomized over
> > > > 	  a limited range that contains the [_stext, _etext] interval of the
> > > > 	  core kernel, so branch relocations are always in range.
> > >
> > > Hmm, this dependency looks strange. If it always in range, don't we need
> > > PLT for modules?
> > >
> > > Cataline, would you know why?
> > > Maybe it's a KASLR's Kconfig issue?
> > 
> > I actually didn't see any problem after making this change:
> > 
> > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> > index e07e7de9ac49..6440671b72e0 100644
> > --- a/arch/arm64/Kconfig
> > +++ b/arch/arm64/Kconfig
> > @@ -1781,7 +1781,6 @@ config RELOCATABLE
> > 
> >  config RANDOMIZE_BASE
> >         bool "Randomize the address of the kernel image"
> > -       select ARM64_MODULE_PLTS if MODULES
> >         select RELOCATABLE
> >         help
> >           Randomizes the virtual address at which the kernel image is
> > @@ -1801,6 +1800,7 @@ config RANDOMIZE_BASE
> >  config RANDOMIZE_MODULE_REGION_FULL
> >         bool "Randomize the module region over a 4 GB range"
> >         depends on RANDOMIZE_BASE
> > +       select ARM64_MODULE_PLTS if MODULES
> >         default y
> >         help
> >           Randomizes the location of the module region inside a 4 GB window
> > 
> > and having this config:
> > # zcat /proc/config.gz | grep RANDOMIZE_BASE
> > CONFIG_RANDOMIZE_BASE=y
> > 
> > # zcat /proc/config.gz | grep RANDOMIZE_MODULE_REGION_FULL
> > # CONFIG_RANDOMIZE_MODULE_REGION_FULL is not set
> > 
> > # zcat /proc/config.gz | grep ARM64_MODULE_PLTS
> > # CONFIG_ARM64_MODULE_PLTS is not set
> > 
> > Modules work all good:
> > # lsmod
> > Module                  Size  Used by
> > btrfs                1355776  0
> > blake2b_generic        20480  0
> > libcrc32c              16384  1 btrfs
> > xor                    20480  1 btrfs
> > xor_neon               16384  1 xor
> > zstd_compress         163840  1 btrfs
> > raid6_pq              110592  1 btrfs
> > ctr                    16384  0
> > md5                    16384  0
> > ip_tunnel              32768  0
> > ipv6                  442368  28
> > 
> > 
> > I am not quite sure if there is a corner case. If no,
> > I would think the kconfig might be some improper.
> 
> The corner case is that even CONFIG_RANDOMIZE_MODULE_REGION_FULL
> is not enabled, but if CONFIG_ARM64_MODULE_PLTS is enabled, when
> we can't get memory from the 128MB area in case the area is exhausted,
> we will fall back in module_alloc() to a 2GB area as long as either
> of the below two conditions is met:
> 
> 1. KASAN is not enabled
> 2. KASAN is enabled and CONFIG_KASAN_VMALLOC is also enabled.
> 
> void *module_alloc(unsigned long size)
> {
> 	u64 module_alloc_end = module_alloc_base + MODULES_VSIZE;
> 	gfp_t gfp_mask = GFP_KERNEL;
> 	void *p;
> 
> 	/* Silence the initial allocation */
> 	if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
> 		gfp_mask |= __GFP_NOWARN;
> 
> 	if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
> 	    IS_ENABLED(CONFIG_KASAN_SW_TAGS))
> 		/* don't exceed the static module region - see below */
> 		module_alloc_end = MODULES_END;
> 
> 	p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
> 				module_alloc_end, gfp_mask, PAGE_KERNEL, 0,
> 				NUMA_NO_NODE, __builtin_return_address(0));
> 
> 	if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) &&
> 	    (IS_ENABLED(CONFIG_KASAN_VMALLOC) ||
> 	     (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
> 	      !IS_ENABLED(CONFIG_KASAN_SW_TAGS))))
> 		/*
> 		 * KASAN without KASAN_VMALLOC can only deal with module
> 		 * allocations being served from the reserved module region,
> 		 * since the remainder of the vmalloc region is already
> 		 * backed by zero shadow pages, and punching holes into it
> 		 * is non-trivial. Since the module region is not randomized
> 		 * when KASAN is enabled without KASAN_VMALLOC, it is even
> 		 * less likely that the module region gets exhausted, so we
> 		 * can simply omit this fallback in that case.
> 		 */
> 		p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
> 				module_alloc_base + SZ_2G, GFP_KERNEL,
> 				PAGE_KERNEL, 0, NUMA_NO_NODE,
> 				__builtin_return_address(0));
> 
> 	if (p && (kasan_module_alloc(p, size) < 0)) {
> 		vfree(p);
> 		return NULL;
> 	}
> 
> 	return p;
> }
> 
> This should be happening quite rarely. But maybe arm64's document
> needs some minor fixup, otherwise, it is quite confusing.

OK, so with CONFIG_KASAN_VMALLOC=y and CONFIG_ARM64_MODULE_PLTS=y,
module_alloc() basically returns memory in the 128MB region, but it can
also return memory in the 2GB region. (This is OK because optprobe can
filter it out.)
But with CONFIG_RANDOMIZE_MODULE_REGION_FULL=y, there is almost no
chance of getting memory in the 128MB region.

Hmm, for optprobes in kernel text, maybe we can define
'optinsn_alloc_start' as 'module_alloc_base - (SZ_2G - MODULES_VADDR)'
and use __vmalloc_node_range() to avoid this issue. But that only helps
the kernel; for modules, we may always be out of the 128MB region.
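
For the kernel-text case, the sketch I have in mind is something like
this (only the idea; the window end, the use of the alloc_optinsn_page()
hook and the omission of any clamping to the vmalloc area are my
assumptions):

void *alloc_optinsn_page(void)
{
	/* Window placed relative to the randomized module/kernel base */
	unsigned long optinsn_alloc_start = module_alloc_base -
					    (SZ_2G - MODULES_VADDR);

	return __vmalloc_node_range(PAGE_SIZE, 1, optinsn_alloc_start,
			module_alloc_base + MODULES_VSIZE, GFP_KERNEL,
			PAGE_KERNEL_ROX, VM_FLUSH_RESET_PERMS, NUMA_NO_NODE,
			__builtin_return_address(0));
}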

Thank you,
Song Bao Hua (Barry Song) July 31, 2021, 12:21 p.m. UTC | #8
> -----Original Message-----
> From: Masami Hiramatsu [mailto:mhiramat@kernel.org]
> Sent: Saturday, July 31, 2021 1:16 PM
> To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>
> Cc: liuqi (BA) <liuqi115@huawei.com>; catalin.marinas@arm.com;
> will@kernel.org; naveen.n.rao@linux.ibm.com; anil.s.keshavamurthy@intel.com;
> davem@davemloft.net; linux-arm-kernel@lists.infradead.org; Zengtao (B)
> <prime.zeng@hisilicon.com>; robin.murphy@arm.com; Linuxarm
> <linuxarm@huawei.com>; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH] arm64: kprobe: Enable OPTPROBE for arm64
> 
> On Fri, 30 Jul 2021 10:04:06 +0000
> "Song Bao Hua (Barry Song)" <song.bao.hua@hisilicon.com> wrote:
> 
> > > > > >
> > > > > > Hi Qi,
> > > > > >
> > > > > > Thanks for your effort!
> > > > > >
> > > > > > On Mon, 19 Jul 2021 20:24:17 +0800
> > > > > > Qi Liu <liuqi115@huawei.com> wrote:
> > > > > >
> > > > > > > This patch introduce optprobe for ARM64. In optprobe, probed
> > > > > > > instruction is replaced by a branch instruction to detour
> > > > > > > buffer. Detour buffer contains trampoline code and a call to
> > > > > > > optimized_callback(). optimized_callback() calls opt_pre_handler()
> > > > > > > to execute kprobe handler.
> > > > > >
> > > > > > OK so this will replace only one instruction.
> > > > > >
> > > > > > >
> > > > > > > Limitations:
> > > > > > > - We only support !CONFIG_RANDOMIZE_MODULE_REGION_FULL case to
> > > > > > > guarantee the offset between probe point and kprobe pre_handler
> > > > > > > is not larger than 128MiB.
> > > > > >
> > > > > > Hmm, shouldn't we depends on !CONFIG_ARM64_MODULE_PLTS? Or,
> > > > > > allocate an intermediate trampoline area similar to arm optprobe
> > > > > > does.
> > > > >
> > > > > Depending on !CONFIG_ARM64_MODULE_PLTS will totally disable
> > > > > RANDOMIZE_BASE according to arch/arm64/Kconfig:
> > > > > config RANDOMIZE_BASE
> > > > > 	bool "Randomize the address of the kernel image"
> > > > > 	select ARM64_MODULE_PLTS if MODULES
> > > > > 	select RELOCATABLE
> > > >
> > > > Yes, but why it is required for "RANDOMIZE_BASE"?
> > > > Does that imply the module call might need to use PLT in
> > > > some cases?
> > > >
> > > > >
> > > > > Depending on !RANDOMIZE_MODULE_REGION_FULL seems to be still
> > > > > allowing RANDOMIZE_BASE via avoiding long jump according to:
> > > > > arch/arm64/Kconfig:
> > > > >
> > > > > config RANDOMIZE_MODULE_REGION_FULL
> > > > > 	bool "Randomize the module region over a 4 GB range"
> > > > > 	depends on RANDOMIZE_BASE
> > > > > 	default y
> > > > > 	help
> > > > > 	  Randomizes the location of the module region inside a 4 GB window
> > > > > 	  covering the core kernel. This way, it is less likely for modules
> > > > > 	  to leak information about the location of core kernel data structures
> > > > > 	  but it does imply that function calls between modules and the core
> > > > > 	  kernel will need to be resolved via veneers in the module PLT.
> > > > >
> > > > > 	  When this option is not set, the module region will be randomized
> over
> > > > > 	  a limited range that contains the [_stext, _etext] interval of the
> > > > > 	  core kernel, so branch relocations are always in range.
> > > >
> > > > Hmm, this dependency looks strange. If it always in range, don't we need
> > > > PLT for modules?
> > > >
> > > > Cataline, would you know why?
> > > > Maybe it's a KASLR's Kconfig issue?
> > >
> > > I actually didn't see any problem after making this change:
> > >
> > > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> > > index e07e7de9ac49..6440671b72e0 100644
> > > --- a/arch/arm64/Kconfig
> > > +++ b/arch/arm64/Kconfig
> > > @@ -1781,7 +1781,6 @@ config RELOCATABLE
> > >
> > >  config RANDOMIZE_BASE
> > >         bool "Randomize the address of the kernel image"
> > > -       select ARM64_MODULE_PLTS if MODULES
> > >         select RELOCATABLE
> > >         help
> > >           Randomizes the virtual address at which the kernel image is
> > > @@ -1801,6 +1800,7 @@ config RANDOMIZE_BASE
> > >  config RANDOMIZE_MODULE_REGION_FULL
> > >         bool "Randomize the module region over a 4 GB range"
> > >         depends on RANDOMIZE_BASE
> > > +       select ARM64_MODULE_PLTS if MODULES
> > >         default y
> > >         help
> > >           Randomizes the location of the module region inside a 4 GB window
> > >
> > > and having this config:
> > > # zcat /proc/config.gz | grep RANDOMIZE_BASE
> > > CONFIG_RANDOMIZE_BASE=y
> > >
> > > # zcat /proc/config.gz | grep RANDOMIZE_MODULE_REGION_FULL
> > > # CONFIG_RANDOMIZE_MODULE_REGION_FULL is not set
> > >
> > > # zcat /proc/config.gz | grep ARM64_MODULE_PLTS
> > > # CONFIG_ARM64_MODULE_PLTS is not set
> > >
> > > Modules work all good:
> > > # lsmod
> > > Module                  Size  Used by
> > > btrfs                1355776  0
> > > blake2b_generic        20480  0
> > > libcrc32c              16384  1 btrfs
> > > xor                    20480  1 btrfs
> > > xor_neon               16384  1 xor
> > > zstd_compress         163840  1 btrfs
> > > raid6_pq              110592  1 btrfs
> > > ctr                    16384  0
> > > md5                    16384  0
> > > ip_tunnel              32768  0
> > > ipv6                  442368  28
> > >
> > >
> > > I am not quite sure if there is a corner case. If no,
> > > I would think the kconfig might be some improper.
> >
> > The corner case is that even CONFIG_RANDOMIZE_MODULE_REGION_FULL
> > is not enabled, but if CONFIG_ARM64_MODULE_PLTS is enabled, when
> > we can't get memory from the 128MB area in case the area is exhausted,
> > we will fall back in module_alloc() to a 2GB area as long as either
> > of the below two conditions is met:
> >
> > 1. KASAN is not enabled
> > 2. KASAN is enabled and CONFIG_KASAN_VMALLOC is also enabled.
> >
> > void *module_alloc(unsigned long size)
> > {
> > 	u64 module_alloc_end = module_alloc_base + MODULES_VSIZE;
> > 	gfp_t gfp_mask = GFP_KERNEL;
> > 	void *p;
> >
> > 	/* Silence the initial allocation */
> > 	if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
> > 		gfp_mask |= __GFP_NOWARN;
> >
> > 	if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
> > 	    IS_ENABLED(CONFIG_KASAN_SW_TAGS))
> > 		/* don't exceed the static module region - see below */
> > 		module_alloc_end = MODULES_END;
> >
> > 	p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
> > 				module_alloc_end, gfp_mask, PAGE_KERNEL, 0,
> > 				NUMA_NO_NODE, __builtin_return_address(0));
> >
> > 	if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) &&
> > 	    (IS_ENABLED(CONFIG_KASAN_VMALLOC) ||
> > 	     (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
> > 	      !IS_ENABLED(CONFIG_KASAN_SW_TAGS))))
> > 		/*
> > 		 * KASAN without KASAN_VMALLOC can only deal with module
> > 		 * allocations being served from the reserved module region,
> > 		 * since the remainder of the vmalloc region is already
> > 		 * backed by zero shadow pages, and punching holes into it
> > 		 * is non-trivial. Since the module region is not randomized
> > 		 * when KASAN is enabled without KASAN_VMALLOC, it is even
> > 		 * less likely that the module region gets exhausted, so we
> > 		 * can simply omit this fallback in that case.
> > 		 */
> > 		p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
> > 				module_alloc_base + SZ_2G, GFP_KERNEL,
> > 				PAGE_KERNEL, 0, NUMA_NO_NODE,
> > 				__builtin_return_address(0));
> >
> > 	if (p && (kasan_module_alloc(p, size) < 0)) {
> > 		vfree(p);
> > 		return NULL;
> > 	}
> >
> > 	return p;
> > }
> >
> > This should be happening quite rarely. But maybe arm64's document
> > needs some minor fixup, otherwise, it is quite confusing.
> 
> OK, so CONFIG_KASAN_VLALLOC=y and CONFIG_ARM64_MODULE_PLTS=y, the
> module_alloc()
> basically returns the memory in 128MB region, but can return the memory in 2GB
> region. (This is OK because optprobe can filter it out)
> But CONFIG_RANDOMIZE_MODULE_REGION_FULL=y, there is almost no chance to get
> the memory in 128MB region.
> 
> Hmm, for the optprobe in kernel text, maybe we can define 'optinsn_alloc_start'
> by 'module_alloc_base - (SZ_2G - MODULES_VADDR)' and use __vmalloc_node_range()
> to avoid this issue. But that is only for the kernel. For the modules, we may
> always out of 128MB region.

If we can have some separate PLT entries in each module for optprobe,
we should be able to short-jump to the PLT entry, and the PLT entry will
then long-jump to the detour buffer outside the 128MB range. That is
exactly what a PLT is for (a rough sketch of the idea follows the ftrace
example below).

Right now, arm64 already supports dynamic ftrace this way, by adding a
section to each module for the ftrace PLT.
arch/arm64/include/asm/module.lds.h:
SECTIONS {
#ifdef CONFIG_ARM64_MODULE_PLTS
	.plt 0 (NOLOAD) : { BYTE(0) }
	.init.plt 0 (NOLOAD) : { BYTE(0) }
	.text.ftrace_trampoline 0 (NOLOAD) : { BYTE(0) }
#endif
...
}

arch/arm64/kernel/module.c will initialize some PLT entries
for ftrace:

static int module_init_ftrace_plt(const Elf_Ehdr *hdr,
				  const Elf_Shdr *sechdrs,
				  struct module *mod)
{
#if defined(CONFIG_ARM64_MODULE_PLTS) && defined(CONFIG_DYNAMIC_FTRACE)
	const Elf_Shdr *s;
	struct plt_entry *plts;

	s = find_section(hdr, sechdrs, ".text.ftrace_trampoline");
	if (!s)
		return -ENOEXEC;

	plts = (void *)s->sh_addr;

	__init_plt(&plts[FTRACE_PLT_IDX], FTRACE_ADDR);

	if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS))
		__init_plt(&plts[FTRACE_REGS_PLT_IDX], FTRACE_REGS_ADDR);

	mod->arch.ftrace_trampolines = plts;
#endif
	return 0;
}

Ftrace will then use those PLT entries in arch/arm64/kernel/ftrace.c:
static struct plt_entry *get_ftrace_plt(struct module *mod, unsigned long addr)
{
#ifdef CONFIG_ARM64_MODULE_PLTS
	struct plt_entry *plt = mod->arch.ftrace_trampolines;

	if (addr == FTRACE_ADDR)
		return &plt[FTRACE_PLT_IDX];
	if (addr == FTRACE_REGS_ADDR &&
	    IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS))
		return &plt[FTRACE_REGS_PLT_IDX];
#endif
	return NULL;
}

/*
 * Turn on the call to ftrace_caller() in instrumented function
 */
int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
{
	unsigned long pc = rec->ip;
	u32 old, new;
	long offset = (long)pc - (long)addr;

	if (offset < -SZ_128M || offset >= SZ_128M) {
		struct module *mod;
		struct plt_entry *plt;

		if (!IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
			return -EINVAL;

		/*
		 * On kernels that support module PLTs, the offset between the
		 * branch instruction and its target may legally exceed the
		 * range of an ordinary relative 'bl' opcode. In this case, we
		 * need to branch via a trampoline in the module.
		 *
		 * NOTE: __module_text_address() must be called with preemption
		 * disabled, but we can rely on ftrace_lock to ensure that 'mod'
		 * retains its validity throughout the remainder of this code.
		 */
		preempt_disable();
		mod = __module_text_address(pc);
		preempt_enable();

		if (WARN_ON(!mod))
			return -EINVAL;

		plt = get_ftrace_plt(mod, addr);
		if (!plt) {
			pr_err("ftrace: no module PLT for %ps\n", (void *)addr);
			return -EINVAL;
		}

		addr = (unsigned long)plt;
	}

	old = aarch64_insn_gen_nop();
	new = aarch64_insn_gen_branch_imm(pc, addr, AARCH64_INSN_BRANCH_LINK);

	return ftrace_modify_code(pc, old, new, true);
}

This might be the direction to go later. Anyway, "Rome wasn't built in
a day"; for this stage, we might focus on optprobe for the case of a
non-randomized module region :-).
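
Just to make the idea concrete, here is a purely illustrative sketch of
the optprobe equivalent (nothing like this exists yet: the section name,
OPTPROBE_PLT_IDX, mod->arch.optprobe_trampoline and
optprobe_common_trampoline are all made up, and a matching section entry
would also be needed in arch/arm64/include/asm/module.lds.h):

static int module_init_optprobe_plt(const Elf_Ehdr *hdr,
				    const Elf_Shdr *sechdrs,
				    struct module *mod)
{
#if defined(CONFIG_ARM64_MODULE_PLTS) && defined(CONFIG_OPTPROBES)
	const Elf_Shdr *s;
	struct plt_entry *plts;

	s = find_section(hdr, sechdrs, ".text.kprobe_trampoline");
	if (!s)
		return -ENOEXEC;

	plts = (void *)s->sh_addr;

	/* One veneer per module that long-jumps to the common detour code */
	__init_plt(&plts[OPTPROBE_PLT_IDX],
		   (unsigned long)optprobe_common_trampoline);

	mod->arch.optprobe_trampoline = plts;
#endif
	return 0;
}

arch_prepare_optimized_kprobe() could then branch to that veneer whenever
the detour buffer itself is out of 'b' range, much like ftrace_make_call()
above.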

BTW, @liuqi, if users set "nokaslr" in bootargs, will your optprobe
always work and not fall back to a normal kprobe, even if we remove the
dependency on RANDOMIZE_MODULE_REGION_FULL?

> 
> Thank you,
> 
> --
> Masami Hiramatsu <mhiramat@kernel.org>

Thanks
Barry
liuqi (BA) Aug. 2, 2021, 3:52 a.m. UTC | #9
On 2021/7/31 20:21, Song Bao Hua (Barry Song) wrote:
> 
> 
>> -----Original Message-----
>> From: Masami Hiramatsu [mailto:mhiramat@kernel.org]
>> Sent: Saturday, July 31, 2021 1:16 PM
>> To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>
>> Cc: liuqi (BA) <liuqi115@huawei.com>; catalin.marinas@arm.com;
>> will@kernel.org; naveen.n.rao@linux.ibm.com; anil.s.keshavamurthy@intel.com;
>> davem@davemloft.net; linux-arm-kernel@lists.infradead.org; Zengtao (B)
>> <prime.zeng@hisilicon.com>; robin.murphy@arm.com; Linuxarm
>> <linuxarm@huawei.com>; linux-kernel@vger.kernel.org
>> Subject: Re: [PATCH] arm64: kprobe: Enable OPTPROBE for arm64
>>
>> On Fri, 30 Jul 2021 10:04:06 +0000
>> "Song Bao Hua (Barry Song)" <song.bao.hua@hisilicon.com> wrote:
>>
>>>>>>>
>>>>>>> Hi Qi,
>>>>>>>
>>>>>>> Thanks for your effort!
>>>>>>>
>>>>>>> On Mon, 19 Jul 2021 20:24:17 +0800
>>>>>>> Qi Liu <liuqi115@huawei.com> wrote:
>>>>>>>
>>>>>>>> This patch introduce optprobe for ARM64. In optprobe, probed
>>>>>>>> instruction is replaced by a branch instruction to detour
>>>>>>>> buffer. Detour buffer contains trampoline code and a call to
>>>>>>>> optimized_callback(). optimized_callback() calls opt_pre_handler()
>>>>>>>> to execute kprobe handler.
>>>>>>>
>>>>>>> OK so this will replace only one instruction.
>>>>>>>
>>>>>>>>
>>>>>>>> Limitations:
>>>>>>>> - We only support !CONFIG_RANDOMIZE_MODULE_REGION_FULL case to
>>>>>>>> guarantee the offset between probe point and kprobe pre_handler
>>>>>>>> is not larger than 128MiB.
>>>>>>>
>>>>>>> Hmm, shouldn't we depends on !CONFIG_ARM64_MODULE_PLTS? Or,
>>>>>>> allocate an intermediate trampoline area similar to arm optprobe
>>>>>>> does.
>>>>>>
>>>>>> Depending on !CONFIG_ARM64_MODULE_PLTS will totally disable
>>>>>> RANDOMIZE_BASE according to arch/arm64/Kconfig:
>>>>>> config RANDOMIZE_BASE
>>>>>> 	bool "Randomize the address of the kernel image"
>>>>>> 	select ARM64_MODULE_PLTS if MODULES
>>>>>> 	select RELOCATABLE
>>>>>
>>>>> Yes, but why it is required for "RANDOMIZE_BASE"?
>>>>> Does that imply the module call might need to use PLT in
>>>>> some cases?
>>>>>
>>>>>>
>>>>>> Depending on !RANDOMIZE_MODULE_REGION_FULL seems to be still
>>>>>> allowing RANDOMIZE_BASE via avoiding long jump according to:
>>>>>> arch/arm64/Kconfig:
>>>>>>
>>>>>> config RANDOMIZE_MODULE_REGION_FULL
>>>>>> 	bool "Randomize the module region over a 4 GB range"
>>>>>> 	depends on RANDOMIZE_BASE
>>>>>> 	default y
>>>>>> 	help
>>>>>> 	  Randomizes the location of the module region inside a 4 GB window
>>>>>> 	  covering the core kernel. This way, it is less likely for modules
>>>>>> 	  to leak information about the location of core kernel data structures
>>>>>> 	  but it does imply that function calls between modules and the core
>>>>>> 	  kernel will need to be resolved via veneers in the module PLT.
>>>>>>
>>>>>> 	  When this option is not set, the module region will be randomized
>> over
>>>>>> 	  a limited range that contains the [_stext, _etext] interval of the
>>>>>> 	  core kernel, so branch relocations are always in range.
>>>>>
>>>>> Hmm, this dependency looks strange. If it always in range, don't we need
>>>>> PLT for modules?
>>>>>
>>>>> Cataline, would you know why?
>>>>> Maybe it's a KASLR's Kconfig issue?
>>>>
>>>> I actually didn't see any problem after making this change:
>>>>
>>>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>>>> index e07e7de9ac49..6440671b72e0 100644
>>>> --- a/arch/arm64/Kconfig
>>>> +++ b/arch/arm64/Kconfig
>>>> @@ -1781,7 +1781,6 @@ config RELOCATABLE
>>>>
>>>>   config RANDOMIZE_BASE
>>>>          bool "Randomize the address of the kernel image"
>>>> -       select ARM64_MODULE_PLTS if MODULES
>>>>          select RELOCATABLE
>>>>          help
>>>>            Randomizes the virtual address at which the kernel image is
>>>> @@ -1801,6 +1800,7 @@ config RANDOMIZE_BASE
>>>>   config RANDOMIZE_MODULE_REGION_FULL
>>>>          bool "Randomize the module region over a 4 GB range"
>>>>          depends on RANDOMIZE_BASE
>>>> +       select ARM64_MODULE_PLTS if MODULES
>>>>          default y
>>>>          help
>>>>            Randomizes the location of the module region inside a 4 GB window
>>>>
>>>> and having this config:
>>>> # zcat /proc/config.gz | grep RANDOMIZE_BASE
>>>> CONFIG_RANDOMIZE_BASE=y
>>>>
>>>> # zcat /proc/config.gz | grep RANDOMIZE_MODULE_REGION_FULL
>>>> # CONFIG_RANDOMIZE_MODULE_REGION_FULL is not set
>>>>
>>>> # zcat /proc/config.gz | grep ARM64_MODULE_PLTS
>>>> # CONFIG_ARM64_MODULE_PLTS is not set
>>>>
>>>> Modules work all good:
>>>> # lsmod
>>>> Module                  Size  Used by
>>>> btrfs                1355776  0
>>>> blake2b_generic        20480  0
>>>> libcrc32c              16384  1 btrfs
>>>> xor                    20480  1 btrfs
>>>> xor_neon               16384  1 xor
>>>> zstd_compress         163840  1 btrfs
>>>> raid6_pq              110592  1 btrfs
>>>> ctr                    16384  0
>>>> md5                    16384  0
>>>> ip_tunnel              32768  0
>>>> ipv6                  442368  28
>>>>
>>>>
>>>> I am not quite sure if there is a corner case. If no,
>>>> I would think the kconfig might be some improper.
>>>
>>> The corner case is that even CONFIG_RANDOMIZE_MODULE_REGION_FULL
>>> is not enabled, but if CONFIG_ARM64_MODULE_PLTS is enabled, when
>>> we can't get memory from the 128MB area in case the area is exhausted,
>>> we will fall back in module_alloc() to a 2GB area as long as either
>>> of the below two conditions is met:
>>>
>>> 1. KASAN is not enabled
>>> 2. KASAN is enabled and CONFIG_KASAN_VMALLOC is also enabled.
>>>
>>> void *module_alloc(unsigned long size)
>>> {
>>> 	u64 module_alloc_end = module_alloc_base + MODULES_VSIZE;
>>> 	gfp_t gfp_mask = GFP_KERNEL;
>>> 	void *p;
>>>
>>> 	/* Silence the initial allocation */
>>> 	if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
>>> 		gfp_mask |= __GFP_NOWARN;
>>>
>>> 	if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
>>> 	    IS_ENABLED(CONFIG_KASAN_SW_TAGS))
>>> 		/* don't exceed the static module region - see below */
>>> 		module_alloc_end = MODULES_END;
>>>
>>> 	p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
>>> 				module_alloc_end, gfp_mask, PAGE_KERNEL, 0,
>>> 				NUMA_NO_NODE, __builtin_return_address(0));
>>>
>>> 	if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) &&
>>> 	    (IS_ENABLED(CONFIG_KASAN_VMALLOC) ||
>>> 	     (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
>>> 	      !IS_ENABLED(CONFIG_KASAN_SW_TAGS))))
>>> 		/*
>>> 		 * KASAN without KASAN_VMALLOC can only deal with module
>>> 		 * allocations being served from the reserved module region,
>>> 		 * since the remainder of the vmalloc region is already
>>> 		 * backed by zero shadow pages, and punching holes into it
>>> 		 * is non-trivial. Since the module region is not randomized
>>> 		 * when KASAN is enabled without KASAN_VMALLOC, it is even
>>> 		 * less likely that the module region gets exhausted, so we
>>> 		 * can simply omit this fallback in that case.
>>> 		 */
>>> 		p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
>>> 				module_alloc_base + SZ_2G, GFP_KERNEL,
>>> 				PAGE_KERNEL, 0, NUMA_NO_NODE,
>>> 				__builtin_return_address(0));
>>>
>>> 	if (p && (kasan_module_alloc(p, size) < 0)) {
>>> 		vfree(p);
>>> 		return NULL;
>>> 	}
>>>
>>> 	return p;
>>> }
>>>
>>> This should be happening quite rarely. But maybe arm64's document
>>> needs some minor fixup, otherwise, it is quite confusing.
>>
>> OK, so CONFIG_KASAN_VLALLOC=y and CONFIG_ARM64_MODULE_PLTS=y, the
>> module_alloc()
>> basically returns the memory in 128MB region, but can return the memory in 2GB
>> region. (This is OK because optprobe can filter it out)
>> But CONFIG_RANDOMIZE_MODULE_REGION_FULL=y, there is almost no chance to get
>> the memory in 128MB region.
>>
>> Hmm, for the optprobe in kernel text, maybe we can define 'optinsn_alloc_start'
>> by 'module_alloc_base - (SZ_2G - MODULES_VADDR)' and use __vmalloc_node_range()
>> to avoid this issue. But that is only for the kernel. For the modules, we may
>> always out of 128MB region.
> 
> If we can have some separate PLT entries in each module for optprobe,
> we should be able to short-jump to the PLT entry and then PLT entry
> will further long-jump to detour out of the range. That is exactly
> the duty of PLT.
> 
> Right now, arm64 has support on dynamic_ftrace by adding a
> section in module for ftrace PLT.
> arch/arm64/include/asm/module.lds.h:
> SECTIONS {
> #ifdef CONFIG_ARM64_MODULE_PLTS
> 	.plt 0 (NOLOAD) : { BYTE(0) }
> 	.init.plt 0 (NOLOAD) : { BYTE(0) }
> 	.text.ftrace_trampoline 0 (NOLOAD) : { BYTE(0) }
> #endif
> ...
> }
> 
> arch/arm64/kernel/module.c will initialize some PLT entries
> for ftrace:
> 
> static int module_init_ftrace_plt(const Elf_Ehdr *hdr,
> 				  const Elf_Shdr *sechdrs,
> 				  struct module *mod)
> {
> #if defined(CONFIG_ARM64_MODULE_PLTS) && defined(CONFIG_DYNAMIC_FTRACE)
> 	const Elf_Shdr *s;
> 	struct plt_entry *plts;
> 
> 	s = find_section(hdr, sechdrs, ".text.ftrace_trampoline");
> 	if (!s)
> 		return -ENOEXEC;
> 
> 	plts = (void *)s->sh_addr;
> 
> 	__init_plt(&plts[FTRACE_PLT_IDX], FTRACE_ADDR);
> 
> 	if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS))
> 		__init_plt(&plts[FTRACE_REGS_PLT_IDX], FTRACE_REGS_ADDR);
> 
> 	mod->arch.ftrace_trampolines = plts;
> #endif
> 	return 0;
> }
> 
> Ftrace will then use those PLT entries in arch/arm64/kernel/ftrace.c:
> static struct plt_entry *get_ftrace_plt(struct module *mod, unsigned long addr)
> {
> #ifdef CONFIG_ARM64_MODULE_PLTS
> 	struct plt_entry *plt = mod->arch.ftrace_trampolines;
> 
> 	if (addr == FTRACE_ADDR)
> 		return &plt[FTRACE_PLT_IDX];
> 	if (addr == FTRACE_REGS_ADDR &&
> 	    IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS))
> 		return &plt[FTRACE_REGS_PLT_IDX];
> #endif
> 	return NULL;
> }
> 
> /*
>   * Turn on the call to ftrace_caller() in instrumented function
>   */
> int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
> {
> 	unsigned long pc = rec->ip;
> 	u32 old, new;
> 	long offset = (long)pc - (long)addr;
> 
> 	if (offset < -SZ_128M || offset >= SZ_128M) {
> 		struct module *mod;
> 		struct plt_entry *plt;
> 
> 		if (!IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
> 			return -EINVAL;
> 
> 		/*
> 		 * On kernels that support module PLTs, the offset between the
> 		 * branch instruction and its target may legally exceed the
> 		 * range of an ordinary relative 'bl' opcode. In this case, we
> 		 * need to branch via a trampoline in the module.
> 		 *
> 		 * NOTE: __module_text_address() must be called with preemption
> 		 * disabled, but we can rely on ftrace_lock to ensure that 'mod'
> 		 * retains its validity throughout the remainder of this code.
> 		 */
> 		preempt_disable();
> 		mod = __module_text_address(pc);
> 		preempt_enable();
> 
> 		if (WARN_ON(!mod))
> 			return -EINVAL;
> 
> 		plt = get_ftrace_plt(mod, addr);
> 		if (!plt) {
> 			pr_err("ftrace: no module PLT for %ps\n", (void *)addr);
> 			return -EINVAL;
> 		}
> 
> 		addr = (unsigned long)plt;
> 	}
> 
> 	old = aarch64_insn_gen_nop();
> 	new = aarch64_insn_gen_branch_imm(pc, addr, AARCH64_INSN_BRANCH_LINK);
> 
> 	return ftrace_modify_code(pc, old, new, true);
> }
> 
> This might be the direction to go later. Anyway, "Rome wasn't built
> in a day", for this stage, we might focus on optprobe for the case
> of non-randomized module region :-).
> 
> BTW, @liuqi, if users set "nokaslr" in bootargs, will your optprobe
> always work and not fall back to normal kprobe even we remove the
> dependency on RANDOMIZED_MODULE_REGION_FULL?
> 
Hi Barry,

I did some tests on the Hip08 platform, using nokaslr in the boot
cmdline and removing the dependency on RANDOMIZE_MODULE_REGION_FULL;
optprobe seems to work.
Here is the log:

estuary:/$ uname -a
Linux (none) 5.13.0-rc4+ #37 SMP PREEMPT Mon Aug 2 08:13:37 CST 2021 
aarch64 GNU/Linux
estuary:/$ zcat /proc/config.gz | grep RANDOMIZE_MODULE_REGION
CONFIG_RANDOMIZE_MODULE_REGION_FULL=y
estuary:/$ zcat /proc/config.gz | grep OPTPROBE
CONFIG_OPTPROBES=y
CONFIG_HAVE_OPTPROBES=y
estuary:/$ cat /proc/cmdline
console=ttyAMA0,115200 earlycon=pl011,0x9000000 kpti=off nokaslr
estuary:/$ cat /sys/bus/platform/devices/hello_driver/kprobe_test
[   61.304143] do_empty returned 0 and took 200 ns to execute
[   61.304662] do_empty returned 0 and took 110 ns to execute
[   61.305196] do_empty returned 0 and took 100 ns to execute
[   61.305745] do_empty returned 0 and took 90 ns to execute
[   61.306262] do_empty returned 0 and took 90 ns to execute
[   61.306781] do_empty returned 0 and took 90 ns to execute
[   61.307286] do_empty returned 0 and took 90 ns to execute
[   61.307798] do_empty returned 0 and took 90 ns to execute
[   61.308314] do_empty returned 0 and took 90 ns to execute
[   61.308828] do_empty returned 0 and took 90 ns to execute
[   61.309323] do_empty returned 0 and took 80 ns to execute
[   61.309832] do_empty returned 0 and took 80 ns to execute
[   61.310357] do_empty returned 0 and took 80 ns to execute
[   61.310871] do_empty returned 0 and took 80 ns to execute
[   61.311361] do_empty returned 0 and took 80 ns to execute
[   61.311851] do_empty returned 0 and took 90 ns to execute
[   61.312358] do_empty returned 0 and took 90 ns to execute
[   61.312879] do_empty returned 0 and took 80 ns to execute

Thanks,
Qi

>>
>> Thank you,
>>
>> --
>> Masami Hiramatsu <mhiramat@kernel.org>
> 
> Thanks
> Barry
> .
>
liuqi (BA) Aug. 2, 2021, 3:59 a.m. UTC | #10
On 2021/8/2 11:52, liuqi (BA) wrote:
> 
> 
> On 2021/7/31 20:21, Song Bao Hua (Barry Song) wrote:
>>
>>
>>> -----Original Message-----
>>> From: Masami Hiramatsu [mailto:mhiramat@kernel.org]
>>> Sent: Saturday, July 31, 2021 1:16 PM
>>> To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>
>>> Cc: liuqi (BA) <liuqi115@huawei.com>; catalin.marinas@arm.com;
>>> will@kernel.org; naveen.n.rao@linux.ibm.com; 
>>> anil.s.keshavamurthy@intel.com;
>>> davem@davemloft.net; linux-arm-kernel@lists.infradead.org; Zengtao (B)
>>> <prime.zeng@hisilicon.com>; robin.murphy@arm.com; Linuxarm
>>> <linuxarm@huawei.com>; linux-kernel@vger.kernel.org
>>> Subject: Re: [PATCH] arm64: kprobe: Enable OPTPROBE for arm64
>>>
>>> On Fri, 30 Jul 2021 10:04:06 +0000
>>> "Song Bao Hua (Barry Song)" <song.bao.hua@hisilicon.com> wrote:
>>>
>>>>>>>>
>>>>>>>> Hi Qi,
>>>>>>>>
>>>>>>>> Thanks for your effort!
>>>>>>>>
>>>>>>>> On Mon, 19 Jul 2021 20:24:17 +0800
>>>>>>>> Qi Liu <liuqi115@huawei.com> wrote:
>>>>>>>>
>>>>>>>>> This patch introduce optprobe for ARM64. In optprobe, probed
>>>>>>>>> instruction is replaced by a branch instruction to detour
>>>>>>>>> buffer. Detour buffer contains trampoline code and a call to
>>>>>>>>> optimized_callback(). optimized_callback() calls opt_pre_handler()
>>>>>>>>> to execute kprobe handler.
>>>>>>>>
>>>>>>>> OK so this will replace only one instruction.
>>>>>>>>
>>>>>>>>>
>>>>>>>>> Limitations:
>>>>>>>>> - We only support !CONFIG_RANDOMIZE_MODULE_REGION_FULL case to
>>>>>>>>> guarantee the offset between probe point and kprobe pre_handler
>>>>>>>>> is not larger than 128MiB.
>>>>>>>>
>>>>>>>> Hmm, shouldn't we depends on !CONFIG_ARM64_MODULE_PLTS? Or,
>>>>>>>> allocate an intermediate trampoline area similar to arm optprobe
>>>>>>>> does.
>>>>>>>
>>>>>>> Depending on !CONFIG_ARM64_MODULE_PLTS will totally disable
>>>>>>> RANDOMIZE_BASE according to arch/arm64/Kconfig:
>>>>>>> config RANDOMIZE_BASE
>>>>>>>     bool "Randomize the address of the kernel image"
>>>>>>>     select ARM64_MODULE_PLTS if MODULES
>>>>>>>     select RELOCATABLE
>>>>>>
>>>>>> Yes, but why it is required for "RANDOMIZE_BASE"?
>>>>>> Does that imply the module call might need to use PLT in
>>>>>> some cases?
>>>>>>
>>>>>>>
>>>>>>> Depending on !RANDOMIZE_MODULE_REGION_FULL seems to be still
>>>>>>> allowing RANDOMIZE_BASE via avoiding long jump according to:
>>>>>>> arch/arm64/Kconfig:
>>>>>>>
>>>>>>> config RANDOMIZE_MODULE_REGION_FULL
>>>>>>>     bool "Randomize the module region over a 4 GB range"
>>>>>>>     depends on RANDOMIZE_BASE
>>>>>>>     default y
>>>>>>>     help
>>>>>>>       Randomizes the location of the module region inside a 4 GB 
>>>>>>> window
>>>>>>>       covering the core kernel. This way, it is less likely for 
>>>>>>> modules
>>>>>>>       to leak information about the location of core kernel data 
>>>>>>> structures
>>>>>>>       but it does imply that function calls between modules and 
>>>>>>> the core
>>>>>>>       kernel will need to be resolved via veneers in the module PLT.
>>>>>>>
>>>>>>>       When this option is not set, the module region will be 
>>>>>>> randomized
>>> over
>>>>>>>       a limited range that contains the [_stext, _etext] interval 
>>>>>>> of the
>>>>>>>       core kernel, so branch relocations are always in range.
>>>>>>
>>>>>> Hmm, this dependency looks strange. If it always in range, don't 
>>>>>> we need
>>>>>> PLT for modules?
>>>>>>
>>>>>> Cataline, would you know why?
>>>>>> Maybe it's a KASLR's Kconfig issue?
>>>>>
>>>>> I actually didn't see any problem after making this change:
>>>>>
>>>>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>>>>> index e07e7de9ac49..6440671b72e0 100644
>>>>> --- a/arch/arm64/Kconfig
>>>>> +++ b/arch/arm64/Kconfig
>>>>> @@ -1781,7 +1781,6 @@ config RELOCATABLE
>>>>>
>>>>>   config RANDOMIZE_BASE
>>>>>          bool "Randomize the address of the kernel image"
>>>>> -       select ARM64_MODULE_PLTS if MODULES
>>>>>          select RELOCATABLE
>>>>>          help
>>>>>            Randomizes the virtual address at which the kernel image is
>>>>> @@ -1801,6 +1800,7 @@ config RANDOMIZE_BASE
>>>>>   config RANDOMIZE_MODULE_REGION_FULL
>>>>>          bool "Randomize the module region over a 4 GB range"
>>>>>          depends on RANDOMIZE_BASE
>>>>> +       select ARM64_MODULE_PLTS if MODULES
>>>>>          default y
>>>>>          help
>>>>>            Randomizes the location of the module region inside a 4 
>>>>> GB window
>>>>>
>>>>> and having this config:
>>>>> # zcat /proc/config.gz | grep RANDOMIZE_BASE
>>>>> CONFIG_RANDOMIZE_BASE=y
>>>>>
>>>>> # zcat /proc/config.gz | grep RANDOMIZE_MODULE_REGION_FULL
>>>>> # CONFIG_RANDOMIZE_MODULE_REGION_FULL is not set
>>>>>
>>>>> # zcat /proc/config.gz | grep ARM64_MODULE_PLTS
>>>>> # CONFIG_ARM64_MODULE_PLTS is not set
>>>>>
>>>>> Modules work all good:
>>>>> # lsmod
>>>>> Module                  Size  Used by
>>>>> btrfs                1355776  0
>>>>> blake2b_generic        20480  0
>>>>> libcrc32c              16384  1 btrfs
>>>>> xor                    20480  1 btrfs
>>>>> xor_neon               16384  1 xor
>>>>> zstd_compress         163840  1 btrfs
>>>>> raid6_pq              110592  1 btrfs
>>>>> ctr                    16384  0
>>>>> md5                    16384  0
>>>>> ip_tunnel              32768  0
>>>>> ipv6                  442368  28
>>>>>
>>>>>
>>>>> I am not quite sure if there is a corner case. If no,
>>>>> I would think the kconfig might be some improper.
>>>>
>>>> The corner case is that even CONFIG_RANDOMIZE_MODULE_REGION_FULL
>>>> is not enabled, but if CONFIG_ARM64_MODULE_PLTS is enabled, when
>>>> we can't get memory from the 128MB area in case the area is exhausted,
>>>> we will fall back in module_alloc() to a 2GB area as long as either
>>>> of the below two conditions is met:
>>>>
>>>> 1. KASAN is not enabled
>>>> 2. KASAN is enabled and CONFIG_KASAN_VMALLOC is also enabled.
>>>>
>>>> void *module_alloc(unsigned long size)
>>>> {
>>>>     u64 module_alloc_end = module_alloc_base + MODULES_VSIZE;
>>>>     gfp_t gfp_mask = GFP_KERNEL;
>>>>     void *p;
>>>>
>>>>     /* Silence the initial allocation */
>>>>     if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
>>>>         gfp_mask |= __GFP_NOWARN;
>>>>
>>>>     if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
>>>>         IS_ENABLED(CONFIG_KASAN_SW_TAGS))
>>>>         /* don't exceed the static module region - see below */
>>>>         module_alloc_end = MODULES_END;
>>>>
>>>>     p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
>>>>                 module_alloc_end, gfp_mask, PAGE_KERNEL, 0,
>>>>                 NUMA_NO_NODE, __builtin_return_address(0));
>>>>
>>>>     if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) &&
>>>>         (IS_ENABLED(CONFIG_KASAN_VMALLOC) ||
>>>>          (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
>>>>           !IS_ENABLED(CONFIG_KASAN_SW_TAGS))))
>>>>         /*
>>>>          * KASAN without KASAN_VMALLOC can only deal with module
>>>>          * allocations being served from the reserved module region,
>>>>          * since the remainder of the vmalloc region is already
>>>>          * backed by zero shadow pages, and punching holes into it
>>>>          * is non-trivial. Since the module region is not randomized
>>>>          * when KASAN is enabled without KASAN_VMALLOC, it is even
>>>>          * less likely that the module region gets exhausted, so we
>>>>          * can simply omit this fallback in that case.
>>>>          */
>>>>         p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
>>>>                 module_alloc_base + SZ_2G, GFP_KERNEL,
>>>>                 PAGE_KERNEL, 0, NUMA_NO_NODE,
>>>>                 __builtin_return_address(0));
>>>>
>>>>     if (p && (kasan_module_alloc(p, size) < 0)) {
>>>>         vfree(p);
>>>>         return NULL;
>>>>     }
>>>>
>>>>     return p;
>>>> }
>>>>
>>>> This should happen quite rarely. But maybe arm64's documentation
>>>> needs some minor fixup, otherwise it is quite confusing.
>>>
>>> OK, so with CONFIG_KASAN_VMALLOC=y and CONFIG_ARM64_MODULE_PLTS=y,
>>> module_alloc() basically returns memory in the 128MB region, but can
>>> return memory in the 2GB region. (This is OK because optprobe can
>>> filter it out.)
>>> But with CONFIG_RANDOMIZE_MODULE_REGION_FULL=y, there is almost no
>>> chance to get memory in the 128MB region.
>>>
>>> Hmm, for optprobes in kernel text, maybe we can define
>>> 'optinsn_alloc_start' as 'module_alloc_base - (SZ_2G - MODULES_VADDR)'
>>> and use __vmalloc_node_range() to avoid this issue. But that only
>>> covers the kernel; for modules, we may always be out of the 128MB
>>> region.
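
A minimal sketch of that suggestion, for illustration only and not part of
the posted patch: arm64 could override the weak alloc_optinsn_page() hook so
that detour buffers for kernel-text probes come from a window derived from
module_alloc_base. The bounds below reuse the expression quoted above
verbatim and are unvalidated assumptions; the allocation flags simply mirror
the existing alloc_insn_page(). Assume the function lives next to
alloc_insn_page() in arch/arm64/kernel/probes/kprobes.c, so the needed
declarations are already in scope.

/*
 * Illustrative sketch only: the window bounds are taken verbatim from the
 * suggestion above and have not been validated.
 */
void *alloc_optinsn_page(void)
{
	unsigned long optinsn_alloc_start =
			module_alloc_base - (SZ_2G - MODULES_VADDR);

	return __vmalloc_node_range(PAGE_SIZE, 1, optinsn_alloc_start,
			module_alloc_base + MODULES_VSIZE,
			GFP_KERNEL, PAGE_KERNEL_ROX, VM_FLUSH_RESET_PERMS,
			NUMA_NO_NODE, __builtin_return_address(0));
}
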
>>
>> If we can have some separate PLT entries in each module for optprobe,
>> we should be able to short-jump to the PLT entry, and the PLT entry
>> will then long-jump to the detour buffer outside the range. That is
>> exactly the duty of a PLT.
>>
>> Right now, arm64 supports dynamic ftrace by adding a section to each
>> module for the ftrace PLT.
>> arch/arm64/include/asm/module.lds.h:
>> SECTIONS {
>> #ifdef CONFIG_ARM64_MODULE_PLTS
>>     .plt 0 (NOLOAD) : { BYTE(0) }
>>     .init.plt 0 (NOLOAD) : { BYTE(0) }
>>     .text.ftrace_trampoline 0 (NOLOAD) : { BYTE(0) }
>> #endif
>> ...
>> }
>>
>> arch/arm64/kernel/module.c will initialize some PLT entries
>> for ftrace:
>>
>> static int module_init_ftrace_plt(const Elf_Ehdr *hdr,
>>                   const Elf_Shdr *sechdrs,
>>                   struct module *mod)
>> {
>> #if defined(CONFIG_ARM64_MODULE_PLTS) && defined(CONFIG_DYNAMIC_FTRACE)
>>     const Elf_Shdr *s;
>>     struct plt_entry *plts;
>>
>>     s = find_section(hdr, sechdrs, ".text.ftrace_trampoline");
>>     if (!s)
>>         return -ENOEXEC;
>>
>>     plts = (void *)s->sh_addr;
>>
>>     __init_plt(&plts[FTRACE_PLT_IDX], FTRACE_ADDR);
>>
>>     if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS))
>>         __init_plt(&plts[FTRACE_REGS_PLT_IDX], FTRACE_REGS_ADDR);
>>
>>     mod->arch.ftrace_trampolines = plts;
>> #endif
>>     return 0;
>> }
>>
>> Ftrace will then use those PLT entries in arch/arm64/kernel/ftrace.c:
>> static struct plt_entry *get_ftrace_plt(struct module *mod, unsigned long addr)
>> {
>> #ifdef CONFIG_ARM64_MODULE_PLTS
>>     struct plt_entry *plt = mod->arch.ftrace_trampolines;
>>
>>     if (addr == FTRACE_ADDR)
>>         return &plt[FTRACE_PLT_IDX];
>>     if (addr == FTRACE_REGS_ADDR &&
>>         IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS))
>>         return &plt[FTRACE_REGS_PLT_IDX];
>> #endif
>>     return NULL;
>> }
>>
>> /*
>>   * Turn on the call to ftrace_caller() in instrumented function
>>   */
>> int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
>> {
>>     unsigned long pc = rec->ip;
>>     u32 old, new;
>>     long offset = (long)pc - (long)addr;
>>
>>     if (offset < -SZ_128M || offset >= SZ_128M) {
>>         struct module *mod;
>>         struct plt_entry *plt;
>>
>>         if (!IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
>>             return -EINVAL;
>>
>>         /*
>>          * On kernels that support module PLTs, the offset between the
>>          * branch instruction and its target may legally exceed the
>>          * range of an ordinary relative 'bl' opcode. In this case, we
>>          * need to branch via a trampoline in the module.
>>          *
>>          * NOTE: __module_text_address() must be called with preemption
>>          * disabled, but we can rely on ftrace_lock to ensure that 'mod'
>>          * retains its validity throughout the remainder of this code.
>>          */
>>         preempt_disable();
>>         mod = __module_text_address(pc);
>>         preempt_enable();
>>
>>         if (WARN_ON(!mod))
>>             return -EINVAL;
>>
>>         plt = get_ftrace_plt(mod, addr);
>>         if (!plt) {
>>             pr_err("ftrace: no module PLT for %ps\n", (void *)addr);
>>             return -EINVAL;
>>         }
>>
>>         addr = (unsigned long)plt;
>>     }
>>
>>     old = aarch64_insn_gen_nop();
>>     new = aarch64_insn_gen_branch_imm(pc, addr, 
>> AARCH64_INSN_BRANCH_LINK);
>>
>>     return ftrace_modify_code(pc, old, new, true);
>> }
>>
>> This might be the direction to go later. Anyway, "Rome wasn't built in
>> a day"; for this stage, we might focus on optprobe for the case of a
>> non-randomized module region :-).
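
For illustration, a rough sketch of what such a per-module optprobe PLT
could look like, mirroring the module_init_ftrace_plt() code quoted above.
The section name ".text.kprobe_trampoline", the OPTPROBE_PLT_IDX index, the
optprobe_common_entry trampoline and the mod->arch.optprobe_trampoline field
are made-up names for this sketch, not part of any posted patch; it assumes
the same find_section()/get_plt_entry() helpers the ftrace code uses.

/* Hypothetical names throughout; modelled on the ftrace PLT code above. */
#define OPTPROBE_PLT_IDX	0

extern void optprobe_common_entry(void);	/* shared long-jump target */

static int module_init_optprobe_plt(const Elf_Ehdr *hdr,
				    const Elf_Shdr *sechdrs,
				    struct module *mod)
{
	const Elf_Shdr *s;
	struct plt_entry *plts;

	/* One extra NOLOAD section per module, as ftrace already does. */
	s = find_section(hdr, sechdrs, ".text.kprobe_trampoline");
	if (!s)
		return -ENOEXEC;

	plts = (void *)s->sh_addr;
	plts[OPTPROBE_PLT_IDX] = get_plt_entry((u64)optprobe_common_entry,
					       &plts[OPTPROBE_PLT_IDX]);
	mod->arch.optprobe_trampoline = plts;

	return 0;
}

A probed instruction in a module would then only need a short 'b' to reach
&plts[OPTPROBE_PLT_IDX], and the PLT entry performs the long jump to the
common trampoline.
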
>>
>> BTW, @liuqi, if users set "nokaslr" in bootargs, will your optprobe
>> always work and never fall back to a normal kprobe, even if we remove
>> the dependency on RANDOMIZE_MODULE_REGION_FULL?
>>
> Hi Barry,
> 
> I did some tests on the Hip08 platform, using nokaslr in the boot cmdline
> and removing the dependency on RANDOMIZE_MODULE_REGION_FULL, and optprobe
> seems to work.
> Here is the log:
> 
> estuary:/$ uname -a
> Linux (none) 5.13.0-rc4+ #37 SMP PREEMPT Mon Aug 2 08:13:37 CST 2021 aarch64 GNU/Linux
> estuary:/$ zcat /proc/config.gz | grep RANDOMIZE_MODULE_REGION
> CONFIG_RANDOMIZE_MODULE_REGION_FULL=y
> estuary:/$ zcat /proc/config.gz | grep OPTPROBE
> CONFIG_OPTPROBES=y
> CONFIG_HAVE_OPTPROBES=y
> estuary:/$ cat /proc/cmdline
> console=ttyAMA0,115200 earlycon=pl011,0x9000000 kpti=off nokaslr
> estuary:/$ cat /sys/bus/platform/devices/hello_driver/kprobe_test
> [   61.304143] do_empty returned 0 and took 200 ns to execute
> [   61.304662] do_empty returned 0 and took 110 ns to execute
> [   61.305196] do_empty returned 0 and took 100 ns to execute
> [   61.305745] do_empty returned 0 and took 90 ns to execute
> [   61.306262] do_empty returned 0 and took 90 ns to execute
> [   61.306781] do_empty returned 0 and took 90 ns to execute
> [   61.307286] do_empty returned 0 and took 90 ns to execute
> [   61.307798] do_empty returned 0 and took 90 ns to execute
> [   61.308314] do_empty returned 0 and took 90 ns to execute
> [   61.308828] do_empty returned 0 and took 90 ns to execute
> [   61.309323] do_empty returned 0 and took 80 ns to execute
> [   61.309832] do_empty returned 0 and took 80 ns to execute
> [   61.310357] do_empty returned 0 and took 80 ns to execute
> [   61.310871] do_empty returned 0 and took 80 ns to execute
> [   61.311361] do_empty returned 0 and took 80 ns to execute
> [   61.311851] do_empty returned 0 and took 90 ns to execute
> [   61.312358] do_empty returned 0 and took 90 ns to execute
> [   61.312879] do_empty returned 0 and took 80 ns to execute
> 
> Thanks,
> Qi
> 

This situation is: the function to probe is built-in and the pre_handler is
built as a module. I'll try to test the other three situations later.

Thanks,
Qi
>>>
>>> Thank you,
>>>
>>> -- 
>>> Masami Hiramatsu <mhiramat@kernel.org>
>>
>> Thanks
>> Barry
>> .
>>
> 
> .
liuqi (BA) Aug. 2, 2021, 12:02 p.m. UTC | #11
On 2021/8/2 11:52, liuqi (BA) wrote:
[...]

>> This might be the direction to go later. Anyway, "Rome wasn't built in
>> a day"; for this stage, we might focus on optprobe for the case of a
>> non-randomized module region :-).
>>
>> BTW, @liuqi, if users set "nokaslr" in bootargs, will your optprobe
>> always work and never fall back to a normal kprobe, even if we remove
>> the dependency on RANDOMIZE_MODULE_REGION_FULL?
>>
> Hi Barry,
> 
> I did some tests on the Hip08 platform, using nokaslr in the boot cmdline
> and removing the dependency on RANDOMIZE_MODULE_REGION_FULL, and optprobe
> seems to work.
> Here is the log:
> 
> estuary:/$ uname -a
> Linux (none) 5.13.0-rc4+ #37 SMP PREEMPT Mon Aug 2 08:13:37 CST 2021 aarch64 GNU/Linux
> estuary:/$ zcat /proc/config.gz | grep RANDOMIZE_MODULE_REGION
> CONFIG_RANDOMIZE_MODULE_REGION_FULL=y
> estuary:/$ zcat /proc/config.gz | grep OPTPROBE
> CONFIG_OPTPROBES=y
> CONFIG_HAVE_OPTPROBES=y
> estuary:/$ cat /proc/cmdline
> console=ttyAMA0,115200 earlycon=pl011,0x9000000 kpti=off nokaslr
> estuary:/$ cat /sys/bus/platform/devices/hello_driver/kprobe_test
> [   61.304143] do_empty returned 0 and took 200 ns to execute
> [   61.304662] do_empty returned 0 and took 110 ns to execute
> [   61.305196] do_empty returned 0 and took 100 ns to execute
> [   61.305745] do_empty returned 0 and took 90 ns to execute
> [   61.306262] do_empty returned 0 and took 90 ns to execute
> [   61.306781] do_empty returned 0 and took 90 ns to execute
> [   61.307286] do_empty returned 0 and took 90 ns to execute
> [   61.307798] do_empty returned 0 and took 90 ns to execute
> [   61.308314] do_empty returned 0 and took 90 ns to execute
> [   61.308828] do_empty returned 0 and took 90 ns to execute
> [   61.309323] do_empty returned 0 and took 80 ns to execute
> [   61.309832] do_empty returned 0 and took 80 ns to execute
> [   61.310357] do_empty returned 0 and took 80 ns to execute
> [   61.310871] do_empty returned 0 and took 80 ns to execute
> [   61.311361] do_empty returned 0 and took 80 ns to execute
> [   61.311851] do_empty returned 0 and took 90 ns to execute
> [   61.312358] do_empty returned 0 and took 90 ns to execute
> [   61.312879] do_empty returned 0 and took 80 ns to execute
> 
> Thanks,
> Qi
> 
Hi Barry,


I've done tests on the Hip08 platform using nokaslr in the boot cmdline,
with the dependency on RANDOMIZE_MODULE_REGION_FULL removed, for the
following 4 cases:
1. probed code in module, pre_handler in kernel.
2. probed code in kernel, pre_handler in kernel.
3. probed code in module, pre_handler in module.
4. probed code in kernel, pre_handler in module.

and optprobe works in these 4 cases.
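
For reference, a minimal sketch of case 3 above (probed code and pre_handler
both in one module). This is a hypothetical test module, not the one used
for the numbers quoted earlier, and do_empty() here is a local dummy rather
than the symbol in the logs. Note that kprobe optimization happens
asynchronously, so the very first hits may still go through the regular
breakpoint path.

// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/kprobes.h>
#include <linux/module.h>

static noinline int do_empty(void)
{
	return 0;
}

static int test_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
	pr_info("pre_handler hit at %pS\n", (void *)instruction_pointer(regs));
	return 0;
}

static struct kprobe test_kp = {
	.pre_handler = test_pre_handler,
};

static int __init optprobe_case3_init(void)
{
	int ret;

	/* Probe a function in this module; the handler is also in this module. */
	test_kp.addr = (kprobe_opcode_t *)do_empty;
	ret = register_kprobe(&test_kp);
	if (ret)
		return ret;

	do_empty();	/* early hits may not be optimized yet */
	return 0;
}

static void __exit optprobe_case3_exit(void)
{
	unregister_kprobe(&test_kp);
}

module_init(optprobe_case3_init);
module_exit(optprobe_case3_exit);
MODULE_LICENSE("GPL");
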

Thanks,
Qi
>>>
>>> Thank you,
>>>
>>> -- 
>>> Masami Hiramatsu <mhiramat@kernel.org>
>>
>> Thanks
>> Barry
>> .
>>
> 
> .
diff mbox series

Patch

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index b48fa6da8d9f..1690cec625ed 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -200,6 +200,7 @@  config ARM64
 	select HAVE_STACKPROTECTOR
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_KPROBES
+	select HAVE_OPTPROBES if !RANDOMIZE_MODULE_REGION_FULL
 	select HAVE_KRETPROBES
 	select HAVE_GENERIC_VDSO
 	select HOLES_IN_ZONE
diff --git a/arch/arm64/include/asm/kprobes.h b/arch/arm64/include/asm/kprobes.h
index 5d38ff4a4806..9e1c492a0c3d 100644
--- a/arch/arm64/include/asm/kprobes.h
+++ b/arch/arm64/include/asm/kprobes.h
@@ -39,6 +39,29 @@  void arch_remove_kprobe(struct kprobe *);
 int kprobe_fault_handler(struct pt_regs *regs, unsigned int fsr);
 int kprobe_exceptions_notify(struct notifier_block *self,
 			     unsigned long val, void *data);
+
+#define RELATIVEJUMP_SIZE (4)
+#define MAX_COPIED_INSN	DIV_ROUND_UP(RELATIVEJUMP_SIZE, sizeof(kprobe_opcode_t))
+struct arch_optimized_insn {
+	kprobe_opcode_t copied_insn[MAX_COPIED_INSN];
+	/* detour code buffer */
+	kprobe_opcode_t *insn;
+};
+
+/* optinsn template addresses */
+extern __visible kprobe_opcode_t optprobe_template_entry[];
+extern __visible kprobe_opcode_t optprobe_template_val[];
+extern __visible kprobe_opcode_t optprobe_template_call[];
+extern __visible kprobe_opcode_t optprobe_template_end[];
+extern __visible kprobe_opcode_t optprobe_template_restore_begin[];
+extern __visible kprobe_opcode_t optprobe_template_restore_orig_insn[];
+extern __visible kprobe_opcode_t optprobe_template_restore_end[];
+
+#define MAX_OPTIMIZED_LENGTH	4
+#define MAX_OPTINSN_SIZE				\
+	((unsigned long)optprobe_template_end -	\
+	 (unsigned long)optprobe_template_entry)
+
 void kretprobe_trampoline(void);
 void __kprobes *trampoline_probe_handler(struct pt_regs *regs);
 
diff --git a/arch/arm64/kernel/probes/Makefile b/arch/arm64/kernel/probes/Makefile
index 8e4be92e25b1..52cf5d4ffe8a 100644
--- a/arch/arm64/kernel/probes/Makefile
+++ b/arch/arm64/kernel/probes/Makefile
@@ -4,3 +4,5 @@  obj-$(CONFIG_KPROBES)		+= kprobes.o decode-insn.o	\
 				   simulate-insn.o
 obj-$(CONFIG_UPROBES)		+= uprobes.o decode-insn.o	\
 				   simulate-insn.o
+obj-$(CONFIG_OPTPROBES)		+= opt-arm64.o			\
+				   optprobe_trampoline.o
diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c
index 6dbcc89f6662..83755ad62abe 100644
--- a/arch/arm64/kernel/probes/kprobes.c
+++ b/arch/arm64/kernel/probes/kprobes.c
@@ -11,6 +11,7 @@ 
 #include <linux/kasan.h>
 #include <linux/kernel.h>
 #include <linux/kprobes.h>
+#include <linux/moduleloader.h>
 #include <linux/sched/debug.h>
 #include <linux/set_memory.h>
 #include <linux/slab.h>
@@ -113,9 +114,21 @@  int __kprobes arch_prepare_kprobe(struct kprobe *p)
 
 void *alloc_insn_page(void)
 {
-	return __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, VMALLOC_END,
-			GFP_KERNEL, PAGE_KERNEL_ROX, VM_FLUSH_RESET_PERMS,
-			NUMA_NO_NODE, __builtin_return_address(0));
+	void *page;
+
+	page = module_alloc(PAGE_SIZE);
+	if (!page)
+		return NULL;
+
+	set_vm_flush_reset_perms(page);
+	/*
+	 * First make the page read-only, and only then make it executable to
+	 * prevent it from being W+X in between.
+	 */
+	set_memory_ro((unsigned long)page, 1);
+	set_memory_x((unsigned long)page, 1);
+
+	return page;
 }
 
 /* arm kprobe: install breakpoint in text */
diff --git a/arch/arm64/kernel/probes/opt-arm64.c b/arch/arm64/kernel/probes/opt-arm64.c
new file mode 100644
index 000000000000..ff72f6275e71
--- /dev/null
+++ b/arch/arm64/kernel/probes/opt-arm64.c
@@ -0,0 +1,217 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Code for Kernel probes Jump optimization.
+ *
+ * Copyright (C) 2021 Hisilicon Limited
+ */
+
+#include <linux/jump_label.h>
+#include <linux/kprobes.h>
+
+#include <asm/cacheflush.h>
+#include <asm/insn.h>
+#include <asm/kprobes.h>
+#include <asm/patching.h>
+
+#define TMPL_VAL_IDX \
+	(optprobe_template_val - optprobe_template_entry)
+#define TMPL_CALL_BACK \
+	(optprobe_template_call - optprobe_template_entry)
+#define TMPL_END_IDX \
+	(optprobe_template_end - optprobe_template_entry)
+#define TMPL_RESTORE_ORIGN_INSN \
+	(optprobe_template_restore_orig_insn - optprobe_template_entry)
+#define TMPL_RESTORE_END \
+	(optprobe_template_restore_end - optprobe_template_entry)
+
+int arch_check_optimized_kprobe(struct optimized_kprobe *op)
+{
+	return 0;
+}
+
+int arch_prepared_optinsn(struct arch_optimized_insn *optinsn)
+{
+	return optinsn->insn != NULL;
+}
+
+int arch_within_optimized_kprobe(struct optimized_kprobe *op,
+				unsigned long addr)
+{
+	return ((unsigned long)op->kp.addr <= addr &&
+		(unsigned long)op->kp.addr + RELATIVEJUMP_SIZE > addr);
+}
+
+static void
+optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
+{
+	/* This is possible if op is under delayed unoptimizing */
+	if (kprobe_disabled(&op->kp))
+		return;
+
+	preempt_disable();
+
+	if (kprobe_running()) {
+		kprobes_inc_nmissed_count(&op->kp);
+	} else {
+		__this_cpu_write(current_kprobe, &op->kp);
+		regs->pc = (unsigned long)op->kp.addr;
+		get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
+		opt_pre_handler(&op->kp, regs);
+		__this_cpu_write(current_kprobe, NULL);
+	}
+
+	preempt_enable_no_resched();
+}
+NOKPROBE_SYMBOL(optimized_callback)
+
+static bool is_offset_in_branch_range(long offset)
+{
+	return (offset >= -0x08000000 && offset <= 0x07fffffc && !(offset & 0x3));
+}
+
+int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *orig)
+{
+	kprobe_opcode_t *code;
+	long rel_chk;
+	u32 insn, size;
+	int ret, i;
+	void *addr;
+
+	code = get_optinsn_slot();
+	if (!code)
+		return -ENOMEM;
+
+	/*
+	 * Verify if the address gap is in 128MiB range, because this uses
+	 * a relative jump.
+	 *
+	 * kprobe opt uses a 'b' instruction to branch to optinsn.insn.
+	 * According to ARM manual, branch instruction is:
+	 *
+	 *   31  30                  25              0
+	 *  +----+---+---+---+---+---+---------------+
+	 *  |cond| 0 | 0 | 1 | 0 | 1 |     imm26     |
+	 *  +----+---+---+---+---+---+---------------+
+	 *
+	 * imm26 is a signed 26-bit integer. The real branch offset is computed
+	 * by: imm64 = SignExtend(imm26:'00', 64);
+	 *
+	 * So the maximum forward branch should be:
+	 *   (0x01ffffff << 2) = 0x07fffffc =  0x07fffffc
+	 * The maximum backward branch should be:
+	 *   (0xfe000000 << 2) = 0xFFFFFFFFF8000000 = -0x08000000
+	 *
+	 * We can simply check (rel & 0xf8000003):
+	 *  if rel is positive, (rel & 0xf8000003) should be 0
+	 *  if rel is negative, (rel & 0xf8000003) should be 0xf8000000
+	 *  the last '3' is used for alignment checking.
+	 */
+	rel_chk = (unsigned long)code -
+			(unsigned long)orig->addr + 8;
+	if (!is_offset_in_branch_range(rel_chk)) {
+		pr_err("%s is out of branch range.\n", orig->symbol_name);
+		free_optinsn_slot(code, 0);
+		return -ERANGE;
+	}
+
+	/* Setup template */
+	size = (TMPL_END_IDX * sizeof(kprobe_opcode_t)) / sizeof(int);
+	for (i = 0; i < size; i++) {
+		addr = code + i;
+		insn = *(optprobe_template_entry + i);
+		ret = aarch64_insn_patch_text(&addr, &insn, 1);
+		if (ret < 0) {
+			free_optinsn_slot(code, 0);
+			return -ERANGE;
+		}
+	}
+
+	/* Set probe information */
+	addr = code + TMPL_VAL_IDX;
+	insn =  (unsigned long long)op & 0xffffffff;
+	aarch64_insn_patch_text(&addr, &insn, 1);
+
+	addr = addr + 4;
+	insn = ((unsigned long long)op & GENMASK_ULL(63, 32)) >> 32;
+	aarch64_insn_patch_text(&addr, &insn, 1);
+
+	addr = code + TMPL_CALL_BACK;
+	insn =  aarch64_insn_gen_branch_imm((unsigned long)addr,
+				(unsigned long)optimized_callback,
+				AARCH64_INSN_BRANCH_LINK);
+	aarch64_insn_patch_text(&addr, &insn, 1);
+
+	/* The original probed instruction */
+	addr = code + TMPL_RESTORE_ORIGN_INSN;
+	insn =  orig->opcode;
+	aarch64_insn_patch_text(&addr, &insn, 1);
+
+	/* Jump back to next instruction */
+	addr = code + TMPL_RESTORE_END;
+	insn = aarch64_insn_gen_branch_imm(
+				(unsigned long)(&code[TMPL_RESTORE_END]),
+				(unsigned long)(op->kp.addr) + 4,
+				AARCH64_INSN_BRANCH_NOLINK);
+	aarch64_insn_patch_text(&addr, &insn, 1);
+
+	flush_icache_range((unsigned long)code,
+			   (unsigned long)(&code[TMPL_END_IDX]));
+	/* Set op->optinsn.insn means prepared. */
+	op->optinsn.insn = code;
+	return 0;
+}
+
+void arch_optimize_kprobes(struct list_head *oplist)
+{
+	struct optimized_kprobe *op, *tmp;
+
+	list_for_each_entry_safe(op, tmp, oplist, list) {
+		u32 insn;
+
+		WARN_ON(kprobe_disabled(&op->kp));
+
+		/*
+		 * Backup instructions which will be replaced
+		 * by jump address
+		 */
+		memcpy(op->optinsn.copied_insn, op->kp.addr,
+			RELATIVEJUMP_SIZE);
+		insn = aarch64_insn_gen_branch_imm((unsigned long)op->kp.addr,
+				(unsigned long)op->optinsn.insn,
+				AARCH64_INSN_BRANCH_NOLINK);
+
+		WARN_ON(insn == 0);
+
+		aarch64_insn_patch_text((void *)&(op->kp.addr), &insn, 1);
+
+		list_del_init(&op->list);
+	}
+}
+
+void arch_unoptimize_kprobe(struct optimized_kprobe *op)
+{
+	arch_arm_kprobe(&op->kp);
+}
+
+/*
+ * Recover original instructions and breakpoints from relative jumps.
+ * Caller must call with locking kprobe_mutex.
+ */
+void arch_unoptimize_kprobes(struct list_head *oplist,
+			    struct list_head *done_list)
+{
+	struct optimized_kprobe *op, *tmp;
+
+	list_for_each_entry_safe(op, tmp, oplist, list) {
+		arch_unoptimize_kprobe(op);
+		list_move(&op->list, done_list);
+	}
+}
+
+void arch_remove_optimized_kprobe(struct optimized_kprobe *op)
+{
+	if (op->optinsn.insn) {
+		free_optinsn_slot(op->optinsn.insn, 1);
+		op->optinsn.insn = NULL;
+	}
+}
diff --git a/arch/arm64/kernel/probes/optprobe_trampoline.S b/arch/arm64/kernel/probes/optprobe_trampoline.S
new file mode 100644
index 000000000000..13729cb279b8
--- /dev/null
+++ b/arch/arm64/kernel/probes/optprobe_trampoline.S
@@ -0,0 +1,80 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * trampoline entry and return code for optprobes.
+ */
+
+#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
+#include <asm/assembler.h>
+
+	.global optprobe_template_entry
+optprobe_template_entry:
+	sub sp, sp, #PT_REGS_SIZE
+	stp x0, x1, [sp, #S_X0]
+	stp x2, x3, [sp, #S_X2]
+	stp x4, x5, [sp, #S_X4]
+	stp x6, x7, [sp, #S_X6]
+	stp x8, x9, [sp, #S_X8]
+	stp x10, x11, [sp, #S_X10]
+	stp x12, x13, [sp, #S_X12]
+	stp x14, x15, [sp, #S_X14]
+	stp x16, x17, [sp, #S_X16]
+	stp x18, x19, [sp, #S_X18]
+	stp x20, x21, [sp, #S_X20]
+	stp x22, x23, [sp, #S_X22]
+	stp x24, x25, [sp, #S_X24]
+	stp x26, x27, [sp, #S_X26]
+	stp x28, x29, [sp, #S_X28]
+	add x0, sp, #PT_REGS_SIZE
+	stp lr, x0, [sp, #S_LR]
+	/*
+	 * Construct a useful saved PSTATE
+	 */
+	mrs x0, nzcv
+	mrs x1, daif
+	orr x0, x0, x1
+	mrs x1, CurrentEL
+	orr x0, x0, x1
+	mrs x1, SPSel
+	orr x0, x0, x1
+	stp xzr, x0, [sp, #S_PC]
+	/* Get parameters to optimized_callback() */
+	ldr	x0, 1f
+	mov	x1, sp
+	/* Branch to optimized_callback() */
+	.global optprobe_template_call
+optprobe_template_call:
+	nop
+	/* Restore registers */
+	ldr x0, [sp, #S_PSTATE]
+	and x0, x0, #(PSR_N_BIT | PSR_Z_BIT | PSR_C_BIT | PSR_V_BIT)
+	msr nzcv, x0
+	ldp x0, x1, [sp, #S_X0]
+	ldp x2, x3, [sp, #S_X2]
+	ldp x4, x5, [sp, #S_X4]
+	ldp x6, x7, [sp, #S_X6]
+	ldp x8, x9, [sp, #S_X8]
+	ldp x10, x11, [sp, #S_X10]
+	ldp x12, x13, [sp, #S_X12]
+	ldp x14, x15, [sp, #S_X14]
+	ldp x16, x17, [sp, #S_X16]
+	ldp x18, x19, [sp, #S_X18]
+	ldp x20, x21, [sp, #S_X20]
+	ldp x22, x23, [sp, #S_X22]
+	ldp x24, x25, [sp, #S_X24]
+	ldp x26, x27, [sp, #S_X26]
+	ldp x28, x29, [sp, #S_X28]
+	ldr lr, [sp, #S_LR]
+	add sp, sp, #PT_REGS_SIZE
+	.global optprobe_template_restore_orig_insn
+optprobe_template_restore_orig_insn:
+	nop
+	.global optprobe_template_restore_end
+optprobe_template_restore_end:
+	nop
+	.global optprobe_template_end
+optprobe_template_end:
+	.global optprobe_template_val
+optprobe_template_val:
+	1:	.long 0
+		.long 0