diff mbox series

[v5,7/9] riscv/kprobe: Prepare detour buffer for optimized kprobe

Message ID 20221224114315.850130-8-chenguokai17@mails.ucas.ac.cn (mailing list archive)
State Superseded
Delegated to: Palmer Dabbelt
Headers show
Series Add OPTPROBES feature on RISCV | expand

Checks

Context Check Description
conchuod/tree_selection fail Failed to apply to next/pending-fixes or riscv/for-next

Commit Message

Xim Dec. 24, 2022, 11:43 a.m. UTC
From: Liao Chang <liaochang1@huawei.com>

This patch introduces code to prepare the instruction slot for an optimized
kprobe. The instruction slot for a regular kprobe records just two
instructions: the first is the original instruction that was replaced by
EBREAK, and the second is an EBREAK used for single-stepping. The instruction
slot for an optimized kprobe is larger: besides executing instructions
out-of-line, it also contains a standalone stack frame for calling the kprobe
handler.

All optimized instruction slots consist of 5 major parts, which are copied
from the assembly code template in opt_trampoline.S.

	SAVE REGS
	CALL optimized_callback
	RESTORE REGS
	EXECUTE INSNS OUT-OF-LINE
	RETURN BACK

Although most instructions in each slot are the same, the slots still differ
slightly in their payload. The differences come from three parts:

  - 'CALL optimized_callback', the relative offset for 'call'
    instruction is different for each kprobe.
  - 'EXECUTE INSNS OUT-OF-LINE', because the probed instructions differ
    for each kprobe.
  - 'RETURN BACK', the chosen free register is reused here as the
     destination register of jumping back.

So the slot payload also needs to be customized for each optimized kprobe.

Signed-off-by: Liao Chang <liaochang1@huawei.com>
Co-developed-by: Chen Guokai <chenguokai17@mails.ucas.ac.cn>
Signed-off-by: Chen Guokai <chenguokai17@mails.ucas.ac.cn>
---
 arch/riscv/include/asm/kprobes.h          |  16 +++
 arch/riscv/kernel/probes/opt.c            |  76 +++++++++++++
 arch/riscv/kernel/probes/opt_trampoline.S | 125 ++++++++++++++++++++++
 3 files changed, 217 insertions(+)

Comments

Björn Töpel Jan. 2, 2023, 6:04 p.m. UTC | #1
Chen Guokai <chenguokai17@mails.ucas.ac.cn> writes:

> From: Liao Chang <liaochang1@huawei.com>

> diff --git a/arch/riscv/kernel/probes/opt.c b/arch/riscv/kernel/probes/opt.c
> index 258a283c906d..bc232fce5b39 100644
> --- a/arch/riscv/kernel/probes/opt.c
> +++ b/arch/riscv/kernel/probes/opt.c
> @@ -11,9 +11,37 @@
>  #include <linux/kprobes.h>
>  #include <asm/kprobes.h>
>  #include <asm/patch.h>
> +#include <asm/asm-offsets.h>
>  
>  #include "simulate-insn.h"
>  #include "decode-insn.h"
> +#include "../../net/bpf_jit.h"
> +
> +static void

Super-nit, but I really prefer *not* breaking function name and return
value, for grepability.

> diff --git a/arch/riscv/kernel/probes/opt_trampoline.S b/arch/riscv/kernel/probes/opt_trampoline.S
> index 16160c4367ff..75e34e373cf2 100644
> --- a/arch/riscv/kernel/probes/opt_trampoline.S
> +++ b/arch/riscv/kernel/probes/opt_trampoline.S
> @@ -1,12 +1,137 @@
>  /* SPDX-License-Identifier: GPL-2.0-only */
>  /*
>   * Copyright (C) 2022 Guokai Chen
> + * Copyright (C) 2022 Liao, Chang <liaochang1@huawei.com>
>   */
>  
>  #include <linux/linkage.h>
>  
> +#include <asm/asm.h>
>  #incldue <asm/csr.h>
>  #include <asm/asm-offsets.h>
>  
>  SYM_ENTRY(optprobe_template_entry, SYM_L_GLOBAL, SYM_A_NONE)
> +	addi  sp, sp, -(PT_SIZE_ON_STACK)
> +	REG_S x1,  PT_RA(sp)
> +	REG_S x2,  PT_SP(sp)
> +	REG_S x3,  PT_GP(sp)
> +	REG_S x4,  PT_TP(sp)
> +	REG_S x5,  PT_T0(sp)
> +	REG_S x6,  PT_T1(sp)
> +	REG_S x7,  PT_T2(sp)
> +	REG_S x8,  PT_S0(sp)
> +	REG_S x9,  PT_S1(sp)
> +	REG_S x10, PT_A0(sp)
> +	REG_S x11, PT_A1(sp)
> +	REG_S x12, PT_A2(sp)
> +	REG_S x13, PT_A3(sp)
> +	REG_S x14, PT_A4(sp)
> +	REG_S x15, PT_A5(sp)
> +	REG_S x16, PT_A6(sp)
> +	REG_S x17, PT_A7(sp)
> +	REG_S x18, PT_S2(sp)
> +	REG_S x19, PT_S3(sp)
> +	REG_S x20, PT_S4(sp)
> +	REG_S x21, PT_S5(sp)
> +	REG_S x22, PT_S6(sp)
> +	REG_S x23, PT_S7(sp)
> +	REG_S x24, PT_S8(sp)
> +	REG_S x25, PT_S9(sp)
> +	REG_S x26, PT_S10(sp)
> +	REG_S x27, PT_S11(sp)
> +	REG_S x28, PT_T3(sp)
> +	REG_S x29, PT_T4(sp)
> +	REG_S x30, PT_T5(sp)
> +	REG_S x31, PT_T6(sp)
> +	/* Update fp is friendly for stacktrace */
> +	addi  s0, sp, (PT_SIZE_ON_STACK)
> +	j 1f
> +
> +SYM_ENTRY(optprobe_template_save, SYM_L_GLOBAL, SYM_A_NONE)
> +	/*
> +	 * Step1:
> +	 * Filled with the pointer to optimized_kprobe data
> +	 */
> +	.dword 0
> +1:
> +	/* Load optimize_kprobe pointer from .dword below */
> +	auipc a0, 0
> +	REG_L a0, -8(a0)
> +	add   a1, sp, x0
> +
> +SYM_ENTRY(optprobe_template_call, SYM_L_GLOBAL, SYM_A_NONE)
> +	/*
> +	 * Step2:
> +	 * <IMME> of AUIPC/JALR are modified to the offset to optimized_callback
> +	 * jump target is loaded from above .dword.
> +	 */
> +	auipc ra, 0
> +	jalr  ra, 0(ra)
> +
> +	REG_L x1,  PT_RA(sp)
> +	REG_L x3,  PT_GP(sp)
> +	REG_L x4,  PT_TP(sp)
> +	REG_L x5,  PT_T0(sp)
> +	REG_L x6,  PT_T1(sp)
> +	REG_L x7,  PT_T2(sp)
> +	REG_L x8,  PT_S0(sp)
> +	REG_L x9,  PT_S1(sp)
> +	REG_L x10, PT_A0(sp)
> +	REG_L x11, PT_A1(sp)
> +	REG_L x12, PT_A2(sp)
> +	REG_L x13, PT_A3(sp)
> +	REG_L x14, PT_A4(sp)
> +	REG_L x15, PT_A5(sp)
> +	REG_L x16, PT_A6(sp)
> +	REG_L x17, PT_A7(sp)
> +	REG_L x18, PT_S2(sp)
> +	REG_L x19, PT_S3(sp)
> +	REG_L x20, PT_S4(sp)
> +	REG_L x21, PT_S5(sp)
> +	REG_L x22, PT_S6(sp)
> +	REG_L x23, PT_S7(sp)
> +	REG_L x24, PT_S8(sp)
> +	REG_L x25, PT_S9(sp)
> +	REG_L x26, PT_S10(sp)
> +	REG_L x27, PT_S11(sp)
> +	REG_L x28, PT_T3(sp)
> +	REG_L x29, PT_T4(sp)
> +	REG_L x30, PT_T5(sp)
> +	REG_L x31, PT_T6(sp)
> +	REG_L x2,  PT_SP(sp)
> +	addi  sp, sp, (PT_SIZE_ON_STACK)
> +
> +SYM_ENTRY(optprobe_template_insn, SYM_L_GLOBAL, SYM_A_NONE)
> +	/*
> +	 * Step3:
> +	 * NOPS will be replaced by the probed instruction, at worst case 3 RVC
> +	 * and 1 RVI instructions is about to execute out of line.
> +	 */
> +	nop

A nop here will be either a compressed nop or a non-compressed,
depending on the build (C-enabled or not), right? Maybe be explicit to
the assembler what you want?


Björn
Liao, Chang Jan. 4, 2023, 8:35 a.m. UTC | #2
在 2023/1/3 2:04, Björn Töpel 写道:
> Chen Guokai <chenguokai17@mails.ucas.ac.cn> writes:
> 
>> From: Liao Chang <liaochang1@huawei.com>
> 
>> diff --git a/arch/riscv/kernel/probes/opt.c b/arch/riscv/kernel/probes/opt.c
>> index 258a283c906d..bc232fce5b39 100644
>> --- a/arch/riscv/kernel/probes/opt.c
>> +++ b/arch/riscv/kernel/probes/opt.c
>> @@ -11,9 +11,37 @@
>>  #include <linux/kprobes.h>
>>  #include <asm/kprobes.h>
>>  #include <asm/patch.h>
>> +#include <asm/asm-offsets.h>
>>  
>>  #include "simulate-insn.h"
>>  #include "decode-insn.h"
>> +#include "../../net/bpf_jit.h"
>> +
>> +static void
> 
> Super-nit, but I really prefer *not* breaking function name and return
> value, for grepability.

OK, I will keep the function name and return type on the same line.

> 
>> diff --git a/arch/riscv/kernel/probes/opt_trampoline.S b/arch/riscv/kernel/probes/opt_trampoline.S
>> index 16160c4367ff..75e34e373cf2 100644
>> --- a/arch/riscv/kernel/probes/opt_trampoline.S
>> +++ b/arch/riscv/kernel/probes/opt_trampoline.S
>> @@ -1,12 +1,137 @@
>>  /* SPDX-License-Identifier: GPL-2.0-only */
>>  /*
>>   * Copyright (C) 2022 Guokai Chen
>> + * Copyright (C) 2022 Liao, Chang <liaochang1@huawei.com>
>>   */
>>  
>>  #include <linux/linkage.h>
>>  
>> +#include <asm/asm.h>
>>  #incldue <asm/csr.h>
>>  #include <asm/asm-offsets.h>
>>  
>>  SYM_ENTRY(optprobe_template_entry, SYM_L_GLOBAL, SYM_A_NONE)
>> +	addi  sp, sp, -(PT_SIZE_ON_STACK)
>> +	REG_S x1,  PT_RA(sp)
>> +	REG_S x2,  PT_SP(sp)
>> +	REG_S x3,  PT_GP(sp)
>> +	REG_S x4,  PT_TP(sp)
>> +	REG_S x5,  PT_T0(sp)
>> +	REG_S x6,  PT_T1(sp)
>> +	REG_S x7,  PT_T2(sp)
>> +	REG_S x8,  PT_S0(sp)
>> +	REG_S x9,  PT_S1(sp)
>> +	REG_S x10, PT_A0(sp)
>> +	REG_S x11, PT_A1(sp)
>> +	REG_S x12, PT_A2(sp)
>> +	REG_S x13, PT_A3(sp)
>> +	REG_S x14, PT_A4(sp)
>> +	REG_S x15, PT_A5(sp)
>> +	REG_S x16, PT_A6(sp)
>> +	REG_S x17, PT_A7(sp)
>> +	REG_S x18, PT_S2(sp)
>> +	REG_S x19, PT_S3(sp)
>> +	REG_S x20, PT_S4(sp)
>> +	REG_S x21, PT_S5(sp)
>> +	REG_S x22, PT_S6(sp)
>> +	REG_S x23, PT_S7(sp)
>> +	REG_S x24, PT_S8(sp)
>> +	REG_S x25, PT_S9(sp)
>> +	REG_S x26, PT_S10(sp)
>> +	REG_S x27, PT_S11(sp)
>> +	REG_S x28, PT_T3(sp)
>> +	REG_S x29, PT_T4(sp)
>> +	REG_S x30, PT_T5(sp)
>> +	REG_S x31, PT_T6(sp)
>> +	/* Update fp is friendly for stacktrace */
>> +	addi  s0, sp, (PT_SIZE_ON_STACK)
>> +	j 1f
>> +
>> +SYM_ENTRY(optprobe_template_save, SYM_L_GLOBAL, SYM_A_NONE)
>> +	/*
>> +	 * Step1:
>> +	 * Filled with the pointer to optimized_kprobe data
>> +	 */
>> +	.dword 0
>> +1:
>> +	/* Load optimize_kprobe pointer from .dword below */
>> +	auipc a0, 0
>> +	REG_L a0, -8(a0)
>> +	add   a1, sp, x0
>> +
>> +SYM_ENTRY(optprobe_template_call, SYM_L_GLOBAL, SYM_A_NONE)
>> +	/*
>> +	 * Step2:
>> +	 * <IMME> of AUIPC/JALR are modified to the offset to optimized_callback
>> +	 * jump target is loaded from above .dword.
>> +	 */
>> +	auipc ra, 0
>> +	jalr  ra, 0(ra)
>> +
>> +	REG_L x1,  PT_RA(sp)
>> +	REG_L x3,  PT_GP(sp)
>> +	REG_L x4,  PT_TP(sp)
>> +	REG_L x5,  PT_T0(sp)
>> +	REG_L x6,  PT_T1(sp)
>> +	REG_L x7,  PT_T2(sp)
>> +	REG_L x8,  PT_S0(sp)
>> +	REG_L x9,  PT_S1(sp)
>> +	REG_L x10, PT_A0(sp)
>> +	REG_L x11, PT_A1(sp)
>> +	REG_L x12, PT_A2(sp)
>> +	REG_L x13, PT_A3(sp)
>> +	REG_L x14, PT_A4(sp)
>> +	REG_L x15, PT_A5(sp)
>> +	REG_L x16, PT_A6(sp)
>> +	REG_L x17, PT_A7(sp)
>> +	REG_L x18, PT_S2(sp)
>> +	REG_L x19, PT_S3(sp)
>> +	REG_L x20, PT_S4(sp)
>> +	REG_L x21, PT_S5(sp)
>> +	REG_L x22, PT_S6(sp)
>> +	REG_L x23, PT_S7(sp)
>> +	REG_L x24, PT_S8(sp)
>> +	REG_L x25, PT_S9(sp)
>> +	REG_L x26, PT_S10(sp)
>> +	REG_L x27, PT_S11(sp)
>> +	REG_L x28, PT_T3(sp)
>> +	REG_L x29, PT_T4(sp)
>> +	REG_L x30, PT_T5(sp)
>> +	REG_L x31, PT_T6(sp)
>> +	REG_L x2,  PT_SP(sp)
>> +	addi  sp, sp, (PT_SIZE_ON_STACK)
>> +
>> +SYM_ENTRY(optprobe_template_insn, SYM_L_GLOBAL, SYM_A_NONE)
>> +	/*
>> +	 * Step3:
>> +	 * NOPS will be replaced by the probed instruction, at worst case 3 RVC
>> +	 * and 1 RVI instructions is about to execute out of line.
>> +	 */
>> +	nop
> 
> A nop here will be either a compressed nop or a non-compressed,
> depending on the build (C-enabled or not), right? Maybe be explicit to
> the assembler what you want?
> 

You are right, if CONFIG_RISCV_ISA_C is disabled, two NOP is enough for 2 RVI execute out of line,
if CONFIG_RISCV_ISA_C is enabled, it needs eight C.NOP here for the worst case (3 RVC + 1 RVI).

I will use {C}.NOP explicitly for different configure in next revision, thanks.

> 
> Björn
Björn Töpel Jan. 4, 2023, 9:12 a.m. UTC | #3
"liaochang (A)" <liaochang1@huawei.com> writes:

>>> +SYM_ENTRY(optprobe_template_insn, SYM_L_GLOBAL, SYM_A_NONE)
>>> +	/*
>>> +	 * Step3:
>>> +	 * NOPS will be replaced by the probed instruction, at worst case 3 RVC
>>> +	 * and 1 RVI instructions is about to execute out of line.
>>> +	 */
>>> +	nop
>> 
>> A nop here will be either a compressed nop or a non-compressed,
>> depending on the build (C-enabled or not), right? Maybe be explicit to
>> the assembler what you want?
>> 
>
> You are right, if CONFIG_RISCV_ISA_C is disabled, two NOP is enough for 2 RVI execute out of line,
> if CONFIG_RISCV_ISA_C is enabled, it needs eight C.NOP here for the worst case (3 RVC + 1 RVI).
>
> I will use {C}.NOP explicitly for different configure in next revision, thanks.

What I meant was that "nop" can expand to compressed instructions, and
you should be explicit. So you know how it's expanded by the
compiler/assembler.

An example:

$ cat bar.S
	.text
bar:
	nop
	nop
$ riscv64-linux-gnu-gcc -O2 -o bar.o -c bar.S && riscv64-linux-gnu-objdump -M no-aliases -d bar.o

bar.o:     file format elf64-littleriscv


Disassembly of section .text:

0000000000000000 <bar>:
   0:	0001                	c.addi	zero,0
   2:	0001                	c.addi	zero,0


vs

$ cat foo.S
	.text
foo:
	.option norvc
	nop
	nop

$ riscv64-linux-gnu-gcc -O2 -o foo.o -c foo.S && riscv64-linux-gnu-objdump -M no-aliases -d foo.o

foo.o:     file format elf64-littleriscv


Disassembly of section .text:

0000000000000000 <foo>:
   0:	00000013          	addi	zero,zero,0
   4:	00000013          	addi	zero,zero,0


Björn
Liao, Chang Jan. 5, 2023, 12:46 a.m. UTC | #4
在 2023/1/4 17:12, Björn Töpel 写道:
> "liaochang (A)" <liaochang1@huawei.com> writes:
> 
>>>> +SYM_ENTRY(optprobe_template_insn, SYM_L_GLOBAL, SYM_A_NONE)
>>>> +	/*
>>>> +	 * Step3:
>>>> +	 * NOPS will be replaced by the probed instruction, at worst case 3 RVC
>>>> +	 * and 1 RVI instructions is about to execute out of line.
>>>> +	 */
>>>> +	nop
>>>
>>> A nop here will be either a compressed nop or a non-compressed,
>>> depending on the build (C-enabled or not), right? Maybe be explicit to
>>> the assembler what you want?
>>>
>>
>> You are right, if CONFIG_RISCV_ISA_C is disabled, two NOP is enough for 2 RVI execute out of line,
>> if CONFIG_RISCV_ISA_C is enabled, it needs eight C.NOP here for the worst case (3 RVC + 1 RVI).
>>
>> I will use {C}.NOP explicitly for different configure in next revision, thanks.
> 
> What I meant was that "nop" can expand to compressed instructions, and
> you should be explicit. So you know how it's expanded by the
> compiler/assembler.
> 
> An example:
> 
> $ cat bar.S
> 	.text
> bar:
> 	nop
> 	nop
> $ riscv64-linux-gnu-gcc -O2 -o bar.o -c bar.S && riscv64-linux-gnu-objdump -M no-aliases -d bar.o
> 
> bar.o:     file format elf64-littleriscv
> 
> 
> Disassembly of section .text:
> 
> 0000000000000000 <bar>:
>    0:	0001                	c.addi	zero,0
>    2:	0001                	c.addi	zero,0
> 
> 
> vs
> 
> $ cat foo.S
> 	.text
> foo:
> 	.option norvc
> 	nop
> 	nop
> 
> $ riscv64-linux-gnu-gcc -O2 -o foo.o -c foo.S && riscv64-linux-gnu-objdump -M no-aliases -d foo.o
> 
> foo.o:     file format elf64-littleriscv
> 
> 
> Disassembly of section .text:
> 
> 0000000000000000 <foo>:
>    0:	00000013          	addi	zero,zero,0
>    4:	00000013          	addi	zero,zero,0

The above examples are very clear; I will use these expanded instructions in the next revision, thanks.

> 
> 
> Björn
diff mbox series

Patch

diff --git a/arch/riscv/include/asm/kprobes.h b/arch/riscv/include/asm/kprobes.h
index e85130c9112f..e40c837d0a1d 100644
--- a/arch/riscv/include/asm/kprobes.h
+++ b/arch/riscv/include/asm/kprobes.h
@@ -46,10 +46,26 @@  bool kprobe_single_step_handler(struct pt_regs *regs);
 /* optinsn template addresses */
 extern __visible kprobe_opcode_t optprobe_template_entry[];
 extern __visible kprobe_opcode_t optprobe_template_end[];
+extern __visible kprobe_opcode_t optprobe_template_save[];	/* .dword slot for the optimized_kprobe pointer */
+extern __visible kprobe_opcode_t optprobe_template_call[];	/* auipc/jalr pair patched to call optimized_callback */
+extern __visible kprobe_opcode_t optprobe_template_insn[];	/* nop area that receives the probed instructions */
+extern __visible kprobe_opcode_t optprobe_template_return[];	/* .dword slot for the jump-back address */
 
 #define MAX_OPTINSN_SIZE				\
 	((unsigned long)optprobe_template_end -		\
 	 (unsigned long)optprobe_template_entry)
+#define DETOUR_SAVE_OFFSET				\
+	((unsigned long)optprobe_template_save -	\
+	 (unsigned long)optprobe_template_entry)	/* bytes from template entry to the optimized_kprobe pointer slot */
+#define DETOUR_CALL_OFFSET				\
+	((unsigned long)optprobe_template_call -	\
+	 (unsigned long)optprobe_template_entry)	/* bytes from template entry to the auipc/jalr call pair */
+#define DETOUR_INSN_OFFSET				\
+	((unsigned long)optprobe_template_insn -	\
+	 (unsigned long)optprobe_template_entry)	/* bytes from template entry to the out-of-line instruction area */
+#define DETOUR_RETURN_OFFSET				\
+	((unsigned long)optprobe_template_return -	\
+	 (unsigned long)optprobe_template_entry)	/* bytes from template entry to the return-address slot */
 
 /*
  * For RVI and RVC hybird encoding kernel, althought long jump just needs
diff --git a/arch/riscv/kernel/probes/opt.c b/arch/riscv/kernel/probes/opt.c
index 258a283c906d..bc232fce5b39 100644
--- a/arch/riscv/kernel/probes/opt.c
+++ b/arch/riscv/kernel/probes/opt.c
@@ -11,9 +11,37 @@ 
 #include <linux/kprobes.h>
 #include <asm/kprobes.h>
 #include <asm/patch.h>
+#include <asm/asm-offsets.h>
 
 #include "simulate-insn.h"
 #include "decode-insn.h"
+#include "../../net/bpf_jit.h"
+
+/* Run @op's pre-handler on @regs, the pt_regs frame built by the detour buffer. */
+static void optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
+{
+	unsigned long flags;
+	struct kprobe_ctlblk *kcb;
+
+	/* The trampoline stores only x1-x31; fill in the fields it skips. */
+	regs->epc = (unsigned long)op->kp.addr;
+	regs->orig_a0 = ~0UL;
+
+	local_irq_save(flags);
+	kcb = get_kprobe_ctlblk();
+	/* A hit that arrives while another kprobe is running is only counted. */
+	if (kprobe_running()) {
+		kprobes_inc_nmissed_count(&op->kp);
+	} else {
+		__this_cpu_write(current_kprobe, &op->kp);
+		kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+		opt_pre_handler(&op->kp, regs);
+		__this_cpu_write(current_kprobe, NULL);
+	}
+	local_irq_restore(flags);
+}
+
+NOKPROBE_SYMBOL(optimized_callback);
 
 static inline int in_auipc_jalr_range(long val)
 {
@@ -30,6 +58,11 @@  static inline int in_auipc_jalr_range(long val)
 #endif
 }
 
+#define DETOUR_ADDR(code, offs) \
+	((void *)((unsigned long)(code) + (offs)))	/* address 'offs' bytes past 'code' */
+#define DETOUR_INSN(code, offs) \
+	(*(kprobe_opcode_t *)((unsigned long)(code) + (offs)))	/* lvalue for the opcode word at that address */
+
 /*
  * Copy optprobe assembly code template into detour buffer and modify some
  * instructions for each kprobe.
@@ -38,6 +71,49 @@  static void prepare_detour_buffer(kprobe_opcode_t *code, kprobe_opcode_t *slot,
 				  int rd, struct optimized_kprobe *op,
 				  kprobe_opcode_t opcode)
 {
+	long offs;
+	unsigned long data;
+
+	memcpy(code, optprobe_template_entry, MAX_OPTINSN_SIZE);	/* start from the unmodified template */
+
+	/* Step1: store the optimized_kprobe pointer in the template's first .dword */
+	memcpy(DETOUR_ADDR(code, DETOUR_SAVE_OFFSET), &op, sizeof(op));
+
+	/*
+	 * Step2: pc is the call site inside 'slot'; the bytes are written to 'code'
+	 * auipc ra, 0     --> auipc ra, HI20.{optimized_callback - pc}
+	 * jalr  ra, 0(ra) --> jalr  ra, LO12.{optimized_callback - pc}(ra)
+	 */
+	offs = (unsigned long)&optimized_callback -
+	       (unsigned long)DETOUR_ADDR(slot, DETOUR_CALL_OFFSET);
+	DETOUR_INSN(code, DETOUR_CALL_OFFSET) =
+				rv_auipc(1, (offs + (1 << 11)) >> 12);	/* +0x800 compensates for jalr sign-extending LO12 */
+	DETOUR_INSN(code, DETOUR_CALL_OFFSET + 0x4) =
+				rv_jalr(1, 1, offs & 0xFFF);
+
+	/* Step3: copy the replaced instructions from the probe address */
+	memcpy(DETOUR_ADDR(code, DETOUR_INSN_OFFSET), op->kp.addr,
+	       op->optinsn.length);
+	memcpy(DETOUR_ADDR(code, DETOUR_INSN_OFFSET), &opcode,
+	       GET_INSN_LENGTH(opcode));	/* then take the first instruction from 'opcode' instead */
+
+	/* Step4: record the address just past the replaced instructions */
+	data = (unsigned long)op->kp.addr + op->optinsn.length;
+	memcpy(DETOUR_ADDR(code, DETOUR_RETURN_OFFSET), &data, sizeof(data));
+
+	/*
+	 * Step5: NOTE(review) assumes each template insn is 4 bytes (no RVC)
+	 * auipc ra, 0      --> auipc rd, 0
+	 * ld/w  ra, -8(ra) --> ld/w  rd, -8(rd)
+	 * jalr  x0,  0(ra) --> jalr  x0,  0(rd)
+	 */
+	DETOUR_INSN(code, DETOUR_RETURN_OFFSET + 0x8) = rv_auipc(rd, 0);	/* +0x8 skips the 8-byte .dword */
+#if __riscv_xlen == 32
+	DETOUR_INSN(code, DETOUR_RETURN_OFFSET + 0xC) = rv_lw(rd, -8, rd);
+#else
+	DETOUR_INSN(code, DETOUR_RETURN_OFFSET + 0xC) = rv_ld(rd, -8, rd);
+#endif
+	DETOUR_INSN(code, DETOUR_RETURN_OFFSET + 0x10) = rv_jalr(0, rd, 0);
 }
 
 /* Registers the first usage of which is the destination of instruction */
diff --git a/arch/riscv/kernel/probes/opt_trampoline.S b/arch/riscv/kernel/probes/opt_trampoline.S
index 16160c4367ff..75e34e373cf2 100644
--- a/arch/riscv/kernel/probes/opt_trampoline.S
+++ b/arch/riscv/kernel/probes/opt_trampoline.S
@@ -1,12 +1,145 @@ 
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright (C) 2022 Guokai Chen
+ * Copyright (C) 2022 Liao, Chang <liaochang1@huawei.com>
  */
 
 #include <linux/linkage.h>
 
+#include <asm/asm.h>
-#incldue <asm/csr.h>
+#include <asm/csr.h>
 #include <asm/asm-offsets.h>
 
+/*
+ * prepare_detour_buffer() rewrites parts of this template using fixed
+ * 4-byte instruction words at fixed offsets, so keep the assembler from
+ * emitting compressed encodings (e.g. c.nop, c.jalr, c.jr) anywhere in it.
+ */
+	.option push
+	.option norvc
 SYM_ENTRY(optprobe_template_entry, SYM_L_GLOBAL, SYM_A_NONE)
+	addi  sp, sp, -(PT_SIZE_ON_STACK)
+	REG_S x1,  PT_RA(sp)
+	REG_S x2,  PT_SP(sp)
+	REG_S x3,  PT_GP(sp)
+	REG_S x4,  PT_TP(sp)
+	REG_S x5,  PT_T0(sp)
+	REG_S x6,  PT_T1(sp)
+	REG_S x7,  PT_T2(sp)
+	REG_S x8,  PT_S0(sp)
+	REG_S x9,  PT_S1(sp)
+	REG_S x10, PT_A0(sp)
+	REG_S x11, PT_A1(sp)
+	REG_S x12, PT_A2(sp)
+	REG_S x13, PT_A3(sp)
+	REG_S x14, PT_A4(sp)
+	REG_S x15, PT_A5(sp)
+	REG_S x16, PT_A6(sp)
+	REG_S x17, PT_A7(sp)
+	REG_S x18, PT_S2(sp)
+	REG_S x19, PT_S3(sp)
+	REG_S x20, PT_S4(sp)
+	REG_S x21, PT_S5(sp)
+	REG_S x22, PT_S6(sp)
+	REG_S x23, PT_S7(sp)
+	REG_S x24, PT_S8(sp)
+	REG_S x25, PT_S9(sp)
+	REG_S x26, PT_S10(sp)
+	REG_S x27, PT_S11(sp)
+	REG_S x28, PT_T3(sp)
+	REG_S x29, PT_T4(sp)
+	REG_S x30, PT_T5(sp)
+	REG_S x31, PT_T6(sp)
+	/* Set fp (s0) to the sp value on entry, which helps stack traces */
+	addi  s0, sp, (PT_SIZE_ON_STACK)
+	j 1f
+
+SYM_ENTRY(optprobe_template_save, SYM_L_GLOBAL, SYM_A_NONE)
+	/*
+	 * Step1:
+	 * Filled with the pointer to optimized_kprobe data
+	 */
+	.dword 0
+1:
+	/* a0 = optimized_kprobe pointer from the .dword above, a1 = pt_regs */
+	auipc a0, 0
+	REG_L a0, -8(a0)
+	add   a1, sp, x0
+
+SYM_ENTRY(optprobe_template_call, SYM_L_GLOBAL, SYM_A_NONE)
+	/*
+	 * Step2:
+	 * <IMME> of AUIPC/JALR are modified to the offset to optimized_callback;
+	 * a0 and a1 set up above are its two arguments.
+	 */
+	auipc ra, 0
+	jalr  ra, 0(ra)
+
+	REG_L x1,  PT_RA(sp)
+	REG_L x3,  PT_GP(sp)
+	REG_L x4,  PT_TP(sp)
+	REG_L x5,  PT_T0(sp)
+	REG_L x6,  PT_T1(sp)
+	REG_L x7,  PT_T2(sp)
+	REG_L x8,  PT_S0(sp)
+	REG_L x9,  PT_S1(sp)
+	REG_L x10, PT_A0(sp)
+	REG_L x11, PT_A1(sp)
+	REG_L x12, PT_A2(sp)
+	REG_L x13, PT_A3(sp)
+	REG_L x14, PT_A4(sp)
+	REG_L x15, PT_A5(sp)
+	REG_L x16, PT_A6(sp)
+	REG_L x17, PT_A7(sp)
+	REG_L x18, PT_S2(sp)
+	REG_L x19, PT_S3(sp)
+	REG_L x20, PT_S4(sp)
+	REG_L x21, PT_S5(sp)
+	REG_L x22, PT_S6(sp)
+	REG_L x23, PT_S7(sp)
+	REG_L x24, PT_S8(sp)
+	REG_L x25, PT_S9(sp)
+	REG_L x26, PT_S10(sp)
+	REG_L x27, PT_S11(sp)
+	REG_L x28, PT_T3(sp)
+	REG_L x29, PT_T4(sp)
+	REG_L x30, PT_T5(sp)
+	REG_L x31, PT_T6(sp)
+	REG_L x2,  PT_SP(sp)
+	addi  sp, sp, (PT_SIZE_ON_STACK)
+
+SYM_ENTRY(optprobe_template_insn, SYM_L_GLOBAL, SYM_A_NONE)
+	/*
+	 * Step3:
+	 * The NOPs are replaced by the probed instructions; the worst case
+	 * executed out of line is 3 RVC plus 1 RVI instruction (10 bytes).
+	 */
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	j 2f
+
+SYM_ENTRY(optprobe_template_return, SYM_L_GLOBAL, SYM_A_NONE)
+	/*
+	 * Step4:
+	 * Filled with the return address of long jump(AUIPC/JALR)
+	 */
+	.dword 0
+2:
+	/*
+	 * Step5:
+	 * The <RA> of AUIPC/LD/JALR is replaced for each kprobe; the sequence
+	 * reads the return address saved in the .dword above and jumps to it.
+	 */
+	auipc ra, 0
+	REG_L ra, -8(ra)
+	jalr  x0, 0(ra)
 SYM_ENTRY(optprobe_template_end, SYM_L_GLOBAL, SYM_A_NONE)
+	.option pop