[v20,08/11] ARM: kprobes: enable OPTPROBES for ARM 32

Message ID 1420785456-21900-1-git-send-email-wangnan0@huawei.com (mailing list archive)
State New, archived

Commit Message

Wang Nan Jan. 9, 2015, 6:37 a.m. UTC
This patch introduces kprobeopt for 32-bit ARM.

Limitations:
 - Currently only kernels compiled with the ARM ISA are supported.

 - The offset between the probe point and the optinsn slot must not be
   larger than 32MiB (see the illustrative sketch below). Masami
   Hiramatsu suggested replacing 2 words instead, but that would make
   things complex; a further patch can make that optimization.

Kprobe opt on ARM is simpler than kprobe opt on x86 because an ARM
instruction is always 4 bytes long and 4-byte aligned. This patch
replaces the probed instruction with a 'b' branch to trampoline code,
which then calls optimized_callback(). optimized_callback() calls
opt_pre_handler() to execute the kprobe handler, and also
emulates/simulates the replaced instruction.

When unregistering a kprobe, the deferred manner of the unoptimizer may
leave the branch instruction in place until the optimizer is called.
Unlike x86_64, which copies the probed insn after optprobe_template_end
and re-executes it, this patch calls singlestep to emulate/simulate the
insn directly. A further patch can optimize this behavior.
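
For illustration only (not part of the patch): the 32MiB limit follows
directly from the 'b' encoding's signed 24-bit immediate, which is
shifted left by two bits. A minimal user-space sketch of the arithmetic:

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative sketch, not kernel code: decode the PC-relative offset
 * encoded by an ARM 'b' instruction. imm24 is sign-extended and shifted
 * left by 2. (The branch target is this offset relative to the PC,
 * which reads 8 bytes ahead of the instruction; hence the extra 8 in
 * the patch's rel_chk computation below.)
 */
static long branch_offset(uint32_t insn)
{
	int32_t imm24 = insn & 0x00ffffff;

	if (imm24 & 0x00800000)		/* sign-extend bit 23 */
		imm24 |= 0xff000000;
	return (long)imm24 * 4;
}

int main(void)
{
	printf("max forward:  %ld\n", branch_offset(0x007fffff)); /*  0x1fffffc */
	printf("max backward: %ld\n", branch_offset(0x00800000)); /* -0x2000000 */
	return 0;
}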

Signed-off-by: Wang Nan <wangnan0@huawei.com>
Acked-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Jon Medhurst (Tixy) <tixy@linaro.org>
Reviewed-by: Jon Medhurst (Tixy) <tixy@linaro.org>
Cc: Russell King - ARM Linux <linux@arm.linux.org.uk>
Cc: Will Deacon <will.deacon@arm.com>
---
 arch/arm/Kconfig                        |   1 +
 arch/arm/{kernel => include/asm}/insn.h |   0
 arch/arm/include/asm/kprobes.h          |  29 +++
 arch/arm/kernel/Makefile                |   2 +-
 arch/arm/kernel/ftrace.c                |   3 +-
 arch/arm/kernel/jump_label.c            |   3 +-
 arch/arm/probes/kprobes/Makefile        |   1 +
 arch/arm/probes/kprobes/core.c          |  26 ++-
 arch/arm/probes/kprobes/core.h          |   2 +
 arch/arm/probes/kprobes/opt-arm.c       | 322 ++++++++++++++++++++++++++++++++
 10 files changed, 377 insertions(+), 12 deletions(-)
 rename arch/arm/{kernel => include/asm}/insn.h (100%)
 create mode 100644 arch/arm/probes/kprobes/opt-arm.c

Comments

Jon Medhurst (Tixy) Jan. 9, 2015, 10:25 a.m. UTC | #1
On Fri, 2015-01-09 at 14:37 +0800, Wang Nan wrote:
> [...]
> 
> +asm (
> +			".global optprobe_template_entry\n"
> +			"optprobe_template_entry:\n"
> +			".global optprobe_template_sub_sp\n"
> +			"optprobe_template_sub_sp:"
> +			"	sub	sp, sp, #0xff\n"
> +			"	stmia	sp, {r0 - r14} \n"
> +			".global optprobe_template_add_sp\n"
> +			"optprobe_template_add_sp:"
> +			"	add	r3, sp, #0xff\n"
> +			"	str	r3, [sp, #52]\n"
> +			"	mrs	r4, cpsr\n"
> +			"	str	r4, [sp, #64]\n"
> +			"	mov	r1, sp\n"
> +			"	ldr	r0, 1f\n"
> +			"	ldr	r2, 2f\n"
> +			/*
> +			 * AEABI requires an 8-byte aligned stack. If
> +			 * SP % 8 != 0 (SP % 4 == 0 is ensured),
> +			 * allocate more bytes here.
> +			 */
> +			"	and	r4, sp, #4\n"
> +			"	sub	sp, sp, r4\n"
> +#if __LINUX_ARM_ARCH__ >= 5
> +			"	blx	r2\n"
> +#else
> +			"	mov     lr, pc\n"
> +			"	bx	r2\n"

I think the BX instruction is not supported for ARMv4 chips that don't
have Thumb support (e.g. SA110), at least an old ARM ARM I have says BX
is supported on "Version 5 and above, and T variants of version 4".
Though building assabet_defconfig with kprobes enabled doesn't produce
an error for the BX instruction (!?)

To be safe I would be tempted to use "mov pc, r2" instead. Again, if you
agree, I'll change this in the patch in the branch I'm putting together.

[...]
Wang Nan Jan. 9, 2015, 10:55 a.m. UTC | #2
On 2015/1/9 18:25, Jon Medhurst (Tixy) wrote:
> On Fri, 2015-01-09 at 14:37 +0800, Wang Nan wrote:
>> [...]
>> +#if __LINUX_ARM_ARCH__ >= 5
>> +			"	blx	r2\n"
>> +#else
>> +			"	mov     lr, pc\n"
>> +			"	bx	r2\n"
> 
> I think the BX instruction is not supported for ARMv4 chips that don't
> have Thumb support (e.g. SA110), at least an old ARM ARM I have says BX
> is supported on "Version 5 and above, and T variants of version 4".
> Though building assabet_defconfig with kprobes enabled doesn't produce
> an error for the BX instruction (!?)
> 
> To be safe I would be tempted to use "mov pc, r2" instead. Again, if you
> agree, I'll change this in the patch in the branch I'm putting together.
> 
> [...]
> 
Sure. I tested calling a function through a function pointer and found
that gcc generates 'mov pc, r2', so there is no need for ISA switching.
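
For reference, a minimal sketch of that experiment (a hypothetical
reconstruction; the function name is made up):

/*
 * Compile with: arm-linux-gnueabihf-gcc -S -marm -march=armv4 fp.c
 * and inspect fp.s: the indirect call through the function pointer
 * should use 'mov pc, <reg>' rather than 'bx <reg>'.
 */
void call_fp(void (*fp)(void))
{
	fp();
}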
Russell King - ARM Linux Jan. 9, 2015, 4:35 p.m. UTC | #3
On Fri, Jan 09, 2015 at 10:25:54AM +0000, Jon Medhurst (Tixy) wrote:
> On Fri, 2015-01-09 at 14:37 +0800, Wang Nan wrote:
> > +			"	and	r4, sp, #4\n"
> > +			"	sub	sp, sp, r4\n"
> > +#if __LINUX_ARM_ARCH__ >= 5
> > +			"	blx	r2\n"
> > +#else
> > +			"	mov     lr, pc\n"
> > +			"	bx	r2\n"
> 
> I think the BX instruction is not supported for ARMv4 chips that don't
> have Thumb support (e.g. SA110), at least an old ARM ARM I have says BX
> is supported on "Version 5 and above, and T variants of version 4".

Correct.

> Though building assabet_defconfig with kprobes enabled doesn't produce
> an error for the BX instruction (!?)

Which config are you using?  Does it have CONFIG_CPU_32v4 enabled?
That should result in "-D__LINUX_ARM_ARCH__=4 -march=armv4" being
passed to the compiler (please check with make V=1).
Jon Medhurst (Tixy) Jan. 9, 2015, 5:28 p.m. UTC | #4
On Fri, 2015-01-09 at 16:35 +0000, Russell King - ARM Linux wrote:
> On Fri, Jan 09, 2015 at 10:25:54AM +0000, Jon Medhurst (Tixy) wrote:
> > On Fri, 2015-01-09 at 14:37 +0800, Wang Nan wrote:
> > > +			"	and	r4, sp, #4\n"
> > > +			"	sub	sp, sp, r4\n"
> > > +#if __LINUX_ARM_ARCH__ >= 5
> > > +			"	blx	r2\n"
> > > +#else
> > > +			"	mov     lr, pc\n"
> > > +			"	bx	r2\n"
> > 
> > I think the BX instruction is not supported for ARMv4 chips that don't
> > have Thumb support (e.g. SA110), at least an old ARM ARM I have says BX
> > is supported on "Version 5 and above, and T variants of version 4".
> 
> Correct.
> 
> > Though building assabet_defconfig with kprobes enabled doesn't produce
> > an error for the BX instruction (!?)
> 
> Which config are you using?  Does it have CONFIG_CPU_32v4 enabled?

Yes

> That should result in "-D__LINUX_ARM_ARCH__=4 -march=armv4" being
> passed to the compiler (please check with make V=1).

It does have that; the arguments for compiling this source file
include...

-mno-thumb-interwork -marm -D__LINUX_ARM_ARCH__=4 -march=armv4 -mtune=strongarm1100

Using objdump I can see that the BX instruction does indeed end up in
the code, it hasn't been auto-magically turned into a MOV PC,R2.

Adding a ".code 16" directive to the assembly produces "Error: selected
processor does not support THUMB opcodes", so at least it's got that
right. 

I have "gcc version 4.9.1 (Ubuntu/Linaro 4.9.1-16ubuntu6)"

Interestingly...

$ echo 'asm ("bx r2\n");' | arm-linux-gnueabihf-gcc -x c -S -march=armv4 -
<stdin>:1:0: warning: target CPU does not support THUMB instructions
$

but adding -marm gets rid of that warning.

$ echo 'asm ("bx r2\n");' | arm-linux-gnueabihf-gcc -x c -S -marm -march=armv4 -
$
Russell King - ARM Linux Jan. 9, 2015, 5:57 p.m. UTC | #5
On Fri, Jan 09, 2015 at 05:28:22PM +0000, Jon Medhurst (Tixy) wrote:
> Using objdump I can see that the BX instruction does indeed end up in
> the code, it hasn't been auto-magically turned into a MOV PC,R2.
> 
> Adding in a ".code 16" to the assembler produces "Error: selected
> processor does not support THUMB opcodes", so at least it's got that
> right. 
> 
> I have "gcc version 4.9.1 (Ubuntu/Linaro 4.9.1-16ubuntu6)"

Remember that it's binutils which issues the errors about the assembly.

> Interestingly...
> 
> $ echo 'asm ("bx r2\n");' | arm-linux-gnueabihf-gcc -x c -S -march=armv4 -
> <stdin>:1:0: warning: target CPU does not support THUMB instructions
> $

Mine doesn't do that.

> but adding -marm gets rid of that error.
> 
> $ echo 'asm ("bx r2\n");' | arm-linux-gnueabihf-gcc -x c -S -marm -march=armv4 -
> $

Yes - but check the -.s file for the output... this won't run the assembler
so the assembler won't check that the instruction is legal.

For me:

$ echo 'asm ("bx r2\n");' | arm-linux-gcc -x c -c -marm -march=armv4 -v - -o o.o

calls the assembler thusly:

/usr/local/lib/gcc/arm-linux-gnueabi/4.7.4/../../../../arm-linux-gnueabi/bin/as \
-v -march=armv4 -meabi=5 --fix-v4bx -o o.o /tmp/ccB0cZgO.s

Sure enough, the object file contains:

00000000 <.text>:
   0:   e12fff12        bx      r2
                        0: R_ARM_V4BX   *ABS*

so it looks like it's been told...  Then if you do:

$ arm-linux-ld --fix-v4bx -o o1.o o.o
$ arm-linux-objdump -dr o1.o

you get:

    8074:       e1a0f002        mov     pc, r2

Hmm, I wonder if this means we should have the kernel linker deal with
V4BX relocations on ARMv4, converting them to their mov pc, X variant.

Also, do we need --fix-v4bx for the link of vmlinux?
Jon Medhurst (Tixy) Jan. 9, 2015, 7:18 p.m. UTC | #6
On Fri, 2015-01-09 at 17:57 +0000, Russell King - ARM Linux wrote:
[...]
> For me:
> 
> $ echo 'asm ("bx r2\n");' | arm-linux-gcc -x c -c -marm -march=armv4 -v - -o o.o
> 
> calls the assembler thusly:
> 
> /usr/local/lib/gcc/arm-linux-gnueabi/4.7.4/../../../../arm-linux-gnueabi/bin/as \
> -v -march=armv4 -meabi=5 --fix-v4bx -o o.o /tmp/ccB0cZgO.s
> 
> Sure enough, the object file contains:
> 
> 00000000 <.text>:
>    0:   e12fff12        bx      r2
>                         0: R_ARM_V4BX   *ABS*
> 
> so it looks like it's been told...  Then if you do:
> 
> $ arm-linux-ld --fix-v4bx -o o1.o o.o
> $ arm-linux-objdump -dr o1.o
> 
> you get:
> 
>     8074:       e1a0f002        mov     pc, r2

I get results consistent with what you get above. A bit of googling
suggests that the generate-BX-and-fix-it-in-the-linker behaviour came
in many years ago with AEABI support.

> Hmm, I wonder if this means we should have the kernel linker deal with
> V4BX relocations on ARMv4, converting them to their mov pc, X variant.
> 
> Also, do we need --fix-v4bx for the link of vmlinux?

I guess the answer is yes if we want to catch uses of BX in inline
assembly. A quick and not very thorough grep of arch/arm for 'bx'
doesn't seem to turn up any existing dodgy uses, except in the kprobes
test code I wrote :-(
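
For reference, the rewrite being discussed is mechanical: 'bx Rm'
(0xe12fff1m) and 'mov pc, Rm' (0xe1a0f00m) differ only in bits 27..4,
so a V4BX fixup only needs to preserve the condition code and the
register field. An illustrative sketch (not from this patch; the
function name is made up):

#include <stdint.h>

/*
 * Turn a 'bx Rm' into 'mov pc, Rm', keeping cond (bits 31..28) and
 * Rm (bits 3..0); this matches the 0xe12fff12 -> 0xe1a0f002 rewrite
 * shown by objdump above.
 */
static uint32_t fix_v4bx(uint32_t insn)
{
	return (insn & 0xf000000f) | 0x01a0f000;
}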

Patch

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 97d07ed..3d5dc2d 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -60,6 +60,7 @@  config ARM
 	select HAVE_MEMBLOCK
 	select HAVE_MOD_ARCH_SPECIFIC if ARM_UNWIND
 	select HAVE_OPROFILE if (HAVE_PERF_EVENTS)
+	select HAVE_OPTPROBES if !THUMB2_KERNEL
 	select HAVE_PERF_EVENTS
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
diff --git a/arch/arm/kernel/insn.h b/arch/arm/include/asm/insn.h
similarity index 100%
rename from arch/arm/kernel/insn.h
rename to arch/arm/include/asm/insn.h
diff --git a/arch/arm/include/asm/kprobes.h b/arch/arm/include/asm/kprobes.h
index 56f9ac6..50ff3bc 100644
--- a/arch/arm/include/asm/kprobes.h
+++ b/arch/arm/include/asm/kprobes.h
@@ -50,5 +50,34 @@  int kprobe_fault_handler(struct pt_regs *regs, unsigned int fsr);
 int kprobe_exceptions_notify(struct notifier_block *self,
 			     unsigned long val, void *data);
 
+/* optinsn template addresses */
+extern __visible kprobe_opcode_t optprobe_template_entry;
+extern __visible kprobe_opcode_t optprobe_template_val;
+extern __visible kprobe_opcode_t optprobe_template_call;
+extern __visible kprobe_opcode_t optprobe_template_end;
+extern __visible kprobe_opcode_t optprobe_template_sub_sp;
+extern __visible kprobe_opcode_t optprobe_template_add_sp;
+
+#define MAX_OPTIMIZED_LENGTH	4
+#define MAX_OPTINSN_SIZE				\
+	((unsigned long)&optprobe_template_end -	\
+	 (unsigned long)&optprobe_template_entry)
+#define RELATIVEJUMP_SIZE	4
+
+struct arch_optimized_insn {
+	/*
+	 * Copy of the original instructions.
+	 * Unlike x86, kprobe_opcode_t on ARM is u32.
+	 */
+#define MAX_COPIED_INSN	DIV_ROUND_UP(RELATIVEJUMP_SIZE, sizeof(kprobe_opcode_t))
+	kprobe_opcode_t copied_insn[MAX_COPIED_INSN];
+	/* detour code buffer */
+	kprobe_opcode_t *insn;
+	/*
+	 * We always copy one instruction on ARM,
+	 * so size will always be 4, and unlike x86, there is no
+	 * need for a size field.
+	 */
+};
 
 #endif /* _ARM_KPROBES_H */
diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
index 9c51a43..902397d 100644
--- a/arch/arm/kernel/Makefile
+++ b/arch/arm/kernel/Makefile
@@ -52,7 +52,7 @@  obj-$(CONFIG_FUNCTION_GRAPH_TRACER)	+= ftrace.o insn.o
 obj-$(CONFIG_JUMP_LABEL)	+= jump_label.o insn.o patch.o
 obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel.o
 # Main staffs in KPROBES are in arch/arm/probes/ .
-obj-$(CONFIG_KPROBES)		+= patch.o
+obj-$(CONFIG_KPROBES)		+= patch.o insn.o
 obj-$(CONFIG_OABI_COMPAT)	+= sys_oabi-compat.o
 obj-$(CONFIG_ARM_THUMBEE)	+= thumbee.o
 obj-$(CONFIG_KGDB)		+= kgdb.o patch.o
diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c
index b8c75e4..709ee1d 100644
--- a/arch/arm/kernel/ftrace.c
+++ b/arch/arm/kernel/ftrace.c
@@ -20,8 +20,7 @@ 
 #include <asm/cacheflush.h>
 #include <asm/opcodes.h>
 #include <asm/ftrace.h>
-
-#include "insn.h"
+#include <asm/insn.h>
 
 #ifdef CONFIG_THUMB2_KERNEL
 #define	NOP		0xf85deb04	/* pop.w {lr} */
diff --git a/arch/arm/kernel/jump_label.c b/arch/arm/kernel/jump_label.c
index d8da075..e39cbf4 100644
--- a/arch/arm/kernel/jump_label.c
+++ b/arch/arm/kernel/jump_label.c
@@ -1,8 +1,7 @@ 
 #include <linux/kernel.h>
 #include <linux/jump_label.h>
 #include <asm/patch.h>
-
-#include "insn.h"
+#include <asm/insn.h>
 
 #ifdef HAVE_JUMP_LABEL
 
diff --git a/arch/arm/probes/kprobes/Makefile b/arch/arm/probes/kprobes/Makefile
index bc8d504..76a36bf 100644
--- a/arch/arm/probes/kprobes/Makefile
+++ b/arch/arm/probes/kprobes/Makefile
@@ -7,5 +7,6 @@  obj-$(CONFIG_KPROBES)		+= actions-thumb.o checkers-thumb.o
 test-kprobes-objs		+= test-thumb.o
 else
 obj-$(CONFIG_KPROBES)		+= actions-arm.o checkers-arm.o
+obj-$(CONFIG_OPTPROBES)		+= opt-arm.o
 test-kprobes-objs		+= test-arm.o
 endif
diff --git a/arch/arm/probes/kprobes/core.c b/arch/arm/probes/kprobes/core.c
index 3a58db4..a4ec240 100644
--- a/arch/arm/probes/kprobes/core.c
+++ b/arch/arm/probes/kprobes/core.c
@@ -163,19 +163,31 @@  void __kprobes arch_arm_kprobe(struct kprobe *p)
  * memory. It is also needed to atomically set the two half-words of a 32-bit
  * Thumb breakpoint.
  */
-int __kprobes __arch_disarm_kprobe(void *p)
-{
-	struct kprobe *kp = p;
-	void *addr = (void *)((uintptr_t)kp->addr & ~1);
-
-	__patch_text(addr, kp->opcode);
+struct patch {
+	void *addr;
+	unsigned int insn;
+};
 
+static int __kprobes_remove_breakpoint(void *data)
+{
+	struct patch *p = data;
+	__patch_text(p->addr, p->insn);
 	return 0;
 }
 
+void __kprobes kprobes_remove_breakpoint(void *addr, unsigned int insn)
+{
+	struct patch p = {
+		.addr = addr,
+		.insn = insn,
+	};
+	stop_machine(__kprobes_remove_breakpoint, &p, cpu_online_mask);
+}
+
 void __kprobes arch_disarm_kprobe(struct kprobe *p)
 {
-	stop_machine(__arch_disarm_kprobe, p, cpu_online_mask);
+	kprobes_remove_breakpoint((void *)((uintptr_t)p->addr & ~1),
+			p->opcode);
 }
 
 void __kprobes arch_remove_kprobe(struct kprobe *p)
diff --git a/arch/arm/probes/kprobes/core.h b/arch/arm/probes/kprobes/core.h
index f88c79f..b3036c5 100644
--- a/arch/arm/probes/kprobes/core.h
+++ b/arch/arm/probes/kprobes/core.h
@@ -30,6 +30,8 @@ 
 #define KPROBE_THUMB16_BREAKPOINT_INSTRUCTION	0xde18
 #define KPROBE_THUMB32_BREAKPOINT_INSTRUCTION	0xf7f0a018
 
+extern void kprobes_remove_breakpoint(void *addr, unsigned int insn);
+
 enum probes_insn __kprobes
 kprobe_decode_ldmstm(kprobe_opcode_t insn, struct arch_probes_insn *asi,
 		const struct decode_header *h);
diff --git a/arch/arm/probes/kprobes/opt-arm.c b/arch/arm/probes/kprobes/opt-arm.c
new file mode 100644
index 0000000..cb47eff
--- /dev/null
+++ b/arch/arm/probes/kprobes/opt-arm.c
@@ -0,0 +1,322 @@ 
+/*
+ *  Kernel Probes Jump Optimization (Optprobes)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ * Copyright (C) Hitachi Ltd., 2012
+ * Copyright (C) Huawei Inc., 2014
+ */
+
+#include <linux/kprobes.h>
+#include <linux/jump_label.h>
+#include <asm/kprobes.h>
+#include <asm/cacheflush.h>
+/* for arm_gen_branch */
+#include <asm/insn.h>
+/* for patch_text */
+#include <asm/patch.h>
+
+#include "core.h"
+
+/*
+ * NOTE: the first sub and add instructions will be modified according
+ * to the stack cost of the probed instruction.
+ */
+asm (
+			".global optprobe_template_entry\n"
+			"optprobe_template_entry:\n"
+			".global optprobe_template_sub_sp\n"
+			"optprobe_template_sub_sp:"
+			"	sub	sp, sp, #0xff\n"
+			"	stmia	sp, {r0 - r14} \n"
+			".global optprobe_template_add_sp\n"
+			"optprobe_template_add_sp:"
+			"	add	r3, sp, #0xff\n"
+			"	str	r3, [sp, #52]\n"
+			"	mrs	r4, cpsr\n"
+			"	str	r4, [sp, #64]\n"
+			"	mov	r1, sp\n"
+			"	ldr	r0, 1f\n"
+			"	ldr	r2, 2f\n"
+			/*
+			 * AEABI requires an 8-byte aligned stack. If
+			 * SP % 8 != 0 (SP % 4 == 0 is ensured),
+			 * allocate more bytes here.
+			 */
+			"	and	r4, sp, #4\n"
+			"	sub	sp, sp, r4\n"
+#if __LINUX_ARM_ARCH__ >= 5
+			"	blx	r2\n"
+#else
+			"	mov     lr, pc\n"
+			"	bx	r2\n"
+#endif
+			"	add	sp, sp, r4\n"
+			"	ldr	r1, [sp, #64]\n"
+			"	tst	r1, #"__stringify(PSR_T_BIT)"\n"
+			"	ldrne	r2, [sp, #60]\n"
+			"	orrne	r2, #1\n"
+			"	strne	r2, [sp, #60] @ set bit0 of PC for thumb\n"
+			"	msr	cpsr_cxsf, r1\n"
+			"	ldmia	sp, {r0 - r15}\n"
+			".global optprobe_template_val\n"
+			"optprobe_template_val:\n"
+			"1:	.long 0\n"
+			".global optprobe_template_call\n"
+			"optprobe_template_call:\n"
+			"2:	.long 0\n"
+			".global optprobe_template_end\n"
+			"optprobe_template_end:\n");
+
+#define TMPL_VAL_IDX \
+	((unsigned long *)&optprobe_template_val - (unsigned long *)&optprobe_template_entry)
+#define TMPL_CALL_IDX \
+	((unsigned long *)&optprobe_template_call - (unsigned long *)&optprobe_template_entry)
+#define TMPL_END_IDX \
+	((unsigned long *)&optprobe_template_end - (unsigned long *)&optprobe_template_entry)
+#define TMPL_ADD_SP \
+	((unsigned long *)&optprobe_template_add_sp - (unsigned long *)&optprobe_template_entry)
+#define TMPL_SUB_SP \
+	((unsigned long *)&optprobe_template_sub_sp - (unsigned long *)&optprobe_template_entry)
+
+/*
+ * With the ARM ISA, any instruction can be optimized except those,
+ * like 'str r0, [sp, r1]', which store to the stack and whose stack
+ * space consumption cannot be determined statically.
+ */
+int arch_prepared_optinsn(struct arch_optimized_insn *optinsn)
+{
+	return optinsn->insn != NULL;
+}
+
+/*
+ * In the ARM ISA, kprobe opt always replaces exactly one instruction
+ * (4 bytes long and 4-byte aligned), so it is impossible for another
+ * kprobe to lie within the replaced range. Always return 0.
+ */
+int arch_check_optimized_kprobe(struct optimized_kprobe *op)
+{
+	return 0;
+}
+
+/* Caller must ensure addr & 3 == 0 */
+static int can_optimize(struct kprobe *kp)
+{
+	if (kp->ainsn.stack_space < 0)
+		return 0;
+	/*
+	 * 255 is the biggest imm that can be used in 'sub r0, r0, #<imm>';
+	 * numbers larger than 255 need special encoding.
+	 */
+	if (kp->ainsn.stack_space > 255 - sizeof(struct pt_regs))
+		return 0;
+	return 1;
+}
+
+/* Free optimized instruction slot */
+static void
+__arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
+{
+	if (op->optinsn.insn) {
+		free_optinsn_slot(op->optinsn.insn, dirty);
+		op->optinsn.insn = NULL;
+	}
+}
+
+extern void kprobe_handler(struct pt_regs *regs);
+
+static void
+optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
+{
+	unsigned long flags;
+	struct kprobe *p = &op->kp;
+	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+	/* Save skipped registers */
+	regs->ARM_pc = (unsigned long)op->kp.addr;
+	regs->ARM_ORIG_r0 = ~0UL;
+
+	local_irq_save(flags);
+
+	if (kprobe_running()) {
+		kprobes_inc_nmissed_count(&op->kp);
+	} else {
+		__this_cpu_write(current_kprobe, &op->kp);
+		kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+		opt_pre_handler(&op->kp, regs);
+		__this_cpu_write(current_kprobe, NULL);
+	}
+
+	/* In either case, we must singlestep the replaced instruction. */
+	op->kp.ainsn.insn_singlestep(p->opcode, &p->ainsn, regs);
+
+	local_irq_restore(flags);
+}
+
+int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *orig)
+{
+	kprobe_opcode_t *code;
+	unsigned long rel_chk;
+	unsigned long val;
+	unsigned long stack_protect = sizeof(struct pt_regs);
+
+	if (!can_optimize(orig))
+		return -EILSEQ;
+
+	code = get_optinsn_slot();
+	if (!code)
+		return -ENOMEM;
+
+	/*
+	 * Verify that the address gap is within the 32MiB range, because
+	 * this uses a relative jump.
+	 *
+	 * kprobe opt uses a 'b' instruction to branch to optinsn.insn.
+	 * According to the ARM manual, the branch instruction is:
+	 *
+	 *   31  28 27           24 23             0
+	 *  +------+---+---+---+---+----------------+
+	 *  | cond | 1 | 0 | 1 | 0 |      imm24     |
+	 *  +------+---+---+---+---+----------------+
+	 *
+	 * imm24 is a signed 24-bit integer. The real branch offset is
+	 * computed by: imm32 = SignExtend(imm24:'00', 32);
+	 *
+	 * So the maximum forward branch is:
+	 *   (0x007fffff << 2) = 0x01fffffc
+	 * and the maximum backward branch is:
+	 *   (0xff800000 << 2) = 0xfe000000 = -0x2000000
+	 *
+	 * We can simply check (rel & 0xfe000003):
+	 *  if rel is positive, (rel & 0xfe000000) should be 0;
+	 *  if rel is negative, (rel & 0xfe000000) should be 0xfe000000;
+	 *  the last '3' checks alignment.
+	 */
+	rel_chk = (unsigned long)((long)code -
+			(long)orig->addr + 8) & 0xfe000003;
+
+	if ((rel_chk != 0) && (rel_chk != 0xfe000000)) {
+		/*
+		 * Unlike x86, we free the code buffer directly instead of
+		 * calling __arch_remove_optimized_kprobe(), because we
+		 * have not filled in any field of op yet.
+		 */
+		free_optinsn_slot(code, 0);
+		return -ERANGE;
+	}
+
+	/* Copy arch-dep-instance from template. */
+	memcpy(code, &optprobe_template_entry,
+			TMPL_END_IDX * sizeof(kprobe_opcode_t));
+
+	/* Adjust buffer according to instruction. */
+	BUG_ON(orig->ainsn.stack_space < 0);
+
+	stack_protect += orig->ainsn.stack_space;
+
+	/* Should have been filtered by can_optimize(). */
+	BUG_ON(stack_protect > 255);
+
+	/* Create a 'sub sp, sp, #<stack_protect>' */
+	code[TMPL_SUB_SP] = __opcode_to_mem_arm(0xe24dd000 | stack_protect);
+	/* Create a 'add r3, sp, #<stack_protect>' */
+	code[TMPL_ADD_SP] = __opcode_to_mem_arm(0xe28d3000 | stack_protect);
+
+	/* Set probe information */
+	val = (unsigned long)op;
+	code[TMPL_VAL_IDX] = val;
+
+	/* Set probe function call */
+	val = (unsigned long)optimized_callback;
+	code[TMPL_CALL_IDX] = val;
+
+	flush_icache_range((unsigned long)code,
+			   (unsigned long)(&code[TMPL_END_IDX]));
+
+	/* A non-NULL op->optinsn.insn means the probe is prepared. */
+	op->optinsn.insn = code;
+	return 0;
+}
+
+void __kprobes arch_optimize_kprobes(struct list_head *oplist)
+{
+	struct optimized_kprobe *op, *tmp;
+
+	list_for_each_entry_safe(op, tmp, oplist, list) {
+		unsigned long insn;
+		WARN_ON(kprobe_disabled(&op->kp));
+
+		/*
+		 * Back up the instructions which will be replaced
+		 * by the jump address.
+		 */
+		memcpy(op->optinsn.copied_insn, op->kp.addr,
+				RELATIVEJUMP_SIZE);
+
+		insn = arm_gen_branch((unsigned long)op->kp.addr,
+				(unsigned long)op->optinsn.insn);
+		BUG_ON(insn == 0);
+
+		/*
+		 * Make it a conditional branch if the replaced insn
+		 * is conditional.
+		 */
+		insn = (__mem_to_opcode_arm(
+			  op->optinsn.copied_insn[0]) & 0xf0000000) |
+			(insn & 0x0fffffff);
+
+		/*
+		 * As in __arch_disarm_kprobe(), operations which
+		 * remove breakpoints must be wrapped in stop_machine()
+		 * to avoid racing.
+		 */
+		kprobes_remove_breakpoint(op->kp.addr, insn);
+
+		list_del_init(&op->list);
+	}
+}
+
+void arch_unoptimize_kprobe(struct optimized_kprobe *op)
+{
+	arch_arm_kprobe(&op->kp);
+}
+
+/*
+ * Recover original instructions and breakpoints from relative jumps.
+ * The caller must hold kprobe_mutex.
+ */
+void arch_unoptimize_kprobes(struct list_head *oplist,
+			    struct list_head *done_list)
+{
+	struct optimized_kprobe *op, *tmp;
+
+	list_for_each_entry_safe(op, tmp, oplist, list) {
+		arch_unoptimize_kprobe(op);
+		list_move(&op->list, done_list);
+	}
+}
+
+int arch_within_optimized_kprobe(struct optimized_kprobe *op,
+				unsigned long addr)
+{
+	return ((unsigned long)op->kp.addr <= addr &&
+		(unsigned long)op->kp.addr + RELATIVEJUMP_SIZE > addr);
+}
+
+void arch_remove_optimized_kprobe(struct optimized_kprobe *op)
+{
+	__arch_remove_optimized_kprobe(op, 1);
+}
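
For anyone wanting to exercise the above, a minimal module sketch
(untested here; the probed symbol is only an example). It registers an
ordinary kprobe, and the generic kprobes code then calls the
arch_*_optimized_kprobe() hooks above to convert it into a branch-based
optprobe when can_optimize() allows:

#include <linux/module.h>
#include <linux/kprobes.h>

static int pre(struct kprobe *p, struct pt_regs *regs)
{
	/* Runs via optimized_callback() once the probe is optimized. */
	pr_info("hit %pS, pc=%08lx\n", p->addr, regs->ARM_pc);
	return 0;
}

static struct kprobe kp = {
	.symbol_name	= "do_fork",	/* example target only */
	.pre_handler	= pre,
};

static int __init opt_test_init(void)
{
	return register_kprobe(&kp);
}

static void __exit opt_test_exit(void)
{
	unregister_kprobe(&kp);
}

module_init(opt_test_init);
module_exit(opt_test_exit);
MODULE_LICENSE("GPL");

Once loaded, /sys/kernel/debug/kprobes/list should show the probe tagged
[OPTIMIZED], and the debug.kprobes-optimization sysctl can be used to
toggle optimization globally for comparison.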