@@ -909,6 +909,11 @@ static const struct uprobe_xol_ops push_xol_ops = {
.emulate = push_emulate_op,
};
+static int is_nop5_insn(uprobe_opcode_t *insn)
+{
+ return !memcmp(insn, x86_nops[5], 5);
+}
+
/* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */
static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
{
@@ -928,6 +933,8 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
break;
case 0x0f:
+ if (is_nop5_insn((uprobe_opcode_t *) &auprobe->insn))
+ goto setup;
if (insn->opcode.nbytes != 2)
return -ENOSYS;
/*
Add support to emulate nop5 as the original uprobe instruction. This speeds up uprobes on top of nop5 instructions: (results from benchs/run_bench_uprobes.sh) current: uprobe-nop : 3.252 ± 0.019M/s uprobe-push : 3.097 ± 0.002M/s uprobe-ret : 1.116 ± 0.001M/s --> uprobe-nop5 : 1.115 ± 0.001M/s uretprobe-nop : 1.731 ± 0.016M/s uretprobe-push : 1.673 ± 0.023M/s uretprobe-ret : 0.843 ± 0.009M/s --> uretprobe-nop5 : 1.124 ± 0.001M/s after the change: uprobe-nop : 3.281 ± 0.003M/s uprobe-push : 3.085 ± 0.003M/s uprobe-ret : 1.130 ± 0.000M/s --> uprobe-nop5 : 3.276 ± 0.007M/s uretprobe-nop : 1.716 ± 0.016M/s uretprobe-push : 1.651 ± 0.017M/s uretprobe-ret : 0.846 ± 0.006M/s --> uretprobe-nop5 : 3.279 ± 0.002M/s Strangely, uretprobe-nop5 is now much faster than uretprobe-nop, while the perf profiles for both are almost identical. I'm still investigating that. Signed-off-by: Jiri Olsa <jolsa@kernel.org> --- arch/x86/kernel/uprobes.c | 7 +++++++ 1 file changed, 7 insertions(+)