diff mbox series

tracing: Trace instrumentation begin and end

Message ID 20230321215121.71b339c5@gandalf.local.home (mailing list archive)
State Deferred
Headers show
Series tracing: Trace instrumentation begin and end | expand

Commit Message

Steven Rostedt March 22, 2023, 1:51 a.m. UTC
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>

I do a lot of analysis on how much the kernel is interfering with user
space and why. One measurement was to trace the length of page faults.
That was done with:

 trace-cmd start -p function_graph --max-graph_depth 1

Which would produce output like:

 3)   4.429 us    | __do_page_fault();
 3)   8.208 us    | __do_page_fault();
 3)   6.877 us    | __do_page_fault();
 3)   + 10.033 us | __do_page_fault();
 3)   8.013 us    | __do_page_fault();

But now that entry into the kernel is all protected with noinstr, function
graph tracing no longer catches entry into the kernel, and
__do_page_fault() is no longer traced.

Even trying just:

 trace-cmd start -p function_graph --max-graph_depth 1 -e page_fault_user

produces:

 2)   0.764 us    |  exit_to_user_mode_prepare();
 2)               |  /* page_fault_user: address=0x7fadaba40fd8 ip=0x7fadaba40fd8 error_code=0x14 */
 2)   0.581 us    |  down_read_trylock();

The "page_fault_user" event is not encapsulated around any function, which
means it probably triggered and went back to user space without any trace
to know how long that page fault took (the down_read_trylock() is likely to
be part of the page fault function, but that's besides the point).

To help bring back the old functionality, two trace points are added. One
just after instrumentation begins, and one just before it ends. This way,
we can see all the time that the kernel can do something meaningful, and we
will trace it.

 trace-cmd start -e instrumentation -e page_fault_user

produces:

       trace-cmd-911     [001] d....  2616.730120: instrumentation_begin: exc_page_fault+0x2e/0x250 <- asm_exc_page_fault+0x26/0x30
       trace-cmd-911     [001] d....  2616.730121: page_fault_user: address=0x7fcadc16ab70 ip=0x7fcadc16ab70 error_code=0x14
       trace-cmd-911     [001] d....  2616.730137: instrumentation_end: exc_page_fault+0x8d/0x250 <- asm_exc_page_fault+0x26/0x30

The time between the instrumentation_begin and the instrumentation_end
that has the page_fault_user can be considered the time the page fault
took.

To even make a histogram out of this, and using sqlhist to create
synthetic events. A synthetic event can be used to join the
instrumentation_begin with page_fault_user, and then another synthetic
event can join that synthetic event to instrumentation_end, passing the
timestamp between them.

 sqlhist -e -n fault_start 'SELECT start.TIMESTAMP AS start_ts
	 FROM instrumentation_begin AS start JOIN page_fault_user AS end
         ON start.common_pid = end.common_pid'

 sqlhist -e -n fault_total 'SELECT (end.TIMESTAMP - start.start_ts) AS delta
         FROM fault_start AS start JOIN instrumentation_end AS end
         ON start.common_pid = end.common_pid'

 ># cd /sys/kernel/tracing
 ># echo 'hist:keys=ip.sym-offset,pip.sys-offset,delta.buckets=20' > events/synthetic/fault_total/trigger
 ># cat events/synthetic/fault_total/hist
 # event histogram
 #
 # trigger info: hist:keys=common_pid.execname,delta.buckets=2000:vals=hitcount:sort=common_pid.execname,delta.buckets=2000:size=2048 [active]
 #

 { common_pid: systemd-journal [       250], delta: ~ 2000-3999 } hitcount:          7
 { common_pid: systemd-journal [       250], delta: ~ 4000-5999 } hitcount:        153
 { common_pid: systemd-journal [       250], delta: ~ 6000-7999 } hitcount:          7
 { common_pid: systemd-journal [       250], delta: ~ 8000-9999 } hitcount:          1
 { common_pid: systemd-journal [       250], delta: ~ 14000-15999 } hitcount:          1
 { common_pid: systemd-journal [       250], delta: ~ 16000-17999 } hitcount:          1
 { common_pid: systemd-journal [       250], delta: ~ 20000-21999 } hitcount:          9
 { common_pid: systemd-journal [       250], delta: ~ 22000-23999 } hitcount:          5
 { common_pid: systemd-journal [       250], delta: ~ 24000-25999 } hitcount:          3
 { common_pid: systemd-journal [       250], delta: ~ 42000-43999 } hitcount:          2
 { common_pid: systemd-journal [       250], delta: ~ 44000-45999 } hitcount:          1
 { common_pid: systemd-journal [       250], delta: ~ 52000-53999 } hitcount:          1
 { common_pid: systemd-journal [       250], delta: ~ 70000-71999 } hitcount:          1
 { common_pid: systemd-journal [       250], delta: ~ 74000-75999 } hitcount:          1
 { common_pid: systemd-journal [       250], delta: ~ 162000-163999 } hitcount:          1
 { common_pid: systemd-journal [       250], delta: ~ 172000-173999 } hitcount:          1
 { common_pid: NetworkManager  [       393], delta: ~ 10000-11999 } hitcount:          1
 { common_pid: NetworkManager  [       393], delta: ~ 30000-31999 } hitcount:          1
 { common_pid: exim4           [       719], delta: ~ 8000-9999 } hitcount:          1
 { common_pid: exim4           [       719], delta: ~ 12000-13999 } hitcount:          3

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/instrumentation.h        | 34 ++++++++++++++++++++--
 include/trace/events/instrumentation.h | 40 ++++++++++++++++++++++++++
 kernel/trace/trace.c                   | 20 +++++++++++++
 3 files changed, 92 insertions(+), 2 deletions(-)
 create mode 100644 include/trace/events/instrumentation.h

Comments

Joel Fernandes March 22, 2023, 2:10 a.m. UTC | #1
On Tue, Mar 21, 2023 at 9:51 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
>
> I do a lot of analysis on how much the kernel is interfering with user
> space and why. One measurement was to trace the length of page faults.
> That was done with:
>
>  trace-cmd start -p function_graph --max-graph_depth 1
>
> Which would produce output like:
>
>  3)   4.429 us    | __do_page_fault();
>  3)   8.208 us    | __do_page_fault();
>  3)   6.877 us    | __do_page_fault();
>  3)   + 10.033 us | __do_page_fault();
>  3)   8.013 us    | __do_page_fault();
>
> But now that entry into the kernel is all protected with noinstr, function
> graph tracing no longer catches entry into the kernel, and
> __do_page_fault() is no longer traced.
>
> Even trying just:
>
>  trace-cmd start -p function_graph --max-graph_depth 1 -e page_fault_user
>
> produces:
>
>  2)   0.764 us    |  exit_to_user_mode_prepare();
>  2)               |  /* page_fault_user: address=0x7fadaba40fd8 ip=0x7fadaba40fd8 error_code=0x14 */
>  2)   0.581 us    |  down_read_trylock();
>
> The "page_fault_user" event is not encapsulated around any function, which
> means it probably triggered and went back to user space without any trace
> to know how long that page fault took (the down_read_trylock() is likely to
> be part of the page fault function, but that's besides the point).
>
> To help bring back the old functionality, two trace points are added. One
> just after instrumentation begins, and one just before it ends. This way,
> we can see all the time that the kernel can do something meaningful, and we
> will trace it.
>
>  trace-cmd start -e instrumentation -e page_fault_user

One drawback is it requires the user to know about instrumentation.

I don't have an issue with the new instrumentation tracepoint but I
was wondering, since you have this specific usecase a lot, would it
not also be better to add a dedicated tracepoint to measure pagefault
time? And only collect the start timestamp if the tracepoint is
enabled.

Or even a pair of pagefault begin/end tracepoints and then one can go
create synthetic events on top to measure the time delta. :-P

thanks,

 - Joel


>
> produces:
>
>        trace-cmd-911     [001] d....  2616.730120: instrumentation_begin: exc_page_fault+0x2e/0x250 <- asm_exc_page_fault+0x26/0x30
>        trace-cmd-911     [001] d....  2616.730121: page_fault_user: address=0x7fcadc16ab70 ip=0x7fcadc16ab70 error_code=0x14
>        trace-cmd-911     [001] d....  2616.730137: instrumentation_end: exc_page_fault+0x8d/0x250 <- asm_exc_page_fault+0x26/0x30
>
> The time between the instrumentation_begin and the instrumentation_end
> that has the page_fault_user can be considered the time the page fault
> took.
>
> To even make a histogram out of this, and using sqlhist to create
> synthetic events. A synthetic event can be used to join the
> instrumentation_begin with page_fault_user, and then another synthetic
> event can join that synthetic event to instrumentation_end, passing the
> timestamp between them.
>
>  sqlhist -e -n fault_start 'SELECT start.TIMESTAMP AS start_ts
>          FROM instrumentation_begin AS start JOIN page_fault_user AS end
>          ON start.common_pid = end.common_pid'
>
>  sqlhist -e -n fault_total 'SELECT (end.TIMESTAMP - start.start_ts) AS delta
>          FROM fault_start AS start JOIN instrumentation_end AS end
>          ON start.common_pid = end.common_pid'
>
>  ># cd /sys/kernel/tracing
>  ># echo 'hist:keys=ip.sym-offset,pip.sys-offset,delta.buckets=20' > events/synthetic/fault_total/trigger
>  ># cat events/synthetic/fault_total/hist
>  # event histogram
>  #
>  # trigger info: hist:keys=common_pid.execname,delta.buckets=2000:vals=hitcount:sort=common_pid.execname,delta.buckets=2000:size=2048 [active]
>  #
>
>  { common_pid: systemd-journal [       250], delta: ~ 2000-3999 } hitcount:          7
>  { common_pid: systemd-journal [       250], delta: ~ 4000-5999 } hitcount:        153
>  { common_pid: systemd-journal [       250], delta: ~ 6000-7999 } hitcount:          7
>  { common_pid: systemd-journal [       250], delta: ~ 8000-9999 } hitcount:          1
>  { common_pid: systemd-journal [       250], delta: ~ 14000-15999 } hitcount:          1
>  { common_pid: systemd-journal [       250], delta: ~ 16000-17999 } hitcount:          1
>  { common_pid: systemd-journal [       250], delta: ~ 20000-21999 } hitcount:          9
>  { common_pid: systemd-journal [       250], delta: ~ 22000-23999 } hitcount:          5
>  { common_pid: systemd-journal [       250], delta: ~ 24000-25999 } hitcount:          3
>  { common_pid: systemd-journal [       250], delta: ~ 42000-43999 } hitcount:          2
>  { common_pid: systemd-journal [       250], delta: ~ 44000-45999 } hitcount:          1
>  { common_pid: systemd-journal [       250], delta: ~ 52000-53999 } hitcount:          1
>  { common_pid: systemd-journal [       250], delta: ~ 70000-71999 } hitcount:          1
>  { common_pid: systemd-journal [       250], delta: ~ 74000-75999 } hitcount:          1
>  { common_pid: systemd-journal [       250], delta: ~ 162000-163999 } hitcount:          1
>  { common_pid: systemd-journal [       250], delta: ~ 172000-173999 } hitcount:          1
>  { common_pid: NetworkManager  [       393], delta: ~ 10000-11999 } hitcount:          1
>  { common_pid: NetworkManager  [       393], delta: ~ 30000-31999 } hitcount:          1
>  { common_pid: exim4           [       719], delta: ~ 8000-9999 } hitcount:          1
>  { common_pid: exim4           [       719], delta: ~ 12000-13999 } hitcount:          3
>
> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
> ---
>  include/linux/instrumentation.h        | 34 ++++++++++++++++++++--
>  include/trace/events/instrumentation.h | 40 ++++++++++++++++++++++++++
>  kernel/trace/trace.c                   | 20 +++++++++++++
>  3 files changed, 92 insertions(+), 2 deletions(-)
>  create mode 100644 include/trace/events/instrumentation.h
>
> diff --git a/include/linux/instrumentation.h b/include/linux/instrumentation.h
> index bc7babe91b2e..f013acfc9192 100644
> --- a/include/linux/instrumentation.h
> +++ b/include/linux/instrumentation.h
> @@ -2,6 +2,34 @@
>  #ifndef __LINUX_INSTRUMENTATION_H
>  #define __LINUX_INSTRUMENTATION_H
>
> +#ifdef CONFIG_TRACING
> +#include <linux/kernel.h>
> +#include <linux/tracepoint-defs.h>
> +
> +void call_trace_instrumentation_begin(unsigned long ip, unsigned long pip);
> +void call_trace_instrumentation_end(unsigned long ip, unsigned long pip);
> +
> +DECLARE_TRACEPOINT(instrumentation_begin);
> +DECLARE_TRACEPOINT(instrumentation_end);
> +
> +static inline void do_trace_instrumentation_begin(unsigned long ip,
> +                                                 unsigned long pip)
> +{
> +       if (tracepoint_enabled(instrumentation_begin))
> +               call_trace_instrumentation_begin(ip, pip);
> +}
> +
> +static inline void do_trace_instrumentation_end(unsigned long ip,
> +                                               unsigned long pip)
> +{
> +       if (tracepoint_enabled(instrumentation_end))
> +               call_trace_instrumentation_end(ip, pip);
> +}
> +#else /* !CONFIG_TRACING */
> +static inline void do_trace_instrumentation_begin(unsigned long ip, unsigned long pip) { }
> +static inline void do_trace_instrumentation_end(unsigned long ip, unsigned long pip) { }
> +#endif /* CONFIG_TRACING */
> +
>  #ifdef CONFIG_NOINSTR_VALIDATION
>
>  #include <linux/stringify.h>
> @@ -12,6 +40,7 @@
>                      ".pushsection .discard.instr_begin\n\t"            \
>                      ".long " __stringify(c) "b - .\n\t"                \
>                      ".popsection\n\t" : : "i" (c));                    \
> +       do_trace_instrumentation_begin(_THIS_IP_, _RET_IP_);            \
>  })
>  #define instrumentation_begin() __instrumentation_begin(__COUNTER__)
>
> @@ -47,6 +76,7 @@
>   * part of the condition block and does not escape.
>   */
>  #define __instrumentation_end(c) ({                                    \
> +       do_trace_instrumentation_end(_THIS_IP_, _RET_IP_);              \
>         asm volatile(__stringify(c) ": nop\n\t"                         \
>                      ".pushsection .discard.instr_end\n\t"              \
>                      ".long " __stringify(c) "b - .\n\t"                \
> @@ -54,8 +84,8 @@
>  })
>  #define instrumentation_end() __instrumentation_end(__COUNTER__)
>  #else /* !CONFIG_NOINSTR_VALIDATION */
> -# define instrumentation_begin()       do { } while(0)
> -# define instrumentation_end()         do { } while(0)
> +# define instrumentation_begin() do_trace_instrumentation_begin(_THIS_IP_, _RET_IP_);
> +# define instrumentation_end()  do_trace_instrumentation_end(_THIS_IP_, _RET_IP_);
>  #endif /* CONFIG_NOINSTR_VALIDATION */
>
>  #endif /* __LINUX_INSTRUMENTATION_H */
> diff --git a/include/trace/events/instrumentation.h b/include/trace/events/instrumentation.h
> new file mode 100644
> index 000000000000..6b92a205d9ae
> --- /dev/null
> +++ b/include/trace/events/instrumentation.h
> @@ -0,0 +1,40 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM instrumentation
> +
> +#if !defined(_TRACE_INSTRUMENTATION_H) || defined(TRACE_HEADER_MULTI_READ)
> +#define _TRACE_INSTRUMENTATION_H
> +
> +#include <linux/tracepoint.h>
> +
> +DECLARE_EVENT_CLASS(instrumentation_template,
> +
> +       TP_PROTO(unsigned long ip, unsigned long pip),
> +
> +       TP_ARGS(ip, pip),
> +
> +       TP_STRUCT__entry(
> +               __field( unsigned long,         ip )
> +               __field( unsigned long,         pip )
> +       ),
> +
> +       TP_fast_assign(
> +               __entry->ip = ip;
> +               __entry->pip = pip;
> +       ),
> +
> +       TP_printk("%pS <- %pS", (void *)__entry->ip, (void *)__entry->pip)
> +);
> +
> +DEFINE_EVENT(instrumentation_template, instrumentation_begin,
> +            TP_PROTO(unsigned long ip, unsigned long pip),
> +            TP_ARGS(ip, pip));
> +
> +DEFINE_EVENT(instrumentation_template, instrumentation_end,
> +            TP_PROTO(unsigned long ip, unsigned long pip),
> +            TP_ARGS(ip, pip));
> +
> +#endif /* _TRACE_INSTRUMENTATION_H */
> +
> +/* This part must be outside protection */
> +#include <trace/define_trace.h>
> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> index 937e9676dfd4..071885824408 100644
> --- a/kernel/trace/trace.c
> +++ b/kernel/trace/trace.c
> @@ -54,6 +54,26 @@
>  #include "trace.h"
>  #include "trace_output.h"
>
> +/* Define instrumentation tracing here, as there is no real home for it */
> +#define CREATE_TRACE_POINTS
> +#include <trace/events/instrumentation.h>
> +#undef CREATE_TRACE_POINTS
> +
> +void call_trace_instrumentation_begin(unsigned long ip, unsigned long pip)
> +{
> +       trace_instrumentation_begin(ip, pip);
> +}
> +EXPORT_SYMBOL_GPL(call_trace_instrumentation_begin);
> +
> +void call_trace_instrumentation_end(unsigned long ip, unsigned long pip)
> +{
> +       trace_instrumentation_end(ip, pip);
> +}
> +EXPORT_SYMBOL_GPL(call_trace_instrumentation_end);
> +
> +EXPORT_TRACEPOINT_SYMBOL_GPL(instrumentation_begin);
> +EXPORT_TRACEPOINT_SYMBOL_GPL(instrumentation_end);
> +
>  /*
>   * On boot up, the ring buffer is set to the minimum size, so that
>   * we do not waste memory on systems that are not using tracing.
> --
> 2.39.1
>
kernel test robot March 22, 2023, 3:26 a.m. UTC | #2
Hi Steven,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v6.3-rc3 next-20230321]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Steven-Rostedt/tracing-Trace-instrumentation-begin-and-end/20230322-095354
patch link:    https://lore.kernel.org/r/20230321215121.71b339c5%40gandalf.local.home
patch subject: [PATCH] tracing: Trace instrumentation begin and end
config: powerpc-allyesconfig (https://download.01.org/0day-ci/archive/20230322/202303221148.o4nkqckQ-lkp@intel.com/config)
compiler: powerpc-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/1f40755bb9b4817135459d6cf76fcbd17ffb53dd
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Steven-Rostedt/tracing-Trace-instrumentation-begin-and-end/20230322-095354
        git checkout 1f40755bb9b4817135459d6cf76fcbd17ffb53dd
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=powerpc olddefconfig
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=powerpc prepare

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Link: https://lore.kernel.org/oe-kbuild-all/202303221148.o4nkqckQ-lkp@intel.com/

All errors (new ones prefixed by >>):

   scripts/genksyms/parse.y: warning: 9 shift/reduce conflicts [-Wconflicts-sr]
   scripts/genksyms/parse.y: warning: 5 reduce/reduce conflicts [-Wconflicts-rr]
   scripts/genksyms/parse.y: note: rerun with option '-Wcounterexamples' to generate conflict counterexamples
   In file included from arch/powerpc/include/asm/page.h:247,
                    from arch/powerpc/include/asm/thread_info.h:13,
                    from include/linux/thread_info.h:60,
                    from arch/powerpc/include/asm/ptrace.h:342,
                    from arch/powerpc/include/asm/hw_irq.h:12,
                    from arch/powerpc/include/asm/irqflags.h:12,
                    from include/linux/irqflags.h:16,
                    from include/asm-generic/cmpxchg-local.h:6,
                    from arch/powerpc/include/asm/cmpxchg.h:755,
                    from arch/powerpc/include/asm/atomic.h:11,
                    from include/linux/atomic.h:7,
                    from include/linux/tracepoint-defs.h:11,
                    from include/linux/instrumentation.h:7,
                    from include/asm-generic/bug.h:6,
                    from arch/powerpc/include/asm/bug.h:159,
                    from include/linux/bug.h:5,
                    from include/linux/fortify-string.h:5,
                    from include/linux/string.h:254,
                    from include/linux/uuid.h:11,
                    from include/linux/mod_devicetable.h:13,
                    from scripts/mod/devicetable-offsets.c:3:
   arch/powerpc/include/asm/page_32.h: In function 'clear_page':
>> arch/powerpc/include/asm/page_32.h:48:9: error: implicit declaration of function 'WARN_ON' [-Werror=implicit-function-declaration]
      48 |         WARN_ON((unsigned long)addr & (L1_CACHE_BYTES - 1));
         |         ^~~~~~~
   include/linux/thread_info.h: In function 'check_copy_size':
>> include/linux/thread_info.h:249:13: error: implicit declaration of function 'WARN_ON_ONCE' [-Werror=implicit-function-declaration]
     249 |         if (WARN_ON_ONCE(bytes > INT_MAX))
         |             ^~~~~~~~~~~~
   cc1: some warnings being treated as errors
   make[2]: *** [scripts/Makefile.build:114: scripts/mod/devicetable-offsets.s] Error 1
   make[2]: Target 'scripts/mod/' not remade because of errors.
   make[1]: *** [Makefile:1285: prepare0] Error 2
   make[1]: Target 'prepare' not remade because of errors.
   make: *** [Makefile:226: __sub-make] Error 2
   make: Target 'prepare' not remade because of errors.


vim +/WARN_ON +48 arch/powerpc/include/asm/page_32.h

7ab0b7cb8951d4 arch/powerpc/include/asm/page_32.h Christophe Leroy 2019-08-16  38  
5736f96d12dd42 arch/powerpc/include/asm/page_32.h Christophe Leroy 2016-02-09  39  /*
5736f96d12dd42 arch/powerpc/include/asm/page_32.h Christophe Leroy 2016-02-09  40   * Clear page using the dcbz instruction, which doesn't cause any
5736f96d12dd42 arch/powerpc/include/asm/page_32.h Christophe Leroy 2016-02-09  41   * memory traffic (except to write out any cache lines which get
5736f96d12dd42 arch/powerpc/include/asm/page_32.h Christophe Leroy 2016-02-09  42   * displaced).  This only works on cacheable memory.
5736f96d12dd42 arch/powerpc/include/asm/page_32.h Christophe Leroy 2016-02-09  43   */
5736f96d12dd42 arch/powerpc/include/asm/page_32.h Christophe Leroy 2016-02-09  44  static inline void clear_page(void *addr)
5736f96d12dd42 arch/powerpc/include/asm/page_32.h Christophe Leroy 2016-02-09  45  {
5736f96d12dd42 arch/powerpc/include/asm/page_32.h Christophe Leroy 2016-02-09  46  	unsigned int i;
5736f96d12dd42 arch/powerpc/include/asm/page_32.h Christophe Leroy 2016-02-09  47  
7ab0b7cb8951d4 arch/powerpc/include/asm/page_32.h Christophe Leroy 2019-08-16 @48  	WARN_ON((unsigned long)addr & (L1_CACHE_BYTES - 1));
7ab0b7cb8951d4 arch/powerpc/include/asm/page_32.h Christophe Leroy 2019-08-16  49  
5736f96d12dd42 arch/powerpc/include/asm/page_32.h Christophe Leroy 2016-02-09  50  	for (i = 0; i < PAGE_SIZE / L1_CACHE_BYTES; i++, addr += L1_CACHE_BYTES)
5736f96d12dd42 arch/powerpc/include/asm/page_32.h Christophe Leroy 2016-02-09  51  		dcbz(addr);
5736f96d12dd42 arch/powerpc/include/asm/page_32.h Christophe Leroy 2016-02-09  52  }
5cd16ee934eafc include/asm-powerpc/page_32.h      Michael Ellerman 2005-11-11  53  extern void copy_page(void *to, void *from);
5cd16ee934eafc include/asm-powerpc/page_32.h      Michael Ellerman 2005-11-11  54
kernel test robot March 22, 2023, 3:47 a.m. UTC | #3
Hi Steven,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v6.3-rc3 next-20230321]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Steven-Rostedt/tracing-Trace-instrumentation-begin-and-end/20230322-095354
patch link:    https://lore.kernel.org/r/20230321215121.71b339c5%40gandalf.local.home
patch subject: [PATCH] tracing: Trace instrumentation begin and end
config: m68k-allyesconfig (https://download.01.org/0day-ci/archive/20230322/202303221157.uDnuxtAh-lkp@intel.com/config)
compiler: m68k-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/1f40755bb9b4817135459d6cf76fcbd17ffb53dd
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Steven-Rostedt/tracing-Trace-instrumentation-begin-and-end/20230322-095354
        git checkout 1f40755bb9b4817135459d6cf76fcbd17ffb53dd
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=m68k olddefconfig
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=m68k prepare

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Link: https://lore.kernel.org/oe-kbuild-all/202303221157.uDnuxtAh-lkp@intel.com/

All errors (new ones prefixed by >>):

   scripts/genksyms/parse.y: warning: 9 shift/reduce conflicts [-Wconflicts-sr]
   scripts/genksyms/parse.y: warning: 5 reduce/reduce conflicts [-Wconflicts-rr]
   scripts/genksyms/parse.y: note: rerun with option '-Wcounterexamples' to generate conflict counterexamples
   In file included from include/asm-generic/bug.h:6,
                    from arch/m68k/include/asm/bug.h:32,
                    from include/linux/bug.h:5,
                    from include/linux/thread_info.h:13,
                    from include/asm-generic/preempt.h:5,
                    from ./arch/m68k/include/generated/asm/preempt.h:1,
                    from include/linux/preempt.h:78,
                    from arch/m68k/include/asm/irqflags.h:6,
                    from include/linux/irqflags.h:16,
                    from arch/m68k/include/asm/atomic.h:6,
                    from include/linux/atomic.h:7,
                    from include/linux/rcupdate.h:25,
                    from include/linux/rculist.h:11,
                    from include/linux/pid.h:5,
                    from include/linux/sched.h:14,
                    from arch/m68k/kernel/asm-offsets.c:15:
   include/linux/thread_info.h: In function 'check_copy_size':
>> include/linux/instrumentation.h:87:65: error: '_THIS_IP_' undeclared (first use in this function)
      87 | # define instrumentation_begin() do_trace_instrumentation_begin(_THIS_IP_, _RET_IP_);
         |                                                                 ^~~~~~~~~
   include/asm-generic/bug.h:96:17: note: in expansion of macro 'instrumentation_begin'
      96 |                 instrumentation_begin();                                \
         |                 ^~~~~~~~~~~~~~~~~~~~~
   include/asm-generic/bug.h:94:33: note: in expansion of macro '__WARN_printf'
      94 | #define __WARN()                __WARN_printf(TAINT_WARN, NULL)
         |                                 ^~~~~~~~~~~~~
   include/asm-generic/bug.h:124:17: note: in expansion of macro '__WARN'
     124 |                 __WARN();                                               \
         |                 ^~~~~~
   include/linux/once_lite.h:31:25: note: in expansion of macro 'WARN_ON'
      31 |                         func(__VA_ARGS__);                              \
         |                         ^~~~
   include/asm-generic/bug.h:147:9: note: in expansion of macro 'DO_ONCE_LITE_IF'
     147 |         DO_ONCE_LITE_IF(condition, WARN_ON, 1)
         |         ^~~~~~~~~~~~~~~
   include/linux/thread_info.h:249:13: note: in expansion of macro 'WARN_ON_ONCE'
     249 |         if (WARN_ON_ONCE(bytes > INT_MAX))
         |             ^~~~~~~~~~~~
   include/linux/instrumentation.h:87:65: note: each undeclared identifier is reported only once for each function it appears in
      87 | # define instrumentation_begin() do_trace_instrumentation_begin(_THIS_IP_, _RET_IP_);
         |                                                                 ^~~~~~~~~
   include/asm-generic/bug.h:96:17: note: in expansion of macro 'instrumentation_begin'
      96 |                 instrumentation_begin();                                \
         |                 ^~~~~~~~~~~~~~~~~~~~~
   include/asm-generic/bug.h:94:33: note: in expansion of macro '__WARN_printf'
      94 | #define __WARN()                __WARN_printf(TAINT_WARN, NULL)
         |                                 ^~~~~~~~~~~~~
   include/asm-generic/bug.h:124:17: note: in expansion of macro '__WARN'
     124 |                 __WARN();                                               \
         |                 ^~~~~~
   include/linux/once_lite.h:31:25: note: in expansion of macro 'WARN_ON'
      31 |                         func(__VA_ARGS__);                              \
         |                         ^~~~
   include/asm-generic/bug.h:147:9: note: in expansion of macro 'DO_ONCE_LITE_IF'
     147 |         DO_ONCE_LITE_IF(condition, WARN_ON, 1)
         |         ^~~~~~~~~~~~~~~
   include/linux/thread_info.h:249:13: note: in expansion of macro 'WARN_ON_ONCE'
     249 |         if (WARN_ON_ONCE(bytes > INT_MAX))
         |             ^~~~~~~~~~~~
>> include/linux/instrumentation.h:87:76: error: '_RET_IP_' undeclared (first use in this function)
      87 | # define instrumentation_begin() do_trace_instrumentation_begin(_THIS_IP_, _RET_IP_);
         |                                                                            ^~~~~~~~
   include/asm-generic/bug.h:96:17: note: in expansion of macro 'instrumentation_begin'
      96 |                 instrumentation_begin();                                \
         |                 ^~~~~~~~~~~~~~~~~~~~~
   include/asm-generic/bug.h:94:33: note: in expansion of macro '__WARN_printf'
      94 | #define __WARN()                __WARN_printf(TAINT_WARN, NULL)
         |                                 ^~~~~~~~~~~~~
   include/asm-generic/bug.h:124:17: note: in expansion of macro '__WARN'
     124 |                 __WARN();                                               \
         |                 ^~~~~~
   include/linux/once_lite.h:31:25: note: in expansion of macro 'WARN_ON'
      31 |                         func(__VA_ARGS__);                              \
         |                         ^~~~
   include/asm-generic/bug.h:147:9: note: in expansion of macro 'DO_ONCE_LITE_IF'
     147 |         DO_ONCE_LITE_IF(condition, WARN_ON, 1)
         |         ^~~~~~~~~~~~~~~
   include/linux/thread_info.h:249:13: note: in expansion of macro 'WARN_ON_ONCE'
     249 |         if (WARN_ON_ONCE(bytes > INT_MAX))
         |             ^~~~~~~~~~~~
   make[2]: *** [scripts/Makefile.build:114: arch/m68k/kernel/asm-offsets.s] Error 1
   make[2]: Target 'prepare' not remade because of errors.
   make[1]: *** [Makefile:1286: prepare0] Error 2
   make[1]: Target 'prepare' not remade because of errors.
   make: *** [Makefile:226: __sub-make] Error 2
   make: Target 'prepare' not remade because of errors.


vim +/_THIS_IP_ +87 include/linux/instrumentation.h

    46	
    47	/*
    48	 * Because instrumentation_{begin,end}() can nest, objtool validation considers
    49	 * _begin() a +1 and _end() a -1 and computes a sum over the instructions.
    50	 * When the value is greater than 0, we consider instrumentation allowed.
    51	 *
    52	 * There is a problem with code like:
    53	 *
    54	 * noinstr void foo()
    55	 * {
    56	 *	instrumentation_begin();
    57	 *	...
    58	 *	if (cond) {
    59	 *		instrumentation_begin();
    60	 *		...
    61	 *		instrumentation_end();
    62	 *	}
    63	 *	bar();
    64	 *	instrumentation_end();
    65	 * }
    66	 *
    67	 * If instrumentation_end() would be an empty label, like all the other
    68	 * annotations, the inner _end(), which is at the end of a conditional block,
    69	 * would land on the instruction after the block.
    70	 *
    71	 * If we then consider the sum of the !cond path, we'll see that the call to
    72	 * bar() is with a 0-value, even though, we meant it to happen with a positive
    73	 * value.
    74	 *
    75	 * To avoid this, have _end() be a NOP instruction, this ensures it will be
    76	 * part of the condition block and does not escape.
    77	 */
    78	#define __instrumentation_end(c) ({					\
    79		do_trace_instrumentation_end(_THIS_IP_, _RET_IP_);		\
    80		asm volatile(__stringify(c) ": nop\n\t"				\
    81			     ".pushsection .discard.instr_end\n\t"		\
    82			     ".long " __stringify(c) "b - .\n\t"		\
    83			     ".popsection\n\t" : : "i" (c));			\
    84	})
    85	#define instrumentation_end() __instrumentation_end(__COUNTER__)
    86	#else /* !CONFIG_NOINSTR_VALIDATION */
  > 87	# define instrumentation_begin() do_trace_instrumentation_begin(_THIS_IP_, _RET_IP_);
    88	# define instrumentation_end()	 do_trace_instrumentation_end(_THIS_IP_, _RET_IP_);
    89	#endif /* CONFIG_NOINSTR_VALIDATION */
    90
Thomas Gleixner March 22, 2023, 11:19 a.m. UTC | #4
Steven!

On Tue, Mar 21 2023 at 21:51, Steven Rostedt wrote:
> From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
> produces:
>
>  2)   0.764 us    |  exit_to_user_mode_prepare();
>  2)               |  /* page_fault_user: address=0x7fadaba40fd8 ip=0x7fadaba40fd8 error_code=0x14 */
>  2)   0.581 us    |  down_read_trylock();
>
> The "page_fault_user" event is not encapsulated around any function, which
> means it probably triggered and went back to user space without any trace
> to know how long that page fault took (the down_read_trylock() is likely to
> be part of the page fault function, but that's besides the point).
>
> To help bring back the old functionality, two trace points are added. One
> just after instrumentation begins, and one just before it ends. This way,
> we can see all the time that the kernel can do something meaningful, and we
> will trace it.

Seriously? That's completely insane. Have you actually looked how many
instrumentation_begin()/end() pairs are in the affected code pathes?

Obviously not. It's a total of _five_ for every syscall and at least
_four_ for every interrupt/exception from user mode.

The number #1 design rule for instrumentation is to be as non-intrusive as
possible and not to be as lazy as possible.

instrumentation_begin()/end() is solely meant for objtool validation and
nothing else.

There are clearly less horrible ways to retrieve the #PF duration, no?

Thanks,

        tglx
Steven Rostedt March 22, 2023, 12:48 p.m. UTC | #5
On Wed, 22 Mar 2023 12:19:14 +0100
Thomas Gleixner <tglx@linutronix.de> wrote:

> Steven!
> 
> On Tue, Mar 21 2023 at 21:51, Steven Rostedt wrote:
> > From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
> > produces:
> >
> >  2)   0.764 us    |  exit_to_user_mode_prepare();
> >  2)               |  /* page_fault_user: address=0x7fadaba40fd8 ip=0x7fadaba40fd8 error_code=0x14 */
> >  2)   0.581 us    |  down_read_trylock();
> >
> > The "page_fault_user" event is not encapsulated around any function, which
> > means it probably triggered and went back to user space without any trace
> > to know how long that page fault took (the down_read_trylock() is likely to
> > be part of the page fault function, but that's besides the point).
> >
> > To help bring back the old functionality, two trace points are added. One
> > just after instrumentation begins, and one just before it ends. This way,
> > we can see all the time that the kernel can do something meaningful, and we
> > will trace it.  
> 
> Seriously? That's completely insane. Have you actually looked how many
> instrumentation_begin()/end() pairs are in the affected code pathes?
> 
> Obviously not. It's a total of _five_ for every syscall and at least
> _four_ for every interrupt/exception from user mode.
> 
> The number #1 design rule for instrumentation is to be as non-intrusive as
> possible and not to be as lazy as possible.

And it still is. It still uses nops when not enabled. I could even add a
config to only have this compiled in when the config is set, so that
production can disable it if it wants to.

Just in case it's not obvious:

	if (tracepoint_enabled(instrumentation_begin))
		call_trace_instrumentation_begin(ip, pip);

is equivalent to:

	if (static_key_false(&__tracepoint_instrumentation_begin.key))
		call_trace_instrumentation_begin(ip, pip);

We have trace points in preempt_enable/disable() I think that's *far* more
intrusive.

> 
> instrumentation_begin()/end() is solely meant for objtool validation and
> nothing else.
> 
> There are clearly less horrible ways to retrieve the #PF duration, no?

It's not just for #PF, that was just one example. I use to use function
graph tracing max_depth_count=1 to verify NO_HZ_FULL to make sure there's
no entry into the kernel. That doesn't work anymore. Even compat syscalls
are not traced.

I lost a kernel feature with the noinstr push and this is the closest that
comes to bringing it back. And the more we add noinstr, the more the kernel
becomes a black box again.

-- Steve
Peter Zijlstra March 22, 2023, 12:52 p.m. UTC | #6
On Wed, Mar 22, 2023 at 08:48:34AM -0400, Steven Rostedt wrote:
> On Wed, 22 Mar 2023 12:19:14 +0100
> Thomas Gleixner <tglx@linutronix.de> wrote:
> 
> > Steven!
> > 
> > On Tue, Mar 21 2023 at 21:51, Steven Rostedt wrote:
> > > From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
> > > produces:
> > >
> > >  2)   0.764 us    |  exit_to_user_mode_prepare();
> > >  2)               |  /* page_fault_user: address=0x7fadaba40fd8 ip=0x7fadaba40fd8 error_code=0x14 */
> > >  2)   0.581 us    |  down_read_trylock();
> > >
> > > The "page_fault_user" event is not encapsulated around any function, which
> > > means it probably triggered and went back to user space without any trace
> > > to know how long that page fault took (the down_read_trylock() is likely to
> > > be part of the page fault function, but that's besides the point).
> > >
> > > To help bring back the old functionality, two trace points are added. One
> > > just after instrumentation begins, and one just before it ends. This way,
> > > we can see all the time that the kernel can do something meaningful, and we
> > > will trace it.  
> > 
> > Seriously? That's completely insane. Have you actually looked how many
> > instrumentation_begin()/end() pairs are in the affected code pathes?
> > 
> > Obviously not. It's a total of _five_ for every syscall and at least
> > _four_ for every interrupt/exception from user mode.
> > 
> > The number #1 design rule for instrumentation is to be as non-intrusive as
> > possible and not to be as lazy as possible.
> 
> And it still is. It still uses nops when not enabled. I could even add a
> config to only have this compiled in when the config is set, so that
> production can disable it if it wants to.
> 
> Just in case it's not obvious:
> 
> 	if (tracepoint_enabled(instrumentation_begin))
> 		call_trace_instrumentation_begin(ip, pip);
> 
> is equivalent to:
> 
> 	if (static_key_false(&__tracepoint_instrumentation_begin.key))
> 		call_trace_instrumentation_begin(ip, pip);
> 

It is still completely insane.

NAK.
Thomas Gleixner March 22, 2023, 3:39 p.m. UTC | #7
Steven!

On Wed, Mar 22 2023 at 08:48, Steven Rostedt wrote:
> On Wed, 22 Mar 2023 12:19:14 +0100
> Thomas Gleixner <tglx@linutronix.de> wrote:
>> Seriously? That's completely insane. Have you actually looked how many
>> instrumentation_begin()/end() pairs are in the affected code pathes?
>> 
>> Obviously not. It's a total of _five_ for every syscall and at least
>> _four_ for every interrupt/exception from user mode.
>> 
>> The number #1 design rule for instrumentation is to be as non-intrusive as
>> possible and not to be as lazy as possible.
>
> And it still is. It still uses nops when not enabled. I could even add a
> config to only have this compiled in when the config is set, so that
> production can disable it if it wants to.
>
> Just in case it's not obvious:
>
> 	if (tracepoint_enabled(instrumentation_begin))
> 		call_trace_instrumentation_begin(ip, pip);
>
> is equivalent to:
>
> 	if (static_key_false(&__tracepoint_instrumentation_begin.key))
> 		call_trace_instrumentation_begin(ip, pip);

And that makes the insanity of enabling 10 tracepoints in the syscall
path and at mininum 8 tracepoints in the exception/interrupt path any
smaller?

> We have trace points in preempt_enable/disable() I think that's *far* more
> intrusive.

What? If you want to do preempt_enable()/disable() tracing then there is
no other choice than tracing every single invocation.

But for figuring out how long a syscall, interrupt or exception takes
there are exactly TWO tracepoints required, not 10 or 8. And it's bloody
obvious where to place them, right?

>> instrumentation_begin()/end() is solely meant for objtool validation and
>> nothing else.
>> 
>> There are clearly less horrible ways to retrieve the #PF duration, no?
>
> It's not just for #PF, that was just one example. I use to use function
> graph tracing max_depth_count=1 to verify NO_HZ_FULL to make sure there's
> no entry into the kernel. That doesn't work anymore. Even compat syscalls
> are not traced.

That still works. noinstr did neither break syscall tracing nor any of
the interrupt/exception tracepoints which can be used to validate the
NOHZ full mechanics. Your fancy favourite script might not work anymore,
but claiming that it can't be traced is just nonsense.

> I lost a kernel feature with the noinstr push and this is the closest that
> comes to bringing it back.

This is the closest _you_ came up with without thinking about it for a
split second.

> And the more we add noinstr, the more the kernel becomes a black box
> again.

It does not. noinstr is a technical requirement to keep instrumentation
out of code pathes which are not able to handle instrumentation. You
know that very well, so please stop this theatrical handwaving and come
back if you have sensible technical arguments.

Thanks,

        tglx
Steven Rostedt March 22, 2023, 6:16 p.m. UTC | #8
On Wed, 22 Mar 2023 16:39:41 +0100
Thomas Gleixner <tglx@linutronix.de> wrote:

> But for figuring out how long a syscall, interrupt or exception takes
> there are exactly TWO tracepoints required, not 10 or 8. And it's bloody
> obvious where to place them, right?

Not always, and it's pretty much always architecture dependent. I was
looking for an architecture agnostic approach, as I imagine all archs will
be eventually using this.

> 
> >> instrumentation_begin()/end() is solely meant for objtool validation and
> >> nothing else.
> >> 
> >> There are clearly less horrible ways to retrieve the #PF duration, no?  
> >
> > It's not just for #PF, that was just one example. I use to use function
> > graph tracing max_depth_count=1 to verify NO_HZ_FULL to make sure there's
> > no entry into the kernel. That doesn't work anymore. Even compat syscalls
> > are not traced.  
> 
> That still works. noinstr did neither break syscall tracing nor any of
> the interrupt/exception tracepoints which can be used to validate the
> NOHZ full mechanics. Your fancy favourite script might not work anymore,
> but claiming that it can't be traced is just nonsense.

I don't remember fully, but there was something that was missing. It was
back in 2021, so I do not remember fully. That's when I first wrote this
patch. I only just redisovered it and wanted to share ;-) The only thing I
did differently since then was to add the page fault logic, because that
was something I am currently interested it.

Things could have changed since then. But if adding trace events for the
missing syscalls and around exceptions for timing purposes is OK, then I'm
happy to go that approach.

> 
> > I lost a kernel feature with the noinstr push and this is the closest that
> > comes to bringing it back.  
> 
> This is the closest _you_ came up with without thinking about it for a
> split second.
> 
> > And the more we add noinstr, the more the kernel becomes a black box
> > again.  
> 
> It does not. noinstr is a technical requirement to keep instrumentation
> out of code pathes which are not able to handle instrumentation. You
> know that very well, so please stop this theatrical handwaving and come
> back if you have sensible technical arguments.

I never said nor implied that it's not important. I'm just concerned that
we currently have no way to see when it's happening.

But I'll drop this patch and look to add specific trace events in specific
points to be able to get the timings that are useful.

Thanks,

-- Steve
Thomas Gleixner March 22, 2023, 10:03 p.m. UTC | #9
Steven!

On Wed, Mar 22 2023 at 14:16, Steven Rostedt wrote:
> On Wed, 22 Mar 2023 16:39:41 +0100
> Thomas Gleixner <tglx@linutronix.de> wrote:
>
>> But for figuring out how long a syscall, interrupt or exception takes
>> there are exactly TWO tracepoints required, not 10 or 8. And it's bloody
>> obvious where to place them, right?
>
> Not always, and it's pretty much always architecture dependent. I was
> looking for an architecture agnostic approach, as I imagine all archs will
> be eventually using this.

All architectures, at least those which matter, will eventually use the
generic entry code, which is completely architecture agnostic.

And again, while instrumentation_begin/end() might be in use on more
architectures today, it's still the fundamentally wrong place to stick a
tracepoint into. See below.

> Things could have changed since then. But if adding trace events for the
> missing syscalls and around exceptions for timing purposes is OK, then I'm
> happy to go that approach.

If there are tracepoints missing for syscalls then the obvious correct
thing is to add them so that the syscall coverage is complete instead of
abusing the lack of tracepoints as an argument for horrible hackery.

If there is a value to have tracepoints around exceptions etc. for
timing purposes then they definitely make more sense than the abuse of
instrumentation_begin/end() which just generates noise and adds overhead
into completely unrelated code pathes.

>> > And the more we add noinstr, the more the kernel becomes a black box
>> > again.  
>> 
>> It does not. noinstr is a technical requirement to keep instrumentation
>> out of code pathes which are not able to handle instrumentation. You
>> know that very well, so please stop this theatrical handwaving and come
>> back if you have sensible technical arguments.
>
> I never said nor implied that it's not important. I'm just concerned that
> we currently have no way to see when it's happening.

There is no value to see the instrumentation_begin()/end() annotations
in tracing, really.

They are annotations to enable objtool to validate that the noinstr
constraints are not violated. We need quite a few of them as the tooling
operates at function scope. So a function which is invoked from noinstr
low level code looks like this:

noinstr foo(...)
{
        enter();

        instrumentation_begin();        #2
        do_other_stuff();
        instrumentation_end();          #2

        exit();
}

enter()/exit() need to be noinstr functions as well, but as enter()
might be the function which actually establishes the context,
e.g. __enter_from_user_mode(), it can contain calls into instrumentable
code too:

noinstr enter(..)
{
        enter_context();

        instrumentation_begin();        #1
        do_instrumentable_enter();
        instrumentation_end();          #1
}

See? So you end up with two pairs of instrumentation_begin()/end() in
this simple example. Add exit() to the picture:

noinstr exit(..)
{
        instrumentation_begin();        #3
        do_instrumentable_exit();
        instrumentation_end();          #3

        exit_context();
}

So you have already three.

In reality it's five pairs per syscall and at least four pairs per
exception/interrupt.

As you can see above end() #1 and begin() #2 are identical from a
tracing POV. So are end() #2 and begin() #3.

IOW there is zero value to have tracepoints in them. Especially as
do_instrumentable_enter()/exit() and do_other_stuff() are subject to
tracing already unless the compiler decides to inline them.

The only relevant information is begin() #1 and end() #3, right?

And as you cannot piggy-pack that on instrumentation_begin()/end() for
obvious reasons, you need explicit tracepoints, which are in the very
end more informative and come with a factor of 4-5 less overhead in both
the enabled and disabled case.

Contrary to popular believe, disabled tracepoints are not completely
free of overhead either.

Thanks,

        tglx
kernel test robot March 23, 2023, 1:56 a.m. UTC | #10
Hi Steven,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on linus/master]
[also build test WARNING on v6.3-rc3 next-20230322]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Steven-Rostedt/tracing-Trace-instrumentation-begin-and-end/20230322-095354
patch link:    https://lore.kernel.org/r/20230321215121.71b339c5%40gandalf.local.home
patch subject: [PATCH] tracing: Trace instrumentation begin and end
config: x86_64-rhel-8.3 (https://download.01.org/0day-ci/archive/20230323/202303230955.doWocaU3-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.3.0-8) 11.3.0
reproduce (this is a W=1 build):
        # https://github.com/intel-lab-lkp/linux/commit/1f40755bb9b4817135459d6cf76fcbd17ffb53dd
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Steven-Rostedt/tracing-Trace-instrumentation-begin-and-end/20230322-095354
        git checkout 1f40755bb9b4817135459d6cf76fcbd17ffb53dd
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        make W=1 O=build_dir ARCH=x86_64 olddefconfig
        make W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Link: https://lore.kernel.org/oe-kbuild-all/202303230955.doWocaU3-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> vmlinux.o: warning: objtool: .text+0x2bb7f2: relocation to !ENDBR: .text+0x2bb7e9
   vmlinux.o: warning: objtool: .text+0x2bb803: relocation to !ENDBR: .text+0x2bb7e2


objdump-func vmlinux.o .text:
diff mbox series

Patch

diff --git a/include/linux/instrumentation.h b/include/linux/instrumentation.h
index bc7babe91b2e..f013acfc9192 100644
--- a/include/linux/instrumentation.h
+++ b/include/linux/instrumentation.h
@@ -2,6 +2,34 @@ 
 #ifndef __LINUX_INSTRUMENTATION_H
 #define __LINUX_INSTRUMENTATION_H
 
+#ifdef CONFIG_TRACING
+#include <linux/kernel.h>
+#include <linux/tracepoint-defs.h>
+
+void call_trace_instrumentation_begin(unsigned long ip, unsigned long pip);
+void call_trace_instrumentation_end(unsigned long ip, unsigned long pip);
+
+DECLARE_TRACEPOINT(instrumentation_begin);
+DECLARE_TRACEPOINT(instrumentation_end);
+
+static inline void do_trace_instrumentation_begin(unsigned long ip,
+						  unsigned long pip)
+{
+	if (tracepoint_enabled(instrumentation_begin))
+		call_trace_instrumentation_begin(ip, pip);
+}
+
+static inline void do_trace_instrumentation_end(unsigned long ip,
+						unsigned long pip)
+{
+	if (tracepoint_enabled(instrumentation_end))
+		call_trace_instrumentation_end(ip, pip);
+}
+#else /* !CONFIG_TRACING */
+static inline void do_trace_instrumentation_begin(unsigned long ip, unsigned long pip) { }
+static inline void do_trace_instrumentation_end(unsigned long ip, unsigned long pip) { }
+#endif /* CONFIG_TRACING */
+
 #ifdef CONFIG_NOINSTR_VALIDATION
 
 #include <linux/stringify.h>
@@ -12,6 +40,7 @@ 
 		     ".pushsection .discard.instr_begin\n\t"		\
 		     ".long " __stringify(c) "b - .\n\t"		\
 		     ".popsection\n\t" : : "i" (c));			\
+	do_trace_instrumentation_begin(_THIS_IP_, _RET_IP_);		\
 })
 #define instrumentation_begin() __instrumentation_begin(__COUNTER__)
 
@@ -47,6 +76,7 @@ 
  * part of the condition block and does not escape.
  */
 #define __instrumentation_end(c) ({					\
+	do_trace_instrumentation_end(_THIS_IP_, _RET_IP_);		\
 	asm volatile(__stringify(c) ": nop\n\t"				\
 		     ".pushsection .discard.instr_end\n\t"		\
 		     ".long " __stringify(c) "b - .\n\t"		\
@@ -54,8 +84,8 @@ 
 })
 #define instrumentation_end() __instrumentation_end(__COUNTER__)
 #else /* !CONFIG_NOINSTR_VALIDATION */
-# define instrumentation_begin()	do { } while(0)
-# define instrumentation_end()		do { } while(0)
+# define instrumentation_begin() do_trace_instrumentation_begin(_THIS_IP_, _RET_IP_);
+# define instrumentation_end()	 do_trace_instrumentation_end(_THIS_IP_, _RET_IP_);
 #endif /* CONFIG_NOINSTR_VALIDATION */
 
 #endif /* __LINUX_INSTRUMENTATION_H */
diff --git a/include/trace/events/instrumentation.h b/include/trace/events/instrumentation.h
new file mode 100644
index 000000000000..6b92a205d9ae
--- /dev/null
+++ b/include/trace/events/instrumentation.h
@@ -0,0 +1,40 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM instrumentation
+
+#if !defined(_TRACE_INSTRUMENTATION_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_INSTRUMENTATION_H
+
+#include <linux/tracepoint.h>
+
+DECLARE_EVENT_CLASS(instrumentation_template,
+
+	TP_PROTO(unsigned long ip, unsigned long pip),
+
+	TP_ARGS(ip, pip),
+
+	TP_STRUCT__entry(
+		__field( unsigned long,		ip )
+		__field( unsigned long,		pip )
+	),
+
+	TP_fast_assign(
+		__entry->ip = ip;
+		__entry->pip = pip;
+	),
+
+	TP_printk("%pS <- %pS", (void *)__entry->ip, (void *)__entry->pip)
+);
+
+DEFINE_EVENT(instrumentation_template, instrumentation_begin,
+	     TP_PROTO(unsigned long ip, unsigned long pip),
+	     TP_ARGS(ip, pip));
+
+DEFINE_EVENT(instrumentation_template, instrumentation_end,
+	     TP_PROTO(unsigned long ip, unsigned long pip),
+	     TP_ARGS(ip, pip));
+
+#endif /* _TRACE_INSTRUMENTATION_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 937e9676dfd4..071885824408 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -54,6 +54,26 @@ 
 #include "trace.h"
 #include "trace_output.h"
 
+/* Define instrumentation tracing here, as there is no real home for it */
+#define CREATE_TRACE_POINTS
+#include <trace/events/instrumentation.h>
+#undef CREATE_TRACE_POINTS
+
+void call_trace_instrumentation_begin(unsigned long ip, unsigned long pip)
+{
+	trace_instrumentation_begin(ip, pip);
+}
+EXPORT_SYMBOL_GPL(call_trace_instrumentation_begin);
+
+void call_trace_instrumentation_end(unsigned long ip, unsigned long pip)
+{
+	trace_instrumentation_end(ip, pip);
+}
+EXPORT_SYMBOL_GPL(call_trace_instrumentation_end);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(instrumentation_begin);
+EXPORT_TRACEPOINT_SYMBOL_GPL(instrumentation_end);
+
 /*
  * On boot up, the ring buffer is set to the minimum size, so that
  * we do not waste memory on systems that are not using tracing.