
[RFC,net-next] trace: tcp: Add tracepoint for tcp_cwnd_reduction()

Message ID 20250120-cwnd_tracepoint-v1-1-36b0e0d643fa@debian.org
State Handled Elsewhere
Series [RFC,net-next] trace: tcp: Add tracepoint for tcp_cwnd_reduction()

Commit Message

Breno Leitao Jan. 20, 2025, 12:02 p.m. UTC
Add a tracepoint to monitor TCP congestion window adjustments through the
tcp_cwnd_reduction() function. This tracepoint helps track:
  - TCP window size fluctuations
  - Active socket behavior
  - Congestion window reduction events

Meta has been using BPF programs to monitor this function for years. By
adding a proper tracepoint, we provide a stable API for all users who
need to monitor TCP congestion window behavior.

The tracepoint captures:
  - Socket source and destination IPs
  - Number of newly acknowledged packets
  - Number of newly lost packets
  - Packets in flight

Here is an example of the tracepoint output when viewed in the trace buffer:

tcp_cwnd_reduction: src=[2401:db00:3021:10e1:face:0:32a:0]:45904 dest=[2401:db00:3021:1fb:face:0:23:0]:5201 newly_lost=0 newly_acked_sacked=27 in_flight=34

CC: Yonghong Song <yonghong.song@linux.dev>
CC: Song Liu <song@kernel.org>
CC: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Breno Leitao <leitao@debian.org>
---
 include/trace/events/tcp.h | 34 ++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_input.c       |  2 ++
 2 files changed, 36 insertions(+)


---
base-commit: 96e12defe5a8fa3f3a10e3ef1d20fee503245a10
change-id: 20250120-cwnd_tracepoint-2e11c996a9cb

Best regards,

Comments

Jason Xing Jan. 20, 2025, 12:08 p.m. UTC | #1
On Mon, Jan 20, 2025 at 8:03 PM Breno Leitao <leitao@debian.org> wrote:
>
> Add a tracepoint to monitor TCP congestion window adjustments through the
> tcp_cwnd_reduction() function. This tracepoint helps track:
>   - TCP window size fluctuations
>   - Active socket behavior
>   - Congestion window reduction events
>
> Meta has been using BPF programs to monitor this function for years. By
> adding a proper tracepoint, we provide a stable API for all users who
> need to monitor TCP congestion window behavior.
>
> The tracepoint captures:
>   - Socket source and destination IPs
>   - Number of newly acknowledged packets
>   - Number of newly lost packets
>   - Packets in flight
>
> Here is an example of a tracepoint when viewed in the trace buffer:
>
> tcp_cwnd_reduction: src=[2401:db00:3021:10e1:face:0:32a:0]:45904 dest=[2401:db00:3021:1fb:face:0:23:0]:5201 newly_lost=0 newly_acked_sacked=27 in_flight=34
>
> CC: Yonghong Song <yonghong.song@linux.dev>
> CC: Song Liu <song@kernel.org>
> CC: Martin KaFai Lau <martin.lau@kernel.org>
> Signed-off-by: Breno Leitao <leitao@debian.org>
> ---
>  include/trace/events/tcp.h | 34 ++++++++++++++++++++++++++++++++++
>  net/ipv4/tcp_input.c       |  2 ++
>  2 files changed, 36 insertions(+)
>
> diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
> index a27c4b619dffd7dcc72fffa71bf0fd5e34fe6681..b3a636658b39721cca843c0000eaa573cf4d09d5 100644
> --- a/include/trace/events/tcp.h
> +++ b/include/trace/events/tcp.h
> @@ -259,6 +259,40 @@ TRACE_EVENT(tcp_retransmit_synack,
>                   __entry->saddr_v6, __entry->daddr_v6)
>  );
>
> +TRACE_EVENT(tcp_cwnd_reduction,
> +
> +       TP_PROTO(const struct sock *sk, const int newly_acked_sacked,
> +                const int newly_lost, const int flag),
> +
> +       TP_ARGS(sk, newly_acked_sacked, newly_lost, flag),
> +
> +       TP_STRUCT__entry(
> +               __array(__u8, saddr, sizeof(struct sockaddr_in6))
> +               __array(__u8, daddr, sizeof(struct sockaddr_in6))
> +               __field(int, in_flight)
> +
> +               __field(int, newly_acked_sacked)
> +               __field(int, newly_lost)
> +       ),
> +
> +       TP_fast_assign(
> +               const struct inet_sock *inet = inet_sk(sk);
> +               const struct tcp_sock *tp = tcp_sk(sk);
> +
> +               memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
> +               memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));
> +
> +               TP_STORE_ADDR_PORTS(__entry, inet, sk);
> +               __entry->in_flight = tcp_packets_in_flight(tp);
> +               __entry->newly_lost = newly_lost;
> +               __entry->newly_acked_sacked = newly_acked_sacked;
> +       ),
> +
> +       TP_printk("src=%pISpc dest=%pISpc newly_lost=%d newly_acked_sacked=%d in_flight=%d",
> +                 __entry->saddr, __entry->daddr, __entry->newly_lost,
> +                 __entry->newly_acked_sacked, __entry->in_flight)
> +);
> +
>  #include <trace/events/net_probe_common.h>
>
>  TRACE_EVENT(tcp_probe,
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 4811727b8a02258ec6fa1fd129beecf7cbb0f90e..fc88c511e81bc12ec57e8dc3e9185a920d1bd079 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -2710,6 +2710,8 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost,
>         if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
>                 return;
>
> +       trace_tcp_cwnd_reduction(sk, newly_acked_sacked, newly_lost, flag);
> +

Are there any other reasons for introducing a new tracepoint here?
AFAIK, it could easily be replaced by a BPF program or script that
monitors at the position shown above.

Thanks,
Jason
Breno Leitao Jan. 20, 2025, 1:02 p.m. UTC | #2
Hello Jason,

On Mon, Jan 20, 2025 at 08:08:52PM +0800, Jason Xing wrote:
> On Mon, Jan 20, 2025 at 8:03 PM Breno Leitao <leitao@debian.org> wrote:
> > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> > index 4811727b8a02258ec6fa1fd129beecf7cbb0f90e..fc88c511e81bc12ec57e8dc3e9185a920d1bd079 100644
> > --- a/net/ipv4/tcp_input.c
> > +++ b/net/ipv4/tcp_input.c
> > @@ -2710,6 +2710,8 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost,
> >         if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
> >                 return;
> >
> > +       trace_tcp_cwnd_reduction(sk, newly_acked_sacked, newly_lost, flag);
> > +
> 
> Are there any other reasons why introducing a new tracepoint here?
> AFAIK, it can be easily replaced by a bpf related program or script to
> monitor in the above position.

In which position exactly?

Thank you,
--breno
Jason Xing Jan. 20, 2025, 1:06 p.m. UTC | #3
On Mon, Jan 20, 2025 at 9:02 PM Breno Leitao <leitao@debian.org> wrote:
>
> Hello Jason,
>
> On Mon, Jan 20, 2025 at 08:08:52PM +0800, Jason Xing wrote:
> > On Mon, Jan 20, 2025 at 8:03 PM Breno Leitao <leitao@debian.org> wrote:
> > > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> > > index 4811727b8a02258ec6fa1fd129beecf7cbb0f90e..fc88c511e81bc12ec57e8dc3e9185a920d1bd079 100644
> > > --- a/net/ipv4/tcp_input.c
> > > +++ b/net/ipv4/tcp_input.c
> > > @@ -2710,6 +2710,8 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost,
> > >         if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
> > >                 return;
> > >
> > > +       trace_tcp_cwnd_reduction(sk, newly_acked_sacked, newly_lost, flag);
> > > +
> >
> > Are there any other reasons why introducing a new tracepoint here?
> > AFAIK, it can be easily replaced by a bpf related program or script to
> > monitor in the above position.
>
> In which position exactly?

I meant the position where you insert the one-line tracepoint, which
could easily be replaced with a BPF program (a kprobe on
tcp_cwnd_reduction with two checks like those in the earlier
if-statement). That doesn't mean I object to this new tracepoint; I'm
just curious whether you have other motivations.

Thanks,
Jason
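
For illustration, the kprobe-based approach Jason describes might look
roughly like the following libbpf CO-RE sketch. This is a hypothetical
example, not code from the thread: the argument list mirrors the current
tcp_cwnd_reduction() prototype and the two early-return checks quoted
above, and the hook breaks if the function is renamed or inlined.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

/* Hypothetical kprobe-based monitor; fragile because it depends on the
 * exact symbol and signature of tcp_cwnd_reduction().
 */
SEC("kprobe/tcp_cwnd_reduction")
int BPF_KPROBE(monitor_cwnd_reduction, struct sock *sk,
	       int newly_acked_sacked, int newly_lost, int flag)
{
	struct tcp_sock *tp = (struct tcp_sock *)sk;
	u32 prior_cwnd;

	/* Mirror the two checks from the if-statement in the kernel code. */
	prior_cwnd = BPF_CORE_READ(tp, prior_cwnd);
	if (newly_acked_sacked <= 0 || !prior_cwnd)
		return 0;

	bpf_printk("cwnd reduction: acked_sacked=%d lost=%d",
		   newly_acked_sacked, newly_lost);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";
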
Breno Leitao Jan. 20, 2025, 1:20 p.m. UTC | #4
On Mon, Jan 20, 2025 at 09:06:43PM +0800, Jason Xing wrote:
> On Mon, Jan 20, 2025 at 9:02 PM Breno Leitao <leitao@debian.org> wrote:
> > On Mon, Jan 20, 2025 at 08:08:52PM +0800, Jason Xing wrote:
> > > On Mon, Jan 20, 2025 at 8:03 PM Breno Leitao <leitao@debian.org> wrote:
> > > > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> > > > index 4811727b8a02258ec6fa1fd129beecf7cbb0f90e..fc88c511e81bc12ec57e8dc3e9185a920d1bd079 100644
> > > > --- a/net/ipv4/tcp_input.c
> > > > +++ b/net/ipv4/tcp_input.c
> > > > @@ -2710,6 +2710,8 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost,
> > > >         if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
> > > >                 return;
> > > >
> > > > +       trace_tcp_cwnd_reduction(sk, newly_acked_sacked, newly_lost, flag);
> > > > +
> > >
> > > Are there any other reasons why introducing a new tracepoint here?
> > > AFAIK, it can be easily replaced by a bpf related program or script to
> > > monitor in the above position.
> >
> > In which position exactly?
> 
> I meant, in the position where you insert a one-line tracepoint, which
> should be easily replaced with a bpf program (kprobe
> tcp_cwnd_reduction with two checks like in the earlier if-statement).
> It doesn't mean that I object to this new tracepoint, just curious if
> you have other motivations.

This is exactly the current implementation we have at Meta: it relies
on hooking into this specific function. That approach is unstable,
because compiler optimizations such as inlining can break it.

This patch improves stability by introducing a guaranteed hook point,
allowing the compiler to change the code without disrupting the BPF
program's functionality.
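
As a point of comparison, with the trace event from this RFC in place,
the same monitoring could attach to the stable hook rather than to the
raw symbol. The following is a minimal, hypothetical sketch assuming the
tcp_cwnd_reduction trace event added by the patch; it uses a BTF-enabled
raw tracepoint (tp_btf) program, and the program name is illustrative.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* Hypothetical monitor attached to the proposed tracepoint; it keeps
 * working across renames or inlining of tcp_cwnd_reduction() because
 * the tracepoint, not the function symbol, is the contract.
 */
SEC("tp_btf/tcp_cwnd_reduction")
int BPF_PROG(on_cwnd_reduction, const struct sock *sk,
	     int newly_acked_sacked, int newly_lost, int flag)
{
	bpf_printk("cwnd reduction: acked_sacked=%d lost=%d",
		   newly_acked_sacked, newly_lost);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";
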
Steven Rostedt Jan. 20, 2025, 3:03 p.m. UTC | #5
On Mon, 20 Jan 2025 05:20:05 -0800
Breno Leitao <leitao@debian.org> wrote:

> This patch enhances the API's stability by introducing a guaranteed hook
> point, allowing the compiler to make changes without disrupting the
> BPF program's functionality.

Instead of using a TRACE_EVENT() macro, you can use DECLARE_TRACE(),
which will create the tracepoint in the kernel but will not create a
trace event that is exported to the tracefs file system. Then BPF could
hook to it and it will still not be exposed as a user-space API.

You can see its use in include/trace/events/sched.h

-- Steve
Jason Xing Jan. 21, 2025, 1:15 a.m. UTC | #6
On Mon, Jan 20, 2025 at 9:20 PM Breno Leitao <leitao@debian.org> wrote:
>
>
> On Mon, Jan 20, 2025 at 09:06:43PM +0800, Jason Xing wrote:
> > On Mon, Jan 20, 2025 at 9:02 PM Breno Leitao <leitao@debian.org> wrote:
> > > On Mon, Jan 20, 2025 at 08:08:52PM +0800, Jason Xing wrote:
> > > > On Mon, Jan 20, 2025 at 8:03 PM Breno Leitao <leitao@debian.org> wrote:
> > > > > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> > > > > index 4811727b8a02258ec6fa1fd129beecf7cbb0f90e..fc88c511e81bc12ec57e8dc3e9185a920d1bd079 100644
> > > > > --- a/net/ipv4/tcp_input.c
> > > > > +++ b/net/ipv4/tcp_input.c
> > > > > @@ -2710,6 +2710,8 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost,
> > > > >         if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
> > > > >                 return;
> > > > >
> > > > > +       trace_tcp_cwnd_reduction(sk, newly_acked_sacked, newly_lost, flag);
> > > > > +
> > > >
> > > > Are there any other reasons why introducing a new tracepoint here?
> > > > AFAIK, it can be easily replaced by a bpf related program or script to
> > > > monitor in the above position.
> > >
> > > In which position exactly?
> >
> > I meant, in the position where you insert a one-line tracepoint, which
> > should be easily replaced with a bpf program (kprobe
> > tcp_cwnd_reduction with two checks like in the earlier if-statement).
> > It doesn't mean that I object to this new tracepoint, just curious if
> > you have other motivations.
>
> This is exactly the current implementation we have at Meta, as it relies on
> hooking into this specific function. This approach is unstable, as
> compiler optimizations like inlining can break the functionality.
>
> This patch enhances the API's stability by introducing a guaranteed hook
> point, allowing the compiler to make changes without disrupting the
> BPF program's functionality.

Surely it does :) The reason I asked is that, about a year ago, I
tried to add many tracepoints so that user-space monitors could take
advantage of various stable hook points. I believe there are many other
places that might get inlined by gcc (we have received a few similar
reports in recent months), but there is no guarantee that functions
which monitoring applications obviously depend on will not be touched.

I'm wondering whether, beyond this new tracepoint, we can design a
common approach for monitoring programs, so that people who are
interested can work on this together. Profiling is becoming more and
more important nowadays, from my point of view.

Thanks,
Jason
Jason Xing Jan. 21, 2025, 1:22 a.m. UTC | #7
On Tue, Jan 21, 2025 at 9:15 AM Jason Xing <kerneljasonxing@gmail.com> wrote:
>
> On Mon, Jan 20, 2025 at 9:20 PM Breno Leitao <leitao@debian.org> wrote:
> >
> >
> > On Mon, Jan 20, 2025 at 09:06:43PM +0800, Jason Xing wrote:
> > > On Mon, Jan 20, 2025 at 9:02 PM Breno Leitao <leitao@debian.org> wrote:
> > > > On Mon, Jan 20, 2025 at 08:08:52PM +0800, Jason Xing wrote:
> > > > > On Mon, Jan 20, 2025 at 8:03 PM Breno Leitao <leitao@debian.org> wrote:
> > > > > > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> > > > > > index 4811727b8a02258ec6fa1fd129beecf7cbb0f90e..fc88c511e81bc12ec57e8dc3e9185a920d1bd079 100644
> > > > > > --- a/net/ipv4/tcp_input.c
> > > > > > +++ b/net/ipv4/tcp_input.c
> > > > > > @@ -2710,6 +2710,8 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost,
> > > > > >         if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
> > > > > >                 return;
> > > > > >
> > > > > > +       trace_tcp_cwnd_reduction(sk, newly_acked_sacked, newly_lost, flag);
> > > > > > +
> > > > >
> > > > > Are there any other reasons why introducing a new tracepoint here?
> > > > > AFAIK, it can be easily replaced by a bpf related program or script to
> > > > > monitor in the above position.
> > > >
> > > > In which position exactly?
> > >
> > > I meant, in the position where you insert a one-line tracepoint, which
> > > should be easily replaced with a bpf program (kprobe
> > > tcp_cwnd_reduction with two checks like in the earlier if-statement).
> > > It doesn't mean that I object to this new tracepoint, just curious if
> > > you have other motivations.
> >
> > This is exactly the current implementation we have at Meta, as it relies on
> > hooking into this specific function. This approach is unstable, as
> > compiler optimizations like inlining can break the functionality.
> >
> > This patch enhances the API's stability by introducing a guaranteed hook
> > point, allowing the compiler to make changes without disrupting the
> > BPF program's functionality.
>
> Surely it does :) The reason why I asked is that perhaps one year ago
> I'm trying to add many tracepoints so that the user space monitor can
> take advantage of these various stable hook points. I believe that
> there are many other places which might be inlined because of gcc,
> receiving a few similar reports in recent months, but there is no
> guarantee to not touch some functions which obviously break some
> monitor applications.

Before BPF became widely used, changing function names did not hurt
applications. Things are a little different now: renaming or inlining a
function forces monitoring applications to adjust their code from
version to version, because these are not stable-ABI (kABI) functions.

Thanks,
Jason
Breno Leitao Jan. 22, 2025, 9:39 a.m. UTC | #8
Hello Steven,

On Mon, Jan 20, 2025 at 10:03:40AM -0500, Steven Rostedt wrote:
> On Mon, 20 Jan 2025 05:20:05 -0800
> Breno Leitao <leitao@debian.org> wrote:
> 
> > This patch enhances the API's stability by introducing a guaranteed hook
> > point, allowing the compiler to make changes without disrupting the
> > BPF program's functionality.
> 
> Instead of using a TRACE_EVENT() macro, you can use DECLARE_TRACE()
> which will create the tracepoint in the kernel, but will not create a
> trace event that is exported to the tracefs file system. Then BPF could
> hook to it and it will still not be exposed as an user space API.

Right, DECLARE_TRACE would solve my current problem, but a056a5bed7fa
("sched/debug: Export the newly added tracepoints") says "BPF doesn't
have infrastructure to access these bare tracepoints either."

Does BPF know how to attach to these bare tracepoints now?

On the other hand, it seems real tracepoints are becoming more
pervasive, so this current approach might be OK as well?

	https://lore.kernel.org/bpf/20250118033723.GV1977892@ZenIV/T/#m4c2fb2d904e839b34800daf8578dff0b9abd69a0

> You can see its use in include/trace/events/sched.h

I suppose I need to export the tracepoint with
EXPORT_TRACEPOINT_SYMBOL_GPL(), right?

I am trying to hack something like the following, but I struggled to
hook BPF into it.

Thank you!
--breno

Author: Breno Leitao <leitao@debian.org>
Date:   Fri Jan 17 09:26:22 2025 -0800

    trace: tcp: Add tracepoint for tcp_cwnd_reduction()
    
    Add a lightweight tracepoint to monitor TCP congestion window
    adjustments via tcp_cwnd_reduction(). This tracepoint enables tracking
    of:
      - TCP window size fluctuations
      - Active socket behavior
      - Congestion window reduction events
    
    Meta has been using BPF programs to monitor this function for years.
    Adding a proper tracepoint provides a stable API for all users who need
    to monitor TCP congestion window behavior.
    
    Use DECLARE_TRACE instead of TRACE_EVENT to avoid creating trace event
    infrastructure and exporting to tracefs, keeping the implementation
    minimal. (Thanks Steven Rostedt)

    Signed-off-by: Breno Leitao <leitao@debian.org>

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index a27c4b619dffd..07add3e20931a 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -259,6 +259,11 @@ TRACE_EVENT(tcp_retransmit_synack,
 		  __entry->saddr_v6, __entry->daddr_v6)
 );
 
+DECLARE_TRACE(tcp_cwnd_reduction_tp,
+	TP_PROTO(const struct sock *sk, const int newly_acked_sacked,
+		 const int newly_lost, const int flag),
+	TP_ARGS(sk, newly_acked_sacked, newly_lost, flag));
+
 #include <trace/events/net_probe_common.h>
 
 TRACE_EVENT(tcp_probe,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4811727b8a022..74cf8dbbedaa0 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2710,6 +2710,8 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost,
 	if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
 		return;
 
+	trace_tcp_cwnd_reduction_tp(sk, newly_acked_sacked, newly_lost, flag);
+
 	tp->prr_delivered += newly_acked_sacked;
 	if (delta < 0) {
 		u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
@@ -2726,6 +2728,7 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost,
 	sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1));
 	tcp_snd_cwnd_set(tp, tcp_packets_in_flight(tp) + sndcnt);
 }
+EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_cwnd_reduction_tp);
 
 static inline void tcp_end_cwnd_reduction(struct sock *sk)
 {
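
For reference, a BPF program could hook the bare tracepoint above with a
raw tracepoint program. The sketch below is hypothetical and assumes that
raw_tp attachment works for DECLARE_TRACE-only tracepoints (Yonghong's
sched_switch_hack experiment later in the thread suggests it does); the
program name is illustrative.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* Hypothetical consumer of the bare tcp_cwnd_reduction_tp tracepoint
 * added by the DECLARE_TRACE() hack above. Only the integer arguments
 * are used here, so no kernel memory needs to be dereferenced.
 */
SEC("raw_tp/tcp_cwnd_reduction_tp")
int BPF_PROG(handle_cwnd_reduction_tp, const struct sock *sk,
	     int newly_acked_sacked, int newly_lost, int flag)
{
	bpf_printk("cwnd reduction: acked_sacked=%d lost=%d",
		   newly_acked_sacked, newly_lost);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";
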
Steven Rostedt Jan. 22, 2025, 2:56 p.m. UTC | #9
On Wed, 22 Jan 2025 01:39:42 -0800
Breno Leitao <leitao@debian.org> wrote:

> Right, DECLARE_TRACE would solve my current problem, but, a056a5bed7fa
> ("sched/debug: Export the newly added tracepoints") says "BPF doesn't
> have infrastructure to access these bare tracepoints either.".
> 
> Does BPF know how to attach to this bare tracepointers now?
> 
> On the other side, it seems real tracepoints is getting more pervasive?
> So, this current approach might be OK also?
> 
> 	https://lore.kernel.org/bpf/20250118033723.GV1977892@ZenIV/T/#m4c2fb2d904e839b34800daf8578dff0b9abd69a0

Thanks for the pointer. I didn't know this discussion was going on. I just
asked to attend if this gets accepted. I'm only a 6 hour drive from
Montreal anyway.

> 
> > You can see its use in include/trace/events/sched.h  
> 
> I suppose I need to export the tracepointer with
> EXPORT_TRACEPOINT_SYMBOL_GPL(), right?

For modules to use them directly, yes. But there's other ways too.

> 
> I am trying to hack something as the following, but, I struggled to hook
> BPF into it.

Maybe you can use the iterator to search for the tracepoint.

#include <linux/tracepoint.h>

static void fct(struct tracepoint *tp, void *priv)
{
	if (!tp->name || strcmp(tp->name, "<tracepoint_name>") != 0)
		return;

	// attach to tracepoint tp
}

[..]
	for_each_kernel_tracepoint(fct, NULL);

This is how LTTng hooks to tracepoints.

-- Steve
Yonghong Song Jan. 22, 2025, 6:40 p.m. UTC | #10
On 1/22/25 1:39 AM, Breno Leitao wrote:
> Hello Steven,
>
> On Mon, Jan 20, 2025 at 10:03:40AM -0500, Steven Rostedt wrote:
>> On Mon, 20 Jan 2025 05:20:05 -0800
>> Breno Leitao <leitao@debian.org> wrote:
>>
>>> This patch enhances the API's stability by introducing a guaranteed hook
>>> point, allowing the compiler to make changes without disrupting the
>>> BPF program's functionality.
>> Instead of using a TRACE_EVENT() macro, you can use DECLARE_TRACE()
>> which will create the tracepoint in the kernel, but will not create a
>> trace event that is exported to the tracefs file system. Then BPF could
>> hook to it and it will still not be exposed as an user space API.
> Right, DECLARE_TRACE would solve my current problem, but, a056a5bed7fa
> ("sched/debug: Export the newly added tracepoints") says "BPF doesn't
> have infrastructure to access these bare tracepoints either.".
>
> Does BPF know how to attach to this bare tracepointers now?
>
> On the other side, it seems real tracepoints is getting more pervasive?
> So, this current approach might be OK also?
>
> 	https://lore.kernel.org/bpf/20250118033723.GV1977892@ZenIV/T/#m4c2fb2d904e839b34800daf8578dff0b9abd69a0
>
>> You can see its use in include/trace/events/sched.h
> I suppose I need to export the tracepointer with
> EXPORT_TRACEPOINT_SYMBOL_GPL(), right?
>
> I am trying to hack something as the following, but, I struggled to hook
> BPF into it.
>
> Thank you!
> --breno
>
> Author: Breno Leitao <leitao@debian.org>
> Date:   Fri Jan 17 09:26:22 2025 -0800
>
>      trace: tcp: Add tracepoint for tcp_cwnd_reduction()
>      
>      Add a lightweight tracepoint to monitor TCP congestion window
>      adjustments via tcp_cwnd_reduction(). This tracepoint enables tracking
>      of:
>        - TCP window size fluctuations
>        - Active socket behavior
>        - Congestion window reduction events
>      
>      Meta has been using BPF programs to monitor this function for years.
>      Adding a proper tracepoint provides a stable API for all users who need
>      to monitor TCP congestion window behavior.
>      
>      Use DECLARE_TRACE instead of TRACE_EVENT to avoid creating trace event
>      infrastructure and exporting to tracefs, keeping the implementation
>      minimal. (Thanks Steven Rostedt)
>
>      Signed-off-by: Breno Leitao <leitao@debian.org>
>
> diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
> index a27c4b619dffd..07add3e20931a 100644
> --- a/include/trace/events/tcp.h
> +++ b/include/trace/events/tcp.h
> @@ -259,6 +259,11 @@ TRACE_EVENT(tcp_retransmit_synack,
>   		  __entry->saddr_v6, __entry->daddr_v6)
>   );
>   
> +DECLARE_TRACE(tcp_cwnd_reduction_tp,
> +	TP_PROTO(const struct sock *sk, const int newly_acked_sacked,
> +		 const int newly_lost, const int flag),

I don't think we need 'const' for int types. For 'const struct sock *',
it makes sense, since we do not want sk-><fields> to get changed.

> +	TP_ARGS(sk, newly_acked_sacked, newly_lost, flag));
> +
>   #include <trace/events/net_probe_common.h>
>   
>   TRACE_EVENT(tcp_probe,
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 4811727b8a022..74cf8dbbedaa0 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -2710,6 +2710,8 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost,
>   	if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
>   		return;
>   
> +	trace_tcp_cwnd_reduction_tp(sk, newly_acked_sacked, newly_lost, flag);
> +
>   	tp->prr_delivered += newly_acked_sacked;
>   	if (delta < 0) {
>   		u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
> @@ -2726,6 +2728,7 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost,
>   	sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1));
>   	tcp_snd_cwnd_set(tp, tcp_packets_in_flight(tp) + sndcnt);
>   }
> +EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_cwnd_reduction_tp);
>   
>   static inline void tcp_end_cwnd_reduction(struct sock *sk)
>   {
Yonghong Song Jan. 22, 2025, 7:02 p.m. UTC | #11
On 1/22/25 6:56 AM, Steven Rostedt wrote:
> On Wed, 22 Jan 2025 01:39:42 -0800
> Breno Leitao <leitao@debian.org> wrote:
>
>> Right, DECLARE_TRACE would solve my current problem, but, a056a5bed7fa
>> ("sched/debug: Export the newly added tracepoints") says "BPF doesn't
>> have infrastructure to access these bare tracepoints either.".
>>
>> Does BPF know how to attach to this bare tracepointers now?
>>
>> On the other side, it seems real tracepoints is getting more pervasive?
>> So, this current approach might be OK also?
>>
>> 	https://lore.kernel.org/bpf/20250118033723.GV1977892@ZenIV/T/#m4c2fb2d904e839b34800daf8578dff0b9abd69a0
> Thanks for the pointer. I didn't know this discussion was going on. I just
> asked to attend if this gets accepted. I'm only a 6 hour drive from
> Montreal anyway.
>
>>> You can see its use in include/trace/events/sched.h
>> I suppose I need to export the tracepointer with
>> EXPORT_TRACEPOINT_SYMBOL_GPL(), right?
> For modules to use them directly, yes. But there's other ways too.
>
>> I am trying to hack something as the following, but, I struggled to hook
>> BPF into it.
> Maybe you can use the iterator to search for the tracepoint.
>
> #include <linux/tracepoint.h>
>
> static void fct(struct tracepoint *tp, void *priv)
> {
> 	if (!tp->name || strcmp(tp->name, "<tracepoint_name>") != 0)
> 		return 0;
>
> 	// attach to tracepoint tp
> }
>
> [..]
> 	for_each_kernel_tracepoint(fct, NULL);
>
> This is how LTTng hooks to tracepoints.

The LTTng approach above needs a kernel module to enable and disable
the tracepoint, and that is not a bpf way to handle tracepoints.

So for bpf, we need a new UAPI to pass <tracepoint_name> from user
space to the kernel to attach to tracepoint tp, since <tracepoint_name>
is not available in tracefs.

What are the criteria for a tracepoint to be a normal tp or a bare tp?


>
> -- Steve
>
Yonghong Song Jan. 24, 2025, 4:40 a.m. UTC | #12
On 1/22/25 6:56 AM, Steven Rostedt wrote:
> On Wed, 22 Jan 2025 01:39:42 -0800
> Breno Leitao <leitao@debian.org> wrote:
>
>> Right, DECLARE_TRACE would solve my current problem, but, a056a5bed7fa
>> ("sched/debug: Export the newly added tracepoints") says "BPF doesn't
>> have infrastructure to access these bare tracepoints either.".
>>
>> Does BPF know how to attach to this bare tracepointers now?
>>
>> On the other side, it seems real tracepoints is getting more pervasive?
>> So, this current approach might be OK also?
>>
>> 	https://lore.kernel.org/bpf/20250118033723.GV1977892@ZenIV/T/#m4c2fb2d904e839b34800daf8578dff0b9abd69a0
> Thanks for the pointer. I didn't know this discussion was going on. I just
> asked to attend if this gets accepted. I'm only a 6 hour drive from
> Montreal anyway.
>
>>> You can see its use in include/trace/events/sched.h
>> I suppose I need to export the tracepointer with
>> EXPORT_TRACEPOINT_SYMBOL_GPL(), right?
> For modules to use them directly, yes. But there's other ways too.
>
>> I am trying to hack something as the following, but, I struggled to hook
>> BPF into it.
> Maybe you can use the iterator to search for the tracepoint.
>
> #include <linux/tracepoint.h>
>
> static void fct(struct tracepoint *tp, void *priv)
> {
> 	if (!tp->name || strcmp(tp->name, "<tracepoint_name>") != 0)
> 		return 0;
>
> 	// attach to tracepoint tp
> }
>
> [..]
> 	for_each_kernel_tracepoint(fct, NULL);
>
> This is how LTTng hooks to tracepoints.

Hi, Steve,

I did some prototyping to support a bpf dynamic_events tracepoint. I fixed
a couple of issues, but it is still not working. See the hacked patch:

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 9ea4c404bd4e..729ea1c21c94 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -824,6 +824,15 @@ DECLARE_TRACE(sched_compute_energy_tp,
                  unsigned long max_util, unsigned long busy_time),
         TP_ARGS(p, dst_cpu, energy, max_util, busy_time));
                                                                                                                                           
+/* At /sys/kernel/debug/tracing directory, do
+ *   echo 't sched_switch_dynamic' >> dynamic_events
+ * before actually use this tracepoint. The tracepoint will be at
+ *   /sys/kernel/debug/tracing/events/tracepoints/sched_switch_dynamic/
+ */
+DECLARE_TRACE(sched_switch_dynamic,
+       TP_PROTO(bool preempt),
+       TP_ARGS(preempt));
+
  #endif /* _TRACE_SCHED_H */
                                                                                                                                           
  /* This part must be outside protection */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 065f9188b44a..37391eb5089f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -10749,6 +10749,7 @@ static inline bool perf_event_is_tracing(struct perf_event *event)
  int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
                             u64 bpf_cookie)
  {
+       u32 dyn_tp_flags = TRACE_EVENT_FL_DYNAMIC | TRACE_EVENT_FL_FPROBE;
         bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp;
   
         if (!perf_event_is_tracing(event))
@@ -10756,7 +10757,9 @@ int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
   
         is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_KPROBE;
         is_uprobe = event->tp_event->flags & TRACE_EVENT_FL_UPROBE;
-       is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
+       is_tracepoint = (event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT) ||
+                       ((event->tp_event->flags & dyn_tp_flags) == dyn_tp_flags);
+
         is_syscall_tp = is_syscall_trace_event(event->tp_event);
         if (!is_kprobe && !is_uprobe && !is_tracepoint && !is_syscall_tp)
                 /* bpf programs can only be attached to u/kprobe or tracepoint */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3e5a6bf587f9..53b3d9e20d00 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6750,6 +6750,7 @@ static void __sched notrace __schedule(int sched_mode)
                 psi_account_irqtime(rq, prev, next);
                 psi_sched_switch(prev, next, block);
   
+               trace_sched_switch_dynamic(preempt);
                 trace_sched_switch(preempt, prev, next, prev_state);
   
                 /* Also unlocks the rq: */
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 71c1c02ca7a3..8f9fd2f347ef 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2448,7 +2448,8 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
                             u64 *probe_offset, u64 *probe_addr,
                             unsigned long *missed)
  {
-       bool is_tracepoint, is_syscall_tp;
+       u32 dyn_tp_flags = TRACE_EVENT_FL_DYNAMIC | TRACE_EVENT_FL_FPROBE;
+       bool is_tracepoint, is_dyn_tracepoint, is_syscall_tp;
         struct bpf_prog *prog;
         int flags, err = 0;
   
@@ -2463,9 +2464,10 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
         *prog_id = prog->aux->id;
         flags = event->tp_event->flags;
         is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT;
+       is_dyn_tracepoint = (event->tp_event->flags & dyn_tp_flags) == dyn_tp_flags;
         is_syscall_tp = is_syscall_trace_event(event->tp_event);
   
-       if (is_tracepoint || is_syscall_tp) {
+       if (is_tracepoint || is_dyn_tracepoint || is_syscall_tp) {
                 *buf = is_tracepoint ? event->tp_event->tp->name
                                      : event->tp_event->name;
                 /* We allow NULL pointer for tracepoint */
diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c
index c62d1629cffe..bacc4a1f5f20 100644
--- a/kernel/trace/trace_fprobe.c
+++ b/kernel/trace/trace_fprobe.c
@@ -436,8 +436,10 @@ static struct trace_fprobe *find_trace_fprobe(const char *event,
   
  static inline int __enable_trace_fprobe(struct trace_fprobe *tf)
  {
-       if (trace_fprobe_is_registered(tf))
+       if (trace_fprobe_is_registered(tf)) {
+               pr_warn("fprobe is enabled\n");
                 enable_fprobe(&tf->fp);
+       }
   
         return 0;
  }
diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c
index 1702aa592c2c..423770aa581e 100644
--- a/tools/testing/selftests/bpf/prog_tests/send_signal.c
+++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c
@@ -196,6 +196,8 @@ static void test_send_signal_common(struct perf_event_attr *attr,
         /* notify child safe to exit */
         ASSERT_EQ(write(pipe_p2c[1], buf, 1), 1, "pipe_write");
   
+       ASSERT_EQ(skel->bss->dyn_tp_visited, 1, "dyn_tp_visited");
+
  disable_pmu:
         close(pmu_fd);
  destroy_skel:
@@ -260,6 +262,8 @@ void test_send_signal(void)
  {
         if (test__start_subtest("send_signal_tracepoint"))
                 test_send_signal_tracepoint(false, false);
+/* Disable all other subtests except above send_signal_tracepoint. */
+if (0) {
         if (test__start_subtest("send_signal_perf"))
                 test_send_signal_perf(false, false);
         if (test__start_subtest("send_signal_nmi"))
@@ -285,3 +289,4 @@ void test_send_signal(void)
         if (test__start_subtest("send_signal_nmi_thread_remote"))
                 test_send_signal_nmi(true, true);
  }
+}
diff --git a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
index 176a355e3062..9b580d437046 100644
--- a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
+++ b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
@@ -60,6 +60,20 @@ int send_signal_tp_sched(void *ctx)
         return bpf_send_signal_test(ctx);
  }
   
+int dyn_tp_visited = 0;
+#if 1
+/* This will fail */
+SEC("tracepoint/tracepoints/sched_switch_dynamic")
+#else
+/* This will succeed */
+SEC("tracepoint/sched/sched_switch")
+#endif
+int send_signal_dyn_tp_sched(void *ctx)
+{
+       dyn_tp_visited = 1;
+       return 0;
+}
+
  SEC("perf_event")
  int send_signal_perf(void *ctx)
  {

To test the above, build the latest bpf-next, apply the above change,
boot the updated kernel in a qemu VM, and run the bpf selftest
   ./test_progs -t send_signal

With tracepoint tracepoint/sched/sched_switch, the test result is correct.
With tracepoint tracepoint/tracepoints/sched_switch_dynamic, the test failed.
The test failure means the sched_switch_dynamic tracepoint is not
triggered by the bpf program.

As expected, do
   echo 1 > /sys/kernel/debug/tracing/events/tracepoints/sched_switch_dynamic/enable
and the sched_switch_dynamic tracepoint works as expected (trace_pipe can dump
the expected result).

For both the 'echo 1 > .../sched_switch_dynamic/enable' approach and the
bpf tracepoint tracepoint/tracepoints/sched_switch_dynamic, the message
   fprobe is enabled
is printed in dmesg. The following is the enable_fprobe() code.

/**
  * enable_fprobe() - Enable fprobe
  * @fp: The fprobe to be enabled.
  *
  * This will soft-enable @fp.
  */
static inline void enable_fprobe(struct fprobe *fp)
{
         if (fp)
                 fp->flags &= ~FPROBE_FL_DISABLED;
}

Note that in the above the fprobe/dynamic_events is only soft-enabled.
Maybe the bpf tracepoint/tracepoints/sched_switch_dynamic case only
gets the soft-enable, while the 'echo 1 > ...' approach does both the
soft-enable and the actual hard-enable (at the pmu level)? If that is
the case, what is missing for the hard-enable in the bpf dynamic_events
case? Do you have any suggestions?


The above prototype tries to reuse the existing infra/API.
If you have a better way to support dynamic_events, please let me know.

Thanks,

Yonghong

>
> -- Steve
>
Steven Rostedt Jan. 24, 2025, 3:50 p.m. UTC | #13
On Thu, 23 Jan 2025 20:40:20 -0800
Yonghong Song <yonghong.song@linux.dev> wrote:

> Hi, Steve,
> 
> I did some prototype to support bpf dynamic_events tracepoint. I fixed a couple of issues

Before I go further, I want to make sure that we have our terminology
in line. A dynamic event is one that is created at run time. For example, a
kprobe event is a dynamic event. Other dynamic events are synthetic events,
event probes, and uprobes.

> but still not working. See the hacked patch:
> 
> diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
> index 9ea4c404bd4e..729ea1c21c94 100644
> --- a/include/trace/events/sched.h
> +++ b/include/trace/events/sched.h
> @@ -824,6 +824,15 @@ DECLARE_TRACE(sched_compute_energy_tp,
>                   unsigned long max_util, unsigned long busy_time),
>          TP_ARGS(p, dst_cpu, energy, max_util, busy_time));
>                                                                                                                                            
> +/* At /sys/kernel/debug/tracing directory, do
> + *   echo 't sched_switch_dynamic' >> dynamic_events
> + * before actually use this tracepoint. The tracepoint will be at
> + *   /sys/kernel/debug/tracing/events/tracepoints/sched_switch_dynamic/
> + */
> +DECLARE_TRACE(sched_switch_dynamic,
> +       TP_PROTO(bool preempt),
> +       TP_ARGS(preempt));

The above is just creating a tracepoint; no "event" is attached to it.

Tracepoints (defined in include/linux/tracepoint.h) are what create the
hooks within the kernel: the trace_##call() functions, like trace_sched_switch().
A trace event is something that can be attached to a tracepoint and shows up
in the tracefs file system. The TRACE_EVENT() macro will create both a
tracepoint and a trace event that attaches to the tracepoint it created.

> +
>   #endif /* _TRACE_SCHED_H */
>                                                                                                                                            
>   /* This part must be outside protection */
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 065f9188b44a..37391eb5089f 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -10749,6 +10749,7 @@ static inline bool perf_event_is_tracing(struct perf_event *event)
>   int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
>                              u64 bpf_cookie)
>   {
> +       u32 dyn_tp_flags = TRACE_EVENT_FL_DYNAMIC | TRACE_EVENT_FL_FPROBE;
>          bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp;
>    
>          if (!perf_event_is_tracing(event))
> @@ -10756,7 +10757,9 @@ int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
>    
>          is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_KPROBE;
>          is_uprobe = event->tp_event->flags & TRACE_EVENT_FL_UPROBE;
> -       is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
> +       is_tracepoint = (event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT) ||
> +                       ((event->tp_event->flags & dyn_tp_flags) == dyn_tp_flags);
> +
>          is_syscall_tp = is_syscall_trace_event(event->tp_event);
>          if (!is_kprobe && !is_uprobe && !is_tracepoint && !is_syscall_tp)
>                  /* bpf programs can only be attached to u/kprobe or tracepoint */
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 3e5a6bf587f9..53b3d9e20d00 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -6750,6 +6750,7 @@ static void __sched notrace __schedule(int sched_mode)
>                  psi_account_irqtime(rq, prev, next);
>                  psi_sched_switch(prev, next, block);
>    
> +               trace_sched_switch_dynamic(preempt);
>                  trace_sched_switch(preempt, prev, next, prev_state);
>    
>                  /* Also unlocks the rq: */
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 71c1c02ca7a3..8f9fd2f347ef 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -2448,7 +2448,8 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
>                              u64 *probe_offset, u64 *probe_addr,
>                              unsigned long *missed)
>   {
> -       bool is_tracepoint, is_syscall_tp;
> +       u32 dyn_tp_flags = TRACE_EVENT_FL_DYNAMIC | TRACE_EVENT_FL_FPROBE;
> +       bool is_tracepoint, is_dyn_tracepoint, is_syscall_tp;
>          struct bpf_prog *prog;
>          int flags, err = 0;
>    
> @@ -2463,9 +2464,10 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
>          *prog_id = prog->aux->id;
>          flags = event->tp_event->flags;
>          is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT;
> +       is_dyn_tracepoint = (event->tp_event->flags & dyn_tp_flags) == dyn_tp_flags;
>          is_syscall_tp = is_syscall_trace_event(event->tp_event);
>    
> -       if (is_tracepoint || is_syscall_tp) {
> +       if (is_tracepoint || is_dyn_tracepoint || is_syscall_tp) {
>                  *buf = is_tracepoint ? event->tp_event->tp->name
>                                       : event->tp_event->name;
>                  /* We allow NULL pointer for tracepoint */
> diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c
> index c62d1629cffe..bacc4a1f5f20 100644
> --- a/kernel/trace/trace_fprobe.c
> +++ b/kernel/trace/trace_fprobe.c
> @@ -436,8 +436,10 @@ static struct trace_fprobe *find_trace_fprobe(const char *event,
>    
>   static inline int __enable_trace_fprobe(struct trace_fprobe *tf)
>   {
> -       if (trace_fprobe_is_registered(tf))
> +       if (trace_fprobe_is_registered(tf)) {
> +               pr_warn("fprobe is enabled\n");
>                  enable_fprobe(&tf->fp);
> +       }
>    
>          return 0;
>   }
> diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c
> index 1702aa592c2c..423770aa581e 100644
> --- a/tools/testing/selftests/bpf/prog_tests/send_signal.c
> +++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c
> @@ -196,6 +196,8 @@ static void test_send_signal_common(struct perf_event_attr *attr,
>          /* notify child safe to exit */
>          ASSERT_EQ(write(pipe_p2c[1], buf, 1), 1, "pipe_write");
>    
> +       ASSERT_EQ(skel->bss->dyn_tp_visited, 1, "dyn_tp_visited");
> +
>   disable_pmu:
>          close(pmu_fd);
>   destroy_skel:
> @@ -260,6 +262,8 @@ void test_send_signal(void)
>   {
>          if (test__start_subtest("send_signal_tracepoint"))
>                  test_send_signal_tracepoint(false, false);
> +/* Disable all other subtests except above send_signal_tracepoint. */
> +if (0) {
>          if (test__start_subtest("send_signal_perf"))
>                  test_send_signal_perf(false, false);
>          if (test__start_subtest("send_signal_nmi"))
> @@ -285,3 +289,4 @@ void test_send_signal(void)
>          if (test__start_subtest("send_signal_nmi_thread_remote"))
>                  test_send_signal_nmi(true, true);
>   }

The above looks like you are trying to create your own dynamic event on top
of the tracepoint you made.

> +}
> diff --git a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
> index 176a355e3062..9b580d437046 100644
> --- a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
> +++ b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
> @@ -60,6 +60,20 @@ int send_signal_tp_sched(void *ctx)
>          return bpf_send_signal_test(ctx);
>   }
>    
> +int dyn_tp_visited = 0;
> +#if 1
> +/* This will fail */
> +SEC("tracepoint/tracepoints/sched_switch_dynamic")
> +#else
> +/* This will succeed */
> +SEC("tracepoint/sched/sched_switch")
> +#endif

I don't know bpf code at all, so I don't know what this is trying to do. Is
it looking at the tracefs file system?

What I was suggesting was not to use trace events at all (nothing should
be added to the tracefs file system). You would need some code inside the
kernel to search for tracepoints that are not exported to tracefs, and then
BPF could provide its own hooks to them. Isn't this what BPF raw tracepoints do?

> +int send_signal_dyn_tp_sched(void *ctx)
> +{
> +       dyn_tp_visited = 1;
> +       return 0;
> +}
> +
>   SEC("perf_event")
>   int send_signal_perf(void *ctx)
>   {
> 
> To test the above, build the latest bpf-next and then apply the above change.
> Boot the updated kernel and boot into a qemu VM and run bpf selftest
>    ./test_progs -t send_signal
> 
> With tracepoint tracepoint/sched/sched_switch, the test result is correct.
> With tracepoint tracepoint/tracepoints/sched_switch_dynamic, the test failed.
> The test failure means the sched_switch_dynamic tracepoint is not
> triggered with the bpf program.
> 
> As expected, do
>    echo 1 > /sys/kernel/debug/tracing/events/tracepoints/sched_switch_dynamic/enable
> and the sched_switch_dynamic tracepoint works as expected (trace_pipe can dump
> the expected result).
> 
> For both 'echo 1 > .../sched_switch_dynamic/enable' approach and
> bpf tracepoint tracepoint/tracepoints/sched_switch_dynamic, the message
>    fprobe is enabled
> is printed out in dmesg. The following is enable_fprobe() code.
> 
> /**
>   * enable_fprobe() - Enable fprobe
>   * @fp: The fprobe to be enabled.
>   *
>   * This will soft-enable @fp.
>   */
> static inline void enable_fprobe(struct fprobe *fp)
> {
>          if (fp)
>                  fp->flags &= ~FPROBE_FL_DISABLED;
> }
> 
> Note that in the above the fprobe/dynamic_events is soft-enable.
> Maybe the bpf tracepoint/tracepoints/sched_switch_dynamic only has
> soft-enable and 'echo 1 > ...' approach has both soft-enable and
> actual hard-enable (at pmu level)? If this is the case, what is
> missing for hard-enable for bpf dynamic_events case? Do you have
> any suggestions?
> 
> 
> The above prototype tries to reuse the existing infra/API.
> If you have better way to support dynamic_events, please let me know.

Either create a full trace event (one that is exposed in tracefs), or add a
direct hook to the tracepoint you want. What the above looks like is some
kind of mixture of the two. No "dynamic events" should be involved.

-- Steve
Yonghong Song Jan. 24, 2025, 5:35 p.m. UTC | #14
On 1/24/25 7:50 AM, Steven Rostedt wrote:
> On Thu, 23 Jan 2025 20:40:20 -0800
> Yonghong Song <yonghong.song@linux.dev> wrote:
>
>> Hi, Steve,
>>
>> I did some prototype to support bpf dynamic_events tracepoint. I fixed a couple of issues
> Before I go further, I want to make sure that we have our terminology's
> inline. A dynamic event is one that is created at run time. For example, a
> kprobe event is a dynamic event. Other dynamic events are synthetic events,
> event probes, and uprobes.
>
>> but still not working. See the hacked patch:
>>
>> diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
>> index 9ea4c404bd4e..729ea1c21c94 100644
>> --- a/include/trace/events/sched.h
>> +++ b/include/trace/events/sched.h
>> @@ -824,6 +824,15 @@ DECLARE_TRACE(sched_compute_energy_tp,
>>                    unsigned long max_util, unsigned long busy_time),
>>           TP_ARGS(p, dst_cpu, energy, max_util, busy_time));
>>                                                                                                                                             
>> +/* At /sys/kernel/debug/tracing directory, do
>> + *   echo 't sched_switch_dynamic' >> dynamic_events
>> + * before actually use this tracepoint. The tracepoint will be at
>> + *   /sys/kernel/debug/tracing/events/tracepoints/sched_switch_dynamic/
>> + */
>> +DECLARE_TRACE(sched_switch_dynamic,
>> +       TP_PROTO(bool preempt),
>> +       TP_ARGS(preempt));
> The above is just creating a tracepoint and no "event" is attached to it.
>
> tracepoints (defined in include/linux/tracepoints.h) are what creates the
> hooks within the kernel. The trace_##call() functions, like trace_sched_switch().
> A trace event is something that can be attached to tracepoints, and show up
> in the tracefs file system. The TRACE_EVENT() macro will create both a
> tracepoint and a trace event that attaches to the tracepoint it created.
>
>> +
>>    #endif /* _TRACE_SCHED_H */
>>                                                                                                                                             
>>    /* This part must be outside protection */
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 065f9188b44a..37391eb5089f 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -10749,6 +10749,7 @@ static inline bool perf_event_is_tracing(struct perf_event *event)
>>    int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
>>                               u64 bpf_cookie)
>>    {
>> +       u32 dyn_tp_flags = TRACE_EVENT_FL_DYNAMIC | TRACE_EVENT_FL_FPROBE;
>>           bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp;
>>     
>>           if (!perf_event_is_tracing(event))
>> @@ -10756,7 +10757,9 @@ int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
>>     
>>           is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_KPROBE;
>>           is_uprobe = event->tp_event->flags & TRACE_EVENT_FL_UPROBE;
>> -       is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
>> +       is_tracepoint = (event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT) ||
>> +                       ((event->tp_event->flags & dyn_tp_flags) == dyn_tp_flags);
>> +
>>           is_syscall_tp = is_syscall_trace_event(event->tp_event);
>>           if (!is_kprobe && !is_uprobe && !is_tracepoint && !is_syscall_tp)
>>                   /* bpf programs can only be attached to u/kprobe or tracepoint */
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index 3e5a6bf587f9..53b3d9e20d00 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -6750,6 +6750,7 @@ static void __sched notrace __schedule(int sched_mode)
>>                   psi_account_irqtime(rq, prev, next);
>>                   psi_sched_switch(prev, next, block);
>>     
>> +               trace_sched_switch_dynamic(preempt);
>>                   trace_sched_switch(preempt, prev, next, prev_state);
>>     
>>                   /* Also unlocks the rq: */
>> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
>> index 71c1c02ca7a3..8f9fd2f347ef 100644
>> --- a/kernel/trace/bpf_trace.c
>> +++ b/kernel/trace/bpf_trace.c
>> @@ -2448,7 +2448,8 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
>>                               u64 *probe_offset, u64 *probe_addr,
>>                               unsigned long *missed)
>>    {
>> -       bool is_tracepoint, is_syscall_tp;
>> +       u32 dyn_tp_flags = TRACE_EVENT_FL_DYNAMIC | TRACE_EVENT_FL_FPROBE;
>> +       bool is_tracepoint, is_dyn_tracepoint, is_syscall_tp;
>>           struct bpf_prog *prog;
>>           int flags, err = 0;
>>     
>> @@ -2463,9 +2464,10 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
>>           *prog_id = prog->aux->id;
>>           flags = event->tp_event->flags;
>>           is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT;
>> +       is_dyn_tracepoint = (event->tp_event->flags & dyn_tp_flags) == dyn_tp_flags;
>>           is_syscall_tp = is_syscall_trace_event(event->tp_event);
>>     
>> -       if (is_tracepoint || is_syscall_tp) {
>> +       if (is_tracepoint || is_dyn_tracepoint || is_syscall_tp) {
>>                   *buf = is_tracepoint ? event->tp_event->tp->name
>>                                        : event->tp_event->name;
>>                   /* We allow NULL pointer for tracepoint */
>> diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c
>> index c62d1629cffe..bacc4a1f5f20 100644
>> --- a/kernel/trace/trace_fprobe.c
>> +++ b/kernel/trace/trace_fprobe.c
>> @@ -436,8 +436,10 @@ static struct trace_fprobe *find_trace_fprobe(const char *event,
>>     
>>    static inline int __enable_trace_fprobe(struct trace_fprobe *tf)
>>    {
>> -       if (trace_fprobe_is_registered(tf))
>> +       if (trace_fprobe_is_registered(tf)) {
>> +               pr_warn("fprobe is enabled\n");
>>                   enable_fprobe(&tf->fp);
>> +       }
>>     
>>           return 0;
>>    }
>> diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c
>> index 1702aa592c2c..423770aa581e 100644
>> --- a/tools/testing/selftests/bpf/prog_tests/send_signal.c
>> +++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c
>> @@ -196,6 +196,8 @@ static void test_send_signal_common(struct perf_event_attr *attr,
>>           /* notify child safe to exit */
>>           ASSERT_EQ(write(pipe_p2c[1], buf, 1), 1, "pipe_write");
>>     
>> +       ASSERT_EQ(skel->bss->dyn_tp_visited, 1, "dyn_tp_visited");
>> +
>>    disable_pmu:
>>           close(pmu_fd);
>>    destroy_skel:
>> @@ -260,6 +262,8 @@ void test_send_signal(void)
>>    {
>>           if (test__start_subtest("send_signal_tracepoint"))
>>                   test_send_signal_tracepoint(false, false);
>> +/* Disable all other subtests except above send_signal_tracepoint. */
>> +if (0) {
>>           if (test__start_subtest("send_signal_perf"))
>>                   test_send_signal_perf(false, false);
>>           if (test__start_subtest("send_signal_nmi"))
>> @@ -285,3 +289,4 @@ void test_send_signal(void)
>>           if (test__start_subtest("send_signal_nmi_thread_remote"))
>>                   test_send_signal_nmi(true, true);
>>    }
> The above looks like you are trying to create your own dynamic event on top
> of the tracepoint you made.
>
>> +}
>> diff --git a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
>> index 176a355e3062..9b580d437046 100644
>> --- a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
>> +++ b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
>> @@ -60,6 +60,20 @@ int send_signal_tp_sched(void *ctx)
>>           return bpf_send_signal_test(ctx);
>>    }
>>     
>> +int dyn_tp_visited = 0;
>> +#if 1
>> +/* This will fail */
>> +SEC("tracepoint/tracepoints/sched_switch_dynamic")
>> +#else
>> +/* This will succeed */
>> +SEC("tracepoint/sched/sched_switch")
>> +#endif
> I don't know bpf code at all, so I don't know what this is trying to do. Is
> it looking at the tracefs file system?

Thanks for the clarification of the terminology.
For the above code, yes, it looks at the tracefs file system: at runtime,
once we add 'sched_switch_dynamic' to dynamic_events, the tracepoint
tracepoints/sched_switch_dynamic shows up in the
/sys/kernel/debug/tracing/events/ directory.

>
> What I was suggesting, was not to use trace events at all (nothing should
> be added to the tracefs files system). You would need some code inside the
> kernel to search for tracepoints that are not exported to tracefs and then
> BPF could provide its own hooks to them. Isn't this what BPF raw tracepoints do?

Thanks for the suggestion! I tried it, and raw tracepoints indeed work.
The following is a hack:

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 9ea4c404bd4e..db28c5c37a10 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -824,6 +824,10 @@ DECLARE_TRACE(sched_compute_energy_tp,
                  unsigned long max_util, unsigned long busy_time),
         TP_ARGS(p, dst_cpu, energy, max_util, busy_time));
                                                                                                                                           
+DECLARE_TRACE(sched_switch_hack,
+       TP_PROTO(bool preempt),
+       TP_ARGS(preempt));
+
  #endif /* _TRACE_SCHED_H */
                                                                                                                                           
  /* This part must be outside protection */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 88a9a515b2ba..df3e52d89c94 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6754,6 +6754,7 @@ static void __sched notrace __schedule(int sched_mode)
                 psi_sched_switch(prev, next, !task_on_rq_queued(prev) ||
                                              prev->se.sched_delayed);
                                                                                                                                           
+               trace_sched_switch_hack(preempt);
                 trace_sched_switch(preempt, prev, next, prev_state);
   
                 /* Also unlocks the rq: */
diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c
index 1702aa592c2c..937d5e05fe08 100644
--- a/tools/testing/selftests/bpf/prog_tests/send_signal.c
+++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c
@@ -195,6 +195,8 @@ static void test_send_signal_common(struct perf_event_attr *attr,
   
         /* notify child safe to exit */
         ASSERT_EQ(write(pipe_p2c[1], buf, 1), 1, "pipe_write");
+       if (!attr)
+               ASSERT_EQ(skel->bss->dyn_tp_visited, 1, "dyn_tp_visited");
   
  disable_pmu:
         close(pmu_fd);
diff --git a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
index 176a355e3062..abed1a55ae3b 100644
--- a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
+++ b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
@@ -60,6 +60,15 @@ int send_signal_tp_sched(void *ctx)
         return bpf_send_signal_test(ctx);
  }
   
+int dyn_tp_visited = 0;
+
+SEC("raw_tp/sched_switch_hack")
+int send_signal_dyn_tp_sched(void *ctx)
+{
+       dyn_tp_visited = 1;
+       return 0;
+}
+
  SEC("perf_event")
  int send_signal_perf(void *ctx)
  {

The raw_tp/sched_switch_hack works great!

>
>> +int send_signal_dyn_tp_sched(void *ctx)
>> +{
>> +       dyn_tp_visited = 1;
>> +       return 0;
>> +}
>> +
>>    SEC("perf_event")
>>    int send_signal_perf(void *ctx)
>>    {
>>
>> To test the above, build the latest bpf-next and then apply the above change.
>> Boot the updated kernel and boot into a qemu VM and run bpf selftest
>>     ./test_progs -t send_signal
>>
>> With tracepoint tracepoint/sched/sched_switch, the test result is correct.
>> With tracepoint tracepoint/tracepoints/sched_switch_dynamic, the test failed.
>> The test failure means the sched_switch_dynamic tracepoint is not
>> triggered with the bpf program.
>>
>> As expected, do
>>     echo 1 > /sys/kernel/debug/tracing/events/tracepoints/sched_switch_dynamic/enable
>> and the sched_switch_dynamic tracepoint works as expected (trace_pipe can dump
>> the expected result).
>>
>> For both 'echo 1 > .../sched_switch_dynamic/enable' approach and
>> bpf tracepoint tracepoint/tracepoints/sched_switch_dynamic, the message
>>     fprobe is enabled
>> is printed out in dmesg. The following is enable_fprobe() code.
>>
>> /**
>>    * enable_fprobe() - Enable fprobe
>>    * @fp: The fprobe to be enabled.
>>    *
>>    * This will soft-enable @fp.
>>    */
>> static inline void enable_fprobe(struct fprobe *fp)
>> {
>>           if (fp)
>>                   fp->flags &= ~FPROBE_FL_DISABLED;
>> }
>>
>> Note that in the above the fprobe/dynamic_events is soft-enable.
>> Maybe the bpf tracepoint/tracepoints/sched_switch_dynamic only has
>> soft-enable and 'echo 1 > ...' approach has both soft-enable and
>> actual hard-enable (at pmu level)? If this is the case, what is
>> missing for hard-enable for bpf dynamic_events case? Do you have
>> any suggestions?
>>
>>
>> The above prototype tries to reuse the existing infra/API.
>> If you have better way to support dynamic_events, please let me know.
> Either create a full trace event (one that is exposed in tracefs), or add a
> direct hook to the tracepoint you want. What the above looks like is some
> kind of mixture of the two. No "dynamic events" should be involed.

Indeed, as you suggested above, raw tracepoints work fine for bpf
with those DECLARE_TRACE tracepoints. We can just use them.

>
> -- Steve
>

Patch

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index a27c4b619dffd7dcc72fffa71bf0fd5e34fe6681..b3a636658b39721cca843c0000eaa573cf4d09d5 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -259,6 +259,40 @@  TRACE_EVENT(tcp_retransmit_synack,
 		  __entry->saddr_v6, __entry->daddr_v6)
 );
 
+TRACE_EVENT(tcp_cwnd_reduction,
+
+	TP_PROTO(const struct sock *sk, const int newly_acked_sacked,
+		 const int newly_lost, const int flag),
+
+	TP_ARGS(sk, newly_acked_sacked, newly_lost, flag),
+
+	TP_STRUCT__entry(
+		__array(__u8, saddr, sizeof(struct sockaddr_in6))
+		__array(__u8, daddr, sizeof(struct sockaddr_in6))
+		__field(int, in_flight)
+
+		__field(int, newly_acked_sacked)
+		__field(int, newly_lost)
+	),
+
+	TP_fast_assign(
+		const struct inet_sock *inet = inet_sk(sk);
+		const struct tcp_sock *tp = tcp_sk(sk);
+
+		memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
+		memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));
+
+		TP_STORE_ADDR_PORTS(__entry, inet, sk);
+		__entry->in_flight = tcp_packets_in_flight(tp);
+		__entry->newly_lost = newly_lost;
+		__entry->newly_acked_sacked = newly_acked_sacked;
+	),
+
+	TP_printk("src=%pISpc dest=%pISpc newly_lost=%d newly_acked_sacked=%d in_flight=%d",
+		  __entry->saddr, __entry->daddr, __entry->newly_lost,
+		  __entry->newly_acked_sacked, __entry->in_flight)
+);
+
 #include <trace/events/net_probe_common.h>
 
 TRACE_EVENT(tcp_probe,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4811727b8a02258ec6fa1fd129beecf7cbb0f90e..fc88c511e81bc12ec57e8dc3e9185a920d1bd079 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2710,6 +2710,8 @@  void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost,
 	if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
 		return;
 
+	trace_tcp_cwnd_reduction(sk, newly_acked_sacked, newly_lost, flag);
+
 	tp->prr_delivered += newly_acked_sacked;
 	if (delta < 0) {
 		u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +