[net-next,02/13] ipv4: introduce tracepoint trace_ip_queue_xmit()

Message ID: 20210805185750.4522-3-xiyou.wangcong@gmail.com (mailing list archive)
State: Changes Requested
Delegated to: Netdev Maintainers
Series: net: add more tracepoints to TCP/IP stack

Checks

Context Check Description
netdev/cover_letter success
netdev/fixes_present success
netdev/patch_count success
netdev/tree_selection success Clearly marked for net-next
netdev/subject_prefix success
netdev/cc_maintainers warning 6 maintainers not CCed: mingo@redhat.com dsahern@kernel.org davem@davemloft.net rostedt@goodmis.org kuba@kernel.org yoshfuji@linux-ipv6.org
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 4 this patch: 3
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success
netdev/checkpatch warning CHECK: Alignment should match open parenthesis; CHECK: Lines should not end with a '('; CHECK: No space is necessary after a cast; WARNING: Using vsprintf specifier '%px' potentially exposes the kernel memory layout, if you don't really need the address please consider using '%p'; WARNING: line length of 81 exceeds 80 columns
netdev/build_allmodconfig_warn success Errors and warnings before: 4 this patch: 3
netdev/header_inline success

Commit Message

Cong Wang Aug. 5, 2021, 6:57 p.m. UTC
From: Qitao Xu <qitao.xu@bytedance.com>

Tracepoint trace_ip_queue_xmit() is introduced to trace skb
at the entrance of IP layer on TX side.

Reviewed-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Qitao Xu <qitao.xu@bytedance.com>
---
 include/trace/events/ip.h | 42 +++++++++++++++++++++++++++++++++++++++
 net/ipv4/ip_output.c      | 10 +++++++++-
 2 files changed, 51 insertions(+), 1 deletion(-)
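
For context, a tracepoint like this is typically consumed either via tracefs or from a BPF program. Below is a minimal, purely illustrative sketch of a BTF-typed raw tracepoint (tp_btf) consumer, assuming the event is merged as proposed; the program name and the printed output are made up and are not part of the patch.

// SPDX-License-Identifier: GPL-2.0
/* Illustrative only: consume the proposed ip_queue_xmit tracepoint as a
 * BTF-typed raw tracepoint.  Assumes a vmlinux.h generated with bpftool. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "GPL";

SEC("tp_btf/ip_queue_xmit")
int BPF_PROG(on_ip_queue_xmit, const struct sock *sk, const struct sk_buff *skb)
{
	/* tp_btf arguments are typed pointers and can be read directly. */
	bpf_printk("ip_queue_xmit: sk=%p skb=%p len=%u", sk, skb, skb->len);
	return 0;
}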

Comments

Eric Dumazet Aug. 6, 2021, 10:08 a.m. UTC | #1
On 8/5/21 8:57 PM, Cong Wang wrote:
> From: Qitao Xu <qitao.xu@bytedance.com>
> 
> Tracepoint trace_ip_queue_xmit() is introduced to trace skb
> at the entrance of IP layer on TX side.
> 
> Reviewed-by: Cong Wang <cong.wang@bytedance.com>
> Signed-off-by: Qitao Xu <qitao.xu@bytedance.com>
> ---
>  include/trace/events/ip.h | 42 +++++++++++++++++++++++++++++++++++++++
>  net/ipv4/ip_output.c      | 10 +++++++++-
>  2 files changed, 51 insertions(+), 1 deletion(-)
> 
> diff --git a/include/trace/events/ip.h b/include/trace/events/ip.h
> index 008f821ebc50..553ae7276732 100644
> --- a/include/trace/events/ip.h
> +++ b/include/trace/events/ip.h
> @@ -41,6 +41,48 @@
>  	TP_STORE_V4MAPPED(__entry, saddr, daddr)
>  #endif
>  
> +TRACE_EVENT(ip_queue_xmit,
> +
> +	TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
> +
> +	TP_ARGS(sk, skb),
> +
> +	TP_STRUCT__entry(
> +		__field(const void *, skbaddr)
> +		__field(const void *, skaddr)
> +		__field(__u16, sport)
> +		__field(__u16, dport)
> +		__array(__u8, saddr, 4)
> +		__array(__u8, daddr, 4)
> +		__array(__u8, saddr_v6, 16)
> +		__array(__u8, daddr_v6, 16)
> +	),
> +
> +	TP_fast_assign(
> +		struct inet_sock *inet = inet_sk(sk);
> +		__be32 *p32;
> +
> +		__entry->skbaddr = skb;
> +		__entry->skaddr = sk;
> +
> +		__entry->sport = ntohs(inet->inet_sport);
> +		__entry->dport = ntohs(inet->inet_dport);
> +
> +		p32 = (__be32 *) __entry->saddr;
> +		*p32 = inet->inet_saddr;
> +
> +		p32 = (__be32 *) __entry->daddr;
> +		*p32 =  inet->inet_daddr;
> +
> +		TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
> +			      sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
> +	),
> +
> +	TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c skbaddr=%px",
> +		  __entry->sport, __entry->dport, __entry->saddr, __entry->daddr,
> +		  __entry->saddr_v6, __entry->daddr_v6, __entry->skbaddr)
> +);
> +
>  #endif /* _TRACE_IP_H */
>  
>  /* This part must be outside protection */
> diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
> index 6b04a88466b2..dcf94059112e 100644
> --- a/net/ipv4/ip_output.c
> +++ b/net/ipv4/ip_output.c
> @@ -82,6 +82,7 @@
>  #include <linux/netfilter_bridge.h>
>  #include <linux/netlink.h>
>  #include <linux/tcp.h>
> +#include <trace/events/ip.h>
>  
>  static int
>  ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
> @@ -536,7 +537,14 @@ EXPORT_SYMBOL(__ip_queue_xmit);
>  
>  int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
>  {
> -	return __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos);
> +	int ret;
> +
> +	ret = __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos);
> +	if (!ret)
> +		trace_ip_queue_xmit(sk, skb);
> +
> +	return ret;
> +
>  }
>  EXPORT_SYMBOL(ip_queue_xmit);
>  
> 

While it is useful to have stuff like this,
adding so many tracepoints has a certain cost.

I fear that you have not determined this cost
on workloads where we enter these functions with cold caches.

For instance, before this patch, the compiler gives us:

2e10 <ip_queue_xmit>:
    2e10:	e8 00 00 00 00       	callq  2e15 <ip_queue_xmit+0x5> (__fentry__-0x4)
    2e15:	0f b6 8f 1c 03 00 00 	movzbl 0x31c(%rdi),%ecx
    2e1c:	e9 ef fb ff ff       	jmpq   2a10 <__ip_queue_xmit>


After patch, we see the compiler had to save/restore registers, and no longer
jumps to __ip_queue_xmit. Code is bigger, even when tracepoint is not enabled.

    2e10:	e8 00 00 00 00       	callq  2e15 <ip_queue_xmit+0x5>
			2e11: R_X86_64_PLT32	__fentry__-0x4
    2e15:	41 55                	push   %r13
    2e17:	49 89 f5             	mov    %rsi,%r13
    2e1a:	41 54                	push   %r12
    2e1c:	55                   	push   %rbp
    2e1d:	0f b6 8f 1c 03 00 00 	movzbl 0x31c(%rdi),%ecx
    2e24:	48 89 fd             	mov    %rdi,%rbp
    2e27:	e8 00 00 00 00       	callq  2e2c <ip_queue_xmit+0x1c>
			2e28: R_X86_64_PLT32	__ip_queue_xmit-0x4
    2e2c:	41 89 c4             	mov    %eax,%r12d
    2e2f:	85 c0                	test   %eax,%eax
    2e31:	74 09                	je     2e3c <ip_queue_xmit+0x2c>
    2e33:	44 89 e0             	mov    %r12d,%eax
    2e36:	5d                   	pop    %rbp
    2e37:	41 5c                	pop    %r12
    2e39:	41 5d                	pop    %r13
    2e3b:	c3                   	retq   
    2e3c:	66 90                	xchg   %ax,%ax
    2e3e:	44 89 e0             	mov    %r12d,%eax
    2e41:	5d                   	pop    %rbp
    2e42:	41 5c                	pop    %r12
    2e44:	41 5d                	pop    %r13
    2e46:	c3                   	retq  
---- tracing code --- 
    2e47:	65 8b 05 00 00 00 00 	mov    %gs:0x0(%rip),%eax        # 2e4e <ip_queue_xmit+0x3e>
			2e4a: R_X86_64_PC32	cpu_number-0x4
    2e4e:	89 c0                	mov    %eax,%eax
    2e50:	48 0f a3 05 00 00 00 	bt     %rax,0x0(%rip)        # 2e58 <ip_queue_xmit+0x48>
    2e57:	00 
			2e54: R_X86_64_PC32	__cpu_online_mask-0x4
    2e58:	73 d9                	jae    2e33 <ip_queue_xmit+0x23>
    2e5a:	48 8b 05 00 00 00 00 	mov    0x0(%rip),%rax        # 2e61 <ip_queue_xmit+0x51>
			2e5d: R_X86_64_PC32	__tracepoint_ip_queue_xmit+0x3c
    2e61:	48 85 c0             	test   %rax,%rax
    2e64:	74 0f                	je     2e75 <ip_queue_xmit+0x65>
    2e66:	48 8b 78 08          	mov    0x8(%rax),%rdi
    2e6a:	4c 89 ea             	mov    %r13,%rdx
    2e6d:	48 89 ee             	mov    %rbp,%rsi
    2e70:	e8 00 00 00 00       	callq  2e75 <ip_queue_xmit+0x65>
			2e71: R_X86_64_PLT32	__SCT__tp_func_ip_queue_xmit-0x4
    2e75:	44 89 e0             	mov    %r12d,%eax
    2e78:	5d                   	pop    %rbp
    2e79:	41 5c                	pop    %r12
    2e7b:	41 5d                	pop    %r13
    2e7d:	c3                   	retq
Cong Wang Aug. 9, 2021, 8:32 p.m. UTC | #2
On Fri, Aug 6, 2021 at 3:09 AM Eric Dumazet <eric.dumazet@gmail.com> wrote:
> While it is useful to have stuff like this,
> adding so many tracepoints has a certain cost.
>
> I fear that you have not determined this cost
> on workloads where we enter these functions with cold caches.
>
> For instance, before this patch, the compiler gives us:
>
> 2e10 <ip_queue_xmit>:
>     2e10:       e8 00 00 00 00          callq  2e15 <ip_queue_xmit+0x5> (__fentry__-0x4)
>     2e15:       0f b6 8f 1c 03 00 00    movzbl 0x31c(%rdi),%ecx
>     2e1c:       e9 ef fb ff ff          jmpq   2a10 <__ip_queue_xmit>
>
>
> After patch, we see the compiler had to save/restore registers, and no longer
> jumps to __ip_queue_xmit. Code is bigger, even when tracepoint is not enabled.

Interesting, I didn't pay attention to the binary code generated
by the compiler. Let me check it, as I have since moved the trace call
to before __ip_queue_xmit() (otherwise the order is reversed).

Thanks!
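
For reference, the reordering mentioned above would presumably look like the sketch below (illustrative, not the posted patch). With nothing left after the call, the compiler may again be able to emit a tail jump to __ip_queue_xmit() when the tracepoint's static branch is disabled, at the cost of no longer being able to trace only successful transmissions:

int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
{
	trace_ip_queue_xmit(sk, skb);
	return __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos);
}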
Martin KaFai Lau Aug. 11, 2021, 9:22 p.m. UTC | #3
On Thu, Aug 05, 2021 at 11:57:39AM -0700, Cong Wang wrote:
> From: Qitao Xu <qitao.xu@bytedance.com>
> 
> Tracepoint trace_ip_queue_xmit() is introduced to trace skb
> at the entrance of IP layer on TX side.
> 
> Reviewed-by: Cong Wang <cong.wang@bytedance.com>
> Signed-off-by: Qitao Xu <qitao.xu@bytedance.com>
> ---
>  include/trace/events/ip.h | 42 +++++++++++++++++++++++++++++++++++++++
>  net/ipv4/ip_output.c      | 10 +++++++++-
>  2 files changed, 51 insertions(+), 1 deletion(-)
> 
> diff --git a/include/trace/events/ip.h b/include/trace/events/ip.h
> index 008f821ebc50..553ae7276732 100644
> --- a/include/trace/events/ip.h
> +++ b/include/trace/events/ip.h
> @@ -41,6 +41,48 @@
>  	TP_STORE_V4MAPPED(__entry, saddr, daddr)
>  #endif
>  
> +TRACE_EVENT(ip_queue_xmit,
> +
> +	TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
> +
> +	TP_ARGS(sk, skb),
> +
> +	TP_STRUCT__entry(
> +		__field(const void *, skbaddr)
> +		__field(const void *, skaddr)
> +		__field(__u16, sport)
> +		__field(__u16, dport)
> +		__array(__u8, saddr, 4)
> +		__array(__u8, daddr, 4)
> +		__array(__u8, saddr_v6, 16)
> +		__array(__u8, daddr_v6, 16)
> +	),
> +
> +	TP_fast_assign(
> +		struct inet_sock *inet = inet_sk(sk);
> +		__be32 *p32;
> +
> +		__entry->skbaddr = skb;
> +		__entry->skaddr = sk;
> +
> +		__entry->sport = ntohs(inet->inet_sport);
> +		__entry->dport = ntohs(inet->inet_dport);
> +
> +		p32 = (__be32 *) __entry->saddr;
> +		*p32 = inet->inet_saddr;
> +
> +		p32 = (__be32 *) __entry->daddr;
> +		*p32 =  inet->inet_daddr;
> +
> +		TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
> +			      sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
> +	),
> +
> +	TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c skbaddr=%px",
> +		  __entry->sport, __entry->dport, __entry->saddr, __entry->daddr,
> +		  __entry->saddr_v6, __entry->daddr_v6, __entry->skbaddr)
> +);
> +
>  #endif /* _TRACE_IP_H */
>  
>  /* This part must be outside protection */
> diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
> index 6b04a88466b2..dcf94059112e 100644
> --- a/net/ipv4/ip_output.c
> +++ b/net/ipv4/ip_output.c
> @@ -82,6 +82,7 @@
>  #include <linux/netfilter_bridge.h>
>  #include <linux/netlink.h>
>  #include <linux/tcp.h>
> +#include <trace/events/ip.h>
>  
>  static int
>  ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
> @@ -536,7 +537,14 @@ EXPORT_SYMBOL(__ip_queue_xmit);
>  
>  int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
>  {
> -	return __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos);
> +	int ret;
> +
> +	ret = __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos);
> +	if (!ret)
> +		trace_ip_queue_xmit(sk, skb);
Instead of adding tracepoints, the bpf fexit prog can be used here and
the bpf prog will have the sk, skb, and ret available (example in fexit_test.c).
Some tracepoints in this set can also be done with bpf fentry/fexit.
Does bpf fentry/fexit work for your use case?
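
For comparison, a minimal sketch of the fexit alternative suggested here (loosely following tools/testing/selftests/bpf/progs/fexit_test.c; the program name and the printed output are illustrative only, not from any posted patch):

// SPDX-License-Identifier: GPL-2.0
/* Illustrative only: BPF fexit program attached to ip_queue_xmit(),
 * which receives the function arguments plus the return value. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "GPL";

SEC("fexit/ip_queue_xmit")
int BPF_PROG(ip_queue_xmit_exit, struct sock *sk, struct sk_buff *skb,
	     struct flowi *fl, int ret)
{
	if (ret == 0)	/* same condition the proposed tracepoint uses */
		bpf_printk("ip_queue_xmit ok: sk=%p skb=%p", sk, skb);
	return 0;
}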
Cong Wang Aug. 11, 2021, 10:48 p.m. UTC | #4
On Wed, Aug 11, 2021 at 2:23 PM Martin KaFai Lau <kafai@fb.com> wrote:
> Instead of adding tracepoints, the bpf fexit prog can be used here and
> the bpf prog will have the sk, skb, and ret available (example in fexit_test.c).
> Some tracepoints in this set can also be done with bpf fentry/fexit.
> Does bpf fentry/fexit work for your use case?

Well, kprobe works too in this respect. The problem with kprobe
or fexit is that there is no guarantee the function still exists in the
kernel from one iteration to the next; the kernel is free to delete or
rename it. With a tracepoint, even if ip_queue_xmit() were renamed, the
same tracepoint would remain in the kernel.

Thanks.
Martin KaFai Lau Aug. 11, 2021, 11:08 p.m. UTC | #5
On Wed, Aug 11, 2021 at 03:48:36PM -0700, Cong Wang wrote:
> On Wed, Aug 11, 2021 at 2:23 PM Martin KaFai Lau <kafai@fb.com> wrote:
> > Instead of adding tracepoints, the bpf fexit prog can be used here and
> > the bpf prog will have the sk, skb, and ret available (example in fexit_test.c).
> > Some tracepoints in this set can also be done with bpf fentry/fexit.
> > Does bpf fentry/fexit work for your use case?
> 
> Well, kprobe works too in this respect. The problem with kprobe
> or fexit is that there is no guarantee the function still exists in the
> kernel from one iteration to the next; the kernel is free to delete or
> rename it. With a tracepoint, even if ip_queue_xmit() were renamed, the
> same tracepoint would remain in the kernel.
Some of the function names hardly ever change.  Considering it is
not always cost free, as shown in the other thread, this is not a strong
enough reason to add so many tracepoints while other options
are available.
Cong Wang Aug. 12, 2021, 12:37 a.m. UTC | #6
On Wed, Aug 11, 2021 at 4:08 PM Martin KaFai Lau <kafai@fb.com> wrote:
> Some of the function names hardly ever change.

This is obviously wrong for two reasons:

1. Kernel developers did change them. As a quick example,
tcp_retransmit_skb() has been changed; we have reasons to only trace
__tcp_retransmit_skb() instead.

2. Even if kernel developers never did, compilers can inline too. For
example, I see nothing stopping the compiler from inlining tcp_transmit_skb(),
which is static and only calls __tcp_transmit_skb(). You explicitly
mark bpf_fentry_test1() as noinline, don't you?

I understand you are eager to promote eBPF; however, please stay
reasonable about the facts.

Thanks.
Martin KaFai Lau Aug. 12, 2021, 5:46 a.m. UTC | #7
On Wed, Aug 11, 2021 at 05:37:35PM -0700, Cong Wang wrote:
> On Wed, Aug 11, 2021 at 4:08 PM Martin KaFai Lau <kafai@fb.com> wrote:
> > Some of the function names hardly ever change.
> 
> This is obviously wrong for two reasons:
> 
> 1. Kernel developers did change them. As a quick example,
> tcp_retransmit_skb() has been changed; we have reasons to only trace
> __tcp_retransmit_skb() instead.
I did not say it has never changed.  How often, though?  I don't believe
it changes often enough.

> 2. Even if kernel developers never did, compilers can inline too. For
> example, I see nothing stopping the compiler from inlining tcp_transmit_skb(),
> which is static and only calls __tcp_transmit_skb(). You explicitly
> mark bpf_fentry_test1() as noinline, don't you?
Yes, correct, inline is a legit reason.

Another one is tracking local variables on the function stack.

However, how many of the functions you need here are actually inlined by
the compiler or need their local variables tracked?

> I understand you are eager to promote eBPF; however, please stay
> reasonable about the facts.
Absolutely not.  There is no need.  There are enough scripts
using bpf that do useful network tracing without
adding so many tracepoints in the fast path.

I am only exploring other options and trying to understand
why it does not work in your case before adding all of
them in the fast path.  It is sad that you take it
this way.

Patch

diff --git a/include/trace/events/ip.h b/include/trace/events/ip.h
index 008f821ebc50..553ae7276732 100644
--- a/include/trace/events/ip.h
+++ b/include/trace/events/ip.h
@@ -41,6 +41,48 @@ 
 	TP_STORE_V4MAPPED(__entry, saddr, daddr)
 #endif
 
+TRACE_EVENT(ip_queue_xmit,
+
+	TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
+
+	TP_ARGS(sk, skb),
+
+	TP_STRUCT__entry(
+		__field(const void *, skbaddr)
+		__field(const void *, skaddr)
+		__field(__u16, sport)
+		__field(__u16, dport)
+		__array(__u8, saddr, 4)
+		__array(__u8, daddr, 4)
+		__array(__u8, saddr_v6, 16)
+		__array(__u8, daddr_v6, 16)
+	),
+
+	TP_fast_assign(
+		struct inet_sock *inet = inet_sk(sk);
+		__be32 *p32;
+
+		__entry->skbaddr = skb;
+		__entry->skaddr = sk;
+
+		__entry->sport = ntohs(inet->inet_sport);
+		__entry->dport = ntohs(inet->inet_dport);
+
+		p32 = (__be32 *) __entry->saddr;
+		*p32 = inet->inet_saddr;
+
+		p32 = (__be32 *) __entry->daddr;
+		*p32 =  inet->inet_daddr;
+
+		TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
+			      sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
+	),
+
+	TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c skbaddr=%px",
+		  __entry->sport, __entry->dport, __entry->saddr, __entry->daddr,
+		  __entry->saddr_v6, __entry->daddr_v6, __entry->skbaddr)
+);
+
 #endif /* _TRACE_IP_H */
 
 /* This part must be outside protection */
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 6b04a88466b2..dcf94059112e 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -82,6 +82,7 @@ 
 #include <linux/netfilter_bridge.h>
 #include <linux/netlink.h>
 #include <linux/tcp.h>
+#include <trace/events/ip.h>
 
 static int
 ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
@@ -536,7 +537,14 @@  EXPORT_SYMBOL(__ip_queue_xmit);
 
 int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
 {
-	return __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos);
+	int ret;
+
+	ret = __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos);
+	if (!ret)
+		trace_ip_queue_xmit(sk, skb);
+
+	return ret;
+
 }
 EXPORT_SYMBOL(ip_queue_xmit);