diff mbox series

[net-next,v2] net_sched: introduce tracepoint trace_qdisc_enqueue()

Message ID 20210711190308.8476-1-xiyou.wangcong@gmail.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series [net-next,v2] net_sched: introduce tracepoint trace_qdisc_enqueue() | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for net-next
netdev/subject_prefix success Link
netdev/cc_maintainers warning 11 maintainers not CCed: mingo@redhat.com daniel@iogearbox.net andriin@fb.com rostedt@goodmis.org ast@kernel.org ap420073@gmail.com kuba@kernel.org alobakin@pm.me atenart@kernel.org davem@davemloft.net weiwan@google.com
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 11 this patch: 11
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch warning CHECK: Alignment should match open parenthesis CHECK: Lines should not end with a '(' WARNING: Using vsprintf specifier '%px' potentially exposes the kernel memory layout, if you don't really need the address please consider using '%p'. WARNING: line length of 87 exceeds 80 columns WARNING: line length of 91 exceeds 80 columns
netdev/build_allmodconfig_warn success Errors and warnings before: 11 this patch: 11
netdev/header_inline success Link

Commit Message

Cong Wang July 11, 2021, 7:03 p.m. UTC
From: Qitao Xu <qitao.xu@bytedance.com>

Tracepoint trace_qdisc_enqueue() is introduced to trace skb at
the entrance of TC layer on TX side. This is kinda symmetric to
trace_qdisc_dequeue(), and together they can be used to calculate
the packet queueing latency. It is more accurate than
trace_net_dev_queue(), because we already successfully enqueue
the packet at that point.

Note, trace ring buffer is only accessible to privileged users,
it is safe to use %px to print a real kernel address here.

Reviewed-by: Cong Wang <cong.wang@bytedance.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: Qitao Xu <qitao.xu@bytedance.com>
---
 include/trace/events/qdisc.h | 26 ++++++++++++++++++++++++++
 net/core/dev.c               |  9 +++++++++
 2 files changed, 35 insertions(+)

Comments

Tonghao Zhang July 12, 2021, 12:49 a.m. UTC | #1
On Mon, Jul 12, 2021 at 3:03 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> From: Qitao Xu <qitao.xu@bytedance.com>
>
> Tracepoint trace_qdisc_enqueue() is introduced to trace skb at
> the entrance of TC layer on TX side. This is kinda symmetric to
> trace_qdisc_dequeue(), and together they can be used to calculate
> the packet queueing latency. It is more accurate than
> trace_net_dev_queue(), because we already successfully enqueue
> the packet at that point.
>
> Note, trace ring buffer is only accessible to privileged users,
> it is safe to use %px to print a real kernel address here.
>
> Reviewed-by: Cong Wang <cong.wang@bytedance.com>
> Cc: Jamal Hadi Salim <jhs@mojatatu.com>
> Cc: Jiri Pirko <jiri@resnulli.us>
> Signed-off-by: Qitao Xu <qitao.xu@bytedance.com>
> ---
>  include/trace/events/qdisc.h | 26 ++++++++++++++++++++++++++
>  net/core/dev.c               |  9 +++++++++
>  2 files changed, 35 insertions(+)
>
> diff --git a/include/trace/events/qdisc.h b/include/trace/events/qdisc.h
> index 58209557cb3a..c3006c6b4a87 100644
> --- a/include/trace/events/qdisc.h
> +++ b/include/trace/events/qdisc.h
> @@ -46,6 +46,32 @@ TRACE_EVENT(qdisc_dequeue,
>                   __entry->txq_state, __entry->packets, __entry->skbaddr )
>  );
>
> +TRACE_EVENT(qdisc_enqueue,
> +
> +       TP_PROTO(struct Qdisc *qdisc, const struct netdev_queue *txq, struct sk_buff *skb),
> +
> +       TP_ARGS(qdisc, txq, skb),
> +
> +       TP_STRUCT__entry(
> +               __field(struct Qdisc *, qdisc)
> +               __field(void *, skbaddr)
> +               __field(int, ifindex)
> +               __field(u32, handle)
> +               __field(u32, parent)
> +       ),
> +
> +       TP_fast_assign(
> +               __entry->qdisc = qdisc;
> +               __entry->skbaddr = skb;
> +               __entry->ifindex = txq->dev ? txq->dev->ifindex : 0;
> +               __entry->handle  = qdisc->handle;
> +               __entry->parent  = qdisc->parent;
> +       ),
Hi qitao, cong
Why not support the txq, we get more info from txq.
and we should take care of the return value of q->enqueue, because we
can know what happens in the qdisc queue(not necessary to work with
qdisc:dequeue).
and we can use a tracepoint filter for the return value too.
we should introduce a new function to instead of now codes, that may
make the codes clean.  Please review my patch for more info.
https://patchwork.kernel.org/project/netdevbpf/patch/20210711050007.1200-1-xiangxia.m.yue@gmail.com/

> +       TP_printk("enqueue ifindex=%d qdisc handle=0x%X parent=0x%X skbaddr=%px",
> +                 __entry->ifindex, __entry->handle, __entry->parent, __entry->skbaddr)
> +);
> +
>  TRACE_EVENT(qdisc_reset,
>
>         TP_PROTO(struct Qdisc *q),
> diff --git a/net/core/dev.c b/net/core/dev.c
> index c253c2aafe97..20b9376de301 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -131,6 +131,7 @@
>  #include <trace/events/napi.h>
>  #include <trace/events/net.h>
>  #include <trace/events/skb.h>
> +#include <trace/events/qdisc.h>
>  #include <linux/inetdevice.h>
>  #include <linux/cpu_rmap.h>
>  #include <linux/static_key.h>
> @@ -3864,6 +3865,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
>                         if (unlikely(!nolock_qdisc_is_empty(q))) {
>                                 rc = q->enqueue(skb, q, &to_free) &
>                                         NET_XMIT_MASK;
> +                               if (rc == NET_XMIT_SUCCESS)
> +                                       trace_qdisc_enqueue(q, txq, skb);
>                                 __qdisc_run(q);
>                                 qdisc_run_end(q);
>
> @@ -3880,6 +3883,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
>                 }
>
>                 rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
> +               if (rc == NET_XMIT_SUCCESS)
> +                       trace_qdisc_enqueue(q, txq, skb);
> +
>                 qdisc_run(q);
>
>  no_lock_out:
> @@ -3924,6 +3930,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
>                 rc = NET_XMIT_SUCCESS;
>         } else {
>                 rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
> +               if (rc == NET_XMIT_SUCCESS)
> +                       trace_qdisc_enqueue(q, txq, skb);
> +
>                 if (qdisc_run_begin(q)) {
>                         if (unlikely(contended)) {
>                                 spin_unlock(&q->busylock);
> --
> 2.27.0
>
Cong Wang July 12, 2021, 1:47 a.m. UTC | #2
On Sun, Jul 11, 2021 at 5:50 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
>
> On Mon, Jul 12, 2021 at 3:03 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > From: Qitao Xu <qitao.xu@bytedance.com>
> >
> > Tracepoint trace_qdisc_enqueue() is introduced to trace skb at
> > the entrance of TC layer on TX side. This is kinda symmetric to
> > trace_qdisc_dequeue(), and together they can be used to calculate
> > the packet queueing latency. It is more accurate than
> > trace_net_dev_queue(), because we already successfully enqueue
> > the packet at that point.
> >
> > Note, trace ring buffer is only accessible to privileged users,
> > it is safe to use %px to print a real kernel address here.
> >
> > Reviewed-by: Cong Wang <cong.wang@bytedance.com>
> > Cc: Jamal Hadi Salim <jhs@mojatatu.com>
> > Cc: Jiri Pirko <jiri@resnulli.us>
> > Signed-off-by: Qitao Xu <qitao.xu@bytedance.com>
> > ---
> >  include/trace/events/qdisc.h | 26 ++++++++++++++++++++++++++
> >  net/core/dev.c               |  9 +++++++++
> >  2 files changed, 35 insertions(+)
> >
> > diff --git a/include/trace/events/qdisc.h b/include/trace/events/qdisc.h
> > index 58209557cb3a..c3006c6b4a87 100644
> > --- a/include/trace/events/qdisc.h
> > +++ b/include/trace/events/qdisc.h
> > @@ -46,6 +46,32 @@ TRACE_EVENT(qdisc_dequeue,
> >                   __entry->txq_state, __entry->packets, __entry->skbaddr )
> >  );
> >
> > +TRACE_EVENT(qdisc_enqueue,
> > +
> > +       TP_PROTO(struct Qdisc *qdisc, const struct netdev_queue *txq, struct sk_buff *skb),
> > +
> > +       TP_ARGS(qdisc, txq, skb),
> > +
> > +       TP_STRUCT__entry(
> > +               __field(struct Qdisc *, qdisc)
> > +               __field(void *, skbaddr)
> > +               __field(int, ifindex)
> > +               __field(u32, handle)
> > +               __field(u32, parent)
> > +       ),
> > +
> > +       TP_fast_assign(
> > +               __entry->qdisc = qdisc;
> > +               __entry->skbaddr = skb;
> > +               __entry->ifindex = txq->dev ? txq->dev->ifindex : 0;
> > +               __entry->handle  = qdisc->handle;
> > +               __entry->parent  = qdisc->parent;
> > +       ),
> Hi qitao, cong
> Why not support the txq, we get more info from txq.

Because we only want to calculate queueing latency, not anything
else. If you need it, you are welcome to add anything reasonable
in the future, it won't break ABI (see 3dd344ea84e122f791ab).

> and we should take care of the return value of q->enqueue, because we
> can know what happens in the qdisc queue(not necessary to work with
> qdisc:dequeue).
> and we can use a tracepoint filter for the return value too.

Disagree. Because we really have no interest in dropped packets.
Even if we really do, we could trace kfree_skb(), not really here.

> we should introduce a new function to instead of now codes, that may
> make the codes clean.  Please review my patch for more info.

Just 3 lines of code, it is totally personal taste.

> https://patchwork.kernel.org/project/netdevbpf/patch/20210711050007.1200-1-xiangxia.m.yue@gmail.com/

I did review it. Like I said, %p does not work. Have you tested your
patches? ;)

Thanks.
Yunsheng Lin July 12, 2021, 3:01 a.m. UTC | #3
On 2021/7/12 3:03, Cong Wang wrote:
> From: Qitao Xu <qitao.xu@bytedance.com>
> 
> Tracepoint trace_qdisc_enqueue() is introduced to trace skb at
> the entrance of TC layer on TX side. This is kinda symmetric to
> trace_qdisc_dequeue(), and together they can be used to calculate
> the packet queueing latency. It is more accurate than
> trace_net_dev_queue(), because we already successfully enqueue
> the packet at that point.
> 
> Note, trace ring buffer is only accessible to privileged users,
> it is safe to use %px to print a real kernel address here.
> 
> Reviewed-by: Cong Wang <cong.wang@bytedance.com>
> Cc: Jamal Hadi Salim <jhs@mojatatu.com>
> Cc: Jiri Pirko <jiri@resnulli.us>
> Signed-off-by: Qitao Xu <qitao.xu@bytedance.com>
> ---
>  include/trace/events/qdisc.h | 26 ++++++++++++++++++++++++++
>  net/core/dev.c               |  9 +++++++++
>  2 files changed, 35 insertions(+)
> 
> diff --git a/include/trace/events/qdisc.h b/include/trace/events/qdisc.h
> index 58209557cb3a..c3006c6b4a87 100644
> --- a/include/trace/events/qdisc.h
> +++ b/include/trace/events/qdisc.h
> @@ -46,6 +46,32 @@ TRACE_EVENT(qdisc_dequeue,
>  		  __entry->txq_state, __entry->packets, __entry->skbaddr )
>  );
>  
> +TRACE_EVENT(qdisc_enqueue,
> +
> +	TP_PROTO(struct Qdisc *qdisc, const struct netdev_queue *txq, struct sk_buff *skb),
> +
> +	TP_ARGS(qdisc, txq, skb),
> +
> +	TP_STRUCT__entry(
> +		__field(struct Qdisc *, qdisc)
> +		__field(void *,	skbaddr)
> +		__field(int, ifindex)
> +		__field(u32, handle)
> +		__field(u32, parent)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->qdisc = qdisc;
> +		__entry->skbaddr = skb;
> +		__entry->ifindex = txq->dev ? txq->dev->ifindex : 0;
> +		__entry->handle	 = qdisc->handle;
> +		__entry->parent	 = qdisc->parent;
> +	),
> +
> +	TP_printk("enqueue ifindex=%d qdisc handle=0x%X parent=0x%X skbaddr=%px",
> +		  __entry->ifindex, __entry->handle, __entry->parent, __entry->skbaddr)
> +);
> +
>  TRACE_EVENT(qdisc_reset,
>  
>  	TP_PROTO(struct Qdisc *q),
> diff --git a/net/core/dev.c b/net/core/dev.c
> index c253c2aafe97..20b9376de301 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -131,6 +131,7 @@
>  #include <trace/events/napi.h>
>  #include <trace/events/net.h>
>  #include <trace/events/skb.h>
> +#include <trace/events/qdisc.h>
>  #include <linux/inetdevice.h>
>  #include <linux/cpu_rmap.h>
>  #include <linux/static_key.h>
> @@ -3864,6 +3865,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
>  			if (unlikely(!nolock_qdisc_is_empty(q))) {
>  				rc = q->enqueue(skb, q, &to_free) &
>  					NET_XMIT_MASK;
> +				if (rc == NET_XMIT_SUCCESS)

If NET_XMIT_CN is returned, the skb seems to be enqueued too?

Also instead of checking the rc before calling the trace_*, maybe
it make more sense to add the rc to the tracepoint, so that the checking
is avoided, and we are able to tell the enqueuing result of a specific skb
from that tracepoint too.

> +					trace_qdisc_enqueue(q, txq, skb);

Does it make sense to wrap the about to something like:

int sch_enqueue(....)
{
	rc = q->enqueue(skb, q, &to_free)..
	....
	trace_qdisc_enqueue(q, txq, skb);
}

So that the below code can reuse that wrapper too.

>  				__qdisc_run(q);
>  				qdisc_run_end(q);
>  
> @@ -3880,6 +3883,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
>  		}
>  
>  		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
> +		if (rc == NET_XMIT_SUCCESS)
> +			trace_qdisc_enqueue(q, txq, skb);
> +
>  		qdisc_run(q);
>  
>  no_lock_out:
> @@ -3924,6 +3930,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
>  		rc = NET_XMIT_SUCCESS;
>  	} else {
>  		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
> +		if (rc == NET_XMIT_SUCCESS)
> +			trace_qdisc_enqueue(q, txq, skb);
> +
>  		if (qdisc_run_begin(q)) {
>  			if (unlikely(contended)) {
>  				spin_unlock(&q->busylock);
>
Tonghao Zhang July 12, 2021, 3:01 a.m. UTC | #4
On Mon, Jul 12, 2021 at 9:47 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> On Sun, Jul 11, 2021 at 5:50 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
> >
> > On Mon, Jul 12, 2021 at 3:03 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > >
> > > From: Qitao Xu <qitao.xu@bytedance.com>
> > >
> > > Tracepoint trace_qdisc_enqueue() is introduced to trace skb at
> > > the entrance of TC layer on TX side. This is kinda symmetric to
> > > trace_qdisc_dequeue(), and together they can be used to calculate
> > > the packet queueing latency. It is more accurate than
> > > trace_net_dev_queue(), because we already successfully enqueue
> > > the packet at that point.
> > >
> > > Note, trace ring buffer is only accessible to privileged users,
> > > it is safe to use %px to print a real kernel address here.
> > >
> > > Reviewed-by: Cong Wang <cong.wang@bytedance.com>
> > > Cc: Jamal Hadi Salim <jhs@mojatatu.com>
> > > Cc: Jiri Pirko <jiri@resnulli.us>
> > > Signed-off-by: Qitao Xu <qitao.xu@bytedance.com>
> > > ---
> > >  include/trace/events/qdisc.h | 26 ++++++++++++++++++++++++++
> > >  net/core/dev.c               |  9 +++++++++
> > >  2 files changed, 35 insertions(+)
> > >
> > > diff --git a/include/trace/events/qdisc.h b/include/trace/events/qdisc.h
> > > index 58209557cb3a..c3006c6b4a87 100644
> > > --- a/include/trace/events/qdisc.h
> > > +++ b/include/trace/events/qdisc.h
> > > @@ -46,6 +46,32 @@ TRACE_EVENT(qdisc_dequeue,
> > >                   __entry->txq_state, __entry->packets, __entry->skbaddr )
> > >  );
> > >
> > > +TRACE_EVENT(qdisc_enqueue,
> > > +
> > > +       TP_PROTO(struct Qdisc *qdisc, const struct netdev_queue *txq, struct sk_buff *skb),
> > > +
> > > +       TP_ARGS(qdisc, txq, skb),
> > > +
> > > +       TP_STRUCT__entry(
> > > +               __field(struct Qdisc *, qdisc)
> > > +               __field(void *, skbaddr)
> > > +               __field(int, ifindex)
> > > +               __field(u32, handle)
> > > +               __field(u32, parent)
> > > +       ),
> > > +
> > > +       TP_fast_assign(
> > > +               __entry->qdisc = qdisc;
> > > +               __entry->skbaddr = skb;
> > > +               __entry->ifindex = txq->dev ? txq->dev->ifindex : 0;
> > > +               __entry->handle  = qdisc->handle;
> > > +               __entry->parent  = qdisc->parent;
> > > +       ),
> > Hi qitao, cong
> > Why not support the txq, we get more info from txq.
>
> Because we only want to calculate queueing latency, not anything
> else. If you need it, you are welcome to add anything reasonable
> in the future, it won't break ABI (see 3dd344ea84e122f791ab).
Thanks!
> > and we should take care of the return value of q->enqueue, because we
> > can know what happens in the qdisc queue(not necessary to work with
> > qdisc:dequeue).
> > and we can use a tracepoint filter for the return value too.
>
> Disagree. Because we really have no interest in dropped packets.
> Even if we really do, we could trace kfree_skb(), not really here.
The qdisc returns not only the NET_XMIT_DROP, right ?
skbprio_enqueue, sfq_enqueue and red_enqueue may return the NET_XMIT_CN.


> > we should introduce a new function to instead of now codes, that may
> > make the codes clean.  Please review my patch for more info.
>
> Just 3 lines of code, it is totally personal taste.
>
> > https://patchwork.kernel.org/project/netdevbpf/patch/20210711050007.1200-1-xiangxia.m.yue@gmail.com/
>
> I did review it. Like I said, %p does not work. Have you tested your
> patches? ;)
Yes, I tested them, I didn't find the error.  my patch is based on
commit id 89212e160b81e778f829b89743570665810e3b13
> Thanks.
Tonghao Zhang July 12, 2021, 3:06 a.m. UTC | #5
On Mon, Jul 12, 2021 at 11:01 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2021/7/12 3:03, Cong Wang wrote:
> > From: Qitao Xu <qitao.xu@bytedance.com>
> >
> > Tracepoint trace_qdisc_enqueue() is introduced to trace skb at
> > the entrance of TC layer on TX side. This is kinda symmetric to
> > trace_qdisc_dequeue(), and together they can be used to calculate
> > the packet queueing latency. It is more accurate than
> > trace_net_dev_queue(), because we already successfully enqueue
> > the packet at that point.
> >
> > Note, trace ring buffer is only accessible to privileged users,
> > it is safe to use %px to print a real kernel address here.
> >
> > Reviewed-by: Cong Wang <cong.wang@bytedance.com>
> > Cc: Jamal Hadi Salim <jhs@mojatatu.com>
> > Cc: Jiri Pirko <jiri@resnulli.us>
> > Signed-off-by: Qitao Xu <qitao.xu@bytedance.com>
> > ---
> >  include/trace/events/qdisc.h | 26 ++++++++++++++++++++++++++
> >  net/core/dev.c               |  9 +++++++++
> >  2 files changed, 35 insertions(+)
> >
> > diff --git a/include/trace/events/qdisc.h b/include/trace/events/qdisc.h
> > index 58209557cb3a..c3006c6b4a87 100644
> > --- a/include/trace/events/qdisc.h
> > +++ b/include/trace/events/qdisc.h
> > @@ -46,6 +46,32 @@ TRACE_EVENT(qdisc_dequeue,
> >                 __entry->txq_state, __entry->packets, __entry->skbaddr )
> >  );
> >
> > +TRACE_EVENT(qdisc_enqueue,
> > +
> > +     TP_PROTO(struct Qdisc *qdisc, const struct netdev_queue *txq, struct sk_buff *skb),
> > +
> > +     TP_ARGS(qdisc, txq, skb),
> > +
> > +     TP_STRUCT__entry(
> > +             __field(struct Qdisc *, qdisc)
> > +             __field(void *, skbaddr)
> > +             __field(int, ifindex)
> > +             __field(u32, handle)
> > +             __field(u32, parent)
> > +     ),
> > +
> > +     TP_fast_assign(
> > +             __entry->qdisc = qdisc;
> > +             __entry->skbaddr = skb;
> > +             __entry->ifindex = txq->dev ? txq->dev->ifindex : 0;
> > +             __entry->handle  = qdisc->handle;
> > +             __entry->parent  = qdisc->parent;
> > +     ),
> > +
> > +     TP_printk("enqueue ifindex=%d qdisc handle=0x%X parent=0x%X skbaddr=%px",
> > +               __entry->ifindex, __entry->handle, __entry->parent, __entry->skbaddr)
> > +);
> > +
> >  TRACE_EVENT(qdisc_reset,
> >
> >       TP_PROTO(struct Qdisc *q),
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index c253c2aafe97..20b9376de301 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -131,6 +131,7 @@
> >  #include <trace/events/napi.h>
> >  #include <trace/events/net.h>
> >  #include <trace/events/skb.h>
> > +#include <trace/events/qdisc.h>
> >  #include <linux/inetdevice.h>
> >  #include <linux/cpu_rmap.h>
> >  #include <linux/static_key.h>
> > @@ -3864,6 +3865,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
> >                       if (unlikely(!nolock_qdisc_is_empty(q))) {
> >                               rc = q->enqueue(skb, q, &to_free) &
> >                                       NET_XMIT_MASK;
> > +                             if (rc == NET_XMIT_SUCCESS)
>
> If NET_XMIT_CN is returned, the skb seems to be enqueued too?
>
> Also instead of checking the rc before calling the trace_*, maybe
> it make more sense to add the rc to the tracepoint, so that the checking
> is avoided, and we are able to tell the enqueuing result of a specific skb
> from that tracepoint too.
Yes, I will fix it.
> > +                                     trace_qdisc_enqueue(q, txq, skb);
>
> Does it make sense to wrap the about to something like:
>
> int sch_enqueue(....)
> {
>         rc = q->enqueue(skb, q, &to_free)..
>         ....
>         trace_qdisc_enqueue(q, txq, skb);
> }
Yes, I agree, my patch uses qdisc_enqueue_skb, because __dev_xmit_skb
invoke the qdisc_xxx api.

https://patchwork.kernel.org/project/netdevbpf/patch/20210711050007.1200-1-xiangxia.m.yue@gmail.com/
> So that the below code can reuse that wrapper too.
>
> >                               __qdisc_run(q);
> >                               qdisc_run_end(q);
> >
> > @@ -3880,6 +3883,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
> >               }
> >
> >               rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
> > +             if (rc == NET_XMIT_SUCCESS)
> > +                     trace_qdisc_enqueue(q, txq, skb);
> > +
> >               qdisc_run(q);
> >
> >  no_lock_out:
> > @@ -3924,6 +3930,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
> >               rc = NET_XMIT_SUCCESS;
> >       } else {
> >               rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
> > +             if (rc == NET_XMIT_SUCCESS)
> > +                     trace_qdisc_enqueue(q, txq, skb);
> > +
> >               if (qdisc_run_begin(q)) {
> >                       if (unlikely(contended)) {
> >                               spin_unlock(&q->busylock);
> >
Cong Wang July 12, 2021, 3:23 a.m. UTC | #6
On Sun, Jul 11, 2021 at 8:02 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
>
> On Mon, Jul 12, 2021 at 9:47 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > On Sun, Jul 11, 2021 at 5:50 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
> > >
> > > On Mon, Jul 12, 2021 at 3:03 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > >
> > > > From: Qitao Xu <qitao.xu@bytedance.com>
> > > >
> > > > Tracepoint trace_qdisc_enqueue() is introduced to trace skb at
> > > > the entrance of TC layer on TX side. This is kinda symmetric to
> > > > trace_qdisc_dequeue(), and together they can be used to calculate
> > > > the packet queueing latency. It is more accurate than
> > > > trace_net_dev_queue(), because we already successfully enqueue
> > > > the packet at that point.
> > > >
> > > > Note, trace ring buffer is only accessible to privileged users,
> > > > it is safe to use %px to print a real kernel address here.
> > > >
> > > > Reviewed-by: Cong Wang <cong.wang@bytedance.com>
> > > > Cc: Jamal Hadi Salim <jhs@mojatatu.com>
> > > > Cc: Jiri Pirko <jiri@resnulli.us>
> > > > Signed-off-by: Qitao Xu <qitao.xu@bytedance.com>
> > > > ---
> > > >  include/trace/events/qdisc.h | 26 ++++++++++++++++++++++++++
> > > >  net/core/dev.c               |  9 +++++++++
> > > >  2 files changed, 35 insertions(+)
> > > >
> > > > diff --git a/include/trace/events/qdisc.h b/include/trace/events/qdisc.h
> > > > index 58209557cb3a..c3006c6b4a87 100644
> > > > --- a/include/trace/events/qdisc.h
> > > > +++ b/include/trace/events/qdisc.h
> > > > @@ -46,6 +46,32 @@ TRACE_EVENT(qdisc_dequeue,
> > > >                   __entry->txq_state, __entry->packets, __entry->skbaddr )
> > > >  );
> > > >
> > > > +TRACE_EVENT(qdisc_enqueue,
> > > > +
> > > > +       TP_PROTO(struct Qdisc *qdisc, const struct netdev_queue *txq, struct sk_buff *skb),
> > > > +
> > > > +       TP_ARGS(qdisc, txq, skb),
> > > > +
> > > > +       TP_STRUCT__entry(
> > > > +               __field(struct Qdisc *, qdisc)
> > > > +               __field(void *, skbaddr)
> > > > +               __field(int, ifindex)
> > > > +               __field(u32, handle)
> > > > +               __field(u32, parent)
> > > > +       ),
> > > > +
> > > > +       TP_fast_assign(
> > > > +               __entry->qdisc = qdisc;
> > > > +               __entry->skbaddr = skb;
> > > > +               __entry->ifindex = txq->dev ? txq->dev->ifindex : 0;
> > > > +               __entry->handle  = qdisc->handle;
> > > > +               __entry->parent  = qdisc->parent;
> > > > +       ),
> > > Hi qitao, cong
> > > Why not support the txq, we get more info from txq.
> >
> > Because we only want to calculate queueing latency, not anything
> > else. If you need it, you are welcome to add anything reasonable
> > in the future, it won't break ABI (see 3dd344ea84e122f791ab).
> Thanks!
> > > and we should take care of the return value of q->enqueue, because we
> > > can know what happens in the qdisc queue(not necessary to work with
> > > qdisc:dequeue).
> > > and we can use a tracepoint filter for the return value too.
> >
> > Disagree. Because we really have no interest in dropped packets.
> > Even if we really do, we could trace kfree_skb(), not really here.
> The qdisc returns not only the NET_XMIT_DROP, right ?
> skbprio_enqueue, sfq_enqueue and red_enqueue may return the NET_XMIT_CN.
>

Sure, in that case a different packet is dropped, once again you
can trace it with kfree_skb() if you want. What's the problem?

>
> > > we should introduce a new function to instead of now codes, that may
> > > make the codes clean.  Please review my patch for more info.
> >
> > Just 3 lines of code, it is totally personal taste.
> >
> > > https://patchwork.kernel.org/project/netdevbpf/patch/20210711050007.1200-1-xiangxia.m.yue@gmail.com/
> >
> > I did review it. Like I said, %p does not work. Have you tested your
> > patches? ;)
> Yes, I tested them, I didn't find the error.  my patch is based on
> commit id 89212e160b81e778f829b89743570665810e3b13

I seriously doubt it, because we actually used %p in the beginning
too and got the same address for two different packets, this is why
we have to move to %px. It is 100% reproducible, so it probably
means you didn't test it at all.

Thanks.
Cong Wang July 12, 2021, 3:34 a.m. UTC | #7
On Sun, Jul 11, 2021 at 8:01 PM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2021/7/12 3:03, Cong Wang wrote:
> > From: Qitao Xu <qitao.xu@bytedance.com>
> >
> > Tracepoint trace_qdisc_enqueue() is introduced to trace skb at
> > the entrance of TC layer on TX side. This is kinda symmetric to
> > trace_qdisc_dequeue(), and together they can be used to calculate
> > the packet queueing latency. It is more accurate than
> > trace_net_dev_queue(), because we already successfully enqueue
> > the packet at that point.
> >
> > Note, trace ring buffer is only accessible to privileged users,
> > it is safe to use %px to print a real kernel address here.
> >
> > Reviewed-by: Cong Wang <cong.wang@bytedance.com>
> > Cc: Jamal Hadi Salim <jhs@mojatatu.com>
> > Cc: Jiri Pirko <jiri@resnulli.us>
> > Signed-off-by: Qitao Xu <qitao.xu@bytedance.com>
> > ---
> >  include/trace/events/qdisc.h | 26 ++++++++++++++++++++++++++
> >  net/core/dev.c               |  9 +++++++++
> >  2 files changed, 35 insertions(+)
> >
> > diff --git a/include/trace/events/qdisc.h b/include/trace/events/qdisc.h
> > index 58209557cb3a..c3006c6b4a87 100644
> > --- a/include/trace/events/qdisc.h
> > +++ b/include/trace/events/qdisc.h
> > @@ -46,6 +46,32 @@ TRACE_EVENT(qdisc_dequeue,
> >                 __entry->txq_state, __entry->packets, __entry->skbaddr )
> >  );
> >
> > +TRACE_EVENT(qdisc_enqueue,
> > +
> > +     TP_PROTO(struct Qdisc *qdisc, const struct netdev_queue *txq, struct sk_buff *skb),
> > +
> > +     TP_ARGS(qdisc, txq, skb),
> > +
> > +     TP_STRUCT__entry(
> > +             __field(struct Qdisc *, qdisc)
> > +             __field(void *, skbaddr)
> > +             __field(int, ifindex)
> > +             __field(u32, handle)
> > +             __field(u32, parent)
> > +     ),
> > +
> > +     TP_fast_assign(
> > +             __entry->qdisc = qdisc;
> > +             __entry->skbaddr = skb;
> > +             __entry->ifindex = txq->dev ? txq->dev->ifindex : 0;
> > +             __entry->handle  = qdisc->handle;
> > +             __entry->parent  = qdisc->parent;
> > +     ),
> > +
> > +     TP_printk("enqueue ifindex=%d qdisc handle=0x%X parent=0x%X skbaddr=%px",
> > +               __entry->ifindex, __entry->handle, __entry->parent, __entry->skbaddr)
> > +);
> > +
> >  TRACE_EVENT(qdisc_reset,
> >
> >       TP_PROTO(struct Qdisc *q),
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index c253c2aafe97..20b9376de301 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -131,6 +131,7 @@
> >  #include <trace/events/napi.h>
> >  #include <trace/events/net.h>
> >  #include <trace/events/skb.h>
> > +#include <trace/events/qdisc.h>
> >  #include <linux/inetdevice.h>
> >  #include <linux/cpu_rmap.h>
> >  #include <linux/static_key.h>
> > @@ -3864,6 +3865,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
> >                       if (unlikely(!nolock_qdisc_is_empty(q))) {
> >                               rc = q->enqueue(skb, q, &to_free) &
> >                                       NET_XMIT_MASK;
> > +                             if (rc == NET_XMIT_SUCCESS)
>
> If NET_XMIT_CN is returned, the skb seems to be enqueued too?

Sure. See the other reply from on why dropped packets are not
interesting here.

>
> Also instead of checking the rc before calling the trace_*, maybe
> it make more sense to add the rc to the tracepoint, so that the checking
> is avoided, and we are able to tell the enqueuing result of a specific skb
> from that tracepoint too.

Totally disagree, because trace_qdisc_dequeue() is only called for
successful cases too (see dequeue_skb()), it does not make sense
to let them be different.

>
> > +                                     trace_qdisc_enqueue(q, txq, skb);
>
> Does it make sense to wrap the about to something like:

Nope. Because ->enqueue() is called by lower layer qdisc's
too, but here we only want to track root, aka, entrance of TC.
I know this may be confusing, please blame trace_qdisc_dequeue()
which only tracks the exit. ;)

Thanks.
Tonghao Zhang July 12, 2021, 3:35 a.m. UTC | #8
On Mon, Jul 12, 2021 at 11:23 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> On Sun, Jul 11, 2021 at 8:02 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
> >
> > On Mon, Jul 12, 2021 at 9:47 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > >
> > > On Sun, Jul 11, 2021 at 5:50 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
> > > >
> > > > On Mon, Jul 12, 2021 at 3:03 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > > >
> > > > > From: Qitao Xu <qitao.xu@bytedance.com>
> > > > >
> > > > > Tracepoint trace_qdisc_enqueue() is introduced to trace skb at
> > > > > the entrance of TC layer on TX side. This is kinda symmetric to
> > > > > trace_qdisc_dequeue(), and together they can be used to calculate
> > > > > the packet queueing latency. It is more accurate than
> > > > > trace_net_dev_queue(), because we already successfully enqueue
> > > > > the packet at that point.
> > > > >
> > > > > Note, trace ring buffer is only accessible to privileged users,
> > > > > it is safe to use %px to print a real kernel address here.
> > > > >
> > > > > Reviewed-by: Cong Wang <cong.wang@bytedance.com>
> > > > > Cc: Jamal Hadi Salim <jhs@mojatatu.com>
> > > > > Cc: Jiri Pirko <jiri@resnulli.us>
> > > > > Signed-off-by: Qitao Xu <qitao.xu@bytedance.com>
> > > > > ---
> > > > >  include/trace/events/qdisc.h | 26 ++++++++++++++++++++++++++
> > > > >  net/core/dev.c               |  9 +++++++++
> > > > >  2 files changed, 35 insertions(+)
> > > > >
> > > > > diff --git a/include/trace/events/qdisc.h b/include/trace/events/qdisc.h
> > > > > index 58209557cb3a..c3006c6b4a87 100644
> > > > > --- a/include/trace/events/qdisc.h
> > > > > +++ b/include/trace/events/qdisc.h
> > > > > @@ -46,6 +46,32 @@ TRACE_EVENT(qdisc_dequeue,
> > > > >                   __entry->txq_state, __entry->packets, __entry->skbaddr )
> > > > >  );
> > > > >
> > > > > +TRACE_EVENT(qdisc_enqueue,
> > > > > +
> > > > > +       TP_PROTO(struct Qdisc *qdisc, const struct netdev_queue *txq, struct sk_buff *skb),
> > > > > +
> > > > > +       TP_ARGS(qdisc, txq, skb),
> > > > > +
> > > > > +       TP_STRUCT__entry(
> > > > > +               __field(struct Qdisc *, qdisc)
> > > > > +               __field(void *, skbaddr)
> > > > > +               __field(int, ifindex)
> > > > > +               __field(u32, handle)
> > > > > +               __field(u32, parent)
> > > > > +       ),
> > > > > +
> > > > > +       TP_fast_assign(
> > > > > +               __entry->qdisc = qdisc;
> > > > > +               __entry->skbaddr = skb;
> > > > > +               __entry->ifindex = txq->dev ? txq->dev->ifindex : 0;
> > > > > +               __entry->handle  = qdisc->handle;
> > > > > +               __entry->parent  = qdisc->parent;
> > > > > +       ),
> > > > Hi qitao, cong
> > > > Why not support the txq, we get more info from txq.
> > >
> > > Because we only want to calculate queueing latency, not anything
> > > else. If you need it, you are welcome to add anything reasonable
> > > in the future, it won't break ABI (see 3dd344ea84e122f791ab).
> > Thanks!
> > > > and we should take care of the return value of q->enqueue, because we
> > > > can know what happens in the qdisc queue(not necessary to work with
> > > > qdisc:dequeue).
> > > > and we can use a tracepoint filter for the return value too.
> > >
> > > Disagree. Because we really have no interest in dropped packets.
> > > Even if we really do, we could trace kfree_skb(), not really here.
> > The qdisc returns not only the NET_XMIT_DROP, right ?
> > skbprio_enqueue, sfq_enqueue and red_enqueue may return the NET_XMIT_CN.
> >
>
> Sure, in that case a different packet is dropped, once again you
> can trace it with kfree_skb() if you want. What's the problem?
It's ok, but we can make it better. Yunsheng Lin may have explained why?
> >
> > > > we should introduce a new function to instead of now codes, that may
> > > > make the codes clean.  Please review my patch for more info.
> > >
> > > Just 3 lines of code, it is totally personal taste.
> > >
> > > > https://patchwork.kernel.org/project/netdevbpf/patch/20210711050007.1200-1-xiangxia.m.yue@gmail.com/
> > >
> > > I did review it. Like I said, %p does not work. Have you tested your
> > > patches? ;)
> > Yes, I tested them, I didn't find the error.  my patch is based on
> > commit id 89212e160b81e778f829b89743570665810e3b13
>
> I seriously doubt it, because we actually used %p in the beginning
> too and got the same address for two different packets, this is why
> we have to move to %px. It is 100% reproducible, so it probably
> means you didn't test it at all.
I use iperf to send the packets, not found the same address really

          <idle>-0       [043] ..s.    62.493634: qdisc_enqueue:
enqueue ifindex=2 qdisc handle=0x0 parent=0x2C
skbaddr=00000000a40f93fb ret=0
          <idle>-0       [043] ..s.    62.494641: qdisc_enqueue:
enqueue ifindex=2 qdisc handle=0x0 parent=0x20
skbaddr=00000000c3c53e95 ret=0
          <idle>-0       [014] ..s.    64.473877: qdisc_enqueue:
enqueue ifindex=2 qdisc handle=0x0 parent=0x20
skbaddr=00000000ad610424 ret=0
          <idle>-0       [014] ..s.    64.473896: qdisc_enqueue:
enqueue ifindex=2 qdisc handle=0x0 parent=0x20
skbaddr=00000000112a562d ret=0

>
> Thanks.
Cong Wang July 12, 2021, 3:39 a.m. UTC | #9
On Sun, Jul 11, 2021 at 8:36 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
>
> On Mon, Jul 12, 2021 at 11:23 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > Sure, in that case a different packet is dropped, once again you
> > can trace it with kfree_skb() if you want. What's the problem?
> It's ok, but we can make it better. Yunsheng Lin may have explained why?

Why it is better to trace dropped packets both in enqueue and in kfree_skb()?
I fail to see it. You are just asking for duplications. If you do not see it by
yourself, it means you don't understand or need it at all. ;)

Thanks.
Tonghao Zhang July 12, 2021, 3:48 a.m. UTC | #10
On Mon, Jul 12, 2021 at 11:39 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> On Sun, Jul 11, 2021 at 8:36 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
> >
> > On Mon, Jul 12, 2021 at 11:23 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > Sure, in that case a different packet is dropped, once again you
> > > can trace it with kfree_skb() if you want. What's the problem?
> > It's ok, but we can make it better. Yunsheng Lin may have explained why?
>
> Why it is better to trace dropped packets both in enqueue and in kfree_skb()?
I mean we can use one tracepoint to know what happened in the queue,
not necessary to trace enqueue and  kfree_skb()
If so, we must match when the packet is dropped and what packets. if
we use the return value in trace_qdisc_requeue. It
is easy to know what happened(when, where, what packets were dropped ).

> I fail to see it. You are just asking for duplications. If you do not see it by
> yourself, it means you don't understand or need it at all. ;)
I added the tracepoint in centos 8 4.18 kernel version in our servers
for a long time.
> Thanks.
Cong Wang July 12, 2021, 4:02 a.m. UTC | #11
On Sun, Jul 11, 2021 at 8:49 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
>
> On Mon, Jul 12, 2021 at 11:39 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > On Sun, Jul 11, 2021 at 8:36 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
> > >
> > > On Mon, Jul 12, 2021 at 11:23 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > > Sure, in that case a different packet is dropped, once again you
> > > > can trace it with kfree_skb() if you want. What's the problem?
> > > It's ok, but we can make it better. Yunsheng Lin may have explained why?
> >
> > Why it is better to trace dropped packets both in enqueue and in kfree_skb()?
> I mean we can use one tracepoint to know what happened in the queue,
> not necessary to trace enqueue and  kfree_skb()

This is wrong, packets can be dropped for other reasons too, tracing
enqueue is clearly not sufficient even if you trace it unconditionally.
For a quick example, codel drops packets at dequeue rather than
enqueue. ;)

> If so, we must match when the packet is dropped and what packets. if
> we use the return value in trace_qdisc_requeue. It
> is easy to know what happened(when, where, what packets were dropped ).

I am afraid you have to watch the dropped packets.

Thanks.
Tonghao Zhang July 12, 2021, 4:12 a.m. UTC | #12
On Mon, Jul 12, 2021 at 12:02 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> On Sun, Jul 11, 2021 at 8:49 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
> >
> > On Mon, Jul 12, 2021 at 11:39 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > >
> > > On Sun, Jul 11, 2021 at 8:36 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
> > > >
> > > > On Mon, Jul 12, 2021 at 11:23 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > > > Sure, in that case a different packet is dropped, once again you
> > > > > can trace it with kfree_skb() if you want. What's the problem?
> > > > It's ok, but we can make it better. Yunsheng Lin may have explained why?
> > >
> > > Why it is better to trace dropped packets both in enqueue and in kfree_skb()?
> > I mean we can use one tracepoint to know what happened in the queue,
> > not necessary to trace enqueue and  kfree_skb()
>
> This is wrong, packets can be dropped for other reasons too, tracing
no matter where the packet is dropped, we should allow user to know
whether dropped in the enqueue.  and the
what the return value.
> enqueue is clearly not sufficient even if you trace it unconditionally.
> For a quick example, codel drops packets at dequeue rather than
> enqueue. ;)
>
> > If so, we must match when the packet is dropped and what packets. if
> > we use the return value in trace_qdisc_requeue. It
> > is easy to know what happened(when, where, what packets were dropped ).
>
> I am afraid you have to watch the dropped packets.
>
> Thanks.
Cong Wang July 12, 2021, 4:19 a.m. UTC | #13
On Sun, Jul 11, 2021 at 9:12 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
>
> On Mon, Jul 12, 2021 at 12:02 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > On Sun, Jul 11, 2021 at 8:49 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
> > >
> > > On Mon, Jul 12, 2021 at 11:39 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > >
> > > > On Sun, Jul 11, 2021 at 8:36 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
> > > > >
> > > > > On Mon, Jul 12, 2021 at 11:23 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > > > > Sure, in that case a different packet is dropped, once again you
> > > > > > can trace it with kfree_skb() if you want. What's the problem?
> > > > > It's ok, but we can make it better. Yunsheng Lin may have explained why?
> > > >
> > > > Why it is better to trace dropped packets both in enqueue and in kfree_skb()?
> > > I mean we can use one tracepoint to know what happened in the queue,
> > > not necessary to trace enqueue and  kfree_skb()
> >
> > This is wrong, packets can be dropped for other reasons too, tracing
> no matter where the packet is dropped, we should allow user to know
> whether dropped in the enqueue.  and the
> what the return value.

Again you can know it by kfree_skb(). And you can not avoid
kfree_skb() no matter how you change enqueue. So, I don't see your
argument of saving kfree_skb() makes any sense here.

Thanks.
Tonghao Zhang July 12, 2021, 4:40 a.m. UTC | #14
On Mon, Jul 12, 2021 at 12:20 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> On Sun, Jul 11, 2021 at 9:12 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
> >
> > On Mon, Jul 12, 2021 at 12:02 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > >
> > > On Sun, Jul 11, 2021 at 8:49 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
> > > >
> > > > On Mon, Jul 12, 2021 at 11:39 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > > >
> > > > > On Sun, Jul 11, 2021 at 8:36 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
> > > > > >
> > > > > > On Mon, Jul 12, 2021 at 11:23 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > > > > > Sure, in that case a different packet is dropped, once again you
> > > > > > > can trace it with kfree_skb() if you want. What's the problem?
> > > > > > It's ok, but we can make it better. Yunsheng Lin may have explained why?
> > > > >
> > > > > Why it is better to trace dropped packets both in enqueue and in kfree_skb()?
> > > > I mean we can use one tracepoint to know what happened in the queue,
> > > > not necessary to trace enqueue and  kfree_skb()
> > >
> > > This is wrong, packets can be dropped for other reasons too, tracing
> > no matter where the packet is dropped, we should allow user to know
> > whether dropped in the enqueue.  and the
> > what the return value.
>
> Again you can know it by kfree_skb(). And you can not avoid
> kfree_skb() no matter how you change enqueue. So, I don't see your
No, If I know what value returned for specified qdisc , I can know
what happened, not necessarily kfree_skb()
> argument of saving kfree_skb() makes any sense here.
>
> Thanks.
Cong Wang July 12, 2021, 4:45 a.m. UTC | #15
On Sun, Jul 11, 2021 at 9:41 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
>
> On Mon, Jul 12, 2021 at 12:20 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > On Sun, Jul 11, 2021 at 9:12 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
> > >
> > > On Mon, Jul 12, 2021 at 12:02 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > >
> > > > On Sun, Jul 11, 2021 at 8:49 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
> > > > >
> > > > > On Mon, Jul 12, 2021 at 11:39 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > > > >
> > > > > > On Sun, Jul 11, 2021 at 8:36 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
> > > > > > >
> > > > > > > On Mon, Jul 12, 2021 at 11:23 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > > > > > > Sure, in that case a different packet is dropped, once again you
> > > > > > > > can trace it with kfree_skb() if you want. What's the problem?
> > > > > > > It's ok, but we can make it better. Yunsheng Lin may have explained why?
> > > > > >
> > > > > > Why it is better to trace dropped packets both in enqueue and in kfree_skb()?
> > > > > I mean we can use one tracepoint to know what happened in the queue,
> > > > > not necessary to trace enqueue and  kfree_skb()
> > > >
> > > > This is wrong, packets can be dropped for other reasons too, tracing
> > > no matter where the packet is dropped, we should allow user to know
> > > whether dropped in the enqueue.  and the
> > > what the return value.
> >
> > Again you can know it by kfree_skb(). And you can not avoid
> > kfree_skb() no matter how you change enqueue. So, I don't see your
> No, If I know what value returned for specified qdisc , I can know
> what happened, not necessarily kfree_skb()

This is wrong. You have to trace dropped packets because you
need to know when to delete the key (skb address) from the hashtable
you use to calculate the latency. You save the key on enqueue and
remove it on both dequeue and kfree_skb, the only difference is you
only need to calculate the timestamp difference for the former.

Thanks.
Tonghao Zhang July 13, 2021, 1:56 a.m. UTC | #16
On Mon, Jul 12, 2021 at 12:45 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> On Sun, Jul 11, 2021 at 9:41 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
> >
> > On Mon, Jul 12, 2021 at 12:20 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > >
> > > On Sun, Jul 11, 2021 at 9:12 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
> > > >
> > > > On Mon, Jul 12, 2021 at 12:02 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > > >
> > > > > On Sun, Jul 11, 2021 at 8:49 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
> > > > > >
> > > > > > On Mon, Jul 12, 2021 at 11:39 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > > > > >
> > > > > > > On Sun, Jul 11, 2021 at 8:36 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
> > > > > > > >
> > > > > > > > On Mon, Jul 12, 2021 at 11:23 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > > > > > > > Sure, in that case a different packet is dropped, once again you
> > > > > > > > > can trace it with kfree_skb() if you want. What's the problem?
> > > > > > > > It's ok, but we can make it better. Yunsheng Lin may have explained why?
> > > > > > >
> > > > > > > Why it is better to trace dropped packets both in enqueue and in kfree_skb()?
> > > > > > I mean we can use one tracepoint to know what happened in the queue,
> > > > > > not necessary to trace enqueue and  kfree_skb()
> > > > >
> > > > > This is wrong, packets can be dropped for other reasons too, tracing
> > > > no matter where the packet is dropped, we should allow user to know
> > > > whether dropped in the enqueue.  and the
> > > > what the return value.
> > >
> > > Again you can know it by kfree_skb(). And you can not avoid
> > > kfree_skb() no matter how you change enqueue. So, I don't see your
> > No, If I know what value returned for specified qdisc , I can know
> > what happened, not necessarily kfree_skb()
>
> This is wrong. You have to trace dropped packets because you
> need to know when to delete the key (skb address) from the hashtable
> you use to calculate the latency. You save the key on enqueue and
> remove it on both dequeue and kfree_skb, the only difference is you
No, we can set the timestamp or cb in  skb in enqueue and check them in dequeue.
we may not use the hashtable.
If we use hashtable, we still can check the return value, save them to
hashtable or not.
> only need to calculate the timestamp difference for the former.
>
> Thanks.
Cong Wang July 15, 2021, 4:28 a.m. UTC | #17
On Mon, Jul 12, 2021 at 6:57 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
>
> On Mon, Jul 12, 2021 at 12:45 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > On Sun, Jul 11, 2021 at 9:41 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
> > >
> > > On Mon, Jul 12, 2021 at 12:20 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > >
> > > > On Sun, Jul 11, 2021 at 9:12 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
> > > > >
> > > > > On Mon, Jul 12, 2021 at 12:02 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > > > >
> > > > > > On Sun, Jul 11, 2021 at 8:49 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
> > > > > > >
> > > > > > > On Mon, Jul 12, 2021 at 11:39 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > > > > > >
> > > > > > > > On Sun, Jul 11, 2021 at 8:36 PM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote:
> > > > > > > > >
> > > > > > > > > On Mon, Jul 12, 2021 at 11:23 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > > > > > > > > Sure, in that case a different packet is dropped, once again you
> > > > > > > > > > can trace it with kfree_skb() if you want. What's the problem?
> > > > > > > > > It's ok, but we can make it better. Yunsheng Lin may have explained why?
> > > > > > > >
> > > > > > > > Why it is better to trace dropped packets both in enqueue and in kfree_skb()?
> > > > > > > I mean we can use one tracepoint to know what happened in the queue,
> > > > > > > not necessary to trace enqueue and  kfree_skb()
> > > > > >
> > > > > > This is wrong, packets can be dropped for other reasons too, tracing
> > > > > no matter where the packet is dropped, we should allow user to know
> > > > > whether dropped in the enqueue.  and the
> > > > > what the return value.
> > > >
> > > > Again you can know it by kfree_skb(). And you can not avoid
> > > > kfree_skb() no matter how you change enqueue. So, I don't see your
> > > No, If I know what value returned for specified qdisc , I can know
> > > what happened, not necessarily kfree_skb()
> >
> > This is wrong. You have to trace dropped packets because you
> > need to know when to delete the key (skb address) from the hashtable
> > you use to calculate the latency. You save the key on enqueue and
> > remove it on both dequeue and kfree_skb, the only difference is you
> No, we can set the timestamp or cb in  skb in enqueue and check them in dequeue.
> we may not use the hashtable.

Are you sure this is safe?? How do you ensure we have enough space
in skb->cb[]? More importantly, why do we even modify skb in tracepoint?

> If we use hashtable, we still can check the return value, save them to
> hashtable or not.

How many times do I have to repeat the return value of enqueue is not
sufficient even if you trace it? Can't you just look at codel_dequeue()
and tell me how a drop at dequeue can be reflected to enqueue?

Thanks.
diff mbox series

Patch

diff --git a/include/trace/events/qdisc.h b/include/trace/events/qdisc.h
index 58209557cb3a..c3006c6b4a87 100644
--- a/include/trace/events/qdisc.h
+++ b/include/trace/events/qdisc.h
@@ -46,6 +46,32 @@  TRACE_EVENT(qdisc_dequeue,
 		  __entry->txq_state, __entry->packets, __entry->skbaddr )
 );
 
+TRACE_EVENT(qdisc_enqueue,
+
+	TP_PROTO(struct Qdisc *qdisc, const struct netdev_queue *txq, struct sk_buff *skb),
+
+	TP_ARGS(qdisc, txq, skb),
+
+	TP_STRUCT__entry(
+		__field(struct Qdisc *, qdisc)
+		__field(void *,	skbaddr)
+		__field(int, ifindex)
+		__field(u32, handle)
+		__field(u32, parent)
+	),
+
+	TP_fast_assign(
+		__entry->qdisc = qdisc;
+		__entry->skbaddr = skb;
+		__entry->ifindex = txq->dev ? txq->dev->ifindex : 0;
+		__entry->handle	 = qdisc->handle;
+		__entry->parent	 = qdisc->parent;
+	),
+
+	TP_printk("enqueue ifindex=%d qdisc handle=0x%X parent=0x%X skbaddr=%px",
+		  __entry->ifindex, __entry->handle, __entry->parent, __entry->skbaddr)
+);
+
 TRACE_EVENT(qdisc_reset,
 
 	TP_PROTO(struct Qdisc *q),
diff --git a/net/core/dev.c b/net/core/dev.c
index c253c2aafe97..20b9376de301 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -131,6 +131,7 @@ 
 #include <trace/events/napi.h>
 #include <trace/events/net.h>
 #include <trace/events/skb.h>
+#include <trace/events/qdisc.h>
 #include <linux/inetdevice.h>
 #include <linux/cpu_rmap.h>
 #include <linux/static_key.h>
@@ -3864,6 +3865,8 @@  static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 			if (unlikely(!nolock_qdisc_is_empty(q))) {
 				rc = q->enqueue(skb, q, &to_free) &
 					NET_XMIT_MASK;
+				if (rc == NET_XMIT_SUCCESS)
+					trace_qdisc_enqueue(q, txq, skb);
 				__qdisc_run(q);
 				qdisc_run_end(q);
 
@@ -3880,6 +3883,9 @@  static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 		}
 
 		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
+		if (rc == NET_XMIT_SUCCESS)
+			trace_qdisc_enqueue(q, txq, skb);
+
 		qdisc_run(q);
 
 no_lock_out:
@@ -3924,6 +3930,9 @@  static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 		rc = NET_XMIT_SUCCESS;
 	} else {
 		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
+		if (rc == NET_XMIT_SUCCESS)
+			trace_qdisc_enqueue(q, txq, skb);
+
 		if (qdisc_run_begin(q)) {
 			if (unlikely(contended)) {
 				spin_unlock(&q->busylock);