diff mbox series

[net-next,1/1] net/sched: Disambiguate verdict from return code

Message ID 20230919145951.352548-1-victor@mojatatu.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series [net-next,1/1] net/sched: Disambiguate verdict from return code | expand

Checks

Context Check Description
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 6441 this patch: 6441
netdev/cc_maintainers warning 3 maintainers not CCed: toke@toke.dk cake@lists.bufferbloat.net daniel@iogearbox.net
netdev/build_clang success Errors and warnings before: 2683 this patch: 2683
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 6833 this patch: 6833
netdev/checkpatch warning + /** @SKB_DROP_REASON_TC_EGRESS_ERROR: dropped in TC egress HOOK due to error */ + /** @SKB_DROP_REASON_TC_INGRESS_ERROR: dropped in TC ingress HOOK due to error */ CHECK: Comparison to NULL could be written "!cl" WARNING: line length of 87 exceeds 80 columns WARNING: line length of 89 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Victor Nogueira Sept. 19, 2023, 2:59 p.m. UTC
Currently there is no way to distinguish between an error and a
classification verdict. This patch adds the verdict field as a part of
struct tcf_result. That way, tcf_classify can return a proper
error number when it fails, and we keep the classification result
information encapsulated in struct tcf_result.

Also add values SKB_DROP_REASON_TC_EGRESS_ERROR and
SKB_DROP_REASON_TC_INGRESS_ERROR to enum skb_drop_reason.
With that we can distinguish between a drop from a processing error versus
a drop from classification.

Signed-off-by: Victor Nogueira <victor@mojatatu.com>
---
 include/net/dropreason-core.h |  6 +++++
 include/net/sch_generic.h     |  7 ++++++
 net/core/dev.c                | 42 ++++++++++++++++++++++++++---------
 net/sched/cls_api.c           | 38 ++++++++++++++++++++-----------
 net/sched/sch_cake.c          | 32 +++++++++++++-------------
 net/sched/sch_drr.c           | 33 +++++++++++++--------------
 net/sched/sch_ets.c           |  6 +++--
 net/sched/sch_fq_codel.c      | 29 ++++++++++++------------
 net/sched/sch_fq_pie.c        | 28 +++++++++++------------
 net/sched/sch_hfsc.c          |  6 +++--
 net/sched/sch_htb.c           |  6 +++--
 net/sched/sch_multiq.c        |  6 +++--
 net/sched/sch_prio.c          |  7 ++++--
 net/sched/sch_qfq.c           | 34 +++++++++++++---------------
 net/sched/sch_sfb.c           | 29 ++++++++++++------------
 net/sched/sch_sfq.c           | 28 +++++++++++------------
 16 files changed, 195 insertions(+), 142 deletions(-)

Comments

Daniel Borkmann Sept. 19, 2023, 10:15 p.m. UTC | #1
[ +Martin, bpf ]

On 9/19/23 4:59 PM, Victor Nogueira wrote:
> Currently there is no way to distinguish between an error and a
> classification verdict. This patch adds the verdict field as a part of
> struct tcf_result. That way, tcf_classify can return a proper
> error number when it fails, and we keep the classification result
> information encapsulated in struct tcf_result.
> 
> Also add values SKB_DROP_REASON_TC_EGRESS_ERROR and
> SKB_DROP_REASON_TC_INGRESS_ERROR to enum skb_drop_reason.
> With that we can distinguish between a drop from a processing error versus
> a drop from classification.
> 
> Signed-off-by: Victor Nogueira <victor@mojatatu.com>
> ---
>   include/net/dropreason-core.h |  6 +++++
>   include/net/sch_generic.h     |  7 ++++++
>   net/core/dev.c                | 42 ++++++++++++++++++++++++++---------
>   net/sched/cls_api.c           | 38 ++++++++++++++++++++-----------
>   net/sched/sch_cake.c          | 32 +++++++++++++-------------
>   net/sched/sch_drr.c           | 33 +++++++++++++--------------
>   net/sched/sch_ets.c           |  6 +++--
>   net/sched/sch_fq_codel.c      | 29 ++++++++++++------------
>   net/sched/sch_fq_pie.c        | 28 +++++++++++------------
>   net/sched/sch_hfsc.c          |  6 +++--
>   net/sched/sch_htb.c           |  6 +++--
>   net/sched/sch_multiq.c        |  6 +++--
>   net/sched/sch_prio.c          |  7 ++++--
>   net/sched/sch_qfq.c           | 34 +++++++++++++---------------
>   net/sched/sch_sfb.c           | 29 ++++++++++++------------
>   net/sched/sch_sfq.c           | 28 +++++++++++------------
>   16 files changed, 195 insertions(+), 142 deletions(-)
> 
> diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
> index a587e83fc169..b1c069c8e7f2 100644
> --- a/include/net/dropreason-core.h
> +++ b/include/net/dropreason-core.h
> @@ -80,6 +80,8 @@
>   	FN(IPV6_NDISC_BAD_OPTIONS)	\
>   	FN(IPV6_NDISC_NS_OTHERHOST)	\
>   	FN(QUEUE_PURGE)			\
> +	FN(TC_EGRESS_ERROR)		\
> +	FN(TC_INGRESS_ERROR)		\
>   	FNe(MAX)
>   
>   /**
> @@ -345,6 +347,10 @@ enum skb_drop_reason {
>   	SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST,
>   	/** @SKB_DROP_REASON_QUEUE_PURGE: bulk free. */
>   	SKB_DROP_REASON_QUEUE_PURGE,
> +	/** @SKB_DROP_REASON_TC_EGRESS_ERROR: dropped in TC egress HOOK due to error */
> +	SKB_DROP_REASON_TC_EGRESS_ERROR,
> +	/** @SKB_DROP_REASON_TC_INGRESS_ERROR: dropped in TC ingress HOOK due to error */
> +	SKB_DROP_REASON_TC_INGRESS_ERROR,
>   	/**
>   	 * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which
>   	 * shouldn't be used as a real 'reason' - only for tracing code gen
> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> index f232512505f8..9a3f71d2545e 100644
> --- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> @@ -326,6 +326,7 @@ struct Qdisc_ops {
>   
>   
>   struct tcf_result {
> +	u32 verdict;
>   	union {
>   		struct {
>   			unsigned long	class;
> @@ -336,6 +337,12 @@ struct tcf_result {
>   	};
>   };
>   
> +static inline void tcf_result_set_verdict(struct tcf_result *res,
> +					  const u32 verdict)
> +{
> +	res->verdict = verdict;
> +}
> +
>   struct tcf_chain;
>   
>   struct tcf_proto_ops {
> diff --git a/net/core/dev.c b/net/core/dev.c
> index ccff2b6ef958..1450f4741d9b 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3910,31 +3910,39 @@ EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
>   #endif /* CONFIG_NET_EGRESS */
>   
>   #ifdef CONFIG_NET_XGRESS
> -static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)
> +static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
> +		  struct tcf_result *res)
>   {
> -	int ret = TC_ACT_UNSPEC;
> +	int ret = 0;
>   #ifdef CONFIG_NET_CLS_ACT
>   	struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq);
> -	struct tcf_result res;
>   
> -	if (!miniq)
> +	if (!miniq) {
> +		tcf_result_set_verdict(res, TC_ACT_UNSPEC);
>   		return ret;
> +	}
>   
>   	tc_skb_cb(skb)->mru = 0;
>   	tc_skb_cb(skb)->post_ct = false;
>   
>   	mini_qdisc_bstats_cpu_update(miniq, skb);
> -	ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
> +	ret = tcf_classify(skb, miniq->block, miniq->filter_list, res, false);
> +	if (ret < 0) {
> +		mini_qdisc_qstats_cpu_drop(miniq);
> +		return ret;
> +	}
>   	/* Only tcf related quirks below. */
> -	switch (ret) {
> +	switch (res->verdict) {
>   	case TC_ACT_SHOT:
>   		mini_qdisc_qstats_cpu_drop(miniq);
>   		break;
>   	case TC_ACT_OK:
>   	case TC_ACT_RECLASSIFY:
> -		skb->tc_index = TC_H_MIN(res.classid);
> +		skb->tc_index = TC_H_MIN(res->classid);
>   		break;
>   	}
> +#else
> +	tcf_result_set_verdict(res, TC_ACT_UNSPEC);
>   #endif /* CONFIG_NET_CLS_ACT */
>   	return ret;
>   }
> @@ -3977,6 +3985,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
>   		   struct net_device *orig_dev, bool *another)
>   {
>   	struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
> +	struct tcf_result res = {0};
>   	int sch_ret;
>   
>   	if (!entry)
> @@ -3994,9 +4003,14 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
>   		if (sch_ret != TC_ACT_UNSPEC)
>   			goto ingress_verdict;
>   	}
> -	sch_ret = tc_run(tcx_entry(entry), skb);
> +	sch_ret = tc_run(tcx_entry(entry), skb, &res);
> +	if (sch_ret < 0) {
> +		kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS_ERROR);
> +		*ret = NET_RX_DROP;
> +		return NULL;
> +	}
>   ingress_verdict:
> -	switch (sch_ret) {
> +	switch (res.verdict) {

This breaks tcx, please move all this logic into tc_run(). No changes to sch_handle_ingress()
or sch_handle_egress should be necessary, you can then just remap the return code to TC_ACT_SHOT
in such case.

>   	case TC_ACT_REDIRECT:
>   		/* skb_mac_header check was done by BPF, so we can safely
>   		 * push the L2 header back before redirecting to another
> @@ -4032,6 +4046,7 @@ static __always_inline struct sk_buff *
>   sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
>   {
>   	struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
> +	struct tcf_result res = {0};
>   	int sch_ret;
>   
>   	if (!entry)
> @@ -4045,9 +4060,14 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
>   		if (sch_ret != TC_ACT_UNSPEC)
>   			goto egress_verdict;
>   	}
> -	sch_ret = tc_run(tcx_entry(entry), skb);
> +	sch_ret = tc_run(tcx_entry(entry), skb, &res);
> +	if (sch_ret < 0) {
> +		kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS_ERROR);
> +		*ret = NET_XMIT_DROP;
> +		return NULL;
> +	}
>   egress_verdict:
> -	switch (sch_ret) {
> +	switch (res.verdict) {
>   	case TC_ACT_REDIRECT:
>   		/* No need to push/pop skb's mac_header here on egress! */
>   		skb_do_redirect(skb);
Jamal Hadi Salim Sept. 19, 2023, 11:20 p.m. UTC | #2
On Tue, Sep 19, 2023 at 6:15 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> [ +Martin, bpf ]
>
> On 9/19/23 4:59 PM, Victor Nogueira wrote:
> > Currently there is no way to distinguish between an error and a
> > classification verdict. This patch adds the verdict field as a part of
> > struct tcf_result. That way, tcf_classify can return a proper
> > error number when it fails, and we keep the classification result
> > information encapsulated in struct tcf_result.
> >
> > Also add values SKB_DROP_REASON_TC_EGRESS_ERROR and
> > SKB_DROP_REASON_TC_INGRESS_ERROR to enum skb_drop_reason.
> > With that we can distinguish between a drop from a processing error versus
> > a drop from classification.
> >
> > Signed-off-by: Victor Nogueira <victor@mojatatu.com>
> > ---
> >   include/net/dropreason-core.h |  6 +++++
> >   include/net/sch_generic.h     |  7 ++++++
> >   net/core/dev.c                | 42 ++++++++++++++++++++++++++---------
> >   net/sched/cls_api.c           | 38 ++++++++++++++++++++-----------
> >   net/sched/sch_cake.c          | 32 +++++++++++++-------------
> >   net/sched/sch_drr.c           | 33 +++++++++++++--------------
> >   net/sched/sch_ets.c           |  6 +++--
> >   net/sched/sch_fq_codel.c      | 29 ++++++++++++------------
> >   net/sched/sch_fq_pie.c        | 28 +++++++++++------------
> >   net/sched/sch_hfsc.c          |  6 +++--
> >   net/sched/sch_htb.c           |  6 +++--
> >   net/sched/sch_multiq.c        |  6 +++--
> >   net/sched/sch_prio.c          |  7 ++++--
> >   net/sched/sch_qfq.c           | 34 +++++++++++++---------------
> >   net/sched/sch_sfb.c           | 29 ++++++++++++------------
> >   net/sched/sch_sfq.c           | 28 +++++++++++------------
> >   16 files changed, 195 insertions(+), 142 deletions(-)
> >
> > diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
> > index a587e83fc169..b1c069c8e7f2 100644
> > --- a/include/net/dropreason-core.h
> > +++ b/include/net/dropreason-core.h
> > @@ -80,6 +80,8 @@
> >       FN(IPV6_NDISC_BAD_OPTIONS)      \
> >       FN(IPV6_NDISC_NS_OTHERHOST)     \
> >       FN(QUEUE_PURGE)                 \
> > +     FN(TC_EGRESS_ERROR)             \
> > +     FN(TC_INGRESS_ERROR)            \
> >       FNe(MAX)
> >
> >   /**
> > @@ -345,6 +347,10 @@ enum skb_drop_reason {
> >       SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST,
> >       /** @SKB_DROP_REASON_QUEUE_PURGE: bulk free. */
> >       SKB_DROP_REASON_QUEUE_PURGE,
> > +     /** @SKB_DROP_REASON_TC_EGRESS_ERROR: dropped in TC egress HOOK due to error */
> > +     SKB_DROP_REASON_TC_EGRESS_ERROR,
> > +     /** @SKB_DROP_REASON_TC_INGRESS_ERROR: dropped in TC ingress HOOK due to error */
> > +     SKB_DROP_REASON_TC_INGRESS_ERROR,
> >       /**
> >        * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which
> >        * shouldn't be used as a real 'reason' - only for tracing code gen
> > diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> > index f232512505f8..9a3f71d2545e 100644
> > --- a/include/net/sch_generic.h
> > +++ b/include/net/sch_generic.h
> > @@ -326,6 +326,7 @@ struct Qdisc_ops {
> >
> >
> >   struct tcf_result {
> > +     u32 verdict;
> >       union {
> >               struct {
> >                       unsigned long   class;
> > @@ -336,6 +337,12 @@ struct tcf_result {
> >       };
> >   };
> >
> > +static inline void tcf_result_set_verdict(struct tcf_result *res,
> > +                                       const u32 verdict)
> > +{
> > +     res->verdict = verdict;
> > +}
> > +
> >   struct tcf_chain;
> >
> >   struct tcf_proto_ops {
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index ccff2b6ef958..1450f4741d9b 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -3910,31 +3910,39 @@ EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
> >   #endif /* CONFIG_NET_EGRESS */
> >
> >   #ifdef CONFIG_NET_XGRESS
> > -static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)
> > +static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
> > +               struct tcf_result *res)
> >   {
> > -     int ret = TC_ACT_UNSPEC;
> > +     int ret = 0;
> >   #ifdef CONFIG_NET_CLS_ACT
> >       struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq);
> > -     struct tcf_result res;
> >
> > -     if (!miniq)
> > +     if (!miniq) {
> > +             tcf_result_set_verdict(res, TC_ACT_UNSPEC);
> >               return ret;
> > +     }
> >
> >       tc_skb_cb(skb)->mru = 0;
> >       tc_skb_cb(skb)->post_ct = false;
> >
> >       mini_qdisc_bstats_cpu_update(miniq, skb);
> > -     ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
> > +     ret = tcf_classify(skb, miniq->block, miniq->filter_list, res, false);
> > +     if (ret < 0) {
> > +             mini_qdisc_qstats_cpu_drop(miniq);
> > +             return ret;
> > +     }
> >       /* Only tcf related quirks below. */
> > -     switch (ret) {
> > +     switch (res->verdict) {
> >       case TC_ACT_SHOT:
> >               mini_qdisc_qstats_cpu_drop(miniq);
> >               break;
> >       case TC_ACT_OK:
> >       case TC_ACT_RECLASSIFY:
> > -             skb->tc_index = TC_H_MIN(res.classid);
> > +             skb->tc_index = TC_H_MIN(res->classid);
> >               break;
> >       }
> > +#else
> > +     tcf_result_set_verdict(res, TC_ACT_UNSPEC);
> >   #endif /* CONFIG_NET_CLS_ACT */
> >       return ret;
> >   }
> > @@ -3977,6 +3985,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
> >                  struct net_device *orig_dev, bool *another)
> >   {
> >       struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
> > +     struct tcf_result res = {0};
> >       int sch_ret;
> >
> >       if (!entry)
> > @@ -3994,9 +4003,14 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
> >               if (sch_ret != TC_ACT_UNSPEC)
> >                       goto ingress_verdict;
> >       }
> > -     sch_ret = tc_run(tcx_entry(entry), skb);
> > +     sch_ret = tc_run(tcx_entry(entry), skb, &res);
> > +     if (sch_ret < 0) {
> > +             kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS_ERROR);
> > +             *ret = NET_RX_DROP;
> > +             return NULL;
> > +     }
> >   ingress_verdict:
> > -     switch (sch_ret) {
> > +     switch (res.verdict) {
>
> This breaks tcx, please move all this logic into tc_run(). No changes to sch_handle_ingress()
> or sch_handle_egress should be necessary, you can then just remap the return code to TC_ACT_SHOT
> in such case.
>

I think it is valuable to have a good reason code like
SKB_DROP_REASON_TC_XXX_ERROR to disambiguate between errors vs
verdicts in the case of tc_run() variant.
For tcx_run(), does this look ok (for consistency)?:

if (static_branch_unlikely(&tcx_needed_key)) {
                sch_ret = tcx_run(entry, skb, true);
                if (sch_ret != TC_ACT_UNSPEC) {
                        res.verdict = sch_ret;
                        goto ingress_verdict;
                }
}

cheers,
jamal

> >       case TC_ACT_REDIRECT:
> >               /* skb_mac_header check was done by BPF, so we can safely
> >                * push the L2 header back before redirecting to another
> > @@ -4032,6 +4046,7 @@ static __always_inline struct sk_buff *
> >   sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
> >   {
> >       struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
> > +     struct tcf_result res = {0};
> >       int sch_ret;
> >
> >       if (!entry)
> > @@ -4045,9 +4060,14 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
> >               if (sch_ret != TC_ACT_UNSPEC)
> >                       goto egress_verdict;
> >       }
> > -     sch_ret = tc_run(tcx_entry(entry), skb);
> > +     sch_ret = tc_run(tcx_entry(entry), skb, &res);
> > +     if (sch_ret < 0) {
> > +             kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS_ERROR);
> > +             *ret = NET_XMIT_DROP;
> > +             return NULL;
> > +     }
> >   egress_verdict:
> > -     switch (sch_ret) {
> > +     switch (res.verdict) {
> >       case TC_ACT_REDIRECT:
> >               /* No need to push/pop skb's mac_header here on egress! */
> >               skb_do_redirect(skb);
Daniel Borkmann Sept. 22, 2023, 8:12 a.m. UTC | #3
On 9/20/23 1:20 AM, Jamal Hadi Salim wrote:
> On Tue, Sep 19, 2023 at 6:15 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>>
>> [ +Martin, bpf ]
>>
>> On 9/19/23 4:59 PM, Victor Nogueira wrote:
>>> Currently there is no way to distinguish between an error and a
>>> classification verdict. This patch adds the verdict field as a part of
>>> struct tcf_result. That way, tcf_classify can return a proper
>>> error number when it fails, and we keep the classification result
>>> information encapsulated in struct tcf_result.
>>>
>>> Also add values SKB_DROP_REASON_TC_EGRESS_ERROR and
>>> SKB_DROP_REASON_TC_INGRESS_ERROR to enum skb_drop_reason.
>>> With that we can distinguish between a drop from a processing error versus
>>> a drop from classification.
>>>
>>> Signed-off-by: Victor Nogueira <victor@mojatatu.com>
>>> ---
>>>    include/net/dropreason-core.h |  6 +++++
>>>    include/net/sch_generic.h     |  7 ++++++
>>>    net/core/dev.c                | 42 ++++++++++++++++++++++++++---------
>>>    net/sched/cls_api.c           | 38 ++++++++++++++++++++-----------
>>>    net/sched/sch_cake.c          | 32 +++++++++++++-------------
>>>    net/sched/sch_drr.c           | 33 +++++++++++++--------------
>>>    net/sched/sch_ets.c           |  6 +++--
>>>    net/sched/sch_fq_codel.c      | 29 ++++++++++++------------
>>>    net/sched/sch_fq_pie.c        | 28 +++++++++++------------
>>>    net/sched/sch_hfsc.c          |  6 +++--
>>>    net/sched/sch_htb.c           |  6 +++--
>>>    net/sched/sch_multiq.c        |  6 +++--
>>>    net/sched/sch_prio.c          |  7 ++++--
>>>    net/sched/sch_qfq.c           | 34 +++++++++++++---------------
>>>    net/sched/sch_sfb.c           | 29 ++++++++++++------------
>>>    net/sched/sch_sfq.c           | 28 +++++++++++------------
>>>    16 files changed, 195 insertions(+), 142 deletions(-)
>>>
>>> diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
>>> index a587e83fc169..b1c069c8e7f2 100644
>>> --- a/include/net/dropreason-core.h
>>> +++ b/include/net/dropreason-core.h
>>> @@ -80,6 +80,8 @@
>>>        FN(IPV6_NDISC_BAD_OPTIONS)      \
>>>        FN(IPV6_NDISC_NS_OTHERHOST)     \
>>>        FN(QUEUE_PURGE)                 \
>>> +     FN(TC_EGRESS_ERROR)             \
>>> +     FN(TC_INGRESS_ERROR)            \
>>>        FNe(MAX)
>>>
>>>    /**
>>> @@ -345,6 +347,10 @@ enum skb_drop_reason {
>>>        SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST,
>>>        /** @SKB_DROP_REASON_QUEUE_PURGE: bulk free. */
>>>        SKB_DROP_REASON_QUEUE_PURGE,
>>> +     /** @SKB_DROP_REASON_TC_EGRESS_ERROR: dropped in TC egress HOOK due to error */
>>> +     SKB_DROP_REASON_TC_EGRESS_ERROR,
>>> +     /** @SKB_DROP_REASON_TC_INGRESS_ERROR: dropped in TC ingress HOOK due to error */
>>> +     SKB_DROP_REASON_TC_INGRESS_ERROR,
>>>        /**
>>>         * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which
>>>         * shouldn't be used as a real 'reason' - only for tracing code gen
>>> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
>>> index f232512505f8..9a3f71d2545e 100644
>>> --- a/include/net/sch_generic.h
>>> +++ b/include/net/sch_generic.h
>>> @@ -326,6 +326,7 @@ struct Qdisc_ops {
>>>
>>>
>>>    struct tcf_result {
>>> +     u32 verdict;
>>>        union {
>>>                struct {
>>>                        unsigned long   class;
>>> @@ -336,6 +337,12 @@ struct tcf_result {
>>>        };
>>>    };
>>>
>>> +static inline void tcf_result_set_verdict(struct tcf_result *res,
>>> +                                       const u32 verdict)
>>> +{
>>> +     res->verdict = verdict;
>>> +}
>>> +
>>>    struct tcf_chain;
>>>
>>>    struct tcf_proto_ops {
>>> diff --git a/net/core/dev.c b/net/core/dev.c
>>> index ccff2b6ef958..1450f4741d9b 100644
>>> --- a/net/core/dev.c
>>> +++ b/net/core/dev.c
>>> @@ -3910,31 +3910,39 @@ EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
>>>    #endif /* CONFIG_NET_EGRESS */
>>>
>>>    #ifdef CONFIG_NET_XGRESS
>>> -static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)
>>> +static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
>>> +               struct tcf_result *res)
>>>    {
>>> -     int ret = TC_ACT_UNSPEC;
>>> +     int ret = 0;
>>>    #ifdef CONFIG_NET_CLS_ACT
>>>        struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq);
>>> -     struct tcf_result res;
>>>
>>> -     if (!miniq)
>>> +     if (!miniq) {
>>> +             tcf_result_set_verdict(res, TC_ACT_UNSPEC);
>>>                return ret;
>>> +     }
>>>
>>>        tc_skb_cb(skb)->mru = 0;
>>>        tc_skb_cb(skb)->post_ct = false;
>>>
>>>        mini_qdisc_bstats_cpu_update(miniq, skb);
>>> -     ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
>>> +     ret = tcf_classify(skb, miniq->block, miniq->filter_list, res, false);
>>> +     if (ret < 0) {
>>> +             mini_qdisc_qstats_cpu_drop(miniq);
>>> +             return ret;
>>> +     }
>>>        /* Only tcf related quirks below. */
>>> -     switch (ret) {
>>> +     switch (res->verdict) {
>>>        case TC_ACT_SHOT:
>>>                mini_qdisc_qstats_cpu_drop(miniq);
>>>                break;
>>>        case TC_ACT_OK:
>>>        case TC_ACT_RECLASSIFY:
>>> -             skb->tc_index = TC_H_MIN(res.classid);
>>> +             skb->tc_index = TC_H_MIN(res->classid);
>>>                break;
>>>        }
>>> +#else
>>> +     tcf_result_set_verdict(res, TC_ACT_UNSPEC);
>>>    #endif /* CONFIG_NET_CLS_ACT */
>>>        return ret;
>>>    }
>>> @@ -3977,6 +3985,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
>>>                   struct net_device *orig_dev, bool *another)
>>>    {
>>>        struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
>>> +     struct tcf_result res = {0};
>>>        int sch_ret;
>>>
>>>        if (!entry)
>>> @@ -3994,9 +4003,14 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
>>>                if (sch_ret != TC_ACT_UNSPEC)
>>>                        goto ingress_verdict;
>>>        }
>>> -     sch_ret = tc_run(tcx_entry(entry), skb);
>>> +     sch_ret = tc_run(tcx_entry(entry), skb, &res);
>>> +     if (sch_ret < 0) {
>>> +             kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS_ERROR);
>>> +             *ret = NET_RX_DROP;
>>> +             return NULL;
>>> +     }
>>>    ingress_verdict:
>>> -     switch (sch_ret) {
>>> +     switch (res.verdict) {
>>
>> This breaks tcx, please move all this logic into tc_run(). No changes to sch_handle_ingress()
>> or sch_handle_egress should be necessary, you can then just remap the return code to TC_ACT_SHOT
>> in such case.
> 
> I think it is valuable to have a good reason code like
> SKB_DROP_REASON_TC_XXX_ERROR to disambiguate between errors vs
> verdicts in the case of tc_run() variant.
> For tcx_run(), does this look ok (for consistency)?:
> 
> if (static_branch_unlikely(&tcx_needed_key)) {
>                  sch_ret = tcx_run(entry, skb, true);
>                  if (sch_ret != TC_ACT_UNSPEC) {
>                          res.verdict = sch_ret;
>                          goto ingress_verdict;
>                  }
> }

In the above case we don't have 'internal' errors which you want to trace, so I would
also love to avoid the cost of zeroing struct tcf_result res which should be 3x 8b for
every packet.

I was more thinking like something below could be a better choice. I presume your main
goal is to trace where these errors originated in the first place, so it might even be
useful to capture the actual return code as well.

Then you can use perf script, bpf and whatnot to gather further insights into what
happened while being less invasive and avoiding the need to extend struct tcf_result.

This would be quite similar to trace_xdp_exception() as well, and I think you can guarantee
that in fast path all errors are < TC_ACT_UNSPEC anyway.

diff --git a/net/core/dev.c b/net/core/dev.c
index 85df22f05c38..4089d195144d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3925,6 +3925,10 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)

  	mini_qdisc_bstats_cpu_update(miniq, skb);
  	ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
+	if (unlikely(ret < TC_ACT_UNSPEC)) {
+		trace_tc_exception(skb->dev, skb->tc_at_ingress, ret);
+		ret = TC_ACT_SHOT;
+	}
  	/* Only tcf related quirks below. */
  	switch (ret) {
  	case TC_ACT_SHOT:

Best,
Daniel
Jamal Hadi Salim Sept. 25, 2023, 11:01 p.m. UTC | #4
On Fri, Sep 22, 2023 at 4:12 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 9/20/23 1:20 AM, Jamal Hadi Salim wrote:
> > On Tue, Sep 19, 2023 at 6:15 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >>
> >> [ +Martin, bpf ]
> >>
> >> On 9/19/23 4:59 PM, Victor Nogueira wrote:
> >>> Currently there is no way to distinguish between an error and a
> >>> classification verdict. This patch adds the verdict field as a part of
> >>> struct tcf_result. That way, tcf_classify can return a proper
> >>> error number when it fails, and we keep the classification result
> >>> information encapsulated in struct tcf_result.
> >>>
> >>> Also add values SKB_DROP_REASON_TC_EGRESS_ERROR and
> >>> SKB_DROP_REASON_TC_INGRESS_ERROR to enum skb_drop_reason.
> >>> With that we can distinguish between a drop from a processing error versus
> >>> a drop from classification.
> >>>
> >>> Signed-off-by: Victor Nogueira <victor@mojatatu.com>
> >>> ---
> >>>    include/net/dropreason-core.h |  6 +++++
> >>>    include/net/sch_generic.h     |  7 ++++++
> >>>    net/core/dev.c                | 42 ++++++++++++++++++++++++++---------
> >>>    net/sched/cls_api.c           | 38 ++++++++++++++++++++-----------
> >>>    net/sched/sch_cake.c          | 32 +++++++++++++-------------
> >>>    net/sched/sch_drr.c           | 33 +++++++++++++--------------
> >>>    net/sched/sch_ets.c           |  6 +++--
> >>>    net/sched/sch_fq_codel.c      | 29 ++++++++++++------------
> >>>    net/sched/sch_fq_pie.c        | 28 +++++++++++------------
> >>>    net/sched/sch_hfsc.c          |  6 +++--
> >>>    net/sched/sch_htb.c           |  6 +++--
> >>>    net/sched/sch_multiq.c        |  6 +++--
> >>>    net/sched/sch_prio.c          |  7 ++++--
> >>>    net/sched/sch_qfq.c           | 34 +++++++++++++---------------
> >>>    net/sched/sch_sfb.c           | 29 ++++++++++++------------
> >>>    net/sched/sch_sfq.c           | 28 +++++++++++------------
> >>>    16 files changed, 195 insertions(+), 142 deletions(-)
> >>>
> >>> diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
> >>> index a587e83fc169..b1c069c8e7f2 100644
> >>> --- a/include/net/dropreason-core.h
> >>> +++ b/include/net/dropreason-core.h
> >>> @@ -80,6 +80,8 @@
> >>>        FN(IPV6_NDISC_BAD_OPTIONS)      \
> >>>        FN(IPV6_NDISC_NS_OTHERHOST)     \
> >>>        FN(QUEUE_PURGE)                 \
> >>> +     FN(TC_EGRESS_ERROR)             \
> >>> +     FN(TC_INGRESS_ERROR)            \
> >>>        FNe(MAX)
> >>>
> >>>    /**
> >>> @@ -345,6 +347,10 @@ enum skb_drop_reason {
> >>>        SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST,
> >>>        /** @SKB_DROP_REASON_QUEUE_PURGE: bulk free. */
> >>>        SKB_DROP_REASON_QUEUE_PURGE,
> >>> +     /** @SKB_DROP_REASON_TC_EGRESS_ERROR: dropped in TC egress HOOK due to error */
> >>> +     SKB_DROP_REASON_TC_EGRESS_ERROR,
> >>> +     /** @SKB_DROP_REASON_TC_INGRESS_ERROR: dropped in TC ingress HOOK due to error */
> >>> +     SKB_DROP_REASON_TC_INGRESS_ERROR,
> >>>        /**
> >>>         * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which
> >>>         * shouldn't be used as a real 'reason' - only for tracing code gen
> >>> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> >>> index f232512505f8..9a3f71d2545e 100644
> >>> --- a/include/net/sch_generic.h
> >>> +++ b/include/net/sch_generic.h
> >>> @@ -326,6 +326,7 @@ struct Qdisc_ops {
> >>>
> >>>
> >>>    struct tcf_result {
> >>> +     u32 verdict;
> >>>        union {
> >>>                struct {
> >>>                        unsigned long   class;
> >>> @@ -336,6 +337,12 @@ struct tcf_result {
> >>>        };
> >>>    };
> >>>
> >>> +static inline void tcf_result_set_verdict(struct tcf_result *res,
> >>> +                                       const u32 verdict)
> >>> +{
> >>> +     res->verdict = verdict;
> >>> +}
> >>> +
> >>>    struct tcf_chain;
> >>>
> >>>    struct tcf_proto_ops {
> >>> diff --git a/net/core/dev.c b/net/core/dev.c
> >>> index ccff2b6ef958..1450f4741d9b 100644
> >>> --- a/net/core/dev.c
> >>> +++ b/net/core/dev.c
> >>> @@ -3910,31 +3910,39 @@ EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
> >>>    #endif /* CONFIG_NET_EGRESS */
> >>>
> >>>    #ifdef CONFIG_NET_XGRESS
> >>> -static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)
> >>> +static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
> >>> +               struct tcf_result *res)
> >>>    {
> >>> -     int ret = TC_ACT_UNSPEC;
> >>> +     int ret = 0;
> >>>    #ifdef CONFIG_NET_CLS_ACT
> >>>        struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq);
> >>> -     struct tcf_result res;
> >>>
> >>> -     if (!miniq)
> >>> +     if (!miniq) {
> >>> +             tcf_result_set_verdict(res, TC_ACT_UNSPEC);
> >>>                return ret;
> >>> +     }
> >>>
> >>>        tc_skb_cb(skb)->mru = 0;
> >>>        tc_skb_cb(skb)->post_ct = false;
> >>>
> >>>        mini_qdisc_bstats_cpu_update(miniq, skb);
> >>> -     ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
> >>> +     ret = tcf_classify(skb, miniq->block, miniq->filter_list, res, false);
> >>> +     if (ret < 0) {
> >>> +             mini_qdisc_qstats_cpu_drop(miniq);
> >>> +             return ret;
> >>> +     }
> >>>        /* Only tcf related quirks below. */
> >>> -     switch (ret) {
> >>> +     switch (res->verdict) {
> >>>        case TC_ACT_SHOT:
> >>>                mini_qdisc_qstats_cpu_drop(miniq);
> >>>                break;
> >>>        case TC_ACT_OK:
> >>>        case TC_ACT_RECLASSIFY:
> >>> -             skb->tc_index = TC_H_MIN(res.classid);
> >>> +             skb->tc_index = TC_H_MIN(res->classid);
> >>>                break;
> >>>        }
> >>> +#else
> >>> +     tcf_result_set_verdict(res, TC_ACT_UNSPEC);
> >>>    #endif /* CONFIG_NET_CLS_ACT */
> >>>        return ret;
> >>>    }
> >>> @@ -3977,6 +3985,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
> >>>                   struct net_device *orig_dev, bool *another)
> >>>    {
> >>>        struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
> >>> +     struct tcf_result res = {0};
> >>>        int sch_ret;
> >>>
> >>>        if (!entry)
> >>> @@ -3994,9 +4003,14 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
> >>>                if (sch_ret != TC_ACT_UNSPEC)
> >>>                        goto ingress_verdict;
> >>>        }
> >>> -     sch_ret = tc_run(tcx_entry(entry), skb);
> >>> +     sch_ret = tc_run(tcx_entry(entry), skb, &res);
> >>> +     if (sch_ret < 0) {
> >>> +             kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS_ERROR);
> >>> +             *ret = NET_RX_DROP;
> >>> +             return NULL;
> >>> +     }
> >>>    ingress_verdict:
> >>> -     switch (sch_ret) {
> >>> +     switch (res.verdict) {
> >>
> >> This breaks tcx, please move all this logic into tc_run(). No changes to sch_handle_ingress()
> >> or sch_handle_egress should be necessary, you can then just remap the return code to TC_ACT_SHOT
> >> in such case.
> >
> > I think it is valuable to have a good reason code like
> > SKB_DROP_REASON_TC_XXX_ERROR to disambiguate between errors vs
> > verdicts in the case of tc_run() variant.
> > For tcx_run(), does this look ok (for consistency)?:
> >
> > if (static_branch_unlikely(&tcx_needed_key)) {
> >                  sch_ret = tcx_run(entry, skb, true);
> >                  if (sch_ret != TC_ACT_UNSPEC) {
> >                          res.verdict = sch_ret;
> >                          goto ingress_verdict;
> >                  }
> > }
>
> In the above case we don't have 'internal' errors which you want to trace, so I would
> also love to avoid the cost of zeroing struct tcf_result res which should be 3x 8b for
> every packet.

We can move the zeroing inside tc_run() but we declare it in the same
spot as we do right now. You will still need to set res.verdict as
above.
Would that work for you?

> I was more thinking like something below could be a better choice. I presume your main
> goal is to trace where these errors originated in the first place, so it might even be
> useful to capture the actual return code as well.

The main motivation is a few syzkaller bugs which resulted in not
disambiguating between errors being returned and sometimes
TC_ACT_SHOT.

> Then you can use perf script, bpf and whatnot to gather further insights into what
> happened while being less invasive and avoiding the need to extend struct tcf_result.
>

We could use trace instead - the reason we have the skb reason is
being used in the other spots (does this trace require ebpf to be
usable?).

> This would be quite similar to trace_xdp_exception() as well, and I think you can guarantee
> that in fast path all errors are < TC_ACT_UNSPEC anyway.
>

I am not sure i followed. 0 means success, result codes are returned in res now.

cheers,
jamal

> diff --git a/net/core/dev.c b/net/core/dev.c
> index 85df22f05c38..4089d195144d 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3925,6 +3925,10 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)
>
>         mini_qdisc_bstats_cpu_update(miniq, skb);
>         ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
> +       if (unlikely(ret < TC_ACT_UNSPEC)) {
> +               trace_tc_exception(skb->dev, skb->tc_at_ingress, ret);
> +               ret = TC_ACT_SHOT;
> +       }
>         /* Only tcf related quirks below. */
>         switch (ret) {
>         case TC_ACT_SHOT:
>
> Best,
> Daniel
Daniel Borkmann Sept. 29, 2023, 3:48 p.m. UTC | #5
On 9/26/23 1:01 AM, Jamal Hadi Salim wrote:
> On Fri, Sep 22, 2023 at 4:12 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>> On 9/20/23 1:20 AM, Jamal Hadi Salim wrote:
>>> On Tue, Sep 19, 2023 at 6:15 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>>>> On 9/19/23 4:59 PM, Victor Nogueira wrote:
[...]
>>
>> In the above case we don't have 'internal' errors which you want to trace, so I would
>> also love to avoid the cost of zeroing struct tcf_result res which should be 3x 8b for
>> every packet.
> 
> We can move the zeroing inside tc_run() but we declare it in the same
> spot as we do right now. You will still need to set res.verdict as
> above.
> Would that work for you?

What I'm not following is that with the below you can avoid the unnecessary
fast path cost (which is only for corner case which is almost never hit) and
get even better visibility. Are you saying it doesn't work?

>> I was more thinking like something below could be a better choice. I presume your main
>> goal is to trace where these errors originated in the first place, so it might even be
>> useful to capture the actual return code as well.
> 
> The main motivation is a few syzkaller bugs which resulted in not
> disambiguating between errors being returned and sometimes
> TC_ACT_SHOT.
> 
>> Then you can use perf script, bpf and whatnot to gather further insights into what
>> happened while being less invasive and avoiding the need to extend struct tcf_result.
> 
> We could use trace instead - the reason we have the skb reason is
> being used in the other spots (does this trace require ebpf to be
> usable?).

No you can just use regular perf by attaching to the tracepoint, no need for using
bpf at all here.

>> This would be quite similar to trace_xdp_exception() as well, and I think you can guarantee
>> that in fast path all errors are < TC_ACT_UNSPEC anyway.
> 
> I am not sure i followed. 0 means success, result codes are returned in res now.

What I was saying is that you don't need the struct change from the patch, but only
the changes where you rework TC_ACT_SHOT into one of the -E<errors>, and then with
the below you can pass this through an exception tracepoint.

>> diff --git a/net/core/dev.c b/net/core/dev.c
>> index 85df22f05c38..4089d195144d 100644
>> --- a/net/core/dev.c
>> +++ b/net/core/dev.c
>> @@ -3925,6 +3925,10 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)
>>
>>          mini_qdisc_bstats_cpu_update(miniq, skb);
>>          ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
>> +       if (unlikely(ret < TC_ACT_UNSPEC)) {
>> +               trace_tc_exception(skb->dev, skb->tc_at_ingress, ret);
>> +               ret = TC_ACT_SHOT;
>> +       }
>>          /* Only tcf related quirks below. */
>>          switch (ret) {
>>          case TC_ACT_SHOT:

Thanks,
Daniel
Jamal Hadi Salim Oct. 2, 2023, 7:54 p.m. UTC | #6
Hi Daniel,
On Fri, Sep 29, 2023 at 11:48 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 9/26/23 1:01 AM, Jamal Hadi Salim wrote:
> > On Fri, Sep 22, 2023 at 4:12 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >> On 9/20/23 1:20 AM, Jamal Hadi Salim wrote:
> >>> On Tue, Sep 19, 2023 at 6:15 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >>>> On 9/19/23 4:59 PM, Victor Nogueira wrote:
> [...]
> >>
> >> In the above case we don't have 'internal' errors which you want to trace, so I would
> >> also love to avoid the cost of zeroing struct tcf_result res which should be 3x 8b for
> >> every packet.
> >
> > We can move the zeroing inside tc_run() but we declare it in the same
> > spot as we do right now. You will still need to set res.verdict as
> > above.
> > Would that work for you?
>
> What I'm not following is that with the below you can avoid the unnecessary
> fast path cost (which is only for corner case which is almost never hit) and
> get even better visibility. Are you saying it doesn't work?

I am probably missing something:
-1/UNSPEC is a legit errno. And the main motivation here for this
patch is to disambiguate if it was -EPERM vs UNSPEC
Maybe that is what you are calling a "corner case"?

There are two options in my mind right now (since you are guaranteed
in tcx_run you will never return anything below UNSPEC):
1) we just have the switch statement invocation inside an inline
function and you can pass it sch_ret (for tcx case) and we'll pass it
res.verdit for tc_run() case.
2) is something is we leave tcx_run alone and we have something along
the lines of:

--------------
diff --git a/net/core/dev.c b/net/core/dev.c
index 1450f4741d9b..93613bce647c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3985,7 +3985,7 @@ sch_handle_ingress(struct sk_buff *skb, struct
packet_type **pt_prev, int *ret,
                   struct net_device *orig_dev, bool *another)
 {
        struct bpf_mprog_entry *entry =
rcu_dereference_bh(skb->dev->tcx_ingress);
-       struct tcf_result res = {0};
+       struct tcf_result res;
        int sch_ret;

        if (!entry)
@@ -4003,14 +4003,16 @@ sch_handle_ingress(struct sk_buff *skb, struct
packet_type **pt_prev, int *ret,
                if (sch_ret != TC_ACT_UNSPEC)
                        goto ingress_verdict;
        }
+
+       res.verdict = 0;
        sch_ret = tc_run(tcx_entry(entry), skb, &res);
        if (sch_ret < 0) {
                kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS_ERROR);
                *ret = NET_RX_DROP;
                return NULL;
        }
+       sch_ret = res.verdict;
 ingress_verdict:
-       switch (res.verdict) {
+       switch (sch_ret) {
        case TC_ACT_REDIRECT:
                /* skb_mac_header check was done by BPF, so we can
safely
                 * push the L2 header back before redirecting to another
-----------

on the drop reason - our thinking is to support drop_watch alongside
tracepoint given kfree_skb_reason exists already; if i am not mistaken
what you suggested would require us to create a new tracepoint?

cheers,
jamal

> >> I was more thinking like something below could be a better choice. I presume your main
> >> goal is to trace where these errors originated in the first place, so it might even be
> >> useful to capture the actual return code as well.
> >
> > The main motivation is a few syzkaller bugs which resulted in not
> > disambiguating between errors being returned and sometimes
> > TC_ACT_SHOT.
> >
> >> Then you can use perf script, bpf and whatnot to gather further insights into what
> >> happened while being less invasive and avoiding the need to extend struct tcf_result.
> >
> > We could use trace instead - the reason we have the skb reason is
> > being used in the other spots (does this trace require ebpf to be
> > usable?).
>
> No you can just use regular perf by attaching to the tracepoint, no need for using
> bpf at all here.
>
> >> This would be quite similar to trace_xdp_exception() as well, and I think you can guarantee
> >> that in fast path all errors are < TC_ACT_UNSPEC anyway.
> >
> > I am not sure i followed. 0 means success, result codes are returned in res now.
>
> What I was saying is that you don't need the struct change from the patch, but only
> the changes where you rework TC_ACT_SHOT into one of the -E<errors>, and then with
> the below you can pass this through an exception tracepoint.
>
> >> diff --git a/net/core/dev.c b/net/core/dev.c
> >> index 85df22f05c38..4089d195144d 100644
> >> --- a/net/core/dev.c
> >> +++ b/net/core/dev.c
> >> @@ -3925,6 +3925,10 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)
> >>
> >>          mini_qdisc_bstats_cpu_update(miniq, skb);
> >>          ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
> >> +       if (unlikely(ret < TC_ACT_UNSPEC)) {
> >> +               trace_tc_exception(skb->dev, skb->tc_at_ingress, ret);
> >> +               ret = TC_ACT_SHOT;
> >> +       }
> >>          /* Only tcf related quirks below. */
> >>          switch (ret) {
> >>          case TC_ACT_SHOT:
>
> Thanks,
> Daniel
Daniel Borkmann Oct. 3, 2023, 9 a.m. UTC | #7
On 10/2/23 9:54 PM, Jamal Hadi Salim wrote:
> On Fri, Sep 29, 2023 at 11:48 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>> On 9/26/23 1:01 AM, Jamal Hadi Salim wrote:
>>> On Fri, Sep 22, 2023 at 4:12 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>>>> On 9/20/23 1:20 AM, Jamal Hadi Salim wrote:
>>>>> On Tue, Sep 19, 2023 at 6:15 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>>>>>> On 9/19/23 4:59 PM, Victor Nogueira wrote:
>> [...]
>>>>
>>>> In the above case we don't have 'internal' errors which you want to trace, so I would
>>>> also love to avoid the cost of zeroing struct tcf_result res which should be 3x 8b for
>>>> every packet.
>>>
>>> We can move the zeroing inside tc_run() but we declare it in the same
>>> spot as we do right now. You will still need to set res.verdict as
>>> above.
>>> Would that work for you?
>>
>> What I'm not following is that with the below you can avoid the unnecessary
>> fast path cost (which is only for corner case which is almost never hit) and
>> get even better visibility. Are you saying it doesn't work?
> 
> I am probably missing something:
> -1/UNSPEC is a legit errno. And the main motivation here for this
> patch is to disambiguate if it was -EPERM vs UNSPEC
> Maybe that is what you are calling a "corner case"?

Yes, but what is the use-case to ever return a -EPERM from the fast-path? This can
be audited for the code in the tree and therefore avoided so that you never run into
this problem.

> There are two options in my mind right now (since you are guaranteed
> in tcx_run you will never return anything below UNSPEC):
> 1) we just have the switch statement invocation inside an inline
> function and you can pass it sch_ret (for tcx case) and we'll pass it
> res.verdit for tc_run() case.
> 2) is something is we leave tcx_run alone and we have something along
> the lines of:
> 
> --------------
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 1450f4741d9b..93613bce647c 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3985,7 +3985,7 @@ sch_handle_ingress(struct sk_buff *skb, struct
> packet_type **pt_prev, int *ret,
>                     struct net_device *orig_dev, bool *another)
>   {
>          struct bpf_mprog_entry *entry =
> rcu_dereference_bh(skb->dev->tcx_ingress);
> -       struct tcf_result res = {0};
> +       struct tcf_result res;
>          int sch_ret;
> 
>          if (!entry)
> @@ -4003,14 +4003,16 @@ sch_handle_ingress(struct sk_buff *skb, struct
> packet_type **pt_prev, int *ret,
>                  if (sch_ret != TC_ACT_UNSPEC)
>                          goto ingress_verdict;
>          }
> +
> +       res.verdict = 0;
>          sch_ret = tc_run(tcx_entry(entry), skb, &res);
>          if (sch_ret < 0) {
>                  kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS_ERROR);
>                  *ret = NET_RX_DROP;
>                  return NULL;
>          }
> +       sch_ret = res.verdict;
>   ingress_verdict:
> -       switch (res.verdict) {
> +       switch (sch_ret) {
>          case TC_ACT_REDIRECT:
>                  /* skb_mac_header check was done by BPF, so we can
> safely
>                   * push the L2 header back before redirecting to another
> -----------
> 
> on the drop reason - our thinking is to support drop_watch alongside
> tracepoint given kfree_skb_reason exists already; if i am not mistaken
> what you suggested would require us to create a new tracepoint?

So if the only thing you really care about is the different drop reason for
kfree_skb_reason, then I still don't follow why you need to drag this into
struct tcf_result. This can be done in a much simpler and more efficient way
like the following:

diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
index a587e83fc169..b1c069c8e7f2 100644
--- a/include/net/dropreason-core.h
+++ b/include/net/dropreason-core.h
@@ -80,6 +80,8 @@
  	FN(IPV6_NDISC_BAD_OPTIONS)	\
  	FN(IPV6_NDISC_NS_OTHERHOST)	\
  	FN(QUEUE_PURGE)			\
+	FN(TC_EGRESS_ERROR)		\
+	FN(TC_INGRESS_ERROR)		\
  	FNe(MAX)

  /**
@@ -345,6 +347,10 @@ enum skb_drop_reason {
  	SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST,
  	/** @SKB_DROP_REASON_QUEUE_PURGE: bulk free. */
  	SKB_DROP_REASON_QUEUE_PURGE,
+	/** @SKB_DROP_REASON_TC_EGRESS_ERROR: dropped in TC egress HOOK due to error */
+	SKB_DROP_REASON_TC_EGRESS_ERROR,
+	/** @SKB_DROP_REASON_TC_INGRESS_ERROR: dropped in TC ingress HOOK due to error */
+	SKB_DROP_REASON_TC_INGRESS_ERROR,
  	/**
  	 * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which
  	 * shouldn't be used as a real 'reason' - only for tracing code gen
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index f308e8268651..cd2444dd3745 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -10,6 +10,7 @@

  /* TC action not accessible from user space */
  #define TC_ACT_CONSUMED		(TC_ACT_VALUE_MAX + 1)
+#define TC_ACT_ABORT		(TC_ACT_VALUE_MAX + 2)

  /* Basic packet classifier frontend definitions. */

diff --git a/net/core/dev.c b/net/core/dev.c
index 85df22f05c38..3abb4d71c170 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4011,7 +4011,10 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
  		*ret = NET_RX_SUCCESS;
  		return NULL;
  	case TC_ACT_SHOT:
-		kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS);
+	case TC_ACT_ABORT:
+		kfree_skb_reason(skb, likely(sch_ret == TC_ACT_SHOT) ?
+				 SKB_DROP_REASON_TC_INGRESS :
+				 SKB_DROP_REASON_TC_INGRESS_ERROR);
  		*ret = NET_RX_DROP;
  		return NULL;
  	/* used by tc_run */
@@ -4054,7 +4057,10 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
  		*ret = NET_XMIT_SUCCESS;
  		return NULL;
  	case TC_ACT_SHOT:
-		kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS);
+	case TC_ACT_ABORT:
+		kfree_skb_reason(skb, likely(sch_ret == TC_ACT_SHOT) ?
+				 SKB_DROP_REASON_TC_EGRESS :
+				 SKB_DROP_REASON_TC_EGRESS_ERROR);
  		*ret = NET_XMIT_DROP;
  		return NULL;
  	/* used by tc_run */

Then you just return the internal TC_ACT_ABORT code for internal 'exceptions',
and you'll get the same result to make it observable for dropwatch.

Thanks,
Daniel
Jamal Hadi Salim Oct. 3, 2023, 12:46 p.m. UTC | #8
On Tue, Oct 3, 2023 at 5:00 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 10/2/23 9:54 PM, Jamal Hadi Salim wrote:
> > On Fri, Sep 29, 2023 at 11:48 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >> On 9/26/23 1:01 AM, Jamal Hadi Salim wrote:
> >>> On Fri, Sep 22, 2023 at 4:12 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >>>> On 9/20/23 1:20 AM, Jamal Hadi Salim wrote:
> >>>>> On Tue, Sep 19, 2023 at 6:15 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >>>>>> On 9/19/23 4:59 PM, Victor Nogueira wrote:
> >> [...]
> >>>>
> >>>> In the above case we don't have 'internal' errors which you want to trace, so I would
> >>>> also love to avoid the cost of zeroing struct tcf_result res which should be 3x 8b for
> >>>> every packet.
> >>>
> >>> We can move the zeroing inside tc_run() but we declare it in the same
> >>> spot as we do right now. You will still need to set res.verdict as
> >>> above.
> >>> Would that work for you?
> >>
> >> What I'm not following is that with the below you can avoid the unnecessary
> >> fast path cost (which is only for corner case which is almost never hit) and
> >> get even better visibility. Are you saying it doesn't work?
> >
> > I am probably missing something:
> > -1/UNSPEC is a legit errno. And the main motivation here for this
> > patch is to disambiguate if it was -EPERM vs UNSPEC
> > Maybe that is what you are calling a "corner case"?
>
> Yes, but what is the use-case to ever return a -EPERM from the fast-path? This can
> be audited for the code in the tree and therefore avoided so that you never run into
> this problem.

I am sorry but i am not in favor of this approach.
You are suggesting audits are the way to go forward when in fact lack
of said audits is what got us in this trouble with syzkaller to begin
with. We cant rely on tribal knowledge to be able to spot these
discrepancies. The elder of the tribe may move to a different mountain
at some point and TheLinuxWay(tm) is cutnpaste, so i dont see this as
long term good for maintainance. We have a clear distinction between
an error vs verdict - lets use that.
We really dont want to make this a special case just for eBPF and how
to make it a happy world for eBPF at the cost of everyone else. I made
a suggestion of leaving tcx alone, you can do your own thing there;
but for tc_run my view is we should keep it generic.

cheers,
jamal

> > There are two options in my mind right now (since you are guaranteed
> > in tcx_run you will never return anything below UNSPEC):
> > 1) we just have the switch statement invocation inside an inline
> > function and you can pass it sch_ret (for tcx case) and we'll pass it
> > res.verdit for tc_run() case.
> > 2) is something is we leave tcx_run alone and we have something along
> > the lines of:
> >
> > --------------
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index 1450f4741d9b..93613bce647c 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -3985,7 +3985,7 @@ sch_handle_ingress(struct sk_buff *skb, struct
> > packet_type **pt_prev, int *ret,
> >                     struct net_device *orig_dev, bool *another)
> >   {
> >          struct bpf_mprog_entry *entry =
> > rcu_dereference_bh(skb->dev->tcx_ingress);
> > -       struct tcf_result res = {0};
> > +       struct tcf_result res;
> >          int sch_ret;
> >
> >          if (!entry)
> > @@ -4003,14 +4003,16 @@ sch_handle_ingress(struct sk_buff *skb, struct
> > packet_type **pt_prev, int *ret,
> >                  if (sch_ret != TC_ACT_UNSPEC)
> >                          goto ingress_verdict;
> >          }
> > +
> > +       res.verdict = 0;
> >          sch_ret = tc_run(tcx_entry(entry), skb, &res);
> >          if (sch_ret < 0) {
> >                  kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS_ERROR);
> >                  *ret = NET_RX_DROP;
> >                  return NULL;
> >          }
> > +       sch_ret = res.verdict;
> >   ingress_verdict:
> > -       switch (res.verdict) {
> > +       switch (sch_ret) {
> >          case TC_ACT_REDIRECT:
> >                  /* skb_mac_header check was done by BPF, so we can
> > safely
> >                   * push the L2 header back before redirecting to another
> > -----------
> >
> > on the drop reason - our thinking is to support drop_watch alongside
> > tracepoint given kfree_skb_reason exists already; if i am not mistaken
> > what you suggested would require us to create a new tracepoint?
>
> So if the only thing you really care about is the different drop reason for
> kfree_skb_reason, then I still don't follow why you need to drag this into
> struct tcf_result. This can be done in a much simpler and more efficient way
> like the following:
>
> diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
> index a587e83fc169..b1c069c8e7f2 100644
> --- a/include/net/dropreason-core.h
> +++ b/include/net/dropreason-core.h
> @@ -80,6 +80,8 @@
>         FN(IPV6_NDISC_BAD_OPTIONS)      \
>         FN(IPV6_NDISC_NS_OTHERHOST)     \
>         FN(QUEUE_PURGE)                 \
> +       FN(TC_EGRESS_ERROR)             \
> +       FN(TC_INGRESS_ERROR)            \
>         FNe(MAX)
>
>   /**
> @@ -345,6 +347,10 @@ enum skb_drop_reason {
>         SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST,
>         /** @SKB_DROP_REASON_QUEUE_PURGE: bulk free. */
>         SKB_DROP_REASON_QUEUE_PURGE,
> +       /** @SKB_DROP_REASON_TC_EGRESS_ERROR: dropped in TC egress HOOK due to error */
> +       SKB_DROP_REASON_TC_EGRESS_ERROR,
> +       /** @SKB_DROP_REASON_TC_INGRESS_ERROR: dropped in TC ingress HOOK due to error */
> +       SKB_DROP_REASON_TC_INGRESS_ERROR,
>         /**
>          * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which
>          * shouldn't be used as a real 'reason' - only for tracing code gen
> diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
> index f308e8268651..cd2444dd3745 100644
> --- a/include/net/pkt_cls.h
> +++ b/include/net/pkt_cls.h
> @@ -10,6 +10,7 @@
>
>   /* TC action not accessible from user space */
>   #define TC_ACT_CONSUMED               (TC_ACT_VALUE_MAX + 1)
> +#define TC_ACT_ABORT           (TC_ACT_VALUE_MAX + 2)
>
>   /* Basic packet classifier frontend definitions. */
>
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 85df22f05c38..3abb4d71c170 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -4011,7 +4011,10 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
>                 *ret = NET_RX_SUCCESS;
>                 return NULL;
>         case TC_ACT_SHOT:
> -               kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS);
> +       case TC_ACT_ABORT:
> +               kfree_skb_reason(skb, likely(sch_ret == TC_ACT_SHOT) ?
> +                                SKB_DROP_REASON_TC_INGRESS :
> +                                SKB_DROP_REASON_TC_INGRESS_ERROR);
>                 *ret = NET_RX_DROP;
>                 return NULL;
>         /* used by tc_run */
> @@ -4054,7 +4057,10 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
>                 *ret = NET_XMIT_SUCCESS;
>                 return NULL;
>         case TC_ACT_SHOT:
> -               kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS);
> +       case TC_ACT_ABORT:
> +               kfree_skb_reason(skb, likely(sch_ret == TC_ACT_SHOT) ?
> +                                SKB_DROP_REASON_TC_EGRESS :
> +                                SKB_DROP_REASON_TC_EGRESS_ERROR);
>                 *ret = NET_XMIT_DROP;
>                 return NULL;
>         /* used by tc_run */
>
> Then you just return the internal TC_ACT_ABORT code for internal 'exceptions',
> and you'll get the same result to make it observable for dropwatch.
>
> Thanks,
> Daniel
Daniel Borkmann Oct. 3, 2023, 1:49 p.m. UTC | #9
On 10/3/23 2:46 PM, Jamal Hadi Salim wrote:
> On Tue, Oct 3, 2023 at 5:00 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>> On 10/2/23 9:54 PM, Jamal Hadi Salim wrote:
>>> On Fri, Sep 29, 2023 at 11:48 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>>>> On 9/26/23 1:01 AM, Jamal Hadi Salim wrote:
>>>>> On Fri, Sep 22, 2023 at 4:12 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>>>>>> On 9/20/23 1:20 AM, Jamal Hadi Salim wrote:
>>>>>>> On Tue, Sep 19, 2023 at 6:15 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>>>>>>>> On 9/19/23 4:59 PM, Victor Nogueira wrote:
>>>> [...]
>>>>>>
>>>>>> In the above case we don't have 'internal' errors which you want to trace, so I would
>>>>>> also love to avoid the cost of zeroing struct tcf_result res which should be 3x 8b for
>>>>>> every packet.
>>>>>
>>>>> We can move the zeroing inside tc_run() but we declare it in the same
>>>>> spot as we do right now. You will still need to set res.verdict as
>>>>> above.
>>>>> Would that work for you?
>>>>
>>>> What I'm not following is that with the below you can avoid the unnecessary
>>>> fast path cost (which is only for corner case which is almost never hit) and
>>>> get even better visibility. Are you saying it doesn't work?
>>>
>>> I am probably missing something:
>>> -1/UNSPEC is a legit errno. And the main motivation here for this
>>> patch is to disambiguate if it was -EPERM vs UNSPEC
>>> Maybe that is what you are calling a "corner case"?
>>
>> Yes, but what is the use-case to ever return a -EPERM from the fast-path? This can
>> be audited for the code in the tree and therefore avoided so that you never run into
>> this problem.
> 
> I am sorry but i am not in favor of this approach.
> You are suggesting audits are the way to go forward when in fact lack
> of said audits is what got us in this trouble with syzkaller to begin
> with. We cant rely on tribal knowledge to be able to spot these
> discrepancies. The elder of the tribe may move to a different mountain
> at some point and TheLinuxWay(tm) is cutnpaste, so i dont see this as
> long term good for maintainance. We have a clear distinction between
> an error vs verdict - lets use that.
> We really dont want to make this a special case just for eBPF and how
> to make it a happy world for eBPF at the cost of everyone else. I made
> a suggestion of leaving tcx alone, you can do your own thing there;
> but for tc_run my view is we should keep it generic.

Jamal, before you come to early conclusions, it would be great if you also
read until the end of the email, because what I suggested below *is* generic
and with less churn throughout the code base.

>>> There are two options in my mind right now (since you are guaranteed
>>> in tcx_run you will never return anything below UNSPEC):
>>> 1) we just have the switch statement invocation inside an inline
>>> function and you can pass it sch_ret (for tcx case) and we'll pass it
>>> res.verdit for tc_run() case.
>>> 2) is something is we leave tcx_run alone and we have something along
>>> the lines of:
>>>
>>> --------------
>>> diff --git a/net/core/dev.c b/net/core/dev.c
>>> index 1450f4741d9b..93613bce647c 100644
>>> --- a/net/core/dev.c
>>> +++ b/net/core/dev.c
>>> @@ -3985,7 +3985,7 @@ sch_handle_ingress(struct sk_buff *skb, struct
>>> packet_type **pt_prev, int *ret,
>>>                      struct net_device *orig_dev, bool *another)
>>>    {
>>>           struct bpf_mprog_entry *entry =
>>> rcu_dereference_bh(skb->dev->tcx_ingress);
>>> -       struct tcf_result res = {0};
>>> +       struct tcf_result res;
>>>           int sch_ret;
>>>
>>>           if (!entry)
>>> @@ -4003,14 +4003,16 @@ sch_handle_ingress(struct sk_buff *skb, struct
>>> packet_type **pt_prev, int *ret,
>>>                   if (sch_ret != TC_ACT_UNSPEC)
>>>                           goto ingress_verdict;
>>>           }
>>> +
>>> +       res.verdict = 0;
>>>           sch_ret = tc_run(tcx_entry(entry), skb, &res);
>>>           if (sch_ret < 0) {
>>>                   kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS_ERROR);
>>>                   *ret = NET_RX_DROP;
>>>                   return NULL;
>>>           }
>>> +       sch_ret = res.verdict;
>>>    ingress_verdict:
>>> -       switch (res.verdict) {
>>> +       switch (sch_ret) {
>>>           case TC_ACT_REDIRECT:
>>>                   /* skb_mac_header check was done by BPF, so we can
>>> safely
>>>                    * push the L2 header back before redirecting to another
>>> -----------
>>>
>>> on the drop reason - our thinking is to support drop_watch alongside
>>> tracepoint given kfree_skb_reason exists already; if i am not mistaken
>>> what you suggested would require us to create a new tracepoint?
>>
>> So if the only thing you really care about is the different drop reason for
>> kfree_skb_reason, then I still don't follow why you need to drag this into
>> struct tcf_result. This can be done in a much simpler and more efficient way
>> like the following:
>>
>> diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
>> index a587e83fc169..b1c069c8e7f2 100644
>> --- a/include/net/dropreason-core.h
>> +++ b/include/net/dropreason-core.h
>> @@ -80,6 +80,8 @@
>>          FN(IPV6_NDISC_BAD_OPTIONS)      \
>>          FN(IPV6_NDISC_NS_OTHERHOST)     \
>>          FN(QUEUE_PURGE)                 \
>> +       FN(TC_EGRESS_ERROR)             \
>> +       FN(TC_INGRESS_ERROR)            \
>>          FNe(MAX)
>>
>>    /**
>> @@ -345,6 +347,10 @@ enum skb_drop_reason {
>>          SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST,
>>          /** @SKB_DROP_REASON_QUEUE_PURGE: bulk free. */
>>          SKB_DROP_REASON_QUEUE_PURGE,
>> +       /** @SKB_DROP_REASON_TC_EGRESS_ERROR: dropped in TC egress HOOK due to error */
>> +       SKB_DROP_REASON_TC_EGRESS_ERROR,
>> +       /** @SKB_DROP_REASON_TC_INGRESS_ERROR: dropped in TC ingress HOOK due to error */
>> +       SKB_DROP_REASON_TC_INGRESS_ERROR,
>>          /**
>>           * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which
>>           * shouldn't be used as a real 'reason' - only for tracing code gen
>> diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
>> index f308e8268651..cd2444dd3745 100644
>> --- a/include/net/pkt_cls.h
>> +++ b/include/net/pkt_cls.h
>> @@ -10,6 +10,7 @@
>>
>>    /* TC action not accessible from user space */
>>    #define TC_ACT_CONSUMED               (TC_ACT_VALUE_MAX + 1)
>> +#define TC_ACT_ABORT           (TC_ACT_VALUE_MAX + 2)
>>
>>    /* Basic packet classifier frontend definitions. */
>>
>> diff --git a/net/core/dev.c b/net/core/dev.c
>> index 85df22f05c38..3abb4d71c170 100644
>> --- a/net/core/dev.c
>> +++ b/net/core/dev.c
>> @@ -4011,7 +4011,10 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
>>                  *ret = NET_RX_SUCCESS;
>>                  return NULL;
>>          case TC_ACT_SHOT:
>> -               kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS);
>> +       case TC_ACT_ABORT:
>> +               kfree_skb_reason(skb, likely(sch_ret == TC_ACT_SHOT) ?
>> +                                SKB_DROP_REASON_TC_INGRESS :
>> +                                SKB_DROP_REASON_TC_INGRESS_ERROR);
>>                  *ret = NET_RX_DROP;
>>                  return NULL;
>>          /* used by tc_run */
>> @@ -4054,7 +4057,10 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
>>                  *ret = NET_XMIT_SUCCESS;
>>                  return NULL;
>>          case TC_ACT_SHOT:
>> -               kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS);
>> +       case TC_ACT_ABORT:
>> +               kfree_skb_reason(skb, likely(sch_ret == TC_ACT_SHOT) ?
>> +                                SKB_DROP_REASON_TC_EGRESS :
>> +                                SKB_DROP_REASON_TC_EGRESS_ERROR);
>>                  *ret = NET_XMIT_DROP;
>>                  return NULL;
>>          /* used by tc_run */
>>
>> Then you just return the internal TC_ACT_ABORT code for internal 'exceptions',
>> and you'll get the same result to make it observable for dropwatch.
>>
>> Thanks,
>> Daniel
Jamal Hadi Salim Oct. 3, 2023, 9:36 p.m. UTC | #10
On Tue, Oct 3, 2023 at 9:49 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 10/3/23 2:46 PM, Jamal Hadi Salim wrote:
> > On Tue, Oct 3, 2023 at 5:00 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >> On 10/2/23 9:54 PM, Jamal Hadi Salim wrote:
> >>> On Fri, Sep 29, 2023 at 11:48 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >>>> On 9/26/23 1:01 AM, Jamal Hadi Salim wrote:
> >>>>> On Fri, Sep 22, 2023 at 4:12 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >>>>>> On 9/20/23 1:20 AM, Jamal Hadi Salim wrote:
> >>>>>>> On Tue, Sep 19, 2023 at 6:15 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >>>>>>>> On 9/19/23 4:59 PM, Victor Nogueira wrote:
> >>>> [...]
> >>>>>>
> >>>>>> In the above case we don't have 'internal' errors which you want to trace, so I would
> >>>>>> also love to avoid the cost of zeroing struct tcf_result res which should be 3x 8b for
> >>>>>> every packet.
> >>>>>
> >>>>> We can move the zeroing inside tc_run() but we declare it in the same
> >>>>> spot as we do right now. You will still need to set res.verdict as
> >>>>> above.
> >>>>> Would that work for you?
> >>>>
> >>>> What I'm not following is that with the below you can avoid the unnecessary
> >>>> fast path cost (which is only for corner case which is almost never hit) and
> >>>> get even better visibility. Are you saying it doesn't work?
> >>>
> >>> I am probably missing something:
> >>> -1/UNSPEC is a legit errno. And the main motivation here for this
> >>> patch is to disambiguate if it was -EPERM vs UNSPEC
> >>> Maybe that is what you are calling a "corner case"?
> >>
> >> Yes, but what is the use-case to ever return a -EPERM from the fast-path? This can
> >> be audited for the code in the tree and therefore avoided so that you never run into
> >> this problem.
> >
> > I am sorry but i am not in favor of this approach.
> > You are suggesting audits are the way to go forward when in fact lack
> > of said audits is what got us in this trouble with syzkaller to begin
> > with. We cant rely on tribal knowledge to be able to spot these
> > discrepancies. The elder of the tribe may move to a different mountain
> > at some point and TheLinuxWay(tm) is cutnpaste, so i dont see this as
> > long term good for maintainance. We have a clear distinction between
> > an error vs verdict - lets use that.
> > We really dont want to make this a special case just for eBPF and how
> > to make it a happy world for eBPF at the cost of everyone else. I made
> > a suggestion of leaving tcx alone, you can do your own thing there;
> > but for tc_run my view is we should keep it generic.
>
> Jamal, before you come to early conclusions, it would be great if you also
> read until the end of the email, because what I suggested below *is* generic
> and with less churn throughout the code base.
>

I did look, Daniel. You are lumping all the error codes into one -
which doesnt change my view on disambiguation. If i was to debug
closely and run kprobe now i am seeing only one error code
TC_ACT_ABORT instead of -EINVAL vs -ENOMEM, etc. Easier for me to find
the source manually (and possibly even better with Andrii's tool i saw
once if it would work in the datapath - iirc, i think it prints return
codes on the code paths).

cheers,
jamal

> >>> There are two options in my mind right now (since you are guaranteed
> >>> in tcx_run you will never return anything below UNSPEC):
> >>> 1) we just have the switch statement invocation inside an inline
> >>> function and you can pass it sch_ret (for tcx case) and we'll pass it
> >>> res.verdit for tc_run() case.
> >>> 2) is something is we leave tcx_run alone and we have something along
> >>> the lines of:
> >>>
> >>> --------------
> >>> diff --git a/net/core/dev.c b/net/core/dev.c
> >>> index 1450f4741d9b..93613bce647c 100644
> >>> --- a/net/core/dev.c
> >>> +++ b/net/core/dev.c
> >>> @@ -3985,7 +3985,7 @@ sch_handle_ingress(struct sk_buff *skb, struct
> >>> packet_type **pt_prev, int *ret,
> >>>                      struct net_device *orig_dev, bool *another)
> >>>    {
> >>>           struct bpf_mprog_entry *entry =
> >>> rcu_dereference_bh(skb->dev->tcx_ingress);
> >>> -       struct tcf_result res = {0};
> >>> +       struct tcf_result res;
> >>>           int sch_ret;
> >>>
> >>>           if (!entry)
> >>> @@ -4003,14 +4003,16 @@ sch_handle_ingress(struct sk_buff *skb, struct
> >>> packet_type **pt_prev, int *ret,
> >>>                   if (sch_ret != TC_ACT_UNSPEC)
> >>>                           goto ingress_verdict;
> >>>           }
> >>> +
> >>> +       res.verdict = 0;
> >>>           sch_ret = tc_run(tcx_entry(entry), skb, &res);
> >>>           if (sch_ret < 0) {
> >>>                   kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS_ERROR);
> >>>                   *ret = NET_RX_DROP;
> >>>                   return NULL;
> >>>           }
> >>> +       sch_ret = res.verdict;
> >>>    ingress_verdict:
> >>> -       switch (res.verdict) {
> >>> +       switch (sch_ret) {
> >>>           case TC_ACT_REDIRECT:
> >>>                   /* skb_mac_header check was done by BPF, so we can
> >>> safely
> >>>                    * push the L2 header back before redirecting to another
> >>> -----------
> >>>
> >>> on the drop reason - our thinking is to support drop_watch alongside
> >>> tracepoint given kfree_skb_reason exists already; if i am not mistaken
> >>> what you suggested would require us to create a new tracepoint?
> >>
> >> So if the only thing you really care about is the different drop reason for
> >> kfree_skb_reason, then I still don't follow why you need to drag this into
> >> struct tcf_result. This can be done in a much simpler and more efficient way
> >> like the following:
> >>
> >> diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
> >> index a587e83fc169..b1c069c8e7f2 100644
> >> --- a/include/net/dropreason-core.h
> >> +++ b/include/net/dropreason-core.h
> >> @@ -80,6 +80,8 @@
> >>          FN(IPV6_NDISC_BAD_OPTIONS)      \
> >>          FN(IPV6_NDISC_NS_OTHERHOST)     \
> >>          FN(QUEUE_PURGE)                 \
> >> +       FN(TC_EGRESS_ERROR)             \
> >> +       FN(TC_INGRESS_ERROR)            \
> >>          FNe(MAX)
> >>
> >>    /**
> >> @@ -345,6 +347,10 @@ enum skb_drop_reason {
> >>          SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST,
> >>          /** @SKB_DROP_REASON_QUEUE_PURGE: bulk free. */
> >>          SKB_DROP_REASON_QUEUE_PURGE,
> >> +       /** @SKB_DROP_REASON_TC_EGRESS_ERROR: dropped in TC egress HOOK due to error */
> >> +       SKB_DROP_REASON_TC_EGRESS_ERROR,
> >> +       /** @SKB_DROP_REASON_TC_INGRESS_ERROR: dropped in TC ingress HOOK due to error */
> >> +       SKB_DROP_REASON_TC_INGRESS_ERROR,
> >>          /**
> >>           * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which
> >>           * shouldn't be used as a real 'reason' - only for tracing code gen
> >> diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
> >> index f308e8268651..cd2444dd3745 100644
> >> --- a/include/net/pkt_cls.h
> >> +++ b/include/net/pkt_cls.h
> >> @@ -10,6 +10,7 @@
> >>
> >>    /* TC action not accessible from user space */
> >>    #define TC_ACT_CONSUMED               (TC_ACT_VALUE_MAX + 1)
> >> +#define TC_ACT_ABORT           (TC_ACT_VALUE_MAX + 2)
> >>
> >>    /* Basic packet classifier frontend definitions. */
> >>
> >> diff --git a/net/core/dev.c b/net/core/dev.c
> >> index 85df22f05c38..3abb4d71c170 100644
> >> --- a/net/core/dev.c
> >> +++ b/net/core/dev.c
> >> @@ -4011,7 +4011,10 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
> >>                  *ret = NET_RX_SUCCESS;
> >>                  return NULL;
> >>          case TC_ACT_SHOT:
> >> -               kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS);
> >> +       case TC_ACT_ABORT:
> >> +               kfree_skb_reason(skb, likely(sch_ret == TC_ACT_SHOT) ?
> >> +                                SKB_DROP_REASON_TC_INGRESS :
> >> +                                SKB_DROP_REASON_TC_INGRESS_ERROR);
> >>                  *ret = NET_RX_DROP;
> >>                  return NULL;
> >>          /* used by tc_run */
> >> @@ -4054,7 +4057,10 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
> >>                  *ret = NET_XMIT_SUCCESS;
> >>                  return NULL;
> >>          case TC_ACT_SHOT:
> >> -               kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS);
> >> +       case TC_ACT_ABORT:
> >> +               kfree_skb_reason(skb, likely(sch_ret == TC_ACT_SHOT) ?
> >> +                                SKB_DROP_REASON_TC_EGRESS :
> >> +                                SKB_DROP_REASON_TC_EGRESS_ERROR);
> >>                  *ret = NET_XMIT_DROP;
> >>                  return NULL;
> >>          /* used by tc_run */
> >>
> >> Then you just return the internal TC_ACT_ABORT code for internal 'exceptions',
> >> and you'll get the same result to make it observable for dropwatch.
> >>
> >> Thanks,
> >> Daniel
>
Daniel Borkmann Oct. 6, 2023, 11:18 a.m. UTC | #11
Hi Jamal,

On 10/3/23 11:36 PM, Jamal Hadi Salim wrote:
[...]
> I did look, Daniel. You are lumping all the error codes into one -
> which doesnt change my view on disambiguation. If i was to debug
> closely and run kprobe now i am seeing only one error code
> TC_ACT_ABORT instead of -EINVAL vs -ENOMEM, etc. Easier for me to find
> the source manually (and possibly even better with Andrii's tool i saw
> once if it would work in the datapath - iirc, i think it prints return
> codes on the code paths).

That should be possible with some work this way, agree. I've been toying a bit
more on this issue, and actually there is an even better way which would cleanly
solve all use cases and we likely would utilize it for bpf as well in future.
I wasn't aware of it before, but the drop reason actually has per-subsystem infra
already which so far only mac80211 and ovs makes use of.

I wrote up below patch as a starting point to get the base infra up and with
TC_DROP_MAX_RECLASSIFY as the initial example on how to utilize it. Then you can
simply just use regular tooling and get more detailed kfree_skb_reason() codes,
which would also remove the need for kprobes/kretprobes to gather the error.

Let me know if this looks like a good path forward, then I'll cook a proper one
and you or Victor can extend it further with more drop reasons. The nice thing is
also that this can be extended successively with more reasons whenever needed.

Best & thanks,
Daniel

 From d62b4a52f9c725d4a63d5c76a576d4e3bbbea4ef Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 6 Oct 2023 08:42:19 +0000
Subject: [PATCH net-next] net, tc: Add extensible drop reason subsystem codes

[ commit msg tbd ]

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
  include/net/dropreason-core.h |  4 ++--
  include/net/dropreason.h      |  6 ++++++
  include/net/sch_drop.h        | 35 +++++++++++++++++++++++++++++++++++
  include/net/sch_generic.h     | 11 +++++++++--
  net/core/dev.c                | 15 ++++++++++-----
  net/sched/cls_api.c           |  1 +
  6 files changed, 63 insertions(+), 9 deletions(-)
  create mode 100644 include/net/sch_drop.h

diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
index a587e83fc169..670eac9923aa 100644
--- a/include/net/dropreason-core.h
+++ b/include/net/dropreason-core.h
@@ -235,7 +235,7 @@ enum skb_drop_reason {
  	SKB_DROP_REASON_NEIGH_QUEUEFULL,
  	/** @SKB_DROP_REASON_NEIGH_DEAD: neigh entry is dead */
  	SKB_DROP_REASON_NEIGH_DEAD,
-	/** @SKB_DROP_REASON_TC_EGRESS: dropped in TC egress HOOK */
+	/** @SKB_DROP_REASON_TC_EGRESS: Unused, see TC_DROP_EGRESS instead */
  	SKB_DROP_REASON_TC_EGRESS,
  	/**
  	 * @SKB_DROP_REASON_QDISC_DROP: dropped by qdisc when packet outputting (
@@ -250,7 +250,7 @@ enum skb_drop_reason {
  	SKB_DROP_REASON_CPU_BACKLOG,
  	/** @SKB_DROP_REASON_XDP: dropped by XDP in input path */
  	SKB_DROP_REASON_XDP,
-	/** @SKB_DROP_REASON_TC_INGRESS: dropped in TC ingress HOOK */
+	/** @SKB_DROP_REASON_TC_INGRESS: Unused, see TC_DROP_INGRESS instead */
  	SKB_DROP_REASON_TC_INGRESS,
  	/** @SKB_DROP_REASON_UNHANDLED_PROTO: protocol not implemented or not supported */
  	SKB_DROP_REASON_UNHANDLED_PROTO,
diff --git a/include/net/dropreason.h b/include/net/dropreason.h
index 56cb7be92244..434ed2124836 100644
--- a/include/net/dropreason.h
+++ b/include/net/dropreason.h
@@ -29,6 +29,12 @@ enum skb_drop_reason_subsys {
  	 */
  	SKB_DROP_REASON_SUBSYS_OPENVSWITCH,

+	/**
+	 * @SKB_DROP_REASON_SUBSYS_TC: traffic control (tc) drop reasons,
+	 * see include/net/sch_drop.h
+	 */
+	SKB_DROP_REASON_SUBSYS_TC,
+
  	/** @SKB_DROP_REASON_SUBSYS_NUM: number of subsystems defined */
  	SKB_DROP_REASON_SUBSYS_NUM
  };
diff --git a/include/net/sch_drop.h b/include/net/sch_drop.h
new file mode 100644
index 000000000000..c2471a62c10b
--- /dev/null
+++ b/include/net/sch_drop.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Traffic control drop reason list. */
+
+#ifndef NET_SCH_DROP_H
+#define NET_SCH_DROP_H
+
+#include <linux/skbuff.h>
+#include <net/dropreason.h>
+
+/**
+ * @TC_DROP_INGRESS: dropped in tc ingress hook (generic, catch-all code)
+ * @TC_DROP_EGRESS: dropped in tc egress hook (generic, catch-all code)
+ * @TC_DROP_MAX_RECLASSIFY: dropped due to hitting maximum reclassify limit
+ */
+#define TC_DROP_REASONS(R)			\
+	R(TC_DROP_INGRESS)			\
+	R(TC_DROP_EGRESS)			\
+	R(TC_DROP_MAX_RECLASSIFY)		\
+	/* deliberate comment for trailing \ */
+
+enum tc_drop_reason {
+	__TC_DROP_REASON = SKB_DROP_REASON_SUBSYS_TC <<
+		SKB_DROP_REASON_SUBSYS_SHIFT,
+#define ENUM(x) x,
+	TC_DROP_REASONS(ENUM)
+#undef ENUM
+	TC_DROP_MAX,
+};
+
+static inline void
+tc_kfree_skb_reason(struct sk_buff *skb, enum tc_drop_reason reason)
+{
+	kfree_skb_reason(skb, (u32)reason);
+}
+#endif /* NET_SCH_DROP_H */
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index c7318c73cfd6..e50a281ff1af 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -16,9 +16,11 @@
  #include <linux/rwsem.h>
  #include <linux/atomic.h>
  #include <linux/hashtable.h>
+
  #include <net/gen_stats.h>
  #include <net/rtnetlink.h>
  #include <net/flow_offload.h>
+#include <net/sch_drop.h>

  struct Qdisc_ops;
  struct qdisc_walker;
@@ -324,15 +326,14 @@ struct Qdisc_ops {
  	struct module		*owner;
  };

-
  struct tcf_result {
+	enum tc_drop_reason		drop_reason;
  	union {
  		struct {
  			unsigned long	class;
  			u32		classid;
  		};
  		const struct tcf_proto *goto_tp;
-
  	};
  };

@@ -667,6 +668,12 @@ static inline int tc_classid_to_hwtc(struct net_device *dev, u32 classid)
  	return (hwtc < netdev_get_num_tc(dev)) ? hwtc : -EINVAL;
  }

+static inline void tc_set_drop_reason(struct tcf_result *res,
+				      enum tc_drop_reason reason)
+{
+	res->drop_reason = reason;
+}
+
  int qdisc_class_hash_init(struct Qdisc_class_hash *);
  void qdisc_class_hash_insert(struct Qdisc_class_hash *,
  			     struct Qdisc_class_common *);
diff --git a/net/core/dev.c b/net/core/dev.c
index 606a366cc209..93cebe374082 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3910,7 +3910,8 @@ EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
  #endif /* CONFIG_NET_EGRESS */

  #ifdef CONFIG_NET_XGRESS
-static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)
+static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
+		  enum tc_drop_reason *drop_reason)
  {
  	int ret = TC_ACT_UNSPEC;
  #ifdef CONFIG_NET_CLS_ACT
@@ -3922,12 +3923,14 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)

  	tc_skb_cb(skb)->mru = 0;
  	tc_skb_cb(skb)->post_ct = false;
+	res.drop_reason = *drop_reason;

  	mini_qdisc_bstats_cpu_update(miniq, skb);
  	ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
  	/* Only tcf related quirks below. */
  	switch (ret) {
  	case TC_ACT_SHOT:
+		*drop_reason = res.drop_reason;
  		mini_qdisc_qstats_cpu_drop(miniq);
  		break;
  	case TC_ACT_OK:
@@ -3977,6 +3980,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
  		   struct net_device *orig_dev, bool *another)
  {
  	struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
+	enum tc_drop_reason drop_reason = TC_DROP_INGRESS;
  	int sch_ret;

  	if (!entry)
@@ -3994,7 +3998,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
  		if (sch_ret != TC_ACT_UNSPEC)
  			goto ingress_verdict;
  	}
-	sch_ret = tc_run(tcx_entry(entry), skb);
+	sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
  ingress_verdict:
  	switch (sch_ret) {
  	case TC_ACT_REDIRECT:
@@ -4011,7 +4015,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
  		*ret = NET_RX_SUCCESS;
  		return NULL;
  	case TC_ACT_SHOT:
-		kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS);
+		tc_kfree_skb_reason(skb, drop_reason);
  		*ret = NET_RX_DROP;
  		return NULL;
  	/* used by tc_run */
@@ -4032,6 +4036,7 @@ static __always_inline struct sk_buff *
  sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
  {
  	struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
+	enum tc_drop_reason drop_reason = TC_DROP_EGRESS;
  	int sch_ret;

  	if (!entry)
@@ -4045,7 +4050,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
  		if (sch_ret != TC_ACT_UNSPEC)
  			goto egress_verdict;
  	}
-	sch_ret = tc_run(tcx_entry(entry), skb);
+	sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
  egress_verdict:
  	switch (sch_ret) {
  	case TC_ACT_REDIRECT:
@@ -4054,7 +4059,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
  		*ret = NET_XMIT_SUCCESS;
  		return NULL;
  	case TC_ACT_SHOT:
-		kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS);
+		tc_kfree_skb_reason(skb, drop_reason);
  		*ret = NET_XMIT_DROP;
  		return NULL;
  	/* used by tc_run */
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index a193cc7b3241..5d56ddb1462f 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -1723,6 +1723,7 @@ static inline int __tcf_classify(struct sk_buff *skb,
  				       tp->chain->block->index,
  				       tp->prio & 0xffff,
  				       ntohs(tp->protocol));
+		tc_set_drop_reason(res, TC_DROP_MAX_RECLASSIFY);
  		return TC_ACT_SHOT;
  	}
Jakub Kicinski Oct. 6, 2023, 1:32 p.m. UTC | #12
On Fri, 6 Oct 2023 13:18:40 +0200 Daniel Borkmann wrote:
> That should be possible with some work this way, agree. I've been toying a bit
> more on this issue, and actually there is an even better way which would cleanly
> solve all use cases and we likely would utilize it for bpf as well in future.
> I wasn't aware of it before, but the drop reason actually has per-subsystem infra
> already which so far only mac80211 and ovs makes use of.

FWIW I'm not sure if leaning into the subsys specific error codes for
something as core as TC is a good direction. I'd think that what
matters to the user is was it an intentional policy drop or some form
of an error / exception. More detailed info can come from stats.

Maybe I'm overly conservative because I don't care about debugging
mac80211 or ovs but do very much care about TC :) And I think Alastair 
(bpftrace) is working on auto-prettifying enums when bpftrace outputs
maps. So we can do something like:

$ bpftrace -e 'tracepoint:skb:kfree_skb { @[args->reason] = count(); }'
Attaching 1 probe...
^C

@[SKB_DROP_REASON_TC_INGRESS]: 2
@[SKB_CONSUMED]: 34

  ^^^^^^^^^^^^ names!!

Auto-magically.

Which will no longer work with the "pack multiple values into 
the reason" scheme of subsys-specific values :(

What I'm saying is that there is a trade-off here between providing 
as much info as possible vs basic user getting intelligible data..
Daniel Borkmann Oct. 6, 2023, 1:49 p.m. UTC | #13
On 10/6/23 3:32 PM, Jakub Kicinski wrote:
> On Fri, 6 Oct 2023 13:18:40 +0200 Daniel Borkmann wrote:
>> That should be possible with some work this way, agree. I've been toying a bit
>> more on this issue, and actually there is an even better way which would cleanly
>> solve all use cases and we likely would utilize it for bpf as well in future.
>> I wasn't aware of it before, but the drop reason actually has per-subsystem infra
>> already which so far only mac80211 and ovs makes use of.
> 
> FWIW I'm not sure if leaning into the subsys specific error codes for
> something as core as TC is a good direction. I'd think that what
> matters to the user is was it an intentional policy drop or some form
> of an error / exception. More detailed info can come from stats.
> 
> Maybe I'm overly conservative because I don't care about debugging
> mac80211 or ovs but do very much care about TC :) And I think Alastair
> (bpftrace) is working on auto-prettifying enums when bpftrace outputs
> maps. So we can do something like:
> 
> $ bpftrace -e 'tracepoint:skb:kfree_skb { @[args->reason] = count(); }'
> Attaching 1 probe...
> ^C
> 
> @[SKB_DROP_REASON_TC_INGRESS]: 2
> @[SKB_CONSUMED]: 34
> 
>    ^^^^^^^^^^^^ names!!
> 
> Auto-magically.

Very cool!

> Which will no longer work with the "pack multiple values into
> the reason" scheme of subsys-specific values :(

Too bad, do you happen to know why it won't work? Given they went into the
length of extending this for subsystems, they presumably would also like to
benefit from above. :/

> What I'm saying is that there is a trade-off here between providing
> as much info as possible vs basic user getting intelligible data..

Makes sense. I think we can drop that aspect for the subsys specific error
codes. Fwiw, TCP has 22 drop codes in the core section alone, so this should
be fine if you think it's better. The rest of the patch shown should still
apply the same way. I can tweak it to use the core section for codes, and
then it can be successively extended if that looks good to you - unless you
are saying from above, that just one error code is better and then going via
detailed stats for specific errors is preferred.

Thanks,
Daniel
Jakub Kicinski Oct. 6, 2023, 2:12 p.m. UTC | #14
On Fri, 6 Oct 2023 15:49:18 +0200 Daniel Borkmann wrote:
> > Which will no longer work with the "pack multiple values into
> > the reason" scheme of subsys-specific values :(  
> 
> Too bad, do you happen to know why it won't work? 

I'm just guessing but the reason is enum skb_drop_reason
and the values of subsystem specific reasons won't be part
of that enum.

> Given they went into the
> length of extending this for subsystems, they presumably would also like to
> benefit from above. :/
> 
> > What I'm saying is that there is a trade-off here between providing
> > as much info as possible vs basic user getting intelligible data..  
> 
> Makes sense. I think we can drop that aspect for the subsys specific error
> codes. Fwiw, TCP has 22 drop codes in the core section alone, so this should
> be fine if you think it's better. The rest of the patch shown should still
> apply the same way. I can tweak it to use the core section for codes, and
> then it can be successively extended if that looks good to you - unless you
> are saying from above, that just one error code is better and then going via
> detailed stats for specific errors is preferred.

No, no, multiple reasons are perfectly fine. The non-technical
advantage of mac80211 error codes being separate is that there
are no git conflicts when we add new ones. TC codes can just 
be added to the main enum like TCP 
Jamal Hadi Salim Oct. 6, 2023, 3:25 p.m. UTC | #15
On Fri, Oct 6, 2023 at 10:12 AM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Fri, 6 Oct 2023 15:49:18 +0200 Daniel Borkmann wrote:
> > > Which will no longer work with the "pack multiple values into
> > > the reason" scheme of subsys-specific values :(
> >
> > Too bad, do you happen to know why it won't work?
>
> I'm just guessing but the reason is enum skb_drop_reason
> and the values of subsystem specific reasons won't be part
> of that enum.

IIUC, this would gives us the readability and never require any
changes to bpftrace, whereas the major:minor encoding would require
further logic in bpftrace.

> > Given they went into the
> > length of extending this for subsystems, they presumably would also like to
> > benefit from above. :/
> >
> > > What I'm saying is that there is a trade-off here between providing
> > > as much info as possible vs basic user getting intelligible data..
> >
> > Makes sense. I think we can drop that aspect for the subsys specific error
> > codes. Fwiw, TCP has 22 drop codes in the core section alone, so this should
> > be fine if you think it's better. The rest of the patch shown should still
> > apply the same way. I can tweak it to use the core section for codes, and
> > then it can be successively extended if that looks good to you - unless you
> > are saying from above, that just one error code is better and then going via
> > detailed stats for specific errors is preferred.
>
> No, no, multiple reasons are perfectly fine. The non-technical
> advantage of mac80211 error codes being separate is that there
> are no git conflicts when we add new ones. TC codes can just
> be added to the main enum like TCP 
Daniel Borkmann Oct. 6, 2023, 3:45 p.m. UTC | #16
On 10/6/23 5:25 PM, Jamal Hadi Salim wrote:
> On Fri, Oct 6, 2023 at 10:12 AM Jakub Kicinski <kuba@kernel.org> wrote:
>> On Fri, 6 Oct 2023 15:49:18 +0200 Daniel Borkmann wrote:
>>>> Which will no longer work with the "pack multiple values into
>>>> the reason" scheme of subsys-specific values :(
>>>
>>> Too bad, do you happen to know why it won't work?
>>
>> I'm just guessing but the reason is enum skb_drop_reason
>> and the values of subsystem specific reasons won't be part
>> of that enum.
> 
> IIUC, this would gives us the readability and never require any
> changes to bpftrace, whereas the major:minor encoding would require
> further logic in bpftrace.

Makes sense, agree.

>>> Given they went into the
>>> length of extending this for subsystems, they presumably would also like to
>>> benefit from above. :/
>>>
>>>> What I'm saying is that there is a trade-off here between providing
>>>> as much info as possible vs basic user getting intelligible data..
>>>
>>> Makes sense. I think we can drop that aspect for the subsys specific error
>>> codes. Fwiw, TCP has 22 drop codes in the core section alone, so this should
>>> be fine if you think it's better. The rest of the patch shown should still
>>> apply the same way. I can tweak it to use the core section for codes, and
>>> then it can be successively extended if that looks good to you - unless you
>>> are saying from above, that just one error code is better and then going via
>>> detailed stats for specific errors is preferred.
>>
>> No, no, multiple reasons are perfectly fine. The non-technical
>> advantage of mac80211 error codes being separate is that there
>> are no git conflicts when we add new ones. TC codes can just
>> be added to the main enum like TCP 
Daniel Xu Oct. 6, 2023, 5:59 p.m. UTC | #17
On Fri, Oct 06, 2023 at 07:12:15AM -0700, Jakub Kicinski wrote:
> On Fri, 6 Oct 2023 15:49:18 +0200 Daniel Borkmann wrote:
> > > Which will no longer work with the "pack multiple values into
> > > the reason" scheme of subsys-specific values :(  
> > 
> > Too bad, do you happen to know why it won't work? 
> 
> I'm just guessing but the reason is enum skb_drop_reason
> and the values of subsystem specific reasons won't be part
> of that enum.

Yeah, looks like the subsystem reasons are different enums right?
There's probably a way to still support it in bpftrace but it might take
some minor changes and/or ugly conditionals in scripts.

But I also wonder: why are the subsystem error codes not included into
`enum skb_drop_reason`? It looks like the enum space is partitioned
already. And the modules have already registered themselves into core
kernel (in `enum skb_drop_reason_subsys`). So even if modules are
compiled out there are still hints of it laying around vmlinux.

Thanks,
Daniel

[..]
Jamal Hadi Salim Oct. 6, 2023, 7:39 p.m. UTC | #18
On Fri, Oct 6, 2023 at 11:45 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 10/6/23 5:25 PM, Jamal Hadi Salim wrote:
> > On Fri, Oct 6, 2023 at 10:12 AM Jakub Kicinski <kuba@kernel.org> wrote:
> >> On Fri, 6 Oct 2023 15:49:18 +0200 Daniel Borkmann wrote:
> >>>> Which will no longer work with the "pack multiple values into
> >>>> the reason" scheme of subsys-specific values :(
> >>>
> >>> Too bad, do you happen to know why it won't work?
> >>
> >> I'm just guessing but the reason is enum skb_drop_reason
> >> and the values of subsystem specific reasons won't be part
> >> of that enum.
> >
> > IIUC, this would gives us the readability and never require any
> > changes to bpftrace, whereas the major:minor encoding would require
> > further logic in bpftrace.
>
> Makes sense, agree.
>
> >>> Given they went into the
> >>> length of extending this for subsystems, they presumably would also like to
> >>> benefit from above. :/
> >>>
> >>>> What I'm saying is that there is a trade-off here between providing
> >>>> as much info as possible vs basic user getting intelligible data..
> >>>
> >>> Makes sense. I think we can drop that aspect for the subsys specific error
> >>> codes. Fwiw, TCP has 22 drop codes in the core section alone, so this should
> >>> be fine if you think it's better. The rest of the patch shown should still
> >>> apply the same way. I can tweak it to use the core section for codes, and
> >>> then it can be successively extended if that looks good to you - unless you
> >>> are saying from above, that just one error code is better and then going via
> >>> detailed stats for specific errors is preferred.
> >>
> >> No, no, multiple reasons are perfectly fine. The non-technical
> >> advantage of mac80211 error codes being separate is that there
> >> are no git conflicts when we add new ones. TC codes can just
> >> be added to the main enum like TCP 
Daniel Borkmann Oct. 6, 2023, 7:59 p.m. UTC | #19
On 10/6/23 9:39 PM, Jamal Hadi Salim wrote:
> On Fri, Oct 6, 2023 at 11:45 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>> On 10/6/23 5:25 PM, Jamal Hadi Salim wrote:
>>> On Fri, Oct 6, 2023 at 10:12 AM Jakub Kicinski <kuba@kernel.org> wrote:
>>>> On Fri, 6 Oct 2023 15:49:18 +0200 Daniel Borkmann wrote:
>>>>>> Which will no longer work with the "pack multiple values into
>>>>>> the reason" scheme of subsys-specific values :(
>>>>>
>>>>> Too bad, do you happen to know why it won't work?
>>>>
>>>> I'm just guessing but the reason is enum skb_drop_reason
>>>> and the values of subsystem specific reasons won't be part
>>>> of that enum.
>>>
>>> IIUC, this would gives us the readability and never require any
>>> changes to bpftrace, whereas the major:minor encoding would require
>>> further logic in bpftrace.
>>
>> Makes sense, agree.
>>
>>>>> Given they went into the
>>>>> length of extending this for subsystems, they presumably would also like to
>>>>> benefit from above. :/
>>>>>
>>>>>> What I'm saying is that there is a trade-off here between providing
>>>>>> as much info as possible vs basic user getting intelligible data..
>>>>>
>>>>> Makes sense. I think we can drop that aspect for the subsys specific error
>>>>> codes. Fwiw, TCP has 22 drop codes in the core section alone, so this should
>>>>> be fine if you think it's better. The rest of the patch shown should still
>>>>> apply the same way. I can tweak it to use the core section for codes, and
>>>>> then it can be successively extended if that looks good to you - unless you
>>>>> are saying from above, that just one error code is better and then going via
>>>>> detailed stats for specific errors is preferred.
>>>>
>>>> No, no, multiple reasons are perfectly fine. The non-technical
>>>> advantage of mac80211 error codes being separate is that there
>>>> are no git conflicts when we add new ones. TC codes can just
>>>> be added to the main enum like TCP 
diff mbox series

Patch

diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
index a587e83fc169..b1c069c8e7f2 100644
--- a/include/net/dropreason-core.h
+++ b/include/net/dropreason-core.h
@@ -80,6 +80,8 @@ 
 	FN(IPV6_NDISC_BAD_OPTIONS)	\
 	FN(IPV6_NDISC_NS_OTHERHOST)	\
 	FN(QUEUE_PURGE)			\
+	FN(TC_EGRESS_ERROR)		\
+	FN(TC_INGRESS_ERROR)		\
 	FNe(MAX)
 
 /**
@@ -345,6 +347,10 @@  enum skb_drop_reason {
 	SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST,
 	/** @SKB_DROP_REASON_QUEUE_PURGE: bulk free. */
 	SKB_DROP_REASON_QUEUE_PURGE,
+	/** @SKB_DROP_REASON_TC_EGRESS_ERROR: dropped in TC egress HOOK due to error */
+	SKB_DROP_REASON_TC_EGRESS_ERROR,
+	/** @SKB_DROP_REASON_TC_INGRESS_ERROR: dropped in TC ingress HOOK due to error */
+	SKB_DROP_REASON_TC_INGRESS_ERROR,
 	/**
 	 * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which
 	 * shouldn't be used as a real 'reason' - only for tracing code gen
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index f232512505f8..9a3f71d2545e 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -326,6 +326,7 @@  struct Qdisc_ops {
 
 
 struct tcf_result {
+	u32 verdict;
 	union {
 		struct {
 			unsigned long	class;
@@ -336,6 +337,12 @@  struct tcf_result {
 	};
 };
 
+static inline void tcf_result_set_verdict(struct tcf_result *res,
+					  const u32 verdict)
+{
+	res->verdict = verdict;
+}
+
 struct tcf_chain;
 
 struct tcf_proto_ops {
diff --git a/net/core/dev.c b/net/core/dev.c
index ccff2b6ef958..1450f4741d9b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3910,31 +3910,39 @@  EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
 #endif /* CONFIG_NET_EGRESS */
 
 #ifdef CONFIG_NET_XGRESS
-static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)
+static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
+		  struct tcf_result *res)
 {
-	int ret = TC_ACT_UNSPEC;
+	int ret = 0;
 #ifdef CONFIG_NET_CLS_ACT
 	struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq);
-	struct tcf_result res;
 
-	if (!miniq)
+	if (!miniq) {
+		tcf_result_set_verdict(res, TC_ACT_UNSPEC);
 		return ret;
+	}
 
 	tc_skb_cb(skb)->mru = 0;
 	tc_skb_cb(skb)->post_ct = false;
 
 	mini_qdisc_bstats_cpu_update(miniq, skb);
-	ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
+	ret = tcf_classify(skb, miniq->block, miniq->filter_list, res, false);
+	if (ret < 0) {
+		mini_qdisc_qstats_cpu_drop(miniq);
+		return ret;
+	}
 	/* Only tcf related quirks below. */
-	switch (ret) {
+	switch (res->verdict) {
 	case TC_ACT_SHOT:
 		mini_qdisc_qstats_cpu_drop(miniq);
 		break;
 	case TC_ACT_OK:
 	case TC_ACT_RECLASSIFY:
-		skb->tc_index = TC_H_MIN(res.classid);
+		skb->tc_index = TC_H_MIN(res->classid);
 		break;
 	}
+#else
+	tcf_result_set_verdict(res, TC_ACT_UNSPEC);
 #endif /* CONFIG_NET_CLS_ACT */
 	return ret;
 }
@@ -3977,6 +3985,7 @@  sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
 		   struct net_device *orig_dev, bool *another)
 {
 	struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
+	struct tcf_result res = {0};
 	int sch_ret;
 
 	if (!entry)
@@ -3994,9 +4003,14 @@  sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
 		if (sch_ret != TC_ACT_UNSPEC)
 			goto ingress_verdict;
 	}
-	sch_ret = tc_run(tcx_entry(entry), skb);
+	sch_ret = tc_run(tcx_entry(entry), skb, &res);
+	if (sch_ret < 0) {
+		kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS_ERROR);
+		*ret = NET_RX_DROP;
+		return NULL;
+	}
 ingress_verdict:
-	switch (sch_ret) {
+	switch (res.verdict) {
 	case TC_ACT_REDIRECT:
 		/* skb_mac_header check was done by BPF, so we can safely
 		 * push the L2 header back before redirecting to another
@@ -4032,6 +4046,7 @@  static __always_inline struct sk_buff *
 sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 {
 	struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
+	struct tcf_result res = {0};
 	int sch_ret;
 
 	if (!entry)
@@ -4045,9 +4060,14 @@  sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 		if (sch_ret != TC_ACT_UNSPEC)
 			goto egress_verdict;
 	}
-	sch_ret = tc_run(tcx_entry(entry), skb);
+	sch_ret = tc_run(tcx_entry(entry), skb, &res);
+	if (sch_ret < 0) {
+		kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS_ERROR);
+		*ret = NET_XMIT_DROP;
+		return NULL;
+	}
 egress_verdict:
-	switch (sch_ret) {
+	switch (res.verdict) {
 	case TC_ACT_REDIRECT:
 		/* No need to push/pop skb's mac_header here on egress! */
 		skb_do_redirect(skb);
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index a193cc7b3241..ebd031f3ac5a 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -1682,11 +1682,11 @@  static inline int __tcf_classify(struct sk_buff *skb,
 			 */
 			if (unlikely(n->tp != tp || n->tp->chain != n->chain ||
 				     !tp->ops->get_exts))
-				return TC_ACT_SHOT;
+				return -EINVAL;
 
 			exts = tp->ops->get_exts(tp, n->handle);
 			if (unlikely(!exts || n->exts != exts))
-				return TC_ACT_SHOT;
+				return -EINVAL;
 
 			n = NULL;
 			err = tcf_exts_exec_ex(skb, exts, act_index, res);
@@ -1708,14 +1708,17 @@  static inline int __tcf_classify(struct sk_buff *skb,
 			goto reset;
 		}
 #endif
-		if (err >= 0)
-			return err;
+		if (err >= 0) {
+			tcf_result_set_verdict(res, err);
+			return 0;
+		}
 	}
 
 	if (unlikely(n))
-		return TC_ACT_SHOT;
+		return -ENOENT;
 
-	return TC_ACT_UNSPEC; /* signal: continue lookup */
+	tcf_result_set_verdict(res, TC_ACT_UNSPEC);
+	return 0;
 #ifdef CONFIG_NET_CLS_ACT
 reset:
 	if (unlikely(limit++ >= max_reclassify_loop)) {
@@ -1723,7 +1726,7 @@  static inline int __tcf_classify(struct sk_buff *skb,
 				       tp->chain->block->index,
 				       tp->prio & 0xffff,
 				       ntohs(tp->protocol));
-		return TC_ACT_SHOT;
+		return -ELOOP;
 	}
 
 	tp = first_tp;
@@ -1760,7 +1763,7 @@  int tcf_classify(struct sk_buff *skb,
 				n = tcf_exts_miss_cookie_lookup(ext->act_miss_cookie,
 								&act_index);
 				if (!n)
-					return TC_ACT_SHOT;
+					return -ENOENT;
 
 				chain = n->chain_index;
 			} else {
@@ -1769,7 +1772,7 @@  int tcf_classify(struct sk_buff *skb,
 
 			fchain = tcf_chain_lookup_rcu(block, chain);
 			if (!fchain)
-				return TC_ACT_SHOT;
+				return -ENOENT;
 
 			/* Consume, so cloned/redirect skbs won't inherit ext */
 			skb_ext_del(skb, TC_SKB_EXT);
@@ -1784,12 +1787,13 @@  int tcf_classify(struct sk_buff *skb,
 
 	if (tc_skb_ext_tc_enabled()) {
 		/* If we missed on some chain */
-		if (ret == TC_ACT_UNSPEC && last_executed_chain) {
+		if (res->verdict == TC_ACT_UNSPEC && last_executed_chain) {
 			struct tc_skb_cb *cb = tc_skb_cb(skb);
 
 			ext = tc_skb_ext_alloc(skb);
 			if (WARN_ON_ONCE(!ext))
-				return TC_ACT_SHOT;
+				return -ENOMEM;
+
 			ext->chain = last_executed_chain;
 			ext->mru = cb->mru;
 			ext->post_ct = cb->post_ct;
@@ -3896,15 +3900,23 @@  EXPORT_SYMBOL(tcf_qevent_validate_change);
 struct sk_buff *tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, struct sk_buff *skb,
 				  struct sk_buff **to_free, int *ret)
 {
-	struct tcf_result cl_res;
+	struct tcf_result cl_res = {0};
 	struct tcf_proto *fl;
+	int res;
 
 	if (!qe->info.block_index)
 		return skb;
 
 	fl = rcu_dereference_bh(qe->filter_chain);
 
-	switch (tcf_classify(skb, NULL, fl, &cl_res, false)) {
+	res = tcf_classify(skb, NULL, fl, &cl_res, false);
+	if (res < 0) {
+		qdisc_qstats_drop(sch);
+		__qdisc_drop(skb, to_free);
+		*ret = __NET_XMIT_BYPASS;
+		return NULL;
+	}
+	switch (cl_res.verdict) {
 	case TC_ACT_SHOT:
 		qdisc_qstats_drop(sch);
 		__qdisc_drop(skb, to_free);
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index 9cff99558694..359cf7303b09 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -1656,8 +1656,8 @@  static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
 			 struct sk_buff *skb, int flow_mode, int *qerr)
 {
 	struct cake_sched_data *q = qdisc_priv(sch);
+	struct tcf_result res = {0};
 	struct tcf_proto *filter;
-	struct tcf_result res;
 	u16 flow = 0, host = 0;
 	int result;
 
@@ -1667,24 +1667,24 @@  static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 	result = tcf_classify(skb, NULL, filter, &res, false);
+	if (result < 0)
+		return result;
 
-	if (result >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
-		switch (result) {
-		case TC_ACT_STOLEN:
-		case TC_ACT_QUEUED:
-		case TC_ACT_TRAP:
-			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
-			fallthrough;
-		case TC_ACT_SHOT:
-			return 0;
-		}
-#endif
-		if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
-			flow = TC_H_MIN(res.classid);
-		if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
-			host = TC_H_MAJ(res.classid) >> 16;
+	switch (res.verdict) {
+	case TC_ACT_STOLEN:
+	case TC_ACT_QUEUED:
+	case TC_ACT_TRAP:
+		*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+		fallthrough;
+	case TC_ACT_SHOT:
+		return 0;
 	}
+#endif
+	if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
+		flow = TC_H_MIN(res.classid);
+	if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
+		host = TC_H_MAJ(res.classid) >> 16;
 hash:
 	*t = cake_select_tin(sch, skb);
 	return cake_hash(*t, skb, flow_mode, flow, host) + 1;
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index 19901e77cd3b..39d2dd411cbe 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -295,8 +295,8 @@  static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch,
 				      int *qerr)
 {
 	struct drr_sched *q = qdisc_priv(sch);
+	struct tcf_result res = {0};
 	struct drr_class *cl;
-	struct tcf_result res;
 	struct tcf_proto *fl;
 	int result;
 
@@ -309,24 +309,23 @@  static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch,
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 	fl = rcu_dereference_bh(q->filter_list);
 	result = tcf_classify(skb, NULL, fl, &res, false);
-	if (result >= 0) {
+	if (result < 0)
+		return NULL;
 #ifdef CONFIG_NET_CLS_ACT
-		switch (result) {
-		case TC_ACT_QUEUED:
-		case TC_ACT_STOLEN:
-		case TC_ACT_TRAP:
-			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
-			fallthrough;
-		case TC_ACT_SHOT:
-			return NULL;
-		}
-#endif
-		cl = (struct drr_class *)res.class;
-		if (cl == NULL)
-			cl = drr_find_class(sch, res.classid);
-		return cl;
+	switch (res.verdict) {
+	case TC_ACT_QUEUED:
+	case TC_ACT_STOLEN:
+	case TC_ACT_TRAP:
+		*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+		fallthrough;
+	case TC_ACT_SHOT:
+		return NULL;
 	}
-	return NULL;
+#endif
+	cl = (struct drr_class *)res.class;
+	if (cl == NULL)
+		cl = drr_find_class(sch, res.classid);
+	return cl;
 }
 
 static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch,
diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c
index b10efeaf0629..cc73d4f96fdc 100644
--- a/net/sched/sch_ets.c
+++ b/net/sched/sch_ets.c
@@ -374,8 +374,8 @@  static struct ets_class *ets_classify(struct sk_buff *skb, struct Qdisc *sch,
 				      int *qerr)
 {
 	struct ets_sched *q = qdisc_priv(sch);
+	struct tcf_result res = {0};
 	u32 band = skb->priority;
-	struct tcf_result res;
 	struct tcf_proto *fl;
 	int err;
 
@@ -383,8 +383,10 @@  static struct ets_class *ets_classify(struct sk_buff *skb, struct Qdisc *sch,
 	if (TC_H_MAJ(skb->priority) != sch->handle) {
 		fl = rcu_dereference_bh(q->filter_list);
 		err = tcf_classify(skb, NULL, fl, &res, false);
+		if (err < 0)
+			return NULL;
 #ifdef CONFIG_NET_CLS_ACT
-		switch (err) {
+		switch (res.verdict) {
 		case TC_ACT_STOLEN:
 		case TC_ACT_QUEUED:
 		case TC_ACT_TRAP:
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 8c4fee063436..d70d0585f769 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -77,8 +77,8 @@  static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch,
 				      int *qerr)
 {
 	struct fq_codel_sched_data *q = qdisc_priv(sch);
+	struct tcf_result res = {0};
 	struct tcf_proto *filter;
-	struct tcf_result res;
 	int result;
 
 	if (TC_H_MAJ(skb->priority) == sch->handle &&
@@ -92,21 +92,22 @@  static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch,
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 	result = tcf_classify(skb, NULL, filter, &res, false);
-	if (result >= 0) {
+	if (result < 0)
+		return 0;
+
 #ifdef CONFIG_NET_CLS_ACT
-		switch (result) {
-		case TC_ACT_STOLEN:
-		case TC_ACT_QUEUED:
-		case TC_ACT_TRAP:
-			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
-			fallthrough;
-		case TC_ACT_SHOT:
-			return 0;
-		}
-#endif
-		if (TC_H_MIN(res.classid) <= q->flows_cnt)
-			return TC_H_MIN(res.classid);
+	switch (res.verdict) {
+	case TC_ACT_STOLEN:
+	case TC_ACT_QUEUED:
+	case TC_ACT_TRAP:
+		*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+		fallthrough;
+	case TC_ACT_SHOT:
+		return 0;
 	}
+#endif
+	if (TC_H_MIN(res.classid) <= q->flows_cnt)
+		return TC_H_MIN(res.classid);
 	return 0;
 }
 
diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c
index 68e6acd0f130..63d87ea2f187 100644
--- a/net/sched/sch_fq_pie.c
+++ b/net/sched/sch_fq_pie.c
@@ -81,8 +81,8 @@  static unsigned int fq_pie_classify(struct sk_buff *skb, struct Qdisc *sch,
 				    int *qerr)
 {
 	struct fq_pie_sched_data *q = qdisc_priv(sch);
+	struct tcf_result res = {0};
 	struct tcf_proto *filter;
-	struct tcf_result res;
 	int result;
 
 	if (TC_H_MAJ(skb->priority) == sch->handle &&
@@ -96,21 +96,21 @@  static unsigned int fq_pie_classify(struct sk_buff *skb, struct Qdisc *sch,
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 	result = tcf_classify(skb, NULL, filter, &res, false);
-	if (result >= 0) {
+	if (result < 0)
+		return 0;
 #ifdef CONFIG_NET_CLS_ACT
-		switch (result) {
-		case TC_ACT_STOLEN:
-		case TC_ACT_QUEUED:
-		case TC_ACT_TRAP:
-			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
-			fallthrough;
-		case TC_ACT_SHOT:
-			return 0;
-		}
-#endif
-		if (TC_H_MIN(res.classid) <= q->flows_cnt)
-			return TC_H_MIN(res.classid);
+	switch (res.verdict) {
+	case TC_ACT_STOLEN:
+	case TC_ACT_QUEUED:
+	case TC_ACT_TRAP:
+		*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+		fallthrough;
+	case TC_ACT_SHOT:
+		return 0;
 	}
+#endif
+	if (TC_H_MIN(res.classid) <= q->flows_cnt)
+		return TC_H_MIN(res.classid);
 	return 0;
 }
 
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 3554085bc2be..9a45596b87bf 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1122,7 +1122,7 @@  hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 {
 	struct hfsc_sched *q = qdisc_priv(sch);
 	struct hfsc_class *head, *cl;
-	struct tcf_result res;
+	struct tcf_result res = {0};
 	struct tcf_proto *tcf;
 	int result;
 
@@ -1135,8 +1135,10 @@  hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 	head = &q->root;
 	tcf = rcu_dereference_bh(q->root.filter_list);
 	while (tcf && (result = tcf_classify(skb, NULL, tcf, &res, false)) >= 0) {
+		if (result < 0)
+			return NULL;
 #ifdef CONFIG_NET_CLS_ACT
-		switch (result) {
+		switch (res.verdict) {
 		case TC_ACT_QUEUED:
 		case TC_ACT_STOLEN:
 		case TC_ACT_TRAP:
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 0d947414e616..8ebb56e4ea91 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -220,8 +220,8 @@  static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
 				      int *qerr)
 {
 	struct htb_sched *q = qdisc_priv(sch);
+	struct tcf_result res = {0};
 	struct htb_class *cl;
-	struct tcf_result res;
 	struct tcf_proto *tcf;
 	int result;
 
@@ -243,8 +243,10 @@  static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 	while (tcf && (result = tcf_classify(skb, NULL, tcf, &res, false)) >= 0) {
+		if (result < 0)
+			return NULL;
 #ifdef CONFIG_NET_CLS_ACT
-		switch (result) {
+		switch (res.verdict) {
 		case TC_ACT_QUEUED:
 		case TC_ACT_STOLEN:
 		case TC_ACT_TRAP:
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 75c9c860182b..07c5247e9c50 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -31,14 +31,16 @@  multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 {
 	struct multiq_sched_data *q = qdisc_priv(sch);
 	u32 band;
-	struct tcf_result res;
 	struct tcf_proto *fl = rcu_dereference_bh(q->filter_list);
+	struct tcf_result res = {0};
 	int err;
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 	err = tcf_classify(skb, NULL, fl, &res, false);
+	if (err < 0)
+		return NULL;
 #ifdef CONFIG_NET_CLS_ACT
-	switch (err) {
+	switch (res.verdict) {
 	case TC_ACT_STOLEN:
 	case TC_ACT_QUEUED:
 	case TC_ACT_TRAP:
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index fdc5ef52c3ee..9abd093b7941 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -31,8 +31,8 @@  static struct Qdisc *
 prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 {
 	struct prio_sched_data *q = qdisc_priv(sch);
+	struct tcf_result res = {0};
 	u32 band = skb->priority;
-	struct tcf_result res;
 	struct tcf_proto *fl;
 	int err;
 
@@ -40,8 +40,11 @@  prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 	if (TC_H_MAJ(skb->priority) != sch->handle) {
 		fl = rcu_dereference_bh(q->filter_list);
 		err = tcf_classify(skb, NULL, fl, &res, false);
+		if (err < 0)
+			return NULL;
+
 #ifdef CONFIG_NET_CLS_ACT
-		switch (err) {
+		switch (res.verdict) {
 		case TC_ACT_STOLEN:
 		case TC_ACT_QUEUED:
 		case TC_ACT_TRAP:
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 546c10adcacd..9874aefa1994 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -680,8 +680,8 @@  static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 				      int *qerr)
 {
 	struct qfq_sched *q = qdisc_priv(sch);
+	struct tcf_result res = {0};
 	struct qfq_class *cl;
-	struct tcf_result res;
 	struct tcf_proto *fl;
 	int result;
 
@@ -695,25 +695,23 @@  static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 	fl = rcu_dereference_bh(q->filter_list);
 	result = tcf_classify(skb, NULL, fl, &res, false);
-	if (result >= 0) {
+	if (result < 0)
+		return NULL;
 #ifdef CONFIG_NET_CLS_ACT
-		switch (result) {
-		case TC_ACT_QUEUED:
-		case TC_ACT_STOLEN:
-		case TC_ACT_TRAP:
-			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
-			fallthrough;
-		case TC_ACT_SHOT:
-			return NULL;
-		}
-#endif
-		cl = (struct qfq_class *)res.class;
-		if (cl == NULL)
-			cl = qfq_find_class(sch, res.classid);
-		return cl;
+	switch (res.verdict) {
+	case TC_ACT_QUEUED:
+	case TC_ACT_STOLEN:
+	case TC_ACT_TRAP:
+		*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+		fallthrough;
+	case TC_ACT_SHOT:
+		return NULL;
 	}
-
-	return NULL;
+#endif
+	cl = (struct qfq_class *)res.class;
+	if (cl == NULL)
+		cl = qfq_find_class(sch, res.classid);
+	return cl;
 }
 
 /* Generic comparison function, handling wraparound. */
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index 1871a1c0224d..e1085c9f8925 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -254,26 +254,25 @@  static bool sfb_rate_limit(struct sk_buff *skb, struct sfb_sched_data *q)
 static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl,
 			 int *qerr, u32 *salt)
 {
-	struct tcf_result res;
+	struct tcf_result res = {0};
 	int result;
 
 	result = tcf_classify(skb, NULL, fl, &res, false);
-	if (result >= 0) {
+	if (result < 0)
+		return false;
 #ifdef CONFIG_NET_CLS_ACT
-		switch (result) {
-		case TC_ACT_STOLEN:
-		case TC_ACT_QUEUED:
-		case TC_ACT_TRAP:
-			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
-			fallthrough;
-		case TC_ACT_SHOT:
-			return false;
-		}
-#endif
-		*salt = TC_H_MIN(res.classid);
-		return true;
+	switch (res.verdict) {
+	case TC_ACT_STOLEN:
+	case TC_ACT_QUEUED:
+	case TC_ACT_TRAP:
+		*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+		fallthrough;
+	case TC_ACT_SHOT:
+		return false;
 	}
-	return false;
+#endif
+	*salt = TC_H_MIN(res.classid);
+	return true;
 }
 
 static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 66dcb18638fe..c20a4c8d0785 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -164,7 +164,7 @@  static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 				 int *qerr)
 {
 	struct sfq_sched_data *q = qdisc_priv(sch);
-	struct tcf_result res;
+	struct tcf_result res = {0};
 	struct tcf_proto *fl;
 	int result;
 
@@ -179,21 +179,21 @@  static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 	result = tcf_classify(skb, NULL, fl, &res, false);
-	if (result >= 0) {
+	if (result < 0)
+		return 0;
 #ifdef CONFIG_NET_CLS_ACT
-		switch (result) {
-		case TC_ACT_STOLEN:
-		case TC_ACT_QUEUED:
-		case TC_ACT_TRAP:
-			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
-			fallthrough;
-		case TC_ACT_SHOT:
-			return 0;
-		}
-#endif
-		if (TC_H_MIN(res.classid) <= q->divisor)
-			return TC_H_MIN(res.classid);
+	switch (res.verdict) {
+	case TC_ACT_STOLEN:
+	case TC_ACT_QUEUED:
+	case TC_ACT_TRAP:
+		*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+		fallthrough;
+	case TC_ACT_SHOT:
+		return 0;
 	}
+#endif
+	if (TC_H_MIN(res.classid) <= q->divisor)
+		return TC_H_MIN(res.classid);
 	return 0;
 }