diff mbox series

[v2,net-next,21/26] ice: add XDP and XSK generic per-channel statistics

Message ID 20211123163955.154512-22-alexandr.lobakin@intel.com (mailing list archive)
State Not Applicable
Headers show
Series net: introduce and use generic XDP stats | expand

Commit Message

Alexander Lobakin Nov. 23, 2021, 4:39 p.m. UTC
Make ice driver collect and provide all generic XDP/XSK counters.
Unfortunately, XDP rings have a lifetime of an XDP prog, and all
ring stats structures get wiped on xsk_pool attach/detach, so
store them in a separate array with a lifetime of a VSI. New
alloc_xdp_stats field is used to calculate the maximum possible
number of XDP-enabled queues just once and refer to it later.
Reuse all previously introduced helpers and
xdp_get_drv_stats_generic(). Performance wavering from incrementing
a bunch of counters on hotpath is around stddev at [64 ... 1532]
frame sizes.

Signed-off-by: Alexander Lobakin <alexandr.lobakin@intel.com>
Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Reviewed-by: Michal Swiatkowski <michal.swiatkowski@linux.intel.com>
Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
---
 drivers/net/ethernet/intel/ice/ice.h          |  2 +
 drivers/net/ethernet/intel/ice/ice_lib.c      | 21 ++++++++
 drivers/net/ethernet/intel/ice/ice_main.c     | 17 +++++++
 drivers/net/ethernet/intel/ice/ice_txrx.c     | 33 +++++++++---
 drivers/net/ethernet/intel/ice/ice_txrx.h     | 12 +++--
 drivers/net/ethernet/intel/ice/ice_txrx_lib.c |  3 ++
 drivers/net/ethernet/intel/ice/ice_xsk.c      | 51 ++++++++++++++-----
 7 files changed, 118 insertions(+), 21 deletions(-)

--
2.33.1

Comments

Daniel Borkmann Nov. 24, 2021, 12:52 a.m. UTC | #1
Hi Alexander,

On 11/23/21 5:39 PM, Alexander Lobakin wrote:
[...]

Just commenting on ice here as one example (similar applies to other drivers):

> diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
> index 1dd7e84f41f8..7dc287bc3a1a 100644
> --- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
> +++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
> @@ -258,6 +258,8 @@ static void ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring)
>   		xdp_ring->next_dd = ICE_TX_THRESH - 1;
>   	xdp_ring->next_to_clean = ntc;
>   	ice_update_tx_ring_stats(xdp_ring, total_pkts, total_bytes);
> +	xdp_update_tx_drv_stats(&xdp_ring->xdp_stats->xdp_tx, total_pkts,
> +				total_bytes);
>   }
> 
>   /**
> @@ -277,6 +279,7 @@ int ice_xmit_xdp_ring(void *data, u16 size, struct ice_tx_ring *xdp_ring)
>   		ice_clean_xdp_irq(xdp_ring);
> 
>   	if (!unlikely(ICE_DESC_UNUSED(xdp_ring))) {
> +		xdp_update_tx_drv_full(&xdp_ring->xdp_stats->xdp_tx);
>   		xdp_ring->tx_stats.tx_busy++;
>   		return ICE_XDP_CONSUMED;
>   	}
> diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
> index ff55cb415b11..62ef47a38d93 100644
> --- a/drivers/net/ethernet/intel/ice/ice_xsk.c
> +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
> @@ -454,42 +454,58 @@ ice_construct_skb_zc(struct ice_rx_ring *rx_ring, struct xdp_buff **xdp_arr)
>    * @xdp: xdp_buff used as input to the XDP program
>    * @xdp_prog: XDP program to run
>    * @xdp_ring: ring to be used for XDP_TX action
> + * @lrstats: onstack Rx XDP stats
>    *
>    * Returns any of ICE_XDP_{PASS, CONSUMED, TX, REDIR}
>    */
>   static int
>   ice_run_xdp_zc(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
> -	       struct bpf_prog *xdp_prog, struct ice_tx_ring *xdp_ring)
> +	       struct bpf_prog *xdp_prog, struct ice_tx_ring *xdp_ring,
> +	       struct xdp_rx_drv_stats_local *lrstats)
>   {
>   	int err, result = ICE_XDP_PASS;
>   	u32 act;
> 
> +	lrstats->bytes += xdp->data_end - xdp->data;
> +	lrstats->packets++;
> +
>   	act = bpf_prog_run_xdp(xdp_prog, xdp);
> 
>   	if (likely(act == XDP_REDIRECT)) {
>   		err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
> -		if (err)
> +		if (err) {
> +			lrstats->redirect_errors++;
>   			goto out_failure;
> +		}
> +		lrstats->redirect++;
>   		return ICE_XDP_REDIR;
>   	}
> 
>   	switch (act) {
>   	case XDP_PASS:
> +		lrstats->pass++;
>   		break;
>   	case XDP_TX:
>   		result = ice_xmit_xdp_buff(xdp, xdp_ring);
> -		if (result == ICE_XDP_CONSUMED)
> +		if (result == ICE_XDP_CONSUMED) {
> +			lrstats->tx_errors++;
>   			goto out_failure;
> +		}
> +		lrstats->tx++;
>   		break;
>   	default:
>   		bpf_warn_invalid_xdp_action(act);
> -		fallthrough;
> +		lrstats->invalid++;
> +		goto out_failure;
>   	case XDP_ABORTED:
> +		lrstats->aborted++;
>   out_failure:
>   		trace_xdp_exception(rx_ring->netdev, xdp_prog, act);
> -		fallthrough;
> +		result = ICE_XDP_CONSUMED;
> +		break;
>   	case XDP_DROP:
>   		result = ICE_XDP_CONSUMED;
> +		lrstats->drop++;
>   		break;
>   	}

Imho, the overall approach is way too bloated. I can see the packets/bytes but now we
have 3 counter updates with return codes included and then the additional sync of the
on-stack counters into the ring counters via xdp_update_rx_drv_stats(). So we now need
ice_update_rx_ring_stats() as well as xdp_update_rx_drv_stats() which syncs 10 different
stat counters via u64_stats_add() into the per ring ones. :/

I'm just taking our XDP L4LB in Cilium as an example: there we already count errors and
export them via per-cpu map that eventually lead to XDP_DROP cases including the /reason/
which caused the XDP_DROP (e.g. Prometheus can then scrape these insights from all the
nodes in the cluster). Given the different action codes are very often application specific,
there's not much debugging that you can do when /only/ looking at `ip link xdpstats` to
gather insight on *why* some of these actions were triggered (e.g. fib lookup failure, etc).
If really of interest, then maybe libxdp could have such per-action counters as opt-in in
its call chain..

In the case of ice_run_xdp() today, we already bump total_rx_bytes/total_rx_pkts under
XDP and update ice_update_rx_ring_stats(). I do see the case for XDP_TX and XDP_REDIRECT
where we run into driver-specific errors that are /outside of the reach/ of the BPF prog.
For example, we've been running into errors from XDP_TX in ice_xmit_xdp_ring() in the
past during testing, and were able to pinpoint the location as xdp_ring->tx_stats.tx_busy
was increasing. These things are useful and would make sense to standardize for XDP context.

But then it also seems like above in ice_xmit_xdp_ring() we now need to bump counters
twice just for sake of ethtool vs xdp counters which sucks a bit, would be nice to only
having to do it once:

 >   	if (!unlikely(ICE_DESC_UNUSED(xdp_ring))) {
 > +		xdp_update_tx_drv_full(&xdp_ring->xdp_stats->xdp_tx);
 >   		xdp_ring->tx_stats.tx_busy++;
 >   		return ICE_XDP_CONSUMED;
 >   	}

Anyway, but just to reiterate, for troubleshooting I do care about anomalous events that
led to drops in the driver e.g. due to no space in ring or DMA errors (XDP_TX), or more
detailed insights in xdp_do_redirect() when errors occur (XDP_REDIRECT), very much less
about the action code given the prog has the full error context here already.

One more comment/question on the last doc update patch (I presume you only have dummy
numbers in there from testing?):

+For some interfaces, standard XDP statistics are available.
+It can be accessed the same ways, e.g. `ip`::
+
+  $ ip link xdpstats dev enp178s0
+  16: enp178s0:
+  xdp-channel0-rx_xdp_packets: 0
+  xdp-channel0-rx_xdp_bytes: 1
+  xdp-channel0-rx_xdp_errors: 2

What are the semantics on xdp_errors? Summary of xdp_redirect_errors, xdp_tx_errors and
xdp_xmit_errors? Or driver specific defined?

+  xdp-channel0-rx_xdp_aborted: 3
+  xdp-channel0-rx_xdp_drop: 4
+  xdp-channel0-rx_xdp_invalid: 5
+  xdp-channel0-rx_xdp_pass: 6
[...]

+  xdp-channel0-rx_xdp_redirect: 7
+  xdp-channel0-rx_xdp_redirect_errors: 8
+  xdp-channel0-rx_xdp_tx: 9
+  xdp-channel0-rx_xdp_tx_errors: 10
+  xdp-channel0-tx_xdp_xmit_packets: 11
+  xdp-channel0-tx_xdp_xmit_bytes: 12
+  xdp-channel0-tx_xdp_xmit_errors: 13
+  xdp-channel0-tx_xdp_xmit_full: 14

 From a user PoV to avoid confusion, maybe should be made more clear that the latter refers
to xsk.

> @@ -507,6 +523,7 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget)
>   {
>   	unsigned int total_rx_bytes = 0, total_rx_packets = 0;
>   	u16 cleaned_count = ICE_DESC_UNUSED(rx_ring);
> +	struct xdp_rx_drv_stats_local lrstats = { };
>   	struct ice_tx_ring *xdp_ring;
>   	unsigned int xdp_xmit = 0;
>   	struct bpf_prog *xdp_prog;
> @@ -548,7 +565,8 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget)
>   		xsk_buff_set_size(*xdp, size);
>   		xsk_buff_dma_sync_for_cpu(*xdp, rx_ring->xsk_pool);
> 
> -		xdp_res = ice_run_xdp_zc(rx_ring, *xdp, xdp_prog, xdp_ring);
> +		xdp_res = ice_run_xdp_zc(rx_ring, *xdp, xdp_prog, xdp_ring,
> +					 &lrstats);
>   		if (xdp_res) {
>   			if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR))
>   				xdp_xmit |= xdp_res;
> @@ -598,6 +616,7 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget)
> 
>   	ice_finalize_xdp_rx(xdp_ring, xdp_xmit);
>   	ice_update_rx_ring_stats(rx_ring, total_rx_packets, total_rx_bytes);
> +	xdp_update_rx_drv_stats(&rx_ring->xdp_stats->xsk_rx, &lrstats);
> 
>   	if (xsk_uses_need_wakeup(rx_ring->xsk_pool)) {
>   		if (failure || rx_ring->next_to_clean == rx_ring->next_to_use)
> @@ -629,6 +648,7 @@ static bool ice_xmit_zc(struct ice_tx_ring *xdp_ring, int budget)
>   		struct ice_tx_buf *tx_buf;
> 
>   		if (unlikely(!ICE_DESC_UNUSED(xdp_ring))) {
> +			xdp_update_tx_drv_full(&xdp_ring->xdp_stats->xsk_tx);
>   			xdp_ring->tx_stats.tx_busy++;
>   			work_done = false;
>   			break;
> @@ -686,11 +706,11 @@ ice_clean_xdp_tx_buf(struct ice_tx_ring *xdp_ring, struct ice_tx_buf *tx_buf)
>    */
>   bool ice_clean_tx_irq_zc(struct ice_tx_ring *xdp_ring, int budget)
>   {
> -	int total_packets = 0, total_bytes = 0;
>   	s16 ntc = xdp_ring->next_to_clean;
> +	u32 xdp_frames = 0, xdp_bytes = 0;
> +	u32 xsk_frames = 0, xsk_bytes = 0;
>   	struct ice_tx_desc *tx_desc;
>   	struct ice_tx_buf *tx_buf;
> -	u32 xsk_frames = 0;
>   	bool xmit_done;
> 
>   	tx_desc = ICE_TX_DESC(xdp_ring, ntc);

Thanks,
Daniel
Lorenz Bauer Nov. 24, 2021, 4:34 p.m. UTC | #2
Daniel asked me to share my opinion, as Cloudflare has an XDP load
balancer as well.

On Wed, 24 Nov 2021 at 00:53, Daniel Borkmann <daniel@iogearbox.net> wrote:

> I'm just taking our XDP L4LB in Cilium as an example: there we already count errors and
> export them via per-cpu map that eventually lead to XDP_DROP cases including the /reason/
> which caused the XDP_DROP (e.g. Prometheus can then scrape these insights from all the
> nodes in the cluster). Given the different action codes are very often application specific,
> there's not much debugging that you can do when /only/ looking at `ip link xdpstats` to
> gather insight on *why* some of these actions were triggered (e.g. fib lookup failure, etc).

Agreed. For our purpose we often want to know whether a specific
program has been invoked. Per-channel or per device stats don't help
us much since we have a chain of programs (not using libxdp though).
My colleague Arthur has written xdpcap [1], which gives per-action,
per-program counters. This way we can correlate an action with a
packet and a program.

> If really of interest, then maybe libxdp could have such per-action counters as opt-in in
> its call chain..

We could also make it part of BPF_ENABLE_STATS, it's kind of coarse
grained though.

> In the case of ice_run_xdp() today, we already bump total_rx_bytes/total_rx_pkts under
> XDP and update ice_update_rx_ring_stats(). I do see the case for XDP_TX and XDP_REDIRECT
> where we run into driver-specific errors that are /outside of the reach/ of the BPF prog.
> For example, we've been running into errors from XDP_TX in ice_xmit_xdp_ring() in the
> past during testing, and were able to pinpoint the location as xdp_ring->tx_stats.tx_busy
> was increasing. These things are useful and would make sense to standardize for XDP context.

I'd like to see more tracepoints like trace_xdp_exception, personally.
We can use things like bpftrace for exploration and ebpf_exporter [2]
to generate alerts much more easily than something wired into
iproute2.

Best
Lorenz

1: https://github.com/cloudflare/xdpcap
2: https://github.com/cloudflare/ebpf_exporter
Toke Høiland-Jørgensen Nov. 25, 2021, 11:56 a.m. UTC | #3
Daniel Borkmann <daniel@iogearbox.net> writes:

> Hi Alexander,
>
> On 11/23/21 5:39 PM, Alexander Lobakin wrote:
> [...]
>
> Just commenting on ice here as one example (similar applies to other drivers):
>
>> diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
>> index 1dd7e84f41f8..7dc287bc3a1a 100644
>> --- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
>> +++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
>> @@ -258,6 +258,8 @@ static void ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring)
>>   		xdp_ring->next_dd = ICE_TX_THRESH - 1;
>>   	xdp_ring->next_to_clean = ntc;
>>   	ice_update_tx_ring_stats(xdp_ring, total_pkts, total_bytes);
>> +	xdp_update_tx_drv_stats(&xdp_ring->xdp_stats->xdp_tx, total_pkts,
>> +				total_bytes);
>>   }
>> 
>>   /**
>> @@ -277,6 +279,7 @@ int ice_xmit_xdp_ring(void *data, u16 size, struct ice_tx_ring *xdp_ring)
>>   		ice_clean_xdp_irq(xdp_ring);
>> 
>>   	if (!unlikely(ICE_DESC_UNUSED(xdp_ring))) {
>> +		xdp_update_tx_drv_full(&xdp_ring->xdp_stats->xdp_tx);
>>   		xdp_ring->tx_stats.tx_busy++;
>>   		return ICE_XDP_CONSUMED;
>>   	}
>> diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
>> index ff55cb415b11..62ef47a38d93 100644
>> --- a/drivers/net/ethernet/intel/ice/ice_xsk.c
>> +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
>> @@ -454,42 +454,58 @@ ice_construct_skb_zc(struct ice_rx_ring *rx_ring, struct xdp_buff **xdp_arr)
>>    * @xdp: xdp_buff used as input to the XDP program
>>    * @xdp_prog: XDP program to run
>>    * @xdp_ring: ring to be used for XDP_TX action
>> + * @lrstats: onstack Rx XDP stats
>>    *
>>    * Returns any of ICE_XDP_{PASS, CONSUMED, TX, REDIR}
>>    */
>>   static int
>>   ice_run_xdp_zc(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
>> -	       struct bpf_prog *xdp_prog, struct ice_tx_ring *xdp_ring)
>> +	       struct bpf_prog *xdp_prog, struct ice_tx_ring *xdp_ring,
>> +	       struct xdp_rx_drv_stats_local *lrstats)
>>   {
>>   	int err, result = ICE_XDP_PASS;
>>   	u32 act;
>> 
>> +	lrstats->bytes += xdp->data_end - xdp->data;
>> +	lrstats->packets++;
>> +
>>   	act = bpf_prog_run_xdp(xdp_prog, xdp);
>> 
>>   	if (likely(act == XDP_REDIRECT)) {
>>   		err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
>> -		if (err)
>> +		if (err) {
>> +			lrstats->redirect_errors++;
>>   			goto out_failure;
>> +		}
>> +		lrstats->redirect++;
>>   		return ICE_XDP_REDIR;
>>   	}
>> 
>>   	switch (act) {
>>   	case XDP_PASS:
>> +		lrstats->pass++;
>>   		break;
>>   	case XDP_TX:
>>   		result = ice_xmit_xdp_buff(xdp, xdp_ring);
>> -		if (result == ICE_XDP_CONSUMED)
>> +		if (result == ICE_XDP_CONSUMED) {
>> +			lrstats->tx_errors++;
>>   			goto out_failure;
>> +		}
>> +		lrstats->tx++;
>>   		break;
>>   	default:
>>   		bpf_warn_invalid_xdp_action(act);
>> -		fallthrough;
>> +		lrstats->invalid++;
>> +		goto out_failure;
>>   	case XDP_ABORTED:
>> +		lrstats->aborted++;
>>   out_failure:
>>   		trace_xdp_exception(rx_ring->netdev, xdp_prog, act);
>> -		fallthrough;
>> +		result = ICE_XDP_CONSUMED;
>> +		break;
>>   	case XDP_DROP:
>>   		result = ICE_XDP_CONSUMED;
>> +		lrstats->drop++;
>>   		break;
>>   	}
>
> Imho, the overall approach is way too bloated. I can see the
> packets/bytes but now we have 3 counter updates with return codes
> included and then the additional sync of the on-stack counters into
> the ring counters via xdp_update_rx_drv_stats(). So we now need
> ice_update_rx_ring_stats() as well as xdp_update_rx_drv_stats() which
> syncs 10 different stat counters via u64_stats_add() into the per ring
> ones. :/
>
> I'm just taking our XDP L4LB in Cilium as an example: there we already
> count errors and export them via per-cpu map that eventually lead to
> XDP_DROP cases including the /reason/ which caused the XDP_DROP (e.g.
> Prometheus can then scrape these insights from all the nodes in the
> cluster). Given the different action codes are very often application
> specific, there's not much debugging that you can do when /only/
> looking at `ip link xdpstats` to gather insight on *why* some of these
> actions were triggered (e.g. fib lookup failure, etc). If really of
> interest, then maybe libxdp could have such per-action counters as
> opt-in in its call chain..

To me, standardising these counters is less about helping people debug
their XDP programs (as you say, you can put your own telemetry into
those), and more about making XDP less "mystical" to the system
administrator (who may not be the same person who wrote the XDP
programs). So at the very least, they need to indicate "where are the
packets going", which means at least counters for DROP, REDIRECT and TX
(+ errors for tx/redirect) in addition to the "processed by XDP" initial
counter. Which in the above means 'pass', 'invalid' and 'aborted' could
be dropped, I guess; but I don't mind terribly keeping them either given
that there's no measurable performance impact.

> But then it also seems like above in ice_xmit_xdp_ring() we now need
> to bump counters twice just for sake of ethtool vs xdp counters which
> sucks a bit, would be nice to only having to do it once:

This I agree with, and while I can see the layering argument for putting
them into 'ip' and rtnetlink instead of ethtool, I also worry that these
counters will simply be lost in obscurity, so I do wonder if it wouldn't
be better to accept the "layering violation" and keeping them all in the
'ethtool -S' output?

[...]

> +  xdp-channel0-rx_xdp_redirect: 7
> +  xdp-channel0-rx_xdp_redirect_errors: 8
> +  xdp-channel0-rx_xdp_tx: 9
> +  xdp-channel0-rx_xdp_tx_errors: 10
> +  xdp-channel0-tx_xdp_xmit_packets: 11
> +  xdp-channel0-tx_xdp_xmit_bytes: 12
> +  xdp-channel0-tx_xdp_xmit_errors: 13
> +  xdp-channel0-tx_xdp_xmit_full: 14
>
>  From a user PoV to avoid confusion, maybe should be made more clear that the latter refers
> to xsk.

+1, these should probably be xdp-channel0-tx_xsk_* or something like
that...

-Toke
Alexander Lobakin Nov. 25, 2021, 5:07 p.m. UTC | #4
From: Toke Høiland-Jørgensen <toke@redhat.com>
Date: Thu, 25 Nov 2021 12:56:01 +0100

> Daniel Borkmann <daniel@iogearbox.net> writes:
> 
> > Hi Alexander,
> >
> > On 11/23/21 5:39 PM, Alexander Lobakin wrote:
> > [...]
> >
> > Just commenting on ice here as one example (similar applies to other drivers):
> >
> >> diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
> >> index 1dd7e84f41f8..7dc287bc3a1a 100644
> >> --- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
> >> +++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
> >> @@ -258,6 +258,8 @@ static void ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring)
> >>   		xdp_ring->next_dd = ICE_TX_THRESH - 1;
> >>   	xdp_ring->next_to_clean = ntc;
> >>   	ice_update_tx_ring_stats(xdp_ring, total_pkts, total_bytes);
> >> +	xdp_update_tx_drv_stats(&xdp_ring->xdp_stats->xdp_tx, total_pkts,
> >> +				total_bytes);
> >>   }
> >> 
> >>   /**
> >> @@ -277,6 +279,7 @@ int ice_xmit_xdp_ring(void *data, u16 size, struct ice_tx_ring *xdp_ring)
> >>   		ice_clean_xdp_irq(xdp_ring);
> >> 
> >>   	if (!unlikely(ICE_DESC_UNUSED(xdp_ring))) {
> >> +		xdp_update_tx_drv_full(&xdp_ring->xdp_stats->xdp_tx);
> >>   		xdp_ring->tx_stats.tx_busy++;
> >>   		return ICE_XDP_CONSUMED;
> >>   	}
> >> diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
> >> index ff55cb415b11..62ef47a38d93 100644
> >> --- a/drivers/net/ethernet/intel/ice/ice_xsk.c
> >> +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
> >> @@ -454,42 +454,58 @@ ice_construct_skb_zc(struct ice_rx_ring *rx_ring, struct xdp_buff **xdp_arr)
> >>    * @xdp: xdp_buff used as input to the XDP program
> >>    * @xdp_prog: XDP program to run
> >>    * @xdp_ring: ring to be used for XDP_TX action
> >> + * @lrstats: onstack Rx XDP stats
> >>    *
> >>    * Returns any of ICE_XDP_{PASS, CONSUMED, TX, REDIR}
> >>    */
> >>   static int
> >>   ice_run_xdp_zc(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
> >> -	       struct bpf_prog *xdp_prog, struct ice_tx_ring *xdp_ring)
> >> +	       struct bpf_prog *xdp_prog, struct ice_tx_ring *xdp_ring,
> >> +	       struct xdp_rx_drv_stats_local *lrstats)
> >>   {
> >>   	int err, result = ICE_XDP_PASS;
> >>   	u32 act;
> >> 
> >> +	lrstats->bytes += xdp->data_end - xdp->data;
> >> +	lrstats->packets++;
> >> +
> >>   	act = bpf_prog_run_xdp(xdp_prog, xdp);
> >> 
> >>   	if (likely(act == XDP_REDIRECT)) {
> >>   		err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
> >> -		if (err)
> >> +		if (err) {
> >> +			lrstats->redirect_errors++;
> >>   			goto out_failure;
> >> +		}
> >> +		lrstats->redirect++;
> >>   		return ICE_XDP_REDIR;
> >>   	}
> >> 
> >>   	switch (act) {
> >>   	case XDP_PASS:
> >> +		lrstats->pass++;
> >>   		break;
> >>   	case XDP_TX:
> >>   		result = ice_xmit_xdp_buff(xdp, xdp_ring);
> >> -		if (result == ICE_XDP_CONSUMED)
> >> +		if (result == ICE_XDP_CONSUMED) {
> >> +			lrstats->tx_errors++;
> >>   			goto out_failure;
> >> +		}
> >> +		lrstats->tx++;
> >>   		break;
> >>   	default:
> >>   		bpf_warn_invalid_xdp_action(act);
> >> -		fallthrough;
> >> +		lrstats->invalid++;
> >> +		goto out_failure;
> >>   	case XDP_ABORTED:
> >> +		lrstats->aborted++;
> >>   out_failure:
> >>   		trace_xdp_exception(rx_ring->netdev, xdp_prog, act);
> >> -		fallthrough;
> >> +		result = ICE_XDP_CONSUMED;
> >> +		break;
> >>   	case XDP_DROP:
> >>   		result = ICE_XDP_CONSUMED;
> >> +		lrstats->drop++;
> >>   		break;
> >>   	}
> >
> > Imho, the overall approach is way too bloated. I can see the
> > packets/bytes but now we have 3 counter updates with return codes
> > included and then the additional sync of the on-stack counters into
> > the ring counters via xdp_update_rx_drv_stats(). So we now need
> > ice_update_rx_ring_stats() as well as xdp_update_rx_drv_stats() which
> > syncs 10 different stat counters via u64_stats_add() into the per ring
> > ones. :/
> >
> > I'm just taking our XDP L4LB in Cilium as an example: there we already
> > count errors and export them via per-cpu map that eventually lead to
> > XDP_DROP cases including the /reason/ which caused the XDP_DROP (e.g.
> > Prometheus can then scrape these insights from all the nodes in the
> > cluster). Given the different action codes are very often application
> > specific, there's not much debugging that you can do when /only/
> > looking at `ip link xdpstats` to gather insight on *why* some of these
> > actions were triggered (e.g. fib lookup failure, etc). If really of
> > interest, then maybe libxdp could have such per-action counters as
> > opt-in in its call chain..
> 
> To me, standardising these counters is less about helping people debug
> their XDP programs (as you say, you can put your own telemetry into
> those), and more about making XDP less "mystical" to the system
> administrator (who may not be the same person who wrote the XDP
> programs). So at the very least, they need to indicate "where are the
> packets going", which means at least counters for DROP, REDIRECT and TX
> (+ errors for tx/redirect) in addition to the "processed by XDP" initial
> counter. Which in the above means 'pass', 'invalid' and 'aborted' could
> be dropped, I guess; but I don't mind terribly keeping them either given
> that there's no measurable performance impact.

Right.

The other reason is that I want to continue the effort of
standardizing widely-implemented statistics. Ethtool private stats
approach is neither scalable (you can't rely on any fields which may
be not exposed in other drivers) nor good for code hygiene (code
duplication, differences in naming and logics etc.).
Let's say if only mlx5 driver has 'cache_waive' stats, then it's
okay to export it using private stats, but if 10 drivers has
'xdp_drop' field it's better to uniform it, isn't it?

> > But then it also seems like above in ice_xmit_xdp_ring() we now need
> > to bump counters twice just for sake of ethtool vs xdp counters which
> > sucks a bit, would be nice to only having to do it once:

We'll remove such duplication in the nearest future, as well as some
of duplications between Ethtool private and this XDP stats. I wanted
this series to be as harmless as possible.

> This I agree with, and while I can see the layering argument for putting
> them into 'ip' and rtnetlink instead of ethtool, I also worry that these
> counters will simply be lost in obscurity, so I do wonder if it wouldn't
> be better to accept the "layering violation" and keeping them all in the
> 'ethtool -S' output?

I don't think we should harm the code and the logics in favor of
'some of the users can face something'. We don't control anything
related to XDP using Ethtool at all, but there is some XDP-related
stuff inside iproute2 code, so for me it's even more intuitive to
have them there.
Jakub, may be you'd like to add something at this point?

> [...]
> 
> > +  xdp-channel0-rx_xdp_redirect: 7
> > +  xdp-channel0-rx_xdp_redirect_errors: 8
> > +  xdp-channel0-rx_xdp_tx: 9
> > +  xdp-channel0-rx_xdp_tx_errors: 10
> > +  xdp-channel0-tx_xdp_xmit_packets: 11
> > +  xdp-channel0-tx_xdp_xmit_bytes: 12
> > +  xdp-channel0-tx_xdp_xmit_errors: 13
> > +  xdp-channel0-tx_xdp_xmit_full: 14
> >
> >  From a user PoV to avoid confusion, maybe should be made more clear that the latter refers
> > to xsk.
> 
> +1, these should probably be xdp-channel0-tx_xsk_* or something like
> that...

I think I should expand this example in Docs a bit. For XSK, there's
a separate set of the same counters, and they differ as follows:

xdp-channel0-rx_xdp_packets: 0
xdp-channel0-rx_xdp_bytes: 1
xdp-channel0-rx_xdp_errors: 2
[ ... ]
xsk-channel0-rx_xdp_packets: 256
xsk-channel0-rx_xdp_bytes: 257
xsk-channel0-rx_xdp_errors: 258
[ ... ]

The only semantic difference is that 'tx_xdp_xmit' for XDP is a
counter for the packets gone through .ndo_xdp_xmit(), and in
case of XSK it's a counter for the packets gone through XSK
ZC xmit.

> -Toke

Thanks,
Al
Jakub Kicinski Nov. 25, 2021, 5:44 p.m. UTC | #5
On Thu, 25 Nov 2021 18:07:08 +0100 Alexander Lobakin wrote:
> > This I agree with, and while I can see the layering argument for putting
> > them into 'ip' and rtnetlink instead of ethtool, I also worry that these
> > counters will simply be lost in obscurity, so I do wonder if it wouldn't
> > be better to accept the "layering violation" and keeping them all in the
> > 'ethtool -S' output?  
> 
> I don't think we should harm the code and the logics in favor of
> 'some of the users can face something'. We don't control anything
> related to XDP using Ethtool at all, but there is some XDP-related
> stuff inside iproute2 code, so for me it's even more intuitive to
> have them there.
> Jakub, may be you'd like to add something at this point?

TBH I wasn't following this thread too closely since I saw Daniel
nacked it already. I do prefer rtnl xstats, I'd just report them 
in -s if they are non-zero. But doesn't sound like we have an agreement
whether they should exist or not.

Can we think of an approach which would make cloudflare and cilium
happy? Feels like we're trying to make the slightly hypothetical 
admin happy while ignoring objections of very real users.

> > > +  xdp-channel0-rx_xdp_redirect: 7
> > > +  xdp-channel0-rx_xdp_redirect_errors: 8
> > > +  xdp-channel0-rx_xdp_tx: 9
> > > +  xdp-channel0-rx_xdp_tx_errors: 10
> > > +  xdp-channel0-tx_xdp_xmit_packets: 11
> > > +  xdp-channel0-tx_xdp_xmit_bytes: 12
> > > +  xdp-channel0-tx_xdp_xmit_errors: 13
> > > +  xdp-channel0-tx_xdp_xmit_full: 14

Please leave the per-channel stats out. They make a precedent for
channel stats which should be an attribute of a channel. Working for 
a large XDP user for a couple of years now I can tell you from my own
experience I've not once found them useful. In fact per-queue stats are
a major PITA as they crowd the output.
Alexander Lobakin Nov. 25, 2021, 8:40 p.m. UTC | #6
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 25 Nov 2021 09:44:40 -0800

> On Thu, 25 Nov 2021 18:07:08 +0100 Alexander Lobakin wrote:
> > > This I agree with, and while I can see the layering argument for putting
> > > them into 'ip' and rtnetlink instead of ethtool, I also worry that these
> > > counters will simply be lost in obscurity, so I do wonder if it wouldn't
> > > be better to accept the "layering violation" and keeping them all in the
> > > 'ethtool -S' output?  
> > 
> > I don't think we should harm the code and the logics in favor of
> > 'some of the users can face something'. We don't control anything
> > related to XDP using Ethtool at all, but there is some XDP-related
> > stuff inside iproute2 code, so for me it's even more intuitive to
> > have them there.
> > Jakub, may be you'd like to add something at this point?
> 
> TBH I wasn't following this thread too closely since I saw Daniel
> nacked it already. I do prefer rtnl xstats, I'd just report them 
> in -s if they are non-zero. But doesn't sound like we have an agreement
> whether they should exist or not.

Right, just -s is fine, if we drop the per-channel approach.

> Can we think of an approach which would make cloudflare and cilium
> happy? Feels like we're trying to make the slightly hypothetical 
> admin happy while ignoring objections of very real users.

The initial idea was to only uniform the drivers. But in general
you are right, 10 drivers having something doesn't mean it's
something good.

Maciej, I think you were talking about Cilium asking for those stats
in Intel drivers? Could you maybe provide their exact usecases/needs
so I'll orient myself? I certainly remember about XSK Tx packets and
bytes.
And speaking of XSK Tx, we have per-socket stats, isn't that enough?

> > > > +  xdp-channel0-rx_xdp_redirect: 7
> > > > +  xdp-channel0-rx_xdp_redirect_errors: 8
> > > > +  xdp-channel0-rx_xdp_tx: 9
> > > > +  xdp-channel0-rx_xdp_tx_errors: 10
> > > > +  xdp-channel0-tx_xdp_xmit_packets: 11
> > > > +  xdp-channel0-tx_xdp_xmit_bytes: 12
> > > > +  xdp-channel0-tx_xdp_xmit_errors: 13
> > > > +  xdp-channel0-tx_xdp_xmit_full: 14
> 
> Please leave the per-channel stats out. They make a precedent for
> channel stats which should be an attribute of a channel. Working for 
> a large XDP user for a couple of years now I can tell you from my own
> experience I've not once found them useful. In fact per-queue stats are
> a major PITA as they crowd the output.

Oh okay. My very first iterations were without this, but then I
found most of the drivers expose their XDP stats per-channel. Since
I didn't plan to degrade the functionality, they went that way.

Al
Toke Høiland-Jørgensen Nov. 26, 2021, 12:30 p.m. UTC | #7
Alexander Lobakin <alexandr.lobakin@intel.com> writes:

> From: Jakub Kicinski <kuba@kernel.org>
> Date: Thu, 25 Nov 2021 09:44:40 -0800
>
>> On Thu, 25 Nov 2021 18:07:08 +0100 Alexander Lobakin wrote:
>> > > This I agree with, and while I can see the layering argument for putting
>> > > them into 'ip' and rtnetlink instead of ethtool, I also worry that these
>> > > counters will simply be lost in obscurity, so I do wonder if it wouldn't
>> > > be better to accept the "layering violation" and keeping them all in the
>> > > 'ethtool -S' output?  
>> > 
>> > I don't think we should harm the code and the logics in favor of
>> > 'some of the users can face something'. We don't control anything
>> > related to XDP using Ethtool at all, but there is some XDP-related
>> > stuff inside iproute2 code, so for me it's even more intuitive to
>> > have them there.
>> > Jakub, may be you'd like to add something at this point?
>> 
>> TBH I wasn't following this thread too closely since I saw Daniel
>> nacked it already. I do prefer rtnl xstats, I'd just report them 
>> in -s if they are non-zero. But doesn't sound like we have an agreement
>> whether they should exist or not.
>
> Right, just -s is fine, if we drop the per-channel approach.

I agree that adding them to -s is fine (and that resolves my "no one
will find them" complain as well). If it crowds the output we could also
default to only output'ing a subset, and have the more detailed
statistics hidden behind a verbose switch (or even just in the JSON
output)?

>> Can we think of an approach which would make cloudflare and cilium
>> happy? Feels like we're trying to make the slightly hypothetical 
>> admin happy while ignoring objections of very real users.
>
> The initial idea was to only uniform the drivers. But in general
> you are right, 10 drivers having something doesn't mean it's
> something good.

I don't think it's accurate to call the admin use case "hypothetical".
We're expending a significant effort explaining to people that XDP can
"eat" your packets, and not having any standard statistics makes this
way harder. We should absolutely cater to our "early adopters", but if
we want XDP to see wider adoption, making it "less weird" is critical!

> Maciej, I think you were talking about Cilium asking for those stats
> in Intel drivers? Could you maybe provide their exact usecases/needs
> so I'll orient myself? I certainly remember about XSK Tx packets and
> bytes.
> And speaking of XSK Tx, we have per-socket stats, isn't that enough?

IMO, as long as the packets are accounted for in the regular XDP stats,
having a whole separate set of stats only for XSK is less important.

>> Please leave the per-channel stats out. They make a precedent for
>> channel stats which should be an attribute of a channel. Working for 
>> a large XDP user for a couple of years now I can tell you from my own
>> experience I've not once found them useful. In fact per-queue stats are
>> a major PITA as they crowd the output.
>
> Oh okay. My very first iterations were without this, but then I
> found most of the drivers expose their XDP stats per-channel. Since
> I didn't plan to degrade the functionality, they went that way.

I personally find the per-channel stats quite useful. One of the primary
reasons for not achieving full performance with XDP is broken
configuration of packet steering to CPUs, and having per-channel stats
is a nice way of seeing this. I can see the point about them being way
too verbose in the default output, though, and I do generally filter the
output as well when viewing them. But see my point above about only
printing a subset of the stats by default; per-channel stats could be
JSON-only, for instance?

-Toke
Jakub Kicinski Nov. 26, 2021, 6:06 p.m. UTC | #8
On Fri, 26 Nov 2021 13:30:16 +0100 Toke Høiland-Jørgensen wrote:
> >> TBH I wasn't following this thread too closely since I saw Daniel
> >> nacked it already. I do prefer rtnl xstats, I'd just report them 
> >> in -s if they are non-zero. But doesn't sound like we have an agreement
> >> whether they should exist or not.  
> >
> > Right, just -s is fine, if we drop the per-channel approach.  
> 
> I agree that adding them to -s is fine (and that resolves my "no one
> will find them" complain as well). If it crowds the output we could also
> default to only output'ing a subset, and have the more detailed
> statistics hidden behind a verbose switch (or even just in the JSON
> output)?
> 
> >> Can we think of an approach which would make cloudflare and cilium
> >> happy? Feels like we're trying to make the slightly hypothetical 
> >> admin happy while ignoring objections of very real users.  
> >
> > The initial idea was to only uniform the drivers. But in general
> > you are right, 10 drivers having something doesn't mean it's
> > something good.  
> 
> I don't think it's accurate to call the admin use case "hypothetical".
> We're expending a significant effort explaining to people that XDP can
> "eat" your packets, and not having any standard statistics makes this
> way harder. We should absolutely cater to our "early adopters", but if
> we want XDP to see wider adoption, making it "less weird" is critical!

Fair. In all honesty I said that hoping to push for a more flexible
approach hidden entirely in BPF, and not involving driver changes.
Assuming the XDP program has more fine grained stats we should be able
to extract those instead of double-counting. Hence my vague "let's work
with apps" comment.

For example to a person familiar with the workload it'd be useful to
know if program returned XDP_DROP because of configured policy or
failure to parse a packet. I don't think that sort distinction is
achievable at the level of standard stats.

The information required by the admin is higher level. As you say the
primary concern there is "how many packets did XDP eat".

Speaking of which, one thing that badly needs clarification is our
expectation around XDP packets getting counted towards the interface
stats.

> > Maciej, I think you were talking about Cilium asking for those stats
> > in Intel drivers? Could you maybe provide their exact usecases/needs
> > so I'll orient myself? I certainly remember about XSK Tx packets and
> > bytes.
> > And speaking of XSK Tx, we have per-socket stats, isn't that enough?  
> 
> IMO, as long as the packets are accounted for in the regular XDP stats,
> having a whole separate set of stats only for XSK is less important.
> 
> >> Please leave the per-channel stats out. They make a precedent for
> >> channel stats which should be an attribute of a channel. Working for 
> >> a large XDP user for a couple of years now I can tell you from my own
> >> experience I've not once found them useful. In fact per-queue stats are
> >> a major PITA as they crowd the output.  
> >
> > Oh okay. My very first iterations were without this, but then I
> > found most of the drivers expose their XDP stats per-channel. Since
> > I didn't plan to degrade the functionality, they went that way.  
> 
> I personally find the per-channel stats quite useful. One of the primary
> reasons for not achieving full performance with XDP is broken
> configuration of packet steering to CPUs, and having per-channel stats
> is a nice way of seeing this.

Right, that's about the only thing I use it for as well. "Is the load
evenly distributed?"  But that's not XDP specific and not worth
standardizing for, yet, IMO, because..

> I can see the point about them being way too verbose in the default
> output, though, and I do generally filter the output as well when
> viewing them. But see my point above about only printing a subset of
> the stats by default; per-channel stats could be JSON-only, for
> instance?

we don't even know what constitutes a channel today. And that will
become increasingly problematic as importance of application specific
queues increases (zctap etc). IMO until the ontological gaps around
queues are filled we should leave per-queue stats in ethtool -S.
Toke Høiland-Jørgensen Nov. 26, 2021, 6:47 p.m. UTC | #9
Jakub Kicinski <kuba@kernel.org> writes:

> On Fri, 26 Nov 2021 13:30:16 +0100 Toke Høiland-Jørgensen wrote:
>> >> TBH I wasn't following this thread too closely since I saw Daniel
>> >> nacked it already. I do prefer rtnl xstats, I'd just report them 
>> >> in -s if they are non-zero. But doesn't sound like we have an agreement
>> >> whether they should exist or not.  
>> >
>> > Right, just -s is fine, if we drop the per-channel approach.  
>> 
>> I agree that adding them to -s is fine (and that resolves my "no one
>> will find them" complain as well). If it crowds the output we could also
>> default to only output'ing a subset, and have the more detailed
>> statistics hidden behind a verbose switch (or even just in the JSON
>> output)?
>> 
>> >> Can we think of an approach which would make cloudflare and cilium
>> >> happy? Feels like we're trying to make the slightly hypothetical 
>> >> admin happy while ignoring objections of very real users.  
>> >
>> > The initial idea was to only uniform the drivers. But in general
>> > you are right, 10 drivers having something doesn't mean it's
>> > something good.  
>> 
>> I don't think it's accurate to call the admin use case "hypothetical".
>> We're expending a significant effort explaining to people that XDP can
>> "eat" your packets, and not having any standard statistics makes this
>> way harder. We should absolutely cater to our "early adopters", but if
>> we want XDP to see wider adoption, making it "less weird" is critical!
>
> Fair. In all honesty I said that hoping to push for a more flexible
> approach hidden entirely in BPF, and not involving driver changes.
> Assuming the XDP program has more fine grained stats we should be able
> to extract those instead of double-counting. Hence my vague "let's work
> with apps" comment.
>
> For example to a person familiar with the workload it'd be useful to
> know if program returned XDP_DROP because of configured policy or
> failure to parse a packet. I don't think that sort distinction is
> achievable at the level of standard stats.
>
> The information required by the admin is higher level. As you say the
> primary concern there is "how many packets did XDP eat".

Right, sure, I am also totally fine with having only a somewhat
restricted subset of stats available at the interface level and make
everything else be BPF-based. I'm hoping we can converge of a common
understanding of what this "minimal set" should be :)

> Speaking of which, one thing that badly needs clarification is our
> expectation around XDP packets getting counted towards the interface
> stats.

Agreed. My immediate thought is that "XDP packets are interface packets"
but that is certainly not what we do today, so not sure if changing it
at this point would break things?

>> > Maciej, I think you were talking about Cilium asking for those stats
>> > in Intel drivers? Could you maybe provide their exact usecases/needs
>> > so I'll orient myself? I certainly remember about XSK Tx packets and
>> > bytes.
>> > And speaking of XSK Tx, we have per-socket stats, isn't that enough?  
>> 
>> IMO, as long as the packets are accounted for in the regular XDP stats,
>> having a whole separate set of stats only for XSK is less important.
>> 
>> >> Please leave the per-channel stats out. They make a precedent for
>> >> channel stats which should be an attribute of a channel. Working for 
>> >> a large XDP user for a couple of years now I can tell you from my own
>> >> experience I've not once found them useful. In fact per-queue stats are
>> >> a major PITA as they crowd the output.  
>> >
>> > Oh okay. My very first iterations were without this, but then I
>> > found most of the drivers expose their XDP stats per-channel. Since
>> > I didn't plan to degrade the functionality, they went that way.  
>> 
>> I personally find the per-channel stats quite useful. One of the primary
>> reasons for not achieving full performance with XDP is broken
>> configuration of packet steering to CPUs, and having per-channel stats
>> is a nice way of seeing this.
>
> Right, that's about the only thing I use it for as well. "Is the load
> evenly distributed?"  But that's not XDP specific and not worth
> standardizing for, yet, IMO, because..
>
>> I can see the point about them being way too verbose in the default
>> output, though, and I do generally filter the output as well when
>> viewing them. But see my point above about only printing a subset of
>> the stats by default; per-channel stats could be JSON-only, for
>> instance?
>
> we don't even know what constitutes a channel today. And that will
> become increasingly problematic as importance of application specific
> queues increases (zctap etc). IMO until the ontological gaps around
> queues are filled we should leave per-queue stats in ethtool -S.

Hmm, right, I see. I suppose that as long as the XDP packets show up in
one of the interface counters in ethtool -S, it's possible to answer the
load distribution issue, and any further debugging (say, XDP drops on a
certain queue due to CPU-based queue indexing on TX) can be delegated to
BPF-based tools...

-Toke
Jakub Kicinski Nov. 26, 2021, 7:14 p.m. UTC | #10
On Fri, 26 Nov 2021 19:47:17 +0100 Toke Høiland-Jørgensen wrote:
> > Fair. In all honesty I said that hoping to push for a more flexible
> > approach hidden entirely in BPF, and not involving driver changes.
> > Assuming the XDP program has more fine grained stats we should be able
> > to extract those instead of double-counting. Hence my vague "let's work
> > with apps" comment.
> >
> > For example to a person familiar with the workload it'd be useful to
> > know if program returned XDP_DROP because of configured policy or
> > failure to parse a packet. I don't think that sort distinction is
> > achievable at the level of standard stats.
> >
> > The information required by the admin is higher level. As you say the
> > primary concern there is "how many packets did XDP eat".  
> 
> Right, sure, I am also totally fine with having only a somewhat
> restricted subset of stats available at the interface level and make
> everything else be BPF-based. I'm hoping we can converge of a common
> understanding of what this "minimal set" should be :)
> 
> > Speaking of which, one thing that badly needs clarification is our
> > expectation around XDP packets getting counted towards the interface
> > stats.  
> 
> Agreed. My immediate thought is that "XDP packets are interface packets"
> but that is certainly not what we do today, so not sure if changing it
> at this point would break things?

I'd vote for taking the risk and trying to align all the drivers.
Daniel Borkmann Nov. 26, 2021, 10:27 p.m. UTC | #11
On 11/26/21 7:06 PM, Jakub Kicinski wrote:
> On Fri, 26 Nov 2021 13:30:16 +0100 Toke Høiland-Jørgensen wrote:
>>>> TBH I wasn't following this thread too closely since I saw Daniel
>>>> nacked it already. I do prefer rtnl xstats, I'd just report them
>>>> in -s if they are non-zero. But doesn't sound like we have an agreement
>>>> whether they should exist or not.
>>>
>>> Right, just -s is fine, if we drop the per-channel approach.
>>
>> I agree that adding them to -s is fine (and that resolves my "no one
>> will find them" complain as well). If it crowds the output we could also
>> default to only output'ing a subset, and have the more detailed
>> statistics hidden behind a verbose switch (or even just in the JSON
>> output)?
>>
>>>> Can we think of an approach which would make cloudflare and cilium
>>>> happy? Feels like we're trying to make the slightly hypothetical
>>>> admin happy while ignoring objections of very real users.
>>>
>>> The initial idea was to only uniform the drivers. But in general
>>> you are right, 10 drivers having something doesn't mean it's
>>> something good.
>>
>> I don't think it's accurate to call the admin use case "hypothetical".
>> We're expending a significant effort explaining to people that XDP can
>> "eat" your packets, and not having any standard statistics makes this
>> way harder. We should absolutely cater to our "early adopters", but if
>> we want XDP to see wider adoption, making it "less weird" is critical!
> 
> Fair. In all honesty I said that hoping to push for a more flexible
> approach hidden entirely in BPF, and not involving driver changes.
> Assuming the XDP program has more fine grained stats we should be able
> to extract those instead of double-counting. Hence my vague "let's work
> with apps" comment.
> 
> For example to a person familiar with the workload it'd be useful to
> know if program returned XDP_DROP because of configured policy or
> failure to parse a packet. I don't think that sort distinction is
> achievable at the level of standard stats.

Agree on the additional context. How often have you looked at tc clsact
/dropped/ stats specifically when you debug a more complex BPF program
there?

   # tc -s qdisc show clsact dev foo
   qdisc clsact ffff: parent ffff:fff1
    Sent 6800 bytes 120 pkt (dropped 0, overlimits 0 requeues 0)
    backlog 0b 0p requeues 0

Similarly, XDP_PASS counters may be of limited use as well for same reason
(and I think we might not even have a tc counter equivalent for it).

> The information required by the admin is higher level. As you say the
> primary concern there is "how many packets did XDP eat".

Agree. Above said, for XDP_DROP I would see one use case where you compare
different drivers or bond vs no bond as we did in the past in [0] when
testing against a packet generator (although I don't see bond driver covered
in this series here yet where it aggregates the XDP stats from all bond slave
devs).

On a higher-level wrt "how many packets did XDP eat", it would make sense
to have the stats for successful XDP_{TX,REDIRECT} given these are out
of reach from a BPF prog PoV - we can only count there how many times we
returned with XDP_TX but not whether the pkt /successfully made it/.

In terms of error cases, could we just standardize all drivers on the behavior
of e.g. mlx5e_xdp_handle(), meaning, a failure from XDP_{TX,REDIRECT} will
hit the trace_xdp_exception() and then fallthrough to bump a drop counter
(same as we bump in XDP_DROP then). So the drop counter will account for
program drops but also driver-related drops.

At some later point the trace_xdp_exception() could be extended with an error
code that the driver would propagate (given some of them look quite similar
across drivers, fwiw), and then whoever wants to do further processing with
them can do so via bpftrace or other tooling.

So overall wrt this series: from the lrstats we'd be /dropping/ the pass,
tx_errors, redirect_errors, invalid, aborted counters. And we'd be /keeping/
bytes & packets counters that XDP sees, (driver-)successful tx & redirect
counters as well as drop counter. Also, XDP bytes & packets counters should
not be counted twice wrt ethtool stats.

   [0] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9e2ee5c7e7c35d195e2aa0692a7241d47a433d1e

Thanks,
Daniel
Daniel Borkmann Nov. 26, 2021, 11:01 p.m. UTC | #12
On 11/26/21 11:27 PM, Daniel Borkmann wrote:
> On 11/26/21 7:06 PM, Jakub Kicinski wrote:
[...]
>> The information required by the admin is higher level. As you say the
>> primary concern there is "how many packets did XDP eat".
> 
> Agree. Above said, for XDP_DROP I would see one use case where you compare
> different drivers or bond vs no bond as we did in the past in [0] when
> testing against a packet generator (although I don't see bond driver covered
> in this series here yet where it aggregates the XDP stats from all bond slave
> devs).
> 
> On a higher-level wrt "how many packets did XDP eat", it would make sense
> to have the stats for successful XDP_{TX,REDIRECT} given these are out
> of reach from a BPF prog PoV - we can only count there how many times we
> returned with XDP_TX but not whether the pkt /successfully made it/.
> 
> In terms of error cases, could we just standardize all drivers on the behavior
> of e.g. mlx5e_xdp_handle(), meaning, a failure from XDP_{TX,REDIRECT} will
> hit the trace_xdp_exception() and then fallthrough to bump a drop counter
> (same as we bump in XDP_DROP then). So the drop counter will account for
> program drops but also driver-related drops.
> 
> At some later point the trace_xdp_exception() could be extended with an error
> code that the driver would propagate (given some of them look quite similar
> across drivers, fwiw), and then whoever wants to do further processing with
> them can do so via bpftrace or other tooling.

Just thinking out loud, one straight forward example we could start out with
that is also related to Paolo's series [1] ...

enum xdp_error {
	XDP_UNKNOWN,
	XDP_ACTION_INVALID,
	XDP_ACTION_UNSUPPORTED,
};

... and then bpf_warn_invalid_xdp_action() returns one of the latter two
which we pass to trace_xdp_exception(). Later there could be XDP_DRIVER_*
cases e.g. propagated from XDP_TX error exceptions.

         [...]
         default:
                 err = bpf_warn_invalid_xdp_action(act);
                 fallthrough;
         case XDP_ABORTED:
xdp_abort:
                 trace_xdp_exception(rq->netdev, prog, act, err);
                 fallthrough;
         case XDP_DROP:
                 lrstats->xdp_drop++;
                 break;
         }
         [...]

   [1] https://lore.kernel.org/netdev/cover.1637924200.git.pabeni@redhat.com/

> So overall wrt this series: from the lrstats we'd be /dropping/ the pass,
> tx_errors, redirect_errors, invalid, aborted counters. And we'd be /keeping/
> bytes & packets counters that XDP sees, (driver-)successful tx & redirect
> counters as well as drop counter. Also, XDP bytes & packets counters should
> not be counted twice wrt ethtool stats.
> 
>    [0] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9e2ee5c7e7c35d195e2aa0692a7241d47a433d1e
Ido Schimmel Nov. 28, 2021, 5:54 p.m. UTC | #13
+Petr, Nik

On Fri, Nov 26, 2021 at 11:14:31AM -0800, Jakub Kicinski wrote:
> On Fri, 26 Nov 2021 19:47:17 +0100 Toke Høiland-Jørgensen wrote:
> > > Fair. In all honesty I said that hoping to push for a more flexible
> > > approach hidden entirely in BPF, and not involving driver changes.
> > > Assuming the XDP program has more fine grained stats we should be able
> > > to extract those instead of double-counting. Hence my vague "let's work
> > > with apps" comment.
> > >
> > > For example to a person familiar with the workload it'd be useful to
> > > know if program returned XDP_DROP because of configured policy or
> > > failure to parse a packet. I don't think that sort distinction is
> > > achievable at the level of standard stats.
> > >
> > > The information required by the admin is higher level. As you say the
> > > primary concern there is "how many packets did XDP eat".  
> > 
> > Right, sure, I am also totally fine with having only a somewhat
> > restricted subset of stats available at the interface level and make
> > everything else be BPF-based. I'm hoping we can converge of a common
> > understanding of what this "minimal set" should be :)
> > 
> > > Speaking of which, one thing that badly needs clarification is our
> > > expectation around XDP packets getting counted towards the interface
> > > stats.  
> > 
> > Agreed. My immediate thought is that "XDP packets are interface packets"
> > but that is certainly not what we do today, so not sure if changing it
> > at this point would break things?
> 
> I'd vote for taking the risk and trying to align all the drivers.

I agree. I think IFLA_STATS64 in RTM_NEWLINK should contain statistics
of all the packets seen by the netdev. The breakdown into software /
hardware / XDP should be reported via RTM_NEWSTATS.

Currently, for soft devices such as VLANs, bridges and GRE, user space
only sees statistics of packets forwarded by software, which is quite
useless when forwarding is offloaded from the kernel to hardware.

Petr is working on exposing hardware statistics for such devices via
rtnetlink. Unlike XDP (?), we need to be able to let user space enable /
disable hardware statistics as we have a limited number of hardware
counters and they can also reduce the bandwidth when enabled. We are
thinking of adding a new RTM_SETSTATS for that:

# ip stats set dev swp1 hw_stats on

For query, something like (under discussion):

# ip stats show dev swp1 // all groups
# ip stats show dev swp1 group link
# ip stats show dev swp1 group offload // all sub-groups
# ip stats show dev swp1 group offload sub-group cpu
# ip stats show dev swp1 group offload sub-group hw

Like other iproute2 commands, these follow the nesting of the
RTM_{NEW,GET}STATS uAPI.

Looking at patch #1 [1], I think that whatever you decide to expose for
XDP can be queried via:

# ip stats show dev swp1 group xdp
# ip stats show dev swp1 group xdp sub-group regular
# ip stats show dev swp1 group xdp sub-group xsk

Regardless, the following command should show statistics of all the
packets seen by the netdev:

# ip -s link show dev swp1

There is a PR [2] for node_exporter to use rtnetlink to fetch netdev
statistics instead of the old proc interface. It should be possible to
extend it to use RTM_*STATS for more fine-grained statistics.

[1] https://lore.kernel.org/netdev/20211123163955.154512-2-alexandr.lobakin@intel.com/
[2] https://github.com/prometheus/node_exporter/pull/2074
Toke Høiland-Jørgensen Nov. 29, 2021, 11:51 a.m. UTC | #14
Daniel Borkmann <daniel@iogearbox.net> writes:

> On 11/26/21 7:06 PM, Jakub Kicinski wrote:
>> On Fri, 26 Nov 2021 13:30:16 +0100 Toke Høiland-Jørgensen wrote:
>>>>> TBH I wasn't following this thread too closely since I saw Daniel
>>>>> nacked it already. I do prefer rtnl xstats, I'd just report them
>>>>> in -s if they are non-zero. But doesn't sound like we have an agreement
>>>>> whether they should exist or not.
>>>>
>>>> Right, just -s is fine, if we drop the per-channel approach.
>>>
>>> I agree that adding them to -s is fine (and that resolves my "no one
>>> will find them" complain as well). If it crowds the output we could also
>>> default to only output'ing a subset, and have the more detailed
>>> statistics hidden behind a verbose switch (or even just in the JSON
>>> output)?
>>>
>>>>> Can we think of an approach which would make cloudflare and cilium
>>>>> happy? Feels like we're trying to make the slightly hypothetical
>>>>> admin happy while ignoring objections of very real users.
>>>>
>>>> The initial idea was to only uniform the drivers. But in general
>>>> you are right, 10 drivers having something doesn't mean it's
>>>> something good.
>>>
>>> I don't think it's accurate to call the admin use case "hypothetical".
>>> We're expending a significant effort explaining to people that XDP can
>>> "eat" your packets, and not having any standard statistics makes this
>>> way harder. We should absolutely cater to our "early adopters", but if
>>> we want XDP to see wider adoption, making it "less weird" is critical!
>> 
>> Fair. In all honesty I said that hoping to push for a more flexible
>> approach hidden entirely in BPF, and not involving driver changes.
>> Assuming the XDP program has more fine grained stats we should be able
>> to extract those instead of double-counting. Hence my vague "let's work
>> with apps" comment.
>> 
>> For example to a person familiar with the workload it'd be useful to
>> know if program returned XDP_DROP because of configured policy or
>> failure to parse a packet. I don't think that sort distinction is
>> achievable at the level of standard stats.
>
> Agree on the additional context. How often have you looked at tc clsact
> /dropped/ stats specifically when you debug a more complex BPF program
> there?
>
>    # tc -s qdisc show clsact dev foo
>    qdisc clsact ffff: parent ffff:fff1
>     Sent 6800 bytes 120 pkt (dropped 0, overlimits 0 requeues 0)
>     backlog 0b 0p requeues 0
>
> Similarly, XDP_PASS counters may be of limited use as well for same reason
> (and I think we might not even have a tc counter equivalent for it).
>
>> The information required by the admin is higher level. As you say the
>> primary concern there is "how many packets did XDP eat".
>
> Agree. Above said, for XDP_DROP I would see one use case where you compare
> different drivers or bond vs no bond as we did in the past in [0] when
> testing against a packet generator (although I don't see bond driver covered
> in this series here yet where it aggregates the XDP stats from all bond slave
> devs).
>
> On a higher-level wrt "how many packets did XDP eat", it would make sense
> to have the stats for successful XDP_{TX,REDIRECT} given these are out
> of reach from a BPF prog PoV - we can only count there how many times we
> returned with XDP_TX but not whether the pkt /successfully made it/.
>
> In terms of error cases, could we just standardize all drivers on the behavior
> of e.g. mlx5e_xdp_handle(), meaning, a failure from XDP_{TX,REDIRECT} will
> hit the trace_xdp_exception() and then fallthrough to bump a drop counter
> (same as we bump in XDP_DROP then). So the drop counter will account for
> program drops but also driver-related drops.
>
> At some later point the trace_xdp_exception() could be extended with an error
> code that the driver would propagate (given some of them look quite similar
> across drivers, fwiw), and then whoever wants to do further processing with
> them can do so via bpftrace or other tooling.
>
> So overall wrt this series: from the lrstats we'd be /dropping/ the pass,
> tx_errors, redirect_errors, invalid, aborted counters. And we'd be /keeping/
> bytes & packets counters that XDP sees, (driver-)successful tx & redirect
> counters as well as drop counter. Also, XDP bytes & packets counters should
> not be counted twice wrt ethtool stats.

This sounds reasonable to me, and I also like the error code to
tracepoint idea :)

-Toke
Jesper Dangaard Brouer Nov. 29, 2021, 1:59 p.m. UTC | #15
On 27/11/2021 00.01, Daniel Borkmann wrote:
> On 11/26/21 11:27 PM, Daniel Borkmann wrote:
>> On 11/26/21 7:06 PM, Jakub Kicinski wrote:
> [...]
>>> The information required by the admin is higher level. As you say the
>>> primary concern there is "how many packets did XDP eat".
>>
>> Agree. Above said, for XDP_DROP I would see one use case where you 
>> compare
>> different drivers or bond vs no bond as we did in the past in [0] when
>> testing against a packet generator (although I don't see bond driver 
>> covered
>> in this series here yet where it aggregates the XDP stats from all 
>> bond slave devs).
>>
>> On a higher-level wrt "how many packets did XDP eat", it would make sense
>> to have the stats for successful XDP_{TX,REDIRECT} given these are out
>> of reach from a BPF prog PoV - we can only count there how many times we
>> returned with XDP_TX but not whether the pkt /successfully made it/.
>>

Exactly.

>> In terms of error cases, could we just standardize all drivers on the 
>> behavior
>> of e.g. mlx5e_xdp_handle(), meaning, a failure from XDP_{TX,REDIRECT} will
>> hit the trace_xdp_exception() and then fallthrough to bump a drop counter
>> (same as we bump in XDP_DROP then). So the drop counter will account for
>> program drops but also driver-related drops.
>>

Hmm... I don't agree here.  IMHO the BPF-program's *choice* to drop (via 
XDP_DROP) should NOT share the counter with the driver-related drops.

The driver-related drops must be accounted separate.

For the record, I think mlx5e_xdp_handle() does the wrong thing, of 
accounting everything as XDP_DROP in (rq->stats->xdp_drop++).

Current mlx5 driver stats are highly problematic actually.
Please don't model stats behavior after this driver.

E.g. if BPF-prog takes the *choice* XDP_TX or XDP_REDIRECT or XDP_DROP, 
then the packet is invisible to "ifconfig" stats.  It is like the driver 
never received these packets (which is wrong IMHO). (The stats are only 
avail via ethtool -S).


>> At some later point the trace_xdp_exception() could be extended with 
>> an error
>> code that the driver would propagate (given some of them look quite 
>> similar
>> across drivers, fwiw), and then whoever wants to do further processing 
>> with them can do so via bpftrace or other tooling.

I do like trace_xdp_exception() is invoked in mlx5e_xdp_handle(), but do 
notice that xdp_do_redirect() also have a tracepoint that can be used 
for troubleshooting. (I usually use xdp_monitor for troubleshooting 
which catch both).

I like the stats XDP handling better in mvneta_run_xdp().

> Just thinking out loud, one straight forward example we could start out 
> with that is also related to Paolo's series [1] ...
> 
> enum xdp_error {
>      XDP_UNKNOWN,
>      XDP_ACTION_INVALID,
>      XDP_ACTION_UNSUPPORTED,
> };
> 
> ... and then bpf_warn_invalid_xdp_action() returns one of the latter two
> which we pass to trace_xdp_exception(). Later there could be XDP_DRIVER_*
> cases e.g. propagated from XDP_TX error exceptions.
> 
>          [...]
>          default:
>                  err = bpf_warn_invalid_xdp_action(act);
>                  fallthrough;
>          case XDP_ABORTED:
> xdp_abort:
>                  trace_xdp_exception(rq->netdev, prog, act, err);
>                  fallthrough;
>          case XDP_DROP:
>                  lrstats->xdp_drop++;
>                  break;
>          }
>          [...]
> 
>    [1] 
> https://lore.kernel.org/netdev/cover.1637924200.git.pabeni@redhat.com/
> 
>> So overall wrt this series: from the lrstats we'd be /dropping/ the pass,
>> tx_errors, redirect_errors, invalid, aborted counters. And we'd be 
>> /keeping/
>> bytes & packets counters that XDP sees, (driver-)successful tx & redirect
>> counters as well as drop counter. Also, XDP bytes & packets counters 
>> should
>> not be counted twice wrt ethtool stats.
>>
>>    [0] 
>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9e2ee5c7e7c35d195e2aa0692a7241d47a433d1e 
>>

Concrete example with mlx5:

For most other hardware (than mlx5) I experience that XDP_TX creates a 
push-back on NIC RX-handing speed.  Thus, the XDP_TX stats recorded by 
BPF-prog is usually correct.

With mlx5 hardware (tested on ConnectX-5 Ex MT28800) the RX 
packets-per-sec (pps) stats can easily be faster than actually XDP_TX 
transmitted frames.

$ sudo ./xdp_rxq_info --dev mlx5p1 --action XDP_TX
  [...]
  Running XDP on dev:mlx5p1 (ifindex:10) action:XDP_TX options:swapmac
  XDP stats       CPU     pps         issue-pps
  XDP-RX CPU      1       13,922,430  0
  XDP-RX CPU      total   13,922,430

  RXQ stats       RXQ:CPU pps         issue-pps
  rx_queue_index    1:1   13,922,430  0
  rx_queue_index    1:sum 13,922,430

The real xmit speed is (from below ethtool_stats.pl) is
  9,391,314 pps <= rx1_xdp_tx_xmit /sec

The dropped packets are double accounted as:
  4,552,033 <= rx_xdp_drop /sec
  4,552,033 <= rx_xdp_tx_full /sec


Show adapter(s) (mlx5p1) statistics (ONLY that changed!)
Ethtool(mlx5p1  ) stat:       217865 (        217,865) <= ch1_poll /sec
Ethtool(mlx5p1  ) stat:       217864 (        217,864) <= ch_poll /sec
Ethtool(mlx5p1  ) stat:     13943371 (     13,943,371) <= 
rx1_cache_reuse /sec
Ethtool(mlx5p1  ) stat:      4552033 (      4,552,033) <= rx1_xdp_drop /sec
Ethtool(mlx5p1  ) stat:       146740 (        146,740) <= 
rx1_xdp_tx_cqes /sec
Ethtool(mlx5p1  ) stat:      4552033 (      4,552,033) <= 
rx1_xdp_tx_full /sec
Ethtool(mlx5p1  ) stat:      9391314 (      9,391,314) <= 
rx1_xdp_tx_inlnw /sec
Ethtool(mlx5p1  ) stat:       880436 (        880,436) <= 
rx1_xdp_tx_mpwqe /sec
Ethtool(mlx5p1  ) stat:       997833 (        997,833) <= 
rx1_xdp_tx_nops /sec
Ethtool(mlx5p1  ) stat:      9391314 (      9,391,314) <= 
rx1_xdp_tx_xmit /sec
Ethtool(mlx5p1  ) stat:     45095173 (     45,095,173) <= 
rx_64_bytes_phy /sec
Ethtool(mlx5p1  ) stat:   2886090490 (  2,886,090,490) <= rx_bytes_phy /sec
Ethtool(mlx5p1  ) stat:     13943293 (     13,943,293) <= rx_cache_reuse 
/sec
Ethtool(mlx5p1  ) stat:     31151957 (     31,151,957) <= 
rx_out_of_buffer /sec
Ethtool(mlx5p1  ) stat:     45095158 (     45,095,158) <= rx_packets_phy 
/sec
Ethtool(mlx5p1  ) stat:   2886072350 (  2,886,072,350) <= rx_prio0_bytes 
/sec
Ethtool(mlx5p1  ) stat:     45094878 (     45,094,878) <= 
rx_prio0_packets /sec
Ethtool(mlx5p1  ) stat:   2705707938 (  2,705,707,938) <= 
rx_vport_unicast_bytes /sec
Ethtool(mlx5p1  ) stat:     45095129 (     45,095,129) <= 
rx_vport_unicast_packets /sec
Ethtool(mlx5p1  ) stat:      4552033 (      4,552,033) <= rx_xdp_drop /sec
Ethtool(mlx5p1  ) stat:       146739 (        146,739) <= rx_xdp_tx_cqe /sec
Ethtool(mlx5p1  ) stat:      4552033 (      4,552,033) <= rx_xdp_tx_full 
/sec
Ethtool(mlx5p1  ) stat:      9391319 (      9,391,319) <= 
rx_xdp_tx_inlnw /sec
Ethtool(mlx5p1  ) stat:       880436 (        880,436) <= 
rx_xdp_tx_mpwqe /sec
Ethtool(mlx5p1  ) stat:       997831 (        997,831) <= rx_xdp_tx_nops 
/sec
Ethtool(mlx5p1  ) stat:      9391319 (      9,391,319) <= rx_xdp_tx_xmit 
/sec
Ethtool(mlx5p1  ) stat:    601044221 (    601,044,221) <= tx_bytes_phy /sec
Ethtool(mlx5p1  ) stat:      9391316 (      9,391,316) <= tx_packets_phy 
/sec
Ethtool(mlx5p1  ) stat:    601040871 (    601,040,871) <= tx_prio0_bytes 
/sec
Ethtool(mlx5p1  ) stat:      9391264 (      9,391,264) <= 
tx_prio0_packets /sec
Ethtool(mlx5p1  ) stat:    563478483 (    563,478,483) <= 
tx_vport_unicast_bytes /sec
Ethtool(mlx5p1  ) stat:      9391316 (      9,391,316) <= 
tx_vport_unicast_packets /sec



[1] 
https://github.com/netoptimizer/network-testing/blob/master/bin/ethtool_stats.pl


The net_devices stats says the NIC is processing zero packets:

  $ sar -n DEV 2 1000
  [...]
  Average:        IFACE   rxpck/s   txpck/s    rxkB/s    txkB/s 
rxcmp/s   txcmp/s  rxmcst/s   %ifutil
  [...]
  Average:       mlx5p1      0,00      0,00      0,00      0,00 
0,00      0,00      0,00      0,00
  Average:       mlx5p2      0,00      0,00      0,00      0,00 
0,00      0,00      0,00      0,00
Jakub Kicinski Nov. 29, 2021, 2:47 p.m. UTC | #16
On Sun, 28 Nov 2021 19:54:53 +0200 Ido Schimmel wrote:
> > > Right, sure, I am also totally fine with having only a somewhat
> > > restricted subset of stats available at the interface level and make
> > > everything else be BPF-based. I'm hoping we can converge of a common
> > > understanding of what this "minimal set" should be :)
> > > 
> > > Agreed. My immediate thought is that "XDP packets are interface packets"
> > > but that is certainly not what we do today, so not sure if changing it
> > > at this point would break things?  
> > 
> > I'd vote for taking the risk and trying to align all the drivers.  
> 
> I agree. I think IFLA_STATS64 in RTM_NEWLINK should contain statistics
> of all the packets seen by the netdev. The breakdown into software /
> hardware / XDP should be reported via RTM_NEWSTATS.

Hm, in the offload case "seen by the netdev" may be unclear. For 
the offload case I believe our recommendation was phrased more like 
"all packets which would be seen by the netdev if there was no
routing/tc offload", right?

> Currently, for soft devices such as VLANs, bridges and GRE, user space
> only sees statistics of packets forwarded by software, which is quite
> useless when forwarding is offloaded from the kernel to hardware.
> 
> Petr is working on exposing hardware statistics for such devices via
> rtnetlink. Unlike XDP (?), we need to be able to let user space enable /
> disable hardware statistics as we have a limited number of hardware
> counters and they can also reduce the bandwidth when enabled. We are
> thinking of adding a new RTM_SETSTATS for that:
> 
> # ip stats set dev swp1 hw_stats on

Does it belong on the switch port? Not the netdev we want to track?

> For query, something like (under discussion):
> 
> # ip stats show dev swp1 // all groups
> # ip stats show dev swp1 group link
> # ip stats show dev swp1 group offload // all sub-groups
> # ip stats show dev swp1 group offload sub-group cpu
> # ip stats show dev swp1 group offload sub-group hw
> 
> Like other iproute2 commands, these follow the nesting of the
> RTM_{NEW,GET}STATS uAPI.

But we do have IFLA_STATS_LINK_OFFLOAD_XSTATS, isn't it effectively 
the same use case?

> Looking at patch #1 [1], I think that whatever you decide to expose for
> XDP can be queried via:
> 
> # ip stats show dev swp1 group xdp
> # ip stats show dev swp1 group xdp sub-group regular
> # ip stats show dev swp1 group xdp sub-group xsk
> 
> Regardless, the following command should show statistics of all the
> packets seen by the netdev:
> 
> # ip -s link show dev swp1
> 
> There is a PR [2] for node_exporter to use rtnetlink to fetch netdev
> statistics instead of the old proc interface. It should be possible to
> extend it to use RTM_*STATS for more fine-grained statistics.
> 
> [1] https://lore.kernel.org/netdev/20211123163955.154512-2-alexandr.lobakin@intel.com/
> [2] https://github.com/prometheus/node_exporter/pull/2074

Nice!
Jakub Kicinski Nov. 29, 2021, 3:03 p.m. UTC | #17
On Mon, 29 Nov 2021 14:59:53 +0100 Jesper Dangaard Brouer wrote:
> Hmm... I don't agree here.  IMHO the BPF-program's *choice* to drop (via 
> XDP_DROP) should NOT share the counter with the driver-related drops.
> 
> The driver-related drops must be accounted separate.

+1 FWIW. The Tx stat is a little misleading because it differs from the
definition of our other tx stats which mean _successfully_ transmitted
(and are accounted on the completion path in many drivers).

In the past I've used act_*, e.g. act_tx, to indicate the stat counts
returned actions, not whether the packet made it.

I still wonder whether it makes sense to count the stats per-action or
just have one "XDP consumed it" stat and that's it. The semantics of the
action are not of interest to the admin. A firewall can drop or tx
depending if it wants to send an ICMP reject or TCP RST message in
response. I need to know what the application does to understand the
difference, and if I do I can as well look at app stats. But I'm aware
I'm not going to find much support for this position, so just saying...
;)
Petr Machata Nov. 29, 2021, 3:51 p.m. UTC | #18
Jakub Kicinski <kuba@kernel.org> writes:

> On Sun, 28 Nov 2021 19:54:53 +0200 Ido Schimmel wrote:
>> > > Right, sure, I am also totally fine with having only a somewhat
>> > > restricted subset of stats available at the interface level and make
>> > > everything else be BPF-based. I'm hoping we can converge of a common
>> > > understanding of what this "minimal set" should be :)
>> > > 
>> > > Agreed. My immediate thought is that "XDP packets are interface packets"
>> > > but that is certainly not what we do today, so not sure if changing it
>> > > at this point would break things?  
>> > 
>> > I'd vote for taking the risk and trying to align all the drivers.  
>> 
>> I agree. I think IFLA_STATS64 in RTM_NEWLINK should contain statistics
>> of all the packets seen by the netdev. The breakdown into software /
>> hardware / XDP should be reported via RTM_NEWSTATS.
>
> Hm, in the offload case "seen by the netdev" may be unclear. For 
> the offload case I believe our recommendation was phrased more like 
> "all packets which would be seen by the netdev if there was no
> routing/tc offload", right?

Yes. The idea is to expose to Linux stats about traffic at conceptually
corresponding objects in the HW.

>
>> Currently, for soft devices such as VLANs, bridges and GRE, user space
>> only sees statistics of packets forwarded by software, which is quite
>> useless when forwarding is offloaded from the kernel to hardware.
>> 
>> Petr is working on exposing hardware statistics for such devices via
>> rtnetlink. Unlike XDP (?), we need to be able to let user space enable /
>> disable hardware statistics as we have a limited number of hardware
>> counters and they can also reduce the bandwidth when enabled. We are
>> thinking of adding a new RTM_SETSTATS for that:
>> 
>> # ip stats set dev swp1 hw_stats on
>
> Does it belong on the switch port? Not the netdev we want to track?

Yes, it does, and is designed that way. That was just muscle memory
typing that "swp1" above :)

You would do e.g. "ip stats set dev swp1.200 hw_stats on" or, "dev br1",
or something like that.

>> For query, something like (under discussion):
>> 
>> # ip stats show dev swp1 // all groups
>> # ip stats show dev swp1 group link
>> # ip stats show dev swp1 group offload // all sub-groups
>> # ip stats show dev swp1 group offload sub-group cpu
>> # ip stats show dev swp1 group offload sub-group hw
>> 
>> Like other iproute2 commands, these follow the nesting of the
>> RTM_{NEW,GET}STATS uAPI.
>
> But we do have IFLA_STATS_LINK_OFFLOAD_XSTATS, isn't it effectively 
> the same use case?

IFLA_STATS_LINK_OFFLOAD_XSTATS is a nest. Currently it carries just
CPU_HIT stats. The idea is to carry HW stats as well in that group.

>> Looking at patch #1 [1], I think that whatever you decide to expose for
>> XDP can be queried via:
>> 
>> # ip stats show dev swp1 group xdp
>> # ip stats show dev swp1 group xdp sub-group regular
>> # ip stats show dev swp1 group xdp sub-group xsk
>> 
>> Regardless, the following command should show statistics of all the
>> packets seen by the netdev:
>> 
>> # ip -s link show dev swp1
>> 
>> There is a PR [2] for node_exporter to use rtnetlink to fetch netdev
>> statistics instead of the old proc interface. It should be possible to
>> extend it to use RTM_*STATS for more fine-grained statistics.
>> 
>> [1] https://lore.kernel.org/netdev/20211123163955.154512-2-alexandr.lobakin@intel.com/
>> [2] https://github.com/prometheus/node_exporter/pull/2074
>
> Nice!
Petr Machata Nov. 29, 2021, 3:54 p.m. UTC | #19
Petr Machata <petrm@nvidia.com> writes:

> Jakub Kicinski <kuba@kernel.org> writes:
>
>> On Sun, 28 Nov 2021 19:54:53 +0200 Ido Schimmel wrote:
>>> # ip stats set dev swp1 hw_stats on
>>
>> Does it belong on the switch port? Not the netdev we want to track?
>
> Yes, it does, and is designed that way. That was just muscle memory
> typing that "swp1" above :)

And by "yes, it does", I obviously meant "no, it doesn't". It does
belong to the device that you want counters for.

> You would do e.g. "ip stats set dev swp1.200 hw_stats on" or, "dev br1",
> or something like that.
Jakub Kicinski Nov. 29, 2021, 4:05 p.m. UTC | #20
On Mon, 29 Nov 2021 16:51:02 +0100 Petr Machata wrote:
> Jakub Kicinski <kuba@kernel.org> writes:
> > On Sun, 28 Nov 2021 19:54:53 +0200 Ido Schimmel wrote:  
> >> I agree. I think IFLA_STATS64 in RTM_NEWLINK should contain statistics
> >> of all the packets seen by the netdev. The breakdown into software /
> >> hardware / XDP should be reported via RTM_NEWSTATS.  
> >
> > Hm, in the offload case "seen by the netdev" may be unclear. For 
> > the offload case I believe our recommendation was phrased more like 
> > "all packets which would be seen by the netdev if there was no
> > routing/tc offload", right?  
> 
> Yes. The idea is to expose to Linux stats about traffic at conceptually
> corresponding objects in the HW.

Great.

> >> Currently, for soft devices such as VLANs, bridges and GRE, user space
> >> only sees statistics of packets forwarded by software, which is quite
> >> useless when forwarding is offloaded from the kernel to hardware.
> >> 
> >> Petr is working on exposing hardware statistics for such devices via
> >> rtnetlink. Unlike XDP (?), we need to be able to let user space enable /
> >> disable hardware statistics as we have a limited number of hardware
> >> counters and they can also reduce the bandwidth when enabled. We are
> >> thinking of adding a new RTM_SETSTATS for that:
> >> 
> >> # ip stats set dev swp1 hw_stats on  
> >
> > Does it belong on the switch port? Not the netdev we want to track?  
> 
> Yes, it does, and is designed that way. That was just muscle memory
> typing that "swp1" above :)
> 
> You would do e.g. "ip stats set dev swp1.200 hw_stats on" or, "dev br1",
> or something like that.

I see :)

> >> For query, something like (under discussion):
> >> 
> >> # ip stats show dev swp1 // all groups
> >> # ip stats show dev swp1 group link
> >> # ip stats show dev swp1 group offload // all sub-groups
> >> # ip stats show dev swp1 group offload sub-group cpu
> >> # ip stats show dev swp1 group offload sub-group hw
> >> 
> >> Like other iproute2 commands, these follow the nesting of the
> >> RTM_{NEW,GET}STATS uAPI.  
> >
> > But we do have IFLA_STATS_LINK_OFFLOAD_XSTATS, isn't it effectively 
> > the same use case?  
> 
> IFLA_STATS_LINK_OFFLOAD_XSTATS is a nest. Currently it carries just
> CPU_HIT stats. The idea is to carry HW stats as well in that group.

Hm, the expectation was that the HW stats == total - SW. I believe that
still holds true for you, even if HW stats are not "complete" (e.g.
user enabled them after device was already forwarding for a while).
Is the concern about backward compat or such?
Petr Machata Nov. 29, 2021, 5:08 p.m. UTC | #21
Jakub Kicinski <kuba@kernel.org> writes:

> On Mon, 29 Nov 2021 16:51:02 +0100 Petr Machata wrote:
>> Jakub Kicinski <kuba@kernel.org> writes:
>> > On Sun, 28 Nov 2021 19:54:53 +0200 Ido Schimmel wrote:  
>> >> For query, something like (under discussion):
>> >> 
>> >> # ip stats show dev swp1 // all groups
>> >> # ip stats show dev swp1 group link
>> >> # ip stats show dev swp1 group offload // all sub-groups
>> >> # ip stats show dev swp1 group offload sub-group cpu
>> >> # ip stats show dev swp1 group offload sub-group hw
>> >> 
>> >> Like other iproute2 commands, these follow the nesting of the
>> >> RTM_{NEW,GET}STATS uAPI.  
>> >
>> > But we do have IFLA_STATS_LINK_OFFLOAD_XSTATS, isn't it effectively 
>> > the same use case?  
>> 
>> IFLA_STATS_LINK_OFFLOAD_XSTATS is a nest. Currently it carries just
>> CPU_HIT stats. The idea is to carry HW stats as well in that group.
>
> Hm, the expectation was that the HW stats == total - SW. I believe that
> still holds true for you, even if HW stats are not "complete" (e.g.
> user enabled them after device was already forwarding for a while).
> Is the concern about backward compat or such?

I guess you could call it backward compat. But not only. I think a
typical user doing "ip -s l sh", including various scripts, wants to see
the full picture and not worry what's going on where. Physical
netdevices already do that, and by extension bond and team of physical
netdevices. It also makes sense from the point of view of an offloaded
datapath as an implementation detail that you would ideally not notice.

For those who care to know about the offloaded datapath, it would be
nice to have the option to request either just the SW stats, or just the
HW stats. A logical place to put these would be under the OFFLOAD_XSTATS
nest of the RTM_GETSTATS message, but maybe the SW ones should be up
there next to IFLA_STATS_LINK_64. (After all it's going to be
independent from not only offload datapath, but also XDP.)

This way you get the intuitive default behavior, but still have a way to
e.g. request just the SW stats without hitting the HW, or just request
the HW stats if that's what you care about.
Jakub Kicinski Nov. 29, 2021, 5:17 p.m. UTC | #22
On Mon, 29 Nov 2021 18:08:12 +0100 Petr Machata wrote:
> Jakub Kicinski <kuba@kernel.org> writes:
> > On Mon, 29 Nov 2021 16:51:02 +0100 Petr Machata wrote:  
> >> IFLA_STATS_LINK_OFFLOAD_XSTATS is a nest. Currently it carries just
> >> CPU_HIT stats. The idea is to carry HW stats as well in that group.  
> >
> > Hm, the expectation was that the HW stats == total - SW. I believe that
> > still holds true for you, even if HW stats are not "complete" (e.g.
> > user enabled them after device was already forwarding for a while).
> > Is the concern about backward compat or such?  
> 
> I guess you could call it backward compat. But not only. I think a
> typical user doing "ip -s l sh", including various scripts, wants to see
> the full picture and not worry what's going on where. Physical
> netdevices already do that, and by extension bond and team of physical
> netdevices. It also makes sense from the point of view of an offloaded
> datapath as an implementation detail that you would ideally not notice.

Agreed.

> For those who care to know about the offloaded datapath, it would be
> nice to have the option to request either just the SW stats, or just the
> HW stats. A logical place to put these would be under the OFFLOAD_XSTATS
> nest of the RTM_GETSTATS message, but maybe the SW ones should be up
> there next to IFLA_STATS_LINK_64. (After all it's going to be
> independent from not only offload datapath, but also XDP.)

What I'm getting at is that I thought IFLA_OFFLOAD_XSTATS_CPU_HIT
should be sufficient from uAPI perspective in terms of reporting.
User space can do the simple math to calculate the "SW stats" if 
it wants to. We may well be talking about the same thing, so maybe
let's wait for the code?

> This way you get the intuitive default behavior, but still have a way to
> e.g. request just the SW stats without hitting the HW, or just request
> the HW stats if that's what you care about.
Petr Machata Nov. 30, 2021, 11:55 a.m. UTC | #23
Jakub Kicinski <kuba@kernel.org> writes:

> On Mon, 29 Nov 2021 18:08:12 +0100 Petr Machata wrote:
>> For those who care to know about the offloaded datapath, it would be
>> nice to have the option to request either just the SW stats, or just the
>> HW stats. A logical place to put these would be under the OFFLOAD_XSTATS
>> nest of the RTM_GETSTATS message, but maybe the SW ones should be up
>> there next to IFLA_STATS_LINK_64. (After all it's going to be
>> independent from not only offload datapath, but also XDP.)
>
> What I'm getting at is that I thought IFLA_OFFLOAD_XSTATS_CPU_HIT
> should be sufficient from uAPI perspective in terms of reporting.
> User space can do the simple math to calculate the "SW stats" if 
> it wants to. We may well be talking about the same thing, so maybe
> let's wait for the code?

Ha, OK, now I understand. Yeah, CPU_HIT actually does fit the bill for
the traffic that took place in SW. We can reuse it.

I still think it would be better to report HW_STATS explicitly as well
though. One reason is simply convenience. The other is that OK, now we
have SW stats, and XDP stats, and total stats, and I (as a client) don't
necessarily know how it all fits together. But the contract for HW_STATS
is very clear.
Jakub Kicinski Nov. 30, 2021, 3:07 p.m. UTC | #24
On Tue, 30 Nov 2021 12:55:47 +0100 Petr Machata wrote:
> I still think it would be better to report HW_STATS explicitly as well
> though. One reason is simply convenience. The other is that OK, now we
> have SW stats, and XDP stats, and total stats, and I (as a client) don't
> necessarily know how it all fits together. But the contract for HW_STATS
> is very clear.

Would be good to check with Jiri, my recollection is that this argument
was brought up when CPU_HIT stats were added. I don't recall the
reasoning.

<insert xkcd standards>
diff mbox series

Patch

diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
index b67ad51cbcc9..6cef8b4e887f 100644
--- a/drivers/net/ethernet/intel/ice/ice.h
+++ b/drivers/net/ethernet/intel/ice/ice.h
@@ -387,8 +387,10 @@  struct ice_vsi {
 	struct ice_tc_cfg tc_cfg;
 	struct bpf_prog *xdp_prog;
 	struct ice_tx_ring **xdp_rings;	 /* XDP ring array */
+	struct xdp_drv_stats *xdp_stats; /* XDP stats array */
 	unsigned long *af_xdp_zc_qps;	 /* tracks AF_XDP ZC enabled qps */
 	u16 num_xdp_txq;		 /* Used XDP queues */
+	u16 alloc_xdp_stats;		 /* Length of xdp_stats array */
 	u8 xdp_mapping_mode;		 /* ICE_MAP_MODE_[CONTIG|SCATTER] */

 	struct net_device **target_netdevs;
diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c
index 40562600a8cf..934152216df5 100644
--- a/drivers/net/ethernet/intel/ice/ice_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_lib.c
@@ -73,6 +73,7 @@  static int ice_vsi_alloc_arrays(struct ice_vsi *vsi)
 {
 	struct ice_pf *pf = vsi->back;
 	struct device *dev;
+	u32 i;

 	dev = ice_pf_to_dev(pf);
 	if (vsi->type == ICE_VSI_CHNL)
@@ -115,8 +116,23 @@  static int ice_vsi_alloc_arrays(struct ice_vsi *vsi)
 	if (!vsi->af_xdp_zc_qps)
 		goto err_zc_qps;

+	vsi->alloc_xdp_stats = max_t(u16, vsi->alloc_rxq, num_possible_cpus());
+
+	vsi->xdp_stats = kcalloc(vsi->alloc_xdp_stats, sizeof(*vsi->xdp_stats),
+				 GFP_KERNEL);
+	if (!vsi->xdp_stats)
+		goto err_xdp_stats;
+
+	for (i = 0; i < vsi->alloc_xdp_stats; i++)
+		xdp_init_drv_stats(vsi->xdp_stats + i);
+
 	return 0;

+err_xdp_stats:
+	vsi->alloc_xdp_stats = 0;
+
+	bitmap_free(vsi->af_xdp_zc_qps);
+	vsi->af_xdp_zc_qps = NULL;
 err_zc_qps:
 	devm_kfree(dev, vsi->q_vectors);
 err_vectors:
@@ -317,6 +333,10 @@  static void ice_vsi_free_arrays(struct ice_vsi *vsi)

 	dev = ice_pf_to_dev(pf);

+	kfree(vsi->xdp_stats);
+	vsi->xdp_stats = NULL;
+	vsi->alloc_xdp_stats = 0;
+
 	if (vsi->af_xdp_zc_qps) {
 		bitmap_free(vsi->af_xdp_zc_qps);
 		vsi->af_xdp_zc_qps = NULL;
@@ -1422,6 +1442,7 @@  static int ice_vsi_alloc_rings(struct ice_vsi *vsi)
 		ring->netdev = vsi->netdev;
 		ring->dev = dev;
 		ring->count = vsi->num_rx_desc;
+		ring->xdp_stats = vsi->xdp_stats + i;
 		WRITE_ONCE(vsi->rx_rings[i], ring);
 	}

diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index f2a5f2f965d1..94d0bf440a49 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -2481,6 +2481,7 @@  static int ice_xdp_alloc_setup_rings(struct ice_vsi *vsi)
 		xdp_ring->next_rs = ICE_TX_THRESH - 1;
 		xdp_ring->dev = dev;
 		xdp_ring->count = vsi->num_tx_desc;
+		xdp_ring->xdp_stats = vsi->xdp_stats + i;
 		WRITE_ONCE(vsi->xdp_rings[i], xdp_ring);
 		if (ice_setup_tx_ring(xdp_ring))
 			goto free_xdp_rings;
@@ -2837,6 +2838,19 @@  static int ice_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 	}
 }

+static int ice_get_xdp_stats_nch(const struct net_device *dev, u32 attr_id)
+{
+	const struct ice_netdev_priv *np = netdev_priv(dev);
+
+	switch (attr_id) {
+	case IFLA_XDP_XSTATS_TYPE_XDP:
+	case IFLA_XDP_XSTATS_TYPE_XSK:
+		return np->vsi->alloc_xdp_stats;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
 /**
  * ice_ena_misc_vector - enable the non-queue interrupts
  * @pf: board private structure
@@ -3280,6 +3294,7 @@  static int ice_cfg_netdev(struct ice_vsi *vsi)
 	ice_set_netdev_features(netdev);

 	ice_set_ops(netdev);
+	netdev->xstats = vsi->xdp_stats;

 	if (vsi->type == ICE_VSI_PF) {
 		SET_NETDEV_DEV(netdev, ice_pf_to_dev(vsi->back));
@@ -8608,4 +8623,6 @@  static const struct net_device_ops ice_netdev_ops = {
 	.ndo_bpf = ice_xdp,
 	.ndo_xdp_xmit = ice_xdp_xmit,
 	.ndo_xsk_wakeup = ice_xsk_wakeup,
+	.ndo_get_xdp_stats_nch = ice_get_xdp_stats_nch,
+	.ndo_get_xdp_stats = xdp_get_drv_stats_generic,
 };
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
index bc3ba19dc88f..d32d6f2975b5 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -532,19 +532,25 @@  ice_rx_frame_truesize(struct ice_rx_ring *rx_ring, unsigned int __maybe_unused s
  * @xdp: xdp_buff used as input to the XDP program
  * @xdp_prog: XDP program to run
  * @xdp_ring: ring to be used for XDP_TX action
+ * @lrstats: onstack Rx XDP stats
  *
  * Returns any of ICE_XDP_{PASS, CONSUMED, TX, REDIR}
  */
 static int
 ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
-	    struct bpf_prog *xdp_prog, struct ice_tx_ring *xdp_ring)
+	    struct bpf_prog *xdp_prog, struct ice_tx_ring *xdp_ring,
+	    struct xdp_rx_drv_stats_local *lrstats)
 {
 	int err;
 	u32 act;

+	lrstats->bytes += xdp->data_end - xdp->data;
+	lrstats->packets++;
+
 	act = bpf_prog_run_xdp(xdp_prog, xdp);
 	switch (act) {
 	case XDP_PASS:
+		lrstats->pass++;
 		return ICE_XDP_PASS;
 	case XDP_TX:
 		if (static_branch_unlikely(&ice_xdp_locking_key))
@@ -552,22 +558,31 @@  ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
 		err = ice_xmit_xdp_ring(xdp->data, xdp->data_end - xdp->data, xdp_ring);
 		if (static_branch_unlikely(&ice_xdp_locking_key))
 			spin_unlock(&xdp_ring->tx_lock);
-		if (err == ICE_XDP_CONSUMED)
+		if (err == ICE_XDP_CONSUMED) {
+			lrstats->tx_errors++;
 			goto out_failure;
+		}
+		lrstats->tx++;
 		return err;
 	case XDP_REDIRECT:
 		err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
-		if (err)
+		if (err) {
+			lrstats->redirect_errors++;
 			goto out_failure;
+		}
+		lrstats->redirect++;
 		return ICE_XDP_REDIR;
 	default:
 		bpf_warn_invalid_xdp_action(act);
-		fallthrough;
+		lrstats->invalid++;
+		goto out_failure;
 	case XDP_ABORTED:
+		lrstats->aborted++;
 out_failure:
 		trace_xdp_exception(rx_ring->netdev, xdp_prog, act);
-		fallthrough;
+		return ICE_XDP_CONSUMED;
 	case XDP_DROP:
+		lrstats->drop++;
 		return ICE_XDP_CONSUMED;
 	}
 }
@@ -627,6 +642,9 @@  ice_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
 	if (static_branch_unlikely(&ice_xdp_locking_key))
 		spin_unlock(&xdp_ring->tx_lock);

+	if (unlikely(nxmit < n))
+		xdp_update_tx_drv_err(&xdp_ring->xdp_stats->xdp_tx, n - nxmit);
+
 	return nxmit;
 }

@@ -1089,6 +1107,7 @@  int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
 {
 	unsigned int total_rx_bytes = 0, total_rx_pkts = 0, frame_sz = 0;
 	u16 cleaned_count = ICE_DESC_UNUSED(rx_ring);
+	struct xdp_rx_drv_stats_local lrstats = { };
 	unsigned int offset = rx_ring->rx_offset;
 	struct ice_tx_ring *xdp_ring = NULL;
 	unsigned int xdp_res, xdp_xmit = 0;
@@ -1173,7 +1192,8 @@  int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
 		if (!xdp_prog)
 			goto construct_skb;

-		xdp_res = ice_run_xdp(rx_ring, &xdp, xdp_prog, xdp_ring);
+		xdp_res = ice_run_xdp(rx_ring, &xdp, xdp_prog, xdp_ring,
+				      &lrstats);
 		if (!xdp_res)
 			goto construct_skb;
 		if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR)) {
@@ -1254,6 +1274,7 @@  int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
 	rx_ring->skb = skb;

 	ice_update_rx_ring_stats(rx_ring, total_rx_pkts, total_rx_bytes);
+	xdp_update_rx_drv_stats(&rx_ring->xdp_stats->xdp_rx, &lrstats);

 	/* guarantee a trip back through this routine if there was a failure */
 	return failure ? budget : (int)total_rx_pkts;
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h
index c56dd1749903..c54be60c3479 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -284,9 +284,9 @@  struct ice_rx_ring {
 	struct ice_rxq_stats rx_stats;
 	struct ice_q_stats	stats;
 	struct u64_stats_sync syncp;
+	struct xdp_drv_stats *xdp_stats;

-	struct rcu_head rcu;		/* to avoid race on free */
-	/* CL4 - 3rd cacheline starts here */
+	/* CL4 - 4rd cacheline starts here */
 	struct ice_channel *ch;
 	struct bpf_prog *xdp_prog;
 	struct ice_tx_ring *xdp_ring;
@@ -298,6 +298,9 @@  struct ice_rx_ring {
 	u8 dcb_tc;			/* Traffic class of ring */
 	u8 ptp_rx;
 	u8 flags;
+
+	/* CL5 - 5th cacheline starts here */
+	struct rcu_head rcu;		/* to avoid race on free */
 } ____cacheline_internodealigned_in_smp;

 struct ice_tx_ring {
@@ -324,13 +327,16 @@  struct ice_tx_ring {
 	/* stats structs */
 	struct ice_q_stats	stats;
 	struct u64_stats_sync syncp;
-	struct ice_txq_stats tx_stats;
+	struct xdp_drv_stats *xdp_stats;

 	/* CL3 - 3rd cacheline starts here */
+	struct ice_txq_stats tx_stats;
 	struct rcu_head rcu;		/* to avoid race on free */
 	DECLARE_BITMAP(xps_state, ICE_TX_NBITS);	/* XPS Config State */
 	struct ice_channel *ch;
 	struct ice_ptp_tx *tx_tstamps;
+
+	/* CL4 - 4th cacheline starts here */
 	spinlock_t tx_lock;
 	u32 txq_teid;			/* Added Tx queue TEID */
 #define ICE_TX_FLAGS_RING_XDP		BIT(0)
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
index 1dd7e84f41f8..7dc287bc3a1a 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
@@ -258,6 +258,8 @@  static void ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring)
 		xdp_ring->next_dd = ICE_TX_THRESH - 1;
 	xdp_ring->next_to_clean = ntc;
 	ice_update_tx_ring_stats(xdp_ring, total_pkts, total_bytes);
+	xdp_update_tx_drv_stats(&xdp_ring->xdp_stats->xdp_tx, total_pkts,
+				total_bytes);
 }

 /**
@@ -277,6 +279,7 @@  int ice_xmit_xdp_ring(void *data, u16 size, struct ice_tx_ring *xdp_ring)
 		ice_clean_xdp_irq(xdp_ring);

 	if (!unlikely(ICE_DESC_UNUSED(xdp_ring))) {
+		xdp_update_tx_drv_full(&xdp_ring->xdp_stats->xdp_tx);
 		xdp_ring->tx_stats.tx_busy++;
 		return ICE_XDP_CONSUMED;
 	}
diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
index ff55cb415b11..62ef47a38d93 100644
--- a/drivers/net/ethernet/intel/ice/ice_xsk.c
+++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
@@ -454,42 +454,58 @@  ice_construct_skb_zc(struct ice_rx_ring *rx_ring, struct xdp_buff **xdp_arr)
  * @xdp: xdp_buff used as input to the XDP program
  * @xdp_prog: XDP program to run
  * @xdp_ring: ring to be used for XDP_TX action
+ * @lrstats: onstack Rx XDP stats
  *
  * Returns any of ICE_XDP_{PASS, CONSUMED, TX, REDIR}
  */
 static int
 ice_run_xdp_zc(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
-	       struct bpf_prog *xdp_prog, struct ice_tx_ring *xdp_ring)
+	       struct bpf_prog *xdp_prog, struct ice_tx_ring *xdp_ring,
+	       struct xdp_rx_drv_stats_local *lrstats)
 {
 	int err, result = ICE_XDP_PASS;
 	u32 act;

+	lrstats->bytes += xdp->data_end - xdp->data;
+	lrstats->packets++;
+
 	act = bpf_prog_run_xdp(xdp_prog, xdp);

 	if (likely(act == XDP_REDIRECT)) {
 		err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
-		if (err)
+		if (err) {
+			lrstats->redirect_errors++;
 			goto out_failure;
+		}
+		lrstats->redirect++;
 		return ICE_XDP_REDIR;
 	}

 	switch (act) {
 	case XDP_PASS:
+		lrstats->pass++;
 		break;
 	case XDP_TX:
 		result = ice_xmit_xdp_buff(xdp, xdp_ring);
-		if (result == ICE_XDP_CONSUMED)
+		if (result == ICE_XDP_CONSUMED) {
+			lrstats->tx_errors++;
 			goto out_failure;
+		}
+		lrstats->tx++;
 		break;
 	default:
 		bpf_warn_invalid_xdp_action(act);
-		fallthrough;
+		lrstats->invalid++;
+		goto out_failure;
 	case XDP_ABORTED:
+		lrstats->aborted++;
 out_failure:
 		trace_xdp_exception(rx_ring->netdev, xdp_prog, act);
-		fallthrough;
+		result = ICE_XDP_CONSUMED;
+		break;
 	case XDP_DROP:
 		result = ICE_XDP_CONSUMED;
+		lrstats->drop++;
 		break;
 	}

@@ -507,6 +523,7 @@  int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget)
 {
 	unsigned int total_rx_bytes = 0, total_rx_packets = 0;
 	u16 cleaned_count = ICE_DESC_UNUSED(rx_ring);
+	struct xdp_rx_drv_stats_local lrstats = { };
 	struct ice_tx_ring *xdp_ring;
 	unsigned int xdp_xmit = 0;
 	struct bpf_prog *xdp_prog;
@@ -548,7 +565,8 @@  int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget)
 		xsk_buff_set_size(*xdp, size);
 		xsk_buff_dma_sync_for_cpu(*xdp, rx_ring->xsk_pool);

-		xdp_res = ice_run_xdp_zc(rx_ring, *xdp, xdp_prog, xdp_ring);
+		xdp_res = ice_run_xdp_zc(rx_ring, *xdp, xdp_prog, xdp_ring,
+					 &lrstats);
 		if (xdp_res) {
 			if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR))
 				xdp_xmit |= xdp_res;
@@ -598,6 +616,7 @@  int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget)

 	ice_finalize_xdp_rx(xdp_ring, xdp_xmit);
 	ice_update_rx_ring_stats(rx_ring, total_rx_packets, total_rx_bytes);
+	xdp_update_rx_drv_stats(&rx_ring->xdp_stats->xsk_rx, &lrstats);

 	if (xsk_uses_need_wakeup(rx_ring->xsk_pool)) {
 		if (failure || rx_ring->next_to_clean == rx_ring->next_to_use)
@@ -629,6 +648,7 @@  static bool ice_xmit_zc(struct ice_tx_ring *xdp_ring, int budget)
 		struct ice_tx_buf *tx_buf;

 		if (unlikely(!ICE_DESC_UNUSED(xdp_ring))) {
+			xdp_update_tx_drv_full(&xdp_ring->xdp_stats->xsk_tx);
 			xdp_ring->tx_stats.tx_busy++;
 			work_done = false;
 			break;
@@ -686,11 +706,11 @@  ice_clean_xdp_tx_buf(struct ice_tx_ring *xdp_ring, struct ice_tx_buf *tx_buf)
  */
 bool ice_clean_tx_irq_zc(struct ice_tx_ring *xdp_ring, int budget)
 {
-	int total_packets = 0, total_bytes = 0;
 	s16 ntc = xdp_ring->next_to_clean;
+	u32 xdp_frames = 0, xdp_bytes = 0;
+	u32 xsk_frames = 0, xsk_bytes = 0;
 	struct ice_tx_desc *tx_desc;
 	struct ice_tx_buf *tx_buf;
-	u32 xsk_frames = 0;
 	bool xmit_done;

 	tx_desc = ICE_TX_DESC(xdp_ring, ntc);
@@ -702,13 +722,14 @@  bool ice_clean_tx_irq_zc(struct ice_tx_ring *xdp_ring, int budget)
 		      cpu_to_le64(ICE_TX_DESC_DTYPE_DESC_DONE)))
 			break;

-		total_bytes += tx_buf->bytecount;
-		total_packets++;
-
 		if (tx_buf->raw_buf) {
 			ice_clean_xdp_tx_buf(xdp_ring, tx_buf);
 			tx_buf->raw_buf = NULL;
+
+			xdp_bytes += tx_buf->bytecount;
+			xdp_frames++;
 		} else {
+			xsk_bytes += tx_buf->bytecount;
 			xsk_frames++;
 		}

@@ -736,7 +757,13 @@  bool ice_clean_tx_irq_zc(struct ice_tx_ring *xdp_ring, int budget)
 	if (xsk_uses_need_wakeup(xdp_ring->xsk_pool))
 		xsk_set_tx_need_wakeup(xdp_ring->xsk_pool);

-	ice_update_tx_ring_stats(xdp_ring, total_packets, total_bytes);
+	ice_update_tx_ring_stats(xdp_ring, xdp_frames + xsk_frames,
+				 xdp_bytes + xsk_bytes);
+	xdp_update_tx_drv_stats(&xdp_ring->xdp_stats->xdp_tx, xdp_frames,
+				xdp_bytes);
+	xdp_update_tx_drv_stats(&xdp_ring->xdp_stats->xsk_tx, xsk_frames,
+				xsk_bytes);
+
 	xmit_done = ice_xmit_zc(xdp_ring, ICE_DFLT_IRQ_WORK);

 	return budget > 0 && xmit_done;