diff mbox series

[RFC,bpf-next,07/20] xdp: Track if metadata is supported in xdp_frame <> xdp_buff conversions

Message ID 20250305-afabre-traits-010-rfc2-v1-7-d0ecfb869797@cloudflare.com (mailing list archive)
State RFC
Delegated to: BPF
Headers show
Series traits: Per packet metadata KV store | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / GCC BPF
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for aarch64-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-12 success Logs for aarch64-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / GCC BPF
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-18 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for s390x-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-20 success Logs for s390x-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-21 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-22 fail Logs for x86_64-gcc / GCC BPF / GCC BPF
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 fail Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 fail Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-gcc / veristat-kernel / x86_64-gcc veristat_kernel
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-gcc / veristat-meta / x86_64-gcc veristat_meta
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-37 fail Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-38 fail Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-17 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-17 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-42 fail Logs for x86_64-llvm-18 / GCC BPF / GCC BPF
bpf/vmtest-bpf-next-VM_Test-43 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-44 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-45 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-46 fail Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-47 fail Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-48 fail Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-49 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-50 success Logs for x86_64-llvm-18 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-51 success Logs for x86_64-llvm-18 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-8 fail Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 fail Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-16 fail Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 fail Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-33 fail Logs for x86_64-llvm-17 / GCC BPF / GCC BPF
netdev/series_format fail Series longer than 15 patches
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline fail Detected static functions without inline keyword in header files: 1
netdev/build_32bit fail Errors and warnings before: 207 this patch: 265
netdev/build_tools success Errors and warnings before: 26 (+0) this patch: 26 (+0)
netdev/cc_maintainers warning 7 maintainers not CCed: daniel@iogearbox.net horms@kernel.org john.fastabend@gmail.com ast@kernel.org kuba@kernel.org pabeni@redhat.com edumazet@google.com
netdev/build_clang success Errors and warnings before: 8 this patch: 8
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 242 this patch: 242
netdev/checkpatch fail ERROR: space prohibited before that ':' (ctx:WxV)
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Arthur Fabre March 5, 2025, 2:32 p.m. UTC
From: Arthur Fabre <afabre@cloudflare.com>

xdp_buff stores whether metadata is supported by a NIC by setting
data_meta to be greater than data.

But xdp_frame only stores the metadata size (as metasize), so converting
between xdp_frame and xdp_buff is lossy.

Steal an unused bit in xdp_frame to track whether metadata is supported
or not.

This will lets us have "generic" functions for setting skb fields from
either xdp_frame or xdp_buff from drivers.

Signed-off-by: Arthur Fabre <afabre@cloudflare.com>
---
 include/net/xdp.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

Comments

Alexander Lobakin March 5, 2025, 3:24 p.m. UTC | #1
From: Arthur <arthur@arthurfabre.com>
Date: Wed, 05 Mar 2025 15:32:04 +0100

> From: Arthur Fabre <afabre@cloudflare.com>
> 
> xdp_buff stores whether metadata is supported by a NIC by setting
> data_meta to be greater than data.
> 
> But xdp_frame only stores the metadata size (as metasize), so converting
> between xdp_frame and xdp_buff is lossy.
> 
> Steal an unused bit in xdp_frame to track whether metadata is supported
> or not.
> 
> This will lets us have "generic" functions for setting skb fields from
> either xdp_frame or xdp_buff from drivers.
> 
> Signed-off-by: Arthur Fabre <afabre@cloudflare.com>
> ---
>  include/net/xdp.h | 10 +++++++++-
>  1 file changed, 9 insertions(+), 1 deletion(-)
> 
> diff --git a/include/net/xdp.h b/include/net/xdp.h
> index 58019fa299b56dbd45c104fdfa807f73af6e4fa4..84afe07d09efdb2ab0cb78b904f02cb74f9a56b6 100644
> --- a/include/net/xdp.h
> +++ b/include/net/xdp.h
> @@ -116,6 +116,9 @@ static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp)
>  	xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC;
>  }
>  
> +static bool xdp_data_meta_unsupported(const struct xdp_buff *xdp);
> +static void xdp_set_data_meta_invalid(struct xdp_buff *xdp);
> +
>  static __always_inline void *xdp_buff_traits(const struct xdp_buff *xdp)
>  {
>  	return xdp->data_hard_start + _XDP_FRAME_SIZE;
> @@ -270,7 +273,9 @@ struct xdp_frame {
>  	void *data;
>  	u32 len;
>  	u32 headroom;
> -	u32 metasize; /* uses lower 8-bits */
> +	u32	:23, /* unused */
> +		meta_unsupported:1,
> +		metasize:8;

See the history of this structure how we got rid of using bitfields here
and why.

...because of performance.

Even though metasize uses only 8 bits, 1-byte access is slower than
32-byte access.

I was going to write "you can use the fact that metasize is always a
multiple of 4 or that it's never > 252, for example, you could reuse LSB
as a flag indicating that meta is not supported", but first of all

Do we still have drivers which don't support metadata?
Why don't they do that? It's not HW-specific or even driver-specific.
They don't reserve headroom? Then they're invalid, at least XDP_REDIRECT
won't work.

So maybe we need to fix those drivers first, if there are any.

>  	/* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time,
>  	 * while mem_type is valid on remote CPU.
>  	 */
> @@ -369,6 +374,8 @@ void xdp_convert_frame_to_buff(const struct xdp_frame *frame,
>  	xdp->data = frame->data;
>  	xdp->data_end = frame->data + frame->len;
>  	xdp->data_meta = frame->data - frame->metasize;
> +	if (frame->meta_unsupported)
> +		xdp_set_data_meta_invalid(xdp);
>  	xdp->frame_sz = frame->frame_sz;
>  	xdp->flags = frame->flags;
>  }
> @@ -396,6 +403,7 @@ int xdp_update_frame_from_buff(const struct xdp_buff *xdp,
>  	xdp_frame->len  = xdp->data_end - xdp->data;
>  	xdp_frame->headroom = headroom - sizeof(*xdp_frame);
>  	xdp_frame->metasize = metasize;
> +	xdp_frame->meta_unsupported = xdp_data_meta_unsupported(xdp);
>  	xdp_frame->frame_sz = xdp->frame_sz;
>  	xdp_frame->flags = xdp->flags;

Thanks,
Olek
Arthur Fabre March 5, 2025, 5:02 p.m. UTC | #2
On Wed Mar 5, 2025 at 4:24 PM CET, Alexander Lobakin wrote:
> From: Arthur <arthur@arthurfabre.com>
> Date: Wed, 05 Mar 2025 15:32:04 +0100
>
> > From: Arthur Fabre <afabre@cloudflare.com>
> > 
> > xdp_buff stores whether metadata is supported by a NIC by setting
> > data_meta to be greater than data.
> > 
> > But xdp_frame only stores the metadata size (as metasize), so converting
> > between xdp_frame and xdp_buff is lossy.
> > 
> > Steal an unused bit in xdp_frame to track whether metadata is supported
> > or not.
> > 
> > This will lets us have "generic" functions for setting skb fields from
> > either xdp_frame or xdp_buff from drivers.
> > 
> > Signed-off-by: Arthur Fabre <afabre@cloudflare.com>
> > ---
> >  include/net/xdp.h | 10 +++++++++-
> >  1 file changed, 9 insertions(+), 1 deletion(-)
> > 
> > diff --git a/include/net/xdp.h b/include/net/xdp.h
> > index 58019fa299b56dbd45c104fdfa807f73af6e4fa4..84afe07d09efdb2ab0cb78b904f02cb74f9a56b6 100644
> > --- a/include/net/xdp.h
> > +++ b/include/net/xdp.h
> > @@ -116,6 +116,9 @@ static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp)
> >  	xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC;
> >  }
> >  
> > +static bool xdp_data_meta_unsupported(const struct xdp_buff *xdp);
> > +static void xdp_set_data_meta_invalid(struct xdp_buff *xdp);
> > +
> >  static __always_inline void *xdp_buff_traits(const struct xdp_buff *xdp)
> >  {
> >  	return xdp->data_hard_start + _XDP_FRAME_SIZE;
> > @@ -270,7 +273,9 @@ struct xdp_frame {
> >  	void *data;
> >  	u32 len;
> >  	u32 headroom;
> > -	u32 metasize; /* uses lower 8-bits */
> > +	u32	:23, /* unused */
> > +		meta_unsupported:1,
> > +		metasize:8;
>
> See the history of this structure how we got rid of using bitfields here
> and why.
>
> ...because of performance.
>
> Even though metasize uses only 8 bits, 1-byte access is slower than
> 32-byte access.

Interesting, thanks!

> I was going to write "you can use the fact that metasize is always a
> multiple of 4 or that it's never > 252, for example, you could reuse LSB
> as a flag indicating that meta is not supported", but first of all
>
> Do we still have drivers which don't support metadata?
> Why don't they do that? It's not HW-specific or even driver-specific.
> They don't reserve headroom? Then they're invalid, at least XDP_REDIRECT
> won't work.
>
> So maybe we need to fix those drivers first, if there are any.

Most drivers don't support metadata unfortunately:

> rg -U "xdp_prepare_buff\([^)]*false\);" drivers/net/
drivers/net/tun.c
1712:		xdp_prepare_buff(&xdp, buf, pad, len, false);

drivers/net/ethernet/microsoft/mana/mana_bpf.c
94:	xdp_prepare_buff(xdp, buf_va, XDP_PACKET_HEADROOM, pkt_len, false);

drivers/net/ethernet/marvell/mvneta.c
2344:	xdp_prepare_buff(xdp, data, pp->rx_offset_correction + MVNETA_MH_SIZE,
2345:			 data_len, false);

drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c
1436:	xdp_prepare_buff(&xdp, hard_start, OTX2_HEAD_ROOM,
1437:			 cqe->sg.seg_size, false);

drivers/net/ethernet/socionext/netsec.c
1021:		xdp_prepare_buff(&xdp, desc->addr, NETSEC_RXBUF_HEADROOM,
1022:				 pkt_len, false);

drivers/net/ethernet/google/gve/gve_rx.c
740:	xdp_prepare_buff(&new, frame, headroom, len, false);
859:		xdp_prepare_buff(&xdp, page_info->page_address +
860:				 page_info->page_offset, GVE_RX_PAD,
861:				 len, false);

drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
3984:			xdp_prepare_buff(&xdp, data,
3985:					 MVPP2_MH_SIZE + MVPP2_SKB_HEADROOM,
3986:					 rx_bytes, false);

drivers/net/ethernet/aquantia/atlantic/aq_ring.c
794:		xdp_prepare_buff(&xdp, hard_start, rx_ring->page_offset,
795:				 buff->len, false);

drivers/net/ethernet/cavium/thunder/nicvf_main.c
554:	xdp_prepare_buff(&xdp, hard_start, data - hard_start, len, false);

drivers/net/ethernet/ti/cpsw_new.c
348:		xdp_prepare_buff(&xdp, pa, headroom, size, false);

drivers/net/ethernet/freescale/enetc/enetc.c
1710:	xdp_prepare_buff(xdp_buff, hard_start - rx_ring->buffer_offset,
1711:			 rx_ring->buffer_offset, size, false);

drivers/net/ethernet/ti/am65-cpsw-nuss.c
1335:		xdp_prepare_buff(&xdp, page_addr, AM65_CPSW_HEADROOM,
1336:				 pkt_len, false);

drivers/net/ethernet/ti/cpsw.c
403:		xdp_prepare_buff(&xdp, pa, headroom, size, false);

drivers/net/ethernet/sfc/rx.c
289:	xdp_prepare_buff(&xdp, *ehp - EFX_XDP_HEADROOM, EFX_XDP_HEADROOM,
290:			 rx_buf->len, false);

drivers/net/ethernet/mediatek/mtk_eth_soc.c
2097:			xdp_prepare_buff(&xdp, data, MTK_PP_HEADROOM, pktlen,
2098:					 false);

drivers/net/ethernet/sfc/siena/rx.c
291:	xdp_prepare_buff(&xdp, *ehp - EFX_XDP_HEADROOM, EFX_XDP_HEADROOM,
292:			 rx_buf->len, false)

I don't know if it's just because no one has added calls to
skb_metadata_set() in yet, or if there's a more fundamental reason.

I think they all reserve some amount of headroom, but not always the
full XDP_PACKET_HEADROOM. Eg sfc:

drivers/net/ethernet/sfc/net_driver.h:
/* Non-standard XDP_PACKET_HEADROOM and tailroom to satisfy XDP_REDIRECT and
 * still fit two standard MTU size packets into a single 4K page.
 */
#define EFX_XDP_HEADROOM	128

If it's just because skb_metadata_set() is missing, I can take the
patches from this series that adds a "generic" XDP -> skb hook ("trait:
Propagate presence of traits to sk_buff"), have it call
skb_metadata_set(), and try to add it to all the drivers in a separate
series.

> >  	/* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time,
> >  	 * while mem_type is valid on remote CPU.
> >  	 */
> > @@ -369,6 +374,8 @@ void xdp_convert_frame_to_buff(const struct xdp_frame *frame,
> >  	xdp->data = frame->data;
> >  	xdp->data_end = frame->data + frame->len;
> >  	xdp->data_meta = frame->data - frame->metasize;
> > +	if (frame->meta_unsupported)
> > +		xdp_set_data_meta_invalid(xdp);
> >  	xdp->frame_sz = frame->frame_sz;
> >  	xdp->flags = frame->flags;
> >  }
> > @@ -396,6 +403,7 @@ int xdp_update_frame_from_buff(const struct xdp_buff *xdp,
> >  	xdp_frame->len  = xdp->data_end - xdp->data;
> >  	xdp_frame->headroom = headroom - sizeof(*xdp_frame);
> >  	xdp_frame->metasize = metasize;
> > +	xdp_frame->meta_unsupported = xdp_data_meta_unsupported(xdp);
> >  	xdp_frame->frame_sz = xdp->frame_sz;
> >  	xdp_frame->flags = xdp->flags;
>
> Thanks,
> Olek
Jesper Dangaard Brouer March 6, 2025, 11:12 a.m. UTC | #3
On 05/03/2025 18.02, Arthur Fabre wrote:
> On Wed Mar 5, 2025 at 4:24 PM CET, Alexander Lobakin wrote:
>> From: Arthur <arthur@arthurfabre.com>
>> Date: Wed, 05 Mar 2025 15:32:04 +0100
>>
>>> From: Arthur Fabre <afabre@cloudflare.com>
>>>
>>> xdp_buff stores whether metadata is supported by a NIC by setting
>>> data_meta to be greater than data.
>>>
>>> But xdp_frame only stores the metadata size (as metasize), so converting
>>> between xdp_frame and xdp_buff is lossy.
>>>
>>> Steal an unused bit in xdp_frame to track whether metadata is supported
>>> or not.
>>>
>>> This will lets us have "generic" functions for setting skb fields from
>>> either xdp_frame or xdp_buff from drivers.
>>>
>>> Signed-off-by: Arthur Fabre <afabre@cloudflare.com>
>>> ---
>>>   include/net/xdp.h | 10 +++++++++-
>>>   1 file changed, 9 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/include/net/xdp.h b/include/net/xdp.h
>>> index 58019fa299b56dbd45c104fdfa807f73af6e4fa4..84afe07d09efdb2ab0cb78b904f02cb74f9a56b6 100644
>>> --- a/include/net/xdp.h
>>> +++ b/include/net/xdp.h
>>> @@ -116,6 +116,9 @@ static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp)
>>>   	xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC;
>>>   }
>>>   
>>> +static bool xdp_data_meta_unsupported(const struct xdp_buff *xdp);
>>> +static void xdp_set_data_meta_invalid(struct xdp_buff *xdp);
>>> +
>>>   static __always_inline void *xdp_buff_traits(const struct xdp_buff *xdp)
>>>   {
>>>   	return xdp->data_hard_start + _XDP_FRAME_SIZE;
>>> @@ -270,7 +273,9 @@ struct xdp_frame {
>>>   	void *data;
>>>   	u32 len;
>>>   	u32 headroom;
>>> -	u32 metasize; /* uses lower 8-bits */
>>> +	u32	:23, /* unused */
>>> +		meta_unsupported:1,
>>> +		metasize:8;
>>
>> See the history of this structure how we got rid of using bitfields here
>> and why.
>>
>> ...because of performance.
>>
>> Even though metasize uses only 8 bits, 1-byte access is slower than
>> 32-byte access.
> 
> Interesting, thanks!
> 

I agree with Olek, we should not use bitfields.  Thanks for catching this.

(The xdp_frame have a flags member...)
Why don't we use the flags member for storing this information?


>> I was going to write "you can use the fact that metasize is always a
>> multiple of 4 or that it's never > 252, for example, you could reuse LSB
>> as a flag indicating that meta is not supported", but first of all
>>
>> Do we still have drivers which don't support metadata?
>> Why don't they do that? It's not HW-specific or even driver-specific.
>> They don't reserve headroom? Then they're invalid, at least XDP_REDIRECT
>> won't work.
>>

I'm fairly sure that all drivers support XDP_REDIRECT.
Except didn't Lorenzo add a feature bit for this?
(so, some drivers might explicitly not-support this)

>> So maybe we need to fix those drivers first, if there are any.
> 
> Most drivers don't support metadata unfortunately:
> 
>> rg -U "xdp_prepare_buff\([^)]*false\);" drivers/net/
> drivers/net/tun.c
> 1712:		xdp_prepare_buff(&xdp, buf, pad, len, false);
> 
> drivers/net/ethernet/microsoft/mana/mana_bpf.c
> 94:	xdp_prepare_buff(xdp, buf_va, XDP_PACKET_HEADROOM, pkt_len, false);
> 
> drivers/net/ethernet/marvell/mvneta.c
> 2344:	xdp_prepare_buff(xdp, data, pp->rx_offset_correction + MVNETA_MH_SIZE,
> 2345:			 data_len, false);
> 
> drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c
> 1436:	xdp_prepare_buff(&xdp, hard_start, OTX2_HEAD_ROOM,
> 1437:			 cqe->sg.seg_size, false);
> 
> drivers/net/ethernet/socionext/netsec.c
> 1021:		xdp_prepare_buff(&xdp, desc->addr, NETSEC_RXBUF_HEADROOM,
> 1022:				 pkt_len, false);
> 
> drivers/net/ethernet/google/gve/gve_rx.c
> 740:	xdp_prepare_buff(&new, frame, headroom, len, false);
> 859:		xdp_prepare_buff(&xdp, page_info->page_address +
> 860:				 page_info->page_offset, GVE_RX_PAD,
> 861:				 len, false);
> 
> drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
> 3984:			xdp_prepare_buff(&xdp, data,
> 3985:					 MVPP2_MH_SIZE + MVPP2_SKB_HEADROOM,
> 3986:					 rx_bytes, false);
> 
> drivers/net/ethernet/aquantia/atlantic/aq_ring.c
> 794:		xdp_prepare_buff(&xdp, hard_start, rx_ring->page_offset,
> 795:				 buff->len, false);
> 
> drivers/net/ethernet/cavium/thunder/nicvf_main.c
> 554:	xdp_prepare_buff(&xdp, hard_start, data - hard_start, len, false);
> 
> drivers/net/ethernet/ti/cpsw_new.c
> 348:		xdp_prepare_buff(&xdp, pa, headroom, size, false);
> 
> drivers/net/ethernet/freescale/enetc/enetc.c
> 1710:	xdp_prepare_buff(xdp_buff, hard_start - rx_ring->buffer_offset,
> 1711:			 rx_ring->buffer_offset, size, false);
> 
> drivers/net/ethernet/ti/am65-cpsw-nuss.c
> 1335:		xdp_prepare_buff(&xdp, page_addr, AM65_CPSW_HEADROOM,
> 1336:				 pkt_len, false);
> 
> drivers/net/ethernet/ti/cpsw.c
> 403:		xdp_prepare_buff(&xdp, pa, headroom, size, false);
> 
> drivers/net/ethernet/sfc/rx.c
> 289:	xdp_prepare_buff(&xdp, *ehp - EFX_XDP_HEADROOM, EFX_XDP_HEADROOM,
> 290:			 rx_buf->len, false);
> 
> drivers/net/ethernet/mediatek/mtk_eth_soc.c
> 2097:			xdp_prepare_buff(&xdp, data, MTK_PP_HEADROOM, pktlen,
> 2098:					 false);
> 
> drivers/net/ethernet/sfc/siena/rx.c
> 291:	xdp_prepare_buff(&xdp, *ehp - EFX_XDP_HEADROOM, EFX_XDP_HEADROOM,
> 292:			 rx_buf->len, false)
> 
> I don't know if it's just because no one has added calls to
> skb_metadata_set() in yet, or if there's a more fundamental reason.
> 

I simply think driver developers have been lazy.

If someone want some easy kernel commits, these drivers should be fairly
easy to fix...

> I think they all reserve some amount of headroom, but not always the
> full XDP_PACKET_HEADROOM. Eg sfc:
> 

The Intel drivers use 192 (AFAIK if that is still true). The API ended
up supporting non-standard XDP_PACKET_HEADROOM, due to the Intel
drivers, when XDP support was added to those (which is a long time ago now).

> drivers/net/ethernet/sfc/net_driver.h:
> /* Non-standard XDP_PACKET_HEADROOM and tailroom to satisfy XDP_REDIRECT and
>   * still fit two standard MTU size packets into a single 4K page.
>   */
> #define EFX_XDP_HEADROOM	128
> 

This is smaller than most drivers, but still have enough headroom for 
xdp_frame + traits.

> If it's just because skb_metadata_set() is missing, I can take the
> patches from this series that adds a "generic" XDP -> skb hook ("trait:
> Propagate presence of traits to sk_buff"), have it call
> skb_metadata_set(), and try to add it to all the drivers in a separate
> series.
> 

I think someone should cleanup those drivers and add support.

--Jesper

>>>   	/* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time,
>>>   	 * while mem_type is valid on remote CPU.
>>>   	 */
>>> @@ -369,6 +374,8 @@ void xdp_convert_frame_to_buff(const struct xdp_frame *frame,
>>>   	xdp->data = frame->data;
>>>   	xdp->data_end = frame->data + frame->len;
>>>   	xdp->data_meta = frame->data - frame->metasize;
>>> +	if (frame->meta_unsupported)
>>> +		xdp_set_data_meta_invalid(xdp);
>>>   	xdp->frame_sz = frame->frame_sz;
>>>   	xdp->flags = frame->flags;
>>>   }
>>> @@ -396,6 +403,7 @@ int xdp_update_frame_from_buff(const struct xdp_buff *xdp,
>>>   	xdp_frame->len  = xdp->data_end - xdp->data;
>>>   	xdp_frame->headroom = headroom - sizeof(*xdp_frame);
>>>   	xdp_frame->metasize = metasize;
>>> +	xdp_frame->meta_unsupported = xdp_data_meta_unsupported(xdp);
>>>   	xdp_frame->frame_sz = xdp->frame_sz;
>>>   	xdp_frame->flags = xdp->flags;
>>
>> Thanks,
>> Olek
Lorenzo Bianconi March 10, 2025, 11:10 a.m. UTC | #4
[...]
> > > 
> 
> I'm fairly sure that all drivers support XDP_REDIRECT.
> Except didn't Lorenzo add a feature bit for this?
> (so, some drivers might explicitly not-support this)

I think most of the drivers support XDP_REDIRECT. IIRC just some vf
implementations (e.g. ixgbevf or thunder/nicvf do not support XDP_REDIRECT).
Maybe nfp is a special case.

Regards,
Lorenzo

> 
> > > So maybe we need to fix those drivers first, if there are any.
> > 
> > Most drivers don't support metadata unfortunately:
> > 
> > > rg -U "xdp_prepare_buff\([^)]*false\);" drivers/net/
> > drivers/net/tun.c
> > 1712:		xdp_prepare_buff(&xdp, buf, pad, len, false);
> > 
> > drivers/net/ethernet/microsoft/mana/mana_bpf.c
> > 94:	xdp_prepare_buff(xdp, buf_va, XDP_PACKET_HEADROOM, pkt_len, false);
> > 
> > drivers/net/ethernet/marvell/mvneta.c
> > 2344:	xdp_prepare_buff(xdp, data, pp->rx_offset_correction + MVNETA_MH_SIZE,
> > 2345:			 data_len, false);
> > 
> > drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c
> > 1436:	xdp_prepare_buff(&xdp, hard_start, OTX2_HEAD_ROOM,
> > 1437:			 cqe->sg.seg_size, false);
> > 
> > drivers/net/ethernet/socionext/netsec.c
> > 1021:		xdp_prepare_buff(&xdp, desc->addr, NETSEC_RXBUF_HEADROOM,
> > 1022:				 pkt_len, false);
> > 
> > drivers/net/ethernet/google/gve/gve_rx.c
> > 740:	xdp_prepare_buff(&new, frame, headroom, len, false);
> > 859:		xdp_prepare_buff(&xdp, page_info->page_address +
> > 860:				 page_info->page_offset, GVE_RX_PAD,
> > 861:				 len, false);
> > 
> > drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
> > 3984:			xdp_prepare_buff(&xdp, data,
> > 3985:					 MVPP2_MH_SIZE + MVPP2_SKB_HEADROOM,
> > 3986:					 rx_bytes, false);
> > 
> > drivers/net/ethernet/aquantia/atlantic/aq_ring.c
> > 794:		xdp_prepare_buff(&xdp, hard_start, rx_ring->page_offset,
> > 795:				 buff->len, false);
> > 
> > drivers/net/ethernet/cavium/thunder/nicvf_main.c
> > 554:	xdp_prepare_buff(&xdp, hard_start, data - hard_start, len, false);
> > 
> > drivers/net/ethernet/ti/cpsw_new.c
> > 348:		xdp_prepare_buff(&xdp, pa, headroom, size, false);
> > 
> > drivers/net/ethernet/freescale/enetc/enetc.c
> > 1710:	xdp_prepare_buff(xdp_buff, hard_start - rx_ring->buffer_offset,
> > 1711:			 rx_ring->buffer_offset, size, false);
> > 
> > drivers/net/ethernet/ti/am65-cpsw-nuss.c
> > 1335:		xdp_prepare_buff(&xdp, page_addr, AM65_CPSW_HEADROOM,
> > 1336:				 pkt_len, false);
> > 
> > drivers/net/ethernet/ti/cpsw.c
> > 403:		xdp_prepare_buff(&xdp, pa, headroom, size, false);
> > 
> > drivers/net/ethernet/sfc/rx.c
> > 289:	xdp_prepare_buff(&xdp, *ehp - EFX_XDP_HEADROOM, EFX_XDP_HEADROOM,
> > 290:			 rx_buf->len, false);
> > 
> > drivers/net/ethernet/mediatek/mtk_eth_soc.c
> > 2097:			xdp_prepare_buff(&xdp, data, MTK_PP_HEADROOM, pktlen,
> > 2098:					 false);
> > 
> > drivers/net/ethernet/sfc/siena/rx.c
> > 291:	xdp_prepare_buff(&xdp, *ehp - EFX_XDP_HEADROOM, EFX_XDP_HEADROOM,
> > 292:			 rx_buf->len, false)
> > 
> > I don't know if it's just because no one has added calls to
> > skb_metadata_set() in yet, or if there's a more fundamental reason.
> > 
> 
> I simply think driver developers have been lazy.
> 
> If someone want some easy kernel commits, these drivers should be fairly
> easy to fix...
> 
> > I think they all reserve some amount of headroom, but not always the
> > full XDP_PACKET_HEADROOM. Eg sfc:
> > 
> 
> The Intel drivers use 192 (AFAIK if that is still true). The API ended
> up supporting non-standard XDP_PACKET_HEADROOM, due to the Intel
> drivers, when XDP support was added to those (which is a long time ago now).
> 
> > drivers/net/ethernet/sfc/net_driver.h:
> > /* Non-standard XDP_PACKET_HEADROOM and tailroom to satisfy XDP_REDIRECT and
> >   * still fit two standard MTU size packets into a single 4K page.
> >   */
> > #define EFX_XDP_HEADROOM	128
> > 
> 
> This is smaller than most drivers, but still have enough headroom for
> xdp_frame + traits.
> 
> > If it's just because skb_metadata_set() is missing, I can take the
> > patches from this series that adds a "generic" XDP -> skb hook ("trait:
> > Propagate presence of traits to sk_buff"), have it call
> > skb_metadata_set(), and try to add it to all the drivers in a separate
> > series.
> > 
> 
> I think someone should cleanup those drivers and add support.
> 
> --Jesper
> 
> > > >   	/* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time,
> > > >   	 * while mem_type is valid on remote CPU.
> > > >   	 */
> > > > @@ -369,6 +374,8 @@ void xdp_convert_frame_to_buff(const struct xdp_frame *frame,
> > > >   	xdp->data = frame->data;
> > > >   	xdp->data_end = frame->data + frame->len;
> > > >   	xdp->data_meta = frame->data - frame->metasize;
> > > > +	if (frame->meta_unsupported)
> > > > +		xdp_set_data_meta_invalid(xdp);
> > > >   	xdp->frame_sz = frame->frame_sz;
> > > >   	xdp->flags = frame->flags;
> > > >   }
> > > > @@ -396,6 +403,7 @@ int xdp_update_frame_from_buff(const struct xdp_buff *xdp,
> > > >   	xdp_frame->len  = xdp->data_end - xdp->data;
> > > >   	xdp_frame->headroom = headroom - sizeof(*xdp_frame);
> > > >   	xdp_frame->metasize = metasize;
> > > > +	xdp_frame->meta_unsupported = xdp_data_meta_unsupported(xdp);
> > > >   	xdp_frame->frame_sz = xdp->frame_sz;
> > > >   	xdp_frame->flags = xdp->flags;
> > > 
> > > Thanks,
> > > Olek
>
diff mbox series

Patch

diff --git a/include/net/xdp.h b/include/net/xdp.h
index 58019fa299b56dbd45c104fdfa807f73af6e4fa4..84afe07d09efdb2ab0cb78b904f02cb74f9a56b6 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -116,6 +116,9 @@  static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp)
 	xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC;
 }
 
+static bool xdp_data_meta_unsupported(const struct xdp_buff *xdp);
+static void xdp_set_data_meta_invalid(struct xdp_buff *xdp);
+
 static __always_inline void *xdp_buff_traits(const struct xdp_buff *xdp)
 {
 	return xdp->data_hard_start + _XDP_FRAME_SIZE;
@@ -270,7 +273,9 @@  struct xdp_frame {
 	void *data;
 	u32 len;
 	u32 headroom;
-	u32 metasize; /* uses lower 8-bits */
+	u32	:23, /* unused */
+		meta_unsupported:1,
+		metasize:8;
 	/* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time,
 	 * while mem_type is valid on remote CPU.
 	 */
@@ -369,6 +374,8 @@  void xdp_convert_frame_to_buff(const struct xdp_frame *frame,
 	xdp->data = frame->data;
 	xdp->data_end = frame->data + frame->len;
 	xdp->data_meta = frame->data - frame->metasize;
+	if (frame->meta_unsupported)
+		xdp_set_data_meta_invalid(xdp);
 	xdp->frame_sz = frame->frame_sz;
 	xdp->flags = frame->flags;
 }
@@ -396,6 +403,7 @@  int xdp_update_frame_from_buff(const struct xdp_buff *xdp,
 	xdp_frame->len  = xdp->data_end - xdp->data;
 	xdp_frame->headroom = headroom - sizeof(*xdp_frame);
 	xdp_frame->metasize = metasize;
+	xdp_frame->meta_unsupported = xdp_data_meta_unsupported(xdp);
 	xdp_frame->frame_sz = xdp->frame_sz;
 	xdp_frame->flags = xdp->flags;