[bpf-next,1/9] xsk: Support XDP_TX_METADATA_LEN

Message ID: 20230809165418.2831456-2-sdf@google.com (mailing list archive)
State: Changes Requested
Delegated to: BPF
Series: xsk: TX metadata

Checks

Context Check Description
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ${{ matrix.test }} on ${{ matrix.arch }} with ${{ matrix.toolchain_full }}
bpf/vmtest-bpf-next-VM_Test-2 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-3 fail Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-4 fail Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-5 fail Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 fail Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-7 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-8 success Logs for veristat
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1356 this patch: 1356
netdev/cc_maintainers warning 4 maintainers not CCed: pabeni@redhat.com edumazet@google.com davem@davemloft.net jonathan.lemon@gmail.com
netdev/build_clang success Errors and warnings before: 1351 this patch: 1351
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1379 this patch: 1379
netdev/checkpatch warning WARNING: line length of 87 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Stanislav Fomichev Aug. 9, 2023, 4:54 p.m. UTC
In zerocopy mode, tx_desc->addr can point at an arbitrary offset and
carry some TX metadata in the headroom. In copy mode, there is
currently no way to populate skb metadata.

Introduce a new XDP_TX_METADATA_LEN setsockopt that indicates how many
bytes to treat as metadata. Metadata bytes come immediately before the
address in tx_desc->addr (same as in the RX case).

The size of the metadata has the same constraints as XDP:
- no more than 256 bytes
- 4-byte aligned
- non-zero

This data is not interpreted in any way right now.

Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 include/net/xdp_sock.h      |  1 +
 include/net/xsk_buff_pool.h |  1 +
 include/uapi/linux/if_xdp.h |  1 +
 net/xdp/xsk.c               | 20 ++++++++++++++++++++
 net/xdp/xsk_buff_pool.c     |  1 +
 net/xdp/xsk_queue.h         | 17 ++++++++++-------
 6 files changed, 34 insertions(+), 7 deletions(-)
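
For reference, a minimal userspace sketch of how this knob would be
used (a sketch, not part of the patch; it assumes uapi headers carrying
this patch's XDP_TX_METADATA_LEN and a libc that defines AF_XDP and
SOL_XDP, and it omits the usual umem/ring setup):

#include <linux/if_xdp.h>
#include <sys/socket.h>
#include <stdio.h>

int main(void)
{
	int fd = socket(AF_XDP, SOCK_RAW, 0);
	int val = 16; /* non-zero, 4-byte aligned, at most 256 */

	if (fd < 0) {
		perror("socket(AF_XDP)");
		return 1;
	}

	/* Must happen while the socket is still XSK_READY, i.e.
	 * before bind(); otherwise the kernel returns -EBUSY. */
	if (setsockopt(fd, SOL_XDP, XDP_TX_METADATA_LEN,
		       &val, sizeof(val))) {
		perror("setsockopt(XDP_TX_METADATA_LEN)");
		return 1;
	}

	/* From here on, for every TX descriptor, the val bytes at
	 * tx_desc->addr - val are treated as TX metadata. */
	return 0;
}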

Comments

Fijalkowski, Maciej Aug. 14, 2023, 10:56 a.m. UTC | #1
On Wed, Aug 09, 2023 at 09:54:10AM -0700, Stanislav Fomichev wrote:
> [...]
> diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
> index b3f7b310811e..b351732f1032 100644
> --- a/net/xdp/xsk_buff_pool.c
> +++ b/net/xdp/xsk_buff_pool.c
> @@ -85,6 +85,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
>  		XDP_PACKET_HEADROOM;
>  	pool->umem = umem;
>  	pool->addrs = umem->addrs;
> +	pool->tx_metadata_len = xs->tx_metadata_len;

Hey Stan,

what would happen in the case when one socket sets pool->tx_metadata_len
to, say, 16 and another one sharing the pool sets it to 24? If sockets
can and should have different padding, then this will not work unless
metadata_len is kept on a per-socket level.

> [...]
Stanislav Fomichev Aug. 14, 2023, 6:05 p.m. UTC | #2
On Mon, Aug 14, 2023 at 3:57 AM Maciej Fijalkowski
<maciej.fijalkowski@intel.com> wrote:
>
> On Wed, Aug 09, 2023 at 09:54:10AM -0700, Stanislav Fomichev wrote:
> > [...]
> > diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
> > index b3f7b310811e..b351732f1032 100644
> > --- a/net/xdp/xsk_buff_pool.c
> > +++ b/net/xdp/xsk_buff_pool.c
> > @@ -85,6 +85,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
> >               XDP_PACKET_HEADROOM;
> >       pool->umem = umem;
> >       pool->addrs = umem->addrs;
> > +     pool->tx_metadata_len = xs->tx_metadata_len;
>
> Hey Stan,
>
> > what would happen in the case when one socket sets pool->tx_metadata_len
> > to, say, 16 and another one sharing the pool sets it to 24? If sockets
> > can and should have different padding, then this will not work unless
> > metadata_len is kept on a per-socket level.

Hmm, good point. I didn't think about umem sharing :-/
Maybe they all have to agree on the size? And once the size has been
set by one socket, the same size should be set on the others? (or at
least it should be implied that the other sockets will use the same
metadata layout)
Stanislav Fomichev Aug. 14, 2023, 10:24 p.m. UTC | #3
On Mon, Aug 14, 2023 at 11:05 AM Stanislav Fomichev <sdf@google.com> wrote:
>
> On Mon, Aug 14, 2023 at 3:57 AM Maciej Fijalkowski
> <maciej.fijalkowski@intel.com> wrote:
> >
> > On Wed, Aug 09, 2023 at 09:54:10AM -0700, Stanislav Fomichev wrote:
> > > [...]
> > > diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
> > > index b3f7b310811e..b351732f1032 100644
> > > --- a/net/xdp/xsk_buff_pool.c
> > > +++ b/net/xdp/xsk_buff_pool.c
> > > @@ -85,6 +85,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
> > >               XDP_PACKET_HEADROOM;
> > >       pool->umem = umem;
> > >       pool->addrs = umem->addrs;
> > > +     pool->tx_metadata_len = xs->tx_metadata_len;
> >
> > Hey Stan,
> >
> > what would happen in the case when one socket sets pool->tx_metadata_len
> > to, say, 16 and another one sharing the pool sets it to 24? If sockets
> > can and should have different padding, then this will not work unless
> > metadata_len is kept on a per-socket level.
>
> Hmm, good point. I didn't think about umem sharing :-/
> Maybe they all have to agree on the size? And once the size has been
> set by one socket, the same size should be set on the others? (or at
> least it should be implied that the other sockets will use the same
> metadata layout)

Thinking about it a bit more: should that tx_metadata_len be part of
the umem then?
Users could provide it as part of xdp_umem_reg, which is probably a
better place to pass it than the setsockopt?
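
For illustration, moving the knob into umem registration would mean
extending the uapi struct along these lines (a hypothetical sketch
drawn from this thread; the field name and placement are assumptions,
not part of this patch):

struct xdp_umem_reg {
	__u64 addr;		/* Start of packet data area */
	__u64 len;		/* Length of packet data area */
	__u32 chunk_size;
	__u32 headroom;
	__u32 flags;
	__u32 tx_metadata_len;	/* hypothetical: fixed for the whole umem */
};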
Magnus Karlsson Aug. 15, 2023, 12:19 p.m. UTC | #4
On Tue, 15 Aug 2023 at 00:25, Stanislav Fomichev <sdf@google.com> wrote:
>
> On Mon, Aug 14, 2023 at 11:05 AM Stanislav Fomichev <sdf@google.com> wrote:
> >
> > On Mon, Aug 14, 2023 at 3:57 AM Maciej Fijalkowski
> > <maciej.fijalkowski@intel.com> wrote:
> > >
> > > On Wed, Aug 09, 2023 at 09:54:10AM -0700, Stanislav Fomichev wrote:
> > > > [...]
> > > > diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
> > > > index b3f7b310811e..b351732f1032 100644
> > > > --- a/net/xdp/xsk_buff_pool.c
> > > > +++ b/net/xdp/xsk_buff_pool.c
> > > > @@ -85,6 +85,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
> > > >               XDP_PACKET_HEADROOM;
> > > >       pool->umem = umem;
> > > >       pool->addrs = umem->addrs;
> > > > +     pool->tx_metadata_len = xs->tx_metadata_len;
> > >
> > > Hey Stan,
> > >
> > > what would happen in the case when one socket sets pool->tx_metadata_len
> > > to, say, 16 and another one sharing the pool sets it to 24? If sockets
> > > can and should have different padding, then this will not work unless
> > > metadata_len is kept on a per-socket level.
> >
> > Hmm, good point. I didn't think about umem sharing :-/
> > Maybe they all have to agree on the size? And once the size has been
> > set by one socket, the same size should be set on the others? (or at
> > least it should be implied that the other sockets will use the same
> > metadata layout)
>
> Thinking about it a bit more: should that tx_metadata_len be part of
> the umem then?
> Users could provide it as part of xdp_umem_reg, which is probably a
> better place to pass it than the setsockopt?

If all the sockets sharing the umem have to agree on the
tx_metadata_len, then this is a better place than the setsockopt.
IMHO, it sounds unlikely that multiple versions of the same program,
or different programs, would want to share the same umem.

Please note that some of the members of struct xdp_umem are copied out
to the individual struct xsk_buff_pool even though they are the same
between all of them (chunk_size and the size of the umem, for example).
The reason for this is performance: it avoids having to access the
umem struct in the fast path. Something similar might be a good idea
here too, even though tx_metadata_len is fixed for a umem being
shared. Might be worth trying.

Again, thanks for working on this Stanislav.
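
Concretely, Magnus's suggestion amounts to mirroring the field when
the pool is created, along the lines of the sketch below (the helper
name is invented, and it assumes tx_metadata_len has moved into
struct xdp_umem as discussed above):

static void xp_inherit_umem_fields(struct xsk_buff_pool *pool,
				   struct xdp_umem *umem)
{
	/* With tx_metadata_len stored in the umem, mirror it into
	 * each pool so the descriptor-validation fast path never
	 * dereferences pool->umem -- the same pattern chunk_size
	 * already follows. */
	pool->chunk_size = umem->chunk_size;
	pool->tx_metadata_len = umem->tx_metadata_len; /* assumed field */
}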
Stanislav Fomichev Aug. 15, 2023, 6:21 p.m. UTC | #5
On Tue, Aug 15, 2023 at 5:19 AM Magnus Karlsson
<magnus.karlsson@gmail.com> wrote:
>
> On Tue, 15 Aug 2023 at 00:25, Stanislav Fomichev <sdf@google.com> wrote:
> >
> > On Mon, Aug 14, 2023 at 11:05 AM Stanislav Fomichev <sdf@google.com> wrote:
> > >
> > > On Mon, Aug 14, 2023 at 3:57 AM Maciej Fijalkowski
> > > <maciej.fijalkowski@intel.com> wrote:
> > > >
> > > > On Wed, Aug 09, 2023 at 09:54:10AM -0700, Stanislav Fomichev wrote:
> > > > > [...]
> > > > > diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
> > > > > index b3f7b310811e..b351732f1032 100644
> > > > > --- a/net/xdp/xsk_buff_pool.c
> > > > > +++ b/net/xdp/xsk_buff_pool.c
> > > > > @@ -85,6 +85,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
> > > > >               XDP_PACKET_HEADROOM;
> > > > >       pool->umem = umem;
> > > > >       pool->addrs = umem->addrs;
> > > > > +     pool->tx_metadata_len = xs->tx_metadata_len;
> > > >
> > > > Hey Stan,
> > > >
> > > > what would happen in the case when one socket sets pool->tx_metadata_len
> > > > to, say, 16 and another one sharing the pool sets it to 24? If sockets
> > > > can and should have different padding, then this will not work unless
> > > > metadata_len is kept on a per-socket level.
> > >
> > > Hmm, good point. I didn't think about umem sharing :-/
> > > Maybe they all have to agree on the size? And once the size has been
> > > set by one socket, the same size should be set on the others? (or at
> > > least it should be implied that the other sockets will use the same
> > > metadata layout)
> >
> > Thinking about it a bit more: should that tx_metadata_len be part of
> > the umem then?
> > Users could provide it as part of xdp_umem_reg, which is probably a
> > better place to pass it than the setsockopt?
>
> If all the sockets sharing the umem have to agree on the
> tx_metadata_len, then this is a better place than the setsockopt.
> IMHO, it sounds unlikely that multiple versions of the same program,
> or different programs, would want to share the same umem.
>
> Please note that some of the members of struct xdp_umem are copied out
> to the individual struct xsk_buff_pool even though they are the same
> between all of them (chunk_size and the size of the umem, for example).
> The reason for this is performance: it avoids having to access the
> umem struct in the fast path. Something similar might be a good idea
> here too, even though tx_metadata_len is fixed for a umem being
> shared. Might be worth trying.
>
> Again, thanks for working on this Stanislav.

Perfect, thank you, I was trying a similar idea with a copy into the pool!

Patch

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 1617af380162..467b9fb56827 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -51,6 +51,7 @@  struct xdp_sock {
 	struct list_head flush_node;
 	struct xsk_buff_pool *pool;
 	u16 queue_id;
+	u8 tx_metadata_len;
 	bool zc;
 	bool sg;
 	enum {
diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index b0bdff26fc88..9c31e8d1e198 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -77,6 +77,7 @@  struct xsk_buff_pool {
 	u32 chunk_size;
 	u32 chunk_shift;
 	u32 frame_len;
+	u8 tx_metadata_len; /* inherited from xsk_sock */
 	u8 cached_need_wakeup;
 	bool uses_need_wakeup;
 	bool dma_need_sync;
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index 8d48863472b9..b37b50102e1c 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -69,6 +69,7 @@  struct xdp_mmap_offsets {
 #define XDP_UMEM_COMPLETION_RING	6
 #define XDP_STATISTICS			7
 #define XDP_OPTIONS			8
+#define XDP_TX_METADATA_LEN		9
 
 struct xdp_umem_reg {
 	__u64 addr; /* Start of packet data area */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 47796a5a79b3..28df3280501d 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -1338,6 +1338,26 @@  static int xsk_setsockopt(struct socket *sock, int level, int optname,
 		mutex_unlock(&xs->mutex);
 		return err;
 	}
+	case XDP_TX_METADATA_LEN:
+	{
+		int val;
+
+		if (optlen < sizeof(val))
+			return -EINVAL;
+		if (copy_from_sockptr(&val, optval, sizeof(val)))
+			return -EFAULT;
+		if (!val || val > 256 || val % 4)
+			return -EINVAL;
+
+		mutex_lock(&xs->mutex);
+		if (xs->state != XSK_READY) {
+			mutex_unlock(&xs->mutex);
+			return -EBUSY;
+		}
+		xs->tx_metadata_len = val;
+		mutex_unlock(&xs->mutex);
+		return 0;
+	}
 	default:
 		break;
 	}
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index b3f7b310811e..b351732f1032 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -85,6 +85,7 @@  struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
 		XDP_PACKET_HEADROOM;
 	pool->umem = umem;
 	pool->addrs = umem->addrs;
+	pool->tx_metadata_len = xs->tx_metadata_len;
 	INIT_LIST_HEAD(&pool->free_list);
 	INIT_LIST_HEAD(&pool->xskb_list);
 	INIT_LIST_HEAD(&pool->xsk_tx_list);
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 13354a1e4280..c74a1372bcb9 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -143,15 +143,17 @@  static inline bool xp_unused_options_set(u32 options)
 static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool,
 					    struct xdp_desc *desc)
 {
-	u64 offset = desc->addr & (pool->chunk_size - 1);
+	u64 addr = desc->addr - pool->tx_metadata_len;
+	u64 len = desc->len + pool->tx_metadata_len;
+	u64 offset = addr & (pool->chunk_size - 1);
 
 	if (!desc->len)
 		return false;
 
-	if (offset + desc->len > pool->chunk_size)
+	if (offset + len > pool->chunk_size)
 		return false;
 
-	if (desc->addr >= pool->addrs_cnt)
+	if (addr >= pool->addrs_cnt)
 		return false;
 
 	if (xp_unused_options_set(desc->options))
@@ -162,16 +164,17 @@  static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool,
 static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool,
 					      struct xdp_desc *desc)
 {
-	u64 addr = xp_unaligned_add_offset_to_addr(desc->addr);
+	u64 addr = xp_unaligned_add_offset_to_addr(desc->addr) - pool->tx_metadata_len;
+	u64 len = desc->len + pool->tx_metadata_len;
 
 	if (!desc->len)
 		return false;
 
-	if (desc->len > pool->chunk_size)
+	if (len > pool->chunk_size)
 		return false;
 
-	if (addr >= pool->addrs_cnt || addr + desc->len > pool->addrs_cnt ||
-	    xp_desc_crosses_non_contig_pg(pool, addr, desc->len))
+	if (addr >= pool->addrs_cnt || addr + len > pool->addrs_cnt ||
+	    xp_desc_crosses_non_contig_pg(pool, addr, len))
 		return false;
 
 	if (xp_unused_options_set(desc->options))
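
To make the validation change concrete, here is an illustrative sketch
of the layout the new checks assume, plus a hypothetical userspace-side
helper (neither is part of the patch; tx_metadata() is an invented
name, and struct xdp_desc comes from <linux/if_xdp.h>):

/*
 * Illustrative layout, aligned mode, with chunk_size = 2048 and
 * tx_metadata_len = 16:
 *
 *   chunk start                                        chunk end
 *   |<- headroom ->|<- 16B metadata ->|<- packet data ->| ... |
 *                                     ^
 *                                desc->addr
 *
 * xp_aligned_validate_desc() rebases its checks accordingly:
 * addr = desc->addr - 16 (start of metadata) and
 * len = desc->len + 16 (metadata + payload) must still fit within
 * one chunk, so the metadata cannot reach back across a chunk
 * boundary. The unaligned path does the same after first stripping
 * the offset bits via xp_unaligned_add_offset_to_addr().
 */
static void *tx_metadata(void *umem_area, const struct xdp_desc *desc,
			 __u8 tx_metadata_len)
{
	/* Metadata occupies the tx_metadata_len bytes immediately
	 * before the packet bytes the descriptor points at. */
	return (char *)umem_area + desc->addr - tx_metadata_len;
}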