diff mbox series

[bpf-next,v4,01/11] xsk: Support tx_metadata_len

Message ID 20231019174944.3376335-2-sdf@google.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series xsk: TX metadata | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1389 this patch: 1389
netdev/cc_maintainers warning 4 maintainers not CCed: jonathan.lemon@gmail.com davem@davemloft.net pabeni@redhat.com edumazet@google.com
netdev/build_clang success Errors and warnings before: 1386 this patch: 1386
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1418 this patch: 1418
netdev/checkpatch warning WARNING: line length of 87 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-0 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-1 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 fail Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-14 success Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 fail Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-6 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-5 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-18 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-21 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-28 success Logs for veristat

Commit Message

Stanislav Fomichev Oct. 19, 2023, 5:49 p.m. UTC
For zerocopy mode, tx_desc->addr can point to the arbitrary offset
and carry some TX metadata in the headroom. For copy mode, there
is no way currently to populate skb metadata.

Introduce new tx_metadata_len umem config option that indicates how many
bytes to treat as metadata. Metadata bytes come prior to tx_desc address
(same as in RX case).

The size of the metadata has the same constraints as XDP:
- less than 256 bytes
- 4-byte aligned
- non-zero

This data is not interpreted in any way right now.

Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 include/net/xdp_sock.h            |  1 +
 include/net/xsk_buff_pool.h       |  1 +
 include/uapi/linux/if_xdp.h       |  1 +
 net/xdp/xdp_umem.c                |  4 ++++
 net/xdp/xsk.c                     | 12 +++++++++++-
 net/xdp/xsk_buff_pool.c           |  1 +
 net/xdp/xsk_queue.h               | 17 ++++++++++-------
 tools/include/uapi/linux/if_xdp.h |  1 +
 8 files changed, 30 insertions(+), 8 deletions(-)

Comments

Song Yoong Siang Oct. 20, 2023, 2:29 p.m. UTC | #1
On Friday, October 20, 2023 1:50 AM Stanislav Fomichev <sdf@google.com> wrote:
>For zerocopy mode, tx_desc->addr can point to the arbitrary offset
>and carry some TX metadata in the headroom. For copy mode, there
>is no way currently to populate skb metadata.
>
>Introduce new tx_metadata_len umem config option that indicates how many
>bytes to treat as metadata. Metadata bytes come prior to tx_desc address
>(same as in RX case).
>
>The size of the metadata has the same constraints as XDP:
>- less than 256 bytes
>- 4-byte aligned
>- non-zero
>
>This data is not interpreted in any way right now.
>
>Signed-off-by: Stanislav Fomichev <sdf@google.com>

LGTM.
Reviewed-by: Song Yoong Siang <yoong.siang.song@intel.com>

>---
> include/net/xdp_sock.h            |  1 +
> include/net/xsk_buff_pool.h       |  1 +
> include/uapi/linux/if_xdp.h       |  1 +
> net/xdp/xdp_umem.c                |  4 ++++
> net/xdp/xsk.c                     | 12 +++++++++++-
> net/xdp/xsk_buff_pool.c           |  1 +
> net/xdp/xsk_queue.h               | 17 ++++++++++-------
> tools/include/uapi/linux/if_xdp.h |  1 +
> 8 files changed, 30 insertions(+), 8 deletions(-)
>
>diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
>index 7dd0df2f6f8e..5ae88a00f34a 100644
>--- a/include/net/xdp_sock.h
>+++ b/include/net/xdp_sock.h
>@@ -30,6 +30,7 @@ struct xdp_umem {
> 	struct user_struct *user;
> 	refcount_t users;
> 	u8 flags;
>+	u8 tx_metadata_len;
> 	bool zc;
> 	struct page **pgs;
> 	int id;
>diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
>index b0bdff26fc88..1985ffaf9b0c 100644
>--- a/include/net/xsk_buff_pool.h
>+++ b/include/net/xsk_buff_pool.h
>@@ -77,6 +77,7 @@ struct xsk_buff_pool {
> 	u32 chunk_size;
> 	u32 chunk_shift;
> 	u32 frame_len;
>+	u8 tx_metadata_len; /* inherited from umem */
> 	u8 cached_need_wakeup;
> 	bool uses_need_wakeup;
> 	bool dma_need_sync;
>diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
>index 8d48863472b9..2ecf79282c26 100644
>--- a/include/uapi/linux/if_xdp.h
>+++ b/include/uapi/linux/if_xdp.h
>@@ -76,6 +76,7 @@ struct xdp_umem_reg {
> 	__u32 chunk_size;
> 	__u32 headroom;
> 	__u32 flags;
>+	__u32 tx_metadata_len;
> };
>
> struct xdp_statistics {
>diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
>index 06cead2b8e34..333f3d53aad4 100644
>--- a/net/xdp/xdp_umem.c
>+++ b/net/xdp/xdp_umem.c
>@@ -199,6 +199,9 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct
>xdp_umem_reg *mr)
> 	if (headroom >= chunk_size - XDP_PACKET_HEADROOM)
> 		return -EINVAL;
>
>+	if (mr->tx_metadata_len > 256 || mr->tx_metadata_len % 4)
>+		return -EINVAL;
>+
> 	umem->size = size;
> 	umem->headroom = headroom;
> 	umem->chunk_size = chunk_size;
>@@ -207,6 +210,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct
>xdp_umem_reg *mr)
> 	umem->pgs = NULL;
> 	umem->user = NULL;
> 	umem->flags = mr->flags;
>+	umem->tx_metadata_len = mr->tx_metadata_len;
>
> 	INIT_LIST_HEAD(&umem->xsk_dma_list);
> 	refcount_set(&umem->users, 1);
>diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
>index ba070fd37d24..ba4c77a24a83 100644
>--- a/net/xdp/xsk.c
>+++ b/net/xdp/xsk.c
>@@ -1265,6 +1265,14 @@ struct xdp_umem_reg_v1 {
> 	__u32 headroom;
> };
>
>+struct xdp_umem_reg_v2 {
>+	__u64 addr; /* Start of packet data area */
>+	__u64 len; /* Length of packet data area */
>+	__u32 chunk_size;
>+	__u32 headroom;
>+	__u32 flags;
>+};
>+
> static int xsk_setsockopt(struct socket *sock, int level, int optname,
> 			  sockptr_t optval, unsigned int optlen)
> {
>@@ -1308,8 +1316,10 @@ static int xsk_setsockopt(struct socket *sock, int level, int
>optname,
>
> 		if (optlen < sizeof(struct xdp_umem_reg_v1))
> 			return -EINVAL;
>-		else if (optlen < sizeof(mr))
>+		else if (optlen < sizeof(struct xdp_umem_reg_v2))
> 			mr_size = sizeof(struct xdp_umem_reg_v1);
>+		else if (optlen < sizeof(mr))
>+			mr_size = sizeof(struct xdp_umem_reg_v2);
>
> 		if (copy_from_sockptr(&mr, optval, mr_size))
> 			return -EFAULT;
>diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
>index 49cb9f9a09be..386eddcdf837 100644
>--- a/net/xdp/xsk_buff_pool.c
>+++ b/net/xdp/xsk_buff_pool.c
>@@ -85,6 +85,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct
>xdp_sock *xs,
> 		XDP_PACKET_HEADROOM;
> 	pool->umem = umem;
> 	pool->addrs = umem->addrs;
>+	pool->tx_metadata_len = umem->tx_metadata_len;
> 	INIT_LIST_HEAD(&pool->free_list);
> 	INIT_LIST_HEAD(&pool->xskb_list);
> 	INIT_LIST_HEAD(&pool->xsk_tx_list);
>diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
>index 13354a1e4280..c74a1372bcb9 100644
>--- a/net/xdp/xsk_queue.h
>+++ b/net/xdp/xsk_queue.h
>@@ -143,15 +143,17 @@ static inline bool xp_unused_options_set(u32 options)
> static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool,
> 					    struct xdp_desc *desc)
> {
>-	u64 offset = desc->addr & (pool->chunk_size - 1);
>+	u64 addr = desc->addr - pool->tx_metadata_len;
>+	u64 len = desc->len + pool->tx_metadata_len;
>+	u64 offset = addr & (pool->chunk_size - 1);
>
> 	if (!desc->len)
> 		return false;
>
>-	if (offset + desc->len > pool->chunk_size)
>+	if (offset + len > pool->chunk_size)
> 		return false;
>
>-	if (desc->addr >= pool->addrs_cnt)
>+	if (addr >= pool->addrs_cnt)
> 		return false;
>
> 	if (xp_unused_options_set(desc->options))
>@@ -162,16 +164,17 @@ static inline bool xp_aligned_validate_desc(struct
>xsk_buff_pool *pool,
> static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool,
> 					      struct xdp_desc *desc)
> {
>-	u64 addr = xp_unaligned_add_offset_to_addr(desc->addr);
>+	u64 addr = xp_unaligned_add_offset_to_addr(desc->addr) - pool-
>>tx_metadata_len;
>+	u64 len = desc->len + pool->tx_metadata_len;
>
> 	if (!desc->len)
> 		return false;
>
>-	if (desc->len > pool->chunk_size)
>+	if (len > pool->chunk_size)
> 		return false;
>
>-	if (addr >= pool->addrs_cnt || addr + desc->len > pool->addrs_cnt ||
>-	    xp_desc_crosses_non_contig_pg(pool, addr, desc->len))
>+	if (addr >= pool->addrs_cnt || addr + len > pool->addrs_cnt ||
>+	    xp_desc_crosses_non_contig_pg(pool, addr, len))
> 		return false;
>
> 	if (xp_unused_options_set(desc->options))
>diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h
>index 73a47da885dc..34411a2e5b6c 100644
>--- a/tools/include/uapi/linux/if_xdp.h
>+++ b/tools/include/uapi/linux/if_xdp.h
>@@ -76,6 +76,7 @@ struct xdp_umem_reg {
> 	__u32 chunk_size;
> 	__u32 headroom;
> 	__u32 flags;
>+	__u32 tx_metadata_len;
> };
>
> struct xdp_statistics {
>--
>2.42.0.655.g421f12c284-goog
Jakub Kicinski Oct. 21, 2023, 1:12 a.m. UTC | #2
On Thu, 19 Oct 2023 10:49:34 -0700 Stanislav Fomichev wrote:
> - 4-byte aligned

But there is an 8B field in it. Won't this trap on some funky
architecture of yore?
Magnus Karlsson Oct. 23, 2023, 8:28 a.m. UTC | #3
On Thu, 19 Oct 2023 at 19:50, Stanislav Fomichev <sdf@google.com> wrote:
>
> For zerocopy mode, tx_desc->addr can point to the arbitrary offset

nit: the -> an

> and carry some TX metadata in the headroom. For copy mode, there
> is no way currently to populate skb metadata.
>
> Introduce new tx_metadata_len umem config option that indicates how many
> bytes to treat as metadata. Metadata bytes come prior to tx_desc address
> (same as in RX case).
>
> The size of the metadata has the same constraints as XDP:
> - less than 256 bytes
> - 4-byte aligned
> - non-zero
>
> This data is not interpreted in any way right now.
>
> Signed-off-by: Stanislav Fomichev <sdf@google.com>
> ---
>  include/net/xdp_sock.h            |  1 +
>  include/net/xsk_buff_pool.h       |  1 +
>  include/uapi/linux/if_xdp.h       |  1 +
>  net/xdp/xdp_umem.c                |  4 ++++
>  net/xdp/xsk.c                     | 12 +++++++++++-
>  net/xdp/xsk_buff_pool.c           |  1 +
>  net/xdp/xsk_queue.h               | 17 ++++++++++-------
>  tools/include/uapi/linux/if_xdp.h |  1 +
>  8 files changed, 30 insertions(+), 8 deletions(-)
>
> diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
> index 7dd0df2f6f8e..5ae88a00f34a 100644
> --- a/include/net/xdp_sock.h
> +++ b/include/net/xdp_sock.h
> @@ -30,6 +30,7 @@ struct xdp_umem {
>         struct user_struct *user;
>         refcount_t users;
>         u8 flags;
> +       u8 tx_metadata_len;
>         bool zc;
>         struct page **pgs;
>         int id;
> diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
> index b0bdff26fc88..1985ffaf9b0c 100644
> --- a/include/net/xsk_buff_pool.h
> +++ b/include/net/xsk_buff_pool.h
> @@ -77,6 +77,7 @@ struct xsk_buff_pool {
>         u32 chunk_size;
>         u32 chunk_shift;
>         u32 frame_len;
> +       u8 tx_metadata_len; /* inherited from umem */
>         u8 cached_need_wakeup;
>         bool uses_need_wakeup;
>         bool dma_need_sync;
> diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
> index 8d48863472b9..2ecf79282c26 100644
> --- a/include/uapi/linux/if_xdp.h
> +++ b/include/uapi/linux/if_xdp.h
> @@ -76,6 +76,7 @@ struct xdp_umem_reg {
>         __u32 chunk_size;
>         __u32 headroom;
>         __u32 flags;
> +       __u32 tx_metadata_len;
>  };
>
>  struct xdp_statistics {
> diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
> index 06cead2b8e34..333f3d53aad4 100644
> --- a/net/xdp/xdp_umem.c
> +++ b/net/xdp/xdp_umem.c
> @@ -199,6 +199,9 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
>         if (headroom >= chunk_size - XDP_PACKET_HEADROOM)
>                 return -EINVAL;
>
> +       if (mr->tx_metadata_len > 256 || mr->tx_metadata_len % 4)
> +               return -EINVAL;

Should be >= 256 since the final internal destination is a u8 and the
documentation says "should be less than 256 bytes".

> +
>         umem->size = size;
>         umem->headroom = headroom;
>         umem->chunk_size = chunk_size;
> @@ -207,6 +210,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
>         umem->pgs = NULL;
>         umem->user = NULL;
>         umem->flags = mr->flags;
> +       umem->tx_metadata_len = mr->tx_metadata_len;
>
>         INIT_LIST_HEAD(&umem->xsk_dma_list);
>         refcount_set(&umem->users, 1);
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index ba070fd37d24..ba4c77a24a83 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -1265,6 +1265,14 @@ struct xdp_umem_reg_v1 {
>         __u32 headroom;
>  };
>
> +struct xdp_umem_reg_v2 {
> +       __u64 addr; /* Start of packet data area */
> +       __u64 len; /* Length of packet data area */
> +       __u32 chunk_size;
> +       __u32 headroom;
> +       __u32 flags;
> +};
> +
>  static int xsk_setsockopt(struct socket *sock, int level, int optname,
>                           sockptr_t optval, unsigned int optlen)
>  {
> @@ -1308,8 +1316,10 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
>
>                 if (optlen < sizeof(struct xdp_umem_reg_v1))
>                         return -EINVAL;
> -               else if (optlen < sizeof(mr))
> +               else if (optlen < sizeof(struct xdp_umem_reg_v2))
>                         mr_size = sizeof(struct xdp_umem_reg_v1);
> +               else if (optlen < sizeof(mr))
> +                       mr_size = sizeof(struct xdp_umem_reg_v2);
>
>                 if (copy_from_sockptr(&mr, optval, mr_size))
>                         return -EFAULT;
> diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
> index 49cb9f9a09be..386eddcdf837 100644
> --- a/net/xdp/xsk_buff_pool.c
> +++ b/net/xdp/xsk_buff_pool.c
> @@ -85,6 +85,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
>                 XDP_PACKET_HEADROOM;
>         pool->umem = umem;
>         pool->addrs = umem->addrs;
> +       pool->tx_metadata_len = umem->tx_metadata_len;
>         INIT_LIST_HEAD(&pool->free_list);
>         INIT_LIST_HEAD(&pool->xskb_list);
>         INIT_LIST_HEAD(&pool->xsk_tx_list);
> diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
> index 13354a1e4280..c74a1372bcb9 100644
> --- a/net/xdp/xsk_queue.h
> +++ b/net/xdp/xsk_queue.h
> @@ -143,15 +143,17 @@ static inline bool xp_unused_options_set(u32 options)
>  static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool,
>                                             struct xdp_desc *desc)
>  {
> -       u64 offset = desc->addr & (pool->chunk_size - 1);
> +       u64 addr = desc->addr - pool->tx_metadata_len;
> +       u64 len = desc->len + pool->tx_metadata_len;
> +       u64 offset = addr & (pool->chunk_size - 1);
>
>         if (!desc->len)
>                 return false;
>
> -       if (offset + desc->len > pool->chunk_size)
> +       if (offset + len > pool->chunk_size)
>                 return false;
>
> -       if (desc->addr >= pool->addrs_cnt)
> +       if (addr >= pool->addrs_cnt)
>                 return false;
>
>         if (xp_unused_options_set(desc->options))
> @@ -162,16 +164,17 @@ static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool,
>  static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool,
>                                               struct xdp_desc *desc)
>  {
> -       u64 addr = xp_unaligned_add_offset_to_addr(desc->addr);
> +       u64 addr = xp_unaligned_add_offset_to_addr(desc->addr) - pool->tx_metadata_len;
> +       u64 len = desc->len + pool->tx_metadata_len;
>
>         if (!desc->len)
>                 return false;
>
> -       if (desc->len > pool->chunk_size)
> +       if (len > pool->chunk_size)
>                 return false;
>
> -       if (addr >= pool->addrs_cnt || addr + desc->len > pool->addrs_cnt ||
> -           xp_desc_crosses_non_contig_pg(pool, addr, desc->len))
> +       if (addr >= pool->addrs_cnt || addr + len > pool->addrs_cnt ||
> +           xp_desc_crosses_non_contig_pg(pool, addr, len))
>                 return false;
>
>         if (xp_unused_options_set(desc->options))
> diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h
> index 73a47da885dc..34411a2e5b6c 100644
> --- a/tools/include/uapi/linux/if_xdp.h
> +++ b/tools/include/uapi/linux/if_xdp.h
> @@ -76,6 +76,7 @@ struct xdp_umem_reg {
>         __u32 chunk_size;
>         __u32 headroom;
>         __u32 flags;
> +       __u32 tx_metadata_len;
>  };
>
>  struct xdp_statistics {
> --
> 2.42.0.655.g421f12c284-goog
>
>
Stanislav Fomichev Oct. 23, 2023, 5:33 p.m. UTC | #4
On 10/20, Jakub Kicinski wrote:
> On Thu, 19 Oct 2023 10:49:34 -0700 Stanislav Fomichev wrote:
> > - 4-byte aligned
> 
> But there is an 8B field in it. Won't this trap on some funky
> architecture of yore?

Hmm, good point, will update this to 8 byte alignment and will use
8-bit timestamp as a reason.
Stanislav Fomichev Oct. 23, 2023, 6:37 p.m. UTC | #5
On 10/23, Magnus Karlsson wrote:
> On Thu, 19 Oct 2023 at 19:50, Stanislav Fomichev <sdf@google.com> wrote:
> >
> > For zerocopy mode, tx_desc->addr can point to the arbitrary offset
> 
> nit: the -> an

Thanks!
 
> > and carry some TX metadata in the headroom. For copy mode, there
> > is no way currently to populate skb metadata.
> >
> > Introduce new tx_metadata_len umem config option that indicates how many
> > bytes to treat as metadata. Metadata bytes come prior to tx_desc address
> > (same as in RX case).
> >
> > The size of the metadata has the same constraints as XDP:
> > - less than 256 bytes
> > - 4-byte aligned
> > - non-zero
> >
> > This data is not interpreted in any way right now.
> >
> > Signed-off-by: Stanislav Fomichev <sdf@google.com>
> > ---
> >  include/net/xdp_sock.h            |  1 +
> >  include/net/xsk_buff_pool.h       |  1 +
> >  include/uapi/linux/if_xdp.h       |  1 +
> >  net/xdp/xdp_umem.c                |  4 ++++
> >  net/xdp/xsk.c                     | 12 +++++++++++-
> >  net/xdp/xsk_buff_pool.c           |  1 +
> >  net/xdp/xsk_queue.h               | 17 ++++++++++-------
> >  tools/include/uapi/linux/if_xdp.h |  1 +
> >  8 files changed, 30 insertions(+), 8 deletions(-)
> >
> > diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
> > index 7dd0df2f6f8e..5ae88a00f34a 100644
> > --- a/include/net/xdp_sock.h
> > +++ b/include/net/xdp_sock.h
> > @@ -30,6 +30,7 @@ struct xdp_umem {
> >         struct user_struct *user;
> >         refcount_t users;
> >         u8 flags;
> > +       u8 tx_metadata_len;
> >         bool zc;
> >         struct page **pgs;
> >         int id;
> > diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
> > index b0bdff26fc88..1985ffaf9b0c 100644
> > --- a/include/net/xsk_buff_pool.h
> > +++ b/include/net/xsk_buff_pool.h
> > @@ -77,6 +77,7 @@ struct xsk_buff_pool {
> >         u32 chunk_size;
> >         u32 chunk_shift;
> >         u32 frame_len;
> > +       u8 tx_metadata_len; /* inherited from umem */
> >         u8 cached_need_wakeup;
> >         bool uses_need_wakeup;
> >         bool dma_need_sync;
> > diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
> > index 8d48863472b9..2ecf79282c26 100644
> > --- a/include/uapi/linux/if_xdp.h
> > +++ b/include/uapi/linux/if_xdp.h
> > @@ -76,6 +76,7 @@ struct xdp_umem_reg {
> >         __u32 chunk_size;
> >         __u32 headroom;
> >         __u32 flags;
> > +       __u32 tx_metadata_len;
> >  };
> >
> >  struct xdp_statistics {
> > diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
> > index 06cead2b8e34..333f3d53aad4 100644
> > --- a/net/xdp/xdp_umem.c
> > +++ b/net/xdp/xdp_umem.c
> > @@ -199,6 +199,9 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
> >         if (headroom >= chunk_size - XDP_PACKET_HEADROOM)
> >                 return -EINVAL;
> >
> > +       if (mr->tx_metadata_len > 256 || mr->tx_metadata_len % 4)
> > +               return -EINVAL;
> 
> Should be >= 256 since the final internal destination is a u8 and the
> documentation says "should be less than 256 bytes".

Thanks, will fix.
diff mbox series

Patch

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 7dd0df2f6f8e..5ae88a00f34a 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -30,6 +30,7 @@  struct xdp_umem {
 	struct user_struct *user;
 	refcount_t users;
 	u8 flags;
+	u8 tx_metadata_len;
 	bool zc;
 	struct page **pgs;
 	int id;
diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index b0bdff26fc88..1985ffaf9b0c 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -77,6 +77,7 @@  struct xsk_buff_pool {
 	u32 chunk_size;
 	u32 chunk_shift;
 	u32 frame_len;
+	u8 tx_metadata_len; /* inherited from umem */
 	u8 cached_need_wakeup;
 	bool uses_need_wakeup;
 	bool dma_need_sync;
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index 8d48863472b9..2ecf79282c26 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -76,6 +76,7 @@  struct xdp_umem_reg {
 	__u32 chunk_size;
 	__u32 headroom;
 	__u32 flags;
+	__u32 tx_metadata_len;
 };
 
 struct xdp_statistics {
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 06cead2b8e34..333f3d53aad4 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -199,6 +199,9 @@  static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 	if (headroom >= chunk_size - XDP_PACKET_HEADROOM)
 		return -EINVAL;
 
+	if (mr->tx_metadata_len > 256 || mr->tx_metadata_len % 4)
+		return -EINVAL;
+
 	umem->size = size;
 	umem->headroom = headroom;
 	umem->chunk_size = chunk_size;
@@ -207,6 +210,7 @@  static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 	umem->pgs = NULL;
 	umem->user = NULL;
 	umem->flags = mr->flags;
+	umem->tx_metadata_len = mr->tx_metadata_len;
 
 	INIT_LIST_HEAD(&umem->xsk_dma_list);
 	refcount_set(&umem->users, 1);
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index ba070fd37d24..ba4c77a24a83 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -1265,6 +1265,14 @@  struct xdp_umem_reg_v1 {
 	__u32 headroom;
 };
 
+struct xdp_umem_reg_v2 {
+	__u64 addr; /* Start of packet data area */
+	__u64 len; /* Length of packet data area */
+	__u32 chunk_size;
+	__u32 headroom;
+	__u32 flags;
+};
+
 static int xsk_setsockopt(struct socket *sock, int level, int optname,
 			  sockptr_t optval, unsigned int optlen)
 {
@@ -1308,8 +1316,10 @@  static int xsk_setsockopt(struct socket *sock, int level, int optname,
 
 		if (optlen < sizeof(struct xdp_umem_reg_v1))
 			return -EINVAL;
-		else if (optlen < sizeof(mr))
+		else if (optlen < sizeof(struct xdp_umem_reg_v2))
 			mr_size = sizeof(struct xdp_umem_reg_v1);
+		else if (optlen < sizeof(mr))
+			mr_size = sizeof(struct xdp_umem_reg_v2);
 
 		if (copy_from_sockptr(&mr, optval, mr_size))
 			return -EFAULT;
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index 49cb9f9a09be..386eddcdf837 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -85,6 +85,7 @@  struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
 		XDP_PACKET_HEADROOM;
 	pool->umem = umem;
 	pool->addrs = umem->addrs;
+	pool->tx_metadata_len = umem->tx_metadata_len;
 	INIT_LIST_HEAD(&pool->free_list);
 	INIT_LIST_HEAD(&pool->xskb_list);
 	INIT_LIST_HEAD(&pool->xsk_tx_list);
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 13354a1e4280..c74a1372bcb9 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -143,15 +143,17 @@  static inline bool xp_unused_options_set(u32 options)
 static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool,
 					    struct xdp_desc *desc)
 {
-	u64 offset = desc->addr & (pool->chunk_size - 1);
+	u64 addr = desc->addr - pool->tx_metadata_len;
+	u64 len = desc->len + pool->tx_metadata_len;
+	u64 offset = addr & (pool->chunk_size - 1);
 
 	if (!desc->len)
 		return false;
 
-	if (offset + desc->len > pool->chunk_size)
+	if (offset + len > pool->chunk_size)
 		return false;
 
-	if (desc->addr >= pool->addrs_cnt)
+	if (addr >= pool->addrs_cnt)
 		return false;
 
 	if (xp_unused_options_set(desc->options))
@@ -162,16 +164,17 @@  static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool,
 static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool,
 					      struct xdp_desc *desc)
 {
-	u64 addr = xp_unaligned_add_offset_to_addr(desc->addr);
+	u64 addr = xp_unaligned_add_offset_to_addr(desc->addr) - pool->tx_metadata_len;
+	u64 len = desc->len + pool->tx_metadata_len;
 
 	if (!desc->len)
 		return false;
 
-	if (desc->len > pool->chunk_size)
+	if (len > pool->chunk_size)
 		return false;
 
-	if (addr >= pool->addrs_cnt || addr + desc->len > pool->addrs_cnt ||
-	    xp_desc_crosses_non_contig_pg(pool, addr, desc->len))
+	if (addr >= pool->addrs_cnt || addr + len > pool->addrs_cnt ||
+	    xp_desc_crosses_non_contig_pg(pool, addr, len))
 		return false;
 
 	if (xp_unused_options_set(desc->options))
diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h
index 73a47da885dc..34411a2e5b6c 100644
--- a/tools/include/uapi/linux/if_xdp.h
+++ b/tools/include/uapi/linux/if_xdp.h
@@ -76,6 +76,7 @@  struct xdp_umem_reg {
 	__u32 chunk_size;
 	__u32 headroom;
 	__u32 flags;
+	__u32 tx_metadata_len;
 };
 
 struct xdp_statistics {