diff mbox series

[bpf-next,v2,2/6] net: tun: enable transfer of XDP metadata to skb

Message ID 20250217172308.3291739-3-marcus.wichelmann@hetzner-cloud.de (mailing list archive)
State New
Headers show
Series XDP metadata support for tun driver | expand

Commit Message

Marcus Wichelmann Feb. 17, 2025, 5:23 p.m. UTC
When the XDP metadata area was used, it is expected that the same
metadata can also be accessed from TC, as can be read in the description
of the bpf_xdp_adjust_meta helper function. In the tun driver, this was
not yet implemented.

To make this work, the skb that is being built on XDP_PASS should know
of the current size of the metadata area. This is ensured by adding
calls to skb_metadata_set. For the tun_xdp_one code path, an additional
check is necessary to handle the case where the externally initialized
xdp_buff has no metadata support (xdp->data_meta == xdp->data + 1).

More information about this feature can be found in the commit message
of commit de8f3a83b0a0 ("bpf: add meta pointer for direct access").

Signed-off-by: Marcus Wichelmann <marcus.wichelmann@hetzner-cloud.de>
---
 drivers/net/tun.c | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

Comments

Willem de Bruijn Feb. 18, 2025, 1:45 a.m. UTC | #1
Marcus Wichelmann wrote:
> When the XDP metadata area was used, it is expected that the same
> metadata can also be accessed from TC, as can be read in the description
> of the bpf_xdp_adjust_meta helper function. In the tun driver, this was
> not yet implemented.
> 
> To make this work, the skb that is being built on XDP_PASS should know
> of the current size of the metadata area. This is ensured by adding
> calls to skb_metadata_set. For the tun_xdp_one code path, an additional
> check is necessary to handle the case where the externally initialized
> xdp_buff has no metadata support (xdp->data_meta == xdp->data + 1).
> 
> More information about this feature can be found in the commit message
> of commit de8f3a83b0a0 ("bpf: add meta pointer for direct access").
> 
> Signed-off-by: Marcus Wichelmann <marcus.wichelmann@hetzner-cloud.de>
> ---
>  drivers/net/tun.c | 21 ++++++++++++++++++---
>  1 file changed, 18 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index c95ab9c46bd2..3dde6cd29a84 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -1604,7 +1604,8 @@ static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
>  
>  static struct sk_buff *__tun_build_skb(struct tun_file *tfile,
>  				       struct page_frag *alloc_frag, char *buf,
> -				       int buflen, int len, int pad)
> +				       int buflen, int len, int pad,
> +				       int metasize)
>  {
>  	struct sk_buff *skb = build_skb(buf, buflen);
>  
> @@ -1613,6 +1614,8 @@ static struct sk_buff *__tun_build_skb(struct tun_file *tfile,
>  
>  	skb_reserve(skb, pad);
>  	skb_put(skb, len);
> +	if (metasize)
> +		skb_metadata_set(skb, metasize);
>  	skb_set_owner_w(skb, tfile->socket.sk);
>  
>  	get_page(alloc_frag->page);
> @@ -1672,6 +1675,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>  	char *buf;
>  	size_t copied;
>  	int pad = TUN_RX_PAD;
> +	int metasize = 0;
>  	int err = 0;
>  
>  	rcu_read_lock();
> @@ -1699,7 +1703,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>  	if (hdr->gso_type || !xdp_prog) {
>  		*skb_xdp = 1;
>  		return __tun_build_skb(tfile, alloc_frag, buf, buflen, len,
> -				       pad);
> +				       pad, metasize);
>  	}
>  
>  	*skb_xdp = 0;
> @@ -1734,12 +1738,18 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>  
>  		pad = xdp.data - xdp.data_hard_start;
>  		len = xdp.data_end - xdp.data;
> +
> +		/* It is known that the xdp_buff was prepared with metadata
> +		 * support, so no additional check is necessary.
> +		 */
> +		metasize = xdp.data - xdp.data_meta;
>  	}
>  	bpf_net_ctx_clear(bpf_net_ctx);
>  	rcu_read_unlock();
>  	local_bh_enable();
>  
> -	return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad);
> +	return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad,
> +			       metasize);
>  
>  out:
>  	bpf_net_ctx_clear(bpf_net_ctx);
> @@ -2456,6 +2466,7 @@ static int tun_xdp_one(struct tun_struct *tun,
>  	struct sk_buff_head *queue;
>  	u32 rxhash = 0, act;
>  	int buflen = hdr->buflen;
> +	int metasize = 0;
>  	int ret = 0;
>  	bool skb_xdp = false;
>  	struct page *page;
> @@ -2510,6 +2521,10 @@ static int tun_xdp_one(struct tun_struct *tun,
>  	skb_reserve(skb, xdp->data - xdp->data_hard_start);
>  	skb_put(skb, xdp->data_end - xdp->data);
>  
> +	metasize = max(xdp->data - xdp->data_meta, 0);

Is it ever possible for xdp->data_meta to be greater than xdp->data?

This is pointer arithmetic, which is a bit complex wrt type. This is
likely ptrdiff_t, which is signed. But may want to use max_t(int, to
make this more explicit.

> +	if (metasize)
> +		skb_metadata_set(skb, metasize);
> +

Technically not needed as skb_metadata_clear is just
skb_metadata_set(skb, 0). But fine to test and elide.

>  	if (virtio_net_hdr_to_skb(skb, gso, tun_is_lttle_endian(tun))) {
>  		atomic_long_inc(&tun->rx_frame_errors);
>  		kfree_skb(skb);
> -- 
> 2.43.0
>
Willem de Bruijn Feb. 18, 2025, 1:47 a.m. UTC | #2
Marcus Wichelmann wrote:
> When the XDP metadata area was used, it is expected that the same
> metadata can also be accessed from TC, as can be read in the description
> of the bpf_xdp_adjust_meta helper function. In the tun driver, this was
> not yet implemented.
> 
> To make this work, the skb that is being built on XDP_PASS should know
> of the current size of the metadata area. This is ensured by adding
> calls to skb_metadata_set. For the tun_xdp_one code path, an additional
> check is necessary to handle the case where the externally initialized
> xdp_buff has no metadata support (xdp->data_meta == xdp->data + 1).
> 
> More information about this feature can be found in the commit message
> of commit de8f3a83b0a0 ("bpf: add meta pointer for direct access").
> 
> Signed-off-by: Marcus Wichelmann <marcus.wichelmann@hetzner-cloud.de>
> ---
>  drivers/net/tun.c | 21 ++++++++++++++++++---
>  1 file changed, 18 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index c95ab9c46bd2..3dde6cd29a84 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -1604,7 +1604,8 @@ static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
>  
>  static struct sk_buff *__tun_build_skb(struct tun_file *tfile,
>  				       struct page_frag *alloc_frag, char *buf,
> -				       int buflen, int len, int pad)
> +				       int buflen, int len, int pad,
> +				       int metasize)
>  {
>  	struct sk_buff *skb = build_skb(buf, buflen);
>  
> @@ -1613,6 +1614,8 @@ static struct sk_buff *__tun_build_skb(struct tun_file *tfile,
>  
>  	skb_reserve(skb, pad);
>  	skb_put(skb, len);
> +	if (metasize)
> +		skb_metadata_set(skb, metasize);
>  	skb_set_owner_w(skb, tfile->socket.sk);
>  
>  	get_page(alloc_frag->page);
> @@ -1672,6 +1675,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>  	char *buf;
>  	size_t copied;
>  	int pad = TUN_RX_PAD;
> +	int metasize = 0;
>  	int err = 0;
>  
>  	rcu_read_lock();
> @@ -1699,7 +1703,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>  	if (hdr->gso_type || !xdp_prog) {
>  		*skb_xdp = 1;
>  		return __tun_build_skb(tfile, alloc_frag, buf, buflen, len,
> -				       pad);
> +				       pad, metasize);
>  	}
>  
>  	*skb_xdp = 0;
> @@ -1734,12 +1738,18 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>  
>  		pad = xdp.data - xdp.data_hard_start;
>  		len = xdp.data_end - xdp.data;
> +
> +		/* It is known that the xdp_buff was prepared with metadata
> +		 * support, so no additional check is necessary.
> +		 */
> +		metasize = xdp.data - xdp.data_meta;
>  	}
>  	bpf_net_ctx_clear(bpf_net_ctx);
>  	rcu_read_unlock();
>  	local_bh_enable();
>  
> -	return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad);
> +	return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad,
> +			       metasize);
>  
>  out:
>  	bpf_net_ctx_clear(bpf_net_ctx);
> @@ -2456,6 +2466,7 @@ static int tun_xdp_one(struct tun_struct *tun,
>  	struct sk_buff_head *queue;
>  	u32 rxhash = 0, act;
>  	int buflen = hdr->buflen;
> +	int metasize = 0;
>  	int ret = 0;
>  	bool skb_xdp = false;
>  	struct page *page;
> @@ -2510,6 +2521,10 @@ static int tun_xdp_one(struct tun_struct *tun,
>  	skb_reserve(skb, xdp->data - xdp->data_hard_start);
>  	skb_put(skb, xdp->data_end - xdp->data);
>  
> +	metasize = max(xdp->data - xdp->data_meta, 0);

Can xdp->data_meta ever be greater than xdp->data?

This is pointer comparison, which is tricky wrt type. It likely is
ptrdiff_t and thus signed. But may want to use max_t(long int, ..) to
make this explicit.

> +	if (metasize)
> +		skb_metadata_set(skb, metasize);
> +

Not strictly needed. As skb_metadata_clear is just
skb_metadata_set(skb, 0). But also not wrong, so fine to keep.

>  	if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun))) {
>  		atomic_long_inc(&tun->rx_frame_errors);
>  		kfree_skb(skb);
> -- 
> 2.43.0
>
Marcus Wichelmann Feb. 19, 2025, 2:47 p.m. UTC | #3
Am 18.02.25 um 02:47 schrieb Willem de Bruijn:
> Marcus Wichelmann wrote:
>> [...]
>> +	metasize = max(xdp->data - xdp->data_meta, 0);
> 
> Can xdp->data_meta ever be greater than xdp->data?

When an xdp_buff has no metadata support, then this is marked by setting
xdp->data_meta to xdp->data + 1. See xdp_prepare_buff or
xdp_set_data_meta_invalid.

In the case of tun_xdp_one, the xdp_buff is externally created by another
driver and passed to the tun driver using sendmsg and TUN_MSG_PTR. For
now, the vhost_net driver is the only driver doing that, and
xdp->data_meta is set to xdp->data there, marking support for metadata.

So knowing that vhost_net is currently the only driver passing xdp_buffs
to tun_sendmsg, the check is not strictly necessary. But other drivers
may use this API as well in the future. That's why I'd like to not make
the assumption that other drivers always create the xdp_buffs with
metadata support, when they pass them to tun_sendmsg.

Or am I just to careful about this? What do you think?

> This is pointer comparison, which is tricky wrt type. It likely is
> ptrdiff_t and thus signed. But may want to use max_t(long int, ..) to
> make this explicit.

Ah, I see, good point.

So like that?

	metasize = max_t(long int, xdp->data - xdp->data_meta, 0);
	if (metasize)
		skb_metadata_set(skb, metasize);

Alternatively, there is also xdp_data_meta_unsupported(xdp_buff) which
could be used to make this check very explicit, but I don't see it being
used in network drivers elsewhere. Not sure why.

>> +	if (metasize)
>> +		skb_metadata_set(skb, metasize);
>> +
> 
> Not strictly needed. As skb_metadata_clear is just
> skb_metadata_set(skb, 0). But also not wrong, so fine to keep.

Oh, haven't seen that.
I'm following a common pattern here that I've seen in many other network
drivers (grep for "skb_metadata_set"):

	unsigned int metasize = xdp->data - xdp->data_meta;
	[...]
	if (metasize)
		skb_metadata_set(skb, metasize);

Marcus
Willem de Bruijn Feb. 19, 2025, 3:06 p.m. UTC | #4
Marcus Wichelmann wrote:
> Am 18.02.25 um 02:47 schrieb Willem de Bruijn:
> > Marcus Wichelmann wrote:
> >> [...]
> >> +	metasize = max(xdp->data - xdp->data_meta, 0);
> > 
> > Can xdp->data_meta ever be greater than xdp->data?
> 
> When an xdp_buff has no metadata support, then this is marked by setting
> xdp->data_meta to xdp->data + 1. See xdp_prepare_buff or
> xdp_set_data_meta_invalid.
> 
> In the case of tun_xdp_one, the xdp_buff is externally created by another
> driver and passed to the tun driver using sendmsg and TUN_MSG_PTR. For
> now, the vhost_net driver is the only driver doing that, and
> xdp->data_meta is set to xdp->data there, marking support for metadata.
> 
> So knowing that vhost_net is currently the only driver passing xdp_buffs
> to tun_sendmsg, the check is not strictly necessary. But other drivers
> may use this API as well in the future. That's why I'd like to not make
> the assumption that other drivers always create the xdp_buffs with
> metadata support, when they pass them to tun_sendmsg.
> 
> Or am I just to careful about this? What do you think?

I agree.
 
> > This is pointer comparison, which is tricky wrt type. It likely is
> > ptrdiff_t and thus signed. But may want to use max_t(long int, ..) to
> > make this explicit.
> 
> Ah, I see, good point.
> 
> So like that?
> 
> 	metasize = max_t(long int, xdp->data - xdp->data_meta, 0);
> 	if (metasize)
> 		skb_metadata_set(skb, metasize);

Or just this? Also ensures the test uses signed int.

    int metasize;

    ...


    metasize = xdp->data - xdp->data_meta;
    if (metasize > 0)
            skb_metadata_set(skb, metasize);


> Alternatively, there is also xdp_data_meta_unsupported(xdp_buff) which
> could be used to make this check very explicit, but I don't see it being
> used in network drivers elsewhere. Not sure why.
> 
> >> +	if (metasize)
> >> +		skb_metadata_set(skb, metasize);
> >> +
> > 
> > Not strictly needed. As skb_metadata_clear is just
> > skb_metadata_set(skb, 0). But also not wrong, so fine to keep.
> 
> Oh, haven't seen that.
> I'm following a common pattern here that I've seen in many other network
> drivers (grep for "skb_metadata_set"):
> 
> 	unsigned int metasize = xdp->data - xdp->data_meta;
> 	[...]
> 	if (metasize)
> 		skb_metadata_set(skb, metasize);

Thanks for that context. Sounds good.
Marcus Wichelmann Feb. 19, 2025, 3:15 p.m. UTC | #5
Am 19.02.25 um 16:06 schrieb Willem de Bruijn:
> Marcus Wichelmann wrote:
>> Am 18.02.25 um 02:47 schrieb Willem de Bruijn:
>>> [...]
>>> This is pointer comparison, which is tricky wrt type. It likely is
>>> ptrdiff_t and thus signed. But may want to use max_t(long int, ..) to
>>> make this explicit.
>>
>> Ah, I see, good point.
>>
>> So like that?
>>
>> 	metasize = max_t(long int, xdp->data - xdp->data_meta, 0);
>> 	if (metasize)
>> 		skb_metadata_set(skb, metasize);
> 
> Or just this? Also ensures the test uses signed int.
> 
>      int metasize;
> 
>      ...
> 
> 
>      metasize = xdp->data - xdp->data_meta;
>      if (metasize > 0)
>              skb_metadata_set(skb, metasize);
> 

Well, yeah, just keep it simple I guess. ;) Will do that.

I'll send a V3 patch series with the change.

Thanks!

Marcus
diff mbox series

Patch

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index c95ab9c46bd2..3dde6cd29a84 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1604,7 +1604,8 @@  static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
 
 static struct sk_buff *__tun_build_skb(struct tun_file *tfile,
 				       struct page_frag *alloc_frag, char *buf,
-				       int buflen, int len, int pad)
+				       int buflen, int len, int pad,
+				       int metasize)
 {
 	struct sk_buff *skb = build_skb(buf, buflen);
 
@@ -1613,6 +1614,8 @@  static struct sk_buff *__tun_build_skb(struct tun_file *tfile,
 
 	skb_reserve(skb, pad);
 	skb_put(skb, len);
+	if (metasize)
+		skb_metadata_set(skb, metasize);
 	skb_set_owner_w(skb, tfile->socket.sk);
 
 	get_page(alloc_frag->page);
@@ -1672,6 +1675,7 @@  static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 	char *buf;
 	size_t copied;
 	int pad = TUN_RX_PAD;
+	int metasize = 0;
 	int err = 0;
 
 	rcu_read_lock();
@@ -1699,7 +1703,7 @@  static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 	if (hdr->gso_type || !xdp_prog) {
 		*skb_xdp = 1;
 		return __tun_build_skb(tfile, alloc_frag, buf, buflen, len,
-				       pad);
+				       pad, metasize);
 	}
 
 	*skb_xdp = 0;
@@ -1734,12 +1738,18 @@  static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 
 		pad = xdp.data - xdp.data_hard_start;
 		len = xdp.data_end - xdp.data;
+
+		/* It is known that the xdp_buff was prepared with metadata
+		 * support, so no additional check is necessary.
+		 */
+		metasize = xdp.data - xdp.data_meta;
 	}
 	bpf_net_ctx_clear(bpf_net_ctx);
 	rcu_read_unlock();
 	local_bh_enable();
 
-	return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad);
+	return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad,
+			       metasize);
 
 out:
 	bpf_net_ctx_clear(bpf_net_ctx);
@@ -2456,6 +2466,7 @@  static int tun_xdp_one(struct tun_struct *tun,
 	struct sk_buff_head *queue;
 	u32 rxhash = 0, act;
 	int buflen = hdr->buflen;
+	int metasize = 0;
 	int ret = 0;
 	bool skb_xdp = false;
 	struct page *page;
@@ -2510,6 +2521,10 @@  static int tun_xdp_one(struct tun_struct *tun,
 	skb_reserve(skb, xdp->data - xdp->data_hard_start);
 	skb_put(skb, xdp->data_end - xdp->data);
 
+	metasize = max(xdp->data - xdp->data_meta, 0);
+	if (metasize)
+		skb_metadata_set(skb, metasize);
+
 	if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun))) {
 		atomic_long_inc(&tun->rx_frame_errors);
 		kfree_skb(skb);