Message ID | c928f7c698de070b33d38f230081fd4f993f2567.1701128026.git.lorenzo@kernel.org (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | BPF |
Headers | show |
Series | [net-next] xdp: add multi-buff support for xdp running in generic mode | expand |
Lorenzo Bianconi <lorenzo@kernel.org> writes: > Similar to native xdp, do not always linearize the skb in > netif_receive_generic_xdp routine but create a non-linear xdp_buff to be > processed by the eBPF program. This allow to add multi-buffer support > for xdp running in generic mode. > > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org> With one nit below: Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com> > --- > net/core/dev.c | 28 +++++++++++++++++++++++----- > 1 file changed, 23 insertions(+), 5 deletions(-) > > diff --git a/net/core/dev.c b/net/core/dev.c > index 3950ced396b5..5a58f3e28657 100644 > --- a/net/core/dev.c > +++ b/net/core/dev.c > @@ -4853,6 +4853,12 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, > xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq); > xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len, > skb_headlen(skb) + mac_len, true); > + if (skb_is_nonlinear(skb)) { > + skb_shinfo(skb)->xdp_frags_size = skb->data_len; > + xdp_buff_set_frags_flag(xdp); > + } else { > + xdp_buff_clear_frags_flag(xdp); > + } > > orig_data_end = xdp->data_end; > orig_data = xdp->data; > @@ -4882,6 +4888,14 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, > skb->len += off; /* positive on grow, negative on shrink */ > } > > + /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers > + * (e.g. bpf_xdp_adjust_tail), we need to update data_len here. > + */ > + if (xdp_buff_has_frags(xdp)) > + skb->data_len = skb_shinfo(skb)->xdp_frags_size; > + else > + skb->data_len = 0; > + > /* check if XDP changed eth hdr such SKB needs update */ > eth = (struct ethhdr *)xdp->data; > if ((orig_eth_type != eth->h_proto) || > @@ -4927,9 +4941,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, > if (skb_is_redirected(skb)) > return XDP_PASS; > > - /* XDP packets must be linear and must have sufficient headroom > - * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also > - * native XDP provides, thus we need to do it here as well. > + /* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM > + * bytes. This is the guarantee that also native XDP provides, > + * thus we need to do it here as well. > */ > if (skb_cloned(skb) || skb_is_nonlinear(skb) || > skb_headroom(skb) < XDP_PACKET_HEADROOM) { > @@ -4943,8 +4957,12 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, > hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, > troom > 0 ? troom + 128 : 0, GFP_ATOMIC)) > goto do_drop; There's a comment above the pskb_expand_head() call that isn't quite part of the context here, that should also be adjusted now that we don't always linearise: /* In case we have to go down the path and also linearize, * then lets do the pskb_expand_head() work just once here. */ Actually, I think maybe just dropping that comment entirely with this change would make sense? > - if (skb_linearize(skb)) > - goto do_drop; > + > + /* XDP does not support fraglist */ > + if (skb_has_frag_list(skb) || !xdp_prog->aux->xdp_has_frags) { > + if (skb_linearize(skb)) > + goto do_drop; > + } > } > > act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog); > -- > 2.43.0
> Lorenzo Bianconi <lorenzo@kernel.org> writes: > > > Similar to native xdp, do not always linearize the skb in > > netif_receive_generic_xdp routine but create a non-linear xdp_buff to be > > processed by the eBPF program. This allow to add multi-buffer support > > for xdp running in generic mode. > > > > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org> > > With one nit below: > > Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com> > > > --- > > net/core/dev.c | 28 +++++++++++++++++++++++----- > > 1 file changed, 23 insertions(+), 5 deletions(-) > > > > diff --git a/net/core/dev.c b/net/core/dev.c > > index 3950ced396b5..5a58f3e28657 100644 > > --- a/net/core/dev.c > > +++ b/net/core/dev.c > > @@ -4853,6 +4853,12 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, > > xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq); > > xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len, > > skb_headlen(skb) + mac_len, true); > > + if (skb_is_nonlinear(skb)) { > > + skb_shinfo(skb)->xdp_frags_size = skb->data_len; > > + xdp_buff_set_frags_flag(xdp); > > + } else { > > + xdp_buff_clear_frags_flag(xdp); > > + } > > > > orig_data_end = xdp->data_end; > > orig_data = xdp->data; > > @@ -4882,6 +4888,14 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, > > skb->len += off; /* positive on grow, negative on shrink */ > > } > > > > + /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers > > + * (e.g. bpf_xdp_adjust_tail), we need to update data_len here. > > + */ > > + if (xdp_buff_has_frags(xdp)) > > + skb->data_len = skb_shinfo(skb)->xdp_frags_size; > > + else > > + skb->data_len = 0; > > + > > /* check if XDP changed eth hdr such SKB needs update */ > > eth = (struct ethhdr *)xdp->data; > > if ((orig_eth_type != eth->h_proto) || > > @@ -4927,9 +4941,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, > > if (skb_is_redirected(skb)) > > return XDP_PASS; > > > > - /* XDP packets must be linear and must have sufficient headroom > > - * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also > > - * native XDP provides, thus we need to do it here as well. > > + /* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM > > + * bytes. This is the guarantee that also native XDP provides, > > + * thus we need to do it here as well. > > */ > > if (skb_cloned(skb) || skb_is_nonlinear(skb) || > > skb_headroom(skb) < XDP_PACKET_HEADROOM) { > > @@ -4943,8 +4957,12 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, > > hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, > > troom > 0 ? troom + 128 : 0, GFP_ATOMIC)) > > goto do_drop; > > There's a comment above the pskb_expand_head() call that isn't quite > part of the context here, that should also be adjusted now that we don't > always linearise: > > /* In case we have to go down the path and also linearize, > * then lets do the pskb_expand_head() work just once here. > */ > > Actually, I think maybe just dropping that comment entirely with this > change would make sense? ack, I will get rid of it. Thx. Regards, Lorenzo > > > > - if (skb_linearize(skb)) > > - goto do_drop; > > + > > + /* XDP does not support fraglist */ > > + if (skb_has_frag_list(skb) || !xdp_prog->aux->xdp_has_frags) { > > + if (skb_linearize(skb)) > > + goto do_drop; > > + } > > } > > > > act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog); > > -- > > 2.43.0 >
> Similar to native xdp, do not always linearize the skb in > netif_receive_generic_xdp routine but create a non-linear xdp_buff to be > processed by the eBPF program. This allow to add multi-buffer support > for xdp running in generic mode. > > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org> > --- > net/core/dev.c | 28 +++++++++++++++++++++++----- > 1 file changed, 23 insertions(+), 5 deletions(-) > > diff --git a/net/core/dev.c b/net/core/dev.c > index 3950ced396b5..5a58f3e28657 100644 > --- a/net/core/dev.c > +++ b/net/core/dev.c > @@ -4853,6 +4853,12 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, > xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq); > xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len, > skb_headlen(skb) + mac_len, true); > + if (skb_is_nonlinear(skb)) { > + skb_shinfo(skb)->xdp_frags_size = skb->data_len; > + xdp_buff_set_frags_flag(xdp); > + } else { > + xdp_buff_clear_frags_flag(xdp); > + } > > orig_data_end = xdp->data_end; > orig_data = xdp->data; > @@ -4882,6 +4888,14 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, > skb->len += off; /* positive on grow, negative on shrink */ > } > > + /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers > + * (e.g. bpf_xdp_adjust_tail), we need to update data_len here. > + */ > + if (xdp_buff_has_frags(xdp)) > + skb->data_len = skb_shinfo(skb)->xdp_frags_size; > + else > + skb->data_len = 0; > + > /* check if XDP changed eth hdr such SKB needs update */ > eth = (struct ethhdr *)xdp->data; > if ((orig_eth_type != eth->h_proto) || > @@ -4927,9 +4941,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, > if (skb_is_redirected(skb)) > return XDP_PASS; > > - /* XDP packets must be linear and must have sufficient headroom > - * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also > - * native XDP provides, thus we need to do it here as well. > + /* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM > + * bytes. This is the guarantee that also native XDP provides, > + * thus we need to do it here as well. > */ > if (skb_cloned(skb) || skb_is_nonlinear(skb) || > skb_headroom(skb) < XDP_PACKET_HEADROOM) { > @@ -4943,8 +4957,12 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, > hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, > troom > 0 ? troom + 128 : 0, GFP_ATOMIC)) > goto do_drop; > - if (skb_linearize(skb)) > - goto do_drop; > + > + /* XDP does not support fraglist */ > + if (skb_has_frag_list(skb) || !xdp_prog->aux->xdp_has_frags) { > + if (skb_linearize(skb)) > + goto do_drop; @Jakub: iirc we were discussing something similar for veth [0]. Here pskb_expand_head() reallocates skb paged data (skb_shinfo()->frags[]) just if the skb is cloned and if it is zero-copied [1] while in skb_cow_data() we always reallocate the paged area if skb_shinfo()->nr_frags is set [2]. Since the eBPF program can theoretically modify paged data, I would say we should do the same we did for veth even here, right? Regards, Lorenzo [0] https://lore.kernel.org/netdev/20220312131806.1c2919ba@kicinski-fedora-pc1c0hjn.dhcp.thefacebook.com/ [1] https://elixir.bootlin.com/linux/v6.6.2/source/net/core/skbuff.c#L2113 [2] https://elixir.bootlin.com/linux/v6.6.2/source/net/core/skbuff.c#L5016 > + } > } > > act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog); > -- > 2.43.0 >
On Tue, 28 Nov 2023 18:29:20 +0100 Lorenzo Bianconi wrote: > @Jakub: iirc we were discussing something similar for veth [0]. > Here pskb_expand_head() reallocates skb paged data (skb_shinfo()->frags[]) > just if the skb is cloned and if it is zero-copied [1] while in skb_cow_data() > we always reallocate the paged area if skb_shinfo()->nr_frags is set [2]. > Since the eBPF program can theoretically modify paged data, I would say we > should do the same we did for veth even here, right? Yes, don't we allow writes to fragments in XDP based on the assumption that it runs on Rx so that paged data must not be zero copy? bpf_xdp_store_bytes() doesn't seem to have any checks which would stop it from writing fragments, as far as I can see. I don't see how we can ever correctly support this form of mbuf for veth or generic XDP :(
> On Tue, 28 Nov 2023 18:29:20 +0100 Lorenzo Bianconi wrote: > > @Jakub: iirc we were discussing something similar for veth [0]. > > Here pskb_expand_head() reallocates skb paged data (skb_shinfo()->frags[]) > > just if the skb is cloned and if it is zero-copied [1] while in skb_cow_data() > > we always reallocate the paged area if skb_shinfo()->nr_frags is set [2]. > > Since the eBPF program can theoretically modify paged data, I would say we > > should do the same we did for veth even here, right? > > Yes, don't we allow writes to fragments in XDP based on the assumption > that it runs on Rx so that paged data must not be zero copy? > bpf_xdp_store_bytes() doesn't seem to have any checks which would > stop it from writing fragments, as far as I can see. do you mean in the skb use-case we could write to fragments (without copying them) if the skb is not cloned and the paged area is not 'zero-copied'? With respect to this patch it would mean we can rely on pskb_expand_head() to reallocate the skb and to covert it to a xdp_buff and we do not need to explicitly reallocate fragments as we currently do for veth in veth_convert_skb_to_xdp_buff() [0]. Is my understanding correct or am I missing something? Regards, Lorenzo [0] https://elixir.bootlin.com/linux/v6.6.2/source/drivers/net/veth.c#L738 > > I don't see how we can ever correctly support this form of mbuf for veth > or generic XDP :(
On Tue, 28 Nov 2023 23:27:29 +0100 Lorenzo Bianconi wrote: > > Yes, don't we allow writes to fragments in XDP based on the assumption > > that it runs on Rx so that paged data must not be zero copy? > > bpf_xdp_store_bytes() doesn't seem to have any checks which would > > stop it from writing fragments, as far as I can see. > > do you mean in the skb use-case we could write to fragments (without copying > them) if the skb is not cloned and the paged area is not 'zero-copied'? The zero-copy thing is a red herring. If application uses sendpage/sendfile/splice the frag may be a page cache page of a file. Or something completely read only. IIUC you're trying to avoid the copy if the prog is mbuf capable. So I was saying that can't work for forms of XDP which actually deal with skbs. But that wasn't really your question, sorry :) > With respect to this patch it would mean we can rely on pskb_expand_head() to > reallocate the skb and to covert it to a xdp_buff and we do not need to explicitly > reallocate fragments as we currently do for veth in veth_convert_skb_to_xdp_buff() [0]. > Is my understanding correct or am I missing something? The difference is that pskb_expand_head() will give you a linear skb, potentially triggering an order 5 allocation. Expensive and likely to fail under memory pressure. veth_convert_skb_to_xdp_buff() tries to allocate pages, and keep the skb fragmented.
> On Tue, 28 Nov 2023 23:27:29 +0100 Lorenzo Bianconi wrote: > > > Yes, don't we allow writes to fragments in XDP based on the assumption > > > that it runs on Rx so that paged data must not be zero copy? > > > bpf_xdp_store_bytes() doesn't seem to have any checks which would > > > stop it from writing fragments, as far as I can see. > > > > do you mean in the skb use-case we could write to fragments (without copying > > them) if the skb is not cloned and the paged area is not 'zero-copied'? > > The zero-copy thing is a red herring. If application uses > sendpage/sendfile/splice the frag may be a page cache page > of a file. Or something completely read only. ack, thx for pointing this out. It is clear now :) > > IIUC you're trying to avoid the copy if the prog is mbuf capable. > So I was saying that can't work for forms of XDP which actually > deal with skbs. But that wasn't really your question, sorry :) > > > With respect to this patch it would mean we can rely on pskb_expand_head() to > > reallocate the skb and to covert it to a xdp_buff and we do not need to explicitly > > reallocate fragments as we currently do for veth in veth_convert_skb_to_xdp_buff() [0]. > > Is my understanding correct or am I missing something? > > The difference is that pskb_expand_head() will give you a linear skb, > potentially triggering an order 5 allocation. Expensive and likely to > fail under memory pressure. ack > > veth_convert_skb_to_xdp_buff() tries to allocate pages, and keep > the skb fragmented. I will rework the patch using this approach. Regards, Lorenzo
diff --git a/net/core/dev.c b/net/core/dev.c index 3950ced396b5..5a58f3e28657 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4853,6 +4853,12 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq); xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len, skb_headlen(skb) + mac_len, true); + if (skb_is_nonlinear(skb)) { + skb_shinfo(skb)->xdp_frags_size = skb->data_len; + xdp_buff_set_frags_flag(xdp); + } else { + xdp_buff_clear_frags_flag(xdp); + } orig_data_end = xdp->data_end; orig_data = xdp->data; @@ -4882,6 +4888,14 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, skb->len += off; /* positive on grow, negative on shrink */ } + /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers + * (e.g. bpf_xdp_adjust_tail), we need to update data_len here. + */ + if (xdp_buff_has_frags(xdp)) + skb->data_len = skb_shinfo(skb)->xdp_frags_size; + else + skb->data_len = 0; + /* check if XDP changed eth hdr such SKB needs update */ eth = (struct ethhdr *)xdp->data; if ((orig_eth_type != eth->h_proto) || @@ -4927,9 +4941,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, if (skb_is_redirected(skb)) return XDP_PASS; - /* XDP packets must be linear and must have sufficient headroom - * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also - * native XDP provides, thus we need to do it here as well. + /* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM + * bytes. This is the guarantee that also native XDP provides, + * thus we need to do it here as well. */ if (skb_cloned(skb) || skb_is_nonlinear(skb) || skb_headroom(skb) < XDP_PACKET_HEADROOM) { @@ -4943,8 +4957,12 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, troom > 0 ? troom + 128 : 0, GFP_ATOMIC)) goto do_drop; - if (skb_linearize(skb)) - goto do_drop; + + /* XDP does not support fraglist */ + if (skb_has_frag_list(skb) || !xdp_prog->aux->xdp_has_frags) { + if (skb_linearize(skb)) + goto do_drop; + } } act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog);
Similar to native xdp, do not always linearize the skb in netif_receive_generic_xdp routine but create a non-linear xdp_buff to be processed by the eBPF program. This allow to add multi-buffer support for xdp running in generic mode. Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org> --- net/core/dev.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-)