Message ID | 20231221132656.384606-3-maciej.fijalkowski@intel.com (mailing list archive) |
---|---|
State | New, archived |
Delegated to: | BPF |
Headers | show |
Series | net: bpf_xdp_adjust_tail() fixes | expand |
On 12/21/23 5:26 AM, Maciej Fijalkowski wrote: > This comes from __xdp_return() call with xdp_buff argument passed as > NULL which is supposed to be consumed by xsk_buff_free() call. > > To address this properly, in ZC case, a node that represents the frag > being removed has to be pulled out of xskb_list. Introduce > appriopriate xsk helpers to do such node operation and use them > accordingly within bpf_xdp_adjust_tail(). [ ... ] > +static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) > +{ > + struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); > + struct xdp_buff_xsk *frag; > + > + frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, > + xskb_list_node); > + return &frag->xdp; > +} > + [ ... ] > +static void __shrink_data(struct xdp_buff *xdp, struct xdp_mem_info *mem_info, > + skb_frag_t *frag, int shrink) > +{ > + if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) { > + struct xdp_buff *tail = xsk_buff_get_tail(xdp); > + > + if (tail) > + tail->data_end -= shrink; > + } > + skb_frag_size_sub(frag, shrink); > +} > + > +static bool shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, int shrink) > +{ > + struct xdp_mem_info *mem_info = &xdp->rxq->mem; > + > + if (skb_frag_size(frag) == shrink) { > + struct page *page = skb_frag_page(frag); > + struct xdp_buff *zc_frag = NULL; > + > + if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) { > + zc_frag = xsk_buff_get_tail(xdp); > + > + if (zc_frag) { Based on the xsk_buff_get_tail(), would zc_frag ever be NULL? > + xdp_buff_clear_frags_flag(zc_frag); > + xsk_buff_del_tail(zc_frag); > + } > + } > + > + __xdp_return(page_address(page), mem_info, false, zc_frag); and iiuc, this patch is fixing a bug when zc_frag is NULL and MEM_TYPE_XSK_BUFF_POOL. > + return true; > + } > + __shrink_data(xdp, mem_info, frag, shrink); > + return false; > +} > +
On Tue, Jan 02, 2024 at 02:58:00PM -0800, Martin KaFai Lau wrote: > On 12/21/23 5:26 AM, Maciej Fijalkowski wrote: > > This comes from __xdp_return() call with xdp_buff argument passed as > > NULL which is supposed to be consumed by xsk_buff_free() call. > > > > To address this properly, in ZC case, a node that represents the frag > > being removed has to be pulled out of xskb_list. Introduce > > appriopriate xsk helpers to do such node operation and use them > > accordingly within bpf_xdp_adjust_tail(). > > [ ... ] > > > +static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) > > +{ > > + struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); > > + struct xdp_buff_xsk *frag; > > + > > + frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, > > + xskb_list_node); > > + return &frag->xdp; > > +} > > + > > [ ... ] > > > +static void __shrink_data(struct xdp_buff *xdp, struct xdp_mem_info *mem_info, > > + skb_frag_t *frag, int shrink) > > +{ > > + if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) { > > + struct xdp_buff *tail = xsk_buff_get_tail(xdp); > > + > > + if (tail) > > + tail->data_end -= shrink; > > + } > > + skb_frag_size_sub(frag, shrink); > > +} > > + > > +static bool shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, int shrink) > > +{ > > + struct xdp_mem_info *mem_info = &xdp->rxq->mem; > > + > > + if (skb_frag_size(frag) == shrink) { > > + struct page *page = skb_frag_page(frag); > > + struct xdp_buff *zc_frag = NULL; > > + > > + if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) { > > + zc_frag = xsk_buff_get_tail(xdp); > > + > > + if (zc_frag) { > > Based on the xsk_buff_get_tail(), would zc_frag ever be NULL? Hey Martin thanks for taking a look, I had to do this in order to satisfy !CONFIG_XDP_SOCKETS builds :/ > > > + xdp_buff_clear_frags_flag(zc_frag); > > + xsk_buff_del_tail(zc_frag); > > + } > > + } > > + > > + __xdp_return(page_address(page), mem_info, false, zc_frag); > > and iiuc, this patch is fixing a bug when zc_frag is NULL and > MEM_TYPE_XSK_BUFF_POOL. Generally I don't see the need for xdp_return_buff() (which calls in the end __xdp_return() being discussed) to handle MEM_TYPE_XSK_BUFF_POOL, this could be refactored later and then probably this fix would look different, but this is out of the scope now. > > > + return true; > > + } > > + __shrink_data(xdp, mem_info, frag, shrink); > > + return false; > > +} > > + > >
Maciej Fijalkowski wrote: > Currently when packet is shrunk via bpf_xdp_adjust_tail(), null ptr > dereference happens: > > [1136314.192256] BUG: kernel NULL pointer dereference, address: > 0000000000000034 > [1136314.203943] #PF: supervisor read access in kernel mode > [1136314.213768] #PF: error_code(0x0000) - not-present page > [1136314.223550] PGD 0 P4D 0 > [1136314.230684] Oops: 0000 [#1] PREEMPT SMP NOPTI > [1136314.239621] CPU: 8 PID: 54203 Comm: xdpsock Not tainted 6.6.0+ #257 > [1136314.250469] Hardware name: Intel Corporation S2600WFT/S2600WFT, > BIOS SE5C620.86B.02.01.0008.031920191559 03/19/2019 > [1136314.265615] RIP: 0010:__xdp_return+0x6c/0x210 > [1136314.274653] Code: ad 00 48 8b 47 08 49 89 f8 a8 01 0f 85 9b 01 00 00 0f 1f 44 00 00 f0 41 ff 48 34 75 32 4c 89 c7 e9 79 cd 80 ff 83 fe 03 75 17 <f6> 41 34 01 0f 85 02 01 00 00 48 89 cf e9 22 cc 1e 00 e9 3d d2 86 > [1136314.302907] RSP: 0018:ffffc900089f8db0 EFLAGS: 00010246 > [1136314.312967] RAX: ffffc9003168aed0 RBX: ffff8881c3300000 RCX: > 0000000000000000 > [1136314.324953] RDX: 0000000000000000 RSI: 0000000000000003 RDI: > ffffc9003168c000 > [1136314.336929] RBP: 0000000000000ae0 R08: 0000000000000002 R09: > 0000000000010000 > [1136314.348844] R10: ffffc9000e495000 R11: 0000000000000040 R12: > 0000000000000001 > [1136314.360706] R13: 0000000000000524 R14: ffffc9003168aec0 R15: > 0000000000000001 > [1136314.373298] FS: 00007f8df8bbcb80(0000) GS:ffff8897e0e00000(0000) > knlGS:0000000000000000 > [1136314.386105] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > [1136314.396532] CR2: 0000000000000034 CR3: 00000001aa912002 CR4: > 00000000007706f0 > [1136314.408377] DR0: 0000000000000000 DR1: 0000000000000000 DR2: > 0000000000000000 > [1136314.420173] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: > 0000000000000400 > [1136314.431890] PKRU: 55555554 > [1136314.439143] Call Trace: > [1136314.446058] <IRQ> > [1136314.452465] ? __die+0x20/0x70 > [1136314.459881] ? page_fault_oops+0x15b/0x440 > [1136314.468305] ? exc_page_fault+0x6a/0x150 > [1136314.476491] ? asm_exc_page_fault+0x22/0x30 > [1136314.484927] ? __xdp_return+0x6c/0x210 > [1136314.492863] bpf_xdp_adjust_tail+0x155/0x1d0 > [1136314.501269] bpf_prog_ccc47ae29d3b6570_xdp_sock_prog+0x15/0x60 > [1136314.511263] ice_clean_rx_irq_zc+0x206/0xc60 [ice] > [1136314.520222] ? ice_xmit_zc+0x6e/0x150 [ice] > [1136314.528506] ice_napi_poll+0x467/0x670 [ice] > [1136314.536858] ? ttwu_do_activate.constprop.0+0x8f/0x1a0 > [1136314.546010] __napi_poll+0x29/0x1b0 > [1136314.553462] net_rx_action+0x133/0x270 > [1136314.561619] __do_softirq+0xbe/0x28e > [1136314.569303] do_softirq+0x3f/0x60 > > This comes from __xdp_return() call with xdp_buff argument passed as > NULL which is supposed to be consumed by xsk_buff_free() call. > > To address this properly, in ZC case, a node that represents the frag > being removed has to be pulled out of xskb_list. Introduce hmm it looks like xsk_buff_free() called by __xdp_return would pull the frag out of the xskb_list? Or am I wrong? Then the issue is primarily the NULL handling? > appriopriate xsk helpers to do such node operation and use them > accordingly within bpf_xdp_adjust_tail(). > > Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX") > Acked-by: Magnus Karlsson <magnus.karlsson@intel.com> # For the xsk header part > Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com> > --- > include/net/xdp_sock_drv.h | 26 +++++++++++++++++++++ > net/core/filter.c | 48 +++++++++++++++++++++++++++++++------- > 2 files changed, 65 insertions(+), 9 deletions(-) > > diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h > index b62bb8525a5f..3d35ac0f838b 100644 > --- a/include/net/xdp_sock_drv.h > +++ b/include/net/xdp_sock_drv.h > @@ -159,6 +159,23 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) > return ret; > } > > +static inline void xsk_buff_del_tail(struct xdp_buff *tail) > +{ > + struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp); > + > + list_del(&xskb->xskb_list_node); > +} > + > +static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) > +{ > + struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); > + struct xdp_buff_xsk *frag; > + > + frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, > + xskb_list_node); > + return &frag->xdp; > +} > + > static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) > { > xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM; > @@ -350,6 +367,15 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) > return NULL; > } > > +static inline void xsk_buff_del_tail(struct xdp_buff *tail) > +{ > +} > + > +static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) > +{ > + return NULL; > +} > + > static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) > { > } > diff --git a/net/core/filter.c b/net/core/filter.c > index 24061f29c9dd..1e20196687fd 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -83,6 +83,7 @@ > #include <net/netfilter/nf_conntrack_bpf.h> > #include <net/netkit.h> > #include <linux/un.h> > +#include <net/xdp_sock_drv.h> > > #include "dev.h" > > @@ -4096,6 +4097,42 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) > return 0; > } > > +static void __shrink_data(struct xdp_buff *xdp, struct xdp_mem_info *mem_info, > + skb_frag_t *frag, int shrink) > +{ > + if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) { > + struct xdp_buff *tail = xsk_buff_get_tail(xdp); > + > + if (tail) > + tail->data_end -= shrink; > + } > + skb_frag_size_sub(frag, shrink); > +} > + > +static bool shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, int shrink) > +{ > + struct xdp_mem_info *mem_info = &xdp->rxq->mem; > + > + if (skb_frag_size(frag) == shrink) { > + struct page *page = skb_frag_page(frag); > + struct xdp_buff *zc_frag = NULL; > + > + if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) { > + zc_frag = xsk_buff_get_tail(xdp); > + > + if (zc_frag) { > + xdp_buff_clear_frags_flag(zc_frag); > + xsk_buff_del_tail(zc_frag); > + } > + } Should this be fixed in xdp_return instead of here? The xdp_return is doing what xsk_buff_del_tail() does. If we also called clear_frags there could this be simpler? if (skb_frag_size(frag) == shrink) { struct page *page = skb_frag_page(frag); __xdp_return(page_address(page), mem_info, false, xsk_buff_get_tail(xdp)); } else { __shrink_data(xdp, mem_info, frag, shrink); } the return will need to have an unlikely(!xdp) to guard the case it might be NULL, but also not sure if we would ever expect a NULL here if MEM_TYPE_XSK_BUFF_POOL so you might skip that unlikely as well? > + > + __xdp_return(page_address(page), mem_info, false, zc_frag); > + return true; > + } > + __shrink_data(xdp, mem_info, frag, shrink); > + return false; > +} > + > static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) > { > struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); > @@ -4110,17 +4147,10 @@ static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) > > len_free += shrink; > offset -= shrink; > - > - if (skb_frag_size(frag) == shrink) { > - struct page *page = skb_frag_page(frag); > - And then I likely would avoid the helper altogether? And code example above just lands here? > - __xdp_return(page_address(page), &xdp->rxq->mem, > - false, NULL); > + if (shrink_data(xdp, frag, shrink)) > n_frags_free++; > - } else { > - skb_frag_size_sub(frag, shrink); > + else > break; > - } > } I think the fix can be more straight-forward if we just populate the NULL field with the xdp_buff using the get_tail() helper created above. > sinfo->nr_frags -= n_frags_free; > sinfo->xdp_frags_size -= len_free; > -- > 2.34.1 > >
On 1/3/24 4:04 AM, Maciej Fijalkowski wrote: > On Tue, Jan 02, 2024 at 02:58:00PM -0800, Martin KaFai Lau wrote: >> On 12/21/23 5:26 AM, Maciej Fijalkowski wrote: >>> This comes from __xdp_return() call with xdp_buff argument passed as >>> NULL which is supposed to be consumed by xsk_buff_free() call. >>> >>> To address this properly, in ZC case, a node that represents the frag >>> being removed has to be pulled out of xskb_list. Introduce >>> appriopriate xsk helpers to do such node operation and use them >>> accordingly within bpf_xdp_adjust_tail(). >> >> [ ... ] >> >>> +static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) >>> +{ >>> + struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); >>> + struct xdp_buff_xsk *frag; >>> + >>> + frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, >>> + xskb_list_node); >>> + return &frag->xdp; >>> +} >>> + >> >> [ ... ] >> >>> +static void __shrink_data(struct xdp_buff *xdp, struct xdp_mem_info *mem_info, >>> + skb_frag_t *frag, int shrink) >>> +{ >>> + if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) { >>> + struct xdp_buff *tail = xsk_buff_get_tail(xdp); >>> + >>> + if (tail) >>> + tail->data_end -= shrink; >>> + } >>> + skb_frag_size_sub(frag, shrink); >>> +} >>> + >>> +static bool shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, int shrink) >>> +{ >>> + struct xdp_mem_info *mem_info = &xdp->rxq->mem; >>> + >>> + if (skb_frag_size(frag) == shrink) { >>> + struct page *page = skb_frag_page(frag); >>> + struct xdp_buff *zc_frag = NULL; >>> + >>> + if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) { >>> + zc_frag = xsk_buff_get_tail(xdp); >>> + >>> + if (zc_frag) { >> >> Based on the xsk_buff_get_tail(), would zc_frag ever be NULL? > > Hey Martin thanks for taking a look, I had to do this in order to satisfy > !CONFIG_XDP_SOCKETS builds :/ There is compilation/checker warning if it does not check for NULL? hmm... but it still should not reach here in the runtime and call xsk_buff_get_tail() in the !CONFIG_XDP_SOCKETS build. Can the NULL test on the get_tail() return value be removed? The above "mem_info->type == MEM_TYPE_XSK_BUFF_POOL" should have avoided the get_tail() call for the !CONFIG_XDP_SOCKETS build. Otherwise, it could be passing NULL to the __xdp_return() and hit the same bug again. The NULL check here is pretty hard to reason logically. > >> >>> + xdp_buff_clear_frags_flag(zc_frag); >>> + xsk_buff_del_tail(zc_frag); >>> + } >>> + } >>> + >>> + __xdp_return(page_address(page), mem_info, false, zc_frag); >> >> and iiuc, this patch is fixing a bug when zc_frag is NULL and >> MEM_TYPE_XSK_BUFF_POOL. > > Generally I don't see the need for xdp_return_buff() (which calls in the > end __xdp_return() being discussed) to handle MEM_TYPE_XSK_BUFF_POOL, this > could be refactored later and then probably this fix would look different, > but this is out of the scope now. > >> >>> + return true; >>> + } >>> + __shrink_data(xdp, mem_info, frag, shrink); >>> + return false; >>> +} >>> + >> >> >
On Wed, Jan 03, 2024 at 12:48:10PM -0800, John Fastabend wrote: > Maciej Fijalkowski wrote: > > Currently when packet is shrunk via bpf_xdp_adjust_tail(), null ptr > > dereference happens: > > > > [1136314.192256] BUG: kernel NULL pointer dereference, address: > > 0000000000000034 > > [1136314.203943] #PF: supervisor read access in kernel mode > > [1136314.213768] #PF: error_code(0x0000) - not-present page > > [1136314.223550] PGD 0 P4D 0 > > [1136314.230684] Oops: 0000 [#1] PREEMPT SMP NOPTI > > [1136314.239621] CPU: 8 PID: 54203 Comm: xdpsock Not tainted 6.6.0+ #257 > > [1136314.250469] Hardware name: Intel Corporation S2600WFT/S2600WFT, > > BIOS SE5C620.86B.02.01.0008.031920191559 03/19/2019 > > [1136314.265615] RIP: 0010:__xdp_return+0x6c/0x210 > > [1136314.274653] Code: ad 00 48 8b 47 08 49 89 f8 a8 01 0f 85 9b 01 00 00 0f 1f 44 00 00 f0 41 ff 48 34 75 32 4c 89 c7 e9 79 cd 80 ff 83 fe 03 75 17 <f6> 41 34 01 0f 85 02 01 00 00 48 89 cf e9 22 cc 1e 00 e9 3d d2 86 > > [1136314.302907] RSP: 0018:ffffc900089f8db0 EFLAGS: 00010246 > > [1136314.312967] RAX: ffffc9003168aed0 RBX: ffff8881c3300000 RCX: > > 0000000000000000 > > [1136314.324953] RDX: 0000000000000000 RSI: 0000000000000003 RDI: > > ffffc9003168c000 > > [1136314.336929] RBP: 0000000000000ae0 R08: 0000000000000002 R09: > > 0000000000010000 > > [1136314.348844] R10: ffffc9000e495000 R11: 0000000000000040 R12: > > 0000000000000001 > > [1136314.360706] R13: 0000000000000524 R14: ffffc9003168aec0 R15: > > 0000000000000001 > > [1136314.373298] FS: 00007f8df8bbcb80(0000) GS:ffff8897e0e00000(0000) > > knlGS:0000000000000000 > > [1136314.386105] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > > [1136314.396532] CR2: 0000000000000034 CR3: 00000001aa912002 CR4: > > 00000000007706f0 > > [1136314.408377] DR0: 0000000000000000 DR1: 0000000000000000 DR2: > > 0000000000000000 > > [1136314.420173] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: > > 0000000000000400 > > [1136314.431890] PKRU: 55555554 > > [1136314.439143] Call Trace: > > [1136314.446058] <IRQ> > > [1136314.452465] ? __die+0x20/0x70 > > [1136314.459881] ? page_fault_oops+0x15b/0x440 > > [1136314.468305] ? exc_page_fault+0x6a/0x150 > > [1136314.476491] ? asm_exc_page_fault+0x22/0x30 > > [1136314.484927] ? __xdp_return+0x6c/0x210 > > [1136314.492863] bpf_xdp_adjust_tail+0x155/0x1d0 > > [1136314.501269] bpf_prog_ccc47ae29d3b6570_xdp_sock_prog+0x15/0x60 > > [1136314.511263] ice_clean_rx_irq_zc+0x206/0xc60 [ice] > > [1136314.520222] ? ice_xmit_zc+0x6e/0x150 [ice] > > [1136314.528506] ice_napi_poll+0x467/0x670 [ice] > > [1136314.536858] ? ttwu_do_activate.constprop.0+0x8f/0x1a0 > > [1136314.546010] __napi_poll+0x29/0x1b0 > > [1136314.553462] net_rx_action+0x133/0x270 > > [1136314.561619] __do_softirq+0xbe/0x28e > > [1136314.569303] do_softirq+0x3f/0x60 > > > > This comes from __xdp_return() call with xdp_buff argument passed as > > NULL which is supposed to be consumed by xsk_buff_free() call. > > > > To address this properly, in ZC case, a node that represents the frag > > being removed has to be pulled out of xskb_list. Introduce > > hmm it looks like xsk_buff_free() called by __xdp_return would > pull the frag out of the xskb_list? Or am I wrong? > > Then the issue is primarily the NULL handling? Hey John, as you see later on it is also about adjusting the size within xdp_buff that comes from xsk pool, in case when offset is not bigger than frag size. > > > appriopriate xsk helpers to do such node operation and use them > > accordingly within bpf_xdp_adjust_tail(). > > > > Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX") > > Acked-by: Magnus Karlsson <magnus.karlsson@intel.com> # For the xsk header part > > Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com> > > --- > > include/net/xdp_sock_drv.h | 26 +++++++++++++++++++++ > > net/core/filter.c | 48 +++++++++++++++++++++++++++++++------- > > 2 files changed, 65 insertions(+), 9 deletions(-) > > > > diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h > > index b62bb8525a5f..3d35ac0f838b 100644 > > --- a/include/net/xdp_sock_drv.h > > +++ b/include/net/xdp_sock_drv.h > > @@ -159,6 +159,23 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) > > return ret; > > } > > > > +static inline void xsk_buff_del_tail(struct xdp_buff *tail) > > +{ > > + struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp); > > + > > + list_del(&xskb->xskb_list_node); > > +} > > + > > +static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) > > +{ > > + struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); > > + struct xdp_buff_xsk *frag; > > + > > + frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, > > + xskb_list_node); > > + return &frag->xdp; > > +} > > + > > static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) > > { > > xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM; > > @@ -350,6 +367,15 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) > > return NULL; > > } > > > > +static inline void xsk_buff_del_tail(struct xdp_buff *tail) > > +{ > > +} > > + > > +static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) > > +{ > > + return NULL; > > +} > > + > > static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) > > { > > } > > diff --git a/net/core/filter.c b/net/core/filter.c > > index 24061f29c9dd..1e20196687fd 100644 > > --- a/net/core/filter.c > > +++ b/net/core/filter.c > > @@ -83,6 +83,7 @@ > > #include <net/netfilter/nf_conntrack_bpf.h> > > #include <net/netkit.h> > > #include <linux/un.h> > > +#include <net/xdp_sock_drv.h> > > > > #include "dev.h" > > > > @@ -4096,6 +4097,42 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) > > return 0; > > } > > > > +static void __shrink_data(struct xdp_buff *xdp, struct xdp_mem_info *mem_info, > > + skb_frag_t *frag, int shrink) > > +{ > > + if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) { > > + struct xdp_buff *tail = xsk_buff_get_tail(xdp); > > + > > + if (tail) > > + tail->data_end -= shrink; > > + } > > + skb_frag_size_sub(frag, shrink); > > +} > > + > > +static bool shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, int shrink) > > +{ > > + struct xdp_mem_info *mem_info = &xdp->rxq->mem; > > + > > + if (skb_frag_size(frag) == shrink) { > > + struct page *page = skb_frag_page(frag); > > + struct xdp_buff *zc_frag = NULL; > > + > > + if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) { > > + zc_frag = xsk_buff_get_tail(xdp); > > + > > + if (zc_frag) { > > + xdp_buff_clear_frags_flag(zc_frag); > > + xsk_buff_del_tail(zc_frag); > > + } > > + } > > Should this be fixed in xdp_return instead of here? The xdp_return > is doing what xsk_buff_del_tail() does. If we also called clear_frags > there could this be simpler? xsk_buff_del_tail() only deletes node from xskb_list and xsk_buff_free() in the will call xp_free() on frag being deleted. I think I would be rather leaning towards adding xp_free() to xsk_buff_del_tail() and skipping __xdp_return() call from ZC case altogether... > > if (skb_frag_size(frag) == shrink) { > struct page *page = skb_frag_page(frag); > > __xdp_return(page_address(page), mem_info, false, xsk_buff_get_tail(xdp)); > } else { > __shrink_data(xdp, mem_info, frag, shrink); > } > > the return will need to have an unlikely(!xdp) to guard the case it > might be NULL, but also not sure if we would ever expect a NULL > here if MEM_TYPE_XSK_BUFF_POOL so you might skip that unlikely > as well? In that approach you would xp_free() the frag being removed but it would still be dangling in xskb_list that first xdp_buff carries. Some adjustments would have to be done within xsk_buff_free(). Regarding the NULLness of zc_frag for MEM_TYPE_XSK_BUFF_POOL that Martin also brought, you are right, I went too far with this, I misread what kernel test robot reported :) so on monday I will look into this. > > > + > > + __xdp_return(page_address(page), mem_info, false, zc_frag); > > + return true; > > + } > > + __shrink_data(xdp, mem_info, frag, shrink); > > + return false; > > +} > > + > > static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) > > { > > struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); > > @@ -4110,17 +4147,10 @@ static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) > > > > len_free += shrink; > > offset -= shrink; > > - > > - if (skb_frag_size(frag) == shrink) { > > - struct page *page = skb_frag_page(frag); > > - > > And then I likely would avoid the helper altogether? And code > example above just lands here? > > > - __xdp_return(page_address(page), &xdp->rxq->mem, > > - false, NULL); > > + if (shrink_data(xdp, frag, shrink)) > > n_frags_free++; > > - } else { > > - skb_frag_size_sub(frag, shrink); > > + else > > break; > > - } > > } > > I think the fix can be more straight-forward if we just populate > the NULL field with the xdp_buff using the get_tail() helper > created above. I'll think about both approaches, the one you're suggesting and the other that I wrote up above. Thanks a lot for taking a look! > > > sinfo->nr_frags -= n_frags_free; > > sinfo->xdp_frags_size -= len_free; > > -- > > 2.34.1 > > > > > > >
On Wed, Jan 03, 2024 at 02:53:20PM -0800, Martin KaFai Lau wrote: > On 1/3/24 4:04 AM, Maciej Fijalkowski wrote: > > On Tue, Jan 02, 2024 at 02:58:00PM -0800, Martin KaFai Lau wrote: > > > On 12/21/23 5:26 AM, Maciej Fijalkowski wrote: > > > > This comes from __xdp_return() call with xdp_buff argument passed as > > > > NULL which is supposed to be consumed by xsk_buff_free() call. > > > > > > > > To address this properly, in ZC case, a node that represents the frag > > > > being removed has to be pulled out of xskb_list. Introduce > > > > appriopriate xsk helpers to do such node operation and use them > > > > accordingly within bpf_xdp_adjust_tail(). > > > > > > [ ... ] > > > > > > > +static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) > > > > +{ > > > > + struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); > > > > + struct xdp_buff_xsk *frag; > > > > + > > > > + frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, > > > > + xskb_list_node); > > > > + return &frag->xdp; > > > > +} > > > > + > > > > > > [ ... ] > > > > > > > +static void __shrink_data(struct xdp_buff *xdp, struct xdp_mem_info *mem_info, > > > > + skb_frag_t *frag, int shrink) > > > > +{ > > > > + if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) { > > > > + struct xdp_buff *tail = xsk_buff_get_tail(xdp); > > > > + > > > > + if (tail) > > > > + tail->data_end -= shrink; > > > > + } > > > > + skb_frag_size_sub(frag, shrink); > > > > +} > > > > + > > > > +static bool shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, int shrink) > > > > +{ > > > > + struct xdp_mem_info *mem_info = &xdp->rxq->mem; > > > > + > > > > + if (skb_frag_size(frag) == shrink) { > > > > + struct page *page = skb_frag_page(frag); > > > > + struct xdp_buff *zc_frag = NULL; > > > > + > > > > + if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) { > > > > + zc_frag = xsk_buff_get_tail(xdp); > > > > + > > > > + if (zc_frag) { > > > > > > Based on the xsk_buff_get_tail(), would zc_frag ever be NULL? > > > > Hey Martin thanks for taking a look, I had to do this in order to satisfy > > !CONFIG_XDP_SOCKETS builds :/ > > There is compilation/checker warning if it does not check for NULL? > > hmm... but it still should not reach here in the runtime and call > xsk_buff_get_tail() in the !CONFIG_XDP_SOCKETS build. Can the NULL test on > the get_tail() return value be removed? The above "mem_info->type == > MEM_TYPE_XSK_BUFF_POOL" should have avoided the get_tail() call for the > !CONFIG_XDP_SOCKETS build. Otherwise, it could be passing NULL to the > __xdp_return() and hit the same bug again. The NULL check here is pretty > hard to reason logically. Thanks for bringing this up, you are of course right. I'll address that. > > > > > > > > > > + xdp_buff_clear_frags_flag(zc_frag); > > > > + xsk_buff_del_tail(zc_frag); > > > > + } > > > > + } > > > > + > > > > + __xdp_return(page_address(page), mem_info, false, zc_frag); > > > > > > and iiuc, this patch is fixing a bug when zc_frag is NULL and > > > MEM_TYPE_XSK_BUFF_POOL. > > > > Generally I don't see the need for xdp_return_buff() (which calls in the > > end __xdp_return() being discussed) to handle MEM_TYPE_XSK_BUFF_POOL, this > > could be refactored later and then probably this fix would look different, > > but this is out of the scope now. > > > > > > > > > + return true; > > > > + } > > > > + __shrink_data(xdp, mem_info, frag, shrink); > > > > + return false; > > > > +} > > > > + > > > > > > > > > >
diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index b62bb8525a5f..3d35ac0f838b 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -159,6 +159,23 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) return ret; } +static inline void xsk_buff_del_tail(struct xdp_buff *tail) +{ + struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp); + + list_del(&xskb->xskb_list_node); +} + +static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) +{ + struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); + struct xdp_buff_xsk *frag; + + frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, + xskb_list_node); + return &frag->xdp; +} + static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) { xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM; @@ -350,6 +367,15 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) return NULL; } +static inline void xsk_buff_del_tail(struct xdp_buff *tail) +{ +} + +static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) +{ + return NULL; +} + static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) { } diff --git a/net/core/filter.c b/net/core/filter.c index 24061f29c9dd..1e20196687fd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -83,6 +83,7 @@ #include <net/netfilter/nf_conntrack_bpf.h> #include <net/netkit.h> #include <linux/un.h> +#include <net/xdp_sock_drv.h> #include "dev.h" @@ -4096,6 +4097,42 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) return 0; } +static void __shrink_data(struct xdp_buff *xdp, struct xdp_mem_info *mem_info, + skb_frag_t *frag, int shrink) +{ + if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) { + struct xdp_buff *tail = xsk_buff_get_tail(xdp); + + if (tail) + tail->data_end -= shrink; + } + skb_frag_size_sub(frag, shrink); +} + +static bool shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, int shrink) +{ + struct xdp_mem_info *mem_info = &xdp->rxq->mem; + + if (skb_frag_size(frag) == shrink) { + struct page *page = skb_frag_page(frag); + struct xdp_buff *zc_frag = NULL; + + if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) { + zc_frag = xsk_buff_get_tail(xdp); + + if (zc_frag) { + xdp_buff_clear_frags_flag(zc_frag); + xsk_buff_del_tail(zc_frag); + } + } + + __xdp_return(page_address(page), mem_info, false, zc_frag); + return true; + } + __shrink_data(xdp, mem_info, frag, shrink); + return false; +} + static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); @@ -4110,17 +4147,10 @@ static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) len_free += shrink; offset -= shrink; - - if (skb_frag_size(frag) == shrink) { - struct page *page = skb_frag_page(frag); - - __xdp_return(page_address(page), &xdp->rxq->mem, - false, NULL); + if (shrink_data(xdp, frag, shrink)) n_frags_free++; - } else { - skb_frag_size_sub(frag, shrink); + else break; - } } sinfo->nr_frags -= n_frags_free; sinfo->xdp_frags_size -= len_free;