diff mbox series

[v3,bpf,2/4] xsk: fix usage of multi-buffer BPF helpers for ZC XDP

Message ID 20231221132656.384606-3-maciej.fijalkowski@intel.com (mailing list archive)
State New, archived
Delegated to: BPF
Headers show
Series net: bpf_xdp_adjust_tail() fixes | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf
netdev/apply fail Patch does not apply to bpf
bpf/vmtest-bpf-PR success PR summary
bpf/vmtest-bpf-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization
bpf/vmtest-bpf-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc

Commit Message

Fijalkowski, Maciej Dec. 21, 2023, 1:26 p.m. UTC
Currently when packet is shrunk via bpf_xdp_adjust_tail(), null ptr
dereference happens:

[1136314.192256] BUG: kernel NULL pointer dereference, address:
0000000000000034
[1136314.203943] #PF: supervisor read access in kernel mode
[1136314.213768] #PF: error_code(0x0000) - not-present page
[1136314.223550] PGD 0 P4D 0
[1136314.230684] Oops: 0000 [#1] PREEMPT SMP NOPTI
[1136314.239621] CPU: 8 PID: 54203 Comm: xdpsock Not tainted 6.6.0+ #257
[1136314.250469] Hardware name: Intel Corporation S2600WFT/S2600WFT,
BIOS SE5C620.86B.02.01.0008.031920191559 03/19/2019
[1136314.265615] RIP: 0010:__xdp_return+0x6c/0x210
[1136314.274653] Code: ad 00 48 8b 47 08 49 89 f8 a8 01 0f 85 9b 01 00 00 0f 1f 44 00 00 f0 41 ff 48 34 75 32 4c 89 c7 e9 79 cd 80 ff 83 fe 03 75 17 <f6> 41 34 01 0f 85 02 01 00 00 48 89 cf e9 22 cc 1e 00 e9 3d d2 86
[1136314.302907] RSP: 0018:ffffc900089f8db0 EFLAGS: 00010246
[1136314.312967] RAX: ffffc9003168aed0 RBX: ffff8881c3300000 RCX:
0000000000000000
[1136314.324953] RDX: 0000000000000000 RSI: 0000000000000003 RDI:
ffffc9003168c000
[1136314.336929] RBP: 0000000000000ae0 R08: 0000000000000002 R09:
0000000000010000
[1136314.348844] R10: ffffc9000e495000 R11: 0000000000000040 R12:
0000000000000001
[1136314.360706] R13: 0000000000000524 R14: ffffc9003168aec0 R15:
0000000000000001
[1136314.373298] FS:  00007f8df8bbcb80(0000) GS:ffff8897e0e00000(0000)
knlGS:0000000000000000
[1136314.386105] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[1136314.396532] CR2: 0000000000000034 CR3: 00000001aa912002 CR4:
00000000007706f0
[1136314.408377] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
0000000000000000
[1136314.420173] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
0000000000000400
[1136314.431890] PKRU: 55555554
[1136314.439143] Call Trace:
[1136314.446058]  <IRQ>
[1136314.452465]  ? __die+0x20/0x70
[1136314.459881]  ? page_fault_oops+0x15b/0x440
[1136314.468305]  ? exc_page_fault+0x6a/0x150
[1136314.476491]  ? asm_exc_page_fault+0x22/0x30
[1136314.484927]  ? __xdp_return+0x6c/0x210
[1136314.492863]  bpf_xdp_adjust_tail+0x155/0x1d0
[1136314.501269]  bpf_prog_ccc47ae29d3b6570_xdp_sock_prog+0x15/0x60
[1136314.511263]  ice_clean_rx_irq_zc+0x206/0xc60 [ice]
[1136314.520222]  ? ice_xmit_zc+0x6e/0x150 [ice]
[1136314.528506]  ice_napi_poll+0x467/0x670 [ice]
[1136314.536858]  ? ttwu_do_activate.constprop.0+0x8f/0x1a0
[1136314.546010]  __napi_poll+0x29/0x1b0
[1136314.553462]  net_rx_action+0x133/0x270
[1136314.561619]  __do_softirq+0xbe/0x28e
[1136314.569303]  do_softirq+0x3f/0x60

This comes from __xdp_return() call with xdp_buff argument passed as
NULL which is supposed to be consumed by xsk_buff_free() call.

To address this properly, in ZC case, a node that represents the frag
being removed has to be pulled out of xskb_list. Introduce
appropriate xsk helpers to do such node operation and use them
accordingly within bpf_xdp_adjust_tail().

Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX")
Acked-by: Magnus Karlsson <magnus.karlsson@intel.com> # For the xsk header part
Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
---
 include/net/xdp_sock_drv.h | 26 +++++++++++++++++++++
 net/core/filter.c          | 48 +++++++++++++++++++++++++++++++-------
 2 files changed, 65 insertions(+), 9 deletions(-)

Comments

Martin KaFai Lau Jan. 2, 2024, 10:58 p.m. UTC | #1
On 12/21/23 5:26 AM, Maciej Fijalkowski wrote:
> This comes from __xdp_return() call with xdp_buff argument passed as
> NULL which is supposed to be consumed by xsk_buff_free() call.
> 
> To address this properly, in ZC case, a node that represents the frag
> being removed has to be pulled out of xskb_list. Introduce
> appriopriate xsk helpers to do such node operation and use them
> accordingly within bpf_xdp_adjust_tail().

[ ... ]

> +static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first)
> +{
> +	struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp);
> +	struct xdp_buff_xsk *frag;
> +
> +	frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk,
> +			       xskb_list_node);
> +	return &frag->xdp;
> +}
> +

[ ... ]

> +static void __shrink_data(struct xdp_buff *xdp, struct xdp_mem_info *mem_info,
> +			  skb_frag_t *frag, int shrink)
> +{
> +	if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) {
> +		struct xdp_buff *tail = xsk_buff_get_tail(xdp);
> +
> +		if (tail)
> +			tail->data_end -= shrink;
> +	}
> +	skb_frag_size_sub(frag, shrink);
> +}
> +
> +static bool shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, int shrink)
> +{
> +	struct xdp_mem_info *mem_info = &xdp->rxq->mem;
> +
> +	if (skb_frag_size(frag) == shrink) {
> +		struct page *page = skb_frag_page(frag);
> +		struct xdp_buff *zc_frag = NULL;
> +
> +		if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) {
> +			zc_frag = xsk_buff_get_tail(xdp);
> +
> +			if (zc_frag) {

Based on the xsk_buff_get_tail(), would zc_frag ever be NULL?

> +				xdp_buff_clear_frags_flag(zc_frag);
> +				xsk_buff_del_tail(zc_frag);
> +			}
> +		}
> +
> +		__xdp_return(page_address(page), mem_info, false, zc_frag);

and iiuc, this patch is fixing a bug when zc_frag is NULL and 
MEM_TYPE_XSK_BUFF_POOL.

> +		return true;
> +	}
> +	__shrink_data(xdp, mem_info, frag, shrink);
> +	return false;
> +}
> +
Fijalkowski, Maciej Jan. 3, 2024, 12:04 p.m. UTC | #2
On Tue, Jan 02, 2024 at 02:58:00PM -0800, Martin KaFai Lau wrote:
> On 12/21/23 5:26 AM, Maciej Fijalkowski wrote:
> > This comes from __xdp_return() call with xdp_buff argument passed as
> > NULL which is supposed to be consumed by xsk_buff_free() call.
> > 
> > To address this properly, in ZC case, a node that represents the frag
> > being removed has to be pulled out of xskb_list. Introduce
> > appriopriate xsk helpers to do such node operation and use them
> > accordingly within bpf_xdp_adjust_tail().
> 
> [ ... ]
> 
> > +static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first)
> > +{
> > +	struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp);
> > +	struct xdp_buff_xsk *frag;
> > +
> > +	frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk,
> > +			       xskb_list_node);
> > +	return &frag->xdp;
> > +}
> > +
> 
> [ ... ]
> 
> > +static void __shrink_data(struct xdp_buff *xdp, struct xdp_mem_info *mem_info,
> > +			  skb_frag_t *frag, int shrink)
> > +{
> > +	if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) {
> > +		struct xdp_buff *tail = xsk_buff_get_tail(xdp);
> > +
> > +		if (tail)
> > +			tail->data_end -= shrink;
> > +	}
> > +	skb_frag_size_sub(frag, shrink);
> > +}
> > +
> > +static bool shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, int shrink)
> > +{
> > +	struct xdp_mem_info *mem_info = &xdp->rxq->mem;
> > +
> > +	if (skb_frag_size(frag) == shrink) {
> > +		struct page *page = skb_frag_page(frag);
> > +		struct xdp_buff *zc_frag = NULL;
> > +
> > +		if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) {
> > +			zc_frag = xsk_buff_get_tail(xdp);
> > +
> > +			if (zc_frag) {
> 
> Based on the xsk_buff_get_tail(), would zc_frag ever be NULL?

Hey Martin thanks for taking a look, I had to do this in order to satisfy
!CONFIG_XDP_SOCKETS builds :/

> 
> > +				xdp_buff_clear_frags_flag(zc_frag);
> > +				xsk_buff_del_tail(zc_frag);
> > +			}
> > +		}
> > +
> > +		__xdp_return(page_address(page), mem_info, false, zc_frag);
> 
> and iiuc, this patch is fixing a bug when zc_frag is NULL and
> MEM_TYPE_XSK_BUFF_POOL.

Generally I don't see the need for xdp_return_buff() (which in the end
calls the __xdp_return() being discussed) to handle MEM_TYPE_XSK_BUFF_POOL.
This could be refactored later, and then this fix would probably look
different, but that is out of scope for now.

> 
> > +		return true;
> > +	}
> > +	__shrink_data(xdp, mem_info, frag, shrink);
> > +	return false;
> > +}
> > +
> 
>
John Fastabend Jan. 3, 2024, 8:48 p.m. UTC | #3
Maciej Fijalkowski wrote:
> Currently when packet is shrunk via bpf_xdp_adjust_tail(), null ptr
> dereference happens:
> 
> [1136314.192256] BUG: kernel NULL pointer dereference, address:
> 0000000000000034
> [1136314.203943] #PF: supervisor read access in kernel mode
> [1136314.213768] #PF: error_code(0x0000) - not-present page
> [1136314.223550] PGD 0 P4D 0
> [1136314.230684] Oops: 0000 [#1] PREEMPT SMP NOPTI
> [1136314.239621] CPU: 8 PID: 54203 Comm: xdpsock Not tainted 6.6.0+ #257
> [1136314.250469] Hardware name: Intel Corporation S2600WFT/S2600WFT,
> BIOS SE5C620.86B.02.01.0008.031920191559 03/19/2019
> [1136314.265615] RIP: 0010:__xdp_return+0x6c/0x210
> [1136314.274653] Code: ad 00 48 8b 47 08 49 89 f8 a8 01 0f 85 9b 01 00 00 0f 1f 44 00 00 f0 41 ff 48 34 75 32 4c 89 c7 e9 79 cd 80 ff 83 fe 03 75 17 <f6> 41 34 01 0f 85 02 01 00 00 48 89 cf e9 22 cc 1e 00 e9 3d d2 86
> [1136314.302907] RSP: 0018:ffffc900089f8db0 EFLAGS: 00010246
> [1136314.312967] RAX: ffffc9003168aed0 RBX: ffff8881c3300000 RCX:
> 0000000000000000
> [1136314.324953] RDX: 0000000000000000 RSI: 0000000000000003 RDI:
> ffffc9003168c000
> [1136314.336929] RBP: 0000000000000ae0 R08: 0000000000000002 R09:
> 0000000000010000
> [1136314.348844] R10: ffffc9000e495000 R11: 0000000000000040 R12:
> 0000000000000001
> [1136314.360706] R13: 0000000000000524 R14: ffffc9003168aec0 R15:
> 0000000000000001
> [1136314.373298] FS:  00007f8df8bbcb80(0000) GS:ffff8897e0e00000(0000)
> knlGS:0000000000000000
> [1136314.386105] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [1136314.396532] CR2: 0000000000000034 CR3: 00000001aa912002 CR4:
> 00000000007706f0
> [1136314.408377] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
> 0000000000000000
> [1136314.420173] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
> 0000000000000400
> [1136314.431890] PKRU: 55555554
> [1136314.439143] Call Trace:
> [1136314.446058]  <IRQ>
> [1136314.452465]  ? __die+0x20/0x70
> [1136314.459881]  ? page_fault_oops+0x15b/0x440
> [1136314.468305]  ? exc_page_fault+0x6a/0x150
> [1136314.476491]  ? asm_exc_page_fault+0x22/0x30
> [1136314.484927]  ? __xdp_return+0x6c/0x210
> [1136314.492863]  bpf_xdp_adjust_tail+0x155/0x1d0
> [1136314.501269]  bpf_prog_ccc47ae29d3b6570_xdp_sock_prog+0x15/0x60
> [1136314.511263]  ice_clean_rx_irq_zc+0x206/0xc60 [ice]
> [1136314.520222]  ? ice_xmit_zc+0x6e/0x150 [ice]
> [1136314.528506]  ice_napi_poll+0x467/0x670 [ice]
> [1136314.536858]  ? ttwu_do_activate.constprop.0+0x8f/0x1a0
> [1136314.546010]  __napi_poll+0x29/0x1b0
> [1136314.553462]  net_rx_action+0x133/0x270
> [1136314.561619]  __do_softirq+0xbe/0x28e
> [1136314.569303]  do_softirq+0x3f/0x60
> 
> This comes from __xdp_return() call with xdp_buff argument passed as
> NULL which is supposed to be consumed by xsk_buff_free() call.
> 
> To address this properly, in ZC case, a node that represents the frag
> being removed has to be pulled out of xskb_list. Introduce

hmm it looks like xsk_buff_free() called by __xdp_return would
pull the frag out of the xskb_list? Or am I wrong?

Then the issue is primarily the NULL handling?

> appriopriate xsk helpers to do such node operation and use them
> accordingly within bpf_xdp_adjust_tail().
> 
> Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX")
> Acked-by: Magnus Karlsson <magnus.karlsson@intel.com> # For the xsk header part
> Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> ---
>  include/net/xdp_sock_drv.h | 26 +++++++++++++++++++++
>  net/core/filter.c          | 48 +++++++++++++++++++++++++++++++-------
>  2 files changed, 65 insertions(+), 9 deletions(-)
> 
> diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
> index b62bb8525a5f..3d35ac0f838b 100644
> --- a/include/net/xdp_sock_drv.h
> +++ b/include/net/xdp_sock_drv.h
> @@ -159,6 +159,23 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first)
>  	return ret;
>  }
>  
> +static inline void xsk_buff_del_tail(struct xdp_buff *tail)
> +{
> +	struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp);
> +
> +	list_del(&xskb->xskb_list_node);
> +}
> +
> +static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first)
> +{
> +	struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp);
> +	struct xdp_buff_xsk *frag;
> +
> +	frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk,
> +			       xskb_list_node);
> +	return &frag->xdp;
> +}
> +
>  static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size)
>  {
>  	xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM;
> @@ -350,6 +367,15 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first)
>  	return NULL;
>  }
>  
> +static inline void xsk_buff_del_tail(struct xdp_buff *tail)
> +{
> +}
> +
> +static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first)
> +{
> +	return NULL;
> +}
> +
>  static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size)
>  {
>  }
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 24061f29c9dd..1e20196687fd 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -83,6 +83,7 @@
>  #include <net/netfilter/nf_conntrack_bpf.h>
>  #include <net/netkit.h>
>  #include <linux/un.h>
> +#include <net/xdp_sock_drv.h>
>  
>  #include "dev.h"
>  
> @@ -4096,6 +4097,42 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
>  	return 0;
>  }
>  
> +static void __shrink_data(struct xdp_buff *xdp, struct xdp_mem_info *mem_info,
> +			  skb_frag_t *frag, int shrink)
> +{
> +	if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) {
> +		struct xdp_buff *tail = xsk_buff_get_tail(xdp);
> +
> +		if (tail)
> +			tail->data_end -= shrink;
> +	}
> +	skb_frag_size_sub(frag, shrink);
> +}
> +
> +static bool shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, int shrink)
> +{
> +	struct xdp_mem_info *mem_info = &xdp->rxq->mem;
> +
> +	if (skb_frag_size(frag) == shrink) {
> +		struct page *page = skb_frag_page(frag);
> +		struct xdp_buff *zc_frag = NULL;
> +
> +		if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) {
> +			zc_frag = xsk_buff_get_tail(xdp);
> +
> +			if (zc_frag) {
> +				xdp_buff_clear_frags_flag(zc_frag);
> +				xsk_buff_del_tail(zc_frag);
> +			}
> +		}

Should this be fixed in xdp_return instead of here? The xdp_return
is doing what xsk_buff_del_tail() does. If we also called clear_frags
there could this be simpler?

 if (skb_frag_size(frag) == shrink) {
	struct page *page = skb_frag_page(frag);

	__xdp_return(page_address(page), mem_info, false, xsk_buff_get_tail(xdp));
 } else {
   __shrink_data(xdp, mem_info, frag, shrink);
 }

the return will need to have an unlikely(!xdp) to guard the case it
might be NULL, but also not sure if we would ever expect a NULL
here if MEM_TYPE_XSK_BUFF_POOL so you might skip that unlikely
as well?

> +
> +		__xdp_return(page_address(page), mem_info, false, zc_frag);
> +		return true;
> +	}
> +	__shrink_data(xdp, mem_info, frag, shrink);
> +	return false;
> +}
> +
>  static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset)
>  {
>  	struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
> @@ -4110,17 +4147,10 @@ static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset)
>  
>  		len_free += shrink;
>  		offset -= shrink;
> -
> -		if (skb_frag_size(frag) == shrink) {
> -			struct page *page = skb_frag_page(frag);
> -

And then I likely would avoid the helper altogether? And code
example above just lands here?

> -			__xdp_return(page_address(page), &xdp->rxq->mem,
> -				     false, NULL);
> +		if (shrink_data(xdp, frag, shrink))
>  			n_frags_free++;
> -		} else {
> -			skb_frag_size_sub(frag, shrink);
> +		else
>  			break;
> -		}
>  	}

I think the fix can be more straight-forward if we just populate
the NULL field with the xdp_buff using the get_tail() helper
created above.

>  	sinfo->nr_frags -= n_frags_free;
>  	sinfo->xdp_frags_size -= len_free;
> -- 
> 2.34.1
> 
>
Martin KaFai Lau Jan. 3, 2024, 10:53 p.m. UTC | #4
On 1/3/24 4:04 AM, Maciej Fijalkowski wrote:
> On Tue, Jan 02, 2024 at 02:58:00PM -0800, Martin KaFai Lau wrote:
>> On 12/21/23 5:26 AM, Maciej Fijalkowski wrote:
>>> This comes from __xdp_return() call with xdp_buff argument passed as
>>> NULL which is supposed to be consumed by xsk_buff_free() call.
>>>
>>> To address this properly, in ZC case, a node that represents the frag
>>> being removed has to be pulled out of xskb_list. Introduce
>>> appriopriate xsk helpers to do such node operation and use them
>>> accordingly within bpf_xdp_adjust_tail().
>>
>> [ ... ]
>>
>>> +static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first)
>>> +{
>>> +	struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp);
>>> +	struct xdp_buff_xsk *frag;
>>> +
>>> +	frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk,
>>> +			       xskb_list_node);
>>> +	return &frag->xdp;
>>> +}
>>> +
>>
>> [ ... ]
>>
>>> +static void __shrink_data(struct xdp_buff *xdp, struct xdp_mem_info *mem_info,
>>> +			  skb_frag_t *frag, int shrink)
>>> +{
>>> +	if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) {
>>> +		struct xdp_buff *tail = xsk_buff_get_tail(xdp);
>>> +
>>> +		if (tail)
>>> +			tail->data_end -= shrink;
>>> +	}
>>> +	skb_frag_size_sub(frag, shrink);
>>> +}
>>> +
>>> +static bool shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, int shrink)
>>> +{
>>> +	struct xdp_mem_info *mem_info = &xdp->rxq->mem;
>>> +
>>> +	if (skb_frag_size(frag) == shrink) {
>>> +		struct page *page = skb_frag_page(frag);
>>> +		struct xdp_buff *zc_frag = NULL;
>>> +
>>> +		if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) {
>>> +			zc_frag = xsk_buff_get_tail(xdp);
>>> +
>>> +			if (zc_frag) {
>>
>> Based on the xsk_buff_get_tail(), would zc_frag ever be NULL?
> 
> Hey Martin thanks for taking a look, I had to do this in order to satisfy
> !CONFIG_XDP_SOCKETS builds :/

Is there a compilation/checker warning if it does not check for NULL?

hmm... but it still should not reach here in the runtime and call 
xsk_buff_get_tail() in the !CONFIG_XDP_SOCKETS build. Can the NULL test on the 
get_tail() return value be removed? The above "mem_info->type == 
MEM_TYPE_XSK_BUFF_POOL" should have avoided the get_tail() call for the 
!CONFIG_XDP_SOCKETS build. Otherwise, it could be passing NULL to the 
__xdp_return() and hit the same bug again. The NULL check here is pretty hard to 
reason logically.

> 
>>
>>> +				xdp_buff_clear_frags_flag(zc_frag);
>>> +				xsk_buff_del_tail(zc_frag);
>>> +			}
>>> +		}
>>> +
>>> +		__xdp_return(page_address(page), mem_info, false, zc_frag);
>>
>> and iiuc, this patch is fixing a bug when zc_frag is NULL and
>> MEM_TYPE_XSK_BUFF_POOL.
> 
> Generally I don't see the need for xdp_return_buff() (which calls in the
> end __xdp_return() being discussed) to handle MEM_TYPE_XSK_BUFF_POOL, this
> could be refactored later and then probably this fix would look different,
> but this is out of the scope now.
> 
>>
>>> +		return true;
>>> +	}
>>> +	__shrink_data(xdp, mem_info, frag, shrink);
>>> +	return false;
>>> +}
>>> +
>>
>>
>
Fijalkowski, Maciej Jan. 4, 2024, 8:18 p.m. UTC | #5
On Wed, Jan 03, 2024 at 12:48:10PM -0800, John Fastabend wrote:
> Maciej Fijalkowski wrote:
> > Currently when packet is shrunk via bpf_xdp_adjust_tail(), null ptr
> > dereference happens:
> > 
> > [1136314.192256] BUG: kernel NULL pointer dereference, address:
> > 0000000000000034
> > [1136314.203943] #PF: supervisor read access in kernel mode
> > [1136314.213768] #PF: error_code(0x0000) - not-present page
> > [1136314.223550] PGD 0 P4D 0
> > [1136314.230684] Oops: 0000 [#1] PREEMPT SMP NOPTI
> > [1136314.239621] CPU: 8 PID: 54203 Comm: xdpsock Not tainted 6.6.0+ #257
> > [1136314.250469] Hardware name: Intel Corporation S2600WFT/S2600WFT,
> > BIOS SE5C620.86B.02.01.0008.031920191559 03/19/2019
> > [1136314.265615] RIP: 0010:__xdp_return+0x6c/0x210
> > [1136314.274653] Code: ad 00 48 8b 47 08 49 89 f8 a8 01 0f 85 9b 01 00 00 0f 1f 44 00 00 f0 41 ff 48 34 75 32 4c 89 c7 e9 79 cd 80 ff 83 fe 03 75 17 <f6> 41 34 01 0f 85 02 01 00 00 48 89 cf e9 22 cc 1e 00 e9 3d d2 86
> > [1136314.302907] RSP: 0018:ffffc900089f8db0 EFLAGS: 00010246
> > [1136314.312967] RAX: ffffc9003168aed0 RBX: ffff8881c3300000 RCX:
> > 0000000000000000
> > [1136314.324953] RDX: 0000000000000000 RSI: 0000000000000003 RDI:
> > ffffc9003168c000
> > [1136314.336929] RBP: 0000000000000ae0 R08: 0000000000000002 R09:
> > 0000000000010000
> > [1136314.348844] R10: ffffc9000e495000 R11: 0000000000000040 R12:
> > 0000000000000001
> > [1136314.360706] R13: 0000000000000524 R14: ffffc9003168aec0 R15:
> > 0000000000000001
> > [1136314.373298] FS:  00007f8df8bbcb80(0000) GS:ffff8897e0e00000(0000)
> > knlGS:0000000000000000
> > [1136314.386105] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > [1136314.396532] CR2: 0000000000000034 CR3: 00000001aa912002 CR4:
> > 00000000007706f0
> > [1136314.408377] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
> > 0000000000000000
> > [1136314.420173] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
> > 0000000000000400
> > [1136314.431890] PKRU: 55555554
> > [1136314.439143] Call Trace:
> > [1136314.446058]  <IRQ>
> > [1136314.452465]  ? __die+0x20/0x70
> > [1136314.459881]  ? page_fault_oops+0x15b/0x440
> > [1136314.468305]  ? exc_page_fault+0x6a/0x150
> > [1136314.476491]  ? asm_exc_page_fault+0x22/0x30
> > [1136314.484927]  ? __xdp_return+0x6c/0x210
> > [1136314.492863]  bpf_xdp_adjust_tail+0x155/0x1d0
> > [1136314.501269]  bpf_prog_ccc47ae29d3b6570_xdp_sock_prog+0x15/0x60
> > [1136314.511263]  ice_clean_rx_irq_zc+0x206/0xc60 [ice]
> > [1136314.520222]  ? ice_xmit_zc+0x6e/0x150 [ice]
> > [1136314.528506]  ice_napi_poll+0x467/0x670 [ice]
> > [1136314.536858]  ? ttwu_do_activate.constprop.0+0x8f/0x1a0
> > [1136314.546010]  __napi_poll+0x29/0x1b0
> > [1136314.553462]  net_rx_action+0x133/0x270
> > [1136314.561619]  __do_softirq+0xbe/0x28e
> > [1136314.569303]  do_softirq+0x3f/0x60
> > 
> > This comes from __xdp_return() call with xdp_buff argument passed as
> > NULL which is supposed to be consumed by xsk_buff_free() call.
> > 
> > To address this properly, in ZC case, a node that represents the frag
> > being removed has to be pulled out of xskb_list. Introduce
> 
> hmm it looks like xsk_buff_free() called by __xdp_return would
> pull the frag out of the xskb_list? Or am I wrong?
> 
> Then the issue is primarily the NULL handling?

Hey John, as you see later on it is also about adjusting the size within
xdp_buff that comes from xsk pool, in case when offset is not bigger than
frag size.

> 
> > appriopriate xsk helpers to do such node operation and use them
> > accordingly within bpf_xdp_adjust_tail().
> > 
> > Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX")
> > Acked-by: Magnus Karlsson <magnus.karlsson@intel.com> # For the xsk header part
> > Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> > ---
> >  include/net/xdp_sock_drv.h | 26 +++++++++++++++++++++
> >  net/core/filter.c          | 48 +++++++++++++++++++++++++++++++-------
> >  2 files changed, 65 insertions(+), 9 deletions(-)
> > 
> > diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
> > index b62bb8525a5f..3d35ac0f838b 100644
> > --- a/include/net/xdp_sock_drv.h
> > +++ b/include/net/xdp_sock_drv.h
> > @@ -159,6 +159,23 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first)
> >  	return ret;
> >  }
> >  
> > +static inline void xsk_buff_del_tail(struct xdp_buff *tail)
> > +{
> > +	struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp);
> > +
> > +	list_del(&xskb->xskb_list_node);
> > +}
> > +
> > +static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first)
> > +{
> > +	struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp);
> > +	struct xdp_buff_xsk *frag;
> > +
> > +	frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk,
> > +			       xskb_list_node);
> > +	return &frag->xdp;
> > +}
> > +
> >  static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size)
> >  {
> >  	xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM;
> > @@ -350,6 +367,15 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first)
> >  	return NULL;
> >  }
> >  
> > +static inline void xsk_buff_del_tail(struct xdp_buff *tail)
> > +{
> > +}
> > +
> > +static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first)
> > +{
> > +	return NULL;
> > +}
> > +
> >  static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size)
> >  {
> >  }
> > diff --git a/net/core/filter.c b/net/core/filter.c
> > index 24061f29c9dd..1e20196687fd 100644
> > --- a/net/core/filter.c
> > +++ b/net/core/filter.c
> > @@ -83,6 +83,7 @@
> >  #include <net/netfilter/nf_conntrack_bpf.h>
> >  #include <net/netkit.h>
> >  #include <linux/un.h>
> > +#include <net/xdp_sock_drv.h>
> >  
> >  #include "dev.h"
> >  
> > @@ -4096,6 +4097,42 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
> >  	return 0;
> >  }
> >  
> > +static void __shrink_data(struct xdp_buff *xdp, struct xdp_mem_info *mem_info,
> > +			  skb_frag_t *frag, int shrink)
> > +{
> > +	if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) {
> > +		struct xdp_buff *tail = xsk_buff_get_tail(xdp);
> > +
> > +		if (tail)
> > +			tail->data_end -= shrink;
> > +	}
> > +	skb_frag_size_sub(frag, shrink);
> > +}
> > +
> > +static bool shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, int shrink)
> > +{
> > +	struct xdp_mem_info *mem_info = &xdp->rxq->mem;
> > +
> > +	if (skb_frag_size(frag) == shrink) {
> > +		struct page *page = skb_frag_page(frag);
> > +		struct xdp_buff *zc_frag = NULL;
> > +
> > +		if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) {
> > +			zc_frag = xsk_buff_get_tail(xdp);
> > +
> > +			if (zc_frag) {
> > +				xdp_buff_clear_frags_flag(zc_frag);
> > +				xsk_buff_del_tail(zc_frag);
> > +			}
> > +		}
> 
> Should this be fixed in xdp_return instead of here? The xdp_return
> is doing what xsk_buff_del_tail() does. If we also called clear_frags
> there could this be simpler?

xsk_buff_del_tail() only deletes the node from xskb_list, and xsk_buff_free()
in the end will call xp_free() on the frag being deleted.

I think I would be rather leaning towards adding xp_free() to
xsk_buff_del_tail() and skipping __xdp_return() call from ZC case
altogether...

> 
>  if (skb_frag_size(frag) == shrink) {
> 	struct page *page = skb_frag_page(frag);
> 
> 	__xdp_return(page_address(page), mem_info, false, xsk_buff_get_tail(xdp));
>  } else {
>    __shrink_data(xdp, mem_info, frag, shrink);
>  }
> 
> the return will need to have an unlikely(!xdp) to guard the case it
> might be NULL, but also not sure if we would ever expect a NULL
> here if MEM_TYPE_XSK_BUFF_POOL so you might skip that unlikely
> as well?

In that approach you would xp_free() the frag being removed but it would
still be dangling in xskb_list that first xdp_buff carries. Some
adjustments would have to be done within xsk_buff_free().

Regarding the NULLness of zc_frag for MEM_TYPE_XSK_BUFF_POOL that Martin
also brought up, you are right, I went too far with this. I misread what
the kernel test robot reported :) so on Monday I will look into this.

> 
> > +
> > +		__xdp_return(page_address(page), mem_info, false, zc_frag);
> > +		return true;
> > +	}
> > +	__shrink_data(xdp, mem_info, frag, shrink);
> > +	return false;
> > +}
> > +
> >  static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset)
> >  {
> >  	struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
> > @@ -4110,17 +4147,10 @@ static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset)
> >  
> >  		len_free += shrink;
> >  		offset -= shrink;
> > -
> > -		if (skb_frag_size(frag) == shrink) {
> > -			struct page *page = skb_frag_page(frag);
> > -
> 
> And then I likely would avoid the helper altogether? And code
> example above just lands here?
> 
> > -			__xdp_return(page_address(page), &xdp->rxq->mem,
> > -				     false, NULL);
> > +		if (shrink_data(xdp, frag, shrink))
> >  			n_frags_free++;
> > -		} else {
> > -			skb_frag_size_sub(frag, shrink);
> > +		else
> >  			break;
> > -		}
> >  	}
> 
> I think the fix can be more straight-forward if we just populate
> the NULL field with the xdp_buff using the get_tail() helper
> created above.

I'll think about both approaches, the one you're suggesting and the other
that I wrote up above. Thanks a lot for taking a look!

> 
> >  	sinfo->nr_frags -= n_frags_free;
> >  	sinfo->xdp_frags_size -= len_free;
> > -- 
> > 2.34.1
> > 
> > 
> 
> 
>
Fijalkowski, Maciej Jan. 4, 2024, 8:23 p.m. UTC | #6
On Wed, Jan 03, 2024 at 02:53:20PM -0800, Martin KaFai Lau wrote:
> On 1/3/24 4:04 AM, Maciej Fijalkowski wrote:
> > On Tue, Jan 02, 2024 at 02:58:00PM -0800, Martin KaFai Lau wrote:
> > > On 12/21/23 5:26 AM, Maciej Fijalkowski wrote:
> > > > This comes from __xdp_return() call with xdp_buff argument passed as
> > > > NULL which is supposed to be consumed by xsk_buff_free() call.
> > > > 
> > > > To address this properly, in ZC case, a node that represents the frag
> > > > being removed has to be pulled out of xskb_list. Introduce
> > > > appropriate xsk helpers to do such node operation and use them
> > > > accordingly within bpf_xdp_adjust_tail().
> > > 
> > > [ ... ]
> > > 
> > > > +static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first)
> > > > +{
> > > > +	struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp);
> > > > +	struct xdp_buff_xsk *frag;
> > > > +
> > > > +	frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk,
> > > > +			       xskb_list_node);
> > > > +	return &frag->xdp;
> > > > +}
> > > > +
> > > 
> > > [ ... ]
> > > 
> > > > +static void __shrink_data(struct xdp_buff *xdp, struct xdp_mem_info *mem_info,
> > > > +			  skb_frag_t *frag, int shrink)
> > > > +{
> > > > +	if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) {
> > > > +		struct xdp_buff *tail = xsk_buff_get_tail(xdp);
> > > > +
> > > > +		if (tail)
> > > > +			tail->data_end -= shrink;
> > > > +	}
> > > > +	skb_frag_size_sub(frag, shrink);
> > > > +}
> > > > +
> > > > +static bool shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, int shrink)
> > > > +{
> > > > +	struct xdp_mem_info *mem_info = &xdp->rxq->mem;
> > > > +
> > > > +	if (skb_frag_size(frag) == shrink) {
> > > > +		struct page *page = skb_frag_page(frag);
> > > > +		struct xdp_buff *zc_frag = NULL;
> > > > +
> > > > +		if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) {
> > > > +			zc_frag = xsk_buff_get_tail(xdp);
> > > > +
> > > > +			if (zc_frag) {
> > > 
> > > Based on the xsk_buff_get_tail(), would zc_frag ever be NULL?
> > 
> > Hey Martin thanks for taking a look, I had to do this in order to satisfy
> > !CONFIG_XDP_SOCKETS builds :/
> 
> Is there a compilation/checker warning if it does not check for NULL?
> 
> hmm... but it still should not reach here in the runtime and call
> xsk_buff_get_tail() in the !CONFIG_XDP_SOCKETS build. Can the NULL test on
> the get_tail() return value be removed? The above "mem_info->type ==
> MEM_TYPE_XSK_BUFF_POOL" should have avoided the get_tail() call for the
> !CONFIG_XDP_SOCKETS build. Otherwise, it could be passing NULL to the
> __xdp_return() and hit the same bug again. The NULL check here is pretty
> hard to reason logically.

Thanks for bringing this up, you are of course right. I'll address that.

> 
> > 
> > > 
> > > > +				xdp_buff_clear_frags_flag(zc_frag);
> > > > +				xsk_buff_del_tail(zc_frag);
> > > > +			}
> > > > +		}
> > > > +
> > > > +		__xdp_return(page_address(page), mem_info, false, zc_frag);
> > > 
> > > and iiuc, this patch is fixing a bug when zc_frag is NULL and
> > > MEM_TYPE_XSK_BUFF_POOL.
> > 
> > Generally I don't see the need for xdp_return_buff() (which in the end
> > calls the __xdp_return() being discussed) to handle
> > MEM_TYPE_XSK_BUFF_POOL. This could be refactored later, and then this fix
> > would probably look different, but that is out of scope for now.
> > 
> > > 
> > > > +		return true;
> > > > +	}
> > > > +	__shrink_data(xdp, mem_info, frag, shrink);
> > > > +	return false;
> > > > +}
> > > > +
> > > 
> > > 
> > 
> 
>
diff mbox series

Patch

diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
index b62bb8525a5f..3d35ac0f838b 100644
--- a/include/net/xdp_sock_drv.h
+++ b/include/net/xdp_sock_drv.h
@@ -159,6 +159,23 @@  static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first)
 	return ret;
 }
 
+static inline void xsk_buff_del_tail(struct xdp_buff *tail)
+{
+	struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp);
+
+	list_del(&xskb->xskb_list_node);
+}
+
+static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first)
+{
+	struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp);
+	struct xdp_buff_xsk *frag;
+
+	frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk,
+			       xskb_list_node);
+	return &frag->xdp;
+}
+
 static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size)
 {
 	xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM;
@@ -350,6 +367,15 @@  static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first)
 	return NULL;
 }
 
+static inline void xsk_buff_del_tail(struct xdp_buff *tail)
+{
+}
+
+static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first)
+{
+	return NULL;
+}
+
 static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size)
 {
 }
diff --git a/net/core/filter.c b/net/core/filter.c
index 24061f29c9dd..1e20196687fd 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -83,6 +83,7 @@ 
 #include <net/netfilter/nf_conntrack_bpf.h>
 #include <net/netkit.h>
 #include <linux/un.h>
+#include <net/xdp_sock_drv.h>
 
 #include "dev.h"
 
@@ -4096,6 +4097,42 @@  static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
 	return 0;
 }
 
+static void __shrink_data(struct xdp_buff *xdp, struct xdp_mem_info *mem_info,
+			  skb_frag_t *frag, int shrink)
+{
+	if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) {
+		struct xdp_buff *tail = xsk_buff_get_tail(xdp);
+
+		if (tail)
+			tail->data_end -= shrink;
+	}
+	skb_frag_size_sub(frag, shrink);
+}
+
+static bool shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, int shrink)
+{
+	struct xdp_mem_info *mem_info = &xdp->rxq->mem;
+
+	if (skb_frag_size(frag) == shrink) {
+		struct page *page = skb_frag_page(frag);
+		struct xdp_buff *zc_frag = NULL;
+
+		if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) {
+			zc_frag = xsk_buff_get_tail(xdp);
+
+			if (zc_frag) {
+				xdp_buff_clear_frags_flag(zc_frag);
+				xsk_buff_del_tail(zc_frag);
+			}
+		}
+
+		__xdp_return(page_address(page), mem_info, false, zc_frag);
+		return true;
+	}
+	__shrink_data(xdp, mem_info, frag, shrink);
+	return false;
+}
+
 static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset)
 {
 	struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
@@ -4110,17 +4147,10 @@  static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset)
 
 		len_free += shrink;
 		offset -= shrink;
-
-		if (skb_frag_size(frag) == shrink) {
-			struct page *page = skb_frag_page(frag);
-
-			__xdp_return(page_address(page), &xdp->rxq->mem,
-				     false, NULL);
+		if (shrink_data(xdp, frag, shrink))
 			n_frags_free++;
-		} else {
-			skb_frag_size_sub(frag, shrink);
+		else
 			break;
-		}
 	}
 	sinfo->nr_frags -= n_frags_free;
 	sinfo->xdp_frags_size -= len_free;