[bpf,2/2] tcp_bpf: add sk_rmem_alloc related logic for ingress redirection

Message ID: 20241017005742.3374075-3-zijianzhang@bytedance.com
State: Needs ACK
Delegated to: BPF
Series: tcp_bpf: update the rmem scheduling for ingress redirection

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present fail Series targets non-next tree, but doesn't contain any Fixes tags
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 9 this patch: 9
netdev/build_tools success Errors and warnings before: 0 (+1) this patch: 0 (+1)
netdev/cc_maintainers success CCed 8 of 8 maintainers
netdev/build_clang success Errors and warnings before: 7 this patch: 7
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 44 this patch: 44
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 60 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-PR success PR summary

Commit Message

Zijian Zhang Oct. 17, 2024, 12:57 a.m. UTC
From: Zijian Zhang <zijianzhang@bytedance.com>

Although bpf_tcp_ingress() calls sk_rmem_schedule() and adds the sk_msg to
the ingress_msg queue of sk_redir, it does not update sk_rmem_alloc. As a
result, aside from the global memory limit, the receive memory of sk_redir
is effectively unlimited.

Thus, add sk_rmem_alloc related logic to limit the receive buffer.

Signed-off-by: Zijian Zhang <zijianzhang@bytedance.com>
---
 include/linux/skmsg.h | 11 ++++++++---
 net/core/skmsg.c      |  6 +++++-
 net/ipv4/tcp_bpf.c    |  4 +++-
 3 files changed, 16 insertions(+), 5 deletions(-)
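
[Editor's note] For context: elsewhere in the stack, the per-socket receive
limit is enforced by comparing sk_rmem_alloc against sk_rcvbuf. A minimal
sketch of that pattern follows (an illustration in the spirit of the checks
in sock_queue_rcv_skb() and tcp_try_rmem_schedule(), not code from this
series):

	/* Hypothetical helper, for illustration only: the usual per-socket
	 * rmem backpressure check.  Because bpf_tcp_ingress() never charged
	 * sk_rmem_alloc, a check like this could not trip for msgs
	 * redirected to ingress, leaving only the global memory limit.
	 */
	static bool sk_rmem_over_rcvbuf(const struct sock *sk)
	{
		return atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf);
	}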

Comments

Cong Wang Nov. 8, 2024, 4:04 a.m. UTC | #1
On Thu, Oct 17, 2024 at 12:57:42AM +0000, zijianzhang@bytedance.com wrote:
> From: Zijian Zhang <zijianzhang@bytedance.com>
> 
> Although bpf_tcp_ingress() calls sk_rmem_schedule() and adds the sk_msg to
> the ingress_msg queue of sk_redir, it does not update sk_rmem_alloc. As a
> result, aside from the global memory limit, the receive memory of sk_redir
> is effectively unlimited.
> 
> Thus, add sk_rmem_alloc related logic to limit the receive buffer.
> 
> Signed-off-by: Zijian Zhang <zijianzhang@bytedance.com>
> ---
>  include/linux/skmsg.h | 11 ++++++++---
>  net/core/skmsg.c      |  6 +++++-
>  net/ipv4/tcp_bpf.c    |  4 +++-
>  3 files changed, 16 insertions(+), 5 deletions(-)
> 
> diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
> index d9b03e0746e7..2cbe0c22a32f 100644
> --- a/include/linux/skmsg.h
> +++ b/include/linux/skmsg.h
> @@ -317,17 +317,22 @@ static inline void sock_drop(struct sock *sk, struct sk_buff *skb)
>  	kfree_skb(skb);
>  }
>  
> -static inline void sk_psock_queue_msg(struct sk_psock *psock,
> +static inline bool sk_psock_queue_msg(struct sk_psock *psock,
>  				      struct sk_msg *msg)
>  {
> +	bool ret;
> +
>  	spin_lock_bh(&psock->ingress_lock);
> -	if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
> +	if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
>  		list_add_tail(&msg->list, &psock->ingress_msg);
> -	else {
> +		ret = true;
> +	} else {
>  		sk_msg_free(psock->sk, msg);
>  		kfree(msg);
> +		ret = false;
>  	}
>  	spin_unlock_bh(&psock->ingress_lock);
> +	return ret;
>  }
>  
>  static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock)
> diff --git a/net/core/skmsg.c b/net/core/skmsg.c
> index b1dcbd3be89e..110ee0abcfe0 100644
> --- a/net/core/skmsg.c
> +++ b/net/core/skmsg.c
> @@ -445,8 +445,10 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
>  			if (likely(!peek)) {
>  				sge->offset += copy;
>  				sge->length -= copy;
> -				if (!msg_rx->skb)
> +				if (!msg_rx->skb) {
>  					sk_mem_uncharge(sk, copy);
> +					atomic_sub(copy, &sk->sk_rmem_alloc);
> +				}
>  				msg_rx->sg.size -= copy;
>  
>  				if (!sge->length) {
> @@ -772,6 +774,8 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
>  
>  	list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) {
>  		list_del(&msg->list);
> +		if (!msg->skb)
> +			atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc);
>  		sk_msg_free(psock->sk, msg);

Why not call this atomic_sub() in sk_msg_free_elem()?

Thanks.
Zijian Zhang Nov. 8, 2024, 6:28 p.m. UTC | #2
On 11/7/24 8:04 PM, Cong Wang wrote:
> On Thu, Oct 17, 2024 at 12:57:42AM +0000, zijianzhang@bytedance.com wrote:
>> [...]
>> @@ -772,6 +774,8 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
>>   
>>   	list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) {
>>   		list_del(&msg->list);
>> +		if (!msg->skb)
>> +			atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc);
>>   		sk_msg_free(psock->sk, msg);
> 
> Why not call this atomic_sub() in sk_msg_free_elem()?
> 
> Thanks.

sk_msg_free_elem(), called by sk_msg_free() or sk_msg_free_no_charge(),
is invoked in multiple locations, including the TX, RX, and error paths.

We should call atomic_sub() on sk->sk_rmem_alloc only for sk_msgs that
were previously atomic_add()'ed. In other words, we need to call
atomic_sub() only for sk_msgs on ingress_msg.

As for the "!msg->skb" check here, I want to make sure the sk_msg did
not come from sk_psock_skb_ingress_enqueue(), because those sk_msgs'
rmem accounting has already been handled by skb_set_owner_r() in
sk_psock_skb_ingress().
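
[Editor's note] Concretely, the pairing described above can be read as a
hypothetical uncharge helper mirroring the charge in bpf_tcp_ingress()
(a sketch only; the patch open-codes this logic at its call sites):

	/* Hypothetical helper (not in the patch).  Uncharge only what
	 * bpf_tcp_ingress() charged: msgs without a backing skb.
	 * skb-backed msgs from sk_psock_skb_ingress_enqueue() are already
	 * accounted via skb_set_owner_r()/sock_rfree().
	 */
	static void sk_msg_rmem_uncharge(struct sock *sk, struct sk_msg *msg)
	{
		if (!msg->skb)
			atomic_sub(msg->sg.size, &sk->sk_rmem_alloc);
	}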
John Fastabend Nov. 21, 2024, 6:07 a.m. UTC | #3
Zijian Zhang wrote:
> 
> On 11/7/24 8:04 PM, Cong Wang wrote:
> > On Thu, Oct 17, 2024 at 12:57:42AM +0000, zijianzhang@bytedance.com wrote:
> >> [...]
> >> @@ -772,6 +774,8 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
> >>   
> >>   	list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) {
> >>   		list_del(&msg->list);
> >> +		if (!msg->skb)
> >> +			atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc);
> >>   		sk_msg_free(psock->sk, msg);
> > 
> > Why not call this atomic_sub() in sk_msg_free_elem()?
> > 
> > Thanks.
> 
> sk_msg_free_elem(), called by sk_msg_free() or sk_msg_free_no_charge(),
> is invoked in multiple locations, including the TX, RX, and error paths.
> 
> We should call atomic_sub() on sk->sk_rmem_alloc only for sk_msgs that
> were previously atomic_add()'ed. In other words, we need to call
> atomic_sub() only for sk_msgs on ingress_msg.
> 
> As for the "!msg->skb" check here, I want to make sure the sk_msg did
> not come from sk_psock_skb_ingress_enqueue(), because those sk_msgs'
> rmem accounting has already been handled by skb_set_owner_r() in
> sk_psock_skb_ingress().
> 

Assuming I read the above correctly, this is only an issue when doing a
redirect to the ingress of a socket? The other path, where we do an
SK_PASS directly from the verdict to hit the ingress of the current
socket, is OK because all accounting is already done through the skb.
Basically, that is what the explanation above for !msg->skb is
describing?

Can we add this to the description so we don't forget it and/or have to
look up the mailing list in the future.

Thanks,
John
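
[Editor's note] The skb-backed path mentioned above is already balanced by
the core skb receive accounting; simplified from skb_set_owner_r() in
include/net/sock.h (shown for reference, unchanged by this series):

	static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
	{
		skb_orphan(skb);
		skb->sk = sk;
		skb->destructor = sock_rfree;	/* subtracts truesize on free */
		atomic_add(skb->truesize, &sk->sk_rmem_alloc);
		sk_mem_charge(sk, skb->truesize);
	}

This is why only the !msg->skb case needs the new atomic_add()/atomic_sub()
pair: SK_PASS to the same socket goes through the skb owner/destructor
accounting, while bpf_tcp_ingress() transfers sg pages without an skb.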

Patch

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index d9b03e0746e7..2cbe0c22a32f 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -317,17 +317,22 @@  static inline void sock_drop(struct sock *sk, struct sk_buff *skb)
 	kfree_skb(skb);
 }
 
-static inline void sk_psock_queue_msg(struct sk_psock *psock,
+static inline bool sk_psock_queue_msg(struct sk_psock *psock,
 				      struct sk_msg *msg)
 {
+	bool ret;
+
 	spin_lock_bh(&psock->ingress_lock);
-	if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
+	if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
 		list_add_tail(&msg->list, &psock->ingress_msg);
-	else {
+		ret = true;
+	} else {
 		sk_msg_free(psock->sk, msg);
 		kfree(msg);
+		ret = false;
 	}
 	spin_unlock_bh(&psock->ingress_lock);
+	return ret;
 }
 
 static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock)
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index b1dcbd3be89e..110ee0abcfe0 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -445,8 +445,10 @@  int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
 			if (likely(!peek)) {
 				sge->offset += copy;
 				sge->length -= copy;
-				if (!msg_rx->skb)
+				if (!msg_rx->skb) {
 					sk_mem_uncharge(sk, copy);
+					atomic_sub(copy, &sk->sk_rmem_alloc);
+				}
 				msg_rx->sg.size -= copy;
 
 				if (!sge->length) {
@@ -772,6 +774,8 @@  static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
 
 	list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) {
 		list_del(&msg->list);
+		if (!msg->skb)
+			atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc);
 		sk_msg_free(psock->sk, msg);
 		kfree(msg);
 	}
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 48c412744f77..39155bec746f 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -56,6 +56,7 @@  static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
 		}
 
 		sk_mem_charge(sk, size);
+		atomic_add(size, &sk->sk_rmem_alloc);
 		sk_msg_xfer(tmp, msg, i, size);
 		copied += size;
 		if (sge->length)
@@ -74,7 +75,8 @@  static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
 
 	if (!ret) {
 		msg->sg.start = i;
-		sk_psock_queue_msg(psock, tmp);
+		if (!sk_psock_queue_msg(psock, tmp))
+			atomic_sub(copied, &sk->sk_rmem_alloc);
 		sk_psock_data_ready(sk, psock);
 	} else {
 		sk_msg_free(sk, tmp);