
[v5,bpf-next,4/7] bpf: udp: Implement batching for sockets iterator

Message ID: 20230330151758.531170-5-aditi.ghag@isovalent.com (mailing list archive)
State: Changes Requested
Delegated to: BPF
Series: bpf: Add socket destroy capability

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-5 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-7 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-10 pending Logs for test_maps on s390x with gcc
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 25 this patch: 26
netdev/cc_maintainers warning 6 maintainers not CCed: pabeni@redhat.com dsahern@kernel.org willemdebruijn.kernel@gmail.com kuba@kernel.org netdev@vger.kernel.org davem@davemloft.net
netdev/build_clang success Errors and warnings before: 26 this patch: 18
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 25 this patch: 26
netdev/checkpatch warning WARNING: line length of 84 exceeds 80 columns WARNING: line length of 89 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-31 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-32 success Logs for test_verifier on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-34 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-35 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-8 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-11 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for test_progs on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-18 success Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for test_progs_no_alu32 on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-21 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_progs_no_alu32_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for test_progs_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-29 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-33 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for test_progs on s390x with gcc

Commit Message

Aditi Ghag March 30, 2023, 3:17 p.m. UTC
Batch UDP sockets from the BPF iterator so that overlapping locking
semantics are possible in BPF/kernel helpers executed from BPF programs.
This allows the BPF socket destroy kfunc (introduced by follow-up patches)
to be executed from BPF iterator programs.

Previously, BPF iterators acquired the sock lock and the sockets hash
table bucket lock while executing BPF programs. This prevented BPF helpers
that also acquire these locks from being executed from BPF iterators. With
the batching approach, we acquire the bucket lock, batch all the sockets
in the bucket, and then release the bucket lock. This enables BPF or
kernel helpers to skip sock locking when invoked in the supported BPF
contexts.

The batching logic is similar to the logic implemented in TCP iterator:
https://lore.kernel.org/bpf/20210701200613.1036157-1-kafai@fb.com/.
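
In rough form, the batching pattern looks like the sketch below. This is a
simplified illustration only; batch_then_visit() and BATCH_SZ are made-up
names and are not part of this patch (the real code grows the batch array
instead of capping it):

#define BATCH_SZ 16	/* illustrative only */

/* Hold the bucket lock just long enough to collect socket references,
 * then run the per-socket callback with the bucket lock released, so
 * the callback may take the bucket lock (or the sock lock) itself.
 */
static void batch_then_visit(struct udp_hslot *hslot,
			     void (*visit)(struct sock *sk))
{
	struct sock *batch[BATCH_SZ];
	unsigned int n = 0, i;
	struct sock *sk;

	spin_lock_bh(&hslot->lock);
	udp_portaddr_for_each_entry(sk, &hslot->head) {
		if (n < BATCH_SZ) {
			sock_hold(sk);	/* keep sk alive past the unlock */
			batch[n++] = sk;
		}
	}
	spin_unlock_bh(&hslot->lock);

	for (i = 0; i < n; i++) {
		visit(batch[i]);
		sock_put(batch[i]);
	}
}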

Suggested-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Aditi Ghag <aditi.ghag@isovalent.com>
---
 net/ipv4/udp.c | 230 +++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 213 insertions(+), 17 deletions(-)

Comments

Martin KaFai Lau March 31, 2023, 9:08 p.m. UTC | #1
On 3/30/23 8:17 AM, Aditi Ghag wrote:
> +static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
> +				      unsigned int new_batch_sz);
>   
>   static inline bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
>   {
> @@ -3151,6 +3163,149 @@ static inline bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
>   		net_eq(sock_net(sk), seq_file_net(seq)));
>   }
>   
> +static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
> +{
> +	struct bpf_udp_iter_state *iter = seq->private;
> +	struct udp_iter_state *state = &iter->state;
> +	struct net *net = seq_file_net(seq);
> +	struct sock *first_sk = NULL;
> +	struct udp_seq_afinfo afinfo;
> +	struct udp_table *udptable;
> +	unsigned int batch_sks = 0;
> +	bool resized = false;
> +	struct sock *sk;
> +	int offset = 0;
> +	int new_offset;
> +
> +	/* The current batch is done, so advance the bucket. */
> +	if (iter->st_bucket_done) {
> +		state->bucket++;
> +		iter->offset = 0;
> +	}
> +
> +	afinfo.family = AF_UNSPEC;
> +	afinfo.udp_table = NULL;
> +	udptable = udp_get_table_afinfo(&afinfo, net);
> +
> +	if (state->bucket > udptable->mask) {

This test looks unnecessary. The for-loop below should take care of this case?

> +		state->bucket = 0;

Resetting state->bucket here looks suspicious (or at least unnecessary) as
well. The iterator cannot restart from the beginning, or am I missing
something here? This at least requires a comment if it is really needed.

> +		iter->offset = 0;
> +		return NULL;
> +	}
> +
> +again:
> +	/* New batch for the next bucket.
> +	 * Iterate over the hash table to find a bucket with sockets matching
> +	 * the iterator attributes, and return the first matching socket from
> +	 * the bucket. The remaining matched sockets from the bucket are batched
> +	 * before releasing the bucket lock. This allows BPF programs that are
> +	 * called in seq_show to acquire the bucket lock if needed.
> +	 */
> +	iter->cur_sk = 0;
> +	iter->end_sk = 0;
> +	iter->st_bucket_done = false;
> +	first_sk = NULL;
> +	batch_sks = 0;
> +	offset = iter->offset;
> +
> +	for (; state->bucket <= udptable->mask; state->bucket++) {
> +		struct udp_hslot *hslot2 = &udptable->hash2[state->bucket];
> +
> +		if (hlist_empty(&hslot2->head)) {
> +			offset = 0;
> +			continue;
> +		}
> +		new_offset = offset;
> +
> +		spin_lock_bh(&hslot2->lock);
> +		udp_portaddr_for_each_entry(sk, &hslot2->head) {
> +			if (seq_sk_match(seq, sk)) {
> +				/* Resume from the last iterated socket at the
> +				 * offset in the bucket before iterator was stopped.
> +				 */
> +				if (offset) {
> +					--offset;
> +					continue;
> +				}
> +				if (!first_sk)
> +					first_sk = sk;
> +				if (iter->end_sk < iter->max_sk) {
> +					sock_hold(sk);
> +					iter->batch[iter->end_sk++] = sk;
> +				}
> +				batch_sks++;
> +				new_offset++;
> +			}
> +		}
> +		spin_unlock_bh(&hslot2->lock);
> +
> +		if (first_sk)
> +			break;
> +
> +		/* Reset the current bucket's offset before moving to the next bucket. */
> +		offset = 0;
> +	}
> +
> +	/* All done: no batch made. */
> +	if (!first_sk)

Testing !iter->end_sk should be the same?

> +		goto ret;
> +
> +	if (iter->end_sk == batch_sks) {
> +		/* Batching is done for the current bucket; return the first
> +		 * socket to be iterated from the batch.
> +		 */
> +		iter->st_bucket_done = true;
> +		goto ret;
> +	}
> +	if (!resized && !bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2)) {
> +		resized = true;
> +		/* Go back to the previous bucket to resize its batch. */
> +		state->bucket--;
> +		goto again;
> +	}
> +ret:
> +	iter->offset = new_offset;

hmm... updating iter->offset does not look right, and does it really need
a new_offset?

afaict, either

a) it can resume at the old bucket. In this case, the iter->offset should not 
change.

or

b) it moved to the next bucket and iter->offset should be 0.

> +	return first_sk;

&iter->batch[0] is the first_sk. 'first_sk' variable is not needed then.
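
Put together, these suggestions would shape the tail of bpf_iter_udp_batch()
roughly as in the sketch below (illustrative only, not the code posted in
this series):

	/* All done: no batch made. */
	if (!iter->end_sk)
		return NULL;	/* iter->offset stays untouched */

	if (iter->end_sk == batch_sks) {
		/* The whole bucket fit into the batch; it is safe to
		 * advance past it on the next call.
		 */
		iter->st_bucket_done = true;
		goto done;
	}

	if (!resized && !bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2)) {
		resized = true;
		/* Go back to the previous bucket to re-batch it with a
		 * larger batch array.
		 */
		state->bucket--;
		goto again;
	}
done:
	/* No new_offset bookkeeping: iter->offset is only advanced from
	 * seq_next() while walking the current bucket.
	 */
	return iter->batch[0];
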
Aditi Ghag April 3, 2023, 3:54 p.m. UTC | #2
> On Mar 31, 2023, at 2:08 PM, Martin KaFai Lau <martin.lau@linux.dev> wrote:
> 
> On 3/30/23 8:17 AM, Aditi Ghag wrote:
>> +static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
>> +				      unsigned int new_batch_sz);
>>    static inline bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
>>  {
>> @@ -3151,6 +3163,149 @@ static inline bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
>>  		net_eq(sock_net(sk), seq_file_net(seq)));
>>  }
>>  +static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
>> +{
>> +	struct bpf_udp_iter_state *iter = seq->private;
>> +	struct udp_iter_state *state = &iter->state;
>> +	struct net *net = seq_file_net(seq);
>> +	struct sock *first_sk = NULL;
>> +	struct udp_seq_afinfo afinfo;
>> +	struct udp_table *udptable;
>> +	unsigned int batch_sks = 0;
>> +	bool resized = false;
>> +	struct sock *sk;
>> +	int offset = 0;
>> +	int new_offset;
>> +
>> +	/* The current batch is done, so advance the bucket. */
>> +	if (iter->st_bucket_done) {
>> +		state->bucket++;
>> +		iter->offset = 0;
>> +	}
>> +
>> +	afinfo.family = AF_UNSPEC;
>> +	afinfo.udp_table = NULL;
>> +	udptable = udp_get_table_afinfo(&afinfo, net);
>> +
>> +	if (state->bucket > udptable->mask) {
> 
> This test looks unnecessary. The for-loop below should take care of this case?

We could return early in case the iterator has reached the end of the hash table. I suppose the bucket should only be reset when the user stops and starts a new iterator round.

> 
>> +		state->bucket = 0;
> 
> Reset state->bucket here looks suspicious (or at least unnecessary) also. The iterator cannot restart from the beginning. or I am missing something here? This at least requires a comment if it is really needed. 
> 
>> +		iter->offset = 0;
>> +		return NULL;
>> +	}
>> +
>> +again:
>> +	/* New batch for the next bucket.
>> +	 * Iterate over the hash table to find a bucket with sockets matching
>> +	 * the iterator attributes, and return the first matching socket from
>> +	 * the bucket. The remaining matched sockets from the bucket are batched
>> +	 * before releasing the bucket lock. This allows BPF programs that are
>> +	 * called in seq_show to acquire the bucket lock if needed.
>> +	 */
>> +	iter->cur_sk = 0;
>> +	iter->end_sk = 0;
>> +	iter->st_bucket_done = false;
>> +	first_sk = NULL;
>> +	batch_sks = 0;
>> +	offset = iter->offset;
>> +
>> +	for (; state->bucket <= udptable->mask; state->bucket++) {
>> +		struct udp_hslot *hslot2 = &udptable->hash2[state->bucket];
>> +
>> +		if (hlist_empty(&hslot2->head)) {
>> +			offset = 0;
>> +			continue;
>> +		}
>> +		new_offset = offset;
>> +
>> +		spin_lock_bh(&hslot2->lock);
>> +		udp_portaddr_for_each_entry(sk, &hslot2->head) {
>> +			if (seq_sk_match(seq, sk)) {
>> +				/* Resume from the last iterated socket at the
>> +				 * offset in the bucket before iterator was stopped.
>> +				 */
>> +				if (offset) {
>> +					--offset;
>> +					continue;
>> +				}
>> +				if (!first_sk)
>> +					first_sk = sk;
>> +				if (iter->end_sk < iter->max_sk) {
>> +					sock_hold(sk);
>> +					iter->batch[iter->end_sk++] = sk;
>> +				}
>> +				batch_sks++;
>> +				new_offset++;
>> +			}
>> +		}
>> +		spin_unlock_bh(&hslot2->lock);
>> +
>> +		if (first_sk)
>> +			break;
>> +
>> +		/* Reset the current bucket's offset before moving to the next bucket. */
>> +		offset = 0;
>> +	}
>> +
>> +	/* All done: no batch made. */
>> +	if (!first_sk)
> 
> Testing !iter->end_sk should be the same?

Sure, that could work too.

> 
>> +		goto ret;
>> +
>> +	if (iter->end_sk == batch_sks) {
>> +		/* Batching is done for the current bucket; return the first
>> +		 * socket to be iterated from the batch.
>> +		 */
>> +		iter->st_bucket_done = true;
>> +		goto ret;
>> +	}
>> +	if (!resized && !bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2)) {
>> +		resized = true;
>> +		/* Go back to the previous bucket to resize its batch. */
>> +		state->bucket--;
>> +		goto again;
>> +	}
>> +ret:
>> +	iter->offset = new_offset;
> 
> hmm... updating iter->offset looks not right and,
> does it need a new_offset?
> 

This is a leftover from earlier versions. :( Sorry, I didn't do my due diligence here. 

> afaict, either
> 
> a) it can resume at the old bucket. In this case, the iter->offset should not change.
> 
> or
> 
> b) it moved to the next bucket and iter->offset should be 0.

Yes, that's the behavior I had in mind as well. 

> 
>> +	return first_sk;
> 
> &iter->batch[0] is the first_sk. 'first_sk' variable is not needed then.

It's possible that we didn't find any socket to return, or the resized batch didn't go through, so we can't always rely on iter->batch[0]. As an alternative, we could return early when a socket is found. Either option seems fine.
Martin KaFai Lau April 3, 2023, 7:20 p.m. UTC | #3
On 4/3/23 8:54 AM, Aditi Ghag wrote:
> 
> 
>> On Mar 31, 2023, at 2:08 PM, Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>
>> On 3/30/23 8:17 AM, Aditi Ghag wrote:
>>> +static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
>>> +				      unsigned int new_batch_sz);
>>>     static inline bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
>>>   {
>>> @@ -3151,6 +3163,149 @@ static inline bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
>>>   		net_eq(sock_net(sk), seq_file_net(seq)));
>>>   }
>>>   +static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
>>> +{
>>> +	struct bpf_udp_iter_state *iter = seq->private;
>>> +	struct udp_iter_state *state = &iter->state;
>>> +	struct net *net = seq_file_net(seq);
>>> +	struct sock *first_sk = NULL;
>>> +	struct udp_seq_afinfo afinfo;
>>> +	struct udp_table *udptable;
>>> +	unsigned int batch_sks = 0;
>>> +	bool resized = false;
>>> +	struct sock *sk;
>>> +	int offset = 0;
>>> +	int new_offset;
>>> +
>>> +	/* The current batch is done, so advance the bucket. */
>>> +	if (iter->st_bucket_done) {
>>> +		state->bucket++;
>>> +		iter->offset = 0;
>>> +	}
>>> +
>>> +	afinfo.family = AF_UNSPEC;
>>> +	afinfo.udp_table = NULL;
>>> +	udptable = udp_get_table_afinfo(&afinfo, net);
>>> +
>>> +	if (state->bucket > udptable->mask) {
>>
>> This test looks unnecessary. The for-loop below should take care of this case?
> 
> We could return early in case the iterator has reached the end of the hash table. I suppose reset of the bucket should only happen when user stops, and starts a new iterator round.

The bucket should not go back because bpf-iter cannot lseek back, which is
why I was confused by the bucket reset back to zero below.

It can just fall through to the following for-loop and...

> 
>>
>>> +		state->bucket = 0;
>>
>> Reset state->bucket here looks suspicious (or at least unnecessary) also. The iterator cannot restart from the beginning. or I am missing something here? This at least requires a comment if it is really needed.
>>
>>> +		iter->offset = 0;
>>> +		return NULL;
>>> +	}
>>> +
>>> +again:
>>> +	/* New batch for the next bucket.
>>> +	 * Iterate over the hash table to find a bucket with sockets matching
>>> +	 * the iterator attributes, and return the first matching socket from
>>> +	 * the bucket. The remaining matched sockets from the bucket are batched
>>> +	 * before releasing the bucket lock. This allows BPF programs that are
>>> +	 * called in seq_show to acquire the bucket lock if needed.
>>> +	 */
>>> +	iter->cur_sk = 0;
>>> +	iter->end_sk = 0;
>>> +	iter->st_bucket_done = false;
>>> +	first_sk = NULL;
>>> +	batch_sks = 0;
>>> +	offset = iter->offset;
>>> +
>>> +	for (; state->bucket <= udptable->mask; state->bucket++) {
>>> +		struct udp_hslot *hslot2 = &udptable->hash2[state->bucket];
>>> +
>>> +		if (hlist_empty(&hslot2->head)) {
>>> +			offset = 0;
>>> +			continue;
>>> +		}
>>> +		new_offset = offset;
>>> +
>>> +		spin_lock_bh(&hslot2->lock);
>>> +		udp_portaddr_for_each_entry(sk, &hslot2->head) {
>>> +			if (seq_sk_match(seq, sk)) {
>>> +				/* Resume from the last iterated socket at the
>>> +				 * offset in the bucket before iterator was stopped.
>>> +				 */
>>> +				if (offset) {
>>> +					--offset;
>>> +					continue;
>>> +				}
>>> +				if (!first_sk)
>>> +					first_sk = sk;
>>> +				if (iter->end_sk < iter->max_sk) {
>>> +					sock_hold(sk);
>>> +					iter->batch[iter->end_sk++] = sk;
>>> +				}
>>> +				batch_sks++;
>>> +				new_offset++;
>>> +			}
>>> +		}
>>> +		spin_unlock_bh(&hslot2->lock);
>>> +
>>> +		if (first_sk)
>>> +			break;
>>> +
>>> +		/* Reset the current bucket's offset before moving to the next bucket. */
>>> +		offset = 0;
>>> +	}
>>> +
>>> +	/* All done: no batch made. */
>>> +	if (!first_sk)
>>
>> Testing !iter->end_sk should be the same?
> 
> Sure, that could work too.
> 
>>
>>> +		goto ret;

... return NULL here because new_offset is not needed.

>>> +
>>> +	if (iter->end_sk == batch_sks) {
>>> +		/* Batching is done for the current bucket; return the first
>>> +		 * socket to be iterated from the batch.
>>> +		 */
>>> +		iter->st_bucket_done = true;
>>> +		goto ret;
>>> +	}
>>> +	if (!resized && !bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2)) {
>>> +		resized = true;
>>> +		/* Go back to the previous bucket to resize its batch. */
>>> +		state->bucket--;
>>> +		goto again;
>>> +	}
>>> +ret:
>>> +	iter->offset = new_offset;
>>
>> hmm... updating iter->offset looks not right and,
>> does it need a new_offset?
>>
> 
> This is a leftover from earlier versions. :( Sorry, I didn't do my due diligence here.
> 
>> afaict, either
>>
>> a) it can resume at the old bucket. In this case, the iter->offset should not change.
>>
>> or
>>
>> b) it moved to the next bucket and iter->offset should be 0.
> 
> Yes, that's the behavior I had in mind as well.
> 
>>
>>> +	return first_sk;
>>
>> &iter->batch[0] is the first_sk. 'first_sk' variable is not needed then.
> 
> It's possible that we didn't find any socket to return, or resized batch didn't go through. So we can't always rely on iter->batch[0]. As an alternative, we could return early when a socket is found. Anyway either option seems fine.

yeah, if it didn't find any socket, I was assuming the earlier !first_sk (or 
!iter->end_sk) check should just directly return NULL because there is no need 
to change the iter->offset.

If the resize fails, it will return whatever is in iter->batch[0] anyway.

The reason I was asking to use iter->batch[0] instead of having a first_sk
is that, while trying to understand why 'new_offset++' was needed at all, I
kept coming back to the 'if (!first_sk) first_sk = sk;' in the
'udp_portaddr_for_each_entry' loop above and wondering why it needs to be
done, since first_sk is just an alias of iter->batch[0].
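
For reference, an iterator built on this machinery is consumed from a BPF
program attached to the "udp" iterator target. A minimal sketch of such a
program follows; it is illustrative only, relying on the bpf_iter__udp
context defined in this patch plus the usual vmlinux.h/libbpf headers and
macros:

// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

SEC("iter/udp")
int dump_udp(struct bpf_iter__udp *ctx)
{
	struct seq_file *seq = ctx->meta->seq;
	struct udp_sock *udp_sk = ctx->udp_sk;

	if (!udp_sk)
		return 0;	/* NULL marks the end of the iteration */

	/* Report the hash bucket, owning uid and local port. */
	BPF_SEQ_PRINTF(seq, "bucket %d uid %u lport %u\n",
		       ctx->bucket, ctx->uid,
		       udp_sk->inet.sk.__sk_common.skc_num);
	return 0;
}

Once such a program is pinned (e.g. with bpftool iter pin), reading the
resulting file drives bpf_iter_udp_seq_start()/_next() and therefore the
batching logic discussed above.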

Patch

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index cead4acb64c6..9af23d1c8d6b 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -3140,7 +3140,19 @@  struct bpf_iter__udp {
 	int bucket __aligned(8);
 };
 
+struct bpf_udp_iter_state {
+	struct udp_iter_state state;
+	unsigned int cur_sk;
+	unsigned int end_sk;
+	unsigned int max_sk;
+	int offset;
+	struct sock **batch;
+	bool st_bucket_done;
+};
+
 static unsigned short seq_file_family(const struct seq_file *seq);
+static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
+				      unsigned int new_batch_sz);
 
 static inline bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
 {
@@ -3151,6 +3163,149 @@  static inline bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
 		net_eq(sock_net(sk), seq_file_net(seq)));
 }
 
+static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
+{
+	struct bpf_udp_iter_state *iter = seq->private;
+	struct udp_iter_state *state = &iter->state;
+	struct net *net = seq_file_net(seq);
+	struct sock *first_sk = NULL;
+	struct udp_seq_afinfo afinfo;
+	struct udp_table *udptable;
+	unsigned int batch_sks = 0;
+	bool resized = false;
+	struct sock *sk;
+	int offset = 0;
+	int new_offset;
+
+	/* The current batch is done, so advance the bucket. */
+	if (iter->st_bucket_done) {
+		state->bucket++;
+		iter->offset = 0;
+	}
+
+	afinfo.family = AF_UNSPEC;
+	afinfo.udp_table = NULL;
+	udptable = udp_get_table_afinfo(&afinfo, net);
+
+	if (state->bucket > udptable->mask) {
+		state->bucket = 0;
+		iter->offset = 0;
+		return NULL;
+	}
+
+again:
+	/* New batch for the next bucket.
+	 * Iterate over the hash table to find a bucket with sockets matching
+	 * the iterator attributes, and return the first matching socket from
+	 * the bucket. The remaining matched sockets from the bucket are batched
+	 * before releasing the bucket lock. This allows BPF programs that are
+	 * called in seq_show to acquire the bucket lock if needed.
+	 */
+	iter->cur_sk = 0;
+	iter->end_sk = 0;
+	iter->st_bucket_done = false;
+	first_sk = NULL;
+	batch_sks = 0;
+	offset = iter->offset;
+
+	for (; state->bucket <= udptable->mask; state->bucket++) {
+		struct udp_hslot *hslot2 = &udptable->hash2[state->bucket];
+
+		if (hlist_empty(&hslot2->head)) {
+			offset = 0;
+			continue;
+		}
+		new_offset = offset;
+
+		spin_lock_bh(&hslot2->lock);
+		udp_portaddr_for_each_entry(sk, &hslot2->head) {
+			if (seq_sk_match(seq, sk)) {
+				/* Resume from the last iterated socket at the
+				 * offset in the bucket before iterator was stopped.
+				 */
+				if (offset) {
+					--offset;
+					continue;
+				}
+				if (!first_sk)
+					first_sk = sk;
+				if (iter->end_sk < iter->max_sk) {
+					sock_hold(sk);
+					iter->batch[iter->end_sk++] = sk;
+				}
+				batch_sks++;
+				new_offset++;
+			}
+		}
+		spin_unlock_bh(&hslot2->lock);
+
+		if (first_sk)
+			break;
+
+		/* Reset the current bucket's offset before moving to the next bucket. */
+		offset = 0;
+	}
+
+	/* All done: no batch made. */
+	if (!first_sk)
+		goto ret;
+
+	if (iter->end_sk == batch_sks) {
+		/* Batching is done for the current bucket; return the first
+		 * socket to be iterated from the batch.
+		 */
+		iter->st_bucket_done = true;
+		goto ret;
+	}
+	if (!resized && !bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2)) {
+		resized = true;
+		/* Go back to the previous bucket to resize its batch. */
+		state->bucket--;
+		goto again;
+	}
+ret:
+	iter->offset = new_offset;
+	return first_sk;
+}
+
+static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct bpf_udp_iter_state *iter = seq->private;
+	struct sock *sk;
+
+	/* Whenever seq_next() is called, the iter->cur_sk is
+	 * done with seq_show(), so unref the iter->cur_sk.
+	 */
+	if (iter->cur_sk < iter->end_sk) {
+		sock_put(iter->batch[iter->cur_sk++]);
+		++iter->offset;
+	}
+
+	/* After updating iter->cur_sk, check if there are more sockets
+	 * available in the current bucket batch.
+	 */
+	if (iter->cur_sk < iter->end_sk) {
+		sk = iter->batch[iter->cur_sk];
+	} else {
+		/* Prepare a new batch. */
+		sk = bpf_iter_udp_batch(seq);
+	}
+
+	++*pos;
+	return sk;
+}
+
+static void *bpf_iter_udp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	/* bpf iter does not support lseek, so it always
+	 * continue from where it was stop()-ped.
+	 */
+	if (*pos)
+		return bpf_iter_udp_batch(seq);
+
+	return SEQ_START_TOKEN;
+}
+
 static int udp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
 			     struct udp_sock *udp_sk, uid_t uid, int bucket)
 {
@@ -3171,18 +3326,37 @@  static int bpf_iter_udp_seq_show(struct seq_file *seq, void *v)
 	struct bpf_prog *prog;
 	struct sock *sk = v;
 	uid_t uid;
+	int rc;
 
 	if (v == SEQ_START_TOKEN)
 		return 0;
 
+	lock_sock(sk);
+
+	if (unlikely(sk_unhashed(sk))) {
+		rc = SEQ_SKIP;
+		goto unlock;
+	}
+
 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
 	meta.seq = seq;
 	prog = bpf_iter_get_info(&meta, false);
-	return udp_prog_seq_show(prog, &meta, v, uid, state->bucket);
+	rc = udp_prog_seq_show(prog, &meta, v, uid, state->bucket);
+
+unlock:
+	release_sock(sk);
+	return rc;
+}
+
+static void bpf_iter_udp_put_batch(struct bpf_udp_iter_state *iter)
+{
+	while (iter->cur_sk < iter->end_sk)
+		sock_put(iter->batch[iter->cur_sk++]);
 }
 
 static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v)
 {
+	struct bpf_udp_iter_state *iter = seq->private;
 	struct bpf_iter_meta meta;
 	struct bpf_prog *prog;
 
@@ -3193,12 +3367,15 @@  static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v)
 			(void)udp_prog_seq_show(prog, &meta, v, 0, 0);
 	}
 
-	udp_seq_stop(seq, v);
+	if (iter->cur_sk < iter->end_sk) {
+		bpf_iter_udp_put_batch(iter);
+		iter->st_bucket_done = false;
+	}
 }
 
 static const struct seq_operations bpf_iter_udp_seq_ops = {
-	.start		= udp_seq_start,
-	.next		= udp_seq_next,
+	.start		= bpf_iter_udp_seq_start,
+	.next		= bpf_iter_udp_seq_next,
 	.stop		= bpf_iter_udp_seq_stop,
 	.show		= bpf_iter_udp_seq_show,
 };
@@ -3425,38 +3602,57 @@  static struct pernet_operations __net_initdata udp_sysctl_ops = {
 DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta,
 		     struct udp_sock *udp_sk, uid_t uid, int bucket)
 
-static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
+static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
+				      unsigned int new_batch_sz)
 {
-	struct udp_iter_state *st = priv_data;
-	struct udp_seq_afinfo *afinfo;
-	int ret;
+	struct sock **new_batch;
 
-	afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
-	if (!afinfo)
+	new_batch = kvmalloc_array(new_batch_sz, sizeof(*new_batch),
+				   GFP_USER | __GFP_NOWARN);
+	if (!new_batch)
 		return -ENOMEM;
 
-	afinfo->family = AF_UNSPEC;
-	afinfo->udp_table = NULL;
-	st->bpf_seq_afinfo = afinfo;
+	bpf_iter_udp_put_batch(iter);
+	kvfree(iter->batch);
+	iter->batch = new_batch;
+	iter->max_sk = new_batch_sz;
+
+	return 0;
+}
+
+#define INIT_BATCH_SZ 16
+
+static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
+{
+	struct bpf_udp_iter_state *iter = priv_data;
+	int ret;
+
 	ret = bpf_iter_init_seq_net(priv_data, aux);
 	if (ret)
-		kfree(afinfo);
+		return ret;
+
+	ret = bpf_iter_udp_realloc_batch(iter, INIT_BATCH_SZ);
+	if (ret) {
+		bpf_iter_fini_seq_net(priv_data);
+		return ret;
+	}
+
 	return ret;
 }
 
 static void bpf_iter_fini_udp(void *priv_data)
 {
-	struct udp_iter_state *st = priv_data;
+	struct bpf_udp_iter_state *iter = priv_data;
 
-	kfree(st->bpf_seq_afinfo);
 	bpf_iter_fini_seq_net(priv_data);
+	kvfree(iter->batch);
 }
 
 static const struct bpf_iter_seq_info udp_seq_info = {
 	.seq_ops		= &bpf_iter_udp_seq_ops,
 	.init_seq_private	= bpf_iter_init_udp,
 	.fini_seq_private	= bpf_iter_fini_udp,
-	.seq_priv_size		= sizeof(struct udp_iter_state),
+	.seq_priv_size		= sizeof(struct bpf_udp_iter_state),
 };
 
 static struct bpf_iter_reg udp_reg_info = {