
[v9,bpf-next,5/9] bpf: udp: Implement batching for sockets iterator

Message ID 20230519225157.760788-6-aditi.ghag@isovalent.com (mailing list archive)
State Accepted
Commit c96dac8d369ffd713a45f4e5c30f23c47a1671f0
Delegated to: BPF
Series bpf: Add socket destroy capability

Checks

Context Check Description
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-6 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-5 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-29 success Logs for veristat
bpf/vmtest-bpf-next-VM_Test-7 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-11 success Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-15 success Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-19 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-22 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for test_maps on s390x with gcc
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 16 this patch: 16
netdev/cc_maintainers fail 7 maintainers not CCed: kuba@kernel.org dsahern@kernel.org netdev@vger.kernel.org davem@davemloft.net pabeni@redhat.com willemdebruijn.kernel@gmail.com edumazet@google.com
netdev/build_clang success Errors and warnings before: 8 this patch: 8
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 16 this patch: 16
netdev/checkpatch warning WARNING: line length of 84 exceeds 80 columns WARNING: line length of 89 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-PR fail merge-conflict

Commit Message

Aditi Ghag May 19, 2023, 10:51 p.m. UTC
Batch UDP sockets from the BPF iterator to allow for overlapping locking
semantics in BPF/kernel helpers executed in BPF programs. This allows the
BPF socket destroy kfunc (introduced by follow-up patches) to execute from
BPF iterator programs.

Previously, BPF iterators acquired the sock lock and the sockets hash table
bucket lock while executing BPF programs. This prevented BPF helpers that also
acquire these locks from being executed from BPF iterators. With the batching
approach, we acquire the bucket lock, batch all the sockets in the bucket, and
then release the bucket lock. This enables BPF or kernel helpers to skip sock
locking when invoked in the supported BPF contexts.

The batching logic is similar to the logic implemented in TCP iterator:
https://lore.kernel.org/bpf/20210701200613.1036157-1-kafai@fb.com/.
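
In outline, the flow is roughly the following (a simplified, illustrative
sketch: the loop over the batch stands in for the seq_file machinery, and the
real code is in the patch below):

	/* Batch under the bucket lock, holding a reference on each socket. */
	spin_lock_bh(&hslot2->lock);
	udp_portaddr_for_each_entry(sk, &hslot2->head) {
		if (seq_sk_match(seq, sk)) {
			sock_hold(sk);
			iter->batch[iter->end_sk++] = sk;
		}
	}
	spin_unlock_bh(&hslot2->lock);

	/* The BPF program then runs on each batched socket with no bucket
	 * lock held, so it is free to take the sock lock or the bucket lock.
	 */
	while (iter->cur_sk < iter->end_sk)
		bpf_iter_udp_seq_show(seq, iter->batch[iter->cur_sk++]);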

Suggested-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Aditi Ghag <aditi.ghag@isovalent.com>
---
 net/ipv4/udp.c | 205 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 199 insertions(+), 6 deletions(-)

Comments

Martin KaFai Lau Sept. 20, 2023, 12:38 a.m. UTC | #1
On 5/19/23 3:51 PM, Aditi Ghag wrote:
> +static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
> +{
> +	struct bpf_udp_iter_state *iter = seq->private;
> +	struct udp_iter_state *state = &iter->state;
> +	struct net *net = seq_file_net(seq);
> +	struct udp_table *udptable;
> +	unsigned int batch_sks = 0;
> +	bool resized = false;
> +	struct sock *sk;
> +
> +	/* The current batch is done, so advance the bucket. */
> +	if (iter->st_bucket_done) {
> +		state->bucket++;
> +		iter->offset = 0;
> +	}
> +
> +	udptable = udp_get_table_seq(seq, net);
> +
> +again:
> +	/* New batch for the next bucket.
> +	 * Iterate over the hash table to find a bucket with sockets matching
> +	 * the iterator attributes, and return the first matching socket from
> +	 * the bucket. The remaining matched sockets from the bucket are batched
> +	 * before releasing the bucket lock. This allows BPF programs that are
> +	 * called in seq_show to acquire the bucket lock if needed.
> +	 */
> +	iter->cur_sk = 0;
> +	iter->end_sk = 0;
> +	iter->st_bucket_done = false;
> +	batch_sks = 0;
> +
> +	for (; state->bucket <= udptable->mask; state->bucket++) {
> +		struct udp_hslot *hslot2 = &udptable->hash2[state->bucket];
> +
> +		if (hlist_empty(&hslot2->head)) {
> +			iter->offset = 0;
> +			continue;
> +		}
> +
> +		spin_lock_bh(&hslot2->lock);
> +		udp_portaddr_for_each_entry(sk, &hslot2->head) {
> +			if (seq_sk_match(seq, sk)) {
> +				/* Resume from the last iterated socket at the
> +				 * offset in the bucket before iterator was stopped.
> +				 */
> +				if (iter->offset) {
> +					--iter->offset;

Hi Aditi, I think this part has a bug.

When I run './test_progs -t bpf_iter/udp6' on a machine with some udp 
so_reuseport sockets, this test never finishes.

A broken case I am seeing is when the bucket has >1 sockets and bpf_seq_read() 
can only get one sk at a time before it calls bpf_iter_udp_seq_stop().

I did not try the change yet. However, from looking at the code where 
iter->offset is changed, --iter->offset here is the most likely culprit and it 
will make backward progress for the same bucket (state->bucket). Other places 
touching iter->offset look fine.

It needs a local "int offset" variable for the zero test. Could you help to take 
a look, add (or modify) a test and fix it?
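
For illustration, a minimal sketch of the kind of change being suggested,
assuming the zero test moves to a local copy of iter->offset (not the final
fix):

	/* Take a local snapshot so that a partially consumed batch does not
	 * keep decrementing iter->offset and rewind progress when the same
	 * bucket is scanned again.
	 */
	int offset = iter->offset;

	udp_portaddr_for_each_entry(sk, &hslot2->head) {
		if (seq_sk_match(seq, sk)) {
			/* Skip sockets already shown before the iterator was
			 * stopped, without touching iter->offset itself.
			 */
			if (offset) {
				offset--;
				continue;
			}
			if (iter->end_sk < iter->max_sk) {
				sock_hold(sk);
				iter->batch[iter->end_sk++] = sk;
			}
			batch_sks++;
		}
	}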

The progs/bpf_iter_udp[46].c test can be used to reproduce. The test_udp[46] in 
prog_tests/bpf_iter.c needs to be changed, though, to ensure there are multiple 
sk in the same bucket. Probably a few so_reuseport sk should do.

Thanks.

> +					continue;
> +				}
> +				if (iter->end_
Aditi Ghag Sept. 20, 2023, 5:16 p.m. UTC | #2
> On Sep 19, 2023, at 5:38 PM, Martin KaFai Lau <martin.lau@linux.dev> wrote:
> 
> On 5/19/23 3:51 PM, Aditi Ghag wrote:
>> +static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
>> +{
>> +	struct bpf_udp_iter_state *iter = seq->private;
>> +	struct udp_iter_state *state = &iter->state;
>> +	struct net *net = seq_file_net(seq);
>> +	struct udp_table *udptable;
>> +	unsigned int batch_sks = 0;
>> +	bool resized = false;
>> +	struct sock *sk;
>> +
>> +	/* The current batch is done, so advance the bucket. */
>> +	if (iter->st_bucket_done) {
>> +		state->bucket++;
>> +		iter->offset = 0;
>> +	}
>> +
>> +	udptable = udp_get_table_seq(seq, net);
>> +
>> +again:
>> +	/* New batch for the next bucket.
>> +	 * Iterate over the hash table to find a bucket with sockets matching
>> +	 * the iterator attributes, and return the first matching socket from
>> +	 * the bucket. The remaining matched sockets from the bucket are batched
>> +	 * before releasing the bucket lock. This allows BPF programs that are
>> +	 * called in seq_show to acquire the bucket lock if needed.
>> +	 */
>> +	iter->cur_sk = 0;
>> +	iter->end_sk = 0;
>> +	iter->st_bucket_done = false;
>> +	batch_sks = 0;
>> +
>> +	for (; state->bucket <= udptable->mask; state->bucket++) {
>> +		struct udp_hslot *hslot2 = &udptable->hash2[state->bucket];
>> +
>> +		if (hlist_empty(&hslot2->head)) {
>> +			iter->offset = 0;
>> +			continue;
>> +		}
>> +
>> +		spin_lock_bh(&hslot2->lock);
>> +		udp_portaddr_for_each_entry(sk, &hslot2->head) {
>> +			if (seq_sk_match(seq, sk)) {
>> +				/* Resume from the last iterated socket at the
>> +				 * offset in the bucket before iterator was stopped.
>> +				 */
>> +				if (iter->offset) {
>> +					--iter->offset;
> 
> Hi Aditi, I think this part has a bug.
> 
> When I run './test_progs -t bpf_iter/udp6' in a machine with some udp so_reuseport sockets, this test is never finished.
> 
> A broken case I am seeing is when the bucket has >1 sockets and bpf_seq_read() can only get one sk at a time before it calls bpf_iter_udp_seq_stop().
> 
> I did not try the change yet. However, from looking at the code where iter->offset is changed, --iter->offset here is the most likely culprit and it will make backward progress for the same bucket (state->bucket). Other places touching iter->offset look fine.
> 
> It needs a local "int offset" variable for the zero test. Could you help to take a look, add (or modify) a test and fix it?
> 
> The progs/bpf_iter_udp[46].c test can be used to reproduce. The test_udp[46] in prog_tests/bpf_iter.c needs to be changed though to ensure there is multiple sk in the same bucket. Probably a few so_reuseport sk should do.

Hi Martin,

Thanks for the report. I'll take a look.


> 
> Thanks.
> 
>> +					continue;
>> +				}
>> +				if (iter->end_
>
Aditi Ghag Sept. 25, 2023, 11:34 p.m. UTC | #3
> On Sep 19, 2023, at 5:38 PM, Martin KaFai Lau <martin.lau@linux.dev> wrote:
> 
> On 5/19/23 3:51 PM, Aditi Ghag wrote:
>> +static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
>> +{
>> +	struct bpf_udp_iter_state *iter = seq->private;
>> +	struct udp_iter_state *state = &iter->state;
>> +	struct net *net = seq_file_net(seq);
>> +	struct udp_table *udptable;
>> +	unsigned int batch_sks = 0;
>> +	bool resized = false;
>> +	struct sock *sk;
>> +
>> +	/* The current batch is done, so advance the bucket. */
>> +	if (iter->st_bucket_done) {
>> +		state->bucket++;
>> +		iter->offset = 0;
>> +	}
>> +
>> +	udptable = udp_get_table_seq(seq, net);
>> +
>> +again:
>> +	/* New batch for the next bucket.
>> +	 * Iterate over the hash table to find a bucket with sockets matching
>> +	 * the iterator attributes, and return the first matching socket from
>> +	 * the bucket. The remaining matched sockets from the bucket are batched
>> +	 * before releasing the bucket lock. This allows BPF programs that are
>> +	 * called in seq_show to acquire the bucket lock if needed.
>> +	 */
>> +	iter->cur_sk = 0;
>> +	iter->end_sk = 0;
>> +	iter->st_bucket_done = false;
>> +	batch_sks = 0;
>> +
>> +	for (; state->bucket <= udptable->mask; state->bucket++) {
>> +		struct udp_hslot *hslot2 = &udptable->hash2[state->bucket];
>> +
>> +		if (hlist_empty(&hslot2->head)) {
>> +			iter->offset = 0;
>> +			continue;
>> +		}
>> +
>> +		spin_lock_bh(&hslot2->lock);
>> +		udp_portaddr_for_each_entry(sk, &hslot2->head) {
>> +			if (seq_sk_match(seq, sk)) {
>> +				/* Resume from the last iterated socket at the
>> +				 * offset in the bucket before iterator was stopped.
>> +				 */
>> +				if (iter->offset) {
>> +					--iter->offset;
> 
> Hi Aditi, I think this part has a bug.
> 
> When I run './test_progs -t bpf_iter/udp6' in a machine with some udp so_reuseport sockets, this test is never finished.
> 
> A broken case I am seeing is when the bucket has >1 sockets and bpf_seq_read() can only get one sk at a time before it calls bpf_iter_udp_seq_stop().

Just so that I understand the broken case better, are you doing something in your BPF iterator program so that "bpf_seq_read() can only get one sk at a time"? 

> 
> I did not try the change yet. However, from looking at the code where iter->offset is changed, --iter->offset here is the most likely culprit and it will make backward progress for the same bucket (state->bucket). Other places touching iter->offset look fine.
> 
> It needs a local "int offset" variable for the zero test. Could you help to take a look, add (or modify) a test and fix it?
> 
> The progs/bpf_iter_udp[46].c test can be used to reproduce. The test_udp[46] in prog_tests/bpf_iter.c needs to be changed though to ensure there is multiple sk in the same bucket. Probably a few so_reuseport sk should do.


The sock_destroy patch set had added a test with multiple so_reuseport sks in a bucket in order to exercise batching [1]. I was wondering if extending the test with an additional bucket should do it, or whether more cases (asked for clarification above) are required to reproduce the issue.


[1] https://elixir.bootlin.com/linux/v6.5/source/tools/testing/selftests/bpf/prog_tests/sock_destroy.c#L146

> 
> Thanks.
> 
>> +					continue;
>> +				}
>> +				if (iter->end_
>
Martin KaFai Lau Sept. 26, 2023, 5:02 a.m. UTC | #4
On 9/25/23 4:34 PM, Aditi Ghag wrote:
> 
> 
>> On Sep 19, 2023, at 5:38 PM, Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>
>> On 5/19/23 3:51 PM, Aditi Ghag wrote:
>>> +static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
>>> +{
>>> +	struct bpf_udp_iter_state *iter = seq->private;
>>> +	struct udp_iter_state *state = &iter->state;
>>> +	struct net *net = seq_file_net(seq);
>>> +	struct udp_table *udptable;
>>> +	unsigned int batch_sks = 0;
>>> +	bool resized = false;
>>> +	struct sock *sk;
>>> +
>>> +	/* The current batch is done, so advance the bucket. */
>>> +	if (iter->st_bucket_done) {
>>> +		state->bucket++;
>>> +		iter->offset = 0;
>>> +	}
>>> +
>>> +	udptable = udp_get_table_seq(seq, net);
>>> +
>>> +again:
>>> +	/* New batch for the next bucket.
>>> +	 * Iterate over the hash table to find a bucket with sockets matching
>>> +	 * the iterator attributes, and return the first matching socket from
>>> +	 * the bucket. The remaining matched sockets from the bucket are batched
>>> +	 * before releasing the bucket lock. This allows BPF programs that are
>>> +	 * called in seq_show to acquire the bucket lock if needed.
>>> +	 */
>>> +	iter->cur_sk = 0;
>>> +	iter->end_sk = 0;
>>> +	iter->st_bucket_done = false;
>>> +	batch_sks = 0;
>>> +
>>> +	for (; state->bucket <= udptable->mask; state->bucket++) {
>>> +		struct udp_hslot *hslot2 = &udptable->hash2[state->bucket];
>>> +
>>> +		if (hlist_empty(&hslot2->head)) {
>>> +			iter->offset = 0;
>>> +			continue;
>>> +		}
>>> +
>>> +		spin_lock_bh(&hslot2->lock);
>>> +		udp_portaddr_for_each_entry(sk, &hslot2->head) {
>>> +			if (seq_sk_match(seq, sk)) {
>>> +				/* Resume from the last iterated socket at the
>>> +				 * offset in the bucket before iterator was stopped.
>>> +				 */
>>> +				if (iter->offset) {
>>> +					--iter->offset;
>>
>> Hi Aditi, I think this part has a bug.
>>
>> When I run './test_progs -t bpf_iter/udp6' in a machine with some udp so_reuseport sockets, this test is never finished.
>>
>> A broken case I am seeing is when the bucket has >1 sockets and bpf_seq_read() can only get one sk at a time before it calls bpf_iter_udp_seq_stop().
> 
> Just so that I understand the broken case better, are you doing something in your BPF iterator program so that "bpf_seq_read() can only get one sk at a time"?
> 
>>
>> I did not try the change yet. However, from looking at the code where iter->offset is changed, --iter->offset here is the most likely culprit and it will make backward progress for the same bucket (state->bucket). Other places touching iter->offset look fine.
>>
>> It needs a local "int offset" variable for the zero test. Could you help to take a look, add (or modify) a test and fix it?
>>
>> The progs/bpf_iter_udp[46].c test can be used to reproduce. The test_udp[46] in prog_tests/bpf_iter.c needs to be changed though to ensure there is multiple sk in the same bucket. Probably a few so_reuseport sk should do.
> 
> 
> The sock_destroy patch set had added a test with multiple so_reuseport sks in a bucket in order to exercise batching [1]. I was wondering if extending the test with an additional bucket should do it, or some more cases are required (asked for clarification above) to reproduce the issue.

The number of buckets should not matter. It only needs a bucket to have 
multiple sockets.

I did notice test_udp_server() has 5 so_reuseport udp sk in the same bucket when 
trying to understand how this issue was missed. That is enough on the hashtable 
side. This is the easier part, and one start_reuseport_server() call will do. 
Having multiple sk in a bucket is not enough to reproduce the issue, though.

The bpf prog 'iter_udp6_server' in the sock_destroy test is not doing 
bpf_seq_printf(). bpf_seq_printf() is necessary to reproduce the issue. The 
read() buf on the userspace program side also needs to be small: it needs to 
hit the "if (seq->count >= size) break;" condition in the "while (1)" loop in 
kernel/bpf/bpf_iter.c.

You can try to add both to the sock_destroy test. I was suggesting the 
bpf_iter/udp[46] test instead (i.e. the test_udp[46] function) because the 
bpf_seq_printf and the buf[] size are already aligned to reproduce the problem. 
Try adding a start_reuseport_server(..., 5) call to the beginning of test_udp6() 
in prog_tests/bpf_iter.c to ensure there are multiple udp sk in a bucket. That 
should be enough to reproduce it.
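
For example, a hedged sketch of the suggested change to test_udp6() in
prog_tests/bpf_iter.c (the exact address, port, and socket count below are
illustrative):

	static void test_udp6(void)
	{
		int *reuse_fds;

		/* Put several so_reuseport UDP sockets into one hash bucket so
		 * a stopped and resumed iterator has to resume mid-bucket.
		 */
		reuse_fds = start_reuseport_server(AF_INET6, SOCK_DGRAM,
						   "::1", 0, 0, 5);
		if (!ASSERT_OK_PTR(reuse_fds, "start_reuseport_server"))
			return;

		/* ... existing test_udp6() body: attach bpf_iter_udp6 and read
		 * the iterator output with the small buf[] it already uses ...
		 */

		free_fds(reuse_fds, 5);
	}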

For the final fix, I don't have a strong preference on where the test should 
be: modifying one of the two existing tests (i.e. sock_destroy or bpf_iter), or 
adding a completely new test.

Let me know if you have any problem reproducing it. Thanks.

> 
> 
> [1] https://elixir.bootlin.com/linux/v6.5/source/tools/testing/selftests/bpf/prog_tests/sock_destroy.c#L146
> 
>>
>> Thanks.
>>
>>> +					continue;
>>> +				}
>>> +				if (iter->end_
>>
>
Martin KaFai Lau Sept. 26, 2023, 5:07 a.m. UTC | #5
On 9/25/23 4:34 PM, Aditi Ghag wrote:
> Just so that I understand the broken case better, are you doing something in your BPF iterator program so that "bpf_seq_read() can only get one sk at a time"?

Ah, hit send too early.

Yes, bpf_seq_printf(). That is why I was suggesting using the bpf_iter/udp[46] 
test to reproduce it by adding start_reuseport_server(). Please see my earlier 
reply.
Aditi Ghag Sept. 26, 2023, 4:07 p.m. UTC | #6
> On Sep 25, 2023, at 10:02 PM, Martin KaFai Lau <martin.lau@linux.dev> wrote:
> 
> On 9/25/23 4:34 PM, Aditi Ghag wrote:
>>> On Sep 19, 2023, at 5:38 PM, Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>> 
>>> On 5/19/23 3:51 PM, Aditi Ghag wrote:
>>>> +static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
>>>> +{
>>>> +	struct bpf_udp_iter_state *iter = seq->private;
>>>> +	struct udp_iter_state *state = &iter->state;
>>>> +	struct net *net = seq_file_net(seq);
>>>> +	struct udp_table *udptable;
>>>> +	unsigned int batch_sks = 0;
>>>> +	bool resized = false;
>>>> +	struct sock *sk;
>>>> +
>>>> +	/* The current batch is done, so advance the bucket. */
>>>> +	if (iter->st_bucket_done) {
>>>> +		state->bucket++;
>>>> +		iter->offset = 0;
>>>> +	}
>>>> +
>>>> +	udptable = udp_get_table_seq(seq, net);
>>>> +
>>>> +again:
>>>> +	/* New batch for the next bucket.
>>>> +	 * Iterate over the hash table to find a bucket with sockets matching
>>>> +	 * the iterator attributes, and return the first matching socket from
>>>> +	 * the bucket. The remaining matched sockets from the bucket are batched
>>>> +	 * before releasing the bucket lock. This allows BPF programs that are
>>>> +	 * called in seq_show to acquire the bucket lock if needed.
>>>> +	 */
>>>> +	iter->cur_sk = 0;
>>>> +	iter->end_sk = 0;
>>>> +	iter->st_bucket_done = false;
>>>> +	batch_sks = 0;
>>>> +
>>>> +	for (; state->bucket <= udptable->mask; state->bucket++) {
>>>> +		struct udp_hslot *hslot2 = &udptable->hash2[state->bucket];
>>>> +
>>>> +		if (hlist_empty(&hslot2->head)) {
>>>> +			iter->offset = 0;
>>>> +			continue;
>>>> +		}
>>>> +
>>>> +		spin_lock_bh(&hslot2->lock);
>>>> +		udp_portaddr_for_each_entry(sk, &hslot2->head) {
>>>> +			if (seq_sk_match(seq, sk)) {
>>>> +				/* Resume from the last iterated socket at the
>>>> +				 * offset in the bucket before iterator was stopped.
>>>> +				 */
>>>> +				if (iter->offset) {
>>>> +					--iter->offset;
>>> 
>>> Hi Aditi, I think this part has a bug.
>>> 
>>> When I run './test_progs -t bpf_iter/udp6' in a machine with some udp so_reuseport sockets, this test is never finished.
>>> 
>>> A broken case I am seeing is when the bucket has >1 sockets and bpf_seq_read() can only get one sk at a time before it calls bpf_iter_udp_seq_stop().
>> Just so that I understand the broken case better, are you doing something in your BPF iterator program so that "bpf_seq_read() can only get one sk at a time"?
>>> 
>>> I did not try the change yet. However, from looking at the code where iter->offset is changed, --iter->offset here is the most likely culprit and it will make backward progress for the same bucket (state->bucket). Other places touching iter->offset look fine.
>>> 
>>> It needs a local "int offset" variable for the zero test. Could you help to take a look, add (or modify) a test and fix it?
>>> 
>>> The progs/bpf_iter_udp[46].c test can be used to reproduce. The test_udp[46] in prog_tests/bpf_iter.c needs to be changed though to ensure there is multiple sk in the same bucket. Probably a few so_reuseport sk should do.
>> The sock_destroy patch set had added a test with multiple so_reuseport sks in a bucket in order to exercise batching [1]. I was wondering if extending the test with an additional bucket should do it, or some more cases are required (asked for clarification above) to reproduce the issue.
> 
> Number of bucket should not matter. It should only need a bucket to have multiple sockets.
> 
> I did notice test_udp_server() has 5 so_reuseport udp sk in the same bucket when trying to understand how this issue was missed. It is enough on the hashtable side. This is the easier part and one start_reuseport_server() call will do. Having multiple sk in a bucket is not enough to reprod though.
> 
> The bpf prog 'iter_udp6_server' in the sock_destroy test is not doing bpf_seq_printf(). bpf_seq_printf() is necessary to reproduce the issue. The read() buf from the userspace program side also needs to be small. It needs to hit the "if (seq->count >= size) break;" condition in the "while (1)" loop in the kernel/bpf/bpf_iter.c.
> 
> You can try to add both to the sock_destroy test. I was suggesting bpf_iter/udp[46] test instead (i.e. the test_udp[46] function) because the bpf_seq_printf and the buf[] size are all aligned to reprod the problem already.  Try to add a start_reuseport_server(..., 5) to the beginning of test_udp6() in prog_tests/bpf_iter.c to ensure there is multiple udp sk in a bucket. It should be enough to reprod.


Gotcha! I think I understand the repro steps. The offset field in question was 
added for the scenario where an iterator is stopped and resumed, which the 
sock_destroy test cases don't entirely exercise.
Thanks!

> 
> In the final fix, I don't have strong preference on where the test should be.
> Modifying one of the two existing tests (i.e. sock_destroy or bpf_iter) or a completely new test.
> 
> Let me know if you have problem reproducing it. Thanks.
> 
>> [1] https://elixir.bootlin.com/linux/v6.5/source/tools/testing/selftests/bpf/prog_tests/sock_destroy.c#L146
>>> 
>>> Thanks.
>>> 
>>>> +					continue;
>>>> +				}
>>>> +				if (iter->end_
Aditi Ghag Oct. 24, 2023, 10:50 p.m. UTC | #7
> On Sep 26, 2023, at 9:07 AM, Aditi Ghag <aditi.ghag@isovalent.com> wrote:
> 
> 
> 
>> On Sep 25, 2023, at 10:02 PM, Martin KaFai Lau <martin.lau@linux.dev> wrote:
>> 
>> On 9/25/23 4:34 PM, Aditi Ghag wrote:
>>>> On Sep 19, 2023, at 5:38 PM, Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>>> 
>>>> On 5/19/23 3:51 PM, Aditi Ghag wrote:
>>>>> +static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
>>>>> +{
>>>>> +	struct bpf_udp_iter_state *iter = seq->private;
>>>>> +	struct udp_iter_state *state = &iter->state;
>>>>> +	struct net *net = seq_file_net(seq);
>>>>> +	struct udp_table *udptable;
>>>>> +	unsigned int batch_sks = 0;
>>>>> +	bool resized = false;
>>>>> +	struct sock *sk;
>>>>> +
>>>>> +	/* The current batch is done, so advance the bucket. */
>>>>> +	if (iter->st_bucket_done) {
>>>>> +		state->bucket++;
>>>>> +		iter->offset = 0;
>>>>> +	}
>>>>> +
>>>>> +	udptable = udp_get_table_seq(seq, net);
>>>>> +
>>>>> +again:
>>>>> +	/* New batch for the next bucket.
>>>>> +	 * Iterate over the hash table to find a bucket with sockets matching
>>>>> +	 * the iterator attributes, and return the first matching socket from
>>>>> +	 * the bucket. The remaining matched sockets from the bucket are batched
>>>>> +	 * before releasing the bucket lock. This allows BPF programs that are
>>>>> +	 * called in seq_show to acquire the bucket lock if needed.
>>>>> +	 */
>>>>> +	iter->cur_sk = 0;
>>>>> +	iter->end_sk = 0;
>>>>> +	iter->st_bucket_done = false;
>>>>> +	batch_sks = 0;
>>>>> +
>>>>> +	for (; state->bucket <= udptable->mask; state->bucket++) {
>>>>> +		struct udp_hslot *hslot2 = &udptable->hash2[state->bucket];
>>>>> +
>>>>> +		if (hlist_empty(&hslot2->head)) {
>>>>> +			iter->offset = 0;
>>>>> +			continue;
>>>>> +		}
>>>>> +
>>>>> +		spin_lock_bh(&hslot2->lock);
>>>>> +		udp_portaddr_for_each_entry(sk, &hslot2->head) {
>>>>> +			if (seq_sk_match(seq, sk)) {
>>>>> +				/* Resume from the last iterated socket at the
>>>>> +				 * offset in the bucket before iterator was stopped.
>>>>> +				 */
>>>>> +				if (iter->offset) {
>>>>> +					--iter->offset;
>>>> 
>>>> Hi Aditi, I think this part has a bug.
>>>> 
>>>> When I run './test_progs -t bpf_iter/udp6' in a machine with some udp so_reuseport sockets, this test is never finished.
>>>> 
>>>> A broken case I am seeing is when the bucket has >1 sockets and bpf_seq_read() can only get one sk at a time before it calls bpf_iter_udp_seq_stop().
>>> Just so that I understand the broken case better, are you doing something in your BPF iterator program so that "bpf_seq_read() can only get one sk at a time"?
>>>> 
>>>> I did not try the change yet. However, from looking at the code where iter->offset is changed, --iter->offset here is the most likely culprit and it will make backward progress for the same bucket (state->bucket). Other places touching iter->offset look fine.
>>>> 
>>>> It needs a local "int offset" variable for the zero test. Could you help to take a look, add (or modify) a test and fix it?
>>>> 
>>>> The progs/bpf_iter_udp[46].c test can be used to reproduce. The test_udp[46] in prog_tests/bpf_iter.c needs to be changed though to ensure there is multiple sk in the same bucket. Probably a few so_reuseport sk should do.
>>> The sock_destroy patch set had added a test with multiple so_reuseport sks in a bucket in order to exercise batching [1]. I was wondering if extending the test with an additional bucket should do it, or some more cases are required (asked for clarification above) to reproduce the issue.
>> 
>> Number of bucket should not matter. It should only need a bucket to have multiple sockets.
>> 
>> I did notice test_udp_server() has 5 so_reuseport udp sk in the same bucket when trying to understand how this issue was missed. It is enough on the hashtable side. This is the easier part and one start_reuseport_server() call will do. Having multiple sk in a bucket is not enough to reprod though.
>> 
>> The bpf prog 'iter_udp6_server' in the sock_destroy test is not doing bpf_seq_printf(). bpf_seq_printf() is necessary to reproduce the issue. The read() buf from the userspace program side also needs to be small. It needs to hit the "if (seq->count >= size) break;" condition in the "while (1)" loop in the kernel/bpf/bpf_iter.c.
>> 
>> You can try to add both to the sock_destroy test. I was suggesting bpf_iter/udp[46] test instead (i.e. the test_udp[46] function) because the bpf_seq_printf and the buf[] size are all aligned to reprod the problem already.  Try to add a start_reuseport_server(..., 5) to the beginning of test_udp6() in prog_tests/bpf_iter.c to ensure there is multiple udp sk in a bucket. It should be enough to reprod.
> 
> 
> Gotcha! I think I understand the repro steps. The offset field in question was added for this scenario where an iterator is stopped and resumed that the sock_destroy test cases don't entirely exercise. 
> Thanks! 


Just a small update: I was able to reproduce the issue where the so_reuseport test hangs by modifying the read buffer in the existing sock_destroy test (test_udp_server). I have a fix and have verified that the test doesn't hang with the fixed code. Will send a patch soon.
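
As a rough sketch of that kind of change (the helper and buffer size below are
illustrative assumptions, not the actual fix):

	/* Read the iterator output with a deliberately small buffer so each
	 * bpf_seq_read() fills up after roughly one socket, forcing the
	 * iterator to stop and resume within the same bucket.
	 */
	static void read_iter_small_buf(int iter_fd)
	{
		char buf[16];

		while (read(iter_fd, buf, sizeof(buf)) > 0)
			;
	}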


> 
>> 
>> In the final fix, I don't have strong preference on where the test should be.
>> Modifying one of the two existing tests (i.e. sock_destroy or bpf_iter) or a completely new test.
>> 
>> Let me know if you have problem reproducing it. Thanks.
>> 
>>> [1] https://elixir.bootlin.com/linux/v6.5/source/tools/testing/selftests/bpf/prog_tests/sock_destroy.c#L146
>>>> 
>>>> Thanks.
>>>> 
>>>>> +					continue;
>>>>> +				}
>>>>> +				if (iter->end_

Patch

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 289ef05b5c15..8fe2fd6255cc 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -3150,6 +3150,143 @@  struct bpf_iter__udp {
 	int bucket __aligned(8);
 };
 
+struct bpf_udp_iter_state {
+	struct udp_iter_state state;
+	unsigned int cur_sk;
+	unsigned int end_sk;
+	unsigned int max_sk;
+	int offset;
+	struct sock **batch;
+	bool st_bucket_done;
+};
+
+static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
+				      unsigned int new_batch_sz);
+static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
+{
+	struct bpf_udp_iter_state *iter = seq->private;
+	struct udp_iter_state *state = &iter->state;
+	struct net *net = seq_file_net(seq);
+	struct udp_table *udptable;
+	unsigned int batch_sks = 0;
+	bool resized = false;
+	struct sock *sk;
+
+	/* The current batch is done, so advance the bucket. */
+	if (iter->st_bucket_done) {
+		state->bucket++;
+		iter->offset = 0;
+	}
+
+	udptable = udp_get_table_seq(seq, net);
+
+again:
+	/* New batch for the next bucket.
+	 * Iterate over the hash table to find a bucket with sockets matching
+	 * the iterator attributes, and return the first matching socket from
+	 * the bucket. The remaining matched sockets from the bucket are batched
+	 * before releasing the bucket lock. This allows BPF programs that are
+	 * called in seq_show to acquire the bucket lock if needed.
+	 */
+	iter->cur_sk = 0;
+	iter->end_sk = 0;
+	iter->st_bucket_done = false;
+	batch_sks = 0;
+
+	for (; state->bucket <= udptable->mask; state->bucket++) {
+		struct udp_hslot *hslot2 = &udptable->hash2[state->bucket];
+
+		if (hlist_empty(&hslot2->head)) {
+			iter->offset = 0;
+			continue;
+		}
+
+		spin_lock_bh(&hslot2->lock);
+		udp_portaddr_for_each_entry(sk, &hslot2->head) {
+			if (seq_sk_match(seq, sk)) {
+				/* Resume from the last iterated socket at the
+				 * offset in the bucket before iterator was stopped.
+				 */
+				if (iter->offset) {
+					--iter->offset;
+					continue;
+				}
+				if (iter->end_sk < iter->max_sk) {
+					sock_hold(sk);
+					iter->batch[iter->end_sk++] = sk;
+				}
+				batch_sks++;
+			}
+		}
+		spin_unlock_bh(&hslot2->lock);
+
+		if (iter->end_sk)
+			break;
+
+		/* Reset the current bucket's offset before moving to the next bucket. */
+		iter->offset = 0;
+	}
+
+	/* All done: no batch made. */
+	if (!iter->end_sk)
+		return NULL;
+
+	if (iter->end_sk == batch_sks) {
+		/* Batching is done for the current bucket; return the first
+		 * socket to be iterated from the batch.
+		 */
+		iter->st_bucket_done = true;
+		goto done;
+	}
+	if (!resized && !bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2)) {
+		resized = true;
+		/* After allocating a larger batch, retry one more time to grab
+		 * the whole bucket.
+		 */
+		state->bucket--;
+		goto again;
+	}
+done:
+	return iter->batch[0];
+}
+
+static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct bpf_udp_iter_state *iter = seq->private;
+	struct sock *sk;
+
+	/* Whenever seq_next() is called, the iter->cur_sk is
+	 * done with seq_show(), so unref the iter->cur_sk.
+	 */
+	if (iter->cur_sk < iter->end_sk) {
+		sock_put(iter->batch[iter->cur_sk++]);
+		++iter->offset;
+	}
+
+	/* After updating iter->cur_sk, check if there are more sockets
+	 * available in the current bucket batch.
+	 */
+	if (iter->cur_sk < iter->end_sk)
+		sk = iter->batch[iter->cur_sk];
+	else
+		/* Prepare a new batch. */
+		sk = bpf_iter_udp_batch(seq);
+
+	++*pos;
+	return sk;
+}
+
+static void *bpf_iter_udp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	/* bpf iter does not support lseek, so it always
+	 * continue from where it was stop()-ped.
+	 */
+	if (*pos)
+		return bpf_iter_udp_batch(seq);
+
+	return SEQ_START_TOKEN;
+}
+
 static int udp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
 			     struct udp_sock *udp_sk, uid_t uid, int bucket)
 {
@@ -3170,18 +3307,37 @@  static int bpf_iter_udp_seq_show(struct seq_file *seq, void *v)
 	struct bpf_prog *prog;
 	struct sock *sk = v;
 	uid_t uid;
+	int ret;
 
 	if (v == SEQ_START_TOKEN)
 		return 0;
 
+	lock_sock(sk);
+
+	if (unlikely(sk_unhashed(sk))) {
+		ret = SEQ_SKIP;
+		goto unlock;
+	}
+
 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
 	meta.seq = seq;
 	prog = bpf_iter_get_info(&meta, false);
-	return udp_prog_seq_show(prog, &meta, v, uid, state->bucket);
+	ret = udp_prog_seq_show(prog, &meta, v, uid, state->bucket);
+
+unlock:
+	release_sock(sk);
+	return ret;
+}
+
+static void bpf_iter_udp_put_batch(struct bpf_udp_iter_state *iter)
+{
+	while (iter->cur_sk < iter->end_sk)
+		sock_put(iter->batch[iter->cur_sk++]);
 }
 
 static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v)
 {
+	struct bpf_udp_iter_state *iter = seq->private;
 	struct bpf_iter_meta meta;
 	struct bpf_prog *prog;
 
@@ -3192,12 +3348,15 @@  static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v)
 			(void)udp_prog_seq_show(prog, &meta, v, 0, 0);
 	}
 
-	udp_seq_stop(seq, v);
+	if (iter->cur_sk < iter->end_sk) {
+		bpf_iter_udp_put_batch(iter);
+		iter->st_bucket_done = false;
+	}
 }
 
 static const struct seq_operations bpf_iter_udp_seq_ops = {
-	.start		= udp_seq_start,
-	.next		= udp_seq_next,
+	.start		= bpf_iter_udp_seq_start,
+	.next		= bpf_iter_udp_seq_next,
 	.stop		= bpf_iter_udp_seq_stop,
 	.show		= bpf_iter_udp_seq_show,
 };
@@ -3426,21 +3585,55 @@  static struct pernet_operations __net_initdata udp_sysctl_ops = {
 DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta,
 		     struct udp_sock *udp_sk, uid_t uid, int bucket)
 
+static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
+				      unsigned int new_batch_sz)
+{
+	struct sock **new_batch;
+
+	new_batch = kvmalloc_array(new_batch_sz, sizeof(*new_batch),
+				   GFP_USER | __GFP_NOWARN);
+	if (!new_batch)
+		return -ENOMEM;
+
+	bpf_iter_udp_put_batch(iter);
+	kvfree(iter->batch);
+	iter->batch = new_batch;
+	iter->max_sk = new_batch_sz;
+
+	return 0;
+}
+
+#define INIT_BATCH_SZ 16
+
 static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
 {
-	return bpf_iter_init_seq_net(priv_data, aux);
+	struct bpf_udp_iter_state *iter = priv_data;
+	int ret;
+
+	ret = bpf_iter_init_seq_net(priv_data, aux);
+	if (ret)
+		return ret;
+
+	ret = bpf_iter_udp_realloc_batch(iter, INIT_BATCH_SZ);
+	if (ret)
+		bpf_iter_fini_seq_net(priv_data);
+
+	return ret;
 }
 
 static void bpf_iter_fini_udp(void *priv_data)
 {
+	struct bpf_udp_iter_state *iter = priv_data;
+
 	bpf_iter_fini_seq_net(priv_data);
+	kvfree(iter->batch);
 }
 
 static const struct bpf_iter_seq_info udp_seq_info = {
 	.seq_ops		= &bpf_iter_udp_seq_ops,
 	.init_seq_private	= bpf_iter_init_udp,
 	.fini_seq_private	= bpf_iter_fini_udp,
-	.seq_priv_size		= sizeof(struct udp_iter_state),
+	.seq_priv_size		= sizeof(struct bpf_udp_iter_state),
 };
 
 static struct bpf_iter_reg udp_reg_info = {