diff mbox series

[v2,bpf-next,1/3] bpf: Implement batching in UDP iterator

Message ID 20230223215311.926899-2-aditi.ghag@isovalent.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Series: Add socket destroy capability

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 8 this patch: 8
netdev/cc_maintainers warning 6 maintainers not CCed: dsahern@kernel.org pabeni@redhat.com willemdebruijn.kernel@gmail.com kuba@kernel.org netdev@vger.kernel.org davem@davemloft.net
netdev/build_clang success Errors and warnings before: 0 this patch: 0
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 8 this patch: 8
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 287 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline fail Was 0 now: 1
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-5 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-7 success Logs for llvm-toolchain
bpf/vmtest-bpf-next-VM_Test-8 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_maps on aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-11 success Logs for test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-14 success Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for test_progs on aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 fail Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-19 success Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for test_progs_no_alu32 on aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-21 success Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_progs_no_alu32_parallel on aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_progs_no_alu32_parallel on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for test_progs_parallel on aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for test_progs_parallel on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-32 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-33 success Logs for test_progs_parallel on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-35 success Logs for test_verifier on aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-36 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-37 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-38 success Logs for test_verifier on x86_64 with llvm-17

Commit Message

Aditi Ghag Feb. 23, 2023, 9:53 p.m. UTC
Batch UDP sockets in the BPF iterator so that BPF/kernel helpers executed
from BPF programs can use overlapping locking semantics. This allows the
BPF socket destroy kfunc (introduced by follow-up patches) to be executed
from BPF iterator programs.

Previously, BPF iterators acquired the sock lock and the sockets hash table
bucket lock while executing BPF programs. This prevented BPF helpers that
also acquire these locks from being executed from BPF iterators. With the
batching approach, we acquire the bucket lock, batch all the sockets in the
bucket, and then release the bucket lock. This enables BPF or kernel
helpers to skip sock locking when invoked in the supported BPF contexts.

The batching logic is similar to the logic implemented in TCP iterator:
https://lore.kernel.org/bpf/20210701200613.1036157-1-kafai@fb.com/.

Suggested-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Aditi Ghag <aditi.ghag@isovalent.com>
---
 net/ipv4/udp.c | 224 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 215 insertions(+), 9 deletions(-)

Comments

Stanislav Fomichev Feb. 24, 2023, 10:32 p.m. UTC | #1
On 02/23, Aditi Ghag wrote:
> Batch UDP sockets in the BPF iterator so that BPF/kernel helpers executed
> from BPF programs can use overlapping locking semantics. This allows the
> BPF socket destroy kfunc (introduced by follow-up patches) to be executed
> from BPF iterator programs.

> Previously, BPF iterators acquired the sock lock and the sockets hash table
> bucket lock while executing BPF programs. This prevented BPF helpers that
> also acquire these locks from being executed from BPF iterators. With the
> batching approach, we acquire the bucket lock, batch all the sockets in the
> bucket, and then release the bucket lock. This enables BPF or kernel
> helpers to skip sock locking when invoked in the supported BPF contexts.

> The batching logic is similar to the logic implemented in TCP iterator:
> https://lore.kernel.org/bpf/20210701200613.1036157-1-kafai@fb.com/.

> Suggested-by: Martin KaFai Lau <martin.lau@kernel.org>
> Signed-off-by: Aditi Ghag <aditi.ghag@isovalent.com>
> ---
>   net/ipv4/udp.c | 224 +++++++++++++++++++++++++++++++++++++++++++++++--
>   1 file changed, 215 insertions(+), 9 deletions(-)

> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index c605d171eb2d..2f3978de45f2 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -3152,6 +3152,141 @@ struct bpf_iter__udp {
>   	int bucket __aligned(8);
>   };

> +struct bpf_udp_iter_state {
> +	struct udp_iter_state state;

[..]

> +	unsigned int cur_sk;
> +	unsigned int end_sk;
> +	unsigned int max_sk;
> +	struct sock **batch;
> +	bool st_bucket_done;

Any chance we can generalize some of those across tcp & udp? I haven't
looked too deep, but a lot of things look like a plain copy-paste
from tcp batching. Or not worth it?

> +};
> +
> +static unsigned short seq_file_family(const struct seq_file *seq);
> +static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
> +				      unsigned int new_batch_sz);
> +
> +static inline bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
> +{
> +	unsigned short family = seq_file_family(seq);
> +
> +	/* AF_UNSPEC is used as a match all */
> +	return ((family == AF_UNSPEC || family == sk->sk_family) &&
> +		net_eq(sock_net(sk), seq_file_net(seq)));
> +}
> +
> +static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
> +{
> +	struct bpf_udp_iter_state *iter = seq->private;
> +	struct udp_iter_state *state = &iter->state;
> +	struct net *net = seq_file_net(seq);
> +	struct udp_seq_afinfo *afinfo = state->bpf_seq_afinfo;
> +	struct udp_table *udptable;
> +	struct sock *first_sk = NULL;
> +	struct sock *sk;
> +	unsigned int bucket_sks = 0;
> +	bool first;
> +	bool resized = false;
> +
> +	/* The current batch is done, so advance the bucket. */
> +	if (iter->st_bucket_done)
> +		state->bucket++;
> +
> +	udptable = udp_get_table_afinfo(afinfo, net);
> +
> +again:
> +	/* New batch for the next bucket.
> +	 * Iterate over the hash table to find a bucket with sockets matching
> +	 * the iterator attributes, and return the first matching socket from
> +	 * the bucket. The remaining matched sockets from the bucket are batched
> +	 * before releasing the bucket lock. This allows BPF programs that are
> +	 * called in seq_show to acquire the bucket lock if needed.
> +	 */
> +	iter->cur_sk = 0;
> +	iter->end_sk = 0;
> +	iter->st_bucket_done = false;
> +	first = true;
> +
> +	for (; state->bucket <= udptable->mask; state->bucket++) {
> +		struct udp_hslot *hslot = &udptable->hash[state->bucket];
> +
> +		if (hlist_empty(&hslot->head))
> +			continue;
> +
> +		spin_lock_bh(&hslot->lock);
> +		sk_for_each(sk, &hslot->head) {
> +			if (seq_sk_match(seq, sk)) {
> +				if (first) {
> +					first_sk = sk;
> +					first = false;
> +				}
> +				if (iter->end_sk < iter->max_sk) {
> +					sock_hold(sk);
> +					iter->batch[iter->end_sk++] = sk;
> +				}
> +				bucket_sks++;
> +			}
> +		}
> +		spin_unlock_bh(&hslot->lock);
> +		if (first_sk)
> +			break;
> +	}
> +
> +	/* All done: no batch made. */
> +	if (!first_sk)
> +		return NULL;
> +
> +	if (iter->end_sk == bucket_sks) {
> +		/* Batching is done for the current bucket; return the first
> +		 * socket to be iterated from the batch.
> +		 */
> +		iter->st_bucket_done = true;
> +		return first_sk;
> +	}
> +	if (!resized && !bpf_iter_udp_realloc_batch(iter, bucket_sks * 3 / 2)) {
> +		resized = true;
> +		/* Go back to the previous bucket to resize its batch. */
> +		state->bucket--;
> +		goto again;
> +	}
> +	return first_sk;
> +}
> +
> +static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> +{
> +	struct bpf_udp_iter_state *iter = seq->private;
> +	struct sock *sk;
> +
> +	/* Whenever seq_next() is called, the iter->cur_sk is
> +	 * done with seq_show(), so unref the iter->cur_sk.
> +	 */
> +	if (iter->cur_sk < iter->end_sk)
> +		sock_put(iter->batch[iter->cur_sk++]);
> +
> +	/* After updating iter->cur_sk, check if there are more sockets
> +	 * available in the current bucket batch.
> +	 */
> +	if (iter->cur_sk < iter->end_sk) {
> +		sk = iter->batch[iter->cur_sk];
> +	} else {
> +		/* Prepare a new batch. */
> +		sk = bpf_iter_udp_batch(seq);
> +	}
> +
> +	++*pos;
> +	return sk;
> +}
> +
> +static void *bpf_iter_udp_seq_start(struct seq_file *seq, loff_t *pos)
> +{
> +	/* bpf iter does not support lseek, so it always
> +	 * continue from where it was stop()-ped.
> +	 */
> +	if (*pos)
> +		return bpf_iter_udp_batch(seq);
> +
> +	return SEQ_START_TOKEN;
> +}
> +
>   static int udp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
>   			     struct udp_sock *udp_sk, uid_t uid, int bucket)
>   {
> @@ -3172,18 +3307,34 @@ static int bpf_iter_udp_seq_show(struct seq_file *seq, void *v)
>   	struct bpf_prog *prog;
>   	struct sock *sk = v;
>   	uid_t uid;
> +	bool slow;
> +	int rc;

>   	if (v == SEQ_START_TOKEN)
>   		return 0;

> +	slow = lock_sock_fast(sk);

Hm, I missed the fact that we're already using fast lock in the tcp batching
as well. Should we not use fast locks here? On a loaded system it's
probably fair to pay some backlog processing in the path that goes
over every socket (here)? Martin, WDYT?

> +
> +	if (unlikely(sk_unhashed(sk))) {
> +		rc = SEQ_SKIP;
> +		goto unlock;
> +	}
> +
>   	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
>   	meta.seq = seq;
>   	prog = bpf_iter_get_info(&meta, false);
> -	return udp_prog_seq_show(prog, &meta, v, uid, state->bucket);
> +	rc = udp_prog_seq_show(prog, &meta, v, uid, state->bucket);
> +
> +unlock:
> +	unlock_sock_fast(sk, slow);
> +	return rc;
>   }

> +static void bpf_iter_udp_unref_batch(struct bpf_udp_iter_state *iter);

Why forward declaration? Why not define the function here?

> +
>   static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v)
>   {
> +	struct bpf_udp_iter_state *iter = seq->private;
>   	struct bpf_iter_meta meta;
>   	struct bpf_prog *prog;

> @@ -3194,15 +3345,31 @@ static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v)
>   			(void)udp_prog_seq_show(prog, &meta, v, 0, 0);
>   	}

> -	udp_seq_stop(seq, v);
> +	if (iter->cur_sk < iter->end_sk) {
> +		bpf_iter_udp_unref_batch(iter);
> +		iter->st_bucket_done = false;
> +	}
>   }

>   static const struct seq_operations bpf_iter_udp_seq_ops = {
> -	.start		= udp_seq_start,
> -	.next		= udp_seq_next,
> +	.start		= bpf_iter_udp_seq_start,
> +	.next		= bpf_iter_udp_seq_next,
>   	.stop		= bpf_iter_udp_seq_stop,
>   	.show		= bpf_iter_udp_seq_show,
>   };
> +
> +static unsigned short seq_file_family(const struct seq_file *seq)
> +{
> +	const struct udp_seq_afinfo *afinfo;
> +
> +	/* BPF iterator: bpf programs to filter sockets. */
> +	if (seq->op == &bpf_iter_udp_seq_ops)
> +		return AF_UNSPEC;
> +
> +	/* Proc fs iterator */
> +	afinfo = pde_data(file_inode(seq->file));
> +	return afinfo->family;
> +}
>   #endif

>   const struct seq_operations udp_seq_ops = {
> @@ -3413,9 +3580,38 @@ static struct pernet_operations __net_initdata udp_sysctl_ops = {
>   DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta,
>   		     struct udp_sock *udp_sk, uid_t uid, int bucket)

> +static void bpf_iter_udp_unref_batch(struct bpf_udp_iter_state *iter)
> +{
> +	while (iter->cur_sk < iter->end_sk)
> +		sock_put(iter->batch[iter->cur_sk++]);
> +}
> +
> +static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
> +				      unsigned int new_batch_sz)
> +{
> +	struct sock **new_batch;
> +
> +	new_batch = kvmalloc_array(new_batch_sz, sizeof(*new_batch),
> +				   GFP_USER | __GFP_NOWARN);
> +	if (!new_batch)
> +		return -ENOMEM;
> +
> +	bpf_iter_udp_unref_batch(iter);
> +	kvfree(iter->batch);
> +	iter->batch = new_batch;
> +	iter->max_sk = new_batch_sz;
> +
> +	return 0;
> +}
> +
> +#define INIT_BATCH_SZ 16
> +
> +static void bpf_iter_fini_udp(void *priv_data);
> +
>   static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
>   {
> -	struct udp_iter_state *st = priv_data;
> +	struct bpf_udp_iter_state *iter = priv_data;
> +	struct udp_iter_state *st = &iter->state;
>   	struct udp_seq_afinfo *afinfo;
>   	int ret;

> @@ -3427,24 +3623,34 @@ static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
>   	afinfo->udp_table = NULL;
>   	st->bpf_seq_afinfo = afinfo;
>   	ret = bpf_iter_init_seq_net(priv_data, aux);
> -	if (ret)
> +	if (ret) {
>   		kfree(afinfo);
> +		return ret;
> +	}
> +	ret = bpf_iter_udp_realloc_batch(iter, INIT_BATCH_SZ);
> +	if (ret) {
> +		bpf_iter_fini_seq_net(priv_data);

Leaking afinfo here? Since we are not freeing it from bpf_iter_fini_udp
any more? (why?)
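A minimal fix for the leak described above could be to free afinfo in this
error path as well (a sketch, not from this series):

	ret = bpf_iter_udp_realloc_batch(iter, INIT_BATCH_SZ);
	if (ret) {
		bpf_iter_fini_seq_net(priv_data);
		/* bpf_iter_fini_udp() no longer frees st->bpf_seq_afinfo,
		 * so drop it here on the error path.
		 */
		kfree(afinfo);
		return ret;
	}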

> +		return ret;
> +	}
> +	iter->cur_sk = 0;
> +	iter->end_sk = 0;
> +
>   	return ret;
>   }

>   static void bpf_iter_fini_udp(void *priv_data)
>   {
> -	struct udp_iter_state *st = priv_data;
> +	struct bpf_udp_iter_state *iter = priv_data;

> -	kfree(st->bpf_seq_afinfo);


>   	bpf_iter_fini_seq_net(priv_data);
> +	kfree(iter->batch);
>   }

>   static const struct bpf_iter_seq_info udp_seq_info = {
>   	.seq_ops		= &bpf_iter_udp_seq_ops,
>   	.init_seq_private	= bpf_iter_init_udp,
>   	.fini_seq_private	= bpf_iter_fini_udp,
> -	.seq_priv_size		= sizeof(struct udp_iter_state),
> +	.seq_priv_size		= sizeof(struct bpf_udp_iter_state),
>   };

>   static struct bpf_iter_reg udp_reg_info = {
> --
> 2.34.1
Martin KaFai Lau Feb. 28, 2023, 7:58 p.m. UTC | #2
On 2/23/23 1:53 PM, Aditi Ghag wrote:
> +struct bpf_udp_iter_state {
> +	struct udp_iter_state state;
> +	unsigned int cur_sk;
> +	unsigned int end_sk;
> +	unsigned int max_sk;
> +	struct sock **batch;
> +	bool st_bucket_done;
> +};
> +
> +static unsigned short seq_file_family(const struct seq_file *seq);
> +static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
> +				      unsigned int new_batch_sz);
> +
> +static inline bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
> +{
> +	unsigned short family = seq_file_family(seq);
> +
> +	/* AF_UNSPEC is used as a match all */
> +	return ((family == AF_UNSPEC || family == sk->sk_family) &&
> +		net_eq(sock_net(sk), seq_file_net(seq)));
> +}
> +
> +static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
> +{
> +	struct bpf_udp_iter_state *iter = seq->private;
> +	struct udp_iter_state *state = &iter->state;
> +	struct net *net = seq_file_net(seq);
> +	struct udp_seq_afinfo *afinfo = state->bpf_seq_afinfo;
> +	struct udp_table *udptable;
> +	struct sock *first_sk = NULL;
> +	struct sock *sk;
> +	unsigned int bucket_sks = 0;
> +	bool first;
> +	bool resized = false;
> +
> +	/* The current batch is done, so advance the bucket. */
> +	if (iter->st_bucket_done)
> +		state->bucket++;
> +
> +	udptable = udp_get_table_afinfo(afinfo, net);
> +
> +again:
> +	/* New batch for the next bucket.
> +	 * Iterate over the hash table to find a bucket with sockets matching
> +	 * the iterator attributes, and return the first matching socket from
> +	 * the bucket. The remaining matched sockets from the bucket are batched
> +	 * before releasing the bucket lock. This allows BPF programs that are
> +	 * called in seq_show to acquire the bucket lock if needed.
> +	 */
> +	iter->cur_sk = 0;
> +	iter->end_sk = 0;
> +	iter->st_bucket_done = false;
> +	first = true;
> +
> +	for (; state->bucket <= udptable->mask; state->bucket++) {
> +		struct udp_hslot *hslot = &udptable->hash[state->bucket];

Since it is mostly separated from the proc's udp-seq-file now, may as well
iterate udptable->hash2, which is hashed by both addr and port, so that
each batch should be smaller.
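For illustration, iterating hash2 instead would look roughly like the sketch
below; this is not part of this patch and assumes the
udp_portaddr_for_each_entry() helper and the hash2 slot layout at the time of
writing:

	for (; state->bucket <= udptable->mask; state->bucket++) {
		struct udp_hslot *hslot2 = &udptable->hash2[state->bucket];

		if (hlist_empty(&hslot2->head))
			continue;

		spin_lock_bh(&hslot2->lock);
		/* hash2 chains sockets via skc_portaddr_node, so it needs a
		 * different iterator than the primary hash.
		 */
		udp_portaddr_for_each_entry(sk, &hslot2->head) {
			if (seq_sk_match(seq, sk)) {
				/* batch the socket as in the loop above */
			}
		}
		spin_unlock_bh(&hslot2->lock);
	}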

> +
> +		if (hlist_empty(&hslot->head))
> +			continue;
> +
> +		spin_lock_bh(&hslot->lock);
> +		sk_for_each(sk, &hslot->head) {
> +			if (seq_sk_match(seq, sk)) {
> +				if (first) {
> +					first_sk = sk;
> +					first = false;
> +				}
> +				if (iter->end_sk < iter->max_sk) {
> +					sock_hold(sk);
> +					iter->batch[iter->end_sk++] = sk;
> +				}
> +				bucket_sks++;
> +			}
> +		}
> +		spin_unlock_bh(&hslot->lock);
> +		if (first_sk)
> +			break;
> +	}
> +
> +	/* All done: no batch made. */
> +	if (!first_sk)
> +		return NULL;

I think first_sk and bucket_sks need to be reset on the "again" case also?

If bpf_iter_udp_seq_stop() is called before a batch has been fully processed by 
the bpf prog in ".show", how does the next bpf_iter_udp_seq_start() continue 
from where it left off? The bpf_tcp_iter remembers the bucket and the 
offset-in-this-bucket. I think bpf_udp_iter can do something similar.

> +
> +	if (iter->end_sk == bucket_sks) {
> +		/* Batching is done for the current bucket; return the first
> +		 * socket to be iterated from the batch.
> +		 */
> +		iter->st_bucket_done = true;
> +		return first_sk;
> +	}
> +	if (!resized && !bpf_iter_udp_realloc_batch(iter, bucket_sks * 3 / 2)) {
> +		resized = true;
> +		/* Go back to the previous bucket to resize its batch. */
> +		state->bucket--;
> +		goto again;
> +	}
> +	return first_sk;
> +}
> +

[ ... ]

>   static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
>   {
> -	struct udp_iter_state *st = priv_data;
> +	struct bpf_udp_iter_state *iter = priv_data;
> +	struct udp_iter_state *st = &iter->state;
>   	struct udp_seq_afinfo *afinfo;
>   	int ret;
>   
> @@ -3427,24 +3623,34 @@ static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
>   	afinfo->udp_table = NULL;
>   	st->bpf_seq_afinfo = afinfo;

Is bpf_seq_afinfo still needed in 'struct udp_iter_state'? Can it be removed?

>   	ret = bpf_iter_init_seq_net(priv_data, aux);
> -	if (ret)
> +	if (ret) {
>   		kfree(afinfo);
> +		return ret;
> +	}
> +	ret = bpf_iter_udp_realloc_batch(iter, INIT_BATCH_SZ);
> +	if (ret) {
> +		bpf_iter_fini_seq_net(priv_data);
> +		return ret;
> +	}
> +	iter->cur_sk = 0;
> +	iter->end_sk = 0;
> +
>   	return ret;
>   }
>   
>   static void bpf_iter_fini_udp(void *priv_data)
>   {
> -	struct udp_iter_state *st = priv_data;
> +	struct bpf_udp_iter_state *iter = priv_data;
>   
> -	kfree(st->bpf_seq_afinfo);
>   	bpf_iter_fini_seq_net(priv_data);
> +	kfree(iter->batch);
kvfree should be used here, since the batch is allocated with kvmalloc_array.
Martin KaFai Lau Feb. 28, 2023, 8:32 p.m. UTC | #3
On 2/24/23 2:32 PM, Stanislav Fomichev wrote:
>> +    unsigned int cur_sk;
>> +    unsigned int end_sk;
>> +    unsigned int max_sk;
>> +    struct sock **batch;
>> +    bool st_bucket_done;
> 
> Any chance we can generalize some of those across tcp & udp? I haven't
> looked too deep, but a lot of things look like a plain copy-paste
> from tcp batching. Or not worth it?

The batching has some small but subtle differences between tcp and udp, so not 
sure if it can end up sharing enough code.

>>   static int udp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
>>                    struct udp_sock *udp_sk, uid_t uid, int bucket)
>>   {
>> @@ -3172,18 +3307,34 @@ static int bpf_iter_udp_seq_show(struct seq_file *seq, void *v)
>>       struct bpf_prog *prog;
>>       struct sock *sk = v;
>>       uid_t uid;
>> +    bool slow;
>> +    int rc;
> 
>>       if (v == SEQ_START_TOKEN)
>>           return 0;
> 
>> +    slow = lock_sock_fast(sk);
> 
> Hm, I missed the fact that we're already using fast lock in the tcp batching
> as well. Should we not use fast locks here? On a loaded system it's
> probably fair to pay some backlog processing in the path that goes
> over every socket (here)? Martin, WDYT?

hmm... not sure if it is needed. The lock_sock_fast was borrowed from 
tcp_get_info() which is also used in inet_diag iteration. bpf iter prog should 
be doing something pretty fast also. In the future, it could allow the bpf-iter 
program to acquire the lock by itself only when it is necessary if the current 
always lock strategy is too expensive.
Stanislav Fomichev Feb. 28, 2023, 8:52 p.m. UTC | #4
On Tue, Feb 28, 2023 at 12:32 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 2/24/23 2:32 PM, Stanislav Fomichev wrote:
> >> +    unsigned int cur_sk;
> >> +    unsigned int end_sk;
> >> +    unsigned int max_sk;
> >> +    struct sock **batch;
> >> +    bool st_bucket_done;
> >
> > Any chance we can generalize some of those across tcp & udp? I haven't
> > looked too deep, but a lot of things look like a plain copy-paste
> > from tcp batching. Or not worth it?
>
> The batching has some small but subtle differences between tcp and udp, so not
> sure if it can end up sharing enough code.
>
> >>   static int udp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
> >>                    struct udp_sock *udp_sk, uid_t uid, int bucket)
> >>   {
> >> @@ -3172,18 +3307,34 @@ static int bpf_iter_udp_seq_show(struct seq_file *seq, void *v)
> >>       struct bpf_prog *prog;
> >>       struct sock *sk = v;
> >>       uid_t uid;
> >> +    bool slow;
> >> +    int rc;
> >
> >>       if (v == SEQ_START_TOKEN)
> >>           return 0;
> >
> >> +    slow = lock_sock_fast(sk);
> >
> > Hm, I missed the fact that we're already using fast lock in the tcp batching
> > as well. Should we not use fast locks here? On a loaded system it's
> > probably fair to pay some backlog processing in the path that goes
> > over every socket (here)? Martin, WDYT?
>
> hmm... not sure if it is needed. The lock_sock_fast was borrowed from
> tcp_get_info() which is also used in inet_diag iteration. bpf iter prog should
> be doing something pretty fast also. In the future, it could allow the bpf-iter
> program to acquire the lock by itself only when it is necessary if the current
> always lock strategy is too expensive.

Cool, thx!
Aditi Ghag March 1, 2023, 2:40 a.m. UTC | #5
>> +
>> +		if (hlist_empty(&hslot->head))
>> +			continue;
>> +
>> +		spin_lock_bh(&hslot->lock);
>> +		sk_for_each(sk, &hslot->head) {
>> +			if (seq_sk_match(seq, sk)) {
>> +				if (first) {
>> +					first_sk = sk;
>> +					first = false;
>> +				}
>> +				if (iter->end_sk < iter->max_sk) {
>> +					sock_hold(sk);
>> +					iter->batch[iter->end_sk++] = sk;
>> +				}
>> +				bucket_sks++;
>> +			}
>> +		}
>> +		spin_unlock_bh(&hslot->lock);
>> +		if (first_sk)
>> +			break;
>> +	}
>> +
>> +	/* All done: no batch made. */
>> +	if (!first_sk)
>> +		return NULL;
> 
> I think first_sk and bucket_sks need to be reset on the "again" case also?
> 
> If bpf_iter_udp_seq_stop() is called before a batch has been fully processed by the bpf prog in ".show", how does the next bpf_iter_udp_seq_start() continue from where it left off? The bpf_tcp_iter remembers the bucket and the offset-in-this-bucket. I think bpf_udp_iter can do something similar.

Hmm, I suppose you are referring to the `tcp_seek_last_pos` logic? This was the [1] commit that added the optimization in v2.6, but only for TCP. Extending the same logic for UDP is out of the scope of this PR? Although reading the [1] commit description, the latency issue seems to be specific to the file based iterators, no? Of course, BPF iterators are quite new, and I'm not sure if we have the same "slowness" reported in that commit. Having said that, do we expect users to start from where they previously left off by stopping BPF iterators? Regardless, I'll do some testing to ensure that we at least don't crash. 


[1] https://github.com/torvalds/linux/commit/a8b690f98baf9fb1902b8eeab801351ea603fa3a
Martin KaFai Lau March 2, 2023, 6:43 a.m. UTC | #6
On 2/28/23 6:40 PM, Aditi Ghag wrote:
> 
>>> +
>>> +		if (hlist_empty(&hslot->head))
>>> +			continue;
>>> +
>>> +		spin_lock_bh(&hslot->lock);
>>> +		sk_for_each(sk, &hslot->head) {
>>> +			if (seq_sk_match(seq, sk)) {
>>> +				if (first) {
>>> +					first_sk = sk;
>>> +					first = false;
>>> +				}
>>> +				if (iter->end_sk < iter->max_sk) {
>>> +					sock_hold(sk);
>>> +					iter->batch[iter->end_sk++] = sk;
>>> +				}
>>> +				bucket_sks++;
>>> +			}
>>> +		}
>>> +		spin_unlock_bh(&hslot->lock);
>>> +		if (first_sk)
>>> +			break;
>>> +	}
>>> +
>>> +	/* All done: no batch made. */
>>> +	if (!first_sk)
>>> +		return NULL;
>>
>> I think first_sk and bucket_sks need to be reset on the "again" case also?
>>
>> If bpf_iter_udp_seq_stop() is called before a batch has been fully processed by the bpf prog in ".show", how does the next bpf_iter_udp_seq_start() continue from where it left off? The bpf_tcp_iter remembers the bucket and the offset-in-this-bucket. I think bpf_udp_iter can do something similar.
> 
> Hmm, I suppose you are referring to the `tcp_seek_last_pos` logic? This was the [1] commit that added the optimization in v2.6, but only for TCP. Extending the same logic for UDP is out of the scope of this PR? Although reading the [1] commit description, the latency issue seems to be specific to the file based iterators, no? Of course, BPF iterators are quite new, and I'm not sure if we have the same "slowness" reported in that commit. Having said that, do we expect users to start from where they previously left off by stopping BPF iterators? Regardless, I'll do some testing to ensure that we at least don't crash.

It is about ensuring the bpf-udp-iter makes forward progress, although speeding
up is another plus. The iteration may have to be stopped (i.e.
bpf_iter_udp_seq_stop) for other reasons. The bpf-(udp|tcp|unix)-iter is not
only for doing bpf_setsockopt or bpf_sock_destroy. A major use case is
bpf_seq_printf, e.g. when seq_has_overflowed() in bpf_seq_read,
bpf_iter_udp_seq_stop will be called.

If I read this patch correctly, bpf_iter_udp_seq_start() always starts from the
beginning of the current bucket. The bpf-iter cannot make forward progress if
bpf_iter_udp_seq_stop is called in the middle of the bucket.

The "resume" is currently done by udp_get_idx(, *pos..) but is taken away from
bpf-udp-iter in this patch. udp_get_idx(...) starts counting from the very first
bucket and could potentially be reused here. However, I believe it is easier for
this patch to just resume from the current bucket. Everything is there; all it
needs is to remember its previous position/offset within the bucket.

> 
> 
> [1] https://github.com/torvalds/linux/commit/a8b690f98baf9fb1902b8eeab801351ea603fa3a
> 
>
diff mbox series

Patch

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index c605d171eb2d..2f3978de45f2 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -3152,6 +3152,141 @@  struct bpf_iter__udp {
 	int bucket __aligned(8);
 };
 
+struct bpf_udp_iter_state {
+	struct udp_iter_state state;
+	unsigned int cur_sk;
+	unsigned int end_sk;
+	unsigned int max_sk;
+	struct sock **batch;
+	bool st_bucket_done;
+};
+
+static unsigned short seq_file_family(const struct seq_file *seq);
+static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
+				      unsigned int new_batch_sz);
+
+static inline bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
+{
+	unsigned short family = seq_file_family(seq);
+
+	/* AF_UNSPEC is used as a match all */
+	return ((family == AF_UNSPEC || family == sk->sk_family) &&
+		net_eq(sock_net(sk), seq_file_net(seq)));
+}
+
+static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
+{
+	struct bpf_udp_iter_state *iter = seq->private;
+	struct udp_iter_state *state = &iter->state;
+	struct net *net = seq_file_net(seq);
+	struct udp_seq_afinfo *afinfo = state->bpf_seq_afinfo;
+	struct udp_table *udptable;
+	struct sock *first_sk = NULL;
+	struct sock *sk;
+	unsigned int bucket_sks = 0;
+	bool first;
+	bool resized = false;
+
+	/* The current batch is done, so advance the bucket. */
+	if (iter->st_bucket_done)
+		state->bucket++;
+
+	udptable = udp_get_table_afinfo(afinfo, net);
+
+again:
+	/* New batch for the next bucket.
+	 * Iterate over the hash table to find a bucket with sockets matching
+	 * the iterator attributes, and return the first matching socket from
+	 * the bucket. The remaining matched sockets from the bucket are batched
+	 * before releasing the bucket lock. This allows BPF programs that are
+	 * called in seq_show to acquire the bucket lock if needed.
+	 */
+	iter->cur_sk = 0;
+	iter->end_sk = 0;
+	iter->st_bucket_done = false;
+	first = true;
+
+	for (; state->bucket <= udptable->mask; state->bucket++) {
+		struct udp_hslot *hslot = &udptable->hash[state->bucket];
+
+		if (hlist_empty(&hslot->head))
+			continue;
+
+		spin_lock_bh(&hslot->lock);
+		sk_for_each(sk, &hslot->head) {
+			if (seq_sk_match(seq, sk)) {
+				if (first) {
+					first_sk = sk;
+					first = false;
+				}
+				if (iter->end_sk < iter->max_sk) {
+					sock_hold(sk);
+					iter->batch[iter->end_sk++] = sk;
+				}
+				bucket_sks++;
+			}
+		}
+		spin_unlock_bh(&hslot->lock);
+		if (first_sk)
+			break;
+	}
+
+	/* All done: no batch made. */
+	if (!first_sk)
+		return NULL;
+
+	if (iter->end_sk == bucket_sks) {
+		/* Batching is done for the current bucket; return the first
+		 * socket to be iterated from the batch.
+		 */
+		iter->st_bucket_done = true;
+		return first_sk;
+	}
+	if (!resized && !bpf_iter_udp_realloc_batch(iter, bucket_sks * 3 / 2)) {
+		resized = true;
+		/* Go back to the previous bucket to resize its batch. */
+		state->bucket--;
+		goto again;
+	}
+	return first_sk;
+}
+
+static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct bpf_udp_iter_state *iter = seq->private;
+	struct sock *sk;
+
+	/* Whenever seq_next() is called, the iter->cur_sk is
+	 * done with seq_show(), so unref the iter->cur_sk.
+	 */
+	if (iter->cur_sk < iter->end_sk)
+		sock_put(iter->batch[iter->cur_sk++]);
+
+	/* After updating iter->cur_sk, check if there are more sockets
+	 * available in the current bucket batch.
+	 */
+	if (iter->cur_sk < iter->end_sk) {
+		sk = iter->batch[iter->cur_sk];
+	} else {
+		/* Prepare a new batch. */
+		sk = bpf_iter_udp_batch(seq);
+	}
+
+	++*pos;
+	return sk;
+}
+
+static void *bpf_iter_udp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	/* bpf iter does not support lseek, so it always
+	 * continue from where it was stop()-ped.
+	 */
+	if (*pos)
+		return bpf_iter_udp_batch(seq);
+
+	return SEQ_START_TOKEN;
+}
+
 static int udp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
 			     struct udp_sock *udp_sk, uid_t uid, int bucket)
 {
@@ -3172,18 +3307,34 @@  static int bpf_iter_udp_seq_show(struct seq_file *seq, void *v)
 	struct bpf_prog *prog;
 	struct sock *sk = v;
 	uid_t uid;
+	bool slow;
+	int rc;
 
 	if (v == SEQ_START_TOKEN)
 		return 0;
 
+	slow = lock_sock_fast(sk);
+
+	if (unlikely(sk_unhashed(sk))) {
+		rc = SEQ_SKIP;
+		goto unlock;
+	}
+
 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
 	meta.seq = seq;
 	prog = bpf_iter_get_info(&meta, false);
-	return udp_prog_seq_show(prog, &meta, v, uid, state->bucket);
+	rc = udp_prog_seq_show(prog, &meta, v, uid, state->bucket);
+
+unlock:
+	unlock_sock_fast(sk, slow);
+	return rc;
 }
 
+static void bpf_iter_udp_unref_batch(struct bpf_udp_iter_state *iter);
+
 static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v)
 {
+	struct bpf_udp_iter_state *iter = seq->private;
 	struct bpf_iter_meta meta;
 	struct bpf_prog *prog;
 
@@ -3194,15 +3345,31 @@  static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v)
 			(void)udp_prog_seq_show(prog, &meta, v, 0, 0);
 	}
 
-	udp_seq_stop(seq, v);
+	if (iter->cur_sk < iter->end_sk) {
+		bpf_iter_udp_unref_batch(iter);
+		iter->st_bucket_done = false;
+	}
 }
 
 static const struct seq_operations bpf_iter_udp_seq_ops = {
-	.start		= udp_seq_start,
-	.next		= udp_seq_next,
+	.start		= bpf_iter_udp_seq_start,
+	.next		= bpf_iter_udp_seq_next,
 	.stop		= bpf_iter_udp_seq_stop,
 	.show		= bpf_iter_udp_seq_show,
 };
+
+static unsigned short seq_file_family(const struct seq_file *seq)
+{
+	const struct udp_seq_afinfo *afinfo;
+
+	/* BPF iterator: bpf programs to filter sockets. */
+	if (seq->op == &bpf_iter_udp_seq_ops)
+		return AF_UNSPEC;
+
+	/* Proc fs iterator */
+	afinfo = pde_data(file_inode(seq->file));
+	return afinfo->family;
+}
 #endif
 
 const struct seq_operations udp_seq_ops = {
@@ -3413,9 +3580,38 @@  static struct pernet_operations __net_initdata udp_sysctl_ops = {
 DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta,
 		     struct udp_sock *udp_sk, uid_t uid, int bucket)
 
+static void bpf_iter_udp_unref_batch(struct bpf_udp_iter_state *iter)
+{
+	while (iter->cur_sk < iter->end_sk)
+		sock_put(iter->batch[iter->cur_sk++]);
+}
+
+static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
+				      unsigned int new_batch_sz)
+{
+	struct sock **new_batch;
+
+	new_batch = kvmalloc_array(new_batch_sz, sizeof(*new_batch),
+				   GFP_USER | __GFP_NOWARN);
+	if (!new_batch)
+		return -ENOMEM;
+
+	bpf_iter_udp_unref_batch(iter);
+	kvfree(iter->batch);
+	iter->batch = new_batch;
+	iter->max_sk = new_batch_sz;
+
+	return 0;
+}
+
+#define INIT_BATCH_SZ 16
+
+static void bpf_iter_fini_udp(void *priv_data);
+
 static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
 {
-	struct udp_iter_state *st = priv_data;
+	struct bpf_udp_iter_state *iter = priv_data;
+	struct udp_iter_state *st = &iter->state;
 	struct udp_seq_afinfo *afinfo;
 	int ret;
 
@@ -3427,24 +3623,34 @@  static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
 	afinfo->udp_table = NULL;
 	st->bpf_seq_afinfo = afinfo;
 	ret = bpf_iter_init_seq_net(priv_data, aux);
-	if (ret)
+	if (ret) {
 		kfree(afinfo);
+		return ret;
+	}
+	ret = bpf_iter_udp_realloc_batch(iter, INIT_BATCH_SZ);
+	if (ret) {
+		bpf_iter_fini_seq_net(priv_data);
+		return ret;
+	}
+	iter->cur_sk = 0;
+	iter->end_sk = 0;
+
 	return ret;
 }
 
 static void bpf_iter_fini_udp(void *priv_data)
 {
-	struct udp_iter_state *st = priv_data;
+	struct bpf_udp_iter_state *iter = priv_data;
 
-	kfree(st->bpf_seq_afinfo);
 	bpf_iter_fini_seq_net(priv_data);
+	kfree(iter->batch);
 }
 
 static const struct bpf_iter_seq_info udp_seq_info = {
 	.seq_ops		= &bpf_iter_udp_seq_ops,
 	.init_seq_private	= bpf_iter_init_udp,
 	.fini_seq_private	= bpf_iter_fini_udp,
-	.seq_priv_size		= sizeof(struct udp_iter_state),
+	.seq_priv_size		= sizeof(struct bpf_udp_iter_state),
 };
 
 static struct bpf_iter_reg udp_reg_info = {