
[v3,bpf-next,2/5] bpf: Add bpf_sock_destroy kfunc

Message ID 20230321184541.1857363-3-aditi.ghag@isovalent.com (mailing list archive)
State RFC
Delegated to: BPF
Series bpf-next: Add socket destroy capability

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf-next
netdev/apply fail Patch does not apply to bpf-next
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-5 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-7 success Logs for llvm-toolchain
bpf/vmtest-bpf-next-VM_Test-8 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_maps on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-11 success Logs for test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-14 success Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for test_progs on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-19 success Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for test_progs_no_alu32 on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-21 success Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_progs_no_alu32_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-28 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-29 success Logs for test_progs_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-30 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-32 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-33 success Logs for test_verifier on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-34 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-35 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-36 success Logs for test_verifier on x86_64 with llvm-16

Commit Message

Aditi Ghag March 21, 2023, 6:45 p.m. UTC
The socket destroy kfunc is used to forcefully terminate sockets from
certain BPF contexts. We plan to use the capability in Cilium to force
client sockets to reconnect when their remote load-balancing backends are
deleted. The other use case is on-the-fly policy enforcement, where
existing socket connections that are disallowed by policies need to be
forcefully terminated. The helper allows terminating sockets that may or
may not be actively sending traffic.

The helper is currently exposed to certain BPF iterators where users can
filter and terminate selected sockets. Additionally, the helper can only
be called from BPF contexts that ensure socket locking, so that the
destroy handlers, which also acquire socket locks, can run synchronously.
The previous commit, which batches UDP sockets during iteration, enables
such synchronous invocation of the destroy handler from BPF context by
skipping the socket locks in the handler itself. TCP iterators already
support batching.

The helper takes a `sock_common` type argument, even though it expects a
full socket, and casts it to a `sock` pointer. This enables the verifier
to allow the sock_destroy kfunc to be called for TCP with `sock_common`
and for UDP with `sock` structs. For comparison, BPF helpers enable this
behavior with the `ARG_PTR_TO_BTF_ID_SOCK_COMMON` argument type. However,
no such option is available in the verifier logic that handles kfuncs,
where BTF types are inferred. Furthermore, as `sock_common` only has a
subset of the fields of `sock`, casting a pointer from the former to the
latter type might not always be safe. Hence, the BPF kfunc converts the
argument to a full sock before casting.

Signed-off-by: Aditi Ghag <aditi.ghag@isovalent.com>
---
 include/net/udp.h |  1 +
 net/core/filter.c | 54 ++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp.c    | 16 +++++++++----
 net/ipv4/udp.c    | 60 +++++++++++++++++++++++++++++++++++++----------
 4 files changed, 114 insertions(+), 17 deletions(-)
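
For reference, a minimal sketch of how a BPF iterator program could call
this kfunc from the TCP socket iterator (illustrative only, not part of
this patch: the program name and port filter are made up, vmlinux.h is
assumed to provide struct bpf_iter__tcp, and the kfunc is assumed to be
declared as a __ksym extern):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

int bpf_sock_destroy(struct sock_common *sk) __ksym;

SEC("iter/tcp")
int iter_tcp_destroy(struct bpf_iter__tcp *ctx)
{
	struct sock_common *sk_common = ctx->sk_common;

	/* The iterator can yield a NULL entry; skip it. */
	if (!sk_common)
		return 0;

	/* Example filter: abort connections to remote port 4321. */
	if (sk_common->skc_dport == bpf_htons(4321))
		bpf_sock_destroy(sk_common);

	return 0;
}

char _license[] SEC("license") = "GPL";

Attaching such an iterator and reading from its iterator link (e.g. via a
read() on the iterator fd) would walk the sockets and destroy the ones
that match the filter.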

Comments

Stanislav Fomichev March 21, 2023, 9:02 p.m. UTC | #1
On 03/21, Aditi Ghag wrote:
> The socket destroy kfunc is used to forcefully terminate sockets from
> certain BPF contexts. We plan to use the capability in Cilium to force
> client sockets to reconnect when their remote load-balancing backends are
> deleted. The other use case is on-the-fly policy enforcement where  
> existing
> socket connections prevented by policies need to be forcefully terminated.
> The helper allows terminating sockets that may or may not be actively
> sending traffic.

> The helper is currently exposed to certain BPF iterators where users can
> filter, and terminate selected sockets.  Additionally, the helper can only
> be called from these BPF contexts that ensure socket locking in order to
> allow synchronous execution of destroy helpers that also acquire socket
> locks. The previous commit that batches UDP sockets during iteration
> facilitated a synchronous invocation of the destroy helper from BPF  
> context
> by skipping taking socket locks in the destroy handler. TCP iterators
> already supported batching.

> The helper takes `sock_common` type argument, even though it expects, and
> casts them to a `sock` pointer. This enables the verifier to allow the
> sock_destroy kfunc to be called for TCP with `sock_common` and UDP with
> `sock` structs. As a comparison, BPF helpers enable this behavior with the
> `ARG_PTR_TO_BTF_ID_SOCK_COMMON` argument type. However, there is no such
> option available with the verifier logic that handles kfuncs where BTF
> types are inferred. Furthermore, as `sock_common` only has a subset of
> certain fields of `sock`, casting pointer to the latter type might not
> always be safe. Hence, the BPF kfunc converts the argument to a full sock
> before casting.

> Signed-off-by: Aditi Ghag <aditi.ghag@isovalent.com>
> ---
>   include/net/udp.h |  1 +
>   net/core/filter.c | 54 ++++++++++++++++++++++++++++++++++++++++++
>   net/ipv4/tcp.c    | 16 +++++++++----
>   net/ipv4/udp.c    | 60 +++++++++++++++++++++++++++++++++++++----------
>   4 files changed, 114 insertions(+), 17 deletions(-)

> diff --git a/include/net/udp.h b/include/net/udp.h
> index de4b528522bb..d2999447d3f2 100644
> --- a/include/net/udp.h
> +++ b/include/net/udp.h
> @@ -437,6 +437,7 @@ struct udp_seq_afinfo {
>   struct udp_iter_state {
>   	struct seq_net_private  p;
>   	int			bucket;
> +	int			offset;
>   	struct udp_seq_afinfo	*bpf_seq_afinfo;
>   };

> diff --git a/net/core/filter.c b/net/core/filter.c
> index 1d6f165923bf..ba3e0dac119c 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -11621,3 +11621,57 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id)

>   	return func;
>   }
> +
> +/* Disables missing prototype warnings */
> +__diag_push();
> +__diag_ignore_all("-Wmissing-prototypes",
> +		  "Global functions as their definitions will be in vmlinux BTF");
> +
> +/* bpf_sock_destroy: Destroy the given socket with ECONNABORTED error  
> code.
> + *
> + * The helper expects a non-NULL pointer to a socket. It invokes the
> + * protocol specific socket destroy handlers.
> + *
> + * The helper can only be called from BPF contexts that have acquired  
> the socket
> + * locks.
> + *
> + * Parameters:
> + * @sock: Pointer to socket to be destroyed
> + *
> + * Return:
> + * On error, may return EPROTONOSUPPORT, EINVAL.
> + * EPROTONOSUPPORT if protocol specific destroy handler is not  
> implemented.
> + * 0 otherwise
> + */
> +__bpf_kfunc int bpf_sock_destroy(struct sock_common *sock)
> +{
> +	struct sock *sk = (struct sock *)sock;
> +
> +	if (!sk)
> +		return -EINVAL;
> +
> +	/* The locking semantics that allow for synchronous execution of the
> +	 * destroy handlers are only supported for TCP and UDP.
> +	 */
> +	if (!sk->sk_prot->diag_destroy || sk->sk_protocol == IPPROTO_RAW)
> +		return -EOPNOTSUPP;

What makes IPPROTO_RAW special? Looks like it locks the socket as well?

> +
> +	return sk->sk_prot->diag_destroy(sk, ECONNABORTED);
> +}
> +
> +__diag_pop()
> +
> +BTF_SET8_START(sock_destroy_kfunc_set)
> +BTF_ID_FLAGS(func, bpf_sock_destroy)
> +BTF_SET8_END(sock_destroy_kfunc_set)
> +
> +static const struct btf_kfunc_id_set bpf_sock_destroy_kfunc_set = {
> +	.owner = THIS_MODULE,
> +	.set   = &sock_destroy_kfunc_set,
> +};
> +
> +static int init_subsystem(void)
> +{
> +	return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,  
> &bpf_sock_destroy_kfunc_set);
> +}
> +late_initcall(init_subsystem);
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 33f559f491c8..59a833c0c872 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -4678,8 +4678,10 @@ int tcp_abort(struct sock *sk, int err)
>   		return 0;
>   	}

> -	/* Don't race with userspace socket closes such as tcp_close. */
> -	lock_sock(sk);
> +	/* BPF context ensures sock locking. */
> +	if (!has_current_bpf_ctx())
> +		/* Don't race with userspace socket closes such as tcp_close. */
> +		lock_sock(sk);

>   	if (sk->sk_state == TCP_LISTEN) {
>   		tcp_set_state(sk, TCP_CLOSE);
> @@ -4688,7 +4690,8 @@ int tcp_abort(struct sock *sk, int err)

>   	/* Don't race with BH socket closes such as inet_csk_listen_stop. */
>   	local_bh_disable();

[..]

> -	bh_lock_sock(sk);
> +	if (!has_current_bpf_ctx())
> +		bh_lock_sock(sk);

These are spinlocks and should probably be grabbed in the bpf context as
well?


>   	if (!sock_flag(sk, SOCK_DEAD)) {
>   		sk->sk_err = err;
> @@ -4700,10 +4703,13 @@ int tcp_abort(struct sock *sk, int err)
>   		tcp_done(sk);
>   	}

> -	bh_unlock_sock(sk);
> +	if (!has_current_bpf_ctx())
> +		bh_unlock_sock(sk);
> +
>   	local_bh_enable();
>   	tcp_write_queue_purge(sk);
> -	release_sock(sk);
> +	if (!has_current_bpf_ctx())
> +		release_sock(sk);
>   	return 0;
>   }
>   EXPORT_SYMBOL_GPL(tcp_abort);
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index 545e56329355..02d357713838 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -2925,7 +2925,9 @@ EXPORT_SYMBOL(udp_poll);

>   int udp_abort(struct sock *sk, int err)
>   {
> -	lock_sock(sk);
> +	/* BPF context ensures sock locking. */
> +	if (!has_current_bpf_ctx())
> +		lock_sock(sk);

>   	/* udp{v6}_destroy_sock() sets it under the sk lock, avoid racing
>   	 * with close()
> @@ -2938,7 +2940,8 @@ int udp_abort(struct sock *sk, int err)
>   	__udp_disconnect(sk, 0);

>   out:
> -	release_sock(sk);
> +	if (!has_current_bpf_ctx())
> +		release_sock(sk);

>   	return 0;
>   }
> @@ -3184,15 +3187,23 @@ static struct sock *bpf_iter_udp_batch(struct  
> seq_file *seq)
>   	struct sock *first_sk = NULL;
>   	struct sock *sk;
>   	unsigned int bucket_sks = 0;
> -	bool first;
>   	bool resized = false;
> +	int offset = 0;
> +	int new_offset;

>   	/* The current batch is done, so advance the bucket. */
> -	if (iter->st_bucket_done)
> +	if (iter->st_bucket_done) {
>   		state->bucket++;
> +		state->offset = 0;
> +	}

>   	udptable = udp_get_table_afinfo(afinfo, net);

> +	if (state->bucket > udptable->mask) {
> +		state->bucket = 0;
> +		state->offset = 0;
> +	}
> +
>   again:
>   	/* New batch for the next bucket.
>   	 * Iterate over the hash table to find a bucket with sockets matching
> @@ -3204,43 +3215,60 @@ static struct sock *bpf_iter_udp_batch(struct  
> seq_file *seq)
>   	iter->cur_sk = 0;
>   	iter->end_sk = 0;
>   	iter->st_bucket_done = false;
> -	first = true;
> +	first_sk = NULL;
> +	bucket_sks = 0;
> +	offset = state->offset;
> +	new_offset = offset;

>   	for (; state->bucket <= udptable->mask; state->bucket++) {
>   		struct udp_hslot *hslot = &udptable->hash[state->bucket];

> -		if (hlist_empty(&hslot->head))
> +		if (hlist_empty(&hslot->head)) {
> +			offset = 0;
>   			continue;
> +		}

>   		spin_lock_bh(&hslot->lock);
> +		/* Resume from the last saved position in a bucket before
> +		 * iterator was stopped.
> +		 */
> +		while (offset-- > 0) {
> +			sk_for_each(sk, &hslot->head)
> +				continue;
> +		}
>   		sk_for_each(sk, &hslot->head) {
>   			if (seq_sk_match(seq, sk)) {
> -				if (first) {
> +				if (!first_sk)
>   					first_sk = sk;
> -					first = false;
> -				}
>   				if (iter->end_sk < iter->max_sk) {
>   					sock_hold(sk);
>   					iter->batch[iter->end_sk++] = sk;
>   				}
>   				bucket_sks++;
>   			}
> +			new_offset++;
>   		}
>   		spin_unlock_bh(&hslot->lock);
> +
>   		if (first_sk)
>   			break;
> +
> +		/* Reset the current bucket's offset before moving to the next bucket.  
> */
> +		offset = 0;
> +		new_offset = 0;
> +
>   	}

>   	/* All done: no batch made. */
>   	if (!first_sk)
> -		return NULL;
> +		goto ret;

>   	if (iter->end_sk == bucket_sks) {
>   		/* Batching is done for the current bucket; return the first
>   		 * socket to be iterated from the batch.
>   		 */
>   		iter->st_bucket_done = true;
> -		return first_sk;
> +		goto ret;
>   	}
>   	if (!resized && !bpf_iter_udp_realloc_batch(iter, bucket_sks * 3 / 2)) {
>   		resized = true;
> @@ -3248,19 +3276,24 @@ static struct sock *bpf_iter_udp_batch(struct  
> seq_file *seq)
>   		state->bucket--;
>   		goto again;
>   	}
> +ret:
> +	state->offset = new_offset;
>   	return first_sk;
>   }

>   static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t  
> *pos)
>   {
>   	struct bpf_udp_iter_state *iter = seq->private;
> +	struct udp_iter_state *state = &iter->state;
>   	struct sock *sk;

>   	/* Whenever seq_next() is called, the iter->cur_sk is
>   	 * done with seq_show(), so unref the iter->cur_sk.
>   	 */
> -	if (iter->cur_sk < iter->end_sk)
> +	if (iter->cur_sk < iter->end_sk) {
>   		sock_put(iter->batch[iter->cur_sk++]);
> +		++state->offset;
> +	}

>   	/* After updating iter->cur_sk, check if there are more sockets
>   	 * available in the current bucket batch.
> @@ -3630,6 +3663,9 @@ static int bpf_iter_init_udp(void *priv_data,  
> struct bpf_iter_aux_info *aux)
>   	}
>   	iter->cur_sk = 0;
>   	iter->end_sk = 0;
> +	iter->st_bucket_done = false;
> +	st->bucket = 0;
> +	st->offset = 0;

>   	return ret;
>   }
> --
> 2.34.1
Martin KaFai Lau March 21, 2023, 11:43 p.m. UTC | #2
On 3/21/23 11:45 AM, Aditi Ghag wrote:
> diff --git a/include/net/udp.h b/include/net/udp.h
> index de4b528522bb..d2999447d3f2 100644
> --- a/include/net/udp.h
> +++ b/include/net/udp.h
> @@ -437,6 +437,7 @@ struct udp_seq_afinfo {
>   struct udp_iter_state {
>   	struct seq_net_private  p;
>   	int			bucket;
> +	int			offset;

All offset change is easier to review with patch 1 together, so please move them 
to patch 1.

>   	struct udp_seq_afinfo	*bpf_seq_afinfo;
>   };
>   
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 1d6f165923bf..ba3e0dac119c 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -11621,3 +11621,57 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id)
>   
>   	return func;
>   }
> +
> +/* Disables missing prototype warnings */
> +__diag_push();
> +__diag_ignore_all("-Wmissing-prototypes",
> +		  "Global functions as their definitions will be in vmlinux BTF");
> +
> +/* bpf_sock_destroy: Destroy the given socket with ECONNABORTED error code.
> + *
> + * The helper expects a non-NULL pointer to a socket. It invokes the
> + * protocol specific socket destroy handlers.
> + *
> + * The helper can only be called from BPF contexts that have acquired the socket
> + * locks.
> + *
> + * Parameters:
> + * @sock: Pointer to socket to be destroyed
> + *
> + * Return:
> + * On error, may return EPROTONOSUPPORT, EINVAL.
> + * EPROTONOSUPPORT if protocol specific destroy handler is not implemented.
> + * 0 otherwise
> + */
> +__bpf_kfunc int bpf_sock_destroy(struct sock_common *sock)
> +{
> +	struct sock *sk = (struct sock *)sock;
> +
> +	if (!sk)
> +		return -EINVAL;
> +
> +	/* The locking semantics that allow for synchronous execution of the
> +	 * destroy handlers are only supported for TCP and UDP.
> +	 */
> +	if (!sk->sk_prot->diag_destroy || sk->sk_protocol == IPPROTO_RAW)
> +		return -EOPNOTSUPP;
> +
> +	return sk->sk_prot->diag_destroy(sk, ECONNABORTED);
> +}
> +
> +__diag_pop()
> +
> +BTF_SET8_START(sock_destroy_kfunc_set)
> +BTF_ID_FLAGS(func, bpf_sock_destroy)
> +BTF_SET8_END(sock_destroy_kfunc_set)
> +
> +static const struct btf_kfunc_id_set bpf_sock_destroy_kfunc_set = {
> +	.owner = THIS_MODULE,
> +	.set   = &sock_destroy_kfunc_set,
> +};
> +
> +static int init_subsystem(void)
> +{
> +	return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_sock_destroy_kfunc_set);

It still needs a way to guarantee the running context is safe to use this kfunc. 
  BPF_PROG_TYPE_TRACING is too broad. Trying to brainstorm ways here instead of 
needing to filter by expected_attach_type. I wonder using KF_TRUSTED_ARGS like this:

BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS)

is enough or it needs some care to filter out BPF_TRACE_RAW_TP after looking at 
prog_args_trusted().
or it needs another tag to specify this kfunc requires the arg to be locked also.
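
Applied to the registration block in this patch, that option would look
roughly like the following sketch (just illustrating the brainstormed
flag, not a final recommendation):

BTF_SET8_START(sock_destroy_kfunc_set)
BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS)
BTF_SET8_END(sock_destroy_kfunc_set)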

> +}
> +late_initcall(init_subsystem);
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 33f559f491c8..59a833c0c872 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -4678,8 +4678,10 @@ int tcp_abort(struct sock *sk, int err)
>   		return 0;
>   	}
>   
> -	/* Don't race with userspace socket closes such as tcp_close. */
> -	lock_sock(sk);
> +	/* BPF context ensures sock locking. */
> +	if (!has_current_bpf_ctx())
> +		/* Don't race with userspace socket closes such as tcp_close. */
> +		lock_sock(sk);

This is ok.

>   
>   	if (sk->sk_state == TCP_LISTEN) {
>   		tcp_set_state(sk, TCP_CLOSE);
> @@ -4688,7 +4690,8 @@ int tcp_abort(struct sock *sk, int err)
>   
>   	/* Don't race with BH socket closes such as inet_csk_listen_stop. */
>   	local_bh_disable();
> -	bh_lock_sock(sk);
> +	if (!has_current_bpf_ctx())
> +		bh_lock_sock(sk);

This may or may not work, depending on the earlier lock_sock_fast() done in 
bpf_iter_tcp_seq_show() returned true or false. It is probably a good reason to 
replace the lock_sock_fast() with lock_sock() in bpf_iter_tcp_seq_show().
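
Concretely, that replacement in bpf_iter_tcp_seq_show() would be along the
lines of the sketch below (assuming the existing fast-lock pair there is
guarded by sk_fullsock()):

 	if (sk_fullsock(sk))
-		slow = lock_sock_fast(sk);
+		lock_sock(sk);
 	...
 	if (sk_fullsock(sk))
-		unlock_sock_fast(sk, slow);
+		release_sock(sk);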

>   
>   	if (!sock_flag(sk, SOCK_DEAD)) {
>   		sk->sk_err = err;
> @@ -4700,10 +4703,13 @@ int tcp_abort(struct sock *sk, int err)
>   		tcp_done(sk);
>   	}
>   
> -	bh_unlock_sock(sk);
> +	if (!has_current_bpf_ctx())
> +		bh_unlock_sock(sk);
> +
>   	local_bh_enable();
>   	tcp_write_queue_purge(sk);
> -	release_sock(sk);
> +	if (!has_current_bpf_ctx())
> +		release_sock(sk);
>   	return 0;
>   }
>   EXPORT_SYMBOL_GPL(tcp_abort);
Aditi Ghag March 22, 2023, midnight UTC | #3
> On Mar 21, 2023, at 4:43 PM, Martin KaFai Lau <martin.lau@linux.dev> wrote:
> 
> On 3/21/23 11:45 AM, Aditi Ghag wrote:
>> diff --git a/include/net/udp.h b/include/net/udp.h
>> index de4b528522bb..d2999447d3f2 100644
>> --- a/include/net/udp.h
>> +++ b/include/net/udp.h
>> @@ -437,6 +437,7 @@ struct udp_seq_afinfo {
>>  struct udp_iter_state {
>>  	struct seq_net_private  p;
>>  	int			bucket;
>> +	int			offset;
> 
> All offset change is easier to review with patch 1 together, so please move them to patch 1.

Thanks for the quick review! 

Oh boy... Absolutely! Looks like I misplaced the changes during interactive rebase. Can I fix this up in this patch itself instead of creating a new patch series? That way, I can batch things up in the next revision. 

> 
>>  	struct udp_seq_afinfo	*bpf_seq_afinfo;
>>  };
>>  diff --git a/net/core/filter.c b/net/core/filter.c
>> index 1d6f165923bf..ba3e0dac119c 100644
>> --- a/net/core/filter.c
>> +++ b/net/core/filter.c
>> @@ -11621,3 +11621,57 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id)
>>    	return func;
>>  }
>> +
>> +/* Disables missing prototype warnings */
>> +__diag_push();
>> +__diag_ignore_all("-Wmissing-prototypes",
>> +		  "Global functions as their definitions will be in vmlinux BTF");
>> +
>> +/* bpf_sock_destroy: Destroy the given socket with ECONNABORTED error code.
>> + *
>> + * The helper expects a non-NULL pointer to a socket. It invokes the
>> + * protocol specific socket destroy handlers.
>> + *
>> + * The helper can only be called from BPF contexts that have acquired the socket
>> + * locks.
>> + *
>> + * Parameters:
>> + * @sock: Pointer to socket to be destroyed
>> + *
>> + * Return:
>> + * On error, may return EPROTONOSUPPORT, EINVAL.
>> + * EPROTONOSUPPORT if protocol specific destroy handler is not implemented.
>> + * 0 otherwise
>> + */
>> +__bpf_kfunc int bpf_sock_destroy(struct sock_common *sock)
>> +{
>> +	struct sock *sk = (struct sock *)sock;
>> +
>> +	if (!sk)
>> +		return -EINVAL;
>> +
>> +	/* The locking semantics that allow for synchronous execution of the
>> +	 * destroy handlers are only supported for TCP and UDP.
>> +	 */
>> +	if (!sk->sk_prot->diag_destroy || sk->sk_protocol == IPPROTO_RAW)
>> +		return -EOPNOTSUPP;
>> +
>> +	return sk->sk_prot->diag_destroy(sk, ECONNABORTED);
>> +}
>> +
>> +__diag_pop()
>> +
>> +BTF_SET8_START(sock_destroy_kfunc_set)
>> +BTF_ID_FLAGS(func, bpf_sock_destroy)
>> +BTF_SET8_END(sock_destroy_kfunc_set)
>> +
>> +static const struct btf_kfunc_id_set bpf_sock_destroy_kfunc_set = {
>> +	.owner = THIS_MODULE,
>> +	.set   = &sock_destroy_kfunc_set,
>> +};
>> +
>> +static int init_subsystem(void)
>> +{
>> +	return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_sock_destroy_kfunc_set);
> 
> It still needs a way to guarantee the running context is safe to use this kfunc.  BPF_PROG_TYPE_TRACING is too broad. Trying to brainstorm ways here instead of needing to filter by expected_attach_type. I wonder using KF_TRUSTED_ARGS like this:
> 
> BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS)
> 
> is enough or it needs some care to filter out BPF_TRACE_RAW_TP after looking at prog_args_trusted().
> or it needs another tag to specify this kfunc requires the arg to be locked also.
> 


Agreed. We had some open ended discussion in the earlier patch. 


>> +}
>> +late_initcall(init_subsystem);
>> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
>> index 33f559f491c8..59a833c0c872 100644
>> --- a/net/ipv4/tcp.c
>> +++ b/net/ipv4/tcp.c
>> @@ -4678,8 +4678,10 @@ int tcp_abort(struct sock *sk, int err)
>>  		return 0;
>>  	}
>>  -	/* Don't race with userspace socket closes such as tcp_close. */
>> -	lock_sock(sk);
>> +	/* BPF context ensures sock locking. */
>> +	if (!has_current_bpf_ctx())
>> +		/* Don't race with userspace socket closes such as tcp_close. */
>> +		lock_sock(sk);
> 
> This is ok.
> 
>>    	if (sk->sk_state == TCP_LISTEN) {
>>  		tcp_set_state(sk, TCP_CLOSE);
>> @@ -4688,7 +4690,8 @@ int tcp_abort(struct sock *sk, int err)
>>    	/* Don't race with BH socket closes such as inet_csk_listen_stop. */
>>  	local_bh_disable();
>> -	bh_lock_sock(sk);
>> +	if (!has_current_bpf_ctx())
>> +		bh_lock_sock(sk);
> 
> This may or may not work, depending on the earlier lock_sock_fast() done in bpf_iter_tcp_seq_show() returned true or false. It is probably a good reason to replace the lock_sock_fast() with lock_sock() in bpf_iter_tcp_seq_show().


:) We would have to replace lock_sock_fast with lock_sock anyway, else this causes problems in inet_unhash while destroying TCP listening socket, see - https://lore.kernel.org/bpf/20230321184541.1857363-1-aditi.ghag@isovalent.com/T/#m0bb85df6482e7eae296b00394c482254db544748. 


> 
>>    	if (!sock_flag(sk, SOCK_DEAD)) {
>>  		sk->sk_err = err;
>> @@ -4700,10 +4703,13 @@ int tcp_abort(struct sock *sk, int err)
>>  		tcp_done(sk);
>>  	}
>>  -	bh_unlock_sock(sk);
>> +	if (!has_current_bpf_ctx())
>> +		bh_unlock_sock(sk);
>> +
>>  	local_bh_enable();
>>  	tcp_write_queue_purge(sk);
>> -	release_sock(sk);
>> +	if (!has_current_bpf_ctx())
>> +		release_sock(sk);
>>  	return 0;
>>  }
>>  EXPORT_SYMBOL_GPL(tcp_abort);
>
Martin KaFai Lau March 22, 2023, 12:36 a.m. UTC | #4
On 3/21/23 5:00 PM, Aditi Ghag wrote:
>> On Mar 21, 2023, at 4:43 PM, Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>
>> On 3/21/23 11:45 AM, Aditi Ghag wrote:
>>> diff --git a/include/net/udp.h b/include/net/udp.h
>>> index de4b528522bb..d2999447d3f2 100644
>>> --- a/include/net/udp.h
>>> +++ b/include/net/udp.h
>>> @@ -437,6 +437,7 @@ struct udp_seq_afinfo {
>>>   struct udp_iter_state {
>>>   	struct seq_net_private  p;
>>>   	int			bucket;
>>> +	int			offset;
>> All offset change is easier to review with patch 1 together, so please move them to patch 1.
> Thanks for the quick review!
> 
> Oh boy... Absolutely! Looks like I misplaced the changes during interactive rebase. Can I fix this up in this patch itself instead of creating a new patch series? That way, I can batch things up in the next revision.
> 

Instead of re-posting a single patch, it will be easier to batch up all the 
changes in one set in the next revision after the review comments in v3.
Aditi Ghag March 23, 2023, 8:02 p.m. UTC | #5
> On Mar 21, 2023, at 2:02 PM, Stanislav Fomichev <sdf@google.com> wrote:
> 
> On 03/21, Aditi Ghag wrote:
>> The socket destroy kfunc is used to forcefully terminate sockets from
>> certain BPF contexts. We plan to use the capability in Cilium to force
>> client sockets to reconnect when their remote load-balancing backends are
>> deleted. The other use case is on-the-fly policy enforcement where existing
>> socket connections prevented by policies need to be forcefully terminated.
>> The helper allows terminating sockets that may or may not be actively
>> sending traffic.
> 
>> The helper is currently exposed to certain BPF iterators where users can
>> filter, and terminate selected sockets.  Additionally, the helper can only
>> be called from these BPF contexts that ensure socket locking in order to
>> allow synchronous execution of destroy helpers that also acquire socket
>> locks. The previous commit that batches UDP sockets during iteration
>> facilitated a synchronous invocation of the destroy helper from BPF context
>> by skipping taking socket locks in the destroy handler. TCP iterators
>> already supported batching.
> 
>> The helper takes `sock_common` type argument, even though it expects, and
>> casts them to a `sock` pointer. This enables the verifier to allow the
>> sock_destroy kfunc to be called for TCP with `sock_common` and UDP with
>> `sock` structs. As a comparison, BPF helpers enable this behavior with the
>> `ARG_PTR_TO_BTF_ID_SOCK_COMMON` argument type. However, there is no such
>> option available with the verifier logic that handles kfuncs where BTF
>> types are inferred. Furthermore, as `sock_common` only has a subset of
>> certain fields of `sock`, casting pointer to the latter type might not
>> always be safe. Hence, the BPF kfunc converts the argument to a full sock
>> before casting.
> 
>> Signed-off-by: Aditi Ghag <aditi.ghag@isovalent.com>
>> ---
>>  include/net/udp.h |  1 +
>>  net/core/filter.c | 54 ++++++++++++++++++++++++++++++++++++++++++
>>  net/ipv4/tcp.c    | 16 +++++++++----
>>  net/ipv4/udp.c    | 60 +++++++++++++++++++++++++++++++++++++----------
>>  4 files changed, 114 insertions(+), 17 deletions(-)
> 
>> diff --git a/include/net/udp.h b/include/net/udp.h
>> index de4b528522bb..d2999447d3f2 100644
>> --- a/include/net/udp.h
>> +++ b/include/net/udp.h
>> @@ -437,6 +437,7 @@ struct udp_seq_afinfo {
>>  struct udp_iter_state {
>>  	struct seq_net_private  p;
>>  	int			bucket;
>> +	int			offset;
>>  	struct udp_seq_afinfo	*bpf_seq_afinfo;
>>  };
> 
>> diff --git a/net/core/filter.c b/net/core/filter.c
>> index 1d6f165923bf..ba3e0dac119c 100644
>> --- a/net/core/filter.c
>> +++ b/net/core/filter.c
>> @@ -11621,3 +11621,57 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id)
> 
>>  	return func;
>>  }
>> +
>> +/* Disables missing prototype warnings */
>> +__diag_push();
>> +__diag_ignore_all("-Wmissing-prototypes",
>> +		  "Global functions as their definitions will be in vmlinux BTF");
>> +
>> +/* bpf_sock_destroy: Destroy the given socket with ECONNABORTED error code.
>> + *
>> + * The helper expects a non-NULL pointer to a socket. It invokes the
>> + * protocol specific socket destroy handlers.
>> + *
>> + * The helper can only be called from BPF contexts that have acquired the socket
>> + * locks.
>> + *
>> + * Parameters:
>> + * @sock: Pointer to socket to be destroyed
>> + *
>> + * Return:
>> + * On error, may return EPROTONOSUPPORT, EINVAL.
>> + * EPROTONOSUPPORT if protocol specific destroy handler is not implemented.
>> + * 0 otherwise
>> + */
>> +__bpf_kfunc int bpf_sock_destroy(struct sock_common *sock)
>> +{
>> +	struct sock *sk = (struct sock *)sock;
>> +
>> +	if (!sk)
>> +		return -EINVAL;
>> +
>> +	/* The locking semantics that allow for synchronous execution of the
>> +	 * destroy handlers are only supported for TCP and UDP.
>> +	 */
>> +	if (!sk->sk_prot->diag_destroy || sk->sk_protocol == IPPROTO_RAW)
>> +		return -EOPNOTSUPP;
> 
> What makes IPPROTO_RAW special? Looks like it locks the socket as well?

I haven't looked at the locking semantics for IPPROTO_RAW. These can be handled in a follow-up patch. Wdyt?

> 
>> +
>> +	return sk->sk_prot->diag_destroy(sk, ECONNABORTED);
>> +}
>> +
>> +__diag_pop()
>> +
>> +BTF_SET8_START(sock_destroy_kfunc_set)
>> +BTF_ID_FLAGS(func, bpf_sock_destroy)
>> +BTF_SET8_END(sock_destroy_kfunc_set)
>> +
>> +static const struct btf_kfunc_id_set bpf_sock_destroy_kfunc_set = {
>> +	.owner = THIS_MODULE,
>> +	.set   = &sock_destroy_kfunc_set,
>> +};
>> +
>> +static int init_subsystem(void)
>> +{
>> +	return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_sock_destroy_kfunc_set);
>> +}
>> +late_initcall(init_subsystem);
>> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
>> index 33f559f491c8..59a833c0c872 100644
>> --- a/net/ipv4/tcp.c
>> +++ b/net/ipv4/tcp.c
>> @@ -4678,8 +4678,10 @@ int tcp_abort(struct sock *sk, int err)
>>  		return 0;
>>  	}
> 
>> -	/* Don't race with userspace socket closes such as tcp_close. */
>> -	lock_sock(sk);
>> +	/* BPF context ensures sock locking. */
>> +	if (!has_current_bpf_ctx())
>> +		/* Don't race with userspace socket closes such as tcp_close. */
>> +		lock_sock(sk);
> 
>>  	if (sk->sk_state == TCP_LISTEN) {
>>  		tcp_set_state(sk, TCP_CLOSE);
>> @@ -4688,7 +4690,8 @@ int tcp_abort(struct sock *sk, int err)
> 
>>  	/* Don't race with BH socket closes such as inet_csk_listen_stop. */
>>  	local_bh_disable();
> 
> [..]
> 
>> -	bh_lock_sock(sk);
>> +	if (!has_current_bpf_ctx())
>> +		bh_lock_sock(sk);
> 
> These are spinlocks and should probably be grabbed in the bpf context as
> well?

Fixed in the next revision. Thanks!

> 
> 
>>  	if (!sock_flag(sk, SOCK_DEAD)) {
>>  		sk->sk_err = err;
>> @@ -4700,10 +4703,13 @@ int tcp_abort(struct sock *sk, int err)
>>  		tcp_done(sk);
>>  	}
> 
>> -	bh_unlock_sock(sk);
>> +	if (!has_current_bpf_ctx())
>> +		bh_unlock_sock(sk);
>> +
>>  	local_bh_enable();
>>  	tcp_write_queue_purge(sk);
>> -	release_sock(sk);
>> +	if (!has_current_bpf_ctx())
>> +		release_sock(sk);
>>  	return 0;
>>  }
>>  EXPORT_SYMBOL_GPL(tcp_abort);
>> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
>> index 545e56329355..02d357713838 100644
>> --- a/net/ipv4/udp.c
>> +++ b/net/ipv4/udp.c
>> @@ -2925,7 +2925,9 @@ EXPORT_SYMBOL(udp_poll);
> 
>>  int udp_abort(struct sock *sk, int err)
>>  {
>> -	lock_sock(sk);
>> +	/* BPF context ensures sock locking. */
>> +	if (!has_current_bpf_ctx())
>> +		lock_sock(sk);
> 
>>  	/* udp{v6}_destroy_sock() sets it under the sk lock, avoid racing
>>  	 * with close()
>> @@ -2938,7 +2940,8 @@ int udp_abort(struct sock *sk, int err)
>>  	__udp_disconnect(sk, 0);
> 
>>  out:
>> -	release_sock(sk);
>> +	if (!has_current_bpf_ctx())
>> +		release_sock(sk);
> 
>>  	return 0;
>>  }
>> @@ -3184,15 +3187,23 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
>>  	struct sock *first_sk = NULL;
>>  	struct sock *sk;
>>  	unsigned int bucket_sks = 0;
>> -	bool first;
>>  	bool resized = false;
>> +	int offset = 0;
>> +	int new_offset;
> 
>>  	/* The current batch is done, so advance the bucket. */
>> -	if (iter->st_bucket_done)
>> +	if (iter->st_bucket_done) {
>>  		state->bucket++;
>> +		state->offset = 0;
>> +	}
> 
>>  	udptable = udp_get_table_afinfo(afinfo, net);
> 
>> +	if (state->bucket > udptable->mask) {
>> +		state->bucket = 0;
>> +		state->offset = 0;
>> +	}
>> +
>>  again:
>>  	/* New batch for the next bucket.
>>  	 * Iterate over the hash table to find a bucket with sockets matching
>> @@ -3204,43 +3215,60 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
>>  	iter->cur_sk = 0;
>>  	iter->end_sk = 0;
>>  	iter->st_bucket_done = false;
>> -	first = true;
>> +	first_sk = NULL;
>> +	bucket_sks = 0;
>> +	offset = state->offset;
>> +	new_offset = offset;
> 
>>  	for (; state->bucket <= udptable->mask; state->bucket++) {
>>  		struct udp_hslot *hslot = &udptable->hash[state->bucket];
> 
>> -		if (hlist_empty(&hslot->head))
>> +		if (hlist_empty(&hslot->head)) {
>> +			offset = 0;
>>  			continue;
>> +		}
> 
>>  		spin_lock_bh(&hslot->lock);
>> +		/* Resume from the last saved position in a bucket before
>> +		 * iterator was stopped.
>> +		 */
>> +		while (offset-- > 0) {
>> +			sk_for_each(sk, &hslot->head)
>> +				continue;
>> +		}
>>  		sk_for_each(sk, &hslot->head) {
>>  			if (seq_sk_match(seq, sk)) {
>> -				if (first) {
>> +				if (!first_sk)
>>  					first_sk = sk;
>> -					first = false;
>> -				}
>>  				if (iter->end_sk < iter->max_sk) {
>>  					sock_hold(sk);
>>  					iter->batch[iter->end_sk++] = sk;
>>  				}
>>  				bucket_sks++;
>>  			}
>> +			new_offset++;
>>  		}
>>  		spin_unlock_bh(&hslot->lock);
>> +
>>  		if (first_sk)
>>  			break;
>> +
>> +		/* Reset the current bucket's offset before moving to the next bucket. */
>> +		offset = 0;
>> +		new_offset = 0;
>> +
>>  	}
> 
>>  	/* All done: no batch made. */
>>  	if (!first_sk)
>> -		return NULL;
>> +		goto ret;
> 
>>  	if (iter->end_sk == bucket_sks) {
>>  		/* Batching is done for the current bucket; return the first
>>  		 * socket to be iterated from the batch.
>>  		 */
>>  		iter->st_bucket_done = true;
>> -		return first_sk;
>> +		goto ret;
>>  	}
>>  	if (!resized && !bpf_iter_udp_realloc_batch(iter, bucket_sks * 3 / 2)) {
>>  		resized = true;
>> @@ -3248,19 +3276,24 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
>>  		state->bucket--;
>>  		goto again;
>>  	}
>> +ret:
>> +	state->offset = new_offset;
>>  	return first_sk;
>>  }
> 
>>  static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
>>  {
>>  	struct bpf_udp_iter_state *iter = seq->private;
>> +	struct udp_iter_state *state = &iter->state;
>>  	struct sock *sk;
> 
>>  	/* Whenever seq_next() is called, the iter->cur_sk is
>>  	 * done with seq_show(), so unref the iter->cur_sk.
>>  	 */
>> -	if (iter->cur_sk < iter->end_sk)
>> +	if (iter->cur_sk < iter->end_sk) {
>>  		sock_put(iter->batch[iter->cur_sk++]);
>> +		++state->offset;
>> +	}
> 
>>  	/* After updating iter->cur_sk, check if there are more sockets
>>  	 * available in the current bucket batch.
>> @@ -3630,6 +3663,9 @@ static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
>>  	}
>>  	iter->cur_sk = 0;
>>  	iter->end_sk = 0;
>> +	iter->st_bucket_done = false;
>> +	st->bucket = 0;
>> +	st->offset = 0;
> 
>>  	return ret;
>>  }
>> --
>> 2.34.1
Stanislav Fomichev March 23, 2023, 9:45 p.m. UTC | #6
On Thu, Mar 23, 2023 at 1:02 PM Aditi Ghag <aditi.ghag@isovalent.com> wrote:
>
>
>
> > On Mar 21, 2023, at 2:02 PM, Stanislav Fomichev <sdf@google.com> wrote:
> >
> > On 03/21, Aditi Ghag wrote:
> >> The socket destroy kfunc is used to forcefully terminate sockets from
> >> certain BPF contexts. We plan to use the capability in Cilium to force
> >> client sockets to reconnect when their remote load-balancing backends are
> >> deleted. The other use case is on-the-fly policy enforcement where existing
> >> socket connections prevented by policies need to be forcefully terminated.
> >> The helper allows terminating sockets that may or may not be actively
> >> sending traffic.
> >
> >> The helper is currently exposed to certain BPF iterators where users can
> >> filter, and terminate selected sockets.  Additionally, the helper can only
> >> be called from these BPF contexts that ensure socket locking in order to
> >> allow synchronous execution of destroy helpers that also acquire socket
> >> locks. The previous commit that batches UDP sockets during iteration
> >> facilitated a synchronous invocation of the destroy helper from BPF context
> >> by skipping taking socket locks in the destroy handler. TCP iterators
> >> already supported batching.
> >
> >> The helper takes `sock_common` type argument, even though it expects, and
> >> casts them to a `sock` pointer. This enables the verifier to allow the
> >> sock_destroy kfunc to be called for TCP with `sock_common` and UDP with
> >> `sock` structs. As a comparison, BPF helpers enable this behavior with the
> >> `ARG_PTR_TO_BTF_ID_SOCK_COMMON` argument type. However, there is no such
> >> option available with the verifier logic that handles kfuncs where BTF
> >> types are inferred. Furthermore, as `sock_common` only has a subset of
> >> certain fields of `sock`, casting pointer to the latter type might not
> >> always be safe. Hence, the BPF kfunc converts the argument to a full sock
> >> before casting.
> >
> >> Signed-off-by: Aditi Ghag <aditi.ghag@isovalent.com>
> >> ---
> >>  include/net/udp.h |  1 +
> >>  net/core/filter.c | 54 ++++++++++++++++++++++++++++++++++++++++++
> >>  net/ipv4/tcp.c    | 16 +++++++++----
> >>  net/ipv4/udp.c    | 60 +++++++++++++++++++++++++++++++++++++----------
> >>  4 files changed, 114 insertions(+), 17 deletions(-)
> >
> >> diff --git a/include/net/udp.h b/include/net/udp.h
> >> index de4b528522bb..d2999447d3f2 100644
> >> --- a/include/net/udp.h
> >> +++ b/include/net/udp.h
> >> @@ -437,6 +437,7 @@ struct udp_seq_afinfo {
> >>  struct udp_iter_state {
> >>      struct seq_net_private  p;
> >>      int                     bucket;
> >> +    int                     offset;
> >>      struct udp_seq_afinfo   *bpf_seq_afinfo;
> >>  };
> >
> >> diff --git a/net/core/filter.c b/net/core/filter.c
> >> index 1d6f165923bf..ba3e0dac119c 100644
> >> --- a/net/core/filter.c
> >> +++ b/net/core/filter.c
> >> @@ -11621,3 +11621,57 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id)
> >
> >>      return func;
> >>  }
> >> +
> >> +/* Disables missing prototype warnings */
> >> +__diag_push();
> >> +__diag_ignore_all("-Wmissing-prototypes",
> >> +              "Global functions as their definitions will be in vmlinux BTF");
> >> +
> >> +/* bpf_sock_destroy: Destroy the given socket with ECONNABORTED error code.
> >> + *
> >> + * The helper expects a non-NULL pointer to a socket. It invokes the
> >> + * protocol specific socket destroy handlers.
> >> + *
> >> + * The helper can only be called from BPF contexts that have acquired the socket
> >> + * locks.
> >> + *
> >> + * Parameters:
> >> + * @sock: Pointer to socket to be destroyed
> >> + *
> >> + * Return:
> >> + * On error, may return EPROTONOSUPPORT, EINVAL.
> >> + * EPROTONOSUPPORT if protocol specific destroy handler is not implemented.
> >> + * 0 otherwise
> >> + */
> >> +__bpf_kfunc int bpf_sock_destroy(struct sock_common *sock)
> >> +{
> >> +    struct sock *sk = (struct sock *)sock;
> >> +
> >> +    if (!sk)
> >> +            return -EINVAL;
> >> +
> >> +    /* The locking semantics that allow for synchronous execution of the
> >> +     * destroy handlers are only supported for TCP and UDP.
> >> +     */
> >> +    if (!sk->sk_prot->diag_destroy || sk->sk_protocol == IPPROTO_RAW)
> >> +            return -EOPNOTSUPP;
> >
> > What makes IPPROTO_RAW special? Looks like it locks the socket as well?
>
> I haven't looked at the locking semantics for IPPROTO_RAW. These can be handled in a follow-up patch. Wdyt?

Fine with me. Maybe make it more opt-in? (vs current "opt ipproto_raw out")

if (sk->sk_prot->diag_destroy != udp_abort &&
    sk->sk_prot->diag_destroy != tcp_abort)
        return -EOPNOTSUPP;

Is it more robust? Or does it look uglier? )
But maybe fine as is, I'm just thinking out loud..

> >
> >> +
> >> +    return sk->sk_prot->diag_destroy(sk, ECONNABORTED);
> >> +}
> >> +
> >> +__diag_pop()
> >> +
> >> +BTF_SET8_START(sock_destroy_kfunc_set)
> >> +BTF_ID_FLAGS(func, bpf_sock_destroy)
> >> +BTF_SET8_END(sock_destroy_kfunc_set)
> >> +
> >> +static const struct btf_kfunc_id_set bpf_sock_destroy_kfunc_set = {
> >> +    .owner = THIS_MODULE,
> >> +    .set   = &sock_destroy_kfunc_set,
> >> +};
> >> +
> >> +static int init_subsystem(void)
> >> +{
> >> +    return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_sock_destroy_kfunc_set);
> >> +}
> >> +late_initcall(init_subsystem);
> >> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> >> index 33f559f491c8..59a833c0c872 100644
> >> --- a/net/ipv4/tcp.c
> >> +++ b/net/ipv4/tcp.c
> >> @@ -4678,8 +4678,10 @@ int tcp_abort(struct sock *sk, int err)
> >>              return 0;
> >>      }
> >
> >> -    /* Don't race with userspace socket closes such as tcp_close. */
> >> -    lock_sock(sk);
> >> +    /* BPF context ensures sock locking. */
> >> +    if (!has_current_bpf_ctx())
> >> +            /* Don't race with userspace socket closes such as tcp_close. */
> >> +            lock_sock(sk);
> >
> >>      if (sk->sk_state == TCP_LISTEN) {
> >>              tcp_set_state(sk, TCP_CLOSE);
> >> @@ -4688,7 +4690,8 @@ int tcp_abort(struct sock *sk, int err)
> >
> >>      /* Don't race with BH socket closes such as inet_csk_listen_stop. */
> >>      local_bh_disable();
> >
> > [..]
> >
> >> -    bh_lock_sock(sk);
> >> +    if (!has_current_bpf_ctx())
> >> +            bh_lock_sock(sk);
> >
> > These are spinlocks and should probably be grabbed in the bpf context as
> > well?
>
> Fixed in the next revision. Thanks!
>
> >
> >
> >>      if (!sock_flag(sk, SOCK_DEAD)) {
> >>              sk->sk_err = err;
> >> @@ -4700,10 +4703,13 @@ int tcp_abort(struct sock *sk, int err)
> >>              tcp_done(sk);
> >>      }
> >
> >> -    bh_unlock_sock(sk);
> >> +    if (!has_current_bpf_ctx())
> >> +            bh_unlock_sock(sk);
> >> +
> >>      local_bh_enable();
> >>      tcp_write_queue_purge(sk);
> >> -    release_sock(sk);
> >> +    if (!has_current_bpf_ctx())
> >> +            release_sock(sk);
> >>      return 0;
> >>  }
> >>  EXPORT_SYMBOL_GPL(tcp_abort);
> >> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> >> index 545e56329355..02d357713838 100644
> >> --- a/net/ipv4/udp.c
> >> +++ b/net/ipv4/udp.c
> >> @@ -2925,7 +2925,9 @@ EXPORT_SYMBOL(udp_poll);
> >
> >>  int udp_abort(struct sock *sk, int err)
> >>  {
> >> -    lock_sock(sk);
> >> +    /* BPF context ensures sock locking. */
> >> +    if (!has_current_bpf_ctx())
> >> +            lock_sock(sk);
> >
> >>      /* udp{v6}_destroy_sock() sets it under the sk lock, avoid racing
> >>       * with close()
> >> @@ -2938,7 +2940,8 @@ int udp_abort(struct sock *sk, int err)
> >>      __udp_disconnect(sk, 0);
> >
> >>  out:
> >> -    release_sock(sk);
> >> +    if (!has_current_bpf_ctx())
> >> +            release_sock(sk);
> >
> >>      return 0;
> >>  }
> >> @@ -3184,15 +3187,23 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
> >>      struct sock *first_sk = NULL;
> >>      struct sock *sk;
> >>      unsigned int bucket_sks = 0;
> >> -    bool first;
> >>      bool resized = false;
> >> +    int offset = 0;
> >> +    int new_offset;
> >
> >>      /* The current batch is done, so advance the bucket. */
> >> -    if (iter->st_bucket_done)
> >> +    if (iter->st_bucket_done) {
> >>              state->bucket++;
> >> +            state->offset = 0;
> >> +    }
> >
> >>      udptable = udp_get_table_afinfo(afinfo, net);
> >
> >> +    if (state->bucket > udptable->mask) {
> >> +            state->bucket = 0;
> >> +            state->offset = 0;
> >> +    }
> >> +
> >>  again:
> >>      /* New batch for the next bucket.
> >>       * Iterate over the hash table to find a bucket with sockets matching
> >> @@ -3204,43 +3215,60 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
> >>      iter->cur_sk = 0;
> >>      iter->end_sk = 0;
> >>      iter->st_bucket_done = false;
> >> -    first = true;
> >> +    first_sk = NULL;
> >> +    bucket_sks = 0;
> >> +    offset = state->offset;
> >> +    new_offset = offset;
> >
> >>      for (; state->bucket <= udptable->mask; state->bucket++) {
> >>              struct udp_hslot *hslot = &udptable->hash[state->bucket];
> >
> >> -            if (hlist_empty(&hslot->head))
> >> +            if (hlist_empty(&hslot->head)) {
> >> +                    offset = 0;
> >>                      continue;
> >> +            }
> >
> >>              spin_lock_bh(&hslot->lock);
> >> +            /* Resume from the last saved position in a bucket before
> >> +             * iterator was stopped.
> >> +             */
> >> +            while (offset-- > 0) {
> >> +                    sk_for_each(sk, &hslot->head)
> >> +                            continue;
> >> +            }
> >>              sk_for_each(sk, &hslot->head) {
> >>                      if (seq_sk_match(seq, sk)) {
> >> -                            if (first) {
> >> +                            if (!first_sk)
> >>                                      first_sk = sk;
> >> -                                    first = false;
> >> -                            }
> >>                              if (iter->end_sk < iter->max_sk) {
> >>                                      sock_hold(sk);
> >>                                      iter->batch[iter->end_sk++] = sk;
> >>                              }
> >>                              bucket_sks++;
> >>                      }
> >> +                    new_offset++;
> >>              }
> >>              spin_unlock_bh(&hslot->lock);
> >> +
> >>              if (first_sk)
> >>                      break;
> >> +
> >> +            /* Reset the current bucket's offset before moving to the next bucket. */
> >> +            offset = 0;
> >> +            new_offset = 0;
> >> +
> >>      }
> >
> >>      /* All done: no batch made. */
> >>      if (!first_sk)
> >> -            return NULL;
> >> +            goto ret;
> >
> >>      if (iter->end_sk == bucket_sks) {
> >>              /* Batching is done for the current bucket; return the first
> >>               * socket to be iterated from the batch.
> >>               */
> >>              iter->st_bucket_done = true;
> >> -            return first_sk;
> >> +            goto ret;
> >>      }
> >>      if (!resized && !bpf_iter_udp_realloc_batch(iter, bucket_sks * 3 / 2)) {
> >>              resized = true;
> >> @@ -3248,19 +3276,24 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
> >>              state->bucket--;
> >>              goto again;
> >>      }
> >> +ret:
> >> +    state->offset = new_offset;
> >>      return first_sk;
> >>  }
> >
> >>  static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> >>  {
> >>      struct bpf_udp_iter_state *iter = seq->private;
> >> +    struct udp_iter_state *state = &iter->state;
> >>      struct sock *sk;
> >
> >>      /* Whenever seq_next() is called, the iter->cur_sk is
> >>       * done with seq_show(), so unref the iter->cur_sk.
> >>       */
> >> -    if (iter->cur_sk < iter->end_sk)
> >> +    if (iter->cur_sk < iter->end_sk) {
> >>              sock_put(iter->batch[iter->cur_sk++]);
> >> +            ++state->offset;
> >> +    }
> >
> >>      /* After updating iter->cur_sk, check if there are more sockets
> >>       * available in the current bucket batch.
> >> @@ -3630,6 +3663,9 @@ static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
> >>      }
> >>      iter->cur_sk = 0;
> >>      iter->end_sk = 0;
> >> +    iter->st_bucket_done = false;
> >> +    st->bucket = 0;
> >> +    st->offset = 0;
> >
> >>      return ret;
> >>  }
> >> --
> >> 2.34.1
>

Patch

diff --git a/include/net/udp.h b/include/net/udp.h
index de4b528522bb..d2999447d3f2 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -437,6 +437,7 @@  struct udp_seq_afinfo {
 struct udp_iter_state {
 	struct seq_net_private  p;
 	int			bucket;
+	int			offset;
 	struct udp_seq_afinfo	*bpf_seq_afinfo;
 };
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 1d6f165923bf..ba3e0dac119c 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -11621,3 +11621,57 @@  bpf_sk_base_func_proto(enum bpf_func_id func_id)
 
 	return func;
 }
+
+/* Disables missing prototype warnings */
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+		  "Global functions as their definitions will be in vmlinux BTF");
+
+/* bpf_sock_destroy: Destroy the given socket with ECONNABORTED error code.
+ *
+ * The helper expects a non-NULL pointer to a socket. It invokes the
+ * protocol specific socket destroy handlers.
+ *
+ * The helper can only be called from BPF contexts that have acquired the socket
+ * locks.
+ *
+ * Parameters:
+ * @sock: Pointer to socket to be destroyed
+ *
+ * Return:
+ * On error, may return EPROTONOSUPPORT, EINVAL.
+ * EPROTONOSUPPORT if protocol specific destroy handler is not implemented.
+ * 0 otherwise
+ */
+__bpf_kfunc int bpf_sock_destroy(struct sock_common *sock)
+{
+	struct sock *sk = (struct sock *)sock;
+
+	if (!sk)
+		return -EINVAL;
+
+	/* The locking semantics that allow for synchronous execution of the
+	 * destroy handlers are only supported for TCP and UDP.
+	 */
+	if (!sk->sk_prot->diag_destroy || sk->sk_protocol == IPPROTO_RAW)
+		return -EOPNOTSUPP;
+
+	return sk->sk_prot->diag_destroy(sk, ECONNABORTED);
+}
+
+__diag_pop()
+
+BTF_SET8_START(sock_destroy_kfunc_set)
+BTF_ID_FLAGS(func, bpf_sock_destroy)
+BTF_SET8_END(sock_destroy_kfunc_set)
+
+static const struct btf_kfunc_id_set bpf_sock_destroy_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set   = &sock_destroy_kfunc_set,
+};
+
+static int init_subsystem(void)
+{
+	return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_sock_destroy_kfunc_set);
+}
+late_initcall(init_subsystem);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 33f559f491c8..59a833c0c872 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4678,8 +4678,10 @@  int tcp_abort(struct sock *sk, int err)
 		return 0;
 	}
 
-	/* Don't race with userspace socket closes such as tcp_close. */
-	lock_sock(sk);
+	/* BPF context ensures sock locking. */
+	if (!has_current_bpf_ctx())
+		/* Don't race with userspace socket closes such as tcp_close. */
+		lock_sock(sk);
 
 	if (sk->sk_state == TCP_LISTEN) {
 		tcp_set_state(sk, TCP_CLOSE);
@@ -4688,7 +4690,8 @@  int tcp_abort(struct sock *sk, int err)
 
 	/* Don't race with BH socket closes such as inet_csk_listen_stop. */
 	local_bh_disable();
-	bh_lock_sock(sk);
+	if (!has_current_bpf_ctx())
+		bh_lock_sock(sk);
 
 	if (!sock_flag(sk, SOCK_DEAD)) {
 		sk->sk_err = err;
@@ -4700,10 +4703,13 @@  int tcp_abort(struct sock *sk, int err)
 		tcp_done(sk);
 	}
 
-	bh_unlock_sock(sk);
+	if (!has_current_bpf_ctx())
+		bh_unlock_sock(sk);
+
 	local_bh_enable();
 	tcp_write_queue_purge(sk);
-	release_sock(sk);
+	if (!has_current_bpf_ctx())
+		release_sock(sk);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(tcp_abort);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 545e56329355..02d357713838 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2925,7 +2925,9 @@  EXPORT_SYMBOL(udp_poll);
 
 int udp_abort(struct sock *sk, int err)
 {
-	lock_sock(sk);
+	/* BPF context ensures sock locking. */
+	if (!has_current_bpf_ctx())
+		lock_sock(sk);
 
 	/* udp{v6}_destroy_sock() sets it under the sk lock, avoid racing
 	 * with close()
@@ -2938,7 +2940,8 @@  int udp_abort(struct sock *sk, int err)
 	__udp_disconnect(sk, 0);
 
 out:
-	release_sock(sk);
+	if (!has_current_bpf_ctx())
+		release_sock(sk);
 
 	return 0;
 }
@@ -3184,15 +3187,23 @@  static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
 	struct sock *first_sk = NULL;
 	struct sock *sk;
 	unsigned int bucket_sks = 0;
-	bool first;
 	bool resized = false;
+	int offset = 0;
+	int new_offset;
 
 	/* The current batch is done, so advance the bucket. */
-	if (iter->st_bucket_done)
+	if (iter->st_bucket_done) {
 		state->bucket++;
+		state->offset = 0;
+	}
 
 	udptable = udp_get_table_afinfo(afinfo, net);
 
+	if (state->bucket > udptable->mask) {
+		state->bucket = 0;
+		state->offset = 0;
+	}
+
 again:
 	/* New batch for the next bucket.
 	 * Iterate over the hash table to find a bucket with sockets matching
@@ -3204,43 +3215,60 @@  static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
 	iter->cur_sk = 0;
 	iter->end_sk = 0;
 	iter->st_bucket_done = false;
-	first = true;
+	first_sk = NULL;
+	bucket_sks = 0;
+	offset = state->offset;
+	new_offset = offset;
 
 	for (; state->bucket <= udptable->mask; state->bucket++) {
 		struct udp_hslot *hslot = &udptable->hash[state->bucket];
 
-		if (hlist_empty(&hslot->head))
+		if (hlist_empty(&hslot->head)) {
+			offset = 0;
 			continue;
+		}
 
 		spin_lock_bh(&hslot->lock);
+		/* Resume from the last saved position in a bucket before
+		 * iterator was stopped.
+		 */
+		while (offset-- > 0) {
+			sk_for_each(sk, &hslot->head)
+				continue;
+		}
 		sk_for_each(sk, &hslot->head) {
 			if (seq_sk_match(seq, sk)) {
-				if (first) {
+				if (!first_sk)
 					first_sk = sk;
-					first = false;
-				}
 				if (iter->end_sk < iter->max_sk) {
 					sock_hold(sk);
 					iter->batch[iter->end_sk++] = sk;
 				}
 				bucket_sks++;
 			}
+			new_offset++;
 		}
 		spin_unlock_bh(&hslot->lock);
+
 		if (first_sk)
 			break;
+
+		/* Reset the current bucket's offset before moving to the next bucket. */
+		offset = 0;
+		new_offset = 0;
+
 	}
 
 	/* All done: no batch made. */
 	if (!first_sk)
-		return NULL;
+		goto ret;
 
 	if (iter->end_sk == bucket_sks) {
 		/* Batching is done for the current bucket; return the first
 		 * socket to be iterated from the batch.
 		 */
 		iter->st_bucket_done = true;
-		return first_sk;
+		goto ret;
 	}
 	if (!resized && !bpf_iter_udp_realloc_batch(iter, bucket_sks * 3 / 2)) {
 		resized = true;
@@ -3248,19 +3276,24 @@  static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
 		state->bucket--;
 		goto again;
 	}
+ret:
+	state->offset = new_offset;
 	return first_sk;
 }
 
 static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct bpf_udp_iter_state *iter = seq->private;
+	struct udp_iter_state *state = &iter->state;
 	struct sock *sk;
 
 	/* Whenever seq_next() is called, the iter->cur_sk is
 	 * done with seq_show(), so unref the iter->cur_sk.
 	 */
-	if (iter->cur_sk < iter->end_sk)
+	if (iter->cur_sk < iter->end_sk) {
 		sock_put(iter->batch[iter->cur_sk++]);
+		++state->offset;
+	}
 
 	/* After updating iter->cur_sk, check if there are more sockets
 	 * available in the current bucket batch.
@@ -3630,6 +3663,9 @@  static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
 	}
 	iter->cur_sk = 0;
 	iter->end_sk = 0;
+	iter->st_bucket_done = false;
+	st->bucket = 0;
+	st->offset = 0;
 
 	return ret;
 }