diff mbox series

[bpf-next,v4,3/3] bpf: remove extra lock_sock for TCP_ZEROCOPY_RECEIVE

Message ID 20210107184305.444635-4-sdf@google.com (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series bpf: misc performance improvements for cgroup hooks | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for bpf-next
netdev/subject_prefix success Link
netdev/cc_maintainers warning 12 maintainers not CCed: kpsingh@kernel.org yhs@fb.com linux-kselftest@vger.kernel.org andrii@kernel.org kuba@kernel.org shuah@kernel.org john.fastabend@gmail.com iii@linux.ibm.com yoshfuji@linux-ipv6.org toke@redhat.com davem@davemloft.net kuznet@ms2.inr.ac.ru
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 12609 this patch: 12609
netdev/kdoc success Errors and warnings before: 3 this patch: 3
netdev/verify_fixes success Link
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 343 lines checked
netdev/build_allmodconfig_warn success Errors and warnings before: 12948 this patch: 12948
netdev/header_inline success Link
netdev/stable success Stable not CCed

Commit Message

Stanislav Fomichev Jan. 7, 2021, 6:43 p.m. UTC
Add custom implementation of getsockopt hook for TCP_ZEROCOPY_RECEIVE.
We skip generic hooks for TCP_ZEROCOPY_RECEIVE and have a custom
call in do_tcp_getsockopt using the on-stack data. This removes
2% overhead for locking/unlocking the socket.

Also:
- Removed BUILD_BUG_ON (zerocopy doesn't depend on the buf size anymore)
- Separated on-stack buffer into bpf_sockopt_buf and downsized to 32 bytes
  (let's keep it to help with the other options)

(I can probably split this patch into two: add new features and rework
 bpf_sockopt_buf; can follow up if the approach in general sounds
 good).

Without this patch:
     1.87%     0.06%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_getsockopt

With the patch applied:
     0.52%     0.12%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_getsockopt_kern

Signed-off-by: Stanislav Fomichev <sdf@google.com>
Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Song Liu <songliubraving@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
---
 include/linux/bpf-cgroup.h                    | 25 ++++-
 include/linux/filter.h                        |  6 +-
 include/net/sock.h                            |  2 +
 include/net/tcp.h                             |  1 +
 kernel/bpf/cgroup.c                           | 93 +++++++++++++------
 net/ipv4/tcp.c                                | 14 +++
 net/ipv4/tcp_ipv4.c                           |  1 +
 net/ipv6/tcp_ipv6.c                           |  1 +
 .../selftests/bpf/prog_tests/sockopt_sk.c     | 22 +++++
 .../testing/selftests/bpf/progs/sockopt_sk.c  | 15 +++
 10 files changed, 147 insertions(+), 33 deletions(-)

Comments

Martin KaFai Lau Jan. 8, 2021, 1:08 a.m. UTC | #1
On Thu, Jan 07, 2021 at 10:43:05AM -0800, Stanislav Fomichev wrote:
> Add custom implementation of getsockopt hook for TCP_ZEROCOPY_RECEIVE.
> We skip generic hooks for TCP_ZEROCOPY_RECEIVE and have a custom
> call in do_tcp_getsockopt using the on-stack data. This removes
> 2% overhead for locking/unlocking the socket.
> 
> Also:
> - Removed BUILD_BUG_ON (zerocopy doesn't depend on the buf size anymore)
> - Separated on-stack buffer into bpf_sockopt_buf and downsized to 32 bytes
>   (let's keep it to help with the other options)
> 
> (I can probably split this patch into two: add new features and rework
>  bpf_sockopt_buf; can follow up if the approach in general sounds
>  good).
> 
> Without this patch:
>      1.87%     0.06%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_getsockopt
> 
> With the patch applied:
>      0.52%     0.12%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_getsockopt_kern
> 
> Signed-off-by: Stanislav Fomichev <sdf@google.com>
> Cc: Martin KaFai Lau <kafai@fb.com>
> Cc: Song Liu <songliubraving@fb.com>
> Cc: Eric Dumazet <edumazet@google.com>
> ---
>  include/linux/bpf-cgroup.h                    | 25 ++++-
>  include/linux/filter.h                        |  6 +-
>  include/net/sock.h                            |  2 +
>  include/net/tcp.h                             |  1 +
>  kernel/bpf/cgroup.c                           | 93 +++++++++++++------
>  net/ipv4/tcp.c                                | 14 +++
>  net/ipv4/tcp_ipv4.c                           |  1 +
>  net/ipv6/tcp_ipv6.c                           |  1 +
>  .../selftests/bpf/prog_tests/sockopt_sk.c     | 22 +++++
>  .../testing/selftests/bpf/progs/sockopt_sk.c  | 15 +++
>  10 files changed, 147 insertions(+), 33 deletions(-)
>

[ ... ]

> @@ -454,6 +469,8 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
>  #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; })
>  #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \
>  				       optlen, max_optlen, retval) ({ retval; })
> +#define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \
> +					    optlen, retval) ({ retval; })
>  #define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \
>  				       kernel_optval) ({ 0; })
>  
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 54a4225f36d8..8739f1d4cac4 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -1281,7 +1281,10 @@ struct bpf_sysctl_kern {
>  	u64 tmp_reg;
>  };
>  
> -#define BPF_SOCKOPT_KERN_BUF_SIZE	64
> +#define BPF_SOCKOPT_KERN_BUF_SIZE	32
It is reduced from patch 1 because there is no
need to use the buf (and copy from/to buf) in TCP_ZEROCOPY_RECEIVE?

Patch 1 is still desired (and kept in this set) because it may still
benefit other optnames?

> +struct bpf_sockopt_buf {
> +	u8		data[BPF_SOCKOPT_KERN_BUF_SIZE];
> +};
>  
>  struct bpf_sockopt_kern {
>  	struct sock	*sk;
> @@ -1291,7 +1294,6 @@ struct bpf_sockopt_kern {
>  	s32		optname;
>  	s32		optlen;
>  	s32		retval;
> -	u8		buf[BPF_SOCKOPT_KERN_BUF_SIZE];
It is better to pick one way to do things to avoid code
churn like this within the same series.

>  };
>  
>  int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len);
> diff --git a/include/net/sock.h b/include/net/sock.h
> index bdc4323ce53c..ebf44d724845 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -1174,6 +1174,8 @@ struct proto {
>  
>  	int			(*backlog_rcv) (struct sock *sk,
>  						struct sk_buff *skb);
> +	bool			(*bpf_bypass_getsockopt)(int level,
> +							 int optname);
>  
>  	void		(*release_cb)(struct sock *sk);
>  
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 78d13c88720f..4bb42fb19711 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -403,6 +403,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock,
>  		      struct poll_table_struct *wait);
>  int tcp_getsockopt(struct sock *sk, int level, int optname,
>  		   char __user *optval, int __user *optlen);
> +bool tcp_bpf_bypass_getsockopt(int level, int optname);
>  int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
>  		   unsigned int optlen);
>  void tcp_set_keepalive(struct sock *sk, int val);
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index adbecdcaa370..e82df63aedc7 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -16,7 +16,6 @@
>  #include <linux/bpf-cgroup.h>
>  #include <net/sock.h>
>  #include <net/bpf_sk_storage.h>
> -#include <uapi/linux/tcp.h> /* sizeof(struct tcp_zerocopy_receive) */
Can the patches be re-ordered a little to avoid code churn like this
in the same series?

It feels like this patch 3 should be the first patch instead.
The current patch 1 should be the second patch
but it can still use the tcp_mmap to show potential
benefit for other optnames.

>  
>  #include "../cgroup/cgroup-internal.h"
>  
> @@ -1299,7 +1298,8 @@ static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
>  	return empty;
>  }
>  
> -static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
> +static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
> +			     struct bpf_sockopt_buf *buf)
>  {
>  	if (unlikely(max_optlen < 0))
>  		return -EINVAL;
> @@ -1311,18 +1311,11 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
>  		max_optlen = PAGE_SIZE;
>  	}
>  
> -	if (max_optlen <= sizeof(ctx->buf)) {
> +	if (max_optlen <= sizeof(buf->data)) {
>  		/* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE
>  		 * bytes avoid the cost of kzalloc.
> -		 *
> -		 * In order to remove extra allocations from the TCP
> -		 * fast zero-copy path ensure that buffer covers
> -		 * the size of struct tcp_zerocopy_receive.
>  		 */
> -		BUILD_BUG_ON(sizeof(struct tcp_zerocopy_receive) >
> -			     BPF_SOCKOPT_KERN_BUF_SIZE);
> -
> -		ctx->optval = ctx->buf;
> +		ctx->optval = buf->data;
>  		ctx->optval_end = ctx->optval + max_optlen;
>  		return max_optlen;
>  	}
> @@ -1336,16 +1329,18 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
>  	return max_optlen;
>  }
>  
> -static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
> +static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
> +			     struct bpf_sockopt_buf *buf)
>  {
> -	if (ctx->optval == ctx->buf)
> +	if (ctx->optval == buf->data)
>  		return;
>  	kfree(ctx->optval);
>  }
>  
> -static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx)
> +static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
> +				  struct bpf_sockopt_buf *buf)
>  {
> -	return ctx->optval != ctx->buf;
> +	return ctx->optval != buf->data;
>  }
>  
>  int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
> @@ -1353,6 +1348,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
>  				       int *optlen, char **kernel_optval)
>  {
>  	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
> +	struct bpf_sockopt_buf buf = {};
>  	struct bpf_sockopt_kern ctx = {
>  		.sk = sk,
>  		.level = *level,
> @@ -1373,7 +1369,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
>  	 */
>  	max_optlen = max_t(int, 16, *optlen);
>  
> -	max_optlen = sockopt_alloc_buf(&ctx, max_optlen);
> +	max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
>  	if (max_optlen < 0)
>  		return max_optlen;
>  
> @@ -1419,7 +1415,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
>  			 * No way to export on-stack buf, have to allocate a
>  			 * new buffer.
>  			 */
> -			if (!sockopt_buf_allocated(&ctx)) {
> +			if (!sockopt_buf_allocated(&ctx, &buf)) {
>  				void *p = kzalloc(ctx.optlen, GFP_USER);
>  
>  				if (!p) {
> @@ -1436,7 +1432,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
>  
>  out:
>  	if (ret)
> -		sockopt_free_buf(&ctx);
> +		sockopt_free_buf(&ctx, &buf);
>  	return ret;
>  }
>  
> @@ -1445,15 +1441,20 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
>  				       int __user *optlen, int max_optlen,
>  				       int retval)
>  {
> -	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
> -	struct bpf_sockopt_kern ctx = {
> -		.sk = sk,
> -		.level = level,
> -		.optname = optname,
> -		.retval = retval,
> -	};
This change looks unnecessary?

> +	struct bpf_sockopt_kern ctx;
> +	struct bpf_sockopt_buf buf;
> +	struct cgroup *cgrp;
>  	int ret;
>  
> +	memset(&buf, 0, sizeof(buf));
> +	memset(&ctx, 0, sizeof(ctx));
> +
> +	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
> +	ctx.sk = sk;
> +	ctx.level = level;
> +	ctx.optname = optname;
> +	ctx.retval = retval;
> +
Stanislav Fomichev Jan. 8, 2021, 1:25 a.m. UTC | #2
On Thu, Jan 7, 2021 at 5:09 PM Martin KaFai Lau <kafai@fb.com> wrote:
>
> On Thu, Jan 07, 2021 at 10:43:05AM -0800, Stanislav Fomichev wrote:
> > Add custom implementation of getsockopt hook for TCP_ZEROCOPY_RECEIVE.
> > We skip generic hooks for TCP_ZEROCOPY_RECEIVE and have a custom
> > call in do_tcp_getsockopt using the on-stack data. This removes
> > 2% overhead for locking/unlocking the socket.
> >
> > Also:
> > - Removed BUILD_BUG_ON (zerocopy doesn't depend on the buf size anymore)
> > - Separated on-stack buffer into bpf_sockopt_buf and downsized to 32 bytes
> >   (let's keep it to help with the other options)
> >
> > (I can probably split this patch into two: add new features and rework
> >  bpf_sockopt_buf; can follow up if the approach in general sounds
> >  good).
> >
> > Without this patch:
> >      1.87%     0.06%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_getsockopt
> >
> > With the patch applied:
> >      0.52%     0.12%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_getsockopt_kern
> >
> > Signed-off-by: Stanislav Fomichev <sdf@google.com>
> > Cc: Martin KaFai Lau <kafai@fb.com>
> > Cc: Song Liu <songliubraving@fb.com>
> > Cc: Eric Dumazet <edumazet@google.com>
> > ---
> >  include/linux/bpf-cgroup.h                    | 25 ++++-
> >  include/linux/filter.h                        |  6 +-
> >  include/net/sock.h                            |  2 +
> >  include/net/tcp.h                             |  1 +
> >  kernel/bpf/cgroup.c                           | 93 +++++++++++++------
> >  net/ipv4/tcp.c                                | 14 +++
> >  net/ipv4/tcp_ipv4.c                           |  1 +
> >  net/ipv6/tcp_ipv6.c                           |  1 +
> >  .../selftests/bpf/prog_tests/sockopt_sk.c     | 22 +++++
> >  .../testing/selftests/bpf/progs/sockopt_sk.c  | 15 +++
> >  10 files changed, 147 insertions(+), 33 deletions(-)
> >
>
> [ ... ]
>
> > @@ -454,6 +469,8 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
> >  #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; })
> >  #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \
> >                                      optlen, max_optlen, retval) ({ retval; })
> > +#define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \
> > +                                         optlen, retval) ({ retval; })
> >  #define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \
> >                                      kernel_optval) ({ 0; })
> >
> > diff --git a/include/linux/filter.h b/include/linux/filter.h
> > index 54a4225f36d8..8739f1d4cac4 100644
> > --- a/include/linux/filter.h
> > +++ b/include/linux/filter.h
> > @@ -1281,7 +1281,10 @@ struct bpf_sysctl_kern {
> >       u64 tmp_reg;
> >  };
> >
> > -#define BPF_SOCKOPT_KERN_BUF_SIZE    64
> > +#define BPF_SOCKOPT_KERN_BUF_SIZE    32
> It is reduced from patch 1 because there is no
> need to use the buf (and copy from/to buf) in TCP_ZEROCOPY_RECEIVE?
>
> Patch 1 is still desired (and kept in this set) because it may still
> benefit other optnames?
Right, it seems like a good idea to keep it to help with the (majority?)
of small socket options.

> > +struct bpf_sockopt_buf {
> > +     u8              data[BPF_SOCKOPT_KERN_BUF_SIZE];
> > +};
> >
> >  struct bpf_sockopt_kern {
> >       struct sock     *sk;
> > @@ -1291,7 +1294,6 @@ struct bpf_sockopt_kern {
> >       s32             optname;
> >       s32             optlen;
> >       s32             retval;
> > -     u8              buf[BPF_SOCKOPT_KERN_BUF_SIZE];
> It is better to pick one way to do things to avoid code
> churn like this within the same series.
Agreed. I pointed out in the commit description that it might be a
good idea to separate those changes.
I wasn't sure about the fate of this patch when I first sent it out
and didn't spend too much time on this sort of stuff.
Let me simplify/reorder as you suggested below and resend.

> >  };
> >
> >  int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len);
> > diff --git a/include/net/sock.h b/include/net/sock.h
> > index bdc4323ce53c..ebf44d724845 100644
> > --- a/include/net/sock.h
> > +++ b/include/net/sock.h
> > @@ -1174,6 +1174,8 @@ struct proto {
> >
> >       int                     (*backlog_rcv) (struct sock *sk,
> >                                               struct sk_buff *skb);
> > +     bool                    (*bpf_bypass_getsockopt)(int level,
> > +                                                      int optname);
> >
> >       void            (*release_cb)(struct sock *sk);
> >
> > diff --git a/include/net/tcp.h b/include/net/tcp.h
> > index 78d13c88720f..4bb42fb19711 100644
> > --- a/include/net/tcp.h
> > +++ b/include/net/tcp.h
> > @@ -403,6 +403,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock,
> >                     struct poll_table_struct *wait);
> >  int tcp_getsockopt(struct sock *sk, int level, int optname,
> >                  char __user *optval, int __user *optlen);
> > +bool tcp_bpf_bypass_getsockopt(int level, int optname);
> >  int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
> >                  unsigned int optlen);
> >  void tcp_set_keepalive(struct sock *sk, int val);
> > diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> > index adbecdcaa370..e82df63aedc7 100644
> > --- a/kernel/bpf/cgroup.c
> > +++ b/kernel/bpf/cgroup.c
> > @@ -16,7 +16,6 @@
> >  #include <linux/bpf-cgroup.h>
> >  #include <net/sock.h>
> >  #include <net/bpf_sk_storage.h>
> > -#include <uapi/linux/tcp.h> /* sizeof(struct tcp_zerocopy_receive) */
> Can the patches be re-ordered a little to avoid code churn like this
> in the same series?
>
> It feels like this patch 3 should be the first patch instead.
> The current patch 1 should be the second patch
> but it can still use the tcp_mmap to show potential
> benefit for other optnames.
>
> >
> >  #include "../cgroup/cgroup-internal.h"
> >
> > @@ -1299,7 +1298,8 @@ static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
> >       return empty;
> >  }
> >
> > -static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
> > +static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
> > +                          struct bpf_sockopt_buf *buf)
> >  {
> >       if (unlikely(max_optlen < 0))
> >               return -EINVAL;
> > @@ -1311,18 +1311,11 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
> >               max_optlen = PAGE_SIZE;
> >       }
> >
> > -     if (max_optlen <= sizeof(ctx->buf)) {
> > +     if (max_optlen <= sizeof(buf->data)) {
> >               /* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE
> >                * bytes avoid the cost of kzalloc.
> > -              *
> > -              * In order to remove extra allocations from the TCP
> > -              * fast zero-copy path ensure that buffer covers
> > -              * the size of struct tcp_zerocopy_receive.
> >                */
> > -             BUILD_BUG_ON(sizeof(struct tcp_zerocopy_receive) >
> > -                          BPF_SOCKOPT_KERN_BUF_SIZE);
> > -
> > -             ctx->optval = ctx->buf;
> > +             ctx->optval = buf->data;
> >               ctx->optval_end = ctx->optval + max_optlen;
> >               return max_optlen;
> >       }
> > @@ -1336,16 +1329,18 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
> >       return max_optlen;
> >  }
> >
> > -static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
> > +static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
> > +                          struct bpf_sockopt_buf *buf)
> >  {
> > -     if (ctx->optval == ctx->buf)
> > +     if (ctx->optval == buf->data)
> >               return;
> >       kfree(ctx->optval);
> >  }
> >
> > -static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx)
> > +static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
> > +                               struct bpf_sockopt_buf *buf)
> >  {
> > -     return ctx->optval != ctx->buf;
> > +     return ctx->optval != buf->data;
> >  }
> >
> >  int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
> > @@ -1353,6 +1348,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
> >                                      int *optlen, char **kernel_optval)
> >  {
> >       struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
> > +     struct bpf_sockopt_buf buf = {};
> >       struct bpf_sockopt_kern ctx = {
> >               .sk = sk,
> >               .level = *level,
> > @@ -1373,7 +1369,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
> >        */
> >       max_optlen = max_t(int, 16, *optlen);
> >
> > -     max_optlen = sockopt_alloc_buf(&ctx, max_optlen);
> > +     max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
> >       if (max_optlen < 0)
> >               return max_optlen;
> >
> > @@ -1419,7 +1415,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
> >                        * No way to export on-stack buf, have to allocate a
> >                        * new buffer.
> >                        */
> > -                     if (!sockopt_buf_allocated(&ctx)) {
> > +                     if (!sockopt_buf_allocated(&ctx, &buf)) {
> >                               void *p = kzalloc(ctx.optlen, GFP_USER);
> >
> >                               if (!p) {
> > @@ -1436,7 +1432,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
> >
> >  out:
> >       if (ret)
> > -             sockopt_free_buf(&ctx);
> > +             sockopt_free_buf(&ctx, &buf);
> >       return ret;
> >  }
> >
> > @@ -1445,15 +1441,20 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
> >                                      int __user *optlen, int max_optlen,
> >                                      int retval)
> >  {
> > -     struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
> > -     struct bpf_sockopt_kern ctx = {
> > -             .sk = sk,
> > -             .level = level,
> > -             .optname = optname,
> > -             .retval = retval,
> > -     };
> This change looks unnecessary?
>
> > +     struct bpf_sockopt_kern ctx;
> > +     struct bpf_sockopt_buf buf;
> > +     struct cgroup *cgrp;
> >       int ret;
> >
> > +     memset(&buf, 0, sizeof(buf));
> > +     memset(&ctx, 0, sizeof(ctx));
> > +
> > +     cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
> > +     ctx.sk = sk;
> > +     ctx.level = level;
> > +     ctx.optname = optname;
> > +     ctx.retval = retval;
> > +
diff mbox series

Patch

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index dd4b8e300746..cbba9c9ab073 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -147,6 +147,10 @@  int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 				       int __user *optlen, int max_optlen,
 				       int retval);
 
+int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
+					    int optname, void *optval,
+					    int *optlen, int retval);
+
 static inline enum bpf_cgroup_storage_type cgroup_storage_type(
 	struct bpf_map *map)
 {
@@ -366,10 +370,21 @@  int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 ({									       \
 	int __ret = retval;						       \
 	if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT))			       \
-		__ret = __cgroup_bpf_run_filter_getsockopt(sock, level,	       \
-							   optname, optval,    \
-							   optlen, max_optlen, \
-							   retval);	       \
+		if (!(sock)->sk_prot->bpf_bypass_getsockopt ||		       \
+		    !(sock)->sk_prot->bpf_bypass_getsockopt(level, optname))   \
+			__ret = __cgroup_bpf_run_filter_getsockopt(	       \
+				sock, level, optname, optval, optlen,	       \
+				max_optlen, retval);			       \
+	__ret;								       \
+})
+
+#define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval,      \
+					    optlen, retval)		       \
+({									       \
+	int __ret = retval;						       \
+	if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT))			       \
+		__ret = __cgroup_bpf_run_filter_getsockopt_kern(	       \
+			sock, level, optname, optval, optlen, retval);	       \
 	__ret;								       \
 })
 
@@ -454,6 +469,8 @@  static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \
 				       optlen, max_optlen, retval) ({ retval; })
+#define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \
+					    optlen, retval) ({ retval; })
 #define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \
 				       kernel_optval) ({ 0; })
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 54a4225f36d8..8739f1d4cac4 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1281,7 +1281,10 @@  struct bpf_sysctl_kern {
 	u64 tmp_reg;
 };
 
-#define BPF_SOCKOPT_KERN_BUF_SIZE	64
+#define BPF_SOCKOPT_KERN_BUF_SIZE	32
+struct bpf_sockopt_buf {
+	u8		data[BPF_SOCKOPT_KERN_BUF_SIZE];
+};
 
 struct bpf_sockopt_kern {
 	struct sock	*sk;
@@ -1291,7 +1294,6 @@  struct bpf_sockopt_kern {
 	s32		optname;
 	s32		optlen;
 	s32		retval;
-	u8		buf[BPF_SOCKOPT_KERN_BUF_SIZE];
 };
 
 int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len);
diff --git a/include/net/sock.h b/include/net/sock.h
index bdc4323ce53c..ebf44d724845 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1174,6 +1174,8 @@  struct proto {
 
 	int			(*backlog_rcv) (struct sock *sk,
 						struct sk_buff *skb);
+	bool			(*bpf_bypass_getsockopt)(int level,
+							 int optname);
 
 	void		(*release_cb)(struct sock *sk);
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 78d13c88720f..4bb42fb19711 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -403,6 +403,7 @@  __poll_t tcp_poll(struct file *file, struct socket *sock,
 		      struct poll_table_struct *wait);
 int tcp_getsockopt(struct sock *sk, int level, int optname,
 		   char __user *optval, int __user *optlen);
+bool tcp_bpf_bypass_getsockopt(int level, int optname);
 int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
 		   unsigned int optlen);
 void tcp_set_keepalive(struct sock *sk, int val);
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index adbecdcaa370..e82df63aedc7 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -16,7 +16,6 @@ 
 #include <linux/bpf-cgroup.h>
 #include <net/sock.h>
 #include <net/bpf_sk_storage.h>
-#include <uapi/linux/tcp.h> /* sizeof(struct tcp_zerocopy_receive) */
 
 #include "../cgroup/cgroup-internal.h"
 
@@ -1299,7 +1298,8 @@  static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
 	return empty;
 }
 
-static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
+static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
+			     struct bpf_sockopt_buf *buf)
 {
 	if (unlikely(max_optlen < 0))
 		return -EINVAL;
@@ -1311,18 +1311,11 @@  static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
 		max_optlen = PAGE_SIZE;
 	}
 
-	if (max_optlen <= sizeof(ctx->buf)) {
+	if (max_optlen <= sizeof(buf->data)) {
 		/* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE
 		 * bytes avoid the cost of kzalloc.
-		 *
-		 * In order to remove extra allocations from the TCP
-		 * fast zero-copy path ensure that buffer covers
-		 * the size of struct tcp_zerocopy_receive.
 		 */
-		BUILD_BUG_ON(sizeof(struct tcp_zerocopy_receive) >
-			     BPF_SOCKOPT_KERN_BUF_SIZE);
-
-		ctx->optval = ctx->buf;
+		ctx->optval = buf->data;
 		ctx->optval_end = ctx->optval + max_optlen;
 		return max_optlen;
 	}
@@ -1336,16 +1329,18 @@  static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
 	return max_optlen;
 }
 
-static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
+static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
+			     struct bpf_sockopt_buf *buf)
 {
-	if (ctx->optval == ctx->buf)
+	if (ctx->optval == buf->data)
 		return;
 	kfree(ctx->optval);
 }
 
-static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx)
+static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
+				  struct bpf_sockopt_buf *buf)
 {
-	return ctx->optval != ctx->buf;
+	return ctx->optval != buf->data;
 }
 
 int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
@@ -1353,6 +1348,7 @@  int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 				       int *optlen, char **kernel_optval)
 {
 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	struct bpf_sockopt_buf buf = {};
 	struct bpf_sockopt_kern ctx = {
 		.sk = sk,
 		.level = *level,
@@ -1373,7 +1369,7 @@  int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 	 */
 	max_optlen = max_t(int, 16, *optlen);
 
-	max_optlen = sockopt_alloc_buf(&ctx, max_optlen);
+	max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
 	if (max_optlen < 0)
 		return max_optlen;
 
@@ -1419,7 +1415,7 @@  int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 			 * No way to export on-stack buf, have to allocate a
 			 * new buffer.
 			 */
-			if (!sockopt_buf_allocated(&ctx)) {
+			if (!sockopt_buf_allocated(&ctx, &buf)) {
 				void *p = kzalloc(ctx.optlen, GFP_USER);
 
 				if (!p) {
@@ -1436,7 +1432,7 @@  int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 
 out:
 	if (ret)
-		sockopt_free_buf(&ctx);
+		sockopt_free_buf(&ctx, &buf);
 	return ret;
 }
 
@@ -1445,15 +1441,20 @@  int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 				       int __user *optlen, int max_optlen,
 				       int retval)
 {
-	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-	struct bpf_sockopt_kern ctx = {
-		.sk = sk,
-		.level = level,
-		.optname = optname,
-		.retval = retval,
-	};
+	struct bpf_sockopt_kern ctx;
+	struct bpf_sockopt_buf buf;
+	struct cgroup *cgrp;
 	int ret;
 
+	memset(&buf, 0, sizeof(buf));
+	memset(&ctx, 0, sizeof(ctx));
+
+	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	ctx.sk = sk;
+	ctx.level = level;
+	ctx.optname = optname;
+	ctx.retval = retval;
+
 	/* Opportunistic check to see whether we have any BPF program
 	 * attached to the hook so we don't waste time allocating
 	 * memory and locking the socket.
@@ -1463,7 +1464,7 @@  int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 
 	ctx.optlen = max_optlen;
 
-	max_optlen = sockopt_alloc_buf(&ctx, max_optlen);
+	max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
 	if (max_optlen < 0)
 		return max_optlen;
 
@@ -1521,9 +1522,47 @@  int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 	ret = ctx.retval;
 
 out:
-	sockopt_free_buf(&ctx);
+	sockopt_free_buf(&ctx, &buf);
 	return ret;
 }
+
+int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
+					    int optname, void *optval,
+					    int *optlen, int retval)
+{
+	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	struct bpf_sockopt_kern ctx = {
+		.sk = sk,
+		.level = level,
+		.optname = optname,
+		.retval = retval,
+		.optlen = *optlen,
+		.optval = optval,
+		.optval_end = optval + *optlen,
+	};
+	int ret;
+
+	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
+				 &ctx, BPF_PROG_RUN);
+	if (!ret)
+		return -EPERM;
+
+	if (ctx.optlen > *optlen)
+		return -EFAULT;
+
+	/* BPF programs only allowed to set retval to 0, not some
+	 * arbitrary value.
+	 */
+	if (ctx.retval != 0 && ctx.retval != retval)
+		return -EFAULT;
+
+	/* BPF programs can shrink the buffer, export the modifications.
+	 */
+	if (ctx.optlen != 0)
+		*optlen = ctx.optlen;
+
+	return ctx.retval;
+}
 #endif
 
 static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ed42d2193c5c..ef3c895b66c1 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4098,6 +4098,8 @@  static int do_tcp_getsockopt(struct sock *sk, int level,
 			return -EFAULT;
 		lock_sock(sk);
 		err = tcp_zerocopy_receive(sk, &zc);
+		err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
+							  &zc, &len, err);
 		release_sock(sk);
 		if (len >= offsetofend(struct tcp_zerocopy_receive, err))
 			goto zerocopy_rcv_sk_err;
@@ -4132,6 +4134,18 @@  static int do_tcp_getsockopt(struct sock *sk, int level,
 	return 0;
 }
 
+bool tcp_bpf_bypass_getsockopt(int level, int optname)
+{
+	/* TCP do_tcp_getsockopt has optimized getsockopt implementation
+	 * to avoid extra socket lock for TCP_ZEROCOPY_RECEIVE.
+	 */
+	if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE)
+		return true;
+
+	return false;
+}
+EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt);
+
 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 		   int __user *optlen)
 {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 58207c7769d0..8b4906980fce 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2792,6 +2792,7 @@  struct proto tcp_prot = {
 	.shutdown		= tcp_shutdown,
 	.setsockopt		= tcp_setsockopt,
 	.getsockopt		= tcp_getsockopt,
+	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
 	.keepalive		= tcp_set_keepalive,
 	.recvmsg		= tcp_recvmsg,
 	.sendmsg		= tcp_sendmsg,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index e254569a3005..6624eccff85b 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2121,6 +2121,7 @@  struct proto tcpv6_prot = {
 	.shutdown		= tcp_shutdown,
 	.setsockopt		= tcp_setsockopt,
 	.getsockopt		= tcp_getsockopt,
+	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
 	.keepalive		= tcp_set_keepalive,
 	.recvmsg		= tcp_recvmsg,
 	.sendmsg		= tcp_sendmsg,
diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c
index b25c9c45c148..6bb18b1d8578 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c
@@ -11,6 +11,7 @@  static int getsetsockopt(void)
 		char u8[4];
 		__u32 u32;
 		char cc[16]; /* TCP_CA_NAME_MAX */
+		struct tcp_zerocopy_receive zc;
 	} buf = {};
 	socklen_t optlen;
 	char *big_buf = NULL;
@@ -154,6 +155,27 @@  static int getsetsockopt(void)
 		goto err;
 	}
 
+	/* TCP_ZEROCOPY_RECEIVE triggers */
+	memset(&buf, 0, sizeof(buf));
+	optlen = sizeof(buf.zc);
+	err = getsockopt(fd, SOL_TCP, TCP_ZEROCOPY_RECEIVE, &buf, &optlen);
+	if (err) {
+		log_err("Unexpected getsockopt(TCP_ZEROCOPY_RECEIVE) err=%d errno=%d",
+			err, errno);
+		goto err;
+	}
+
+	memset(&buf, 0, sizeof(buf));
+	buf.zc.address = 12345; /* rejected by BPF */
+	optlen = sizeof(buf.zc);
+	errno = 0;
+	err = getsockopt(fd, SOL_TCP, TCP_ZEROCOPY_RECEIVE, &buf, &optlen);
+	if (errno != EPERM) {
+		log_err("Unexpected getsockopt(TCP_ZEROCOPY_RECEIVE) err=%d errno=%d",
+			err, errno);
+		goto err;
+	}
+
 	free(big_buf);
 	close(fd);
 	return 0;
diff --git a/tools/testing/selftests/bpf/progs/sockopt_sk.c b/tools/testing/selftests/bpf/progs/sockopt_sk.c
index 712df7b49cb1..c726f0763a13 100644
--- a/tools/testing/selftests/bpf/progs/sockopt_sk.c
+++ b/tools/testing/selftests/bpf/progs/sockopt_sk.c
@@ -57,6 +57,21 @@  int _getsockopt(struct bpf_sockopt *ctx)
 		return 1;
 	}
 
+	if (ctx->level == SOL_TCP && ctx->optname == TCP_ZEROCOPY_RECEIVE) {
+		/* Verify that TCP_ZEROCOPY_RECEIVE triggers.
+		 * It has a custom implementation for performance
+		 * reasons.
+		 */
+
+		if (optval + sizeof(struct tcp_zerocopy_receive) > optval_end)
+			return 0; /* EPERM, bounds check */
+
+		if (((struct tcp_zerocopy_receive *)optval)->address != 0)
+			return 0; /* EPERM, unexpected data */
+
+		return 1;
+	}
+
 	if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) {
 		if (optval + 1 > optval_end)
 			return 0; /* EPERM, bounds check */