Message ID | 20241107210331.3044434-1-almasrymina@google.com (mailing list archive) |
---|---|
State | Accepted |
Commit | f2685c00c3222305f5b6740a8b16ea044640283a |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | [net,v2,1/2] net: fix SO_DEVMEM_DONTNEED looping too long | expand |
On 11/07, Mina Almasry wrote: > Exit early if we're freeing more than 1024 frags, to prevent > looping too long. > > Also minor code cleanups: > - Flip checks to reduce indentation. > - Use sizeof(*tokens) everywhere for consistentcy. > > Cc: Yi Lai <yi1.lai@linux.intel.com> > Cc: Stanislav Fomichev <sdf@fomichev.me> > Signed-off-by: Mina Almasry <almasrymina@google.com> > > --- > > v2: > - Retain token check to prevent allocation of too much memory. > - Exit early instead of pre-checking in a loop so that we don't penalize > well behaved applications (sdf) > > --- > net/core/sock.c | 42 ++++++++++++++++++++++++------------------ > 1 file changed, 24 insertions(+), 18 deletions(-) > > diff --git a/net/core/sock.c b/net/core/sock.c > index 039be95c40cf..da50df485090 100644 > --- a/net/core/sock.c > +++ b/net/core/sock.c > @@ -1052,32 +1052,34 @@ static int sock_reserve_memory(struct sock *sk, int bytes) > > #ifdef CONFIG_PAGE_POOL > > -/* This is the number of tokens that the user can SO_DEVMEM_DONTNEED in > - * 1 syscall. The limit exists to limit the amount of memory the kernel > - * allocates to copy these tokens. > +/* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED > + * in 1 syscall. The limit exists to limit the amount of memory the kernel > + * allocates to copy these tokens, and to prevent looping over the frags for > + * too long. > */ > #define MAX_DONTNEED_TOKENS 128 > +#define MAX_DONTNEED_FRAGS 1024 > > static noinline_for_stack int > sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen) > { > unsigned int num_tokens, i, j, k, netmem_num = 0; > struct dmabuf_token *tokens; > + int ret = 0, num_frags = 0; > netmem_ref netmems[16]; > - int ret = 0; > > if (!sk_is_tcp(sk)) > return -EBADF; > > - if (optlen % sizeof(struct dmabuf_token) || > + if (optlen % sizeof(*tokens) || > optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS) > return -EINVAL; > [..] > - tokens = kvmalloc_array(optlen, sizeof(*tokens), GFP_KERNEL); Oh, so we currently allocate optlen*8? This is a sneaky fix :-p > + num_tokens = optlen / sizeof(*tokens); > + tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL); > if (!tokens) > return -ENOMEM; > > - num_tokens = optlen / sizeof(struct dmabuf_token); > if (copy_from_sockptr(tokens, optval, optlen)) { > kvfree(tokens); > return -EFAULT; > @@ -1086,24 +1088,28 @@ sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen) > xa_lock_bh(&sk->sk_user_frags); > for (i = 0; i < num_tokens; i++) { > for (j = 0; j < tokens[i].token_count; j++) { [..] > + if (++num_frags > MAX_DONTNEED_FRAGS) > + goto frag_limit_reached; > + nit: maybe reuse existing ret (and rename it to num_frags) instead of introducing new num_frags? Both variables now seem to track the same number.
On Thu, Nov 7, 2024 at 5:28 PM Stanislav Fomichev <stfomichev@gmail.com> wrote: > > On 11/07, Mina Almasry wrote: > > Exit early if we're freeing more than 1024 frags, to prevent > > looping too long. > > > > Also minor code cleanups: > > - Flip checks to reduce indentation. > > - Use sizeof(*tokens) everywhere for consistentcy. > > > > Cc: Yi Lai <yi1.lai@linux.intel.com> > > Cc: Stanislav Fomichev <sdf@fomichev.me> > > Signed-off-by: Mina Almasry <almasrymina@google.com> > > > > --- > > > > v2: > > - Retain token check to prevent allocation of too much memory. > > - Exit early instead of pre-checking in a loop so that we don't penalize > > well behaved applications (sdf) > > > > --- > > net/core/sock.c | 42 ++++++++++++++++++++++++------------------ > > 1 file changed, 24 insertions(+), 18 deletions(-) > > > > diff --git a/net/core/sock.c b/net/core/sock.c > > index 039be95c40cf..da50df485090 100644 > > --- a/net/core/sock.c > > +++ b/net/core/sock.c > > @@ -1052,32 +1052,34 @@ static int sock_reserve_memory(struct sock *sk, int bytes) > > > > #ifdef CONFIG_PAGE_POOL > > > > -/* This is the number of tokens that the user can SO_DEVMEM_DONTNEED in > > - * 1 syscall. The limit exists to limit the amount of memory the kernel > > - * allocates to copy these tokens. > > +/* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED > > + * in 1 syscall. The limit exists to limit the amount of memory the kernel > > + * allocates to copy these tokens, and to prevent looping over the frags for > > + * too long. > > */ > > #define MAX_DONTNEED_TOKENS 128 > > +#define MAX_DONTNEED_FRAGS 1024 > > > > static noinline_for_stack int > > sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen) > > { > > unsigned int num_tokens, i, j, k, netmem_num = 0; > > struct dmabuf_token *tokens; > > + int ret = 0, num_frags = 0; > > netmem_ref netmems[16]; > > - int ret = 0; > > > > if (!sk_is_tcp(sk)) > > return -EBADF; > > > > - if (optlen % sizeof(struct dmabuf_token) || > > + if (optlen % sizeof(*tokens) || > > optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS) > > return -EINVAL; > > > > [..] > > > - tokens = kvmalloc_array(optlen, sizeof(*tokens), GFP_KERNEL); > > Oh, so we currently allocate optlen*8? This is a sneaky fix :-p > > > + num_tokens = optlen / sizeof(*tokens); > > + tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL); > > if (!tokens) > > return -ENOMEM; > > > > - num_tokens = optlen / sizeof(struct dmabuf_token); > > if (copy_from_sockptr(tokens, optval, optlen)) { > > kvfree(tokens); > > return -EFAULT; > > @@ -1086,24 +1088,28 @@ sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen) > > xa_lock_bh(&sk->sk_user_frags); > > for (i = 0; i < num_tokens; i++) { > > for (j = 0; j < tokens[i].token_count; j++) { > > [..] > > > + if (++num_frags > MAX_DONTNEED_FRAGS) > > + goto frag_limit_reached; > > + > > nit: maybe reuse existing ret (and rename it to num_frags) instead of > introducing new num_frags? Both variables now seem to track the same > number. I almost sent it this way, but I think that would be wrong. num_frags is all the frags inspected. ret is all the frags freed. The difference is subtle but critical. We want to exit when we've inspected 1024 frags, not when we've freed 1024 frags, because the user may make us loop forever if they ask us to free 10000000 frags of which none exist.
On 11/07, Mina Almasry wrote: > On Thu, Nov 7, 2024 at 5:28 PM Stanislav Fomichev <stfomichev@gmail.com> wrote: > > > > On 11/07, Mina Almasry wrote: > > > Exit early if we're freeing more than 1024 frags, to prevent > > > looping too long. > > > > > > Also minor code cleanups: > > > - Flip checks to reduce indentation. > > > - Use sizeof(*tokens) everywhere for consistentcy. > > > > > > Cc: Yi Lai <yi1.lai@linux.intel.com> > > > Cc: Stanislav Fomichev <sdf@fomichev.me> > > > Signed-off-by: Mina Almasry <almasrymina@google.com> > > > > > > --- > > > > > > v2: > > > - Retain token check to prevent allocation of too much memory. > > > - Exit early instead of pre-checking in a loop so that we don't penalize > > > well behaved applications (sdf) > > > > > > --- > > > net/core/sock.c | 42 ++++++++++++++++++++++++------------------ > > > 1 file changed, 24 insertions(+), 18 deletions(-) > > > > > > diff --git a/net/core/sock.c b/net/core/sock.c > > > index 039be95c40cf..da50df485090 100644 > > > --- a/net/core/sock.c > > > +++ b/net/core/sock.c > > > @@ -1052,32 +1052,34 @@ static int sock_reserve_memory(struct sock *sk, int bytes) > > > > > > #ifdef CONFIG_PAGE_POOL > > > > > > -/* This is the number of tokens that the user can SO_DEVMEM_DONTNEED in > > > - * 1 syscall. The limit exists to limit the amount of memory the kernel > > > - * allocates to copy these tokens. > > > +/* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED > > > + * in 1 syscall. The limit exists to limit the amount of memory the kernel > > > + * allocates to copy these tokens, and to prevent looping over the frags for > > > + * too long. > > > */ > > > #define MAX_DONTNEED_TOKENS 128 > > > +#define MAX_DONTNEED_FRAGS 1024 > > > > > > static noinline_for_stack int > > > sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen) > > > { > > > unsigned int num_tokens, i, j, k, netmem_num = 0; > > > struct dmabuf_token *tokens; > > > + int ret = 0, num_frags = 0; > > > netmem_ref netmems[16]; > > > - int ret = 0; > > > > > > if (!sk_is_tcp(sk)) > > > return -EBADF; > > > > > > - if (optlen % sizeof(struct dmabuf_token) || > > > + if (optlen % sizeof(*tokens) || > > > optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS) > > > return -EINVAL; > > > > > > > [..] > > > > > - tokens = kvmalloc_array(optlen, sizeof(*tokens), GFP_KERNEL); > > > > Oh, so we currently allocate optlen*8? This is a sneaky fix :-p > > > > > + num_tokens = optlen / sizeof(*tokens); > > > + tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL); > > > if (!tokens) > > > return -ENOMEM; > > > > > > - num_tokens = optlen / sizeof(struct dmabuf_token); > > > if (copy_from_sockptr(tokens, optval, optlen)) { > > > kvfree(tokens); > > > return -EFAULT; > > > @@ -1086,24 +1088,28 @@ sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen) > > > xa_lock_bh(&sk->sk_user_frags); > > > for (i = 0; i < num_tokens; i++) { > > > for (j = 0; j < tokens[i].token_count; j++) { > > > > [..] > > > > > + if (++num_frags > MAX_DONTNEED_FRAGS) > > > + goto frag_limit_reached; > > > + > > > > nit: maybe reuse existing ret (and rename it to num_frags) instead of > > introducing new num_frags? Both variables now seem to track the same > > number. > > I almost sent it this way, but I think that would be wrong. > > num_frags is all the frags inspected. > ret is all the frags freed. > > The difference is subtle but critical. We want to exit when we've > inspected 1024 frags, not when we've freed 1024 frags, because the > user may make us loop forever if they ask us to free 10000000 frags of > which none exist. I see. Maybe can mitigate the damage with the following: for (i = 0; i < min(num_tokens, MAX_DONTNEED_FRAGS); i++) for (j = 0; j < min(tokens[i].token_count, MAX_DONTNEED_FRAGS); j++) In this case, worst case, we loop 1024*1024 on the invalid input :-D But up to you, separate num_frag works as well (but, as you've seen with my initial reply, it's not super straightforward). Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Hello: This series was applied to netdev/net.git (main) by Jakub Kicinski <kuba@kernel.org>: On Thu, 7 Nov 2024 21:03:30 +0000 you wrote: > Exit early if we're freeing more than 1024 frags, to prevent > looping too long. > > Also minor code cleanups: > - Flip checks to reduce indentation. > - Use sizeof(*tokens) everywhere for consistentcy. > > [...] Here is the summary with links: - [net,v2,1/2] net: fix SO_DEVMEM_DONTNEED looping too long https://git.kernel.org/netdev/net/c/f2685c00c322 - [net,v2,2/2] net: clarify SO_DEVMEM_DONTNEED behavior in documentation https://git.kernel.org/netdev/net/c/102d1404c385 You are awesome, thank you!
diff --git a/net/core/sock.c b/net/core/sock.c index 039be95c40cf..da50df485090 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1052,32 +1052,34 @@ static int sock_reserve_memory(struct sock *sk, int bytes) #ifdef CONFIG_PAGE_POOL -/* This is the number of tokens that the user can SO_DEVMEM_DONTNEED in - * 1 syscall. The limit exists to limit the amount of memory the kernel - * allocates to copy these tokens. +/* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED + * in 1 syscall. The limit exists to limit the amount of memory the kernel + * allocates to copy these tokens, and to prevent looping over the frags for + * too long. */ #define MAX_DONTNEED_TOKENS 128 +#define MAX_DONTNEED_FRAGS 1024 static noinline_for_stack int sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen) { unsigned int num_tokens, i, j, k, netmem_num = 0; struct dmabuf_token *tokens; + int ret = 0, num_frags = 0; netmem_ref netmems[16]; - int ret = 0; if (!sk_is_tcp(sk)) return -EBADF; - if (optlen % sizeof(struct dmabuf_token) || + if (optlen % sizeof(*tokens) || optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS) return -EINVAL; - tokens = kvmalloc_array(optlen, sizeof(*tokens), GFP_KERNEL); + num_tokens = optlen / sizeof(*tokens); + tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL); if (!tokens) return -ENOMEM; - num_tokens = optlen / sizeof(struct dmabuf_token); if (copy_from_sockptr(tokens, optval, optlen)) { kvfree(tokens); return -EFAULT; @@ -1086,24 +1088,28 @@ sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen) xa_lock_bh(&sk->sk_user_frags); for (i = 0; i < num_tokens; i++) { for (j = 0; j < tokens[i].token_count; j++) { + if (++num_frags > MAX_DONTNEED_FRAGS) + goto frag_limit_reached; + netmem_ref netmem = (__force netmem_ref)__xa_erase( &sk->sk_user_frags, tokens[i].token_start + j); - if (netmem && - !WARN_ON_ONCE(!netmem_is_net_iov(netmem))) { - netmems[netmem_num++] = netmem; - if (netmem_num == ARRAY_SIZE(netmems)) { - xa_unlock_bh(&sk->sk_user_frags); - for (k = 0; k < netmem_num; k++) - WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); - netmem_num = 0; - xa_lock_bh(&sk->sk_user_frags); - } - ret++; + if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem))) + continue; + + netmems[netmem_num++] = netmem; + if (netmem_num == ARRAY_SIZE(netmems)) { + xa_unlock_bh(&sk->sk_user_frags); + for (k = 0; k < netmem_num; k++) + WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); + netmem_num = 0; + xa_lock_bh(&sk->sk_user_frags); } + ret++; } } +frag_limit_reached: xa_unlock_bh(&sk->sk_user_frags); for (k = 0; k < netmem_num; k++) WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
Exit early if we're freeing more than 1024 frags, to prevent looping too long. Also minor code cleanups: - Flip checks to reduce indentation. - Use sizeof(*tokens) everywhere for consistentcy. Cc: Yi Lai <yi1.lai@linux.intel.com> Cc: Stanislav Fomichev <sdf@fomichev.me> Signed-off-by: Mina Almasry <almasrymina@google.com> --- v2: - Retain token check to prevent allocation of too much memory. - Exit early instead of pre-checking in a loop so that we don't penalize well behaved applications (sdf) --- net/core/sock.c | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-)