Message ID | 20220427175048.225235-1-maximmi@nvidia.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | [net-next] tls: Add opt-in zerocopy mode of sendfile() | expand |
Context | Check | Description |
---|---|---|
netdev/tree_selection | success | Clearly marked for net-next |
netdev/apply | fail | Patch does not apply to net-next |
> diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c > index b12f81a2b44c..715401b20c8b 100644 > --- a/net/tls/tls_device.c > +++ b/net/tls/tls_device.c > @@ -411,10 +411,16 @@ static int tls_device_copy_data(void *addr, size_t bytes, struct iov_iter *i) > return 0; > } > > +union tls_iter_offset { > + struct iov_iter *msg_iter; > + int offset; > +}; Is this sort of atrocity used elsewhere in the kernel? If you can't refactor the code you can pack args into a structure but I've not seen people cast mutually exclusive arguments to a union type. Is this "inspired" by some higher level language? > static int tls_push_data(struct sock *sk, > - struct iov_iter *msg_iter, > + union tls_iter_offset iter_offset, > size_t size, int flags, > - unsigned char record_type) > + unsigned char record_type, > + struct page *zc_page) > { > struct tls_context *tls_ctx = tls_get_ctx(sk); > struct tls_prot_info *prot = &tls_ctx->prot_info; > @@ -480,15 +486,29 @@ static int tls_push_data(struct sock *sk, > } > > record = ctx->open_record; > - copy = min_t(size_t, size, (pfrag->size - pfrag->offset)); > - copy = min_t(size_t, copy, (max_open_record_len - record->len)); > - > - if (copy) { > - rc = tls_device_copy_data(page_address(pfrag->page) + > - pfrag->offset, copy, msg_iter); > - if (rc) > - goto handle_error; > - tls_append_frag(record, pfrag, copy); > + > + if (!zc_page) { > + copy = min_t(size_t, size, pfrag->size - pfrag->offset); > + copy = min_t(size_t, copy, max_open_record_len - record->len); Nope, refactor this please. 95% sure you don't need 4 indentation levels here. Space left in record can be calculated before the if, then you can do if (zc_page) { .. } else if (copy) { .. } > + if (copy) { > + rc = tls_device_copy_data(page_address(pfrag->page) + > + pfrag->offset, copy, > + iter_offset.msg_iter); > + if (rc) > + goto handle_error; > + tls_append_frag(record, pfrag, copy); > + } > + } else { > + copy = min_t(size_t, size, max_open_record_len - record->len); > + if (copy) { > + struct page_frag _pfrag; And name your variables right :/ > + _pfrag.page = zc_page; > + _pfrag.offset = iter_offset.offset; > + _pfrag.size = copy; > + tls_append_frag(record, &_pfrag, copy); > + } > } > size -= copy; > @@ -551,8 +571,8 @@ int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) > goto out; > } > > - rc = tls_push_data(sk, &msg->msg_iter, size, > - msg->msg_flags, record_type); > + rc = tls_push_data(sk, (union tls_iter_offset)&msg->msg_iter, size, > + msg->msg_flags, record_type, NULL); > > out: > release_sock(sk); > @@ -564,11 +584,14 @@ int tls_device_sendpage(struct sock *sk, struct page *page, > int offset, size_t size, int flags) > { > struct tls_context *tls_ctx = tls_get_ctx(sk); > + struct tls_offload_context_tx *ctx; > struct iov_iter msg_iter; > char *kaddr; > struct kvec iov; > int rc; > > + ctx = tls_offload_ctx_tx(tls_ctx); > + > if (flags & MSG_SENDPAGE_NOTLAST) > flags |= MSG_MORE; > > @@ -580,12 +603,18 @@ int tls_device_sendpage(struct sock *sk, struct page *page, > goto out; > } > > + if (ctx->zerocopy_sendfile) { > + rc = tls_push_data(sk, (union tls_iter_offset)offset, size, > + flags, TLS_RECORD_TYPE_DATA, page); > + goto out; > + } > + > kaddr = kmap(page); > iov.iov_base = kaddr + offset; > iov.iov_len = size; > iov_iter_kvec(&msg_iter, WRITE, &iov, 1, size); > - rc = tls_push_data(sk, &msg_iter, size, > - flags, TLS_RECORD_TYPE_DATA); > + rc = tls_push_data(sk, (union tls_iter_offset)&msg_iter, size, > + flags, TLS_RECORD_TYPE_DATA, NULL); > kunmap(page); > > out: > @@ -659,7 +688,8 @@ static int tls_device_push_pending_record(struct sock *sk, int flags) > struct iov_iter msg_iter; > > iov_iter_kvec(&msg_iter, WRITE, NULL, 0, 0); > - return tls_push_data(sk, &msg_iter, 0, flags, TLS_RECORD_TYPE_DATA); > + return tls_push_data(sk, (union tls_iter_offset)&msg_iter, 0, flags, > + TLS_RECORD_TYPE_DATA, NULL); > } > > void tls_device_write_space(struct sock *sk, struct tls_context *ctx) > diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c > index 7b2b0e7ffee4..8ef86e04f571 100644 > --- a/net/tls/tls_main.c > +++ b/net/tls/tls_main.c > @@ -513,6 +513,31 @@ static int do_tls_getsockopt_conf(struct sock *sk, char __user *optval, > return rc; > } > > +static int do_tls_getsockopt_tx_zc(struct sock *sk, char __user *optval, > + int __user *optlen) > +{ > + struct tls_context *tls_ctx = tls_get_ctx(sk); > + struct tls_offload_context_tx *ctx; > + int len, value; > + > + if (get_user(len, optlen)) > + return -EFAULT; > + > + if (len != sizeof(value)) the size check should match the one in setsockopt() > + return -EINVAL; > + > + if (!tls_ctx || tls_ctx->tx_conf != TLS_HW) > + return -EBUSY; see setsockopt() > + ctx = tls_offload_ctx_tx(tls_ctx); > + > + value = ctx->zerocopy_sendfile; > + if (copy_to_user(optval, &value, sizeof(value))) > + return -EFAULT; > + > + return 0; > +} > +static int do_tls_setsockopt_tx_zc(struct sock *sk, sockptr_t optval, > + unsigned int optlen) > +{ > + struct tls_context *tls_ctx = tls_get_ctx(sk); Highly annoying but each file has its own scheme of naming variables, tls_main uses ctx for tls_context. > + struct tls_offload_context_tx *ctx; > + int val; unsigned > + if (!tls_ctx || tls_ctx->tx_conf != TLS_HW) How's tls_ctx ever gonna be NULL here? We should allow setting the option for non-HW. It's an opt-in, the app has opted in, the fact that the kernel will not make use of the liberty to apply the optimization is not important, no? > + return -EINVAL; > + > + if (sockptr_is_null(optval) || optlen < sizeof(val)) > + return -EINVAL; > + > + if (copy_from_sockptr(&val, optval, sizeof(val))) > + return -EFAULT; > + > + ctx = tls_offload_ctx_tx(tls_ctx); > + ctx->zerocopy_sendfile = val; if (val > 1) EINVAL. > + return 0; > +}
On 2022-04-29 01:11, Jakub Kicinski wrote: >> diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c >> index b12f81a2b44c..715401b20c8b 100644 >> --- a/net/tls/tls_device.c >> +++ b/net/tls/tls_device.c >> @@ -411,10 +411,16 @@ static int tls_device_copy_data(void *addr, size_t bytes, struct iov_iter *i) >> return 0; >> } >> >> +union tls_iter_offset { >> + struct iov_iter *msg_iter; >> + int offset; >> +}; > > Is this sort of atrocity used elsewhere in the kernel? > If you can't refactor the code you can pack args into > a structure What's the point of packing arguments into a struct in this particular case? Depending on zc_page, I need either msg_iter or offset, and I'm reusing the same CPU register to pass either of them. The performance isn't affected, and the amount of memory used is the same. A struct won't allow to achieve this, it would force me to drag 8 extra bytes, but we already use all 6 registers used to pass parameters on x86_64. > but I've not seen people cast mutually exclusive > arguments to a union type. It's the purpose of a union, to hold one of mutually exclusive values, isn't it? > Is this "inspired" by some higher > level language? It's unfortunately inspired by C and its freedom to allow microoptimizations/hacks. The hack here is that I use a pointer being NULL or not-NULL as an indicator what type the other argument has. The closest alternative from high-level languages I can think of is enums with attached data from rust or swift. However, rust isn't smart enough to perform the optimization I described, so no, it's not inspired by it :) Options that I see here: 1. Union. 2. Just pass both parameters and use one of them. Drawbacks: now we have 7 parameters, one will be passed through the stack, and it's datapath code. 3. Pass `struct iov_iter *` and cast it to `int offset` when zc_page isn't NULL. As we are compiling with -fno-strict-aliasing, and int shouldn't be bigger than a pointer on all supported architectures, it's going to work, and we still have 6 parameters. 4. Combine `int offset` and `int flags` into a single 64-bit parameter. Which one do you prefer, or do you have anything better in mind? >> static int tls_push_data(struct sock *sk, >> - struct iov_iter *msg_iter, >> + union tls_iter_offset iter_offset, >> size_t size, int flags, >> - unsigned char record_type) >> + unsigned char record_type, >> + struct page *zc_page) >> { >> struct tls_context *tls_ctx = tls_get_ctx(sk); >> struct tls_prot_info *prot = &tls_ctx->prot_info; >> @@ -480,15 +486,29 @@ static int tls_push_data(struct sock *sk, >> } >> >> record = ctx->open_record; >> - copy = min_t(size_t, size, (pfrag->size - pfrag->offset)); >> - copy = min_t(size_t, copy, (max_open_record_len - record->len)); >> - >> - if (copy) { >> - rc = tls_device_copy_data(page_address(pfrag->page) + >> - pfrag->offset, copy, msg_iter); >> - if (rc) >> - goto handle_error; >> - tls_append_frag(record, pfrag, copy); >> + >> + if (!zc_page) { >> + copy = min_t(size_t, size, pfrag->size - pfrag->offset); >> + copy = min_t(size_t, copy, max_open_record_len - record->len); > > Nope, refactor this please. 95% sure you don't need 4 indentation > levels here. Space left in record can be calculated before the if, > then you can do > > if (zc_page) { > .. > } else if (copy) { > .. > } It'll save one indentation level for the zc_page case, but not for the other: copy = min_t(size_t, size, max_open_record_len - record->len); if (zc_page && copy) { ... } else { // I still have to do this in non-zc case: copy = min_t(size_t, copy, pfrag->size - pfrag->offset); if (copy) { // Same indentation level as in my patch. ... } } Is it good enough? >> + if (copy) { >> + rc = tls_device_copy_data(page_address(pfrag->page) + >> + pfrag->offset, copy, >> + iter_offset.msg_iter); >> + if (rc) >> + goto handle_error; >> + tls_append_frag(record, pfrag, copy); >> + } >> + } else { >> + copy = min_t(size_t, size, max_open_record_len - record->len); >> + if (copy) { >> + struct page_frag _pfrag; > > And name your variables right :/ > >> + _pfrag.page = zc_page; >> + _pfrag.offset = iter_offset.offset; >> + _pfrag.size = copy; >> + tls_append_frag(record, &_pfrag, copy); >> + } >> } > >> size -= copy; >> @@ -551,8 +571,8 @@ int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) >> goto out; >> } >> >> - rc = tls_push_data(sk, &msg->msg_iter, size, >> - msg->msg_flags, record_type); >> + rc = tls_push_data(sk, (union tls_iter_offset)&msg->msg_iter, size, >> + msg->msg_flags, record_type, NULL); >> >> out: >> release_sock(sk); >> @@ -564,11 +584,14 @@ int tls_device_sendpage(struct sock *sk, struct page *page, >> int offset, size_t size, int flags) >> { >> struct tls_context *tls_ctx = tls_get_ctx(sk); >> + struct tls_offload_context_tx *ctx; >> struct iov_iter msg_iter; >> char *kaddr; >> struct kvec iov; >> int rc; >> >> + ctx = tls_offload_ctx_tx(tls_ctx); >> + >> if (flags & MSG_SENDPAGE_NOTLAST) >> flags |= MSG_MORE; >> >> @@ -580,12 +603,18 @@ int tls_device_sendpage(struct sock *sk, struct page *page, >> goto out; >> } >> >> + if (ctx->zerocopy_sendfile) { >> + rc = tls_push_data(sk, (union tls_iter_offset)offset, size, >> + flags, TLS_RECORD_TYPE_DATA, page); >> + goto out; >> + } >> + >> kaddr = kmap(page); >> iov.iov_base = kaddr + offset; >> iov.iov_len = size; >> iov_iter_kvec(&msg_iter, WRITE, &iov, 1, size); >> - rc = tls_push_data(sk, &msg_iter, size, >> - flags, TLS_RECORD_TYPE_DATA); >> + rc = tls_push_data(sk, (union tls_iter_offset)&msg_iter, size, >> + flags, TLS_RECORD_TYPE_DATA, NULL); >> kunmap(page); >> >> out: >> @@ -659,7 +688,8 @@ static int tls_device_push_pending_record(struct sock *sk, int flags) >> struct iov_iter msg_iter; >> >> iov_iter_kvec(&msg_iter, WRITE, NULL, 0, 0); >> - return tls_push_data(sk, &msg_iter, 0, flags, TLS_RECORD_TYPE_DATA); >> + return tls_push_data(sk, (union tls_iter_offset)&msg_iter, 0, flags, >> + TLS_RECORD_TYPE_DATA, NULL); >> } >> >> void tls_device_write_space(struct sock *sk, struct tls_context *ctx) >> diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c >> index 7b2b0e7ffee4..8ef86e04f571 100644 >> --- a/net/tls/tls_main.c >> +++ b/net/tls/tls_main.c >> @@ -513,6 +513,31 @@ static int do_tls_getsockopt_conf(struct sock *sk, char __user *optval, >> return rc; >> } >> >> +static int do_tls_getsockopt_tx_zc(struct sock *sk, char __user *optval, >> + int __user *optlen) >> +{ >> + struct tls_context *tls_ctx = tls_get_ctx(sk); >> + struct tls_offload_context_tx *ctx; >> + int len, value; >> + >> + if (get_user(len, optlen)) >> + return -EFAULT; >> + >> + if (len != sizeof(value)) > > the size check should match the one in setsockopt() > >> + return -EINVAL; >> + >> + if (!tls_ctx || tls_ctx->tx_conf != TLS_HW) >> + return -EBUSY; > > see setsockopt() > >> + ctx = tls_offload_ctx_tx(tls_ctx); >> + >> + value = ctx->zerocopy_sendfile; >> + if (copy_to_user(optval, &value, sizeof(value))) >> + return -EFAULT; >> + >> + return 0; >> +} > >> +static int do_tls_setsockopt_tx_zc(struct sock *sk, sockptr_t optval, >> + unsigned int optlen) >> +{ >> + struct tls_context *tls_ctx = tls_get_ctx(sk); > > Highly annoying but each file has its own scheme of naming variables, > tls_main uses ctx for tls_context. > >> + struct tls_offload_context_tx *ctx; >> + int val; > > unsigned > >> + if (!tls_ctx || tls_ctx->tx_conf != TLS_HW) > > How's tls_ctx ever gonna be NULL here? Shouldn't be possible, I'll drop the check. > We should allow setting the option for non-HW. It's an opt-in, the app > has opted in, the fact that the kernel will not make use of the liberty > to apply the optimization is not important, no? Yes, I agree that if the application opted in, it should work properly regardless of whether the optimization actually did turn on. However, the indication could be useful, for example, for diagnostic purposes, to show the user whether zerocopy mode was enabled, if someone is trying to debug some performance issue. If you insist, though, I can make setsockopt succeed and getsockopt return 1. What do you think? >> + return -EINVAL; >> + >> + if (sockptr_is_null(optval) || optlen < sizeof(val)) >> + return -EINVAL; >> + >> + if (copy_from_sockptr(&val, optval, sizeof(val))) >> + return -EFAULT; >> + >> + ctx = tls_offload_ctx_tx(tls_ctx); >> + ctx->zerocopy_sendfile = val; > > if (val > 1) > EINVAL. > >> + return 0; >> +} Thanks for the comments!
On Fri, 29 Apr 2022 17:21:59 +0300 Maxim Mikityanskiy wrote: > On 2022-04-29 01:11, Jakub Kicinski wrote: > >> diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c > >> index b12f81a2b44c..715401b20c8b 100644 > >> --- a/net/tls/tls_device.c > >> +++ b/net/tls/tls_device.c > >> @@ -411,10 +411,16 @@ static int tls_device_copy_data(void *addr, size_t bytes, struct iov_iter *i) > >> return 0; > >> } > >> > >> +union tls_iter_offset { > >> + struct iov_iter *msg_iter; > >> + int offset; > >> +}; > > > > Is this sort of atrocity used elsewhere in the kernel? > > If you can't refactor the code you can pack args into > > a structure > > What's the point of packing arguments into a struct in this particular > case? Depending on zc_page, I need either msg_iter or offset, and I'm > reusing the same CPU register to pass either of them. The performance > isn't affected, and the amount of memory used is the same. A struct > won't allow to achieve this, it would force me to drag 8 extra bytes, > but we already use all 6 registers used to pass parameters on x86_64. I know why you're doing this, but you're not writing assembly: + rc = tls_push_data(sk, (union tls_iter_offset)&msg_iter, size, + return tls_push_data(sk, (union tls_iter_offset)&msg_iter, 0, flags, even if it's legal C (i.e. not UB) it looks awkward. > > but I've not seen people cast mutually exclusive > > arguments to a union type. > > It's the purpose of a union, to hold one of mutually exclusive values, > isn't it? The union itself is not the problem. > > Is this "inspired" by some higher > > level language? > > It's unfortunately inspired by C and its freedom to allow > microoptimizations/hacks. The hack here is that I use a pointer being > NULL or not-NULL as an indicator what type the other argument has. > > The closest alternative from high-level languages I can think of is > enums with attached data from rust or swift. However, rust isn't smart > enough to perform the optimization I described, so no, it's not inspired > by it :) > > Options that I see here: > > 1. Union. > > 2. Just pass both parameters and use one of them. Drawbacks: now we have > 7 parameters, one will be passed through the stack, and it's datapath code. > > 3. Pass `struct iov_iter *` and cast it to `int offset` when zc_page > isn't NULL. As we are compiling with -fno-strict-aliasing, and int > shouldn't be bigger than a pointer on all supported architectures, it's > going to work, and we still have 6 parameters. > > 4. Combine `int offset` and `int flags` into a single 64-bit parameter. > > Which one do you prefer, or do you have anything better in mind? If you declare the union on the stack in the callers, and pass by value - is the compiler not going to be clever enough to still DDRT? > >> static int tls_push_data(struct sock *sk, > >> - struct iov_iter *msg_iter, > >> + union tls_iter_offset iter_offset, > >> size_t size, int flags, > >> - unsigned char record_type) > >> + unsigned char record_type, > >> + struct page *zc_page) > >> { > >> struct tls_context *tls_ctx = tls_get_ctx(sk); > >> struct tls_prot_info *prot = &tls_ctx->prot_info; > >> @@ -480,15 +486,29 @@ static int tls_push_data(struct sock *sk, > >> } > >> > >> record = ctx->open_record; > >> - copy = min_t(size_t, size, (pfrag->size - pfrag->offset)); > >> - copy = min_t(size_t, copy, (max_open_record_len - record->len)); > >> - > >> - if (copy) { > >> - rc = tls_device_copy_data(page_address(pfrag->page) + > >> - pfrag->offset, copy, msg_iter); > >> - if (rc) > >> - goto handle_error; > >> - tls_append_frag(record, pfrag, copy); > >> + > >> + if (!zc_page) { > >> + copy = min_t(size_t, size, pfrag->size - pfrag->offset); > >> + copy = min_t(size_t, copy, max_open_record_len - record->len); > > > > Nope, refactor this please. 95% sure you don't need 4 indentation > > levels here. Space left in record can be calculated before the if, > > then you can do > > > > if (zc_page) { > > .. > > } else if (copy) { > > .. > > } > > It'll save one indentation level for the zc_page case, but not for the > other: > > copy = min_t(size_t, size, max_open_record_len - record->len); > if (zc_page && copy) { > ... > } else { > // I still have to do this in non-zc case: > copy = min_t(size_t, copy, pfrag->size - pfrag->offset); Can pfrag->size - pfrag->offset really be 0 provided tls_do_allocation() did not return an error? > if (copy) { > // Same indentation level as in my patch. > ... > } > } > > Is it good enough? > > We should allow setting the option for non-HW. It's an opt-in, the app > > has opted in, the fact that the kernel will not make use of the liberty > > to apply the optimization is not important, no? > > Yes, I agree that if the application opted in, it should work properly > regardless of whether the optimization actually did turn on. However, > the indication could be useful, for example, for diagnostic purposes, to > show the user whether zerocopy mode was enabled, if someone is trying to > debug some performance issue. If you insist, though, I can make > setsockopt succeed and getsockopt return 1. What do you think? I'd say "whether the optimization is applicable" rather than "whether the optimization is turned on". User can check whether the connection is using SW or HW TLS if they want to make sure it's taken advantage of. Speaking of which, should we report the state of this knob via socket diag?
On 2022-04-29 22:11, Jakub Kicinski wrote: > On Fri, 29 Apr 2022 17:21:59 +0300 Maxim Mikityanskiy wrote: >> On 2022-04-29 01:11, Jakub Kicinski wrote: >>>> diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c >>>> index b12f81a2b44c..715401b20c8b 100644 >>>> --- a/net/tls/tls_device.c >>>> +++ b/net/tls/tls_device.c >>>> @@ -411,10 +411,16 @@ static int tls_device_copy_data(void *addr, size_t bytes, struct iov_iter *i) >>>> return 0; >>>> } >>>> >>>> +union tls_iter_offset { >>>> + struct iov_iter *msg_iter; >>>> + int offset; >>>> +}; >>> >>> Is this sort of atrocity used elsewhere in the kernel? >>> If you can't refactor the code you can pack args into >>> a structure >> >> What's the point of packing arguments into a struct in this particular >> case? Depending on zc_page, I need either msg_iter or offset, and I'm >> reusing the same CPU register to pass either of them. The performance >> isn't affected, and the amount of memory used is the same. A struct >> won't allow to achieve this, it would force me to drag 8 extra bytes, >> but we already use all 6 registers used to pass parameters on x86_64. > > I know why you're doing this, but you're not writing assembly: > > + rc = tls_push_data(sk, (union tls_iter_offset)&msg_iter, size, > > + return tls_push_data(sk, (union tls_iter_offset)&msg_iter, 0, flags, > > even if it's legal C (i.e. not UB) it looks awkward. It's not UB, cast between a union and its element type is well-defined in C. >>> but I've not seen people cast mutually exclusive >>> arguments to a union type. >> >> It's the purpose of a union, to hold one of mutually exclusive values, >> isn't it? > > The union itself is not the problem. > >>> Is this "inspired" by some higher >>> level language? >> >> It's unfortunately inspired by C and its freedom to allow >> microoptimizations/hacks. The hack here is that I use a pointer being >> NULL or not-NULL as an indicator what type the other argument has. >> >> The closest alternative from high-level languages I can think of is >> enums with attached data from rust or swift. However, rust isn't smart >> enough to perform the optimization I described, so no, it's not inspired >> by it :) >> >> Options that I see here: >> >> 1. Union. >> >> 2. Just pass both parameters and use one of them. Drawbacks: now we have >> 7 parameters, one will be passed through the stack, and it's datapath code. >> >> 3. Pass `struct iov_iter *` and cast it to `int offset` when zc_page >> isn't NULL. As we are compiling with -fno-strict-aliasing, and int >> shouldn't be bigger than a pointer on all supported architectures, it's >> going to work, and we still have 6 parameters. >> >> 4. Combine `int offset` and `int flags` into a single 64-bit parameter. >> >> Which one do you prefer, or do you have anything better in mind? > > If you declare the union on the stack in the callers, and pass by value > - is the compiler not going to be clever enough to still DDRT? Ah, OK, it should do the thing. I thought you wanted me to ditch the union altogether. >>>> static int tls_push_data(struct sock *sk, >>>> - struct iov_iter *msg_iter, >>>> + union tls_iter_offset iter_offset, >>>> size_t size, int flags, >>>> - unsigned char record_type) >>>> + unsigned char record_type, >>>> + struct page *zc_page) >>>> { >>>> struct tls_context *tls_ctx = tls_get_ctx(sk); >>>> struct tls_prot_info *prot = &tls_ctx->prot_info; >>>> @@ -480,15 +486,29 @@ static int tls_push_data(struct sock *sk, >>>> } >>>> >>>> record = ctx->open_record; >>>> - copy = min_t(size_t, size, (pfrag->size - pfrag->offset)); >>>> - copy = min_t(size_t, copy, (max_open_record_len - record->len)); >>>> - >>>> - if (copy) { >>>> - rc = tls_device_copy_data(page_address(pfrag->page) + >>>> - pfrag->offset, copy, msg_iter); >>>> - if (rc) >>>> - goto handle_error; >>>> - tls_append_frag(record, pfrag, copy); >>>> + >>>> + if (!zc_page) { >>>> + copy = min_t(size_t, size, pfrag->size - pfrag->offset); >>>> + copy = min_t(size_t, copy, max_open_record_len - record->len); >>> >>> Nope, refactor this please. 95% sure you don't need 4 indentation >>> levels here. Space left in record can be calculated before the if, >>> then you can do >>> >>> if (zc_page) { >>> .. >>> } else if (copy) { >>> .. >>> } >> >> It'll save one indentation level for the zc_page case, but not for the >> other: >> >> copy = min_t(size_t, size, max_open_record_len - record->len); >> if (zc_page && copy) { >> ... >> } else { >> // I still have to do this in non-zc case: >> copy = min_t(size_t, copy, pfrag->size - pfrag->offset); > > Can pfrag->size - pfrag->offset really be 0 provided > tls_do_allocation() did not return an error? No, it can't. Now it makes more sense to me. Thanks for the clarification. >> if (copy) { >> // Same indentation level as in my patch. >> ... >> } >> } >> >> Is it good enough? > >>> We should allow setting the option for non-HW. It's an opt-in, the app >>> has opted in, the fact that the kernel will not make use of the liberty >>> to apply the optimization is not important, no? >> >> Yes, I agree that if the application opted in, it should work properly >> regardless of whether the optimization actually did turn on. However, >> the indication could be useful, for example, for diagnostic purposes, to >> show the user whether zerocopy mode was enabled, if someone is trying to >> debug some performance issue. If you insist, though, I can make >> setsockopt succeed and getsockopt return 1. What do you think? > > I'd say "whether the optimization is applicable" rather than "whether > the optimization is turned on". User can check whether the connection > is using SW or HW TLS if they want to make sure it's taken advantage of. > > Speaking of which, should we report the state of this knob via socket > diag? That sounds like an option, I'll take a look. TLS doesn't expose anything via diag yet, does it? The only option to distinguish SW/HW TLS is ethtool, and there is no per-socket check, right? Cause a HW TLS socket can downgrade to SW after tls_device_down, and ethtool won't show it.
On Tue, 3 May 2022 21:56:48 +0300 Maxim Mikityanskiy wrote: > >> Yes, I agree that if the application opted in, it should work properly > >> regardless of whether the optimization actually did turn on. However, > >> the indication could be useful, for example, for diagnostic purposes, to > >> show the user whether zerocopy mode was enabled, if someone is trying to > >> debug some performance issue. If you insist, though, I can make > >> setsockopt succeed and getsockopt return 1. What do you think? > > > > I'd say "whether the optimization is applicable" rather than "whether > > the optimization is turned on". User can check whether the connection > > is using SW or HW TLS if they want to make sure it's taken advantage of. > > > > Speaking of which, should we report the state of this knob via socket > > diag? > > That sounds like an option, I'll take a look. TLS doesn't expose > anything via diag yet, does it? The only option to distinguish SW/HW TLS > is ethtool, and there is no per-socket check, right? Cause a HW TLS > socket can downgrade to SW after tls_device_down, and ethtool won't show it. It does - look for tls_get_info()
> > If you declare the union on the stack in the callers, and pass by value > > - is the compiler not going to be clever enough to still DDRT? > > Ah, OK, it should do the thing. I thought you wanted me to ditch the > union altogether. Some architectures always pass struct/union by address. Which is probably not what you had in mind. David - Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK Registration No: 1397386 (Wales)
On 2022-05-04 12:49, David Laight wrote: >>> If you declare the union on the stack in the callers, and pass by value >>> - is the compiler not going to be clever enough to still DDRT? >> >> Ah, OK, it should do the thing. I thought you wanted me to ditch the >> union altogether. > > Some architectures always pass struct/union by address. > Which is probably not what you had in mind. Do you have any specific architecture in mind? I couldn't find any information that it happens anywhere, x86_64 ABI [1] (pages 20-21) aligns with my expectations, and my common sense can't explain why would some architectures do what you say. In C, when the caller passes a struct as a parameter, the callee can freely modify it. If the compiler silently replaced it with a pointer, the callee would corrupt the caller's local variable, so such approach requires the caller to make an extra copy. Making an extra copy on the stack and passing a pointer doesn't make any sense to me if you can just make a copy on the stack (or to a register) and call it a parameter. If you know any specific architecture supported by Linux that passes all unions by a pointer, could you please point me to it? Maybe I'm missing something in my logic, and a real-world example will explain things, but at the moment it sounds unrealistic to me. Thanks, Max [1]: https://www.intel.com/content/dam/develop/external/us/en/documents/mpx-linux64-abi.pdf > David > > - > Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK > Registration No: 1397386 (Wales)
From: Maxim Mikityanskiy > Sent: 05 May 2022 13:40 > > On 2022-05-04 12:49, David Laight wrote: > >>> If you declare the union on the stack in the callers, and pass by value > >>> - is the compiler not going to be clever enough to still DDRT? > >> > >> Ah, OK, it should do the thing. I thought you wanted me to ditch the > >> union altogether. > > > > Some architectures always pass struct/union by address. > > Which is probably not what you had in mind. > > Do you have any specific architecture in mind? I couldn't find any > information that it happens anywhere, x86_64 ABI [1] (pages 20-21) > aligns with my expectations, and my common sense can't explain why would > some architectures do what you say. > > In C, when the caller passes a struct as a parameter, the callee can > freely modify it. If the compiler silently replaced it with a pointer, > the callee would corrupt the caller's local variable, so such approach > requires the caller to make an extra copy. Yes, that is what happens. > Making an extra copy on the > stack and passing a pointer doesn't make any sense to me if you can just > make a copy on the stack (or to a register) and call it a parameter. > > If you know any specific architecture supported by Linux that passes all > unions by a pointer, could you please point me to it? Maybe I'm missing > something in my logic, and a real-world example will explain things, but > at the moment it sounds unrealistic to me. Look at any old architecture, m68k almost certainly passes all structures by address. i386 would - but I think the 'regparm' option includes passing small structures by value. I think sparc32 used to, but that might have changed in the last 30 years. David - Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK Registration No: 1397386 (Wales)
On 2022-05-05 16:48, David Laight wrote: > From: Maxim Mikityanskiy >> Sent: 05 May 2022 13:40 >> >> On 2022-05-04 12:49, David Laight wrote: >>>>> If you declare the union on the stack in the callers, and pass by value >>>>> - is the compiler not going to be clever enough to still DDRT? >>>> >>>> Ah, OK, it should do the thing. I thought you wanted me to ditch the >>>> union altogether. >>> >>> Some architectures always pass struct/union by address. >>> Which is probably not what you had in mind. >> >> Do you have any specific architecture in mind? I couldn't find any >> information that it happens anywhere, x86_64 ABI [1] (pages 20-21) >> aligns with my expectations, and my common sense can't explain why would >> some architectures do what you say. >> >> In C, when the caller passes a struct as a parameter, the callee can >> freely modify it. If the compiler silently replaced it with a pointer, >> the callee would corrupt the caller's local variable, so such approach >> requires the caller to make an extra copy. > > Yes, that is what happens. I did a quick experiment with gcc 9 on m68k and i386, and it doesn't confirm what you claim. #include <stdint.h> #include <stdio.h> union test { uint32_t x; uint32_t *y; }; void func1(void *ptr, union test t) { if (ptr) { printf("%p %u\n", ptr, t.x); } else { printf("%u\n", *t.y); } } void func2(void *ptr, uint32_t *y) { if (ptr) { printf("%p %u\n", ptr, (uint32_t)y); } else { printf("%u\n", *y); } } gcc -S test.c -fno-strict-aliasing -o - I believe this minimal example reflects well enough what happens in my code. The assembly generated for func1 and func2 are identical. In both cases the second parameter is passed on the stack by value, not by pointer. >> Making an extra copy on the >> stack and passing a pointer doesn't make any sense to me if you can just >> make a copy on the stack (or to a register) and call it a parameter. >> >> If you know any specific architecture supported by Linux that passes all >> unions by a pointer, could you please point me to it? Maybe I'm missing >> something in my logic, and a real-world example will explain things, but >> at the moment it sounds unrealistic to me. > > Look at any old architecture, m68k almost certainly passes all structures > by address. > i386 would - but I think the 'regparm' option includes passing small > structures by value. i386 passes all parameters on the stack by default, and regparm makes some parameters be passed in registers instead of the stack, but it doesn't have any other weird effects. The value passed is the same in both cases, the only difference is registers vs stack. > I think sparc32 used to, but that might have changed in the last 30 years. > > David > > - > Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK > Registration No: 1397386 (Wales)
From: Maxim Mikityanskiy > Sent: 05 May 2022 19:28 > > On 2022-05-05 16:48, David Laight wrote: > > From: Maxim Mikityanskiy > >> Sent: 05 May 2022 13:40 > >> > >> On 2022-05-04 12:49, David Laight wrote: > >>>>> If you declare the union on the stack in the callers, and pass by value > >>>>> - is the compiler not going to be clever enough to still DDRT? > >>>> > >>>> Ah, OK, it should do the thing. I thought you wanted me to ditch the > >>>> union altogether. > >>> > >>> Some architectures always pass struct/union by address. > >>> Which is probably not what you had in mind. > >> > >> Do you have any specific architecture in mind? I couldn't find any > >> information that it happens anywhere, x86_64 ABI [1] (pages 20-21) > >> aligns with my expectations, and my common sense can't explain why would > >> some architectures do what you say. > >> > >> In C, when the caller passes a struct as a parameter, the callee can > >> freely modify it. If the compiler silently replaced it with a pointer, > >> the callee would corrupt the caller's local variable, so such approach > >> requires the caller to make an extra copy. > > > > Yes, that is what happens. > > I did a quick experiment with gcc 9 on m68k and i386, and it doesn't > confirm what you claim. > > #include <stdint.h> > #include <stdio.h> > > union test { > uint32_t x; > uint32_t *y; > }; > > void func1(void *ptr, union test t) > { > if (ptr) { > printf("%p %u\n", ptr, t.x); > } else { > printf("%u\n", *t.y); > } > } > > void func2(void *ptr, uint32_t *y) > { > if (ptr) { > printf("%p %u\n", ptr, (uint32_t)y); > } else { > printf("%u\n", *y); > } > } > > gcc -S test.c -fno-strict-aliasing -o - > > I believe this minimal example reflects well enough what happens in my > code. The assembly generated for func1 and func2 are identical. In both > cases the second parameter is passed on the stack by value, not by pointer. Hmmm, perhaps it is/was only sparc32 that passed all structures by reference. godbolt doesn't seem to have a sparc compiler and I don't have a working sparc system any more. It is also possible that the calling conventions are slightly different than the ones I remember using years ago. Certainly on i386 even 4 byte structures are returned by reference. David - Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK Registration No: 1397386 (Wales)
On 2022-05-06 11:09, David Laight wrote: > From: Maxim Mikityanskiy >> Sent: 05 May 2022 19:28 >> >> On 2022-05-05 16:48, David Laight wrote: >>> From: Maxim Mikityanskiy >>>> Sent: 05 May 2022 13:40 >>>> >>>> On 2022-05-04 12:49, David Laight wrote: >>>>>>> If you declare the union on the stack in the callers, and pass by value >>>>>>> - is the compiler not going to be clever enough to still DDRT? >>>>>> >>>>>> Ah, OK, it should do the thing. I thought you wanted me to ditch the >>>>>> union altogether. >>>>> >>>>> Some architectures always pass struct/union by address. >>>>> Which is probably not what you had in mind. >>>> >>>> Do you have any specific architecture in mind? I couldn't find any >>>> information that it happens anywhere, x86_64 ABI [1] (pages 20-21) >>>> aligns with my expectations, and my common sense can't explain why would >>>> some architectures do what you say. >>>> >>>> In C, when the caller passes a struct as a parameter, the callee can >>>> freely modify it. If the compiler silently replaced it with a pointer, >>>> the callee would corrupt the caller's local variable, so such approach >>>> requires the caller to make an extra copy. >>> >>> Yes, that is what happens. >> >> I did a quick experiment with gcc 9 on m68k and i386, and it doesn't >> confirm what you claim. >> >> #include <stdint.h> >> #include <stdio.h> >> >> union test { >> uint32_t x; >> uint32_t *y; >> }; >> >> void func1(void *ptr, union test t) >> { >> if (ptr) { >> printf("%p %u\n", ptr, t.x); >> } else { >> printf("%u\n", *t.y); >> } >> } >> >> void func2(void *ptr, uint32_t *y) >> { >> if (ptr) { >> printf("%p %u\n", ptr, (uint32_t)y); >> } else { >> printf("%u\n", *y); >> } >> } >> >> gcc -S test.c -fno-strict-aliasing -o - >> >> I believe this minimal example reflects well enough what happens in my >> code. The assembly generated for func1 and func2 are identical. In both >> cases the second parameter is passed on the stack by value, not by pointer. > > Hmmm, perhaps it is/was only sparc32 that passed all structures by reference. Looks like sparc32 really does this crazy thing (and if the callee wants to modify the union, it has to make a copy). Well, it's always great to learn something new - thanks for the information! I'll consider it, but I'll likely keep the union anyway, since other options are uglier or less performant on common 64-bit architectures, and sparc32 is a legacy architecture that is unlikely to be used with the new performance feature of TLS zerocopy sendfile. > godbolt doesn't seem to have a sparc compiler and I don't have a > working sparc system any more. I just installed a cross-compiler from the repository and inspected the assembly. > It is also possible that the calling conventions are slightly > different than the ones I remember using years ago. > > Certainly on i386 even 4 byte structures are returned by reference. Right, return values are a different thing, there is this pseudo parameter that points to the buffer for the struct being returned, but I'm not going to return my union, I only pass it as a parameter, so it doesn't concern me. > David > > - > Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK > Registration No: 1397386 (Wales)
diff --git a/include/net/tls.h b/include/net/tls.h index b59f0a63292b..fc291e2747b5 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -164,6 +164,7 @@ struct tls_record_info { struct tls_offload_context_tx { struct crypto_aead *aead_send; + bool zerocopy_sendfile; spinlock_t lock; /* protects records list */ struct list_head records_list; struct tls_record_info *open_record; diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h index 5f38be0ec0f3..47989b77ebcf 100644 --- a/include/uapi/linux/tls.h +++ b/include/uapi/linux/tls.h @@ -39,6 +39,7 @@ /* TLS socket options */ #define TLS_TX 1 /* Set transmit parameters */ #define TLS_RX 2 /* Set receive parameters */ +#define TLS_TX_ZEROCOPY_SENDFILE 3 /* transmit zerocopy sendfile */ /* Supported versions */ #define TLS_VERSION_MINOR(ver) ((ver) & 0xFF) diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index b12f81a2b44c..715401b20c8b 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -411,10 +411,16 @@ static int tls_device_copy_data(void *addr, size_t bytes, struct iov_iter *i) return 0; } +union tls_iter_offset { + struct iov_iter *msg_iter; + int offset; +}; + static int tls_push_data(struct sock *sk, - struct iov_iter *msg_iter, + union tls_iter_offset iter_offset, size_t size, int flags, - unsigned char record_type) + unsigned char record_type, + struct page *zc_page) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_prot_info *prot = &tls_ctx->prot_info; @@ -480,15 +486,29 @@ static int tls_push_data(struct sock *sk, } record = ctx->open_record; - copy = min_t(size_t, size, (pfrag->size - pfrag->offset)); - copy = min_t(size_t, copy, (max_open_record_len - record->len)); - - if (copy) { - rc = tls_device_copy_data(page_address(pfrag->page) + - pfrag->offset, copy, msg_iter); - if (rc) - goto handle_error; - tls_append_frag(record, pfrag, copy); + + if (!zc_page) { + copy = min_t(size_t, size, pfrag->size - pfrag->offset); + copy = min_t(size_t, copy, max_open_record_len - record->len); + + if (copy) { + rc = tls_device_copy_data(page_address(pfrag->page) + + pfrag->offset, copy, + iter_offset.msg_iter); + if (rc) + goto handle_error; + tls_append_frag(record, pfrag, copy); + } + } else { + copy = min_t(size_t, size, max_open_record_len - record->len); + if (copy) { + struct page_frag _pfrag; + + _pfrag.page = zc_page; + _pfrag.offset = iter_offset.offset; + _pfrag.size = copy; + tls_append_frag(record, &_pfrag, copy); + } } size -= copy; @@ -551,8 +571,8 @@ int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) goto out; } - rc = tls_push_data(sk, &msg->msg_iter, size, - msg->msg_flags, record_type); + rc = tls_push_data(sk, (union tls_iter_offset)&msg->msg_iter, size, + msg->msg_flags, record_type, NULL); out: release_sock(sk); @@ -564,11 +584,14 @@ int tls_device_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags) { struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_offload_context_tx *ctx; struct iov_iter msg_iter; char *kaddr; struct kvec iov; int rc; + ctx = tls_offload_ctx_tx(tls_ctx); + if (flags & MSG_SENDPAGE_NOTLAST) flags |= MSG_MORE; @@ -580,12 +603,18 @@ int tls_device_sendpage(struct sock *sk, struct page *page, goto out; } + if (ctx->zerocopy_sendfile) { + rc = tls_push_data(sk, (union tls_iter_offset)offset, size, + flags, TLS_RECORD_TYPE_DATA, page); + goto out; + } + kaddr = kmap(page); iov.iov_base = kaddr + offset; iov.iov_len = size; iov_iter_kvec(&msg_iter, WRITE, &iov, 1, size); - rc = tls_push_data(sk, &msg_iter, size, - flags, TLS_RECORD_TYPE_DATA); + rc = tls_push_data(sk, (union tls_iter_offset)&msg_iter, size, + flags, TLS_RECORD_TYPE_DATA, NULL); kunmap(page); out: @@ -659,7 +688,8 @@ static int tls_device_push_pending_record(struct sock *sk, int flags) struct iov_iter msg_iter; iov_iter_kvec(&msg_iter, WRITE, NULL, 0, 0); - return tls_push_data(sk, &msg_iter, 0, flags, TLS_RECORD_TYPE_DATA); + return tls_push_data(sk, (union tls_iter_offset)&msg_iter, 0, flags, + TLS_RECORD_TYPE_DATA, NULL); } void tls_device_write_space(struct sock *sk, struct tls_context *ctx) diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 7b2b0e7ffee4..8ef86e04f571 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -513,6 +513,31 @@ static int do_tls_getsockopt_conf(struct sock *sk, char __user *optval, return rc; } +static int do_tls_getsockopt_tx_zc(struct sock *sk, char __user *optval, + int __user *optlen) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_offload_context_tx *ctx; + int len, value; + + if (get_user(len, optlen)) + return -EFAULT; + + if (len != sizeof(value)) + return -EINVAL; + + if (!tls_ctx || tls_ctx->tx_conf != TLS_HW) + return -EBUSY; + + ctx = tls_offload_ctx_tx(tls_ctx); + + value = ctx->zerocopy_sendfile; + if (copy_to_user(optval, &value, sizeof(value))) + return -EFAULT; + + return 0; +} + static int do_tls_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen) { @@ -524,6 +549,9 @@ static int do_tls_getsockopt(struct sock *sk, int optname, rc = do_tls_getsockopt_conf(sk, optval, optlen, optname == TLS_TX); break; + case TLS_TX_ZEROCOPY_SENDFILE: + rc = do_tls_getsockopt_tx_zc(sk, optval, optlen); + break; default: rc = -ENOPROTOOPT; break; @@ -675,6 +703,28 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, return rc; } +static int do_tls_setsockopt_tx_zc(struct sock *sk, sockptr_t optval, + unsigned int optlen) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_offload_context_tx *ctx; + int val; + + if (!tls_ctx || tls_ctx->tx_conf != TLS_HW) + return -EINVAL; + + if (sockptr_is_null(optval) || optlen < sizeof(val)) + return -EINVAL; + + if (copy_from_sockptr(&val, optval, sizeof(val))) + return -EFAULT; + + ctx = tls_offload_ctx_tx(tls_ctx); + ctx->zerocopy_sendfile = val; + + return 0; +} + static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval, unsigned int optlen) { @@ -688,6 +738,11 @@ static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval, optname == TLS_TX); release_sock(sk); break; + case TLS_TX_ZEROCOPY_SENDFILE: + lock_sock(sk); + rc = do_tls_setsockopt_tx_zc(sk, optval, optlen); + release_sock(sk); + break; default: rc = -ENOPROTOOPT; break;