diff mbox series

[bpf-next,v2,1/2] bpf: tcp: prevent bpf_reserve_hdr_opt() from growing skb larger than MTU

Message ID 20240827013736.2845596-2-zijianzhang@bytedance.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series prevent bpf_reserve_hdr_opt() from growing skb larger than MTU | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 23 this patch: 23
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers warning 1 maintainers not CCed: netdev@vger.kernel.org
netdev/build_clang success Errors and warnings before: 28 this patch: 28
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 927 this patch: 927
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 49 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 4 this patch: 4
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-17 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-18 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18

Commit Message

Zijian Zhang Aug. 27, 2024, 1:37 a.m. UTC
From: Amery Hung <amery.hung@bytedance.com>

This series prevents sockops users from accidentally causing packet
drops. This can happen when a BPF_SOCK_OPS_HDR_OPT_LEN_CB program
reserves different option lengths in tcp_sendmsg().

Initially, sockops BPF_SOCK_OPS_HDR_OPT_LEN_CB program will be called to
reserve a space in tcp_send_mss(), which will return the MSS for TSO.
Then, BPF_SOCK_OPS_HDR_OPT_LEN_CB will be called in __tcp_transmit_skb()
again to calculate the actual tcp_option_size and skb_push() the total
header size.

skb->gso_size is restored from TCP_SKB_CB(skb)->tcp_gso_size, which is
derived from tcp_send_mss() where we first call HDR_OPT_LEN. If the
reserved opt size is smaller than the actual header size, the len of the
skb can exceed the MTU. As a result, ip(6)_fragment will drop the
packet if skb->ignore_df is not set.

To prevent this accidental packet drop, we need to make sure the
second call to the BPF_SOCK_OPS_HDR_OPT_LEN_CB program reserves space
not more than the first time. Since this cannot be done during
verification time, we add a runtime sanity check to have
bpf_reserve_hdr_opt return an error instead of causing packet drops later.

Signed-off-by: Amery Hung <amery.hung@bytedance.com>
Co-developed-by: Zijian Zhang <zijianzhang@bytedance.com>
Signed-off-by: Zijian Zhang <zijianzhang@bytedance.com>
---
 include/net/tcp.h     |  8 ++++++++
 net/ipv4/tcp_input.c  |  8 --------
 net/ipv4/tcp_output.c | 13 +++++++++++--
 3 files changed, 19 insertions(+), 10 deletions(-)

Comments

Martin KaFai Lau Aug. 28, 2024, 9:29 p.m. UTC | #1
On 8/26/24 6:37 PM, zijianzhang@bytedance.com wrote:
> From: Amery Hung <amery.hung@bytedance.com>
> 
> This series prevents sockops users from accidentally causing packet
> drops. This can happen when a BPF_SOCK_OPS_HDR_OPT_LEN_CB program
> reserves different option lengths in tcp_sendmsg().
> 
> Initially, sockops BPF_SOCK_OPS_HDR_OPT_LEN_CB program will be called to
> reserve a space in tcp_send_mss(), which will return the MSS for TSO.
> Then, BPF_SOCK_OPS_HDR_OPT_LEN_CB will be called in __tcp_transmit_skb()
> again to calculate the actual tcp_option_size and skb_push() the total
> header size.
> 
> skb->gso_size is restored from TCP_SKB_CB(skb)->tcp_gso_size, which is
> derived from tcp_send_mss() where we first call HDR_OPT_LEN. If the
> reserved opt size is smaller than the actual header size, the len of the
> skb can exceed the MTU. As a result, ip(6)_fragment will drop the
> packet if skb->ignore_df is not set.
> 
> To prevent this accidental packet drop, we need to make sure the
> second call to the BPF_SOCK_OPS_HDR_OPT_LEN_CB program reserves space
> not more than the first time. 

iiuc, it is a bug in the bpf prog itself that did not reserve the same header 
length and caused a drop. It is not the only drop case though for an incorrect 
bpf prog. There are other cases where a bpf prog can accidentally drop a packet.

Do you have an actual use case that the bpf prog cannot reserve the correct 
header length for the same sk ?
Zijian Zhang Aug. 28, 2024, 11:01 p.m. UTC | #2
On 8/28/24 2:29 PM, Martin KaFai Lau wrote:
> On 8/26/24 6:37 PM, zijianzhang@bytedance.com wrote:
>> From: Amery Hung <amery.hung@bytedance.com>
>>
>> This series prevents sockops users from accidentally causing packet
>> drops. This can happen when a BPF_SOCK_OPS_HDR_OPT_LEN_CB program
>> reserves different option lengths in tcp_sendmsg().
>>
>> Initially, sockops BPF_SOCK_OPS_HDR_OPT_LEN_CB program will be called to
>> reserve a space in tcp_send_mss(), which will return the MSS for TSO.
>> Then, BPF_SOCK_OPS_HDR_OPT_LEN_CB will be called in __tcp_transmit_skb()
>> again to calculate the actual tcp_option_size and skb_push() the total
>> header size.
>>
>> skb->gso_size is restored from TCP_SKB_CB(skb)->tcp_gso_size, which is
>> derived from tcp_send_mss() where we first call HDR_OPT_LEN. If the
>> reserved opt size is smaller than the actual header size, the len of the
>> skb can exceed the MTU. As a result, ip(6)_fragment will drop the
>> packet if skb->ignore_df is not set.
>>
>> To prevent this accidental packet drop, we need to make sure the
>> second call to the BPF_SOCK_OPS_HDR_OPT_LEN_CB program reserves space
>> not more than the first time. 
> 
> iiuc, it is a bug in the bpf prog itself that did not reserve the same 
> header length and caused a drop. It is not the only drop case though for 
> an incorrect bpf prog. There are other cases where a bpf prog can 
> accidentally drop a packet.
> 
> Do you have an actual use case that the bpf prog cannot reserve the 
> correct header length for the same sk ?

That's right, it's a bug in the bpf prog itself. We are trying to have
the error reported earlier in eBPF program, instead of successfully
returning from bpf_sock_ops_reserve_hdr_opt but leading to packet drop
at the end because of it.

By adding this patch, the `remaining` variable passed to the
bpf_skops_hdr_opt_len will be more precise, it takes the previously
reserved size into account. As a result, if users accidentally set an
option size larger than the reserved size, bpf_sock_ops_reserve_hdr_opt
will return -ENOSPC instead of 0.

We have a use case where we add options to some packets kind of randomly
for the purpose of sampling, and accidentally set a larger option size
than the reserved size. It was our own mistake, and it took us
some effort to troubleshoot the root cause.

If bpf_sock_ops_reserve_hdr_opt can return an error in this case, it
could be helpful for users to avoid this mistake.
Martin KaFai Lau Aug. 29, 2024, 1 a.m. UTC | #3
On 8/28/24 4:01 PM, Zijian Zhang wrote:
> On 8/28/24 2:29 PM, Martin KaFai Lau wrote:
>> On 8/26/24 6:37 PM, zijianzhang@bytedance.com wrote:
>>> From: Amery Hung <amery.hung@bytedance.com>
>>>
>>> This series prevents sockops users from accidentally causing packet
>>> drops. This can happen when a BPF_SOCK_OPS_HDR_OPT_LEN_CB program
>>> reserves different option lengths in tcp_sendmsg().
>>>
>>> Initially, sockops BPF_SOCK_OPS_HDR_OPT_LEN_CB program will be called to
>>> reserve a space in tcp_send_mss(), which will return the MSS for TSO.
>>> Then, BPF_SOCK_OPS_HDR_OPT_LEN_CB will be called in __tcp_transmit_skb()
>>> again to calculate the actual tcp_option_size and skb_push() the total
>>> header size.
>>>
>>> skb->gso_size is restored from TCP_SKB_CB(skb)->tcp_gso_size, which is
>>> derived from tcp_send_mss() where we first call HDR_OPT_LEN. If the
>>> reserved opt size is smaller than the actual header size, the len of the
>>> skb can exceed the MTU. As a result, ip(6)_fragment will drop the
>>> packet if skb->ignore_df is not set.
>>>
>>> To prevent this accidental packet drop, we need to make sure the
>>> second call to the BPF_SOCK_OPS_HDR_OPT_LEN_CB program reserves space
>>> not more than the first time. 
>>
>> iiuc, it is a bug in the bpf prog itself that did not reserve the same header 
>> length and caused a drop. It is not the only drop case though for an incorrect 
>> bpf prog. There are other cases where a bpf prog can accidentally drop a packet.
>>
>> Do you have an actual use case that the bpf prog cannot reserve the correct 
>> header length for the same sk ?
> 
> That's right, it's the bug of the bpf prog itself. We are trying to have
> the error reported earlier in eBPF program, instead of successfully
> returning from bpf_sock_ops_reserve_hdr_opt but leading to packet drop
> at the end because of it.
> 
> By adding this patch, the `remaining` variable passed to the
> bpf_skops_hdr_opt_len will be more precise, it takes the previously
> reserved size into account. As a result, if users accidentally set an
> option size larger than the reserved size, bpf_sock_ops_reserve_hdr_opt
> will return -ENOSPC instead of 0.

Putting aside it adds more checks, this adds something pretty unique to the bpf 
header option comparing with other dynamic options like sack. e.g. For 
tp->mss_cache, I assume it won't change since the earlier tcp_current_mss() was 
called?

> 
> We have a use case where we add options to some packets kind of randomly
> for the purpose of sampling, and accidentally set a larger option size
> than the reserved size. It is the problem of ourselves and takes us
> some effort to troubleshoot the root cause.
> 
> If bpf_sock_ops_reserve_hdr_opt can return an error in this case, it
> could be helpful for users to avoid this mistake.

The bpf_sk_storage can be used to decide if a sk has been sampled.

Also, with bpf_cast_to_kern_ctx and bpf_rdonly_cast, all the checks in this 
patch can be done in the bpf prog itself.
Cong Wang Aug. 29, 2024, 4:46 p.m. UTC | #4
On Wed, Aug 28, 2024 at 02:29:17PM -0700, Martin KaFai Lau wrote:
> On 8/26/24 6:37 PM, zijianzhang@bytedance.com wrote:
> > From: Amery Hung <amery.hung@bytedance.com>
> > 
> > This series prevents sockops users from accidentally causing packet
> > drops. This can happen when a BPF_SOCK_OPS_HDR_OPT_LEN_CB program
> > reserves different option lengths in tcp_sendmsg().
> > 
> > Initially, sockops BPF_SOCK_OPS_HDR_OPT_LEN_CB program will be called to
> > reserve a space in tcp_send_mss(), which will return the MSS for TSO.
> > Then, BPF_SOCK_OPS_HDR_OPT_LEN_CB will be called in __tcp_transmit_skb()
> > again to calculate the actual tcp_option_size and skb_push() the total
> > header size.
> > 
> > skb->gso_size is restored from TCP_SKB_CB(skb)->tcp_gso_size, which is
> > derived from tcp_send_mss() where we first call HDR_OPT_LEN. If the
> > reserved opt size is smaller than the actual header size, the len of the
> > skb can exceed the MTU. As a result, ip(6)_fragment will drop the
> > packet if skb->ignore_df is not set.
> > 
> > To prevent this accidental packet drop, we need to make sure the
> > second call to the BPF_SOCK_OPS_HDR_OPT_LEN_CB program reserves space
> > not more than the first time.
> 
> iiuc, it is a bug in the bpf prog itself that did not reserve the same
> header length and caused a drop. It is not the only drop case though for an
> incorrect bpf prog. There are other cases where a bpf prog can accidentally
> drop a packet.

But safety is the most important thing for eBPF programs, do we really
allow this kind of bug to happen in eBPF programs?

> 
> Do you have an actual use case that the bpf prog cannot reserve the correct
> header length for the same sk ?

You can think of it as a simple call of bpf_get_prandom_u32():

SEC("sockops")
int bpf_sock_ops_cb(struct bpf_sock_ops *skops)
{
    if (skops->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB) {
        return bpf_get_prandom_u32();
    }
    return 0;
}

And eBPF programs are stateful anyway, at least we should not assume
it is stateless since maps are commonly used. Therefore, different invocations
of a same eBPF program are expected to return different values. IMHO,
this is a situation we have to deal with in the kernel, hence stricter
checks are reasonable and necessary.

Thanks.
Martin KaFai Lau Aug. 30, 2024, 12:20 a.m. UTC | #5
On 8/29/24 9:46 AM, Cong Wang wrote:
> On Wed, Aug 28, 2024 at 02:29:17PM -0700, Martin KaFai Lau wrote:
>> On 8/26/24 6:37 PM, zijianzhang@bytedance.com wrote:
>>> From: Amery Hung <amery.hung@bytedance.com>
>>>
>>> This series prevents sockops users from accidentally causing packet
>>> drops. This can happen when a BPF_SOCK_OPS_HDR_OPT_LEN_CB program
>>> reserves different option lengths in tcp_sendmsg().
>>>
>>> Initially, sockops BPF_SOCK_OPS_HDR_OPT_LEN_CB program will be called to
>>> reserve a space in tcp_send_mss(), which will return the MSS for TSO.
>>> Then, BPF_SOCK_OPS_HDR_OPT_LEN_CB will be called in __tcp_transmit_skb()
>>> again to calculate the actual tcp_option_size and skb_push() the total
>>> header size.
>>>
>>> skb->gso_size is restored from TCP_SKB_CB(skb)->tcp_gso_size, which is
>>> derived from tcp_send_mss() where we first call HDR_OPT_LEN. If the
>>> reserved opt size is smaller than the actual header size, the len of the
>>> skb can exceed the MTU. As a result, ip(6)_fragment will drop the
>>> packet if skb->ignore_df is not set.
>>>
>>> To prevent this accidental packet drop, we need to make sure the
>>> second call to the BPF_SOCK_OPS_HDR_OPT_LEN_CB program reserves space
>>> not more than the first time.
>>
>> iiuc, it is a bug in the bpf prog itself that did not reserve the same
>> header length and caused a drop. It is not the only drop case though for an
>> incorrect bpf prog. There are other cases where a bpf prog can accidentally
>> drop a packet.
> 
> But safety is the most important thing for eBPF programs, do we really
> allow this kind of bug to happen in eBPF programs?
> 
>>
>> Do you have an actual use case that the bpf prog cannot reserve the correct
>> header length for the same sk ?
> 
> You can think of it as a simple call of bpf_get_prandom_u32():
> 
> SEC("sockops")
> int bpf_sock_ops_cb(struct bpf_sock_ops *skops)
> {
>      if (skops->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB) {
>          return bpf_get_prandom_u32();

It compiles but this won't even pass the verifier.

If you want to go to this extreme to consider any random bpf program, it is 
essentially deploying a fuzzer to the production traffic, right? Sure, syzbot 
has programs that are rejected by verifier or cause a packet drop. afaik, I 
don't recall syzbot reporting them as a crash.

Is a drop a safety issue as you said? I don't think so. It is why this set is 
tagged as bpf-next also and I agree. What is the hurry here?

Is it something that can be improved? may be. Note that the patchwork status has 
not been changed yet. Instead of wasting time here, please allow Zijian (/Amery) 
to continue the discussion on another thread. I have asked question on the 
changes and also suggested other ways.


>      }
>      return 0;
> }
> 
> And eBPF programs are stateful anyway, at least we should not assume
> it is stateless since maps are commonly used. Therefore, different invocations
> of a same eBPF program are expected to return different values. IMHO,
> this is a situation we have to deal with in the kernel, hence stricter
> checks are reasonable and necessary.
Zijian Zhang Aug. 30, 2024, 9:02 p.m. UTC | #6
On 8/28/24 6:00 PM, Martin KaFai Lau wrote:
> On 8/28/24 4:01 PM, Zijian Zhang wrote:
>> On 8/28/24 2:29 PM, Martin KaFai Lau wrote:
>>> On 8/26/24 6:37 PM, zijianzhang@bytedance.com wrote:
>>>> From: Amery Hung <amery.hung@bytedance.com>
>>>>
>>>> This series prevents sockops users from accidentally causing packet
>>>> drops. This can happen when a BPF_SOCK_OPS_HDR_OPT_LEN_CB program
>>>> reserves different option lengths in tcp_sendmsg().
>>>>
>>>> Initially, sockops BPF_SOCK_OPS_HDR_OPT_LEN_CB program will be 
>>>> called to
>>>> reserve a space in tcp_send_mss(), which will return the MSS for TSO.
>>>> Then, BPF_SOCK_OPS_HDR_OPT_LEN_CB will be called in 
>>>> __tcp_transmit_skb()
>>>> again to calculate the actual tcp_option_size and skb_push() the total
>>>> header size.
>>>>
>>>> skb->gso_size is restored from TCP_SKB_CB(skb)->tcp_gso_size, which is
>>>> derived from tcp_send_mss() where we first call HDR_OPT_LEN. If the
>>>> reserved opt size is smaller than the actual header size, the len of 
>>>> the
>>>> skb can exceed the MTU. As a result, ip(6)_fragment will drop the
>>>> packet if skb->ignore_df is not set.
>>>>
>>>> To prevent this accidental packet drop, we need to make sure the
>>>> second call to the BPF_SOCK_OPS_HDR_OPT_LEN_CB program reserves space
>>>> not more than the first time. 
>>>
>>> iiuc, it is a bug in the bpf prog itself that did not reserve the 
>>> same header length and caused a drop. It is not the only drop case 
>>> though for an incorrect bpf prog. There are other cases where a bpf 
>>> prog can accidentally drop a packet.
>>>
>>> Do you have an actual use case that the bpf prog cannot reserve the 
>>> correct header length for the same sk ?
>>
>> That's right, it's the bug of the bpf prog itself. We are trying to have
>> the error reported earlier in eBPF program, instead of successfully
>> returning from bpf_sock_ops_reserve_hdr_opt but leading to packet drop
>> at the end because of it.
>>
>> By adding this patch, the `remaining` variable passed to the
>> bpf_skops_hdr_opt_len will be more precise, it takes the previously
>> reserved size into account. As a result, if users accidentally set an
>> option size larger than the reserved size, bpf_sock_ops_reserve_hdr_opt
>> will return -ENOSPC instead of 0.
> 
> Putting aside it adds more checks, this adds something pretty unique to 
> the bpf header option comparing with other dynamic options like sack. 
> e.g. For tp->mss_cache, I assume it won't change since the earlier 
> tcp_current_mss() was called?
> 

Agreed, this check is very unique comparing with other dynamic options.
How about moving the check into function bpf_skops_hdr_opt_len? It
already has some logic to check if the context is in tcp_current_mss.
Does it look more reasonable?

`reserved_opt_spc = tp->mss_cache - tcp_skb_seglen(skb)`
For tp->mss_cache here, yes, I also think it won't change,

I am trying to get the reserved bpf hdr size. Considering other dynamic
options, this equation is not precise, and may change it to
`reserved_opt_spc = tp->mss_cache - tcp_skb_seglen(skb) - size`?

Or, not sure if adding a field in tcp_sock to precisely cache the
reserved bpf hdr size is a good idea?

>>
>> We have a use case where we add options to some packets kind of randomly
>> for the purpose of sampling, and accidentally set a larger option size
>> than the reserved size. It is the problem of ourselves and takes us
>> some effort to troubleshoot the root cause.
>>
>> If bpf_sock_ops_reserve_hdr_opt can return an error in this case, it
>> could be helpful for users to avoid this mistake.
> 
> The bpf_sk_storage can be used to decide if a sk has been sampled.
> 
> Also, with bpf_cast_to_kern_ctx and bpf_rdonly_cast, all the checks in 
> this patch can be done in the bpf prog itself.
> 

Thanks for pointing this out! Agreed, the check can be implemented in
eBPF progs.
Martin KaFai Lau Sept. 3, 2024, 10:38 p.m. UTC | #7
On 8/30/24 2:02 PM, Zijian Zhang wrote:
> On 8/28/24 6:00 PM, Martin KaFai Lau wrote:
>> On 8/28/24 4:01 PM, Zijian Zhang wrote:
>>> On 8/28/24 2:29 PM, Martin KaFai Lau wrote:
>>>> On 8/26/24 6:37 PM, zijianzhang@bytedance.com wrote:
>>>>> From: Amery Hung <amery.hung@bytedance.com>
>>>>>
>>>>> This series prevents sockops users from accidentally causing packet
>>>>> drops. This can happen when a BPF_SOCK_OPS_HDR_OPT_LEN_CB program
>>>>> reserves different option lengths in tcp_sendmsg().
>>>>>
>>>>> Initially, sockops BPF_SOCK_OPS_HDR_OPT_LEN_CB program will be called to
>>>>> reserve a space in tcp_send_mss(), which will return the MSS for TSO.
>>>>> Then, BPF_SOCK_OPS_HDR_OPT_LEN_CB will be called in __tcp_transmit_skb()
>>>>> again to calculate the actual tcp_option_size and skb_push() the total
>>>>> header size.
>>>>>
>>>>> skb->gso_size is restored from TCP_SKB_CB(skb)->tcp_gso_size, which is
>>>>> derived from tcp_send_mss() where we first call HDR_OPT_LEN. If the
>>>>> reserved opt size is smaller than the actual header size, the len of the
>>>>> skb can exceed the MTU. As a result, ip(6)_fragment will drop the
>>>>> packet if skb->ignore_df is not set.
>>>>>
>>>>> To prevent this accidental packet drop, we need to make sure the
>>>>> second call to the BPF_SOCK_OPS_HDR_OPT_LEN_CB program reserves space
>>>>> not more than the first time. 
>>>>
>>>> iiuc, it is a bug in the bpf prog itself that did not reserve the same 
>>>> header length and caused a drop. It is not the only drop case though for an 
>>>> incorrect bpf prog. There are other cases where a bpf prog can accidentally 
>>>> drop a packet.
>>>>
>>>> Do you have an actual use case that the bpf prog cannot reserve the correct 
>>>> header length for the same sk ?
>>>
>>> That's right, it's the bug of the bpf prog itself. We are trying to have
>>> the error reported earlier in eBPF program, instead of successfully
>>> returning from bpf_sock_ops_reserve_hdr_opt but leading to packet drop
>>> at the end because of it.
>>>
>>> By adding this patch, the `remaining` variable passed to the
>>> bpf_skops_hdr_opt_len will be more precise, it takes the previously
>>> reserved size into account. As a result, if users accidentally set an
>>> option size larger than the reserved size, bpf_sock_ops_reserve_hdr_opt
>>> will return -ENOSPC instead of 0.
>>
>> Putting aside it adds more checks, this adds something pretty unique to the 
>> bpf header option comparing with other dynamic options like sack. e.g. For
>> tp->mss_cache, I assume it won't change since the earlier tcp_current_mss() was
>> called?
>>
> 
> Agreed, this check is very unique comparing with other dynamic options.
> How about moving the check into function bpf_skops_hdr_opt_len? It
> already has some logic to check if the context is in tcp_current_mss.
> Does it look more reasonable?

Yes, it probably may be better to put that into the bpf_skops_hdr_opt_len(). 
This is implementation details which is something for later after figuring out 
if the reserved_opt_spc change is correct and needed.

> 
> `reserved_opt_spc = tp->mss_cache - tcp_skb_seglen(skb)`
> For tp->mss_cache here, yes, I also think it won't change,

This needs more details and clear explanation on why it is the case and why the 
existing regular bpf prog will continue to work.

afaik, mss_cache and tcp_skb_seglen() must be in-sync and the same between calls 
for this to work. From thinking about it, it should be, but I haven't looked at 
all cases. Missing "- size" does not help the confidence here also.

Also, when "skb != NULL", I don't understand why the "MAX_TCP_OPTION_SPACE - 
size" is still being considered. I am likely missing something here. If the 
above formula is correct, why reserved_opt_spc is not always used for the "skb 
!= NULL" case and still need to be compared with the "MAX_TCP_OPTION_SPACE - size"?

> 
> I am trying to get the reserved bpf hdr size. Considering other dynamic
> options, this equation is not precise, and may change it to
> `reserved_opt_spc = tp->mss_cache - tcp_skb_seglen(skb) - size`?

"- size" is needed. The test did not catch it?

> 
> Or, not sure if adding a field in tcp_sock to precisely cache the
> reserved bpf hdr size is a good idea?

imo, adding one field in tcp_sock for this is overkill.

It seems your bpf prog is randomly reserving space. If this is the case, giving 
a fail signal in bpf_reserve_hdr_opt() does not improve the success rate for 
your random bpf prog to reserve header. I don't think adding a field or the 
change in this patch really solves anything in your randomly reserving space use 
case ?

> 
>>>
>>> We have a use case where we add options to some packets kind of randomly
>>> for the purpose of sampling, and accidentally set a larger option size
>>> than the reserved size. It is the problem of ourselves and takes us
>>> some effort to troubleshoot the root cause.
>>>
>>> If bpf_sock_ops_reserve_hdr_opt can return an error in this case, it
>>> could be helpful for users to avoid this mistake.
>>
>> The bpf_sk_storage can be used to decide if a sk has been sampled.
>>
>> Also, with bpf_cast_to_kern_ctx and bpf_rdonly_cast, all the checks in this 
>> patch can be done in the bpf prog itself.
>>
> 
> Thanks for pointing this out! Agreed, the check can be implemented in
> eBPF progs.
Zijian Zhang Sept. 5, 2024, 6:19 p.m. UTC | #8
On 9/3/24 3:38 PM, Martin KaFai Lau wrote:
> On 8/30/24 2:02 PM, Zijian Zhang wrote:
>> On 8/28/24 6:00 PM, Martin KaFai Lau wrote:
>>> On 8/28/24 4:01 PM, Zijian Zhang wrote:
>>>> On 8/28/24 2:29 PM, Martin KaFai Lau wrote:
>>>>> On 8/26/24 6:37 PM, zijianzhang@bytedance.com wrote:
>>>>>> From: Amery Hung <amery.hung@bytedance.com>
>>>>>>
>>>>>> This series prevents sockops users from accidentally causing packet
>>>>>> drops. This can happen when a BPF_SOCK_OPS_HDR_OPT_LEN_CB program
>>>>>> reserves different option lengths in tcp_sendmsg().
>>>>>>
>>>>>> Initially, sockops BPF_SOCK_OPS_HDR_OPT_LEN_CB program will be 
>>>>>> called to
>>>>>> reserve a space in tcp_send_mss(), which will return the MSS for TSO.
>>>>>> Then, BPF_SOCK_OPS_HDR_OPT_LEN_CB will be called in 
>>>>>> __tcp_transmit_skb()
>>>>>> again to calculate the actual tcp_option_size and skb_push() the 
>>>>>> total
>>>>>> header size.
>>>>>>
>>>>>> skb->gso_size is restored from TCP_SKB_CB(skb)->tcp_gso_size, 
>>>>>> which is
>>>>>> derived from tcp_send_mss() where we first call HDR_OPT_LEN. If the
>>>>>> reserved opt size is smaller than the actual header size, the len 
>>>>>> of the
>>>>>> skb can exceed the MTU. As a result, ip(6)_fragment will drop the
>>>>>> packet if skb->ignore_df is not set.
>>>>>>
>>>>>> To prevent this accidental packet drop, we need to make sure the
>>>>>> second call to the BPF_SOCK_OPS_HDR_OPT_LEN_CB program reserves space
>>>>>> not more than the first time. 
>>>>>
>>>>> iiuc, it is a bug in the bpf prog itself that did not reserve the 
>>>>> same header length and caused a drop. It is not the only drop case 
>>>>> though for an incorrect bpf prog. There are other cases where a bpf 
>>>>> prog can accidentally drop a packet.
>>>>>
>>>>> Do you have an actual use case that the bpf prog cannot reserve the 
>>>>> correct header length for the same sk ?
>>>>
>>>> That's right, it's the bug of the bpf prog itself. We are trying to 
>>>> have
>>>> the error reported earlier in eBPF program, instead of successfully
>>>> returning from bpf_sock_ops_reserve_hdr_opt but leading to packet drop
>>>> at the end because of it.
>>>>
>>>> By adding this patch, the `remaining` variable passed to the
>>>> bpf_skops_hdr_opt_len will be more precise, it takes the previously
>>>> reserved size into account. As a result, if users accidentally set an
>>>> option size larger than the reserved size, bpf_sock_ops_reserve_hdr_opt
>>>> will return -ENOSPC instead of 0.
>>>
>>> Putting aside it adds more checks, this adds something pretty unique 
>>> to the bpf header option compared with other dynamic options like 
>>> sack. e.g. For tp- >mss_cache, I assume it won't change since the 
>>> earlier tcp_current_mss() was called?
>>>
>>
>> Agreed, this check is very unique compared with other dynamic options.
>> How about moving the check into function bpf_skops_hdr_opt_len? It
>> already has some logic to check if the context is in tcp_current_mss.
>> Does it look more reasonable?
> 
> Yes, it probably may be better to put that into the 
> bpf_skops_hdr_opt_len(). This is implementation details which is 
> something for later after figuring out if the reserved_opt_spc change is 
> correct and needed.
> 
>>
>> `reserved_opt_spc = tp->mss_cache - tcp_skb_seglen(skb)`
>> For tp->mss_cache here, yes, I also think it won't change,
> 
> This needs more details and clear explanation on why it is the case and 
> why the existing regular bpf prog will continue to work.
> 
> afaik, mss_cache and tcp_skb_seglen() must be in-sync and the same 
> between calls for this to work. From thinking about it, it should be, 
> but I haven't looked at all cases. Missing "- size" does not help the 
> confidence here also.
> 
> Also, when "skb != NULL", I don't understand why the 
> "MAX_TCP_OPTION_SPACE - size" is still being considered. I am likely 
> missing something here. If the above formula is correct, why 
> reserved_opt_spc is not always used for the "skb != NULL" case and still 
> need to be compared with the "MAX_TCP_OPTION_SPACE - size"?
> 

Cases I can think of are as follows,
- When it's not a GSO skb, tcp_skb_seglen will simply return skb->len,
which might make `tp->mss_cache - tcp_skb_seglen(skb)` a large number.

- When we are in tcp_mtu_probe, tp->mss_cache will be smaller than
tcp_skb_seglen(skb), which makes the equation again a large number.


"MAX_TCP_OPTION_SPACE - size" is considered here as the strict upper
bound, while reserved_opt_spc is expected to be a stricter upper bound.
In the above or other cases, where the equation malfunctioned, we can
always fall back to the original bound.


I am not sure which way to get the reserved size is better,
1. We could precisely cache the result of the reserved size, may
need to have a new field for it, I agree that a field in tcp_sock
is overkill.

2. In this patch, we use an equation to infer the value of it. There are
some concerns with tp->mss_cache and tcp_skb_seglen(skb) here, I agree.
If tp->mss_cache and tcp_skb_seglen(skb) might not be in-sync, we may
have an underestimated reserved_opt_spc, it may break existing bpf
progs. If this method is preferred I will do more investigations to
verify it or modify the equation :)

Do you have preference to either option?

>>
>> I am trying to get the reserved bpf hdr size. Considering other dynamic
>> options, this equation is not precise, and may change it to
>> `reserved_opt_spc = tp->mss_cache - tcp_skb_seglen(skb) - size`?
> 
> "- size" is needed. The test did not catch it?
> 
>>
>> Or, not sure if adding a field in tcp_sock to precisely cache the
>> reserved bpf hdr size is a good idea?
> 
> imo, adding one field in tcp_sock for this is overkill.
> 
> It seems your bpf prog is randomly reserving space. If this is the case, 
> giving a fail signal in bpf_reserve_hdr_opt() does not improve the 
> success rate for your random bpf prog to reserve header. I don't think 
> adding a field or the change in this patch really solves anything in 
> your randomly reserving space use case ?
> 

It's true that it won't help with the success rate, but it could help
save packets from being dropped.

The goal is to let bpf_reserve_hdr_opt fail, when it is supposed to.
And, as a bonus effect, it could also save packets from being dropped.

When the accidental mistake happens, we want bpf_reserve_hdr_opt to
fail, so that `sock_ops.remaining_opt_len` won't be updated.
If it is still updated in this case, the packet will be dropped in
ip_fragment because `IPCB(skb)->frag_max_size > mtu`.

>>
>>>>
>>>> We have a use case where we add options to some packets kind of 
>>>> randomly
>>>> for the purpose of sampling, and accidentally set a larger option size
>>>> than the reserved size. It is the problem of ourselves and takes us
>>>> some effort to troubleshoot the root cause.
>>>>
>>>> If bpf_sock_ops_reserve_hdr_opt can return an error in this case, it
>>>> could be helpful for users to avoid this mistake.
>>>
>>> The bpf_sk_storage can be used to decide if a sk has been sampled.
>>>
>>> Also, with bpf_cast_to_kern_ctx and bpf_rdonly_cast, all the checks 
>>> in this patch can be done in the bpf prog itself.
>>>
>>
>> Thanks for pointing this out! Agreed, the check can be implemented in
>> eBPF progs.
>
Martin KaFai Lau Sept. 5, 2024, 7:38 p.m. UTC | #9
On 9/5/24 11:19 AM, Zijian Zhang wrote:
> On 9/3/24 3:38 PM, Martin KaFai Lau wrote:
>> On 8/30/24 2:02 PM, Zijian Zhang wrote:
>>> On 8/28/24 6:00 PM, Martin KaFai Lau wrote:
>>>> On 8/28/24 4:01 PM, Zijian Zhang wrote:
>>>>> On 8/28/24 2:29 PM, Martin KaFai Lau wrote:
>>>>>> On 8/26/24 6:37 PM, zijianzhang@bytedance.com wrote:
>>>>>>> From: Amery Hung <amery.hung@bytedance.com>
>>>>>>>
>>>>>>> This series prevents sockops users from accidentally causing packet
>>>>>>> drops. This can happen when a BPF_SOCK_OPS_HDR_OPT_LEN_CB program
>>>>>>> reserves different option lengths in tcp_sendmsg().
>>>>>>>
>>>>>>> Initially, sockops BPF_SOCK_OPS_HDR_OPT_LEN_CB program will be called to
>>>>>>> reserve a space in tcp_send_mss(), which will return the MSS for TSO.
>>>>>>> Then, BPF_SOCK_OPS_HDR_OPT_LEN_CB will be called in __tcp_transmit_skb()
>>>>>>> again to calculate the actual tcp_option_size and skb_push() the total
>>>>>>> header size.
>>>>>>>
>>>>>>> skb->gso_size is restored from TCP_SKB_CB(skb)->tcp_gso_size, which is
>>>>>>> derived from tcp_send_mss() where we first call HDR_OPT_LEN. If the
>>>>>>> reserved opt size is smaller than the actual header size, the len of the
>>>>>>> skb can exceed the MTU. As a result, ip(6)_fragment will drop the
>>>>>>> packet if skb->ignore_df is not set.
>>>>>>>
>>>>>>> To prevent this accidental packet drop, we need to make sure the
>>>>>>> second call to the BPF_SOCK_OPS_HDR_OPT_LEN_CB program reserves space
>>>>>>> not more than the first time. 
>>>>>>
>>>>>> iiuc, it is a bug in the bpf prog itself that did not reserve the same 
>>>>>> header length and caused a drop. It is not the only drop case though for 
>>>>>> an incorrect bpf prog. There are other cases where a bpf prog can 
>>>>>> accidentally drop a packet.
>>>>>>
>>>>>> Do you have an actual use case that the bpf prog cannot reserve the 
>>>>>> correct header length for the same sk ?
>>>>>
>>>>> That's right, it's the bug of the bpf prog itself. We are trying to have
>>>>> the error reported earlier in eBPF program, instead of successfully
>>>>> returning from bpf_sock_ops_reserve_hdr_opt but leading to packet drop
>>>>> at the end because of it.
>>>>>
>>>>> By adding this patch, the `remaining` variable passed to the
>>>>> bpf_skops_hdr_opt_len will be more precise, it takes the previously
>>>>> reserved size into account. As a result, if users accidentally set an
>>>>> option size larger than the reserved size, bpf_sock_ops_reserve_hdr_opt
>>>>> will return -ENOSPC instead of 0.
>>>>
>>>> Putting aside it adds more checks, this adds something pretty unique to the 
>>>> bpf header option comparing with other dynamic options like sack. e.g. For 
>>>> tp- >mss_cache, I assume it won't change since the earlier tcp_current_mss() 
>>>> was called?
>>>>
>>>
>>> Agreed, this check is very unique comparing with other dynamic options.
>>> How about moving the check into function bpf_skops_hdr_opt_len? It
>>> already has some logic to check if the context is in tcp_current_mss.
>>> Does it look more reasonable?
>>
>> Yes, it probably may be better to put that into the bpf_skops_hdr_opt_len(). 
>> This is implementation details which is something for later after figuring out 
>> if the reserved_opt_spc change is correct and needed.
>>
>>>
>>> `reserved_opt_spc = tp->mss_cache - tcp_skb_seglen(skb)`
>>> For tp->mss_cache here, yes, I also think it won't change,
>>
>> This needs more details and clear explanation on why it is the case and why 
>> the existing regular bpf prog will continue to work.
>>
>> afaik, mss_cache and tcp_skb_seglen() must be in-sync and the same between 
>> calls for this to work. From thinking about it, it should be, but I haven't 
>> looked at all cases. Missing "- size" does not help the confidence here also.
>>
>> Also, when "skb != NULL", I don't understand why the "MAX_TCP_OPTION_SPACE - 
>> size" is still being considered. I am likely missing something here. If the 
>> above formula is correct, why reserved_opt_spc is not always used for the 
>> "skb != NULL" case and still need to be compared with the 
>> "MAX_TCP_OPTION_SPACE - size"?
>>
> 
> Cases I can think of are as follows,
> - When it's not a GSO skb, tcp_skb_seglen will simply return skb->len,
> it might make `tp->mss_cache - tcp_skb_seglen(skb)` a large number.
> 
> - When we are in tcp_mtu_probe, tp->mss_cache will be smaller than
> tcp_skb_seglen(skb), which makes the equation again a large number.

In tcp_mtu_probe, mss_cache is (smaller) than the tcp_skb_seglen(skb)?

> 
> 
> "MAX_TCP_OPTION_SPACE - size" is considered here as the strict upper
> bound, while reserved_opt_spc is expected to be a stricter upper bound.
> In the above or other cases, where the equation malfunctioned, we can
> always fall back to the original bound.

Make sense. Thanks for the explanation. The commit message could have used this 
details.

> 
> 
> I am not sure which way to get the reserved size is better,
> 1. We could precisely cache the result of the reserved size, may
> need to have a new field for it, I agree that a field in tcp_sock
> is overkill.
> 
> 2. In this patch, we use an equation to infer the value of it. There are
> some concerns with tp->mss_cache and tcp_skb_seglen(skb) here, I agree.
> If tp->mss_cache and tcp_skb_seglen(skb) might not be in-sync, we may
> have an underestimated reserved_opt_spc, it may break existing bpf
> progs. If this method is preferred I will do more investigations to
> verify it or modify the equation :)

imo, the bpf prog can always use bpf_sk_storage to add fields to a sock and 
store the previous reserved space there. I think this is the solution you should 
use in your use case which randomly reserves header space. Adding a field in the 
tcp_sock feels wrong especially the only use case I am hearing so far is a bpf 
prog randomly reserving header spaces.

If (2) can be convinced to be correct, improving bpf_reserve_hdr_opt() is fine 
as long as it does not break the existing program. I expect some tests to cover 
this.
Zijian Zhang Sept. 5, 2024, 8:20 p.m. UTC | #10
On 9/5/24 12:38 PM, Martin KaFai Lau wrote:
> On 9/5/24 11:19 AM, Zijian Zhang wrote:
>> On 9/3/24 3:38 PM, Martin KaFai Lau wrote:
>>> On 8/30/24 2:02 PM, Zijian Zhang wrote:
>>>> On 8/28/24 6:00 PM, Martin KaFai Lau wrote:
>>>>> On 8/28/24 4:01 PM, Zijian Zhang wrote:
>>>>>> On 8/28/24 2:29 PM, Martin KaFai Lau wrote:
>>>>>>> On 8/26/24 6:37 PM, zijianzhang@bytedance.com wrote:
>>>>>>>> From: Amery Hung <amery.hung@bytedance.com>
>>>>>>>>
>>>>>>>> This series prevents sockops users from accidentally causing packet
>>>>>>>> drops. This can happen when a BPF_SOCK_OPS_HDR_OPT_LEN_CB program
>>>>>>>> reserves different option lengths in tcp_sendmsg().
>>>>>>>>
>>>>>>>> Initially, sockops BPF_SOCK_OPS_HDR_OPT_LEN_CB program will be 
>>>>>>>> called to
>>>>>>>> reserve a space in tcp_send_mss(), which will return the MSS for 
>>>>>>>> TSO.
>>>>>>>> Then, BPF_SOCK_OPS_HDR_OPT_LEN_CB will be called in 
>>>>>>>> __tcp_transmit_skb()
>>>>>>>> again to calculate the actual tcp_option_size and skb_push() the 
>>>>>>>> total
>>>>>>>> header size.
>>>>>>>>
>>>>>>>> skb->gso_size is restored from TCP_SKB_CB(skb)->tcp_gso_size, 
>>>>>>>> which is
>>>>>>>> derived from tcp_send_mss() where we first call HDR_OPT_LEN. If the
>>>>>>>> reserved opt size is smaller than the actual header size, the 
>>>>>>>> len of the
>>>>>>>> skb can exceed the MTU. As a result, ip(6)_fragment will drop the
>>>>>>>> packet if skb->ignore_df is not set.
>>>>>>>>
>>>>>>>> To prevent this accidental packet drop, we need to make sure the
>>>>>>>> second call to the BPF_SOCK_OPS_HDR_OPT_LEN_CB program reserves 
>>>>>>>> space
>>>>>>>> not more than the first time. 
>>>>>>>
>>>>>>> iiuc, it is a bug in the bpf prog itself that did not reserve the 
>>>>>>> same header length and caused a drop. It is not the only drop 
>>>>>>> case though for an incorrect bpf prog. There are other cases 
>>>>>>> where a bpf prog can accidentally drop a packet.
>>>>>>>
>>>>>>> Do you have an actual use case that the bpf prog cannot reserve 
>>>>>>> the correct header length for the same sk ?
>>>>>>
>>>>>> That's right, it's the bug of the bpf prog itself. We are trying 
>>>>>> to have
>>>>>> the error reported earlier in eBPF program, instead of successfully
>>>>>> returning from bpf_sock_ops_reserve_hdr_opt but leading to packet 
>>>>>> drop
>>>>>> at the end because of it.
>>>>>>
>>>>>> By adding this patch, the `remaining` variable passed to the
>>>>>> bpf_skops_hdr_opt_len will be more precise, it takes the previously
>>>>>> reserved size into account. As a result, if users accidentally set an
>>>>>> option size larger than the reserved size, 
>>>>>> bpf_sock_ops_reserve_hdr_opt
>>>>>> will return -ENOSPC instead of 0.
>>>>>
>>>>> Putting aside it adds more checks, this adds something pretty 
>>>>> unique to the bpf header option comparing with other dynamic 
>>>>> options like sack. e.g. For tp- >mss_cache, I assume it won't 
>>>>> change since the earlier tcp_current_mss() was called?
>>>>>
>>>>
>>>> Agreed, this check is very unique comparing with other dynamic options.
>>>> How about moving the check into function bpf_skops_hdr_opt_len? It
>>>> already has some logic to check if the context is in tcp_current_mss.
>>>> Does it look more reasonable?
>>>
>>> Yes, it probably may be better to put that into the 
>>> bpf_skops_hdr_opt_len(). This is implementation details which is 
>>> something for later after figuring out if the reserved_opt_spc change 
>>> is correct and needed.
>>>
>>>>
>>>> `reserved_opt_spc = tp->mss_cache - tcp_skb_seglen(skb)`
>>>> For tp->mss_cache here, yes, I also think it won't change,
>>>
>>> This needs more details and clear explanation on why it is the case 
>>> and why the existing regular bpf prog will continue to work.
>>>
>>> afaik, mss_cache and tcp_skb_seglen() must be in-sync and the same 
>>> between calls for this to work . From thinking about it, it should be 
>>> but I haven't looked at all cases. Missing "- size" does not help the 
>>> confidence here also.
>>>
>>> Also, when "skb != NULL", I don't understand why the 
>>> "MAX_TCP_OPTION_SPACE - size" is still being considered. I am likely 
>>> missing something here. If the above formula is correct, why 
>>> reserved_opt_spc is not always used for the "skb != NULL" case and 
>>> still need to be compared with the "MAX_TCP_OPTION_SPACE - size"?
>>>
>>
>> Cases I can think of are as follows,
>> - When it's not a GSO skb, tcp_skb_seglen will simply return skb->len,
>> it might make `tp->mss_cache - tcp_skb_seglen(skb)` a large number.
>>
>> - When we are in tcp_mtu_probe, tp->mss_cache will be smaller than
>> tcp_skb_seglen(skb), which makes the equation again a large number.
> 
> In tcp_mtu_probe, mss_cache is (smaller) than the tcp_skb_seglen(skb)?
> 

```
tcp_init_tso_segs(nskb, nskb->len);
if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) ...
```

In the tcp_transmit_skb inside tcp_mtu_probe, it tries to send an skb
with larger mss, so I assume mss_cache will be smaller than
tcp_skb_seglen(skb). Sorry for the confusion here.

>>
>>
>> "MAX_TCP_OPTION_SPACE - size" is considered here as the strict upper
>> bound, while reserved_opt_spc is expected to be a stricter upper bound.
>> In the above or other cases, where the equation malfunctioned, we can
>> always fall back to the original bound.
> 
> Make sense. Thanks for the explanation. The commit message could have 
> used this details.
> 

No problem ;)

>>
>>
>> I am not sure which way to get the reserved size is better,
>> 1. We could precisely cache the result of the reserved size, may
>> need to have a new field for it, I agree that a field in tcp_sock
>> is overkill.
>>
>> 2. In this patch, we use an equation to infer the value of it. There are
>> some concerns with tp->mss_cache and tcp_skb_seglen(skb) here, I agree.
>> If tp->mss_cache and tcp_skb_seglen(skb) might not be in-sync, we may
>> have an underestimated reserved_opt_spc, it may break existing bpf
>> progs. If this method is preferred I will do more investigations to
>> verify it or modify the equation :)
> 
> imo, the bpf prog can always use bpf_sk_storage to add fields to a sock 
> and store the previous reserved space there. I think this is the 
> solution you should use in your use case which randomly reserves header 
> space. Adding a field in the tcp_sock feels wrong especially the only 
> use case I am hearing so far is a bpf prog randomly reserving header 
> spaces.

Agree, thanks for the suggestion.

> 
> If (2) can be convinced to be correct, improving bpf_reserve_hdr_opt() 
> is fine as long as it does not break the existing program. I expect some 
> tests to cover this.
> 

Got it, will take a further look and do more tests.
Martin KaFai Lau Sept. 5, 2024, 9:07 p.m. UTC | #11
On 9/5/24 1:20 PM, Zijian Zhang wrote:
>>>
>>> Cases I can think of are as follows,
>>> - When it's not a GSO skb, tcp_skb_seglen will simply return skb->len,
>>> it might make `tp->mss_cache - tcp_skb_seglen(skb)` a large number.
>>>
>>> - When we are in tcp_mtu_probe, tp->mss_cache will be smaller than
>>> tcp_skb_seglen(skb), which makes the equation again a large number.
>>
>> In tcp_mtu_probe, mss_cache is (smaller) than the tcp_skb_seglen(skb)?
>>
> 
> ```
> tcp_init_tso_segs(nskb, nskb->len);
> if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) ...
> ```
> 
> In the tcp_transmit_skb inside tcp_mtu_probe, it tries to send an skb
> with larger mss, so I assume mss_cache will be smaller than
> tcp_skb_seglen(skb). Sorry for the confusion here.

hmm... "mss_cache - tcp_skb_seglen(skb)" and mss_cache could be smaller...
This is another signal that this approach does not sound right. I am not 
positive tbh. Given that I have already suggested more than one other way, if 
you are really eager to pursue this route to improve bpf_reserve_hdr_opt(), the 
test coverage has to be convincing enough to cover corner cases like this, for 
example.

pw-bot: cr
diff mbox series

Patch

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 2aac11e7e1cc..e202eeb19be4 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1058,6 +1058,14 @@  static inline int tcp_skb_mss(const struct sk_buff *skb)
 	return TCP_SKB_CB(skb)->tcp_gso_size;
 }
 
+/* I wish gso_size would have a bit more sane initialization than
+ * something-or-zero which complicates things
+ */
+static inline int tcp_skb_seglen(const struct sk_buff *skb)
+{
+	return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb);
+}
+
 static inline bool tcp_skb_can_collapse_to(const struct sk_buff *skb)
 {
 	return likely(!TCP_SKB_CB(skb)->eor);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e37488d3453f..c1ffe19b0717 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1550,14 +1550,6 @@  static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
 	return true;
 }
 
-/* I wish gso_size would have a bit more sane initialization than
- * something-or-zero which complicates things
- */
-static int tcp_skb_seglen(const struct sk_buff *skb)
-{
-	return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb);
-}
-
 /* Shifting pages past head area doesn't work */
 static int skb_can_shift(const struct sk_buff *skb)
 {
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 16c48df8df4c..f5996cdbb2ba 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1033,10 +1033,19 @@  static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
 	if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp,
 					    BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) {
 		unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
+		unsigned int old_remaining;
 
-		bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
+		if (skb) {
+			unsigned int reserved_opt_spc;
+
+			reserved_opt_spc = tp->mss_cache - tcp_skb_seglen(skb);
+			if (reserved_opt_spc < remaining)
+				remaining = reserved_opt_spc;
+		}
 
-		size = MAX_TCP_OPTION_SPACE - remaining;
+		old_remaining = remaining;
+		bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
+		size += old_remaining - remaining;
 	}
 
 	return size;