diff mbox series

[bpf-next,09/10] bpf: Add a helper to issue timestamp cookies in XDP

Message ID 20211019144655.3483197-10-maximmi@nvidia.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series New BPF helpers to accelerate synproxy | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
netdev/cover_letter success Series has a cover letter
netdev/fixes_present success Fixes tag not required for -next series
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for bpf-next
netdev/subject_prefix success Link
netdev/cc_maintainers warning 3 maintainers not CCed: llvm@lists.linux.dev brouer@redhat.com liuhangbin@gmail.com
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 11790 this patch: 11790
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success No Fixes tag
netdev/checkpatch warning WARNING: line length of 92 exceeds 80 columns
netdev/build_allmodconfig_warn success Errors and warnings before: 11421 this patch: 11421
netdev/header_inline success No static functions without inline keyword in header files
bpf/vmtest-bpf-next success VM_Test

Commit Message

Maxim Mikityanskiy Oct. 19, 2021, 2:46 p.m. UTC
The new helper bpf_tcp_raw_gen_tscookie allows an XDP program to
generate timestamp cookies (to be used together with SYN cookies) which
encode different options set by the client in the SYN packet: SACK
support, ECN support, window scale. These options are encoded in lower
bits of the timestamp, which will be returned by the client in a
subsequent ACK packet. The format is the same used by synproxy.

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
---
 include/net/tcp.h              |  1 +
 include/uapi/linux/bpf.h       | 27 +++++++++++++++
 net/core/filter.c              | 38 +++++++++++++++++++++
 net/ipv4/syncookies.c          | 60 ++++++++++++++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h | 27 +++++++++++++++
 5 files changed, 153 insertions(+)

Comments

Eric Dumazet Oct. 19, 2021, 4:45 p.m. UTC | #1
On 10/19/21 7:46 AM, Maxim Mikityanskiy wrote:
> The new helper bpf_tcp_raw_gen_tscookie allows an XDP program to
> generate timestamp cookies (to be used together with SYN cookies) which
> encode different options set by the client in the SYN packet: SACK
> support, ECN support, window scale. These options are encoded in lower
> bits of the timestamp, which will be returned by the client in a
> subsequent ACK packet. The format is the same used by synproxy.
> 
> Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
> Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
> ---
>  include/net/tcp.h              |  1 +
>  include/uapi/linux/bpf.h       | 27 +++++++++++++++
>  net/core/filter.c              | 38 +++++++++++++++++++++
>  net/ipv4/syncookies.c          | 60 ++++++++++++++++++++++++++++++++++
>  tools/include/uapi/linux/bpf.h | 27 +++++++++++++++
>  5 files changed, 153 insertions(+)
> 
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 1cc96a225848..651820bef6a2 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -564,6 +564,7 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
>  			      u16 *mssp);
>  __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss);
>  u64 cookie_init_timestamp(struct request_sock *req, u64 now);
> +bool cookie_init_timestamp_raw(struct tcphdr *th, __be32 *tsval, __be32 *tsecr);
>  bool cookie_timestamp_decode(const struct net *net,
>  			     struct tcp_options_received *opt);
>  bool cookie_ecn_ok(const struct tcp_options_received *opt,
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index e32f72077250..791790b41874 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -5053,6 +5053,32 @@ union bpf_attr {
>   *
>   *		**-EPROTONOSUPPORT** if the IP version is not 4 or 6 (or 6, but
>   *		CONFIG_IPV6 is disabled).
> + *
> + * int bpf_tcp_raw_gen_tscookie(struct tcphdr *th, u32 th_len, __be32 *tsopt, u32 tsopt_len)
> + *	Description
> + *		Try to generate a timestamp cookie which encodes some of the
> + *		flags sent by the client in the SYN packet: SACK support, ECN
> + *		support, window scale. To be used with SYN cookies.
> + *
> + *		*th* points to the start of the TCP header of the client's SYN
> + *		packet, while *th_len* contains the length of the TCP header (at
> + *		least **sizeof**\ (**struct tcphdr**)).
> + *
> + *		*tsopt* points to the output location where to put the resulting
> + *		timestamp values: tsval and tsecr, in the format of the TCP
> + *		timestamp option.
> + *
> + *	Return
> + *		On success, 0.
> + *
> + *		On failure, the returned value is one of the following:
> + *
> + *		**-EINVAL** if the input arguments are invalid.
> + *
> + *		**-ENOENT** if the TCP header doesn't have the timestamp option.
> + *
> + *		**-EOPNOTSUPP** if the kernel configuration does not enable SYN
> + *		cookies (CONFIG_SYN_COOKIES is off).
>   */
>  #define __BPF_FUNC_MAPPER(FN)		\
>  	FN(unspec),			\
> @@ -5238,6 +5264,7 @@ union bpf_attr {
>  	FN(ct_release),			\
>  	FN(tcp_raw_gen_syncookie),	\
>  	FN(tcp_raw_check_syncookie),	\
> +	FN(tcp_raw_gen_tscookie),	\
>  	/* */
>  
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 5f03d4a282a0..73fe20ef7442 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -7403,6 +7403,42 @@ static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_proto = {
>  	.arg4_type	= ARG_CONST_SIZE,
>  };
>  
> +BPF_CALL_4(bpf_tcp_raw_gen_tscookie, struct tcphdr *, th, u32, th_len,
> +	   __be32 *, tsopt, u32, tsopt_len)
> +{
> +	int err;
> +
> +#ifdef CONFIG_SYN_COOKIES
> +	if (tsopt_len != sizeof(u64)) {
> +		err = -EINVAL;
> +		goto err_out;
> +	}
> +
> +	if (!cookie_init_timestamp_raw(th, &tsopt[0], &tsopt[1])) {
> +		err = -ENOENT;
> +		goto err_out;
> +	}
> +
> +	return 0;
> +err_out:
> +#else
> +	err = -EOPNOTSUPP;
> +#endif
> +	memset(tsopt, 0, tsopt_len);
> +	return err;
> +}
> +
> +static const struct bpf_func_proto bpf_tcp_raw_gen_tscookie_proto = {
> +	.func		= bpf_tcp_raw_gen_tscookie,
> +	.gpl_only	= false,
> +	.pkt_access	= true,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type	= ARG_PTR_TO_MEM,
> +	.arg2_type	= ARG_CONST_SIZE,
> +	.arg3_type	= ARG_PTR_TO_UNINIT_MEM,
> +	.arg4_type	= ARG_CONST_SIZE,
> +};
> +
>  #endif /* CONFIG_INET */
>  
>  bool bpf_helper_changes_pkt_data(void *func)
> @@ -7825,6 +7861,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>  		return &bpf_tcp_raw_gen_syncookie_proto;
>  	case BPF_FUNC_tcp_raw_check_syncookie:
>  		return &bpf_tcp_raw_check_syncookie_proto;
> +	case BPF_FUNC_tcp_raw_gen_tscookie:
> +		return &bpf_tcp_raw_gen_tscookie_proto;
>  #endif
>  	default:
>  		return bpf_sk_base_func_proto(func_id);
> diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
> index 8696dc343ad2..4dd2c7a096eb 100644
> --- a/net/ipv4/syncookies.c
> +++ b/net/ipv4/syncookies.c
> @@ -85,6 +85,66 @@ u64 cookie_init_timestamp(struct request_sock *req, u64 now)
>  	return (u64)ts * (NSEC_PER_SEC / TCP_TS_HZ);
>  }
>  
> +bool cookie_init_timestamp_raw(struct tcphdr *th, __be32 *tsval, __be32 *tsecr)
> +{
> +	int length = (th->doff * 4) - sizeof(*th);
> +	u8 wscale = TS_OPT_WSCALE_MASK;
> +	bool option_timestamp = false;
> +	bool option_sack = false;
> +	u32 cookie;
> +	u8 *ptr;
> +
> +	ptr = (u8 *)(th + 1);
> +
> +	while (length > 0) {
> +		u8 opcode = *ptr++;
> +		u8 opsize;
> +
> +		if (opcode == TCPOPT_EOL)
> +			break;
> +		if (opcode == TCPOPT_NOP) {
> +			length--;
> +			continue;
> +		}
> +
> +		if (length < 2)
> +			break;
> +		opsize = *ptr++;
> +		if (opsize < 2)
> +			break;
> +		if (opsize > length)
> +			break;
> +
> +		switch (opcode) {
> +		case TCPOPT_WINDOW:

You must check osize.

> +			wscale = min_t(u8, *ptr, TCP_MAX_WSCALE);
> +			break;
> +		case TCPOPT_TIMESTAMP:

You must check opsize.

> +			option_timestamp = true;
> +			/* Client's tsval becomes our tsecr. */
> +			*tsecr = cpu_to_be32(get_unaligned_be32(ptr));

Please avoid useless ntohl/htonl dance (even if compiler probably optimizes this)
No need to obfuscate :)

			*tsecr = get_unaligned((__be32 *)ptr);

> +			break;
> +		case TCPOPT_SACK_PERM:
> +			option_sack = true;
> +			break;
> +		}
> +
> +		ptr += opsize - 2;
> +		length -= opsize;
> +	}
> +
> +	if (!option_timestamp)
> +		return false;
> +
> +	cookie = tcp_time_stamp_raw() & ~TSMASK;
> +	cookie |= wscale & TS_OPT_WSCALE_MASK;
> +	if (option_sack)
> +		cookie |= TS_OPT_SACK;
> +	if (th->ece && th->cwr)
> +		cookie |= TS_OPT_ECN;
> +	*tsval = cpu_to_be32(cookie);
> +	return true;
> +}
>  
>  static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport,
>  				   __be16 dport, __u32 sseq, __u32 data)
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index e32f72077250..791790b41874 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -5053,6 +5053,32 @@ union bpf_attr {
>   *
>   *		**-EPROTONOSUPPORT** if the IP version is not 4 or 6 (or 6, but
>   *		CONFIG_IPV6 is disabled).
> + *
> + * int bpf_tcp_raw_gen_tscookie(struct tcphdr *th, u32 th_len, __be32 *tsopt, u32 tsopt_len)
> + *	Description
> + *		Try to generate a timestamp cookie which encodes some of the
> + *		flags sent by the client in the SYN packet: SACK support, ECN
> + *		support, window scale. To be used with SYN cookies.
> + *
> + *		*th* points to the start of the TCP header of the client's SYN
> + *		packet, while *th_len* contains the length of the TCP header (at
> + *		least **sizeof**\ (**struct tcphdr**)).
> + *
> + *		*tsopt* points to the output location where to put the resulting
> + *		timestamp values: tsval and tsecr, in the format of the TCP
> + *		timestamp option.
> + *
> + *	Return
> + *		On success, 0.
> + *
> + *		On failure, the returned value is one of the following:
> + *
> + *		**-EINVAL** if the input arguments are invalid.
> + *
> + *		**-ENOENT** if the TCP header doesn't have the timestamp option.
> + *
> + *		**-EOPNOTSUPP** if the kernel configuration does not enable SYN
> + *		cookies (CONFIG_SYN_COOKIES is off).
>   */
>  #define __BPF_FUNC_MAPPER(FN)		\
>  	FN(unspec),			\
> @@ -5238,6 +5264,7 @@ union bpf_attr {
>  	FN(ct_release),			\
>  	FN(tcp_raw_gen_syncookie),	\
>  	FN(tcp_raw_check_syncookie),	\
> +	FN(tcp_raw_gen_tscookie),	\
>  	/* */
>  
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>
Maxim Mikityanskiy Oct. 20, 2021, 1:16 p.m. UTC | #2
On 2021-10-19 19:45, Eric Dumazet wrote:
> 
> 
> On 10/19/21 7:46 AM, Maxim Mikityanskiy wrote:
>> The new helper bpf_tcp_raw_gen_tscookie allows an XDP program to
>> generate timestamp cookies (to be used together with SYN cookies) which
>> encode different options set by the client in the SYN packet: SACK
>> support, ECN support, window scale. These options are encoded in lower
>> bits of the timestamp, which will be returned by the client in a
>> subsequent ACK packet. The format is the same used by synproxy.
>>
>> Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
>> Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
>> ---
>>   include/net/tcp.h              |  1 +
>>   include/uapi/linux/bpf.h       | 27 +++++++++++++++
>>   net/core/filter.c              | 38 +++++++++++++++++++++
>>   net/ipv4/syncookies.c          | 60 ++++++++++++++++++++++++++++++++++
>>   tools/include/uapi/linux/bpf.h | 27 +++++++++++++++
>>   5 files changed, 153 insertions(+)
>>
>> diff --git a/include/net/tcp.h b/include/net/tcp.h
>> index 1cc96a225848..651820bef6a2 100644
>> --- a/include/net/tcp.h
>> +++ b/include/net/tcp.h
>> @@ -564,6 +564,7 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
>>   			      u16 *mssp);
>>   __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss);
>>   u64 cookie_init_timestamp(struct request_sock *req, u64 now);
>> +bool cookie_init_timestamp_raw(struct tcphdr *th, __be32 *tsval, __be32 *tsecr);
>>   bool cookie_timestamp_decode(const struct net *net,
>>   			     struct tcp_options_received *opt);
>>   bool cookie_ecn_ok(const struct tcp_options_received *opt,
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index e32f72077250..791790b41874 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -5053,6 +5053,32 @@ union bpf_attr {
>>    *
>>    *		**-EPROTONOSUPPORT** if the IP version is not 4 or 6 (or 6, but
>>    *		CONFIG_IPV6 is disabled).
>> + *
>> + * int bpf_tcp_raw_gen_tscookie(struct tcphdr *th, u32 th_len, __be32 *tsopt, u32 tsopt_len)
>> + *	Description
>> + *		Try to generate a timestamp cookie which encodes some of the
>> + *		flags sent by the client in the SYN packet: SACK support, ECN
>> + *		support, window scale. To be used with SYN cookies.
>> + *
>> + *		*th* points to the start of the TCP header of the client's SYN
>> + *		packet, while *th_len* contains the length of the TCP header (at
>> + *		least **sizeof**\ (**struct tcphdr**)).
>> + *
>> + *		*tsopt* points to the output location where to put the resulting
>> + *		timestamp values: tsval and tsecr, in the format of the TCP
>> + *		timestamp option.
>> + *
>> + *	Return
>> + *		On success, 0.
>> + *
>> + *		On failure, the returned value is one of the following:
>> + *
>> + *		**-EINVAL** if the input arguments are invalid.
>> + *
>> + *		**-ENOENT** if the TCP header doesn't have the timestamp option.
>> + *
>> + *		**-EOPNOTSUPP** if the kernel configuration does not enable SYN
>> + *		cookies (CONFIG_SYN_COOKIES is off).
>>    */
>>   #define __BPF_FUNC_MAPPER(FN)		\
>>   	FN(unspec),			\
>> @@ -5238,6 +5264,7 @@ union bpf_attr {
>>   	FN(ct_release),			\
>>   	FN(tcp_raw_gen_syncookie),	\
>>   	FN(tcp_raw_check_syncookie),	\
>> +	FN(tcp_raw_gen_tscookie),	\
>>   	/* */
>>   
>>   /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>> diff --git a/net/core/filter.c b/net/core/filter.c
>> index 5f03d4a282a0..73fe20ef7442 100644
>> --- a/net/core/filter.c
>> +++ b/net/core/filter.c
>> @@ -7403,6 +7403,42 @@ static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_proto = {
>>   	.arg4_type	= ARG_CONST_SIZE,
>>   };
>>   
>> +BPF_CALL_4(bpf_tcp_raw_gen_tscookie, struct tcphdr *, th, u32, th_len,
>> +	   __be32 *, tsopt, u32, tsopt_len)
>> +{
>> +	int err;
>> +
>> +#ifdef CONFIG_SYN_COOKIES
>> +	if (tsopt_len != sizeof(u64)) {
>> +		err = -EINVAL;
>> +		goto err_out;
>> +	}
>> +
>> +	if (!cookie_init_timestamp_raw(th, &tsopt[0], &tsopt[1])) {
>> +		err = -ENOENT;
>> +		goto err_out;
>> +	}
>> +
>> +	return 0;
>> +err_out:
>> +#else
>> +	err = -EOPNOTSUPP;
>> +#endif
>> +	memset(tsopt, 0, tsopt_len);
>> +	return err;
>> +}
>> +
>> +static const struct bpf_func_proto bpf_tcp_raw_gen_tscookie_proto = {
>> +	.func		= bpf_tcp_raw_gen_tscookie,
>> +	.gpl_only	= false,
>> +	.pkt_access	= true,
>> +	.ret_type	= RET_INTEGER,
>> +	.arg1_type	= ARG_PTR_TO_MEM,
>> +	.arg2_type	= ARG_CONST_SIZE,
>> +	.arg3_type	= ARG_PTR_TO_UNINIT_MEM,
>> +	.arg4_type	= ARG_CONST_SIZE,
>> +};
>> +
>>   #endif /* CONFIG_INET */
>>   
>>   bool bpf_helper_changes_pkt_data(void *func)
>> @@ -7825,6 +7861,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>>   		return &bpf_tcp_raw_gen_syncookie_proto;
>>   	case BPF_FUNC_tcp_raw_check_syncookie:
>>   		return &bpf_tcp_raw_check_syncookie_proto;
>> +	case BPF_FUNC_tcp_raw_gen_tscookie:
>> +		return &bpf_tcp_raw_gen_tscookie_proto;
>>   #endif
>>   	default:
>>   		return bpf_sk_base_func_proto(func_id);
>> diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
>> index 8696dc343ad2..4dd2c7a096eb 100644
>> --- a/net/ipv4/syncookies.c
>> +++ b/net/ipv4/syncookies.c
>> @@ -85,6 +85,66 @@ u64 cookie_init_timestamp(struct request_sock *req, u64 now)
>>   	return (u64)ts * (NSEC_PER_SEC / TCP_TS_HZ);
>>   }
>>   
>> +bool cookie_init_timestamp_raw(struct tcphdr *th, __be32 *tsval, __be32 *tsecr)
>> +{
>> +	int length = (th->doff * 4) - sizeof(*th);
>> +	u8 wscale = TS_OPT_WSCALE_MASK;
>> +	bool option_timestamp = false;
>> +	bool option_sack = false;
>> +	u32 cookie;
>> +	u8 *ptr;
>> +
>> +	ptr = (u8 *)(th + 1);
>> +
>> +	while (length > 0) {
>> +		u8 opcode = *ptr++;
>> +		u8 opsize;
>> +
>> +		if (opcode == TCPOPT_EOL)
>> +			break;
>> +		if (opcode == TCPOPT_NOP) {
>> +			length--;
>> +			continue;
>> +		}
>> +
>> +		if (length < 2)
>> +			break;
>> +		opsize = *ptr++;
>> +		if (opsize < 2)
>> +			break;
>> +		if (opsize > length)
>> +			break;
>> +
>> +		switch (opcode) {
>> +		case TCPOPT_WINDOW:
> 
> You must check osize.
> 
>> +			wscale = min_t(u8, *ptr, TCP_MAX_WSCALE);
>> +			break;
>> +		case TCPOPT_TIMESTAMP:
> 
> You must check opsize.

Ack.

>> +			option_timestamp = true;
>> +			/* Client's tsval becomes our tsecr. */
>> +			*tsecr = cpu_to_be32(get_unaligned_be32(ptr));
> 
> Please avoid useless ntohl/htonl dance (even if compiler probably optimizes this)
> No need to obfuscate :)

No obfuscation intended - I thought I was clearer this way. I can change it.

Thanks for reviewing!

> 
> 			*tsecr = get_unaligned((__be32 *)ptr);
> 
>> +			break;
>> +		case TCPOPT_SACK_PERM:
>> +			option_sack = true;
>> +			break;
>> +		}
>> +
>> +		ptr += opsize - 2;
>> +		length -= opsize;
>> +	}
>> +
>> +	if (!option_timestamp)
>> +		return false;
>> +
>> +	cookie = tcp_time_stamp_raw() & ~TSMASK;
>> +	cookie |= wscale & TS_OPT_WSCALE_MASK;
>> +	if (option_sack)
>> +		cookie |= TS_OPT_SACK;
>> +	if (th->ece && th->cwr)
>> +		cookie |= TS_OPT_ECN;
>> +	*tsval = cpu_to_be32(cookie);
>> +	return true;
>> +}
>>   
>>   static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport,
>>   				   __be16 dport, __u32 sseq, __u32 data)
>> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
>> index e32f72077250..791790b41874 100644
>> --- a/tools/include/uapi/linux/bpf.h
>> +++ b/tools/include/uapi/linux/bpf.h
>> @@ -5053,6 +5053,32 @@ union bpf_attr {
>>    *
>>    *		**-EPROTONOSUPPORT** if the IP version is not 4 or 6 (or 6, but
>>    *		CONFIG_IPV6 is disabled).
>> + *
>> + * int bpf_tcp_raw_gen_tscookie(struct tcphdr *th, u32 th_len, __be32 *tsopt, u32 tsopt_len)
>> + *	Description
>> + *		Try to generate a timestamp cookie which encodes some of the
>> + *		flags sent by the client in the SYN packet: SACK support, ECN
>> + *		support, window scale. To be used with SYN cookies.
>> + *
>> + *		*th* points to the start of the TCP header of the client's SYN
>> + *		packet, while *th_len* contains the length of the TCP header (at
>> + *		least **sizeof**\ (**struct tcphdr**)).
>> + *
>> + *		*tsopt* points to the output location where to put the resulting
>> + *		timestamp values: tsval and tsecr, in the format of the TCP
>> + *		timestamp option.
>> + *
>> + *	Return
>> + *		On success, 0.
>> + *
>> + *		On failure, the returned value is one of the following:
>> + *
>> + *		**-EINVAL** if the input arguments are invalid.
>> + *
>> + *		**-ENOENT** if the TCP header doesn't have the timestamp option.
>> + *
>> + *		**-EOPNOTSUPP** if the kernel configuration does not enable SYN
>> + *		cookies (CONFIG_SYN_COOKIES is off).
>>    */
>>   #define __BPF_FUNC_MAPPER(FN)		\
>>   	FN(unspec),			\
>> @@ -5238,6 +5264,7 @@ union bpf_attr {
>>   	FN(ct_release),			\
>>   	FN(tcp_raw_gen_syncookie),	\
>>   	FN(tcp_raw_check_syncookie),	\
>> +	FN(tcp_raw_gen_tscookie),	\
>>   	/* */
>>   
>>   /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>>
Lorenz Bauer Oct. 20, 2021, 3:56 p.m. UTC | #3
On Tue, 19 Oct 2021 at 15:49, Maxim Mikityanskiy <maximmi@nvidia.com> wrote:
>
> The new helper bpf_tcp_raw_gen_tscookie allows an XDP program to
> generate timestamp cookies (to be used together with SYN cookies) which
> encode different options set by the client in the SYN packet: SACK
> support, ECN support, window scale. These options are encoded in lower
> bits of the timestamp, which will be returned by the client in a
> subsequent ACK packet. The format is the same used by synproxy.
>
> Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
> Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
> ---
>  include/net/tcp.h              |  1 +
>  include/uapi/linux/bpf.h       | 27 +++++++++++++++
>  net/core/filter.c              | 38 +++++++++++++++++++++
>  net/ipv4/syncookies.c          | 60 ++++++++++++++++++++++++++++++++++
>  tools/include/uapi/linux/bpf.h | 27 +++++++++++++++
>  5 files changed, 153 insertions(+)
>
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 1cc96a225848..651820bef6a2 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -564,6 +564,7 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
>                               u16 *mssp);
>  __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss);
>  u64 cookie_init_timestamp(struct request_sock *req, u64 now);
> +bool cookie_init_timestamp_raw(struct tcphdr *th, __be32 *tsval, __be32 *tsecr);
>  bool cookie_timestamp_decode(const struct net *net,
>                              struct tcp_options_received *opt);
>  bool cookie_ecn_ok(const struct tcp_options_received *opt,
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index e32f72077250..791790b41874 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -5053,6 +5053,32 @@ union bpf_attr {
>   *
>   *             **-EPROTONOSUPPORT** if the IP version is not 4 or 6 (or 6, but
>   *             CONFIG_IPV6 is disabled).
> + *
> + * int bpf_tcp_raw_gen_tscookie(struct tcphdr *th, u32 th_len, __be32 *tsopt, u32 tsopt_len)

flags which must be 0?

> + *     Description
> + *             Try to generate a timestamp cookie which encodes some of the
> + *             flags sent by the client in the SYN packet: SACK support, ECN
> + *             support, window scale. To be used with SYN cookies.
> + *
> + *             *th* points to the start of the TCP header of the client's SYN
> + *             packet, while *th_len* contains the length of the TCP header (at
> + *             least **sizeof**\ (**struct tcphdr**)).
> + *
> + *             *tsopt* points to the output location where to put the resulting
> + *             timestamp values: tsval and tsecr, in the format of the TCP
> + *             timestamp option.
> + *
> + *     Return
> + *             On success, 0.
> + *
> + *             On failure, the returned value is one of the following:
> + *
> + *             **-EINVAL** if the input arguments are invalid.
> + *
> + *             **-ENOENT** if the TCP header doesn't have the timestamp option.
> + *
> + *             **-EOPNOTSUPP** if the kernel configuration does not enable SYN
> + *             cookies (CONFIG_SYN_COOKIES is off).
>   */
>  #define __BPF_FUNC_MAPPER(FN)          \
>         FN(unspec),                     \
> @@ -5238,6 +5264,7 @@ union bpf_attr {
>         FN(ct_release),                 \
>         FN(tcp_raw_gen_syncookie),      \
>         FN(tcp_raw_check_syncookie),    \
> +       FN(tcp_raw_gen_tscookie),       \
>         /* */
>
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 5f03d4a282a0..73fe20ef7442 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -7403,6 +7403,42 @@ static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_proto = {
>         .arg4_type      = ARG_CONST_SIZE,
>  };
>
> +BPF_CALL_4(bpf_tcp_raw_gen_tscookie, struct tcphdr *, th, u32, th_len,
> +          __be32 *, tsopt, u32, tsopt_len)
> +{
> +       int err;

Missing check for th_len?

> +
> +#ifdef CONFIG_SYN_COOKIES
> +       if (tsopt_len != sizeof(u64)) {

sizeof(u32) * 2? That u64 isn't really relevant here.

> +               err = -EINVAL;
> +               goto err_out;
> +       }
> +
> +       if (!cookie_init_timestamp_raw(th, &tsopt[0], &tsopt[1])) {
> +               err = -ENOENT;
> +               goto err_out;
> +       }
> +
> +       return 0;
> +err_out:
> +#else
> +       err = -EOPNOTSUPP;
> +#endif
> +       memset(tsopt, 0, tsopt_len);
> +       return err;
> +}
> +
> +static const struct bpf_func_proto bpf_tcp_raw_gen_tscookie_proto = {
> +       .func           = bpf_tcp_raw_gen_tscookie,
> +       .gpl_only       = false,
> +       .pkt_access     = true,
> +       .ret_type       = RET_INTEGER,
> +       .arg1_type      = ARG_PTR_TO_MEM,
> +       .arg2_type      = ARG_CONST_SIZE,
> +       .arg3_type      = ARG_PTR_TO_UNINIT_MEM,
> +       .arg4_type      = ARG_CONST_SIZE,
> +};
> +
>  #endif /* CONFIG_INET */
>
>  bool bpf_helper_changes_pkt_data(void *func)
> @@ -7825,6 +7861,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>                 return &bpf_tcp_raw_gen_syncookie_proto;
>         case BPF_FUNC_tcp_raw_check_syncookie:
>                 return &bpf_tcp_raw_check_syncookie_proto;
> +       case BPF_FUNC_tcp_raw_gen_tscookie:
> +               return &bpf_tcp_raw_gen_tscookie_proto;
>  #endif
>         default:
>                 return bpf_sk_base_func_proto(func_id);
> diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
> index 8696dc343ad2..4dd2c7a096eb 100644
> --- a/net/ipv4/syncookies.c
> +++ b/net/ipv4/syncookies.c
> @@ -85,6 +85,66 @@ u64 cookie_init_timestamp(struct request_sock *req, u64 now)
>         return (u64)ts * (NSEC_PER_SEC / TCP_TS_HZ);
>  }
>
> +bool cookie_init_timestamp_raw(struct tcphdr *th, __be32 *tsval, __be32 *tsecr)

I'm probably missing context, Is there something in this function that
means you can't implement it in BPF?

Lorenz

--
Lorenz Bauer  |  Systems Engineer
6th Floor, County Hall/The Riverside Building, SE1 7PB, UK

www.cloudflare.com
Toke Høiland-Jørgensen Oct. 20, 2021, 4:16 p.m. UTC | #4
Lorenz Bauer <lmb@cloudflare.com> writes:

>> +bool cookie_init_timestamp_raw(struct tcphdr *th, __be32 *tsval, __be32 *tsecr)
>
> I'm probably missing context, Is there something in this function that
> means you can't implement it in BPF?

I was about to reply with some other comments but upon closer inspection
I ended up at the same conclusion: this helper doesn't seem to be needed
at all?

-Toke
Maxim Mikityanskiy Oct. 22, 2021, 4:56 p.m. UTC | #5
On 2021-10-20 19:16, Toke Høiland-Jørgensen wrote:
> Lorenz Bauer <lmb@cloudflare.com> writes:
> 
>>> +bool cookie_init_timestamp_raw(struct tcphdr *th, __be32 *tsval, __be32 *tsecr)
>>
>> I'm probably missing context, Is there something in this function that
>> means you can't implement it in BPF?
> 
> I was about to reply with some other comments but upon closer inspection
> I ended up at the same conclusion: this helper doesn't seem to be needed
> at all?

tcp_time_stamp_raw() uses ktime_get_ns(), while bpf_ktime_get_ns() uses 
ktime_get_mono_fast_ns(). Is it fine to use ktime_get_mono_fast_ns() 
instead of ktime_get_ns()? I'm a bit worried about this note in 
Documentation/core-api/timekeeping.rst:

 > most drivers should never call them,
 > since the time is allowed to jump under certain conditions.
Lorenz Bauer Oct. 27, 2021, 8:34 a.m. UTC | #6
On Fri, 22 Oct 2021 at 17:56, Maxim Mikityanskiy <maximmi@nvidia.com> wrote:
>
> tcp_time_stamp_raw() uses ktime_get_ns(), while bpf_ktime_get_ns() uses
> ktime_get_mono_fast_ns(). Is it fine to use ktime_get_mono_fast_ns()
> instead of ktime_get_ns()? I'm a bit worried about this note in
> Documentation/core-api/timekeeping.rst:
>
>  > most drivers should never call them,
>  > since the time is allowed to jump under certain conditions.

That depends on what happens when the timestamp is "off". Since you're
sending this value over the network I doubt that the two methods will
show a difference.

Lorenz
Maxim Mikityanskiy Nov. 1, 2021, 11:14 a.m. UTC | #7
On 2021-10-20 19:16, Toke Høiland-Jørgensen wrote:
> Lorenz Bauer <lmb@cloudflare.com> writes:
> 
>>> +bool cookie_init_timestamp_raw(struct tcphdr *th, __be32 *tsval, __be32 *tsecr)
>>
>> I'm probably missing context, Is there something in this function that
>> means you can't implement it in BPF?
> 
> I was about to reply with some other comments but upon closer inspection
> I ended up at the same conclusion: this helper doesn't seem to be needed
> at all?

After trying to put this code into BPF (replacing the underlying 
ktime_get_ns with ktime_get_mono_fast_ns), I experienced issues with 
passing the verifier.

In addition to comparing ptr to end, I had to add checks that compare 
ptr to data_end, because the verifier can't deduce that end <= data_end. 
More branches will add a certain slowdown (not measured).

A more serious issue is the overall program complexity. Even though the 
loop over the TCP options has an upper bound, and the pointer advances 
by at least one byte every iteration, I had to limit the total number of 
iterations artificially. The maximum number of iterations that makes the 
verifier happy is 10. With more iterations, I have the following error:

BPF program is too large. Processed 1000001 insn 
 
 
                        processed 1000001 insns (limit 1000000) 
max_states_per_insn 29 total_states 35489 peak_states 596 mark_read 45

I assume that BPF_COMPLEXITY_LIMIT_INSNS (1 million) is the accumulated 
amount of instructions that the verifier can process in all branches, is 
that right? It doesn't look realistic that my program can run 1 million 
instructions in a single run, but it might be that if you take all 
possible flows and add up the instructions from these flows, it will 
exceed 1 million.

The limitation of maximum 10 TCP options might be not enough, given that 
valid packets are permitted to include more than 10 NOPs. An alternative 
of using bpf_load_hdr_opt and calling it three times doesn't look good 
either, because it will be about three times slower than going over the 
options once. So maybe having a helper for that is better than trying to 
fit it into BPF?

One more interesting fact is the time that it takes for the verifier to 
check my program. If it's limited to 10 iterations, it does it pretty 
fast, but if I try to increase the number to 11 iterations, it takes 
several minutes for the verifier to reach 1 million instructions and 
print the error then. I also tried grouping the NOPs in an inner loop to 
count only 10 real options, and the verifier has been running for a few 
hours without any response. Is it normal? Commit c04c0d2b968a ("bpf: 
increase complexity limit and maximum program size") says it shouldn't 
take more than one second in any case.

Thanks,
Max
Yonghong Song Nov. 3, 2021, 2:10 a.m. UTC | #8
On 11/1/21 4:14 AM, Maxim Mikityanskiy wrote:
> On 2021-10-20 19:16, Toke Høiland-Jørgensen wrote:
>> Lorenz Bauer <lmb@cloudflare.com> writes:
>>
>>>> +bool cookie_init_timestamp_raw(struct tcphdr *th, __be32 *tsval, 
>>>> __be32 *tsecr)
>>>
>>> I'm probably missing context, Is there something in this function that
>>> means you can't implement it in BPF?
>>
>> I was about to reply with some other comments but upon closer inspection
>> I ended up at the same conclusion: this helper doesn't seem to be needed
>> at all?
> 
> After trying to put this code into BPF (replacing the underlying 
> ktime_get_ns with ktime_get_mono_fast_ns), I experienced issues with 
> passing the verifier.
> 
> In addition to comparing ptr to end, I had to add checks that compare 
> ptr to data_end, because the verifier can't deduce that end <= data_end. 
> More branches will add a certain slowdown (not measured).
> 
> A more serious issue is the overall program complexity. Even though the 
> loop over the TCP options has an upper bound, and the pointer advances 
> by at least one byte every iteration, I had to limit the total number of 
> iterations artificially. The maximum number of iterations that makes the 
> verifier happy is 10. With more iterations, I have the following error:
> 
> BPF program is too large. Processed 1000001 insn
> 
>                         processed 1000001 insns (limit 1000000) 
> max_states_per_insn 29 total_states 35489 peak_states 596 mark_read 45
> 
> I assume that BPF_COMPLEXITY_LIMIT_INSNS (1 million) is the accumulated 
> amount of instructions that the verifier can process in all branches, is 
> that right? It doesn't look realistic that my program can run 1 million 
> instructions in a single run, but it might be that if you take all 
> possible flows and add up the instructions from these flows, it will 
> exceed 1 million.
> 
> The limitation of maximum 10 TCP options might be not enough, given that 
> valid packets are permitted to include more than 10 NOPs. An alternative 
> of using bpf_load_hdr_opt and calling it three times doesn't look good 
> either, because it will be about three times slower than going over the 
> options once. So maybe having a helper for that is better than trying to 
> fit it into BPF?
> 
> One more interesting fact is the time that it takes for the verifier to 
> check my program. If it's limited to 10 iterations, it does it pretty 
> fast, but if I try to increase the number to 11 iterations, it takes 
> several minutes for the verifier to reach 1 million instructions and 
> print the error then. I also tried grouping the NOPs in an inner loop to 
> count only 10 real options, and the verifier has been running for a few 
> hours without any response. Is it normal? 

Maxim, this may expose a verifier bug. Do you have a reproducer I can 
access? I would like to debug this to see what is the root case. Thanks!

> Commit c04c0d2b968a ("bpf: 
> increase complexity limit and maximum program size") says it shouldn't 
> take more than one second in any case.
> 
> Thanks,
> Max
Maxim Mikityanskiy Nov. 3, 2021, 2:02 p.m. UTC | #9
On 2021-11-03 04:10, Yonghong Song wrote:
> 
> 
> On 11/1/21 4:14 AM, Maxim Mikityanskiy wrote:
>> On 2021-10-20 19:16, Toke Høiland-Jørgensen wrote:
>>> Lorenz Bauer <lmb@cloudflare.com> writes:
>>>
>>>>> +bool cookie_init_timestamp_raw(struct tcphdr *th, __be32 *tsval, 
>>>>> __be32 *tsecr)
>>>>
>>>> I'm probably missing context, Is there something in this function that
>>>> means you can't implement it in BPF?
>>>
>>> I was about to reply with some other comments but upon closer inspection
>>> I ended up at the same conclusion: this helper doesn't seem to be needed
>>> at all?
>>
>> After trying to put this code into BPF (replacing the underlying 
>> ktime_get_ns with ktime_get_mono_fast_ns), I experienced issues with 
>> passing the verifier.
>>
>> In addition to comparing ptr to end, I had to add checks that compare 
>> ptr to data_end, because the verifier can't deduce that end <= 
>> data_end. More branches will add a certain slowdown (not measured).
>>
>> A more serious issue is the overall program complexity. Even though 
>> the loop over the TCP options has an upper bound, and the pointer 
>> advances by at least one byte every iteration, I had to limit the 
>> total number of iterations artificially. The maximum number of 
>> iterations that makes the verifier happy is 10. With more iterations, 
>> I have the following error:
>>
>> BPF program is too large. Processed 1000001 insn
>>
>>                         processed 1000001 insns (limit 1000000) 
>> max_states_per_insn 29 total_states 35489 peak_states 596 mark_read 45
>>
>> I assume that BPF_COMPLEXITY_LIMIT_INSNS (1 million) is the 
>> accumulated amount of instructions that the verifier can process in 
>> all branches, is that right? It doesn't look realistic that my program 
>> can run 1 million instructions in a single run, but it might be that 
>> if you take all possible flows and add up the instructions from these 
>> flows, it will exceed 1 million.
>>
>> The limitation of maximum 10 TCP options might be not enough, given 
>> that valid packets are permitted to include more than 10 NOPs. An 
>> alternative of using bpf_load_hdr_opt and calling it three times 
>> doesn't look good either, because it will be about three times slower 
>> than going over the options once. So maybe having a helper for that is 
>> better than trying to fit it into BPF?
>>
>> One more interesting fact is the time that it takes for the verifier 
>> to check my program. If it's limited to 10 iterations, it does it 
>> pretty fast, but if I try to increase the number to 11 iterations, it 
>> takes several minutes for the verifier to reach 1 million instructions 
>> and print the error then. I also tried grouping the NOPs in an inner 
>> loop to count only 10 real options, and the verifier has been running 
>> for a few hours without any response. Is it normal? 
> 
> Maxim, this may expose a verifier bug. Do you have a reproducer I can 
> access? I would like to debug this to see what is the root case. Thanks!

Thanks, I appreciate your help in debugging it. The reproducer is based 
on the modified XDP program from patch 10 in this series. You'll need to 
apply at least patches 6, 7, 8 from this series to get new BPF helpers 
needed for the XDP program (tell me if that's a problem, I can try to 
remove usage of new helpers, but it will affect the program length and 
may produce different results in the verifier).

See the C code of the program that passes the verifier (compiled with 
clang version 12.0.0-1ubuntu1) in the bottom of this email. If you 
increase the loop boundary from 10 to at least 11 in 
cookie_init_timestamp_raw(), it fails the verifier after a few minutes. 
If you apply this tiny change, it fails the verifier after about 3 hours:

--- a/samples/bpf/syncookie_kern.c
+++ b/samples/bpf/syncookie_kern.c
@@ -167,6 +167,7 @@ static __always_inline bool cookie_init_
  	for (i = 0; i < 10; i++) {
  		u8 opcode, opsize;

+skip_nop:
  		if (ptr >= end)
  			break;
  		if (ptr + 1 > data_end)
@@ -178,7 +179,7 @@ static __always_inline bool cookie_init_
  			break;
  		if (opcode == TCPOPT_NOP) {
  			++ptr;
-			continue;
+			goto skip_nop;
  		}

  		if (ptr + 1 >= end)

--cut--

// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights 
reserved. */

#include <stdbool.h>
#include <stddef.h>

#include <uapi/linux/errno.h>
#include <uapi/linux/bpf.h>
#include <uapi/linux/pkt_cls.h>
#include <uapi/linux/if_ether.h>
#include <uapi/linux/in.h>
#include <uapi/linux/ip.h>
#include <uapi/linux/ipv6.h>
#include <uapi/linux/tcp.h>
#include <uapi/linux/netfilter/nf_conntrack_common.h>
#include <linux/minmax.h>
#include <vdso/time64.h>
#include <asm/unaligned.h>

#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#define DEFAULT_MSS4 1460
#define DEFAULT_MSS6 1440
#define DEFAULT_WSCALE 7
#define DEFAULT_TTL 64
#define MAX_ALLOWED_PORTS 8

struct bpf_map_def SEC("maps") values = {
	.type = BPF_MAP_TYPE_ARRAY,
	.key_size = sizeof(__u32),
	.value_size = sizeof(__u64),
	.max_entries = 2,
};

struct bpf_map_def SEC("maps") allowed_ports = {
	.type = BPF_MAP_TYPE_ARRAY,
	.key_size = sizeof(__u32),
	.value_size = sizeof(__u16),
	.max_entries = MAX_ALLOWED_PORTS,
};

#define IP_DF 0x4000
#define IP_MF 0x2000
#define IP_OFFSET 0x1fff

#define NEXTHDR_TCP 6

#define TCPOPT_NOP 1
#define TCPOPT_EOL 0
#define TCPOPT_MSS 2
#define TCPOPT_WINDOW 3
#define TCPOPT_SACK_PERM 4
#define TCPOPT_TIMESTAMP 8

#define TCPOLEN_MSS 4
#define TCPOLEN_WINDOW 3
#define TCPOLEN_SACK_PERM 2
#define TCPOLEN_TIMESTAMP 10

#define TCP_MAX_WSCALE 14U

#define TS_OPT_WSCALE_MASK 0xf
#define TS_OPT_SACK BIT(4)
#define TS_OPT_ECN BIT(5)
#define TSBITS 6
#define TSMASK (((__u32)1 << TSBITS) - 1)

#define TCP_TS_HZ 1000

#define IPV4_MAXLEN 60
#define TCP_MAXLEN 60

static __always_inline void swap_eth_addr(__u8 *a, __u8 *b)
{
	__u8 tmp[ETH_ALEN];

	__builtin_memcpy(tmp, a, ETH_ALEN);
	__builtin_memcpy(a, b, ETH_ALEN);
	__builtin_memcpy(b, tmp, ETH_ALEN);
}

static __always_inline __u16 csum_fold(__u32 csum)
{
	csum = (csum & 0xffff) + (csum >> 16);
	csum = (csum & 0xffff) + (csum >> 16);
	return (__u16)~csum;
}

static __always_inline __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
					       __u32 len, __u8 proto,
					       __u32 csum)
{
	__u64 s = csum;

	s += (__u32)saddr;
	s += (__u32)daddr;
#if defined(__BIG_ENDIAN__)
	s += proto + len;
#elif defined(__LITTLE_ENDIAN__)
	s += (proto + len) << 8;
#else
#error Unknown endian
#endif
	s = (s & 0xffffffff) + (s >> 32);
	s = (s & 0xffffffff) + (s >> 32);

	return csum_fold((__u32)s);
}

static __always_inline __u16 csum_ipv6_magic(const struct in6_addr *saddr,
					     const struct in6_addr *daddr,
					     __u32 len, __u8 proto, __u32 csum)
{
	__u64 sum = csum;
	int i;

#pragma unroll
	for (i = 0; i < 4; i++)
		sum += (__u32)saddr->s6_addr32[i];

#pragma unroll
	for (i = 0; i < 4; i++)
		sum += (__u32)daddr->s6_addr32[i];

	// Don't combine additions to avoid 32-bit overflow.
	sum += bpf_htonl(len);
	sum += bpf_htonl(proto);

	sum = (sum & 0xffffffff) + (sum >> 32);
	sum = (sum & 0xffffffff) + (sum >> 32);

	return csum_fold((__u32)sum);
}

static __always_inline u64 tcp_clock_ns(void)
{
	return bpf_ktime_get_ns();
}

static __always_inline __u32 tcp_ns_to_ts(__u64 ns)
{
	return div_u64(ns, NSEC_PER_SEC / TCP_TS_HZ);
}

static __always_inline __u32 tcp_time_stamp_raw(void)
{
	return tcp_ns_to_ts(tcp_clock_ns());
}

static __always_inline bool cookie_init_timestamp_raw(struct tcphdr 
*tcp_header,
						      __u16 tcp_len,
						      __be32 *tsval,
						      __be32 *tsecr,
						      void *data_end)
{
	u8 wscale = TS_OPT_WSCALE_MASK;
	bool option_timestamp = false;
	bool option_sack = false;
	u8 *ptr, *end;
	u32 cookie;
	int i;

	ptr = (u8 *)(tcp_header + 1);
	end = (u8 *)tcp_header + tcp_len;

	for (i = 0; i < 10; i++) {
		u8 opcode, opsize;

		if (ptr >= end)
			break;
		if (ptr + 1 > data_end)
			return false;

		opcode = ptr[0];

		if (opcode == TCPOPT_EOL)
			break;
		if (opcode == TCPOPT_NOP) {
			++ptr;
			continue;
		}

		if (ptr + 1 >= end)
			break;
		if (ptr + 2 > data_end)
			return false;
		opsize = ptr[1];
		if (opsize < 2)
			break;

		if (ptr + opsize > end)
			break;

		switch (opcode) {
		case TCPOPT_WINDOW:
			if (opsize == TCPOLEN_WINDOW) {
				if (ptr + TCPOLEN_WINDOW > data_end)
					return false;
				wscale = min_t(u8, ptr[2], TCP_MAX_WSCALE);
			}
			break;
		case TCPOPT_TIMESTAMP:
			if (opsize == TCPOLEN_TIMESTAMP) {
				if (ptr + TCPOLEN_TIMESTAMP > data_end)
					return false;
				option_timestamp = true;
				/* Client's tsval becomes our tsecr. */
				*tsecr = get_unaligned((__be32 *)(ptr + 2));
			}
			break;
		case TCPOPT_SACK_PERM:
			if (opsize == TCPOLEN_SACK_PERM)
				option_sack = true;
			break;
		}

		ptr += opsize;
	}

	if (!option_timestamp)
		return false;

	cookie = tcp_time_stamp_raw() & ~TSMASK;
	cookie |= wscale & TS_OPT_WSCALE_MASK;
	if (option_sack)
		cookie |= TS_OPT_SACK;
	if (tcp_header->ece && tcp_header->cwr)
		cookie |= TS_OPT_ECN;
	*tsval = cpu_to_be32(cookie);

	return true;
}

static __always_inline void values_get_tcpipopts(__u16 *mss, __u8 *wscale,
						 __u8 *ttl, bool ipv6)
{
	__u32 key = 0;
	__u64 *value;

	value = bpf_map_lookup_elem(&values, &key);
	if (value && *value != 0) {
		if (ipv6)
			*mss = (*value >> 32) & 0xffff;
		else
			*mss = *value & 0xffff;
		*wscale = (*value >> 16) & 0xf;
		*ttl = (*value >> 24) & 0xff;
		return;
	}

	*mss = ipv6 ? DEFAULT_MSS6 : DEFAULT_MSS4;
	*wscale = DEFAULT_WSCALE;
	*ttl = DEFAULT_TTL;
}

static __always_inline void values_inc_synacks(void)
{
	__u32 key = 1;
	__u32 *value;

	value = bpf_map_lookup_elem(&values, &key);
	if (value)
		__sync_fetch_and_add(value, 1);
}

static __always_inline bool check_port_allowed(__u16 port)
{
	__u32 i;

	for (i = 0; i < MAX_ALLOWED_PORTS; i++) {
		__u32 key = i;
		__u16 *value;

		value = bpf_map_lookup_elem(&allowed_ports, &key);

		if (!value)
			break;
		// 0 is a terminator value. Check it first to avoid matching on
		// a forbidden port == 0 and returning true.
		if (*value == 0)
			break;

		if (*value == port)
			return true;
	}

	return false;
}

struct header_pointers {
	struct ethhdr *eth;
	struct iphdr *ipv4;
	struct ipv6hdr *ipv6;
	struct tcphdr *tcp;
	__u16 tcp_len;
};

static __always_inline int tcp_dissect(void *data, void *data_end,
				       struct header_pointers *hdr)
{
	hdr->eth = data;
	if (hdr->eth + 1 > data_end)
		return XDP_DROP;

	switch (bpf_ntohs(hdr->eth->h_proto)) {
	case ETH_P_IP:
		hdr->ipv6 = NULL;

		hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth);
		if (hdr->ipv4 + 1 > data_end)
			return XDP_DROP;
		if (hdr->ipv4->ihl * 4 < sizeof(*hdr->ipv4))
			return XDP_DROP;
		if (hdr->ipv4->version != 4)
			return XDP_DROP;

		if (hdr->ipv4->protocol != IPPROTO_TCP)
			return XDP_PASS;

		hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4;
		break;
	case ETH_P_IPV6:
		hdr->ipv4 = NULL;

		hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth);
		if (hdr->ipv6 + 1 > data_end)
			return XDP_DROP;
		if (hdr->ipv6->version != 6)
			return XDP_DROP;

		// XXX: Extension headers are not supported and could circumvent
		// XDP SYN flood protection.
		if (hdr->ipv6->nexthdr != NEXTHDR_TCP)
			return XDP_PASS;

		hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6);
		break;
	default:
		// XXX: VLANs will circumvent XDP SYN flood protection.
		return XDP_PASS;
	}

	if (hdr->tcp + 1 > data_end)
		return XDP_DROP;
	hdr->tcp_len = hdr->tcp->doff * 4;
	if (hdr->tcp_len < sizeof(*hdr->tcp))
		return XDP_DROP;

	return XDP_TX;
}

static __always_inline __u8 tcp_mkoptions(__be32 *buf, __be32 *tsopt, 
__u16 mss,
					  __u8 wscale)
{
	__be32 *start = buf;

	*buf++ = bpf_htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);

	if (!tsopt)
		return buf - start;

	if (tsopt[0] & bpf_htonl(1 << 4))
		*buf++ = bpf_htonl((TCPOPT_SACK_PERM << 24) |
				   (TCPOLEN_SACK_PERM << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
	else
		*buf++ = bpf_htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
	*buf++ = tsopt[0];
	*buf++ = tsopt[1];

	if ((tsopt[0] & bpf_htonl(0xf)) != bpf_htonl(0xf))
		*buf++ = bpf_htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_WINDOW << 16) |
				   (TCPOLEN_WINDOW << 8) |
				   wscale);

	return buf - start;
}

static __always_inline void tcp_gen_synack(struct tcphdr *tcp_header,
					   __u32 cookie, __be32 *tsopt,
					   __u16 mss, __u8 wscale)
{
	void *tcp_options;

	tcp_flag_word(tcp_header) = TCP_FLAG_SYN | TCP_FLAG_ACK;
	if (tsopt && (tsopt[0] & bpf_htonl(1 << 5)))
		tcp_flag_word(tcp_header) |= TCP_FLAG_ECE;
	tcp_header->doff = 5; // doff is part of tcp_flag_word.
	swap(tcp_header->source, tcp_header->dest);
	tcp_header->ack_seq = bpf_htonl(bpf_ntohl(tcp_header->seq) + 1);
	tcp_header->seq = bpf_htonl(cookie);
	tcp_header->window = 0;
	tcp_header->urg_ptr = 0;
	tcp_header->check = 0; // Rely on hardware checksum offload.

	tcp_options = (void *)(tcp_header + 1);
	tcp_header->doff += tcp_mkoptions(tcp_options, tsopt, mss, wscale);
}

static __always_inline void tcpv4_gen_synack(struct header_pointers *hdr,
					     __u32 cookie, __be32 *tsopt)
{
	__u8 wscale;
	__u16 mss;
	__u8 ttl;

	values_get_tcpipopts(&mss, &wscale, &ttl, false);

	swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest);

	swap(hdr->ipv4->saddr, hdr->ipv4->daddr);
	hdr->ipv4->check = 0; // Rely on hardware checksum offload.
	hdr->ipv4->tos = 0;
	hdr->ipv4->id = 0;
	hdr->ipv4->ttl = ttl;

	tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale);

	hdr->tcp_len = hdr->tcp->doff * 4;
	hdr->ipv4->tot_len = bpf_htons(sizeof(*hdr->ipv4) + hdr->tcp_len);
}

static __always_inline void tcpv6_gen_synack(struct header_pointers *hdr,
					     __u32 cookie, __be32 *tsopt)
{
	__u8 wscale;
	__u16 mss;
	__u8 ttl;

	values_get_tcpipopts(&mss, &wscale, &ttl, true);

	swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest);

	swap(hdr->ipv6->saddr, hdr->ipv6->daddr);
	*(__be32 *)hdr->ipv6 = bpf_htonl(0x60000000);
	hdr->ipv6->hop_limit = ttl;

	tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale);

	hdr->tcp_len = hdr->tcp->doff * 4;
	hdr->ipv6->payload_len = bpf_htons(hdr->tcp_len);
}

static __always_inline int syncookie_handle_syn(struct header_pointers *hdr,
						struct xdp_md *ctx,
						void *data, void *data_end)
{
	__u32 old_pkt_size, new_pkt_size;
	// Unlike clang 10, clang 11 and 12 generate code that doesn't pass the
	// BPF verifier if tsopt is not volatile. Volatile forces it to store
	// the pointer value and use it directly, otherwise tcp_mkoptions is
	// (mis)compiled like this:
	//   if (!tsopt)
	//       return buf - start;
	//   reg = stored_return_value_of_bpf_tcp_raw_gen_tscookie;
	//   if (reg == 0)
	//       tsopt = tsopt_buf;
	//   else
	//       tsopt = NULL;
	//   ...
	//   *buf++ = tsopt[1];
	// It creates a dead branch where tsopt is assigned NULL, but the
	// verifier can't prove it's dead and blocks the program.
	__be32 * volatile tsopt = NULL;
	__be32 tsopt_buf[2];
	void *ip_header;
	__u16 ip_len;
	__u32 cookie;
	__s64 value;

	if (hdr->ipv4) {
		// Check the IPv4 and TCP checksums before creating a SYNACK.
		value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, hdr->ipv4->ihl * 4, 0);
		if (value < 0)
			return XDP_ABORTED;
		if (csum_fold(value) != 0)
			return XDP_DROP; // Bad IPv4 checksum.

		value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
		if (value < 0)
			return XDP_ABORTED;
		if (csum_tcpudp_magic(hdr->ipv4->saddr, hdr->ipv4->daddr,
				      hdr->tcp_len, IPPROTO_TCP, value) != 0)
			return XDP_DROP; // Bad TCP checksum.

		ip_header = hdr->ipv4;
		ip_len = sizeof(*hdr->ipv4);
	} else if (hdr->ipv6) {
		// Check the TCP checksum before creating a SYNACK.
		value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
		if (value < 0)
			return XDP_ABORTED;
		if (csum_ipv6_magic(&hdr->ipv6->saddr, &hdr->ipv6->daddr,
				    hdr->tcp_len, IPPROTO_TCP, value) != 0)
			return XDP_DROP; // Bad TCP checksum.

		ip_header = hdr->ipv6;
		ip_len = sizeof(*hdr->ipv6);
	} else {
		return XDP_ABORTED;
	}

	// Issue SYN cookies on allowed ports, drop SYN packets on
	// blocked ports.
	if (!check_port_allowed(bpf_ntohs(hdr->tcp->dest)))
		return XDP_DROP;

	value = bpf_tcp_raw_gen_syncookie(ip_header, ip_len,
					  (void *)hdr->tcp, hdr->tcp_len);
	if (value < 0)
		return XDP_ABORTED;
	cookie = (__u32)value;

	if (cookie_init_timestamp_raw((void *)hdr->tcp, hdr->tcp_len,
				      &tsopt_buf[0], &tsopt_buf[1], data_end))
		tsopt = tsopt_buf;

	// Check that there is enough space for a SYNACK. It also covers
	// the check that the destination of the __builtin_memmove below
	// doesn't overflow.
	if (data + sizeof(*hdr->eth) + ip_len + TCP_MAXLEN > data_end)
		return XDP_ABORTED;

	if (hdr->ipv4) {
		if (hdr->ipv4->ihl * 4 > sizeof(*hdr->ipv4)) {
			struct tcphdr *new_tcp_header;

			new_tcp_header = data + sizeof(*hdr->eth) + sizeof(*hdr->ipv4);
			__builtin_memmove(new_tcp_header, hdr->tcp, sizeof(*hdr->tcp));
			hdr->tcp = new_tcp_header;

			hdr->ipv4->ihl = sizeof(*hdr->ipv4) / 4;
		}

		tcpv4_gen_synack(hdr, cookie, tsopt);
	} else if (hdr->ipv6) {
		tcpv6_gen_synack(hdr, cookie, tsopt);
	} else {
		return XDP_ABORTED;
	}

	// Recalculate checksums.
	hdr->tcp->check = 0;
	value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
	if (value < 0)
		return XDP_ABORTED;
	if (hdr->ipv4) {
		hdr->tcp->check = csum_tcpudp_magic(hdr->ipv4->saddr,
						    hdr->ipv4->daddr,
						    hdr->tcp_len,
						    IPPROTO_TCP,
						    value);

		hdr->ipv4->check = 0;
		value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, sizeof(*hdr->ipv4), 0);
		if (value < 0)
			return XDP_ABORTED;
		hdr->ipv4->check = csum_fold(value);
	} else if (hdr->ipv6) {
		hdr->tcp->check = csum_ipv6_magic(&hdr->ipv6->saddr,
						  &hdr->ipv6->daddr,
						  hdr->tcp_len,
						  IPPROTO_TCP,
						  value);
	} else {
		return XDP_ABORTED;
	}

	// Set the new packet size.
	old_pkt_size = data_end - data;
	new_pkt_size = sizeof(*hdr->eth) + ip_len + hdr->tcp->doff * 4;
	if (bpf_xdp_adjust_tail(ctx, new_pkt_size - old_pkt_size))
		return XDP_ABORTED;

	values_inc_synacks();

	return XDP_TX;
}

static __always_inline int syncookie_handle_ack(struct header_pointers *hdr)
{
	int err;

	if (hdr->ipv4)
		err = bpf_tcp_raw_check_syncookie(hdr->ipv4, sizeof(*hdr->ipv4),
						  (void *)hdr->tcp, hdr->tcp_len);
	else if (hdr->ipv6)
		err = bpf_tcp_raw_check_syncookie(hdr->ipv6, sizeof(*hdr->ipv6),
						  (void *)hdr->tcp, hdr->tcp_len);
	else
		return XDP_ABORTED;
	if (err)
		return XDP_DROP;

	return XDP_PASS;
}

SEC("xdp/syncookie")
int syncookie_xdp(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data = (void *)(long)ctx->data;
	struct header_pointers hdr;
	struct bpf_sock_tuple tup;
	struct bpf_nf_conn *ct;
	__u32 tup_size;
	__s64 value;
	int ret;

	ret = tcp_dissect(data, data_end, &hdr);
	if (ret != XDP_TX)
		return ret;

	if (hdr.ipv4) {
		// TCP doesn't normally use fragments, and XDP can't reassemble them.
		if ((hdr.ipv4->frag_off & bpf_htons(IP_DF | IP_MF | IP_OFFSET)) != 
bpf_htons(IP_DF))
			return XDP_DROP;

		tup.ipv4.saddr = hdr.ipv4->saddr;
		tup.ipv4.daddr = hdr.ipv4->daddr;
		tup.ipv4.sport = hdr.tcp->source;
		tup.ipv4.dport = hdr.tcp->dest;
		tup_size = sizeof(tup.ipv4);
	} else if (hdr.ipv6) {
		__builtin_memcpy(tup.ipv6.saddr, &hdr.ipv6->saddr, 
sizeof(tup.ipv6.saddr));
		__builtin_memcpy(tup.ipv6.daddr, &hdr.ipv6->daddr, 
sizeof(tup.ipv6.daddr));
		tup.ipv6.sport = hdr.tcp->source;
		tup.ipv6.dport = hdr.tcp->dest;
		tup_size = sizeof(tup.ipv6);
	} else {
		// The verifier can't track that either ipv4 or ipv6 is not NULL.
		return XDP_ABORTED;
	}

	value = 0; // Flags.
	ct = bpf_ct_lookup_tcp(ctx, &tup, tup_size, BPF_F_CURRENT_NETNS, &value);
	if (ct) {
		unsigned long status = ct->status;

		bpf_ct_release(ct);
		if (status & IPS_CONFIRMED_BIT)
			return XDP_PASS;
	} else if (value != -ENOENT) {
		return XDP_ABORTED;
	}

	// value == -ENOENT || !(status & IPS_CONFIRMED_BIT)

	if ((hdr.tcp->syn ^ hdr.tcp->ack) != 1)
		return XDP_DROP;

	// Grow the TCP header to TCP_MAXLEN to be able to pass any hdr.tcp_len
	// to bpf_tcp_raw_gen_syncookie and pass the verifier.
	if (bpf_xdp_adjust_tail(ctx, TCP_MAXLEN - hdr.tcp_len))
		return XDP_ABORTED;

	data_end = (void *)(long)ctx->data_end;
	data = (void *)(long)ctx->data;

	if (hdr.ipv4) {
		hdr.eth = data;
		hdr.ipv4 = (void *)hdr.eth + sizeof(*hdr.eth);
		// IPV4_MAXLEN is needed when calculating checksum.
		// At least sizeof(struct iphdr) is needed here to access ihl.
		if ((void *)hdr.ipv4 + IPV4_MAXLEN > data_end)
			return XDP_ABORTED;
		hdr.tcp = (void *)hdr.ipv4 + hdr.ipv4->ihl * 4;
	} else if (hdr.ipv6) {
		hdr.eth = data;
		hdr.ipv6 = (void *)hdr.eth + sizeof(*hdr.eth);
		hdr.tcp = (void *)hdr.ipv6 + sizeof(*hdr.ipv6);
	} else {
		return XDP_ABORTED;
	}

	if ((void *)hdr.tcp + TCP_MAXLEN > data_end)
		return XDP_ABORTED;

	// We run out of registers, tcp_len gets spilled to the stack, and the
	// verifier forgets its min and max values checked above in tcp_dissect.
	hdr.tcp_len = hdr.tcp->doff * 4;
	if (hdr.tcp_len < sizeof(*hdr.tcp))
		return XDP_ABORTED;

	return hdr.tcp->syn ? syncookie_handle_syn(&hdr, ctx, data, data_end) :
			      syncookie_handle_ack(&hdr);
}

SEC("xdp/dummy")
int dummy_xdp(struct xdp_md *ctx)
{
	// veth requires XDP programs to be set on both sides.
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";
Yonghong Song Nov. 9, 2021, 7:11 a.m. UTC | #10
On 11/3/21 7:02 AM, Maxim Mikityanskiy wrote:
> On 2021-11-03 04:10, Yonghong Song wrote:
>>
>>
>> On 11/1/21 4:14 AM, Maxim Mikityanskiy wrote:
>>> On 2021-10-20 19:16, Toke Høiland-Jørgensen wrote:
>>>> Lorenz Bauer <lmb@cloudflare.com> writes:
>>>>
>>>>>> +bool cookie_init_timestamp_raw(struct tcphdr *th, __be32 *tsval, 
>>>>>> __be32 *tsecr)
>>>>>
>>>>> I'm probably missing context, Is there something in this function that
>>>>> means you can't implement it in BPF?
>>>>
>>>> I was about to reply with some other comments but upon closer 
>>>> inspection
>>>> I ended up at the same conclusion: this helper doesn't seem to be 
>>>> needed
>>>> at all?
>>>
>>> After trying to put this code into BPF (replacing the underlying 
>>> ktime_get_ns with ktime_get_mono_fast_ns), I experienced issues with 
>>> passing the verifier.
>>>
>>> In addition to comparing ptr to end, I had to add checks that compare 
>>> ptr to data_end, because the verifier can't deduce that end <= 
>>> data_end. More branches will add a certain slowdown (not measured).
>>>
>>> A more serious issue is the overall program complexity. Even though 
>>> the loop over the TCP options has an upper bound, and the pointer 
>>> advances by at least one byte every iteration, I had to limit the 
>>> total number of iterations artificially. The maximum number of 
>>> iterations that makes the verifier happy is 10. With more iterations, 
>>> I have the following error:
>>>
>>> BPF program is too large. Processed 1000001 insn
>>>
>>>                         processed 1000001 insns (limit 1000000) 
>>> max_states_per_insn 29 total_states 35489 peak_states 596 mark_read 45
>>>
>>> I assume that BPF_COMPLEXITY_LIMIT_INSNS (1 million) is the 
>>> accumulated amount of instructions that the verifier can process in 
>>> all branches, is that right? It doesn't look realistic that my 
>>> program can run 1 million instructions in a single run, but it might 
>>> be that if you take all possible flows and add up the instructions 
>>> from these flows, it will exceed 1 million.
>>>
>>> The limitation of maximum 10 TCP options might be not enough, given 
>>> that valid packets are permitted to include more than 10 NOPs. An 
>>> alternative of using bpf_load_hdr_opt and calling it three times 
>>> doesn't look good either, because it will be about three times slower 
>>> than going over the options once. So maybe having a helper for that 
>>> is better than trying to fit it into BPF?
>>>
>>> One more interesting fact is the time that it takes for the verifier 
>>> to check my program. If it's limited to 10 iterations, it does it 
>>> pretty fast, but if I try to increase the number to 11 iterations, it 
>>> takes several minutes for the verifier to reach 1 million 
>>> instructions and print the error then. I also tried grouping the NOPs 
>>> in an inner loop to count only 10 real options, and the verifier has 
>>> been running for a few hours without any response. Is it normal? 
>>
>> Maxim, this may expose a verifier bug. Do you have a reproducer I can 
>> access? I would like to debug this to see what is the root case. Thanks!
> 
> Thanks, I appreciate your help in debugging it. The reproducer is based 
> on the modified XDP program from patch 10 in this series. You'll need to 
> apply at least patches 6, 7, 8 from this series to get new BPF helpers 
> needed for the XDP program (tell me if that's a problem, I can try to 
> remove usage of new helpers, but it will affect the program length and 
> may produce different results in the verifier).
> 
> See the C code of the program that passes the verifier (compiled with 
> clang version 12.0.0-1ubuntu1) in the bottom of this email. If you 
> increase the loop boundary from 10 to at least 11 in 
> cookie_init_timestamp_raw(), it fails the verifier after a few minutes. 

I tried to reproduce with latest llvm (llvm-project repo),
loop boundary 10 is okay and 11 exceeds the 1M complexity limit. For 10,
the number of verified instructions is 563626 (more than 0.5M) so it is
totally possible that one more iteration just blows past the limit.


> If you apply this tiny change, it fails the verifier after about 3 hours:
> 
> --- a/samples/bpf/syncookie_kern.c
> +++ b/samples/bpf/syncookie_kern.c
> @@ -167,6 +167,7 @@ static __always_inline bool cookie_init_
>       for (i = 0; i < 10; i++) {
>           u8 opcode, opsize;
> 
> +skip_nop:
>           if (ptr >= end)
>               break;
>           if (ptr + 1 > data_end)
> @@ -178,7 +179,7 @@ static __always_inline bool cookie_init_
>               break;
>           if (opcode == TCPOPT_NOP) {
>               ++ptr;
> -            continue;
> +            goto skip_nop;
>           }
> 
>           if (ptr + 1 >= end)

I tried this as well, with latest llvm, and got the following errors
in ~30 seconds:

......
536: (79) r2 = *(u64 *)(r10 -96)
537: R0=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) 
R1=pkt(id=9,off=499,r=518,umax_value=60,var_off=(0x0; 0x3c)) 
R2=pkt_end(id=0,off=0,imm=0) 
R3=pkt(id=27,off=14,r=0,umin_value=20,umax_value=120,var_off=(0x0; 
0x7c),s32_min_value=0,s32_max_value=124,u32_max_value=124) R4=invP3 
R5=inv1 R6=ctx(id=0,off=0,imm=0) R7=pkt(id=9,off=519,r=518,umax_va^C
[yhs@devbig309.ftw3 ~/work/bpf-next/samples/bpf] tail -f log
550: (55) if r0 != 0x4 goto pc+4
The sequence of 8193 jumps is too complex.
verification time 30631375 usec
stack depth 160
processed 330595 insns (limit 1000000) max_states_per_insn 4 
total_states 20377 peak_states 100 mark_read 37

With llvm12, I got the similar verification error:

The sequence of 8193 jumps is too complex.
processed 330592 insns (limit 1000000) max_states_per_insn 4 
total_states 20378 peak_states 101 mark_read 37

Could you check again with your experiment which takes 3 hours to
fail? What is the verification failure log?

> 
> --cut--
> 
> // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
> /* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights 
> reserved. */
> 
[...]
Maxim Mikityanskiy Nov. 25, 2021, 2:34 p.m. UTC | #11
On 2021-11-09 09:11, Yonghong Song wrote:
> 
> 
> On 11/3/21 7:02 AM, Maxim Mikityanskiy wrote:
>> On 2021-11-03 04:10, Yonghong Song wrote:
>>>
>>>
>>> On 11/1/21 4:14 AM, Maxim Mikityanskiy wrote:
>>>> On 2021-10-20 19:16, Toke Høiland-Jørgensen wrote:
>>>>> Lorenz Bauer <lmb@cloudflare.com> writes:
>>>>>
>>>>>>> +bool cookie_init_timestamp_raw(struct tcphdr *th, __be32 *tsval, 
>>>>>>> __be32 *tsecr)
>>>>>>
>>>>>> I'm probably missing context, Is there something in this function 
>>>>>> that
>>>>>> means you can't implement it in BPF?
>>>>>
>>>>> I was about to reply with some other comments but upon closer 
>>>>> inspection
>>>>> I ended up at the same conclusion: this helper doesn't seem to be 
>>>>> needed
>>>>> at all?
>>>>
>>>> After trying to put this code into BPF (replacing the underlying 
>>>> ktime_get_ns with ktime_get_mono_fast_ns), I experienced issues with 
>>>> passing the verifier.
>>>>
>>>> In addition to comparing ptr to end, I had to add checks that 
>>>> compare ptr to data_end, because the verifier can't deduce that end 
>>>> <= data_end. More branches will add a certain slowdown (not measured).
>>>>
>>>> A more serious issue is the overall program complexity. Even though 
>>>> the loop over the TCP options has an upper bound, and the pointer 
>>>> advances by at least one byte every iteration, I had to limit the 
>>>> total number of iterations artificially. The maximum number of 
>>>> iterations that makes the verifier happy is 10. With more 
>>>> iterations, I have the following error:
>>>>
>>>> BPF program is too large. Processed 1000001 insn
>>>>
>>>>                         processed 1000001 insns (limit 1000000) 
>>>> max_states_per_insn 29 total_states 35489 peak_states 596 mark_read 45
>>>>
>>>> I assume that BPF_COMPLEXITY_LIMIT_INSNS (1 million) is the 
>>>> accumulated amount of instructions that the verifier can process in 
>>>> all branches, is that right? It doesn't look realistic that my 
>>>> program can run 1 million instructions in a single run, but it might 
>>>> be that if you take all possible flows and add up the instructions 
>>>> from these flows, it will exceed 1 million.
>>>>
>>>> The limitation of maximum 10 TCP options might be not enough, given 
>>>> that valid packets are permitted to include more than 10 NOPs. An 
>>>> alternative of using bpf_load_hdr_opt and calling it three times 
>>>> doesn't look good either, because it will be about three times 
>>>> slower than going over the options once. So maybe having a helper 
>>>> for that is better than trying to fit it into BPF?
>>>>
>>>> One more interesting fact is the time that it takes for the verifier 
>>>> to check my program. If it's limited to 10 iterations, it does it 
>>>> pretty fast, but if I try to increase the number to 11 iterations, 
>>>> it takes several minutes for the verifier to reach 1 million 
>>>> instructions and print the error then. I also tried grouping the 
>>>> NOPs in an inner loop to count only 10 real options, and the 
>>>> verifier has been running for a few hours without any response. Is 
>>>> it normal? 
>>>
>>> Maxim, this may expose a verifier bug. Do you have a reproducer I can 
>>> access? I would like to debug this to see what is the root case. Thanks!
>>
>> Thanks, I appreciate your help in debugging it. The reproducer is 
>> based on the modified XDP program from patch 10 in this series. You'll 
>> need to apply at least patches 6, 7, 8 from this series to get new BPF 
>> helpers needed for the XDP program (tell me if that's a problem, I can 
>> try to remove usage of new helpers, but it will affect the program 
>> length and may produce different results in the verifier).
>>
>> See the C code of the program that passes the verifier (compiled with 
>> clang version 12.0.0-1ubuntu1) in the bottom of this email. If you 
>> increase the loop boundary from 10 to at least 11 in 
>> cookie_init_timestamp_raw(), it fails the verifier after a few minutes. 
> 
> I tried to reproduce with latest llvm (llvm-project repo),
> loop boundary 10 is okay and 11 exceeds the 1M complexity limit. For 10,
> the number of verified instructions is 563626 (more than 0.5M) so it is
> totally possible that one more iteration just blows past the limit.

So, does it mean that the verifying complexity grows exponentially with 
increasing the number of loop iterations (options parsed)?

Is it a good enough reason to keep this code as a BPF helper, rather 
than trying to fit it into the BPF program?

> 
>> If you apply this tiny change, it fails the verifier after about 3 hours:
>>
>> --- a/samples/bpf/syncookie_kern.c
>> +++ b/samples/bpf/syncookie_kern.c
>> @@ -167,6 +167,7 @@ static __always_inline bool cookie_init_
>>       for (i = 0; i < 10; i++) {
>>           u8 opcode, opsize;
>>
>> +skip_nop:
>>           if (ptr >= end)
>>               break;
>>           if (ptr + 1 > data_end)
>> @@ -178,7 +179,7 @@ static __always_inline bool cookie_init_
>>               break;
>>           if (opcode == TCPOPT_NOP) {
>>               ++ptr;
>> -            continue;
>> +            goto skip_nop;
>>           }
>>
>>           if (ptr + 1 >= end)
> 
> I tried this as well, with latest llvm, and got the following errors
> in ~30 seconds:
> 
> ......
> 536: (79) r2 = *(u64 *)(r10 -96)
> 537: R0=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) 
> R1=pkt(id=9,off=499,r=518,umax_value=60,var_off=(0x0; 0x3c)) 
> R2=pkt_end(id=0,off=0,imm=0) 
> R3=pkt(id=27,off=14,r=0,umin_value=20,umax_value=120,var_off=(0x0; 
> 0x7c),s32_min_value=0,s32_max_value=124,u32_max_value=124) R4=invP3 
> R5=inv1 R6=ctx(id=0,off=0,imm=0) R7=pkt(id=9,off=519,r=518,umax_va^C
> [yhs@devbig309.ftw3 ~/work/bpf-next/samples/bpf] tail -f log
> 550: (55) if r0 != 0x4 goto pc+4
> The sequence of 8193 jumps is too complex.
> verification time 30631375 usec
> stack depth 160
> processed 330595 insns (limit 1000000) max_states_per_insn 4 
> total_states 20377 peak_states 100 mark_read 37
> 
> With llvm12, I got the similar verification error:
> 
> The sequence of 8193 jumps is too complex.
> processed 330592 insns (limit 1000000) max_states_per_insn 4 
> total_states 20378 peak_states 101 mark_read 37
> 
> Could you check again with your experiment which takes 3 hours to
> fail? What is the verification failure log?

The log is similar:

...
; if (opsize == TCPOLEN_WINDOW) {
460: (55) if r8 != 0x3 goto pc+31
 
R0_w=pkt(id=28132,off=4037,r=0,umin_value=20,umax_value=2610,var_off=(0x0; 
0x3ffff),s32_min_value=0,s32_max_value=262143,u32_max_value=262143) 
R1=inv0 
R2=pkt(id=27,off=14,r=0,umin_value=20,umax_value=120,var_off=(0x0; 
0x7c),s32_min_value=0,s32_max_value=124,u32_max_value=124) R3_w=inv3 
R4_w=inv9 R5=inv0 R6=ctx(id=0,off=0,imm=0) 
R7_w=pkt(id=44,off=4047,r=4039,umin_value=18,umax_value=2355,var_off=(0x0; 
0x1ffff),s32_min_value=0,s32_max_value=131071,u32_max_value=131071) 
R8_w=invP3 R9=inv0 R10=fp0 fp-16=????mmmm fp-24=00000000 fp-64=????mmmm 
fp-72=mmmmmmmm fp-80=mmmmmmmm fp-88=pkt fp-96=pkt_end fp-104=pkt 
fp-112=pkt fp-120=inv20 fp-128=mmmmmmmm fp-136_w=inv14 fp-144=pkt
; if (ptr + TCPOLEN_WINDOW > data_end)
461: (bf) r3 = r7
462: (07) r3 += -7
; if (ptr + TCPOLEN_WINDOW > data_end)
463: (79) r8 = *(u64 *)(r10 -96)
464: (2d) if r3 > r8 goto pc+56
The sequence of 8193 jumps is too complex.
processed 414429 insns (limit 1000000) max_states_per_insn 4 
total_states 8097 peak_states 97 mark_read 49

libbpf: -- END LOG --
libbpf: failed to load program 'syncookie_xdp'
libbpf: failed to load object '.../samples/bpf/syncookie_kern.o'
Error: bpf_prog_load: Unknown error 4007

real    189m49.659s
user    0m0.012s
sys     189m26.322s

Ubuntu clang version 12.0.0-1ubuntu1

I wonder why it takes only 30 seconds for you. As I understand, the 
expectation is less than 1 second anyway, but the difference between 30 
seconds and 3 hours is huge. Maybe some kernel config options matter 
(KASAN?)

Is it expected that increasing the loop length linearly increases the 
verifying complexity exponentially? Is there any mitigation?

Thanks,
Max

>>
>> --cut--
>>
>> // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
>> /* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights 
>> reserved. */
>>
> [...]
Yonghong Song Nov. 26, 2021, 5:43 a.m. UTC | #12
On 11/25/21 6:34 AM, Maxim Mikityanskiy wrote:
> On 2021-11-09 09:11, Yonghong Song wrote:
>>
>>
>> On 11/3/21 7:02 AM, Maxim Mikityanskiy wrote:
>>> On 2021-11-03 04:10, Yonghong Song wrote:
>>>>
>>>>
>>>> On 11/1/21 4:14 AM, Maxim Mikityanskiy wrote:
>>>>> On 2021-10-20 19:16, Toke Høiland-Jørgensen wrote:
>>>>>> Lorenz Bauer <lmb@cloudflare.com> writes:
>>>>>>
>>>>>>>> +bool cookie_init_timestamp_raw(struct tcphdr *th, __be32 
>>>>>>>> *tsval, __be32 *tsecr)
>>>>>>>
>>>>>>> I'm probably missing context, Is there something in this function 
>>>>>>> that
>>>>>>> means you can't implement it in BPF?
>>>>>>
>>>>>> I was about to reply with some other comments but upon closer 
>>>>>> inspection
>>>>>> I ended up at the same conclusion: this helper doesn't seem to be 
>>>>>> needed
>>>>>> at all?
>>>>>
>>>>> After trying to put this code into BPF (replacing the underlying 
>>>>> ktime_get_ns with ktime_get_mono_fast_ns), I experienced issues 
>>>>> with passing the verifier.
>>>>>
>>>>> In addition to comparing ptr to end, I had to add checks that 
>>>>> compare ptr to data_end, because the verifier can't deduce that end 
>>>>> <= data_end. More branches will add a certain slowdown (not measured).
>>>>>
>>>>> A more serious issue is the overall program complexity. Even though 
>>>>> the loop over the TCP options has an upper bound, and the pointer 
>>>>> advances by at least one byte every iteration, I had to limit the 
>>>>> total number of iterations artificially. The maximum number of 
>>>>> iterations that makes the verifier happy is 10. With more 
>>>>> iterations, I have the following error:
>>>>>
>>>>> BPF program is too large. Processed 1000001 insn
>>>>>
>>>>>                         processed 1000001 insns (limit 1000000) 
>>>>> max_states_per_insn 29 total_states 35489 peak_states 596 mark_read 45
>>>>>
>>>>> I assume that BPF_COMPLEXITY_LIMIT_INSNS (1 million) is the 
>>>>> accumulated amount of instructions that the verifier can process in 
>>>>> all branches, is that right? It doesn't look realistic that my 
>>>>> program can run 1 million instructions in a single run, but it 
>>>>> might be that if you take all possible flows and add up the 
>>>>> instructions from these flows, it will exceed 1 million.
>>>>>
>>>>> The limitation of maximum 10 TCP options might be not enough, given 
>>>>> that valid packets are permitted to include more than 10 NOPs. An 
>>>>> alternative of using bpf_load_hdr_opt and calling it three times 
>>>>> doesn't look good either, because it will be about three times 
>>>>> slower than going over the options once. So maybe having a helper 
>>>>> for that is better than trying to fit it into BPF?
>>>>>
>>>>> One more interesting fact is the time that it takes for the 
>>>>> verifier to check my program. If it's limited to 10 iterations, it 
>>>>> does it pretty fast, but if I try to increase the number to 11 
>>>>> iterations, it takes several minutes for the verifier to reach 1 
>>>>> million instructions and print the error then. I also tried 
>>>>> grouping the NOPs in an inner loop to count only 10 real options, 
>>>>> and the verifier has been running for a few hours without any 
>>>>> response. Is it normal? 
>>>>
>>>> Maxim, this may expose a verifier bug. Do you have a reproducer I 
>>>> can access? I would like to debug this to see what is the root case. 
>>>> Thanks!
>>>
>>> Thanks, I appreciate your help in debugging it. The reproducer is 
>>> based on the modified XDP program from patch 10 in this series. 
>>> You'll need to apply at least patches 6, 7, 8 from this series to get 
>>> new BPF helpers needed for the XDP program (tell me if that's a 
>>> problem, I can try to remove usage of new helpers, but it will affect 
>>> the program length and may produce different results in the verifier).
>>>
>>> See the C code of the program that passes the verifier (compiled with 
>>> clang version 12.0.0-1ubuntu1) in the bottom of this email. If you 
>>> increase the loop boundary from 10 to at least 11 in 
>>> cookie_init_timestamp_raw(), it fails the verifier after a few minutes. 
>>
>> I tried to reproduce with latest llvm (llvm-project repo),
>> loop boundary 10 is okay and 11 exceeds the 1M complexity limit. For 10,
>> the number of verified instructions is 563626 (more than 0.5M) so it is
>> totally possible that one more iteration just blows past the limit.
> 
> So, does it mean that the verifying complexity grows exponentially with 
> increasing the number of loop iterations (options parsed)?

Depending on verification time pruning results, it is possible slightly 
increase number of branches could result quite some (2x, 4x, etc.) of
to-be-verified dynamic instructions.

> 
> Is it a good enough reason to keep this code as a BPF helper, rather 
> than trying to fit it into the BPF program?

Another option is to use global function, which is verified separately
from the main bpf program.

> 
>>
>>> If you apply this tiny change, it fails the verifier after about 3 
>>> hours:
>>>
[...]
Maxim Mikityanskiy Nov. 26, 2021, 4:50 p.m. UTC | #13
On 2021-11-26 07:43, Yonghong Song wrote:
> 
> 
> On 11/25/21 6:34 AM, Maxim Mikityanskiy wrote:
>> On 2021-11-09 09:11, Yonghong Song wrote:
>>>
>>>
>>> On 11/3/21 7:02 AM, Maxim Mikityanskiy wrote:
>>>> On 2021-11-03 04:10, Yonghong Song wrote:
>>>>>
>>>>>
>>>>> On 11/1/21 4:14 AM, Maxim Mikityanskiy wrote:
>>>>>> On 2021-10-20 19:16, Toke Høiland-Jørgensen wrote:
>>>>>>> Lorenz Bauer <lmb@cloudflare.com> writes:
>>>>>>>
>>>>>>>>> +bool cookie_init_timestamp_raw(struct tcphdr *th, __be32 
>>>>>>>>> *tsval, __be32 *tsecr)
>>>>>>>>
>>>>>>>> I'm probably missing context, Is there something in this 
>>>>>>>> function that
>>>>>>>> means you can't implement it in BPF?
>>>>>>>
>>>>>>> I was about to reply with some other comments but upon closer 
>>>>>>> inspection
>>>>>>> I ended up at the same conclusion: this helper doesn't seem to be 
>>>>>>> needed
>>>>>>> at all?
>>>>>>
>>>>>> After trying to put this code into BPF (replacing the underlying 
>>>>>> ktime_get_ns with ktime_get_mono_fast_ns), I experienced issues 
>>>>>> with passing the verifier.
>>>>>>
>>>>>> In addition to comparing ptr to end, I had to add checks that 
>>>>>> compare ptr to data_end, because the verifier can't deduce that 
>>>>>> end <= data_end. More branches will add a certain slowdown (not 
>>>>>> measured).
>>>>>>
>>>>>> A more serious issue is the overall program complexity. Even 
>>>>>> though the loop over the TCP options has an upper bound, and the 
>>>>>> pointer advances by at least one byte every iteration, I had to 
>>>>>> limit the total number of iterations artificially. The maximum 
>>>>>> number of iterations that makes the verifier happy is 10. With 
>>>>>> more iterations, I have the following error:
>>>>>>
>>>>>> BPF program is too large. Processed 1000001 insn
>>>>>>
>>>>>>                         processed 1000001 insns (limit 1000000) 
>>>>>> max_states_per_insn 29 total_states 35489 peak_states 596 
>>>>>> mark_read 45
>>>>>>
>>>>>> I assume that BPF_COMPLEXITY_LIMIT_INSNS (1 million) is the 
>>>>>> accumulated amount of instructions that the verifier can process 
>>>>>> in all branches, is that right? It doesn't look realistic that my 
>>>>>> program can run 1 million instructions in a single run, but it 
>>>>>> might be that if you take all possible flows and add up the 
>>>>>> instructions from these flows, it will exceed 1 million.
>>>>>>
>>>>>> The limitation of maximum 10 TCP options might be not enough, 
>>>>>> given that valid packets are permitted to include more than 10 
>>>>>> NOPs. An alternative of using bpf_load_hdr_opt and calling it 
>>>>>> three times doesn't look good either, because it will be about 
>>>>>> three times slower than going over the options once. So maybe 
>>>>>> having a helper for that is better than trying to fit it into BPF?
>>>>>>
>>>>>> One more interesting fact is the time that it takes for the 
>>>>>> verifier to check my program. If it's limited to 10 iterations, it 
>>>>>> does it pretty fast, but if I try to increase the number to 11 
>>>>>> iterations, it takes several minutes for the verifier to reach 1 
>>>>>> million instructions and print the error then. I also tried 
>>>>>> grouping the NOPs in an inner loop to count only 10 real options, 
>>>>>> and the verifier has been running for a few hours without any 
>>>>>> response. Is it normal? 
>>>>>
>>>>> Maxim, this may expose a verifier bug. Do you have a reproducer I 
>>>>> can access? I would like to debug this to see what is the root 
>>>>> case. Thanks!
>>>>
>>>> Thanks, I appreciate your help in debugging it. The reproducer is 
>>>> based on the modified XDP program from patch 10 in this series. 
>>>> You'll need to apply at least patches 6, 7, 8 from this series to 
>>>> get new BPF helpers needed for the XDP program (tell me if that's a 
>>>> problem, I can try to remove usage of new helpers, but it will 
>>>> affect the program length and may produce different results in the 
>>>> verifier).
>>>>
>>>> See the C code of the program that passes the verifier (compiled 
>>>> with clang version 12.0.0-1ubuntu1) in the bottom of this email. If 
>>>> you increase the loop boundary from 10 to at least 11 in 
>>>> cookie_init_timestamp_raw(), it fails the verifier after a few minutes. 
>>>
>>> I tried to reproduce with latest llvm (llvm-project repo),
>>> loop boundary 10 is okay and 11 exceeds the 1M complexity limit. For 10,
>>> the number of verified instructions is 563626 (more than 0.5M) so it is
>>> totally possible that one more iteration just blows past the limit.
>>
>> So, does it mean that the verifying complexity grows exponentially 
>> with increasing the number of loop iterations (options parsed)?
> 
> Depending on verification time pruning results, it is possible slightly 
> increase number of branches could result quite some (2x, 4x, etc.) of
> to-be-verified dynamic instructions.

Is it at least theoretically possible to make this coefficient below 2x? 
I.e. write a loop, so that adding another iteration will not double the 
number of verified instructions, but will have a smaller increase?

If that's not possible, then it looks like BPF can't have loops bigger 
than ~19 iterations (2^20 > 1M), and this function is not implementable 
in BPF.

>>
>> Is it a good enough reason to keep this code as a BPF helper, rather 
>> than trying to fit it into the BPF program?
> 
> Another option is to use global function, which is verified separately
> from the main bpf program.

Simply removing __always_inline didn't change anything. Do I need to 
make any other changes? Will it make sense to call a global function in 
a loop, i.e. will it increase chances to pass the verifier?

>>
>>>
>>>> If you apply this tiny change, it fails the verifier after about 3 
>>>> hours:
>>>>
> [...]
Yonghong Song Nov. 26, 2021, 5:07 p.m. UTC | #14
On 11/26/21 8:50 AM, Maxim Mikityanskiy wrote:
> On 2021-11-26 07:43, Yonghong Song wrote:
>>
>>
>> On 11/25/21 6:34 AM, Maxim Mikityanskiy wrote:
>>> On 2021-11-09 09:11, Yonghong Song wrote:
>>>>
>>>>
>>>> On 11/3/21 7:02 AM, Maxim Mikityanskiy wrote:
>>>>> On 2021-11-03 04:10, Yonghong Song wrote:
>>>>>>
>>>>>>
>>>>>> On 11/1/21 4:14 AM, Maxim Mikityanskiy wrote:
>>>>>>> On 2021-10-20 19:16, Toke Høiland-Jørgensen wrote:
>>>>>>>> Lorenz Bauer <lmb@cloudflare.com> writes:
>>>>>>>>
>>>>>>>>>> +bool cookie_init_timestamp_raw(struct tcphdr *th, __be32 
>>>>>>>>>> *tsval, __be32 *tsecr)
>>>>>>>>>
>>>>>>>>> I'm probably missing context, Is there something in this 
>>>>>>>>> function that
>>>>>>>>> means you can't implement it in BPF?
>>>>>>>>
>>>>>>>> I was about to reply with some other comments but upon closer 
>>>>>>>> inspection
>>>>>>>> I ended up at the same conclusion: this helper doesn't seem to 
>>>>>>>> be needed
>>>>>>>> at all?
>>>>>>>
>>>>>>> After trying to put this code into BPF (replacing the underlying 
>>>>>>> ktime_get_ns with ktime_get_mono_fast_ns), I experienced issues 
>>>>>>> with passing the verifier.
>>>>>>>
>>>>>>> In addition to comparing ptr to end, I had to add checks that 
>>>>>>> compare ptr to data_end, because the verifier can't deduce that 
>>>>>>> end <= data_end. More branches will add a certain slowdown (not 
>>>>>>> measured).
>>>>>>>
>>>>>>> A more serious issue is the overall program complexity. Even 
>>>>>>> though the loop over the TCP options has an upper bound, and the 
>>>>>>> pointer advances by at least one byte every iteration, I had to 
>>>>>>> limit the total number of iterations artificially. The maximum 
>>>>>>> number of iterations that makes the verifier happy is 10. With 
>>>>>>> more iterations, I have the following error:
>>>>>>>
>>>>>>> BPF program is too large. Processed 1000001 insn
>>>>>>>
>>>>>>>                         processed 1000001 insns (limit 1000000) 
>>>>>>> max_states_per_insn 29 total_states 35489 peak_states 596 
>>>>>>> mark_read 45
>>>>>>>
>>>>>>> I assume that BPF_COMPLEXITY_LIMIT_INSNS (1 million) is the 
>>>>>>> accumulated amount of instructions that the verifier can process 
>>>>>>> in all branches, is that right? It doesn't look realistic that my 
>>>>>>> program can run 1 million instructions in a single run, but it 
>>>>>>> might be that if you take all possible flows and add up the 
>>>>>>> instructions from these flows, it will exceed 1 million.
>>>>>>>
>>>>>>> The limitation of maximum 10 TCP options might be not enough, 
>>>>>>> given that valid packets are permitted to include more than 10 
>>>>>>> NOPs. An alternative of using bpf_load_hdr_opt and calling it 
>>>>>>> three times doesn't look good either, because it will be about 
>>>>>>> three times slower than going over the options once. So maybe 
>>>>>>> having a helper for that is better than trying to fit it into BPF?
>>>>>>>
>>>>>>> One more interesting fact is the time that it takes for the 
>>>>>>> verifier to check my program. If it's limited to 10 iterations, 
>>>>>>> it does it pretty fast, but if I try to increase the number to 11 
>>>>>>> iterations, it takes several minutes for the verifier to reach 1 
>>>>>>> million instructions and print the error then. I also tried 
>>>>>>> grouping the NOPs in an inner loop to count only 10 real options, 
>>>>>>> and the verifier has been running for a few hours without any 
>>>>>>> response. Is it normal? 
>>>>>>
>>>>>> Maxim, this may expose a verifier bug. Do you have a reproducer I 
>>>>>> can access? I would like to debug this to see what is the root 
>>>>>> case. Thanks!
>>>>>
>>>>> Thanks, I appreciate your help in debugging it. The reproducer is 
>>>>> based on the modified XDP program from patch 10 in this series. 
>>>>> You'll need to apply at least patches 6, 7, 8 from this series to 
>>>>> get new BPF helpers needed for the XDP program (tell me if that's a 
>>>>> problem, I can try to remove usage of new helpers, but it will 
>>>>> affect the program length and may produce different results in the 
>>>>> verifier).
>>>>>
>>>>> See the C code of the program that passes the verifier (compiled 
>>>>> with clang version 12.0.0-1ubuntu1) in the bottom of this email. If 
>>>>> you increase the loop boundary from 10 to at least 11 in 
>>>>> cookie_init_timestamp_raw(), it fails the verifier after a few 
>>>>> minutes. 
>>>>
>>>> I tried to reproduce with latest llvm (llvm-project repo),
>>>> loop boundary 10 is okay and 11 exceeds the 1M complexity limit. For 
>>>> 10,
>>>> the number of verified instructions is 563626 (more than 0.5M) so it is
>>>> totally possible that one more iteration just blows past the limit.
>>>
>>> So, does it mean that the verifying complexity grows exponentially 
>>> with increasing the number of loop iterations (options parsed)?
>>
>> Depending on verification time pruning results, it is possible 
>> slightly increase number of branches could result quite some (2x, 4x, 
>> etc.) of
>> to-be-verified dynamic instructions.
> 
> Is it at least theoretically possible to make this coefficient below 2x? 
> I.e. write a loop, so that adding another iteration will not double the 
> number of verified instructions, but will have a smaller increase?
> 
> If that's not possible, then it looks like BPF can't have loops bigger 
> than ~19 iterations (2^20 > 1M), and this function is not implementable 
> in BPF.

This is the worst case. As I mentioned pruning plays a huge role in 
verification. Effective pruning can add little increase of dynamic 
instructions say from 19 iterations to 20 iterations. But we have
to look at verifier log to find out whether pruning is less effective or
something else... Based on my experience, in most cases, pruning is
quite effective. But occasionally it is not... You can look at
verifier.c file to roughly understand how pruning work.

Not sure whether in this case it is due to less effective pruning or 
inherently we just have to go through all these dynamic instructions for 
verification.

> 
>>>
>>> Is it a good enough reason to keep this code as a BPF helper, rather 
>>> than trying to fit it into the BPF program?
>>
>> Another option is to use global function, which is verified separately
>> from the main bpf program.
> 
> Simply removing __always_inline didn't change anything. Do I need to 
> make any other changes? Will it make sense to call a global function in 
> a loop, i.e. will it increase chances to pass the verifier?

global function cannot be static function. You can try
either global function inside the loop or global function
containing the loop. It probably more effective to put loops
inside the global function. You have to do some experiments
to see which one is better.

> 
>>>
>>>>
>>>>> If you apply this tiny change, it fails the verifier after about 3 
>>>>> hours:
>>>>>
>> [...]
>
Maxim Mikityanskiy Nov. 29, 2021, 5:51 p.m. UTC | #15
On 2021-11-26 19:07, Yonghong Song wrote:
> 
> 
> On 11/26/21 8:50 AM, Maxim Mikityanskiy wrote:
>> On 2021-11-26 07:43, Yonghong Song wrote:
>>>
>>>
>>> On 11/25/21 6:34 AM, Maxim Mikityanskiy wrote:
>>>> On 2021-11-09 09:11, Yonghong Song wrote:
>>>>>
>>>>>
>>>>> On 11/3/21 7:02 AM, Maxim Mikityanskiy wrote:
>>>>>> On 2021-11-03 04:10, Yonghong Song wrote:
>>>>>>>
>>>>>>>
>>>>>>> On 11/1/21 4:14 AM, Maxim Mikityanskiy wrote:
>>>>>>>> On 2021-10-20 19:16, Toke Høiland-Jørgensen wrote:
>>>>>>>>> Lorenz Bauer <lmb@cloudflare.com> writes:
>>>>>>>>>
>>>>>>>>>>> +bool cookie_init_timestamp_raw(struct tcphdr *th, __be32 
>>>>>>>>>>> *tsval, __be32 *tsecr)
>>>>>>>>>>
>>>>>>>>>> I'm probably missing context, Is there something in this 
>>>>>>>>>> function that
>>>>>>>>>> means you can't implement it in BPF?
>>>>>>>>>
>>>>>>>>> I was about to reply with some other comments but upon closer 
>>>>>>>>> inspection
>>>>>>>>> I ended up at the same conclusion: this helper doesn't seem to 
>>>>>>>>> be needed
>>>>>>>>> at all?
>>>>>>>>
>>>>>>>> After trying to put this code into BPF (replacing the underlying 
>>>>>>>> ktime_get_ns with ktime_get_mono_fast_ns), I experienced issues 
>>>>>>>> with passing the verifier.
>>>>>>>>
>>>>>>>> In addition to comparing ptr to end, I had to add checks that 
>>>>>>>> compare ptr to data_end, because the verifier can't deduce that 
>>>>>>>> end <= data_end. More branches will add a certain slowdown (not 
>>>>>>>> measured).
>>>>>>>>
>>>>>>>> A more serious issue is the overall program complexity. Even 
>>>>>>>> though the loop over the TCP options has an upper bound, and the 
>>>>>>>> pointer advances by at least one byte every iteration, I had to 
>>>>>>>> limit the total number of iterations artificially. The maximum 
>>>>>>>> number of iterations that makes the verifier happy is 10. With 
>>>>>>>> more iterations, I have the following error:
>>>>>>>>
>>>>>>>> BPF program is too large. Processed 1000001 insn
>>>>>>>>
>>>>>>>>                         processed 1000001 insns (limit 1000000) 
>>>>>>>> max_states_per_insn 29 total_states 35489 peak_states 596 
>>>>>>>> mark_read 45
>>>>>>>>
>>>>>>>> I assume that BPF_COMPLEXITY_LIMIT_INSNS (1 million) is the 
>>>>>>>> accumulated amount of instructions that the verifier can process 
>>>>>>>> in all branches, is that right? It doesn't look realistic that 
>>>>>>>> my program can run 1 million instructions in a single run, but 
>>>>>>>> it might be that if you take all possible flows and add up the 
>>>>>>>> instructions from these flows, it will exceed 1 million.
>>>>>>>>
>>>>>>>> The limitation of maximum 10 TCP options might be not enough, 
>>>>>>>> given that valid packets are permitted to include more than 10 
>>>>>>>> NOPs. An alternative of using bpf_load_hdr_opt and calling it 
>>>>>>>> three times doesn't look good either, because it will be about 
>>>>>>>> three times slower than going over the options once. So maybe 
>>>>>>>> having a helper for that is better than trying to fit it into BPF?
>>>>>>>>
>>>>>>>> One more interesting fact is the time that it takes for the 
>>>>>>>> verifier to check my program. If it's limited to 10 iterations, 
>>>>>>>> it does it pretty fast, but if I try to increase the number to 
>>>>>>>> 11 iterations, it takes several minutes for the verifier to 
>>>>>>>> reach 1 million instructions and print the error then. I also 
>>>>>>>> tried grouping the NOPs in an inner loop to count only 10 real 
>>>>>>>> options, and the verifier has been running for a few hours 
>>>>>>>> without any response. Is it normal? 
>>>>>>>
>>>>>>> Maxim, this may expose a verifier bug. Do you have a reproducer I 
>>>>>>> can access? I would like to debug this to see what is the root 
>>>>>>> case. Thanks!
>>>>>>
>>>>>> Thanks, I appreciate your help in debugging it. The reproducer is 
>>>>>> based on the modified XDP program from patch 10 in this series. 
>>>>>> You'll need to apply at least patches 6, 7, 8 from this series to 
>>>>>> get new BPF helpers needed for the XDP program (tell me if that's 
>>>>>> a problem, I can try to remove usage of new helpers, but it will 
>>>>>> affect the program length and may produce different results in the 
>>>>>> verifier).
>>>>>>
>>>>>> See the C code of the program that passes the verifier (compiled 
>>>>>> with clang version 12.0.0-1ubuntu1) in the bottom of this email. 
>>>>>> If you increase the loop boundary from 10 to at least 11 in 
>>>>>> cookie_init_timestamp_raw(), it fails the verifier after a few 
>>>>>> minutes. 
>>>>>
>>>>> I tried to reproduce with latest llvm (llvm-project repo),
>>>>> loop boundary 10 is okay and 11 exceeds the 1M complexity limit. 
>>>>> For 10,
>>>>> the number of verified instructions is 563626 (more than 0.5M) so 
>>>>> it is
>>>>> totally possible that one more iteration just blows past the limit.
>>>>
>>>> So, does it mean that the verifying complexity grows exponentially 
>>>> with increasing the number of loop iterations (options parsed)?
>>>
>>> Depending on verification time pruning results, it is possible 
>>> slightly increase number of branches could result quite some (2x, 4x, 
>>> etc.) of
>>> to-be-verified dynamic instructions.
>>
>> Is it at least theoretically possible to make this coefficient below 
>> 2x? I.e. write a loop, so that adding another iteration will not 
>> double the number of verified instructions, but will have a smaller 
>> increase?
>>
>> If that's not possible, then it looks like BPF can't have loops bigger 
>> than ~19 iterations (2^20 > 1M), and this function is not 
>> implementable in BPF.
> 
> This is the worst case. As I mentioned pruning plays a huge role in 
> verification. Effective pruning can add little increase of dynamic 
> instructions say from 19 iterations to 20 iterations. But we have
> to look at verifier log to find out whether pruning is less effective or
> something else... Based on my experience, in most cases, pruning is
> quite effective. But occasionally it is not... You can look at
> verifier.c file to roughly understand how pruning work.
> 
> Not sure whether in this case it is due to less effective pruning or 
> inherently we just have to go through all these dynamic instructions for 
> verification.
> 
>>
>>>>
>>>> Is it a good enough reason to keep this code as a BPF helper, rather 
>>>> than trying to fit it into the BPF program?
>>>
>>> Another option is to use global function, which is verified separately
>>> from the main bpf program.
>>
>> Simply removing __always_inline didn't change anything. Do I need to 
>> make any other changes? Will it make sense to call a global function 
>> in a loop, i.e. will it increase chances to pass the verifier?
> 
> global function cannot be static function. You can try
> either global function inside the loop or global function
> containing the loop. It probably more effective to put loops
> inside the global function. You have to do some experiments
> to see which one is better.

Sorry for a probably noob question, but how can I pass data_end to a 
global function? I'm getting this error:

Validating cookie_init_timestamp_raw() func#1...
arg#4 reference type('UNKNOWN ') size cannot be determined: -22
processed 0 insns (limit 1000000) max_states_per_insn 0 total_states 0 
peak_states 0 mark_read 0

When I removed data_end, I got another one:

; opcode = ptr[0];
969: (71) r8 = *(u8 *)(r0 +0)
  R0=mem(id=0,ref_obj_id=0,off=20,imm=0) 
R1=mem(id=0,ref_obj_id=0,off=0,umin_value=4,umax_value=60,var_off=(0x0; 
0x3f),s32_min_value=0,s32_max_value=63,u32_max_value=63)
  R2=invP0 R3=invP0 R4=mem_or_null(id=6,ref_obj_id=0,off=0,imm=0) 
R5=invP0 R6=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) 
R7=mem(id=0,ref_obj_id=0,off=0,imm=0) R10=fp0 fp
-8=00000000 fp-16=invP15
invalid access to memory, mem_size=20 off=20 size=1
R0 min value is outside of the allowed memory range
processed 20 insns (limit 1000000) max_states_per_insn 0 total_states 2 
peak_states 2 mark_read 1

It looks like pointers to the context aren't supported:

https://www.spinics.net/lists/bpf/msg34907.html

 > test_global_func11 - check that CTX pointer cannot be passed

What is the standard way to pass packet data to a global function?

Thanks,
Max

>>
>>>>
>>>>>
>>>>>> If you apply this tiny change, it fails the verifier after about 3 
>>>>>> hours:
>>>>>>
>>> [...]
>>
Yonghong Song Dec. 1, 2021, 6:39 a.m. UTC | #16
On 11/29/21 9:51 AM, Maxim Mikityanskiy wrote:
> On 2021-11-26 19:07, Yonghong Song wrote:
>>
>>
>> On 11/26/21 8:50 AM, Maxim Mikityanskiy wrote:
>>> On 2021-11-26 07:43, Yonghong Song wrote:
>>>>
>>>>
>>>> On 11/25/21 6:34 AM, Maxim Mikityanskiy wrote:
>>>>> On 2021-11-09 09:11, Yonghong Song wrote:
>>>>>>
>>>>>>
>>>>>> On 11/3/21 7:02 AM, Maxim Mikityanskiy wrote:
>>>>>>> On 2021-11-03 04:10, Yonghong Song wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>> On 11/1/21 4:14 AM, Maxim Mikityanskiy wrote:
>>>>>>>>> On 2021-10-20 19:16, Toke Høiland-Jørgensen wrote:
>>>>>>>>>> Lorenz Bauer <lmb@cloudflare.com> writes:
>>>>>>>>>>
>>>>>>>>>>>> +bool cookie_init_timestamp_raw(struct tcphdr *th, __be32 
>>>>>>>>>>>> *tsval, __be32 *tsecr)
>>>>>>>>>>>
>>>>>>>>>>> I'm probably missing context, Is there something in this 
>>>>>>>>>>> function that
>>>>>>>>>>> means you can't implement it in BPF?
>>>>>>>>>>
>>>>>>>>>> I was about to reply with some other comments but upon closer 
>>>>>>>>>> inspection
>>>>>>>>>> I ended up at the same conclusion: this helper doesn't seem to 
>>>>>>>>>> be needed
>>>>>>>>>> at all?
>>>>>>>>>
>>>>>>>>> After trying to put this code into BPF (replacing the 
>>>>>>>>> underlying ktime_get_ns with ktime_get_mono_fast_ns), I 
>>>>>>>>> experienced issues with passing the verifier.
>>>>>>>>>
>>>>>>>>> In addition to comparing ptr to end, I had to add checks that 
>>>>>>>>> compare ptr to data_end, because the verifier can't deduce that 
>>>>>>>>> end <= data_end. More branches will add a certain slowdown (not 
>>>>>>>>> measured).
>>>>>>>>>
>>>>>>>>> A more serious issue is the overall program complexity. Even 
>>>>>>>>> though the loop over the TCP options has an upper bound, and 
>>>>>>>>> the pointer advances by at least one byte every iteration, I 
>>>>>>>>> had to limit the total number of iterations artificially. The 
>>>>>>>>> maximum number of iterations that makes the verifier happy is 
>>>>>>>>> 10. With more iterations, I have the following error:
>>>>>>>>>
>>>>>>>>> BPF program is too large. Processed 1000001 insn
>>>>>>>>>
>>>>>>>>>                         processed 1000001 insns (limit 1000000) 
>>>>>>>>> max_states_per_insn 29 total_states 35489 peak_states 596 
>>>>>>>>> mark_read 45
>>>>>>>>>
>>>>>>>>> I assume that BPF_COMPLEXITY_LIMIT_INSNS (1 million) is the 
>>>>>>>>> accumulated amount of instructions that the verifier can 
>>>>>>>>> process in all branches, is that right? It doesn't look 
>>>>>>>>> realistic that my program can run 1 million instructions in a 
>>>>>>>>> single run, but it might be that if you take all possible flows 
>>>>>>>>> and add up the instructions from these flows, it will exceed 1 
>>>>>>>>> million.
>>>>>>>>>
>>>>>>>>> The limitation of maximum 10 TCP options might be not enough, 
>>>>>>>>> given that valid packets are permitted to include more than 10 
>>>>>>>>> NOPs. An alternative of using bpf_load_hdr_opt and calling it 
>>>>>>>>> three times doesn't look good either, because it will be about 
>>>>>>>>> three times slower than going over the options once. So maybe 
>>>>>>>>> having a helper for that is better than trying to fit it into BPF?
>>>>>>>>>
>>>>>>>>> One more interesting fact is the time that it takes for the 
>>>>>>>>> verifier to check my program. If it's limited to 10 iterations, 
>>>>>>>>> it does it pretty fast, but if I try to increase the number to 
>>>>>>>>> 11 iterations, it takes several minutes for the verifier to 
>>>>>>>>> reach 1 million instructions and print the error then. I also 
>>>>>>>>> tried grouping the NOPs in an inner loop to count only 10 real 
>>>>>>>>> options, and the verifier has been running for a few hours 
>>>>>>>>> without any response. Is it normal? 
>>>>>>>>
>>>>>>>> Maxim, this may expose a verifier bug. Do you have a reproducer 
>>>>>>>> I can access? I would like to debug this to see what is the root 
>>>>>>>> case. Thanks!
>>>>>>>
>>>>>>> Thanks, I appreciate your help in debugging it. The reproducer is 
>>>>>>> based on the modified XDP program from patch 10 in this series. 
>>>>>>> You'll need to apply at least patches 6, 7, 8 from this series to 
>>>>>>> get new BPF helpers needed for the XDP program (tell me if that's 
>>>>>>> a problem, I can try to remove usage of new helpers, but it will 
>>>>>>> affect the program length and may produce different results in 
>>>>>>> the verifier).
>>>>>>>
>>>>>>> See the C code of the program that passes the verifier (compiled 
>>>>>>> with clang version 12.0.0-1ubuntu1) in the bottom of this email. 
>>>>>>> If you increase the loop boundary from 10 to at least 11 in 
>>>>>>> cookie_init_timestamp_raw(), it fails the verifier after a few 
>>>>>>> minutes. 
>>>>>>
>>>>>> I tried to reproduce with latest llvm (llvm-project repo),
>>>>>> loop boundary 10 is okay and 11 exceeds the 1M complexity limit. 
>>>>>> For 10,
>>>>>> the number of verified instructions is 563626 (more than 0.5M) so 
>>>>>> it is
>>>>>> totally possible that one more iteration just blows past the limit.
>>>>>
>>>>> So, does it mean that the verifying complexity grows exponentially 
>>>>> with increasing the number of loop iterations (options parsed)?
>>>>
>>>> Depending on verification time pruning results, it is possible 
>>>> slightly increase number of branches could result quite some (2x, 
>>>> 4x, etc.) of
>>>> to-be-verified dynamic instructions.
>>>
>>> Is it at least theoretically possible to make this coefficient below 
>>> 2x? I.e. write a loop, so that adding another iteration will not 
>>> double the number of verified instructions, but will have a smaller 
>>> increase?
>>>
>>> If that's not possible, then it looks like BPF can't have loops 
>>> bigger than ~19 iterations (2^20 > 1M), and this function is not 
>>> implementable in BPF.
>>
>> This is the worst case. As I mentioned pruning plays a huge role in 
>> verification. Effective pruning can add little increase of dynamic 
>> instructions say from 19 iterations to 20 iterations. But we have
>> to look at verifier log to find out whether pruning is less effective or
>> something else... Based on my experience, in most cases, pruning is
>> quite effective. But occasionally it is not... You can look at
>> verifier.c file to roughly understand how pruning work.
>>
>> Not sure whether in this case it is due to less effective pruning or 
>> inherently we just have to go through all these dynamic instructions 
>> for verification.
>>
>>>
>>>>>
>>>>> Is it a good enough reason to keep this code as a BPF helper, 
>>>>> rather than trying to fit it into the BPF program?
>>>>
>>>> Another option is to use global function, which is verified separately
>>>> from the main bpf program.
>>>
>>> Simply removing __always_inline didn't change anything. Do I need to 
>>> make any other changes? Will it make sense to call a global function 
>>> in a loop, i.e. will it increase chances to pass the verifier?
>>
>> global function cannot be static function. You can try
>> either global function inside the loop or global function
>> containing the loop. It probably more effective to put loops
>> inside the global function. You have to do some experiments
>> to see which one is better.
> 
> Sorry for a probably noob question, but how can I pass data_end to a 
> global function? I'm getting this error:
> 
> Validating cookie_init_timestamp_raw() func#1...
> arg#4 reference type('UNKNOWN ') size cannot be determined: -22
> processed 0 insns (limit 1000000) max_states_per_insn 0 total_states 0 
> peak_states 0 mark_read 0
> 
> When I removed data_end, I got another one:
> 
> ; opcode = ptr[0];
> 969: (71) r8 = *(u8 *)(r0 +0)
>   R0=mem(id=0,ref_obj_id=0,off=20,imm=0) 
> R1=mem(id=0,ref_obj_id=0,off=0,umin_value=4,umax_value=60,var_off=(0x0; 
> 0x3f),s32_min_value=0,s32_max_value=63,u32_max_value=63)
>   R2=invP0 R3=invP0 R4=mem_or_null(id=6,ref_obj_id=0,off=0,imm=0) 
> R5=invP0 R6=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) 
> R7=mem(id=0,ref_obj_id=0,off=0,imm=0) R10=fp0 fp
> -8=00000000 fp-16=invP15
> invalid access to memory, mem_size=20 off=20 size=1
> R0 min value is outside of the allowed memory range
> processed 20 insns (limit 1000000) max_states_per_insn 0 total_states 2 
> peak_states 2 mark_read 1
> 
> It looks like pointers to the context aren't supported:
> 
> https://www.spinics.net/lists/bpf/msg34907.html 
> 
>  > test_global_func11 - check that CTX pointer cannot be passed
> 
> What is the standard way to pass packet data to a global function?

Since global function is separately verified, you need to pass the 'ctx' 
to the global function and do the 'data_end' check again in the global
function. This will incur some packet re-parsing overhead similar to
tail calls.

> 
> Thanks,
> Max
> 
>>>
>>>>>
>>>>>>
>>>>>>> If you apply this tiny change, it fails the verifier after about 
>>>>>>> 3 hours:
>>>>>>>
>>>> [...]
>>>
>
Andrii Nakryiko Dec. 1, 2021, 6:06 p.m. UTC | #17
On Tue, Nov 30, 2021 at 10:40 PM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 11/29/21 9:51 AM, Maxim Mikityanskiy wrote:
> > On 2021-11-26 19:07, Yonghong Song wrote:
> >>
> >>
> >> On 11/26/21 8:50 AM, Maxim Mikityanskiy wrote:
> >>> On 2021-11-26 07:43, Yonghong Song wrote:
> >>>>
> >>>>
> >>>> On 11/25/21 6:34 AM, Maxim Mikityanskiy wrote:
> >>>>> On 2021-11-09 09:11, Yonghong Song wrote:
> >>>>>>
> >>>>>>
> >>>>>> On 11/3/21 7:02 AM, Maxim Mikityanskiy wrote:
> >>>>>>> On 2021-11-03 04:10, Yonghong Song wrote:
> >>>>>>>>
> >>>>>>>>
> >>>>>>>> On 11/1/21 4:14 AM, Maxim Mikityanskiy wrote:
> >>>>>>>>> On 2021-10-20 19:16, Toke Høiland-Jørgensen wrote:
> >>>>>>>>>> Lorenz Bauer <lmb@cloudflare.com> writes:
> >>>>>>>>>>
> >>>>>>>>>>>> +bool cookie_init_timestamp_raw(struct tcphdr *th, __be32
> >>>>>>>>>>>> *tsval, __be32 *tsecr)
> >>>>>>>>>>>
> >>>>>>>>>>> I'm probably missing context, Is there something in this
> >>>>>>>>>>> function that
> >>>>>>>>>>> means you can't implement it in BPF?
> >>>>>>>>>>
> >>>>>>>>>> I was about to reply with some other comments but upon closer
> >>>>>>>>>> inspection
> >>>>>>>>>> I ended up at the same conclusion: this helper doesn't seem to
> >>>>>>>>>> be needed
> >>>>>>>>>> at all?
> >>>>>>>>>
> >>>>>>>>> After trying to put this code into BPF (replacing the
> >>>>>>>>> underlying ktime_get_ns with ktime_get_mono_fast_ns), I
> >>>>>>>>> experienced issues with passing the verifier.
> >>>>>>>>>
> >>>>>>>>> In addition to comparing ptr to end, I had to add checks that
> >>>>>>>>> compare ptr to data_end, because the verifier can't deduce that
> >>>>>>>>> end <= data_end. More branches will add a certain slowdown (not
> >>>>>>>>> measured).
> >>>>>>>>>
> >>>>>>>>> A more serious issue is the overall program complexity. Even
> >>>>>>>>> though the loop over the TCP options has an upper bound, and
> >>>>>>>>> the pointer advances by at least one byte every iteration, I
> >>>>>>>>> had to limit the total number of iterations artificially. The
> >>>>>>>>> maximum number of iterations that makes the verifier happy is
> >>>>>>>>> 10. With more iterations, I have the following error:
> >>>>>>>>>
> >>>>>>>>> BPF program is too large. Processed 1000001 insn
> >>>>>>>>>
> >>>>>>>>>                         processed 1000001 insns (limit 1000000)
> >>>>>>>>> max_states_per_insn 29 total_states 35489 peak_states 596
> >>>>>>>>> mark_read 45
> >>>>>>>>>
> >>>>>>>>> I assume that BPF_COMPLEXITY_LIMIT_INSNS (1 million) is the
> >>>>>>>>> accumulated amount of instructions that the verifier can
> >>>>>>>>> process in all branches, is that right? It doesn't look
> >>>>>>>>> realistic that my program can run 1 million instructions in a
> >>>>>>>>> single run, but it might be that if you take all possible flows
> >>>>>>>>> and add up the instructions from these flows, it will exceed 1
> >>>>>>>>> million.
> >>>>>>>>>
> >>>>>>>>> The limitation of maximum 10 TCP options might be not enough,
> >>>>>>>>> given that valid packets are permitted to include more than 10
> >>>>>>>>> NOPs. An alternative of using bpf_load_hdr_opt and calling it
> >>>>>>>>> three times doesn't look good either, because it will be about
> >>>>>>>>> three times slower than going over the options once. So maybe
> >>>>>>>>> having a helper for that is better than trying to fit it into BPF?
> >>>>>>>>>
> >>>>>>>>> One more interesting fact is the time that it takes for the
> >>>>>>>>> verifier to check my program. If it's limited to 10 iterations,
> >>>>>>>>> it does it pretty fast, but if I try to increase the number to
> >>>>>>>>> 11 iterations, it takes several minutes for the verifier to
> >>>>>>>>> reach 1 million instructions and print the error then. I also
> >>>>>>>>> tried grouping the NOPs in an inner loop to count only 10 real
> >>>>>>>>> options, and the verifier has been running for a few hours
> >>>>>>>>> without any response. Is it normal?
> >>>>>>>>
> >>>>>>>> Maxim, this may expose a verifier bug. Do you have a reproducer
> >>>>>>>> I can access? I would like to debug this to see what is the root
> >>>>>>>> case. Thanks!
> >>>>>>>
> >>>>>>> Thanks, I appreciate your help in debugging it. The reproducer is
> >>>>>>> based on the modified XDP program from patch 10 in this series.
> >>>>>>> You'll need to apply at least patches 6, 7, 8 from this series to
> >>>>>>> get new BPF helpers needed for the XDP program (tell me if that's
> >>>>>>> a problem, I can try to remove usage of new helpers, but it will
> >>>>>>> affect the program length and may produce different results in
> >>>>>>> the verifier).
> >>>>>>>
> >>>>>>> See the C code of the program that passes the verifier (compiled
> >>>>>>> with clang version 12.0.0-1ubuntu1) in the bottom of this email.
> >>>>>>> If you increase the loop boundary from 10 to at least 11 in
> >>>>>>> cookie_init_timestamp_raw(), it fails the verifier after a few
> >>>>>>> minutes.
> >>>>>>
> >>>>>> I tried to reproduce with latest llvm (llvm-project repo),
> >>>>>> loop boundary 10 is okay and 11 exceeds the 1M complexity limit.
> >>>>>> For 10,
> >>>>>> the number of verified instructions is 563626 (more than 0.5M) so
> >>>>>> it is
> >>>>>> totally possible that one more iteration just blows past the limit.
> >>>>>
> >>>>> So, does it mean that the verifying complexity grows exponentially
> >>>>> with increasing the number of loop iterations (options parsed)?
> >>>>
> >>>> Depending on verification time pruning results, it is possible
> >>>> slightly increase number of branches could result quite some (2x,
> >>>> 4x, etc.) of
> >>>> to-be-verified dynamic instructions.
> >>>
> >>> Is it at least theoretically possible to make this coefficient below
> >>> 2x? I.e. write a loop, so that adding another iteration will not
> >>> double the number of verified instructions, but will have a smaller
> >>> increase?
> >>>
> >>> If that's not possible, then it looks like BPF can't have loops
> >>> bigger than ~19 iterations (2^20 > 1M), and this function is not
> >>> implementable in BPF.
> >>
> >> This is the worst case. As I mentioned pruning plays a huge role in
> >> verification. Effective pruning can add little increase of dynamic
> >> instructions say from 19 iterations to 20 iterations. But we have
> >> to look at verifier log to find out whether pruning is less effective or
> >> something else... Based on my experience, in most cases, pruning is
> >> quite effective. But occasionally it is not... You can look at
> >> verifier.c file to roughly understand how pruning work.
> >>
> >> Not sure whether in this case it is due to less effective pruning or
> >> inherently we just have to go through all these dynamic instructions
> >> for verification.
> >>
> >>>
> >>>>>
> >>>>> Is it a good enough reason to keep this code as a BPF helper,
> >>>>> rather than trying to fit it into the BPF program?
> >>>>
> >>>> Another option is to use global function, which is verified separately
> >>>> from the main bpf program.
> >>>
> >>> Simply removing __always_inline didn't change anything. Do I need to
> >>> make any other changes? Will it make sense to call a global function
> >>> in a loop, i.e. will it increase chances to pass the verifier?
> >>
> >> global function cannot be static function. You can try
> >> either global function inside the loop or global function
> >> containing the loop. It probably more effective to put loops
> >> inside the global function. You have to do some experiments
> >> to see which one is better.
> >
> > Sorry for a probably noob question, but how can I pass data_end to a
> > global function? I'm getting this error:
> >
> > Validating cookie_init_timestamp_raw() func#1...
> > arg#4 reference type('UNKNOWN ') size cannot be determined: -22
> > processed 0 insns (limit 1000000) max_states_per_insn 0 total_states 0
> > peak_states 0 mark_read 0
> >
> > When I removed data_end, I got another one:
> >
> > ; opcode = ptr[0];
> > 969: (71) r8 = *(u8 *)(r0 +0)
> >   R0=mem(id=0,ref_obj_id=0,off=20,imm=0)
> > R1=mem(id=0,ref_obj_id=0,off=0,umin_value=4,umax_value=60,var_off=(0x0;
> > 0x3f),s32_min_value=0,s32_max_value=63,u32_max_value=63)
> >   R2=invP0 R3=invP0 R4=mem_or_null(id=6,ref_obj_id=0,off=0,imm=0)
> > R5=invP0 R6=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0)
> > R7=mem(id=0,ref_obj_id=0,off=0,imm=0) R10=fp0 fp
> > -8=00000000 fp-16=invP15
> > invalid access to memory, mem_size=20 off=20 size=1
> > R0 min value is outside of the allowed memory range
> > processed 20 insns (limit 1000000) max_states_per_insn 0 total_states 2
> > peak_states 2 mark_read 1
> >
> > It looks like pointers to the context aren't supported:
> >
> > https://www.spinics.net/lists/bpf/msg34907.html
> >
> >  > test_global_func11 - check that CTX pointer cannot be passed
> >
> > What is the standard way to pass packet data to a global function?
>
> Since global function is separately verified, you need to pass the 'ctx'
> to the global function and do the 'data_end' check again in the global
> function. This will incur some packet re-parsing overhead similar to
> tail calls.

Now that the bpf_loop() helper landed, it's another option for doing
repeated work. Please see [0].

  [0] https://patchwork.kernel.org/project/netdevbpf/list/?series=587497&state=*

>
> >
> > Thanks,
> > Max
> >
> >>>
> >>>>>
> >>>>>>
> >>>>>>> If you apply this tiny change, it fails the verifier after about
> >>>>>>> 3 hours:
> >>>>>>>
> >>>> [...]
> >>>
> >
diff mbox series

Patch

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 1cc96a225848..651820bef6a2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -564,6 +564,7 @@  u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
 			      u16 *mssp);
 __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss);
 u64 cookie_init_timestamp(struct request_sock *req, u64 now);
+bool cookie_init_timestamp_raw(struct tcphdr *th, __be32 *tsval, __be32 *tsecr);
 bool cookie_timestamp_decode(const struct net *net,
 			     struct tcp_options_received *opt);
 bool cookie_ecn_ok(const struct tcp_options_received *opt,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e32f72077250..791790b41874 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5053,6 +5053,32 @@  union bpf_attr {
  *
  *		**-EPROTONOSUPPORT** if the IP version is not 4 or 6 (or 6, but
  *		CONFIG_IPV6 is disabled).
+ *
+ * int bpf_tcp_raw_gen_tscookie(struct tcphdr *th, u32 th_len, __be32 *tsopt, u32 tsopt_len)
+ *	Description
+ *		Try to generate a timestamp cookie which encodes some of the
+ *		flags sent by the client in the SYN packet: SACK support, ECN
+ *		support, window scale. To be used with SYN cookies.
+ *
+ *		*th* points to the start of the TCP header of the client's SYN
+ *		packet, while *th_len* contains the length of the TCP header (at
+ *		least **sizeof**\ (**struct tcphdr**)).
+ *
+ *		*tsopt* points to the output location where to put the resulting
+ *		timestamp values: tsval and tsecr, in the format of the TCP
+ *		timestamp option.
+ *
+ *	Return
+ *		On success, 0.
+ *
+ *		On failure, the returned value is one of the following:
+ *
+ *		**-EINVAL** if the input arguments are invalid.
+ *
+ *		**-ENOENT** if the TCP header doesn't have the timestamp option.
+ *
+ *		**-EOPNOTSUPP** if the kernel configuration does not enable SYN
+ *		cookies (CONFIG_SYN_COOKIES is off).
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5238,6 +5264,7 @@  union bpf_attr {
 	FN(ct_release),			\
 	FN(tcp_raw_gen_syncookie),	\
 	FN(tcp_raw_check_syncookie),	\
+	FN(tcp_raw_gen_tscookie),	\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/net/core/filter.c b/net/core/filter.c
index 5f03d4a282a0..73fe20ef7442 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -7403,6 +7403,42 @@  static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_proto = {
 	.arg4_type	= ARG_CONST_SIZE,
 };
 
+BPF_CALL_4(bpf_tcp_raw_gen_tscookie, struct tcphdr *, th, u32, th_len,
+	   __be32 *, tsopt, u32, tsopt_len)
+{
+	int err;
+
+#ifdef CONFIG_SYN_COOKIES
+	if (tsopt_len != sizeof(u64)) {
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	if (!cookie_init_timestamp_raw(th, &tsopt[0], &tsopt[1])) {
+		err = -ENOENT;
+		goto err_out;
+	}
+
+	return 0;
+err_out:
+#else
+	err = -EOPNOTSUPP;
+#endif
+	memset(tsopt, 0, tsopt_len);
+	return err;
+}
+
+static const struct bpf_func_proto bpf_tcp_raw_gen_tscookie_proto = {
+	.func		= bpf_tcp_raw_gen_tscookie,
+	.gpl_only	= false,
+	.pkt_access	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_MEM,
+	.arg2_type	= ARG_CONST_SIZE,
+	.arg3_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg4_type	= ARG_CONST_SIZE,
+};
+
 #endif /* CONFIG_INET */
 
 bool bpf_helper_changes_pkt_data(void *func)
@@ -7825,6 +7861,8 @@  xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_tcp_raw_gen_syncookie_proto;
 	case BPF_FUNC_tcp_raw_check_syncookie:
 		return &bpf_tcp_raw_check_syncookie_proto;
+	case BPF_FUNC_tcp_raw_gen_tscookie:
+		return &bpf_tcp_raw_gen_tscookie_proto;
 #endif
 	default:
 		return bpf_sk_base_func_proto(func_id);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 8696dc343ad2..4dd2c7a096eb 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -85,6 +85,66 @@  u64 cookie_init_timestamp(struct request_sock *req, u64 now)
 	return (u64)ts * (NSEC_PER_SEC / TCP_TS_HZ);
 }
 
+bool cookie_init_timestamp_raw(struct tcphdr *th, __be32 *tsval, __be32 *tsecr)
+{
+	int length = (th->doff * 4) - sizeof(*th);
+	u8 wscale = TS_OPT_WSCALE_MASK;
+	bool option_timestamp = false;
+	bool option_sack = false;
+	u32 cookie;
+	u8 *ptr;
+
+	ptr = (u8 *)(th + 1);
+
+	while (length > 0) {
+		u8 opcode = *ptr++;
+		u8 opsize;
+
+		if (opcode == TCPOPT_EOL)
+			break;
+		if (opcode == TCPOPT_NOP) {
+			length--;
+			continue;
+		}
+
+		if (length < 2)
+			break;
+		opsize = *ptr++;
+		if (opsize < 2)
+			break;
+		if (opsize > length)
+			break;
+
+		switch (opcode) {
+		case TCPOPT_WINDOW:
+			wscale = min_t(u8, *ptr, TCP_MAX_WSCALE);
+			break;
+		case TCPOPT_TIMESTAMP:
+			option_timestamp = true;
+			/* Client's tsval becomes our tsecr. */
+			*tsecr = cpu_to_be32(get_unaligned_be32(ptr));
+			break;
+		case TCPOPT_SACK_PERM:
+			option_sack = true;
+			break;
+		}
+
+		ptr += opsize - 2;
+		length -= opsize;
+	}
+
+	if (!option_timestamp)
+		return false;
+
+	cookie = tcp_time_stamp_raw() & ~TSMASK;
+	cookie |= wscale & TS_OPT_WSCALE_MASK;
+	if (option_sack)
+		cookie |= TS_OPT_SACK;
+	if (th->ece && th->cwr)
+		cookie |= TS_OPT_ECN;
+	*tsval = cpu_to_be32(cookie);
+	return true;
+}
 
 static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport,
 				   __be16 dport, __u32 sseq, __u32 data)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e32f72077250..791790b41874 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5053,6 +5053,32 @@  union bpf_attr {
  *
  *		**-EPROTONOSUPPORT** if the IP version is not 4 or 6 (or 6, but
  *		CONFIG_IPV6 is disabled).
+ *
+ * int bpf_tcp_raw_gen_tscookie(struct tcphdr *th, u32 th_len, __be32 *tsopt, u32 tsopt_len)
+ *	Description
+ *		Try to generate a timestamp cookie which encodes some of the
+ *		flags sent by the client in the SYN packet: SACK support, ECN
+ *		support, window scale. To be used with SYN cookies.
+ *
+ *		*th* points to the start of the TCP header of the client's SYN
+ *		packet, while *th_len* contains the length of the TCP header (at
+ *		least **sizeof**\ (**struct tcphdr**)).
+ *
+ *		*tsopt* points to the output location where to put the resulting
+ *		timestamp values: tsval and tsecr, in the format of the TCP
+ *		timestamp option.
+ *
+ *	Return
+ *		On success, 0.
+ *
+ *		On failure, the returned value is one of the following:
+ *
+ *		**-EINVAL** if the input arguments are invalid.
+ *
+ *		**-ENOENT** if the TCP header doesn't have the timestamp option.
+ *
+ *		**-EOPNOTSUPP** if the kernel configuration does not enable SYN
+ *		cookies (CONFIG_SYN_COOKIES is off).
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5238,6 +5264,7 @@  union bpf_attr {
 	FN(ct_release),			\
 	FN(tcp_raw_gen_syncookie),	\
 	FN(tcp_raw_check_syncookie),	\
+	FN(tcp_raw_gen_tscookie),	\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper