
[bpf-next,v1,1/3] bpf: Add skb dynptrs

Message ID 20220726184706.954822-2-joannelkoong@gmail.com
State Changes Requested
Delegated to: BPF
Series Add skb + xdp dynptrs

Checks

Context Check Description
bpf/vmtest-bpf-next-PR fail merge-conflict
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/apply fail Patch does not apply to bpf-next

Commit Message

Joanne Koong July 26, 2022, 6:47 p.m. UTC
Add skb dynptrs, which are dynptrs whose underlying pointer points
to an skb. The dynptr acts on skb data. skb dynptrs have two main
benefits. One is that they allow operations on sizes that are not
statically known at compile-time (e.g. variable-sized accesses).
Another is that parsing the packet data through dynptrs (instead of
through direct access of skb->data and skb->data_end) can be more
ergonomic and less brittle (e.g. no manual if-checks against
data_end are needed to stay within bounds).

For bpf prog types that don't support writes on skb data, the dynptr is
read-only (writes and data slices are not permitted). Reads on the
dynptr may access data in the non-linear paged buffers, but for writes
and data slices, if the data is in a paged buffer, the user must first
call bpf_skb_pull_data to pull the data into the linear portion.
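
For example, a write that may land in the paged area could be handled
roughly like this (a sketch; 'ptr' is a dynptr obtained from the skb,
'off' and 'val' are hypothetical):

  err = bpf_dynptr_write(&ptr, off, &val, sizeof(val), 0);
  if (err == -EAGAIN) {
  	/* the bytes live in a paged buffer: pull them into the
  	 * linear portion and retry. The dynptr itself stays valid
  	 * across the pull; only data slices are invalidated.
  	 */
  	if (!bpf_skb_pull_data(skb, off + sizeof(val)))
  		err = bpf_dynptr_write(&ptr, off, &val, sizeof(val), 0);
  }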

Additionally, any helper call that changes the underlying packet buffer
(e.g. bpf_skb_pull_data) invalidates any data slices of the associated
dynptr.
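
For example, the verifier is expected to reject a sequence like the
following sketch, where the slice 'p' is used after the pull:

  __u8 *p = bpf_dynptr_data(&ptr, 0, 14);

  if (!p)
  	return 0;
  bpf_skb_pull_data(skb, 64);	/* may change the packet buffer */
  return p[0];			/* rejected: slice was invalidated */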

Right now, skb dynptrs can only be constructed from skbs that are
the bpf program context - as such, there does not need to be any
reference tracking or release on skb dynptrs.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
 include/linux/bpf.h            |  8 ++++-
 include/linux/filter.h         |  4 +++
 include/uapi/linux/bpf.h       | 42 ++++++++++++++++++++++++--
 kernel/bpf/helpers.c           | 54 +++++++++++++++++++++++++++++++++-
 kernel/bpf/verifier.c          | 43 +++++++++++++++++++++++----
 net/core/filter.c              | 53 ++++++++++++++++++++++++++++++---
 tools/include/uapi/linux/bpf.h | 42 ++++++++++++++++++++++++--
 7 files changed, 229 insertions(+), 17 deletions(-)

Comments

Stanislav Fomichev July 27, 2022, 5:13 p.m. UTC | #1
On 07/26, Joanne Koong wrote:
> Add skb dynptrs, which are dynptrs whose underlying pointer points
> to an skb. The dynptr acts on skb data. skb dynptrs have two main
> benefits. One is that they allow operations on sizes that are not
> statically known at compile-time (e.g. variable-sized accesses).
> Another is that parsing the packet data through dynptrs (instead of
> through direct access of skb->data and skb->data_end) can be more
> ergonomic and less brittle (e.g. no manual if-checks against
> data_end are needed to stay within bounds).

> For bpf prog types that don't support writes on skb data, the dynptr is
> read-only (writes and data slices are not permitted). Reads on the
> dynptr may access data in the non-linear paged buffers, but for writes
> and data slices, if the data is in a paged buffer, the user must first
> call bpf_skb_pull_data to pull the data into the linear portion.

> Additionally, any helper call that changes the underlying packet buffer
> (e.g. bpf_skb_pull_data) invalidates any data slices of the associated
> dynptr.

> Right now, skb dynptrs can only be constructed from skbs that are
> the bpf program context - as such, there does not need to be any
> reference tracking or release on skb dynptrs.

> Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> ---
>   include/linux/bpf.h            |  8 ++++-
>   include/linux/filter.h         |  4 +++
>   include/uapi/linux/bpf.h       | 42 ++++++++++++++++++++++++--
>   kernel/bpf/helpers.c           | 54 +++++++++++++++++++++++++++++++++-
>   kernel/bpf/verifier.c          | 43 +++++++++++++++++++++++----
>   net/core/filter.c              | 53 ++++++++++++++++++++++++++++++---
>   tools/include/uapi/linux/bpf.h | 42 ++++++++++++++++++++++++--
>   7 files changed, 229 insertions(+), 17 deletions(-)

> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 20c26aed7896..7fbd4324c848 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -407,11 +407,14 @@ enum bpf_type_flag {
>   	/* Size is known at compile time. */
>   	MEM_FIXED_SIZE		= BIT(10 + BPF_BASE_TYPE_BITS),

> +	/* DYNPTR points to sk_buff */
> +	DYNPTR_TYPE_SKB		= BIT(11 + BPF_BASE_TYPE_BITS),
> +
>   	__BPF_TYPE_FLAG_MAX,
>   	__BPF_TYPE_LAST_FLAG	= __BPF_TYPE_FLAG_MAX - 1,
>   };

> -#define DYNPTR_TYPE_FLAG_MASK	(DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF)
> +#define DYNPTR_TYPE_FLAG_MASK	(DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB)

>   /* Max number of base types. */
>   #define BPF_BASE_TYPE_LIMIT	(1UL << BPF_BASE_TYPE_BITS)
> @@ -2556,12 +2559,15 @@ enum bpf_dynptr_type {
>   	BPF_DYNPTR_TYPE_LOCAL,
>   	/* Underlying data is a ringbuf record */
>   	BPF_DYNPTR_TYPE_RINGBUF,
> +	/* Underlying data is a sk_buff */
> +	BPF_DYNPTR_TYPE_SKB,
>   };

>   void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
>   		     enum bpf_dynptr_type type, u32 offset, u32 size);
>   void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr);
>   int bpf_dynptr_check_size(u32 size);
> +void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr);

>   #ifdef CONFIG_BPF_LSM
>   void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype);
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index a5f21dc3c432..649063d9cbfd 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -1532,4 +1532,8 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifind
>   	return XDP_REDIRECT;
>   }

> +int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len);
> +int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from,
> +			  u32 len, u64 flags);
> +
>   #endif /* __LINUX_FILTER_H__ */
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 59a217ca2dfd..0730cd198a7f 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -5241,11 +5241,22 @@ union bpf_attr {
>    *	Description
>    *		Write *len* bytes from *src* into *dst*, starting from *offset*
>    *		into *dst*.
> - *		*flags* is currently unused.
> + *
> + *		*flags* must be 0 except for skb-type dynptrs.
> + *
> + *		For skb-type dynptrs:
> + *		    *  if *offset* + *len* extends into the skb's paged buffers, the user
> + *		       should manually pull the skb with bpf_skb_pull_data and then try again.
> + *
> + *		    *  *flags* are a combination of **BPF_F_RECOMPUTE_CSUM** (automatically
> + *			recompute the checksum for the packet after storing the bytes) and
> + *			**BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\
> + *			**->swhash** and *skb*\ **->l4hash** to 0).
>    *	Return
>    *		0 on success, -E2BIG if *offset* + *len* exceeds the length
>    *		of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst*
> - *		is a read-only dynptr or if *flags* is not 0.
> + *		is a read-only dynptr or if *flags* is not correct, -EAGAIN if, for
> + *		skb-type dynptrs, the write extends into the skb's paged buffers.
>    *
>    * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len)
>    *	Description
> @@ -5253,10 +5264,19 @@ union bpf_attr {
>    *
>    *		*len* must be a statically known value. The returned data slice
>    *		is invalidated whenever the dynptr is invalidated.
> + *
> + *		For skb-type dynptrs:
> + *		    * if *offset* + *len* extends into the skb's paged buffers,
> + *		      the user should manually pull the skb with bpf_skb_pull_data
> + *		      and then try again.
> + *
> + *		    * the data slice is automatically invalidated anytime a
> + *		      helper call that changes the underlying packet buffer
> + *		      (eg bpf_skb_pull_data) is called.
>    *	Return
>    *		Pointer to the underlying dynptr data, NULL if the dynptr is
>    *		read-only, if the dynptr is invalid, or if the offset and length
> - *		is out of bounds.
> + *		is out of bounds or in a paged buffer for skb-type dynptrs.
>    *
>    * s64 bpf_tcp_raw_gen_syncookie_ipv4(struct iphdr *iph, struct tcphdr *th, u32 th_len)
>    *	Description
> @@ -5331,6 +5351,21 @@ union bpf_attr {
>    *		**-EACCES** if the SYN cookie is not valid.
>    *
>    *		**-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin.
> + *
> + * long bpf_dynptr_from_skb(struct sk_buff *skb, u64 flags, struct bpf_dynptr *ptr)
> + *	Description
> + *		Get a dynptr to the data in *skb*. *skb* must be the BPF program
> + *		context. Depending on program type, the dynptr may be read-only,
> + *		in which case trying to obtain a direct data slice to it through
> + *		bpf_dynptr_data will return an error.
> + *
> + *		Calls that change the *skb*'s underlying packet buffer
> + *		(eg bpf_skb_pull_data) do not invalidate the dynptr, but they do
> + *		invalidate any data slices associated with the dynptr.
> + *
> + *		*flags* is currently unused; it must be 0 for now.
> + *	Return
> + *		0 on success or -EINVAL if flags is not 0.
>    */
>   #define __BPF_FUNC_MAPPER(FN)		\
>   	FN(unspec),			\
> @@ -5541,6 +5576,7 @@ union bpf_attr {
>   	FN(tcp_raw_gen_syncookie_ipv6),	\
>   	FN(tcp_raw_check_syncookie_ipv4),	\
>   	FN(tcp_raw_check_syncookie_ipv6),	\
> +	FN(dynptr_from_skb),		\
>   	/* */

>   /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index 1f961f9982d2..21a806057e9e 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -1425,11 +1425,21 @@ static bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr)
>   	return ptr->size & DYNPTR_RDONLY_BIT;
>   }

> +void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr)
> +{
> +	ptr->size |= DYNPTR_RDONLY_BIT;
> +}
> +
>   static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type)
>   {
>   	ptr->size |= type << DYNPTR_TYPE_SHIFT;
>   }

> +static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *ptr)
> +{
> +	return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT;
> +}
> +
>   static u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr)
>   {
>   	return ptr->size & DYNPTR_SIZE_MASK;
> @@ -1500,6 +1510,7 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
>   BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src,
>   	   u32, offset, u64, flags)
>   {
> +	enum bpf_dynptr_type type;
>   	int err;

>   	if (!src->data || flags)
> @@ -1509,6 +1520,11 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src
>   	if (err)
>   		return err;

> +	type = bpf_dynptr_get_type(src);
> +
> +	if (type == BPF_DYNPTR_TYPE_SKB)
> +		return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len);
> +
>   	memcpy(dst, src->data + src->offset + offset, len);

>   	return 0;
> @@ -1528,15 +1544,38 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = {
>   BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src,
>   	   u32, len, u64, flags)
>   {
> +	enum bpf_dynptr_type type;
>   	int err;

> -	if (!dst->data || flags || bpf_dynptr_is_rdonly(dst))
> +	if (!dst->data || bpf_dynptr_is_rdonly(dst))
>   		return -EINVAL;

>   	err = bpf_dynptr_check_off_len(dst, offset, len);
>   	if (err)
>   		return err;

> +	type = bpf_dynptr_get_type(dst);
> +
> +	if (flags) {
> +		if (type == BPF_DYNPTR_TYPE_SKB) {
> +			if (flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))
> +				return -EINVAL;
> +		} else {
> +			return -EINVAL;
> +		}
> +	}
> +
> +	if (type == BPF_DYNPTR_TYPE_SKB) {
> +		struct sk_buff *skb = dst->data;
> +
> +		/* if the data is paged, the caller needs to pull it first */
> +		if (dst->offset + offset + len > skb->len - skb->data_len)

Use skb_headlen instead of 'skb->len - skb->data_len' ?
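
For reference, skb_headlen() in include/linux/skbuff.h is exactly this
computation:

static inline unsigned int skb_headlen(const struct sk_buff *skb)
{
	return skb->len - skb->data_len;
}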

> +			return -EAGAIN;
> +
> +		return __bpf_skb_store_bytes(skb, dst->offset + offset, src, len,
> +					     flags);
> +	}
> +
>   	memcpy(dst->data + dst->offset + offset, src, len);

>   	return 0;
> @@ -1555,6 +1594,7 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = {

>   BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len)
>   {
> +	enum bpf_dynptr_type type;
>   	int err;

>   	if (!ptr->data)
> @@ -1567,6 +1607,18 @@ BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len
>   	if (bpf_dynptr_is_rdonly(ptr))
>   		return 0;

> +	type = bpf_dynptr_get_type(ptr);
> +
> +	if (type == BPF_DYNPTR_TYPE_SKB) {
> +		struct sk_buff *skb = ptr->data;
> +
> +		/* if the data is paged, the caller needs to pull it first */
> +		if (ptr->offset + offset + len > skb->len - skb->data_len)
> +			return 0;

Same here?

> +
> +		return (unsigned long)(skb->data + ptr->offset + offset);
> +	}
> +
>   	return (unsigned long)(ptr->data + ptr->offset + offset);
>   }

> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 0d523741a543..0838653eeb4e 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -263,6 +263,7 @@ struct bpf_call_arg_meta {
>   	u32 subprogno;
>   	struct bpf_map_value_off_desc *kptr_off_desc;
>   	u8 uninit_dynptr_regno;
> +	enum bpf_dynptr_type type;
>   };

>   struct btf *btf_vmlinux;
> @@ -678,6 +679,8 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
>   		return BPF_DYNPTR_TYPE_LOCAL;
>   	case DYNPTR_TYPE_RINGBUF:
>   		return BPF_DYNPTR_TYPE_RINGBUF;
> +	case DYNPTR_TYPE_SKB:
> +		return BPF_DYNPTR_TYPE_SKB;
>   	default:
>   		return BPF_DYNPTR_TYPE_INVALID;
>   	}
> @@ -5820,12 +5823,14 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
>   	return __check_ptr_off_reg(env, reg, regno, fixed_off_ok);
>   }

> -static u32 stack_slot_get_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
> +static void stack_slot_get_dynptr_info(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
> +				       struct bpf_call_arg_meta *meta)
>   {
>   	struct bpf_func_state *state = func(env, reg);
>   	int spi = get_spi(reg->off);

> -	return state->stack[spi].spilled_ptr.id;
> +	meta->ref_obj_id = state->stack[spi].spilled_ptr.id;
> +	meta->type = state->stack[spi].spilled_ptr.dynptr.type;
>   }

>   static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
> @@ -6052,6 +6057,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
>   				case DYNPTR_TYPE_RINGBUF:
>   					err_extra = "ringbuf ";
>   					break;
> +				case DYNPTR_TYPE_SKB:
> +					err_extra = "skb ";
> +					break;
>   				default:
>   					break;
>   				}
> @@ -6065,8 +6073,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
>   					verbose(env, "verifier internal error: multiple refcounted args in BPF_FUNC_dynptr_data");
>   					return -EFAULT;
>   				}
> -				/* Find the id of the dynptr we're tracking the reference of */
> -				meta->ref_obj_id = stack_slot_get_id(env, reg);
> +				/* Find the id and the type of the dynptr we're tracking
> +				 * the reference of.
> +				 */
> +				stack_slot_get_dynptr_info(env, reg, meta);
>   			}
>   		}
>   		break;
> @@ -7406,7 +7416,11 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
>   		regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag;
>   	} else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) {
>   		mark_reg_known_zero(env, regs, BPF_REG_0);
> -		regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
> +		if (func_id == BPF_FUNC_dynptr_data &&
> +		    meta.type == BPF_DYNPTR_TYPE_SKB)
> +			regs[BPF_REG_0].type = PTR_TO_PACKET | ret_flag;
> +		else
> +			regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
>   		regs[BPF_REG_0].mem_size = meta.mem_size;
>   	} else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) {
>   		const struct btf_type *t;
> @@ -14132,6 +14146,25 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
>   			goto patch_call_imm;
>   		}


[..]

> +		if (insn->imm == BPF_FUNC_dynptr_from_skb) {
> +			if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE))
> +				insn_buf[0] = BPF_MOV32_IMM(BPF_REG_4, true);
> +			else
> +				insn_buf[0] = BPF_MOV32_IMM(BPF_REG_4, false);
> +			insn_buf[1] = *insn;
> +			cnt = 2;
> +
> +			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
> +			if (!new_prog)
> +				return -ENOMEM;
> +
> +			delta += cnt - 1;
> +			env->prog = new_prog;
> +			prog = new_prog;
> +			insn = new_prog->insnsi + i + delta;
> +			goto patch_call_imm;
> +		}

Would it be easier to have two separate helpers:
- BPF_FUNC_dynptr_from_skb
- BPF_FUNC_dynptr_from_skb_readonly

And make the verifier rewrite insn->imm to
BPF_FUNC_dynptr_from_skb_readonly when needed?

if (insn->imm == BPF_FUNC_dynptr_from_skb) {
	if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE))
		insn->imm = BPF_FUNC_dynptr_from_skb_readonly;
}

Or it's also ugly because we'd have to leak that new helper into UAPI?
(I wonder whether that hidden 4th argument is too magical, but probably
fine?)

> +
>   		/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
>   		 * and other inlining handlers are currently limited to 64 bit
>   		 * only.
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 5669248aff25..312f99deb759 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -1681,8 +1681,8 @@ static inline void bpf_pull_mac_rcsum(struct sk_buff *skb)
>   		skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len);
>   }

> -BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset,
> -	   const void *, from, u32, len, u64, flags)
> +int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from,
> +			  u32 len, u64 flags)
>   {
>   	void *ptr;

> @@ -1707,6 +1707,12 @@ BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset,
>   	return 0;
>   }

> +BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset,
> +	   const void *, from, u32, len, u64, flags)
> +{
> +	return __bpf_skb_store_bytes(skb, offset, from, len, flags);
> +}
> +
>   static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
>   	.func		= bpf_skb_store_bytes,
>   	.gpl_only	= false,
> @@ -1718,8 +1724,7 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
>   	.arg5_type	= ARG_ANYTHING,
>   };

> -BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset,
> -	   void *, to, u32, len)
> +int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len)
>   {
>   	void *ptr;

> @@ -1738,6 +1743,12 @@ BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset,
>   	return -EFAULT;
>   }

> +BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset,
> +	   void *, to, u32, len)
> +{
> +	return __bpf_skb_load_bytes(skb, offset, to, len);
> +}
> +
>   static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
>   	.func		= bpf_skb_load_bytes,
>   	.gpl_only	= false,
> @@ -1849,6 +1860,32 @@ static const struct bpf_func_proto bpf_skb_pull_data_proto = {
>   	.arg2_type	= ARG_ANYTHING,
>   };

> +/* is_rdonly is set by the verifier */
> +BPF_CALL_4(bpf_dynptr_from_skb, struct sk_buff *, skb, u64, flags,
> +	   struct bpf_dynptr_kern *, ptr, u32, is_rdonly)
> +{
> +	if (flags) {
> +		bpf_dynptr_set_null(ptr);
> +		return -EINVAL;
> +	}
> +
> +	bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB, 0, skb->len);
> +
> +	if (is_rdonly)
> +		bpf_dynptr_set_rdonly(ptr);
> +
> +	return 0;
> +}
> +
> +static const struct bpf_func_proto bpf_dynptr_from_skb_proto = {
> +	.func		= bpf_dynptr_from_skb,
> +	.gpl_only	= false,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type	= ARG_PTR_TO_CTX,
> +	.arg2_type	= ARG_ANYTHING,
> +	.arg3_type	= ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_SKB | MEM_UNINIT,
> +};
> +
>   BPF_CALL_1(bpf_sk_fullsock, struct sock *, sk)
>   {
>   	return sk_fullsock(sk) ? (unsigned long)sk : (unsigned long)NULL;
> @@ -7808,6 +7845,8 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>   		return &bpf_get_socket_uid_proto;
>   	case BPF_FUNC_perf_event_output:
>   		return &bpf_skb_event_output_proto;
> +	case BPF_FUNC_dynptr_from_skb:
> +		return &bpf_dynptr_from_skb_proto;
>   	default:
>   		return bpf_sk_base_func_proto(func_id);
>   	}
> @@ -7991,6 +8030,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>   		return &bpf_tcp_raw_check_syncookie_ipv6_proto;
>   #endif
>   #endif
> +	case BPF_FUNC_dynptr_from_skb:
> +		return &bpf_dynptr_from_skb_proto;
>   	default:
>   		return bpf_sk_base_func_proto(func_id);
>   	}
> @@ -8186,6 +8227,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>   	case BPF_FUNC_skc_lookup_tcp:
>   		return &bpf_skc_lookup_tcp_proto;
>   #endif
> +	case BPF_FUNC_dynptr_from_skb:
> +		return &bpf_dynptr_from_skb_proto;
>   	default:
>   		return bpf_sk_base_func_proto(func_id);
>   	}
> @@ -8224,6 +8267,8 @@ lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>   		return &bpf_get_smp_processor_id_proto;
>   	case BPF_FUNC_skb_under_cgroup:
>   		return &bpf_skb_under_cgroup_proto;
> +	case BPF_FUNC_dynptr_from_skb:
> +		return &bpf_dynptr_from_skb_proto;
>   	default:
>   		return bpf_sk_base_func_proto(func_id);
>   	}
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index 59a217ca2dfd..0730cd198a7f 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -5241,11 +5241,22 @@ union bpf_attr {
>    *	Description
>    *		Write *len* bytes from *src* into *dst*, starting from *offset*
>    *		into *dst*.
> - *		*flags* is currently unused.
> + *
> + *		*flags* must be 0 except for skb-type dynptrs.
> + *
> + *		For skb-type dynptrs:
> + *		    *  if *offset* + *len* extends into the skb's paged buffers, the user
> + *		       should manually pull the skb with bpf_skb_pull_data and then try again.
> + *
> + *		    *  *flags* are a combination of **BPF_F_RECOMPUTE_CSUM** (automatically
> + *			recompute the checksum for the packet after storing the bytes) and
> + *			**BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\
> + *			**->swhash** and *skb*\ **->l4hash** to 0).
>    *	Return
>    *		0 on success, -E2BIG if *offset* + *len* exceeds the length
>    *		of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst*
> - *		is a read-only dynptr or if *flags* is not 0.
> + *		is a read-only dynptr or if *flags* is not correct, -EAGAIN if, for
> + *		skb-type dynptrs, the write extends into the skb's paged buffers.
>    *
>    * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len)
>    *	Description
> @@ -5253,10 +5264,19 @@ union bpf_attr {
>    *
>    *		*len* must be a statically known value. The returned data slice
>    *		is invalidated whenever the dynptr is invalidated.
> + *
> + *		For skb-type dynptrs:
> + *		    * if *offset* + *len* extends into the skb's paged buffers,
> + *		      the user should manually pull the skb with bpf_skb_pull_data
> + *		      and then try again.
> + *
> + *		    * the data slice is automatically invalidated anytime a
> + *		      helper call that changes the underlying packet buffer
> + *		      (eg bpf_skb_pull_data) is called.
>    *	Return
>    *		Pointer to the underlying dynptr data, NULL if the dynptr is
>    *		read-only, if the dynptr is invalid, or if the offset and length
> - *		is out of bounds.
> + *		is out of bounds or in a paged buffer for skb-type dynptrs.
>    *
>    * s64 bpf_tcp_raw_gen_syncookie_ipv4(struct iphdr *iph, struct tcphdr *th, u32 th_len)
>    *	Description
> @@ -5331,6 +5351,21 @@ union bpf_attr {
>    *		**-EACCES** if the SYN cookie is not valid.
>    *
>    *		**-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin.
> + *
> + * long bpf_dynptr_from_skb(struct sk_buff *skb, u64 flags, struct bpf_dynptr *ptr)
> + *	Description
> + *		Get a dynptr to the data in *skb*. *skb* must be the BPF program
> + *		context. Depending on program type, the dynptr may be read-only,
> + *		in which case trying to obtain a direct data slice to it through
> + *		bpf_dynptr_data will return an error.
> + *
> + *		Calls that change the *skb*'s underlying packet buffer
> + *		(eg bpf_skb_pull_data) do not invalidate the dynptr, but they do
> + *		invalidate any data slices associated with the dynptr.
> + *
> + *		*flags* is currently unused; it must be 0 for now.
> + *	Return
> + *		0 on success or -EINVAL if flags is not 0.
>    */
>   #define __BPF_FUNC_MAPPER(FN)		\
>   	FN(unspec),			\
> @@ -5541,6 +5576,7 @@ union bpf_attr {
>   	FN(tcp_raw_gen_syncookie_ipv6),	\
>   	FN(tcp_raw_check_syncookie_ipv4),	\
>   	FN(tcp_raw_check_syncookie_ipv6),	\
> +	FN(dynptr_from_skb),		\
>   	/* */

>   /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> --
> 2.30.2
Joanne Koong July 28, 2022, 4:49 p.m. UTC | #2
On Wed, Jul 27, 2022 at 10:14 AM <sdf@google.com> wrote:
>
> On 07/26, Joanne Koong wrote:

[..]

> > +     if (type == BPF_DYNPTR_TYPE_SKB) {
> > +             struct sk_buff *skb = dst->data;
> > +
> > +             /* if the data is paged, the caller needs to pull it first */
> > +             if (dst->offset + offset + len > skb->len - skb->data_len)
>
> Use skb_headlen instead of 'skb->len - skb->data_len' ?
Awesome, will replace this (and the one in bpf_dynptr_data) with
skb_headlen() for v2. thanks!

[..]

>
> > +             if (insn->imm == BPF_FUNC_dynptr_from_skb) {
> > +                     if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE))
> > +                             insn_buf[0] = BPF_MOV32_IMM(BPF_REG_4, true);
> > +                     else
> > +                             insn_buf[0] = BPF_MOV32_IMM(BPF_REG_4, false);
> > +                     insn_buf[1] = *insn;
> > +                     cnt = 2;
> > +
> > +                     new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
> > +                     if (!new_prog)
> > +                             return -ENOMEM;
> > +
> > +                     delta += cnt - 1;
> > +                     env->prog = new_prog;
> > +                     prog = new_prog;
> > +                     insn = new_prog->insnsi + i + delta;
> > +                     goto patch_call_imm;
> > +             }
>
> Would it be easier to have two separate helpers:
> - BPF_FUNC_dynptr_from_skb
> - BPF_FUNC_dynptr_from_skb_readonly
>
> And make the verifier rewrite insn->imm to
> BPF_FUNC_dynptr_from_skb_readonly when needed?
>
> if (insn->imm == BPF_FUNC_dynptr_from_skb) {
>         if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE))
>                 insn->imm = BPF_FUNC_dynptr_from_skb_readonly;
> }
>
> Or it's also ugly because we'd have to leak that new helper into UAPI?
> (I wonder whether that hidden 4th argument is too magical, but probably
> fine?)
To me, having 2 separate helpers feels more cluttered and having to
expose it in the uapi (though I think there is probably some way to
avoid this by doing some sort of ad hoc processing) doesn't seem
ideal. If you feel strongly about this though, I am happy to change
this to use two separate helpers. We do this sort of manual
instruction patching for the sleepable flags in
bpf_task/sk/inode_storage_get and for the callback args in
bpf_timer_set_callback as well - if we use separate helpers here, we
should do that for the other cases as well to maintain consistency.
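
For reference, the existing patching for the storage_get helpers in
do_misc_fixups() looks roughly like this (quoting from memory, so the
details may be off):

if (insn->imm == BPF_FUNC_task_storage_get ||
    insn->imm == BPF_FUNC_sk_storage_get ||
    insn->imm == BPF_FUNC_inode_storage_get) {
	if (env->prog->aux->sleepable)
		insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL);
	else
		insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC);
	insn_buf[1] = *insn;
	cnt = 2;
	...
}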

[..]

Stanislav Fomichev July 28, 2022, 5:28 p.m. UTC | #3
On Thu, Jul 28, 2022 at 9:49 AM Joanne Koong <joannelkoong@gmail.com> wrote:
>
> On Wed, Jul 27, 2022 at 10:14 AM <sdf@google.com> wrote:
> >
> > On 07/26, Joanne Koong wrote:
> > > Add skb dynptrs, which are dynptrs whose underlying pointer points
> > > to a skb. The dynptr acts on skb data. skb dynptrs have two main
> > > benefits. One is that they allow operations on sizes that are not
> > > statically known at compile-time (eg variable-sized accesses).
> > > Another is that parsing the packet data through dynptrs (instead of
> > > through direct access of skb->data and skb->data_end) can be more
> > > ergonomic and less brittle (eg does not need manual if checking for
> > > being within bounds of data_end).
> >
> > > For bpf prog types that don't support writes on skb data, the dynptr is
> > > read-only (writes and data slices are not permitted). For reads on the
> > > dynptr, this includes reading into data in the non-linear paged buffers
> > > but for writes and data slices, if the data is in a paged buffer, the
> > > user must first call bpf_skb_pull_data to pull the data into the linear
> > > portion.
> >
> > > Additionally, any helper calls that change the underlying packet buffer
> > > (eg bpf_skb_pull_data) invalidates any data slices of the associated
> > > dynptr.
> >
> > > Right now, skb dynptrs can only be constructed from skbs that are
> > > the bpf program context - as such, there does not need to be any
> > > reference tracking or release on skb dynptrs.
> >
> > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > > ---
> > >   include/linux/bpf.h            |  8 ++++-
> > >   include/linux/filter.h         |  4 +++
> > >   include/uapi/linux/bpf.h       | 42 ++++++++++++++++++++++++--
> > >   kernel/bpf/helpers.c           | 54 +++++++++++++++++++++++++++++++++-
> > >   kernel/bpf/verifier.c          | 43 +++++++++++++++++++++++----
> > >   net/core/filter.c              | 53 ++++++++++++++++++++++++++++++---
> > >   tools/include/uapi/linux/bpf.h | 42 ++++++++++++++++++++++++--
> > >   7 files changed, 229 insertions(+), 17 deletions(-)
> >
> > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > index 20c26aed7896..7fbd4324c848 100644
> > > --- a/include/linux/bpf.h
> > > +++ b/include/linux/bpf.h
> > > @@ -407,11 +407,14 @@ enum bpf_type_flag {
> > >       /* Size is known at compile time. */
> > >       MEM_FIXED_SIZE          = BIT(10 + BPF_BASE_TYPE_BITS),
> >
> > > +     /* DYNPTR points to sk_buff */
> > > +     DYNPTR_TYPE_SKB         = BIT(11 + BPF_BASE_TYPE_BITS),
> > > +
> > >       __BPF_TYPE_FLAG_MAX,
> > >       __BPF_TYPE_LAST_FLAG    = __BPF_TYPE_FLAG_MAX - 1,
> > >   };
> >
> > > -#define DYNPTR_TYPE_FLAG_MASK        (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF)
> > > +#define DYNPTR_TYPE_FLAG_MASK        (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF |
> > > DYNPTR_TYPE_SKB)
> >
> > >   /* Max number of base types. */
> > >   #define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS)
> > > @@ -2556,12 +2559,15 @@ enum bpf_dynptr_type {
> > >       BPF_DYNPTR_TYPE_LOCAL,
> > >       /* Underlying data is a ringbuf record */
> > >       BPF_DYNPTR_TYPE_RINGBUF,
> > > +     /* Underlying data is a sk_buff */
> > > +     BPF_DYNPTR_TYPE_SKB,
> > >   };
> >
> > >   void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
> > >                    enum bpf_dynptr_type type, u32 offset, u32 size);
> > >   void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr);
> > >   int bpf_dynptr_check_size(u32 size);
> > > +void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr);
> >
> > >   #ifdef CONFIG_BPF_LSM
> > >   void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype);
> > > diff --git a/include/linux/filter.h b/include/linux/filter.h
> > > index a5f21dc3c432..649063d9cbfd 100644
> > > --- a/include/linux/filter.h
> > > +++ b/include/linux/filter.h
> > > @@ -1532,4 +1532,8 @@ static __always_inline int
> > > __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifind
> > >       return XDP_REDIRECT;
> > >   }
> >
> > > +int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len);
> > > +int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from,
> > > +                       u32 len, u64 flags);
> > > +
> > >   #endif /* __LINUX_FILTER_H__ */
> > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > > index 59a217ca2dfd..0730cd198a7f 100644
> > > --- a/include/uapi/linux/bpf.h
> > > +++ b/include/uapi/linux/bpf.h
> > > @@ -5241,11 +5241,22 @@ union bpf_attr {
> > >    *  Description
> > >    *          Write *len* bytes from *src* into *dst*, starting from *offset*
> > >    *          into *dst*.
> > > - *           *flags* is currently unused.
> > > + *
> > > + *           *flags* must be 0 except for skb-type dynptrs.
> > > + *
> > > + *           For skb-type dynptrs:
> > > + *               *  if *offset* + *len* extends into the skb's paged buffers, the user
> > > + *                  should manually pull the skb with bpf_skb_pull and then try again.
> > > + *
> > > + *               *  *flags* are a combination of **BPF_F_RECOMPUTE_CSUM** (automatically
> > > + *                   recompute the checksum for the packet after storing the bytes) and
> > > + *                   **BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\
> > > + *                   **->swhash** and *skb*\ **->l4hash** to 0).
> > >    *  Return
> > >    *          0 on success, -E2BIG if *offset* + *len* exceeds the length
> > >    *          of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst*
> > > - *           is a read-only dynptr or if *flags* is not 0.
> > > + *           is a read-only dynptr or if *flags* is not correct, -EAGAIN if for
> > > + *           skb-type dynptrs the write extends into the skb's paged buffers.
> > >    *
> > >    * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len)
> > >    *  Description
> > > @@ -5253,10 +5264,19 @@ union bpf_attr {
> > >    *
> > >    *          *len* must be a statically known value. The returned data slice
> > >    *          is invalidated whenever the dynptr is invalidated.
> > > + *
> > > + *           For skb-type dynptrs:
> > > + *               * if *offset* + *len* extends into the skb's paged buffers,
> > > + *                 the user should manually pull the skb with bpf_skb_pull and then
> > > + *                 try again.
> > > + *
> > > + *               * the data slice is automatically invalidated anytime a
> > > + *                 helper call that changes the underlying packet buffer
> > > + *                 (eg bpf_skb_pull) is called.
> > >    *  Return
> > >    *          Pointer to the underlying dynptr data, NULL if the dynptr is
> > >    *          read-only, if the dynptr is invalid, or if the offset and length
> > > - *           is out of bounds.
> > > + *           is out of bounds or in a paged buffer for skb-type dynptrs.
> > >    *
> > >    * s64 bpf_tcp_raw_gen_syncookie_ipv4(struct iphdr *iph, struct tcphdr *th, u32 th_len)
> > >    *  Description
> > > @@ -5331,6 +5351,21 @@ union bpf_attr {
> > >    *          **-EACCES** if the SYN cookie is not valid.
> > >    *
> > >    *          **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin.
> > > + *
> > > + * long bpf_dynptr_from_skb(struct sk_buff *skb, u64 flags, struct bpf_dynptr *ptr)
> > > + *   Description
> > > + *           Get a dynptr to the data in *skb*. *skb* must be the BPF program
> > > + *           context. Depending on program type, the dynptr may be read-only,
> > > + *           in which case trying to obtain a direct data slice to it through
> > > + *           bpf_dynptr_data will return an error.
> > > + *
> > > + *           Calls that change the *skb*'s underlying packet buffer
> > > + *           (eg bpf_skb_pull_data) do not invalidate the dynptr, but they do
> > > + *           invalidate any data slices associated with the dynptr.
> > > + *
> > > + *           *flags* is currently unused, it must be 0 for now.
> > > + *   Return
> > > + *           0 on success or -EINVAL if flags is not 0.
> > >    */
> > >   #define __BPF_FUNC_MAPPER(FN)               \
> > >       FN(unspec),                     \
> > > @@ -5541,6 +5576,7 @@ union bpf_attr {
> > >       FN(tcp_raw_gen_syncookie_ipv6), \
> > >       FN(tcp_raw_check_syncookie_ipv4),       \
> > >       FN(tcp_raw_check_syncookie_ipv6),       \
> > > +     FN(dynptr_from_skb),            \
> > >       /* */
> >
> > >   /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> > > diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> > > index 1f961f9982d2..21a806057e9e 100644
> > > --- a/kernel/bpf/helpers.c
> > > +++ b/kernel/bpf/helpers.c
> > > @@ -1425,11 +1425,21 @@ static bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr)
> > >       return ptr->size & DYNPTR_RDONLY_BIT;
> > >   }
> >
> > > +void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr)
> > > +{
> > > +     ptr->size |= DYNPTR_RDONLY_BIT;
> > > +}
> > > +
> > >   static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type)
> > >   {
> > >       ptr->size |= type << DYNPTR_TYPE_SHIFT;
> > >   }
> >
> > > +static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *ptr)
> > > +{
> > > +     return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT;
> > > +}
> > > +
> > >   static u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr)
> > >   {
> > >       return ptr->size & DYNPTR_SIZE_MASK;
> > > @@ -1500,6 +1510,7 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
> > >   BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src,
> > >          u32, offset, u64, flags)
> > >   {
> > > +     enum bpf_dynptr_type type;
> > >       int err;
> >
> > >       if (!src->data || flags)
> > > @@ -1509,6 +1520,11 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src
> > >       if (err)
> > >               return err;
> >
> > > +     type = bpf_dynptr_get_type(src);
> > > +
> > > +     if (type == BPF_DYNPTR_TYPE_SKB)
> > > +             return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len);
> > > +
> > >       memcpy(dst, src->data + src->offset + offset, len);
> >
> > >       return 0;
> > > @@ -1528,15 +1544,38 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = {
> > >   BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src,
> > >          u32, len, u64, flags)
> > >   {
> > > +     enum bpf_dynptr_type type;
> > >       int err;
> >
> > > -     if (!dst->data || flags || bpf_dynptr_is_rdonly(dst))
> > > +     if (!dst->data || bpf_dynptr_is_rdonly(dst))
> > >               return -EINVAL;
> >
> > >       err = bpf_dynptr_check_off_len(dst, offset, len);
> > >       if (err)
> > >               return err;
> >
> > > +     type = bpf_dynptr_get_type(dst);
> > > +
> > > +     if (flags) {
> > > +             if (type == BPF_DYNPTR_TYPE_SKB) {
> > > +                     if (flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))
> > > +                             return -EINVAL;
> > > +             } else {
> > > +                     return -EINVAL;
> > > +             }
> > > +     }
> > > +
> > > +     if (type == BPF_DYNPTR_TYPE_SKB) {
> > > +             struct sk_buff *skb = dst->data;
> > > +
> > > +             /* if the data is paged, the caller needs to pull it first */
> > > +             if (dst->offset + offset + len > skb->len - skb->data_len)
> >
> > Use skb_headlen instead of 'skb->len - skb->data_len' ?
> Awesome, will replace this (and the one in bpf_dynptr_data) with
> skb_headlen() for v2. thanks!
> >
> > > +                     return -EAGAIN;
> > > +
> > > +             return __bpf_skb_store_bytes(skb, dst->offset + offset, src, len,
> > > +                                          flags);
> > > +     }
> > > +
> > >       memcpy(dst->data + dst->offset + offset, src, len);
> >
> > >       return 0;
> > > @@ -1555,6 +1594,7 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = {
> >
> > >   BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len)
> > >   {
> > > +     enum bpf_dynptr_type type;
> > >       int err;
> >
> > >       if (!ptr->data)
> > > @@ -1567,6 +1607,18 @@ BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len
> > >       if (bpf_dynptr_is_rdonly(ptr))
> > >               return 0;
> >
> > > +     type = bpf_dynptr_get_type(ptr);
> > > +
> > > +     if (type == BPF_DYNPTR_TYPE_SKB) {
> > > +             struct sk_buff *skb = ptr->data;
> > > +
> > > +             /* if the data is paged, the caller needs to pull it first */
> > > +             if (ptr->offset + offset + len > skb->len - skb->data_len)
> > > +                     return 0;
> >
> > Same here?
> >
> > > +
> > > +             return (unsigned long)(skb->data + ptr->offset + offset);
> > > +     }
> > > +
> > >       return (unsigned long)(ptr->data + ptr->offset + offset);
> > >   }
> >
> > > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > > index 0d523741a543..0838653eeb4e 100644
> > > --- a/kernel/bpf/verifier.c
> > > +++ b/kernel/bpf/verifier.c
> > > @@ -263,6 +263,7 @@ struct bpf_call_arg_meta {
> > >       u32 subprogno;
> > >       struct bpf_map_value_off_desc *kptr_off_desc;
> > >       u8 uninit_dynptr_regno;
> > > +     enum bpf_dynptr_type type;
> > >   };
> >
> > >   struct btf *btf_vmlinux;
> > > @@ -678,6 +679,8 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
> > >               return BPF_DYNPTR_TYPE_LOCAL;
> > >       case DYNPTR_TYPE_RINGBUF:
> > >               return BPF_DYNPTR_TYPE_RINGBUF;
> > > +     case DYNPTR_TYPE_SKB:
> > > +             return BPF_DYNPTR_TYPE_SKB;
> > >       default:
> > >               return BPF_DYNPTR_TYPE_INVALID;
> > >       }
> > > @@ -5820,12 +5823,14 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
> > >       return __check_ptr_off_reg(env, reg, regno, fixed_off_ok);
> > >   }
> >
> > > -static u32 stack_slot_get_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
> > > +static void stack_slot_get_dynptr_info(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
> > > +                                    struct bpf_call_arg_meta *meta)
> > >   {
> > >       struct bpf_func_state *state = func(env, reg);
> > >       int spi = get_spi(reg->off);
> >
> > > -     return state->stack[spi].spilled_ptr.id;
> > > +     meta->ref_obj_id = state->stack[spi].spilled_ptr.id;
> > > +     meta->type = state->stack[spi].spilled_ptr.dynptr.type;
> > >   }
> >
> > >   static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
> > > @@ -6052,6 +6057,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
> > >                               case DYNPTR_TYPE_RINGBUF:
> > >                                       err_extra = "ringbuf ";
> > >                                       break;
> > > +                             case DYNPTR_TYPE_SKB:
> > > +                                     err_extra = "skb ";
> > > +                                     break;
> > >                               default:
> > >                                       break;
> > >                               }
> > > @@ -6065,8 +6073,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
> > >                                       verbose(env, "verifier internal error: multiple refcounted args in BPF_FUNC_dynptr_data");
> > >                                       return -EFAULT;
> > >                               }
> > > -                             /* Find the id of the dynptr we're tracking the reference of */
> > > -                             meta->ref_obj_id = stack_slot_get_id(env, reg);
> > > +                             /* Find the id and the type of the dynptr we're tracking
> > > +                              * the reference of.
> > > +                              */
> > > +                             stack_slot_get_dynptr_info(env, reg, meta);
> > >                       }
> > >               }
> > >               break;
> > > @@ -7406,7 +7416,11 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
> > >               regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag;
> > >       } else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) {
> > >               mark_reg_known_zero(env, regs, BPF_REG_0);
> > > -             regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
> > > +             if (func_id == BPF_FUNC_dynptr_data &&
> > > +                 meta.type == BPF_DYNPTR_TYPE_SKB)
> > > +                     regs[BPF_REG_0].type = PTR_TO_PACKET | ret_flag;
> > > +             else
> > > +                     regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
> > >               regs[BPF_REG_0].mem_size = meta.mem_size;
> > >       } else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) {
> > >               const struct btf_type *t;
> > > @@ -14132,6 +14146,25 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
> > >                       goto patch_call_imm;
> > >               }
> >
> >
> > [..]
> >
> > > +             if (insn->imm == BPF_FUNC_dynptr_from_skb) {
> > > +                     if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE))
> > > +                             insn_buf[0] = BPF_MOV32_IMM(BPF_REG_4, true);
> > > +                     else
> > > +                             insn_buf[0] = BPF_MOV32_IMM(BPF_REG_4, false);
> > > +                     insn_buf[1] = *insn;
> > > +                     cnt = 2;
> > > +
> > > +                     new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
> > > +                     if (!new_prog)
> > > +                             return -ENOMEM;
> > > +
> > > +                     delta += cnt - 1;
> > > +                     env->prog = new_prog;
> > > +                     prog = new_prog;
> > > +                     insn = new_prog->insnsi + i + delta;
> > > +                     goto patch_call_imm;
> > > +             }
> >
> > Would it be easier to have two separate helpers:
> > - BPF_FUNC_dynptr_from_skb
> > - BPF_FUNC_dynptr_from_skb_readonly
> >
> > And make the verifier rewrite insn->imm to
> > BPF_FUNC_dynptr_from_skb_readonly when needed?
> >
> > if (insn->imm == BPF_FUNC_dynptr_from_skb) {
> >         if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE))
> >                 insn->imm = BPF_FUNC_dynptr_from_skb_readonly;
> > }
> >
> > Or it's also ugly because we'd have to leak that new helper into UAPI?
> > (I wonder whether that hidden 4th argument is too magical, but probably
> > fine?)
> To me, having 2 separate helpers feels more cluttered and having to
> expose it in the uapi (though I think there is probably some way to
> avoid this by doing some sort of ad hoc processing) doesn't seem
> ideal. If you feel strongly about this though, I am happy to change
> this to use two separate helpers. We do this sort of manual
> instruction patching for the sleepable flags in
> bpf_task/sk/inode_storage_get and for the callback args in
> bpf_timer_set_callback as well - if we use separate helpers here, we
> should do that for the other cases as well to maintain consistency.

No, I don't feel strongly; let's keep it, especially if there is prior art :-)
I am just wondering whether there is some better alternative, but
extra helpers also have their downsides.
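
For reference, the net/core/filter.c side isn't quoted in this
sub-thread, but with the patching above the hidden R4 presumably lands
as a 4th helper argument, roughly along these lines (a sketch, not the
literal patch):

BPF_CALL_4(bpf_dynptr_from_skb, struct sk_buff *, skb, u64, flags,
	   struct bpf_dynptr_kern *, ptr, u32, is_rdonly)
{
	if (flags) {
		bpf_dynptr_set_null(ptr);
		return -EINVAL;
	}

	bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB, 0, skb->len);

	/* set by the verifier based on may_access_direct_pkt_data() */
	if (is_rdonly)
		bpf_dynptr_set_rdonly(ptr);

	return 0;
}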
Hao Luo July 28, 2022, 5:45 p.m. UTC | #4
Hi, Joanne,

On Tue, Jul 26, 2022 at 11:48 AM Joanne Koong <joannelkoong@gmail.com> wrote:
>
> Add skb dynptrs, which are dynptrs whose underlying pointer points
> to a skb. The dynptr acts on skb data. skb dynptrs have two main
> benefits. One is that they allow operations on sizes that are not
> statically known at compile-time (eg variable-sized accesses).
> Another is that parsing the packet data through dynptrs (instead of
> through direct access of skb->data and skb->data_end) can be more
> ergonomic and less brittle (eg does not need manual if checking for
> being within bounds of data_end).
>
> For bpf prog types that don't support writes on skb data, the dynptr is
> read-only (writes and data slices are not permitted). For reads on the
> dynptr, this includes reading into data in the non-linear paged buffers
> but for writes and data slices, if the data is in a paged buffer, the
> user must first call bpf_skb_pull_data to pull the data into the linear
> portion.
>
> Additionally, any helper calls that change the underlying packet buffer
> (eg bpf_skb_pull_data) invalidates any data slices of the associated
> dynptr.
>
> Right now, skb dynptrs can only be constructed from skbs that are
> the bpf program context - as such, there does not need to be any
> reference tracking or release on skb dynptrs.
>
> Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> ---
>  include/linux/bpf.h            |  8 ++++-
>  include/linux/filter.h         |  4 +++
>  include/uapi/linux/bpf.h       | 42 ++++++++++++++++++++++++--
>  kernel/bpf/helpers.c           | 54 +++++++++++++++++++++++++++++++++-
>  kernel/bpf/verifier.c          | 43 +++++++++++++++++++++++----
>  net/core/filter.c              | 53 ++++++++++++++++++++++++++++++---
>  tools/include/uapi/linux/bpf.h | 42 ++++++++++++++++++++++++--
>  7 files changed, 229 insertions(+), 17 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 20c26aed7896..7fbd4324c848 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -407,11 +407,14 @@ enum bpf_type_flag {
>         /* Size is known at compile time. */
>         MEM_FIXED_SIZE          = BIT(10 + BPF_BASE_TYPE_BITS),
>
> +       /* DYNPTR points to sk_buff */
> +       DYNPTR_TYPE_SKB         = BIT(11 + BPF_BASE_TYPE_BITS),
> +
>         __BPF_TYPE_FLAG_MAX,
>         __BPF_TYPE_LAST_FLAG    = __BPF_TYPE_FLAG_MAX - 1,
>  };
>
> -#define DYNPTR_TYPE_FLAG_MASK  (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF)
> +#define DYNPTR_TYPE_FLAG_MASK  (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB)
>

I wonder if we could maximize the use of these flags by combining them
with other base types, not just DYNPTR. For example, does TYPE_LOCAL
indicate memory is on stack? If so, can we apply LOCAL on PTR_TO_MEM?
If we have PTR_TO_MEM + LOCAL, can it be used to replace PTR_TO_STACK
in some scenarios?

WDYT?

>  /* Max number of base types. */
>  #define BPF_BASE_TYPE_LIMIT    (1UL << BPF_BASE_TYPE_BITS)
> @@ -2556,12 +2559,15 @@ enum bpf_dynptr_type {
>         BPF_DYNPTR_TYPE_LOCAL,
>         /* Underlying data is a ringbuf record */
>         BPF_DYNPTR_TYPE_RINGBUF,
> +       /* Underlying data is a sk_buff */
> +       BPF_DYNPTR_TYPE_SKB,
>  };
>
<...>
>
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> --
> 2.30.2
>
Joanne Koong July 28, 2022, 6:36 p.m. UTC | #5
On Thu, Jul 28, 2022 at 10:45 AM Hao Luo <haoluo@google.com> wrote:
>
> Hi, Joanne,
>
> On Tue, Jul 26, 2022 at 11:48 AM Joanne Koong <joannelkoong@gmail.com> wrote:
> >
> > Add skb dynptrs, which are dynptrs whose underlying pointer points
> > to a skb. The dynptr acts on skb data. skb dynptrs have two main
> > benefits. One is that they allow operations on sizes that are not
> > statically known at compile-time (eg variable-sized accesses).
> > Another is that parsing the packet data through dynptrs (instead of
> > through direct access of skb->data and skb->data_end) can be more
> > ergonomic and less brittle (eg does not need manual if checking for
> > being within bounds of data_end).
> >
> > For bpf prog types that don't support writes on skb data, the dynptr is
> > read-only (writes and data slices are not permitted). For reads on the
> > dynptr, this includes reading into data in the non-linear paged buffers
> > but for writes and data slices, if the data is in a paged buffer, the
> > user must first call bpf_skb_pull_data to pull the data into the linear
> > portion.
> >
> > Additionally, any helper calls that change the underlying packet buffer
> > (eg bpf_skb_pull_data) invalidates any data slices of the associated
> > dynptr.
> >
> > Right now, skb dynptrs can only be constructed from skbs that are
> > the bpf program context - as such, there does not need to be any
> > reference tracking or release on skb dynptrs.
> >
> > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > ---
> >  include/linux/bpf.h            |  8 ++++-
> >  include/linux/filter.h         |  4 +++
> >  include/uapi/linux/bpf.h       | 42 ++++++++++++++++++++++++--
> >  kernel/bpf/helpers.c           | 54 +++++++++++++++++++++++++++++++++-
> >  kernel/bpf/verifier.c          | 43 +++++++++++++++++++++++----
> >  net/core/filter.c              | 53 ++++++++++++++++++++++++++++++---
> >  tools/include/uapi/linux/bpf.h | 42 ++++++++++++++++++++++++--
> >  7 files changed, 229 insertions(+), 17 deletions(-)
> >
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index 20c26aed7896..7fbd4324c848 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -407,11 +407,14 @@ enum bpf_type_flag {
> >         /* Size is known at compile time. */
> >         MEM_FIXED_SIZE          = BIT(10 + BPF_BASE_TYPE_BITS),
> >
> > +       /* DYNPTR points to sk_buff */
> > +       DYNPTR_TYPE_SKB         = BIT(11 + BPF_BASE_TYPE_BITS),
> > +
> >         __BPF_TYPE_FLAG_MAX,
> >         __BPF_TYPE_LAST_FLAG    = __BPF_TYPE_FLAG_MAX - 1,
> >  };
> >
> > -#define DYNPTR_TYPE_FLAG_MASK  (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF)
> > +#define DYNPTR_TYPE_FLAG_MASK  (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB)
> >
>
> I wonder if we could maximize the use of these flags by combining them
> with other base types, not just DYNPTR. For example, does TYPE_LOCAL
> indicate memory is on stack? If so, can we apply LOCAL on PTR_TO_MEM?
> If we have PTR_TO_MEM + LOCAL, can it be used to replace PTR_TO_STACK
> in some scenarios?
>
> WDYT?

Hi Hao. I love the idea, but unfortunately I don't think it applies
neatly in this case. "local" in the context of dynptrs means memory
that is local to the bpf program, which includes more than just memory
on the stack.
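
For example, with the existing bpf_dynptr_from_mem() a local dynptr can
wrap a map value just as well as a stack buffer. A rough sketch (the
'data_map' array map, its 'key', and 'struct data' are illustrative):

	struct data {
		__u8 buf[64];
	};
	struct bpf_dynptr ptr;
	struct data *value;

	value = bpf_map_lookup_elem(&data_map, &key);
	if (!value)
		return 0;

	/* local dynptr over map-value memory, not stack memory */
	bpf_dynptr_from_mem(value->buf, sizeof(value->buf), 0, &ptr);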

>
> >  /* Max number of base types. */
> >  #define BPF_BASE_TYPE_LIMIT    (1UL << BPF_BASE_TYPE_BITS)
> > @@ -2556,12 +2559,15 @@ enum bpf_dynptr_type {
> >         BPF_DYNPTR_TYPE_LOCAL,
> >         /* Underlying data is a ringbuf record */
> >         BPF_DYNPTR_TYPE_RINGBUF,
> > +       /* Underlying data is a sk_buff */
> > +       BPF_DYNPTR_TYPE_SKB,
> >  };
> >
> <...>
> >
> >  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> > --
> > 2.30.2
> >
Martin KaFai Lau July 28, 2022, 11:39 p.m. UTC | #6
On Tue, Jul 26, 2022 at 11:47:04AM -0700, Joanne Koong wrote:
> @@ -1567,6 +1607,18 @@ BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len
>  	if (bpf_dynptr_is_rdonly(ptr))
Is it possible to allow a data slice for a rdonly dynptr-skb,
and have it depend on the may_access_direct_pkt_data() check in the verifier?

>  		return 0;
>  
> +	type = bpf_dynptr_get_type(ptr);
> +
> +	if (type == BPF_DYNPTR_TYPE_SKB) {
> +		struct sk_buff *skb = ptr->data;
> +
> +		/* if the data is paged, the caller needs to pull it first */
> +		if (ptr->offset + offset + len > skb->len - skb->data_len)
> +			return 0;
> +
> +		return (unsigned long)(skb->data + ptr->offset + offset);
> +	}
> +
>  	return (unsigned long)(ptr->data + ptr->offset + offset);
>  }

[ ... ]

> -static u32 stack_slot_get_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
> +static void stack_slot_get_dynptr_info(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
> +				       struct bpf_call_arg_meta *meta)
>  {
>  	struct bpf_func_state *state = func(env, reg);
>  	int spi = get_spi(reg->off);
>  
> -	return state->stack[spi].spilled_ptr.id;
> +	meta->ref_obj_id = state->stack[spi].spilled_ptr.id;
> +	meta->type = state->stack[spi].spilled_ptr.dynptr.type;
>  }
>  
>  static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
> @@ -6052,6 +6057,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
>  				case DYNPTR_TYPE_RINGBUF:
>  					err_extra = "ringbuf ";
>  					break;
> +				case DYNPTR_TYPE_SKB:
> +					err_extra = "skb ";
> +					break;
>  				default:
>  					break;
>  				}
> @@ -6065,8 +6073,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
>  					verbose(env, "verifier internal error: multiple refcounted args in BPF_FUNC_dynptr_data");
>  					return -EFAULT;
>  				}
> -				/* Find the id of the dynptr we're tracking the reference of */
> -				meta->ref_obj_id = stack_slot_get_id(env, reg);
> +				/* Find the id and the type of the dynptr we're tracking
> +				 * the reference of.
> +				 */
> +				stack_slot_get_dynptr_info(env, reg, meta);
>  			}
>  		}
>  		break;
> @@ -7406,7 +7416,11 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
>  		regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag;
>  	} else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) {
>  		mark_reg_known_zero(env, regs, BPF_REG_0);
> -		regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
> +		if (func_id == BPF_FUNC_dynptr_data &&
> +		    meta.type == BPF_DYNPTR_TYPE_SKB)
> +			regs[BPF_REG_0].type = PTR_TO_PACKET | ret_flag;
> +		else
> +			regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
>  		regs[BPF_REG_0].mem_size = meta.mem_size;
check_packet_access() uses range.
It took me a while to figure out that range and mem_size are in a union.
Mentioning it here in case someone has a similar question.

>  	} else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) {
>  		const struct btf_type *t;
> @@ -14132,6 +14146,25 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
>  			goto patch_call_imm;
>  		}
>  
> +		if (insn->imm == BPF_FUNC_dynptr_from_skb) {
> +			if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE))
> +				insn_buf[0] = BPF_MOV32_IMM(BPF_REG_4, true);
> +			else
> +				insn_buf[0] = BPF_MOV32_IMM(BPF_REG_4, false);
> +			insn_buf[1] = *insn;
> +			cnt = 2;
> +
> +			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
> +			if (!new_prog)
> +				return -ENOMEM;
> +
> +			delta += cnt - 1;
> +			env->prog = new_prog;
> +			prog = new_prog;
> +			insn = new_prog->insnsi + i + delta;
> +			goto patch_call_imm;
> +		}
Have you considered rejecting bpf_dynptr_write()
at prog load time?
Joanne Koong July 29, 2022, 8:26 p.m. UTC | #7
On Thu, Jul 28, 2022 at 4:39 PM Martin KaFai Lau <kafai@fb.com> wrote:
>
> On Tue, Jul 26, 2022 at 11:47:04AM -0700, Joanne Koong wrote:
> > @@ -1567,6 +1607,18 @@ BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len
> >       if (bpf_dynptr_is_rdonly(ptr))
> Is it possible to allow data slice for rdonly dynptr-skb?
> and depends on the may_access_direct_pkt_data() check in the verifier.

Ooh great idea. This should be very simple to do, since the data slice
that gets returned is assigned as PTR_TO_PACKET. So any stx operations
on it will by default go through the may_access_direct_pkt_data()
check. I'll add this for v2.
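
With that, something like this should then load even in a read-only
context (a sketch of the intended v2 behavior; the prog type and header
parsing are illustrative):

SEC("cgroup_skb/ingress")
int rdonly_parse(struct __sk_buff *skb)
{
	struct bpf_dynptr ptr;
	struct iphdr *iph;

	if (bpf_dynptr_from_skb(skb, 0, &ptr))
		return 1;

	/* loads through the slice are fine */
	iph = bpf_dynptr_data(&ptr, 0, sizeof(*iph));
	if (!iph)
		return 1;

	/* iph->ttl = 1; would be rejected at load time, since stores
	 * through PTR_TO_PACKET go through may_access_direct_pkt_data()
	 */
	return 1;
}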

>
> >               return 0;
> >
> > +     type = bpf_dynptr_get_type(ptr);
> > +
> > +     if (type == BPF_DYNPTR_TYPE_SKB) {
> > +             struct sk_buff *skb = ptr->data;
> > +
> > +             /* if the data is paged, the caller needs to pull it first */
> > +             if (ptr->offset + offset + len > skb->len - skb->data_len)
> > +                     return 0;
> > +
> > +             return (unsigned long)(skb->data + ptr->offset + offset);
> > +     }
> > +
> >       return (unsigned long)(ptr->data + ptr->offset + offset);
> >  }
>
> [ ... ]
>
> > -static u32 stack_slot_get_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
> > +static void stack_slot_get_dynptr_info(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
> > +                                    struct bpf_call_arg_meta *meta)
> >  {
> >       struct bpf_func_state *state = func(env, reg);
> >       int spi = get_spi(reg->off);
> >
> > -     return state->stack[spi].spilled_ptr.id;
> > +     meta->ref_obj_id = state->stack[spi].spilled_ptr.id;
> > +     meta->type = state->stack[spi].spilled_ptr.dynptr.type;
> >  }
> >
> >  static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
> > @@ -6052,6 +6057,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
> >                               case DYNPTR_TYPE_RINGBUF:
> >                                       err_extra = "ringbuf ";
> >                                       break;
> > +                             case DYNPTR_TYPE_SKB:
> > +                                     err_extra = "skb ";
> > +                                     break;
> >                               default:
> >                                       break;
> >                               }
> > @@ -6065,8 +6073,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
> >                                       verbose(env, "verifier internal error: multiple refcounted args in BPF_FUNC_dynptr_data");
> >                                       return -EFAULT;
> >                               }
> > -                             /* Find the id of the dynptr we're tracking the reference of */
> > -                             meta->ref_obj_id = stack_slot_get_id(env, reg);
> > +                             /* Find the id and the type of the dynptr we're tracking
> > +                              * the reference of.
> > +                              */
> > +                             stack_slot_get_dynptr_info(env, reg, meta);
> >                       }
> >               }
> >               break;
> > @@ -7406,7 +7416,11 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
> >               regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag;
> >       } else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) {
> >               mark_reg_known_zero(env, regs, BPF_REG_0);
> > -             regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
> > +             if (func_id == BPF_FUNC_dynptr_data &&
> > +                 meta.type == BPF_DYNPTR_TYPE_SKB)
> > +                     regs[BPF_REG_0].type = PTR_TO_PACKET | ret_flag;
> > +             else
> > +                     regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
> >               regs[BPF_REG_0].mem_size = meta.mem_size;
> check_packet_access() uses range.
> It took me a while to figure range and mem_size is in union.
> Mentioning here in case someone has similar question.
For v2, I'll add this as a comment in the code or I'll include
"regs[BPF_REG_0].range = meta.mem_size" explicitly to make it more
obvious :)
>
> >       } else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) {
> >               const struct btf_type *t;
> > @@ -14132,6 +14146,25 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
> >                       goto patch_call_imm;
> >               }
> >
> > +             if (insn->imm == BPF_FUNC_dynptr_from_skb) {
> > +                     if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE))
> > +                             insn_buf[0] = BPF_MOV32_IMM(BPF_REG_4, true);
> > +                     else
> > +                             insn_buf[0] = BPF_MOV32_IMM(BPF_REG_4, false);
> > +                     insn_buf[1] = *insn;
> > +                     cnt = 2;
> > +
> > +                     new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
> > +                     if (!new_prog)
> > +                             return -ENOMEM;
> > +
> > +                     delta += cnt - 1;
> > +                     env->prog = new_prog;
> > +                     prog = new_prog;
> > +                     insn = new_prog->insnsi + i + delta;
> > +                     goto patch_call_imm;
> > +             }
> Have you considered to reject bpf_dynptr_write()
> at prog load time?
It's possible to reject bpf_dynptr_write() at prog load time, but it would
require adding tracking in the verifier for whether a dynptr is
read-only or not. Do you think it's better to reject it at load time
instead of returning NULL at runtime?
Martin KaFai Lau July 29, 2022, 9:39 p.m. UTC | #8
On Fri, Jul 29, 2022 at 01:26:31PM -0700, Joanne Koong wrote:
> On Thu, Jul 28, 2022 at 4:39 PM Martin KaFai Lau <kafai@fb.com> wrote:
> >
> > On Tue, Jul 26, 2022 at 11:47:04AM -0700, Joanne Koong wrote:
> > > @@ -1567,6 +1607,18 @@ BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len
> > >       if (bpf_dynptr_is_rdonly(ptr))
> > Is it possible to allow data slice for rdonly dynptr-skb?
> > and depends on the may_access_direct_pkt_data() check in the verifier.
> 
> Ooh great idea. This should be very simple to do, since the data slice
> that gets returned is assigned as PTR_TO_PACKET. So any stx operations
> on it will by default go through the may_access_direct_pkt_data()
> check. I'll add this for v2.
It will be great.  Out of all three helpers (bpf_dynptr_read/write/data),
bpf_dynptr_data will be the most useful one for parsing header data (e.g. tcp-hdr-opt)
that has a runtime-variable length, because bpf_dynptr_data() can take a non-const
'offset' argument.  It is useful to get a consistent usage across all bpf
prog types that have either read-only or read-write access to the skb.
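
For instance, a tcp-hdr-opt walk could look roughly like this from the
prog side (a sketch; 'tcp_off' and 'tcp_hdrlen' are assumed to have
been computed earlier, and TCPOPT_EOL/TCPOPT_NOP are the usual option
kind values 0 and 1):

	struct bpf_dynptr ptr;
	__u32 off = tcp_off + sizeof(struct tcphdr);
	__u8 *opt;
	int i;

	if (bpf_dynptr_from_skb(skb, 0, &ptr))
		return 0;

	/* tcp options are at most 40 bytes */
	for (i = 0; i < 40; i++) {
		if (off >= tcp_off + tcp_hdrlen)
			break;
		/* constant len, runtime-variable 'off' */
		opt = bpf_dynptr_data(&ptr, off, 2);
		if (!opt || opt[0] == TCPOPT_EOL)
			break;
		if (opt[0] == TCPOPT_NOP) {
			off++;
			continue;
		}
		/* opt[0] is the option kind, opt[1] its total length */
		if (opt[1] < 2)
			break;
		off += opt[1];
	}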

> 
> >
> > >               return 0;
> > >
> > > +     type = bpf_dynptr_get_type(ptr);
> > > +
> > > +     if (type == BPF_DYNPTR_TYPE_SKB) {
> > > +             struct sk_buff *skb = ptr->data;
> > > +
> > > +             /* if the data is paged, the caller needs to pull it first */
> > > +             if (ptr->offset + offset + len > skb->len - skb->data_len)
> > > +                     return 0;
> > > +
> > > +             return (unsigned long)(skb->data + ptr->offset + offset);
> > > +     }
> > > +
> > >       return (unsigned long)(ptr->data + ptr->offset + offset);
> > >  }
> >
> > [ ... ]
> >
> > > -static u32 stack_slot_get_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
> > > +static void stack_slot_get_dynptr_info(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
> > > +                                    struct bpf_call_arg_meta *meta)
> > >  {
> > >       struct bpf_func_state *state = func(env, reg);
> > >       int spi = get_spi(reg->off);
> > >
> > > -     return state->stack[spi].spilled_ptr.id;
> > > +     meta->ref_obj_id = state->stack[spi].spilled_ptr.id;
> > > +     meta->type = state->stack[spi].spilled_ptr.dynptr.type;
> > >  }
> > >
> > >  static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
> > > @@ -6052,6 +6057,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
> > >                               case DYNPTR_TYPE_RINGBUF:
> > >                                       err_extra = "ringbuf ";
> > >                                       break;
> > > +                             case DYNPTR_TYPE_SKB:
> > > +                                     err_extra = "skb ";
> > > +                                     break;
> > >                               default:
> > >                                       break;
> > >                               }
> > > @@ -6065,8 +6073,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
> > >                                       verbose(env, "verifier internal error: multiple refcounted args in BPF_FUNC_dynptr_data");
> > >                                       return -EFAULT;
> > >                               }
> > > -                             /* Find the id of the dynptr we're tracking the reference of */
> > > -                             meta->ref_obj_id = stack_slot_get_id(env, reg);
> > > +                             /* Find the id and the type of the dynptr we're tracking
> > > +                              * the reference of.
> > > +                              */
> > > +                             stack_slot_get_dynptr_info(env, reg, meta);
> > >                       }
> > >               }
> > >               break;
> > > @@ -7406,7 +7416,11 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
> > >               regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag;
> > >       } else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) {
> > >               mark_reg_known_zero(env, regs, BPF_REG_0);
> > > -             regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
> > > +             if (func_id == BPF_FUNC_dynptr_data &&
> > > +                 meta.type == BPF_DYNPTR_TYPE_SKB)
> > > +                     regs[BPF_REG_0].type = PTR_TO_PACKET | ret_flag;
> > > +             else
> > > +                     regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
> > >               regs[BPF_REG_0].mem_size = meta.mem_size;
> > check_packet_access() uses range.
> > It took me a while to figure range and mem_size is in union.
> > Mentioning here in case someone has similar question.
> For v2, I'll add this as a comment in the code or I'll include
> "regs[BPF_REG_0].range = meta.mem_size" explicitly to make it more
> obvious :)
'regs[BPF_REG_0].range = meta.mem_size' would be great.  No strong
opinion here.

> >
> > >       } else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) {
> > >               const struct btf_type *t;
> > > @@ -14132,6 +14146,25 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
> > >                       goto patch_call_imm;
> > >               }
> > >
> > > +             if (insn->imm == BPF_FUNC_dynptr_from_skb) {
> > > +                     if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE))
> > > +                             insn_buf[0] = BPF_MOV32_IMM(BPF_REG_4, true);
> > > +                     else
> > > +                             insn_buf[0] = BPF_MOV32_IMM(BPF_REG_4, false);
> > > +                     insn_buf[1] = *insn;
> > > +                     cnt = 2;
> > > +
> > > +                     new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
> > > +                     if (!new_prog)
> > > +                             return -ENOMEM;
> > > +
> > > +                     delta += cnt - 1;
> > > +                     env->prog = new_prog;
> > > +                     prog = new_prog;
> > > +                     insn = new_prog->insnsi + i + delta;
> > > +                     goto patch_call_imm;
> > > +             }
> > Have you considered to reject bpf_dynptr_write()
> > at prog load time?
> It's possible to reject bpf_dynptr_write() at prog load time but would
> require adding tracking in the verifier for whether a dynptr is
> read-only or not. Do you think it's better to reject it at load time
> instead of returning NULL at runtime?
The check_helper_call() above already seems to know 'meta.type == BPF_DYNPTR_TYPE_SKB'.
Together with may_access_direct_pkt_data(), would that be enough?
Then there would be no need to do the patching for BPF_FUNC_dynptr_from_skb here.

Since we are on bpf_dynptr_write, what is the reason
for limiting it to skb_headlen()?  Not implying one
way is better than another; I would like to understand the reason
behind it since it is not clear in the commit message.
Joanne Koong Aug. 1, 2022, 5:52 p.m. UTC | #9
On Fri, Jul 29, 2022 at 2:39 PM Martin KaFai Lau <kafai@fb.com> wrote:
>
> On Fri, Jul 29, 2022 at 01:26:31PM -0700, Joanne Koong wrote:
> > On Thu, Jul 28, 2022 at 4:39 PM Martin KaFai Lau <kafai@fb.com> wrote:
> > >
> > > On Tue, Jul 26, 2022 at 11:47:04AM -0700, Joanne Koong wrote:
> > > > @@ -1567,6 +1607,18 @@ BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len
> > > >       if (bpf_dynptr_is_rdonly(ptr))
> > > Is it possible to allow data slice for rdonly dynptr-skb?
> > > and depends on the may_access_direct_pkt_data() check in the verifier.
> >
> > Ooh great idea. This should be very simple to do, since the data slice
> > that gets returned is assigned as PTR_TO_PACKET. So any stx operations
> > on it will by default go through the may_access_direct_pkt_data()
> > check. I'll add this for v2.
> It will be great.  Out of all three helpers (bpf_dynptr_read/write/data),
> bpf_dynptr_data will be the useful one to parse the header data (e.g. tcp-hdr-opt)
> that has runtime variable length because bpf_dynptr_data() can take a non-cost
> 'offset' argument.  It is useful to get a consistent usage across all bpf
> prog types that are either read-only or read-write of the skb.
>
> >
> > >
> > > >               return 0;
> > > >
> > > > +     type = bpf_dynptr_get_type(ptr);
> > > > +
> > > > +     if (type == BPF_DYNPTR_TYPE_SKB) {
> > > > +             struct sk_buff *skb = ptr->data;
> > > > +
> > > > +             /* if the data is paged, the caller needs to pull it first */
> > > > +             if (ptr->offset + offset + len > skb->len - skb->data_len)
> > > > +                     return 0;
> > > > +
> > > > +             return (unsigned long)(skb->data + ptr->offset + offset);
> > > > +     }
> > > > +
> > > >       return (unsigned long)(ptr->data + ptr->offset + offset);
> > > >  }
> > >
> > > [ ... ]
> > >
> > > > -static u32 stack_slot_get_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
> > > > +static void stack_slot_get_dynptr_info(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
> > > > +                                    struct bpf_call_arg_meta *meta)
> > > >  {
> > > >       struct bpf_func_state *state = func(env, reg);
> > > >       int spi = get_spi(reg->off);
> > > >
> > > > -     return state->stack[spi].spilled_ptr.id;
> > > > +     meta->ref_obj_id = state->stack[spi].spilled_ptr.id;
> > > > +     meta->type = state->stack[spi].spilled_ptr.dynptr.type;
> > > >  }
> > > >
> > > >  static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
> > > > @@ -6052,6 +6057,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
> > > >                               case DYNPTR_TYPE_RINGBUF:
> > > >                                       err_extra = "ringbuf ";
> > > >                                       break;
> > > > +                             case DYNPTR_TYPE_SKB:
> > > > +                                     err_extra = "skb ";
> > > > +                                     break;
> > > >                               default:
> > > >                                       break;
> > > >                               }
> > > > @@ -6065,8 +6073,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
> > > >                                       verbose(env, "verifier internal error: multiple refcounted args in BPF_FUNC_dynptr_data");
> > > >                                       return -EFAULT;
> > > >                               }
> > > > -                             /* Find the id of the dynptr we're tracking the reference of */
> > > > -                             meta->ref_obj_id = stack_slot_get_id(env, reg);
> > > > +                             /* Find the id and the type of the dynptr we're tracking
> > > > +                              * the reference of.
> > > > +                              */
> > > > +                             stack_slot_get_dynptr_info(env, reg, meta);
> > > >                       }
> > > >               }
> > > >               break;
> > > > @@ -7406,7 +7416,11 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
> > > >               regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag;
> > > >       } else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) {
> > > >               mark_reg_known_zero(env, regs, BPF_REG_0);
> > > > -             regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
> > > > +             if (func_id == BPF_FUNC_dynptr_data &&
> > > > +                 meta.type == BPF_DYNPTR_TYPE_SKB)
> > > > +                     regs[BPF_REG_0].type = PTR_TO_PACKET | ret_flag;
> > > > +             else
> > > > +                     regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
> > > >               regs[BPF_REG_0].mem_size = meta.mem_size;
> > > check_packet_access() uses range.
> > > It took me a while to figure range and mem_size is in union.
> > > Mentioning here in case someone has similar question.
> > For v2, I'll add this as a comment in the code or I'll include
> > "regs[BPF_REG_0].range = meta.mem_size" explicitly to make it more
> > obvious :)
> 'regs[BPF_REG_0].range = meta.mem_size' would be great.  No strong
> opinion here.
>
> > >
> > > >       } else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) {
> > > >               const struct btf_type *t;
> > > > @@ -14132,6 +14146,25 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
> > > >                       goto patch_call_imm;
> > > >               }
> > > >
> > > > +             if (insn->imm == BPF_FUNC_dynptr_from_skb) {
> > > > +                     if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE))
> > > > +                             insn_buf[0] = BPF_MOV32_IMM(BPF_REG_4, true);
> > > > +                     else
> > > > +                             insn_buf[0] = BPF_MOV32_IMM(BPF_REG_4, false);
> > > > +                     insn_buf[1] = *insn;
> > > > +                     cnt = 2;
> > > > +
> > > > +                     new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
> > > > +                     if (!new_prog)
> > > > +                             return -ENOMEM;
> > > > +
> > > > +                     delta += cnt - 1;
> > > > +                     env->prog = new_prog;
> > > > +                     prog = new_prog;
> > > > +                     insn = new_prog->insnsi + i + delta;
> > > > +                     goto patch_call_imm;
> > > > +             }
> > > Have you considered to reject bpf_dynptr_write()
> > > at prog load time?
> > It's possible to reject bpf_dynptr_write() at prog load time but would
> > require adding tracking in the verifier for whether a dynptr is
> > read-only or not. Do you think it's better to reject it at load time
> > instead of returning NULL at runtime?
> The check_helper_call above seems to know 'meta.type == BPF_DYNPTR_TYPE_SKB'.
> Together with may_access_direct_pkt_data(), would it be enough ?
> Then no need to do patching for BPF_FUNC_dynptr_from_skb here.
Yeah! That would detect it just as well. I'll add this to v2 :)
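
i.e. roughly this in check_helper_call() (a sketch; it assumes
meta.type also gets populated for bpf_dynptr_write, which this version
only does for bpf_dynptr_data):

	if (func_id == BPF_FUNC_dynptr_write &&
	    meta.type == BPF_DYNPTR_TYPE_SKB &&
	    !may_access_direct_pkt_data(env, NULL, BPF_WRITE)) {
		verbose(env, "cannot write into a read-only skb dynptr\n");
		return -EINVAL;
	}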
>
> Since we are on bpf_dynptr_write, what is the reason
> on limiting it to the skb_headlen() ?  Not implying one
> way is better than another.  would like to undertand the reason
> behind it since it is not clear in the commit message.
For bpf_dynptr_write, if we don't limit it to skb_headlen() then there
may be writes that pull the skb, so any existing data slices into the
skb must be invalidated. However, the verifier can't detect when
the data slice should be invalidated vs. when it shouldn't (eg it can't
tell when a write goes into the paged area vs. when the write stays
within the head). If the prog wants to write into the paged area, I
think the only way it can work is if it pulls the data first with
bpf_skb_pull_data before calling bpf_dynptr_write. I will add this to
the commit message in v2.
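
From the prog side, that flow would look roughly like this (a sketch;
'off' and 'buf' are illustrative and the dynptr is assumed to start at
offset 0 of the skb):

	long err;

	err = bpf_dynptr_write(&ptr, off, buf, sizeof(buf), 0);
	if (err == -EAGAIN) {
		/* the target bytes live in the frags: pull them into the
		 * linear area first. This invalidates data slices, but
		 * not the dynptr itself.
		 */
		if (bpf_skb_pull_data(skb, off + sizeof(buf)))
			return 1;
		err = bpf_dynptr_write(&ptr, off, buf, sizeof(buf), 0);
	}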
Martin KaFai Lau Aug. 1, 2022, 7:38 p.m. UTC | #10
On Mon, Aug 01, 2022 at 10:52:14AM -0700, Joanne Koong wrote:
> > Since we are on bpf_dynptr_write, what is the reason
> > on limiting it to the skb_headlen() ?  Not implying one
> > way is better than another.  would like to undertand the reason
> > behind it since it is not clear in the commit message.
> For bpf_dynptr_write, if we don't limit it to skb_headlen() then there
> may be writes that pull the skb, so any existing data slices to the
> skb must be invalidated. However, in the verifier we can't detect when
> the data slice should be invalidated vs. when it shouldn't (eg
> detecting when a write goes into the paged area vs when the write is
> only in the head). If the prog wants to write into the paged area, I
> think the only way it can work is if it pulls the data first with
> bpf_skb_pull_data before calling bpf_dynptr_write. I will add this to
> the commit message in v2
Note that the current verifier unconditionally invalidates PTR_TO_PACKET
after bpf_skb_store_bytes().  Potentially the same could be done for
other new helpers like bpf_dynptr_write().  I think this bpf_dynptr_write()
behavior cannot be changed later, so I want to raise this possibility here
just in case it wasn't considered before.

Thinking about it from the existing bpf_skb_{load,store}_bytes() and skb->data perspective:
if the user changes the skb by directly using skb->data to avoid calling
load_bytes()/store_bytes(), the user will do the necessary bpf_skb_pull_data()
before reading/writing skb->data.  If load_bytes()+store_bytes() is used instead,
it would be hard to reason about why the earlier bpf_skb_load_bytes() can load a particular
byte but [may] need an extra bpf_skb_pull_data() call before it can use
bpf_skb_store_bytes() to store a modified byte at the same offset.
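
Concretely, the asymmetry would look like this (offset illustrative,
assuming 'off' lands in the paged area of the skb):

	__u8 b;

	bpf_dynptr_read(&b, 1, &ptr, off, 0);   /* ok, even in the frags */
	b ^= 0x1;
	bpf_dynptr_write(&ptr, off, &b, 1, 0);  /* -EAGAIN without a pull */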
Joanne Koong Aug. 1, 2022, 9:16 p.m. UTC | #11
On Mon, Aug 1, 2022 at 12:38 PM Martin KaFai Lau <kafai@fb.com> wrote:
>
> On Mon, Aug 01, 2022 at 10:52:14AM -0700, Joanne Koong wrote:
> > > Since we are on bpf_dynptr_write, what is the reason
> > > on limiting it to the skb_headlen() ?  Not implying one
> > > way is better than another.  would like to undertand the reason
> > > behind it since it is not clear in the commit message.
> > For bpf_dynptr_write, if we don't limit it to skb_headlen() then there
> > may be writes that pull the skb, so any existing data slices to the
> > skb must be invalidated. However, in the verifier we can't detect when
> > the data slice should be invalidated vs. when it shouldn't (eg
> > detecting when a write goes into the paged area vs when the write is
> > only in the head). If the prog wants to write into the paged area, I
> > think the only way it can work is if it pulls the data first with
> > bpf_skb_pull_data before calling bpf_dynptr_write. I will add this to
> > the commit message in v2
> Note that current verifier unconditionally invalidates PTR_TO_PACKET
> after bpf_skb_store_bytes().  Potentially the same could be done for
> other new helper like bpf_dynptr_write().  I think this bpf_dynptr_write()
> behavior cannot be changed later, so want to raise this possibility here
> just in case it wasn't considered before.

Thanks for raising this possibility. To me, it seems more intuitive
from the user standpoint to have bpf_dynptr_write() on a paged area
fail (even if bpf_dynptr_read() on that same offset succeeds) than to
have bpf_dynptr_write() always invalidate all dynptr slices related to
that skb. I think most writes will be to data in the head area, so it
seems unfortunate that bpf_dynptr_write() calls to the head area would
invalidate the dynptr slices regardless.

What are your thoughts? Do you think you prefer having
bpf_dynptr_write() always work regardless of where the data is? If so,
I'm happy to make that change for v2 :)

>
> Thinking from the existing bpf_skb_{load,store}_bytes() and skb->data perspective.
> If the user changes the skb by directly using skb->data to avoid calling
> load_bytes()/store_bytes(), the user will do the necessary bpf_skb_pull_data()
> before reading/writing the skb->data.  If load_bytes()+store_bytes() is used instead,
> it would be hard to reason why the earlier bpf_skb_load_bytes() can load a particular
> byte but [may] need to make an extra bpf_skb_pull_data() call before it can use
> bpf_skb_store_bytes() to store a modified byte at the same offset.
Andrii Nakryiko Aug. 1, 2022, 10:11 p.m. UTC | #12
On Tue, Jul 26, 2022 at 11:48 AM Joanne Koong <joannelkoong@gmail.com> wrote:
>
> Add skb dynptrs, which are dynptrs whose underlying pointer points
> to a skb. The dynptr acts on skb data. skb dynptrs have two main
> benefits. One is that they allow operations on sizes that are not
> statically known at compile-time (eg variable-sized accesses).
> Another is that parsing the packet data through dynptrs (instead of
> through direct access of skb->data and skb->data_end) can be more
> ergonomic and less brittle (eg does not need manual if checking for
> being within bounds of data_end).
>
> For bpf prog types that don't support writes on skb data, the dynptr is
> read-only (writes and data slices are not permitted). For reads on the
> dynptr, this includes reading into data in the non-linear paged buffers
> but for writes and data slices, if the data is in a paged buffer, the
> user must first call bpf_skb_pull_data to pull the data into the linear
> portion.
>
> Additionally, any helper calls that change the underlying packet buffer
> (eg bpf_skb_pull_data) invalidates any data slices of the associated
> dynptr.
>
> Right now, skb dynptrs can only be constructed from skbs that are
> the bpf program context - as such, there does not need to be any
> reference tracking or release on skb dynptrs.
>
> Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> ---
>  include/linux/bpf.h            |  8 ++++-
>  include/linux/filter.h         |  4 +++
>  include/uapi/linux/bpf.h       | 42 ++++++++++++++++++++++++--
>  kernel/bpf/helpers.c           | 54 +++++++++++++++++++++++++++++++++-
>  kernel/bpf/verifier.c          | 43 +++++++++++++++++++++++----
>  net/core/filter.c              | 53 ++++++++++++++++++++++++++++++---
>  tools/include/uapi/linux/bpf.h | 42 ++++++++++++++++++++++++--
>  7 files changed, 229 insertions(+), 17 deletions(-)
>

[...]

> +       type = bpf_dynptr_get_type(dst);
> +
> +       if (flags) {
> +               if (type == BPF_DYNPTR_TYPE_SKB) {
> +                       if (flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))
> +                               return -EINVAL;
> +               } else {
> +                       return -EINVAL;
> +               }
> +       }
> +
> +       if (type == BPF_DYNPTR_TYPE_SKB) {
> +               struct sk_buff *skb = dst->data;
> +
> +               /* if the data is paged, the caller needs to pull it first */
> +               if (dst->offset + offset + len > skb->len - skb->data_len)
> +                       return -EAGAIN;
> +
> +               return __bpf_skb_store_bytes(skb, dst->offset + offset, src, len,
> +                                            flags);
> +       }

It seems like it would be cleaner to have a switch per dynptr type and
each case doing its extra error checking (like CSUM and HASH flags for
TYPE_SKB) and then performing write operation.


memcpy can be either a catch-all default case, or perhaps it's safer
to explicitly list TYPE_LOCAL and TYPE_RINGBUF to do memcpy, and then
the default case should WARN() and return an error?
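
A rough sketch of that shape for bpf_dynptr_write (illustrative only,
using skb_headlen() per the earlier review comment):

	switch (type) {
	case BPF_DYNPTR_TYPE_SKB: {
		struct sk_buff *skb = dst->data;

		if (flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))
			return -EINVAL;
		/* if the data is paged, the caller needs to pull it first */
		if (dst->offset + offset + len > skb_headlen(skb))
			return -EAGAIN;
		return __bpf_skb_store_bytes(skb, dst->offset + offset, src,
					     len, flags);
	}
	case BPF_DYNPTR_TYPE_LOCAL:
	case BPF_DYNPTR_TYPE_RINGBUF:
		if (flags)
			return -EINVAL;
		memcpy(dst->data + dst->offset + offset, src, len);
		return 0;
	default:
		WARN_ONCE(1, "bpf_dynptr_write: unknown dynptr type %d\n", type);
		return -EFAULT;
	}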

> +
>         memcpy(dst->data + dst->offset + offset, src, len);
>
>         return 0;
> @@ -1555,6 +1594,7 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = {
>
>  BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len)
>  {
> +       enum bpf_dynptr_type type;
>         int err;
>
>         if (!ptr->data)
> @@ -1567,6 +1607,18 @@ BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len
>         if (bpf_dynptr_is_rdonly(ptr))
>                 return 0;
>
> +       type = bpf_dynptr_get_type(ptr);
> +
> +       if (type == BPF_DYNPTR_TYPE_SKB) {
> +               struct sk_buff *skb = ptr->data;
> +
> +               /* if the data is paged, the caller needs to pull it first */
> +               if (ptr->offset + offset + len > skb->len - skb->data_len)
> +                       return 0;
> +
> +               return (unsigned long)(skb->data + ptr->offset + offset);
> +       }
> +
>         return (unsigned long)(ptr->data + ptr->offset + offset);

Similarly, all these dynptr helpers effectively dispatch to different
implementations based on the dynptr type. I think a switch is the most
appropriate construct for this.

>  }
>
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 0d523741a543..0838653eeb4e 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -263,6 +263,7 @@ struct bpf_call_arg_meta {
>         u32 subprogno;
>         struct bpf_map_value_off_desc *kptr_off_desc;
>         u8 uninit_dynptr_regno;
> +       enum bpf_dynptr_type type;
>  };
>

[...]
Andrii Nakryiko Aug. 1, 2022, 10:14 p.m. UTC | #13
On Mon, Aug 1, 2022 at 2:16 PM Joanne Koong <joannelkoong@gmail.com> wrote:
>
> On Mon, Aug 1, 2022 at 12:38 PM Martin KaFai Lau <kafai@fb.com> wrote:
> >
> > On Mon, Aug 01, 2022 at 10:52:14AM -0700, Joanne Koong wrote:
> > > > Since we are on bpf_dynptr_write, what is the reason
> > > > on limiting it to the skb_headlen() ?  Not implying one
> > > > way is better than another.  would like to undertand the reason
> > > > behind it since it is not clear in the commit message.
> > > For bpf_dynptr_write, if we don't limit it to skb_headlen() then there
> > > may be writes that pull the skb, so any existing data slices to the
> > > skb must be invalidated. However, in the verifier we can't detect when
> > > the data slice should be invalidated vs. when it shouldn't (eg
> > > detecting when a write goes into the paged area vs when the write is
> > > only in the head). If the prog wants to write into the paged area, I
> > > think the only way it can work is if it pulls the data first with
> > > bpf_skb_pull_data before calling bpf_dynptr_write. I will add this to
> > > the commit message in v2
> > Note that current verifier unconditionally invalidates PTR_TO_PACKET
> > after bpf_skb_store_bytes().  Potentially the same could be done for
> > other new helper like bpf_dynptr_write().  I think this bpf_dynptr_write()
> > behavior cannot be changed later, so want to raise this possibility here
> > just in case it wasn't considered before.
>
> Thanks for raising this possibility. To me, it seems more intuitive
> from the user standpoint to have bpf_dynptr_write() on a paged area
> fail (even if bpf_dynptr_read() on that same offset succeeds) than to
> have bpf_dynptr_write() always invalidate all dynptr slices related to
> that skb. I think most writes will be to the data in the head area,
> so it seems unfortunate that bpf_dynptr_writes to the head area would
> invalidate the dynptr slices regardless.

+1. Given bpf_skb_store_bytes() is a more powerful superset of
bpf_dynptr_write(), I'd keep bpf_dynptr_write() in such a form as to
play nicely with bpf_dynptr_data() pointers.
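
For instance, from the BPF prog side (a sketch using the helper names
proposed in this series; under the v1 semantics the head-area write leaves
the earlier slice usable):

	struct bpf_dynptr ptr;
	struct ethhdr *eth;
	__u8 bytes[4] = { 0xde, 0xad, 0xbe, 0xef };

	if (bpf_dynptr_from_skb(skb, 0, &ptr))
		return TC_ACT_OK;

	eth = bpf_dynptr_data(&ptr, 0, sizeof(*eth));
	if (!eth)
		return TC_ACT_OK;

	/* head-area write: does not invalidate the 'eth' slice above */
	if (bpf_dynptr_write(&ptr, sizeof(*eth), bytes, sizeof(bytes), 0))
		return TC_ACT_OK;

	/* 'eth' can still be dereferenced here */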

>
> What are your thoughts? Do you think you prefer having
> bpf_dynptr_write() always work regardless of where the data is? If so,
> I'm happy to make that change for v2 :)
>
> >
> > Thinking from the existing bpf_skb_{load,store}_bytes() and skb->data perspective.
> > If the user changes the skb by directly using skb->data to avoid calling
> > load_bytes()/store_bytes(), the user will do the necessary bpf_skb_pull_data()
> > before reading/writing the skb->data.  If load_bytes()+store_bytes() is used instead,
> > it would be hard to reason why the earlier bpf_skb_load_bytes() can load a particular
> > byte but [may] need to make an extra bpf_skb_pull_data() call before it can use
> > bpf_skb_store_bytes() to store a modified byte at the same offset.
Martin KaFai Lau Aug. 1, 2022, 10:32 p.m. UTC | #14
On Mon, Aug 01, 2022 at 02:16:23PM -0700, Joanne Koong wrote:
> On Mon, Aug 1, 2022 at 12:38 PM Martin KaFai Lau <kafai@fb.com> wrote:
> >
> > On Mon, Aug 01, 2022 at 10:52:14AM -0700, Joanne Koong wrote:
> > > > Since we are on bpf_dynptr_write, what is the reason
> > > > on limiting it to the skb_headlen() ?  Not implying one
> > > > way is better than another.  Would like to understand the reason
> > > > behind it since it is not clear in the commit message.
> > > For bpf_dynptr_write, if we don't limit it to skb_headlen() then there
> > > may be writes that pull the skb, so any existing data slices to the
> > > skb must be invalidated. However, in the verifier we can't detect when
> > > the data slice should be invalidated vs. when it shouldn't (eg
> > > detecting when a write goes into the paged area vs when the write is
> > > only in the head). If the prog wants to write into the paged area, I
> > > think the only way it can work is if it pulls the data first with
> > > bpf_skb_pull_data before calling bpf_dynptr_write. I will add this to
> > > the commit message in v2
> > Note that current verifier unconditionally invalidates PTR_TO_PACKET
> > after bpf_skb_store_bytes().  Potentially the same could be done for
> > other new helper like bpf_dynptr_write().  I think this bpf_dynptr_write()
> > behavior cannot be changed later, so want to raise this possibility here
> > just in case it wasn't considered before.
> 
> Thanks for raising this possibility. To me, it seems more intuitive
> from the user standpoint to have bpf_dynptr_write() on a paged area
> fail (even if bpf_dynptr_read() on that same offset succeeds) than to
> have bpf_dynptr_write() always invalidate all dynptr slices related to
> that skb. I think most writes will be to the data in the head area,
> so it seems unfortunate that bpf_dynptr_writes to the head area would
> invalidate the dynptr slices regardless.
> 
> What are your thoughts? Do you think you prefer having
> bpf_dynptr_write() always work regardless of where the data is? If so,
> I'm happy to make that change for v2 :)
Yeah, it sounds like an optimization to avoid unnecessarily
invalidating the sliced data.

To be honest, I am not sure how often the dynptr_data()+dynptr_write() combo
will be used, considering there is usually a pkt read before a pkt write in
the pkt modification use case.  If I got far enough to have a sliced data
pointer that satisfies what I need for reading, I would try to avoid making
an extra call to dynptr_write() to modify it.

I would prefer that the user can have the same expectation (no need to worry
about the pkt layout) for both dynptr_read() and dynptr_write(), and also a
similar experience to bpf_skb_load_bytes() and bpf_skb_store_bytes().
Otherwise, it is just an unnecessary rule for the user to remember, with no
clear benefit from this optimization.

I won't insist though.  User can always stay with the bpf_skb_load_bytes()
and bpf_skb_store_bytes() to avoid worrying about the skb layout.
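
(Concretely, the asymmetry being discussed, assuming 'ptr' is an skb dynptr
and 'paged_off' is an offset that lands in the frags:)

	char b;

	/* ok under v1: the read copies out of the paged area */
	bpf_dynptr_read(&b, sizeof(b), &ptr, paged_off, 0);
	/* -EAGAIN under v1: a write to the very same offset is refused
	 * until the prog calls bpf_skb_pull_data() */
	bpf_dynptr_write(&ptr, paged_off, &b, sizeof(b), 0);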

> >
> > Thinking from the existing bpf_skb_{load,store}_bytes() and skb->data perspective.
> > If the user changes the skb by directly using skb->data to avoid calling
> > load_bytes()/store_bytes(), the user will do the necessary bpf_skb_pull_data()
> > before reading/writing the skb->data.  If load_bytes()+store_bytes() is used instead,
> > it would be hard to reason why the earlier bpf_skb_load_bytes() can load a particular
> > byte but [may] need to make an extra bpf_skb_pull_data() call before it can use
> > bpf_skb_store_bytes() to store a modified byte at the same offset.
Andrii Nakryiko Aug. 1, 2022, 10:58 p.m. UTC | #15
On Mon, Aug 1, 2022 at 3:33 PM Martin KaFai Lau <kafai@fb.com> wrote:
>
> On Mon, Aug 01, 2022 at 02:16:23PM -0700, Joanne Koong wrote:
> > On Mon, Aug 1, 2022 at 12:38 PM Martin KaFai Lau <kafai@fb.com> wrote:
> > >
> > > On Mon, Aug 01, 2022 at 10:52:14AM -0700, Joanne Koong wrote:
> > > > > Since we are on bpf_dynptr_write, what is the reason
> > > > > on limiting it to the skb_headlen() ?  Not implying one
> > > > > way is better than another.  Would like to understand the reason
> > > > > behind it since it is not clear in the commit message.
> > > > For bpf_dynptr_write, if we don't limit it to skb_headlen() then there
> > > > may be writes that pull the skb, so any existing data slices to the
> > > > skb must be invalidated. However, in the verifier we can't detect when
> > > > the data slice should be invalidated vs. when it shouldn't (eg
> > > > detecting when a write goes into the paged area vs when the write is
> > > > only in the head). If the prog wants to write into the paged area, I
> > > > think the only way it can work is if it pulls the data first with
> > > > bpf_skb_pull_data before calling bpf_dynptr_write. I will add this to
> > > > the commit message in v2
> > > Note that current verifier unconditionally invalidates PTR_TO_PACKET
> > > after bpf_skb_store_bytes().  Potentially the same could be done for
> > > other new helper like bpf_dynptr_write().  I think this bpf_dynptr_write()
> > > behavior cannot be changed later, so want to raise this possibility here
> > > just in case it wasn't considered before.
> >
> > Thanks for raising this possibility. To me, it seems more intuitive
> > from the user standpoint to have bpf_dynptr_write() on a paged area
> > fail (even if bpf_dynptr_read() on that same offset succeeds) than to
> > have bpf_dynptr_write() always invalidate all dynptr slices related to
> > that skb. I think most writes will be to the data in the head area,
> > so it seems unfortunate that bpf_dynptr_writes to the head area would
> > invalidate the dynptr slices regardless.
> >
> > What are your thoughts? Do you think you prefer having
> > bpf_dynptr_write() always work regardless of where the data is? If so,
> > I'm happy to make that change for v2 :)
> Yeah, it sounds like an optimization to avoid unnecessarily
> invalidating the sliced data.
>
> To be honest, I am not sure how often the dynptr_data()+dynptr_write() combo
> will be used, considering there is usually a pkt read before a pkt write in
> the pkt modification use case.  If I got far enough to have a sliced data
> pointer that satisfies what I need for reading, I would try to avoid making
> an extra call to dynptr_write() to modify it.
>
> I would prefer that the user can have the same expectation (no need to worry
> about the pkt layout) for both dynptr_read() and dynptr_write(), and also a
> similar experience to bpf_skb_load_bytes() and bpf_skb_store_bytes().
> Otherwise, it is just an unnecessary rule for the user to remember, with no
> clear benefit from this optimization.
>

Are you saying that bpf_dynptr_read() shouldn't read from non-linear
part of skb (and thus match more restrictive bpf_dynptr_write), or are
you saying you'd rather have bpf_dynptr_write() write into non-linear
part but invalidate bpf_dynptr_data() pointers?

I guess I agree about consistency and that it seems like in practice
you'd use bpf_dynptr_data() to work with headers and stuff like that
at known locations, and then if you need to modify the rest of payload
you'd do either bpf_skb_load_bytes()/bpf_skb_store_bytes() or
bpf_dynptr_read()/bpf_dynptr_write() which would invalidate
bpf_dynptr_data() pointers (but that would be ok by that time).


> I won't insist though.  User can always stay with the bpf_skb_load_bytes()
> and bpf_skb_store_bytes() to avoid worrying about the skb layout.
>
> > >
> > > Thinking from the existing bpf_skb_{load,store}_bytes() and skb->data perspective.
> > > If the user changes the skb by directly using skb->data to avoid calling
> > > load_bytes()/store_bytes(), the user will do the necessary bpf_skb_pull_data()
> > > before reading/writing the skb->data.  If load_bytes()+store_bytes() is used instead,
> > > it would be hard to reason why the earlier bpf_skb_load_bytes() can load a particular
> > > byte but [may] need to make an extra bpf_skb_pull_data() call before it can use
> > > bpf_skb_store_bytes() to store a modified byte at the same offset.
Martin KaFai Lau Aug. 1, 2022, 11:23 p.m. UTC | #16
On Mon, Aug 01, 2022 at 03:58:41PM -0700, Andrii Nakryiko wrote:
> On Mon, Aug 1, 2022 at 3:33 PM Martin KaFai Lau <kafai@fb.com> wrote:
> >
> > On Mon, Aug 01, 2022 at 02:16:23PM -0700, Joanne Koong wrote:
> > > On Mon, Aug 1, 2022 at 12:38 PM Martin KaFai Lau <kafai@fb.com> wrote:
> > > >
> > > > On Mon, Aug 01, 2022 at 10:52:14AM -0700, Joanne Koong wrote:
> > > > > > Since we are on bpf_dynptr_write, what is the reason
> > > > > > on limiting it to the skb_headlen() ?  Not implying one
> > > > > > way is better than another.  Would like to understand the reason
> > > > > > behind it since it is not clear in the commit message.
> > > > > For bpf_dynptr_write, if we don't limit it to skb_headlen() then there
> > > > > may be writes that pull the skb, so any existing data slices to the
> > > > > skb must be invalidated. However, in the verifier we can't detect when
> > > > > the data slice should be invalidated vs. when it shouldn't (eg
> > > > > detecting when a write goes into the paged area vs when the write is
> > > > > only in the head). If the prog wants to write into the paged area, I
> > > > > think the only way it can work is if it pulls the data first with
> > > > > bpf_skb_pull_data before calling bpf_dynptr_write. I will add this to
> > > > > the commit message in v2
> > > > Note that current verifier unconditionally invalidates PTR_TO_PACKET
> > > > after bpf_skb_store_bytes().  Potentially the same could be done for
> > > > other new helper like bpf_dynptr_write().  I think this bpf_dynptr_write()
> > > > behavior cannot be changed later, so want to raise this possibility here
> > > > just in case it wasn't considered before.
> > >
> > > Thanks for raising this possibility. To me, it seems more intuitive
> > > from the user standpoint to have bpf_dynptr_write() on a paged area
> > > fail (even if bpf_dynptr_read() on that same offset succeeds) than to
> > > have bpf_dynptr_write() always invalidate all dynptr slices related to
> > > that skb. I think most writes will be to the data in the head area,
> > > so it seems unfortunate that bpf_dynptr_writes to the head area would
> > > invalidate the dynptr slices regardless.
> > >
> > > What are your thoughts? Do you think you prefer having
> > > bpf_dynptr_write() always work regardless of where the data is? If so,
> > > I'm happy to make that change for v2 :)
> > Yeah, it sounds like an optimization to avoid unnecessarily
> > invalidating the sliced data.
> >
> > To be honest, I am not sure how often the dynptr_data()+dynptr_write() combo
> > will be used, considering there is usually a pkt read before a pkt write in
> > the pkt modification use case.  If I got far enough to have a sliced data
> > pointer that satisfies what I need for reading, I would try to avoid making
> > an extra call to dynptr_write() to modify it.
> >
> > I would prefer that the user can have the same expectation (no need to worry
> > about the pkt layout) for both dynptr_read() and dynptr_write(), and also a
> > similar experience to bpf_skb_load_bytes() and bpf_skb_store_bytes().
> > Otherwise, it is just an unnecessary rule for the user to remember, with no
> > clear benefit from this optimization.
> >
> 
> Are you saying that bpf_dynptr_read() shouldn't read from non-linear
> part of skb (and thus match more restrictive bpf_dynptr_write), or are
> you saying you'd rather have bpf_dynptr_write() write into non-linear
> part but invalidate bpf_dynptr_data() pointers?
The latter.  Read and write without worrying about the skb layout.

Also, if the prog needs to call a helper to write, it knows the bytes are
not in the data pointer.  Then it needs to bpf_skb_pull_data() before
it can call write.  However, after bpf_skb_pull_data(), why does the prog
need to call the write helper instead of directly getting a new
data pointer and writing to it?  If the prog needs to write many
bytes, a write helper may then help.
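
(i.e., roughly this pattern; 'skb', 'ptr', 'off', 'src' and a
compile-time-constant LEN are assumed from context:)

	__u8 *p;

	if (bpf_skb_pull_data(skb, off + LEN))
		return TC_ACT_OK;
	/* after the pull, a fresh slice covers the bytes directly */
	p = bpf_dynptr_data(&ptr, off, LEN);
	if (!p)
		return TC_ACT_OK;
	__builtin_memcpy(p, src, LEN);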

> 
> I guess I agree about consistency and that it seems like in practice
> you'd use bpf_dynptr_data() to work with headers and stuff like that
> at known locations, and then if you need to modify the rest of payload
> you'd do either bpf_skb_load_bytes()/bpf_skb_store_bytes() or
> bpf_dynptr_read()/bpf_dynptr_write() which would invalidate
> bpf_dynptr_data() pointers (but that would be ok by that time).
imo, read, write and then go back to read is less common.
writing bytes without first reading them is also less common.

> 
> 
> > I won't insist though.  User can always stay with the bpf_skb_load_bytes()
> > and bpf_skb_store_bytes() to avoid worrying about the skb layout.
Jakub Kicinski Aug. 1, 2022, 11:33 p.m. UTC | #17
(consider cross-posting network-related stuff to netdev@)

On Tue, 26 Jul 2022 11:47:04 -0700 Joanne Koong wrote:
> Add skb dynptrs, which are dynptrs whose underlying pointer points
> to a skb. The dynptr acts on skb data. skb dynptrs have two main
> benefits. One is that they allow operations on sizes that are not
> statically known at compile-time (eg variable-sized accesses).
> Another is that parsing the packet data through dynptrs (instead of
> through direct access of skb->data and skb->data_end) can be more
> ergonomic and less brittle (eg does not need manual if checking for
> being within bounds of data_end).

Is there really a need for dynptr_from_{skb,xdp} to be different
function IDs? I was hoping this work would improve portability of
networking BPF programs across the hooks.
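
Something like a single helper that accepts either program context (the
name here is purely illustrative):

	/* one function ID for both hooks; 'ctx' is either a
	 * struct __sk_buff * or a struct xdp_md * */
	struct bpf_dynptr ptr;

	if (bpf_dynptr_from_packet(ctx, 0, &ptr))
		return 0;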

> For bpf prog types that don't support writes on skb data, the dynptr is
> read-only (writes and data slices are not permitted). For reads on the
> dynptr, this includes reading into data in the non-linear paged buffers
> but for writes and data slices, if the data is in a paged buffer, the
> user must first call bpf_skb_pull_data to pull the data into the linear
> portion.
> 
> Additionally, any helper calls that change the underlying packet buffer
> (eg bpf_skb_pull_data) invalidates any data slices of the associated
> dynptr.

Grepping the verifier did not help me find that, would you mind
pointing me to the code?

> Right now, skb dynptrs can only be constructed from skbs that are
> the bpf program context - as such, there does not need to be any
> reference tracking or release on skb dynptrs.
Joanne Koong Aug. 2, 2022, 12:15 a.m. UTC | #18
On Mon, Aug 1, 2022 at 3:11 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Tue, Jul 26, 2022 at 11:48 AM Joanne Koong <joannelkoong@gmail.com> wrote:
> >
> > Add skb dynptrs, which are dynptrs whose underlying pointer points
> > to a skb. The dynptr acts on skb data. skb dynptrs have two main
> > benefits. One is that they allow operations on sizes that are not
> > statically known at compile-time (eg variable-sized accesses).
> > Another is that parsing the packet data through dynptrs (instead of
> > through direct access of skb->data and skb->data_end) can be more
> > ergonomic and less brittle (eg does not need manual if checking for
> > being within bounds of data_end).
> >
> > For bpf prog types that don't support writes on skb data, the dynptr is
> > read-only (writes and data slices are not permitted). For reads on the
> > dynptr, this includes reading into data in the non-linear paged buffers
> > but for writes and data slices, if the data is in a paged buffer, the
> > user must first call bpf_skb_pull_data to pull the data into the linear
> > portion.
> >
> > Additionally, any helper calls that change the underlying packet buffer
> > (eg bpf_skb_pull_data) invalidates any data slices of the associated
> > dynptr.
> >
> > Right now, skb dynptrs can only be constructed from skbs that are
> > the bpf program context - as such, there does not need to be any
> > reference tracking or release on skb dynptrs.
> >
> > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > ---
> >  include/linux/bpf.h            |  8 ++++-
> >  include/linux/filter.h         |  4 +++
> >  include/uapi/linux/bpf.h       | 42 ++++++++++++++++++++++++--
> >  kernel/bpf/helpers.c           | 54 +++++++++++++++++++++++++++++++++-
> >  kernel/bpf/verifier.c          | 43 +++++++++++++++++++++++----
> >  net/core/filter.c              | 53 ++++++++++++++++++++++++++++++---
> >  tools/include/uapi/linux/bpf.h | 42 ++++++++++++++++++++++++--
> >  7 files changed, 229 insertions(+), 17 deletions(-)
> >
>
> [...]
>
> > +       type = bpf_dynptr_get_type(dst);
> > +
> > +       if (flags) {
> > +               if (type == BPF_DYNPTR_TYPE_SKB) {
> > +                       if (flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))
> > +                               return -EINVAL;
> > +               } else {
> > +                       return -EINVAL;
> > +               }
> > +       }
> > +
> > +       if (type == BPF_DYNPTR_TYPE_SKB) {
> > +               struct sk_buff *skb = dst->data;
> > +
> > +               /* if the data is paged, the caller needs to pull it first */
> > +               if (dst->offset + offset + len > skb->len - skb->data_len)
> > +                       return -EAGAIN;
> > +
> > +               return __bpf_skb_store_bytes(skb, dst->offset + offset, src, len,
> > +                                            flags);
> > +       }
>
> It seems like it would be cleaner to have a switch per dynptr type and
> each case doing its extra error checking (like CSUM and HASH flags for
> TYPE_SKB) and then performing write operation.
>
>
> memcpy can be either a catch-all default case, or perhaps it's safer
> to explicitly list TYPE_LOCAL and TYPE_RINGBUF to do memcpy, and then
> default should WARN() and return error?

Sounds great, I will make these changes (and the one below) for v2

>
> > +
> >         memcpy(dst->data + dst->offset + offset, src, len);
> >
> >         return 0;
> > @@ -1555,6 +1594,7 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = {
> >
> >  BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len)
> >  {
> > +       enum bpf_dynptr_type type;
> >         int err;
> >
> >         if (!ptr->data)
> > @@ -1567,6 +1607,18 @@ BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len
> >         if (bpf_dynptr_is_rdonly(ptr))
> >                 return 0;
> >
> > +       type = bpf_dynptr_get_type(ptr);
> > +
> > +       if (type == BPF_DYNPTR_TYPE_SKB) {
> > +               struct sk_buff *skb = ptr->data;
> > +
> > +               /* if the data is paged, the caller needs to pull it first */
> > +               if (ptr->offset + offset + len > skb->len - skb->data_len)
> > +                       return 0;
> > +
> > +               return (unsigned long)(skb->data + ptr->offset + offset);
> > +       }
> > +
> >         return (unsigned long)(ptr->data + ptr->offset + offset);
>
> Similarly, all these dynptr helpers effectively dispatch different
> implementations based on dynptr type. I think switch is most
> appropriate for this.
>
> >  }
> >
> > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > index 0d523741a543..0838653eeb4e 100644
> > --- a/kernel/bpf/verifier.c
> > +++ b/kernel/bpf/verifier.c
> > @@ -263,6 +263,7 @@ struct bpf_call_arg_meta {
> >         u32 subprogno;
> >         struct bpf_map_value_off_desc *kptr_off_desc;
> >         u8 uninit_dynptr_regno;
> > +       enum bpf_dynptr_type type;
> >  };
> >
>
> [...]
Martin KaFai Lau Aug. 2, 2022, 12:56 a.m. UTC | #19
On Mon, Aug 01, 2022 at 04:23:16PM -0700, Martin KaFai Lau wrote:
> On Mon, Aug 01, 2022 at 03:58:41PM -0700, Andrii Nakryiko wrote:
> > [...]
> > 
> > Are you saying that bpf_dynptr_read() shouldn't read from non-linear
> > part of skb (and thus match more restrictive bpf_dynptr_write), or are
> > you saying you'd rather have bpf_dynptr_write() write into non-linear
> > part but invalidate bpf_dynptr_data() pointers?
> The latter.  Read and write without worrying about the skb layout.
> 
> Also, if the prog needs to call a helper to write, it knows the bytes are
> not in the data pointer.  Then it needs to bpf_skb_pull_data() before
> it can call write.  However, after bpf_skb_pull_data(), why does the prog
> need to call the write helper instead of directly getting a new
> data pointer and writing to it?  If the prog needs to write many
> bytes, a write helper may then help.
After another thought: other than the non-linear handling,
bpf_skb_store_bytes() / dynptr_write() is more useful for
the 'BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH' flags.

That said, my preference is still to have the same expectation on
non-linear data for both dynptr_read() and dynptr_write().  Considering
the user can fall back to using bpf_skb_load_bytes() and
bpf_skb_store_bytes(), I am fine with the current patch also.

Joanne Koong Aug. 2, 2022, 2:12 a.m. UTC | #20
On Mon, Aug 1, 2022 at 4:33 PM Jakub Kicinski <kuba@kernel.org> wrote:
>
> (consider cross-posting network-related stuff to netdev@)

Great, I will start cc-ing netdev@

>
> On Tue, 26 Jul 2022 11:47:04 -0700 Joanne Koong wrote:
> > Add skb dynptrs, which are dynptrs whose underlying pointer points
> > to a skb. The dynptr acts on skb data. skb dynptrs have two main
> > benefits. One is that they allow operations on sizes that are not
> > statically known at compile-time (eg variable-sized accesses).
> > Another is that parsing the packet data through dynptrs (instead of
> > through direct access of skb->data and skb->data_end) can be more
> > ergonomic and less brittle (eg does not need manual if checking for
> > being within bounds of data_end).
>
> Is there really a need for dynptr_from_{skb,xdp} to be different
> function IDs? I was hoping this work would improve portability of
> networking BPF programs across the hooks.

Awesome, I like this idea of having just one generic API named
something like bpf_dynptr_from_packet. I'll add this for v2!

>
> > For bpf prog types that don't support writes on skb data, the dynptr is
> > read-only (writes and data slices are not permitted). For reads on the
> > dynptr, this includes reading into data in the non-linear paged buffers
> > but for writes and data slices, if the data is in a paged buffer, the
> > user must first call bpf_skb_pull_data to pull the data into the linear
> > portion.
> >
> > Additionally, any helper calls that change the underlying packet buffer
> > (eg bpf_skb_pull_data) invalidates any data slices of the associated
> > dynptr.
>
> Grepping the verifier did not help me find that, would you mind
> pointing me to the code?

The base reg type of a skb data slice will be PTR_TO_PACKET - this
gets set in this patch in check_helper_call() in verifier.c:

+ if (func_id == BPF_FUNC_dynptr_data &&
+    meta.type == BPF_DYNPTR_TYPE_SKB)
+ regs[BPF_REG_0].type = PTR_TO_PACKET | ret_flag;

Anytime there is a helper call that changes the underlying packet
buffer [0], the verifier iterates through the registers and marks all
PTR_TO_PACKET reg types as unknown, which invalidates them. The dynptr
data slice will be invalidated since its base reg type is
PTR_TO_PACKET.

The stack trace is:
   check_helper_call() -> clear_all_pkt_pointers() ->
__clear_all_pkt_pointers() -> mark_reg_unknown()
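
For reference, the invalidation itself looks roughly like this (paraphrased
from kernel/bpf/verifier.c around v5.19):

static void __clear_all_pkt_pointers(struct bpf_verifier_env *env,
				     struct bpf_func_state *state)
{
	struct bpf_reg_state *reg;
	int i;

	/* mark every packet pointer in the registers as unknown */
	for (i = 0; i < MAX_BPF_REG; i++) {
		reg = &state->regs[i];
		if (reg_is_pkt_pointer_any(reg))
			mark_reg_unknown(env, state->regs, i);
	}

	/* ... and every packet pointer spilled to the stack */
	bpf_for_each_spilled_reg(i, state, reg) {
		if (!reg)
			continue;
		if (reg_is_pkt_pointer_any(reg))
			__mark_reg_unknown(env, reg);
	}
}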


I will add this explanation to the commit message for v2 since it is non-obvious.


[0] https://elixir.bootlin.com/linux/latest/source/kernel/bpf/verifier.c#L7143

[1] https://elixir.bootlin.com/linux/latest/source/kernel/bpf/verifier.c#L6489


>
> > Right now, skb dynptrs can only be constructed from skbs that are
> > the bpf program context - as such, there does not need to be any
> > reference tracking or release on skb dynptrs.
Andrii Nakryiko Aug. 2, 2022, 3:51 a.m. UTC | #21
On Mon, Aug 1, 2022 at 5:56 PM Martin KaFai Lau <kafai@fb.com> wrote:
>
> [...]
> After another thought: other than the non-linear handling,
> bpf_skb_store_bytes() / dynptr_write() is more useful for
> the 'BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH' flags.
>
> That said, my preference is still to have the same expectation on
> non-linear data for both dynptr_read() and dynptr_write().  Considering
> the user can fall back to using bpf_skb_load_bytes() and
> bpf_skb_store_bytes(), I am fine with the current patch also.
>

Honestly, I don't have any specific preference, because I don't have
much specific experience writing networking BPF :)

But considering Jakub's point about trying to unify skb/xdp dynptr,
while I can see how we might have symmetrical dynptr_{read,write}()
for skb case (because you can pull skb), I believe this is not
possible with XDP (e.g., multi-buffer one), so bpf_dynptr_write()
would always be more limited for XDP case.

Or maybe it is possible for XDP and I'm totally wrong here? I'm happy
to be educated about this!

Joanne Koong Aug. 2, 2022, 4:53 a.m. UTC | #22
On Mon, Aug 1, 2022 at 8:51 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> [...]
>
> Honestly, I don't have any specific preference, because I don't have
> much specific experience writing networking BPF :)
>
> But considering Jakub's point about trying to unify skb/xdp dynptr,
> while I can see how we might have symmetrical dynptr_{read,write}()
> for skb case (because you can pull skb), I believe this is not
> possible with XDP (e.g., multi-buffer one), so bpf_dynptr_write()
> would always be more limited for XDP case.
>
> Or maybe it is possible for XDP and I'm totally wrong here? I'm happy
> to be educated about this!

My understanding is that it's possible for XDP because the data in the
frags is mapped [eg we can use skb_frag_address() to get the address
and then copy into it with direct memcpys [0]], whereas skb frags are
unmapped (eg access into the frag requires kmapping [1]).

Maybe one solution is to add a function that does the mapping + write
to a skb frag without pulling it to the head. This would allow
bpf_dynptr_write to reach all data without needing to invalidate any
dynptr slices. But I don't know whether this is compatible with
recomputing the checksum or not; maybe the written data needs to be
mapped (and hence part of the head) so that it can be used to compute
the checksum [2] - I'll read up some more on the checksumming code.
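
A rough sketch (not the kernel's actual helper) of why the frag write works
for XDP, loosely modeled on bpf_xdp_copy_buf() in net/core/filter.c [0]:

static void xdp_store_bytes_sketch(struct xdp_buff *xdp, u32 off,
				   const void *src, u32 len)
{
	struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
	u32 headlen = xdp->data_end - xdp->data;
	const char *s = src;
	u32 i, copy;

	if (off < headlen) {
		copy = min(len, headlen - off);
		memcpy((char *)xdp->data + off, s, copy);
		s += copy;
		len -= copy;
		off += copy;
	}
	/* remaining bytes land in the frag pages, which are already
	 * kernel-mapped, so a plain memcpy is enough */
	off = off > headlen ? off - headlen : 0;
	for (i = 0; i < sinfo->nr_frags && len; i++) {
		skb_frag_t *frag = &sinfo->frags[i];
		u32 size = skb_frag_size(frag);

		if (off >= size) {
			off -= size;
			continue;
		}
		copy = min(len, size - off);
		memcpy((char *)skb_frag_address(frag) + off, s, copy);
		s += copy;
		len -= copy;
		off = 0;
	}
}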

I like your point, Martin, that if people are using bpf_dynptr_write,
then they probably aren't using data slices much anyway, so it
wouldn't be too inconvenient that their slices are invalidated (eg if
they are using bpf_dynptr_write, it's to write into the skb frag, at
which point they would need to call pull before bpf_dynptr_write,
which would lead to the same scenario where the data slices are
invalidated). My main concern was that slices would be invalidated for
bpf_dynptr_writes on data in the head area, but you're right that that
shouldn't be too likely since they'd just be using a direct data slice
access instead to read/write. I'll change it so that bpf_dynptr_write
always succeeds and always invalidates the data slices for v2.

[0] https://elixir.bootlin.com/linux/v5.19/source/net/core/filter.c#L3846
[1] https://elixir.bootlin.com/linux/v5.19/source/net/core/skbuff.c#L2367
[2] https://elixir.bootlin.com/linux/v5.19/source/include/linux/skbuff.h#L3839

Joanne Koong Aug. 2, 2022, 5:14 a.m. UTC | #23
On Mon, Aug 1, 2022 at 9:53 PM Joanne Koong <joannelkoong@gmail.com> wrote:
> [...]
> I like your point, Martin, that if people are using bpf_dynptr_write,
> then they probably aren't using data slices much anyway, so it
> wouldn't be too inconvenient that their slices are invalidated (eg if
> they are using bpf_dynptr_write, it's to write into the skb frag, at
> which point they would need to call pull before bpf_dynptr_write,
> which would lead to the same scenario where the data slices are
> invalidated). My main concern was that slices would be invalidated for
> bpf_dynptr_writes on data in the head area, but you're right that that
> shouldn't be too likely since they'd just be using a direct data slice
> access instead to read/write. I'll change it so that bpf_dynptr_write
> always succeeds and always invalidates the data slices for v2.

On second thought, for v2 I plan to combine xdp and skb into 1 generic
function ("bpf_dynptr_from_packet") per Jakub's suggestion. In that
case, for consistency, it'd be better if we don't invalidate the
slices, since it would be confusing if bpf_dynptr_write invalidates
slices for skb-type progs but not xdp-type ones. I think
bpf_dynptr_write returning an error for writes into the frag area for
skb-type progs but not xdp-type progs is less jarring than dynptr
slices being invalidated for skb-type progs but not xdp-type progs.
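
(Under those semantics, the skb-prog pattern for a frag-area write would
look roughly like this; 'skb', 'ptr', 'off' and 'buf' are assumed from
context:)

	err = bpf_dynptr_write(&ptr, off, buf, sizeof(buf), 0);
	if (err == -EAGAIN) {
		/* the bytes live in the frags: pull them into the linear
		 * area first (this invalidates existing data slices, but
		 * not the dynptr itself), then retry */
		if (bpf_skb_pull_data(skb, off + sizeof(buf)))
			return TC_ACT_OK;
		err = bpf_dynptr_write(&ptr, off, buf, sizeof(buf), 0);
	}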

Martin KaFai Lau Aug. 3, 2022, 6:37 a.m. UTC | #24
On Tue, Jul 26, 2022 at 11:47:04AM -0700, Joanne Koong wrote:
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 59a217ca2dfd..0730cd198a7f 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -5241,11 +5241,22 @@ union bpf_attr {
>   *	Description
>   *		Write *len* bytes from *src* into *dst*, starting from *offset*
>   *		into *dst*.
> - *		*flags* is currently unused.
> + *
> + *		*flags* must be 0 except for skb-type dynptrs.
> + *
> + *		For skb-type dynptrs:
> + *		    *  if *offset* + *len* extends into the skb's paged buffers, the user
> + *		       should manually pull the skb with bpf_skb_pull and then try again.
bpf_skb_pull_data().

Probably need formatting like, **bpf_skb_pull_data**\ ()

> + *
> + *		    *  *flags* are a combination of **BPF_F_RECOMPUTE_CSUM** (automatically
> + *			recompute the checksum for the packet after storing the bytes) and
> + *			**BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\
> + *			**->swhash** and *skb*\ **->l4hash** to 0).
>   *	Return
>   *		0 on success, -E2BIG if *offset* + *len* exceeds the length
>   *		of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst*
> - *		is a read-only dynptr or if *flags* is not 0.
> + *		is a read-only dynptr or if *flags* is not correct, -EAGAIN if for
> + *		skb-type dynptrs the write extends into the skb's paged buffers.
It may also be worth mentioning that the other negative errors are
similar to bpf_skb_store_bytes()'s, instead of listing them one-by-one here.

>   *
>   * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len)
>   *	Description
> @@ -5253,10 +5264,19 @@ union bpf_attr {
>   *
>   *		*len* must be a statically known value. The returned data slice
>   *		is invalidated whenever the dynptr is invalidated.
> + *
> + *		For skb-type dynptrs:
> + *		    * if *offset* + *len* extends into the skb's paged buffers,
> + *		      the user should manually pull the skb with bpf_skb_pull and then
same here. bpf_skb_pull_data().

> + *		      try again.
> + *
> + *		    * the data slice is automatically invalidated anytime a
> + *		      helper call that changes the underlying packet buffer
> + *		      (eg bpf_skb_pull) is called.
>   *	Return
>   *		Pointer to the underlying dynptr data, NULL if the dynptr is
>   *		read-only, if the dynptr is invalid, or if the offset and length
> - *		is out of bounds.
> + *		is out of bounds or in a paged buffer for skb-type dynptrs.
>   *
>   * s64 bpf_tcp_raw_gen_syncookie_ipv4(struct iphdr *iph, struct tcphdr *th, u32 th_len)
>   *	Description
> @@ -5331,6 +5351,21 @@ union bpf_attr {
>   *		**-EACCES** if the SYN cookie is not valid.
>   *
>   *		**-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin.
> + *
> + * long bpf_dynptr_from_skb(struct sk_buff *skb, u64 flags, struct bpf_dynptr *ptr)
> + *	Description
> + *		Get a dynptr to the data in *skb*. *skb* must be the BPF program
> + *		context. Depending on program type, the dynptr may be read-only,
> + *		in which case trying to obtain a direct data slice to it through
> + *		bpf_dynptr_data will return an error.
> + *
> + *		Calls that change the *skb*'s underlying packet buffer
> + *		(eg bpf_skb_pull_data) do not invalidate the dynptr, but they do
> + *		invalidate any data slices associated with the dynptr.
> + *
> + *		*flags* is currently unused, it must be 0 for now.
> + *	Return
> + *		0 on success or -EINVAL if flags is not 0.
>   */

[ ... ]

> @@ -1528,15 +1544,38 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = {
>  BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src,
>  	   u32, len, u64, flags)
>  {
> +	enum bpf_dynptr_type type;
>  	int err;
>  
> -	if (!dst->data || flags || bpf_dynptr_is_rdonly(dst))
> +	if (!dst->data || bpf_dynptr_is_rdonly(dst))
>  		return -EINVAL;
>  
>  	err = bpf_dynptr_check_off_len(dst, offset, len);
>  	if (err)
>  		return err;
>  
> +	type = bpf_dynptr_get_type(dst);
> +
> +	if (flags) {
> +		if (type == BPF_DYNPTR_TYPE_SKB) {
> +			if (flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))
nit: the flags are the same as __bpf_skb_store_bytes()'s.
__bpf_skb_store_bytes() can reject them as well instead of duplicating
the test here.

> +				return -EINVAL;
> +		} else {
> +			return -EINVAL;
> +		}
> +	}
> +
> +	if (type == BPF_DYNPTR_TYPE_SKB) {
> +		struct sk_buff *skb = dst->data;
> +
> +		/* if the data is paged, the caller needs to pull it first */
> +		if (dst->offset + offset + len > skb->len - skb->data_len)
> +			return -EAGAIN;
> +
> +		return __bpf_skb_store_bytes(skb, dst->offset + offset, src, len,
> +					     flags);
> +	}
> +
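
A minimal sketch of that nit applied (illustrative only; it assumes the
flags check stays inside __bpf_skb_store_bytes(), and uses skb_headlen()
in place of the open-coded skb->len - skb->data_len):

    if (type == BPF_DYNPTR_TYPE_SKB) {
        struct sk_buff *skb = dst->data;

        /* if the data is paged, the caller needs to pull it first */
        if (dst->offset + offset + len > skb_headlen(skb))
            return -EAGAIN;

        /* flags validation is left to __bpf_skb_store_bytes() */
        return __bpf_skb_store_bytes(skb, dst->offset + offset, src,
                                     len, flags);
    }

    /* non-skb dynptr types take no flags */
    if (flags)
        return -EINVAL;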
Joanne Koong Aug. 3, 2022, 8:29 p.m. UTC | #25
On Fri, Jul 29, 2022 at 2:39 PM Martin KaFai Lau <kafai@fb.com> wrote:
>
> On Fri, Jul 29, 2022 at 01:26:31PM -0700, Joanne Koong wrote:
> > On Thu, Jul 28, 2022 at 4:39 PM Martin KaFai Lau <kafai@fb.com> wrote:
> > >
> > > On Tue, Jul 26, 2022 at 11:47:04AM -0700, Joanne Koong wrote:
> > > > @@ -1567,6 +1607,18 @@ BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len
> > > >       if (bpf_dynptr_is_rdonly(ptr))
> > > Is it possible to allow data slice for rdonly dynptr-skb?
> > > and depends on the may_access_direct_pkt_data() check in the verifier.
> >
> > Ooh great idea. This should be very simple to do, since the data slice
> > that gets returned is assigned as PTR_TO_PACKET. So any stx operations
> > on it will by default go through the may_access_direct_pkt_data()
> > check. I'll add this for v2.
> It will be great.  Out of all three helpers (bpf_dynptr_read/write/data),
> bpf_dynptr_data will be the useful one to parse the header data (e.g. tcp-hdr-opt)
> that has runtime variable length because bpf_dynptr_data() can take a non-const
> 'offset' argument.  It is useful to get a consistent usage across all bpf
> prog types that are either read-only or read-write of the skb.
>
> >
> > >
> > > >               return 0;
> > > >
> > > > +     type = bpf_dynptr_get_type(ptr);
> > > > +
> > > > +     if (type == BPF_DYNPTR_TYPE_SKB) {
> > > > +             struct sk_buff *skb = ptr->data;
> > > > +
> > > > +             /* if the data is paged, the caller needs to pull it first */
> > > > +             if (ptr->offset + offset + len > skb->len - skb->data_len)
> > > > +                     return 0;
> > > > +
> > > > +             return (unsigned long)(skb->data + ptr->offset + offset);
> > > > +     }
> > > > +
> > > >       return (unsigned long)(ptr->data + ptr->offset + offset);
> > > >  }
> > >
> > > [ ... ]
> > >
> > > > -static u32 stack_slot_get_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
> > > > +static void stack_slot_get_dynptr_info(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
> > > > +                                    struct bpf_call_arg_meta *meta)
> > > >  {
> > > >       struct bpf_func_state *state = func(env, reg);
> > > >       int spi = get_spi(reg->off);
> > > >
> > > > -     return state->stack[spi].spilled_ptr.id;
> > > > +     meta->ref_obj_id = state->stack[spi].spilled_ptr.id;
> > > > +     meta->type = state->stack[spi].spilled_ptr.dynptr.type;
> > > >  }
> > > >
> > > >  static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
> > > > @@ -6052,6 +6057,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
> > > >                               case DYNPTR_TYPE_RINGBUF:
> > > >                                       err_extra = "ringbuf ";
> > > >                                       break;
> > > > +                             case DYNPTR_TYPE_SKB:
> > > > +                                     err_extra = "skb ";
> > > > +                                     break;
> > > >                               default:
> > > >                                       break;
> > > >                               }
> > > > @@ -6065,8 +6073,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
> > > >                                       verbose(env, "verifier internal error: multiple refcounted args in BPF_FUNC_dynptr_data");
> > > >                                       return -EFAULT;
> > > >                               }
> > > > -                             /* Find the id of the dynptr we're tracking the reference of */
> > > > -                             meta->ref_obj_id = stack_slot_get_id(env, reg);
> > > > +                             /* Find the id and the type of the dynptr we're tracking
> > > > +                              * the reference of.
> > > > +                              */
> > > > +                             stack_slot_get_dynptr_info(env, reg, meta);
> > > >                       }
> > > >               }
> > > >               break;
> > > > @@ -7406,7 +7416,11 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
> > > >               regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag;
> > > >       } else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) {
> > > >               mark_reg_known_zero(env, regs, BPF_REG_0);
> > > > -             regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
> > > > +             if (func_id == BPF_FUNC_dynptr_data &&
> > > > +                 meta.type == BPF_DYNPTR_TYPE_SKB)
> > > > +                     regs[BPF_REG_0].type = PTR_TO_PACKET | ret_flag;
> > > > +             else
> > > > +                     regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
> > > >               regs[BPF_REG_0].mem_size = meta.mem_size;
> > > check_packet_access() uses range.
> > > It took me a while to figure range and mem_size is in union.
> > > Mentioning here in case someone has similar question.
> > For v2, I'll add this as a comment in the code or I'll include
> > "regs[BPF_REG_0].range = meta.mem_size" explicitly to make it more
> > obvious :)
> 'regs[BPF_REG_0].range = meta.mem_size' would be great.  No strong
> opinion here.
>
> > >
> > > >       } else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) {
> > > >               const struct btf_type *t;
> > > > @@ -14132,6 +14146,25 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
> > > >                       goto patch_call_imm;
> > > >               }
> > > >
> > > > +             if (insn->imm == BPF_FUNC_dynptr_from_skb) {
> > > > +                     if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE))
> > > > +                             insn_buf[0] = BPF_MOV32_IMM(BPF_REG_4, true);
> > > > +                     else
> > > > +                             insn_buf[0] = BPF_MOV32_IMM(BPF_REG_4, false);
> > > > +                     insn_buf[1] = *insn;
> > > > +                     cnt = 2;
> > > > +
> > > > +                     new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
> > > > +                     if (!new_prog)
> > > > +                             return -ENOMEM;
> > > > +
> > > > +                     delta += cnt - 1;
> > > > +                     env->prog = new_prog;
> > > > +                     prog = new_prog;
> > > > +                     insn = new_prog->insnsi + i + delta;
> > > > +                     goto patch_call_imm;
> > > > +             }
> > > Have you considered rejecting bpf_dynptr_write()
> > > at prog load time?
> > It's possible to reject bpf_dynptr_write() at prog load time but would
> > require adding tracking in the verifier for whether a dynptr is
> > read-only or not. Do you think it's better to reject it at load time
> > instead of returning NULL at runtime?
> The check_helper_call above seems to know 'meta.type == BPF_DYNPTR_TYPE_SKB'.
> Together with may_access_direct_pkt_data(), would it be enough?
> Then no need to do patching for BPF_FUNC_dynptr_from_skb here.

Thinking about this some more, I think BPF_FUNC_dynptr_from_skb needs
to be patched regardless in order to set the rd-only flag in the
metadata for the dynptr. There will be other helper functions that
write into dynptrs (eg memcpy with dynptrs, strncpy with dynptrs,
probe read user with dynptrs, ...) so I think it's more scalable if we
reject these writes at runtime through the rd-only flag in the
metadata, than for the verifier to custom-case that any helper funcs
that write into dynptrs will need to get dynptr type + do
may_access_direct_pkt_data() if it's type skb or xdp. The
inconsistency between not rd-only in metadata vs. rd-only in verifier
might be a little confusing as well.

For these reasons, I'm leaning more towards having bpf_dynptr_write()
and other dynptr write helper funcs be rejected at runtime instead of
prog load time, but I'm eager to hear what you prefer.

What are your thoughts?
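
For reference, a rough sketch of how the patched-in fourth argument
could set that flag (the net/core/filter.c hunk is not quoted in this
excerpt, so the is_rdonly argument name below is illustrative):

    BPF_CALL_4(bpf_dynptr_from_skb, struct sk_buff *, skb, u64, flags,
               struct bpf_dynptr_kern *, ptr, u32, is_rdonly)
    {
        if (flags) {
            bpf_dynptr_set_null(ptr);
            return -EINVAL;
        }

        bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB, 0, skb->len);

        /* recorded in the dynptr metadata; bpf_dynptr_write() and
         * bpf_dynptr_data() can then check it at runtime
         */
        if (is_rdonly)
            bpf_dynptr_set_rdonly(ptr);

        return 0;
    }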

>
> Since we are on bpf_dynptr_write, what is the reason
> for limiting it to skb_headlen()?  Not implying one
> way is better than another.  Would like to understand the reason
> behind it since it is not clear in the commit message.
Andrii Nakryiko Aug. 3, 2022, 8:36 p.m. UTC | #26
On Wed, Aug 3, 2022 at 1:29 PM Joanne Koong <joannelkoong@gmail.com> wrote:
>
> [ ... ]
>
> > > > Have you considered rejecting bpf_dynptr_write()
> > > > at prog load time?
> > > It's possible to reject bpf_dynptr_write() at prog load time but would
> > > require adding tracking in the verifier for whether a dynptr is
> > > read-only or not. Do you think it's better to reject it at load time
> > > instead of returning NULL at runtime?
> > The check_helper_call above seems to know 'meta.type == BPF_DYNPTR_TYPE_SKB'.
> > Together with may_access_direct_pkt_data(), would it be enough?
> > Then no need to do patching for BPF_FUNC_dynptr_from_skb here.
>
> Thinking about this some more, I think BPF_FUNC_dynptr_from_skb needs
> to be patched regardless in order to set the rd-only flag in the
> metadata for the dynptr. There will be other helper functions that
> write into dynptrs (eg memcpy with dynptrs, strncpy with dynptrs,
> probe read user with dynptrs, ...) so I think it's more scalable if we
> reject these writes at runtime through the rd-only flag in the
> metadata, than for the verifier to custom-case that any helper funcs
> that write into dynptrs will need to get dynptr type + do
> may_access_direct_pkt_data() if it's type skb or xdp. The
> inconsistency between not rd-only in metadata vs. rd-only in verifier
> might be a little confusing as well.
>
> For these reasons, I'm leaning more towards having bpf_dynptr_write()
> and other dynptr write helper funcs be rejected at runtime instead of
> prog load time, but I'm eager to hear what you prefer.
>

+1, that's sort of the point of dynptr: to move more checks into
runtime and leave the BPF verifier simpler

> What are your thoughts?
>
> >
> > Since we are on bpf_dynptr_write, what is the reason
> > for limiting it to skb_headlen()?  Not implying one
> > way is better than another.  Would like to understand the reason
> > behind it since it is not clear in the commit message.
Martin KaFai Lau Aug. 3, 2022, 8:56 p.m. UTC | #27
On Wed, Aug 03, 2022 at 01:29:37PM -0700, Joanne Koong wrote:
> [ ... ]
>
> > > > Have you considered rejecting bpf_dynptr_write()
> > > > at prog load time?
> > > It's possible to reject bpf_dynptr_write() at prog load time but would
> > > require adding tracking in the verifier for whether a dynptr is
> > > read-only or not. Do you think it's better to reject it at load time
> > > instead of returning NULL at runtime?
> > The check_helper_call above seems to know 'meta.type == BPF_DYNPTR_TYPE_SKB'.
> > Together with may_access_direct_pkt_data(), would it be enough?
> > Then no need to do patching for BPF_FUNC_dynptr_from_skb here.
> 
> Thinking about this some more, I think BPF_FUNC_dynptr_from_skb needs
> to be patched regardless in order to set the rd-only flag in the
> metadata for the dynptr. There will be other helper functions that
> write into dynptrs (eg memcpy with dynptrs, strncpy with dynptrs,
> probe read user with dynptrs, ...) so I think it's more scalable if we
> reject these writes at runtime through the rd-only flag in the
> metadata, than for the verifier to custom-case that any helper funcs
> that write into dynptrs will need to get dynptr type + do
> may_access_direct_pkt_data() if it's type skb or xdp. The
> inconsistency between not rd-only in metadata vs. rd-only in verifier
> might be a little confusing as well.
> 
> For these reasons, I'm leaning more towards having bpf_dynptr_write()
> and other dynptr write helper funcs be rejected at runtime instead of
> prog load time, but I'm eager to hear what you prefer.
> 
> What are your thoughts?
Sure, as long as bpf_dynptr_data() is not restricted by the rdonly dynptr.
Jakub Kicinski Aug. 3, 2022, 11:25 p.m. UTC | #28
On Wed, 3 Aug 2022 13:29:37 -0700 Joanne Koong wrote:
> Thinking about this some more, I think BPF_FUNC_dynptr_from_skb needs
> to be patched regardless in order to set the rd-only flag in the
> metadata for the dynptr. There will be other helper functions that
> write into dynptrs (eg memcpy with dynptrs, strncpy with dynptrs,
> probe read user with dynptrs, ...) so I think it's more scalable if we
> reject these writes at runtime through the rd-only flag in the
> metadata, than for the verifier to custom-case that any helper funcs
> that write into dynptrs will need to get dynptr type + do
> may_access_direct_pkt_data() if it's type skb or xdp. The
> inconsistency between not rd-only in metadata vs. rd-only in verifier
> might be a little confusing as well.
> 
> For these reasons, I'm leaning more towards having bpf_dynptr_write()
> and other dynptr write helper funcs be rejected at runtime instead of
> prog load time, but I'm eager to hear what you prefer.
> 
> What are your thoughts?

Oh. I thought dynptrs are an extension of the discussion we had about
creating a skb_header_pointer()-like abstraction but it sounds like 
we veered quite far off that track at some point :(

The point of skb_header_pointer() is to expose the chunk of the packet
pointed to by [skb, offset, len] as a linear buffer. Potentially copying
it out to a stack buffer *IFF* the header is not contiguous inside the
skb head, which should very rarely happen.

Here it seems we return an error so that user must pull if the data is
not linear, which is defeating the purpose. The user of
skb_header_pointer() wants to avoid the copy while _reliably_ getting 
a contiguous pointer. Plus pulling in the header may be far more
expensive than a small copy to the stack.

The pointer returned by skb_header_pointer is writable, but it's not
guaranteed that the writes go to the packet, they may go to the
on-stack buffer, so the caller must do some sort of:

	if (data_ptr == stack_buf)
		skb_store_bits(...);

Which we were thinking of wrapping in some sort of flush operation.

If I'm reading this right dynptr as implemented here do not provide
such semantics, am I confused in thinking that this is a continuation
of the XDP multi-buff discussion? Is it a completely separate thing
and we'll still need a header_pointer like helper?
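
For readers following along, the classic in-kernel usage being
described looks roughly like this (a sketch; offset is illustrative):

    struct tcphdr _th;
    const struct tcphdr *th;

    /* returns a pointer into the skb head if the bytes are contiguous
     * there, otherwise copies them into _th and returns &_th; returns
     * NULL if [offset, offset + len) is out of bounds
     */
    th = skb_header_pointer(skb, offset, sizeof(_th), &_th);
    if (!th)
        return;

    /* read th->source, th->dest, ...; a write would need the
     * "did it land in the stack buffer" check described above */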
Joanne Koong Aug. 4, 2022, 1:05 a.m. UTC | #29
On Wed, Aug 3, 2022 at 4:25 PM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Wed, 3 Aug 2022 13:29:37 -0700 Joanne Koong wrote:
> > Thinking about this some more, I think BPF_FUNC_dynptr_from_skb needs
> > to be patched regardless in order to set the rd-only flag in the
> > metadata for the dynptr. There will be other helper functions that
> > write into dynptrs (eg memcpy with dynptrs, strncpy with dynptrs,
> > probe read user with dynptrs, ...) so I think it's more scalable if we
> > reject these writes at runtime through the rd-only flag in the
> > metadata, than for the verifier to custom-case that any helper funcs
> > that write into dynptrs will need to get dynptr type + do
> > may_access_direct_pkt_data() if it's type skb or xdp. The
> > inconsistency between not rd-only in metadata vs. rd-only in verifier
> > might be a little confusing as well.
> >
> > For these reasons, I'm leaning more towards having bpf_dynptr_write()
> > and other dynptr write helper funcs be rejected at runtime instead of
> > prog load time, but I'm eager to hear what you prefer.
> >
> > What are your thoughts?
>
> Oh. I thought dynptrs are an extension of the discussion we had about
> creating a skb_header_pointer()-like abstraction but it sounds like
> we veered quite far off that track at some point :(

I think the problem is that the skb may be cloned, so a write into any
portion of the paged data requires a pull. If it weren't for this,
then we could do the write and checksumming without pulling (eg kmap
the page, get the csum_partial of the bytes you'll write over, do the
write, get the csum_partial of the bytes you just wrote, then kunmap,
then update skb->csum to be skb->csum - csum of the bytes you wrote
over + csum of the bytes you wrote). I think we would even be able to
provide a direct data slice to non-contiguous pages without needing
the additional copy to a stack buffer (eg kmap the non-contiguous
pages to a contiguous virtual address that we pass back to the bpf
program, and then when the bpf program is finished do the cleanup for
the mappings).
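
A sketch of that incremental update, assuming an uncloned skb with
CHECKSUM_COMPLETE and a write confined to a single frag (frag,
frag_off, src and len are illustrative):

    void *vaddr = kmap_local_page(skb_frag_page(frag));
    __wsum old_csum, new_csum;

    old_csum = csum_partial(vaddr + frag_off, len, 0);
    memcpy(vaddr + frag_off, src, len);
    new_csum = csum_partial(vaddr + frag_off, len, 0);
    kunmap_local(vaddr);

    /* skb->csum - csum(bytes written over) + csum(bytes written) */
    skb->csum = csum_add(csum_sub(skb->csum, old_csum), new_csum);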

Three ideas I'm thinking through as possible solutions:
1) Enforce that the skb is always uncloned for skb-type bpf progs (we
currently do this just for the skb head, see bpf_unclone_prologue()),
but I'm not sure if the trade-off (pulling all the packet data, even
if it won't be used) is acceptable.

2) Don't support cloned skbs for bpf_dynptr_write/data and don't do
any pulling. If the prog wants to use bpf_dynptr_write/data, then they
have to pull it first

3) (uglier than #1 and #2) For bpf_dynptr_write()s, pull if the write
is to a paged area and the skb is cloned, otherwise write to the paged
area without pulling; if we do this, then we always have to invalidate
all data slices associated with the skb (even for writes to the head)
since at prog load time, the verifier doesn't know if the pull happens
or not. For bpf_dynptr_data()s, follow the same policy.

I'm leaning towards 2. What are your thoughts?
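
(A sketch of option 2's check in bpf_dynptr_write(), purely
illustrative:)

    if (type == BPF_DYNPTR_TYPE_SKB) {
        struct sk_buff *skb = dst->data;

        /* no pulling here: refuse cloned skbs and writes that reach
         * into the frags; the prog must bpf_skb_pull_data() first
         */
        if (skb_cloned(skb) ||
            dst->offset + offset + len > skb_headlen(skb))
            return -EAGAIN;
        ...
    }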

>
> The point of skb_header_pointer() is to expose the chunk of the packet
> pointed to by [skb, offset, len] as a linear buffer. Potentially copying
> it out to a stack buffer *IFF* the header is not contiguous inside the
> skb head, which should very rarely happen.
>
> Here it seems we return an error so that user must pull if the data is
> not linear, which is defeating the purpose. The user of
> skb_header_pointer() wants to avoid the copy while _reliably_ getting
> a contiguous pointer. Plus pulling in the header may be far more
> expensive than a small copy to the stack.
>
> The pointer returned by skb_header_pointer is writable, but it's not
> guaranteed that the writes go to the packet, they may go to the
> on-stack buffer, so the caller must do some sort of:
>
>         if (data_ptr == stack_buf)
>                 skb_store_bits(...);
>
> Which we were thinking of wrapping in some sort of flush operation.
>
> If I'm reading this right dynptr as implemented here do not provide
> such semantics, am I confused in thinking that this is a continuation
> of the XDP multi-buff discussion? Is it a completely separate thing
> and we'll still need a header_pointer like helper?
Martin KaFai Lau Aug. 4, 2022, 1:27 a.m. UTC | #30
On Wed, Aug 03, 2022 at 04:25:40PM -0700, Jakub Kicinski wrote:
> The point of skb_header_pointer() is to expose the chunk of the packet
> pointed to by [skb, offset, len] as a linear buffer. Potentially copying
> it out to a stack buffer *IFF* the header is not contiguous inside the
> skb head, which should very rarely happen.
> 
> Here it seems we return an error so that user must pull if the data is
> not linear, which is defeating the purpose. The user of
> skb_header_pointer() wants to avoid the copy while _reliably_ getting 
> a contiguous pointer. Plus pulling in the header may be far more
> expensive than a small copy to the stack.
> 
> The pointer returned by skb_header_pointer is writable, but it's not
> guaranteed that the writes go to the packet, they may go to the
> on-stack buffer, so the caller must do some sort of:
> 
> 	if (data_ptr == stack_buf)
> 		skb_store_bits(...);
> 
> Which we were thinking of wrapping in some sort of flush operation.
Curious about the idea.  Don't know whether this is a dynptr helper or
should be a specific pkt helper though.

The idea is to have the prog keep writing to a ptr (skb->data or stack_buf).
When the prog is done, call a bpf helper to flush.  The helper
decides if it needs to flush from stack_buf to skb and
will take care of the cloned skb?

> 
> If I'm reading this right dynptr as implemented here do not provide
> such semantics, am I confused in thinking that this is a continuation
> of the XDP multi-buff discussion? Is it a completely separate thing
> and we'll still need a header_pointer like helper?
Can you share a pointer to the XDP multi-buff discussion?
Jakub Kicinski Aug. 4, 2022, 1:34 a.m. UTC | #31
On Wed, 3 Aug 2022 18:05:44 -0700 Joanne Koong wrote:
> I think the problem is that the skb may be cloned, so a write into any
> portion of the paged data requires a pull. If it weren't for this,
> then we could do the write and checksumming without pulling (eg kmap
> the page, get the csum_partial of the bytes you'll write over, do the
> write, get the csum_partial of the bytes you just wrote, then kunmap,
> then update skb->csum to be skb->csum - csum of the bytes you wrote
> over + csum of the bytes you wrote). I think we would even be able to
> provide a direct data slice to non-contiguous pages without needing
> the additional copy to a stack buffer (eg kmap the non-contiguous
> pages to a contiguous virtual address that we pass back to the bpf
> program, and then when the bpf program is finished do the cleanup for
> the mappings).

The whole read/write/data concept is not a great match for packet
parsing. Primary use for packet parsing is that you want to read
a header and not have to deal with frags or pulling. In that case
you should get a direct pointer or a copy on the stack, transparently.

Maybe before I go on talking nonsense I should read up on what dynptr
is and what it's trying to achieve. Stan says it's like unique_ptr in
C++, which tells me... nothing :)

$ git grep dynptr -- Documentation/
$

Any pointers?

> Three ideas I'm thinking through as possible solutions:
> 1) Enforce that the skb is always uncloned for skb-type bpf progs (we
> currently do this just for the skb head, see bpf_unclone_prologue()),
> but I'm not sure if the trade-off (pulling all the packet data, even
> if it won't be used) is acceptable.
> 
> 2) Don't support cloned skbs for bpf_dynptr_write/data and don't do
> any pulling. If the prog wants to use bpf_dynptr_write/data, then they
> have to pull it first

I think all output skbs from TCP are cloned, so that's not gonna work.

> 3) (uglier than #1 and #2) For bpf_dynptr_write()s, pull if the write
> is to a paged area and the skb is cloned, otherwise write to the paged
> area without pulling; if we do this, then we always have to invalidate
> all data slices associated with the skb (even for writes to the head)
> since at prog load time, the verifier doesn't know if the pull happens
> or not. For bpf_dynptr_data()s, follow the same policy.
> 
> I'm leaning towards 2. What are your thoughts?
Jakub Kicinski Aug. 4, 2022, 1:44 a.m. UTC | #32
On Wed, 3 Aug 2022 18:27:56 -0700 Martin KaFai Lau wrote:
> On Wed, Aug 03, 2022 at 04:25:40PM -0700, Jakub Kicinski wrote:
> > The point of skb_header_pointer() is to expose the chunk of the packet
> > pointed to by [skb, offset, len] as a linear buffer. Potentially copying
> > it out to a stack buffer *IFF* the header is not contiguous inside the
> > skb head, which should very rarely happen.
> > 
> > Here it seems we return an error so that user must pull if the data is
> > not linear, which is defeating the purpose. The user of
> > skb_header_pointer() wants to avoid the copy while _reliably_ getting 
> > a contiguous pointer. Plus pulling in the header may be far more
> > expensive than a small copy to the stack.
> > 
> > The pointer returned by skb_header_pointer is writable, but it's not
> > guaranteed that the writes go to the packet, they may go to the
> > on-stack buffer, so the caller must do some sort of:
> > 
> > 	if (data_ptr == stack_buf)
> > 		skb_store_bits(...);
> > 
> > Which we were thinking of wrapping in some sort of flush operation.  
> Curious about the idea.  Don't know whether this is a dynptr helper or
> should be a specific pkt helper though.

Yeah, I could well have pattern-matched the dynptr because it sounded
similar, but it's a completely different beast.

> The idea is to have the prog keep writing to a ptr (skb->data or stack_buf).

To be clear, writing is a lot rarer than reading in this case.

> When the prog is done, call a bpf helper to flush.  The helper
> decides if it needs to flush from stack_buf to skb and
> will take care of the cloned skb?

Yeah, I'd think for skb it'd just pull. Normally dealing with skbs
you'd indeed probably just pull upfront if you knew you were gonna write,
saving yourself the unnecessary trip thru the stack. But XDP
does not have strong pulling support, so if the interface must support
both then it's the lowest common denominator.
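
For reference, the pull-upfront pattern with the existing helper looks
like this (a sketch; off/len are illustrative):

    /* make the first off + len bytes linear and writable; skb->data
     * may move, so any packet pointers must be re-derived afterwards
     */
    if (bpf_skb_pull_data(skb, off + len))
        return TC_ACT_SHOT;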

> > If I'm reading this right dynptr as implemented here do not provide
> > such semantics, am I confused in thinking that this is a continuation
> > of the XDP multi-buff discussion? Is it a completely separate thing
> > and we'll still need a header_pointer like helper?  
> Can you share a pointer to the XDP multi-buff discussion?

https://lore.kernel.org/all/20210916095539.4696ae27@kicinski-fedora-pc1c0hjn.dhcp.thefacebook.com/
Joanne Koong Aug. 4, 2022, 3:44 a.m. UTC | #33
On Wed, Aug 3, 2022 at 6:34 PM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Wed, 3 Aug 2022 18:05:44 -0700 Joanne Koong wrote:
> > I think the problem is that the skb may be cloned, so a write into any
> > portion of the paged data requires a pull. If it weren't for this,
> > then we could do the write and checksumming without pulling (eg kmap
> > the page, get the csum_partial of the bytes you'll write over, do the
> > write, get the csum_partial of the bytes you just wrote, then kunmap,
> > then update skb->csum to be skb->csum - csum of the bytes you wrote
> > over + csum of the bytes you wrote). I think we would even be able to
> > provide a direct data slice to non-contiguous pages without needing
> > the additional copy to a stack buffer (eg kmap the non-contiguous
> > pages to a contiguous virtual address that we pass back to the bpf
> > program, and then when the bpf program is finished do the cleanup for
> > the mappings).
>
> The whole read/write/data concept is not a great match for packet
> parsing. Primary use for packet parsing is that you want to read
> a header and not have to deal with frags or pulling. In that case
> you should get a direct pointer or a copy on the stack, transparently.

The selftests [0] include some examples of packet parsing using
dynptrs. You're able to get a pointer to the headers directly (if
they're in the head), or you can use bpf_dynptr_read() to read the data
from the frag into a buffer (without needing to pull; bpf_dynptr_read()
essentially just calls bpf_skb_load_bytes()).

Does this address the use cases you have in mind?

I think the pull and unclone stuff only pertains to the cases for
writes into the frags.

[0] https://lore.kernel.org/bpf/20220726184706.954822-4-joannelkoong@gmail.com/

>
> Maybe before I go on talking nonsense I should read up on what dynptr
> is and what it's trying to achieve. Stan says its like unique_ptr in
> C++ which tells me.. nothing :)
>
> $ git grep dynptr -- Documentation/
> $
>
> Any pointers?

Ooh thanks for the reminder, adding a page for the dynptr
documentation is on my to-do list

A dynptr (also known as a fat pointer in other languages) is a pointer
that stores extra metadata alongside the address it points to. In
particular, the metadata the bpf dynptr stores includes the length of
data it points to (so in the case of skb/xdp, the size of the packet),
the type of dynptr, and properties like whether it's read-only.
Dynptrs are an interface for bpf programs to access memory
dynamically; the helper calls enforce that the accesses are safe.
For example, bpf_dynptr_read() allows reads at offsets and lengths not
statically known at compile time (in bpf_dynptr_read, the kernel uses
the metadata to check that the offset and length doesn't violate
memory bounds); without dynptrs, the verifier can't guarantee that the
offset or length of the read is safe since those values aren't
statically known at compile time, so for example you can't directly
read a dynamic number of bytes from a packet.

With regards to skb + xdp, the main use cases of dynptrs are to 1)
allow dynamic-size accesses of packet data and 2) allow easier and
simpler packet parsing (for example, accessing skb->data directly
requires multiple if checks for ensuring it's within bounds of
skb->data_end in order to satisfy the verifier; with the dynptr
interface, you are able to get a direct data slice and access it
without needing those checks. The 3rd selftests patch has some
demonstrations of this).
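
A short sketch of that interface, using the helper signatures proposed
in this series (the trailing-bytes read is illustrative):

    SEC("tc")
    int parse(struct __sk_buff *skb)
    {
        struct bpf_dynptr ptr;
        struct ethhdr *eth;
        __u8 tail[4];

        if (bpf_dynptr_from_skb(skb, 0, &ptr))
            return TC_ACT_SHOT;

        /* direct slice, no manual data/data_end bounds checks */
        eth = bpf_dynptr_data(&ptr, 0, sizeof(*eth));
        if (!eth)
            return TC_ACT_SHOT;

        /* the offset is a runtime value; the helper bounds-checks it */
        if (bpf_dynptr_read(tail, sizeof(tail), &ptr,
                            skb->len - sizeof(tail), 0))
            return TC_ACT_SHOT;

        return TC_ACT_OK;
    }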

>
> > Three ideas I'm thinking through as possible solutions:
> > 1) Enforce that the skb is always uncloned for skb-type bpf progs (we
> > currently do this just for the skb head, see bpf_unclone_prologue()),
> > but I'm not sure if the trade-off (pulling all the packet data, even
> > if it won't be used) is acceptable.
> >
> > 2) Don't support cloned skbs for bpf_dynptr_write/data and don't do
> > any pulling. If the prog wants to use bpf_dynptr_write/data, then they
> > have to pull it first
>
> I think all output skbs from TCP are cloned, so that's not gonna work.
>
> > 3) (uglier than #1 and #2) For bpf_dynptr_write()s, pull if the write
> > is to a paged area and the skb is cloned, otherwise write to the paged
> > area without pulling; if we do this, then we always have to invalidate
> > all data slices associated with the skb (even for writes to the head)
> > since at prog load time, the verifier doesn't know if the pull happens
> > or not. For bpf_dynptr_data()s, follow the same policy.
> >
> > I'm leaning towards 2. What are your thoughts?
Joanne Koong Aug. 4, 2022, 9:55 p.m. UTC | #34
On Mon, Aug 1, 2022 at 7:12 PM Joanne Koong <joannelkoong@gmail.com> wrote:
>
> On Mon, Aug 1, 2022 at 4:33 PM Jakub Kicinski <kuba@kernel.org> wrote:
> >
> > (consider cross-posting network-related stuff to netdev@)
>
> Great, I will start cc-ing netdev@
>
> >
> > On Tue, 26 Jul 2022 11:47:04 -0700 Joanne Koong wrote:
> > > Add skb dynptrs, which are dynptrs whose underlying pointer points
> > > to a skb. The dynptr acts on skb data. skb dynptrs have two main
> > > benefits. One is that they allow operations on sizes that are not
> > > statically known at compile-time (eg variable-sized accesses).
> > > Another is that parsing the packet data through dynptrs (instead of
> > > through direct access of skb->data and skb->data_end) can be more
> > > ergonomic and less brittle (eg does not need manual if checking for
> > > being within bounds of data_end).
> >
> > Is there really a need for dynptr_from_{skb,xdp} to be different
> > function IDs? I was hoping this work would improve portability of
> > networking BPF programs across the hooks.
>
> Awesome, I like this idea of having just one generic API named
> something like bpf_dynptr_from_packet. I'll add this for v2!
>

Thinking about this some more, I don't think we get a lot of benefits
from combining it into one API (bpf_dynptr_from_packet) instead of 2
separate APIs (bpf_dynptr_from_skb / bpf_dynptr_from_xdp). The
bpf_dynptr_write behavior will be inconsistent (eg bpf_dynptr_write
into xdp frags will work whereas bpf_dynptr_write into skb frags will
fail). Martin also pointed out that he'd prefer bpf_dynptr_write() to
succeed for writing into frags and invalidate data slices (instead of
failing the write and always keeping data slices valid), which we
can't do if we combine xdp + skb, without always (needlessly)
invalidating xdp data slices whenever there's a write. Additionally,
in the verifier, there's no organic mapping between prog type -> prog
ctx, so we'll have to hardcode some mapping between prog type -> skb
vs. xdp ctx. I think for these reasons it makes more sense to have 2
separate APIs, instead of having 1 API that both hooks can call.

> >
> > > For bpf prog types that don't support writes on skb data, the dynptr is
> > > read-only (writes and data slices are not permitted). For reads on the
> > > dynptr, this includes reading into data in the non-linear paged buffers
> > > but for writes and data slices, if the data is in a paged buffer, the
> > > user must first call bpf_skb_pull_data to pull the data into the linear
> > > portion.
> > >
> > > Additionally, any helper calls that change the underlying packet buffer
> > > (eg bpf_skb_pull_data) invalidates any data slices of the associated
> > > dynptr.
> >
> > Grepping the verifier did not help me find that, would you mind
> > pointing me to the code?
>
> The base reg type of a skb data slice will be PTR_TO_PACKET - this
> gets set in this patch in check_helper_call() in verifier.c:
>
> + if (func_id == BPF_FUNC_dynptr_data &&
> +    meta.type == BPF_DYNPTR_TYPE_SKB)
> + regs[BPF_REG_0].type = PTR_TO_PACKET | ret_flag;
>
> Anytime there is a helper call that changes the underlying packet
> buffer [0], the verifier iterates through the registers and marks all
> PTR_TO_PACKET reg types as unknown, which invalidates them. The dynptr
> data slice will be invalidated since its base reg type is
> PTR_TO_PACKET
>
> The stack trace is:
>    check_helper_call() -> clear_all_pkt_pointers() ->
> __clear_all_pkt_pointers() -> mark_reg_unknown()
>
>
> I will add this explanation to the commit message for v2 since it is non-obvious
>
>
> [0] https://elixir.bootlin.com/linux/latest/source/kernel/bpf/verifier.c#L7143
>
> [1] https://elixir.bootlin.com/linux/latest/source/kernel/bpf/verifier.c#L6489
>
>
> >
> > > Right now, skb dynptrs can only be constructed from skbs that are
> > > the bpf program context - as such, there does not need to be any
> > > reference tracking or release on skb dynptrs.
Kumar Kartikeya Dwivedi Aug. 4, 2022, 10:58 p.m. UTC | #35
On Thu, 4 Aug 2022 at 01:28, Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Wed, 3 Aug 2022 13:29:37 -0700 Joanne Koong wrote:
> > [ ... ]
>
> Oh. I thought dynptrs are an extension of the discussion we had about
> creating a skb_header_pointer()-like abstraction but it sounds like
> we veered quite far off that track at some point :(
>
> The point of skb_header_pointer() is to expose the chunk of the packet
> pointed to by [skb, offset, len] as a linear buffer. Potentially copying
> it out to a stack buffer *IFF* the header is not contiguous inside the
> skb head, which should very rarely happen.
>
> Here it seems we return an error so that user must pull if the data is
> not linear, which is defeating the purpose. The user of
> skb_header_pointer() wants to avoid the copy while _reliably_ getting
> a contiguous pointer. Plus pulling in the header may be far more
> expensive than a small copy to the stack.
>
> The pointer returned by skb_header_pointer is writable, but it's not
> guaranteed that the writes go to the packet, they may go to the
> on-stack buffer, so the caller must do some sort of:
>
>         if (data_ptr == stack_buf)
>                 skb_store_bits(...);
>
> Which we were thinking of wrapping in some sort of flush operation.
>
> If I'm reading this right dynptr as implemented here do not provide
> such semantics, am I confused in thinking that this is a continuation
> of the XDP multi-buff discussion? Is it a completely separate thing
> and we'll still need a header_pointer like helper?

When I worked on [0], I actually did it a bit like you described in
the original discussion under the xdp multi-buff thread, but I left
the other case (where data to be read resides across frag boundaries)
up to the user to handle, instead of automatically passing in a pointer
to the stack and doing the copy for them, so in my case
xdp_load_bytes/xdp_store_bytes is the fallback if you can't get a
bpf_packet_pointer for a ctx, offset, len which you can directly
access. But this was only for XDP, not for skb.

The advantage with a dynptr is that the len for the slice from a
bpf_packet_pointer-style helper doesn't have to be a constant; it can
be a runtime value, and since it is checked at runtime anyway, the
helper's code is the same but access can be done safely for slices
whose length is unknown to the verifier. The dynptr is very useful as
the return value of such a helper.

The suggested usage was like this:

    int err = 0;
    char buf[N];

    off &= 0xffff;
    ptr = bpf_packet_pointer(ctx, off, sizeof(buf), &err);
    if (unlikely(!ptr)) {
        if (err < 0)
            return XDP_ABORTED;
        err = bpf_xdp_load_bytes(ctx, off, buf, sizeof(buf));
        if (err < 0)
            return XDP_ABORTED;
        ptr = buf;
    }
    ...
    // Do some stores and loads in [ptr, ptr + N) region
    ...
    if (unlikely(ptr == buf)) {
        err = bpf_xdp_store_bytes(ctx, off, buf, sizeof(buf));
        if (err < 0)
            return XDP_ABORTED;
    }

So the idea was the same, there is still a "flush" (in that unlikely
branch), but it is done explicitly by the user (which I found less
confusing than it being done automagically or a by a new flush helper
which will do the same thing we do here, but YMMV).

[0]: https://lore.kernel.org/bpf/20220306234311.452206-1-memxor@gmail.com
Jakub Kicinski Aug. 5, 2022, 11:22 p.m. UTC | #36
On Thu, 4 Aug 2022 14:55:14 -0700 Joanne Koong wrote:
> Thinking about this some more, I don't think we get a lot of benefits
> from combining it into one API (bpf_dynptr_from_packet) instead of 2
> separate APIs (bpf_dynptr_from_skb / bpf_dynptr_from_xdp).

Ease of use is not a big benefit? We'll continue to have people trying
to run XDP generic in prod because they wrote their program for XDP,
not tc :(

> The bpf_dynptr_write behavior will be inconsistent (eg bpf_dynptr_write
> into xdp frags will work whereas bpf_dynptr_write into skb frags will
> fail). Martin also pointed out that he'd prefer bpf_dynptr_write() to
> succeed for writing into frags and invalidate data slices (instead of
> failing the write and always keeping data slices valid), which we
> can't do if we combine xdp + skb, without always (needlessly)
> invalidating xdp data slices whenever there's a write. Additionally,
> in the verifier, there's no organic mapping between prog type -> prog
> ctx, so we'll have to hardcode some mapping between prog type -> skb
> vs. xdp ctx. I think for these reasons it makes more sense to have 2
> separate APIs, instead of having 1 API that both hooks can call.

Feels like pushing complexity onto users because the infra is not in
place. One day the infra will be in place, yet the uAPI will remain
forever. But enough emails exchanged; take your pick and time will tell :)
Jakub Kicinski Aug. 5, 2022, 11:25 p.m. UTC | #37
On Fri, 5 Aug 2022 00:58:01 +0200 Kumar Kartikeya Dwivedi wrote:
> When I worked on [0], I actually did it a bit like you described in
> the original discussion under the xdp multi-buff thread, but I left
> the other case (where data to be read resides across frag boundaries)
> up to the user to handle, instead of automatically passing in a pointer
> to the stack and doing the copy for them, so in my case
> xdp_load_bytes/xdp_store_bytes is the fallback if you can't get a
> bpf_packet_pointer for a ctx, offset, len which you can directly
> access. But this was only for XDP, not for skb.
> 
> The advantage with a dynptr is that the len for the slice from a
> bpf_packet_pointer-style helper doesn't have to be a constant; it can
> be a runtime value, and since it is checked at runtime anyway, the
> helper's code is the same but access can be done safely for slices
> whose length is unknown to the verifier. The dynptr is very useful as
> the return value of such a helper.

I see.

> The suggested usage was like this:
> 
> [ ... ]
> 
> So the idea was the same, there is still a "flush" (in that unlikely
> branch), but it is done explicitly by the user (which I found less
> confusing than it being done automagically or a by a new flush helper
> which will do the same thing we do here, but YMMV).

Ack, the flush is awkward to create an API for. I presume that's 
the main reason the xdp mbuf discussion wasn't fruitful.
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 20c26aed7896..7fbd4324c848 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -407,11 +407,14 @@  enum bpf_type_flag {
 	/* Size is known at compile time. */
 	MEM_FIXED_SIZE		= BIT(10 + BPF_BASE_TYPE_BITS),
 
+	/* DYNPTR points to sk_buff */
+	DYNPTR_TYPE_SKB		= BIT(11 + BPF_BASE_TYPE_BITS),
+
 	__BPF_TYPE_FLAG_MAX,
 	__BPF_TYPE_LAST_FLAG	= __BPF_TYPE_FLAG_MAX - 1,
 };
 
-#define DYNPTR_TYPE_FLAG_MASK	(DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF)
+#define DYNPTR_TYPE_FLAG_MASK	(DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB)
 
 /* Max number of base types. */
 #define BPF_BASE_TYPE_LIMIT	(1UL << BPF_BASE_TYPE_BITS)
@@ -2556,12 +2559,15 @@  enum bpf_dynptr_type {
 	BPF_DYNPTR_TYPE_LOCAL,
 	/* Underlying data is a ringbuf record */
 	BPF_DYNPTR_TYPE_RINGBUF,
+	/* Underlying data is a sk_buff */
+	BPF_DYNPTR_TYPE_SKB,
 };
 
 void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
 		     enum bpf_dynptr_type type, u32 offset, u32 size);
 void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr);
 int bpf_dynptr_check_size(u32 size);
+void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr);
 
 #ifdef CONFIG_BPF_LSM
 void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index a5f21dc3c432..649063d9cbfd 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1532,4 +1532,8 @@  static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifind
 	return XDP_REDIRECT;
 }
 
+int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len);
+int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from,
+			  u32 len, u64 flags);
+
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 59a217ca2dfd..0730cd198a7f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5241,11 +5241,22 @@  union bpf_attr {
  *	Description
  *		Write *len* bytes from *src* into *dst*, starting from *offset*
  *		into *dst*.
- *		*flags* is currently unused.
+ *
+ *		*flags* must be 0 except for skb-type dynptrs.
+ *
+ *		For skb-type dynptrs:
+ *		    *  if *offset* + *len* extends into the skb's paged buffers, the user
+ *		       should manually pull the skb with bpf_skb_pull_data and then try again.
+ *
+ *		    *  *flags* are a combination of **BPF_F_RECOMPUTE_CSUM** (automatically
+ *			recompute the checksum for the packet after storing the bytes) and
+ *			**BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\
+ *			**->swhash** and *skb*\ **->l4hash** to 0).
  *	Return
  *		0 on success, -E2BIG if *offset* + *len* exceeds the length
  *		of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst*
- *		is a read-only dynptr or if *flags* is not 0.
+ *		is a read-only dynptr or if *flags* is invalid, -EAGAIN if, for
+ *		skb-type dynptrs, the write extends into the skb's paged buffers.
  *
  * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len)
  *	Description
@@ -5253,10 +5264,19 @@  union bpf_attr {
  *
  *		*len* must be a statically known value. The returned data slice
  *		is invalidated whenever the dynptr is invalidated.
+ *
+ *		For skb-type dynptrs:
+ *		    * if *offset* + *len* extends into the skb's paged buffers,
+ *		      the user should manually pull the skb with bpf_skb_pull and then
+ *		      try again.
+ *
+ *		    * the data slice is automatically invalidated anytime a
+ *		      helper call that changes the underlying packet buffer
+ *		      (eg bpf_skb_pull_data) is called.
  *	Return
  *		Pointer to the underlying dynptr data, NULL if the dynptr is
  *		read-only, if the dynptr is invalid, or if the offset and length
- *		is out of bounds.
+ *		is out of bounds or in a paged buffer for skb-type dynptrs.
  *
  * s64 bpf_tcp_raw_gen_syncookie_ipv4(struct iphdr *iph, struct tcphdr *th, u32 th_len)
  *	Description
@@ -5331,6 +5351,21 @@  union bpf_attr {
  *		**-EACCES** if the SYN cookie is not valid.
  *
  *		**-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin.
+ *
+ * long bpf_dynptr_from_skb(struct sk_buff *skb, u64 flags, struct bpf_dynptr *ptr)
+ *	Description
+ *		Get a dynptr to the data in *skb*. *skb* must be the BPF program
+ *		context. Depending on program type, the dynptr may be read-only,
+ *		in which case trying to obtain a direct data slice to it through
+ *		bpf_dynptr_data will return an error.
+ *
+ *		Calls that change the *skb*'s underlying packet buffer
+ *		(eg bpf_skb_pull_data) do not invalidate the dynptr, but they do
+ *		invalidate any data slices associated with the dynptr.
+ *
+ *		*flags* is currently unused; it must be 0 for now.
+ *	Return
+ *		0 on success or -EINVAL if flags is not 0.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5541,6 +5576,7 @@  union bpf_attr {
 	FN(tcp_raw_gen_syncookie_ipv6),	\
 	FN(tcp_raw_check_syncookie_ipv4),	\
 	FN(tcp_raw_check_syncookie_ipv6),	\
+	FN(dynptr_from_skb),		\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1f961f9982d2..21a806057e9e 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1425,11 +1425,21 @@  static bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr)
 	return ptr->size & DYNPTR_RDONLY_BIT;
 }
 
+void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr)
+{
+	ptr->size |= DYNPTR_RDONLY_BIT;
+}
+
 static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type)
 {
 	ptr->size |= type << DYNPTR_TYPE_SHIFT;
 }
 
+static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *ptr)
+{
+	return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT;
+}
+
 static u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr)
 {
 	return ptr->size & DYNPTR_SIZE_MASK;
@@ -1500,6 +1510,7 @@  static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
 BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src,
 	   u32, offset, u64, flags)
 {
+	enum bpf_dynptr_type type;
 	int err;
 
 	if (!src->data || flags)
@@ -1509,6 +1520,11 @@  BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src
 	if (err)
 		return err;
 
+	type = bpf_dynptr_get_type(src);
+
+	if (type == BPF_DYNPTR_TYPE_SKB)
+		return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len);
+
 	memcpy(dst, src->data + src->offset + offset, len);
 
 	return 0;
@@ -1528,15 +1544,38 @@  static const struct bpf_func_proto bpf_dynptr_read_proto = {
 BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src,
 	   u32, len, u64, flags)
 {
+	enum bpf_dynptr_type type;
 	int err;
 
-	if (!dst->data || flags || bpf_dynptr_is_rdonly(dst))
+	if (!dst->data || bpf_dynptr_is_rdonly(dst))
 		return -EINVAL;
 
 	err = bpf_dynptr_check_off_len(dst, offset, len);
 	if (err)
 		return err;
 
+	type = bpf_dynptr_get_type(dst);
+
+	if (flags) {
+		if (type == BPF_DYNPTR_TYPE_SKB) {
+			if (flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))
+				return -EINVAL;
+		} else {
+			return -EINVAL;
+		}
+	}
+
+	if (type == BPF_DYNPTR_TYPE_SKB) {
+		struct sk_buff *skb = dst->data;
+
+		/* if the data is paged, the caller needs to pull it first */
+		if (dst->offset + offset + len > skb->len - skb->data_len)
+			return -EAGAIN;
+
+		return __bpf_skb_store_bytes(skb, dst->offset + offset, src, len,
+					     flags);
+	}
+
 	memcpy(dst->data + dst->offset + offset, src, len);
 
 	return 0;
@@ -1555,6 +1594,7 @@  static const struct bpf_func_proto bpf_dynptr_write_proto = {
 
 BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len)
 {
+	enum bpf_dynptr_type type;
 	int err;
 
 	if (!ptr->data)
@@ -1567,6 +1607,18 @@  BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len
 	if (bpf_dynptr_is_rdonly(ptr))
 		return 0;
 
+	type = bpf_dynptr_get_type(ptr);
+
+	if (type == BPF_DYNPTR_TYPE_SKB) {
+		struct sk_buff *skb = ptr->data;
+
+		/* if the data is paged, the caller needs to pull it first */
+		if (ptr->offset + offset + len > skb->len - skb->data_len)
+			return 0;
+
+		return (unsigned long)(skb->data + ptr->offset + offset);
+	}
+
 	return (unsigned long)(ptr->data + ptr->offset + offset);
 }
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 0d523741a543..0838653eeb4e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -263,6 +263,7 @@  struct bpf_call_arg_meta {
 	u32 subprogno;
 	struct bpf_map_value_off_desc *kptr_off_desc;
 	u8 uninit_dynptr_regno;
+	enum bpf_dynptr_type type;
 };
 
 struct btf *btf_vmlinux;
@@ -678,6 +679,8 @@  static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
 		return BPF_DYNPTR_TYPE_LOCAL;
 	case DYNPTR_TYPE_RINGBUF:
 		return BPF_DYNPTR_TYPE_RINGBUF;
+	case DYNPTR_TYPE_SKB:
+		return BPF_DYNPTR_TYPE_SKB;
 	default:
 		return BPF_DYNPTR_TYPE_INVALID;
 	}
@@ -5820,12 +5823,14 @@  int check_func_arg_reg_off(struct bpf_verifier_env *env,
 	return __check_ptr_off_reg(env, reg, regno, fixed_off_ok);
 }
 
-static u32 stack_slot_get_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+static void stack_slot_get_dynptr_info(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+				       struct bpf_call_arg_meta *meta)
 {
 	struct bpf_func_state *state = func(env, reg);
 	int spi = get_spi(reg->off);
 
-	return state->stack[spi].spilled_ptr.id;
+	meta->ref_obj_id = state->stack[spi].spilled_ptr.id;
+	meta->type = state->stack[spi].spilled_ptr.dynptr.type;
 }
 
 static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
@@ -6052,6 +6057,9 @@  static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 				case DYNPTR_TYPE_RINGBUF:
 					err_extra = "ringbuf ";
 					break;
+				case DYNPTR_TYPE_SKB:
+					err_extra = "skb ";
+					break;
 				default:
 					break;
 				}
@@ -6065,8 +6073,10 @@  static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 					verbose(env, "verifier internal error: multiple refcounted args in BPF_FUNC_dynptr_data");
 					return -EFAULT;
 				}
-				/* Find the id of the dynptr we're tracking the reference of */
-				meta->ref_obj_id = stack_slot_get_id(env, reg);
+				/* Find the id and the type of the dynptr we're tracking
+				 * the reference of.
+				 */
+				stack_slot_get_dynptr_info(env, reg, meta);
 			}
 		}
 		break;
@@ -7406,7 +7416,11 @@  static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag;
 	} else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) {
 		mark_reg_known_zero(env, regs, BPF_REG_0);
-		regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
+		if (func_id == BPF_FUNC_dynptr_data &&
+		    meta.type == BPF_DYNPTR_TYPE_SKB)
+			regs[BPF_REG_0].type = PTR_TO_PACKET | ret_flag;
+		else
+			regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
 		regs[BPF_REG_0].mem_size = meta.mem_size;
 	} else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) {
 		const struct btf_type *t;
@@ -14132,6 +14146,25 @@  static int do_misc_fixups(struct bpf_verifier_env *env)
 			goto patch_call_imm;
 		}
 
+		if (insn->imm == BPF_FUNC_dynptr_from_skb) {
+			if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE))
+				insn_buf[0] = BPF_MOV32_IMM(BPF_REG_4, true);
+			else
+				insn_buf[0] = BPF_MOV32_IMM(BPF_REG_4, false);
+			insn_buf[1] = *insn;
+			cnt = 2;
+
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta += cnt - 1;
+			env->prog = new_prog;
+			prog = new_prog;
+			insn = new_prog->insnsi + i + delta;
+			goto patch_call_imm;
+		}
+
 		/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
 		 * and other inlining handlers are currently limited to 64 bit
 		 * only.
diff --git a/net/core/filter.c b/net/core/filter.c
index 5669248aff25..312f99deb759 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1681,8 +1681,8 @@  static inline void bpf_pull_mac_rcsum(struct sk_buff *skb)
 		skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len);
 }
 
-BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset,
-	   const void *, from, u32, len, u64, flags)
+int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from,
+			  u32 len, u64 flags)
 {
 	void *ptr;
 
@@ -1707,6 +1707,12 @@  BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset,
 	return 0;
 }
 
+BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset,
+	   const void *, from, u32, len, u64, flags)
+{
+	return __bpf_skb_store_bytes(skb, offset, from, len, flags);
+}
+
 static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
 	.func		= bpf_skb_store_bytes,
 	.gpl_only	= false,
@@ -1718,8 +1724,7 @@  static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
 	.arg5_type	= ARG_ANYTHING,
 };
 
-BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset,
-	   void *, to, u32, len)
+int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len)
 {
 	void *ptr;
 
@@ -1738,6 +1743,12 @@  BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset,
 	return -EFAULT;
 }
 
+BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset,
+	   void *, to, u32, len)
+{
+	return __bpf_skb_load_bytes(skb, offset, to, len);
+}
+
 static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
 	.func		= bpf_skb_load_bytes,
 	.gpl_only	= false,
@@ -1849,6 +1860,32 @@  static const struct bpf_func_proto bpf_skb_pull_data_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
+/* is_rdonly is set by the verifier */
+BPF_CALL_4(bpf_dynptr_from_skb, struct sk_buff *, skb, u64, flags,
+	   struct bpf_dynptr_kern *, ptr, u32, is_rdonly)
+{
+	if (flags) {
+		bpf_dynptr_set_null(ptr);
+		return -EINVAL;
+	}
+
+	bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB, 0, skb->len);
+
+	if (is_rdonly)
+		bpf_dynptr_set_rdonly(ptr);
+
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_dynptr_from_skb_proto = {
+	.func		= bpf_dynptr_from_skb,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_SKB | MEM_UNINIT,
+};
+
 BPF_CALL_1(bpf_sk_fullsock, struct sock *, sk)
 {
 	return sk_fullsock(sk) ? (unsigned long)sk : (unsigned long)NULL;
@@ -7808,6 +7845,8 @@  sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_socket_uid_proto;
 	case BPF_FUNC_perf_event_output:
 		return &bpf_skb_event_output_proto;
+	case BPF_FUNC_dynptr_from_skb:
+		return &bpf_dynptr_from_skb_proto;
 	default:
 		return bpf_sk_base_func_proto(func_id);
 	}
@@ -7991,6 +8030,8 @@  tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_tcp_raw_check_syncookie_ipv6_proto;
 #endif
 #endif
+	case BPF_FUNC_dynptr_from_skb:
+		return &bpf_dynptr_from_skb_proto;
 	default:
 		return bpf_sk_base_func_proto(func_id);
 	}
@@ -8186,6 +8227,8 @@  sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_skc_lookup_tcp:
 		return &bpf_skc_lookup_tcp_proto;
 #endif
+	case BPF_FUNC_dynptr_from_skb:
+		return &bpf_dynptr_from_skb_proto;
 	default:
 		return bpf_sk_base_func_proto(func_id);
 	}
@@ -8224,6 +8267,8 @@  lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_smp_processor_id_proto;
 	case BPF_FUNC_skb_under_cgroup:
 		return &bpf_skb_under_cgroup_proto;
+	case BPF_FUNC_dynptr_from_skb:
+		return &bpf_dynptr_from_skb_proto;
 	default:
 		return bpf_sk_base_func_proto(func_id);
 	}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 59a217ca2dfd..0730cd198a7f 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5241,11 +5241,22 @@  union bpf_attr {
  *	Description
  *		Write *len* bytes from *src* into *dst*, starting from *offset*
  *		into *dst*.
- *		*flags* is currently unused.
+ *
+ *		*flags* must be 0 except for skb-type dynptrs.
+ *
+ *		For skb-type dynptrs:
+ *		    *  if *offset* + *len* extends into the skb's paged buffers, the user
+ *		       should manually pull the skb with bpf_skb_pull_data and then try again.
+ *
+ *		    *  *flags* are a combination of **BPF_F_RECOMPUTE_CSUM** (automatically
+ *			recompute the checksum for the packet after storing the bytes) and
+ *			**BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\
+ *			**->swhash** and *skb*\ **->l4hash** to 0).
  *	Return
  *		0 on success, -E2BIG if *offset* + *len* exceeds the length
  *		of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst*
- *		is a read-only dynptr or if *flags* is not 0.
+ *		is a read-only dynptr or if *flags* is invalid, -EAGAIN if, for
+ *		skb-type dynptrs, the write extends into the skb's paged buffers.
  *
  * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len)
  *	Description
@@ -5253,10 +5264,19 @@  union bpf_attr {
  *
  *		*len* must be a statically known value. The returned data slice
  *		is invalidated whenever the dynptr is invalidated.
+ *
+ *		For skb-type dynptrs:
+ *		    * if *offset* + *len* extends into the skb's paged buffers,
+ *		      the user should manually pull the skb with bpf_skb_pull_data and then
+ *		      try again.
+ *
+ *		    * the data slice is automatically invalidated anytime a
+ *		      helper call that changes the underlying packet buffer
+ *		      (eg bpf_skb_pull_data) is called.
  *	Return
  *		Pointer to the underlying dynptr data, NULL if the dynptr is
  *		read-only, if the dynptr is invalid, or if the offset and length
- *		is out of bounds.
+ *		is out of bounds or in a paged buffer for skb-type dynptrs.
  *
  * s64 bpf_tcp_raw_gen_syncookie_ipv4(struct iphdr *iph, struct tcphdr *th, u32 th_len)
  *	Description
@@ -5331,6 +5351,21 @@  union bpf_attr {
  *		**-EACCES** if the SYN cookie is not valid.
  *
  *		**-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin.
+ *
+ * long bpf_dynptr_from_skb(struct sk_buff *skb, u64 flags, struct bpf_dynptr *ptr)
+ *	Description
+ *		Get a dynptr to the data in *skb*. *skb* must be the BPF program
+ *		context. Depending on program type, the dynptr may be read-only,
+ *		in which case trying to obtain a direct data slice to it through
+ *		bpf_dynptr_data will return an error.
+ *
+ *		Calls that change the *skb*'s underlying packet buffer
+ *		(eg bpf_skb_pull_data) do not invalidate the dynptr, but they do
+ *		invalidate any data slices associated with the dynptr.
+ *
+ *		*flags* is currently unused; it must be 0 for now.
+ *	Return
+ *		0 on success or -EINVAL if flags is not 0.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5541,6 +5576,7 @@  union bpf_attr {
 	FN(tcp_raw_gen_syncookie_ipv6),	\
 	FN(tcp_raw_check_syncookie_ipv4),	\
 	FN(tcp_raw_check_syncookie_ipv6),	\
+	FN(dynptr_from_skb),		\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper