diff mbox series

[v13,bpf-next,09/10] bpf: Add bpf_dynptr_slice and bpf_dynptr_slice_rdwr

Message ID 20230301154953.641654-10-joannelkoong@gmail.com (mailing list archive)
State Accepted
Commit 66e3a13e7c2c44d0c9dd6bb244680ca7529a8845
Delegated to: BPF
Headers show
Series Add skb + xdp dynptrs | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-5 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-7 success Logs for llvm-toolchain
bpf/vmtest-bpf-next-VM_Test-8 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_maps on aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-11 pending Logs for test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-14 success Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for test_progs on aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-19 success Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for test_progs_no_alu32 on aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-22 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_progs_no_alu32_parallel on aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for test_progs_parallel on aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-33 success Logs for test_progs_parallel on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-35 success Logs for test_verifier on aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-37 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-38 success Logs for test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-21 fail Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_progs_no_alu32_parallel on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for test_progs_parallel on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-36 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 fail Logs for test_progs on s390x with gcc
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 1748 this patch: 1757
netdev/cc_maintainers warning 13 maintainers not CCed: pabeni@redhat.com davem@davemloft.net jolsa@kernel.org john.fastabend@gmail.com martin.lau@linux.dev yhs@fb.com kpsingh@kernel.org song@kernel.org haoluo@google.com edumazet@google.com kuba@kernel.org hawk@kernel.org sdf@google.com
netdev/build_clang fail Errors and warnings before: 159 this patch: 163
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 1745 this patch: 1754
netdev/checkpatch warning CHECK: Please don't use multiple blank lines WARNING: Missing a blank line after declarations WARNING: line length of 100 exceeds 80 columns WARNING: line length of 110 exceeds 80 columns WARNING: line length of 112 exceeds 80 columns WARNING: line length of 114 exceeds 80 columns WARNING: line length of 120 exceeds 80 columns WARNING: line length of 124 exceeds 80 columns WARNING: line length of 81 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns WARNING: line length of 88 exceeds 80 columns WARNING: line length of 89 exceeds 80 columns WARNING: line length of 90 exceeds 80 columns WARNING: line length of 93 exceeds 80 columns WARNING: line length of 94 exceeds 80 columns WARNING: line length of 96 exceeds 80 columns WARNING: line length of 98 exceeds 80 columns
netdev/kdoc fail Errors and warnings before: 0 this patch: 2
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-PR fail PR summary

Commit Message

Joanne Koong March 1, 2023, 3:49 p.m. UTC
Two new kfuncs are added, bpf_dynptr_slice and bpf_dynptr_slice_rdwr.
The user must pass in a buffer to store the contents of the data slice
if a direct pointer to the data cannot be obtained.

For skb and xdp type dynptrs, these two APIs are the only way to obtain
a data slice. However, for other types of dynptrs, there is no
difference between bpf_dynptr_slice(_rdwr) and bpf_dynptr_data.

For skb type dynptrs, the data is copied into the user provided buffer
if any of the data is not in the linear portion of the skb. For xdp type
dynptrs, the data is copied into the user provided buffer if the data is
between xdp frags.

If the skb is cloned and a call to bpf_dynptr_slice_rdwr is made, then
the skb will be uncloned (see bpf_unclone_prologue()).

Please note that any bpf_dynptr_write() automatically invalidates any prior
data slices of the skb dynptr. This is because the skb may be cloned or
may need to pull its paged buffer into the head. As such, any
bpf_dynptr_write() will automatically have its prior data slices
invalidated, even if the write is to data in the skb head of an uncloned
skb. Please note as well that any other helper calls that change the
underlying packet buffer (eg bpf_skb_pull_data()) invalidates any data
slices of the skb dynptr as well, for the same reasons.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
 include/linux/filter.h         |  14 ++++
 include/uapi/linux/bpf.h       |   5 ++
 kernel/bpf/helpers.c           | 138 +++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c          | 127 +++++++++++++++++++++++++++++-
 net/core/filter.c              |   6 +-
 tools/include/uapi/linux/bpf.h |   5 ++
 6 files changed, 288 insertions(+), 7 deletions(-)

Comments

kernel test robot March 2, 2023, 3:29 a.m. UTC | #1
Hi Joanne,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on bpf-next/master]

url:    https://github.com/intel-lab-lkp/linux/commits/Joanne-Koong/bpf-Support-sk_buff-and-xdp_buff-as-valid-kfunc-arg-types/20230301-235341
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
patch link:    https://lore.kernel.org/r/20230301154953.641654-10-joannelkoong%40gmail.com
patch subject: [PATCH v13 bpf-next 09/10] bpf: Add bpf_dynptr_slice and bpf_dynptr_slice_rdwr
config: microblaze-randconfig-s043-20230302 (https://download.01.org/0day-ci/archive/20230302/202303021152.sPWiwGYn-lkp@intel.com/config)
compiler: microblaze-linux-gcc (GCC) 12.1.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # apt-get install sparse
        # sparse version: v0.6.4-39-gce1a6720-dirty
        # https://github.com/intel-lab-lkp/linux/commit/ab021cad431168baaba04ed320003be30f4deb34
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Joanne-Koong/bpf-Support-sk_buff-and-xdp_buff-as-valid-kfunc-arg-types/20230301-235341
        git checkout ab021cad431168baaba04ed320003be30f4deb34
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' O=build_dir ARCH=microblaze olddefconfig
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' O=build_dir ARCH=microblaze SHELL=/bin/bash kernel/bpf/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Link: https://lore.kernel.org/oe-kbuild-all/202303021152.sPWiwGYn-lkp@intel.com/

sparse warnings: (new ones prefixed by >>)
>> kernel/bpf/helpers.c:2231:24: sparse: sparse: Using plain integer as NULL pointer
   kernel/bpf/helpers.c:2235:24: sparse: sparse: Using plain integer as NULL pointer
   kernel/bpf/helpers.c:2256:24: sparse: sparse: Using plain integer as NULL pointer
   kernel/bpf/helpers.c:2305:24: sparse: sparse: Using plain integer as NULL pointer
   kernel/bpf/helpers.c:2342:18: sparse: sparse: context imbalance in 'bpf_rcu_read_lock' - wrong count at exit
   kernel/bpf/helpers.c:2347:18: sparse: sparse: context imbalance in 'bpf_rcu_read_unlock' - unexpected unlock

vim +2231 kernel/bpf/helpers.c

  2195	
  2196	/**
  2197	 * bpf_dynptr_slice - Obtain a read-only pointer to the dynptr data.
  2198	 *
  2199	 * For non-skb and non-xdp type dynptrs, there is no difference between
  2200	 * bpf_dynptr_slice and bpf_dynptr_data.
  2201	 *
  2202	 * If the intention is to write to the data slice, please use
  2203	 * bpf_dynptr_slice_rdwr.
  2204	 *
  2205	 * The user must check that the returned pointer is not null before using it.
  2206	 *
  2207	 * Please note that in the case of skb and xdp dynptrs, bpf_dynptr_slice
  2208	 * does not change the underlying packet data pointers, so a call to
  2209	 * bpf_dynptr_slice will not invalidate any ctx->data/data_end pointers in
  2210	 * the bpf program.
  2211	 *
  2212	 * @ptr: The dynptr whose data slice to retrieve
  2213	 * @offset: Offset into the dynptr
  2214	 * @buffer: User-provided buffer to copy contents into
  2215	 * @buffer__szk: Size (in bytes) of the buffer. This is the length of the
  2216	 * requested slice. This must be a constant.
  2217	 *
  2218	 * @returns: NULL if the call failed (eg invalid dynptr), pointer to a read-only
  2219	 * data slice (can be either direct pointer to the data or a pointer to the user
  2220	 * provided buffer, with its contents containing the data, if unable to obtain
  2221	 * direct pointer)
  2222	 */
  2223	__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset,
  2224					   void *buffer, u32 buffer__szk)
  2225	{
  2226		enum bpf_dynptr_type type;
  2227		u32 len = buffer__szk;
  2228		int err;
  2229	
  2230		if (!ptr->data)
> 2231			return 0;
  2232	
  2233		err = bpf_dynptr_check_off_len(ptr, offset, len);
  2234		if (err)
  2235			return 0;
  2236	
  2237		type = bpf_dynptr_get_type(ptr);
  2238	
  2239		switch (type) {
  2240		case BPF_DYNPTR_TYPE_LOCAL:
  2241		case BPF_DYNPTR_TYPE_RINGBUF:
  2242			return ptr->data + ptr->offset + offset;
  2243		case BPF_DYNPTR_TYPE_SKB:
  2244			return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer);
  2245		case BPF_DYNPTR_TYPE_XDP:
  2246		{
  2247			void *xdp_ptr = bpf_xdp_pointer(ptr->data, ptr->offset + offset, len);
  2248			if (xdp_ptr)
  2249				return xdp_ptr;
  2250	
  2251			bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer, len, false);
  2252			return buffer;
  2253		}
  2254		default:
  2255			WARN_ONCE(true, "unknown dynptr type %d\n", type);
  2256			return 0;
  2257		}
  2258	}
  2259
Joanne Koong March 2, 2023, 3:53 a.m. UTC | #2
On Wed, Mar 1, 2023 at 7:30 PM kernel test robot <lkp@intel.com> wrote:
>
> Hi Joanne,
>
> Thank you for the patch! Perhaps something to improve:
>
> [auto build test WARNING on bpf-next/master]
>
> url:    https://github.com/intel-lab-lkp/linux/commits/Joanne-Koong/bpf-Support-sk_buff-and-xdp_buff-as-valid-kfunc-arg-types/20230301-235341
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
> patch link:    https://lore.kernel.org/r/20230301154953.641654-10-joannelkoong%40gmail.com
> patch subject: [PATCH v13 bpf-next 09/10] bpf: Add bpf_dynptr_slice and bpf_dynptr_slice_rdwr
> config: microblaze-randconfig-s043-20230302 (https://download.01.org/0day-ci/archive/20230302/202303021152.sPWiwGYn-lkp@intel.com/config)
> compiler: microblaze-linux-gcc (GCC) 12.1.0
> reproduce:
>         wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
>         chmod +x ~/bin/make.cross
>         # apt-get install sparse
>         # sparse version: v0.6.4-39-gce1a6720-dirty
>         # https://github.com/intel-lab-lkp/linux/commit/ab021cad431168baaba04ed320003be30f4deb34
>         git remote add linux-review https://github.com/intel-lab-lkp/linux
>         git fetch --no-tags linux-review Joanne-Koong/bpf-Support-sk_buff-and-xdp_buff-as-valid-kfunc-arg-types/20230301-235341
>         git checkout ab021cad431168baaba04ed320003be30f4deb34
>         # save the config file
>         mkdir build_dir && cp config build_dir/.config
>         COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' O=build_dir ARCH=microblaze olddefconfig
>         COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' O=build_dir ARCH=microblaze SHELL=/bin/bash kernel/bpf/
>
> If you fix the issue, kindly add following tag where applicable
> | Reported-by: kernel test robot <lkp@intel.com>
> | Link: https://lore.kernel.org/oe-kbuild-all/202303021152.sPWiwGYn-lkp@intel.com/
>
> sparse warnings: (new ones prefixed by >>)
> >> kernel/bpf/helpers.c:2231:24: sparse: sparse: Using plain integer as NULL pointer
>    kernel/bpf/helpers.c:2235:24: sparse: sparse: Using plain integer as NULL pointer
>    kernel/bpf/helpers.c:2256:24: sparse: sparse: Using plain integer as NULL pointer
>    kernel/bpf/helpers.c:2305:24: sparse: sparse: Using plain integer as NULL pointer

Argh, sorry about that. I'll submit a follow-up for returning NULL
instead of 0.

>    kernel/bpf/helpers.c:2342:18: sparse: sparse: context imbalance in 'bpf_rcu_read_lock' - wrong count at exit
>    kernel/bpf/helpers.c:2347:18: sparse: sparse: context imbalance in 'bpf_rcu_read_unlock' - unexpected unlock
>
> vim +2231 kernel/bpf/helpers.c
>
>   2195
>   2196  /**
>   2197   * bpf_dynptr_slice - Obtain a read-only pointer to the dynptr data.
>   2198   *
>   2199   * For non-skb and non-xdp type dynptrs, there is no difference between
>   2200   * bpf_dynptr_slice and bpf_dynptr_data.
>   2201   *
>   2202   * If the intention is to write to the data slice, please use
>   2203   * bpf_dynptr_slice_rdwr.
>   2204   *
>   2205   * The user must check that the returned pointer is not null before using it.
>   2206   *
>   2207   * Please note that in the case of skb and xdp dynptrs, bpf_dynptr_slice
>   2208   * does not change the underlying packet data pointers, so a call to
>   2209   * bpf_dynptr_slice will not invalidate any ctx->data/data_end pointers in
>   2210   * the bpf program.
>   2211   *
>   2212   * @ptr: The dynptr whose data slice to retrieve
>   2213   * @offset: Offset into the dynptr
>   2214   * @buffer: User-provided buffer to copy contents into
>   2215   * @buffer__szk: Size (in bytes) of the buffer. This is the length of the
>   2216   * requested slice. This must be a constant.
>   2217   *
>   2218   * @returns: NULL if the call failed (eg invalid dynptr), pointer to a read-only
>   2219   * data slice (can be either direct pointer to the data or a pointer to the user
>   2220   * provided buffer, with its contents containing the data, if unable to obtain
>   2221   * direct pointer)
>   2222   */
>   2223  __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset,
>   2224                                     void *buffer, u32 buffer__szk)
>   2225  {
>   2226          enum bpf_dynptr_type type;
>   2227          u32 len = buffer__szk;
>   2228          int err;
>   2229
>   2230          if (!ptr->data)
> > 2231                  return 0;
>   2232
>   2233          err = bpf_dynptr_check_off_len(ptr, offset, len);
>   2234          if (err)
>   2235                  return 0;
>   2236
>   2237          type = bpf_dynptr_get_type(ptr);
>   2238
>   2239          switch (type) {
>   2240          case BPF_DYNPTR_TYPE_LOCAL:
>   2241          case BPF_DYNPTR_TYPE_RINGBUF:
>   2242                  return ptr->data + ptr->offset + offset;
>   2243          case BPF_DYNPTR_TYPE_SKB:
>   2244                  return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer);
>   2245          case BPF_DYNPTR_TYPE_XDP:
>   2246          {
>   2247                  void *xdp_ptr = bpf_xdp_pointer(ptr->data, ptr->offset + offset, len);
>   2248                  if (xdp_ptr)
>   2249                          return xdp_ptr;
>   2250
>   2251                  bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer, len, false);
>   2252                  return buffer;
>   2253          }
>   2254          default:
>   2255                  WARN_ONCE(true, "unknown dynptr type %d\n", type);
>   2256                  return 0;
>   2257          }
>   2258  }
>   2259
>
> --
> 0-DAY CI Kernel Test Service
> https://github.com/intel/lkp-tests
Kumar Kartikeya Dwivedi March 6, 2023, 7:10 a.m. UTC | #3
On Wed, Mar 01, 2023 at 04:49:52PM CET, Joanne Koong wrote:
> Two new kfuncs are added, bpf_dynptr_slice and bpf_dynptr_slice_rdwr.
> The user must pass in a buffer to store the contents of the data slice
> if a direct pointer to the data cannot be obtained.
>
> For skb and xdp type dynptrs, these two APIs are the only way to obtain
> a data slice. However, for other types of dynptrs, there is no
> difference between bpf_dynptr_slice(_rdwr) and bpf_dynptr_data.
>
> For skb type dynptrs, the data is copied into the user provided buffer
> if any of the data is not in the linear portion of the skb. For xdp type
> dynptrs, the data is copied into the user provided buffer if the data is
> between xdp frags.
>
> If the skb is cloned and a call to bpf_dynptr_data_rdwr is made, then
> the skb will be uncloned (see bpf_unclone_prologue()).
>
> Please note that any bpf_dynptr_write() automatically invalidates any prior
> data slices of the skb dynptr. This is because the skb may be cloned or
> may need to pull its paged buffer into the head. As such, any
> bpf_dynptr_write() will automatically have its prior data slices
> invalidated, even if the write is to data in the skb head of an uncloned
> skb. Please note as well that any other helper calls that change the
> underlying packet buffer (eg bpf_skb_pull_data()) invalidates any data
> slices of the skb dynptr as well, for the same reasons.
>
> Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> ---

Sorry for chiming in late.

I see one potential hole in bpf_dynptr_slice_rdwr. If the returned pointer is
actually pointing to the stack (but verified as a PTR_TO_MEM in verifier state),
we won't reflect changes to the stack state in the verifier for writes happening
through it.

For the worst case scenario, this will basically allow overwriting values of
spilled pointers and doing arbitrary kernel memory reads/writes. This is only an
issue when bpf_dynptr_slice_rdwr at runtime returns a pointer to the supplied
buffer residing on program stack. To verify, by forcing the memcpy to buffer for
skb_header_pointer I was able to make it dereference a garbage value for
l4lb_all selftest.

--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2253,7 +2253,13 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset
 	case BPF_DYNPTR_TYPE_RINGBUF:
 		return ptr->data + ptr->offset + offset;
 	case BPF_DYNPTR_TYPE_SKB:
-		return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer);
+	{
+		void *p = skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer);
+		if (p == buffer)
+			return p;
+		memcpy(buffer, p, len);
+		return buffer;
+	}

--- a/tools/testing/selftests/bpf/progs/test_l4lb_noinline_dynptr.c
+++ b/tools/testing/selftests/bpf/progs/test_l4lb_noinline_dynptr.c
@@ -470,7 +470,10 @@ int balancer_ingress(struct __sk_buff *ctx)
 	eth = bpf_dynptr_slice_rdwr(&ptr, 0, buffer, sizeof(buffer));
 	if (!eth)
 		return TC_ACT_SHOT;
-	eth_proto = eth->eth_proto;
+	*(void **)buffer = ctx;
+	*(void **)eth = (void *)0xdeadbeef;
+	ctx = *(void **)buffer;
+	eth_proto = eth->eth_proto + ctx->len;
 	if (eth_proto == bpf_htons(ETH_P_IP))
 		err = process_packet(&ptr, eth, nh_off, false, ctx);

I think the proper fix is to treat it as a separate return type distinct from
PTR_TO_MEM like PTR_TO_MEM_OR_PKT (or handle PTR_TO_MEM | DYNPTR_* specially),
fork verifier state whenever there is a write, so that one path verifies it as
PTR_TO_PACKET, while another as PTR_TO_STACK (if buffer was a stack ptr). I
think for the rest it's not a problem, but there are allow_ptr_leak checks
applied to PTR_TO_STACK and PTR_TO_MAP_VALUE, so that needs to be rechecked.
Then we ensure that program is safe in either path.

Also we need to fix regsafe to not consider other PTR_TO_MEMs equivalent to such
a pointer. We could also fork verifier states on return, to verify either path
separately right from the point following the call instruction.

Any other ideas welcome.
Alexei Starovoitov March 7, 2023, 2:23 a.m. UTC | #4
On Sun, Mar 5, 2023 at 11:10 PM Kumar Kartikeya Dwivedi
<memxor@gmail.com> wrote:
>
> On Wed, Mar 01, 2023 at 04:49:52PM CET, Joanne Koong wrote:
> > Two new kfuncs are added, bpf_dynptr_slice and bpf_dynptr_slice_rdwr.
> > The user must pass in a buffer to store the contents of the data slice
> > if a direct pointer to the data cannot be obtained.
> >
> > For skb and xdp type dynptrs, these two APIs are the only way to obtain
> > a data slice. However, for other types of dynptrs, there is no
> > difference between bpf_dynptr_slice(_rdwr) and bpf_dynptr_data.
> >
> > For skb type dynptrs, the data is copied into the user provided buffer
> > if any of the data is not in the linear portion of the skb. For xdp type
> > dynptrs, the data is copied into the user provided buffer if the data is
> > between xdp frags.
> >
> > If the skb is cloned and a call to bpf_dynptr_data_rdwr is made, then
> > the skb will be uncloned (see bpf_unclone_prologue()).
> >
> > Please note that any bpf_dynptr_write() automatically invalidates any prior
> > data slices of the skb dynptr. This is because the skb may be cloned or
> > may need to pull its paged buffer into the head. As such, any
> > bpf_dynptr_write() will automatically have its prior data slices
> > invalidated, even if the write is to data in the skb head of an uncloned
> > skb. Please note as well that any other helper calls that change the
> > underlying packet buffer (eg bpf_skb_pull_data()) invalidates any data
> > slices of the skb dynptr as well, for the same reasons.
> >
> > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > ---
>
> Sorry for chiming in late.
>
> I see one potential hole in bpf_dynptr_slice_rdwr. If the returned pointer is
> actually pointing to the stack (but verified as a PTR_TO_MEM in verifier state),
> we won't reflect changes to the stack state in the verifier for writes happening
> through it.
>
> For the worst case scenario, this will basically allow overwriting values of
> spilled pointers and doing arbitrary kernel memory reads/writes. This is only an
> issue when bpf_dynptr_slice_rdwr at runtime returns a pointer to the supplied
> buffer residing on program stack. To verify, by forcing the memcpy to buffer for
> skb_header_pointer I was able to make it dereference a garbage value for
> l4lb_all selftest.
>
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -2253,7 +2253,13 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset
>         case BPF_DYNPTR_TYPE_RINGBUF:
>                 return ptr->data + ptr->offset + offset;
>         case BPF_DYNPTR_TYPE_SKB:
> -               return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer);
> +       {
> +               void *p = skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer);
> +               if (p == buffer)
> +                       return p;
> +               memcpy(buffer, p, len);
> +               return buffer;
> +       }
>
> --- a/tools/testing/selftests/bpf/progs/test_l4lb_noinline_dynptr.c
> +++ b/tools/testing/selftests/bpf/progs/test_l4lb_noinline_dynptr.c
> @@ -470,7 +470,10 @@ int balancer_ingress(struct __sk_buff *ctx)
>         eth = bpf_dynptr_slice_rdwr(&ptr, 0, buffer, sizeof(buffer));
>         if (!eth)
>                 return TC_ACT_SHOT;
> -       eth_proto = eth->eth_proto;
> +       *(void **)buffer = ctx;

Great catch.
To fix the issue I think we should simply disallow such
stack abuse. The compiler won't be spilling registers
into C array on the stack.
This manual spill/fill is exploiting verifier logic.
After bpf_dynptr_slice_rdwr() we can mark all slots of the
buffer as STACK_POISON or some better name and
reject spill into such slots.

> +       *(void **)eth = (void *)0xdeadbeef;
> +       ctx = *(void **)buffer;
> +       eth_proto = eth->eth_proto + ctx->len;
>         if (eth_proto == bpf_htons(ETH_P_IP))
>                 err = process_packet(&ptr, eth, nh_off, false, ctx);
>
> I think the proper fix is to treat it as a separate return type distinct from
> PTR_TO_MEM like PTR_TO_MEM_OR_PKT (or handle PTR_TO_MEM | DYNPTR_* specially),
> fork verifier state whenever there is a write, so that one path verifies it as
> PTR_TO_PACKET, while another as PTR_TO_STACK (if buffer was a stack ptr). I
> think for the rest it's not a problem, but there are allow_ptr_leak checks
> applied to PTR_TO_STACK and PTR_TO_MAP_VALUE, so that needs to be rechecked.
> Then we ensure that program is safe in either path.
>
> Also we need to fix regsafe to not consider other PTR_TO_MEMs equivalent to such
> a pointer. We could also fork verifier states on return, to verify either path
> separately right from the point following the call instruction.

This is too complex imo.
Kumar Kartikeya Dwivedi March 7, 2023, 10:22 a.m. UTC | #5
On Tue, Mar 07, 2023 at 03:23:25AM CET, Alexei Starovoitov wrote:
> On Sun, Mar 5, 2023 at 11:10 PM Kumar Kartikeya Dwivedi
> <memxor@gmail.com> wrote:
> >
> > On Wed, Mar 01, 2023 at 04:49:52PM CET, Joanne Koong wrote:
> > > Two new kfuncs are added, bpf_dynptr_slice and bpf_dynptr_slice_rdwr.
> > > The user must pass in a buffer to store the contents of the data slice
> > > if a direct pointer to the data cannot be obtained.
> > >
> > > For skb and xdp type dynptrs, these two APIs are the only way to obtain
> > > a data slice. However, for other types of dynptrs, there is no
> > > difference between bpf_dynptr_slice(_rdwr) and bpf_dynptr_data.
> > >
> > > For skb type dynptrs, the data is copied into the user provided buffer
> > > if any of the data is not in the linear portion of the skb. For xdp type
> > > dynptrs, the data is copied into the user provided buffer if the data is
> > > between xdp frags.
> > >
> > > If the skb is cloned and a call to bpf_dynptr_data_rdwr is made, then
> > > the skb will be uncloned (see bpf_unclone_prologue()).
> > >
> > > Please note that any bpf_dynptr_write() automatically invalidates any prior
> > > data slices of the skb dynptr. This is because the skb may be cloned or
> > > may need to pull its paged buffer into the head. As such, any
> > > bpf_dynptr_write() will automatically have its prior data slices
> > > invalidated, even if the write is to data in the skb head of an uncloned
> > > skb. Please note as well that any other helper calls that change the
> > > underlying packet buffer (eg bpf_skb_pull_data()) invalidates any data
> > > slices of the skb dynptr as well, for the same reasons.
> > >
> > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > > ---
> >
> > Sorry for chiming in late.
> >
> > I see one potential hole in bpf_dynptr_slice_rdwr. If the returned pointer is
> > actually pointing to the stack (but verified as a PTR_TO_MEM in verifier state),
> > we won't reflect changes to the stack state in the verifier for writes happening
> > through it.
> >
> > For the worst case scenario, this will basically allow overwriting values of
> > spilled pointers and doing arbitrary kernel memory reads/writes. This is only an
> > issue when bpf_dynptr_slice_rdwr at runtime returns a pointer to the supplied
> > buffer residing on program stack. To verify, by forcing the memcpy to buffer for
> > skb_header_pointer I was able to make it dereference a garbage value for
> > l4lb_all selftest.
> >
> > --- a/kernel/bpf/helpers.c
> > +++ b/kernel/bpf/helpers.c
> > @@ -2253,7 +2253,13 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset
> >         case BPF_DYNPTR_TYPE_RINGBUF:
> >                 return ptr->data + ptr->offset + offset;
> >         case BPF_DYNPTR_TYPE_SKB:
> > -               return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer);
> > +       {
> > +               void *p = skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer);
> > +               if (p == buffer)
> > +                       return p;
> > +               memcpy(buffer, p, len);
> > +               return buffer;
> > +       }
> >
> > --- a/tools/testing/selftests/bpf/progs/test_l4lb_noinline_dynptr.c
> > +++ b/tools/testing/selftests/bpf/progs/test_l4lb_noinline_dynptr.c
> > @@ -470,7 +470,10 @@ int balancer_ingress(struct __sk_buff *ctx)
> >         eth = bpf_dynptr_slice_rdwr(&ptr, 0, buffer, sizeof(buffer));
> >         if (!eth)
> >                 return TC_ACT_SHOT;
> > -       eth_proto = eth->eth_proto;
> > +       *(void **)buffer = ctx;
>
> Great catch.
> To fix the issue I think we should simply disallow such
> stack abuse. The compiler won't be spilling registers
> into C array on the stack.
> This manual spill/fill is exploiting verifier logic.
> After bpf_dynptr_slice_rdwr() we can mark all slots of the
> buffer as STACK_POISON or some better name and
> reject spill into such slots.
>

I agree this is simpler, but I'm not sure it will work properly. Verifier won't
know when the lifetime of the buffer ends, so if we disallow spills until its
written over it's going to be a pain for users.

Something like:

for (...) {
	char buf[64];
	bpf_dynptr_slice_rdwr(..., buf, 64);
	...
}

.. and then compiler decides to spill something where buf was located on stack
outside the for loop. The verifier can't know when buf goes out of scope to
unpoison the slots.

> > +       *(void **)eth = (void *)0xdeadbeef;
> > +       ctx = *(void **)buffer;
> > +       eth_proto = eth->eth_proto + ctx->len;
> >         if (eth_proto == bpf_htons(ETH_P_IP))
> >                 err = process_packet(&ptr, eth, nh_off, false, ctx);
> >
> > I think the proper fix is to treat it as a separate return type distinct from
> > PTR_TO_MEM like PTR_TO_MEM_OR_PKT (or handle PTR_TO_MEM | DYNPTR_* specially),
> > fork verifier state whenever there is a write, so that one path verifies it as
> > PTR_TO_PACKET, while another as PTR_TO_STACK (if buffer was a stack ptr). I
> > think for the rest it's not a problem, but there are allow_ptr_leak checks
> > applied to PTR_TO_STACK and PTR_TO_MAP_VALUE, so that needs to be rechecked.
> > Then we ensure that program is safe in either path.
> >
> > Also we need to fix regsafe to not consider other PTR_TO_MEMs equivalent to such
> > a pointer. We could also fork verifier states on return, to verify either path
> > separately right from the point following the call instruction.
>
> This is too complex imo.

A better way to phrase this is to verify with R0 = PTR_TO_PACKET in one path,
and push_stack with R0 = buffer's reg->type + size set to len in the other path
for exploration later. In terms of verifier infra everything is there already,
it just needs to analyze both cases which fall into the regular code handling
the reg->type's. Probably then no adjustments to regsafe are needed either. It's
like exploring branch instructions.
Alexei Starovoitov March 7, 2023, 3:45 p.m. UTC | #6
On Tue, Mar 7, 2023 at 2:22 AM Kumar Kartikeya Dwivedi <memxor@gmail.com> wrote:
>
> On Tue, Mar 07, 2023 at 03:23:25AM CET, Alexei Starovoitov wrote:
> > On Sun, Mar 5, 2023 at 11:10 PM Kumar Kartikeya Dwivedi
> > <memxor@gmail.com> wrote:
> > >
> > > On Wed, Mar 01, 2023 at 04:49:52PM CET, Joanne Koong wrote:
> > > > Two new kfuncs are added, bpf_dynptr_slice and bpf_dynptr_slice_rdwr.
> > > > The user must pass in a buffer to store the contents of the data slice
> > > > if a direct pointer to the data cannot be obtained.
> > > >
> > > > For skb and xdp type dynptrs, these two APIs are the only way to obtain
> > > > a data slice. However, for other types of dynptrs, there is no
> > > > difference between bpf_dynptr_slice(_rdwr) and bpf_dynptr_data.
> > > >
> > > > For skb type dynptrs, the data is copied into the user provided buffer
> > > > if any of the data is not in the linear portion of the skb. For xdp type
> > > > dynptrs, the data is copied into the user provided buffer if the data is
> > > > between xdp frags.
> > > >
> > > > If the skb is cloned and a call to bpf_dynptr_data_rdwr is made, then
> > > > the skb will be uncloned (see bpf_unclone_prologue()).
> > > >
> > > > Please note that any bpf_dynptr_write() automatically invalidates any prior
> > > > data slices of the skb dynptr. This is because the skb may be cloned or
> > > > may need to pull its paged buffer into the head. As such, any
> > > > bpf_dynptr_write() will automatically have its prior data slices
> > > > invalidated, even if the write is to data in the skb head of an uncloned
> > > > skb. Please note as well that any other helper calls that change the
> > > > underlying packet buffer (eg bpf_skb_pull_data()) invalidates any data
> > > > slices of the skb dynptr as well, for the same reasons.
> > > >
> > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > > > ---
> > >
> > > Sorry for chiming in late.
> > >
> > > I see one potential hole in bpf_dynptr_slice_rdwr. If the returned pointer is
> > > actually pointing to the stack (but verified as a PTR_TO_MEM in verifier state),
> > > we won't reflect changes to the stack state in the verifier for writes happening
> > > through it.
> > >
> > > For the worst case scenario, this will basically allow overwriting values of
> > > spilled pointers and doing arbitrary kernel memory reads/writes. This is only an
> > > issue when bpf_dynptr_slice_rdwr at runtime returns a pointer to the supplied
> > > buffer residing on program stack. To verify, by forcing the memcpy to buffer for
> > > skb_header_pointer I was able to make it dereference a garbage value for
> > > l4lb_all selftest.
> > >
> > > --- a/kernel/bpf/helpers.c
> > > +++ b/kernel/bpf/helpers.c
> > > @@ -2253,7 +2253,13 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset
> > >         case BPF_DYNPTR_TYPE_RINGBUF:
> > >                 return ptr->data + ptr->offset + offset;
> > >         case BPF_DYNPTR_TYPE_SKB:
> > > -               return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer);
> > > +       {
> > > +               void *p = skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer);
> > > +               if (p == buffer)
> > > +                       return p;
> > > +               memcpy(buffer, p, len);
> > > +               return buffer;
> > > +       }
> > >
> > > --- a/tools/testing/selftests/bpf/progs/test_l4lb_noinline_dynptr.c
> > > +++ b/tools/testing/selftests/bpf/progs/test_l4lb_noinline_dynptr.c
> > > @@ -470,7 +470,10 @@ int balancer_ingress(struct __sk_buff *ctx)
> > >         eth = bpf_dynptr_slice_rdwr(&ptr, 0, buffer, sizeof(buffer));
> > >         if (!eth)
> > >                 return TC_ACT_SHOT;
> > > -       eth_proto = eth->eth_proto;
> > > +       *(void **)buffer = ctx;
> >
> > Great catch.
> > To fix the issue I think we should simply disallow such
> > stack abuse. The compiler won't be spilling registers
> > into C array on the stack.
> > This manual spill/fill is exploiting verifier logic.
> > After bpf_dynptr_slice_rdwr() we can mark all slots of the
> > buffer as STACK_POISON or some better name and
> > reject spill into such slots.
> >
>
> I agree this is simpler, but I'm not sure it will work properly. Verifier won't
> know when the lifetime of the buffer ends, so if we disallow spills until its
> written over it's going to be a pain for users.
>
> Something like:
>
> for (...) {
>         char buf[64];
>         bpf_dynptr_slice_rdwr(..., buf, 64);
>         ...
> }
>
> .. and then compiler decides to spill something where buf was located on stack
> outside the for loop. The verifier can't know when buf goes out of scope to
> unpoison the slots.

You're saying the "verifier doesn't know when buf ...".
The same applies to the compiler. It has no visibility
into what bpf_dynptr_slice_rdwr is doing.
So it never spills into a declared C array
as I tried to explain in the previous reply.
Spill/fill slots are always invisible to C.
(unless of course you do pointer arithmetic asm style)

> > > +       *(void **)eth = (void *)0xdeadbeef;
> > > +       ctx = *(void **)buffer;
> > > +       eth_proto = eth->eth_proto + ctx->len;
> > >         if (eth_proto == bpf_htons(ETH_P_IP))
> > >                 err = process_packet(&ptr, eth, nh_off, false, ctx);
> > >
> > > I think the proper fix is to treat it as a separate return type distinct from
> > > PTR_TO_MEM like PTR_TO_MEM_OR_PKT (or handle PTR_TO_MEM | DYNPTR_* specially),
> > > fork verifier state whenever there is a write, so that one path verifies it as
> > > PTR_TO_PACKET, while another as PTR_TO_STACK (if buffer was a stack ptr). I
> > > think for the rest it's not a problem, but there are allow_ptr_leak checks
> > > applied to PTR_TO_STACK and PTR_TO_MAP_VALUE, so that needs to be rechecked.
> > > Then we ensure that program is safe in either path.
> > >
> > > Also we need to fix regsafe to not consider other PTR_TO_MEMs equivalent to such
> > > a pointer. We could also fork verifier states on return, to verify either path
> > > separately right from the point following the call instruction.
> >
> > This is too complex imo.
>
> A better way to phrase this is to verify with R0 = PTR_TO_PACKET in one path,
> and push_stack with R0 = buffer's reg->type + size set to len in the other path
> for exploration later. In terms of verifier infra everything is there already,
> it just needs to analyze both cases which fall into the regular code handling
> the reg->type's. Probably then no adjustments to regsafe are needed either. It's
> like exploring branch instructions.

I still don't like it. There is no reason to go a complex path
when much simpler suffices.
Kumar Kartikeya Dwivedi March 7, 2023, 5:35 p.m. UTC | #7
On Tue, Mar 07, 2023 at 04:45:14PM CET, Alexei Starovoitov wrote:
> On Tue, Mar 7, 2023 at 2:22 AM Kumar Kartikeya Dwivedi <memxor@gmail.com> wrote:
> >
> > On Tue, Mar 07, 2023 at 03:23:25AM CET, Alexei Starovoitov wrote:
> > > On Sun, Mar 5, 2023 at 11:10 PM Kumar Kartikeya Dwivedi
> > > <memxor@gmail.com> wrote:
> > > >
> > > > On Wed, Mar 01, 2023 at 04:49:52PM CET, Joanne Koong wrote:
> > > > > Two new kfuncs are added, bpf_dynptr_slice and bpf_dynptr_slice_rdwr.
> > > > > The user must pass in a buffer to store the contents of the data slice
> > > > > if a direct pointer to the data cannot be obtained.
> > > > >
> > > > > For skb and xdp type dynptrs, these two APIs are the only way to obtain
> > > > > a data slice. However, for other types of dynptrs, there is no
> > > > > difference between bpf_dynptr_slice(_rdwr) and bpf_dynptr_data.
> > > > >
> > > > > For skb type dynptrs, the data is copied into the user provided buffer
> > > > > if any of the data is not in the linear portion of the skb. For xdp type
> > > > > dynptrs, the data is copied into the user provided buffer if the data is
> > > > > between xdp frags.
> > > > >
> > > > > If the skb is cloned and a call to bpf_dynptr_data_rdwr is made, then
> > > > > the skb will be uncloned (see bpf_unclone_prologue()).
> > > > >
> > > > > Please note that any bpf_dynptr_write() automatically invalidates any prior
> > > > > data slices of the skb dynptr. This is because the skb may be cloned or
> > > > > may need to pull its paged buffer into the head. As such, any
> > > > > bpf_dynptr_write() will automatically have its prior data slices
> > > > > invalidated, even if the write is to data in the skb head of an uncloned
> > > > > skb. Please note as well that any other helper calls that change the
> > > > > underlying packet buffer (eg bpf_skb_pull_data()) invalidates any data
> > > > > slices of the skb dynptr as well, for the same reasons.
> > > > >
> > > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > > > > ---
> > > >
> > > > Sorry for chiming in late.
> > > >
> > > > I see one potential hole in bpf_dynptr_slice_rdwr. If the returned pointer is
> > > > actually pointing to the stack (but verified as a PTR_TO_MEM in verifier state),
> > > > we won't reflect changes to the stack state in the verifier for writes happening
> > > > through it.
> > > >
> > > > For the worst case scenario, this will basically allow overwriting values of
> > > > spilled pointers and doing arbitrary kernel memory reads/writes. This is only an
> > > > issue when bpf_dynptr_slice_rdwr at runtime returns a pointer to the supplied
> > > > buffer residing on program stack. To verify, by forcing the memcpy to buffer for
> > > > skb_header_pointer I was able to make it dereference a garbage value for
> > > > l4lb_all selftest.
> > > >
> > > > --- a/kernel/bpf/helpers.c
> > > > +++ b/kernel/bpf/helpers.c
> > > > @@ -2253,7 +2253,13 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset
> > > >         case BPF_DYNPTR_TYPE_RINGBUF:
> > > >                 return ptr->data + ptr->offset + offset;
> > > >         case BPF_DYNPTR_TYPE_SKB:
> > > > -               return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer);
> > > > +       {
> > > > +               void *p = skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer);
> > > > +               if (p == buffer)
> > > > +                       return p;
> > > > +               memcpy(buffer, p, len);
> > > > +               return buffer;
> > > > +       }
> > > >
> > > > --- a/tools/testing/selftests/bpf/progs/test_l4lb_noinline_dynptr.c
> > > > +++ b/tools/testing/selftests/bpf/progs/test_l4lb_noinline_dynptr.c
> > > > @@ -470,7 +470,10 @@ int balancer_ingress(struct __sk_buff *ctx)
> > > >         eth = bpf_dynptr_slice_rdwr(&ptr, 0, buffer, sizeof(buffer));
> > > >         if (!eth)
> > > >                 return TC_ACT_SHOT;
> > > > -       eth_proto = eth->eth_proto;
> > > > +       *(void **)buffer = ctx;
> > >
> > > Great catch.
> > > To fix the issue I think we should simply disallow such
> > > stack abuse. The compiler won't be spilling registers
> > > into C array on the stack.
> > > This manual spill/fill is exploiting verifier logic.
> > > After bpf_dynptr_slice_rdwr() we can mark all slots of the
> > > buffer as STACK_POISON or some better name and
> > > reject spill into such slots.
> > >
> >
> > I agree this is simpler, but I'm not sure it will work properly. Verifier won't
> > know when the lifetime of the buffer ends, so if we disallow spills until its
> > written over it's going to be a pain for users.
> >
> > Something like:
> >
> > for (...) {
> >         char buf[64];
> >         bpf_dynptr_slice_rdwr(..., buf, 64);
> >         ...
> > }
> >
> > .. and then compiler decides to spill something where buf was located on stack
> > outside the for loop. The verifier can't know when buf goes out of scope to
> > unpoison the slots.
>
> You're saying the "verifier doesn't know when buf ...".
> The same applies to the compiler. It has no visibility
> into what bpf_dynptr_slice_rdwr is doing.

That is true, it can't assume anything about the side effects. But I am talking
about the point in the program when the buffer object no longer lives. Use of
the escaped pointer to such an object any longer is UB. The compiler is well
within its rights to reuse its stack storage at that point, including for
spilling registers. Which is why "outside the for loop" in my earlier reply.

> So it never spills into a declared C array
> as I tried to explain in the previous reply.
> Spill/fill slots are always invisible to C.
> (unless of course you do pointer arithmetic asm style)

When the declared array's lifetime ends, it can.
https://godbolt.org/z/Ez7v4xfnv

The 2nd call to bar as part of unrolled loop happens with fp-8, then it calls
baz, spills r0 to fp-8, and calls bar again with fp-8.

If such a stack slot is STACK_POISON, verifier will reject this program.

>
> > > > +       *(void **)eth = (void *)0xdeadbeef;
> > > > +       ctx = *(void **)buffer;
> > > > +       eth_proto = eth->eth_proto + ctx->len;
> > > >         if (eth_proto == bpf_htons(ETH_P_IP))
> > > >                 err = process_packet(&ptr, eth, nh_off, false, ctx);
> > > >
> > > > I think the proper fix is to treat it as a separate return type distinct from
> > > > PTR_TO_MEM like PTR_TO_MEM_OR_PKT (or handle PTR_TO_MEM | DYNPTR_* specially),
> > > > fork verifier state whenever there is a write, so that one path verifies it as
> > > > PTR_TO_PACKET, while another as PTR_TO_STACK (if buffer was a stack ptr). I
> > > > think for the rest it's not a problem, but there are allow_ptr_leak checks
> > > > applied to PTR_TO_STACK and PTR_TO_MAP_VALUE, so that needs to be rechecked.
> > > > Then we ensure that program is safe in either path.
> > > >
> > > > Also we need to fix regsafe to not consider other PTR_TO_MEMs equivalent to such
> > > > a pointer. We could also fork verifier states on return, to verify either path
> > > > separately right from the point following the call instruction.
> > >
> > > This is too complex imo.
> >
> > A better way to phrase this is to verify with R0 = PTR_TO_PACKET in one path,
> > and push_stack with R0 = buffer's reg->type + size set to len in the other path
> > for exploration later. In terms of verifier infra everything is there already,
> > it just needs to analyze both cases which fall into the regular code handling
> > the reg->type's. Probably then no adjustments to regsafe are needed either. It's
> > like exploring branch instructions.
>
> I still don't like it. There is no reason to go a complex path
> when much simpler suffices.
Andrii Nakryiko March 8, 2023, 12:01 a.m. UTC | #8
On Tue, Mar 7, 2023 at 9:35 AM Kumar Kartikeya Dwivedi <memxor@gmail.com> wrote:
>
> On Tue, Mar 07, 2023 at 04:45:14PM CET, Alexei Starovoitov wrote:
> > On Tue, Mar 7, 2023 at 2:22 AM Kumar Kartikeya Dwivedi <memxor@gmail.com> wrote:
> > >
> > > On Tue, Mar 07, 2023 at 03:23:25AM CET, Alexei Starovoitov wrote:
> > > > On Sun, Mar 5, 2023 at 11:10 PM Kumar Kartikeya Dwivedi
> > > > <memxor@gmail.com> wrote:
> > > > >
> > > > > On Wed, Mar 01, 2023 at 04:49:52PM CET, Joanne Koong wrote:
> > > > > > Two new kfuncs are added, bpf_dynptr_slice and bpf_dynptr_slice_rdwr.
> > > > > > The user must pass in a buffer to store the contents of the data slice
> > > > > > if a direct pointer to the data cannot be obtained.
> > > > > >
> > > > > > For skb and xdp type dynptrs, these two APIs are the only way to obtain
> > > > > > a data slice. However, for other types of dynptrs, there is no
> > > > > > difference between bpf_dynptr_slice(_rdwr) and bpf_dynptr_data.
> > > > > >
> > > > > > For skb type dynptrs, the data is copied into the user provided buffer
> > > > > > if any of the data is not in the linear portion of the skb. For xdp type
> > > > > > dynptrs, the data is copied into the user provided buffer if the data is
> > > > > > between xdp frags.
> > > > > >
> > > > > > If the skb is cloned and a call to bpf_dynptr_data_rdwr is made, then
> > > > > > the skb will be uncloned (see bpf_unclone_prologue()).
> > > > > >
> > > > > > Please note that any bpf_dynptr_write() automatically invalidates any prior
> > > > > > data slices of the skb dynptr. This is because the skb may be cloned or
> > > > > > may need to pull its paged buffer into the head. As such, any
> > > > > > bpf_dynptr_write() will automatically have its prior data slices
> > > > > > invalidated, even if the write is to data in the skb head of an uncloned
> > > > > > skb. Please note as well that any other helper calls that change the
> > > > > > underlying packet buffer (eg bpf_skb_pull_data()) invalidates any data
> > > > > > slices of the skb dynptr as well, for the same reasons.
> > > > > >
> > > > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > > > > > ---
> > > > >
> > > > > Sorry for chiming in late.
> > > > >
> > > > > I see one potential hole in bpf_dynptr_slice_rdwr. If the returned pointer is
> > > > > actually pointing to the stack (but verified as a PTR_TO_MEM in verifier state),
> > > > > we won't reflect changes to the stack state in the verifier for writes happening
> > > > > through it.
> > > > >
> > > > > For the worst case scenario, this will basically allow overwriting values of
> > > > > spilled pointers and doing arbitrary kernel memory reads/writes. This is only an
> > > > > issue when bpf_dynptr_slice_rdwr at runtime returns a pointer to the supplied
> > > > > buffer residing on program stack. To verify, by forcing the memcpy to buffer for
> > > > > skb_header_pointer I was able to make it dereference a garbage value for
> > > > > l4lb_all selftest.
> > > > >
> > > > > --- a/kernel/bpf/helpers.c
> > > > > +++ b/kernel/bpf/helpers.c
> > > > > @@ -2253,7 +2253,13 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset
> > > > >         case BPF_DYNPTR_TYPE_RINGBUF:
> > > > >                 return ptr->data + ptr->offset + offset;
> > > > >         case BPF_DYNPTR_TYPE_SKB:
> > > > > -               return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer);
> > > > > +       {
> > > > > +               void *p = skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer);
> > > > > +               if (p == buffer)
> > > > > +                       return p;
> > > > > +               memcpy(buffer, p, len);
> > > > > +               return buffer;
> > > > > +       }
> > > > >
> > > > > --- a/tools/testing/selftests/bpf/progs/test_l4lb_noinline_dynptr.c
> > > > > +++ b/tools/testing/selftests/bpf/progs/test_l4lb_noinline_dynptr.c
> > > > > @@ -470,7 +470,10 @@ int balancer_ingress(struct __sk_buff *ctx)
> > > > >         eth = bpf_dynptr_slice_rdwr(&ptr, 0, buffer, sizeof(buffer));
> > > > >         if (!eth)
> > > > >                 return TC_ACT_SHOT;
> > > > > -       eth_proto = eth->eth_proto;
> > > > > +       *(void **)buffer = ctx;
> > > >
> > > > Great catch.
> > > > To fix the issue I think we should simply disallow such
> > > > stack abuse. The compiler won't be spilling registers
> > > > into C array on the stack.
> > > > This manual spill/fill is exploiting verifier logic.
> > > > After bpf_dynptr_slice_rdwr() we can mark all slots of the
> > > > buffer as STACK_POISON or some better name and
> > > > reject spill into such slots.
> > > >
> > >
> > > I agree this is simpler, but I'm not sure it will work properly. Verifier won't
> > > know when the lifetime of the buffer ends, so if we disallow spills until its
> > > written over it's going to be a pain for users.
> > >
> > > Something like:
> > >
> > > for (...) {
> > >         char buf[64];
> > >         bpf_dynptr_slice_rdwr(..., buf, 64);
> > >         ...
> > > }
> > >
> > > .. and then compiler decides to spill something where buf was located on stack
> > > outside the for loop. The verifier can't know when buf goes out of scope to
> > > unpoison the slots.
> >
> > You're saying the "verifier doesn't know when buf ...".
> > The same applies to the compiler. It has no visibility
> > into what bpf_dynptr_slice_rdwr is doing.
>
> That is true, it can't assume anything about the side effects. But I am talking
> about the point in the program when the buffer object no longer lives. Use of
> the escaped pointer to such an object any longer is UB. The compiler is well
> within its rights to reuse its stack storage at that point, including for
> spilling registers. Which is why "outside the for loop" in my earlier reply.
>
> > So it never spills into a declared C array
> > as I tried to explain in the previous reply.
> > Spill/fill slots are always invisible to C.
> > (unless of course you do pointer arithmetic asm style)
>
> When the declared array's lifetime ends, it can.
> https://godbolt.org/z/Ez7v4xfnv
>
> The 2nd call to bar as part of unrolled loop happens with fp-8, then it calls
> baz, spills r0 to fp-8, and calls bar again with fp-8.
>
> If such a stack slot is STACK_POISON, verifier will reject this program.
>
> >
> > > > > +       *(void **)eth = (void *)0xdeadbeef;
> > > > > +       ctx = *(void **)buffer;
> > > > > +       eth_proto = eth->eth_proto + ctx->len;
> > > > >         if (eth_proto == bpf_htons(ETH_P_IP))
> > > > >                 err = process_packet(&ptr, eth, nh_off, false, ctx);
> > > > >
> > > > > I think the proper fix is to treat it as a separate return type distinct from
> > > > > PTR_TO_MEM like PTR_TO_MEM_OR_PKT (or handle PTR_TO_MEM | DYNPTR_* specially),
> > > > > fork verifier state whenever there is a write, so that one path verifies it as
> > > > > PTR_TO_PACKET, while another as PTR_TO_STACK (if buffer was a stack ptr). I
> > > > > think for the rest it's not a problem, but there are allow_ptr_leak checks
> > > > > applied to PTR_TO_STACK and PTR_TO_MAP_VALUE, so that needs to be rechecked.
> > > > > Then we ensure that program is safe in either path.
> > > > >
> > > > > Also we need to fix regsafe to not consider other PTR_TO_MEMs equivalent to such
> > > > > a pointer. We could also fork verifier states on return, to verify either path
> > > > > separately right from the point following the call instruction.
> > > >
> > > > This is too complex imo.
> > >
> > > A better way to phrase this is to verify with R0 = PTR_TO_PACKET in one path,
> > > and push_stack with R0 = buffer's reg->type + size set to len in the other path
> > > for exploration later. In terms of verifier infra everything is there already,
> > > it just needs to analyze both cases which fall into the regular code handling
> > > the reg->type's. Probably then no adjustments to regsafe are needed either. It's
> > > like exploring branch instructions.
> >
> > I still don't like it. There is no reason to go a complex path
> > when much simpler suffices.

This issue you are discussing is the reason we don't support
bpf_dynptr_from_mem() taking PTR_TO_STACK (which is a pity, but we
postponed it initially).

I've been thinking about something along the lines of STACK_POISON,
but remembering associated id/ref_obj_id. When ref is released, turn
STACK_POISON to STACK_MISC. If it's bpf_dynptr_slice_rdwr() or
bpf_dynptr_from_mem(), which don't have ref_obj_id, they still have ID
associated with returned pointer, so can we somehow incorporate that?
Alexei Starovoitov March 10, 2023, 9:15 p.m. UTC | #9
On Tue, Mar 07, 2023 at 04:01:28PM -0800, Andrii Nakryiko wrote:
> > > >
> > > > I agree this is simpler, but I'm not sure it will work properly. Verifier won't
> > > > know when the lifetime of the buffer ends, so if we disallow spills until its
> > > > written over it's going to be a pain for users.
> > > >
> > > > Something like:
> > > >
> > > > for (...) {
> > > >         char buf[64];
> > > >         bpf_dynptr_slice_rdwr(..., buf, 64);
> > > >         ...
> > > > }
> > > >
> > > > .. and then compiler decides to spill something where buf was located on stack
> > > > outside the for loop. The verifier can't know when buf goes out of scope to
> > > > unpoison the slots.
> > >
> > > You're saying the "verifier doesn't know when buf ...".
> > > The same applies to the compiler. It has no visibility
> > > into what bpf_dynptr_slice_rdwr is doing.
> >
> > That is true, it can't assume anything about the side effects. But I am talking
> > about the point in the program when the buffer object no longer lives. Use of
> > the escaped pointer to such an object any longer is UB. The compiler is well
> > within its rights to reuse its stack storage at that point, including for
> > spilling registers. Which is why "outside the for loop" in my earlier reply.
> >
> > > So it never spills into a declared C array
> > > as I tried to explain in the previous reply.
> > > Spill/fill slots are always invisible to C.
> > > (unless of course you do pointer arithmetic asm style)
> >
> > When the declared array's lifetime ends, it can.
> > https://godbolt.org/z/Ez7v4xfnv
> >
> > The 2nd call to bar as part of unrolled loop happens with fp-8, then it calls
> > baz, spills r0 to fp-8, and calls bar again with fp-8.

Right. If user writes such program and does explicit store of spillable
pointer into a stack.
I was talking about compiler generated spill/fill and I still believe
that compiler will not be reusing variable's stack memory for them.

> >
> > If such a stack slot is STACK_POISON, verifier will reject this program.

Yes and I think it's an ok trade-off.
The user has to specifically code such program to hit this issue.
I don't think we will see this in practice.
If we do we can consider a more complex fix.

> >
> > >
> > > > > > +       *(void **)eth = (void *)0xdeadbeef;
> > > > > > +       ctx = *(void **)buffer;
> > > > > > +       eth_proto = eth->eth_proto + ctx->len;
> > > > > >         if (eth_proto == bpf_htons(ETH_P_IP))
> > > > > >                 err = process_packet(&ptr, eth, nh_off, false, ctx);
> > > > > >
> > > > > > I think the proper fix is to treat it as a separate return type distinct from
> > > > > > PTR_TO_MEM like PTR_TO_MEM_OR_PKT (or handle PTR_TO_MEM | DYNPTR_* specially),
> > > > > > fork verifier state whenever there is a write, so that one path verifies it as
> > > > > > PTR_TO_PACKET, while another as PTR_TO_STACK (if buffer was a stack ptr). I
> > > > > > think for the rest it's not a problem, but there are allow_ptr_leak checks
> > > > > > applied to PTR_TO_STACK and PTR_TO_MAP_VALUE, so that needs to be rechecked.
> > > > > > Then we ensure that program is safe in either path.
> > > > > >
> > > > > > Also we need to fix regsafe to not consider other PTR_TO_MEMs equivalent to such
> > > > > > a pointer. We could also fork verifier states on return, to verify either path
> > > > > > separately right from the point following the call instruction.
> > > > >
> > > > > This is too complex imo.
> > > >
> > > > A better way to phrase this is to verify with R0 = PTR_TO_PACKET in one path,
> > > > and push_stack with R0 = buffer's reg->type + size set to len in the other path
> > > > for exploration later. In terms of verifier infra everything is there already,
> > > > it just needs to analyze both cases which fall into the regular code handling
> > > > the reg->type's. Probably then no adjustments to regsafe are needed either. It's
> > > > like exploring branch instructions.
> > >
> > > I still don't like it. There is no reason to go a complex path
> > > when much simpler suffices.
> 
> This issue you are discussing is the reason we don't support
> bpf_dynptr_from_mem() taking PTR_TO_STACK (which is a pity, but we
> postponed it initially).
> 
> I've been thinking about something along the lines of STACK_POISON,
> but remembering associated id/ref_obj_id. When ref is released, turn
> STACK_POISON to STACK_MISC. If it's bpf_dynptr_slice_rdrw() or
> bpf_dynptr_from_mem(), which don't have ref_obj_id, they still have ID
> associated with returned pointer, so can we somehow incorporate that?

There is dynptr_id in PTR_TO_MEM that is used by destroy_if_dynptr_stack_slot(),
but I don't see how we can use it to help this case.
imo plain STACK_POISON that is overwritable by STACK_MISC/STACK_ZERO
should be good enough in practice.

We can potentially do some liveness trick. When PTR_TO_MEM with dynptr_id becomes
REG_LIVE_DONE we can convert STACK_POISON. But I'd go with the simplest approach first.
Andrii Nakryiko March 10, 2023, 9:29 p.m. UTC | #10
On Fri, Mar 10, 2023 at 1:15 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Mar 07, 2023 at 04:01:28PM -0800, Andrii Nakryiko wrote:
> > > > >
> > > > > I agree this is simpler, but I'm not sure it will work properly. Verifier won't
> > > > > know when the lifetime of the buffer ends, so if we disallow spills until its
> > > > > written over it's going to be a pain for users.
> > > > >
> > > > > Something like:
> > > > >
> > > > > for (...) {
> > > > >         char buf[64];
> > > > >         bpf_dynptr_slice_rdwr(..., buf, 64);
> > > > >         ...
> > > > > }
> > > > >
> > > > > .. and then compiler decides to spill something where buf was located on stack
> > > > > outside the for loop. The verifier can't know when buf goes out of scope to
> > > > > unpoison the slots.
> > > >
> > > > You're saying the "verifier doesn't know when buf ...".
> > > > The same applies to the compiler. It has no visibility
> > > > into what bpf_dynptr_slice_rdwr is doing.
> > >
> > > That is true, it can't assume anything about the side effects. But I am talking
> > > about the point in the program when the buffer object no longer lives. Use of
> > > the escaped pointer to such an object any longer is UB. The compiler is well
> > > within its rights to reuse its stack storage at that point, including for
> > > spilling registers. Which is why "outside the for loop" in my earlier reply.
> > >
> > > > So it never spills into a declared C array
> > > > as I tried to explain in the previous reply.
> > > > Spill/fill slots are always invisible to C.
> > > > (unless of course you do pointer arithmetic asm style)
> > >
> > > When the declared array's lifetime ends, it can.
> > > https://godbolt.org/z/Ez7v4xfnv
> > >
> > > The 2nd call to bar as part of unrolled loop happens with fp-8, then it calls
> > > baz, spills r0 to fp-8, and calls bar again with fp-8.
>
> Right. If user writes such program and does explicit store of spillable
> pointer into a stack.
> I was talking about compiler generated spill/fill and I still believe
> that compiler will not be reusing variable's stack memory for them.
>
> > >
> > > If such a stack slot is STACK_POISON, verifier will reject this program.
>
> Yes and I think it's an ok trade-off.
> The user has to specifically code such program to hit this issue.
> I don't think we will see this in practice.
> If we do we can consider a more complex fix.

I was just debugging (a completely unrelated) issue where two
completely independent functions, with different local variables, were
reusing the same stack slots just because of them being inlined in
parent functions. So stack reuse happens all the time, unfortunately.
It's not always obvious or malicious.

>
> > >
> > > >
> > > > > > > +       *(void **)eth = (void *)0xdeadbeef;
> > > > > > > +       ctx = *(void **)buffer;
> > > > > > > +       eth_proto = eth->eth_proto + ctx->len;
> > > > > > >         if (eth_proto == bpf_htons(ETH_P_IP))
> > > > > > >                 err = process_packet(&ptr, eth, nh_off, false, ctx);
> > > > > > >
> > > > > > > I think the proper fix is to treat it as a separate return type distinct from
> > > > > > > PTR_TO_MEM like PTR_TO_MEM_OR_PKT (or handle PTR_TO_MEM | DYNPTR_* specially),
> > > > > > > fork verifier state whenever there is a write, so that one path verifies it as
> > > > > > > PTR_TO_PACKET, while another as PTR_TO_STACK (if buffer was a stack ptr). I
> > > > > > > think for the rest it's not a problem, but there are allow_ptr_leak checks
> > > > > > > applied to PTR_TO_STACK and PTR_TO_MAP_VALUE, so that needs to be rechecked.
> > > > > > > Then we ensure that program is safe in either path.
> > > > > > >
> > > > > > > Also we need to fix regsafe to not consider other PTR_TO_MEMs equivalent to such
> > > > > > > a pointer. We could also fork verifier states on return, to verify either path
> > > > > > > separately right from the point following the call instruction.
> > > > > >
> > > > > > This is too complex imo.
> > > > >
> > > > > A better way to phrase this is to verify with R0 = PTR_TO_PACKET in one path,
> > > > > and push_stack with R0 = buffer's reg->type + size set to len in the other path
> > > > > for exploration later. In terms of verifier infra everything is there already,
> > > > > it just needs to analyze both cases which fall into the regular code handling
> > > > > the reg->type's. Probably then no adjustments to regsafe are needed either. It's
> > > > > like exploring branch instructions.
> > > >
> > > > I still don't like it. There is no reason to go a complex path
> > > > when much simpler suffices.
> >
> > This issue you are discussing is the reason we don't support
> > bpf_dynptr_from_mem() taking PTR_TO_STACK (which is a pity, but we
> > postponed it initially).
> >
> > I've been thinking about something along the lines of STACK_POISON,
> > but remembering associated id/ref_obj_id. When ref is released, turn
> > STACK_POISON to STACK_MISC. If it's bpf_dynptr_slice_rdrw() or
> > bpf_dynptr_from_mem(), which don't have ref_obj_id, they still have ID
> > associated with returned pointer, so can we somehow incorporate that?
>
> There is dynptr_id in PTR_TO_MEM that is used by destroy_if_dynptr_stack_slot(),
> but I don't see how we can use it to help this case.
> imo plain STACK_POISON that is overwriteable by STACK_MISC/STACK_ZERO
> should be good enough in practice.

That's basically what I'm proposing, except when this overwrite
happens we have to go and invalidate all the PTR_TO_MEM references
that are pointing to that stack slot. E.g., in the below case
(assuming we allow LOCAL dynptr to be constructed from stack)

char buf[256], *p;
struct bpf_dynptr dptr;

bpf_dynptr_from_mem(buf, buf+256, &dptr);

p = bpf_dynptr_data(&dptr, 128, 16); /* get 16-byte slice into buf, at
offset 128 */

/* buf[128] through buf[128+16] are STACK_POISON */

buf[128] = 123;

So here is where the problem happens. Should we invalidate just p
here? Or entire dptr? Haven't thought much about details, but
something like that. It was getting messy when we started to think
about this with Joanne.

>
> We can potentially do some liveness trick. When PTR_TO_MEM with dynptr_id becomes
> REG_LIVE_DONE we can convert STACK_POISON. But I'd go with the simplest approach first.
Kumar Kartikeya Dwivedi March 10, 2023, 9:38 p.m. UTC | #11
On Fri, Mar 10, 2023 at 10:15:41PM CET, Alexei Starovoitov wrote:
> On Tue, Mar 07, 2023 at 04:01:28PM -0800, Andrii Nakryiko wrote:
> > > > >
> > > > > I agree this is simpler, but I'm not sure it will work properly. Verifier won't
> > > > > know when the lifetime of the buffer ends, so if we disallow spills until its
> > > > > written over it's going to be a pain for users.
> > > > >
> > > > > Something like:
> > > > >
> > > > > for (...) {
> > > > >         char buf[64];
> > > > >         bpf_dynptr_slice_rdwr(..., buf, 64);
> > > > >         ...
> > > > > }
> > > > >
> > > > > .. and then compiler decides to spill something where buf was located on stack
> > > > > outside the for loop. The verifier can't know when buf goes out of scope to
> > > > > unpoison the slots.
> > > >
> > > > You're saying the "verifier doesn't know when buf ...".
> > > > The same applies to the compiler. It has no visibility
> > > > into what bpf_dynptr_slice_rdwr is doing.
> > >
> > > That is true, it can't assume anything about the side effects. But I am talking
> > > about the point in the program when the buffer object no longer lives. Use of
> > > the escaped pointer to such an object any longer is UB. The compiler is well
> > > within its rights to reuse its stack storage at that point, including for
> > > spilling registers. Which is why "outside the for loop" in my earlier reply.
> > >
> > > > So it never spills into a declared C array
> > > > as I tried to explain in the previous reply.
> > > > Spill/fill slots are always invisible to C.
> > > > (unless of course you do pointer arithmetic asm style)
> > >
> > > When the declared array's lifetime ends, it can.
> > > https://godbolt.org/z/Ez7v4xfnv
> > >
> > > The 2nd call to bar as part of unrolled loop happens with fp-8, then it calls
> > > baz, spills r0 to fp-8, and calls bar again with fp-8.
>
> Right. If user writes such program and does explicit store of spillable
> pointer into a stack.
> I was talking about compiler generated spill/fill and I still believe
> that compiler will not be reusing variable's stack memory for them.
>

But that example on godbolt is about the _compiler_ doing spill into a
variable's stack memory, once it is dead. There is no explicit store to spill
from the user happening there.

Maybe buffer in explicit program scope {} is not that common, but always_inline
functions will have a similar effect, since they introduce a scope out of which
poisoned buffer's storage can be reused.

> [...]
Alexei Starovoitov March 10, 2023, 9:49 p.m. UTC | #12
On Fri, Mar 10, 2023 at 1:38 PM Kumar Kartikeya Dwivedi
<memxor@gmail.com> wrote:
>
> On Fri, Mar 10, 2023 at 10:15:41PM CET, Alexei Starovoitov wrote:
> > On Tue, Mar 07, 2023 at 04:01:28PM -0800, Andrii Nakryiko wrote:
> > > > > >
> > > > > > I agree this is simpler, but I'm not sure it will work properly. Verifier won't
> > > > > > know when the lifetime of the buffer ends, so if we disallow spills until its
> > > > > > written over it's going to be a pain for users.
> > > > > >
> > > > > > Something like:
> > > > > >
> > > > > > for (...) {
> > > > > >         char buf[64];
> > > > > >         bpf_dynptr_slice_rdwr(..., buf, 64);
> > > > > >         ...
> > > > > > }
> > > > > >
> > > > > > .. and then compiler decides to spill something where buf was located on stack
> > > > > > outside the for loop. The verifier can't know when buf goes out of scope to
> > > > > > unpoison the slots.
> > > > >
> > > > > You're saying the "verifier doesn't know when buf ...".
> > > > > The same applies to the compiler. It has no visibility
> > > > > into what bpf_dynptr_slice_rdwr is doing.
> > > >
> > > > That is true, it can't assume anything about the side effects. But I am talking
> > > > about the point in the program when the buffer object no longer lives. Use of
> > > > the escaped pointer to such an object any longer is UB. The compiler is well
> > > > within its rights to reuse its stack storage at that point, including for
> > > > spilling registers. Which is why "outside the for loop" in my earlier reply.
> > > >
> > > > > So it never spills into a declared C array
> > > > > as I tried to explain in the previous reply.
> > > > > Spill/fill slots are always invisible to C.
> > > > > (unless of course you do pointer arithmetic asm style)
> > > >
> > > > When the declared array's lifetime ends, it can.
> > > > https://godbolt.org/z/Ez7v4xfnv
> > > >
> > > > The 2nd call to bar as part of unrolled loop happens with fp-8, then it calls
> > > > baz, spills r0 to fp-8, and calls bar again with fp-8.
> >
> > Right. If user writes such program and does explicit store of spillable
> > pointer into a stack.
> > I was talking about compiler generated spill/fill and I still believe
> > that compiler will not be reusing variable's stack memory for them.
> >
>
> But that example on godbolt is about the _compiler_ doing spill into a
> variable's stack memory, once it is dead. There is no explicit store to spill
> from the user happening there.

Where do you see it?
It's
p = baz();
and subsequent &p.
That is user requested store.
Your example has other undefined behavior.
I tweaked it like this for clarity:
https://godbolt.org/z/qhcKdeWjb
Kumar Kartikeya Dwivedi March 10, 2023, 9:54 p.m. UTC | #13
On Fri, Mar 10, 2023 at 10:29:45PM CET, Andrii Nakryiko wrote:
> On Fri, Mar 10, 2023 at 1:15 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Tue, Mar 07, 2023 at 04:01:28PM -0800, Andrii Nakryiko wrote:
> > > > > >
> > > > > > I agree this is simpler, but I'm not sure it will work properly. Verifier won't
> > > > > > know when the lifetime of the buffer ends, so if we disallow spills until its
> > > > > > written over it's going to be a pain for users.
> > > > > >
> > > > > > Something like:
> > > > > >
> > > > > > for (...) {
> > > > > >         char buf[64];
> > > > > >         bpf_dynptr_slice_rdwr(..., buf, 64);
> > > > > >         ...
> > > > > > }
> > > > > >
> > > > > > .. and then compiler decides to spill something where buf was located on stack
> > > > > > outside the for loop. The verifier can't know when buf goes out of scope to
> > > > > > unpoison the slots.
> > > > >
> > > > > You're saying the "verifier doesn't know when buf ...".
> > > > > The same applies to the compiler. It has no visibility
> > > > > into what bpf_dynptr_slice_rdwr is doing.
> > > >
> > > > That is true, it can't assume anything about the side effects. But I am talking
> > > > about the point in the program when the buffer object no longer lives. Use of
> > > > the escaped pointer to such an object any longer is UB. The compiler is well
> > > > within its rights to reuse its stack storage at that point, including for
> > > > spilling registers. Which is why "outside the for loop" in my earlier reply.
> > > >
> > > > > So it never spills into a declared C array
> > > > > as I tried to explain in the previous reply.
> > > > > Spill/fill slots are always invisible to C.
> > > > > (unless of course you do pointer arithmetic asm style)
> > > >
> > > > When the declared array's lifetime ends, it can.
> > > > https://godbolt.org/z/Ez7v4xfnv
> > > >
> > > > The 2nd call to bar as part of unrolled loop happens with fp-8, then it calls
> > > > baz, spills r0 to fp-8, and calls bar again with fp-8.
> >
> > Right. If user writes such program and does explicit store of spillable
> > pointer into a stack.
> > I was talking about compiler generated spill/fill and I still believe
> > that compiler will not be reusing variable's stack memory for them.
> >
> > > >
> > > > If such a stack slot is STACK_POISON, verifier will reject this program.
> >
> > Yes and I think it's an ok trade-off.
> > The user has to specifically code such program to hit this issue.
> > I don't think we will see this in practice.
> > If we do we can consider a more complex fix.
>
> I was just debugging (a completely unrelated) issue where two
> completely independent functions, with different local variables, were
> reusing the same stack slots just because of them being inlined in
> parent functions. So stack reuse happens all the time, unfortunately.
> It's not always obvious or malicious.
>
> >
> > > >
> > > > >
> > > > > > > > +       *(void **)eth = (void *)0xdeadbeef;
> > > > > > > > +       ctx = *(void **)buffer;
> > > > > > > > +       eth_proto = eth->eth_proto + ctx->len;
> > > > > > > >         if (eth_proto == bpf_htons(ETH_P_IP))
> > > > > > > >                 err = process_packet(&ptr, eth, nh_off, false, ctx);
> > > > > > > >
> > > > > > > > I think the proper fix is to treat it as a separate return type distinct from
> > > > > > > > PTR_TO_MEM like PTR_TO_MEM_OR_PKT (or handle PTR_TO_MEM | DYNPTR_* specially),
> > > > > > > > fork verifier state whenever there is a write, so that one path verifies it as
> > > > > > > > PTR_TO_PACKET, while another as PTR_TO_STACK (if buffer was a stack ptr). I
> > > > > > > > think for the rest it's not a problem, but there are allow_ptr_leak checks
> > > > > > > > applied to PTR_TO_STACK and PTR_TO_MAP_VALUE, so that needs to be rechecked.
> > > > > > > > Then we ensure that program is safe in either path.
> > > > > > > >
> > > > > > > > Also we need to fix regsafe to not consider other PTR_TO_MEMs equivalent to such
> > > > > > > > a pointer. We could also fork verifier states on return, to verify either path
> > > > > > > > separately right from the point following the call instruction.
> > > > > > >
> > > > > > > This is too complex imo.
> > > > > >
> > > > > > A better way to phrase this is to verify with R0 = PTR_TO_PACKET in one path,
> > > > > > and push_stack with R0 = buffer's reg->type + size set to len in the other path
> > > > > > for exploration later. In terms of verifier infra everything is there already,
> > > > > > it just needs to analyze both cases which fall into the regular code handling
> > > > > > the reg->type's. Probably then no adjustments to regsafe are needed either. It's
> > > > > > like exploring branch instructions.
> > > > >
> > > > > I still don't like it. There is no reason to go a complex path
> > > > > when much simpler suffices.
> > >
> > > This issue you are discussing is the reason we don't support
> > > bpf_dynptr_from_mem() taking PTR_TO_STACK (which is a pity, but we
> > > postponed it initially).
> > >
> > > I've been thinking about something along the lines of STACK_POISON,
> > > but remembering associated id/ref_obj_id. When ref is released, turn
> > > STACK_POISON to STACK_MISC. If it's bpf_dynptr_slice_rdrw() or
> > > bpf_dynptr_from_mem(), which don't have ref_obj_id, they still have ID
> > > associated with returned pointer, so can we somehow incorporate that?
> >
> > There is dynptr_id in PTR_TO_MEM that is used by destroy_if_dynptr_stack_slot(),
> > but I don't see how we can use it to help this case.
> > imo plain STACK_POISON that is overwriteable by STACK_MISC/STACK_ZERO
> > should be good enough in practice.
>
> That's basically what I'm proposing, except when this overwrite
> happens we have to go and invalidate all the PTR_TO_MEM references
> that are pointing to that stack slot. E.g., in the below case
> (assuming we allow LOCAL dynptr to be constructed from stack)
>
> char buf[256], *p;
> struct bpf_dynptr dptr;
>
> bpf_dynptr_from_mem(buf, buf+256, &dptr);
>
> p = bpf_dynptr_data(&dptr, 128, 16); /* get 16-byte slice into buf, at
> offset 128 */
>
> /* buf[128] through buf[128+16] are STACK_POISON */
>
> buf[128] = 123;
>
> So here is where the problem happens. Should we invalidate just p
> here? Or entire dptr? Haven't thought much about details, but
> something like that. It was getting messy when we started to think
> about this with Joanne.
>

I think there's also the option (for your particular case) to conservatively
mark the entire range a dynptr pointing to stack represents as STACK_MISC
whenever a *write* happens (through bpf_dynptr_write or pointers obtained using
bpf_dynptr_data). We do know exact memory start and length when creating the
dynptr, right?

If somebody tries to be funky, e.g. by doing a spill and then trying to
overwrite its value, the entire range becomes STACK_MISC, so reload would just
mark the reg as unknown. You can be a bit smarter when you know the exact start
and length of stack memory e.g. bpf_dynptr_data pointer points to, but I'm
unsure that will be needed.

Otherwise things work as normal, users can spill stuff to the stack if they
want, and as long as they are not writing through the dynptr again, we don't
remark the entire range STACK_MISC. If that was the last use of dynptr and it
went out of scope, things work normally. If not, the dynptr and its buffer
should still be in scope so it won't be the compiler doing something funny
spilling stuff into it, only the user.

Due to STACK_INVALID complications over-eager remarking as STACK_MISC might only
make sense for privileged programs, but otherwise I think this is ok.

Am I missing something important?
Alexei Starovoitov March 10, 2023, 9:54 p.m. UTC | #14
On Fri, Mar 10, 2023 at 1:30 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Fri, Mar 10, 2023 at 1:15 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Tue, Mar 07, 2023 at 04:01:28PM -0800, Andrii Nakryiko wrote:
> > > > > >
> > > > > > I agree this is simpler, but I'm not sure it will work properly. Verifier won't
> > > > > > know when the lifetime of the buffer ends, so if we disallow spills until its
> > > > > > written over it's going to be a pain for users.
> > > > > >
> > > > > > Something like:
> > > > > >
> > > > > > for (...) {
> > > > > >         char buf[64];
> > > > > >         bpf_dynptr_slice_rdwr(..., buf, 64);
> > > > > >         ...
> > > > > > }
> > > > > >
> > > > > > .. and then compiler decides to spill something where buf was located on stack
> > > > > > outside the for loop. The verifier can't know when buf goes out of scope to
> > > > > > unpoison the slots.
> > > > >
> > > > > You're saying the "verifier doesn't know when buf ...".
> > > > > The same applies to the compiler. It has no visibility
> > > > > into what bpf_dynptr_slice_rdwr is doing.
> > > >
> > > > That is true, it can't assume anything about the side effects. But I am talking
> > > > about the point in the program when the buffer object no longer lives. Use of
> > > > the escaped pointer to such an object any longer is UB. The compiler is well
> > > > within its rights to reuse its stack storage at that point, including for
> > > > spilling registers. Which is why "outside the for loop" in my earlier reply.
> > > >
> > > > > So it never spills into a declared C array
> > > > > as I tried to explain in the previous reply.
> > > > > Spill/fill slots are always invisible to C.
> > > > > (unless of course you do pointer arithmetic asm style)
> > > >
> > > > When the declared array's lifetime ends, it can.
> > > > https://godbolt.org/z/Ez7v4xfnv
> > > >
> > > > The 2nd call to bar as part of unrolled loop happens with fp-8, then it calls
> > > > baz, spills r0 to fp-8, and calls bar again with fp-8.
> >
> > Right. If user writes such program and does explicit store of spillable
> > pointer into a stack.
> > I was talking about compiler generated spill/fill and I still believe
> > that compiler will not be reusing variable's stack memory for them.
> >
> > > >
> > > > If such a stack slot is STACK_POISON, verifier will reject this program.
> >
> > Yes and I think it's an ok trade-off.
> > The user has to specifically code such program to hit this issue.
> > I don't think we will see this in practice.
> > If we do we can consider a more complex fix.
>
> I was just debugging (a completely unrelated) issue where two
> completely independent functions, with different local variables, were
> reusing the same stack slots just because of them being inlined in
> parent functions. So stack reuse happens all the time, unfortunately.
> It's not always obvious or malicious.

Right. Stack reuse happens for variables all the time.
I'm still arguing that compile internal spill/fill is coming
from different slots.

When clang compiles the kernel it prints:
../kernel/bpf/verifier.c:18017:5: warning: stack frame size (2296)
exceeds limit (2048) in 'bpf_check' [-Wframe-larger-than]
int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
    ^
572/2296 (24.91%) spills, 1724/2296 (75.09%) variables

spills and variables are different areas.

> >
> > > >
> > > > >
> > > > > > > > +       *(void **)eth = (void *)0xdeadbeef;
> > > > > > > > +       ctx = *(void **)buffer;
> > > > > > > > +       eth_proto = eth->eth_proto + ctx->len;
> > > > > > > >         if (eth_proto == bpf_htons(ETH_P_IP))
> > > > > > > >                 err = process_packet(&ptr, eth, nh_off, false, ctx);
> > > > > > > >
> > > > > > > > I think the proper fix is to treat it as a separate return type distinct from
> > > > > > > > PTR_TO_MEM like PTR_TO_MEM_OR_PKT (or handle PTR_TO_MEM | DYNPTR_* specially),
> > > > > > > > fork verifier state whenever there is a write, so that one path verifies it as
> > > > > > > > PTR_TO_PACKET, while another as PTR_TO_STACK (if buffer was a stack ptr). I
> > > > > > > > think for the rest it's not a problem, but there are allow_ptr_leak checks
> > > > > > > > applied to PTR_TO_STACK and PTR_TO_MAP_VALUE, so that needs to be rechecked.
> > > > > > > > Then we ensure that program is safe in either path.
> > > > > > > >
> > > > > > > > Also we need to fix regsafe to not consider other PTR_TO_MEMs equivalent to such
> > > > > > > > a pointer. We could also fork verifier states on return, to verify either path
> > > > > > > > separately right from the point following the call instruction.
> > > > > > >
> > > > > > > This is too complex imo.
> > > > > >
> > > > > > A better way to phrase this is to verify with R0 = PTR_TO_PACKET in one path,
> > > > > > and push_stack with R0 = buffer's reg->type + size set to len in the other path
> > > > > > for exploration later. In terms of verifier infra everything is there already,
> > > > > > it just needs to analyze both cases which fall into the regular code handling
> > > > > > the reg->type's. Probably then no adjustments to regsafe are needed either. It's
> > > > > > like exploring branch instructions.
> > > > >
> > > > > I still don't like it. There is no reason to go a complex path
> > > > > when much simpler suffices.
> > >
> > > This issue you are discussing is the reason we don't support
> > > bpf_dynptr_from_mem() taking PTR_TO_STACK (which is a pity, but we
> > > postponed it initially).
> > >
> > > I've been thinking about something along the lines of STACK_POISON,
> > > but remembering associated id/ref_obj_id. When ref is released, turn
> > > STACK_POISON to STACK_MISC. If it's bpf_dynptr_slice_rdrw() or
> > > bpf_dynptr_from_mem(), which don't have ref_obj_id, they still have ID
> > > associated with returned pointer, so can we somehow incorporate that?
> >
> > There is dynptr_id in PTR_TO_MEM that is used by destroy_if_dynptr_stack_slot(),
> > but I don't see how we can use it to help this case.
> > imo plain STACK_POISON that is overwriteable by STACK_MISC/STACK_ZERO
> > should be good enough in practice.
>
> That's basically what I'm proposing, except when this overwrite
> happens we have to go and invalidate all the PTR_TO_MEM references
> that are pointing to that stack slot. E.g., in the below case
> (assuming we allow LOCAL dynptr to be constructed from stack)
>
> char buf[256], *p;
> struct bpf_dynptr dptr;
>
> bpf_dynptr_from_mem(buf, buf+256, &dptr);
>
> p = bpf_dynptr_data(&dptr, 128, 16); /* get 16-byte slice into buf, at
> offset 128 */
>
> /* buf[128] through buf[128+16] are STACK_POISON */
>
> buf[128] = 123;
>
> So here is where the problem happens. Should we invalidate just p
> here? Or entire dptr? Haven't thought much about details, but
> something like that. It was getting messy when we started to think
> about this with Joanne.

Let's table dynptr_from_mem for a second and solve
bpf_dynptr_slice_rdwr first, since I'm getting confused.

For bpf_dynptr_slice_rdwr we can mark buffer[] in stack as
poisoned with dynptr_id == R0's PTR_TO_MEM dynptr_id.
Then as soon as first spillable reg touches that poisoned stack area
we can invalidate all PTR_TO_MEM's with that dynptr_id.
Joanne Koong March 13, 2023, 6:31 a.m. UTC | #15
On Fri, Mar 10, 2023 at 1:55 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Fri, Mar 10, 2023 at 1:30 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Fri, Mar 10, 2023 at 1:15 PM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Tue, Mar 07, 2023 at 04:01:28PM -0800, Andrii Nakryiko wrote:
> > > > > > >
> > > > > > > I agree this is simpler, but I'm not sure it will work properly. Verifier won't
> > > > > > > know when the lifetime of the buffer ends, so if we disallow spills until its
> > > > > > > written over it's going to be a pain for users.
> > > > > > >
> > > > > > > Something like:
> > > > > > >
> > > > > > > for (...) {
> > > > > > >         char buf[64];
> > > > > > >         bpf_dynptr_slice_rdwr(..., buf, 64);
> > > > > > >         ...
> > > > > > > }
> > > > > > >
> > > > > > > .. and then compiler decides to spill something where buf was located on stack
> > > > > > > outside the for loop. The verifier can't know when buf goes out of scope to
> > > > > > > unpoison the slots.
> > > > > >
> > > > > > You're saying the "verifier doesn't know when buf ...".
> > > > > > The same applies to the compiler. It has no visibility
> > > > > > into what bpf_dynptr_slice_rdwr is doing.
> > > > >
> > > > > That is true, it can't assume anything about the side effects. But I am talking
> > > > > about the point in the program when the buffer object no longer lives. Use of
> > > > > the escaped pointer to such an object any longer is UB. The compiler is well
> > > > > within its rights to reuse its stack storage at that point, including for
> > > > > spilling registers. Which is why "outside the for loop" in my earlier reply.
> > > > >
> > > > > > So it never spills into a declared C array
> > > > > > as I tried to explain in the previous reply.
> > > > > > Spill/fill slots are always invisible to C.
> > > > > > (unless of course you do pointer arithmetic asm style)
> > > > >
> > > > > When the declared array's lifetime ends, it can.
> > > > > https://godbolt.org/z/Ez7v4xfnv
> > > > >
> > > > > The 2nd call to bar as part of unrolled loop happens with fp-8, then it calls
> > > > > baz, spills r0 to fp-8, and calls bar again with fp-8.
> > >
> > > Right. If user writes such program and does explicit store of spillable
> > > pointer into a stack.
> > > I was talking about compiler generated spill/fill and I still believe
> > > that compiler will not be reusing variable's stack memory for them.
> > >
> > > > >
> > > > > If such a stack slot is STACK_POISON, verifier will reject this program.
> > >
> > > Yes and I think it's an ok trade-off.
> > > The user has to specifically code such program to hit this issue.
> > > I don't think we will see this in practice.
> > > If we do we can consider a more complex fix.
> >
> > I was just debugging (a completely unrelated) issue where two
> > completely independent functions, with different local variables, were
> > reusing the same stack slots just because of them being inlined in
> > parent functions. So stack reuse happens all the time, unfortunately.
> > It's not always obvious or malicious.
>
> Right. Stack reuse happens for variables all the time.
> I'm still arguing that compile internal spill/fill is coming
> from different slots.
>
> When clang compiles the kernel it prints:
> ../kernel/bpf/verifier.c:18017:5: warning: stack frame size (2296)
> exceeds limit (2048) in 'bpf_check' [-Wframe-larger-than]
> int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
>     ^
> 572/2296 (24.91%) spills, 1724/2296 (75.09%) variables
>
> spills and variables are different areas.
>
> > >
> > > > >
> > > > > >
> > > > > > > > > +       *(void **)eth = (void *)0xdeadbeef;
> > > > > > > > > +       ctx = *(void **)buffer;
> > > > > > > > > +       eth_proto = eth->eth_proto + ctx->len;
> > > > > > > > >         if (eth_proto == bpf_htons(ETH_P_IP))
> > > > > > > > >                 err = process_packet(&ptr, eth, nh_off, false, ctx);
> > > > > > > > >
> > > > > > > > > I think the proper fix is to treat it as a separate return type distinct from
> > > > > > > > > PTR_TO_MEM like PTR_TO_MEM_OR_PKT (or handle PTR_TO_MEM | DYNPTR_* specially),
> > > > > > > > > fork verifier state whenever there is a write, so that one path verifies it as
> > > > > > > > > PTR_TO_PACKET, while another as PTR_TO_STACK (if buffer was a stack ptr). I
> > > > > > > > > think for the rest it's not a problem, but there are allow_ptr_leak checks
> > > > > > > > > applied to PTR_TO_STACK and PTR_TO_MAP_VALUE, so that needs to be rechecked.
> > > > > > > > > Then we ensure that program is safe in either path.
> > > > > > > > >
> > > > > > > > > Also we need to fix regsafe to not consider other PTR_TO_MEMs equivalent to such
> > > > > > > > > a pointer. We could also fork verifier states on return, to verify either path
> > > > > > > > > separately right from the point following the call instruction.
> > > > > > > >
> > > > > > > > This is too complex imo.
> > > > > > >
> > > > > > > A better way to phrase this is to verify with R0 = PTR_TO_PACKET in one path,
> > > > > > > and push_stack with R0 = buffer's reg->type + size set to len in the other path
> > > > > > > for exploration later. In terms of verifier infra everything is there already,
> > > > > > > it just needs to analyze both cases which fall into the regular code handling
> > > > > > > the reg->type's. Probably then no adjustments to regsafe are needed either. It's
> > > > > > > like exploring branch instructions.
> > > > > >
> > > > > > I still don't like it. There is no reason to go a complex path
> > > > > > when much simpler suffices.
> > > >
> > > > This issue you are discussing is the reason we don't support
> > > > bpf_dynptr_from_mem() taking PTR_TO_STACK (which is a pity, but we
> > > > postponed it initially).
> > > >
> > > > I've been thinking about something along the lines of STACK_POISON,
> > > > but remembering associated id/ref_obj_id. When ref is released, turn
> > > > STACK_POISON to STACK_MISC. If it's bpf_dynptr_slice_rdrw() or
> > > > bpf_dynptr_from_mem(), which don't have ref_obj_id, they still have ID
> > > > associated with returned pointer, so can we somehow incorporate that?
> > >
> > > There is dynptr_id in PTR_TO_MEM that is used by destroy_if_dynptr_stack_slot(),
> > > but I don't see how we can use it to help this case.
> > > imo plain STACK_POISON that is overwriteable by STACK_MISC/STACK_ZERO
> > > should be good enough in practice.
> >
> > That's basically what I'm proposing, except when this overwrite
> > happens we have to go and invalidate all the PTR_TO_MEM references
> > that are pointing to that stack slot. E.g., in the below case
> > (assuming we allow LOCAL dynptr to be constructed from stack)
> >
> > char buf[256], *p;
> > struct bpf_dynptr dptr;
> >
> > bpf_dynptr_from_mem(buf, buf+256, &dptr);
> >
> > p = bpf_dynptr_data(&dptr, 128, 16); /* get 16-byte slice into buf, at
> > offset 128 */
> >
> > /* buf[128] through buf[128+16] are STACK_POISON */
> >
> > buf[128] = 123;
> >
> > So here is where the problem happens. Should we invalidate just p
> > here? Or entire dptr? Haven't thought much about details, but
> > something like that. It was getting messy when we started to think
> > about this with Joanne.
>
> Let's table dynptr_from_mem for a second and solve
> bpf_dynptr_slice_rdrw first, since I'm getting confused.
>
> For bpf_dynptr_slice_rdrw we can mark buffer[] in stack as
> poisoned with dynptr_id == R0's PTR_TO_MEM dynptr_id.
> Then as soon as first spillable reg touches that poisoned stack area
> we can invalidate all PTR_TO_MEM's with that dynptr_id.

Okay, this makes sense to me. are you already currently working or
planning to work on a fix for this Kumar, or should i take a stab at
it?
Kumar Kartikeya Dwivedi March 13, 2023, 2:41 p.m. UTC | #16
On Mon, Mar 13, 2023 at 07:31:03AM CET, Joanne Koong wrote:
> On Fri, Mar 10, 2023 at 1:55 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Fri, Mar 10, 2023 at 1:30 PM Andrii Nakryiko
> > <andrii.nakryiko@gmail.com> wrote:
> > >
> > > On Fri, Mar 10, 2023 at 1:15 PM Alexei Starovoitov
> > > <alexei.starovoitov@gmail.com> wrote:
> > > >
> > > > On Tue, Mar 07, 2023 at 04:01:28PM -0800, Andrii Nakryiko wrote:
> > > > > > > >
> > > > > > > > I agree this is simpler, but I'm not sure it will work properly. Verifier won't
> > > > > > > > know when the lifetime of the buffer ends, so if we disallow spills until its
> > > > > > > > written over it's going to be a pain for users.
> > > > > > > >
> > > > > > > > Something like:
> > > > > > > >
> > > > > > > > for (...) {
> > > > > > > >         char buf[64];
> > > > > > > >         bpf_dynptr_slice_rdwr(..., buf, 64);
> > > > > > > >         ...
> > > > > > > > }
> > > > > > > >
> > > > > > > > .. and then compiler decides to spill something where buf was located on stack
> > > > > > > > outside the for loop. The verifier can't know when buf goes out of scope to
> > > > > > > > unpoison the slots.
> > > > > > >
> > > > > > > You're saying the "verifier doesn't know when buf ...".
> > > > > > > The same applies to the compiler. It has no visibility
> > > > > > > into what bpf_dynptr_slice_rdwr is doing.
> > > > > >
> > > > > > That is true, it can't assume anything about the side effects. But I am talking
> > > > > > about the point in the program when the buffer object no longer lives. Use of
> > > > > > the escaped pointer to such an object any longer is UB. The compiler is well
> > > > > > within its rights to reuse its stack storage at that point, including for
> > > > > > spilling registers. Which is why "outside the for loop" in my earlier reply.
> > > > > >
> > > > > > > So it never spills into a declared C array
> > > > > > > as I tried to explain in the previous reply.
> > > > > > > Spill/fill slots are always invisible to C.
> > > > > > > (unless of course you do pointer arithmetic asm style)
> > > > > >
> > > > > > When the declared array's lifetime ends, it can.
> > > > > > https://godbolt.org/z/Ez7v4xfnv
> > > > > >
> > > > > > The 2nd call to bar as part of unrolled loop happens with fp-8, then it calls
> > > > > > baz, spills r0 to fp-8, and calls bar again with fp-8.
> > > >
> > > > Right. If user writes such program and does explicit store of spillable
> > > > pointer into a stack.
> > > > I was talking about compiler generated spill/fill and I still believe
> > > > that compiler will not be reusing variable's stack memory for them.
> > > >
> > > > > >
> > > > > > If such a stack slot is STACK_POISON, verifier will reject this program.
> > > >
> > > > Yes and I think it's an ok trade-off.
> > > > The user has to specifically code such program to hit this issue.
> > > > I don't think we will see this in practice.
> > > > If we do we can consider a more complex fix.
> > >
> > > I was just debugging (a completely unrelated) issue where two
> > > completely independent functions, with different local variables, were
> > > reusing the same stack slots just because of them being inlined in
> > > parent functions. So stack reuse happens all the time, unfortunately.
> > > It's not always obvious or malicious.
> >
> > Right. Stack reuse happens for variables all the time.
> > I'm still arguing that compile internal spill/fill is coming
> > from different slots.
> >
> > When clang compiles the kernel it prints:
> > ../kernel/bpf/verifier.c:18017:5: warning: stack frame size (2296)
> > exceeds limit (2048) in 'bpf_check' [-Wframe-larger-than]
> > int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
> >     ^
> > 572/2296 (24.91%) spills, 1724/2296 (75.09%) variables
> >
> > spills and variables are different areas.
> >
> > > >
> > > > > >
> > > > > > >
> > > > > > > > > > +       *(void **)eth = (void *)0xdeadbeef;
> > > > > > > > > > +       ctx = *(void **)buffer;
> > > > > > > > > > +       eth_proto = eth->eth_proto + ctx->len;
> > > > > > > > > >         if (eth_proto == bpf_htons(ETH_P_IP))
> > > > > > > > > >                 err = process_packet(&ptr, eth, nh_off, false, ctx);
> > > > > > > > > >
> > > > > > > > > > I think the proper fix is to treat it as a separate return type distinct from
> > > > > > > > > > PTR_TO_MEM like PTR_TO_MEM_OR_PKT (or handle PTR_TO_MEM | DYNPTR_* specially),
> > > > > > > > > > fork verifier state whenever there is a write, so that one path verifies it as
> > > > > > > > > > PTR_TO_PACKET, while another as PTR_TO_STACK (if buffer was a stack ptr). I
> > > > > > > > > > think for the rest it's not a problem, but there are allow_ptr_leak checks
> > > > > > > > > > applied to PTR_TO_STACK and PTR_TO_MAP_VALUE, so that needs to be rechecked.
> > > > > > > > > > Then we ensure that program is safe in either path.
> > > > > > > > > >
> > > > > > > > > > Also we need to fix regsafe to not consider other PTR_TO_MEMs equivalent to such
> > > > > > > > > > a pointer. We could also fork verifier states on return, to verify either path
> > > > > > > > > > separately right from the point following the call instruction.
> > > > > > > > >
> > > > > > > > > This is too complex imo.
> > > > > > > >
> > > > > > > > A better way to phrase this is to verify with R0 = PTR_TO_PACKET in one path,
> > > > > > > > and push_stack with R0 = buffer's reg->type + size set to len in the other path
> > > > > > > > for exploration later. In terms of verifier infra everything is there already,
> > > > > > > > it just needs to analyze both cases which fall into the regular code handling
> > > > > > > > the reg->type's. Probably then no adjustments to regsafe are needed either. It's
> > > > > > > > like exploring branch instructions.
> > > > > > >
> > > > > > > I still don't like it. There is no reason to go a complex path
> > > > > > > when much simpler suffices.
> > > > >
> > > > > This issue you are discussing is the reason we don't support
> > > > > bpf_dynptr_from_mem() taking PTR_TO_STACK (which is a pity, but we
> > > > > postponed it initially).
> > > > >
> > > > > I've been thinking about something along the lines of STACK_POISON,
> > > > > but remembering associated id/ref_obj_id. When ref is released, turn
> > > > > STACK_POISON to STACK_MISC. If it's bpf_dynptr_slice_rdrw() or
> > > > > bpf_dynptr_from_mem(), which don't have ref_obj_id, they still have ID
> > > > > associated with returned pointer, so can we somehow incorporate that?
> > > >
> > > > There is dynptr_id in PTR_TO_MEM that is used by destroy_if_dynptr_stack_slot(),
> > > > but I don't see how we can use it to help this case.
> > > > imo plain STACK_POISON that is overwriteable by STACK_MISC/STACK_ZERO
> > > > should be good enough in practice.
> > >
> > > That's basically what I'm proposing, except when this overwrite
> > > happens we have to go and invalidate all the PTR_TO_MEM references
> > > that are pointing to that stack slot. E.g., in the below case
> > > (assuming we allow LOCAL dynptr to be constructed from stack)
> > >
> > > char buf[256], *p;
> > > struct bpf_dynptr dptr;
> > >
> > > bpf_dynptr_from_mem(buf, buf+256, &dptr);
> > >
> > > p = bpf_dynptr_data(&dptr, 128, 16); /* get 16-byte slice into buf, at
> > > offset 128 */
> > >
> > > /* buf[128] through buf[128+16] are STACK_POISON */
> > >
> > > buf[128] = 123;
> > >
> > > So here is where the problem happens. Should we invalidate just p
> > > here? Or entire dptr? Haven't thought much about details, but
> > > something like that. It was getting messy when we started to think
> > > about this with Joanne.
> >
> > Let's table dynptr_from_mem for a second and solve
> > bpf_dynptr_slice_rdrw first, since I'm getting confused.
> >
> > For bpf_dynptr_slice_rdrw we can mark buffer[] in stack as
> > poisoned with dynptr_id == R0's PTR_TO_MEM dynptr_id.
> > Then as soon as first spillable reg touches that poisoned stack area
> > we can invalidate all PTR_TO_MEM's with that dynptr_id.
>
> Okay, this makes sense to me. are you already currently working or
> planning to work on a fix for this Kumar, or should i take a stab at
> it?

I'm not planning to do so, so go ahead. One more thing I noticed just now is
that we probably need to update regsafe to perform a check_ids comparison for
dynptr_id for dynptr PTR_TO_MEMs? It was not a problem back when f8064ab90d66
("bpf: Invalidate slices on destruction of dynptrs on stack") was added but
567da5d253cd ("bpf: improve regsafe() checks for PTR_TO_{MEM,BUF,TP_BUFFER}")
added PTR_TO_MEM in the switch statement.
Andrii Nakryiko March 16, 2023, 6:55 p.m. UTC | #17
On Mon, Mar 13, 2023 at 7:41 AM Kumar Kartikeya Dwivedi
<memxor@gmail.com> wrote:
>
> On Mon, Mar 13, 2023 at 07:31:03AM CET, Joanne Koong wrote:
> > On Fri, Mar 10, 2023 at 1:55 PM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Fri, Mar 10, 2023 at 1:30 PM Andrii Nakryiko
> > > <andrii.nakryiko@gmail.com> wrote:
> > > >
> > > > On Fri, Mar 10, 2023 at 1:15 PM Alexei Starovoitov
> > > > <alexei.starovoitov@gmail.com> wrote:
> > > > >
> > > > > On Tue, Mar 07, 2023 at 04:01:28PM -0800, Andrii Nakryiko wrote:
> > > > > > > > >
> > > > > > > > > I agree this is simpler, but I'm not sure it will work properly. Verifier won't
> > > > > > > > > know when the lifetime of the buffer ends, so if we disallow spills until its
> > > > > > > > > written over it's going to be a pain for users.
> > > > > > > > >
> > > > > > > > > Something like:
> > > > > > > > >
> > > > > > > > > for (...) {
> > > > > > > > >         char buf[64];
> > > > > > > > >         bpf_dynptr_slice_rdwr(..., buf, 64);
> > > > > > > > >         ...
> > > > > > > > > }
> > > > > > > > >
> > > > > > > > > .. and then compiler decides to spill something where buf was located on stack
> > > > > > > > > outside the for loop. The verifier can't know when buf goes out of scope to
> > > > > > > > > unpoison the slots.
> > > > > > > >
> > > > > > > > You're saying the "verifier doesn't know when buf ...".
> > > > > > > > The same applies to the compiler. It has no visibility
> > > > > > > > into what bpf_dynptr_slice_rdwr is doing.
> > > > > > >
> > > > > > > That is true, it can't assume anything about the side effects. But I am talking
> > > > > > > about the point in the program when the buffer object no longer lives. Use of
> > > > > > > the escaped pointer to such an object any longer is UB. The compiler is well
> > > > > > > within its rights to reuse its stack storage at that point, including for
> > > > > > > spilling registers. Which is why "outside the for loop" in my earlier reply.
> > > > > > >
> > > > > > > > So it never spills into a declared C array
> > > > > > > > as I tried to explain in the previous reply.
> > > > > > > > Spill/fill slots are always invisible to C.
> > > > > > > > (unless of course you do pointer arithmetic asm style)
> > > > > > >
> > > > > > > When the declared array's lifetime ends, it can.
> > > > > > > https://godbolt.org/z/Ez7v4xfnv
> > > > > > >
> > > > > > > The 2nd call to bar as part of unrolled loop happens with fp-8, then it calls
> > > > > > > baz, spills r0 to fp-8, and calls bar again with fp-8.
> > > > >
> > > > > Right. If user writes such program and does explicit store of spillable
> > > > > pointer into a stack.
> > > > > I was talking about compiler generated spill/fill and I still believe
> > > > > that compiler will not be reusing variable's stack memory for them.
> > > > >
> > > > > > >
> > > > > > > If such a stack slot is STACK_POISON, verifier will reject this program.
> > > > >
> > > > > Yes and I think it's an ok trade-off.
> > > > > The user has to specifically code such program to hit this issue.
> > > > > I don't think we will see this in practice.
> > > > > If we do we can consider a more complex fix.
> > > >
> > > > I was just debugging (a completely unrelated) issue where two
> > > > completely independent functions, with different local variables, were
> > > > reusing the same stack slots just because of them being inlined in
> > > > parent functions. So stack reuse happens all the time, unfortunately.
> > > > It's not always obvious or malicious.
> > >
> > > Right. Stack reuse happens for variables all the time.
> > > I'm still arguing that compile internal spill/fill is coming
> > > from different slots.
> > >
> > > When clang compiles the kernel it prints:
> > > ../kernel/bpf/verifier.c:18017:5: warning: stack frame size (2296)
> > > exceeds limit (2048) in 'bpf_check' [-Wframe-larger-than]
> > > int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
> > >     ^
> > > 572/2296 (24.91%) spills, 1724/2296 (75.09%) variables
> > >
> > > spills and variables are different areas.
> > >
> > > > >
> > > > > > >
> > > > > > > >
> > > > > > > > > > > +       *(void **)eth = (void *)0xdeadbeef;
> > > > > > > > > > > +       ctx = *(void **)buffer;
> > > > > > > > > > > +       eth_proto = eth->eth_proto + ctx->len;
> > > > > > > > > > >         if (eth_proto == bpf_htons(ETH_P_IP))
> > > > > > > > > > >                 err = process_packet(&ptr, eth, nh_off, false, ctx);
> > > > > > > > > > >
> > > > > > > > > > > I think the proper fix is to treat it as a separate return type distinct from
> > > > > > > > > > > PTR_TO_MEM like PTR_TO_MEM_OR_PKT (or handle PTR_TO_MEM | DYNPTR_* specially),
> > > > > > > > > > > fork verifier state whenever there is a write, so that one path verifies it as
> > > > > > > > > > > PTR_TO_PACKET, while another as PTR_TO_STACK (if buffer was a stack ptr). I
> > > > > > > > > > > think for the rest it's not a problem, but there are allow_ptr_leak checks
> > > > > > > > > > > applied to PTR_TO_STACK and PTR_TO_MAP_VALUE, so that needs to be rechecked.
> > > > > > > > > > > Then we ensure that program is safe in either path.
> > > > > > > > > > >
> > > > > > > > > > > Also we need to fix regsafe to not consider other PTR_TO_MEMs equivalent to such
> > > > > > > > > > > a pointer. We could also fork verifier states on return, to verify either path
> > > > > > > > > > > separately right from the point following the call instruction.
> > > > > > > > > >
> > > > > > > > > > This is too complex imo.
> > > > > > > > >
> > > > > > > > > A better way to phrase this is to verify with R0 = PTR_TO_PACKET in one path,
> > > > > > > > > and push_stack with R0 = buffer's reg->type + size set to len in the other path
> > > > > > > > > for exploration later. In terms of verifier infra everything is there already,
> > > > > > > > > it just needs to analyze both cases which fall into the regular code handling
> > > > > > > > > the reg->type's. Probably then no adjustments to regsafe are needed either. It's
> > > > > > > > > like exploring branch instructions.
> > > > > > > >
> > > > > > > > I still don't like it. There is no reason to go a complex path
> > > > > > > > when much simpler suffices.
> > > > > >
> > > > > > This issue you are discussing is the reason we don't support
> > > > > > bpf_dynptr_from_mem() taking PTR_TO_STACK (which is a pity, but we
> > > > > > postponed it initially).
> > > > > >
> > > > > > I've been thinking about something along the lines of STACK_POISON,
> > > > > > but remembering associated id/ref_obj_id. When ref is released, turn
> > > > > > STACK_POISON to STACK_MISC. If it's bpf_dynptr_slice_rdrw() or
> > > > > > bpf_dynptr_from_mem(), which don't have ref_obj_id, they still have ID
> > > > > > associated with returned pointer, so can we somehow incorporate that?
> > > > >
> > > > > There is dynptr_id in PTR_TO_MEM that is used by destroy_if_dynptr_stack_slot(),
> > > > > but I don't see how we can use it to help this case.
> > > > > imo plain STACK_POISON that is overwriteable by STACK_MISC/STACK_ZERO
> > > > > should be good enough in practice.
> > > >
> > > > That's basically what I'm proposing, except when this overwrite
> > > > happens we have to go and invalidate all the PTR_TO_MEM references
> > > > that are pointing to that stack slot. E.g., in the below case
> > > > (assuming we allow LOCAL dynptr to be constructed from stack)
> > > >
> > > > char buf[256], *p;
> > > > struct bpf_dynptr dptr;
> > > >
> > > > bpf_dynptr_from_mem(buf, buf+256, &dptr);
> > > >
> > > > p = bpf_dynptr_data(&dptr, 128, 16); /* get 16-byte slice into buf, at
> > > > offset 128 */
> > > >
> > > > /* buf[128] through buf[128+16] are STACK_POISON */
> > > >
> > > > buf[128] = 123;
> > > >
> > > > So here is where the problem happens. Should we invalidate just p
> > > > here? Or entire dptr? Haven't thought much about details, but
> > > > something like that. It was getting messy when we started to think
> > > > about this with Joanne.
> > >
> > > Let's table dynptr_from_mem for a second and solve
> > > bpf_dynptr_slice_rdrw first, since I'm getting confused.
> > >
> > > For bpf_dynptr_slice_rdrw we can mark buffer[] in stack as
> > > poisoned with dynptr_id == R0's PTR_TO_MEM dynptr_id.
> > > Then as soon as first spillable reg touches that poisoned stack area
> > > we can invalidate all PTR_TO_MEM's with that dynptr_id.
> >
> > Okay, this makes sense to me. are you already currently working or
> > planning to work on a fix for this Kumar, or should i take a stab at
> > it?
>
> I'm not planning to do so, so go ahead. One more thing I noticed just now is
> that we probably need to update regsafe to perform a check_ids comparison for
> dynptr_id for dynptr PTR_TO_MEMs? It was not a problem back when f8064ab90d66
> ("bpf: Invalidate slices on destruction of dynptrs on stack") was added but
> 567da5d253cd ("bpf: improve regsafe() checks for PTR_TO_{MEM,BUF,TP_BUFFER}")
> added PTR_TO_MEM in the switch statement.

I can take care of this. But I really would like to avoid these
special cases of extra dynptr_id, exactly for reasons like this
omitted check.

What do people think about generalizing current ref_obj_id to be more
like "lifetime id" (to borrow Rust terminology a bit), which would be
an object (which might or might not be a tracked reference) defining
the scope/lifetime of the current register (whatever it represents).

I haven't looked through code much, but I've been treating ref_obj_id
as that already in my thinking before, and it seems to be a better
approach than having a special-case of dynptr_id.

Thoughts?
Joanne Koong March 27, 2023, 7:47 a.m. UTC | #18
On Thu, Mar 16, 2023 at 11:55 AM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Mon, Mar 13, 2023 at 7:41 AM Kumar Kartikeya Dwivedi
> <memxor@gmail.com> wrote:
> >
[...]
> > > > For bpf_dynptr_slice_rdrw we can mark buffer[] in stack as
> > > > poisoned with dynptr_id == R0's PTR_TO_MEM dynptr_id.
> > > > Then as soon as first spillable reg touches that poisoned stack area
> > > > we can invalidate all PTR_TO_MEM's with that dynptr_id.
> > >
> > > Okay, this makes sense to me. are you already currently working or
> > > planning to work on a fix for this Kumar, or should i take a stab at
> > > it?
> >
> > I'm not planning to do so, so go ahead. One more thing I noticed just now is
> > that we probably need to update regsafe to perform a check_ids comparison for
> > dynptr_id for dynptr PTR_TO_MEMs? It was not a problem back when f8064ab90d66
> > ("bpf: Invalidate slices on destruction of dynptrs on stack") was added but
> > 567da5d253cd ("bpf: improve regsafe() checks for PTR_TO_{MEM,BUF,TP_BUFFER}")
> > added PTR_TO_MEM in the switch statement.
>
> I can take care of this. But I really would like to avoid these
> special cases of extra dynptr_id, exactly for reasons like this
> omitted check.
>
> What do people think about generalizing current ref_obj_id to be more
> like "lifetime id" (to borrow Rust terminology a bit), which would be
> an object (which might or might not be a tracked reference) defining
> the scope/lifetime of the current register (whatever it represents).
>
> I haven't looked through code much, but I've been treating ref_obj_id
> as that already in my thinking before, and it seems to be a better
> approach than having a special-case of dynptr_id.
>
> Thoughts?

Thanks for taking care of this (and apologies for the late reply). i
think the dynptr_id field would still be needed in this case to
associate a slice with a dynptr, so that when a dynptr is invalidated
its slices get invalidated as well. I'm not sure we could get away
with just having ref_obj_id symbolize that in the case where the
underlying object is a tracked reference, because for example, it
seems like a dynptr would need both a unique reference id to the
object (so that if for example there are two dynptrs pointing to the
> > > > same object, they will both be assigned the same reference id so the
object can't for example be freed twice) and also its own dynptr id so
that its slices get invalidated if the dynptr is invalidated
Andrii Nakryiko March 28, 2023, 9:42 p.m. UTC | #19
On Mon, Mar 27, 2023 at 12:47 AM Joanne Koong <joannelkoong@gmail.com> wrote:
>
> On Thu, Mar 16, 2023 at 11:55 AM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Mon, Mar 13, 2023 at 7:41 AM Kumar Kartikeya Dwivedi
> > <memxor@gmail.com> wrote:
> > >
> [...]
> > > > > For bpf_dynptr_slice_rdrw we can mark buffer[] in stack as
> > > > > poisoned with dynptr_id == R0's PTR_TO_MEM dynptr_id.
> > > > > Then as soon as first spillable reg touches that poisoned stack area
> > > > > we can invalidate all PTR_TO_MEM's with that dynptr_id.
> > > >
> > > > Okay, this makes sense to me. are you already currently working or
> > > > planning to work on a fix for this Kumar, or should i take a stab at
> > > > it?
> > >
> > > I'm not planning to do so, so go ahead. One more thing I noticed just now is
> > > that we probably need to update regsafe to perform a check_ids comparison for
> > > dynptr_id for dynptr PTR_TO_MEMs? It was not a problem back when f8064ab90d66
> > > ("bpf: Invalidate slices on destruction of dynptrs on stack") was added but
> > > 567da5d253cd ("bpf: improve regsafe() checks for PTR_TO_{MEM,BUF,TP_BUFFER}")
> > > added PTR_TO_MEM in the switch statement.
> >
> > I can take care of this. But I really would like to avoid these
> > special cases of extra dynptr_id, exactly for reasons like this
> > omitted check.
> >
> > What do people think about generalizing current ref_obj_id to be more
> > like "lifetime id" (to borrow Rust terminology a bit), which would be
> > an object (which might or might not be a tracked reference) defining
> > the scope/lifetime of the current register (whatever it represents).
> >
> > I haven't looked through code much, but I've been treating ref_obj_id
> > as that already in my thinking before, and it seems to be a better
> > approach than having a special-case of dynptr_id.
> >
> > Thoughts?
>
> Thanks for taking care of this (and apologies for the late reply). i
> think the dynptr_id field would still be needed in this case to
> associate a slice with a dynptr, so that when a dynptr is invalidated
> its slices get invalidated as well. I'm not sure we could get away
> with just having ref_obj_id symbolize that in the case where the
> underlying object is a tracked reference, because for example, it
> seems like a dynptr would need both a unique reference id to the
> object (so that if for example there are two dynptrs pointing to the
> same object, they will both be assigned the same reference id so the
> object can't for example be freed twice) and also its own dynptr id so
> that its slices get invalidated if the dynptr is invalidated

Can you elaborate on specific example? Because let's say dynptr is
created from some refcounted object. Then that dynptr's id field will
be a unique "dynptr id", dynptr's ref_obj_id will point to that
refcounted object from which we derived dynptr itself. And then when
we create slices from dynptrs, then each slice gets its own unique id,
but records dynptr's id as slice's ref_obj_id. So we end up with this
hierarchy of id + ref_obj_id forming a tree.

Or am I missing something?

I want to take a look at simplifying this at some point, so I'll know
more details once I start digging into code. Right now I still fail to
see why we need a third ID for dynptr.
Joanne Koong April 9, 2023, 12:22 a.m. UTC | #20
On Tue, Mar 28, 2023 at 2:43 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Mon, Mar 27, 2023 at 12:47 AM Joanne Koong <joannelkoong@gmail.com> wrote:
> >
> > On Thu, Mar 16, 2023 at 11:55 AM Andrii Nakryiko
> > <andrii.nakryiko@gmail.com> wrote:
> > >
> > > On Mon, Mar 13, 2023 at 7:41 AM Kumar Kartikeya Dwivedi
> > > <memxor@gmail.com> wrote:
> > > >
> > [...]
> > > > > > For bpf_dynptr_slice_rdrw we can mark buffer[] in stack as
> > > > > > poisoned with dynptr_id == R0's PTR_TO_MEM dynptr_id.
> > > > > > Then as soon as first spillable reg touches that poisoned stack area
> > > > > > we can invalidate all PTR_TO_MEM's with that dynptr_id.
> > > > >
> > > > > Okay, this makes sense to me. are you already currently working or
> > > > > planning to work on a fix for this Kumar, or should i take a stab at
> > > > > it?
> > > >
> > > > I'm not planning to do so, so go ahead. One more thing I noticed just now is
> > > > that we probably need to update regsafe to perform a check_ids comparison for
> > > > dynptr_id for dynptr PTR_TO_MEMs? It was not a problem back when f8064ab90d66
> > > > ("bpf: Invalidate slices on destruction of dynptrs on stack") was added but
> > > > 567da5d253cd ("bpf: improve regsafe() checks for PTR_TO_{MEM,BUF,TP_BUFFER}")
> > > > added PTR_TO_MEM in the switch statement.
> > >
> > > I can take care of this. But I really would like to avoid these
> > > special cases of extra dynptr_id, exactly for reasons like this
> > > omitted check.
> > >
> > > What do people think about generalizing current ref_obj_id to be more
> > > like "lifetime id" (to borrow Rust terminology a bit), which would be
> > > an object (which might or might not be a tracked reference) defining
> > > the scope/lifetime of the current register (whatever it represents).
> > >
> > > I haven't looked through code much, but I've been treating ref_obj_id
> > > as that already in my thinking before, and it seems to be a better
> > > approach than having a special-case of dynptr_id.
> > >
> > > Thoughts?
> >
> > Thanks for taking care of this (and apologies for the late reply). i
> > think the dynptr_id field would still be needed in this case to
> > associate a slice with a dynptr, so that when a dynptr is invalidated
> > its slices get invalidated as well. I'm not sure we could get away
> > with just having ref_obj_id symbolize that in the case where the
> > underlying object is a tracked reference, because for example, it
> > seems like a dynptr would need both a unique reference id to the
> > object (so that if for example there are two dynptrs pointing to the
> > same object, they will both be assigned the same reference id so the
> > object can't for example be freed twice) and also its own dynptr id so
> > that its slices get invalidated if the dynptr is invalidated
>
> Can you elaborate on specific example? Because let's say dynptr is
> created from some refcounted object. Then that dynptr's id field will
> be a unique "dynptr id", dynptr's ref_obj_id will point to that
> refcounted object from which we derived dynptr itself. And then when
> we create slices from dynptrs, then each slice gets its own unique id,
> but records dynptr's id as slice's ref_obj_id. So we end up with this
> hierarchy of id + ref_obj_id forming a tree.
>
> Or am I missing something?
>
> I want to take a look at simplifying this at some point, so I'll know
> more details once I start digging into code. Right now I still fail to
> see why we need a third ID for dynptr.

My mental model is that
> * dynptr's ref_obj_id is set whenever there's a refcounted object
(right now, only ringbuf dynptrs are refcounted), to enforce that the
reference gets released by the time the program exits (dynptr
ref_obj_id is set in mark_stack_slots_dynptr())
* dynptr's dynptr id is set for all dynptrs, so that if a dynptr gets
overwritten/invalidated, all slices for that dynptr get invalidated
(dynptr id is set in mark_dynptr_stack_regs(), called in
mark_stack_slots_dynptr())
* when there's a data slice, both the slice's dynptr id and ref_obj_id
get set to the dynptr's dynptr id and ref_obj_id, so that the slice
gets invalidated when either the dynptr is released or when the dynptr
is overwritten (two separate cases) (the slice's dynptr id and ref obj
id get set in check_helper_call()). The data slice also has its own
unique id, but this is to handle the case where the data slice may be
null.

"And then when we create slices from dynptrs, then each slice gets its
own unique id, but records dynptr's id as slice's ref_obj_id. So we
end up with this hierarchy of id + ref_obj_id forming a tree." I don't
think I'm following the tree part. I think it records the dynptr's id
as slice's id (and dynptr's ref obj id as slice's ref obj id) in
check_helper_call().

"Right now I still fail to see why we need a third ID for dynptr". I
think for dynptrs, there are two IDs:
state->stack[spi].spilled_ptr.ref_obj_id and
state->stack[spi].spilled_ptr.id (where ref_obj_id is used to
invalidate slices when dynptr is released and id is used to
invalidates slices when dynptr is overwritten), and then for dynptr
slices there are 3 IDs: reg->id, reg->dynptr_id, reg->ref_obj_id
(where id is used for the data slice returning NULL case, and
ref_obj_id / dynptr_id are used when dynptrs are invalidated).
Andrii Nakryiko April 12, 2023, 7:05 p.m. UTC | #21
On Sat, Apr 8, 2023 at 5:22 PM Joanne Koong <joannelkoong@gmail.com> wrote:
>
> On Tue, Mar 28, 2023 at 2:43 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Mon, Mar 27, 2023 at 12:47 AM Joanne Koong <joannelkoong@gmail.com> wrote:
> > >
> > > On Thu, Mar 16, 2023 at 11:55 AM Andrii Nakryiko
> > > <andrii.nakryiko@gmail.com> wrote:
> > > >
> > > > On Mon, Mar 13, 2023 at 7:41 AM Kumar Kartikeya Dwivedi
> > > > <memxor@gmail.com> wrote:
> > > > >
> > > [...]
> > > > > > > For bpf_dynptr_slice_rdrw we can mark buffer[] in stack as
> > > > > > > poisoned with dynptr_id == R0's PTR_TO_MEM dynptr_id.
> > > > > > > Then as soon as first spillable reg touches that poisoned stack area
> > > > > > > we can invalidate all PTR_TO_MEM's with that dynptr_id.
> > > > > >
> > > > > > Okay, this makes sense to me. are you already currently working or
> > > > > > planning to work on a fix for this Kumar, or should i take a stab at
> > > > > > it?
> > > > >
> > > > > I'm not planning to do so, so go ahead. One more thing I noticed just now is
> > > > > that we probably need to update regsafe to perform a check_ids comparison for
> > > > > dynptr_id for dynptr PTR_TO_MEMs? It was not a problem back when f8064ab90d66
> > > > > ("bpf: Invalidate slices on destruction of dynptrs on stack") was added but
> > > > > 567da5d253cd ("bpf: improve regsafe() checks for PTR_TO_{MEM,BUF,TP_BUFFER}")
> > > > > added PTR_TO_MEM in the switch statement.
> > > >
> > > > I can take care of this. But I really would like to avoid these
> > > > special cases of extra dynptr_id, exactly for reasons like this
> > > > omitted check.
> > > >
> > > > What do people think about generalizing current ref_obj_id to be more
> > > > like "lifetime id" (to borrow Rust terminology a bit), which would be
> > > > an object (which might or might not be a tracked reference) defining
> > > > the scope/lifetime of the current register (whatever it represents).
> > > >
> > > > I haven't looked through code much, but I've been treating ref_obj_id
> > > > as that already in my thinking before, and it seems to be a better
> > > > approach than having a special-case of dynptr_id.
> > > >
> > > > Thoughts?
> > >
> > > Thanks for taking care of this (and apologies for the late reply). i
> > > think the dynptr_id field would still be needed in this case to
> > > associate a slice with a dynptr, so that when a dynptr is invalidated
> > > its slices get invalidated as well. I'm not sure we could get away
> > > with just having ref_obj_id symbolize that in the case where the
> > > underlying object is a tracked reference, because for example, it
> > > seems like a dynptr would need both a unique reference id to the
> > > object (so that if for example there are two dynptrs pointing to the
> > > same object, they will both be assigned the same reference id so the
> > > object can't for example be freed twice) and also its own dynptr id so
> > > that its slices get invalidated if the dynptr is invalidated
> >
> > Can you elaborate on specific example? Because let's say dynptr is
> > created from some refcounted object. Then that dynptr's id field will
> > be a unique "dynptr id", dynptr's ref_obj_id will point to that
> > refcounted object from which we derived dynptr itself. And then when
> > we create slices from dynptrs, then each slice gets its own unique id,
> > but records dynptr's id as slice's ref_obj_id. So we end up with this
> > hierarchy of id + ref_obj_id forming a tree.
> >
> > Or am I missing something?
> >
> > I want to take a look at simplifying this at some point, so I'll know
> > more details once I start digging into code. Right now I still fail to
> > see why we need a third ID for dynptr.
>
> My mental model is that
> > * dynptr's ref_obj_id is set whenever there's a refcounted object
> (right now, only ringbuf dynptrs are refcounted), to enforce that the
> reference gets released by the time the program exits (dynptr
> ref_obj_id is set in mark_stack_slots_dynptr())
> * dynptr's dynptr id is set for all dynptrs, so that if a dynptr gets
> overwritten/invalidated, all slices for that dynptr get invalidated
> (dynptr id is set in mark_dynptr_stack_regs(), called in
> mark_stack_slots_dynptr())

yeah, I understand that's how it works today and what the semantics of
ref_obj_id is. But I'm saying that we should look at whether we can
revise ref_obj_id semantics and generalize it to be "ID of the
<object> whose lifetime we are bound to". This refcount part could be
optional (again, will know for sure when I get to writing the code).

I'll get to this in time and will validate my own preconceptions. I
don't think we should spend too much time discussing this in abstract
right now.


> * when there's a data slice, both the slice's dynptr id and ref_obj_id
> get set to the dynptr's dynptr id and ref_obj_id, so that the slice
> gets invalidated when either the dynptr is released or when the dynptr
> is overwritten (two separate cases) (the slice's dynptr id and ref obj
> id get set in check_helper_call()). The data slice also has its own
> unique id, but this is to handle the case where the data slice may be
> null.
>
> "And then when we create slices from dynptrs, then each slice gets its
> own unique id, but records dynptr's id as slice's ref_obj_id. So we
> end up with this hierarchy of id + ref_obj_id forming a tree." I don't
> think I'm following the tree part. I think it records the dynptr's id
> as slice's id (and dynptr's ref obj id as slice's ref obj id) in
> check_helper_call().
>
> "Right now I still fail to see why we need a third ID for dynptr". I
> think for dynptrs, there are two IDs:
> state->stack[spi].spilled_ptr.ref_obj_id and
> state->stack[spi].spilled_ptr.id (where ref_obj_id is used to
> invalidate slices when dynptr is released and id is used to
> invalidates slices when dynptr is overwritten), and then for dynptr
> slices there are 3 IDs: reg->id, reg->dynptr_id, reg->ref_obj_id
> (where id is used for the data slice returning NULL case, and
> ref_obj_id / dynptr_id are used when dynptrs are invalidated).
diff mbox series

Patch

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 3f6992261ec5..efa5d4a1677e 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1548,6 +1548,9 @@  int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from,
 			  u32 len, u64 flags);
 int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len);
 int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len);
+void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len);
+void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,
+		      void *buf, unsigned long len, bool flush);
 #else /* CONFIG_NET */
 static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset,
 				       void *to, u32 len)
@@ -1572,6 +1575,17 @@  static inline int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset,
 {
 	return -EOPNOTSUPP;
 }
+
+static inline void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len)
+{
+	return NULL;
+}
+
+static inline void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *buf,
+				    unsigned long len, bool flush)
+{
+	/* no-op stub when CONFIG_NET is disabled */
+}
 #endif /* CONFIG_NET */
 
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index faa304c926cf..c9699304aed2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5329,6 +5329,11 @@  union bpf_attr {
  *		*flags* must be 0 except for skb-type dynptrs.
  *
  *		For skb-type dynptrs:
+ *		    *  All data slices of the dynptr are automatically
+ *		       invalidated after **bpf_dynptr_write**\ (). This is
+ *		       because writing may pull the skb and change the
+ *		       underlying packet buffer.
+ *
  *		    *  For *flags*, please see the flags accepted by
  *		       **bpf_skb_store_bytes**\ ().
  *	Return
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 114a875a05b1..648b29e78b84 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2193,6 +2193,142 @@  __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
 	return p;
 }
 
+/**
+ * bpf_dynptr_slice - Obtain a read-only pointer to the dynptr data.
+ *
+ * For non-skb and non-xdp type dynptrs, there is no difference between
+ * bpf_dynptr_slice and bpf_dynptr_data.
+ *
+ * If the intention is to write to the data slice, please use
+ * bpf_dynptr_slice_rdwr.
+ *
+ * The user must check that the returned pointer is not null before using it.
+ *
+ * Please note that in the case of skb and xdp dynptrs, bpf_dynptr_slice
+ * does not change the underlying packet data pointers, so a call to
+ * bpf_dynptr_slice will not invalidate any ctx->data/data_end pointers in
+ * the bpf program.
+ *
+ * @ptr: The dynptr whose data slice to retrieve
+ * @offset: Offset into the dynptr
+ * @buffer: User-provided buffer to copy contents into
+ * @buffer__szk: Size (in bytes) of the buffer. This is the length of the
+ * requested slice. This must be a constant.
+ *
+ * @returns: NULL if the call failed (eg invalid dynptr), pointer to a read-only
+ * data slice (can be either direct pointer to the data or a pointer to the user
+ * provided buffer, with its contents containing the data, if unable to obtain
+ * direct pointer)
+ */
+__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset,
+				   void *buffer, u32 buffer__szk)
+{
+	enum bpf_dynptr_type type;
+	u32 len = buffer__szk;
+	int err;
+
+	if (!ptr->data)
+		return 0;
+
+	err = bpf_dynptr_check_off_len(ptr, offset, len);
+	if (err)
+		return 0;
+
+	type = bpf_dynptr_get_type(ptr);
+
+	switch (type) {
+	case BPF_DYNPTR_TYPE_LOCAL:
+	case BPF_DYNPTR_TYPE_RINGBUF:
+		return ptr->data + ptr->offset + offset;
+	case BPF_DYNPTR_TYPE_SKB:
+		return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer);
+	case BPF_DYNPTR_TYPE_XDP:
+	{
+		void *xdp_ptr = bpf_xdp_pointer(ptr->data, ptr->offset + offset, len);
+		if (xdp_ptr)
+			return xdp_ptr;
+
+		bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer, len, false);
+		return buffer;
+	}
+	default:
+		WARN_ONCE(true, "unknown dynptr type %d\n", type);
+		return 0;
+	}
+}
+
+/**
+ * bpf_dynptr_slice_rdwr - Obtain a writable pointer to the dynptr data.
+ *
+ * For non-skb and non-xdp type dynptrs, there is no difference between
+ * bpf_dynptr_slice_rdwr and bpf_dynptr_data.
+ *
+ * The returned pointer is writable and may point to either directly the dynptr
+ * data at the requested offset or to the buffer if unable to obtain a direct
+ * data pointer (example: the requested slice is to the paged area of an skb
+ * packet). In the case where the returned pointer is to the buffer, the user
+ * is responsible for persisting writes through calling bpf_dynptr_write(). This
+ * usually looks something like this pattern:
+ *
+ * struct eth_hdr *eth = bpf_dynptr_slice_rdwr(&dynptr, 0, buffer, sizeof(buffer));
+ * if (!eth)
+ *	return TC_ACT_SHOT;
+ *
+ * // mutate eth header //
+ *
+ * if (eth == buffer)
+ *	bpf_dynptr_write(&ptr, 0, buffer, sizeof(buffer), 0);
+ *
+ * Please note that, as in the example above, the user must check that the
+ * returned pointer is not null before using it.
+ *
+ * Please also note that in the case of skb and xdp dynptrs, bpf_dynptr_slice_rdwr
+ * does not change the underlying packet data pointers, so a call to
+ * bpf_dynptr_slice_rdwr will not invalidate any ctx->data/data_end pointers in
+ * the bpf program.
+ *
+ * @ptr: The dynptr whose data slice to retrieve
+ * @offset: Offset into the dynptr
+ * @buffer: User-provided buffer to copy contents into
+ * @buffer__szk: Size (in bytes) of the buffer. This is the length of the
+ * requested slice. This must be a constant.
+ *
+ * @returns: NULL if the call failed (eg invalid dynptr), pointer to a
+ * data slice (can be either direct pointer to the data or a pointer to the user
+ * provided buffer, with its contents containing the data, if unable to obtain
+ * direct pointer)
+ */
+__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 offset,
+					void *buffer, u32 buffer__szk)
+{
+	if (!ptr->data || bpf_dynptr_is_rdonly(ptr))
+		return 0;
+
+	/* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice.
+	 *
+	 * For skb-type dynptrs, it is safe to write into the returned pointer
+	 * if the bpf program allows skb data writes. There are two possibilities
+	 * that may occur when calling bpf_dynptr_slice_rdwr:
+	 *
+	 * 1) The requested slice is in the head of the skb. In this case, the
+	 * returned pointer is directly to skb data, and if the skb is cloned, the
+	 * verifier will have uncloned it (see bpf_unclone_prologue()) already.
+	 * The pointer can be directly written into.
+	 *
+	 * 2) Some portion of the requested slice is in the paged buffer area.
+	 * In this case, the requested data will be copied out into the buffer
+	 * and the returned pointer will be a pointer to the buffer. The skb
+	 * will not be pulled. To persist the write, the user will need to call
+	 * bpf_dynptr_write(), which will pull the skb and commit the write.
+	 *
+	 * Similarly for xdp programs, if the requested slice is not across xdp
+	 * fragments, then a direct pointer will be returned, otherwise the data
+	 * will be copied out into the buffer and the user will need to call
+	 * bpf_dynptr_write() to commit changes.
+	 */
+	return bpf_dynptr_slice(ptr, offset, buffer, buffer__szk);
+}
+
 __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj)
 {
 	return obj;
@@ -2262,6 +2398,8 @@  BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx)
 BTF_ID_FLAGS(func, bpf_rdonly_cast)
 BTF_ID_FLAGS(func, bpf_rcu_read_lock)
 BTF_ID_FLAGS(func, bpf_rcu_read_unlock)
+BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
 BTF_SET8_END(common_btf_ids)
 
 static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5e42946e53ab..a856896e835a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -759,6 +759,22 @@  static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
 	}
 }
 
+static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type)
+{
+	switch (type) {
+	case BPF_DYNPTR_TYPE_LOCAL:
+		return DYNPTR_TYPE_LOCAL;
+	case BPF_DYNPTR_TYPE_RINGBUF:
+		return DYNPTR_TYPE_RINGBUF;
+	case BPF_DYNPTR_TYPE_SKB:
+		return DYNPTR_TYPE_SKB;
+	case BPF_DYNPTR_TYPE_XDP:
+		return DYNPTR_TYPE_XDP;
+	default:
+		return 0;
+	}
+}
+
 static bool dynptr_type_refcounted(enum bpf_dynptr_type type)
 {
 	return type == BPF_DYNPTR_TYPE_RINGBUF;
@@ -1681,6 +1697,12 @@  static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg)
 	       reg->type == PTR_TO_PACKET_END;
 }
 
+static bool reg_is_dynptr_slice_pkt(const struct bpf_reg_state *reg)
+{
+	return base_type(reg->type) == PTR_TO_MEM &&
+		(reg->type & DYNPTR_TYPE_SKB || reg->type & DYNPTR_TYPE_XDP);
+}
+
 /* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */
 static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg,
 				    enum bpf_reg_type which)
@@ -7429,6 +7451,9 @@  static int check_func_proto(const struct bpf_func_proto *fn, int func_id)
 
 /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
  * are now invalid, so turn them into unknown SCALAR_VALUE.
+ *
+ * This also applies to dynptr slices belonging to skb and xdp dynptrs,
+ * since these slices point to packet data.
  */
 static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 {
@@ -7436,7 +7461,7 @@  static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 	struct bpf_reg_state *reg;
 
 	bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
-		if (reg_is_pkt_pointer_any(reg))
+		if (reg_is_pkt_pointer_any(reg) || reg_is_dynptr_slice_pkt(reg))
 			mark_reg_invalid(env, reg);
 	}));
 }
@@ -8688,6 +8713,11 @@  struct bpf_kfunc_call_arg_meta {
 	struct {
 		struct btf_field *field;
 	} arg_rbtree_root;
+	struct {
+		enum bpf_dynptr_type type;
+		u32 id;
+	} initialized_dynptr;
+	u64 mem_size;
 };
 
 static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta)
@@ -8761,6 +8791,19 @@  static bool is_kfunc_arg_mem_size(const struct btf *btf,
 	return __kfunc_param_match_suffix(btf, arg, "__sz");
 }
 
+static bool is_kfunc_arg_const_mem_size(const struct btf *btf,
+					const struct btf_param *arg,
+					const struct bpf_reg_state *reg)
+{
+	const struct btf_type *t;
+
+	t = btf_type_skip_modifiers(btf, arg->type, NULL);
+	if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
+		return false;
+
+	return __kfunc_param_match_suffix(btf, arg, "__szk");
+}
+
 static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg)
 {
 	return __kfunc_param_match_suffix(btf, arg, "__k");
@@ -8949,6 +8992,8 @@  enum special_kfunc_type {
 	KF_bpf_rbtree_first,
 	KF_bpf_dynptr_from_skb,
 	KF_bpf_dynptr_from_xdp,
+	KF_bpf_dynptr_slice,
+	KF_bpf_dynptr_slice_rdwr,
 };
 
 BTF_SET_START(special_kfunc_set)
@@ -8965,6 +9010,8 @@  BTF_ID(func, bpf_rbtree_add)
 BTF_ID(func, bpf_rbtree_first)
 BTF_ID(func, bpf_dynptr_from_skb)
 BTF_ID(func, bpf_dynptr_from_xdp)
+BTF_ID(func, bpf_dynptr_slice)
+BTF_ID(func, bpf_dynptr_slice_rdwr)
 BTF_SET_END(special_kfunc_set)
 
 BTF_ID_LIST(special_kfunc_list)
@@ -8983,6 +9030,8 @@  BTF_ID(func, bpf_rbtree_add)
 BTF_ID(func, bpf_rbtree_first)
 BTF_ID(func, bpf_dynptr_from_skb)
 BTF_ID(func, bpf_dynptr_from_xdp)
+BTF_ID(func, bpf_dynptr_slice)
+BTF_ID(func, bpf_dynptr_slice_rdwr)
 
 static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta)
 {
@@ -9062,7 +9111,10 @@  get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
 	if (is_kfunc_arg_callback(env, meta->btf, &args[argno]))
 		return KF_ARG_PTR_TO_CALLBACK;
 
-	if (argno + 1 < nargs && is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]))
+
+	if (argno + 1 < nargs &&
+	    (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]) ||
+	     is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1])))
 		arg_mem_size = true;
 
 	/* This is the catch all argument type of register types supported by
@@ -9745,6 +9797,18 @@  static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type);
 			if (ret < 0)
 				return ret;
+
+			if (!(dynptr_arg_type & MEM_UNINIT)) {
+				int id = dynptr_id(env, reg);
+
+				if (id < 0) {
+					verbose(env, "verifier internal error: failed to obtain dynptr id\n");
+					return id;
+				}
+				meta->initialized_dynptr.id = id;
+				meta->initialized_dynptr.type = dynptr_get_type(env, reg);
+			}
+
 			break;
 		}
 		case KF_ARG_PTR_TO_LIST_HEAD:
@@ -9840,14 +9904,33 @@  static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 				return ret;
 			break;
 		case KF_ARG_PTR_TO_MEM_SIZE:
-			ret = check_kfunc_mem_size_reg(env, &regs[regno + 1], regno + 1);
+		{
+			struct bpf_reg_state *size_reg = &regs[regno + 1];
+			const struct btf_param *size_arg = &args[i + 1];
+
+			ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1);
 			if (ret < 0) {
 				verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1);
 				return ret;
 			}
-			/* Skip next '__sz' argument */
+
+			if (is_kfunc_arg_const_mem_size(meta->btf, size_arg, size_reg)) {
+				if (meta->arg_constant.found) {
+					verbose(env, "verifier internal error: only one constant argument permitted\n");
+					return -EFAULT;
+				}
+				if (!tnum_is_const(size_reg->var_off)) {
+					verbose(env, "R%d must be a known constant\n", regno + 1);
+					return -EINVAL;
+				}
+				meta->arg_constant.found = true;
+				meta->arg_constant.value = size_reg->var_off.value;
+			}
+
+			/* Skip next '__sz' or '__szk' argument */
 			i++;
 			break;
+		}
 		case KF_ARG_PTR_TO_CALLBACK:
 			meta->subprogno = reg->subprogno;
 			break;
@@ -10082,6 +10165,42 @@  static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 				regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED;
 				regs[BPF_REG_0].btf = desc_btf;
 				regs[BPF_REG_0].btf_id = meta.arg_constant.value;
+			} else if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice] ||
+				   meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) {
+				enum bpf_type_flag type_flag = get_dynptr_type_flag(meta.initialized_dynptr.type);
+
+				mark_reg_known_zero(env, regs, BPF_REG_0);
+
+				if (!meta.arg_constant.found) {
+					verbose(env, "verifier internal error: bpf_dynptr_slice(_rdwr) no constant size\n");
+					return -EFAULT;
+				}
+
+				regs[BPF_REG_0].mem_size = meta.arg_constant.value;
+
+				/* PTR_MAYBE_NULL will be added when is_kfunc_ret_null is checked */
+				regs[BPF_REG_0].type = PTR_TO_MEM | type_flag;
+
+				if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice]) {
+					regs[BPF_REG_0].type |= MEM_RDONLY;
+				} else {
+					/* this will set env->seen_direct_write to true */
+					if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) {
+						verbose(env, "the prog does not allow writes to packet data\n");
+						return -EINVAL;
+					}
+				}
+
+				if (!meta.initialized_dynptr.id) {
+					verbose(env, "verifier internal error: no dynptr id\n");
+					return -EFAULT;
+				}
+				regs[BPF_REG_0].dynptr_id = meta.initialized_dynptr.id;
+
+				/* we don't need to set BPF_REG_0's ref obj id
+				 * because packet slices are not refcounted (see
+				 * dynptr_type_refcounted)
+				 */
 			} else {
 				verbose(env, "kernel function %s unhandled dynamic return type\n",
 					meta.func_name);
diff --git a/net/core/filter.c b/net/core/filter.c
index c692046fa7f6..8f3124e06133 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3894,8 +3894,8 @@  static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
-static void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,
-			     void *buf, unsigned long len, bool flush)
+void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,
+		      void *buf, unsigned long len, bool flush)
 {
 	unsigned long ptr_len, ptr_off = 0;
 	skb_frag_t *next_frag, *end_frag;
@@ -3941,7 +3941,7 @@  static void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,
 	}
 }
 
-static void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len)
+void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len)
 {
 	struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
 	u32 size = xdp->data_end - xdp->data;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index faa304c926cf..c9699304aed2 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5329,6 +5329,11 @@  union bpf_attr {
  *		*flags* must be 0 except for skb-type dynptrs.
  *
  *		For skb-type dynptrs:
+ *		    *  All data slices of the dynptr are automatically
+ *		       invalidated after **bpf_dynptr_write**\ (). This is
+ *		       because writing may pull the skb and change the
+ *		       underlying packet buffer.
+ *
  *		    *  For *flags*, please see the flags accepted by
  *		       **bpf_skb_store_bytes**\ ().
  *	Return