[v7,bpf-next,8/9] bpf: introduce bpf_jit_binary_pack_[alloc|finalize|free]

Message ID 20220128234517.3503701-9-song@kernel.org (mailing list archive)
State Superseded
Delegated to: BPF
Series bpf_prog_pack allocator

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf-next
netdev/apply fail Patch does not apply to bpf-next
bpf/vmtest-bpf-next-PR fail merge-conflict

Commit Message

Song Liu Jan. 28, 2022, 11:45 p.m. UTC
From: Song Liu <songliubraving@fb.com>

This is the JIT binary allocator built on top of bpf_prog_pack.

bpf_prog_pack allocates RO memory, which cannot be written directly by
the JIT engine. Therefore, a temporary RW buffer is allocated for the
JIT engine. Once JIT is done, bpf_jit_binary_pack_finalize is used to
copy the program to the RO memory.

bpf_jit_binary_pack_alloc reserves 16 bytes of extra space for illegal
instructions, which is smaller than the 128-byte space reserved by
bpf_jit_binary_alloc. This change is necessary for bpf_jit_binary_hdr
to find the correct header. Also, flag use_bpf_prog_pack is added to
differentiate a program allocated by bpf_jit_binary_pack_alloc.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 include/linux/bpf.h    |   1 +
 include/linux/filter.h |  21 ++++----
 kernel/bpf/core.c      | 108 ++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 120 insertions(+), 10 deletions(-)
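
As an illustration (not part of the patch), an arch JIT would use the
new API roughly as follows. jit_emit_program and orig_prog are
hypothetical placeholders, jit_fill_hole stands for the arch's
bpf_jit_fill_hole_t callback, and error handling is simplified:

	struct bpf_binary_header *ro_header, *rw_header;
	u8 *image, *rw_image;

	/* Reserve RO+X memory and a matching RW buffer. */
	ro_header = bpf_jit_binary_pack_alloc(proglen, &image, 1 /* alignment */,
					      &rw_header, &rw_image,
					      jit_fill_hole);
	if (!ro_header)
		return orig_prog;

	/* Emit into the RW buffer, but compute branch targets and
	 * prog->bpf_func against the final RO address "image".
	 */
	jit_emit_program(rw_image, image, prog);

	/* Copies rw_image into image and frees the RW buffer either
	 * way; on failure it also frees the RO memory.
	 */
	if (bpf_jit_binary_pack_finalize(prog, ro_header, rw_header))
		return orig_prog;

	prog->bpf_func = (void *)image;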

Comments

Daniel Borkmann Feb. 1, 2022, 12:21 a.m. UTC | #1
On 1/29/22 12:45 AM, Song Liu wrote:
[...]
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 25e34caa9a95..ff0c51ef1cb7 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -1031,6 +1031,109 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr)
>   	bpf_jit_uncharge_modmem(size);
>   }
>   
> +/* Allocate jit binary from bpf_prog_pack allocator.
> + * Since the allocated meory is RO+X, the JIT engine cannot write directly

nit: meory

> + * to the memory. To solve this problem, a RW buffer is also allocated at
> + * as the same time. The JIT engine should calculate offsets based on the
> + * RO memory address, but write JITed program to the RW buffer. Once the
> + * JIT engine finishes, it calls bpf_jit_binary_pack_finalize, which copies
> + * the JITed program to the RO memory.
> + */
> +struct bpf_binary_header *
> +bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
> +			  unsigned int alignment,
> +			  struct bpf_binary_header **rw_header,
> +			  u8 **rw_image,
> +			  bpf_jit_fill_hole_t bpf_fill_ill_insns)
> +{
> +	struct bpf_binary_header *ro_header;
> +	u32 size, hole, start;
> +
> +	WARN_ON_ONCE(!is_power_of_2(alignment) ||
> +		     alignment > BPF_IMAGE_ALIGNMENT);
> +
> +	/* add 16 bytes for a random section of illegal instructions */
> +	size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE);
> +
> +	if (bpf_jit_charge_modmem(size))
> +		return NULL;
> +	ro_header = bpf_prog_pack_alloc(size);
> +	if (!ro_header) {
> +		bpf_jit_uncharge_modmem(size);
> +		return NULL;
> +	}
> +
> +	*rw_header = kvmalloc(size, GFP_KERNEL);
> +	if (!*rw_header) {
> +		bpf_prog_pack_free(ro_header);
> +		bpf_jit_uncharge_modmem(size);
> +		return NULL;
> +	}
> +
> +	/* Fill space with illegal/arch-dep instructions. */
> +	bpf_fill_ill_insns(*rw_header, size);
> +	(*rw_header)->size = size;
> +
> +	hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)),
> +		     BPF_PROG_CHUNK_SIZE - sizeof(*ro_header));
> +	start = (get_random_int() % hole) & ~(alignment - 1);
> +
> +	*image_ptr = &ro_header->image[start];
> +	*rw_image = &(*rw_header)->image[start];
> +
> +	return ro_header;
> +}
> +
> +/* Copy JITed text from rw_header to its final location, the ro_header. */
> +int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
> +				 struct bpf_binary_header *ro_header,
> +				 struct bpf_binary_header *rw_header)
> +{
> +	void *ptr;
> +
> +	ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size);

Does this need to be wrapped with a text_mutex lock/unlock pair given
text_poke_copy() internally relies on __text_poke() ?

> +	kvfree(rw_header);
> +
> +	if (IS_ERR(ptr)) {
> +		bpf_prog_pack_free(ro_header);
> +		return PTR_ERR(ptr);
> +	}
> +	prog->aux->use_bpf_prog_pack = true;
> +	return 0;
> +}
> +
[...]
Song Liu Feb. 1, 2022, 12:35 a.m. UTC | #2
On Mon, Jan 31, 2022 at 4:21 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 1/29/22 12:45 AM, Song Liu wrote:
> [...]
[...]
> > +}
> > +
> > +/* Copy JITed text from rw_header to its final location, the ro_header. */
> > +int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
> > +                              struct bpf_binary_header *ro_header,
> > +                              struct bpf_binary_header *rw_header)
> > +{
> > +     void *ptr;
> > +
> > +     ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size);
>
> Does this need to be wrapped with a text_mutex lock/unlock pair given
> text_poke_copy() internally relies on __text_poke() ?

Yes... Good catch. I guess we may do the lock in text_poke_copy().

Thanks,
Song
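
As suggested above, one way to resolve this is to take text_mutex inside
text_poke_copy() itself, so that bpf_arch_text_copy() callers do not
have to hold it. A sketch of that approach, built around x86's
__text_poke() (simplified; not necessarily the exact code that was
merged):

	void *text_poke_copy(void *addr, const void *opcode, size_t len)
	{
		unsigned long start = (unsigned long)addr;
		size_t patched = 0;

		if (WARN_ON_ONCE(core_kernel_text(start)))
			return NULL;

		mutex_lock(&text_mutex);
		while (patched < len) {
			unsigned long ptr = start + patched;
			size_t s;

			/* __text_poke() maps at most two pages at a time. */
			s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr),
				  len - patched);

			__text_poke((void *)ptr, opcode + patched, s);
			patched += s;
		}
		mutex_unlock(&text_mutex);
		return addr;
	}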

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7f58fe256671..06d119c472e7 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -945,6 +945,7 @@ struct bpf_prog_aux {
 	bool sleepable;
 	bool tail_call_reachable;
 	bool xdp_has_frags;
+	bool use_bpf_prog_pack;
 	struct hlist_node tramp_hlist;
 	/* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
 	const struct btf_type *attach_func_proto;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 5855eb474c62..1cb1af917617 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -890,15 +890,6 @@ static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
 	set_memory_x((unsigned long)hdr, hdr->size >> PAGE_SHIFT);
 }
 
-static inline struct bpf_binary_header *
-bpf_jit_binary_hdr(const struct bpf_prog *fp)
-{
-	unsigned long real_start = (unsigned long)fp->bpf_func;
-	unsigned long addr = real_start & PAGE_MASK;
-
-	return (void *)addr;
-}
-
 int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap);
 static inline int sk_filter(struct sock *sk, struct sk_buff *skb)
 {
@@ -1068,6 +1059,18 @@ void *bpf_jit_alloc_exec(unsigned long size);
 void bpf_jit_free_exec(void *addr);
 void bpf_jit_free(struct bpf_prog *fp);
 
+struct bpf_binary_header *
+bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **ro_image,
+			  unsigned int alignment,
+			  struct bpf_binary_header **rw_hdr,
+			  u8 **rw_image,
+			  bpf_jit_fill_hole_t bpf_fill_ill_insns);
+int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
+				 struct bpf_binary_header *ro_header,
+				 struct bpf_binary_header *rw_header);
+void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
+			      struct bpf_binary_header *rw_header);
+
 int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
 				struct bpf_jit_poke_descriptor *poke);
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 25e34caa9a95..ff0c51ef1cb7 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1031,6 +1031,109 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr)
 	bpf_jit_uncharge_modmem(size);
 }
 
+/* Allocate jit binary from bpf_prog_pack allocator.
+ * Since the allocated memory is RO+X, the JIT engine cannot write directly
+ * to the memory. To solve this problem, a RW buffer is also allocated at
+ * the same time. The JIT engine should calculate offsets based on the
+ * RO memory address, but write the JITed program to the RW buffer. Once the
+ * JIT engine finishes, it calls bpf_jit_binary_pack_finalize, which copies
+ * the JITed program to the RO memory.
+ */
+struct bpf_binary_header *
+bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
+			  unsigned int alignment,
+			  struct bpf_binary_header **rw_header,
+			  u8 **rw_image,
+			  bpf_jit_fill_hole_t bpf_fill_ill_insns)
+{
+	struct bpf_binary_header *ro_header;
+	u32 size, hole, start;
+
+	WARN_ON_ONCE(!is_power_of_2(alignment) ||
+		     alignment > BPF_IMAGE_ALIGNMENT);
+
+	/* add 16 bytes for a random section of illegal instructions */
+	size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE);
+
+	if (bpf_jit_charge_modmem(size))
+		return NULL;
+	ro_header = bpf_prog_pack_alloc(size);
+	if (!ro_header) {
+		bpf_jit_uncharge_modmem(size);
+		return NULL;
+	}
+
+	*rw_header = kvmalloc(size, GFP_KERNEL);
+	if (!*rw_header) {
+		bpf_prog_pack_free(ro_header);
+		bpf_jit_uncharge_modmem(size);
+		return NULL;
+	}
+
+	/* Fill space with illegal/arch-dep instructions. */
+	bpf_fill_ill_insns(*rw_header, size);
+	(*rw_header)->size = size;
+
+	hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)),
+		     BPF_PROG_CHUNK_SIZE - sizeof(*ro_header));
+	start = (get_random_int() % hole) & ~(alignment - 1);
+
+	*image_ptr = &ro_header->image[start];
+	*rw_image = &(*rw_header)->image[start];
+
+	return ro_header;
+}
+
+/* Copy JITed text from rw_header to its final location, the ro_header. */
+int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
+				 struct bpf_binary_header *ro_header,
+				 struct bpf_binary_header *rw_header)
+{
+	void *ptr;
+
+	ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size);
+
+	kvfree(rw_header);
+
+	if (IS_ERR(ptr)) {
+		bpf_prog_pack_free(ro_header);
+		return PTR_ERR(ptr);
+	}
+	prog->aux->use_bpf_prog_pack = true;
+	return 0;
+}
+
+/* bpf_jit_binary_pack_free is called in two different scenarios:
+ *   1) when the program is freed after bpf_jit_binary_pack_finalize;
+ *   2) when the JIT engine fails (before bpf_jit_binary_pack_finalize).
+ * For case 2), we need to free both the RO memory and the RW buffer.
+ * Also, ro_header->size in 2) is not properly set yet, so rw_header->size
+ * is used for uncharge.
+ */
+void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
+			      struct bpf_binary_header *rw_header)
+{
+	u32 size = rw_header ? rw_header->size : ro_header->size;
+
+	bpf_prog_pack_free(ro_header);
+	kvfree(rw_header);
+	bpf_jit_uncharge_modmem(size);
+}
+
+static inline struct bpf_binary_header *
+bpf_jit_binary_hdr(const struct bpf_prog *fp)
+{
+	unsigned long real_start = (unsigned long)fp->bpf_func;
+	unsigned long addr;
+
+	if (fp->aux->use_bpf_prog_pack)
+		addr = real_start & BPF_PROG_CHUNK_MASK;
+	else
+		addr = real_start & PAGE_MASK;
+
+	return (void *)addr;
+}
+
 /* This symbol is only overridden by archs that have different
  * requirements than the usual eBPF JITs, f.e. when they only
  * implement cBPF JIT, do not set images read-only, etc.
@@ -1040,7 +1143,10 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
 	if (fp->jited) {
 		struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
 
-		bpf_jit_binary_free(hdr);
+		if (fp->aux->use_bpf_prog_pack)
+			bpf_jit_binary_pack_free(hdr, NULL /* rw_buffer */);
+		else
+			bpf_jit_binary_free(hdr);
 
 		WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
 	}