diff mbox series

[v4,bpf-next,5/8] bpf: use module_alloc_huge for bpf_prog_pack

Message ID 20220520235758.1858153-6-song@kernel.org (mailing list archive)
State New
Headers show
Series bpf_prog_pack followup | expand

Commit Message

Song Liu May 20, 2022, 11:57 p.m. UTC
Use module_alloc_huge for bpf_prog_pack so that BPF programs sit on
PMD_SIZE pages. This benefits system performance by reducing iTLB miss
rate. Benchmark of a real web service workload shows this change gives
another ~0.2% performance boost on top of PAGE_SIZE bpf_prog_pack
(which improve system throughput by ~0.5%).

Also, remove set_vm_flush_reset_perms() from alloc_new_pack() and use
set_memory_[nx|rw] in bpf_prog_pack_free(). This is because
VM_FLUSH_RESET_PERMS does not work with huge pages yet. [1]

[1] https://lore.kernel.org/bpf/aeeeaf0b7ec63fdba55d4834d2f524d8bf05b71b.camel@intel.com/
Suggested-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Song Liu <song@kernel.org>
---
 kernel/bpf/core.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

Comments

Mike Rapoport May 24, 2022, 7:22 a.m. UTC | #1
On Fri, May 20, 2022 at 04:57:55PM -0700, Song Liu wrote:
> Use module_alloc_huge for bpf_prog_pack so that BPF programs sit on
> PMD_SIZE pages. This benefits system performance by reducing iTLB miss
> rate. Benchmark of a real web service workload shows this change gives
> another ~0.2% performance boost on top of PAGE_SIZE bpf_prog_pack
> (which improve system throughput by ~0.5%).
> 
> Also, remove set_vm_flush_reset_perms() from alloc_new_pack() and use
> set_memory_[nx|rw] in bpf_prog_pack_free(). This is because
> VM_FLUSH_RESET_PERMS does not work with huge pages yet. [1]
> 
> [1] https://lore.kernel.org/bpf/aeeeaf0b7ec63fdba55d4834d2f524d8bf05b71b.camel@intel.com/
> Suggested-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
> Signed-off-by: Song Liu <song@kernel.org>
> ---
>  kernel/bpf/core.c | 12 +++++++-----
>  1 file changed, 7 insertions(+), 5 deletions(-)
> 
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index cacd8684c3c4..b64d91fcb0ba 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -857,7 +857,7 @@ static size_t select_bpf_prog_pack_size(void)
>  	void *ptr;
>  
>  	size = BPF_HPAGE_SIZE * num_online_nodes();
> -	ptr = module_alloc(size);
> +	ptr = module_alloc_huge(size);
>  
>  	/* Test whether we can get huge pages. If not just use PAGE_SIZE
>  	 * packs.
> @@ -881,7 +881,7 @@ static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_ins
>  		       GFP_KERNEL);
>  	if (!pack)
>  		return NULL;
> -	pack->ptr = module_alloc(bpf_prog_pack_size);
> +	pack->ptr = module_alloc_huge(bpf_prog_pack_size);
>  	if (!pack->ptr) {
>  		kfree(pack);
>  		return NULL;
> @@ -890,7 +890,6 @@ static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_ins
>  	bitmap_zero(pack->bitmap, bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE);
>  	list_add_tail(&pack->list, &pack_list);
>  
> -	set_vm_flush_reset_perms(pack->ptr);
>  	set_memory_ro((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE);
>  	set_memory_x((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE);
>  	return pack;
> @@ -909,10 +908,9 @@ static void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insn
>  
>  	if (size > bpf_prog_pack_size) {
>  		size = round_up(size, PAGE_SIZE);
> -		ptr = module_alloc(size);
> +		ptr = module_alloc_huge(size);
>  		if (ptr) {
>  			bpf_fill_ill_insns(ptr, size);
> -			set_vm_flush_reset_perms(ptr);
>  			set_memory_ro((unsigned long)ptr, size / PAGE_SIZE);
>  			set_memory_x((unsigned long)ptr, size / PAGE_SIZE);
>  		}
> @@ -949,6 +947,8 @@ static void bpf_prog_pack_free(struct bpf_binary_header *hdr)
>  
>  	mutex_lock(&pack_mutex);
>  	if (hdr->size > bpf_prog_pack_size) {
> +		set_memory_nx((unsigned long)hdr, hdr->size / PAGE_SIZE);
> +		set_memory_rw((unsigned long)hdr, hdr->size / PAGE_SIZE);

set_memory_{nx,rw} can fail. Please take care of error handling.

>  		module_memfree(hdr);
>  		goto out;
>  	}
> @@ -975,6 +975,8 @@ static void bpf_prog_pack_free(struct bpf_binary_header *hdr)
>  	if (bitmap_find_next_zero_area(pack->bitmap, bpf_prog_chunk_count(), 0,
>  				       bpf_prog_chunk_count(), 0) == 0) {
>  		list_del(&pack->list);
> +		set_memory_nx((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE);
> +		set_memory_rw((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE);

ditto.

>  		module_memfree(pack->ptr);
>  		kfree(pack);
>  	}
> -- 
> 2.30.2
> 
>
Edgecombe, Rick P May 24, 2022, 4:42 p.m. UTC | #2
On Tue, 2022-05-24 at 10:22 +0300, Mike Rapoport wrote:
> > @@ -949,6 +947,8 @@ static void bpf_prog_pack_free(struct
> > bpf_binary_header *hdr)
> >   
> >        mutex_lock(&pack_mutex);
> >        if (hdr->size > bpf_prog_pack_size) {
> > +             set_memory_nx((unsigned long)hdr, hdr->size /
> > PAGE_SIZE);
> > +             set_memory_rw((unsigned long)hdr, hdr->size /
> > PAGE_SIZE);
> 
> set_memory_{nx,rw} can fail. Please take care of error handling.

I think there is nothing to do here, actually. At least on the freeing
part.

When called on a vmalloc primary address, the set_memory() calls will
try to first change the permissions on the vmalloc alias, then try to
change it on the direct map. Yea, it can fail from failing to allocate
a page from a split, but whatever memory managed to change earlier will
also succeed to change back on reset. The set_memory() functions don't
rollback on failure. So all the modules callers depend on this logic of
a second resetting call to reset anything that succeeded to change on
the first call. The split will have already happened for any memory
that succeeded to change, and the order of the changes is the same.

As for the primary alias, for 4k vmallocs, it will always succeed, and
set_memory() does this part first, so set_memory_x() will
(crucially) always succeed for kernel text mappings. For 2MB vmalloc
pages, the primary alias should succeed if the set_memory() call is 2MB
aligned.

So pre-VM_FLUSH_RESET_PERMS (and it also has pretty similar logic),
they all went something like this:
Allocate:
1. ptr = vmalloc();
2. set_memory_ro(ptr); <-Could fail halfway though

Free:
3. set_memory_rw(ptr); <-Could also fail halfway though and return an 
                         error, but only after the split work done in 
                         step 2.
4. vfree(ptr);

It's pretty horrible. BPF people once tried to be more proper and
change it to:
ptr = vmalloc()
if (set_memory_ro())
{
     vfree(ptr);
}

It looks correct, but this caused RO pages to be freed if
set_memory_ro() half succeeded in changing permissions. If there is an
error on reset, it means the first set_memory() call failed to change
everything, but anything that succeeded is reset. So the right thing to
do is to free the pages in either case.

We could fail to allocate a pack if the initial set_memory() calls
failed, but we still have to call set_memory_rw(), etc on free and
ignore any errors.

As an aside, this is one of the reasons it's hard to improve things
incrementally here. Everything is carefully balanced like a house of
cards. The solution IMO, is to change the interface so this type of
logic is in one place instead of scattered in the callers.
Edgecombe, Rick P June 17, 2022, 11:05 p.m. UTC | #3
On Fri, 2022-05-20 at 16:57 -0700, Song Liu wrote:
> Use module_alloc_huge for bpf_prog_pack so that BPF programs sit on
> PMD_SIZE pages. This benefits system performance by reducing iTLB
> miss
> rate. Benchmark of a real web service workload shows this change
> gives
> another ~0.2% performance boost on top of PAGE_SIZE bpf_prog_pack
> (which improve system throughput by ~0.5%).
> 
> Also, remove set_vm_flush_reset_perms() from alloc_new_pack() and use
> set_memory_[nx|rw] in bpf_prog_pack_free(). This is because
> VM_FLUSH_RESET_PERMS does not work with huge pages yet. [1]
> 
> [1] 
> https://lore.kernel.org/bpf/aeeeaf0b7ec63fdba55d4834d2f524d8bf05b71b.camel@intel.com/
> Suggested-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
> Signed-off-by: Song Liu <song@kernel.org>

Hi,

Per request, just wanted to make it clear I'm personally on board with
side-stepping VM_FLUSH_RESET_PERMS here because the way things are
headed it may change anyway, and this already does a better job at
minimizing eBPF JIT kernel shootdowns than VM_FLUSH_RESET_PERMS did
when it was introduced. The warning added in the other patch will
prevent accidents like the first version of the series. So that aspect
seems ok to me.

In general I think this series still needs more x86/mm scrutiny though.

Thanks,

Rick
diff mbox series

Patch

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index cacd8684c3c4..b64d91fcb0ba 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -857,7 +857,7 @@  static size_t select_bpf_prog_pack_size(void)
 	void *ptr;
 
 	size = BPF_HPAGE_SIZE * num_online_nodes();
-	ptr = module_alloc(size);
+	ptr = module_alloc_huge(size);
 
 	/* Test whether we can get huge pages. If not just use PAGE_SIZE
 	 * packs.
@@ -881,7 +881,7 @@  static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_ins
 		       GFP_KERNEL);
 	if (!pack)
 		return NULL;
-	pack->ptr = module_alloc(bpf_prog_pack_size);
+	pack->ptr = module_alloc_huge(bpf_prog_pack_size);
 	if (!pack->ptr) {
 		kfree(pack);
 		return NULL;
@@ -890,7 +890,6 @@  static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_ins
 	bitmap_zero(pack->bitmap, bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE);
 	list_add_tail(&pack->list, &pack_list);
 
-	set_vm_flush_reset_perms(pack->ptr);
 	set_memory_ro((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE);
 	set_memory_x((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE);
 	return pack;
@@ -909,10 +908,9 @@  static void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insn
 
 	if (size > bpf_prog_pack_size) {
 		size = round_up(size, PAGE_SIZE);
-		ptr = module_alloc(size);
+		ptr = module_alloc_huge(size);
 		if (ptr) {
 			bpf_fill_ill_insns(ptr, size);
-			set_vm_flush_reset_perms(ptr);
 			set_memory_ro((unsigned long)ptr, size / PAGE_SIZE);
 			set_memory_x((unsigned long)ptr, size / PAGE_SIZE);
 		}
@@ -949,6 +947,8 @@  static void bpf_prog_pack_free(struct bpf_binary_header *hdr)
 
 	mutex_lock(&pack_mutex);
 	if (hdr->size > bpf_prog_pack_size) {
+		set_memory_nx((unsigned long)hdr, hdr->size / PAGE_SIZE);
+		set_memory_rw((unsigned long)hdr, hdr->size / PAGE_SIZE);
 		module_memfree(hdr);
 		goto out;
 	}
@@ -975,6 +975,8 @@  static void bpf_prog_pack_free(struct bpf_binary_header *hdr)
 	if (bitmap_find_next_zero_area(pack->bitmap, bpf_prog_chunk_count(), 0,
 				       bpf_prog_chunk_count(), 0) == 0) {
 		list_del(&pack->list);
+		set_memory_nx((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE);
+		set_memory_rw((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE);
 		module_memfree(pack->ptr);
 		kfree(pack);
 	}