
[RFC,4/5] vmalloc_exec: share a huge page with kernel text

Message ID 20220818224218.2399791-5-song@kernel.org (mailing list archive)
State New
Series vmalloc_exec for modules and BPF programs

Commit Message

Song Liu Aug. 18, 2022, 10:42 p.m. UTC
On x86 kernels, we allocate 2MB pages for kernel text up to
round_down(_etext, 2MB). Therefore, some of the kernel text is still
on 4kB pages. With vmalloc_exec, we can allocate 2MB pages up to
round_up(_etext, 2MB), and use the rest of the page for modules and
BPF programs.

Here is an example:

[root@eth50-1 ~]# grep _etext /proc/kallsyms
ffffffff82202a08 T _etext

[root@eth50-1 ~]# grep bpf_prog_ /proc/kallsyms  | tail -n 3
ffffffff8220f920 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup       [bpf]
ffffffff8220fa28 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup_new   [bpf]
ffffffff8220fad4 t bpf_prog_3bf73fa16f5e3d92_handle__sched_switch       [bpf]

[root@eth50-1 ~]#  grep 0xffffffff82200000 /sys/kernel/debug/page_tables/kernel
0xffffffff82200000-0xffffffff82400000     2M     ro   PSE         x  pmd

[root@eth50-1 ~]# grep xfs_flush_inodes /proc/kallsyms
ffffffff822ba910 t xfs_flush_inodes_worker      [xfs]
ffffffff822bc580 t xfs_flush_inodes     [xfs]

ffffffff82200000-ffffffff82400000 is a 2MB page, serving kernel text, xfs
module, and bpf programs.
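
For illustration (not part of the patch), the alignment arithmetic for this
example as a small standalone userspace program; the _etext value is taken
from the /proc/kallsyms output above:

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define PMD_SIZE	(2UL << 20)
#define PMD_MASK	(~(PMD_SIZE - 1))

#define PFN_ALIGN(x)	(((unsigned long)(x) + (PAGE_SIZE - 1)) & PAGE_MASK)
#define PMD_ALIGN(x)	(((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK)

int main(void)
{
	unsigned long etext = 0xffffffff82202a08UL;	/* _etext above */
	unsigned long tail_start = PFN_ALIGN(etext);	/* 0xffffffff82203000 */
	unsigned long tail_end = PMD_ALIGN(etext);	/* 0xffffffff82400000 */

	/* ~2036 kB of the last text page left for modules and BPF programs */
	printf("tail %#lx-%#lx, %lu kB\n", tail_start, tail_end,
	       (tail_end - tail_start) >> 10);
	return 0;
}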
---
 arch/x86/mm/init_64.c |  3 ++-
 mm/vmalloc.c          | 27 +++++++++++++++++++++++++++
 2 files changed, 29 insertions(+), 1 deletion(-)

Comments

Luis Chamberlain Oct. 6, 2022, 11:44 p.m. UTC | #1
On Thu, Aug 18, 2022 at 03:42:17PM -0700, Song Liu wrote:
> On x86 kernels, we allocate 2MB pages for kernel text up to
> round_down(_etext, 2MB). Therefore, some of the kernel text is still
> on 4kB pages. With vmalloc_exec, we can allocate 2MB pages up to
> round_up(_etext, 2MB), and use the rest of the page for modules and
> BPF programs.
> 
> Here is an example:
> 
> [root@eth50-1 ~]# grep _etext /proc/kallsyms
> ffffffff82202a08 T _etext
> 
> [root@eth50-1 ~]# grep bpf_prog_ /proc/kallsyms  | tail -n 3
> ffffffff8220f920 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup       [bpf]
> ffffffff8220fa28 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup_new   [bpf]
> ffffffff8220fad4 t bpf_prog_3bf73fa16f5e3d92_handle__sched_switch       [bpf]
> 
> [root@eth50-1 ~]#  grep 0xffffffff82200000 /sys/kernel/debug/page_tables/kernel
> 0xffffffff82200000-0xffffffff82400000     2M     ro   PSE         x  pmd
> 
> [root@eth50-1 ~]# grep xfs_flush_inodes /proc/kallsyms
> ffffffff822ba910 t xfs_flush_inodes_worker      [xfs]
> ffffffff822bc580 t xfs_flush_inodes     [xfs]
> 
> ffffffff82200000-ffffffff82400000 is a 2MB page, serving kernel text, xfs
> module, and bpf programs.

This is pretty rad. I'm not sure how you were able to squeeze xfs and
*more* into one 2 MiB huge page, though; at least on debian 5.17.0-1-amd64
xfs is 3.6847 MiB. How big is your XFS module?

I don't grok mm stuff, but I'd like to understand why we gain the ability
to re-use the same 2 MiB page with this patch; from the code I really
can't tell. Any pointers?

But I'm still concerned about the freeing case in terms of
fragmentation for contiguous memory, when free huge pages are available.

  Luis

> ---
>  arch/x86/mm/init_64.c |  3 ++-
>  mm/vmalloc.c          | 27 +++++++++++++++++++++++++++
>  2 files changed, 29 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> index 39c5246964a9..d27d0af5beb5 100644
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -1367,12 +1367,13 @@ int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
>  
>  int kernel_set_to_readonly;
>  
> +#define PMD_ALIGN(x)	(((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK)
>  void mark_rodata_ro(void)
>  {
>  	unsigned long start = PFN_ALIGN(_text);
>  	unsigned long rodata_start = PFN_ALIGN(__start_rodata);
>  	unsigned long end = (unsigned long)__end_rodata_hpage_align;
> -	unsigned long text_end = PFN_ALIGN(_etext);
> +	unsigned long text_end = PMD_ALIGN(_etext);
>  	unsigned long rodata_end = PFN_ALIGN(__end_rodata);
>  	unsigned long all_end;
>  
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 472287e71bf1..5f3b5df9313f 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -72,6 +72,11 @@ early_param("nohugevmalloc", set_nohugevmalloc);
>  static const bool vmap_allow_huge = false;
>  #endif	/* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
>  
> +#define PMD_ALIGN(x)	(((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK)
> +
> +static struct vm_struct text_tail_vm;
> +static struct vmap_area text_tail_va;
> +
>  bool is_vmalloc_addr(const void *x)
>  {
>  	unsigned long addr = (unsigned long)kasan_reset_tag(x);
> @@ -634,6 +639,8 @@ int is_vmalloc_or_module_addr(const void *x)
>  	unsigned long addr = (unsigned long)kasan_reset_tag(x);
>  	if (addr >= MODULES_VADDR && addr < MODULES_END)
>  		return 1;
> +	if (addr >= text_tail_va.va_start && addr < text_tail_va.va_end)
> +		return 1;
>  #endif
>  	return is_vmalloc_addr(x);
>  }
> @@ -2371,6 +2378,25 @@ static void vmap_init_free_space(void)
>  	}
>  }
>  
> +static void register_text_tail_vm(void)
> +{
> +	unsigned long start = PFN_ALIGN(_etext);
> +	unsigned long end = PMD_ALIGN(_etext);
> +	struct vmap_area *va;
> +
> +	va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
> +	if (WARN_ON_ONCE(!va))
> +		return;
> +	text_tail_vm.addr = (void *)start;
> +	text_tail_vm.size = end - start;
> +	text_tail_vm.flags = VM_KERNEL_EXEC;
> +	text_tail_va.va_start = start;
> +	text_tail_va.va_end = end;
> +	text_tail_va.vm = &text_tail_vm;
> +	memcpy(va, &text_tail_va, sizeof(*va));
> +	insert_vmap_area(va, &free_text_area_root, &free_text_area_list);
> +}
> +
>  void __init vmalloc_init(void)
>  {
>  	struct vmap_area *va;
> @@ -2381,6 +2407,7 @@ void __init vmalloc_init(void)
>  	 * Create the cache for vmap_area objects.
>  	 */
>  	vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
> +	register_text_tail_vm();
>  
>  	for_each_possible_cpu(i) {
>  		struct vmap_block_queue *vbq;
> -- 
> 2.30.2
>
Song Liu Oct. 7, 2022, 6:53 a.m. UTC | #2
> On Oct 6, 2022, at 4:44 PM, Luis Chamberlain <mcgrof@kernel.org> wrote:
> 
> On Thu, Aug 18, 2022 at 03:42:17PM -0700, Song Liu wrote:
>> On x86 kernels, we allocate 2MB pages for kernel text up to
>> round_down(_etext, 2MB). Therefore, some of the kernel text is still
>> on 4kB pages. With vmalloc_exec, we can allocate 2MB pages up to
>> round_up(_etext, 2MB), and use the rest of the page for modules and
>> BPF programs.
>> 
>> Here is an example:
>> 
>> [root@eth50-1 ~]# grep _etext /proc/kallsyms
>> ffffffff82202a08 T _etext
>> 
>> [root@eth50-1 ~]# grep bpf_prog_ /proc/kallsyms  | tail -n 3
>> ffffffff8220f920 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup       [bpf]
>> ffffffff8220fa28 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup_new   [bpf]
>> ffffffff8220fad4 t bpf_prog_3bf73fa16f5e3d92_handle__sched_switch       [bpf]
>> 
>> [root@eth50-1 ~]#  grep 0xffffffff82200000 /sys/kernel/debug/page_tables/kernel
>> 0xffffffff82200000-0xffffffff82400000     2M     ro   PSE         x  pmd
>> 
>> [root@eth50-1 ~]# grep xfs_flush_inodes /proc/kallsyms
>> ffffffff822ba910 t xfs_flush_inodes_worker      [xfs]
>> ffffffff822bc580 t xfs_flush_inodes     [xfs]
>> 
>> ffffffff82200000-ffffffff82400000 is a 2MB page, serving kernel text, xfs
>> module, and bpf programs.
> 
> This is pretty rad. I'm not sure how you were able to squeeze xfs and
> *more* into one 2 MiB huge page, though; at least on debian 5.17.0-1-amd64
> xfs is 3.6847 MiB. How big is your XFS module?

In my build, xfs.ko is 50MB before strip, and 3.1MB after strip. But the
text section is about 1.3MB, so it fits in one 2MB page. 

> 
> I don't grok mm stuff, but I'd like to understand why we gain the ability
> to re-use the same 2 MiB page with this patch; from the code I really
> can't tell. Any pointers?

I don't quite follow the question here. In this case, we allocate one more
2MB page, so that some static kernel text can use it, and share it with
dynamic kernel text. Does this answer your question?
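
To expand a bit: the sharing works because the patch changes two things
together. mark_rodata_ro() now treats everything up to PMD_ALIGN(_etext) as
text, so the whole 2MB page containing _etext keeps the single RO+X pmd
mapping shown in the page table dump above; and register_text_tail_vm()
hands the unused tail of that page to vmalloc as a free text area. A
simplified sketch of that second half (not the exact code, see the patch
below):

	/* tail of the last kernel text PMD, unused by static text */
	unsigned long start = PFN_ALIGN(_etext);
	unsigned long end = PMD_ALIGN(_etext);

	/*
	 * Record [start, end) as free executable space, so later
	 * vmalloc_exec() allocations (modules, BPF programs) are carved
	 * out of the same 2MB page instead of new 4kB mappings.
	 */
	text_tail_va.va_start = start;
	text_tail_va.va_end = end;
	insert_vmap_area(va, &free_text_area_root, &free_text_area_list);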

I am working on a newer version of this. I am planning to resend when it 
is stable for BPF programs. For modules, I think we will need more 
discussion about the interface with arch code. 

Thanks,
Song

> 
> But I'm still concerned about the freeing case in terms of
> fragmentation for contiguous memory, when free huge pages are available.
> 
>  Luis
> 
>> ---
>> arch/x86/mm/init_64.c |  3 ++-
>> mm/vmalloc.c          | 27 +++++++++++++++++++++++++++
>> 2 files changed, 29 insertions(+), 1 deletion(-)
>> 
>> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
>> index 39c5246964a9..d27d0af5beb5 100644
>> --- a/arch/x86/mm/init_64.c
>> +++ b/arch/x86/mm/init_64.c
>> @@ -1367,12 +1367,13 @@ int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
>> 
>> int kernel_set_to_readonly;
>> 
>> +#define PMD_ALIGN(x)	(((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK)
>> void mark_rodata_ro(void)
>> {
>> 	unsigned long start = PFN_ALIGN(_text);
>> 	unsigned long rodata_start = PFN_ALIGN(__start_rodata);
>> 	unsigned long end = (unsigned long)__end_rodata_hpage_align;
>> -	unsigned long text_end = PFN_ALIGN(_etext);
>> +	unsigned long text_end = PMD_ALIGN(_etext);
>> 	unsigned long rodata_end = PFN_ALIGN(__end_rodata);
>> 	unsigned long all_end;
>> 
>> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
>> index 472287e71bf1..5f3b5df9313f 100644
>> --- a/mm/vmalloc.c
>> +++ b/mm/vmalloc.c
>> @@ -72,6 +72,11 @@ early_param("nohugevmalloc", set_nohugevmalloc);
>> static const bool vmap_allow_huge = false;
>> #endif	/* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
>> 
>> +#define PMD_ALIGN(x)	(((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK)
>> +
>> +static struct vm_struct text_tail_vm;
>> +static struct vmap_area text_tail_va;
>> +
>> bool is_vmalloc_addr(const void *x)
>> {
>> 	unsigned long addr = (unsigned long)kasan_reset_tag(x);
>> @@ -634,6 +639,8 @@ int is_vmalloc_or_module_addr(const void *x)
>> 	unsigned long addr = (unsigned long)kasan_reset_tag(x);
>> 	if (addr >= MODULES_VADDR && addr < MODULES_END)
>> 		return 1;
>> +	if (addr >= text_tail_va.va_start && addr < text_tail_va.va_end)
>> +		return 1;
>> #endif
>> 	return is_vmalloc_addr(x);
>> }
>> @@ -2371,6 +2378,25 @@ static void vmap_init_free_space(void)
>> 	}
>> }
>> 
>> +static void register_text_tail_vm(void)
>> +{
>> +	unsigned long start = PFN_ALIGN(_etext);
>> +	unsigned long end = PMD_ALIGN(_etext);
>> +	struct vmap_area *va;
>> +
>> +	va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
>> +	if (WARN_ON_ONCE(!va))
>> +		return;
>> +	text_tail_vm.addr = (void *)start;
>> +	text_tail_vm.size = end - start;
>> +	text_tail_vm.flags = VM_KERNEL_EXEC;
>> +	text_tail_va.va_start = start;
>> +	text_tail_va.va_end = end;
>> +	text_tail_va.vm = &text_tail_vm;
>> +	memcpy(va, &text_tail_va, sizeof(*va));
>> +	insert_vmap_area(va, &free_text_area_root, &free_text_area_list);
>> +}
>> +
>> void __init vmalloc_init(void)
>> {
>> 	struct vmap_area *va;
>> @@ -2381,6 +2407,7 @@ void __init vmalloc_init(void)
>> 	 * Create the cache for vmap_area objects.
>> 	 */
>> 	vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
>> +	register_text_tail_vm();
>> 
>> 	for_each_possible_cpu(i) {
>> 		struct vmap_block_queue *vbq;
>> -- 
>> 2.30.2
>>

Patch

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 39c5246964a9..d27d0af5beb5 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1367,12 +1367,13 @@  int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
 
 int kernel_set_to_readonly;
 
+#define PMD_ALIGN(x)	(((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK)
 void mark_rodata_ro(void)
 {
 	unsigned long start = PFN_ALIGN(_text);
 	unsigned long rodata_start = PFN_ALIGN(__start_rodata);
 	unsigned long end = (unsigned long)__end_rodata_hpage_align;
-	unsigned long text_end = PFN_ALIGN(_etext);
+	unsigned long text_end = PMD_ALIGN(_etext);
 	unsigned long rodata_end = PFN_ALIGN(__end_rodata);
 	unsigned long all_end;
 
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 472287e71bf1..5f3b5df9313f 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -72,6 +72,11 @@  early_param("nohugevmalloc", set_nohugevmalloc);
 static const bool vmap_allow_huge = false;
 #endif	/* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
 
+#define PMD_ALIGN(x)	(((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK)
+
+static struct vm_struct text_tail_vm;
+static struct vmap_area text_tail_va;
+
 bool is_vmalloc_addr(const void *x)
 {
 	unsigned long addr = (unsigned long)kasan_reset_tag(x);
@@ -634,6 +639,8 @@  int is_vmalloc_or_module_addr(const void *x)
 	unsigned long addr = (unsigned long)kasan_reset_tag(x);
 	if (addr >= MODULES_VADDR && addr < MODULES_END)
 		return 1;
+	if (addr >= text_tail_va.va_start && addr < text_tail_va.va_end)
+		return 1;
 #endif
 	return is_vmalloc_addr(x);
 }
@@ -2371,6 +2378,25 @@  static void vmap_init_free_space(void)
 	}
 }
 
+static void register_text_tail_vm(void)
+{
+	unsigned long start = PFN_ALIGN(_etext);
+	unsigned long end = PMD_ALIGN(_etext);
+	struct vmap_area *va;
+
+	va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
+	if (WARN_ON_ONCE(!va))
+		return;
+	text_tail_vm.addr = (void *)start;
+	text_tail_vm.size = end - start;
+	text_tail_vm.flags = VM_KERNEL_EXEC;
+	text_tail_va.va_start = start;
+	text_tail_va.va_end = end;
+	text_tail_va.vm = &text_tail_vm;
+	memcpy(va, &text_tail_va, sizeof(*va));
+	insert_vmap_area(va, &free_text_area_root, &free_text_area_list);
+}
+
 void __init vmalloc_init(void)
 {
 	struct vmap_area *va;
@@ -2381,6 +2407,7 @@  void __init vmalloc_init(void)
 	 * Create the cache for vmap_area objects.
 	 */
 	vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
+	register_text_tail_vm();
 
 	for_each_possible_cpu(i) {
 		struct vmap_block_queue *vbq;