diff mbox series

[RFC,v2,4/4] vmalloc_exec: share a huge page with kernel text

Message ID 20221007234315.2877365-5-song@kernel.org (mailing list archive)
State New
Headers show
Series vmalloc_exec for modules and BPF programs | expand

Commit Message

Song Liu Oct. 7, 2022, 11:43 p.m. UTC
On x86 kernels, we allocate 2MB pages for kernel text up to
round_down(_etext, 2MB), so some of the kernel text still sits on
4kB pages. With vmalloc_exec, we can allocate 2MB pages up to
round_up(_etext, 2MB) and use the rest of that huge page for modules
and BPF programs.

Here is an example:

[root@eth50-1 ~]# grep _etext /proc/kallsyms
ffffffff82202a08 T _etext

[root@eth50-1 ~]# grep bpf_prog_ /proc/kallsyms  | tail -n 3
ffffffff8220f920 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup       [bpf]
ffffffff8220fa28 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup_new   [bpf]
ffffffff8220fad4 t bpf_prog_3bf73fa16f5e3d92_handle__sched_switch       [bpf]

[root@eth50-1 ~]#  grep 0xffffffff82200000 /sys/kernel/debug/page_tables/kernel
0xffffffff82200000-0xffffffff82400000     2M     ro   PSE         x  pmd

[root@eth50-1 ~]# grep xfs_flush_inodes /proc/kallsyms
ffffffff822ba910 t xfs_flush_inodes_worker      [xfs]
ffffffff822bc580 t xfs_flush_inodes     [xfs]

ffffffff82200000-ffffffff82400000 is a 2MB page serving kernel text, the
xfs module, and BPF programs.
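
For concreteness, here is the alignment arithmetic on the _etext value
shown above, as a standalone userspace sketch (the macros mirror the
ones added by this series; the printed values are derived by hand, not
from a kernel run):

#include <stdio.h>

#define PMD_SIZE	(1UL << 21)	/* 2MB, as on x86_64 */
#define PMD_MASK	(~(PMD_SIZE - 1))
#define PMD_ALIGN(x)	(((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK)
#define PMD_ALIGN_DOWN(x)	((unsigned long)(x) & PMD_MASK)

int main(void)
{
	unsigned long etext = 0xffffffff82202a08UL;	/* _etext from kallsyms */

	/* 0xffffffff82200000: where the 2MB text mappings stop today */
	printf("round_down(_etext, 2MB) = %#lx\n", PMD_ALIGN_DOWN(etext));
	/* 0xffffffff82400000: where they stop with this patch */
	printf("round_up(_etext, 2MB)   = %#lx\n", PMD_ALIGN(etext));
	/* 0x1fd5f8 bytes (~2MB) left over for modules and BPF programs */
	printf("tail bytes              = %#lx\n", PMD_ALIGN(etext) - etext);
	return 0;
}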

Signed-off-by: Song Liu <song@kernel.org>
---
 arch/x86/mm/init_64.c |  3 ++-
 mm/vmalloc.c          | 24 ++++++++++++++++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)

Comments

Edgecombe, Rick P Oct. 10, 2022, 6:32 p.m. UTC | #1
On Fri, 2022-10-07 at 16:43 -0700, Song Liu wrote:
> On x86 kernel, we allocate 2MB pages for kernel text up to
> round_down(_etext, 2MB). Therefore, some of the kernel text is still
> on 4kB pages. With vmalloc_exec, we can allocate 2MB pages up to
> round_up(_etext, 2MB), and use the rest of the page for modules and
> BPF programs.
> 
> Here is an example:
> 
> [root@eth50-1 ~]# grep _etext /proc/kallsyms
> ffffffff82202a08 T _etext
> 
> [root@eth50-1 ~]# grep bpf_prog_ /proc/kallsyms  | tail -n 3
> ffffffff8220f920 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup       [bpf]
> ffffffff8220fa28 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup_new   [bpf]
> ffffffff8220fad4 t bpf_prog_3bf73fa16f5e3d92_handle__sched_switch       [bpf]
> 
> [root@eth50-1 ~]#  grep 0xffffffff82200000 /sys/kernel/debug/page_tables/kernel
> 0xffffffff82200000-0xffffffff82400000     2M     ro   PSE         x  pmd
> 
> [root@eth50-1 ~]# grep xfs_flush_inodes /proc/kallsyms
> ffffffff822ba910 t xfs_flush_inodes_worker      [xfs]
> ffffffff822bc580 t xfs_flush_inodes     [xfs]
> 
> ffffffff82200000-ffffffff82400000 is a 2MB page, serving kernel text,
> xfs module, and bpf programs.

Can this memory range be freed as part of a vfree_exec() call then?
Does vmalloc actually try to unmap it? If so, it could get complicated
with PTI.

It probably should be a special case that never gets fully freed.

> 
> Signed-off-by: Song Liu <song@kernel.org>
> ---
>  arch/x86/mm/init_64.c |  3 ++-
>  mm/vmalloc.c          | 24 ++++++++++++++++++++++++
>  2 files changed, 26 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> index 0fe690ebc269..d94f196c541a 100644
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -1367,12 +1367,13 @@ int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
>  
>  int kernel_set_to_readonly;
>  
> +#define PMD_ALIGN(x)	(((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK)
>  void mark_rodata_ro(void)
>  {
>  	unsigned long start = PFN_ALIGN(_text);
>  	unsigned long rodata_start = PFN_ALIGN(__start_rodata);
>  	unsigned long end = (unsigned long)__end_rodata_hpage_align;
> -	unsigned long text_end = PFN_ALIGN(_etext);
> +	unsigned long text_end = PMD_ALIGN(_etext);

This should probably have more logic and adjustments. If etext is PMD
aligned, some of the stuff outside the diff won't do anything.

Also, if a kernel doesn't have modules or BPF JIT it would be a waste
of memory.

>  	unsigned long rodata_end = PFN_ALIGN(__end_rodata);
>  	unsigned long all_end;
>  
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 9212ff96b871..41509bbec583 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -75,6 +75,9 @@ static const bool vmap_allow_huge = false;
>  #define PMD_ALIGN(addr) ALIGN(addr, PMD_SIZE)
>  #define PMD_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PMD_SIZE)
>  
> +static struct vm_struct text_tail_vm;
> +static struct vmap_area text_tail_va;
> +
>  bool is_vmalloc_addr(const void *x)
>  {
>  	unsigned long addr = (unsigned long)kasan_reset_tag(x);
> @@ -637,6 +640,8 @@ int is_vmalloc_or_module_addr(const void *x)
>  	unsigned long addr = (unsigned long)kasan_reset_tag(x);
>  	if (addr >= MODULES_VADDR && addr < MODULES_END)
>  		return 1;
> +	if (addr >= text_tail_va.va_start && addr < text_tail_va.va_end)
> +		return 1;
>  #endif
>  	return is_vmalloc_addr(x);
>  }
> @@ -2422,6 +2427,24 @@ static void vmap_init_free_space(void)
>  	}
>  }
>  
> +static void register_text_tail_vm(void)
> +{
> +	unsigned long start = PFN_ALIGN((unsigned long)_etext);
> +	unsigned long end = PMD_ALIGN((unsigned long)_etext);
> +	struct vmap_area *va;
> +
> +	va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
> +	if (WARN_ON_ONCE(!va))
> +		return;
> +	text_tail_vm.addr = (void *)start;
> +	text_tail_vm.size = end - start;
> +	text_tail_va.va_start = start;
> +	text_tail_va.va_end = end;
> +	text_tail_va.vm = &text_tail_vm;
> +	memcpy(va, &text_tail_va, sizeof(*va));
> +	insert_vmap_area_augment(va, NULL, &free_text_area_root, &free_text_area_list);
> +}
> +
>  void __init vmalloc_init(void)
>  {
>  	struct vmap_area *va;
> @@ -2432,6 +2455,7 @@ void __init vmalloc_init(void)
>  	 * Create the cache for vmap_area objects.
>  	 */
>  	vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
> +	register_text_tail_vm();
>  
>  	for_each_possible_cpu(i) {
>  		struct vmap_block_queue *vbq;
Song Liu Oct. 10, 2022, 7:08 p.m. UTC | #2
> On Oct 10, 2022, at 11:32 AM, Edgecombe, Rick P <rick.p.edgecombe@intel.com> wrote:
> 
> On Fri, 2022-10-07 at 16:43 -0700, Song Liu wrote:
>> On x86 kernel, we allocate 2MB pages for kernel text up to
>> round_down(_etext, 2MB). Therefore, some of the kernel text is still
>> on 4kB pages. With vmalloc_exec, we can allocate 2MB pages up to
>> round_up(_etext, 2MB), and use the rest of the page for modules and
>> BPF programs.
>> 
>> Here is an example:
>> 
>> [root@eth50-1 ~]# grep _etext /proc/kallsyms
>> ffffffff82202a08 T _etext
>> 
>> [root@eth50-1 ~]# grep bpf_prog_ /proc/kallsyms  | tail -n 3
>> ffffffff8220f920 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup       [bpf]
>> ffffffff8220fa28 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup_new   [bpf]
>> ffffffff8220fad4 t bpf_prog_3bf73fa16f5e3d92_handle__sched_switch       [bpf]
>> 
>> [root@eth50-1 ~]#  grep 0xffffffff82200000 /sys/kernel/debug/page_tables/kernel
>> 0xffffffff82200000-0xffffffff82400000     2M     ro   PSE         x  pmd
>> 
>> [root@eth50-1 ~]# grep xfs_flush_inodes /proc/kallsyms
>> ffffffff822ba910 t xfs_flush_inodes_worker      [xfs]
>> ffffffff822bc580 t xfs_flush_inodes     [xfs]
>> 
>> ffffffff82200000-ffffffff82400000 is a 2MB page, serving kernel text,
>> xfs module, and bpf programs.
> 
> Can this memory range be freed as part of a vfree_exec() call then?
> Does vmalloc actually try to unmap it? If so, it could get complicated
> with PTI.
> 
> It probably should be a special case that never gets fully freed.

Right, this is never freed. 
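
For illustration, a minimal sketch of a guard that could make the
invariant explicit; in_text_tail() is a hypothetical helper, not part
of this series:

/* Hypothetical, not in the posted patches: the text-tail range is
 * seeded into the free_text_area_ tree once at boot, and vfree_exec()
 * only ever returns subranges to that tree, so the underlying 2MB
 * mapping is never unmapped. A check like this could assert that.
 */
static bool in_text_tail(unsigned long addr)
{
	return addr >= text_tail_va.va_start && addr < text_tail_va.va_end;
}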

> 
>> 
>> Signed-off-by: Song Liu <song@kernel.org>
>> ---
>> arch/x86/mm/init_64.c |  3 ++-
>> mm/vmalloc.c          | 24 ++++++++++++++++++++++++
>> 2 files changed, 26 insertions(+), 1 deletion(-)
>> 
>> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
>> index 0fe690ebc269..d94f196c541a 100644
>> --- a/arch/x86/mm/init_64.c
>> +++ b/arch/x86/mm/init_64.c
>> @@ -1367,12 +1367,13 @@ int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
>> 
>> int kernel_set_to_readonly;
>> 
>> +#define PMD_ALIGN(x)	(((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK)
>> void mark_rodata_ro(void)
>> {
>> 	unsigned long start = PFN_ALIGN(_text);
>> 	unsigned long rodata_start = PFN_ALIGN(__start_rodata);
>> 	unsigned long end = (unsigned long)__end_rodata_hpage_align;
>> -	unsigned long text_end = PFN_ALIGN(_etext);
>> +	unsigned long text_end = PMD_ALIGN(_etext);
> 
> This should probably have more logic and adjustments. If etext is PMD
> aligned, some of the stuff outside the diff won't do anything.

Hmm... I don't quite follow this comment. If _etext is PMD aligned,
we can still use vmalloc_exec to allocate memory, so it shouldn't
matter, no?

> 
> Also, if a kernel doesn't have modules or BPF JIT it would be a waste
> of memory.

I guess we can add a command line argument for these corner cases? 
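
A minimal sketch of such a switch, using the kernel's early_param()
hook; the flag name and the wiring are hypothetical:

/* Hypothetical boot flag: skip reserving the text tail on kernels
 * that will never load modules or JIT BPF programs.
 */
static bool no_text_tail __initdata;

static int __init parse_no_text_tail(char *str)
{
	no_text_tail = true;
	return 0;
}
early_param("no_text_tail", parse_no_text_tail);

register_text_tail_vm() would then return early when the flag is set.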

Thanks,
Song

> 
>> 	unsigned long rodata_end = PFN_ALIGN(__end_rodata);
>> 	unsigned long all_end;
>> 
>> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
>> index 9212ff96b871..41509bbec583 100644
>> --- a/mm/vmalloc.c
>> +++ b/mm/vmalloc.c
>> @@ -75,6 +75,9 @@ static const bool vmap_allow_huge = false;
>> #define PMD_ALIGN(addr) ALIGN(addr, PMD_SIZE)
>> #define PMD_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PMD_SIZE)
>> 
>> +static struct vm_struct text_tail_vm;
>> +static struct vmap_area text_tail_va;
>> +
>> bool is_vmalloc_addr(const void *x)
>> {
>> 	unsigned long addr = (unsigned long)kasan_reset_tag(x);
>> @@ -637,6 +640,8 @@ int is_vmalloc_or_module_addr(const void *x)
>> 	unsigned long addr = (unsigned long)kasan_reset_tag(x);
>> 	if (addr >= MODULES_VADDR && addr < MODULES_END)
>> 		return 1;
>> +	if (addr >= text_tail_va.va_start && addr < text_tail_va.va_end)
>> +		return 1;
>> #endif
>> 	return is_vmalloc_addr(x);
>> }
>> @@ -2422,6 +2427,24 @@ static void vmap_init_free_space(void)
>> 	}
>> }
>> 
>> +static void register_text_tail_vm(void)
>> +{
>> +	unsigned long start = PFN_ALIGN((unsigned long)_etext);
>> +	unsigned long end = PMD_ALIGN((unsigned long)_etext);
>> +	struct vmap_area *va;
>> +
>> +	va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
>> +	if (WARN_ON_ONCE(!va))
>> +		return;
>> +	text_tail_vm.addr = (void *)start;
>> +	text_tail_vm.size = end - start;
>> +	text_tail_va.va_start = start;
>> +	text_tail_va.va_end = end;
>> +	text_tail_va.vm = &text_tail_vm;
>> +	memcpy(va, &text_tail_va, sizeof(*va));
>> +	insert_vmap_area_augment(va, NULL, &free_text_area_root, &free_text_area_list);
>> +}
>> +
>> void __init vmalloc_init(void)
>> {
>> 	struct vmap_area *va;
>> @@ -2432,6 +2455,7 @@ void __init vmalloc_init(void)
>> 	 * Create the cache for vmap_area objects.
>> 	 */
>> 	vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
>> +	register_text_tail_vm();
>> 
>> 	for_each_possible_cpu(i) {
>> 		struct vmap_block_queue *vbq;
Edgecombe, Rick P Oct. 10, 2022, 8:09 p.m. UTC | #3
On Mon, 2022-10-10 at 19:08 +0000, Song Liu wrote:
> > On Oct 10, 2022, at 11:32 AM, Edgecombe, Rick P <
> > rick.p.edgecombe@intel.com> wrote:
> > 
> > On Fri, 2022-10-07 at 16:43 -0700, Song Liu wrote:
> > > On x86 kernel, we allocate 2MB pages for kernel text up to
> > > round_down(_etext, 2MB). Therefore, some of the kernel text is still
> > > on 4kB pages. With vmalloc_exec, we can allocate 2MB pages up to
> > > round_up(_etext, 2MB), and use the rest of the page for modules and
> > > BPF programs.
> > > 
> > > Here is an example:
> > > 
> > > [root@eth50-1 ~]# grep _etext /proc/kallsyms
> > > ffffffff82202a08 T _etext
> > > 
> > > [root@eth50-1 ~]# grep bpf_prog_ /proc/kallsyms  | tail -n 3
> > > ffffffff8220f920 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup       [bpf]
> > > ffffffff8220fa28 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup_new   [bpf]
> > > ffffffff8220fad4 t bpf_prog_3bf73fa16f5e3d92_handle__sched_switch       [bpf]
> > > 
> > > [root@eth50-1 ~]#  grep 0xffffffff82200000 /sys/kernel/debug/page_tables/kernel
> > > 0xffffffff82200000-0xffffffff82400000     2M     ro   PSE         x  pmd
> > > 
> > > [root@eth50-1 ~]# grep xfs_flush_inodes /proc/kallsyms
> > > ffffffff822ba910 t xfs_flush_inodes_worker      [xfs]
> > > ffffffff822bc580 t xfs_flush_inodes     [xfs]
> > > 
> > > ffffffff82200000-ffffffff82400000 is a 2MB page, serving kernel text,
> > > xfs module, and bpf programs.
> > 
> > Can this memory range be freed as part of a vfree_exec() call then?
> > Does vmalloc actually try to unmap it? If so, it could get complicated
> > with PTI.
> > 
> > It probably should be a special case that never gets fully freed.
> 
> Right, this is never freed. 

Can we get a comment somewhere highlighting how this is avoided?

Maybe this is just me missing some vmalloc understanding, but this
pointer to an all zero vm_struct seems weird too. Are there other vmap
allocations like this? Which vmap APIs work with this and which don't?

> 
> > 
> > > 
> > > Signed-off-by: Song Liu <song@kernel.org>
> > > ---
> > > arch/x86/mm/init_64.c |  3 ++-
> > > mm/vmalloc.c          | 24 ++++++++++++++++++++++++
> > > 2 files changed, 26 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> > > index 0fe690ebc269..d94f196c541a 100644
> > > --- a/arch/x86/mm/init_64.c
> > > +++ b/arch/x86/mm/init_64.c
> > > @@ -1367,12 +1367,13 @@ int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
> > > 
> > > int kernel_set_to_readonly;
> > > 
> > > +#define PMD_ALIGN(x)	(((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK)
> > > void mark_rodata_ro(void)
> > > {
> > >       unsigned long start = PFN_ALIGN(_text);
> > >       unsigned long rodata_start = PFN_ALIGN(__start_rodata);
> > >       unsigned long end = (unsigned long)__end_rodata_hpage_align;
> > > -    unsigned long text_end = PFN_ALIGN(_etext);
> > > +    unsigned long text_end = PMD_ALIGN(_etext);
> > 
> > This should probably have more logic and adjustments. If etext is PMD
> > aligned, some of the stuff outside the diff won't do anything.
> 
> Hmm... I don't quite follow this comment. If _etext is PMD aligned,
> we can still use vmalloc_exec to allocate memory, so it shouldn't
> matter, no?

Maybe this doesn't matter, since PMD alignment must happen naturally
sometimes. I was just noticing the attempts to operate on the region
between _etext and __start_rodata (free_init_pages(), etc.). If that
region were now always PMD aligned, those could be dropped. But if you
are going to adjust the behavior for !CONFIG_MODULES, etc., then they
are still needed.
Edgecombe, Rick P Oct. 11, 2022, 8:40 p.m. UTC | #4
On Tue, 2022-10-11 at 16:25 +0000, Song Liu wrote:
> > Maybe this is just me missing some vmalloc understanding, but this
> > pointer to an all zero vm_struct seems weird too. Are there other vmap
> > allocations like this? Which vmap APIs work with this and which
> > don't?
> 
> There are two vmap trees at the moment: the free_area_ tree and the
> vmap_area_ tree. The free_area_ tree uses vmap->subtree_max_size, while
> the vmap_area_ tree contains vmap_areas backed by a vm_struct, and thus
> uses vmap->vm.
> 
> This set adds a new tree, free_text_area_. This tree is different from
> the other two: it uses subtree_max_size, and it is also backed by a
> vm_struct. To handle this requirement without growing struct vmap_area,
> we introduced all_text_vm to store the vm_struct for the free_text_area_
> tree.
> 
> The free_text_area_ tree also differs from the vmap_area_ tree in that
> each vmap_area in the vmap_area_ tree has its own vm_struct (a 1:1
> mapping), while multiple vmap_areas in the free_text_area_ tree map to
> a single vm_struct.
> 
> Also, free_text_area_ handles granularity < PAGE_SIZE, while the other
> two trees only work with PAGE_SIZE-aligned memory.
> 
> Does this answer your questions? 

I mean from the perspective of someone trying to use this without
diving into the entire implementation.

The function is called vmalloc_exec() and is freed with vfree_exec().
Makes sense. But with the other vmalloc_foo()s (including previous
vmalloc_exec() implementations) you can call find_vm_area(), etc. on
them. They show up in "vmallocinfo" and generally behave similarly. That
isn't true for these new allocations, right?

Then you have code that operates on module text like:
if (is_vmalloc_or_module_addr(addr))
	pfn = vmalloc_to_pfn(addr);

It looks like it would work (on x86 at least). Should it be expected
to?

Especially after this patch, where there is memory that isn't even
tracked by the original vmap_area trees, it is pretty much a separate
allocator. So I think it might be nice to spell out which other vmalloc
APIs work with these new functions since they are named "vmalloc".
Maybe just say none of them do.


Separate from that, I guess you are planning to make this limited to
certain architectures? It might be better to put logic with assumptions
about x86 boot time page table details inside arch/x86 somewhere.
Song Liu Oct. 12, 2022, 5:37 a.m. UTC | #5
> On Oct 11, 2022, at 1:40 PM, Edgecombe, Rick P <rick.p.edgecombe@intel.com> wrote:
> 
> On Tue, 2022-10-11 at 16:25 +0000, Song Liu wrote:
>>> Maybe this is just me missing some vmalloc understanding, but this
>>> pointer to an all zero vm_struct seems weird too. Are there other vmap
>>> allocations like this? Which vmap APIs work with this and which
>>> don't?
>> 
>> There are two vmap trees at the moment: the free_area_ tree and the
>> vmap_area_ tree. The free_area_ tree uses vmap->subtree_max_size, while
>> the vmap_area_ tree contains vmap_areas backed by a vm_struct, and thus
>> uses vmap->vm.
>> 
>> This set adds a new tree, free_text_area_. This tree is different from
>> the other two: it uses subtree_max_size, and it is also backed by a
>> vm_struct. To handle this requirement without growing struct vmap_area,
>> we introduced all_text_vm to store the vm_struct for the free_text_area_
>> tree.
>> 
>> The free_text_area_ tree also differs from the vmap_area_ tree in that
>> each vmap_area in the vmap_area_ tree has its own vm_struct (a 1:1
>> mapping), while multiple vmap_areas in the free_text_area_ tree map to
>> a single vm_struct.
>> 
>> Also, free_text_area_ handles granularity < PAGE_SIZE, while the other
>> two trees only work with PAGE_SIZE-aligned memory.
>> 
>> Does this answer your questions? 
> 
> I mean from the perspective of someone trying to use this without
> diving into the entire implementation.
> 
> The function is called vmalloc_exec() and is freed with vfree_exec().
> Makes sense. But with the other vmalloc_foo()s (including previous
> vmalloc_exec() implementations) you can call find_vm_area(), etc. on
> them. They show up in "vmallocinfo" and generally behave similarly. That
> isn't true for these new allocations, right?

That's right. These operations are not supported (at least for now). 

> 
> Then you have code that operates on module text like:
> if (is_vmalloc_or_module_addr(addr))
> 	pfn = vmalloc_to_pfn(addr);
> 
> It looks like it would work (on x86 at least). Should it be expected
> to?
> 
> Especially after this patch, where there is memory that isn't even
> tracked by the original vmap_area trees, it is pretty much a separate
> allocator. So I think it might be nice to spell out which other vmalloc
> APIs work with these new functions since they are named "vmalloc".
> Maybe just say none of them do.

I guess it is fair to call this a separate allocator. Maybe 
vmalloc_exec is not the right name? I do think this is the best 
way to build an allocator with vmap tree logic. 

> 
> 
> Separate from that, I guess you are planning to make this limited to
> certain architectures? It might be better to put logic with assumptions
> about x86 boot time page table details inside arch/x86 somewhere.

Yes, the architecture needs some text_poke mechanism to use this.
On the BPF side, x86_64 calls this directly from arch code (the JIT
engine), so it is mostly covered. For modules, we need to handle this
better.
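
Roughly, the call pattern looks like this (a hedged sketch: the
vmalloc_exec()/vfree_exec() signatures follow patch 1 of this series
and may change, text_poke_copy() is the existing x86 write path, and
error handling is elided):

/* Sketch: how an x86_64 JIT could place an image in the shared huge
 * page. The returned mapping is RO+X, so the bytes are written through
 * text_poke_copy()'s temporary writable alias, never directly.
 */
static void *jit_place(const void *image, size_t len)
{
	void *dst = vmalloc_exec(len, sizeof(long));

	if (!dst)
		return NULL;
	text_poke_copy(dst, image, len);
	return dst;
}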

Thanks,
Song
Edgecombe, Rick P Oct. 12, 2022, 6:38 p.m. UTC | #6
On Wed, 2022-10-12 at 05:37 +0000, Song Liu wrote:
> > Then you have code that operates on module text like:
> > if (is_vmalloc_or_module_addr(addr))
> >        pfn = vmalloc_to_pfn(addr);
> > 
> > It looks like it would work (on x86 at least). Should it be expected
> > to?
> > 
> > Especially after this patch, where there is memory that isn't even
> > tracked by the original vmap_area trees, it is pretty much a separate
> > allocator. So I think it might be nice to spell out which other vmalloc
> > APIs work with these new functions since they are named "vmalloc".
> > Maybe just say none of them do.
> 
> I guess it is fair to call this a separate allocator. Maybe 
> vmalloc_exec is not the right name? I do think this is the best 
> way to build an allocator with vmap tree logic. 

Yea, I don't know about the name. I think someone else suggested it
specifically, right?

I had called mine perm_alloc() so it could also handle read-only and
other permissions. If you keep vmalloc_exec() it needs some big
comments about which APIs can work with it, and an audit of the
existing code that works on module and JIT text.

> 
> > 
> > 
>>> Separate from that, I guess you are planning to make this limited to
>>> certain architectures? It might be better to put logic with assumptions
>>> about x86 boot time page table details inside arch/x86 somewhere.
> 
> Yes, the architecture needs some text_poke mechanism to use this.

It also depends on the space between _etext and the PMD-aligned _etext
being present and not getting used by anything else. For other
architectures, there might be rodata there or other things.

> On the BPF side, x86_64 calls this directly from arch code (the JIT
> engine), so it is mostly covered. For modules, we need to handle this
> better.

That old RFC has some ideas around this. I kind of like your
incremental approach though. To me it seems to be moving in the right
direction.
Song Liu Oct. 12, 2022, 7:01 p.m. UTC | #7
> On Oct 12, 2022, at 11:38 AM, Edgecombe, Rick P <rick.p.edgecombe@intel.com> wrote:
> 
> On Wed, 2022-10-12 at 05:37 +0000, Song Liu wrote:
>>> Then you have code that operates on module text like:
>>> if (is_vmalloc_or_module_addr(addr))
>>>       pfn = vmalloc_to_pfn(addr);
>>> 
>>> It looks like it would work (on x86 at least). Should it be expected
>>> to?
>>> 
>>> Especially after this patch, where there is memory that isn't even
>>> tracked by the original vmap_area trees, it is pretty much a separate
>>> allocator. So I think it might be nice to spell out which other vmalloc
>>> APIs work with these new functions since they are named "vmalloc".
>>> Maybe just say none of them do.
>> 
>> I guess it is fair to call this a separate allocator. Maybe 
>> vmalloc_exec is not the right name? I do think this is the best 
>> way to build an allocator with vmap tree logic. 
> 
> Yea, I don't know about the name. I think someone else suggested it
> specifically, right?

I think Luis suggested renaming module_alloc to vmalloc_exec. But I
guess we still need module_alloc for module data allocations.

> 
> I had called mine perm_alloc() so it could also handle read-only and
> other permissions.

What other permissions do we use? We can probably duplicate the
free_text_area_ tree logic for other cases.


> If you keep vmalloc_exec() it needs some big
> comments about which APIs can work with it, and an audit of the
> existing code that works on module and JIT text.
> 
>> 
>>> 
>>> 
>>> Separate from that, I guess you are planning to make this limited to
>>> certain architectures? It might be better to put logic with assumptions
>>> about x86 boot time page table details inside arch/x86 somewhere.
>> 
>> Yes, the architecture needs some text_poke mechanism to use this.
> 
> It also depends on the space between _etext and the PMD-aligned _etext
> being present and not getting used by anything else. For other
> architectures, there might be rodata there or other things.

Good point! We need to make sure this part is not used by other things.

> 
>> On the BPF side, x86_64 calls this directly from arch code (the JIT
>> engine), so it is mostly covered. For modules, we need to handle this
>> better.
> 
> That old RFC has some ideas around this. I kind of like your
> incremental approach though. To me it seems to be moving in the right
> direction.

Thanks!
Song
diff mbox series

Patch

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 0fe690ebc269..d94f196c541a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1367,12 +1367,13 @@  int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
 
 int kernel_set_to_readonly;
 
+#define PMD_ALIGN(x)	(((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK)
 void mark_rodata_ro(void)
 {
 	unsigned long start = PFN_ALIGN(_text);
 	unsigned long rodata_start = PFN_ALIGN(__start_rodata);
 	unsigned long end = (unsigned long)__end_rodata_hpage_align;
-	unsigned long text_end = PFN_ALIGN(_etext);
+	unsigned long text_end = PMD_ALIGN(_etext);
 	unsigned long rodata_end = PFN_ALIGN(__end_rodata);
 	unsigned long all_end;
 
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 9212ff96b871..41509bbec583 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -75,6 +75,9 @@  static const bool vmap_allow_huge = false;
 #define PMD_ALIGN(addr) ALIGN(addr, PMD_SIZE)
 #define PMD_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PMD_SIZE)
 
+static struct vm_struct text_tail_vm;
+static struct vmap_area text_tail_va;
+
 bool is_vmalloc_addr(const void *x)
 {
 	unsigned long addr = (unsigned long)kasan_reset_tag(x);
@@ -637,6 +640,8 @@  int is_vmalloc_or_module_addr(const void *x)
 	unsigned long addr = (unsigned long)kasan_reset_tag(x);
 	if (addr >= MODULES_VADDR && addr < MODULES_END)
 		return 1;
+	if (addr >= text_tail_va.va_start && addr < text_tail_va.va_end)
+		return 1;
 #endif
 	return is_vmalloc_addr(x);
 }
@@ -2422,6 +2427,24 @@  static void vmap_init_free_space(void)
 	}
 }
 
+static void register_text_tail_vm(void)
+{
+	unsigned long start = PFN_ALIGN((unsigned long)_etext);
+	unsigned long end = PMD_ALIGN((unsigned long)_etext);
+	struct vmap_area *va;
+
+	va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
+	if (WARN_ON_ONCE(!va))
+		return;
+	text_tail_vm.addr = (void *)start;
+	text_tail_vm.size = end - start;
+	text_tail_va.va_start = start;
+	text_tail_va.va_end = end;
+	text_tail_va.vm = &text_tail_vm;
+	memcpy(va, &text_tail_va, sizeof(*va));
+	insert_vmap_area_augment(va, NULL, &free_text_area_root, &free_text_area_list);
+}
+
 void __init vmalloc_init(void)
 {
 	struct vmap_area *va;
@@ -2432,6 +2455,7 @@  void __init vmalloc_init(void)
 	 * Create the cache for vmap_area objects.
 	 */
 	vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
+	register_text_tail_vm();
 
 	for_each_possible_cpu(i) {
 		struct vmap_block_queue *vbq;