
arm64: reserve [_text, _stext) virtual address range

Message ID ec2d02358eee53f095ad9a7d9c96af8f4d0d4eeb.1741636880.git.osandov@osandov.com (mailing list archive)

Commit Message

Omar Sandoval March 10, 2025, 8:05 p.m. UTC
From: Omar Sandoval <osandov@fb.com>

Since the referenced fixes commit, the kernel's .text section is only
mapped starting from _stext; the region [_text, _stext) is omitted. As a
result, other vmalloc/vmap allocations may use the virtual addresses
nominally in the range [_text, _stext). This address reuse confuses
multiple things:

1. crash_prepare_elf64_headers() sets up a segment in /proc/vmcore
   mapping the entire range [_text, _end) to
   [__pa_symbol(_text), __pa_symbol(_end)). Reading an address in
   [_text, _stext) from /proc/vmcore therefore gives the incorrect
   result.
2. Tools doing symbolization (either by reading /proc/kallsyms or based
   on the vmlinux ELF file) will incorrectly identify vmalloc/vmap
   allocations in [_text, _stext) as kernel symbols.

In practice, both of these issues affect the drgn debugger.
Specifically, there were cases where the vmap IRQ stacks for some CPUs
were allocated in [_text, _stext). As a result, drgn could not get the
stack trace for a crash in an IRQ handler because the core dump
contained invalid data for the IRQ stack address. The stack addresses
were also symbolized as being in the _text symbol.

Fix this by creating an unmapped vm_area to cover [_text, _stext). This
prevents other allocations from using it while still achieving the
original goal of not mapping unpredictable data.

Fixes: e2a073dde921 ("arm64: omit [_text, _stext) from permanent kernel mapping")
Cc: stable@vger.kernel.org
Signed-off-by: Omar Sandoval <osandov@fb.com>
---
Based on v6.14-rc6.

Thanks!

 arch/arm64/mm/mmu.c | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)
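
For reference, point 1 above refers to the way crash_prepare_elf64_headers() emits a single PT_LOAD segment covering the whole kernel image. A simplified sketch of that logic (paraphrased from kernel/crash_core.c; field order and error handling omitted):

	/* Sketch only: one PT_LOAD describing [_text, _end) in /proc/vmcore.
	 * Because the segment spans the whole image, the unmapped
	 * [_text, _stext) gap is still advertised as being backed by
	 * physical memory starting at __pa_symbol(_text).
	 */
	phdr->p_type   = PT_LOAD;
	phdr->p_flags  = PF_R | PF_W | PF_X;
	phdr->p_vaddr  = (unsigned long)_text;
	phdr->p_offset = phdr->p_paddr = __pa_symbol(_text);
	phdr->p_filesz = phdr->p_memsz = _end - _text;
	phdr++;
	ehdr->e_phnum++;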

Comments

Will Deacon March 11, 2025, 12:54 p.m. UTC | #1
[+Ard]

On Mon, Mar 10, 2025 at 01:05:04PM -0700, Omar Sandoval wrote:
> From: Omar Sandoval <osandov@fb.com>
> 
> Since the referenced fixes commit, the kernel's .text section is only
> mapped starting from _stext; the region [_text, _stext) is omitted. As a
> result, other vmalloc/vmap allocations may use the virtual addresses
> nominally in the range [_text, _stext). This address reuse confuses
> multiple things:
> 
> 1. crash_prepare_elf64_headers() sets up a segment in /proc/vmcore
>    mapping the entire range [_text, _end) to
>    [__pa_symbol(_text), __pa_symbol(_end)). Reading an address in
>    [_text, _stext) from /proc/vmcore therefore gives the incorrect
>    result.
> 2. Tools doing symbolization (either by reading /proc/kallsyms or based
>    on the vmlinux ELF file) will incorrectly identify vmalloc/vmap
>    allocations in [_text, _stext) as kernel symbols.
> 
> In practice, both of these issues affect the drgn debugger.
> Specifically, there were cases where the vmap IRQ stacks for some CPUs
> were allocated in [_text, _stext). As a result, drgn could not get the
> stack trace for a crash in an IRQ handler because the core dump
> contained invalid data for the IRQ stack address. The stack addresses
> were also symbolized as being in the _text symbol.
> 
> Fix this by creating an unmapped vm_area to cover [_text, _stext). This
> prevents other allocations from using it while still achieving the
> original goal of not mapping unpredictable data.
> 
> Fixes: e2a073dde921 ("arm64: omit [_text, _stext) from permanent kernel mapping")
> Cc: stable@vger.kernel.org
> Signed-off-by: Omar Sandoval <osandov@fb.com>
> ---
> Based on v6.14-rc6.
> 
> Thanks!
> 
>  arch/arm64/mm/mmu.c | 26 +++++++++++++++++---------
>  1 file changed, 17 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index b4df5bc5b1b8..88595ea12f39 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -703,10 +703,14 @@ static void __init declare_vma(struct vm_struct *vma,
>  			       void *va_start, void *va_end,
>  			       unsigned long vm_flags)
>  {
> -	phys_addr_t pa_start = __pa_symbol(va_start);
> +	phys_addr_t pa_start = 0;
>  	unsigned long size = va_end - va_start;
>  
> -	BUG_ON(!PAGE_ALIGNED(pa_start));
> +	if (vm_flags & VM_MAP) {
> +		pa_start = __pa_symbol(va_start);
> +		BUG_ON(!PAGE_ALIGNED(pa_start));
> +	}
> +
>  	BUG_ON(!PAGE_ALIGNED(size));
>  
>  	if (!(vm_flags & VM_NO_GUARD))
> @@ -715,7 +719,7 @@ static void __init declare_vma(struct vm_struct *vma,
>  	vma->addr	= va_start;
>  	vma->phys_addr	= pa_start;
>  	vma->size	= size;
> -	vma->flags	= VM_MAP | vm_flags;
> +	vma->flags	= vm_flags;
>  	vma->caller	= __builtin_return_address(0);
>  
>  	vm_area_add_early(vma);
> @@ -765,13 +769,17 @@ core_initcall(map_entry_trampoline);
>   */
>  static void __init declare_kernel_vmas(void)
>  {
> -	static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT];
> +	static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT + 1];
>  
> -	declare_vma(&vmlinux_seg[0], _stext, _etext, VM_NO_GUARD);
> -	declare_vma(&vmlinux_seg[1], __start_rodata, __inittext_begin, VM_NO_GUARD);
> -	declare_vma(&vmlinux_seg[2], __inittext_begin, __inittext_end, VM_NO_GUARD);
> -	declare_vma(&vmlinux_seg[3], __initdata_begin, __initdata_end, VM_NO_GUARD);
> -	declare_vma(&vmlinux_seg[4], _data, _end, 0);
> +	declare_vma(&vmlinux_seg[0], _text, _stext, VM_NO_GUARD);

Should we also put the memblock reservation back as it was, so that this
region can't be allocated there?

In fact, if we're not allocating from here, why don't we just map it
anyway but without execute permissions?

Will
Ard Biesheuvel March 11, 2025, 1:32 p.m. UTC | #2
On Tue, 11 Mar 2025 at 13:54, Will Deacon <will@kernel.org> wrote:
>
> [+Ard]
>
> On Mon, Mar 10, 2025 at 01:05:04PM -0700, Omar Sandoval wrote:
> > From: Omar Sandoval <osandov@fb.com>
> >
> > Since the referenced fixes commit, the kernel's .text section is only
> > mapped starting from _stext; the region [_text, _stext) is omitted. As a
> > result, other vmalloc/vmap allocations may use the virtual addresses
> > nominally in the range [_text, _stext). This address reuse confuses
> > multiple things:
> >
> > 1. crash_prepare_elf64_headers() sets up a segment in /proc/vmcore
> >    mapping the entire range [_text, _end) to
> >    [__pa_symbol(_text), __pa_symbol(_end)). Reading an address in
> >    [_text, _stext) from /proc/vmcore therefore gives the incorrect
> >    result.
> > 2. Tools doing symbolization (either by reading /proc/kallsyms or based
> >    on the vmlinux ELF file) will incorrectly identify vmalloc/vmap
> >    allocations in [_text, _stext) as kernel symbols.
> >
> > In practice, both of these issues affect the drgn debugger.
> > Specifically, there were cases where the vmap IRQ stacks for some CPUs
> > were allocated in [_text, _stext). As a result, drgn could not get the
> > stack trace for a crash in an IRQ handler because the core dump
> > contained invalid data for the IRQ stack address. The stack addresses
> > were also symbolized as being in the _text symbol.
> >
> > Fix this by creating an unmapped vm_area to cover [_text, _stext). This
> > prevents other allocations from using it while still achieving the
> > original goal of not mapping unpredictable data.
> >
> > Fixes: e2a073dde921 ("arm64: omit [_text, _stext) from permanent kernel mapping")
> > Cc: stable@vger.kernel.org
> > Signed-off-by: Omar Sandoval <osandov@fb.com>
> > ---
> > Based on v6.14-rc6.
> >
> > Thanks!
> >
> >  arch/arm64/mm/mmu.c | 26 +++++++++++++++++---------
> >  1 file changed, 17 insertions(+), 9 deletions(-)
> >
> > diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> > index b4df5bc5b1b8..88595ea12f39 100644
> > --- a/arch/arm64/mm/mmu.c
> > +++ b/arch/arm64/mm/mmu.c
> > @@ -703,10 +703,14 @@ static void __init declare_vma(struct vm_struct *vma,
> >                              void *va_start, void *va_end,
> >                              unsigned long vm_flags)
> >  {
> > -     phys_addr_t pa_start = __pa_symbol(va_start);
> > +     phys_addr_t pa_start = 0;
> >       unsigned long size = va_end - va_start;
> >
> > -     BUG_ON(!PAGE_ALIGNED(pa_start));
> > +     if (vm_flags & VM_MAP) {
> > +             pa_start = __pa_symbol(va_start);
> > +             BUG_ON(!PAGE_ALIGNED(pa_start));
> > +     }
> > +
> >       BUG_ON(!PAGE_ALIGNED(size));
> >
> >       if (!(vm_flags & VM_NO_GUARD))
> > @@ -715,7 +719,7 @@ static void __init declare_vma(struct vm_struct *vma,
> >       vma->addr       = va_start;
> >       vma->phys_addr  = pa_start;
> >       vma->size       = size;
> > -     vma->flags      = VM_MAP | vm_flags;
> > +     vma->flags      = vm_flags;
> >       vma->caller     = __builtin_return_address(0);
> >
> >       vm_area_add_early(vma);
> > @@ -765,13 +769,17 @@ core_initcall(map_entry_trampoline);
> >   */
> >  static void __init declare_kernel_vmas(void)
> >  {
> > -     static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT];
> > +     static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT + 1];
> >
> > -     declare_vma(&vmlinux_seg[0], _stext, _etext, VM_NO_GUARD);
> > -     declare_vma(&vmlinux_seg[1], __start_rodata, __inittext_begin, VM_NO_GUARD);
> > -     declare_vma(&vmlinux_seg[2], __inittext_begin, __inittext_end, VM_NO_GUARD);
> > -     declare_vma(&vmlinux_seg[3], __initdata_begin, __initdata_end, VM_NO_GUARD);
> > -     declare_vma(&vmlinux_seg[4], _data, _end, 0);
> > +     declare_vma(&vmlinux_seg[0], _text, _stext, VM_NO_GUARD);
>
> Should we also put the memblock reservation back as it was, so that this
> region can't be allocated there?
>

The issue is about the virtual address space, not the physical memory
behind it, right? So the VA range should be protected from reuse, but
nothing needs to be mapped there.

> In fact, if we're not allocating from here, why don't we just map it
> anyway but without execute permissions?
>

It's just 64k so if this is the simplest approach, I won't object.

I wonder if this needs to be so intrusive, though - there is already a
precedent of VMAs not actually mapping the entire region they describe
(with guard pages), and so we might just declare the first VMA as
[_text, _etext), even though the first 64k of that region is not
actually mapped.

However, if that confuses the bookkeeping or creates other problems,
declaring a separate VMA to reserve the VA range seems fine, although
the patch seems a bit intrusive (and I don't even see the whole
thing).
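
Ard's single-VMA alternative would amount to roughly the following change in declare_kernel_vmas() (a hypothetical sketch, not a posted patch; declare_vma() keeps its existing behaviour, and the first 64k simply has no mapping behind the VMA, much like guard pages today):

	/* Hypothetical: cover [_text, _etext) with one VMA so the whole VA
	 * range is reserved, even though [_text, _stext) stays unmapped.
	 */
	declare_vma(&vmlinux_seg[0], _text, _etext, VM_NO_GUARD);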
Will Deacon March 11, 2025, 2:17 p.m. UTC | #3
On Tue, Mar 11, 2025 at 02:32:47PM +0100, Ard Biesheuvel wrote:
> On Tue, 11 Mar 2025 at 13:54, Will Deacon <will@kernel.org> wrote:
> >
> > [+Ard]
> >
> > On Mon, Mar 10, 2025 at 01:05:04PM -0700, Omar Sandoval wrote:
> > > From: Omar Sandoval <osandov@fb.com>
> > >
> > > Since the referenced fixes commit, the kernel's .text section is only
> > > mapped starting from _stext; the region [_text, _stext) is omitted. As a
> > > result, other vmalloc/vmap allocations may use the virtual addresses
> > > nominally in the range [_text, _stext). This address reuse confuses
> > > multiple things:
> > >
> > > 1. crash_prepare_elf64_headers() sets up a segment in /proc/vmcore
> > >    mapping the entire range [_text, _end) to
> > >    [__pa_symbol(_text), __pa_symbol(_end)). Reading an address in
> > >    [_text, _stext) from /proc/vmcore therefore gives the incorrect
> > >    result.

[...]

> > > @@ -765,13 +769,17 @@ core_initcall(map_entry_trampoline);
> > >   */
> > >  static void __init declare_kernel_vmas(void)
> > >  {
> > > -     static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT];
> > > +     static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT + 1];
> > >
> > > -     declare_vma(&vmlinux_seg[0], _stext, _etext, VM_NO_GUARD);
> > > -     declare_vma(&vmlinux_seg[1], __start_rodata, __inittext_begin, VM_NO_GUARD);
> > > -     declare_vma(&vmlinux_seg[2], __inittext_begin, __inittext_end, VM_NO_GUARD);
> > > -     declare_vma(&vmlinux_seg[3], __initdata_begin, __initdata_end, VM_NO_GUARD);
> > > -     declare_vma(&vmlinux_seg[4], _data, _end, 0);
> > > +     declare_vma(&vmlinux_seg[0], _text, _stext, VM_NO_GUARD);
> >
> > Should we also put the memblock reservation back as it was, so that this
> > region can't be allocated there?
> >
> 
> The issue is about the virtual address space, not the physical memory
> behind it, right? So the VA range should be protected from reuse, but
> nothing needs to be mapped there.

You're absolutely right, but now I'm more confused about the reference
to crash_prepare_elf64_headers() in the commit message. That sets both
the virtual (_text) and the physical (__pa_symbol(_text)) addresses in
the header, so it feels like we really need to keep that memory around
because it's accessible via /proc/vmcore.

> 
> > In fact, if we're not allocating from here, why don't we just map it
> > anyway but without execute permissions?
> >
> 
> It's just 64k so if this is the simplest approach, I won't object.
> 
> I wonder if this needs to be so intrusive, though - there is already a
> precedent of VMAs not actually mapping the entire region they describe
> (with guard pages), and so we might just declare the first VMA as
> > [_text, _etext), even though the first 64k of that region is not
> actually mapped.
> 
> However, if that confuses the bookkeeping or creates other problems,
> declaring a separate VMA to reserve the VA range seems fine, although
> the patch seems a bit intrusive (and I don't even see the whole
> thing).

As above, I think we'll have to give /proc/vmcore the physical address
of _something_.

Will
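
Will's concern follows from how /proc/vmcore consumers resolve reads: a kernel virtual address is translated through the PT_LOAD headers, so p_paddr must point at real memory. A generic sketch of the ELF core convention (not any particular tool's code):

	/* Sketch: translate a kernel vaddr to a core-file offset. If a
	 * segment covers [_text, _stext) but nothing meaningful backs
	 * p_paddr there, the read returns whatever happens to occupy
	 * that physical range.
	 */
	for (i = 0; i < ehdr->e_phnum; i++) {
		Elf64_Phdr *p = &phdr[i];

		if (p->p_type == PT_LOAD &&
		    vaddr >= p->p_vaddr && vaddr < p->p_vaddr + p->p_memsz)
			return p->p_offset + (vaddr - p->p_vaddr);
	}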
Omar Sandoval March 11, 2025, 2:54 p.m. UTC | #4
On Tue, Mar 11, 2025 at 02:17:18PM +0000, Will Deacon wrote:
> On Tue, Mar 11, 2025 at 02:32:47PM +0100, Ard Biesheuvel wrote:
> > On Tue, 11 Mar 2025 at 13:54, Will Deacon <will@kernel.org> wrote:
> > >
> > > [+Ard]
> > >
> > > On Mon, Mar 10, 2025 at 01:05:04PM -0700, Omar Sandoval wrote:
> > > > From: Omar Sandoval <osandov@fb.com>
> > > >
> > > > Since the referenced fixes commit, the kernel's .text section is only
> > > > mapped starting from _stext; the region [_text, _stext) is omitted. As a
> > > > result, other vmalloc/vmap allocations may use the virtual addresses
> > > > nominally in the range [_text, _stext). This address reuse confuses
> > > > multiple things:
> > > >
> > > > 1. crash_prepare_elf64_headers() sets up a segment in /proc/vmcore
> > > >    mapping the entire range [_text, _end) to
> > > >    [__pa_symbol(_text), __pa_symbol(_end)). Reading an address in
> > > >    [_text, _stext) from /proc/vmcore therefore gives the incorrect
> > > >    result.
> 
> [...]
> 
> > > > @@ -765,13 +769,17 @@ core_initcall(map_entry_trampoline);
> > > >   */
> > > >  static void __init declare_kernel_vmas(void)
> > > >  {
> > > > -     static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT];
> > > > +     static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT + 1];
> > > >
> > > > -     declare_vma(&vmlinux_seg[0], _stext, _etext, VM_NO_GUARD);
> > > > -     declare_vma(&vmlinux_seg[1], __start_rodata, __inittext_begin, VM_NO_GUARD);
> > > > -     declare_vma(&vmlinux_seg[2], __inittext_begin, __inittext_end, VM_NO_GUARD);
> > > > -     declare_vma(&vmlinux_seg[3], __initdata_begin, __initdata_end, VM_NO_GUARD);
> > > > -     declare_vma(&vmlinux_seg[4], _data, _end, 0);
> > > > +     declare_vma(&vmlinux_seg[0], _text, _stext, VM_NO_GUARD);
> > >
> > > Should we also put the memblock reservation back as it was, so that this
> > > region can't be allocated there?
> > >
> > 
> > The issue is about the virtual address space, not the physical memory
> > behind it, right? So the VA range should be protected from reuse, but
> > nothing needs to be mapped there.
> 
> You're absolutely right, but now I'm more confused about the reference
> to crash_prepare_elf64_headers() in the commit message. That sets both
> the virtual (_text) and the physical (__pa_symbol(_text)) addresses in
> the header, so it feels like we really need to keep that memory around
> because it's accessible via /proc/vmcore.
> 
> > 
> > > In fact, if we're not allocating from here, why don't we just map it
> > > anyway but without execute permissions?
> > >
> > 
> > It's just 64k so if this is the simplest approach, I won't object.
> > 
> > I wonder if this needs to be so intrusive, though - there is already a
> > precedent of VMAs not actually mapping the entire region they describe
> > (with guard pages), and so we might just declare the first VMA as
> > [_text, _etext), even though the first 64k of that region is not
> > actually mapped.
> > 
> > However, if that confuses the bookkeeping or creates other problems,
> > declaring a separate VMA to reserve the VA range seems fine, although
> > the patch seems a bit intrusive (and I don't even see the whole
> > thing).
> 
> As above, I think we'll have to give /proc/vmcore the physical address
> of _something_.

I figured since there's nothing actually at that virtual address
anymore, it doesn't matter what physical address /proc/vmcore gives. But
that doesn't sound great now that you point it out. I'll rework this to
map the range non-executable.

Thanks,
Omar
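
The rework Omar mentions would keep [_text, _stext) mapped but non-executable, along the lines of the following hypothetical sketch (reusing the existing __map_memblock() helper in arch/arm64/mm/mmu.c; the eventual v2 may well differ):

	/* Hypothetical: map the gap read-only and non-executable, so the
	 * VA range is reserved and /proc/vmcore has real memory behind
	 * the [_text, _end) segment.
	 */
	__map_memblock(pgdp, __pa_symbol(_text), __pa_symbol(_stext),
		       PAGE_KERNEL_RO, NO_CONT_MAPPINGS);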

Patch

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index b4df5bc5b1b8..88595ea12f39 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -703,10 +703,14 @@ static void __init declare_vma(struct vm_struct *vma,
 			       void *va_start, void *va_end,
 			       unsigned long vm_flags)
 {
-	phys_addr_t pa_start = __pa_symbol(va_start);
+	phys_addr_t pa_start = 0;
 	unsigned long size = va_end - va_start;
 
-	BUG_ON(!PAGE_ALIGNED(pa_start));
+	if (vm_flags & VM_MAP) {
+		pa_start = __pa_symbol(va_start);
+		BUG_ON(!PAGE_ALIGNED(pa_start));
+	}
+
 	BUG_ON(!PAGE_ALIGNED(size));
 
 	if (!(vm_flags & VM_NO_GUARD))
@@ -715,7 +719,7 @@ static void __init declare_vma(struct vm_struct *vma,
 	vma->addr	= va_start;
 	vma->phys_addr	= pa_start;
 	vma->size	= size;
-	vma->flags	= VM_MAP | vm_flags;
+	vma->flags	= vm_flags;
 	vma->caller	= __builtin_return_address(0);
 
 	vm_area_add_early(vma);
@@ -765,13 +769,17 @@ core_initcall(map_entry_trampoline);
  */
 static void __init declare_kernel_vmas(void)
 {
-	static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT];
+	static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT + 1];
 
-	declare_vma(&vmlinux_seg[0], _stext, _etext, VM_NO_GUARD);
-	declare_vma(&vmlinux_seg[1], __start_rodata, __inittext_begin, VM_NO_GUARD);
-	declare_vma(&vmlinux_seg[2], __inittext_begin, __inittext_end, VM_NO_GUARD);
-	declare_vma(&vmlinux_seg[3], __initdata_begin, __initdata_end, VM_NO_GUARD);
-	declare_vma(&vmlinux_seg[4], _data, _end, 0);
+	declare_vma(&vmlinux_seg[0], _text, _stext, VM_NO_GUARD);
+	declare_vma(&vmlinux_seg[1], _stext, _etext, VM_NO_GUARD);
+	declare_vma(&vmlinux_seg[2], __start_rodata, __inittext_begin,
+		    VM_MAP | VM_NO_GUARD);
+	declare_vma(&vmlinux_seg[3], __inittext_begin, __inittext_end,
+		    VM_MAP | VM_NO_GUARD);
+	declare_vma(&vmlinux_seg[4], __initdata_begin, __initdata_end,
+		    VM_MAP | VM_NO_GUARD);
+	declare_vma(&vmlinux_seg[5], _data, _end, VM_MAP);
 }
 
 void __pi_map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot,