| Message ID | ec2d02358eee53f095ad9a7d9c96af8f4d0d4eeb.1741636880.git.osandov@osandov.com |
|---|---|
| State | New |
| Series | arm64: reserve [_text, _stext) virtual address range |
[+Ard]

On Mon, Mar 10, 2025 at 01:05:04PM -0700, Omar Sandoval wrote:
> From: Omar Sandoval <osandov@fb.com>
>
> Since the referenced fixes commit, the kernel's .text section is only
> mapped starting from _stext; the region [_text, _stext) is omitted. As a
> result, other vmalloc/vmap allocations may use the virtual addresses
> nominally in the range [_text, _stext). This address reuse confuses
> multiple things:
>
> 1. crash_prepare_elf64_headers() sets up a segment in /proc/vmcore
>    mapping the entire range [_text, _end) to
>    [__pa_symbol(_text), __pa_symbol(_end)). Reading an address in
>    [_text, _stext) from /proc/vmcore therefore gives the incorrect
>    result.
> 2. Tools doing symbolization (either by reading /proc/kallsyms or based
>    on the vmlinux ELF file) will incorrectly identify vmalloc/vmap
>    allocations in [_text, _stext) as kernel symbols.
>
> In practice, both of these issues affect the drgn debugger.
> Specifically, there were cases where the vmap IRQ stacks for some CPUs
> were allocated in [_text, _stext). As a result, drgn could not get the
> stack trace for a crash in an IRQ handler because the core dump
> contained invalid data for the IRQ stack address. The stack addresses
> were also symbolized as being in the _text symbol.
>
> Fix this by creating an unmapped vm_area to cover [_text, _stext). This
> prevents other allocations from using it while still achieving the
> original goal of not mapping unpredictable data.
>
> Fixes: e2a073dde921 ("arm64: omit [_text, _stext) from permanent kernel mapping")
> Cc: stable@vger.kernel.org
> Signed-off-by: Omar Sandoval <osandov@fb.com>
> ---
> Based on v6.14-rc6.
>
> Thanks!
>
>  arch/arm64/mm/mmu.c | 26 +++++++++++++++++---------
>  1 file changed, 17 insertions(+), 9 deletions(-)
>
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index b4df5bc5b1b8..88595ea12f39 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -703,10 +703,14 @@ static void __init declare_vma(struct vm_struct *vma,
>  			       void *va_start, void *va_end,
>  			       unsigned long vm_flags)
>  {
> -	phys_addr_t pa_start = __pa_symbol(va_start);
> +	phys_addr_t pa_start = 0;
>  	unsigned long size = va_end - va_start;
>
> -	BUG_ON(!PAGE_ALIGNED(pa_start));
> +	if (vm_flags & VM_MAP) {
> +		pa_start = __pa_symbol(va_start);
> +		BUG_ON(!PAGE_ALIGNED(pa_start));
> +	}
> +
>  	BUG_ON(!PAGE_ALIGNED(size));
>
>  	if (!(vm_flags & VM_NO_GUARD))
> @@ -715,7 +719,7 @@ static void __init declare_vma(struct vm_struct *vma,
>  	vma->addr = va_start;
>  	vma->phys_addr = pa_start;
>  	vma->size = size;
> -	vma->flags = VM_MAP | vm_flags;
> +	vma->flags = vm_flags;
>  	vma->caller = __builtin_return_address(0);
>
>  	vm_area_add_early(vma);
> @@ -765,13 +769,17 @@ core_initcall(map_entry_trampoline);
>   */
>  static void __init declare_kernel_vmas(void)
>  {
> -	static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT];
> +	static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT + 1];
>
> -	declare_vma(&vmlinux_seg[0], _stext, _etext, VM_NO_GUARD);
> -	declare_vma(&vmlinux_seg[1], __start_rodata, __inittext_begin, VM_NO_GUARD);
> -	declare_vma(&vmlinux_seg[2], __inittext_begin, __inittext_end, VM_NO_GUARD);
> -	declare_vma(&vmlinux_seg[3], __initdata_begin, __initdata_end, VM_NO_GUARD);
> -	declare_vma(&vmlinux_seg[4], _data, _end, 0);
> +	declare_vma(&vmlinux_seg[0], _text, _stext, VM_NO_GUARD);

Should we also put the memblock reservation back as it was, so that this
region can't be allocated there?

In fact, if we're not allocating from here, why don't we just map it
anyway but without execute permissions?

Will
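[Editor's illustration, not part of the thread: point 2 of the commit message above (the symbolization hazard) is easy to see from userspace. The minimal program below reads /proc/kallsyms, locates _text and _stext, and prints the size of the window that a naive "nearest preceding symbol" lookup would attribute to _text. It assumes root or kptr_restrict=0 so kallsyms shows real addresses.]

/* gap.c - show the [_text, _stext) window that a naive symbolizer would
 * fold into the _text symbol. Editor's illustration only.
 */
#include <inttypes.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/proc/kallsyms", "r");
	uint64_t addr, text = 0, stext = 0;
	char type, name[128];

	if (!f) {
		perror("/proc/kallsyms");
		return 1;
	}
	while (fscanf(f, "%" SCNx64 " %c %127s%*[^\n]", &addr, &type, name) == 3) {
		if (!strcmp(name, "_text"))
			text = addr;
		else if (!strcmp(name, "_stext"))
			stext = addr;
	}
	fclose(f);
	if (!text || !stext) {
		fprintf(stderr, "need root / kptr_restrict=0\n");
		return 1;
	}
	/* Any vmalloc/vmap allocation placed in this window would be reported
	 * as _text+0x... by tools that walk kallsyms. */
	printf("_text=%#" PRIx64 " _stext=%#" PRIx64 " window=%#" PRIx64 " bytes\n",
	       text, stext, stext - text);
	return 0;
}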
On Tue, 11 Mar 2025 at 13:54, Will Deacon <will@kernel.org> wrote:
>
> [+Ard]
>
> On Mon, Mar 10, 2025 at 01:05:04PM -0700, Omar Sandoval wrote:
> > From: Omar Sandoval <osandov@fb.com>
> >
> > Since the referenced fixes commit, the kernel's .text section is only
> > mapped starting from _stext; the region [_text, _stext) is omitted. As a
> > result, other vmalloc/vmap allocations may use the virtual addresses
> > nominally in the range [_text, _stext). This address reuse confuses
> > multiple things:

[...]

> > @@ -765,13 +769,17 @@ core_initcall(map_entry_trampoline);
> >   */
> >  static void __init declare_kernel_vmas(void)
> >  {
> > -	static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT];
> > +	static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT + 1];
> >
> > -	declare_vma(&vmlinux_seg[0], _stext, _etext, VM_NO_GUARD);
> > -	declare_vma(&vmlinux_seg[1], __start_rodata, __inittext_begin, VM_NO_GUARD);
> > -	declare_vma(&vmlinux_seg[2], __inittext_begin, __inittext_end, VM_NO_GUARD);
> > -	declare_vma(&vmlinux_seg[3], __initdata_begin, __initdata_end, VM_NO_GUARD);
> > -	declare_vma(&vmlinux_seg[4], _data, _end, 0);
> > +	declare_vma(&vmlinux_seg[0], _text, _stext, VM_NO_GUARD);
>
> Should we also put the memblock reservation back as it was, so that this
> region can't be allocated there?

The issue is about the virtual address space, not the physical memory
behind it, right? So the VA range should be protected from reuse, but
nothing needs to be mapped there.

> In fact, if we're not allocating from here, why don't we just map it
> anyway but without execute permissions?

It's just 64k so if this is the simplest approach, I won't object.

I wonder if this needs to be so intrusive, though - there is already a
precedent of VMAs not actually mapping the entire region they describe
(with guard pages), and so we might just declare the first VMA as
[_text, _etext), even though the first 64k of that region is not
actually mapped.

However, if that confuses the bookkeeping or creates other problems,
declaring a separate VMA to reserve the VA range seems fine, although
the patch seems a bit intrusive (and I don't even see the whole thing).
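[Editor's sketch of the lighter-weight alternative Ard floats above, as read from his wording; this was not posted as a patch. The idea is to keep KERNEL_SEGMENT_COUNT and declare_vma() unchanged and let the first vm_struct describe [_text, _etext) even though the leading 64k has no page-table entries, the same way guard pages already make a vm_struct span more than it maps.]

/* Sketch only (editor's reading of the suggestion, not a posted patch):
 * no extra vm_struct and no declare_vma() changes; the first segment simply
 * starts at _text even though [_text, _stext) remains unmapped.
 */
static void __init declare_kernel_vmas(void)
{
	static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT];

	declare_vma(&vmlinux_seg[0], _text, _etext, VM_NO_GUARD);
	declare_vma(&vmlinux_seg[1], __start_rodata, __inittext_begin, VM_NO_GUARD);
	declare_vma(&vmlinux_seg[2], __inittext_begin, __inittext_end, VM_NO_GUARD);
	declare_vma(&vmlinux_seg[3], __initdata_begin, __initdata_end, VM_NO_GUARD);
	declare_vma(&vmlinux_seg[4], _data, _end, 0);
}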
On Tue, Mar 11, 2025 at 02:32:47PM +0100, Ard Biesheuvel wrote:
> On Tue, 11 Mar 2025 at 13:54, Will Deacon <will@kernel.org> wrote:
> >
> > [+Ard]
> >
> > On Mon, Mar 10, 2025 at 01:05:04PM -0700, Omar Sandoval wrote:
> > > From: Omar Sandoval <osandov@fb.com>
> > >
> > > Since the referenced fixes commit, the kernel's .text section is only
> > > mapped starting from _stext; the region [_text, _stext) is omitted. As a
> > > result, other vmalloc/vmap allocations may use the virtual addresses
> > > nominally in the range [_text, _stext). This address reuse confuses
> > > multiple things:
> > >
> > > 1. crash_prepare_elf64_headers() sets up a segment in /proc/vmcore
> > >    mapping the entire range [_text, _end) to
> > >    [__pa_symbol(_text), __pa_symbol(_end)). Reading an address in
> > >    [_text, _stext) from /proc/vmcore therefore gives the incorrect
> > >    result.

[...]

> > Should we also put the memblock reservation back as it was, so that this
> > region can't be allocated there?
>
> The issue is about the virtual address space, not the physical memory
> behind it, right? So the VA range should be protected from reuse, but
> nothing needs to be mapped there.

You're absolutely right, but now I'm more confused about the reference
to crash_prepare_elf64_headers() in the commit message. That sets both
the virtual (_text) and the physical (__pa_symbol(_text)) addresses in
the header, so it feels like we really need to keep that memory around
because it's accessible via /proc/vmcore.

> > In fact, if we're not allocating from here, why don't we just map it
> > anyway but without execute permissions?
>
> It's just 64k so if this is the simplest approach, I won't object.
>
> I wonder if this needs to be so intrusive, though - there is already a
> precedent of VMAs not actually mapping the entire region they describe
> (with guard pages), and so we might just declare the first VMA as
> [_text, _etext), even though the first 64k of that region is not
> actually mapped.
>
> However, if that confuses the bookkeeping or creates other problems,
> declaring a separate VMA to reserve the VA range seems fine, although
> the patch seems a bit intrusive (and I don't even see the whole thing).

As above, I think we'll have to give /proc/vmcore the physical address
of _something_.

Will
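[Editor's illustration of the /proc/vmcore concern being discussed, not part of the thread. Per the commit message, crash_prepare_elf64_headers() emits a PT_LOAD segment with p_vaddr = _text and p_paddr = __pa_symbol(_text) covering [_text, _end). A dump consumer resolves virtual addresses through that header, so whatever physically sits at p_paddr is what readers get back. vmcore_paddr_for() below is a hypothetical helper, not kernel or makedumpfile code; it only shows the resolution step.]

#include <elf.h>
#include <stddef.h>
#include <stdint.h>

/* Resolve a kernel virtual address to the physical address a vmcore reader
 * would fetch, using the PT_LOAD program headers of the dump. On an affected
 * kernel, a vmap allocation can live at a VA inside [_text, _stext), but a
 * read through this header returns whatever occupies the physical pages at
 * __pa_symbol(_text) plus the offset, not the vmap data - the mismatch
 * described in point 1 of the commit message.
 */
uint64_t vmcore_paddr_for(uint64_t va, const Elf64_Phdr *phdr, size_t n)
{
	for (size_t i = 0; i < n; i++) {
		if (phdr[i].p_type == PT_LOAD &&
		    va >= phdr[i].p_vaddr &&
		    va < phdr[i].p_vaddr + phdr[i].p_memsz)
			return phdr[i].p_paddr + (va - phdr[i].p_vaddr);
	}
	return 0; /* address not covered by any segment */
}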
On Tue, Mar 11, 2025 at 02:17:18PM +0000, Will Deacon wrote:
> On Tue, Mar 11, 2025 at 02:32:47PM +0100, Ard Biesheuvel wrote:
> > On Tue, 11 Mar 2025 at 13:54, Will Deacon <will@kernel.org> wrote:

[...]

> > The issue is about the virtual address space, not the physical memory
> > behind it, right? So the VA range should be protected from reuse, but
> > nothing needs to be mapped there.
>
> You're absolutely right, but now I'm more confused about the reference
> to crash_prepare_elf64_headers() in the commit message. That sets both
> the virtual (_text) and the physical (__pa_symbol(_text)) addresses in
> the header, so it feels like we really need to keep that memory around
> because it's accessible via /proc/vmcore.

[...]

> As above, I think we'll have to give /proc/vmcore the physical address
> of _something_.

I figured since there's nothing actually at that virtual address
anymore, it doesn't matter what physical address /proc/vmcore gives.
But that doesn't sound great now that you point it out.

I'll rework this to map the range non-executable.

Thanks,
Omar
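[Editor's illustration, not from the thread: a quick way to check a running kernel for the original symptom, i.e. whether any vmalloc/vmap area has been handed addresses inside [_text, _stext). Pass the _text and _stext addresses from /proc/kallsyms on the command line; /proc/vmallocinfo needs root. On an affected kernel any hit is a stray allocation such as an IRQ stack; on a kernel with a fix along the lines discussed above, the window is covered by the kernel's own declared segment, so check the caller column before drawing conclusions.]

/* overlap.c - report vmallocinfo areas overlapping [_text, _stext).
 * Usage: ./overlap <_text addr> <_stext addr>   (hex, from /proc/kallsyms)
 * Editor's illustration; run as root so /proc/vmallocinfo is readable.
 */
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	uint64_t text, stext, start, end;
	char line[512];
	FILE *f;
	int hits = 0;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <_text> <_stext>\n", argv[0]);
		return 1;
	}
	text = strtoull(argv[1], NULL, 16);
	stext = strtoull(argv[2], NULL, 16);

	f = fopen("/proc/vmallocinfo", "r");
	if (!f) {
		perror("/proc/vmallocinfo");
		return 1;
	}
	/* Lines look like "0x...-0x...  <size> <caller> ..." */
	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, "0x%" SCNx64 "-0x%" SCNx64, &start, &end) == 2 &&
		    start < stext && end > text) {
			printf("overlap: %s", line);
			hits++;
		}
	}
	fclose(f);
	printf("%d area(s) overlap [_text, _stext)\n", hits);
	return 0;
}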
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index b4df5bc5b1b8..88595ea12f39 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -703,10 +703,14 @@ static void __init declare_vma(struct vm_struct *vma,
 			       void *va_start, void *va_end,
 			       unsigned long vm_flags)
 {
-	phys_addr_t pa_start = __pa_symbol(va_start);
+	phys_addr_t pa_start = 0;
 	unsigned long size = va_end - va_start;
 
-	BUG_ON(!PAGE_ALIGNED(pa_start));
+	if (vm_flags & VM_MAP) {
+		pa_start = __pa_symbol(va_start);
+		BUG_ON(!PAGE_ALIGNED(pa_start));
+	}
+
 	BUG_ON(!PAGE_ALIGNED(size));
 
 	if (!(vm_flags & VM_NO_GUARD))
@@ -715,7 +719,7 @@ static void __init declare_vma(struct vm_struct *vma,
 	vma->addr = va_start;
 	vma->phys_addr = pa_start;
 	vma->size = size;
-	vma->flags = VM_MAP | vm_flags;
+	vma->flags = vm_flags;
 	vma->caller = __builtin_return_address(0);
 
 	vm_area_add_early(vma);
@@ -765,13 +769,17 @@ core_initcall(map_entry_trampoline);
  */
 static void __init declare_kernel_vmas(void)
 {
-	static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT];
+	static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT + 1];
 
-	declare_vma(&vmlinux_seg[0], _stext, _etext, VM_NO_GUARD);
-	declare_vma(&vmlinux_seg[1], __start_rodata, __inittext_begin, VM_NO_GUARD);
-	declare_vma(&vmlinux_seg[2], __inittext_begin, __inittext_end, VM_NO_GUARD);
-	declare_vma(&vmlinux_seg[3], __initdata_begin, __initdata_end, VM_NO_GUARD);
-	declare_vma(&vmlinux_seg[4], _data, _end, 0);
+	declare_vma(&vmlinux_seg[0], _text, _stext, VM_NO_GUARD);
+	declare_vma(&vmlinux_seg[1], _stext, _etext, VM_NO_GUARD);
+	declare_vma(&vmlinux_seg[2], __start_rodata, __inittext_begin,
+		    VM_MAP | VM_NO_GUARD);
+	declare_vma(&vmlinux_seg[3], __inittext_begin, __inittext_end,
+		    VM_MAP | VM_NO_GUARD);
+	declare_vma(&vmlinux_seg[4], __initdata_begin, __initdata_end,
+		    VM_MAP | VM_NO_GUARD);
+	declare_vma(&vmlinux_seg[5], _data, _end, VM_MAP);
 }
 
 void __pi_map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot,