Message ID | 1414440752-9411-8-git-send-email-lauraa@codeaurora.org (mailing list archive) |
---|---|
State | New, archived |
On Mon, Oct 27, 2014 at 01:12:32PM -0700, Laura Abbott wrote: > Add page protections for arm64 similar to those in arm. > This is for security reasons to prevent certain classes > of exploits. The current method: > > - Map all memory as either RWX or RW. We round to the nearest > section to avoid creating page tables before everything is mapped > - Once everything is mapped, if either end of the RWX section should > not be X, we split the PMD and remap as necessary > - When initmem is to be freed, we change the permissions back to > RW (using stop machine if necessary to flush the TLB) > - If CONFIG_DEBUG_RODATA is set, the read only sections are set > read only. > > Signed-off-by: Laura Abbott <lauraa@codeaurora.org> Hi Laura, I have some comments below. > --- > v4: Combined the Kconfig options > --- > arch/arm64/Kconfig.debug | 23 +++ > arch/arm64/include/asm/cacheflush.h | 4 + > arch/arm64/kernel/vmlinux.lds.S | 17 ++ > arch/arm64/mm/init.c | 1 + > arch/arm64/mm/mm.h | 2 + > arch/arm64/mm/mmu.c | 303 +++++++++++++++++++++++++++++++----- > 6 files changed, 314 insertions(+), 36 deletions(-) > > diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug > index 0a12933..1b25015 100644 > --- a/arch/arm64/Kconfig.debug > +++ b/arch/arm64/Kconfig.debug > @@ -54,4 +54,27 @@ config DEBUG_SET_MODULE_RONX > against certain classes of kernel exploits. > If in doubt, say "N". > > +config DEBUG_RODATA > + bool "Make kernel text and rodata read-only" > + help > + If this is set, kernel text and rodata will be made read-only. This > + is to help catch accidental or malicious attempts to change the > + kernel's executable code. Additionally splits rodata from kernel > + text so it can be made explicitly non-executable. > + > + If in doubt, say Y > + > +config DEBUG_ALIGN_RODATA > + depends on DEBUG_RODATA > + bool "Align linker sections up to SECTION_SIZE" > + help > + If this option is enabled, sections that may potentially be marked as > + read only or non-executable will be aligned up to the section size of > + the kernel. This prevents sections from being split into pages and > + avoids a potential TLB penalty. The downside is an increase in > + alignment and potentially wasted space. Turn on this option if > + performance is more important than memory pressure. > + > + If in doubt, say N > + > endmenu > diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h > index 689b637..0014207 100644 > --- a/arch/arm64/include/asm/cacheflush.h > +++ b/arch/arm64/include/asm/cacheflush.h > @@ -152,4 +152,8 @@ int set_memory_ro(unsigned long addr, int numpages); > int set_memory_rw(unsigned long addr, int numpages); > int set_memory_x(unsigned long addr, int numpages); > int set_memory_nx(unsigned long addr, int numpages); > + > +#ifdef CONFIG_DEBUG_RODATA > +void mark_rodata_ro(void); > +#endif > #endif > diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S > index edf8715..83424ad 100644 > --- a/arch/arm64/kernel/vmlinux.lds.S > +++ b/arch/arm64/kernel/vmlinux.lds.S > @@ -8,6 +8,7 @@ > #include <asm/thread_info.h> > #include <asm/memory.h> > #include <asm/page.h> > +#include <asm/pgtable.h> > > #include "image.h" > > @@ -54,6 +55,9 @@ SECTIONS > _text = .; > HEAD_TEXT > } > +#ifdef DEBUG_ALIGN_RODATA > + . 
= ALIGN(1<<SECTION_SHIFT); > +#endif > .text : { /* Real text segment */ > _stext = .; /* Text and read-only data */ > __exception_text_start = .; > @@ -70,19 +74,32 @@ SECTIONS > *(.got) /* Global offset table */ > } > > +#ifdef DEBUG_ALIGN_RODATA > + . = ALIGN(1<<SECTION_SHIFT); > +#endif > RO_DATA(PAGE_SIZE) > EXCEPTION_TABLE(8) > NOTES > _etext = .; /* End of text and rodata section */ > > +#ifdef DEBUG_ALIGN_RODATA > + . = ALIGN(1<<SECTION_SHIFT); > +#else > . = ALIGN(PAGE_SIZE); > +#endif > __init_begin = .; > > INIT_TEXT_SECTION(8) > .exit.text : { > ARM_EXIT_KEEP(EXIT_TEXT) > } > + > +#ifdef DEBUG_ALIGN_RODATA > + . = ALIGN(1<<SECTION_SHIFT); > + __init_data_begin = .; > +#else > . = ALIGN(16); > +#endif > .init.data : { > INIT_DATA > INIT_SETUP(16) > diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c > index 494297c..61f44c7 100644 > --- a/arch/arm64/mm/init.c > +++ b/arch/arm64/mm/init.c > @@ -324,6 +324,7 @@ void __init mem_init(void) > > void free_initmem(void) > { > + fixup_init(); > free_initmem_default(0); > } > > diff --git a/arch/arm64/mm/mm.h b/arch/arm64/mm/mm.h > index d519f4f..82347d8 100644 > --- a/arch/arm64/mm/mm.h > +++ b/arch/arm64/mm/mm.h > @@ -1,2 +1,4 @@ > extern void __init bootmem_init(void); > extern void __init arm64_swiotlb_init(void); > + > +void fixup_init(void); > diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c > index e92f633..0428294 100644 > --- a/arch/arm64/mm/mmu.c > +++ b/arch/arm64/mm/mmu.c > @@ -26,6 +26,7 @@ > #include <linux/memblock.h> > #include <linux/fs.h> > #include <linux/io.h> > +#include <linux/stop_machine.h> > > #include <asm/cputype.h> > #include <asm/fixmap.h> > @@ -137,17 +138,55 @@ static void __init *early_alloc(unsigned long sz) > return ptr; > } > > -static void __init alloc_init_pte(pmd_t *pmd, unsigned long addr, > +/* > + * remap a PMD into pages > + */ > +static noinline void __ref split_pmd(pmd_t *pmd, bool early) > +{ > + pte_t *pte, *start_pte; > + unsigned long pfn; > + int i = 0; > + > + if (early) > + start_pte = pte = early_alloc(PTRS_PER_PTE*sizeof(pte_t)); > + else > + start_pte = pte = (pte_t *)__get_free_page(PGALLOC_GFP); > + > + BUG_ON(!pte); > + > + pfn = pmd_pfn(*pmd); > + > + do { > + /* > + * Need to have the least restrictive permissions available > + * permissions will be fixed up later > + */ > + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); > + pfn++; > + } while (pte++, i++, i < PTRS_PER_PTE); > + > + > + __pmd_populate(pmd, __pa(start_pte), PMD_TYPE_TABLE); > + flush_tlb_all(); > +} > + > +static void __ref alloc_init_pte(pmd_t *pmd, unsigned long addr, > unsigned long end, unsigned long pfn, > - pgprot_t prot) > + pgprot_t prot, bool early) > { > pte_t *pte; > > if (pmd_none(*pmd)) { > - pte = early_alloc(PTRS_PER_PTE * sizeof(pte_t)); > + if (early) > + pte = early_alloc(PTRS_PER_PTE * sizeof(pte_t)); > + else > + pte = (pte_t *)__get_free_page(PGALLOC_GFP); > + BUG_ON(!pte); > __pmd_populate(pmd, __pa(pte), PMD_TYPE_TABLE); > } > - BUG_ON(pmd_bad(*pmd)); > + > + if (pmd_bad(*pmd)) > + split_pmd(pmd, early); > > pte = pte_offset_kernel(pmd, addr); > do { > @@ -156,29 +195,52 @@ static void __init alloc_init_pte(pmd_t *pmd, unsigned long addr, > } while (pte++, addr += PAGE_SIZE, addr != end); > } > > -static void __init alloc_init_pmd(pud_t *pud, unsigned long addr, > +void __ref split_pud(pud_t *old_pud, pmd_t *pmd) > +{ > + unsigned long pfn = pud_pfn(*old_pud); > + const pmdval_t mask = PMD_SECT_USER | PMD_SECT_PXN | PMD_SECT_UXN | > + PMD_SECT_RDONLY | PMD_SECT_PROT_NONE | > + 
PMD_SECT_VALID; > + pgprot_t prot = __pgprot(pud_val(*old_pud) & mask); This looks a little fragile, if I've understood things correctly then all we want to do is create a set of pmds with the same pgprot_t as the original pud? In that case it would probably be easier to simply mask out the pfn from our pud to create the pgprot rather than mask in a set of attributes (that may change in future)? > + int i = 0; > + > + do { > + set_pmd(pmd, pfn_pmd(pfn, prot)); > + pfn++; > + } while (pmd++, i++, i < PTRS_PER_PMD); > +} > + > +static void __ref alloc_init_pmd(pud_t *pud, unsigned long addr, > unsigned long end, phys_addr_t phys, > - int map_io) > + pgprot_t sect_prot, pgprot_t pte_prot, > + bool early, int map_io) > { > pmd_t *pmd; > unsigned long next; > - pmdval_t prot_sect; > - pgprot_t prot_pte; > > if (map_io) { > - prot_sect = PROT_SECT_DEVICE_nGnRE; > - prot_pte = __pgprot(PROT_DEVICE_nGnRE); > - } else { > - prot_sect = PROT_SECT_NORMAL_EXEC; > - prot_pte = PAGE_KERNEL_EXEC; > + sect_prot = PROT_SECT_DEVICE_nGnRE; > + pte_prot = __pgprot(PROT_DEVICE_nGnRE); > } I think we should wipe out map_io from all these functions as it's causing too much complexity. It's enough to have just sect_prot and pte_prot. I posted some similar feedback to Ard: http://lists.infradead.org/pipermail/linux-arm-kernel/2014-October/296918.html > > /* > * Check for initial section mappings in the pgd/pud and remove them. > */ > if (pud_none(*pud) || pud_bad(*pud)) { > - pmd = early_alloc(PTRS_PER_PMD * sizeof(pmd_t)); > + if (early) > + pmd = early_alloc(PTRS_PER_PMD * sizeof(pmd_t)); > + else > + pmd = pmd_alloc_one(&init_mm, addr); > + BUG_ON(!pmd); > + if (pud_sect(*pud)) { > + /* > + * need to have the 1G of mappings continue to be > + * present > + */ > + split_pud(pud, pmd); > + } > pud_populate(&init_mm, pud, pmd); > + flush_tlb_all(); > } > > pmd = pmd_offset(pud, addr); > @@ -186,8 +248,8 @@ static void __init alloc_init_pmd(pud_t *pud, unsigned long addr, > next = pmd_addr_end(addr, end); > /* try section mapping first */ > if (((addr | next | phys) & ~SECTION_MASK) == 0) { > - pmd_t old_pmd =*pmd; > - set_pmd(pmd, __pmd(phys | prot_sect)); > + pmd_t old_pmd = *pmd; > + set_pmd(pmd, __pmd(phys | sect_prot)); > /* > * Check for previous table entries created during > * boot (__create_page_tables) and flush them. 
> @@ -196,15 +258,16 @@ static void __init alloc_init_pmd(pud_t *pud, unsigned long addr, > flush_tlb_all(); > } else { > alloc_init_pte(pmd, addr, next, __phys_to_pfn(phys), > - prot_pte); > + pte_prot, early); > } > phys += next - addr; > } while (pmd++, addr = next, addr != end); > } > > -static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr, > +static void __ref alloc_init_pud(pgd_t *pgd, unsigned long addr, > unsigned long end, unsigned long phys, > - int map_io) > + pgprot_t sect_prot, pgprot_t pte_prot, > + bool early, int map_io) > { > pud_t *pud; > unsigned long next; > @@ -222,10 +285,10 @@ static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr, > /* > * For 4K granule only, attempt to put down a 1GB block > */ > - if (!map_io && (PAGE_SHIFT == 12) && > + if (!map_io && early && (PAGE_SHIFT == 12) && > ((addr | next | phys) & ~PUD_MASK) == 0) { > pud_t old_pud = *pud; > - set_pud(pud, __pud(phys | PROT_SECT_NORMAL_EXEC)); > + set_pud(pud, __pud(phys | sect_prot)); > > /* > * If we have an old value for a pud, it will > @@ -240,7 +303,8 @@ static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr, > flush_tlb_all(); > } > } else { > - alloc_init_pmd(pud, addr, next, phys, map_io); > + alloc_init_pmd(pud, addr, next, phys, sect_prot, > + pte_prot, early, map_io); > } > phys += next - addr; > } while (pud++, addr = next, addr != end); > @@ -250,9 +314,11 @@ static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr, > * Create the page directory entries and any necessary page tables for the > * mapping specified by 'md'. > */ > -static void __init __create_mapping(pgd_t *pgd, phys_addr_t phys, > - unsigned long virt, phys_addr_t size, > - int map_io) > +static void __ref __create_mapping(pgd_t *pgd, phys_addr_t phys, > + unsigned long virt, > + phys_addr_t size, > + pgprot_t sect_prot, pgprot_t pte_prot, > + int map_io, bool early) > { > unsigned long addr, length, end, next; > > @@ -262,31 +328,140 @@ static void __init __create_mapping(pgd_t *pgd, phys_addr_t phys, > end = addr + length; > do { > next = pgd_addr_end(addr, end); > - alloc_init_pud(pgd, addr, next, phys, map_io); > + alloc_init_pud(pgd, addr, next, phys, sect_prot, pte_prot, > + early, map_io); > phys += next - addr; > } while (pgd++, addr = next, addr != end); > } > > -static void __init create_mapping(phys_addr_t phys, unsigned long virt, > - phys_addr_t size) > +void __init create_id_mapping(phys_addr_t addr, phys_addr_t size, int map_io) > +{ > + if ((addr >> PGDIR_SHIFT) >= ARRAY_SIZE(idmap_pg_dir)) { > + pr_warn("BUG: not creating id mapping for %pa\n", &addr); > + return; > + } > + __create_mapping(&idmap_pg_dir[pgd_index(addr)], > + addr, addr, size, PROT_SECT_NORMAL_EXEC, > + PAGE_KERNEL_EXEC, map_io, false); > +} > + > +static inline pmd_t *pmd_off_k(unsigned long virt) > +{ > + return pmd_offset(pud_offset(pgd_offset_k(virt), virt), virt); > +} > + > +#ifdef CONFIG_EARLY_PRINTK > +/* > + * Create an early I/O mapping using the pgd/pmd entries already populated > + * in head.S as this function is called too early to allocated any memory. The > + * mapping size is 2MB with 4KB pages or 64KB or 64KB pages. > + */ > +void __iomem * __init early_io_map(phys_addr_t phys, unsigned long virt) > +{ > + unsigned long size, mask; > + bool page64k = IS_ENABLED(CONFIG_ARM64_64K_PAGES); > + pgd_t *pgd; > + pud_t *pud; > + pmd_t *pmd; > + pte_t *pte; > + > + /* > + * No early pte entries with !ARM64_64K_PAGES configuration, so using > + * sections (pmd). > + */ > + size = page64k ? 
PAGE_SIZE : SECTION_SIZE; > + mask = ~(size - 1); > + > + pgd = pgd_offset_k(virt); > + pud = pud_offset(pgd, virt); > + if (pud_none(*pud)) > + return NULL; > + pmd = pmd_offset(pud, virt); > + > + if (page64k) { > + if (pmd_none(*pmd)) > + return NULL; > + pte = pte_offset_kernel(pmd, virt); > + set_pte(pte, __pte((phys & mask) | PROT_DEVICE_nGnRE)); > + } else { > + set_pmd(pmd, __pmd((phys & mask) | PROT_SECT_DEVICE_nGnRE)); > + } > + > + return (void __iomem *)((virt & mask) + (phys & ~mask)); > +} > +#endif > + > +static void __ref create_mapping(phys_addr_t phys, unsigned long virt, > + phys_addr_t size, > + pgprot_t sect_prot, pgprot_t pte_prot) > { > if (virt < VMALLOC_START) { > pr_warn("BUG: not creating mapping for %pa at 0x%016lx - outside kernel range\n", > &phys, virt); > return; > } > - __create_mapping(pgd_offset_k(virt & PAGE_MASK), phys, virt, size, 0); > + > + return __create_mapping(pgd_offset_k(virt & PAGE_MASK), phys, virt, > + size, sect_prot, pte_prot, 0, true); > } > > -void __init create_id_mapping(phys_addr_t addr, phys_addr_t size, int map_io) > +static void __ref create_mapping_late(phys_addr_t phys, unsigned long virt, > + phys_addr_t size, > + pgprot_t sect_prot, pgprot_t pte_prot) > { > - if ((addr >> PGDIR_SHIFT) >= ARRAY_SIZE(idmap_pg_dir)) { > - pr_warn("BUG: not creating id mapping for %pa\n", &addr); > + if (virt < VMALLOC_START) { > + pr_warn("BUG: not creating mapping for %pa at 0x%016lx - outside kernel range\n", > + &phys, virt); > return; > } > - __create_mapping(&idmap_pg_dir[pgd_index(addr)], > - addr, addr, size, map_io); > + > + return __create_mapping(pgd_offset_k(virt & PAGE_MASK), phys, virt, > + size, sect_prot, pte_prot, 0, false); > +} > + > +#ifdef CONFIG_DEBUG_RODATA > +static void __init __map_memblock(phys_addr_t start, phys_addr_t end) > +{ > + /* > + * Set up the executable regions using the existing section mappings > + * for now. This will get more fine grained later once all memory > + * is mapped > + */ > + unsigned long kernel_x_start = round_down(__pa(_stext), SECTION_SIZE); > + unsigned long kernel_x_end = round_up(__pa(__init_end), SECTION_SIZE); > + > + if (end < kernel_x_start) { > + create_mapping(start, __phys_to_virt(start), > + end - start, PROT_SECT_NORMAL, PAGE_KERNEL); > + } else if (start >= kernel_x_end) { > + create_mapping(start, __phys_to_virt(start), > + end - start, PROT_SECT_NORMAL, PAGE_KERNEL); > + } else { > + if (start < kernel_x_start) > + create_mapping(start, __phys_to_virt(start), > + kernel_x_start - start, > + PROT_SECT_NORMAL, > + PAGE_KERNEL); > + create_mapping(kernel_x_start, > + __phys_to_virt(kernel_x_start), > + kernel_x_end - kernel_x_start, > + PROT_SECT_NORMAL_EXEC, PAGE_KERNEL_EXEC); > + if (kernel_x_end < end) > + create_mapping(kernel_x_end, > + __phys_to_virt(kernel_x_end), > + end - kernel_x_end, > + PROT_SECT_NORMAL, > + PAGE_KERNEL); > + } > + > +} > +#else > +static void __init __map_memblock(phys_addr_t start, phys_addr_t end) > +{ > + create_mapping(start, __phys_to_virt(start), end - start, > + PROT_SECT_NORMAL_EXEC, PAGE_KERNEL_EXEC); > } > +#endif > > static void __init map_mem(void) > { > @@ -328,14 +503,69 @@ static void __init map_mem(void) > memblock_set_current_limit(limit); > } > #endif > - > - create_mapping(start, __phys_to_virt(start), end - start); > + __map_memblock(start, end); > } > > /* Limit no longer required. 
*/ > memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); > } > > +void __init fixup_executable(void) > +{ > +#ifdef CONFIG_DEBUG_RODATA > + /* now that we are actually fully mapped, make the start/end more fine grained */ > + if (!IS_ALIGNED((unsigned long)_stext, SECTION_SIZE)) { > + unsigned long aligned_start = round_down(__pa(_stext), > + SECTION_SIZE); > + > + create_mapping(aligned_start, __phys_to_virt(aligned_start), > + __pa(_stext) - aligned_start, > + PROT_SECT_NORMAL, > + PAGE_KERNEL); > + } > + > + if (!IS_ALIGNED((unsigned long)__init_end, SECTION_SIZE)) { > + unsigned long aligned_end = round_up(__pa(__init_end), > + SECTION_SIZE); > + create_mapping(__pa(__init_end), (unsigned long)__init_end, > + aligned_end - __pa(__init_end), > + PROT_SECT_NORMAL, > + PAGE_KERNEL); > + } > +#endif > +} > + > +#ifdef CONFIG_DEBUG_RODATA > +void mark_rodata_ro(void) > +{ > + create_mapping_late(__pa(_stext), (unsigned long)_stext, > + (unsigned long)_etext - (unsigned long)_stext, > + PROT_SECT_NORMAL_EXEC | PMD_SECT_RDONLY, > + PAGE_KERNEL_EXEC | PTE_RDONLY); > + > +} > +#endif > + > +static int __flush_mappings(void *unused) > +{ > + flush_tlb_kernel_range((unsigned long)__init_begin, > + (unsigned long)__init_end); > + return 0; > +} > + > +void __ref fixup_init(void) > +{ > + phys_addr_t start = __pa(__init_begin); > + phys_addr_t end = __pa(__init_end); > + > + create_mapping_late(start, (unsigned long)__init_begin, > + end - start, > + PROT_SECT_NORMAL, > + PAGE_KERNEL); > + if (!IS_ALIGNED(start, SECTION_SIZE) || !IS_ALIGNED(end, SECTION_SIZE)) > + stop_machine(__flush_mappings, NULL, NULL); > +} > + > /* > * paging_init() sets up the page tables, initialises the zone memory > * maps and sets up the zero page. > @@ -345,6 +575,7 @@ void __init paging_init(void) > void *zero_page; > > map_mem(); > + fixup_executable(); > > /* > * Finally flush the caches and tlb to ensure that we're in a > -- > Qualcomm Innovation Center, Inc. > Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a Linux Foundation Collaborative Project >
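For reference, a minimal sketch of what dropping map_io from the table-walking code could look like, with the device/normal decision made once at the boundary. Only symbols already used in the quoted patch appear below; the reduced __create_mapping() signature (without map_io) is an assumption about how such a rework might end up, not the posted code:

	/*
	 * Sketch: create_id_mapping() keeps its map_io argument for external
	 * callers but resolves it to a {sect_prot, pte_prot} pair up front,
	 * so alloc_init_pud()/alloc_init_pmd() never see map_io at all.
	 * (Follows the patch's loose pgprot typing, i.e. !STRICT_MM_TYPECHECKS.)
	 */
	void __init create_id_mapping(phys_addr_t addr, phys_addr_t size, int map_io)
	{
		pgprot_t sect_prot = map_io ? PROT_SECT_DEVICE_nGnRE :
					      PROT_SECT_NORMAL_EXEC;
		pgprot_t pte_prot  = map_io ? __pgprot(PROT_DEVICE_nGnRE) :
					      PAGE_KERNEL_EXEC;

		if ((addr >> PGDIR_SHIFT) >= ARRAY_SIZE(idmap_pg_dir)) {
			pr_warn("BUG: not creating id mapping for %pa\n", &addr);
			return;
		}
		/* assumed reworked signature: map_io parameter removed */
		__create_mapping(&idmap_pg_dir[pgd_index(addr)],
				 addr, addr, size, sect_prot, pte_prot, false);
	}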
On 28 October 2014 12:29, Steve Capper <steve.capper@linaro.org> wrote: > On Mon, Oct 27, 2014 at 01:12:32PM -0700, Laura Abbott wrote: >> Add page protections for arm64 similar to those in arm. >> This is for security reasons to prevent certain classes >> of exploits. The current method: >> >> - Map all memory as either RWX or RW. We round to the nearest >> section to avoid creating page tables before everything is mapped >> - Once everything is mapped, if either end of the RWX section should >> not be X, we split the PMD and remap as necessary >> - When initmem is to be freed, we change the permissions back to >> RW (using stop machine if necessary to flush the TLB) >> - If CONFIG_DEBUG_RODATA is set, the read only sections are set >> read only. >> >> Signed-off-by: Laura Abbott <lauraa@codeaurora.org> > > Hi Laura, > I have some comments below. > >> --- >> v4: Combined the Kconfig options >> --- >> arch/arm64/Kconfig.debug | 23 +++ >> arch/arm64/include/asm/cacheflush.h | 4 + >> arch/arm64/kernel/vmlinux.lds.S | 17 ++ >> arch/arm64/mm/init.c | 1 + >> arch/arm64/mm/mm.h | 2 + >> arch/arm64/mm/mmu.c | 303 +++++++++++++++++++++++++++++++----- >> 6 files changed, 314 insertions(+), 36 deletions(-) >> [...] >> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c >> index e92f633..0428294 100644 >> --- a/arch/arm64/mm/mmu.c >> +++ b/arch/arm64/mm/mmu.c >> @@ -26,6 +26,7 @@ >> #include <linux/memblock.h> >> #include <linux/fs.h> >> #include <linux/io.h> >> +#include <linux/stop_machine.h> >> >> #include <asm/cputype.h> >> #include <asm/fixmap.h> >> @@ -137,17 +138,55 @@ static void __init *early_alloc(unsigned long sz) >> return ptr; >> } >> >> -static void __init alloc_init_pte(pmd_t *pmd, unsigned long addr, >> +/* >> + * remap a PMD into pages >> + */ >> +static noinline void __ref split_pmd(pmd_t *pmd, bool early) >> +{ >> + pte_t *pte, *start_pte; >> + unsigned long pfn; >> + int i = 0; >> + >> + if (early) >> + start_pte = pte = early_alloc(PTRS_PER_PTE*sizeof(pte_t)); >> + else >> + start_pte = pte = (pte_t *)__get_free_page(PGALLOC_GFP); >> + >> + BUG_ON(!pte); >> + >> + pfn = pmd_pfn(*pmd); >> + >> + do { >> + /* >> + * Need to have the least restrictive permissions available >> + * permissions will be fixed up later >> + */ >> + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); >> + pfn++; >> + } while (pte++, i++, i < PTRS_PER_PTE); >> + >> + >> + __pmd_populate(pmd, __pa(start_pte), PMD_TYPE_TABLE); >> + flush_tlb_all(); >> +} >> + >> +static void __ref alloc_init_pte(pmd_t *pmd, unsigned long addr, >> unsigned long end, unsigned long pfn, >> - pgprot_t prot) >> + pgprot_t prot, bool early) >> { >> pte_t *pte; >> >> if (pmd_none(*pmd)) { >> - pte = early_alloc(PTRS_PER_PTE * sizeof(pte_t)); >> + if (early) >> + pte = early_alloc(PTRS_PER_PTE * sizeof(pte_t)); >> + else >> + pte = (pte_t *)__get_free_page(PGALLOC_GFP); >> + BUG_ON(!pte); >> __pmd_populate(pmd, __pa(pte), PMD_TYPE_TABLE); >> } >> - BUG_ON(pmd_bad(*pmd)); >> + >> + if (pmd_bad(*pmd)) >> + split_pmd(pmd, early); >> >> pte = pte_offset_kernel(pmd, addr); >> do { >> @@ -156,29 +195,52 @@ static void __init alloc_init_pte(pmd_t *pmd, unsigned long addr, >> } while (pte++, addr += PAGE_SIZE, addr != end); >> } >> >> -static void __init alloc_init_pmd(pud_t *pud, unsigned long addr, >> +void __ref split_pud(pud_t *old_pud, pmd_t *pmd) >> +{ >> + unsigned long pfn = pud_pfn(*old_pud); >> + const pmdval_t mask = PMD_SECT_USER | PMD_SECT_PXN | PMD_SECT_UXN | >> + PMD_SECT_RDONLY | PMD_SECT_PROT_NONE | >> + PMD_SECT_VALID; >> + 
pgprot_t prot = __pgprot(pud_val(*old_pud) & mask); > > This looks a little fragile, if I've understood things correctly then > all we want to do is create a set of pmds with the same pgprot_t as the > original pud? > > In that case it would probably be easier to simply mask out the pfn > from our pud to create the pgprot rather than mask in a set of > attributes (that may change in future)? > >> + int i = 0; >> + >> + do { >> + set_pmd(pmd, pfn_pmd(pfn, prot)); >> + pfn++; >> + } while (pmd++, i++, i < PTRS_PER_PMD); >> +} >> + >> +static void __ref alloc_init_pmd(pud_t *pud, unsigned long addr, >> unsigned long end, phys_addr_t phys, >> - int map_io) >> + pgprot_t sect_prot, pgprot_t pte_prot, >> + bool early, int map_io) >> { >> pmd_t *pmd; >> unsigned long next; >> - pmdval_t prot_sect; >> - pgprot_t prot_pte; >> >> if (map_io) { >> - prot_sect = PROT_SECT_DEVICE_nGnRE; >> - prot_pte = __pgprot(PROT_DEVICE_nGnRE); >> - } else { >> - prot_sect = PROT_SECT_NORMAL_EXEC; >> - prot_pte = PAGE_KERNEL_EXEC; >> + sect_prot = PROT_SECT_DEVICE_nGnRE; >> + pte_prot = __pgprot(PROT_DEVICE_nGnRE); >> } > > I think we should wipe out map_io from all these functions as it's > causing too much complexity. It's enough to have just sect_prot and > pte_prot. I posted some similar feedback to Ard: > http://lists.infradead.org/pipermail/linux-arm-kernel/2014-October/296918.html > I do agree. But could we have a single enum that maps onto {sect_prot, pte_prot} tuples rather than having to pass both values everywhere we call any of these functions? I.e., { MMU_PROT_DEFAULT, MMU_PROT_READONLY, MMU_PROT_READWRITE, MMU_PROT_READEXEC, MMU_PROT_DEVICE }, with the mapping local to mmu.c? >> >> /* >> * Check for initial section mappings in the pgd/pud and remove them. >> */ >> if (pud_none(*pud) || pud_bad(*pud)) { >> - pmd = early_alloc(PTRS_PER_PMD * sizeof(pmd_t)); >> + if (early) >> + pmd = early_alloc(PTRS_PER_PMD * sizeof(pmd_t)); >> + else >> + pmd = pmd_alloc_one(&init_mm, addr); >> + BUG_ON(!pmd); >> + if (pud_sect(*pud)) { >> + /* >> + * need to have the 1G of mappings continue to be >> + * present >> + */ >> + split_pud(pud, pmd); >> + } >> pud_populate(&init_mm, pud, pmd); >> + flush_tlb_all(); >> } >> >> pmd = pmd_offset(pud, addr); >> @@ -186,8 +248,8 @@ static void __init alloc_init_pmd(pud_t *pud, unsigned long addr, >> next = pmd_addr_end(addr, end); >> /* try section mapping first */ >> if (((addr | next | phys) & ~SECTION_MASK) == 0) { >> - pmd_t old_pmd =*pmd; >> - set_pmd(pmd, __pmd(phys | prot_sect)); >> + pmd_t old_pmd = *pmd; >> + set_pmd(pmd, __pmd(phys | sect_prot)); >> /* >> * Check for previous table entries created during >> * boot (__create_page_tables) and flush them. 
>> @@ -196,15 +258,16 @@ static void __init alloc_init_pmd(pud_t *pud, unsigned long addr, >> flush_tlb_all(); >> } else { >> alloc_init_pte(pmd, addr, next, __phys_to_pfn(phys), >> - prot_pte); >> + pte_prot, early); >> } >> phys += next - addr; >> } while (pmd++, addr = next, addr != end); >> } >> >> -static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr, >> +static void __ref alloc_init_pud(pgd_t *pgd, unsigned long addr, >> unsigned long end, unsigned long phys, >> - int map_io) >> + pgprot_t sect_prot, pgprot_t pte_prot, >> + bool early, int map_io) >> { >> pud_t *pud; >> unsigned long next; >> @@ -222,10 +285,10 @@ static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr, >> /* >> * For 4K granule only, attempt to put down a 1GB block >> */ >> - if (!map_io && (PAGE_SHIFT == 12) && >> + if (!map_io && early && (PAGE_SHIFT == 12) && >> ((addr | next | phys) & ~PUD_MASK) == 0) { >> pud_t old_pud = *pud; >> - set_pud(pud, __pud(phys | PROT_SECT_NORMAL_EXEC)); >> + set_pud(pud, __pud(phys | sect_prot)); >> >> /* >> * If we have an old value for a pud, it will >> @@ -240,7 +303,8 @@ static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr, >> flush_tlb_all(); >> } >> } else { >> - alloc_init_pmd(pud, addr, next, phys, map_io); >> + alloc_init_pmd(pud, addr, next, phys, sect_prot, >> + pte_prot, early, map_io); >> } >> phys += next - addr; >> } while (pud++, addr = next, addr != end); >> @@ -250,9 +314,11 @@ static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr, >> * Create the page directory entries and any necessary page tables for the >> * mapping specified by 'md'. >> */ >> -static void __init __create_mapping(pgd_t *pgd, phys_addr_t phys, >> - unsigned long virt, phys_addr_t size, >> - int map_io) >> +static void __ref __create_mapping(pgd_t *pgd, phys_addr_t phys, >> + unsigned long virt, >> + phys_addr_t size, >> + pgprot_t sect_prot, pgprot_t pte_prot, >> + int map_io, bool early) >> { >> unsigned long addr, length, end, next; >> >> @@ -262,31 +328,140 @@ static void __init __create_mapping(pgd_t *pgd, phys_addr_t phys, >> end = addr + length; >> do { >> next = pgd_addr_end(addr, end); >> - alloc_init_pud(pgd, addr, next, phys, map_io); >> + alloc_init_pud(pgd, addr, next, phys, sect_prot, pte_prot, >> + early, map_io); >> phys += next - addr; >> } while (pgd++, addr = next, addr != end); >> } >> >> -static void __init create_mapping(phys_addr_t phys, unsigned long virt, >> - phys_addr_t size) >> +void __init create_id_mapping(phys_addr_t addr, phys_addr_t size, int map_io) >> +{ >> + if ((addr >> PGDIR_SHIFT) >= ARRAY_SIZE(idmap_pg_dir)) { >> + pr_warn("BUG: not creating id mapping for %pa\n", &addr); >> + return; >> + } >> + __create_mapping(&idmap_pg_dir[pgd_index(addr)], >> + addr, addr, size, PROT_SECT_NORMAL_EXEC, >> + PAGE_KERNEL_EXEC, map_io, false); >> +} >> + >> +static inline pmd_t *pmd_off_k(unsigned long virt) >> +{ >> + return pmd_offset(pud_offset(pgd_offset_k(virt), virt), virt); >> +} >> + >> +#ifdef CONFIG_EARLY_PRINTK >> +/* >> + * Create an early I/O mapping using the pgd/pmd entries already populated >> + * in head.S as this function is called too early to allocated any memory. The >> + * mapping size is 2MB with 4KB pages or 64KB or 64KB pages. 
>> + */ >> +void __iomem * __init early_io_map(phys_addr_t phys, unsigned long virt) >> +{ >> + unsigned long size, mask; >> + bool page64k = IS_ENABLED(CONFIG_ARM64_64K_PAGES); >> + pgd_t *pgd; >> + pud_t *pud; >> + pmd_t *pmd; >> + pte_t *pte; >> + >> + /* >> + * No early pte entries with !ARM64_64K_PAGES configuration, so using >> + * sections (pmd). >> + */ >> + size = page64k ? PAGE_SIZE : SECTION_SIZE; >> + mask = ~(size - 1); >> + >> + pgd = pgd_offset_k(virt); >> + pud = pud_offset(pgd, virt); >> + if (pud_none(*pud)) >> + return NULL; >> + pmd = pmd_offset(pud, virt); >> + >> + if (page64k) { >> + if (pmd_none(*pmd)) >> + return NULL; >> + pte = pte_offset_kernel(pmd, virt); >> + set_pte(pte, __pte((phys & mask) | PROT_DEVICE_nGnRE)); >> + } else { >> + set_pmd(pmd, __pmd((phys & mask) | PROT_SECT_DEVICE_nGnRE)); >> + } >> + >> + return (void __iomem *)((virt & mask) + (phys & ~mask)); >> +} >> +#endif >> + >> +static void __ref create_mapping(phys_addr_t phys, unsigned long virt, >> + phys_addr_t size, >> + pgprot_t sect_prot, pgprot_t pte_prot) >> { >> if (virt < VMALLOC_START) { >> pr_warn("BUG: not creating mapping for %pa at 0x%016lx - outside kernel range\n", >> &phys, virt); >> return; >> } >> - __create_mapping(pgd_offset_k(virt & PAGE_MASK), phys, virt, size, 0); >> + >> + return __create_mapping(pgd_offset_k(virt & PAGE_MASK), phys, virt, >> + size, sect_prot, pte_prot, 0, true); >> } >> >> -void __init create_id_mapping(phys_addr_t addr, phys_addr_t size, int map_io) >> +static void __ref create_mapping_late(phys_addr_t phys, unsigned long virt, >> + phys_addr_t size, >> + pgprot_t sect_prot, pgprot_t pte_prot) >> { >> - if ((addr >> PGDIR_SHIFT) >= ARRAY_SIZE(idmap_pg_dir)) { >> - pr_warn("BUG: not creating id mapping for %pa\n", &addr); >> + if (virt < VMALLOC_START) { >> + pr_warn("BUG: not creating mapping for %pa at 0x%016lx - outside kernel range\n", >> + &phys, virt); >> return; >> } >> - __create_mapping(&idmap_pg_dir[pgd_index(addr)], >> - addr, addr, size, map_io); >> + >> + return __create_mapping(pgd_offset_k(virt & PAGE_MASK), phys, virt, >> + size, sect_prot, pte_prot, 0, false); >> +} >> + >> +#ifdef CONFIG_DEBUG_RODATA >> +static void __init __map_memblock(phys_addr_t start, phys_addr_t end) >> +{ >> + /* >> + * Set up the executable regions using the existing section mappings >> + * for now. 
This will get more fine grained later once all memory >> + * is mapped >> + */ >> + unsigned long kernel_x_start = round_down(__pa(_stext), SECTION_SIZE); >> + unsigned long kernel_x_end = round_up(__pa(__init_end), SECTION_SIZE); >> + >> + if (end < kernel_x_start) { >> + create_mapping(start, __phys_to_virt(start), >> + end - start, PROT_SECT_NORMAL, PAGE_KERNEL); >> + } else if (start >= kernel_x_end) { >> + create_mapping(start, __phys_to_virt(start), >> + end - start, PROT_SECT_NORMAL, PAGE_KERNEL); >> + } else { >> + if (start < kernel_x_start) >> + create_mapping(start, __phys_to_virt(start), >> + kernel_x_start - start, >> + PROT_SECT_NORMAL, >> + PAGE_KERNEL); >> + create_mapping(kernel_x_start, >> + __phys_to_virt(kernel_x_start), >> + kernel_x_end - kernel_x_start, >> + PROT_SECT_NORMAL_EXEC, PAGE_KERNEL_EXEC); >> + if (kernel_x_end < end) >> + create_mapping(kernel_x_end, >> + __phys_to_virt(kernel_x_end), >> + end - kernel_x_end, >> + PROT_SECT_NORMAL, >> + PAGE_KERNEL); >> + } >> + >> +} >> +#else >> +static void __init __map_memblock(phys_addr_t start, phys_addr_t end) >> +{ >> + create_mapping(start, __phys_to_virt(start), end - start, >> + PROT_SECT_NORMAL_EXEC, PAGE_KERNEL_EXEC); >> } >> +#endif >> >> static void __init map_mem(void) >> { >> @@ -328,14 +503,69 @@ static void __init map_mem(void) >> memblock_set_current_limit(limit); >> } >> #endif >> - >> - create_mapping(start, __phys_to_virt(start), end - start); >> + __map_memblock(start, end); >> } >> >> /* Limit no longer required. */ >> memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); >> } >> >> +void __init fixup_executable(void) >> +{ >> +#ifdef CONFIG_DEBUG_RODATA >> + /* now that we are actually fully mapped, make the start/end more fine grained */ >> + if (!IS_ALIGNED((unsigned long)_stext, SECTION_SIZE)) { >> + unsigned long aligned_start = round_down(__pa(_stext), >> + SECTION_SIZE); >> + >> + create_mapping(aligned_start, __phys_to_virt(aligned_start), >> + __pa(_stext) - aligned_start, >> + PROT_SECT_NORMAL, >> + PAGE_KERNEL); >> + } >> + >> + if (!IS_ALIGNED((unsigned long)__init_end, SECTION_SIZE)) { >> + unsigned long aligned_end = round_up(__pa(__init_end), >> + SECTION_SIZE); >> + create_mapping(__pa(__init_end), (unsigned long)__init_end, >> + aligned_end - __pa(__init_end), >> + PROT_SECT_NORMAL, >> + PAGE_KERNEL); >> + } >> +#endif >> +} >> + >> +#ifdef CONFIG_DEBUG_RODATA >> +void mark_rodata_ro(void) >> +{ >> + create_mapping_late(__pa(_stext), (unsigned long)_stext, >> + (unsigned long)_etext - (unsigned long)_stext, >> + PROT_SECT_NORMAL_EXEC | PMD_SECT_RDONLY, >> + PAGE_KERNEL_EXEC | PTE_RDONLY); >> + >> +} >> +#endif >> + >> +static int __flush_mappings(void *unused) >> +{ >> + flush_tlb_kernel_range((unsigned long)__init_begin, >> + (unsigned long)__init_end); >> + return 0; >> +} >> + >> +void __ref fixup_init(void) >> +{ >> + phys_addr_t start = __pa(__init_begin); >> + phys_addr_t end = __pa(__init_end); >> + >> + create_mapping_late(start, (unsigned long)__init_begin, >> + end - start, >> + PROT_SECT_NORMAL, >> + PAGE_KERNEL); >> + if (!IS_ALIGNED(start, SECTION_SIZE) || !IS_ALIGNED(end, SECTION_SIZE)) >> + stop_machine(__flush_mappings, NULL, NULL); >> +} >> + >> /* >> * paging_init() sets up the page tables, initialises the zone memory >> * maps and sets up the zero page. 
>> @@ -345,6 +575,7 @@ void __init paging_init(void) >> void *zero_page; >> >> map_mem(); >> + fixup_executable(); >> >> /* >> * Finally flush the caches and tlb to ensure that we're in a >> -- >> Qualcomm Innovation Center, Inc. >> Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a Linux Foundation Collaborative Project >> > > _______________________________________________ > linux-arm-kernel mailing list > linux-arm-kernel@lists.infradead.org > http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
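A hedged sketch of the enum-based interface Ard floats above. The MMU_PROT_* names are the ones from his mail; the helper itself and its placement are illustrative only, assuming it lives next to the other protection definitions in mmu.c:

	enum mmu_prot {
		MMU_PROT_DEFAULT,	/* RWX, as used for the initial linear map */
		MMU_PROT_READONLY,
		MMU_PROT_READWRITE,
		MMU_PROT_READEXEC,
		MMU_PROT_DEVICE,
	};

	/* Resolve an enum value to the {section, pte} protection pair. */
	static void mmu_get_prots(enum mmu_prot type, pmdval_t *sect, pgprot_t *pte)
	{
		switch (type) {
		case MMU_PROT_DEVICE:
			*sect = PROT_SECT_DEVICE_nGnRE;
			*pte  = __pgprot(PROT_DEVICE_nGnRE);
			break;
		case MMU_PROT_READWRITE:
			*sect = PROT_SECT_NORMAL;
			*pte  = PAGE_KERNEL;
			break;
		case MMU_PROT_READEXEC:
			*sect = PROT_SECT_NORMAL_EXEC | PMD_SECT_RDONLY;
			*pte  = __pgprot(pgprot_val(PAGE_KERNEL_EXEC) | PTE_RDONLY);
			break;
		/* MMU_PROT_READONLY would follow the same pattern. */
		case MMU_PROT_DEFAULT:
		default:
			*sect = PROT_SECT_NORMAL_EXEC;
			*pte  = PAGE_KERNEL_EXEC;
			break;
		}
	}

With something like this, create_mapping() and friends would take a single enum mmu_prot and the tuple would never leave mmu.c, which is the locality Ard is after.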
On 28 October 2014 11:44, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote: > On 28 October 2014 12:29, Steve Capper <steve.capper@linaro.org> wrote: >> On Mon, Oct 27, 2014 at 01:12:32PM -0700, Laura Abbott wrote: [...] >> >> I think we should wipe out map_io from all these functions as it's >> causing too much complexity. It's enough to have just sect_prot and >> pte_prot. I posted some similar feedback to Ard: >> http://lists.infradead.org/pipermail/linux-arm-kernel/2014-October/296918.html >> > > I do agree. But could we have a single enum that maps onto {sect_prot, > pte_prot} tuples rather than having to pass both values everywhere we > call any of these functions? > I.e., { MMU_PROT_DEFAULT, MMU_PROT_READONLY, MMU_PROT_READWRITE, > MMU_PROT_READEXEC, MMU_PROT_DEVICE }, with the mapping local to mmu.c? > arch/arm has a mem_types array, I'm not sure how applicable that would be for arm64 One could also be (slightly) filthy and promote a pte_prot into a sect_prot via manipulation of the lower 2 bits (see for example pte_mkhuge). Basically as long as the mapping logic doesn't make decisions about which pgprots to use, then I'm happy :-). Cheers, -- Steve
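The "promotion" Steve mentions works because, at the descriptor level, a block (section) entry differs from a page/table entry only in bit 1, which is the bit pte_mkhuge() clears. A hedged sketch of a helper in that spirit (the mk_sect_prot() name is illustrative, assuming PTE_TABLE_BIT from pgtable-hwdef.h):

	/*
	 * Illustrative: derive a section (pmd/pud block) protection value from a
	 * page protection value by clearing the table bit, so callers only ever
	 * need to supply a single pgprot_t.
	 */
	#define mk_sect_prot(prot)	__pgprot(pgprot_val(prot) & ~PTE_TABLE_BIT)

	/* e.g. in alloc_init_pmd(), when a 2MB section mapping is possible:
	 *	set_pmd(pmd, __pmd(phys | pgprot_val(mk_sect_prot(prot))));
	 */

Either way, the main requirement is the one Steve states: the walking code itself stops making protection decisions.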
[resending due to previous header destruction] Hi Laura, On Mon, Oct 27, 2014 at 08:12:32PM +0000, Laura Abbott wrote: > Add page protections for arm64 similar to those in arm. > This is for security reasons to prevent certain classes > of exploits. The current method: > > - Map all memory as either RWX or RW. We round to the nearest > section to avoid creating page tables before everything is mapped > - Once everything is mapped, if either end of the RWX section should > not be X, we split the PMD and remap as necessary > - When initmem is to be freed, we change the permissions back to > RW (using stop machine if necessary to flush the TLB) > - If CONFIG_DEBUG_RODATA is set, the read only sections are set > read only. > > Signed-off-by: Laura Abbott <lauraa@codeaurora.org> > --- > v4: Combined the Kconfig options > --- > arch/arm64/Kconfig.debug | 23 +++ > arch/arm64/include/asm/cacheflush.h | 4 + > arch/arm64/kernel/vmlinux.lds.S | 17 ++ > arch/arm64/mm/init.c | 1 + > arch/arm64/mm/mm.h | 2 + > arch/arm64/mm/mmu.c | 303 +++++++++++++++++++++++++++++++----- > 6 files changed, 314 insertions(+), 36 deletions(-) With this patch applied to v3.18-rc2, my board to blows up at boot when using UEFI (without DEBUG_RODATA selected): ---->8---- EFI stub: Booting Linux Kernel... Initializing cgroup subsys cpu Linux version 3.18.0-rc2+ (mark@leverpostej) (gcc version 4.9.2 20140904 (prerelease) (crosstool-NG linaro-1.13.1-4.9-2014.09 - Linaro GCC 4.9-2014.09) ) #112 SMP PREEMPT Tue Oct 28 13:58:41 GMT 2014 CPU: AArch64 Processor [410fd030] revision 0 Detected VIPT I-cache on CPU0 bootconsole [uart0] enabled efi: Getting EFI parameters from FDT: EFI v2.40 by ARM Juno EFI Oct 7 2014 15:05:42 efi: ACPI=0xfebdc000 ACPI 2.0=0xfebdc014 cma: Reserved 16 MiB at fd800000 BUG: failure at arch/arm64/mm/mmu.c:234/alloc_init_pmd()! Kernel panic - not syncing: BUG! CPU: 0 PID: 0 Comm: swapper Not tainted 3.18.0-rc2+ #112 Call trace: [<ffffffc000087ec8>] dump_backtrace+0x0/0x124 [<ffffffc000087ffc>] show_stack+0x10/0x1c [<ffffffc0004ebd58>] dump_stack+0x74/0xb8 [<ffffffc0004eb018>] panic+0xe0/0x220 [<ffffffc0004e8e08>] alloc_init_pmd+0x1cc/0x1dc [<ffffffc0004e8e3c>] alloc_init_pud+0x24/0x6c [<ffffffc0004e8f54>] __create_mapping+0xd0/0xf0 [<ffffffc00069a0a0>] create_id_mapping+0x5c/0x68 [<ffffffc00069964c>] efi_idmap_init+0x54/0xd8 [<ffffffc0006978a8>] setup_arch+0x408/0x5c0 [<ffffffc00069566c>] start_kernel+0x94/0x3a0 ---[ end Kernel panic - not syncing: BUG! ---->8---- I've not yet figured out precisely why. I haven't tried a !EFI boot because of the way my board is set up at the moment. Mark.
On 10/28/2014 4:29 AM, Steve Capper wrote: > On Mon, Oct 27, 2014 at 01:12:32PM -0700, Laura Abbott wrote: >> Add page protections for arm64 similar to those in arm. >> This is for security reasons to prevent certain classes >> of exploits. The current method: >> >> - Map all memory as either RWX or RW. We round to the nearest >> section to avoid creating page tables before everything is mapped >> - Once everything is mapped, if either end of the RWX section should >> not be X, we split the PMD and remap as necessary >> - When initmem is to be freed, we change the permissions back to >> RW (using stop machine if necessary to flush the TLB) >> - If CONFIG_DEBUG_RODATA is set, the read only sections are set >> read only. >> >> Signed-off-by: Laura Abbott <lauraa@codeaurora.org> > > Hi Laura, > I have some comments below. > ... >> @@ -156,29 +195,52 @@ static void __init alloc_init_pte(pmd_t *pmd, unsigned long addr, >> } while (pte++, addr += PAGE_SIZE, addr != end); >> } >> >> -static void __init alloc_init_pmd(pud_t *pud, unsigned long addr, >> +void __ref split_pud(pud_t *old_pud, pmd_t *pmd) >> +{ >> + unsigned long pfn = pud_pfn(*old_pud); >> + const pmdval_t mask = PMD_SECT_USER | PMD_SECT_PXN | PMD_SECT_UXN | >> + PMD_SECT_RDONLY | PMD_SECT_PROT_NONE | >> + PMD_SECT_VALID; >> + pgprot_t prot = __pgprot(pud_val(*old_pud) & mask); > > This looks a little fragile, if I've understood things correctly then > all we want to do is create a set of pmds with the same pgprot_t as the > original pud? > > In that case it would probably be easier to simply mask out the pfn > from our pud to create the pgprot rather than mask in a set of > attributes (that may change in future)? > Good suggestion. I'll check that in the next version. >> + int i = 0; >> + >> + do { >> + set_pmd(pmd, pfn_pmd(pfn, prot)); >> + pfn++; >> + } while (pmd++, i++, i < PTRS_PER_PMD); >> +} >> + >> +static void __ref alloc_init_pmd(pud_t *pud, unsigned long addr, >> unsigned long end, phys_addr_t phys, >> - int map_io) >> + pgprot_t sect_prot, pgprot_t pte_prot, >> + bool early, int map_io) >> { >> pmd_t *pmd; >> unsigned long next; >> - pmdval_t prot_sect; >> - pgprot_t prot_pte; >> >> if (map_io) { >> - prot_sect = PROT_SECT_DEVICE_nGnRE; >> - prot_pte = __pgprot(PROT_DEVICE_nGnRE); >> - } else { >> - prot_sect = PROT_SECT_NORMAL_EXEC; >> - prot_pte = PAGE_KERNEL_EXEC; >> + sect_prot = PROT_SECT_DEVICE_nGnRE; >> + pte_prot = __pgprot(PROT_DEVICE_nGnRE); >> } > > I think we should wipe out map_io from all these functions as it's > causing too much complexity. It's enough to have just sect_prot and > pte_prot. I posted some similar feedback to Ard: > http://lists.infradead.org/pipermail/linux-arm-kernel/2014-October/296918.html > Sounds fine. I think my series would conflict with Ard's so I should give that series a review and see if it makes sense to base this series off of that series. Part of this work might also be relevent for adding DEBUG_PAGEALLOC for arm64 so I might split that out into a separate patch as well. Thanks, Laura
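For illustration, masking the pfn out as discussed above could make split_pud() look roughly like the sketch below. It reuses only symbols from the quoted code; note as well that each pmd entry covers PMD_SIZE, so the output address has to advance by PMD_SIZE (not one page) per entry:

	/*
	 * Sketch: strip the output address from the existing pud to recover its
	 * attribute bits, so no list of attribute flags needs to be maintained.
	 */
	static void split_pud(pud_t *old_pud, pmd_t *pmd)
	{
		unsigned long addr = pud_pfn(*old_pud) << PAGE_SHIFT;
		pgprot_t prot = __pgprot(pud_val(*old_pud) ^ addr);	/* attrs only */
		int i = 0;

		do {
			set_pmd(pmd, __pmd(addr | pgprot_val(prot)));
			addr += PMD_SIZE;
		} while (pmd++, i++, i < PTRS_PER_PMD);
	}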
On 10/28/2014 7:20 AM, Mark Rutland wrote: > [resending due to previous header destruction] > > Hi Laura, > > On Mon, Oct 27, 2014 at 08:12:32PM +0000, Laura Abbott wrote: >> Add page protections for arm64 similar to those in arm. >> This is for security reasons to prevent certain classes >> of exploits. The current method: >> >> - Map all memory as either RWX or RW. We round to the nearest >> section to avoid creating page tables before everything is mapped >> - Once everything is mapped, if either end of the RWX section should >> not be X, we split the PMD and remap as necessary >> - When initmem is to be freed, we change the permissions back to >> RW (using stop machine if necessary to flush the TLB) >> - If CONFIG_DEBUG_RODATA is set, the read only sections are set >> read only. >> >> Signed-off-by: Laura Abbott <lauraa@codeaurora.org> >> --- >> v4: Combined the Kconfig options >> --- >> arch/arm64/Kconfig.debug | 23 +++ >> arch/arm64/include/asm/cacheflush.h | 4 + >> arch/arm64/kernel/vmlinux.lds.S | 17 ++ >> arch/arm64/mm/init.c | 1 + >> arch/arm64/mm/mm.h | 2 + >> arch/arm64/mm/mmu.c | 303 +++++++++++++++++++++++++++++++----- >> 6 files changed, 314 insertions(+), 36 deletions(-) > > With this patch applied to v3.18-rc2, my board to blows up at boot when > using UEFI (without DEBUG_RODATA selected): > > ---->8---- > EFI stub: Booting Linux Kernel... > Initializing cgroup subsys cpu > Linux version 3.18.0-rc2+ (mark@leverpostej) (gcc version 4.9.2 20140904 (prerelease) (crosstool-NG linaro-1.13.1-4.9-2014.09 - Linaro GCC 4.9-2014.09) ) #112 SMP PREEMPT Tue Oct 28 13:58:41 GMT 2014 > CPU: AArch64 Processor [410fd030] revision 0 > Detected VIPT I-cache on CPU0 > bootconsole [uart0] enabled > efi: Getting EFI parameters from FDT: > EFI v2.40 by ARM Juno EFI Oct 7 2014 15:05:42 > efi: ACPI=0xfebdc000 ACPI 2.0=0xfebdc014 > cma: Reserved 16 MiB at fd800000 > BUG: failure at arch/arm64/mm/mmu.c:234/alloc_init_pmd()! > Kernel panic - not syncing: BUG! > CPU: 0 PID: 0 Comm: swapper Not tainted 3.18.0-rc2+ #112 > Call trace: > [<ffffffc000087ec8>] dump_backtrace+0x0/0x124 > [<ffffffc000087ffc>] show_stack+0x10/0x1c > [<ffffffc0004ebd58>] dump_stack+0x74/0xb8 > [<ffffffc0004eb018>] panic+0xe0/0x220 > [<ffffffc0004e8e08>] alloc_init_pmd+0x1cc/0x1dc > [<ffffffc0004e8e3c>] alloc_init_pud+0x24/0x6c > [<ffffffc0004e8f54>] __create_mapping+0xd0/0xf0 > [<ffffffc00069a0a0>] create_id_mapping+0x5c/0x68 > [<ffffffc00069964c>] efi_idmap_init+0x54/0xd8 > [<ffffffc0006978a8>] setup_arch+0x408/0x5c0 > [<ffffffc00069566c>] start_kernel+0x94/0x3a0 > ---[ end Kernel panic - not syncing: BUG! > ---->8---- > > I've not yet figured out precisely why. I haven't tried a !EFI boot > because of the way my board is set up at the moment. > I think it's because the idmap is now being created earlier than expected so it's trying to get memory from the normal allocator and dying. I'll have to see if I can get my hands on an EFI board and test it out. Thanks, Laura
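That reading fits the quoted code: create_id_mapping() hard-codes early = false, and efi_idmap_init() runs from setup_arch(), before the core allocators are up, so alloc_init_pmd() takes the pmd_alloc_one() path and its BUG_ON(!pmd) is plausibly the mmu.c:234 failure in Mark's log. One hedged way to avoid relying on a caller-supplied flag is to derive the allocator choice from boot progress, for example (sketch only, assuming it sits next to early_alloc() in mmu.c):

	#include <linux/slab.h>		/* for slab_is_available() */

	/*
	 * Sketch: pick the page-table allocator based on how far boot has
	 * progressed rather than a hard-coded 'early' argument.
	 */
	static void *alloc_pgtable_page(void)
	{
		if (!slab_is_available())	/* e.g. called via setup_arch() */
			return early_alloc(PAGE_SIZE);
		return (void *)__get_free_page(PGALLOC_GFP);	/* e.g. fixup_init() */
	}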
diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug index 0a12933..1b25015 100644 --- a/arch/arm64/Kconfig.debug +++ b/arch/arm64/Kconfig.debug @@ -54,4 +54,27 @@ config DEBUG_SET_MODULE_RONX against certain classes of kernel exploits. If in doubt, say "N". +config DEBUG_RODATA + bool "Make kernel text and rodata read-only" + help + If this is set, kernel text and rodata will be made read-only. This + is to help catch accidental or malicious attempts to change the + kernel's executable code. Additionally splits rodata from kernel + text so it can be made explicitly non-executable. + + If in doubt, say Y + +config DEBUG_ALIGN_RODATA + depends on DEBUG_RODATA + bool "Align linker sections up to SECTION_SIZE" + help + If this option is enabled, sections that may potentially be marked as + read only or non-executable will be aligned up to the section size of + the kernel. This prevents sections from being split into pages and + avoids a potential TLB penalty. The downside is an increase in + alignment and potentially wasted space. Turn on this option if + performance is more important than memory pressure. + + If in doubt, say N + endmenu diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 689b637..0014207 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -152,4 +152,8 @@ int set_memory_ro(unsigned long addr, int numpages); int set_memory_rw(unsigned long addr, int numpages); int set_memory_x(unsigned long addr, int numpages); int set_memory_nx(unsigned long addr, int numpages); + +#ifdef CONFIG_DEBUG_RODATA +void mark_rodata_ro(void); +#endif #endif diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index edf8715..83424ad 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -8,6 +8,7 @@ #include <asm/thread_info.h> #include <asm/memory.h> #include <asm/page.h> +#include <asm/pgtable.h> #include "image.h" @@ -54,6 +55,9 @@ SECTIONS _text = .; HEAD_TEXT } +#ifdef DEBUG_ALIGN_RODATA + . = ALIGN(1<<SECTION_SHIFT); +#endif .text : { /* Real text segment */ _stext = .; /* Text and read-only data */ __exception_text_start = .; @@ -70,19 +74,32 @@ SECTIONS *(.got) /* Global offset table */ } +#ifdef DEBUG_ALIGN_RODATA + . = ALIGN(1<<SECTION_SHIFT); +#endif RO_DATA(PAGE_SIZE) EXCEPTION_TABLE(8) NOTES _etext = .; /* End of text and rodata section */ +#ifdef DEBUG_ALIGN_RODATA + . = ALIGN(1<<SECTION_SHIFT); +#else . = ALIGN(PAGE_SIZE); +#endif __init_begin = .; INIT_TEXT_SECTION(8) .exit.text : { ARM_EXIT_KEEP(EXIT_TEXT) } + +#ifdef DEBUG_ALIGN_RODATA + . = ALIGN(1<<SECTION_SHIFT); + __init_data_begin = .; +#else . 
= ALIGN(16); +#endif .init.data : { INIT_DATA INIT_SETUP(16) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 494297c..61f44c7 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -324,6 +324,7 @@ void __init mem_init(void) void free_initmem(void) { + fixup_init(); free_initmem_default(0); } diff --git a/arch/arm64/mm/mm.h b/arch/arm64/mm/mm.h index d519f4f..82347d8 100644 --- a/arch/arm64/mm/mm.h +++ b/arch/arm64/mm/mm.h @@ -1,2 +1,4 @@ extern void __init bootmem_init(void); extern void __init arm64_swiotlb_init(void); + +void fixup_init(void); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index e92f633..0428294 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -26,6 +26,7 @@ #include <linux/memblock.h> #include <linux/fs.h> #include <linux/io.h> +#include <linux/stop_machine.h> #include <asm/cputype.h> #include <asm/fixmap.h> @@ -137,17 +138,55 @@ static void __init *early_alloc(unsigned long sz) return ptr; } -static void __init alloc_init_pte(pmd_t *pmd, unsigned long addr, +/* + * remap a PMD into pages + */ +static noinline void __ref split_pmd(pmd_t *pmd, bool early) +{ + pte_t *pte, *start_pte; + unsigned long pfn; + int i = 0; + + if (early) + start_pte = pte = early_alloc(PTRS_PER_PTE*sizeof(pte_t)); + else + start_pte = pte = (pte_t *)__get_free_page(PGALLOC_GFP); + + BUG_ON(!pte); + + pfn = pmd_pfn(*pmd); + + do { + /* + * Need to have the least restrictive permissions available + * permissions will be fixed up later + */ + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); + pfn++; + } while (pte++, i++, i < PTRS_PER_PTE); + + + __pmd_populate(pmd, __pa(start_pte), PMD_TYPE_TABLE); + flush_tlb_all(); +} + +static void __ref alloc_init_pte(pmd_t *pmd, unsigned long addr, unsigned long end, unsigned long pfn, - pgprot_t prot) + pgprot_t prot, bool early) { pte_t *pte; if (pmd_none(*pmd)) { - pte = early_alloc(PTRS_PER_PTE * sizeof(pte_t)); + if (early) + pte = early_alloc(PTRS_PER_PTE * sizeof(pte_t)); + else + pte = (pte_t *)__get_free_page(PGALLOC_GFP); + BUG_ON(!pte); __pmd_populate(pmd, __pa(pte), PMD_TYPE_TABLE); } - BUG_ON(pmd_bad(*pmd)); + + if (pmd_bad(*pmd)) + split_pmd(pmd, early); pte = pte_offset_kernel(pmd, addr); do { @@ -156,29 +195,52 @@ static void __init alloc_init_pte(pmd_t *pmd, unsigned long addr, } while (pte++, addr += PAGE_SIZE, addr != end); } -static void __init alloc_init_pmd(pud_t *pud, unsigned long addr, +void __ref split_pud(pud_t *old_pud, pmd_t *pmd) +{ + unsigned long pfn = pud_pfn(*old_pud); + const pmdval_t mask = PMD_SECT_USER | PMD_SECT_PXN | PMD_SECT_UXN | + PMD_SECT_RDONLY | PMD_SECT_PROT_NONE | + PMD_SECT_VALID; + pgprot_t prot = __pgprot(pud_val(*old_pud) & mask); + int i = 0; + + do { + set_pmd(pmd, pfn_pmd(pfn, prot)); + pfn++; + } while (pmd++, i++, i < PTRS_PER_PMD); +} + +static void __ref alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys, - int map_io) + pgprot_t sect_prot, pgprot_t pte_prot, + bool early, int map_io) { pmd_t *pmd; unsigned long next; - pmdval_t prot_sect; - pgprot_t prot_pte; if (map_io) { - prot_sect = PROT_SECT_DEVICE_nGnRE; - prot_pte = __pgprot(PROT_DEVICE_nGnRE); - } else { - prot_sect = PROT_SECT_NORMAL_EXEC; - prot_pte = PAGE_KERNEL_EXEC; + sect_prot = PROT_SECT_DEVICE_nGnRE; + pte_prot = __pgprot(PROT_DEVICE_nGnRE); } /* * Check for initial section mappings in the pgd/pud and remove them. 
*/ if (pud_none(*pud) || pud_bad(*pud)) { - pmd = early_alloc(PTRS_PER_PMD * sizeof(pmd_t)); + if (early) + pmd = early_alloc(PTRS_PER_PMD * sizeof(pmd_t)); + else + pmd = pmd_alloc_one(&init_mm, addr); + BUG_ON(!pmd); + if (pud_sect(*pud)) { + /* + * need to have the 1G of mappings continue to be + * present + */ + split_pud(pud, pmd); + } pud_populate(&init_mm, pud, pmd); + flush_tlb_all(); } pmd = pmd_offset(pud, addr); @@ -186,8 +248,8 @@ static void __init alloc_init_pmd(pud_t *pud, unsigned long addr, next = pmd_addr_end(addr, end); /* try section mapping first */ if (((addr | next | phys) & ~SECTION_MASK) == 0) { - pmd_t old_pmd =*pmd; - set_pmd(pmd, __pmd(phys | prot_sect)); + pmd_t old_pmd = *pmd; + set_pmd(pmd, __pmd(phys | sect_prot)); /* * Check for previous table entries created during * boot (__create_page_tables) and flush them. @@ -196,15 +258,16 @@ static void __init alloc_init_pmd(pud_t *pud, unsigned long addr, flush_tlb_all(); } else { alloc_init_pte(pmd, addr, next, __phys_to_pfn(phys), - prot_pte); + pte_prot, early); } phys += next - addr; } while (pmd++, addr = next, addr != end); } -static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr, +static void __ref alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, unsigned long phys, - int map_io) + pgprot_t sect_prot, pgprot_t pte_prot, + bool early, int map_io) { pud_t *pud; unsigned long next; @@ -222,10 +285,10 @@ static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr, /* * For 4K granule only, attempt to put down a 1GB block */ - if (!map_io && (PAGE_SHIFT == 12) && + if (!map_io && early && (PAGE_SHIFT == 12) && ((addr | next | phys) & ~PUD_MASK) == 0) { pud_t old_pud = *pud; - set_pud(pud, __pud(phys | PROT_SECT_NORMAL_EXEC)); + set_pud(pud, __pud(phys | sect_prot)); /* * If we have an old value for a pud, it will @@ -240,7 +303,8 @@ static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr, flush_tlb_all(); } } else { - alloc_init_pmd(pud, addr, next, phys, map_io); + alloc_init_pmd(pud, addr, next, phys, sect_prot, + pte_prot, early, map_io); } phys += next - addr; } while (pud++, addr = next, addr != end); @@ -250,9 +314,11 @@ static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr, * Create the page directory entries and any necessary page tables for the * mapping specified by 'md'. 
*/ -static void __init __create_mapping(pgd_t *pgd, phys_addr_t phys, - unsigned long virt, phys_addr_t size, - int map_io) +static void __ref __create_mapping(pgd_t *pgd, phys_addr_t phys, + unsigned long virt, + phys_addr_t size, + pgprot_t sect_prot, pgprot_t pte_prot, + int map_io, bool early) { unsigned long addr, length, end, next; @@ -262,31 +328,140 @@ static void __init __create_mapping(pgd_t *pgd, phys_addr_t phys, end = addr + length; do { next = pgd_addr_end(addr, end); - alloc_init_pud(pgd, addr, next, phys, map_io); + alloc_init_pud(pgd, addr, next, phys, sect_prot, pte_prot, + early, map_io); phys += next - addr; } while (pgd++, addr = next, addr != end); } -static void __init create_mapping(phys_addr_t phys, unsigned long virt, - phys_addr_t size) +void __init create_id_mapping(phys_addr_t addr, phys_addr_t size, int map_io) +{ + if ((addr >> PGDIR_SHIFT) >= ARRAY_SIZE(idmap_pg_dir)) { + pr_warn("BUG: not creating id mapping for %pa\n", &addr); + return; + } + __create_mapping(&idmap_pg_dir[pgd_index(addr)], + addr, addr, size, PROT_SECT_NORMAL_EXEC, + PAGE_KERNEL_EXEC, map_io, false); +} + +static inline pmd_t *pmd_off_k(unsigned long virt) +{ + return pmd_offset(pud_offset(pgd_offset_k(virt), virt), virt); +} + +#ifdef CONFIG_EARLY_PRINTK +/* + * Create an early I/O mapping using the pgd/pmd entries already populated + * in head.S as this function is called too early to allocated any memory. The + * mapping size is 2MB with 4KB pages or 64KB or 64KB pages. + */ +void __iomem * __init early_io_map(phys_addr_t phys, unsigned long virt) +{ + unsigned long size, mask; + bool page64k = IS_ENABLED(CONFIG_ARM64_64K_PAGES); + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + /* + * No early pte entries with !ARM64_64K_PAGES configuration, so using + * sections (pmd). + */ + size = page64k ? 
PAGE_SIZE : SECTION_SIZE; + mask = ~(size - 1); + + pgd = pgd_offset_k(virt); + pud = pud_offset(pgd, virt); + if (pud_none(*pud)) + return NULL; + pmd = pmd_offset(pud, virt); + + if (page64k) { + if (pmd_none(*pmd)) + return NULL; + pte = pte_offset_kernel(pmd, virt); + set_pte(pte, __pte((phys & mask) | PROT_DEVICE_nGnRE)); + } else { + set_pmd(pmd, __pmd((phys & mask) | PROT_SECT_DEVICE_nGnRE)); + } + + return (void __iomem *)((virt & mask) + (phys & ~mask)); +} +#endif + +static void __ref create_mapping(phys_addr_t phys, unsigned long virt, + phys_addr_t size, + pgprot_t sect_prot, pgprot_t pte_prot) { if (virt < VMALLOC_START) { pr_warn("BUG: not creating mapping for %pa at 0x%016lx - outside kernel range\n", &phys, virt); return; } - __create_mapping(pgd_offset_k(virt & PAGE_MASK), phys, virt, size, 0); + + return __create_mapping(pgd_offset_k(virt & PAGE_MASK), phys, virt, + size, sect_prot, pte_prot, 0, true); } -void __init create_id_mapping(phys_addr_t addr, phys_addr_t size, int map_io) +static void __ref create_mapping_late(phys_addr_t phys, unsigned long virt, + phys_addr_t size, + pgprot_t sect_prot, pgprot_t pte_prot) { - if ((addr >> PGDIR_SHIFT) >= ARRAY_SIZE(idmap_pg_dir)) { - pr_warn("BUG: not creating id mapping for %pa\n", &addr); + if (virt < VMALLOC_START) { + pr_warn("BUG: not creating mapping for %pa at 0x%016lx - outside kernel range\n", + &phys, virt); return; } - __create_mapping(&idmap_pg_dir[pgd_index(addr)], - addr, addr, size, map_io); + + return __create_mapping(pgd_offset_k(virt & PAGE_MASK), phys, virt, + size, sect_prot, pte_prot, 0, false); +} + +#ifdef CONFIG_DEBUG_RODATA +static void __init __map_memblock(phys_addr_t start, phys_addr_t end) +{ + /* + * Set up the executable regions using the existing section mappings + * for now. This will get more fine grained later once all memory + * is mapped + */ + unsigned long kernel_x_start = round_down(__pa(_stext), SECTION_SIZE); + unsigned long kernel_x_end = round_up(__pa(__init_end), SECTION_SIZE); + + if (end < kernel_x_start) { + create_mapping(start, __phys_to_virt(start), + end - start, PROT_SECT_NORMAL, PAGE_KERNEL); + } else if (start >= kernel_x_end) { + create_mapping(start, __phys_to_virt(start), + end - start, PROT_SECT_NORMAL, PAGE_KERNEL); + } else { + if (start < kernel_x_start) + create_mapping(start, __phys_to_virt(start), + kernel_x_start - start, + PROT_SECT_NORMAL, + PAGE_KERNEL); + create_mapping(kernel_x_start, + __phys_to_virt(kernel_x_start), + kernel_x_end - kernel_x_start, + PROT_SECT_NORMAL_EXEC, PAGE_KERNEL_EXEC); + if (kernel_x_end < end) + create_mapping(kernel_x_end, + __phys_to_virt(kernel_x_end), + end - kernel_x_end, + PROT_SECT_NORMAL, + PAGE_KERNEL); + } + +} +#else +static void __init __map_memblock(phys_addr_t start, phys_addr_t end) +{ + create_mapping(start, __phys_to_virt(start), end - start, + PROT_SECT_NORMAL_EXEC, PAGE_KERNEL_EXEC); } +#endif static void __init map_mem(void) { @@ -328,14 +503,69 @@ static void __init map_mem(void) memblock_set_current_limit(limit); } #endif - - create_mapping(start, __phys_to_virt(start), end - start); + __map_memblock(start, end); } /* Limit no longer required. 
*/ memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); } +void __init fixup_executable(void) +{ +#ifdef CONFIG_DEBUG_RODATA + /* now that we are actually fully mapped, make the start/end more fine grained */ + if (!IS_ALIGNED((unsigned long)_stext, SECTION_SIZE)) { + unsigned long aligned_start = round_down(__pa(_stext), + SECTION_SIZE); + + create_mapping(aligned_start, __phys_to_virt(aligned_start), + __pa(_stext) - aligned_start, + PROT_SECT_NORMAL, + PAGE_KERNEL); + } + + if (!IS_ALIGNED((unsigned long)__init_end, SECTION_SIZE)) { + unsigned long aligned_end = round_up(__pa(__init_end), + SECTION_SIZE); + create_mapping(__pa(__init_end), (unsigned long)__init_end, + aligned_end - __pa(__init_end), + PROT_SECT_NORMAL, + PAGE_KERNEL); + } +#endif +} + +#ifdef CONFIG_DEBUG_RODATA +void mark_rodata_ro(void) +{ + create_mapping_late(__pa(_stext), (unsigned long)_stext, + (unsigned long)_etext - (unsigned long)_stext, + PROT_SECT_NORMAL_EXEC | PMD_SECT_RDONLY, + PAGE_KERNEL_EXEC | PTE_RDONLY); + +} +#endif + +static int __flush_mappings(void *unused) +{ + flush_tlb_kernel_range((unsigned long)__init_begin, + (unsigned long)__init_end); + return 0; +} + +void __ref fixup_init(void) +{ + phys_addr_t start = __pa(__init_begin); + phys_addr_t end = __pa(__init_end); + + create_mapping_late(start, (unsigned long)__init_begin, + end - start, + PROT_SECT_NORMAL, + PAGE_KERNEL); + if (!IS_ALIGNED(start, SECTION_SIZE) || !IS_ALIGNED(end, SECTION_SIZE)) + stop_machine(__flush_mappings, NULL, NULL); +} + /* * paging_init() sets up the page tables, initialises the zone memory * maps and sets up the zero page. @@ -345,6 +575,7 @@ void __init paging_init(void) void *zero_page; map_mem(); + fixup_executable(); /* * Finally flush the caches and tlb to ensure that we're in a
Add page protections for arm64 similar to those in arm. This is for security reasons to prevent certain classes of exploits. The current method: - Map all memory as either RWX or RW. We round to the nearest section to avoid creating page tables before everything is mapped - Once everything is mapped, if either end of the RWX section should not be X, we split the PMD and remap as necessary - When initmem is to be freed, we change the permissions back to RW (using stop machine if necessary to flush the TLB) - If CONFIG_DEBUG_RODATA is set, the read only sections are set read only. Signed-off-by: Laura Abbott <lauraa@codeaurora.org> --- v4: Combined the Kconfig options --- arch/arm64/Kconfig.debug | 23 +++ arch/arm64/include/asm/cacheflush.h | 4 + arch/arm64/kernel/vmlinux.lds.S | 17 ++ arch/arm64/mm/init.c | 1 + arch/arm64/mm/mm.h | 2 + arch/arm64/mm/mmu.c | 303 +++++++++++++++++++++++++++++++----- 6 files changed, 314 insertions(+), 36 deletions(-)