Message ID | 20240503130147.1154804-18-joey.gouly@arm.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | arm64: Permission Overlay Extension | expand |
On 5/3/24 18:31, Joey Gouly wrote: > Implement the PKEYS interface, using the Permission Overlay Extension. > > Signed-off-by: Joey Gouly <joey.gouly@arm.com> > Cc: Catalin Marinas <catalin.marinas@arm.com> > Cc: Will Deacon <will@kernel.org> > --- > arch/arm64/include/asm/mmu.h | 1 + > arch/arm64/include/asm/mmu_context.h | 51 ++++++++++++- > arch/arm64/include/asm/pgtable.h | 22 +++++- > arch/arm64/include/asm/pkeys.h | 110 +++++++++++++++++++++++++++ > arch/arm64/include/asm/por.h | 33 ++++++++ > arch/arm64/mm/mmu.c | 40 ++++++++++ > 6 files changed, 255 insertions(+), 2 deletions(-) > create mode 100644 arch/arm64/include/asm/pkeys.h > create mode 100644 arch/arm64/include/asm/por.h > > diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h > index 65977c7783c5..983afeb4eba5 100644 > --- a/arch/arm64/include/asm/mmu.h > +++ b/arch/arm64/include/asm/mmu.h > @@ -25,6 +25,7 @@ typedef struct { > refcount_t pinned; > void *vdso; > unsigned long flags; > + u8 pkey_allocation_map; > } mm_context_t; > > /* > diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h > index c768d16b81a4..cb499db7a97b 100644 > --- a/arch/arm64/include/asm/mmu_context.h > +++ b/arch/arm64/include/asm/mmu_context.h > @@ -15,12 +15,12 @@ > #include <linux/sched/hotplug.h> > #include <linux/mm_types.h> > #include <linux/pgtable.h> > +#include <linux/pkeys.h> > > #include <asm/cacheflush.h> > #include <asm/cpufeature.h> > #include <asm/daifflags.h> > #include <asm/proc-fns.h> > -#include <asm-generic/mm_hooks.h> > #include <asm/cputype.h> > #include <asm/sysreg.h> > #include <asm/tlbflush.h> > @@ -175,9 +175,36 @@ init_new_context(struct task_struct *tsk, struct mm_struct *mm) > { > atomic64_set(&mm->context.id, 0); > refcount_set(&mm->context.pinned, 0); > + > + /* pkey 0 is the default, so always reserve it. 
*/ > + mm->context.pkey_allocation_map = 0x1; > + > + return 0; > +} > + > +static inline void arch_dup_pkeys(struct mm_struct *oldmm, > + struct mm_struct *mm) > +{ > + /* Duplicate the oldmm pkey state in mm: */ > + mm->context.pkey_allocation_map = oldmm->context.pkey_allocation_map; > +} > + > +static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) > +{ > + arch_dup_pkeys(oldmm, mm); > + > return 0; > } > > +static inline void arch_exit_mmap(struct mm_struct *mm) > +{ > +} > + > +static inline void arch_unmap(struct mm_struct *mm, > + unsigned long start, unsigned long end) > +{ > +} > + > #ifdef CONFIG_ARM64_SW_TTBR0_PAN > static inline void update_saved_ttbr0(struct task_struct *tsk, > struct mm_struct *mm) > @@ -267,6 +294,28 @@ static inline unsigned long mm_untag_mask(struct mm_struct *mm) > return -1UL >> 8; > } > > +/* > + * We only want to enforce protection keys on the current process > + * because we effectively have no access to POR_EL0 for other > + * processes or any way to tell *which * POR_EL0 in a threaded > + * process we could use. > + * > + * So do not enforce things if the VMA is not from the current > + * mm, or if we are in a kernel thread. > + */ > +static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, > + bool write, bool execute, bool foreign) > +{ > + if (!arch_pkeys_enabled()) > + return true; The above check can be dropped as the caller of this function fault_from_pkey() does the same check. 
Thanks, Amit > + > + /* allow access if the VMA is not one from this process */ > + if (foreign || vma_is_foreign(vma)) > + return true; > + > + return por_el0_allows_pkey(vma_pkey(vma), write, execute); > +} > + > #include <asm-generic/mmu_context.h> > > #endif /* !__ASSEMBLY__ */ > diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h > index 2449e4e27ea6..8ee68ff03016 100644 > --- a/arch/arm64/include/asm/pgtable.h > +++ b/arch/arm64/include/asm/pgtable.h > @@ -34,6 +34,7 @@ > > #include <asm/cmpxchg.h> > #include <asm/fixmap.h> > +#include <asm/por.h> > #include <linux/mmdebug.h> > #include <linux/mm_types.h> > #include <linux/sched.h> > @@ -153,6 +154,24 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) > #define pte_accessible(mm, pte) \ > (mm_tlb_flush_pending(mm) ? pte_present(pte) : pte_valid(pte)) > > +static inline bool por_el0_allows_pkey(u8 pkey, bool write, bool execute) > +{ > + u64 por; > + > + if (!system_supports_poe()) > + return true; > + > + por = read_sysreg_s(SYS_POR_EL0); > + > + if (write) > + return por_elx_allows_write(por, pkey); > + > + if (execute) > + return por_elx_allows_exec(por, pkey); > + > + return por_elx_allows_read(por, pkey); > +} > + > /* > * p??_access_permitted() is true for valid user mappings (PTE_USER > * bit set, subject to the write permission check). 
For execute-only > @@ -163,7 +182,8 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) > #define pte_access_permitted_no_overlay(pte, write) \ > (((pte_val(pte) & (PTE_VALID | PTE_USER)) == (PTE_VALID | PTE_USER)) && (!(write) || pte_write(pte))) > #define pte_access_permitted(pte, write) \ > - pte_access_permitted_no_overlay(pte, write) > + (pte_access_permitted_no_overlay(pte, write) && \ > + por_el0_allows_pkey(FIELD_GET(PTE_PO_IDX_MASK, pte_val(pte)), write, false)) > #define pmd_access_permitted(pmd, write) \ > (pte_access_permitted(pmd_pte(pmd), (write))) > #define pud_access_permitted(pud, write) \ > diff --git a/arch/arm64/include/asm/pkeys.h b/arch/arm64/include/asm/pkeys.h > new file mode 100644 > index 000000000000..a284508a4d02 > --- /dev/null > +++ b/arch/arm64/include/asm/pkeys.h > @@ -0,0 +1,110 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +/* > + * Copyright (C) 2023 Arm Ltd. > + * > + * Based on arch/x86/include/asm/pkeys.h > + */ > + > +#ifndef _ASM_ARM64_PKEYS_H > +#define _ASM_ARM64_PKEYS_H > + > +#define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2) > + > +#define arch_max_pkey() 7 > + > +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, > + unsigned long init_val); > + > +static inline bool arch_pkeys_enabled(void) > +{ > + return false; > +} > + > +static inline int vma_pkey(struct vm_area_struct *vma) > +{ > + return (vma->vm_flags & ARCH_VM_PKEY_FLAGS) >> VM_PKEY_SHIFT; > +} > + > +static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma, > + int prot, int pkey) > +{ > + if (pkey != -1) > + return pkey; > + > + return vma_pkey(vma); > +} > + > +static inline int execute_only_pkey(struct mm_struct *mm) > +{ > + // Execute-only mappings are handled by EPAN/FEAT_PAN3. 
> + WARN_ON_ONCE(!cpus_have_final_cap(ARM64_HAS_EPAN)); > + > + return -1; > +} > + > +#define mm_pkey_allocation_map(mm) (mm->context.pkey_allocation_map) > +#define mm_set_pkey_allocated(mm, pkey) do { \ > + mm_pkey_allocation_map(mm) |= (1U << pkey); \ > +} while (0) > +#define mm_set_pkey_free(mm, pkey) do { \ > + mm_pkey_allocation_map(mm) &= ~(1U << pkey); \ > +} while (0) > + > +static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey) > +{ > + /* > + * "Allocated" pkeys are those that have been returned > + * from pkey_alloc() or pkey 0 which is allocated > + * implicitly when the mm is created. > + */ > + if (pkey < 0) > + return false; > + if (pkey >= arch_max_pkey()) > + return false; > + > + return mm_pkey_allocation_map(mm) & (1U << pkey); > +} > + > +/* > + * Returns a positive, 3-bit key on success, or -1 on failure. > + */ > +static inline int mm_pkey_alloc(struct mm_struct *mm) > +{ > + /* > + * Note: this is the one and only place we make sure > + * that the pkey is valid as far as the hardware is > + * concerned. The rest of the kernel trusts that > + * only good, valid pkeys come out of here. > + */ > + u8 all_pkeys_mask = ((1U << arch_max_pkey()) - 1); > + int ret; > + > + if (!arch_pkeys_enabled()) > + return -1; > + > + /* > + * Are we out of pkeys? We must handle this specially > + * because ffz() behavior is undefined if there are no > + * zeros. 
> + */ > + if (mm_pkey_allocation_map(mm) == all_pkeys_mask) > + return -1; > + > + ret = ffz(mm_pkey_allocation_map(mm)); > + > + mm_set_pkey_allocated(mm, ret); > + > + return ret; > +} > + > +static inline int mm_pkey_free(struct mm_struct *mm, int pkey) > +{ > + if (!mm_pkey_is_allocated(mm, pkey)) > + return -EINVAL; > + > + mm_set_pkey_free(mm, pkey); > + > + return 0; > +} > + > +#endif /* _ASM_ARM64_PKEYS_H */ > diff --git a/arch/arm64/include/asm/por.h b/arch/arm64/include/asm/por.h > new file mode 100644 > index 000000000000..d6604e0c5c54 > --- /dev/null > +++ b/arch/arm64/include/asm/por.h > @@ -0,0 +1,33 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +/* > + * Copyright (C) 2023 Arm Ltd. > + */ > + > +#ifndef _ASM_ARM64_POR_H > +#define _ASM_ARM64_POR_H > + > +#define POR_BITS_PER_PKEY 4 > +#define POR_ELx_IDX(por_elx, idx) (((por_elx) >> (idx * POR_BITS_PER_PKEY)) & 0xf) > + > +static inline bool por_elx_allows_read(u64 por, u8 pkey) > +{ > + u8 perm = POR_ELx_IDX(por, pkey); > + > + return perm & POE_R; > +} > + > +static inline bool por_elx_allows_write(u64 por, u8 pkey) > +{ > + u8 perm = POR_ELx_IDX(por, pkey); > + > + return perm & POE_W; > +} > + > +static inline bool por_elx_allows_exec(u64 por, u8 pkey) > +{ > + u8 perm = POR_ELx_IDX(por, pkey); > + > + return perm & POE_X; > +} > + > +#endif /* _ASM_ARM64_POR_H */ > diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c > index 495b732d5af3..e50ccc86d150 100644 > --- a/arch/arm64/mm/mmu.c > +++ b/arch/arm64/mm/mmu.c > @@ -25,6 +25,7 @@ > #include <linux/vmalloc.h> > #include <linux/set_memory.h> > #include <linux/kfence.h> > +#include <linux/pkeys.h> > > #include <asm/barrier.h> > #include <asm/cputype.h> > @@ -1535,3 +1536,42 @@ void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp) > > cpu_uninstall_idmap(); > } > + > +#ifdef CONFIG_ARCH_HAS_PKEYS > +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val) > +{ > + u64 new_por = POE_RXW; > + u64 old_por; > + u64 
pkey_shift; > + > + if (!arch_pkeys_enabled()) > + return -ENOSPC; > + > + /* > + * This code should only be called with valid 'pkey' > + * values originating from in-kernel users. Complain > + * if a bad value is observed. > + */ > + if (WARN_ON_ONCE(pkey >= arch_max_pkey())) > + return -EINVAL; > + > + /* Set the bits we need in POR: */ > + if (init_val & PKEY_DISABLE_ACCESS) > + new_por = POE_X; > + else if (init_val & PKEY_DISABLE_WRITE) > + new_por = POE_RX; > + > + /* Shift the bits in to the correct place in POR for pkey: */ > + pkey_shift = pkey * POR_BITS_PER_PKEY; > + new_por <<= pkey_shift; > + > + /* Get old POR and mask off any old bits in place: */ > + old_por = read_sysreg_s(SYS_POR_EL0); > + old_por &= ~(POE_MASK << pkey_shift); > + > + /* Write old part along with new part: */ > + write_sysreg_s(old_por | new_por, SYS_POR_EL0); > + > + return 0; > +} > +#endif
Hi Amit, Thanks for taking a look! On Tue, May 28, 2024 at 12:25:58PM +0530, Amit Daniel Kachhap wrote: > > > On 5/3/24 18:31, Joey Gouly wrote: > > Implement the PKEYS interface, using the Permission Overlay Extension. > > > > Signed-off-by: Joey Gouly <joey.gouly@arm.com> > > Cc: Catalin Marinas <catalin.marinas@arm.com> > > Cc: Will Deacon <will@kernel.org> > > --- > > arch/arm64/include/asm/mmu.h | 1 + > > arch/arm64/include/asm/mmu_context.h | 51 ++++++++++++- > > arch/arm64/include/asm/pgtable.h | 22 +++++- > > arch/arm64/include/asm/pkeys.h | 110 +++++++++++++++++++++++++++ > > arch/arm64/include/asm/por.h | 33 ++++++++ > > arch/arm64/mm/mmu.c | 40 ++++++++++ > > 6 files changed, 255 insertions(+), 2 deletions(-) > > create mode 100644 arch/arm64/include/asm/pkeys.h > > create mode 100644 arch/arm64/include/asm/por.h > > > > diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h > > index 65977c7783c5..983afeb4eba5 100644 > > --- a/arch/arm64/include/asm/mmu.h > > +++ b/arch/arm64/include/asm/mmu.h > > @@ -25,6 +25,7 @@ typedef struct { > > refcount_t pinned; > > void *vdso; > > unsigned long flags; > > + u8 pkey_allocation_map; > > } mm_context_t; > > /* > > diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h > > index c768d16b81a4..cb499db7a97b 100644 > > --- a/arch/arm64/include/asm/mmu_context.h > > +++ b/arch/arm64/include/asm/mmu_context.h > > @@ -15,12 +15,12 @@ > > #include <linux/sched/hotplug.h> > > #include <linux/mm_types.h> > > #include <linux/pgtable.h> > > +#include <linux/pkeys.h> > > #include <asm/cacheflush.h> > > #include <asm/cpufeature.h> > > #include <asm/daifflags.h> > > #include <asm/proc-fns.h> > > -#include <asm-generic/mm_hooks.h> > > #include <asm/cputype.h> > > #include <asm/sysreg.h> > > #include <asm/tlbflush.h> > > @@ -175,9 +175,36 @@ init_new_context(struct task_struct *tsk, struct mm_struct *mm) > > { > > atomic64_set(&mm->context.id, 0); > > 
refcount_set(&mm->context.pinned, 0); > > + > > + /* pkey 0 is the default, so always reserve it. */ > > + mm->context.pkey_allocation_map = 0x1; > > + > > + return 0; > > +} > > + > > +static inline void arch_dup_pkeys(struct mm_struct *oldmm, > > + struct mm_struct *mm) > > +{ > > + /* Duplicate the oldmm pkey state in mm: */ > > + mm->context.pkey_allocation_map = oldmm->context.pkey_allocation_map; > > +} > > + > > +static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) > > +{ > > + arch_dup_pkeys(oldmm, mm); > > + > > return 0; > > } > > +static inline void arch_exit_mmap(struct mm_struct *mm) > > +{ > > +} > > + > > +static inline void arch_unmap(struct mm_struct *mm, > > + unsigned long start, unsigned long end) > > +{ > > +} > > + > > #ifdef CONFIG_ARM64_SW_TTBR0_PAN > > static inline void update_saved_ttbr0(struct task_struct *tsk, > > struct mm_struct *mm) > > @@ -267,6 +294,28 @@ static inline unsigned long mm_untag_mask(struct mm_struct *mm) > > return -1UL >> 8; > > } > > +/* > > + * We only want to enforce protection keys on the current process > > + * because we effectively have no access to POR_EL0 for other > > + * processes or any way to tell *which * POR_EL0 in a threaded > > + * process we could use. > > + * > > + * So do not enforce things if the VMA is not from the current > > + * mm, or if we are in a kernel thread. > > + */ > > +static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, > > + bool write, bool execute, bool foreign) > > +{ > > + if (!arch_pkeys_enabled()) > > + return true; > > The above check can be dropped as the caller of this function > fault_from_pkey() does the same check. arch_vma_access_permitted() is called by other places in the kernel, so I need to leave that check in. 
Thanks, Joey > > Thanks, > Amit > > > + > > + /* allow access if the VMA is not one from this process */ > > + if (foreign || vma_is_foreign(vma)) > > + return true; > > + > > + return por_el0_allows_pkey(vma_pkey(vma), write, execute); > > +} > > + > > #include <asm-generic/mmu_context.h> > > #endif /* !__ASSEMBLY__ */ > > diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h > > index 2449e4e27ea6..8ee68ff03016 100644 > > --- a/arch/arm64/include/asm/pgtable.h > > +++ b/arch/arm64/include/asm/pgtable.h > > @@ -34,6 +34,7 @@ > > #include <asm/cmpxchg.h> > > #include <asm/fixmap.h> > > +#include <asm/por.h> > > #include <linux/mmdebug.h> > > #include <linux/mm_types.h> > > #include <linux/sched.h> > > @@ -153,6 +154,24 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) > > #define pte_accessible(mm, pte) \ > > (mm_tlb_flush_pending(mm) ? pte_present(pte) : pte_valid(pte)) > > +static inline bool por_el0_allows_pkey(u8 pkey, bool write, bool execute) > > +{ > > + u64 por; > > + > > + if (!system_supports_poe()) > > + return true; > > + > > + por = read_sysreg_s(SYS_POR_EL0); > > + > > + if (write) > > + return por_elx_allows_write(por, pkey); > > + > > + if (execute) > > + return por_elx_allows_exec(por, pkey); > > + > > + return por_elx_allows_read(por, pkey); > > +} > > + > > /* > > * p??_access_permitted() is true for valid user mappings (PTE_USER > > * bit set, subject to the write permission check). 
For execute-only > > @@ -163,7 +182,8 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) > > #define pte_access_permitted_no_overlay(pte, write) \ > > (((pte_val(pte) & (PTE_VALID | PTE_USER)) == (PTE_VALID | PTE_USER)) && (!(write) || pte_write(pte))) > > #define pte_access_permitted(pte, write) \ > > - pte_access_permitted_no_overlay(pte, write) > > + (pte_access_permitted_no_overlay(pte, write) && \ > > + por_el0_allows_pkey(FIELD_GET(PTE_PO_IDX_MASK, pte_val(pte)), write, false)) > > #define pmd_access_permitted(pmd, write) \ > > (pte_access_permitted(pmd_pte(pmd), (write))) > > #define pud_access_permitted(pud, write) \ > > diff --git a/arch/arm64/include/asm/pkeys.h b/arch/arm64/include/asm/pkeys.h > > new file mode 100644 > > index 000000000000..a284508a4d02 > > --- /dev/null > > +++ b/arch/arm64/include/asm/pkeys.h > > @@ -0,0 +1,110 @@ > > +/* SPDX-License-Identifier: GPL-2.0 */ > > +/* > > + * Copyright (C) 2023 Arm Ltd. > > + * > > + * Based on arch/x86/include/asm/pkeys.h > > + */ > > + > > +#ifndef _ASM_ARM64_PKEYS_H > > +#define _ASM_ARM64_PKEYS_H > > + > > +#define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2) > > + > > +#define arch_max_pkey() 7 > > + > > +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, > > + unsigned long init_val); > > + > > +static inline bool arch_pkeys_enabled(void) > > +{ > > + return false; > > +} > > + > > +static inline int vma_pkey(struct vm_area_struct *vma) > > +{ > > + return (vma->vm_flags & ARCH_VM_PKEY_FLAGS) >> VM_PKEY_SHIFT; > > +} > > + > > +static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma, > > + int prot, int pkey) > > +{ > > + if (pkey != -1) > > + return pkey; > > + > > + return vma_pkey(vma); > > +} > > + > > +static inline int execute_only_pkey(struct mm_struct *mm) > > +{ > > + // Execute-only mappings are handled by EPAN/FEAT_PAN3. 
> > + WARN_ON_ONCE(!cpus_have_final_cap(ARM64_HAS_EPAN)); > > + > > + return -1; > > +} > > + > > +#define mm_pkey_allocation_map(mm) (mm->context.pkey_allocation_map) > > +#define mm_set_pkey_allocated(mm, pkey) do { \ > > + mm_pkey_allocation_map(mm) |= (1U << pkey); \ > > +} while (0) > > +#define mm_set_pkey_free(mm, pkey) do { \ > > + mm_pkey_allocation_map(mm) &= ~(1U << pkey); \ > > +} while (0) > > + > > +static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey) > > +{ > > + /* > > + * "Allocated" pkeys are those that have been returned > > + * from pkey_alloc() or pkey 0 which is allocated > > + * implicitly when the mm is created. > > + */ > > + if (pkey < 0) > > + return false; > > + if (pkey >= arch_max_pkey()) > > + return false; > > + > > + return mm_pkey_allocation_map(mm) & (1U << pkey); > > +} > > + > > +/* > > + * Returns a positive, 3-bit key on success, or -1 on failure. > > + */ > > +static inline int mm_pkey_alloc(struct mm_struct *mm) > > +{ > > + /* > > + * Note: this is the one and only place we make sure > > + * that the pkey is valid as far as the hardware is > > + * concerned. The rest of the kernel trusts that > > + * only good, valid pkeys come out of here. > > + */ > > + u8 all_pkeys_mask = ((1U << arch_max_pkey()) - 1); > > + int ret; > > + > > + if (!arch_pkeys_enabled()) > > + return -1; > > + > > + /* > > + * Are we out of pkeys? We must handle this specially > > + * because ffz() behavior is undefined if there are no > > + * zeros. 
> > + */ > > + if (mm_pkey_allocation_map(mm) == all_pkeys_mask) > > + return -1; > > + > > + ret = ffz(mm_pkey_allocation_map(mm)); > > + > > + mm_set_pkey_allocated(mm, ret); > > + > > + return ret; > > +} > > + > > +static inline int mm_pkey_free(struct mm_struct *mm, int pkey) > > +{ > > + if (!mm_pkey_is_allocated(mm, pkey)) > > + return -EINVAL; > > + > > + mm_set_pkey_free(mm, pkey); > > + > > + return 0; > > +} > > + > > +#endif /* _ASM_ARM64_PKEYS_H */ > > diff --git a/arch/arm64/include/asm/por.h b/arch/arm64/include/asm/por.h > > new file mode 100644 > > index 000000000000..d6604e0c5c54 > > --- /dev/null > > +++ b/arch/arm64/include/asm/por.h > > @@ -0,0 +1,33 @@ > > +/* SPDX-License-Identifier: GPL-2.0 */ > > +/* > > + * Copyright (C) 2023 Arm Ltd. > > + */ > > + > > +#ifndef _ASM_ARM64_POR_H > > +#define _ASM_ARM64_POR_H > > + > > +#define POR_BITS_PER_PKEY 4 > > +#define POR_ELx_IDX(por_elx, idx) (((por_elx) >> (idx * POR_BITS_PER_PKEY)) & 0xf) > > + > > +static inline bool por_elx_allows_read(u64 por, u8 pkey) > > +{ > > + u8 perm = POR_ELx_IDX(por, pkey); > > + > > + return perm & POE_R; > > +} > > + > > +static inline bool por_elx_allows_write(u64 por, u8 pkey) > > +{ > > + u8 perm = POR_ELx_IDX(por, pkey); > > + > > + return perm & POE_W; > > +} > > + > > +static inline bool por_elx_allows_exec(u64 por, u8 pkey) > > +{ > > + u8 perm = POR_ELx_IDX(por, pkey); > > + > > + return perm & POE_X; > > +} > > + > > +#endif /* _ASM_ARM64_POR_H */ > > diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c > > index 495b732d5af3..e50ccc86d150 100644 > > --- a/arch/arm64/mm/mmu.c > > +++ b/arch/arm64/mm/mmu.c > > @@ -25,6 +25,7 @@ > > #include <linux/vmalloc.h> > > #include <linux/set_memory.h> > > #include <linux/kfence.h> > > +#include <linux/pkeys.h> > > #include <asm/barrier.h> > > #include <asm/cputype.h> > > @@ -1535,3 +1536,42 @@ void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp) > > cpu_uninstall_idmap(); > > } > > + > > +#ifdef CONFIG_ARCH_HAS_PKEYS > > 
+int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val) > > +{ > > + u64 new_por = POE_RXW; > > + u64 old_por; > > + u64 pkey_shift; > > + > > + if (!arch_pkeys_enabled()) > > + return -ENOSPC; > > + > > + /* > > + * This code should only be called with valid 'pkey' > > + * values originating from in-kernel users. Complain > > + * if a bad value is observed. > > + */ > > + if (WARN_ON_ONCE(pkey >= arch_max_pkey())) > > + return -EINVAL; > > + > > + /* Set the bits we need in POR: */ > > + if (init_val & PKEY_DISABLE_ACCESS) > > + new_por = POE_X; > > + else if (init_val & PKEY_DISABLE_WRITE) > > + new_por = POE_RX; > > + > > + /* Shift the bits in to the correct place in POR for pkey: */ > > + pkey_shift = pkey * POR_BITS_PER_PKEY; > > + new_por <<= pkey_shift; > > + > > + /* Get old POR and mask off any old bits in place: */ > > + old_por = read_sysreg_s(SYS_POR_EL0); > > + old_por &= ~(POE_MASK << pkey_shift); > > + > > + /* Write old part along with new part: */ > > + write_sysreg_s(old_por | new_por, SYS_POR_EL0); > > + > > + return 0; > > +} > > +#endif
The 05/03/2024 14:01, Joey Gouly wrote: > Implement the PKEYS interface, using the Permission Overlay Extension. ... > +#ifdef CONFIG_ARCH_HAS_PKEYS > +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val) > +{ > + u64 new_por = POE_RXW; > + u64 old_por; > + u64 pkey_shift; > + > + if (!arch_pkeys_enabled()) > + return -ENOSPC; > + > + /* > + * This code should only be called with valid 'pkey' > + * values originating from in-kernel users. Complain > + * if a bad value is observed. > + */ > + if (WARN_ON_ONCE(pkey >= arch_max_pkey())) > + return -EINVAL; > + > + /* Set the bits we need in POR: */ > + if (init_val & PKEY_DISABLE_ACCESS) > + new_por = POE_X; > + else if (init_val & PKEY_DISABLE_WRITE) > + new_por = POE_RX; > + given that the architecture allows r,w,x permissions to be set independently, should we have a 'PKEY_DISABLE_EXEC' or similar api flag? (on other targets it can be some invalid value that fails) > + /* Shift the bits in to the correct place in POR for pkey: */ > + pkey_shift = pkey * POR_BITS_PER_PKEY; > + new_por <<= pkey_shift; > + > + /* Get old POR and mask off any old bits in place: */ > + old_por = read_sysreg_s(SYS_POR_EL0); > + old_por &= ~(POE_MASK << pkey_shift); > + > + /* Write old part along with new part: */ > + write_sysreg_s(old_por | new_por, SYS_POR_EL0); > + > + return 0; > +} > +#endif > -- > 2.25.1 >
Hi Szabolcs, On Fri, May 31, 2024 at 03:57:07PM +0100, Szabolcs Nagy wrote: > The 05/03/2024 14:01, Joey Gouly wrote: > > Implement the PKEYS interface, using the Permission Overlay Extension. > ... > > +#ifdef CONFIG_ARCH_HAS_PKEYS > > +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val) > > +{ > > + u64 new_por = POE_RXW; > > + u64 old_por; > > + u64 pkey_shift; > > + > > + if (!arch_pkeys_enabled()) > > + return -ENOSPC; > > + > > + /* > > + * This code should only be called with valid 'pkey' > > + * values originating from in-kernel users. Complain > > + * if a bad value is observed. > > + */ > > + if (WARN_ON_ONCE(pkey >= arch_max_pkey())) > > + return -EINVAL; > > + > > + /* Set the bits we need in POR: */ > > + if (init_val & PKEY_DISABLE_ACCESS) > > + new_por = POE_X; > > + else if (init_val & PKEY_DISABLE_WRITE) > > + new_por = POE_RX; > > + > > given that the architecture allows r,w,x permissions to be > set independently, should we have a 'PKEY_DISABLE_EXEC' or > similar api flag? > > (on other targets it can be some invalid value that fails) I didn't think about the best way to do that yet. PowerPC has a PKEY_DISABLE_EXECUTE. We could either make that generic, and X86 has to error if it sees that bit, or we add a arch-specific PKEY_DISABLE_EXECUTE like PowerPC. A user can still set it by interacting with the register directly, but I guess we want something for the glibc interface.. Dave, any thoughts here? > > > + /* Shift the bits in to the correct place in POR for pkey: */ > > + pkey_shift = pkey * POR_BITS_PER_PKEY; > > + new_por <<= pkey_shift; > > + > > + /* Get old POR and mask off any old bits in place: */ > > + old_por = read_sysreg_s(SYS_POR_EL0); > > + old_por &= ~(POE_MASK << pkey_shift); > > + > > + /* Write old part along with new part: */ > > + write_sysreg_s(old_por | new_por, SYS_POR_EL0); > > + > > + return 0; > > +} > > +#endif Thanks, Joey
The 05/31/2024 16:21, Joey Gouly wrote: > Hi Szabolcs, > > On Fri, May 31, 2024 at 03:57:07PM +0100, Szabolcs Nagy wrote: > > The 05/03/2024 14:01, Joey Gouly wrote: > > > Implement the PKEYS interface, using the Permission Overlay Extension. > > ... > > > +#ifdef CONFIG_ARCH_HAS_PKEYS > > > +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val) > > > +{ > > > + u64 new_por = POE_RXW; > > > + u64 old_por; > > > + u64 pkey_shift; > > > + > > > + if (!arch_pkeys_enabled()) > > > + return -ENOSPC; > > > + > > > + /* > > > + * This code should only be called with valid 'pkey' > > > + * values originating from in-kernel users. Complain > > > + * if a bad value is observed. > > > + */ > > > + if (WARN_ON_ONCE(pkey >= arch_max_pkey())) > > > + return -EINVAL; > > > + > > > + /* Set the bits we need in POR: */ > > > + if (init_val & PKEY_DISABLE_ACCESS) > > > + new_por = POE_X; > > > + else if (init_val & PKEY_DISABLE_WRITE) > > > + new_por = POE_RX; > > > + > > > > given that the architecture allows r,w,x permissions to be > > set independently, should we have a 'PKEY_DISABLE_EXEC' or > > similar api flag? > > > > (on other targets it can be some invalid value that fails) > > I didn't think about the best way to do that yet. PowerPC has a PKEY_DISABLE_EXECUTE. > > We could either make that generic, and X86 has to error if it sees that bit, or > we add a arch-specific PKEY_DISABLE_EXECUTE like PowerPC. this does not seem to be in glibc yet. (or in linux man pages) i guess you can copy whatever ppc does. > > A user can still set it by interacting with the register directly, but I guess > we want something for the glibc interface.. > > Dave, any thoughts here? adding Florian too, since i found an old thread of his that tried to add separate PKEY_DISABLE_READ and PKEY_DISABLE_EXECUTE, but it did not seem to end up upstream. (this makes more sense to me as libc api than the weird disable access semantics)
* Szabolcs Nagy: >> A user can still set it by interacting with the register directly, but I guess >> we want something for the glibc interface.. >> >> Dave, any thoughts here? > > adding Florian too, since i found an old thread of his that tried > to add separate PKEY_DISABLE_READ and PKEY_DISABLE_EXECUTE, but > it did not seem to end up upstream. (this makes more sense to me > as libc api than the weird disable access semantics) I still think it makes sense to have a full complement of PKEY_* flags complementing the PROT_* flags, in a somewhat abstract fashion for pkey_alloc only. The internal protection mask register encoding will differ from architecture to architecture, but the abstract glibc functions pkey_set and pkey_get could use them (if we are a bit careful). Thanks, Florian
The 06/17/2024 15:40, Florian Weimer wrote: > >> A user can still set it by interacting with the register directly, but I guess > >> we want something for the glibc interface.. > >> > >> Dave, any thoughts here? > > > > adding Florian too, since i found an old thread of his that tried > > to add separate PKEY_DISABLE_READ and PKEY_DISABLE_EXECUTE, but > > it did not seem to end up upstream. (this makes more sense to me > > as libc api than the weird disable access semantics) > > I still think it makes sense to have a full complenent of PKEY_* flags > complementing the PROT_* flags, in a somewhat abstract fashion for > pkey_alloc only. The internal protection mask register encoding will > differ from architecture to architecture, but the abstract glibc > functions pkey_set and pkey_get could use them (if we are a bit > careful). to me it makes sense to have abstract PKEY_DISABLE_READ PKEY_DISABLE_WRITE PKEY_DISABLE_EXECUTE PKEY_DISABLE_ACCESS where access is handled like if (flags&PKEY_DISABLE_ACCESS) flags |= PKEY_DISABLE_READ|PKEY_DISABLE_WRITE; disable_read = flags&PKEY_DISABLE_READ; disable_write = flags&PKEY_DISABLE_WRITE; disable_exec = flags&PKEY_DISABLE_EXECUTE; if there are unsupported combinations like disable_read&&!disable_write then those are rejected by pkey_alloc and pkey_set. this allows portable use of pkey apis. (the flags could be target specific, but don't have to be)
On Fri, May 03, 2024 at 02:01:35PM +0100, Joey Gouly wrote: > @@ -163,7 +182,8 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) > #define pte_access_permitted_no_overlay(pte, write) \ > (((pte_val(pte) & (PTE_VALID | PTE_USER)) == (PTE_VALID | PTE_USER)) && (!(write) || pte_write(pte))) > #define pte_access_permitted(pte, write) \ > - pte_access_permitted_no_overlay(pte, write) > + (pte_access_permitted_no_overlay(pte, write) && \ > + por_el0_allows_pkey(FIELD_GET(PTE_PO_IDX_MASK, pte_val(pte)), write, false)) I'm still not entirely convinced on checking the keys during fast GUP but that's what x86 and powerpc do already, so I guess we'll follow the same ABI. Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Hi Szabolcs, On Mon, Jun 17, 2024 at 03:51:35PM +0100, Szabolcs Nagy wrote: > The 06/17/2024 15:40, Florian Weimer wrote: > > >> A user can still set it by interacting with the register directly, but I guess > > >> we want something for the glibc interface.. > > >> > > >> Dave, any thoughts here? > > > > > > adding Florian too, since i found an old thread of his that tried > > > to add separate PKEY_DISABLE_READ and PKEY_DISABLE_EXECUTE, but > > > it did not seem to end up upstream. (this makes more sense to me > > > as libc api than the weird disable access semantics) > > > > I still think it makes sense to have a full complenent of PKEY_* flags > > complementing the PROT_* flags, in a somewhat abstract fashion for > > pkey_alloc only. The internal protection mask register encoding will > > differ from architecture to architecture, but the abstract glibc > > functions pkey_set and pkey_get could use them (if we are a bit > > careful). > > to me it makes sense to have abstract > > PKEY_DISABLE_READ > PKEY_DISABLE_WRITE > PKEY_DISABLE_EXECUTE > PKEY_DISABLE_ACCESS > > where access is handled like > > if (flags&PKEY_DISABLE_ACCESS) > flags |= PKEY_DISABLE_READ|PKEY_DISABLE_WRITE; > disable_read = flags&PKEY_DISABLE_READ; > disable_write = flags&PKEY_DISABLE_WRITE; > disable_exec = flags&PKEY_DISABLE_EXECUTE; > > if there are unsupported combinations like > disable_read&&!disable_write then those are rejected > by pkey_alloc and pkey_set. > > this allows portable use of pkey apis. > (the flags could be target specific, but don't have to be) On powerpc, PKEY_DISABLE_ACCESS also disables execution. AFAICT, the kernel doesn't define a PKEY_DISABLE_READ, only PKEY_DISABLE_ACCESS so for powerpc there's no way to to set an execute-only permission via this interface. I wouldn't like to diverge from powerpc. However, does it matter much? That's only for the initial setup, the user can then change the permissions directly via the sysreg. 
So maybe we don't need all those combinations upfront. A PKEY_DISABLE_EXECUTE together with the full PKEY_DISABLE_ACCESS would probably suffice. Given that on x86 the PKEY_ACCESS_MASK will have to stay as PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE, we'll probably do the same as powerpc and define an arm64 specific PKEY_DISABLE_EXECUTE with the corresponding PKEY_ACCESS_MASK including it. We can generalise the masks with some ARCH_HAS_PKEY_DISABLE_EXECUTE but it's probably more hassle than just defining the arm64 PKEY_DISABLE_EXECUTE. I assume you'd like PKEY_DISABLE_EXECUTE to be part of this series, otherwise changing PKEY_ACCESS_MASK later will cause potential ABI issues.
The 07/08/2024 18:53, Catalin Marinas wrote: > Hi Szabolcs, > > On Mon, Jun 17, 2024 at 03:51:35PM +0100, Szabolcs Nagy wrote: > > The 06/17/2024 15:40, Florian Weimer wrote: > > > >> A user can still set it by interacting with the register directly, but I guess > > > >> we want something for the glibc interface.. > > > >> > > > >> Dave, any thoughts here? > > > > > > > > adding Florian too, since i found an old thread of his that tried > > > > to add separate PKEY_DISABLE_READ and PKEY_DISABLE_EXECUTE, but > > > > it did not seem to end up upstream. (this makes more sense to me > > > > as libc api than the weird disable access semantics) > > > > > > I still think it makes sense to have a full complenent of PKEY_* flags > > > complementing the PROT_* flags, in a somewhat abstract fashion for > > > pkey_alloc only. The internal protection mask register encoding will > > > differ from architecture to architecture, but the abstract glibc > > > functions pkey_set and pkey_get could use them (if we are a bit > > > careful). > > > > to me it makes sense to have abstract > > > > PKEY_DISABLE_READ > > PKEY_DISABLE_WRITE > > PKEY_DISABLE_EXECUTE > > PKEY_DISABLE_ACCESS > > > > where access is handled like > > > > if (flags&PKEY_DISABLE_ACCESS) > > flags |= PKEY_DISABLE_READ|PKEY_DISABLE_WRITE; > > disable_read = flags&PKEY_DISABLE_READ; > > disable_write = flags&PKEY_DISABLE_WRITE; > > disable_exec = flags&PKEY_DISABLE_EXECUTE; > > > > if there are unsupported combinations like > > disable_read&&!disable_write then those are rejected > > by pkey_alloc and pkey_set. > > > > this allows portable use of pkey apis. > > (the flags could be target specific, but don't have to be) > > On powerpc, PKEY_DISABLE_ACCESS also disables execution. AFAICT, the > kernel doesn't define a PKEY_DISABLE_READ, only PKEY_DISABLE_ACCESS so > for powerpc there's no way to to set an execute-only permission via this > interface. I wouldn't like to diverge from powerpc. 
the exec permission should be documented in the man. and i think it should be consistent across targets to allow portable use. now ppc and x86 are inconsistent, i think it's not ideal, but ok to say that targets without disable-exec support do whatever x86 does with PKEY_DISABLE_ACCESS otherwise it means whatever ppc does. > > However, does it matter much? That's only for the initial setup, the > user can then change the permissions directly via the sysreg. So maybe > we don't need all those combinations upfront. A PKEY_DISABLE_EXECUTE > together with the full PKEY_DISABLE_ACCESS would probably suffice. this is ok. a bit awkward in userspace when the register is directly set to e.g write-only and pkey_get has to return something, but we can handle settings outside of valid PKEY_* macros as unspec, users who want that would use their own register set/get code. i would have designed the permission to use either existing PROT_* flags or say that it is architectural and written to the register directly and let the libc wrapper deal with portable api, i guess it's too late now. (the signal handling behaviour should have a control and it is possible to fix e.g. via pkey_alloc flags, but that may not be the best solution and this can be done later.) > > Give that on x86 the PKEY_ACCESS_MASK will have to stay as > PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE, we'll probably do the same as > powerpc and define an arm64 specific PKEY_DISABLE_EXECUTE with the > corresponding PKEY_ACCESS_MASK including it. We can generalise the masks > with some ARCH_HAS_PKEY_DISABLE_EXECUTE but it's probably more hassle > than just defining the arm64 PKEY_DISABLE_EXECUTE. > > I assume you'd like PKEY_DISABLE_EXECUTE to be part of this series, > otherwise changing PKEY_ACCESS_MASK later will cause potential ABI > issues. yes i think we should figure this out in the initial support.
* Szabolcs Nagy: >> However, does it matter much? That's only for the initial setup, the >> user can then change the permissions directly via the sysreg. So maybe >> we don't need all those combinations upfront. A PKEY_DISABLE_EXECUTE >> together with the full PKEY_DISABLE_ACCESS would probably suffice. > > this is ok. > > a bit awkward in userspace when the register is directly > set to e.g write-only and pkey_get has to return something, > but we can handle settings outside of valid PKEY_* macros > as unspec, users who want that would use their own register > set/get code. > > i would have designed the permission to use either existing > PROT_* flags or say that it is architectural and written to > the register directly and let the libc wrapper deal with > portable api, i guess it's too late now. We can still define a portable API if we get a few more PKEY_* bits. The last attempt stalled because the kernel does not really need them, it would be for userspace benefit only. For performance-critical code, pkey_get/pkey_set are already too slow, so adding a bit more bit twiddling to it wouldn't be a problem, I think. Applications that want to change protection key bits around a very short code sequence will have to write the architecture-specific register. > (the signal handling behaviour should have a control and it > is possible to fix e.g. via pkey_alloc flags, but that may > not be the best solution and this can be done later.) For glibc, the POWER behavior is much more useful. Thanks, Florian
On 03/05/2024 15:01, Joey Gouly wrote: > @@ -267,6 +294,28 @@ static inline unsigned long mm_untag_mask(struct mm_struct *mm) > return -1UL >> 8; > } > > +/* > + * We only want to enforce protection keys on the current process > + * because we effectively have no access to POR_EL0 for other > + * processes or any way to tell *which * POR_EL0 in a threaded > + * process we could use. I see that this comment is essentially copied from x86, but to me it misses the main point. Even with only one thread in the target process and a way to obtain its POR_EL0, it still wouldn't make sense to check that value. If we take the case of a debugger accessing an inferior via ptrace(), for instance, the kernel is asked to access some memory in another mm. However, the debugger's POR_EL0 is tied to its own address space, and the target's POR_EL0 is relevant to its own execution flow only. In such situations, there is essentially no user context for the access, so it fundamentally does not make sense to make checks based on pkey/POE or similar restrictions to memory accesses (e.g. MTE). Kevin > + * > + * So do not enforce things if the VMA is not from the current > + * mm, or if we are in a kernel thread. > + */ > +static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, > + bool write, bool execute, bool foreign) > +{ > + if (!arch_pkeys_enabled()) > + return true; > + > + /* allow access if the VMA is not one from this process */ > + if (foreign || vma_is_foreign(vma)) > + return true; > + > + return por_el0_allows_pkey(vma_pkey(vma), write, execute); > +} > +
On Mon, Jul 08, 2024 at 06:53:18PM +0100, Catalin Marinas wrote: > Hi Szabolcs, > > On Mon, Jun 17, 2024 at 03:51:35PM +0100, Szabolcs Nagy wrote: > > The 06/17/2024 15:40, Florian Weimer wrote: > > > >> A user can still set it by interacting with the register directly, but I guess > > > >> we want something for the glibc interface.. > > > >> > > > >> Dave, any thoughts here? > > > > > > > > adding Florian too, since i found an old thread of his that tried > > > > to add separate PKEY_DISABLE_READ and PKEY_DISABLE_EXECUTE, but > > > > it did not seem to end up upstream. (this makes more sense to me > > > > as libc api than the weird disable access semantics) > > > > > > I still think it makes sense to have a full complenent of PKEY_* flags > > > complementing the PROT_* flags, in a somewhat abstract fashion for > > > pkey_alloc only. The internal protection mask register encoding will > > > differ from architecture to architecture, but the abstract glibc > > > functions pkey_set and pkey_get could use them (if we are a bit > > > careful). > > > > to me it makes sense to have abstract > > > > PKEY_DISABLE_READ > > PKEY_DISABLE_WRITE > > PKEY_DISABLE_EXECUTE > > PKEY_DISABLE_ACCESS > > > > where access is handled like > > > > if (flags&PKEY_DISABLE_ACCESS) > > flags |= PKEY_DISABLE_READ|PKEY_DISABLE_WRITE; > > disable_read = flags&PKEY_DISABLE_READ; > > disable_write = flags&PKEY_DISABLE_WRITE; > > disable_exec = flags&PKEY_DISABLE_EXECUTE; > > > > if there are unsupported combinations like > > disable_read&&!disable_write then those are rejected > > by pkey_alloc and pkey_set. > > > > this allows portable use of pkey apis. > > (the flags could be target specific, but don't have to be) > > On powerpc, PKEY_DISABLE_ACCESS also disables execution. AFAICT, the > kernel doesn't define a PKEY_DISABLE_READ, only PKEY_DISABLE_ACCESS so > for powerpc there's no way to to set an execute-only permission via this > interface. I wouldn't like to diverge from powerpc. 
I think this is wrong, look at this code from powerpc: arch/powerpc/mm/book3s64/pkeys.c: __arch_set_user_pkey_access if (init_val & PKEY_DISABLE_EXECUTE) { if (!pkey_execute_disable_supported) return -EINVAL; new_iamr_bits |= IAMR_EX_BIT; } init_iamr(pkey, new_iamr_bits); /* Set the bits we need in AMR: */ if (init_val & PKEY_DISABLE_ACCESS) new_amr_bits |= AMR_RD_BIT | AMR_WR_BIT; else if (init_val & PKEY_DISABLE_WRITE) new_amr_bits |= AMR_WR_BIT; init_amr(pkey, new_amr_bits); Seems to me that PKEY_DISABLE_ACCESS leaves exec permissions as-is. Here is the patch I am planning to include in the next version of the series. This should support all PKEY_DISABLE_* combinations. Any comments? commit ba51371a544f6b0a4a0f03df62ad894d53f5039b Author: Joey Gouly <joey.gouly@arm.com> Date: Thu Jul 4 11:29:20 2024 +0100 arm64: add PKEY_DISABLE_READ and PKEY_DISABLE_EXEC TODO Signed-off-by: Joey Gouly <joey.gouly@arm.com> diff --git arch/arm64/include/uapi/asm/mman.h arch/arm64/include/uapi/asm/mman.h index 1e6482a838e1..e7e0c8216243 100644 --- arch/arm64/include/uapi/asm/mman.h +++ arch/arm64/include/uapi/asm/mman.h @@ -7,4 +7,13 @@ #define PROT_BTI 0x10 /* BTI guarded page */ #define PROT_MTE 0x20 /* Normal Tagged mapping */ +/* Override any generic PKEY permission defines */ +#define PKEY_DISABLE_EXECUTE 0x4 +#define PKEY_DISABLE_READ 0x8 +#undef PKEY_ACCESS_MASK +#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ + PKEY_DISABLE_WRITE |\ + PKEY_DISABLE_READ |\ + PKEY_DISABLE_EXECUTE) + #endif /* ! 
_UAPI__ASM_MMAN_H */ diff --git arch/arm64/mm/mmu.c arch/arm64/mm/mmu.c index 68afe5fc3071..ce4cc6bdee4e 100644 --- arch/arm64/mm/mmu.c +++ arch/arm64/mm/mmu.c @@ -1570,10 +1570,15 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long i return -EINVAL; /* Set the bits we need in POR: */ + new_por = POE_RXW; + if (init_val & PKEY_DISABLE_WRITE) + new_por &= ~POE_W; if (init_val & PKEY_DISABLE_ACCESS) - new_por = POE_X; - else if (init_val & PKEY_DISABLE_WRITE) - new_por = POE_RX; + new_por &= ~POE_RW; + if (init_val & PKEY_DISABLE_READ) + new_por &= ~POE_R; + if (init_val & PKEY_DISABLE_EXECUTE) + new_por &= ~POE_X; /* Shift the bits in to the correct place in POR for pkey: */ pkey_shift = pkey * POR_BITS_PER_PKEY; Thanks, Joey
On 7/9/24 18:37, Kevin Brodsky wrote: > On 03/05/2024 15:01, Joey Gouly wrote: >> @@ -267,6 +294,28 @@ static inline unsigned long mm_untag_mask(struct mm_struct *mm) >> return -1UL >> 8; >> } >> >> +/* >> + * We only want to enforce protection keys on the current process >> + * because we effectively have no access to POR_EL0 for other >> + * processes or any way to tell *which * POR_EL0 in a threaded >> + * process we could use. > > I see that this comment is essentially copied from x86, but to me it > misses the main point. Even with only one thread in the target process > and a way to obtain its POR_EL0, it still wouldn't make sense to check > that value. If we take the case of a debugger accessing an inferior via > ptrace(), for instance, the kernel is asked to access some memory in > another mm. However, the debugger's POR_EL0 is tied to its own address > space, and the target's POR_EL0 is relevant to its own execution flow > only. In such situations, there is essentially no user context for the > access, so It fundamentally does not make sense to make checks based on > pkey/POE or similar restrictions to memory accesses (e.g. MTE). Indeed this makes more sense. There is no memory context even if there is access to another POR_EL0. The comment above could be improved describing this limitation.
The 07/11/2024 10:50, Joey Gouly wrote: > On Mon, Jul 08, 2024 at 06:53:18PM +0100, Catalin Marinas wrote: > > On Mon, Jun 17, 2024 at 03:51:35PM +0100, Szabolcs Nagy wrote: > > > to me it makes sense to have abstract > > > > > > PKEY_DISABLE_READ > > > PKEY_DISABLE_WRITE > > > PKEY_DISABLE_EXECUTE > > > PKEY_DISABLE_ACCESS > > > > > > where access is handled like > > > > > > if (flags&PKEY_DISABLE_ACCESS) > > > flags |= PKEY_DISABLE_READ|PKEY_DISABLE_WRITE; > > > disable_read = flags&PKEY_DISABLE_READ; > > > disable_write = flags&PKEY_DISABLE_WRITE; > > > disable_exec = flags&PKEY_DISABLE_EXECUTE; ... > > On powerpc, PKEY_DISABLE_ACCESS also disables execution. AFAICT, the ... > Seems to me that PKEY_DISABLE_ACCESS leaves exec permissions as-is. assuming this is right the patch below looks reasonable to me. thanks. > Here is the patch I am planning to include in the next version of the series. > This should support all PKEY_DISABLE_* combinations. Any comments? > > commit ba51371a544f6b0a4a0f03df62ad894d53f5039b > Author: Joey Gouly <joey.gouly@arm.com> > Date: Thu Jul 4 11:29:20 2024 +0100 > > arm64: add PKEY_DISABLE_READ and PKEY_DISABLE_EXEC it's PKEY_DISABLE_EXECUTE (fwiw i like the shorter exec better but ppc seems to use execute) > > TODO > > Signed-off-by: Joey Gouly <joey.gouly@arm.com> > > diff --git arch/arm64/include/uapi/asm/mman.h arch/arm64/include/uapi/asm/mman.h > index 1e6482a838e1..e7e0c8216243 100644 > --- arch/arm64/include/uapi/asm/mman.h > +++ arch/arm64/include/uapi/asm/mman.h > @@ -7,4 +7,13 @@ > #define PROT_BTI 0x10 /* BTI guarded page */ > #define PROT_MTE 0x20 /* Normal Tagged mapping */ > > +/* Override any generic PKEY permission defines */ > +#define PKEY_DISABLE_EXECUTE 0x4 > +#define PKEY_DISABLE_READ 0x8 > +#undef PKEY_ACCESS_MASK > +#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ > + PKEY_DISABLE_WRITE |\ > + PKEY_DISABLE_READ |\ > + PKEY_DISABLE_EXECUTE) > + > #endif /* ! 
_UAPI__ASM_MMAN_H */ > diff --git arch/arm64/mm/mmu.c arch/arm64/mm/mmu.c > index 68afe5fc3071..ce4cc6bdee4e 100644 > --- arch/arm64/mm/mmu.c > +++ arch/arm64/mm/mmu.c > @@ -1570,10 +1570,15 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long i > return -EINVAL; > > /* Set the bits we need in POR: */ > + new_por = POE_RXW; > + if (init_val & PKEY_DISABLE_WRITE) > + new_por &= ~POE_W; > if (init_val & PKEY_DISABLE_ACCESS) > - new_por = POE_X; > - else if (init_val & PKEY_DISABLE_WRITE) > - new_por = POE_RX; > + new_por &= ~POE_RW; > + if (init_val & PKEY_DISABLE_READ) > + new_por &= ~POE_R; > + if (init_val & PKEY_DISABLE_EXECUTE) > + new_por &= ~POE_X; > > /* Shift the bits in to the correct place in POR for pkey: */ > pkey_shift = pkey * POR_BITS_PER_PKEY; > > > > Thanks, > Joey
On 05/07/2024 18:59, Catalin Marinas wrote: > On Fri, May 03, 2024 at 02:01:35PM +0100, Joey Gouly wrote: >> @@ -163,7 +182,8 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) >> #define pte_access_permitted_no_overlay(pte, write) \ >> (((pte_val(pte) & (PTE_VALID | PTE_USER)) == (PTE_VALID | PTE_USER)) && (!(write) || pte_write(pte))) >> #define pte_access_permitted(pte, write) \ >> - pte_access_permitted_no_overlay(pte, write) >> + (pte_access_permitted_no_overlay(pte, write) && \ >> + por_el0_allows_pkey(FIELD_GET(PTE_PO_IDX_MASK, pte_val(pte)), write, false)) > I'm still not entirely convinced on checking the keys during fast GUP > but that's what x86 and powerpc do already, so I guess we'll follow the > same ABI. I've thought about this some more. In summary I don't think adding this check to pte_access_permitted() is controversial, but we should decide how POR_EL0 is set for kernel threads. This change essentially means that fast GUP behaves like uaccess for pages that are already present: in both cases POR_EL0 will be looked up based on the POIndex of the page being accessed (by the hardware in the uaccess case, and explicitly in the fast GUP case). Fast GUP always operates on current->mm, so to me checking POR_EL0 in pte_access_permitted() should be no more restrictive than a uaccess check from a user perspective. In other words, POR_EL0 is checked when the kernel accesses user memory on the user's behalf, whether through uaccess or GUP. It's also worth noting that the "slow" GUP path (which get_user_pages_fast() falls back to if a page is missing) also checks POR_EL0 by virtue of calling handle_mm_fault(), which in turn calls arch_vma_access_permitted(). It would be pretty inconsistent for the slow GUP path to do a pkey check but not the fast path. (That said, the slow GUP path does not call arch_vma_access_permitted() if a page is already present, so callers of get_user_pages() and similar will get inconsistent checking. 
Not great, that may be worth fixing - but that's clearly beyond the scope of this series.) Now an interesting question is what happens with kernel threads that access user memory, as is the case for the optional io_uring kernel thread (IORING_SETUP_SQPOLL). The discussion above holds regardless of the type of thread, so the sqpoll thread will have its POR_EL0 checked when processing commands that involve uaccess or GUP. AFAICT, this series does not have special handling for kernel threads w.r.t. POR_EL0, which means that it is left unchanged when a new kernel thread is cloned (create_io_thread() in the IORING_SETUP_SQPOLL case). The sqpoll thread will therefore inherit POR_EL0 from the (user) thread that calls io_uring_setup(). In other words, the sqpoll thread ends up with the same view of user memory as that user thread - for instance if its POR_EL0 prevents access to POIndex 1, then any I/O that the sqpoll thread attempts on mappings with POIndex/pkey 1 will fail. This behaviour seems potentially useful to me, as the io_uring SQ could easily become a way to bypass POE without some restriction. However, it feels like this should be documented, as one should keep it in mind when using pkeys, and there may well be other cases where kernel threads are impacted by POR_EL0. I am also unsure how x86/ppc handle this. Kevin
On 5/3/24 18:31, Joey Gouly wrote: > Implement the PKEYS interface, using the Permission Overlay Extension. This commit message should contain some more details here considering the amount of code change proposed in this patch. > > Signed-off-by: Joey Gouly <joey.gouly@arm.com> > Cc: Catalin Marinas <catalin.marinas@arm.com> > Cc: Will Deacon <will@kernel.org> > --- > arch/arm64/include/asm/mmu.h | 1 + > arch/arm64/include/asm/mmu_context.h | 51 ++++++++++++- > arch/arm64/include/asm/pgtable.h | 22 +++++- > arch/arm64/include/asm/pkeys.h | 110 +++++++++++++++++++++++++++ > arch/arm64/include/asm/por.h | 33 ++++++++ > arch/arm64/mm/mmu.c | 40 ++++++++++ > 6 files changed, 255 insertions(+), 2 deletions(-) > create mode 100644 arch/arm64/include/asm/pkeys.h > create mode 100644 arch/arm64/include/asm/por.h > > diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h > index 65977c7783c5..983afeb4eba5 100644 > --- a/arch/arm64/include/asm/mmu.h > +++ b/arch/arm64/include/asm/mmu.h > @@ -25,6 +25,7 @@ typedef struct { > refcount_t pinned; > void *vdso; > unsigned long flags; > + u8 pkey_allocation_map; arch_max_pkey() is 7 on arm64, with bit 0 reserved for the first pkey, so is it possible for the entire pkey_allocation_map to be completely used up in reality ? OR the maximum pkey bits that can be allocated is actually ARCH_PKEY_BITS ? 
> } mm_context_t; > > /* > diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h > index c768d16b81a4..cb499db7a97b 100644 > --- a/arch/arm64/include/asm/mmu_context.h > +++ b/arch/arm64/include/asm/mmu_context.h > @@ -15,12 +15,12 @@ > #include <linux/sched/hotplug.h> > #include <linux/mm_types.h> > #include <linux/pgtable.h> > +#include <linux/pkeys.h> > > #include <asm/cacheflush.h> > #include <asm/cpufeature.h> > #include <asm/daifflags.h> > #include <asm/proc-fns.h> > -#include <asm-generic/mm_hooks.h> > #include <asm/cputype.h> > #include <asm/sysreg.h> > #include <asm/tlbflush.h> > @@ -175,9 +175,36 @@ init_new_context(struct task_struct *tsk, struct mm_struct *mm) > { > atomic64_set(&mm->context.id, 0); > refcount_set(&mm->context.pinned, 0); > + > + /* pkey 0 is the default, so always reserve it. */ > + mm->context.pkey_allocation_map = 0x1; Very small nit. Considering the 1U << pkey allocation mechanism, the following might make more sense, considering the first bit being the default one. mm->context.pkey_allocation_map = (1U << 0); OR probably even making it a const or something. 
> + > + return 0; > +} > + > +static inline void arch_dup_pkeys(struct mm_struct *oldmm, > + struct mm_struct *mm) > +{ > + /* Duplicate the oldmm pkey state in mm: */ > + mm->context.pkey_allocation_map = oldmm->context.pkey_allocation_map; > +} > + > +static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) > +{ > + arch_dup_pkeys(oldmm, mm); > + > return 0; > } > > +static inline void arch_exit_mmap(struct mm_struct *mm) > +{ > +} > + > +static inline void arch_unmap(struct mm_struct *mm, > + unsigned long start, unsigned long end) > +{ > +} > + > #ifdef CONFIG_ARM64_SW_TTBR0_PAN > static inline void update_saved_ttbr0(struct task_struct *tsk, > struct mm_struct *mm) > @@ -267,6 +294,28 @@ static inline unsigned long mm_untag_mask(struct mm_struct *mm) > return -1UL >> 8; > } > > +/* > + * We only want to enforce protection keys on the current process > + * because we effectively have no access to POR_EL0 for other > + * processes or any way to tell *which * POR_EL0 in a threaded > + * process we could use. > + * > + * So do not enforce things if the VMA is not from the current > + * mm, or if we are in a kernel thread. > + */ As mentioned in the other thread, this comment can be improved. 
> +static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, > + bool write, bool execute, bool foreign) > +{ > + if (!arch_pkeys_enabled()) > + return true; > + > + /* allow access if the VMA is not one from this process */ > + if (foreign || vma_is_foreign(vma)) > + return true; > + > + return por_el0_allows_pkey(vma_pkey(vma), write, execute); > +} > + > #include <asm-generic/mmu_context.h> > > #endif /* !__ASSEMBLY__ */ > diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h > index 2449e4e27ea6..8ee68ff03016 100644 > --- a/arch/arm64/include/asm/pgtable.h > +++ b/arch/arm64/include/asm/pgtable.h > @@ -34,6 +34,7 @@ > > #include <asm/cmpxchg.h> > #include <asm/fixmap.h> > +#include <asm/por.h> > #include <linux/mmdebug.h> > #include <linux/mm_types.h> > #include <linux/sched.h> > @@ -153,6 +154,24 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) > #define pte_accessible(mm, pte) \ > (mm_tlb_flush_pending(mm) ? pte_present(pte) : pte_valid(pte)) > > +static inline bool por_el0_allows_pkey(u8 pkey, bool write, bool execute) > +{ > + u64 por; > + > + if (!system_supports_poe()) > + return true; This is redundant. Same check is there in arch_vma_access_permitted() as well which is the sole caller for this function. > + > + por = read_sysreg_s(SYS_POR_EL0); > + > + if (write) > + return por_elx_allows_write(por, pkey); > + > + if (execute) > + return por_elx_allows_exec(por, pkey); > + > + return por_elx_allows_read(por, pkey); > +} > + > /* > * p??_access_permitted() is true for valid user mappings (PTE_USER > * bit set, subject to the write permission check). 
For execute-only > @@ -163,7 +182,8 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) > #define pte_access_permitted_no_overlay(pte, write) \ > (((pte_val(pte) & (PTE_VALID | PTE_USER)) == (PTE_VALID | PTE_USER)) && (!(write) || pte_write(pte))) > #define pte_access_permitted(pte, write) \ > - pte_access_permitted_no_overlay(pte, write) > + (pte_access_permitted_no_overlay(pte, write) && \ > + por_el0_allows_pkey(FIELD_GET(PTE_PO_IDX_MASK, pte_val(pte)), write, false)) > #define pmd_access_permitted(pmd, write) \ > (pte_access_permitted(pmd_pte(pmd), (write))) > #define pud_access_permitted(pud, write) \ > diff --git a/arch/arm64/include/asm/pkeys.h b/arch/arm64/include/asm/pkeys.h > new file mode 100644 > index 000000000000..a284508a4d02 > --- /dev/null > +++ b/arch/arm64/include/asm/pkeys.h > @@ -0,0 +1,110 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +/* > + * Copyright (C) 2023 Arm Ltd. > + * > + * Based on arch/x86/include/asm/pkeys.h > + */ > + > +#ifndef _ASM_ARM64_PKEYS_H > +#define _ASM_ARM64_PKEYS_H > + > +#define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2) > + > +#define arch_max_pkey() 7 May be this should be made 8 including the default pkey bit 0. > + > +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, > + unsigned long init_val); > + > +static inline bool arch_pkeys_enabled(void) > +{ > + return false; > +} > + > +static inline int vma_pkey(struct vm_area_struct *vma) > +{ > + return (vma->vm_flags & ARCH_VM_PKEY_FLAGS) >> VM_PKEY_SHIFT; > +} > + > +static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma, > + int prot, int pkey) > +{ Following comment is there in x86 __arch_override_mprotect_pkey() which also seems to be applicable here as well. Please consider adding. /* * Is this an mprotect_pkey() call? If so, never * override the value that came from the user. 
*/ > + if (pkey != -1) > + return pkey; > + > + return vma_pkey(vma); > +} > + > +static inline int execute_only_pkey(struct mm_struct *mm) > +{ > + // Execute-only mappings are handled by EPAN/FEAT_PAN3. > + WARN_ON_ONCE(!cpus_have_final_cap(ARM64_HAS_EPAN)); > + > + return -1; > +} > + > +#define mm_pkey_allocation_map(mm) (mm->context.pkey_allocation_map) > +#define mm_set_pkey_allocated(mm, pkey) do { \ > + mm_pkey_allocation_map(mm) |= (1U << pkey); \ > +} while (0) > +#define mm_set_pkey_free(mm, pkey) do { \ > + mm_pkey_allocation_map(mm) &= ~(1U << pkey); \ > +} while (0) > + > +static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey) > +{ > + /* > + * "Allocated" pkeys are those that have been returned > + * from pkey_alloc() or pkey 0 which is allocated > + * implicitly when the mm is created. > + */ > + if (pkey < 0) > + return false; > + if (pkey >= arch_max_pkey()) > + return false; These range checks can be folded into the same conditional statement. > + > + return mm_pkey_allocation_map(mm) & (1U << pkey); > +} > + > +/* > + * Returns a positive, 3-bit key on success, or -1 on failure. > + */ > +static inline int mm_pkey_alloc(struct mm_struct *mm) > +{ > + /* > + * Note: this is the one and only place we make sure > + * that the pkey is valid as far as the hardware is > + * concerned. The rest of the kernel trusts that > + * only good, valid pkeys come out of here. > + */ > + u8 all_pkeys_mask = ((1U << arch_max_pkey()) - 1); > + int ret; > + > + if (!arch_pkeys_enabled()) > + return -1; I am wondering should not pkey's range be asserted here first like as in mm_pkey_is_allocated() ? > + > + /* > + * Are we out of pkeys? We must handle this specially > + * because ffz() behavior is undefined if there are no > + * zeros. 
> + */ > + if (mm_pkey_allocation_map(mm) == all_pkeys_mask) > + return -1; > + > + ret = ffz(mm_pkey_allocation_map(mm)); > + > + mm_set_pkey_allocated(mm, ret); > + > + return ret; > +} > + > +static inline int mm_pkey_free(struct mm_struct *mm, int pkey) > +{ > + if (!mm_pkey_is_allocated(mm, pkey)) > + return -EINVAL; > + > + mm_set_pkey_free(mm, pkey); > + > + return 0; > +} > + > +#endif /* _ASM_ARM64_PKEYS_H */ > diff --git a/arch/arm64/include/asm/por.h b/arch/arm64/include/asm/por.h > new file mode 100644 > index 000000000000..d6604e0c5c54 > --- /dev/null > +++ b/arch/arm64/include/asm/por.h > @@ -0,0 +1,33 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +/* > + * Copyright (C) 2023 Arm Ltd. > + */ > + > +#ifndef _ASM_ARM64_POR_H > +#define _ASM_ARM64_POR_H > + > +#define POR_BITS_PER_PKEY 4 > +#define POR_ELx_IDX(por_elx, idx) (((por_elx) >> (idx * POR_BITS_PER_PKEY)) & 0xf) > + > +static inline bool por_elx_allows_read(u64 por, u8 pkey) > +{ > + u8 perm = POR_ELx_IDX(por, pkey); > + > + return perm & POE_R; > +} > + > +static inline bool por_elx_allows_write(u64 por, u8 pkey) > +{ > + u8 perm = POR_ELx_IDX(por, pkey); > + > + return perm & POE_W; > +} > + > +static inline bool por_elx_allows_exec(u64 por, u8 pkey) > +{ > + u8 perm = POR_ELx_IDX(por, pkey); > + > + return perm & POE_X; > +} > + > +#endif /* _ASM_ARM64_POR_H */ > diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c > index 495b732d5af3..e50ccc86d150 100644 > --- a/arch/arm64/mm/mmu.c > +++ b/arch/arm64/mm/mmu.c > @@ -25,6 +25,7 @@ > #include <linux/vmalloc.h> > #include <linux/set_memory.h> > #include <linux/kfence.h> > +#include <linux/pkeys.h> > > #include <asm/barrier.h> > #include <asm/cputype.h> > @@ -1535,3 +1536,42 @@ void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp) > > cpu_uninstall_idmap(); > } > + > +#ifdef CONFIG_ARCH_HAS_PKEYS > +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val) > +{ > + u64 new_por = POE_RXW; > + u64 old_por; > + u64 
pkey_shift; > + > + if (!arch_pkeys_enabled()) > + return -ENOSPC; This code path might not be possible and hence the check is redundant. If arch_pkeys_enabled() returns negative, then pkey_alloc() will just bail out and arch_set_user_pkey_access() would not be called afterwards. SYSCALL..(pkey_alloc) mm_pkey_alloc() arch_pkeys_enabled() ............... arch_set_user_pkey_access() arch_pkeys_enabled() > + > + /* > + * This code should only be called with valid 'pkey' > + * values originating from in-kernel users. Complain > + * if a bad value is observed. > + */ > + if (WARN_ON_ONCE(pkey >= arch_max_pkey())) > + return -EINVAL; If the pkey's range check could have been done in mm_pkey_alloc() itself - which seems to be a better place instead, this warning here would not have been necessary. > + > + /* Set the bits we need in POR: */ > + if (init_val & PKEY_DISABLE_ACCESS) > + new_por = POE_X; > + else if (init_val & PKEY_DISABLE_WRITE) > + new_por = POE_RX; > + > + /* Shift the bits in to the correct place in POR for pkey: */ > + pkey_shift = pkey * POR_BITS_PER_PKEY; > + new_por <<= pkey_shift; > + > + /* Get old POR and mask off any old bits in place: */ > + old_por = read_sysreg_s(SYS_POR_EL0); > + old_por &= ~(POE_MASK << pkey_shift); > + > + /* Write old part along with new part: */ > + write_sysreg_s(old_por | new_por, SYS_POR_EL0); > + > + return 0; > +} > +#endif
On Fri, May 03, 2024 at 02:01:35PM +0100, Joey Gouly wrote: > Implement the PKEYS interface, using the Permission Overlay Extension. > > Signed-off-by: Joey Gouly <joey.gouly@arm.com> > Cc: Catalin Marinas <catalin.marinas@arm.com> > Cc: Will Deacon <will@kernel.org> > --- > arch/arm64/include/asm/mmu.h | 1 + > arch/arm64/include/asm/mmu_context.h | 51 ++++++++++++- > arch/arm64/include/asm/pgtable.h | 22 +++++- > arch/arm64/include/asm/pkeys.h | 110 +++++++++++++++++++++++++++ > arch/arm64/include/asm/por.h | 33 ++++++++ > arch/arm64/mm/mmu.c | 40 ++++++++++ > 6 files changed, 255 insertions(+), 2 deletions(-) > create mode 100644 arch/arm64/include/asm/pkeys.h > create mode 100644 arch/arm64/include/asm/por.h [...] > diff --git a/arch/arm64/include/asm/pkeys.h b/arch/arm64/include/asm/pkeys.h > new file mode 100644 > index 000000000000..a284508a4d02 > --- /dev/null > +++ b/arch/arm64/include/asm/pkeys.h > @@ -0,0 +1,110 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +/* > + * Copyright (C) 2023 Arm Ltd. > + * > + * Based on arch/x86/include/asm/pkeys.h > + */ > + > +#ifndef _ASM_ARM64_PKEYS_H > +#define _ASM_ARM64_PKEYS_H > + > +#define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2) > + > +#define arch_max_pkey() 7 Did you mean 8 ? I'm guessing this may be the "off by one error" you alluded to in your own reply to the cover letter, but just in case... (x86 and powerpc seem to have booby-trapped the name of this macro for the unwary...) See also mm_pkey_{is_allocated,alloc}(). 
> + > +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, > + unsigned long init_val); > + > +static inline bool arch_pkeys_enabled(void) > +{ > + return false; > +} > + > +static inline int vma_pkey(struct vm_area_struct *vma) > +{ > + return (vma->vm_flags & ARCH_VM_PKEY_FLAGS) >> VM_PKEY_SHIFT; > +} > + > +static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma, > + int prot, int pkey) > +{ > + if (pkey != -1) > + return pkey; > + > + return vma_pkey(vma); > +} > + > +static inline int execute_only_pkey(struct mm_struct *mm) > +{ > + // Execute-only mappings are handled by EPAN/FEAT_PAN3. > + WARN_ON_ONCE(!cpus_have_final_cap(ARM64_HAS_EPAN)); > + > + return -1; > +} > + > +#define mm_pkey_allocation_map(mm) (mm->context.pkey_allocation_map) Pedantic nit: (mm) although other arches have the same nit already, and it's probably low risk given the scope and usage of these macros. (Also, the outer parentheses are also redundant (if harmless).) > +#define mm_set_pkey_allocated(mm, pkey) do { \ > + mm_pkey_allocation_map(mm) |= (1U << pkey); \ > +} while (0) > +#define mm_set_pkey_free(mm, pkey) do { \ > + mm_pkey_allocation_map(mm) &= ~(1U << pkey); \ > +} while (0) > + > +static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey) > +{ > + /* > + * "Allocated" pkeys are those that have been returned > + * from pkey_alloc() or pkey 0 which is allocated > + * implicitly when the mm is created. > + */ > + if (pkey < 0) > + return false; > + if (pkey >= arch_max_pkey()) > + return false; Did you mean > ? > + > + return mm_pkey_allocation_map(mm) & (1U << pkey); > +} > + > +/* > + * Returns a positive, 3-bit key on success, or -1 on failure. > + */ > +static inline int mm_pkey_alloc(struct mm_struct *mm) > +{ > + /* > + * Note: this is the one and only place we make sure > + * that the pkey is valid as far as the hardware is > + * concerned. The rest of the kernel trusts that > + * only good, valid pkeys come out of here. 
> + */ > + u8 all_pkeys_mask = ((1U << arch_max_pkey()) - 1); Nit: redundant outer (). Also, GENMASK() and friends might be cleaner than spelling out this idiom explicitly (but no big deal). (1 << 7) - 1 is 0x7f, which doesn't feel right if pkeys 0..7 are all supposed to be valid. (See arch_max_pkey() above.) (Also it looks mildly weird to have this before checking arch_pkeys_enabled(), but since this is likely to be constant-folded by the compiler, I guess it almost certainly makes no difference. It's harmless either way.) > + int ret; > + > + if (!arch_pkeys_enabled()) > + return -1; > + > + /* > + * Are we out of pkeys? We must handle this specially > + * because ffz() behavior is undefined if there are no > + * zeros. > + */ > + if (mm_pkey_allocation_map(mm) == all_pkeys_mask) > + return -1; > + > + ret = ffz(mm_pkey_allocation_map(mm)); > + > + mm_set_pkey_allocated(mm, ret); > + > + return ret; > +} > + > +static inline int mm_pkey_free(struct mm_struct *mm, int pkey) > +{ > + if (!mm_pkey_is_allocated(mm, pkey)) > + return -EINVAL; Does anything prevent a pkey_free(0)? I couldn't find any check related to this so far. If not, this may be a generic problem, better solved through a wrapper in the generic mm code. Userspace has to have at least one PKEY allocated, since the pte field has to be set to something... unless we turn PKEYs on or off per mm. But the pkeys API doesn't seem to be designed that way (and it doesn't look very useful). > + > + mm_set_pkey_free(mm, pkey); > + > + return 0; > +} > + > +#endif /* _ASM_ARM64_PKEYS_H */ > diff --git a/arch/arm64/include/asm/por.h b/arch/arm64/include/asm/por.h > new file mode 100644 > index 000000000000..d6604e0c5c54 > --- /dev/null > +++ b/arch/arm64/include/asm/por.h > @@ -0,0 +1,33 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +/* > + * Copyright (C) 2023 Arm Ltd. 
> + */ > + > +#ifndef _ASM_ARM64_POR_H > +#define _ASM_ARM64_POR_H > + > +#define POR_BITS_PER_PKEY 4 > +#define POR_ELx_IDX(por_elx, idx) (((por_elx) >> (idx * POR_BITS_PER_PKEY)) & 0xf) Nit: (idx) Since this is shared with other code in a header, it's probably best to avoid surprises. [...] > diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c > index 495b732d5af3..e50ccc86d150 100644 > --- a/arch/arm64/mm/mmu.c > +++ b/arch/arm64/mm/mmu.c > @@ -25,6 +25,7 @@ > #include <linux/vmalloc.h> > #include <linux/set_memory.h> > #include <linux/kfence.h> > +#include <linux/pkeys.h> > > #include <asm/barrier.h> > #include <asm/cputype.h> > @@ -1535,3 +1536,42 @@ void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp) > > cpu_uninstall_idmap(); > } > + > +#ifdef CONFIG_ARCH_HAS_PKEYS > +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val) > +{ > + u64 new_por = POE_RXW; > + u64 old_por; > + u64 pkey_shift; > + > + if (!arch_pkeys_enabled()) > + return -ENOSPC; > + > + /* > + * This code should only be called with valid 'pkey' > + * values originating from in-kernel users. Complain > + * if a bad value is observed. 
> + */ > + if (WARN_ON_ONCE(pkey >= arch_max_pkey())) > + return -EINVAL; > + > + /* Set the bits we need in POR: */ > + if (init_val & PKEY_DISABLE_ACCESS) > + new_por = POE_X; > + else if (init_val & PKEY_DISABLE_WRITE) > + new_por = POE_RX; > + > + /* Shift the bits in to the correct place in POR for pkey: */ > + pkey_shift = pkey * POR_BITS_PER_PKEY; > + new_por <<= pkey_shift; > + > + /* Get old POR and mask off any old bits in place: */ > + old_por = read_sysreg_s(SYS_POR_EL0); > + old_por &= ~(POE_MASK << pkey_shift); > + > + /* Write old part along with new part: */ > + write_sysreg_s(old_por | new_por, SYS_POR_EL0); > + > + return 0; > +} > +#endif <bikeshed> Although this is part of the existing PKEYS support, it feels weird to have to initialise the permissions with one interface and one set of flags, then change the permissions using an arch-specific interface and a different set of flags (i.e., directly writing POR_EL0) later on. Is there any merit in defining a vDSO function for changing the flags in userspace? This would allow userspace to use PKEYS in a generic way without a nasty per-arch volatile asm hack. (Maybe too late for stopping user libraries rolling their own, though.) Since we ideally don't want to write the above flags-munging code twice, there would be the option of implementing pkey_alloc() via a vDSO wrapper on arm64 (though this might be more trouble than it is worth). Of course, this is all pointless if people thought that even the overhead of a vDSO call was unacceptable for flipping the permissions on and off. Either way, this is a potential enhancement, orthogonal to this series... </bikeshed> [...] Cheers ---Dave
diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 65977c7783c5..983afeb4eba5 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -25,6 +25,7 @@ typedef struct { refcount_t pinned; void *vdso; unsigned long flags; + u8 pkey_allocation_map; } mm_context_t; /* diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index c768d16b81a4..cb499db7a97b 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -15,12 +15,12 @@ #include <linux/sched/hotplug.h> #include <linux/mm_types.h> #include <linux/pgtable.h> +#include <linux/pkeys.h> #include <asm/cacheflush.h> #include <asm/cpufeature.h> #include <asm/daifflags.h> #include <asm/proc-fns.h> -#include <asm-generic/mm_hooks.h> #include <asm/cputype.h> #include <asm/sysreg.h> #include <asm/tlbflush.h> @@ -175,9 +175,36 @@ init_new_context(struct task_struct *tsk, struct mm_struct *mm) { atomic64_set(&mm->context.id, 0); refcount_set(&mm->context.pinned, 0); + + /* pkey 0 is the default, so always reserve it. 
*/ + mm->context.pkey_allocation_map = 0x1; + + return 0; +} + +static inline void arch_dup_pkeys(struct mm_struct *oldmm, + struct mm_struct *mm) +{ + /* Duplicate the oldmm pkey state in mm: */ + mm->context.pkey_allocation_map = oldmm->context.pkey_allocation_map; +} + +static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) +{ + arch_dup_pkeys(oldmm, mm); + return 0; } +static inline void arch_exit_mmap(struct mm_struct *mm) +{ +} + +static inline void arch_unmap(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ +} + #ifdef CONFIG_ARM64_SW_TTBR0_PAN static inline void update_saved_ttbr0(struct task_struct *tsk, struct mm_struct *mm) @@ -267,6 +294,28 @@ static inline unsigned long mm_untag_mask(struct mm_struct *mm) return -1UL >> 8; } +/* + * We only want to enforce protection keys on the current process + * because we effectively have no access to POR_EL0 for other + * processes or any way to tell *which * POR_EL0 in a threaded + * process we could use. + * + * So do not enforce things if the VMA is not from the current + * mm, or if we are in a kernel thread. 
+ */ +static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, + bool write, bool execute, bool foreign) +{ + if (!arch_pkeys_enabled()) + return true; + + /* allow access if the VMA is not one from this process */ + if (foreign || vma_is_foreign(vma)) + return true; + + return por_el0_allows_pkey(vma_pkey(vma), write, execute); +} + #include <asm-generic/mmu_context.h> #endif /* !__ASSEMBLY__ */ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 2449e4e27ea6..8ee68ff03016 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -34,6 +34,7 @@ #include <asm/cmpxchg.h> #include <asm/fixmap.h> +#include <asm/por.h> #include <linux/mmdebug.h> #include <linux/mm_types.h> #include <linux/sched.h> @@ -153,6 +154,24 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) #define pte_accessible(mm, pte) \ (mm_tlb_flush_pending(mm) ? pte_present(pte) : pte_valid(pte)) +static inline bool por_el0_allows_pkey(u8 pkey, bool write, bool execute) +{ + u64 por; + + if (!system_supports_poe()) + return true; + + por = read_sysreg_s(SYS_POR_EL0); + + if (write) + return por_elx_allows_write(por, pkey); + + if (execute) + return por_elx_allows_exec(por, pkey); + + return por_elx_allows_read(por, pkey); +} + /* * p??_access_permitted() is true for valid user mappings (PTE_USER * bit set, subject to the write permission check). 
For execute-only @@ -163,7 +182,8 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) #define pte_access_permitted_no_overlay(pte, write) \ (((pte_val(pte) & (PTE_VALID | PTE_USER)) == (PTE_VALID | PTE_USER)) && (!(write) || pte_write(pte))) #define pte_access_permitted(pte, write) \ - pte_access_permitted_no_overlay(pte, write) + (pte_access_permitted_no_overlay(pte, write) && \ + por_el0_allows_pkey(FIELD_GET(PTE_PO_IDX_MASK, pte_val(pte)), write, false)) #define pmd_access_permitted(pmd, write) \ (pte_access_permitted(pmd_pte(pmd), (write))) #define pud_access_permitted(pud, write) \ diff --git a/arch/arm64/include/asm/pkeys.h b/arch/arm64/include/asm/pkeys.h new file mode 100644 index 000000000000..a284508a4d02 --- /dev/null +++ b/arch/arm64/include/asm/pkeys.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 Arm Ltd. + * + * Based on arch/x86/include/asm/pkeys.h + */ + +#ifndef _ASM_ARM64_PKEYS_H +#define _ASM_ARM64_PKEYS_H + +#define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2) + +#define arch_max_pkey() 7 + +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, + unsigned long init_val); + +static inline bool arch_pkeys_enabled(void) +{ + return false; +} + +static inline int vma_pkey(struct vm_area_struct *vma) +{ + return (vma->vm_flags & ARCH_VM_PKEY_FLAGS) >> VM_PKEY_SHIFT; +} + +static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma, + int prot, int pkey) +{ + if (pkey != -1) + return pkey; + + return vma_pkey(vma); +} + +static inline int execute_only_pkey(struct mm_struct *mm) +{ + // Execute-only mappings are handled by EPAN/FEAT_PAN3. 
+ WARN_ON_ONCE(!cpus_have_final_cap(ARM64_HAS_EPAN)); + + return -1; +} + +#define mm_pkey_allocation_map(mm) (mm->context.pkey_allocation_map) +#define mm_set_pkey_allocated(mm, pkey) do { \ + mm_pkey_allocation_map(mm) |= (1U << pkey); \ +} while (0) +#define mm_set_pkey_free(mm, pkey) do { \ + mm_pkey_allocation_map(mm) &= ~(1U << pkey); \ +} while (0) + +static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey) +{ + /* + * "Allocated" pkeys are those that have been returned + * from pkey_alloc() or pkey 0 which is allocated + * implicitly when the mm is created. + */ + if (pkey < 0) + return false; + if (pkey >= arch_max_pkey()) + return false; + + return mm_pkey_allocation_map(mm) & (1U << pkey); +} + +/* + * Returns a positive, 3-bit key on success, or -1 on failure. + */ +static inline int mm_pkey_alloc(struct mm_struct *mm) +{ + /* + * Note: this is the one and only place we make sure + * that the pkey is valid as far as the hardware is + * concerned. The rest of the kernel trusts that + * only good, valid pkeys come out of here. + */ + u8 all_pkeys_mask = ((1U << arch_max_pkey()) - 1); + int ret; + + if (!arch_pkeys_enabled()) + return -1; + + /* + * Are we out of pkeys? We must handle this specially + * because ffz() behavior is undefined if there are no + * zeros. + */ + if (mm_pkey_allocation_map(mm) == all_pkeys_mask) + return -1; + + ret = ffz(mm_pkey_allocation_map(mm)); + + mm_set_pkey_allocated(mm, ret); + + return ret; +} + +static inline int mm_pkey_free(struct mm_struct *mm, int pkey) +{ + if (!mm_pkey_is_allocated(mm, pkey)) + return -EINVAL; + + mm_set_pkey_free(mm, pkey); + + return 0; +} + +#endif /* _ASM_ARM64_PKEYS_H */ diff --git a/arch/arm64/include/asm/por.h b/arch/arm64/include/asm/por.h new file mode 100644 index 000000000000..d6604e0c5c54 --- /dev/null +++ b/arch/arm64/include/asm/por.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 Arm Ltd. 
+ */ + +#ifndef _ASM_ARM64_POR_H +#define _ASM_ARM64_POR_H + +#define POR_BITS_PER_PKEY 4 +#define POR_ELx_IDX(por_elx, idx) (((por_elx) >> (idx * POR_BITS_PER_PKEY)) & 0xf) + +static inline bool por_elx_allows_read(u64 por, u8 pkey) +{ + u8 perm = POR_ELx_IDX(por, pkey); + + return perm & POE_R; +} + +static inline bool por_elx_allows_write(u64 por, u8 pkey) +{ + u8 perm = POR_ELx_IDX(por, pkey); + + return perm & POE_W; +} + +static inline bool por_elx_allows_exec(u64 por, u8 pkey) +{ + u8 perm = POR_ELx_IDX(por, pkey); + + return perm & POE_X; +} + +#endif /* _ASM_ARM64_POR_H */ diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 495b732d5af3..e50ccc86d150 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -25,6 +25,7 @@ #include <linux/vmalloc.h> #include <linux/set_memory.h> #include <linux/kfence.h> +#include <linux/pkeys.h> #include <asm/barrier.h> #include <asm/cputype.h> @@ -1535,3 +1536,42 @@ void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp) cpu_uninstall_idmap(); } + +#ifdef CONFIG_ARCH_HAS_PKEYS +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val) +{ + u64 new_por = POE_RXW; + u64 old_por; + u64 pkey_shift; + + if (!arch_pkeys_enabled()) + return -ENOSPC; + + /* + * This code should only be called with valid 'pkey' + * values originating from in-kernel users. Complain + * if a bad value is observed. 
+ */ + if (WARN_ON_ONCE(pkey >= arch_max_pkey())) + return -EINVAL; + + /* Set the bits we need in POR: */ + if (init_val & PKEY_DISABLE_ACCESS) + new_por = POE_X; + else if (init_val & PKEY_DISABLE_WRITE) + new_por = POE_RX; + + /* Shift the bits in to the correct place in POR for pkey: */ + pkey_shift = pkey * POR_BITS_PER_PKEY; + new_por <<= pkey_shift; + + /* Get old POR and mask off any old bits in place: */ + old_por = read_sysreg_s(SYS_POR_EL0); + old_por &= ~(POE_MASK << pkey_shift); + + /* Write old part along with new part: */ + write_sysreg_s(old_por | new_por, SYS_POR_EL0); + + return 0; +} +#endif
Implement the PKEYS interface, using the Permission Overlay Extension. Signed-off-by: Joey Gouly <joey.gouly@arm.com> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Will Deacon <will@kernel.org> --- arch/arm64/include/asm/mmu.h | 1 + arch/arm64/include/asm/mmu_context.h | 51 ++++++++++++- arch/arm64/include/asm/pgtable.h | 22 +++++- arch/arm64/include/asm/pkeys.h | 110 +++++++++++++++++++++++++++ arch/arm64/include/asm/por.h | 33 ++++++++ arch/arm64/mm/mmu.c | 40 ++++++++++ 6 files changed, 255 insertions(+), 2 deletions(-) create mode 100644 arch/arm64/include/asm/pkeys.h create mode 100644 arch/arm64/include/asm/por.h