diff mbox series

[v4,17/29] arm64: implement PKEYS support

Message ID 20240503130147.1154804-18-joey.gouly@arm.com (mailing list archive)
State New, archived
Headers show
Series arm64: Permission Overlay Extension | expand

Commit Message

Joey Gouly May 3, 2024, 1:01 p.m. UTC
Implement the PKEYS interface, using the Permission Overlay Extension.

Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/mmu.h         |   1 +
 arch/arm64/include/asm/mmu_context.h |  51 ++++++++++++-
 arch/arm64/include/asm/pgtable.h     |  22 +++++-
 arch/arm64/include/asm/pkeys.h       | 110 +++++++++++++++++++++++++++
 arch/arm64/include/asm/por.h         |  33 ++++++++
 arch/arm64/mm/mmu.c                  |  40 ++++++++++
 6 files changed, 255 insertions(+), 2 deletions(-)
 create mode 100644 arch/arm64/include/asm/pkeys.h
 create mode 100644 arch/arm64/include/asm/por.h

Comments

Amit Daniel Kachhap May 28, 2024, 6:55 a.m. UTC | #1
On 5/3/24 18:31, Joey Gouly wrote:
> Implement the PKEYS interface, using the Permission Overlay Extension.
> 
> Signed-off-by: Joey Gouly <joey.gouly@arm.com>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> ---
>   arch/arm64/include/asm/mmu.h         |   1 +
>   arch/arm64/include/asm/mmu_context.h |  51 ++++++++++++-
>   arch/arm64/include/asm/pgtable.h     |  22 +++++-
>   arch/arm64/include/asm/pkeys.h       | 110 +++++++++++++++++++++++++++
>   arch/arm64/include/asm/por.h         |  33 ++++++++
>   arch/arm64/mm/mmu.c                  |  40 ++++++++++
>   6 files changed, 255 insertions(+), 2 deletions(-)
>   create mode 100644 arch/arm64/include/asm/pkeys.h
>   create mode 100644 arch/arm64/include/asm/por.h
> 
> diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
> index 65977c7783c5..983afeb4eba5 100644
> --- a/arch/arm64/include/asm/mmu.h
> +++ b/arch/arm64/include/asm/mmu.h
> @@ -25,6 +25,7 @@ typedef struct {
>   	refcount_t	pinned;
>   	void		*vdso;
>   	unsigned long	flags;
> +	u8		pkey_allocation_map;
>   } mm_context_t;
>   
>   /*
> diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
> index c768d16b81a4..cb499db7a97b 100644
> --- a/arch/arm64/include/asm/mmu_context.h
> +++ b/arch/arm64/include/asm/mmu_context.h
> @@ -15,12 +15,12 @@
>   #include <linux/sched/hotplug.h>
>   #include <linux/mm_types.h>
>   #include <linux/pgtable.h>
> +#include <linux/pkeys.h>
>   
>   #include <asm/cacheflush.h>
>   #include <asm/cpufeature.h>
>   #include <asm/daifflags.h>
>   #include <asm/proc-fns.h>
> -#include <asm-generic/mm_hooks.h>
>   #include <asm/cputype.h>
>   #include <asm/sysreg.h>
>   #include <asm/tlbflush.h>
> @@ -175,9 +175,36 @@ init_new_context(struct task_struct *tsk, struct mm_struct *mm)
>   {
>   	atomic64_set(&mm->context.id, 0);
>   	refcount_set(&mm->context.pinned, 0);
> +
> +	/* pkey 0 is the default, so always reserve it. */
> +	mm->context.pkey_allocation_map = 0x1;
> +
> +	return 0;
> +}
> +
> +static inline void arch_dup_pkeys(struct mm_struct *oldmm,
> +				  struct mm_struct *mm)
> +{
> +	/* Duplicate the oldmm pkey state in mm: */
> +	mm->context.pkey_allocation_map = oldmm->context.pkey_allocation_map;
> +}
> +
> +static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
> +{
> +	arch_dup_pkeys(oldmm, mm);
> +
>   	return 0;
>   }
>   
> +static inline void arch_exit_mmap(struct mm_struct *mm)
> +{
> +}
> +
> +static inline void arch_unmap(struct mm_struct *mm,
> +			unsigned long start, unsigned long end)
> +{
> +}
> +
>   #ifdef CONFIG_ARM64_SW_TTBR0_PAN
>   static inline void update_saved_ttbr0(struct task_struct *tsk,
>   				      struct mm_struct *mm)
> @@ -267,6 +294,28 @@ static inline unsigned long mm_untag_mask(struct mm_struct *mm)
>   	return -1UL >> 8;
>   }
>   
> +/*
> + * We only want to enforce protection keys on the current process
> + * because we effectively have no access to POR_EL0 for other
> + * processes or any way to tell *which * POR_EL0 in a threaded
> + * process we could use.
> + *
> + * So do not enforce things if the VMA is not from the current
> + * mm, or if we are in a kernel thread.
> + */
> +static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
> +		bool write, bool execute, bool foreign)
> +{
> +	if (!arch_pkeys_enabled())
> +		return true;

The above check can be dropped as the caller of this function
fault_from_pkey() does the same check.

Thanks,
Amit

> +
> +	/* allow access if the VMA is not one from this process */
> +	if (foreign || vma_is_foreign(vma))
> +		return true;
> +
> +	return por_el0_allows_pkey(vma_pkey(vma), write, execute);
> +}
> +
>   #include <asm-generic/mmu_context.h>
>   
>   #endif /* !__ASSEMBLY__ */
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index 2449e4e27ea6..8ee68ff03016 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -34,6 +34,7 @@
>   
>   #include <asm/cmpxchg.h>
>   #include <asm/fixmap.h>
> +#include <asm/por.h>
>   #include <linux/mmdebug.h>
>   #include <linux/mm_types.h>
>   #include <linux/sched.h>
> @@ -153,6 +154,24 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
>   #define pte_accessible(mm, pte)	\
>   	(mm_tlb_flush_pending(mm) ? pte_present(pte) : pte_valid(pte))
>   
> +static inline bool por_el0_allows_pkey(u8 pkey, bool write, bool execute)
> +{
> +	u64 por;
> +
> +	if (!system_supports_poe())
> +		return true;
> +
> +	por = read_sysreg_s(SYS_POR_EL0);
> +
> +	if (write)
> +		return por_elx_allows_write(por, pkey);
> +
> +	if (execute)
> +		return por_elx_allows_exec(por, pkey);
> +
> +	return por_elx_allows_read(por, pkey);
> +}
> +
>   /*
>    * p??_access_permitted() is true for valid user mappings (PTE_USER
>    * bit set, subject to the write permission check). For execute-only
> @@ -163,7 +182,8 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
>   #define pte_access_permitted_no_overlay(pte, write) \
>   	(((pte_val(pte) & (PTE_VALID | PTE_USER)) == (PTE_VALID | PTE_USER)) && (!(write) || pte_write(pte)))
>   #define pte_access_permitted(pte, write) \
> -	pte_access_permitted_no_overlay(pte, write)
> +	(pte_access_permitted_no_overlay(pte, write) && \
> +	por_el0_allows_pkey(FIELD_GET(PTE_PO_IDX_MASK, pte_val(pte)), write, false))
>   #define pmd_access_permitted(pmd, write) \
>   	(pte_access_permitted(pmd_pte(pmd), (write)))
>   #define pud_access_permitted(pud, write) \
> diff --git a/arch/arm64/include/asm/pkeys.h b/arch/arm64/include/asm/pkeys.h
> new file mode 100644
> index 000000000000..a284508a4d02
> --- /dev/null
> +++ b/arch/arm64/include/asm/pkeys.h
> @@ -0,0 +1,110 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (C) 2023 Arm Ltd.
> + *
> + * Based on arch/x86/include/asm/pkeys.h
> + */
> +
> +#ifndef _ASM_ARM64_PKEYS_H
> +#define _ASM_ARM64_PKEYS_H
> +
> +#define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2)
> +
> +#define arch_max_pkey() 7
> +
> +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
> +		unsigned long init_val);
> +
> +static inline bool arch_pkeys_enabled(void)
> +{
> +	return false;
> +}
> +
> +static inline int vma_pkey(struct vm_area_struct *vma)
> +{
> +	return (vma->vm_flags & ARCH_VM_PKEY_FLAGS) >> VM_PKEY_SHIFT;
> +}
> +
> +static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma,
> +		int prot, int pkey)
> +{
> +	if (pkey != -1)
> +		return pkey;
> +
> +	return vma_pkey(vma);
> +}
> +
> +static inline int execute_only_pkey(struct mm_struct *mm)
> +{
> +	// Execute-only mappings are handled by EPAN/FEAT_PAN3.
> +	WARN_ON_ONCE(!cpus_have_final_cap(ARM64_HAS_EPAN));
> +
> +	return -1;
> +}
> +
> +#define mm_pkey_allocation_map(mm)	(mm->context.pkey_allocation_map)
> +#define mm_set_pkey_allocated(mm, pkey) do {		\
> +	mm_pkey_allocation_map(mm) |= (1U << pkey);	\
> +} while (0)
> +#define mm_set_pkey_free(mm, pkey) do {			\
> +	mm_pkey_allocation_map(mm) &= ~(1U << pkey);	\
> +} while (0)
> +
> +static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey)
> +{
> +	/*
> +	 * "Allocated" pkeys are those that have been returned
> +	 * from pkey_alloc() or pkey 0 which is allocated
> +	 * implicitly when the mm is created.
> +	 */
> +	if (pkey < 0)
> +		return false;
> +	if (pkey >= arch_max_pkey())
> +		return false;
> +
> +	return mm_pkey_allocation_map(mm) & (1U << pkey);
> +}
> +
> +/*
> + * Returns a positive, 3-bit key on success, or -1 on failure.
> + */
> +static inline int mm_pkey_alloc(struct mm_struct *mm)
> +{
> +	/*
> +	 * Note: this is the one and only place we make sure
> +	 * that the pkey is valid as far as the hardware is
> +	 * concerned.  The rest of the kernel trusts that
> +	 * only good, valid pkeys come out of here.
> +	 */
> +	u8 all_pkeys_mask = ((1U << arch_max_pkey()) - 1);
> +	int ret;
> +
> +	if (!arch_pkeys_enabled())
> +		return -1;
> +
> +	/*
> +	 * Are we out of pkeys?  We must handle this specially
> +	 * because ffz() behavior is undefined if there are no
> +	 * zeros.
> +	 */
> +	if (mm_pkey_allocation_map(mm) == all_pkeys_mask)
> +		return -1;
> +
> +	ret = ffz(mm_pkey_allocation_map(mm));
> +
> +	mm_set_pkey_allocated(mm, ret);
> +
> +	return ret;
> +}
> +
> +static inline int mm_pkey_free(struct mm_struct *mm, int pkey)
> +{
> +	if (!mm_pkey_is_allocated(mm, pkey))
> +		return -EINVAL;
> +
> +	mm_set_pkey_free(mm, pkey);
> +
> +	return 0;
> +}
> +
> +#endif /* _ASM_ARM64_PKEYS_H */
> diff --git a/arch/arm64/include/asm/por.h b/arch/arm64/include/asm/por.h
> new file mode 100644
> index 000000000000..d6604e0c5c54
> --- /dev/null
> +++ b/arch/arm64/include/asm/por.h
> @@ -0,0 +1,33 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (C) 2023 Arm Ltd.
> + */
> +
> +#ifndef _ASM_ARM64_POR_H
> +#define _ASM_ARM64_POR_H
> +
> +#define POR_BITS_PER_PKEY		4
> +#define POR_ELx_IDX(por_elx, idx)	(((por_elx) >> (idx * POR_BITS_PER_PKEY)) & 0xf)
> +
> +static inline bool por_elx_allows_read(u64 por, u8 pkey)
> +{
> +	u8 perm = POR_ELx_IDX(por, pkey);
> +
> +	return perm & POE_R;
> +}
> +
> +static inline bool por_elx_allows_write(u64 por, u8 pkey)
> +{
> +	u8 perm = POR_ELx_IDX(por, pkey);
> +
> +	return perm & POE_W;
> +}
> +
> +static inline bool por_elx_allows_exec(u64 por, u8 pkey)
> +{
> +	u8 perm = POR_ELx_IDX(por, pkey);
> +
> +	return perm & POE_X;
> +}
> +
> +#endif /* _ASM_ARM64_POR_H */
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index 495b732d5af3..e50ccc86d150 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -25,6 +25,7 @@
>   #include <linux/vmalloc.h>
>   #include <linux/set_memory.h>
>   #include <linux/kfence.h>
> +#include <linux/pkeys.h>
>   
>   #include <asm/barrier.h>
>   #include <asm/cputype.h>
> @@ -1535,3 +1536,42 @@ void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp)
>   
>   	cpu_uninstall_idmap();
>   }
> +
> +#ifdef CONFIG_ARCH_HAS_PKEYS
> +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val)
> +{
> +	u64 new_por = POE_RXW;
> +	u64 old_por;
> +	u64 pkey_shift;
> +
> +	if (!arch_pkeys_enabled())
> +		return -ENOSPC;
> +
> +	/*
> +	 * This code should only be called with valid 'pkey'
> +	 * values originating from in-kernel users.  Complain
> +	 * if a bad value is observed.
> +	 */
> +	if (WARN_ON_ONCE(pkey >= arch_max_pkey()))
> +		return -EINVAL;
> +
> +	/* Set the bits we need in POR:  */
> +	if (init_val & PKEY_DISABLE_ACCESS)
> +		new_por = POE_X;
> +	else if (init_val & PKEY_DISABLE_WRITE)
> +		new_por = POE_RX;
> +
> +	/* Shift the bits in to the correct place in POR for pkey: */
> +	pkey_shift = pkey * POR_BITS_PER_PKEY;
> +	new_por <<= pkey_shift;
> +
> +	/* Get old POR and mask off any old bits in place: */
> +	old_por = read_sysreg_s(SYS_POR_EL0);
> +	old_por &= ~(POE_MASK << pkey_shift);
> +
> +	/* Write old part along with new part: */
> +	write_sysreg_s(old_por | new_por, SYS_POR_EL0);
> +
> +	return 0;
> +}
> +#endif
Joey Gouly May 28, 2024, 11:26 a.m. UTC | #2
Hi Amit,

Thanks for taking a look!

On Tue, May 28, 2024 at 12:25:58PM +0530, Amit Daniel Kachhap wrote:
> 
> 
> On 5/3/24 18:31, Joey Gouly wrote:
> > Implement the PKEYS interface, using the Permission Overlay Extension.
> > 
> > Signed-off-by: Joey Gouly <joey.gouly@arm.com>
> > Cc: Catalin Marinas <catalin.marinas@arm.com>
> > Cc: Will Deacon <will@kernel.org>
> > ---
> >   arch/arm64/include/asm/mmu.h         |   1 +
> >   arch/arm64/include/asm/mmu_context.h |  51 ++++++++++++-
> >   arch/arm64/include/asm/pgtable.h     |  22 +++++-
> >   arch/arm64/include/asm/pkeys.h       | 110 +++++++++++++++++++++++++++
> >   arch/arm64/include/asm/por.h         |  33 ++++++++
> >   arch/arm64/mm/mmu.c                  |  40 ++++++++++
> >   6 files changed, 255 insertions(+), 2 deletions(-)
> >   create mode 100644 arch/arm64/include/asm/pkeys.h
> >   create mode 100644 arch/arm64/include/asm/por.h
> > 
> > diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
> > index 65977c7783c5..983afeb4eba5 100644
> > --- a/arch/arm64/include/asm/mmu.h
> > +++ b/arch/arm64/include/asm/mmu.h
> > @@ -25,6 +25,7 @@ typedef struct {
> >   	refcount_t	pinned;
> >   	void		*vdso;
> >   	unsigned long	flags;
> > +	u8		pkey_allocation_map;
> >   } mm_context_t;
> >   /*
> > diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
> > index c768d16b81a4..cb499db7a97b 100644
> > --- a/arch/arm64/include/asm/mmu_context.h
> > +++ b/arch/arm64/include/asm/mmu_context.h
> > @@ -15,12 +15,12 @@
> >   #include <linux/sched/hotplug.h>
> >   #include <linux/mm_types.h>
> >   #include <linux/pgtable.h>
> > +#include <linux/pkeys.h>
> >   #include <asm/cacheflush.h>
> >   #include <asm/cpufeature.h>
> >   #include <asm/daifflags.h>
> >   #include <asm/proc-fns.h>
> > -#include <asm-generic/mm_hooks.h>
> >   #include <asm/cputype.h>
> >   #include <asm/sysreg.h>
> >   #include <asm/tlbflush.h>
> > @@ -175,9 +175,36 @@ init_new_context(struct task_struct *tsk, struct mm_struct *mm)
> >   {
> >   	atomic64_set(&mm->context.id, 0);
> >   	refcount_set(&mm->context.pinned, 0);
> > +
> > +	/* pkey 0 is the default, so always reserve it. */
> > +	mm->context.pkey_allocation_map = 0x1;
> > +
> > +	return 0;
> > +}
> > +
> > +static inline void arch_dup_pkeys(struct mm_struct *oldmm,
> > +				  struct mm_struct *mm)
> > +{
> > +	/* Duplicate the oldmm pkey state in mm: */
> > +	mm->context.pkey_allocation_map = oldmm->context.pkey_allocation_map;
> > +}
> > +
> > +static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
> > +{
> > +	arch_dup_pkeys(oldmm, mm);
> > +
> >   	return 0;
> >   }
> > +static inline void arch_exit_mmap(struct mm_struct *mm)
> > +{
> > +}
> > +
> > +static inline void arch_unmap(struct mm_struct *mm,
> > +			unsigned long start, unsigned long end)
> > +{
> > +}
> > +
> >   #ifdef CONFIG_ARM64_SW_TTBR0_PAN
> >   static inline void update_saved_ttbr0(struct task_struct *tsk,
> >   				      struct mm_struct *mm)
> > @@ -267,6 +294,28 @@ static inline unsigned long mm_untag_mask(struct mm_struct *mm)
> >   	return -1UL >> 8;
> >   }
> > +/*
> > + * We only want to enforce protection keys on the current process
> > + * because we effectively have no access to POR_EL0 for other
> > + * processes or any way to tell *which * POR_EL0 in a threaded
> > + * process we could use.
> > + *
> > + * So do not enforce things if the VMA is not from the current
> > + * mm, or if we are in a kernel thread.
> > + */
> > +static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
> > +		bool write, bool execute, bool foreign)
> > +{
> > +	if (!arch_pkeys_enabled())
> > +		return true;
> 
> The above check can be dropped as the caller of this function
> fault_from_pkey() does the same check.

arch_vma_access_permitted() is called by other places in the kernel, so I need to leave that check in.

Thanks,
Joey

> 
> Thanks,
> Amit
> 
> > +
> > +	/* allow access if the VMA is not one from this process */
> > +	if (foreign || vma_is_foreign(vma))
> > +		return true;
> > +
> > +	return por_el0_allows_pkey(vma_pkey(vma), write, execute);
> > +}
> > +
> >   #include <asm-generic/mmu_context.h>
> >   #endif /* !__ASSEMBLY__ */
> > diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> > index 2449e4e27ea6..8ee68ff03016 100644
> > --- a/arch/arm64/include/asm/pgtable.h
> > +++ b/arch/arm64/include/asm/pgtable.h
> > @@ -34,6 +34,7 @@
> >   #include <asm/cmpxchg.h>
> >   #include <asm/fixmap.h>
> > +#include <asm/por.h>
> >   #include <linux/mmdebug.h>
> >   #include <linux/mm_types.h>
> >   #include <linux/sched.h>
> > @@ -153,6 +154,24 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
> >   #define pte_accessible(mm, pte)	\
> >   	(mm_tlb_flush_pending(mm) ? pte_present(pte) : pte_valid(pte))
> > +static inline bool por_el0_allows_pkey(u8 pkey, bool write, bool execute)
> > +{
> > +	u64 por;
> > +
> > +	if (!system_supports_poe())
> > +		return true;
> > +
> > +	por = read_sysreg_s(SYS_POR_EL0);
> > +
> > +	if (write)
> > +		return por_elx_allows_write(por, pkey);
> > +
> > +	if (execute)
> > +		return por_elx_allows_exec(por, pkey);
> > +
> > +	return por_elx_allows_read(por, pkey);
> > +}
> > +
> >   /*
> >    * p??_access_permitted() is true for valid user mappings (PTE_USER
> >    * bit set, subject to the write permission check). For execute-only
> > @@ -163,7 +182,8 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
> >   #define pte_access_permitted_no_overlay(pte, write) \
> >   	(((pte_val(pte) & (PTE_VALID | PTE_USER)) == (PTE_VALID | PTE_USER)) && (!(write) || pte_write(pte)))
> >   #define pte_access_permitted(pte, write) \
> > -	pte_access_permitted_no_overlay(pte, write)
> > +	(pte_access_permitted_no_overlay(pte, write) && \
> > +	por_el0_allows_pkey(FIELD_GET(PTE_PO_IDX_MASK, pte_val(pte)), write, false))
> >   #define pmd_access_permitted(pmd, write) \
> >   	(pte_access_permitted(pmd_pte(pmd), (write)))
> >   #define pud_access_permitted(pud, write) \
> > diff --git a/arch/arm64/include/asm/pkeys.h b/arch/arm64/include/asm/pkeys.h
> > new file mode 100644
> > index 000000000000..a284508a4d02
> > --- /dev/null
> > +++ b/arch/arm64/include/asm/pkeys.h
> > @@ -0,0 +1,110 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +/*
> > + * Copyright (C) 2023 Arm Ltd.
> > + *
> > + * Based on arch/x86/include/asm/pkeys.h
> > + */
> > +
> > +#ifndef _ASM_ARM64_PKEYS_H
> > +#define _ASM_ARM64_PKEYS_H
> > +
> > +#define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2)
> > +
> > +#define arch_max_pkey() 7
> > +
> > +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
> > +		unsigned long init_val);
> > +
> > +static inline bool arch_pkeys_enabled(void)
> > +{
> > +	return false;
> > +}
> > +
> > +static inline int vma_pkey(struct vm_area_struct *vma)
> > +{
> > +	return (vma->vm_flags & ARCH_VM_PKEY_FLAGS) >> VM_PKEY_SHIFT;
> > +}
> > +
> > +static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma,
> > +		int prot, int pkey)
> > +{
> > +	if (pkey != -1)
> > +		return pkey;
> > +
> > +	return vma_pkey(vma);
> > +}
> > +
> > +static inline int execute_only_pkey(struct mm_struct *mm)
> > +{
> > +	// Execute-only mappings are handled by EPAN/FEAT_PAN3.
> > +	WARN_ON_ONCE(!cpus_have_final_cap(ARM64_HAS_EPAN));
> > +
> > +	return -1;
> > +}
> > +
> > +#define mm_pkey_allocation_map(mm)	(mm->context.pkey_allocation_map)
> > +#define mm_set_pkey_allocated(mm, pkey) do {		\
> > +	mm_pkey_allocation_map(mm) |= (1U << pkey);	\
> > +} while (0)
> > +#define mm_set_pkey_free(mm, pkey) do {			\
> > +	mm_pkey_allocation_map(mm) &= ~(1U << pkey);	\
> > +} while (0)
> > +
> > +static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey)
> > +{
> > +	/*
> > +	 * "Allocated" pkeys are those that have been returned
> > +	 * from pkey_alloc() or pkey 0 which is allocated
> > +	 * implicitly when the mm is created.
> > +	 */
> > +	if (pkey < 0)
> > +		return false;
> > +	if (pkey >= arch_max_pkey())
> > +		return false;
> > +
> > +	return mm_pkey_allocation_map(mm) & (1U << pkey);
> > +}
> > +
> > +/*
> > + * Returns a positive, 3-bit key on success, or -1 on failure.
> > + */
> > +static inline int mm_pkey_alloc(struct mm_struct *mm)
> > +{
> > +	/*
> > +	 * Note: this is the one and only place we make sure
> > +	 * that the pkey is valid as far as the hardware is
> > +	 * concerned.  The rest of the kernel trusts that
> > +	 * only good, valid pkeys come out of here.
> > +	 */
> > +	u8 all_pkeys_mask = ((1U << arch_max_pkey()) - 1);
> > +	int ret;
> > +
> > +	if (!arch_pkeys_enabled())
> > +		return -1;
> > +
> > +	/*
> > +	 * Are we out of pkeys?  We must handle this specially
> > +	 * because ffz() behavior is undefined if there are no
> > +	 * zeros.
> > +	 */
> > +	if (mm_pkey_allocation_map(mm) == all_pkeys_mask)
> > +		return -1;
> > +
> > +	ret = ffz(mm_pkey_allocation_map(mm));
> > +
> > +	mm_set_pkey_allocated(mm, ret);
> > +
> > +	return ret;
> > +}
> > +
> > +static inline int mm_pkey_free(struct mm_struct *mm, int pkey)
> > +{
> > +	if (!mm_pkey_is_allocated(mm, pkey))
> > +		return -EINVAL;
> > +
> > +	mm_set_pkey_free(mm, pkey);
> > +
> > +	return 0;
> > +}
> > +
> > +#endif /* _ASM_ARM64_PKEYS_H */
> > diff --git a/arch/arm64/include/asm/por.h b/arch/arm64/include/asm/por.h
> > new file mode 100644
> > index 000000000000..d6604e0c5c54
> > --- /dev/null
> > +++ b/arch/arm64/include/asm/por.h
> > @@ -0,0 +1,33 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +/*
> > + * Copyright (C) 2023 Arm Ltd.
> > + */
> > +
> > +#ifndef _ASM_ARM64_POR_H
> > +#define _ASM_ARM64_POR_H
> > +
> > +#define POR_BITS_PER_PKEY		4
> > +#define POR_ELx_IDX(por_elx, idx)	(((por_elx) >> (idx * POR_BITS_PER_PKEY)) & 0xf)
> > +
> > +static inline bool por_elx_allows_read(u64 por, u8 pkey)
> > +{
> > +	u8 perm = POR_ELx_IDX(por, pkey);
> > +
> > +	return perm & POE_R;
> > +}
> > +
> > +static inline bool por_elx_allows_write(u64 por, u8 pkey)
> > +{
> > +	u8 perm = POR_ELx_IDX(por, pkey);
> > +
> > +	return perm & POE_W;
> > +}
> > +
> > +static inline bool por_elx_allows_exec(u64 por, u8 pkey)
> > +{
> > +	u8 perm = POR_ELx_IDX(por, pkey);
> > +
> > +	return perm & POE_X;
> > +}
> > +
> > +#endif /* _ASM_ARM64_POR_H */
> > diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> > index 495b732d5af3..e50ccc86d150 100644
> > --- a/arch/arm64/mm/mmu.c
> > +++ b/arch/arm64/mm/mmu.c
> > @@ -25,6 +25,7 @@
> >   #include <linux/vmalloc.h>
> >   #include <linux/set_memory.h>
> >   #include <linux/kfence.h>
> > +#include <linux/pkeys.h>
> >   #include <asm/barrier.h>
> >   #include <asm/cputype.h>
> > @@ -1535,3 +1536,42 @@ void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp)
> >   	cpu_uninstall_idmap();
> >   }
> > +
> > +#ifdef CONFIG_ARCH_HAS_PKEYS
> > +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val)
> > +{
> > +	u64 new_por = POE_RXW;
> > +	u64 old_por;
> > +	u64 pkey_shift;
> > +
> > +	if (!arch_pkeys_enabled())
> > +		return -ENOSPC;
> > +
> > +	/*
> > +	 * This code should only be called with valid 'pkey'
> > +	 * values originating from in-kernel users.  Complain
> > +	 * if a bad value is observed.
> > +	 */
> > +	if (WARN_ON_ONCE(pkey >= arch_max_pkey()))
> > +		return -EINVAL;
> > +
> > +	/* Set the bits we need in POR:  */
> > +	if (init_val & PKEY_DISABLE_ACCESS)
> > +		new_por = POE_X;
> > +	else if (init_val & PKEY_DISABLE_WRITE)
> > +		new_por = POE_RX;
> > +
> > +	/* Shift the bits in to the correct place in POR for pkey: */
> > +	pkey_shift = pkey * POR_BITS_PER_PKEY;
> > +	new_por <<= pkey_shift;
> > +
> > +	/* Get old POR and mask off any old bits in place: */
> > +	old_por = read_sysreg_s(SYS_POR_EL0);
> > +	old_por &= ~(POE_MASK << pkey_shift);
> > +
> > +	/* Write old part along with new part: */
> > +	write_sysreg_s(old_por | new_por, SYS_POR_EL0);
> > +
> > +	return 0;
> > +}
> > +#endif
Szabolcs Nagy May 31, 2024, 2:57 p.m. UTC | #3
The 05/03/2024 14:01, Joey Gouly wrote:
> Implement the PKEYS interface, using the Permission Overlay Extension.
...
> +#ifdef CONFIG_ARCH_HAS_PKEYS
> +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val)
> +{
> +	u64 new_por = POE_RXW;
> +	u64 old_por;
> +	u64 pkey_shift;
> +
> +	if (!arch_pkeys_enabled())
> +		return -ENOSPC;
> +
> +	/*
> +	 * This code should only be called with valid 'pkey'
> +	 * values originating from in-kernel users.  Complain
> +	 * if a bad value is observed.
> +	 */
> +	if (WARN_ON_ONCE(pkey >= arch_max_pkey()))
> +		return -EINVAL;
> +
> +	/* Set the bits we need in POR:  */
> +	if (init_val & PKEY_DISABLE_ACCESS)
> +		new_por = POE_X;
> +	else if (init_val & PKEY_DISABLE_WRITE)
> +		new_por = POE_RX;
> +

given that the architecture allows r,w,x permissions to be
set independently, should we have a 'PKEY_DISABLE_EXEC' or
similar api flag?

(on other targets it can be some invalid value that fails)

> +	/* Shift the bits in to the correct place in POR for pkey: */
> +	pkey_shift = pkey * POR_BITS_PER_PKEY;
> +	new_por <<= pkey_shift;
> +
> +	/* Get old POR and mask off any old bits in place: */
> +	old_por = read_sysreg_s(SYS_POR_EL0);
> +	old_por &= ~(POE_MASK << pkey_shift);
> +
> +	/* Write old part along with new part: */
> +	write_sysreg_s(old_por | new_por, SYS_POR_EL0);
> +
> +	return 0;
> +}
> +#endif
> -- 
> 2.25.1
>
Joey Gouly May 31, 2024, 3:21 p.m. UTC | #4
Hi Szabolcs,

On Fri, May 31, 2024 at 03:57:07PM +0100, Szabolcs Nagy wrote:
> The 05/03/2024 14:01, Joey Gouly wrote:
> > Implement the PKEYS interface, using the Permission Overlay Extension.
> ...
> > +#ifdef CONFIG_ARCH_HAS_PKEYS
> > +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val)
> > +{
> > +	u64 new_por = POE_RXW;
> > +	u64 old_por;
> > +	u64 pkey_shift;
> > +
> > +	if (!arch_pkeys_enabled())
> > +		return -ENOSPC;
> > +
> > +	/*
> > +	 * This code should only be called with valid 'pkey'
> > +	 * values originating from in-kernel users.  Complain
> > +	 * if a bad value is observed.
> > +	 */
> > +	if (WARN_ON_ONCE(pkey >= arch_max_pkey()))
> > +		return -EINVAL;
> > +
> > +	/* Set the bits we need in POR:  */
> > +	if (init_val & PKEY_DISABLE_ACCESS)
> > +		new_por = POE_X;
> > +	else if (init_val & PKEY_DISABLE_WRITE)
> > +		new_por = POE_RX;
> > +
> 
> given that the architecture allows r,w,x permissions to be
> set independently, should we have a 'PKEY_DISABLE_EXEC' or
> similar api flag?
> 
> (on other targets it can be some invalid value that fails)

I didn't think about the best way to do that yet. PowerPC has a PKEY_DISABLE_EXECUTE.

We could either make that generic, and X86 has to error if it sees that bit, or
we add a arch-specific PKEY_DISABLE_EXECUTE like PowerPC.

A user can still set it by interacting with the register directly, but I guess
we want something for the glibc interface..

Dave, any thoughts here?

> 
> > +	/* Shift the bits in to the correct place in POR for pkey: */
> > +	pkey_shift = pkey * POR_BITS_PER_PKEY;
> > +	new_por <<= pkey_shift;
> > +
> > +	/* Get old POR and mask off any old bits in place: */
> > +	old_por = read_sysreg_s(SYS_POR_EL0);
> > +	old_por &= ~(POE_MASK << pkey_shift);
> > +
> > +	/* Write old part along with new part: */
> > +	write_sysreg_s(old_por | new_por, SYS_POR_EL0);
> > +
> > +	return 0;
> > +}
> > +#endif

Thanks,
Joey
Szabolcs Nagy May 31, 2024, 4:27 p.m. UTC | #5
The 05/31/2024 16:21, Joey Gouly wrote:
> Hi Szabolcs,
> 
> On Fri, May 31, 2024 at 03:57:07PM +0100, Szabolcs Nagy wrote:
> > The 05/03/2024 14:01, Joey Gouly wrote:
> > > Implement the PKEYS interface, using the Permission Overlay Extension.
> > ...
> > > +#ifdef CONFIG_ARCH_HAS_PKEYS
> > > +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val)
> > > +{
> > > +	u64 new_por = POE_RXW;
> > > +	u64 old_por;
> > > +	u64 pkey_shift;
> > > +
> > > +	if (!arch_pkeys_enabled())
> > > +		return -ENOSPC;
> > > +
> > > +	/*
> > > +	 * This code should only be called with valid 'pkey'
> > > +	 * values originating from in-kernel users.  Complain
> > > +	 * if a bad value is observed.
> > > +	 */
> > > +	if (WARN_ON_ONCE(pkey >= arch_max_pkey()))
> > > +		return -EINVAL;
> > > +
> > > +	/* Set the bits we need in POR:  */
> > > +	if (init_val & PKEY_DISABLE_ACCESS)
> > > +		new_por = POE_X;
> > > +	else if (init_val & PKEY_DISABLE_WRITE)
> > > +		new_por = POE_RX;
> > > +
> > 
> > given that the architecture allows r,w,x permissions to be
> > set independently, should we have a 'PKEY_DISABLE_EXEC' or
> > similar api flag?
> > 
> > (on other targets it can be some invalid value that fails)
> 
> I didn't think about the best way to do that yet. PowerPC has a PKEY_DISABLE_EXECUTE.
> 
> We could either make that generic, and X86 has to error if it sees that bit, or
> we add a arch-specific PKEY_DISABLE_EXECUTE like PowerPC.

this does not seem to be in glibc yet. (or in linux man pages)

i guess you can copy whatever ppc does.

> 
> A user can still set it by interacting with the register directly, but I guess
> we want something for the glibc interface..
> 
> Dave, any thoughts here?

adding Florian too, since i found an old thread of his that tried
to add separate PKEY_DISABLE_READ and PKEY_DISABLE_EXECUTE, but
it did not seem to end up upstream. (this makes more sense to me
as libc api than the weird disable access semantics)
Florian Weimer June 17, 2024, 1:40 p.m. UTC | #6
* Szabolcs Nagy:

>> A user can still set it by interacting with the register directly, but I guess
>> we want something for the glibc interface..
>> 
>> Dave, any thoughts here?
>
> adding Florian too, since i found an old thread of his that tried
> to add separate PKEY_DISABLE_READ and PKEY_DISABLE_EXECUTE, but
> it did not seem to end up upstream. (this makes more sense to me
> as libc api than the weird disable access semantics)

I still think it makes sense to have a full complement of PKEY_* flags
complementing the PROT_* flags, in a somewhat abstract fashion for
pkey_alloc only.  The internal protection mask register encoding will
differ from architecture to architecture, but the abstract glibc
functions pkey_set and pkey_get could use them (if we are a bit
careful).

Thanks,
Florian
Szabolcs Nagy June 17, 2024, 2:51 p.m. UTC | #7
The 06/17/2024 15:40, Florian Weimer wrote:
> >> A user can still set it by interacting with the register directly, but I guess
> >> we want something for the glibc interface..
> >> 
> >> Dave, any thoughts here?
> >
> > adding Florian too, since i found an old thread of his that tried
> > to add separate PKEY_DISABLE_READ and PKEY_DISABLE_EXECUTE, but
> > it did not seem to end up upstream. (this makes more sense to me
> > as libc api than the weird disable access semantics)
> 
> I still think it makes sense to have a full complenent of PKEY_* flags
> complementing the PROT_* flags, in a somewhat abstract fashion for
> pkey_alloc only.  The internal protection mask register encoding will
> differ from architecture to architecture, but the abstract glibc
> functions pkey_set and pkey_get could use them (if we are a bit
> careful).

to me it makes sense to have abstract

PKEY_DISABLE_READ
PKEY_DISABLE_WRITE
PKEY_DISABLE_EXECUTE
PKEY_DISABLE_ACCESS

where access is handled like

if (flags&PKEY_DISABLE_ACCESS)
	flags |= PKEY_DISABLE_READ|PKEY_DISABLE_WRITE;
disable_read = flags&PKEY_DISABLE_READ;
disable_write = flags&PKEY_DISABLE_WRITE;
disable_exec = flags&PKEY_DISABLE_EXECUTE;

if there are unsupported combinations like
disable_read&&!disable_write then those are rejected
by pkey_alloc and pkey_set.

this allows portable use of pkey apis.
(the flags could be target specific, but don't have to be)
Catalin Marinas July 5, 2024, 4:59 p.m. UTC | #8
On Fri, May 03, 2024 at 02:01:35PM +0100, Joey Gouly wrote:
> @@ -163,7 +182,8 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
>  #define pte_access_permitted_no_overlay(pte, write) \
>  	(((pte_val(pte) & (PTE_VALID | PTE_USER)) == (PTE_VALID | PTE_USER)) && (!(write) || pte_write(pte)))
>  #define pte_access_permitted(pte, write) \
> -	pte_access_permitted_no_overlay(pte, write)
> +	(pte_access_permitted_no_overlay(pte, write) && \
> +	por_el0_allows_pkey(FIELD_GET(PTE_PO_IDX_MASK, pte_val(pte)), write, false))

I'm still not entirely convinced on checking the keys during fast GUP
but that's what x86 and powerpc do already, so I guess we'll follow the
same ABI.

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Catalin Marinas July 8, 2024, 5:53 p.m. UTC | #9
Hi Szabolcs,

On Mon, Jun 17, 2024 at 03:51:35PM +0100, Szabolcs Nagy wrote:
> The 06/17/2024 15:40, Florian Weimer wrote:
> > >> A user can still set it by interacting with the register directly, but I guess
> > >> we want something for the glibc interface..
> > >> 
> > >> Dave, any thoughts here?
> > >
> > > adding Florian too, since i found an old thread of his that tried
> > > to add separate PKEY_DISABLE_READ and PKEY_DISABLE_EXECUTE, but
> > > it did not seem to end up upstream. (this makes more sense to me
> > > as libc api than the weird disable access semantics)
> > 
> > I still think it makes sense to have a full complenent of PKEY_* flags
> > complementing the PROT_* flags, in a somewhat abstract fashion for
> > pkey_alloc only.  The internal protection mask register encoding will
> > differ from architecture to architecture, but the abstract glibc
> > functions pkey_set and pkey_get could use them (if we are a bit
> > careful).
> 
> to me it makes sense to have abstract
> 
> PKEY_DISABLE_READ
> PKEY_DISABLE_WRITE
> PKEY_DISABLE_EXECUTE
> PKEY_DISABLE_ACCESS
> 
> where access is handled like
> 
> if (flags&PKEY_DISABLE_ACCESS)
> 	flags |= PKEY_DISABLE_READ|PKEY_DISABLE_WRITE;
> disable_read = flags&PKEY_DISABLE_READ;
> disable_write = flags&PKEY_DISABLE_WRITE;
> disable_exec = flags&PKEY_DISABLE_EXECUTE;
> 
> if there are unsupported combinations like
> disable_read&&!disable_write then those are rejected
> by pkey_alloc and pkey_set.
> 
> this allows portable use of pkey apis.
> (the flags could be target specific, but don't have to be)

On powerpc, PKEY_DISABLE_ACCESS also disables execution. AFAICT, the
kernel doesn't define a PKEY_DISABLE_READ, only PKEY_DISABLE_ACCESS so
for powerpc there's no way to set an execute-only permission via this
interface. I wouldn't like to diverge from powerpc.

However, does it matter much? That's only for the initial setup, the
user can then change the permissions directly via the sysreg. So maybe
we don't need all those combinations upfront. A PKEY_DISABLE_EXECUTE
together with the full PKEY_DISABLE_ACCESS would probably suffice.

Given that on x86 the PKEY_ACCESS_MASK will have to stay as
PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE, we'll probably do the same as
powerpc and define an arm64 specific PKEY_DISABLE_EXECUTE with the
corresponding PKEY_ACCESS_MASK including it. We can generalise the masks
with some ARCH_HAS_PKEY_DISABLE_EXECUTE but it's probably more hassle
than just defining the arm64 PKEY_DISABLE_EXECUTE.

I assume you'd like PKEY_DISABLE_EXECUTE to be part of this series,
otherwise changing PKEY_ACCESS_MASK later will cause potential ABI
issues.
Szabolcs Nagy July 9, 2024, 8:32 a.m. UTC | #10
The 07/08/2024 18:53, Catalin Marinas wrote:
> Hi Szabolcs,
> 
> On Mon, Jun 17, 2024 at 03:51:35PM +0100, Szabolcs Nagy wrote:
> > The 06/17/2024 15:40, Florian Weimer wrote:
> > > >> A user can still set it by interacting with the register directly, but I guess
> > > >> we want something for the glibc interface..
> > > >> 
> > > >> Dave, any thoughts here?
> > > >
> > > > adding Florian too, since i found an old thread of his that tried
> > > > to add separate PKEY_DISABLE_READ and PKEY_DISABLE_EXECUTE, but
> > > > it did not seem to end up upstream. (this makes more sense to me
> > > > as libc api than the weird disable access semantics)
> > > 
> > > I still think it makes sense to have a full complenent of PKEY_* flags
> > > complementing the PROT_* flags, in a somewhat abstract fashion for
> > > pkey_alloc only.  The internal protection mask register encoding will
> > > differ from architecture to architecture, but the abstract glibc
> > > functions pkey_set and pkey_get could use them (if we are a bit
> > > careful).
> > 
> > to me it makes sense to have abstract
> > 
> > PKEY_DISABLE_READ
> > PKEY_DISABLE_WRITE
> > PKEY_DISABLE_EXECUTE
> > PKEY_DISABLE_ACCESS
> > 
> > where access is handled like
> > 
> > if (flags&PKEY_DISABLE_ACCESS)
> > 	flags |= PKEY_DISABLE_READ|PKEY_DISABLE_WRITE;
> > disable_read = flags&PKEY_DISABLE_READ;
> > disable_write = flags&PKEY_DISABLE_WRITE;
> > disable_exec = flags&PKEY_DISABLE_EXECUTE;
> > 
> > if there are unsupported combinations like
> > disable_read&&!disable_write then those are rejected
> > by pkey_alloc and pkey_set.
> > 
> > this allows portable use of pkey apis.
> > (the flags could be target specific, but don't have to be)
> 
> On powerpc, PKEY_DISABLE_ACCESS also disables execution. AFAICT, the
> kernel doesn't define a PKEY_DISABLE_READ, only PKEY_DISABLE_ACCESS so
> for powerpc there's no way to to set an execute-only permission via this
> interface. I wouldn't like to diverge from powerpc.

the exec permission should be documented in the man.
and i think it should be consistent across targets
to allow portable use.

now ppc and x86 are inconsistent, i think it's not
ideal, but ok to say that targets without disable-exec
support do whatever x86 does with PKEY_DISABLE_ACCESS
otherwise it means whatever ppc does.

> 
> However, does it matter much? That's only for the initial setup, the
> user can then change the permissions directly via the sysreg. So maybe
> we don't need all those combinations upfront. A PKEY_DISABLE_EXECUTE
> together with the full PKEY_DISABLE_ACCESS would probably suffice.

this is ok.

a bit awkward in userspace when the register is directly
set to e.g write-only and pkey_get has to return something,
but we can handle settings outside of valid PKEY_* macros
as unspec, users who want that would use their own register
set/get code.

i would have designed the permission to use either existing
PROT_* flags or say that it is architectural and written to
the register directly and let the libc wrapper deal with
portable api, i guess it's too late now.

(the signal handling behaviour should have a control and it
is possible to fix e.g. via pkey_alloc flags, but that may
not be the best solution and this can be done later.)

> 
> Give that on x86 the PKEY_ACCESS_MASK will have to stay as
> PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE, we'll probably do the same as
> powerpc and define an arm64 specific PKEY_DISABLE_EXECUTE with the
> corresponding PKEY_ACCESS_MASK including it. We can generalise the masks
> with some ARCH_HAS_PKEY_DISABLE_EXECUTE but it's probably more hassle
> than just defining the arm64 PKEY_DISABLE_EXECUTE.
> 
> I assume you'd like PKEY_DISABLE_EXECUTE to be part of this series,
> otherwise changing PKEY_ACCESS_MASK later will cause potential ABI
> issues.

yes i think we should figure this out in the initial support.
Florian Weimer July 9, 2024, 8:52 a.m. UTC | #11
* Szabolcs Nagy:

>> However, does it matter much? That's only for the initial setup, the
>> user can then change the permissions directly via the sysreg. So maybe
>> we don't need all those combinations upfront. A PKEY_DISABLE_EXECUTE
>> together with the full PKEY_DISABLE_ACCESS would probably suffice.
>
> this is ok.
>
> a bit awkward in userspace when the register is directly
> set to e.g write-only and pkey_get has to return something,
> but we can handle settings outside of valid PKEY_* macros
> as unspec, users who want that would use their own register
> set/get code.
>
> i would have designed the permission to use either existing
> PROT_* flags or say that it is architectural and written to
> the register directly and let the libc wrapper deal with
> portable api, i guess it's too late now.

We can still define a portable API if we get a few more PKEY_* bits.
The last attempt stalled because the kernel does not really need them,
it would be for userspace benefit only.

For performance-critical code, pkey_get/pkey_set are already too slow,
so adding a bit more bit twiddling to it wouldn't be a problem, I think.
Applications that want to change protection key bits around a very short
code sequence will have to write the architecture-specific register.

> (the signal handling behaviour should have a control and it
> is possible to fix e.g. via pkey_alloc flags, but that may
> not be the best solution and this can be done later.)

For glibc, the POWER behavior is much more useful.

Thanks,
Florian
Kevin Brodsky July 9, 2024, 1:07 p.m. UTC | #12
On 03/05/2024 15:01, Joey Gouly wrote:
> @@ -267,6 +294,28 @@ static inline unsigned long mm_untag_mask(struct mm_struct *mm)
>  	return -1UL >> 8;
>  }
>  
> +/*
> + * We only want to enforce protection keys on the current process
> + * because we effectively have no access to POR_EL0 for other
> + * processes or any way to tell *which * POR_EL0 in a threaded
> + * process we could use.

I see that this comment is essentially copied from x86, but to me it
misses the main point. Even with only one thread in the target process
and a way to obtain its POR_EL0, it still wouldn't make sense to check
that value. If we take the case of a debugger accessing an inferior via
ptrace(), for instance, the kernel is asked to access some memory in
another mm. However, the debugger's POR_EL0 is tied to its own address
space, and the target's POR_EL0 is relevant to its own execution flow
only. In such situations, there is essentially no user context for the
access, so it fundamentally does not make sense to make checks based on
pkey/POE or similar restrictions to memory accesses (e.g. MTE).

Kevin

> + *
> + * So do not enforce things if the VMA is not from the current
> + * mm, or if we are in a kernel thread.
> + */
> +static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
> +		bool write, bool execute, bool foreign)
> +{
> +	if (!arch_pkeys_enabled())
> +		return true;
> +
> +	/* allow access if the VMA is not one from this process */
> +	if (foreign || vma_is_foreign(vma))
> +		return true;
> +
> +	return por_el0_allows_pkey(vma_pkey(vma), write, execute);
> +}
> +
Joey Gouly July 11, 2024, 9:50 a.m. UTC | #13
On Mon, Jul 08, 2024 at 06:53:18PM +0100, Catalin Marinas wrote:
> Hi Szabolcs,
> 
> On Mon, Jun 17, 2024 at 03:51:35PM +0100, Szabolcs Nagy wrote:
> > The 06/17/2024 15:40, Florian Weimer wrote:
> > > >> A user can still set it by interacting with the register directly, but I guess
> > > >> we want something for the glibc interface..
> > > >> 
> > > >> Dave, any thoughts here?
> > > >
> > > > adding Florian too, since i found an old thread of his that tried
> > > > to add separate PKEY_DISABLE_READ and PKEY_DISABLE_EXECUTE, but
> > > > it did not seem to end up upstream. (this makes more sense to me
> > > > as libc api than the weird disable access semantics)
> > > 
> > > I still think it makes sense to have a full complenent of PKEY_* flags
> > > complementing the PROT_* flags, in a somewhat abstract fashion for
> > > pkey_alloc only.  The internal protection mask register encoding will
> > > differ from architecture to architecture, but the abstract glibc
> > > functions pkey_set and pkey_get could use them (if we are a bit
> > > careful).
> > 
> > to me it makes sense to have abstract
> > 
> > PKEY_DISABLE_READ
> > PKEY_DISABLE_WRITE
> > PKEY_DISABLE_EXECUTE
> > PKEY_DISABLE_ACCESS
> > 
> > where access is handled like
> > 
> > if (flags&PKEY_DISABLE_ACCESS)
> > 	flags |= PKEY_DISABLE_READ|PKEY_DISABLE_WRITE;
> > disable_read = flags&PKEY_DISABLE_READ;
> > disable_write = flags&PKEY_DISABLE_WRITE;
> > disable_exec = flags&PKEY_DISABLE_EXECUTE;
> > 
> > if there are unsupported combinations like
> > disable_read&&!disable_write then those are rejected
> > by pkey_alloc and pkey_set.
> > 
> > this allows portable use of pkey apis.
> > (the flags could be target specific, but don't have to be)
> 
> On powerpc, PKEY_DISABLE_ACCESS also disables execution. AFAICT, the
> kernel doesn't define a PKEY_DISABLE_READ, only PKEY_DISABLE_ACCESS so
> for powerpc there's no way to to set an execute-only permission via this
> interface. I wouldn't like to diverge from powerpc.

I think this is wrong, look at this code from powerpc:

arch/powerpc/mm/book3s64/pkeys.c: __arch_set_user_pkey_access

        if (init_val & PKEY_DISABLE_EXECUTE) {
                if (!pkey_execute_disable_supported)
                        return -EINVAL;
                new_iamr_bits |= IAMR_EX_BIT;
        }
        init_iamr(pkey, new_iamr_bits);

        /* Set the bits we need in AMR: */
        if (init_val & PKEY_DISABLE_ACCESS)
                new_amr_bits |= AMR_RD_BIT | AMR_WR_BIT;
        else if (init_val & PKEY_DISABLE_WRITE)
                new_amr_bits |= AMR_WR_BIT;

        init_amr(pkey, new_amr_bits);

Seems to me that PKEY_DISABLE_ACCESS leaves exec permissions as-is.

Here is the patch I am planning to include in the next version of the series.
This should support all PKEY_DISABLE_* combinations. Any comments? 

commit ba51371a544f6b0a4a0f03df62ad894d53f5039b
Author: Joey Gouly <joey.gouly@arm.com>
Date:   Thu Jul 4 11:29:20 2024 +0100

    arm64: add PKEY_DISABLE_READ and PKEY_DISABLE_EXEC
    
    TODO
    
    Signed-off-by: Joey Gouly <joey.gouly@arm.com>

diff --git arch/arm64/include/uapi/asm/mman.h arch/arm64/include/uapi/asm/mman.h
index 1e6482a838e1..e7e0c8216243 100644
--- arch/arm64/include/uapi/asm/mman.h
+++ arch/arm64/include/uapi/asm/mman.h
@@ -7,4 +7,13 @@
 #define PROT_BTI       0x10            /* BTI guarded page */
 #define PROT_MTE       0x20            /* Normal Tagged mapping */
 
+/* Override any generic PKEY permission defines */
+#define PKEY_DISABLE_EXECUTE   0x4
+#define PKEY_DISABLE_READ      0x8
+#undef PKEY_ACCESS_MASK
+#define PKEY_ACCESS_MASK       (PKEY_DISABLE_ACCESS |\
+                               PKEY_DISABLE_WRITE  |\
+                               PKEY_DISABLE_READ   |\
+                               PKEY_DISABLE_EXECUTE)
+
 #endif /* ! _UAPI__ASM_MMAN_H */
diff --git arch/arm64/mm/mmu.c arch/arm64/mm/mmu.c
index 68afe5fc3071..ce4cc6bdee4e 100644
--- arch/arm64/mm/mmu.c
+++ arch/arm64/mm/mmu.c
@@ -1570,10 +1570,15 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long i
                return -EINVAL;
 
        /* Set the bits we need in POR:  */
+       new_por = POE_RXW;
+       if (init_val & PKEY_DISABLE_WRITE)
+               new_por &= ~POE_W;
        if (init_val & PKEY_DISABLE_ACCESS)
-               new_por = POE_X;
-       else if (init_val & PKEY_DISABLE_WRITE)
-               new_por = POE_RX;
+               new_por &= ~POE_RW;
+       if (init_val & PKEY_DISABLE_READ)
+               new_por &= ~POE_R;
+       if (init_val & PKEY_DISABLE_EXECUTE)
+               new_por &= ~POE_X;
 
        /* Shift the bits in to the correct place in POR for pkey: */
        pkey_shift = pkey * POR_BITS_PER_PKEY;



Thanks,
Joey
Anshuman Khandual July 16, 2024, 11:40 a.m. UTC | #14
On 7/9/24 18:37, Kevin Brodsky wrote:
> On 03/05/2024 15:01, Joey Gouly wrote:
>> @@ -267,6 +294,28 @@ static inline unsigned long mm_untag_mask(struct mm_struct *mm)
>>  	return -1UL >> 8;
>>  }
>>  
>> +/*
>> + * We only want to enforce protection keys on the current process
>> + * because we effectively have no access to POR_EL0 for other
>> + * processes or any way to tell *which * POR_EL0 in a threaded
>> + * process we could use.
> 
> I see that this comment is essentially copied from x86, but to me it
> misses the main point. Even with only one thread in the target process
> and a way to obtain its POR_EL0, it still wouldn't make sense to check
> that value. If we take the case of a debugger accessing an inferior via
> ptrace(), for instance, the kernel is asked to access some memory in
> another mm. However, the debugger's POR_EL0 is tied to its own address
> space, and the target's POR_EL0 is relevant to its own execution flow
> only. In such situations, there is essentially no user context for the
> access, so It fundamentally does not make sense to make checks based on
> pkey/POE or similar restrictions to memory accesses (e.g. MTE).

Indeed this makes more sense. There is no memory context even if there is
access to another POR_EL0. The comment above could be improved describing
this limitation.
Szabolcs Nagy July 18, 2024, 2:45 p.m. UTC | #15
The 07/11/2024 10:50, Joey Gouly wrote:
> On Mon, Jul 08, 2024 at 06:53:18PM +0100, Catalin Marinas wrote:
> > On Mon, Jun 17, 2024 at 03:51:35PM +0100, Szabolcs Nagy wrote:
> > > to me it makes sense to have abstract
> > > 
> > > PKEY_DISABLE_READ
> > > PKEY_DISABLE_WRITE
> > > PKEY_DISABLE_EXECUTE
> > > PKEY_DISABLE_ACCESS
> > > 
> > > where access is handled like
> > > 
> > > if (flags&PKEY_DISABLE_ACCESS)
> > > 	flags |= PKEY_DISABLE_READ|PKEY_DISABLE_WRITE;
> > > disable_read = flags&PKEY_DISABLE_READ;
> > > disable_write = flags&PKEY_DISABLE_WRITE;
> > > disable_exec = flags&PKEY_DISABLE_EXECUTE;
...
> > On powerpc, PKEY_DISABLE_ACCESS also disables execution. AFAICT, the
...
> Seems to me that PKEY_DISABLE_ACCESS leaves exec permissions as-is.

assuming this is right the patch below looks
reasonable to me. thanks.

> Here is the patch I am planning to include in the next version of the series.
> This should support all PKEY_DISABLE_* combinations. Any comments? 
> 
> commit ba51371a544f6b0a4a0f03df62ad894d53f5039b
> Author: Joey Gouly <joey.gouly@arm.com>
> Date:   Thu Jul 4 11:29:20 2024 +0100
> 
>     arm64: add PKEY_DISABLE_READ and PKEY_DISABLE_EXEC

it's PKEY_DISABLE_EXECUTE (fwiw i like the shorter
exec better but ppc seems to use execute)

>     
>     TODO
>     
>     Signed-off-by: Joey Gouly <joey.gouly@arm.com>
> 
> diff --git arch/arm64/include/uapi/asm/mman.h arch/arm64/include/uapi/asm/mman.h
> index 1e6482a838e1..e7e0c8216243 100644
> --- arch/arm64/include/uapi/asm/mman.h
> +++ arch/arm64/include/uapi/asm/mman.h
> @@ -7,4 +7,13 @@
>  #define PROT_BTI       0x10            /* BTI guarded page */
>  #define PROT_MTE       0x20            /* Normal Tagged mapping */
>  
> +/* Override any generic PKEY permission defines */
> +#define PKEY_DISABLE_EXECUTE   0x4
> +#define PKEY_DISABLE_READ      0x8
> +#undef PKEY_ACCESS_MASK
> +#define PKEY_ACCESS_MASK       (PKEY_DISABLE_ACCESS |\
> +                               PKEY_DISABLE_WRITE  |\
> +                               PKEY_DISABLE_READ   |\
> +                               PKEY_DISABLE_EXECUTE)
> +
>  #endif /* ! _UAPI__ASM_MMAN_H */
> diff --git arch/arm64/mm/mmu.c arch/arm64/mm/mmu.c
> index 68afe5fc3071..ce4cc6bdee4e 100644
> --- arch/arm64/mm/mmu.c
> +++ arch/arm64/mm/mmu.c
> @@ -1570,10 +1570,15 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long i
>                 return -EINVAL;
>  
>         /* Set the bits we need in POR:  */
> +       new_por = POE_RXW;
> +       if (init_val & PKEY_DISABLE_WRITE)
> +               new_por &= ~POE_W;
>         if (init_val & PKEY_DISABLE_ACCESS)
> -               new_por = POE_X;
> -       else if (init_val & PKEY_DISABLE_WRITE)
> -               new_por = POE_RX;
> +               new_por &= ~POE_RW;
> +       if (init_val & PKEY_DISABLE_READ)
> +               new_por &= ~POE_R;
> +       if (init_val & PKEY_DISABLE_EXECUTE)
> +               new_por &= ~POE_X;
>  
>         /* Shift the bits in to the correct place in POR for pkey: */
>         pkey_shift = pkey * POR_BITS_PER_PKEY;
> 
> 
> 
> Thanks,
> Joey
Kevin Brodsky July 22, 2024, 1:39 p.m. UTC | #16
On 05/07/2024 18:59, Catalin Marinas wrote:
> On Fri, May 03, 2024 at 02:01:35PM +0100, Joey Gouly wrote:
>> @@ -163,7 +182,8 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
>>  #define pte_access_permitted_no_overlay(pte, write) \
>>  	(((pte_val(pte) & (PTE_VALID | PTE_USER)) == (PTE_VALID | PTE_USER)) && (!(write) || pte_write(pte)))
>>  #define pte_access_permitted(pte, write) \
>> -	pte_access_permitted_no_overlay(pte, write)
>> +	(pte_access_permitted_no_overlay(pte, write) && \
>> +	por_el0_allows_pkey(FIELD_GET(PTE_PO_IDX_MASK, pte_val(pte)), write, false))
> I'm still not entirely convinced on checking the keys during fast GUP
> but that's what x86 and powerpc do already, so I guess we'll follow the
> same ABI.

I've thought about this some more. In summary I don't think adding this
check to pte_access_permitted() is controversial, but we should decide
how POR_EL0 is set for kernel threads.

This change essentially means that fast GUP behaves like uaccess for
pages that are already present: in both cases POR_EL0 will be looked up
based on the POIndex of the page being accessed (by the hardware in the
uaccess case, and explicitly in the fast GUP case). Fast GUP always
operates on current->mm, so to me checking POR_EL0 in
pte_access_permitted() should be no more restrictive than a uaccess
check from a user perspective. In other words, POR_EL0 is checked when
the kernel accesses user memory on the user's behalf, whether through
uaccess or GUP.

It's also worth noting that the "slow" GUP path (which
get_user_pages_fast() falls back to if a page is missing) also checks
POR_EL0 by virtue of calling handle_mm_fault(), which in turn calls
arch_vma_access_permitted(). It would be pretty inconsistent for the
slow GUP path to do a pkey check but not the fast path. (That said, the
slow GUP path does not call arch_vma_access_permitted() if a page is
already present, so callers of get_user_pages() and similar will get
inconsistent checking. Not great, that may be worth fixing - but that's
clearly beyond the scope of this series.)

Now an interesting question is what happens with kernel threads that
access user memory, as is the case for the optional io_uring kernel
thread (IORING_SETUP_SQPOLL). The discussion above holds regardless of
the type of thread, so the sqpoll thread will have its POR_EL0 checked
when processing commands that involve uaccess or GUP. AFAICT, this
series does not have special handling for kernel threads w.r.t. POR_EL0,
which means that it is left unchanged when a new kernel thread is cloned
(create_io_thread() in the IORING_SETUP_SQPOLL case). The sqpoll thread
will therefore inherit POR_EL0 from the (user) thread that calls
io_uring_setup(). In other words, the sqpoll thread ends up with the
same view of user memory as that user thread - for instance if its
POR_EL0 prevents access to POIndex 1, then any I/O that the sqpoll
thread attempts on mappings with POIndex/pkey 1 will fail.

This behaviour seems potentially useful to me, as the io_uring SQ could
easily become a way to bypass POE without some restriction. However, it
feels like this should be documented, as one should keep it in mind when
using pkeys, and there may well be other cases where kernel threads are
impacted by POR_EL0. I am also unsure how x86/ppc handle this.

Kevin
Anshuman Khandual July 23, 2024, 4:22 a.m. UTC | #17
On 5/3/24 18:31, Joey Gouly wrote:
> Implement the PKEYS interface, using the Permission Overlay Extension.

This commit message should contain some more details here considering
the amount of code change proposed in this patch.

> 
> Signed-off-by: Joey Gouly <joey.gouly@arm.com>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> ---
>  arch/arm64/include/asm/mmu.h         |   1 +
>  arch/arm64/include/asm/mmu_context.h |  51 ++++++++++++-
>  arch/arm64/include/asm/pgtable.h     |  22 +++++-
>  arch/arm64/include/asm/pkeys.h       | 110 +++++++++++++++++++++++++++
>  arch/arm64/include/asm/por.h         |  33 ++++++++
>  arch/arm64/mm/mmu.c                  |  40 ++++++++++
>  6 files changed, 255 insertions(+), 2 deletions(-)
>  create mode 100644 arch/arm64/include/asm/pkeys.h
>  create mode 100644 arch/arm64/include/asm/por.h
> 
> diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
> index 65977c7783c5..983afeb4eba5 100644
> --- a/arch/arm64/include/asm/mmu.h
> +++ b/arch/arm64/include/asm/mmu.h
> @@ -25,6 +25,7 @@ typedef struct {
>  	refcount_t	pinned;
>  	void		*vdso;
>  	unsigned long	flags;
> +	u8		pkey_allocation_map;

arch_max_pkey() is 7 on arm64, with bit 0 reserved for the first pkey,
so is it possible for the entire pkey_allocation_map to be completely
used up in reality ? OR the maximum pkey bits that can be allocated is
actually ARCH_PKEY_BITS ?

>  } mm_context_t;
>  
>  /*
> diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
> index c768d16b81a4..cb499db7a97b 100644
> --- a/arch/arm64/include/asm/mmu_context.h
> +++ b/arch/arm64/include/asm/mmu_context.h
> @@ -15,12 +15,12 @@
>  #include <linux/sched/hotplug.h>
>  #include <linux/mm_types.h>
>  #include <linux/pgtable.h>
> +#include <linux/pkeys.h>
>  
>  #include <asm/cacheflush.h>
>  #include <asm/cpufeature.h>
>  #include <asm/daifflags.h>
>  #include <asm/proc-fns.h>
> -#include <asm-generic/mm_hooks.h>
>  #include <asm/cputype.h>
>  #include <asm/sysreg.h>
>  #include <asm/tlbflush.h>
> @@ -175,9 +175,36 @@ init_new_context(struct task_struct *tsk, struct mm_struct *mm)
>  {
>  	atomic64_set(&mm->context.id, 0);
>  	refcount_set(&mm->context.pinned, 0);
> +
> +	/* pkey 0 is the default, so always reserve it. */
> +	mm->context.pkey_allocation_map = 0x1;

Very small nit. Considering the 1U << pkey allocation mechanism, the
following might make more sense, considering the first bit being the
default one.

	mm->context.pkey_allocation_map = (1U << 0);

OR probably even making it a const or something.

> +
> +	return 0;
> +}
> +
> +static inline void arch_dup_pkeys(struct mm_struct *oldmm,
> +				  struct mm_struct *mm)
> +{
> +	/* Duplicate the oldmm pkey state in mm: */
> +	mm->context.pkey_allocation_map = oldmm->context.pkey_allocation_map;
> +}
> +
> +static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
> +{
> +	arch_dup_pkeys(oldmm, mm);
> +
>  	return 0;
>  }
>  
> +static inline void arch_exit_mmap(struct mm_struct *mm)
> +{
> +}
> +
> +static inline void arch_unmap(struct mm_struct *mm,
> +			unsigned long start, unsigned long end)
> +{
> +}
> +
>  #ifdef CONFIG_ARM64_SW_TTBR0_PAN
>  static inline void update_saved_ttbr0(struct task_struct *tsk,
>  				      struct mm_struct *mm)
> @@ -267,6 +294,28 @@ static inline unsigned long mm_untag_mask(struct mm_struct *mm)
>  	return -1UL >> 8;
>  }
>  
> +/*
> + * We only want to enforce protection keys on the current process
> + * because we effectively have no access to POR_EL0 for other
> + * processes or any way to tell *which * POR_EL0 in a threaded
> + * process we could use.
> + *
> + * So do not enforce things if the VMA is not from the current
> + * mm, or if we are in a kernel thread.
> + */

As mentioned in the other thread, this comment can be improved.

> +static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
> +		bool write, bool execute, bool foreign)
> +{
> +	if (!arch_pkeys_enabled())
> +		return true;
> +
> +	/* allow access if the VMA is not one from this process */
> +	if (foreign || vma_is_foreign(vma))
> +		return true;
> +
> +	return por_el0_allows_pkey(vma_pkey(vma), write, execute);
> +}
> +
>  #include <asm-generic/mmu_context.h>
>  
>  #endif /* !__ASSEMBLY__ */
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index 2449e4e27ea6..8ee68ff03016 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -34,6 +34,7 @@
>  
>  #include <asm/cmpxchg.h>
>  #include <asm/fixmap.h>
> +#include <asm/por.h>
>  #include <linux/mmdebug.h>
>  #include <linux/mm_types.h>
>  #include <linux/sched.h>
> @@ -153,6 +154,24 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
>  #define pte_accessible(mm, pte)	\
>  	(mm_tlb_flush_pending(mm) ? pte_present(pte) : pte_valid(pte))
>  
> +static inline bool por_el0_allows_pkey(u8 pkey, bool write, bool execute)
> +{
> +	u64 por;
> +
> +	if (!system_supports_poe())
> +		return true;

This is redundant. Same check is there in arch_vma_access_permitted()
as well which is the sole caller for this function.

> +
> +	por = read_sysreg_s(SYS_POR_EL0);
> +
> +	if (write)
> +		return por_elx_allows_write(por, pkey);
> +
> +	if (execute)
> +		return por_elx_allows_exec(por, pkey);
> +
> +	return por_elx_allows_read(por, pkey);
> +}
> +
>  /*
>   * p??_access_permitted() is true for valid user mappings (PTE_USER
>   * bit set, subject to the write permission check). For execute-only
> @@ -163,7 +182,8 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
>  #define pte_access_permitted_no_overlay(pte, write) \
>  	(((pte_val(pte) & (PTE_VALID | PTE_USER)) == (PTE_VALID | PTE_USER)) && (!(write) || pte_write(pte)))
>  #define pte_access_permitted(pte, write) \
> -	pte_access_permitted_no_overlay(pte, write)
> +	(pte_access_permitted_no_overlay(pte, write) && \
> +	por_el0_allows_pkey(FIELD_GET(PTE_PO_IDX_MASK, pte_val(pte)), write, false))
>  #define pmd_access_permitted(pmd, write) \
>  	(pte_access_permitted(pmd_pte(pmd), (write)))
>  #define pud_access_permitted(pud, write) \
> diff --git a/arch/arm64/include/asm/pkeys.h b/arch/arm64/include/asm/pkeys.h
> new file mode 100644
> index 000000000000..a284508a4d02
> --- /dev/null
> +++ b/arch/arm64/include/asm/pkeys.h
> @@ -0,0 +1,110 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (C) 2023 Arm Ltd.
> + *
> + * Based on arch/x86/include/asm/pkeys.h
> + */
> +
> +#ifndef _ASM_ARM64_PKEYS_H
> +#define _ASM_ARM64_PKEYS_H
> +
> +#define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2)
> +
> +#define arch_max_pkey() 7

May be this should be made 8 including the default pkey bit 0.

> +
> +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
> +		unsigned long init_val);
> +
> +static inline bool arch_pkeys_enabled(void)
> +{
> +	return false;
> +}
> +
> +static inline int vma_pkey(struct vm_area_struct *vma)
> +{
> +	return (vma->vm_flags & ARCH_VM_PKEY_FLAGS) >> VM_PKEY_SHIFT;
> +}
> +
> +static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma,
> +		int prot, int pkey)
> +{

Following comment is there in x86 __arch_override_mprotect_pkey() which
also seems to be applicable here as well. Please consider adding.

        /*
         * Is this an mprotect_pkey() call?  If so, never
         * override the value that came from the user.
         */

> +	if (pkey != -1)
> +		return pkey;
> +
> +	return vma_pkey(vma);
> +}
> +
> +static inline int execute_only_pkey(struct mm_struct *mm)
> +{
> +	// Execute-only mappings are handled by EPAN/FEAT_PAN3.
> +	WARN_ON_ONCE(!cpus_have_final_cap(ARM64_HAS_EPAN));
> +
> +	return -1;
> +}
> +
> +#define mm_pkey_allocation_map(mm)	(mm->context.pkey_allocation_map)
> +#define mm_set_pkey_allocated(mm, pkey) do {		\
> +	mm_pkey_allocation_map(mm) |= (1U << pkey);	\
> +} while (0)
> +#define mm_set_pkey_free(mm, pkey) do {			\
> +	mm_pkey_allocation_map(mm) &= ~(1U << pkey);	\
> +} while (0)
> +
> +static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey)
> +{
> +	/*
> +	 * "Allocated" pkeys are those that have been returned
> +	 * from pkey_alloc() or pkey 0 which is allocated
> +	 * implicitly when the mm is created.
> +	 */
> +	if (pkey < 0)
> +		return false;
> +	if (pkey >= arch_max_pkey())
> +		return false;

These range checks can be folded into the same conditional statement.

> +
> +	return mm_pkey_allocation_map(mm) & (1U << pkey);
> +}
> +
> +/*
> + * Returns a positive, 3-bit key on success, or -1 on failure.
> + */
> +static inline int mm_pkey_alloc(struct mm_struct *mm)
> +{
> +	/*
> +	 * Note: this is the one and only place we make sure
> +	 * that the pkey is valid as far as the hardware is
> +	 * concerned.  The rest of the kernel trusts that
> +	 * only good, valid pkeys come out of here.
> +	 */
> +	u8 all_pkeys_mask = ((1U << arch_max_pkey()) - 1);
> +	int ret;
> +
> +	if (!arch_pkeys_enabled())
> +		return -1;

I am wondering whether the pkey's range should be asserted here first,
as is done in mm_pkey_is_allocated()?

> +
> +	/*
> +	 * Are we out of pkeys?  We must handle this specially
> +	 * because ffz() behavior is undefined if there are no
> +	 * zeros.
> +	 */
> +	if (mm_pkey_allocation_map(mm) == all_pkeys_mask)
> +		return -1;
> +
> +	ret = ffz(mm_pkey_allocation_map(mm));
> +
> +	mm_set_pkey_allocated(mm, ret);
> +
> +	return ret;
> +}
> +
> +static inline int mm_pkey_free(struct mm_struct *mm, int pkey)
> +{
> +	if (!mm_pkey_is_allocated(mm, pkey))
> +		return -EINVAL;
> +
> +	mm_set_pkey_free(mm, pkey);
> +
> +	return 0;
> +}
> +
> +#endif /* _ASM_ARM64_PKEYS_H */
> diff --git a/arch/arm64/include/asm/por.h b/arch/arm64/include/asm/por.h
> new file mode 100644
> index 000000000000..d6604e0c5c54
> --- /dev/null
> +++ b/arch/arm64/include/asm/por.h
> @@ -0,0 +1,33 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (C) 2023 Arm Ltd.
> + */
> +
> +#ifndef _ASM_ARM64_POR_H
> +#define _ASM_ARM64_POR_H
> +
> +#define POR_BITS_PER_PKEY		4
> +#define POR_ELx_IDX(por_elx, idx)	(((por_elx) >> (idx * POR_BITS_PER_PKEY)) & 0xf)
> +
> +static inline bool por_elx_allows_read(u64 por, u8 pkey)
> +{
> +	u8 perm = POR_ELx_IDX(por, pkey);
> +
> +	return perm & POE_R;
> +}
> +
> +static inline bool por_elx_allows_write(u64 por, u8 pkey)
> +{
> +	u8 perm = POR_ELx_IDX(por, pkey);
> +
> +	return perm & POE_W;
> +}
> +
> +static inline bool por_elx_allows_exec(u64 por, u8 pkey)
> +{
> +	u8 perm = POR_ELx_IDX(por, pkey);
> +
> +	return perm & POE_X;
> +}
> +
> +#endif /* _ASM_ARM64_POR_H */
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index 495b732d5af3..e50ccc86d150 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -25,6 +25,7 @@
>  #include <linux/vmalloc.h>
>  #include <linux/set_memory.h>
>  #include <linux/kfence.h>
> +#include <linux/pkeys.h>
>  
>  #include <asm/barrier.h>
>  #include <asm/cputype.h>
> @@ -1535,3 +1536,42 @@ void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp)
>  
>  	cpu_uninstall_idmap();
>  }
> +
> +#ifdef CONFIG_ARCH_HAS_PKEYS
> +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val)
> +{
> +	u64 new_por = POE_RXW;
> +	u64 old_por;
> +	u64 pkey_shift;
> +
> +	if (!arch_pkeys_enabled())
> +		return -ENOSPC;

This code path might not be possible and hence the check is redundant.
If arch_pkeys_enabled() returns false, then pkey_alloc() will just
bail out and arch_set_user_pkey_access() would not be called afterwards.

SYSCALL..(pkey_alloc)
	mm_pkey_alloc()
		arch_pkeys_enabled()		
	...............
	arch_set_user_pkey_access()
		arch_pkeys_enabled()
> +
> +	/*
> +	 * This code should only be called with valid 'pkey'
> +	 * values originating from in-kernel users.  Complain
> +	 * if a bad value is observed.
> +	 */
> +	if (WARN_ON_ONCE(pkey >= arch_max_pkey()))
> +		return -EINVAL;

If the pkey's range check had been done in mm_pkey_alloc() itself —
which seems a better place for it — this warning here would not
be necessary.

> +
> +	/* Set the bits we need in POR:  */
> +	if (init_val & PKEY_DISABLE_ACCESS)
> +		new_por = POE_X;
> +	else if (init_val & PKEY_DISABLE_WRITE)
> +		new_por = POE_RX;
> +
> +	/* Shift the bits in to the correct place in POR for pkey: */
> +	pkey_shift = pkey * POR_BITS_PER_PKEY;
> +	new_por <<= pkey_shift;
> +
> +	/* Get old POR and mask off any old bits in place: */
> +	old_por = read_sysreg_s(SYS_POR_EL0);
> +	old_por &= ~(POE_MASK << pkey_shift);
> +
> +	/* Write old part along with new part: */
> +	write_sysreg_s(old_por | new_por, SYS_POR_EL0);
> +
> +	return 0;
> +}
> +#endif
Dave Martin July 25, 2024, 4:12 p.m. UTC | #18
On Fri, May 03, 2024 at 02:01:35PM +0100, Joey Gouly wrote:
> Implement the PKEYS interface, using the Permission Overlay Extension.
> 
> Signed-off-by: Joey Gouly <joey.gouly@arm.com>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> ---
>  arch/arm64/include/asm/mmu.h         |   1 +
>  arch/arm64/include/asm/mmu_context.h |  51 ++++++++++++-
>  arch/arm64/include/asm/pgtable.h     |  22 +++++-
>  arch/arm64/include/asm/pkeys.h       | 110 +++++++++++++++++++++++++++
>  arch/arm64/include/asm/por.h         |  33 ++++++++
>  arch/arm64/mm/mmu.c                  |  40 ++++++++++
>  6 files changed, 255 insertions(+), 2 deletions(-)
>  create mode 100644 arch/arm64/include/asm/pkeys.h
>  create mode 100644 arch/arm64/include/asm/por.h

[...]

> diff --git a/arch/arm64/include/asm/pkeys.h b/arch/arm64/include/asm/pkeys.h
> new file mode 100644
> index 000000000000..a284508a4d02
> --- /dev/null
> +++ b/arch/arm64/include/asm/pkeys.h
> @@ -0,0 +1,110 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (C) 2023 Arm Ltd.
> + *
> + * Based on arch/x86/include/asm/pkeys.h
> + */
> +
> +#ifndef _ASM_ARM64_PKEYS_H
> +#define _ASM_ARM64_PKEYS_H
> +
> +#define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2)
> +
> +#define arch_max_pkey() 7

Did you mean 8 ?  I'm guessing this may be the "off by one error" you
alluded to in your own reply to the cover letter, but just in case...

(x86 and powerpc seem to have booby-trapped the name of this macro for
the unwary...)

See also mm_pkey_{is_allocated,alloc}().

> +
> +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
> +		unsigned long init_val);
> +
> +static inline bool arch_pkeys_enabled(void)
> +{
> +	return false;
> +}
> +
> +static inline int vma_pkey(struct vm_area_struct *vma)
> +{
> +	return (vma->vm_flags & ARCH_VM_PKEY_FLAGS) >> VM_PKEY_SHIFT;
> +}
> +
> +static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma,
> +		int prot, int pkey)
> +{
> +	if (pkey != -1)
> +		return pkey;
> +
> +	return vma_pkey(vma);
> +}
> +
> +static inline int execute_only_pkey(struct mm_struct *mm)
> +{
> +	// Execute-only mappings are handled by EPAN/FEAT_PAN3.
> +	WARN_ON_ONCE(!cpus_have_final_cap(ARM64_HAS_EPAN));
> +
> +	return -1;
> +}
> +
> +#define mm_pkey_allocation_map(mm)	(mm->context.pkey_allocation_map)

Pedantic nit: (mm)

although other arches have the same nit already, and it's probably low
risk given the scope and usage of these macros.

(Also, the outer parentheses are also redundant (if harmless).)

> +#define mm_set_pkey_allocated(mm, pkey) do {		\
> +	mm_pkey_allocation_map(mm) |= (1U << pkey);	\
> +} while (0)
> +#define mm_set_pkey_free(mm, pkey) do {			\
> +	mm_pkey_allocation_map(mm) &= ~(1U << pkey);	\
> +} while (0)
> +
> +static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey)
> +{
> +	/*
> +	 * "Allocated" pkeys are those that have been returned
> +	 * from pkey_alloc() or pkey 0 which is allocated
> +	 * implicitly when the mm is created.
> +	 */
> +	if (pkey < 0)
> +		return false;
> +	if (pkey >= arch_max_pkey())
> +		return false;

Did you mean > ?

> +
> +	return mm_pkey_allocation_map(mm) & (1U << pkey);
> +}
> +
> +/*
> + * Returns a positive, 3-bit key on success, or -1 on failure.
> + */
> +static inline int mm_pkey_alloc(struct mm_struct *mm)
> +{
> +	/*
> +	 * Note: this is the one and only place we make sure
> +	 * that the pkey is valid as far as the hardware is
> +	 * concerned.  The rest of the kernel trusts that
> +	 * only good, valid pkeys come out of here.
> +	 */
> +	u8 all_pkeys_mask = ((1U << arch_max_pkey()) - 1);

Nit: redundant outer ().

Also, GENMASK() and friends might be cleaner than spelling out this
idiom explicitly (but no big deal).

(1 << 7) - 1 is 0x7f, which doesn't feel right if pkeys 0..7 are all
supposed to be valid.  (See arch_max_pkey() above.)


(Also it looks mildly weird to have this before checking
arch_pkeys_enabled(), but since this is likely to be constant-folded by
the compiler, I guess it almost certainly makes no difference.  It's
harmless either way.)

> +	int ret;
> +
> +	if (!arch_pkeys_enabled())
> +		return -1;
> +
> +	/*
> +	 * Are we out of pkeys?  We must handle this specially
> +	 * because ffz() behavior is undefined if there are no
> +	 * zeros.
> +	 */
> +	if (mm_pkey_allocation_map(mm) == all_pkeys_mask)
> +		return -1;
> +
> +	ret = ffz(mm_pkey_allocation_map(mm));
> +
> +	mm_set_pkey_allocated(mm, ret);
> +
> +	return ret;
> +}
> +
> +static inline int mm_pkey_free(struct mm_struct *mm, int pkey)
> +{
> +	if (!mm_pkey_is_allocated(mm, pkey))
> +		return -EINVAL;

Does anything prevent a pkey_free(0)?

I couldn't find any check related to this so far.

If not, this may be a generic problem, better solved through a wrapper
in the generic mm code.

Userspace has to have at least one PKEY allocated, since the pte field
has to be set to something...  unless we turn PKEYs on or off per mm.
But the pkeys API doesn't seem to be designed that way (and it doesn't
look very useful).


> +
> +	mm_set_pkey_free(mm, pkey);
> +
> +	return 0;
> +}
> +
> +#endif /* _ASM_ARM64_PKEYS_H */
> diff --git a/arch/arm64/include/asm/por.h b/arch/arm64/include/asm/por.h
> new file mode 100644
> index 000000000000..d6604e0c5c54
> --- /dev/null
> +++ b/arch/arm64/include/asm/por.h
> @@ -0,0 +1,33 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (C) 2023 Arm Ltd.
> + */
> +
> +#ifndef _ASM_ARM64_POR_H
> +#define _ASM_ARM64_POR_H
> +
> +#define POR_BITS_PER_PKEY		4
> +#define POR_ELx_IDX(por_elx, idx)	(((por_elx) >> (idx * POR_BITS_PER_PKEY)) & 0xf)

Nit: (idx)

Since this is shared with other code in a header, it's probably best
to avoid surprises.

[...]

> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index 495b732d5af3..e50ccc86d150 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -25,6 +25,7 @@
>  #include <linux/vmalloc.h>
>  #include <linux/set_memory.h>
>  #include <linux/kfence.h>
> +#include <linux/pkeys.h>
>  
>  #include <asm/barrier.h>
>  #include <asm/cputype.h>
> @@ -1535,3 +1536,42 @@ void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp)
>  
>  	cpu_uninstall_idmap();
>  }
> +
> +#ifdef CONFIG_ARCH_HAS_PKEYS
> +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val)
> +{
> +	u64 new_por = POE_RXW;
> +	u64 old_por;
> +	u64 pkey_shift;
> +
> +	if (!arch_pkeys_enabled())
> +		return -ENOSPC;
> +
> +	/*
> +	 * This code should only be called with valid 'pkey'
> +	 * values originating from in-kernel users.  Complain
> +	 * if a bad value is observed.
> +	 */
> +	if (WARN_ON_ONCE(pkey >= arch_max_pkey()))
> +		return -EINVAL;
> +
> +	/* Set the bits we need in POR:  */
> +	if (init_val & PKEY_DISABLE_ACCESS)
> +		new_por = POE_X;
> +	else if (init_val & PKEY_DISABLE_WRITE)
> +		new_por = POE_RX;
> +
> +	/* Shift the bits in to the correct place in POR for pkey: */
> +	pkey_shift = pkey * POR_BITS_PER_PKEY;
> +	new_por <<= pkey_shift;
> +
> +	/* Get old POR and mask off any old bits in place: */
> +	old_por = read_sysreg_s(SYS_POR_EL0);
> +	old_por &= ~(POE_MASK << pkey_shift);
> +
> +	/* Write old part along with new part: */
> +	write_sysreg_s(old_por | new_por, SYS_POR_EL0);
> +
> +	return 0;
> +}
> +#endif

<bikeshed>

Although this is part of the existing PKEYS support, it feels weird to
have to initialise the permissions with one interface and one set of
flags, then change the permissions using an arch-specific interface and
a different set of flags (i.e., directly writing POR_EL0) later on.

Is there any merit in defining a vDSO function for changing the flags in
userspace?  This would allow userspace to use PKEYS in a generic way
without a nasty per-arch volatile asm hack.  

(Maybe it is too late to stop user libraries from rolling their own, though.)


Since we ideally don't want to write the above flags-mungeing code
twice, there would be the option of implementing pkey_alloc() via a
vDSO wrapper on arm64 (though this might be more trouble than it is
worth).

Of course, this is all pointless if people thought that even the
overhead of a vDSO call was unacceptable for flipping the permissions
on and off.  Either way, this is a potential enhancement, orthogonal to
this series...

</bikeshed>

[...]

Cheers
---Dave
diff mbox series

Patch

diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index 65977c7783c5..983afeb4eba5 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -25,6 +25,7 @@  typedef struct {
 	refcount_t	pinned;
 	void		*vdso;
 	unsigned long	flags;
+	u8		pkey_allocation_map;
 } mm_context_t;
 
 /*
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index c768d16b81a4..cb499db7a97b 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -15,12 +15,12 @@ 
 #include <linux/sched/hotplug.h>
 #include <linux/mm_types.h>
 #include <linux/pgtable.h>
+#include <linux/pkeys.h>
 
 #include <asm/cacheflush.h>
 #include <asm/cpufeature.h>
 #include <asm/daifflags.h>
 #include <asm/proc-fns.h>
-#include <asm-generic/mm_hooks.h>
 #include <asm/cputype.h>
 #include <asm/sysreg.h>
 #include <asm/tlbflush.h>
@@ -175,9 +175,36 @@  init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 {
 	atomic64_set(&mm->context.id, 0);
 	refcount_set(&mm->context.pinned, 0);
+
+	/* pkey 0 is the default, so always reserve it. */
+	mm->context.pkey_allocation_map = 0x1;
+
+	return 0;
+}
+
+static inline void arch_dup_pkeys(struct mm_struct *oldmm,
+				  struct mm_struct *mm)
+{
+	/* Duplicate the oldmm pkey state in mm: */
+	mm->context.pkey_allocation_map = oldmm->context.pkey_allocation_map;
+}
+
+static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+	arch_dup_pkeys(oldmm, mm);
+
 	return 0;
 }
 
+static inline void arch_exit_mmap(struct mm_struct *mm)
+{
+}
+
+static inline void arch_unmap(struct mm_struct *mm,
+			unsigned long start, unsigned long end)
+{
+}
+
 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
 static inline void update_saved_ttbr0(struct task_struct *tsk,
 				      struct mm_struct *mm)
@@ -267,6 +294,28 @@  static inline unsigned long mm_untag_mask(struct mm_struct *mm)
 	return -1UL >> 8;
 }
 
+/*
+ * We only want to enforce protection keys on the current process
+ * because we effectively have no access to POR_EL0 for other
+ * processes or any way to tell *which * POR_EL0 in a threaded
+ * process we could use.
+ *
+ * So do not enforce things if the VMA is not from the current
+ * mm, or if we are in a kernel thread.
+ */
+static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
+		bool write, bool execute, bool foreign)
+{
+	if (!arch_pkeys_enabled())
+		return true;
+
+	/* allow access if the VMA is not one from this process */
+	if (foreign || vma_is_foreign(vma))
+		return true;
+
+	return por_el0_allows_pkey(vma_pkey(vma), write, execute);
+}
+
 #include <asm-generic/mmu_context.h>
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 2449e4e27ea6..8ee68ff03016 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -34,6 +34,7 @@ 
 
 #include <asm/cmpxchg.h>
 #include <asm/fixmap.h>
+#include <asm/por.h>
 #include <linux/mmdebug.h>
 #include <linux/mm_types.h>
 #include <linux/sched.h>
@@ -153,6 +154,24 @@  static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
 #define pte_accessible(mm, pte)	\
 	(mm_tlb_flush_pending(mm) ? pte_present(pte) : pte_valid(pte))
 
+static inline bool por_el0_allows_pkey(u8 pkey, bool write, bool execute)
+{
+	u64 por;
+
+	if (!system_supports_poe())
+		return true;
+
+	por = read_sysreg_s(SYS_POR_EL0);
+
+	if (write)
+		return por_elx_allows_write(por, pkey);
+
+	if (execute)
+		return por_elx_allows_exec(por, pkey);
+
+	return por_elx_allows_read(por, pkey);
+}
+
 /*
  * p??_access_permitted() is true for valid user mappings (PTE_USER
  * bit set, subject to the write permission check). For execute-only
@@ -163,7 +182,8 @@  static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
 #define pte_access_permitted_no_overlay(pte, write) \
 	(((pte_val(pte) & (PTE_VALID | PTE_USER)) == (PTE_VALID | PTE_USER)) && (!(write) || pte_write(pte)))
 #define pte_access_permitted(pte, write) \
-	pte_access_permitted_no_overlay(pte, write)
+	(pte_access_permitted_no_overlay(pte, write) && \
+	por_el0_allows_pkey(FIELD_GET(PTE_PO_IDX_MASK, pte_val(pte)), write, false))
 #define pmd_access_permitted(pmd, write) \
 	(pte_access_permitted(pmd_pte(pmd), (write)))
 #define pud_access_permitted(pud, write) \
diff --git a/arch/arm64/include/asm/pkeys.h b/arch/arm64/include/asm/pkeys.h
new file mode 100644
index 000000000000..a284508a4d02
--- /dev/null
+++ b/arch/arm64/include/asm/pkeys.h
@@ -0,0 +1,110 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2023 Arm Ltd.
+ *
+ * Based on arch/x86/include/asm/pkeys.h
+ */
+
+#ifndef _ASM_ARM64_PKEYS_H
+#define _ASM_ARM64_PKEYS_H
+
+#define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2)
+
+#define arch_max_pkey() 7
+
+int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+		unsigned long init_val);
+
+static inline bool arch_pkeys_enabled(void)
+{
+	return false;
+}
+
+static inline int vma_pkey(struct vm_area_struct *vma)
+{
+	return (vma->vm_flags & ARCH_VM_PKEY_FLAGS) >> VM_PKEY_SHIFT;
+}
+
+static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma,
+		int prot, int pkey)
+{
+	if (pkey != -1)
+		return pkey;
+
+	return vma_pkey(vma);
+}
+
+static inline int execute_only_pkey(struct mm_struct *mm)
+{
+	// Execute-only mappings are handled by EPAN/FEAT_PAN3.
+	WARN_ON_ONCE(!cpus_have_final_cap(ARM64_HAS_EPAN));
+
+	return -1;
+}
+
+#define mm_pkey_allocation_map(mm)	(mm->context.pkey_allocation_map)
+#define mm_set_pkey_allocated(mm, pkey) do {		\
+	mm_pkey_allocation_map(mm) |= (1U << pkey);	\
+} while (0)
+#define mm_set_pkey_free(mm, pkey) do {			\
+	mm_pkey_allocation_map(mm) &= ~(1U << pkey);	\
+} while (0)
+
+static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey)
+{
+	/*
+	 * "Allocated" pkeys are those that have been returned
+	 * from pkey_alloc() or pkey 0 which is allocated
+	 * implicitly when the mm is created.
+	 */
+	if (pkey < 0)
+		return false;
+	if (pkey >= arch_max_pkey())
+		return false;
+
+	return mm_pkey_allocation_map(mm) & (1U << pkey);
+}
+
+/*
+ * Returns a positive, 3-bit key on success, or -1 on failure.
+ */
+static inline int mm_pkey_alloc(struct mm_struct *mm)
+{
+	/*
+	 * Note: this is the one and only place we make sure
+	 * that the pkey is valid as far as the hardware is
+	 * concerned.  The rest of the kernel trusts that
+	 * only good, valid pkeys come out of here.
+	 */
+	u8 all_pkeys_mask = ((1U << arch_max_pkey()) - 1);
+	int ret;
+
+	if (!arch_pkeys_enabled())
+		return -1;
+
+	/*
+	 * Are we out of pkeys?  We must handle this specially
+	 * because ffz() behavior is undefined if there are no
+	 * zeros.
+	 */
+	if (mm_pkey_allocation_map(mm) == all_pkeys_mask)
+		return -1;
+
+	ret = ffz(mm_pkey_allocation_map(mm));
+
+	mm_set_pkey_allocated(mm, ret);
+
+	return ret;
+}
+
+static inline int mm_pkey_free(struct mm_struct *mm, int pkey)
+{
+	if (!mm_pkey_is_allocated(mm, pkey))
+		return -EINVAL;
+
+	mm_set_pkey_free(mm, pkey);
+
+	return 0;
+}
+
+#endif /* _ASM_ARM64_PKEYS_H */
diff --git a/arch/arm64/include/asm/por.h b/arch/arm64/include/asm/por.h
new file mode 100644
index 000000000000..d6604e0c5c54
--- /dev/null
+++ b/arch/arm64/include/asm/por.h
@@ -0,0 +1,33 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2023 Arm Ltd.
+ */
+
+#ifndef _ASM_ARM64_POR_H
+#define _ASM_ARM64_POR_H
+
+#define POR_BITS_PER_PKEY		4
+#define POR_ELx_IDX(por_elx, idx)	(((por_elx) >> (idx * POR_BITS_PER_PKEY)) & 0xf)
+
+static inline bool por_elx_allows_read(u64 por, u8 pkey)
+{
+	u8 perm = POR_ELx_IDX(por, pkey);
+
+	return perm & POE_R;
+}
+
+static inline bool por_elx_allows_write(u64 por, u8 pkey)
+{
+	u8 perm = POR_ELx_IDX(por, pkey);
+
+	return perm & POE_W;
+}
+
+static inline bool por_elx_allows_exec(u64 por, u8 pkey)
+{
+	u8 perm = POR_ELx_IDX(por, pkey);
+
+	return perm & POE_X;
+}
+
+#endif /* _ASM_ARM64_POR_H */
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 495b732d5af3..e50ccc86d150 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -25,6 +25,7 @@ 
 #include <linux/vmalloc.h>
 #include <linux/set_memory.h>
 #include <linux/kfence.h>
+#include <linux/pkeys.h>
 
 #include <asm/barrier.h>
 #include <asm/cputype.h>
@@ -1535,3 +1536,42 @@  void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp)
 
 	cpu_uninstall_idmap();
 }
+
+#ifdef CONFIG_ARCH_HAS_PKEYS
+int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val)
+{
+	u64 new_por = POE_RXW;
+	u64 old_por;
+	u64 pkey_shift;
+
+	if (!arch_pkeys_enabled())
+		return -ENOSPC;
+
+	/*
+	 * This code should only be called with valid 'pkey'
+	 * values originating from in-kernel users.  Complain
+	 * if a bad value is observed.
+	 */
+	if (WARN_ON_ONCE(pkey >= arch_max_pkey()))
+		return -EINVAL;
+
+	/* Set the bits we need in POR:  */
+	if (init_val & PKEY_DISABLE_ACCESS)
+		new_por = POE_X;
+	else if (init_val & PKEY_DISABLE_WRITE)
+		new_por = POE_RX;
+
+	/* Shift the bits in to the correct place in POR for pkey: */
+	pkey_shift = pkey * POR_BITS_PER_PKEY;
+	new_por <<= pkey_shift;
+
+	/* Get old POR and mask off any old bits in place: */
+	old_por = read_sysreg_s(SYS_POR_EL0);
+	old_por &= ~(POE_MASK << pkey_shift);
+
+	/* Write old part along with new part: */
+	write_sysreg_s(old_por | new_por, SYS_POR_EL0);
+
+	return 0;
+}
+#endif