diff mbox series

[v4,18/29] arm64: add POE signal support

Message ID 20240503130147.1154804-19-joey.gouly@arm.com (mailing list archive)
State New, archived
Headers show
Series arm64: Permission Overlay Extension | expand

Commit Message

Joey Gouly May 3, 2024, 1:01 p.m. UTC
Add PKEY support to signals, by saving and restoring POR_EL0 from the stackframe.

Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Reviewed-by: Mark Brown <broonie@kernel.org>
Acked-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
---
 arch/arm64/include/uapi/asm/sigcontext.h |  7 ++++
 arch/arm64/kernel/signal.c               | 52 ++++++++++++++++++++++++
 2 files changed, 59 insertions(+)

Comments

Amit Daniel Kachhap May 28, 2024, 6:56 a.m. UTC | #1
On 5/3/24 18:31, Joey Gouly wrote:
> Add PKEY support to signals, by saving and restoring POR_EL0 from the stackframe.
> 
> Signed-off-by: Joey Gouly <joey.gouly@arm.com>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Reviewed-by: Mark Brown <broonie@kernel.org>
> Acked-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
> ---
>   arch/arm64/include/uapi/asm/sigcontext.h |  7 ++++
>   arch/arm64/kernel/signal.c               | 52 ++++++++++++++++++++++++
>   2 files changed, 59 insertions(+)
> 
> diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h
> index 8a45b7a411e0..e4cba8a6c9a2 100644
> --- a/arch/arm64/include/uapi/asm/sigcontext.h
> +++ b/arch/arm64/include/uapi/asm/sigcontext.h
> @@ -98,6 +98,13 @@ struct esr_context {
>   	__u64 esr;
>   };
>   
> +#define POE_MAGIC	0x504f4530
> +
> +struct poe_context {
> +	struct _aarch64_ctx head;
> +	__u64 por_el0;
> +};

There is a comment section in the beginning which mentions the size
of the context frame structure and subsequent reduction in the
reserved range. So this new context description can be added there.
Although looks like it is broken for za, zt and fpmr context.

> +
>   /*
>    * extra_context: describes extra space in the signal frame for
>    * additional structures that don't fit in sigcontext.__reserved[].
> diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
> index 4a77f4976e11..077436a8bc10 100644
> --- a/arch/arm64/kernel/signal.c
> +++ b/arch/arm64/kernel/signal.c
> @@ -63,6 +63,7 @@ struct rt_sigframe_user_layout {
>   	unsigned long fpmr_offset;
>   	unsigned long extra_offset;
>   	unsigned long end_offset;
> +	unsigned long poe_offset;

For consistency this can be added after fpmr_offset.

Thanks,
Amit

>   };
>   
>   #define BASE_SIGFRAME_SIZE round_up(sizeof(struct rt_sigframe), 16)
> @@ -185,6 +186,8 @@ struct user_ctxs {
>   	u32 zt_size;
>   	struct fpmr_context __user *fpmr;
>   	u32 fpmr_size;
> +	struct poe_context __user *poe;
Mark Brown May 31, 2024, 4:39 p.m. UTC | #2
On Tue, May 28, 2024 at 12:26:54PM +0530, Amit Daniel Kachhap wrote:
> On 5/3/24 18:31, Joey Gouly wrote:

> > +#define POE_MAGIC	0x504f4530

> > +struct poe_context {
> > +	struct _aarch64_ctx head;
> > +	__u64 por_el0;
> > +};

> There is a comment section in the beginning which mentions the size
> of the context frame structure and subsequent reduction in the
> reserved range. So this new context description can be added there.
> Although looks like it is broken for za, zt and fpmr context.

Could you be more specific about how you think these existing contexts
are broken?  The above looks perfectly good and standard and the
existing contexts do a reasonable simulation of working.  Note that the
ZA and ZT contexts don't generate data payload unless userspace has set
PSTATE.ZA.
Amit Daniel Kachhap June 3, 2024, 9:21 a.m. UTC | #3
On 5/31/24 22:09, Mark Brown wrote:
> On Tue, May 28, 2024 at 12:26:54PM +0530, Amit Daniel Kachhap wrote:
>> On 5/3/24 18:31, Joey Gouly wrote:
> 
>>> +#define POE_MAGIC	0x504f4530
> 
>>> +struct poe_context {
>>> +	struct _aarch64_ctx head;
>>> +	__u64 por_el0;
>>> +};
> 
>> There is a comment section in the beginning which mentions the size
>> of the context frame structure and subsequent reduction in the
>> reserved range. So this new context description can be added there.
>> Although looks like it is broken for za, zt and fpmr context.
> 
> Could you be more specific about how you think these existing contexts
> are broken?  The above looks perfectly good and standard and the
> existing contexts do a reasonable simulation of working.  Note that the
> ZA and ZT contexts don't generate data payload unless userspace has set
> PSTATE.ZA.

Sorry for not being clear on this as I was only referring to the
comments in file arch/arm64/include/uapi/asm/sigcontext.h and no code
as such is broken.

  * Allocation of __reserved[]:
  * (Note: records do not necessarily occur in the order shown here.)
  *
  *      size            description
  *
  *      0x210           fpsimd_context
  *       0x10           esr_context
  *      0x8a0           sve_context (vl <= 64) (optional)
  *       0x20           extra_context (optional)
  *       0x10           terminator (null _aarch64_ctx)
  *
  *      0x510           (reserved for future allocation)

Here I think that optional context like za, zt, fpmr and poe should have
size mentioned here to make the description consistent.As you said ZA
and ZT context are enabled by userspace so some extra details can be
added for them too.
Catalin Marinas July 5, 2024, 5:04 p.m. UTC | #4
On Fri, May 03, 2024 at 02:01:36PM +0100, Joey Gouly wrote:
> Add PKEY support to signals, by saving and restoring POR_EL0 from the stackframe.
> 
> Signed-off-by: Joey Gouly <joey.gouly@arm.com>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Reviewed-by: Mark Brown <broonie@kernel.org>
> Acked-by: Szabolcs Nagy <szabolcs.nagy@arm.com>

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Kevin Brodsky July 9, 2024, 1:08 p.m. UTC | #5
On 03/05/2024 15:01, Joey Gouly wrote:
> @@ -1020,6 +1060,15 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
>  		__put_user_error(current->thread.fault_code, &esr_ctx->esr, err);
>  	}
>  
> +	if (system_supports_poe() && err == 0 && user->poe_offset) {
> +		struct poe_context __user *poe_ctx =
> +			apply_user_offset(user, user->poe_offset);
> +
> +		__put_user_error(POE_MAGIC, &poe_ctx->head.magic, err);
> +		__put_user_error(sizeof(*poe_ctx), &poe_ctx->head.size, err);
> +		__put_user_error(read_sysreg_s(SYS_POR_EL0), &poe_ctx->por_el0, err);

Nit: would be nicer to have this in its own helper
(preserve_poe_context()), like for the other optional records.

Kevin
Anshuman Khandual July 22, 2024, 9:16 a.m. UTC | #6
On 5/3/24 18:31, Joey Gouly wrote:
> Add PKEY support to signals, by saving and restoring POR_EL0 from the stackframe.
> 
> Signed-off-by: Joey Gouly <joey.gouly@arm.com>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Reviewed-by: Mark Brown <broonie@kernel.org>
> Acked-by: Szabolcs Nagy <szabolcs.nagy@arm.com>

Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>

> ---
>  arch/arm64/include/uapi/asm/sigcontext.h |  7 ++++
>  arch/arm64/kernel/signal.c               | 52 ++++++++++++++++++++++++
>  2 files changed, 59 insertions(+)
> 
> diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h
> index 8a45b7a411e0..e4cba8a6c9a2 100644
> --- a/arch/arm64/include/uapi/asm/sigcontext.h
> +++ b/arch/arm64/include/uapi/asm/sigcontext.h
> @@ -98,6 +98,13 @@ struct esr_context {
>  	__u64 esr;
>  };
>  
> +#define POE_MAGIC	0x504f4530
> +
> +struct poe_context {
> +	struct _aarch64_ctx head;
> +	__u64 por_el0;
> +};
> +
>  /*
>   * extra_context: describes extra space in the signal frame for
>   * additional structures that don't fit in sigcontext.__reserved[].
> diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
> index 4a77f4976e11..077436a8bc10 100644
> --- a/arch/arm64/kernel/signal.c
> +++ b/arch/arm64/kernel/signal.c
> @@ -63,6 +63,7 @@ struct rt_sigframe_user_layout {
>  	unsigned long fpmr_offset;
>  	unsigned long extra_offset;
>  	unsigned long end_offset;
> +	unsigned long poe_offset;
>  };
>  
>  #define BASE_SIGFRAME_SIZE round_up(sizeof(struct rt_sigframe), 16)
> @@ -185,6 +186,8 @@ struct user_ctxs {
>  	u32 zt_size;
>  	struct fpmr_context __user *fpmr;
>  	u32 fpmr_size;
> +	struct poe_context __user *poe;
> +	u32 poe_size;
>  };
>  
>  static int preserve_fpsimd_context(struct fpsimd_context __user *ctx)
> @@ -258,6 +261,21 @@ static int restore_fpmr_context(struct user_ctxs *user)
>  	return err;
>  }
>  
> +static int restore_poe_context(struct user_ctxs *user)
> +{
> +	u64 por_el0;
> +	int err = 0;
> +
> +	if (user->poe_size != sizeof(*user->poe))
> +		return -EINVAL;
> +
> +	__get_user_error(por_el0, &(user->poe->por_el0), err);
> +	if (!err)
> +		write_sysreg_s(por_el0, SYS_POR_EL0);
> +
> +	return err;
> +}
> +
>  #ifdef CONFIG_ARM64_SVE
>  
>  static int preserve_sve_context(struct sve_context __user *ctx)
> @@ -621,6 +639,7 @@ static int parse_user_sigframe(struct user_ctxs *user,
>  	user->za = NULL;
>  	user->zt = NULL;
>  	user->fpmr = NULL;
> +	user->poe = NULL;
>  
>  	if (!IS_ALIGNED((unsigned long)base, 16))
>  		goto invalid;
> @@ -671,6 +690,17 @@ static int parse_user_sigframe(struct user_ctxs *user,
>  			/* ignore */
>  			break;
>  
> +		case POE_MAGIC:
> +			if (!system_supports_poe())
> +				goto invalid;
> +
> +			if (user->poe)
> +				goto invalid;
> +
> +			user->poe = (struct poe_context __user *)head;
> +			user->poe_size = size;
> +			break;
> +
>  		case SVE_MAGIC:
>  			if (!system_supports_sve() && !system_supports_sme())
>  				goto invalid;
> @@ -857,6 +887,9 @@ static int restore_sigframe(struct pt_regs *regs,
>  	if (err == 0 && system_supports_sme2() && user.zt)
>  		err = restore_zt_context(&user);
>  
> +	if (err == 0 && system_supports_poe() && user.poe)
> +		err = restore_poe_context(&user);
> +
>  	return err;
>  }
>  
> @@ -980,6 +1013,13 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
>  			return err;
>  	}
>  
> +	if (system_supports_poe()) {
> +		err = sigframe_alloc(user, &user->poe_offset,
> +				     sizeof(struct poe_context));
> +		if (err)
> +			return err;
> +	}
> +
>  	return sigframe_alloc_end(user);
>  }
>  
> @@ -1020,6 +1060,15 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
>  		__put_user_error(current->thread.fault_code, &esr_ctx->esr, err);
>  	}
>  
> +	if (system_supports_poe() && err == 0 && user->poe_offset) {
> +		struct poe_context __user *poe_ctx =
> +			apply_user_offset(user, user->poe_offset);
> +
> +		__put_user_error(POE_MAGIC, &poe_ctx->head.magic, err);
> +		__put_user_error(sizeof(*poe_ctx), &poe_ctx->head.size, err);
> +		__put_user_error(read_sysreg_s(SYS_POR_EL0), &poe_ctx->por_el0, err);
> +	}
> +
>  	/* Scalable Vector Extension state (including streaming), if present */
>  	if ((system_supports_sve() || system_supports_sme()) &&
>  	    err == 0 && user->sve_offset) {
> @@ -1178,6 +1227,9 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka,
>  		sme_smstop();
>  	}
>  
> +	if (system_supports_poe())
> +		write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0);
> +
>  	if (ka->sa.sa_flags & SA_RESTORER)
>  		sigtramp = ka->sa.sa_restorer;
>  	else
Dave Martin July 25, 2024, 3:58 p.m. UTC | #7
On Mon, Jun 03, 2024 at 02:51:46PM +0530, Amit Daniel Kachhap wrote:
> 
> 
> On 5/31/24 22:09, Mark Brown wrote:
> > On Tue, May 28, 2024 at 12:26:54PM +0530, Amit Daniel Kachhap wrote:
> > > On 5/3/24 18:31, Joey Gouly wrote:
> > 
> > > > +#define POE_MAGIC	0x504f4530
> > > > +struct poe_context {
> > > > +	struct _aarch64_ctx head;
> > > > +	__u64 por_el0;
> > > > +};
> > 
> > > There is a comment section in the beginning which mentions the size
> > > of the context frame structure and subsequent reduction in the
> > > reserved range. So this new context description can be added there.
> > > Although looks like it is broken for za, zt and fpmr context.
> > 
> > Could you be more specific about how you think these existing contexts
> > are broken?  The above looks perfectly good and standard and the
> > existing contexts do a reasonable simulation of working.  Note that the
> > ZA and ZT contexts don't generate data payload unless userspace has set
> > PSTATE.ZA.
> 
> Sorry for not being clear on this as I was only referring to the
> comments in file arch/arm64/include/uapi/asm/sigcontext.h and no code
> as such is broken.
> 
>  * Allocation of __reserved[]:
>  * (Note: records do not necessarily occur in the order shown here.)
>  *
>  *      size            description
>  *
>  *      0x210           fpsimd_context
>  *       0x10           esr_context
>  *      0x8a0           sve_context (vl <= 64) (optional)
>  *       0x20           extra_context (optional)
>  *       0x10           terminator (null _aarch64_ctx)
>  *
>  *      0x510           (reserved for future allocation)
> 
> Here I think that optional context like za, zt, fpmr and poe should have
> size mentioned here to make the description consistent.As you said ZA
> and ZT context are enabled by userspace so some extra details can be
> added for them too.

Regarding this, __reserved[] is looking very full now.

I'll post a draft patch separately, since I think the update could
benefit from separate discussion, but my back-of-the-envelope
calculation suggests that (before this patch) we are down to 0x90
bytes of free space (i.e., over 96% full).


I wonder whether it is time to start pushing back on adding a new
_foo_context for every individual register, though?

Maybe we could add some kind of _misc_context for miscellaneous 64-bit
regs.

[...]

Cheers
---Dave
Dave Martin July 25, 2024, 4 p.m. UTC | #8
Hi,

On Fri, May 03, 2024 at 02:01:36PM +0100, Joey Gouly wrote:
> Add PKEY support to signals, by saving and restoring POR_EL0 from the stackframe.
> 
> Signed-off-by: Joey Gouly <joey.gouly@arm.com>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Reviewed-by: Mark Brown <broonie@kernel.org>
> Acked-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
> ---
>  arch/arm64/include/uapi/asm/sigcontext.h |  7 ++++
>  arch/arm64/kernel/signal.c               | 52 ++++++++++++++++++++++++
>  2 files changed, 59 insertions(+)
> 
> diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h
> index 8a45b7a411e0..e4cba8a6c9a2 100644
> --- a/arch/arm64/include/uapi/asm/sigcontext.h
> +++ b/arch/arm64/include/uapi/asm/sigcontext.h

[...]

> @@ -980,6 +1013,13 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
>  			return err;
>  	}
>  
> +	if (system_supports_poe()) {
> +		err = sigframe_alloc(user, &user->poe_offset,
> +				     sizeof(struct poe_context));
> +		if (err)
> +			return err;
> +	}
> +
>  	return sigframe_alloc_end(user);
>  }
>  
> @@ -1020,6 +1060,15 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
>  		__put_user_error(current->thread.fault_code, &esr_ctx->esr, err);
>  	}
>  
> +	if (system_supports_poe() && err == 0 && user->poe_offset) {
> +		struct poe_context __user *poe_ctx =
> +			apply_user_offset(user, user->poe_offset);
> +
> +		__put_user_error(POE_MAGIC, &poe_ctx->head.magic, err);
> +		__put_user_error(sizeof(*poe_ctx), &poe_ctx->head.size, err);
> +		__put_user_error(read_sysreg_s(SYS_POR_EL0), &poe_ctx->por_el0, err);
> +	}
> +

Does the AArch64 procedure call standard say anything about whether
POR_EL0 is caller-saved?

<bikeshed>

In theory we could skip saving this register if it is already
POR_EL0_INIT (which it often will be), and if the signal handler is not
supposed to modify and leave the modified value in the register when
returning.

The complexity of the additional check my be a bit pointless though,
and the the handler might theoretically want to change the interrupted
code's POR_EL0 explicitly, which would be complicated if POE_MAGIC is
sometimes there and sometimes not.

</bikeshed>

[...]

Cheers
---Dave
Mark Brown July 25, 2024, 6:11 p.m. UTC | #9
On Thu, Jul 25, 2024 at 04:58:27PM +0100, Dave Martin wrote:

> I'll post a draft patch separately, since I think the update could
> benefit from separate discussion, but my back-of-the-envelope
> calculation suggests that (before this patch) we are down to 0x90
> bytes of free space (i.e., over 96% full).

> I wonder whether it is time to start pushing back on adding a new
> _foo_context for every individual register, though?

> Maybe we could add some kind of _misc_context for miscellaneous 64-bit
> regs.

That'd have to be a variably sized structure with pairs of sysreg
ID/value items in it I think which would be a bit of a pain to implement
but doable.  The per-record header is 64 bits, we'd get maximal saving
by allocating a byte for the IDs.

It would be very unfortunate timing to start gating things on such a
change though (I'm particularly worried about GCS here, at this point
the kernel changes are blocking the entire ecosystem).
Dave Martin July 26, 2024, 4:14 p.m. UTC | #10
On Thu, Jul 25, 2024 at 07:11:41PM +0100, Mark Brown wrote:
> On Thu, Jul 25, 2024 at 04:58:27PM +0100, Dave Martin wrote:
> 
> > I'll post a draft patch separately, since I think the update could
> > benefit from separate discussion, but my back-of-the-envelope
> > calculation suggests that (before this patch) we are down to 0x90
> > bytes of free space (i.e., over 96% full).
> 
> > I wonder whether it is time to start pushing back on adding a new
> > _foo_context for every individual register, though?
> 
> > Maybe we could add some kind of _misc_context for miscellaneous 64-bit
> > regs.
> 
> That'd have to be a variably sized structure with pairs of sysreg
> ID/value items in it I think which would be a bit of a pain to implement
> but doable.  The per-record header is 64 bits, we'd get maximal saving
> by allocating a byte for the IDs.

Or possibly the regs could be identified positionally, avoiding the
need for IDs.  Space would be at a premium, and we would have to think
carefully about what should and should not be allowed in there.

> It would be very unfortunate timing to start gating things on such a
> change though (I'm particularly worried about GCS here, at this point
> the kernel changes are blocking the entire ecosystem).

For GCS, I wonder whether it should be made a strictly opt-in feature:
i.e., if you use it then you must tolerate large sigframes, and if it
is turned off then its state is neither dumped nor restored.  Since GCS
requires an explict prctl to turn it on, the mechanism seems partly
there already in your series.

I guess the GCS thread is the better place to discuss that, though.

Cheers
---Dave
Mark Brown July 26, 2024, 5:39 p.m. UTC | #11
On Fri, Jul 26, 2024 at 05:14:01PM +0100, Dave Martin wrote:
> On Thu, Jul 25, 2024 at 07:11:41PM +0100, Mark Brown wrote:

> > That'd have to be a variably sized structure with pairs of sysreg
> > ID/value items in it I think which would be a bit of a pain to implement
> > but doable.  The per-record header is 64 bits, we'd get maximal saving
> > by allocating a byte for the IDs.

> Or possibly the regs could be identified positionally, avoiding the
> need for IDs.  Space would be at a premium, and we would have to think
> carefully about what should and should not be allowed in there.

Yes, though that would mean if we had to generate any register in there
we'd always have to generate at least as many entries as whatever number
it got assigned which depending on how much optionality ends up getting
used might be unfortunate.

> > It would be very unfortunate timing to start gating things on such a
> > change though (I'm particularly worried about GCS here, at this point
> > the kernel changes are blocking the entire ecosystem).

> For GCS, I wonder whether it should be made a strictly opt-in feature:
> i.e., if you use it then you must tolerate large sigframes, and if it
> is turned off then its state is neither dumped nor restored.  Since GCS
> requires an explict prctl to turn it on, the mechanism seems partly
> there already in your series.

Yeah, that's what the current code does actually.  In any case it's not
just a single register - there's also the GCS mode in there.
Dave Martin July 29, 2024, 2:27 p.m. UTC | #12
On Fri, Jul 26, 2024 at 06:39:27PM +0100, Mark Brown wrote:
> On Fri, Jul 26, 2024 at 05:14:01PM +0100, Dave Martin wrote:
> > On Thu, Jul 25, 2024 at 07:11:41PM +0100, Mark Brown wrote:
> 
> > > That'd have to be a variably sized structure with pairs of sysreg
> > > ID/value items in it I think which would be a bit of a pain to implement
> > > but doable.  The per-record header is 64 bits, we'd get maximal saving
> > > by allocating a byte for the IDs.
> 
> > Or possibly the regs could be identified positionally, avoiding the
> > need for IDs.  Space would be at a premium, and we would have to think
> > carefully about what should and should not be allowed in there.
> 
> Yes, though that would mean if we had to generate any register in there
> we'd always have to generate at least as many entries as whatever number
> it got assigned which depending on how much optionality ends up getting
> used might be unfortunate.

Ack, though it's only 150 bytes or so at most, so just zeroing it all
(or as much as we know about) doesn't feel like a big cost.

It depends how determined we are to squeeze the most out of the
remaining space.


> > > It would be very unfortunate timing to start gating things on such a
> > > change though (I'm particularly worried about GCS here, at this point
> > > the kernel changes are blocking the entire ecosystem).
> 
> > For GCS, I wonder whether it should be made a strictly opt-in feature:
> > i.e., if you use it then you must tolerate large sigframes, and if it
> > is turned off then its state is neither dumped nor restored.  Since GCS
> > requires an explict prctl to turn it on, the mechanism seems partly
> > there already in your series.
> 
> Yeah, that's what the current code does actually.  In any case it's not
> just a single register - there's also the GCS mode in there.

Agreed -- I'll ping the GCS series, but this sounds like a reasonable
starting point.

Cheers
---Dave
Mark Brown July 29, 2024, 2:41 p.m. UTC | #13
On Mon, Jul 29, 2024 at 03:27:11PM +0100, Dave Martin wrote:
> On Fri, Jul 26, 2024 at 06:39:27PM +0100, Mark Brown wrote:

> > Yes, though that would mean if we had to generate any register in there
> > we'd always have to generate at least as many entries as whatever number
> > it got assigned which depending on how much optionality ends up getting
> > used might be unfortunate.

> Ack, though it's only 150 bytes or so at most, so just zeroing it all
> (or as much as we know about) doesn't feel like a big cost.

> It depends how determined we are to squeeze the most out of the
> remaining space.

Indeed, I was more thinking about how it might scale as the number of
extensions grows rather than the current costs.
Joey Gouly Aug. 1, 2024, 3:54 p.m. UTC | #14
On Thu, Jul 25, 2024 at 05:00:18PM +0100, Dave Martin wrote:
> Hi,
> 
> On Fri, May 03, 2024 at 02:01:36PM +0100, Joey Gouly wrote:
> > Add PKEY support to signals, by saving and restoring POR_EL0 from the stackframe.
> > 
> > Signed-off-by: Joey Gouly <joey.gouly@arm.com>
> > Cc: Catalin Marinas <catalin.marinas@arm.com>
> > Cc: Will Deacon <will@kernel.org>
> > Reviewed-by: Mark Brown <broonie@kernel.org>
> > Acked-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
> > ---
> >  arch/arm64/include/uapi/asm/sigcontext.h |  7 ++++
> >  arch/arm64/kernel/signal.c               | 52 ++++++++++++++++++++++++
> >  2 files changed, 59 insertions(+)
> > 
> > diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h
> > index 8a45b7a411e0..e4cba8a6c9a2 100644
> > --- a/arch/arm64/include/uapi/asm/sigcontext.h
> > +++ b/arch/arm64/include/uapi/asm/sigcontext.h
> 
> [...]
> 
> > @@ -980,6 +1013,13 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
> >  			return err;
> >  	}
> >  
> > +	if (system_supports_poe()) {
> > +		err = sigframe_alloc(user, &user->poe_offset,
> > +				     sizeof(struct poe_context));
> > +		if (err)
> > +			return err;
> > +	}
> > +
> >  	return sigframe_alloc_end(user);
> >  }
> >  
> > @@ -1020,6 +1060,15 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
> >  		__put_user_error(current->thread.fault_code, &esr_ctx->esr, err);
> >  	}
> >  
> > +	if (system_supports_poe() && err == 0 && user->poe_offset) {
> > +		struct poe_context __user *poe_ctx =
> > +			apply_user_offset(user, user->poe_offset);
> > +
> > +		__put_user_error(POE_MAGIC, &poe_ctx->head.magic, err);
> > +		__put_user_error(sizeof(*poe_ctx), &poe_ctx->head.size, err);
> > +		__put_user_error(read_sysreg_s(SYS_POR_EL0), &poe_ctx->por_el0, err);
> > +	}
> > +
> 
> Does the AArch64 procedure call standard say anything about whether
> POR_EL0 is caller-saved?

I asked about this, and it doesn't say anything and they don't plan on it,
since it's very application specific.

> 
> <bikeshed>
> 
> In theory we could skip saving this register if it is already
> POR_EL0_INIT (which it often will be), and if the signal handler is not
> supposed to modify and leave the modified value in the register when
> returning.
> 
> The complexity of the additional check my be a bit pointless though,
> and the the handler might theoretically want to change the interrupted
> code's POR_EL0 explicitly, which would be complicated if POE_MAGIC is
> sometimes there and sometimes not.
> 
> </bikeshed>
> 
I think trying to skip/optimise something here would be more effort than any
possible benefits!

Thanks,
Joey
Dave Martin Aug. 1, 2024, 4:22 p.m. UTC | #15
On Thu, Aug 01, 2024 at 04:54:41PM +0100, Joey Gouly wrote:
> On Thu, Jul 25, 2024 at 05:00:18PM +0100, Dave Martin wrote:
> > Hi,
> > 
> > On Fri, May 03, 2024 at 02:01:36PM +0100, Joey Gouly wrote:
> > > Add PKEY support to signals, by saving and restoring POR_EL0 from the stackframe.
> > > 
> > > Signed-off-by: Joey Gouly <joey.gouly@arm.com>
> > > Cc: Catalin Marinas <catalin.marinas@arm.com>
> > > Cc: Will Deacon <will@kernel.org>
> > > Reviewed-by: Mark Brown <broonie@kernel.org>
> > > Acked-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
> > > ---
> > >  arch/arm64/include/uapi/asm/sigcontext.h |  7 ++++
> > >  arch/arm64/kernel/signal.c               | 52 ++++++++++++++++++++++++
> > >  2 files changed, 59 insertions(+)
> > > 
> > > diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h
> > > index 8a45b7a411e0..e4cba8a6c9a2 100644
> > > --- a/arch/arm64/include/uapi/asm/sigcontext.h
> > > +++ b/arch/arm64/include/uapi/asm/sigcontext.h
> > 
> > [...]
> > 
> > > @@ -980,6 +1013,13 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
> > >  			return err;
> > >  	}
> > >  
> > > +	if (system_supports_poe()) {
> > > +		err = sigframe_alloc(user, &user->poe_offset,
> > > +				     sizeof(struct poe_context));
> > > +		if (err)
> > > +			return err;
> > > +	}
> > > +
> > >  	return sigframe_alloc_end(user);
> > >  }
> > >  
> > > @@ -1020,6 +1060,15 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
> > >  		__put_user_error(current->thread.fault_code, &esr_ctx->esr, err);
> > >  	}
> > >  
> > > +	if (system_supports_poe() && err == 0 && user->poe_offset) {
> > > +		struct poe_context __user *poe_ctx =
> > > +			apply_user_offset(user, user->poe_offset);
> > > +
> > > +		__put_user_error(POE_MAGIC, &poe_ctx->head.magic, err);
> > > +		__put_user_error(sizeof(*poe_ctx), &poe_ctx->head.size, err);
> > > +		__put_user_error(read_sysreg_s(SYS_POR_EL0), &poe_ctx->por_el0, err);
> > > +	}
> > > +
> > 
> > Does the AArch64 procedure call standard say anything about whether
> > POR_EL0 is caller-saved?
> 
> I asked about this, and it doesn't say anything and they don't plan on it,
> since it's very application specific.

Right.  I think that confirms that we don't absolutely need to preserve
POR_EL0, because if compiler-generated code was allowed to fiddle with
this and not clean up after itself, the PCS would have to document this.

> > 
> > <bikeshed>
> > 
> > In theory we could skip saving this register if it is already
> > POR_EL0_INIT (which it often will be), and if the signal handler is not
> > supposed to modify and leave the modified value in the register when
> > returning.
> > 
> > The complexity of the additional check my be a bit pointless though,
> > and the the handler might theoretically want to change the interrupted
> > code's POR_EL0 explicitly, which would be complicated if POE_MAGIC is
> > sometimes there and sometimes not.
> > 
> > </bikeshed>
> > 
> I think trying to skip/optimise something here would be more effort than any
> possible benefits!

Actually, having thought about this some more I think that only dumping
this register if != POR_EL0_INIT may be right right thing to do.

This would mean that old binary would stacks never see poe_context in
the signal frame, and so will never experience unexpected stack
overruns (at least, not due solely to the presence of this feature).

POE-aware signal handlers have to do something fiddly and nonportable
to obtain the original value of POR_EL0 regardless, so requiring them
do handle both cases (present in sigframe and absent) doesn't seem too
onerous to me.


Do you think this approach would break any known use cases?

Cheers
---Dave
Joey Gouly Aug. 6, 2024, 10:35 a.m. UTC | #16
On Thu, Aug 01, 2024 at 05:22:45PM +0100, Dave Martin wrote:
> On Thu, Aug 01, 2024 at 04:54:41PM +0100, Joey Gouly wrote:
> > On Thu, Jul 25, 2024 at 05:00:18PM +0100, Dave Martin wrote:
> > > Hi,
> > > 
> > > On Fri, May 03, 2024 at 02:01:36PM +0100, Joey Gouly wrote:
> > > > Add PKEY support to signals, by saving and restoring POR_EL0 from the stackframe.
> > > > 
> > > > Signed-off-by: Joey Gouly <joey.gouly@arm.com>
> > > > Cc: Catalin Marinas <catalin.marinas@arm.com>
> > > > Cc: Will Deacon <will@kernel.org>
> > > > Reviewed-by: Mark Brown <broonie@kernel.org>
> > > > Acked-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
> > > > ---
> > > >  arch/arm64/include/uapi/asm/sigcontext.h |  7 ++++
> > > >  arch/arm64/kernel/signal.c               | 52 ++++++++++++++++++++++++
> > > >  2 files changed, 59 insertions(+)
> > > > 
> > > > diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h
> > > > index 8a45b7a411e0..e4cba8a6c9a2 100644
> > > > --- a/arch/arm64/include/uapi/asm/sigcontext.h
> > > > +++ b/arch/arm64/include/uapi/asm/sigcontext.h
> > > 
> > > [...]
> > > 
> > > > @@ -980,6 +1013,13 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
> > > >  			return err;
> > > >  	}
> > > >  
> > > > +	if (system_supports_poe()) {
> > > > +		err = sigframe_alloc(user, &user->poe_offset,
> > > > +				     sizeof(struct poe_context));
> > > > +		if (err)
> > > > +			return err;
> > > > +	}
> > > > +
> > > >  	return sigframe_alloc_end(user);
> > > >  }
> > > >  
> > > > @@ -1020,6 +1060,15 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
> > > >  		__put_user_error(current->thread.fault_code, &esr_ctx->esr, err);
> > > >  	}
> > > >  
> > > > +	if (system_supports_poe() && err == 0 && user->poe_offset) {
> > > > +		struct poe_context __user *poe_ctx =
> > > > +			apply_user_offset(user, user->poe_offset);
> > > > +
> > > > +		__put_user_error(POE_MAGIC, &poe_ctx->head.magic, err);
> > > > +		__put_user_error(sizeof(*poe_ctx), &poe_ctx->head.size, err);
> > > > +		__put_user_error(read_sysreg_s(SYS_POR_EL0), &poe_ctx->por_el0, err);
> > > > +	}
> > > > +
> > > 
> > > Does the AArch64 procedure call standard say anything about whether
> > > POR_EL0 is caller-saved?
> > 
> > I asked about this, and it doesn't say anything and they don't plan on it,
> > since it's very application specific.
> 
> Right.  I think that confirms that we don't absolutely need to preserve
> POR_EL0, because if compiler-generated code was allowed to fiddle with
> this and not clean up after itself, the PCS would have to document this.
> 
> > > 
> > > <bikeshed>
> > > 
> > > In theory we could skip saving this register if it is already
> > > POR_EL0_INIT (which it often will be), and if the signal handler is not
> > > supposed to modify and leave the modified value in the register when
> > > returning.
> > > 
> > > The complexity of the additional check my be a bit pointless though,
> > > and the the handler might theoretically want to change the interrupted
> > > code's POR_EL0 explicitly, which would be complicated if POE_MAGIC is
> > > sometimes there and sometimes not.
> > > 
> > > </bikeshed>
> > > 
> > I think trying to skip/optimise something here would be more effort than any
> > possible benefits!
> 
> Actually, having thought about this some more I think that only dumping
> this register if != POR_EL0_INIT may be right right thing to do.
> 
> This would mean that old binary would stacks never see poe_context in
> the signal frame, and so will never experience unexpected stack
> overruns (at least, not due solely to the presence of this feature).

They can already see things they don't expect, like FPMR that was added
recently.

> 
> POE-aware signal handlers have to do something fiddly and nonportable
> to obtain the original value of POR_EL0 regardless, so requiring them
> do handle both cases (present in sigframe and absent) doesn't seem too
> onerous to me.

If the signal handler wanted to modify the value, from the default, wouldn't it
need to mess around with the sig context stuff, to allocate some space for
POR_EL0, such that the kernel would restore it properly? (If that's even
possible).

> 
> 
> Do you think this approach would break any known use cases?

Not sure.

> 
> Cheers
> ---Dave
>
Joey Gouly Aug. 6, 2024, 2:31 p.m. UTC | #17
On Tue, Aug 06, 2024 at 11:35:32AM +0100, Joey Gouly wrote:
> On Thu, Aug 01, 2024 at 05:22:45PM +0100, Dave Martin wrote:
> > On Thu, Aug 01, 2024 at 04:54:41PM +0100, Joey Gouly wrote:
> > > On Thu, Jul 25, 2024 at 05:00:18PM +0100, Dave Martin wrote:
> > > > Hi,
> > > > 
> > > > On Fri, May 03, 2024 at 02:01:36PM +0100, Joey Gouly wrote:
> > > > > Add PKEY support to signals, by saving and restoring POR_EL0 from the stackframe.
> > > > > 
> > > > > Signed-off-by: Joey Gouly <joey.gouly@arm.com>
> > > > > Cc: Catalin Marinas <catalin.marinas@arm.com>
> > > > > Cc: Will Deacon <will@kernel.org>
> > > > > Reviewed-by: Mark Brown <broonie@kernel.org>
> > > > > Acked-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
> > > > > ---
> > > > >  arch/arm64/include/uapi/asm/sigcontext.h |  7 ++++
> > > > >  arch/arm64/kernel/signal.c               | 52 ++++++++++++++++++++++++
> > > > >  2 files changed, 59 insertions(+)
> > > > > 
> > > > > diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h
> > > > > index 8a45b7a411e0..e4cba8a6c9a2 100644
> > > > > --- a/arch/arm64/include/uapi/asm/sigcontext.h
> > > > > +++ b/arch/arm64/include/uapi/asm/sigcontext.h
> > > > 
> > > > [...]
> > > > 
> > > > > @@ -980,6 +1013,13 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
> > > > >  			return err;
> > > > >  	}
> > > > >  
> > > > > +	if (system_supports_poe()) {
> > > > > +		err = sigframe_alloc(user, &user->poe_offset,
> > > > > +				     sizeof(struct poe_context));
> > > > > +		if (err)
> > > > > +			return err;
> > > > > +	}
> > > > > +
> > > > >  	return sigframe_alloc_end(user);
> > > > >  }
> > > > >  
> > > > > @@ -1020,6 +1060,15 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
> > > > >  		__put_user_error(current->thread.fault_code, &esr_ctx->esr, err);
> > > > >  	}
> > > > >  
> > > > > +	if (system_supports_poe() && err == 0 && user->poe_offset) {
> > > > > +		struct poe_context __user *poe_ctx =
> > > > > +			apply_user_offset(user, user->poe_offset);
> > > > > +
> > > > > +		__put_user_error(POE_MAGIC, &poe_ctx->head.magic, err);
> > > > > +		__put_user_error(sizeof(*poe_ctx), &poe_ctx->head.size, err);
> > > > > +		__put_user_error(read_sysreg_s(SYS_POR_EL0), &poe_ctx->por_el0, err);
> > > > > +	}
> > > > > +
> > > > 
> > > > Does the AArch64 procedure call standard say anything about whether
> > > > POR_EL0 is caller-saved?
> > > 
> > > I asked about this, and it doesn't say anything and they don't plan on it,
> > > since it's very application specific.
> > 
> > Right.  I think that confirms that we don't absolutely need to preserve
> > POR_EL0, because if compiler-generated code was allowed to fiddle with
> > this and not clean up after itself, the PCS would have to document this.
> > 
> > > > 
> > > > <bikeshed>
> > > > 
> > > > In theory we could skip saving this register if it is already
> > > > POR_EL0_INIT (which it often will be), and if the signal handler is not
> > > > supposed to modify and leave the modified value in the register when
> > > > returning.
> > > > 
> > > > The complexity of the additional check my be a bit pointless though,
> > > > and the the handler might theoretically want to change the interrupted
> > > > code's POR_EL0 explicitly, which would be complicated if POE_MAGIC is
> > > > sometimes there and sometimes not.
> > > > 
> > > > </bikeshed>
> > > > 
> > > I think trying to skip/optimise something here would be more effort than any
> > > possible benefits!
> > 
> > Actually, having thought about this some more I think that only dumping
> > this register if != POR_EL0_INIT may be right right thing to do.
> > 
> > This would mean that old binary would stacks never see poe_context in
> > the signal frame, and so will never experience unexpected stack
> > overruns (at least, not due solely to the presence of this feature).
> 
> They can already see things they don't expect, like FPMR that was added
> recently.
> 
> > 
> > POE-aware signal handlers have to do something fiddly and nonportable
> > to obtain the original value of POR_EL0 regardless, so requiring them
> > do handle both cases (present in sigframe and absent) doesn't seem too
> > onerous to me.
> 
> If the signal handler wanted to modify the value, from the default, wouldn't it
> need to mess around with the sig context stuff, to allocate some space for
> POR_EL0, such that the kernel would restore it properly? (If that's even
> possible).
> 
> > 
> > 
> > Do you think this approach would break any known use cases?
> 
> Not sure.
> 

We talked about this offline, helped me understand it more, and I think
something like this makes sense:

diff --git arch/arm64/kernel/signal.c arch/arm64/kernel/signal.c
index 561986947530..ca7d4e0be275 100644
--- arch/arm64/kernel/signal.c
+++ arch/arm64/kernel/signal.c
@@ -1024,7 +1025,10 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
                        return err;
        }
 
-       if (system_supports_poe()) {
+       if (system_supports_poe() &&
+                       (add_all ||
+                        mm_pkey_allocation_map(current->mm) != 0x1 ||
+                        read_sysreg_s(SYS_POR_EL0) != POR_EL0_INIT)) {
                err = sigframe_alloc(user, &user->poe_offset,
                                     sizeof(struct poe_context));
                if (err)


That is, we only save the POR_EL0 value if any pkeys have been allocated (other
than pkey 0) *or* if POR_EL0 is a non-default value.

The latter case is a corner case, where a userspace would have changed POR_EL0
before allocating any extra pkeys.
That could be:
	- pkey 0, if it restricts pkey 0 without allocating other pkeys, it's
	  unlikely the program can do anything useful anyway
	- Another pkey, which userspace probably shouldn't do anyway.
	  The man pages say:
		The kernel guarantees that the contents of the hardware rights
		register (PKRU) will be preserved only for allocated protection keys. Any time
		a key is unallocated (either before the first call returning that key from
		pkey_alloc() or after it is freed via pkey_free()), the kernel may make
		arbitrary changes to the parts of the rights register affecting access to that
		key.
	  So userspace shouldn't be changing POR_EL0 before allocating pkeys anyway..

Thanks,
Joey
Dave Martin Aug. 6, 2024, 3 p.m. UTC | #18
On Tue, Aug 06, 2024 at 03:31:03PM +0100, Joey Gouly wrote:
> On Tue, Aug 06, 2024 at 11:35:32AM +0100, Joey Gouly wrote:
> > On Thu, Aug 01, 2024 at 05:22:45PM +0100, Dave Martin wrote:
> > > On Thu, Aug 01, 2024 at 04:54:41PM +0100, Joey Gouly wrote:
> > > > On Thu, Jul 25, 2024 at 05:00:18PM +0100, Dave Martin wrote:
> > > > > Hi,
> > > > > 
> > > > > On Fri, May 03, 2024 at 02:01:36PM +0100, Joey Gouly wrote:
> > > > > > Add PKEY support to signals, by saving and restoring POR_EL0 from the stackframe.
> > > > > > 
> > > > > > Signed-off-by: Joey Gouly <joey.gouly@arm.com>
> > > > > > Cc: Catalin Marinas <catalin.marinas@arm.com>
> > > > > > Cc: Will Deacon <will@kernel.org>
> > > > > > Reviewed-by: Mark Brown <broonie@kernel.org>
> > > > > > Acked-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
> > > > > > ---
> > > > > >  arch/arm64/include/uapi/asm/sigcontext.h |  7 ++++
> > > > > >  arch/arm64/kernel/signal.c               | 52 ++++++++++++++++++++++++
> > > > > >  2 files changed, 59 insertions(+)
> > > > > > 
> > > > > > diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h
> > > > > > index 8a45b7a411e0..e4cba8a6c9a2 100644
> > > > > > --- a/arch/arm64/include/uapi/asm/sigcontext.h
> > > > > > +++ b/arch/arm64/include/uapi/asm/sigcontext.h
> > > > > 
> > > > > [...]
> > > > > 
> > > > > > @@ -980,6 +1013,13 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
> > > > > >  			return err;
> > > > > >  	}
> > > > > >  
> > > > > > +	if (system_supports_poe()) {
> > > > > > +		err = sigframe_alloc(user, &user->poe_offset,
> > > > > > +				     sizeof(struct poe_context));
> > > > > > +		if (err)
> > > > > > +			return err;
> > > > > > +	}
> > > > > > +
> > > > > >  	return sigframe_alloc_end(user);
> > > > > >  }
> > > > > >  
> > > > > > @@ -1020,6 +1060,15 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
> > > > > >  		__put_user_error(current->thread.fault_code, &esr_ctx->esr, err);
> > > > > >  	}
> > > > > >  
> > > > > > +	if (system_supports_poe() && err == 0 && user->poe_offset) {
> > > > > > +		struct poe_context __user *poe_ctx =
> > > > > > +			apply_user_offset(user, user->poe_offset);
> > > > > > +
> > > > > > +		__put_user_error(POE_MAGIC, &poe_ctx->head.magic, err);
> > > > > > +		__put_user_error(sizeof(*poe_ctx), &poe_ctx->head.size, err);
> > > > > > +		__put_user_error(read_sysreg_s(SYS_POR_EL0), &poe_ctx->por_el0, err);
> > > > > > +	}
> > > > > > +
> > > > > 
> > > > > Does the AArch64 procedure call standard say anything about whether
> > > > > POR_EL0 is caller-saved?
> > > > 
> > > > I asked about this, and it doesn't say anything and they don't plan on it,
> > > > since it's very application specific.
> > > 
> > > Right.  I think that confirms that we don't absolutely need to preserve
> > > POR_EL0, because if compiler-generated code was allowed to fiddle with
> > > this and not clean up after itself, the PCS would have to document this.
> > > 
> > > > > 
> > > > > <bikeshed>
> > > > > 
> > > > > In theory we could skip saving this register if it is already
> > > > > POR_EL0_INIT (which it often will be), and if the signal handler is not
> > > > > supposed to modify and leave the modified value in the register when
> > > > > returning.
> > > > > 
> > > > > The complexity of the additional check my be a bit pointless though,
> > > > > and the the handler might theoretically want to change the interrupted
> > > > > code's POR_EL0 explicitly, which would be complicated if POE_MAGIC is
> > > > > sometimes there and sometimes not.
> > > > > 
> > > > > </bikeshed>
> > > > > 
> > > > I think trying to skip/optimise something here would be more effort than any
> > > > possible benefits!
> > > 
> > > Actually, having thought about this some more I think that only dumping
> > > this register if != POR_EL0_INIT may be right right thing to do.
> > > 
> > > This would mean that old binary would stacks never see poe_context in
> > > the signal frame, and so will never experience unexpected stack
> > > overruns (at least, not due solely to the presence of this feature).
> > 
> > They can already see things they don't expect, like FPMR that was added
> > recently.
> > 
> > > 
> > > POE-aware signal handlers have to do something fiddly and nonportable
> > > to obtain the original value of POR_EL0 regardless, so requiring them
> > > do handle both cases (present in sigframe and absent) doesn't seem too
> > > onerous to me.
> > 
> > If the signal handler wanted to modify the value, from the default, wouldn't it
> > need to mess around with the sig context stuff, to allocate some space for
> > POR_EL0, such that the kernel would restore it properly? (If that's even
> > possible).
> > 
> > > 
> > > 
> > > Do you think this approach would break any known use cases?
> > 
> > Not sure.
> > 
> 
> We talked about this offline, helped me understand it more, and I think
> something like this makes sense:
> 
> diff --git arch/arm64/kernel/signal.c arch/arm64/kernel/signal.c
> index 561986947530..ca7d4e0be275 100644
> --- arch/arm64/kernel/signal.c
> +++ arch/arm64/kernel/signal.c
> @@ -1024,7 +1025,10 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
>                         return err;
>         }
>  
> -       if (system_supports_poe()) {
> +       if (system_supports_poe() &&
> +                       (add_all ||
> +                        mm_pkey_allocation_map(current->mm) != 0x1 ||

We probably ought to holding the mm lock for read around this (as in
mm/mprotect.c:pkey_alloc()), or have a wrapper to encapsulate that.

Signal delivery is not fast path, so I think we should stick to the
simple and obviously correct approach rather than trying to be
lockless (at least until somebody comes up with a compelling reason to
change it).

If doing that, we should probably put the condition on the allocation
map last so that we don't take the lock unnecessarily.

> +                        read_sysreg_s(SYS_POR_EL0) != POR_EL0_INIT)) {
>                 err = sigframe_alloc(user, &user->poe_offset,
>                                      sizeof(struct poe_context));
>                 if (err)
> 
> 
> That is, we only save the POR_EL0 value if any pkeys have been allocated (other
> than pkey 0) *or* if POR_EL0 is a non-default value.
> 
> The latter case is a corner case, where a userspace would have changed POR_EL0
> before allocating any extra pkeys.
> That could be:
> 	- pkey 0, if it restricts pkey 0 without allocating other pkeys, it's
> 	  unlikely the program can do anything useful anyway
> 	- Another pkey, which userspace probably shouldn't do anyway.
> 	  The man pages say:
> 		The kernel guarantees that the contents of the hardware rights
> 		register (PKRU) will be preserved only for allocated protection keys. Any time
> 		a key is unallocated (either before the first call returning that key from
> 		pkey_alloc() or after it is freed via pkey_free()), the kernel may make
> 		arbitrary changes to the parts of the rights register affecting access to that
> 		key.
> 	  So userspace shouldn't be changing POR_EL0 before allocating pkeys anyway..
> 
> Thanks,
> Joey

This seems better, thanks.

I'll leave it for others to comment on whether this is an issue for any
pkeys use case, but it does mean that non-POE-aware arm64 code
shouldn't see any impact on the signal handling side.

Your new approach means that poe_context is always present in the
sigframe for code that is using POE, so I think that reasonable
scenarios of wanting to change the POR_EL0 value for sigreturn ought
to work.

Processes that contain a mixture of POE and non-POE code are a bit more
of a grey area, but I think that libraries should not be arbitrarily
commandeering pkeys since they have no way to be sure that won't break
the calling program...  I'm assuming that we won't have to worry about
that scenario in practice.

Cheers
---Dave
Catalin Marinas Aug. 14, 2024, 3:03 p.m. UTC | #19
Hi Joey,

On Tue, Aug 06, 2024 at 03:31:03PM +0100, Joey Gouly wrote:
> diff --git arch/arm64/kernel/signal.c arch/arm64/kernel/signal.c
> index 561986947530..ca7d4e0be275 100644
> --- arch/arm64/kernel/signal.c
> +++ arch/arm64/kernel/signal.c
> @@ -1024,7 +1025,10 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
>                         return err;
>         }
>  
> -       if (system_supports_poe()) {
> +       if (system_supports_poe() &&
> +                       (add_all ||
> +                        mm_pkey_allocation_map(current->mm) != 0x1 ||
> +                        read_sysreg_s(SYS_POR_EL0) != POR_EL0_INIT)) {
>                 err = sigframe_alloc(user, &user->poe_offset,
>                                      sizeof(struct poe_context));
>                 if (err)
> 
> 
> That is, we only save the POR_EL0 value if any pkeys have been allocated (other
> than pkey 0) *or* if POR_EL0 is a non-default value.

I had a chat with Dave as well on this and, in principle, we don't want
to add stuff to the signal frame unnecessarily, especially for old
binaries that have no clue of pkeys. OTOH, it looks like too complicated
for just 16 bytes. Also POR_EL0 all RWX is a valid combination, I don't
think we should exclude it.

If no pkey has been allocated, I guess we could skip this and it also
matches the x86 description of the PKRU being guaranteed to be preserved
only for the allocated keys. Do we reserve pkey 0 for arm64? I thought
that's only an x86 thing to emulate execute-only mappings.

Another corner case would be the signal handler doing a pkey_alloc() and
willing to populate POR_EL0 on sigreturn. It will have to find room in
the signal handler, though I don't think that's a problem.
Joey Gouly Aug. 15, 2024, 1:18 p.m. UTC | #20
Hi Catalin,

On Wed, Aug 14, 2024 at 04:03:47PM +0100, Catalin Marinas wrote:
> Hi Joey,
> 
> On Tue, Aug 06, 2024 at 03:31:03PM +0100, Joey Gouly wrote:
> > diff --git arch/arm64/kernel/signal.c arch/arm64/kernel/signal.c
> > index 561986947530..ca7d4e0be275 100644
> > --- arch/arm64/kernel/signal.c
> > +++ arch/arm64/kernel/signal.c
> > @@ -1024,7 +1025,10 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
> >                         return err;
> >         }
> >  
> > -       if (system_supports_poe()) {
> > +       if (system_supports_poe() &&
> > +                       (add_all ||
> > +                        mm_pkey_allocation_map(current->mm) != 0x1 ||
> > +                        read_sysreg_s(SYS_POR_EL0) != POR_EL0_INIT)) {
> >                 err = sigframe_alloc(user, &user->poe_offset,
> >                                      sizeof(struct poe_context));
> >                 if (err)
> > 
> > 
> > That is, we only save the POR_EL0 value if any pkeys have been allocated (other
> > than pkey 0) *or* if POR_EL0 is a non-default value.
> 
> I had a chat with Dave as well on this and, in principle, we don't want
> to add stuff to the signal frame unnecessarily, especially for old
> binaries that have no clue of pkeys. OTOH, it looks like too complicated
> for just 16 bytes. Also POR_EL0 all RWX is a valid combination, I don't
> think we should exclude it.
> 
> If no pkey has been allocated, I guess we could skip this and it also
> matches the x86 description of the PKRU being guaranteed to be preserved
> only for the allocated keys. Do we reserve pkey 0 for arm64? I thought
> that's only an x86 thing to emulate execute-only mappings.

To make it less complicated, I could drop the POR_EL0 check and just do:

-       if (system_supports_poe()) {
+       if (system_supports_poe() &&
+                       (add_all ||
+                        mm_pkey_allocation_map(current->mm) != 0x1) {

This wouldn't preserve the value of POR_EL0 if no pkeys had been allocated, but
that is fine, as you said / the man pages say.

We don't preserve pkey 0, but it is the default for mappings and defaults to
RWX. So changing it probably will lead to unexpected things.

> 
> Another corner case would be the signal handler doing a pkey_alloc() and
> willing to populate POR_EL0 on sigreturn. It will have to find room in
> the signal handler, though I don't think that's a problem.

pkey_alloc() doesn't appear in the signal safety man page, but that might just
be an omission due to permission keys being newer, than actually saying
pkey_alloc() can't be used.

If POR_EL0 isn't in the sig context, I think the signal handler could just
write the POR_EL0 system register directly? The kernel wouldn't restore POR_EL0
in that case, so the value set in the signal handler would just be preserved.

The reason that trying to preserve the value of POR_EL0 without any pkeys
allocated (like in the patch in my previous e-mail had) doesn't really make
sense, is that when you do pkey_alloc() you have to pass an initial value for
the pkey, so that will overwite what you may have manually written into
POR_EL0. Also you can't pass an unallocated pkey value to pkey_mprotect().


That's a lot of words to say, or ask, do you agree with the approach of only
saving POR_EL0 in the signal frame if num_allocated_pkeys() > 1?

Thanks,
Joey
Dave Martin Aug. 15, 2024, 3:09 p.m. UTC | #21
On Thu, Aug 15, 2024 at 02:18:15PM +0100, Joey Gouly wrote:
> Hi Catalin,
> 
> On Wed, Aug 14, 2024 at 04:03:47PM +0100, Catalin Marinas wrote:
> > Hi Joey,
> > 
> > On Tue, Aug 06, 2024 at 03:31:03PM +0100, Joey Gouly wrote:
> > > diff --git arch/arm64/kernel/signal.c arch/arm64/kernel/signal.c
> > > index 561986947530..ca7d4e0be275 100644
> > > --- arch/arm64/kernel/signal.c
> > > +++ arch/arm64/kernel/signal.c
> > > @@ -1024,7 +1025,10 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
> > >                         return err;
> > >         }
> > >  
> > > -       if (system_supports_poe()) {
> > > +       if (system_supports_poe() &&
> > > +                       (add_all ||
> > > +                        mm_pkey_allocation_map(current->mm) != 0x1 ||
> > > +                        read_sysreg_s(SYS_POR_EL0) != POR_EL0_INIT)) {
> > >                 err = sigframe_alloc(user, &user->poe_offset,
> > >                                      sizeof(struct poe_context));
> > >                 if (err)
> > > 
> > > 
> > > That is, we only save the POR_EL0 value if any pkeys have been allocated (other
> > > than pkey 0) *or* if POR_EL0 is a non-default value.
> > 
> > I had a chat with Dave as well on this and, in principle, we don't want
> > to add stuff to the signal frame unnecessarily, especially for old
> > binaries that have no clue of pkeys. OTOH, it looks like too complicated
> > for just 16 bytes. Also POR_EL0 all RWX is a valid combination, I don't
> > think we should exclude it.

Unfortunately, this is always going to be the obviously simpler and
more robust option for dealing with any new register state.

In effect, the policy will be to push back on unconditional additions
to the signal frame, except for 100% of proposed additions...


I'm coming round to the view that trying to provide absolute guarantees
about the signal frame size is unsustainable.  x86 didn't, and got away
with it for some time...  Maybe we should just get rid of the relevant
comments in headers, and water down guarantees in the SVE/SME
documentation to recommendations with no promise attached?

I can propose a patch for that.

> > 
> > If no pkey has been allocated, I guess we could skip this and it also
> > matches the x86 description of the PKRU being guaranteed to be preserved
> > only for the allocated keys. Do we reserve pkey 0 for arm64? I thought
> > that's only an x86 thing to emulate execute-only mappings.

It's not clear whether pkey 0 is reserved in the sense of being
permanently allocated, or in the sense of being unavailable for
allocation.

Since userspace gets pages with pkey 0 by default and can fiddle with
the permissions on POR_EL0 and set this pkey onto pages using
pkey_mprotect(), I'd say pkey 0 counts as always allocated; and the
value of the POR_EL0 bits for pkey 0 needs to be maintained.

> 
> To make it less complicated, I could drop the POR_EL0 check and just do:
> 
> -       if (system_supports_poe()) {
> +       if (system_supports_poe() &&
> +                       (add_all ||
> +                        mm_pkey_allocation_map(current->mm) != 0x1) {
> 
> This wouldn't preserve the value of POR_EL0 if no pkeys had been allocated, but
> that is fine, as you said / the man pages say.
> 
> We don't preserve pkey 0, but it is the default for mappings and defaults to
> RWX. So changing it probably will lead to unexpected things.
> 
> > 
> > Another corner case would be the signal handler doing a pkey_alloc() and
> > willing to populate POR_EL0 on sigreturn. It will have to find room in
> > the signal handler, though I don't think that's a problem.
> 
> pkey_alloc() doesn't appear in the signal safety man page, but that might just
> be an omission due to permission keys being newer, than actually saying
> pkey_alloc() can't be used.

In practice this is likely to be a thin syscall wrapper; those are
async-signal-safe in practice on Linux (but documentation tends to take
a while to catch up).  (Exceptions exists where "safe" calls are used
in ways that interfere with the internal operation of libc... but those
cases are mostly at least semi-obvious and rarely documented.)

Using pkey_alloc() in a signal handler doesn't seem a great idea for
more straightforward reasons, though:  pkeys are a scarce, per-process
resource, and allocating them asynchronously in the presence of other
threads etc., doesn't seem like a recipe for success.

I haven't looked, but I'd be surprised if there's any code doing this!

Generally, it's too late to allocate any non-trivial kind of resource
one you're in a signal handler... you need to plan ahead.

> 
> If POR_EL0 isn't in the sig context, I think the signal handler could just
> write the POR_EL0 system register directly? The kernel wouldn't restore POR_EL0
> in that case, so the value set in the signal handler would just be preserved.
> 
> The reason that trying to preserve the value of POR_EL0 without any pkeys
> allocated (like in the patch in my previous e-mail had) doesn't really make
> sense, is that when you do pkey_alloc() you have to pass an initial value for
> the pkey, so that will overwite what you may have manually written into
> POR_EL0. Also you can't pass an unallocated pkey value to pkey_mprotect().

My argument here was that from the signal handler's point of view, the
POR_EL0 value of the interrupted context lives in the sigframe if it's
there (and will then be restored from there), and directly in POR_EL0
otherwise.  Parsing the sigframe determine where the image of POR_EL0 is.

I see two potential problems.

1) (probably not a big deal in practice)

If the signal handler wants to withdraw a permission from pkey 0 for
the interrupted context, and the interrupted context had no permission
on any other pkey (so POR_EL0 is not dumped and the handler must update
POR_EL0 directly before returning).

In this scenario, the interrupted code would explode on return unless
it can cope with globally execute-only or execute-read-only permissions.
(no-execute is obviously dead on arrival).

If a signal handler really really wanted to do this, it could return
through an asm trampoline that is able to cope with the reduced
permissions.  This seems like a highly contrived scenario though, and I
can't see how it could be useful...

2) (possibly a bigger deal) pkeys(7) does say explicitly (well, sort of)
that the PKRU bits are restored on sigreturn.

Since there are generic APIs to manipulate pkeys, it might cause
problems if sigreturn restores the pkey permissions on some arches
but not on others.  Some non-x86-specific software might already be 
relying on the restoration of the permissions.


> That's a lot of words to say, or ask, do you agree with the approach of only
> saving POR_EL0 in the signal frame if num_allocated_pkeys() > 1?
> 
> Thanks,
> Joey

...So..., given all the above, it is perhaps best to go back to
dumping POR_EL0 unconditionally after all, unless we have a mechanism
to determine whether pkeys are in use at all.

If we initially trapped POR_EL0, and set a flag the first time it is
accessed or one of the pkeys syscalls is used, then we could dump
POR_EL0 conditionally based on that: once the flag is set, we always
dump it for that process.  If the first POR_EL0 access or pkeys API
interaction is in a signal handler, and that handler modifies POR_EL0,
then it wouldn't get restored (or at least, not automatically).  Not
sure if this would ever matter in practice.

It's potentially fiddly to make it work 100% consistently though (does
a sigreturn count as a write?  What if the first access to POR_EL0 is
through ptrace, etc.?)

Cheers
---Dave
Mark Brown Aug. 15, 2024, 3:24 p.m. UTC | #22
On Thu, Aug 15, 2024 at 04:09:26PM +0100, Dave Martin wrote:

> I'm coming round to the view that trying to provide absolute guarantees
> about the signal frame size is unsustainable.  x86 didn't, and got away
> with it for some time...  Maybe we should just get rid of the relevant
> comments in headers, and water down guarantees in the SVE/SME
> documentation to recommendations with no promise attached?

I tend to agree, especially given that even within the fixed size our
layout is variable.  It creates contortions and realistically the big
issue is the vector extensions rather than anything else.  There we're
kind of constrained in what we can do.
Catalin Marinas Aug. 19, 2024, 5:09 p.m. UTC | #23
On Thu, Aug 15, 2024 at 04:09:26PM +0100, Dave P Martin wrote:
> On Thu, Aug 15, 2024 at 02:18:15PM +0100, Joey Gouly wrote:
> > That's a lot of words to say, or ask, do you agree with the approach of only
> > saving POR_EL0 in the signal frame if num_allocated_pkeys() > 1?
> > 
> > Thanks,
> > Joey
> 
> ...So..., given all the above, it is perhaps best to go back to
> dumping POR_EL0 unconditionally after all, unless we have a mechanism
> to determine whether pkeys are in use at all.

Ah, I can see why checking for POR_EL0_INIT is useful. Only checking for
the allocated keys gets confusing with pkey 0.

Not sure what the deal is with pkey 0. Is it considered allocated by
default or unallocatable? If the former, it implies that pkeys are
already in use (hence the additional check for POR_EL0_INIT). In
principle the hardware allows us to use permissions where the pkeys do
not apply but we'd run out of indices and PTE bits to encode them, so I
think by default we should assume that pkey 0 is pre-allocated.

So I agree that it's probably best to save it unconditionally.
Joey Gouly Aug. 20, 2024, 9:54 a.m. UTC | #24
On Mon, Aug 19, 2024 at 06:09:06PM +0100, Catalin Marinas wrote:
> On Thu, Aug 15, 2024 at 04:09:26PM +0100, Dave P Martin wrote:
> > On Thu, Aug 15, 2024 at 02:18:15PM +0100, Joey Gouly wrote:
> > > That's a lot of words to say, or ask, do you agree with the approach of only
> > > saving POR_EL0 in the signal frame if num_allocated_pkeys() > 1?
> > > 
> > > Thanks,
> > > Joey
> > 
> > ...So..., given all the above, it is perhaps best to go back to
> > dumping POR_EL0 unconditionally after all, unless we have a mechanism
> > to determine whether pkeys are in use at all.
> 
> Ah, I can see why checking for POR_EL0_INIT is useful. Only checking for
> the allocated keys gets confusing with pkey 0.
> 
> Not sure what the deal is with pkey 0. Is it considered allocated by
> default or unallocatable? If the former, it implies that pkeys are
> already in use (hence the additional check for POR_EL0_INIT). In
> principle the hardware allows us to use permissions where the pkeys do
> not apply but we'd run out of indices and PTE bits to encode them, so I
> think by default we should assume that pkey 0 is pre-allocated.
> 
> 

You can consider pkey 0 allocated by default. You can actually pkey_free(0), there's nothing stopping that.

> So I agree that it's probably best to save it unconditionally.

Alright, will leave it as is!

Thanks,
Joey
Dave Martin Aug. 20, 2024, 1:54 p.m. UTC | #25
On Tue, Aug 20, 2024 at 10:54:41AM +0100, Joey Gouly wrote:
> On Mon, Aug 19, 2024 at 06:09:06PM +0100, Catalin Marinas wrote:
> > On Thu, Aug 15, 2024 at 04:09:26PM +0100, Dave P Martin wrote:
> > > On Thu, Aug 15, 2024 at 02:18:15PM +0100, Joey Gouly wrote:
> > > > That's a lot of words to say, or ask, do you agree with the approach of only
> > > > saving POR_EL0 in the signal frame if num_allocated_pkeys() > 1?
> > > > 
> > > > Thanks,
> > > > Joey
> > > 
> > > ...So..., given all the above, it is perhaps best to go back to
> > > dumping POR_EL0 unconditionally after all, unless we have a mechanism
> > > to determine whether pkeys are in use at all.
> > 
> > Ah, I can see why checking for POR_EL0_INIT is useful. Only checking for
> > the allocated keys gets confusing with pkey 0.
> > 
> > Not sure what the deal is with pkey 0. Is it considered allocated by
> > default or unallocatable? If the former, it implies that pkeys are
> > already in use (hence the additional check for POR_EL0_INIT). In
> > principle the hardware allows us to use permissions where the pkeys do
> > not apply but we'd run out of indices and PTE bits to encode them, so I
> > think by default we should assume that pkey 0 is pre-allocated.
> > 
> > 
> 
> You can consider pkey 0 allocated by default. You can actually pkey_free(0), there's nothing stopping that.

Is that intentional?

You're not supposed to free pkeys that are in use, and it's quasi-
impossible to know whether pkey 0 is in use: all binaries in the
process assume that pkey is available and use it by default for their
pages, plus the stack will be painted with pkey 0, and the vDSO has to
be painted with some pkey.

Actually, that's a good point, because of the vDSO I think that only
special bits of code with a private ABI (e.g., JITted code etc.) that
definitely don't call into the vDSO can block permissions on pkey 0...
otherwise, stuff will break.

> 
> > So I agree that it's probably best to save it unconditionally.
> 
> Alright, will leave it as is!

Ack, I think the whole discussion around this has shown that there
isn't a _simple_ argument for conditionally dumping POR_EL0... so I'm
prepared to admit defeat here.

We might still try to slow down the consumption of the remaining space
with a "misc registers" record, instead of dedicating a record to
POR_EL0.  I have some thoughts on that, but if nobody cares that much
then this probably isn't worth pursuing.

Cheers
---Dave
Joey Gouly Aug. 20, 2024, 2:06 p.m. UTC | #26
On Tue, Aug 20, 2024 at 02:54:50PM +0100, Dave Martin wrote:
> On Tue, Aug 20, 2024 at 10:54:41AM +0100, Joey Gouly wrote:
> > On Mon, Aug 19, 2024 at 06:09:06PM +0100, Catalin Marinas wrote:
> > > On Thu, Aug 15, 2024 at 04:09:26PM +0100, Dave P Martin wrote:
> > > > On Thu, Aug 15, 2024 at 02:18:15PM +0100, Joey Gouly wrote:
> > > > > That's a lot of words to say, or ask, do you agree with the approach of only
> > > > > saving POR_EL0 in the signal frame if num_allocated_pkeys() > 1?
> > > > > 
> > > > > Thanks,
> > > > > Joey
> > > > 
> > > > ...So..., given all the above, it is perhaps best to go back to
> > > > dumping POR_EL0 unconditionally after all, unless we have a mechanism
> > > > to determine whether pkeys are in use at all.
> > > 
> > > Ah, I can see why checking for POR_EL0_INIT is useful. Only checking for
> > > the allocated keys gets confusing with pkey 0.
> > > 
> > > Not sure what the deal is with pkey 0. Is it considered allocated by
> > > default or unallocatable? If the former, it implies that pkeys are
> > > already in use (hence the additional check for POR_EL0_INIT). In
> > > principle the hardware allows us to use permissions where the pkeys do
> > > not apply but we'd run out of indices and PTE bits to encode them, so I
> > > think by default we should assume that pkey 0 is pre-allocated.
> > > 
> > > 
> > 
> > You can consider pkey 0 allocated by default. You can actually pkey_free(0), there's nothing stopping that.
> 
> Is that intentional?

I don't really know? It's intentional from my side in that it, I allow it,
because it doesn't look like x86 or PPC block pkey_free(0).

I found this code that does pkey_free(0), but obviously it's a bit of a weird test case:

	https://github.com/ColinIanKing/stress-ng/blob/master/test/test-pkey-free.c#L29

> 
> You're not supposed to free pkeys that are in use, and it's quasi-
> impossible to know whether pkey 0 is in use: all binaries in the
> process assume that pkey is available and use it by default for their
> pages, plus the stack will be painted with pkey 0, and the vDSO has to
> be painted with some pkey.
> 
> Actually, that's a good point, because of the vDSO I think that only
> special bits of code with a private ABI (e.g., JITted code etc.) that
> definitely don't call into the vDSO can block permissions on pkey 0...
> otherwise, stuff will break.
> 
> > 
> > > So I agree that it's probably best to save it unconditionally.
> > 
> > Alright, will leave it as is!
> 
> Ack, I think the whole discussion around this has shown that there
> isn't a _simple_ argument for conditionally dumping POR_EL0... so I'm
> prepared to admit defeat here.
> 
> We might still try to slow down the consumption of the remaining space
> with a "misc registers" record, instead of dedicating a record to
> POR_EL0.  I have some thoughts on that, but if nobody cares that much
> then this probably isn't worth pursuing.
> 
> Cheers
> ---Dave
>
Dave Martin Aug. 20, 2024, 2:45 p.m. UTC | #27
On Tue, Aug 20, 2024 at 03:06:06PM +0100, Joey Gouly wrote:
> On Tue, Aug 20, 2024 at 02:54:50PM +0100, Dave Martin wrote:
> > On Tue, Aug 20, 2024 at 10:54:41AM +0100, Joey Gouly wrote:
> > > On Mon, Aug 19, 2024 at 06:09:06PM +0100, Catalin Marinas wrote:
> > > > On Thu, Aug 15, 2024 at 04:09:26PM +0100, Dave P Martin wrote:
> > > > > On Thu, Aug 15, 2024 at 02:18:15PM +0100, Joey Gouly wrote:
> > > > > > That's a lot of words to say, or ask, do you agree with the approach of only
> > > > > > saving POR_EL0 in the signal frame if num_allocated_pkeys() > 1?
> > > > > > 
> > > > > > Thanks,
> > > > > > Joey
> > > > > 
> > > > > ...So..., given all the above, it is perhaps best to go back to
> > > > > dumping POR_EL0 unconditionally after all, unless we have a mechanism
> > > > > to determine whether pkeys are in use at all.
> > > > 
> > > > Ah, I can see why checking for POR_EL0_INIT is useful. Only checking for
> > > > the allocated keys gets confusing with pkey 0.
> > > > 
> > > > Not sure what the deal is with pkey 0. Is it considered allocated by
> > > > default or unallocatable? If the former, it implies that pkeys are
> > > > already in use (hence the additional check for POR_EL0_INIT). In
> > > > principle the hardware allows us to use permissions where the pkeys do
> > > > not apply but we'd run out of indices and PTE bits to encode them, so I
> > > > think by default we should assume that pkey 0 is pre-allocated.
> > > > 
> > > > 
> > > 
> > > You can consider pkey 0 allocated by default. You can actually pkey_free(0), there's nothing stopping that.
> > 
> > Is that intentional?
> 
> I don't really know? It's intentional from my side in that it, I allow it,
> because it doesn't look like x86 or PPC block pkey_free(0).
> 
> I found this code that does pkey_free(0), but obviously it's a bit of a weird test case:
> 
> 	https://github.com/ColinIanKing/stress-ng/blob/master/test/test-pkey-free.c#L29

Of course, pkey 0 will still be in use for everything, and if the man
pages are to be believed, the PKRU bits for pkey 0 may no longer be
maintained after this call...

So this test is possibly a little braindead.  A clear use-case for
freeing pkey 0 would be more convincing.

In the meantime though, it makes most sense for arm64 to follow
the precedent set by other arches on this (as you did).

[...]

Cheers
---Dave
diff mbox series

Patch

diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h
index 8a45b7a411e0..e4cba8a6c9a2 100644
--- a/arch/arm64/include/uapi/asm/sigcontext.h
+++ b/arch/arm64/include/uapi/asm/sigcontext.h
@@ -98,6 +98,13 @@  struct esr_context {
 	__u64 esr;
 };
 
+#define POE_MAGIC	0x504f4530
+
+struct poe_context {
+	struct _aarch64_ctx head;
+	__u64 por_el0;
+};
+
 /*
  * extra_context: describes extra space in the signal frame for
  * additional structures that don't fit in sigcontext.__reserved[].
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 4a77f4976e11..077436a8bc10 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -63,6 +63,7 @@  struct rt_sigframe_user_layout {
 	unsigned long fpmr_offset;
 	unsigned long extra_offset;
 	unsigned long end_offset;
+	unsigned long poe_offset;
 };
 
 #define BASE_SIGFRAME_SIZE round_up(sizeof(struct rt_sigframe), 16)
@@ -185,6 +186,8 @@  struct user_ctxs {
 	u32 zt_size;
 	struct fpmr_context __user *fpmr;
 	u32 fpmr_size;
+	struct poe_context __user *poe;
+	u32 poe_size;
 };
 
 static int preserve_fpsimd_context(struct fpsimd_context __user *ctx)
@@ -258,6 +261,21 @@  static int restore_fpmr_context(struct user_ctxs *user)
 	return err;
 }
 
+static int restore_poe_context(struct user_ctxs *user)
+{
+	u64 por_el0;
+	int err = 0;
+
+	if (user->poe_size != sizeof(*user->poe))
+		return -EINVAL;
+
+	__get_user_error(por_el0, &(user->poe->por_el0), err);
+	if (!err)
+		write_sysreg_s(por_el0, SYS_POR_EL0);
+
+	return err;
+}
+
 #ifdef CONFIG_ARM64_SVE
 
 static int preserve_sve_context(struct sve_context __user *ctx)
@@ -621,6 +639,7 @@  static int parse_user_sigframe(struct user_ctxs *user,
 	user->za = NULL;
 	user->zt = NULL;
 	user->fpmr = NULL;
+	user->poe = NULL;
 
 	if (!IS_ALIGNED((unsigned long)base, 16))
 		goto invalid;
@@ -671,6 +690,17 @@  static int parse_user_sigframe(struct user_ctxs *user,
 			/* ignore */
 			break;
 
+		case POE_MAGIC:
+			if (!system_supports_poe())
+				goto invalid;
+
+			if (user->poe)
+				goto invalid;
+
+			user->poe = (struct poe_context __user *)head;
+			user->poe_size = size;
+			break;
+
 		case SVE_MAGIC:
 			if (!system_supports_sve() && !system_supports_sme())
 				goto invalid;
@@ -857,6 +887,9 @@  static int restore_sigframe(struct pt_regs *regs,
 	if (err == 0 && system_supports_sme2() && user.zt)
 		err = restore_zt_context(&user);
 
+	if (err == 0 && system_supports_poe() && user.poe)
+		err = restore_poe_context(&user);
+
 	return err;
 }
 
@@ -980,6 +1013,13 @@  static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
 			return err;
 	}
 
+	if (system_supports_poe()) {
+		err = sigframe_alloc(user, &user->poe_offset,
+				     sizeof(struct poe_context));
+		if (err)
+			return err;
+	}
+
 	return sigframe_alloc_end(user);
 }
 
@@ -1020,6 +1060,15 @@  static int setup_sigframe(struct rt_sigframe_user_layout *user,
 		__put_user_error(current->thread.fault_code, &esr_ctx->esr, err);
 	}
 
+	if (system_supports_poe() && err == 0 && user->poe_offset) {
+		struct poe_context __user *poe_ctx =
+			apply_user_offset(user, user->poe_offset);
+
+		__put_user_error(POE_MAGIC, &poe_ctx->head.magic, err);
+		__put_user_error(sizeof(*poe_ctx), &poe_ctx->head.size, err);
+		__put_user_error(read_sysreg_s(SYS_POR_EL0), &poe_ctx->por_el0, err);
+	}
+
 	/* Scalable Vector Extension state (including streaming), if present */
 	if ((system_supports_sve() || system_supports_sme()) &&
 	    err == 0 && user->sve_offset) {
@@ -1178,6 +1227,9 @@  static void setup_return(struct pt_regs *regs, struct k_sigaction *ka,
 		sme_smstop();
 	}
 
+	if (system_supports_poe())
+		write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0);
+
 	if (ka->sa.sa_flags & SA_RESTORER)
 		sigtramp = ka->sa.sa_restorer;
 	else