diff mbox series

[23/35] x86/fpu: Add helpers for modifying supervisor xstate

Message ID 20220130211838.8382-24-rick.p.edgecombe@intel.com (mailing list archive)
State New
Headers show
Series Shadow stacks for userspace | expand

Commit Message

Edgecombe, Rick P Jan. 30, 2022, 9:18 p.m. UTC
Add helpers that can be used to modify supervisor xstate safely for the
current task.

State for supervisor xstate based features can be live and
accessed via MSRs, or saved in memory in an xsave buffer. When the
kernel needs to modify this state it needs to be sure to operate on it
in the right place, so the modifications don't get clobbered.

In the past supervisor xstate features have used get_xsave_addr()
directly, and performed open coded logic to handle operating on the saved
state correctly. This has posed two problems:
 1. It has logic that has been gotten wrong more than once.
 2. To reduce code, less common paths are not optimized. Determination
    of which paths are less common is based on assumptions about far away
    code that could change.

In addition, now that get_xsave_addr() is not available outside of the
core fpu code, there isn't even a way for these supervisor features to
modify the in memory state.

To resolve these problems, add some helpers that encapsulate the correct
logic to operate on the correct copy of the state. Map the MSRs to the
struct field location in a case statement in __get_xsave_member().

Use the helpers like this, to write to either the MSR or saved state:
void *xstate;

xstate = start_update_xsave_msrs(XFEATURE_FOO);
r = xsave_rdmsrl(xstate, MSR_IA32_FOO_1, &val);
if (r)
	xsave_wrmsrl(xstate, MSR_IA32_FOO_2, FOO_ENABLE);
end_update_xsave_msrs();

Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
---

v1:
 - New patch.

 arch/x86/include/asm/fpu/api.h |   5 ++
 arch/x86/kernel/fpu/xstate.c   | 134 +++++++++++++++++++++++++++++++++
 2 files changed, 139 insertions(+)

Comments

Thomas Gleixner Feb. 8, 2022, 8:51 a.m. UTC | #1
On Sun, Jan 30 2022 at 13:18, Rick Edgecombe wrote:
> In addition, now that get_xsave_addr() is not available outside of the
> core fpu code, there isn't even a way for these supervisor features to
> modify the in memory state.
>
> To resolve these problems, add some helpers that encapsulate the correct
> logic to operate on the correct copy of the state. Map the MSR's to the
> struct field location in a case statements in __get_xsave_member().

I like the approach in principle, but you still expose the xstate
internals via the void pointer. It's just a question of time that this
is type casted and abused in interesting ways.

Something like the below untested (on top of the whole series) preserves
the encapsulation and reduces the code at the call sites.

Thanks,

        tglx
---
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -165,12 +165,7 @@ static inline bool fpstate_is_confidenti
 struct task_struct;
 extern long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2);
 
-void *start_update_xsave_msrs(int xfeature_nr);
-void end_update_xsave_msrs(void);
-int xsave_rdmsrl(void *xstate, unsigned int msr, unsigned long long *p);
-int xsave_wrmsrl(void *xstate, u32 msr, u64 val);
-int xsave_set_clear_bits_msrl(void *xstate, u32 msr, u64 set, u64 clear);
-
-void *get_xsave_buffer_unsafe(struct fpu *fpu, int xfeature_nr);
-int xsave_wrmsrl_unsafe(void *xstate, u32 msr, u64 val);
+int xsave_rdmsrs(int xfeature_nr, struct xstate_msr *xmsr, int num_msrs);
+int xsave_wrmsrs(int xfeature_nr, struct xstate_msr *xmsr, int num_msrs);
+int xsave_wrmsrs_on_task(struct task_struct *tsk, int xfeature_nr, struct xstate_msr *xmsr, int num_msrs);
 #endif /* _ASM_X86_FPU_API_H */
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -601,4 +601,12 @@ struct fpu_state_config {
 /* FPU state configuration information */
 extern struct fpu_state_config fpu_kernel_cfg, fpu_user_cfg;
 
+struct xstate_msr {
+	unsigned int	msr;
+	unsigned int	bitop;
+	u64		val;
+	u64		set;
+	u64		clear;
+};
+
 #endif /* _ASM_X86_FPU_H */
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1868,7 +1868,7 @@ int proc_pid_arch_status(struct seq_file
 }
 #endif /* CONFIG_PROC_PID_ARCH_STATUS */
 
-static u64 *__get_xsave_member(void *xstate, u32 msr)
+static u64 *xstate_get_member(void *xstate, u32 msr)
 {
 	switch (msr) {
 	case MSR_IA32_PL3_SSP:
@@ -1882,22 +1882,11 @@ static u64 *__get_xsave_member(void *xst
 }
 
 /*
- * Operate on the xsave buffer directly. It makes no gaurantees that the
- * buffer will stay valid now or in the futre. This function is pretty
- * much only useful when the caller knows the fpu's thread can't be
- * scheduled or otherwise operated on concurrently.
- */
-void *get_xsave_buffer_unsafe(struct fpu *fpu, int xfeature_nr)
-{
-	return get_xsave_addr(&fpu->fpstate->regs.xsave, xfeature_nr);
-}
-
-/*
  * Return a pointer to the xstate for the feature if it should be used, or NULL
  * if the MSRs should be written to directly. To do this safely, using the
  * associated read/write helpers is required.
  */
-void *start_update_xsave_msrs(int xfeature_nr)
+static void *xsave_msrs_op_start(int xfeature_nr)
 {
 	void *xstate;
 
@@ -1938,7 +1927,7 @@ void *start_update_xsave_msrs(int xfeatu
 	return xstate;
 }
 
-void end_update_xsave_msrs(void)
+static void xsave_msrs_op_end(void)
 {
 	fpregs_unlock();
 }
@@ -1951,7 +1940,7 @@ void end_update_xsave_msrs(void)
  *
  * But if this correspondence is broken by either a write to the in-memory
  * buffer or the registers, the kernel needs to be notified so it doesn't miss
- * an xsave or restore. __xsave_msrl_prepare_write() peforms this check and
+ * an xsave or restore. xsave_msrs_prepare_write() performs this check and
  * notifies the kernel if needed. Use before writes only, to not take away
  * the kernel's options when not required.
  *
@@ -1959,65 +1948,107 @@ void end_update_xsave_msrs(void)
  * must have resulted in targeting the in-memory state, so invaliding the
  * registers is the right thing to do.
  */
-static void __xsave_msrl_prepare_write(void)
+static void xsave_msrs_prepare_write(void)
 {
 	if (test_thread_flag(TIF_NEED_FPU_LOAD) &&
 	    fpregs_state_valid(&current->thread.fpu, smp_processor_id()))
 		__fpu_invalidate_fpregs_state(&current->thread.fpu);
 }
 
-int xsave_rdmsrl(void *xstate, unsigned int msr, unsigned long long *p)
+static int read_xstate_or_msr(struct xstate_msr *xmsr, void *xstate)
 {
 	u64 *member_ptr;
 
 	if (!xstate)
-		return rdmsrl_safe(msr, p);
+		return rdmsrl_safe(xmsr->msr, &xmsr->val);
 
-	member_ptr = __get_xsave_member(xstate, msr);
+	member_ptr = xstate_get_member(xstate, xmsr->msr);
 	if (!member_ptr)
 		return 1;
 
-	*p = *member_ptr;
-
+	xmsr->val = *member_ptr;
 	return 0;
 }
 
+int xsave_rdmsrs(int xfeature_nr, struct xstate_msr *xmsr, int num_msrs)
+{
+	void *xstate = xsave_msrs_op_start(xfeature_nr);
+	int i, ret;
+
+	for (i = 0, ret = 0; !ret && i < num_msrs; i++, xmsr++)
+		ret = read_xstate_or_msr(xmsr, xstate);
+
+	xsave_msrs_op_end();
+	return ret;
+}
 
-int xsave_wrmsrl_unsafe(void *xstate, u32 msr, u64 val)
+static int write_xstate(struct xstate_msr *xmsr, void *xstate)
 {
-	u64 *member_ptr;
+	u64 *member_ptr = xstate_get_member(xstate, xmsr->msr);
 
-	member_ptr = __get_xsave_member(xstate, msr);
 	if (!member_ptr)
 		return 1;
 
-	*member_ptr = val;
-
+	*member_ptr = xmsr->val;
 	return 0;
 }
 
-int xsave_wrmsrl(void *xstate, u32 msr, u64 val)
+static int write_xstate_or_msr(struct xstate_msr *xmsr, void *xstate)
 {
-	__xsave_msrl_prepare_write();
 	if (!xstate)
-		return wrmsrl_safe(msr, val);
-
-	return xsave_wrmsrl_unsafe(xstate, msr, val);
+		return wrmsrl_safe(xmsr->msr, xmsr->val);
+	return write_xstate(xmsr, xstate);
 }
 
-int xsave_set_clear_bits_msrl(void *xstate, u32 msr, u64 set, u64 clear)
+static int mod_xstate_or_msr_bits(struct xstate_msr *xmsr, void *xstate)
 {
-	u64 val, new_val;
+	u64 val;
 	int ret;
 
-	ret = xsave_rdmsrl(xstate, msr, &val);
+	ret = read_xstate_or_msr(xmsr, xstate);
 	if (ret)
 		return ret;
 
-	new_val = (val & ~clear) | set;
+	val = xmsr->val;
+	xmsr->val = (val & ~xmsr->clear) | xmsr->set;
 
-	if (new_val != val)
-		return xsave_wrmsrl(xstate, msr, new_val);
+	if (val != xmsr->val)
+		return write_xstate_or_msr(xmsr, xstate);
 
 	return 0;
 }
+
+static int __xsave_wrmsrs(void *xstate, struct xstate_msr *xmsr, int num_msrs)
+{
+	int i, ret;
+
+	for (i = 0, ret = 0; !ret && i < num_msrs; i++, xmsr++) {
+		if (!xmsr->bitop)
+			ret = write_xstate_or_msr(xmsr, xstate);
+		else
+			ret = mod_xstate_or_msr_bits(xmsr, xstate);
+	}
+
+	return ret;
+}
+
+int xsave_wrmsrs(int xfeature_nr, struct xstate_msr *xmsr, int num_msrs)
+{
+	void *xstate = xsave_msrs_op_start(xfeature_nr);
+	int ret;
+
+	xsave_msrs_prepare_write();
+	ret = __xsave_wrmsrs(xstate, xmsr, num_msrs);
+	xsave_msrs_op_end();
+	return ret;
+}
+
+int xsave_wrmsrs_on_task(struct task_struct *tsk, int xfeature_nr, struct xstate_msr *xmsr,
+			 int num_msrs)
+{
+	void *xstate = get_xsave_addr(&tsk->thread.fpu.fpstate->regs.xsave, xfeature_nr);
+
+	if (WARN_ON_ONCE(!xstate))
+		return -EINVAL;
+	return __xsave_wrmsrs(xstate, xmsr, num_msrs);
+}
--- a/arch/x86/kernel/shstk.c
+++ b/arch/x86/kernel/shstk.c
@@ -106,8 +106,7 @@ int shstk_setup(void)
 {
 	struct thread_shstk *shstk = &current->thread.shstk;
 	unsigned long addr, size;
-	void *xstate;
-	int err;
+	struct xstate_msr xmsr[2];
 
 	if (!cpu_feature_enabled(X86_FEATURE_SHSTK) ||
 	    shstk->size ||
@@ -119,13 +118,10 @@ int shstk_setup(void)
 	if (IS_ERR_VALUE(addr))
 		return 1;
 
-	xstate = start_update_xsave_msrs(XFEATURE_CET_USER);
-	err = xsave_wrmsrl(xstate, MSR_IA32_PL3_SSP, addr + size);
-	if (!err)
-		err = xsave_wrmsrl(xstate, MSR_IA32_U_CET, CET_SHSTK_EN);
-	end_update_xsave_msrs();
+	xmsr[0] = (struct xstate_msr) { .msr = MSR_IA32_PL3_SSP, .val = addr + size };
+	xmsr[1] = (struct xstate_msr) { .msr = MSR_IA32_U_CET, .val = CET_SHSTK_EN };
 
-	if (err) {
+	if (xsave_wrmsrs(XFEATURE_CET_USER, xmsr, ARRAY_SIZE(xmsr))) {
 		/*
 		 * Don't leak shadow stack if something went wrong with writing the
 		 * msrs. Warn about it because things may be in a weird state.
@@ -150,8 +146,8 @@ int shstk_alloc_thread_stack(struct task
 			     unsigned long stack_size)
 {
 	struct thread_shstk *shstk = &tsk->thread.shstk;
+	struct xstate_msr xmsr[1];
 	unsigned long addr;
-	void *xstate;
 
 	/*
 	 * If shadow stack is not enabled on the new thread, skip any
@@ -183,15 +179,6 @@ int shstk_alloc_thread_stack(struct task
 	if (in_compat_syscall())
 		stack_size /= 4;
 
-	/*
-	 * 'tsk' is configured with a shadow stack and the fpu.state is
-	 * up to date since it was just copied from the parent.  There
-	 * must be a valid non-init CET state location in the buffer.
-	 */
-	xstate = get_xsave_buffer_unsafe(&tsk->thread.fpu, XFEATURE_CET_USER);
-	if (WARN_ON_ONCE(!xstate))
-		return -EINVAL;
-
 	stack_size = PAGE_ALIGN(stack_size);
 	addr = alloc_shstk(stack_size, stack_size, false);
 	if (IS_ERR_VALUE(addr)) {
@@ -200,7 +187,11 @@ int shstk_alloc_thread_stack(struct task
 		return PTR_ERR((void *)addr);
 	}
 
-	xsave_wrmsrl_unsafe(xstate, MSR_IA32_PL3_SSP, (u64)(addr + stack_size));
+	xmsr[0] = (struct xstate_msr) { .msr = MSR_IA32_PL3_SSP, .val = addr + stack_size };
+	if (xsave_wrmsrs_on_task(tsk, XFEATURE_CET_USER, xmsr, ARRAY_SIZE(xmsr))) {
+		unmap_shadow_stack(addr, stack_size);
+		return 1;
+	}
 	shstk->base = addr;
 	shstk->size = stack_size;
 	return 0;
@@ -232,8 +223,8 @@ void shstk_free(struct task_struct *tsk)
 
 int wrss_control(bool enable)
 {
+	struct xstate_msr xmsr[1] = {[0] = { .msr = MSR_IA32_U_CET, .bitop = 1,}, };
 	struct thread_shstk *shstk = &current->thread.shstk;
-	void *xstate;
 	int err;
 
 	if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
@@ -246,13 +237,11 @@ int wrss_control(bool enable)
 	if (!shstk->size || shstk->wrss == enable)
 		return 1;
 
-	xstate = start_update_xsave_msrs(XFEATURE_CET_USER);
 	if (enable)
-		err = xsave_set_clear_bits_msrl(xstate, MSR_IA32_U_CET, CET_WRSS_EN, 0);
+		xmsr[0].set = CET_WRSS_EN;
 	else
-		err = xsave_set_clear_bits_msrl(xstate, MSR_IA32_U_CET, 0, CET_WRSS_EN);
-	end_update_xsave_msrs();
-
+		xmsr[0].clear = CET_WRSS_EN;
+	err = xsave_wrmsrs(XFEATURE_CET_USER, xmsr, ARRAY_SIZE(xmsr));
 	if (err)
 		return 1;
 
@@ -263,7 +252,7 @@ int wrss_control(bool enable)
 int shstk_disable(void)
 {
 	struct thread_shstk *shstk = &current->thread.shstk;
-	void *xstate;
+	struct xstate_msr xmsr[2];
 	int err;
 
 	if (!cpu_feature_enabled(X86_FEATURE_SHSTK) ||
@@ -271,14 +260,11 @@ int shstk_disable(void)
 	    !shstk->base)
 		return 1;
 
-	xstate = start_update_xsave_msrs(XFEATURE_CET_USER);
-	/* Disable WRSS too when disabling shadow stack */
-	err = xsave_set_clear_bits_msrl(xstate, MSR_IA32_U_CET, 0,
-					CET_SHSTK_EN | CET_WRSS_EN);
-	if (!err)
-		err = xsave_wrmsrl(xstate, MSR_IA32_PL3_SSP, 0);
-	end_update_xsave_msrs();
+	xmsr[0] = (struct xstate_msr) { .msr = MSR_IA32_U_CET, .bitop = 1,
+		.set = 0, .clear = CET_SHSTK_EN | CET_WRSS_EN };
+	xmsr[1] = (struct xstate_msr) { .msr = MSR_IA32_PL3_SSP, .val = 0 };
 
+	err = xsave_wrmsrs(XFEATURE_CET_USER, xmsr, ARRAY_SIZE(xmsr));
 	if (err)
 		return 1;
 
@@ -289,16 +275,10 @@ int shstk_disable(void)
 
 static unsigned long get_user_shstk_addr(void)
 {
-	void *xstate;
-	unsigned long long ssp;
-
-	xstate = start_update_xsave_msrs(XFEATURE_CET_USER);
-
-	xsave_rdmsrl(xstate, MSR_IA32_PL3_SSP, &ssp);
-
-	end_update_xsave_msrs();
+	struct xstate_msr xmsr[1] = { [0] = {.msr = MSR_IA32_PL3_SSP, }, };
 
-	return ssp;
+	xsave_rdmsrs(XFEATURE_CET_USER, xmsr, ARRAY_SIZE(xmsr));
+	return xmsr[0].val;
 }
 
 /*
@@ -385,8 +365,8 @@ int shstk_check_rstor_token(bool proc32,
 int setup_signal_shadow_stack(int proc32, void __user *restorer)
 {
 	struct thread_shstk *shstk = &current->thread.shstk;
+	struct xstate_msr xmsr[1];
 	unsigned long new_ssp;
-	void *xstate;
 	int err;
 
 	if (!cpu_feature_enabled(X86_FEATURE_SHSTK) || !shstk->size)
@@ -397,18 +377,15 @@ int setup_signal_shadow_stack(int proc32
 	if (err)
 		return err;
 
-	xstate = start_update_xsave_msrs(XFEATURE_CET_USER);
-	err = xsave_wrmsrl(xstate, MSR_IA32_PL3_SSP, new_ssp);
-	end_update_xsave_msrs();
-
-	return err;
+	xmsr[0] = (struct xstate_msr) { .msr = MSR_IA32_PL3_SSP, .val = new_ssp };
+	return xsave_wrmsrs(XFEATURE_CET_USER, xmsr, ARRAY_SIZE(xmsr));
 }
 
 int restore_signal_shadow_stack(void)
 {
 	struct thread_shstk *shstk = &current->thread.shstk;
-	void *xstate;
 	int proc32 = in_ia32_syscall();
+	struct xstate_msr xmsr[1];
 	unsigned long new_ssp;
 	int err;
 
@@ -419,11 +396,8 @@ int restore_signal_shadow_stack(void)
 	if (err)
 		return err;
 
-	xstate = start_update_xsave_msrs(XFEATURE_CET_USER);
-	err = xsave_wrmsrl(xstate, MSR_IA32_PL3_SSP, new_ssp);
-	end_update_xsave_msrs();
-
-	return err;
+	xmsr[0] = (struct xstate_msr) { .msr = MSR_IA32_PL3_SSP, .val = new_ssp };
+	return xsave_wrmsrs(XFEATURE_CET_USER, xmsr, ARRAY_SIZE(xmsr));
 }
 
 SYSCALL_DEFINE2(map_shadow_stack, unsigned long, size, unsigned int, flags)
Edgecombe, Rick P Feb. 9, 2022, 7:55 p.m. UTC | #2
On Tue, 2022-02-08 at 09:51 +0100, Thomas Gleixner wrote:
> I like the approach in principle, but you still expose the xstate
> internals via the void pointer. It's just a question of time that
> this
> is type casted and abused in interesting ways.

Thanks for taking a look. I have to say though, these changes are
making me scratch my head a bit. Should we really design around callers
digging into mysterious pointers with magic offsets instead of using
easy helpers full of warnings about pitfalls? It should look odd in a
code review too I would think.

> 
> Something like the below untested (on top of the whole series)
> preserves
> the encapsulation and reduces the code at the call sites.
> 
> 
It loses the ability to know which MSR write actually failed. It also
loses the ability to perform read/write logic under a single
transaction. The latter is not needed for this series, but this snippet
from the IBT series does it:

int ibt_get_clear_wait_endbr(void)
{
	void *xstate;
	u64 msr_val = 0;

	if (!current->thread.shstk.ibt)
		return 0;

	xstate = start_update_xsave_msrs(XFEATURE_CET_USER);
	if (!xsave_rdmsrl(xstate, MSR_IA32_U_CET, &msr_val))
		xsave_wrmsrl(xstate, MSR_IA32_U_CET, msr_val &
~CET_WAIT_ENDBR);
	end_update_xsave_msrs();

	return msr_val & CET_WAIT_ENDBR;
}

I suppose we could just add a new function to do that logic in a single
transaction when the time comes. But inventing data structures to
describe work to be passed off to some executor always seems to break
at the first new requirement. What I usually wanted was a programming
language, and I already had it.

Not to bikeshed though, it will still get the job done.
Dave Hansen Feb. 12, 2022, 12:27 a.m. UTC | #3
On 1/30/22 13:18, Rick Edgecombe wrote:
> Add helpers that can be used to modify supervisor xstate safely for the
> current task.

This should be at the end of the changelog.

> State for supervisors xstate based features can be live and
> accesses via MSR's, or saved in memory in an xsave buffer. When the
> kernel needs to modify this state it needs to be sure to operate on it
> in the right place, so the modifications don't get clobbered.

We tend to call these "supervisor xfeatures".  The "state is in the
registers" we call "active".  Maybe:

	Just like user xfeatures, supervisor xfeatures can be either
	active in the registers or inactive and present in the task FPU
	buffer.  If the registers are active, the registers can be
	modified directly.  If the registers are not active, the
	modification must be performed on the task FPU buffer.


> In the past supervisor xstate features have used get_xsave_addr()
> directly, and performed open coded logic handle operating on the saved
> state correctly. This has posed two problems:
>  1. It has logic that has been gotten wrong more than once.
>  2. To reduce code, less common path's are not optimized. Determination

			   "paths" ^


> xstate = start_update_xsave_msrs(XFEATURE_FOO);
> r = xsave_rdmsrl(state, MSR_IA32_FOO_1, &val)
> if (r)
> 	xsave_wrmsrl(state, MSR_IA32_FOO_2, FOO_ENABLE);
> end_update_xsave_msrs();

This looks OK.  I'm not thrilled about it.  The
start_update_xsave_msrs() can probably drop the "_msrs".  Maybe:

	start_xfeature_update(...);

Also, if you have to do the address lookup in xsave_rdmsrl() anyway, I
wonder if the 'xstate' should just be a full fledged 'struct xregs_state'.

The other option would be to make a little on-stack structure like:

	struct xsave_update {
		int feature;
		struct xregs_state *xregs;
	};

Then you do:

	struct xsave_update xsu;
	...
	start_update_xsave_msrs(&xsu, XFEATURE_FOO);

and then pass it along to each of the other operations:

	r = xsave_rdmsrl(xsu, MSR_IA32_FOO_1, &val)

It's slightly less likely to get type confused as a 'void *';

> +static u64 *__get_xsave_member(void *xstate, u32 msr)
> +{
> +	switch (msr) {
> +	/* Currently there are no MSR's supported */
> +	default:
> +		WARN_ONCE(1, "x86/fpu: unsupported xstate msr (%u)\n", msr);
> +		return NULL;
> +	}
> +}

Just to get an idea what this is doing, it's OK to include the shadow
stack MSRs in here.

Are you sure this should return a u64*?  We have lots of <=64-bit XSAVE
fields.

> +/*
> + * Return a pointer to the xstate for the feature if it should be used, or NULL
> + * if the MSRs should be written to directly. To do this safely, using the
> + * associated read/write helpers is required.
> + */
> +void *start_update_xsave_msrs(int xfeature_nr)
> +{
> +	void *xstate;
> +
> +	/*
> +	 * fpregs_lock() only disables preemption (mostly). So modifing state

							 modifying ^
	
> +	 * in an interrupt could screw up some in progress fpregs operation,

						^ in-progress

> +	 * but appear to work. Warn about it.
> +	 */
> +	WARN_ON_ONCE(!in_task());
> +	WARN_ON_ONCE(current->flags & PF_KTHREAD);

This might also be a good spot to check that xfeature_nr is in
fpstate.xfeatures.

> +	fpregs_lock();
> +
> +	fpregs_assert_state_consistent();
> +
> +	/*
> +	 * If the registers don't need to be reloaded. Go ahead and operate on the
> +	 * registers.
> +	 */
> +	if (!test_thread_flag(TIF_NEED_FPU_LOAD))
> +		return NULL;
> +
> +	xstate = get_xsave_addr(&current->thread.fpu.fpstate->regs.xsave, xfeature_nr);
> +
> +	/*
> +	 * If regs are in the init state, they can't be retrieved from
> +	 * init_fpstate due to the init optimization, but are not nessarily

							necessarily ^

Spell checker time.  ":set spell" in vim works for me nicely.

> +	 * zero. The only option is to restore to make everything live and
> +	 * operate on registers. This will clear TIF_NEED_FPU_LOAD.
> +	 *
> +	 * Otherwise, if not in the init state but TIF_NEED_FPU_LOAD is set,
> +	 * operate on the buffer. The registers will be restored before going
> +	 * to userspace in any case, but the task might get preempted before
> +	 * then, so this possibly saves an xsave.
> +	 */
> +	if (!xstate)
> +		fpregs_restore_userregs();

Won't fpregs_restore_userregs() end up setting TIF_NEED_FPU_LOAD=0?
Isn't that a case where a "return NULL" is needed?

In any case, this makes me think this code should start out stupid and
slow.  Keep the API as-is, but make the first patch unconditionally do
the WRMSR.  Leave the "fast" buffer modifications for a follow-on patch.
Edgecombe, Rick P Feb. 12, 2022, 2:31 a.m. UTC | #4
On Fri, 2022-02-11 at 16:27 -0800, Dave Hansen wrote:
> On 1/30/22 13:18, Rick Edgecombe wrote:
> > Add helpers that can be used to modify supervisor xstate safely for
> > the
> > current task.
> 
> This should be at the end of the changelog.

Hmm, ok.

> 
> > State for supervisors xstate based features can be live and
> > accesses via MSR's, or saved in memory in an xsave buffer. When the
> > kernel needs to modify this state it needs to be sure to operate on
> > it
> > in the right place, so the modifications don't get clobbered.
> 
> We tend to call these "supervisor xfeatures".  The "state is in the
> registers" we call "active".  Maybe:
> 
> 	Just like user xfeatures, supervisor xfeatures can be either
> 	active in the registers or inactive and present in the task FPU
> 	buffer.  If the registers are active, the registers can be
> 	modified directly.  If the registers are not active, the
> 	modification must be performed on the task FPU buffer.

Ok, thanks.

> 
> 
> > In the past supervisor xstate features have used get_xsave_addr()
> > directly, and performed open coded logic handle operating on the
> > saved
> > state correctly. This has posed two problems:
> >  1. It has logic that has been gotten wrong more than once.
> >  2. To reduce code, less common path's are not optimized.
> > Determination
> 
> 			   "paths" ^
> 

Arg, thanks.

> 
> > xstate = start_update_xsave_msrs(XFEATURE_FOO);
> > r = xsave_rdmsrl(state, MSR_IA32_FOO_1, &val)
> > if (r)
> > 	xsave_wrmsrl(state, MSR_IA32_FOO_2, FOO_ENABLE);
> > end_update_xsave_msrs();
> 
> This looks OK.  I'm not thrilled about it.  The
> start_update_xsave_msrs() can probably drop the "_msrs".  Maybe:
> 
> 	start_xfeature_update(...);

Hmm, this whole thing pretends to be updating MSRs, which is often not
true. Maybe the xsave_rdmsrl/xsave_wrmsrl should be renamed too.
xsave_readl()/xsave_writel() or something.

> 
> Also, if you have to do the address lookup in xsave_rdmsrl() anyway,
> I
> wonder if the 'xstate' should just be a full fledged 'struct
> xregs_state'.
> 
> The other option would be to make a little on-stack structure like:
> 
> 	struct xsave_update {
> 		int feature;
> 		struct xregs_state *xregs;
> 	};
> 
> Then you do:
> 
> 	struct xsave_update xsu;
> 	...
> 	start_update_xsave_msrs(&xsu, XFEATURE_FOO);
> 
> and then pass it along to each of the other operations:
> 
> 	r = xsave_rdmsrl(xsu, MSR_IA32_FOO_1, &val)
> 
> It's slightly less likely to get type confused as a 'void *';

The 'void *' is actually a pointer to the specific xfeature in the
buffer. So the read/writes don't have to re-compute the offset every
time. It's not too much work though. I'm really surprised by the desire
to obfuscate the pointer, but I guess if we really want to, I'd rather
do that and keep the regular read/write operations.

If we don't care about the extra lookups this can totally drop the
caller side state. The feature nr can be looked up from the MSR along
with the struct offset. Then it doesn't expose the pointer to the
buffer, since it's all recomputed every operation.

So like:
start_xfeature_update();
r = xsave_readl(MSR_IA32_FOO_1, &val)
if (r)
        xsave_writel(MSR_IA32_FOO_2, FOO_ENABLE);
end_xfeature_update();

The WARNs then happen in the read/writes. An early iteration looked
like that. I liked this version with caller side state, but thought it
might be worth revisiting if there really is a strong desire to hide
the pointer.

> 
> > +static u64 *__get_xsave_member(void *xstate, u32 msr)
> > +{
> > +	switch (msr) {
> > +	/* Currently there are no MSR's supported */
> > +	default:
> > +		WARN_ONCE(1, "x86/fpu: unsupported xstate msr (%u)\n",
> > msr);
> > +		return NULL;
> > +	}
> > +}
> 
> Just to get an idea what this is doing, it's OK to include the shadow
> stack MSRs in here.

Ok.

> 
> Are you sure this should return a u64*?  We have lots of <=64-bit
> XSAVE
> fields.

I thought it should only be used with 64 bit msrs. Maybe it needs a
better name?

> 
> > +/*
> > + * Return a pointer to the xstate for the feature if it should be
> > used, or NULL
> > + * if the MSRs should be written to directly. To do this safely,
> > using the
> > + * associated read/write helpers is required.
> > + */
> > +void *start_update_xsave_msrs(int xfeature_nr)
> > +{
> > +	void *xstate;
> > +
> > +	/*
> > +	 * fpregs_lock() only disables preemption (mostly). So modifing
> > state
> 
> 							 modifying ^
> 	
> > +	 * in an interrupt could screw up some in progress fpregs
> > operation,
> 
> 						^ in-progress

I swear I ran checkpatch...

> 
> > +	 * but appear to work. Warn about it.
> > +	 */
> > +	WARN_ON_ONCE(!in_task());
> > +	WARN_ON_ONCE(current->flags & PF_KTHREAD);
> 
> This might also be a good spot to check that xfeature_nr is in
> fpstate.xfeatures.

Hmm, good idea.

> 
> > +	fpregs_lock();
> > +
> > +	fpregs_assert_state_consistent();
> > +
> > +	/*
> > +	 * If the registers don't need to be reloaded. Go ahead and
> > operate on the
> > +	 * registers.
> > +	 */
> > +	if (!test_thread_flag(TIF_NEED_FPU_LOAD))
> > +		return NULL;
> > +
> > +	xstate = get_xsave_addr(&current->thread.fpu.fpstate-
> > >regs.xsave, xfeature_nr);
> > +
> > +	/*
> > +	 * If regs are in the init state, they can't be retrieved from
> > +	 * init_fpstate due to the init optimization, but are not
> > nessarily
> 
> 							necessarily ^

Oof, thanks.

> 
> Spell checker time.  ":set spell" in vim works for me nicely.
> 
> > +	 * zero. The only option is to restore to make everything live
> > and
> > +	 * operate on registers. This will clear TIF_NEED_FPU_LOAD.
> > +	 *
> > +	 * Otherwise, if not in the init state but TIF_NEED_FPU_LOAD is
> > set,
> > +	 * operate on the buffer. The registers will be restored before
> > going
> > +	 * to userspace in any case, but the task might get preempted
> > before
> > +	 * then, so this possibly saves an xsave.
> > +	 */
> > +	if (!xstate)
> > +		fpregs_restore_userregs();
> 
> Won't fpregs_restore_userregs() end up setting TIF_NEED_FPU_LOAD=0?
> Isn't that a case where a "return NULL" is needed?

This is for the case when the feature is in the init state. For CET's 
case this could just zero the buffer and return the pointer to it, but
for other features the init state wasn't always zero. So this just
makes all the features "active" and TIF_NEED_FPU_LOAD is cleared. It
then returns NULL and the read/writes go to the MSRs. It still looks
correct to me, am I missing something?

> 
> In any case, this makes me think this code should start out stupid
> and
> slow.  Keep the API as-is, but make the first patch unconditionally
> do
> the WRMSR.  Leave the "fast" buffer modifications for a follow-on
> patch.

Ok. Should I drop the optimized versions from the series or just split
them out? The optimizations were trying to address Boris' comments:
https://lore.kernel.org/lkml/YS95VzrNhDhFpsop@zn.tnic/
diff mbox series

Patch

diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index c83b3020350a..6aec27984b62 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -165,4 +165,9 @@  static inline bool fpstate_is_confidential(struct fpu_guest *gfpu)
 struct task_struct;
 extern long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2);
 
+void *start_update_xsave_msrs(int xfeature_nr);
+void end_update_xsave_msrs(void);
+int xsave_rdmsrl(void *xstate, unsigned int msr, unsigned long long *p);
+int xsave_wrmsrl(void *xstate, u32 msr, u64 val);
+int xsave_set_clear_bits_msrl(void *xstate, u32 msr, u64 set, u64 clear);
 #endif /* _ASM_X86_FPU_API_H */
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 44397202762b..c5e20e0d0725 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1867,3 +1867,137 @@  int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
 	return 0;
 }
 #endif /* CONFIG_PROC_PID_ARCH_STATUS */
+
+static u64 *__get_xsave_member(void *xstate, u32 msr)
+{
+	switch (msr) {
+	/* Currently there are no MSRs supported */
+	default:
+		WARN_ONCE(1, "x86/fpu: unsupported xstate msr (%u)\n", msr);
+		return NULL;
+	}
+}
+
+/*
+ * Return a pointer to the xstate for the feature if it should be used, or NULL
+ * if the MSRs should be written to directly. To do this safely, using the
+ * associated read/write helpers is required.
+ */
+void *start_update_xsave_msrs(int xfeature_nr)
+{
+	void *xstate;
+
+	/*
+	 * fpregs_lock() only disables preemption (mostly). So modifying state
+	 * in an interrupt could screw up some in-progress fpregs operation,
+	 * but appear to work. Warn about it.
+	 */
+	WARN_ON_ONCE(!in_task());
+	WARN_ON_ONCE(current->flags & PF_KTHREAD);
+
+	fpregs_lock();
+
+	fpregs_assert_state_consistent();
+
+	/*
+	 * If the registers don't need to be reloaded, go ahead and operate on the
+	 * registers.
+	 */
+	if (!test_thread_flag(TIF_NEED_FPU_LOAD))
+		return NULL;
+
+	xstate = get_xsave_addr(&current->thread.fpu.fpstate->regs.xsave, xfeature_nr);
+
+	/*
+	 * If regs are in the init state, they can't be retrieved from
+	 * init_fpstate due to the init optimization, but are not necessarily
+	 * zero. The only option is to restore to make everything live and
+	 * operate on registers. This will clear TIF_NEED_FPU_LOAD.
+	 *
+	 * Otherwise, if not in the init state but TIF_NEED_FPU_LOAD is set,
+	 * operate on the buffer. The registers will be restored before going
+	 * to userspace in any case, but the task might get preempted before
+	 * then, so this possibly saves an xsave.
+	 */
+	if (!xstate)
+		fpregs_restore_userregs();
+	return xstate;
+}
+
+void end_update_xsave_msrs(void)
+{
+	fpregs_unlock();
+}
+
+/*
+ * When TIF_NEED_FPU_LOAD is set and fpregs_state_valid() is true, the saved
+ * state and fp state match. In this case, the kernel has some good options -
+ * it can skip the restore before returning to userspace or it could skip
+ * an xsave if preempted before then.
+ *
+ * But if this correspondence is broken by either a write to the in-memory
+ * buffer or the registers, the kernel needs to be notified so it doesn't miss
+ * an xsave or restore. __xsave_msrl_prepare_write() performs this check and
+ * notifies the kernel if needed. Use before writes only, to not take away
+ * the kernel's options when not required.
+ *
+ * If TIF_NEED_FPU_LOAD is set, then the logic in start_update_xsave_msrs()
+ * must have resulted in targeting the in-memory state, so invalidating the
+ * registers is the right thing to do.
+ */
+static void __xsave_msrl_prepare_write(void)
+{
+	if (test_thread_flag(TIF_NEED_FPU_LOAD) &&
+	    fpregs_state_valid(&current->thread.fpu, smp_processor_id()))
+		__fpu_invalidate_fpregs_state(&current->thread.fpu);
+}
+
+int xsave_rdmsrl(void *xstate, unsigned int msr, unsigned long long *p)
+{
+	u64 *member_ptr;
+
+	if (!xstate)
+		return rdmsrl_safe(msr, p);
+
+	member_ptr = __get_xsave_member(xstate, msr);
+	if (!member_ptr)
+		return 1;
+
+	*p = *member_ptr;
+
+	return 0;
+}
+
+int xsave_wrmsrl(void *xstate, u32 msr, u64 val)
+{
+	u64 *member_ptr;
+
+	__xsave_msrl_prepare_write();
+	if (!xstate)
+		return wrmsrl_safe(msr, val);
+
+	member_ptr = __get_xsave_member(xstate, msr);
+	if (!member_ptr)
+		return 1;
+
+	*member_ptr = val;
+
+	return 0;
+}
+
+int xsave_set_clear_bits_msrl(void *xstate, u32 msr, u64 set, u64 clear)
+{
+	u64 val, new_val;
+	int ret;
+
+	ret = xsave_rdmsrl(xstate, msr, &val);
+	if (ret)
+		return ret;
+
+	new_val = (val & ~clear) | set;
+
+	if (new_val != val)
+		return xsave_wrmsrl(xstate, msr, new_val);
+
+	return 0;
+}