Message ID | 20210527235118.88C9831B@viggo.jf.intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | x86/pkeys: PKRU manipulation bug fixes and cleanups | expand |
On 28.5.2021 2.51, Dave Hansen wrote: > From: Dave Hansen <dave.hansen@linux.intel.com> > > There are two points in the kernel which write to PKRU in a buggy way: > > * In switch_fpu_finish(), where having xfeatures[PKRU]=0 will result > in PKRU being assigned 'init_pkru_value' instead of 0x0. > * In write_pkru(), xfeatures[PKRU]=0 will result in PKRU having the > correct value, but the XSAVE buffer will remain stale because > xfeatures is not updated. > > Both of them screw up the fact that get_xsave_addr() will return NULL > for PKRU when it is in the XSAVE "init state". This went unnoticed > until now because on Intel hardware XINUSE[PKRU] is never 0 because > of the kernel policy around 'init_pkru_value'. AMD hardware, on the > other hand, can set XINUSE[PKRU]=0 via a normal WRPKRU(0). The > handy selftests somewhat accidentally produced a case[2] where > WRPKRU(0) occurs. > > get_xsave_addr() is a horrible interface[1], especially when used for > writing state. It is too easy for callers to be tricked into thinking: > 1. On a NULL return that they have no work to do > 2. On a valid pointer return that they *can* safely write state > without doing more work like setting an xfeatures bit. > > Wrap get_xsave_addr() with some additional infrastructure. Ensure > that callers must declare their intent to read or write to the state. > Inject the init state into both reads *and* writes. This ensures > that writers never have to deal with detritus from previous state. > > The new common xstate infrastructure: > > xstatebuf_get_write_ptr() > and > xfeature_init_space() > > should be quite usable for other xfeatures with trivial updates to > xfeature_init_space(). My hope is that we can move away from > all use of get_xsave_addr(), replacing it with things like > xstate_read_pkru(). > > The new BUG_ON()s are not great. But, they do represent a severe > violation of expectations and XSAVE state can be security-sensitive > and these represent a truly dazed-and-confused situation. > > 1. I know, I wrote it. I'm really sorry. > 2. https://lore.kernel.org/linux-kselftest/b2e0324a-9125-bb34-9e76-81817df27c48@amd.com/ > > Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> > Fixes: 0d714dba1626 ("x86/fpu: Update xstate's PKRU value on write_pkru()") > Cc: Thomas Gleixner <tglx@linutronix.de> > Cc: Ingo Molnar <mingo@redhat.com> > Cc: Borislav Petkov <bp@alien8.de> > Cc: x86@kernel.org > Cc: Andy Lutomirski <luto@kernel.org> > Cc: Shuah Khan <shuah@kernel.org> > Cc: Babu Moger <babu.moger@amd.com> > Cc: Dave Kleikamp <dave.kleikamp@oracle.com> > Cc: Ram Pai <linuxram@us.ibm.com> > Cc: Thiago Jung Bauermann <bauerman@linux.ibm.com> > Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de> > --- > > b/arch/x86/include/asm/fpu/internal.h | 8 -- > b/arch/x86/include/asm/fpu/xstate.h | 111 +++++++++++++++++++++++++++++++--- > b/arch/x86/include/asm/processor.h | 7 ++ > b/arch/x86/kernel/cpu/common.c | 6 - > b/arch/x86/mm/pkeys.c | 6 - > 5 files changed, 115 insertions(+), 23 deletions(-) > > diff -puN arch/x86/include/asm/fpu/internal.h~write_pkru arch/x86/include/asm/fpu/internal.h > --- a/arch/x86/include/asm/fpu/internal.h~write_pkru 2021-05-27 16:40:26.903705463 -0700 > +++ b/arch/x86/include/asm/fpu/internal.h 2021-05-27 16:40:26.919705463 -0700 > @@ -564,7 +564,6 @@ static inline void switch_fpu_prepare(st > static inline void switch_fpu_finish(struct fpu *new_fpu) > { > u32 pkru_val = init_pkru_value; > - struct pkru_state *pk; > > if (!static_cpu_has(X86_FEATURE_FPU)) > return; > @@ -578,11 +577,8 @@ static inline void switch_fpu_finish(str > * PKRU state is switched eagerly because it needs to be valid before we > * return to userland e.g. for a copy_to_user() operation. > */ > - if (current->mm) { > - pk = get_xsave_addr(&new_fpu->state.xsave, XFEATURE_PKRU); > - if (pk) > - pkru_val = pk->pkru; > - } > + if (current->mm) > + pkru_val = xstate_read_pkru(&new_fpu->state.xsave); > __write_pkru(pkru_val); > > /* > diff -puN arch/x86/include/asm/fpu/xstate.h~write_pkru arch/x86/include/asm/fpu/xstate.h > --- a/arch/x86/include/asm/fpu/xstate.h~write_pkru 2021-05-27 16:40:26.906705463 -0700 > +++ b/arch/x86/include/asm/fpu/xstate.h 2021-05-27 16:40:26.919705463 -0700 > @@ -124,27 +124,124 @@ static inline u32 read_pkru(void) > return 0; > } > > +static inline void xfeature_mark_non_init(struct xregs_state *xstate, > + int xfeature_nr) > +{ > + /* > + * Caller will place data in the @xstate buffer. > + * Mark the xfeature as non-init: > + */ > + xstate->header.xfeatures |= BIT_ULL(xfeature_nr); > +} > + > + > +/* Set the contents of @xfeature_nr to the hardware init state */ > +static inline void xfeature_init_space(struct xregs_state *xstate, > + int xfeature_nr) > +{ > + void *state = get_xsave_addr(xstate, xfeature_nr); > + > + switch (xfeature_nr) { > + case XFEATURE_PKRU: > + /* zero the whole state, including reserved bits */ > + memset(state, 0, sizeof(struct pkru_state)); > + break; > + default: > + BUG(); > + } > +} > + > +/* > + * Called when it is necessary to write to an XSAVE > + * component feature. Guarantees that a future > + * XRSTOR of the 'xstate' buffer will not consider > + * @xfeature_nr to be in its init state. > + * > + * The returned buffer may contain old state. The > + * caller must be prepared to fill the entire buffer. > + * > + * Caller must first ensure that @xfeature_nr is > + * enabled and present in the @xstate buffer. > + */ > +static inline void *xstatebuf_get_write_ptr(struct xregs_state *xstate, > + int xfeature_nr) > +{ > + bool feature_was_init = xstate->header.xfeatures & BIT_ULL(xfeature_nr); > + Maybe bool feature_was_init = !(xstate->header.xfeatures & BIT_ULL(xfeature_nr)); > + /* > + * xcomp_bv represents whether 'xstate' has space for > + * features. If not, something is horribly wrong and > + * a write would corrupt memory. Perhaps xfeature_nr > + * was not enabled. > + */ > + BUG_ON(!(xstate->header.xcomp_bv & BIT_ULL(xfeature_nr))); > + > + /* > + * Ensure a sane xfeature_nr, including avoiding > + * confusion with XCOMP_BV_COMPACTED_FORMAT. > + */ > + BUG_ON(xfeature_nr >= XFEATURE_MAX); > + > + /* Prepare xstate for a write to the xfeature: */ > + xfeature_mark_non_init(xstate, xfeature_nr); > + > + /* > + * If xfeature_nr was in the init state, update the buffer > + * to match the state. Ensures that callers can safely > + * write only a part of the state, they are not forced to > + * write it in its entirety. > + */ > + if (feature_was_init) > + xfeature_init_space(xstate, xfeature_nr); > + > + return get_xsave_addr(xstate, xfeature_nr); > +} > + > +/* Caller must ensure X86_FEATURE_OSPKE is enabled. */ > +static inline void xstate_write_pkru(struct xregs_state *xstate, u32 pkru) > +{ > + struct pkru_state *pk; > + > + pk = xstatebuf_get_write_ptr(xstate, XFEATURE_PKRU); > + pk->pkru = pkru; > +} > + > +/* > + * What PKRU value is represented in the 'xstate'? Note, > + * this returns the *architecturally* represented value, > + * not the literal in-memory value. They may be different. > + */ > +static inline u32 xstate_read_pkru(struct xregs_state *xstate) > +{ > + struct pkru_state *pk; > + > + pk = get_xsave_addr(xstate, XFEATURE_PKRU); > + /* > + * If present, pull PKRU out of the XSAVE buffer. > + * Otherwise, use the hardware init value. > + */ > + if (pk) > + return pk->pkru; > + else > + return PKRU_HW_INIT_VALUE; > +} > + > /* > * Update all of the PKRU state for the current task: > * PKRU register and PKRU xstate. > */ > static inline void current_write_pkru(u32 pkru) > { > - struct pkru_state *pk; > - > if (!boot_cpu_has(X86_FEATURE_OSPKE)) > return; > > - pk = get_xsave_addr(¤t->thread.fpu.state.xsave, XFEATURE_PKRU); > - > + fpregs_lock(); > /* > * The PKRU value in xstate needs to be in sync with the value that is > * written to the CPU. The FPU restore on return to userland would > * otherwise load the previous value again. > */ > - fpregs_lock(); > - if (pk) > - pk->pkru = pkru; > + xstate_write_pkru(¤t->thread.fpu.state.xsave, pkru); > __write_pkru(pkru); > fpregs_unlock(); > } > diff -puN arch/x86/include/asm/processor.h~write_pkru arch/x86/include/asm/processor.h > --- a/arch/x86/include/asm/processor.h~write_pkru 2021-05-27 16:40:26.908705463 -0700 > +++ b/arch/x86/include/asm/processor.h 2021-05-27 16:40:26.921705463 -0700 > @@ -854,4 +854,11 @@ enum mds_mitigations { > MDS_MITIGATION_VMWERV, > }; > > +/* > + * The XSAVE architecture defines an "init state" for > + * PKRU. PKRU is set to this value by XRSTOR when it > + * tries to restore PKRU but has on value in the buffer. > + */ > +#define PKRU_HW_INIT_VALUE 0x0 > + > #endif /* _ASM_X86_PROCESSOR_H */ > diff -puN arch/x86/kernel/cpu/common.c~write_pkru arch/x86/kernel/cpu/common.c > --- a/arch/x86/kernel/cpu/common.c~write_pkru 2021-05-27 16:40:26.912705463 -0700 > +++ b/arch/x86/kernel/cpu/common.c 2021-05-27 16:40:26.924705463 -0700 > @@ -466,8 +466,6 @@ static bool pku_disabled; > > static __always_inline void setup_pku(struct cpuinfo_x86 *c) > { > - struct pkru_state *pk; > - > /* check the boot processor, plus compile options for PKU: */ > if (!cpu_feature_enabled(X86_FEATURE_PKU)) > return; > @@ -478,9 +476,7 @@ static __always_inline void setup_pku(st > return; > > cr4_set_bits(X86_CR4_PKE); > - pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU); > - if (pk) > - pk->pkru = init_pkru_value; > + xstate_write_pkru(¤t->thread.fpu.state.xsave, init_pkru_value); > /* > * Seting X86_CR4_PKE will cause the X86_FEATURE_OSPKE > * cpuid bit to be set. We need to ensure that we > diff -puN arch/x86/mm/pkeys.c~write_pkru arch/x86/mm/pkeys.c > --- a/arch/x86/mm/pkeys.c~write_pkru 2021-05-27 16:40:26.914705463 -0700 > +++ b/arch/x86/mm/pkeys.c 2021-05-27 16:40:26.926705463 -0700 > @@ -155,7 +155,6 @@ static ssize_t init_pkru_read_file(struc > static ssize_t init_pkru_write_file(struct file *file, > const char __user *user_buf, size_t count, loff_t *ppos) > { > - struct pkru_state *pk; > char buf[32]; > ssize_t len; > u32 new_init_pkru; > @@ -178,10 +177,7 @@ static ssize_t init_pkru_write_file(stru > return -EINVAL; > > WRITE_ONCE(init_pkru_value, new_init_pkru); > - pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU); > - if (!pk) > - return -EINVAL; > - pk->pkru = new_init_pkru; > + xstate_write_pkru(&init_fpstate.xsave, new_init_pkru); > return count; > } > > _
Noticed a typo in a comment, but I haven't reviewed these thoroughly. Shaggy On 5/27/21 6:51 PM, Dave Hansen wrote: > > From: Dave Hansen <dave.hansen@linux.intel.com> > > There are two points in the kernel which write to PKRU in a buggy way: > > * In switch_fpu_finish(), where having xfeatures[PKRU]=0 will result > in PKRU being assigned 'init_pkru_value' instead of 0x0. > * In write_pkru(), xfeatures[PKRU]=0 will result in PKRU having the > correct value, but the XSAVE buffer will remain stale because > xfeatures is not updated. > > Both of them screw up the fact that get_xsave_addr() will return NULL > for PKRU when it is in the XSAVE "init state". This went unnoticed > until now because on Intel hardware XINUSE[PKRU] is never 0 because > of the kernel policy around 'init_pkru_value'. AMD hardware, on the > other hand, can set XINUSE[PKRU]=0 via a normal WRPKRU(0). The > handy selftests somewhat accidentally produced a case[2] where > WRPKRU(0) occurs. > > get_xsave_addr() is a horrible interface[1], especially when used for > writing state. It is too easy for callers to be tricked into thinking: > 1. On a NULL return that they have no work to do > 2. On a valid pointer return that they *can* safely write state > without doing more work like setting an xfeatures bit. > > Wrap get_xsave_addr() with some additional infrastructure. Ensure > that callers must declare their intent to read or write to the state. > Inject the init state into both reads *and* writes. This ensures > that writers never have to deal with detritus from previous state. > > The new common xstate infrastructure: > > xstatebuf_get_write_ptr() > and > xfeature_init_space() > > should be quite usable for other xfeatures with trivial updates to > xfeature_init_space(). My hope is that we can move away from > all use of get_xsave_addr(), replacing it with things like > xstate_read_pkru(). > > The new BUG_ON()s are not great. But, they do represent a severe > violation of expectations and XSAVE state can be security-sensitive > and these represent a truly dazed-and-confused situation. > > 1. I know, I wrote it. I'm really sorry. > 2. https://lore.kernel.org/linux-kselftest/b2e0324a-9125-bb34-9e76-81817df27c48@amd.com/ > > Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> > Fixes: 0d714dba1626 ("x86/fpu: Update xstate's PKRU value on write_pkru()") > Cc: Thomas Gleixner <tglx@linutronix.de> > Cc: Ingo Molnar <mingo@redhat.com> > Cc: Borislav Petkov <bp@alien8.de> > Cc: x86@kernel.org > Cc: Andy Lutomirski <luto@kernel.org> > Cc: Shuah Khan <shuah@kernel.org> > Cc: Babu Moger <babu.moger@amd.com> > Cc: Dave Kleikamp <dave.kleikamp@oracle.com> > Cc: Ram Pai <linuxram@us.ibm.com> > Cc: Thiago Jung Bauermann <bauerman@linux.ibm.com> > Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de> > --- > > b/arch/x86/include/asm/fpu/internal.h | 8 -- > b/arch/x86/include/asm/fpu/xstate.h | 111 +++++++++++++++++++++++++++++++--- > b/arch/x86/include/asm/processor.h | 7 ++ > b/arch/x86/kernel/cpu/common.c | 6 - > b/arch/x86/mm/pkeys.c | 6 - > 5 files changed, 115 insertions(+), 23 deletions(-) > > diff -puN arch/x86/include/asm/fpu/internal.h~write_pkru arch/x86/include/asm/fpu/internal.h > --- a/arch/x86/include/asm/fpu/internal.h~write_pkru 2021-05-27 16:40:26.903705463 -0700 > +++ b/arch/x86/include/asm/fpu/internal.h 2021-05-27 16:40:26.919705463 -0700 > @@ -564,7 +564,6 @@ static inline void switch_fpu_prepare(st > static inline void switch_fpu_finish(struct fpu *new_fpu) > { > u32 pkru_val = init_pkru_value; > - struct pkru_state *pk; > > if (!static_cpu_has(X86_FEATURE_FPU)) > return; > @@ -578,11 +577,8 @@ static inline void switch_fpu_finish(str > * PKRU state is switched eagerly because it needs to be valid before we > * return to userland e.g. for a copy_to_user() operation. > */ > - if (current->mm) { > - pk = get_xsave_addr(&new_fpu->state.xsave, XFEATURE_PKRU); > - if (pk) > - pkru_val = pk->pkru; > - } > + if (current->mm) > + pkru_val = xstate_read_pkru(&new_fpu->state.xsave); > __write_pkru(pkru_val); > > /* > diff -puN arch/x86/include/asm/fpu/xstate.h~write_pkru arch/x86/include/asm/fpu/xstate.h > --- a/arch/x86/include/asm/fpu/xstate.h~write_pkru 2021-05-27 16:40:26.906705463 -0700 > +++ b/arch/x86/include/asm/fpu/xstate.h 2021-05-27 16:40:26.919705463 -0700 > @@ -124,27 +124,124 @@ static inline u32 read_pkru(void) > return 0; > } > > +static inline void xfeature_mark_non_init(struct xregs_state *xstate, > + int xfeature_nr) > +{ > + /* > + * Caller will place data in the @xstate buffer. > + * Mark the xfeature as non-init: > + */ > + xstate->header.xfeatures |= BIT_ULL(xfeature_nr); > +} > + > + > +/* Set the contents of @xfeature_nr to the hardware init state */ > +static inline void xfeature_init_space(struct xregs_state *xstate, > + int xfeature_nr) > +{ > + void *state = get_xsave_addr(xstate, xfeature_nr); > + > + switch (xfeature_nr) { > + case XFEATURE_PKRU: > + /* zero the whole state, including reserved bits */ > + memset(state, 0, sizeof(struct pkru_state)); > + break; > + default: > + BUG(); > + } > +} > + > +/* > + * Called when it is necessary to write to an XSAVE > + * component feature. Guarantees that a future > + * XRSTOR of the 'xstate' buffer will not consider > + * @xfeature_nr to be in its init state. > + * > + * The returned buffer may contain old state. The > + * caller must be prepared to fill the entire buffer. > + * > + * Caller must first ensure that @xfeature_nr is > + * enabled and present in the @xstate buffer. > + */ > +static inline void *xstatebuf_get_write_ptr(struct xregs_state *xstate, > + int xfeature_nr) > +{ > + bool feature_was_init = xstate->header.xfeatures & BIT_ULL(xfeature_nr); > + > + /* > + * xcomp_bv represents whether 'xstate' has space for > + * features. If not, something is horribly wrong and > + * a write would corrupt memory. Perhaps xfeature_nr > + * was not enabled. > + */ > + BUG_ON(!(xstate->header.xcomp_bv & BIT_ULL(xfeature_nr))); > + > + /* > + * Ensure a sane xfeature_nr, including avoiding > + * confusion with XCOMP_BV_COMPACTED_FORMAT. > + */ > + BUG_ON(xfeature_nr >= XFEATURE_MAX); > + > + /* Prepare xstate for a write to the xfeature: */ > + xfeature_mark_non_init(xstate, xfeature_nr); > + > + /* > + * If xfeature_nr was in the init state, update the buffer > + * to match the state. Ensures that callers can safely > + * write only a part of the state, they are not forced to > + * write it in its entirety. > + */ > + if (feature_was_init) > + xfeature_init_space(xstate, xfeature_nr); > + > + return get_xsave_addr(xstate, xfeature_nr); > +} > + > +/* Caller must ensure X86_FEATURE_OSPKE is enabled. */ > +static inline void xstate_write_pkru(struct xregs_state *xstate, u32 pkru) > +{ > + struct pkru_state *pk; > + > + pk = xstatebuf_get_write_ptr(xstate, XFEATURE_PKRU); > + pk->pkru = pkru; > +} > + > +/* > + * What PKRU value is represented in the 'xstate'? Note, > + * this returns the *architecturally* represented value, > + * not the literal in-memory value. They may be different. > + */ > +static inline u32 xstate_read_pkru(struct xregs_state *xstate) > +{ > + struct pkru_state *pk; > + > + pk = get_xsave_addr(xstate, XFEATURE_PKRU); > + /* > + * If present, pull PKRU out of the XSAVE buffer. > + * Otherwise, use the hardware init value. > + */ > + if (pk) > + return pk->pkru; > + else > + return PKRU_HW_INIT_VALUE; > +} > + > /* > * Update all of the PKRU state for the current task: > * PKRU register and PKRU xstate. > */ > static inline void current_write_pkru(u32 pkru) > { > - struct pkru_state *pk; > - > if (!boot_cpu_has(X86_FEATURE_OSPKE)) > return; > > - pk = get_xsave_addr(¤t->thread.fpu.state.xsave, XFEATURE_PKRU); > - > + fpregs_lock(); > /* > * The PKRU value in xstate needs to be in sync with the value that is > * written to the CPU. The FPU restore on return to userland would > * otherwise load the previous value again. > */ > - fpregs_lock(); > - if (pk) > - pk->pkru = pkru; > + xstate_write_pkru(¤t->thread.fpu.state.xsave, pkru); > __write_pkru(pkru); > fpregs_unlock(); > } > diff -puN arch/x86/include/asm/processor.h~write_pkru arch/x86/include/asm/processor.h > --- a/arch/x86/include/asm/processor.h~write_pkru 2021-05-27 16:40:26.908705463 -0700 > +++ b/arch/x86/include/asm/processor.h 2021-05-27 16:40:26.921705463 -0700 > @@ -854,4 +854,11 @@ enum mds_mitigations { > MDS_MITIGATION_VMWERV, > }; > > +/* > + * The XSAVE architecture defines an "init state" for > + * PKRU. PKRU is set to this value by XRSTOR when it > + * tries to restore PKRU but has on value in the buffer. ... but has *no* value ... > + */ > +#define PKRU_HW_INIT_VALUE 0x0 > + > #endif /* _ASM_X86_PROCESSOR_H */ > diff -puN arch/x86/kernel/cpu/common.c~write_pkru arch/x86/kernel/cpu/common.c > --- a/arch/x86/kernel/cpu/common.c~write_pkru 2021-05-27 16:40:26.912705463 -0700 > +++ b/arch/x86/kernel/cpu/common.c 2021-05-27 16:40:26.924705463 -0700 > @@ -466,8 +466,6 @@ static bool pku_disabled; > > static __always_inline void setup_pku(struct cpuinfo_x86 *c) > { > - struct pkru_state *pk; > - > /* check the boot processor, plus compile options for PKU: */ > if (!cpu_feature_enabled(X86_FEATURE_PKU)) > return; > @@ -478,9 +476,7 @@ static __always_inline void setup_pku(st > return; > > cr4_set_bits(X86_CR4_PKE); > - pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU); > - if (pk) > - pk->pkru = init_pkru_value; > + xstate_write_pkru(¤t->thread.fpu.state.xsave, init_pkru_value); > /* > * Seting X86_CR4_PKE will cause the X86_FEATURE_OSPKE > * cpuid bit to be set. We need to ensure that we > diff -puN arch/x86/mm/pkeys.c~write_pkru arch/x86/mm/pkeys.c > --- a/arch/x86/mm/pkeys.c~write_pkru 2021-05-27 16:40:26.914705463 -0700 > +++ b/arch/x86/mm/pkeys.c 2021-05-27 16:40:26.926705463 -0700 > @@ -155,7 +155,6 @@ static ssize_t init_pkru_read_file(struc > static ssize_t init_pkru_write_file(struct file *file, > const char __user *user_buf, size_t count, loff_t *ppos) > { > - struct pkru_state *pk; > char buf[32]; > ssize_t len; > u32 new_init_pkru; > @@ -178,10 +177,7 @@ static ssize_t init_pkru_write_file(stru > return -EINVAL; > > WRITE_ONCE(init_pkru_value, new_init_pkru); > - pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU); > - if (!pk) > - return -EINVAL; > - pk->pkru = new_init_pkru; > + xstate_write_pkru(&init_fpstate.xsave, new_init_pkru); > return count; > } > > _ >
Hi Dave, Thanks for the patches and trying to address the issues. I know these patches are in early stages and there is still a discussion on different ways to address these issues. But I wanted to give a try anyway. Tried to boot the system with these patches. But system did not come up after this patch(#4). System fails very early in the boot process. So, I could'nt collect much logs. It failed both on AMD and Intel machines. I will try to collect more logs. Thanks Babu On 5/27/21 6:51 PM, Dave Hansen wrote: > > From: Dave Hansen <dave.hansen@linux.intel.com> > > There are two points in the kernel which write to PKRU in a buggy way: > > * In switch_fpu_finish(), where having xfeatures[PKRU]=0 will result > in PKRU being assigned 'init_pkru_value' instead of 0x0. > * In write_pkru(), xfeatures[PKRU]=0 will result in PKRU having the > correct value, but the XSAVE buffer will remain stale because > xfeatures is not updated. > > Both of them screw up the fact that get_xsave_addr() will return NULL > for PKRU when it is in the XSAVE "init state". This went unnoticed > until now because on Intel hardware XINUSE[PKRU] is never 0 because > of the kernel policy around 'init_pkru_value'. AMD hardware, on the > other hand, can set XINUSE[PKRU]=0 via a normal WRPKRU(0). The > handy selftests somewhat accidentally produced a case[2] where > WRPKRU(0) occurs. > > get_xsave_addr() is a horrible interface[1], especially when used for > writing state. It is too easy for callers to be tricked into thinking: > 1. On a NULL return that they have no work to do > 2. On a valid pointer return that they *can* safely write state > without doing more work like setting an xfeatures bit. > > Wrap get_xsave_addr() with some additional infrastructure. Ensure > that callers must declare their intent to read or write to the state. > Inject the init state into both reads *and* writes. This ensures > that writers never have to deal with detritus from previous state. > > The new common xstate infrastructure: > > xstatebuf_get_write_ptr() > and > xfeature_init_space() > > should be quite usable for other xfeatures with trivial updates to > xfeature_init_space(). My hope is that we can move away from > all use of get_xsave_addr(), replacing it with things like > xstate_read_pkru(). > > The new BUG_ON()s are not great. But, they do represent a severe > violation of expectations and XSAVE state can be security-sensitive > and these represent a truly dazed-and-confused situation. > > 1. I know, I wrote it. I'm really sorry. > 2. https://lore.kernel.org/linux-kselftest/b2e0324a-9125-bb34-9e76-81817df27c48@amd.com/ > > Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> > Fixes: 0d714dba1626 ("x86/fpu: Update xstate's PKRU value on write_pkru()") > Cc: Thomas Gleixner <tglx@linutronix.de> > Cc: Ingo Molnar <mingo@redhat.com> > Cc: Borislav Petkov <bp@alien8.de> > Cc: x86@kernel.org > Cc: Andy Lutomirski <luto@kernel.org> > Cc: Shuah Khan <shuah@kernel.org> > Cc: Babu Moger <babu.moger@amd.com> > Cc: Dave Kleikamp <dave.kleikamp@oracle.com> > Cc: Ram Pai <linuxram@us.ibm.com> > Cc: Thiago Jung Bauermann <bauerman@linux.ibm.com> > Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de> > --- > > b/arch/x86/include/asm/fpu/internal.h | 8 -- > b/arch/x86/include/asm/fpu/xstate.h | 111 +++++++++++++++++++++++++++++++--- > b/arch/x86/include/asm/processor.h | 7 ++ > b/arch/x86/kernel/cpu/common.c | 6 - > b/arch/x86/mm/pkeys.c | 6 - > 5 files changed, 115 insertions(+), 23 deletions(-) > > diff -puN arch/x86/include/asm/fpu/internal.h~write_pkru arch/x86/include/asm/fpu/internal.h > --- a/arch/x86/include/asm/fpu/internal.h~write_pkru 2021-05-27 16:40:26.903705463 -0700 > +++ b/arch/x86/include/asm/fpu/internal.h 2021-05-27 16:40:26.919705463 -0700 > @@ -564,7 +564,6 @@ static inline void switch_fpu_prepare(st > static inline void switch_fpu_finish(struct fpu *new_fpu) > { > u32 pkru_val = init_pkru_value; > - struct pkru_state *pk; > > if (!static_cpu_has(X86_FEATURE_FPU)) > return; > @@ -578,11 +577,8 @@ static inline void switch_fpu_finish(str > * PKRU state is switched eagerly because it needs to be valid before we > * return to userland e.g. for a copy_to_user() operation. > */ > - if (current->mm) { > - pk = get_xsave_addr(&new_fpu->state.xsave, XFEATURE_PKRU); > - if (pk) > - pkru_val = pk->pkru; > - } > + if (current->mm) > + pkru_val = xstate_read_pkru(&new_fpu->state.xsave); > __write_pkru(pkru_val); > > /* > diff -puN arch/x86/include/asm/fpu/xstate.h~write_pkru arch/x86/include/asm/fpu/xstate.h > --- a/arch/x86/include/asm/fpu/xstate.h~write_pkru 2021-05-27 16:40:26.906705463 -0700 > +++ b/arch/x86/include/asm/fpu/xstate.h 2021-05-27 16:40:26.919705463 -0700 > @@ -124,27 +124,124 @@ static inline u32 read_pkru(void) > return 0; > } > > +static inline void xfeature_mark_non_init(struct xregs_state *xstate, > + int xfeature_nr) > +{ > + /* > + * Caller will place data in the @xstate buffer. > + * Mark the xfeature as non-init: > + */ > + xstate->header.xfeatures |= BIT_ULL(xfeature_nr); > +} > + > + > +/* Set the contents of @xfeature_nr to the hardware init state */ > +static inline void xfeature_init_space(struct xregs_state *xstate, > + int xfeature_nr) > +{ > + void *state = get_xsave_addr(xstate, xfeature_nr); > + > + switch (xfeature_nr) { > + case XFEATURE_PKRU: > + /* zero the whole state, including reserved bits */ > + memset(state, 0, sizeof(struct pkru_state)); > + break; > + default: > + BUG(); > + } > +} > + > +/* > + * Called when it is necessary to write to an XSAVE > + * component feature. Guarantees that a future > + * XRSTOR of the 'xstate' buffer will not consider > + * @xfeature_nr to be in its init state. > + * > + * The returned buffer may contain old state. The > + * caller must be prepared to fill the entire buffer. > + * > + * Caller must first ensure that @xfeature_nr is > + * enabled and present in the @xstate buffer. > + */ > +static inline void *xstatebuf_get_write_ptr(struct xregs_state *xstate, > + int xfeature_nr) > +{ > + bool feature_was_init = xstate->header.xfeatures & BIT_ULL(xfeature_nr); > + > + /* > + * xcomp_bv represents whether 'xstate' has space for > + * features. If not, something is horribly wrong and > + * a write would corrupt memory. Perhaps xfeature_nr > + * was not enabled. > + */ > + BUG_ON(!(xstate->header.xcomp_bv & BIT_ULL(xfeature_nr))); > + > + /* > + * Ensure a sane xfeature_nr, including avoiding > + * confusion with XCOMP_BV_COMPACTED_FORMAT. > + */ > + BUG_ON(xfeature_nr >= XFEATURE_MAX); > + > + /* Prepare xstate for a write to the xfeature: */ > + xfeature_mark_non_init(xstate, xfeature_nr); > + > + /* > + * If xfeature_nr was in the init state, update the buffer > + * to match the state. Ensures that callers can safely > + * write only a part of the state, they are not forced to > + * write it in its entirety. > + */ > + if (feature_was_init) > + xfeature_init_space(xstate, xfeature_nr); > + > + return get_xsave_addr(xstate, xfeature_nr); > +} > + > +/* Caller must ensure X86_FEATURE_OSPKE is enabled. */ > +static inline void xstate_write_pkru(struct xregs_state *xstate, u32 pkru) > +{ > + struct pkru_state *pk; > + > + pk = xstatebuf_get_write_ptr(xstate, XFEATURE_PKRU); > + pk->pkru = pkru; > +} > + > +/* > + * What PKRU value is represented in the 'xstate'? Note, > + * this returns the *architecturally* represented value, > + * not the literal in-memory value. They may be different. > + */ > +static inline u32 xstate_read_pkru(struct xregs_state *xstate) > +{ > + struct pkru_state *pk; > + > + pk = get_xsave_addr(xstate, XFEATURE_PKRU); > + /* > + * If present, pull PKRU out of the XSAVE buffer. > + * Otherwise, use the hardware init value. > + */ > + if (pk) > + return pk->pkru; > + else > + return PKRU_HW_INIT_VALUE; > +} > + > /* > * Update all of the PKRU state for the current task: > * PKRU register and PKRU xstate. > */ > static inline void current_write_pkru(u32 pkru) > { > - struct pkru_state *pk; > - > if (!boot_cpu_has(X86_FEATURE_OSPKE)) > return; > > - pk = get_xsave_addr(¤t->thread.fpu.state.xsave, XFEATURE_PKRU); > - > + fpregs_lock(); > /* > * The PKRU value in xstate needs to be in sync with the value that is > * written to the CPU. The FPU restore on return to userland would > * otherwise load the previous value again. > */ > - fpregs_lock(); > - if (pk) > - pk->pkru = pkru; > + xstate_write_pkru(¤t->thread.fpu.state.xsave, pkru); > __write_pkru(pkru); > fpregs_unlock(); > } > diff -puN arch/x86/include/asm/processor.h~write_pkru arch/x86/include/asm/processor.h > --- a/arch/x86/include/asm/processor.h~write_pkru 2021-05-27 16:40:26.908705463 -0700 > +++ b/arch/x86/include/asm/processor.h 2021-05-27 16:40:26.921705463 -0700 > @@ -854,4 +854,11 @@ enum mds_mitigations { > MDS_MITIGATION_VMWERV, > }; > > +/* > + * The XSAVE architecture defines an "init state" for > + * PKRU. PKRU is set to this value by XRSTOR when it > + * tries to restore PKRU but has on value in the buffer. > + */ > +#define PKRU_HW_INIT_VALUE 0x0 > + > #endif /* _ASM_X86_PROCESSOR_H */ > diff -puN arch/x86/kernel/cpu/common.c~write_pkru arch/x86/kernel/cpu/common.c > --- a/arch/x86/kernel/cpu/common.c~write_pkru 2021-05-27 16:40:26.912705463 -0700 > +++ b/arch/x86/kernel/cpu/common.c 2021-05-27 16:40:26.924705463 -0700 > @@ -466,8 +466,6 @@ static bool pku_disabled; > > static __always_inline void setup_pku(struct cpuinfo_x86 *c) > { > - struct pkru_state *pk; > - > /* check the boot processor, plus compile options for PKU: */ > if (!cpu_feature_enabled(X86_FEATURE_PKU)) > return; > @@ -478,9 +476,7 @@ static __always_inline void setup_pku(st > return; > > cr4_set_bits(X86_CR4_PKE); > - pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU); > - if (pk) > - pk->pkru = init_pkru_value; > + xstate_write_pkru(¤t->thread.fpu.state.xsave, init_pkru_value); > /* > * Seting X86_CR4_PKE will cause the X86_FEATURE_OSPKE > * cpuid bit to be set. We need to ensure that we > diff -puN arch/x86/mm/pkeys.c~write_pkru arch/x86/mm/pkeys.c > --- a/arch/x86/mm/pkeys.c~write_pkru 2021-05-27 16:40:26.914705463 -0700 > +++ b/arch/x86/mm/pkeys.c 2021-05-27 16:40:26.926705463 -0700 > @@ -155,7 +155,6 @@ static ssize_t init_pkru_read_file(struc > static ssize_t init_pkru_write_file(struct file *file, > const char __user *user_buf, size_t count, loff_t *ppos) > { > - struct pkru_state *pk; > char buf[32]; > ssize_t len; > u32 new_init_pkru; > @@ -178,10 +177,7 @@ static ssize_t init_pkru_write_file(stru > return -EINVAL; > > WRITE_ONCE(init_pkru_value, new_init_pkru); > - pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU); > - if (!pk) > - return -EINVAL; > - pk->pkru = new_init_pkru; > + xstate_write_pkru(&init_fpstate.xsave, new_init_pkru); > return count; > } > > _ >
On 6/1/21 4:54 PM, Babu Moger wrote: > Hi Dave, > Thanks for the patches and trying to address the issues. > > I know these patches are in early stages and there is still a discussion > on different ways to address these issues. But I wanted to give a try anyway. > > Tried to boot the system with these patches. But system did not come up > after this patch(#4). System fails very early in the boot process. So, I > could'nt collect much logs. It failed both on AMD and Intel machines. > I will try to collect more logs. > Thanks > Babu Same here. Hitting this line in arch/x86/include/asm/fpu/xstate.h BUG_ON(!(xstate->header.xcomp_bv & BIT_ULL(xfeature_nr))); [ 0.384929] kernel BUG at arch/x86/include/asm/fpu/xstate.h:177! [ 0.385529] invalid opcode: 0000 [#1] SMP NOPTI [ 0.386519] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.13.0-rc4+ #1 [ 0.386519] RIP: 0010:identify_cpu+0x5ee/0x5f0 [ 0.386519] Code: ff 0f 0b ff 14 25 e0 72 e4 bc eb 84 48 89 cf be 09 00 00 00 48 89 4d c0 e8 ef 97 ff ff 48 8b 4d c0 48 c7 00 00 00 00 00 eb a5 <0f> 0b 48 8b 05 01 3e b7 01 48 c1 e8 1d 83 e0 01 74 0b 55 48 89 e5 [ 0.386519] RSP: 0000:ffffffffbce03e28 EFLAGS: 00010246 [ 0.386519] RAX: ffffffffbce14940 RBX: ffffffffbd3c0220 RCX: ffffffffbce15e80 [ 0.386519] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000770eb0 [ 0.386519] RBP: ffffffffbce03e68 R08: 0000000000000000 R09: c0000000fffeffff [ 0.386519] R10: 0000000000000001 R11: ffffffffbce03bf8 R12: ffffffffbd6bcd00 [ 0.386519] R13: ffffffffbd6bcd58 R14: 0000000000000000 R15: 0000000000000246 [ 0.386519] FS: 0000000000000000(0000) GS:ffff91280d200000(0000) knlGS:0000000000000000 [ 0.386519] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 0.386519] CR2: ffff924b3fbff000 CR3: 0000017261a0a001 CR4: 0000000000770eb0 [ 0.386519] Call Trace: [ 0.386519] identify_boot_cpu+0x10/0x9a [ 0.386519] check_bugs+0x2a/0xa19 [ 0.386519] start_kernel+0x4c7/0x4fa [ 0.386519] x86_64_start_reservations+0x32/0x34 [ 0.386519] x86_64_start_kernel+0x8a/0x8d [ 0.386519] secondary_startup_64_no_verify+0xc2/0xcb [ 0.386519] Modules linked in: [ 0.386521] ---[ end trace 12db536e6a291746 ]--- [ 0.387520] RIP: 0010:identify_cpu+0x5ee/0x5f0 [ 0.388519] Code: ff 0f 0b ff 14 25 e0 72 e4 bc eb 84 48 89 cf be 09 00 00 00 48 89 4d c0 e8 ef 97 ff ff 48 8b 4d c0 48 c7 00 00 00 00 00 eb a5 <0f> 0b 48 8b 05 01 3e b7 01 48 c1 e8 1d 83 e0 01 74 0b 55 48 89 e5 [ 0.389519] RSP: 0000:ffffffffbce03e28 EFLAGS: 00010246 [ 0.390519] RAX: ffffffffbce14940 RBX: ffffffffbd3c0220 RCX: ffffffffbce15e80 [ 0.391519] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000770eb0 [ 0.392519] RBP: ffffffffbce03e68 R08: 0000000000000000 R09: c0000000fffeffff [ 0.393519] R10: 0000000000000001 R11: ffffffffbce03bf8 R12: ffffffffbd6bcd00 [ 0.394519] R13: ffffffffbd6bcd58 R14: 0000000000000000 R15: 0000000000000246 [ 0.395519] FS: 0000000000000000(0000) GS:ffff91280d200000(0000) knlGS:0000000000000000 [ 0.396519] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 0.397519] CR2: ffff924b3fbff000 CR3: 0000017261a0a001 CR4: 0000000000770eb0 [ 0.398520] Kernel panic - not syncing: Fatal exception [ 0.399519] ---[ end Kernel panic - not syncing: Fatal exception ]--- > > On 5/27/21 6:51 PM, Dave Hansen wrote: >> >> From: Dave Hansen <dave.hansen@linux.intel.com> >> >> There are two points in the kernel which write to PKRU in a buggy way: >> >> * In switch_fpu_finish(), where having xfeatures[PKRU]=0 will result >> in PKRU being assigned 'init_pkru_value' instead of 0x0. >> * In write_pkru(), xfeatures[PKRU]=0 will result in PKRU having the >> correct value, but the XSAVE buffer will remain stale because >> xfeatures is not updated. >> >> Both of them screw up the fact that get_xsave_addr() will return NULL >> for PKRU when it is in the XSAVE "init state". This went unnoticed >> until now because on Intel hardware XINUSE[PKRU] is never 0 because >> of the kernel policy around 'init_pkru_value'. AMD hardware, on the >> other hand, can set XINUSE[PKRU]=0 via a normal WRPKRU(0). The >> handy selftests somewhat accidentally produced a case[2] where >> WRPKRU(0) occurs. >> >> get_xsave_addr() is a horrible interface[1], especially when used for >> writing state. It is too easy for callers to be tricked into thinking: >> 1. On a NULL return that they have no work to do >> 2. On a valid pointer return that they *can* safely write state >> without doing more work like setting an xfeatures bit. >> >> Wrap get_xsave_addr() with some additional infrastructure. Ensure >> that callers must declare their intent to read or write to the state. >> Inject the init state into both reads *and* writes. This ensures >> that writers never have to deal with detritus from previous state. >> >> The new common xstate infrastructure: >> >> xstatebuf_get_write_ptr() >> and >> xfeature_init_space() >> >> should be quite usable for other xfeatures with trivial updates to >> xfeature_init_space(). My hope is that we can move away from >> all use of get_xsave_addr(), replacing it with things like >> xstate_read_pkru(). >> >> The new BUG_ON()s are not great. But, they do represent a severe >> violation of expectations and XSAVE state can be security-sensitive >> and these represent a truly dazed-and-confused situation. >> >> 1. I know, I wrote it. I'm really sorry. >> 2. https://lore.kernel.org/linux-kselftest/b2e0324a-9125-bb34-9e76-81817df27c48@amd.com/ >> >> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> >> Fixes: 0d714dba1626 ("x86/fpu: Update xstate's PKRU value on write_pkru()") >> Cc: Thomas Gleixner <tglx@linutronix.de> >> Cc: Ingo Molnar <mingo@redhat.com> >> Cc: Borislav Petkov <bp@alien8.de> >> Cc: x86@kernel.org >> Cc: Andy Lutomirski <luto@kernel.org> >> Cc: Shuah Khan <shuah@kernel.org> >> Cc: Babu Moger <babu.moger@amd.com> >> Cc: Dave Kleikamp <dave.kleikamp@oracle.com> >> Cc: Ram Pai <linuxram@us.ibm.com> >> Cc: Thiago Jung Bauermann <bauerman@linux.ibm.com> >> Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de> >> --- >> >> b/arch/x86/include/asm/fpu/internal.h | 8 -- >> b/arch/x86/include/asm/fpu/xstate.h | 111 +++++++++++++++++++++++++++++++--- >> b/arch/x86/include/asm/processor.h | 7 ++ >> b/arch/x86/kernel/cpu/common.c | 6 - >> b/arch/x86/mm/pkeys.c | 6 - >> 5 files changed, 115 insertions(+), 23 deletions(-) >> >> diff -puN arch/x86/include/asm/fpu/internal.h~write_pkru arch/x86/include/asm/fpu/internal.h >> --- a/arch/x86/include/asm/fpu/internal.h~write_pkru 2021-05-27 16:40:26.903705463 -0700 >> +++ b/arch/x86/include/asm/fpu/internal.h 2021-05-27 16:40:26.919705463 -0700 >> @@ -564,7 +564,6 @@ static inline void switch_fpu_prepare(st >> static inline void switch_fpu_finish(struct fpu *new_fpu) >> { >> u32 pkru_val = init_pkru_value; >> - struct pkru_state *pk; >> >> if (!static_cpu_has(X86_FEATURE_FPU)) >> return; >> @@ -578,11 +577,8 @@ static inline void switch_fpu_finish(str >> * PKRU state is switched eagerly because it needs to be valid before we >> * return to userland e.g. for a copy_to_user() operation. >> */ >> - if (current->mm) { >> - pk = get_xsave_addr(&new_fpu->state.xsave, XFEATURE_PKRU); >> - if (pk) >> - pkru_val = pk->pkru; >> - } >> + if (current->mm) >> + pkru_val = xstate_read_pkru(&new_fpu->state.xsave); >> __write_pkru(pkru_val); >> >> /* >> diff -puN arch/x86/include/asm/fpu/xstate.h~write_pkru arch/x86/include/asm/fpu/xstate.h >> --- a/arch/x86/include/asm/fpu/xstate.h~write_pkru 2021-05-27 16:40:26.906705463 -0700 >> +++ b/arch/x86/include/asm/fpu/xstate.h 2021-05-27 16:40:26.919705463 -0700 >> @@ -124,27 +124,124 @@ static inline u32 read_pkru(void) >> return 0; >> } >> >> +static inline void xfeature_mark_non_init(struct xregs_state *xstate, >> + int xfeature_nr) >> +{ >> + /* >> + * Caller will place data in the @xstate buffer. >> + * Mark the xfeature as non-init: >> + */ >> + xstate->header.xfeatures |= BIT_ULL(xfeature_nr); >> +} >> + >> + >> +/* Set the contents of @xfeature_nr to the hardware init state */ >> +static inline void xfeature_init_space(struct xregs_state *xstate, >> + int xfeature_nr) >> +{ >> + void *state = get_xsave_addr(xstate, xfeature_nr); >> + >> + switch (xfeature_nr) { >> + case XFEATURE_PKRU: >> + /* zero the whole state, including reserved bits */ >> + memset(state, 0, sizeof(struct pkru_state)); >> + break; >> + default: >> + BUG(); >> + } >> +} >> + >> +/* >> + * Called when it is necessary to write to an XSAVE >> + * component feature. Guarantees that a future >> + * XRSTOR of the 'xstate' buffer will not consider >> + * @xfeature_nr to be in its init state. >> + * >> + * The returned buffer may contain old state. The >> + * caller must be prepared to fill the entire buffer. >> + * >> + * Caller must first ensure that @xfeature_nr is >> + * enabled and present in the @xstate buffer. >> + */ >> +static inline void *xstatebuf_get_write_ptr(struct xregs_state *xstate, >> + int xfeature_nr) >> +{ >> + bool feature_was_init = xstate->header.xfeatures & BIT_ULL(xfeature_nr); >> + >> + /* >> + * xcomp_bv represents whether 'xstate' has space for >> + * features. If not, something is horribly wrong and >> + * a write would corrupt memory. Perhaps xfeature_nr >> + * was not enabled. >> + */ >> + BUG_ON(!(xstate->header.xcomp_bv & BIT_ULL(xfeature_nr))); >> + >> + /* >> + * Ensure a sane xfeature_nr, including avoiding >> + * confusion with XCOMP_BV_COMPACTED_FORMAT. >> + */ >> + BUG_ON(xfeature_nr >= XFEATURE_MAX); >> + >> + /* Prepare xstate for a write to the xfeature: */ >> + xfeature_mark_non_init(xstate, xfeature_nr); >> + >> + /* >> + * If xfeature_nr was in the init state, update the buffer >> + * to match the state. Ensures that callers can safely >> + * write only a part of the state, they are not forced to >> + * write it in its entirety. >> + */ >> + if (feature_was_init) >> + xfeature_init_space(xstate, xfeature_nr); >> + >> + return get_xsave_addr(xstate, xfeature_nr); >> +} >> + >> +/* Caller must ensure X86_FEATURE_OSPKE is enabled. */ >> +static inline void xstate_write_pkru(struct xregs_state *xstate, u32 pkru) >> +{ >> + struct pkru_state *pk; >> + >> + pk = xstatebuf_get_write_ptr(xstate, XFEATURE_PKRU); >> + pk->pkru = pkru; >> +} >> + >> +/* >> + * What PKRU value is represented in the 'xstate'? Note, >> + * this returns the *architecturally* represented value, >> + * not the literal in-memory value. They may be different. >> + */ >> +static inline u32 xstate_read_pkru(struct xregs_state *xstate) >> +{ >> + struct pkru_state *pk; >> + >> + pk = get_xsave_addr(xstate, XFEATURE_PKRU); >> + /* >> + * If present, pull PKRU out of the XSAVE buffer. >> + * Otherwise, use the hardware init value. >> + */ >> + if (pk) >> + return pk->pkru; >> + else >> + return PKRU_HW_INIT_VALUE; >> +} >> + >> /* >> * Update all of the PKRU state for the current task: >> * PKRU register and PKRU xstate. >> */ >> static inline void current_write_pkru(u32 pkru) >> { >> - struct pkru_state *pk; >> - >> if (!boot_cpu_has(X86_FEATURE_OSPKE)) >> return; >> >> - pk = get_xsave_addr(¤t->thread.fpu.state.xsave, XFEATURE_PKRU); >> - >> + fpregs_lock(); >> /* >> * The PKRU value in xstate needs to be in sync with the value that is >> * written to the CPU. The FPU restore on return to userland would >> * otherwise load the previous value again. >> */ >> - fpregs_lock(); >> - if (pk) >> - pk->pkru = pkru; >> + xstate_write_pkru(¤t->thread.fpu.state.xsave, pkru); >> __write_pkru(pkru); >> fpregs_unlock(); >> } >> diff -puN arch/x86/include/asm/processor.h~write_pkru arch/x86/include/asm/processor.h >> --- a/arch/x86/include/asm/processor.h~write_pkru 2021-05-27 16:40:26.908705463 -0700 >> +++ b/arch/x86/include/asm/processor.h 2021-05-27 16:40:26.921705463 -0700 >> @@ -854,4 +854,11 @@ enum mds_mitigations { >> MDS_MITIGATION_VMWERV, >> }; >> >> +/* >> + * The XSAVE architecture defines an "init state" for >> + * PKRU. PKRU is set to this value by XRSTOR when it >> + * tries to restore PKRU but has on value in the buffer. >> + */ >> +#define PKRU_HW_INIT_VALUE 0x0 >> + >> #endif /* _ASM_X86_PROCESSOR_H */ >> diff -puN arch/x86/kernel/cpu/common.c~write_pkru arch/x86/kernel/cpu/common.c >> --- a/arch/x86/kernel/cpu/common.c~write_pkru 2021-05-27 16:40:26.912705463 -0700 >> +++ b/arch/x86/kernel/cpu/common.c 2021-05-27 16:40:26.924705463 -0700 >> @@ -466,8 +466,6 @@ static bool pku_disabled; >> >> static __always_inline void setup_pku(struct cpuinfo_x86 *c) >> { >> - struct pkru_state *pk; >> - >> /* check the boot processor, plus compile options for PKU: */ >> if (!cpu_feature_enabled(X86_FEATURE_PKU)) >> return; >> @@ -478,9 +476,7 @@ static __always_inline void setup_pku(st >> return; >> >> cr4_set_bits(X86_CR4_PKE); >> - pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU); >> - if (pk) >> - pk->pkru = init_pkru_value; >> + xstate_write_pkru(¤t->thread.fpu.state.xsave, init_pkru_value); >> /* >> * Seting X86_CR4_PKE will cause the X86_FEATURE_OSPKE >> * cpuid bit to be set. We need to ensure that we >> diff -puN arch/x86/mm/pkeys.c~write_pkru arch/x86/mm/pkeys.c >> --- a/arch/x86/mm/pkeys.c~write_pkru 2021-05-27 16:40:26.914705463 -0700 >> +++ b/arch/x86/mm/pkeys.c 2021-05-27 16:40:26.926705463 -0700 >> @@ -155,7 +155,6 @@ static ssize_t init_pkru_read_file(struc >> static ssize_t init_pkru_write_file(struct file *file, >> const char __user *user_buf, size_t count, loff_t *ppos) >> { >> - struct pkru_state *pk; >> char buf[32]; >> ssize_t len; >> u32 new_init_pkru; >> @@ -178,10 +177,7 @@ static ssize_t init_pkru_write_file(stru >> return -EINVAL; >> >> WRITE_ONCE(init_pkru_value, new_init_pkru); >> - pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU); >> - if (!pk) >> - return -EINVAL; >> - pk->pkru = new_init_pkru; >> + xstate_write_pkru(&init_fpstate.xsave, new_init_pkru); >> return count; >> } >> >> _ >>
On 6/1/21 3:43 PM, Dave Kleikamp wrote: > On 6/1/21 4:54 PM, Babu Moger wrote: >> Hi Dave, >> Thanks for the patches and trying to address the issues. >> >> I know these patches are in early stages and there is still a discussion >> on different ways to address these issues. But I wanted to give a try >> anyway. >> >> Tried to boot the system with these patches. But system did not come up >> after this patch(#4). System fails very early in the boot process. So, I >> could'nt collect much logs. It failed both on AMD and Intel machines. >> I will try to collect more logs. >> Thanks >> Babu > > Same here. Hitting this line in arch/x86/include/asm/fpu/xstate.h > > BUG_ON(!(xstate->header.xcomp_bv & BIT_ULL(xfeature_nr))); > > [ 0.384929] kernel BUG at arch/x86/include/asm/fpu/xstate.h:177! Thanks for the report. I am, indeed, reworking this code a bit. I'll pay close attention if this code remains in some form.
diff -puN arch/x86/include/asm/fpu/internal.h~write_pkru arch/x86/include/asm/fpu/internal.h --- a/arch/x86/include/asm/fpu/internal.h~write_pkru 2021-05-27 16:40:26.903705463 -0700 +++ b/arch/x86/include/asm/fpu/internal.h 2021-05-27 16:40:26.919705463 -0700 @@ -564,7 +564,6 @@ static inline void switch_fpu_prepare(st static inline void switch_fpu_finish(struct fpu *new_fpu) { u32 pkru_val = init_pkru_value; - struct pkru_state *pk; if (!static_cpu_has(X86_FEATURE_FPU)) return; @@ -578,11 +577,8 @@ static inline void switch_fpu_finish(str * PKRU state is switched eagerly because it needs to be valid before we * return to userland e.g. for a copy_to_user() operation. */ - if (current->mm) { - pk = get_xsave_addr(&new_fpu->state.xsave, XFEATURE_PKRU); - if (pk) - pkru_val = pk->pkru; - } + if (current->mm) + pkru_val = xstate_read_pkru(&new_fpu->state.xsave); __write_pkru(pkru_val); /* diff -puN arch/x86/include/asm/fpu/xstate.h~write_pkru arch/x86/include/asm/fpu/xstate.h --- a/arch/x86/include/asm/fpu/xstate.h~write_pkru 2021-05-27 16:40:26.906705463 -0700 +++ b/arch/x86/include/asm/fpu/xstate.h 2021-05-27 16:40:26.919705463 -0700 @@ -124,27 +124,124 @@ static inline u32 read_pkru(void) return 0; } +static inline void xfeature_mark_non_init(struct xregs_state *xstate, + int xfeature_nr) +{ + /* + * Caller will place data in the @xstate buffer. + * Mark the xfeature as non-init: + */ + xstate->header.xfeatures |= BIT_ULL(xfeature_nr); +} + + +/* Set the contents of @xfeature_nr to the hardware init state */ +static inline void xfeature_init_space(struct xregs_state *xstate, + int xfeature_nr) +{ + void *state = get_xsave_addr(xstate, xfeature_nr); + + switch (xfeature_nr) { + case XFEATURE_PKRU: + /* zero the whole state, including reserved bits */ + memset(state, 0, sizeof(struct pkru_state)); + break; + default: + BUG(); + } +} + +/* + * Called when it is necessary to write to an XSAVE + * component feature. Guarantees that a future + * XRSTOR of the 'xstate' buffer will not consider + * @xfeature_nr to be in its init state. + * + * The returned buffer may contain old state. The + * caller must be prepared to fill the entire buffer. + * + * Caller must first ensure that @xfeature_nr is + * enabled and present in the @xstate buffer. + */ +static inline void *xstatebuf_get_write_ptr(struct xregs_state *xstate, + int xfeature_nr) +{ + bool feature_was_init = xstate->header.xfeatures & BIT_ULL(xfeature_nr); + + /* + * xcomp_bv represents whether 'xstate' has space for + * features. If not, something is horribly wrong and + * a write would corrupt memory. Perhaps xfeature_nr + * was not enabled. + */ + BUG_ON(!(xstate->header.xcomp_bv & BIT_ULL(xfeature_nr))); + + /* + * Ensure a sane xfeature_nr, including avoiding + * confusion with XCOMP_BV_COMPACTED_FORMAT. + */ + BUG_ON(xfeature_nr >= XFEATURE_MAX); + + /* Prepare xstate for a write to the xfeature: */ + xfeature_mark_non_init(xstate, xfeature_nr); + + /* + * If xfeature_nr was in the init state, update the buffer + * to match the state. Ensures that callers can safely + * write only a part of the state, they are not forced to + * write it in its entirety. + */ + if (feature_was_init) + xfeature_init_space(xstate, xfeature_nr); + + return get_xsave_addr(xstate, xfeature_nr); +} + +/* Caller must ensure X86_FEATURE_OSPKE is enabled. */ +static inline void xstate_write_pkru(struct xregs_state *xstate, u32 pkru) +{ + struct pkru_state *pk; + + pk = xstatebuf_get_write_ptr(xstate, XFEATURE_PKRU); + pk->pkru = pkru; +} + +/* + * What PKRU value is represented in the 'xstate'? Note, + * this returns the *architecturally* represented value, + * not the literal in-memory value. They may be different. + */ +static inline u32 xstate_read_pkru(struct xregs_state *xstate) +{ + struct pkru_state *pk; + + pk = get_xsave_addr(xstate, XFEATURE_PKRU); + /* + * If present, pull PKRU out of the XSAVE buffer. + * Otherwise, use the hardware init value. + */ + if (pk) + return pk->pkru; + else + return PKRU_HW_INIT_VALUE; +} + /* * Update all of the PKRU state for the current task: * PKRU register and PKRU xstate. */ static inline void current_write_pkru(u32 pkru) { - struct pkru_state *pk; - if (!boot_cpu_has(X86_FEATURE_OSPKE)) return; - pk = get_xsave_addr(¤t->thread.fpu.state.xsave, XFEATURE_PKRU); - + fpregs_lock(); /* * The PKRU value in xstate needs to be in sync with the value that is * written to the CPU. The FPU restore on return to userland would * otherwise load the previous value again. */ - fpregs_lock(); - if (pk) - pk->pkru = pkru; + xstate_write_pkru(¤t->thread.fpu.state.xsave, pkru); __write_pkru(pkru); fpregs_unlock(); } diff -puN arch/x86/include/asm/processor.h~write_pkru arch/x86/include/asm/processor.h --- a/arch/x86/include/asm/processor.h~write_pkru 2021-05-27 16:40:26.908705463 -0700 +++ b/arch/x86/include/asm/processor.h 2021-05-27 16:40:26.921705463 -0700 @@ -854,4 +854,11 @@ enum mds_mitigations { MDS_MITIGATION_VMWERV, }; +/* + * The XSAVE architecture defines an "init state" for + * PKRU. PKRU is set to this value by XRSTOR when it + * tries to restore PKRU but has on value in the buffer. + */ +#define PKRU_HW_INIT_VALUE 0x0 + #endif /* _ASM_X86_PROCESSOR_H */ diff -puN arch/x86/kernel/cpu/common.c~write_pkru arch/x86/kernel/cpu/common.c --- a/arch/x86/kernel/cpu/common.c~write_pkru 2021-05-27 16:40:26.912705463 -0700 +++ b/arch/x86/kernel/cpu/common.c 2021-05-27 16:40:26.924705463 -0700 @@ -466,8 +466,6 @@ static bool pku_disabled; static __always_inline void setup_pku(struct cpuinfo_x86 *c) { - struct pkru_state *pk; - /* check the boot processor, plus compile options for PKU: */ if (!cpu_feature_enabled(X86_FEATURE_PKU)) return; @@ -478,9 +476,7 @@ static __always_inline void setup_pku(st return; cr4_set_bits(X86_CR4_PKE); - pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU); - if (pk) - pk->pkru = init_pkru_value; + xstate_write_pkru(¤t->thread.fpu.state.xsave, init_pkru_value); /* * Seting X86_CR4_PKE will cause the X86_FEATURE_OSPKE * cpuid bit to be set. We need to ensure that we diff -puN arch/x86/mm/pkeys.c~write_pkru arch/x86/mm/pkeys.c --- a/arch/x86/mm/pkeys.c~write_pkru 2021-05-27 16:40:26.914705463 -0700 +++ b/arch/x86/mm/pkeys.c 2021-05-27 16:40:26.926705463 -0700 @@ -155,7 +155,6 @@ static ssize_t init_pkru_read_file(struc static ssize_t init_pkru_write_file(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { - struct pkru_state *pk; char buf[32]; ssize_t len; u32 new_init_pkru; @@ -178,10 +177,7 @@ static ssize_t init_pkru_write_file(stru return -EINVAL; WRITE_ONCE(init_pkru_value, new_init_pkru); - pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU); - if (!pk) - return -EINVAL; - pk->pkru = new_init_pkru; + xstate_write_pkru(&init_fpstate.xsave, new_init_pkru); return count; }