| Message ID | 20211115152835.3212149-14-broonie@kernel.org (mailing list archive) |
|---|---|
| State | New |
| Series | arm64/sme: Initial support for the Scalable Matrix Extension |
On Mon, Nov 15, 2021 at 03:28:11PM +0000, Mark Brown wrote:
> diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h
> index 7b23b16f21ce..6f8ca04b6566 100644
> --- a/arch/arm64/include/uapi/asm/hwcap.h
> +++ b/arch/arm64/include/uapi/asm/hwcap.h
> @@ -76,5 +76,13 @@
>  #define HWCAP2_BTI		(1 << 17)
>  #define HWCAP2_MTE		(1 << 18)
>  #define HWCAP2_ECV		(1 << 19)
> +#define HWCAP2_SME		(1 << 20)
> +#define HWCAP2_SME_I16I64	(1 << 21)
> +#define HWCAP2_SME_F64F64	(1 << 22)
> +#define HWCAP2_SME_I8I32	(1 << 23)
> +#define HWCAP2_SME_F16F32	(1 << 24)
> +#define HWCAP2_SME_B16F32	(1 << 25)
> +#define HWCAP2_SME_F32F32	(1 << 26)
> +#define HWCAP2_SME_FA64	(1 << 27)

At this pace we'll need HWCAP3 pretty soon (since we only allocated
32-bit in each). I wonder whether we could instead not bother at all and
just provide user-space emulation for ID_AA64SMFR0_EL1.

>
>  #endif /* _UAPI__ASM_HWCAP_H */
> diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
> index 81824c7ea74f..3cf60819c354 100644
> --- a/arch/arm64/kernel/cpufeature.c
> +++ b/arch/arm64/kernel/cpufeature.c
> @@ -246,6 +246,7 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = {
>  };
>
>  static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = {
> +	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_SME_SHIFT, 4, 0),
>  	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_MPAMFRAC_SHIFT, 4, 0),
>  	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_RASFRAC_SHIFT, 4, 0),
>  	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_MTE),
> @@ -278,6 +279,24 @@ static const struct arm64_ftr_bits ftr_id_aa64zfr0[] = {
>  	ARM64_FTR_END,
>  };
>
> +static const struct arm64_ftr_bits ftr_id_aa64smfr0[] = {
> +	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
> +		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_FA64_SHIFT, 1, 0),
> +	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
> +		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_I16I64_SHIFT, 4, 0),
> +	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
> +		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_F64F64_SHIFT, 1, 0),
> +	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
> +		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_I8I32_SHIFT, 4, 0),
> +	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
> +		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_F16F32_SHIFT, 1, 0),
> +	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
> +		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_B16F32_SHIFT, 1, 0),
> +	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
> +		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_F32F32_SHIFT, 1, 0),
> +	ARM64_FTR_END,
> +};
> +
>  static const struct arm64_ftr_bits ftr_id_aa64mmfr0[] = {
>  	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_ECV_SHIFT, 4, 0),
>  	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_FGT_SHIFT, 4, 0),
> @@ -628,6 +647,7 @@ static const struct __ftr_reg_entry {
>  	ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64PFR1_EL1, ftr_id_aa64pfr1,
>  			       &id_aa64pfr1_override),
>  	ARM64_FTR_REG(SYS_ID_AA64ZFR0_EL1, ftr_id_aa64zfr0),
> +	ARM64_FTR_REG(SYS_ID_AA64SMFR0_EL1, ftr_id_aa64smfr0),
>
>  	/* Op1 = 0, CRn = 0, CRm = 5 */
>  	ARM64_FTR_REG(SYS_ID_AA64DFR0_EL1, ftr_id_aa64dfr0),
> @@ -939,6 +959,7 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info)
>  	init_cpu_ftr_reg(SYS_ID_AA64PFR0_EL1, info->reg_id_aa64pfr0);
>  	init_cpu_ftr_reg(SYS_ID_AA64PFR1_EL1, info->reg_id_aa64pfr1);
>  	init_cpu_ftr_reg(SYS_ID_AA64ZFR0_EL1, info->reg_id_aa64zfr0);
> +	init_cpu_ftr_reg(SYS_ID_AA64SMFR0_EL1, info->reg_id_aa64smfr0);
>
>  	if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0))
>  		init_32bit_cpu_features(&info->aarch32);
> @@ -2370,6 +2391,30 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
>  		.matches = has_cpuid_feature,
>  		.min_field_value = 1,
>  	},
> +#ifdef CONFIG_ARM64_SME
> +	{
> +		.desc = "Scalable Matrix Extension",
> +		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
> +		.capability = ARM64_SME,
> +		.sys_reg = SYS_ID_AA64PFR1_EL1,
> +		.sign = FTR_UNSIGNED,
> +		.field_pos = ID_AA64PFR1_SME_SHIFT,
> +		.min_field_value = ID_AA64PFR1_SME,
> +		.matches = has_cpuid_feature,
> +		.cpu_enable = sme_kernel_enable,
> +	},
> +	{
> +		.desc = "FA64",
> +		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
> +		.capability = ARM64_SME_FA64,
> +		.sys_reg = SYS_ID_AA64SMFR0_EL1,
> +		.sign = FTR_UNSIGNED,
> +		.field_pos = ID_AA64SMFR0_FA64_SHIFT,
> +		.min_field_value = ID_AA64SMFR0_FA64,
> +		.matches = has_feature_flag,
> +		.cpu_enable = fa64_kernel_enable,
> +	},

I'll comment here rather than the patch introducing has_feature_flag():
an alternative would be to add a .field_width option and in
feature_matches() use cpuid_feature_extract_field_width() directly. All
the arm64_ftr_bits entries already have a width, so just generalise it
for arm64_cpu_capabilities.
On Thu, Dec 09, 2021 at 06:41:26PM +0000, Catalin Marinas wrote:
> On Mon, Nov 15, 2021 at 03:28:11PM +0000, Mark Brown wrote:
> > +#define HWCAP2_SME		(1 << 20)
> > +#define HWCAP2_SME_I16I64	(1 << 21)
> > +#define HWCAP2_SME_F64F64	(1 << 22)
> > +#define HWCAP2_SME_I8I32	(1 << 23)
> > +#define HWCAP2_SME_F16F32	(1 << 24)
> > +#define HWCAP2_SME_B16F32	(1 << 25)
> > +#define HWCAP2_SME_F32F32	(1 << 26)
> > +#define HWCAP2_SME_FA64	(1 << 27)

> At this pace we'll need HWCAP3 pretty soon (since we only allocated
> 32-bit in each). I wonder whether we could instead not bother at all and
> just provide user-space emulation for ID_AA64SMFR0_EL1.

I think so if people are willing to go along with just having userspace
check the ID register (IIRC access to it already does the right thing
but I need to confirm). We'll also need to think about how we handle
any new SVE features, that's got a similar thing going on and is most of
the existing usage of HWCAP2.

> > +	{
> > +		.desc = "FA64",
> > +		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
> > +		.capability = ARM64_SME_FA64,
> > +		.sys_reg = SYS_ID_AA64SMFR0_EL1,
> > +		.sign = FTR_UNSIGNED,
> > +		.field_pos = ID_AA64SMFR0_FA64_SHIFT,
> > +		.min_field_value = ID_AA64SMFR0_FA64,
> > +		.matches = has_feature_flag,
> > +		.cpu_enable = fa64_kernel_enable,
> > +	},

> I'll comment here rather than the patch introducing has_feature_flag():
> an alternative would be to add a .field_width option and in
> feature_matches() use cpuid_feature_extract_field_width() directly. All
> the arm64_ftr_bits entries already have a width, so just generalise it
> for arm64_cpu_capabilities.

Sure, if people are happy with that - it's a more invasive change since
we don't currently set the widths, I wasn't clear if that was a case of
not needing it right now or a design decision.
On Thu, Dec 09, 2021 at 07:28:16PM +0000, Mark Brown wrote:
> On Thu, Dec 09, 2021 at 06:41:26PM +0000, Catalin Marinas wrote:
> > On Mon, Nov 15, 2021 at 03:28:11PM +0000, Mark Brown wrote:
> > > +#define HWCAP2_SME		(1 << 20)
> > > +#define HWCAP2_SME_I16I64	(1 << 21)
> > > +#define HWCAP2_SME_F64F64	(1 << 22)
> > > +#define HWCAP2_SME_I8I32	(1 << 23)
> > > +#define HWCAP2_SME_F16F32	(1 << 24)
> > > +#define HWCAP2_SME_B16F32	(1 << 25)
> > > +#define HWCAP2_SME_F32F32	(1 << 26)
> > > +#define HWCAP2_SME_FA64	(1 << 27)
> >
> > At this pace we'll need HWCAP3 pretty soon (since we only allocated
> > 32-bit in each). I wonder whether we could instead not bother at all and
> > just provide user-space emulation for ID_AA64SMFR0_EL1.
>
> I think so if people are willing to go along with just having userspace
> check the ID register (IIRC access to it already does the right thing
> but I need to confirm). We'll also need to think about how we handle
> any new SVE features, that's got a similar thing going on and is most of
> the existing usage of HWCAP2.

It would be good to get feedback from the libc people. IIRC the ifunc
resolver relies currently on the HWCAP bits. Could this be adapted to
use the MRS instruction? We'd still keep the main HWCAP2_SME but without
the finer-grained bits.

The other option is to start going into the upper 32-bit of the
elf_hwcap. We tried to avoid this some time back when we were still
having doubts about merging ILP32.

> > > +	{
> > > +		.desc = "FA64",
> > > +		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
> > > +		.capability = ARM64_SME_FA64,
> > > +		.sys_reg = SYS_ID_AA64SMFR0_EL1,
> > > +		.sign = FTR_UNSIGNED,
> > > +		.field_pos = ID_AA64SMFR0_FA64_SHIFT,
> > > +		.min_field_value = ID_AA64SMFR0_FA64,
> > > +		.matches = has_feature_flag,
> > > +		.cpu_enable = fa64_kernel_enable,
> > > +	},
> >
> > I'll comment here rather than the patch introducing has_feature_flag():
> > an alternative would be to add a .field_width option and in
> > feature_matches() use cpuid_feature_extract_field_width() directly. All
> > the arm64_ftr_bits entries already have a width, so just generalise it
> > for arm64_cpu_capabilities.
>
> Sure, if people are happy with that - it's a more invasive change since
> we don't currently set the widths, I wasn't clear if that was a case of
> not needing it right now or a design decision.

We didn't have a field_width since they were all 4 bits until SVE. If
you don't want to touch all the entries in the array, we can say that a
0 value (i.e. not explicitly initialised) means default 4 and update it
during init_cpu_features().
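For illustration only (not part of the thread): a minimal userspace sketch of the ID-register approach being discussed here. It gates on the coarse HWCAP2_SME bit from the auxiliary vector and then reads ID_AA64SMFR0_EL1 directly with MRS, assuming the kernel exposes that register to EL0 through its existing ID register emulation - which is exactly what is being proposed. The HWCAP2_SME value and the F64F64 bit position are taken from this patch series and could change.

#include <stdbool.h>
#include <stdint.h>
#include <sys/auxv.h>

#ifndef HWCAP2_SME
#define HWCAP2_SME	(1 << 20)	/* value proposed in this patch */
#endif

static bool cpu_has_sme_f64f64(void)
{
	uint64_t smfr0;

	/* Coarse-grained gate still comes from the aux vector */
	if (!(getauxval(AT_HWCAP2) & HWCAP2_SME))
		return false;

	/* ID_AA64SMFR0_EL1 (S3_0_C0_C4_5); EL0 reads trap to the kernel's emulation */
	__asm__("mrs %0, S3_0_C0_C4_5" : "=r" (smfr0));

	return (smfr0 >> 48) & 0x1;	/* ID_AA64SMFR0_EL1.F64F64 */
}

An ifunc resolver could do the same check at load time to pick an SME-optimised implementation, which is the adaptation being asked about above.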
On Fri, Dec 10, 2021 at 10:41:18AM +0000, Catalin Marinas wrote:
> On Thu, Dec 09, 2021 at 07:28:16PM +0000, Mark Brown wrote:
> > > > +#define HWCAP2_SME_B16F32	(1 << 25)
> > > > +#define HWCAP2_SME_F32F32	(1 << 26)
> > > > +#define HWCAP2_SME_FA64	(1 << 27)
> > >
> > > At this pace we'll need HWCAP3 pretty soon (since we only allocated
> > > 32-bit in each). I wonder whether we could instead not bother at all and
> > > just provide user-space emulation for ID_AA64SMFR0_EL1.
> >
> > I think so if people are willing to go along with just having userspace
> > check the ID register (IIRC access to it already does the right thing
> > but I need to confirm). We'll also need to think about how we handle
> > any new SVE features, that's got a similar thing going on and is most of
> > the existing usage of HWCAP2.
>
> It would be good to get feedback from the libc people. IIRC the ifunc
> resolver relies currently on the HWCAP bits. Could this be adapted to
> use the MRS instruction? We'd still keep the main HWCAP2_SME but without
> the finer-grained bits.
>
> The other option is to start going into the upper 32-bit of the
> elf_hwcap. We tried to avoid this some time back when we were still
> having doubts about merging ILP32.

I'll leave things as they are just now to give more time for feedback.

> > > I'll comment here rather than the patch introducing has_feature_flag():
> > > an alternative would be to add a .field_width option and in
> > > feature_matches() use cpuid_feature_extract_field_width() directly. All
> >
> > Sure, if people are happy with that - it's a more invasive change since
> > we don't currently set the widths, I wasn't clear if that was a case of
> > not needing it right now or a design decision.
>
> We didn't have a field_width since they were all 4 bits until SVE. If
> you don't want to touch all the entries in the array, we can say that a
> 0 value (i.e. not explicitly initialised) means default 4 and update it
> during init_cpu_features().

It's annoying to have to update everything but it feels more idiomatic
and less likely to cause surprises later on.
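As a rough sketch of the alternative discussed above (not existing code): arm64_cpu_capabilities has no field_width member today, so the name is hypothetical, and an unset value of 0 falls back to the historical 4-bit fields as suggested. cpuid_feature_extract_field_width() is the existing helper named in the thread.

static bool
feature_matches(u64 reg, const struct arm64_cpu_capabilities *entry)
{
	/* hypothetical member; 0 (not explicitly initialised) means 4 bits */
	int width = entry->field_width ? entry->field_width : 4;
	int val = cpuid_feature_extract_field_width(reg, entry->field_pos,
						    width, entry->sign);

	return val >= entry->min_field_value;
}

With something along these lines, the single-bit ID_AA64SMFR0_EL1 fields could reuse has_cpuid_feature() with a width of 1 instead of adding a separate has_feature_flag() helper.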
diff --git a/Documentation/arm64/elf_hwcaps.rst b/Documentation/arm64/elf_hwcaps.rst
index af106af8e1c0..155f726816f9 100644
--- a/Documentation/arm64/elf_hwcaps.rst
+++ b/Documentation/arm64/elf_hwcaps.rst
@@ -251,6 +251,39 @@ HWCAP2_ECV
 
     Functionality implied by ID_AA64MMFR0_EL1.ECV == 0b0001.
 
+HWCAP2_SME
+
+    Functionality implied by ID_AA64PFR1_EL1.SME == 0b0001, as described
+    by Documentation/arm64/sme.rst.
+
+HWCAP2_SME_I16I64
+
+    Functionality implied by ID_AA64SMFR0_EL1.I16I64 == 0b1111.
+
+HWCAP2_SME_F64F64
+
+    Functionality implied by ID_AA64SMFR0_EL1.F64F64 == 0b1.
+
+HWCAP2_SME_I8I32
+
+    Functionality implied by ID_AA64SMFR0_EL1.I8I32 == 0b1111.
+
+HWCAP2_SME_F16F32
+
+    Functionality implied by ID_AA64SMFR0_EL1.F16F32 == 0b1.
+
+HWCAP2_SME_B16F32
+
+    Functionality implied by ID_AA64SMFR0_EL1.B16F32 == 0b1.
+
+HWCAP2_SME_F32F32
+
+    Functionality implied by ID_AA64SMFR0_EL1.F32F32 == 0b1.
+
+HWCAP2_SME_FA64
+
+    Functionality implied by ID_AA64SMFR0_EL1.FA64 == 0b1.
+
 4. Unused AT_HWCAP bits
 -----------------------
 
diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h
index 0f6d16faa540..667b66fe1a53 100644
--- a/arch/arm64/include/asm/cpu.h
+++ b/arch/arm64/include/asm/cpu.h
@@ -57,6 +57,7 @@ struct cpuinfo_arm64 {
 	u64		reg_id_aa64pfr0;
 	u64		reg_id_aa64pfr1;
 	u64		reg_id_aa64zfr0;
+	u64		reg_id_aa64smfr0;
 
 	struct cpuinfo_32bit	aarch32;
 
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index ef6be92b1921..079e9f15c5eb 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -727,6 +727,18 @@ static __always_inline bool system_supports_sve(void)
 		cpus_have_const_cap(ARM64_SVE);
 }
 
+static __always_inline bool system_supports_sme(void)
+{
+	return IS_ENABLED(CONFIG_ARM64_SME) &&
+		cpus_have_const_cap(ARM64_SME);
+}
+
+static __always_inline bool system_supports_fa64(void)
+{
+	return IS_ENABLED(CONFIG_ARM64_SME) &&
+		cpus_have_const_cap(ARM64_SME_FA64);
+}
+
 static __always_inline bool system_supports_cnp(void)
 {
 	return IS_ENABLED(CONFIG_ARM64_CNP) &&
diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index cb24385e3632..d5f2825a2412 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -74,6 +74,8 @@ extern void sve_set_vq(unsigned long vq_minus_1);
 
 struct arm64_cpu_capabilities;
 extern void sve_kernel_enable(const struct arm64_cpu_capabilities *__unused);
+extern void sme_kernel_enable(const struct arm64_cpu_capabilities *__unused);
+extern void fa64_kernel_enable(const struct arm64_cpu_capabilities *__unused);
 
 extern u64 read_zcr_features(void);
 
diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h
index b100e0055eab..dc008749e746 100644
--- a/arch/arm64/include/asm/hwcap.h
+++ b/arch/arm64/include/asm/hwcap.h
@@ -106,6 +106,14 @@
 #define KERNEL_HWCAP_BTI		__khwcap2_feature(BTI)
 #define KERNEL_HWCAP_MTE		__khwcap2_feature(MTE)
 #define KERNEL_HWCAP_ECV		__khwcap2_feature(ECV)
+#define KERNEL_HWCAP_SME		__khwcap2_feature(SME)
+#define KERNEL_HWCAP_SME_I16I64		__khwcap2_feature(SME_I16I64)
+#define KERNEL_HWCAP_SME_F64F64		__khwcap2_feature(SME_F64F64)
+#define KERNEL_HWCAP_SME_I8I32		__khwcap2_feature(SME_I8I32)
+#define KERNEL_HWCAP_SME_F16F32		__khwcap2_feature(SME_F16F32)
+#define KERNEL_HWCAP_SME_B16F32		__khwcap2_feature(SME_B16F32)
+#define KERNEL_HWCAP_SME_F32F32		__khwcap2_feature(SME_F32F32)
+#define KERNEL_HWCAP_SME_FA64		__khwcap2_feature(SME_FA64)
 
 /*
  * This yields a mask that user programs can use to figure out what
diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h
index 7b23b16f21ce..6f8ca04b6566 100644
--- a/arch/arm64/include/uapi/asm/hwcap.h
+++ b/arch/arm64/include/uapi/asm/hwcap.h
@@ -76,5 +76,13 @@
 #define HWCAP2_BTI		(1 << 17)
 #define HWCAP2_MTE		(1 << 18)
 #define HWCAP2_ECV		(1 << 19)
+#define HWCAP2_SME		(1 << 20)
+#define HWCAP2_SME_I16I64	(1 << 21)
+#define HWCAP2_SME_F64F64	(1 << 22)
+#define HWCAP2_SME_I8I32	(1 << 23)
+#define HWCAP2_SME_F16F32	(1 << 24)
+#define HWCAP2_SME_B16F32	(1 << 25)
+#define HWCAP2_SME_F32F32	(1 << 26)
+#define HWCAP2_SME_FA64		(1 << 27)
 
 #endif /* _UAPI__ASM_HWCAP_H */
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 81824c7ea74f..3cf60819c354 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -246,6 +246,7 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = {
 };
 
 static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = {
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_SME_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_MPAMFRAC_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_RASFRAC_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_MTE),
@@ -278,6 +279,24 @@ static const struct arm64_ftr_bits ftr_id_aa64zfr0[] = {
 	ARM64_FTR_END,
 };
 
+static const struct arm64_ftr_bits ftr_id_aa64smfr0[] = {
+	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
+		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_FA64_SHIFT, 1, 0),
+	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
+		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_I16I64_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
+		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_F64F64_SHIFT, 1, 0),
+	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
+		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_I8I32_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
+		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_F16F32_SHIFT, 1, 0),
+	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
+		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_B16F32_SHIFT, 1, 0),
+	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
+		       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_F32F32_SHIFT, 1, 0),
+	ARM64_FTR_END,
+};
+
 static const struct arm64_ftr_bits ftr_id_aa64mmfr0[] = {
 	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_ECV_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_FGT_SHIFT, 4, 0),
@@ -628,6 +647,7 @@ static const struct __ftr_reg_entry {
 	ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64PFR1_EL1, ftr_id_aa64pfr1,
 			       &id_aa64pfr1_override),
 	ARM64_FTR_REG(SYS_ID_AA64ZFR0_EL1, ftr_id_aa64zfr0),
+	ARM64_FTR_REG(SYS_ID_AA64SMFR0_EL1, ftr_id_aa64smfr0),
 
 	/* Op1 = 0, CRn = 0, CRm = 5 */
 	ARM64_FTR_REG(SYS_ID_AA64DFR0_EL1, ftr_id_aa64dfr0),
@@ -939,6 +959,7 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info)
 	init_cpu_ftr_reg(SYS_ID_AA64PFR0_EL1, info->reg_id_aa64pfr0);
 	init_cpu_ftr_reg(SYS_ID_AA64PFR1_EL1, info->reg_id_aa64pfr1);
 	init_cpu_ftr_reg(SYS_ID_AA64ZFR0_EL1, info->reg_id_aa64zfr0);
+	init_cpu_ftr_reg(SYS_ID_AA64SMFR0_EL1, info->reg_id_aa64smfr0);
 
 	if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0))
 		init_32bit_cpu_features(&info->aarch32);
@@ -2370,6 +2391,30 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.matches = has_cpuid_feature,
 		.min_field_value = 1,
 	},
+#ifdef CONFIG_ARM64_SME
+	{
+		.desc = "Scalable Matrix Extension",
+		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
+		.capability = ARM64_SME,
+		.sys_reg = SYS_ID_AA64PFR1_EL1,
+		.sign = FTR_UNSIGNED,
+		.field_pos = ID_AA64PFR1_SME_SHIFT,
+		.min_field_value = ID_AA64PFR1_SME,
+		.matches = has_cpuid_feature,
+		.cpu_enable = sme_kernel_enable,
+	},
+	{
+		.desc = "FA64",
+		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
+		.capability = ARM64_SME_FA64,
+		.sys_reg = SYS_ID_AA64SMFR0_EL1,
+		.sign = FTR_UNSIGNED,
+		.field_pos = ID_AA64SMFR0_FA64_SHIFT,
+		.min_field_value = ID_AA64SMFR0_FA64,
+		.matches = has_feature_flag,
+		.cpu_enable = fa64_kernel_enable,
+	},
+#endif /* CONFIG_ARM64_SME */
 	{},
 };
 
@@ -2502,6 +2547,16 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
 	HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_MTE_SHIFT, FTR_UNSIGNED, ID_AA64PFR1_MTE, CAP_HWCAP, KERNEL_HWCAP_MTE),
 #endif /* CONFIG_ARM64_MTE */
 	HWCAP_CAP(SYS_ID_AA64MMFR0_EL1, ID_AA64MMFR0_ECV_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ECV),
+#ifdef CONFIG_ARM64_SME
+	HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_SME_SHIFT, FTR_UNSIGNED, ID_AA64PFR1_SME, CAP_HWCAP, KERNEL_HWCAP_SME),
+	HWCAP_CAP_FLAG(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_FA64_SHIFT, CAP_HWCAP, KERNEL_HWCAP_SME_FA64),
+	HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_I16I64_SHIFT, FTR_UNSIGNED, ID_AA64SMFR0_I16I64, CAP_HWCAP, KERNEL_HWCAP_SME_I16I64),
+	HWCAP_CAP_FLAG(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_F64F64_SHIFT, CAP_HWCAP, KERNEL_HWCAP_SME_F64F64),
+	HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_I8I32_SHIFT, FTR_UNSIGNED, ID_AA64SMFR0_I8I32, CAP_HWCAP, KERNEL_HWCAP_SME_I8I32),
+	HWCAP_CAP_FLAG(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_F16F32_SHIFT, CAP_HWCAP, KERNEL_HWCAP_SME_F16F32),
+	HWCAP_CAP_FLAG(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_B16F32_SHIFT, CAP_HWCAP, KERNEL_HWCAP_SME_B16F32),
+	HWCAP_CAP_FLAG(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_F32F32_SHIFT, CAP_HWCAP, KERNEL_HWCAP_SME_F32F32),
+#endif /* CONFIG_ARM64_SME */
 	{},
 };
 
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index 6e27b759056a..5b0e9d2643a8 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -95,6 +95,14 @@ static const char *const hwcap_str[] = {
 	[KERNEL_HWCAP_BTI]		= "bti",
 	[KERNEL_HWCAP_MTE]		= "mte",
 	[KERNEL_HWCAP_ECV]		= "ecv",
+	[KERNEL_HWCAP_SME]		= "sme",
+	[KERNEL_HWCAP_SME_I16I64]	= "smei16i64",
+	[KERNEL_HWCAP_SME_F64F64]	= "smef64f64",
+	[KERNEL_HWCAP_SME_I8I32]	= "smei8i32",
+	[KERNEL_HWCAP_SME_F16F32]	= "smef16f32",
+	[KERNEL_HWCAP_SME_B16F32]	= "smeb16f32",
+	[KERNEL_HWCAP_SME_F32F32]	= "smef32f32",
+	[KERNEL_HWCAP_SME_FA64]		= "smefa64",
 };
 
 #ifdef CONFIG_COMPAT
@@ -397,6 +405,7 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info)
 	info->reg_id_aa64pfr0 = read_cpuid(ID_AA64PFR0_EL1);
 	info->reg_id_aa64pfr1 = read_cpuid(ID_AA64PFR1_EL1);
 	info->reg_id_aa64zfr0 = read_cpuid(ID_AA64ZFR0_EL1);
+	info->reg_id_aa64smfr0 = read_cpuid(ID_AA64SMFR0_EL1);
 
 	if (id_aa64pfr1_mte(info->reg_id_aa64pfr1))
 		info->reg_gmid = read_cpuid(GMID_EL1);
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 4a98cc3b1df1..8dde4593ca73 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -983,6 +983,32 @@ void fpsimd_release_task(struct task_struct *dead_task)
 
 #endif /* CONFIG_ARM64_SVE */
 
+#ifdef CONFIG_ARM64_SME
+
+void sme_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p)
+{
+	/* Set priority for all PEs to architecturally defined minimum */
+	write_sysreg_s(read_sysreg_s(SYS_SMPRI_EL1) & ~SMPRI_EL1_PRIORITY_MASK,
+		       SYS_SMPRI_EL1);
+
+	/* Allow SME in kernel */
+	write_sysreg(read_sysreg(CPACR_EL1) | CPACR_EL1_SMEN_EL1EN, CPACR_EL1);
+	isb();
+}
+
+/*
+ * This must be called after sme_kernel_enable(), we rely on the
+ * feature table being sorted to ensure this.
+ */
+void fa64_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p)
+{
+	/* Allow use of FA64 */
+	write_sysreg_s(read_sysreg_s(SYS_SMCR_EL1) | SMCR_ELx_FA64_MASK,
+		       SYS_SMCR_EL1);
+}
+
+#endif /* CONFIG_ARM64_SME */
+
 /*
  * Trapped SVE access
  *
@@ -1525,6 +1551,10 @@ static int __init fpsimd_init(void)
 	if (!cpu_have_named_feature(ASIMD))
 		pr_notice("Advanced SIMD is not implemented\n");
 
+
+	if (cpu_have_named_feature(SME) && !cpu_have_named_feature(SVE))
+		pr_notice("SME is implemented but not SVE\n");
+
 	return sve_sysctl_init();
 }
 core_initcall(fpsimd_init);
diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
index 870c39537dd0..58dfc8547c64 100644
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -41,6 +41,8 @@ KVM_PROTECTED_MODE
 MISMATCHED_CACHE_TYPE
 MTE
 MTE_ASYMM
+SME
+SME_FA64
 SPECTRE_V2
 SPECTRE_V3A
 SPECTRE_V4
This patch introduces basic cpufeature support for discovering the
presence of the Scalable Matrix Extension and reporting hwcaps for the
detected features.

Signed-off-by: Mark Brown <broonie@kernel.org>
---
 Documentation/arm64/elf_hwcaps.rst  | 33 +++++++++++++++++
 arch/arm64/include/asm/cpu.h        |  1 +
 arch/arm64/include/asm/cpufeature.h | 12 +++++++
 arch/arm64/include/asm/fpsimd.h     |  2 ++
 arch/arm64/include/asm/hwcap.h      |  8 +++++
 arch/arm64/include/uapi/asm/hwcap.h |  8 +++++
 arch/arm64/kernel/cpufeature.c      | 55 +++++++++++++++++++++++++++++
 arch/arm64/kernel/cpuinfo.c         |  9 +++++
 arch/arm64/kernel/fpsimd.c          | 30 ++++++++++++++++
 arch/arm64/tools/cpucaps            |  2 ++
 10 files changed, 160 insertions(+)
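For illustration, a minimal sketch of a userspace consumer of the hwcaps this patch adds (not part of the submission). The HWCAP2_* constants are defined locally because the values proposed here may not be in installed uapi headers yet, and only a few of the eight bits are shown; the rest follow the same pattern.

#include <stdio.h>
#include <sys/auxv.h>

#define HWCAP2_SME		(1 << 20)
#define HWCAP2_SME_I16I64	(1 << 21)
#define HWCAP2_SME_F64F64	(1 << 22)
#define HWCAP2_SME_FA64		(1 << 27)

int main(void)
{
	unsigned long hwcap2 = getauxval(AT_HWCAP2);

	/* Corresponds to the "sme", "smei16i64", ... strings in /proc/cpuinfo */
	printf("sme: %d\n", !!(hwcap2 & HWCAP2_SME));
	printf("smei16i64: %d\n", !!(hwcap2 & HWCAP2_SME_I16I64));
	printf("smef64f64: %d\n", !!(hwcap2 & HWCAP2_SME_F64F64));
	printf("smefa64: %d\n", !!(hwcap2 & HWCAP2_SME_FA64));
	return 0;
}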