Message ID: 1626229291-6569-7-git-send-email-anshuman.khandual@arm.com (mailing list archive)
State: New
Series: arm64/mm: Enable FEAT_LPA2 (52 bits PA support on 4K|16K pages)
On 14/07/2021 03:21, Anshuman Khandual wrote:
> FEAT_LPA2 requires different PTE representation formats for both 4K and 16K
> page size config. This adds FEAT_LPA2 specific new PTE encodings as per ARM
> ARM (0487G.A) which updates [pte|phys]_to_[phys|pte](). The updated helpers
> would be used when FEAT_LPA2 gets enabled via CONFIG_ARM64_PA_BITS_52 on 4K
> and 16K page size. Although TTBR encoding and phys_to_ttbr() helper remains
> the same as FEAT_LPA for FEAT_LPA2 as well. It updates 'phys_to_pte' helper
> to accept a temporary variable and changes impacted call sites.
>
> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
> ---
>  arch/arm64/include/asm/assembler.h     | 23 +++++++++++++++++++----
>  arch/arm64/include/asm/pgtable-hwdef.h |  4 ++++
>  arch/arm64/include/asm/pgtable.h       |  4 ++++
>  arch/arm64/kernel/head.S               | 25 +++++++++++++------------
>  4 files changed, 40 insertions(+), 16 deletions(-)
>
> diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
> index fedc202..0492543 100644
> --- a/arch/arm64/include/asm/assembler.h
> +++ b/arch/arm64/include/asm/assembler.h
> @@ -606,7 +606,7 @@ alternative_endif
>  #endif
>  	.endm
>
> -	.macro	phys_to_pte, pte, phys
> +	.macro	phys_to_pte, pte, phys, tmp
>  #ifdef CONFIG_ARM64_PA_BITS_52_LPA
>  	/*
>  	 * We assume \phys is 64K aligned and this is guaranteed by only
> @@ -614,6 +614,17 @@ alternative_endif
>  	 */
>  	orr	\pte, \phys, \phys, lsr #36
>  	and	\pte, \pte, #PTE_ADDR_MASK
> +#elif defined(CONFIG_ARM64_PA_BITS_52_LPA2)
> +	orr	\pte, \phys, \phys, lsr #42
> +
> +	/*
> +	 * The 'tmp' is being used here to just prepare
> +	 * and hold PTE_ADDR_MASK which cannot be passed
> +	 * to the subsequent 'and' instruction.
> +	 */
> +	mov	\tmp, #PTE_ADDR_LOW
> +	orr	\tmp, \tmp, #PTE_ADDR_HIGH
> +	and	\pte, \pte, \tmp

Rather than adding an extra temporary register (and the fallout of
various other macros needing an extra register), this can be done with
two AND instructions:

	/* PTE_ADDR_MASK cannot be encoded as an immediate, so
	 * mask off all but two bits, followed by masking the
	 * extra two bits
	 */
	and	\pte, \pte, #PTE_ADDR_MASK | (3 << 10)
	and	\pte, \pte, #~(3 << 10)

Steve

> #else	/* !CONFIG_ARM64_PA_BITS_52_LPA */
> 	mov	\pte, \phys
> #endif	/* CONFIG_ARM64_PA_BITS_52_LPA */
> @@ -621,9 +632,13 @@ alternative_endif
>
> 	.macro	pte_to_phys, phys, pte
> #ifdef CONFIG_ARM64_PA_BITS_52_LPA
> -	ubfiz	\phys, \pte, #(48 - 16 - 12), #16
> -	bfxil	\phys, \pte, #16, #32
> -	lsl	\phys, \phys, #16
> +	ubfiz	\phys, \pte, #(48 - PAGE_SHIFT - 12), #16
> +	bfxil	\phys, \pte, #PAGE_SHIFT, #(48 - PAGE_SHIFT)
> +	lsl	\phys, \phys, #PAGE_SHIFT
> +#elif defined(CONFIG_ARM64_PA_BITS_52_LPA2)
> +	ubfiz	\phys, \pte, #(52 - PAGE_SHIFT - 10), #10
> +	bfxil	\phys, \pte, #PAGE_SHIFT, #(50 - PAGE_SHIFT)
> +	lsl	\phys, \phys, #PAGE_SHIFT
> #else	/* !CONFIG_ARM64_PA_BITS_52_LPA */
> 	and	\phys, \pte, #PTE_ADDR_MASK
> #endif	/* CONFIG_ARM64_PA_BITS_52_LPA */
> diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
> index f375bcf..c815a85 100644
> --- a/arch/arm64/include/asm/pgtable-hwdef.h
> +++ b/arch/arm64/include/asm/pgtable-hwdef.h
> @@ -159,6 +159,10 @@
> #define PTE_ADDR_LOW		(((_AT(pteval_t, 1) << (48 - PAGE_SHIFT)) - 1) << PAGE_SHIFT)
> #define PTE_ADDR_HIGH		(_AT(pteval_t, 0xf) << 12)
> #define PTE_ADDR_MASK		(PTE_ADDR_LOW | PTE_ADDR_HIGH)
> +#elif defined(CONFIG_ARM64_PA_BITS_52_LPA2)
> +#define PTE_ADDR_LOW		(((_AT(pteval_t, 1) << (50 - PAGE_SHIFT)) - 1) << PAGE_SHIFT)
> +#define PTE_ADDR_HIGH		(_AT(pteval_t, 0x3) << 8)
> +#define PTE_ADDR_MASK		(PTE_ADDR_LOW | PTE_ADDR_HIGH)
> #else /* !CONFIG_ARM64_PA_BITS_52_LPA */
> #define PTE_ADDR_LOW		(((_AT(pteval_t, 1) << (48 - PAGE_SHIFT)) - 1) << PAGE_SHIFT)
> #define PTE_ADDR_MASK		PTE_ADDR_LOW
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index 3c57fb2..5e7e402 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -70,6 +70,10 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
> #define __pte_to_phys(pte)	\
> 	((pte_val(pte) & PTE_ADDR_LOW) | ((pte_val(pte) & PTE_ADDR_HIGH) << 36))
> #define __phys_to_pte_val(phys)	(((phys) | ((phys) >> 36)) & PTE_ADDR_MASK)
> +#elif defined(CONFIG_ARM64_PA_BITS_52_LPA2)
> +#define __pte_to_phys(pte)	\
> +	((pte_val(pte) & PTE_ADDR_LOW) | ((pte_val(pte) & PTE_ADDR_HIGH) << 42))
> +#define __phys_to_pte_val(phys)	(((phys) | ((phys) >> 42)) & PTE_ADDR_MASK)
> #else /* !CONFIG_ARM64_PA_BITS_52_LPA */
> #define __pte_to_phys(pte)	(pte_val(pte) & PTE_ADDR_MASK)
> #define __phys_to_pte_val(phys)	(phys)
> diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
> index c5c994a..6444147 100644
> --- a/arch/arm64/kernel/head.S
> +++ b/arch/arm64/kernel/head.S
> @@ -134,9 +134,9 @@ SYM_CODE_END(preserve_boot_args)
>  * Corrupts:	ptrs, tmp1, tmp2
>  * Returns:	tbl -> next level table page address
>  */
> -	.macro	create_table_entry, tbl, virt, shift, ptrs, tmp1, tmp2
> +	.macro	create_table_entry, tbl, virt, shift, ptrs, tmp1, tmp2, tmp3
> 	add	\tmp1, \tbl, #PAGE_SIZE
> -	phys_to_pte \tmp2, \tmp1
> +	phys_to_pte \tmp2, \tmp1, \tmp3
> 	orr	\tmp2, \tmp2, #PMD_TYPE_TABLE	// address of next table and entry type
> 	lsr	\tmp1, \virt, #\shift
> 	sub	\ptrs, \ptrs, #1
> @@ -161,8 +161,8 @@ SYM_CODE_END(preserve_boot_args)
>  * Corrupts:	index, tmp1
>  * Returns:	rtbl
>  */
> -	.macro populate_entries, tbl, rtbl, index, eindex, flags, inc, tmp1
> -.Lpe\@:	phys_to_pte \tmp1, \rtbl
> +	.macro populate_entries, tbl, rtbl, index, eindex, flags, inc, tmp1, tmp2
> +.Lpe\@:	phys_to_pte \tmp1, \rtbl, \tmp2
> 	orr	\tmp1, \tmp1, \flags	// tmp1 = table entry
> 	str	\tmp1, [\tbl, \index, lsl #3]
> 	add	\rtbl, \rtbl, \inc	// rtbl = pa next level
> @@ -224,31 +224,32 @@ SYM_CODE_END(preserve_boot_args)
>  * Preserves:	vstart, vend, flags
>  * Corrupts:	tbl, rtbl, istart, iend, tmp, count, sv
>  */
> -	.macro map_memory, tbl, rtbl, vstart, vend, flags, phys, pgds, istart, iend, tmp, count, sv
> +	.macro map_memory, tbl, rtbl, vstart, vend, flags, phys, pgds, istart, iend, \
> +			   tmp, tmp1, count, sv
> 	add \rtbl, \tbl, #PAGE_SIZE
> 	mov \sv, \rtbl
> 	mov \count, #0
> 	compute_indices \vstart, \vend, #PGDIR_SHIFT, \pgds, \istart, \iend, \count
> -	populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
> +	populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp, \tmp1
> 	mov \tbl, \sv
> 	mov \sv, \rtbl
>
> #if SWAPPER_PGTABLE_LEVELS > 3
> 	compute_indices \vstart, \vend, #PUD_SHIFT, #PTRS_PER_PUD, \istart, \iend, \count
> -	populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
> +	populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp, \tmp1
> 	mov \tbl, \sv
> 	mov \sv, \rtbl
> #endif
>
> #if SWAPPER_PGTABLE_LEVELS > 2
> 	compute_indices \vstart, \vend, #SWAPPER_TABLE_SHIFT, #PTRS_PER_PMD, \istart, \iend, \count
> -	populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
> +	populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp, \tmp1
> 	mov \tbl, \sv
> #endif
>
> 	compute_indices \vstart, \vend, #SWAPPER_BLOCK_SHIFT, #PTRS_PER_PTE, \istart, \iend, \count
> 	bic \count, \phys, #SWAPPER_BLOCK_SIZE - 1
> -	populate_entries \tbl, \count, \istart, \iend, \flags, #SWAPPER_BLOCK_SIZE, \tmp
> +	populate_entries \tbl, \count, \istart, \iend, \flags, #SWAPPER_BLOCK_SIZE, \tmp, \tmp1
> 	.endm
>
> /*
> @@ -343,7 +344,7 @@ SYM_FUNC_START_LOCAL(__create_page_tables)
> #endif
>
> 	mov	x4, EXTRA_PTRS
> -	create_table_entry x0, x3, EXTRA_SHIFT, x4, x5, x6
> +	create_table_entry x0, x3, EXTRA_SHIFT, x4, x5, x6, x20
> #else
> 	/*
> 	 * If VA_BITS == 48, we don't have to configure an additional
> @@ -356,7 +357,7 @@ SYM_FUNC_START_LOCAL(__create_page_tables)
> 	ldr_l	x4, idmap_ptrs_per_pgd
> 	adr_l	x6, __idmap_text_end		// __pa(__idmap_text_end)
>
> -	map_memory x0, x1, x3, x6, x7, x3, x4, x10, x11, x12, x13, x14
> +	map_memory x0, x1, x3, x6, x7, x3, x4, x10, x11, x12, x20, x13, x14
>
> 	/*
> 	 * Map the kernel image (starting with PHYS_OFFSET).
> @@ -370,7 +371,7 @@ SYM_FUNC_START_LOCAL(__create_page_tables)
> 	sub	x6, x6, x3			// _end - _text
> 	add	x6, x6, x5			// runtime __va(_end)
>
> -	map_memory x0, x1, x5, x6, x7, x3, x4, x10, x11, x12, x13, x14
> +	map_memory x0, x1, x5, x6, x7, x3, x4, x10, x11, x12, x20, x13, x14
>
> 	/*
> 	 * Since the page tables have been populated with non-cacheable
>
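Steven's two-AND decomposition is easy to sanity-check outside the kernel. Below is a minimal userspace C sketch (not part of the thread) that recomputes the patch's LPA2 mask definitions for 4K pages and verifies that the pair of encodable immediates reduces to PTE_ADDR_MASK; since AND acts bit by bit, checking all 64 single-bit inputs covers every case:

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Recomputed from the patch's LPA2 definitions for 4K pages
	 * (PAGE_SHIFT = 12): PTE_ADDR_LOW covers PTE bits [49:12],
	 * PTE_ADDR_HIGH covers bits [9:8], leaving a hole at [11:10].
	 */
	#define PAGE_SHIFT	12
	#define PTE_ADDR_LOW	((((uint64_t)1 << (50 - PAGE_SHIFT)) - 1) << PAGE_SHIFT)
	#define PTE_ADDR_HIGH	((uint64_t)0x3 << 8)
	#define PTE_ADDR_MASK	(PTE_ADDR_LOW | PTE_ADDR_HIGH)

	int main(void)
	{
		/*
		 * Filling the [11:10] hole makes the first operand one
		 * contiguous run of ones (bits [49:8]), which AArch64 can
		 * encode as a logical immediate; the second AND clears the
		 * hole bits again.
		 */
		for (uint64_t bit = 1; bit; bit <<= 1) {
			uint64_t two_ands = (bit & (PTE_ADDR_MASK | (3ULL << 10)))
						& ~(3ULL << 10);
			assert(two_ands == (bit & PTE_ADDR_MASK));
		}
		printf("two-AND sequence matches PTE_ADDR_MASK for 4K\n");
		return 0;
	}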
On 7/14/21 9:08 PM, Steven Price wrote:
> On 14/07/2021 03:21, Anshuman Khandual wrote:
>> FEAT_LPA2 requires different PTE representation formats for both 4K and 16K
>> page size config. This adds FEAT_LPA2 specific new PTE encodings as per ARM
>> ARM (0487G.A) which updates [pte|phys]_to_[phys|pte](). The updated helpers
>> would be used when FEAT_LPA2 gets enabled via CONFIG_ARM64_PA_BITS_52 on 4K
>> and 16K page size. Although TTBR encoding and phys_to_ttbr() helper remains
>> the same as FEAT_LPA for FEAT_LPA2 as well. It updates 'phys_to_pte' helper
>> to accept a temporary variable and changes impacted call sites.
>>
>> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
>> ---
>>  arch/arm64/include/asm/assembler.h     | 23 +++++++++++++++++++----
>>  arch/arm64/include/asm/pgtable-hwdef.h |  4 ++++
>>  arch/arm64/include/asm/pgtable.h       |  4 ++++
>>  arch/arm64/kernel/head.S               | 25 +++++++++++++------------
>>  4 files changed, 40 insertions(+), 16 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
>> index fedc202..0492543 100644
>> --- a/arch/arm64/include/asm/assembler.h
>> +++ b/arch/arm64/include/asm/assembler.h
>> @@ -606,7 +606,7 @@ alternative_endif
>>  #endif
>>  	.endm
>>
>> -	.macro	phys_to_pte, pte, phys
>> +	.macro	phys_to_pte, pte, phys, tmp
>>  #ifdef CONFIG_ARM64_PA_BITS_52_LPA
>>  	/*
>>  	 * We assume \phys is 64K aligned and this is guaranteed by only
>> @@ -614,6 +614,17 @@ alternative_endif
>>  	 */
>>  	orr	\pte, \phys, \phys, lsr #36
>>  	and	\pte, \pte, #PTE_ADDR_MASK
>> +#elif defined(CONFIG_ARM64_PA_BITS_52_LPA2)
>> +	orr	\pte, \phys, \phys, lsr #42
>> +
>> +	/*
>> +	 * The 'tmp' is being used here to just prepare
>> +	 * and hold PTE_ADDR_MASK which cannot be passed
>> +	 * to the subsequent 'and' instruction.
>> +	 */
>> +	mov	\tmp, #PTE_ADDR_LOW
>> +	orr	\tmp, \tmp, #PTE_ADDR_HIGH
>> +	and	\pte, \pte, \tmp
> Rather than adding an extra temporary register (and the fallout of
> various other macros needing an extra register), this can be done with
> two AND instructions:

I would really like to get rid of the 'tmp' variable here as
well but did not figure out any method of accomplishing it.

>
> 	/* PTE_ADDR_MASK cannot be encoded as an immediate, so
> 	 * mask off all but two bits, followed by masking the
> 	 * extra two bits
> 	 */
> 	and	\pte, \pte, #PTE_ADDR_MASK | (3 << 10)
> 	and	\pte, \pte, #~(3 << 10)

Did this change as suggested

--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -626,9 +626,8 @@ alternative_endif
 	 * and hold PTE_ADDR_MASK which cannot be passed
 	 * to the subsequent 'and' instruction.
 	 */
-	mov	\tmp, #PTE_ADDR_LOW
-	orr	\tmp, \tmp, #PTE_ADDR_HIGH
-	and	\pte, \pte, \tmp
+	and	\pte, \pte, #PTE_ADDR_MASK | (0x3 << 10)
+	and	\pte, \pte, #~(0x3 << 10)

 .Lskip_lpa2\@:
 	mov	\pte, \phys

but still fails to build (tested on 16K)

arch/arm64/kernel/head.S: Assembler messages:
arch/arm64/kernel/head.S:377: Error: immediate out of range at operand 3 -- `and x6,x6,#((((1<<(50-14))-1)<<14)|(0x3<<8))|(0x3<<10)'
arch/arm64/kernel/head.S:390: Error: immediate out of range at operand 3 -- `and x12,x12,#((((1<<(50-14))-1)<<14)|(0x3<<8))|(0x3<<10)'
arch/arm64/kernel/head.S:390: Error: immediate out of range at operand 3 -- `and x12,x12,#((((1<<(50-14))-1)<<14)|(0x3<<8))|(0x3<<10)'
arch/arm64/kernel/head.S:404: Error: immediate out of range at operand 3 -- `and x12,x12,#((((1<<(50-14))-1)<<14)|(0x3<<8))|(0x3<<10)'
arch/arm64/kernel/head.S:404: Error: immediate out of range at operand 3 -- `and x12,x12,#((((1<<(50-14))-1)<<14)|(0x3<<8))|(0x3<<10)'
On 16/07/2021 08:20, Anshuman Khandual wrote:
>
>
> On 7/14/21 9:08 PM, Steven Price wrote:
>> On 14/07/2021 03:21, Anshuman Khandual wrote:
>>> FEAT_LPA2 requires different PTE representation formats for both 4K and 16K
>>> page size config. This adds FEAT_LPA2 specific new PTE encodings as per ARM
>>> ARM (0487G.A) which updates [pte|phys]_to_[phys|pte](). The updated helpers
>>> would be used when FEAT_LPA2 gets enabled via CONFIG_ARM64_PA_BITS_52 on 4K
>>> and 16K page size. Although TTBR encoding and phys_to_ttbr() helper remains
>>> the same as FEAT_LPA for FEAT_LPA2 as well. It updates 'phys_to_pte' helper
>>> to accept a temporary variable and changes impacted call sites.
>>>
>>> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
>>> ---
>>>  arch/arm64/include/asm/assembler.h     | 23 +++++++++++++++++++----
>>>  arch/arm64/include/asm/pgtable-hwdef.h |  4 ++++
>>>  arch/arm64/include/asm/pgtable.h       |  4 ++++
>>>  arch/arm64/kernel/head.S               | 25 +++++++++++++------------
>>>  4 files changed, 40 insertions(+), 16 deletions(-)
>>>
>>> diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
>>> index fedc202..0492543 100644
>>> --- a/arch/arm64/include/asm/assembler.h
>>> +++ b/arch/arm64/include/asm/assembler.h
>>> @@ -606,7 +606,7 @@ alternative_endif
>>>  #endif
>>>  	.endm
>>>
>>> -	.macro	phys_to_pte, pte, phys
>>> +	.macro	phys_to_pte, pte, phys, tmp
>>>  #ifdef CONFIG_ARM64_PA_BITS_52_LPA
>>>  	/*
>>>  	 * We assume \phys is 64K aligned and this is guaranteed by only
>>> @@ -614,6 +614,17 @@ alternative_endif
>>>  	 */
>>>  	orr	\pte, \phys, \phys, lsr #36
>>>  	and	\pte, \pte, #PTE_ADDR_MASK
>>> +#elif defined(CONFIG_ARM64_PA_BITS_52_LPA2)
>>> +	orr	\pte, \phys, \phys, lsr #42
>>> +
>>> +	/*
>>> +	 * The 'tmp' is being used here to just prepare
>>> +	 * and hold PTE_ADDR_MASK which cannot be passed
>>> +	 * to the subsequent 'and' instruction.
>>> +	 */
>>> +	mov	\tmp, #PTE_ADDR_LOW
>>> +	orr	\tmp, \tmp, #PTE_ADDR_HIGH
>>> +	and	\pte, \pte, \tmp
>> Rather than adding an extra temporary register (and the fallout of
>> various other macros needing an extra register), this can be done with
>> two AND instructions:
>
> I would really like to get rid of the 'tmp' variable here as
> well but did not figure out any method of accomplishing it.
>
>>
>> 	/* PTE_ADDR_MASK cannot be encoded as an immediate, so
>> 	 * mask off all but two bits, followed by masking the
>> 	 * extra two bits
>> 	 */
>> 	and	\pte, \pte, #PTE_ADDR_MASK | (3 << 10)
>> 	and	\pte, \pte, #~(3 << 10)
>
> Did this change as suggested
>
> --- a/arch/arm64/include/asm/assembler.h
> +++ b/arch/arm64/include/asm/assembler.h
> @@ -626,9 +626,8 @@ alternative_endif
>  	 * and hold PTE_ADDR_MASK which cannot be passed
>  	 * to the subsequent 'and' instruction.
>  	 */
> -	mov	\tmp, #PTE_ADDR_LOW
> -	orr	\tmp, \tmp, #PTE_ADDR_HIGH
> -	and	\pte, \pte, \tmp
> +	and	\pte, \pte, #PTE_ADDR_MASK | (0x3 << 10)
> +	and	\pte, \pte, #~(0x3 << 10)
>
>  .Lskip_lpa2\@:
>  	mov	\pte, \phys
>
>
> but still fails to build (tested on 16K)
>
> arch/arm64/kernel/head.S: Assembler messages:
> arch/arm64/kernel/head.S:377: Error: immediate out of range at operand 3 -- `and x6,x6,#((((1<<(50-14))-1)<<14)|(0x3<<8))|(0x3<<10)'
> arch/arm64/kernel/head.S:390: Error: immediate out of range at operand 3 -- `and x12,x12,#((((1<<(50-14))-1)<<14)|(0x3<<8))|(0x3<<10)'
> arch/arm64/kernel/head.S:390: Error: immediate out of range at operand 3 -- `and x12,x12,#((((1<<(50-14))-1)<<14)|(0x3<<8))|(0x3<<10)'
> arch/arm64/kernel/head.S:404: Error: immediate out of range at operand 3 -- `and x12,x12,#((((1<<(50-14))-1)<<14)|(0x3<<8))|(0x3<<10)'
> arch/arm64/kernel/head.S:404: Error: immediate out of range at operand 3 -- `and x12,x12,#((((1<<(50-14))-1)<<14)|(0x3<<8))|(0x3<<10)'
>

Ah, I'd only tested this for 4k. 16k would require a different set of masks.

So the bits we need to cover are those from just below PAGE_SHIFT to the
top of PTE_ADDR_HIGH (bit 10). So we can compute the mask for both 4k
and 16k with GENMASK(PAGE_SHIFT-1, 10):

	and	\pte, \pte, #PTE_ADDR_MASK | GENMASK(PAGE_SHIFT - 1, 10)
	and	\pte, \pte, #~GENMASK(PAGE_SHIFT - 1, 10)

This compiles (for both 4k and 16k) and the assembly looks correct, but
I've not done any other testing.

Steve
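Why GENMASK(PAGE_SHIFT - 1, 10) fixes the 16K build can also be checked in userspace: with the fixed (3 << 10), the 16K intermediate mask still has a hole at bits [13:12] and is therefore not a single run of ones, which is exactly why GAS rejected the immediate above. The sketch below (not from the thread; GENMASK is redefined locally to match the kernel's GENMASK_ULL) verifies, for both 4K and 16K, that each AND operand is a single run of ones, and hence representable as an AArch64 logical immediate, and that the two ANDs still reduce to PTE_ADDR_MASK:

	#include <assert.h>
	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Local stand-in for the kernel's GENMASK_ULL(). */
	#define GENMASK(h, l)	((~0ULL << (l)) & (~0ULL >> (63 - (h))))

	/*
	 * Conservative logical-immediate test: accept any single (possibly
	 * wrapped) run of ones at element size 64. The real AArch64 encoder
	 * also accepts replicated sub-patterns, which these masks don't need.
	 */
	static bool encodable(uint64_t x)
	{
		if (x == 0 || x == ~0ULL)
			return false;
		while (!(x & 1) || (x >> 63))	/* rotate the run down to bit 0 */
			x = (x >> 1) | (x << 63);
		return (x & (x + 1)) == 0;	/* x + 1 power of two: one run */
	}

	static void check(unsigned int page_shift)
	{
		uint64_t low  = ((1ULL << (50 - page_shift)) - 1) << page_shift;
		uint64_t high = 0x3ULL << 8;
		uint64_t mask = low | high;
		uint64_t hole = GENMASK(page_shift - 1, 10);

		/* Both AND operands must be valid logical immediates... */
		assert(encodable(mask | hole) && encodable(~hole));
		/* ...and the pair must still reduce to PTE_ADDR_MASK. */
		for (uint64_t bit = 1; bit; bit <<= 1)
			assert(((bit & (mask | hole)) & ~hole) == (bit & mask));
	}

	int main(void)
	{
		check(12);	/* 4K */
		check(14);	/* 16K */
		printf("GENMASK-based two-AND works for 4K and 16K\n");
		return 0;
	}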
On 7/16/21 3:32 PM, Steven Price wrote:
> On 16/07/2021 08:20, Anshuman Khandual wrote:
>>
>>
>> On 7/14/21 9:08 PM, Steven Price wrote:
>>> On 14/07/2021 03:21, Anshuman Khandual wrote:
>>>> FEAT_LPA2 requires different PTE representation formats for both 4K and 16K
>>>> page size config. This adds FEAT_LPA2 specific new PTE encodings as per ARM
>>>> ARM (0487G.A) which updates [pte|phys]_to_[phys|pte](). The updated helpers
>>>> would be used when FEAT_LPA2 gets enabled via CONFIG_ARM64_PA_BITS_52 on 4K
>>>> and 16K page size. Although TTBR encoding and phys_to_ttbr() helper remains
>>>> the same as FEAT_LPA for FEAT_LPA2 as well. It updates 'phys_to_pte' helper
>>>> to accept a temporary variable and changes impacted call sites.
>>>>
>>>> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
>>>> ---
>>>>  arch/arm64/include/asm/assembler.h     | 23 +++++++++++++++++++----
>>>>  arch/arm64/include/asm/pgtable-hwdef.h |  4 ++++
>>>>  arch/arm64/include/asm/pgtable.h       |  4 ++++
>>>>  arch/arm64/kernel/head.S               | 25 +++++++++++++------------
>>>>  4 files changed, 40 insertions(+), 16 deletions(-)
>>>>
>>>> diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
>>>> index fedc202..0492543 100644
>>>> --- a/arch/arm64/include/asm/assembler.h
>>>> +++ b/arch/arm64/include/asm/assembler.h
>>>> @@ -606,7 +606,7 @@ alternative_endif
>>>>  #endif
>>>>  	.endm
>>>>
>>>> -	.macro	phys_to_pte, pte, phys
>>>> +	.macro	phys_to_pte, pte, phys, tmp
>>>>  #ifdef CONFIG_ARM64_PA_BITS_52_LPA
>>>>  	/*
>>>>  	 * We assume \phys is 64K aligned and this is guaranteed by only
>>>> @@ -614,6 +614,17 @@ alternative_endif
>>>>  	 */
>>>>  	orr	\pte, \phys, \phys, lsr #36
>>>>  	and	\pte, \pte, #PTE_ADDR_MASK
>>>> +#elif defined(CONFIG_ARM64_PA_BITS_52_LPA2)
>>>> +	orr	\pte, \phys, \phys, lsr #42
>>>> +
>>>> +	/*
>>>> +	 * The 'tmp' is being used here to just prepare
>>>> +	 * and hold PTE_ADDR_MASK which cannot be passed
>>>> +	 * to the subsequent 'and' instruction.
>>>> +	 */
>>>> +	mov	\tmp, #PTE_ADDR_LOW
>>>> +	orr	\tmp, \tmp, #PTE_ADDR_HIGH
>>>> +	and	\pte, \pte, \tmp
>>> Rather than adding an extra temporary register (and the fallout of
>>> various other macros needing an extra register), this can be done with
>>> two AND instructions:
>>
>> I would really like to get rid of the 'tmp' variable here as
>> well but did not figure out any method of accomplishing it.
>>
>>>
>>> 	/* PTE_ADDR_MASK cannot be encoded as an immediate, so
>>> 	 * mask off all but two bits, followed by masking the
>>> 	 * extra two bits
>>> 	 */
>>> 	and	\pte, \pte, #PTE_ADDR_MASK | (3 << 10)
>>> 	and	\pte, \pte, #~(3 << 10)
>>
>> Did this change as suggested
>>
>> --- a/arch/arm64/include/asm/assembler.h
>> +++ b/arch/arm64/include/asm/assembler.h
>> @@ -626,9 +626,8 @@ alternative_endif
>>  	 * and hold PTE_ADDR_MASK which cannot be passed
>>  	 * to the subsequent 'and' instruction.
>>  	 */
>> -	mov	\tmp, #PTE_ADDR_LOW
>> -	orr	\tmp, \tmp, #PTE_ADDR_HIGH
>> -	and	\pte, \pte, \tmp
>> +	and	\pte, \pte, #PTE_ADDR_MASK | (0x3 << 10)
>> +	and	\pte, \pte, #~(0x3 << 10)
>>
>>  .Lskip_lpa2\@:
>>  	mov	\pte, \phys
>>
>>
>> but still fails to build (tested on 16K)
>>
>> arch/arm64/kernel/head.S: Assembler messages:
>> arch/arm64/kernel/head.S:377: Error: immediate out of range at operand 3 -- `and x6,x6,#((((1<<(50-14))-1)<<14)|(0x3<<8))|(0x3<<10)'
>> arch/arm64/kernel/head.S:390: Error: immediate out of range at operand 3 -- `and x12,x12,#((((1<<(50-14))-1)<<14)|(0x3<<8))|(0x3<<10)'
>> arch/arm64/kernel/head.S:390: Error: immediate out of range at operand 3 -- `and x12,x12,#((((1<<(50-14))-1)<<14)|(0x3<<8))|(0x3<<10)'
>> arch/arm64/kernel/head.S:404: Error: immediate out of range at operand 3 -- `and x12,x12,#((((1<<(50-14))-1)<<14)|(0x3<<8))|(0x3<<10)'
>> arch/arm64/kernel/head.S:404: Error: immediate out of range at operand 3 -- `and x12,x12,#((((1<<(50-14))-1)<<14)|(0x3<<8))|(0x3<<10)'
>>
>
> Ah, I'd only tested this for 4k. 16k would require a different set of masks.
>
> So the bits we need to cover are those from just below PAGE_SHIFT to the
> top of PTE_ADDR_HIGH (bit 10). So we can compute the mask for both 4k

Okay.

> and 16k with GENMASK(PAGE_SHIFT-1, 10):
>
> 	and	\pte, \pte, #PTE_ADDR_MASK | GENMASK(PAGE_SHIFT - 1, 10)
> 	and	\pte, \pte, #~GENMASK(PAGE_SHIFT - 1, 10)
>
> This compiles (for both 4k and 16k) and the assembly looks correct, but
> I've not done any other testing.

Yeah it works, will do the change.
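The thread settles phys_to_pte; the reverse pte_to_phys path in the patch instead leans on UBFIZ/BFXIL, whose widths are chosen so that the stray low PTE bits copied by UBFIZ land inside the region that BFXIL then overwrites. A userspace model of the two instructions (an illustration, not from the thread) confirms the assembly sequence matches the patch's C helper for both page sizes:

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	/* UBFIZ: zero Rd, then copy Rn[width-1:0] to Rd[lsb+width-1:lsb]. */
	static uint64_t ubfiz(uint64_t rn, unsigned int lsb, unsigned int width)
	{
		return (rn & ((1ULL << width) - 1)) << lsb;
	}

	/* BFXIL: copy Rn[lsb+width-1:lsb] into Rd[width-1:0], keep the rest. */
	static uint64_t bfxil(uint64_t rd, uint64_t rn, unsigned int lsb,
			      unsigned int width)
	{
		uint64_t field = (rn >> lsb) & ((1ULL << width) - 1);

		return (rd & ~((1ULL << width) - 1)) | field;
	}

	/* The patch's LPA2 pte_to_phys assembly sequence, modelled in C. */
	static uint64_t pte_to_phys_asm(uint64_t pte, unsigned int page_shift)
	{
		uint64_t phys = ubfiz(pte, 52 - page_shift - 10, 10);

		phys = bfxil(phys, pte, page_shift, 50 - page_shift);
		return phys << page_shift;
	}

	/* The patch's __pte_to_phys() macro, with the masks spelled out. */
	static uint64_t pte_to_phys_c(uint64_t pte, unsigned int page_shift)
	{
		uint64_t low  = ((1ULL << (50 - page_shift)) - 1) << page_shift;
		uint64_t high = 0x3ULL << 8;

		return (pte & low) | ((pte & high) << 42);
	}

	int main(void)
	{
		for (unsigned int shift = 12; shift <= 14; shift += 2)
			for (uint64_t pte = 1; pte; pte <<= 1)
				assert(pte_to_phys_asm(pte, shift) ==
				       pte_to_phys_c(pte, shift));
		printf("asm and C pte_to_phys agree for 4K and 16K\n");
		return 0;
	}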
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index fedc202..0492543 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -606,7 +606,7 @@ alternative_endif
 #endif
 	.endm
 
-	.macro	phys_to_pte, pte, phys
+	.macro	phys_to_pte, pte, phys, tmp
 #ifdef CONFIG_ARM64_PA_BITS_52_LPA
 	/*
 	 * We assume \phys is 64K aligned and this is guaranteed by only
@@ -614,6 +614,17 @@ alternative_endif
 	 */
 	orr	\pte, \phys, \phys, lsr #36
 	and	\pte, \pte, #PTE_ADDR_MASK
+#elif defined(CONFIG_ARM64_PA_BITS_52_LPA2)
+	orr	\pte, \phys, \phys, lsr #42
+
+	/*
+	 * The 'tmp' is being used here to just prepare
+	 * and hold PTE_ADDR_MASK which cannot be passed
+	 * to the subsequent 'and' instruction.
+	 */
+	mov	\tmp, #PTE_ADDR_LOW
+	orr	\tmp, \tmp, #PTE_ADDR_HIGH
+	and	\pte, \pte, \tmp
 #else	/* !CONFIG_ARM64_PA_BITS_52_LPA */
 	mov	\pte, \phys
 #endif	/* CONFIG_ARM64_PA_BITS_52_LPA */
@@ -621,9 +632,13 @@ alternative_endif
 
 	.macro	pte_to_phys, phys, pte
 #ifdef CONFIG_ARM64_PA_BITS_52_LPA
-	ubfiz	\phys, \pte, #(48 - 16 - 12), #16
-	bfxil	\phys, \pte, #16, #32
-	lsl	\phys, \phys, #16
+	ubfiz	\phys, \pte, #(48 - PAGE_SHIFT - 12), #16
+	bfxil	\phys, \pte, #PAGE_SHIFT, #(48 - PAGE_SHIFT)
+	lsl	\phys, \phys, #PAGE_SHIFT
+#elif defined(CONFIG_ARM64_PA_BITS_52_LPA2)
+	ubfiz	\phys, \pte, #(52 - PAGE_SHIFT - 10), #10
+	bfxil	\phys, \pte, #PAGE_SHIFT, #(50 - PAGE_SHIFT)
+	lsl	\phys, \phys, #PAGE_SHIFT
 #else	/* !CONFIG_ARM64_PA_BITS_52_LPA */
 	and	\phys, \pte, #PTE_ADDR_MASK
 #endif	/* CONFIG_ARM64_PA_BITS_52_LPA */
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index f375bcf..c815a85 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -159,6 +159,10 @@
 #define PTE_ADDR_LOW		(((_AT(pteval_t, 1) << (48 - PAGE_SHIFT)) - 1) << PAGE_SHIFT)
 #define PTE_ADDR_HIGH		(_AT(pteval_t, 0xf) << 12)
 #define PTE_ADDR_MASK		(PTE_ADDR_LOW | PTE_ADDR_HIGH)
+#elif defined(CONFIG_ARM64_PA_BITS_52_LPA2)
+#define PTE_ADDR_LOW		(((_AT(pteval_t, 1) << (50 - PAGE_SHIFT)) - 1) << PAGE_SHIFT)
+#define PTE_ADDR_HIGH		(_AT(pteval_t, 0x3) << 8)
+#define PTE_ADDR_MASK		(PTE_ADDR_LOW | PTE_ADDR_HIGH)
 #else /* !CONFIG_ARM64_PA_BITS_52_LPA */
 #define PTE_ADDR_LOW		(((_AT(pteval_t, 1) << (48 - PAGE_SHIFT)) - 1) << PAGE_SHIFT)
 #define PTE_ADDR_MASK		PTE_ADDR_LOW
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 3c57fb2..5e7e402 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -70,6 +70,10 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
 #define __pte_to_phys(pte)	\
 	((pte_val(pte) & PTE_ADDR_LOW) | ((pte_val(pte) & PTE_ADDR_HIGH) << 36))
 #define __phys_to_pte_val(phys)	(((phys) | ((phys) >> 36)) & PTE_ADDR_MASK)
+#elif defined(CONFIG_ARM64_PA_BITS_52_LPA2)
+#define __pte_to_phys(pte)	\
+	((pte_val(pte) & PTE_ADDR_LOW) | ((pte_val(pte) & PTE_ADDR_HIGH) << 42))
+#define __phys_to_pte_val(phys)	(((phys) | ((phys) >> 42)) & PTE_ADDR_MASK)
 #else /* !CONFIG_ARM64_PA_BITS_52_LPA */
 #define __pte_to_phys(pte)	(pte_val(pte) & PTE_ADDR_MASK)
 #define __phys_to_pte_val(phys)	(phys)
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index c5c994a..6444147 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -134,9 +134,9 @@ SYM_CODE_END(preserve_boot_args)
  * Corrupts:	ptrs, tmp1, tmp2
  * Returns:	tbl -> next level table page address
  */
-	.macro	create_table_entry, tbl, virt, shift, ptrs, tmp1, tmp2
+	.macro	create_table_entry, tbl, virt, shift, ptrs, tmp1, tmp2, tmp3
 	add	\tmp1, \tbl, #PAGE_SIZE
-	phys_to_pte \tmp2, \tmp1
+	phys_to_pte \tmp2, \tmp1, \tmp3
 	orr	\tmp2, \tmp2, #PMD_TYPE_TABLE	// address of next table and entry type
 	lsr	\tmp1, \virt, #\shift
 	sub	\ptrs, \ptrs, #1
@@ -161,8 +161,8 @@ SYM_CODE_END(preserve_boot_args)
  * Corrupts:	index, tmp1
  * Returns:	rtbl
  */
-	.macro populate_entries, tbl, rtbl, index, eindex, flags, inc, tmp1
-.Lpe\@:	phys_to_pte \tmp1, \rtbl
+	.macro populate_entries, tbl, rtbl, index, eindex, flags, inc, tmp1, tmp2
+.Lpe\@:	phys_to_pte \tmp1, \rtbl, \tmp2
 	orr	\tmp1, \tmp1, \flags	// tmp1 = table entry
 	str	\tmp1, [\tbl, \index, lsl #3]
 	add	\rtbl, \rtbl, \inc	// rtbl = pa next level
@@ -224,31 +224,32 @@ SYM_CODE_END(preserve_boot_args)
  * Preserves:	vstart, vend, flags
  * Corrupts:	tbl, rtbl, istart, iend, tmp, count, sv
  */
-	.macro map_memory, tbl, rtbl, vstart, vend, flags, phys, pgds, istart, iend, tmp, count, sv
+	.macro map_memory, tbl, rtbl, vstart, vend, flags, phys, pgds, istart, iend, \
+			   tmp, tmp1, count, sv
 	add \rtbl, \tbl, #PAGE_SIZE
 	mov \sv, \rtbl
 	mov \count, #0
 	compute_indices \vstart, \vend, #PGDIR_SHIFT, \pgds, \istart, \iend, \count
-	populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
+	populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp, \tmp1
 	mov \tbl, \sv
 	mov \sv, \rtbl
 
 #if SWAPPER_PGTABLE_LEVELS > 3
 	compute_indices \vstart, \vend, #PUD_SHIFT, #PTRS_PER_PUD, \istart, \iend, \count
-	populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
+	populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp, \tmp1
 	mov \tbl, \sv
 	mov \sv, \rtbl
 #endif
 
 #if SWAPPER_PGTABLE_LEVELS > 2
 	compute_indices \vstart, \vend, #SWAPPER_TABLE_SHIFT, #PTRS_PER_PMD, \istart, \iend, \count
-	populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
+	populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp, \tmp1
 	mov \tbl, \sv
 #endif
 
 	compute_indices \vstart, \vend, #SWAPPER_BLOCK_SHIFT, #PTRS_PER_PTE, \istart, \iend, \count
 	bic \count, \phys, #SWAPPER_BLOCK_SIZE - 1
-	populate_entries \tbl, \count, \istart, \iend, \flags, #SWAPPER_BLOCK_SIZE, \tmp
+	populate_entries \tbl, \count, \istart, \iend, \flags, #SWAPPER_BLOCK_SIZE, \tmp, \tmp1
 	.endm
 
 /*
@@ -343,7 +344,7 @@ SYM_FUNC_START_LOCAL(__create_page_tables)
 #endif
 
 	mov	x4, EXTRA_PTRS
-	create_table_entry x0, x3, EXTRA_SHIFT, x4, x5, x6
+	create_table_entry x0, x3, EXTRA_SHIFT, x4, x5, x6, x20
 #else
 	/*
 	 * If VA_BITS == 48, we don't have to configure an additional
@@ -356,7 +357,7 @@ SYM_FUNC_START_LOCAL(__create_page_tables)
 	ldr_l	x4, idmap_ptrs_per_pgd
 	adr_l	x6, __idmap_text_end		// __pa(__idmap_text_end)
 
-	map_memory x0, x1, x3, x6, x7, x3, x4, x10, x11, x12, x13, x14
+	map_memory x0, x1, x3, x6, x7, x3, x4, x10, x11, x12, x20, x13, x14
 
 	/*
 	 * Map the kernel image (starting with PHYS_OFFSET).
@@ -370,7 +371,7 @@ SYM_FUNC_START_LOCAL(__create_page_tables)
 	sub	x6, x6, x3			// _end - _text
 	add	x6, x6, x5			// runtime __va(_end)
 
-	map_memory x0, x1, x5, x6, x7, x3, x4, x10, x11, x12, x13, x14
+	map_memory x0, x1, x5, x6, x7, x3, x4, x10, x11, x12, x20, x13, x14
 
 	/*
 	 * Since the page tables have been populated with non-cacheable
FEAT_LPA2 requires a different PTE representation format for each of the 4K
and 16K page size configs. This adds the new FEAT_LPA2-specific PTE encodings
per the ARM ARM (0487G.A), updating [pte|phys]_to_[phys|pte](). The updated
helpers are used when FEAT_LPA2 gets enabled via CONFIG_ARM64_PA_BITS_52 on
4K and 16K page sizes; the TTBR encoding and the phys_to_ttbr() helper remain
the same for FEAT_LPA2 as for FEAT_LPA. It also updates the 'phys_to_pte'
helper to accept a temporary variable and changes the impacted call sites.

Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
---
 arch/arm64/include/asm/assembler.h     | 23 +++++++++++++++++++----
 arch/arm64/include/asm/pgtable-hwdef.h |  4 ++++
 arch/arm64/include/asm/pgtable.h       |  4 ++++
 arch/arm64/kernel/head.S               | 25 +++++++++++++------------
 4 files changed, 40 insertions(+), 16 deletions(-)
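For completeness, a userspace sketch (same caveats as the earlier ones; not part of the thread) of the patch's LPA2 __phys_to_pte_val()/__pte_to_phys() pair, showing PA bits [51:50] being parked in PTE bits [9:8] by the shift of 42 and recovered on the way back:

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	/* The patch's LPA2 definitions, recomputed for 16K (PAGE_SHIFT = 14). */
	#define PAGE_SHIFT	14
	#define PTE_ADDR_LOW	((((uint64_t)1 << (50 - PAGE_SHIFT)) - 1) << PAGE_SHIFT)
	#define PTE_ADDR_HIGH	((uint64_t)0x3 << 8)
	#define PTE_ADDR_MASK	(PTE_ADDR_LOW | PTE_ADDR_HIGH)

	/* __phys_to_pte_val(): PA bits [51:50] drop into PTE bits [9:8]. */
	static uint64_t phys_to_pte_val(uint64_t phys)
	{
		return (phys | (phys >> 42)) & PTE_ADDR_MASK;
	}

	/* __pte_to_phys(): PTE bits [9:8] climb back up to PA bits [51:50]. */
	static uint64_t pte_to_phys(uint64_t pte)
	{
		return (pte & PTE_ADDR_LOW) | ((pte & PTE_ADDR_HIGH) << 42);
	}

	int main(void)
	{
		/* An arbitrary page-aligned PA with bits [51:50] set. */
		uint64_t phys = (0x3ULL << 50) | (0x123456ULL << PAGE_SHIFT);

		assert(pte_to_phys(phys_to_pte_val(phys)) == phys);
		printf("52-bit phys <-> pte round trip OK\n");
		return 0;
	}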