arm64: save movk instructions in mov_q when the lower 16|32 bits are all zero

Message ID 20220709084830.3124-1-jszhang@kernel.org (mailing list archive)
State New, archived
Series arm64: save movk instructions in mov_q when the lower 16|32 bits are all zero

Commit Message

Jisheng Zhang July 9, 2022, 8:48 a.m. UTC
Currently, mov_q is used to move a constant into a 64-bit register.
When the lower 16 or 32 bits of the constant are all zero, mov_q emits
one or two useless movk instructions. If the mov_q macro is used in a
hot code path, we want to save as many movk instructions as possible.
For example, when CONFIG_ARM64_MTE is 'Y' and CONFIG_KASAN_HW_TAGS is
'N', the following code in the __cpu_setup() routine is a potential
optimization target:

        /* set the TCR_EL1 bits */
        mov_q   x10, TCR_MTE_FLAGS

Before the patch:
	mov	x10, #0x10000000000000
	movk	x10, #0x40, lsl #32
	movk	x10, #0x0, lsl #16
	movk	x10, #0x0

After the patch:
	mov	x10, #0x10000000000000
	movk	x10, #0x40, lsl #32

Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
---
 arch/arm64/include/asm/assembler.h | 4 ++++
 1 file changed, 4 insertions(+)

Comments

Will Deacon July 19, 2022, 6:13 p.m. UTC | #1
On Sat, Jul 09, 2022 at 04:48:30PM +0800, Jisheng Zhang wrote:
> Currently, mov_q is used to move a constant into a 64-bit register.
> When the lower 16 or 32 bits of the constant are all zero, mov_q emits
> one or two useless movk instructions. If the mov_q macro is used in a
> hot code path, we want to save as many movk instructions as possible.
> For example, when CONFIG_ARM64_MTE is 'Y' and CONFIG_KASAN_HW_TAGS is
> 'N', the following code in the __cpu_setup() routine is a potential
> optimization target:
> 
>         /* set the TCR_EL1 bits */
>         mov_q   x10, TCR_MTE_FLAGS
> 
> Before the patch:
> 	mov	x10, #0x10000000000000
> 	movk	x10, #0x40, lsl #32
> 	movk	x10, #0x0, lsl #16
> 	movk	x10, #0x0
> 
> After the patch:
> 	mov	x10, #0x10000000000000
> 	movk	x10, #0x40, lsl #32
> 
> Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
> ---
>  arch/arm64/include/asm/assembler.h | 4 ++++
>  1 file changed, 4 insertions(+)
> 
> diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
> index 8c5a61aeaf8e..09f408424cae 100644
> --- a/arch/arm64/include/asm/assembler.h
> +++ b/arch/arm64/include/asm/assembler.h
> @@ -568,9 +568,13 @@ alternative_endif
>  	movz	\reg, :abs_g3:\val
>  	movk	\reg, :abs_g2_nc:\val
>  	.endif
> +	.if ((((\val) >> 16) & 0xffff) != 0)
>  	movk	\reg, :abs_g1_nc:\val
>  	.endif
> +	.endif
> +	.if (((\val) & 0xffff) != 0)
>  	movk	\reg, :abs_g0_nc:\val
> +	.endif

Please provide some numbers showing that this is worthwhile.

Will
Jisheng Zhang July 26, 2022, 1:44 p.m. UTC | #2
On Tue, Jul 19, 2022 at 07:13:41PM +0100, Will Deacon wrote:
> On Sat, Jul 09, 2022 at 04:48:30PM +0800, Jisheng Zhang wrote:
> > Currently, mov_q is used to move a constant into a 64-bit register.
> > When the lower 16 or 32 bits of the constant are all zero, mov_q emits
> > one or two useless movk instructions. If the mov_q macro is used in a
> > hot code path, we want to save as many movk instructions as possible.
> > For example, when CONFIG_ARM64_MTE is 'Y' and CONFIG_KASAN_HW_TAGS is
> > 'N', the following code in the __cpu_setup() routine is a potential
> > optimization target:
> > 
> >         /* set the TCR_EL1 bits */
> >         mov_q   x10, TCR_MTE_FLAGS
> > 
> > Before the patch:
> > 	mov	x10, #0x10000000000000
> > 	movk	x10, #0x40, lsl #32
> > 	movk	x10, #0x0, lsl #16
> > 	movk	x10, #0x0
> > 
> > After the patch:
> > 	mov	x10, #0x10000000000000
> > 	movk	x10, #0x40, lsl #32
> > 
> > Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
> > ---
> >  arch/arm64/include/asm/assembler.h | 4 ++++
> >  1 file changed, 4 insertions(+)
> > 
> > diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
> > index 8c5a61aeaf8e..09f408424cae 100644
> > --- a/arch/arm64/include/asm/assembler.h
> > +++ b/arch/arm64/include/asm/assembler.h
> > @@ -568,9 +568,13 @@ alternative_endif
> >  	movz	\reg, :abs_g3:\val
> >  	movk	\reg, :abs_g2_nc:\val
> >  	.endif
> > +	.if ((((\val) >> 16) & 0xffff) != 0)
> >  	movk	\reg, :abs_g1_nc:\val
> >  	.endif
> > +	.endif
> > +	.if (((\val) & 0xffff) != 0)
> >  	movk	\reg, :abs_g0_nc:\val
> > +	.endif
> 
> Please provide some numbers showing that this is worthwhile.
> 

No, I have no performance numbers, but here is my opinion
on this patch: the two checks don't add maintenance effort and the
readability stays good, so if they can save two movk instructions,
they are worthwhile to add.


Thanks
Will Deacon July 27, 2022, 8:35 a.m. UTC | #3
On Tue, Jul 26, 2022 at 09:44:40PM +0800, Jisheng Zhang wrote:
> On Tue, Jul 19, 2022 at 07:13:41PM +0100, Will Deacon wrote:
> > On Sat, Jul 09, 2022 at 04:48:30PM +0800, Jisheng Zhang wrote:
> > > Currently, mov_q is used to move a constant into a 64-bit register.
> > > When the lower 16 or 32 bits of the constant are all zero, mov_q emits
> > > one or two useless movk instructions. If the mov_q macro is used in a
> > > hot code path, we want to save as many movk instructions as possible.
> > > For example, when CONFIG_ARM64_MTE is 'Y' and CONFIG_KASAN_HW_TAGS is
> > > 'N', the following code in the __cpu_setup() routine is a potential
> > > optimization target:
> > > 
> > >         /* set the TCR_EL1 bits */
> > >         mov_q   x10, TCR_MTE_FLAGS
> > > 
> > > Before the patch:
> > > 	mov	x10, #0x10000000000000
> > > 	movk	x10, #0x40, lsl #32
> > > 	movk	x10, #0x0, lsl #16
> > > 	movk	x10, #0x0
> > > 
> > > After the patch:
> > > 	mov	x10, #0x10000000000000
> > > 	movk	x10, #0x40, lsl #32
> > > 
> > > Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
> > > ---
> > >  arch/arm64/include/asm/assembler.h | 4 ++++
> > >  1 file changed, 4 insertions(+)
> > > 
> > > diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
> > > index 8c5a61aeaf8e..09f408424cae 100644
> > > --- a/arch/arm64/include/asm/assembler.h
> > > +++ b/arch/arm64/include/asm/assembler.h
> > > @@ -568,9 +568,13 @@ alternative_endif
> > >  	movz	\reg, :abs_g3:\val
> > >  	movk	\reg, :abs_g2_nc:\val
> > >  	.endif
> > > +	.if ((((\val) >> 16) & 0xffff) != 0)
> > >  	movk	\reg, :abs_g1_nc:\val
> > >  	.endif
> > > +	.endif
> > > +	.if (((\val) & 0xffff) != 0)
> > >  	movk	\reg, :abs_g0_nc:\val
> > > +	.endif
> > 
> > Please provide some numbers showing that this is worthwhile.
> > 
> 
> No, I have no performance numbers, but here is my opinion
> on this patch: the two checks don't add maintenance effort and the
> readability stays good, so if they can save two movk instructions,
> they are worthwhile to add.

Not unless you can measure a performance increase, no. The code is always
going to be more readable without this stuff added, so we shouldn't clutter
our low-level assembly macros with nested conditionals just for fun.

Will
Ard Biesheuvel July 27, 2022, 3:15 p.m. UTC | #4
On Sat, 9 Jul 2022 at 01:58, Jisheng Zhang <jszhang@kernel.org> wrote:
>
> Currently, mov_q is used to move a constant into a 64-bit register.
> When the lower 16 or 32 bits of the constant are all zero, mov_q emits
> one or two useless movk instructions. If the mov_q macro is used in a
> hot code path, we want to save as many movk instructions as possible.
> For example, when CONFIG_ARM64_MTE is 'Y' and CONFIG_KASAN_HW_TAGS is
> 'N', the following code in the __cpu_setup() routine is a potential
> optimization target:
>
>         /* set the TCR_EL1 bits */
>         mov_q   x10, TCR_MTE_FLAGS
>
> Before the patch:
>         mov     x10, #0x10000000000000
>         movk    x10, #0x40, lsl #32
>         movk    x10, #0x0, lsl #16
>         movk    x10, #0x0
>
> After the patch:
>         mov     x10, #0x10000000000000
>         movk    x10, #0x40, lsl #32
>
> Signed-off-by: Jisheng Zhang <jszhang@kernel.org>

This is broken for constants that have 0xffff in the top 16 bits, as
in that case, we will emit a MOVN/MOVK/MOVK sequence, and omitting the
MOVKs will set the corresponding field to 0xffff not 0x0.
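
To make this concrete, here is a worked sketch for a constant such as
0xffffffff00000000 (assuming the assembler converts the MOVZ with the
:abs_g2_s: relocation into a MOVN for this value, which is the usual
GAS behaviour):

        /* current mov_q x0, 0xffffffff00000000 */
        movn    x0, #0x0, lsl #32       // x0 = 0xffffffffffffffff
        movk    x0, #0x0, lsl #16       // x0 = 0xffffffff0000ffff
        movk    x0, #0x0                // x0 = 0xffffffff00000000

        /* with the patch, bits [31:16] and [15:0] of the constant are
         * zero, so both movk instructions are skipped and x0 is left
         * holding 0xffffffffffffffff, which is wrong */
        movn    x0, #0x0, lsl #32

The trailing movk instructions are exactly what clears the fields that
MOVN implicitly set to all-ones, so they cannot be dropped merely
because the corresponding bits of the constant are zero.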


> ---
>  arch/arm64/include/asm/assembler.h | 4 ++++
>  1 file changed, 4 insertions(+)
>
> diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
> index 8c5a61aeaf8e..09f408424cae 100644
> --- a/arch/arm64/include/asm/assembler.h
> +++ b/arch/arm64/include/asm/assembler.h
> @@ -568,9 +568,13 @@ alternative_endif
>         movz    \reg, :abs_g3:\val
>         movk    \reg, :abs_g2_nc:\val
>         .endif
> +       .if ((((\val) >> 16) & 0xffff) != 0)
>         movk    \reg, :abs_g1_nc:\val
>         .endif
> +       .endif
> +       .if (((\val) & 0xffff) != 0)
>         movk    \reg, :abs_g0_nc:\val
> +       .endif
>         .endm
>
>  /*
> --
> 2.34.1
>
Jisheng Zhang July 28, 2022, 2:48 p.m. UTC | #5
On Wed, Jul 27, 2022 at 08:15:11AM -0700, Ard Biesheuvel wrote:
> On Sat, 9 Jul 2022 at 01:58, Jisheng Zhang <jszhang@kernel.org> wrote:
> >
> > Currently, mov_q is used to move a constant into a 64-bit register.
> > When the lower 16 or 32 bits of the constant are all zero, mov_q emits
> > one or two useless movk instructions. If the mov_q macro is used in a
> > hot code path, we want to save as many movk instructions as possible.
> > For example, when CONFIG_ARM64_MTE is 'Y' and CONFIG_KASAN_HW_TAGS is
> > 'N', the following code in the __cpu_setup() routine is a potential
> > optimization target:
> >
> >         /* set the TCR_EL1 bits */
> >         mov_q   x10, TCR_MTE_FLAGS
> >
> > Before the patch:
> >         mov     x10, #0x10000000000000
> >         movk    x10, #0x40, lsl #32
> >         movk    x10, #0x0, lsl #16
> >         movk    x10, #0x0
> >
> > After the patch:
> >         mov     x10, #0x10000000000000
> >         movk    x10, #0x40, lsl #32
> >
> > Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
> 
> This is broken for constants that have 0xffff in the top 16 bits, as
> in that case, we will emit a MOVN/MOVK/MOVK sequence, and omitting the
> MOVKs will set the corresponding field to 0xffff not 0x0.

Thanks so much for this hint. I think you are right about the case of
0xffff in the top 16 bits.
Jisheng Zhang July 28, 2022, 3:17 p.m. UTC | #6
On Thu, Jul 28, 2022 at 10:49:02PM +0800, Jisheng Zhang wrote:
> On Wed, Jul 27, 2022 at 08:15:11AM -0700, Ard Biesheuvel wrote:
> > On Sat, 9 Jul 2022 at 01:58, Jisheng Zhang <jszhang@kernel.org> wrote:
> > >
> > > Currently, mov_q is used to move a constant into a 64-bit register.
> > > When the lower 16 or 32 bits of the constant are all zero, mov_q emits
> > > one or two useless movk instructions. If the mov_q macro is used in a
> > > hot code path, we want to save as many movk instructions as possible.
> > > For example, when CONFIG_ARM64_MTE is 'Y' and CONFIG_KASAN_HW_TAGS is
> > > 'N', the following code in the __cpu_setup() routine is a potential
> > > optimization target:
> > >
> > >         /* set the TCR_EL1 bits */
> > >         mov_q   x10, TCR_MTE_FLAGS
> > >
> > > Before the patch:
> > >         mov     x10, #0x10000000000000
> > >         movk    x10, #0x40, lsl #32
> > >         movk    x10, #0x0, lsl #16
> > >         movk    x10, #0x0
> > >
> > > After the patch:
> > >         mov     x10, #0x10000000000000
> > >         movk    x10, #0x40, lsl #32
> > >
> > > Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
> > 
> > This is broken for constants that have 0xffff in the top 16 bits, as
> > in that case, we will emit a MOVN/MOVK/MOVK sequence, and omitting the
> > MOVKs will set the corresponding field to 0xffff not 0x0.
> 
> Thanks so much for this hint. I think you are right about the case of
> 0xffff in the top 16 bits.
> 

The patch breaks the following usage:
mov_q x0, 0xffffffff00000000

I think the reason is that mov_q starts from the high bits; if we change
the macro to start from the LSB, that could solve the breakage. But this
would need a rewrite of mov_q.
Ard Biesheuvel July 28, 2022, 3:40 p.m. UTC | #7
On Thu, 28 Jul 2022 at 08:26, Jisheng Zhang <jszhang@kernel.org> wrote:
>
> On Thu, Jul 28, 2022 at 10:49:02PM +0800, Jisheng Zhang wrote:
> > On Wed, Jul 27, 2022 at 08:15:11AM -0700, Ard Biesheuvel wrote:
> > > On Sat, 9 Jul 2022 at 01:58, Jisheng Zhang <jszhang@kernel.org> wrote:
> > > >
> > > > Currently, mov_q is used to move a constant into a 64-bit register.
> > > > When the lower 16 or 32 bits of the constant are all zero, mov_q emits
> > > > one or two useless movk instructions. If the mov_q macro is used in a
> > > > hot code path, we want to save as many movk instructions as possible.
> > > > For example, when CONFIG_ARM64_MTE is 'Y' and CONFIG_KASAN_HW_TAGS is
> > > > 'N', the following code in the __cpu_setup() routine is a potential
> > > > optimization target:
> > > >
> > > >         /* set the TCR_EL1 bits */
> > > >         mov_q   x10, TCR_MTE_FLAGS
> > > >
> > > > Before the patch:
> > > >         mov     x10, #0x10000000000000
> > > >         movk    x10, #0x40, lsl #32
> > > >         movk    x10, #0x0, lsl #16
> > > >         movk    x10, #0x0
> > > >
> > > > After the patch:
> > > >         mov     x10, #0x10000000000000
> > > >         movk    x10, #0x40, lsl #32
> > > >
> > > > Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
> > >
> > > This is broken for constants that have 0xffff in the top 16 bits, as
> > > in that case, we will emit a MOVN/MOVK/MOVK sequence, and omitting the
> > > MOVKs will set the corresponding field to 0xffff not 0x0.
> >
> > Thanks so much for this hint. I think you are right about the 0xffff in
> > top 16bits case.
> >
>
> The patch breaks the following usage:
> mov_q x0, 0xffffffff00000000
>
> I think the reason is that mov_q starts from the high bits; if we change
> the macro to start from the LSB, that could solve the breakage. But this
> would need a rewrite of mov_q.

No, it has nothing to do with that.

The problem is that the use of MOVN changes the implicit value of the
16-bit fields that are left unspecified, and assigning them in a
different order is not going to make any difference.

I don't think we should further complicate mov_q, and I would argue
that the existing optimization (which I added myself) is premature
already: in the grand scheme of things, one or two instructions more
or less are not going to make a difference anyway, given how rarely
this macro is used. And even if any of these occurrences are on a hot
path, it is not a given that shorter sequences of MOVZ/MOVN/MOVK are
going to execute any faster, as the canonical MOVZ/MOVK/MOVK/MOVK
might well decode to fewer uops.

So in summary, let's leave this code be.

Patch

diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index 8c5a61aeaf8e..09f408424cae 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -568,9 +568,13 @@ alternative_endif
 	movz	\reg, :abs_g3:\val
 	movk	\reg, :abs_g2_nc:\val
 	.endif
+	.if ((((\val) >> 16) & 0xffff) != 0)
 	movk	\reg, :abs_g1_nc:\val
 	.endif
+	.endif
+	.if (((\val) & 0xffff) != 0)
 	movk	\reg, :abs_g0_nc:\val
+	.endif
 	.endm
 
 /*
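
For reference, here is a sketch of how the whole mov_q macro would read
with this hunk applied. The lines outside the hunk are assumed from the
current upstream arch/arm64/include/asm/assembler.h and are not part of
the posted diff; the comments mark what the patch adds:

        .macro  mov_q, reg, val
        .if (((\val) >> 31) == 0 || ((\val) >> 31) == 0x1ffffffff)
        movz    \reg, :abs_g1_s:\val
        .else
        .if (((\val) >> 47) == 0 || ((\val) >> 47) == 0x1ffff)
        movz    \reg, :abs_g2_s:\val
        .else
        movz    \reg, :abs_g3:\val
        movk    \reg, :abs_g2_nc:\val
        .endif
        .if ((((\val) >> 16) & 0xffff) != 0)    // added: skip movk if bits [31:16] are zero
        movk    \reg, :abs_g1_nc:\val
        .endif                                  // pre-existing, now closes the added .if
        .endif                                  // added: closes the outer .if/.else (the >> 31 check)
        .if (((\val) & 0xffff) != 0)            // added: skip movk if bits [15:0] are zero
        movk    \reg, :abs_g0_nc:\val
        .endif                                  // added
        .endm

Note how the pre-existing .endif after the :abs_g1_nc: movk now closes
the newly added .if, while one of the added .endif directives closes the
outer .if/.else; this is the extra nesting that Will and Ard push back
on above.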