diff mbox series

[v3,1/1] iommu/arm-smmu-v3: Simplify useless instructions in arm_smmu_cmdq_build_cmd()

Message ID 20211207094109.1962-2-thunder.leizhen@huawei.com (mailing list archive)
State New, archived
Headers show
Series iommu/arm-smmu-v3: Simplify useless instructions in arm_smmu_cmdq_build_cmd() | expand

Commit Message

Leizhen (ThunderTown) Dec. 7, 2021, 9:41 a.m. UTC
Although the parameter 'cmd' is always passed as a local array variable,
and only this function modifies it, the compiler does not know this. Every
time the 'cmd' variable is updated, a memory write operation is generated.
This generates many useless instruction operations.

To guide the compiler towards proper optimization, 'cmd' is defined as a
local array variable and copied to the output parameter in one go when the
function returns.

The optimization effect can be viewed by running the "size arm-smmu-v3.o"
command.

Before:
   text    data     bss     dec     hex
  28246    1332      56   29634    73c2

After:
   text    data     bss     dec     hex
  28134    1332      56   29522    7352

For example:
	cmd[0] = FIELD_PREP(CMDQ_0_OP, ent->opcode);
case CMDQ_OP_TLBI_EL2_VA:
	cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_NUM, ent->tlbi.num);
	cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_SCALE, ent->tlbi.scale);
	cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid);
	cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_LEAF, ent->tlbi.leaf);
	cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TTL, ent->tlbi.ttl);
	cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TG, ent->tlbi.tg);
	cmd[1] |= ent->tlbi.addr & CMDQ_TLBI_1_VA_MASK;

Before:
  Each "cmd[0] |=" or "cmd[1] |=" operation generates a "str" instruction,
sum = 8.

     ldrb	w4, [x1, #8]		//w4 = ent->tlbi.num
     ubfiz	x4, x4, #12, #5
     mov	w0, #0x0
     orr	x4, x4, x3
     str	x4, [x2]
     autiasp
     ldrb	w3, [x1, #9]		//w3 = ent->tlbi.scale
     ubfiz	x3, x3, #20, #5
     orr	x3, x3, x4
     str	x3, [x2]
     ldrh	w4, [x1, #10]		//w4 = ent->tlbi.asid
     orr	x3, x3, x4, lsl #48
     str	x3, [x2]
     ldrb	w3, [x1, #14]		//w3 = ent->tlbi.leaf
     str	x3, [x2, #8]
     ldrb	w4, [x1, #15]		//w4 = ent->tlbi.ttl
     ubfiz	x4, x4, #8, #2
     orr	x4, x4, x3
     str	x4, [x2, #8]
     ldrb	w3, [x1, #16]		//ent->tlbi.tg
     ubfiz	x3, x3, #10, #2
     orr	x3, x3, x4
     str	x3, [x2, #8]
     ldr	x1, [x1, #24]		//ent->tlbi.addr
     and	x1, x1, #0xfffffffffffff000
     orr	x1, x1, x3
     str	x1, [x2, #8]
     ret

After:
  All "cmd[0] |=" and "cmd[1] |=" operations generate a "stp" instruction,
sum = 1.

3e8:
     mov	w0, #0x0
     autiasp
     stp	x2, x1, [x3]
     ret
     bti	j
3fc:
     ldrb	w5, [x1, #8]		//w5 = ent->tlbi.num
     mov	x2, #0x22		//x2 = ent->opcode = CMDQ_0_OP
     ldrb	w6, [x1, #9]		//w6 = ent->tlbi.scale
     ubfiz	x5, x5, #12, #5
     ldrb	w0, [x1, #16]		//w0 = ent->tlbi.tg
     orr	x5, x5, x2
     ldrb	w7, [x1, #15]		//w7 = ent->tlbi.ttl
     ldr	x4, [x1, #24]		//x4 = ent->tlbi.addr
     ubfiz	x0, x0, #10, #2
     ldrh	w2, [x1, #10]		//w2 = ent->tlbi.asid
     ubfiz	x6, x6, #20, #5
     ldrb	w8, [x1, #14]		//w8 = ent->tlbi.leaf
     and	x4, x4, #0xfffffffffffff000
     ubfiz	x1, x7, #8, #2
     orr	x1, x0, x1
     orr	x2, x6, x2, lsl #48
     orr	x0, x4, x8
     orr	x2, x2, x5
     orr	x1, x1, x0
     b		3e8

Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

Comments

John Garry Dec. 7, 2021, 4:17 p.m. UTC | #1
On 07/12/2021 09:41, Zhen Lei via iommu wrote:
> Although the parameter 'cmd' is always passed by a local array variable,
> and only this function modifies it, the compiler does not know this. Every
> time the 'cmd' variable is updated, a memory write operation is generated.
> This generates many useless instruction operations.

I'd hardly call them useless. More like inefficient or sub-optimum.

> 
> To guide the compiler for proper optimization, 'cmd' is defined as a local
> array variable, and copied to the output parameter at a time when the
> function is returned.
> 
> The optimization effect can be viewed by running the "size arm-smmu-v3.o"
> command.
> 
> Before:
>     text    data     bss     dec     hex
>    28246    1332      56   29634    73c2
> 
> After:
>     text    data     bss     dec     hex
>    28134    1332      56   29522    7352
> 
> For example:
> 	cmd[0] = FIELD_PREP(CMDQ_0_OP, ent->opcode);
> case CMDQ_OP_TLBI_EL2_VA:
> 	cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_NUM, ent->tlbi.num);
> 	cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_SCALE, ent->tlbi.scale);
> 	cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid);
> 	cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_LEAF, ent->tlbi.leaf);
> 	cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TTL, ent->tlbi.ttl);
> 	cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TG, ent->tlbi.tg);
> 	cmd[1] |= ent->tlbi.addr & CMDQ_TLBI_1_VA_MASK;
> 
> Before:
>    Each "cmd[0] |=" or "cmd[1] |=" operation generates a "str" instruction,
> sum = 8.
> 
>       ldrb	w4, [x1, #8]		//w4 = ent->tlbi.num
>       ubfiz	x4, x4, #12, #5
>       mov	w0, #0x0
>       orr	x4, x4, x3
>       str	x4, [x2]
>       autiasp
>       ldrb	w3, [x1, #9]		//w3 = ent->tlbi.scale
>       ubfiz	x3, x3, #20, #5
>       orr	x3, x3, x4
>       str	x3, [x2]
>       ldrh	w4, [x1, #10]		//w4 = ent->tlbi.asid
>       orr	x3, x3, x4, lsl #48
>       str	x3, [x2]
>       ldrb	w3, [x1, #14]		//w3 = ent->tlbi.leaf
>       str	x3, [x2, #8]
>       ldrb	w4, [x1, #15]		//w4 = ent->tlbi.ttl
>       ubfiz	x4, x4, #8, #2
>       orr	x4, x4, x3
>       str	x4, [x2, #8]
>       ldrb	w3, [x1, #16]		//ent->tlbi.tg
>       ubfiz	x3, x3, #10, #2
>       orr	x3, x3, x4
>       str	x3, [x2, #8]
>       ldr	x1, [x1, #24]		//ent->tlbi.addr
>       and	x1, x1, #0xfffffffffffff000
>       orr	x1, x1, x3
>       str	x1, [x2, #8]
>       ret
> 
> After:
>    All "cmd[0] |=" and "cmd[1] |=" operations generate a "stp" instruction,
> sum = 1.
> 
> 3e8:
>       mov	w0, #0x0
>       autiasp
>       stp	x2, x1, [x3]
>       ret
>       bti	j
> 3fc:
>       ldrb	w5, [x1, #8]		//w5 = ent->tlbi.num
>       mov	x2, #0x22		//x2 = ent->opcode = CMDQ_0_OP
>       ldrb	w6, [x1, #9]		//w6 = ent->tlbi.scale
>       ubfiz	x5, x5, #12, #5
>       ldrb	w0, [x1, #16]		//w0 = ent->tlbi.tg
>       orr	x5, x5, x2
>       ldrb	w7, [x1, #15]		//w7 = ent->tlbi.ttl
>       ldr	x4, [x1, #24]		//x4 = ent->tlbi.addr
>       ubfiz	x0, x0, #10, #2
>       ldrh	w2, [x1, #10]		//w2 = ent->tlbi.asid
>       ubfiz	x6, x6, #20, #5
>       ldrb	w8, [x1, #14]		//w8 = ent->tlbi.leaf
>       and	x4, x4, #0xfffffffffffff000
>       ubfiz	x1, x7, #8, #2
>       orr	x1, x0, x1
>       orr	x2, x6, x2, lsl #48
>       orr	x0, x4, x8
>       orr	x2, x2, x5
>       orr	x1, x1, x0
>       b		3e8
> 
> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
> ---
>   drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 11 ++++++++---
>   1 file changed, 8 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> index f5848b351b19359..e55dfc14cac6005 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> @@ -234,10 +234,12 @@ static int queue_remove_raw(struct arm_smmu_queue *q, u64 *ent)
>   }
>   
>   /* High-level queue accessors */
> -static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
> +static int arm_smmu_cmdq_build_cmd(u64 *out_cmd, struct arm_smmu_cmdq_ent *ent)
>   {
> -	memset(cmd, 0, 1 << CMDQ_ENT_SZ_SHIFT);
> -	cmd[0] |= FIELD_PREP(CMDQ_0_OP, ent->opcode);
> +	int i;
> +	u64 cmd[CMDQ_ENT_DWORDS] = {0};

I thought that just {} was preferred. Or could have:
u64 cmd[CMDQ_ENT_DWORDS] = {FIELD_PREP(CMDQ_0_OP, ent->opcode), };
to be more concise

> +
> +	cmd[0] = FIELD_PREP(CMDQ_0_OP, ent->opcode);
>   
>   	switch (ent->opcode) {
>   	case CMDQ_OP_TLBI_EL2_ALL:
> @@ -332,6 +334,9 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
>   		return -ENOENT;
>   	}
>   
> +	for (i = 0; i < CMDQ_ENT_DWORDS; i++)
> +		out_cmd[i] = cmd[i];

how about memcpy() instead?

> +
>   	return 0;
>   }

Did you notice any performance change with this change?

Thanks,
John
Leizhen (ThunderTown) Dec. 8, 2021, 1:40 p.m. UTC | #2
On 2021/12/8 0:17, John Garry wrote:
> 
>> +
>>       return 0;
>>   }
> 
> Did you notice any performance change with this change?

Hi John:
  Thanks for the tip. I wrote a test case today, and I found that the
performance went down rather than up, which is weird. So I decided not to
make that change, since it is also less readable. Instead, I plan to make
only the following modification:
@@ -237,7 +237,7 @@ static int queue_remove_raw(struct arm_smmu_queue *q, u64 *ent)
 static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
 {
        memset(cmd, 0, 1 << CMDQ_ENT_SZ_SHIFT);
-       cmd[0] |= FIELD_PREP(CMDQ_0_OP, ent->opcode);
+       cmd[0] = FIELD_PREP(CMDQ_0_OP, ent->opcode);

        switch (ent->opcode) {
        case CMDQ_OP_TLBI_EL2_ALL:

This prevents the compiler from generating the following two inefficient
instructions:
     394:       f9400002        ldr     x2, [x0]	//x2 = cmd[0]
     398:       aa020062        orr     x2, x3, x2	//x3 = FIELD_PREP(CMDQ_0_OP, ent->opcode)

Maybe it's not worth changing, because I've only seen a reduction of a few
tenths of a nanosecond in execution time. But one thing is certain: it brings
only benefits, with no side effects.

> 
> Thanks,
> John
John Garry Dec. 8, 2021, 6:17 p.m. UTC | #3
>> Did you notice any performance change with this change?
> 
> Hi John:
>    Thanks for the tip. I wrote a test case today, and I found that the
> performance did not go up but down.

I very quickly tested on a DMA mapping benchmark very similar to the 
kernel DMA benchmark module - I got mixed results. For fewer CPUs (<8), 
a small improvement, like 0.7%. For more CPUs, a regression - that's 
surprising; I expected no change at all, since any improvement would 
get dwarfed by the slower unmap rates at higher CPU counts. I can check
this more tomorrow.

> It's so weird. So I decided not to
> change it, because it's also poorly readable. So I plan to make only
> the following modifications:
> @@ -237,7 +237,7 @@ static int queue_remove_raw(struct arm_smmu_queue *q, u64 *ent)
>   static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
>   {
>          memset(cmd, 0, 1 << CMDQ_ENT_SZ_SHIFT);
> -       cmd[0] |= FIELD_PREP(CMDQ_0_OP, ent->opcode);
> +       cmd[0] = FIELD_PREP(CMDQ_0_OP, ent->opcode);
> 
>          switch (ent->opcode) {
>          case CMDQ_OP_TLBI_EL2_ALL:
> 
> This prevents the compiler from generating the following two inefficient
> instructions:
>       394:       f9400002        ldr     x2, [x0]	//x2 = cmd[0]
>       398:       aa020062        orr     x2, x3, x2	//x3 = FIELD_PREP(CMDQ_0_OP, ent->opcode)
> 
> Maybe it's not worth changing because I've only seen a 0.x nanosecond reduction
> in performance. But one thing is, it only comes with benefits, no side effects.
> 

I just think that with the original code cmd[] is on the stack and 
cached, so if we have the write-back attribute (which I think we do) then 
there would not necessarily be a write to external memory per write to cmd[].

So, apart from this approach, I think that if we can just reduce the 
instructions through other efficiencies in the function then that would 
be good.

Thanks,
John
Robin Murphy Dec. 8, 2021, 6:40 p.m. UTC | #4
On 2021-12-08 18:17, John Garry wrote:
>>> Did you notice any performance change with this change?
>>
>> Hi John:
>>    Thanks for the tip. I wrote a test case today, and I found that the
>> performance did not go up but down.
> 
> I very quickly tested on a DMA mapping benchmark very similar to the 
> kernel DMA benchmark module - I got mixed results. For fewer CPUs (<8), 
> a small improvement, like 0.7%. For more CPUs, a dis-improvement - 
> that's surprising, I did expect just no change as any improvement would 
> get dwarfed from the slower unmap rates for more CPUs. I can check this
> more tomorrow.
> 
>> It's so weird. So I decided not to
>> change it, because it's also poorly readable. So I plan to make only
>> the following modifications:
>> @@ -237,7 +237,7 @@ static int queue_remove_raw(struct arm_smmu_queue 
>> *q, u64 *ent)
>>   static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct 
>> arm_smmu_cmdq_ent *ent)
>>   {
>>          memset(cmd, 0, 1 << CMDQ_ENT_SZ_SHIFT);
>> -       cmd[0] |= FIELD_PREP(CMDQ_0_OP, ent->opcode);
>> +       cmd[0] = FIELD_PREP(CMDQ_0_OP, ent->opcode);
>>
>>          switch (ent->opcode) {
>>          case CMDQ_OP_TLBI_EL2_ALL:
>>
>> This prevents the compiler from generating the following two inefficient
>> instructions:
>>       394:       f9400002        ldr     x2, [x0]    //x2 = cmd[0]
>>       398:       aa020062        orr     x2, x3, x2    //x3 = 
>> FIELD_PREP(CMDQ_0_OP, ent->opcode)
>>
>> Maybe it's not worth changing because I've only seen a 0.x nanosecond 
>> reduction
>> in performance. But one thing is, it only comes with benefits, no side 
>> effects.
>>
> 
> I just think that with the original code that cmd[] is on the stack and 
> cached, so if we have write-back attribute (which I think we do) then 
> there would not necessarily a write to external memory per write to cmd[].
> 
> So, apart from this approach, I think that if we can just reduce the 
> instructions through other efficiencies in the function then that would 
> be good.

Not sure if it's still true, but FWIW last time the best result actually 
came from doing the ridiculously counter-intuitive:

https://lore.kernel.org/linux-iommu/141de3c3278e280712d16d9ac9ab305c3b80a810.1534344167.git.robin.murphy@arm.com/

Robin.
diff mbox series

Patch

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index f5848b351b19359..e55dfc14cac6005 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -234,10 +234,12 @@  static int queue_remove_raw(struct arm_smmu_queue *q, u64 *ent)
 }
 
 /* High-level queue accessors */
-static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
+static int arm_smmu_cmdq_build_cmd(u64 *out_cmd, struct arm_smmu_cmdq_ent *ent)
 {
-	memset(cmd, 0, 1 << CMDQ_ENT_SZ_SHIFT);
-	cmd[0] |= FIELD_PREP(CMDQ_0_OP, ent->opcode);
+	int i;
+	u64 cmd[CMDQ_ENT_DWORDS] = {0};
+
+	cmd[0] = FIELD_PREP(CMDQ_0_OP, ent->opcode);
 
 	switch (ent->opcode) {
 	case CMDQ_OP_TLBI_EL2_ALL:
@@ -332,6 +334,9 @@  static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
 		return -ENOENT;
 	}
 
+	for (i = 0; i < CMDQ_ENT_DWORDS; i++)
+		out_cmd[i] = cmd[i];
+
 	return 0;
 }