
[RFC,2/8] iommu/arm-smmu-v3: Add and use static helper function arm_smmu_cmdq_issue_cmd_with_sync()

Message ID 20210626110130.2416-3-thunder.leizhen@huawei.com (mailing list archive)
State New, archived
Series iommu/arm-smmu-v3: add support for ECMDQ register mode

Commit Message

Leizhen (ThunderTown) June 26, 2021, 11:01 a.m. UTC
The key to the performance optimization in commit 587e6c10a7ce
("iommu/arm-smmu-v3: Reduce contention during command-queue insertion") is
that it allows multiple cores to insert commands in parallel after only a
brief period of contention.

It follows that inserting as many commands as possible in a single call
reduces how often that contention occurs, and thereby improves overall
performance. At the very least, it reduces the number of calls to
arm_smmu_cmdq_issue_cmdlist().

Therefore, add arm_smmu_cmdq_issue_cmd_with_sync(), which inserts a command
together with its CMD_SYNC in a single call.

Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 33 +++++++++++++--------
 1 file changed, 21 insertions(+), 12 deletions(-)
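
For context before the review comments: the hunks at the bottom of this page
collapse the driver's previous two-call pattern (issue the command, then issue
a separate CMD_SYNC) into one call that submits both in a single
arm_smmu_cmdq_issue_cmdlist() invocation. A minimal before/after sketch of a
call site, based on those hunks:

	/* Before: two separate command-queue submissions per invalidation */
	arm_smmu_cmdq_issue_cmd(smmu, &cmd);
	arm_smmu_cmdq_issue_sync(smmu);

	/* After: one submission carrying the command and its CMD_SYNC */
	arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);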

Comments

Will Deacon Aug. 10, 2021, 6:24 p.m. UTC | #1
On Sat, Jun 26, 2021 at 07:01:24PM +0800, Zhen Lei wrote:
> The obvious key to the performance optimization of commit 587e6c10a7ce
> ("iommu/arm-smmu-v3: Reduce contention during command-queue insertion") is
> to allow multiple cores to insert commands in parallel after a brief mutex
> contention.
> 
> Obviously, inserting as many commands at a time as possible can reduce the
> number of times the mutex contention participates, thereby improving the
> overall performance. At least it reduces the number of calls to function
> arm_smmu_cmdq_issue_cmdlist().
> 
> Therefore, function arm_smmu_cmdq_issue_cmd_with_sync() is added to insert
> the 'cmd+sync' commands at a time.
> 
> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
> ---
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 33 +++++++++++++--------
>  1 file changed, 21 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> index 2433d3c29b49ff2..a5361153ca1d6a4 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> @@ -858,11 +858,25 @@ static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
>  	return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, false);
>  }
>  
> -static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
> +static int __maybe_unused arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
>  {
>  	return arm_smmu_cmdq_issue_cmdlist(smmu, NULL, 0, true);
>  }
>  
> +static int arm_smmu_cmdq_issue_cmd_with_sync(struct arm_smmu_device *smmu,
> +					     struct arm_smmu_cmdq_ent *ent)
> +{
> +	u64 cmd[CMDQ_ENT_DWORDS];
> +
> +	if (arm_smmu_cmdq_build_cmd(cmd, ent)) {
> +		dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
> +			 ent->opcode);
> +		return -EINVAL;
> +	}
> +
> +	return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, true);
> +}

This function is almost identical to arm_smmu_cmdq_issue_cmd(). How about
moving the guts out into a helper:

	static int __arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
					     struct arm_smmu_cmdq_ent *ent,
					     bool sync);

and then having arm_smmu_cmdq_issue_cmd_with_sync() and
arm_smmu_cmdq_issue_cmd() wrap that?

Will
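
One way the suggested refactoring could look, assembled from the prototype
above and the body of arm_smmu_cmdq_issue_cmd_with_sync() in this patch (a
sketch under those assumptions, not necessarily the code that ends up being
applied):

	static int __arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
					     struct arm_smmu_cmdq_ent *ent,
					     bool sync)
	{
		u64 cmd[CMDQ_ENT_DWORDS];

		/* Build the command once; bail out on an unknown opcode */
		if (arm_smmu_cmdq_build_cmd(cmd, ent)) {
			dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
				 ent->opcode);
			return -EINVAL;
		}

		/* The only difference between the wrappers is the sync flag */
		return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, sync);
	}

	static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
					   struct arm_smmu_cmdq_ent *ent)
	{
		return __arm_smmu_cmdq_issue_cmd(smmu, ent, false);
	}

	static int arm_smmu_cmdq_issue_cmd_with_sync(struct arm_smmu_device *smmu,
						     struct arm_smmu_cmdq_ent *ent)
	{
		return __arm_smmu_cmdq_issue_cmd(smmu, ent, true);
	}

This keeps the command building and the unknown-opcode warning in one place,
so both wrappers reduce to a single line each.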
Leizhen (ThunderTown) Aug. 11, 2021, 2:16 a.m. UTC | #2
On 2021/8/11 2:24, Will Deacon wrote:
> On Sat, Jun 26, 2021 at 07:01:24PM +0800, Zhen Lei wrote:
>> The obvious key to the performance optimization of commit 587e6c10a7ce
>> ("iommu/arm-smmu-v3: Reduce contention during command-queue insertion") is
>> to allow multiple cores to insert commands in parallel after a brief mutex
>> contention.
>>
>> Obviously, inserting as many commands at a time as possible can reduce the
>> number of times the mutex contention participates, thereby improving the
>> overall performance. At least it reduces the number of calls to function
>> arm_smmu_cmdq_issue_cmdlist().
>>
>> Therefore, function arm_smmu_cmdq_issue_cmd_with_sync() is added to insert
>> the 'cmd+sync' commands at a time.
>>
>> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
>> ---
>>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 33 +++++++++++++--------
>>  1 file changed, 21 insertions(+), 12 deletions(-)
>>
>> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>> index 2433d3c29b49ff2..a5361153ca1d6a4 100644
>> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>> @@ -858,11 +858,25 @@ static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
>>  	return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, false);
>>  }
>>  
>> -static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
>> +static int __maybe_unused arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
>>  {
>>  	return arm_smmu_cmdq_issue_cmdlist(smmu, NULL, 0, true);
>>  }
>>  
>> +static int arm_smmu_cmdq_issue_cmd_with_sync(struct arm_smmu_device *smmu,
>> +					     struct arm_smmu_cmdq_ent *ent)
>> +{
>> +	u64 cmd[CMDQ_ENT_DWORDS];
>> +
>> +	if (arm_smmu_cmdq_build_cmd(cmd, ent)) {
>> +		dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
>> +			 ent->opcode);
>> +		return -EINVAL;
>> +	}
>> +
>> +	return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, true);
>> +}
> 
> This function is almost identical to arm_smmu_cmdq_issue_cmd(). How about
> moving the guts out into a helper:
> 
> 	static int __arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
> 					     struct arm_smmu_cmdq_ent *ent,
> 					     bool sync);
> 
> and then having arm_smmu_cmdq_issue_cmd_with_sync() and
> arm_smmu_cmdq_issue_cmd() wrap that?

OK, I will do it.

How about removing arm_smmu_cmdq_issue_sync()? It's unused now.

> 
> Will
> .
>
Will Deacon Aug. 11, 2021, 10:09 a.m. UTC | #3
On Wed, Aug 11, 2021 at 10:16:39AM +0800, Leizhen (ThunderTown) wrote:
> 
> 
> On 2021/8/11 2:24, Will Deacon wrote:
> > On Sat, Jun 26, 2021 at 07:01:24PM +0800, Zhen Lei wrote:
> >> The obvious key to the performance optimization of commit 587e6c10a7ce
> >> ("iommu/arm-smmu-v3: Reduce contention during command-queue insertion") is
> >> to allow multiple cores to insert commands in parallel after a brief mutex
> >> contention.
> >>
> >> Obviously, inserting as many commands at a time as possible can reduce the
> >> number of times the mutex contention participates, thereby improving the
> >> overall performance. At least it reduces the number of calls to function
> >> arm_smmu_cmdq_issue_cmdlist().
> >>
> >> Therefore, function arm_smmu_cmdq_issue_cmd_with_sync() is added to insert
> >> the 'cmd+sync' commands at a time.
> >>
> >> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
> >> ---
> >>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 33 +++++++++++++--------
> >>  1 file changed, 21 insertions(+), 12 deletions(-)
> >>
> >> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> >> index 2433d3c29b49ff2..a5361153ca1d6a4 100644
> >> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> >> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> >> @@ -858,11 +858,25 @@ static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
> >>  	return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, false);
> >>  }
> >>  
> >> -static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
> >> +static int __maybe_unused arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
> >>  {
> >>  	return arm_smmu_cmdq_issue_cmdlist(smmu, NULL, 0, true);
> >>  }
> >>  
> >> +static int arm_smmu_cmdq_issue_cmd_with_sync(struct arm_smmu_device *smmu,
> >> +					     struct arm_smmu_cmdq_ent *ent)
> >> +{
> >> +	u64 cmd[CMDQ_ENT_DWORDS];
> >> +
> >> +	if (arm_smmu_cmdq_build_cmd(cmd, ent)) {
> >> +		dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
> >> +			 ent->opcode);
> >> +		return -EINVAL;
> >> +	}
> >> +
> >> +	return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, true);
> >> +}
> > 
> > This function is almost identical to arm_smmu_cmdq_issue_cmd(). How about
> > moving the guts out into a helper:
> > 
> > 	static int __arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
> > 					     struct arm_smmu_cmdq_ent *ent,
> > 					     bool sync);
> > 
> > and then having arm_smmu_cmdq_issue_cmd_with_sync() and
> > arm_smmu_cmdq_issue_cmd() wrap that?
> 
> OK, I will do it.
> 
> How about remove arm_smmu_cmdq_issue_sync()? It's unused now.

Sure.

Will
John Garry Aug. 11, 2021, 10:31 a.m. UTC | #4
>>>> Obviously, inserting as many commands at a time as possible can reduce the
>>>> number of times the mutex contention participates, thereby improving the
>>>> overall performance. At least it reduces the number of calls to function
>>>> arm_smmu_cmdq_issue_cmdlist().
>>>>
>>>> Therefore, function arm_smmu_cmdq_issue_cmd_with_sync() is added to insert
>>>> the 'cmd+sync' commands at a time.
>>>>
>>>> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
>>>> ---
>>>>   drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 33 +++++++++++++--------
>>>>   1 file changed, 21 insertions(+), 12 deletions(-)
>>>>
>>>> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>>>> index 2433d3c29b49ff2..a5361153ca1d6a4 100644
>>>> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>>>> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>>>> @@ -858,11 +858,25 @@ static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
>>>>   	return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, false);
>>>>   }
>>>>   
>>>> -static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
>>>> +static int __maybe_unused arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
>>>>   {
>>>>   	return arm_smmu_cmdq_issue_cmdlist(smmu, NULL, 0, true);
>>>>   }
>>>>   
>>>> +static int arm_smmu_cmdq_issue_cmd_with_sync(struct arm_smmu_device *smmu,
>>>> +					     struct arm_smmu_cmdq_ent *ent)
>>>> +{
>>>> +	u64 cmd[CMDQ_ENT_DWORDS];
>>>> +
>>>> +	if (arm_smmu_cmdq_build_cmd(cmd, ent)) {
>>>> +		dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
>>>> +			 ent->opcode);
>>>> +		return -EINVAL;

Are any of the errors returned from the "issue command" functions 
actually consumed? I couldn't see it in the mainline code from a brief browse.

>>>> +	}
>>>> +
>>>> +	return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, true);

Thanks,
John
Will Deacon Aug. 11, 2021, 10:33 a.m. UTC | #5
On Wed, Aug 11, 2021 at 11:31:08AM +0100, John Garry wrote:
> > > > > Obviously, inserting as many commands at a time as possible can reduce the
> > > > > number of times the mutex contention participates, thereby improving the
> > > > > overall performance. At least it reduces the number of calls to function
> > > > > arm_smmu_cmdq_issue_cmdlist().
> > > > > 
> > > > > Therefore, function arm_smmu_cmdq_issue_cmd_with_sync() is added to insert
> > > > > the 'cmd+sync' commands at a time.
> > > > > 
> > > > > Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
> > > > > ---
> > > > >   drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 33 +++++++++++++--------
> > > > >   1 file changed, 21 insertions(+), 12 deletions(-)
> > > > > 
> > > > > diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> > > > > index 2433d3c29b49ff2..a5361153ca1d6a4 100644
> > > > > --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> > > > > +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> > > > > @@ -858,11 +858,25 @@ static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
> > > > >   	return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, false);
> > > > >   }
> > > > > -static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
> > > > > +static int __maybe_unused arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
> > > > >   {
> > > > >   	return arm_smmu_cmdq_issue_cmdlist(smmu, NULL, 0, true);
> > > > >   }
> > > > > +static int arm_smmu_cmdq_issue_cmd_with_sync(struct arm_smmu_device *smmu,
> > > > > +					     struct arm_smmu_cmdq_ent *ent)
> > > > > +{
> > > > > +	u64 cmd[CMDQ_ENT_DWORDS];
> > > > > +
> > > > > +	if (arm_smmu_cmdq_build_cmd(cmd, ent)) {
> > > > > +		dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
> > > > > +			 ent->opcode);
> > > > > +		return -EINVAL;
> 
> Are any of the errors returned from the "issue command" functions actually
> consumed? I couldn't see it on mainline code from a brief browse.

I don't think so. Can we actually propagate them?

Will
John Garry Aug. 11, 2021, 11:15 a.m. UTC | #6
>>>>>>
>>>>>> Therefore, function arm_smmu_cmdq_issue_cmd_with_sync() is added to insert
>>>>>> the 'cmd+sync' commands at a time.
>>>>>>
>>>>>> Signed-off-by: Zhen Lei<thunder.leizhen@huawei.com>
>>>>>> ---
>>>>>>    drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 33 +++++++++++++--------
>>>>>>    1 file changed, 21 insertions(+), 12 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>>>>>> index 2433d3c29b49ff2..a5361153ca1d6a4 100644
>>>>>> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>>>>>> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>>>>>> @@ -858,11 +858,25 @@ static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
>>>>>>    	return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, false);
>>>>>>    }
>>>>>> -static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
>>>>>> +static int __maybe_unused arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
>>>>>>    {
>>>>>>    	return arm_smmu_cmdq_issue_cmdlist(smmu, NULL, 0, true);
>>>>>>    }
>>>>>> +static int arm_smmu_cmdq_issue_cmd_with_sync(struct arm_smmu_device *smmu,
>>>>>> +					     struct arm_smmu_cmdq_ent *ent)
>>>>>> +{
>>>>>> +	u64 cmd[CMDQ_ENT_DWORDS];
>>>>>> +
>>>>>> +	if (arm_smmu_cmdq_build_cmd(cmd, ent)) {
>>>>>> +		dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
>>>>>> +			 ent->opcode);
>>>>>> +		return -EINVAL;
>> Are any of the errors returned from the "issue command" functions actually
>> consumed? I couldn't see it on mainline code from a brief browse.
> I don't think so.

I don't think so either.

> Can we actually propagate them?

There do appear to be some places; here's one I found:

arm_smmu_page_response() -> arm_smmu_cmdq_issue_cmd(), and 
arm_smmu_page_response() is assigned to arm_smmu_ops.page_response, which 
returns an int.

Thanks,
John
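
To make the propagation question concrete, a call site like the one above
could in principle return the helper's error instead of discarding it. The
function name and shape below are hypothetical, for illustration only:

	/*
	 * Hypothetical sketch: surface an error from the command-queue
	 * helpers to a caller that itself returns int (e.g. a
	 * .page_response implementation), rather than ignoring it.
	 */
	static int example_issue_resume(struct arm_smmu_master *master,
					struct arm_smmu_cmdq_ent *cmd)
	{
		int ret;

		ret = arm_smmu_cmdq_issue_cmd(master->smmu, cmd);
		if (ret)
			return ret;	/* e.g. -EINVAL for an unknown opcode */

		return 0;
	}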

Patch

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 2433d3c29b49ff2..a5361153ca1d6a4 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -858,11 +858,25 @@  static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
 	return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, false);
 }
 
-static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
+static int __maybe_unused arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
 {
 	return arm_smmu_cmdq_issue_cmdlist(smmu, NULL, 0, true);
 }
 
+static int arm_smmu_cmdq_issue_cmd_with_sync(struct arm_smmu_device *smmu,
+					     struct arm_smmu_cmdq_ent *ent)
+{
+	u64 cmd[CMDQ_ENT_DWORDS];
+
+	if (arm_smmu_cmdq_build_cmd(cmd, ent)) {
+		dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
+			 ent->opcode);
+		return -EINVAL;
+	}
+
+	return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, true);
+}
+
 static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
 				    struct arm_smmu_cmdq_batch *cmds,
 				    struct arm_smmu_cmdq_ent *cmd)
@@ -928,8 +942,7 @@  void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid)
 		.tlbi.asid = asid,
 	};
 
-	arm_smmu_cmdq_issue_cmd(smmu, &cmd);
-	arm_smmu_cmdq_issue_sync(smmu);
+	arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
 }
 
 static void arm_smmu_sync_cd(struct arm_smmu_domain *smmu_domain,
@@ -1210,8 +1223,7 @@  static void arm_smmu_sync_ste_for_sid(struct arm_smmu_device *smmu, u32 sid)
 		},
 	};
 
-	arm_smmu_cmdq_issue_cmd(smmu, &cmd);
-	arm_smmu_cmdq_issue_sync(smmu);
+	arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
 }
 
 static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
@@ -1823,8 +1835,7 @@  static void arm_smmu_tlb_inv_context(void *cookie)
 	} else {
 		cmd.opcode	= CMDQ_OP_TLBI_S12_VMALL;
 		cmd.tlbi.vmid	= smmu_domain->s2_cfg.vmid;
-		arm_smmu_cmdq_issue_cmd(smmu, &cmd);
-		arm_smmu_cmdq_issue_sync(smmu);
+		arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
 	}
 	arm_smmu_atc_inv_domain(smmu_domain, 0, 0, 0);
 }
@@ -3338,18 +3349,16 @@  static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass)
 
 	/* Invalidate any cached configuration */
 	cmd.opcode = CMDQ_OP_CFGI_ALL;
-	arm_smmu_cmdq_issue_cmd(smmu, &cmd);
-	arm_smmu_cmdq_issue_sync(smmu);
+	arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
 
 	/* Invalidate any stale TLB entries */
 	if (smmu->features & ARM_SMMU_FEAT_HYP) {
 		cmd.opcode = CMDQ_OP_TLBI_EL2_ALL;
-		arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+		arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
 	}
 
 	cmd.opcode = CMDQ_OP_TLBI_NSNH_ALL;
-	arm_smmu_cmdq_issue_cmd(smmu, &cmd);
-	arm_smmu_cmdq_issue_sync(smmu);
+	arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
 
 	/* Event queue */
 	writeq_relaxed(smmu->evtq.q.q_base, smmu->base + ARM_SMMU_EVTQ_BASE);