
[0/4] iommu/arm-smmu-v3: Improve cmdq lock efficiency

Message ID 1592846920-45338-1-git-send-email-john.garry@huawei.com (mailing list archive)
Series iommu/arm-smmu-v3: Improve cmdq lock efficiency

Message

John Garry June 22, 2020, 5:28 p.m. UTC
As mentioned in [0], the CPU may consume many cycles processing
arm_smmu_cmdq_issue_cmdlist(). One issue we find is the cmpxchg() loop to
get space on the queue takes approx 25% of the cycles for this function.

This series removes that cmpxchg().

For my NVMe test with 3x NVMe SSDs, I'm getting a ~24% throughput
increase:
Before: 1310 IOPs
After: 1630 IOPs

I also have a test harness to check the rate of DMA map+unmaps we can
achieve:

CPU count	32	64	128
Before:		63187	19418	10169
After:		93287	44789	15862

(unit is map+unmaps per CPU per second)

[0] https://lore.kernel.org/linux-iommu/B926444035E5E2439431908E3842AFD24B86DB@DGGEMI525-MBS.china.huawei.com/T/#ma02e301c38c3e94b7725e685757c27e39c7cbde3

John Garry (4):
  iommu/arm-smmu-v3: Fix trivial typo
  iommu/arm-smmu-v3: Calculate bits for prod and owner
  iommu/arm-smmu-v3: Always issue a CMD_SYNC per batch
  iommu/arm-smmu-v3: Remove cmpxchg() in arm_smmu_cmdq_issue_cmdlist()

 drivers/iommu/arm-smmu-v3.c | 233 +++++++++++++++++++++++-------------
 1 file changed, 151 insertions(+), 82 deletions(-)
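
For reference, the allocation loop being referred to (visible in the unchanged
hunk context of the diffs quoted below) is roughly as follows; every CPU
issuing commands retries a 64-bit cmpxchg on the combined prod/cons word until
it wins:

	/* 1. Allocate some space in the queue */
	local_irq_save(flags);
	llq.val = READ_ONCE(cmdq->q.llq.val);
	do {
		u64 old;

		while (!queue_has_space(&llq, n + sync)) {
			local_irq_restore(flags);
			if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq))
				dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
			local_irq_save(flags);
		}

		head.cons = llq.cons;
		head.prod = queue_inc_prod_n(&llq, n + sync) |
					     CMDQ_PROD_OWNED_FLAG;

		old = cmpxchg_relaxed(&cmdq->q.llq.val, llq.val, head.val);
		if (old == llq.val)
			break;

		/* lost the race: retry with the value another CPU installed */
		llq.val = old;
	} while (1);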

Comments

John Garry July 8, 2020, 1 p.m. UTC | #1
On 22/06/2020 18:28, John Garry wrote:

Hi, can you guys let me know if this is on the radar at all?

I have been talking about this performance issue since January, and haven't
really had any feedback.

thanks

> As mentioned in [0], the CPU may consume many cycles processing
> arm_smmu_cmdq_issue_cmdlist(). One issue we find is the cmpxchg() loop to
> get space on the queue takes approx 25% of the cycles for this function.
> 
> This series removes that cmpxchg().
> 
> For my NVMe test with 3x NVMe SSDs, I'm getting a ~24% throughput
> increase:
> Before: 1310 IOPs
> After: 1630 IOPs
> 
> I also have a test harness to check the rate of DMA map+unmaps we can
> achieve:
> 
> CPU count	32	64	128
> Before:		63187	19418	10169
> After:		93287	44789	15862
> 
> (unit is map+unmaps per CPU per second)
> 
> [0] https://lore.kernel.org/linux-iommu/B926444035E5E2439431908E3842AFD24B86DB@DGGEMI525-MBS.china.huawei.com/T/#ma02e301c38c3e94b7725e685757c27e39c7cbde3
> 
> John Garry (4):
>    iommu/arm-smmu-v3: Fix trivial typo
>    iommu/arm-smmu-v3: Calculate bits for prod and owner
>    iommu/arm-smmu-v3: Always issue a CMD_SYNC per batch
>    iommu/arm-smmu-v3: Remove cmpxchg() in arm_smmu_cmdq_issue_cmdlist()
> 
>   drivers/iommu/arm-smmu-v3.c | 233 +++++++++++++++++++++++-------------
>   1 file changed, 151 insertions(+), 82 deletions(-)
>
Will Deacon July 16, 2020, 10:19 a.m. UTC | #2
On Tue, Jun 23, 2020 at 01:28:36AM +0800, John Garry wrote:
> As mentioned in [0], the CPU may consume many cycles processing
> arm_smmu_cmdq_issue_cmdlist(). One issue we find is the cmpxchg() loop to
> get space on the queue takes approx 25% of the cycles for this function.
> 
> This series removes that cmpxchg().

How about something much simpler like the diff below?

Will

--->8

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index f578677a5c41..705d99ec8219 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -560,6 +560,7 @@ struct arm_smmu_cmdq {
 	atomic_long_t			*valid_map;
 	atomic_t			owner_prod;
 	atomic_t			lock;
+	spinlock_t			slock;
 };
 
 struct arm_smmu_cmdq_batch {
@@ -1378,7 +1379,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
 	u64 cmd_sync[CMDQ_ENT_DWORDS];
 	u32 prod;
 	unsigned long flags;
-	bool owner;
+	bool owner, cas_failed = false;
 	struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
 	struct arm_smmu_ll_queue llq = {
 		.max_n_shift = cmdq->q.llq.max_n_shift,
@@ -1387,27 +1388,35 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
 
 	/* 1. Allocate some space in the queue */
 	local_irq_save(flags);
-	llq.val = READ_ONCE(cmdq->q.llq.val);
 	do {
 		u64 old;
+		llq.val = READ_ONCE(cmdq->q.llq.val);
 
-		while (!queue_has_space(&llq, n + sync)) {
+		if (queue_has_space(&llq, n + sync))
+			goto try_cas;
+
+		if (cas_failed)
+			spin_unlock(&cmdq->slock);
+
+		do {
 			local_irq_restore(flags);
 			if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq))
 				dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
 			local_irq_save(flags);
-		}
+		} while (!queue_has_space(&llq, n + sync));
 
+try_cas:
 		head.cons = llq.cons;
 		head.prod = queue_inc_prod_n(&llq, n + sync) |
 					     CMDQ_PROD_OWNED_FLAG;
 
 		old = cmpxchg_relaxed(&cmdq->q.llq.val, llq.val, head.val);
-		if (old == llq.val)
-			break;
+		cas_failed = old != llq.val;
+
+		if (cas_failed)
+			spin_lock(&cmdq->slock);
+	} while (cas_failed);
 
-		llq.val = old;
-	} while (1);
 	owner = !(llq.prod & CMDQ_PROD_OWNED_FLAG);
 	head.prod &= ~CMDQ_PROD_OWNED_FLAG;
 	llq.prod &= ~CMDQ_PROD_OWNED_FLAG;
@@ -3192,6 +3201,7 @@ static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu)
 
 	atomic_set(&cmdq->owner_prod, 0);
 	atomic_set(&cmdq->lock, 0);
+	spin_lock_init(&cmdq->slock);
 
 	bitmap = (atomic_long_t *)bitmap_zalloc(nents, GFP_KERNEL);
 	if (!bitmap) {
Will Deacon July 16, 2020, 10:22 a.m. UTC | #3
On Thu, Jul 16, 2020 at 11:19:41AM +0100, Will Deacon wrote:
> On Tue, Jun 23, 2020 at 01:28:36AM +0800, John Garry wrote:
> > As mentioned in [0], the CPU may consume many cycles processing
> > arm_smmu_cmdq_issue_cmdlist(). One issue we find is the cmpxchg() loop to
> > get space on the queue takes approx 25% of the cycles for this function.
> > 
> > This series removes that cmpxchg().
> 
> How about something much simpler like the diff below?

Ah, scratch that, I don't drop the lock if we fail the cas with it held.
Let me hack it some more (I have no hardware so I can only build-test this).

Will
Will Deacon July 16, 2020, 10:28 a.m. UTC | #4
On Thu, Jul 16, 2020 at 11:22:33AM +0100, Will Deacon wrote:
> On Thu, Jul 16, 2020 at 11:19:41AM +0100, Will Deacon wrote:
> > On Tue, Jun 23, 2020 at 01:28:36AM +0800, John Garry wrote:
> > > As mentioned in [0], the CPU may consume many cycles processing
> > > arm_smmu_cmdq_issue_cmdlist(). One issue we find is the cmpxchg() loop to
> > > get space on the queue takes approx 25% of the cycles for this function.
> > > 
> > > This series removes that cmpxchg().
> > 
> > How about something much simpler like the diff below?
> 
> Ah, scratch that, I don't drop the lock if we fail the cas with it held.
> Let me hack it some more (I have no hardware so I can only build-test this).

Right, second attempt...

Will

--->8

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index f578677a5c41..e6bcddd6ef69 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -560,6 +560,7 @@ struct arm_smmu_cmdq {
 	atomic_long_t			*valid_map;
 	atomic_t			owner_prod;
 	atomic_t			lock;
+	spinlock_t			slock;
 };
 
 struct arm_smmu_cmdq_batch {
@@ -1378,7 +1379,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
 	u64 cmd_sync[CMDQ_ENT_DWORDS];
 	u32 prod;
 	unsigned long flags;
-	bool owner;
+	bool owner, locked = false;
 	struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
 	struct arm_smmu_ll_queue llq = {
 		.max_n_shift = cmdq->q.llq.max_n_shift,
@@ -1387,27 +1388,38 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
 
 	/* 1. Allocate some space in the queue */
 	local_irq_save(flags);
-	llq.val = READ_ONCE(cmdq->q.llq.val);
 	do {
 		u64 old;
+		llq.val = READ_ONCE(cmdq->q.llq.val);
 
-		while (!queue_has_space(&llq, n + sync)) {
+		if (queue_has_space(&llq, n + sync))
+			goto try_cas;
+
+		if (locked)
+			spin_unlock(&cmdq->slock);
+
+		do {
 			local_irq_restore(flags);
 			if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq))
 				dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
 			local_irq_save(flags);
-		}
+		} while (!queue_has_space(&llq, n + sync));
 
+try_cas:
 		head.cons = llq.cons;
 		head.prod = queue_inc_prod_n(&llq, n + sync) |
 					     CMDQ_PROD_OWNED_FLAG;
 
 		old = cmpxchg_relaxed(&cmdq->q.llq.val, llq.val, head.val);
-		if (old == llq.val)
+		if (old != llq.val)
 			break;
 
-		llq.val = old;
+		if (!locked) {
+			spin_lock(&cmdq->slock);
+			locked = true;
+		}
 	} while (1);
+
 	owner = !(llq.prod & CMDQ_PROD_OWNED_FLAG);
 	head.prod &= ~CMDQ_PROD_OWNED_FLAG;
 	llq.prod &= ~CMDQ_PROD_OWNED_FLAG;
@@ -3192,6 +3204,7 @@ static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu)
 
 	atomic_set(&cmdq->owner_prod, 0);
 	atomic_set(&cmdq->lock, 0);
+	spin_lock_init(&cmdq->slock);
 
 	bitmap = (atomic_long_t *)bitmap_zalloc(nents, GFP_KERNEL);
 	if (!bitmap) {
John Garry July 16, 2020, 10:56 a.m. UTC | #5
On 16/07/2020 11:28, Will Deacon wrote:
> On Thu, Jul 16, 2020 at 11:22:33AM +0100, Will Deacon wrote:
>> On Thu, Jul 16, 2020 at 11:19:41AM +0100, Will Deacon wrote:
>>> On Tue, Jun 23, 2020 at 01:28:36AM +0800, John Garry wrote:
>>>> As mentioned in [0], the CPU may consume many cycles processing
>>>> arm_smmu_cmdq_issue_cmdlist(). One issue we find is the cmpxchg() loop to
>>>> get space on the queue takes approx 25% of the cycles for this function.
>>>>
>>>> This series removes that cmpxchg().
>>>
>>> How about something much simpler like the diff below?
>>
>> Ah, scratch that, I don't drop the lock if we fail the cas with it held.
>> Let me hack it some more (I have no hardware so I can only build-test this).
> 
> Right, second attempt...

I can try it, but if performance is not as good, then please check mine
further (patch 4/4 specifically) - performance is really good, IMHO.

Thanks,

> 
> Will
> 
> --->8
> 
> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
> index f578677a5c41..e6bcddd6ef69 100644
> --- a/drivers/iommu/arm-smmu-v3.c
> +++ b/drivers/iommu/arm-smmu-v3.c
> @@ -560,6 +560,7 @@ struct arm_smmu_cmdq {
>   	atomic_long_t			*valid_map;
>   	atomic_t			owner_prod;
>   	atomic_t			lock;
> +	spinlock_t			slock;
>   };
>   
>   struct arm_smmu_cmdq_batch {
> @@ -1378,7 +1379,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
>   	u64 cmd_sync[CMDQ_ENT_DWORDS];
>   	u32 prod;
>   	unsigned long flags;
> -	bool owner;
> +	bool owner, locked = false;
>   	struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
>   	struct arm_smmu_ll_queue llq = {
>   		.max_n_shift = cmdq->q.llq.max_n_shift,
> @@ -1387,27 +1388,38 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
>   
>   	/* 1. Allocate some space in the queue */
>   	local_irq_save(flags);
> -	llq.val = READ_ONCE(cmdq->q.llq.val);
>   	do {
>   		u64 old;
> +		llq.val = READ_ONCE(cmdq->q.llq.val);
>   
> -		while (!queue_has_space(&llq, n + sync)) {
> +		if (queue_has_space(&llq, n + sync))
> +			goto try_cas;
> +
> +		if (locked)
> +			spin_unlock(&cmdq->slock);
> +
> +		do {
>   			local_irq_restore(flags);
>   			if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq))
>   				dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
>   			local_irq_save(flags);
> -		}
> +		} while (!queue_has_space(&llq, n + sync));
>   
> +try_cas:
>   		head.cons = llq.cons;
>   		head.prod = queue_inc_prod_n(&llq, n + sync) |
>   					     CMDQ_PROD_OWNED_FLAG;
>   
>   		old = cmpxchg_relaxed(&cmdq->q.llq.val, llq.val, head.val);
> -		if (old == llq.val)
> +		if (old != llq.val)
>   			break;
>   
> -		llq.val = old;
> +		if (!locked) {
> +			spin_lock(&cmdq->slock);
> +			locked = true;
> +		}
>   	} while (1);
> +
>   	owner = !(llq.prod & CMDQ_PROD_OWNED_FLAG);
>   	head.prod &= ~CMDQ_PROD_OWNED_FLAG;
>   	llq.prod &= ~CMDQ_PROD_OWNED_FLAG;
> @@ -3192,6 +3204,7 @@ static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu)
>   
>   	atomic_set(&cmdq->owner_prod, 0);
>   	atomic_set(&cmdq->lock, 0);
> +	spin_lock_init(&cmdq->slock);
>   
>   	bitmap = (atomic_long_t *)bitmap_zalloc(nents, GFP_KERNEL);
>   	if (!bitmap) {
> .
>
Robin Murphy July 16, 2020, 11:22 a.m. UTC | #6
On 2020-07-16 11:56, John Garry wrote:
> On 16/07/2020 11:28, Will Deacon wrote:
>> On Thu, Jul 16, 2020 at 11:22:33AM +0100, Will Deacon wrote:
>>> On Thu, Jul 16, 2020 at 11:19:41AM +0100, Will Deacon wrote:
>>>> On Tue, Jun 23, 2020 at 01:28:36AM +0800, John Garry wrote:
>>>>> As mentioned in [0], the CPU may consume many cycles processing
>>>>> arm_smmu_cmdq_issue_cmdlist(). One issue we find is the cmpxchg() 
>>>>> loop to
>>>>> get space on the queue takes approx 25% of the cycles for this 
>>>>> function.
>>>>>
>>>>> This series removes that cmpxchg().
>>>>
>>>> How about something much simpler like the diff below?
>>>
>>> Ah, scratch that, I don't drop the lock if we fail the cas with it held.
>>> Let me hack it some more (I have no hardware so I can only build-test 
>>> this).
>>
>> Right, second attempt...
> 
> I can try it, but if performance is not as good, then please check mine
> further (patch 4/4 specifically) - performance is really good, IMHO.

Perhaps a silly question (I'm too engrossed in PMU world ATM to get 
properly back up to speed on this), but couldn't this be done without 
cmpxchg anyway? Instinctively it feels like instead of maintaining a 
literal software copy of the prod value, we could resolve the "claim my 
slot in the queue" part with atomic_fetch_add on a free-running 32-bit 
"pseudo-prod" index, then whoever updates the hardware deals with the 
truncation and wrap bit to convert it to an actual register value.

Robin.

> 
> Thanks,
> 
>>
>> Will
>>
>> --->8
>>
>> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
>> index f578677a5c41..e6bcddd6ef69 100644
>> --- a/drivers/iommu/arm-smmu-v3.c
>> +++ b/drivers/iommu/arm-smmu-v3.c
>> @@ -560,6 +560,7 @@ struct arm_smmu_cmdq {
>>       atomic_long_t            *valid_map;
>>       atomic_t            owner_prod;
>>       atomic_t            lock;
>> +    spinlock_t            slock;
>>   };
>>   struct arm_smmu_cmdq_batch {
>> @@ -1378,7 +1379,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct 
>> arm_smmu_device *smmu,
>>       u64 cmd_sync[CMDQ_ENT_DWORDS];
>>       u32 prod;
>>       unsigned long flags;
>> -    bool owner;
>> +    bool owner, locked = false;
>>       struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
>>       struct arm_smmu_ll_queue llq = {
>>           .max_n_shift = cmdq->q.llq.max_n_shift,
>> @@ -1387,27 +1388,38 @@ static int arm_smmu_cmdq_issue_cmdlist(struct 
>> arm_smmu_device *smmu,
>>       /* 1. Allocate some space in the queue */
>>       local_irq_save(flags);
>> -    llq.val = READ_ONCE(cmdq->q.llq.val);
>>       do {
>>           u64 old;
>> +        llq.val = READ_ONCE(cmdq->q.llq.val);
>> -        while (!queue_has_space(&llq, n + sync)) {
>> +        if (queue_has_space(&llq, n + sync))
>> +            goto try_cas;
>> +
>> +        if (locked)
>> +            spin_unlock(&cmdq->slock);
>> +
>> +        do {
>>               local_irq_restore(flags);
>>               if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq))
>>                   dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
>>               local_irq_save(flags);
>> -        }
>> +        } while (!queue_has_space(&llq, n + sync));
>> +try_cas:
>>           head.cons = llq.cons;
>>           head.prod = queue_inc_prod_n(&llq, n + sync) |
>>                            CMDQ_PROD_OWNED_FLAG;
>>           old = cmpxchg_relaxed(&cmdq->q.llq.val, llq.val, head.val);
>> -        if (old == llq.val)
>> +        if (old != llq.val)
>>               break;
>> -        llq.val = old;
>> +        if (!locked) {
>> +            spin_lock(&cmdq->slock);
>> +            locked = true;
>> +        }
>>       } while (1);
>> +
>>       owner = !(llq.prod & CMDQ_PROD_OWNED_FLAG);
>>       head.prod &= ~CMDQ_PROD_OWNED_FLAG;
>>       llq.prod &= ~CMDQ_PROD_OWNED_FLAG;
>> @@ -3192,6 +3204,7 @@ static int arm_smmu_cmdq_init(struct 
>> arm_smmu_device *smmu)
>>       atomic_set(&cmdq->owner_prod, 0);
>>       atomic_set(&cmdq->lock, 0);
>> +    spin_lock_init(&cmdq->slock);
>>       bitmap = (atomic_long_t *)bitmap_zalloc(nents, GFP_KERNEL);
>>       if (!bitmap) {
>> .
>>
>
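A minimal sketch of the free-running "pseudo-prod" index Robin describes,
just to make the idea concrete (illustrative names, not code from this
thread; it deliberately leaves aside the space-checking problem raised
further down):

#include <linux/atomic.h>
#include <linux/bits.h>

/* Free-running claim counter; illustrative name, never wrapped explicitly. */
static atomic_t pseudo_prod = ATOMIC_INIT(0);

/*
 * Claim @n slots: the caller owns [claimed, claimed + n). Truncation to the
 * queue size and recovery of the wrap flag (the low max_n_shift bits are the
 * index, the next bit up is the wrap bit, as in the driver's Q_IDX/Q_WRP
 * macros) only happen when converting to a register value.
 */
static u32 claim_prod(int n, u32 max_n_shift)
{
	u32 claimed = atomic_fetch_add(n, &pseudo_prod);

	return claimed & GENMASK(max_n_shift, 0);
}

As Will points out below, the hard part this leaves out is proving that the
claimed slots are actually free.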
John Garry July 16, 2020, 11:30 a.m. UTC | #7
On 16/07/2020 12:22, Robin Murphy wrote:
> On 2020-07-16 11:56, John Garry wrote:
>> On 16/07/2020 11:28, Will Deacon wrote:
>>> On Thu, Jul 16, 2020 at 11:22:33AM +0100, Will Deacon wrote:
>>>> On Thu, Jul 16, 2020 at 11:19:41AM +0100, Will Deacon wrote:
>>>>> On Tue, Jun 23, 2020 at 01:28:36AM +0800, John Garry wrote:
>>>>>> As mentioned in [0], the CPU may consume many cycles processing
>>>>>> arm_smmu_cmdq_issue_cmdlist(). One issue we find is the cmpxchg()
>>>>>> loop to
>>>>>> get space on the queue takes approx 25% of the cycles for this
>>>>>> function.
>>>>>>
>>>>>> This series removes that cmpxchg().
>>>>>
>>>>> How about something much simpler like the diff below?
>>>>
>>>> Ah, scratch that, I don't drop the lock if we fail the cas with it held.
>>>> Let me hack it some more (I have no hardware so I can only build-test
>>>> this).
>>>
>>> Right, second attempt...
>>
>> I can try it, but if performance is not as good, then please check mine
>> further (patch 4/4 specifically) - performance is really good, IMHO.
> 
> Perhaps a silly question (I'm too engrossed in PMU world ATM to get
> properly back up to speed on this), but couldn't this be done without
> cmpxchg anyway? Instinctively it feels like instead of maintaining a
> literal software copy of the prod value, we could resolve the "claim my
> slot in the queue" part with atomic_fetch_add on a free-running 32-bit
> "pseudo-prod" index, then whoever updates the hardware deals with the
> truncation and wrap bit to convert it to an actual register value.
> 

That's what mine does. But I also need to take care of cmdq locking and 
how we unconditionally provide space.

Cheers,
John
Will Deacon July 16, 2020, 11:32 a.m. UTC | #8
On Thu, Jul 16, 2020 at 12:22:05PM +0100, Robin Murphy wrote:
> On 2020-07-16 11:56, John Garry wrote:
> > On 16/07/2020 11:28, Will Deacon wrote:
> > > On Thu, Jul 16, 2020 at 11:22:33AM +0100, Will Deacon wrote:
> > > > On Thu, Jul 16, 2020 at 11:19:41AM +0100, Will Deacon wrote:
> > > > > On Tue, Jun 23, 2020 at 01:28:36AM +0800, John Garry wrote:
> > > > > > As mentioned in [0], the CPU may consume many cycles processing
> > > > > > arm_smmu_cmdq_issue_cmdlist(). One issue we find is the
> > > > > > cmpxchg() loop to
> > > > > > get space on the queue takes approx 25% of the cycles
> > > > > > for this function.
> > > > > > 
> > > > > > This series removes that cmpxchg().
> > > > > 
> > > > > How about something much simpler like the diff below?
> > > >
> > > > Ah, scratch that, I don't drop the lock if we fail the cas with it held.
> > > > Let me hack it some more (I have no hardware so I can only
> > > > build-test this).
> > > 
> > > Right, second attempt...
> > 
> > I can try it, but if performance is not as good, then please check mine
> > further (patch 4/4 specifically) - performance is really good, IMHO.
> 
> Perhaps a silly question (I'm too engrossed in PMU world ATM to get properly
> back up to speed on this), but couldn't this be done without cmpxchg anyway?
> Instinctively it feels like instead of maintaining a literal software copy
> of the prod value, we could resolve the "claim my slot in the queue" part
> with atomic_fetch_add on a free-running 32-bit "pseudo-prod" index, then
> whoever updates the hardware deals with the truncation and wrap bit to
> convert it to an actual register value.

Maybe, but it's easier said than done. The hard part is figuring out that
you have space, and there's no way I'm touching that logic without a way to
test this.

I'm also not keen to complicate the code because of systems that don't scale
well with contended CAS [1]. If you put down loads of cores, you need an
interconnect/coherence protocol that can handle it.

Will

[1] https://lore.kernel.org/lkml/20190607072652.GA5522@hc/T/#m0d00f107c29223045933292a8b5b90d2ca9b7e5c
John Garry July 16, 2020, 1:31 p.m. UTC | #9
On 16/07/2020 11:28, Will Deacon wrote:
> On Thu, Jul 16, 2020 at 11:22:33AM +0100, Will Deacon wrote:
>> On Thu, Jul 16, 2020 at 11:19:41AM +0100, Will Deacon wrote:
>>> On Tue, Jun 23, 2020 at 01:28:36AM +0800, John Garry wrote:
>>>> As mentioned in [0], the CPU may consume many cycles processing
>>>> arm_smmu_cmdq_issue_cmdlist(). One issue we find is the cmpxchg() loop to
>>>> get space on the queue takes approx 25% of the cycles for this function.
>>>>
>>>> This series removes that cmpxchg().
>>>
>>> How about something much simpler like the diff below?
>>
>> Ah, scratch that, I don't drop the lock if we fail the cas with it held.
>> Let me hack it some more (I have no hardware so I can only build-test this).
> 
> Right, second attempt...
> 
> Will

Unfortunately that hangs my machine during boot:

[10.902893] 00:01: ttyS0 at MMIO 0x3f00003f8 (irq = 6, base_baud = 115200) is a 16550A
[10.912048] SuperH (H)SCI(F) driver initialized
[10.916811] msm_serial: driver initialized
[10.921371] arm-smmu-v3 arm-smmu-v3.0.auto: option mask 0x0
[10.926946] arm-smmu-v3 arm-smmu-v3.0.auto: ias 48-bit, oas 48-bit (features 0x00000fef)
[10.935374] arm-smmu-v3 arm-smmu-v3.0.auto: allocated 65536 entries for cmdq
[10.942522] arm-smmu-v3 arm-smmu-v3.0.auto: allocated 32768 entries for evtq


> 
> --->8
> 
> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
> index f578677a5c41..e6bcddd6ef69 100644
> --- a/drivers/iommu/arm-smmu-v3.c
> +++ b/drivers/iommu/arm-smmu-v3.c
> @@ -560,6 +560,7 @@ struct arm_smmu_cmdq {
>   	atomic_long_t			*valid_map;
>   	atomic_t			owner_prod;
>   	atomic_t			lock;
> +	spinlock_t			slock;
>   };
>   
>   struct arm_smmu_cmdq_batch {
> @@ -1378,7 +1379,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
>   	u64 cmd_sync[CMDQ_ENT_DWORDS];
>   	u32 prod;
>   	unsigned long flags;
> -	bool owner;
> +	bool owner, locked = false;
>   	struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
>   	struct arm_smmu_ll_queue llq = {
>   		.max_n_shift = cmdq->q.llq.max_n_shift,
> @@ -1387,27 +1388,38 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
>   
>   	/* 1. Allocate some space in the queue */
>   	local_irq_save(flags);
> -	llq.val = READ_ONCE(cmdq->q.llq.val);
>   	do {
>   		u64 old;
> +		llq.val = READ_ONCE(cmdq->q.llq.val);
>   
> -		while (!queue_has_space(&llq, n + sync)) {
> +		if (queue_has_space(&llq, n + sync))
> +			goto try_cas;
> +
> +		if (locked)
> +			spin_unlock(&cmdq->slock);
> +
> +		do {
>   			local_irq_restore(flags);
>   			if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq))
>   				dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
>   			local_irq_save(flags);
> -		}
> +		} while (!queue_has_space(&llq, n + sync));
>   
> +try_cas:
>   		head.cons = llq.cons;
>   		head.prod = queue_inc_prod_n(&llq, n + sync) |
>   					     CMDQ_PROD_OWNED_FLAG;
>   
>   		old = cmpxchg_relaxed(&cmdq->q.llq.val, llq.val, head.val);
> -		if (old == llq.val)
> +		if (old != llq.val)

Not sure why you changed this. And if I change it back, it seems that we
could drop out of the loop with cmdq->slock held, so we would need to drop
the lock also (a sketch of that follows after this message).

I tried that and it stops my machine hanging. Let me know if that was the
intention, so I can test.

Thanks,
John

>   			break;
>   
> -		llq.val = old;
> +		if (!locked) {
> +			spin_lock(&cmdq->slock);
> +			locked = true;
> +		}
>   	} while (1);
> +
>   	owner = !(llq.prod & CMDQ_PROD_OWNED_FLAG);
>   	head.prod &= ~CMDQ_PROD_OWNED_FLAG;
>   	llq.prod &= ~CMDQ_PROD_OWNED_FLAG;
> @@ -3192,6 +3204,7 @@ static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu)
>   
>   	atomic_set(&cmdq->owner_prod, 0);
>   	atomic_set(&cmdq->lock, 0);
> +	spin_lock_init(&cmdq->slock);
>   
>   	bitmap = (atomic_long_t *)bitmap_zalloc(nents, GFP_KERNEL);
>   	if (!bitmap) {
> .
>
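Concretely, the adjustment John describes above (restoring the break on a
successful CAS and releasing slock once out of the loop) would be something
like the following, untested:

		old = cmpxchg_relaxed(&cmdq->q.llq.val, llq.val, head.val);
		if (old == llq.val)	/* CAS won: we own our slots */
			break;

		if (!locked) {
			spin_lock(&cmdq->slock);
			locked = true;
		}
	} while (1);

	if (locked)	/* don't carry slock out of the allocation loop */
		spin_unlock(&cmdq->slock);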
John Garry July 16, 2020, 4:50 p.m. UTC | #10
>>
>> Perhaps a silly question (I'm too engrossed in PMU world ATM to get properly
>> back up to speed on this), but couldn't this be done without cmpxchg anyway?
>> Instinctively it feels like instead of maintaining a literal software copy
>> of the prod value, we could resolve the "claim my slot in the queue" part
>> with atomic_fetch_add on a free-running 32-bit "pseudo-prod" index, then
>> whoever updates the hardware deals with the truncation and wrap bit to
>> convert it to an actual register value.
> 
> Maybe, but it's easier said than done. The hard part is figuring how that
> you have space and there's no way I'm touching that logic without a way to
> test this.
> 
> I'm also not keen to complicate the code because of systems that don't scale
> well with contended CAS [1]. If you put down loads of cores, you need an
> interconnect/coherence protocol that can handle it.

JFYI, I added some debug to the driver to get the cmpxchg() attempt rate
while running a test harness (below):

cpus	rate
2	1.1
4	1.1
8	1.3
16	3.6
32	8.1
64	12.6
96	14.7

Ideal rate is 1. So it's not crazy high for many CPUs, but still 
drifting away from 1.

John
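
For what it's worth, such a ratio can be collected with something as simple
as the following (hypothetical counter names; not the actual debug change
used for the numbers above):

#include <linux/atomic.h>

/* Hypothetical debug counters, not the actual patch behind the table above */
static atomic64_t cmdq_cmdlist_calls = ATOMIC64_INIT(0);	/* ++ once per arm_smmu_cmdq_issue_cmdlist() */
static atomic64_t cmdq_cas_attempts = ATOMIC64_INIT(0);		/* ++ before every cmpxchg_relaxed() attempt */

/* rate = cmdq_cas_attempts / cmdq_cmdlist_calls; 1 means every caller won its CAS first time */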

> 
> Will
> 
> [1] https://lore.kernel.org/lkml/20190607072652.GA5522@hc/T/#m0d00f107c29223045933292a8b5b90d2ca9b7e5c
> .
> 

//copied from Barry, thanks :)

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/semaphore.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/dma-mapping.h>
#include <linux/jiffies.h>
#include <linux/err.h>
#include <linux/smp.h>

static int ways = 64;
module_param(ways, int, S_IRUGO);

static int seconds = 20;
module_param(seconds, int, S_IRUGO);

int mappings[NR_CPUS];
struct semaphore sem[NR_CPUS];

#define COMPLETIONS_SIZE 50

static noinline dma_addr_t test_mapsingle(struct device *dev, void *buf, int size)
{
	dma_addr_t dma_addr = dma_map_single(dev, buf, size, DMA_TO_DEVICE);

	return dma_addr;
}

static noinline void test_unmapsingle(struct device *dev, void *buf, int size, dma_addr_t dma_addr)
{
	dma_unmap_single(dev, dma_addr, size, DMA_TO_DEVICE);
}

static noinline void test_memcpy(void *out, void *in, int size)
{
	memcpy(out, in, size);
}

//just a hack to get a device handle behind an SMMU
extern struct device *hisi_dev;

static int testthread(void *data)
{
	unsigned long stop = jiffies + seconds * HZ;
	struct device *dev = hisi_dev;
	char *inputs[COMPLETIONS_SIZE] = { NULL };
	char *outputs[COMPLETIONS_SIZE] = { NULL };
	dma_addr_t dma_addr[COMPLETIONS_SIZE];
	int i, ret = 0, cpu = smp_processor_id();
	struct semaphore *sem = data;

	for (i = 0; i < COMPLETIONS_SIZE; i++) {
		inputs[i] = kzalloc(4096, GFP_KERNEL);
		outputs[i] = kzalloc(4096, GFP_KERNEL);
		if (!inputs[i] || !outputs[i]) {
			ret = -ENOMEM;
			goto free;
		}
	}

	/* Map, touch and unmap a batch of buffers until the time is up */
	while (time_before(jiffies, stop)) {
		for (i = 0; i < COMPLETIONS_SIZE; i++) {
			dma_addr[i] = test_mapsingle(dev, inputs[i], 4096);
			test_memcpy(outputs[i], inputs[i], 4096);
		}
		for (i = 0; i < COMPLETIONS_SIZE; i++)
			test_unmapsingle(dev, inputs[i], 4096, dma_addr[i]);

		mappings[cpu] += COMPLETIONS_SIZE;
	}

free:
	for (i = 0; i < COMPLETIONS_SIZE; i++) {
		kfree(outputs[i]);
		kfree(inputs[i]);
	}

	/* Always wake the waiter in smmu_test_core(), even on error */
	up(sem);

	return ret;
}

void smmu_test_core(int cpus)
{
	struct task_struct *tsk;
	int i;
	int total_mappings = 0;

	ways = cpus;

	for (i = 0; i < ways; i++) {
		mappings[i] = 0;
		tsk = kthread_create_on_cpu(testthread, &sem[i], i, "map_test");
		if (IS_ERR(tsk)) {
			printk(KERN_ERR "create test thread failed\n");
			up(&sem[i]);	/* don't block on this CPU below */
			continue;
		}
		wake_up_process(tsk);
	}

	for (i = 0; i < ways; i++) {
		down(&sem[i]);
		total_mappings += mappings[i];
	}

	printk(KERN_ERR "finished total_mappings=%d (per way=%d) (rate=%d per second per cpu) ways=%d\n",
	       total_mappings, total_mappings / ways,
	       total_mappings / (seconds * ways), ways);
}
EXPORT_SYMBOL(smmu_test_core);

static int __init test_init(void)
{
	int i;

	for (i = 0; i < NR_CPUS; i++)
		sema_init(&sem[i], 0);

	return 0;
}

static void __exit test_exit(void)
{
}

module_init(test_init);
module_exit(test_exit);
MODULE_LICENSE("GPL");
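
Since the harness only exports smmu_test_core(), something else has to drive
it; a hypothetical caller sweeping the CPU counts from the table above might
look like this (not part of the posted code):

#include <linux/kernel.h>

extern void smmu_test_core(int cpus);

static void smmu_test_sweep(void)
{
	static const int counts[] = { 2, 4, 8, 16, 32, 64, 96 };
	int i;

	for (i = 0; i < ARRAY_SIZE(counts); i++)
		smmu_test_core(counts[i]);	/* one "finished ..." line per count */
}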