Message ID | 1592846920-45338-1-git-send-email-john.garry@huawei.com (mailing list archive)
---|---
Series | iommu/arm-smmu-v3: Improve cmdq lock efficiency
On 22/06/2020 18:28, John Garry wrote:

Hi,

Can you guys let me know if this is on the radar at all? I have been
talking about this performance issue since Jan, and not getting anything
really.

thanks

> As mentioned in [0], the CPU may consume many cycles processing
> arm_smmu_cmdq_issue_cmdlist(). One issue we find is the cmpxchg() loop to
> get space on the queue takes approx 25% of the cycles for this function.
>
> This series removes that cmpxchg().
>
> For my NVMe test with 3x NVMe SSDs, I'm getting a ~24% throughput
> increase:
> Before: 1310 IOPs
> After: 1630 IOPs
>
> I also have a test harness to check the rate of DMA map+unmaps we can
> achieve:
>
> CPU count	32	64	128
> Before:	63187	19418	10169
> After:		93287	44789	15862
>
> (unit is map+unmaps per CPU per second)
>
> [0] https://lore.kernel.org/linux-iommu/B926444035E5E2439431908E3842AFD24B86DB@DGGEMI525-MBS.china.huawei.com/T/#ma02e301c38c3e94b7725e685757c27e39c7cbde3
>
> John Garry (4):
>   iommu/arm-smmu-v3: Fix trivial typo
>   iommu/arm-smmu-v3: Calculate bits for prod and owner
>   iommu/arm-smmu-v3: Always issue a CMD_SYNC per batch
>   iommu/arm-smmu-v3: Remove cmpxchg() in arm_smmu_cmdq_issue_cmdlist()
>
>  drivers/iommu/arm-smmu-v3.c | 233 +++++++++++++++++++++++-------------
>  1 file changed, 151 insertions(+), 82 deletions(-)
On Tue, Jun 23, 2020 at 01:28:36AM +0800, John Garry wrote:
> As mentioned in [0], the CPU may consume many cycles processing
> arm_smmu_cmdq_issue_cmdlist(). One issue we find is the cmpxchg() loop to
> get space on the queue takes approx 25% of the cycles for this function.
>
> This series removes that cmpxchg().

How about something much simpler like the diff below?

Will

--->8

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index f578677a5c41..705d99ec8219 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -560,6 +560,7 @@ struct arm_smmu_cmdq {
 	atomic_long_t		*valid_map;
 	atomic_t		owner_prod;
 	atomic_t		lock;
+	spinlock_t		slock;
 };
 
 struct arm_smmu_cmdq_batch {
@@ -1378,7 +1379,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
 	u64 cmd_sync[CMDQ_ENT_DWORDS];
 	u32 prod;
 	unsigned long flags;
-	bool owner;
+	bool owner, cas_failed = false;
 	struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
 	struct arm_smmu_ll_queue llq = {
 		.max_n_shift = cmdq->q.llq.max_n_shift,
@@ -1387,27 +1388,35 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
 
 	/* 1. Allocate some space in the queue */
 	local_irq_save(flags);
-	llq.val = READ_ONCE(cmdq->q.llq.val);
 	do {
 		u64 old;
+		llq.val = READ_ONCE(cmdq->q.llq.val);
 
-		while (!queue_has_space(&llq, n + sync)) {
+		if (queue_has_space(&llq, n + sync))
+			goto try_cas;
+
+		if (cas_failed)
+			spin_unlock(&cmdq->slock);
+
+		do {
 			local_irq_restore(flags);
 			if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq))
 				dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
 			local_irq_save(flags);
-		}
+		} while (!queue_has_space(&llq, n + sync));
 
+try_cas:
 		head.cons = llq.cons;
 		head.prod = queue_inc_prod_n(&llq, n + sync) |
					     CMDQ_PROD_OWNED_FLAG;
 
 		old = cmpxchg_relaxed(&cmdq->q.llq.val, llq.val, head.val);
-		if (old == llq.val)
-			break;
+		cas_failed = old != llq.val;
+
+		if (cas_failed)
+			spin_lock(&cmdq->slock);
+	} while (cas_failed);
 
-		llq.val = old;
-	} while (1);
 	owner = !(llq.prod & CMDQ_PROD_OWNED_FLAG);
 	head.prod &= ~CMDQ_PROD_OWNED_FLAG;
 	llq.prod &= ~CMDQ_PROD_OWNED_FLAG;
@@ -3192,6 +3201,7 @@ static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu)
 
 	atomic_set(&cmdq->owner_prod, 0);
 	atomic_set(&cmdq->lock, 0);
+	spin_lock_init(&cmdq->slock);
 
 	bitmap = (atomic_long_t *)bitmap_zalloc(nents, GFP_KERNEL);
 	if (!bitmap) {
On Thu, Jul 16, 2020 at 11:19:41AM +0100, Will Deacon wrote:
> On Tue, Jun 23, 2020 at 01:28:36AM +0800, John Garry wrote:
> > As mentioned in [0], the CPU may consume many cycles processing
> > arm_smmu_cmdq_issue_cmdlist(). One issue we find is the cmpxchg() loop to
> > get space on the queue takes approx 25% of the cycles for this function.
> >
> > This series removes that cmpxchg().
>
> How about something much simpler like the diff below?

Ah, scratch that, I don't drop the lock if we fail the cas with it held.
Let me hack it some more (I have no hardware so I can only build-test this).

Will
On Thu, Jul 16, 2020 at 11:22:33AM +0100, Will Deacon wrote:
> On Thu, Jul 16, 2020 at 11:19:41AM +0100, Will Deacon wrote:
> > On Tue, Jun 23, 2020 at 01:28:36AM +0800, John Garry wrote:
> > > As mentioned in [0], the CPU may consume many cycles processing
> > > arm_smmu_cmdq_issue_cmdlist(). One issue we find is the cmpxchg() loop to
> > > get space on the queue takes approx 25% of the cycles for this function.
> > >
> > > This series removes that cmpxchg().
> >
> > How about something much simpler like the diff below?
>
> Ah, scratch that, I don't drop the lock if we fail the cas with it held.
> Let me hack it some more (I have no hardware so I can only build-test this).

Right, second attempt...

Will

--->8

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index f578677a5c41..e6bcddd6ef69 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -560,6 +560,7 @@ struct arm_smmu_cmdq {
 	atomic_long_t		*valid_map;
 	atomic_t		owner_prod;
 	atomic_t		lock;
+	spinlock_t		slock;
 };
 
 struct arm_smmu_cmdq_batch {
@@ -1378,7 +1379,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
 	u64 cmd_sync[CMDQ_ENT_DWORDS];
 	u32 prod;
 	unsigned long flags;
-	bool owner;
+	bool owner, locked = false;
 	struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
 	struct arm_smmu_ll_queue llq = {
 		.max_n_shift = cmdq->q.llq.max_n_shift,
@@ -1387,27 +1388,38 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
 
 	/* 1. Allocate some space in the queue */
 	local_irq_save(flags);
-	llq.val = READ_ONCE(cmdq->q.llq.val);
 	do {
 		u64 old;
+		llq.val = READ_ONCE(cmdq->q.llq.val);
 
-		while (!queue_has_space(&llq, n + sync)) {
+		if (queue_has_space(&llq, n + sync))
+			goto try_cas;
+
+		if (locked)
+			spin_unlock(&cmdq->slock);
+
+		do {
 			local_irq_restore(flags);
 			if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq))
 				dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
 			local_irq_save(flags);
-		}
+		} while (!queue_has_space(&llq, n + sync));
 
+try_cas:
 		head.cons = llq.cons;
 		head.prod = queue_inc_prod_n(&llq, n + sync) |
					     CMDQ_PROD_OWNED_FLAG;
 
 		old = cmpxchg_relaxed(&cmdq->q.llq.val, llq.val, head.val);
-		if (old == llq.val)
+		if (old != llq.val)
 			break;
 
-		llq.val = old;
+		if (!locked) {
+			spin_lock(&cmdq->slock);
+			locked = true;
+		}
 	} while (1);
+
 	owner = !(llq.prod & CMDQ_PROD_OWNED_FLAG);
 	head.prod &= ~CMDQ_PROD_OWNED_FLAG;
 	llq.prod &= ~CMDQ_PROD_OWNED_FLAG;
@@ -3192,6 +3204,7 @@ static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu)
 
 	atomic_set(&cmdq->owner_prod, 0);
 	atomic_set(&cmdq->lock, 0);
+	spin_lock_init(&cmdq->slock);
 
 	bitmap = (atomic_long_t *)bitmap_zalloc(nents, GFP_KERNEL);
 	if (!bitmap) {
On 16/07/2020 11:28, Will Deacon wrote:
> On Thu, Jul 16, 2020 at 11:22:33AM +0100, Will Deacon wrote:
>> On Thu, Jul 16, 2020 at 11:19:41AM +0100, Will Deacon wrote:
>>> On Tue, Jun 23, 2020 at 01:28:36AM +0800, John Garry wrote:
>>>> As mentioned in [0], the CPU may consume many cycles processing
>>>> arm_smmu_cmdq_issue_cmdlist(). One issue we find is the cmpxchg() loop to
>>>> get space on the queue takes approx 25% of the cycles for this function.
>>>>
>>>> This series removes that cmpxchg().
>>>
>>> How about something much simpler like the diff below?
>>
>> Ah, scratch that, I don't drop the lock if we fail the cas with it held.
>> Let me hack it some more (I have no hardware so I can only build-test this).
>
> Right, second attempt...

I can try it, but if performance is not as good, then please check mine
further (patch 4/4 specifically) - performance is really good, IMHO.

Thanks,
On 2020-07-16 11:56, John Garry wrote:
> On 16/07/2020 11:28, Will Deacon wrote:
>> On Thu, Jul 16, 2020 at 11:22:33AM +0100, Will Deacon wrote:
>>> On Thu, Jul 16, 2020 at 11:19:41AM +0100, Will Deacon wrote:
>>>> On Tue, Jun 23, 2020 at 01:28:36AM +0800, John Garry wrote:
>>>>> As mentioned in [0], the CPU may consume many cycles processing
>>>>> arm_smmu_cmdq_issue_cmdlist(). One issue we find is the cmpxchg() loop to
>>>>> get space on the queue takes approx 25% of the cycles for this function.
>>>>>
>>>>> This series removes that cmpxchg().
>>>>
>>>> How about something much simpler like the diff below?
>>>
>>> Ah, scratch that, I don't drop the lock if we fail the cas with it held.
>>> Let me hack it some more (I have no hardware so I can only build-test this).
>>
>> Right, second attempt...
>
> I can try it, but if performance is not as good, then please check mine
> further (patch 4/4 specifically) - performance is really good, IMHO.

Perhaps a silly question (I'm too engrossed in PMU world ATM to get
properly back up to speed on this), but couldn't this be done without
cmpxchg anyway? Instinctively it feels like instead of maintaining a
literal software copy of the prod value, we could resolve the "claim my
slot in the queue" part with atomic_fetch_add on a free-running 32-bit
"pseudo-prod" index, then whoever updates the hardware deals with the
truncation and wrap bit to convert it to an actual register value.

Robin.
On 16/07/2020 12:22, Robin Murphy wrote:
> On 2020-07-16 11:56, John Garry wrote:
>> On 16/07/2020 11:28, Will Deacon wrote:
>>> On Thu, Jul 16, 2020 at 11:22:33AM +0100, Will Deacon wrote:
>>>> On Thu, Jul 16, 2020 at 11:19:41AM +0100, Will Deacon wrote:
>>>>> On Tue, Jun 23, 2020 at 01:28:36AM +0800, John Garry wrote:
>>>>>> As mentioned in [0], the CPU may consume many cycles processing
>>>>>> arm_smmu_cmdq_issue_cmdlist(). One issue we find is the cmpxchg() loop to
>>>>>> get space on the queue takes approx 25% of the cycles for this function.
>>>>>>
>>>>>> This series removes that cmpxchg().
>>>>>
>>>>> How about something much simpler like the diff below?
>>>>
>>>> Ah, scratch that, I don't drop the lock if we fail the cas with it held.
>>>> Let me hack it some more (I have no hardware so I can only build-test this).
>>>
>>> Right, second attempt...
>>
>> I can try it, but if performance is not as good, then please check mine
>> further (patch 4/4 specifically) - performance is really good, IMHO.
>
> Perhaps a silly question (I'm too engrossed in PMU world ATM to get
> properly back up to speed on this), but couldn't this be done without
> cmpxchg anyway? Instinctively it feels like instead of maintaining a
> literal software copy of the prod value, we could resolve the "claim my
> slot in the queue" part with atomic_fetch_add on a free-running 32-bit
> "pseudo-prod" index, then whoever updates the hardware deals with the
> truncation and wrap bit to convert it to an actual register value.

That's what mine does. But I also need to take care of cmdq locking and
how we unconditionally provide space.

Cheers,
John
On Thu, Jul 16, 2020 at 12:22:05PM +0100, Robin Murphy wrote:
> On 2020-07-16 11:56, John Garry wrote:
> > On 16/07/2020 11:28, Will Deacon wrote:
> > > On Thu, Jul 16, 2020 at 11:22:33AM +0100, Will Deacon wrote:
> > > > On Thu, Jul 16, 2020 at 11:19:41AM +0100, Will Deacon wrote:
> > > > > On Tue, Jun 23, 2020 at 01:28:36AM +0800, John Garry wrote:
> > > > > > As mentioned in [0], the CPU may consume many cycles processing
> > > > > > arm_smmu_cmdq_issue_cmdlist(). One issue we find is the
> > > > > > cmpxchg() loop to get space on the queue takes approx 25%
> > > > > > of the cycles for this function.
> > > > > >
> > > > > > This series removes that cmpxchg().
> > > > >
> > > > > How about something much simpler like the diff below?
> > > >
> > > > Ah, scratch that, I don't drop the lock if we fail the cas with it held.
> > > > Let me hack it some more (I have no hardware so I can only
> > > > build-test this).
> > >
> > > Right, second attempt...
> >
> > I can try it, but if performance is not as good, then please check mine
> > further (patch 4/4 specifically) - performance is really good, IMHO.
>
> Perhaps a silly question (I'm too engrossed in PMU world ATM to get properly
> back up to speed on this), but couldn't this be done without cmpxchg anyway?
> Instinctively it feels like instead of maintaining a literal software copy
> of the prod value, we could resolve the "claim my slot in the queue" part
> with atomic_fetch_add on a free-running 32-bit "pseudo-prod" index, then
> whoever updates the hardware deals with the truncation and wrap bit to
> convert it to an actual register value.

Maybe, but it's easier said than done. The hard part is figuring out that
you have space, and there's no way I'm touching that logic without a way to
test this.

I'm also not keen to complicate the code because of systems that don't scale
well with contended CAS [1]. If you put down loads of cores, you need an
interconnect/coherence protocol that can handle it.

Will

[1] https://lore.kernel.org/lkml/20190607072652.GA5522@hc/T/#m0d00f107c29223045933292a8b5b90d2ca9b7e5c
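For readers following along, here is a minimal user-space sketch of the "free-running pseudo-prod" idea Robin floats above, written with C11 atomics. Everything in it is invented for illustration (the names claim_slots and to_hw_prod, the fixed LLQ_SHIFT): it only shows the atomic_fetch_add claim plus the truncation to index bits and a wrap bit, and deliberately ignores the hard parts Will mentions, namely checking for space and publishing the value to the hardware.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for max_n_shift: the queue holds 1 << LLQ_SHIFT entries. */
#define LLQ_SHIFT	8

/* Free-running 32-bit index; it is never wrapped explicitly. */
static _Atomic uint32_t pseudo_prod;

/* Claim n consecutive slots; returns the first claimed pseudo-index. */
static uint32_t claim_slots(uint32_t n)
{
	return atomic_fetch_add_explicit(&pseudo_prod, n, memory_order_relaxed);
}

/* Truncate a pseudo-index to a prod-register-style value: wrap bit | index. */
static uint32_t to_hw_prod(uint32_t pseudo)
{
	uint32_t idx  = pseudo & ((1u << LLQ_SHIFT) - 1);
	uint32_t wrap = pseudo & (1u << LLQ_SHIFT);

	return wrap | idx;
}

int main(void)
{
	uint32_t first = claim_slots(4);	/* e.g. 3 commands + 1 CMD_SYNC */

	printf("claimed pseudo 0x%x -> hw prod 0x%x\n", first, to_hw_prod(first));
	return 0;
}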
On 16/07/2020 11:28, Will Deacon wrote:
> On Thu, Jul 16, 2020 at 11:22:33AM +0100, Will Deacon wrote:
>> On Thu, Jul 16, 2020 at 11:19:41AM +0100, Will Deacon wrote:
>>> On Tue, Jun 23, 2020 at 01:28:36AM +0800, John Garry wrote:
>>>> As mentioned in [0], the CPU may consume many cycles processing
>>>> arm_smmu_cmdq_issue_cmdlist(). One issue we find is the cmpxchg() loop to
>>>> get space on the queue takes approx 25% of the cycles for this function.
>>>>
>>>> This series removes that cmpxchg().
>>>
>>> How about something much simpler like the diff below?
>>
>> Ah, scratch that, I don't drop the lock if we fail the cas with it held.
>> Let me hack it some more (I have no hardware so I can only build-test this).
>
> Right, second attempt...
>
> Will

Unfortunately that hangs my machine during boot:

[10.902893] 00:01: ttyS0 at MMIO 0x3f00003f8 (irq = 6, base_baud = 115200) is a 16550A
[10.912048] SuperH (H)SCI(F) driver initialized
[10.916811] msm_serial: driver initialized
[10.921371] arm-smmu-v3 arm-smmu-v3.0.auto: option mask 0x0
[10.926946] arm-smmu-v3 arm-smmu-v3.0.auto: ias 48-bit, oas 48-bit (features 0x00000fef)
[10.935374] arm-smmu-v3 arm-smmu-v3.0.auto: allocated 65536 entries for cmdq
[10.942522] arm-smmu-v3 arm-smmu-v3.0.auto: allocated 32768 entries for evtq

>  		old = cmpxchg_relaxed(&cmdq->q.llq.val, llq.val, head.val);
> -		if (old == llq.val)
> +		if (old != llq.val)

Not sure why you changed this. And if I change it back, it seems that we
could drop out of the loop with cmdq->slock held, so need to drop the lock
also. I tried that and it stops my machine hanging. Let me know if that was
the intention, so I can test.

Thanks,
John
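Taking Will's second attempt together with the fix John describes above (break only when the cmpxchg() succeeds, and release slock after the loop if it was taken), the allocation loop appears to be aiming for the pattern sketched below. This is a hedged user-space rendering with C11 atomics and a pthread mutex standing in for cmdq->slock; the names are invented, and the queue-full polling and IRQ-flag handling are omitted.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t llq_val;		/* stands in for cmdq->q.llq.val */
static pthread_mutex_t slock = PTHREAD_MUTEX_INITIALIZER;

/* Claim 'n' entries; returns the value of llq_val we installed. */
static uint64_t allocate_space(uint64_t n)
{
	bool locked = false;
	uint64_t old, new;

	for (;;) {
		old = atomic_load_explicit(&llq_val, memory_order_relaxed);
		new = old + n;			/* stands in for queue_inc_prod_n() */

		/* Success: we own [old, new) in the queue. */
		if (atomic_compare_exchange_strong_explicit(&llq_val, &old, new,
							    memory_order_relaxed,
							    memory_order_relaxed))
			break;

		/* Lost the race: serialise further attempts behind the lock. */
		if (!locked) {
			pthread_mutex_lock(&slock);
			locked = true;
		}
	}

	/* The step John notes is missing: drop the lock on the way out. */
	if (locked)
		pthread_mutex_unlock(&slock);

	return new;
}

static void *worker(void *arg)
{
	(void)arg;
	for (int i = 0; i < 1000; i++)
		allocate_space(4);
	return NULL;
}

int main(void)
{
	pthread_t t[4];

	for (int i = 0; i < 4; i++)
		pthread_create(&t[i], NULL, worker, NULL);
	for (int i = 0; i < 4; i++)
		pthread_join(t[i], NULL);

	printf("final llq_val = %llu (expect %u)\n",
	       (unsigned long long)atomic_load(&llq_val), 4 * 1000 * 4);
	return 0;
}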
>> Perhaps a silly question (I'm too engrossed in PMU world ATM to get properly
>> back up to speed on this), but couldn't this be done without cmpxchg anyway?
>> Instinctively it feels like instead of maintaining a literal software copy
>> of the prod value, we could resolve the "claim my slot in the queue" part
>> with atomic_fetch_add on a free-running 32-bit "pseudo-prod" index, then
>> whoever updates the hardware deals with the truncation and wrap bit to
>> convert it to an actual register value.
>
> Maybe, but it's easier said than done. The hard part is figuring out that
> you have space, and there's no way I'm touching that logic without a way to
> test this.
>
> I'm also not keen to complicate the code because of systems that don't scale
> well with contended CAS [1]. If you put down loads of cores, you need an
> interconnect/coherence protocol that can handle it.

JFYI, I added some debug to the driver to get the cmpxchg() attempt rate
while running a test harness (below):

cpus	rate
2	1.1
4	1.1
8	1.3
16	3.6
32	8.1
64	12.6
96	14.7

Ideal rate is 1. So it's not crazy high for many CPUs, but still drifting
away from 1.

John

> Will
>
> [1] https://lore.kernel.org/lkml/20190607072652.GA5522@hc/T/#m0d00f107c29223045933292a8b5b90d2ca9b7e5c

//copied from Barry, thanks :)

#include <linux/device.h>
#include <linux/dma-mapping.h>
#include <linux/err.h>
#include <linux/jiffies.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/semaphore.h>
#include <linux/slab.h>
#include <linux/smp.h>

static int ways = 64;
module_param(ways, int, S_IRUGO);

static int seconds = 20;
module_param(seconds, int, S_IRUGO);

int mappings[NR_CPUS];
struct semaphore sem[NR_CPUS];

#define COMPLETIONS_SIZE 50

static noinline dma_addr_t test_mapsingle(struct device *dev, void *buf, int size)
{
	dma_addr_t dma_addr = dma_map_single(dev, buf, size, DMA_TO_DEVICE);

	return dma_addr;
}

static noinline void test_unmapsingle(struct device *dev, void *buf, int size,
				      dma_addr_t dma_addr)
{
	dma_unmap_single(dev, dma_addr, size, DMA_TO_DEVICE);
}

static noinline void test_memcpy(void *out, void *in, int size)
{
	memcpy(out, in, size);
}

//just a hack to get a dev handle behind an SMMU
extern struct device *hisi_dev;

static int testthread(void *data)
{
	unsigned long stop = jiffies + seconds * HZ;
	struct device *dev = hisi_dev;
	char *inputs[COMPLETIONS_SIZE];
	char *outputs[COMPLETIONS_SIZE];
	dma_addr_t dma_addr[COMPLETIONS_SIZE];
	int i, cpu = smp_processor_id();
	struct semaphore *sem = data;

	for (i = 0; i < COMPLETIONS_SIZE; i++) {
		inputs[i] = kzalloc(4096, GFP_KERNEL);
		if (!inputs[i])
			return -ENOMEM;
	}

	for (i = 0; i < COMPLETIONS_SIZE; i++) {
		outputs[i] = kzalloc(4096, GFP_KERNEL);
		if (!outputs[i])
			return -ENOMEM;
	}

	while (time_before(jiffies, stop)) {
		for (i = 0; i < COMPLETIONS_SIZE; i++) {
			dma_addr[i] = test_mapsingle(dev, inputs[i], 4096);
			test_memcpy(outputs[i], inputs[i], 4096);
		}
		for (i = 0; i < COMPLETIONS_SIZE; i++) {
			test_unmapsingle(dev, inputs[i], 4096, dma_addr[i]);
		}
		mappings[cpu] += COMPLETIONS_SIZE;
	}

	for (i = 0; i < COMPLETIONS_SIZE; i++) {
		kfree(outputs[i]);
		kfree(inputs[i]);
	}

	up(sem);

	return 0;
}

void smmu_test_core(int cpus)
{
	struct task_struct *tsk;
	int i;
	int total_mappings = 0;

	ways = cpus;

	for (i = 0; i < ways; i++) {
		mappings[i] = 0;
		tsk = kthread_create_on_cpu(testthread, &sem[i], i, "map_test");
		if (IS_ERR(tsk))
			printk(KERN_ERR "create test thread failed\n");
		wake_up_process(tsk);
	}

	for (i = 0; i < ways; i++) {
		down(&sem[i]);
		total_mappings += mappings[i];
	}

	printk(KERN_ERR "finished total_mappings=%d (per way=%d) (rate=%d per second per cpu) ways=%d\n",
	       total_mappings, total_mappings / ways,
	       total_mappings / (seconds * ways), ways);
}
EXPORT_SYMBOL(smmu_test_core);

static int __init test_init(void)
{
	int i;

	for (i = 0; i < NR_CPUS; i++)
		sema_init(&sem[i], 0);

	return 0;
}

static void __exit test_exit(void)
{
}

module_init(test_init);
module_exit(test_exit);
MODULE_LICENSE("GPL");
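Since the harness above only exports smmu_test_core() and relies on something else to call it, one hypothetical way to drive the CPU-count sweep from the table would be a second throwaway module along these lines; the module name is invented, and it assumes the harness module is already loaded with hisi_dev populated by the out-of-tree hack John mentions.

#include <linux/kernel.h>
#include <linux/module.h>

/* Provided by the test harness module above. */
extern void smmu_test_core(int cpus);

static int __init smmu_test_runner_init(void)
{
	static const int counts[] = { 2, 4, 8, 16, 32, 64, 96 };
	int i;

	/* Run one timed pass per CPU count used in the results table. */
	for (i = 0; i < ARRAY_SIZE(counts); i++)
		smmu_test_core(counts[i]);

	return 0;
}

static void __exit smmu_test_runner_exit(void)
{
}

module_init(smmu_test_runner_init);
module_exit(smmu_test_runner_exit);
MODULE_LICENSE("GPL");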