Message ID | 1567118827-26358-4-git-send-email-vdumpa@nvidia.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Nvidia Arm SMMUv2 Implementation | expand |
On Thu, Aug 29, 2019 at 03:47:03PM -0700, Krishna Reddy wrote: > tlb_sync hook allows nvidia smmu handle tlb sync > across multiple SMMUs as necessary. > > Signed-off-by: Krishna Reddy <vdumpa@nvidia.com> > --- > drivers/iommu/arm-smmu-nvidia.c | 32 ++++++++++++++++++++++++++++++++ > drivers/iommu/arm-smmu.c | 8 +++++--- > drivers/iommu/arm-smmu.h | 4 ++++ > 3 files changed, 41 insertions(+), 3 deletions(-) > > diff --git a/drivers/iommu/arm-smmu-nvidia.c b/drivers/iommu/arm-smmu-nvidia.c > index d93ceda..a429b2c 100644 > --- a/drivers/iommu/arm-smmu-nvidia.c > +++ b/drivers/iommu/arm-smmu-nvidia.c > @@ -56,11 +56,43 @@ static void nsmmu_write_reg64(struct arm_smmu_device *smmu, > writeq_relaxed(val, nsmmu_page(smmu, i, page) + offset); > } > > +static void nsmmu_tlb_sync_wait(struct arm_smmu_device *smmu, int page, > + int sync, int status, int inst) > +{ > + u32 reg; > + unsigned int spin_cnt, delay; > + > + for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) { > + for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) { > + reg = readl_relaxed( > + nsmmu_page(smmu, inst, page) + status); > + if (!(reg & sTLBGSTATUS_GSACTIVE)) > + return; > + cpu_relax(); > + } > + udelay(delay); > + } > + dev_err_ratelimited(smmu->dev, > + "TLB sync timed out -- SMMU may be deadlocked\n"); > +} > + > +static void nsmmu_tlb_sync(struct arm_smmu_device *smmu, int page, > + int sync, int status) > +{ > + int i; > + > + arm_smmu_writel(smmu, page, sync, 0); > + > + for (i = 0; i < to_nsmmu(smmu)->num_inst; i++) > + nsmmu_tlb_sync_wait(smmu, page, sync, status, i); > +} > + > static const struct arm_smmu_impl nsmmu_impl = { > .read_reg = nsmmu_read_reg, > .write_reg = nsmmu_write_reg, > .read_reg64 = nsmmu_read_reg64, > .write_reg64 = nsmmu_write_reg64, > + .tlb_sync = nsmmu_tlb_sync, > }; > > struct arm_smmu_device *nvidia_smmu_impl_init(struct arm_smmu_device *smmu) > diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c > index 46e1641..f5454e71 100644 > --- 
a/drivers/iommu/arm-smmu.c > +++ b/drivers/iommu/arm-smmu.c > @@ -52,9 +52,6 @@ > */ > #define QCOM_DUMMY_VAL -1 > > -#define TLB_LOOP_TIMEOUT 1000000 /* 1s! */ > -#define TLB_SPIN_COUNT 10 > - > #define MSI_IOVA_BASE 0x8000000 > #define MSI_IOVA_LENGTH 0x100000 > > @@ -244,6 +241,11 @@ static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu, int page, > unsigned int spin_cnt, delay; > u32 reg; > > + if (smmu->impl->tlb_sync) { > + smmu->impl->tlb_sync(smmu, page, sync, status); > + return; > + } > + Wouldn't it work if you replaced all calls of __arm_smmu_tlb_sync() by smmu->impl->tlb_sync() and assign __arm_smmu_tlb_sync() as default for devices that don't need to override it? That makes this patch slightly larger, but it saves us one level of indirection. > + > arm_smmu_writel(smmu, page, sync, QCOM_DUMMY_VAL); > for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) { > for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) { > diff --git a/drivers/iommu/arm-smmu.h b/drivers/iommu/arm-smmu.h > index 9645bf1..d3217f1 100644 > --- a/drivers/iommu/arm-smmu.h > +++ b/drivers/iommu/arm-smmu.h > @@ -207,6 +207,8 @@ enum arm_smmu_cbar_type { > /* Maximum number of context banks per SMMU */ > #define ARM_SMMU_MAX_CBS 128 > > +#define TLB_LOOP_TIMEOUT 1000000 /* 1s! */ > +#define TLB_SPIN_COUNT 10 > > /* Shared driver definitions */ > enum arm_smmu_arch_version { > @@ -336,6 +338,8 @@ struct arm_smmu_impl { > int (*cfg_probe)(struct arm_smmu_device *smmu); > int (*reset)(struct arm_smmu_device *smmu); > int (*init_context)(struct arm_smmu_domain *smmu_domain); > + void (*tlb_sync)(struct arm_smmu_device *smmu, int page, int sync, > + int status); Can't page, sync and status all be unsigned? Thierry
On 29/08/2019 23:47, Krishna Reddy wrote: > tlb_sync hook allows nvidia smmu handle tlb sync > across multiple SMMUs as necessary. > > Signed-off-by: Krishna Reddy <vdumpa@nvidia.com> > --- > drivers/iommu/arm-smmu-nvidia.c | 32 ++++++++++++++++++++++++++++++++ > drivers/iommu/arm-smmu.c | 8 +++++--- > drivers/iommu/arm-smmu.h | 4 ++++ > 3 files changed, 41 insertions(+), 3 deletions(-) > > diff --git a/drivers/iommu/arm-smmu-nvidia.c b/drivers/iommu/arm-smmu-nvidia.c > index d93ceda..a429b2c 100644 > --- a/drivers/iommu/arm-smmu-nvidia.c > +++ b/drivers/iommu/arm-smmu-nvidia.c > @@ -56,11 +56,43 @@ static void nsmmu_write_reg64(struct arm_smmu_device *smmu, > writeq_relaxed(val, nsmmu_page(smmu, i, page) + offset); > } > > +static void nsmmu_tlb_sync_wait(struct arm_smmu_device *smmu, int page, > + int sync, int status, int inst) > +{ > + u32 reg; > + unsigned int spin_cnt, delay; > + > + for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) { > + for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) { > + reg = readl_relaxed( > + nsmmu_page(smmu, inst, page) + status); > + if (!(reg & sTLBGSTATUS_GSACTIVE)) > + return; > + cpu_relax(); > + } > + udelay(delay); > + } > + dev_err_ratelimited(smmu->dev, > + "TLB sync timed out -- SMMU may be deadlocked\n"); > +} > + > +static void nsmmu_tlb_sync(struct arm_smmu_device *smmu, int page, > + int sync, int status) > +{ > + int i; > + > + arm_smmu_writel(smmu, page, sync, 0); > + > + for (i = 0; i < to_nsmmu(smmu)->num_inst; i++) It might make more sense to make this the innermost loop, i.e.: for (i = 0; i < nsmmu->num_inst; i++) reg &= readl_relaxed(nsmmu_page(smmu, i, page)... since polling the instances in parallel rather than in series seems like it might be a bit more efficient. 
> + nsmmu_tlb_sync_wait(smmu, page, sync, status, i); > +} > + > static const struct arm_smmu_impl nsmmu_impl = { > .read_reg = nsmmu_read_reg, > .write_reg = nsmmu_write_reg, > .read_reg64 = nsmmu_read_reg64, > .write_reg64 = nsmmu_write_reg64, > + .tlb_sync = nsmmu_tlb_sync, > }; > > struct arm_smmu_device *nvidia_smmu_impl_init(struct arm_smmu_device *smmu) > diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c > index 46e1641..f5454e71 100644 > --- a/drivers/iommu/arm-smmu.c > +++ b/drivers/iommu/arm-smmu.c > @@ -52,9 +52,6 @@ > */ > #define QCOM_DUMMY_VAL -1 > > -#define TLB_LOOP_TIMEOUT 1000000 /* 1s! */ > -#define TLB_SPIN_COUNT 10 > - > #define MSI_IOVA_BASE 0x8000000 > #define MSI_IOVA_LENGTH 0x100000 > > @@ -244,6 +241,11 @@ static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu, int page, > unsigned int spin_cnt, delay; > u32 reg; > > + if (smmu->impl->tlb_sync) { > + smmu->impl->tlb_sync(smmu, page, sync, status); What I'd hoped is that rather than needing a hook for this, you could just override smmu_domain->tlb_ops from .init_context to wire up the alternate .sync method directly. That would save this extra level of indirection. Robin. > + return; > + } > + > arm_smmu_writel(smmu, page, sync, QCOM_DUMMY_VAL); > for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) { > for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) { > diff --git a/drivers/iommu/arm-smmu.h b/drivers/iommu/arm-smmu.h > index 9645bf1..d3217f1 100644 > --- a/drivers/iommu/arm-smmu.h > +++ b/drivers/iommu/arm-smmu.h > @@ -207,6 +207,8 @@ enum arm_smmu_cbar_type { > /* Maximum number of context banks per SMMU */ > #define ARM_SMMU_MAX_CBS 128 > > +#define TLB_LOOP_TIMEOUT 1000000 /* 1s! 
*/ > +#define TLB_SPIN_COUNT 10 > > /* Shared driver definitions */ > enum arm_smmu_arch_version { > @@ -336,6 +338,8 @@ struct arm_smmu_impl { > int (*cfg_probe)(struct arm_smmu_device *smmu); > int (*reset)(struct arm_smmu_device *smmu); > int (*init_context)(struct arm_smmu_domain *smmu_domain); > + void (*tlb_sync)(struct arm_smmu_device *smmu, int page, int sync, > + int status); > }; > > static inline void __iomem *arm_smmu_page(struct arm_smmu_device *smmu, int n) >
>> + for (i = 0; i < to_nsmmu(smmu)->num_inst; i++) >It might make more sense to make this the innermost loop, i.e.: for (i = 0; i < nsmmu->num_inst; i++) reg &= readl_relaxed(nsmmu_page(smmu, i, page)... >since polling the instances in parallel rather than in series seems like it might be a bit more efficient. Sync register is programmed at the same time for both instances. The status check is serialized. I can update it to check status of both at the same time. >> + if (smmu->impl->tlb_sync) { >> + smmu->impl->tlb_sync(smmu, page, sync, status); >What I'd hoped is that rather than needing a hook for this, you could just override smmu_domain->tlb_ops from .init_context to wire up the alternate .sync method directly. That would save this extra level of indirection. With arm_smmu_domain now available in arm-smmu.h, arm-smmu-nvidia.c can directly update the tlb_ops->tlb_sync and avoid indirection. Will update in next version. -KR
>Wouldn't it work if you replaced all calls of __arm_smmu_tlb_sync() by >smmu->impl->tlb_sync() and assign __arm_smmu_tlb_sync() as default for >devices that don't need to override it? That makes this patch slightly larger, but it saves us one level of indirection. The tlb_ops->tlb_sync can be overridden directly in the arm-smmu-nvidia.c specific implementation as pointed out by Robin. Will be updating in the next patch. >> + void (*tlb_sync)(struct arm_smmu_device *smmu, int page, int sync, >> + int status); >Can't page, sync and status all be unsigned? This is to be uniform with the original tlb_sync definition in arm-smmu.c. Anyway, this hook is not necessary as tlb_ops->tlb_sync can be overridden directly. -KR
>> + if (smmu->impl->tlb_sync) { >> + smmu->impl->tlb_sync(smmu, page, sync, status); >What I'd hoped is that rather than needing a hook for this, you could just override smmu_domain->tlb_ops from .init_context to wire up the alternate .sync method directly. That would save this extra level of indirection. Hi Robin, overriding the tlb_ops->tlb_sync function is not enough here. There are direct references to the arm_smmu_tlb_sync_context() and arm_smmu_tlb_sync_global() functions in arm-smmu.c. We can replace these direct references with the tlb_ops->tlb_sync() function except in one function, arm_smmu_device_reset(). When arm_smmu_device_reset() gets called, domains are not initialized and tlb_ops is not available. Should we add a new hook for arm_smmu_tlb_sync_global() or make this a responsibility of the impl->reset() hook, as it gets called at the same place? -KR
On 30/08/2019 23:49, Krishna Reddy wrote: >>> + if (smmu->impl->tlb_sync) { >>> + smmu->impl->tlb_sync(smmu, page, sync, status); > >> What I'd hoped is that rather than needing a hook for this, you could just override smmu_domain->tlb_ops from .init_context to wire up the alternate .sync method directly. That would save this extra level of indirection. > > Hi Robin, overriding tlb_ops->tlb_sync function is not enough here. > There are direct references to arm_smmu_tlb_sync_context(), arm_smmu_tlb_sync_global() functions. > In arm-smmu.c. we can replace these direct references with tlb_ops->tlb_sync() function except in one function arm_smmu_device_reset(). > When arm_smmu_device_reset() gets called, domains are not initialized and tlb_ops is not available. > Should we add a new hook for arm_smmu_tlb_sync_global() or make this as a responsibility of impl->reset() hook as it gets > called at the same place? Ah, right, I'd forgotten about the TLB maintenance on reset. Looking again, though, I think you might need to implement impl->reset anyway for the sake of clearing GFSR correctly - since the value read from the base instance technically may not match whatever bits might happen to be set in the other instances - so I don't see any issue with just calling nsmmu_tlb_sync() from there as well. Robin.
diff --git a/drivers/iommu/arm-smmu-nvidia.c b/drivers/iommu/arm-smmu-nvidia.c index d93ceda..a429b2c 100644 --- a/drivers/iommu/arm-smmu-nvidia.c +++ b/drivers/iommu/arm-smmu-nvidia.c @@ -56,11 +56,43 @@ static void nsmmu_write_reg64(struct arm_smmu_device *smmu, writeq_relaxed(val, nsmmu_page(smmu, i, page) + offset); } +static void nsmmu_tlb_sync_wait(struct arm_smmu_device *smmu, int page, + int sync, int status, int inst) +{ + u32 reg; + unsigned int spin_cnt, delay; + + for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) { + for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) { + reg = readl_relaxed( + nsmmu_page(smmu, inst, page) + status); + if (!(reg & sTLBGSTATUS_GSACTIVE)) + return; + cpu_relax(); + } + udelay(delay); + } + dev_err_ratelimited(smmu->dev, + "TLB sync timed out -- SMMU may be deadlocked\n"); +} + +static void nsmmu_tlb_sync(struct arm_smmu_device *smmu, int page, + int sync, int status) +{ + int i; + + arm_smmu_writel(smmu, page, sync, 0); + + for (i = 0; i < to_nsmmu(smmu)->num_inst; i++) + nsmmu_tlb_sync_wait(smmu, page, sync, status, i); +} + static const struct arm_smmu_impl nsmmu_impl = { .read_reg = nsmmu_read_reg, .write_reg = nsmmu_write_reg, .read_reg64 = nsmmu_read_reg64, .write_reg64 = nsmmu_write_reg64, + .tlb_sync = nsmmu_tlb_sync, }; struct arm_smmu_device *nvidia_smmu_impl_init(struct arm_smmu_device *smmu) diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index 46e1641..f5454e71 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -52,9 +52,6 @@ */ #define QCOM_DUMMY_VAL -1 -#define TLB_LOOP_TIMEOUT 1000000 /* 1s! 
*/ -#define TLB_SPIN_COUNT 10 - #define MSI_IOVA_BASE 0x8000000 #define MSI_IOVA_LENGTH 0x100000 @@ -244,6 +241,11 @@ static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu, int page, unsigned int spin_cnt, delay; u32 reg; + if (smmu->impl->tlb_sync) { + smmu->impl->tlb_sync(smmu, page, sync, status); + return; + } + arm_smmu_writel(smmu, page, sync, QCOM_DUMMY_VAL); for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) { for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) { diff --git a/drivers/iommu/arm-smmu.h b/drivers/iommu/arm-smmu.h index 9645bf1..d3217f1 100644 --- a/drivers/iommu/arm-smmu.h +++ b/drivers/iommu/arm-smmu.h @@ -207,6 +207,8 @@ enum arm_smmu_cbar_type { /* Maximum number of context banks per SMMU */ #define ARM_SMMU_MAX_CBS 128 +#define TLB_LOOP_TIMEOUT 1000000 /* 1s! */ +#define TLB_SPIN_COUNT 10 /* Shared driver definitions */ enum arm_smmu_arch_version { @@ -336,6 +338,8 @@ struct arm_smmu_impl { int (*cfg_probe)(struct arm_smmu_device *smmu); int (*reset)(struct arm_smmu_device *smmu); int (*init_context)(struct arm_smmu_domain *smmu_domain); + void (*tlb_sync)(struct arm_smmu_device *smmu, int page, int sync, + int status); }; static inline void __iomem *arm_smmu_page(struct arm_smmu_device *smmu, int n)
tlb_sync hook allows nvidia smmu to handle tlb sync across multiple SMMUs as necessary. Signed-off-by: Krishna Reddy <vdumpa@nvidia.com> --- drivers/iommu/arm-smmu-nvidia.c | 32 ++++++++++++++++++++++++++++++++ drivers/iommu/arm-smmu.c | 8 +++++--- drivers/iommu/arm-smmu.h | 4 ++++ 3 files changed, 41 insertions(+), 3 deletions(-)