Message ID | 889a32458cec92ed110b94f393aa1c2f0d64dca5.1600754909.git.saiprakash.ranjan@codeaurora.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | System Cache support for GPU and required SMMU support | expand |
On Tue, Sep 22, 2020 at 11:48:17AM +0530, Sai Prakash Ranjan wrote: > From: Sharat Masetty <smasetty@codeaurora.org> > > The last level system cache can be partitioned to 32 different > slices of which GPU has two slices preallocated. One slice is > used for caching GPU buffers and the other slice is used for > caching the GPU SMMU pagetables. This talks to the core system > cache driver to acquire the slice handles, configure the SCID's > to those slices and activates and deactivates the slices upon > GPU power collapse and restore. > > Some support from the IOMMU driver is also needed to make use > of the system cache to set the right TCR attributes. GPU then > has the ability to override a few cacheability parameters which > it does to override write-allocate to write-no-allocate as the > GPU hardware does not benefit much from it. > > DOMAIN_ATTR_SYS_CACHE is another domain level attribute used by the > IOMMU driver to set the right attributes to cache the hardware > pagetables into the system cache. 
> > Signed-off-by: Sharat Masetty <smasetty@codeaurora.org> > [saiprakash.ranjan: fix to set attr before device attach to iommu and rebase] > Signed-off-by: Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org> > --- > drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 83 +++++++++++++++++++++++++ > drivers/gpu/drm/msm/adreno/a6xx_gpu.h | 4 ++ > drivers/gpu/drm/msm/adreno/adreno_gpu.c | 17 +++++ > 3 files changed, 104 insertions(+) > > diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > index 8915882e4444..151190ff62f7 100644 > --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > @@ -8,7 +8,9 @@ > #include "a6xx_gpu.h" > #include "a6xx_gmu.xml.h" > > +#include <linux/bitfield.h> > #include <linux/devfreq.h> > +#include <linux/soc/qcom/llcc-qcom.h> > > #define GPU_PAS_ID 13 > > @@ -1022,6 +1024,79 @@ static irqreturn_t a6xx_irq(struct msm_gpu *gpu) > return IRQ_HANDLED; > } > > +static void a6xx_llc_rmw(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 mask, u32 or) > +{ > + return msm_rmw(a6xx_gpu->llc_mmio + (reg << 2), mask, or); > +} > + > +static void a6xx_llc_write(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 value) > +{ > + return msm_writel(value, a6xx_gpu->llc_mmio + (reg << 2)); > +} > + > +static void a6xx_llc_deactivate(struct a6xx_gpu *a6xx_gpu) > +{ > + llcc_slice_deactivate(a6xx_gpu->llc_slice); > + llcc_slice_deactivate(a6xx_gpu->htw_llc_slice); > +} > + > +static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu) > +{ > + u32 cntl1_regval = 0; > + > + if (IS_ERR(a6xx_gpu->llc_mmio)) > + return; > + > + if (!llcc_slice_activate(a6xx_gpu->llc_slice)) { > + u32 gpu_scid = llcc_get_slice_id(a6xx_gpu->llc_slice); > + > + gpu_scid &= 0x1f; > + cntl1_regval = (gpu_scid << 0) | (gpu_scid << 5) | (gpu_scid << 10) | > + (gpu_scid << 15) | (gpu_scid << 20); > + } > + > + if (!llcc_slice_activate(a6xx_gpu->htw_llc_slice)) { > + u32 gpuhtw_scid = llcc_get_slice_id(a6xx_gpu->htw_llc_slice); > + > + gpuhtw_scid 
&= 0x1f; > + cntl1_regval |= FIELD_PREP(GENMASK(29, 25), gpuhtw_scid); > + } > + > + if (cntl1_regval) { > + /* > + * Program the slice IDs for the various GPU blocks and GPU MMU > + * pagetables > + */ > + a6xx_llc_write(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1, cntl1_regval); > + > + /* > + * Program cacheability overrides to not allocate cache lines on > + * a write miss > + */ > + a6xx_llc_rmw(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 0xF, 0x03); > + } > +} This code has been around long enough that it pre-dates a650. On a650 and other MMU-500 targets the htw_llc is configured by the firmware and the llc_slice is configured in a different register. I don't think we need to pause everything and add support for the MMU-500 path, but we do need a way to disallow LLCC on affected targets until such time that we can get it fixed up. Jordan > + > +static void a6xx_llc_slices_destroy(struct a6xx_gpu *a6xx_gpu) > +{ > + llcc_slice_putd(a6xx_gpu->llc_slice); > + llcc_slice_putd(a6xx_gpu->htw_llc_slice); > +} > + > +static void a6xx_llc_slices_init(struct platform_device *pdev, > + struct a6xx_gpu *a6xx_gpu) > +{ > + a6xx_gpu->llc_mmio = msm_ioremap(pdev, "cx_mem", "gpu_cx"); > + if (IS_ERR(a6xx_gpu->llc_mmio)) > + return; > + > + a6xx_gpu->llc_slice = llcc_slice_getd(LLCC_GPU); > + a6xx_gpu->htw_llc_slice = llcc_slice_getd(LLCC_GPUHTW); > + > + if (IS_ERR(a6xx_gpu->llc_slice) && IS_ERR(a6xx_gpu->htw_llc_slice)) > + a6xx_gpu->llc_mmio = ERR_PTR(-EINVAL); > +} > + > static int a6xx_pm_resume(struct msm_gpu *gpu) > { > struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); > @@ -1038,6 +1113,8 @@ static int a6xx_pm_resume(struct msm_gpu *gpu) > > msm_gpu_resume_devfreq(gpu); > > + a6xx_llc_activate(a6xx_gpu); > + > return 0; > } > > @@ -1048,6 +1125,8 @@ static int a6xx_pm_suspend(struct msm_gpu *gpu) > > trace_msm_gpu_suspend(0); > > + a6xx_llc_deactivate(a6xx_gpu); > + > devfreq_suspend_device(gpu->devfreq.devfreq); > > return a6xx_gmu_stop(a6xx_gpu); > @@ 
-1091,6 +1170,8 @@ static void a6xx_destroy(struct msm_gpu *gpu) > drm_gem_object_put(a6xx_gpu->shadow_bo); > } > > + a6xx_llc_slices_destroy(a6xx_gpu); > + > a6xx_gmu_remove(a6xx_gpu); > > adreno_gpu_cleanup(adreno_gpu); > @@ -1209,6 +1290,8 @@ struct msm_gpu *a6xx_gpu_init(struct drm_device *dev) > if (info && info->revn == 650) > adreno_gpu->base.hw_apriv = true; > > + a6xx_llc_slices_init(pdev, a6xx_gpu); > + > ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs, 1); > if (ret) { > a6xx_destroy(&(a6xx_gpu->base.base)); > diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h > index 3eeebf6a754b..9e6079af679c 100644 > --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h > +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h > @@ -28,6 +28,10 @@ struct a6xx_gpu { > uint32_t *shadow; > > bool has_whereami; > + > + void __iomem *llc_mmio; > + void *llc_slice; > + void *htw_llc_slice; > }; > > #define to_a6xx_gpu(x) container_of(x, struct a6xx_gpu, base) > diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c > index fd8f491f2e48..86c4fe667225 100644 > --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c > +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c > @@ -16,6 +16,7 @@ > #include <linux/soc/qcom/mdt_loader.h> > #include <soc/qcom/ocmem.h> > #include "adreno_gpu.h" > +#include "a6xx_gpu.h" > #include "msm_gem.h" > #include "msm_mmu.h" > > @@ -189,6 +190,8 @@ struct msm_gem_address_space * > adreno_iommu_create_address_space(struct msm_gpu *gpu, > struct platform_device *pdev) > { > + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); > + struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu); > struct iommu_domain *iommu; > struct msm_mmu *mmu; > struct msm_gem_address_space *aspace; > @@ -198,7 +201,21 @@ adreno_iommu_create_address_space(struct msm_gpu *gpu, > if (!iommu) > return NULL; > > + /* > + * This allows GPU to set the bus attributes required to use system > + * cache on behalf of the iommu page table 
walker. > + */ > + if (!IS_ERR(a6xx_gpu->htw_llc_slice)) { > + int gpu_htw_llc = 1; > + > + iommu_domain_set_attr(iommu, DOMAIN_ATTR_SYS_CACHE, &gpu_htw_llc); > + } > + > mmu = msm_iommu_new(&pdev->dev, iommu); > + if (IS_ERR(mmu)) { > + iommu_domain_free(iommu); > + return ERR_CAST(mmu); > + } > > /* > * Use the aperture start or SZ_16M, whichever is greater. This will > -- > QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member > of Code Aurora Forum, hosted by The Linux Foundation >
Hi Jordan, On 2020-09-23 20:33, Jordan Crouse wrote: > On Tue, Sep 22, 2020 at 11:48:17AM +0530, Sai Prakash Ranjan wrote: >> From: Sharat Masetty <smasetty@codeaurora.org> >> >> The last level system cache can be partitioned to 32 different >> slices of which GPU has two slices preallocated. One slice is >> used for caching GPU buffers and the other slice is used for >> caching the GPU SMMU pagetables. This talks to the core system >> cache driver to acquire the slice handles, configure the SCID's >> to those slices and activates and deactivates the slices upon >> GPU power collapse and restore. >> >> Some support from the IOMMU driver is also needed to make use >> of the system cache to set the right TCR attributes. GPU then >> has the ability to override a few cacheability parameters which >> it does to override write-allocate to write-no-allocate as the >> GPU hardware does not benefit much from it. >> >> DOMAIN_ATTR_SYS_CACHE is another domain level attribute used by the >> IOMMU driver to set the right attributes to cache the hardware >> pagetables into the system cache. 
>> >> Signed-off-by: Sharat Masetty <smasetty@codeaurora.org> >> [saiprakash.ranjan: fix to set attr before device attach to iommu and >> rebase] >> Signed-off-by: Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org> >> --- >> drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 83 >> +++++++++++++++++++++++++ >> drivers/gpu/drm/msm/adreno/a6xx_gpu.h | 4 ++ >> drivers/gpu/drm/msm/adreno/adreno_gpu.c | 17 +++++ >> 3 files changed, 104 insertions(+) >> >> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c >> b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c >> index 8915882e4444..151190ff62f7 100644 >> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c >> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c >> @@ -8,7 +8,9 @@ >> #include "a6xx_gpu.h" >> #include "a6xx_gmu.xml.h" >> >> +#include <linux/bitfield.h> >> #include <linux/devfreq.h> >> +#include <linux/soc/qcom/llcc-qcom.h> >> >> #define GPU_PAS_ID 13 >> >> @@ -1022,6 +1024,79 @@ static irqreturn_t a6xx_irq(struct msm_gpu >> *gpu) >> return IRQ_HANDLED; >> } >> >> +static void a6xx_llc_rmw(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 >> mask, u32 or) >> +{ >> + return msm_rmw(a6xx_gpu->llc_mmio + (reg << 2), mask, or); >> +} >> + >> +static void a6xx_llc_write(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 >> value) >> +{ >> + return msm_writel(value, a6xx_gpu->llc_mmio + (reg << 2)); >> +} >> + >> +static void a6xx_llc_deactivate(struct a6xx_gpu *a6xx_gpu) >> +{ >> + llcc_slice_deactivate(a6xx_gpu->llc_slice); >> + llcc_slice_deactivate(a6xx_gpu->htw_llc_slice); >> +} >> + >> +static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu) >> +{ >> + u32 cntl1_regval = 0; >> + >> + if (IS_ERR(a6xx_gpu->llc_mmio)) >> + return; >> + >> + if (!llcc_slice_activate(a6xx_gpu->llc_slice)) { >> + u32 gpu_scid = llcc_get_slice_id(a6xx_gpu->llc_slice); >> + >> + gpu_scid &= 0x1f; >> + cntl1_regval = (gpu_scid << 0) | (gpu_scid << 5) | (gpu_scid << 10) >> | >> + (gpu_scid << 15) | (gpu_scid << 20); >> + } >> + >> + if (!llcc_slice_activate(a6xx_gpu->htw_llc_slice)) { >> + 
u32 gpuhtw_scid = llcc_get_slice_id(a6xx_gpu->htw_llc_slice); >> + >> + gpuhtw_scid &= 0x1f; >> + cntl1_regval |= FIELD_PREP(GENMASK(29, 25), gpuhtw_scid); >> + } >> + >> + if (cntl1_regval) { >> + /* >> + * Program the slice IDs for the various GPU blocks and GPU MMU >> + * pagetables >> + */ >> + a6xx_llc_write(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1, >> cntl1_regval); >> + >> + /* >> + * Program cacheability overrides to not allocate cache lines on >> + * a write miss >> + */ >> + a6xx_llc_rmw(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 0xF, >> 0x03); >> + } >> +} > > This code has been around long enough that it pre-dates a650. On a650 > and other > MMU-500 targets the htw_llc is configured by the firmware and the > llc_slice is > configured in a different register. > > I don't think we need to pause everything and add support for the > MMU-500 path, > but we do need a way to disallow LLCC on affected targets until such > time that > we can get it fixed up. > Thanks for taking a close look. Does something like the below look OK, or is something else needed here? + /* Till the time we get in LLCC support for A650 */ + if (!(info && info->revn == 650)) + a6xx_llc_slices_init(pdev, a6xx_gpu); Thanks, Sai
On Mon, Sep 28, 2020 at 05:56:55PM +0530, Sai Prakash Ranjan wrote: > Hi Jordan, > > On 2020-09-23 20:33, Jordan Crouse wrote: > >On Tue, Sep 22, 2020 at 11:48:17AM +0530, Sai Prakash Ranjan wrote: > >>From: Sharat Masetty <smasetty@codeaurora.org> > >> > >>The last level system cache can be partitioned to 32 different > >>slices of which GPU has two slices preallocated. One slice is > >>used for caching GPU buffers and the other slice is used for > >>caching the GPU SMMU pagetables. This talks to the core system > >>cache driver to acquire the slice handles, configure the SCID's > >>to those slices and activates and deactivates the slices upon > >>GPU power collapse and restore. > >> > >>Some support from the IOMMU driver is also needed to make use > >>of the system cache to set the right TCR attributes. GPU then > >>has the ability to override a few cacheability parameters which > >>it does to override write-allocate to write-no-allocate as the > >>GPU hardware does not benefit much from it. > >> > >>DOMAIN_ATTR_SYS_CACHE is another domain level attribute used by the > >>IOMMU driver to set the right attributes to cache the hardware > >>pagetables into the system cache. 
> >> > >>Signed-off-by: Sharat Masetty <smasetty@codeaurora.org> > >>[saiprakash.ranjan: fix to set attr before device attach to iommu and > >>rebase] > >>Signed-off-by: Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org> > >>--- > >> drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 83 +++++++++++++++++++++++++ > >> drivers/gpu/drm/msm/adreno/a6xx_gpu.h | 4 ++ > >> drivers/gpu/drm/msm/adreno/adreno_gpu.c | 17 +++++ > >> 3 files changed, 104 insertions(+) > >> > >>diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > >>b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > >>index 8915882e4444..151190ff62f7 100644 > >>--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > >>+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > >>@@ -8,7 +8,9 @@ > >> #include "a6xx_gpu.h" > >> #include "a6xx_gmu.xml.h" > >> > >>+#include <linux/bitfield.h> > >> #include <linux/devfreq.h> > >>+#include <linux/soc/qcom/llcc-qcom.h> > >> > >> #define GPU_PAS_ID 13 > >> > >>@@ -1022,6 +1024,79 @@ static irqreturn_t a6xx_irq(struct msm_gpu *gpu) > >> return IRQ_HANDLED; > >> } > >> > >>+static void a6xx_llc_rmw(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 mask, > >>u32 or) > >>+{ > >>+ return msm_rmw(a6xx_gpu->llc_mmio + (reg << 2), mask, or); > >>+} > >>+ > >>+static void a6xx_llc_write(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 > >>value) > >>+{ > >>+ return msm_writel(value, a6xx_gpu->llc_mmio + (reg << 2)); > >>+} > >>+ > >>+static void a6xx_llc_deactivate(struct a6xx_gpu *a6xx_gpu) > >>+{ > >>+ llcc_slice_deactivate(a6xx_gpu->llc_slice); > >>+ llcc_slice_deactivate(a6xx_gpu->htw_llc_slice); > >>+} > >>+ > >>+static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu) > >>+{ > >>+ u32 cntl1_regval = 0; > >>+ > >>+ if (IS_ERR(a6xx_gpu->llc_mmio)) > >>+ return; > >>+ > >>+ if (!llcc_slice_activate(a6xx_gpu->llc_slice)) { > >>+ u32 gpu_scid = llcc_get_slice_id(a6xx_gpu->llc_slice); > >>+ > >>+ gpu_scid &= 0x1f; > >>+ cntl1_regval = (gpu_scid << 0) | (gpu_scid << 5) | (gpu_scid << 10) | > >>+ (gpu_scid << 15) | (gpu_scid << 20); > >>+ } 
> >>+ > >>+ if (!llcc_slice_activate(a6xx_gpu->htw_llc_slice)) { > >>+ u32 gpuhtw_scid = llcc_get_slice_id(a6xx_gpu->htw_llc_slice); > >>+ > >>+ gpuhtw_scid &= 0x1f; > >>+ cntl1_regval |= FIELD_PREP(GENMASK(29, 25), gpuhtw_scid); > >>+ } > >>+ > >>+ if (cntl1_regval) { > >>+ /* > >>+ * Program the slice IDs for the various GPU blocks and GPU MMU > >>+ * pagetables > >>+ */ > >>+ a6xx_llc_write(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1, > >>cntl1_regval); > >>+ > >>+ /* > >>+ * Program cacheability overrides to not allocate cache lines on > >>+ * a write miss > >>+ */ > >>+ a6xx_llc_rmw(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 0xF, > >>0x03); > >>+ } > >>+} > > > >This code has been around long enough that it pre-dates a650. On a650 and > >other > >MMU-500 targets the htw_llc is configured by the firmware and the > >llc_slice is > >configured in a different register. > > > >I don't think we need to pause everything and add support for the MMU-500 > >path, > >but we do need a way to disallow LLCC on affected targets until such time > >that > >we can get it fixed up. > > > > Thanks for taking a close look, does something like below look ok or > something > else is needed here? > > + /* Till the time we get in LLCC support for A650 */ > + if (!(info && info->revn == 650)) > + a6xx_llc_slices_init(pdev, a6xx_gpu); It doesn't look like Rob picked this up for 5.10, so we have some time to do it right. Would you like me to give you an add-on patch for mmu-500 targets? Jordan > Thanks, > Sai > > -- > QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member > of Code Aurora Forum, hosted by The Linux Foundation
On 2020-09-28 21:41, Jordan Crouse wrote: > On Mon, Sep 28, 2020 at 05:56:55PM +0530, Sai Prakash Ranjan wrote: >> Hi Jordan, >> >> On 2020-09-23 20:33, Jordan Crouse wrote: >> >On Tue, Sep 22, 2020 at 11:48:17AM +0530, Sai Prakash Ranjan wrote: >> >>From: Sharat Masetty <smasetty@codeaurora.org> >> >> >> >>The last level system cache can be partitioned to 32 different >> >>slices of which GPU has two slices preallocated. One slice is >> >>used for caching GPU buffers and the other slice is used for >> >>caching the GPU SMMU pagetables. This talks to the core system >> >>cache driver to acquire the slice handles, configure the SCID's >> >>to those slices and activates and deactivates the slices upon >> >>GPU power collapse and restore. >> >> >> >>Some support from the IOMMU driver is also needed to make use >> >>of the system cache to set the right TCR attributes. GPU then >> >>has the ability to override a few cacheability parameters which >> >>it does to override write-allocate to write-no-allocate as the >> >>GPU hardware does not benefit much from it. >> >> >> >>DOMAIN_ATTR_SYS_CACHE is another domain level attribute used by the >> >>IOMMU driver to set the right attributes to cache the hardware >> >>pagetables into the system cache. 
>> >> >> >>Signed-off-by: Sharat Masetty <smasetty@codeaurora.org> >> >>[saiprakash.ranjan: fix to set attr before device attach to iommu and >> >>rebase] >> >>Signed-off-by: Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org> >> >>--- >> >> drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 83 +++++++++++++++++++++++++ >> >> drivers/gpu/drm/msm/adreno/a6xx_gpu.h | 4 ++ >> >> drivers/gpu/drm/msm/adreno/adreno_gpu.c | 17 +++++ >> >> 3 files changed, 104 insertions(+) >> >> >> >>diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c >> >>b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c >> >>index 8915882e4444..151190ff62f7 100644 >> >>--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c >> >>+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c >> >>@@ -8,7 +8,9 @@ >> >> #include "a6xx_gpu.h" >> >> #include "a6xx_gmu.xml.h" >> >> >> >>+#include <linux/bitfield.h> >> >> #include <linux/devfreq.h> >> >>+#include <linux/soc/qcom/llcc-qcom.h> >> >> >> >> #define GPU_PAS_ID 13 >> >> >> >>@@ -1022,6 +1024,79 @@ static irqreturn_t a6xx_irq(struct msm_gpu *gpu) >> >> return IRQ_HANDLED; >> >> } >> >> >> >>+static void a6xx_llc_rmw(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 mask, >> >>u32 or) >> >>+{ >> >>+ return msm_rmw(a6xx_gpu->llc_mmio + (reg << 2), mask, or); >> >>+} >> >>+ >> >>+static void a6xx_llc_write(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 >> >>value) >> >>+{ >> >>+ return msm_writel(value, a6xx_gpu->llc_mmio + (reg << 2)); >> >>+} >> >>+ >> >>+static void a6xx_llc_deactivate(struct a6xx_gpu *a6xx_gpu) >> >>+{ >> >>+ llcc_slice_deactivate(a6xx_gpu->llc_slice); >> >>+ llcc_slice_deactivate(a6xx_gpu->htw_llc_slice); >> >>+} >> >>+ >> >>+static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu) >> >>+{ >> >>+ u32 cntl1_regval = 0; >> >>+ >> >>+ if (IS_ERR(a6xx_gpu->llc_mmio)) >> >>+ return; >> >>+ >> >>+ if (!llcc_slice_activate(a6xx_gpu->llc_slice)) { >> >>+ u32 gpu_scid = llcc_get_slice_id(a6xx_gpu->llc_slice); >> >>+ >> >>+ gpu_scid &= 0x1f; >> >>+ cntl1_regval = (gpu_scid << 0) | (gpu_scid << 5) | (gpu_scid 
<< 10) | >> >>+ (gpu_scid << 15) | (gpu_scid << 20); >> >>+ } >> >>+ >> >>+ if (!llcc_slice_activate(a6xx_gpu->htw_llc_slice)) { >> >>+ u32 gpuhtw_scid = llcc_get_slice_id(a6xx_gpu->htw_llc_slice); >> >>+ >> >>+ gpuhtw_scid &= 0x1f; >> >>+ cntl1_regval |= FIELD_PREP(GENMASK(29, 25), gpuhtw_scid); >> >>+ } >> >>+ >> >>+ if (cntl1_regval) { >> >>+ /* >> >>+ * Program the slice IDs for the various GPU blocks and GPU MMU >> >>+ * pagetables >> >>+ */ >> >>+ a6xx_llc_write(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1, >> >>cntl1_regval); >> >>+ >> >>+ /* >> >>+ * Program cacheability overrides to not allocate cache lines on >> >>+ * a write miss >> >>+ */ >> >>+ a6xx_llc_rmw(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 0xF, >> >>0x03); >> >>+ } >> >>+} >> > >> >This code has been around long enough that it pre-dates a650. On a650 and >> >other >> >MMU-500 targets the htw_llc is configured by the firmware and the >> >llc_slice is >> >configured in a different register. >> > >> >I don't think we need to pause everything and add support for the MMU-500 >> >path, >> >but we do need a way to disallow LLCC on affected targets until such time >> >that >> >we can get it fixed up. >> > >> >> Thanks for taking a close look, does something like below look ok or >> something >> else is needed here? >> >> + /* Till the time we get in LLCC support for A650 */ >> + if (!(info && info->revn == 650)) >> + a6xx_llc_slices_init(pdev, a6xx_gpu); > > It doesn't look like Rob picked this up for 5.10, so we have some time > to do it > right. Would you like me to give you an add-on patch for mmu-500 > targets? > Yes that will be great. Thanks, Sai
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 8915882e4444..151190ff62f7 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -8,7 +8,9 @@ #include "a6xx_gpu.h" #include "a6xx_gmu.xml.h" +#include <linux/bitfield.h> #include <linux/devfreq.h> +#include <linux/soc/qcom/llcc-qcom.h> #define GPU_PAS_ID 13 @@ -1022,6 +1024,79 @@ static irqreturn_t a6xx_irq(struct msm_gpu *gpu) return IRQ_HANDLED; } +static void a6xx_llc_rmw(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 mask, u32 or) +{ + return msm_rmw(a6xx_gpu->llc_mmio + (reg << 2), mask, or); +} + +static void a6xx_llc_write(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 value) +{ + return msm_writel(value, a6xx_gpu->llc_mmio + (reg << 2)); +} + +static void a6xx_llc_deactivate(struct a6xx_gpu *a6xx_gpu) +{ + llcc_slice_deactivate(a6xx_gpu->llc_slice); + llcc_slice_deactivate(a6xx_gpu->htw_llc_slice); +} + +static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu) +{ + u32 cntl1_regval = 0; + + if (IS_ERR(a6xx_gpu->llc_mmio)) + return; + + if (!llcc_slice_activate(a6xx_gpu->llc_slice)) { + u32 gpu_scid = llcc_get_slice_id(a6xx_gpu->llc_slice); + + gpu_scid &= 0x1f; + cntl1_regval = (gpu_scid << 0) | (gpu_scid << 5) | (gpu_scid << 10) | + (gpu_scid << 15) | (gpu_scid << 20); + } + + if (!llcc_slice_activate(a6xx_gpu->htw_llc_slice)) { + u32 gpuhtw_scid = llcc_get_slice_id(a6xx_gpu->htw_llc_slice); + + gpuhtw_scid &= 0x1f; + cntl1_regval |= FIELD_PREP(GENMASK(29, 25), gpuhtw_scid); + } + + if (cntl1_regval) { + /* + * Program the slice IDs for the various GPU blocks and GPU MMU + * pagetables + */ + a6xx_llc_write(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1, cntl1_regval); + + /* + * Program cacheability overrides to not allocate cache lines on + * a write miss + */ + a6xx_llc_rmw(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 0xF, 0x03); + } +} + +static void a6xx_llc_slices_destroy(struct a6xx_gpu *a6xx_gpu) +{ + 
llcc_slice_putd(a6xx_gpu->llc_slice); + llcc_slice_putd(a6xx_gpu->htw_llc_slice); +} + +static void a6xx_llc_slices_init(struct platform_device *pdev, + struct a6xx_gpu *a6xx_gpu) +{ + a6xx_gpu->llc_mmio = msm_ioremap(pdev, "cx_mem", "gpu_cx"); + if (IS_ERR(a6xx_gpu->llc_mmio)) + return; + + a6xx_gpu->llc_slice = llcc_slice_getd(LLCC_GPU); + a6xx_gpu->htw_llc_slice = llcc_slice_getd(LLCC_GPUHTW); + + if (IS_ERR(a6xx_gpu->llc_slice) && IS_ERR(a6xx_gpu->htw_llc_slice)) + a6xx_gpu->llc_mmio = ERR_PTR(-EINVAL); +} + static int a6xx_pm_resume(struct msm_gpu *gpu) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); @@ -1038,6 +1113,8 @@ static int a6xx_pm_resume(struct msm_gpu *gpu) msm_gpu_resume_devfreq(gpu); + a6xx_llc_activate(a6xx_gpu); + return 0; } @@ -1048,6 +1125,8 @@ static int a6xx_pm_suspend(struct msm_gpu *gpu) trace_msm_gpu_suspend(0); + a6xx_llc_deactivate(a6xx_gpu); + devfreq_suspend_device(gpu->devfreq.devfreq); return a6xx_gmu_stop(a6xx_gpu); @@ -1091,6 +1170,8 @@ static void a6xx_destroy(struct msm_gpu *gpu) drm_gem_object_put(a6xx_gpu->shadow_bo); } + a6xx_llc_slices_destroy(a6xx_gpu); + a6xx_gmu_remove(a6xx_gpu); adreno_gpu_cleanup(adreno_gpu); @@ -1209,6 +1290,8 @@ struct msm_gpu *a6xx_gpu_init(struct drm_device *dev) if (info && info->revn == 650) adreno_gpu->base.hw_apriv = true; + a6xx_llc_slices_init(pdev, a6xx_gpu); + ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs, 1); if (ret) { a6xx_destroy(&(a6xx_gpu->base.base)); diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h index 3eeebf6a754b..9e6079af679c 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h @@ -28,6 +28,10 @@ struct a6xx_gpu { uint32_t *shadow; bool has_whereami; + + void __iomem *llc_mmio; + void *llc_slice; + void *htw_llc_slice; }; #define to_a6xx_gpu(x) container_of(x, struct a6xx_gpu, base) diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index 
fd8f491f2e48..86c4fe667225 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -16,6 +16,7 @@ #include <linux/soc/qcom/mdt_loader.h> #include <soc/qcom/ocmem.h> #include "adreno_gpu.h" +#include "a6xx_gpu.h" #include "msm_gem.h" #include "msm_mmu.h" @@ -189,6 +190,8 @@ struct msm_gem_address_space * adreno_iommu_create_address_space(struct msm_gpu *gpu, struct platform_device *pdev) { + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu); struct iommu_domain *iommu; struct msm_mmu *mmu; struct msm_gem_address_space *aspace; @@ -198,7 +201,21 @@ adreno_iommu_create_address_space(struct msm_gpu *gpu, if (!iommu) return NULL; + /* + * This allows GPU to set the bus attributes required to use system + * cache on behalf of the iommu page table walker. + */ + if (!IS_ERR(a6xx_gpu->htw_llc_slice)) { + int gpu_htw_llc = 1; + + iommu_domain_set_attr(iommu, DOMAIN_ATTR_SYS_CACHE, &gpu_htw_llc); + } + mmu = msm_iommu_new(&pdev->dev, iommu); + if (IS_ERR(mmu)) { + iommu_domain_free(iommu); + return ERR_CAST(mmu); + } /* * Use the aperture start or SZ_16M, whichever is greater. This will