
[v2] iommu/arm-smmu-qcom: Only enable stall on smmu-v2

Message ID 20250102183232.115279-1-robdclark@gmail.com
State Not Applicable
Series [v2] iommu/arm-smmu-qcom: Only enable stall on smmu-v2

Commit Message

Rob Clark Jan. 2, 2025, 6:32 p.m. UTC
From: Rob Clark <robdclark@chromium.org>

On mmu-500, stall-on-fault seems to stall all context banks, causing the
GMU to misbehave.  So limit this feature to smmu-v2 for now.

This fixes an issue with an older mesa bug taking out the system
because of the GMU going off into the weeds.

What we _think_ is happening is that, if the GPU generates 1000's of
faults at ~once (which is something that GPUs can be good at), it can
result in a sufficient number of stalled translations preventing other
transactions from entering the same TBU.

Signed-off-by: Rob Clark <robdclark@chromium.org>
---
v2: Adds a modparam to override the default behavior, for debugging
    GPU faults in cases which do not (or might not) cause lockup.
    Also, rebased to not depend on Bibek's PRR support.

 drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

Comments

Akhil P Oommen Jan. 2, 2025, 7:30 p.m. UTC | #1
On 1/3/2025 12:02 AM, Rob Clark wrote:
> From: Rob Clark <robdclark@chromium.org>
> 
> On mmu-500, stall-on-fault seems to stall all context banks, causing the
> GMU to misbehave.  So limit this feature to smmu-v2 for now.
> 
> This fixes an issue with an older mesa bug taking out the system
> because of the GMU going off into the weeds.
> 
> What we _think_ is happening is that, if the GPU generates 1000's of
> faults at ~once (which is something that GPUs can be good at), it can
> result in a sufficient number of stalled translations preventing other
> transactions from entering the same TBU.
> 
> Signed-off-by: Rob Clark <robdclark@chromium.org>

Reviewed-by: Akhil P Oommen <quic_akhilpo@quicinc.com>

-Akhil

> ---
> v2: Adds a modparam to override the default behavior, for debugging
>     GPU faults in cases which do not (or might not) cause lockup.
>     Also, rebased to not depend on Bibek's PRR support.
> 
>  drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 19 +++++++++++++++++--
>  1 file changed, 17 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
> index 6372f3e25c4b..3239bbf18514 100644
> --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
> +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
> @@ -16,6 +16,10 @@
>  
>  #define QCOM_DUMMY_VAL	-1
>  
> +static int enable_stall = -1;
> +MODULE_PARM_DESC(enable_stall, "Enable stall on iova fault (1=enable, 0=disable, -1=auto (default))");
> +module_param(enable_stall, int, 0600);
> +
>  static struct qcom_smmu *to_qcom_smmu(struct arm_smmu_device *smmu)
>  {
>  	return container_of(smmu, struct qcom_smmu, smmu);
> @@ -210,7 +214,9 @@ static bool qcom_adreno_can_do_ttbr1(struct arm_smmu_device *smmu)
>  static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain,
>  		struct io_pgtable_cfg *pgtbl_cfg, struct device *dev)
>  {
> +	const struct device_node *np = smmu_domain->smmu->dev->of_node;
>  	struct adreno_smmu_priv *priv;
> +	bool stall_enabled;
>  
>  	smmu_domain->cfg.flush_walk_prefer_tlbiasid = true;
>  
> @@ -237,8 +243,17 @@ static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain,
>  	priv->get_ttbr1_cfg = qcom_adreno_smmu_get_ttbr1_cfg;
>  	priv->set_ttbr0_cfg = qcom_adreno_smmu_set_ttbr0_cfg;
>  	priv->get_fault_info = qcom_adreno_smmu_get_fault_info;
> -	priv->set_stall = qcom_adreno_smmu_set_stall;
> -	priv->resume_translation = qcom_adreno_smmu_resume_translation;
> +
> +	if (enable_stall < 0) {
> +		stall_enabled = of_device_is_compatible(np, "qcom,smmu-v2");
> +	} else {
> +		stall_enabled = !!enable_stall;
> +	}
> +
> +	if (stall_enabled) {
> +		priv->set_stall = qcom_adreno_smmu_set_stall;
> +		priv->resume_translation = qcom_adreno_smmu_resume_translation;
> +	}
>  
>  	return 0;
>  }
Akhil P Oommen Jan. 6, 2025, 8:10 p.m. UTC | #2
On 1/3/2025 1:00 AM, Akhil P Oommen wrote:
> On 1/3/2025 12:02 AM, Rob Clark wrote:
>> From: Rob Clark <robdclark@chromium.org>
>>
>> On mmu-500, stall-on-fault seems to stall all context banks, causing the
>> GMU to misbehave.  So limit this feature to smmu-v2 for now.
>>
>> This fixes an issue with an older mesa bug taking out the system
>> because of the GMU going off into the weeds.
>>
>> What we _think_ is happening is that, if the GPU generates 1000's of
>> faults at ~once (which is something that GPUs can be good at), it can
>> result in a sufficient number of stalled translations preventing other
>> transactions from entering the same TBU.
>>
>> Signed-off-by: Rob Clark <robdclark@chromium.org>
> 
> Reviewed-by: Akhil P Oommen <quic_akhilpo@quicinc.com>
> 

Btw, if stall is not enabled, I think there is no point in capturing a
coredump from the adreno pagefault handler. By the time we start the
coredump, the GPU might have switched context.

-Akhil.

> -Akhil
>
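
A minimal sketch of the kind of gating Akhil describes, from the consumer
side (the helper and its use are hypothetical, not actual drm/msm code;
only struct adreno_smmu_priv and its set_stall/resume_translation
callbacks come from include/linux/adreno-smmu-priv.h):

    #include <linux/adreno-smmu-priv.h>

    /*
     * Hypothetical check: with this patch the qcom SMMU implementation
     * only fills in the stall callbacks when stall-on-fault is enabled,
     * so a NULL test tells the GPU driver whether the faulting state is
     * still frozen and therefore worth capturing in a devcoredump.
     */
    static bool stall_on_fault_available(const struct adreno_smmu_priv *priv)
    {
            return priv->set_stall && priv->resume_translation;
    }

If this returns false, the GPU may already have switched context by the
time the fault handler runs, so skipping the devcoredump capture (as
Akhil suggests) avoids collecting state that no longer matches the fault.
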
Rob Clark Jan. 6, 2025, 9 p.m. UTC | #4
On Mon, Jan 6, 2025 at 12:11 PM Akhil P Oommen <quic_akhilpo@quicinc.com> wrote:
>
> On 1/3/2025 1:00 AM, Akhil P Oommen wrote:
> > On 1/3/2025 12:02 AM, Rob Clark wrote:
> >> From: Rob Clark <robdclark@chromium.org>
> >>
> >> On mmu-500, stall-on-fault seems to stall all context banks, causing the
> >> GMU to misbehave.  So limit this feature to smmu-v2 for now.
> >>
> >> This fixes an issue with an older mesa bug taking out the system
> >> because of the GMU going off into the weeds.
> >>
> >> What we _think_ is happening is that, if the GPU generates 1000's of
> >> faults at ~once (which is something that GPUs can be good at), it can
> >> result in a sufficient number of stalled translations preventing other
> >> transactions from entering the same TBU.
> >>
> >> Signed-off-by: Rob Clark <robdclark@chromium.org>
> >
> > Reviewed-by: Akhil P Oommen <quic_akhilpo@quicinc.com>
> >
>
> Btw, if stall is not enabled, I think there is no point in capturing a
> coredump from the adreno pagefault handler. By the time we start the
> coredump, the GPU might have switched context.

Hmm, we do at least capture ttbr0 both in fault info and from the
current submit, so it would at least be possible to tell if you are
looking at the wrong context.

BR,
-R
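
For reference, a minimal sketch of the cross-check Rob describes:
comparing the TTBR0 reported at fault time against the TTBR0 programmed
for the submit believed to be running (struct adreno_smmu_fault_info and
its ttbr0 field are from include/linux/adreno-smmu-priv.h; the helper
name, the mask and the comparison policy are assumptions for
illustration, not the actual drm/msm code):

    #include <linux/adreno-smmu-priv.h>
    #include <linux/bits.h>

    /* Assumed layout: ASID in the upper TTBR0 bits, table address below. */
    #define EXAMPLE_TTBR0_BADDR_MASK GENMASK_ULL(47, 0)

    /*
     * Returns true when the faulting translation used the same page table
     * base as the submit we think was on the GPU, i.e. we are probably not
     * looking at the wrong context.
     */
    static bool example_fault_matches_submit(const struct adreno_smmu_fault_info *info,
                                             u64 submit_ttbr0)
    {
            return (info->ttbr0 & EXAMPLE_TTBR0_BADDR_MASK) ==
                   (submit_ttbr0 & EXAMPLE_TTBR0_BADDR_MASK);
    }
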
Will Deacon Jan. 7, 2025, 12:57 p.m. UTC | #5
On Thu, Jan 02, 2025 at 10:32:31AM -0800, Rob Clark wrote:
> From: Rob Clark <robdclark@chromium.org>
> 
> On mmu-500, stall-on-fault seems to stall all context banks, causing the
> GMU to misbehave.  So limit this feature to smmu-v2 for now.
> 
> This fixes an issue with an older mesa bug taking out the system
> because of the GMU going off into the weeds.
> 
> What we _think_ is happening is that, if the GPU generates 1000's of
> faults at ~once (which is something that GPUs can be good at), it can
> result in a sufficient number of stalled translations preventing other
> transactions from entering the same TBU.

MMU-500 is an implementation of the SMMUv2 architecture, so this feels
upside-down to me. That is, it should always be valid to probe with
the less specific "SMMUv2" compatible string (modulo hardware errata)
and be limited to the architectural behaviour.

So what is it about MMU-500 that means stalling doesn't work when compared
to any other SMMUv2 implementation?

Will
Rob Clark Jan. 7, 2025, 3:26 p.m. UTC | #6
On Tue, Jan 7, 2025 at 4:57 AM Will Deacon <will@kernel.org> wrote:
>
> On Thu, Jan 02, 2025 at 10:32:31AM -0800, Rob Clark wrote:
> > From: Rob Clark <robdclark@chromium.org>
> >
> > On mmu-500, stall-on-fault seems to stall all context banks, causing the
> > GMU to misbehave.  So limit this feature to smmu-v2 for now.
> >
> > This fixes an issue with an older mesa bug taking out the system
> > because of the GMU going off into the weeds.
> >
> > What we _think_ is happening is that, if the GPU generates 1000's of
> > faults at ~once (which is something that GPUs can be good at), it can
> > result in a sufficient number of stalled translations preventing other
> > transactions from entering the same TBU.
>
> MMU-500 is an implementation of the SMMUv2 architecture, so this feels
> upside-down to me. That is, it should always be valid to probe with
> the less specific "SMMUv2" compatible string (modulo hardware errata)
> and be limited to the architectural behaviour.

I should have been more specific and referred to qcom,smmu-v2

> So what is it about MMU-500 that means stalling doesn't work when compared
> to any other SMMUv2 implementation?

Well, I have a limited # of data points, in the sense that there
aren't too many a6xx devices prior to the switch to qcom,smmu-500...
but I have access to crash metrics for a lot of sc7180 devices
(qcom,smmu-v2), and I've been unable to find any signs of this sort of
stall-related issue.

So maybe I can't say 100% whether this is qcom,smmu-500 vs qcom,smmu-v2,
vs some other change in later gens that used qcom,smmu-500, or some
other factor, but I'm not sure what other conclusion to draw.

BR,
-R
Dmitry Baryshkov Jan. 7, 2025, 10:43 p.m. UTC | #7
On Tue, Jan 07, 2025 at 07:26:44AM -0800, Rob Clark wrote:
> On Tue, Jan 7, 2025 at 4:57 AM Will Deacon <will@kernel.org> wrote:
> >
> > On Thu, Jan 02, 2025 at 10:32:31AM -0800, Rob Clark wrote:
> > > From: Rob Clark <robdclark@chromium.org>
> > >
> > > On mmu-500, stall-on-fault seems to stall all context banks, causing the
> > > GMU to misbehave.  So limit this feature to smmu-v2 for now.
> > >
> > > This fixes an issue with an older mesa bug taking out the system
> > > because of the GMU going off into the weeds.
> > >
> > > What we _think_ is happening is that, if the GPU generates 1000's of
> > > faults at ~once (which is something that GPUs can be good at), it can
> > > result in a sufficient number of stalled translations preventing other
> > > transactions from entering the same TBU.
> >
> > MMU-500 is an implementation of the SMMUv2 architecture, so this feels
> > upside-down to me. That is, it should always be valid to probe with
> > the less specific "SMMUv2" compatible string (modulo hardware errata)
> > and be limited to the architectural behaviour.
> 
> I should have been more specific and referred to qcom,smmu-v2
> 
> > So what is it about MMU-500 that means stalling doesn't work when compared
> > to any other SMMUv2 implementation?
> 
> Well, I have a limited # of data points, in the sense that there
> aren't too many a6xx devices prior to the switch to qcom,smmu-500...
> but I have access to crash metrics for a lot of sc7180 devices
> (qcom,smmu-v2), and I've been unable to find any signs of this sort of
> stall-related issue.
> 
> So maybe I can't say 100% whether this is qcom,smmu-500 vs qcom,smmu-v2,
> vs some other change in later gens that used qcom,smmu-500, or some
> other factor, but I'm not sure what other conclusion to draw.

Might it be that v2 was actual hw, but mmu-500 is somehow virtualized?
If so, with these stalls we might be observing some kind of FW bug in
the hyp?

> 
> BR,
> -R

Patch

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
index 6372f3e25c4b..3239bbf18514 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
@@ -16,6 +16,10 @@ 
 
 #define QCOM_DUMMY_VAL	-1
 
+static int enable_stall = -1;
+MODULE_PARM_DESC(enable_stall, "Enable stall on iova fault (1=enable, 0=disable, -1=auto (default))");
+module_param(enable_stall, int, 0600);
+
 static struct qcom_smmu *to_qcom_smmu(struct arm_smmu_device *smmu)
 {
 	return container_of(smmu, struct qcom_smmu, smmu);
@@ -210,7 +214,9 @@  static bool qcom_adreno_can_do_ttbr1(struct arm_smmu_device *smmu)
 static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain,
 		struct io_pgtable_cfg *pgtbl_cfg, struct device *dev)
 {
+	const struct device_node *np = smmu_domain->smmu->dev->of_node;
 	struct adreno_smmu_priv *priv;
+	bool stall_enabled;
 
 	smmu_domain->cfg.flush_walk_prefer_tlbiasid = true;
 
@@ -237,8 +243,17 @@  static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain,
 	priv->get_ttbr1_cfg = qcom_adreno_smmu_get_ttbr1_cfg;
 	priv->set_ttbr0_cfg = qcom_adreno_smmu_set_ttbr0_cfg;
 	priv->get_fault_info = qcom_adreno_smmu_get_fault_info;
-	priv->set_stall = qcom_adreno_smmu_set_stall;
-	priv->resume_translation = qcom_adreno_smmu_resume_translation;
+
+	if (enable_stall < 0) {
+		stall_enabled = of_device_is_compatible(np, "qcom,smmu-v2");
+	} else {
+		stall_enabled = !!enable_stall;
+	}
+
+	if (stall_enabled) {
+		priv->set_stall = qcom_adreno_smmu_set_stall;
+		priv->resume_translation = qcom_adreno_smmu_resume_translation;
+	}
 
 	return 0;
 }