diff mbox series

[32/35] drm/amdgpu: enable retry fault wptr overflow

Message ID 20210107030127.20393-33-Felix.Kuehling@amd.com (mailing list archive)
State New, archived
Headers show
Series Add HMM-based SVM memory manager to KFD | expand

Commit Message

Felix Kuehling Jan. 7, 2021, 3:01 a.m. UTC
From: Philip Yang <Philip.Yang@amd.com>

If xnack is on, VM retry fault interrupts are sent to IH ring1, and ring1
fills up quickly. IH then cannot receive other interrupts, which causes a
deadlock if the driver migrates a buffer using SDMA and waits for SDMA to
finish while handling a retry fault.

Remove VMC from the IH storm client list and enable ring1 write pointer
overflow, so that IH drops retry fault interrupts and can still receive
other interrupts while the driver is handling a retry fault.

The IH ring1 write pointer is not written back to memory by IH, and the
ring1 write pointer recorded by the self-irq is not updated, so always
read the latest ring1 write pointer from the register.

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/vega10_ih.c | 32 +++++++++-----------------
 drivers/gpu/drm/amd/amdgpu/vega20_ih.c | 32 +++++++++-----------------
 2 files changed, 22 insertions(+), 42 deletions(-)

Comments

Christian König Jan. 7, 2021, 11:01 a.m. UTC | #1
Am 07.01.21 um 04:01 schrieb Felix Kuehling:
> From: Philip Yang <Philip.Yang@amd.com>
>
> If xnack is on, VM retry fault interrupt send to IH ring1, and ring1
> will be full quickly. IH cannot receive other interrupts, this causes
> deadlock if migrating buffer using sdma and waiting for sdma done while
> handling retry fault.
>
> Remove VMC from IH storm client, enable ring1 write pointer overflow,
> then IH will drop retry fault interrupts and be able to receive other
> interrupts while driver is handling retry fault.
>
> IH ring1 write pointer doesn't writeback to memory by IH, and ring1
> write pointer recorded by self-irq is not updated, so always read
> the latest ring1 write pointer from register.
>
> Signed-off-by: Philip Yang <Philip.Yang@amd.com>
> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>

Reviewed-by: Christian König <christian.koenig@amd.com>

> ---
>   drivers/gpu/drm/amd/amdgpu/vega10_ih.c | 32 +++++++++-----------------
>   drivers/gpu/drm/amd/amdgpu/vega20_ih.c | 32 +++++++++-----------------
>   2 files changed, 22 insertions(+), 42 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> index 88626d83e07b..ca8efa5c6978 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> @@ -220,10 +220,8 @@ static int vega10_ih_enable_ring(struct amdgpu_device *adev,
>   	tmp = vega10_ih_rb_cntl(ih, tmp);
>   	if (ih == &adev->irq.ih)
>   		tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, RPTR_REARM, !!adev->irq.msi_enabled);
> -	if (ih == &adev->irq.ih1) {
> -		tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_ENABLE, 0);
> +	if (ih == &adev->irq.ih1)
>   		tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, RB_FULL_DRAIN_ENABLE, 1);
> -	}
>   	if (amdgpu_sriov_vf(adev)) {
>   		if (psp_reg_program(&adev->psp, ih_regs->psp_reg_id, tmp)) {
>   			dev_err(adev->dev, "PSP program IH_RB_CNTL failed!\n");
> @@ -265,7 +263,6 @@ static int vega10_ih_irq_init(struct amdgpu_device *adev)
>   	u32 ih_chicken;
>   	int ret;
>   	int i;
> -	u32 tmp;
>   
>   	/* disable irqs */
>   	ret = vega10_ih_toggle_interrupts(adev, false);
> @@ -291,15 +288,6 @@ static int vega10_ih_irq_init(struct amdgpu_device *adev)
>   		}
>   	}
>   
> -	tmp = RREG32_SOC15(OSSSYS, 0, mmIH_STORM_CLIENT_LIST_CNTL);
> -	tmp = REG_SET_FIELD(tmp, IH_STORM_CLIENT_LIST_CNTL,
> -			    CLIENT18_IS_STORM_CLIENT, 1);
> -	WREG32_SOC15(OSSSYS, 0, mmIH_STORM_CLIENT_LIST_CNTL, tmp);
> -
> -	tmp = RREG32_SOC15(OSSSYS, 0, mmIH_INT_FLOOD_CNTL);
> -	tmp = REG_SET_FIELD(tmp, IH_INT_FLOOD_CNTL, FLOOD_CNTL_ENABLE, 1);
> -	WREG32_SOC15(OSSSYS, 0, mmIH_INT_FLOOD_CNTL, tmp);
> -
>   	pci_set_master(adev->pdev);
>   
>   	/* enable interrupts */
> @@ -345,11 +333,17 @@ static u32 vega10_ih_get_wptr(struct amdgpu_device *adev,
>   	u32 wptr, tmp;
>   	struct amdgpu_ih_regs *ih_regs;
>   
> -	wptr = le32_to_cpu(*ih->wptr_cpu);
> -	ih_regs = &ih->ih_regs;
> +	if (ih == &adev->irq.ih) {
> +		/* Only ring0 supports writeback. On other rings fall back
> +		 * to register-based code with overflow checking below.
> +		 */
> +		wptr = le32_to_cpu(*ih->wptr_cpu);
>   
> -	if (!REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW))
> -		goto out;
> +		if (!REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW))
> +			goto out;
> +	}
> +
> +	ih_regs = &ih->ih_regs;
>   
>   	/* Double check that the overflow wasn't already cleared. */
>   	wptr = RREG32_NO_KIQ(ih_regs->ih_rb_wptr);
> @@ -440,15 +434,11 @@ static int vega10_ih_self_irq(struct amdgpu_device *adev,
>   			      struct amdgpu_irq_src *source,
>   			      struct amdgpu_iv_entry *entry)
>   {
> -	uint32_t wptr = cpu_to_le32(entry->src_data[0]);
> -
>   	switch (entry->ring_id) {
>   	case 1:
> -		*adev->irq.ih1.wptr_cpu = wptr;
>   		schedule_work(&adev->irq.ih1_work);
>   		break;
>   	case 2:
> -		*adev->irq.ih2.wptr_cpu = wptr;
>   		schedule_work(&adev->irq.ih2_work);
>   		break;
>   	default: break;
> diff --git a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
> index 42032ca380cc..60d1bd51781e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
> @@ -220,10 +220,8 @@ static int vega20_ih_enable_ring(struct amdgpu_device *adev,
>   	tmp = vega20_ih_rb_cntl(ih, tmp);
>   	if (ih == &adev->irq.ih)
>   		tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, RPTR_REARM, !!adev->irq.msi_enabled);
> -	if (ih == &adev->irq.ih1) {
> -		tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_ENABLE, 0);
> +	if (ih == &adev->irq.ih1)
>   		tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, RB_FULL_DRAIN_ENABLE, 1);
> -	}
>   	if (amdgpu_sriov_vf(adev)) {
>   		if (psp_reg_program(&adev->psp, ih_regs->psp_reg_id, tmp)) {
>   			dev_err(adev->dev, "PSP program IH_RB_CNTL failed!\n");
> @@ -297,7 +295,6 @@ static int vega20_ih_irq_init(struct amdgpu_device *adev)
>   	u32 ih_chicken;
>   	int ret;
>   	int i;
> -	u32 tmp;
>   
>   	/* disable irqs */
>   	ret = vega20_ih_toggle_interrupts(adev, false);
> @@ -326,15 +323,6 @@ static int vega20_ih_irq_init(struct amdgpu_device *adev)
>   		}
>   	}
>   
> -	tmp = RREG32_SOC15(OSSSYS, 0, mmIH_STORM_CLIENT_LIST_CNTL);
> -	tmp = REG_SET_FIELD(tmp, IH_STORM_CLIENT_LIST_CNTL,
> -			    CLIENT18_IS_STORM_CLIENT, 1);
> -	WREG32_SOC15(OSSSYS, 0, mmIH_STORM_CLIENT_LIST_CNTL, tmp);
> -
> -	tmp = RREG32_SOC15(OSSSYS, 0, mmIH_INT_FLOOD_CNTL);
> -	tmp = REG_SET_FIELD(tmp, IH_INT_FLOOD_CNTL, FLOOD_CNTL_ENABLE, 1);
> -	WREG32_SOC15(OSSSYS, 0, mmIH_INT_FLOOD_CNTL, tmp);
> -
>   	pci_set_master(adev->pdev);
>   
>   	/* enable interrupts */
> @@ -379,11 +367,17 @@ static u32 vega20_ih_get_wptr(struct amdgpu_device *adev,
>   	u32 wptr, tmp;
>   	struct amdgpu_ih_regs *ih_regs;
>   
> -	wptr = le32_to_cpu(*ih->wptr_cpu);
> -	ih_regs = &ih->ih_regs;
> +	if (ih == &adev->irq.ih) {
> +		/* Only ring0 supports writeback. On other rings fall back
> +		 * to register-based code with overflow checking below.
> +		 */
> +		wptr = le32_to_cpu(*ih->wptr_cpu);
>   
> -	if (!REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW))
> -		goto out;
> +		if (!REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW))
> +			goto out;
> +	}
> +
> +	ih_regs = &ih->ih_regs;
>   
>   	/* Double check that the overflow wasn't already cleared. */
>   	wptr = RREG32_NO_KIQ(ih_regs->ih_rb_wptr);
> @@ -473,15 +467,11 @@ static int vega20_ih_self_irq(struct amdgpu_device *adev,
>   			      struct amdgpu_irq_src *source,
>   			      struct amdgpu_iv_entry *entry)
>   {
> -	uint32_t wptr = cpu_to_le32(entry->src_data[0]);
> -
>   	switch (entry->ring_id) {
>   	case 1:
> -		*adev->irq.ih1.wptr_cpu = wptr;
>   		schedule_work(&adev->irq.ih1_work);
>   		break;
>   	case 2:
> -		*adev->irq.ih2.wptr_cpu = wptr;
>   		schedule_work(&adev->irq.ih2_work);
>   		break;
>   	default: break;
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
index 88626d83e07b..ca8efa5c6978 100644
--- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
@@ -220,10 +220,8 @@  static int vega10_ih_enable_ring(struct amdgpu_device *adev,
 	tmp = vega10_ih_rb_cntl(ih, tmp);
 	if (ih == &adev->irq.ih)
 		tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, RPTR_REARM, !!adev->irq.msi_enabled);
-	if (ih == &adev->irq.ih1) {
-		tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_ENABLE, 0);
+	if (ih == &adev->irq.ih1)
 		tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, RB_FULL_DRAIN_ENABLE, 1);
-	}
 	if (amdgpu_sriov_vf(adev)) {
 		if (psp_reg_program(&adev->psp, ih_regs->psp_reg_id, tmp)) {
 			dev_err(adev->dev, "PSP program IH_RB_CNTL failed!\n");
@@ -265,7 +263,6 @@  static int vega10_ih_irq_init(struct amdgpu_device *adev)
 	u32 ih_chicken;
 	int ret;
 	int i;
-	u32 tmp;
 
 	/* disable irqs */
 	ret = vega10_ih_toggle_interrupts(adev, false);
@@ -291,15 +288,6 @@  static int vega10_ih_irq_init(struct amdgpu_device *adev)
 		}
 	}
 
-	tmp = RREG32_SOC15(OSSSYS, 0, mmIH_STORM_CLIENT_LIST_CNTL);
-	tmp = REG_SET_FIELD(tmp, IH_STORM_CLIENT_LIST_CNTL,
-			    CLIENT18_IS_STORM_CLIENT, 1);
-	WREG32_SOC15(OSSSYS, 0, mmIH_STORM_CLIENT_LIST_CNTL, tmp);
-
-	tmp = RREG32_SOC15(OSSSYS, 0, mmIH_INT_FLOOD_CNTL);
-	tmp = REG_SET_FIELD(tmp, IH_INT_FLOOD_CNTL, FLOOD_CNTL_ENABLE, 1);
-	WREG32_SOC15(OSSSYS, 0, mmIH_INT_FLOOD_CNTL, tmp);
-
 	pci_set_master(adev->pdev);
 
 	/* enable interrupts */
@@ -345,11 +333,17 @@  static u32 vega10_ih_get_wptr(struct amdgpu_device *adev,
 	u32 wptr, tmp;
 	struct amdgpu_ih_regs *ih_regs;
 
-	wptr = le32_to_cpu(*ih->wptr_cpu);
-	ih_regs = &ih->ih_regs;
+	if (ih == &adev->irq.ih) {
+		/* Only ring0 supports writeback. On other rings fall back
+		 * to register-based code with overflow checking below.
+		 */
+		wptr = le32_to_cpu(*ih->wptr_cpu);
 
-	if (!REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW))
-		goto out;
+		if (!REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW))
+			goto out;
+	}
+
+	ih_regs = &ih->ih_regs;
 
 	/* Double check that the overflow wasn't already cleared. */
 	wptr = RREG32_NO_KIQ(ih_regs->ih_rb_wptr);
@@ -440,15 +434,11 @@  static int vega10_ih_self_irq(struct amdgpu_device *adev,
 			      struct amdgpu_irq_src *source,
 			      struct amdgpu_iv_entry *entry)
 {
-	uint32_t wptr = cpu_to_le32(entry->src_data[0]);
-
 	switch (entry->ring_id) {
 	case 1:
-		*adev->irq.ih1.wptr_cpu = wptr;
 		schedule_work(&adev->irq.ih1_work);
 		break;
 	case 2:
-		*adev->irq.ih2.wptr_cpu = wptr;
 		schedule_work(&adev->irq.ih2_work);
 		break;
 	default: break;
diff --git a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
index 42032ca380cc..60d1bd51781e 100644
--- a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
@@ -220,10 +220,8 @@  static int vega20_ih_enable_ring(struct amdgpu_device *adev,
 	tmp = vega20_ih_rb_cntl(ih, tmp);
 	if (ih == &adev->irq.ih)
 		tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, RPTR_REARM, !!adev->irq.msi_enabled);
-	if (ih == &adev->irq.ih1) {
-		tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_ENABLE, 0);
+	if (ih == &adev->irq.ih1)
 		tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, RB_FULL_DRAIN_ENABLE, 1);
-	}
 	if (amdgpu_sriov_vf(adev)) {
 		if (psp_reg_program(&adev->psp, ih_regs->psp_reg_id, tmp)) {
 			dev_err(adev->dev, "PSP program IH_RB_CNTL failed!\n");
@@ -297,7 +295,6 @@  static int vega20_ih_irq_init(struct amdgpu_device *adev)
 	u32 ih_chicken;
 	int ret;
 	int i;
-	u32 tmp;
 
 	/* disable irqs */
 	ret = vega20_ih_toggle_interrupts(adev, false);
@@ -326,15 +323,6 @@  static int vega20_ih_irq_init(struct amdgpu_device *adev)
 		}
 	}
 
-	tmp = RREG32_SOC15(OSSSYS, 0, mmIH_STORM_CLIENT_LIST_CNTL);
-	tmp = REG_SET_FIELD(tmp, IH_STORM_CLIENT_LIST_CNTL,
-			    CLIENT18_IS_STORM_CLIENT, 1);
-	WREG32_SOC15(OSSSYS, 0, mmIH_STORM_CLIENT_LIST_CNTL, tmp);
-
-	tmp = RREG32_SOC15(OSSSYS, 0, mmIH_INT_FLOOD_CNTL);
-	tmp = REG_SET_FIELD(tmp, IH_INT_FLOOD_CNTL, FLOOD_CNTL_ENABLE, 1);
-	WREG32_SOC15(OSSSYS, 0, mmIH_INT_FLOOD_CNTL, tmp);
-
 	pci_set_master(adev->pdev);
 
 	/* enable interrupts */
@@ -379,11 +367,17 @@  static u32 vega20_ih_get_wptr(struct amdgpu_device *adev,
 	u32 wptr, tmp;
 	struct amdgpu_ih_regs *ih_regs;
 
-	wptr = le32_to_cpu(*ih->wptr_cpu);
-	ih_regs = &ih->ih_regs;
+	if (ih == &adev->irq.ih) {
+		/* Only ring0 supports writeback. On other rings fall back
+		 * to register-based code with overflow checking below.
+		 */
+		wptr = le32_to_cpu(*ih->wptr_cpu);
 
-	if (!REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW))
-		goto out;
+		if (!REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW))
+			goto out;
+	}
+
+	ih_regs = &ih->ih_regs;
 
 	/* Double check that the overflow wasn't already cleared. */
 	wptr = RREG32_NO_KIQ(ih_regs->ih_rb_wptr);
@@ -473,15 +467,11 @@  static int vega20_ih_self_irq(struct amdgpu_device *adev,
 			      struct amdgpu_irq_src *source,
 			      struct amdgpu_iv_entry *entry)
 {
-	uint32_t wptr = cpu_to_le32(entry->src_data[0]);
-
 	switch (entry->ring_id) {
 	case 1:
-		*adev->irq.ih1.wptr_cpu = wptr;
 		schedule_work(&adev->irq.ih1_work);
 		break;
 	case 2:
-		*adev->irq.ih2.wptr_cpu = wptr;
 		schedule_work(&adev->irq.ih2_work);
 		break;
 	default: break;