diff mbox

[v3,1/3] drm/radeon: take exclusive_lock in read mode during, ring tests, v3

Message ID 53F341A3.6060902@canonical.com (mailing list archive)
State New, archived
Headers show

Commit Message

Maarten Lankhorst Aug. 19, 2014, 12:22 p.m. UTC
This is needed for the next commit, because the lockup detection
will need the read lock to run.

Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>
---
radeon_pm_compute_clocks already checks if dpm is enabled, so no need to check a second time.

Because of locking and waiting stuff the radeon_pm_compute_clocks and resume_force_mode calls
have to be done with read lock held.

Seems to survive on my radeon when catting /sys/kernel/debug/dri/0/radeon_gpu_reset although
uvd fails to reset, and that ring gets disabled as a result.

 drivers/gpu/drm/radeon/radeon.h         |  2 +-
 drivers/gpu/drm/radeon/radeon_device.c  | 57 +++++++++++++++++++--------------
 drivers/gpu/drm/radeon/radeon_display.c |  4 ++-
 3 files changed, 37 insertions(+), 26 deletions(-)

Comments

Christian König Aug. 19, 2014, 1:06 p.m. UTC | #1
Am 19.08.2014 um 14:22 schrieb Maarten Lankhorst:
> This is needed for the next commit, because the lockup detection
> will need the read lock to run.
>
> Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>
> ---
> radeon_pm_compute_clocks already checks if dpm is enabled, so no need to check a second time.
>
> Because of locking and waiting stuff the radeon_pm_compute_clocks and resume_force_mode calls
> have to be done with read lock held.
>
> Seems to survive on my radeon when catting /sys/kernel/debug/dri/0/radeon_gpu_reset although
> uvd fails to reset, and that ring gets disabled as a result.

Depending on what hardware you have it's normal that UVD doesn't reset 
properly. I still haven't figured out the correct sequence in which I 
need to disable/enable the different UVD blocks on all hardware generations.

It seems to work fine on my Cayman, but doesn't for example on Turks 
(which at least theoretically should have the same UVD block). It should 
be fine as long as the engines gets properly disabled when the IB test 
fails after an reset.

Another common source of reset instability is DPM, while it now seems to 
be stable on NI and BTC I can't get a single reset to work once I use it.

Regarding the patch it looks good now, but I still want to test it a bit,
Christian.

>
>   drivers/gpu/drm/radeon/radeon.h         |  2 +-
>   drivers/gpu/drm/radeon/radeon_device.c  | 57 +++++++++++++++++++--------------
>   drivers/gpu/drm/radeon/radeon_display.c |  4 ++-
>   3 files changed, 37 insertions(+), 26 deletions(-)
>
> diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
> index b281886f6f51..9d97409c0443 100644
> --- a/drivers/gpu/drm/radeon/radeon.h
> +++ b/drivers/gpu/drm/radeon/radeon.h
> @@ -2315,7 +2315,7 @@ struct radeon_device {
>   	bool				need_dma32;
>   	bool				accel_working;
>   	bool				fastfb_working; /* IGP feature*/
> -	bool				needs_reset;
> +	bool				needs_reset, in_reset;
>   	struct radeon_surface_reg surface_regs[RADEON_GEM_MAX_SURFACES];
>   	const struct firmware *me_fw;	/* all family ME firmware */
>   	const struct firmware *pfp_fw;	/* r6/700 PFP firmware */
> diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
> index 6a219bcee66d..124d8994e02a 100644
> --- a/drivers/gpu/drm/radeon/radeon_device.c
> +++ b/drivers/gpu/drm/radeon/radeon_device.c
> @@ -1671,6 +1671,7 @@ int radeon_gpu_reset(struct radeon_device *rdev)
>   	down_write(&rdev->exclusive_lock);
>   
>   	if (!rdev->needs_reset) {
> +		WARN_ON(rdev->in_reset);
>   		up_write(&rdev->exclusive_lock);
>   		return 0;
>   	}
> @@ -1683,17 +1684,21 @@ int radeon_gpu_reset(struct radeon_device *rdev)
>   	radeon_suspend(rdev);
>   	radeon_hpd_fini(rdev);
>   
> -	for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> -		ring_sizes[i] = radeon_ring_backup(rdev, &rdev->ring[i],
> -						   &ring_data[i]);
> -		if (ring_sizes[i]) {
> -			saved = true;
> -			dev_info(rdev->dev, "Saved %d dwords of commands "
> -				 "on ring %d.\n", ring_sizes[i], i);
> +	if (!rdev->in_reset) {
> +		rdev->in_reset = true;
> +
> +		for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> +			ring_sizes[i] = radeon_ring_backup(rdev, &rdev->ring[i],
> +							   &ring_data[i]);
> +			if (ring_sizes[i]) {
> +				saved = true;
> +				dev_info(rdev->dev, "Saved %d dwords of commands "
> +					 "on ring %d.\n", ring_sizes[i], i);
> +			}
>   		}
> -	}
> +	} else
> +		memset(ring_data, 0, sizeof(ring_data));
>   
> -retry:
>   	r = radeon_asic_reset(rdev);
>   	if (!r) {
>   		dev_info(rdev->dev, "GPU reset succeeded, trying to resume\n");
> @@ -1709,16 +1714,6 @@ retry:
>   			ring_sizes[i] = 0;
>   			ring_data[i] = NULL;
>   		}
> -
> -		r = radeon_ib_ring_tests(rdev);
> -		if (r) {
> -			dev_err(rdev->dev, "ib ring test failed (%d).\n", r);
> -			if (saved) {
> -				saved = false;
> -				radeon_suspend(rdev);
> -				goto retry;
> -			}
> -		}
>   	} else {
>   		radeon_fence_driver_force_completion(rdev);
>   		for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> @@ -1728,8 +1723,7 @@ retry:
>   
>   	if ((rdev->pm.pm_method == PM_METHOD_DPM) && rdev->pm.dpm_enabled) {
>   		/* do dpm late init */
> -		r = radeon_pm_late_init(rdev);
> -		if (r) {
> +		if (radeon_pm_late_init(rdev)) {
>   			rdev->pm.dpm_enabled = false;
>   			DRM_ERROR("radeon_pm_late_init failed, disabling dpm\n");
>   		}
> @@ -1753,19 +1747,34 @@ retry:
>   	/* reset hpd state */
>   	radeon_hpd_init(rdev);
>   
> +	ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
> +	downgrade_write(&rdev->exclusive_lock);
> +
>   	drm_helper_resume_force_mode(rdev->ddev);
>   
>   	/* set the power state here in case we are a PX system or headless */
> -	if ((rdev->pm.pm_method == PM_METHOD_DPM) && rdev->pm.dpm_enabled)
> +	if ((rdev->pm.pm_method == PM_METHOD_DPM))
>   		radeon_pm_compute_clocks(rdev);
>   
> -	ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
> +	if (!r) {
> +		r = radeon_ib_ring_tests(rdev);
> +		if (r) {
> +			dev_err(rdev->dev, "ib ring test failed (%d).\n", r);
> +			if (saved) {
> +				rdev->needs_reset = true;
> +				up_read(&rdev->exclusive_lock);
> +				return -EAGAIN;
> +			}
> +		}
> +	}
> +
>   	if (r) {
>   		/* bad news, how to tell it to userspace ? */
>   		dev_info(rdev->dev, "GPU reset failed\n");
>   	}
>   
> -	up_write(&rdev->exclusive_lock);
> +	rdev->in_reset = false;
> +	up_read(&rdev->exclusive_lock);
>   	return r;
>   }
>   
> diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c
> index 3fdf87318069..bd0d687379ee 100644
> --- a/drivers/gpu/drm/radeon/radeon_display.c
> +++ b/drivers/gpu/drm/radeon/radeon_display.c
> @@ -405,7 +405,9 @@ static void radeon_flip_work_func(struct work_struct *__work)
>   		r = radeon_fence_wait(work->fence, false);
>   		if (r == -EDEADLK) {
>   			up_read(&rdev->exclusive_lock);
> -			r = radeon_gpu_reset(rdev);
> +			do {
> +				r = radeon_gpu_reset(rdev);
> +			} while (r == -EAGAIN);
>   			down_read(&rdev->exclusive_lock);
>   		}
>   		if (r)
Maarten Lankhorst Aug. 19, 2014, 1:57 p.m. UTC | #2
Op 19-08-14 om 15:06 schreef Christian König:
> Am 19.08.2014 um 14:22 schrieb Maarten Lankhorst:
>> This is needed for the next commit, because the lockup detection
>> will need the read lock to run.
>>
>> Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>
>> ---
>> radeon_pm_compute_clocks already checks if dpm is enabled, so no need to check a second time.
>>
>> Because of locking and waiting stuff the radeon_pm_compute_clocks and resume_force_mode calls
>> have to be done with read lock held.
>>
>> Seems to survive on my radeon when catting /sys/kernel/debug/dri/0/radeon_gpu_reset although
>> uvd fails to reset, and that ring gets disabled as a result.
>
> Depending on what hardware you have it's normal that UVD doesn't reset properly. I still haven't figured out the correct sequence in which I need to disable/enable the different UVD blocks on all hardware generations.
>
> It seems to work fine on my Cayman, but doesn't for example on Turks (which at least theoretically should have the same UVD block). It should be fine as long as the engines gets properly disabled when the IB test fails after an reset.
>
> Another common source of reset instability is DPM, while it now seems to be stable on NI and BTC I can't get a single reset to work once I use it.
Ah, in my testing I've hit both (on redwood). :-) But with DPM failures far less frequent than UVD failures, which trigger consistently.

Turks had some issues before, but my laptop completely fails early during boot with 3.17-rc1 so I can't test it.

~Maarten
diff mbox

Patch

diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index b281886f6f51..9d97409c0443 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -2315,7 +2315,7 @@  struct radeon_device {
 	bool				need_dma32;
 	bool				accel_working;
 	bool				fastfb_working; /* IGP feature*/
-	bool				needs_reset;
+	bool				needs_reset, in_reset;
 	struct radeon_surface_reg surface_regs[RADEON_GEM_MAX_SURFACES];
 	const struct firmware *me_fw;	/* all family ME firmware */
 	const struct firmware *pfp_fw;	/* r6/700 PFP firmware */
diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
index 6a219bcee66d..124d8994e02a 100644
--- a/drivers/gpu/drm/radeon/radeon_device.c
+++ b/drivers/gpu/drm/radeon/radeon_device.c
@@ -1671,6 +1671,7 @@  int radeon_gpu_reset(struct radeon_device *rdev)
 	down_write(&rdev->exclusive_lock);
 
 	if (!rdev->needs_reset) {
+		WARN_ON(rdev->in_reset);
 		up_write(&rdev->exclusive_lock);
 		return 0;
 	}
@@ -1683,17 +1684,21 @@  int radeon_gpu_reset(struct radeon_device *rdev)
 	radeon_suspend(rdev);
 	radeon_hpd_fini(rdev);
 
-	for (i = 0; i < RADEON_NUM_RINGS; ++i) {
-		ring_sizes[i] = radeon_ring_backup(rdev, &rdev->ring[i],
-						   &ring_data[i]);
-		if (ring_sizes[i]) {
-			saved = true;
-			dev_info(rdev->dev, "Saved %d dwords of commands "
-				 "on ring %d.\n", ring_sizes[i], i);
+	if (!rdev->in_reset) {
+		rdev->in_reset = true;
+
+		for (i = 0; i < RADEON_NUM_RINGS; ++i) {
+			ring_sizes[i] = radeon_ring_backup(rdev, &rdev->ring[i],
+							   &ring_data[i]);
+			if (ring_sizes[i]) {
+				saved = true;
+				dev_info(rdev->dev, "Saved %d dwords of commands "
+					 "on ring %d.\n", ring_sizes[i], i);
+			}
 		}
-	}
+	} else
+		memset(ring_data, 0, sizeof(ring_data));
 
-retry:
 	r = radeon_asic_reset(rdev);
 	if (!r) {
 		dev_info(rdev->dev, "GPU reset succeeded, trying to resume\n");
@@ -1709,16 +1714,6 @@  retry:
 			ring_sizes[i] = 0;
 			ring_data[i] = NULL;
 		}
-
-		r = radeon_ib_ring_tests(rdev);
-		if (r) {
-			dev_err(rdev->dev, "ib ring test failed (%d).\n", r);
-			if (saved) {
-				saved = false;
-				radeon_suspend(rdev);
-				goto retry;
-			}
-		}
 	} else {
 		radeon_fence_driver_force_completion(rdev);
 		for (i = 0; i < RADEON_NUM_RINGS; ++i) {
@@ -1728,8 +1723,7 @@  retry:
 
 	if ((rdev->pm.pm_method == PM_METHOD_DPM) && rdev->pm.dpm_enabled) {
 		/* do dpm late init */
-		r = radeon_pm_late_init(rdev);
-		if (r) {
+		if (radeon_pm_late_init(rdev)) {
 			rdev->pm.dpm_enabled = false;
 			DRM_ERROR("radeon_pm_late_init failed, disabling dpm\n");
 		}
@@ -1753,19 +1747,34 @@  retry:
 	/* reset hpd state */
 	radeon_hpd_init(rdev);
 
+	ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
+	downgrade_write(&rdev->exclusive_lock);
+
 	drm_helper_resume_force_mode(rdev->ddev);
 
 	/* set the power state here in case we are a PX system or headless */
-	if ((rdev->pm.pm_method == PM_METHOD_DPM) && rdev->pm.dpm_enabled)
+	if ((rdev->pm.pm_method == PM_METHOD_DPM))
 		radeon_pm_compute_clocks(rdev);
 
-	ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
+	if (!r) {
+		r = radeon_ib_ring_tests(rdev);
+		if (r) {
+			dev_err(rdev->dev, "ib ring test failed (%d).\n", r);
+			if (saved) {
+				rdev->needs_reset = true;
+				up_read(&rdev->exclusive_lock);
+				return -EAGAIN;
+			}
+		}
+	}
+
 	if (r) {
 		/* bad news, how to tell it to userspace ? */
 		dev_info(rdev->dev, "GPU reset failed\n");
 	}
 
-	up_write(&rdev->exclusive_lock);
+	rdev->in_reset = false;
+	up_read(&rdev->exclusive_lock);
 	return r;
 }
 
diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c
index 3fdf87318069..bd0d687379ee 100644
--- a/drivers/gpu/drm/radeon/radeon_display.c
+++ b/drivers/gpu/drm/radeon/radeon_display.c
@@ -405,7 +405,9 @@  static void radeon_flip_work_func(struct work_struct *__work)
 		r = radeon_fence_wait(work->fence, false);
 		if (r == -EDEADLK) {
 			up_read(&rdev->exclusive_lock);
-			r = radeon_gpu_reset(rdev);
+			do {
+				r = radeon_gpu_reset(rdev);
+			} while (r == -EAGAIN);
 			down_read(&rdev->exclusive_lock);
 		}
 		if (r)