Message ID | 1355760272-18402-1-git-send-email-j.glisse@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 17.12.2012 17:04, j.glisse@gmail.com wrote: > From: Jerome Glisse <jglisse@redhat.com> > > radeon_fence_wait_empty_locked should not trigger GPU reset as no > place where it's called from would benefit from such thing and it > actually leads to a kernel deadlock in case the reset is triggered > from pm codepath. Instead force ring completion in place where it > makes sense or return early in others. > > Signed-off-by: Jerome Glisse <jglisse@redhat.com> I wanted to stop losing GPU reset signals by moving the reset into radeon_fence_wait_empty_locked, but it's true that in most cases it doesn't make much sense (suspend/finish) and I didn't know that it could cause a hang in the PM code. We should probably fix the PM code to properly signal this condition to its caller and reset the GPU when it is possible to do so, but fixing the deadlock is of course more important. Also should probably go into the stable kernel as well, but anyway it is: Reviewed-by: Christian König <christian.koenig@amd.com> > --- > drivers/gpu/drm/radeon/radeon.h | 2 +- > drivers/gpu/drm/radeon/radeon_device.c | 13 +++++++++++-- > drivers/gpu/drm/radeon/radeon_fence.c | 30 ++++++++++++++---------------- > drivers/gpu/drm/radeon/radeon_pm.c | 15 ++++++++++++--- > 4 files changed, 38 insertions(+), 22 deletions(-) > > diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h > index 9c7625c..071b2d7 100644 > --- a/drivers/gpu/drm/radeon/radeon.h > +++ b/drivers/gpu/drm/radeon/radeon.h > @@ -231,7 +231,7 @@ void radeon_fence_process(struct radeon_device *rdev, int ring); > bool radeon_fence_signaled(struct radeon_fence *fence); > int radeon_fence_wait(struct radeon_fence *fence, bool interruptible); > int radeon_fence_wait_next_locked(struct radeon_device *rdev, int ring); > -void radeon_fence_wait_empty_locked(struct radeon_device *rdev, int ring); > +int radeon_fence_wait_empty_locked(struct radeon_device *rdev, int ring); > int radeon_fence_wait_any(struct radeon_device 
*rdev, > struct radeon_fence **fences, > bool intr); > diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c > index 774fae7..53a9223 100644 > --- a/drivers/gpu/drm/radeon/radeon_device.c > +++ b/drivers/gpu/drm/radeon/radeon_device.c > @@ -1163,6 +1163,7 @@ int radeon_suspend_kms(struct drm_device *dev, pm_message_t state) > struct drm_crtc *crtc; > struct drm_connector *connector; > int i, r; > + bool force_completion = false; > > if (dev == NULL || dev->dev_private == NULL) { > return -ENODEV; > @@ -1205,8 +1206,16 @@ int radeon_suspend_kms(struct drm_device *dev, pm_message_t state) > > mutex_lock(&rdev->ring_lock); > /* wait for gpu to finish processing current batch */ > - for (i = 0; i < RADEON_NUM_RINGS; i++) > - radeon_fence_wait_empty_locked(rdev, i); > + for (i = 0; i < RADEON_NUM_RINGS; i++) { > + r = radeon_fence_wait_empty_locked(rdev, i); > + if (r) { > + /* delay GPU reset to resume */ > + force_completion = true; > + } > + } > + if (force_completion) { > + radeon_fence_driver_force_completion(rdev); > + } > mutex_unlock(&rdev->ring_lock); > > radeon_save_bios_scratch_regs(rdev); > diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c > index bf7b20e..28c09b6 100644 > --- a/drivers/gpu/drm/radeon/radeon_fence.c > +++ b/drivers/gpu/drm/radeon/radeon_fence.c > @@ -609,26 +609,20 @@ int radeon_fence_wait_next_locked(struct radeon_device *rdev, int ring) > * Returns 0 if the fences have passed, error for all other cases. > * Caller must hold ring lock. 
> */ > -void radeon_fence_wait_empty_locked(struct radeon_device *rdev, int ring) > +int radeon_fence_wait_empty_locked(struct radeon_device *rdev, int ring) > { > uint64_t seq = rdev->fence_drv[ring].sync_seq[ring]; > + int r; > > - while(1) { > - int r; > - r = radeon_fence_wait_seq(rdev, seq, ring, false, false); > + r = radeon_fence_wait_seq(rdev, seq, ring, false, false); > + if (r) { > if (r == -EDEADLK) { > - mutex_unlock(&rdev->ring_lock); > - r = radeon_gpu_reset(rdev); > - mutex_lock(&rdev->ring_lock); > - if (!r) > - continue; > - } > - if (r) { > - dev_err(rdev->dev, "error waiting for ring to become" > - " idle (%d)\n", r); > + return -EDEADLK; > } > - return; > + dev_err(rdev->dev, "error waiting for ring[%d] to become idle (%d)\n", > + ring, r); > } > + return 0; > } > > /** > @@ -854,13 +848,17 @@ int radeon_fence_driver_init(struct radeon_device *rdev) > */ > void radeon_fence_driver_fini(struct radeon_device *rdev) > { > - int ring; > + int ring, r; > > mutex_lock(&rdev->ring_lock); > for (ring = 0; ring < RADEON_NUM_RINGS; ring++) { > if (!rdev->fence_drv[ring].initialized) > continue; > - radeon_fence_wait_empty_locked(rdev, ring); > + r = radeon_fence_wait_empty_locked(rdev, ring); > + if (r) { > + /* no need to trigger GPU reset as we are unloading */ > + radeon_fence_driver_force_completion(rdev); > + } > wake_up_all(&rdev->fence_queue); > radeon_scratch_free(rdev, rdev->fence_drv[ring].scratch_reg); > rdev->fence_drv[ring].initialized = false; > diff --git a/drivers/gpu/drm/radeon/radeon_pm.c b/drivers/gpu/drm/radeon/radeon_pm.c > index aa14dbb..0bfa656 100644 > --- a/drivers/gpu/drm/radeon/radeon_pm.c > +++ b/drivers/gpu/drm/radeon/radeon_pm.c > @@ -234,7 +234,7 @@ static void radeon_set_power_state(struct radeon_device *rdev) > > static void radeon_pm_set_clocks(struct radeon_device *rdev) > { > - int i; > + int i, r; > > /* no need to take locks, etc. 
if nothing's going to change */ > if ((rdev->pm.requested_clock_mode_index == rdev->pm.current_clock_mode_index) && > @@ -248,8 +248,17 @@ static void radeon_pm_set_clocks(struct radeon_device *rdev) > /* wait for the rings to drain */ > for (i = 0; i < RADEON_NUM_RINGS; i++) { > struct radeon_ring *ring = &rdev->ring[i]; > - if (ring->ready) > - radeon_fence_wait_empty_locked(rdev, i); > + if (!ring->ready) { > + continue; > + } > + r = radeon_fence_wait_empty_locked(rdev, i); > + if (r) { > + /* needs a GPU reset dont reset here */ > + mutex_unlock(&rdev->ring_lock); > + up_write(&rdev->pm.mclk_lock); > + mutex_unlock(&rdev->ddev->struct_mutex); > + return; > + } > } > > radeon_unmap_vram_bos(rdev);
Added to fixes and stable. Thanks! On Mon, Dec 17, 2012 at 11:41 AM, Christian König <deathsimple@vodafone.de> wrote: > On 17.12.2012 17:04, j.glisse@gmail.com wrote: >> >> From: Jerome Glisse <jglisse@redhat.com> >> >> radeon_fence_wait_empty_locked should not trigger GPU reset as no >> place where it's call from would benefit from such thing and it >> actually lead to a kernel deadlock in case the reset is triggered >> from pm codepath. Instead force ring completion in place where it >> makes sense or return early in others. >> >> Signed-off-by: Jerome Glisse <jglisse@redhat.com> > > > I wanted to stop losing GPU reset signals by moving the reset into > radeon_fence_wait_empty locked, but it's true that in most cases it doesn't > make much sense (suspend/finish) and I didn't know that it could cause a > hang in the PM code. > > We should probably fix the PM code to properly signal this condition to it's > caller and reset the GPU when it is possible to do so, but fixing the > deadlock is of course more important. 
> > Also should probably go into the stable kernel as well, but anyway it is: > > Reviewed-by: Christian König <christian.koenig@amd.com> > > >> --- >> drivers/gpu/drm/radeon/radeon.h | 2 +- >> drivers/gpu/drm/radeon/radeon_device.c | 13 +++++++++++-- >> drivers/gpu/drm/radeon/radeon_fence.c | 30 >> ++++++++++++++---------------- >> drivers/gpu/drm/radeon/radeon_pm.c | 15 ++++++++++++--- >> 4 files changed, 38 insertions(+), 22 deletions(-) >> >> diff --git a/drivers/gpu/drm/radeon/radeon.h >> b/drivers/gpu/drm/radeon/radeon.h >> index 9c7625c..071b2d7 100644 >> --- a/drivers/gpu/drm/radeon/radeon.h >> +++ b/drivers/gpu/drm/radeon/radeon.h >> @@ -231,7 +231,7 @@ void radeon_fence_process(struct radeon_device *rdev, >> int ring); >> bool radeon_fence_signaled(struct radeon_fence *fence); >> int radeon_fence_wait(struct radeon_fence *fence, bool interruptible); >> int radeon_fence_wait_next_locked(struct radeon_device *rdev, int ring); >> -void radeon_fence_wait_empty_locked(struct radeon_device *rdev, int >> ring); >> +int radeon_fence_wait_empty_locked(struct radeon_device *rdev, int ring); >> int radeon_fence_wait_any(struct radeon_device *rdev, >> struct radeon_fence **fences, >> bool intr); >> diff --git a/drivers/gpu/drm/radeon/radeon_device.c >> b/drivers/gpu/drm/radeon/radeon_device.c >> index 774fae7..53a9223 100644 >> --- a/drivers/gpu/drm/radeon/radeon_device.c >> +++ b/drivers/gpu/drm/radeon/radeon_device.c >> @@ -1163,6 +1163,7 @@ int radeon_suspend_kms(struct drm_device *dev, >> pm_message_t state) >> struct drm_crtc *crtc; >> struct drm_connector *connector; >> int i, r; >> + bool force_completion = false; >> if (dev == NULL || dev->dev_private == NULL) { >> return -ENODEV; >> @@ -1205,8 +1206,16 @@ int radeon_suspend_kms(struct drm_device *dev, >> pm_message_t state) >> mutex_lock(&rdev->ring_lock); >> /* wait for gpu to finish processing current batch */ >> - for (i = 0; i < RADEON_NUM_RINGS; i++) >> - radeon_fence_wait_empty_locked(rdev, i); >> + 
for (i = 0; i < RADEON_NUM_RINGS; i++) { >> + r = radeon_fence_wait_empty_locked(rdev, i); >> + if (r) { >> + /* delay GPU reset to resume */ >> + force_completion = true; >> + } >> + } >> + if (force_completion) { >> + radeon_fence_driver_force_completion(rdev); >> + } >> mutex_unlock(&rdev->ring_lock); >> radeon_save_bios_scratch_regs(rdev); >> diff --git a/drivers/gpu/drm/radeon/radeon_fence.c >> b/drivers/gpu/drm/radeon/radeon_fence.c >> index bf7b20e..28c09b6 100644 >> --- a/drivers/gpu/drm/radeon/radeon_fence.c >> +++ b/drivers/gpu/drm/radeon/radeon_fence.c >> @@ -609,26 +609,20 @@ int radeon_fence_wait_next_locked(struct >> radeon_device *rdev, int ring) >> * Returns 0 if the fences have passed, error for all other cases. >> * Caller must hold ring lock. >> */ >> -void radeon_fence_wait_empty_locked(struct radeon_device *rdev, int ring) >> +int radeon_fence_wait_empty_locked(struct radeon_device *rdev, int ring) >> { >> uint64_t seq = rdev->fence_drv[ring].sync_seq[ring]; >> + int r; >> - while(1) { >> - int r; >> - r = radeon_fence_wait_seq(rdev, seq, ring, false, false); >> + r = radeon_fence_wait_seq(rdev, seq, ring, false, false); >> + if (r) { >> if (r == -EDEADLK) { >> - mutex_unlock(&rdev->ring_lock); >> - r = radeon_gpu_reset(rdev); >> - mutex_lock(&rdev->ring_lock); >> - if (!r) >> - continue; >> - } >> - if (r) { >> - dev_err(rdev->dev, "error waiting for ring to >> become" >> - " idle (%d)\n", r); >> + return -EDEADLK; >> } >> - return; >> + dev_err(rdev->dev, "error waiting for ring[%d] to become >> idle (%d)\n", >> + ring, r); >> } >> + return 0; >> } >> /** >> @@ -854,13 +848,17 @@ int radeon_fence_driver_init(struct radeon_device >> *rdev) >> */ >> void radeon_fence_driver_fini(struct radeon_device *rdev) >> { >> - int ring; >> + int ring, r; >> mutex_lock(&rdev->ring_lock); >> for (ring = 0; ring < RADEON_NUM_RINGS; ring++) { >> if (!rdev->fence_drv[ring].initialized) >> continue; >> - radeon_fence_wait_empty_locked(rdev, ring); >> + r = 
radeon_fence_wait_empty_locked(rdev, ring); >> + if (r) { >> + /* no need to trigger GPU reset as we are >> unloading */ >> + radeon_fence_driver_force_completion(rdev); >> + } >> wake_up_all(&rdev->fence_queue); >> radeon_scratch_free(rdev, >> rdev->fence_drv[ring].scratch_reg); >> rdev->fence_drv[ring].initialized = false; >> diff --git a/drivers/gpu/drm/radeon/radeon_pm.c >> b/drivers/gpu/drm/radeon/radeon_pm.c >> index aa14dbb..0bfa656 100644 >> --- a/drivers/gpu/drm/radeon/radeon_pm.c >> +++ b/drivers/gpu/drm/radeon/radeon_pm.c >> @@ -234,7 +234,7 @@ static void radeon_set_power_state(struct >> radeon_device *rdev) >> static void radeon_pm_set_clocks(struct radeon_device *rdev) >> { >> - int i; >> + int i, r; >> /* no need to take locks, etc. if nothing's going to change */ >> if ((rdev->pm.requested_clock_mode_index == >> rdev->pm.current_clock_mode_index) && >> @@ -248,8 +248,17 @@ static void radeon_pm_set_clocks(struct radeon_device >> *rdev) >> /* wait for the rings to drain */ >> for (i = 0; i < RADEON_NUM_RINGS; i++) { >> struct radeon_ring *ring = &rdev->ring[i]; >> - if (ring->ready) >> - radeon_fence_wait_empty_locked(rdev, i); >> + if (!ring->ready) { >> + continue; >> + } >> + r = radeon_fence_wait_empty_locked(rdev, i); >> + if (r) { >> + /* needs a GPU reset dont reset here */ >> + mutex_unlock(&rdev->ring_lock); >> + up_write(&rdev->pm.mclk_lock); >> + mutex_unlock(&rdev->ddev->struct_mutex); >> + return; >> + } >> } >> radeon_unmap_vram_bos(rdev); > > > _______________________________________________ > dri-devel mailing list > dri-devel@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/dri-devel
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index 9c7625c..071b2d7 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -231,7 +231,7 @@ void radeon_fence_process(struct radeon_device *rdev, int ring); bool radeon_fence_signaled(struct radeon_fence *fence); int radeon_fence_wait(struct radeon_fence *fence, bool interruptible); int radeon_fence_wait_next_locked(struct radeon_device *rdev, int ring); -void radeon_fence_wait_empty_locked(struct radeon_device *rdev, int ring); +int radeon_fence_wait_empty_locked(struct radeon_device *rdev, int ring); int radeon_fence_wait_any(struct radeon_device *rdev, struct radeon_fence **fences, bool intr); diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c index 774fae7..53a9223 100644 --- a/drivers/gpu/drm/radeon/radeon_device.c +++ b/drivers/gpu/drm/radeon/radeon_device.c @@ -1163,6 +1163,7 @@ int radeon_suspend_kms(struct drm_device *dev, pm_message_t state) struct drm_crtc *crtc; struct drm_connector *connector; int i, r; + bool force_completion = false; if (dev == NULL || dev->dev_private == NULL) { return -ENODEV; @@ -1205,8 +1206,16 @@ int radeon_suspend_kms(struct drm_device *dev, pm_message_t state) mutex_lock(&rdev->ring_lock); /* wait for gpu to finish processing current batch */ - for (i = 0; i < RADEON_NUM_RINGS; i++) - radeon_fence_wait_empty_locked(rdev, i); + for (i = 0; i < RADEON_NUM_RINGS; i++) { + r = radeon_fence_wait_empty_locked(rdev, i); + if (r) { + /* delay GPU reset to resume */ + force_completion = true; + } + } + if (force_completion) { + radeon_fence_driver_force_completion(rdev); + } mutex_unlock(&rdev->ring_lock); radeon_save_bios_scratch_regs(rdev); diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c index bf7b20e..28c09b6 100644 --- a/drivers/gpu/drm/radeon/radeon_fence.c +++ b/drivers/gpu/drm/radeon/radeon_fence.c @@ -609,26 +609,20 @@ int 
radeon_fence_wait_next_locked(struct radeon_device *rdev, int ring) * Returns 0 if the fences have passed, error for all other cases. * Caller must hold ring lock. */ -void radeon_fence_wait_empty_locked(struct radeon_device *rdev, int ring) +int radeon_fence_wait_empty_locked(struct radeon_device *rdev, int ring) { uint64_t seq = rdev->fence_drv[ring].sync_seq[ring]; + int r; - while(1) { - int r; - r = radeon_fence_wait_seq(rdev, seq, ring, false, false); + r = radeon_fence_wait_seq(rdev, seq, ring, false, false); + if (r) { if (r == -EDEADLK) { - mutex_unlock(&rdev->ring_lock); - r = radeon_gpu_reset(rdev); - mutex_lock(&rdev->ring_lock); - if (!r) - continue; - } - if (r) { - dev_err(rdev->dev, "error waiting for ring to become" - " idle (%d)\n", r); + return -EDEADLK; } - return; + dev_err(rdev->dev, "error waiting for ring[%d] to become idle (%d)\n", + ring, r); } + return 0; } /** @@ -854,13 +848,17 @@ int radeon_fence_driver_init(struct radeon_device *rdev) */ void radeon_fence_driver_fini(struct radeon_device *rdev) { - int ring; + int ring, r; mutex_lock(&rdev->ring_lock); for (ring = 0; ring < RADEON_NUM_RINGS; ring++) { if (!rdev->fence_drv[ring].initialized) continue; - radeon_fence_wait_empty_locked(rdev, ring); + r = radeon_fence_wait_empty_locked(rdev, ring); + if (r) { + /* no need to trigger GPU reset as we are unloading */ + radeon_fence_driver_force_completion(rdev); + } wake_up_all(&rdev->fence_queue); radeon_scratch_free(rdev, rdev->fence_drv[ring].scratch_reg); rdev->fence_drv[ring].initialized = false; diff --git a/drivers/gpu/drm/radeon/radeon_pm.c b/drivers/gpu/drm/radeon/radeon_pm.c index aa14dbb..0bfa656 100644 --- a/drivers/gpu/drm/radeon/radeon_pm.c +++ b/drivers/gpu/drm/radeon/radeon_pm.c @@ -234,7 +234,7 @@ static void radeon_set_power_state(struct radeon_device *rdev) static void radeon_pm_set_clocks(struct radeon_device *rdev) { - int i; + int i, r; /* no need to take locks, etc. 
if nothing's going to change */ if ((rdev->pm.requested_clock_mode_index == rdev->pm.current_clock_mode_index) && @@ -248,8 +248,17 @@ static void radeon_pm_set_clocks(struct radeon_device *rdev) /* wait for the rings to drain */ for (i = 0; i < RADEON_NUM_RINGS; i++) { struct radeon_ring *ring = &rdev->ring[i]; - if (ring->ready) - radeon_fence_wait_empty_locked(rdev, i); + if (!ring->ready) { + continue; + } + r = radeon_fence_wait_empty_locked(rdev, i); + if (r) { + /* needs a GPU reset dont reset here */ + mutex_unlock(&rdev->ring_lock); + up_write(&rdev->pm.mclk_lock); + mutex_unlock(&rdev->ddev->struct_mutex); + return; + } } radeon_unmap_vram_bos(rdev);