Message ID | 53F2118F.8000706@canonical.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Am 18.08.2014 um 16:45 schrieb Maarten Lankhorst: > This is needed for the next commit, because the lockup detection > will need the read lock to run. > > Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com> > --- > drivers/gpu/drm/radeon/radeon.h | 2 +- > drivers/gpu/drm/radeon/radeon_device.c | 61 ++++++++++++++++++++-------------- > 2 files changed, 37 insertions(+), 26 deletions(-) > > diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h > index 9e1732eb402c..1d806983ec7b 100644 > --- a/drivers/gpu/drm/radeon/radeon.h > +++ b/drivers/gpu/drm/radeon/radeon.h > @@ -2312,7 +2312,7 @@ struct radeon_device { > bool need_dma32; > bool accel_working; > bool fastfb_working; /* IGP feature*/ > - bool needs_reset; > + bool needs_reset, in_reset; > struct radeon_surface_reg surface_regs[RADEON_GEM_MAX_SURFACES]; > const struct firmware *me_fw; /* all family ME firmware */ > const struct firmware *pfp_fw; /* r6/700 PFP firmware */ > diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c > index c8ea050c8fa4..82633fdd399d 100644 > --- a/drivers/gpu/drm/radeon/radeon_device.c > +++ b/drivers/gpu/drm/radeon/radeon_device.c > @@ -1671,29 +1671,34 @@ int radeon_gpu_reset(struct radeon_device *rdev) > down_write(&rdev->exclusive_lock); > > if (!rdev->needs_reset) { > + WARN_ON(rdev->in_reset); > up_write(&rdev->exclusive_lock); > return 0; > } > > rdev->needs_reset = false; > - > - radeon_save_bios_scratch_regs(rdev); > - /* block TTM */ > resched = ttm_bo_lock_delayed_workqueue(&rdev->mman.bdev); > - radeon_pm_suspend(rdev); > - radeon_suspend(rdev); > > - for (i = 0; i < RADEON_NUM_RINGS; ++i) { > - ring_sizes[i] = radeon_ring_backup(rdev, &rdev->ring[i], > - &ring_data[i]); > - if (ring_sizes[i]) { > - saved = true; > - dev_info(rdev->dev, "Saved %d dwords of commands " > - "on ring %d.\n", ring_sizes[i], i); > + if (!rdev->in_reset) { > + rdev->in_reset = true; > + > + radeon_save_bios_scratch_regs(rdev); > + /* block TTM */ > + radeon_pm_suspend(rdev); > + radeon_suspend(rdev); That won't work correctly because you might end up with calling pm_resume more often than suspend and that can only lead to a crash. Saying this we probably already have a bug in the reset code at this point anyway, but see below. > + > + for (i = 0; i < RADEON_NUM_RINGS; ++i) { > + ring_sizes[i] = radeon_ring_backup(rdev, &rdev->ring[i], > + &ring_data[i]); > + if (ring_sizes[i]) { > + saved = true; > + dev_info(rdev->dev, "Saved %d dwords of commands " > + "on ring %d.\n", ring_sizes[i], i); > + } > } > - } > + } else > + memset(ring_data, 0, sizeof(ring_data)); > > -retry: > r = radeon_asic_reset(rdev); > if (!r) { > dev_info(rdev->dev, "GPU reset succeeded, trying to resume\n"); > @@ -1702,40 +1707,46 @@ retry: > > radeon_restore_bios_scratch_regs(rdev); We should resume PM here as well. > > - if (!r) { > + if (!r && saved) { > for (i = 0; i < RADEON_NUM_RINGS; ++i) { > radeon_ring_restore(rdev, &rdev->ring[i], > ring_sizes[i], ring_data[i]); > - ring_sizes[i] = 0; > ring_data[i] = NULL; > } > + } else { > + radeon_fence_driver_force_completion(rdev); > + > + for (i = 0; i < RADEON_NUM_RINGS; ++i) { > + kfree(ring_data[i]); > + } > + } > + downgrade_write(&rdev->exclusive_lock); > + ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched); I would unlock the delayed_workqueue first and then downgrade the readlock. > > + if (!r) { > r = radeon_ib_ring_tests(rdev); > if (r) { > dev_err(rdev->dev, "ib ring test failed (%d).\n", r); > if (saved) { > - saved = false; > + /* if reset fails, try without saving data */ > + rdev->needs_reset = true; > radeon_suspend(rdev); > - goto retry; > + up_read(&rdev->exclusive_lock); > + return -EAGAIN; > } > } > - } else { > - radeon_fence_driver_force_completion(rdev); > - for (i = 0; i < RADEON_NUM_RINGS; ++i) { > - kfree(ring_data[i]); > - } > } > > radeon_pm_resume(rdev); Move this more up. Alex is more into this, but it's probably a bug in the current reset code that this is after the IB tests, cause the IB tests needs everything powered up and with PM handling suspended it is possible that individual blocks are powered down. Thanks, Christian. > drm_helper_resume_force_mode(rdev->ddev); > > - ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched); > if (r) { > /* bad news, how to tell it to userspace ? */ > dev_info(rdev->dev, "GPU reset failed\n"); > } > > - up_write(&rdev->exclusive_lock); > + rdev->in_reset = false; > + up_read(&rdev->exclusive_lock); > return r; > } >
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index 9e1732eb402c..1d806983ec7b 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -2312,7 +2312,7 @@ struct radeon_device { bool need_dma32; bool accel_working; bool fastfb_working; /* IGP feature*/ - bool needs_reset; + bool needs_reset, in_reset; struct radeon_surface_reg surface_regs[RADEON_GEM_MAX_SURFACES]; const struct firmware *me_fw; /* all family ME firmware */ const struct firmware *pfp_fw; /* r6/700 PFP firmware */ diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c index c8ea050c8fa4..82633fdd399d 100644 --- a/drivers/gpu/drm/radeon/radeon_device.c +++ b/drivers/gpu/drm/radeon/radeon_device.c @@ -1671,29 +1671,34 @@ int radeon_gpu_reset(struct radeon_device *rdev) down_write(&rdev->exclusive_lock); if (!rdev->needs_reset) { + WARN_ON(rdev->in_reset); up_write(&rdev->exclusive_lock); return 0; } rdev->needs_reset = false; - - radeon_save_bios_scratch_regs(rdev); - /* block TTM */ resched = ttm_bo_lock_delayed_workqueue(&rdev->mman.bdev); - radeon_pm_suspend(rdev); - radeon_suspend(rdev); - for (i = 0; i < RADEON_NUM_RINGS; ++i) { - ring_sizes[i] = radeon_ring_backup(rdev, &rdev->ring[i], - &ring_data[i]); - if (ring_sizes[i]) { - saved = true; - dev_info(rdev->dev, "Saved %d dwords of commands " - "on ring %d.\n", ring_sizes[i], i); + if (!rdev->in_reset) { + rdev->in_reset = true; + + radeon_save_bios_scratch_regs(rdev); + /* block TTM */ + radeon_pm_suspend(rdev); + radeon_suspend(rdev); + + for (i = 0; i < RADEON_NUM_RINGS; ++i) { + ring_sizes[i] = radeon_ring_backup(rdev, &rdev->ring[i], + &ring_data[i]); + if (ring_sizes[i]) { + saved = true; + dev_info(rdev->dev, "Saved %d dwords of commands " + "on ring %d.\n", ring_sizes[i], i); + } } - } + } else + memset(ring_data, 0, sizeof(ring_data)); -retry: r = radeon_asic_reset(rdev); if (!r) { dev_info(rdev->dev, "GPU reset succeeded, trying to resume\n"); @@ -1702,40 +1707,46 @@ retry: radeon_restore_bios_scratch_regs(rdev); - if (!r) { + if (!r && saved) { for (i = 0; i < RADEON_NUM_RINGS; ++i) { radeon_ring_restore(rdev, &rdev->ring[i], ring_sizes[i], ring_data[i]); - ring_sizes[i] = 0; ring_data[i] = NULL; } + } else { + radeon_fence_driver_force_completion(rdev); + + for (i = 0; i < RADEON_NUM_RINGS; ++i) { + kfree(ring_data[i]); + } + } + downgrade_write(&rdev->exclusive_lock); + ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched); + if (!r) { r = radeon_ib_ring_tests(rdev); if (r) { dev_err(rdev->dev, "ib ring test failed (%d).\n", r); if (saved) { - saved = false; + /* if reset fails, try without saving data */ + rdev->needs_reset = true; radeon_suspend(rdev); - goto retry; + up_read(&rdev->exclusive_lock); + return -EAGAIN; } } - } else { - radeon_fence_driver_force_completion(rdev); - for (i = 0; i < RADEON_NUM_RINGS; ++i) { - kfree(ring_data[i]); - } } radeon_pm_resume(rdev); drm_helper_resume_force_mode(rdev->ddev); - ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched); if (r) { /* bad news, how to tell it to userspace ? */ dev_info(rdev->dev, "GPU reset failed\n"); } - up_write(&rdev->exclusive_lock); + rdev->in_reset = false; + up_read(&rdev->exclusive_lock); return r; }
This is needed for the next commit, because the lockup detection will need the read lock to run. Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com> --- drivers/gpu/drm/radeon/radeon.h | 2 +- drivers/gpu/drm/radeon/radeon_device.c | 61 ++++++++++++++++++++-------------- 2 files changed, 37 insertions(+), 26 deletions(-)