Message ID | 53F211A7.5080506@canonical.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Am 18.08.2014 um 16:45 schrieb Maarten Lankhorst: > Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com> > --- > V1 had a nasty bug breaking gpu lockup recovery. The fix is not > allowing radeon_fence_driver_check_lockup to take exclusive_lock, > and kill it during lockup recovery instead. > V2 used delayed work that ran during lockup recovery, but required > read lock. I've fixed this by downgrading the write, and retrying if > recovery fails. > --- > drivers/gpu/drm/radeon/radeon.h | 2 + > drivers/gpu/drm/radeon/radeon_fence.c | 115 +++++++++++++++++----------------- > 2 files changed, 61 insertions(+), 56 deletions(-) > > diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h > index 1d806983ec7b..29504efe8971 100644 > --- a/drivers/gpu/drm/radeon/radeon.h > +++ b/drivers/gpu/drm/radeon/radeon.h > @@ -355,6 +355,8 @@ struct radeon_fence_driver { > uint64_t sync_seq[RADEON_NUM_RINGS]; > atomic64_t last_seq; > bool initialized; > + struct delayed_work fence_check_work; > + struct radeon_device *rdev; Put the reference to the device as the first field in the structure. > }; > > struct radeon_fence { > diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c > index 913787085dfa..94eca53d99f8 100644 > --- a/drivers/gpu/drm/radeon/radeon_fence.c > +++ b/drivers/gpu/drm/radeon/radeon_fence.c > @@ -125,16 +125,7 @@ int radeon_fence_emit(struct radeon_device *rdev, > return 0; > } > > -/** > - * radeon_fence_process - process a fence > - * > - * @rdev: radeon_device pointer > - * @ring: ring index the fence is associated with > - * > - * Checks the current fence value and wakes the fence queue > - * if the sequence number has increased (all asics). > - */ > -void radeon_fence_process(struct radeon_device *rdev, int ring) > +static bool __radeon_fence_process(struct radeon_device *rdev, int ring) Don't use "__" for internal radeon function names and especially don't remove the function documentation. > { > uint64_t seq, last_seq, last_emitted; > unsigned count_loop = 0; > @@ -190,7 +181,53 @@ void radeon_fence_process(struct radeon_device *rdev, int ring) > } > } while (atomic64_xchg(&rdev->fence_drv[ring].last_seq, seq) > seq); > > - if (wake) > + if (seq < last_emitted && !rdev->in_reset) > + mod_delayed_work(system_power_efficient_wq, > + &rdev->fence_drv[ring].fence_check_work, > + RADEON_FENCE_JIFFIES_TIMEOUT); Am I wrong or do you queue the work only after radeon_fence_process is called for the first time? Might be a good idea to have an explicit queue_delayed_work in radeon_fence_emit as well. > + > + return wake; > +} > + > +static void radeon_fence_driver_check_lockup(struct work_struct *work) > +{ > + struct radeon_fence_driver *fence_drv; > + struct radeon_device *rdev; > + unsigned long iring; > + > + fence_drv = container_of(work, struct radeon_fence_driver, fence_check_work.work); > + rdev = fence_drv->rdev; > + iring = fence_drv - &rdev->fence_drv[0]; > + > + down_read(&rdev->exclusive_lock); > + if (__radeon_fence_process(rdev, iring)) > + wake_up_all(&rdev->fence_queue); > + else if (radeon_ring_is_lockup(rdev, iring, &rdev->ring[iring])) { > + /* good news we believe it's a lockup */ > + dev_warn(rdev->dev, "GPU lockup (current fence id " > + "0x%016llx last fence id 0x%016llx on ring %ld)\n", > + (uint64_t)atomic64_read(&fence_drv->last_seq), > + fence_drv->sync_seq[iring], iring); > + > + /* remember that we need an reset */ > + rdev->needs_reset = true; > + wake_up_all(&rdev->fence_queue); > + } > + up_read(&rdev->exclusive_lock); > +} > + > +/** > + * radeon_fence_process - process a fence > + * > + * @rdev: radeon_device pointer > + * @ring: ring index the fence is associated with > + * > + * Checks the current fence value and wakes the fence queue > + * if the sequence number has increased (all asics). > + */ > +void radeon_fence_process(struct radeon_device *rdev, int ring) > +{ > + if (__radeon_fence_process(rdev, ring)) > wake_up_all(&rdev->fence_queue); > } > > @@ -302,9 +339,10 @@ static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 *target_seq, > { > uint64_t last_seq[RADEON_NUM_RINGS]; > bool signaled; > - int i, r; > + int i; > > while (!radeon_fence_any_seq_signaled(rdev, target_seq)) { > + long r; > > /* Save current sequence values, used to check for GPU lockups */ > for (i = 0; i < RADEON_NUM_RINGS; ++i) { > @@ -319,11 +357,11 @@ static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 *target_seq, > if (intr) { > r = wait_event_interruptible_timeout(rdev->fence_queue, ( > (signaled = radeon_fence_any_seq_signaled(rdev, target_seq)) > - || rdev->needs_reset), RADEON_FENCE_JIFFIES_TIMEOUT); > + || rdev->needs_reset), MAX_SCHEDULE_TIMEOUT); > } else { > r = wait_event_timeout(rdev->fence_queue, ( > (signaled = radeon_fence_any_seq_signaled(rdev, target_seq)) > - || rdev->needs_reset), RADEON_FENCE_JIFFIES_TIMEOUT); > + || rdev->needs_reset), MAX_SCHEDULE_TIMEOUT); > } > > for (i = 0; i < RADEON_NUM_RINGS; ++i) { > @@ -334,50 +372,11 @@ static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 *target_seq, > trace_radeon_fence_wait_end(rdev->ddev, i, target_seq[i]); > } > > - if (unlikely(r < 0)) > + if (r < 0) > return r; > > - if (unlikely(!signaled)) { > - if (rdev->needs_reset) > - return -EDEADLK; > - > - /* we were interrupted for some reason and fence > - * isn't signaled yet, resume waiting */ > - if (r) > - continue; > - > - for (i = 0; i < RADEON_NUM_RINGS; ++i) { > - if (!target_seq[i]) > - continue; > - > - if (last_seq[i] != atomic64_read(&rdev->fence_drv[i].last_seq)) > - break; > - } > - > - if (i != RADEON_NUM_RINGS) > - continue; > - > - for (i = 0; i < RADEON_NUM_RINGS; ++i) { > - if (!target_seq[i]) > - continue; > - > - if (radeon_ring_is_lockup(rdev, i, &rdev->ring[i])) > - break; > - } > - > - if (i < RADEON_NUM_RINGS) { > - /* good news we believe it's a lockup */ > - dev_warn(rdev->dev, "GPU lockup (waiting for " > - "0x%016llx last fence id 0x%016llx on" > - " ring %d)\n", > - target_seq[i], last_seq[i], i); > - > - /* remember that we need an reset */ > - rdev->needs_reset = true; > - wake_up_all(&rdev->fence_queue); > - return -EDEADLK; > - } > - } > + if (rdev->needs_reset) > + return -EDEADLK; > } > return 0; > } > @@ -711,6 +710,9 @@ static void radeon_fence_driver_init_ring(struct radeon_device *rdev, int ring) > rdev->fence_drv[ring].sync_seq[i] = 0; > atomic64_set(&rdev->fence_drv[ring].last_seq, 0); > rdev->fence_drv[ring].initialized = false; > + INIT_DELAYED_WORK(&rdev->fence_drv[ring].fence_check_work, > + radeon_fence_driver_check_lockup); > + rdev->fence_drv[ring].rdev = rdev; > } > > /** > @@ -760,6 +762,7 @@ void radeon_fence_driver_fini(struct radeon_device *rdev) > /* no need to trigger GPU reset as we are unloading */ > radeon_fence_driver_force_completion(rdev); > } > + cancel_delayed_work_sync(&rdev->fence_drv[ring].fence_check_work); > wake_up_all(&rdev->fence_queue); > radeon_scratch_free(rdev, rdev->fence_drv[ring].scratch_reg); > rdev->fence_drv[ring].initialized = false;
Op 18-08-14 om 17:12 schreef Christian König: > Am 18.08.2014 um 16:45 schrieb Maarten Lankhorst: >> Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com> >> --- >> V1 had a nasty bug breaking gpu lockup recovery. The fix is not >> allowing radeon_fence_driver_check_lockup to take exclusive_lock, >> and kill it during lockup recovery instead. >> V2 used delayed work that ran during lockup recovery, but required >> read lock. I've fixed this by downgrading the write, and retrying if >> recovery fails. >> --- >> drivers/gpu/drm/radeon/radeon.h | 2 + >> drivers/gpu/drm/radeon/radeon_fence.c | 115 +++++++++++++++++----------------- >> 2 files changed, 61 insertions(+), 56 deletions(-) >> >> diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h >> index 1d806983ec7b..29504efe8971 100644 >> --- a/drivers/gpu/drm/radeon/radeon.h >> +++ b/drivers/gpu/drm/radeon/radeon.h >> @@ -355,6 +355,8 @@ struct radeon_fence_driver { >> uint64_t sync_seq[RADEON_NUM_RINGS]; >> atomic64_t last_seq; >> bool initialized; >> + struct delayed_work fence_check_work; >> + struct radeon_device *rdev; > > Put the reference to the device as the first field in the structure. > >> }; >> struct radeon_fence { >> diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c >> index 913787085dfa..94eca53d99f8 100644 >> --- a/drivers/gpu/drm/radeon/radeon_fence.c >> +++ b/drivers/gpu/drm/radeon/radeon_fence.c >> @@ -125,16 +125,7 @@ int radeon_fence_emit(struct radeon_device *rdev, >> return 0; >> } >> -/** >> - * radeon_fence_process - process a fence >> - * >> - * @rdev: radeon_device pointer >> - * @ring: ring index the fence is associated with >> - * >> - * Checks the current fence value and wakes the fence queue >> - * if the sequence number has increased (all asics). >> - */ >> -void radeon_fence_process(struct radeon_device *rdev, int ring) >> +static bool __radeon_fence_process(struct radeon_device *rdev, int ring) > > Don't use "__" for internal radeon function names and especially don't remove the function documentation. I've moved the documentation to the new radeon_fence_process function that calls the __radeon_fence_process one, which is an internal function that is only used by the driver. I guess I can rename it to __radeon_fence_process_nowake or something instead, but documentation doesn't make sense since it's not used outside of radeon_fence.c >> { >> uint64_t seq, last_seq, last_emitted; >> unsigned count_loop = 0; >> @@ -190,7 +181,53 @@ void radeon_fence_process(struct radeon_device *rdev, int ring) >> } >> } while (atomic64_xchg(&rdev->fence_drv[ring].last_seq, seq) > seq); >> - if (wake) >> + if (seq < last_emitted && !rdev->in_reset) >> + mod_delayed_work(system_power_efficient_wq, >> + &rdev->fence_drv[ring].fence_check_work, >> + RADEON_FENCE_JIFFIES_TIMEOUT); > > Am I wrong or do you queue the work only after radeon_fence_process is called for the first time? > > Might be a good idea to have an explicit queue_delayed_work in radeon_fence_emit as well. Yeah might as well, with these changes it makes sense to run it as soon as possible. But if there are no waiters, there's no real need. It's a tree falling in a forest with no-one around to hear it. ;-)
Am 18.08.2014 um 17:28 schrieb Maarten Lankhorst: > Op 18-08-14 om 17:12 schreef Christian König: >> Am 18.08.2014 um 16:45 schrieb Maarten Lankhorst: >>> Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com> >>> --- >>> V1 had a nasty bug breaking gpu lockup recovery. The fix is not >>> allowing radeon_fence_driver_check_lockup to take exclusive_lock, >>> and kill it during lockup recovery instead. >>> V2 used delayed work that ran during lockup recovery, but required >>> read lock. I've fixed this by downgrading the write, and retrying if >>> recovery fails. >>> --- >>> drivers/gpu/drm/radeon/radeon.h | 2 + >>> drivers/gpu/drm/radeon/radeon_fence.c | 115 +++++++++++++++++----------------- >>> 2 files changed, 61 insertions(+), 56 deletions(-) >>> >>> diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h >>> index 1d806983ec7b..29504efe8971 100644 >>> --- a/drivers/gpu/drm/radeon/radeon.h >>> +++ b/drivers/gpu/drm/radeon/radeon.h >>> @@ -355,6 +355,8 @@ struct radeon_fence_driver { >>> uint64_t sync_seq[RADEON_NUM_RINGS]; >>> atomic64_t last_seq; >>> bool initialized; >>> + struct delayed_work fence_check_work; >>> + struct radeon_device *rdev; >> Put the reference to the device as the first field in the structure. >> >>> }; >>> struct radeon_fence { >>> diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c >>> index 913787085dfa..94eca53d99f8 100644 >>> --- a/drivers/gpu/drm/radeon/radeon_fence.c >>> +++ b/drivers/gpu/drm/radeon/radeon_fence.c >>> @@ -125,16 +125,7 @@ int radeon_fence_emit(struct radeon_device *rdev, >>> return 0; >>> } >>> -/** >>> - * radeon_fence_process - process a fence >>> - * >>> - * @rdev: radeon_device pointer >>> - * @ring: ring index the fence is associated with >>> - * >>> - * Checks the current fence value and wakes the fence queue >>> - * if the sequence number has increased (all asics). >>> - */ >>> -void radeon_fence_process(struct radeon_device *rdev, int ring) >>> +static bool __radeon_fence_process(struct radeon_device *rdev, int ring) >> Don't use "__" for internal radeon function names and especially don't remove the function documentation. > I've moved the documentation to the new radeon_fence_process function that calls the __radeon_fence_process one, > which is an internal function that is only used by the driver. I guess I can rename it to __radeon_fence_process_nowake or something instead, > but documentation doesn't make sense since it's not used outside of radeon_fence.c We try to document even static functions in doxygen style even if you can't see them outside the C file. >>> { >>> uint64_t seq, last_seq, last_emitted; >>> unsigned count_loop = 0; >>> @@ -190,7 +181,53 @@ void radeon_fence_process(struct radeon_device *rdev, int ring) >>> } >>> } while (atomic64_xchg(&rdev->fence_drv[ring].last_seq, seq) > seq); >>> - if (wake) >>> + if (seq < last_emitted && !rdev->in_reset) >>> + mod_delayed_work(system_power_efficient_wq, >>> + &rdev->fence_drv[ring].fence_check_work, >>> + RADEON_FENCE_JIFFIES_TIMEOUT); >> Am I wrong or do you queue the work only after radeon_fence_process is called for the first time? >> >> Might be a good idea to have an explicit queue_delayed_work in radeon_fence_emit as well. > Yeah might as well, with these changes it makes sense to run it as soon as possible. > > But if there are no waiters, there's no real need. It's a tree falling in a forest with no-one around to hear it. ;-) That's why I suggested to schedule the work item only when the IRQ is enabled for waiting, otherwise it indeed doesn't make much sense. Christian.
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index 1d806983ec7b..29504efe8971 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -355,6 +355,8 @@ struct radeon_fence_driver { uint64_t sync_seq[RADEON_NUM_RINGS]; atomic64_t last_seq; bool initialized; + struct delayed_work fence_check_work; + struct radeon_device *rdev; }; struct radeon_fence { diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c index 913787085dfa..94eca53d99f8 100644 --- a/drivers/gpu/drm/radeon/radeon_fence.c +++ b/drivers/gpu/drm/radeon/radeon_fence.c @@ -125,16 +125,7 @@ int radeon_fence_emit(struct radeon_device *rdev, return 0; } -/** - * radeon_fence_process - process a fence - * - * @rdev: radeon_device pointer - * @ring: ring index the fence is associated with - * - * Checks the current fence value and wakes the fence queue - * if the sequence number has increased (all asics). - */ -void radeon_fence_process(struct radeon_device *rdev, int ring) +static bool __radeon_fence_process(struct radeon_device *rdev, int ring) { uint64_t seq, last_seq, last_emitted; unsigned count_loop = 0; @@ -190,7 +181,53 @@ void radeon_fence_process(struct radeon_device *rdev, int ring) } } while (atomic64_xchg(&rdev->fence_drv[ring].last_seq, seq) > seq); - if (wake) + if (seq < last_emitted && !rdev->in_reset) + mod_delayed_work(system_power_efficient_wq, + &rdev->fence_drv[ring].fence_check_work, + RADEON_FENCE_JIFFIES_TIMEOUT); + + return wake; +} + +static void radeon_fence_driver_check_lockup(struct work_struct *work) +{ + struct radeon_fence_driver *fence_drv; + struct radeon_device *rdev; + unsigned long iring; + + fence_drv = container_of(work, struct radeon_fence_driver, fence_check_work.work); + rdev = fence_drv->rdev; + iring = fence_drv - &rdev->fence_drv[0]; + + down_read(&rdev->exclusive_lock); + if (__radeon_fence_process(rdev, iring)) + wake_up_all(&rdev->fence_queue); + else if (radeon_ring_is_lockup(rdev, iring, &rdev->ring[iring])) { + /* good news we believe it's a lockup */ + dev_warn(rdev->dev, "GPU lockup (current fence id " + "0x%016llx last fence id 0x%016llx on ring %ld)\n", + (uint64_t)atomic64_read(&fence_drv->last_seq), + fence_drv->sync_seq[iring], iring); + + /* remember that we need an reset */ + rdev->needs_reset = true; + wake_up_all(&rdev->fence_queue); + } + up_read(&rdev->exclusive_lock); +} + +/** + * radeon_fence_process - process a fence + * + * @rdev: radeon_device pointer + * @ring: ring index the fence is associated with + * + * Checks the current fence value and wakes the fence queue + * if the sequence number has increased (all asics). + */ +void radeon_fence_process(struct radeon_device *rdev, int ring) +{ + if (__radeon_fence_process(rdev, ring)) wake_up_all(&rdev->fence_queue); } @@ -302,9 +339,10 @@ static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 *target_seq, { uint64_t last_seq[RADEON_NUM_RINGS]; bool signaled; - int i, r; + int i; while (!radeon_fence_any_seq_signaled(rdev, target_seq)) { + long r; /* Save current sequence values, used to check for GPU lockups */ for (i = 0; i < RADEON_NUM_RINGS; ++i) { @@ -319,11 +357,11 @@ static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 *target_seq, if (intr) { r = wait_event_interruptible_timeout(rdev->fence_queue, ( (signaled = radeon_fence_any_seq_signaled(rdev, target_seq)) - || rdev->needs_reset), RADEON_FENCE_JIFFIES_TIMEOUT); + || rdev->needs_reset), MAX_SCHEDULE_TIMEOUT); } else { r = wait_event_timeout(rdev->fence_queue, ( (signaled = radeon_fence_any_seq_signaled(rdev, target_seq)) - || rdev->needs_reset), RADEON_FENCE_JIFFIES_TIMEOUT); + || rdev->needs_reset), MAX_SCHEDULE_TIMEOUT); } for (i = 0; i < RADEON_NUM_RINGS; ++i) { @@ -334,50 +372,11 @@ static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 *target_seq, trace_radeon_fence_wait_end(rdev->ddev, i, target_seq[i]); } - if (unlikely(r < 0)) + if (r < 0) return r; - if (unlikely(!signaled)) { - if (rdev->needs_reset) - return -EDEADLK; - - /* we were interrupted for some reason and fence - * isn't signaled yet, resume waiting */ - if (r) - continue; - - for (i = 0; i < RADEON_NUM_RINGS; ++i) { - if (!target_seq[i]) - continue; - - if (last_seq[i] != atomic64_read(&rdev->fence_drv[i].last_seq)) - break; - } - - if (i != RADEON_NUM_RINGS) - continue; - - for (i = 0; i < RADEON_NUM_RINGS; ++i) { - if (!target_seq[i]) - continue; - - if (radeon_ring_is_lockup(rdev, i, &rdev->ring[i])) - break; - } - - if (i < RADEON_NUM_RINGS) { - /* good news we believe it's a lockup */ - dev_warn(rdev->dev, "GPU lockup (waiting for " - "0x%016llx last fence id 0x%016llx on" - " ring %d)\n", - target_seq[i], last_seq[i], i); - - /* remember that we need an reset */ - rdev->needs_reset = true; - wake_up_all(&rdev->fence_queue); - return -EDEADLK; - } - } + if (rdev->needs_reset) + return -EDEADLK; } return 0; } @@ -711,6 +710,9 @@ static void radeon_fence_driver_init_ring(struct radeon_device *rdev, int ring) rdev->fence_drv[ring].sync_seq[i] = 0; atomic64_set(&rdev->fence_drv[ring].last_seq, 0); rdev->fence_drv[ring].initialized = false; + INIT_DELAYED_WORK(&rdev->fence_drv[ring].fence_check_work, + radeon_fence_driver_check_lockup); + rdev->fence_drv[ring].rdev = rdev; } /** @@ -760,6 +762,7 @@ void radeon_fence_driver_fini(struct radeon_device *rdev) /* no need to trigger GPU reset as we are unloading */ radeon_fence_driver_force_completion(rdev); } + cancel_delayed_work_sync(&rdev->fence_drv[ring].fence_check_work); wake_up_all(&rdev->fence_queue); radeon_scratch_free(rdev, rdev->fence_drv[ring].scratch_reg); rdev->fence_drv[ring].initialized = false;
Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com> --- V1 had a nasty bug breaking gpu lockup recovery. The fix is not allowing radeon_fence_driver_check_lockup to take exclusive_lock, and kill it during lockup recovery instead. V2 used delayed work that ran during lockup recovery, but required read lock. I've fixed this by downgrading the write, and retrying if recovery fails. --- drivers/gpu/drm/radeon/radeon.h | 2 + drivers/gpu/drm/radeon/radeon_fence.c | 115 +++++++++++++++++----------------- 2 files changed, 61 insertions(+), 56 deletions(-)