Message ID | 20230117213630.2897570-3-John.C.Harrison@Intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Allow error capture without a request / on reset failure | expand |
On 17/01/2023 21:36, John.C.Harrison@Intel.com wrote: > From: John Harrison <John.C.Harrison@Intel.com> > > There was a report of error captures occurring without any hung > context being indicated despite the capture being initiated by a 'hung > context notification' from GuC. The problem was not reproducible. > However, it is possible to happen if the context in question has no > active requests. For example, if the hang was in the context switch > itself then the breadcrumb write would have occurred and the KMD would > see an idle context. > > In the interests of attempting to provide as much information as > possible about a hang, it seems wise to include the engine info > regardless of whether a request was found or not. As opposed to just > prentending there was no hang at all. > > So update the error capture code to always record engine information > if an engine is given. Which means updating record_context() to take a > context instead of a request (which it only ever used to find the > context anyway). And split the request agnostic parts of > intel_engine_coredump_add_request() out into a seaprate function. > > v2: Remove a duplicate 'if' statement (Umesh) and fix a put of a null > pointer. > v3: Tidy up request locking code flow (Tvrtko) > > Signed-off-by: John Harrison <John.C.Harrison@Intel.com> > Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com> > --- > drivers/gpu/drm/i915/i915_gpu_error.c | 69 ++++++++++++++++++--------- > 1 file changed, 46 insertions(+), 23 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c > index 4107a0dfcca7d..461489d599a7e 100644 > --- a/drivers/gpu/drm/i915/i915_gpu_error.c > +++ b/drivers/gpu/drm/i915/i915_gpu_error.c > @@ -1370,14 +1370,14 @@ static void engine_record_execlists(struct intel_engine_coredump *ee) > } > > static bool record_context(struct i915_gem_context_coredump *e, > - const struct i915_request *rq) > + struct intel_context *ce) > { > struct i915_gem_context *ctx; > struct task_struct *task; > bool simulated; > > rcu_read_lock(); > - ctx = rcu_dereference(rq->context->gem_context); > + ctx = rcu_dereference(ce->gem_context); > if (ctx && !kref_get_unless_zero(&ctx->ref)) > ctx = NULL; > rcu_read_unlock(); > @@ -1396,8 +1396,8 @@ static bool record_context(struct i915_gem_context_coredump *e, > e->guilty = atomic_read(&ctx->guilty_count); > e->active = atomic_read(&ctx->active_count); > > - e->total_runtime = intel_context_get_total_runtime_ns(rq->context); > - e->avg_runtime = intel_context_get_avg_runtime_ns(rq->context); > + e->total_runtime = intel_context_get_total_runtime_ns(ce); > + e->avg_runtime = intel_context_get_avg_runtime_ns(ce); > > simulated = i915_gem_context_no_error_capture(ctx); > > @@ -1532,15 +1532,37 @@ intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp, u32 dump_ > return ee; > } > > +static struct intel_engine_capture_vma * > +engine_coredump_add_context(struct intel_engine_coredump *ee, > + struct intel_context *ce, > + gfp_t gfp) > +{ > + struct intel_engine_capture_vma *vma = NULL; > + > + ee->simulated |= record_context(&ee->context, ce); > + if (ee->simulated) > + return NULL; > + > + /* > + * We need to copy these to an anonymous buffer > + * as the simplest method to avoid being overwritten > + * by userspace. > + */ > + vma = capture_vma(vma, ce->ring->vma, "ring", gfp); > + vma = capture_vma(vma, ce->state, "HW context", gfp); > + > + return vma; > +} > + > struct intel_engine_capture_vma * > intel_engine_coredump_add_request(struct intel_engine_coredump *ee, > struct i915_request *rq, > gfp_t gfp) > { > - struct intel_engine_capture_vma *vma = NULL; > + struct intel_engine_capture_vma *vma; > > - ee->simulated |= record_context(&ee->context, rq); > - if (ee->simulated) > + vma = engine_coredump_add_context(ee, rq->context, gfp); > + if (!vma) > return NULL; > > /* > @@ -1550,8 +1572,6 @@ intel_engine_coredump_add_request(struct intel_engine_coredump *ee, > */ > vma = capture_vma_snapshot(vma, rq->batch_res, gfp, "batch"); > vma = capture_user(vma, rq, gfp); > - vma = capture_vma(vma, rq->ring->vma, "ring", gfp); > - vma = capture_vma(vma, rq->context->state, "HW context", gfp); > > ee->rq_head = rq->head; > ee->rq_post = rq->postfix; > @@ -1609,8 +1629,12 @@ capture_engine(struct intel_engine_cs *engine, > intel_engine_clear_hung_context(engine); > /* This will reference count the request (if found) */ > rq = intel_context_find_active_request(ce); > - if (!rq || !i915_request_started(rq)) > - goto no_request_capture; > + if (rq && !i915_request_started(rq)) { > + drm_info(&engine->gt->i915->drm, "Got hung context on %s with no active request!\n", > + engine->name); > + i915_request_put(rq); > + rq = NULL; > + } > } else { > /* > * Getting here with GuC enabled means it is a forced error capture > @@ -1624,25 +1648,24 @@ capture_engine(struct intel_engine_cs *engine, > flags); > } > } > - if (!rq) > - goto no_request_capture; > - > - capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL); > - if (!capture) { > + if (rq) { > + capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL); > i915_request_put(rq); > - goto no_request_capture; > + } else if (ce) { > + capture = engine_coredump_add_context(ee, ce, ATOMIC_MAYFAIL); > } > + > if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE) > intel_guc_capture_get_matching_node(engine->gt, ee, ce); > > - intel_engine_coredump_add_vma(ee, capture, compress); > - i915_request_put(rq); > + if (capture) { > + intel_engine_coredump_add_vma(ee, capture, compress); > + } else { > + kfree(ee); > + ee = NULL; > + } > > return ee; > - > -no_request_capture: > - kfree(ee); > - return NULL; > } > > static void LGTM - regardless of how i915_request_get_rcu flow ends up: Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Regards, Tvrtko
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 4107a0dfcca7d..461489d599a7e 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -1370,14 +1370,14 @@ static void engine_record_execlists(struct intel_engine_coredump *ee) } static bool record_context(struct i915_gem_context_coredump *e, - const struct i915_request *rq) + struct intel_context *ce) { struct i915_gem_context *ctx; struct task_struct *task; bool simulated; rcu_read_lock(); - ctx = rcu_dereference(rq->context->gem_context); + ctx = rcu_dereference(ce->gem_context); if (ctx && !kref_get_unless_zero(&ctx->ref)) ctx = NULL; rcu_read_unlock(); @@ -1396,8 +1396,8 @@ static bool record_context(struct i915_gem_context_coredump *e, e->guilty = atomic_read(&ctx->guilty_count); e->active = atomic_read(&ctx->active_count); - e->total_runtime = intel_context_get_total_runtime_ns(rq->context); - e->avg_runtime = intel_context_get_avg_runtime_ns(rq->context); + e->total_runtime = intel_context_get_total_runtime_ns(ce); + e->avg_runtime = intel_context_get_avg_runtime_ns(ce); simulated = i915_gem_context_no_error_capture(ctx); @@ -1532,15 +1532,37 @@ intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp, u32 dump_ return ee; } +static struct intel_engine_capture_vma * +engine_coredump_add_context(struct intel_engine_coredump *ee, + struct intel_context *ce, + gfp_t gfp) +{ + struct intel_engine_capture_vma *vma = NULL; + + ee->simulated |= record_context(&ee->context, ce); + if (ee->simulated) + return NULL; + + /* + * We need to copy these to an anonymous buffer + * as the simplest method to avoid being overwritten + * by userspace. + */ + vma = capture_vma(vma, ce->ring->vma, "ring", gfp); + vma = capture_vma(vma, ce->state, "HW context", gfp); + + return vma; +} + struct intel_engine_capture_vma * intel_engine_coredump_add_request(struct intel_engine_coredump *ee, struct i915_request *rq, gfp_t gfp) { - struct intel_engine_capture_vma *vma = NULL; + struct intel_engine_capture_vma *vma; - ee->simulated |= record_context(&ee->context, rq); - if (ee->simulated) + vma = engine_coredump_add_context(ee, rq->context, gfp); + if (!vma) return NULL; /* @@ -1550,8 +1572,6 @@ intel_engine_coredump_add_request(struct intel_engine_coredump *ee, */ vma = capture_vma_snapshot(vma, rq->batch_res, gfp, "batch"); vma = capture_user(vma, rq, gfp); - vma = capture_vma(vma, rq->ring->vma, "ring", gfp); - vma = capture_vma(vma, rq->context->state, "HW context", gfp); ee->rq_head = rq->head; ee->rq_post = rq->postfix; @@ -1609,8 +1629,12 @@ capture_engine(struct intel_engine_cs *engine, intel_engine_clear_hung_context(engine); /* This will reference count the request (if found) */ rq = intel_context_find_active_request(ce); - if (!rq || !i915_request_started(rq)) - goto no_request_capture; + if (rq && !i915_request_started(rq)) { + drm_info(&engine->gt->i915->drm, "Got hung context on %s with no active request!\n", + engine->name); + i915_request_put(rq); + rq = NULL; + } } else { /* * Getting here with GuC enabled means it is a forced error capture @@ -1624,25 +1648,24 @@ capture_engine(struct intel_engine_cs *engine, flags); } } - if (!rq) - goto no_request_capture; - - capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL); - if (!capture) { + if (rq) { + capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL); i915_request_put(rq); - goto no_request_capture; + } else if (ce) { + capture = engine_coredump_add_context(ee, ce, ATOMIC_MAYFAIL); } + if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE) intel_guc_capture_get_matching_node(engine->gt, ee, ce); - intel_engine_coredump_add_vma(ee, capture, compress); - i915_request_put(rq); + if (capture) { + intel_engine_coredump_add_vma(ee, capture, compress); + } else { + kfree(ee); + ee = NULL; + } return ee; - -no_request_capture: - kfree(ee); - return NULL; } static void