Message ID | 20221129211253.3183480-2-John.C.Harrison@Intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Allow error capture without a request / on reset failure | expand |
On Tue, Nov 29, 2022 at 01:12:52PM -0800, John.C.Harrison@Intel.com wrote: >From: John Harrison <John.C.Harrison@Intel.com> > >There was a report of error captures occurring without any hung >context being indicated despite the capture being initiated by a 'hung >context notification' from GuC. The problem was not reproducible. >However, it is possible to happen if the context in question has no >active requests. For example, if the hang was in the context switch >itself then the breadcrumb write would have occurred and the KMD would >see an idle context. > >In the interests of attempting to provide as much information as >possible about a hang, it seems wise to include the engine info >regardless of whether a request was found or not. As opposed to just >prentending there was no hang at all. > >So update the error capture code to always record engine information >if an engine is given. Which means updating record_context() to take a >context instead of a request (which it only ever used to find the >context anyway). And split the request agnostic parts of >intel_engine_coredump_add_request() out into a seaprate function. > >Signed-off-by: John Harrison <John.C.Harrison@Intel.com> >--- > drivers/gpu/drm/i915/i915_gpu_error.c | 55 +++++++++++++++++++-------- > 1 file changed, 40 insertions(+), 15 deletions(-) > >diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c >index 9d5d5a397b64e..2ed1c84c9fab4 100644 >--- a/drivers/gpu/drm/i915/i915_gpu_error.c >+++ b/drivers/gpu/drm/i915/i915_gpu_error.c >@@ -1370,14 +1370,14 @@ static void engine_record_execlists(struct intel_engine_coredump *ee) > } > > static bool record_context(struct i915_gem_context_coredump *e, >- const struct i915_request *rq) >+ struct intel_context *ce) > { > struct i915_gem_context *ctx; > struct task_struct *task; > bool simulated; > > rcu_read_lock(); >- ctx = rcu_dereference(rq->context->gem_context); >+ ctx = rcu_dereference(ce->gem_context); > if (ctx && !kref_get_unless_zero(&ctx->ref)) > ctx = NULL; > rcu_read_unlock(); >@@ -1396,8 +1396,8 @@ static bool record_context(struct i915_gem_context_coredump *e, > e->guilty = atomic_read(&ctx->guilty_count); > e->active = atomic_read(&ctx->active_count); > >- e->total_runtime = intel_context_get_total_runtime_ns(rq->context); >- e->avg_runtime = intel_context_get_avg_runtime_ns(rq->context); >+ e->total_runtime = intel_context_get_total_runtime_ns(ce); >+ e->avg_runtime = intel_context_get_avg_runtime_ns(ce); > > simulated = i915_gem_context_no_error_capture(ctx); > >@@ -1532,15 +1532,37 @@ intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp, u32 dump_ > return ee; > } > >+static struct intel_engine_capture_vma * >+engine_coredump_add_context(struct intel_engine_coredump *ee, >+ struct intel_context *ce, >+ gfp_t gfp) >+{ >+ struct intel_engine_capture_vma *vma = NULL; >+ >+ ee->simulated |= record_context(&ee->context, ce); >+ if (ee->simulated) >+ return NULL; >+ >+ /* >+ * We need to copy these to an anonymous buffer >+ * as the simplest method to avoid being overwritten >+ * by userspace. >+ */ >+ vma = capture_vma(vma, ce->ring->vma, "ring", gfp); >+ vma = capture_vma(vma, ce->state, "HW context", gfp); >+ >+ return vma; >+} >+ > struct intel_engine_capture_vma * > intel_engine_coredump_add_request(struct intel_engine_coredump *ee, > struct i915_request *rq, > gfp_t gfp) > { >- struct intel_engine_capture_vma *vma = NULL; >+ struct intel_engine_capture_vma *vma; > >- ee->simulated |= record_context(&ee->context, rq); >- if (ee->simulated) >+ vma = engine_coredump_add_context(ee, rq->context, gfp); >+ if (!vma) > return NULL; > > /* >@@ -1550,8 +1572,6 @@ intel_engine_coredump_add_request(struct intel_engine_coredump *ee, > */ > vma = capture_vma_snapshot(vma, rq->batch_res, gfp, "batch"); > vma = capture_user(vma, rq, gfp); >- vma = capture_vma(vma, rq->ring->vma, "ring", gfp); >- vma = capture_vma(vma, rq->context->state, "HW context", gfp); > > ee->rq_head = rq->head; > ee->rq_post = rq->postfix; >@@ -1608,8 +1628,11 @@ capture_engine(struct intel_engine_cs *engine, > if (ce) { > intel_engine_clear_hung_context(engine); > rq = intel_context_find_active_request(ce); >- if (!rq || !i915_request_started(rq)) >- goto no_request_capture; >+ if (rq && !i915_request_started(rq)) { >+ drm_info(&engine->gt->i915->drm, "Got hung context on %s with no active request!\n", >+ engine->name); >+ rq = NULL; >+ } > } else { > /* > * Getting here with GuC enabled means it is a forced error capture >@@ -1625,12 +1648,14 @@ capture_engine(struct intel_engine_cs *engine, > if (rq) > rq = i915_request_get_rcu(rq); > >- if (!rq) >- goto no_request_capture; >+ if (rq) >+ capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL); 2 back-to-back if (rq) could merge together, otherwise, lgtm Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com> Umesh >+ else if (ce) >+ capture = engine_coredump_add_context(ee, ce, ATOMIC_MAYFAIL); > >- capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL); > if (!capture) { >- i915_request_put(rq); >+ if (rq) >+ i915_request_put(rq); > goto no_request_capture; > } > if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE) >-- >2.37.3 >
On 12/12/2022 17:52, Umesh Nerlige Ramappa wrote: > On Tue, Nov 29, 2022 at 01:12:52PM -0800, John.C.Harrison@Intel.com > wrote: >> From: John Harrison <John.C.Harrison@Intel.com> >> >> There was a report of error captures occurring without any hung >> context being indicated despite the capture being initiated by a 'hung >> context notification' from GuC. The problem was not reproducible. >> However, it is possible to happen if the context in question has no >> active requests. For example, if the hang was in the context switch >> itself then the breadcrumb write would have occurred and the KMD would >> see an idle context. >> >> In the interests of attempting to provide as much information as >> possible about a hang, it seems wise to include the engine info >> regardless of whether a request was found or not. As opposed to just >> prentending there was no hang at all. >> >> So update the error capture code to always record engine information >> if an engine is given. Which means updating record_context() to take a >> context instead of a request (which it only ever used to find the >> context anyway). And split the request agnostic parts of >> intel_engine_coredump_add_request() out into a seaprate function. >> >> Signed-off-by: John Harrison <John.C.Harrison@Intel.com> >> --- >> drivers/gpu/drm/i915/i915_gpu_error.c | 55 +++++++++++++++++++-------- >> 1 file changed, 40 insertions(+), 15 deletions(-) >> >> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c >> b/drivers/gpu/drm/i915/i915_gpu_error.c >> index 9d5d5a397b64e..2ed1c84c9fab4 100644 >> --- a/drivers/gpu/drm/i915/i915_gpu_error.c >> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c >> @@ -1370,14 +1370,14 @@ static void engine_record_execlists(struct >> intel_engine_coredump *ee) >> } >> >> static bool record_context(struct i915_gem_context_coredump *e, >> - const struct i915_request *rq) >> + struct intel_context *ce) >> { >> struct i915_gem_context *ctx; >> struct task_struct *task; >> bool simulated; >> >> rcu_read_lock(); >> - ctx = rcu_dereference(rq->context->gem_context); >> + ctx = rcu_dereference(ce->gem_context); >> if (ctx && !kref_get_unless_zero(&ctx->ref)) >> ctx = NULL; >> rcu_read_unlock(); >> @@ -1396,8 +1396,8 @@ static bool record_context(struct >> i915_gem_context_coredump *e, >> e->guilty = atomic_read(&ctx->guilty_count); >> e->active = atomic_read(&ctx->active_count); >> >> - e->total_runtime = intel_context_get_total_runtime_ns(rq->context); >> - e->avg_runtime = intel_context_get_avg_runtime_ns(rq->context); >> + e->total_runtime = intel_context_get_total_runtime_ns(ce); >> + e->avg_runtime = intel_context_get_avg_runtime_ns(ce); >> >> simulated = i915_gem_context_no_error_capture(ctx); >> >> @@ -1532,15 +1532,37 @@ intel_engine_coredump_alloc(struct >> intel_engine_cs *engine, gfp_t gfp, u32 dump_ >> return ee; >> } >> >> +static struct intel_engine_capture_vma * >> +engine_coredump_add_context(struct intel_engine_coredump *ee, >> + struct intel_context *ce, >> + gfp_t gfp) >> +{ >> + struct intel_engine_capture_vma *vma = NULL; >> + >> + ee->simulated |= record_context(&ee->context, ce); >> + if (ee->simulated) >> + return NULL; >> + >> + /* >> + * We need to copy these to an anonymous buffer >> + * as the simplest method to avoid being overwritten >> + * by userspace. >> + */ >> + vma = capture_vma(vma, ce->ring->vma, "ring", gfp); >> + vma = capture_vma(vma, ce->state, "HW context", gfp); >> + >> + return vma; >> +} >> + >> struct intel_engine_capture_vma * >> intel_engine_coredump_add_request(struct intel_engine_coredump *ee, >> struct i915_request *rq, >> gfp_t gfp) >> { >> - struct intel_engine_capture_vma *vma = NULL; >> + struct intel_engine_capture_vma *vma; >> >> - ee->simulated |= record_context(&ee->context, rq); >> - if (ee->simulated) >> + vma = engine_coredump_add_context(ee, rq->context, gfp); >> + if (!vma) >> return NULL; >> >> /* >> @@ -1550,8 +1572,6 @@ intel_engine_coredump_add_request(struct >> intel_engine_coredump *ee, >> */ >> vma = capture_vma_snapshot(vma, rq->batch_res, gfp, "batch"); >> vma = capture_user(vma, rq, gfp); >> - vma = capture_vma(vma, rq->ring->vma, "ring", gfp); >> - vma = capture_vma(vma, rq->context->state, "HW context", gfp); >> >> ee->rq_head = rq->head; >> ee->rq_post = rq->postfix; >> @@ -1608,8 +1628,11 @@ capture_engine(struct intel_engine_cs *engine, >> if (ce) { >> intel_engine_clear_hung_context(engine); >> rq = intel_context_find_active_request(ce); >> - if (!rq || !i915_request_started(rq)) >> - goto no_request_capture; >> + if (rq && !i915_request_started(rq)) { >> + drm_info(&engine->gt->i915->drm, "Got hung context on %s >> with no active request!\n", >> + engine->name); >> + rq = NULL; >> + } >> } else { >> /* >> * Getting here with GuC enabled means it is a forced error >> capture >> @@ -1625,12 +1648,14 @@ capture_engine(struct intel_engine_cs *engine, >> if (rq) >> rq = i915_request_get_rcu(rq); >> >> - if (!rq) >> - goto no_request_capture; >> + if (rq) >> + capture = intel_engine_coredump_add_request(ee, rq, >> ATOMIC_MAYFAIL); > > 2 back-to-back if (rq) could merge together, > > > otherwise, lgtm > > Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com> > > Umesh There was actually an unconditional put of rq just after the block below. So will need to repost to fix that anyway. Not sure how it didn't manage to bang when testing captures with the rq forced to null!? John. >> + else if (ce) >> + capture = engine_coredump_add_context(ee, ce, ATOMIC_MAYFAIL); >> >> - capture = intel_engine_coredump_add_request(ee, rq, >> ATOMIC_MAYFAIL); >> if (!capture) { >> - i915_request_put(rq); >> + if (rq) >> + i915_request_put(rq); >> goto no_request_capture; >> } >> if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE) >> -- >> 2.37.3 >>
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 9d5d5a397b64e..2ed1c84c9fab4 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -1370,14 +1370,14 @@ static void engine_record_execlists(struct intel_engine_coredump *ee) } static bool record_context(struct i915_gem_context_coredump *e, - const struct i915_request *rq) + struct intel_context *ce) { struct i915_gem_context *ctx; struct task_struct *task; bool simulated; rcu_read_lock(); - ctx = rcu_dereference(rq->context->gem_context); + ctx = rcu_dereference(ce->gem_context); if (ctx && !kref_get_unless_zero(&ctx->ref)) ctx = NULL; rcu_read_unlock(); @@ -1396,8 +1396,8 @@ static bool record_context(struct i915_gem_context_coredump *e, e->guilty = atomic_read(&ctx->guilty_count); e->active = atomic_read(&ctx->active_count); - e->total_runtime = intel_context_get_total_runtime_ns(rq->context); - e->avg_runtime = intel_context_get_avg_runtime_ns(rq->context); + e->total_runtime = intel_context_get_total_runtime_ns(ce); + e->avg_runtime = intel_context_get_avg_runtime_ns(ce); simulated = i915_gem_context_no_error_capture(ctx); @@ -1532,15 +1532,37 @@ intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp, u32 dump_ return ee; } +static struct intel_engine_capture_vma * +engine_coredump_add_context(struct intel_engine_coredump *ee, + struct intel_context *ce, + gfp_t gfp) +{ + struct intel_engine_capture_vma *vma = NULL; + + ee->simulated |= record_context(&ee->context, ce); + if (ee->simulated) + return NULL; + + /* + * We need to copy these to an anonymous buffer + * as the simplest method to avoid being overwritten + * by userspace. + */ + vma = capture_vma(vma, ce->ring->vma, "ring", gfp); + vma = capture_vma(vma, ce->state, "HW context", gfp); + + return vma; +} + struct intel_engine_capture_vma * intel_engine_coredump_add_request(struct intel_engine_coredump *ee, struct i915_request *rq, gfp_t gfp) { - struct intel_engine_capture_vma *vma = NULL; + struct intel_engine_capture_vma *vma; - ee->simulated |= record_context(&ee->context, rq); - if (ee->simulated) + vma = engine_coredump_add_context(ee, rq->context, gfp); + if (!vma) return NULL; /* @@ -1550,8 +1572,6 @@ intel_engine_coredump_add_request(struct intel_engine_coredump *ee, */ vma = capture_vma_snapshot(vma, rq->batch_res, gfp, "batch"); vma = capture_user(vma, rq, gfp); - vma = capture_vma(vma, rq->ring->vma, "ring", gfp); - vma = capture_vma(vma, rq->context->state, "HW context", gfp); ee->rq_head = rq->head; ee->rq_post = rq->postfix; @@ -1608,8 +1628,11 @@ capture_engine(struct intel_engine_cs *engine, if (ce) { intel_engine_clear_hung_context(engine); rq = intel_context_find_active_request(ce); - if (!rq || !i915_request_started(rq)) - goto no_request_capture; + if (rq && !i915_request_started(rq)) { + drm_info(&engine->gt->i915->drm, "Got hung context on %s with no active request!\n", + engine->name); + rq = NULL; + } } else { /* * Getting here with GuC enabled means it is a forced error capture @@ -1625,12 +1648,14 @@ capture_engine(struct intel_engine_cs *engine, if (rq) rq = i915_request_get_rcu(rq); - if (!rq) - goto no_request_capture; + if (rq) + capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL); + else if (ce) + capture = engine_coredump_add_context(ee, ce, ATOMIC_MAYFAIL); - capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL); if (!capture) { - i915_request_put(rq); + if (rq) + i915_request_put(rq); goto no_request_capture; } if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)