Message ID | 20220728022028.2190627-5-John.C.Harrison@Intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Fixes and improvements to GuC logging and error capture | expand |
One minor NIT (though i hope it could be fixed otw in as it adds a bit of ease-of-log-readibility). That said, everything else looks good. Reviewed-by: Alan Previn <alan.previn.teres.alexis@intel.com> On Wed, 2022-07-27 at 19:20 -0700, John.C.Harrison@Intel.com wrote: > From: John Harrison <John.C.Harrison@Intel.com> > > When debugging GuC communication issues, it is useful to have the CTB > info available. So add the state and buffer contents to the error > capture log. > > Also, add a sub-structure for the GuC specific error capture info as > it is now becoming numerous. > > Signed-off-by: John Harrison <John.C.Harrison@Intel.com> > --- > drivers/gpu/drm/i915/i915_gpu_error.c | 59 +++++++++++++++++++++++---- > drivers/gpu/drm/i915/i915_gpu_error.h | 20 +++++++-- > 2 files changed, 67 insertions(+), 12 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c > index addba75252343..543ba63f958ea 100644 > --- a/drivers/gpu/drm/i915/i915_gpu_error.c > +++ b/drivers/gpu/drm/i915/i915_gpu_error.c > @@ -671,6 +671,18 @@ static void err_print_pciid(struct drm_i915_error_state_buf *m, > pdev->subsystem_device); > } > > +static void err_print_guc_ctb(struct drm_i915_error_state_buf *m, > + const char *name, > + const struct intel_ctb_coredump *ctb) > +{ > + if (!ctb->size) > + return; > + > + err_printf(m, "GuC %s CTB: raw: 0x%08X, 0x%08X/%08X, cached: 0x%08X/%08X, desc = 0x%08X, buf = 0x%08X x 0x%08X\n", > + name, ctb->raw_status, ctb->raw_head, ctb->raw_tail, > + ctb->head, ctb->tail, ctb->desc_offset, ctb->cmds_offset, ctb->size); > NIT: to make it more readible on first glance, would be nice to add more descriptive text like "raw: Sts:0x%08X, Hd:0x%08X,Tl:0x@08X..." also, the not sure why cmds_offset is presented with a "x size" as opposed to just "desc-off = foo1, cmd-off = foo2, size = foo3"? > +} > + > static void err_print_uc(struct drm_i915_error_state_buf *m, > const struct intel_uc_coredump *error_uc) > { > @@ -678,8 +690,12 @@ static void err_print_uc(struct drm_i915_error_state_buf *m, > > intel_uc_fw_dump(&error_uc->guc_fw, &p); > intel_uc_fw_dump(&error_uc->huc_fw, &p); > - err_printf(m, "GuC timestamp: 0x%08x\n", error_uc->timestamp); > - intel_gpu_error_print_vma(m, NULL, error_uc->guc_log); > + err_printf(m, "GuC timestamp: 0x%08x\n", error_uc->guc.timestamp); > + intel_gpu_error_print_vma(m, NULL, error_uc->guc.vma_log); > + err_printf(m, "GuC CTB fence: %d\n", error_uc->guc.last_fence); > + err_print_guc_ctb(m, "Send", error_uc->guc.ctb + 0); > + err_print_guc_ctb(m, "Recv", error_uc->guc.ctb + 1); > + intel_gpu_error_print_vma(m, NULL, error_uc->guc.vma_ctb); > } > > static void err_free_sgl(struct scatterlist *sgl) > @@ -854,7 +870,7 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, > if (error->gt) { > bool print_guc_capture = false; > > - if (error->gt->uc && error->gt->uc->is_guc_capture) > + if (error->gt->uc && error->gt->uc->guc.is_guc_capture) > print_guc_capture = true; > > err_print_gt_display(m, error->gt); > @@ -1009,7 +1025,8 @@ static void cleanup_uc(struct intel_uc_coredump *uc) > { > kfree(uc->guc_fw.path); > kfree(uc->huc_fw.path); > - i915_vma_coredump_free(uc->guc_log); > + i915_vma_coredump_free(uc->guc.vma_log); > + i915_vma_coredump_free(uc->guc.vma_ctb); > > kfree(uc); > } > @@ -1658,6 +1675,23 @@ gt_record_engines(struct intel_gt_coredump *gt, > } > } > > +static void gt_record_guc_ctb(struct intel_ctb_coredump *saved, > + const struct intel_guc_ct_buffer *ctb, > + const void *blob_ptr, struct intel_guc *guc) > +{ > + if (!ctb || !ctb->desc) > + return; > + > + saved->raw_status = ctb->desc->status; > + saved->raw_head = ctb->desc->head; > + saved->raw_tail = ctb->desc->tail; > + saved->head = ctb->head; > + saved->tail = ctb->tail; > + saved->size = ctb->size; > + saved->desc_offset = ((void *)ctb->desc) - blob_ptr; > + saved->cmds_offset = ((void *)ctb->cmds) - blob_ptr; > +} > + > static struct intel_uc_coredump * > gt_record_uc(struct intel_gt_coredump *gt, > struct i915_vma_compress *compress) > @@ -1684,9 +1718,16 @@ gt_record_uc(struct intel_gt_coredump *gt, > * log times to system times (in conjunction with the error->boottime and > * gt->clock_frequency fields saved elsewhere). > */ > - error_uc->timestamp = intel_uncore_read(gt->_gt->uncore, GUCPMTIMESTAMP); > - error_uc->guc_log = create_vma_coredump(gt->_gt, uc->guc.log.vma, > - "GuC log buffer", compress); > + error_uc->guc.timestamp = intel_uncore_read(gt->_gt->uncore, GUCPMTIMESTAMP); > + error_uc->guc.vma_log = create_vma_coredump(gt->_gt, uc->guc.log.vma, > + "GuC log buffer", compress); > + error_uc->guc.vma_ctb = create_vma_coredump(gt->_gt, uc->guc.ct.vma, > + "GuC CT buffer", compress); > + error_uc->guc.last_fence = uc->guc.ct.requests.last_fence; > + gt_record_guc_ctb(error_uc->guc.ctb + 0, &uc->guc.ct.ctbs.send, > + uc->guc.ct.ctbs.send.desc, (struct intel_guc *)&uc->guc); > + gt_record_guc_ctb(error_uc->guc.ctb + 1, &uc->guc.ct.ctbs.recv, > + uc->guc.ct.ctbs.send.desc, (struct intel_guc *)&uc->guc); > > return error_uc; > } > @@ -2039,9 +2080,9 @@ __i915_gpu_coredump(struct intel_gt *gt, intel_engine_mask_t engine_mask, u32 du > error->gt->uc = gt_record_uc(error->gt, compress); > if (error->gt->uc) { > if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE) > - error->gt->uc->is_guc_capture = true; > + error->gt->uc->guc.is_guc_capture = true; > else > - GEM_BUG_ON(error->gt->uc->is_guc_capture); > + GEM_BUG_ON(error->gt->uc->guc.is_guc_capture); > } > } > > diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h > index d8a8b3d529e09..efc75cc2ffdb9 100644 > --- a/drivers/gpu/drm/i915/i915_gpu_error.h > +++ b/drivers/gpu/drm/i915/i915_gpu_error.h > @@ -125,6 +125,15 @@ struct intel_engine_coredump { > struct intel_engine_coredump *next; > }; > > +struct intel_ctb_coredump { > + u32 raw_head, head; > + u32 raw_tail, tail; > + u32 raw_status; > + u32 desc_offset; > + u32 cmds_offset; > + u32 size; > +}; > + > struct intel_gt_coredump { > const struct intel_gt *_gt; > bool awake; > @@ -165,9 +174,14 @@ struct intel_gt_coredump { > struct intel_uc_coredump { > struct intel_uc_fw guc_fw; > struct intel_uc_fw huc_fw; > - struct i915_vma_coredump *guc_log; > - u32 timestamp; > - bool is_guc_capture; > + struct guc_info { > + struct intel_ctb_coredump ctb[2]; > + struct i915_vma_coredump *vma_ctb; > + struct i915_vma_coredump *vma_log; > + u32 timestamp; > + u16 last_fence; > + bool is_guc_capture; > + } guc; > } *uc; > > struct intel_gt_coredump *next; > -- > 2.37.1 >
On 8/2/2022 11:27, Teres Alexis, Alan Previn wrote: > One minor NIT (though i hope it could be fixed otw in as it adds a bit of ease-of-log-readibility). > That said, everything else looks good. > > Reviewed-by: Alan Previn <alan.previn.teres.alexis@intel.com> > > On Wed, 2022-07-27 at 19:20 -0700, John.C.Harrison@Intel.com wrote: >> From: John Harrison <John.C.Harrison@Intel.com> >> >> When debugging GuC communication issues, it is useful to have the CTB >> info available. So add the state and buffer contents to the error >> capture log. >> >> Also, add a sub-structure for the GuC specific error capture info as >> it is now becoming numerous. >> >> Signed-off-by: John Harrison <John.C.Harrison@Intel.com> >> --- >> drivers/gpu/drm/i915/i915_gpu_error.c | 59 +++++++++++++++++++++++---- >> drivers/gpu/drm/i915/i915_gpu_error.h | 20 +++++++-- >> 2 files changed, 67 insertions(+), 12 deletions(-) >> >> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c >> index addba75252343..543ba63f958ea 100644 >> --- a/drivers/gpu/drm/i915/i915_gpu_error.c >> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c >> @@ -671,6 +671,18 @@ static void err_print_pciid(struct drm_i915_error_state_buf *m, >> pdev->subsystem_device); >> } >> >> +static void err_print_guc_ctb(struct drm_i915_error_state_buf *m, >> + const char *name, >> + const struct intel_ctb_coredump *ctb) >> +{ >> + if (!ctb->size) >> + return; >> + >> + err_printf(m, "GuC %s CTB: raw: 0x%08X, 0x%08X/%08X, cached: 0x%08X/%08X, desc = 0x%08X, buf = 0x%08X x 0x%08X\n", >> + name, ctb->raw_status, ctb->raw_head, ctb->raw_tail, >> + ctb->head, ctb->tail, ctb->desc_offset, ctb->cmds_offset, ctb->size); >> > NIT: to make it more readible on first glance, would be nice to add more descriptive text like "raw: Sts:0x%08X, > Hd:0x%08X,Tl:0x@08X..." also, the not sure why cmds_offset is presented with a "x size" as opposed to just "desc-off = > foo1, cmd-off = foo2, size = foo3"? The line is long enough as it is. I'd rather not make it even longer. Same for '<name>: <address> x <size>' rather than '<name> _addr = <address>, <name>_size = <size>'. It's useful for readability to keep a single CTB channel on a single line but not if that line is excessively long. John. >> +} >> + >> static void err_print_uc(struct drm_i915_error_state_buf *m, >> const struct intel_uc_coredump *error_uc) >> { >> @@ -678,8 +690,12 @@ static void err_print_uc(struct drm_i915_error_state_buf *m, >> >> intel_uc_fw_dump(&error_uc->guc_fw, &p); >> intel_uc_fw_dump(&error_uc->huc_fw, &p); >> - err_printf(m, "GuC timestamp: 0x%08x\n", error_uc->timestamp); >> - intel_gpu_error_print_vma(m, NULL, error_uc->guc_log); >> + err_printf(m, "GuC timestamp: 0x%08x\n", error_uc->guc.timestamp); >> + intel_gpu_error_print_vma(m, NULL, error_uc->guc.vma_log); >> + err_printf(m, "GuC CTB fence: %d\n", error_uc->guc.last_fence); >> + err_print_guc_ctb(m, "Send", error_uc->guc.ctb + 0); >> + err_print_guc_ctb(m, "Recv", error_uc->guc.ctb + 1); >> + intel_gpu_error_print_vma(m, NULL, error_uc->guc.vma_ctb); >> } >> >> static void err_free_sgl(struct scatterlist *sgl) >> @@ -854,7 +870,7 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, >> if (error->gt) { >> bool print_guc_capture = false; >> >> - if (error->gt->uc && error->gt->uc->is_guc_capture) >> + if (error->gt->uc && error->gt->uc->guc.is_guc_capture) >> print_guc_capture = true; >> >> err_print_gt_display(m, error->gt); >> @@ -1009,7 +1025,8 @@ static void cleanup_uc(struct intel_uc_coredump *uc) >> { >> kfree(uc->guc_fw.path); >> kfree(uc->huc_fw.path); >> - i915_vma_coredump_free(uc->guc_log); >> + i915_vma_coredump_free(uc->guc.vma_log); >> + i915_vma_coredump_free(uc->guc.vma_ctb); >> >> kfree(uc); >> } >> @@ -1658,6 +1675,23 @@ gt_record_engines(struct intel_gt_coredump *gt, >> } >> } >> >> +static void gt_record_guc_ctb(struct intel_ctb_coredump *saved, >> + const struct intel_guc_ct_buffer *ctb, >> + const void *blob_ptr, struct intel_guc *guc) >> +{ >> + if (!ctb || !ctb->desc) >> + return; >> + >> + saved->raw_status = ctb->desc->status; >> + saved->raw_head = ctb->desc->head; >> + saved->raw_tail = ctb->desc->tail; >> + saved->head = ctb->head; >> + saved->tail = ctb->tail; >> + saved->size = ctb->size; >> + saved->desc_offset = ((void *)ctb->desc) - blob_ptr; >> + saved->cmds_offset = ((void *)ctb->cmds) - blob_ptr; >> +} >> + >> static struct intel_uc_coredump * >> gt_record_uc(struct intel_gt_coredump *gt, >> struct i915_vma_compress *compress) >> @@ -1684,9 +1718,16 @@ gt_record_uc(struct intel_gt_coredump *gt, >> * log times to system times (in conjunction with the error->boottime and >> * gt->clock_frequency fields saved elsewhere). >> */ >> - error_uc->timestamp = intel_uncore_read(gt->_gt->uncore, GUCPMTIMESTAMP); >> - error_uc->guc_log = create_vma_coredump(gt->_gt, uc->guc.log.vma, >> - "GuC log buffer", compress); >> + error_uc->guc.timestamp = intel_uncore_read(gt->_gt->uncore, GUCPMTIMESTAMP); >> + error_uc->guc.vma_log = create_vma_coredump(gt->_gt, uc->guc.log.vma, >> + "GuC log buffer", compress); >> + error_uc->guc.vma_ctb = create_vma_coredump(gt->_gt, uc->guc.ct.vma, >> + "GuC CT buffer", compress); >> + error_uc->guc.last_fence = uc->guc.ct.requests.last_fence; >> + gt_record_guc_ctb(error_uc->guc.ctb + 0, &uc->guc.ct.ctbs.send, >> + uc->guc.ct.ctbs.send.desc, (struct intel_guc *)&uc->guc); >> + gt_record_guc_ctb(error_uc->guc.ctb + 1, &uc->guc.ct.ctbs.recv, >> + uc->guc.ct.ctbs.send.desc, (struct intel_guc *)&uc->guc); >> >> return error_uc; >> } >> @@ -2039,9 +2080,9 @@ __i915_gpu_coredump(struct intel_gt *gt, intel_engine_mask_t engine_mask, u32 du >> error->gt->uc = gt_record_uc(error->gt, compress); >> if (error->gt->uc) { >> if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE) >> - error->gt->uc->is_guc_capture = true; >> + error->gt->uc->guc.is_guc_capture = true; >> else >> - GEM_BUG_ON(error->gt->uc->is_guc_capture); >> + GEM_BUG_ON(error->gt->uc->guc.is_guc_capture); >> } >> } >> >> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h >> index d8a8b3d529e09..efc75cc2ffdb9 100644 >> --- a/drivers/gpu/drm/i915/i915_gpu_error.h >> +++ b/drivers/gpu/drm/i915/i915_gpu_error.h >> @@ -125,6 +125,15 @@ struct intel_engine_coredump { >> struct intel_engine_coredump *next; >> }; >> >> +struct intel_ctb_coredump { >> + u32 raw_head, head; >> + u32 raw_tail, tail; >> + u32 raw_status; >> + u32 desc_offset; >> + u32 cmds_offset; >> + u32 size; >> +}; >> + >> struct intel_gt_coredump { >> const struct intel_gt *_gt; >> bool awake; >> @@ -165,9 +174,14 @@ struct intel_gt_coredump { >> struct intel_uc_coredump { >> struct intel_uc_fw guc_fw; >> struct intel_uc_fw huc_fw; >> - struct i915_vma_coredump *guc_log; >> - u32 timestamp; >> - bool is_guc_capture; >> + struct guc_info { >> + struct intel_ctb_coredump ctb[2]; >> + struct i915_vma_coredump *vma_ctb; >> + struct i915_vma_coredump *vma_log; >> + u32 timestamp; >> + u16 last_fence; >> + bool is_guc_capture; >> + } guc; >> } *uc; >> >> struct intel_gt_coredump *next; >> -- >> 2.37.1 >>
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index addba75252343..543ba63f958ea 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -671,6 +671,18 @@ static void err_print_pciid(struct drm_i915_error_state_buf *m, pdev->subsystem_device); } +static void err_print_guc_ctb(struct drm_i915_error_state_buf *m, + const char *name, + const struct intel_ctb_coredump *ctb) +{ + if (!ctb->size) + return; + + err_printf(m, "GuC %s CTB: raw: 0x%08X, 0x%08X/%08X, cached: 0x%08X/%08X, desc = 0x%08X, buf = 0x%08X x 0x%08X\n", + name, ctb->raw_status, ctb->raw_head, ctb->raw_tail, + ctb->head, ctb->tail, ctb->desc_offset, ctb->cmds_offset, ctb->size); +} + static void err_print_uc(struct drm_i915_error_state_buf *m, const struct intel_uc_coredump *error_uc) { @@ -678,8 +690,12 @@ static void err_print_uc(struct drm_i915_error_state_buf *m, intel_uc_fw_dump(&error_uc->guc_fw, &p); intel_uc_fw_dump(&error_uc->huc_fw, &p); - err_printf(m, "GuC timestamp: 0x%08x\n", error_uc->timestamp); - intel_gpu_error_print_vma(m, NULL, error_uc->guc_log); + err_printf(m, "GuC timestamp: 0x%08x\n", error_uc->guc.timestamp); + intel_gpu_error_print_vma(m, NULL, error_uc->guc.vma_log); + err_printf(m, "GuC CTB fence: %d\n", error_uc->guc.last_fence); + err_print_guc_ctb(m, "Send", error_uc->guc.ctb + 0); + err_print_guc_ctb(m, "Recv", error_uc->guc.ctb + 1); + intel_gpu_error_print_vma(m, NULL, error_uc->guc.vma_ctb); } static void err_free_sgl(struct scatterlist *sgl) @@ -854,7 +870,7 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, if (error->gt) { bool print_guc_capture = false; - if (error->gt->uc && error->gt->uc->is_guc_capture) + if (error->gt->uc && error->gt->uc->guc.is_guc_capture) print_guc_capture = true; err_print_gt_display(m, error->gt); @@ -1009,7 +1025,8 @@ static void cleanup_uc(struct intel_uc_coredump *uc) { kfree(uc->guc_fw.path); kfree(uc->huc_fw.path); - i915_vma_coredump_free(uc->guc_log); + i915_vma_coredump_free(uc->guc.vma_log); + i915_vma_coredump_free(uc->guc.vma_ctb); kfree(uc); } @@ -1658,6 +1675,23 @@ gt_record_engines(struct intel_gt_coredump *gt, } } +static void gt_record_guc_ctb(struct intel_ctb_coredump *saved, + const struct intel_guc_ct_buffer *ctb, + const void *blob_ptr, struct intel_guc *guc) +{ + if (!ctb || !ctb->desc) + return; + + saved->raw_status = ctb->desc->status; + saved->raw_head = ctb->desc->head; + saved->raw_tail = ctb->desc->tail; + saved->head = ctb->head; + saved->tail = ctb->tail; + saved->size = ctb->size; + saved->desc_offset = ((void *)ctb->desc) - blob_ptr; + saved->cmds_offset = ((void *)ctb->cmds) - blob_ptr; +} + static struct intel_uc_coredump * gt_record_uc(struct intel_gt_coredump *gt, struct i915_vma_compress *compress) @@ -1684,9 +1718,16 @@ gt_record_uc(struct intel_gt_coredump *gt, * log times to system times (in conjunction with the error->boottime and * gt->clock_frequency fields saved elsewhere). */ - error_uc->timestamp = intel_uncore_read(gt->_gt->uncore, GUCPMTIMESTAMP); - error_uc->guc_log = create_vma_coredump(gt->_gt, uc->guc.log.vma, - "GuC log buffer", compress); + error_uc->guc.timestamp = intel_uncore_read(gt->_gt->uncore, GUCPMTIMESTAMP); + error_uc->guc.vma_log = create_vma_coredump(gt->_gt, uc->guc.log.vma, + "GuC log buffer", compress); + error_uc->guc.vma_ctb = create_vma_coredump(gt->_gt, uc->guc.ct.vma, + "GuC CT buffer", compress); + error_uc->guc.last_fence = uc->guc.ct.requests.last_fence; + gt_record_guc_ctb(error_uc->guc.ctb + 0, &uc->guc.ct.ctbs.send, + uc->guc.ct.ctbs.send.desc, (struct intel_guc *)&uc->guc); + gt_record_guc_ctb(error_uc->guc.ctb + 1, &uc->guc.ct.ctbs.recv, + uc->guc.ct.ctbs.send.desc, (struct intel_guc *)&uc->guc); return error_uc; } @@ -2039,9 +2080,9 @@ __i915_gpu_coredump(struct intel_gt *gt, intel_engine_mask_t engine_mask, u32 du error->gt->uc = gt_record_uc(error->gt, compress); if (error->gt->uc) { if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE) - error->gt->uc->is_guc_capture = true; + error->gt->uc->guc.is_guc_capture = true; else - GEM_BUG_ON(error->gt->uc->is_guc_capture); + GEM_BUG_ON(error->gt->uc->guc.is_guc_capture); } } diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h index d8a8b3d529e09..efc75cc2ffdb9 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.h +++ b/drivers/gpu/drm/i915/i915_gpu_error.h @@ -125,6 +125,15 @@ struct intel_engine_coredump { struct intel_engine_coredump *next; }; +struct intel_ctb_coredump { + u32 raw_head, head; + u32 raw_tail, tail; + u32 raw_status; + u32 desc_offset; + u32 cmds_offset; + u32 size; +}; + struct intel_gt_coredump { const struct intel_gt *_gt; bool awake; @@ -165,9 +174,14 @@ struct intel_gt_coredump { struct intel_uc_coredump { struct intel_uc_fw guc_fw; struct intel_uc_fw huc_fw; - struct i915_vma_coredump *guc_log; - u32 timestamp; - bool is_guc_capture; + struct guc_info { + struct intel_ctb_coredump ctb[2]; + struct i915_vma_coredump *vma_ctb; + struct i915_vma_coredump *vma_log; + u32 timestamp; + u16 last_fence; + bool is_guc_capture; + } guc; } *uc; struct intel_gt_coredump *next;