[14/20] drm/i915: TDR/watchdog trace points.

Message ID	1452706112-8617-15-git-send-email-arun.siluvery@linux.intel.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <intel-gfx-bounces@lists.freedesktop.org> From: Arun Siluvery <arun.siluvery@linux.intel.com> To: intel-gfx@lists.freedesktop.org Date: Wed, 13 Jan 2016 17:28:26 +0000 Message-Id: <1452706112-8617-15-git-send-email-arun.siluvery@linux.intel.com> In-Reply-To: <1452706112-8617-1-git-send-email-arun.siluvery@linux.intel.com> References: <1452706112-8617-1-git-send-email-arun.siluvery@linux.intel.com> Cc: Tomas Elf <tomas.elf@intel.com> Subject: [Intel-gfx] [PATCH 14/20] drm/i915: TDR/watchdog trace points. Precedence: list MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: intel-gfx-bounces@lists.freedesktop.org Sender: "Intel-gfx" <intel-gfx-bounces@lists.freedesktop.org>

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index c597dcc..73976f9 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -888,6 +888,7 @@ int i915_reset(struct drm_device *dev) bool simulated; int ret; + trace_i915_tdr_gpu_recovery(dev); intel_reset_gt_powersave(dev); mutex_lock(&dev->struct_mutex); @@ -1112,6 +1113,8 @@ int i915_reset_engine(struct intel_engine_cs *engine) WARN_ON(!mutex_is_locked(&dev->struct_mutex)); + trace_i915_tdr_engine_recovery(engine); + /* Take wake lock to prevent power saving mode */ intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL); diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 0a223b1..a65722d 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -3382,6 +3382,7 @@ void i915_destroy_error_state(struct drm_device *dev); void i915_get_extra_instdone(struct drm_device *dev, uint32_t *instdone); const char *i915_cache_level_str(struct drm_i915_private *i915, int type); +const char *hangcheck_action_to_str(enum intel_ring_hangcheck_action a); /* i915_cmd_parser.c */ int i915_cmd_parser_get_version(void); diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 06ca408..a5dcc7b 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -221,7 +221,7 @@ static void print_error_buffers(struct drm_i915_error_state_buf *m, } } -static const char *hangcheck_action_to_str(enum intel_ring_hangcheck_action a) +const char *hangcheck_action_to_str(enum intel_ring_hangcheck_action a) { switch (a) { case HANGCHECK_IDLE: diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index f8fedbc..acca5d8 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -2502,6 +2502,7 @@ static void i915_error_work_func(struct work_struct *work) mutex_lock(&dev->struct_mutex); + trace_i915_tdr_recovery_start(dev); kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, error_event); for_each_ring(ring, dev_priv, i) { @@ -2620,6 +2621,7 @@ static void i915_error_work_func(struct work_struct *work) kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, reset_done_event); } + trace_i915_tdr_recovery_complete(dev); } static void i915_report_and_clear_eir(struct drm_device *dev) @@ -2745,6 +2747,7 @@ void i915_handle_error(struct drm_device *dev, u32 engine_mask, struct drm_i915_private *dev_priv = dev->dev_private; va_list args; char error_msg[80]; + bool full_reset = true; struct intel_engine_cs *engine; @@ -2763,7 +2766,6 @@ void i915_handle_error(struct drm_device *dev, u32 engine_mask, * 2. The hardware does not support it (pre-gen7). * 3. We already tried per-engine reset recently. */ - bool full_reset = true; if (!i915.enable_engine_reset) { DRM_INFO("Engine reset disabled: Using full GPU reset.\n"); @@ -2792,6 +2794,7 @@ void i915_handle_error(struct drm_device *dev, u32 engine_mask, i915.gpu_reset_promotion_time; engine->hangcheck.last_engine_reset_time = now; + } else { /* * Watchdog timeout always results @@ -2840,6 +2843,8 @@ void i915_handle_error(struct drm_device *dev, u32 engine_mask, i915_error_wake_up(dev_priv, false); } + trace_i915_tdr_recovery_queued(dev, engine_mask, watchdog, full_reset); + /* * Gen 7: * @@ -3264,6 +3269,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work) for_each_ring(ring, dev_priv, i) { u64 acthd; u32 seqno; + u32 head; bool busy = true; semaphore_clear_deadlocks(dev_priv); @@ -3344,7 +3350,10 @@ static void i915_hangcheck_elapsed(struct work_struct *work) ring->hangcheck.seqno = seqno; ring->hangcheck.acthd = acthd; + head = I915_READ_HEAD(ring); busy_count += busy; + + trace_i915_tdr_hang_check(ring, seqno, acthd, head, busy); } for_each_ring(ring, dev_priv, i) { diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h index 52b2d40..5c15d43 100644 --- a/drivers/gpu/drm/i915/i915_trace.h +++ b/drivers/gpu/drm/i915/i915_trace.h @@ -810,6 +810,345 @@ TRACE_EVENT(switch_mm, __entry->dev, __entry->ring, __entry->to, __entry->vm) ); +/** + * DOC: i915_tdr_gpu_recovery + * + * This tracepoint tracks the onset of the full GPU recovery path + */ +TRACE_EVENT(i915_tdr_gpu_recovery, + TP_PROTO(struct drm_device *dev), + + TP_ARGS(dev), + + TP_STRUCT__entry( + __field(u32, dev) + ), + + TP_fast_assign( + __entry->dev = dev->primary->index; + ), + + TP_printk("dev=%u, full GPU recovery started", + __entry->dev) +); + +/** + * DOC: i915_tdr_engine_recovery + * + * This tracepoint tracks the onset of the engine recovery path + */ +TRACE_EVENT(i915_tdr_engine_recovery, + TP_PROTO(struct intel_engine_cs *engine), + + TP_ARGS(engine), + + TP_STRUCT__entry( + __field(struct intel_engine_cs *, engine) + ), + + TP_fast_assign( + __entry->engine = engine; + ), + + TP_printk("dev=%u, engine=%u, recovery of %s started", + __entry->engine->dev->primary->index, + __entry->engine->id, + __entry->engine->name) +); + +/** + * DOC: i915_tdr_recovery_start + * + * This tracepoint tracks hang recovery start + */ +TRACE_EVENT(i915_tdr_recovery_start, + TP_PROTO(struct drm_device *dev), + + TP_ARGS(dev), + + TP_STRUCT__entry( + __field(u32, dev) + ), + + TP_fast_assign( + __entry->dev = dev->primary->index; + ), + + TP_printk("dev=%u, hang recovery started", + __entry->dev) +); + +/** + * DOC: i915_tdr_recovery_complete + * + * This tracepoint tracks hang recovery completion + */ +TRACE_EVENT(i915_tdr_recovery_complete, + TP_PROTO(struct drm_device *dev), + + TP_ARGS(dev), + + TP_STRUCT__entry( + __field(u32, dev) + ), + + TP_fast_assign( + __entry->dev = dev->primary->index; + ), + + TP_printk("dev=%u, hang recovery completed", + __entry->dev) +); + +/** + * DOC: i915_tdr_recovery_queued + * + * This tracepoint tracks the point of queuing recovery from hang check. + * If engine recovery is requested engine name will be displayed, otherwise + * it will be set to "none". If too many engine reset was attempted in the + * previous history we promote to full GPU reset, which is remarked by appending + * the "[PROMOTED]" flag. + */ +TRACE_EVENT(i915_tdr_recovery_queued, + TP_PROTO(struct drm_device *dev, + u32 hung_engines, + bool watchdog, + bool full_reset), + + TP_ARGS(dev, hung_engines, watchdog, full_reset), + + TP_STRUCT__entry( + __field(u32, dev) + __field(u32, hung_engines) + __field(bool, watchdog) + __field(bool, full_reset) + ), + + TP_fast_assign( + __entry->dev = dev->primary->index; + __entry->hung_engines = hung_engines; + __entry->watchdog = watchdog; + __entry->full_reset = full_reset; + ), + + TP_printk("dev=%u, hung_engines=0x%02x%s%s%s%s%s%s%s, watchdog=%s, full_reset=%s", + __entry->dev, + __entry->hung_engines, + __entry->hung_engines ? " (":"", + __entry->hung_engines & RENDER_RING ? " [RCS] " : "", + __entry->hung_engines & BSD_RING ? " [VCS] " : "", + __entry->hung_engines & BLT_RING ? " [BCS] " : "", + __entry->hung_engines & VEBOX_RING ? " [VECS] " : "", + __entry->hung_engines & BSD2_RING ? " [VCS2] " : "", + __entry->hung_engines ? ")":"", + __entry->watchdog ? "true" : "false", + __entry->full_reset ? + (__entry->hung_engines ? "true [PROMOTED]" : "true") : + "false") +); + +/** + * DOC: i915_tdr_engine_save + * + * This tracepoint tracks the point of engine state save during the engine + * recovery path. Logs the head pointer position at point of hang, the position + * after recovering and whether or not we forced a head pointer advancement or + * rounded up to an aligned QWORD position. + */ +TRACE_EVENT(i915_tdr_engine_save, + TP_PROTO(struct intel_engine_cs *engine, + u32 old_head, + u32 new_head, + bool forced_advance), + + TP_ARGS(engine, old_head, new_head, forced_advance), + + TP_STRUCT__entry( + __field(struct intel_engine_cs *, engine) + __field(u32, old_head) + __field(u32, new_head) + __field(bool, forced_advance) + ), + + TP_fast_assign( + __entry->engine = engine; + __entry->old_head = old_head; + __entry->new_head = new_head; + __entry->forced_advance = forced_advance; + ), + + TP_printk("dev=%u, engine=%s, old_head=%u, new_head=%u, forced_advance=%s", + __entry->engine->dev->primary->index, + __entry->engine->name, + __entry->old_head, + __entry->new_head, + __entry->forced_advance ? "true" : "false") +); + +/** + * DOC: i915_tdr_gpu_reset_complete + * + * This tracepoint tracks the point of full GPU reset completion + */ +TRACE_EVENT(i915_tdr_gpu_reset_complete, + TP_PROTO(struct drm_device *dev), + + TP_ARGS(dev), + + TP_STRUCT__entry( + __field(struct drm_device *, dev) + ), + + TP_fast_assign( + __entry->dev = dev; + ), + + TP_printk("dev=%u, resets=%u", + __entry->dev->primary->index, + i915_reset_count(&((struct drm_i915_private *) + (__entry->dev)->dev_private)->gpu_error) ) +); + +/** + * DOC: i915_tdr_engine_reset_complete + * + * This tracepoint tracks the point of engine reset completion + */ +TRACE_EVENT(i915_tdr_engine_reset_complete, + TP_PROTO(struct intel_engine_cs *engine), + + TP_ARGS(engine), + + TP_STRUCT__entry( + __field(struct intel_engine_cs *, engine) + ), + + TP_fast_assign( + __entry->engine = engine; + ), + + TP_printk("dev=%u, engine=%s, resets=%u", + __entry->engine->dev->primary->index, + __entry->engine->name, + __entry->engine->hangcheck.reset_count) +); + +/** + * DOC: i915_tdr_forced_csb_check + * + * This tracepoint tracks the occurences of forced CSB checks + * that the driver does when detecting inconsistent context + * submission states between the driver state and the current + * CPU engine state. + */ +TRACE_EVENT(i915_tdr_forced_csb_check, + TP_PROTO(struct intel_engine_cs *engine, + bool was_effective), + + TP_ARGS(engine, was_effective), + + TP_STRUCT__entry( + __field(struct intel_engine_cs *, engine) + __field(bool, was_effective) + ), + + TP_fast_assign( + __entry->engine = engine; + __entry->was_effective = was_effective; + ), + + TP_printk("dev=%u, engine=%s, was_effective=%s", + __entry->engine->dev->primary->index, + __entry->engine->name, + __entry->was_effective ? "yes" : "no") +); + +/** + * DOC: i915_tdr_hang_check + * + * This tracepoint tracks hang checks on each engine. + */ +TRACE_EVENT(i915_tdr_hang_check, + TP_PROTO(struct intel_engine_cs *engine, + u32 seqno, + u64 acthd, + u32 hd, + bool busy), + + TP_ARGS(engine, seqno, acthd, hd, busy), + + TP_STRUCT__entry( + __field(struct intel_engine_cs *, engine) + __field(u32, seqno) + __field(u64, acthd) + __field(u32, hd) + __field(bool, busy) + ), + + TP_fast_assign( + __entry->engine = engine; + __entry->seqno = seqno; + __entry->acthd = acthd; + __entry->hd = hd; + __entry->busy = busy; + ), + + TP_printk("dev=%u, engine=%s, seqno=%u (%d), last seqno=%u (%d), head=%u (%d), acthd=%lu, score=%d%s, action=%u [%s], busy=%s", + __entry->engine->dev->primary->index, + __entry->engine->name, + __entry->seqno, + __entry->seqno, + __entry->engine->hangcheck.seqno, + __entry->engine->hangcheck.seqno, + __entry->hd, + __entry->hd, + (long unsigned int) __entry->acthd, + __entry->engine->hangcheck.score, + (__entry->engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG) ? " [HUNG]" : "", + (unsigned int) __entry->engine->hangcheck.action, + hangcheck_action_to_str(__entry->engine->hangcheck.action), + __entry->busy ? "yes" : "no") +); + +/** + * DOC: i915_tdr_inconsistency + * + * This tracepoint tracks detected inconsistencies + */ +TRACE_EVENT(i915_tdr_inconsistency, + TP_PROTO(struct intel_engine_cs *engine, + u32 hw_context, + bool hw_active, + u32 sw_context, + struct drm_i915_gem_request *req), + + TP_ARGS(engine, hw_context, hw_active, sw_context, req), + + TP_STRUCT__entry( + __field(struct intel_engine_cs *, engine) + __field(u32, hw_context) + __field(bool, hw_active) + __field(u32, sw_context) + __field(int, elsp_submitted) + ), + + TP_fast_assign( + __entry->engine = engine; + __entry->hw_context = hw_context; + __entry->hw_active = hw_active; + __entry->sw_context = sw_context; + __entry->elsp_submitted = req?req->elsp_submitted:0; + ), + + TP_printk("dev=%u, engine=%s, hw_context=%x, hw_active=%s, sw_context=%x, elsp_submitted=%d", + __entry->engine->dev->primary->index, + __entry->engine->name, + __entry->hw_context, + __entry->hw_active?"true":"false", + __entry->sw_context, + __entry->elsp_submitted) +); + #endif /* _I915_TRACE_H_ */ /* This part must be outside protection */ diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 913fdbb..85107a1 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -2322,7 +2322,7 @@ gen8_ring_save(struct intel_engine_cs *ring, struct drm_i915_gem_request *req, struct intel_context *ctx; int ret = 0; int clamp_to_tail = 0; - uint32_t head; + uint32_t head, old_head; uint32_t tail; uint32_t head_addr; uint32_t tail_addr; @@ -2337,7 +2337,7 @@ gen8_ring_save(struct intel_engine_cs *ring, struct drm_i915_gem_request *req, * Read head from MMIO register since it contains the * most up to date value of head at this point. */ - head = I915_READ_HEAD(ring); + old_head = head = I915_READ_HEAD(ring); /* * Read tail from the context because the execlist queue @@ -2394,6 +2394,9 @@ gen8_ring_save(struct intel_engine_cs *ring, struct drm_i915_gem_request *req, head |= (head_addr & HEAD_ADDR); ring->saved_head = head; + trace_i915_tdr_engine_save(ring, old_head, + head, force_advance); + return 0; } @@ -3304,6 +3307,19 @@ intel_execlists_TDR_get_current_request(struct intel_engine_cs *ring, CONTEXT_SUBMISSION_STATUS_NONE_SUBMITTED; } + /* + * This may or may not be a sustained inconsistency. Most of the time + * it's only a matter of a transitory inconsistency during context + * submission/completion but if we happen to detect a sustained + * inconsistency then it helps to have more information. + */ + if (status == CONTEXT_SUBMISSION_STATUS_INCONSISTENT) + trace_i915_tdr_inconsistency(ring, + hw_context, + hw_active, + sw_context, + tmpreq); + if (req) *req = tmpreq; @@ -3368,6 +3384,7 @@ bool intel_execlists_TDR_force_CSB_check(struct drm_i915_private *dev_priv, spin_unlock_irqrestore(&engine->execlist_lock, flags); + trace_i915_tdr_forced_csb_check(engine, !!was_effective); wake_up_all(&engine->irq_queue); return !!was_effective; diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c index f20548c..1c527cd 100644 --- a/drivers/gpu/drm/i915/intel_uncore.c +++ b/drivers/gpu/drm/i915/intel_uncore.c @@ -1515,6 +1515,8 @@ static int gen6_do_reset(struct drm_device *dev) /* Spin waiting for the device to ack the reset request */ ret = wait_for((__raw_i915_read32(dev_priv, GEN6_GDRST) & GEN6_GRDOM_FULL) == 0, 500); + trace_i915_tdr_gpu_reset_complete(dev); + intel_uncore_forcewake_reset(dev, true); return ret; @@ -1680,6 +1682,8 @@ static int do_engine_reset_nolock(struct intel_engine_cs *engine) break; } + trace_i915_tdr_engine_reset_complete(engine); + return ret; }

[14/20] drm/i915: TDR/watchdog trace points.

Commit Message

Patch