@@ -888,6 +888,7 @@ int i915_reset(struct drm_device *dev)
bool simulated;
int ret;
+ trace_i915_tdr_gpu_recovery(dev);
intel_reset_gt_powersave(dev);
mutex_lock(&dev->struct_mutex);
@@ -1112,6 +1113,8 @@ int i915_reset_engine(struct intel_engine_cs *engine)
WARN_ON(!mutex_is_locked(&dev->struct_mutex));
+ trace_i915_tdr_engine_recovery(engine);
+
/* Take wake lock to prevent power saving mode */
intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
@@ -3382,6 +3382,7 @@ void i915_destroy_error_state(struct drm_device *dev);
void i915_get_extra_instdone(struct drm_device *dev, uint32_t *instdone);
const char *i915_cache_level_str(struct drm_i915_private *i915, int type);
+const char *hangcheck_action_to_str(enum intel_ring_hangcheck_action a);
/* i915_cmd_parser.c */
int i915_cmd_parser_get_version(void);
@@ -221,7 +221,7 @@ static void print_error_buffers(struct drm_i915_error_state_buf *m,
}
}
-static const char *hangcheck_action_to_str(enum intel_ring_hangcheck_action a)
+const char *hangcheck_action_to_str(enum intel_ring_hangcheck_action a)
{
switch (a) {
case HANGCHECK_IDLE:
@@ -2502,6 +2502,7 @@ static void i915_error_work_func(struct work_struct *work)
mutex_lock(&dev->struct_mutex);
+ trace_i915_tdr_recovery_start(dev);
kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, error_event);
for_each_ring(ring, dev_priv, i) {
@@ -2620,6 +2621,7 @@ static void i915_error_work_func(struct work_struct *work)
kobject_uevent_env(&dev->primary->kdev->kobj,
KOBJ_CHANGE, reset_done_event);
}
+ trace_i915_tdr_recovery_complete(dev);
}
static void i915_report_and_clear_eir(struct drm_device *dev)
@@ -2745,6 +2747,7 @@ void i915_handle_error(struct drm_device *dev, u32 engine_mask,
struct drm_i915_private *dev_priv = dev->dev_private;
va_list args;
char error_msg[80];
+ bool full_reset = true;
struct intel_engine_cs *engine;
@@ -2763,7 +2766,6 @@ void i915_handle_error(struct drm_device *dev, u32 engine_mask,
* 2. The hardware does not support it (pre-gen7).
* 3. We already tried per-engine reset recently.
*/
- bool full_reset = true;
if (!i915.enable_engine_reset) {
DRM_INFO("Engine reset disabled: Using full GPU reset.\n");
@@ -2792,6 +2794,7 @@ void i915_handle_error(struct drm_device *dev, u32 engine_mask,
i915.gpu_reset_promotion_time;
engine->hangcheck.last_engine_reset_time = now;
+
} else {
/*
* Watchdog timeout always results
@@ -2840,6 +2843,8 @@ void i915_handle_error(struct drm_device *dev, u32 engine_mask,
i915_error_wake_up(dev_priv, false);
}
+ trace_i915_tdr_recovery_queued(dev, engine_mask, watchdog, full_reset);
+
/*
* Gen 7:
*
@@ -3264,6 +3269,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
for_each_ring(ring, dev_priv, i) {
u64 acthd;
u32 seqno;
+ u32 head;
bool busy = true;
semaphore_clear_deadlocks(dev_priv);
@@ -3344,7 +3350,10 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
ring->hangcheck.seqno = seqno;
ring->hangcheck.acthd = acthd;
+ head = I915_READ_HEAD(ring);
busy_count += busy;
+
+ trace_i915_tdr_hang_check(ring, seqno, acthd, head, busy);
}
for_each_ring(ring, dev_priv, i) {
@@ -810,6 +810,345 @@ TRACE_EVENT(switch_mm,
__entry->dev, __entry->ring, __entry->to, __entry->vm)
);
+/**
+ * DOC: i915_tdr_gpu_recovery
+ *
+ * This tracepoint tracks the onset of the full GPU recovery path
+ */
+TRACE_EVENT(i915_tdr_gpu_recovery,
+ TP_PROTO(struct drm_device *dev),
+
+ TP_ARGS(dev),
+
+ TP_STRUCT__entry(
+ __field(u32, dev)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = dev->primary->index;
+ ),
+
+ TP_printk("dev=%u, full GPU recovery started",
+ __entry->dev)
+);
+
+/**
+ * DOC: i915_tdr_engine_recovery
+ *
+ * This tracepoint tracks the onset of the engine recovery path
+ */
+TRACE_EVENT(i915_tdr_engine_recovery,
+ TP_PROTO(struct intel_engine_cs *engine),
+
+ TP_ARGS(engine),
+
+ TP_STRUCT__entry(
+ __field(struct intel_engine_cs *, engine)
+ ),
+
+ TP_fast_assign(
+ __entry->engine = engine;
+ ),
+
+ TP_printk("dev=%u, engine=%u, recovery of %s started",
+ __entry->engine->dev->primary->index,
+ __entry->engine->id,
+ __entry->engine->name)
+);
+
+/**
+ * DOC: i915_tdr_recovery_start
+ *
+ * This tracepoint tracks hang recovery start
+ */
+TRACE_EVENT(i915_tdr_recovery_start,
+ TP_PROTO(struct drm_device *dev),
+
+ TP_ARGS(dev),
+
+ TP_STRUCT__entry(
+ __field(u32, dev)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = dev->primary->index;
+ ),
+
+ TP_printk("dev=%u, hang recovery started",
+ __entry->dev)
+);
+
+/**
+ * DOC: i915_tdr_recovery_complete
+ *
+ * This tracepoint tracks hang recovery completion
+ */
+TRACE_EVENT(i915_tdr_recovery_complete,
+ TP_PROTO(struct drm_device *dev),
+
+ TP_ARGS(dev),
+
+ TP_STRUCT__entry(
+ __field(u32, dev)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = dev->primary->index;
+ ),
+
+ TP_printk("dev=%u, hang recovery completed",
+ __entry->dev)
+);
+
+/**
+ * DOC: i915_tdr_recovery_queued
+ *
+ * This tracepoint tracks the point of queuing recovery from hang check.
+ * If engine recovery is requested engine name will be displayed, otherwise
+ * it will be set to "none". If too many engine reset was attempted in the
+ * previous history we promote to full GPU reset, which is remarked by appending
+ * the "[PROMOTED]" flag.
+ */
+TRACE_EVENT(i915_tdr_recovery_queued,
+ TP_PROTO(struct drm_device *dev,
+ u32 hung_engines,
+ bool watchdog,
+ bool full_reset),
+
+ TP_ARGS(dev, hung_engines, watchdog, full_reset),
+
+ TP_STRUCT__entry(
+ __field(u32, dev)
+ __field(u32, hung_engines)
+ __field(bool, watchdog)
+ __field(bool, full_reset)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = dev->primary->index;
+ __entry->hung_engines = hung_engines;
+ __entry->watchdog = watchdog;
+ __entry->full_reset = full_reset;
+ ),
+
+ TP_printk("dev=%u, hung_engines=0x%02x%s%s%s%s%s%s%s, watchdog=%s, full_reset=%s",
+ __entry->dev,
+ __entry->hung_engines,
+ __entry->hung_engines ? " (":"",
+ __entry->hung_engines & RENDER_RING ? " [RCS] " : "",
+ __entry->hung_engines & BSD_RING ? " [VCS] " : "",
+ __entry->hung_engines & BLT_RING ? " [BCS] " : "",
+ __entry->hung_engines & VEBOX_RING ? " [VECS] " : "",
+ __entry->hung_engines & BSD2_RING ? " [VCS2] " : "",
+ __entry->hung_engines ? ")":"",
+ __entry->watchdog ? "true" : "false",
+ __entry->full_reset ?
+ (__entry->hung_engines ? "true [PROMOTED]" : "true") :
+ "false")
+);
+
+/**
+ * DOC: i915_tdr_engine_save
+ *
+ * This tracepoint tracks the point of engine state save during the engine
+ * recovery path. Logs the head pointer position at point of hang, the position
+ * after recovering and whether or not we forced a head pointer advancement or
+ * rounded up to an aligned QWORD position.
+ */
+TRACE_EVENT(i915_tdr_engine_save,
+ TP_PROTO(struct intel_engine_cs *engine,
+ u32 old_head,
+ u32 new_head,
+ bool forced_advance),
+
+ TP_ARGS(engine, old_head, new_head, forced_advance),
+
+ TP_STRUCT__entry(
+ __field(struct intel_engine_cs *, engine)
+ __field(u32, old_head)
+ __field(u32, new_head)
+ __field(bool, forced_advance)
+ ),
+
+ TP_fast_assign(
+ __entry->engine = engine;
+ __entry->old_head = old_head;
+ __entry->new_head = new_head;
+ __entry->forced_advance = forced_advance;
+ ),
+
+ TP_printk("dev=%u, engine=%s, old_head=%u, new_head=%u, forced_advance=%s",
+ __entry->engine->dev->primary->index,
+ __entry->engine->name,
+ __entry->old_head,
+ __entry->new_head,
+ __entry->forced_advance ? "true" : "false")
+);
+
+/**
+ * DOC: i915_tdr_gpu_reset_complete
+ *
+ * This tracepoint tracks the point of full GPU reset completion
+ */
+TRACE_EVENT(i915_tdr_gpu_reset_complete,
+ TP_PROTO(struct drm_device *dev),
+
+ TP_ARGS(dev),
+
+ TP_STRUCT__entry(
+ __field(struct drm_device *, dev)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = dev;
+ ),
+
+ TP_printk("dev=%u, resets=%u",
+ __entry->dev->primary->index,
+ i915_reset_count(&((struct drm_i915_private *)
+ (__entry->dev)->dev_private)->gpu_error) )
+);
+
+/**
+ * DOC: i915_tdr_engine_reset_complete
+ *
+ * This tracepoint tracks the point of engine reset completion
+ */
+TRACE_EVENT(i915_tdr_engine_reset_complete,
+ TP_PROTO(struct intel_engine_cs *engine),
+
+ TP_ARGS(engine),
+
+ TP_STRUCT__entry(
+ __field(struct intel_engine_cs *, engine)
+ ),
+
+ TP_fast_assign(
+ __entry->engine = engine;
+ ),
+
+ TP_printk("dev=%u, engine=%s, resets=%u",
+ __entry->engine->dev->primary->index,
+ __entry->engine->name,
+ __entry->engine->hangcheck.reset_count)
+);
+
+/**
+ * DOC: i915_tdr_forced_csb_check
+ *
+ * This tracepoint tracks the occurences of forced CSB checks
+ * that the driver does when detecting inconsistent context
+ * submission states between the driver state and the current
+ * CPU engine state.
+ */
+TRACE_EVENT(i915_tdr_forced_csb_check,
+ TP_PROTO(struct intel_engine_cs *engine,
+ bool was_effective),
+
+ TP_ARGS(engine, was_effective),
+
+ TP_STRUCT__entry(
+ __field(struct intel_engine_cs *, engine)
+ __field(bool, was_effective)
+ ),
+
+ TP_fast_assign(
+ __entry->engine = engine;
+ __entry->was_effective = was_effective;
+ ),
+
+ TP_printk("dev=%u, engine=%s, was_effective=%s",
+ __entry->engine->dev->primary->index,
+ __entry->engine->name,
+ __entry->was_effective ? "yes" : "no")
+);
+
+/**
+ * DOC: i915_tdr_hang_check
+ *
+ * This tracepoint tracks hang checks on each engine.
+ */
+TRACE_EVENT(i915_tdr_hang_check,
+ TP_PROTO(struct intel_engine_cs *engine,
+ u32 seqno,
+ u64 acthd,
+ u32 hd,
+ bool busy),
+
+ TP_ARGS(engine, seqno, acthd, hd, busy),
+
+ TP_STRUCT__entry(
+ __field(struct intel_engine_cs *, engine)
+ __field(u32, seqno)
+ __field(u64, acthd)
+ __field(u32, hd)
+ __field(bool, busy)
+ ),
+
+ TP_fast_assign(
+ __entry->engine = engine;
+ __entry->seqno = seqno;
+ __entry->acthd = acthd;
+ __entry->hd = hd;
+ __entry->busy = busy;
+ ),
+
+ TP_printk("dev=%u, engine=%s, seqno=%u (%d), last seqno=%u (%d), head=%u (%d), acthd=%lu, score=%d%s, action=%u [%s], busy=%s",
+ __entry->engine->dev->primary->index,
+ __entry->engine->name,
+ __entry->seqno,
+ __entry->seqno,
+ __entry->engine->hangcheck.seqno,
+ __entry->engine->hangcheck.seqno,
+ __entry->hd,
+ __entry->hd,
+ (long unsigned int) __entry->acthd,
+ __entry->engine->hangcheck.score,
+ (__entry->engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG) ? " [HUNG]" : "",
+ (unsigned int) __entry->engine->hangcheck.action,
+ hangcheck_action_to_str(__entry->engine->hangcheck.action),
+ __entry->busy ? "yes" : "no")
+);
+
+/**
+ * DOC: i915_tdr_inconsistency
+ *
+ * This tracepoint tracks detected inconsistencies
+ */
+TRACE_EVENT(i915_tdr_inconsistency,
+ TP_PROTO(struct intel_engine_cs *engine,
+ u32 hw_context,
+ bool hw_active,
+ u32 sw_context,
+ struct drm_i915_gem_request *req),
+
+ TP_ARGS(engine, hw_context, hw_active, sw_context, req),
+
+ TP_STRUCT__entry(
+ __field(struct intel_engine_cs *, engine)
+ __field(u32, hw_context)
+ __field(bool, hw_active)
+ __field(u32, sw_context)
+ __field(int, elsp_submitted)
+ ),
+
+ TP_fast_assign(
+ __entry->engine = engine;
+ __entry->hw_context = hw_context;
+ __entry->hw_active = hw_active;
+ __entry->sw_context = sw_context;
+ __entry->elsp_submitted = req?req->elsp_submitted:0;
+ ),
+
+ TP_printk("dev=%u, engine=%s, hw_context=%x, hw_active=%s, sw_context=%x, elsp_submitted=%d",
+ __entry->engine->dev->primary->index,
+ __entry->engine->name,
+ __entry->hw_context,
+ __entry->hw_active?"true":"false",
+ __entry->sw_context,
+ __entry->elsp_submitted)
+);
+
#endif /* _I915_TRACE_H_ */
/* This part must be outside protection */
@@ -2322,7 +2322,7 @@ gen8_ring_save(struct intel_engine_cs *ring, struct drm_i915_gem_request *req,
struct intel_context *ctx;
int ret = 0;
int clamp_to_tail = 0;
- uint32_t head;
+ uint32_t head, old_head;
uint32_t tail;
uint32_t head_addr;
uint32_t tail_addr;
@@ -2337,7 +2337,7 @@ gen8_ring_save(struct intel_engine_cs *ring, struct drm_i915_gem_request *req,
* Read head from MMIO register since it contains the
* most up to date value of head at this point.
*/
- head = I915_READ_HEAD(ring);
+ old_head = head = I915_READ_HEAD(ring);
/*
* Read tail from the context because the execlist queue
@@ -2394,6 +2394,9 @@ gen8_ring_save(struct intel_engine_cs *ring, struct drm_i915_gem_request *req,
head |= (head_addr & HEAD_ADDR);
ring->saved_head = head;
+ trace_i915_tdr_engine_save(ring, old_head,
+ head, force_advance);
+
return 0;
}
@@ -3304,6 +3307,19 @@ intel_execlists_TDR_get_current_request(struct intel_engine_cs *ring,
CONTEXT_SUBMISSION_STATUS_NONE_SUBMITTED;
}
+ /*
+ * This may or may not be a sustained inconsistency. Most of the time
+ * it's only a matter of a transitory inconsistency during context
+ * submission/completion but if we happen to detect a sustained
+ * inconsistency then it helps to have more information.
+ */
+ if (status == CONTEXT_SUBMISSION_STATUS_INCONSISTENT)
+ trace_i915_tdr_inconsistency(ring,
+ hw_context,
+ hw_active,
+ sw_context,
+ tmpreq);
+
if (req)
*req = tmpreq;
@@ -3368,6 +3384,7 @@ bool intel_execlists_TDR_force_CSB_check(struct drm_i915_private *dev_priv,
spin_unlock_irqrestore(&engine->execlist_lock, flags);
+ trace_i915_tdr_forced_csb_check(engine, !!was_effective);
wake_up_all(&engine->irq_queue);
return !!was_effective;
@@ -1515,6 +1515,8 @@ static int gen6_do_reset(struct drm_device *dev)
/* Spin waiting for the device to ack the reset request */
ret = wait_for((__raw_i915_read32(dev_priv, GEN6_GDRST) & GEN6_GRDOM_FULL) == 0, 500);
+ trace_i915_tdr_gpu_reset_complete(dev);
+
intel_uncore_forcewake_reset(dev, true);
return ret;
@@ -1680,6 +1682,8 @@ static int do_engine_reset_nolock(struct intel_engine_cs *engine)
break;
}
+ trace_i915_tdr_engine_reset_complete(engine);
+
return ret;
}