@@ -89,3 +89,11 @@ config DRM_I915_SCHEDULER
help
Choose this option to enable GPU task scheduling for improved
performance and efficiency.
+
+config DRM_I915_SCHEDULER_PREEMPTION
+ bool "Enable pre-emption within the GPU scheduler"
+ depends on DRM_I915_SCHEDULER
+ default y
+ help
+ Choose this option to enable pre-emptive context switching within the
+ GPU scheduler for even more performance and efficiency improvements.
@@ -2312,6 +2312,18 @@ i915_gem_init_seqno(struct drm_device *dev, u32 seqno)
ring->semaphore.sync_seqno[j] = 0;
}
+#ifdef CONFIG_DRM_I915_SCHEDULER_PREEMPTION
+ /* Also reset sw batch tracking state */
+ for_each_ring(ring, dev_priv, i) {
+ ring->last_regular_batch = 0;
+ ring->last_preemptive_batch = 0;
+ intel_write_status_page(ring, I915_BATCH_DONE_SEQNO, 0);
+ intel_write_status_page(ring, I915_BATCH_ACTIVE_SEQNO, 0);
+ intel_write_status_page(ring, I915_PREEMPTIVE_DONE_SEQNO, 0);
+ intel_write_status_page(ring, I915_PREEMPTIVE_ACTIVE_SEQNO, 0);
+ }
+#endif
+
return 0;
}
@@ -1470,6 +1470,238 @@ pre_mutex_err:
return ret;
}
+#ifdef CONFIG_DRM_I915_SCHEDULER_PREEMPTION
+/*
+ * The functions below emit opcodes into the ring buffer.
+ * The simpler ones insert a single instruction, whereas the
+ * prequel/preamble/postamble functions generate a sequence
+ * of operations according to the nature of the current batch.
+ * Chief among them is i915_gem_do_execbuffer_final(), which is
+ * called by the scheduler to pass a batch to the hardware.
+ *
+ * There are three different types of batch handled here:
+ * 1. non-preemptible batches (using the default context)
+ * 2. preemptible batches (using a non-default context)
+ * 3. preemptive batches (using a non-default context)
+ * and three points at which the code paths vary (prequel, at the very
+ * start of per-batch processing; preamble, just before the call to the
+ * batch buffer; and postamble, which runs after the batch buffer completes).
+ *
+ * The preamble is simple; it logs the sequence number of the batch that's
+ * about to start, and enables or disables preemption for the duration of
+ * the batch. The postamble is similar: it logs the sequence number of the
+ * batch that's just finished, and clears the in-progress sequence number
+ * (except for preemptive batches, where this is deferred to the interrupt
+ * handler).
+ *
+ * The prequel is the part that differs most. In the case of a regular batch,
+ * it contains an ARB ON/ARB CHECK sequence that allows preemption before
+ * the batch starts. The preemptive prequel, on the other hand, is more
+ * complex; see the description below ...
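+ *
+ * As a rough sketch (order of emission only, not an exact dword listing),
+ * a regular batch submitted through i915_gem_do_execbuffer_final() lands
+ * on the ring as:
+ *
+ *   prequel    log batch start address, MI_REPORT_HEAD, ARB ON, ARB CHECK
+ *   preamble   log active seqno, ARB ON (preemptible) / ARB OFF (default ctx)
+ *   batch      MI_BATCH_BUFFER_START ...
+ *   postamble  log batch end address and done seqno, clear active seqno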
+ */
+
+/*
+ * Emit an MI_STORE_DWORD_INDEX instruction.
+ * This stores the specified value in the (index)th DWORD of the hardware status page.
+ */
+static uint32_t
+emit_store_dw_index(struct intel_engine_cs *ring, uint32_t value, uint32_t index)
+{
+ uint32_t vptr;
+ intel_ring_emit(ring, MI_STORE_DWORD_INDEX);
+ intel_ring_emit(ring, index << MI_STORE_DWORD_INDEX_SHIFT);
+ vptr = intel_ring_get_tail(ring);
+ intel_ring_emit(ring, value);
+ return vptr;
+}
+
+/*
+ * Emit an MI_STORE_REGISTER_MEM instruction.
+ * This stores the specified register in the (index)th DWORD of the memory
+ * area pointed to by base (which is actually the hardware status page).
+ */
+static void
+emit_store_reg_index(struct intel_engine_cs *ring, uint32_t reg, uint32_t base, uint32_t index)
+{
+ intel_ring_emit(ring, MI_STORE_REG_MEM | MI_STORE_REG_MEM_GTT);
+ intel_ring_emit(ring, reg);
+ intel_ring_emit(ring, base+(index << MI_STORE_DWORD_INDEX_SHIFT));
+}
+
+/*
+ * Emit the commands to check for preemption before starting a regular batch
+ */
+static void
+emit_regular_prequel(struct intel_engine_cs *ring, uint32_t seqno, uint32_t start)
+{
+ /* Log the ring address of the batch we're starting BEFORE the ARB CHECK */
+ emit_store_dw_index(ring, start, I915_BATCH_ACTIVE_ADDR);
+ intel_ring_emit(ring, MI_REPORT_HEAD);
+
+ /* Ensure Arbitration is enabled, then check for pending preemption */
+ intel_ring_emit(ring, MI_ARB_ON_OFF | MI_ARB_ENABLE);
+ intel_ring_emit(ring, MI_ARB_CHECK);
+ /* 6 dwords so far */
+}
+
+/*
+ * Emit the commands that prefix a preemptive batch.
+ *
+ * The difficulty here is that the engine is asynchronous. It may have already
+ * stopped with HEAD == TAIL, or it may still be running. If still running, it
+ * could execute an ARB CHECK instruction at ANY time.
+ *
+ * Therefore, it is unsafe to write UHPTR first and then update TAIL because
+ * an ARB_CHECK might trigger a jump between the two. This would set HEAD to
+ * be *after* TAIL which the engine would interpret as being a VERY looooong
+ * way *BEHIND* TAIL.
+ *
+ * OTOH, if TAIL is written first and then UHPTR, the engine might run the new
+ * code before the update of UHPTR has occurred. It would then stop when
+ * HEAD == (new) TAIL and the updated UHPTR would be ignored leaving the
+ * preemption pending until later!
+ *
+ * In addition, it is necessary to distinguish in the interrupt handler whether
+ * the ring was in fact idle by the time preemption took place. I.e. there were
+ * no ARB CHECK commands between HEAD at the time when UHPTR was set and the
+ * start of the preemptive batch that is being constructed.
+ *
+ * The solution is to first construct a 'landing zone' containing at least one
+ * instruction whose execution can be detected (in this case, a STORE and an
+ * ARB_ENABLE) and advance TAIL over it. Then set UHPTR to the same value as
+ * the new TAIL.
+ *
+ * If an (enabled) ARB_CHECK instruction is executed before the next update to
+ * TAIL, the engine will update HEAD to the value of UHPTR and then stop as the
+ * new value of HEAD will match TAIL. OTOH if no further ARB_CHECK instructions
+ * are reached, the engine will eventually run into the landing zone and again
+ * stop at the same point (but with preemption still pending).
+ *
+ * Thus, a second zone is added that *starts* with an ARB_CHECK. If (and only
+ * if) preemption has not yet occurred, this will cause a jump to the location
+ * given by UHPTR (which is its own address!). As a side effect, the VALID bit
+ * of UHPTR is cleared, so when the same ARB_CHECK is executed again, it now
+ * has no effect.
+ *
+ * Either way, the engine reaches the end of the second landing zone with
+ * preemption having occurred exactly once, so there's no surprise left lurking
+ * for later. If the new batch work has already been added by the time this
+ * happens, it can continue immediately. Otherwise, the engine will stop until
+ * the next update to TAIL after the batch call is added.
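+ *
+ * As a worked illustration (addresses are hypothetical): if the first
+ * landing zone occupies 0x100-0x10c, TAIL is advanced to 0x110 and UHPTR
+ * is set to 0x110. If an already-queued ARB_CHECK fires first, HEAD jumps
+ * to 0x110 and the ring stops there (HEAD == TAIL). Otherwise the ring
+ * runs through the landing zone to the ARB_CHECK emitted at 0x110, which
+ * jumps to UHPTR (its own address) and clears the VALID bit. Either way,
+ * exactly one preemption has been consumed by the time execution passes
+ * this point.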
+ */
+static void
+emit_preemptive_prequel(struct intel_engine_cs *ring, uint32_t seqno, uint32_t start)
+{
+ /* 'dev_priv' is required by the I915_WRITE_UHPTR() macro! :-( */
+ struct drm_i915_private *dev_priv = ring->dev->dev_private;
+ uint32_t i, hwpa, jump;
+
+ /* Part 1, reached only if the ring is idle */
+ emit_store_dw_index(ring, seqno, I915_BATCH_ACTIVE_SEQNO);
+ intel_ring_emit(ring, MI_ARB_ON_OFF | MI_ARB_ENABLE);
+ /* 4 dwords so far */
+ intel_ring_advance(ring);
+ jump = intel_ring_get_tail(ring);
+ BUG_ON(jump & UHPTR_GFX_ADDR_ALIGN);
+
+ I915_WRITE_UHPTR(ring, jump | UHPTR_VALID);
+
+ /* May jump to itself! */
+ intel_ring_emit(ring, MI_ARB_CHECK);
+
+ /* Log the ring address of the batch we're starting AFTER the ARB CHECK */
+ emit_store_dw_index(ring, start, I915_PREEMPTIVE_ACTIVE_ADDR);
+ /* 8 dwords so far */
+
+ {
+ /*
+ * Unfortunately not everything we need is automatically saved by a
+ * context switch, so we have to explicitly save some registers here.
+ */
+ static const u32 regs[][2] = {
+ { RING_PREEMPT_ADDR, I915_SAVE_PREEMPTED_RING_PTR },
+ { BB_PREEMPT_ADDR, I915_SAVE_PREEMPTED_BB_PTR },
+ { SBB_PREEMPT_ADDR, I915_SAVE_PREEMPTED_SBB_PTR },
+ { RS_PREEMPT_STATUS, I915_SAVE_PREEMPTED_STATUS },
+
+ { RING_HEAD(RENDER_RING_BASE), I915_SAVE_PREEMPTED_HEAD },
+ { RING_TAIL(RENDER_RING_BASE), I915_SAVE_PREEMPTED_TAIL },
+ { RING_UHPTR(RENDER_RING_BASE), I915_SAVE_PREEMPTED_UHPTR },
+ { NOPID, I915_SAVE_PREEMPTED_NOPID }
+ };
+
+ /* This loop generates another 24 dwords, for a total of 32 so far */
+ hwpa = i915_gem_obj_ggtt_offset(ring->status_page.obj);
+ for (i = 0; i < ARRAY_SIZE(regs); ++i)
+ emit_store_reg_index(ring, regs[i][0], hwpa, regs[i][1]);
+ }
+}
+
+/*
+ * Emit the commands that immediately prefix execution of a batch.
+ *
+ * The GPU will log the seqno of the batch as it starts running it,
+ * then enable or disable preemption checks during this batch.
+ */
+static void
+emit_preamble(struct intel_engine_cs *ring, uint32_t seqno, struct intel_context *ctx, bool preemptive)
+{
+ emit_store_dw_index(ring, seqno, preemptive ? I915_PREEMPTIVE_ACTIVE_SEQNO : I915_BATCH_ACTIVE_SEQNO);
+ if (preemptive || i915_gem_context_is_default(ctx))
+ intel_ring_emit(ring, MI_ARB_ON_OFF | MI_ARB_DISABLE);
+ else
+ intel_ring_emit(ring, MI_ARB_ON_OFF | MI_ARB_ENABLE);
+ /* 4 dwords so far */
+}
+
+/*
+ * Emit the commands that immediately follow execution of a batch.
+ *
+ * The GPU will:
+ * 1) log the end address of the batch we've completed
+ * 2) log the seqno of the batch we've just completed.
+ * 3) in the case of a non-preemptive batch, clear the in-progress sequence
+ * number; otherwise, issue a dummy register store to flush the above
+ * writes before the interrupt happens.
+ */
+static void
+emit_postamble(struct intel_engine_cs *ring, uint32_t seqno, uint32_t start, bool preemptive)
+{
+ uint32_t eptr, end;
+
+ if (intel_ring_begin(ring, 10))
+ return;
+
+ /*
+ * Note that the '~0u' in this call is a placeholder - the actual address
+ * will be calculated later in this function and retroactively patched
+ * into this dword!
+ */
+ eptr = emit_store_dw_index(ring, ~0u, preemptive ? I915_PREEMPTIVE_ACTIVE_END : I915_BATCH_ACTIVE_END);
+ emit_store_dw_index(ring, seqno, preemptive ? I915_PREEMPTIVE_DONE_SEQNO : I915_BATCH_DONE_SEQNO);
+ if (preemptive) {
+ uint32_t hwpa = i915_gem_obj_ggtt_offset(ring->status_page.obj);
+ emit_store_reg_index(ring, NOPID, hwpa, I915_SAVE_PREEMPTED_NOPID);
+ } else {
+ emit_store_dw_index(ring, 0, I915_BATCH_ACTIVE_SEQNO);
+ }
+ intel_ring_emit(ring, MI_NOOP);
+ /* 10 dwords so far */
+
+ end = intel_ring_get_tail(ring);
+
+ /* Stash the batch bounds for use by the interrupt handler */
+ intel_write_status_page(ring, I915_GEM_BATCH_START_ADDR, start);
+ intel_write_status_page(ring, I915_GEM_BATCH_END_ADDR, end);
+
+ BUG_ON(eptr & UHPTR_GFX_ADDR_ALIGN);
+ BUG_ON(end & UHPTR_GFX_ADDR_ALIGN);
+
+ /* Go back and patch the end-batch address inserted above */
+ iowrite32(end, ring->buffer->virtual_start + eptr);
+}
+#endif /* CONFIG_DRM_I915_SCHEDULER_PREEMPTION */
+
/*
* This is the main function for adding a batch to the ring.
* It is called from the scheduler, with the struct_mutex already held.
@@ -1480,6 +1712,10 @@ int i915_gem_do_execbuffer_final(struct i915_execbuffer_params *params)
struct intel_engine_cs *ring = params->ring;
u64 exec_start, exec_len;
int ret, i;
+ bool preemptive;
+#ifdef CONFIG_DRM_I915_SCHEDULER_PREEMPTION
+ u32 start;
+#endif
u32 seqno;
/* The mutex must be acquired before calling this function */
@@ -1547,6 +1783,22 @@ int i915_gem_do_execbuffer_final(struct i915_execbuffer_params *params)
if (ret)
goto err;
+ preemptive = (params->scheduler_flags & i915_ebp_sf_preempt) != 0;
+#ifndef CONFIG_DRM_I915_SCHEDULER_PREEMPTION
+ /* The scheduler must not request preemption if support wasn't compiled in */
+ BUG_ON(preemptive);
+#endif
+
+#ifdef CONFIG_DRM_I915_SCHEDULER_PREEMPTION
+ start = intel_ring_get_tail(ring);
+ BUG_ON(start & UHPTR_GFX_ADDR_ALIGN);
+
+ if (preemptive)
+ emit_preemptive_prequel(ring, seqno, start);
+ else
+ emit_regular_prequel(ring, seqno, start);
+#endif
+
/* Switch to the correct context for the batch */
ret = i915_switch_context(ring, params->ctx);
if (ret)
@@ -1583,10 +1835,26 @@ int i915_gem_do_execbuffer_final(struct i915_execbuffer_params *params)
BUG_ON(ring->outstanding_lazy_seqno != params->seqno);
BUG_ON(ring->preallocated_lazy_request != params->request);
+#ifdef CONFIG_DRM_I915_SCHEDULER_PREEMPTION
+ /*
+ * Log the seqno of the batch we're starting, then enable or disable
+ * preemption checks for the duration of this batch
+ */
+ emit_preamble(ring, seqno, params->ctx, preemptive);
+#endif
+
exec_len = params->args_batch_len;
exec_start = params->batch_obj_vm_offset +
params->args_batch_start_offset;
+#ifdef CONFIG_DRM_I915_SCHEDULER_PREEMPTION
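+ /*
+ * If this batch was previously preempted part way through, resume it
+ * from the saved preemption point rather than from the beginning.
+ */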
+ if (params->preemption_point) {
+ uint32_t preemption_offset = params->preemption_point - exec_start;
+ exec_start += preemption_offset;
+ exec_len -= preemption_offset;
+ }
+#endif
+
if (params->cliprects) {
for (i = 0; i < params->args_num_cliprects; i++) {
ret = i915_emit_box(params->dev, &params->cliprects[i],
@@ -1608,6 +1876,11 @@ int i915_gem_do_execbuffer_final(struct i915_execbuffer_params *params)
goto err;
}
+#ifdef CONFIG_DRM_I915_SCHEDULER_PREEMPTION
+ emit_postamble(ring, seqno, start, preemptive);
+ intel_ring_advance(ring);
+#endif
+
trace_i915_gem_ring_dispatch(ring, seqno, params->eb_flags);
/* Seqno matches? */
@@ -43,6 +43,8 @@ const char *i915_qe_state_str(struct i915_scheduler_queue_entry *node)
char *ptr = str;
*(ptr++) = node->bumped ? 'B' : '-',
+ *(ptr++) = (node->params.scheduler_flags & i915_ebp_sf_preempt) ? 'P' : '-';
+ *(ptr++) = (node->params.scheduler_flags & i915_ebp_sf_was_preempt) ? 'p' : '-';
*ptr = 0;
@@ -61,9 +63,15 @@ char i915_scheduler_queue_status_chr(enum i915_scheduler_queue_status status)
case i915_sqs_flying:
return 'F';
+ case i915_sqs_overtaking:
+ return 'O';
+
case i915_sqs_complete:
return 'C';
+ case i915_sqs_preempted:
+ return 'P';
+
default:
break;
}
@@ -86,9 +94,15 @@ const char *i915_scheduler_queue_status_str(
case i915_sqs_flying:
return "Flying";
+ case i915_sqs_overtaking:
+ return "Overtaking";
+
case i915_sqs_complete:
return "Complete";
+ case i915_sqs_preempted:
+ return "Preempted";
+
default:
break;
}
@@ -155,7 +169,11 @@ int i915_scheduler_init(struct drm_device *dev)
/* Default tuning values: */
scheduler->priority_level_max = ~0U;
scheduler->priority_level_preempt = 900;
+#ifdef CONFIG_DRM_I915_SCHEDULER_PREEMPTION
+ scheduler->min_flying = 8;
+#else
scheduler->min_flying = 2;
+#endif
scheduler->file_queue_max = 64;
dev_priv->scheduler = scheduler;
@@ -172,7 +190,7 @@ int i915_scheduler_queue_execbuffer(struct i915_scheduler_queue_entry *qe)
struct i915_scheduler_queue_entry *test;
struct timespec stamp;
unsigned long flags;
- bool not_flying, found;
+ bool not_flying, want_preempt, found;
int i, j, r, got_batch = 0;
int incomplete = 0;
@@ -315,12 +333,22 @@ int i915_scheduler_queue_execbuffer(struct i915_scheduler_queue_entry *qe)
not_flying = i915_scheduler_count_flying(scheduler, ring) <
scheduler->min_flying;
+#ifdef CONFIG_DRM_I915_SCHEDULER_PREEMPTION
+ want_preempt = node->priority >= scheduler->priority_level_preempt;
+#else
+ want_preempt = false;
+#endif
+
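+ /* Both flags are set together: i915_ebp_sf_preempt may be cleared
+ * later if preemption is downgraded, while i915_ebp_sf_was_preempt
+ * is a sticky record of the original request so preemption can be
+ * re-enabled when the node is requeued. */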
+ if (want_preempt)
+ node->params.scheduler_flags |= i915_ebp_sf_preempt |
+ i915_ebp_sf_was_preempt;
+
trace_i915_scheduler_queue(ring, node);
trace_i915_scheduler_node_state_change(ring, node);
spin_unlock_irqrestore(&scheduler->lock, flags);
- if (not_flying)
+ if (not_flying || want_preempt)
i915_scheduler_submit(ring, true);
return 0;
@@ -341,6 +369,14 @@ int i915_scheduler_fly_seqno(struct intel_engine_cs *ring, uint32_t seqno)
if (scheduler->flags[ring->id] & i915_sf_submitting)
return 0;
+#ifdef CONFIG_DRM_I915_SCHEDULER_PREEMPTION
+ /* Does not work with preemption as that requires the extra seqno status
+ * words to be updated rather than just the one original word! */
+ DRM_DEBUG_SCHED("<%s> Got non-batch ring submission! [seqno = %d]\n",
+ ring->name, seqno);
+ return 0;
+#endif
+
getrawmonotonic(&stamp);
/* Need to allocate a new node. Note that kzalloc can sleep
@@ -382,7 +418,10 @@ int i915_scheduler_fly_node(struct i915_scheduler_queue_entry *node)
* hardware submission order. */
list_add(&node->link, &scheduler->node_queue[ring->id]);
- node->status = i915_sqs_flying;
+ if (node->params.scheduler_flags & i915_ebp_sf_preempt)
+ node->status = i915_sqs_overtaking;
+ else
+ node->status = i915_sqs_flying;
trace_i915_scheduler_fly(ring, node);
trace_i915_scheduler_node_state_change(ring, node);
@@ -424,6 +463,9 @@ static inline bool i915_scheduler_is_dependency_valid(
if (I915_SQS_IS_FLYING(dep)) {
if (node->params.ring != dep->params.ring)
return true;
+
+ if (node->params.scheduler_flags & i915_ebp_sf_preempt)
+ return true;
}
return false;
@@ -467,6 +509,309 @@ static void i915_scheduler_node_kill(struct i915_scheduler_queue_entry *node)
trace_i915_scheduler_node_state_change(node->params.ring, node);
}
+#ifdef CONFIG_DRM_I915_SCHEDULER_PREEMPTION
+
+/*
+ * The batch tagged with the indicated sequence number has been started
+ * (but not yet completed). Must be called with spinlock already held.
+ *
+ * This handles two distinct cases: preemptED and preemptIVE. In both
+ * cases, the associated batch MUST exist and be FLYING. Because batch
+ * buffers are moved to the head of the queue as they are submitted to
+ * the hardware, no FLYING batch can come later than the first COMPLETED
+ * batch, even with preemption, so we can quit the search early if we
+ * find a COMPLETED batch -- which would be a BUG.
+ *
+ * In the case of mid_batch == true, the batch buffer itself was
+ * non-preemptive and has been preempted part way through (at the given
+ * address). The address must be saved away so that the starting point can be
+ * adjusted when the batch is resubmitted.
+ *
+ * In the case of mid_batch == false, the batch buffer is the preempting one
+ * and has started executing (potentially pre-empting other batch buffers part
+ * way through) but not yet completed (at the time of analysis). At this point
+ * it should, in theory, be safe to reallow ring submission rather than waiting
+ * for the preemptive batch to fully complete.
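+ *
+ * For example: if preemptive batch 9 preempts non-preemptive batch 7 part
+ * way through at ring address A, one interrupt will typically report batch
+ * 9 as started (mid_batch == false) and a later one will report batch 7 as
+ * stopped at A (mid_batch == true) so it can be resubmitted from there.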
+ */
+static void i915_scheduler_seqno_started(struct intel_engine_cs *ring,
+ uint32_t seqno, bool mid_batch,
+ uint32_t bb_addr)
+{
+ struct drm_i915_private *dev_priv = ring->dev->dev_private;
+ struct i915_scheduler *scheduler = dev_priv->scheduler;
+ struct i915_scheduler_queue_entry *node;
+ bool found = false;
+
+ list_for_each_entry(node, &scheduler->node_queue[ring->id], link) {
+ if (seqno == node->params.seqno) {
+ found = true;
+ break;
+ }
+
+ BUG_ON(I915_SQS_IS_COMPLETE(node));
+ }
+
+ BUG_ON(!found);
+
+ if (mid_batch) {
+ BUG_ON(node->status != i915_sqs_flying);
+ node->params.preemption_point = bb_addr;
+ } else {
+ BUG_ON(node->status != i915_sqs_overtaking);
+ }
+}
+
+/*
+ * The batch tagged with the indicated seqence number has completed.
+ * Search the queue for it, update its status and those of any batches
+ * submitted earlier, which must also have completed or been preempted
+ * as appropriate.
+ *
+ * Called with spinlock already held.
+ */
+static void i915_scheduler_seqno_complete(struct intel_engine_cs *ring,
+ uint32_t seqno, bool preemptive)
+{
+ struct drm_i915_private *dev_priv = ring->dev->dev_private;
+ struct i915_scheduler *scheduler = dev_priv->scheduler;
+ struct i915_scheduler_queue_entry *node;
+ bool found = false;
+
+ /*
+ * Batch buffers are added to the head of the list in execution order,
+ * thus seqno values, although not necessarily incrementing, will be
+ * met in completion order when scanning the list. So when a match is
+ * found, all subsequent entries must either also have completed or have
+ * been preempted.
+ */
+ list_for_each_entry(node, &scheduler->node_queue[ring->id], link) {
+ if (seqno == node->params.seqno) {
+ found = true;
+ break;
+ }
+ }
+
+ trace_i915_scheduler_landing(ring, seqno, found ? node : NULL);
+ BUG_ON(!found);
+
+ if (preemptive) {
+ BUG_ON(node->status != i915_sqs_overtaking);
+
+ /*
+ * This batch has overtaken and preempted those still on the
+ * list. All batches in flight will need to be resubmitted.
+ */
+ node->status = i915_sqs_complete;
+ trace_i915_scheduler_node_state_change(ring, node);
+
+ list_for_each_entry_continue(node, &scheduler->node_queue[ring->id], link) {
+ BUG_ON(node->status == i915_sqs_overtaking);
+
+ if (I915_SQS_IS_COMPLETE(node))
+ break;
+
+ if (node->status != i915_sqs_flying)
+ continue;
+
+ node->status = i915_sqs_preempted;
+ trace_i915_scheduler_unfly(ring, node);
+ trace_i915_scheduler_node_state_change(ring, node);
+ }
+
+ /*
+ * Preemption finished:
+ *
+ * The 'preempting' flag prevented submissions to the ring
+ * while a preemptive batch was in flight. Now that it is
+ * complete, the flag can be cleared and submissions may be
+ * resumed.
+ *
+ * The 'preempted' flag, OTOH, tells waiters who may be holding
+ * the 'struct_mutex' that preemption has occurred, and they
+ * should wake up (or not go to sleep) and release the mutex so
+ * that the scheduler's delayed-work task can postprocess the
+ * request queue and initiate submission of more batches.
+ * Without this, a thread that is waiting for a batch that has
+ * been preempted (or has not yet been submitted to the hardware)
+ * could sleep while holding the mutex but would never receive
+ * a wakeup, resulting in a device hang.
+ */
+ scheduler->flags[ring->id] &= ~i915_sf_preempting;
+ scheduler->flags[ring->id] |= i915_sf_preempted;
+ } else {
+ BUG_ON(node->status != i915_sqs_flying);
+
+ /* Everything from here can be marked as done: */
+ list_for_each_entry_from(node, &scheduler->node_queue[ring->id], link) {
+ BUG_ON(node->status == i915_sqs_overtaking);
+
+ /* Check if the marking has already been done: */
+ if (I915_SQS_IS_COMPLETE(node))
+ break;
+
+ if (node->status != i915_sqs_flying)
+ continue;
+
+ /* Node was in flight so mark it as complete. */
+ node->status = i915_sqs_complete;
+ trace_i915_scheduler_node_state_change(ring, node);
+ }
+ }
+
+ /* Should submit new work here if flight list is empty but the DRM
+ * mutex lock might not be available if a '__wait_seqno()' call is
+ * blocking the system. */
+}
+
+/*
+ * In the non-preemption case, the last seqno processed by the ring is
+ * sufficient information to keep track of what has or has not completed.
+ *
+ * However, it is insufficient in the preemption case as much historical
+ * information can be lost. Instead, four separate seqno values are required
+ * to distinguish between batches that have completed versus ones that have
+ * been preempted:
+ * p_active sequence number of currently executing preemptive batch or
+ * zero if no such batch is executing
+ * b_active sequence number of currently executing non-preemptive batch
+ * or zero if no such batch is executing
+ * p_done sequence number of last completed preemptive batch
+ * b_done sequence number of last completed non-preemptive batch
+ *
+ * NB: Zero is not a valid sequence number and is therefore safe to use as an
+ * 'N/A' type value.
+ *
+ * Only one preemptive batch can be in flight at a time. No more
+ * batches can be submitted until it completes, at which time there should
+ * be no further activity. Completion of a preemptive batch is indicated
+ * by (p_done == p_active != 0).
+ *
+ * At any other time, the GPU may still be running additional tasks after the
+ * one that initiated the interrupt, so any values read from the hardware
+ * status page may not reflect a single coherent state!
+ *
+ * In particular, the following cases can occur while handling the completion
+ * of a preemptive batch:
+ *
+ * 1. The regular case is that 'seqno' == 'p_done', and 'b_done' differs
+ * from them, being from an earlier non-preemptive batch.
+ *
+ * 2. The interrupt was generated by an earlier non-preemptive batch. In this
+ * case, 'seqno' should match 'b_done' and 'p_done' should differ.
+ * There should also be another interrupt still on its way!
+ * GPU: seq 1, intr 1 ...
+ * CPU: intr 1, reads seqno
+ * GPU: seq 2
+ * CPU: reads p_done, b_done
+ * GPU: intr 2
+ * This can happen when 1 is regular and 2 is preemptive. Most other
+ * strange cases should not happen simply because of the requirement
+ * that no more batches are submitted after a preemptive one until the
+ * preemption completes.
+ *
+ * In the case of handling completion of a NON-preemptive batch, the following
+ * may be observed:
+ *
+ * 1. The regular case is that 'seqno' == 'b_done' and the interrupt was
+ * generated by the completion of the most recent (non-preemptive) batch.
+ *
+ * 2. The interrupt was generated by an earlier non-preemptive batch. In this
+ * case, 'seqno' should be earlier than 'b_done'. There should be another
+ * interrupt still on its way!
+ * GPU: seq 1, intr 1 ...
+ * CPU: intr 1, reads seqno
+ * GPU: seq 2
+ * CPU: reads b_done
+ * GPU: intr 2
+ * This can easily happen when 1 and 2 are both regular batches.
+ *
+ * 3. Updates to the sequence number can overtake interrupts:
+ * GPU: seq 1, intr 1 (delayed), seq 2 ...
+ * CPU: intr 1, reads/processes seq 2
+ * GPU: intr 2
+ * CPU: intr 2, reads seq 2 again
+ * This can only happen when 1 and 2 are both regular batches i.e. not
+ * the preemptive case where nothing can be queued until preemption is
+ * seen to have completed.
+ *
+ * 4. If there are non-batch commands (with sequence numbers) in the ring,
+ * then 'seqno' could be updated by such a command while 'b_done' remains
+ * at the number of the last non-preemptive batch.
+ *
+ * 5. 'seqno' could also be left over from an already-serviced preemptive batch.
+ *
+ * All of which basically means that 'seqno' as read via 'ring->get_seqno()' is
+ * not especially useful. Thus the four batch buffer bookend values are all that
+ * is used to determine exactly what has or has not occurred between this ISR
+ * execution and the last.
+ */
+int i915_scheduler_handle_IRQ(struct intel_engine_cs *ring)
+{
+ struct drm_i915_private *dev_priv = ring->dev->dev_private;
+ struct i915_scheduler *scheduler = dev_priv->scheduler;
+ unsigned long flags;
+ uint32_t b_active, b_done, p_active, p_done;
+
+ spin_lock_irqsave(&scheduler->lock, flags);
+
+ p_done = intel_read_status_page(ring, I915_PREEMPTIVE_DONE_SEQNO);
+ p_active = intel_read_status_page(ring, I915_PREEMPTIVE_ACTIVE_SEQNO);
+ b_done = intel_read_status_page(ring, I915_BATCH_DONE_SEQNO);
+ b_active = intel_read_status_page(ring, I915_BATCH_ACTIVE_SEQNO);
+
+ trace_i915_scheduler_irq(ring, ring->get_seqno(ring, false),
+ b_active, b_done, p_active, p_done);
+
+ if (i915.scheduler_override & i915_so_direct_submit) {
+ spin_unlock_irqrestore(&scheduler->lock, flags);
+ return 0;
+ }
+
+ /* All regular batches up to 'b_done' have completed */
+ if (b_done != ring->last_regular_batch) {
+ i915_scheduler_seqno_complete(ring, b_done, false);
+ ring->last_regular_batch = b_done;
+ }
+
+ if (p_done) {
+ /*
+ * The preemptive batch identified by 'p_done' has completed.
+ * If 'b_active' is different from 'p_active' and nonzero, that
+ * batch has been preempted mid-batch. All other batches still
+ * in flight have been preempted before starting.
+ */
+ BUG_ON(p_active != p_done);
+ if (b_active == p_active) {
+ /* null preemption (ring was idle) */
+ } else if (b_active == 0) {
+ /* interbatch preemption (ring was busy) */
+ } else /* any other value of b_active */ {
+ /* midbatch preemption (batch was running) */
+ uint32_t b_addr = intel_read_status_page(ring, I915_SAVE_PREEMPTED_BB_PTR);
+ i915_scheduler_seqno_started(ring, b_active, true, b_addr);
+ }
+
+ i915_scheduler_seqno_complete(ring, p_done, true);
+ ring->last_preemptive_batch = p_done;
+
+ /* Clear the active-batch and preemptive-batch-done sequence
+ * numbers in the status page */
+ intel_write_status_page(ring, I915_BATCH_ACTIVE_SEQNO, 0);
+ intel_write_status_page(ring, I915_PREEMPTIVE_DONE_SEQNO, 0);
+ } else if (p_active && p_active != ring->last_preemptive_batch) {
+ /* new preemptive batch started but not yet finished */
+ i915_scheduler_seqno_started(ring, p_active, false, 0);
+ }
+
+ spin_unlock_irqrestore(&scheduler->lock, flags);
+
+ queue_work(dev_priv->wq, &dev_priv->mm.scheduler_work);
+
+ return 0;
+}
+
+#else /* CONFIG_DRM_I915_SCHEDULER_PREEMPTION */
+
/*
* The batch tagged with the indicated seqence number has completed.
* Search the queue for it, update its status and those of any batches
@@ -542,7 +887,7 @@ int i915_scheduler_handle_IRQ(struct intel_engine_cs *ring)
seqno = ring->get_seqno(ring, false);
- trace_i915_scheduler_irq(ring, seqno);
+ trace_i915_scheduler_irq(ring, seqno, 0, 0, 0, 0);
if (i915.scheduler_override & i915_so_direct_submit)
return 0;
@@ -562,6 +907,8 @@ int i915_scheduler_handle_IRQ(struct intel_engine_cs *ring)
return 0;
}
+#endif /* CONFIG_DRM_I915_SCHEDULER_PREEMPTION */
+
int i915_scheduler_remove(struct intel_engine_cs *ring)
{
struct drm_i915_private *dev_priv = ring->dev->dev_private;
@@ -1040,7 +1387,8 @@ static int i915_scheduler_pop_from_queue_locked(struct intel_engine_cs *ring,
int ret;
int i;
bool any_queued;
- bool has_local, has_remote, only_remote;
+ bool has_local, has_remote, only_remote, local_preempt_only;
+ bool was_preempted = false;
*pop_node = NULL;
ret = -ENODATA;
@@ -1054,18 +1402,44 @@ static int i915_scheduler_pop_from_queue_locked(struct intel_engine_cs *ring,
continue;
any_queued = true;
+ /* Attempt to re-enable pre-emption if a node wants to pre-empt
+ * but previously got downgraded. */
+ if ((node->params.scheduler_flags &
+ (i915_ebp_sf_preempt |
+ i915_ebp_sf_was_preempt)) ==
+ i915_ebp_sf_was_preempt)
+ node->params.scheduler_flags |=
+ i915_ebp_sf_preempt;
+
has_local = false;
has_remote = false;
+ local_preempt_only = true;
for (i = 0; i < node->num_deps; i++) {
if (!i915_scheduler_is_dependency_valid(node, i))
continue;
- if (node->dep_list[i]->params.ring == node->params.ring)
+ if (node->dep_list[i]->params.ring == node->params.ring) {
has_local = true;
- else
+
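+ /* Check whether this local dependency would still block
+ * the node if it were not preemptive: temporarily clear
+ * the preempt flag and re-evaluate the dependency. */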
+ if (local_preempt_only &&
+ (node->params.scheduler_flags & i915_ebp_sf_preempt)) {
+ node->params.scheduler_flags &= ~i915_ebp_sf_preempt;
+ if (i915_scheduler_is_dependency_valid(node, i))
+ local_preempt_only = false;
+ node->params.scheduler_flags |= i915_ebp_sf_preempt;
+ }
+ } else
has_remote = true;
}
+ if (has_local && local_preempt_only) {
+ /* If a preemptive node's local dependencies are all
+ * flying, then they can be ignored by un-preempting the
+ * node. */
+ node->params.scheduler_flags &= ~i915_ebp_sf_preempt;
+ has_local = false;
+ }
+
if (has_remote && !has_local)
only_remote = true;
@@ -1080,6 +1454,7 @@ static int i915_scheduler_pop_from_queue_locked(struct intel_engine_cs *ring,
list_del(&best->link);
INIT_LIST_HEAD(&best->link);
+ was_preempted = best->status == i915_sqs_preempted;
best->status = i915_sqs_none;
trace_i915_scheduler_node_state_change(ring, best);
@@ -1105,6 +1480,13 @@ static int i915_scheduler_pop_from_queue_locked(struct intel_engine_cs *ring,
trace_i915_scheduler_pop_from_queue(ring, best);
+ if (was_preempted) {
+ /* Previously submitted - cancel outstanding request */
+ spin_unlock_irqrestore(&scheduler->lock, *flags);
+ i915_gem_cancel_request(ring, best->params.seqno);
+ spin_lock_irqsave(&scheduler->lock, *flags);
+ }
+
*pop_node = best;
return ret;
}
@@ -1118,6 +1500,12 @@ int i915_scheduler_submit(struct intel_engine_cs *ring, bool was_locked)
unsigned long flags;
int ret = 0, count = 0;
+ if (scheduler->flags[ring->id] & i915_sf_preempting) {
+ /* If a pre-emption event is in progress then no other work may
+ * be submitted to that ring. Come back later... */
+ return -EAGAIN;
+ }
+
if (!was_locked) {
ret = i915_mutex_lock_interruptible(dev);
if (ret)
@@ -1145,10 +1533,45 @@ int i915_scheduler_submit(struct intel_engine_cs *ring, bool was_locked)
BUG_ON(node->status != i915_sqs_none);
count++;
+ if (node->params.scheduler_flags & i915_ebp_sf_preempt) {
+ struct i915_scheduler_queue_entry *fly;
+ bool got_flying = false;
+
+ list_for_each_entry(fly, &scheduler->node_queue[ring->id], link) {
+ if (!I915_SQS_IS_FLYING(fly))
+ continue;
+
+ got_flying = true;
+ if (fly->priority >= node->priority) {
+ /* Already working on something at least
+ * as important, so don't interrupt it. */
+ node->params.scheduler_flags &=
+ ~i915_ebp_sf_preempt;
+ break;
+ }
+ }
+
+ if (!got_flying) {
+ /* Nothing to preempt so don't bother. */
+ node->params.scheduler_flags &=
+ ~i915_ebp_sf_preempt;
+ }
+ }
+
/* The call to pop above will have removed the node from the
* list. So add it back in and mark it as in flight. */
i915_scheduler_fly_node(node);
+ /* If the submission code path is being called then the
+ * scheduler must be out of the 'post-preemption' state. */
+ scheduler->flags[ring->id] &= ~i915_sf_preempted;
+ /* If this batch is pre-emptive then it will tie the hardware
+ * up until it has at least begun to be executed. That is,
+ * if a pre-emption request is in flight then no other work
+ * may be submitted until it resolves. */
+ if (node->params.scheduler_flags & i915_ebp_sf_preempt)
+ scheduler->flags[ring->id] |= i915_sf_preempting;
+
scheduler->flags[ring->id] |= i915_sf_submitting;
spin_unlock_irqrestore(&scheduler->lock, flags);
ret = i915_gem_do_execbuffer_final(&node->params);
@@ -1160,7 +1583,9 @@ int i915_scheduler_submit(struct intel_engine_cs *ring, bool was_locked)
/* Oh dear! Either the node is broken or the ring is
* busy. So need to kill the node or requeue it and try
- * again later as appropriate. */
+ * again later as appropriate. Either way, clear the
+ * pre-emption flag as it ain't happening. */
+ scheduler->flags[ring->id] &= ~i915_sf_preempting;
switch (-ret) {
case EAGAIN:
@@ -1195,6 +1620,10 @@ int i915_scheduler_submit(struct intel_engine_cs *ring, bool was_locked)
i915_scheduler_node_kill(node);
}
+ /* If pre-emption is in progress then give up and go home. */
+ if (scheduler->flags[ring->id] & i915_sf_preempting)
+ break;
+
/* Keep launching until the sky is sufficiently full. */
if (i915_scheduler_count_flying(scheduler, ring) >=
scheduler->min_flying)
@@ -1329,6 +1758,28 @@ int i915_scheduler_closefile(struct drm_device *dev, struct drm_file *file)
return 0;
}
+bool i915_scheduler_is_busy(struct intel_engine_cs *ring)
+{
+ struct drm_i915_private *dev_priv = ring->dev->dev_private;
+ struct i915_scheduler *scheduler = dev_priv->scheduler;
+
+ /*
+ * The scheduler is prevented from sending batches to the hardware
+ * while preemption is in progress (i915_sf_preempting).
+ *
+ * Post-preemption (i915_sf_preempted), the hardware ring will be
+ * empty, and the scheduler therefore needs a chance to run the
+ * delayed work task to retire completed work and restart submission.
+ *
+ * Therefore, if either flag is set, the scheduler is busy.
+ */
+ if (scheduler->flags[ring->id] & (i915_sf_preempting |
+ i915_sf_preempted))
+ return true;
+
+ return false;
+}
+
bool i915_scheduler_is_idle(struct intel_engine_cs *ring)
{
struct i915_scheduler_queue_entry *node;
@@ -42,9 +42,19 @@ struct i915_execbuffer_params {
uint32_t mask;
int mode;
struct intel_context *ctx;
+ uint32_t preemption_point;
int seqno;
struct drm_i915_gem_request *request;
uint32_t scheduler_index;
+ uint32_t scheduler_flags;
+};
+
+/* Flag bits for i915_execbuffer_params::scheduler_flags */
+enum {
+ /* Preemption is currently enabled */
+ i915_ebp_sf_preempt = (1 << 0),
+ /* Preemption was originally requested */
+ i915_ebp_sf_was_preempt = (1 << 1),
};
enum i915_scheduler_queue_status {
@@ -54,8 +64,13 @@ enum i915_scheduler_queue_status {
i915_sqs_queued,
/* Sent to hardware for processing: */
i915_sqs_flying,
+ /* Sent to hardware for high-priority processing: */
+ i915_sqs_overtaking,
/* Finished processing on the hardware: */
i915_sqs_complete,
+ /* Was submitted, may or may not have started processing, now being
+ * evicted: */
+ i915_sqs_preempted,
/* Limit value for use with arrays/loops */
i915_sqs_MAX
};
@@ -63,8 +78,10 @@ char i915_scheduler_queue_status_chr(enum i915_scheduler_queue_status status);
const char *i915_scheduler_queue_status_str(
enum i915_scheduler_queue_status status);
-#define I915_SQS_IS_QUEUED(node) (((node)->status == i915_sqs_queued))
-#define I915_SQS_IS_FLYING(node) (((node)->status == i915_sqs_flying))
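+/* A preempted node must be resubmitted, so it counts as queued; an
+ * overtaking (preemptive) node is on the hardware, so it counts as flying. */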
+#define I915_SQS_IS_QUEUED(node) (((node)->status == i915_sqs_queued) || \
+ ((node)->status == i915_sqs_preempted))
+#define I915_SQS_IS_FLYING(node) (((node)->status == i915_sqs_flying) || \
+ ((node)->status == i915_sqs_overtaking))
#define I915_SQS_IS_COMPLETE(node) ((node)->status == i915_sqs_complete)
struct i915_scheduler_obj_entry {
@@ -125,6 +142,10 @@ enum {
i915_sf_interrupts_enabled = (1 << 0),
i915_sf_submitting = (1 << 1),
+ /* Preemption-related state */
+ i915_sf_preempting = (1 << 4),
+ i915_sf_preempted = (1 << 5),
+
/* Dump/debug flags */
i915_sf_dump_force = (1 << 8),
i915_sf_dump_details = (1 << 9),
@@ -747,20 +747,33 @@ TRACE_EVENT(i915_scheduler_node_state_change,
);
TRACE_EVENT(i915_scheduler_irq,
- TP_PROTO(struct intel_engine_cs *ring, uint32_t seqno),
- TP_ARGS(ring, seqno),
+ TP_PROTO(struct intel_engine_cs *ring, uint32_t seqno,
+ uint32_t b_active, uint32_t b_done,
+ uint32_t p_active, uint32_t p_done),
+ TP_ARGS(ring, seqno, b_active, b_done, p_active, p_done),
TP_STRUCT__entry(
__field(u32, ring)
__field(u32, seqno)
+ __field(u32, b_active)
+ __field(u32, b_done)
+ __field(u32, p_active)
+ __field(u32, p_done)
),
TP_fast_assign(
- __entry->ring = ring->id;
- __entry->seqno = seqno;
+ __entry->ring = ring->id;
+ __entry->seqno = seqno;
+ __entry->b_active = b_active;
+ __entry->b_done = b_done;
+ __entry->p_active = p_active;
+ __entry->p_done = p_done;
),
- TP_printk("ring=%d, seqno=%d", __entry->ring, __entry->seqno)
+ TP_printk("ring=%d, seqno=%d, b_active = %d, b_done = %d, p_active = %d, p_done = %d",
+ __entry->ring, __entry->seqno,
+ __entry->b_active, __entry->b_done,
+ __entry->p_active, __entry->p_done)
);
TRACE_EVENT(i915_gem_ring_queue,
@@ -182,6 +182,10 @@ struct intel_engine_cs {
struct intel_context *default_context;
struct intel_context *last_context;
+#ifdef CONFIG_DRM_I915_SCHEDULER_PREEMPTION
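+ /* Seqnos of the most recent regular/preemptive batches whose
+ * completion has been processed by the scheduler's IRQ handler */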
+ uint32_t last_regular_batch;
+ uint32_t last_preemptive_batch;
+#endif
struct intel_ring_hangcheck hangcheck;
From: John Harrison <John.C.Harrison@Intel.com>

Added support for pre-empting batch buffers that have already been
submitted to the ring. Currently this implements Gen7 level pre-emption,
which means pre-empting only at voluntary points within the batch buffer.
The ring submission code itself adds such points between batch buffers,
and the OpenCL driver should be adding them within GPGPU-specific batch
buffers. Other types of workload cannot be preempted by the hardware and
so will not be adding pre-emption points to their buffers.

When a pre-emption occurs, the scheduler must work out which buffers have
been pre-empted versus which actually managed to complete first, and, of
those that were pre-empted, whether the last one was pre-empted mid-batch
or had not yet begun to execute. This is done by extending the seqno
mechanism to four slots: batch buffer start, batch buffer end, preemption
start and preemption end. By querying these four numbers (and only
allowing a single preemption event at a time) the scheduler is guaranteed
to work out exactly what happened to all batch buffers that had been
submitted to the ring.

A Kconfig option has also been added to allow pre-emption support to be
enabled or disabled.
---
 drivers/gpu/drm/i915/Kconfig               |   8 +
 drivers/gpu/drm/i915/i915_gem.c            |  12 +
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 273 ++++++++++++++++
 drivers/gpu/drm/i915/i915_scheduler.c      | 467 +++++++++++++++++++++++++++-
 drivers/gpu/drm/i915/i915_scheduler.h      |  25 +-
 drivers/gpu/drm/i915/i915_trace.h          |  23 +-
 drivers/gpu/drm/i915/intel_ringbuffer.h    |   4 +
 7 files changed, 797 insertions(+), 15 deletions(-)