
[12/26] drm/i915/guc: Implement multi-lrc submission

Message ID: 20211004220637.14746-13-matthew.brost@intel.com (mailing list archive)
State: New, archived
Series: Parallel submission aka multi-bb execbuf

Commit Message

Matthew Brost Oct. 4, 2021, 10:06 p.m. UTC
Implement multi-lrc submission via a single workqueue entry and a single
H2G. The workqueue entry contains an updated tail value for each
request, one for every context in the multi-lrc submission, so that all
of the tails are updated simultaneously. As such, the tasklet and bypass
path have been updated to coalesce requests into a single submission.
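
For orientation, the multi-lrc work queue item (WQI) written by
__guc_wq_item_append() in this patch boils down to the following layout
(a condensed sketch of the code in the diff below, not an additional
definition):

	/* header: WQI type and length in dwords (excluding the header) */
	*wqi++ = FIELD_PREP(WQ_TYPE_MASK, WQ_TYPE_MULTI_LRC) |
		 FIELD_PREP(WQ_LEN_MASK, len_dw);
	*wqi++ = ce->lrc.lrca;			/* parent LRCA */
	*wqi++ = FIELD_PREP(WQ_GUC_ID_MASK, ce->guc_id.id) |
		 FIELD_PREP(WQ_RING_TAIL_MASK, ce->ring->tail / sizeof(u64));
	*wqi++ = 0;				/* fence_id */
	for_each_child(ce, child)		/* one updated tail per child */
		*wqi++ = child->ring->tail / sizeof(u64);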

v2:
 (John Harrison)
  - s/wqe/wqi
  - Use FIELD_PREP macros
  - Add GEM_BUG_ONs to ensure length fits within field
  - Add comment / white space to intel_guc_write_barrier
 (Kernel test robot)
  - Make need_tasklet a static function

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/i915/gt/uc/intel_guc.c        |  26 ++
 drivers/gpu/drm/i915/gt/uc/intel_guc.h        |   8 +
 drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c     |  24 +-
 drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  23 +-
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 319 ++++++++++++++++--
 drivers/gpu/drm/i915/i915_request.h           |   8 +
 6 files changed, 335 insertions(+), 73 deletions(-)

Comments

kernel test robot Oct. 5, 2021, 7:55 a.m. UTC | #1
Hi Matthew,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on drm-tip/drm-tip]
[cannot apply to drm-intel/for-linux-next drm-exynos/exynos-drm-next tegra-drm/drm/tegra/for-next linus/master airlied/drm-next v5.15-rc3 next-20210922]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Matthew-Brost/Parallel-submission-aka-multi-bb-execbuf/20211005-061424
base:   git://anongit.freedesktop.org/drm/drm-tip drm-tip
config: i386-randconfig-a004-20211004 (attached as .config)
compiler: clang version 14.0.0 (https://github.com/llvm/llvm-project c0039de2953d15815448b4b3c3bafb45607781e0)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/3bea3cc438df1105d0d8c1bcc01b96559d4bb78c
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Matthew-Brost/Parallel-submission-aka-multi-bb-execbuf/20211005-061424
        git checkout 3bea3cc438df1105d0d8c1bcc01b96559d4bb78c
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 ARCH=i386 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c:1486:7: warning: variable 'ret' is used uninitialized whenever 'if' condition is false [-Wsometimes-uninitialized]
                   if (multi_lrc_submit(rq)) {
                       ^~~~~~~~~~~~~~~~~~~~
   drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c:1499:9: note: uninitialized use occurs here
           return ret;
                  ^~~
   drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c:1486:3: note: remove the 'if' if its condition is always true
                   if (multi_lrc_submit(rq)) {
                   ^~~~~~~~~~~~~~~~~~~~~~~~~~
   drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c:1479:9: note: initialize the variable 'ret' to silence this warning
           int ret;
                  ^
                   = 0
   1 warning generated.


vim +1486 drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c

  1475	
  1476	static int guc_bypass_tasklet_submit(struct intel_guc *guc,
  1477					     struct i915_request *rq)
  1478	{
  1479		int ret;
  1480	
  1481		__i915_request_submit(rq);
  1482	
  1483		trace_i915_request_in(rq, 0);
  1484	
  1485		if (is_multi_lrc_rq(rq)) {
> 1486			if (multi_lrc_submit(rq)) {
  1487				ret = guc_wq_item_append(guc, rq);
  1488				if (!ret)
  1489					ret = guc_add_request(guc, rq);
  1490			}
  1491		} else {
  1492			guc_set_lrc_tail(rq);
  1493			ret = guc_add_request(guc, rq);
  1494		}
  1495	
  1496		if (unlikely(ret == -EPIPE))
  1497			disable_submission(guc);
  1498	
  1499		return ret;
  1500	}
  1501	
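
One minimal way to address this, matching the note above (a sketch only;
the author may instead prefer to restructure the branch), is to give
'ret' a default value so the path where requests are still being
coalesced (is_multi_lrc_rq() true but multi_lrc_submit() false, i.e. no
H2G issued yet) returns success:

	-	int ret;
	+	int ret = 0;	/* no H2G issued yet while requests are being coalesced */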

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
kernel test robot Oct. 5, 2021, 10:37 a.m. UTC | #2
Hi Matthew,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on drm-tip/drm-tip]
[cannot apply to drm-intel/for-linux-next drm-exynos/exynos-drm-next tegra-drm/drm/tegra/for-next linus/master airlied/drm-next v5.15-rc3 next-20210922]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Matthew-Brost/Parallel-submission-aka-multi-bb-execbuf/20211005-061424
base:   git://anongit.freedesktop.org/drm/drm-tip drm-tip
config: x86_64-randconfig-a003-20211004 (attached as .config)
compiler: clang version 14.0.0 (https://github.com/llvm/llvm-project c0039de2953d15815448b4b3c3bafb45607781e0)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/3bea3cc438df1105d0d8c1bcc01b96559d4bb78c
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Matthew-Brost/Parallel-submission-aka-multi-bb-execbuf/20211005-061424
        git checkout 3bea3cc438df1105d0d8c1bcc01b96559d4bb78c
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 ARCH=x86_64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

>> drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c:1486:7: error: variable 'ret' is used uninitialized whenever 'if' condition is false [-Werror,-Wsometimes-uninitialized]
                   if (multi_lrc_submit(rq)) {
                       ^~~~~~~~~~~~~~~~~~~~
   drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c:1499:9: note: uninitialized use occurs here
           return ret;
                  ^~~
   drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c:1486:3: note: remove the 'if' if its condition is always true
                   if (multi_lrc_submit(rq)) {
                   ^~~~~~~~~~~~~~~~~~~~~~~~~~
   drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c:1479:9: note: initialize the variable 'ret' to silence this warning
           int ret;
                  ^
                   = 0
   1 error generated.


vim +1486 drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c

  1475	
  1476	static int guc_bypass_tasklet_submit(struct intel_guc *guc,
  1477					     struct i915_request *rq)
  1478	{
  1479		int ret;
  1480	
  1481		__i915_request_submit(rq);
  1482	
  1483		trace_i915_request_in(rq, 0);
  1484	
  1485		if (is_multi_lrc_rq(rq)) {
> 1486			if (multi_lrc_submit(rq)) {
  1487				ret = guc_wq_item_append(guc, rq);
  1488				if (!ret)
  1489					ret = guc_add_request(guc, rq);
  1490			}
  1491		} else {
  1492			guc_set_lrc_tail(rq);
  1493			ret = guc_add_request(guc, rq);
  1494		}
  1495	
  1496		if (unlikely(ret == -EPIPE))
  1497			disable_submission(guc);
  1498	
  1499		return ret;
  1500	}
  1501	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
John Harrison Oct. 8, 2021, 5:20 p.m. UTC | #3
On 10/4/2021 15:06, Matthew Brost wrote:
> Implement multi-lrc submission via a single workqueue entry and single
> H2G. The workqueue entry contains an updated tail value for each
> request, of all the contexts in the multi-lrc submission, and updates
> these values simultaneously. As such, the tasklet and bypass path have
> been updated to coalesce requests into a single submission.
>
> v2:
>   (John Harrison)
>    - s/wqe/wqi
>    - Use FIELD_PREP macros
>    - Add GEM_BUG_ONs ensures length fits within field
>    - Add comment / white space to intel_guc_write_barrier
>   (Kernel test robot)
>    - Make need_tasklet a static function
>
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/uc/intel_guc.c        |  26 ++
>   drivers/gpu/drm/i915/gt/uc/intel_guc.h        |   8 +
>   drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c     |  24 +-
>   drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  23 +-
>   .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 319 ++++++++++++++++--
>   drivers/gpu/drm/i915/i915_request.h           |   8 +
>   6 files changed, 335 insertions(+), 73 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.c b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
> index 8f8182bf7c11..7191e8439290 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
> @@ -756,3 +756,29 @@ void intel_guc_load_status(struct intel_guc *guc, struct drm_printer *p)
>   		}
>   	}
>   }
> +
> +void intel_guc_write_barrier(struct intel_guc *guc)
> +{
> +	struct intel_gt *gt = guc_to_gt(guc);
> +
> +	if (i915_gem_object_is_lmem(guc->ct.vma->obj)) {
> +		/*
> +		 * Ensure intel_uncore_write_fw can be used rather than
> +		 * intel_uncore_write.
> +		 */
> +		GEM_BUG_ON(guc->send_regs.fw_domains);
> +
> +		/*
> +		 * This register is used by the i915 and GuC for MMIO based
> +		 * communication. Once we are in this code CTBs are the only
> +		 * method the i915 uses to communicate with the GuC so it is
> +		 * safe to write to this register (a value of 0 is NOP for MMIO
> +		 * communication). If we ever start mixing CTBs and MMIOs a new
> +		 * register will have to be chosen.
> +		 */
Hmm, missed it before, but this comment is very CTB-centric and the
barrier function is now also being used for the parallel submission work
queues. Seems like an extra comment should be added to cover that case;
just something simple noting that WQ usage is likewise guaranteed to be
post CTB switch-over.
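
For example, something along these lines could be appended to the
existing comment (wording is only a sketch of the suggestion):

		/*
		 * The parallel submission work queue is likewise only written
		 * after the switch to CTBs, so the same guarantee holds when
		 * this barrier is used for WQ updates.
		 */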

> +		intel_uncore_write_fw(gt->uncore, GEN11_SOFT_SCRATCH(0), 0);
> +	} else {
> +		/* wmb() sufficient for a barrier if in smem */
> +		wmb();
> +	}
> +}
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> index a9f4ec972bfb..147f39cc0f2f 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> @@ -46,6 +46,12 @@ struct intel_guc {
>   	 * submitted until the stalled request is processed.
>   	 */
>   	struct i915_request *stalled_request;
> +	enum {
> +		STALL_NONE,
> +		STALL_REGISTER_CONTEXT,
> +		STALL_MOVE_LRC_TAIL,
> +		STALL_ADD_REQUEST,
> +	} submission_stall_reason;
>   
>   	/* intel_guc_recv interrupt related state */
>   	/** @irq_lock: protects GuC irq state */
> @@ -361,4 +367,6 @@ void intel_guc_submission_cancel_requests(struct intel_guc *guc);
>   
>   void intel_guc_load_status(struct intel_guc *guc, struct drm_printer *p);
>   
> +void intel_guc_write_barrier(struct intel_guc *guc);
> +
>   #endif
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> index 20c710a74498..10d1878d2826 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> @@ -377,28 +377,6 @@ static u32 ct_get_next_fence(struct intel_guc_ct *ct)
>   	return ++ct->requests.last_fence;
>   }
>   
> -static void write_barrier(struct intel_guc_ct *ct)
> -{
> -	struct intel_guc *guc = ct_to_guc(ct);
> -	struct intel_gt *gt = guc_to_gt(guc);
> -
> -	if (i915_gem_object_is_lmem(guc->ct.vma->obj)) {
> -		GEM_BUG_ON(guc->send_regs.fw_domains);
> -		/*
> -		 * This register is used by the i915 and GuC for MMIO based
> -		 * communication. Once we are in this code CTBs are the only
> -		 * method the i915 uses to communicate with the GuC so it is
> -		 * safe to write to this register (a value of 0 is NOP for MMIO
> -		 * communication). If we ever start mixing CTBs and MMIOs a new
> -		 * register will have to be chosen.
> -		 */
> -		intel_uncore_write_fw(gt->uncore, GEN11_SOFT_SCRATCH(0), 0);
> -	} else {
> -		/* wmb() sufficient for a barrier if in smem */
> -		wmb();
> -	}
> -}
> -
>   static int ct_write(struct intel_guc_ct *ct,
>   		    const u32 *action,
>   		    u32 len /* in dwords */,
> @@ -468,7 +446,7 @@ static int ct_write(struct intel_guc_ct *ct,
>   	 * make sure H2G buffer update and LRC tail update (if this triggering a
>   	 * submission) are visible before updating the descriptor tail
>   	 */
> -	write_barrier(ct);
> +	intel_guc_write_barrier(ct_to_guc(ct));
>   
>   	/* update local copies */
>   	ctb->tail = tail;
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> index 0eeb2a9feeed..a00eeddc1449 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> @@ -58,19 +58,16 @@
>   #define WQ_STATUS_CMD_ERROR		3
>   #define WQ_STATUS_ENGINE_ID_NOT_USED	4
>   #define WQ_STATUS_SUSPENDED_FROM_RESET	5
> -#define WQ_TYPE_SHIFT			0
> -#define   WQ_TYPE_BATCH_BUF		(0x1 << WQ_TYPE_SHIFT)
> -#define   WQ_TYPE_PSEUDO		(0x2 << WQ_TYPE_SHIFT)
> -#define   WQ_TYPE_INORDER		(0x3 << WQ_TYPE_SHIFT)
> -#define   WQ_TYPE_NOOP			(0x4 << WQ_TYPE_SHIFT)
> -#define WQ_TARGET_SHIFT			10
> -#define WQ_LEN_SHIFT			16
> -#define WQ_NO_WCFLUSH_WAIT		(1 << 27)
> -#define WQ_PRESENT_WORKLOAD		(1 << 28)
> -
> -#define WQ_RING_TAIL_SHIFT		20
> -#define WQ_RING_TAIL_MAX		0x7FF	/* 2^11 QWords */
> -#define WQ_RING_TAIL_MASK		(WQ_RING_TAIL_MAX << WQ_RING_TAIL_SHIFT)
> +#define WQ_TYPE_BATCH_BUF		0x1
> +#define WQ_TYPE_PSEUDO			0x2
> +#define WQ_TYPE_INORDER			0x3
> +#define WQ_TYPE_NOOP			0x4
> +#define WQ_TYPE_MULTI_LRC		0x5
> +#define WQ_TYPE_MASK			GENMASK(7, 0)
> +#define WQ_LEN_MASK			GENMASK(26, 16)
> +
> +#define WQ_GUC_ID_MASK			GENMASK(15, 0)
> +#define WQ_RING_TAIL_MASK		GENMASK(28, 18)
Another option for documenting the WQ and WQI would be at the top of this
block of definitions. I believe there is a one-line comment of 'work
queue item header definitions', but none of these defines actually use
the WQI abbreviation. And some description of what the work queue is,
how it is used, etc. would be good.

>   
>   #define GUC_STAGE_DESC_ATTR_ACTIVE	BIT(0)
>   #define GUC_STAGE_DESC_ATTR_PENDING_DB	BIT(1)
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index 031b1bf5ba91..1610120e31a1 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -399,6 +399,29 @@ __get_process_desc(struct intel_context *ce)
>   		   LRC_STATE_OFFSET) / sizeof(u32)));
>   }
>   
> +static u32 *get_wq_pointer(struct guc_process_desc *desc,
> +			   struct intel_context *ce,
> +			   u32 wqi_size)
> +{
> +	/*
> +	 * Check for space in work queue. Caching a value of head pointer in
> +	 * intel_context structure in order reduce the number accesses to shared
> +	 * GPU memory which may be across a PCIe bus.
> +	 */
> +#define AVAILABLE_SPACE	\
> +	CIRC_SPACE(ce->parallel.guc.wqi_tail, ce->parallel.guc.wqi_head, WQ_SIZE)
> +	if (wqi_size > AVAILABLE_SPACE) {
> +		ce->parallel.guc.wqi_head = READ_ONCE(desc->head);
> +
> +		if (wqi_size > AVAILABLE_SPACE)
> +			return NULL;
> +	}
> +#undef AVAILABLE_SPACE
> +
> +	return ((u32 *)__get_process_desc(ce)) +
> +		((WQ_OFFSET + ce->parallel.guc.wqi_tail) / sizeof(u32));
> +}
> +
>   static struct guc_lrc_desc *__get_lrc_desc(struct intel_guc *guc, u32 index)
>   {
>   	struct guc_lrc_desc *base = guc->lrc_desc_pool_vaddr;
> @@ -558,10 +581,10 @@ int intel_guc_wait_for_idle(struct intel_guc *guc, long timeout)
>   
>   static int guc_lrc_desc_pin(struct intel_context *ce, bool loop);
>   
> -static int guc_add_request(struct intel_guc *guc, struct i915_request *rq)
> +static int __guc_add_request(struct intel_guc *guc, struct i915_request *rq)
>   {
>   	int err = 0;
> -	struct intel_context *ce = rq->context;
> +	struct intel_context *ce = request_to_scheduling_context(rq);
>   	u32 action[3];
>   	int len = 0;
>   	u32 g2h_len_dw = 0;
> @@ -582,26 +605,17 @@ static int guc_add_request(struct intel_guc *guc, struct i915_request *rq)
>   	GEM_BUG_ON(!atomic_read(&ce->guc_id.ref));
>   	GEM_BUG_ON(context_guc_id_invalid(ce));
>   
> -	/*
> -	 * Corner case where the GuC firmware was blown away and reloaded while
> -	 * this context was pinned.
> -	 */
> -	if (unlikely(!lrc_desc_registered(guc, ce->guc_id.id))) {
> -		err = guc_lrc_desc_pin(ce, false);
> -		if (unlikely(err))
> -			return err;
> -	}
> -
>   	spin_lock(&ce->guc_state.lock);
>   
>   	/*
>   	 * The request / context will be run on the hardware when scheduling
> -	 * gets enabled in the unblock.
> +	 * gets enabled in the unblock. For multi-lrc we still submit the
> +	 * context to move the LRC tails.
>   	 */
> -	if (unlikely(context_blocked(ce)))
> +	if (unlikely(context_blocked(ce) && !intel_context_is_parent(ce)))
>   		goto out;
>   
> -	enabled = context_enabled(ce);
> +	enabled = context_enabled(ce) || context_blocked(ce);
>   
>   	if (!enabled) {
>   		action[len++] = INTEL_GUC_ACTION_SCHED_CONTEXT_MODE_SET;
> @@ -620,6 +634,18 @@ static int guc_add_request(struct intel_guc *guc, struct i915_request *rq)
>   		trace_intel_context_sched_enable(ce);
>   		atomic_inc(&guc->outstanding_submission_g2h);
>   		set_context_enabled(ce);
> +
> +		/*
> +		 * Without multi-lrc KMD does the submission step (moving the
> +		 * lrc tail) so enabling scheduling is sufficient to submit the
> +		 * context. This isn't the case in multi-lrc submission as the
> +		 * GuC needs to move the tails, hence the need for another H2G
> +		 * to submit a multi-lrc context after enabling scheduling.
> +		 */
> +		if (intel_context_is_parent(ce)) {
> +			action[0] = INTEL_GUC_ACTION_SCHED_CONTEXT;
> +			err = intel_guc_send_nb(guc, action, len - 1, 0);
> +		}
>   	} else if (!enabled) {
>   		clr_context_pending_enable(ce);
>   		intel_context_put(ce);
> @@ -632,6 +658,18 @@ static int guc_add_request(struct intel_guc *guc, struct i915_request *rq)
>   	return err;
>   }
>   
> +static int guc_add_request(struct intel_guc *guc, struct i915_request *rq)
> +{
> +	int ret = __guc_add_request(guc, rq);
> +
> +	if (unlikely(ret == -EBUSY)) {
> +		guc->stalled_request = rq;
> +		guc->submission_stall_reason = STALL_ADD_REQUEST;
> +	}
> +
> +	return ret;
> +}
> +
>   static inline void guc_set_lrc_tail(struct i915_request *rq)
>   {
>   	rq->context->lrc_reg_state[CTX_RING_TAIL] =
> @@ -643,6 +681,134 @@ static inline int rq_prio(const struct i915_request *rq)
>   	return rq->sched.attr.priority;
>   }
>   
> +static bool is_multi_lrc_rq(struct i915_request *rq)
> +{
> +	return intel_context_is_child(rq->context) ||
> +		intel_context_is_parent(rq->context);
> +}
> +
> +static bool can_merge_rq(struct i915_request *rq,
> +			 struct i915_request *last)
> +{
> +	return request_to_scheduling_context(rq) ==
> +		request_to_scheduling_context(last);
> +}
> +
> +static u32 wq_space_until_wrap(struct intel_context *ce)
> +{
> +	return (WQ_SIZE - ce->parallel.guc.wqi_tail);
> +}
> +
> +static void write_wqi(struct guc_process_desc *desc,
> +		      struct intel_context *ce,
> +		      u32 wqi_size)
> +{
> +	/*
> +	 * Ensure WQI are visible before updating tail
> +	 */
> +	intel_guc_write_barrier(ce_to_guc(ce));
> +
> +	ce->parallel.guc.wqi_tail = (ce->parallel.guc.wqi_tail + wqi_size) &
> +		(WQ_SIZE - 1);
This relies on WQ_SIZE being a power of two, right? Is it possible to 
add a BUILD_BUG_ON to ensure that?
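
For reference, a minimal sketch of such a check (placement, e.g. in
write_wqi() or next to the WQ_SIZE definition, is up to the author):

	/* the wqi_tail wrap relies on WQ_SIZE being a power of two */
	BUILD_BUG_ON(WQ_SIZE & (WQ_SIZE - 1));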

> +	WRITE_ONCE(desc->tail, ce->parallel.guc.wqi_tail);
> +}
> +
> +static int guc_wq_noop_append(struct intel_context *ce)
> +{
> +	struct guc_process_desc *desc = __get_process_desc(ce);
> +	u32 *wqi = get_wq_pointer(desc, ce, wq_space_until_wrap(ce));
> +	u32 len_dw = wq_space_until_wrap(ce) / sizeof(u32) - 1;
> +
> +	if (!wqi)
> +		return -EBUSY;
> +
> +	GEM_BUG_ON(!FIELD_FIT(WQ_LEN_MASK, len_dw));
> +
> +	*wqi = FIELD_PREP(WQ_TYPE_MASK, WQ_TYPE_NOOP) |
> +		FIELD_PREP(WQ_LEN_MASK, len_dw);
> +	ce->parallel.guc.wqi_tail = 0;
> +
> +	return 0;
> +}
> +
> +static int __guc_wq_item_append(struct i915_request *rq)
> +{
> +	struct intel_context *ce = request_to_scheduling_context(rq);
> +	struct intel_context *child;
> +	struct guc_process_desc *desc = __get_process_desc(ce);
> +	unsigned int wqi_size = (ce->parallel.number_children + 4) *
> +		sizeof(u32);
> +	u32 *wqi;
> +	u32 len_dw = (wqi_size / sizeof(u32)) - 1;
> +	int ret;
> +
> +	/* Ensure context is in correct state updating work queue */
> +	GEM_BUG_ON(!atomic_read(&ce->guc_id.ref));
> +	GEM_BUG_ON(context_guc_id_invalid(ce));
> +	GEM_BUG_ON(context_wait_for_deregister_to_register(ce));
> +	GEM_BUG_ON(!lrc_desc_registered(ce_to_guc(ce), ce->guc_id.id));
> +
> +	/* Insert NOOP if this work queue item will wrap the tail pointer. */
> +	if (wqi_size > wq_space_until_wrap(ce)) {
> +		ret = guc_wq_noop_append(ce);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	wqi = get_wq_pointer(desc, ce, wqi_size);
> +	if (!wqi)
> +		return -EBUSY;
> +
> +	GEM_BUG_ON(!FIELD_FIT(WQ_LEN_MASK, len_dw));
> +
> +	*wqi++ = FIELD_PREP(WQ_TYPE_MASK, WQ_TYPE_MULTI_LRC) |
> +		FIELD_PREP(WQ_LEN_MASK, len_dw);
> +	*wqi++ = ce->lrc.lrca;
> +	*wqi++ = FIELD_PREP(WQ_GUC_ID_MASK, ce->guc_id.id) |
> +	       FIELD_PREP(WQ_RING_TAIL_MASK, ce->ring->tail / sizeof(u64));
> +	*wqi++ = 0;	/* fence_id */
> +	for_each_child(ce, child)
> +		*wqi++ = child->ring->tail / sizeof(u64);
> +
> +	write_wqi(desc, ce, wqi_size);
> +
> +	return 0;
> +}
> +
> +static int guc_wq_item_append(struct intel_guc *guc,
> +			      struct i915_request *rq)
> +{
> +	struct intel_context *ce = request_to_scheduling_context(rq);
> +	int ret = 0;
> +
> +	if (likely(!intel_context_is_banned(ce))) {
> +		ret = __guc_wq_item_append(rq);
> +
> +		if (unlikely(ret == -EBUSY)) {
> +			guc->stalled_request = rq;
> +			guc->submission_stall_reason = STALL_MOVE_LRC_TAIL;
> +		}
> +	}
> +
> +	return ret;
> +}
> +
> +static bool multi_lrc_submit(struct i915_request *rq)
> +{
> +	struct intel_context *ce = request_to_scheduling_context(rq);
> +
> +	intel_ring_set_tail(rq->ring, rq->tail);
> +
> +	/*
> +	 * We expect the front end (execbuf IOCTL) to set this flag on the last
> +	 * request generated from a multi-BB submission. This indicates to the
> +	 * backend (GuC interface) that we should submit this context thus
> +	 * submitting all the requests generated in parallel.
> +	 */
> +	return test_bit(I915_FENCE_FLAG_SUBMIT_PARALLEL, &rq->fence.flags) ||
FYI: Apparently the test_bit/set_bit/etc helpers are intended for use on
arbitrarily sized bitfields. As in, they do all sorts of complicated
atomic operations to work on 164-bit words and such like. For
single-word flags, the guidance is to just use 'if (word & BIT(bit))'
instead.
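
For illustration, the open-coded form being suggested for a single-word
flag would look roughly like this (a sketch only; the reply further down
opts to keep test_bit() for consistency):

	/* non-atomic flag test on a single word, as suggested above */
	return (rq->fence.flags & BIT(I915_FENCE_FLAG_SUBMIT_PARALLEL)) ||
		intel_context_is_banned(ce);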

John.

> +		intel_context_is_banned(ce);
> +}
> +
>   static int guc_dequeue_one_context(struct intel_guc *guc)
>   {
>   	struct i915_sched_engine * const sched_engine = guc->sched_engine;
> @@ -656,7 +822,17 @@ static int guc_dequeue_one_context(struct intel_guc *guc)
>   	if (guc->stalled_request) {
>   		submit = true;
>   		last = guc->stalled_request;
> -		goto resubmit;
> +
> +		switch (guc->submission_stall_reason) {
> +		case STALL_REGISTER_CONTEXT:
> +			goto register_context;
> +		case STALL_MOVE_LRC_TAIL:
> +			goto move_lrc_tail;
> +		case STALL_ADD_REQUEST:
> +			goto add_request;
> +		default:
> +			MISSING_CASE(guc->submission_stall_reason);
> +		}
>   	}
>   
>   	while ((rb = rb_first_cached(&sched_engine->queue))) {
> @@ -664,8 +840,8 @@ static int guc_dequeue_one_context(struct intel_guc *guc)
>   		struct i915_request *rq, *rn;
>   
>   		priolist_for_each_request_consume(rq, rn, p) {
> -			if (last && rq->context != last->context)
> -				goto done;
> +			if (last && !can_merge_rq(rq, last))
> +				goto register_context;
>   
>   			list_del_init(&rq->sched.link);
>   
> @@ -673,33 +849,84 @@ static int guc_dequeue_one_context(struct intel_guc *guc)
>   
>   			trace_i915_request_in(rq, 0);
>   			last = rq;
> -			submit = true;
> +
> +			if (is_multi_lrc_rq(rq)) {
> +				/*
> +				 * We need to coalesce all multi-lrc requests in
> +				 * a relationship into a single H2G. We are
> +				 * guaranteed that all of these requests will be
> +				 * submitted sequentially.
> +				 */
> +				if (multi_lrc_submit(rq)) {
> +					submit = true;
> +					goto register_context;
> +				}
> +			} else {
> +				submit = true;
> +			}
>   		}
>   
>   		rb_erase_cached(&p->node, &sched_engine->queue);
>   		i915_priolist_free(p);
>   	}
> -done:
> +
> +register_context:
>   	if (submit) {
> -		guc_set_lrc_tail(last);
> -resubmit:
> +		struct intel_context *ce = request_to_scheduling_context(last);
> +
> +		if (unlikely(!lrc_desc_registered(guc, ce->guc_id.id) &&
> +			     !intel_context_is_banned(ce))) {
> +			ret = guc_lrc_desc_pin(ce, false);
> +			if (unlikely(ret == -EPIPE)) {
> +				goto deadlk;
> +			} else if (ret == -EBUSY) {
> +				guc->stalled_request = last;
> +				guc->submission_stall_reason =
> +					STALL_REGISTER_CONTEXT;
> +				goto schedule_tasklet;
> +			} else if (ret != 0) {
> +				GEM_WARN_ON(ret);	/* Unexpected */
> +				goto deadlk;
> +			}
> +		}
> +
> +move_lrc_tail:
> +		if (is_multi_lrc_rq(last)) {
> +			ret = guc_wq_item_append(guc, last);
> +			if (ret == -EBUSY) {
> +				goto schedule_tasklet;
> +			} else if (ret != 0) {
> +				GEM_WARN_ON(ret);	/* Unexpected */
> +				goto deadlk;
> +			}
> +		} else {
> +			guc_set_lrc_tail(last);
> +		}
> +
> +add_request:
>   		ret = guc_add_request(guc, last);
> -		if (unlikely(ret == -EPIPE))
> +		if (unlikely(ret == -EPIPE)) {
> +			goto deadlk;
> +		} else if (ret == -EBUSY) {
> +			goto schedule_tasklet;
> +		} else if (ret != 0) {
> +			GEM_WARN_ON(ret);	/* Unexpected */
>   			goto deadlk;
> -		else if (ret == -EBUSY) {
> -			tasklet_schedule(&sched_engine->tasklet);
> -			guc->stalled_request = last;
> -			return false;
>   		}
>   	}
>   
>   	guc->stalled_request = NULL;
> +	guc->submission_stall_reason = STALL_NONE;
>   	return submit;
>   
>   deadlk:
>   	sched_engine->tasklet.callback = NULL;
>   	tasklet_disable_nosync(&sched_engine->tasklet);
>   	return false;
> +
> +schedule_tasklet:
> +	tasklet_schedule(&sched_engine->tasklet);
> +	return false;
>   }
>   
>   static void guc_submission_tasklet(struct tasklet_struct *t)
> @@ -1255,10 +1482,16 @@ static int guc_bypass_tasklet_submit(struct intel_guc *guc,
>   
>   	trace_i915_request_in(rq, 0);
>   
> -	guc_set_lrc_tail(rq);
> -	ret = guc_add_request(guc, rq);
> -	if (ret == -EBUSY)
> -		guc->stalled_request = rq;
> +	if (is_multi_lrc_rq(rq)) {
> +		if (multi_lrc_submit(rq)) {
> +			ret = guc_wq_item_append(guc, rq);
> +			if (!ret)
> +				ret = guc_add_request(guc, rq);
> +		}
> +	} else {
> +		guc_set_lrc_tail(rq);
> +		ret = guc_add_request(guc, rq);
> +	}
>   
>   	if (unlikely(ret == -EPIPE))
>   		disable_submission(guc);
> @@ -1266,6 +1499,16 @@ static int guc_bypass_tasklet_submit(struct intel_guc *guc,
>   	return ret;
>   }
>   
> +static bool need_tasklet(struct intel_guc *guc, struct i915_request *rq)
> +{
> +	struct i915_sched_engine *sched_engine = rq->engine->sched_engine;
> +	struct intel_context *ce = request_to_scheduling_context(rq);
> +
> +	return submission_disabled(guc) || guc->stalled_request ||
> +		!i915_sched_engine_is_empty(sched_engine) ||
> +		!lrc_desc_registered(guc, ce->guc_id.id);
> +}
> +
>   static void guc_submit_request(struct i915_request *rq)
>   {
>   	struct i915_sched_engine *sched_engine = rq->engine->sched_engine;
> @@ -1275,8 +1518,7 @@ static void guc_submit_request(struct i915_request *rq)
>   	/* Will be called from irq-context when using foreign fences. */
>   	spin_lock_irqsave(&sched_engine->lock, flags);
>   
> -	if (submission_disabled(guc) || guc->stalled_request ||
> -	    !i915_sched_engine_is_empty(sched_engine))
> +	if (need_tasklet(guc, rq))
>   		queue_request(sched_engine, rq, rq_prio(rq));
>   	else if (guc_bypass_tasklet_submit(guc, rq) == -EBUSY)
>   		tasklet_hi_schedule(&sched_engine->tasklet);
> @@ -2258,9 +2500,10 @@ static inline bool new_guc_prio_higher(u8 old_guc_prio, u8 new_guc_prio)
>   
>   static void add_to_context(struct i915_request *rq)
>   {
> -	struct intel_context *ce = rq->context;
> +	struct intel_context *ce = request_to_scheduling_context(rq);
>   	u8 new_guc_prio = map_i915_prio_to_guc_prio(rq_prio(rq));
>   
> +	GEM_BUG_ON(intel_context_is_child(ce));
>   	GEM_BUG_ON(rq->guc_prio == GUC_PRIO_FINI);
>   
>   	spin_lock(&ce->guc_state.lock);
> @@ -2293,7 +2536,9 @@ static void guc_prio_fini(struct i915_request *rq, struct intel_context *ce)
>   
>   static void remove_from_context(struct i915_request *rq)
>   {
> -	struct intel_context *ce = rq->context;
> +	struct intel_context *ce = request_to_scheduling_context(rq);
> +
> +	GEM_BUG_ON(intel_context_is_child(ce));
>   
>   	spin_lock_irq(&ce->guc_state.lock);
>   
> @@ -2712,7 +2957,7 @@ static void guc_init_breadcrumbs(struct intel_engine_cs *engine)
>   static void guc_bump_inflight_request_prio(struct i915_request *rq,
>   					   int prio)
>   {
> -	struct intel_context *ce = rq->context;
> +	struct intel_context *ce = request_to_scheduling_context(rq);
>   	u8 new_guc_prio = map_i915_prio_to_guc_prio(prio);
>   
>   	/* Short circuit function */
> @@ -2735,7 +2980,7 @@ static void guc_bump_inflight_request_prio(struct i915_request *rq,
>   
>   static void guc_retire_inflight_request_prio(struct i915_request *rq)
>   {
> -	struct intel_context *ce = rq->context;
> +	struct intel_context *ce = request_to_scheduling_context(rq);
>   
>   	spin_lock(&ce->guc_state.lock);
>   	guc_prio_fini(rq, ce);
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index 7bd9ed20623e..8950785e55d6 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -139,6 +139,14 @@ enum {
>   	 * the GPU. Here we track such boost requests on a per-request basis.
>   	 */
>   	I915_FENCE_FLAG_BOOST,
> +
> +	/*
> +	 * I915_FENCE_FLAG_SUBMIT_PARALLEL - request with a context in a
> +	 * parent-child relationship (parallel submission, multi-lrc) should
> +	 * trigger a submission to the GuC rather than just moving the context
> +	 * tail.
> +	 */
> +	I915_FENCE_FLAG_SUBMIT_PARALLEL,
>   };
>   
>   /**
Matthew Brost Oct. 13, 2021, 6:24 p.m. UTC | #4
On Fri, Oct 08, 2021 at 10:20:24AM -0700, John Harrison wrote:
> On 10/4/2021 15:06, Matthew Brost wrote:
> > Implement multi-lrc submission via a single workqueue entry and single
> > H2G. The workqueue entry contains an updated tail value for each
> > request, of all the contexts in the multi-lrc submission, and updates
> > these values simultaneously. As such, the tasklet and bypass path have
> > been updated to coalesce requests into a single submission.
> > 
> > v2:
> >   (John Harrison)
> >    - s/wqe/wqi
> >    - Use FIELD_PREP macros
> >    - Add GEM_BUG_ONs ensures length fits within field
> >    - Add comment / white space to intel_guc_write_barrier
> >   (Kernel test robot)
> >    - Make need_tasklet a static function
> > 
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> >   drivers/gpu/drm/i915/gt/uc/intel_guc.c        |  26 ++
> >   drivers/gpu/drm/i915/gt/uc/intel_guc.h        |   8 +
> >   drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c     |  24 +-
> >   drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  23 +-
> >   .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 319 ++++++++++++++++--
> >   drivers/gpu/drm/i915/i915_request.h           |   8 +
> >   6 files changed, 335 insertions(+), 73 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.c b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
> > index 8f8182bf7c11..7191e8439290 100644
> > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.c
> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
> > @@ -756,3 +756,29 @@ void intel_guc_load_status(struct intel_guc *guc, struct drm_printer *p)
> >   		}
> >   	}
> >   }
> > +
> > +void intel_guc_write_barrier(struct intel_guc *guc)
> > +{
> > +	struct intel_gt *gt = guc_to_gt(guc);
> > +
> > +	if (i915_gem_object_is_lmem(guc->ct.vma->obj)) {
> > +		/*
> > +		 * Ensure intel_uncore_write_fw can be used rather than
> > +		 * intel_uncore_write.
> > +		 */
> > +		GEM_BUG_ON(guc->send_regs.fw_domains);
> > +
> > +		/*
> > +		 * This register is used by the i915 and GuC for MMIO based
> > +		 * communication. Once we are in this code CTBs are the only
> > +		 * method the i915 uses to communicate with the GuC so it is
> > +		 * safe to write to this register (a value of 0 is NOP for MMIO
> > +		 * communication). If we ever start mixing CTBs and MMIOs a new
> > +		 * register will have to be chosen.
> > +		 */
> Hmm, missed it before but this comment is very CTB centric and the barrier
> function is now being used for parallel submission work queues. Seems like
> an extra comment should be added to cover that case. Just something simple
> about WQ usage is also guaranteed to be post CTB switch over.
> 

Sure.

> > +		intel_uncore_write_fw(gt->uncore, GEN11_SOFT_SCRATCH(0), 0);
> > +	} else {
> > +		/* wmb() sufficient for a barrier if in smem */
> > +		wmb();
> > +	}
> > +}
> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> > index a9f4ec972bfb..147f39cc0f2f 100644
> > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> > @@ -46,6 +46,12 @@ struct intel_guc {
> >   	 * submitted until the stalled request is processed.
> >   	 */
> >   	struct i915_request *stalled_request;
> > +	enum {
> > +		STALL_NONE,
> > +		STALL_REGISTER_CONTEXT,
> > +		STALL_MOVE_LRC_TAIL,
> > +		STALL_ADD_REQUEST,
> > +	} submission_stall_reason;
> >   	/* intel_guc_recv interrupt related state */
> >   	/** @irq_lock: protects GuC irq state */
> > @@ -361,4 +367,6 @@ void intel_guc_submission_cancel_requests(struct intel_guc *guc);
> >   void intel_guc_load_status(struct intel_guc *guc, struct drm_printer *p);
> > +void intel_guc_write_barrier(struct intel_guc *guc);
> > +
> >   #endif
> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> > index 20c710a74498..10d1878d2826 100644
> > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> > @@ -377,28 +377,6 @@ static u32 ct_get_next_fence(struct intel_guc_ct *ct)
> >   	return ++ct->requests.last_fence;
> >   }
> > -static void write_barrier(struct intel_guc_ct *ct)
> > -{
> > -	struct intel_guc *guc = ct_to_guc(ct);
> > -	struct intel_gt *gt = guc_to_gt(guc);
> > -
> > -	if (i915_gem_object_is_lmem(guc->ct.vma->obj)) {
> > -		GEM_BUG_ON(guc->send_regs.fw_domains);
> > -		/*
> > -		 * This register is used by the i915 and GuC for MMIO based
> > -		 * communication. Once we are in this code CTBs are the only
> > -		 * method the i915 uses to communicate with the GuC so it is
> > -		 * safe to write to this register (a value of 0 is NOP for MMIO
> > -		 * communication). If we ever start mixing CTBs and MMIOs a new
> > -		 * register will have to be chosen.
> > -		 */
> > -		intel_uncore_write_fw(gt->uncore, GEN11_SOFT_SCRATCH(0), 0);
> > -	} else {
> > -		/* wmb() sufficient for a barrier if in smem */
> > -		wmb();
> > -	}
> > -}
> > -
> >   static int ct_write(struct intel_guc_ct *ct,
> >   		    const u32 *action,
> >   		    u32 len /* in dwords */,
> > @@ -468,7 +446,7 @@ static int ct_write(struct intel_guc_ct *ct,
> >   	 * make sure H2G buffer update and LRC tail update (if this triggering a
> >   	 * submission) are visible before updating the descriptor tail
> >   	 */
> > -	write_barrier(ct);
> > +	intel_guc_write_barrier(ct_to_guc(ct));
> >   	/* update local copies */
> >   	ctb->tail = tail;
> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> > index 0eeb2a9feeed..a00eeddc1449 100644
> > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> > @@ -58,19 +58,16 @@
> >   #define WQ_STATUS_CMD_ERROR		3
> >   #define WQ_STATUS_ENGINE_ID_NOT_USED	4
> >   #define WQ_STATUS_SUSPENDED_FROM_RESET	5
> > -#define WQ_TYPE_SHIFT			0
> > -#define   WQ_TYPE_BATCH_BUF		(0x1 << WQ_TYPE_SHIFT)
> > -#define   WQ_TYPE_PSEUDO		(0x2 << WQ_TYPE_SHIFT)
> > -#define   WQ_TYPE_INORDER		(0x3 << WQ_TYPE_SHIFT)
> > -#define   WQ_TYPE_NOOP			(0x4 << WQ_TYPE_SHIFT)
> > -#define WQ_TARGET_SHIFT			10
> > -#define WQ_LEN_SHIFT			16
> > -#define WQ_NO_WCFLUSH_WAIT		(1 << 27)
> > -#define WQ_PRESENT_WORKLOAD		(1 << 28)
> > -
> > -#define WQ_RING_TAIL_SHIFT		20
> > -#define WQ_RING_TAIL_MAX		0x7FF	/* 2^11 QWords */
> > -#define WQ_RING_TAIL_MASK		(WQ_RING_TAIL_MAX << WQ_RING_TAIL_SHIFT)
> > +#define WQ_TYPE_BATCH_BUF		0x1
> > +#define WQ_TYPE_PSEUDO			0x2
> > +#define WQ_TYPE_INORDER			0x3
> > +#define WQ_TYPE_NOOP			0x4
> > +#define WQ_TYPE_MULTI_LRC		0x5
> > +#define WQ_TYPE_MASK			GENMASK(7, 0)
> > +#define WQ_LEN_MASK			GENMASK(26, 16)
> > +
> > +#define WQ_GUC_ID_MASK			GENMASK(15, 0)
> > +#define WQ_RING_TAIL_MASK		GENMASK(28, 18)
> Other option for documenting WQ and WQI would be at the top of this block of
> definitions. I believe there is a one line comment of 'work queue item
> header definitions' but none of these defines actually use the WQI
> abbreviation. And some description of what the work queue is, how it is
> used, etc. would be good.
>

Will add something here, but again I plan on updating the GuC kernel doc
with all the multi-lrc details, including WQ / WQI.
 
> >   #define GUC_STAGE_DESC_ATTR_ACTIVE	BIT(0)
> >   #define GUC_STAGE_DESC_ATTR_PENDING_DB	BIT(1)
> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > index 031b1bf5ba91..1610120e31a1 100644
> > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > @@ -399,6 +399,29 @@ __get_process_desc(struct intel_context *ce)
> >   		   LRC_STATE_OFFSET) / sizeof(u32)));
> >   }
> > +static u32 *get_wq_pointer(struct guc_process_desc *desc,
> > +			   struct intel_context *ce,
> > +			   u32 wqi_size)
> > +{
> > +	/*
> > +	 * Check for space in work queue. Caching a value of head pointer in
> > +	 * intel_context structure in order reduce the number accesses to shared
> > +	 * GPU memory which may be across a PCIe bus.
> > +	 */
> > +#define AVAILABLE_SPACE	\
> > +	CIRC_SPACE(ce->parallel.guc.wqi_tail, ce->parallel.guc.wqi_head, WQ_SIZE)
> > +	if (wqi_size > AVAILABLE_SPACE) {
> > +		ce->parallel.guc.wqi_head = READ_ONCE(desc->head);
> > +
> > +		if (wqi_size > AVAILABLE_SPACE)
> > +			return NULL;
> > +	}
> > +#undef AVAILABLE_SPACE
> > +
> > +	return ((u32 *)__get_process_desc(ce)) +
> > +		((WQ_OFFSET + ce->parallel.guc.wqi_tail) / sizeof(u32));
> > +}
> > +
> >   static struct guc_lrc_desc *__get_lrc_desc(struct intel_guc *guc, u32 index)
> >   {
> >   	struct guc_lrc_desc *base = guc->lrc_desc_pool_vaddr;
> > @@ -558,10 +581,10 @@ int intel_guc_wait_for_idle(struct intel_guc *guc, long timeout)
> >   static int guc_lrc_desc_pin(struct intel_context *ce, bool loop);
> > -static int guc_add_request(struct intel_guc *guc, struct i915_request *rq)
> > +static int __guc_add_request(struct intel_guc *guc, struct i915_request *rq)
> >   {
> >   	int err = 0;
> > -	struct intel_context *ce = rq->context;
> > +	struct intel_context *ce = request_to_scheduling_context(rq);
> >   	u32 action[3];
> >   	int len = 0;
> >   	u32 g2h_len_dw = 0;
> > @@ -582,26 +605,17 @@ static int guc_add_request(struct intel_guc *guc, struct i915_request *rq)
> >   	GEM_BUG_ON(!atomic_read(&ce->guc_id.ref));
> >   	GEM_BUG_ON(context_guc_id_invalid(ce));
> > -	/*
> > -	 * Corner case where the GuC firmware was blown away and reloaded while
> > -	 * this context was pinned.
> > -	 */
> > -	if (unlikely(!lrc_desc_registered(guc, ce->guc_id.id))) {
> > -		err = guc_lrc_desc_pin(ce, false);
> > -		if (unlikely(err))
> > -			return err;
> > -	}
> > -
> >   	spin_lock(&ce->guc_state.lock);
> >   	/*
> >   	 * The request / context will be run on the hardware when scheduling
> > -	 * gets enabled in the unblock.
> > +	 * gets enabled in the unblock. For multi-lrc we still submit the
> > +	 * context to move the LRC tails.
> >   	 */
> > -	if (unlikely(context_blocked(ce)))
> > +	if (unlikely(context_blocked(ce) && !intel_context_is_parent(ce)))
> >   		goto out;
> > -	enabled = context_enabled(ce);
> > +	enabled = context_enabled(ce) || context_blocked(ce);
> >   	if (!enabled) {
> >   		action[len++] = INTEL_GUC_ACTION_SCHED_CONTEXT_MODE_SET;
> > @@ -620,6 +634,18 @@ static int guc_add_request(struct intel_guc *guc, struct i915_request *rq)
> >   		trace_intel_context_sched_enable(ce);
> >   		atomic_inc(&guc->outstanding_submission_g2h);
> >   		set_context_enabled(ce);
> > +
> > +		/*
> > +		 * Without multi-lrc KMD does the submission step (moving the
> > +		 * lrc tail) so enabling scheduling is sufficient to submit the
> > +		 * context. This isn't the case in multi-lrc submission as the
> > +		 * GuC needs to move the tails, hence the need for another H2G
> > +		 * to submit a multi-lrc context after enabling scheduling.
> > +		 */
> > +		if (intel_context_is_parent(ce)) {
> > +			action[0] = INTEL_GUC_ACTION_SCHED_CONTEXT;
> > +			err = intel_guc_send_nb(guc, action, len - 1, 0);
> > +		}
> >   	} else if (!enabled) {
> >   		clr_context_pending_enable(ce);
> >   		intel_context_put(ce);
> > @@ -632,6 +658,18 @@ static int guc_add_request(struct intel_guc *guc, struct i915_request *rq)
> >   	return err;
> >   }
> > +static int guc_add_request(struct intel_guc *guc, struct i915_request *rq)
> > +{
> > +	int ret = __guc_add_request(guc, rq);
> > +
> > +	if (unlikely(ret == -EBUSY)) {
> > +		guc->stalled_request = rq;
> > +		guc->submission_stall_reason = STALL_ADD_REQUEST;
> > +	}
> > +
> > +	return ret;
> > +}
> > +
> >   static inline void guc_set_lrc_tail(struct i915_request *rq)
> >   {
> >   	rq->context->lrc_reg_state[CTX_RING_TAIL] =
> > @@ -643,6 +681,134 @@ static inline int rq_prio(const struct i915_request *rq)
> >   	return rq->sched.attr.priority;
> >   }
> > +static bool is_multi_lrc_rq(struct i915_request *rq)
> > +{
> > +	return intel_context_is_child(rq->context) ||
> > +		intel_context_is_parent(rq->context);
> > +}
> > +
> > +static bool can_merge_rq(struct i915_request *rq,
> > +			 struct i915_request *last)
> > +{
> > +	return request_to_scheduling_context(rq) ==
> > +		request_to_scheduling_context(last);
> > +}
> > +
> > +static u32 wq_space_until_wrap(struct intel_context *ce)
> > +{
> > +	return (WQ_SIZE - ce->parallel.guc.wqi_tail);
> > +}
> > +
> > +static void write_wqi(struct guc_process_desc *desc,
> > +		      struct intel_context *ce,
> > +		      u32 wqi_size)
> > +{
> > +	/*
> > +	 * Ensure WQI are visible before updating tail
> > +	 */
> > +	intel_guc_write_barrier(ce_to_guc(ce));
> > +
> > +	ce->parallel.guc.wqi_tail = (ce->parallel.guc.wqi_tail + wqi_size) &
> > +		(WQ_SIZE - 1);
> This relies on WQ_SIZE being a power of two, right? Is it possible to add a
> BUILD_BUG_ON to ensure that?
> 

Yep.

> > +	WRITE_ONCE(desc->tail, ce->parallel.guc.wqi_tail);
> > +}
> > +
> > +static int guc_wq_noop_append(struct intel_context *ce)
> > +{
> > +	struct guc_process_desc *desc = __get_process_desc(ce);
> > +	u32 *wqi = get_wq_pointer(desc, ce, wq_space_until_wrap(ce));
> > +	u32 len_dw = wq_space_until_wrap(ce) / sizeof(u32) - 1;
> > +
> > +	if (!wqi)
> > +		return -EBUSY;
> > +
> > +	GEM_BUG_ON(!FIELD_FIT(WQ_LEN_MASK, len_dw));
> > +
> > +	*wqi = FIELD_PREP(WQ_TYPE_MASK, WQ_TYPE_NOOP) |
> > +		FIELD_PREP(WQ_LEN_MASK, len_dw);
> > +	ce->parallel.guc.wqi_tail = 0;
> > +
> > +	return 0;
> > +}
> > +
> > +static int __guc_wq_item_append(struct i915_request *rq)
> > +{
> > +	struct intel_context *ce = request_to_scheduling_context(rq);
> > +	struct intel_context *child;
> > +	struct guc_process_desc *desc = __get_process_desc(ce);
> > +	unsigned int wqi_size = (ce->parallel.number_children + 4) *
> > +		sizeof(u32);
> > +	u32 *wqi;
> > +	u32 len_dw = (wqi_size / sizeof(u32)) - 1;
> > +	int ret;
> > +
> > +	/* Ensure context is in correct state updating work queue */
> > +	GEM_BUG_ON(!atomic_read(&ce->guc_id.ref));
> > +	GEM_BUG_ON(context_guc_id_invalid(ce));
> > +	GEM_BUG_ON(context_wait_for_deregister_to_register(ce));
> > +	GEM_BUG_ON(!lrc_desc_registered(ce_to_guc(ce), ce->guc_id.id));
> > +
> > +	/* Insert NOOP if this work queue item will wrap the tail pointer. */
> > +	if (wqi_size > wq_space_until_wrap(ce)) {
> > +		ret = guc_wq_noop_append(ce);
> > +		if (ret)
> > +			return ret;
> > +	}
> > +
> > +	wqi = get_wq_pointer(desc, ce, wqi_size);
> > +	if (!wqi)
> > +		return -EBUSY;
> > +
> > +	GEM_BUG_ON(!FIELD_FIT(WQ_LEN_MASK, len_dw));
> > +
> > +	*wqi++ = FIELD_PREP(WQ_TYPE_MASK, WQ_TYPE_MULTI_LRC) |
> > +		FIELD_PREP(WQ_LEN_MASK, len_dw);
> > +	*wqi++ = ce->lrc.lrca;
> > +	*wqi++ = FIELD_PREP(WQ_GUC_ID_MASK, ce->guc_id.id) |
> > +	       FIELD_PREP(WQ_RING_TAIL_MASK, ce->ring->tail / sizeof(u64));
> > +	*wqi++ = 0;	/* fence_id */
> > +	for_each_child(ce, child)
> > +		*wqi++ = child->ring->tail / sizeof(u64);
> > +
> > +	write_wqi(desc, ce, wqi_size);
> > +
> > +	return 0;
> > +}
> > +
> > +static int guc_wq_item_append(struct intel_guc *guc,
> > +			      struct i915_request *rq)
> > +{
> > +	struct intel_context *ce = request_to_scheduling_context(rq);
> > +	int ret = 0;
> > +
> > +	if (likely(!intel_context_is_banned(ce))) {
> > +		ret = __guc_wq_item_append(rq);
> > +
> > +		if (unlikely(ret == -EBUSY)) {
> > +			guc->stalled_request = rq;
> > +			guc->submission_stall_reason = STALL_MOVE_LRC_TAIL;
> > +		}
> > +	}
> > +
> > +	return ret;
> > +}
> > +
> > +static bool multi_lrc_submit(struct i915_request *rq)
> > +{
> > +	struct intel_context *ce = request_to_scheduling_context(rq);
> > +
> > +	intel_ring_set_tail(rq->ring, rq->tail);
> > +
> > +	/*
> > +	 * We expect the front end (execbuf IOCTL) to set this flag on the last
> > +	 * request generated from a multi-BB submission. This indicates to the
> > +	 * backend (GuC interface) that we should submit this context thus
> > +	 * submitting all the requests generated in parallel.
> > +	 */
> > +	return test_bit(I915_FENCE_FLAG_SUBMIT_PARALLEL, &rq->fence.flags) ||
> FYI: Apparently the test_bit/set_bit/etc helpers are intended for use on
> arbitrary sized bitfields. As in, they do all sorts of complicated atomic
> operations to work on 164 bit words and such like. For single word flags,
> the guidance is to just use 'if(word & BIT(bit))' instead.
> 

I get that, but currently the code uses set_bit/clear_bit/test_bit on
rq->fence.flags everywhere. IMO it is better to stick to that convention
for now and rip out all of these helpers in a single patch later. I'd
rather not have a hodgepodge of styles in the code.

I can take an AR to clean up rq->fence.flags everywhere in the code in a
follow-up.

Matt

> John.
> 
> > +		intel_context_is_banned(ce);
> > +}
> > +
> >   static int guc_dequeue_one_context(struct intel_guc *guc)
> >   {
> >   	struct i915_sched_engine * const sched_engine = guc->sched_engine;
> > @@ -656,7 +822,17 @@ static int guc_dequeue_one_context(struct intel_guc *guc)
> >   	if (guc->stalled_request) {
> >   		submit = true;
> >   		last = guc->stalled_request;
> > -		goto resubmit;
> > +
> > +		switch (guc->submission_stall_reason) {
> > +		case STALL_REGISTER_CONTEXT:
> > +			goto register_context;
> > +		case STALL_MOVE_LRC_TAIL:
> > +			goto move_lrc_tail;
> > +		case STALL_ADD_REQUEST:
> > +			goto add_request;
> > +		default:
> > +			MISSING_CASE(guc->submission_stall_reason);
> > +		}
> >   	}
> >   	while ((rb = rb_first_cached(&sched_engine->queue))) {
> > @@ -664,8 +840,8 @@ static int guc_dequeue_one_context(struct intel_guc *guc)
> >   		struct i915_request *rq, *rn;
> >   		priolist_for_each_request_consume(rq, rn, p) {
> > -			if (last && rq->context != last->context)
> > -				goto done;
> > +			if (last && !can_merge_rq(rq, last))
> > +				goto register_context;
> >   			list_del_init(&rq->sched.link);
> > @@ -673,33 +849,84 @@ static int guc_dequeue_one_context(struct intel_guc *guc)
> >   			trace_i915_request_in(rq, 0);
> >   			last = rq;
> > -			submit = true;
> > +
> > +			if (is_multi_lrc_rq(rq)) {
> > +				/*
> > +				 * We need to coalesce all multi-lrc requests in
> > +				 * a relationship into a single H2G. We are
> > +				 * guaranteed that all of these requests will be
> > +				 * submitted sequentially.
> > +				 */
> > +				if (multi_lrc_submit(rq)) {
> > +					submit = true;
> > +					goto register_context;
> > +				}
> > +			} else {
> > +				submit = true;
> > +			}
> >   		}
> >   		rb_erase_cached(&p->node, &sched_engine->queue);
> >   		i915_priolist_free(p);
> >   	}
> > -done:
> > +
> > +register_context:
> >   	if (submit) {
> > -		guc_set_lrc_tail(last);
> > -resubmit:
> > +		struct intel_context *ce = request_to_scheduling_context(last);
> > +
> > +		if (unlikely(!lrc_desc_registered(guc, ce->guc_id.id) &&
> > +			     !intel_context_is_banned(ce))) {
> > +			ret = guc_lrc_desc_pin(ce, false);
> > +			if (unlikely(ret == -EPIPE)) {
> > +				goto deadlk;
> > +			} else if (ret == -EBUSY) {
> > +				guc->stalled_request = last;
> > +				guc->submission_stall_reason =
> > +					STALL_REGISTER_CONTEXT;
> > +				goto schedule_tasklet;
> > +			} else if (ret != 0) {
> > +				GEM_WARN_ON(ret);	/* Unexpected */
> > +				goto deadlk;
> > +			}
> > +		}
> > +
> > +move_lrc_tail:
> > +		if (is_multi_lrc_rq(last)) {
> > +			ret = guc_wq_item_append(guc, last);
> > +			if (ret == -EBUSY) {
> > +				goto schedule_tasklet;
> > +			} else if (ret != 0) {
> > +				GEM_WARN_ON(ret);	/* Unexpected */
> > +				goto deadlk;
> > +			}
> > +		} else {
> > +			guc_set_lrc_tail(last);
> > +		}
> > +
> > +add_request:
> >   		ret = guc_add_request(guc, last);
> > -		if (unlikely(ret == -EPIPE))
> > +		if (unlikely(ret == -EPIPE)) {
> > +			goto deadlk;
> > +		} else if (ret == -EBUSY) {
> > +			goto schedule_tasklet;
> > +		} else if (ret != 0) {
> > +			GEM_WARN_ON(ret);	/* Unexpected */
> >   			goto deadlk;
> > -		else if (ret == -EBUSY) {
> > -			tasklet_schedule(&sched_engine->tasklet);
> > -			guc->stalled_request = last;
> > -			return false;
> >   		}
> >   	}
> >   	guc->stalled_request = NULL;
> > +	guc->submission_stall_reason = STALL_NONE;
> >   	return submit;
> >   deadlk:
> >   	sched_engine->tasklet.callback = NULL;
> >   	tasklet_disable_nosync(&sched_engine->tasklet);
> >   	return false;
> > +
> > +schedule_tasklet:
> > +	tasklet_schedule(&sched_engine->tasklet);
> > +	return false;
> >   }
> >   static void guc_submission_tasklet(struct tasklet_struct *t)
> > @@ -1255,10 +1482,16 @@ static int guc_bypass_tasklet_submit(struct intel_guc *guc,
> >   	trace_i915_request_in(rq, 0);
> > -	guc_set_lrc_tail(rq);
> > -	ret = guc_add_request(guc, rq);
> > -	if (ret == -EBUSY)
> > -		guc->stalled_request = rq;
> > +	if (is_multi_lrc_rq(rq)) {
> > +		if (multi_lrc_submit(rq)) {
> > +			ret = guc_wq_item_append(guc, rq);
> > +			if (!ret)
> > +				ret = guc_add_request(guc, rq);
> > +		}
> > +	} else {
> > +		guc_set_lrc_tail(rq);
> > +		ret = guc_add_request(guc, rq);
> > +	}
> >   	if (unlikely(ret == -EPIPE))
> >   		disable_submission(guc);
> > @@ -1266,6 +1499,16 @@ static int guc_bypass_tasklet_submit(struct intel_guc *guc,
> >   	return ret;
> >   }
> > +static bool need_tasklet(struct intel_guc *guc, struct i915_request *rq)
> > +{
> > +	struct i915_sched_engine *sched_engine = rq->engine->sched_engine;
> > +	struct intel_context *ce = request_to_scheduling_context(rq);
> > +
> > +	return submission_disabled(guc) || guc->stalled_request ||
> > +		!i915_sched_engine_is_empty(sched_engine) ||
> > +		!lrc_desc_registered(guc, ce->guc_id.id);
> > +}
> > +
> >   static void guc_submit_request(struct i915_request *rq)
> >   {
> >   	struct i915_sched_engine *sched_engine = rq->engine->sched_engine;
> > @@ -1275,8 +1518,7 @@ static void guc_submit_request(struct i915_request *rq)
> >   	/* Will be called from irq-context when using foreign fences. */
> >   	spin_lock_irqsave(&sched_engine->lock, flags);
> > -	if (submission_disabled(guc) || guc->stalled_request ||
> > -	    !i915_sched_engine_is_empty(sched_engine))
> > +	if (need_tasklet(guc, rq))
> >   		queue_request(sched_engine, rq, rq_prio(rq));
> >   	else if (guc_bypass_tasklet_submit(guc, rq) == -EBUSY)
> >   		tasklet_hi_schedule(&sched_engine->tasklet);
> > @@ -2258,9 +2500,10 @@ static inline bool new_guc_prio_higher(u8 old_guc_prio, u8 new_guc_prio)
> >   static void add_to_context(struct i915_request *rq)
> >   {
> > -	struct intel_context *ce = rq->context;
> > +	struct intel_context *ce = request_to_scheduling_context(rq);
> >   	u8 new_guc_prio = map_i915_prio_to_guc_prio(rq_prio(rq));
> > +	GEM_BUG_ON(intel_context_is_child(ce));
> >   	GEM_BUG_ON(rq->guc_prio == GUC_PRIO_FINI);
> >   	spin_lock(&ce->guc_state.lock);
> > @@ -2293,7 +2536,9 @@ static void guc_prio_fini(struct i915_request *rq, struct intel_context *ce)
> >   static void remove_from_context(struct i915_request *rq)
> >   {
> > -	struct intel_context *ce = rq->context;
> > +	struct intel_context *ce = request_to_scheduling_context(rq);
> > +
> > +	GEM_BUG_ON(intel_context_is_child(ce));
> >   	spin_lock_irq(&ce->guc_state.lock);
> > @@ -2712,7 +2957,7 @@ static void guc_init_breadcrumbs(struct intel_engine_cs *engine)
> >   static void guc_bump_inflight_request_prio(struct i915_request *rq,
> >   					   int prio)
> >   {
> > -	struct intel_context *ce = rq->context;
> > +	struct intel_context *ce = request_to_scheduling_context(rq);
> >   	u8 new_guc_prio = map_i915_prio_to_guc_prio(prio);
> >   	/* Short circuit function */
> > @@ -2735,7 +2980,7 @@ static void guc_bump_inflight_request_prio(struct i915_request *rq,
> >   static void guc_retire_inflight_request_prio(struct i915_request *rq)
> >   {
> > -	struct intel_context *ce = rq->context;
> > +	struct intel_context *ce = request_to_scheduling_context(rq);
> >   	spin_lock(&ce->guc_state.lock);
> >   	guc_prio_fini(rq, ce);
> > diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> > index 7bd9ed20623e..8950785e55d6 100644
> > --- a/drivers/gpu/drm/i915/i915_request.h
> > +++ b/drivers/gpu/drm/i915/i915_request.h
> > @@ -139,6 +139,14 @@ enum {
> >   	 * the GPU. Here we track such boost requests on a per-request basis.
> >   	 */
> >   	I915_FENCE_FLAG_BOOST,
> > +
> > +	/*
> > +	 * I915_FENCE_FLAG_SUBMIT_PARALLEL - request with a context in a
> > +	 * parent-child relationship (parallel submission, multi-lrc) should
> > +	 * trigger a submission to the GuC rather than just moving the context
> > +	 * tail.
> > +	 */
> > +	I915_FENCE_FLAG_SUBMIT_PARALLEL,
> >   };
> >   /**
>

Patch

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.c b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
index 8f8182bf7c11..7191e8439290 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
@@ -756,3 +756,29 @@  void intel_guc_load_status(struct intel_guc *guc, struct drm_printer *p)
 		}
 	}
 }
+
+void intel_guc_write_barrier(struct intel_guc *guc)
+{
+	struct intel_gt *gt = guc_to_gt(guc);
+
+	if (i915_gem_object_is_lmem(guc->ct.vma->obj)) {
+		/*
+		 * Ensure intel_uncore_write_fw can be used rather than
+		 * intel_uncore_write.
+		 */
+		GEM_BUG_ON(guc->send_regs.fw_domains);
+
+		/*
+		 * This register is used by the i915 and GuC for MMIO based
+		 * communication. Once we are in this code CTBs are the only
+		 * method the i915 uses to communicate with the GuC so it is
+		 * safe to write to this register (a value of 0 is NOP for MMIO
+		 * communication). If we ever start mixing CTBs and MMIOs a new
+		 * register will have to be chosen.
+		 */
+		intel_uncore_write_fw(gt->uncore, GEN11_SOFT_SCRATCH(0), 0);
+	} else {
+		/* wmb() sufficient for a barrier if in smem */
+		wmb();
+	}
+}
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
index a9f4ec972bfb..147f39cc0f2f 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
@@ -46,6 +46,12 @@  struct intel_guc {
 	 * submitted until the stalled request is processed.
 	 */
 	struct i915_request *stalled_request;
+	enum {
+		STALL_NONE,
+		STALL_REGISTER_CONTEXT,
+		STALL_MOVE_LRC_TAIL,
+		STALL_ADD_REQUEST,
+	} submission_stall_reason;
 
 	/* intel_guc_recv interrupt related state */
 	/** @irq_lock: protects GuC irq state */
@@ -361,4 +367,6 @@  void intel_guc_submission_cancel_requests(struct intel_guc *guc);
 
 void intel_guc_load_status(struct intel_guc *guc, struct drm_printer *p);
 
+void intel_guc_write_barrier(struct intel_guc *guc);
+
 #endif
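
The new submission_stall_reason pairs with stalled_request: whichever step of building a submission backs off with -EBUSY is recorded, so the next tasklet pass can resume from that exact step instead of starting over. A stand-alone sketch of the resume pattern, assuming hypothetical step_*() helpers in place of the real register/move-tail/add-request steps:

/*
 * Minimal sketch of the stall/resume pattern; the step_*() helpers are
 * hypothetical stand-ins that "fail" once so the recorded stage lets the
 * next pass pick up where the previous one stopped.
 */
#include <stdio.h>

enum stall_reason { STALL_NONE, STALL_REGISTER, STALL_MOVE_TAIL, STALL_ADD_REQUEST };

static enum stall_reason stall = STALL_NONE;
static int busy_once = 1;

static int step_register(void)    { return 0; }
static int step_move_tail(void)   { return busy_once-- > 0 ? -1 : 0; } /* -EBUSY once */
static int step_add_request(void) { return 0; }

static int submit_one(void)
{
	switch (stall) {
	case STALL_NONE:
	case STALL_REGISTER:
		if (step_register()) { stall = STALL_REGISTER; return -1; }
		/* fallthrough */
	case STALL_MOVE_TAIL:
		if (step_move_tail()) { stall = STALL_MOVE_TAIL; return -1; }
		/* fallthrough */
	case STALL_ADD_REQUEST:
		if (step_add_request()) { stall = STALL_ADD_REQUEST; return -1; }
	}
	stall = STALL_NONE;
	return 0;
}

int main(void)
{
	if (submit_one())
		printf("stalled at stage %d, rescheduling\n", stall);
	if (!submit_one())
		printf("resumed and completed\n");
	return 0;
}
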
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
index 20c710a74498..10d1878d2826 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
@@ -377,28 +377,6 @@  static u32 ct_get_next_fence(struct intel_guc_ct *ct)
 	return ++ct->requests.last_fence;
 }
 
-static void write_barrier(struct intel_guc_ct *ct)
-{
-	struct intel_guc *guc = ct_to_guc(ct);
-	struct intel_gt *gt = guc_to_gt(guc);
-
-	if (i915_gem_object_is_lmem(guc->ct.vma->obj)) {
-		GEM_BUG_ON(guc->send_regs.fw_domains);
-		/*
-		 * This register is used by the i915 and GuC for MMIO based
-		 * communication. Once we are in this code CTBs are the only
-		 * method the i915 uses to communicate with the GuC so it is
-		 * safe to write to this register (a value of 0 is NOP for MMIO
-		 * communication). If we ever start mixing CTBs and MMIOs a new
-		 * register will have to be chosen.
-		 */
-		intel_uncore_write_fw(gt->uncore, GEN11_SOFT_SCRATCH(0), 0);
-	} else {
-		/* wmb() sufficient for a barrier if in smem */
-		wmb();
-	}
-}
-
 static int ct_write(struct intel_guc_ct *ct,
 		    const u32 *action,
 		    u32 len /* in dwords */,
@@ -468,7 +446,7 @@  static int ct_write(struct intel_guc_ct *ct,
 	 * make sure H2G buffer update and LRC tail update (if this triggering a
 	 * submission) are visible before updating the descriptor tail
 	 */
-	write_barrier(ct);
+	intel_guc_write_barrier(ct_to_guc(ct));
 
 	/* update local copies */
 	ctb->tail = tail;
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
index 0eeb2a9feeed..a00eeddc1449 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
@@ -58,19 +58,16 @@ 
 #define WQ_STATUS_CMD_ERROR		3
 #define WQ_STATUS_ENGINE_ID_NOT_USED	4
 #define WQ_STATUS_SUSPENDED_FROM_RESET	5
-#define WQ_TYPE_SHIFT			0
-#define   WQ_TYPE_BATCH_BUF		(0x1 << WQ_TYPE_SHIFT)
-#define   WQ_TYPE_PSEUDO		(0x2 << WQ_TYPE_SHIFT)
-#define   WQ_TYPE_INORDER		(0x3 << WQ_TYPE_SHIFT)
-#define   WQ_TYPE_NOOP			(0x4 << WQ_TYPE_SHIFT)
-#define WQ_TARGET_SHIFT			10
-#define WQ_LEN_SHIFT			16
-#define WQ_NO_WCFLUSH_WAIT		(1 << 27)
-#define WQ_PRESENT_WORKLOAD		(1 << 28)
-
-#define WQ_RING_TAIL_SHIFT		20
-#define WQ_RING_TAIL_MAX		0x7FF	/* 2^11 QWords */
-#define WQ_RING_TAIL_MASK		(WQ_RING_TAIL_MAX << WQ_RING_TAIL_SHIFT)
+#define WQ_TYPE_BATCH_BUF		0x1
+#define WQ_TYPE_PSEUDO			0x2
+#define WQ_TYPE_INORDER			0x3
+#define WQ_TYPE_NOOP			0x4
+#define WQ_TYPE_MULTI_LRC		0x5
+#define WQ_TYPE_MASK			GENMASK(7, 0)
+#define WQ_LEN_MASK			GENMASK(26, 16)
+
+#define WQ_GUC_ID_MASK			GENMASK(15, 0)
+#define WQ_RING_TAIL_MASK		GENMASK(28, 18)
 
 #define GUC_STAGE_DESC_ATTR_ACTIVE	BIT(0)
 #define GUC_STAGE_DESC_ATTR_PENDING_DB	BIT(1)
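
With the shift-based definitions replaced by masks, the submission code can pack work queue item fields with FIELD_PREP() and range-check them with FIELD_FIT(). A small user-space sketch of how the new header and guc_id/ring-tail dwords pack; GENMASK32() and field_prep() below are simplified stand-ins for the kernel helpers, and all values are illustrative:

/*
 * Sketch of the WQI field packing enabled by the new masks; GENMASK32()
 * and field_prep() are simplified stand-ins for the kernel macros
 * (field_prep() relies on the GCC/clang __builtin_ctz()).
 */
#include <stdint.h>
#include <stdio.h>

#define GENMASK32(h, l)		((~0u << (l)) & (~0u >> (31 - (h))))

#define WQ_TYPE_MULTI_LRC	0x5
#define WQ_TYPE_MASK		GENMASK32(7, 0)
#define WQ_LEN_MASK		GENMASK32(26, 16)
#define WQ_GUC_ID_MASK		GENMASK32(15, 0)
#define WQ_RING_TAIL_MASK	GENMASK32(28, 18)

/* Shift a value into position under @mask, like FIELD_PREP(). */
static uint32_t field_prep(uint32_t mask, uint32_t val)
{
	return (val << __builtin_ctz(mask)) & mask;
}

int main(void)
{
	uint32_t len_dw = 6;		/* WQI length minus the header dword */
	uint32_t guc_id = 0x12;		/* illustrative parent guc_id */
	uint32_t ring_tail_qw = 0x40;	/* illustrative ring tail, in qwords */

	uint32_t header = field_prep(WQ_TYPE_MASK, WQ_TYPE_MULTI_LRC) |
			  field_prep(WQ_LEN_MASK, len_dw);
	uint32_t child_info = field_prep(WQ_GUC_ID_MASK, guc_id) |
			      field_prep(WQ_RING_TAIL_MASK, ring_tail_qw);

	printf("wqi[0] = 0x%08x (type | len)\n", header);
	printf("wqi[2] = 0x%08x (guc_id | ring tail)\n", child_info);
	return 0;
}
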
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 031b1bf5ba91..1610120e31a1 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -399,6 +399,29 @@  __get_process_desc(struct intel_context *ce)
 		   LRC_STATE_OFFSET) / sizeof(u32)));
 }
 
+static u32 *get_wq_pointer(struct guc_process_desc *desc,
+			   struct intel_context *ce,
+			   u32 wqi_size)
+{
+	/*
+	 * Check for space in the work queue. Cache the head pointer in the
+	 * intel_context structure in order to reduce the number of accesses to
+	 * shared GPU memory, which may be across a PCIe bus.
+	 */
+#define AVAILABLE_SPACE	\
+	CIRC_SPACE(ce->parallel.guc.wqi_tail, ce->parallel.guc.wqi_head, WQ_SIZE)
+	if (wqi_size > AVAILABLE_SPACE) {
+		ce->parallel.guc.wqi_head = READ_ONCE(desc->head);
+
+		if (wqi_size > AVAILABLE_SPACE)
+			return NULL;
+	}
+#undef AVAILABLE_SPACE
+
+	return ((u32 *)__get_process_desc(ce)) +
+		((WQ_OFFSET + ce->parallel.guc.wqi_tail) / sizeof(u32));
+}
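
get_wq_pointer() only re-reads the shared head when the cached copy suggests the queue is full, which is what the comment about PCIe accesses is getting at. A stand-alone sketch of the check; circ_space() follows the kernel's CIRC_SPACE() arithmetic and wq_read_shared_head() is a hypothetical stand-in for the READ_ONCE(desc->head) above:

/*
 * Sketch of the cached-head space check; circ_space() mirrors the kernel's
 * CIRC_SPACE() for a power-of-two ring, wq_read_shared_head() is a
 * hypothetical stand-in for reading desc->head from shared memory.
 */
#include <stdio.h>

#define WQ_SIZE 0x1000	/* illustrative power-of-two ring size, in bytes */

/* Free space in a power-of-two ring, producer index first (as in the patch). */
static unsigned int circ_space(unsigned int tail, unsigned int head, unsigned int size)
{
	return (head - (tail + 1)) & (size - 1);
}

static unsigned int cached_head;	/* last head value read from shared memory */
static unsigned int cached_tail;	/* local producer tail */

static unsigned int wq_read_shared_head(void)
{
	return 0x800;	/* pretend the GuC has consumed up to this offset */
}

/* Return 1 if wqi_size bytes fit, refreshing the cached head only on demand. */
static int wq_has_space(unsigned int wqi_size)
{
	if (wqi_size > circ_space(cached_tail, cached_head, WQ_SIZE)) {
		cached_head = wq_read_shared_head();
		if (wqi_size > circ_space(cached_tail, cached_head, WQ_SIZE))
			return 0;
	}
	return 1;
}

int main(void)
{
	cached_tail = 0xff0;	/* nearly caught up with the stale cached head */

	printf("space for 0x20 bytes: %s\n", wq_has_space(0x20) ? "yes" : "no");
	return 0;
}
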
+
 static struct guc_lrc_desc *__get_lrc_desc(struct intel_guc *guc, u32 index)
 {
 	struct guc_lrc_desc *base = guc->lrc_desc_pool_vaddr;
@@ -558,10 +581,10 @@  int intel_guc_wait_for_idle(struct intel_guc *guc, long timeout)
 
 static int guc_lrc_desc_pin(struct intel_context *ce, bool loop);
 
-static int guc_add_request(struct intel_guc *guc, struct i915_request *rq)
+static int __guc_add_request(struct intel_guc *guc, struct i915_request *rq)
 {
 	int err = 0;
-	struct intel_context *ce = rq->context;
+	struct intel_context *ce = request_to_scheduling_context(rq);
 	u32 action[3];
 	int len = 0;
 	u32 g2h_len_dw = 0;
@@ -582,26 +605,17 @@  static int guc_add_request(struct intel_guc *guc, struct i915_request *rq)
 	GEM_BUG_ON(!atomic_read(&ce->guc_id.ref));
 	GEM_BUG_ON(context_guc_id_invalid(ce));
 
-	/*
-	 * Corner case where the GuC firmware was blown away and reloaded while
-	 * this context was pinned.
-	 */
-	if (unlikely(!lrc_desc_registered(guc, ce->guc_id.id))) {
-		err = guc_lrc_desc_pin(ce, false);
-		if (unlikely(err))
-			return err;
-	}
-
 	spin_lock(&ce->guc_state.lock);
 
 	/*
 	 * The request / context will be run on the hardware when scheduling
-	 * gets enabled in the unblock.
+	 * gets enabled in the unblock. For multi-lrc we still submit the
+	 * context to move the LRC tails.
 	 */
-	if (unlikely(context_blocked(ce)))
+	if (unlikely(context_blocked(ce) && !intel_context_is_parent(ce)))
 		goto out;
 
-	enabled = context_enabled(ce);
+	enabled = context_enabled(ce) || context_blocked(ce);
 
 	if (!enabled) {
 		action[len++] = INTEL_GUC_ACTION_SCHED_CONTEXT_MODE_SET;
@@ -620,6 +634,18 @@  static int guc_add_request(struct intel_guc *guc, struct i915_request *rq)
 		trace_intel_context_sched_enable(ce);
 		atomic_inc(&guc->outstanding_submission_g2h);
 		set_context_enabled(ce);
+
+		/*
+		 * Without multi-lrc, the KMD does the submission step (moving
+		 * the lrc tail), so enabling scheduling is sufficient to submit
+		 * the context. This isn't the case with multi-lrc submission, as
+		 * the GuC needs to move the tails itself, hence another H2G is
+		 * needed to submit a multi-lrc context after enabling scheduling.
+		 */
+		if (intel_context_is_parent(ce)) {
+			action[0] = INTEL_GUC_ACTION_SCHED_CONTEXT;
+			err = intel_guc_send_nb(guc, action, len - 1, 0);
+		}
 	} else if (!enabled) {
 		clr_context_pending_enable(ce);
 		intel_context_put(ce);
@@ -632,6 +658,18 @@  static int guc_add_request(struct intel_guc *guc, struct i915_request *rq)
 	return err;
 }
 
+static int guc_add_request(struct intel_guc *guc, struct i915_request *rq)
+{
+	int ret = __guc_add_request(guc, rq);
+
+	if (unlikely(ret == -EBUSY)) {
+		guc->stalled_request = rq;
+		guc->submission_stall_reason = STALL_ADD_REQUEST;
+	}
+
+	return ret;
+}
+
 static inline void guc_set_lrc_tail(struct i915_request *rq)
 {
 	rq->context->lrc_reg_state[CTX_RING_TAIL] =
@@ -643,6 +681,134 @@  static inline int rq_prio(const struct i915_request *rq)
 	return rq->sched.attr.priority;
 }
 
+static bool is_multi_lrc_rq(struct i915_request *rq)
+{
+	return intel_context_is_child(rq->context) ||
+		intel_context_is_parent(rq->context);
+}
+
+static bool can_merge_rq(struct i915_request *rq,
+			 struct i915_request *last)
+{
+	return request_to_scheduling_context(rq) ==
+		request_to_scheduling_context(last);
+}
+
+static u32 wq_space_until_wrap(struct intel_context *ce)
+{
+	return (WQ_SIZE - ce->parallel.guc.wqi_tail);
+}
+
+static void write_wqi(struct guc_process_desc *desc,
+		      struct intel_context *ce,
+		      u32 wqi_size)
+{
+	/*
+	 * Ensure WQI are visible before updating tail
+	 */
+	intel_guc_write_barrier(ce_to_guc(ce));
+
+	ce->parallel.guc.wqi_tail = (ce->parallel.guc.wqi_tail + wqi_size) &
+		(WQ_SIZE - 1);
+	WRITE_ONCE(desc->tail, ce->parallel.guc.wqi_tail);
+}
+
+static int guc_wq_noop_append(struct intel_context *ce)
+{
+	struct guc_process_desc *desc = __get_process_desc(ce);
+	u32 *wqi = get_wq_pointer(desc, ce, wq_space_until_wrap(ce));
+	u32 len_dw = wq_space_until_wrap(ce) / sizeof(u32) - 1;
+
+	if (!wqi)
+		return -EBUSY;
+
+	GEM_BUG_ON(!FIELD_FIT(WQ_LEN_MASK, len_dw));
+
+	*wqi = FIELD_PREP(WQ_TYPE_MASK, WQ_TYPE_NOOP) |
+		FIELD_PREP(WQ_LEN_MASK, len_dw);
+	ce->parallel.guc.wqi_tail = 0;
+
+	return 0;
+}
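
The NOOP appended here pads the ring out to the wrap point so the real work queue item can start again at offset 0. A worked sketch of the padding arithmetic on illustrative numbers:

/* Worked example of the NOOP padding arithmetic in guc_wq_noop_append();
 * all numbers are illustrative. */
#include <stdio.h>

#define WQ_SIZE 0x1000	/* illustrative ring size, in bytes */

/* Bytes left before the tail would wrap back to offset 0. */
static unsigned int wq_space_until_wrap(unsigned int wqi_tail)
{
	return WQ_SIZE - wqi_tail;
}

/* WQ_LEN field of a NOOP padding to the wrap point (header dword not counted). */
static unsigned int wq_noop_len_dw(unsigned int wqi_tail)
{
	return wq_space_until_wrap(wqi_tail) / 4 - 1;	/* 4 bytes per dword */
}

int main(void)
{
	unsigned int wqi_tail = 0xff0;	/* tail close to the end of the ring */
	unsigned int wqi_size = 0x20;	/* next item would cross the wrap point */

	if (wqi_size > wq_space_until_wrap(wqi_tail))
		printf("pad with NOOP, len_dw = %u, then restart at tail 0\n",
		       wq_noop_len_dw(wqi_tail));
	return 0;
}
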
+
+static int __guc_wq_item_append(struct i915_request *rq)
+{
+	struct intel_context *ce = request_to_scheduling_context(rq);
+	struct intel_context *child;
+	struct guc_process_desc *desc = __get_process_desc(ce);
+	unsigned int wqi_size = (ce->parallel.number_children + 4) *
+		sizeof(u32);
+	u32 *wqi;
+	u32 len_dw = (wqi_size / sizeof(u32)) - 1;
+	int ret;
+
+	/* Ensure the context is in the correct state before updating the work queue */
+	GEM_BUG_ON(!atomic_read(&ce->guc_id.ref));
+	GEM_BUG_ON(context_guc_id_invalid(ce));
+	GEM_BUG_ON(context_wait_for_deregister_to_register(ce));
+	GEM_BUG_ON(!lrc_desc_registered(ce_to_guc(ce), ce->guc_id.id));
+
+	/* Insert NOOP if this work queue item will wrap the tail pointer. */
+	if (wqi_size > wq_space_until_wrap(ce)) {
+		ret = guc_wq_noop_append(ce);
+		if (ret)
+			return ret;
+	}
+
+	wqi = get_wq_pointer(desc, ce, wqi_size);
+	if (!wqi)
+		return -EBUSY;
+
+	GEM_BUG_ON(!FIELD_FIT(WQ_LEN_MASK, len_dw));
+
+	*wqi++ = FIELD_PREP(WQ_TYPE_MASK, WQ_TYPE_MULTI_LRC) |
+		FIELD_PREP(WQ_LEN_MASK, len_dw);
+	*wqi++ = ce->lrc.lrca;
+	*wqi++ = FIELD_PREP(WQ_GUC_ID_MASK, ce->guc_id.id) |
+	       FIELD_PREP(WQ_RING_TAIL_MASK, ce->ring->tail / sizeof(u64));
+	*wqi++ = 0;	/* fence_id */
+	for_each_child(ce, child)
+		*wqi++ = child->ring->tail / sizeof(u64);
+
+	write_wqi(desc, ce, wqi_size);
+
+	return 0;
+}
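
For reference, the multi-lrc work queue item assembled above is (number_children + 4) dwords: header, parent lrca, guc_id/ring-tail, fence_id, plus one ring-tail dword per child. A tiny sketch of the size and WQ_LEN arithmetic for an illustrative child count:

/* Size arithmetic for the multi-lrc WQI built in __guc_wq_item_append();
 * the child count is illustrative and sizeof(unsigned int) stands in for
 * sizeof(u32). */
#include <stdio.h>

int main(void)
{
	unsigned int number_children = 3;	/* e.g. a 4-wide parallel context */
	unsigned int wqi_size = (number_children + 4) * sizeof(unsigned int);
	unsigned int len_dw = wqi_size / sizeof(unsigned int) - 1;

	/* header + lrca + (guc_id | ring tail) + fence_id + one tail per child */
	printf("wqi_size = %u bytes, WQ_LEN field = %u dwords\n", wqi_size, len_dw);
	return 0;
}
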
+
+static int guc_wq_item_append(struct intel_guc *guc,
+			      struct i915_request *rq)
+{
+	struct intel_context *ce = request_to_scheduling_context(rq);
+	int ret = 0;
+
+	if (likely(!intel_context_is_banned(ce))) {
+		ret = __guc_wq_item_append(rq);
+
+		if (unlikely(ret == -EBUSY)) {
+			guc->stalled_request = rq;
+			guc->submission_stall_reason = STALL_MOVE_LRC_TAIL;
+		}
+	}
+
+	return ret;
+}
+
+static bool multi_lrc_submit(struct i915_request *rq)
+{
+	struct intel_context *ce = request_to_scheduling_context(rq);
+
+	intel_ring_set_tail(rq->ring, rq->tail);
+
+	/*
+	 * We expect the front end (execbuf IOCTL) to set this flag on the last
+	 * request generated from a multi-BB submission. This indicates to the
+	 * backend (GuC interface) that we should submit this context thus
+	 * submitting all the requests generated in parallel.
+	 */
+	return test_bit(I915_FENCE_FLAG_SUBMIT_PARALLEL, &rq->fence.flags) ||
+		intel_context_is_banned(ce);
+}
+
 static int guc_dequeue_one_context(struct intel_guc *guc)
 {
 	struct i915_sched_engine * const sched_engine = guc->sched_engine;
@@ -656,7 +822,17 @@  static int guc_dequeue_one_context(struct intel_guc *guc)
 	if (guc->stalled_request) {
 		submit = true;
 		last = guc->stalled_request;
-		goto resubmit;
+
+		switch (guc->submission_stall_reason) {
+		case STALL_REGISTER_CONTEXT:
+			goto register_context;
+		case STALL_MOVE_LRC_TAIL:
+			goto move_lrc_tail;
+		case STALL_ADD_REQUEST:
+			goto add_request;
+		default:
+			MISSING_CASE(guc->submission_stall_reason);
+		}
 	}
 
 	while ((rb = rb_first_cached(&sched_engine->queue))) {
@@ -664,8 +840,8 @@  static int guc_dequeue_one_context(struct intel_guc *guc)
 		struct i915_request *rq, *rn;
 
 		priolist_for_each_request_consume(rq, rn, p) {
-			if (last && rq->context != last->context)
-				goto done;
+			if (last && !can_merge_rq(rq, last))
+				goto register_context;
 
 			list_del_init(&rq->sched.link);
 
@@ -673,33 +849,84 @@  static int guc_dequeue_one_context(struct intel_guc *guc)
 
 			trace_i915_request_in(rq, 0);
 			last = rq;
-			submit = true;
+
+			if (is_multi_lrc_rq(rq)) {
+				/*
+				 * We need to coalesce all multi-lrc requests in
+				 * a relationship into a single H2G. We are
+				 * guaranteed that all of these requests will be
+				 * submitted sequentially.
+				 */
+				if (multi_lrc_submit(rq)) {
+					submit = true;
+					goto register_context;
+				}
+			} else {
+				submit = true;
+			}
 		}
 
 		rb_erase_cached(&p->node, &sched_engine->queue);
 		i915_priolist_free(p);
 	}
-done:
+
+register_context:
 	if (submit) {
-		guc_set_lrc_tail(last);
-resubmit:
+		struct intel_context *ce = request_to_scheduling_context(last);
+
+		if (unlikely(!lrc_desc_registered(guc, ce->guc_id.id) &&
+			     !intel_context_is_banned(ce))) {
+			ret = guc_lrc_desc_pin(ce, false);
+			if (unlikely(ret == -EPIPE)) {
+				goto deadlk;
+			} else if (ret == -EBUSY) {
+				guc->stalled_request = last;
+				guc->submission_stall_reason =
+					STALL_REGISTER_CONTEXT;
+				goto schedule_tasklet;
+			} else if (ret != 0) {
+				GEM_WARN_ON(ret);	/* Unexpected */
+				goto deadlk;
+			}
+		}
+
+move_lrc_tail:
+		if (is_multi_lrc_rq(last)) {
+			ret = guc_wq_item_append(guc, last);
+			if (ret == -EBUSY) {
+				goto schedule_tasklet;
+			} else if (ret != 0) {
+				GEM_WARN_ON(ret);	/* Unexpected */
+				goto deadlk;
+			}
+		} else {
+			guc_set_lrc_tail(last);
+		}
+
+add_request:
 		ret = guc_add_request(guc, last);
-		if (unlikely(ret == -EPIPE))
+		if (unlikely(ret == -EPIPE)) {
+			goto deadlk;
+		} else if (ret == -EBUSY) {
+			goto schedule_tasklet;
+		} else if (ret != 0) {
+			GEM_WARN_ON(ret);	/* Unexpected */
 			goto deadlk;
-		else if (ret == -EBUSY) {
-			tasklet_schedule(&sched_engine->tasklet);
-			guc->stalled_request = last;
-			return false;
 		}
 	}
 
 	guc->stalled_request = NULL;
+	guc->submission_stall_reason = STALL_NONE;
 	return submit;
 
 deadlk:
 	sched_engine->tasklet.callback = NULL;
 	tasklet_disable_nosync(&sched_engine->tasklet);
 	return false;
+
+schedule_tasklet:
+	tasklet_schedule(&sched_engine->tasklet);
+	return false;
 }
 
 static void guc_submission_tasklet(struct tasklet_struct *t)
@@ -1255,10 +1482,16 @@  static int guc_bypass_tasklet_submit(struct intel_guc *guc,
 
 	trace_i915_request_in(rq, 0);
 
-	guc_set_lrc_tail(rq);
-	ret = guc_add_request(guc, rq);
-	if (ret == -EBUSY)
-		guc->stalled_request = rq;
+	if (is_multi_lrc_rq(rq)) {
+		if (multi_lrc_submit(rq)) {
+			ret = guc_wq_item_append(guc, rq);
+			if (!ret)
+				ret = guc_add_request(guc, rq);
+		}
+	} else {
+		guc_set_lrc_tail(rq);
+		ret = guc_add_request(guc, rq);
+	}
 
 	if (unlikely(ret == -EPIPE))
 		disable_submission(guc);
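
One thing to watch in this hunk: ret is only assigned when is_multi_lrc_rq(rq) is true and multi_lrc_submit(rq) also returns true, so a parallel request that is not the last of its set falls through and returns an uninitialized value. A minimal sketch of one way to keep that path well defined, treating "nothing to submit yet" as success (illustrative only, not necessarily how the series resolves it):

	/* Illustrative sketch: default to success when there is nothing to
	 * submit yet (the flag is only set on the last request of the set). */
	int ret = 0;

	if (is_multi_lrc_rq(rq)) {
		if (multi_lrc_submit(rq)) {
			ret = guc_wq_item_append(guc, rq);
			if (!ret)
				ret = guc_add_request(guc, rq);
		}
	} else {
		guc_set_lrc_tail(rq);
		ret = guc_add_request(guc, rq);
	}
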
@@ -1266,6 +1499,16 @@  static int guc_bypass_tasklet_submit(struct intel_guc *guc,
 	return ret;
 }
 
+static bool need_tasklet(struct intel_guc *guc, struct i915_request *rq)
+{
+	struct i915_sched_engine *sched_engine = rq->engine->sched_engine;
+	struct intel_context *ce = request_to_scheduling_context(rq);
+
+	return submission_disabled(guc) || guc->stalled_request ||
+		!i915_sched_engine_is_empty(sched_engine) ||
+		!lrc_desc_registered(guc, ce->guc_id.id);
+}
+
 static void guc_submit_request(struct i915_request *rq)
 {
 	struct i915_sched_engine *sched_engine = rq->engine->sched_engine;
@@ -1275,8 +1518,7 @@  static void guc_submit_request(struct i915_request *rq)
 	/* Will be called from irq-context when using foreign fences. */
 	spin_lock_irqsave(&sched_engine->lock, flags);
 
-	if (submission_disabled(guc) || guc->stalled_request ||
-	    !i915_sched_engine_is_empty(sched_engine))
+	if (need_tasklet(guc, rq))
 		queue_request(sched_engine, rq, rq_prio(rq));
 	else if (guc_bypass_tasklet_submit(guc, rq) == -EBUSY)
 		tasklet_hi_schedule(&sched_engine->tasklet);
@@ -2258,9 +2500,10 @@  static inline bool new_guc_prio_higher(u8 old_guc_prio, u8 new_guc_prio)
 
 static void add_to_context(struct i915_request *rq)
 {
-	struct intel_context *ce = rq->context;
+	struct intel_context *ce = request_to_scheduling_context(rq);
 	u8 new_guc_prio = map_i915_prio_to_guc_prio(rq_prio(rq));
 
+	GEM_BUG_ON(intel_context_is_child(ce));
 	GEM_BUG_ON(rq->guc_prio == GUC_PRIO_FINI);
 
 	spin_lock(&ce->guc_state.lock);
@@ -2293,7 +2536,9 @@  static void guc_prio_fini(struct i915_request *rq, struct intel_context *ce)
 
 static void remove_from_context(struct i915_request *rq)
 {
-	struct intel_context *ce = rq->context;
+	struct intel_context *ce = request_to_scheduling_context(rq);
+
+	GEM_BUG_ON(intel_context_is_child(ce));
 
 	spin_lock_irq(&ce->guc_state.lock);
 
@@ -2712,7 +2957,7 @@  static void guc_init_breadcrumbs(struct intel_engine_cs *engine)
 static void guc_bump_inflight_request_prio(struct i915_request *rq,
 					   int prio)
 {
-	struct intel_context *ce = rq->context;
+	struct intel_context *ce = request_to_scheduling_context(rq);
 	u8 new_guc_prio = map_i915_prio_to_guc_prio(prio);
 
 	/* Short circuit function */
@@ -2735,7 +2980,7 @@  static void guc_bump_inflight_request_prio(struct i915_request *rq,
 
 static void guc_retire_inflight_request_prio(struct i915_request *rq)
 {
-	struct intel_context *ce = rq->context;
+	struct intel_context *ce = request_to_scheduling_context(rq);
 
 	spin_lock(&ce->guc_state.lock);
 	guc_prio_fini(rq, ce);
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index 7bd9ed20623e..8950785e55d6 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -139,6 +139,14 @@  enum {
 	 * the GPU. Here we track such boost requests on a per-request basis.
 	 */
 	I915_FENCE_FLAG_BOOST,
+
+	/*
+	 * I915_FENCE_FLAG_SUBMIT_PARALLEL - request with a context in a
+	 * parent-child relationship (parallel submission, multi-lrc) should
+	 * trigger a submission to the GuC rather than just moving the context
+	 * tail.
+	 */
+	I915_FENCE_FLAG_SUBMIT_PARALLEL,
 };
 
 /**