diff mbox series

[24/27] drm/i915: Multi-BB execbuf

Message ID 20210820224446.30620-25-matthew.brost@intel.com (mailing list archive)
State New, archived
Headers show
Series Parallel submission aka multi-bb execbuf | expand

Commit Message

Matthew Brost Aug. 20, 2021, 10:44 p.m. UTC
Allow multiple batch buffers to be submitted in a single execbuf IOCTL
after a context has been configured with the 'set_parallel' extension.
The number of batches is implicit, based on the context's configuration.

This is implemented with a series of loops. First a loop is used to find
all the batches, a loop to pin all the HW contexts, a loop to generate
all the requests, a loop to submit all the requests, a loop to commit
all the requests, and finally a loop to tie the requests to the VMAs
they touch.

A composite fence is also created for the generated requests to return to
the user and to stick in dma resv slots.

IGT: https://patchwork.freedesktop.org/patch/447008/?series=93071&rev=1
media UMD: link to come

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 765 ++++++++++++------
 drivers/gpu/drm/i915/gt/intel_context.h       |   8 +-
 drivers/gpu/drm/i915/gt/intel_context_types.h |  12 +
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c |   2 +
 drivers/gpu/drm/i915/i915_request.h           |   9 +
 drivers/gpu/drm/i915/i915_vma.c               |  21 +-
 drivers/gpu/drm/i915/i915_vma.h               |  13 +-
 7 files changed, 573 insertions(+), 257 deletions(-)

Comments

kernel test robot Aug. 21, 2021, 7:01 p.m. UTC | #1
Hi Matthew,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on drm-intel/for-linux-next]
[also build test WARNING on drm-tip/drm-tip drm-exynos/exynos-drm-next next-20210820]
[cannot apply to tegra-drm/drm/tegra/for-next linus/master drm/drm-next v5.14-rc6]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Matthew-Brost/Parallel-submission-aka-multi-bb-execbuf/20210821-065348
base:   git://anongit.freedesktop.org/drm-intel for-linux-next
config: x86_64-buildonly-randconfig-r002-20210821 (attached as .config)
compiler: clang version 14.0.0 (https://github.com/llvm/llvm-project 9e9d70591e72fc6762b4b9a226b68ed1307419bf)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/7e7ae2111b2855ac3d63aa5c806c6936daaa6bbc
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Matthew-Brost/Parallel-submission-aka-multi-bb-execbuf/20210821-065348
        git checkout 7e7ae2111b2855ac3d63aa5c806c6936daaa6bbc
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=x86_64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c:608:20: warning: comparison of array 'eb->batch_len' equal to a null pointer is always false [-Wtautological-pointer-compare]
                   if (unlikely(eb->batch_len == 0)) { /* impossible! */
                                ~~~~^~~~~~~~~    ~
   include/linux/compiler.h:78:42: note: expanded from macro 'unlikely'
   # define unlikely(x)    __builtin_expect(!!(x), 0)
                                               ^
   1 warning generated.


vim +608 drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c

   548	
   549	static int
   550	eb_add_vma(struct i915_execbuffer *eb,
   551		   unsigned int *current_batch,
   552		   unsigned int i,
   553		   struct i915_vma *vma)
   554	{
   555		struct drm_i915_private *i915 = eb->i915;
   556		struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
   557		struct eb_vma *ev = &eb->vma[i];
   558	
   559		ev->vma = vma;
   560		ev->exec = entry;
   561		ev->flags = entry->flags;
   562	
   563		if (eb->lut_size > 0) {
   564			ev->handle = entry->handle;
   565			hlist_add_head(&ev->node,
   566				       &eb->buckets[hash_32(entry->handle,
   567							    eb->lut_size)]);
   568		}
   569	
   570		if (entry->relocation_count)
   571			list_add_tail(&ev->reloc_link, &eb->relocs);
   572	
   573		/*
   574		 * SNA is doing fancy tricks with compressing batch buffers, which leads
   575		 * to negative relocation deltas. Usually that works out ok since the
   576		 * relocate address is still positive, except when the batch is placed
   577		 * very low in the GTT. Ensure this doesn't happen.
   578		 *
   579		 * Note that actual hangs have only been observed on gen7, but for
   580		 * paranoia do it everywhere.
   581		 */
   582		if (is_batch_buffer(eb, i)) {
   583			if (entry->relocation_count &&
   584			    !(ev->flags & EXEC_OBJECT_PINNED))
   585				ev->flags |= __EXEC_OBJECT_NEEDS_BIAS;
   586			if (eb->reloc_cache.has_fence)
   587				ev->flags |= EXEC_OBJECT_NEEDS_FENCE;
   588	
   589			eb->batches[*current_batch] = ev;
   590	
   591			if (unlikely(ev->flags & EXEC_OBJECT_WRITE)) {
   592				drm_dbg(&i915->drm,
   593					"Attempting to use self-modifying batch buffer\n");
   594				return -EINVAL;
   595			}
   596	
   597			if (range_overflows_t(u64,
   598					      eb->batch_start_offset,
   599					      eb->args->batch_len,
   600					      ev->vma->size)) {
   601				drm_dbg(&i915->drm, "Attempting to use out-of-bounds batch\n");
   602				return -EINVAL;
   603			}
   604	
   605			if (eb->args->batch_len == 0)
   606				eb->batch_len[*current_batch] = ev->vma->size -
   607					eb->batch_start_offset;
 > 608			if (unlikely(eb->batch_len == 0)) { /* impossible! */
   609				drm_dbg(&i915->drm, "Invalid batch length\n");
   610				return -EINVAL;
   611			}
   612	
   613			++*current_batch;
   614		}
   615	
   616		return 0;
   617	}
   618	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
kernel test robot Aug. 30, 2021, 3:46 a.m. UTC | #2
Hi Matthew,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on drm-intel/for-linux-next]
[also build test WARNING on drm-tip/drm-tip drm-exynos/exynos-drm-next next-20210827]
[cannot apply to tegra-drm/drm/tegra/for-next linus/master drm/drm-next v5.14]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Matthew-Brost/Parallel-submission-aka-multi-bb-execbuf/20210821-065348
base:   git://anongit.freedesktop.org/drm-intel for-linux-next
config: x86_64-rhel-8.3-kselftests (attached as .config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
reproduce:
        # apt-get install sparse
        # sparse version: v0.6.3-348-gf0e6938b-dirty
        # https://github.com/0day-ci/linux/commit/7e7ae2111b2855ac3d63aa5c806c6936daaa6bbc
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Matthew-Brost/Parallel-submission-aka-multi-bb-execbuf/20210821-065348
        git checkout 7e7ae2111b2855ac3d63aa5c806c6936daaa6bbc
        # save the attached .config to linux build tree
        make W=1 C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' O=build_dir ARCH=x86_64 SHELL=/bin/bash

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>


sparse warnings: (new ones prefixed by >>)
>> drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c:608:21: sparse: sparse: Using plain integer as NULL pointer

vim +608 drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c

   548	
   549	static int
   550	eb_add_vma(struct i915_execbuffer *eb,
   551		   unsigned int *current_batch,
   552		   unsigned int i,
   553		   struct i915_vma *vma)
   554	{
   555		struct drm_i915_private *i915 = eb->i915;
   556		struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
   557		struct eb_vma *ev = &eb->vma[i];
   558	
   559		ev->vma = vma;
   560		ev->exec = entry;
   561		ev->flags = entry->flags;
   562	
   563		if (eb->lut_size > 0) {
   564			ev->handle = entry->handle;
   565			hlist_add_head(&ev->node,
   566				       &eb->buckets[hash_32(entry->handle,
   567							    eb->lut_size)]);
   568		}
   569	
   570		if (entry->relocation_count)
   571			list_add_tail(&ev->reloc_link, &eb->relocs);
   572	
   573		/*
   574		 * SNA is doing fancy tricks with compressing batch buffers, which leads
   575		 * to negative relocation deltas. Usually that works out ok since the
   576		 * relocate address is still positive, except when the batch is placed
   577		 * very low in the GTT. Ensure this doesn't happen.
   578		 *
   579		 * Note that actual hangs have only been observed on gen7, but for
   580		 * paranoia do it everywhere.
   581		 */
   582		if (is_batch_buffer(eb, i)) {
   583			if (entry->relocation_count &&
   584			    !(ev->flags & EXEC_OBJECT_PINNED))
   585				ev->flags |= __EXEC_OBJECT_NEEDS_BIAS;
   586			if (eb->reloc_cache.has_fence)
   587				ev->flags |= EXEC_OBJECT_NEEDS_FENCE;
   588	
   589			eb->batches[*current_batch] = ev;
   590	
   591			if (unlikely(ev->flags & EXEC_OBJECT_WRITE)) {
   592				drm_dbg(&i915->drm,
   593					"Attempting to use self-modifying batch buffer\n");
   594				return -EINVAL;
   595			}
   596	
   597			if (range_overflows_t(u64,
   598					      eb->batch_start_offset,
   599					      eb->args->batch_len,
   600					      ev->vma->size)) {
   601				drm_dbg(&i915->drm, "Attempting to use out-of-bounds batch\n");
   602				return -EINVAL;
   603			}
   604	
   605			if (eb->args->batch_len == 0)
   606				eb->batch_len[*current_batch] = ev->vma->size -
   607					eb->batch_start_offset;
 > 608			if (unlikely(eb->batch_len == 0)) { /* impossible! */
   609				drm_dbg(&i915->drm, "Invalid batch length\n");
   610				return -EINVAL;
   611			}
   612	
   613			++*current_batch;
   614		}
   615	
   616		return 0;
   617	}
   618	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Matthew Brost Sept. 30, 2021, 10:16 p.m. UTC | #3
On Fri, Aug 20, 2021 at 03:44:43PM -0700, Matthew Brost wrote:

Did a review offline with John Harrison, adding notes for what we found. 

> Allow multiple batch buffers to be submitted in a single execbuf IOCTL
> after a context has been configured with the 'set_parallel' extension.
> The number batches is implicit based on the contexts configuration.
> 
> This is implemented with a series of loops. First a loop is used to find
> all the batches, a loop to pin all the HW contexts, a loop to generate
> all the requests, a loop to submit all the requests, a loop to commit
> all the requests, and finally a loop to tie the requests to the VMAs
> they touch.

Clarify these steps a bit; also, tying the requests to the VMAs is the
second-to-last step, with committing the requests being the last.

> 
> A composite fence is also created for the also the generated requests to
> return to the user and to stick in dma resv slots.
>

Add a comment saying there should be no change in behavior for existing IOCTLs,
except that when throttling due to lack of space in the ring, the wait is done
with the VMA locks held rather than dropping the locks and taking the slow path.

> IGT: https://patchwork.freedesktop.org/patch/447008/?series=93071&rev=1
> media UMD: link to come
> 
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>  .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 765 ++++++++++++------
>  drivers/gpu/drm/i915/gt/intel_context.h       |   8 +-
>  drivers/gpu/drm/i915/gt/intel_context_types.h |  12 +
>  .../gpu/drm/i915/gt/uc/intel_guc_submission.c |   2 +
>  drivers/gpu/drm/i915/i915_request.h           |   9 +
>  drivers/gpu/drm/i915/i915_vma.c               |  21 +-
>  drivers/gpu/drm/i915/i915_vma.h               |  13 +-
>  7 files changed, 573 insertions(+), 257 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> index 8290bdadd167..481978974627 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> @@ -244,17 +244,23 @@ struct i915_execbuffer {
>  	struct drm_i915_gem_exec_object2 *exec; /** ioctl execobj[] */
>  	struct eb_vma *vma;
>  
> -	struct intel_engine_cs *engine; /** engine to queue the request to */
> +	struct intel_gt *gt; /* gt for the execbuf */
>  	struct intel_context *context; /* logical state for the request */
>  	struct i915_gem_context *gem_context; /** caller's context */
>  
> -	struct i915_request *request; /** our request to build */
> -	struct eb_vma *batch; /** identity of the batch obj/vma */

Line wrap of these comments.

> +	struct i915_request *requests[MAX_ENGINE_INSTANCE + 1]; /** our requests to build */
> +	struct eb_vma *batches[MAX_ENGINE_INSTANCE + 1]; /** identity of the batch obj/vma */
>  	struct i915_vma *trampoline; /** trampoline used for chaining */
>  
> +	/** used for excl fence in dma_resv objects when > 1 BB submitted */
> +	struct dma_fence *composite_fence;
> +
>  	/** actual size of execobj[] as we may extend it for the cmdparser */
>  	unsigned int buffer_count;
>  
> +	/* number of batches in execbuf IOCTL */
> +	unsigned int num_batches;
> +
>  	/** list of vma not yet bound during reservation phase */
>  	struct list_head unbound;
>  
> @@ -281,7 +287,7 @@ struct i915_execbuffer {
>  
>  	u64 invalid_flags; /** Set of execobj.flags that are invalid */
>  
> -	u64 batch_len; /** Length of batch within object */
> +	u64 batch_len[MAX_ENGINE_INSTANCE + 1]; /** Length of batch within object */
>  	u32 batch_start_offset; /** Location within object of batch */
>  	u32 batch_flags; /** Flags composed for emit_bb_start() */
>  	struct intel_gt_buffer_pool_node *batch_pool; /** pool node for batch buffer */
> @@ -299,14 +305,13 @@ struct i915_execbuffer {
>  };
>  
>  static int eb_parse(struct i915_execbuffer *eb);
> -static struct i915_request *eb_pin_engine(struct i915_execbuffer *eb,
> -					  bool throttle);
> +static int eb_pin_engine(struct i915_execbuffer *eb, bool throttle);
>  static void eb_unpin_engine(struct i915_execbuffer *eb);
>  
>  static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb)
>  {
> -	return intel_engine_requires_cmd_parser(eb->engine) ||
> -		(intel_engine_using_cmd_parser(eb->engine) &&
> +	return intel_engine_requires_cmd_parser(eb->context->engine) ||
> +		(intel_engine_using_cmd_parser(eb->context->engine) &&
>  		 eb->args->batch_len);
>  }
>  
> @@ -544,11 +549,21 @@ eb_validate_vma(struct i915_execbuffer *eb,
>  	return 0;
>  }
>  
> -static void
> +static inline bool
> +is_batch_buffer(struct i915_execbuffer *eb, unsigned int buffer_idx)
> +{
> +	return eb->args->flags & I915_EXEC_BATCH_FIRST ?
> +		buffer_idx < eb->num_batches :
> +		buffer_idx >= eb->args->buffer_count - eb->num_batches;
> +}
> +
> +static int
>  eb_add_vma(struct i915_execbuffer *eb,
> -	   unsigned int i, unsigned batch_idx,
> +	   unsigned int *current_batch,
> +	   unsigned int i,
>  	   struct i915_vma *vma)
>  {
> +	struct drm_i915_private *i915 = eb->i915;
>  	struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
>  	struct eb_vma *ev = &eb->vma[i];
>  
> @@ -575,15 +590,41 @@ eb_add_vma(struct i915_execbuffer *eb,
>  	 * Note that actual hangs have only been observed on gen7, but for
>  	 * paranoia do it everywhere.
>  	 */
> -	if (i == batch_idx) {
> +	if (is_batch_buffer(eb, i)) {
>  		if (entry->relocation_count &&
>  		    !(ev->flags & EXEC_OBJECT_PINNED))
>  			ev->flags |= __EXEC_OBJECT_NEEDS_BIAS;
>  		if (eb->reloc_cache.has_fence)
>  			ev->flags |= EXEC_OBJECT_NEEDS_FENCE;
>  
> -		eb->batch = ev;
> +		eb->batches[*current_batch] = ev;
> +
> +		if (unlikely(ev->flags & EXEC_OBJECT_WRITE)) {
> +			drm_dbg(&i915->drm,
> +				"Attempting to use self-modifying batch buffer\n");
> +			return -EINVAL;
> +		}
> +
> +		if (range_overflows_t(u64,
> +				      eb->batch_start_offset,
> +				      eb->args->batch_len,
> +				      ev->vma->size)) {
> +			drm_dbg(&i915->drm, "Attempting to use out-of-bounds batch\n");
> +			return -EINVAL;
> +		}
> +
> +		if (eb->args->batch_len == 0)
> +			eb->batch_len[*current_batch] = ev->vma->size -
> +				eb->batch_start_offset;
> +		if (unlikely(eb->batch_len == 0)) { /* impossible! */
> +			drm_dbg(&i915->drm, "Invalid batch length\n");
> +			return -EINVAL;
> +		}
> +
> +		++*current_batch;
>  	}
> +
> +	return 0;
>  }
>  
>  static inline int use_cpu_reloc(const struct reloc_cache *cache,
> @@ -727,14 +768,6 @@ static int eb_reserve(struct i915_execbuffer *eb)
>  	} while (1);
>  }
>  
> -static unsigned int eb_batch_index(const struct i915_execbuffer *eb)
> -{
> -	if (eb->args->flags & I915_EXEC_BATCH_FIRST)
> -		return 0;
> -	else
> -		return eb->buffer_count - 1;
> -}
> -
>  static int eb_select_context(struct i915_execbuffer *eb)
>  {
>  	struct i915_gem_context *ctx;
> @@ -843,9 +876,7 @@ static struct i915_vma *eb_lookup_vma(struct i915_execbuffer *eb, u32 handle)
>  
>  static int eb_lookup_vmas(struct i915_execbuffer *eb)
>  {
> -	struct drm_i915_private *i915 = eb->i915;
> -	unsigned int batch = eb_batch_index(eb);
> -	unsigned int i;
> +	unsigned int i, current_batch = 0;
>  	int err = 0;
>  
>  	INIT_LIST_HEAD(&eb->relocs);
> @@ -865,7 +896,9 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb)
>  			goto err;
>  		}
>  
> -		eb_add_vma(eb, i, batch, vma);
> +		err = eb_add_vma(eb, &current_batch, i, vma);
> +		if (err)
> +			return err;
>  
>  		if (i915_gem_object_is_userptr(vma->obj)) {
>  			err = i915_gem_object_userptr_submit_init(vma->obj);
> @@ -888,26 +921,6 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb)
>  		}
>  	}
>  
> -	if (unlikely(eb->batch->flags & EXEC_OBJECT_WRITE)) {
> -		drm_dbg(&i915->drm,
> -			"Attempting to use self-modifying batch buffer\n");
> -		return -EINVAL;
> -	}
> -
> -	if (range_overflows_t(u64,
> -			      eb->batch_start_offset, eb->batch_len,
> -			      eb->batch->vma->size)) {
> -		drm_dbg(&i915->drm, "Attempting to use out-of-bounds batch\n");
> -		return -EINVAL;
> -	}
> -
> -	if (eb->batch_len == 0)
> -		eb->batch_len = eb->batch->vma->size - eb->batch_start_offset;
> -	if (unlikely(eb->batch_len == 0)) { /* impossible! */
> -		drm_dbg(&i915->drm, "Invalid batch length\n");
> -		return -EINVAL;
> -	}
> -
>  	return 0;
>  
>  err:
> @@ -1640,8 +1653,7 @@ static int eb_reinit_userptr(struct i915_execbuffer *eb)
>  	return 0;
>  }
>  
> -static noinline int eb_relocate_parse_slow(struct i915_execbuffer *eb,
> -					   struct i915_request *rq)
> +static noinline int eb_relocate_parse_slow(struct i915_execbuffer *eb)
>  {
>  	bool have_copy = false;
>  	struct eb_vma *ev;
> @@ -1657,21 +1669,6 @@ static noinline int eb_relocate_parse_slow(struct i915_execbuffer *eb,
>  	eb_release_vmas(eb, false);
>  	i915_gem_ww_ctx_fini(&eb->ww);
>  
> -	if (rq) {
> -		/* nonblocking is always false */
> -		if (i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE,
> -				      MAX_SCHEDULE_TIMEOUT) < 0) {
> -			i915_request_put(rq);
> -			rq = NULL;
> -
> -			err = -EINTR;
> -			goto err_relock;
> -		}
> -
> -		i915_request_put(rq);
> -		rq = NULL;
> -	}
> -
>  	/*
>  	 * We take 3 passes through the slowpatch.
>  	 *
> @@ -1698,28 +1695,21 @@ static noinline int eb_relocate_parse_slow(struct i915_execbuffer *eb,
>  	if (!err)
>  		err = eb_reinit_userptr(eb);
>  
> -err_relock:
>  	i915_gem_ww_ctx_init(&eb->ww, true);
>  	if (err)
>  		goto out;
>  
>  	/* reacquire the objects */
>  repeat_validate:
> -	rq = eb_pin_engine(eb, false);
> -	if (IS_ERR(rq)) {
> -		err = PTR_ERR(rq);
> -		rq = NULL;
> +	err = eb_pin_engine(eb, false);
> +	if (err)
>  		goto err;
> -	}
> -
> -	/* We didn't throttle, should be NULL */
> -	GEM_WARN_ON(rq);
>  
>  	err = eb_validate_vmas(eb);
>  	if (err)
>  		goto err;
>  
> -	GEM_BUG_ON(!eb->batch);
> +	GEM_BUG_ON(!eb->batches[0]);
>  
>  	list_for_each_entry(ev, &eb->relocs, reloc_link) {
>  		if (!have_copy) {
> @@ -1783,46 +1773,23 @@ static noinline int eb_relocate_parse_slow(struct i915_execbuffer *eb,
>  		}
>  	}
>  
> -	if (rq)
> -		i915_request_put(rq);
> -
>  	return err;
>  }
>  
>  static int eb_relocate_parse(struct i915_execbuffer *eb)
>  {
>  	int err;
> -	struct i915_request *rq = NULL;
>  	bool throttle = true;
>  
>  retry:
> -	rq = eb_pin_engine(eb, throttle);
> -	if (IS_ERR(rq)) {
> -		err = PTR_ERR(rq);
> -		rq = NULL;
> +	err = eb_pin_engine(eb, throttle);
> +	if (err) {
>  		if (err != -EDEADLK)
>  			return err;
>  
>  		goto err;
>  	}
>  
> -	if (rq) {
> -		bool nonblock = eb->file->filp->f_flags & O_NONBLOCK;
> -
> -		/* Need to drop all locks now for throttling, take slowpath */
> -		err = i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE, 0);
> -		if (err == -ETIME) {
> -			if (nonblock) {
> -				err = -EWOULDBLOCK;
> -				i915_request_put(rq);
> -				goto err;
> -			}
> -			goto slow;
> -		}
> -		i915_request_put(rq);
> -		rq = NULL;
> -	}
> -
>  	/* only throttle once, even if we didn't need to throttle */
>  	throttle = false;
>  
> @@ -1862,7 +1829,7 @@ static int eb_relocate_parse(struct i915_execbuffer *eb)
>  	return err;
>  
>  slow:
> -	err = eb_relocate_parse_slow(eb, rq);
> +	err = eb_relocate_parse_slow(eb);
>  	if (err)
>  		/*
>  		 * If the user expects the execobject.offset and
> @@ -1876,11 +1843,31 @@ static int eb_relocate_parse(struct i915_execbuffer *eb)
>  	return err;
>  }
>  

Add comments in general about the lock nesting of timeline mutexes.

> +#define for_each_batch_create_order(_eb, _i) \
> +	for (_i = 0; _i < _eb->num_batches; ++_i)
> +#define for_each_batch_add_order(_eb, _i) \
> +	BUILD_BUG_ON(!typecheck(int, _i)); \
> +	for (_i = _eb->num_batches - 1; _i >= 0; --_i)
> +
> +static struct i915_request *
> +eb_find_first_request(struct i915_execbuffer *eb)

Rename to eb_find_first_request_added.

> +{
> +	int i;
> +
> +	for_each_batch_add_order(eb, i)
> +		if (eb->requests[i])
> +			return eb->requests[i];
> +
> +	GEM_BUG_ON("Request not found");
> +
> +	return NULL;
> +}
> +
>  static int eb_move_to_gpu(struct i915_execbuffer *eb)
>  {
>  	const unsigned int count = eb->buffer_count;
>  	unsigned int i = count;
> -	int err = 0;
> +	int err = 0, j;
>  
>  	while (i--) {
>  		struct eb_vma *ev = &eb->vma[i];
> @@ -1893,11 +1880,17 @@ static int eb_move_to_gpu(struct i915_execbuffer *eb)
>  		if (flags & EXEC_OBJECT_CAPTURE) {
>  			struct i915_capture_list *capture;
>  
> -			capture = kmalloc(sizeof(*capture), GFP_KERNEL);
> -			if (capture) {
> -				capture->next = eb->request->capture_list;
> -				capture->vma = vma;
> -				eb->request->capture_list = capture;
> +			for_each_batch_create_order(eb, j) {
> +				if (!eb->requests[j])
> +					break;
> +
> +				capture = kmalloc(sizeof(*capture), GFP_KERNEL);
> +				if (capture) {
> +					capture->next =
> +						eb->requests[j]->capture_list;
> +					capture->vma = vma;
> +					eb->requests[j]->capture_list = capture;
> +				}
>  			}
>  		}
>  
> @@ -1918,14 +1911,26 @@ static int eb_move_to_gpu(struct i915_execbuffer *eb)
>  				flags &= ~EXEC_OBJECT_ASYNC;
>  		}
>  
> +		/* We only need to await on the first request */
>  		if (err == 0 && !(flags & EXEC_OBJECT_ASYNC)) {
>  			err = i915_request_await_object
> -				(eb->request, obj, flags & EXEC_OBJECT_WRITE);
> +				(eb_find_first_request(eb), obj,
> +				 flags & EXEC_OBJECT_WRITE);
>  		}
>  
> -		if (err == 0)
> -			err = i915_vma_move_to_active(vma, eb->request,
> -						      flags | __EXEC_OBJECT_NO_RESERVE);
> +		for_each_batch_add_order(eb, j) {
> +			if (err)
> +				break;
> +			if (!eb->requests[j])
> +				continue;
> +
> +			err = _i915_vma_move_to_active(vma, eb->requests[j],
> +						       j ? NULL :
> +						       eb->composite_fence ?
> +						       eb->composite_fence :
> +						       &eb->requests[j]->fence,
> +						       flags | __EXEC_OBJECT_NO_RESERVE);
> +		}
>  	}
>  
>  #ifdef CONFIG_MMU_NOTIFIER
> @@ -1956,11 +1961,16 @@ static int eb_move_to_gpu(struct i915_execbuffer *eb)
>  		goto err_skip;
>  
>  	/* Unconditionally flush any chipset caches (for streaming writes). */
> -	intel_gt_chipset_flush(eb->engine->gt);
> +	intel_gt_chipset_flush(eb->gt);
>  	return 0;
>  
>  err_skip:
> -	i915_request_set_error_once(eb->request, err);
> +	for_each_batch_create_order(eb, j) {
> +		if (!eb->requests[j])
> +			break;
> +
> +		i915_request_set_error_once(eb->requests[j], err);
> +	}
>  	return err;
>  }
>  
> @@ -2055,14 +2065,16 @@ static int eb_parse(struct i915_execbuffer *eb)
>  	int err;
>  
>  	if (!eb_use_cmdparser(eb)) {
> -		batch = eb_dispatch_secure(eb, eb->batch->vma);
> +		batch = eb_dispatch_secure(eb, eb->batches[0]->vma);
>  		if (IS_ERR(batch))
>  			return PTR_ERR(batch);
>  
>  		goto secure_batch;
>  	}
>  
> -	len = eb->batch_len;
> +	GEM_BUG_ON(intel_context_is_parallel(eb->context));

Return -EINVAL (or -ENODEV) rather than BUG_ON

> +
> +	len = eb->batch_len[0];
>  	if (!CMDPARSER_USES_GGTT(eb->i915)) {
>  		/*
>  		 * ppGTT backed shadow buffers must be mapped RO, to prevent
> @@ -2076,11 +2088,11 @@ static int eb_parse(struct i915_execbuffer *eb)
>  	} else {
>  		len += I915_CMD_PARSER_TRAMPOLINE_SIZE;
>  	}
> -	if (unlikely(len < eb->batch_len)) /* last paranoid check of overflow */
> +	if (unlikely(len < eb->batch_len[0])) /* last paranoid check of overflow */
>  		return -EINVAL;
>  
>  	if (!pool) {
> -		pool = intel_gt_get_buffer_pool(eb->engine->gt, len,
> +		pool = intel_gt_get_buffer_pool(eb->gt, len,
>  						I915_MAP_WB);
>  		if (IS_ERR(pool))
>  			return PTR_ERR(pool);
> @@ -2105,7 +2117,7 @@ static int eb_parse(struct i915_execbuffer *eb)
>  		trampoline = shadow;
>  
>  		shadow = shadow_batch_pin(eb, pool->obj,
> -					  &eb->engine->gt->ggtt->vm,
> +					  &eb->gt->ggtt->vm,
>  					  PIN_GLOBAL);
>  		if (IS_ERR(shadow)) {
>  			err = PTR_ERR(shadow);
> @@ -2127,26 +2139,27 @@ static int eb_parse(struct i915_execbuffer *eb)
>  	if (err)
>  		goto err_trampoline;
>  
> -	err = intel_engine_cmd_parser(eb->engine,
> -				      eb->batch->vma,
> +	err = intel_engine_cmd_parser(eb->context->engine,
> +				      eb->batches[0]->vma,
>  				      eb->batch_start_offset,
> -				      eb->batch_len,
> +				      eb->batch_len[0],
>  				      shadow, trampoline);
>  	if (err)
>  		goto err_unpin_batch;
>  
> -	eb->batch = &eb->vma[eb->buffer_count++];
> -	eb->batch->vma = i915_vma_get(shadow);
> -	eb->batch->flags = __EXEC_OBJECT_HAS_PIN;
> +	eb->batches[0] = &eb->vma[eb->buffer_count++];
> +	eb->batches[0]->vma = i915_vma_get(shadow);
> +	eb->batches[0]->flags = __EXEC_OBJECT_HAS_PIN;
>  
>  	eb->trampoline = trampoline;
>  	eb->batch_start_offset = 0;
>  
>  secure_batch:
>  	if (batch) {
> -		eb->batch = &eb->vma[eb->buffer_count++];
> -		eb->batch->flags = __EXEC_OBJECT_HAS_PIN;
> -		eb->batch->vma = i915_vma_get(batch);
> +		GEM_BUG_ON(intel_context_is_parallel(eb->context));

Same as above, no BUG_ON

> +		eb->batches[0] = &eb->vma[eb->buffer_count++];
> +		eb->batches[0]->flags = __EXEC_OBJECT_HAS_PIN;
> +		eb->batches[0]->vma = i915_vma_get(batch);
>  	}
>  	return 0;
>  
> @@ -2162,19 +2175,18 @@ static int eb_parse(struct i915_execbuffer *eb)
>  	return err;
>  }
>  
> -static int eb_submit(struct i915_execbuffer *eb, struct i915_vma *batch)
> +static int eb_request_submit(struct i915_execbuffer *eb,
> +			     struct i915_request *rq,
> +			     struct i915_vma *batch,
> +			     u64 batch_len)
>  {
>  	int err;
>  
> -	if (intel_context_nopreempt(eb->context))
> -		__set_bit(I915_FENCE_FLAG_NOPREEMPT, &eb->request->fence.flags);
> -
> -	err = eb_move_to_gpu(eb);
> -	if (err)
> -		return err;
> +	if (intel_context_nopreempt(rq->context))
> +		__set_bit(I915_FENCE_FLAG_NOPREEMPT, &rq->fence.flags);
>  
>  	if (eb->args->flags & I915_EXEC_GEN7_SOL_RESET) {
> -		err = i915_reset_gen7_sol_offsets(eb->request);
> +		err = i915_reset_gen7_sol_offsets(rq);
>  		if (err)
>  			return err;
>  	}
> @@ -2185,26 +2197,26 @@ static int eb_submit(struct i915_execbuffer *eb, struct i915_vma *batch)
>  	 * allows us to determine if the batch is still waiting on the GPU
>  	 * or actually running by checking the breadcrumb.
>  	 */
> -	if (eb->engine->emit_init_breadcrumb) {
> -		err = eb->engine->emit_init_breadcrumb(eb->request);
> +	if (rq->context->engine->emit_init_breadcrumb) {
> +		err = rq->context->engine->emit_init_breadcrumb(rq);
>  		if (err)
>  			return err;
>  	}
>  
> -	err = eb->engine->emit_bb_start(eb->request,
> -					batch->node.start +
> -					eb->batch_start_offset,
> -					eb->batch_len,
> -					eb->batch_flags);
> +	err = rq->context->engine->emit_bb_start(rq,
> +						 batch->node.start +
> +						 eb->batch_start_offset,
> +						 batch_len,
> +						 eb->batch_flags);
>  	if (err)
>  		return err;
>  
>  	if (eb->trampoline) {
> +		GEM_BUG_ON(intel_context_is_parallel(rq->context));
>  		GEM_BUG_ON(eb->batch_start_offset);
> -		err = eb->engine->emit_bb_start(eb->request,
> -						eb->trampoline->node.start +
> -						eb->batch_len,
> -						0, 0);
> +		err = rq->context->engine->emit_bb_start(rq,
> +							 eb->trampoline->node.start +
> +							 batch_len, 0, 0);
>  		if (err)
>  			return err;
>  	}
> @@ -2212,6 +2224,27 @@ static int eb_submit(struct i915_execbuffer *eb, struct i915_vma *batch)
>  	return 0;
>  }
>  
> +static int eb_submit(struct i915_execbuffer *eb)
> +{
> +	unsigned int i;
> +	int err;
> +
> +	err = eb_move_to_gpu(eb);
> +
> +	for_each_batch_create_order(eb, i) {
> +		if (!eb->requests[i])
> +			break;
> +
> +		trace_i915_request_queue(eb->requests[i], eb->batch_flags);
> +		if (!err)
> +			err = eb_request_submit(eb, eb->requests[i],
> +						eb->batches[i]->vma,
> +						eb->batch_len[i]);
> +	}
> +
> +	return err;
> +}
> +
>  static int num_vcs_engines(const struct drm_i915_private *i915)
>  {
>  	return hweight_long(VDBOX_MASK(&i915->gt));
> @@ -2277,26 +2310,11 @@ static struct i915_request *eb_throttle(struct i915_execbuffer *eb, struct intel
>  	return i915_request_get(rq);
>  }
>  
> -static struct i915_request *eb_pin_engine(struct i915_execbuffer *eb, bool throttle)
> +static int eb_pin_timeline(struct i915_execbuffer *eb, struct intel_context *ce,
> +			   bool throttle)
>  {
> -	struct intel_context *ce = eb->context;
>  	struct intel_timeline *tl;
> -	struct i915_request *rq = NULL;
> -	int err;
> -
> -	GEM_BUG_ON(eb->args->flags & __EXEC_ENGINE_PINNED);
> -
> -	if (unlikely(intel_context_is_banned(ce)))
> -		return ERR_PTR(-EIO);
> -
> -	/*
> -	 * Pinning the contexts may generate requests in order to acquire
> -	 * GGTT space, so do this first before we reserve a seqno for
> -	 * ourselves.
> -	 */
> -	err = intel_context_pin_ww(ce, &eb->ww);
> -	if (err)
> -		return ERR_PTR(err);
> +	struct i915_request *rq;
>  
>  	/*
>  	 * Take a local wakeref for preparing to dispatch the execbuf as
> @@ -2307,33 +2325,108 @@ static struct i915_request *eb_pin_engine(struct i915_execbuffer *eb, bool throt
>  	 * taken on the engine, and the parent device.
>  	 */
>  	tl = intel_context_timeline_lock(ce);
> -	if (IS_ERR(tl)) {
> -		intel_context_unpin(ce);
> -		return ERR_CAST(tl);
> -	}
> +	if (IS_ERR(tl))
> +		return PTR_ERR(tl);
>  
>  	intel_context_enter(ce);
>  	if (throttle)
>  		rq = eb_throttle(eb, ce);
>  	intel_context_timeline_unlock(tl);
>  
> +	if (rq) {
> +		bool nonblock = eb->file->filp->f_flags & O_NONBLOCK;
> +		long timeout = nonblock ? 0 : MAX_SCHEDULE_TIMEOUT;
> +
> +		if (i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE,
> +				      timeout) < 0) {
> +			i915_request_put(rq);
> +
> +			tl = intel_context_timeline_lock(ce);
> +			intel_context_exit(ce);
> +			intel_context_timeline_unlock(tl);
> +
> +			if (nonblock)
> +				return -EWOULDBLOCK;
> +			else
> +				return -EINTR;
> +		}
> +		i915_request_put(rq);
> +	}
> +
> +	return 0;
> +}
> +
> +static int eb_pin_engine(struct i915_execbuffer *eb, bool throttle)
> +{
> +	struct intel_context *ce = eb->context, *child;
> +	int err;
> +	int i = 0, j = 0;
> +
> +	GEM_BUG_ON(eb->args->flags & __EXEC_ENGINE_PINNED);
> +
> +	if (unlikely(intel_context_is_banned(ce)))
> +		return -EIO;
> +
> +	/*
> +	 * Pinning the contexts may generate requests in order to acquire
> +	 * GGTT space, so do this first before we reserve a seqno for
> +	 * ourselves.
> +	 */
> +	err = intel_context_pin_ww(ce, &eb->ww);
> +	if (err)
> +		return err;
> +	for_each_child(ce, child) {
> +		err = intel_context_pin_ww(child, &eb->ww);
> +		GEM_BUG_ON(err);	/* perma-pinned should incr a counter */
> +	}
> +
> +	for_each_child(ce, child) {
> +		err = eb_pin_timeline(eb, child, throttle);
> +		if (err)
> +			goto unwind;
> +		++i;
> +	}
> +	err = eb_pin_timeline(eb, ce, throttle);
> +	if (err)
> +		goto unwind;
> +
>  	eb->args->flags |= __EXEC_ENGINE_PINNED;
> -	return rq;
> +	return 0;
> +
> +unwind:
> +	for_each_child(ce, child) {
> +		if (j++ < i) {
> +			mutex_lock(&child->timeline->mutex);
> +			intel_context_exit(child);
> +			mutex_unlock(&child->timeline->mutex);
> +		}
> +	}
> +	for_each_child(ce, child)
> +		intel_context_unpin(child);
> +	intel_context_unpin(ce);
> +	return err;
>  }
>  
>  static void eb_unpin_engine(struct i915_execbuffer *eb)
>  {
> -	struct intel_context *ce = eb->context;
> -	struct intel_timeline *tl = ce->timeline;
> +	struct intel_context *ce = eb->context, *child;
>  
>  	if (!(eb->args->flags & __EXEC_ENGINE_PINNED))
>  		return;
>  
>  	eb->args->flags &= ~__EXEC_ENGINE_PINNED;
>  
> -	mutex_lock(&tl->mutex);
> +	for_each_child(ce, child) {
> +		mutex_lock(&child->timeline->mutex);
> +		intel_context_exit(child);
> +		mutex_unlock(&child->timeline->mutex);
> +
> +		intel_context_unpin(child);
> +	}
> +
> +	mutex_lock(&ce->timeline->mutex);
>  	intel_context_exit(ce);
> -	mutex_unlock(&tl->mutex);
> +	mutex_unlock(&ce->timeline->mutex);
>  
>  	intel_context_unpin(ce);
>  }
> @@ -2384,7 +2477,7 @@ eb_select_legacy_ring(struct i915_execbuffer *eb)
>  static int
>  eb_select_engine(struct i915_execbuffer *eb)
>  {
> -	struct intel_context *ce;
> +	struct intel_context *ce, *child;
>  	unsigned int idx;
>  	int err;
>  
> @@ -2397,6 +2490,20 @@ eb_select_engine(struct i915_execbuffer *eb)
>  	if (IS_ERR(ce))
>  		return PTR_ERR(ce);
>  
> +	if (intel_context_is_parallel(ce)) {
> +		if (eb->buffer_count < ce->guc_number_children + 1) {
> +			intel_context_put(ce);
> +			return -EINVAL;
> +		}
> +		if (eb->batch_start_offset || eb->args->batch_len) {
> +			intel_context_put(ce);
> +			return -EINVAL;
> +		}
> +	}
> +	eb->num_batches = ce->guc_number_children + 1;
> +
> +	for_each_child(ce, child)
> +		intel_context_get(child);
>  	intel_gt_pm_get(ce->engine->gt);
>  
>  	if (!test_bit(CONTEXT_ALLOC_BIT, &ce->flags)) {
> @@ -2404,6 +2511,13 @@ eb_select_engine(struct i915_execbuffer *eb)
>  		if (err)
>  			goto err;
>  	}
> +	for_each_child(ce, child) {
> +		if (!test_bit(CONTEXT_ALLOC_BIT, &child->flags)) {
> +			err = intel_context_alloc_state(child);
> +			if (err)
> +				goto err;
> +		}
> +	}

This could probably be deleted, since it should already be done when the context
is perma-pinned, but it is harmless, so I may leave it.

>  
>  	/*
>  	 * ABI: Before userspace accesses the GPU (e.g. execbuffer), report
> @@ -2414,7 +2528,7 @@ eb_select_engine(struct i915_execbuffer *eb)
>  		goto err;
>  
>  	eb->context = ce;
> -	eb->engine = ce->engine;
> +	eb->gt = ce->engine->gt;
>  
>  	/*
>  	 * Make sure engine pool stays alive even if we call intel_context_put
> @@ -2425,6 +2539,8 @@ eb_select_engine(struct i915_execbuffer *eb)
>  
>  err:
>  	intel_gt_pm_put(ce->engine->gt);
> +	for_each_child(ce, child)
> +		intel_context_put(child);
>  	intel_context_put(ce);
>  	return err;
>  }
> @@ -2432,7 +2548,11 @@ eb_select_engine(struct i915_execbuffer *eb)
>  static void
>  eb_put_engine(struct i915_execbuffer *eb)
>  {
> -	intel_gt_pm_put(eb->engine->gt);
> +	struct intel_context *child;
> +
> +	intel_gt_pm_put(eb->gt);
> +	for_each_child(eb->context, child)
> +		intel_context_put(child);
>  	intel_context_put(eb->context);
>  }
>  
> @@ -2655,7 +2775,8 @@ static void put_fence_array(struct eb_fence *fences, int num_fences)
>  }
>  
>  static int
> -await_fence_array(struct i915_execbuffer *eb)
> +await_fence_array(struct i915_execbuffer *eb,
> +		  struct i915_request *rq)
>  {
>  	unsigned int n;
>  	int err;
> @@ -2669,8 +2790,7 @@ await_fence_array(struct i915_execbuffer *eb)
>  		if (!eb->fences[n].dma_fence)
>  			continue;
>  
> -		err = i915_request_await_dma_fence(eb->request,
> -						   eb->fences[n].dma_fence);
> +		err = i915_request_await_dma_fence(rq, eb->fences[n].dma_fence);
>  		if (err < 0)
>  			return err;
>  	}
> @@ -2678,9 +2798,9 @@ await_fence_array(struct i915_execbuffer *eb)
>  	return 0;
>  }
>  
> -static void signal_fence_array(const struct i915_execbuffer *eb)
> +static void signal_fence_array(const struct i915_execbuffer *eb,
> +			       struct dma_fence * const fence)
>  {
> -	struct dma_fence * const fence = &eb->request->fence;
>  	unsigned int n;
>  
>  	for (n = 0; n < eb->num_fences; n++) {
> @@ -2728,12 +2848,12 @@ static void retire_requests(struct intel_timeline *tl, struct i915_request *end)
>  			break;
>  }
>  
> -static int eb_request_add(struct i915_execbuffer *eb, int err)
> +static int eb_request_add(struct i915_execbuffer *eb, struct i915_request *rq)

static void

Matt

>  {
> -	struct i915_request *rq = eb->request;
>  	struct intel_timeline * const tl = i915_request_timeline(rq);
>  	struct i915_sched_attr attr = {};
>  	struct i915_request *prev;
> +	int err = 0;
>  
>  	lockdep_assert_held(&tl->mutex);
>  	lockdep_unpin_lock(&tl->mutex, rq->cookie);
> @@ -2745,11 +2865,6 @@ static int eb_request_add(struct i915_execbuffer *eb, int err)
>  	/* Check that the context wasn't destroyed before submission */
>  	if (likely(!intel_context_is_closed(eb->context))) {
>  		attr = eb->gem_context->sched;
> -	} else {
> -		/* Serialise with context_close via the add_to_timeline */
> -		i915_request_set_error_once(rq, -ENOENT);
> -		__i915_request_skip(rq);
> -		err = -ENOENT; /* override any transient errors */
>  	}
>  
>  	__i915_request_queue(rq, &attr);
> @@ -2763,6 +2878,44 @@ static int eb_request_add(struct i915_execbuffer *eb, int err)
>  	return err;
>  }
>  
> +static int eb_requests_add(struct i915_execbuffer *eb, int err)
> +{
> +	int i;
> +
> +	/*
> +	* We iterate in reverse order of creation to release timeline mutexes in
> +	* same order.
> +	*/
> +	for_each_batch_add_order(eb, i) {
> +		struct i915_request *rq = eb->requests[i];
> +
> +		if (!rq)
> +			continue;
> +
> +		if (unlikely(intel_context_is_closed(eb->context))) {
> +			/* Serialise with context_close via the add_to_timeline */
> +			i915_request_set_error_once(rq, -ENOENT);
> +			__i915_request_skip(rq);
> +			err = -ENOENT; /* override any transient errors */
> +		}
> +
> +		if (intel_context_is_parallel(eb->context)) {
> +			if (err) {
> +				__i915_request_skip(rq);
> +				set_bit(I915_FENCE_FLAG_SKIP_PARALLEL,
> +					&rq->fence.flags);
> +			}
> +			if (i == 0)
> +				set_bit(I915_FENCE_FLAG_SUBMIT_PARALLEL,
> +					&rq->fence.flags);
> +		}
> +
> +		err |= eb_request_add(eb, rq);
> +	}
> +
> +	return err;
> +}
> +
>  static const i915_user_extension_fn execbuf_extensions[] = {
>  	[DRM_I915_GEM_EXECBUFFER_EXT_TIMELINE_FENCES] = parse_timeline_fences,
>  };
> @@ -2789,6 +2942,166 @@ parse_execbuf2_extensions(struct drm_i915_gem_execbuffer2 *args,
>  				    eb);
>  }
>  
> +static void eb_requests_get(struct i915_execbuffer *eb)
> +{
> +	unsigned int i;
> +
> +	for_each_batch_create_order(eb, i) {
> +		if (!eb->requests[i])
> +			break;
> +
> +		i915_request_get(eb->requests[i]);
> +	}
> +}
> +
> +static void eb_requests_put(struct i915_execbuffer *eb)
> +{
> +	unsigned int i;
> +
> +	for_each_batch_create_order(eb, i) {
> +		if (!eb->requests[i])
> +			break;
> +
> +		i915_request_put(eb->requests[i]);
> +	}
> +}
> +static struct sync_file *
> +eb_composite_fence_create(struct i915_execbuffer *eb, int out_fence_fd)
> +{
> +	struct sync_file *out_fence = NULL;
> +	struct dma_fence_array *fence_array;
> +	struct dma_fence **fences;
> +	unsigned int i;
> +
> +	GEM_BUG_ON(!intel_context_is_parent(eb->context));
> +
> +	fences = kmalloc(eb->num_batches * sizeof(*fences), GFP_KERNEL);
> +	if (!fences)
> +		return ERR_PTR(-ENOMEM);
> +
> +	for_each_batch_create_order(eb, i)
> +		fences[i] = &eb->requests[i]->fence;
> +
> +	fence_array = dma_fence_array_create(eb->num_batches,
> +					     fences,
> +					     eb->context->fence_context,
> +					     eb->context->seqno,
> +					     false);
> +	if (!fence_array) {
> +		kfree(fences);
> +		return ERR_PTR(-ENOMEM);
> +	}
> +
> +	/* Move ownership to the dma_fence_array created above */
> +	for_each_batch_create_order(eb, i)
> +		dma_fence_get(fences[i]);
> +
> +	if (out_fence_fd != -1) {
> +		out_fence = sync_file_create(&fence_array->base);
> +		/* sync_file now owns fence_array, drop creation ref */
> +		dma_fence_put(&fence_array->base);
> +		if (!out_fence)
> +			return ERR_PTR(-ENOMEM);
> +	}
> +
> +	eb->composite_fence = &fence_array->base;
> +
> +	return out_fence;
> +}
> +
> +static struct intel_context *
> +eb_find_context(struct i915_execbuffer *eb, unsigned int context_number)
> +{
> +	struct intel_context *child;
> +
> +	if (likely(context_number == 0))
> +		return eb->context;
> +
> +	for_each_child(eb->context, child)
> +		if (!--context_number)
> +			return child;
> +
> +	GEM_BUG_ON("Context not found");
> +
> +	return NULL;
> +}
> +
> +static struct sync_file *
> +eb_requests_create(struct i915_execbuffer *eb, struct dma_fence *in_fence,
> +		   int out_fence_fd)
> +{
> +	struct sync_file *out_fence = NULL;
> +	unsigned int i;
> +	int err;
> +
> +	for_each_batch_create_order(eb, i) {
> +		bool first_request_to_add = i + 1 == eb->num_batches;
> +
> +		/* Allocate a request for this batch buffer nice and early. */
> +		eb->requests[i] = i915_request_create(eb_find_context(eb, i));
> +		if (IS_ERR(eb->requests[i])) {
> +			eb->requests[i] = NULL;
> +			return ERR_PTR(PTR_ERR(eb->requests[i]));
> +		}
> +
> +		if (unlikely(eb->gem_context->syncobj &&
> +			     first_request_to_add)) {
> +			struct dma_fence *fence;
> +
> +			fence = drm_syncobj_fence_get(eb->gem_context->syncobj);
> +			err = i915_request_await_dma_fence(eb->requests[i], fence);
> +			dma_fence_put(fence);
> +			if (err)
> +				return ERR_PTR(err);
> +		}
> +
> +		if (in_fence && first_request_to_add) {
> +			if (eb->args->flags & I915_EXEC_FENCE_SUBMIT)
> +				err = i915_request_await_execution(eb->requests[i],
> +								   in_fence);
> +			else
> +				err = i915_request_await_dma_fence(eb->requests[i],
> +								   in_fence);
> +			if (err < 0)
> +				return ERR_PTR(err);
> +		}
> +
> +		if (eb->fences && first_request_to_add) {
> +			err = await_fence_array(eb, eb->requests[i]);
> +			if (err)
> +				return ERR_PTR(err);
> +		}
> +
> +		if (first_request_to_add) {
> +			if (intel_context_is_parallel(eb->context)) {
> +				out_fence = eb_composite_fence_create(eb, out_fence_fd);
> +				if (IS_ERR(out_fence))
> +					return ERR_PTR(-ENOMEM);
> +			} else if (out_fence_fd != -1) {
> +				out_fence = sync_file_create(&eb->requests[i]->fence);
> +				if (!out_fence)
> +					return ERR_PTR(-ENOMEM);
> +			}
> +		}
> +
> +		/*
> +		 * Whilst this request exists, batch_obj will be on the
> +		 * active_list, and so will hold the active reference. Only when
> +		 * this request is retired will the batch_obj be moved onto
> +		 * the inactive_list and lose its active reference. Hence we do
> +		 * not need to explicitly hold another reference here.
> +		 */
> +		eb->requests[i]->batch = eb->batches[i]->vma;
> +		if (eb->batch_pool) {
> +			GEM_BUG_ON(intel_context_is_parallel(eb->context));
> +			intel_gt_buffer_pool_mark_active(eb->batch_pool,
> +							 eb->requests[i]);
> +		}
> +	}
> +
> +	return out_fence;
> +}
> +
>  static int
>  i915_gem_do_execbuffer(struct drm_device *dev,
>  		       struct drm_file *file,
> @@ -2799,7 +3112,6 @@ i915_gem_do_execbuffer(struct drm_device *dev,
>  	struct i915_execbuffer eb;
>  	struct dma_fence *in_fence = NULL;
>  	struct sync_file *out_fence = NULL;
> -	struct i915_vma *batch;
>  	int out_fence_fd = -1;
>  	int err;
>  
> @@ -2823,12 +3135,15 @@ i915_gem_do_execbuffer(struct drm_device *dev,
>  
>  	eb.buffer_count = args->buffer_count;
>  	eb.batch_start_offset = args->batch_start_offset;
> -	eb.batch_len = args->batch_len;
>  	eb.trampoline = NULL;
>  
>  	eb.fences = NULL;
>  	eb.num_fences = 0;
>  
> +	memset(eb.requests, 0, sizeof(struct i915_request *) *
> +	       ARRAY_SIZE(eb.requests));
> +	eb.composite_fence = NULL;
> +
>  	eb.batch_flags = 0;
>  	if (args->flags & I915_EXEC_SECURE) {
>  		if (GRAPHICS_VER(i915) >= 11)
> @@ -2912,70 +3227,25 @@ i915_gem_do_execbuffer(struct drm_device *dev,
>  
>  	ww_acquire_done(&eb.ww.ctx);
>  
> -	batch = eb.batch->vma;
> -
> -	/* Allocate a request for this batch buffer nice and early. */
> -	eb.request = i915_request_create(eb.context);
> -	if (IS_ERR(eb.request)) {
> -		err = PTR_ERR(eb.request);
> -		goto err_vma;
> -	}
> -
> -	if (unlikely(eb.gem_context->syncobj)) {
> -		struct dma_fence *fence;
> -
> -		fence = drm_syncobj_fence_get(eb.gem_context->syncobj);
> -		err = i915_request_await_dma_fence(eb.request, fence);
> -		dma_fence_put(fence);
> -		if (err)
> -			goto err_ext;
> -	}
> -
> -	if (in_fence) {
> -		if (args->flags & I915_EXEC_FENCE_SUBMIT)
> -			err = i915_request_await_execution(eb.request,
> -							   in_fence);
> -		else
> -			err = i915_request_await_dma_fence(eb.request,
> -							   in_fence);
> -		if (err < 0)
> -			goto err_request;
> -	}
> -
> -	if (eb.fences) {
> -		err = await_fence_array(&eb);
> -		if (err)
> -			goto err_request;
> -	}
> -
> -	if (out_fence_fd != -1) {
> -		out_fence = sync_file_create(&eb.request->fence);
> -		if (!out_fence) {
> -			err = -ENOMEM;
> +	out_fence = eb_requests_create(&eb, in_fence, out_fence_fd);
> +	if (IS_ERR(out_fence)) {
> +		err = PTR_ERR(out_fence);
> +		if (eb.requests[0])
>  			goto err_request;
> -		}
> +		else
> +			goto err_vma;
>  	}
>  
> -	/*
> -	 * Whilst this request exists, batch_obj will be on the
> -	 * active_list, and so will hold the active reference. Only when this
> -	 * request is retired will the the batch_obj be moved onto the
> -	 * inactive_list and lose its active reference. Hence we do not need
> -	 * to explicitly hold another reference here.
> -	 */
> -	eb.request->batch = batch;
> -	if (eb.batch_pool)
> -		intel_gt_buffer_pool_mark_active(eb.batch_pool, eb.request);
> -
> -	trace_i915_request_queue(eb.request, eb.batch_flags);
> -	err = eb_submit(&eb, batch);
> +	err = eb_submit(&eb);
>  
>  err_request:
> -	i915_request_get(eb.request);
> -	err = eb_request_add(&eb, err);
> +	eb_requests_get(&eb);
> +	err = eb_requests_add(&eb, err);
>  
>  	if (eb.fences)
> -		signal_fence_array(&eb);
> +		signal_fence_array(&eb, eb.composite_fence ?
> +				   eb.composite_fence :
> +				   &eb.requests[0]->fence);
>  
>  	if (out_fence) {
>  		if (err == 0) {
> @@ -2990,10 +3260,15 @@ i915_gem_do_execbuffer(struct drm_device *dev,
>  
>  	if (unlikely(eb.gem_context->syncobj)) {
>  		drm_syncobj_replace_fence(eb.gem_context->syncobj,
> -					  &eb.request->fence);
> +					  eb.composite_fence ?
> +					  eb.composite_fence :
> +					  &eb.requests[0]->fence);
>  	}
>  
> -	i915_request_put(eb.request);
> +	if (!out_fence && eb.composite_fence)
> +		dma_fence_put(eb.composite_fence);
> +
> +	eb_requests_put(&eb);
>  
>  err_vma:
>  	eb_release_vmas(&eb, true);
> diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h
> index 9dcc1b14697b..1f6a5ae3e33e 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context.h
> +++ b/drivers/gpu/drm/i915/gt/intel_context.h
> @@ -237,7 +237,13 @@ intel_context_timeline_lock(struct intel_context *ce)
>  	struct intel_timeline *tl = ce->timeline;
>  	int err;
>  
> -	err = mutex_lock_interruptible(&tl->mutex);
> +	if (intel_context_is_parent(ce))
> +		err = mutex_lock_interruptible_nested(&tl->mutex, 0);
> +	else if (intel_context_is_child(ce))
> +		err = mutex_lock_interruptible_nested(&tl->mutex,
> +						      ce->guc_child_index + 1);
> +	else
> +		err = mutex_lock_interruptible(&tl->mutex);
>  	if (err)
>  		return ERR_PTR(err);
>  
> diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
> index 727f91e7f7c2..094fcfb5cbe1 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
> @@ -261,6 +261,18 @@ struct intel_context {
>  		 * context.
>  		 */
>  		struct i915_request *last_rq;
> +
> +		/**
> +		 * @fence_context: fence context for composite fence when doing
> +		 * parallel submission
> +		 */
> +		u64 fence_context;
> +
> +		/**
> +		 * @seqno: seqno for composite fence when doing parallel
> +		 * submission
> +		 */
> +		u32 seqno;
>  	};
>  
>  #ifdef CONFIG_DRM_I915_SELFTEST
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index 1a18f99bf12a..2ef38557b0f0 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -3034,6 +3034,8 @@ guc_create_parallel(struct intel_engine_cs **engines,
>  		}
>  	}
>  
> +	parent->fence_context = dma_fence_context_alloc(1);
> +
>  	parent->engine->emit_bb_start =
>  		emit_bb_start_parent_no_preempt_mid_batch;
>  	parent->engine->emit_fini_breadcrumb =
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index 8f0073e19079..602cc246ba85 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -147,6 +147,15 @@ enum {
>  	 * tail.
>  	 */
>  	I915_FENCE_FLAG_SUBMIT_PARALLEL,
> +
> +	/*
> +	 * I915_FENCE_FLAG_SKIP_PARALLEL - request with a context in a
> +	 * parent-child relationship (parallel submission, multi-lrc) that
> +	 * hit an error while generating requests in the execbuf IOCTL.
> +	 * Indicates this request should be skipped as another request in
> +	 * submission / relationship encountered an error.
> +	 */
> +	I915_FENCE_FLAG_SKIP_PARALLEL,
>  };
>  
>  /**
> diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
> index 4b7fc4647e46..90546fa58fc1 100644
> --- a/drivers/gpu/drm/i915/i915_vma.c
> +++ b/drivers/gpu/drm/i915/i915_vma.c
> @@ -1234,9 +1234,10 @@ int __i915_vma_move_to_active(struct i915_vma *vma, struct i915_request *rq)
>  	return i915_active_add_request(&vma->active, rq);
>  }
>  
> -int i915_vma_move_to_active(struct i915_vma *vma,
> -			    struct i915_request *rq,
> -			    unsigned int flags)
> +int _i915_vma_move_to_active(struct i915_vma *vma,
> +			     struct i915_request *rq,
> +			     struct dma_fence *fence,
> +			     unsigned int flags)
>  {
>  	struct drm_i915_gem_object *obj = vma->obj;
>  	int err;
> @@ -1257,9 +1258,11 @@ int i915_vma_move_to_active(struct i915_vma *vma,
>  			intel_frontbuffer_put(front);
>  		}
>  
> -		dma_resv_add_excl_fence(vma->resv, &rq->fence);
> -		obj->write_domain = I915_GEM_DOMAIN_RENDER;
> -		obj->read_domains = 0;
> +		if (fence) {
> +			dma_resv_add_excl_fence(vma->resv, fence);
> +			obj->write_domain = I915_GEM_DOMAIN_RENDER;
> +			obj->read_domains = 0;
> +		}
>  	} else {
>  		if (!(flags & __EXEC_OBJECT_NO_RESERVE)) {
>  			err = dma_resv_reserve_shared(vma->resv, 1);
> @@ -1267,8 +1270,10 @@ int i915_vma_move_to_active(struct i915_vma *vma,
>  				return err;
>  		}
>  
> -		dma_resv_add_shared_fence(vma->resv, &rq->fence);
> -		obj->write_domain = 0;
> +		if (fence) {
> +			dma_resv_add_shared_fence(vma->resv, fence);
> +			obj->write_domain = 0;
> +		}
>  	}
>  
>  	if (flags & EXEC_OBJECT_NEEDS_FENCE && vma->fence)
> diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
> index ed69f66c7ab0..648dbe744c96 100644
> --- a/drivers/gpu/drm/i915/i915_vma.h
> +++ b/drivers/gpu/drm/i915/i915_vma.h
> @@ -57,9 +57,16 @@ static inline bool i915_vma_is_active(const struct i915_vma *vma)
>  
>  int __must_check __i915_vma_move_to_active(struct i915_vma *vma,
>  					   struct i915_request *rq);
> -int __must_check i915_vma_move_to_active(struct i915_vma *vma,
> -					 struct i915_request *rq,
> -					 unsigned int flags);
> +int __must_check _i915_vma_move_to_active(struct i915_vma *vma,
> +					  struct i915_request *rq,
> +					  struct dma_fence *fence,
> +					  unsigned int flags);
> +static inline int __must_check
> +i915_vma_move_to_active(struct i915_vma *vma, struct i915_request *rq,
> +			unsigned int flags)
> +{
> +	return _i915_vma_move_to_active(vma, rq, &rq->fence, flags);
> +}
>  
>  #define __i915_vma_flags(v) ((unsigned long *)&(v)->flags.counter)
>  
> -- 
> 2.32.0
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 8290bdadd167..481978974627 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -244,17 +244,23 @@  struct i915_execbuffer {
 	struct drm_i915_gem_exec_object2 *exec; /** ioctl execobj[] */
 	struct eb_vma *vma;
 
-	struct intel_engine_cs *engine; /** engine to queue the request to */
+	struct intel_gt *gt; /* gt for the execbuf */
 	struct intel_context *context; /* logical state for the request */
 	struct i915_gem_context *gem_context; /** caller's context */
 
-	struct i915_request *request; /** our request to build */
-	struct eb_vma *batch; /** identity of the batch obj/vma */
+	struct i915_request *requests[MAX_ENGINE_INSTANCE + 1]; /** our requests to build */
+	struct eb_vma *batches[MAX_ENGINE_INSTANCE + 1]; /** identity of the batch obj/vma */
 	struct i915_vma *trampoline; /** trampoline used for chaining */
 
+	/** used for excl fence in dma_resv objects when > 1 BB submitted */
+	struct dma_fence *composite_fence;
+
 	/** actual size of execobj[] as we may extend it for the cmdparser */
 	unsigned int buffer_count;
 
+	/* number of batches in execbuf IOCTL */
+	unsigned int num_batches;
+
 	/** list of vma not yet bound during reservation phase */
 	struct list_head unbound;
 
@@ -281,7 +287,7 @@  struct i915_execbuffer {
 
 	u64 invalid_flags; /** Set of execobj.flags that are invalid */
 
-	u64 batch_len; /** Length of batch within object */
+	u64 batch_len[MAX_ENGINE_INSTANCE + 1]; /** Length of batch within object */
 	u32 batch_start_offset; /** Location within object of batch */
 	u32 batch_flags; /** Flags composed for emit_bb_start() */
 	struct intel_gt_buffer_pool_node *batch_pool; /** pool node for batch buffer */
@@ -299,14 +305,13 @@  struct i915_execbuffer {
 };
 
 static int eb_parse(struct i915_execbuffer *eb);
-static struct i915_request *eb_pin_engine(struct i915_execbuffer *eb,
-					  bool throttle);
+static int eb_pin_engine(struct i915_execbuffer *eb, bool throttle);
 static void eb_unpin_engine(struct i915_execbuffer *eb);
 
 static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb)
 {
-	return intel_engine_requires_cmd_parser(eb->engine) ||
-		(intel_engine_using_cmd_parser(eb->engine) &&
+	return intel_engine_requires_cmd_parser(eb->context->engine) ||
+		(intel_engine_using_cmd_parser(eb->context->engine) &&
 		 eb->args->batch_len);
 }
 
@@ -544,11 +549,21 @@  eb_validate_vma(struct i915_execbuffer *eb,
 	return 0;
 }
 
-static void
+static inline bool
+is_batch_buffer(struct i915_execbuffer *eb, unsigned int buffer_idx)
+{
+	return eb->args->flags & I915_EXEC_BATCH_FIRST ?
+		buffer_idx < eb->num_batches :
+		buffer_idx >= eb->args->buffer_count - eb->num_batches;
+}
+
+static int
 eb_add_vma(struct i915_execbuffer *eb,
-	   unsigned int i, unsigned batch_idx,
+	   unsigned int *current_batch,
+	   unsigned int i,
 	   struct i915_vma *vma)
 {
+	struct drm_i915_private *i915 = eb->i915;
 	struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
 	struct eb_vma *ev = &eb->vma[i];
 
@@ -575,15 +590,41 @@  eb_add_vma(struct i915_execbuffer *eb,
 	 * Note that actual hangs have only been observed on gen7, but for
 	 * paranoia do it everywhere.
 	 */
-	if (i == batch_idx) {
+	if (is_batch_buffer(eb, i)) {
 		if (entry->relocation_count &&
 		    !(ev->flags & EXEC_OBJECT_PINNED))
 			ev->flags |= __EXEC_OBJECT_NEEDS_BIAS;
 		if (eb->reloc_cache.has_fence)
 			ev->flags |= EXEC_OBJECT_NEEDS_FENCE;
 
-		eb->batch = ev;
+		eb->batches[*current_batch] = ev;
+
+		if (unlikely(ev->flags & EXEC_OBJECT_WRITE)) {
+			drm_dbg(&i915->drm,
+				"Attempting to use self-modifying batch buffer\n");
+			return -EINVAL;
+		}
+
+		if (range_overflows_t(u64,
+				      eb->batch_start_offset,
+				      eb->args->batch_len,
+				      ev->vma->size)) {
+			drm_dbg(&i915->drm, "Attempting to use out-of-bounds batch\n");
+			return -EINVAL;
+		}
+
+		if (eb->args->batch_len == 0)
+			eb->batch_len[*current_batch] = ev->vma->size -
+				eb->batch_start_offset;
+		if (unlikely(eb->batch_len == 0)) { /* impossible! */
+			drm_dbg(&i915->drm, "Invalid batch length\n");
+			return -EINVAL;
+		}
+
+		++*current_batch;
 	}
+
+	return 0;
 }
 
 static inline int use_cpu_reloc(const struct reloc_cache *cache,
@@ -727,14 +768,6 @@  static int eb_reserve(struct i915_execbuffer *eb)
 	} while (1);
 }
 
-static unsigned int eb_batch_index(const struct i915_execbuffer *eb)
-{
-	if (eb->args->flags & I915_EXEC_BATCH_FIRST)
-		return 0;
-	else
-		return eb->buffer_count - 1;
-}
-
 static int eb_select_context(struct i915_execbuffer *eb)
 {
 	struct i915_gem_context *ctx;
@@ -843,9 +876,7 @@  static struct i915_vma *eb_lookup_vma(struct i915_execbuffer *eb, u32 handle)
 
 static int eb_lookup_vmas(struct i915_execbuffer *eb)
 {
-	struct drm_i915_private *i915 = eb->i915;
-	unsigned int batch = eb_batch_index(eb);
-	unsigned int i;
+	unsigned int i, current_batch = 0;
 	int err = 0;
 
 	INIT_LIST_HEAD(&eb->relocs);
@@ -865,7 +896,9 @@  static int eb_lookup_vmas(struct i915_execbuffer *eb)
 			goto err;
 		}
 
-		eb_add_vma(eb, i, batch, vma);
+		err = eb_add_vma(eb, &current_batch, i, vma);
+		if (err)
+			return err;
 
 		if (i915_gem_object_is_userptr(vma->obj)) {
 			err = i915_gem_object_userptr_submit_init(vma->obj);
@@ -888,26 +921,6 @@  static int eb_lookup_vmas(struct i915_execbuffer *eb)
 		}
 	}
 
-	if (unlikely(eb->batch->flags & EXEC_OBJECT_WRITE)) {
-		drm_dbg(&i915->drm,
-			"Attempting to use self-modifying batch buffer\n");
-		return -EINVAL;
-	}
-
-	if (range_overflows_t(u64,
-			      eb->batch_start_offset, eb->batch_len,
-			      eb->batch->vma->size)) {
-		drm_dbg(&i915->drm, "Attempting to use out-of-bounds batch\n");
-		return -EINVAL;
-	}
-
-	if (eb->batch_len == 0)
-		eb->batch_len = eb->batch->vma->size - eb->batch_start_offset;
-	if (unlikely(eb->batch_len == 0)) { /* impossible! */
-		drm_dbg(&i915->drm, "Invalid batch length\n");
-		return -EINVAL;
-	}
-
 	return 0;
 
 err:
@@ -1640,8 +1653,7 @@  static int eb_reinit_userptr(struct i915_execbuffer *eb)
 	return 0;
 }
 
-static noinline int eb_relocate_parse_slow(struct i915_execbuffer *eb,
-					   struct i915_request *rq)
+static noinline int eb_relocate_parse_slow(struct i915_execbuffer *eb)
 {
 	bool have_copy = false;
 	struct eb_vma *ev;
@@ -1657,21 +1669,6 @@  static noinline int eb_relocate_parse_slow(struct i915_execbuffer *eb,
 	eb_release_vmas(eb, false);
 	i915_gem_ww_ctx_fini(&eb->ww);
 
-	if (rq) {
-		/* nonblocking is always false */
-		if (i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE,
-				      MAX_SCHEDULE_TIMEOUT) < 0) {
-			i915_request_put(rq);
-			rq = NULL;
-
-			err = -EINTR;
-			goto err_relock;
-		}
-
-		i915_request_put(rq);
-		rq = NULL;
-	}
-
 	/*
 	 * We take 3 passes through the slowpatch.
 	 *
@@ -1698,28 +1695,21 @@  static noinline int eb_relocate_parse_slow(struct i915_execbuffer *eb,
 	if (!err)
 		err = eb_reinit_userptr(eb);
 
-err_relock:
 	i915_gem_ww_ctx_init(&eb->ww, true);
 	if (err)
 		goto out;
 
 	/* reacquire the objects */
 repeat_validate:
-	rq = eb_pin_engine(eb, false);
-	if (IS_ERR(rq)) {
-		err = PTR_ERR(rq);
-		rq = NULL;
+	err = eb_pin_engine(eb, false);
+	if (err)
 		goto err;
-	}
-
-	/* We didn't throttle, should be NULL */
-	GEM_WARN_ON(rq);
 
 	err = eb_validate_vmas(eb);
 	if (err)
 		goto err;
 
-	GEM_BUG_ON(!eb->batch);
+	GEM_BUG_ON(!eb->batches[0]);
 
 	list_for_each_entry(ev, &eb->relocs, reloc_link) {
 		if (!have_copy) {
@@ -1783,46 +1773,23 @@  static noinline int eb_relocate_parse_slow(struct i915_execbuffer *eb,
 		}
 	}
 
-	if (rq)
-		i915_request_put(rq);
-
 	return err;
 }
 
 static int eb_relocate_parse(struct i915_execbuffer *eb)
 {
 	int err;
-	struct i915_request *rq = NULL;
 	bool throttle = true;
 
 retry:
-	rq = eb_pin_engine(eb, throttle);
-	if (IS_ERR(rq)) {
-		err = PTR_ERR(rq);
-		rq = NULL;
+	err = eb_pin_engine(eb, throttle);
+	if (err) {
 		if (err != -EDEADLK)
 			return err;
 
 		goto err;
 	}
 
-	if (rq) {
-		bool nonblock = eb->file->filp->f_flags & O_NONBLOCK;
-
-		/* Need to drop all locks now for throttling, take slowpath */
-		err = i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE, 0);
-		if (err == -ETIME) {
-			if (nonblock) {
-				err = -EWOULDBLOCK;
-				i915_request_put(rq);
-				goto err;
-			}
-			goto slow;
-		}
-		i915_request_put(rq);
-		rq = NULL;
-	}
-
 	/* only throttle once, even if we didn't need to throttle */
 	throttle = false;
 
@@ -1862,7 +1829,7 @@  static int eb_relocate_parse(struct i915_execbuffer *eb)
 	return err;
 
 slow:
-	err = eb_relocate_parse_slow(eb, rq);
+	err = eb_relocate_parse_slow(eb);
 	if (err)
 		/*
 		 * If the user expects the execobject.offset and
@@ -1876,11 +1843,31 @@  static int eb_relocate_parse(struct i915_execbuffer *eb)
 	return err;
 }
 
+#define for_each_batch_create_order(_eb, _i) \
+	for (_i = 0; _i < _eb->num_batches; ++_i)
+#define for_each_batch_add_order(_eb, _i) \
+	BUILD_BUG_ON(!typecheck(int, _i)); \
+	for (_i = _eb->num_batches - 1; _i >= 0; --_i)
+
+static struct i915_request *
+eb_find_first_request(struct i915_execbuffer *eb)
+{
+	int i;
+
+	for_each_batch_add_order(eb, i)
+		if (eb->requests[i])
+			return eb->requests[i];
+
+	GEM_BUG_ON("Request not found");
+
+	return NULL;
+}
+
 static int eb_move_to_gpu(struct i915_execbuffer *eb)
 {
 	const unsigned int count = eb->buffer_count;
 	unsigned int i = count;
-	int err = 0;
+	int err = 0, j;
 
 	while (i--) {
 		struct eb_vma *ev = &eb->vma[i];
@@ -1893,11 +1880,17 @@  static int eb_move_to_gpu(struct i915_execbuffer *eb)
 		if (flags & EXEC_OBJECT_CAPTURE) {
 			struct i915_capture_list *capture;
 
-			capture = kmalloc(sizeof(*capture), GFP_KERNEL);
-			if (capture) {
-				capture->next = eb->request->capture_list;
-				capture->vma = vma;
-				eb->request->capture_list = capture;
+			for_each_batch_create_order(eb, j) {
+				if (!eb->requests[j])
+					break;
+
+				capture = kmalloc(sizeof(*capture), GFP_KERNEL);
+				if (capture) {
+					capture->next =
+						eb->requests[j]->capture_list;
+					capture->vma = vma;
+					eb->requests[j]->capture_list = capture;
+				}
 			}
 		}
 
@@ -1918,14 +1911,26 @@  static int eb_move_to_gpu(struct i915_execbuffer *eb)
 				flags &= ~EXEC_OBJECT_ASYNC;
 		}
 
+		/* We only need to await on the first request */
 		if (err == 0 && !(flags & EXEC_OBJECT_ASYNC)) {
 			err = i915_request_await_object
-				(eb->request, obj, flags & EXEC_OBJECT_WRITE);
+				(eb_find_first_request(eb), obj,
+				 flags & EXEC_OBJECT_WRITE);
 		}
 
-		if (err == 0)
-			err = i915_vma_move_to_active(vma, eb->request,
-						      flags | __EXEC_OBJECT_NO_RESERVE);
+		for_each_batch_add_order(eb, j) {
+			if (err)
+				break;
+			if (!eb->requests[j])
+				continue;
+
+			err = _i915_vma_move_to_active(vma, eb->requests[j],
+						       j ? NULL :
+						       eb->composite_fence ?
+						       eb->composite_fence :
+						       &eb->requests[j]->fence,
+						       flags | __EXEC_OBJECT_NO_RESERVE);
+		}
 	}
 
 #ifdef CONFIG_MMU_NOTIFIER
@@ -1956,11 +1961,16 @@  static int eb_move_to_gpu(struct i915_execbuffer *eb)
 		goto err_skip;
 
 	/* Unconditionally flush any chipset caches (for streaming writes). */
-	intel_gt_chipset_flush(eb->engine->gt);
+	intel_gt_chipset_flush(eb->gt);
 	return 0;
 
 err_skip:
-	i915_request_set_error_once(eb->request, err);
+	for_each_batch_create_order(eb, j) {
+		if (!eb->requests[j])
+			break;
+
+		i915_request_set_error_once(eb->requests[j], err);
+	}
 	return err;
 }
 
@@ -2055,14 +2065,16 @@  static int eb_parse(struct i915_execbuffer *eb)
 	int err;
 
 	if (!eb_use_cmdparser(eb)) {
-		batch = eb_dispatch_secure(eb, eb->batch->vma);
+		batch = eb_dispatch_secure(eb, eb->batches[0]->vma);
 		if (IS_ERR(batch))
 			return PTR_ERR(batch);
 
 		goto secure_batch;
 	}
 
-	len = eb->batch_len;
+	GEM_BUG_ON(intel_context_is_parallel(eb->context));
+
+	len = eb->batch_len[0];
 	if (!CMDPARSER_USES_GGTT(eb->i915)) {
 		/*
 		 * ppGTT backed shadow buffers must be mapped RO, to prevent
@@ -2076,11 +2088,11 @@  static int eb_parse(struct i915_execbuffer *eb)
 	} else {
 		len += I915_CMD_PARSER_TRAMPOLINE_SIZE;
 	}
-	if (unlikely(len < eb->batch_len)) /* last paranoid check of overflow */
+	if (unlikely(len < eb->batch_len[0])) /* last paranoid check of overflow */
 		return -EINVAL;
 
 	if (!pool) {
-		pool = intel_gt_get_buffer_pool(eb->engine->gt, len,
+		pool = intel_gt_get_buffer_pool(eb->gt, len,
 						I915_MAP_WB);
 		if (IS_ERR(pool))
 			return PTR_ERR(pool);
@@ -2105,7 +2117,7 @@  static int eb_parse(struct i915_execbuffer *eb)
 		trampoline = shadow;
 
 		shadow = shadow_batch_pin(eb, pool->obj,
-					  &eb->engine->gt->ggtt->vm,
+					  &eb->gt->ggtt->vm,
 					  PIN_GLOBAL);
 		if (IS_ERR(shadow)) {
 			err = PTR_ERR(shadow);
@@ -2127,26 +2139,27 @@  static int eb_parse(struct i915_execbuffer *eb)
 	if (err)
 		goto err_trampoline;
 
-	err = intel_engine_cmd_parser(eb->engine,
-				      eb->batch->vma,
+	err = intel_engine_cmd_parser(eb->context->engine,
+				      eb->batches[0]->vma,
 				      eb->batch_start_offset,
-				      eb->batch_len,
+				      eb->batch_len[0],
 				      shadow, trampoline);
 	if (err)
 		goto err_unpin_batch;
 
-	eb->batch = &eb->vma[eb->buffer_count++];
-	eb->batch->vma = i915_vma_get(shadow);
-	eb->batch->flags = __EXEC_OBJECT_HAS_PIN;
+	eb->batches[0] = &eb->vma[eb->buffer_count++];
+	eb->batches[0]->vma = i915_vma_get(shadow);
+	eb->batches[0]->flags = __EXEC_OBJECT_HAS_PIN;
 
 	eb->trampoline = trampoline;
 	eb->batch_start_offset = 0;
 
 secure_batch:
 	if (batch) {
-		eb->batch = &eb->vma[eb->buffer_count++];
-		eb->batch->flags = __EXEC_OBJECT_HAS_PIN;
-		eb->batch->vma = i915_vma_get(batch);
+		GEM_BUG_ON(intel_context_is_parallel(eb->context));
+		eb->batches[0] = &eb->vma[eb->buffer_count++];
+		eb->batches[0]->flags = __EXEC_OBJECT_HAS_PIN;
+		eb->batches[0]->vma = i915_vma_get(batch);
 	}
 	return 0;
 
@@ -2162,19 +2175,18 @@  static int eb_parse(struct i915_execbuffer *eb)
 	return err;
 }
 
-static int eb_submit(struct i915_execbuffer *eb, struct i915_vma *batch)
+static int eb_request_submit(struct i915_execbuffer *eb,
+			     struct i915_request *rq,
+			     struct i915_vma *batch,
+			     u64 batch_len)
 {
 	int err;
 
-	if (intel_context_nopreempt(eb->context))
-		__set_bit(I915_FENCE_FLAG_NOPREEMPT, &eb->request->fence.flags);
-
-	err = eb_move_to_gpu(eb);
-	if (err)
-		return err;
+	if (intel_context_nopreempt(rq->context))
+		__set_bit(I915_FENCE_FLAG_NOPREEMPT, &rq->fence.flags);
 
 	if (eb->args->flags & I915_EXEC_GEN7_SOL_RESET) {
-		err = i915_reset_gen7_sol_offsets(eb->request);
+		err = i915_reset_gen7_sol_offsets(rq);
 		if (err)
 			return err;
 	}
@@ -2185,26 +2197,26 @@  static int eb_submit(struct i915_execbuffer *eb, struct i915_vma *batch)
 	 * allows us to determine if the batch is still waiting on the GPU
 	 * or actually running by checking the breadcrumb.
 	 */
-	if (eb->engine->emit_init_breadcrumb) {
-		err = eb->engine->emit_init_breadcrumb(eb->request);
+	if (rq->context->engine->emit_init_breadcrumb) {
+		err = rq->context->engine->emit_init_breadcrumb(rq);
 		if (err)
 			return err;
 	}
 
-	err = eb->engine->emit_bb_start(eb->request,
-					batch->node.start +
-					eb->batch_start_offset,
-					eb->batch_len,
-					eb->batch_flags);
+	err = rq->context->engine->emit_bb_start(rq,
+						 batch->node.start +
+						 eb->batch_start_offset,
+						 batch_len,
+						 eb->batch_flags);
 	if (err)
 		return err;
 
 	if (eb->trampoline) {
+		GEM_BUG_ON(intel_context_is_parallel(rq->context));
 		GEM_BUG_ON(eb->batch_start_offset);
-		err = eb->engine->emit_bb_start(eb->request,
-						eb->trampoline->node.start +
-						eb->batch_len,
-						0, 0);
+		err = rq->context->engine->emit_bb_start(rq,
+							 eb->trampoline->node.start +
+							 batch_len, 0, 0);
 		if (err)
 			return err;
 	}
@@ -2212,6 +2224,27 @@  static int eb_submit(struct i915_execbuffer *eb, struct i915_vma *batch)
 	return 0;
 }
 
+/*
+ * Calls eb_move_to_gpu() and then, walking the requests in creation order,
+ * traces every populated request and emits its batch via eb_request_submit().
+ * Once an error has been recorded (from eb_move_to_gpu() or an earlier
+ * request) the remaining requests are still traced but no longer submitted.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+static int eb_submit(struct i915_execbuffer *eb)
+{
+	unsigned int i;
+	int err;
+
+	err = eb_move_to_gpu(eb);
+
+	for_each_batch_create_order(eb, i) {
+		/* An empty slot marks the end of the requests that were created. */
+		if (!eb->requests[i])
+			break;
+
+		trace_i915_request_queue(eb->requests[i], eb->batch_flags);
+		if (!err)
+			err = eb_request_submit(eb, eb->requests[i],
+						eb->batches[i]->vma,
+						eb->batch_len[i]);
+	}
+
+	return err;
+}
+
 static int num_vcs_engines(const struct drm_i915_private *i915)
 {
 	return hweight_long(VDBOX_MASK(&i915->gt));
@@ -2277,26 +2310,11 @@  static struct i915_request *eb_throttle(struct i915_execbuffer *eb, struct intel
 	return i915_request_get(rq);
 }
 
-static struct i915_request *eb_pin_engine(struct i915_execbuffer *eb, bool throttle)
+/*
+ * Lock @ce's timeline, enter the context and, when @throttle is set, wait for
+ * the request returned by eb_throttle() (if any).
+ *
+ * Returns 0 on success, a negative error code if the timeline lock could not
+ * be taken, -EWOULDBLOCK if the wait failed on an O_NONBLOCK file, or -EINTR
+ * if the wait failed otherwise. On a failed wait the context is exited again
+ * before returning.
+ */
+static int eb_pin_timeline(struct i915_execbuffer *eb, struct intel_context *ce,
+			   bool throttle)
 {
-	struct intel_context *ce = eb->context;
 	struct intel_timeline *tl;
-	struct i915_request *rq = NULL;
-	int err;
-
-	GEM_BUG_ON(eb->args->flags & __EXEC_ENGINE_PINNED);
-
-	if (unlikely(intel_context_is_banned(ce)))
-		return ERR_PTR(-EIO);
-
-	/*
-	 * Pinning the contexts may generate requests in order to acquire
-	 * GGTT space, so do this first before we reserve a seqno for
-	 * ourselves.
-	 */
-	err = intel_context_pin_ww(ce, &eb->ww);
-	if (err)
-		return ERR_PTR(err);
+	/* Must start out NULL: eb_throttle() is only called when @throttle is set. */
+	struct i915_request *rq = NULL;
 
 	/*
 	 * Take a local wakeref for preparing to dispatch the execbuf as
@@ -2307,33 +2325,108 @@  static struct i915_request *eb_pin_engine(struct i915_execbuffer *eb, bool throt
 	 * taken on the engine, and the parent device.
 	 */
 	tl = intel_context_timeline_lock(ce);
-	if (IS_ERR(tl)) {
-		intel_context_unpin(ce);
-		return ERR_CAST(tl);
-	}
+	if (IS_ERR(tl))
+		return PTR_ERR(tl);
 
 	intel_context_enter(ce);
 	if (throttle)
 		rq = eb_throttle(eb, ce);
 	intel_context_timeline_unlock(tl);
 
+	if (rq) {
+		bool nonblock = eb->file->filp->f_flags & O_NONBLOCK;
+		long timeout = nonblock ? 0 : MAX_SCHEDULE_TIMEOUT;
+
+		if (i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE,
+				      timeout) < 0) {
+			i915_request_put(rq);
+
+			/* Undo the intel_context_enter() above before bailing. */
+			tl = intel_context_timeline_lock(ce);
+			intel_context_exit(ce);
+			intel_context_timeline_unlock(tl);
+
+			if (nonblock)
+				return -EWOULDBLOCK;
+			else
+				return -EINTR;
+		}
+		i915_request_put(rq);
+	}
+
+	return 0;
+}
+
+/*
+ * Pin the parent context and each of its children, then run
+ * eb_pin_timeline() on every context (children first, parent last). On
+ * success __EXEC_ENGINE_PINNED is set in eb->args->flags.
+ *
+ * Returns 0 on success, -EIO if the parent context is banned, or the error
+ * from pinning the parent / from eb_pin_timeline(). When eb_pin_timeline()
+ * fails, the contexts already entered are exited and all pins are dropped.
+ */
+static int eb_pin_engine(struct i915_execbuffer *eb, bool throttle)
+{
+	struct intel_context *ce = eb->context, *child;
+	int err;
+	/* i: children whose eb_pin_timeline() succeeded; j: unwind cursor */
+	int i = 0, j = 0;
+
+	GEM_BUG_ON(eb->args->flags & __EXEC_ENGINE_PINNED);
+
+	if (unlikely(intel_context_is_banned(ce)))
+		return -EIO;
+
+	/*
+	 * Pinning the contexts may generate requests in order to acquire
+	 * GGTT space, so do this first before we reserve a seqno for
+	 * ourselves.
+	 */
+	err = intel_context_pin_ww(ce, &eb->ww);
+	if (err)
+		return err;
+	for_each_child(ce, child) {
+		err = intel_context_pin_ww(child, &eb->ww);
+		GEM_BUG_ON(err);	/* perma-pinned should incr a counter */
+	}
+
+	for_each_child(ce, child) {
+		err = eb_pin_timeline(eb, child, throttle);
+		if (err)
+			goto unwind;
+		++i;
+	}
+	err = eb_pin_timeline(eb, ce, throttle);
+	if (err)
+		goto unwind;
+
 	eb->args->flags |= __EXEC_ENGINE_PINNED;
-	return rq;
+	return 0;
+
+unwind:
+	/* Only the first i children were entered; exit just those. */
+	for_each_child(ce, child) {
+		if (j++ < i) {
+			mutex_lock(&child->timeline->mutex);
+			intel_context_exit(child);
+			mutex_unlock(&child->timeline->mutex);
+		}
+	}
+	/* Every context was pinned above, so every pin is dropped here. */
+	for_each_child(ce, child)
+		intel_context_unpin(child);
+	intel_context_unpin(ce);
+	return err;
 }
 
+/*
+ * Counterpart of eb_pin_engine(): does nothing unless __EXEC_ENGINE_PINNED is
+ * set; otherwise clears the flag, then exits (under the timeline mutex) and
+ * unpins every child context, followed by the parent.
+ */
 static void eb_unpin_engine(struct i915_execbuffer *eb)
 {
-	struct intel_context *ce = eb->context;
-	struct intel_timeline *tl = ce->timeline;
+	struct intel_context *ce = eb->context, *child;
 
 	if (!(eb->args->flags & __EXEC_ENGINE_PINNED))
 		return;
 
 	eb->args->flags &= ~__EXEC_ENGINE_PINNED;
 
-	mutex_lock(&tl->mutex);
+	for_each_child(ce, child) {
+		mutex_lock(&child->timeline->mutex);
+		intel_context_exit(child);
+		mutex_unlock(&child->timeline->mutex);
+
+		intel_context_unpin(child);
+	}
+
+	mutex_lock(&ce->timeline->mutex);
 	intel_context_exit(ce);
-	mutex_unlock(&tl->mutex);
+	mutex_unlock(&ce->timeline->mutex);
 
 	intel_context_unpin(ce);
 }
@@ -2384,7 +2477,7 @@  eb_select_legacy_ring(struct i915_execbuffer *eb)
 static int
 eb_select_engine(struct i915_execbuffer *eb)
 {
-	struct intel_context *ce;
+	struct intel_context *ce, *child;
 	unsigned int idx;
 	int err;
 
@@ -2397,6 +2490,20 @@  eb_select_engine(struct i915_execbuffer *eb)
 	if (IS_ERR(ce))
 		return PTR_ERR(ce);
 
+	if (intel_context_is_parallel(ce)) {
+		if (eb->buffer_count < ce->guc_number_children + 1) {
+			intel_context_put(ce);
+			return -EINVAL;
+		}
+		if (eb->batch_start_offset || eb->args->batch_len) {
+			intel_context_put(ce);
+			return -EINVAL;
+		}
+	}
+	eb->num_batches = ce->guc_number_children + 1;
+
+	for_each_child(ce, child)
+		intel_context_get(child);
 	intel_gt_pm_get(ce->engine->gt);
 
 	if (!test_bit(CONTEXT_ALLOC_BIT, &ce->flags)) {
@@ -2404,6 +2511,13 @@  eb_select_engine(struct i915_execbuffer *eb)
 		if (err)
 			goto err;
 	}
+	for_each_child(ce, child) {
+		if (!test_bit(CONTEXT_ALLOC_BIT, &child->flags)) {
+			err = intel_context_alloc_state(child);
+			if (err)
+				goto err;
+		}
+	}
 
 	/*
 	 * ABI: Before userspace accesses the GPU (e.g. execbuffer), report
@@ -2414,7 +2528,7 @@  eb_select_engine(struct i915_execbuffer *eb)
 		goto err;
 
 	eb->context = ce;
-	eb->engine = ce->engine;
+	eb->gt = ce->engine->gt;
 
 	/*
 	 * Make sure engine pool stays alive even if we call intel_context_put
@@ -2425,6 +2539,8 @@  eb_select_engine(struct i915_execbuffer *eb)
 
 err:
 	intel_gt_pm_put(ce->engine->gt);
+	for_each_child(ce, child)
+		intel_context_put(child);
 	intel_context_put(ce);
 	return err;
 }
@@ -2432,7 +2548,11 @@  eb_select_engine(struct i915_execbuffer *eb)
+/*
+ * Drop the GT power-management reference held via eb->gt, then the context
+ * references on every child of eb->context and on eb->context itself.
+ */
 static void
 eb_put_engine(struct i915_execbuffer *eb)
 {
-	intel_gt_pm_put(eb->engine->gt);
+	struct intel_context *child;
+
+	intel_gt_pm_put(eb->gt);
+	for_each_child(eb->context, child)
+		intel_context_put(child);
 	intel_context_put(eb->context);
 }
 
@@ -2655,7 +2775,8 @@  static void put_fence_array(struct eb_fence *fences, int num_fences)
 }
 
 static int
-await_fence_array(struct i915_execbuffer *eb)
+await_fence_array(struct i915_execbuffer *eb,
+		  struct i915_request *rq)
 {
 	unsigned int n;
 	int err;
@@ -2669,8 +2790,7 @@  await_fence_array(struct i915_execbuffer *eb)
 		if (!eb->fences[n].dma_fence)
 			continue;
 
-		err = i915_request_await_dma_fence(eb->request,
-						   eb->fences[n].dma_fence);
+		err = i915_request_await_dma_fence(rq, eb->fences[n].dma_fence);
 		if (err < 0)
 			return err;
 	}
@@ -2678,9 +2798,9 @@  await_fence_array(struct i915_execbuffer *eb)
 	return 0;
 }
 
-static void signal_fence_array(const struct i915_execbuffer *eb)
+static void signal_fence_array(const struct i915_execbuffer *eb,
+			       struct dma_fence * const fence)
 {
-	struct dma_fence * const fence = &eb->request->fence;
 	unsigned int n;
 
 	for (n = 0; n < eb->num_fences; n++) {
@@ -2728,12 +2848,12 @@  static void retire_requests(struct intel_timeline *tl, struct i915_request *end)
 			break;
 }
 
-static int eb_request_add(struct i915_execbuffer *eb, int err)
+static int eb_request_add(struct i915_execbuffer *eb, struct i915_request *rq)
 {
-	struct i915_request *rq = eb->request;
 	struct intel_timeline * const tl = i915_request_timeline(rq);
 	struct i915_sched_attr attr = {};
 	struct i915_request *prev;
+	int err = 0;
 
 	lockdep_assert_held(&tl->mutex);
 	lockdep_unpin_lock(&tl->mutex, rq->cookie);
@@ -2745,11 +2865,6 @@  static int eb_request_add(struct i915_execbuffer *eb, int err)
 	/* Check that the context wasn't destroyed before submission */
 	if (likely(!intel_context_is_closed(eb->context))) {
 		attr = eb->gem_context->sched;
-	} else {
-		/* Serialise with context_close via the add_to_timeline */
-		i915_request_set_error_once(rq, -ENOENT);
-		__i915_request_skip(rq);
-		err = -ENOENT; /* override any transient errors */
 	}
 
 	__i915_request_queue(rq, &attr);
@@ -2763,6 +2878,44 @@  static int eb_request_add(struct i915_execbuffer *eb, int err)
 	return err;
 }
 
+/*
+ * Hand every populated request to eb_request_add(), walking in add order
+ * (the reverse of creation order). @err is the error accumulated by the
+ * caller so far.
+ *
+ * If the context has been closed, each request is marked with -ENOENT and
+ * skipped. For a parallel context, a pending error causes the request to be
+ * skipped and flagged I915_FENCE_FLAG_SKIP_PARALLEL, and the request at
+ * index 0 is flagged I915_FENCE_FLAG_SUBMIT_PARALLEL.
+ *
+ * Returns @err combined with the results of eb_request_add().
+ */
+static int eb_requests_add(struct i915_execbuffer *eb, int err)
+{
+	int i;
+
+	/*
+	 * We iterate in reverse order of creation to release timeline mutexes
+	 * in the same order.
+	 */
+	for_each_batch_add_order(eb, i) {
+		struct i915_request *rq = eb->requests[i];
+
+		if (!rq)
+			continue;
+
+		if (unlikely(intel_context_is_closed(eb->context))) {
+			/* Serialise with context_close via the add_to_timeline */
+			i915_request_set_error_once(rq, -ENOENT);
+			__i915_request_skip(rq);
+			err = -ENOENT; /* override any transient errors */
+		}
+
+		if (intel_context_is_parallel(eb->context)) {
+			if (err) {
+				__i915_request_skip(rq);
+				set_bit(I915_FENCE_FLAG_SKIP_PARALLEL,
+					&rq->fence.flags);
+			}
+			/* Index 0 is the last request reached in add order. */
+			if (i == 0)
+				set_bit(I915_FENCE_FLAG_SUBMIT_PARALLEL,
+					&rq->fence.flags);
+		}
+
+		/*
+		 * NOTE(review): OR-ing errno values only yields a valid code
+		 * while at most one side is non-zero -- confirm that
+		 * eb_request_add() cannot fail when err is already set.
+		 */
+		err |= eb_request_add(eb, rq);
+	}
+
+	return err;
+}
+
 static const i915_user_extension_fn execbuf_extensions[] = {
 	[DRM_I915_GEM_EXECBUFFER_EXT_TIMELINE_FENCES] = parse_timeline_fences,
 };
@@ -2789,6 +2942,166 @@  parse_execbuf2_extensions(struct drm_i915_gem_execbuffer2 *args,
 				    eb);
 }
 
+/*
+ * Take a reference on each request in creation order, stopping at the first
+ * empty eb->requests[] slot.
+ */
+static void eb_requests_get(struct i915_execbuffer *eb)
+{
+	unsigned int n;
+
+	for_each_batch_create_order(eb, n) {
+		struct i915_request *rq = eb->requests[n];
+
+		if (rq == NULL)
+			break;
+
+		i915_request_get(rq);
+	}
+}
+
+/*
+ * Drop a reference on each request in creation order, stopping at the first
+ * empty eb->requests[] slot.
+ */
+static void eb_requests_put(struct i915_execbuffer *eb)
+{
+	unsigned int n;
+
+	for_each_batch_create_order(eb, n) {
+		struct i915_request *rq = eb->requests[n];
+
+		if (rq == NULL)
+			break;
+
+		i915_request_put(rq);
+	}
+}
+/*
+ * Build a dma_fence_array over the fence of every request and store it in
+ * eb->composite_fence. When @out_fence_fd is not -1, a sync_file wrapping the
+ * array is created as well.
+ *
+ * Must only be called for a parent context. Returns the sync_file (NULL when
+ * @out_fence_fd is -1) or ERR_PTR(-ENOMEM) on allocation failure.
+ */
+static struct sync_file *
+eb_composite_fence_create(struct i915_execbuffer *eb, int out_fence_fd)
+{
+	struct sync_file *out_fence = NULL;
+	struct dma_fence_array *fence_array;
+	struct dma_fence **fences;
+	unsigned int i;
+
+	GEM_BUG_ON(!intel_context_is_parent(eb->context));
+
+	fences = kmalloc(eb->num_batches * sizeof(*fences), GFP_KERNEL);
+	if (!fences)
+		return ERR_PTR(-ENOMEM);
+
+	for_each_batch_create_order(eb, i)
+		fences[i] = &eb->requests[i]->fence;
+
+	fence_array = dma_fence_array_create(eb->num_batches,
+					     fences,
+					     eb->context->fence_context,
+					     eb->context->seqno,
+					     false);
+	if (!fence_array) {
+		kfree(fences);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/* Move ownership to the dma_fence_array created above */
+	for_each_batch_create_order(eb, i)
+		dma_fence_get(fences[i]);
+
+	if (out_fence_fd != -1) {
+		out_fence = sync_file_create(&fence_array->base);
+		/* sync_file now owns fence_array, drop creation ref */
+		dma_fence_put(&fence_array->base);
+		if (!out_fence)
+			return ERR_PTR(-ENOMEM);
+	}
+
+	eb->composite_fence = &fence_array->base;
+
+	return out_fence;
+}
+
+/*
+ * Map a batch index to its context: 0 is eb->context itself, N > 0 is the
+ * N-th child visited by for_each_child(). If no such child exists,
+ * GEM_BUG_ON() fires and NULL is returned.
+ */
+static struct intel_context *
+eb_find_context(struct i915_execbuffer *eb, unsigned int context_number)
+{
+	struct intel_context *child;
+	unsigned int remaining = context_number;
+
+	if (likely(!remaining))
+		return eb->context;
+
+	for_each_child(eb->context, child) {
+		remaining--;
+		if (remaining == 0)
+			return child;
+	}
+
+	GEM_BUG_ON("Context not found");
+
+	return NULL;
+}
+
+/*
+ * Create one request per batch, in creation order, storing them in
+ * eb->requests[]. The last request created is the first one to be added, so
+ * it is the one that awaits the context syncobj, @in_fence and the user fence
+ * array, and it is where the out fence is created.
+ *
+ * Returns the out-fence sync_file (NULL if none was produced) or an ERR_PTR()
+ * on failure. Requests created before a failure are left in eb->requests[]
+ * for the caller to deal with.
+ */
+static struct sync_file *
+eb_requests_create(struct i915_execbuffer *eb, struct dma_fence *in_fence,
+		   int out_fence_fd)
+{
+	struct sync_file *out_fence = NULL;
+	unsigned int i;
+	int err;
+
+	for_each_batch_create_order(eb, i) {
+		bool first_request_to_add = i + 1 == eb->num_batches;
+
+		/* Allocate a request for this batch buffer nice and early. */
+		eb->requests[i] = i915_request_create(eb_find_context(eb, i));
+		if (IS_ERR(eb->requests[i])) {
+			/*
+			 * Save the error before clearing the slot; reading the
+			 * slot back afterwards would yield ERR_PTR(0), i.e.
+			 * NULL, and the failure would go unnoticed.
+			 */
+			out_fence = ERR_CAST(eb->requests[i]);
+			eb->requests[i] = NULL;
+			return out_fence;
+		}
+
+		if (unlikely(eb->gem_context->syncobj &&
+			     first_request_to_add)) {
+			struct dma_fence *fence;
+
+			fence = drm_syncobj_fence_get(eb->gem_context->syncobj);
+			err = i915_request_await_dma_fence(eb->requests[i], fence);
+			dma_fence_put(fence);
+			if (err)
+				return ERR_PTR(err);
+		}
+
+		if (in_fence && first_request_to_add) {
+			if (eb->args->flags & I915_EXEC_FENCE_SUBMIT)
+				err = i915_request_await_execution(eb->requests[i],
+								   in_fence);
+			else
+				err = i915_request_await_dma_fence(eb->requests[i],
+								   in_fence);
+			if (err < 0)
+				return ERR_PTR(err);
+		}
+
+		if (eb->fences && first_request_to_add) {
+			err = await_fence_array(eb, eb->requests[i]);
+			if (err)
+				return ERR_PTR(err);
+		}
+
+		if (first_request_to_add) {
+			if (intel_context_is_parallel(eb->context)) {
+				out_fence = eb_composite_fence_create(eb, out_fence_fd);
+				/* Hand the helper's own error code back unchanged. */
+				if (IS_ERR(out_fence))
+					return out_fence;
+			} else if (out_fence_fd != -1) {
+				out_fence = sync_file_create(&eb->requests[i]->fence);
+				if (!out_fence)
+					return ERR_PTR(-ENOMEM);
+			}
+		}
+
+		/*
+		 * Whilst this request exists, batch_obj will be on the
+		 * active_list, and so will hold the active reference. Only when
+		 * this request is retired will the batch_obj be moved onto
+		 * the inactive_list and lose its active reference. Hence we do
+		 * not need to explicitly hold another reference here.
+		 */
+		eb->requests[i]->batch = eb->batches[i]->vma;
+		if (eb->batch_pool) {
+			GEM_BUG_ON(intel_context_is_parallel(eb->context));
+			intel_gt_buffer_pool_mark_active(eb->batch_pool,
+							 eb->requests[i]);
+		}
+	}
+
+	return out_fence;
+}
+
 static int
 i915_gem_do_execbuffer(struct drm_device *dev,
 		       struct drm_file *file,
@@ -2799,7 +3112,6 @@  i915_gem_do_execbuffer(struct drm_device *dev,
 	struct i915_execbuffer eb;
 	struct dma_fence *in_fence = NULL;
 	struct sync_file *out_fence = NULL;
-	struct i915_vma *batch;
 	int out_fence_fd = -1;
 	int err;
 
@@ -2823,12 +3135,15 @@  i915_gem_do_execbuffer(struct drm_device *dev,
 
 	eb.buffer_count = args->buffer_count;
 	eb.batch_start_offset = args->batch_start_offset;
-	eb.batch_len = args->batch_len;
 	eb.trampoline = NULL;
 
 	eb.fences = NULL;
 	eb.num_fences = 0;
 
+	memset(eb.requests, 0, sizeof(struct i915_request *) *
+	       ARRAY_SIZE(eb.requests));
+	eb.composite_fence = NULL;
+
 	eb.batch_flags = 0;
 	if (args->flags & I915_EXEC_SECURE) {
 		if (GRAPHICS_VER(i915) >= 11)
@@ -2912,70 +3227,25 @@  i915_gem_do_execbuffer(struct drm_device *dev,
 
 	ww_acquire_done(&eb.ww.ctx);
 
-	batch = eb.batch->vma;
-
-	/* Allocate a request for this batch buffer nice and early. */
-	eb.request = i915_request_create(eb.context);
-	if (IS_ERR(eb.request)) {
-		err = PTR_ERR(eb.request);
-		goto err_vma;
-	}
-
-	if (unlikely(eb.gem_context->syncobj)) {
-		struct dma_fence *fence;
-
-		fence = drm_syncobj_fence_get(eb.gem_context->syncobj);
-		err = i915_request_await_dma_fence(eb.request, fence);
-		dma_fence_put(fence);
-		if (err)
-			goto err_ext;
-	}
-
-	if (in_fence) {
-		if (args->flags & I915_EXEC_FENCE_SUBMIT)
-			err = i915_request_await_execution(eb.request,
-							   in_fence);
-		else
-			err = i915_request_await_dma_fence(eb.request,
-							   in_fence);
-		if (err < 0)
-			goto err_request;
-	}
-
-	if (eb.fences) {
-		err = await_fence_array(&eb);
-		if (err)
-			goto err_request;
-	}
-
-	if (out_fence_fd != -1) {
-		out_fence = sync_file_create(&eb.request->fence);
-		if (!out_fence) {
-			err = -ENOMEM;
+	out_fence = eb_requests_create(&eb, in_fence, out_fence_fd);
+	if (IS_ERR(out_fence)) {
+		err = PTR_ERR(out_fence);
+		if (eb.requests[0])
 			goto err_request;
-		}
+		else
+			goto err_vma;
 	}
 
-	/*
-	 * Whilst this request exists, batch_obj will be on the
-	 * active_list, and so will hold the active reference. Only when this
-	 * request is retired will the the batch_obj be moved onto the
-	 * inactive_list and lose its active reference. Hence we do not need
-	 * to explicitly hold another reference here.
-	 */
-	eb.request->batch = batch;
-	if (eb.batch_pool)
-		intel_gt_buffer_pool_mark_active(eb.batch_pool, eb.request);
-
-	trace_i915_request_queue(eb.request, eb.batch_flags);
-	err = eb_submit(&eb, batch);
+	err = eb_submit(&eb);
 
 err_request:
-	i915_request_get(eb.request);
-	err = eb_request_add(&eb, err);
+	eb_requests_get(&eb);
+	err = eb_requests_add(&eb, err);
 
 	if (eb.fences)
-		signal_fence_array(&eb);
+		signal_fence_array(&eb, eb.composite_fence ?
+				   eb.composite_fence :
+				   &eb.requests[0]->fence);
 
 	if (out_fence) {
 		if (err == 0) {
@@ -2990,10 +3260,15 @@  i915_gem_do_execbuffer(struct drm_device *dev,
 
 	if (unlikely(eb.gem_context->syncobj)) {
 		drm_syncobj_replace_fence(eb.gem_context->syncobj,
-					  &eb.request->fence);
+					  eb.composite_fence ?
+					  eb.composite_fence :
+					  &eb.requests[0]->fence);
 	}
 
-	i915_request_put(eb.request);
+	if (!out_fence && eb.composite_fence)
+		dma_fence_put(eb.composite_fence);
+
+	eb_requests_put(&eb);
 
 err_vma:
 	eb_release_vmas(&eb, true);
diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h
index 9dcc1b14697b..1f6a5ae3e33e 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.h
+++ b/drivers/gpu/drm/i915/gt/intel_context.h
@@ -237,7 +237,13 @@  intel_context_timeline_lock(struct intel_context *ce)
 	struct intel_timeline *tl = ce->timeline;
 	int err;
 
-	err = mutex_lock_interruptible(&tl->mutex);
+	if (intel_context_is_parent(ce))
+		err = mutex_lock_interruptible_nested(&tl->mutex, 0);
+	else if (intel_context_is_child(ce))
+		err = mutex_lock_interruptible_nested(&tl->mutex,
+						      ce->guc_child_index + 1);
+	else
+		err = mutex_lock_interruptible(&tl->mutex);
 	if (err)
 		return ERR_PTR(err);
 
diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
index 727f91e7f7c2..094fcfb5cbe1 100644
--- a/drivers/gpu/drm/i915/gt/intel_context_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
@@ -261,6 +261,18 @@  struct intel_context {
 		 * context.
 		 */
 		struct i915_request *last_rq;
+
+		/**
+		 * @fence_context: fence context for the composite fence when
+		 * doing parallel submission
+		 */
+		u64 fence_context;
+
+		/**
+		 * @seqno: seqno for composite fence when doing parallel
+		 * submission
+		 */
+		u32 seqno;
 	};
 
 #ifdef CONFIG_DRM_I915_SELFTEST
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 1a18f99bf12a..2ef38557b0f0 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -3034,6 +3034,8 @@  guc_create_parallel(struct intel_engine_cs **engines,
 		}
 	}
 
+	parent->fence_context = dma_fence_context_alloc(1);
+
 	parent->engine->emit_bb_start =
 		emit_bb_start_parent_no_preempt_mid_batch;
 	parent->engine->emit_fini_breadcrumb =
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index 8f0073e19079..602cc246ba85 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -147,6 +147,15 @@  enum {
 	 * tail.
 	 */
 	I915_FENCE_FLAG_SUBMIT_PARALLEL,
+
+	/*
+	 * I915_FENCE_FLAG_SKIP_PARALLEL - request with a context in a
+	 * parent-child relationship (parallel submission, multi-lrc) that
+	 * hit an error while generating requests in the execbuf IOCTL.
+	 * Indicates this request should be skipped as another request in
+	 * submission / relationship encountered an error.
+	 */
+	I915_FENCE_FLAG_SKIP_PARALLEL,
 };
 
 /**
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index 4b7fc4647e46..90546fa58fc1 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -1234,9 +1234,10 @@  int __i915_vma_move_to_active(struct i915_vma *vma, struct i915_request *rq)
 	return i915_active_add_request(&vma->active, rq);
 }
 
-int i915_vma_move_to_active(struct i915_vma *vma,
-			    struct i915_request *rq,
-			    unsigned int flags)
+int _i915_vma_move_to_active(struct i915_vma *vma,
+			     struct i915_request *rq,
+			     struct dma_fence *fence,
+			     unsigned int flags)
 {
 	struct drm_i915_gem_object *obj = vma->obj;
 	int err;
@@ -1257,9 +1258,11 @@  int i915_vma_move_to_active(struct i915_vma *vma,
 			intel_frontbuffer_put(front);
 		}
 
-		dma_resv_add_excl_fence(vma->resv, &rq->fence);
-		obj->write_domain = I915_GEM_DOMAIN_RENDER;
-		obj->read_domains = 0;
+		if (fence) {
+			dma_resv_add_excl_fence(vma->resv, fence);
+			obj->write_domain = I915_GEM_DOMAIN_RENDER;
+			obj->read_domains = 0;
+		}
 	} else {
 		if (!(flags & __EXEC_OBJECT_NO_RESERVE)) {
 			err = dma_resv_reserve_shared(vma->resv, 1);
@@ -1267,8 +1270,10 @@  int i915_vma_move_to_active(struct i915_vma *vma,
 				return err;
 		}
 
-		dma_resv_add_shared_fence(vma->resv, &rq->fence);
-		obj->write_domain = 0;
+		if (fence) {
+			dma_resv_add_shared_fence(vma->resv, fence);
+			obj->write_domain = 0;
+		}
 	}
 
 	if (flags & EXEC_OBJECT_NEEDS_FENCE && vma->fence)
diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
index ed69f66c7ab0..648dbe744c96 100644
--- a/drivers/gpu/drm/i915/i915_vma.h
+++ b/drivers/gpu/drm/i915/i915_vma.h
@@ -57,9 +57,16 @@  static inline bool i915_vma_is_active(const struct i915_vma *vma)
 
 int __must_check __i915_vma_move_to_active(struct i915_vma *vma,
 					   struct i915_request *rq);
-int __must_check i915_vma_move_to_active(struct i915_vma *vma,
-					 struct i915_request *rq,
-					 unsigned int flags);
+int __must_check _i915_vma_move_to_active(struct i915_vma *vma,
+					  struct i915_request *rq,
+					  struct dma_fence *fence,
+					  unsigned int flags);
+/*
+ * Convenience wrapper around _i915_vma_move_to_active() that passes the
+ * request's own fence as the fence argument.
+ */
+static inline int __must_check
+i915_vma_move_to_active(struct i915_vma *vma, struct i915_request *rq,
+			unsigned int flags)
+{
+	struct dma_fence *fence = &rq->fence;
+
+	return _i915_vma_move_to_active(vma, rq, fence, flags);
+}
 
 #define __i915_vma_flags(v) ((unsigned long *)&(v)->flags.counter)