diff mbox series

[24/26] drm/i915: Update I915_GEM_BUSY IOCTL to understand composite fences

Message ID 20211004220637.14746-25-matthew.brost@intel.com (mailing list archive)
State New, archived
Headers show
Series Parallel submission aka multi-bb execbuf | expand

Commit Message

Matthew Brost Oct. 4, 2021, 10:06 p.m. UTC
Parallel submission create composite fences (dma_fence_array) for excl /
shared slots in objects. The I915_GEM_BUSY IOCTL checks these slots to
determine the busyness of the object. Prior to patch it only check if
the fence in the slot was a i915_request. Update the check to understand
composite fences and correctly report the busyness.

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/i915/gem/i915_gem_busy.c      | 60 +++++++++++++++----
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c    |  5 +-
 drivers/gpu/drm/i915/i915_request.h           |  6 ++
 3 files changed, 58 insertions(+), 13 deletions(-)

Comments

Daniele Ceraolo Spurio Oct. 11, 2021, 10:15 p.m. UTC | #1
On 10/4/2021 3:06 PM, Matthew Brost wrote:
> Parallel submission create composite fences (dma_fence_array) for excl /
> shared slots in objects. The I915_GEM_BUSY IOCTL checks these slots to
> determine the busyness of the object. Prior to patch it only check if
> the fence in the slot was a i915_request. Update the check to understand
> composite fences and correctly report the busyness.
>
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>

Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>

Daniele

> ---
>   drivers/gpu/drm/i915/gem/i915_gem_busy.c      | 60 +++++++++++++++----
>   .../gpu/drm/i915/gem/i915_gem_execbuffer.c    |  5 +-
>   drivers/gpu/drm/i915/i915_request.h           |  6 ++
>   3 files changed, 58 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_busy.c b/drivers/gpu/drm/i915/gem/i915_gem_busy.c
> index 6234e17259c1..b89d173c62eb 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_busy.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_busy.c
> @@ -4,6 +4,8 @@
>    * Copyright © 2014-2016 Intel Corporation
>    */
>   
> +#include <linux/dma-fence-array.h>
> +
>   #include "gt/intel_engine.h"
>   
>   #include "i915_gem_ioctls.h"
> @@ -36,7 +38,7 @@ static __always_inline u32 __busy_write_id(u16 id)
>   }
>   
>   static __always_inline unsigned int
> -__busy_set_if_active(const struct dma_fence *fence, u32 (*flag)(u16 id))
> +__busy_set_if_active(struct dma_fence *fence, u32 (*flag)(u16 id))
>   {
>   	const struct i915_request *rq;
>   
> @@ -46,29 +48,63 @@ __busy_set_if_active(const struct dma_fence *fence, u32 (*flag)(u16 id))
>   	 * to eventually flush us, but to minimise latency just ask the
>   	 * hardware.
>   	 *
> -	 * Note we only report on the status of native fences.
> +	 * Note we only report on the status of native fences and we currently
> +	 * have two native fences:
> +	 *
> +	 * 1. A composite fence (dma_fence_array) constructed of i915 requests
> +	 * created during a parallel submission. In this case we deconstruct the
> +	 * composite fence into individual i915 requests and check the status of
> +	 * each request.
> +	 *
> +	 * 2. A single i915 request.
>   	 */
> -	if (!dma_fence_is_i915(fence))
> +	if (dma_fence_is_array(fence)) {
> +		struct dma_fence_array *array = to_dma_fence_array(fence);
> +		struct dma_fence **child = array->fences;
> +		unsigned int nchild = array->num_fences;
> +
> +		do {
> +			struct dma_fence *current_fence = *child++;
> +
> +			/* Not an i915 fence, can't be busy per above */
> +			if (!dma_fence_is_i915(current_fence) ||
> +			    !test_bit(I915_FENCE_FLAG_COMPOSITE,
> +				      &current_fence->flags)) {
> +				return 0;
> +			}
> +
> +			rq = to_request(current_fence);
> +			if (!i915_request_completed(rq)) {
> +				BUILD_BUG_ON(!typecheck(u16,
> +							rq->engine->uabi_class));
> +				return flag(rq->engine->uabi_class);
> +			}
> +		} while (--nchild);
> +
> +		/* All requests in array complete, not busy */
>   		return 0;
> +	} else {
> +		if (!dma_fence_is_i915(fence))
> +			return 0;
>   
> -	/* opencode to_request() in order to avoid const warnings */
> -	rq = container_of(fence, const struct i915_request, fence);
> -	if (i915_request_completed(rq))
> -		return 0;
> +		rq = to_request(fence);
> +		if (i915_request_completed(rq))
> +			return 0;
>   
> -	/* Beware type-expansion follies! */
> -	BUILD_BUG_ON(!typecheck(u16, rq->engine->uabi_class));
> -	return flag(rq->engine->uabi_class);
> +		/* Beware type-expansion follies! */
> +		BUILD_BUG_ON(!typecheck(u16, rq->engine->uabi_class));
> +		return flag(rq->engine->uabi_class);
> +	}
>   }
>   
>   static __always_inline unsigned int
> -busy_check_reader(const struct dma_fence *fence)
> +busy_check_reader(struct dma_fence *fence)
>   {
>   	return __busy_set_if_active(fence, __busy_read_flag);
>   }
>   
>   static __always_inline unsigned int
> -busy_check_writer(const struct dma_fence *fence)
> +busy_check_writer(struct dma_fence *fence)
>   {
>   	if (!fence)
>   		return 0;
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> index 5c7fb6f68bbb..16276f406fd6 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> @@ -2988,8 +2988,11 @@ eb_composite_fence_create(struct i915_execbuffer *eb, int out_fence_fd)
>   	if (!fences)
>   		return ERR_PTR(-ENOMEM);
>   
> -	for_each_batch_create_order(eb, i)
> +	for_each_batch_create_order(eb, i) {
>   		fences[i] = &eb->requests[i]->fence;
> +		__set_bit(I915_FENCE_FLAG_COMPOSITE,
> +			  &eb->requests[i]->fence.flags);
> +	}
>   
>   	fence_array = dma_fence_array_create(eb->num_batches,
>   					     fences,
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index 24db8459376b..dc359242d1ae 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -156,6 +156,12 @@ enum {
>   	 * submission / relationship encoutered an error.
>   	 */
>   	I915_FENCE_FLAG_SKIP_PARALLEL,
> +
> +	/*
> +	 * I915_FENCE_FLAG_COMPOSITE - Indicates fence is part of a composite
> +	 * fence (dma_fence_array) and i915 generated for parallel submission.
> +	 */
> +	I915_FENCE_FLAG_COMPOSITE,
>   };
>   
>   /**
Tvrtko Ursulin Oct. 12, 2021, 7:53 a.m. UTC | #2
On 04/10/2021 23:06, Matthew Brost wrote:
> Parallel submission create composite fences (dma_fence_array) for excl /
> shared slots in objects. The I915_GEM_BUSY IOCTL checks these slots to
> determine the busyness of the object. Prior to patch it only check if
> the fence in the slot was a i915_request. Update the check to understand
> composite fences and correctly report the busyness.
> 
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>   drivers/gpu/drm/i915/gem/i915_gem_busy.c      | 60 +++++++++++++++----
>   .../gpu/drm/i915/gem/i915_gem_execbuffer.c    |  5 +-
>   drivers/gpu/drm/i915/i915_request.h           |  6 ++
>   3 files changed, 58 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_busy.c b/drivers/gpu/drm/i915/gem/i915_gem_busy.c
> index 6234e17259c1..b89d173c62eb 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_busy.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_busy.c
> @@ -4,6 +4,8 @@
>    * Copyright © 2014-2016 Intel Corporation
>    */
>   
> +#include <linux/dma-fence-array.h>
> +
>   #include "gt/intel_engine.h"
>   
>   #include "i915_gem_ioctls.h"
> @@ -36,7 +38,7 @@ static __always_inline u32 __busy_write_id(u16 id)
>   }
>   
>   static __always_inline unsigned int
> -__busy_set_if_active(const struct dma_fence *fence, u32 (*flag)(u16 id))
> +__busy_set_if_active(struct dma_fence *fence, u32 (*flag)(u16 id))
>   {
>   	const struct i915_request *rq;
>   
> @@ -46,29 +48,63 @@ __busy_set_if_active(const struct dma_fence *fence, u32 (*flag)(u16 id))
>   	 * to eventually flush us, but to minimise latency just ask the
>   	 * hardware.
>   	 *
> -	 * Note we only report on the status of native fences.
> +	 * Note we only report on the status of native fences and we currently
> +	 * have two native fences:
> +	 *
> +	 * 1. A composite fence (dma_fence_array) constructed of i915 requests
> +	 * created during a parallel submission. In this case we deconstruct the
> +	 * composite fence into individual i915 requests and check the status of
> +	 * each request.
> +	 *
> +	 * 2. A single i915 request.
>   	 */
> -	if (!dma_fence_is_i915(fence))
> +	if (dma_fence_is_array(fence)) {
> +		struct dma_fence_array *array = to_dma_fence_array(fence);
> +		struct dma_fence **child = array->fences;
> +		unsigned int nchild = array->num_fences;
> +
> +		do {
> +			struct dma_fence *current_fence = *child++;
> +
> +			/* Not an i915 fence, can't be busy per above */
> +			if (!dma_fence_is_i915(current_fence) ||
> +			    !test_bit(I915_FENCE_FLAG_COMPOSITE,
> +				      &current_fence->flags)) {
> +				return 0;
> +			}
> +
> +			rq = to_request(current_fence);
> +			if (!i915_request_completed(rq)) {
> +				BUILD_BUG_ON(!typecheck(u16,
> +							rq->engine->uabi_class));
> +				return flag(rq->engine->uabi_class);
> +			}
> +		} while (--nchild);

Do you even need to introduce I915_FENCE_FLAG_COMPOSITE? If parallel 
submit is the only possible creator of array fences then possibly not. 
Probably even would result in less code which even keeps working in a 
hypothetical future. Otherwise you could add a debug bug on if array 
fence contains a fence without I915_FENCE_FLAG_COMPOSITE set.

Secondly, I'd also run the whole loop and not return on first busy or 
incompatible for simplicity.

And finally, with all above in place, I think you could have common 
function for the below (checking one fence) and call that both for a 
single fence and from an array loop above for less duplication. (Even 
duplicated BUILD_BUG_ON which makes no sense!)

End result would be a simpler patch like:

__busy_set_if_active_one(...)
{
    .. existing __busy_set_if_active ..
}

__busy_set_if_active(..)
{
   ...
   if (dma_fence_is_array(fence)) {
	...
	for (i = 0; i < array->num_fences; i++)
		flags |= __busy_set_if_active_one(...);
   } else {
	flags = __busy_set_if_active_one(...);
   }

Regards,

Tvrtko

> +
> +		/* All requests in array complete, not busy */
>   		return 0;
> +	} else {
> +		if (!dma_fence_is_i915(fence))
> +			return 0;
>   
> -	/* opencode to_request() in order to avoid const warnings */
> -	rq = container_of(fence, const struct i915_request, fence);
> -	if (i915_request_completed(rq))
> -		return 0;
> +		rq = to_request(fence);
> +		if (i915_request_completed(rq))
> +			return 0;
>   
> -	/* Beware type-expansion follies! */
> -	BUILD_BUG_ON(!typecheck(u16, rq->engine->uabi_class));
> -	return flag(rq->engine->uabi_class);
> +		/* Beware type-expansion follies! */
> +		BUILD_BUG_ON(!typecheck(u16, rq->engine->uabi_class));
> +		return flag(rq->engine->uabi_class);
> +	}
>   }
>   
>   static __always_inline unsigned int
> -busy_check_reader(const struct dma_fence *fence)
> +busy_check_reader(struct dma_fence *fence)
>   {
>   	return __busy_set_if_active(fence, __busy_read_flag);
>   }
>   
>   static __always_inline unsigned int
> -busy_check_writer(const struct dma_fence *fence)
> +busy_check_writer(struct dma_fence *fence)
>   {
>   	if (!fence)
>   		return 0;
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> index 5c7fb6f68bbb..16276f406fd6 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> @@ -2988,8 +2988,11 @@ eb_composite_fence_create(struct i915_execbuffer *eb, int out_fence_fd)
>   	if (!fences)
>   		return ERR_PTR(-ENOMEM);
>   
> -	for_each_batch_create_order(eb, i)
> +	for_each_batch_create_order(eb, i) {
>   		fences[i] = &eb->requests[i]->fence;
> +		__set_bit(I915_FENCE_FLAG_COMPOSITE,
> +			  &eb->requests[i]->fence.flags);
> +	}
>   
>   	fence_array = dma_fence_array_create(eb->num_batches,
>   					     fences,
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index 24db8459376b..dc359242d1ae 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -156,6 +156,12 @@ enum {
>   	 * submission / relationship encoutered an error.
>   	 */
>   	I915_FENCE_FLAG_SKIP_PARALLEL,
> +
> +	/*
> +	 * I915_FENCE_FLAG_COMPOSITE - Indicates fence is part of a composite
> +	 * fence (dma_fence_array) and i915 generated for parallel submission.
> +	 */
> +	I915_FENCE_FLAG_COMPOSITE,
>   };
>   
>   /**
>
Matthew Brost Oct. 12, 2021, 6:31 p.m. UTC | #3
On Tue, Oct 12, 2021 at 08:53:25AM +0100, Tvrtko Ursulin wrote:
> 
> On 04/10/2021 23:06, Matthew Brost wrote:
> > Parallel submission create composite fences (dma_fence_array) for excl /
> > shared slots in objects. The I915_GEM_BUSY IOCTL checks these slots to
> > determine the busyness of the object. Prior to patch it only check if
> > the fence in the slot was a i915_request. Update the check to understand
> > composite fences and correctly report the busyness.
> > 
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> >   drivers/gpu/drm/i915/gem/i915_gem_busy.c      | 60 +++++++++++++++----
> >   .../gpu/drm/i915/gem/i915_gem_execbuffer.c    |  5 +-
> >   drivers/gpu/drm/i915/i915_request.h           |  6 ++
> >   3 files changed, 58 insertions(+), 13 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_busy.c b/drivers/gpu/drm/i915/gem/i915_gem_busy.c
> > index 6234e17259c1..b89d173c62eb 100644
> > --- a/drivers/gpu/drm/i915/gem/i915_gem_busy.c
> > +++ b/drivers/gpu/drm/i915/gem/i915_gem_busy.c
> > @@ -4,6 +4,8 @@
> >    * Copyright © 2014-2016 Intel Corporation
> >    */
> > +#include <linux/dma-fence-array.h>
> > +
> >   #include "gt/intel_engine.h"
> >   #include "i915_gem_ioctls.h"
> > @@ -36,7 +38,7 @@ static __always_inline u32 __busy_write_id(u16 id)
> >   }
> >   static __always_inline unsigned int
> > -__busy_set_if_active(const struct dma_fence *fence, u32 (*flag)(u16 id))
> > +__busy_set_if_active(struct dma_fence *fence, u32 (*flag)(u16 id))
> >   {
> >   	const struct i915_request *rq;
> > @@ -46,29 +48,63 @@ __busy_set_if_active(const struct dma_fence *fence, u32 (*flag)(u16 id))
> >   	 * to eventually flush us, but to minimise latency just ask the
> >   	 * hardware.
> >   	 *
> > -	 * Note we only report on the status of native fences.
> > +	 * Note we only report on the status of native fences and we currently
> > +	 * have two native fences:
> > +	 *
> > +	 * 1. A composite fence (dma_fence_array) constructed of i915 requests
> > +	 * created during a parallel submission. In this case we deconstruct the
> > +	 * composite fence into individual i915 requests and check the status of
> > +	 * each request.
> > +	 *
> > +	 * 2. A single i915 request.
> >   	 */
> > -	if (!dma_fence_is_i915(fence))
> > +	if (dma_fence_is_array(fence)) {
> > +		struct dma_fence_array *array = to_dma_fence_array(fence);
> > +		struct dma_fence **child = array->fences;
> > +		unsigned int nchild = array->num_fences;
> > +
> > +		do {
> > +			struct dma_fence *current_fence = *child++;
> > +
> > +			/* Not an i915 fence, can't be busy per above */
> > +			if (!dma_fence_is_i915(current_fence) ||
> > +			    !test_bit(I915_FENCE_FLAG_COMPOSITE,
> > +				      &current_fence->flags)) {
> > +				return 0;
> > +			}
> > +
> > +			rq = to_request(current_fence);
> > +			if (!i915_request_completed(rq)) {
> > +				BUILD_BUG_ON(!typecheck(u16,
> > +							rq->engine->uabi_class));
> > +				return flag(rq->engine->uabi_class);
> > +			}
> > +		} while (--nchild);
> 
> Do you even need to introduce I915_FENCE_FLAG_COMPOSITE? If parallel submit
> is the only possible creator of array fences then possibly not. Probably
> even would result in less code which even keeps working in a hypothetical
> future. Otherwise you could add a debug bug on if array fence contains a
> fence without I915_FENCE_FLAG_COMPOSITE set.
> 

Certainly other drivers can create a dma fence array and in theory could
include a i915_request in that array. Adding this flag makes it clear
that this fence was created by i915 for parallel submission and future
proofs this code.

> Secondly, I'd also run the whole loop and not return on first busy or
> incompatible for simplicity.
> 

I disagree. Short circuiting when a condition is found is pretty
standard and not hard to understand.

> And finally, with all above in place, I think you could have common function
> for the below (checking one fence) and call that both for a single fence and
> from an array loop above for less duplication. (Even duplicated BUILD_BUG_ON
> which makes no sense!)
>

Yea duplicating the BUILD_BUG_ON doesn't make a ton of sense. Will
remove.

Disagree on the helper, the code paths are different enough to just open
code this.

Matt

> End result would be a simpler patch like:
> 
> __busy_set_if_active_one(...)
> {
>    .. existing __busy_set_if_active ..
> }
> 
> __busy_set_if_active(..)
> {
>   ...
>   if (dma_fence_is_array(fence)) {
> 	...
> 	for (i = 0; i < array->num_fences; i++)
> 		flags |= __busy_set_if_active_one(...);
>   } else {
> 	flags = __busy_set_if_active_one(...);
>   }
> 
> Regards,
> 
> Tvrtko
> 
> > +
> > +		/* All requests in array complete, not busy */
> >   		return 0;
> > +	} else {
> > +		if (!dma_fence_is_i915(fence))
> > +			return 0;
> > -	/* opencode to_request() in order to avoid const warnings */
> > -	rq = container_of(fence, const struct i915_request, fence);
> > -	if (i915_request_completed(rq))
> > -		return 0;
> > +		rq = to_request(fence);
> > +		if (i915_request_completed(rq))
> > +			return 0;
> > -	/* Beware type-expansion follies! */
> > -	BUILD_BUG_ON(!typecheck(u16, rq->engine->uabi_class));
> > -	return flag(rq->engine->uabi_class);
> > +		/* Beware type-expansion follies! */
> > +		BUILD_BUG_ON(!typecheck(u16, rq->engine->uabi_class));
> > +		return flag(rq->engine->uabi_class);
> > +	}
> >   }
> >   static __always_inline unsigned int
> > -busy_check_reader(const struct dma_fence *fence)
> > +busy_check_reader(struct dma_fence *fence)
> >   {
> >   	return __busy_set_if_active(fence, __busy_read_flag);
> >   }
> >   static __always_inline unsigned int
> > -busy_check_writer(const struct dma_fence *fence)
> > +busy_check_writer(struct dma_fence *fence)
> >   {
> >   	if (!fence)
> >   		return 0;
> > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> > index 5c7fb6f68bbb..16276f406fd6 100644
> > --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> > +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> > @@ -2988,8 +2988,11 @@ eb_composite_fence_create(struct i915_execbuffer *eb, int out_fence_fd)
> >   	if (!fences)
> >   		return ERR_PTR(-ENOMEM);
> > -	for_each_batch_create_order(eb, i)
> > +	for_each_batch_create_order(eb, i) {
> >   		fences[i] = &eb->requests[i]->fence;
> > +		__set_bit(I915_FENCE_FLAG_COMPOSITE,
> > +			  &eb->requests[i]->fence.flags);
> > +	}
> >   	fence_array = dma_fence_array_create(eb->num_batches,
> >   					     fences,
> > diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> > index 24db8459376b..dc359242d1ae 100644
> > --- a/drivers/gpu/drm/i915/i915_request.h
> > +++ b/drivers/gpu/drm/i915/i915_request.h
> > @@ -156,6 +156,12 @@ enum {
> >   	 * submission / relationship encoutered an error.
> >   	 */
> >   	I915_FENCE_FLAG_SKIP_PARALLEL,
> > +
> > +	/*
> > +	 * I915_FENCE_FLAG_COMPOSITE - Indicates fence is part of a composite
> > +	 * fence (dma_fence_array) and i915 generated for parallel submission.
> > +	 */
> > +	I915_FENCE_FLAG_COMPOSITE,
> >   };
> >   /**
> >
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_busy.c b/drivers/gpu/drm/i915/gem/i915_gem_busy.c
index 6234e17259c1..b89d173c62eb 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_busy.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_busy.c
@@ -4,6 +4,8 @@ 
  * Copyright © 2014-2016 Intel Corporation
  */
 
+#include <linux/dma-fence-array.h>
+
 #include "gt/intel_engine.h"
 
 #include "i915_gem_ioctls.h"
@@ -36,7 +38,7 @@  static __always_inline u32 __busy_write_id(u16 id)
 }
 
 static __always_inline unsigned int
-__busy_set_if_active(const struct dma_fence *fence, u32 (*flag)(u16 id))
+__busy_set_if_active(struct dma_fence *fence, u32 (*flag)(u16 id))
 {
 	const struct i915_request *rq;
 
@@ -46,29 +48,63 @@  __busy_set_if_active(const struct dma_fence *fence, u32 (*flag)(u16 id))
 	 * to eventually flush us, but to minimise latency just ask the
 	 * hardware.
 	 *
-	 * Note we only report on the status of native fences.
+	 * Note we only report on the status of native fences and we currently
+	 * have two native fences:
+	 *
+	 * 1. A composite fence (dma_fence_array) constructed of i915 requests
+	 * created during a parallel submission. In this case we deconstruct the
+	 * composite fence into individual i915 requests and check the status of
+	 * each request.
+	 *
+	 * 2. A single i915 request.
 	 */
-	if (!dma_fence_is_i915(fence))
+	if (dma_fence_is_array(fence)) {
+		struct dma_fence_array *array = to_dma_fence_array(fence);
+		struct dma_fence **child = array->fences;
+		unsigned int nchild = array->num_fences;
+
+		do {
+			struct dma_fence *current_fence = *child++;
+
+			/* Not an i915 fence, can't be busy per above */
+			if (!dma_fence_is_i915(current_fence) ||
+			    !test_bit(I915_FENCE_FLAG_COMPOSITE,
+				      &current_fence->flags)) {
+				return 0;
+			}
+
+			rq = to_request(current_fence);
+			if (!i915_request_completed(rq)) {
+				BUILD_BUG_ON(!typecheck(u16,
+							rq->engine->uabi_class));
+				return flag(rq->engine->uabi_class);
+			}
+		} while (--nchild);
+
+		/* All requests in array complete, not busy */
 		return 0;
+	} else {
+		if (!dma_fence_is_i915(fence))
+			return 0;
 
-	/* opencode to_request() in order to avoid const warnings */
-	rq = container_of(fence, const struct i915_request, fence);
-	if (i915_request_completed(rq))
-		return 0;
+		rq = to_request(fence);
+		if (i915_request_completed(rq))
+			return 0;
 
-	/* Beware type-expansion follies! */
-	BUILD_BUG_ON(!typecheck(u16, rq->engine->uabi_class));
-	return flag(rq->engine->uabi_class);
+		/* Beware type-expansion follies! */
+		BUILD_BUG_ON(!typecheck(u16, rq->engine->uabi_class));
+		return flag(rq->engine->uabi_class);
+	}
 }
 
 static __always_inline unsigned int
-busy_check_reader(const struct dma_fence *fence)
+busy_check_reader(struct dma_fence *fence)
 {
 	return __busy_set_if_active(fence, __busy_read_flag);
 }
 
 static __always_inline unsigned int
-busy_check_writer(const struct dma_fence *fence)
+busy_check_writer(struct dma_fence *fence)
 {
 	if (!fence)
 		return 0;
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 5c7fb6f68bbb..16276f406fd6 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -2988,8 +2988,11 @@  eb_composite_fence_create(struct i915_execbuffer *eb, int out_fence_fd)
 	if (!fences)
 		return ERR_PTR(-ENOMEM);
 
-	for_each_batch_create_order(eb, i)
+	for_each_batch_create_order(eb, i) {
 		fences[i] = &eb->requests[i]->fence;
+		__set_bit(I915_FENCE_FLAG_COMPOSITE,
+			  &eb->requests[i]->fence.flags);
+	}
 
 	fence_array = dma_fence_array_create(eb->num_batches,
 					     fences,
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index 24db8459376b..dc359242d1ae 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -156,6 +156,12 @@  enum {
 	 * submission / relationship encoutered an error.
 	 */
 	I915_FENCE_FLAG_SKIP_PARALLEL,
+
+	/*
+	 * I915_FENCE_FLAG_COMPOSITE - Indicates fence is part of a composite
+	 * fence (dma_fence_array) and i915 generated for parallel submission.
+	 */
+	I915_FENCE_FLAG_COMPOSITE,
 };
 
 /**