
[09/13] drm/msm: Use the DRM common Scheduler

Message ID 1538397105-19581-10-git-send-email-smasetty@codeaurora.org (mailing list archive)
State New, archived
Series drm/msm: Hook up the DRM gpu scheduler

Commit Message

Sharat Masetty Oct. 1, 2018, 12:31 p.m. UTC
This patch hooks up the DRM gpu scheduler to the msm DRM driver. The
most noticeable changes to the driver are as follows. The submit path is
split into two parts: in the user context the submit (job) is created and
added to one of the entity's scheduler run queues. The scheduler then
tries to drain the queue by submitting the jobs to the hardware to act upon.
The submit job sits on the scheduler queue until all of its dependent
fences have been successfully waited upon.

We have one scheduler instance per ring. Each submitqueue hosts a
scheduler entity object, and this entity is mapped to the scheduler's
default runqueue. This should be good enough for now, but in the future it is
possible to extend the API to allow scheduling amongst the submitqueues on the
same ring.

With this patch the role of the struct_mutex lock in the submit path is
greatly reduced in scope; it is now needed only when actually putting the job
on the ringbuffer. This should enable increased parallelism in the driver,
which should hopefully translate to better overall performance.

Signed-off-by: Sharat Masetty <smasetty@codeaurora.org>
---
 drivers/gpu/drm/msm/Kconfig           |   1 +
 drivers/gpu/drm/msm/Makefile          |   3 +-
 drivers/gpu/drm/msm/msm_drv.h         |   3 +-
 drivers/gpu/drm/msm/msm_gem.c         |   8 +-
 drivers/gpu/drm/msm/msm_gem.h         |   6 +
 drivers/gpu/drm/msm/msm_gem_submit.c  | 138 +++++++++------
 drivers/gpu/drm/msm/msm_gpu.c         |  72 ++++++--
 drivers/gpu/drm/msm/msm_gpu.h         |   2 +
 drivers/gpu/drm/msm/msm_ringbuffer.c  |   7 +
 drivers/gpu/drm/msm/msm_ringbuffer.h  |   3 +
 drivers/gpu/drm/msm/msm_sched.c       | 313 ++++++++++++++++++++++++++++++++++
 drivers/gpu/drm/msm/msm_sched.h       |  18 ++
 drivers/gpu/drm/msm/msm_submitqueue.c |  18 +-
 13 files changed, 507 insertions(+), 85 deletions(-)
 create mode 100644 drivers/gpu/drm/msm/msm_sched.c
 create mode 100644 drivers/gpu/drm/msm/msm_sched.h

Comments

Jordan Crouse Oct. 1, 2018, 7:02 p.m. UTC | #1
On Mon, Oct 01, 2018 at 06:01:41PM +0530, Sharat Masetty wrote:
> This patch hooks up the DRM gpu scheduler to the msm DRM driver. The
> most noticeable changes to the driver are as follows. The submit path is
> split into two parts, in the user context the submit(job) is created and
> added to one of the entity's scheduler run queue. The scheduler then
> tries to drain the queue by submitting the jobs the hardware to act upon.
> The submit job sits on the scheduler queue until all the dependent
> fences are waited upon successfully.
> 
> We have one scheduler instance per ring. The submitqueues will host a
> scheduler entity object. This entity will be mapped to the scheduler's
> default runqueue. This should be good for now, but in future it is possible
> to extend the API to allow for scheduling amongst the submitqueues on the
> same ring.
> 
> With this patch the role of the struct_mutex lock has been greatly reduced in
> scope in the submit path, evidently when actually putting the job on the
> ringbuffer. This should enable us with increased parallelism in the
> driver which should translate to better performance overall hopefully.
> 
> Signed-off-by: Sharat Masetty <smasetty@codeaurora.org>
> ---
>  drivers/gpu/drm/msm/Kconfig           |   1 +
>  drivers/gpu/drm/msm/Makefile          |   3 +-
>  drivers/gpu/drm/msm/msm_drv.h         |   3 +-
>  drivers/gpu/drm/msm/msm_gem.c         |   8 +-
>  drivers/gpu/drm/msm/msm_gem.h         |   6 +
>  drivers/gpu/drm/msm/msm_gem_submit.c  | 138 +++++++++------
>  drivers/gpu/drm/msm/msm_gpu.c         |  72 ++++++--
>  drivers/gpu/drm/msm/msm_gpu.h         |   2 +
>  drivers/gpu/drm/msm/msm_ringbuffer.c  |   7 +
>  drivers/gpu/drm/msm/msm_ringbuffer.h  |   3 +
>  drivers/gpu/drm/msm/msm_sched.c       | 313 ++++++++++++++++++++++++++++++++++
>  drivers/gpu/drm/msm/msm_sched.h       |  18 ++
>  drivers/gpu/drm/msm/msm_submitqueue.c |  18 +-
>  13 files changed, 507 insertions(+), 85 deletions(-)
>  create mode 100644 drivers/gpu/drm/msm/msm_sched.c
>  create mode 100644 drivers/gpu/drm/msm/msm_sched.h
> 
> diff --git a/drivers/gpu/drm/msm/Kconfig b/drivers/gpu/drm/msm/Kconfig
> index 38cbde9..0d01810 100644
> --- a/drivers/gpu/drm/msm/Kconfig
> +++ b/drivers/gpu/drm/msm/Kconfig
> @@ -15,6 +15,7 @@ config DRM_MSM
>  	select SND_SOC_HDMI_CODEC if SND_SOC
>  	select SYNC_FILE
>  	select PM_OPP
> +	select DRM_SCHED
>  	default y
>  	help
>  	  DRM/KMS driver for MSM/snapdragon.
> diff --git a/drivers/gpu/drm/msm/Makefile b/drivers/gpu/drm/msm/Makefile
> index cd40c05..71ed921 100644
> --- a/drivers/gpu/drm/msm/Makefile
> +++ b/drivers/gpu/drm/msm/Makefile
> @@ -60,7 +60,8 @@ msm-y := \
>  	msm_perf.o \
>  	msm_rd.o \
>  	msm_ringbuffer.o \
> -	msm_submitqueue.o
> +	msm_submitqueue.o \
> +	msm_sched.o
>  
>  msm-$(CONFIG_DEBUG_FS) += adreno/a5xx_debugfs.o
>  
> diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
> index b2da1fb..e461a9c 100644
> --- a/drivers/gpu/drm/msm/msm_drv.h
> +++ b/drivers/gpu/drm/msm/msm_drv.h
> @@ -213,8 +213,7 @@ struct drm_gem_object *msm_gem_prime_import_sg_table(struct drm_device *dev,
>  int msm_gem_madvise(struct drm_gem_object *obj, unsigned madv);
>  int msm_gem_sync_object(struct drm_gem_object *obj,
>  		struct msm_fence_context *fctx, bool exclusive);
> -void msm_gem_move_to_active(struct drm_gem_object *obj,
> -		struct msm_gpu *gpu, bool exclusive, struct dma_fence *fence);
> +void msm_gem_move_to_active(struct drm_gem_object *obj, struct msm_gpu *gpu);
>  void msm_gem_move_to_inactive(struct drm_gem_object *obj);
>  int msm_gem_cpu_prep(struct drm_gem_object *obj, uint32_t op, ktime_t *timeout);
>  int msm_gem_cpu_fini(struct drm_gem_object *obj);
> diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
> index f583bb4..7a12f30 100644
> --- a/drivers/gpu/drm/msm/msm_gem.c
> +++ b/drivers/gpu/drm/msm/msm_gem.c
> @@ -663,16 +663,12 @@ int msm_gem_sync_object(struct drm_gem_object *obj,
>  	return 0;
>  }
>  
> -void msm_gem_move_to_active(struct drm_gem_object *obj,
> -		struct msm_gpu *gpu, bool exclusive, struct dma_fence *fence)
> +void msm_gem_move_to_active(struct drm_gem_object *obj, struct msm_gpu *gpu)
>  {
>  	struct msm_gem_object *msm_obj = to_msm_bo(obj);
>  	WARN_ON(msm_obj->madv != MSM_MADV_WILLNEED);
>  	msm_obj->gpu = gpu;
> -	if (exclusive)
> -		reservation_object_add_excl_fence(msm_obj->resv, fence);
> -	else
> -		reservation_object_add_shared_fence(msm_obj->resv, fence);
> +
>  	list_del_init(&msm_obj->mm_list);
>  	list_add_tail(&msm_obj->mm_list, &gpu->active_list);
>  }
> diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h
> index cae3aa5..8c6269f 100644
> --- a/drivers/gpu/drm/msm/msm_gem.h
> +++ b/drivers/gpu/drm/msm/msm_gem.h
> @@ -20,6 +20,7 @@
>  
>  #include <linux/kref.h>
>  #include <linux/reservation.h>
> +#include <drm/gpu_scheduler.h>
>  #include "msm_drv.h"
>  
>  /* Additional internal-use only BO flags: */
> @@ -136,6 +137,7 @@ enum msm_gem_lock {
>   * lasts for the duration of the submit-ioctl.
>   */
>  struct msm_gem_submit {
> +	struct drm_sched_job sched_job;
>  	struct drm_device *dev;
>  	struct msm_file_private *ctx;
>  	struct msm_gpu *gpu;
> @@ -144,6 +146,7 @@ struct msm_gem_submit {
>  	struct ww_acquire_ctx ticket;
>  	uint32_t seqno;		/* Sequence number of the submit on the ring */
>  	struct dma_fence *hw_fence;
> +	struct dma_fence *in_fence, *out_fence;
>  	int out_fence_id;
>  	struct msm_gpu_submitqueue *queue;
>  	struct pid *pid;    /* submitting process */
> @@ -162,6 +165,9 @@ struct msm_gem_submit {
>  		uint32_t flags;
>  		struct msm_gem_object *obj;
>  		uint64_t iova;
> +		struct dma_fence *excl;
> +		uint32_t nr_shared;
> +		struct dma_fence **shared;
>  	} bos[0];
>  };
>  
> diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c
> index 7931c2a..dff945c 100644
> --- a/drivers/gpu/drm/msm/msm_gem_submit.c
> +++ b/drivers/gpu/drm/msm/msm_gem_submit.c
> @@ -65,23 +65,37 @@ void msm_gem_submit_free(struct msm_gem_submit *submit)
>  {
>  	int i;
>  
> +	mutex_lock(&submit->ring->fence_idr_lock);
>  	idr_remove(&submit->ring->fence_idr, submit->out_fence_id);
> -
> -	dma_fence_put(submit->hw_fence);
> +	mutex_unlock(&submit->ring->fence_idr_lock);

Why are we using a mutex for this guy - this seems like a job for a spin if I
ever saw one. Maybe even a rw spin depending on how often that idr gets
queried.
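
For illustration, a minimal sketch of the spinlock variant (assuming the
allocation side is reworked to use idr_preload(), since idr_alloc_cyclic()
could no longer sleep while the lock is held):

	spin_lock(&submit->ring->fence_idr_lock);
	idr_remove(&submit->ring->fence_idr, submit->out_fence_id);
	spin_unlock(&submit->ring->fence_idr_lock);

and on the allocation side:

	/* Preload so the idr's internal allocations happen outside the lock */
	idr_preload(GFP_KERNEL);
	spin_lock(&ring->fence_idr_lock);
	submit->out_fence_id = idr_alloc_cyclic(&ring->fence_idr,
			submit->out_fence, 0, INT_MAX, GFP_NOWAIT);
	spin_unlock(&ring->fence_idr_lock);
	idr_preload_end();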

>  	for (i = 0; i < submit->nr_bos; i++) {
> +		int j;
> +
>  		struct msm_gem_object *msm_obj = submit->bos[i].obj;
>  
>  		if (submit->bos[i].flags & BO_PINNED)
>  			msm_gem_put_iova(&msm_obj->base, submit->gpu->aspace);
>  
> -		drm_gem_object_put(&msm_obj->base);
> +		drm_gem_object_put_unlocked(&msm_obj->base);
> +
> +		/*
> +		 * While we are at it, clear the saved exclusive and shared
> +		 * fences if any
> +		 */
> +		dma_fence_put(submit->bos[i].excl);
> +
> +		for (j = 0; j < submit->bos[i].nr_shared; j++)
> +			dma_fence_put(submit->bos[i].shared[j]);
> +
> +		kfree(submit->bos[i].shared);
>  	}
>  
> -	list_del(&submit->node);
>  	put_pid(submit->pid);
>  	msm_submitqueue_put(submit->queue);
>  
> +	dma_fence_put(submit->in_fence);
> +	dma_fence_put(submit->out_fence);
>  	kfree(submit);
>  }
>  
> @@ -238,6 +252,27 @@ static int submit_lock_objects(struct msm_gem_submit *submit)
>  	return ret;
>  }
>  
> +static void submit_attach_fences_and_unlock(struct msm_gem_submit *submit)
> +{
> +	int i;
> +
> +	for (i = 0; i < submit->nr_bos; i++) {
> +		struct msm_gem_object *msm_obj = submit->bos[i].obj;
> +
> +		if (submit->bos[i].flags & MSM_SUBMIT_BO_WRITE)
> +			reservation_object_add_excl_fence(msm_obj->resv,
> +					submit->out_fence);
> +		else
> +			reservation_object_add_shared_fence(msm_obj->resv,
> +					submit->out_fence);
> +
> +		if (submit->bos[i].flags & BO_LOCKED) {
> +			ww_mutex_unlock(&msm_obj->resv->lock);
> +			submit->bos[i].flags &= ~BO_LOCKED;
> +		}
> +	}
> +}
> +
>  static int submit_fence_sync(struct msm_gem_submit *submit, bool no_implicit)
>  {
>  	int i, ret = 0;
> @@ -260,10 +295,24 @@ static int submit_fence_sync(struct msm_gem_submit *submit, bool no_implicit)
>  		if (no_implicit)
>  			continue;
>  
> -		ret = msm_gem_sync_object(&msm_obj->base, submit->ring->fctx,
> -			write);
> -		if (ret)
> -			break;
> +		if (write) {
> +			/*
> +			 * Save the per buffer shared and exclusive fences for
> +			 * the scheduler thread to wait on.
> +			 *
> +			 * Note: The memory allocated for the storing the
> +			 * shared fences will be released when the scheduler
> +			 * is done waiting on all the saved fences.
> +			 */
> +			ret = reservation_object_get_fences_rcu(msm_obj->resv,
> +					&submit->bos[i].excl,
> +					&submit->bos[i].nr_shared,
> +					&submit->bos[i].shared);
> +			if (ret)
> +				return ret;

I think this can just be a return ret;

> +		} else

No need for the else
> +			submit->bos[i].excl =
> +				reservation_object_get_excl_rcu(msm_obj->resv);
>  	}
>  
>  	return ret;

This can be a return 0;

> @@ -425,7 +474,6 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
>  	struct msm_file_private *ctx = file->driver_priv;
>  	struct msm_gem_submit *submit;
>  	struct msm_gpu *gpu = priv->gpu;
> -	struct dma_fence *in_fence = NULL;
>  	struct sync_file *sync_file = NULL;
>  	struct msm_gpu_submitqueue *queue;
>  	struct msm_ringbuffer *ring;
> @@ -457,32 +505,11 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
>  
>  	ring = gpu->rb[queue->prio];
>  
> -	if (args->flags & MSM_SUBMIT_FENCE_FD_IN) {
> -		in_fence = sync_file_get_fence(args->fence_fd);
> -
> -		if (!in_fence)
> -			return -EINVAL;
> -
> -		/*
> -		 * Wait if the fence is from a foreign context, or if the fence
> -		 * array contains any fence from a foreign context.
> -		 */
> -		if (!dma_fence_match_context(in_fence, ring->fctx->context)) {
> -			ret = dma_fence_wait(in_fence, true);
> -			if (ret)
> -				return ret;
> -		}
> -	}
> -
> -	ret = mutex_lock_interruptible(&dev->struct_mutex);
> -	if (ret)
> -		return ret;
> -
>  	if (args->flags & MSM_SUBMIT_FENCE_FD_OUT) {
>  		out_fence_fd = get_unused_fd_flags(O_CLOEXEC);
>  		if (out_fence_fd < 0) {
>  			ret = out_fence_fd;
> -			goto out_unlock;
> +			goto out_err;
>  		}
>  	}
>  
> @@ -490,7 +517,16 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
>  			args->nr_cmds, ctx);
>  	if (!submit) {
>  		ret = -ENOMEM;
> -		goto out_unlock;
> +		goto out_err;
> +	}
> +
> +	if (args->flags & MSM_SUBMIT_FENCE_FD_IN) {
> +		submit->in_fence = sync_file_get_fence(args->fence_fd);
> +
> +		if (!submit->in_fence) {
> +			ret = -EINVAL;
> +			goto out;
> +		}
>  	}
>  
>  	if (args->flags & MSM_SUBMIT_SUDO)
> @@ -573,28 +609,16 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
>  
>  	submit->nr_cmds = i;
>  
> -	msm_gpu_submit(submit);
> -	if (IS_ERR(submit->hw_fence)) {
> -		ret = PTR_ERR(submit->hw_fence);
> -		submit->hw_fence = NULL;
> +	ret = msm_sched_job_init(&submit->sched_job);
> +	if (ret)
>  		goto out;
> -	}
>  
> -	/*
> -	 * No protection should be okay here since this is protected by the big
> -	 * GPU lock.
> -	 */
> -	submit->out_fence_id = idr_alloc_cyclic(&ring->fence_idr,
> -			submit->hw_fence, 0, INT_MAX, GFP_KERNEL);
> -	if (submit->out_fence_id < 0) {
> -		ret = -ENOMEM;
> -		goto out;
> -	}
> +	submit_attach_fences_and_unlock(submit);
>  
>  	args->fence = submit->out_fence_id;
>  
>  	if (args->flags & MSM_SUBMIT_FENCE_FD_OUT) {
> -		sync_file = sync_file_create(submit->hw_fence);
> +		sync_file = sync_file_create(submit->out_fence);
>  		if (!sync_file) {
>  			ret = -ENOMEM;
>  			goto out;
> @@ -604,14 +628,22 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
>  	}
>  
>  out:
> -	if (in_fence)
> -		dma_fence_put(in_fence);
> +	/*
> +	 * Clean up the submit bits and pieces which are not needed beyond this
> +	 * function context
> +	 */
>  	submit_cleanup(submit);
> -	if (ret)
> +
> +	if (!ret)
> +		/*
> +		 * If we reached here, then all is well. Push the job to the
> +		 * scheduler
> +		 */
> +		msm_sched_push_job(submit);
> +	else
>  		msm_gem_submit_free(submit);
> -out_unlock:
> +out_err:
>  	if (ret && (out_fence_fd >= 0))
>  		put_unused_fd(out_fence_fd);
> -	mutex_unlock(&dev->struct_mutex);
>  	return ret;
>  }
> diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
> index cd5fe49..481a55c 100644
> --- a/drivers/gpu/drm/msm/msm_gpu.c
> +++ b/drivers/gpu/drm/msm/msm_gpu.c
> @@ -295,12 +295,17 @@ static void update_fences(struct msm_gpu *gpu, struct msm_ringbuffer *ring,
>  find_submit(struct msm_ringbuffer *ring, uint32_t fence)
>  {
>  	struct msm_gem_submit *submit;
> +	unsigned long flags;
>  
> -	WARN_ON(!mutex_is_locked(&ring->gpu->dev->struct_mutex));
> +	spin_lock_irqsave(&ring->lock, flags);
>  
>  	list_for_each_entry(submit, &ring->submits, node)
> -		if (submit->seqno == fence)
> +		if (submit->seqno == fence) {
> +			spin_unlock_irqrestore(&ring->lock, flags);
>  			return submit;
> +		}
> +
> +	spin_unlock_irqrestore(&ring->lock, flags);
>  
>  	return NULL;
>  }
> @@ -316,6 +321,12 @@ static void recover_worker(struct work_struct *work)
>  	struct msm_ringbuffer *cur_ring = gpu->funcs->active_ring(gpu);
>  	int i;
>  
> +	submit = find_submit(cur_ring, cur_ring->memptrs->fence + 1);
> +	return msm_sched_gpu_recovery(gpu, submit);
> +
> +	/*
> +	 * The unused code below will be removed in a subsequent patch
> +	 */

Why not now?

>  	mutex_lock(&dev->struct_mutex);
>  
>  	dev_err(dev->dev, "%s: hangcheck recover!\n", gpu->name);
> @@ -591,11 +602,36 @@ static void retire_worker(struct work_struct *work)
>  	mutex_unlock(&dev->struct_mutex);
>  }
>  
> -/* call from irq handler to schedule work to retire bo's */
> +static void signal_hw_fences(struct msm_ringbuffer *ring)
> +{
> +	unsigned long flags;
> +	struct msm_gem_submit *submit, *tmp;
> +
> +	spin_lock_irqsave(&ring->lock, flags);
> +
> +	list_for_each_entry_safe(submit, tmp, &ring->submits, node) {
> +		if (submit->seqno > ring->memptrs->fence)
> +			break;
> +
> +		list_del(&submit->node);
> +
> +		dma_fence_signal(submit->hw_fence);
> +	}
> +
> +	spin_unlock_irqrestore(&ring->lock, flags);
> +}
> +
> +/*
> + * Called from the IRQ context to signal hardware fences for the completed
> + * submits
> + */
>  void msm_gpu_retire(struct msm_gpu *gpu)
>  {
> -	struct msm_drm_private *priv = gpu->dev->dev_private;
> -	queue_work(priv->wq, &gpu->retire_work);
> +	int i;
> +
> +	for (i = 0; i < gpu->nr_rings; i++)
> +		signal_hw_fences(gpu->rb[i]);
> +
>  	update_sw_cntrs(gpu);
>  }
>  
> @@ -606,25 +642,28 @@ struct dma_fence *msm_gpu_submit(struct msm_gem_submit *submit)
>  	struct drm_device *dev = gpu->dev;
>  	struct msm_drm_private *priv = dev->dev_private;
>  	struct msm_ringbuffer *ring = submit->ring;
> +	unsigned long flags;
>  	int i;
>  
> -	WARN_ON(!mutex_is_locked(&dev->struct_mutex));
> -
>  	submit->hw_fence = msm_fence_alloc(ring->fctx);
>  	if (IS_ERR(submit->hw_fence))
>  		return submit->hw_fence;
>  
> -	pm_runtime_get_sync(&gpu->pdev->dev);
> -
> -	msm_gpu_hw_init(gpu);
> +	spin_lock_irqsave(&ring->lock, flags);
> +	list_add_tail(&submit->node, &ring->submits);
> +	spin_unlock_irqrestore(&ring->lock, flags);
>  	submit->seqno = ++ring->seqno;
>  
> -	list_add_tail(&submit->node, &ring->submits);
> +	update_sw_cntrs(gpu);

I'm not sure if this needs the hardware to be up (it does check msm_gpu_active),
but I don't think we should reorganize the order of these functions unless we
need to.

> -	msm_rd_dump_submit(priv->rd, submit, NULL);
> +	mutex_lock(&dev->struct_mutex);
>  
> -	update_sw_cntrs(gpu);
> +	pm_runtime_get_sync(&gpu->pdev->dev);
> +
> +	msm_gpu_hw_init(gpu);
> +
> +	msm_rd_dump_submit(priv->rd, submit, NULL);
>  
>  	for (i = 0; i < submit->nr_bos; i++) {
>  		struct msm_gem_object *msm_obj = submit->bos[i].obj;
> @@ -634,16 +673,13 @@ struct dma_fence *msm_gpu_submit(struct msm_gem_submit *submit)
>  		 */
>  		WARN_ON(is_active(msm_obj) && (msm_obj->gpu != gpu));
>  
> -		if (submit->bos[i].flags & MSM_SUBMIT_BO_WRITE)
> -			msm_gem_move_to_active(&msm_obj->base, gpu, true, submit->hw_fence);
> -		else if (submit->bos[i].flags & MSM_SUBMIT_BO_READ)
> -			msm_gem_move_to_active(&msm_obj->base, gpu, false, submit->hw_fence);
> +		msm_gem_move_to_active(&msm_obj->base, gpu);
>  	}
>  
>  	gpu->funcs->submit(gpu, submit, submit->ctx);
>  	priv->lastctx = submit->ctx;
>  
> -	hangcheck_timer_reset(gpu);
> +	mutex_unlock(&dev->struct_mutex);
>  
>  	return submit->hw_fence;
>  }
> diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
> index dd55979..3bd1920 100644
> --- a/drivers/gpu/drm/msm/msm_gpu.h
> +++ b/drivers/gpu/drm/msm/msm_gpu.h
> @@ -173,6 +173,8 @@ struct msm_gpu_submitqueue {
>  	int faults;
>  	struct list_head node;
>  	struct kref ref;
> +	struct msm_gpu *gpu;
> +	struct drm_sched_entity sched_entity;
>  };
>  
>  static inline void gpu_write(struct msm_gpu *gpu, u32 reg, u32 data)
> diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.c b/drivers/gpu/drm/msm/msm_ringbuffer.c
> index 0889766..ef2bf10 100644
> --- a/drivers/gpu/drm/msm/msm_ringbuffer.c
> +++ b/drivers/gpu/drm/msm/msm_ringbuffer.c
> @@ -56,9 +56,14 @@ struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id,
>  
>  	snprintf(ring->name, sizeof(ring->name), "msm-gpu-ring-%d", ring->id);
>  
> +	if (msm_sched_init(&ring->sched, ring->name) != 0) {
> +		ret = -EINVAL;
> +		goto fail;
> +	}
>  	ring->fctx = msm_fence_context_alloc(gpu->dev, ring->name);
>  
>  	idr_init(&ring->fence_idr);
> +	mutex_init(&ring->fence_idr_lock);
>  
>  	return ring;
>  
> @@ -74,6 +79,7 @@ void msm_ringbuffer_destroy(struct msm_ringbuffer *ring)
>  
>  	msm_fence_context_free(ring->fctx);
>  
> +	msm_sched_cleanup(&ring->sched);
>  	if (ring->bo) {
>  		msm_gem_put_iova(ring->bo, ring->gpu->aspace);
>  		msm_gem_put_vaddr(ring->bo);
> @@ -81,6 +87,7 @@ void msm_ringbuffer_destroy(struct msm_ringbuffer *ring)
>  	}
>  
>  	idr_destroy(&ring->fence_idr);
> +	mutex_destroy(&ring->fence_idr_lock);
>  
>  	kfree(ring);
>  }
> diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.h b/drivers/gpu/drm/msm/msm_ringbuffer.h
> index 523373b..10ae4a8 100644
> --- a/drivers/gpu/drm/msm/msm_ringbuffer.h
> +++ b/drivers/gpu/drm/msm/msm_ringbuffer.h
> @@ -19,6 +19,7 @@
>  #define __MSM_RINGBUFFER_H__
>  
>  #include "msm_drv.h"
> +#include "msm_sched.h"
>  
>  #define rbmemptr(ring, member)  \
>  	((ring)->memptrs_iova + offsetof(struct msm_rbmemptrs, member))
> @@ -42,7 +43,9 @@ struct msm_ringbuffer {
>  	uint64_t memptrs_iova;
>  	struct msm_fence_context *fctx;
>  	struct idr fence_idr;
> +	struct mutex fence_idr_lock;
>  	spinlock_t lock;
> +	struct drm_gpu_scheduler sched;
>  };
>  
>  struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id,
> diff --git a/drivers/gpu/drm/msm/msm_sched.c b/drivers/gpu/drm/msm/msm_sched.c
> new file mode 100644
> index 0000000..8b805ce
> --- /dev/null
> +++ b/drivers/gpu/drm/msm/msm_sched.c
> @@ -0,0 +1,313 @@
> +/* SPDX-License-Identifier: GPL-2.0 */

// SPDX-License-Identifier: GPL-2.0

> +/* Copyright (c) 2018, The Linux Foundation. All rights reserved. */
> +
> +#include "msm_gpu.h"
> +#include "msm_gem.h"
> +#include "msm_sched.h"
> +
> +#include <linux/string_helpers.h>
> +#include <linux/kthread.h>
> +
> +struct msm_gem_submit *to_msm_gem_submit(struct drm_sched_job *sched_job)
> +{
> +	return container_of(sched_job, struct msm_gem_submit, sched_job);
> +}
> +
> +/*
> + * Go through the bo's one by one and return the previously saved shared and

bo's -> bos

> + * exclusive fences. If the scheduler gets the fence, then it takes care of
> + * releasing the reference on the fence.
> + */
> +static struct dma_fence *msm_sched_dependency(struct drm_sched_job *sched_job,
> +		struct drm_sched_entity *entity)
> +{
> +	struct msm_gem_submit *submit = to_msm_gem_submit(sched_job);
> +	struct dma_fence *fence;
> +	int i;
> +
> +	if (submit->in_fence) {
> +		fence = submit->in_fence;
> +		submit->in_fence = NULL;

> +		if (!dma_fence_is_signaled(fence))
> +			return fence;
> +
> +		dma_fence_put(fence);
> +	}

There might be a chance to consolidate some code here

static struct dma_fence *_get_fence(struct dma_fence **in)
{
	struct dma_fence *fence = *in;
	*in = NULL;

	if (fence && !dma_fence_is_signaled(fence))
		return fence;

	dma_fence_put(fence);
	return NULL;
}

fence = _get_fence(&submit->in_fence);
if (fence)
	return fence;


> +	/* Return the previously saved shared and exclusive fences if any */
> +	for (i = 0; i < submit->nr_bos; i++) {
> +		int j;
> +
> +		if (submit->bos[i].excl) {
> +			fence = submit->bos[i].excl;
> +			submit->bos[i].excl = NULL;
> +
> +			if (!dma_fence_is_signaled(fence))
> +				return fence;
> +
> +			dma_fence_put(fence);
> +		}

fence = _get_fence(&submit->bos[i].excl);
if (fence)
	return fence;

> +		for (j = 0; j < submit->bos[i].nr_shared; j++) {
> +			if (!submit->bos[i].shared[j])
> +				continue;
> +
> +			fence = submit->bos[i].shared[j];
> +			submit->bos[i].shared[j] = NULL;
> +
> +			if (!dma_fence_is_signaled(fence))
> +				return fence;
> +
> +			dma_fence_put(fence);
> +		}

fence = _get_fence(&submit->bos[i].shared[j]);
if (fence)
	return fence;

> +
> +		kfree(submit->bos[i].shared);
> +		submit->bos[i].nr_shared = 0;
> +		submit->bos[i].shared = NULL;
> +	}
> +
> +	return NULL;
> +}

This is an interesting function - in order to avoid wasting cycles it needs to
be ordered so that the most likely fences happen first.  If that is the case, I
think that in_fence might be the least likely so you should check it last.

> +static struct dma_fence *msm_sched_run_job(struct drm_sched_job *sched_job)
> +{
> +	struct msm_gem_submit *submit = to_msm_gem_submit(sched_job);
> +
> +	return !sched_job->s_fence->finished.error ?
> +		msm_gpu_submit(submit) : NULL;
> +}
> +
> +static void dump_bad_submit(struct msm_gem_submit *submit)
> +{
> +	struct msm_gpu *gpu = submit->gpu;
> +	struct drm_device *dev = gpu->dev;
> +	struct msm_drm_private *priv = dev->dev_private;
> +	struct task_struct *task;
> +	char task_name[32] = {0};
> +
> +	rcu_read_lock();
> +	task = pid_task(submit->pid, PIDTYPE_PID);
> +	if (task) {
> +		char *cmd;
> +
> +		cmd = kstrdup_quotable_cmdline(task, GFP_KERNEL);

This should be GFP_ATOMIC because we are in the rcu_read_lock(). There might be
something else wrong with it too - I know we have this problem in the current
kernel and I'm not sure if it is avoidable without losing the task name.
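
One possible way around it (a sketch, not something this patch does) is to
take a reference on the task under RCU and defer the GFP_KERNEL allocation
until the read lock has been dropped:

	struct task_struct *task;

	rcu_read_lock();
	task = pid_task(submit->pid, PIDTYPE_PID);
	if (task)
		get_task_struct(task);
	rcu_read_unlock();

	if (task) {
		/* Safe to sleep here since we hold our own reference */
		char *cmd = kstrdup_quotable_cmdline(task, GFP_KERNEL);

		dev_err(&gpu->pdev->dev, "%s: offending task: %s (%s)\n",
				gpu->name, task->comm, cmd);

		kfree(cmd);
		put_task_struct(task);
	}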

> +		dev_err(&gpu->pdev->dev, "%s: offending task: %s (%s)\n",
> +				gpu->name, task->comm, cmd);
> +
> +		snprintf(task_name, sizeof(task_name),
> +				"offending task: %s (%s)", task->comm, cmd);
> +
> +		kfree(cmd);
> +	}
> +	rcu_read_unlock();
> +
> +	mutex_lock(&gpu->dev->struct_mutex);
> +	msm_rd_dump_submit(priv->hangrd, submit, task_name);
> +	mutex_unlock(&gpu->dev->struct_mutex);
> +}
> +
> +void msm_sched_gpu_recovery(struct msm_gpu *gpu, struct msm_gem_submit *submit)
> +{
> +	int i;
> +	static atomic_t gpu_recovery;
> +
> +	/* Check if a GPU recovery is already in progress */
> +	if (!(atomic_cmpxchg(&gpu_recovery, 0, 1) == 0))
> +		return;
> +
> +	/*
> +	 * Pause the schedulers so we don't get new requests while the recovery
> +	 * is in progress
> +	 */
> +	for (i = 0; i < gpu->nr_rings; i++)
> +		kthread_park(gpu->rb[i]->sched.thread);
> +
> +	/*
> +	 * Disable interrupts so we don't get interrupted while the recovery is
> +	 * in progress
> +	 */
> +	disable_irq(gpu->irq);
> +
> +	dev_err(&gpu->pdev->dev, "%s: hangcheck recover!\n", gpu->name);

I don't know if we still need this line? Recovery seems to be standard operating
procedure these days.

> +
> +	if (submit)
> +		dump_bad_submit(submit);
> +
> +	/*
> +	 * For each ring, we do the following
> +	 * a) Inform the scheduler to drop the hardware fence for the
> +	 * submissions on its mirror list
> +	 * b) The submit(job) list on the ring is not useful anymore
> +	 * as we are going for a gpu reset, so we empty the submit list
> +	 */
> +	for (i = 0; i < gpu->nr_rings; i++) {
> +		struct msm_gem_submit *job, *tmp;
> +		unsigned long flags;
> +		struct msm_ringbuffer *ring = gpu->rb[i];
> +
> +		/* a) Release the hardware fences */
> +		drm_sched_hw_job_reset(&ring->sched,
> +				(submit && submit->ring == ring) ?
> +				&submit->sched_job : NULL);
> +
> +		/* b) Free up the ring submit list */
> +		spin_lock_irqsave(&ring->lock, flags);
> +
> +		list_for_each_entry_safe(job, tmp, &ring->submits, node)
> +			list_del(&job->node);
> +
> +		spin_unlock_irqrestore(&ring->lock, flags);
> +	}
> +
> +	/* Power cycle and reset the GPU back to init state */
> +	mutex_lock(&gpu->dev->struct_mutex);
> +
> +	pm_runtime_get_sync(&gpu->pdev->dev);
> +	gpu->funcs->recover(gpu);
> +	pm_runtime_put_sync(&gpu->pdev->dev);
> +
> +	mutex_unlock(&gpu->dev->struct_mutex);
> +
> +	/* Re-enable the interrupts once the gpu reset is complete */
> +	enable_irq(gpu->irq);
> +
> +	/*
> +	 * The GPU is hopefully back in good shape, now request the schedulers
> +	 * to replay its mirror list, starting with the scheduler on the highest
> +	 * priority ring
> +	 */
> +	for (i = 0; i < gpu->nr_rings; i++) {
> +		drm_sched_job_recovery(&gpu->rb[i]->sched);
> +		kthread_unpark(gpu->rb[i]->sched.thread);
> +	}
> +
> +	atomic_set(&gpu_recovery, 0);

I think we need a smp_wmb() here to make sure everybody else sees the update.
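
If the intent is that all of the recovery writes become visible before other
CPUs can observe the cleared flag, a sketch of that would be a write barrier
(or a release-style store) ahead of the clear:

	/* Order the recovery writes before clearing the flag */
	smp_wmb();
	atomic_set(&gpu_recovery, 0);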

> +}
> +
> +static void msm_sched_timedout_job(struct drm_sched_job *bad_job)
> +{
> +	struct msm_gem_submit *submit = to_msm_gem_submit(bad_job);
> +	struct msm_gpu *gpu = submit->gpu;
> +	struct msm_ringbuffer *ring = submit->ring;
> +
> +	/*
> +	 * If this submission completed in the mean time, then the timeout is
> +	 * spurious
> +	 */
> +	if (submit->seqno <= submit->ring->memptrs->fence)
> +		return;
> +
> +	dev_err(&gpu->pdev->dev, "%s: hangcheck detected gpu lockup rb %d!\n",
> +			gpu->name, ring->id);
> +	dev_err(&gpu->pdev->dev, "%s:     completed fence: %u\n",
> +			gpu->name, ring->memptrs->fence);
> +	dev_err(&gpu->pdev->dev, "%s:     submitted fence: %u\n",
> +			gpu->name, ring->seqno);
> +
> +	msm_sched_gpu_recovery(gpu, submit);
> +}
> +
> +static void msm_sched_free_job(struct drm_sched_job *sched_job)
> +{
> +	struct msm_gem_submit *submit = to_msm_gem_submit(sched_job);
> +	struct msm_gpu *gpu = submit->gpu;
> +	int i;
> +
> +	mutex_lock(&gpu->dev->struct_mutex);
> +
> +	for (i = 0; i < submit->nr_bos; i++) {
> +		struct msm_gem_object *msm_obj = submit->bos[i].obj;
> +
> +		msm_gem_move_to_inactive(&msm_obj->base);
> +	}
> +
> +	mutex_unlock(&gpu->dev->struct_mutex);
> +
> +	pm_runtime_mark_last_busy(&gpu->pdev->dev);
> +	pm_runtime_put_autosuspend(&gpu->pdev->dev);
> +
> +	msm_gem_submit_free(submit);
> +}
> +
> +static const struct drm_sched_backend_ops msm_sched_ops = {
> +	.dependency = msm_sched_dependency,
> +	.run_job = msm_sched_run_job,
> +	.timedout_job = msm_sched_timedout_job,
> +	.free_job = msm_sched_free_job,
> +};
> +
> +int msm_sched_job_init(struct drm_sched_job *sched_job)
> +{
> +	int ret;
> +	struct msm_gem_submit *submit = to_msm_gem_submit(sched_job);
> +	struct msm_ringbuffer *ring = submit->ring;
> +
> +	ret = drm_sched_job_init(sched_job, &ring->sched,
> +			&submit->queue->sched_entity, submit->ctx);
> +	if (ret)
> +		return ret;
> +
> +	submit->out_fence = dma_fence_get(&submit->sched_job.s_fence->finished);
> +
> +	mutex_lock(&ring->fence_idr_lock);
> +	submit->out_fence_id = idr_alloc_cyclic(&ring->fence_idr,
> +						submit->out_fence, 0, INT_MAX,
> +						GFP_KERNEL);
> +	mutex_unlock(&ring->fence_idr_lock);
> +
> +	if (submit->out_fence_id < 0) {
> +		/*
> +		 * TODO: The scheduler's finished fence leaks here since the
> +		 * job will not be pushed to the queue. Need to update scheduler
> +		 * to fix this cleanly(?)
> +		 */

How would you propose to fix this?

> +		dma_fence_put(submit->out_fence);
> +		submit->out_fence = NULL;
> +		return -ENOMEM;
> +	}
> +
> +	return 0;
> +}
> +
> +void msm_sched_push_job(struct msm_gem_submit *submit)
> +{
> +	return drm_sched_entity_push_job(&submit->sched_job,
> +			&submit->queue->sched_entity);
> +}
> +
> +int msm_sched_init(struct drm_gpu_scheduler *sched, const char *name)
> +{


> +	return drm_sched_init(sched, &msm_sched_ops, 4, 0,
> +			msecs_to_jiffies(500), name);

Okay, I see where the ring->name comes in.  I don't like it, but at least it
is a relatively low-cost option if you share it with the fence names.  Still...
sigh.

> +}
> +
> +void msm_sched_cleanup(struct drm_gpu_scheduler *sched)
> +{
> +	drm_sched_fini(sched);
> +}

I don't think this function is needed - I think we're smart enough to call
drm_sched_fini directly.

> +/*
> + * Create a new entity on the schedulers normal priority runqueue. For now we
> + * choose to always use the normal run queue priority, but in future its
> + * possible with some extension to the msm drm interface, to create the
> + * submitqueue(entities) of different priorities on the same ring, thereby
> + * allowing to priortize and schedule submitqueues belonging to the same ring
> + */
> +int msm_sched_entity_init(struct msm_gpu *gpu,
> +		struct drm_sched_entity *sched_entity, int prio)
> +{
> +	struct drm_gpu_scheduler *sched = &gpu->rb[prio]->sched;
> +
> +	return drm_sched_entity_init(sched, sched_entity,
> +			&sched->sched_rq[DRM_SCHED_PRIORITY_NORMAL], NULL);
> +}
> +
> +void msm_sched_entity_cleanup(struct msm_gpu *gpu,
> +		struct drm_sched_entity *sched_entity, int prio)
> +{
> +	struct drm_gpu_scheduler *sched = &gpu->rb[prio]->sched;
> +
> +	drm_sched_entity_fini(sched, sched_entity);
> +}
> diff --git a/drivers/gpu/drm/msm/msm_sched.h b/drivers/gpu/drm/msm/msm_sched.h
> new file mode 100644
> index 0000000..6ab2728
> --- /dev/null
> +++ b/drivers/gpu/drm/msm/msm_sched.h
> @@ -0,0 +1,18 @@
> +/* SPDX-License-Identifier: GPL-2.0 */

// SPDX-License-Identifier: GPL-2.0

> +/* Copyright (c) 2018, The Linux Foundation. All rights reserved. */
> +
> +#ifndef __MSM_SCHED_H__
> +#define __MSM_SCHED_H__
> +
> +#include <drm/gpu_scheduler.h>
> +
> +int msm_sched_init(struct drm_gpu_scheduler *sched, const char *name);
> +void msm_sched_cleanup(struct drm_gpu_scheduler *sched);
> +int msm_sched_entity_init(struct msm_gpu *gpu, struct drm_sched_entity *queue,
> +		int prio);
> +void msm_sched_entity_cleanup(struct msm_gpu *gpu,
> +		struct drm_sched_entity *queue, int prio);
> +int msm_sched_job_init(struct drm_sched_job *sched_job);
> +void msm_sched_push_job(struct msm_gem_submit *submit);
> +void msm_sched_gpu_recovery(struct msm_gpu *gpu, struct msm_gem_submit *submit);
> +#endif /* __MSM_SCHED_H__ */
> diff --git a/drivers/gpu/drm/msm/msm_submitqueue.c b/drivers/gpu/drm/msm/msm_submitqueue.c
> index 325da44..b6eb13e 100644
> --- a/drivers/gpu/drm/msm/msm_submitqueue.c
> +++ b/drivers/gpu/drm/msm/msm_submitqueue.c
> @@ -19,6 +19,7 @@ void msm_submitqueue_destroy(struct kref *kref)
>  	struct msm_gpu_submitqueue *queue = container_of(kref,
>  		struct msm_gpu_submitqueue, ref);
>  
> +	msm_sched_entity_cleanup(queue->gpu, &queue->sched_entity, queue->prio);
>  	kfree(queue);
>  }
>  
> @@ -65,6 +66,7 @@ int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx,
>  {
>  	struct msm_drm_private *priv = drm->dev_private;
>  	struct msm_gpu_submitqueue *queue;
> +	struct msm_gpu *gpu = priv->gpu;
>  
>  	if (!ctx)
>  		return -ENODEV;
> @@ -77,13 +79,16 @@ int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx,
>  	kref_init(&queue->ref);
>  	queue->flags = flags;
>  
> -	if (priv->gpu) {
> -		if (prio >= priv->gpu->nr_rings) {
> -			kfree(queue);
> -			return -EINVAL;
> -		}
> +	if (gpu) {
> +		if (prio >= gpu->nr_rings)
> +			goto fail;
> +
> +		if (msm_sched_entity_init(priv->gpu, &queue->sched_entity,
> +					prio))
> +			goto fail;
>  
>  		queue->prio = prio;
> +		queue->gpu = gpu;
>  	}
>  
>  	write_lock(&ctx->queuelock);
> @@ -98,6 +103,9 @@ int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx,
>  	write_unlock(&ctx->queuelock);
>  
>  	return 0;
> +fail:
> +	kfree(queue);
> +	return -EINVAL;
>  }
>  
>  int msm_submitqueue_init(struct drm_device *drm, struct msm_file_private *ctx)
> -- 
> 1.9.1
>
Sharat Masetty Oct. 3, 2018, 10:43 a.m. UTC | #2
Thanks for the review comments, Jordan. I tried to answer a few queries below;
please check.

On 10/2/2018 12:32 AM, Jordan Crouse wrote:
> On Mon, Oct 01, 2018 at 06:01:41PM +0530, Sharat Masetty wrote:
>> This patch hooks up the DRM gpu scheduler to the msm DRM driver. The
>> most noticeable changes to the driver are as follows. The submit path is
>> split into two parts, in the user context the submit(job) is created and
>> added to one of the entity's scheduler run queue. The scheduler then
>> tries to drain the queue by submitting the jobs the hardware to act upon.
>> The submit job sits on the scheduler queue until all the dependent
>> fences are waited upon successfully.
>>
>> We have one scheduler instance per ring. The submitqueues will host a
>> scheduler entity object. This entity will be mapped to the scheduler's
>> default runqueue. This should be good for now, but in future it is possible
>> to extend the API to allow for scheduling amongst the submitqueues on the
>> same ring.
>>
>> With this patch the role of the struct_mutex lock has been greatly reduced in
>> scope in the submit path, evidently when actually putting the job on the
>> ringbuffer. This should enable us with increased parallelism in the
>> driver which should translate to better performance overall hopefully.
>>
>> Signed-off-by: Sharat Masetty <smasetty@codeaurora.org>
>> ---
>>   drivers/gpu/drm/msm/Kconfig           |   1 +
>>   drivers/gpu/drm/msm/Makefile          |   3 +-
>>   drivers/gpu/drm/msm/msm_drv.h         |   3 +-
>>   drivers/gpu/drm/msm/msm_gem.c         |   8 +-
>>   drivers/gpu/drm/msm/msm_gem.h         |   6 +
>>   drivers/gpu/drm/msm/msm_gem_submit.c  | 138 +++++++++------
>>   drivers/gpu/drm/msm/msm_gpu.c         |  72 ++++++--
>>   drivers/gpu/drm/msm/msm_gpu.h         |   2 +
>>   drivers/gpu/drm/msm/msm_ringbuffer.c  |   7 +
>>   drivers/gpu/drm/msm/msm_ringbuffer.h  |   3 +
>>   drivers/gpu/drm/msm/msm_sched.c       | 313 ++++++++++++++++++++++++++++++++++
>>   drivers/gpu/drm/msm/msm_sched.h       |  18 ++
>>   drivers/gpu/drm/msm/msm_submitqueue.c |  18 +-
>>   13 files changed, 507 insertions(+), 85 deletions(-)
>>   create mode 100644 drivers/gpu/drm/msm/msm_sched.c
>>   create mode 100644 drivers/gpu/drm/msm/msm_sched.h
>>
>> diff --git a/drivers/gpu/drm/msm/Kconfig b/drivers/gpu/drm/msm/Kconfig
>> index 38cbde9..0d01810 100644
>> --- a/drivers/gpu/drm/msm/Kconfig
>> +++ b/drivers/gpu/drm/msm/Kconfig
>> @@ -15,6 +15,7 @@ config DRM_MSM
>>   	select SND_SOC_HDMI_CODEC if SND_SOC
>>   	select SYNC_FILE
>>   	select PM_OPP
>> +	select DRM_SCHED
>>   	default y
>>   	help
>>   	  DRM/KMS driver for MSM/snapdragon.
>> diff --git a/drivers/gpu/drm/msm/Makefile b/drivers/gpu/drm/msm/Makefile
>> index cd40c05..71ed921 100644
>> --- a/drivers/gpu/drm/msm/Makefile
>> +++ b/drivers/gpu/drm/msm/Makefile
>> @@ -60,7 +60,8 @@ msm-y := \
>>   	msm_perf.o \
>>   	msm_rd.o \
>>   	msm_ringbuffer.o \
>> -	msm_submitqueue.o
>> +	msm_submitqueue.o \
>> +	msm_sched.o
>>   
>>   msm-$(CONFIG_DEBUG_FS) += adreno/a5xx_debugfs.o
>>   
>> diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
>> index b2da1fb..e461a9c 100644
>> --- a/drivers/gpu/drm/msm/msm_drv.h
>> +++ b/drivers/gpu/drm/msm/msm_drv.h
>> @@ -213,8 +213,7 @@ struct drm_gem_object *msm_gem_prime_import_sg_table(struct drm_device *dev,
>>   int msm_gem_madvise(struct drm_gem_object *obj, unsigned madv);
>>   int msm_gem_sync_object(struct drm_gem_object *obj,
>>   		struct msm_fence_context *fctx, bool exclusive);
>> -void msm_gem_move_to_active(struct drm_gem_object *obj,
>> -		struct msm_gpu *gpu, bool exclusive, struct dma_fence *fence);
>> +void msm_gem_move_to_active(struct drm_gem_object *obj, struct msm_gpu *gpu);
>>   void msm_gem_move_to_inactive(struct drm_gem_object *obj);
>>   int msm_gem_cpu_prep(struct drm_gem_object *obj, uint32_t op, ktime_t *timeout);
>>   int msm_gem_cpu_fini(struct drm_gem_object *obj);
>> diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
>> index f583bb4..7a12f30 100644
>> --- a/drivers/gpu/drm/msm/msm_gem.c
>> +++ b/drivers/gpu/drm/msm/msm_gem.c
>> @@ -663,16 +663,12 @@ int msm_gem_sync_object(struct drm_gem_object *obj,
>>   	return 0;
>>   }
>>   
>> -void msm_gem_move_to_active(struct drm_gem_object *obj,
>> -		struct msm_gpu *gpu, bool exclusive, struct dma_fence *fence)
>> +void msm_gem_move_to_active(struct drm_gem_object *obj, struct msm_gpu *gpu)
>>   {
>>   	struct msm_gem_object *msm_obj = to_msm_bo(obj);
>>   	WARN_ON(msm_obj->madv != MSM_MADV_WILLNEED);
>>   	msm_obj->gpu = gpu;
>> -	if (exclusive)
>> -		reservation_object_add_excl_fence(msm_obj->resv, fence);
>> -	else
>> -		reservation_object_add_shared_fence(msm_obj->resv, fence);
>> +
>>   	list_del_init(&msm_obj->mm_list);
>>   	list_add_tail(&msm_obj->mm_list, &gpu->active_list);
>>   }
>> diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h
>> index cae3aa5..8c6269f 100644
>> --- a/drivers/gpu/drm/msm/msm_gem.h
>> +++ b/drivers/gpu/drm/msm/msm_gem.h
>> @@ -20,6 +20,7 @@
>>   
>>   #include <linux/kref.h>
>>   #include <linux/reservation.h>
>> +#include <drm/gpu_scheduler.h>
>>   #include "msm_drv.h"
>>   
>>   /* Additional internal-use only BO flags: */
>> @@ -136,6 +137,7 @@ enum msm_gem_lock {
>>    * lasts for the duration of the submit-ioctl.
>>    */
>>   struct msm_gem_submit {
>> +	struct drm_sched_job sched_job;
>>   	struct drm_device *dev;
>>   	struct msm_file_private *ctx;
>>   	struct msm_gpu *gpu;
>> @@ -144,6 +146,7 @@ struct msm_gem_submit {
>>   	struct ww_acquire_ctx ticket;
>>   	uint32_t seqno;		/* Sequence number of the submit on the ring */
>>   	struct dma_fence *hw_fence;
>> +	struct dma_fence *in_fence, *out_fence;
>>   	int out_fence_id;
>>   	struct msm_gpu_submitqueue *queue;
>>   	struct pid *pid;    /* submitting process */
>> @@ -162,6 +165,9 @@ struct msm_gem_submit {
>>   		uint32_t flags;
>>   		struct msm_gem_object *obj;
>>   		uint64_t iova;
>> +		struct dma_fence *excl;
>> +		uint32_t nr_shared;
>> +		struct dma_fence **shared;
>>   	} bos[0];
>>   };
>>   
>> diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c
>> index 7931c2a..dff945c 100644
>> --- a/drivers/gpu/drm/msm/msm_gem_submit.c
>> +++ b/drivers/gpu/drm/msm/msm_gem_submit.c
>> @@ -65,23 +65,37 @@ void msm_gem_submit_free(struct msm_gem_submit *submit)
>>   {
>>   	int i;
>>   
>> +	mutex_lock(&submit->ring->fence_idr_lock);
>>   	idr_remove(&submit->ring->fence_idr, submit->out_fence_id);
>> -
>> -	dma_fence_put(submit->hw_fence);
>> +	mutex_unlock(&submit->ring->fence_idr_lock);
> 
> Why are we using a mutex for this guy - this seems like a job for a spin if I
> ever saw one. Maybe even a rw spin depending on how often that idr gets
> queried.
> 
>>   	for (i = 0; i < submit->nr_bos; i++) {
>> +		int j;
>> +
>>   		struct msm_gem_object *msm_obj = submit->bos[i].obj;
>>   
>>   		if (submit->bos[i].flags & BO_PINNED)
>>   			msm_gem_put_iova(&msm_obj->base, submit->gpu->aspace);
>>   
>> -		drm_gem_object_put(&msm_obj->base);
>> +		drm_gem_object_put_unlocked(&msm_obj->base);
>> +
>> +		/*
>> +		 * While we are at it, clear the saved exclusive and shared
>> +		 * fences if any
>> +		 */
>> +		dma_fence_put(submit->bos[i].excl);
>> +
>> +		for (j = 0; j < submit->bos[i].nr_shared; j++)
>> +			dma_fence_put(submit->bos[i].shared[j]);
>> +
>> +		kfree(submit->bos[i].shared);
>>   	}
>>   
>> -	list_del(&submit->node);
>>   	put_pid(submit->pid);
>>   	msm_submitqueue_put(submit->queue);
>>   
>> +	dma_fence_put(submit->in_fence);
>> +	dma_fence_put(submit->out_fence);
>>   	kfree(submit);
>>   }
>>   
>> @@ -238,6 +252,27 @@ static int submit_lock_objects(struct msm_gem_submit *submit)
>>   	return ret;
>>   }
>>   
>> +static void submit_attach_fences_and_unlock(struct msm_gem_submit *submit)
>> +{
>> +	int i;
>> +
>> +	for (i = 0; i < submit->nr_bos; i++) {
>> +		struct msm_gem_object *msm_obj = submit->bos[i].obj;
>> +
>> +		if (submit->bos[i].flags & MSM_SUBMIT_BO_WRITE)
>> +			reservation_object_add_excl_fence(msm_obj->resv,
>> +					submit->out_fence);
>> +		else
>> +			reservation_object_add_shared_fence(msm_obj->resv,
>> +					submit->out_fence);
>> +
>> +		if (submit->bos[i].flags & BO_LOCKED) {
>> +			ww_mutex_unlock(&msm_obj->resv->lock);
>> +			submit->bos[i].flags &= ~BO_LOCKED;
>> +		}
>> +	}
>> +}
>> +
>>   static int submit_fence_sync(struct msm_gem_submit *submit, bool no_implicit)
>>   {
>>   	int i, ret = 0;
>> @@ -260,10 +295,24 @@ static int submit_fence_sync(struct msm_gem_submit *submit, bool no_implicit)
>>   		if (no_implicit)
>>   			continue;
>>   
>> -		ret = msm_gem_sync_object(&msm_obj->base, submit->ring->fctx,
>> -			write);
>> -		if (ret)
>> -			break;
>> +		if (write) {
>> +			/*
>> +			 * Save the per buffer shared and exclusive fences for
>> +			 * the scheduler thread to wait on.
>> +			 *
>> +			 * Note: The memory allocated for the storing the
>> +			 * shared fences will be released when the scheduler
>> +			 * is done waiting on all the saved fences.
>> +			 */
>> +			ret = reservation_object_get_fences_rcu(msm_obj->resv,
>> +					&submit->bos[i].excl,
>> +					&submit->bos[i].nr_shared,
>> +					&submit->bos[i].shared);
>> +			if (ret)
>> +				return ret;
> 
> I think this can just be a return ret;
> 
>> +		} else
> 
> No need for the else
>> +			submit->bos[i].excl =
>> +				reservation_object_get_excl_rcu(msm_obj->resv);
>>   	}
>>   
>>   	return ret;
> 
> This can be a return 0;
> 
>> @@ -425,7 +474,6 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
>>   	struct msm_file_private *ctx = file->driver_priv;
>>   	struct msm_gem_submit *submit;
>>   	struct msm_gpu *gpu = priv->gpu;
>> -	struct dma_fence *in_fence = NULL;
>>   	struct sync_file *sync_file = NULL;
>>   	struct msm_gpu_submitqueue *queue;
>>   	struct msm_ringbuffer *ring;
>> @@ -457,32 +505,11 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
>>   
>>   	ring = gpu->rb[queue->prio];
>>   
>> -	if (args->flags & MSM_SUBMIT_FENCE_FD_IN) {
>> -		in_fence = sync_file_get_fence(args->fence_fd);
>> -
>> -		if (!in_fence)
>> -			return -EINVAL;
>> -
>> -		/*
>> -		 * Wait if the fence is from a foreign context, or if the fence
>> -		 * array contains any fence from a foreign context.
>> -		 */
>> -		if (!dma_fence_match_context(in_fence, ring->fctx->context)) {
>> -			ret = dma_fence_wait(in_fence, true);
>> -			if (ret)
>> -				return ret;
>> -		}
>> -	}
>> -
>> -	ret = mutex_lock_interruptible(&dev->struct_mutex);
>> -	if (ret)
>> -		return ret;
>> -
>>   	if (args->flags & MSM_SUBMIT_FENCE_FD_OUT) {
>>   		out_fence_fd = get_unused_fd_flags(O_CLOEXEC);
>>   		if (out_fence_fd < 0) {
>>   			ret = out_fence_fd;
>> -			goto out_unlock;
>> +			goto out_err;
>>   		}
>>   	}
>>   
>> @@ -490,7 +517,16 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
>>   			args->nr_cmds, ctx);
>>   	if (!submit) {
>>   		ret = -ENOMEM;
>> -		goto out_unlock;
>> +		goto out_err;
>> +	}
>> +
>> +	if (args->flags & MSM_SUBMIT_FENCE_FD_IN) {
>> +		submit->in_fence = sync_file_get_fence(args->fence_fd);
>> +
>> +		if (!submit->in_fence) {
>> +			ret = -EINVAL;
>> +			goto out;
>> +		}
>>   	}
>>   
>>   	if (args->flags & MSM_SUBMIT_SUDO)
>> @@ -573,28 +609,16 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
>>   
>>   	submit->nr_cmds = i;
>>   
>> -	msm_gpu_submit(submit);
>> -	if (IS_ERR(submit->hw_fence)) {
>> -		ret = PTR_ERR(submit->hw_fence);
>> -		submit->hw_fence = NULL;
>> +	ret = msm_sched_job_init(&submit->sched_job);
>> +	if (ret)
>>   		goto out;
>> -	}
>>   
>> -	/*
>> -	 * No protection should be okay here since this is protected by the big
>> -	 * GPU lock.
>> -	 */
>> -	submit->out_fence_id = idr_alloc_cyclic(&ring->fence_idr,
>> -			submit->hw_fence, 0, INT_MAX, GFP_KERNEL);
>> -	if (submit->out_fence_id < 0) {
>> -		ret = -ENOMEM;
>> -		goto out;
>> -	}
>> +	submit_attach_fences_and_unlock(submit);
>>   
>>   	args->fence = submit->out_fence_id;
>>   
>>   	if (args->flags & MSM_SUBMIT_FENCE_FD_OUT) {
>> -		sync_file = sync_file_create(submit->hw_fence);
>> +		sync_file = sync_file_create(submit->out_fence);
>>   		if (!sync_file) {
>>   			ret = -ENOMEM;
>>   			goto out;
>> @@ -604,14 +628,22 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
>>   	}
>>   
>>   out:
>> -	if (in_fence)
>> -		dma_fence_put(in_fence);
>> +	/*
>> +	 * Clean up the submit bits and pieces which are not needed beyond this
>> +	 * function context
>> +	 */
>>   	submit_cleanup(submit);
>> -	if (ret)
>> +
>> +	if (!ret)
>> +		/*
>> +		 * If we reached here, then all is well. Push the job to the
>> +		 * scheduler
>> +		 */
>> +		msm_sched_push_job(submit);
>> +	else
>>   		msm_gem_submit_free(submit);
>> -out_unlock:
>> +out_err:
>>   	if (ret && (out_fence_fd >= 0))
>>   		put_unused_fd(out_fence_fd);
>> -	mutex_unlock(&dev->struct_mutex);
>>   	return ret;
>>   }
>> diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
>> index cd5fe49..481a55c 100644
>> --- a/drivers/gpu/drm/msm/msm_gpu.c
>> +++ b/drivers/gpu/drm/msm/msm_gpu.c
>> @@ -295,12 +295,17 @@ static void update_fences(struct msm_gpu *gpu, struct msm_ringbuffer *ring,
>>   find_submit(struct msm_ringbuffer *ring, uint32_t fence)
>>   {
>>   	struct msm_gem_submit *submit;
>> +	unsigned long flags;
>>   
>> -	WARN_ON(!mutex_is_locked(&ring->gpu->dev->struct_mutex));
>> +	spin_lock_irqsave(&ring->lock, flags);
>>   
>>   	list_for_each_entry(submit, &ring->submits, node)
>> -		if (submit->seqno == fence)
>> +		if (submit->seqno == fence) {
>> +			spin_unlock_irqrestore(&ring->lock, flags);
>>   			return submit;
>> +		}
>> +
>> +	spin_unlock_irqrestore(&ring->lock, flags);
>>   
>>   	return NULL;
>>   }
>> @@ -316,6 +321,12 @@ static void recover_worker(struct work_struct *work)
>>   	struct msm_ringbuffer *cur_ring = gpu->funcs->active_ring(gpu);
>>   	int i;
>>   
>> +	submit = find_submit(cur_ring, cur_ring->memptrs->fence + 1);
>> +	return msm_sched_gpu_recovery(gpu, submit);
>> +
>> +	/*
>> +	 * The unused code below will be removed in a subsequent patch
>> +	 */
> 
> Why not now?
I removed all the dead code as part of a separate patch "msm/drm: Remove 
unused code". I can either remove this comment from here or squash the 
other commit with this one.
> 
>>   	mutex_lock(&dev->struct_mutex);
>>   
>>   	dev_err(dev->dev, "%s: hangcheck recover!\n", gpu->name);
>> @@ -591,11 +602,36 @@ static void retire_worker(struct work_struct *work)
>>   	mutex_unlock(&dev->struct_mutex);
>>   }
>>   
>> -/* call from irq handler to schedule work to retire bo's */
>> +static void signal_hw_fences(struct msm_ringbuffer *ring)
>> +{
>> +	unsigned long flags;
>> +	struct msm_gem_submit *submit, *tmp;
>> +
>> +	spin_lock_irqsave(&ring->lock, flags);
>> +
>> +	list_for_each_entry_safe(submit, tmp, &ring->submits, node) {
>> +		if (submit->seqno > ring->memptrs->fence)
>> +			break;
>> +
>> +		list_del(&submit->node);
>> +
>> +		dma_fence_signal(submit->hw_fence);
>> +	}
>> +
>> +	spin_unlock_irqrestore(&ring->lock, flags);
>> +}
>> +
>> +/*
>> + * Called from the IRQ context to signal hardware fences for the completed
>> + * submits
>> + */
>>   void msm_gpu_retire(struct msm_gpu *gpu)
>>   {
>> -	struct msm_drm_private *priv = gpu->dev->dev_private;
>> -	queue_work(priv->wq, &gpu->retire_work);
>> +	int i;
>> +
>> +	for (i = 0; i < gpu->nr_rings; i++)
>> +		signal_hw_fences(gpu->rb[i]);
>> +
>>   	update_sw_cntrs(gpu);
>>   }
>>   
>> @@ -606,25 +642,28 @@ struct dma_fence *msm_gpu_submit(struct msm_gem_submit *submit)
>>   	struct drm_device *dev = gpu->dev;
>>   	struct msm_drm_private *priv = dev->dev_private;
>>   	struct msm_ringbuffer *ring = submit->ring;
>> +	unsigned long flags;
>>   	int i;
>>   
>> -	WARN_ON(!mutex_is_locked(&dev->struct_mutex));
>> -
>>   	submit->hw_fence = msm_fence_alloc(ring->fctx);
>>   	if (IS_ERR(submit->hw_fence))
>>   		return submit->hw_fence;
>>   
>> -	pm_runtime_get_sync(&gpu->pdev->dev);
>> -
>> -	msm_gpu_hw_init(gpu);
>> +	spin_lock_irqsave(&ring->lock, flags);
>> +	list_add_tail(&submit->node, &ring->submits);
>> +	spin_unlock_irqrestore(&ring->lock, flags);
>>   	submit->seqno = ++ring->seqno;
>>   
>> -	list_add_tail(&submit->node, &ring->submits);
>> +	update_sw_cntrs(gpu);
> 
> I'm not sure if this needs the hardware to be up (it does check msm_gpu_active),
> but I don't think we should reorganize the order of these functions unless we
> need to.
Sure, I will check this out. The intent was to have only the bare
essentials under the struct_mutex.
> 
>> -	msm_rd_dump_submit(priv->rd, submit, NULL);
>> +	mutex_lock(&dev->struct_mutex);
>>   
>> -	update_sw_cntrs(gpu);
>> +	pm_runtime_get_sync(&gpu->pdev->dev);
>> +
>> +	msm_gpu_hw_init(gpu);
>> +
>> +	msm_rd_dump_submit(priv->rd, submit, NULL);
>>   
>>   	for (i = 0; i < submit->nr_bos; i++) {
>>   		struct msm_gem_object *msm_obj = submit->bos[i].obj;
>> @@ -634,16 +673,13 @@ struct dma_fence *msm_gpu_submit(struct msm_gem_submit *submit)
>>   		 */
>>   		WARN_ON(is_active(msm_obj) && (msm_obj->gpu != gpu));
>>   
>> -		if (submit->bos[i].flags & MSM_SUBMIT_BO_WRITE)
>> -			msm_gem_move_to_active(&msm_obj->base, gpu, true, submit->hw_fence);
>> -		else if (submit->bos[i].flags & MSM_SUBMIT_BO_READ)
>> -			msm_gem_move_to_active(&msm_obj->base, gpu, false, submit->hw_fence);
>> +		msm_gem_move_to_active(&msm_obj->base, gpu);
>>   	}
>>   
>>   	gpu->funcs->submit(gpu, submit, submit->ctx);
>>   	priv->lastctx = submit->ctx;
>>   
>> -	hangcheck_timer_reset(gpu);
>> +	mutex_unlock(&dev->struct_mutex);
>>   
>>   	return submit->hw_fence;
>>   }
>> diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
>> index dd55979..3bd1920 100644
>> --- a/drivers/gpu/drm/msm/msm_gpu.h
>> +++ b/drivers/gpu/drm/msm/msm_gpu.h
>> @@ -173,6 +173,8 @@ struct msm_gpu_submitqueue {
>>   	int faults;
>>   	struct list_head node;
>>   	struct kref ref;
>> +	struct msm_gpu *gpu;
>> +	struct drm_sched_entity sched_entity;
>>   };
>>   
>>   static inline void gpu_write(struct msm_gpu *gpu, u32 reg, u32 data)
>> diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.c b/drivers/gpu/drm/msm/msm_ringbuffer.c
>> index 0889766..ef2bf10 100644
>> --- a/drivers/gpu/drm/msm/msm_ringbuffer.c
>> +++ b/drivers/gpu/drm/msm/msm_ringbuffer.c
>> @@ -56,9 +56,14 @@ struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id,
>>   
>>   	snprintf(ring->name, sizeof(ring->name), "msm-gpu-ring-%d", ring->id);
>>   
>> +	if (msm_sched_init(&ring->sched, ring->name) != 0) {
>> +		ret = -EINVAL;
>> +		goto fail;
>> +	}
>>   	ring->fctx = msm_fence_context_alloc(gpu->dev, ring->name);
>>   
>>   	idr_init(&ring->fence_idr);
>> +	mutex_init(&ring->fence_idr_lock);
>>   
>>   	return ring;
>>   
>> @@ -74,6 +79,7 @@ void msm_ringbuffer_destroy(struct msm_ringbuffer *ring)
>>   
>>   	msm_fence_context_free(ring->fctx);
>>   
>> +	msm_sched_cleanup(&ring->sched);
>>   	if (ring->bo) {
>>   		msm_gem_put_iova(ring->bo, ring->gpu->aspace);
>>   		msm_gem_put_vaddr(ring->bo);
>> @@ -81,6 +87,7 @@ void msm_ringbuffer_destroy(struct msm_ringbuffer *ring)
>>   	}
>>   
>>   	idr_destroy(&ring->fence_idr);
>> +	mutex_destroy(&ring->fence_idr_lock);
>>   
>>   	kfree(ring);
>>   }
>> diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.h b/drivers/gpu/drm/msm/msm_ringbuffer.h
>> index 523373b..10ae4a8 100644
>> --- a/drivers/gpu/drm/msm/msm_ringbuffer.h
>> +++ b/drivers/gpu/drm/msm/msm_ringbuffer.h
>> @@ -19,6 +19,7 @@
>>   #define __MSM_RINGBUFFER_H__
>>   
>>   #include "msm_drv.h"
>> +#include "msm_sched.h"
>>   
>>   #define rbmemptr(ring, member)  \
>>   	((ring)->memptrs_iova + offsetof(struct msm_rbmemptrs, member))
>> @@ -42,7 +43,9 @@ struct msm_ringbuffer {
>>   	uint64_t memptrs_iova;
>>   	struct msm_fence_context *fctx;
>>   	struct idr fence_idr;
>> +	struct mutex fence_idr_lock;
>>   	spinlock_t lock;
>> +	struct drm_gpu_scheduler sched;
>>   };
>>   
>>   struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id,
>> diff --git a/drivers/gpu/drm/msm/msm_sched.c b/drivers/gpu/drm/msm/msm_sched.c
>> new file mode 100644
>> index 0000000..8b805ce
>> --- /dev/null
>> +++ b/drivers/gpu/drm/msm/msm_sched.c
>> @@ -0,0 +1,313 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
> 
> // SPDX-License-Identifier: GPL-2.0
> 
>> +/* Copyright (c) 2018, The Linux Foundation. All rights reserved. */
>> +
>> +#include "msm_gpu.h"
>> +#include "msm_gem.h"
>> +#include "msm_sched.h"
>> +
>> +#include <linux/string_helpers.h>
>> +#include <linux/kthread.h>
>> +
>> +struct msm_gem_submit *to_msm_gem_submit(struct drm_sched_job *sched_job)
>> +{
>> +	return container_of(sched_job, struct msm_gem_submit, sched_job);
>> +}
>> +
>> +/*
>> + * Go through the bo's one by one and return the previously saved shared and
> 
> bo's -> bos
> 
>> + * exclusive fences. If the scheduler gets the fence, then it takes care of
>> + * releasing the reference on the fence.
>> + */
>> +static struct dma_fence *msm_sched_dependency(struct drm_sched_job *sched_job,
>> +		struct drm_sched_entity *entity)
>> +{
>> +	struct msm_gem_submit *submit = to_msm_gem_submit(sched_job);
>> +	struct dma_fence *fence;
>> +	int i;
>> +
>> +	if (submit->in_fence) {
>> +		fence = submit->in_fence;
>> +		submit->in_fence = NULL;
> 
>> +		if (!dma_fence_is_signaled(fence))
>> +			return fence;
>> +
>> +		dma_fence_put(fence);
>> +	}
> 
> There might be a chance to consolidate some code here
> 
> static struct dma_fence *_get_fence(struct dma_fence **)
> {
> 	struct dma_fence *fence = *in;
> 	*in = NULL;
> 
> 	if (fence && !dma_fence_is_signaled(fence))
> 		return fence;
> 
> 	dma_fence_put(fence);
> 	return NULL;
> }
> 
> fence = _fence(&submit->in_fence);
> if (fence)
> 	return fence;
> 
> 
>> +	/* Return the previously saved shared and exclusive fences if any */
>> +	for (i = 0; i < submit->nr_bos; i++) {
>> +		int j;
>> +
>> +		if (submit->bos[i].excl) {
>> +			fence = submit->bos[i].excl;
>> +			submit->bos[i].excl = NULL;
>> +
>> +			if (!dma_fence_is_signaled(fence))
>> +				return fence;
>> +
>> +			dma_fence_put(fence);
>> +		}
> 
> fence = _get_fence(&submit->bos[i].excl);
> if (fence)
> 	return fence;
> 
>> +		for (j = 0; j < submit->bos[i].nr_shared; j++) {
>> +			if (!submit->bos[i].shared[j])
>> +				continue;
>> +
>> +			fence = submit->bos[i].shared[j];
>> +			submit->bos[i].shared[j] = NULL;
>> +
>> +			if (!dma_fence_is_signaled(fence))
>> +				return fence;
>> +
>> +			dma_fence_put(fence);
>> +		}
> 
> fence = _get_fence(&submit->bos[i].shared[j]);
> if (fence)
> 	return fence;
> 
>> +
>> +		kfree(submit->bos[i].shared);
>> +		submit->bos[i].nr_shared = 0;
>> +		submit->bos[i].shared = NULL;
>> +	}
>> +
>> +	return NULL;
>> +}
> 
> This is an interesting function - in order to avoid wasting cycles it needs to
> be ordered so that the most likely fences happen first.  If that is the case, I
> think that in_fence might be the least likely so you should check it last.
Looked at this again. In addition to your suggestion, I think this 
function would benefit greatly from a couple of iterators, one for the 
bos and one for the shared fences, so that it doesn't have to restart 
the scan from scratch every time it is called. The iterators will have 
to be part of the submit structure.
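
A rough sketch of that idea, reusing the _get_fence() helper suggested
above; the dep_bo/dep_shared fields are hypothetical additions to struct
msm_gem_submit (zero-initialized at submit creation), not something in
this series:

/*
 * Sketch only: resume the dependency scan where the previous call left
 * off instead of walking every bo again.
 */
static struct dma_fence *msm_sched_dependency(struct drm_sched_job *sched_job,
		struct drm_sched_entity *entity)
{
	struct msm_gem_submit *submit = to_msm_gem_submit(sched_job);
	struct dma_fence *fence;

	for (; submit->dep_bo < submit->nr_bos; submit->dep_bo++) {
		unsigned int i = submit->dep_bo;

		fence = _get_fence(&submit->bos[i].excl);
		if (fence)
			return fence;

		for (; submit->dep_shared < submit->bos[i].nr_shared;
				submit->dep_shared++) {
			fence = _get_fence(
				&submit->bos[i].shared[submit->dep_shared]);
			if (fence)
				return fence;
		}
		submit->dep_shared = 0;

		kfree(submit->bos[i].shared);
		submit->bos[i].shared = NULL;
		submit->bos[i].nr_shared = 0;
	}

	/* Per the comment above, check the (least likely) in_fence last */
	return _get_fence(&submit->in_fence);
}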
> 
>> +static struct dma_fence *msm_sched_run_job(struct drm_sched_job *sched_job)
>> +{
>> +	struct msm_gem_submit *submit = to_msm_gem_submit(sched_job);
>> +
>> +	return !sched_job->s_fence->finished.error ?
>> +		msm_gpu_submit(submit) : NULL;
>> +}
>> +
>> +static void dump_bad_submit(struct msm_gem_submit *submit)
>> +{
>> +	struct msm_gpu *gpu = submit->gpu;
>> +	struct drm_device *dev = gpu->dev;
>> +	struct msm_drm_private *priv = dev->dev_private;
>> +	struct task_struct *task;
>> +	char task_name[32] = {0};
>> +
>> +	rcu_read_lock();
>> +	task = pid_task(submit->pid, PIDTYPE_PID);
>> +	if (task) {
>> +		char *cmd;
>> +
>> +		cmd = kstrdup_quotable_cmdline(task, GFP_KERNEL);
> 
> This should be GFP_ATOMIC because we are in the rcu_read_lock(). There might be
> something else wrong with it too - I know we have this problem in the current
> kernel and I'm not sure if it is avoidable without losing the task name.
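
One way to avoid the atomic allocation, sketched here as an assumption
rather than something in this series: grab a reference on the task under
RCU and drop the lock before allocating, so GFP_KERNEL stays legal and
the task name is not lost.

	rcu_read_lock();
	task = pid_task(submit->pid, PIDTYPE_PID);
	if (task)
		get_task_struct(task);
	rcu_read_unlock();

	if (task) {
		/* Safe to sleep now, we hold our own task reference */
		char *cmd = kstrdup_quotable_cmdline(task, GFP_KERNEL);

		dev_err(&gpu->pdev->dev, "%s: offending task: %s (%s)\n",
				gpu->name, task->comm, cmd);
		snprintf(task_name, sizeof(task_name),
				"offending task: %s (%s)", task->comm, cmd);

		kfree(cmd);
		put_task_struct(task);
	}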
> 
>> +		dev_err(&gpu->pdev->dev, "%s: offending task: %s (%s)\n",
>> +				gpu->name, task->comm, cmd);
>> +
>> +		snprintf(task_name, sizeof(task_name),
>> +				"offending task: %s (%s)", task->comm, cmd);
>> +
>> +		kfree(cmd);
>> +	}
>> +	rcu_read_unlock();
>> +
>> +	mutex_lock(&gpu->dev->struct_mutex);
>> +	msm_rd_dump_submit(priv->hangrd, submit, task_name);
>> +	mutex_unlock(&gpu->dev->struct_mutex);
>> +}
>> +
>> +void msm_sched_gpu_recovery(struct msm_gpu *gpu, struct msm_gem_submit *submit)
>> +{
>> +	int i;
>> +	static atomic_t gpu_recovery;
>> +
>> +	/* Check if a GPU recovery is already in progress */
>> +	if (!(atomic_cmpxchg(&gpu_recovery, 0, 1) == 0))
>> +		return;
>> +
>> +	/*
>> +	 * Pause the schedulers so we don't get new requests while the recovery
>> +	 * is in progress
>> +	 */
>> +	for (i = 0; i < gpu->nr_rings; i++)
>> +		kthread_park(gpu->rb[i]->sched.thread);
>> +
>> +	/*
>> +	 * Disable interrupts so we don't get interrupted while the recovery is
>> +	 * in progress
>> +	 */
>> +	disable_irq(gpu->irq);
>> +
>> +	dev_err(&gpu->pdev->dev, "%s: hangcheck recover!\n", gpu->name);
> 
> I don't know if we still need this line? Recovery seems to be standard operating
> procedure these days.
> 
>> +
>> +	if (submit)
>> +		dump_bad_submit(submit);
>> +
>> +	/*
>> +	 * For each ring, we do the following
>> +	 * a) Inform the scheduler to drop the hardware fence for the
>> +	 * submissions on its mirror list
>> +	 * b) The submit(job) list on the ring is not useful anymore
>> +	 * as we are going for a gpu reset, so we empty the submit list
>> +	 */
>> +	for (i = 0; i < gpu->nr_rings; i++) {
>> +		struct msm_gem_submit *job, *tmp;
>> +		unsigned long flags;
>> +		struct msm_ringbuffer *ring = gpu->rb[i];
>> +
>> +		/* a) Release the hardware fences */
>> +		drm_sched_hw_job_reset(&ring->sched,
>> +				(submit && submit->ring == ring) ?
>> +				&submit->sched_job : NULL);
>> +
>> +		/* b) Free up the ring submit list */
>> +		spin_lock_irqsave(&ring->lock, flags);
>> +
>> +		list_for_each_entry_safe(job, tmp, &ring->submits, node)
>> +			list_del(&job->node);
>> +
>> +		spin_unlock_irqrestore(&ring->lock, flags);
>> +	}
>> +
>> +	/* Power cycle and reset the GPU back to init state */
>> +	mutex_lock(&gpu->dev->struct_mutex);
>> +
>> +	pm_runtime_get_sync(&gpu->pdev->dev);
>> +	gpu->funcs->recover(gpu);
>> +	pm_runtime_put_sync(&gpu->pdev->dev);
>> +
>> +	mutex_unlock(&gpu->dev->struct_mutex);
>> +
>> +	/* Re-enable the interrupts once the gpu reset is complete */
>> +	enable_irq(gpu->irq);
>> +
>> +	/*
>> +	 * The GPU is hopefully back in good shape, now request the schedulers
>> +	 * to replay its mirror list, starting with the scheduler on the highest
>> +	 * priority ring
>> +	 */
>> +	for (i = 0; i < gpu->nr_rings; i++) {
>> +		drm_sched_job_recovery(&gpu->rb[i]->sched);
>> +		kthread_unpark(gpu->rb[i]->sched.thread);
>> +	}
>> +
>> +	atomic_set(&gpu_recovery, 0);
> 
> I think we need a smp_wmb() here to make sure everybody else sees the update.
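
If that is the direction taken, the end of the recovery path might look
like the sketch below; whether smp_wmb() or something like
atomic_set_release() is the better primitive is left for the next
revision.

	/*
	 * Sketch: order the recovery writes above before other CPUs can
	 * observe gpu_recovery == 0 (the reader side already has full
	 * ordering from atomic_cmpxchg()).
	 */
	smp_wmb();
	atomic_set(&gpu_recovery, 0);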
> 
>> +}
>> +
>> +static void msm_sched_timedout_job(struct drm_sched_job *bad_job)
>> +{
>> +	struct msm_gem_submit *submit = to_msm_gem_submit(bad_job);
>> +	struct msm_gpu *gpu = submit->gpu;
>> +	struct msm_ringbuffer *ring = submit->ring;
>> +
>> +	/*
>> +	 * If this submission completed in the mean time, then the timeout is
>> +	 * spurious
>> +	 */
>> +	if (submit->seqno <= submit->ring->memptrs->fence)
>> +		return;
>> +
>> +	dev_err(&gpu->pdev->dev, "%s: hangcheck detected gpu lockup rb %d!\n",
>> +			gpu->name, ring->id);
>> +	dev_err(&gpu->pdev->dev, "%s:     completed fence: %u\n",
>> +			gpu->name, ring->memptrs->fence);
>> +	dev_err(&gpu->pdev->dev, "%s:     submitted fence: %u\n",
>> +			gpu->name, ring->seqno);
>> +
>> +	msm_sched_gpu_recovery(gpu, submit);
>> +}
>> +
>> +static void msm_sched_free_job(struct drm_sched_job *sched_job)
>> +{
>> +	struct msm_gem_submit *submit = to_msm_gem_submit(sched_job);
>> +	struct msm_gpu *gpu = submit->gpu;
>> +	int i;
>> +
>> +	mutex_lock(&gpu->dev->struct_mutex);
>> +
>> +	for (i = 0; i < submit->nr_bos; i++) {
>> +		struct msm_gem_object *msm_obj = submit->bos[i].obj;
>> +
>> +		msm_gem_move_to_inactive(&msm_obj->base);
>> +	}
>> +
>> +	mutex_unlock(&gpu->dev->struct_mutex);
>> +
>> +	pm_runtime_mark_last_busy(&gpu->pdev->dev);
>> +	pm_runtime_put_autosuspend(&gpu->pdev->dev);
>> +
>> +	msm_gem_submit_free(submit);
>> +}
>> +
>> +static const struct drm_sched_backend_ops msm_sched_ops = {
>> +	.dependency = msm_sched_dependency,
>> +	.run_job = msm_sched_run_job,
>> +	.timedout_job = msm_sched_timedout_job,
>> +	.free_job = msm_sched_free_job,
>> +};
>> +
>> +int msm_sched_job_init(struct drm_sched_job *sched_job)
>> +{
>> +	int ret;
>> +	struct msm_gem_submit *submit = to_msm_gem_submit(sched_job);
>> +	struct msm_ringbuffer *ring = submit->ring;
>> +
>> +	ret = drm_sched_job_init(sched_job, &ring->sched,
>> +			&submit->queue->sched_entity, submit->ctx);
>> +	if (ret)
>> +		return ret;
>> +
>> +	submit->out_fence = dma_fence_get(&submit->sched_job.s_fence->finished);
>> +
>> +	mutex_lock(&ring->fence_idr_lock);
>> +	submit->out_fence_id = idr_alloc_cyclic(&ring->fence_idr,
>> +						submit->out_fence, 0, INT_MAX,
>> +						GFP_KERNEL);
>> +	mutex_unlock(&ring->fence_idr_lock);
>> +
>> +	if (submit->out_fence_id < 0) {
>> +		/*
>> +		 * TODO: The scheduler's finished fence leaks here since the
>> +		 * job will not be pushed to the queue. Need to update scheduler
>> +		 * to fix this cleanly(?)
>> +		 */
> 
> How would you propose to fix this?
The problem arises because the scheduler code provides an API to create 
a sched_job, but no API to clean one up; instead it cleans up the 
sched_job implicitly for us.

I was thinking of this change:
a) Add a drm_sched_job_cleanup() API which drops the last reference on 
the finished fence (dma_fence_put(finished_fence)).
b) Call this new function from this failure case here as well as from 
msm_sched_free_job().
c) Remove the dma_fence_put(finished_fence) from drm_sched_job_finish(), 
since it is now handled by (b).

This should handle both cases. (a) and (c) require changes in the 
scheduler and will impact other drivers too, so I did not try it out 
as part of this patch series.

I will make another patch and share it.
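
A minimal sketch of (a) and (b); the exact shape of
drm_sched_job_cleanup() is an assumption here and would be settled in
that separate scheduler patch:

	/* (a) proposed scheduler helper, sketch only */
	void drm_sched_job_cleanup(struct drm_sched_job *job)
	{
		dma_fence_put(&job->s_fence->finished);
		job->s_fence = NULL;
	}

	/* (b) the failure path in msm_sched_job_init() then becomes */
	if (submit->out_fence_id < 0) {
		drm_sched_job_cleanup(sched_job);

		dma_fence_put(submit->out_fence);
		submit->out_fence = NULL;
		return -ENOMEM;
	}

msm_sched_free_job() would make the same drm_sched_job_cleanup() call
just before msm_gem_submit_free().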
> 
>> +		dma_fence_put(submit->out_fence);
>> +		submit->out_fence = NULL;
>> +		return -ENOMEM;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +void msm_sched_push_job(struct msm_gem_submit *submit)
>> +{
>> +	return drm_sched_entity_push_job(&submit->sched_job,
>> +			&submit->queue->sched_entity);
>> +}
>> +
>> +int msm_sched_init(struct drm_gpu_scheduler *sched, const char *name)
>> +{
> 
> 
>> +	return drm_sched_init(sched, &msm_sched_ops, 4, 0,
>> +			msecs_to_jiffies(500), name);
> 
> Okay, I see where the ring->name comes in.  I don't like it but at least it
> is a relatively low cost option if you share it with the fence names.  Still...
> sigh.
> 
>> +}
>> +
>> +void msm_sched_cleanup(struct drm_gpu_scheduler *sched)
>> +{
>> +	drm_sched_fini(sched);
>> +}
> 
> I don't think this function is needed - I think we're smart enough to call
> drm_sched_fini directly.
> 
>> +/*
>> + * Create a new entity on the schedulers normal priority runqueue. For now we
>> + * choose to always use the normal run queue priority, but in future its
>> + * possible with some extension to the msm drm interface, to create the
>> + * submitqueue(entities) of different priorities on the same ring, thereby
>> + * allowing to priortize and schedule submitqueues belonging to the same ring
>> + */
>> +int msm_sched_entity_init(struct msm_gpu *gpu,
>> +		struct drm_sched_entity *sched_entity, int prio)
>> +{
>> +	struct drm_gpu_scheduler *sched = &gpu->rb[prio]->sched;
>> +
>> +	return drm_sched_entity_init(sched, sched_entity,
>> +			&sched->sched_rq[DRM_SCHED_PRIORITY_NORMAL], NULL);
>> +}
>> +
>> +void msm_sched_entity_cleanup(struct msm_gpu *gpu,
>> +		struct drm_sched_entity *sched_entity, int prio)
>> +{
>> +	struct drm_gpu_scheduler *sched = &gpu->rb[prio]->sched;
>> +
>> +	drm_sched_entity_fini(sched, sched_entity);
>> +}
>> diff --git a/drivers/gpu/drm/msm/msm_sched.h b/drivers/gpu/drm/msm/msm_sched.h
>> new file mode 100644
>> index 0000000..6ab2728
>> --- /dev/null
>> +++ b/drivers/gpu/drm/msm/msm_sched.h
>> @@ -0,0 +1,18 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
> 
> // SPDX-License-Identifier: GPL-2.0
> 
>> +/* Copyright (c) 2018, The Linux Foundation. All rights reserved. */
>> +
>> +#ifndef __MSM_SCHED_H__
>> +#define __MSM_SCHED_H__
>> +
>> +#include <drm/gpu_scheduler.h>
>> +
>> +int msm_sched_init(struct drm_gpu_scheduler *sched, const char *name);
>> +void msm_sched_cleanup(struct drm_gpu_scheduler *sched);
>> +int msm_sched_entity_init(struct msm_gpu *gpu, struct drm_sched_entity *queue,
>> +		int prio);
>> +void msm_sched_entity_cleanup(struct msm_gpu *gpu,
>> +		struct drm_sched_entity *queue, int prio);
>> +int msm_sched_job_init(struct drm_sched_job *sched_job);
>> +void msm_sched_push_job(struct msm_gem_submit *submit);
>> +void msm_sched_gpu_recovery(struct msm_gpu *gpu, struct msm_gem_submit *submit);
>> +#endif /* __MSM_SCHED_H__ */
>> diff --git a/drivers/gpu/drm/msm/msm_submitqueue.c b/drivers/gpu/drm/msm/msm_submitqueue.c
>> index 325da44..b6eb13e 100644
>> --- a/drivers/gpu/drm/msm/msm_submitqueue.c
>> +++ b/drivers/gpu/drm/msm/msm_submitqueue.c
>> @@ -19,6 +19,7 @@ void msm_submitqueue_destroy(struct kref *kref)
>>   	struct msm_gpu_submitqueue *queue = container_of(kref,
>>   		struct msm_gpu_submitqueue, ref);
>>   
>> +	msm_sched_entity_cleanup(queue->gpu, &queue->sched_entity, queue->prio);
>>   	kfree(queue);
>>   }
>>   
>> @@ -65,6 +66,7 @@ int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx,
>>   {
>>   	struct msm_drm_private *priv = drm->dev_private;
>>   	struct msm_gpu_submitqueue *queue;
>> +	struct msm_gpu *gpu = priv->gpu;
>>   
>>   	if (!ctx)
>>   		return -ENODEV;
>> @@ -77,13 +79,16 @@ int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx,
>>   	kref_init(&queue->ref);
>>   	queue->flags = flags;
>>   
>> -	if (priv->gpu) {
>> -		if (prio >= priv->gpu->nr_rings) {
>> -			kfree(queue);
>> -			return -EINVAL;
>> -		}
>> +	if (gpu) {
>> +		if (prio >= gpu->nr_rings)
>> +			goto fail;
>> +
>> +		if (msm_sched_entity_init(priv->gpu, &queue->sched_entity,
>> +					prio))
>> +			goto fail;
>>   
>>   		queue->prio = prio;
>> +		queue->gpu = gpu;
>>   	}
>>   
>>   	write_lock(&ctx->queuelock);
>> @@ -98,6 +103,9 @@ int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx,
>>   	write_unlock(&ctx->queuelock);
>>   
>>   	return 0;
>> +fail:
>> +	kfree(queue);
>> +	return -EINVAL;
>>   }
>>   
>>   int msm_submitqueue_init(struct drm_device *drm, struct msm_file_private *ctx)
>> -- 
>> 1.9.1
>>
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/msm/Kconfig b/drivers/gpu/drm/msm/Kconfig
index 38cbde9..0d01810 100644
--- a/drivers/gpu/drm/msm/Kconfig
+++ b/drivers/gpu/drm/msm/Kconfig
@@ -15,6 +15,7 @@  config DRM_MSM
 	select SND_SOC_HDMI_CODEC if SND_SOC
 	select SYNC_FILE
 	select PM_OPP
+	select DRM_SCHED
 	default y
 	help
 	  DRM/KMS driver for MSM/snapdragon.
diff --git a/drivers/gpu/drm/msm/Makefile b/drivers/gpu/drm/msm/Makefile
index cd40c05..71ed921 100644
--- a/drivers/gpu/drm/msm/Makefile
+++ b/drivers/gpu/drm/msm/Makefile
@@ -60,7 +60,8 @@  msm-y := \
 	msm_perf.o \
 	msm_rd.o \
 	msm_ringbuffer.o \
-	msm_submitqueue.o
+	msm_submitqueue.o \
+	msm_sched.o
 
 msm-$(CONFIG_DEBUG_FS) += adreno/a5xx_debugfs.o
 
diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
index b2da1fb..e461a9c 100644
--- a/drivers/gpu/drm/msm/msm_drv.h
+++ b/drivers/gpu/drm/msm/msm_drv.h
@@ -213,8 +213,7 @@  struct drm_gem_object *msm_gem_prime_import_sg_table(struct drm_device *dev,
 int msm_gem_madvise(struct drm_gem_object *obj, unsigned madv);
 int msm_gem_sync_object(struct drm_gem_object *obj,
 		struct msm_fence_context *fctx, bool exclusive);
-void msm_gem_move_to_active(struct drm_gem_object *obj,
-		struct msm_gpu *gpu, bool exclusive, struct dma_fence *fence);
+void msm_gem_move_to_active(struct drm_gem_object *obj, struct msm_gpu *gpu);
 void msm_gem_move_to_inactive(struct drm_gem_object *obj);
 int msm_gem_cpu_prep(struct drm_gem_object *obj, uint32_t op, ktime_t *timeout);
 int msm_gem_cpu_fini(struct drm_gem_object *obj);
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
index f583bb4..7a12f30 100644
--- a/drivers/gpu/drm/msm/msm_gem.c
+++ b/drivers/gpu/drm/msm/msm_gem.c
@@ -663,16 +663,12 @@  int msm_gem_sync_object(struct drm_gem_object *obj,
 	return 0;
 }
 
-void msm_gem_move_to_active(struct drm_gem_object *obj,
-		struct msm_gpu *gpu, bool exclusive, struct dma_fence *fence)
+void msm_gem_move_to_active(struct drm_gem_object *obj, struct msm_gpu *gpu)
 {
 	struct msm_gem_object *msm_obj = to_msm_bo(obj);
 	WARN_ON(msm_obj->madv != MSM_MADV_WILLNEED);
 	msm_obj->gpu = gpu;
-	if (exclusive)
-		reservation_object_add_excl_fence(msm_obj->resv, fence);
-	else
-		reservation_object_add_shared_fence(msm_obj->resv, fence);
+
 	list_del_init(&msm_obj->mm_list);
 	list_add_tail(&msm_obj->mm_list, &gpu->active_list);
 }
diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h
index cae3aa5..8c6269f 100644
--- a/drivers/gpu/drm/msm/msm_gem.h
+++ b/drivers/gpu/drm/msm/msm_gem.h
@@ -20,6 +20,7 @@ 
 
 #include <linux/kref.h>
 #include <linux/reservation.h>
+#include <drm/gpu_scheduler.h>
 #include "msm_drv.h"
 
 /* Additional internal-use only BO flags: */
@@ -136,6 +137,7 @@  enum msm_gem_lock {
  * lasts for the duration of the submit-ioctl.
  */
 struct msm_gem_submit {
+	struct drm_sched_job sched_job;
 	struct drm_device *dev;
 	struct msm_file_private *ctx;
 	struct msm_gpu *gpu;
@@ -144,6 +146,7 @@  struct msm_gem_submit {
 	struct ww_acquire_ctx ticket;
 	uint32_t seqno;		/* Sequence number of the submit on the ring */
 	struct dma_fence *hw_fence;
+	struct dma_fence *in_fence, *out_fence;
 	int out_fence_id;
 	struct msm_gpu_submitqueue *queue;
 	struct pid *pid;    /* submitting process */
@@ -162,6 +165,9 @@  struct msm_gem_submit {
 		uint32_t flags;
 		struct msm_gem_object *obj;
 		uint64_t iova;
+		struct dma_fence *excl;
+		uint32_t nr_shared;
+		struct dma_fence **shared;
 	} bos[0];
 };
 
diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c
index 7931c2a..dff945c 100644
--- a/drivers/gpu/drm/msm/msm_gem_submit.c
+++ b/drivers/gpu/drm/msm/msm_gem_submit.c
@@ -65,23 +65,37 @@  void msm_gem_submit_free(struct msm_gem_submit *submit)
 {
 	int i;
 
+	mutex_lock(&submit->ring->fence_idr_lock);
 	idr_remove(&submit->ring->fence_idr, submit->out_fence_id);
-
-	dma_fence_put(submit->hw_fence);
+	mutex_unlock(&submit->ring->fence_idr_lock);
 
 	for (i = 0; i < submit->nr_bos; i++) {
+		int j;
+
 		struct msm_gem_object *msm_obj = submit->bos[i].obj;
 
 		if (submit->bos[i].flags & BO_PINNED)
 			msm_gem_put_iova(&msm_obj->base, submit->gpu->aspace);
 
-		drm_gem_object_put(&msm_obj->base);
+		drm_gem_object_put_unlocked(&msm_obj->base);
+
+		/*
+		 * While we are at it, clear the saved exclusive and shared
+		 * fences if any
+		 */
+		dma_fence_put(submit->bos[i].excl);
+
+		for (j = 0; j < submit->bos[i].nr_shared; j++)
+			dma_fence_put(submit->bos[i].shared[j]);
+
+		kfree(submit->bos[i].shared);
 	}
 
-	list_del(&submit->node);
 	put_pid(submit->pid);
 	msm_submitqueue_put(submit->queue);
 
+	dma_fence_put(submit->in_fence);
+	dma_fence_put(submit->out_fence);
 	kfree(submit);
 }
 
@@ -238,6 +252,27 @@  static int submit_lock_objects(struct msm_gem_submit *submit)
 	return ret;
 }
 
+static void submit_attach_fences_and_unlock(struct msm_gem_submit *submit)
+{
+	int i;
+
+	for (i = 0; i < submit->nr_bos; i++) {
+		struct msm_gem_object *msm_obj = submit->bos[i].obj;
+
+		if (submit->bos[i].flags & MSM_SUBMIT_BO_WRITE)
+			reservation_object_add_excl_fence(msm_obj->resv,
+					submit->out_fence);
+		else
+			reservation_object_add_shared_fence(msm_obj->resv,
+					submit->out_fence);
+
+		if (submit->bos[i].flags & BO_LOCKED) {
+			ww_mutex_unlock(&msm_obj->resv->lock);
+			submit->bos[i].flags &= ~BO_LOCKED;
+		}
+	}
+}
+
 static int submit_fence_sync(struct msm_gem_submit *submit, bool no_implicit)
 {
 	int i, ret = 0;
@@ -260,10 +295,24 @@  static int submit_fence_sync(struct msm_gem_submit *submit, bool no_implicit)
 		if (no_implicit)
 			continue;
 
-		ret = msm_gem_sync_object(&msm_obj->base, submit->ring->fctx,
-			write);
-		if (ret)
-			break;
+		if (write) {
+			/*
+			 * Save the per buffer shared and exclusive fences for
+			 * the scheduler thread to wait on.
+			 *
+			 * Note: The memory allocated for the storing the
+			 * shared fences will be released when the scheduler
+			 * is done waiting on all the saved fences.
+			 */
+			ret = reservation_object_get_fences_rcu(msm_obj->resv,
+					&submit->bos[i].excl,
+					&submit->bos[i].nr_shared,
+					&submit->bos[i].shared);
+			if (ret)
+				return ret;
+		} else
+			submit->bos[i].excl =
+				reservation_object_get_excl_rcu(msm_obj->resv);
 	}
 
 	return ret;
@@ -425,7 +474,6 @@  int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
 	struct msm_file_private *ctx = file->driver_priv;
 	struct msm_gem_submit *submit;
 	struct msm_gpu *gpu = priv->gpu;
-	struct dma_fence *in_fence = NULL;
 	struct sync_file *sync_file = NULL;
 	struct msm_gpu_submitqueue *queue;
 	struct msm_ringbuffer *ring;
@@ -457,32 +505,11 @@  int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
 
 	ring = gpu->rb[queue->prio];
 
-	if (args->flags & MSM_SUBMIT_FENCE_FD_IN) {
-		in_fence = sync_file_get_fence(args->fence_fd);
-
-		if (!in_fence)
-			return -EINVAL;
-
-		/*
-		 * Wait if the fence is from a foreign context, or if the fence
-		 * array contains any fence from a foreign context.
-		 */
-		if (!dma_fence_match_context(in_fence, ring->fctx->context)) {
-			ret = dma_fence_wait(in_fence, true);
-			if (ret)
-				return ret;
-		}
-	}
-
-	ret = mutex_lock_interruptible(&dev->struct_mutex);
-	if (ret)
-		return ret;
-
 	if (args->flags & MSM_SUBMIT_FENCE_FD_OUT) {
 		out_fence_fd = get_unused_fd_flags(O_CLOEXEC);
 		if (out_fence_fd < 0) {
 			ret = out_fence_fd;
-			goto out_unlock;
+			goto out_err;
 		}
 	}
 
@@ -490,7 +517,16 @@  int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
 			args->nr_cmds, ctx);
 	if (!submit) {
 		ret = -ENOMEM;
-		goto out_unlock;
+		goto out_err;
+	}
+
+	if (args->flags & MSM_SUBMIT_FENCE_FD_IN) {
+		submit->in_fence = sync_file_get_fence(args->fence_fd);
+
+		if (!submit->in_fence) {
+			ret = -EINVAL;
+			goto out;
+		}
 	}
 
 	if (args->flags & MSM_SUBMIT_SUDO)
@@ -573,28 +609,16 @@  int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
 
 	submit->nr_cmds = i;
 
-	msm_gpu_submit(submit);
-	if (IS_ERR(submit->hw_fence)) {
-		ret = PTR_ERR(submit->hw_fence);
-		submit->hw_fence = NULL;
+	ret = msm_sched_job_init(&submit->sched_job);
+	if (ret)
 		goto out;
-	}
 
-	/*
-	 * No protection should be okay here since this is protected by the big
-	 * GPU lock.
-	 */
-	submit->out_fence_id = idr_alloc_cyclic(&ring->fence_idr,
-			submit->hw_fence, 0, INT_MAX, GFP_KERNEL);
-	if (submit->out_fence_id < 0) {
-		ret = -ENOMEM;
-		goto out;
-	}
+	submit_attach_fences_and_unlock(submit);
 
 	args->fence = submit->out_fence_id;
 
 	if (args->flags & MSM_SUBMIT_FENCE_FD_OUT) {
-		sync_file = sync_file_create(submit->hw_fence);
+		sync_file = sync_file_create(submit->out_fence);
 		if (!sync_file) {
 			ret = -ENOMEM;
 			goto out;
@@ -604,14 +628,22 @@  int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
 	}
 
 out:
-	if (in_fence)
-		dma_fence_put(in_fence);
+	/*
+	 * Clean up the submit bits and pieces which are not needed beyond this
+	 * function context
+	 */
 	submit_cleanup(submit);
-	if (ret)
+
+	if (!ret)
+		/*
+		 * If we reached here, then all is well. Push the job to the
+		 * scheduler
+		 */
+		msm_sched_push_job(submit);
+	else
 		msm_gem_submit_free(submit);
-out_unlock:
+out_err:
 	if (ret && (out_fence_fd >= 0))
 		put_unused_fd(out_fence_fd);
-	mutex_unlock(&dev->struct_mutex);
 	return ret;
 }
diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
index cd5fe49..481a55c 100644
--- a/drivers/gpu/drm/msm/msm_gpu.c
+++ b/drivers/gpu/drm/msm/msm_gpu.c
@@ -295,12 +295,17 @@  static void update_fences(struct msm_gpu *gpu, struct msm_ringbuffer *ring,
 find_submit(struct msm_ringbuffer *ring, uint32_t fence)
 {
 	struct msm_gem_submit *submit;
+	unsigned long flags;
 
-	WARN_ON(!mutex_is_locked(&ring->gpu->dev->struct_mutex));
+	spin_lock_irqsave(&ring->lock, flags);
 
 	list_for_each_entry(submit, &ring->submits, node)
-		if (submit->seqno == fence)
+		if (submit->seqno == fence) {
+			spin_unlock_irqrestore(&ring->lock, flags);
 			return submit;
+		}
+
+	spin_unlock_irqrestore(&ring->lock, flags);
 
 	return NULL;
 }
@@ -316,6 +321,12 @@  static void recover_worker(struct work_struct *work)
 	struct msm_ringbuffer *cur_ring = gpu->funcs->active_ring(gpu);
 	int i;
 
+	submit = find_submit(cur_ring, cur_ring->memptrs->fence + 1);
+	return msm_sched_gpu_recovery(gpu, submit);
+
+	/*
+	 * The unused code below will be removed in a subsequent patch
+	 */
 	mutex_lock(&dev->struct_mutex);
 
 	dev_err(dev->dev, "%s: hangcheck recover!\n", gpu->name);
@@ -591,11 +602,36 @@  static void retire_worker(struct work_struct *work)
 	mutex_unlock(&dev->struct_mutex);
 }
 
-/* call from irq handler to schedule work to retire bo's */
+static void signal_hw_fences(struct msm_ringbuffer *ring)
+{
+	unsigned long flags;
+	struct msm_gem_submit *submit, *tmp;
+
+	spin_lock_irqsave(&ring->lock, flags);
+
+	list_for_each_entry_safe(submit, tmp, &ring->submits, node) {
+		if (submit->seqno > ring->memptrs->fence)
+			break;
+
+		list_del(&submit->node);
+
+		dma_fence_signal(submit->hw_fence);
+	}
+
+	spin_unlock_irqrestore(&ring->lock, flags);
+}
+
+/*
+ * Called from the IRQ context to signal hardware fences for the completed
+ * submits
+ */
 void msm_gpu_retire(struct msm_gpu *gpu)
 {
-	struct msm_drm_private *priv = gpu->dev->dev_private;
-	queue_work(priv->wq, &gpu->retire_work);
+	int i;
+
+	for (i = 0; i < gpu->nr_rings; i++)
+		signal_hw_fences(gpu->rb[i]);
+
 	update_sw_cntrs(gpu);
 }
 
@@ -606,25 +642,28 @@  struct dma_fence *msm_gpu_submit(struct msm_gem_submit *submit)
 	struct drm_device *dev = gpu->dev;
 	struct msm_drm_private *priv = dev->dev_private;
 	struct msm_ringbuffer *ring = submit->ring;
+	unsigned long flags;
 	int i;
 
-	WARN_ON(!mutex_is_locked(&dev->struct_mutex));
-
 	submit->hw_fence = msm_fence_alloc(ring->fctx);
 	if (IS_ERR(submit->hw_fence))
 		return submit->hw_fence;
 
-	pm_runtime_get_sync(&gpu->pdev->dev);
-
-	msm_gpu_hw_init(gpu);
+	spin_lock_irqsave(&ring->lock, flags);
+	list_add_tail(&submit->node, &ring->submits);
+	spin_unlock_irqrestore(&ring->lock, flags);
 
 	submit->seqno = ++ring->seqno;
 
-	list_add_tail(&submit->node, &ring->submits);
+	update_sw_cntrs(gpu);
 
-	msm_rd_dump_submit(priv->rd, submit, NULL);
+	mutex_lock(&dev->struct_mutex);
 
-	update_sw_cntrs(gpu);
+	pm_runtime_get_sync(&gpu->pdev->dev);
+
+	msm_gpu_hw_init(gpu);
+
+	msm_rd_dump_submit(priv->rd, submit, NULL);
 
 	for (i = 0; i < submit->nr_bos; i++) {
 		struct msm_gem_object *msm_obj = submit->bos[i].obj;
@@ -634,16 +673,13 @@  struct dma_fence *msm_gpu_submit(struct msm_gem_submit *submit)
 		 */
 		WARN_ON(is_active(msm_obj) && (msm_obj->gpu != gpu));
 
-		if (submit->bos[i].flags & MSM_SUBMIT_BO_WRITE)
-			msm_gem_move_to_active(&msm_obj->base, gpu, true, submit->hw_fence);
-		else if (submit->bos[i].flags & MSM_SUBMIT_BO_READ)
-			msm_gem_move_to_active(&msm_obj->base, gpu, false, submit->hw_fence);
+		msm_gem_move_to_active(&msm_obj->base, gpu);
 	}
 
 	gpu->funcs->submit(gpu, submit, submit->ctx);
 	priv->lastctx = submit->ctx;
 
-	hangcheck_timer_reset(gpu);
+	mutex_unlock(&dev->struct_mutex);
 
 	return submit->hw_fence;
 }
diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
index dd55979..3bd1920 100644
--- a/drivers/gpu/drm/msm/msm_gpu.h
+++ b/drivers/gpu/drm/msm/msm_gpu.h
@@ -173,6 +173,8 @@  struct msm_gpu_submitqueue {
 	int faults;
 	struct list_head node;
 	struct kref ref;
+	struct msm_gpu *gpu;
+	struct drm_sched_entity sched_entity;
 };
 
 static inline void gpu_write(struct msm_gpu *gpu, u32 reg, u32 data)
diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.c b/drivers/gpu/drm/msm/msm_ringbuffer.c
index 0889766..ef2bf10 100644
--- a/drivers/gpu/drm/msm/msm_ringbuffer.c
+++ b/drivers/gpu/drm/msm/msm_ringbuffer.c
@@ -56,9 +56,14 @@  struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id,
 
 	snprintf(ring->name, sizeof(ring->name), "msm-gpu-ring-%d", ring->id);
 
+	if (msm_sched_init(&ring->sched, ring->name) != 0) {
+		ret = -EINVAL;
+		goto fail;
+	}
 	ring->fctx = msm_fence_context_alloc(gpu->dev, ring->name);
 
 	idr_init(&ring->fence_idr);
+	mutex_init(&ring->fence_idr_lock);
 
 	return ring;
 
@@ -74,6 +79,7 @@  void msm_ringbuffer_destroy(struct msm_ringbuffer *ring)
 
 	msm_fence_context_free(ring->fctx);
 
+	msm_sched_cleanup(&ring->sched);
 	if (ring->bo) {
 		msm_gem_put_iova(ring->bo, ring->gpu->aspace);
 		msm_gem_put_vaddr(ring->bo);
@@ -81,6 +87,7 @@  void msm_ringbuffer_destroy(struct msm_ringbuffer *ring)
 	}
 
 	idr_destroy(&ring->fence_idr);
+	mutex_destroy(&ring->fence_idr_lock);
 
 	kfree(ring);
 }
diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.h b/drivers/gpu/drm/msm/msm_ringbuffer.h
index 523373b..10ae4a8 100644
--- a/drivers/gpu/drm/msm/msm_ringbuffer.h
+++ b/drivers/gpu/drm/msm/msm_ringbuffer.h
@@ -19,6 +19,7 @@ 
 #define __MSM_RINGBUFFER_H__
 
 #include "msm_drv.h"
+#include "msm_sched.h"
 
 #define rbmemptr(ring, member)  \
 	((ring)->memptrs_iova + offsetof(struct msm_rbmemptrs, member))
@@ -42,7 +43,9 @@  struct msm_ringbuffer {
 	uint64_t memptrs_iova;
 	struct msm_fence_context *fctx;
 	struct idr fence_idr;
+	struct mutex fence_idr_lock;
 	spinlock_t lock;
+	struct drm_gpu_scheduler sched;
 };
 
 struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id,
diff --git a/drivers/gpu/drm/msm/msm_sched.c b/drivers/gpu/drm/msm/msm_sched.c
new file mode 100644
index 0000000..8b805ce
--- /dev/null
+++ b/drivers/gpu/drm/msm/msm_sched.c
@@ -0,0 +1,313 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018, The Linux Foundation. All rights reserved. */
+
+#include "msm_gpu.h"
+#include "msm_gem.h"
+#include "msm_sched.h"
+
+#include <linux/string_helpers.h>
+#include <linux/kthread.h>
+
+struct msm_gem_submit *to_msm_gem_submit(struct drm_sched_job *sched_job)
+{
+	return container_of(sched_job, struct msm_gem_submit, sched_job);
+}
+
+/*
+ * Go through the bo's one by one and return the previously saved shared and
+ * exclusive fences. If the scheduler gets the fence, then it takes care of
+ * releasing the reference on the fence.
+ */
+static struct dma_fence *msm_sched_dependency(struct drm_sched_job *sched_job,
+		struct drm_sched_entity *entity)
+{
+	struct msm_gem_submit *submit = to_msm_gem_submit(sched_job);
+	struct dma_fence *fence;
+	int i;
+
+	if (submit->in_fence) {
+		fence = submit->in_fence;
+		submit->in_fence = NULL;
+
+		if (!dma_fence_is_signaled(fence))
+			return fence;
+
+		dma_fence_put(fence);
+	}
+
+	/* Return the previously saved shared and exclusive fences if any */
+	for (i = 0; i < submit->nr_bos; i++) {
+		int j;
+
+		if (submit->bos[i].excl) {
+			fence = submit->bos[i].excl;
+			submit->bos[i].excl = NULL;
+
+			if (!dma_fence_is_signaled(fence))
+				return fence;
+
+			dma_fence_put(fence);
+		}
+
+		for (j = 0; j < submit->bos[i].nr_shared; j++) {
+			if (!submit->bos[i].shared[j])
+				continue;
+
+			fence = submit->bos[i].shared[j];
+			submit->bos[i].shared[j] = NULL;
+
+			if (!dma_fence_is_signaled(fence))
+				return fence;
+
+			dma_fence_put(fence);
+		}
+
+		kfree(submit->bos[i].shared);
+		submit->bos[i].nr_shared = 0;
+		submit->bos[i].shared = NULL;
+	}
+
+	return NULL;
+}
+
+static struct dma_fence *msm_sched_run_job(struct drm_sched_job *sched_job)
+{
+	struct msm_gem_submit *submit = to_msm_gem_submit(sched_job);
+
+	return !sched_job->s_fence->finished.error ?
+		msm_gpu_submit(submit) : NULL;
+}
+
+static void dump_bad_submit(struct msm_gem_submit *submit)
+{
+	struct msm_gpu *gpu = submit->gpu;
+	struct drm_device *dev = gpu->dev;
+	struct msm_drm_private *priv = dev->dev_private;
+	struct task_struct *task;
+	char task_name[32] = {0};
+
+	rcu_read_lock();
+	task = pid_task(submit->pid, PIDTYPE_PID);
+	if (task) {
+		char *cmd;
+
+		cmd = kstrdup_quotable_cmdline(task, GFP_KERNEL);
+
+		dev_err(&gpu->pdev->dev, "%s: offending task: %s (%s)\n",
+				gpu->name, task->comm, cmd);
+
+		snprintf(task_name, sizeof(task_name),
+				"offending task: %s (%s)", task->comm, cmd);
+
+		kfree(cmd);
+	}
+	rcu_read_unlock();
+
+	mutex_lock(&gpu->dev->struct_mutex);
+	msm_rd_dump_submit(priv->hangrd, submit, task_name);
+	mutex_unlock(&gpu->dev->struct_mutex);
+}
+
+void msm_sched_gpu_recovery(struct msm_gpu *gpu, struct msm_gem_submit *submit)
+{
+	int i;
+	static atomic_t gpu_recovery;
+
+	/* Check if a GPU recovery is already in progress */
+	if (!(atomic_cmpxchg(&gpu_recovery, 0, 1) == 0))
+		return;
+
+	/*
+	 * Pause the schedulers so we don't get new requests while the recovery
+	 * is in progress
+	 */
+	for (i = 0; i < gpu->nr_rings; i++)
+		kthread_park(gpu->rb[i]->sched.thread);
+
+	/*
+	 * Disable interrupts so we don't get interrupted while the recovery is
+	 * in progress
+	 */
+	disable_irq(gpu->irq);
+
+	dev_err(&gpu->pdev->dev, "%s: hangcheck recover!\n", gpu->name);
+
+	if (submit)
+		dump_bad_submit(submit);
+
+	/*
+	 * For each ring, we do the following
+	 * a) Inform the scheduler to drop the hardware fence for the
+	 * submissions on its mirror list
+	 * b) The submit(job) list on the ring is not useful anymore
+	 * as we are going for a gpu reset, so we empty the submit list
+	 */
+	for (i = 0; i < gpu->nr_rings; i++) {
+		struct msm_gem_submit *job, *tmp;
+		unsigned long flags;
+		struct msm_ringbuffer *ring = gpu->rb[i];
+
+		/* a) Release the hardware fences */
+		drm_sched_hw_job_reset(&ring->sched,
+				(submit && submit->ring == ring) ?
+				&submit->sched_job : NULL);
+
+		/* b) Free up the ring submit list */
+		spin_lock_irqsave(&ring->lock, flags);
+
+		list_for_each_entry_safe(job, tmp, &ring->submits, node)
+			list_del(&job->node);
+
+		spin_unlock_irqrestore(&ring->lock, flags);
+	}
+
+	/* Power cycle and reset the GPU back to init state */
+	mutex_lock(&gpu->dev->struct_mutex);
+
+	pm_runtime_get_sync(&gpu->pdev->dev);
+	gpu->funcs->recover(gpu);
+	pm_runtime_put_sync(&gpu->pdev->dev);
+
+	mutex_unlock(&gpu->dev->struct_mutex);
+
+	/* Re-enable the interrupts once the gpu reset is complete */
+	enable_irq(gpu->irq);
+
+	/*
+	 * The GPU is hopefully back in good shape, now request the schedulers
+	 * to replay its mirror list, starting with the scheduler on the highest
+	 * priority ring
+	 */
+	for (i = 0; i < gpu->nr_rings; i++) {
+		drm_sched_job_recovery(&gpu->rb[i]->sched);
+		kthread_unpark(gpu->rb[i]->sched.thread);
+	}
+
+	atomic_set(&gpu_recovery, 0);
+}
+
+static void msm_sched_timedout_job(struct drm_sched_job *bad_job)
+{
+	struct msm_gem_submit *submit = to_msm_gem_submit(bad_job);
+	struct msm_gpu *gpu = submit->gpu;
+	struct msm_ringbuffer *ring = submit->ring;
+
+	/*
+	 * If this submission completed in the mean time, then the timeout is
+	 * spurious
+	 */
+	if (submit->seqno <= submit->ring->memptrs->fence)
+		return;
+
+	dev_err(&gpu->pdev->dev, "%s: hangcheck detected gpu lockup rb %d!\n",
+			gpu->name, ring->id);
+	dev_err(&gpu->pdev->dev, "%s:     completed fence: %u\n",
+			gpu->name, ring->memptrs->fence);
+	dev_err(&gpu->pdev->dev, "%s:     submitted fence: %u\n",
+			gpu->name, ring->seqno);
+
+	msm_sched_gpu_recovery(gpu, submit);
+}
+
+static void msm_sched_free_job(struct drm_sched_job *sched_job)
+{
+	struct msm_gem_submit *submit = to_msm_gem_submit(sched_job);
+	struct msm_gpu *gpu = submit->gpu;
+	int i;
+
+	mutex_lock(&gpu->dev->struct_mutex);
+
+	for (i = 0; i < submit->nr_bos; i++) {
+		struct msm_gem_object *msm_obj = submit->bos[i].obj;
+
+		msm_gem_move_to_inactive(&msm_obj->base);
+	}
+
+	mutex_unlock(&gpu->dev->struct_mutex);
+
+	pm_runtime_mark_last_busy(&gpu->pdev->dev);
+	pm_runtime_put_autosuspend(&gpu->pdev->dev);
+
+	msm_gem_submit_free(submit);
+}
+
+static const struct drm_sched_backend_ops msm_sched_ops = {
+	.dependency = msm_sched_dependency,
+	.run_job = msm_sched_run_job,
+	.timedout_job = msm_sched_timedout_job,
+	.free_job = msm_sched_free_job,
+};
+
+int msm_sched_job_init(struct drm_sched_job *sched_job)
+{
+	int ret;
+	struct msm_gem_submit *submit = to_msm_gem_submit(sched_job);
+	struct msm_ringbuffer *ring = submit->ring;
+
+	ret = drm_sched_job_init(sched_job, &ring->sched,
+			&submit->queue->sched_entity, submit->ctx);
+	if (ret)
+		return ret;
+
+	submit->out_fence = dma_fence_get(&submit->sched_job.s_fence->finished);
+
+	mutex_lock(&ring->fence_idr_lock);
+	submit->out_fence_id = idr_alloc_cyclic(&ring->fence_idr,
+						submit->out_fence, 0, INT_MAX,
+						GFP_KERNEL);
+	mutex_unlock(&ring->fence_idr_lock);
+
+	if (submit->out_fence_id < 0) {
+		/*
+		 * TODO: The scheduler's finished fence leaks here since the
+		 * job will not be pushed to the queue. Need to update scheduler
+		 * to fix this cleanly(?)
+		 */
+		dma_fence_put(submit->out_fence);
+		submit->out_fence = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+void msm_sched_push_job(struct msm_gem_submit *submit)
+{
+	return drm_sched_entity_push_job(&submit->sched_job,
+			&submit->queue->sched_entity);
+}
+
+int msm_sched_init(struct drm_gpu_scheduler *sched, const char *name)
+{
+	return drm_sched_init(sched, &msm_sched_ops, 4, 0,
+			msecs_to_jiffies(500), name);
+}
+
+void msm_sched_cleanup(struct drm_gpu_scheduler *sched)
+{
+	drm_sched_fini(sched);
+}
+
+/*
+ * Create a new entity on the schedulers normal priority runqueue. For now we
+ * choose to always use the normal run queue priority, but in future its
+ * possible with some extension to the msm drm interface, to create the
+ * submitqueue(entities) of different priorities on the same ring, thereby
+ * allowing to priortize and schedule submitqueues belonging to the same ring
+ */
+int msm_sched_entity_init(struct msm_gpu *gpu,
+		struct drm_sched_entity *sched_entity, int prio)
+{
+	struct drm_gpu_scheduler *sched = &gpu->rb[prio]->sched;
+
+	return drm_sched_entity_init(sched, sched_entity,
+			&sched->sched_rq[DRM_SCHED_PRIORITY_NORMAL], NULL);
+}
+
+void msm_sched_entity_cleanup(struct msm_gpu *gpu,
+		struct drm_sched_entity *sched_entity, int prio)
+{
+	struct drm_gpu_scheduler *sched = &gpu->rb[prio]->sched;
+
+	drm_sched_entity_fini(sched, sched_entity);
+}
diff --git a/drivers/gpu/drm/msm/msm_sched.h b/drivers/gpu/drm/msm/msm_sched.h
new file mode 100644
index 0000000..6ab2728
--- /dev/null
+++ b/drivers/gpu/drm/msm/msm_sched.h
@@ -0,0 +1,18 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018, The Linux Foundation. All rights reserved. */
+
+#ifndef __MSM_SCHED_H__
+#define __MSM_SCHED_H__
+
+#include <drm/gpu_scheduler.h>
+
+int msm_sched_init(struct drm_gpu_scheduler *sched, const char *name);
+void msm_sched_cleanup(struct drm_gpu_scheduler *sched);
+int msm_sched_entity_init(struct msm_gpu *gpu, struct drm_sched_entity *queue,
+		int prio);
+void msm_sched_entity_cleanup(struct msm_gpu *gpu,
+		struct drm_sched_entity *queue, int prio);
+int msm_sched_job_init(struct drm_sched_job *sched_job);
+void msm_sched_push_job(struct msm_gem_submit *submit);
+void msm_sched_gpu_recovery(struct msm_gpu *gpu, struct msm_gem_submit *submit);
+#endif /* __MSM_SCHED_H__ */
diff --git a/drivers/gpu/drm/msm/msm_submitqueue.c b/drivers/gpu/drm/msm/msm_submitqueue.c
index 325da44..b6eb13e 100644
--- a/drivers/gpu/drm/msm/msm_submitqueue.c
+++ b/drivers/gpu/drm/msm/msm_submitqueue.c
@@ -19,6 +19,7 @@  void msm_submitqueue_destroy(struct kref *kref)
 	struct msm_gpu_submitqueue *queue = container_of(kref,
 		struct msm_gpu_submitqueue, ref);
 
+	msm_sched_entity_cleanup(queue->gpu, &queue->sched_entity, queue->prio);
 	kfree(queue);
 }
 
@@ -65,6 +66,7 @@  int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx,
 {
 	struct msm_drm_private *priv = drm->dev_private;
 	struct msm_gpu_submitqueue *queue;
+	struct msm_gpu *gpu = priv->gpu;
 
 	if (!ctx)
 		return -ENODEV;
@@ -77,13 +79,16 @@  int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx,
 	kref_init(&queue->ref);
 	queue->flags = flags;
 
-	if (priv->gpu) {
-		if (prio >= priv->gpu->nr_rings) {
-			kfree(queue);
-			return -EINVAL;
-		}
+	if (gpu) {
+		if (prio >= gpu->nr_rings)
+			goto fail;
+
+		if (msm_sched_entity_init(priv->gpu, &queue->sched_entity,
+					prio))
+			goto fail;
 
 		queue->prio = prio;
+		queue->gpu = gpu;
 	}
 
 	write_lock(&ctx->queuelock);
@@ -98,6 +103,9 @@  int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx,
 	write_unlock(&ctx->queuelock);
 
 	return 0;
+fail:
+	kfree(queue);
+	return -EINVAL;
 }
 
 int msm_submitqueue_init(struct drm_device *drm, struct msm_file_private *ctx)