
[RFC,04/20] drm/sched: Convert drm scheduler to use a work queue rather than kthread

Message ID 20221222222127.34560-5-matthew.brost@intel.com (mailing list archive)
State New, archived
Series: Initial Xe driver submission

Commit Message

Matthew Brost Dec. 22, 2022, 10:21 p.m. UTC
In XE, the new Intel GPU driver, a choice has been made to have a 1 to 1
mapping between a drm_gpu_scheduler and drm_sched_entity. At first this
seems a bit odd, but let us explain the reasoning below.

1. In XE the submission order from multiple drm_sched_entity is not
guaranteed to match the completion order, even if targeting the same
hardware engine. This is because in XE we have a firmware scheduler, the
GuC, which is allowed to reorder, timeslice, and preempt submissions. If
a shared drm_gpu_scheduler is used across multiple drm_sched_entity, the
TDR falls apart as the TDR expects submission order == completion order.
Using a dedicated drm_gpu_scheduler per drm_sched_entity solves this
problem.

2. In XE, submissions are done by programming a ring buffer (circular
buffer), and a drm_gpu_scheduler provides a limit on the number of
in-flight jobs; if that limit is set to RING_SIZE / MAX_SIZE_PER_JOB, we
get flow control on the ring for free.

A problem with this design is that currently a drm_gpu_scheduler uses a
kthread for submission / job cleanup. This doesn't scale if a large
number of drm_gpu_scheduler instances are used. To work around the
scaling issue, use a worker rather than a kthread for submission / job
cleanup.

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  14 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  12 +-
 drivers/gpu/drm/scheduler/sched_main.c      | 124 ++++++++++++--------
 include/drm/gpu_scheduler.h                 |  13 +-
 4 files changed, 93 insertions(+), 70 deletions(-)
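
To make the 1:1 model and the flow-control trick concrete, below is a
minimal sketch (not code from this series) of how a driver could pair one
drm_gpu_scheduler with one drm_sched_entity. my_engine, my_engine_init and
my_sched_ops are hypothetical names; RING_SIZE / MAX_SIZE_PER_JOB is the
ratio named in the commit message.

#include <drm/gpu_scheduler.h>

/* Hypothetical driver object embedding a dedicated scheduler and the
 * single entity bound to it; my_sched_ops (a drm_sched_backend_ops)
 * is assumed to be defined elsewhere.
 */
struct my_engine {
        struct drm_gpu_scheduler sched;
        struct drm_sched_entity entity;
        const char *name;
};

static int my_engine_init(struct my_engine *e, struct device *dev)
{
        struct drm_gpu_scheduler *sched_list[] = { &e->sched };
        int ret;

        /* hw_submission caps the number of in-flight jobs; sizing it
         * as RING_SIZE / MAX_SIZE_PER_JOB flow-controls the ring.
         */
        ret = drm_sched_init(&e->sched, &my_sched_ops,
                             RING_SIZE / MAX_SIZE_PER_JOB,
                             0 /* hang_limit */,
                             msecs_to_jiffies(500) /* job timeout */,
                             NULL /* timeout_wq: use default */,
                             NULL /* score */, e->name, dev);
        if (ret)
                return ret;

        /* The one and only entity ever served by this scheduler. */
        return drm_sched_entity_init(&e->entity, DRM_SCHED_PRIORITY_NORMAL,
                                     sched_list, 1, NULL /* guilty */);
}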

Comments

Rob Clark Dec. 23, 2022, 5:42 p.m. UTC | #1
On Thu, Dec 22, 2022 at 2:29 PM Matthew Brost <matthew.brost@intel.com> wrote:
>
> In XE, the new Intel GPU driver, a choice has been made to have a 1 to 1
> mapping between a drm_gpu_scheduler and drm_sched_entity. At first this
> seems a bit odd, but let us explain the reasoning below.
>
> 1. In XE the submission order from multiple drm_sched_entity is not
> guaranteed to match the completion order, even if targeting the same
> hardware engine. This is because in XE we have a firmware scheduler, the
> GuC, which is allowed to reorder, timeslice, and preempt submissions. If
> a shared drm_gpu_scheduler is used across multiple drm_sched_entity, the
> TDR falls apart as the TDR expects submission order == completion order.
> Using a dedicated drm_gpu_scheduler per drm_sched_entity solves this
> problem.
>
> 2. In XE, submissions are done by programming a ring buffer (circular
> buffer), and a drm_gpu_scheduler provides a limit on the number of
> in-flight jobs; if that limit is set to RING_SIZE / MAX_SIZE_PER_JOB, we
> get flow control on the ring for free.
>
> A problem with this design is that currently a drm_gpu_scheduler uses a
> kthread for submission / job cleanup. This doesn't scale if a large
> number of drm_gpu_scheduler instances are used. To work around the
> scaling issue, use a worker rather than a kthread for submission / job
> cleanup.

You might want to enable CONFIG_DRM_MSM in your kconfig, I think you
missed a part

BR,
-R

> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  14 +--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  12 +-
>  drivers/gpu/drm/scheduler/sched_main.c      | 124 ++++++++++++--------
>  include/drm/gpu_scheduler.h                 |  13 +-
>  4 files changed, 93 insertions(+), 70 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index f60753f97ac5..9c2a10aeb0b3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -1489,9 +1489,9 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)
>         for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
>                 struct amdgpu_ring *ring = adev->rings[i];
>
> -               if (!ring || !ring->sched.thread)
> +               if (!ring || !ring->sched.ready)
>                         continue;
> -               kthread_park(ring->sched.thread);
> +               drm_sched_run_wq_stop(&ring->sched);
>         }
>
>         seq_printf(m, "run ib test:\n");
> @@ -1505,9 +1505,9 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)
>         for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
>                 struct amdgpu_ring *ring = adev->rings[i];
>
> -               if (!ring || !ring->sched.thread)
> +               if (!ring || !ring->sched.ready)
>                         continue;
> -               kthread_unpark(ring->sched.thread);
> +               drm_sched_run_wq_start(&ring->sched);
>         }
>
>         up_write(&adev->reset_domain->sem);
> @@ -1727,7 +1727,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
>
>         ring = adev->rings[val];
>
> -       if (!ring || !ring->funcs->preempt_ib || !ring->sched.thread)
> +       if (!ring || !ring->funcs->preempt_ib || !ring->sched.ready)
>                 return -EINVAL;
>
>         /* the last preemption failed */
> @@ -1745,7 +1745,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
>                 goto pro_end;
>
>         /* stop the scheduler */
> -       kthread_park(ring->sched.thread);
> +       drm_sched_run_wq_stop(&ring->sched);
>
>         /* preempt the IB */
>         r = amdgpu_ring_preempt_ib(ring);
> @@ -1779,7 +1779,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
>
>  failure:
>         /* restart the scheduler */
> -       kthread_unpark(ring->sched.thread);
> +       drm_sched_run_wq_start(&ring->sched);
>
>         up_read(&adev->reset_domain->sem);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 076ae400d099..9552929ccf87 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4577,7 +4577,7 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
>         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                 struct amdgpu_ring *ring = adev->rings[i];
>
> -               if (!ring || !ring->sched.thread)
> +               if (!ring || !ring->sched.ready)
>                         continue;
>
>                 spin_lock(&ring->sched.job_list_lock);
> @@ -4708,7 +4708,7 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                 struct amdgpu_ring *ring = adev->rings[i];
>
> -               if (!ring || !ring->sched.thread)
> +               if (!ring || !ring->sched.ready)
>                         continue;
>
>                 /*clear job fence from fence drv to avoid force_completion
> @@ -5247,7 +5247,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                         struct amdgpu_ring *ring = tmp_adev->rings[i];
>
> -                       if (!ring || !ring->sched.thread)
> +                       if (!ring || !ring->sched.ready)
>                                 continue;
>
>                         drm_sched_stop(&ring->sched, job ? &job->base : NULL);
> @@ -5321,7 +5321,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                         struct amdgpu_ring *ring = tmp_adev->rings[i];
>
> -                       if (!ring || !ring->sched.thread)
> +                       if (!ring || !ring->sched.ready)
>                                 continue;
>
>                         drm_sched_start(&ring->sched, true);
> @@ -5648,7 +5648,7 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
>                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                         struct amdgpu_ring *ring = adev->rings[i];
>
> -                       if (!ring || !ring->sched.thread)
> +                       if (!ring || !ring->sched.ready)
>                                 continue;
>
>                         drm_sched_stop(&ring->sched, NULL);
> @@ -5776,7 +5776,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
>         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                 struct amdgpu_ring *ring = adev->rings[i];
>
> -               if (!ring || !ring->sched.thread)
> +               if (!ring || !ring->sched.ready)
>                         continue;
>
>                 drm_sched_start(&ring->sched, true);
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index 27d52ffbb808..8c64045d0692 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -44,7 +44,6 @@
>   * The jobs in a entity are always scheduled in the order that they were pushed.
>   */
>
> -#include <linux/kthread.h>
>  #include <linux/wait.h>
>  #include <linux/sched.h>
>  #include <linux/completion.h>
> @@ -251,6 +250,53 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
>         return rb ? rb_entry(rb, struct drm_sched_entity, rb_tree_node) : NULL;
>  }
>
> +/**
> + * drm_sched_run_wq_stop - stop scheduler run worker
> + *
> + * @sched: scheduler instance to stop run worker
> + */
> +void drm_sched_run_wq_stop(struct drm_gpu_scheduler *sched)
> +{
> +       sched->pause_run_wq = true;
> +       smp_wmb();
> +
> +       cancel_work_sync(&sched->work_run);
> +}
> +EXPORT_SYMBOL(drm_sched_run_wq_stop);
> +
> +/**
> + * drm_sched_run_wq_start - start scheduler run worker
> + *
> + * @sched: scheduler instance to start run worker
> + */
> +void drm_sched_run_wq_start(struct drm_gpu_scheduler *sched)
> +{
> +       sched->pause_run_wq = false;
> +       smp_wmb();
> +
> +       queue_work(sched->run_wq, &sched->work_run);
> +}
> +EXPORT_SYMBOL(drm_sched_run_wq_start);
> +
> +/**
> + * drm_sched_run_wq_queue - queue scheduler run worker
> + *
> + * @sched: scheduler instance to queue run worker
> + */
> +static void drm_sched_run_wq_queue(struct drm_gpu_scheduler *sched)
> +{
> +       smp_rmb();
> +
> +       /*
> +        * Try not to schedule work if pause_run_wq set but not the end of world
> +        * if we do as either it will be cancelled by the above
> +        * cancel_work_sync, or drm_sched_main turns into a NOP while
> +        * pause_run_wq is set.
> +        */
> +       if (!sched->pause_run_wq)
> +               queue_work(sched->run_wq, &sched->work_run);
> +}
> +
>  /**
>   * drm_sched_job_done - complete a job
>   * @s_job: pointer to the job which is done
> @@ -270,7 +316,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job)
>         dma_fence_get(&s_fence->finished);
>         drm_sched_fence_finished(s_fence);
>         dma_fence_put(&s_fence->finished);
> -       wake_up_interruptible(&sched->wake_up_worker);
> +       drm_sched_run_wq_queue(sched);
>  }
>
>  /**
> @@ -433,7 +479,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>  {
>         struct drm_sched_job *s_job, *tmp;
>
> -       kthread_park(sched->thread);
> +       drm_sched_run_wq_stop(sched);
>
>         /*
>          * Reinsert back the bad job here - now it's safe as
> @@ -546,7 +592,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
>                 spin_unlock(&sched->job_list_lock);
>         }
>
> -       kthread_unpark(sched->thread);
> +       drm_sched_run_wq_start(sched);
>  }
>  EXPORT_SYMBOL(drm_sched_start);
>
> @@ -831,7 +877,7 @@ static bool drm_sched_ready(struct drm_gpu_scheduler *sched)
>  void drm_sched_wakeup(struct drm_gpu_scheduler *sched)
>  {
>         if (drm_sched_ready(sched))
> -               wake_up_interruptible(&sched->wake_up_worker);
> +               drm_sched_run_wq_queue(sched);
>  }
>
>  /**
> @@ -941,60 +987,42 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
>  }
>  EXPORT_SYMBOL(drm_sched_pick_best);
>
> -/**
> - * drm_sched_blocked - check if the scheduler is blocked
> - *
> - * @sched: scheduler instance
> - *
> - * Returns true if blocked, otherwise false.
> - */
> -static bool drm_sched_blocked(struct drm_gpu_scheduler *sched)
> -{
> -       if (kthread_should_park()) {
> -               kthread_parkme();
> -               return true;
> -       }
> -
> -       return false;
> -}
> -
>  /**
>   * drm_sched_main - main scheduler thread
>   *
>   * @param: scheduler instance
> - *
> - * Returns 0.
>   */
> -static int drm_sched_main(void *param)
> +static void drm_sched_main(struct work_struct *w)
>  {
> -       struct drm_gpu_scheduler *sched = (struct drm_gpu_scheduler *)param;
> +       struct drm_gpu_scheduler *sched =
> +               container_of(w, struct drm_gpu_scheduler, work_run);
>         int r;
>
> -       sched_set_fifo_low(current);
> -
> -       while (!kthread_should_stop()) {
> -               struct drm_sched_entity *entity = NULL;
> +       while (!READ_ONCE(sched->pause_run_wq)) {
> +               struct drm_sched_entity *entity;
>                 struct drm_sched_fence *s_fence;
>                 struct drm_sched_job *sched_job;
>                 struct dma_fence *fence;
> -               struct drm_sched_job *cleanup_job = NULL;
> +               struct drm_sched_job *cleanup_job;
>
> -               wait_event_interruptible(sched->wake_up_worker,
> -                                        (cleanup_job = drm_sched_get_cleanup_job(sched)) ||
> -                                        (!drm_sched_blocked(sched) &&
> -                                         (entity = drm_sched_select_entity(sched))) ||
> -                                        kthread_should_stop());
> +               cleanup_job = drm_sched_get_cleanup_job(sched);
> +               entity = drm_sched_select_entity(sched);
>
>                 if (cleanup_job)
>                         sched->ops->free_job(cleanup_job);
>
> -               if (!entity)
> +               if (!entity) {
> +                       if (!cleanup_job)
> +                               break;
>                         continue;
> +               }
>
>                 sched_job = drm_sched_entity_pop_job(entity);
>
>                 if (!sched_job) {
>                         complete_all(&entity->entity_idle);
> +                       if (!cleanup_job)
> +                               break;
>                         continue;
>                 }
>
> @@ -1022,14 +1050,14 @@ static int drm_sched_main(void *param)
>                                           r);
>                 } else {
>                         if (IS_ERR(fence))
> -                               dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
> +                               dma_fence_set_error(&s_fence->finished,
> +                                                   PTR_ERR(fence));
>
>                         drm_sched_job_done(sched_job);
>                 }
>
>                 wake_up(&sched->job_scheduled);
>         }
> -       return 0;
>  }
>
>  /**
> @@ -1054,35 +1082,28 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>                    long timeout, struct workqueue_struct *timeout_wq,
>                    atomic_t *score, const char *name, struct device *dev)
>  {
> -       int i, ret;
> +       int i;
>         sched->ops = ops;
>         sched->hw_submission_limit = hw_submission;
>         sched->name = name;
>         sched->timeout = timeout;
>         sched->timeout_wq = timeout_wq ? : system_wq;
> +       sched->run_wq = system_wq;      /* FIXME: Let user pass this in */
>         sched->hang_limit = hang_limit;
>         sched->score = score ? score : &sched->_score;
>         sched->dev = dev;
>         for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_COUNT; i++)
>                 drm_sched_rq_init(sched, &sched->sched_rq[i]);
>
> -       init_waitqueue_head(&sched->wake_up_worker);
>         init_waitqueue_head(&sched->job_scheduled);
>         INIT_LIST_HEAD(&sched->pending_list);
>         spin_lock_init(&sched->job_list_lock);
>         atomic_set(&sched->hw_rq_count, 0);
>         INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
> +       INIT_WORK(&sched->work_run, drm_sched_main);
>         atomic_set(&sched->_score, 0);
>         atomic64_set(&sched->job_id_count, 0);
> -
> -       /* Each scheduler will run on a seperate kernel thread */
> -       sched->thread = kthread_run(drm_sched_main, sched, sched->name);
> -       if (IS_ERR(sched->thread)) {
> -               ret = PTR_ERR(sched->thread);
> -               sched->thread = NULL;
> -               DRM_DEV_ERROR(sched->dev, "Failed to create scheduler for %s.\n", name);
> -               return ret;
> -       }
> +       sched->pause_run_wq = false;
>
>         sched->ready = true;
>         return 0;
> @@ -1101,8 +1122,7 @@ void drm_sched_fini(struct drm_gpu_scheduler *sched)
>         struct drm_sched_entity *s_entity;
>         int i;
>
> -       if (sched->thread)
> -               kthread_stop(sched->thread);
> +       drm_sched_run_wq_stop(sched);
>
>         for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
>                 struct drm_sched_rq *rq = &sched->sched_rq[i];
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index ca857ec9e7eb..ff50f3c289cd 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -456,17 +456,16 @@ struct drm_sched_backend_ops {
>   * @timeout: the time after which a job is removed from the scheduler.
>   * @name: name of the ring for which this scheduler is being used.
>   * @sched_rq: priority wise array of run queues.
> - * @wake_up_worker: the wait queue on which the scheduler sleeps until a job
> - *                  is ready to be scheduled.
>   * @job_scheduled: once @drm_sched_entity_do_release is called the scheduler
>   *                 waits on this wait queue until all the scheduled jobs are
>   *                 finished.
>   * @hw_rq_count: the number of jobs currently in the hardware queue.
>   * @job_id_count: used to assign unique id to the each job.
> + * @run_wq: workqueue used to queue @work_run
>   * @timeout_wq: workqueue used to queue @work_tdr
> + * @work_run: schedules jobs and cleans up entities
>   * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
>   *            timeout interval is over.
> - * @thread: the kthread on which the scheduler which run.
>   * @pending_list: the list of jobs which are currently in the job queue.
>   * @job_list_lock: lock to protect the pending_list.
>   * @hang_limit: once the hangs by a job crosses this limit then it is marked
> @@ -475,6 +474,7 @@ struct drm_sched_backend_ops {
>   * @_score: score used when the driver doesn't provide one
>   * @ready: marks if the underlying HW is ready to work
>   * @free_guilty: A hit to time out handler to free the guilty job.
> + * @pause_run_wq: pause queuing of @work_run on @run_wq
>   * @dev: system &struct device
>   *
>   * One scheduler is implemented for each hardware ring.
> @@ -485,13 +485,13 @@ struct drm_gpu_scheduler {
>         long                            timeout;
>         const char                      *name;
>         struct drm_sched_rq             sched_rq[DRM_SCHED_PRIORITY_COUNT];
> -       wait_queue_head_t               wake_up_worker;
>         wait_queue_head_t               job_scheduled;
>         atomic_t                        hw_rq_count;
>         atomic64_t                      job_id_count;
> +       struct workqueue_struct         *run_wq;
>         struct workqueue_struct         *timeout_wq;
> +       struct work_struct              work_run;
>         struct delayed_work             work_tdr;
> -       struct task_struct              *thread;
>         struct list_head                pending_list;
>         spinlock_t                      job_list_lock;
>         int                             hang_limit;
> @@ -499,6 +499,7 @@ struct drm_gpu_scheduler {
>         atomic_t                        _score;
>         bool                            ready;
>         bool                            free_guilty;
> +       bool                            pause_run_wq;
>         struct device                   *dev;
>  };
>
> @@ -529,6 +530,8 @@ void drm_sched_entity_modify_sched(struct drm_sched_entity *entity,
>
>  void drm_sched_job_cleanup(struct drm_sched_job *job);
>  void drm_sched_wakeup(struct drm_gpu_scheduler *sched);
> +void drm_sched_run_wq_stop(struct drm_gpu_scheduler *sched);
> +void drm_sched_run_wq_start(struct drm_gpu_scheduler *sched);
>  void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad);
>  void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery);
>  void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched);
> --
> 2.37.3
>
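
As a usage note on the new helpers: here is a hedged sketch of pausing
submission around an engine reset, modeled on the amdgpu hunks above;
my_hw_reset() is a hypothetical driver-specific hook.

/* Sketch only: quiesce the run worker, reset, then resume. */
static void my_reset_engine(struct amdgpu_ring *ring)
{
        /* Sets pause_run_wq and cancel_work_sync()s work_run, so
         * drm_sched_main cannot be executing past this point.
         */
        drm_sched_run_wq_stop(&ring->sched);

        my_hw_reset(ring);      /* hypothetical reset hook */

        /* Clears pause_run_wq and re-queues work_run. */
        drm_sched_run_wq_start(&ring->sched);
}
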
Matthew Brost Dec. 28, 2022, 10:21 p.m. UTC | #2
On Fri, Dec 23, 2022 at 09:42:58AM -0800, Rob Clark wrote:
> On Thu, Dec 22, 2022 at 2:29 PM Matthew Brost <matthew.brost@intel.com> wrote:
> >
> > In XE, the new Intel GPU driver, a choice has been made to have a 1 to 1
> > mapping between a drm_gpu_scheduler and drm_sched_entity. At first this
> > seems a bit odd, but let us explain the reasoning below.
> >
> > 1. In XE the submission order from multiple drm_sched_entity is not
> > guaranteed to match the completion order, even if targeting the same
> > hardware engine. This is because in XE we have a firmware scheduler, the
> > GuC, which is allowed to reorder, timeslice, and preempt submissions. If
> > a shared drm_gpu_scheduler is used across multiple drm_sched_entity, the
> > TDR falls apart as the TDR expects submission order == completion order.
> > Using a dedicated drm_gpu_scheduler per drm_sched_entity solves this
> > problem.
> >
> > 2. In XE, submissions are done by programming a ring buffer (circular
> > buffer), and a drm_gpu_scheduler provides a limit on the number of
> > in-flight jobs; if that limit is set to RING_SIZE / MAX_SIZE_PER_JOB, we
> > get flow control on the ring for free.
> >
> > A problem with this design is that currently a drm_gpu_scheduler uses a
> > kthread for submission / job cleanup. This doesn't scale if a large
> > number of drm_gpu_scheduler instances are used. To work around the
> > scaling issue, use a worker rather than a kthread for submission / job
> > cleanup.
> 
> You might want to enable CONFIG_DRM_MSM in your kconfig, I think you
> missed a part
> 
> BR,
> -R
> 

Thanks for the feedback, Rob; yes, indeed we missed updating the MSM driver.
It is fixed up in our Xe repo and will be included in the next rev on the list.

Matt

Boris Brezillon Dec. 30, 2022, 10:20 a.m. UTC | #3
Hello Matthew,

On Thu, 22 Dec 2022 14:21:11 -0800
Matthew Brost <matthew.brost@intel.com> wrote:

> In XE, the new Intel GPU driver, a choice has been made to have a 1 to 1
> mapping between a drm_gpu_scheduler and drm_sched_entity. At first this
> seems a bit odd, but let us explain the reasoning below.
>
> 1. In XE the submission order from multiple drm_sched_entity is not
> guaranteed to match the completion order, even if targeting the same
> hardware engine. This is because in XE we have a firmware scheduler, the
> GuC, which is allowed to reorder, timeslice, and preempt submissions. If
> a shared drm_gpu_scheduler is used across multiple drm_sched_entity, the
> TDR falls apart as the TDR expects submission order == completion order.
> Using a dedicated drm_gpu_scheduler per drm_sched_entity solves this
> problem.

Oh, that's interesting. I've been trying to solve the same sort of
issues to support Arm's new Mali GPU which is relying on a FW-assisted
scheduling scheme (you give the FW N streams to execute, and it does
the scheduling between those N command streams, the kernel driver
does timeslice scheduling to update the command streams passed to the
FW). I must admit I gave up on using drm_sched at some point, mostly
because the integration with drm_sched was painful, but also because I
felt trying to bend drm_sched to make it interact with a
timeslice-oriented scheduling model wasn't really future proof. Giving
drm_sched_entity exclusive access to a drm_gpu_scheduler might
help for a few things (didn't think it through yet), but I feel it's
coming up short on other aspects we have to deal with on Arm GPUs. Here
are a few things I noted while working on the drm_sched-based PoC:

- The complexity of suspending/resuming streams and recovering from
  failures remains quite high (because everything is still very asynchronous
  under the hood). Sure, you don't have to do this fancy
  timeslice-based scheduling, but that's still a lot of code, and
  AFAICT, it didn't integrate well with drm_sched TDR (my previous
  attempt at reconciling them has been unsuccessful, but maybe your
  patches would help there)
- You lose one of the nice things brought by timeslice-based
  scheduling: a tiny bit of fairness. That is, if one stream is queuing
  a compute job that's monopolizing the GPU core, you know the kernel
  part of the scheduler will eventually evict it and let other streams
  with same or higher priority run, even before the job timeout
  kicks in.
- Stream slots exposed by the Arm FW are not exactly HW queues that run
  things concurrently. The FW can decide to let only the stream with the
  highest priority get access to the various HW resources (GPU cores,
  tiler, ...), and let other streams starve. That means you might get
  spurious timeouts on some jobs/sched-entities even though they didn't
  get a chance to run.

So overall, and given I'm no longer the only one having to deal with a
FW scheduler that's designed with timeslice scheduling in mind, I'm
wondering if it's not time to design a common timeslice-based scheduler
instead of trying to bend drivers to use the model enforced by
drm_sched. But that's just my 2 cents, of course.

Regards,

Boris
Boris Brezillon Dec. 30, 2022, 11:55 a.m. UTC | #4
On Fri, 30 Dec 2022 11:20:42 +0100
Boris Brezillon <boris.brezillon@collabora.com> wrote:

> Hello Matthew,
> 
> On Thu, 22 Dec 2022 14:21:11 -0800
> Matthew Brost <matthew.brost@intel.com> wrote:
> 
> > In XE, the new Intel GPU driver, a choice has been made to have a 1 to 1
> > mapping between a drm_gpu_scheduler and drm_sched_entity. At first this
> > seems a bit odd, but let us explain the reasoning below.
> >
> > 1. In XE the submission order from multiple drm_sched_entity is not
> > guaranteed to match the completion order, even if targeting the same
> > hardware engine. This is because in XE we have a firmware scheduler, the
> > GuC, which is allowed to reorder, timeslice, and preempt submissions. If
> > a shared drm_gpu_scheduler is used across multiple drm_sched_entity, the
> > TDR falls apart as the TDR expects submission order == completion order.
> > Using a dedicated drm_gpu_scheduler per drm_sched_entity solves this
> > problem.
> 
> Oh, that's interesting. I've been trying to solve the same sort of
> issues to support Arm's new Mali GPU which is relying on a FW-assisted
> scheduling scheme (you give the FW N streams to execute, and it does
> the scheduling between those N command streams, the kernel driver
> does timeslice scheduling to update the command streams passed to the
> FW). I must admit I gave up on using drm_sched at some point, mostly
> because the integration with drm_sched was painful, but also because I
> felt trying to bend drm_sched to make it interact with a
> timeslice-oriented scheduling model wasn't really future proof. Giving
> drm_sched_entity exclusive access to a drm_gpu_scheduler might
> help for a few things (didn't think it through yet), but I feel it's
> coming up short on other aspects we have to deal with on Arm GPUs.

Ok, so I just had a quick look at the Xe driver and how it
instantiates the drm_sched_entity and drm_gpu_scheduler, and I think I
have a better understanding of how you get away with using drm_sched
while still controlling how scheduling is really done. Here
drm_gpu_scheduler is just a dummy abstraction that lets you use the
drm_sched job queuing/dep/tracking mechanism. The whole run-queue
selection is dumb because there's only one entity ever bound to the
scheduler (the one that's part of the xe_guc_engine object which also
contains the drm_gpu_scheduler instance). I guess the main issue we'd
have on Arm is the fact that the stream doesn't necessarily get
scheduled when ->run_job() is called, it can be placed in the runnable
queue and be picked later by the kernel-side scheduler when a FW slot
gets released. That can probably be sorted out by manually disabling the
job timer and re-enabling it when the stream gets picked by the
scheduler. But my main concern remains: we're basically abusing
drm_sched here.

For the Arm driver, that means turning the following sequence

1. wait for job deps
2. queue job to ringbuf and push the stream to the runnable
   queue (if it wasn't queued already). Wakeup the timeslice scheduler
   to re-evaluate (if the stream is not on a FW slot already)
3. stream gets picked by the timeslice scheduler and sent to the FW for
   execution

into

1. queue job to entity which takes care of waiting for job deps for
   us
2. schedule a drm_sched_main iteration
3. the only available entity is picked, and the first job from this
   entity is dequeued. ->run_job() is called: the job is queued to the
   ringbuf and the stream is pushed to the runnable queue (if it wasn't
   queued already). Wakeup the timeslice scheduler to re-evaluate (if
   the stream is not on a FW slot already)
4. stream gets picked by the timeslice scheduler and sent to the FW for
   execution

That's one extra step we don't really need. To sum up, yes, all the
job/entity tracking might be interesting to share/re-use, but I wonder
if we couldn't have that without pulling out the scheduling part of
drm_sched, or maybe I'm missing something, and there's something in
drm_gpu_scheduler you really need.
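
A rough sketch of the pattern Boris describes, with drm_sched reduced to
dependency tracking plus a submission hook in a 1:1 setup, might look as
follows; my_stream, queue_to_fw() and wake_timeslice_scheduler() are
hypothetical stand-ins for the Arm-side logic.

/* Rough sketch, not real driver code: deps are already resolved by
 * the entity when ->run_job() is called, so it only has to hand the
 * job to the FW-facing ring.
 */
static struct dma_fence *my_run_job(struct drm_sched_job *job)
{
        /* In a 1:1 setup the scheduler is embedded in the per-context
         * stream object (as in Xe's xe_guc_engine), so container_of()
         * recovers it.
         */
        struct my_stream *stream =
                container_of(job->sched, struct my_stream, sched);
        struct dma_fence *done;

        /* Queue the job to the stream's ring buffer... */
        done = queue_to_fw(stream, job);

        /* ...and let the kernel-side timeslice scheduler decide when
         * the stream actually gets a FW slot.
         */
        wake_timeslice_scheduler(stream);

        return done;
}
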
Boris Brezillon Jan. 2, 2023, 7:30 a.m. UTC | #5
On Fri, 30 Dec 2022 12:55:08 +0100
Boris Brezillon <boris.brezillon@collabora.com> wrote:

> On Fri, 30 Dec 2022 11:20:42 +0100
> Boris Brezillon <boris.brezillon@collabora.com> wrote:
> 
> > Hello Matthew,
> > 
> > On Thu, 22 Dec 2022 14:21:11 -0800
> > Matthew Brost <matthew.brost@intel.com> wrote:
> >   
> > > In XE, the new Intel GPU driver, a choice has been made to have a 1 to 1
> > > mapping between a drm_gpu_scheduler and drm_sched_entity. At first this
> > > seems a bit odd, but let us explain the reasoning below.
> > >
> > > 1. In XE the submission order from multiple drm_sched_entity is not
> > > guaranteed to match the completion order, even if targeting the same
> > > hardware engine. This is because in XE we have a firmware scheduler, the
> > > GuC, which is allowed to reorder, timeslice, and preempt submissions. If
> > > a shared drm_gpu_scheduler is used across multiple drm_sched_entity, the
> > > TDR falls apart as the TDR expects submission order == completion order.
> > > Using a dedicated drm_gpu_scheduler per drm_sched_entity solves this
> > > problem.
> > 
> > Oh, that's interesting. I've been trying to solve the same sort of
> > issues to support Arm's new Mali GPU which is relying on a FW-assisted
> > scheduling scheme (you give the FW N streams to execute, and it does
> > the scheduling between those N command streams, the kernel driver
> > does timeslice scheduling to update the command streams passed to the
> > FW). I must admit I gave up on using drm_sched at some point, mostly
> > because the integration with drm_sched was painful, but also because I
> > felt trying to bend drm_sched to make it interact with a
> > timeslice-oriented scheduling model wasn't really future proof. Giving
> > drm_sched_entity exclusive access to a drm_gpu_scheduler might
> > help for a few things (didn't think it through yet), but I feel it's
> > coming up short on other aspects we have to deal with on Arm GPUs.
> 
> Ok, so I just had a quick look at the Xe driver and how it
> instantiates the drm_sched_entity and drm_gpu_scheduler, and I think I
> have a better understanding of how you get away with using drm_sched
> while still controlling how scheduling is really done. Here
> drm_gpu_scheduler is just a dummy abstraction that lets you use the
> drm_sched job queuing/dep/tracking mechanism. The whole run-queue
> selection is dumb because there's only one entity ever bound to the
> scheduler (the one that's part of the xe_guc_engine object which also
> contains the drm_gpu_scheduler instance). I guess the main issue we'd
> have on Arm is the fact that the stream doesn't necessarily get
> scheduled when ->run_job() is called, it can be placed in the runnable
> queue and be picked later by the kernel-side scheduler when a FW slot
> gets released. That can probably be sorted out by manually disabling the
> job timer and re-enabling it when the stream gets picked by the
> scheduler. But my main concern remains: we're basically abusing
> drm_sched here.
> 
> For the Arm driver, that means turning the following sequence
> 
> 1. wait for job deps
> 2. queue job to ringbuf and push the stream to the runnable
>    queue (if it wasn't queued already). Wakeup the timeslice scheduler
>    to re-evaluate (if the stream is not on a FW slot already)
> 3. stream gets picked by the timeslice scheduler and sent to the FW for
>    execution
> 
> into
> 
> 1. queue job to entity which takes care of waiting for job deps for
>    us
> 2. schedule a drm_sched_main iteration
> 3. the only available entity is picked, and the first job from this
>    entity is dequeued. ->run_job() is called: the job is queued to the
>    ringbuf and the stream is pushed to the runnable queue (if it wasn't
>    queued already). Wakeup the timeslice scheduler to re-evaluate (if
>    the stream is not on a FW slot already)
> 4. stream gets picked by the timeslice scheduler and sent to the FW for
>    execution
> 
> That's one extra step we don't really need. To sum up, yes, all the
> job/entity tracking might be interesting to share/re-use, but I wonder
> if we couldn't have that without pulling out the scheduling part of
> drm_sched, or maybe I'm missing something, and there's something in
> drm_gpu_scheduler you really need.

On second thought, that's probably an acceptable overhead (not even
sure the extra step I was mentioning exists in practice, because dep
fence signaled state is checked as part of the drm_sched_main
iteration, so that's basically replacing the worker I schedule to
check job deps), and I like the idea of being able to re-use drm_sched
dep-tracking without resorting to invasive changes to the existing
logic, so I'll probably give it a try.
Tvrtko Ursulin Jan. 3, 2023, 1:02 p.m. UTC | #6
On 02/01/2023 07:30, Boris Brezillon wrote:
> On Fri, 30 Dec 2022 12:55:08 +0100
> Boris Brezillon <boris.brezillon@collabora.com> wrote:
> 
>> On Fri, 30 Dec 2022 11:20:42 +0100
>> Boris Brezillon <boris.brezillon@collabora.com> wrote:
>>
>>> Hello Matthew,
>>>
>>> On Thu, 22 Dec 2022 14:21:11 -0800
>>> Matthew Brost <matthew.brost@intel.com> wrote:
>>>    
>>>> In XE, the new Intel GPU driver, a choice has been made to have a 1 to 1
>>>> mapping between a drm_gpu_scheduler and drm_sched_entity. At first this
>>>> seems a bit odd, but let us explain the reasoning below.
>>>>
>>>> 1. In XE the submission order from multiple drm_sched_entity is not
>>>> guaranteed to match the completion order, even if targeting the same
>>>> hardware engine. This is because in XE we have a firmware scheduler, the
>>>> GuC, which is allowed to reorder, timeslice, and preempt submissions. If
>>>> a shared drm_gpu_scheduler is used across multiple drm_sched_entity, the
>>>> TDR falls apart as the TDR expects submission order == completion order.
>>>> Using a dedicated drm_gpu_scheduler per drm_sched_entity solves this
>>>> problem.
>>>
>>> Oh, that's interesting. I've been trying to solve the same sort of
>>> issues to support Arm's new Mali GPU which is relying on a FW-assisted
>>> scheduling scheme (you give the FW N streams to execute, and it does
>>> the scheduling between those N command streams, the kernel driver
>>> does timeslice scheduling to update the command streams passed to the
>>> FW). I must admit I gave up on using drm_sched at some point, mostly
>>> because the integration with drm_sched was painful, but also because I
>>> felt trying to bend drm_sched to make it interact with a
>>> timeslice-oriented scheduling model wasn't really future proof. Giving
>>> drm_sched_entity exclusive access to a drm_gpu_scheduler might
>>> help for a few things (didn't think it through yet), but I feel it's
>>> coming up short on other aspects we have to deal with on Arm GPUs.
>>
>> Ok, so I just had a quick look at the Xe driver and how it
>> instantiates the drm_sched_entity and drm_gpu_scheduler, and I think I
>> have a better understanding of how you get away with using drm_sched
>> while still controlling how scheduling is really done. Here
>> drm_gpu_scheduler is just a dummy abstraction that lets you use the
>> drm_sched job queuing/dep/tracking mechanism. The whole run-queue
>> selection is dumb because there's only one entity ever bound to the
>> scheduler (the one that's part of the xe_guc_engine object which also
>> contains the drm_gpu_scheduler instance). I guess the main issue we'd
>> have on Arm is the fact that the stream doesn't necessarily get
>> scheduled when ->run_job() is called, it can be placed in the runnable
>> queue and be picked later by the kernel-side scheduler when a FW slot
>> gets released. That can probably be sorted out by manually disabling the
>> job timer and re-enabling it when the stream gets picked by the
>> scheduler. But my main concern remains: we're basically abusing
>> drm_sched here.
>>
>> For the Arm driver, that means turning the following sequence
>>
>> 1. wait for job deps
>> 2. queue job to ringbuf and push the stream to the runnable
>>     queue (if it wasn't queued already). Wakeup the timeslice scheduler
>>     to re-evaluate (if the stream is not on a FW slot already)
>> 3. stream gets picked by the timeslice scheduler and sent to the FW for
>>     execution
>>
>> into
>>
>> 1. queue job to entity which takes care of waiting for job deps for
>>     us
>> 2. schedule a drm_sched_main iteration
>> 3. the only available entity is picked, and the first job from this
>>     entity is dequeued. ->run_job() is called: the job is queued to the
>>     ringbuf and the stream is pushed to the runnable queue (if it wasn't
>>     queued already). Wakeup the timeslice scheduler to re-evaluate (if
>>     the stream is not on a FW slot already)
>> 4. stream gets picked by the timeslice scheduler and sent to the FW for
>>     execution
>>
>> That's one extra step we don't really need. To sum-up, yes, all the
>> job/entity tracking might be interesting to share/re-use, but I wonder
>> if we couldn't have that without pulling out the scheduling part of
>> drm_sched, or maybe I'm missing something, and there's something in
>> drm_gpu_scheduler you really need.
> 
> On second thought, that's probably an acceptable overhead (not even
> sure the extra step I was mentioning exists in practice, because dep
> fence signaled state is checked as part of the drm_sched_main
> iteration, so that's basically replacing the worker I schedule to
> check job deps), and I like the idea of being able to re-use drm_sched
> dep-tracking without resorting to invasive changes to the existing
> logic, so I'll probably give it a try.

I agree with the concerns and think that how Xe proposes to integrate 
with drm_sched is a problem, or at least significantly inelegant.

AFAICT it proposes to have 1:1 between *userspace* created contexts (per
context _and_ engine) and drm_sched. I am not sure avoiding invasive
changes to the shared code is in the spirit of the overall idea; instead
the opportunity should be used to look at ways to refactor/improve
drm_sched.

Even on the low level, the idea to replace drm_sched threads with 
workers has a few problems.

To start with, the pattern of:

   while (not_stopped) {
	keep picking jobs
   }

Feels fundamentally in disagreement with workers (while it obviously
fits perfectly with the current kthread design).

Secondly, it probably demands separate workers (not optional), otherwise
the behaviour of shared workqueues has either the potential to explode
the number of kernel threads anyway, or to add latency.

What would be interesting to learn is whether the option of refactoring
drm_sched to deal with out-of-order completion was considered and what
the conclusions were.

A second option would perhaps be to split out the drm_sched code into
parts which lend themselves more to "pick and choose" of its
functionalities. Specifically, Xe wants frontend dependency tracking,
but no scheduling really (neither least-busy drm_sched selection, nor
FIFO/RQ entity picking), so even having all these data structures in
memory is a waste.

With the first option the end result could be a drm_sched per engine
class (hardware view), which I think fits the GuC model: give all
schedulable contexts (entities) to the GuC and then mostly forget about
them. Timeslicing, re-ordering and the rest all happen transparently to
the kernel from that point until completion.

Or with the second option you would build on some smaller refactored 
sub-components of drm_sched, by maybe splitting the dependency tracking 
from scheduling (RR/FIFO entity picking code).

The second option is admittedly a bit vague and I haven't thought
through the required mechanics, but it just appeared too obvious that
the proposed design has a bit too much impedance mismatch.

Oh and as a side note, when I went into the drm_sched code base to
remind myself how things worked, it is quite easy to find some FIXME
comments which suggest the people working on it are unsure of the
locking design there and such. So perhaps that all needs cleanup too; I
mean it would benefit from the refactoring/improving work as
brainstormed above anyway.

Regards,

Tvrtko
Boris Brezillon Jan. 3, 2023, 2:21 p.m. UTC | #7
Hi,

On Tue, 3 Jan 2023 13:02:15 +0000
Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> wrote:

> On 02/01/2023 07:30, Boris Brezillon wrote:
> > On Fri, 30 Dec 2022 12:55:08 +0100
> > Boris Brezillon <boris.brezillon@collabora.com> wrote:
> >   
> >> On Fri, 30 Dec 2022 11:20:42 +0100
> >> Boris Brezillon <boris.brezillon@collabora.com> wrote:
> >>  
> >>> Hello Matthew,
> >>>
> >>> On Thu, 22 Dec 2022 14:21:11 -0800
> >>> Matthew Brost <matthew.brost@intel.com> wrote:
> >>>      
> >>>> In XE, the new Intel GPU driver, a choice has made to have a 1 to 1
> >>>> mapping between a drm_gpu_scheduler and drm_sched_entity. At first this
> >>>> seems a bit odd but let us explain the reasoning below.
> >>>>
> >>>> 1. In XE the submission order from multiple drm_sched_entity is not
> >>>> guaranteed to be the same completion even if targeting the same hardware
> >>>> engine. This is because in XE we have a firmware scheduler, the GuC,
> >>>> which allowed to reorder, timeslice, and preempt submissions. If a using
> >>>> shared drm_gpu_scheduler across multiple drm_sched_entity, the TDR falls
> >>>> apart as the TDR expects submission order == completion order. Using a
> >>>> dedicated drm_gpu_scheduler per drm_sched_entity solve this problem.  
> >>>
> >>> Oh, that's interesting. I've been trying to solve the same sort of
> >>> issues to support Arm's new Mali GPU which is relying on a FW-assisted
> >>> scheduling scheme (you give the FW N streams to execute, and it does
> >>> the scheduling between those N command streams, the kernel driver
> >>> does timeslice scheduling to update the command streams passed to the
> >>> FW). I must admit I gave up on using drm_sched at some point, mostly
> >>> because the integration with drm_sched was painful, but also because I
> >>> felt trying to bend drm_sched to make it interact with a
> >>> timeslice-oriented scheduling model wasn't really future proof. Giving
> >>> drm_sched_entity exlusive access to a drm_gpu_scheduler probably might
> >>> help for a few things (didn't think it through yet), but I feel it's
> >>> coming short on other aspects we have to deal with on Arm GPUs.  
> >>
> >> Ok, so I just had a quick look at the Xe driver and how it
> >> instantiates the drm_sched_entity and drm_gpu_scheduler, and I think I
> >> have a better understanding of how you get away with using drm_sched
> >> while still controlling how scheduling is really done. Here
> >> drm_gpu_scheduler is just a dummy abstract that let's you use the
> >> drm_sched job queuing/dep/tracking mechanism. The whole run-queue
> >> selection is dumb because there's only one entity ever bound to the
> >> scheduler (the one that's part of the xe_guc_engine object which also
> >> contains the drm_gpu_scheduler instance). I guess the main issue we'd
> >> have on Arm is the fact that the stream doesn't necessarily get
> >> scheduled when ->run_job() is called, it can be placed in the runnable
> >> queue and be picked later by the kernel-side scheduler when a FW slot
> >> gets released. That can probably be sorted out by manually disabling the
> >> job timer and re-enabling it when the stream gets picked by the
> >> scheduler. But my main concern remains, we're basically abusing
> >> drm_sched here.
> >>
> >> For the Arm driver, that means turning the following sequence
> >>
> >> 1. wait for job deps
> >> 2. queue job to ringbuf and push the stream to the runnable
> >>     queue (if it wasn't queued already). Wakeup the timeslice scheduler
> >>     to re-evaluate (if the stream is not on a FW slot already)
> >> 3. stream gets picked by the timeslice scheduler and sent to the FW for
> >>     execution
> >>
> >> into
> >>
> >> 1. queue job to entity which takes care of waiting for job deps for
> >>     us
> >> 2. schedule a drm_sched_main iteration
> >> 3. the only available entity is picked, and the first job from this
> >>     entity is dequeued. ->run_job() is called: the job is queued to the
> >>     ringbuf and the stream is pushed to the runnable queue (if it wasn't
> >>     queued already). Wakeup the timeslice scheduler to re-evaluate (if
> >>     the stream is not on a FW slot already)
> >> 4. stream gets picked by the timeslice scheduler and sent to the FW for
> >>     execution
> >>
> >> That's one extra step we don't really need. To sum-up, yes, all the
> >> job/entity tracking might be interesting to share/re-use, but I wonder
> >> if we couldn't have that without pulling out the scheduling part of
> >> drm_sched, or maybe I'm missing something, and there's something in
> >> drm_gpu_scheduler you really need.  
> > 
> > On second thought, that's probably an acceptable overhead (not even
> > sure the extra step I was mentioning exists in practice, because dep
> > fence signaled state is checked as part of the drm_sched_main
> > iteration, so that's basically replacing the worker I schedule to
> > check job deps), and I like the idea of being able to re-use drm_sched
> > dep-tracking without resorting to invasive changes to the existing
> > logic, so I'll probably give it a try.  
> 
> I agree with the concerns and think that how Xe proposes to integrate 
> with drm_sched is a problem, or at least significantly inelegant.

Okay, so it looks like I'm not the only one to be bothered by the way Xe
tries to bypass the drm_sched limitations :-).

> 
> AFAICT it proposes to have 1:1 between *userspace* created contexts (per 
> context _and_ engine) and drm_sched. I am not sure avoiding invasive 
> changes to the shared code is in the spirit of the overall idea and 
> instead opportunity should be used to look at way to refactor/improve 
> drm_sched.
> 
> Even on the low level, the idea to replace drm_sched threads with 
> workers has a few problems.
> 
> To start with, the pattern of:
> 
>    while (not_stopped) {
> 	keep picking jobs
>    }
> 
> Feels fundamentally in disagreement with workers (while obviously fits 
> perfectly with the current kthread design).
> 
> Secondly, it probably demands separate workers (not optional), otherwise 
> behaviour of shared workqueues has either the potential to explode 
> number kernel threads anyway, or add latency.
> 
> What would be interesting to learn is whether the option of refactoring 
> drm_sched to deal with out of order completion was considered and what 
> were the conclusions.

I might be wrong, but I don't think the fundamental issue here is the
out-of-order completion thing that's mentioned in the commit message.
It just feels like this is a symptom of the impedance mismatch we
have between priority+FIFO-based job scheduling and
priority+timeslice-based queue scheduling (a queue being represented by
a drm_sched_entity in drm_sched).

> 
> Second option perhaps to split out the drm_sched code into parts which 
> would lend themselves more to "pick and choose" of its functionalities. 
> Specifically, Xe wants frontend dependency tracking, but not any 
> scheduling really (neither least busy drm_sched, neither FIFO/RQ entity 
> picking), so even having all these data structures in memory is a waste.

Same thing for the panfrost+CSF driver I was mentioning in my previous
emails.

> 
> With the first option then the end result could be drm_sched per engine 
> class (hardware view), which I think fits with the GuC model. Give all 
> schedulable contexts (entities) to the GuC and then mostly forget about 
> them. Timeslicing and re-ordering and all happens transparently to the 
> kernel from that point until completion.

Yep, that would work. I guess it would mean creating an intermediate
abstraction/interface to schedule entities, and then implementing this
interface for the simple HW-engine+job-scheduling case, so that
existing drm_sched users don't see a difference, while new drivers that
need to interface with FW-assisted schedulers can implement the
higher-level entity scheduling interface. Don't know what this
interface would look like though.

> 
> Or with the second option you would build on some smaller refactored 
> sub-components of drm_sched, by maybe splitting the dependency tracking 
> from scheduling (RR/FIFO entity picking code).

What I've done so far is duplicate the dep-tracking logic in the
driver. It's not that much code, but it would be nice to not have to
duplicate it in the first place...
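
For reference, a rough sketch of the drm_sched flow being discussed
here (assuming a per-queue entity and a submit path that has already
collected in_fences; the cleanup label is hypothetical):

    err = drm_sched_job_init(&job->base, &queue->entity, owner);
    if (err)
        return err;

    /* Attach the deps; drm_sched only calls ->run_job() once all of
     * them have signaled, which is the logic duplicated today. */
    for (i = 0; i < num_in_fences; i++) {
        err = drm_sched_job_add_dependency(&job->base,
                                           dma_fence_get(in_fences[i]));
        if (err)
            goto err_free_job;  /* hypothetical cleanup */
    }

    drm_sched_job_arm(&job->base);
    drm_sched_entity_push_job(&job->base);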

Regards,

Boris
Matthew Brost Jan. 5, 2023, 7:40 p.m. UTC | #8
On Mon, Jan 02, 2023 at 08:30:19AM +0100, Boris Brezillon wrote:
> On Fri, 30 Dec 2022 12:55:08 +0100
> Boris Brezillon <boris.brezillon@collabora.com> wrote:
> 
> > On Fri, 30 Dec 2022 11:20:42 +0100
> > Boris Brezillon <boris.brezillon@collabora.com> wrote:
> > 
> > > Hello Matthew,
> > > 
> > > On Thu, 22 Dec 2022 14:21:11 -0800
> > > Matthew Brost <matthew.brost@intel.com> wrote:
> > >   
> > > > In XE, the new Intel GPU driver, a choice has made to have a 1 to 1
> > > > mapping between a drm_gpu_scheduler and drm_sched_entity. At first this
> > > > seems a bit odd but let us explain the reasoning below.
> > > > 
> > > > 1. In XE the submission order from multiple drm_sched_entity is not
> > > > guaranteed to be the same completion even if targeting the same hardware
> > > > engine. This is because in XE we have a firmware scheduler, the GuC,
> > > > which allowed to reorder, timeslice, and preempt submissions. If a using
> > > > shared drm_gpu_scheduler across multiple drm_sched_entity, the TDR falls
> > > > apart as the TDR expects submission order == completion order. Using a
> > > > dedicated drm_gpu_scheduler per drm_sched_entity solve this problem.    
> > > 
> > > Oh, that's interesting. I've been trying to solve the same sort of
> > > issues to support Arm's new Mali GPU which is relying on a FW-assisted
> > > scheduling scheme (you give the FW N streams to execute, and it does
> > > the scheduling between those N command streams, the kernel driver
> > > does timeslice scheduling to update the command streams passed to the
> > > FW). I must admit I gave up on using drm_sched at some point, mostly
> > > because the integration with drm_sched was painful, but also because I
> > > felt trying to bend drm_sched to make it interact with a
> > > timeslice-oriented scheduling model wasn't really future proof. Giving
> > > drm_sched_entity exlusive access to a drm_gpu_scheduler probably might
> > > help for a few things (didn't think it through yet), but I feel it's
> > > coming short on other aspects we have to deal with on Arm GPUs.  
> > 
> > Ok, so I just had a quick look at the Xe driver and how it
> > instantiates the drm_sched_entity and drm_gpu_scheduler, and I think I
> > have a better understanding of how you get away with using drm_sched
> > while still controlling how scheduling is really done. Here
> > drm_gpu_scheduler is just a dummy abstract that let's you use the
> > drm_sched job queuing/dep/tracking mechanism. The whole run-queue

You nailed it here: we use the DRM scheduler for queuing jobs,
dependency tracking and releasing jobs to be scheduled when dependencies
are met, and lastly as a tracking mechanism for in-flight jobs that need
to be cleaned up if an error occurs. It doesn't actually do any
scheduling aside from the most basic level of not overflowing the
submission ring buffer. In this sense, a 1 to 1 relationship between
entity and scheduler fits quite well.
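
To illustrate, a minimal sketch of that 1:1 arrangement (names like
guc_sched_ops and MAX_JOBS are assumptions, and the drm_sched_init()
arguments follow the current in-tree signature, abbreviated):

    struct xe_guc_engine {
        struct drm_gpu_scheduler sched;
        struct drm_sched_entity entity;
    };

    static int guc_engine_init(struct xe_guc_engine *e, struct device *dev)
    {
        struct drm_gpu_scheduler *sched_list[] = { &e->sched };
        int err;

        /* One scheduler instance per userspace engine... */
        err = drm_sched_init(&e->sched, &guc_sched_ops, MAX_JOBS,
                             0 /* hang_limit */, HZ * 5 /* timeout */,
                             NULL /* timeout_wq */, NULL /* score */,
                             "xe_engine", dev);
        if (err)
            return err;

        /* ...with exactly one entity ever bound to it, so there is no
         * run-queue selection to speak of and ordering is per engine. */
        return drm_sched_entity_init(&e->entity, DRM_SCHED_PRIORITY_NORMAL,
                                     sched_list, 1, NULL);
    }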

FWIW this design was also run by AMD quite a while ago (off the list)
and we didn't get any serious push back. Things can change however...

> > selection is dumb because there's only one entity ever bound to the
> > scheduler (the one that's part of the xe_guc_engine object which also
> > contains the drm_gpu_scheduler instance). I guess the main issue we'd
> > have on Arm is the fact that the stream doesn't necessarily get
> > scheduled when ->run_job() is called, it can be placed in the runnable
> > queue and be picked later by the kernel-side scheduler when a FW slot
> > gets released. That can probably be sorted out by manually disabling the
> > job timer and re-enabling it when the stream gets picked by the
> > scheduler. But my main concern remains, we're basically abusing
> > drm_sched here.
> > 

That's a matter of opinion; yes, we are using it slightly differently
than anyone else, but IMO the fact that the DRM scheduler works for the
Xe use case with barely any changes is a testament to its design.

> > For the Arm driver, that means turning the following sequence
> > 
> > 1. wait for job deps
> > 2. queue job to ringbuf and push the stream to the runnable
> >    queue (if it wasn't queued already). Wakeup the timeslice scheduler
> >    to re-evaluate (if the stream is not on a FW slot already)
> > 3. stream gets picked by the timeslice scheduler and sent to the FW for
> >    execution
> > 
> > into
> > 
> > 1. queue job to entity which takes care of waiting for job deps for
> >    us
> > 2. schedule a drm_sched_main iteration
> > 3. the only available entity is picked, and the first job from this
> >    entity is dequeued. ->run_job() is called: the job is queued to the
> >    ringbuf and the stream is pushed to the runnable queue (if it wasn't
> >    queued already). Wakeup the timeslice scheduler to re-evaluate (if
> >    the stream is not on a FW slot already)
> > 4. stream gets picked by the timeslice scheduler and sent to the FW for
> >    execution
> >

Yes, it's an extra step, but you get to use all the nice DRM scheduler
functions for dependency tracking. Also, in our case we really want a
single entry point in the backend (the work queue). Also see [1], which
helped us seal a bunch of races we had in the i915 by using a single
entry point. All these benefits are why we landed on the DRM scheduler,
and it has worked out rather nicely compared to the i915.

[1] https://patchwork.freedesktop.org/patch/515857/?series=112189&rev=1
 
> > That's one extra step we don't really need. To sum-up, yes, all the
> > job/entity tracking might be interesting to share/re-use, but I wonder
> > if we couldn't have that without pulling out the scheduling part of
> > drm_sched, or maybe I'm missing something, and there's something in
> > drm_gpu_scheduler you really need.
> 
> On second thought, that's probably an acceptable overhead (not even
> sure the extra step I was mentioning exists in practice, because dep
> fence signaled state is checked as part of the drm_sched_main
> iteration, so that's basically replacing the worker I schedule to
> check job deps), and I like the idea of being able to re-use drm_sched
> dep-tracking without resorting to invasive changes to the existing
> logic, so I'll probably give it a try.

Let me know how this goes.

Matt
Matthew Brost Jan. 5, 2023, 9:43 p.m. UTC | #9
On Tue, Jan 03, 2023 at 01:02:15PM +0000, Tvrtko Ursulin wrote:
> 
> On 02/01/2023 07:30, Boris Brezillon wrote:
> > On Fri, 30 Dec 2022 12:55:08 +0100
> > Boris Brezillon <boris.brezillon@collabora.com> wrote:
> > 
> > > On Fri, 30 Dec 2022 11:20:42 +0100
> > > Boris Brezillon <boris.brezillon@collabora.com> wrote:
> > > 
> > > > Hello Matthew,
> > > > 
> > > > On Thu, 22 Dec 2022 14:21:11 -0800
> > > > Matthew Brost <matthew.brost@intel.com> wrote:
> > > > > In XE, the new Intel GPU driver, a choice has made to have a 1 to 1
> > > > > mapping between a drm_gpu_scheduler and drm_sched_entity. At first this
> > > > > seems a bit odd but let us explain the reasoning below.
> > > > > 
> > > > > 1. In XE the submission order from multiple drm_sched_entity is not
> > > > > guaranteed to be the same completion even if targeting the same hardware
> > > > > engine. This is because in XE we have a firmware scheduler, the GuC,
> > > > > which allowed to reorder, timeslice, and preempt submissions. If a using
> > > > > shared drm_gpu_scheduler across multiple drm_sched_entity, the TDR falls
> > > > > apart as the TDR expects submission order == completion order. Using a
> > > > > dedicated drm_gpu_scheduler per drm_sched_entity solve this problem.
> > > > 
> > > > Oh, that's interesting. I've been trying to solve the same sort of
> > > > issues to support Arm's new Mali GPU which is relying on a FW-assisted
> > > > scheduling scheme (you give the FW N streams to execute, and it does
> > > > the scheduling between those N command streams, the kernel driver
> > > > does timeslice scheduling to update the command streams passed to the
> > > > FW). I must admit I gave up on using drm_sched at some point, mostly
> > > > because the integration with drm_sched was painful, but also because I
> > > > felt trying to bend drm_sched to make it interact with a
> > > > timeslice-oriented scheduling model wasn't really future proof. Giving
> > > > drm_sched_entity exlusive access to a drm_gpu_scheduler probably might
> > > > help for a few things (didn't think it through yet), but I feel it's
> > > > coming short on other aspects we have to deal with on Arm GPUs.
> > > 
> > > Ok, so I just had a quick look at the Xe driver and how it
> > > instantiates the drm_sched_entity and drm_gpu_scheduler, and I think I
> > > have a better understanding of how you get away with using drm_sched
> > > while still controlling how scheduling is really done. Here
> > > drm_gpu_scheduler is just a dummy abstract that let's you use the
> > > drm_sched job queuing/dep/tracking mechanism. The whole run-queue
> > > selection is dumb because there's only one entity ever bound to the
> > > scheduler (the one that's part of the xe_guc_engine object which also
> > > contains the drm_gpu_scheduler instance). I guess the main issue we'd
> > > have on Arm is the fact that the stream doesn't necessarily get
> > > scheduled when ->run_job() is called, it can be placed in the runnable
> > > queue and be picked later by the kernel-side scheduler when a FW slot
> > > gets released. That can probably be sorted out by manually disabling the
> > > job timer and re-enabling it when the stream gets picked by the
> > > scheduler. But my main concern remains, we're basically abusing
> > > drm_sched here.
> > > 
> > > For the Arm driver, that means turning the following sequence
> > > 
> > > 1. wait for job deps
> > > 2. queue job to ringbuf and push the stream to the runnable
> > >     queue (if it wasn't queued already). Wakeup the timeslice scheduler
> > >     to re-evaluate (if the stream is not on a FW slot already)
> > > 3. stream gets picked by the timeslice scheduler and sent to the FW for
> > >     execution
> > > 
> > > into
> > > 
> > > 1. queue job to entity which takes care of waiting for job deps for
> > >     us
> > > 2. schedule a drm_sched_main iteration
> > > 3. the only available entity is picked, and the first job from this
> > >     entity is dequeued. ->run_job() is called: the job is queued to the
> > >     ringbuf and the stream is pushed to the runnable queue (if it wasn't
> > >     queued already). Wakeup the timeslice scheduler to re-evaluate (if
> > >     the stream is not on a FW slot already)
> > > 4. stream gets picked by the timeslice scheduler and sent to the FW for
> > >     execution
> > > 
> > > That's one extra step we don't really need. To sum-up, yes, all the
> > > job/entity tracking might be interesting to share/re-use, but I wonder
> > > if we couldn't have that without pulling out the scheduling part of
> > > drm_sched, or maybe I'm missing something, and there's something in
> > > drm_gpu_scheduler you really need.
> > 
> > On second thought, that's probably an acceptable overhead (not even
> > sure the extra step I was mentioning exists in practice, because dep
> > fence signaled state is checked as part of the drm_sched_main
> > iteration, so that's basically replacing the worker I schedule to
> > check job deps), and I like the idea of being able to re-use drm_sched
> > dep-tracking without resorting to invasive changes to the existing
> > logic, so I'll probably give it a try.
> 
> I agree with the concerns and think that how Xe proposes to integrate with
> drm_sched is a problem, or at least significantly inelegant.
>

Inelegant is a matter of opinion; I actually rather like this solution.

BTW this isn't my design; rather, it was Jason's idea.
 
> AFAICT it proposes to have 1:1 between *userspace* created contexts (per
> context _and_ engine) and drm_sched. I am not sure avoiding invasive changes
> to the shared code is in the spirit of the overall idea and instead
> opportunity should be used to look at way to refactor/improve drm_sched.
>

Yes, it is 1:1 between *userspace* engines and drm_sched.

I'm not really prepared to make large changes to the DRM scheduler for
Xe at the moment, as they are not really required, nor does it seem
Boris will require them for his work either. I am interested to see
what Boris comes up with.

> Even on the low level, the idea to replace drm_sched threads with workers
> has a few problems.
> 
> To start with, the pattern of:
> 
>   while (not_stopped) {
> 	keep picking jobs
>   }
> 
> Feels fundamentally in disagreement with workers (while obviously fits
> perfectly with the current kthread design).
>

The while loop breaks and the worker exits if no jobs are ready.
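
Roughly this shape, as a hedged sketch (the field and helper names here
are placeholders, not the actual patch):

    static void sched_run_job_work(struct work_struct *w)
    {
        struct drm_gpu_scheduler *sched =
            container_of(w, struct drm_gpu_scheduler, work_run_job);
        struct drm_sched_job *job;

        /* Drain whatever is ready, then return instead of sleeping. */
        while (!READ_ONCE(sched->stopped) &&
               (job = pick_next_ready_job(sched)))  /* placeholder */
            sched->ops->run_job(job);  /* returned fence handling elided */

        /* No ready jobs: the work item simply exits. Whatever makes a
         * job runnable again (entity push, dep signaling, completion)
         * calls queue_work() to re-arm it, so nothing spins. */
    }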

> Secondly, it probably demands separate workers (not optional), otherwise
> behaviour of shared workqueues has either the potential to explode number
> kernel threads anyway, or add latency.
> 

Right now the system_unbound_wq is used, which does have a limit on the
number of threads, right? I do have a FIXME to allow a worker to be
passed in, similar to the TDR.

WRT latency, the 1:1 ratio could actually have lower latency, as two
GPU schedulers can be pushing jobs into the backend / cleaning up jobs
in parallel.

> What would be interesting to learn is whether the option of refactoring
> drm_sched to deal with out of order completion was considered and what were
> the conclusions.
>

I coded this up a while back when trying to convert the i915 to the DRM
scheduler; it isn't all that hard either. The free flow control on the
ring (e.g. set job limit == SIZE OF RING / MAX JOB SIZE) is really what
sold me on this design.
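
As a concrete sketch of that limit (the sizes here are assumed, for
illustration only):

    #define RING_SIZE      SZ_16K  /* assumed ring buffer size */
    #define MAX_JOB_SIZE   SZ_1K   /* assumed worst-case job footprint */

    /* Passed as the hw_submission cap to drm_sched_init(): drm_sched
     * then never releases more jobs than fit in the ring, which is the
     * "free" flow control referred to above. */
    #define HW_SUBMISSION_LIMIT (RING_SIZE / MAX_JOB_SIZE)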

> Second option perhaps to split out the drm_sched code into parts which would
> lend themselves more to "pick and choose" of its functionalities.
> Specifically, Xe wants frontend dependency tracking, but not any scheduling
> really (neither least busy drm_sched, neither FIFO/RQ entity picking), so
> even having all these data structures in memory is a waste.
> 

I don't think "we are wasting memory" is a very good argument for
making intrusive changes to the DRM scheduler.

> With the first option then the end result could be drm_sched per engine
> class (hardware view), which I think fits with the GuC model. Give all
> schedulable contexts (entities) to the GuC and then mostly forget about
> them. Timeslicing and re-ordering and all happens transparently to the
> kernel from that point until completion.
> 

The out-of-order problem still exists here.

> Or with the second option you would build on some smaller refactored
> sub-components of drm_sched, by maybe splitting the dependency tracking from
> scheduling (RR/FIFO entity picking code).
> 
> Second option is especially a bit vague and I haven't thought about the
> required mechanics, but it just appeared too obvious the proposed design has
> a bit too much impedance mismatch.
>

IMO the ROI on this is low; again, let's see what Boris comes up with.

Matt

> Oh and as a side note, when I went into the drm_sched code base to remind
> myself how things worked, it is quite easy to find some FIXME comments which
> suggest people working on it are unsure of locking desing there and such. So
> perhaps that all needs cleanup too, I mean would benefit from
> refactoring/improving work as brainstormed above anyway.
> 
> Regards,
> 
> Tvrtko
Matthew Brost Jan. 6, 2023, 11:52 p.m. UTC | #10
On Thu, Jan 05, 2023 at 09:43:41PM +0000, Matthew Brost wrote:
> On Tue, Jan 03, 2023 at 01:02:15PM +0000, Tvrtko Ursulin wrote:
> > 
> > On 02/01/2023 07:30, Boris Brezillon wrote:
> > > On Fri, 30 Dec 2022 12:55:08 +0100
> > > Boris Brezillon <boris.brezillon@collabora.com> wrote:
> > > 
> > > > On Fri, 30 Dec 2022 11:20:42 +0100
> > > > Boris Brezillon <boris.brezillon@collabora.com> wrote:
> > > > 
> > > > > Hello Matthew,
> > > > > 
> > > > > On Thu, 22 Dec 2022 14:21:11 -0800
> > > > > Matthew Brost <matthew.brost@intel.com> wrote:
> > > > > > In XE, the new Intel GPU driver, a choice has made to have a 1 to 1
> > > > > > mapping between a drm_gpu_scheduler and drm_sched_entity. At first this
> > > > > > seems a bit odd but let us explain the reasoning below.
> > > > > > 
> > > > > > 1. In XE the submission order from multiple drm_sched_entity is not
> > > > > > guaranteed to be the same completion even if targeting the same hardware
> > > > > > engine. This is because in XE we have a firmware scheduler, the GuC,
> > > > > > which allowed to reorder, timeslice, and preempt submissions. If a using
> > > > > > shared drm_gpu_scheduler across multiple drm_sched_entity, the TDR falls
> > > > > > apart as the TDR expects submission order == completion order. Using a
> > > > > > dedicated drm_gpu_scheduler per drm_sched_entity solve this problem.
> > > > > 
> > > > > Oh, that's interesting. I've been trying to solve the same sort of
> > > > > issues to support Arm's new Mali GPU which is relying on a FW-assisted
> > > > > scheduling scheme (you give the FW N streams to execute, and it does
> > > > > the scheduling between those N command streams, the kernel driver
> > > > > does timeslice scheduling to update the command streams passed to the
> > > > > FW). I must admit I gave up on using drm_sched at some point, mostly
> > > > > because the integration with drm_sched was painful, but also because I
> > > > > felt trying to bend drm_sched to make it interact with a
> > > > > timeslice-oriented scheduling model wasn't really future proof. Giving
> > > > > drm_sched_entity exlusive access to a drm_gpu_scheduler probably might
> > > > > help for a few things (didn't think it through yet), but I feel it's
> > > > > coming short on other aspects we have to deal with on Arm GPUs.
> > > > 
> > > > Ok, so I just had a quick look at the Xe driver and how it
> > > > instantiates the drm_sched_entity and drm_gpu_scheduler, and I think I
> > > > have a better understanding of how you get away with using drm_sched
> > > > while still controlling how scheduling is really done. Here
> > > > drm_gpu_scheduler is just a dummy abstract that let's you use the
> > > > drm_sched job queuing/dep/tracking mechanism. The whole run-queue
> > > > selection is dumb because there's only one entity ever bound to the
> > > > scheduler (the one that's part of the xe_guc_engine object which also
> > > > contains the drm_gpu_scheduler instance). I guess the main issue we'd
> > > > have on Arm is the fact that the stream doesn't necessarily get
> > > > scheduled when ->run_job() is called, it can be placed in the runnable
> > > > queue and be picked later by the kernel-side scheduler when a FW slot
> > > > gets released. That can probably be sorted out by manually disabling the
> > > > job timer and re-enabling it when the stream gets picked by the
> > > > scheduler. But my main concern remains, we're basically abusing
> > > > drm_sched here.
> > > > 
> > > > For the Arm driver, that means turning the following sequence
> > > > 
> > > > 1. wait for job deps
> > > > 2. queue job to ringbuf and push the stream to the runnable
> > > >     queue (if it wasn't queued already). Wakeup the timeslice scheduler
> > > >     to re-evaluate (if the stream is not on a FW slot already)
> > > > 3. stream gets picked by the timeslice scheduler and sent to the FW for
> > > >     execution
> > > > 
> > > > into
> > > > 
> > > > 1. queue job to entity which takes care of waiting for job deps for
> > > >     us
> > > > 2. schedule a drm_sched_main iteration
> > > > 3. the only available entity is picked, and the first job from this
> > > >     entity is dequeued. ->run_job() is called: the job is queued to the
> > > >     ringbuf and the stream is pushed to the runnable queue (if it wasn't
> > > >     queued already). Wakeup the timeslice scheduler to re-evaluate (if
> > > >     the stream is not on a FW slot already)
> > > > 4. stream gets picked by the timeslice scheduler and sent to the FW for
> > > >     execution
> > > > 
> > > > That's one extra step we don't really need. To sum-up, yes, all the
> > > > job/entity tracking might be interesting to share/re-use, but I wonder
> > > > if we couldn't have that without pulling out the scheduling part of
> > > > drm_sched, or maybe I'm missing something, and there's something in
> > > > drm_gpu_scheduler you really need.
> > > 
> > > On second thought, that's probably an acceptable overhead (not even
> > > sure the extra step I was mentioning exists in practice, because dep
> > > fence signaled state is checked as part of the drm_sched_main
> > > iteration, so that's basically replacing the worker I schedule to
> > > check job deps), and I like the idea of being able to re-use drm_sched
> > > dep-tracking without resorting to invasive changes to the existing
> > > logic, so I'll probably give it a try.
> > 
> > I agree with the concerns and think that how Xe proposes to integrate with
> > drm_sched is a problem, or at least significantly inelegant.
> >
> 
> Inelegant is a matter of opinion, I actually rather like this solution.
> 
> BTW this isn't my design rather this was Jason's idea.
>  
> > AFAICT it proposes to have 1:1 between *userspace* created contexts (per
> > context _and_ engine) and drm_sched. I am not sure avoiding invasive changes
> > to the shared code is in the spirit of the overall idea and instead
> > opportunity should be used to look at way to refactor/improve drm_sched.
> >
> 
> Yes, it is 1:1 *userspace* engines and drm_sched.
> 
> I'm not really prepared to make large changes to DRM scheduler at the
> moment for Xe as they are not really required nor does Boris seem they
> will be required for his work either. I am interested to see what Boris
> comes up with.
> 
> > Even on the low level, the idea to replace drm_sched threads with workers
> > has a few problems.
> > 
> > To start with, the pattern of:
> > 
> >   while (not_stopped) {
> > 	keep picking jobs
> >   }
> > 
> > Feels fundamentally in disagreement with workers (while obviously fits
> > perfectly with the current kthread design).
> >
> 
> The while loop breaks and worker exists if no jobs are ready.
> 
> > Secondly, it probably demands separate workers (not optional), otherwise
> > behaviour of shared workqueues has either the potential to explode number
> > kernel threads anyway, or add latency.
> > 
> 
> Right now the system_unbound_wq is used which does have a limit on the
> number of threads, right? I do have a FIXME to allow a worker to be
> passed in similar to TDR.
> 
> WRT to latency, the 1:1 ratio could actually have lower latency as 2 GPU
> schedulers can be pushing jobs into the backend / cleaning up jobs in
> parallel.
> 

Thought of one more point on why in Xe we absolutely want a 1 to 1
ratio between entity and scheduler - the way we implement timeslicing
for preempt fences.

Let me try to explain.

Preempt fences are implemented via the generic messaging interface [1]
with suspend / resume messages. If a suspend message is received too
soon after calling resume (this is per entity), we simply sleep in the
suspend call, thus giving the entity a timeslice. This completely falls
apart with a many to 1 relationship, as now an entity waiting for a
timeslice blocks the other entities. Could we work around this? Sure,
but it's just another bunch of code we'd have to add in Xe. Being able
to freely sleep in the backend without affecting other entities is
really, really nice IMO, and I bet Xe isn't the only driver that is
going to feel this way.
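
Something like this, as a rough sketch (all names and the MIN_TIMESLICE
value are made up for illustration, not the actual Xe code):

    static void guc_engine_suspend(struct guc_engine *e)
    {
        unsigned long deadline = e->last_resume + MIN_TIMESLICE;

        /* Suspend arrived too soon after resume: sleep so the entity
         * keeps its slot for a minimum timeslice. Sleeping here is only
         * safe because this backend serves exactly one entity; with N:1
         * it would stall the other N - 1 entities as well. */
        if (time_before(jiffies, deadline))
            msleep(jiffies_to_msecs(deadline - jiffies));

        send_suspend_to_guc(e);  /* placeholder */
    }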

Last thing I'll say: regardless of how anyone feels about Xe using a 1
to 1 relationship, this patch IMO makes sense, as I hope we can all
agree a workqueue scales better than kthreads.

Matt

[1] https://patchwork.freedesktop.org/patch/515857/?series=112189&rev=1 

> > What would be interesting to learn is whether the option of refactoring
> > drm_sched to deal with out of order completion was considered and what were
> > the conclusions.
> >
> 
> I coded this up a while back when trying to convert the i915 to the DRM
> scheduler it isn't all that hard either. The free flow control on the
> ring (e.g. set job limit == SIZE OF RING / MAX JOB SIZE) is really what
> sold me on the this design.
> 
> > Second option perhaps to split out the drm_sched code into parts which would
> > lend themselves more to "pick and choose" of its functionalities.
> > Specifically, Xe wants frontend dependency tracking, but not any scheduling
> > really (neither least busy drm_sched, neither FIFO/RQ entity picking), so
> > even having all these data structures in memory is a waste.
> > 
> 
> I don't think that we are wasting memory is a very good argument for
> making intrusive changes to the DRM scheduler.
> 
> > With the first option then the end result could be drm_sched per engine
> > class (hardware view), which I think fits with the GuC model. Give all
> > schedulable contexts (entities) to the GuC and then mostly forget about
> > them. Timeslicing and re-ordering and all happens transparently to the
> > kernel from that point until completion.
> > 
> 
> Out-of-order problem still exists here.
> 
> > Or with the second option you would build on some smaller refactored
> > sub-components of drm_sched, by maybe splitting the dependency tracking from
> > scheduling (RR/FIFO entity picking code).
> > 
> > Second option is especially a bit vague and I haven't thought about the
> > required mechanics, but it just appeared too obvious the proposed design has
> > a bit too much impedance mismatch.
> >
> 
> IMO ROI on this is low and again lets see what Boris comes up with.
> 
> Matt
> 
> > Oh and as a side note, when I went into the drm_sched code base to remind
> > myself how things worked, it is quite easy to find some FIXME comments which
> > suggest people working on it are unsure of locking desing there and such. So
> > perhaps that all needs cleanup too, I mean would benefit from
> > refactoring/improving work as brainstormed above anyway.
> > 
> > Regards,
> > 
> > Tvrtko
Tvrtko Ursulin Jan. 9, 2023, 1:46 p.m. UTC | #11
On 06/01/2023 23:52, Matthew Brost wrote:
> On Thu, Jan 05, 2023 at 09:43:41PM +0000, Matthew Brost wrote:
>> On Tue, Jan 03, 2023 at 01:02:15PM +0000, Tvrtko Ursulin wrote:
>>>
>>> On 02/01/2023 07:30, Boris Brezillon wrote:
>>>> On Fri, 30 Dec 2022 12:55:08 +0100
>>>> Boris Brezillon <boris.brezillon@collabora.com> wrote:
>>>>
>>>>> On Fri, 30 Dec 2022 11:20:42 +0100
>>>>> Boris Brezillon <boris.brezillon@collabora.com> wrote:
>>>>>
>>>>>> Hello Matthew,
>>>>>>
>>>>>> On Thu, 22 Dec 2022 14:21:11 -0800
>>>>>> Matthew Brost <matthew.brost@intel.com> wrote:
>>>>>>> In XE, the new Intel GPU driver, a choice has made to have a 1 to 1
>>>>>>> mapping between a drm_gpu_scheduler and drm_sched_entity. At first this
>>>>>>> seems a bit odd but let us explain the reasoning below.
>>>>>>>
>>>>>>> 1. In XE the submission order from multiple drm_sched_entity is not
>>>>>>> guaranteed to be the same completion even if targeting the same hardware
>>>>>>> engine. This is because in XE we have a firmware scheduler, the GuC,
>>>>>>> which allowed to reorder, timeslice, and preempt submissions. If a using
>>>>>>> shared drm_gpu_scheduler across multiple drm_sched_entity, the TDR falls
>>>>>>> apart as the TDR expects submission order == completion order. Using a
>>>>>>> dedicated drm_gpu_scheduler per drm_sched_entity solve this problem.
>>>>>>
>>>>>> Oh, that's interesting. I've been trying to solve the same sort of
>>>>>> issues to support Arm's new Mali GPU which is relying on a FW-assisted
>>>>>> scheduling scheme (you give the FW N streams to execute, and it does
>>>>>> the scheduling between those N command streams, the kernel driver
>>>>>> does timeslice scheduling to update the command streams passed to the
>>>>>> FW). I must admit I gave up on using drm_sched at some point, mostly
>>>>>> because the integration with drm_sched was painful, but also because I
>>>>>> felt trying to bend drm_sched to make it interact with a
>>>>>> timeslice-oriented scheduling model wasn't really future proof. Giving
>>>>>> drm_sched_entity exlusive access to a drm_gpu_scheduler probably might
>>>>>> help for a few things (didn't think it through yet), but I feel it's
>>>>>> coming short on other aspects we have to deal with on Arm GPUs.
>>>>>
>>>>> Ok, so I just had a quick look at the Xe driver and how it
>>>>> instantiates the drm_sched_entity and drm_gpu_scheduler, and I think I
>>>>> have a better understanding of how you get away with using drm_sched
>>>>> while still controlling how scheduling is really done. Here
>>>>> drm_gpu_scheduler is just a dummy abstract that let's you use the
>>>>> drm_sched job queuing/dep/tracking mechanism. The whole run-queue
>>>>> selection is dumb because there's only one entity ever bound to the
>>>>> scheduler (the one that's part of the xe_guc_engine object which also
>>>>> contains the drm_gpu_scheduler instance). I guess the main issue we'd
>>>>> have on Arm is the fact that the stream doesn't necessarily get
>>>>> scheduled when ->run_job() is called, it can be placed in the runnable
>>>>> queue and be picked later by the kernel-side scheduler when a FW slot
>>>>> gets released. That can probably be sorted out by manually disabling the
>>>>> job timer and re-enabling it when the stream gets picked by the
>>>>> scheduler. But my main concern remains, we're basically abusing
>>>>> drm_sched here.
>>>>>
>>>>> For the Arm driver, that means turning the following sequence
>>>>>
>>>>> 1. wait for job deps
>>>>> 2. queue job to ringbuf and push the stream to the runnable
>>>>>      queue (if it wasn't queued already). Wakeup the timeslice scheduler
>>>>>      to re-evaluate (if the stream is not on a FW slot already)
>>>>> 3. stream gets picked by the timeslice scheduler and sent to the FW for
>>>>>      execution
>>>>>
>>>>> into
>>>>>
>>>>> 1. queue job to entity which takes care of waiting for job deps for
>>>>>      us
>>>>> 2. schedule a drm_sched_main iteration
>>>>> 3. the only available entity is picked, and the first job from this
>>>>>      entity is dequeued. ->run_job() is called: the job is queued to the
>>>>>      ringbuf and the stream is pushed to the runnable queue (if it wasn't
>>>>>      queued already). Wakeup the timeslice scheduler to re-evaluate (if
>>>>>      the stream is not on a FW slot already)
>>>>> 4. stream gets picked by the timeslice scheduler and sent to the FW for
>>>>>      execution
>>>>>
>>>>> That's one extra step we don't really need. To sum-up, yes, all the
>>>>> job/entity tracking might be interesting to share/re-use, but I wonder
>>>>> if we couldn't have that without pulling out the scheduling part of
>>>>> drm_sched, or maybe I'm missing something, and there's something in
>>>>> drm_gpu_scheduler you really need.
>>>>
>>>> On second thought, that's probably an acceptable overhead (not even
>>>> sure the extra step I was mentioning exists in practice, because dep
>>>> fence signaled state is checked as part of the drm_sched_main
>>>> iteration, so that's basically replacing the worker I schedule to
>>>> check job deps), and I like the idea of being able to re-use drm_sched
>>>> dep-tracking without resorting to invasive changes to the existing
>>>> logic, so I'll probably give it a try.
>>>
>>> I agree with the concerns and think that how Xe proposes to integrate with
>>> drm_sched is a problem, or at least significantly inelegant.
>>>
>>
>> Inelegant is a matter of opinion, I actually rather like this solution.
>>
>> BTW this isn't my design rather this was Jason's idea.
>>   
>>> AFAICT it proposes to have 1:1 between *userspace* created contexts (per
>>> context _and_ engine) and drm_sched. I am not sure avoiding invasive changes
>>> to the shared code is in the spirit of the overall idea and instead
>>> opportunity should be used to look at way to refactor/improve drm_sched.
>>>
>>
>> Yes, it is 1:1 *userspace* engines and drm_sched.
>>
>> I'm not really prepared to make large changes to DRM scheduler at the
>> moment for Xe as they are not really required nor does Boris seem they
>> will be required for his work either. I am interested to see what Boris
>> comes up with.
>>
>>> Even on the low level, the idea to replace drm_sched threads with workers
>>> has a few problems.
>>>
>>> To start with, the pattern of:
>>>
>>>    while (not_stopped) {
>>> 	keep picking jobs
>>>    }
>>>
>>> Feels fundamentally in disagreement with workers (while obviously fits
>>> perfectly with the current kthread design).
>>>
>>
>> The while loop breaks and worker exists if no jobs are ready.
>>
>>> Secondly, it probably demands separate workers (not optional), otherwise
>>> behaviour of shared workqueues has either the potential to explode number
>>> kernel threads anyway, or add latency.
>>>
>>
>> Right now the system_unbound_wq is used which does have a limit on the
>> number of threads, right? I do have a FIXME to allow a worker to be
>> passed in similar to TDR.
>>
>> WRT to latency, the 1:1 ratio could actually have lower latency as 2 GPU
>> schedulers can be pushing jobs into the backend / cleaning up jobs in
>> parallel.
>>
> 
> Thought of one more point here where why in Xe we absolutely want a 1 to
> 1 ratio between entity and scheduler - the way we implement timeslicing
> for preempt fences.
> 
> Let me try to explain.
> 
> Preempt fences are implemented via the generic messaging interface [1]
> with suspend / resume messages. If a suspend messages is received to
> soon after calling resume (this is per entity) we simply sleep in the
> suspend call thus giving the entity a timeslice. This completely falls
> apart with a many to 1 relationship as now a entity waiting for a
> timeslice blocks the other entities. Could we work aroudn this, sure but
> just another bunch of code we'd have to add in Xe. Being to freely sleep
> in backend without affecting other entities is really, really nice IMO
> and I bet Xe isn't the only driver that is going to feel this way.
> 
> Last thing I'll say regardless of how anyone feels about Xe using a 1 to
> 1 relationship this patch IMO makes sense as I hope we can all agree a
> workqueue scales better than kthreads.

I don't know for sure what will scale better and for what use case -
combination of CPU cores vs number of GPU engines to keep busy vs other
system activity. But I wager someone is bound to ask for some numbers
to make sure the proposal is not negatively affecting any other drivers.

In any case that's a low level question caused by the high level design 
decision. So I'd think first focus on the high level - which is the 1:1 
mapping of entity to scheduler instance proposal.

Fundamentally it will be up to the DRM maintainers and the community to
bless your approach. And it is important to stress that 1:1 is about
userspace contexts, so I believe unlike any other current scheduler
user. And it is also important to stress that this effectively does not
make Xe _really_ use the scheduler that much.

I can only offer my opinion, which is that the two options mentioned in
this thread (either improve the drm scheduler to cope with what is
required, or split up the code so you can use just the parts of
drm_sched which you want - which is frontend dependency tracking)
shouldn't be so readily dismissed, given how I think the idea was for
the new driver to work less in a silo and more in the community (not do
kludges to work around stuff because it is thought to be too hard to
improve the common code); but fundamentally, "goto previous paragraph"
as far as I am concerned.

Regards,

Tvrtko

P.S. And as a related side note, there are more areas where drm_sched 
could be improved, like for instance priority handling.
Take a look at msm_submitqueue_create / msm_gpu_convert_priority / 
get_sched_entity to see how msm works around the drm_sched hardcoded 
limit of available priority levels, in order to avoid having to leave a 
hw capability unused. I suspect msm would be happier if they could have 
all priority levels equal in terms of whether they apply only at the 
frontend level or completely throughout the pipeline.
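
From memory (so treat this as a paraphrase rather than the verbatim msm
code), the workaround folds the userspace priority into a (ring,
sched-priority) pair roughly like so:

    static int convert_priority(struct msm_gpu *gpu, int prio,
                                unsigned int *ring_nr,
                                enum drm_sched_priority *sched_prio)
    {
        unsigned int rn = prio / NR_SCHED_PRIORITIES;
        unsigned int sp = prio % NR_SCHED_PRIORITIES;

        if (rn >= gpu->nr_rings)
            return -EINVAL;

        *ring_nr = rn;
        /* drm_sched treats higher numbers as higher priority, while the
         * msm uapi treats 0 as highest, hence the inversion. */
        *sched_prio = NR_SCHED_PRIORITIES - 1 - sp;
        return 0;
    }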

> [1] https://patchwork.freedesktop.org/patch/515857/?series=112189&rev=1
> 
>>> What would be interesting to learn is whether the option of refactoring
>>> drm_sched to deal with out of order completion was considered and what were
>>> the conclusions.
>>>
>>
>> I coded this up a while back when trying to convert the i915 to the DRM
>> scheduler it isn't all that hard either. The free flow control on the
>> ring (e.g. set job limit == SIZE OF RING / MAX JOB SIZE) is really what
>> sold me on the this design.
>>
>>> Second option perhaps to split out the drm_sched code into parts which would
>>> lend themselves more to "pick and choose" of its functionalities.
>>> Specifically, Xe wants frontend dependency tracking, but not any scheduling
>>> really (neither least busy drm_sched, neither FIFO/RQ entity picking), so
>>> even having all these data structures in memory is a waste.
>>>
>>
>> I don't think that we are wasting memory is a very good argument for
>> making intrusive changes to the DRM scheduler.
>>
>>> With the first option then the end result could be drm_sched per engine
>>> class (hardware view), which I think fits with the GuC model. Give all
>>> schedulable contexts (entities) to the GuC and then mostly forget about
>>> them. Timeslicing and re-ordering and all happens transparently to the
>>> kernel from that point until completion.
>>>
>>
>> Out-of-order problem still exists here.
>>
>>> Or with the second option you would build on some smaller refactored
>>> sub-components of drm_sched, by maybe splitting the dependency tracking from
>>> scheduling (RR/FIFO entity picking code).
>>>
>>> Second option is especially a bit vague and I haven't thought about the
>>> required mechanics, but it just appeared too obvious the proposed design has
>>> a bit too much impedance mismatch.
>>>
>>
>> IMO ROI on this is low and again lets see what Boris comes up with.
>>
>> Matt
>>
>>> Oh and as a side note, when I went into the drm_sched code base to remind
>>> myself how things worked, it is quite easy to find some FIXME comments which
>>> suggest people working on it are unsure of locking desing there and such. So
>>> perhaps that all needs cleanup too, I mean would benefit from
>>> refactoring/improving work as brainstormed above anyway.
>>>
>>> Regards,
>>>
>>> Tvrtko
Jason Ekstrand Jan. 9, 2023, 3:45 p.m. UTC | #12
On Thu, Jan 5, 2023 at 1:40 PM Matthew Brost <matthew.brost@intel.com>
wrote:

> On Mon, Jan 02, 2023 at 08:30:19AM +0100, Boris Brezillon wrote:
> > On Fri, 30 Dec 2022 12:55:08 +0100
> > Boris Brezillon <boris.brezillon@collabora.com> wrote:
> >
> > > On Fri, 30 Dec 2022 11:20:42 +0100
> > > Boris Brezillon <boris.brezillon@collabora.com> wrote:
> > >
> > > > Hello Matthew,
> > > >
> > > > On Thu, 22 Dec 2022 14:21:11 -0800
> > > > Matthew Brost <matthew.brost@intel.com> wrote:
> > > >
> > > > > In XE, the new Intel GPU driver, a choice has made to have a 1 to 1
> > > > > mapping between a drm_gpu_scheduler and drm_sched_entity. At first
> this
> > > > > seems a bit odd but let us explain the reasoning below.
> > > > >
> > > > > 1. In XE the submission order from multiple drm_sched_entity is not
> > > > > guaranteed to be the same completion even if targeting the same
> hardware
> > > > > engine. This is because in XE we have a firmware scheduler, the
> GuC,
> > > > > which allowed to reorder, timeslice, and preempt submissions. If a
> using
> > > > > shared drm_gpu_scheduler across multiple drm_sched_entity, the TDR
> falls
> > > > > apart as the TDR expects submission order == completion order.
> Using a
> > > > > dedicated drm_gpu_scheduler per drm_sched_entity solve this
> problem.
> > > >
> > > > Oh, that's interesting. I've been trying to solve the same sort of
> > > > issues to support Arm's new Mali GPU which is relying on a
> FW-assisted
> > > > scheduling scheme (you give the FW N streams to execute, and it does
> > > > the scheduling between those N command streams, the kernel driver
> > > > does timeslice scheduling to update the command streams passed to the
> > > > FW). I must admit I gave up on using drm_sched at some point, mostly
> > > > because the integration with drm_sched was painful, but also because
> I
> > > > felt trying to bend drm_sched to make it interact with a
> > > > timeslice-oriented scheduling model wasn't really future proof.
> Giving
> > > > drm_sched_entity exlusive access to a drm_gpu_scheduler probably
> might
> > > > help for a few things (didn't think it through yet), but I feel it's
> > > > coming short on other aspects we have to deal with on Arm GPUs.
> > >
> > > Ok, so I just had a quick look at the Xe driver and how it
> > > instantiates the drm_sched_entity and drm_gpu_scheduler, and I think I
> > > have a better understanding of how you get away with using drm_sched
> > > while still controlling how scheduling is really done. Here
> > > drm_gpu_scheduler is just a dummy abstract that let's you use the
> > > drm_sched job queuing/dep/tracking mechanism. The whole run-queue
>
> You nailed it here, we use the DRM scheduler for queuing jobs,
> dependency tracking and releasing jobs to be scheduled when dependencies
> are met, and lastly a tracking mechanism of inflights jobs that need to
> be cleaned up if an error occurs. It doesn't actually do any scheduling
> aside from the most basic level of not overflowing the submission ring
> buffer. In this sense, a 1 to 1 relationship between entity and
> scheduler fits quite well.
>

Yeah, I think there's an annoying difference between what AMD/NVIDIA/Intel
want here and what you need for Arm thanks to the number of FW queues
available. I don't remember the exact number of GuC queues but it's at
least 1k. This puts it in an entirely different class from what you have on
Mali. Roughly, there's about three categories here:

 1. Hardware where the kernel is placing jobs on actual HW rings. This is
old Mali, Intel Haswell and earlier, and probably a bunch of others.
(Intel BDW+ with execlists is a weird case that doesn't fit in this
categorization.)

 2. Hardware (or firmware) with a very limited number of queues where
you're going to have to juggle in the kernel in order to run desktop Linux.

 3. Firmware scheduling with a high queue count. In this case, you don't
want the kernel scheduling anything. Just throw it at the firmware and let
it go brrrrr.  If we ever run out of queues (unlikely), the kernel can
temporarily pause some low-priority contexts and do some juggling or,
frankly, just fail userspace queue creation and tell the user to close some
windows.

The existence of this 2nd class is a bit annoying but it's where we are. I
think it's worth recognizing that Xe and panfrost are in different places
here and will require different designs. For Xe, we really are just using
drm/scheduler as a front-end and the firmware does all the real scheduling.

How do we deal with class 2? That's an interesting question.  We may
eventually want to break that off into a separate discussion and not litter
the Xe thread but let's keep going here for a bit.  I think there are some
pretty reasonable solutions but they're going to look a bit different.

The way I did this for Xe with execlists was to keep the 1:1:1 mapping
between drm_gpu_scheduler, drm_sched_entity, and userspace xe_engine.
Instead of feeding a GuC ring, though, it would feed a fixed-size execlist
ring and then there was a tiny kernel which operated entirely in IRQ
handlers which juggled those execlists by smashing HW registers.  For
Panfrost, I think we want something slightly different but can borrow some
ideas here.  In particular, have the schedulers feed kernel-side SW queues
(they can even be fixed-size if that helps) and then have a kthread which
juggles those and feeds the limited FW queues.  In the case where you have few
enough active contexts to fit them all in FW, I do think it's best to have
them all active in FW and let it schedule. But with only 31, you need to be
able to juggle if you run out.
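
A hedged sketch of that juggling loop (the structures, counts and
helpers here are all hypothetical):

    static int fw_slot_juggler(void *data)
    {
        struct fw_sched *fs = data;  /* hypothetical per-device state */

        while (!kthread_should_stop()) {
            wait_event_interruptible(fs->wait,
                kthread_should_stop() ||
                atomic_read(&fs->runnable_not_resident));

            /* Kick out a resident context whose timeslice expired (if
             * any), then promote the highest-priority runnable SW queue
             * into the freed FW slot. */
            evict_expired_context(fs);    /* placeholder */
            promote_runnable_queues(fs);  /* placeholder */
        }

        return 0;
    }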

> FWIW this design was also run by AMD quite a while ago (off the list)
> and we didn't get any serious push back. Things can change however...
>

Yup, AMD and NVIDIA both want this, more-or-less.


> > > selection is dumb because there's only one entity ever bound to the
> > > scheduler (the one that's part of the xe_guc_engine object which also
> > > contains the drm_gpu_scheduler instance). I guess the main issue we'd
> > > have on Arm is the fact that the stream doesn't necessarily get
> > > scheduled when ->run_job() is called, it can be placed in the runnable
> > > queue and be picked later by the kernel-side scheduler when a FW slot
> > > gets released. That can probably be sorted out by manually disabling
> the
> > > job timer and re-enabling it when the stream gets picked by the
> > > scheduler. But my main concern remains, we're basically abusing
> > > drm_sched here.
> > >
>
> That's a matter of opinion, yes we are using it slightly differently
> than anyone else but IMO the fact the DRM scheduler works for the Xe use
> case with barely any changes is a testament to its design.
>
> > > For the Arm driver, that means turning the following sequence
> > >
> > > 1. wait for job deps
> > > 2. queue job to ringbuf and push the stream to the runnable
> > >    queue (if it wasn't queued already). Wakeup the timeslice scheduler
> > >    to re-evaluate (if the stream is not on a FW slot already)
> > > 3. stream gets picked by the timeslice scheduler and sent to the FW for
> > >    execution
> > >
> > > into
> > >
> > > 1. queue job to entity which takes care of waiting for job deps for
> > >    us
> > > 2. schedule a drm_sched_main iteration
> > > 3. the only available entity is picked, and the first job from this
> > >    entity is dequeued. ->run_job() is called: the job is queued to the
> > >    ringbuf and the stream is pushed to the runnable queue (if it wasn't
> > >    queued already). Wakeup the timeslice scheduler to re-evaluate (if
> > >    the stream is not on a FW slot already)
> > > 4. stream gets picked by the timeslice scheduler and sent to the FW for
> > >    execution
> > >
>
> Yes, an extra step but you get to use all the nice DRM scheduler
> functions for dependency tracking. Also in our case we really want a
> single entry point in the backend (the work queue). Also see [1] which
> helped us seal a bunch of races we had in the i915 by using a single
> entry point. All these benefits are why we landed on the DRM scheduler
> and it has worked out rather nicely compared to the i915.
>
> [1] https://patchwork.freedesktop.org/patch/515857/?series=112189&rev=1
>
> > > That's one extra step we don't really need. To sum-up, yes, all the
> > > job/entity tracking might be interesting to share/re-use, but I wonder
> > > if we couldn't have that without pulling out the scheduling part of
> > > drm_sched, or maybe I'm missing something, and there's something in
> > > drm_gpu_scheduler you really need.
> >
> > On second thought, that's probably an acceptable overhead (not even
> > sure the extra step I was mentioning exists in practice, because dep
> > fence signaled state is checked as part of the drm_sched_main
> > iteration, so that's basically replacing the worker I schedule to
> > check job deps), and I like the idea of being able to re-use drm_sched
> > dep-tracking without resorting to invasive changes to the existing
> > logic, so I'll probably give it a try.
>
> Let me know how this goes.
>
> Matt
>
Boris Brezillon Jan. 9, 2023, 5:17 p.m. UTC | #13
Hi Jason,

On Mon, 9 Jan 2023 09:45:09 -0600
Jason Ekstrand <jason@jlekstrand.net> wrote:

> On Thu, Jan 5, 2023 at 1:40 PM Matthew Brost <matthew.brost@intel.com>
> wrote:
> 
> > [snip]
> >
> The way I did this for Xe with execlists was to keep the 1:1:1 mapping
> between drm_gpu_scheduler, drm_sched_entity, and userspace xe_engine.
> Instead of feeding a GuC ring, though, it would feed a fixed-size execlist
> ring and then there was a tiny kernel which operated entirely in IRQ
> handlers which juggled those execlists by smashing HW registers.  For
> Panfrost, I think we want something slightly different but can borrow some
> ideas here.  In particular, have the schedulers feed kernel-side SW queues
> (they can even be fixed-size if that helps) and then have a kthread which
> juggles those feeds the limited FW queues.  In the case where you have few
> enough active contexts to fit them all in FW, I do think it's best to have
> them all active in FW and let it schedule. But with only 31, you need to be
> able to juggle if you run out.

That's more or less what I do right now, except I don't use the
drm_sched front-end to handle deps or queue jobs (at least not yet). The
kernel-side timeslice-based scheduler, which juggles runnable queues
(queues with pending jobs that are not yet resident on a FW slot),
uses a dedicated ordered workqueue instead of a thread, with scheduler
ticks handled by a delayed-work (a tick happening every X
milliseconds while queues are waiting for a slot). It all seems very
HW/FW-specific though, and I think it's a bit premature to try to
generalize that part, but the dep-tracking logic implemented by
drm_sched looked like something I could easily re-use, hence my
interest in Xe's approach.
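
For illustration, the tick is shaped roughly like this (heavily simplified
from my WIP code, with made-up names; reevaluate_fw_slots() and
queues_waiting_for_slot() stand in for the HW/FW-specific parts):

    #include <linux/workqueue.h>

    struct sched_tick {
        struct workqueue_struct *wq; /* from alloc_ordered_workqueue() */
        struct delayed_work work;
        unsigned long period; /* tick period, in jiffies */
    };

    static void reevaluate_fw_slots(struct sched_tick *t); /* rotate/evict */
    static bool queues_waiting_for_slot(struct sched_tick *t);

    static void sched_tick_fn(struct work_struct *work)
    {
        struct sched_tick *t = container_of(to_delayed_work(work),
                                            struct sched_tick, work);

        /* Re-evaluate which queues should be resident on FW slots. */
        reevaluate_fw_slots(t);

        /* Keep ticking only while some queue is waiting for a slot. */
        if (queues_waiting_for_slot(t))
            mod_delayed_work(t->wq, &t->work, t->period);
    }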

Regards,

Boris
Jason Ekstrand Jan. 9, 2023, 5:27 p.m. UTC | #14
On Mon, Jan 9, 2023 at 7:46 AM Tvrtko Ursulin <
tvrtko.ursulin@linux.intel.com> wrote:

>
> On 06/01/2023 23:52, Matthew Brost wrote:
> > On Thu, Jan 05, 2023 at 09:43:41PM +0000, Matthew Brost wrote:
> >> On Tue, Jan 03, 2023 at 01:02:15PM +0000, Tvrtko Ursulin wrote:
> >>>
> >>> On 02/01/2023 07:30, Boris Brezillon wrote:
> >>>> [snip]
> >>>>
> >>>> On second thought, that's probably an acceptable overhead (not even
> >>>> sure the extra step I was mentioning exists in practice, because dep
> >>>> fence signaled state is checked as part of the drm_sched_main
> >>>> iteration, so that's basically replacing the worker I schedule to
> >>>> check job deps), and I like the idea of being able to re-use drm_sched
> >>>> dep-tracking without resorting to invasive changes to the existing
> >>>> logic, so I'll probably give it a try.
> >>>
> >>> I agree with the concerns and think that how Xe proposes to integrate
> with
> >>> drm_sched is a problem, or at least significantly inelegant.
> >>>
> >>
> >> Inelegant is a matter of opinion, I actually rather like this solution.
> >>
> >> BTW this isn't my design rather this was Jason's idea.
>

Sure, throw me under the bus, why don't you? :-P  Nah, it's all fine.  It's
my design and I'm happy to defend it or be blamed for it in the history
books as the case may be.


> >>> AFAICT it proposes to have 1:1 between *userspace* created contexts
> (per
> >>> context _and_ engine) and drm_sched. I am not sure avoiding invasive
> changes
> >>> to the shared code is in the spirit of the overall idea and instead
> >>> opportunity should be used to look at way to refactor/improve
> drm_sched.
>

Maybe?  I'm not convinced that what Xe is doing is an abuse at all or
really needs to drive a re-factor.  (More on that later.)  There's only one
real issue which is that it fires off potentially a lot of kthreads. Even
that's not that bad given that kthreads are pretty light and you're not
likely to have more kthreads than userspace threads which are much
heavier.  Not ideal, but not the end of the world either.  Definitely
something we can/should optimize but if we went through with Xe without
this patch, it would probably be mostly ok.


> >> Yes, it is 1:1 *userspace* engines and drm_sched.
> >>
> >> I'm not really prepared to make large changes to DRM scheduler at the
> >> moment for Xe as they are not really required nor does Boris seem they
> >> will be required for his work either. I am interested to see what Boris
> >> comes up with.
> >>
> >>> Even on the low level, the idea to replace drm_sched threads with
> workers
> >>> has a few problems.
> >>>
> >>> To start with, the pattern of:
> >>>
> >>>    while (not_stopped) {
> >>>     keep picking jobs
> >>>    }
> >>>
> >>> Feels fundamentally in disagreement with workers (while obviously fits
> >>> perfectly with the current kthread design).
> >>
> >> The while loop breaks and worker exists if no jobs are ready.
>

I'm not very familiar with workqueues. What are you saying would fit
better? One scheduling job per work item rather than one big work item
which handles all available jobs?


> >>> Secondly, it probably demands separate workers (not optional),
> otherwise
> >>> behaviour of shared workqueues has either the potential to explode
> number
> >>> kernel threads anyway, or add latency.
> >>>
> >>
> >> Right now the system_unbound_wq is used which does have a limit on the
> >> number of threads, right? I do have a FIXME to allow a worker to be
> >> passed in similar to TDR.
> >>
> >> WRT to latency, the 1:1 ratio could actually have lower latency as 2 GPU
> >> schedulers can be pushing jobs into the backend / cleaning up jobs in
> >> parallel.
> >>
> >
> > Thought of one more point here where why in Xe we absolutely want a 1 to
> > 1 ratio between entity and scheduler - the way we implement timeslicing
> > for preempt fences.
> >
> > Let me try to explain.
> >
> > Preempt fences are implemented via the generic messaging interface [1]
> > with suspend / resume messages. If a suspend message is received too
> > soon after calling resume (this is per entity) we simply sleep in the
> > suspend call thus giving the entity a timeslice. This completely falls
> > apart with a many to 1 relationship as now an entity waiting for a
> > timeslice blocks the other entities. Could we work around this, sure, but
> > just another bunch of code we'd have to add in Xe. Being able to freely sleep
> > in backend without affecting other entities is really, really nice IMO
> > and I bet Xe isn't the only driver that is going to feel this way.
> >
> > Last thing I'll say regardless of how anyone feels about Xe using a 1 to
> > 1 relationship this patch IMO makes sense as I hope we can all agree a
> > workqueue scales better than kthreads.
>
> I don't know for sure what will scale better and for what use case,
> combination of CPU cores vs number of GPU engines to keep busy vs other
> system activity. But I wager someone is bound to ask for some numbers to
> make sure proposal is not negatively affecting any other drivers.
>

Then let them ask.  Waving your hands vaguely in the direction of the rest
of DRM and saying "Uh, someone (not me) might object" is profoundly
unhelpful.  Sure, someone might.  That's why it's on dri-devel.  If you
think there's someone in particular who might have a useful opinion on
this, throw them in the CC so they don't miss the e-mail thread.

Or are you asking for numbers?  If so, what numbers are you asking for?

Also, If we're talking about a design that might paint us into an
Intel-HW-specific hole, that would be one thing.  But we're not.  We're
talking about switching which kernel threading/task mechanism to use for
what's really a very generic problem.  The core Xe design works without
this patch (just with more kthreads).  If we land this patch or something
like it and get it wrong and it causes a performance problem for someone
down the line, we can revisit it.


> In any case that's a low level question caused by the high level design
> decision. So I'd think first focus on the high level - which is the 1:1
> mapping of entity to scheduler instance proposal.
>
> Fundamentally it will be up to the DRM maintainers and the community to
> bless your approach. And it is important to stress 1:1 is about
> userspace contexts, so I believe unlike any other current scheduler
> user. And also important to stress this effectively does not make Xe
> _really_ use the scheduler that much.
>

I don't think this makes Xe nearly as much of a one-off as you think it
does.  I've already told the Asahi team working on Apple M1/2 hardware to
do it this way and it seems to be a pretty good mapping for them.  I
believe this is roughly the plan for nouveau as well.  It's not the way it
currently works for anyone because most other groups aren't doing FW
scheduling yet.  In the world of FW scheduling and hardware designed to
support userspace direct-to-FW submit, I think the design makes perfect
sense (see below) and I expect we'll see more drivers move in this
direction as those drivers evolve.  (AMD is doing some customish thing with
gpu_scheduler on the front-end somehow.  I've not dug into those
details.)


> I can only offer my opinion, which is that the two options mentioned in
> this thread (either improve drm scheduler to cope with what is required,
> or split up the code so you can use just the parts of drm_sched which
> you want - which is frontend dependency tracking) shouldn't be so
> readily dismissed, given how I think the idea was for the new driver to
> work less in a silo and more in the community (not do kludges to
> workaround stuff because it is thought to be too hard to improve common
> code), but fundamentally, "goto previous paragraph" for what I am
> concerned.
>

Meta comment:  It appears as if you're falling into the standard i915 team
trap of having an internal discussion about what the community discussion
might look like instead of actually having the community discussion.  If
you are seriously concerned about interactions with other drivers or with
setting common direction, the right way to do that is to break a
patch or two out into a separate RFC series and tag a handful of driver
maintainers.  Trying to predict the questions other people might ask is
pointless. Cc them and ask for their input instead.


> Regards,
>
> Tvrtko
>
> P.S. And as a related side note, there are more areas where drm_sched
> could be improved, like for instance priority handling.
> Take a look at msm_submitqueue_create / msm_gpu_convert_priority /
> get_sched_entity to see how msm works around the drm_sched hardcoded
> limit of available priority levels, in order to avoid having to leave a
> hw capability unused. I suspect msm would be happier if they could have
> all priority levels equal in terms of whether they apply only at the
> frontend level or completely throughout the pipeline.
>
> > [1] https://patchwork.freedesktop.org/patch/515857/?series=112189&rev=1
> >
> >>> What would be interesting to learn is whether the option of refactoring
> >>> drm_sched to deal with out of order completion was considered and what
> were
> >>> the conclusions.
> >>>
> >>
> >> I coded this up a while back when trying to convert the i915 to the DRM
> >> scheduler it isn't all that hard either. The free flow control on the
> >> ring (e.g. set job limit == SIZE OF RING / MAX JOB SIZE) is really what
> >> sold me on this design.
>

You're not the only one to suggest supporting out-of-order completion.
However, it's tricky and breaks a lot of internal assumptions of the
scheduler. It also reduces functionality a bit because it can no longer
automatically rate-limit HW/FW queues which are often fixed-size.  (Ok,
yes, it probably could but it becomes a substantially harder problem.)

It also seems like a worse mapping to me.  The goal here is to turn
submissions on a userspace-facing engine/queue into submissions to a FW
queue, sorting out any dma_fence dependencies.  Matt's
description of this as a 1:1 mapping between sched/entity doesn't
tell the whole story. It's a 1:1:1 mapping between xe_engine,
gpu_scheduler, and GuC FW engine (sketched below).  Why make it a
1:something:1 mapping?  Why is that better?

There are two places where this 1:1:1 mapping is causing problems:

 1. It creates lots of kthreads. This is what this patch is trying to
solve. IDK if it's solving it the best way but that's the goal.

 2. There is a far more limited number of communication queues between the
kernel and GuC for more meta things like pausing and resuming queues,
getting events back from GuC, etc. Unless we're in a weird pressure
scenario, the amount of traffic on this queue should be low so we can
probably just have one per physical device.  The vast majority of kernel ->
GuC communication should be on the individual FW queue rings and maybe
smashing in-memory doorbells.

Doing out-of-order completion sort-of solves 1 but does nothing for 2
and actually makes managing FW queues harder because we no longer have
built-in rate limiting.  Seems like a net loss to me.
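
To be concrete, the 1:1:1 shape is roughly the following. Field names are
invented for illustration (the real layout lives in Xe's xe_guc_engine);
all that matters is that one scheduler, one entity, and one FW queue
travel together:

    #include <drm/gpu_scheduler.h>

    struct xe_engine_sketch {
        struct drm_gpu_scheduler sched;  /* one scheduler... */
        struct drm_sched_entity entity;  /* ...with exactly one entity... */
        u32 guc_id;                      /* ...bound to one GuC FW queue */
        void *ring;                      /* FW-visible submission ring */
    };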

> >>> Second option perhaps to split out the drm_sched code into parts which
> would
> >>> lend themselves more to "pick and choose" of its functionalities.
> >>> Specifically, Xe wants frontend dependency tracking, but not any
> scheduling
> >>> really (neither least busy drm_sched, neither FIFO/RQ entity picking),
> so
> >>> even having all these data structures in memory is a waste.
> >>>
> >>
> >> I don't think that we are wasting memory is a very good argument for
> >> making intrusive changes to the DRM scheduler.
>

Worse than that, I think the "we could split it up" kind-of misses the
point of the way Xe is using drm/scheduler.  It's not just about re-using a
tiny bit of dependency tracking code.  Using the scheduler in this way
provides a clean separation between front-end and back-end.  The job of the
userspace-facing ioctl code is to shove things on the scheduler.  The job
of the run_job callback is to encode the job into the FW queue format,
stick it in the FW queue ring, and maybe smash a doorbell.  Everything else
happens in terms of managing those queues side-band.  The gpu_scheduler
code manages the front-end queues and Xe manages the FW queues via the
Kernel <-> GuC communication rings.  From a high level, this is a really
clean design.  There are potentially some sticky bits around the dual-use
of dma_fence for scheduling and memory management but none of those are
solved by breaking the DRM scheduler into chunks or getting rid of the
1:1:1 mapping.
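
Spelled out, the run_job side of that contract looks roughly like this
(hypothetical names, not the actual Xe code; only the
drm_sched_backend_ops hook signature is real):

    #include <drm/gpu_scheduler.h>
    #include <linux/dma-fence.h>

    struct sketch_engine; /* owns the FW ring and doorbell */

    struct sketch_job {
        struct drm_sched_job base;
        struct sketch_engine *engine;
        struct dma_fence *hw_fence; /* signaled by the FW on completion */
    };

    /* FW-specific stubs: */
    static void encode_job(struct sketch_engine *e, struct sketch_job *job);
    static void publish_ring_tail(struct sketch_engine *e);
    static void ring_doorbell(struct sketch_engine *e);

    static struct dma_fence *sketch_run_job(struct drm_sched_job *sched_job)
    {
        struct sketch_job *job =
            container_of(sched_job, struct sketch_job, base);
        struct sketch_engine *e = job->engine;

        encode_job(e, job);   /* encode into the FW queue format */
        publish_ring_tail(e); /* make the new commands visible */
        ring_doorbell(e);     /* tell the FW there's work */

        return dma_fence_get(job->hw_fence);
    }

    static const struct drm_sched_backend_ops sketch_sched_ops = {
        .run_job = sketch_run_job,
        /* .timedout_job, .free_job, ... */
    };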

If we split it out, we're basically asking the driver to implement a bunch
of kthread or workqueue stuff, all the ring rate-limiting, etc.  It may not
be all that much code but also, why?  To save a few bytes of memory per
engine?  Each engine already has 32K(ish) worth of context state and a
similar size ring to communicate with the FW.  No one is going to notice an
extra CPU data structure.

I'm not seeing a solid argument against the 1:1:1 design here other than
that it doesn't seem like the way DRM scheduler was intended to be used.  I
won't argue that.  It's not.  But it is a fairly natural way to take
advantage of the benefits the DRM scheduler does provide while also mapping
it to hardware that was designed for userspace direct-to-FW submit.

--Jason



> >>> With the first option then the end result could be drm_sched per engine
> >>> class (hardware view), which I think fits with the GuC model. Give all
> >>> schedulable contexts (entities) to the GuC and then mostly forget about
> >>> them. Timeslicing and re-ordering and all happens transparently to the
> >>> kernel from that point until completion.
> >>>
> >>
> >> Out-of-order problem still exists here.
> >>
> >>> Or with the second option you would build on some smaller refactored
> >>> sub-components of drm_sched, by maybe splitting the dependency
> tracking from
> >>> scheduling (RR/FIFO entity picking code).
> >>>
> >>> Second option is especially a bit vague and I haven't thought about the
> >>> required mechanics, but it just appeared too obvious the proposed
> design has
> >>> a bit too much impedance mismatch.
> >>>
> >>
> >> IMO ROI on this is low and again lets see what Boris comes up with.
> >>
> >> Matt
> >>
> >>> Oh and as a side note, when I went into the drm_sched code base to
> remind
> >>> myself how things worked, it is quite easy to find some FIXME comments
> which
> >>> suggest people working on it are unsure of locking desing there and
> such. So
> >>> perhaps that all needs cleanup too, I mean would benefit from
> >>> refactoring/improving work as brainstormed above anyway.
> >>>
> >>> Regards,
> >>>
> >>> Tvrtko
>
Daniel Vetter Jan. 9, 2023, 8:40 p.m. UTC | #15
On Mon, Jan 09, 2023 at 06:17:48PM +0100, Boris Brezillon wrote:
> Hi Jason,
> 
> On Mon, 9 Jan 2023 09:45:09 -0600
> Jason Ekstrand <jason@jlekstrand.net> wrote:
> 
> > On Thu, Jan 5, 2023 at 1:40 PM Matthew Brost <matthew.brost@intel.com>
> > wrote:
> > 
> > [snip]
> 
> That's more or less what I do right now, except I don't use the
> drm_sched front-end to handle deps or queue jobs (at least not yet). The
> kernel-side timeslice-based scheduler juggling with runnable queues
> (queues with pending jobs that are not yet resident on a FW slot)
> uses a dedicated ordered-workqueue instead of a thread, with scheduler
> ticks being handled with a delayed-work (tick happening every X
> milliseconds when queues are waiting for a slot). It all seems very
> HW/FW-specific though, and I think it's a bit premature to try to
> generalize that part, but the dep-tracking logic implemented by
> drm_sched looked like something I could easily re-use, hence my
> interest in Xe's approach.

So another option for these few-fw-queue-slots schedulers would be to
treat them as vram and enlist ttm.

Well maybe more enlist ttm and less treat them like vram, but ttm can
handle idr (or xarray or whatever you want) and then help you with all the
pipelining (and the drm_sched then with sorting out dependencies). If you
then also preferentially "evict" low-priority queues you pretty much have
the perfect thing.

Note that GuC with sriov splits up the id space, and together with some
restrictions due to multi-engine contexts, the media use case might also
need all of this.

If you're balking at the idea of enlisting ttm just for fw queue
management, amdgpu has a shoddy version of id allocation for their vm/tlb
index allocation. Might be worth it to instead lift that into some sched
helper code.
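
I.e. something along these lines (purely illustrative; the xarray would be
created with DEFINE_XARRAY_ALLOC(), and evict_lowest_prio() is a made-up
helper that suspends a lower-priority resident queue and returns its slot):

    #include <linux/xarray.h>

    #define NUM_FW_SLOTS 31

    static int evict_lowest_prio(struct xarray *slots); /* made up */

    static int alloc_fw_slot(struct xarray *slots, void *queue, u32 *slot)
    {
        int ret;

        ret = xa_alloc(slots, slot, queue,
                       XA_LIMIT(0, NUM_FW_SLOTS - 1), GFP_KERNEL);
        if (ret != -EBUSY)
            return ret;

        /* All slots taken: "evict" a low-priority queue, reuse its slot. */
        ret = evict_lowest_prio(slots);
        if (ret < 0)
            return ret;

        *slot = ret;
        xa_store(slots, *slot, queue, GFP_KERNEL);
        return 0;
    }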

Either way there's two imo rather solid approaches available to sort this
out. And once you have that, then there shouldn't be any big difference in
driver design between fw with defacto unlimited queue ids, and those with
severe restrictions in number of queues.
-Daniel
Boris Brezillon Jan. 10, 2023, 8:46 a.m. UTC | #16
Hi Daniel,

On Mon, 9 Jan 2023 21:40:21 +0100
Daniel Vetter <daniel@ffwll.ch> wrote:

> On Mon, Jan 09, 2023 at 06:17:48PM +0100, Boris Brezillon wrote:
> > Hi Jason,
> > 
> > On Mon, 9 Jan 2023 09:45:09 -0600
> > Jason Ekstrand <jason@jlekstrand.net> wrote:
> >   
> > > On Thu, Jan 5, 2023 at 1:40 PM Matthew Brost <matthew.brost@intel.com>
> > > wrote:
> > >   
> > > [snip]
> > 
> > That's more or less what I do right now, except I don't use the
> > drm_sched front-end to handle deps or queue jobs (at least not yet). The
> > kernel-side timeslice-based scheduler juggling with runnable queues
> > (queues with pending jobs that are not yet resident on a FW slot)
> > uses a dedicated ordered-workqueue instead of a thread, with scheduler
> > ticks being handled with a delayed-work (tick happening every X
> > milliseconds when queues are waiting for a slot). It all seems very
> > HW/FW-specific though, and I think it's a bit premature to try to
> > generalize that part, but the dep-tracking logic implemented by
> > drm_sched looked like something I could easily re-use, hence my
> > interest in Xe's approach.  
> 
> So another option for these few fw queue slots schedulers would be to
> treat them as vram and enlist ttm.
> 
> Well maybe more enlist ttm and less treat them like vram, but ttm can
> handle idr (or xarray or whatever you want) and then help you with all the
> pipelining (and the drm_sched then with sorting out dependencies). If you
> then also preferentially "evict" low-priority queus you pretty much have
> the perfect thing.
> 
> Note that GuC with sriov splits up the id space and together with some
> restrictions due to multi-engine contexts media needs might also need this
> all.
> 
> If you're balking at the idea of enlisting ttm just for fw queue
> management, amdgpu has a shoddy version of id allocation for their vm/tlb
> index allocation. Might be worth it to instead lift that into some sched
> helper code.

Would you mind pointing me to the amdgpu code you're mentioning here?
Still have a hard time seeing what TTM has to do with scheduling, but I
also don't know much about TTM, so I'll keep digging.

> 
> Either way there's two imo rather solid approaches available to sort this
> out. And once you have that, then there shouldn't be any big difference in
> driver design between fw with defacto unlimited queue ids, and those with
> severe restrictions in number of queues.

Honestly, I don't think there's much difference between those two cases
already. There's just a bunch of additional code to schedule queues on
FW slots for the limited-number-of-FW-slots case, which, right now, is
driver specific. The job queuing front-end pretty much achieves what
drm_sched does already: queuing jobs to entities, checking deps,
submitting jobs to HW (in our case, writing to the command stream ring
buffer). Things start to differ after that point: once a scheduling
entity has pending jobs, we add it to one of the runnable queues (one
queue per prio) and kick the kernel-side timeslice-based scheduler to
re-evaluate, if needed.
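
In code, the "add to a runnable queue and kick" step looks more or less
like this (simplified, with approximate names):

    #include <linux/list.h>
    #include <linux/spinlock.h>
    #include <linux/workqueue.h>

    #define NUM_PRIOS 4

    struct sketch_queue {
        struct list_head node;
        int prio;
    };

    struct sketch_sched {
        spinlock_t lock;
        struct list_head runnable[NUM_PRIOS]; /* one list per priority */
        struct workqueue_struct *wq;          /* ordered workqueue */
        struct delayed_work tick;             /* timeslice scheduler tick */
    };

    static void queue_became_runnable(struct sketch_sched *s,
                                      struct sketch_queue *q)
    {
        spin_lock(&s->lock);
        if (list_empty(&q->node))
            list_add_tail(&q->node, &s->runnable[q->prio]);
        spin_unlock(&s->lock);

        /* Kick the scheduler to re-evaluate FW slot assignment now; the
         * tick work re-arms itself while queues are waiting for a slot.
         */
        mod_delayed_work(s->wq, &s->tick, 0);
    }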

I'm all for using generic code when it makes sense, even if that means
adding this common code when it doesn't exist, but I don't want to be
dragged into some major refactoring that might take years to land.
Especially if pancsf is the first
FW-assisted-scheduler-with-few-FW-slots driver.

Here's a link to my WIP branch [1], and here is the scheduler logic
[2] if you want to have a look. Don't pay too much attention to the
driver uAPI (it's being redesigned).

Regards,

Boris

[1] https://gitlab.freedesktop.org/bbrezillon/linux/-/tree/pancsf
[2] https://gitlab.freedesktop.org/bbrezillon/linux/-/blob/pancsf/drivers/gpu/drm/pancsf/pancsf_sched.c
Tvrtko Ursulin Jan. 10, 2023, 11:28 a.m. UTC | #17
On 09/01/2023 17:27, Jason Ekstrand wrote:

[snip]

>      >>> AFAICT it proposes to have 1:1 between *userspace* created
>     contexts (per
>      >>> context _and_ engine) and drm_sched. I am not sure avoiding
>     invasive changes
>      >>> to the shared code is in the spirit of the overall idea and instead
>      >>> opportunity should be used to look at way to refactor/improve
>     drm_sched.
> 
> 
> Maybe?  I'm not convinced that what Xe is doing is an abuse at all or 
> really needs to drive a re-factor.  (More on that later.)  There's only 
> one real issue which is that it fires off potentially a lot of kthreads. 
> Even that's not that bad given that kthreads are pretty light and you're 
> not likely to have more kthreads than userspace threads which are much 
> heavier.  Not ideal, but not the end of the world either.  Definitely 
> something we can/should optimize but if we went through with Xe without 
> this patch, it would probably be mostly ok.
> 
>      >> Yes, it is 1:1 *userspace* engines and drm_sched.
>      >>
>      >> I'm not really prepared to make large changes to DRM scheduler
>     at the
>      >> moment for Xe as they are not really required nor does Boris
>     seem they
>      >> will be required for his work either. I am interested to see
>     what Boris
>      >> comes up with.
>      >>
>      >>> Even on the low level, the idea to replace drm_sched threads
>     with workers
>      >>> has a few problems.
>      >>>
>      >>> To start with, the pattern of:
>      >>>
>      >>>    while (not_stopped) {
>      >>>     keep picking jobs
>      >>>    }
>      >>>
>      >>> Feels fundamentally in disagreement with workers (while
>     obviously fits
>      >>> perfectly with the current kthread design).
>      >>
>      >> The while loop breaks and worker exists if no jobs are ready.
> 
> 
> I'm not very familiar with workqueues. What are you saying would fit 
> better? One scheduling job per work item rather than one big work item 
> which handles all available jobs?

Yes and no, it indeed IMO does not fit to have a work item which is 
potentially unbound in runtime. But that conceptual mismatch is a bit moot 
because it is a worst-case / theoretical concern, and I think there are 
more fundamental concerns.

If we have to go back to the low level side of things, I've picked this 
random spot to consolidate what I have already mentioned and perhaps expand.

To start with, let me pull out some thoughts from workqueue.rst:

"""
Generally, work items are not expected to hog a CPU and consume many 
cycles. That means maintaining just enough concurrency to prevent work 
processing from stalling should be optimal.
"""

For unbound queues:
"""
The responsibility of regulating concurrency level is on the users.
"""

Given that unbound workers will be spawned on demand to service all queued 
work items (more interesting when mixing with the system_unbound_wq), 
in the proposed design the number of instantiated worker threads does 
not correspond to the number of user threads (as you have elsewhere 
stated), but pessimistically to the number of active user contexts. That 
is the number which drives the maximum number of not-runnable jobs that 
can become runnable at once, hence spawn that many work items, and 
in turn unbound worker threads.

Several problems there.

It is fundamentally pointless to have potentially that many more threads 
than the number of CPU cores - it simply creates a scheduling storm.

Unbound workers have no CPU / cache locality either, and no connection 
with the CPU scheduler to optimize scheduling patterns. This may matter 
either on large systems or on small ones. Whereas the current design 
allows the scheduler to notice that a userspace CPU thread keeps waking up 
the same drm scheduler kernel thread, and so it can keep them on the same 
CPU, the unbound workers lose that ability, and so a 2nd CPU might be 
woken up from deep sleep for every submission.

Hence, apart from being a bit of an impedance mismatch, the proposal has 
the potential to change performance and power patterns on both large 
and small machines.
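
To be concrete about what regulating concurrency could look like - a 
driver-owned workqueue with an explicit max_active cap, instead of piling 
everything onto system_unbound_wq (illustrative numbers only):

    #include <linux/workqueue.h>

    static struct workqueue_struct *submit_wq;

    static int submit_wq_init(void)
    {
        /* max_active = 4: at most four submission work items execute
         * concurrently, regardless of how many schedulers / contexts
         * exist, instead of worker count scaling with active contexts.
         */
        submit_wq = alloc_workqueue("drm-sched-submit", WQ_UNBOUND, 4);
        if (!submit_wq)
            return -ENOMEM;

        return 0;
    }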

>      >>> Secondly, it probably demands separate workers (not optional),
>     otherwise
>      >>> behaviour of shared workqueues has either the potential to
>     explode number
>      >>> kernel threads anyway, or add latency.
>      >>>
>      >>
>      >> Right now the system_unbound_wq is used which does have a limit
>     on the
>      >> number of threads, right? I do have a FIXME to allow a worker to be
>      >> passed in similar to TDR.
>      >>
>      >> WRT to latency, the 1:1 ratio could actually have lower latency
>     as 2 GPU
>      >> schedulers can be pushing jobs into the backend / cleaning up
>     jobs in
>      >> parallel.
>      >>
>      >
>      > Thought of one more point here where why in Xe we absolutely want
>     a 1 to
>      > 1 ratio between entity and scheduler - the way we implement
>     timeslicing
>      > for preempt fences.
>      >
>      > Let me try to explain.
>      >
>      > Preempt fences are implemented via the generic messaging
>     interface [1]
>      > with suspend / resume messages. If a suspend messages is received to
>      > soon after calling resume (this is per entity) we simply sleep in the
>      > suspend call thus giving the entity a timeslice. This completely
>     falls
>      > apart with a many to 1 relationship as now a entity waiting for a
>      > timeslice blocks the other entities. Could we work aroudn this,
>     sure but
>      > just another bunch of code we'd have to add in Xe. Being to
>     freely sleep
>      > in backend without affecting other entities is really, really
>     nice IMO
>      > and I bet Xe isn't the only driver that is going to feel this way.
>      >
>      > Last thing I'll say regardless of how anyone feels about Xe using
>     a 1 to
>      > 1 relationship this patch IMO makes sense as I hope we can all
>     agree a
>      > workqueue scales better than kthreads.
> 
>     I don't know for sure what will scale better and for what use case,
>     combination of CPU cores vs number of GPU engines to keep busy vs other
>     system activity. But I wager someone is bound to ask for some
>     numbers to
>     make sure proposal is not negatively affecting any other drivers.
> 
> 
> Then let them ask.  Waving your hands vaguely in the direction of the 
> rest of DRM and saying "Uh, someone (not me) might object" is profoundly 
> unhelpful.  Sure, someone might.  That's why it's on dri-devel.  If you 
> think there's someone in particular who might have a useful opinion on 
> this, throw them in the CC so they don't miss the e-mail thread.
> 
> Or are you asking for numbers?  If so, what numbers are you asking for?

It was a heads up to the Xe team in case people weren't appreciating how 
the proposed change has the potential to influence power and performance 
across the board. And nothing in the follow up discussion made me think 
it was considered, so I don't think it was redundant to raise it.

In my experience it is typical that such core changes come with some 
numbers. Which in the case of the drm scheduler is tricky and probably 
requires explicitly asking everyone to test (rather than counting on 
"don't miss the email thread"). Real products can fail to ship due to ten 
mW here or there. Like suddenly an extra core being prevented from getting 
into deep sleep.

If that was "profoundly unhelpful" so be it.

> Also, If we're talking about a design that might paint us into an 
> Intel-HW-specific hole, that would be one thing.  But we're not.  We're 
> talking about switching which kernel threading/task mechanism to use for 
> what's really a very generic problem.  The core Xe design works without 
> this patch (just with more kthreads).  If we land this patch or 
> something like it and get it wrong and it causes a performance problem 
> for someone down the line, we can revisit it.

For some definition of "it works" - I really wouldn't suggest shipping a 
kthread per user context at any point.

>     In any case that's a low level question caused by the high level design
>     decision. So I'd think first focus on the high level - which is the 1:1
>     mapping of entity to scheduler instance proposal.
> 
>     Fundamentally it will be up to the DRM maintainers and the community to
>     bless your approach. And it is important to stress 1:1 is about
>     userspace contexts, so I believe unlike any other current scheduler
>     user. And also important to stress this effectively does not make Xe
>     _really_ use the scheduler that much.
> 
> 
> I don't think this makes Xe nearly as much of a one-off as you think it 
> does.  I've already told the Asahi team working on Apple M1/2 hardware 
> to do it this way and it seems to be a pretty good mapping for them. I 
> believe this is roughly the plan for nouveau as well.  It's not the way 
> it currently works for anyone because most other groups aren't doing FW 
> scheduling yet.  In the world of FW scheduling and hardware designed to 
> support userspace direct-to-FW submit, I think the design makes perfect 
> sense (see below) and I expect we'll see more drivers move in this 
> direction as those drivers evolve.  (AMD is doing some customish thing 
> for how with gpu_scheduler on the front-end somehow. I've not dug into 
> those details.)
> 
>     I can only offer my opinion, which is that the two options mentioned in
>     this thread (either improve drm scheduler to cope with what is
>     required,
>     or split up the code so you can use just the parts of drm_sched which
>     you want - which is frontend dependency tracking) shouldn't be so
>     readily dismissed, given how I think the idea was for the new driver to
>     work less in a silo and more in the community (not do kludges to
>     workaround stuff because it is thought to be too hard to improve common
>     code), but fundamentally, "goto previous paragraph" for what I am
>     concerned.
> 
> 
> Meta comment:  It appears as if you're falling into the standard i915 
> team trap of having an internal discussion about what the community 
> discussion might look like instead of actually having the community 
> discussion.  If you are seriously concerned about interactions with 
> other drivers or whether or setting common direction, the right way to 
> do that is to break a patch or two out into a separate RFC series and 
> tag a handful of driver maintainers.  Trying to predict the questions 
> other people might ask is pointless. Cc them and asking for their input 
> instead.

I don't follow you here. It's not an internal discussion - I am raising 
my concerns on the design publicly. Am I supposed to write a patch to 
show something, or am I not allowed to comment on an RFC series?

It is "drm/sched: Convert drm scheduler to use a work queue rather than 
kthread" which should have Cc-ed _everyone_ who use drm scheduler.

> 
>     Regards,
> 
>     Tvrtko
> 
>     P.S. And as a related side note, there are more areas where drm_sched
>     could be improved, like for instance priority handling.
>     Take a look at msm_submitqueue_create / msm_gpu_convert_priority /
>     get_sched_entity to see how msm works around the drm_sched hardcoded
>     limit of available priority levels, in order to avoid having to leave a
>     hw capability unused. I suspect msm would be happier if they could have
>     all priority levels equal in terms of whether they apply only at the
>     frontend level or completely throughout the pipeline.
> 
>      > [1]
>     https://patchwork.freedesktop.org/patch/515857/?series=112189&rev=1
>     <https://patchwork.freedesktop.org/patch/515857/?series=112189&rev=1>
>      >
>      >>> What would be interesting to learn is whether the option of
>     refactoring
>      >>> drm_sched to deal with out of order completion was considered
>     and what were
>      >>> the conclusions.
>      >>>
>      >>
>      >> I coded this up a while back when trying to convert the i915 to
>     the DRM
>      >> scheduler it isn't all that hard either. The free flow control
>     on the
>      >> ring (e.g. set job limit == SIZE OF RING / MAX JOB SIZE) is
>     really what
>      >> sold me on this design.
> 
> 
> You're not the only one to suggest supporting out-of-order completion. 
> However, it's tricky and breaks a lot of internal assumptions of the 
> scheduler. It also reduces functionality a bit because it can no longer 
> automatically rate-limit HW/FW queues which are often fixed-size.  (Ok, 
> yes, it probably could but it becomes a substantially harder problem.)
> 
> It also seems like a worse mapping to me.  The goal here is to turn 
> submissions on a userspace-facing engine/queue into submissions to a FW 
> queue, sorting out any dma_fence dependencies.  Matt's 
> description of saying this is a 1:1 mapping between sched/entity doesn't 
> tell the whole story. It's a 1:1:1 mapping between xe_engine, 
> gpu_scheduler, and GuC FW engine.  Why make it a 1:something:1 mapping?  
> Why is that better?

As I have stated before, what I think would fit well for Xe is one 
drm_scheduler per engine class. In specific terms on our current 
hardware, one drm scheduler instance for render, compute, blitter, video 
and video enhance. Userspace contexts remain scheduler entities.

That way you avoid the whole kthread/kworker story and you have it 
actually use the entity picking code in the scheduler, which may be 
useful when the backend is congested.
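
In sketch form that would be something like the below - the xe names 
are illustrative, but drm_sched_entity_init() is the existing API:

/* Sketch of the per-engine-class alternative: one scheduler instance
 * per hardware engine class, with userspace contexts as entities
 * sharing it. XE_ENGINE_CLASS_MAX and xe_context are made-up names,
 * not actual Xe code.
 */
#include <drm/gpu_scheduler.h>

static struct drm_gpu_scheduler class_sched[XE_ENGINE_CLASS_MAX];

static int xe_context_entity_init(struct xe_context *ctx,
				  enum xe_engine_class class)
{
	struct drm_gpu_scheduler *sched = &class_sched[class];

	/* Many userspace contexts feed one scheduler per class. */
	return drm_sched_entity_init(&ctx->entity,
				     DRM_SCHED_PRIORITY_NORMAL,
				     &sched, 1, NULL);
}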

Yes you have to solve the out of order problem so in my mind that is 
something to discuss. What the problem actually is (just TDR?), how 
tricky and why etc.

And yes you lose the handy LRCA ring buffer size management so you'd 
have to make those entities not runnable in some other way.

Regarding the argument you raise below - would any of that make the 
frontend / backend separation worse and why? Do you think it is less 
natural? If neither is true then all that remains is that the extra 
work to support out of order completion of entities appears to have 
been discounted in favour of an easy but IMO inelegant option.
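
(For completeness, my understanding of the flow control scheme that 
would be lost is roughly the below - the XE_* constants and the engine 
struct are made-up names, while the drm_sched_init() parameters are the 
existing ones:)

/* Sketch only: flow control "for free" via the scheduler job limit.
 * With hw_submission capped like this the scheduler can never queue
 * more jobs than the ring has room for.
 */
#define XE_RING_SIZE	SZ_16K	/* per-engine ring buffer */
#define XE_MAX_JOB_SIZE	SZ_1K	/* worst case ring bytes per job */

	ret = drm_sched_init(&e->sched, &xe_sched_ops,
			     XE_RING_SIZE / XE_MAX_JOB_SIZE, /* hw_submission */
			     64, 5 * HZ,	/* hang_limit, timeout */
			     NULL, NULL,	/* timeout wq, score */
			     e->name, e->dev);	/* e: hypothetical xe_engine */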

> There are two places where this 1:1:1 mapping is causing problems:
> 
>   1. It creates lots of kthreads. This is what this patch is trying to 
> solve. IDK if it's solving it the best way but that's the goal.
> 
>   2. There are a far more limited number of communication queues between 
> the kernel and GuC for more meta things like pausing and resuming 
> queues, getting events back from GuC, etc. Unless we're in a weird 
> pressure scenario, the amount of traffic on this queue should be low so 
> we can probably just have one per physical device.  The vast majority of 
> kernel -> GuC communication should be on the individual FW queue rings 
> and maybe smashing in-memory doorbells.

I don't follow your terminology here. I suppose you are talking about 
global GuC CT and context ringbuffers. If so then isn't "far more 
limited" actually one?

Regards,

Tvrtko

> Doing out-of-order completion sort-of solves the 1 but does nothing for 
> 2 and actually makes managing FW queues harder because we no longer have 
> built-in rate limiting.  Seems like a net loss to me.
> 
>      >>> Second option perhaps to split out the drm_sched code into
>     parts which would
>      >>> lend themselves more to "pick and choose" of its functionalities.
>      >>> Specifically, Xe wants frontend dependency tracking, but not
>     any scheduling
>      >>> really (neither least busy drm_sched, neither FIFO/RQ entity
>     picking), so
>      >>> even having all these data structures in memory is a waste.
>      >>>
>      >>
>      >> I don't think that we are wasting memory is a very good argument for
>      >> making intrusive changes to the DRM scheduler.
> 
> 
> Worse than that, I think the "we could split it up" kind-of misses the 
> point of the way Xe is using drm/scheduler.  It's not just about 
> re-using a tiny bit of dependency tracking code.  Using the scheduler in 
> this way provides a clean separation between front-end and back-end.  
> The job of the userspace-facing ioctl code is to shove things on the 
> scheduler.  The job of the run_job callback is to encode the job into 
> the FW queue format, stick it in the FW queue ring, and maybe smash a 
> doorbell.  Everything else happens in terms of managing those queues 
> side-band.  The gpu_scheduler code manages the front-end queues and Xe 
> manages the FW queues via the Kernel <-> GuC communication rings.  From 
> a high level, this is a really clean design.  There are potentially some 
> sticky bits around the dual-use of dma_fence for scheduling and memory 
> management but none of those are solved by breaking the DRM scheduler 
> into chunks or getting rid of the 1:1:1 mapping.
> 
> If we split it out, we're basically asking the driver to implement a 
> bunch of kthread or workqueue stuff, all the ring rate-limiting, etc.  
> It may not be all that much code but also, why?  To save a few bytes of 
> memory per engine?  Each engine already has 32K(ish) worth of context 
> state and a similar size ring to communicate with the FW.  No one is 
> going to notice an extra CPU data structure.
> 
> I'm not seeing a solid argument against the 1:1:1 design here other than 
> that it doesn't seem like the way DRM scheduler was intended to be 
> used.  I won't argue that.  It's not.  But it is a fairly natural way to 
> take advantage of the benefits the DRM scheduler does provide while also 
> mapping it to hardware that was designed for userspace direct-to-FW submit.
> 
> --Jason
> 
>      >>> With the first option then the end result could be drm_sched
>     per engine
>      >>> class (hardware view), which I think fits with the GuC model.
>     Give all
>      >>> schedulable contexts (entities) to the GuC and then mostly
>     forget about
>      >>> them. Timeslicing and re-ordering and all happens transparently
>     to the
>      >>> kernel from that point until completion.
>      >>>
>      >>
>      >> Out-of-order problem still exists here.
>      >>
>      >>> Or with the second option you would build on some smaller
>     refactored
>      >>> sub-components of drm_sched, by maybe splitting the dependency
>     tracking from
>      >>> scheduling (RR/FIFO entity picking code).
>      >>>
>      >>> Second option is especially a bit vague and I haven't thought
>     about the
>      >>> required mechanics, but it just appeared too obvious the
>     proposed design has
>      >>> a bit too much impedance mismatch.
>      >>>
>      >>
>      >> IMO ROI on this is low and again lets see what Boris comes up with.
>      >>
>      >> Matt
>      >>
>      >>> Oh and as a side note, when I went into the drm_sched code base
>     to remind
>      >>> myself how things worked, it is quite easy to find some FIXME
>     comments which
>      >>> suggest people working on it are unsure of locking design there
>     and such. So
>      >>> perhaps that all needs cleanup too, I mean would benefit from
>      >>> refactoring/improving work as brainstormed above anyway.
>      >>>
>      >>> Regards,
>      >>>
>      >>> Tvrtko
>
Tvrtko Ursulin Jan. 10, 2023, 12:19 p.m. UTC | #18
On 10/01/2023 11:28, Tvrtko Ursulin wrote:
> 
> 
> On 09/01/2023 17:27, Jason Ekstrand wrote:
> 
> [snip]
> 
>>      >>> AFAICT it proposes to have 1:1 between *userspace* created
>>     contexts (per
>>      >>> context _and_ engine) and drm_sched. I am not sure avoiding
>>     invasive changes
>>      >>> to the shared code is in the spirit of the overall idea and 
>> instead
>>      >>> opportunity should be used to look at way to refactor/improve
>>     drm_sched.
>>
>>
>> Maybe?  I'm not convinced that what Xe is doing is an abuse at all or 
>> really needs to drive a re-factor.  (More on that later.)  There's 
>> only one real issue which is that it fires off potentially a lot of 
>> kthreads. Even that's not that bad given that kthreads are pretty 
>> light and you're not likely to have more kthreads than userspace 
>> threads which are much heavier.  Not ideal, but not the end of the 
>> world either.  Definitely something we can/should optimize but if we 
>> went through with Xe without this patch, it would probably be mostly ok.
>>
>>      >> Yes, it is 1:1 *userspace* engines and drm_sched.
>>      >>
>>      >> I'm not really prepared to make large changes to DRM scheduler
>>     at the
>>      >> moment for Xe as they are not really required nor does Boris
>>     seem they
>>      >> will be required for his work either. I am interested to see
>>     what Boris
>>      >> comes up with.
>>      >>
>>      >>> Even on the low level, the idea to replace drm_sched threads
>>     with workers
>>      >>> has a few problems.
>>      >>>
>>      >>> To start with, the pattern of:
>>      >>>
>>      >>>    while (not_stopped) {
>>      >>>     keep picking jobs
>>      >>>    }
>>      >>>
>>      >>> Feels fundamentally in disagreement with workers (while
>>     obviously fits
>>      >>> perfectly with the current kthread design).
>>      >>
>      >> The while loop breaks and worker exits if no jobs are ready.
>>
>>
>> I'm not very familiar with workqueues. What are you saying would fit 
>> better? One scheduling job per work item rather than one big work item 
>> which handles all available jobs?
> 
> Yes and no, it indeed IMO does not fit to have a work item which is 
> potentially unbound in runtime. But it is a bit moot conceptual mismatch 
> because it is a worst case / theoretical, and I think due more 
> fundamental concerns.
> 
> If we have to go back to the low level side of things, I've picked this 
> random spot to consolidate what I have already mentioned and perhaps 
> expand.
> 
> To start with, let me pull out some thoughts from workqueue.rst:
> 
> """
> Generally, work items are not expected to hog a CPU and consume many 
> cycles. That means maintaining just enough concurrency to prevent work 
> processing from stalling should be optimal.
> """
> 
> For unbound queues:
> """
> The responsibility of regulating concurrency level is on the users.
> """
> 
> Given the unbound queues will be spawned on demand to service all queued 
> work items (more interesting when mixing up with the system_unbound_wq), 
> in the proposed design the number of instantiated worker threads does 
> not correspond to the number of user threads (as you have elsewhere 
> stated), but pessimistically to the number of active user contexts. That 
> is the number which drives the maximum number of not-runnable jobs that 
> can become runnable at once, and hence spawn that many work items, and 
> in turn unbound worker threads.
> 
> Several problems there.
> 
> It is fundamentally pointless to have potentially that many more threads 
> than the number of CPU cores - it simply creates a scheduling storm.

To make matters worse, if I follow the code correctly, all these per 
user context worker threads / work items end up contending on the same 
lock or circular buffer, both of which are one instance per GPU:

guc_engine_run_job
  -> submit_engine
     a) wq_item_append
         -> wq_wait_for_space
           -> msleep
     b) xe_guc_ct_send
         -> guc_ct_send
           -> mutex_lock(&ct->lock);
           -> later a potential msleep in h2g_has_room

Regards,

Tvrtko
Jason Ekstrand Jan. 10, 2023, 2:08 p.m. UTC | #19
On Tue, Jan 10, 2023 at 5:28 AM Tvrtko Ursulin <
tvrtko.ursulin@linux.intel.com> wrote:

>
>
> On 09/01/2023 17:27, Jason Ekstrand wrote:
>
> [snip]
>
> >      >>> AFAICT it proposes to have 1:1 between *userspace* created
> >     contexts (per
> >      >>> context _and_ engine) and drm_sched. I am not sure avoiding
> >     invasive changes
> >      >>> to the shared code is in the spirit of the overall idea and
> instead
> >      >>> opportunity should be used to look at way to refactor/improve
> >     drm_sched.
> >
> >
> > Maybe?  I'm not convinced that what Xe is doing is an abuse at all or
> > really needs to drive a re-factor.  (More on that later.)  There's only
> > one real issue which is that it fires off potentially a lot of kthreads.
> > Even that's not that bad given that kthreads are pretty light and you're
> > not likely to have more kthreads than userspace threads which are much
> > heavier.  Not ideal, but not the end of the world either.  Definitely
> > something we can/should optimize but if we went through with Xe without
> > this patch, it would probably be mostly ok.
> >
> >      >> Yes, it is 1:1 *userspace* engines and drm_sched.
> >      >>
> >      >> I'm not really prepared to make large changes to DRM scheduler
> >     at the
> >      >> moment for Xe as they are not really required nor does Boris
> >     seem they
> >      >> will be required for his work either. I am interested to see
> >     what Boris
> >      >> comes up with.
> >      >>
> >      >>> Even on the low level, the idea to replace drm_sched threads
> >     with workers
> >      >>> has a few problems.
> >      >>>
> >      >>> To start with, the pattern of:
> >      >>>
> >      >>>    while (not_stopped) {
> >      >>>     keep picking jobs
> >      >>>    }
> >      >>>
> >      >>> Feels fundamentally in disagreement with workers (while
> >     obviously fits
> >      >>> perfectly with the current kthread design).
> >      >>
> >      >> The while loop breaks and worker exits if no jobs are ready.
> >
> >
> > I'm not very familiar with workqueues. What are you saying would fit
> > better? One scheduling job per work item rather than one big work item
> > which handles all available jobs?
>
> Yes and no, it indeed IMO does not fit to have a work item which is
> potentially unbound in runtime. But it is a bit moot conceptual mismatch
> because it is a worst case / theoretical, and I think due more
> fundamental concerns.
>
> If we have to go back to the low level side of things, I've picked this
> random spot to consolidate what I have already mentioned and perhaps
> expand.
>
> To start with, let me pull out some thoughts from workqueue.rst:
>
> """
> Generally, work items are not expected to hog a CPU and consume many
> cycles. That means maintaining just enough concurrency to prevent work
> processing from stalling should be optimal.
> """
>
> For unbound queues:
> """
> The responsibility of regulating concurrency level is on the users.
> """
>
> Given the unbound queues will be spawned on demand to service all queued
> work items (more interesting when mixing up with the system_unbound_wq),
> in the proposed design the number of instantiated worker threads does
> not correspond to the number of user threads (as you have elsewhere
> stated), but pessimistically to the number of active user contexts.


Those are pretty much the same in practice.  Rather, the number of user
threads is typically an upper bound on the number of contexts.  Yes, a
single user thread could have a bunch of contexts but basically nothing
does that except IGT.  In real-world usage, it's at most one context per
user thread.


> That
> is the number which drives the maximum number of not-runnable jobs that
> can become runnable at once, and hence spawn that many work items, and
> in turn unbound worker threads.
>
> Several problems there.
>
> It is fundamentally pointless to have potentially that many more threads
> than the number of CPU cores - it simply creates a scheduling storm.
>
> Unbound workers have no CPU / cache locality either and no connection
> with the CPU scheduler to optimize scheduling patterns. This may matter
> either on large systems or on small ones. Whereas the current design
> allows for scheduler to notice userspace CPU thread keeps waking up the
> same drm scheduler kernel thread, and so it can keep them on the same
> CPU, the unbound workers lose that ability and so 2nd CPU might be
> getting woken up from low sleep for every submission.
>
> Hence, apart from being a bit of a impedance mismatch, the proposal has
> the potential to change performance and power patterns and both large
> and small machines.
>

Ok, thanks for explaining the issue you're seeing in more detail.  Yes,
deferred kwork does appear to mismatch somewhat with what the scheduler
needs or at least how it's worked in the past.  How much impact will that
mismatch have?  Unclear.
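
For reference, my reading of the pattern in the patch is roughly the
following (field and helper names approximate, not the literal patch):

/* Approximate shape of the conversion: the kthread main loop becomes
 * a work item which drains ready jobs and then returns. Anything that
 * makes a new job ready (a submission, a job completing) re-queues
 * the work.
 */
static void drm_sched_main(struct work_struct *w)
{
	struct drm_gpu_scheduler *sched =
		container_of(w, struct drm_gpu_scheduler, work_run);
	struct drm_sched_entity *entity;

	while ((entity = drm_sched_select_entity(sched))) {
		/* pop the job, call ops->run_job(), arm the done callback */
	}
	/* no ready jobs: return; the next wakeup queue_work()s us again */
}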


> >      >>> Secondly, it probably demands separate workers (not optional),
> >     otherwise
> >      >>> behaviour of shared workqueues has either the potential to
> >     explode number
> >      >>> kernel threads anyway, or add latency.
> >      >>>
> >      >>
> >      >> Right now the system_unbound_wq is used which does have a limit
> >     on the
> >      >> number of threads, right? I do have a FIXME to allow a worker to
> be
> >      >> passed in similar to TDR.
> >      >>
> >      >> WRT to latency, the 1:1 ratio could actually have lower latency
> >     as 2 GPU
> >      >> schedulers can be pushing jobs into the backend / cleaning up
> >     jobs in
> >      >> parallel.
> >      >>
> >      >
> >      > Thought of one more point here where why in Xe we absolutely want
> >     a 1 to
> >      > 1 ratio between entity and scheduler - the way we implement
> >     timeslicing
> >      > for preempt fences.
> >      >
> >      > Let me try to explain.
> >      >
> >      > Preempt fences are implemented via the generic messaging
> >     interface [1]
> >      > with suspend / resume messages. If a suspend message is received
> too
> >      > soon after calling resume (this is per entity) we simply sleep in
> the
> >      > suspend call thus giving the entity a timeslice. This completely
> >     falls
> >      > apart with a many to 1 relationship as now an entity waiting for a
> >      > timeslice blocks the other entities. Could we work around this,
> >     sure but
> >      > just another bunch of code we'd have to add in Xe. Being able to
> >     freely sleep
> >      > in backend without affecting other entities is really, really
> >     nice IMO
> >      > and I bet Xe isn't the only driver that is going to feel this way.
> >      >
> >      > Last thing I'll say regardless of how anyone feels about Xe using
> >     a 1 to
> >      > 1 relationship this patch IMO makes sense as I hope we can all
> >     agree a
> >      > workqueue scales better than kthreads.
> >
> >     I don't know for sure what will scale better and for what use case,
> >     combination of CPU cores vs number of GPU engines to keep busy vs
> other
> >     system activity. But I wager someone is bound to ask for some
> >     numbers to
> >     make sure proposal is not negatively affecting any other drivers.
> >
> >
> > Then let them ask.  Waving your hands vaguely in the direction of the
> > rest of DRM and saying "Uh, someone (not me) might object" is profoundly
> > unhelpful.  Sure, someone might.  That's why it's on dri-devel.  If you
> > think there's someone in particular who might have a useful opinion on
> > this, throw them in the CC so they don't miss the e-mail thread.
> >
> > Or are you asking for numbers?  If so, what numbers are you asking for?
>
> It was a heads up to the Xe team in case people weren't appreciating how
> the proposed change has the potential to influence power and performance
> across the board. Nothing in the follow-up discussion made me think it
> had been considered, so I don't think it was redundant to raise it.
>
> In my experience it is typical for such core changes to come with some
> numbers. That is tricky in the case of the drm scheduler and probably
> requires explicitly asking everyone to test (rather than counting on
> "don't miss the email thread"). Real products can fail to ship due to
> ten mW here or there - like an extra core suddenly prevented from
> getting into deep sleep.
>
> If that was "profoundly unhelpful" so be it.
>

With your above explanation, it makes more sense what you're asking.  It's
still not something Matt is likely to be able to provide on his own.  We
need to tag some other folks and ask them to test it out.  We could play
around a bit with it on Xe but it's not exactly production grade yet and is
going to hit this differently from most.  Likely candidates are probably
AMD and Freedreno.


> > Also, If we're talking about a design that might paint us into an
> > Intel-HW-specific hole, that would be one thing.  But we're not.  We're
> > talking about switching which kernel threading/task mechanism to use for
> > what's really a very generic problem.  The core Xe design works without
> > this patch (just with more kthreads).  If we land this patch or
> > something like it and get it wrong and it causes a performance problem
> > for someone down the line, we can revisit it.
>
> For some definition of "it works" - I really wouldn't suggest shipping a
> kthread per user context at any point.
>

You have yet to elaborate on why. What resources is it consuming that's
going to be a problem? Are you anticipating CPU affinity problems? Or does
it just seem wasteful?

I think I largely agree that it's probably unnecessary/wasteful but
reducing the number of kthreads seems like a tractable problem to solve
regardless of where we put the gpu_scheduler object.  Is this the right
solution?  Maybe not.  It was also proposed at one point that we could
split the scheduler into two pieces: A scheduler which owns the kthread,
and a back-end which targets some HW ring thing where you can have multiple
back-ends per scheduler.  That's certainly more invasive from a DRM
scheduler internal API PoV but would solve the kthread problem in a way
that's more similar to what we have now.
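
In rough strokes, that split would look something like this (entirely
hypothetical types, just to show the shape):

/* Entirely hypothetical - none of these types exist today. One
 * scheduler front-end owns the submission thread and fans out to N
 * back-ends, each wrapping one fixed-size HW/FW ring with its own
 * credit count for rate limiting.
 */
struct drm_sched_backend {
	const struct drm_sched_backend_ops *ops;
	u32 submission_limit;	/* e.g. ring_size / max_job_size */
	atomic_t in_flight;	/* jobs on this ring */
};

struct drm_sched_frontend {
	struct task_struct *thread;		/* one submission kthread... */
	struct drm_sched_backend **backends;	/* ...many rings */
	unsigned int num_backends;
};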


> >     In any case that's a low level question caused by the high level
> design
> >     decision. So I'd think first focus on the high level - which is the
> 1:1
> >     mapping of entity to scheduler instance proposal.
> >
> >     Fundamentally it will be up to the DRM maintainers and the community
> to
> >     bless your approach. And it is important to stress 1:1 is about
> >     userspace contexts, so I believe unlike any other current scheduler
> >     user. And also important to stress this effectively does not make Xe
> >     _really_ use the scheduler that much.
> >
> >
> > I don't think this makes Xe nearly as much of a one-off as you think it
> > does.  I've already told the Asahi team working on Apple M1/2 hardware
> > to do it this way and it seems to be a pretty good mapping for them. I
> > believe this is roughly the plan for nouveau as well.  It's not the way
> > it currently works for anyone because most other groups aren't doing FW
> > scheduling yet.  In the world of FW scheduling and hardware designed to
> > support userspace direct-to-FW submit, I think the design makes perfect
> > sense (see below) and I expect we'll see more drivers move in this
> > direction as those drivers evolve.  (AMD is doing some customish thing
> > for how with gpu_scheduler on the front-end somehow. I've not dug into
> > those details.)
> >
> >     I can only offer my opinion, which is that the two options mentioned
> in
> >     this thread (either improve drm scheduler to cope with what is
> >     required,
> >     or split up the code so you can use just the parts of drm_sched which
> >     you want - which is frontend dependency tracking) shouldn't be so
> >     readily dismissed, given how I think the idea was for the new driver
> to
> >     work less in a silo and more in the community (not do kludges to
> >     workaround stuff because it is thought to be too hard to improve
> common
> >     code), but fundamentally, "goto previous paragraph" for what I am
> >     concerned.
> >
> >
> > Meta comment:  It appears as if you're falling into the standard i915
> > team trap of having an internal discussion about what the community
> > discussion might look like instead of actually having the community
> > discussion.  If you are seriously concerned about interactions with
> > other drivers or whether or setting common direction, the right way to
> > do that is to break a patch or two out into a separate RFC series and
> > tag a handful of driver maintainers.  Trying to predict the questions
> > other people might ask is pointless. Cc them and asking for their input
> > instead.
>
> I don't follow you here. It's not an internal discussion - I am raising
> my concerns on the design publicly. Am I supposed to write a patch to
> show something, or am I not allowed to comment on an RFC series?
>

I may have misread your tone a bit.  It felt a bit like too many
discussions I've had in the past where people are trying to predict what
others will say instead of just asking them.  Reading it again, I was
probably jumping to conclusions a bit.  Sorry about that.


> It is "drm/sched: Convert drm scheduler to use a work queue rather than
> kthread" which should have Cc-ed _everyone_ who use drm scheduler.
>

Yeah, it probably should have.  I think that's mostly what I've been trying
to say.


> >
> >     Regards,
> >
> >     Tvrtko
> >
> >     P.S. And as a related side note, there are more areas where drm_sched
> >     could be improved, like for instance priority handling.
> >     Take a look at msm_submitqueue_create / msm_gpu_convert_priority /
> >     get_sched_entity to see how msm works around the drm_sched hardcoded
> >     limit of available priority levels, in order to avoid having to
> leave a
> >     hw capability unused. I suspect msm would be happier if they could
> have
> >     all priority levels equal in terms of whether they apply only at the
> >     frontend level or completely throughout the pipeline.
> >
> >      > [1]
> >     https://patchwork.freedesktop.org/patch/515857/?series=112189&rev=1
> >     <https://patchwork.freedesktop.org/patch/515857/?series=112189&rev=1
> >
> >      >
> >      >>> What would be interesting to learn is whether the option of
> >     refactoring
> >      >>> drm_sched to deal with out of order completion was considered
> >     and what were
> >      >>> the conclusions.
> >      >>>
> >      >>
> >      >> I coded this up a while back when trying to convert the i915 to
> >     the DRM
> >      >> scheduler it isn't all that hard either. The free flow control
> >     on the
> >      >> ring (e.g. set job limit == SIZE OF RING / MAX JOB SIZE) is
> >     really what
> >      >> sold me on this design.
> >
> >
> > You're not the only one to suggest supporting out-of-order completion.
> > However, it's tricky and breaks a lot of internal assumptions of the
> > scheduler. It also reduces functionality a bit because it can no longer
> > automatically rate-limit HW/FW queues which are often fixed-size.  (Ok,
> > yes, it probably could but it becomes a substantially harder problem.)
> >
> > It also seems like a worse mapping to me.  The goal here is to turn
> > submissions on a userspace-facing engine/queue into submissions to a FW
> > queue, sorting out any dma_fence dependencies.  Matt's
> > description of saying this is a 1:1 mapping between sched/entity doesn't
> > tell the whole story. It's a 1:1:1 mapping between xe_engine,
> > gpu_scheduler, and GuC FW engine.  Why make it a 1:something:1 mapping?
> > Why is that better?
>
> As I have stated before, what I think would fit well for Xe is one
> drm_scheduler per engine class. In specific terms on our current
> hardware, one drm scheduler instance for render, compute, blitter, video
> and video enhance. Userspace contexts remain scheduler entities.
>

And this is where we fairly strongly disagree.  More in a bit.


> That way you avoid the whole kthread/kworker story and you have it
> actually use the entity picking code in the scheduler, which may be
> useful when the backend is congested.
>

What back-end congestion are you referring to here?  Running out of FW
queue IDs?  Something else?


> Yes you have to solve the out of order problem so in my mind that is
> something to discuss. What the problem actually is (just TDR?), how
> tricky and why etc.
>
> And yes you lose the handy LRCA ring buffer size management so you'd
> have to make those entities not runnable in some other way.
>
> Regarding the argument you raise below - would any of that make the
> frontend / backend separation worse and why? Do you think it is less
> natural? If neither is true then all that remains is that the extra work
> to support out of order completion of entities appears to have been discounted
> in favour of an easy but IMO inelegant option.
>

Broadly speaking, the kernel needs to stop thinking about GPU scheduling in
terms of scheduling jobs and start thinking in terms of scheduling
contexts/engines.  There is still some need for scheduling individual jobs
but that is only for the purpose of delaying them as needed to resolve
dma_fence dependencies.  Once dependencies are resolved, they get shoved
onto the context/engine queue and from there the kernel only really manages
whole contexts/engines.  This is a major architectural shift, entirely
different from the way i915 scheduling works.  It's also different from the
historical usage of DRM scheduler which I think is why this all looks a bit
funny.

To justify this architectural shift, let's look at where we're headed.  In
the glorious future...

 1. Userspace submits directly to firmware queues.  The kernel has no
visibility whatsoever into individual jobs.  At most it can pause/resume FW
contexts as needed to handle eviction and memory management.

 2. Because of 1, apart from handing out the FW queue IDs at the beginning,
the kernel can't really juggle them that much.  Depending on FW design, it
may be able to pause a client, give its IDs to another, and then resume it
later when IDs free up.  What it's not doing is juggling IDs on a
job-by-job basis like i915 currently is.

 3. Long-running compute jobs may not complete for days.  This means that
memory management needs to happen in terms of pause/resume of entire
contexts/engines using the memory rather than based on waiting for
individual jobs to complete or pausing individual jobs until the memory is
available.

 4. Synchronization happens via userspace memory fences (UMF) and the
kernel is mostly unaware of most dependencies and when a context/engine is
or is not runnable.  Instead, it keeps as many of them minimally active
(memory is available, even if it's in system RAM) as possible and lets the
FW sort out dependencies.  (There may need to be some facility for sleeping
a context until a memory change similar to futex() or poll() for userspace
threads.  There are some details TBD.)

Are there potential problems that will need to be solved here?  Yes.  Is it
a good design?  Well, Microsoft has been living in this future for half a
decade or better and it's working quite well for them.  It's also the way
all modern game consoles work.  It really is just Linux that's stuck with
the same old job model we've had since the monumental shift to DRI2.

To that end, one of the core goals of the Xe project was to make the driver
internally behave as close to the above model as possible while keeping the
old-school job model as a very thin layer on top.  As the broader ecosystem
problems (window-system support for UMF, for instance) are solved, that
layer can be peeled back.  The core driver will already be ready for it.

To that end, the point of the DRM scheduler in Xe isn't to schedule jobs.
It's to resolve syncobj and dma-buf implicit sync dependencies and stuff
jobs into their respective context/engine queue once they're ready.  All
the actual scheduling happens in firmware and any scheduling the kernel
does to deal with contention, oversubscriptions, too many contexts, etc. is
between contexts/engines, not individual jobs.  Sure, the individual job
visibility is nice, but if we design around it, we'll never get to the
glorious future.

I really need to turn the above (with a bit more detail) into a blog
post.... Maybe I'll do that this week.

In any case, I hope that provides more insight into why Xe is designed the
way it is and why I'm pushing back so hard on trying to make it more of a
"classic" driver as far as scheduling is concerned.  Are there potential
problems here?  Yes, that's why Xe has been labeled a prototype.  Are such
radical changes necessary to get to said glorious future?  Yes, I think
they are.  Will it be worth it?  I believe so.
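
To make the division of labour concrete: run_job in this model is
conceptually nothing more than the following (illustrative
pseudo-driver, not actual Xe code):

/* Illustrative only - not actual Xe code. Dependencies were already
 * resolved by the DRM scheduler front-end, so run_job() just encodes
 * the job into the FW queue format and notifies the firmware.
 */
static struct dma_fence *fw_engine_run_job(struct drm_sched_job *sched_job)
{
	struct fw_engine *e = to_fw_engine(sched_job->sched); /* hypothetical */

	fw_ring_emit(e, sched_job);	/* write job into the FW queue ring */
	fw_doorbell_ring(e);		/* or smash an in-memory doorbell */

	/* HW fence which the firmware signals on completion. */
	return fw_engine_hw_fence(e, sched_job);
}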

> There are two places where this 1:1:1 mapping is causing problems:
> >
> >   1. It creates lots of kthreads. This is what this patch is trying to
> > solve. IDK if it's solving it the best way but that's the goal.
> >
> >   2. There are a far more limited number of communication queues between
> > the kernel and GuC for more meta things like pausing and resuming
> > queues, getting events back from GuC, etc. Unless we're in a weird
> > pressure scenario, the amount of traffic on this queue should be low so
> > we can probably just have one per physical device.  The vast majority of
> > kernel -> GuC communication should be on the individual FW queue rings
> > and maybe smashing in-memory doorbells.
>
> I don't follow your terminology here. I suppose you are talking about
> global GuC CT and context ringbuffers. If so then isn't "far more
> limited" actually one?
>

I thought there could be more than one but I think in practice it's just
the one.

--Jason



> Regards,
>
> Tvrtko
>
> > Doing out-of-order completion sort-of solves the 1 but does nothing for
> > 2 and actually makes managing FW queues harder because we no longer have
> > built-in rate limiting.  Seems like a net loss to me.
> >
> >      >>> Second option perhaps to split out the drm_sched code into
> >     parts which would
> >      >>> lend themselves more to "pick and choose" of its
> functionalities.
> >      >>> Specifically, Xe wants frontend dependency tracking, but not
> >     any scheduling
> >      >>> really (neither least busy drm_sched, neither FIFO/RQ entity
> >     picking), so
> >      >>> even having all these data structures in memory is a waste.
> >      >>>
> >      >>
> >      >> I don't think that we are wasting memory is a very good argument
> for
> >      >> making intrusive changes to the DRM scheduler.
> >
> >
> > Worse than that, I think the "we could split it up" kind-of misses the
> > point of the way Xe is using drm/scheduler.  It's not just about
> > re-using a tiny bit of dependency tracking code.  Using the scheduler in
> > this way provides a clean separation between front-end and back-end.
> > The job of the userspace-facing ioctl code is to shove things on the
> > scheduler.  The job of the run_job callback is to encode the job into
> > the FW queue format, stick it in the FW queue ring, and maybe smash a
> > doorbell.  Everything else happens in terms of managing those queues
> > side-band.  The gpu_scheduler code manages the front-end queues and Xe
> > manages the FW queues via the Kernel <-> GuC communication rings.  From
> > a high level, this is a really clean design.  There are potentially some
> > sticky bits around the dual-use of dma_fence for scheduling and memory
> > management but none of those are solved by breaking the DRM scheduler
> > into chunks or getting rid of the 1:1:1 mapping.
> >
> > If we split it out, we're basically asking the driver to implement a
> > bunch of kthread or workqueue stuff, all the ring rate-limiting, etc.
> > It may not be all that much code but also, why?  To save a few bytes of
> > memory per engine?  Each engine already has 32K(ish) worth of context
> > state and a similar size ring to communicate with the FW.  No one is
> > going to notice an extra CPU data structure.
> >
> > I'm not seeing a solid argument against the 1:1:1 design here other than
> > that it doesn't seem like the way DRM scheduler was intended to be
> > used.  I won't argue that.  It's not.  But it is a fairly natural way to
> > take advantage of the benefits the DRM scheduler does provide while also
> > mapping it to hardware that was designed for userspace direct-to-FW
> submit.
> >
> > --Jason
> >
> >      >>> With the first option then the end result could be drm_sched
> >     per engine
> >      >>> class (hardware view), which I think fits with the GuC model.
> >     Give all
> >      >>> schedulable contexts (entities) to the GuC and then mostly
> >     forget about
> >      >>> them. Timeslicing and re-ordering and all happens transparently
> >     to the
> >      >>> kernel from that point until completion.
> >      >>>
> >      >>
> >      >> Out-of-order problem still exists here.
> >      >>
> >      >>> Or with the second option you would build on some smaller
> >     refactored
> >      >>> sub-components of drm_sched, by maybe splitting the dependency
> >     tracking from
> >      >>> scheduling (RR/FIFO entity picking code).
> >      >>>
> >      >>> Second option is especially a bit vague and I haven't thought
> >     about the
> >      >>> required mechanics, but it just appeared too obvious the
> >     proposed design has
> >      >>> a bit too much impedance mismatch.
> >      >>>
> >      >>
> >      >> IMO ROI on this is low and again lets see what Boris comes up
> with.
> >      >>
> >      >> Matt
> >      >>
> >      >>> Oh and as a side note, when I went into the drm_sched code base
> >     to remind
> >      >>> myself how things worked, it is quite easy to find some FIXME
> >     comments which
> >      >>> suggest people working on it are unsure of locking design there
> >     and such. So
> >      >>> perhaps that all needs cleanup too, I mean would benefit from
> >      >>> refactoring/improving work as brainstormed above anyway.
> >      >>>
> >      >>> Regards,
> >      >>>
> >      >>> Tvrtko
> >
>
Matthew Brost Jan. 10, 2023, 3:55 p.m. UTC | #20
On Tue, Jan 10, 2023 at 12:19:35PM +0000, Tvrtko Ursulin wrote:
> 
> On 10/01/2023 11:28, Tvrtko Ursulin wrote:
> > 
> > 
> > On 09/01/2023 17:27, Jason Ekstrand wrote:
> > 
> > [snip]
> > 
> > >      >>> AFAICT it proposes to have 1:1 between *userspace* created
> > >     contexts (per
> > >      >>> context _and_ engine) and drm_sched. I am not sure avoiding
> > >     invasive changes
> > >      >>> to the shared code is in the spirit of the overall idea and
> > > instead
> > >      >>> opportunity should be used to look at way to refactor/improve
> > >     drm_sched.
> > > 
> > > 
> > > Maybe?  I'm not convinced that what Xe is doing is an abuse at all
> > > or really needs to drive a re-factor.  (More on that later.) 
> > > There's only one real issue which is that it fires off potentially a
> > > lot of kthreads. Even that's not that bad given that kthreads are
> > > pretty light and you're not likely to have more kthreads than
> > > userspace threads which are much heavier.  Not ideal, but not the
> > > end of the world either.  Definitely something we can/should
> > > optimize but if we went through with Xe without this patch, it would
> > > probably be mostly ok.
> > > 
> > >      >> Yes, it is 1:1 *userspace* engines and drm_sched.
> > >      >>
> > >      >> I'm not really prepared to make large changes to DRM scheduler
> > >     at the
> > >      >> moment for Xe as they are not really required nor does Boris
> > >     seem they
> > >      >> will be required for his work either. I am interested to see
> > >     what Boris
> > >      >> comes up with.
> > >      >>
> > >      >>> Even on the low level, the idea to replace drm_sched threads
> > >     with workers
> > >      >>> has a few problems.
> > >      >>>
> > >      >>> To start with, the pattern of:
> > >      >>>
> > >      >>>    while (not_stopped) {
> > >      >>>     keep picking jobs
> > >      >>>    }
> > >      >>>
> > >      >>> Feels fundamentally in disagreement with workers (while
> > >     obviously fits
> > >      >>> perfectly with the current kthread design).
> > >      >>
> > >      >> The while loop breaks and worker exits if no jobs are ready.
> > > 
> > > 
> > > I'm not very familiar with workqueues. What are you saying would fit
> > > better? One scheduling job per work item rather than one big work
> > > item which handles all available jobs?
> > 
> > Yes and no, it indeed IMO does not fit to have a work item which is
> > potentially unbound in runtime. But it is a bit moot conceptual mismatch
> > because it is a worst case / theoretical, and I think due more
> > fundamental concerns.
> > 
> > If we have to go back to the low level side of things, I've picked this
> > random spot to consolidate what I have already mentioned and perhaps
> > expand.
> > 
> > To start with, let me pull out some thoughts from workqueue.rst:
> > 
> > """
> > Generally, work items are not expected to hog a CPU and consume many
> > cycles. That means maintaining just enough concurrency to prevent work
> > processing from stalling should be optimal.
> > """
> > 
> > For unbound queues:
> > """
> > The responsibility of regulating concurrency level is on the users.
> > """
> > 
> > Given the unbound queues will be spawned on demand to service all queued
> > work items (more interesting when mixing up with the system_unbound_wq),
> > in the proposed design the number of instantiated worker threads does
> > not correspond to the number of user threads (as you have elsewhere
> > stated), but pessimistically to the number of active user contexts. That
> > is the number which drives the maximum number of not-runnable jobs that
> > can become runnable at once, and hence spawn that many work items, and
> > in turn unbound worker threads.
> > 
> > Several problems there.
> > 
> > It is fundamentally pointless to have potentially that many more threads
> > than the number of CPU cores - it simply creates a scheduling storm.
> 
> To make matters worse, if I follow the code correctly, all these per user
> context worker threads / work items end up contending on the same lock or
> circular buffer, both of which are one instance per GPU:
> 
> guc_engine_run_job
>  -> submit_engine
>     a) wq_item_append
>         -> wq_wait_for_space
>           -> msleep

a) is dedicated per xe_engine

Also, you missed the step of programming the ring, which is dedicated per xe_engine

>     b) xe_guc_ct_send
>         -> guc_ct_send
>           -> mutex_lock(&ct->lock);
>           -> later a potential msleep in h2g_has_room

Technically there is 1 instance per GT, not GPU. Yes, this is shared, but
in practice there will always be space in the CT channel so contention
on the lock should be rare.

I haven't read your rather long reply yet, but also FWIW using a
workqueue was suggested by AMD (original authors of the DRM scheduler)
when we ran this design by them.

Matt 

> 
> Regards,
> 
> Tvrtko
Matthew Brost Jan. 10, 2023, 4:39 p.m. UTC | #21
On Tue, Jan 10, 2023 at 11:28:08AM +0000, Tvrtko Ursulin wrote:
> 
> 
> On 09/01/2023 17:27, Jason Ekstrand wrote:
> 
> [snip]
> 
> >      >>> AFAICT it proposes to have 1:1 between *userspace* created
> >     contexts (per
> >      >>> context _and_ engine) and drm_sched. I am not sure avoiding
> >     invasive changes
> >      >>> to the shared code is in the spirit of the overall idea and instead
> >      >>> opportunity should be used to look at way to refactor/improve
> >     drm_sched.
> > 
> > 
> > Maybe?  I'm not convinced that what Xe is doing is an abuse at all or
> > really needs to drive a re-factor.  (More on that later.)  There's only
> > one real issue which is that it fires off potentially a lot of kthreads.
> > Even that's not that bad given that kthreads are pretty light and you're
> > not likely to have more kthreads than userspace threads which are much
> > heavier.  Not ideal, but not the end of the world either.  Definitely
> > something we can/should optimize but if we went through with Xe without
> > this patch, it would probably be mostly ok.
> > 
> >      >> Yes, it is 1:1 *userspace* engines and drm_sched.
> >      >>
> >      >> I'm not really prepared to make large changes to DRM scheduler
> >     at the
> >      >> moment for Xe as they are not really required nor does Boris
> >     seem they
> >      >> will be required for his work either. I am interested to see
> >     what Boris
> >      >> comes up with.
> >      >>
> >      >>> Even on the low level, the idea to replace drm_sched threads
> >     with workers
> >      >>> has a few problems.
> >      >>>
> >      >>> To start with, the pattern of:
> >      >>>
> >      >>>    while (not_stopped) {
> >      >>>     keep picking jobs
> >      >>>    }
> >      >>>
> >      >>> Feels fundamentally in disagreement with workers (while
> >     obviously fits
> >      >>> perfectly with the current kthread design).
> >      >>
> >      >> The while loop breaks and worker exits if no jobs are ready.
> > 
> > 
> > I'm not very familiar with workqueues. What are you saying would fit
> > better? One scheduling job per work item rather than one big work item
> > which handles all available jobs?
> 
> Yes and no, it indeed IMO does not fit to have a work item which is
> potentially unbound in runtime. But it is a bit moot conceptual mismatch
> because it is a worst case / theoretical, and I think due more fundamental
> concerns.
> 
> If we have to go back to the low level side of things, I've picked this
> random spot to consolidate what I have already mentioned and perhaps expand.
> 
> To start with, let me pull out some thoughts from workqueue.rst:
> 
> """
> Generally, work items are not expected to hog a CPU and consume many cycles.
> That means maintaining just enough concurrency to prevent work processing
> from stalling should be optimal.
> """
> 
> For unbound queues:
> """
> The responsibility of regulating concurrency level is on the users.
> """
> 
> Given the unbound queues will be spawned on demand to service all queued
> work items (more interesting when mixing up with the system_unbound_wq), in
> the proposed design the number of instantiated worker threads does not
> correspond to the number of user threads (as you have elsewhere stated), but
> pessimistically to the number of active user contexts. That is the number
> which drives the maximum number of not-runnable jobs that can become
> runnable at once, and hence spawn that many work items, and in turn unbound
> worker threads.
> 
> Several problems there.
> 
> It is fundamentally pointless to have potentially that many more threads
> than the number of CPU cores - it simply creates a scheduling storm.
> 

We can use a different work queue if this is an issue; there is a FIXME
which indicates we should allow the user to pass in the work queue.
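
Something like the below is what I have in mind for that FIXME (a
sketch, not the actual code):

/* Sketch of resolving the FIXME - not the actual patch. The driver
 * allocates its own workqueue and passes it in instead of the
 * scheduler using system_unbound_wq; max_active bounds the number of
 * concurrent workers.
 */
struct workqueue_struct *submit_wq;

submit_wq = alloc_workqueue("xe_submit", WQ_UNBOUND,
			    num_online_cpus()); /* caps concurrency */
if (!submit_wq)
	return -ENOMEM;
/* then: drm_sched_init(..., submit_wq, ...) - hypothetical parameter */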

> Unbound workers have no CPU / cache locality either and no connection with
> the CPU scheduler to optimize scheduling patterns. This may matter either on
> large systems or on small ones. Whereas the current design allows for
> scheduler to notice userspace CPU thread keeps waking up the same drm
> scheduler kernel thread, and so it can keep them on the same CPU, the
> unbound workers lose that ability and so 2nd CPU might be getting woken up
> from low sleep for every submission.
>

I guess I don't understand kthread vs. workqueue scheduling internals.
 
> Hence, apart from being a bit of a impedance mismatch, the proposal has the
> potential to change performance and power patterns and both large and small
> machines.
>

We are going to have to test this out I suppose and play around to see
if this design has any real world impacts. As Jason said, yeah, we will
probably need a bit of help here from others. Will CC relevant parties
on the next rev.
 
> >      >>> Secondly, it probably demands separate workers (not optional),
> >     otherwise
> >      >>> behaviour of shared workqueues has either the potential to
> >     explode number
> >      >>> kernel threads anyway, or add latency.
> >      >>>
> >      >>
> >      >> Right now the system_unbound_wq is used which does have a limit
> >     on the
> >      >> number of threads, right? I do have a FIXME to allow a worker to be
> >      >> passed in similar to TDR.
> >      >>
> >      >> WRT to latency, the 1:1 ratio could actually have lower latency
> >     as 2 GPU
> >      >> schedulers can be pushing jobs into the backend / cleaning up
> >     jobs in
> >      >> parallel.
> >      >>
> >      >
> >      > Thought of one more point here where why in Xe we absolutely want
> >     a 1 to
> >      > 1 ratio between entity and scheduler - the way we implement
> >     timeslicing
> >      > for preempt fences.
> >      >
> >      > Let me try to explain.
> >      >
> >      > Preempt fences are implemented via the generic messaging
> >     interface [1]
>      > with suspend / resume messages. If a suspend message is received too
>      > soon after calling resume (this is per entity) we simply sleep in the
>      > suspend call thus giving the entity a timeslice. This completely
>     falls
>      > apart with a many to 1 relationship as now an entity waiting for a
>      > timeslice blocks the other entities. Could we work around this,
>     sure but
>      > just another bunch of code we'd have to add in Xe. Being able to
> >     freely sleep
> >      > in backend without affecting other entities is really, really
> >     nice IMO
> >      > and I bet Xe isn't the only driver that is going to feel this way.
> >      >
> >      > Last thing I'll say regardless of how anyone feels about Xe using
> >     a 1 to
> >      > 1 relationship this patch IMO makes sense as I hope we can all
> >     agree a
> >      > workqueue scales better than kthreads.
> > 
> >     I don't know for sure what will scale better and for what use case,
> >     combination of CPU cores vs number of GPU engines to keep busy vs other
> >     system activity. But I wager someone is bound to ask for some
> >     numbers to
> >     make sure proposal is not negatively affecting any other drivers.
> > 
> > 
> > Then let them ask.  Waving your hands vaguely in the direction of the
> > rest of DRM and saying "Uh, someone (not me) might object" is profoundly
> > unhelpful.  Sure, someone might.  That's why it's on dri-devel.  If you
> > think there's someone in particular who might have a useful opinion on
> > this, throw them in the CC so they don't miss the e-mail thread.
> > 
> > Or are you asking for numbers?  If so, what numbers are you asking for?
> 
> It was a heads up to the Xe team in case people weren't appreciating how the
> proposed change has the potential to influence power and performance across
> the board. Nothing in the follow-up discussion made me think it had been
> considered, so I don't think it was redundant to raise it.
> 
> In my experience it is typical for such core changes to come with some
> numbers. That is tricky in the case of the drm scheduler and probably
> requires explicitly asking everyone to test (rather than counting on "don't
> miss the email thread"). Real products can fail to ship due to ten mW here
> or there - like an extra core suddenly prevented from getting into deep sleep.
> 
> If that was "profoundly unhelpful" so be it.
> 
> > Also, If we're talking about a design that might paint us into an
> > Intel-HW-specific hole, that would be one thing.  But we're not.  We're
> > talking about switching which kernel threading/task mechanism to use for
> > what's really a very generic problem.  The core Xe design works without
> > this patch (just with more kthreads).  If we land this patch or
> > something like it and get it wrong and it causes a performance problem
> > for someone down the line, we can revisit it.
> 
> For some definition of "it works" - I really wouldn't suggest shipping a
> kthread per user context at any point.
>

Yeah, this is why using a workqueue rather than a kthread was suggested
to me by AMD. I should've put a Suggested-by on the commit message; need
to dig through my emails and figure out who exactly suggested this.
 
> >     In any case that's a low level question caused by the high level design
> >     decision. So I'd think first focus on the high level - which is the 1:1
> >     mapping of entity to scheduler instance proposal.
> > 
> >     Fundamentally it will be up to the DRM maintainers and the community to
> >     bless your approach. And it is important to stress 1:1 is about
> >     userspace contexts, so I believe unlike any other current scheduler
> >     user. And also important to stress this effectively does not make Xe
> >     _really_ use the scheduler that much.
> > 
> > 
> > I don't think this makes Xe nearly as much of a one-off as you think it
> > does.  I've already told the Asahi team working on Apple M1/2 hardware
> > to do it this way and it seems to be a pretty good mapping for them. I
> > believe this is roughly the plan for nouveau as well.  It's not the way
> > it currently works for anyone because most other groups aren't doing FW
> > scheduling yet.  In the world of FW scheduling and hardware designed to
> > support userspace direct-to-FW submit, I think the design makes perfect
> > sense (see below) and I expect we'll see more drivers move in this
> > direction as those drivers evolve.  (AMD is doing some customish thing
> > for how with gpu_scheduler on the front-end somehow. I've not dug into
> > those details.)
> > 
> >     I can only offer my opinion, which is that the two options mentioned in
> >     this thread (either improve drm scheduler to cope with what is
> >     required,
> >     or split up the code so you can use just the parts of drm_sched which
> >     you want - which is frontend dependency tracking) shouldn't be so
> >     readily dismissed, given how I think the idea was for the new driver to
> >     work less in a silo and more in the community (not do kludges to
> >     workaround stuff because it is thought to be too hard to improve common
> >     code), but fundamentally, "goto previous paragraph" for what I am
> >     concerned.
> > 
> > 
> > Meta comment:  It appears as if you're falling into the standard i915
> > team trap of having an internal discussion about what the community
> > discussion might look like instead of actually having the community
> > discussion.  If you are seriously concerned about interactions with
> > other drivers or whether or setting common direction, the right way to
> > do that is to break a patch or two out into a separate RFC series and
> > tag a handful of driver maintainers.  Trying to predict the questions
> > other people might ask is pointless. Cc them and asking for their input
> > instead.
> 
> I don't follow you here. It's not an internal discussion - I am raising my
> concerns on the design publicly. I am supposed to write a patch to show
> something, but am allowed to comment on a RFC series?
> 
> It is "drm/sched: Convert drm scheduler to use a work queue rather than
> kthread" which should have Cc-ed _everyone_ who use drm scheduler.
>

Yea, will do on next rev.
 
> > 
> >     Regards,
> > 
> >     Tvrtko
> > 
> >     P.S. And as a related side note, there are more areas where drm_sched
> >     could be improved, like for instance priority handling.
> >     Take a look at msm_submitqueue_create / msm_gpu_convert_priority /
> >     get_sched_entity to see how msm works around the drm_sched hardcoded
> >     limit of available priority levels, in order to avoid having to leave a
> >     hw capability unused. I suspect msm would be happier if they could have
> >     all priority levels equal in terms of whether they apply only at the
> >     frontend level or completely throughout the pipeline.
> > 
> >      > [1]
> >     https://patchwork.freedesktop.org/patch/515857/?series=112189&rev=1
> >      >
> >      >>> What would be interesting to learn is whether the option of
> >     refactoring
> >      >>> drm_sched to deal with out of order completion was considered
> >     and what were
> >      >>> the conclusions.
> >      >>>
> >      >>
> >      >> I coded this up a while back when trying to convert the i915 to
> >     the DRM
> >      >> scheduler it isn't all that hard either. The free flow control
> >     on the
> >      >> ring (e.g. set job limit == SIZE OF RING / MAX JOB SIZE) is
> >     really what
>      >> sold me on this design.
> > 
> > 
> > You're not the only one to suggest supporting out-of-order completion.
> > However, it's tricky and breaks a lot of internal assumptions of the
> > scheduler. It also reduces functionality a bit because it can no longer
> > automatically rate-limit HW/FW queues which are often fixed-size.  (Ok,
> > yes, it probably could but it becomes a substantially harder problem.)
> > 
> > It also seems like a worse mapping to me.  The goal here is to turn
> > submissions on a userspace-facing engine/queue into submissions to a FW
> > queue submissions, sorting out any dma_fence dependencies.  Matt's
> > description of saying this is a 1:1 mapping between sched/entity doesn't
> > tell the whole story. It's a 1:1:1 mapping between xe_engine,
> > gpu_scheduler, and GuC FW engine.  Why make it a 1:something:1 mapping?
> > Why is that better?
> 
> As I have stated before, what I think would fit well for Xe is one
> drm_scheduler per engine class. In specific terms on our current hardware,
> one drm scheduler instance for render, compute, blitter, video and video
> enhance. Userspace contexts remain scheduler entities.
>

I disagree.
 
> That way you avoid the whole kthread/kworker story and you have it actually
> use the entity picking code in the scheduler, which may be useful when the
> backend is congested.
>

In practice the backend shouldn't be congested, but if it is, a mutex
provides fairness, probably better than using a shared scheduler. Also
what you are suggesting doesn't make sense at all as the congestion is
per-GT, so if anything we should use 1 scheduler per GT, not per engine
class.
 
> Yes you have to solve the out of order problem so in my mind that is
> something to discuss. What the problem actually is (just TDR?), how tricky
> and why etc.
>

Cleanup of jobs, TDR, replaying jobs, etc... It has a decent amount of
impact.
 
> And yes you lose the handy LRCA ring buffer size management so you'd have to
> make those entities not runnable in some other way.
>

We also lose our preempt fence implementation. Again, I don't see how
the design you are suggesting is a win.
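
For the record, the timeslicing trick relies on being able to sleep in
the backend. Roughly the following, where the field and constant names
are hypothetical:

  static void guc_engine_suspend(struct xe_engine *e)
  {
  	/* Time since the last resume message for this entity. */
  	s64 since_resume_ms = ktime_ms_delta(ktime_get(), e->resume_time);

  	/* Suspended too soon after resume: sleep here to guarantee the
  	 * entity a minimum timeslice. Only safe because each entity has
  	 * its own scheduler worker to block; with N:1 this sleep would
  	 * stall every other entity on the scheduler. */
  	if (since_resume_ms < MIN_TIMESLICE_MS)
  		msleep(MIN_TIMESLICE_MS - since_resume_ms);

  	/* ... then send the suspend message to the GuC ... */
  }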
 
> Regarding the argument you raise below - would any of that make the frontend
> / backend separation worse and why? Do you think it is less natural? If
> neither is true then all remains is that it appears extra work to support
> out of order completion of entities has been discounted in favour of an easy
> but IMO inelegant option.
> 
> > There are two places where this 1:1:1 mapping is causing problems:
> > 
> >   1. It creates lots of kthreads. This is what this patch is trying to
> > solve. IDK if it's solving it the best way but that's the goal.
> > 
> >   2. There are a far more limited number of communication queues between
> > the kernel and GuC for more meta things like pausing and resuming
> > queues, getting events back from GuC, etc. Unless we're in a weird
> > pressure scenario, the amount of traffic on this queue should be low so
> > we can probably just have one per physical device.  The vast majority of
> > kernel -> GuC communication should be on the individual FW queue rings
> > and maybe smashing in-memory doorbells.
> 
> I don't follow your terminology here. I suppose you are talking about global
> GuC CT and context ringbuffers. If so then isn't "far more limited" actually
> one?
> 

We have 1 GuC CT per GT.

Matt

> Regards,
> 
> Tvrtko
> 
> > Doing out-of-order completion sort-of solves the 1 but does nothing for
> > 2 and actually makes managing FW queues harder because we no longer have
> > built-in rate limiting.  Seems like a net loss to me.
> > 
> >      >>> Second option perhaps to split out the drm_sched code into
> >     parts which would
> >      >>> lend themselves more to "pick and choose" of its functionalities.
> >      >>> Specifically, Xe wants frontend dependency tracking, but not
> >     any scheduling
> >      >>> really (neither least busy drm_sched, neither FIFO/RQ entity
> >     picking), so
> >      >>> even having all these data structures in memory is a waste.
> >      >>>
> >      >>
> >      >> I don't think that we are wasting memory is a very good argument for
> >      >> making intrusive changes to the DRM scheduler.
> > 
> > 
> > Worse than that, I think the "we could split it up" kind-of misses the
> > point of the way Xe is using drm/scheduler.  It's not just about
> > re-using a tiny bit of dependency tracking code.  Using the scheduler in
> > this way provides a clean separation between front-end and back-end.
> > The job of the userspace-facing ioctl code is to shove things on the
> > scheduler.  The job of the run_job callback is to encode the job into
> > the FW queue format, stick it in the FW queue ring, and maybe smash a
> > doorbell.  Everything else happens in terms of managing those queues
> > side-band.  The gpu_scheduler code manages the front-end queues and Xe
> > manages the FW queues via the Kernel <-> GuC communication rings.  From
> > a high level, this is a really clean design.  There are potentially some
> > sticky bits around the dual-use of dma_fence for scheduling and memory
> > management but none of those are solved by breaking the DRM scheduler
> > into chunks or getting rid of the 1:1:1 mapping.
> > 
> > If we split it out, we're basically asking the driver to implement a
> > bunch of kthread or workqueue stuff, all the ring rate-limiting, etc.
> > It may not be all that much code but also, why?  To save a few bytes of
> > memory per engine?  Each engine already has 32K(ish) worth of context
> > state and a similar size ring to communicate with the FW.  No one is
> > going to notice an extra CPU data structure.
> > 
> > I'm not seeing a solid argument against the 1:1:1 design here other than
> > that it doesn't seem like the way DRM scheduler was intended to be
> > used.  I won't argue that.  It's not.  But it is a fairly natural way to
> > take advantage of the benefits the DRM scheduler does provide while also
> > mapping it to hardware that was designed for userspace direct-to-FW
> > submit.
> > 
> > --Jason
> > 
> >      >>> With the first option then the end result could be drm_sched
> >     per engine
> >      >>> class (hardware view), which I think fits with the GuC model.
> >     Give all
> >      >>> schedulable contexts (entities) to the GuC and then mostly
> >     forget about
> >      >>> them. Timeslicing and re-ordering and all happens transparently
> >     to the
> >      >>> kernel from that point until completion.
> >      >>>
> >      >>
> >      >> Out-of-order problem still exists here.
> >      >>
> >      >>> Or with the second option you would build on some smaller
> >     refactored
> >      >>> sub-components of drm_sched, by maybe splitting the dependency
> >     tracking from
> >      >>> scheduling (RR/FIFO entity picking code).
> >      >>>
> >      >>> Second option is especially a bit vague and I haven't thought
> >     about the
> >      >>> required mechanics, but it just appeared too obvious the
> >     proposed design has
> >      >>> a bit too much impedance mismatch.
> >      >>>
> >      >>
> >      >> IMO ROI on this is low and again lets see what Boris comes up with.
> >      >>
> >      >> Matt
> >      >>
> >      >>> Oh and as a side note, when I went into the drm_sched code base
> >     to remind
> >      >>> myself how things worked, it is quite easy to find some FIXME
> >     comments which
>      >>> suggest people working on it are unsure of locking design there
> >     and such. So
> >      >>> perhaps that all needs cleanup too, I mean would benefit from
> >      >>> refactoring/improving work as brainstormed above anyway.
> >      >>>
> >      >>> Regards,
> >      >>>
> >      >>> Tvrtko
> >
Tvrtko Ursulin Jan. 10, 2023, 4:50 p.m. UTC | #22
On 10/01/2023 15:55, Matthew Brost wrote:
> On Tue, Jan 10, 2023 at 12:19:35PM +0000, Tvrtko Ursulin wrote:
>>
>> On 10/01/2023 11:28, Tvrtko Ursulin wrote:
>>>
>>>
>>> On 09/01/2023 17:27, Jason Ekstrand wrote:
>>>
>>> [snip]
>>>
>>>>       >>> AFAICT it proposes to have 1:1 between *userspace* created
>>>>      contexts (per
>>>>       >>> context _and_ engine) and drm_sched. I am not sure avoiding
>>>>      invasive changes
>>>>       >>> to the shared code is in the spirit of the overall idea and
>>>> instead
>>>>       >>> opportunity should be used to look at way to refactor/improve
>>>>      drm_sched.
>>>>
>>>>
>>>> Maybe?  I'm not convinced that what Xe is doing is an abuse at all
>>>> or really needs to drive a re-factor.  (More on that later.)
>>>> There's only one real issue which is that it fires off potentially a
>>>> lot of kthreads. Even that's not that bad given that kthreads are
>>>> pretty light and you're not likely to have more kthreads than
>>>> userspace threads which are much heavier.  Not ideal, but not the
>>>> end of the world either.  Definitely something we can/should
>>>> optimize but if we went through with Xe without this patch, it would
>>>> probably be mostly ok.
>>>>
>>>>       >> Yes, it is 1:1 *userspace* engines and drm_sched.
>>>>       >>
>>>>       >> I'm not really prepared to make large changes to DRM scheduler
>>>>      at the
>>>>       >> moment for Xe as they are not really required nor does Boris
>>>>      seem they
>>>>       >> will be required for his work either. I am interested to see
>>>>      what Boris
>>>>       >> comes up with.
>>>>       >>
>>>>       >>> Even on the low level, the idea to replace drm_sched threads
>>>>      with workers
>>>>       >>> has a few problems.
>>>>       >>>
>>>>       >>> To start with, the pattern of:
>>>>       >>>
>>>>       >>>    while (not_stopped) {
>>>>       >>>     keep picking jobs
>>>>       >>>    }
>>>>       >>>
>>>>       >>> Feels fundamentally in disagreement with workers (while
>>>>      obviously fits
>>>>       >>> perfectly with the current kthread design).
>>>>       >>
>>>>       >> The while loop breaks and the worker exits if no jobs are ready.
>>>>
>>>>
>>>> I'm not very familiar with workqueues. What are you saying would fit
>>>> better? One scheduling job per work item rather than one big work
>>>> item which handles all available jobs?
>>>
>>> Yes and no, it indeed IMO does not fit to have a work item which is
>>> potentially unbound in runtime. But it is a bit moot conceptual mismatch
>>> because it is a worst case / theoretical, and I think due more
>>> fundamental concerns.
>>>
>>> If we have to go back to the low level side of things, I've picked this
>>> random spot to consolidate what I have already mentioned and perhaps
>>> expand.
>>>
>>> To start with, let me pull out some thoughts from workqueue.rst:
>>>
>>> """
>>> Generally, work items are not expected to hog a CPU and consume many
>>> cycles. That means maintaining just enough concurrency to prevent work
>>> processing from stalling should be optimal.
>>> """
>>>
>>> For unbound queues:
>>> """
>>> The responsibility of regulating concurrency level is on the users.
>>> """
>>>
>>> Given the unbound queues will be spawned on demand to service all queued
>>> work items (more interesting when mixing up with the system_unbound_wq),
>>> in the proposed design the number of instantiated worker threads does
>>> not correspond to the number of user threads (as you have elsewhere
>>> stated), but pessimistically to the number of active user contexts. That
>>> is the number which drives the maximum number of not-runnable jobs that
>>> can become runnable at once, and hence spawn that many work items, and
>>> in turn unbound worker threads.
>>>
>>> Several problems there.
>>>
>>> It is fundamentally pointless to have potentially that many more threads
>>> than the number of CPU cores - it simply creates a scheduling storm.
>>
>> To make matters worse, if I follow the code correctly, all these per user
>> context worker thread / work items end up contending on the same lock or
>> circular buffer, both are one instance per GPU:
>>
>> guc_engine_run_job
>>   -> submit_engine
>>      a) wq_item_append
>>          -> wq_wait_for_space
>>            -> msleep
> 
> a) is dedicated per xe_engine

Hah true, what is it for then? I thought throttling the LRCA ring is done via:

   drm_sched_init(&ge->sched, &drm_sched_ops,
		 e->lrc[0].ring.size / MAX_JOB_SIZE_BYTES,

Is there something more to throttle other than the ring? It is 
throttling something using msleeps..
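
For readers following along, the hw_submission argument quoted above is
what provides the "free" flow control mentioned earlier in the thread -
a sketch, assuming MAX_JOB_SIZE_BYTES bounds the ring space one job can
emit:

  /* Cap in-flight jobs so the worst case always fits in the LRC ring;
   * drm_sched blocks further submissions until older jobs complete. */
  drm_sched_init(&ge->sched, &drm_sched_ops,
                 e->lrc[0].ring.size / MAX_JOB_SIZE_BYTES,
                 ...);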

> Also you missed the step of programming the ring which is dedicated per xe_engine

I was trying to quickly find places which serialize on something in the 
backend, ringbuffer emission did not seem to do that but maybe I missed 
something.

> 
>>      b) xe_guc_ct_send
>>          -> guc_ct_send
>>            -> mutex_lock(&ct->lock);
>>            -> later a potential msleep in h2g_has_room
> 
> Technically there is 1 instance per GT not GPU, yes this is shared but
> in practice there will always be space in the CT channel so contention
> on the lock should be rare.

Yeah I used the term GPU to be more understandable to outside audience.

I am somewhat disappointed that the Xe opportunity hasn't been used to 
improve upon the CT communication bottlenecks. I mean those backoff 
sleeps and lock contention. I wish there would be a single thread in 
charge of the CT channel and internal users (other parts of the driver) 
would be able to send their requests to it in a more efficient manner, 
with less lock contention and centralized backoff.

> I haven't read your rather long reply yet, but also FWIW using a
> workqueue was suggested by AMD (the original authors of the DRM scheduler)
> when we ran this design by them.

Commit message says nothing about that. ;)

Regards,

Tvrtko
Matthew Brost Jan. 10, 2023, 7:01 p.m. UTC | #23
On Tue, Jan 10, 2023 at 04:50:55PM +0000, Tvrtko Ursulin wrote:
> 
> On 10/01/2023 15:55, Matthew Brost wrote:
> > On Tue, Jan 10, 2023 at 12:19:35PM +0000, Tvrtko Ursulin wrote:
> > > 
> > > On 10/01/2023 11:28, Tvrtko Ursulin wrote:
> > > > 
> > > > 
> > > > On 09/01/2023 17:27, Jason Ekstrand wrote:
> > > > 
> > > > [snip]
> > > > 
> > > > >       >>> AFAICT it proposes to have 1:1 between *userspace* created
> > > > >      contexts (per
> > > > >       >>> context _and_ engine) and drm_sched. I am not sure avoiding
> > > > >      invasive changes
> > > > >       >>> to the shared code is in the spirit of the overall idea and
> > > > > instead
> > > > >       >>> opportunity should be used to look at way to refactor/improve
> > > > >      drm_sched.
> > > > > 
> > > > > 
> > > > > Maybe?  I'm not convinced that what Xe is doing is an abuse at all
> > > > > or really needs to drive a re-factor.  (More on that later.)
> > > > > There's only one real issue which is that it fires off potentially a
> > > > > lot of kthreads. Even that's not that bad given that kthreads are
> > > > > pretty light and you're not likely to have more kthreads than
> > > > > userspace threads which are much heavier.  Not ideal, but not the
> > > > > end of the world either.  Definitely something we can/should
> > > > > optimize but if we went through with Xe without this patch, it would
> > > > > probably be mostly ok.
> > > > > 
> > > > >       >> Yes, it is 1:1 *userspace* engines and drm_sched.
> > > > >       >>
> > > > >       >> I'm not really prepared to make large changes to DRM scheduler
> > > > >      at the
> > > > >       >> moment for Xe as they are not really required nor does Boris
> > > > >      seem they
> > > > >       >> will be required for his work either. I am interested to see
> > > > >      what Boris
> > > > >       >> comes up with.
> > > > >       >>
> > > > >       >>> Even on the low level, the idea to replace drm_sched threads
> > > > >      with workers
> > > > >       >>> has a few problems.
> > > > >       >>>
> > > > >       >>> To start with, the pattern of:
> > > > >       >>>
> > > > >       >>>    while (not_stopped) {
> > > > >       >>>     keep picking jobs
> > > > >       >>>    }
> > > > >       >>>
> > > > >       >>> Feels fundamentally in disagreement with workers (while
> > > > >      obviously fits
> > > > >       >>> perfectly with the current kthread design).
> > > > >       >>
> > > > >       >> The while loop breaks and the worker exits if no jobs are ready.
> > > > > 
> > > > > 
> > > > > I'm not very familiar with workqueues. What are you saying would fit
> > > > > better? One scheduling job per work item rather than one big work
> > > > > item which handles all available jobs?
> > > > 
> > > > Yes and no, it indeed IMO does not fit to have a work item which is
> > > > potentially unbound in runtime. But it is a bit moot conceptual mismatch
> > > > because it is a worst case / theoretical, and I think due more
> > > > fundamental concerns.
> > > > 
> > > > If we have to go back to the low level side of things, I've picked this
> > > > random spot to consolidate what I have already mentioned and perhaps
> > > > expand.
> > > > 
> > > > To start with, let me pull out some thoughts from workqueue.rst:
> > > > 
> > > > """
> > > > Generally, work items are not expected to hog a CPU and consume many
> > > > cycles. That means maintaining just enough concurrency to prevent work
> > > > processing from stalling should be optimal.
> > > > """
> > > > 
> > > > For unbound queues:
> > > > """
> > > > The responsibility of regulating concurrency level is on the users.
> > > > """
> > > > 
> > > > Given the unbound queues will be spawned on demand to service all queued
> > > > work items (more interesting when mixing up with the system_unbound_wq),
> > > > in the proposed design the number of instantiated worker threads does
> > > > not correspond to the number of user threads (as you have elsewhere
> > > > stated), but pessimistically to the number of active user contexts. That
> > > > is the number which drives the maximum number of not-runnable jobs that
> > > > can become runnable at once, and hence spawn that many work items, and
> > > > in turn unbound worker threads.
> > > > 
> > > > Several problems there.
> > > > 
> > > > It is fundamentally pointless to have potentially that many more threads
> > > > than the number of CPU cores - it simply creates a scheduling storm.
> > > 
> > > To make matters worse, if I follow the code correctly, all these per user
> > > context worker thread / work items end up contending on the same lock or
> > > circular buffer, both are one instance per GPU:
> > > 
> > > guc_engine_run_job
> > >   -> submit_engine
> > >      a) wq_item_append
> > >          -> wq_wait_for_space
> > >            -> msleep
> > 
> > a) is dedicated per xe_engine
> 
> Hah true, what is it for then? I thought throttling the LRCA ring is done via:
> 

This is a per guc_id 'work queue' which is used for parallel submission
(e.g. multiple LRC tail values need to be written atomically by the GuC).
Again in practice there should always be space.
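
A sketch of what that append path looks like, with illustrative names;
rd_offset is advanced by the GuC as it consumes work queue items:

  static void wq_wait_for_space(struct xe_engine *e, u32 wqi_size)
  {
  	struct guc_wq *wq = &e->guc.wq;	/* hypothetical layout */

  	/* CIRC_SPACE() is from <linux/circ_buf.h>. Since the GuC keeps
  	 * consuming items, in practice this loop should never spin. */
  	while (CIRC_SPACE(wq->wr_offset, READ_ONCE(wq->rd_offset),
  			  wq->size) < wqi_size)
  		msleep(1);	/* per-engine, so only this engine waits */
  }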

>   drm_sched_init(&ge->sched, &drm_sched_ops,
> 		 e->lrc[0].ring.size / MAX_JOB_SIZE_BYTES,
> 
> Is there something more to throttle other than the ring? It is throttling
> something using msleeps..
> 
> > Also you missed the step of programming the ring which is dedicated per xe_engine
> 
> I was trying to quickly find places which serialize on something in the
> backend, ringbuffer emission did not seem to do that but maybe I missed
> something.
>

xe_ring_ops vfunc emit_job is called to write the ring.
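
That hook is roughly shaped like this (illustrative; one ops table per
engine class):

  struct xe_ring_ops {
  	/* Writes the job's commands into the xe_engine's own LRC ring;
  	 * no lock shared with other engines is taken on this path. */
  	void (*emit_job)(struct xe_sched_job *job);
  };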
 
> > 
> > >      b) xe_guc_ct_send
> > >          -> guc_ct_send
> > >            -> mutex_lock(&ct->lock);
> > >            -> later a potential msleep in h2g_has_room
> > 
> > Technically there is 1 instance per GT not GPU, yes this is shared but
> > in practice there will always be space in the CT channel so contention
> > on the lock should be rare.
> 
> Yeah I used the term GPU to be more understandable to outside audience.
> 
> I am somewhat disappointed that the Xe opportunity hasn't been used to
> improve upon the CT communication bottlenecks. I mean those backoff sleeps
> and lock contention. I wish there would be a single thread in charge of the
> CT channel and internal users (other parts of the driver) would be able to
> send their requests to it in a more efficient manner, with less lock
> contention and centralized backoff.
>

Well the CT backend was more or less a complete rewrite. Mutexes
actually work rather well to ensure fairness compared to the spin locks
used in the i915. This code was pretty heavily reviewed by Daniel, and
we landed on a single big mutex for all of the CT code compared to the 3
or 4 spin locks used in the i915.
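
Schematically, with illustrative function names, the send path looks
like:

  static int guc_ct_send(struct xe_guc_ct *ct, const u32 *action, u32 len)
  {
  	int ret;

  	mutex_lock(&ct->lock);		/* one lock for the whole channel */

  	/* May back off with msleep() until the GuC has consumed enough
  	 * of the ring; in practice there is almost always room. */
  	ret = h2g_has_room(ct, len);
  	if (!ret)
  		ret = h2g_write(ct, action, len);

  	mutex_unlock(&ct->lock);
  	return ret;
  }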
 
> > I haven't read your rather long reply yet, but also FWIW using a
> > workqueue was suggested by AMD (the original authors of the DRM scheduler)
> > when we ran this design by them.
> 
> Commit message says nothing about that. ;)
>

Yea I missed that, will fix in the next rev. Just dug through my emails:
Christian suggested a work queue, and Andrey also gave some input on the
DRM scheduler design.

Also in the next rev I will likely update the run_wq to be passed in by
the user.
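
Something along these lines, where the parameter name is speculative:

  int drm_sched_init(struct drm_gpu_scheduler *sched,
  		     const struct drm_sched_backend_ops *ops,
  		     struct workqueue_struct *run_wq,
  		     ...);

  /* In drm_sched_init(), fall back to a default queue when the driver
   * does not provide its own: */
  sched->run_wq = run_wq ?: system_wq;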

Matt

> Regards,
> 
> Tvrtko
Matthew Brost Jan. 11, 2023, 1:13 a.m. UTC | #24
On Tue, Jan 10, 2023 at 04:39:00PM +0000, Matthew Brost wrote:
> On Tue, Jan 10, 2023 at 11:28:08AM +0000, Tvrtko Ursulin wrote:
> > 
> > 
> > On 09/01/2023 17:27, Jason Ekstrand wrote:
> > 
> > [snip]
> > 
> > >      >>> AFAICT it proposes to have 1:1 between *userspace* created
> > >     contexts (per
> > >      >>> context _and_ engine) and drm_sched. I am not sure avoiding
> > >     invasive changes
> > >      >>> to the shared code is in the spirit of the overall idea and instead
> > >      >>> opportunity should be used to look at way to refactor/improve
> > >     drm_sched.
> > > 
> > > 
> > > Maybe?  I'm not convinced that what Xe is doing is an abuse at all or
> > > really needs to drive a re-factor.  (More on that later.)  There's only
> > > one real issue which is that it fires off potentially a lot of kthreads.
> > > Even that's not that bad given that kthreads are pretty light and you're
> > > not likely to have more kthreads than userspace threads which are much
> > > heavier.  Not ideal, but not the end of the world either.  Definitely
> > > something we can/should optimize but if we went through with Xe without
> > > this patch, it would probably be mostly ok.
> > > 
> > >      >> Yes, it is 1:1 *userspace* engines and drm_sched.
> > >      >>
> > >      >> I'm not really prepared to make large changes to DRM scheduler
> > >     at the
> > >      >> moment for Xe as they are not really required nor does Boris
> > >     seem they
> > >      >> will be required for his work either. I am interested to see
> > >     what Boris
> > >      >> comes up with.
> > >      >>
> > >      >>> Even on the low level, the idea to replace drm_sched threads
> > >     with workers
> > >      >>> has a few problems.
> > >      >>>
> > >      >>> To start with, the pattern of:
> > >      >>>
> > >      >>>    while (not_stopped) {
> > >      >>>     keep picking jobs
> > >      >>>    }
> > >      >>>
> > >      >>> Feels fundamentally in disagreement with workers (while
> > >     obviously fits
> > >      >>> perfectly with the current kthread design).
> > >      >>
> > >      >> The while loop breaks and the worker exits if no jobs are ready.
> > > 
> > > 
> > > I'm not very familiar with workqueues. What are you saying would fit
> > > better? One scheduling job per work item rather than one big work item
> > > which handles all available jobs?
> > 
> > Yes and no, it indeed IMO does not fit to have a work item which is
> > potentially unbound in runtime. But it is a bit moot conceptual mismatch
> > because it is a worst case / theoretical, and I think due more fundamental
> > concerns.
> > 
> > If we have to go back to the low level side of things, I've picked this
> > random spot to consolidate what I have already mentioned and perhaps expand.
> > 
> > To start with, let me pull out some thoughts from workqueue.rst:
> > 
> > """
> > Generally, work items are not expected to hog a CPU and consume many cycles.
> > That means maintaining just enough concurrency to prevent work processing
> > from stalling should be optimal.
> > """
> > 
> > For unbound queues:
> > """
> > The responsibility of regulating concurrency level is on the users.
> > """
> > 
> > Given the unbound queues will be spawned on demand to service all queued
> > work items (more interesting when mixing up with the system_unbound_wq), in
> > the proposed design the number of instantiated worker threads does not
> > correspond to the number of user threads (as you have elsewhere stated), but
> > pessimistically to the number of active user contexts. That is the number
> > which drives the maximum number of not-runnable jobs that can become
> > runnable at once, and hence spawn that many work items, and in turn unbound
> > worker threads.
> > 
> > Several problems there.
> > 
> > It is fundamentally pointless to have potentially that many more threads
> > than the number of CPU cores - it simply creates a scheduling storm.
> > 
> 
> We can use a different work queue if this is an issue; we have a FIXME
> which indicates we should allow the user to pass in the work queue.
> 
> > Unbound workers have no CPU / cache locality either and no connection with
> > the CPU scheduler to optimize scheduling patterns. This may matter either on
> > large systems or on small ones. Whereas the current design allows for
> > scheduler to notice userspace CPU thread keeps waking up the same drm
> > scheduler kernel thread, and so it can keep them on the same CPU, the
> > unbound workers lose that ability and so 2nd CPU might be getting woken up
> > from low sleep for every submission.
> >
> 
> I guess I don't understand kthread vs. workqueue scheduling internals.
>  

Looked into this and we are not using unbound workers; rather we are
just using the system_wq, which is indeed bound. Again we can change
this so a user can pass in a workqueue too. After doing a bit of
research, bound workers allow the scheduler to use locality to avoid
the exact problem you are raising.
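
To illustrate the difference being argued about (the xe-submit queue
below is hypothetical):

  /* Bound (system_wq, what the series currently does): the work item
   * runs on the CPU that queued it, so a user thread that keeps
   * submitting keeps hitting a cache-warm worker. */
  queue_work(system_wq, &sched->work_run);

  /* Unbound: concurrency is managed by the workqueue core, but CPU
   * locality is lost. */
  struct workqueue_struct *wq = alloc_workqueue("xe-submit", WQ_UNBOUND, 0);
  queue_work(wq, &sched->work_run);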

TL;DR I'm not buying any of these arguments although it is possible I am
missing something.

Matt 

> > Hence, apart from being a bit of a impedance mismatch, the proposal has the
> > potential to change performance and power patterns and both large and small
> > machines.
> >
> 
> We are going to have to test this out I suppose and play around to see
> if this design has any real world impacts. As Jason said, yea probably
> will need a bit of help here from others. Will CC relevant parties on
> next rev. 
>  
> > >      >>> Secondly, it probably demands separate workers (not optional),
> > >     otherwise
> > >      >>> behaviour of shared workqueues has either the potential to
> > >     explode number
> > >      >>> kernel threads anyway, or add latency.
> > >      >>>
> > >      >>
> > >      >> Right now the system_unbound_wq is used which does have a limit
> > >     on the
> > >      >> number of threads, right? I do have a FIXME to allow a worker to be
> > >      >> passed in similar to TDR.
> > >      >>
> > >      >> WRT to latency, the 1:1 ratio could actually have lower latency
> > >     as 2 GPU
> > >      >> schedulers can be pushing jobs into the backend / cleaning up
> > >     jobs in
> > >      >> parallel.
> > >      >>
> > >      >
> > >      > Thought of one more point here where why in Xe we absolutely want
> > >     a 1 to
> > >      > 1 ratio between entity and scheduler - the way we implement
> > >     timeslicing
> > >      > for preempt fences.
> > >      >
> > >      > Let me try to explain.
> > >      >
> > >      > Preempt fences are implemented via the generic messaging
> > >     interface [1]
> > >      > with suspend / resume messages. If a suspend message is received too
> > >      > soon after calling resume (this is per entity) we simply sleep in the
> > >      > suspend call thus giving the entity a timeslice. This completely
> > >     falls
> > >      > apart with a many to 1 relationship as now an entity waiting for a
> > >      > timeslice blocks the other entities. Could we work around this,
> > >     sure but
> > >      > just another bunch of code we'd have to add in Xe. Being able to
> > >     freely sleep
> > >      > in backend without affecting other entities is really, really
> > >     nice IMO
> > >      > and I bet Xe isn't the only driver that is going to feel this way.
> > >      >
> > >      > Last thing I'll say regardless of how anyone feels about Xe using
> > >     a 1 to
> > >      > 1 relationship this patch IMO makes sense as I hope we can all
> > >     agree a
> > >      > workqueue scales better than kthreads.
> > > 
> > >     I don't know for sure what will scale better and for what use case,
> > >     combination of CPU cores vs number of GPU engines to keep busy vs other
> > >     system activity. But I wager someone is bound to ask for some
> > >     numbers to
> > >     make sure proposal is not negatively affecting any other drivers.
> > > 
> > > 
> > > Then let them ask.  Waving your hands vaguely in the direction of the
> > > rest of DRM and saying "Uh, someone (not me) might object" is profoundly
> > > unhelpful.  Sure, someone might.  That's why it's on dri-devel.  If you
> > > think there's someone in particular who might have a useful opinion on
> > > this, throw them in the CC so they don't miss the e-mail thread.
> > > 
> > > Or are you asking for numbers?  If so, what numbers are you asking for?
> > 
> > It was a heads up to the Xe team in case people weren't appreciating how the
> > proposed change has the potential influence power and performance across the
> > board. And nothing in the follow up discussion made me think it was
> > considered so I don't think it was redundant to raise it.
> > 
> > In my experience it is typical that such core changes come with some
> > numbers. Which in the case of the drm scheduler is tricky and probably requires
> > explicitly asking everyone to test (rather than count on "don't miss the
> > email thread"). Real products can fail to ship due to ten mW here or there.
> > Like suddenly an extra core prevented from getting into deep sleep.
> > 
> > If that was "profoundly unhelpful" so be it.
> > 
> > > Also, If we're talking about a design that might paint us into an
> > > Intel-HW-specific hole, that would be one thing.  But we're not.  We're
> > > talking about switching which kernel threading/task mechanism to use for
> > > what's really a very generic problem.  The core Xe design works without
> > > this patch (just with more kthreads).  If we land this patch or
> > > something like it and get it wrong and it causes a performance problem
> > > for someone down the line, we can revisit it.
> > 
> > For some definition of "it works" - I really wouldn't suggest shipping a
> > kthread per user context at any point.
> >
> 
> Yea, this is why using a workqueue rather than a kthread was suggested
> to me by AMD. I should've put a Suggested-by on the commit message; I
> need to dig through my emails and figure out who exactly suggested this.
>  
> > >     In any case that's a low level question caused by the high level design
> > >     decision. So I'd think first focus on the high level - which is the 1:1
> > >     mapping of entity to scheduler instance proposal.
> > > 
> > >     Fundamentally it will be up to the DRM maintainers and the community to
> > >     bless your approach. And it is important to stress 1:1 is about
> > >     userspace contexts, so I believe unlike any other current scheduler
> > >     user. And also important to stress this effectively does not make Xe
> > >     _really_ use the scheduler that much.
> > > 
> > > 
> > > I don't think this makes Xe nearly as much of a one-off as you think it
> > > does.  I've already told the Asahi team working on Apple M1/2 hardware
> > > to do it this way and it seems to be a pretty good mapping for them. I
> > > believe this is roughly the plan for nouveau as well.  It's not the way
> > > it currently works for anyone because most other groups aren't doing FW
> > > scheduling yet.  In the world of FW scheduling and hardware designed to
> > > support userspace direct-to-FW submit, I think the design makes perfect
> > > sense (see below) and I expect we'll see more drivers move in this
> > > direction as those drivers evolve.  (AMD is doing some customish thing
> > > for how with gpu_scheduler on the front-end somehow. I've not dug into
> > > those details.)
> > > 
> > >     I can only offer my opinion, which is that the two options mentioned in
> > >     this thread (either improve drm scheduler to cope with what is
> > >     required,
> > >     or split up the code so you can use just the parts of drm_sched which
> > >     you want - which is frontend dependency tracking) shouldn't be so
> > >     readily dismissed, given how I think the idea was for the new driver to
> > >     work less in a silo and more in the community (not do kludges to
> > >     workaround stuff because it is thought to be too hard to improve common
> > >     code), but fundamentally, "goto previous paragraph" for what I am
> > >     concerned.
> > > 
> > > 
> > > Meta comment:  It appears as if you're falling into the standard i915
> > > team trap of having an internal discussion about what the community
> > > discussion might look like instead of actually having the community
> > > discussion.  If you are seriously concerned about interactions with
> > > other drivers or whether or setting common direction, the right way to
> > > do that is to break a patch or two out into a separate RFC series and
> > > tag a handful of driver maintainers.  Trying to predict the questions
> > > other people might ask is pointless. Cc them and asking for their input
> > > instead.
> > 
> > I don't follow you here. It's not an internal discussion - I am raising my
> > concerns on the design publicly. I am supposed to write a patch to show
> > something, but am allowed to comment on a RFC series?
> > 
> > It is "drm/sched: Convert drm scheduler to use a work queue rather than
> > kthread" which should have Cc-ed _everyone_ who use drm scheduler.
> >
> 
> Yea, will do on next rev.
>  
> > > 
> > >     Regards,
> > > 
> > >     Tvrtko
> > > 
> > >     P.S. And as a related side note, there are more areas where drm_sched
> > >     could be improved, like for instance priority handling.
> > >     Take a look at msm_submitqueue_create / msm_gpu_convert_priority /
> > >     get_sched_entity to see how msm works around the drm_sched hardcoded
> > >     limit of available priority levels, in order to avoid having to leave a
> > >     hw capability unused. I suspect msm would be happier if they could have
> > >     all priority levels equal in terms of whether they apply only at the
> > >     frontend level or completely throughout the pipeline.
> > > 
> > >      > [1]
> > >     https://patchwork.freedesktop.org/patch/515857/?series=112189&rev=1
> > >      >
> > >      >>> What would be interesting to learn is whether the option of
> > >     refactoring
> > >      >>> drm_sched to deal with out of order completion was considered
> > >     and what were
> > >      >>> the conclusions.
> > >      >>>
> > >      >>
> > >      >> I coded this up a while back when trying to convert the i915 to
> > >     the DRM
> > >      >> scheduler it isn't all that hard either. The free flow control
> > >     on the
> > >      >> ring (e.g. set job limit == SIZE OF RING / MAX JOB SIZE) is
> > >     really what
> > >      >> sold me on this design.
> > > 
> > > 
> > > You're not the only one to suggest supporting out-of-order completion.
> > > However, it's tricky and breaks a lot of internal assumptions of the
> > > scheduler. It also reduces functionality a bit because it can no longer
> > > automatically rate-limit HW/FW queues which are often fixed-size.  (Ok,
> > > yes, it probably could but it becomes a substantially harder problem.)
> > > 
> > > It also seems like a worse mapping to me.  The goal here is to turn
> > > submissions on a userspace-facing engine/queue into submissions to a FW
> > > queue submissions, sorting out any dma_fence dependencies.  Matt's
> > > description of saying this is a 1:1 mapping between sched/entity doesn't
> > > tell the whole story. It's a 1:1:1 mapping between xe_engine,
> > > gpu_scheduler, and GuC FW engine.  Why make it a 1:something:1 mapping?
> > > Why is that better?
> > 
> > As I have stated before, what I think would fit well for Xe is one
> > drm_scheduler per engine class. In specific terms on our current hardware,
> > one drm scheduler instance for render, compute, blitter, video and video
> > enhance. Userspace contexts remain scheduler entities.
> >
> 
> I disagree.
>  
> > That way you avoid the whole kthread/kworker story and you have it actually
> > use the entity picking code in the scheduler, which may be useful when the
> > backend is congested.
> >
> 
> In practice the backend shouldn't be congested, but if it is, a mutex
> provides fairness, probably better than using a shared scheduler. Also
> what you are suggesting doesn't make sense at all as the congestion is
> per-GT, so if anything we should use 1 scheduler per GT, not per engine
> class.
>  
> > Yes you have to solve the out of order problem so in my mind that is
> > something to discuss. What the problem actually is (just TDR?), how tricky
> > and why etc.
> >
> 
> Cleanup of jobs, TDR, replaying jobs, etc... It has a decent amount of
> impact.
>  
> > And yes you lose the handy LRCA ring buffer size management so you'd have to
> > make those entities not runnable in some other way.
> >
> 
> We also lose our preempt fence implementation. Again, I don't see how
> the design you are suggesting is a win.
>  
> > Regarding the argument you raise below - would any of that make the frontend
> > / backend separation worse and why? Do you think it is less natural? If
> > neither is true then all remains is that it appears extra work to support
> > out of order completion of entities has been discounted in favour of an easy
> > but IMO inelegant option.
> > 
> > > There are two places where this 1:1:1 mapping is causing problems:
> > > 
> > >   1. It creates lots of kthreads. This is what this patch is trying to
> > > solve. IDK if it's solving it the best way but that's the goal.
> > > 
> > >   2. There are a far more limited number of communication queues between
> > > the kernel and GuC for more meta things like pausing and resuming
> > > queues, getting events back from GuC, etc. Unless we're in a weird
> > > pressure scenario, the amount of traffic on this queue should be low so
> > > we can probably just have one per physical device.  The vast majority of
> > > kernel -> GuC communication should be on the individual FW queue rings
> > > and maybe smashing in-memory doorbells.
> > 
> > I don't follow your terminology here. I suppose you are talking about global
> > GuC CT and context ringbuffers. If so then isn't "far more limited" actually
> > one?
> > 
> 
> We have 1 GuC CT per GT.
> 
> Matt
> 
> > Regards,
> > 
> > Tvrtko
> > 
> > > Doing out-of-order completion sort-of solves the 1 but does nothing for
> > > 2 and actually makes managing FW queues harder because we no longer have
> > > built-in rate limiting.  Seems like a net loss to me.
> > > 
> > >      >>> Second option perhaps to split out the drm_sched code into
> > >     parts which would
> > >      >>> lend themselves more to "pick and choose" of its functionalities.
> > >      >>> Specifically, Xe wants frontend dependency tracking, but not
> > >     any scheduling
> > >      >>> really (neither least busy drm_sched, neither FIFO/RQ entity
> > >     picking), so
> > >      >>> even having all these data structures in memory is a waste.
> > >      >>>
> > >      >>
> > >      >> I don't think that we are wasting memory is a very good argument for
> > >      >> making intrusive changes to the DRM scheduler.
> > > 
> > > 
> > > Worse than that, I think the "we could split it up" kind-of misses the
> > > point of the way Xe is using drm/scheduler.  It's not just about
> > > re-using a tiny bit of dependency tracking code.  Using the scheduler in
> > > this way provides a clean separation between front-end and back-end.
> > > The job of the userspace-facing ioctl code is to shove things on the
> > > scheduler.  The job of the run_job callback is to encode the job into
> > > the FW queue format, stick it in the FW queue ring, and maybe smash a
> > > doorbell.  Everything else happens in terms of managing those queues
> > > side-band.  The gpu_scheduler code manages the front-end queues and Xe
> > > manages the FW queues via the Kernel <-> GuC communication rings.  From
> > > a high level, this is a really clean design.  There are potentially some
> > > sticky bits around the dual-use of dma_fence for scheduling and memory
> > > management but none of those are solved by breaking the DRM scheduler
> > > into chunks or getting rid of the 1:1:1 mapping.
> > > 
> > > If we split it out, we're basically asking the driver to implement a
> > > bunch of kthread or workqueue stuff, all the ring rate-limiting, etc.
> > > It may not be all that much code but also, why?  To save a few bytes of
> > > memory per engine?  Each engine already has 32K(ish) worth of context
> > > state and a similar size ring to communicate with the FW.  No one is
> > > going to notice an extra CPU data structure.
> > > 
> > > I'm not seeing a solid argument against the 1:1:1 design here other than
> > > that it doesn't seem like the way DRM scheduler was intended to be
> > > used.  I won't argue that.  It's not.  But it is a fairly natural way to
> > > take advantage of the benefits the DRM scheduler does provide while also
> > > mapping it to hardware that was designed for userspace direct-to-FW
> > > submit.
> > > 
> > > --Jason
> > > 
> > >      >>> With the first option then the end result could be drm_sched
> > >     per engine
> > >      >>> class (hardware view), which I think fits with the GuC model.
> > >     Give all
> > >      >>> schedulable contexts (entities) to the GuC and then mostly
> > >     forget about
> > >      >>> them. Timeslicing and re-ordering and all happens transparently
> > >     to the
> > >      >>> kernel from that point until completion.
> > >      >>>
> > >      >>
> > >      >> Out-of-order problem still exists here.
> > >      >>
> > >      >>> Or with the second option you would build on some smaller
> > >     refactored
> > >      >>> sub-components of drm_sched, by maybe splitting the dependency
> > >     tracking from
> > >      >>> scheduling (RR/FIFO entity picking code).
> > >      >>>
> > >      >>> Second option is especially a bit vague and I haven't thought
> > >     about the
> > >      >>> required mechanics, but it just appeared too obvious the
> > >     proposed design has
> > >      >>> a bit too much impedance mismatch.
> > >      >>>
> > >      >>
> > >      >> IMO ROI on this is low and again lets see what Boris comes up with.
> > >      >>
> > >      >> Matt
> > >      >>
> > >      >>> Oh and as a side note, when I went into the drm_sched code base
> > >     to remind
> > >      >>> myself how things worked, it is quite easy to find some FIXME
> > >     comments which
> > >      >>> suggest people working on it are unsure of locking design there
> > >     and such. So
> > >      >>> perhaps that all needs cleanup too, I mean would benefit from
> > >      >>> refactoring/improving work as brainstormed above anyway.
> > >      >>>
> > >      >>> Regards,
> > >      >>>
> > >      >>> Tvrtko
> > >
Tvrtko Ursulin Jan. 11, 2023, 8:50 a.m. UTC | #25
On 10/01/2023 14:08, Jason Ekstrand wrote:
> On Tue, Jan 10, 2023 at 5:28 AM Tvrtko Ursulin 
> <tvrtko.ursulin@linux.intel.com> 
> wrote:
> 
> 
> 
>     On 09/01/2023 17:27, Jason Ekstrand wrote:
> 
>     [snip]
> 
>      >      >>> AFAICT it proposes to have 1:1 between *userspace* created
>      >     contexts (per
>      >      >>> context _and_ engine) and drm_sched. I am not sure avoiding
>      >     invasive changes
>      >      >>> to the shared code is in the spirit of the overall idea
>     and instead
>      >      >>> opportunity should be used to look at way to
>     refactor/improve
>      >     drm_sched.
>      >
>      >
>      > Maybe?  I'm not convinced that what Xe is doing is an abuse at
>     all or
>      > really needs to drive a re-factor.  (More on that later.) 
>     There's only
>      > one real issue which is that it fires off potentially a lot of
>     kthreads.
>      > Even that's not that bad given that kthreads are pretty light and
>     you're
>      > not likely to have more kthreads than userspace threads which are
>     much
>      > heavier.  Not ideal, but not the end of the world either. 
>     Definitely
>      > something we can/should optimize but if we went through with Xe
>     without
>      > this patch, it would probably be mostly ok.
>      >
>      >      >> Yes, it is 1:1 *userspace* engines and drm_sched.
>      >      >>
>      >      >> I'm not really prepared to make large changes to DRM
>     scheduler
>      >     at the
>      >      >> moment for Xe as they are not really required nor does Boris
>      >     seem they
>      >      >> will be required for his work either. I am interested to see
>      >     what Boris
>      >      >> comes up with.
>      >      >>
>      >      >>> Even on the low level, the idea to replace drm_sched threads
>      >     with workers
>      >      >>> has a few problems.
>      >      >>>
>      >      >>> To start with, the pattern of:
>      >      >>>
>      >      >>>    while (not_stopped) {
>      >      >>>     keep picking jobs
>      >      >>>    }
>      >      >>>
>      >      >>> Feels fundamentally in disagreement with workers (while
>      >     obviously fits
>      >      >>> perfectly with the current kthread design).
>      >      >>
>      >      >> The while loop breaks and the worker exits if no jobs are ready.
>      >
>      >
>      > I'm not very familiar with workqueues. What are you saying would fit
>      > better? One scheduling job per work item rather than one big work
>     item
>      > which handles all available jobs?
> 
>     Yes and no, it indeed IMO does not fit to have a work item which is
>     potentially unbound in runtime. But it is a bit moot conceptual
>     mismatch
>     because it is a worst case / theoretical, and I think due more
>     fundamental concerns.
> 
>     If we have to go back to the low level side of things, I've picked this
>     random spot to consolidate what I have already mentioned and perhaps
>     expand.
> 
>     To start with, let me pull out some thoughts from workqueue.rst:
> 
>     """
>     Generally, work items are not expected to hog a CPU and consume many
>     cycles. That means maintaining just enough concurrency to prevent work
>     processing from stalling should be optimal.
>     """
> 
>     For unbound queues:
>     """
>     The responsibility of regulating concurrency level is on the users.
>     """
> 
>     Given the unbound queues will be spawned on demand to service all
>     queued
>     work items (more interesting when mixing up with the
>     system_unbound_wq),
>     in the proposed design the number of instantiated worker threads does
>     not correspond to the number of user threads (as you have elsewhere
>     stated), but pessimistically to the number of active user contexts.
> 
> 
> Those are pretty much the same in practice.  Rather, user threads is 
> typically an upper bound on the number of contexts.  Yes, a single user 
> thread could have a bunch of contexts but basically nothing does that 
> except IGT.  In real-world usage, it's at most one context per user thread.

Typically is the key here. But I am not sure it is good enough. Consider 
this example - Intel Flex 170:

  * Delivers up to 36 streams 1080p60 transcode throughput per card.
  * When scaled to 10 cards in a 4U server configuration, it can support 
up to 360 streams of HEVC/HEVC 1080p60 transcode throughput.

One transcode stream from my experience typically is 3-4 GPU contexts 
(buffer travels from vcs -> rcs -> vcs, maybe vecs) used from a single 
CPU thread. 4 contexts * 36 streams = 144 active contexts. Multiply by 
60fps = 8640 jobs submitted and completed per second.

144 active contexts in the proposed scheme possibly means 144 kernel 
worker threads spawned (driven by 36 transcode CPU threads). (I don't 
think the pools would scale down given all are constantly pinged at 
60fps.)

And then each of the 144 threads goes to grab the single GuC CT mutex. 
First the threads are made schedulable, then put to sleep as mutex 
contention is hit, then woken again as mutexes are released, rinse, 
repeat.

(And yes this backend contention is there regardless of 1:1:1, it would 
require a different re-design to solve that. But it is just a question 
whether there are 144 contending threads, or just 6 with the thread per 
engine class scheme.)

Then multiply all by 10 for a 4U server use case and you get 1440 worker 
kthreads, yes 10 more CT locks, but contending on how many CPU cores? 
Just so they can grab a timeslice and maybe contend on a mutex as the 
next step.

This example is where it would hurt on large systems. Imagine only an 
even wider media transcode card...

Second example is only a single engine class used (3d desktop?) but with 
a bunch of not-runnable jobs queued and waiting on a fence to signal. 
Whether the dependencies are implicit or explicit doesn't matter. Then 
the fence signals and callbacks run. N work items get scheduled, but 
they all submit to the same HW engine. So we end up with:

         /-- wi1 --\
        / ..     .. \
  cb --+---  wi.. ---+-- rq1 -- .. -- rqN
        \ ..    ..  /
         \-- wiN --/


All that we have achieved is waking up N CPUs to contend on the same 
lock and effectively inserting the job into the same single HW queue. I 
don't see any positives there.

This example I think can particularly hurt small / low power devices by 
needlessly waking up many cores for no benefit. Granted, I don't have a 
good feel for how common this pattern is in practice.

> 
>     That
>     is the number which drives the maximum number of not-runnable jobs that
>     can become runnable at once, and hence spawn that many work items, and
>     in turn unbound worker threads.
> 
>     Several problems there.
> 
>     It is fundamentally pointless to have potentially that many more
>     threads
>     than the number of CPU cores - it simply creates a scheduling storm.
> 
>     Unbound workers have no CPU / cache locality either and no connection
>     with the CPU scheduler to optimize scheduling patterns. This may matter
>     either on large systems or on small ones. Whereas the current design
>     allows for scheduler to notice userspace CPU thread keeps waking up the
>     same drm scheduler kernel thread, and so it can keep them on the same
>     CPU, the unbound workers lose that ability and so 2nd CPU might be
>     getting woken up from low sleep for every submission.
> 
>     Hence, apart from being a bit of a impedance mismatch, the proposal has
>     the potential to change performance and power patterns and both large
>     and small machines.
> 
> 
> Ok, thanks for explaining the issue you're seeing in more detail.  Yes, 
> deferred kwork does appear to mismatch somewhat with what the scheduler 
> needs or at least how it's worked in the past.  How much impact will 
> that mismatch have?  Unclear.
> 
>      >      >>> Secondly, it probably demands separate workers (not
>     optional),
>      >     otherwise
>      >      >>> behaviour of shared workqueues has either the potential to
>      >     explode number
>      >      >>> kernel threads anyway, or add latency.
>      >      >>>
>      >      >>
>      >      >> Right now the system_unbound_wq is used which does have a
>     limit
>      >     on the
>      >      >> number of threads, right? I do have a FIXME to allow a
>     worker to be
>      >      >> passed in similar to TDR.
>      >      >>
>      >      >> WRT to latency, the 1:1 ratio could actually have lower
>     latency
>      >     as 2 GPU
>      >      >> schedulers can be pushing jobs into the backend / cleaning up
>      >     jobs in
>      >      >> parallel.
>      >      >>
>      >      >
>      >      > Thought of one more point here where why in Xe we
>     absolutely want
>      >     a 1 to
>      >      > 1 ratio between entity and scheduler - the way we implement
>      >     timeslicing
>      >      > for preempt fences.
>      >      >
>      >      > Let me try to explain.
>      >      >
>      >      > Preempt fences are implemented via the generic messaging
>      >     interface [1]
>      >      > with suspend / resume messages. If a suspend message is
>     received too
>      >      > soon after calling resume (this is per entity) we simply
>     sleep in the
>      >      > suspend call thus giving the entity a timeslice. This
>     completely
>      >     falls
>      >      > apart with a many to 1 relationship as now a entity
>     waiting for a
>      >      > timeslice blocks the other entities. Could we work aroudn
>     this,
>      >     sure but
>      >      > just another bunch of code we'd have to add in Xe. Being able to
>      >     freely sleep
>      >      > in backend without affecting other entities is really, really
>      >     nice IMO
>      >      > and I bet Xe isn't the only driver that is going to feel
>     this way.
>      >      >
>      >      > Last thing I'll say regardless of how anyone feels about
>     Xe using
>      >     a 1 to
>      >      > 1 relationship this patch IMO makes sense as I hope we can all
>      >     agree a
>      >      > workqueue scales better than kthreads.
>      >
>      >     I don't know for sure what will scale better and for what use
>     case,
>      >     combination of CPU cores vs number of GPU engines to keep
>     busy vs other
>      >     system activity. But I wager someone is bound to ask for some
>      >     numbers to
>      >     make sure proposal is not negatively affecting any other drivers.
>      >
>      >
>      > Then let them ask.  Waving your hands vaguely in the direction of
>     the
>      > rest of DRM and saying "Uh, someone (not me) might object" is
>     profoundly
>      > unhelpful.  Sure, someone might.  That's why it's on dri-devel. 
>     If you
>      > think there's someone in particular who might have a useful
>     opinion on
>      > this, throw them in the CC so they don't miss the e-mail thread.
>      >
>      > Or are you asking for numbers?  If so, what numbers are you
>     asking for?
> 
>     It was a heads up to the Xe team in case people weren't appreciating
>     how
>     the proposed change has the potential to influence power and performance
>     across the board. And nothing in the follow up discussion made me think
>     it was considered so I don't think it was redundant to raise it.
> 
>     In my experience it is typical that such core changes come with some
>     numbers. Which is in case of drm scheduler is tricky and probably
>     requires explicitly asking everyone to test (rather than count on
>     "don't
>     miss the email thread"). Real products can fail to ship due to ten mW here
>     or there. Like suddenly an extra core prevented from getting into deep
>     sleep.
> 
>     If that was "profoundly unhelpful" so be it.
> 
> 
> With your above explanation, it makes more sense what you're asking.  
> It's still not something Matt is likely to be able to provide on his 
> own.  We need to tag some other folks and ask them to test it out.  We 
> could play around a bit with it on Xe but it's not exactly production 
> grade yet and is going to hit this differently from most.  Likely 
> candidates are probably AMD and Freedreno.

Whoever is set up to check out power and performance would be good to 
give it a spin, yes.

PS. I don't think I was asking Matt to test with other devices. To start 
with I think Xe is a team effort. I was asking for more background on 
the design decision, since patch 4/20 does not say anything on that 
angle, nor was it IMO sufficiently addressed later in the thread.

>      > Also, If we're talking about a design that might paint us into an
>      > Intel-HW-specific hole, that would be one thing.  But we're not. 
>     We're
>      > talking about switching which kernel threading/task mechanism to
>     use for
>      > what's really a very generic problem.  The core Xe design works
>     without
>      > this patch (just with more kthreads).  If we land this patch or
>      > something like it and get it wrong and it causes a performance
>     problem
>      > for someone down the line, we can revisit it.
> 
>     For some definition of "it works" - I really wouldn't suggest
>     shipping a
>     kthread per user context at any point.
> 
> 
> You have yet to elaborate on why. What resources is it consuming that's 
> going to be a problem? Are you anticipating CPU affinity problems? Or 
> does it just seem wasteful?

Well I don't know, the commit message says the approach does not scale. :)

> I think I largely agree that it's probably unnecessary/wasteful but 
> reducing the number of kthreads seems like a tractable problem to solve 
> regardless of where we put the gpu_scheduler object.  Is this the right 
> solution?  Maybe not.  It was also proposed at one point that we could 
> split the scheduler into two pieces: A scheduler which owns the kthread, 
> and a back-end which targets some HW ring thing where you can have 
> multiple back-ends per scheduler.  That's certainly more invasive from a 
> DRM scheduler internal API PoV but would solve the kthread problem in a 
> way that's more similar to what we have now.
> 
>      >     In any case that's a low level question caused by the high
>     level design
>      >     decision. So I'd think first focus on the high level - which
>     is the 1:1
>      >     mapping of entity to scheduler instance proposal.
>      >
>      >     Fundamentally it will be up to the DRM maintainers and the
>     community to
>      >     bless your approach. And it is important to stress 1:1 is about
>      >     userspace contexts, so I believe unlike any other current
>     scheduler
>      >     user. And also important to stress this effectively does not
>     make Xe
>      >     _really_ use the scheduler that much.
>      >
>      >
>      > I don't think this makes Xe nearly as much of a one-off as you
>     think it
>      > does.  I've already told the Asahi team working on Apple M1/2
>     hardware
>      > to do it this way and it seems to be a pretty good mapping for
>     them. I
>      > believe this is roughly the plan for nouveau as well.  It's not
>     the way
>      > it currently works for anyone because most other groups aren't
>     doing FW
>      > scheduling yet.  In the world of FW scheduling and hardware
>     designed to
>      > support userspace direct-to-FW submit, I think the design makes
>     perfect
>      > sense (see below) and I expect we'll see more drivers move in this
>      > direction as those drivers evolve.  (AMD is doing some customish
>     thing
>      > with gpu_scheduler on the front-end somehow. I've not dug
>     into
>      > those details.)
>      >
>      >     I can only offer my opinion, which is that the two options
>     mentioned in
>      >     this thread (either improve drm scheduler to cope with what is
>      >     required,
>      >     or split up the code so you can use just the parts of
>     drm_sched which
>      >     you want - which is frontend dependency tracking) shouldn't be so
>      >     readily dismissed, given how I think the idea was for the new
>     driver to
>      >     work less in a silo and more in the community (not do kludges to
>      >     workaround stuff because it is thought to be too hard to
>     improve common
>      >     code), but fundamentally, "goto previous paragraph" for what I am
>      >     concerned.
>      >
>      >
>      > Meta comment:  It appears as if you're falling into the standard
>     i915
>      > team trap of having an internal discussion about what the community
>      > discussion might look like instead of actually having the community
>      > discussion.  If you are seriously concerned about interactions with
>      > other drivers or about setting common direction, the right
>     way to
>      > do that is to break a patch or two out into a separate RFC series
>     and
>      > tag a handful of driver maintainers.  Trying to predict the
>     questions
>      > other people might ask is pointless. Cc them and ask for their
>     input
>      > instead.
> 
>     I don't follow you here. It's not an internal discussion - I am raising
>     my concerns on the design publicly. I am supposed to write a patch to
>     show something, but am allowed to comment on a RFC series?
> 
> 
> I may have misread your tone a bit.  It felt a bit like too many 
> discussions I've had in the past where people are trying to predict what 
> others will say instead of just asking them.  Reading it again, I was 
> probably jumping to conclusions a bit.  Sorry about that.

Okay no problem, thanks. In any case we don't have to keep discussing 
it; as I wrote one or two emails ago, it is fundamentally on the 
maintainers and community to ack the approach. I only felt the RFC did 
not explain the potential downsides sufficiently so I wanted to probe 
that area a bit.

>     It is "drm/sched: Convert drm scheduler to use a work queue rather than
>     kthread" which should have Cc-ed _everyone_ who use drm scheduler.
> 
> 
> Yeah, it probably should have.  I think that's mostly what I've been 
> trying to say.
> 
>      >
>      >     Regards,
>      >
>      >     Tvrtko
>      >
>      >     P.S. And as a related side note, there are more areas where
>     drm_sched
>      >     could be improved, like for instance priority handling.
>      >     Take a look at msm_submitqueue_create /
>     msm_gpu_convert_priority /
>      >     get_sched_entity to see how msm works around the drm_sched
>     hardcoded
>      >     limit of available priority levels, in order to avoid having
>     to leave a
>      >     hw capability unused. I suspect msm would be happier if they
>     could have
>      >     all priority levels equal in terms of whether they apply only
>     at the
>      >     frontend level or completely throughout the pipeline.
>      >
>      >      > [1]
>      >
>     https://patchwork.freedesktop.org/patch/515857/?series=112189&rev=1
>     <https://patchwork.freedesktop.org/patch/515857/?series=112189&rev=1>
>      >   
>       <https://patchwork.freedesktop.org/patch/515857/?series=112189&rev=1 <https://patchwork.freedesktop.org/patch/515857/?series=112189&rev=1>>
>      >      >
>      >      >>> What would be interesting to learn is whether the option of
>      >     refactoring
>      >      >>> drm_sched to deal with out of order completion was
>     considered
>      >     and what were
>      >      >>> the conclusions.
>      >      >>>
>      >      >>
>      >      >> I coded this up a while back when trying to convert the
>     i915 to
>      >     the DRM
>      >      >> scheduler; it isn't all that hard either. The free flow
>     control
>      >     on the
>      >      >> ring (e.g. set job limit == SIZE OF RING / MAX JOB SIZE) is
>      >     really what
>      >      >> sold me on this design.
>      >
>      >
>      > You're not the only one to suggest supporting out-of-order
>     completion.
>      > However, it's tricky and breaks a lot of internal assumptions of the
>      > scheduler. It also reduces functionality a bit because it can no
>     longer
>      > automatically rate-limit HW/FW queues which are often
>     fixed-size.  (Ok,
>      > yes, it probably could but it becomes a substantially harder
>     problem.)
>      >
>      > It also seems like a worse mapping to me.  The goal here is to turn
>      > submissions on a userspace-facing engine/queue into submissions
>     to a FW
>      > queue, sorting out any dma_fence dependencies.  Matt's
>      > description of saying this is a 1:1 mapping between sched/entity
>     doesn't
>      > tell the whole story. It's a 1:1:1 mapping between xe_engine,
>      > gpu_scheduler, and GuC FW engine.  Why make it a 1:something:1
>     mapping?
>      > Why is that better?
> 
>     As I have stated before, what I think what would fit well for Xe is one
>     drm_scheduler per engine class. In specific terms on our current
>     hardware, one drm scheduler instance for render, compute, blitter,
>     video
>     and video enhance. Userspace contexts remain scheduler entities.
> 
> 
> And this is where we fairly strongly disagree.  More in a bit.
> 
>     That way you avoid the whole kthread/kworker story and you have it
>     actually use the entity picking code in the scheduler, which may be
>     useful when the backend is congested.
> 
> 
> What back-end congestion are you referring to here?  Running out of FW 
> queue IDs?  Something else?

CT channel, number of context ids.

> 
>     Yes you have to solve the out of order problem so in my mind that is
>     something to discuss. What the problem actually is (just TDR?), how
>     tricky and why etc.
> 
>     And yes you lose the handy LRCA ring buffer size management so you'd
>     have to make those entities not runnable in some other way.
> 
>     Regarding the argument you raise below - would any of that make the
>     frontend / backend separation worse and why? Do you think it is less
>     natural? If neither is true then all that remains is that it appears extra
>     work to support out of order completion of entities has been discounted
>     in favour of an easy but IMO inelegant option.
> 
> 
> Broadly speaking, the kernel needs to stop thinking about GPU scheduling 
> in terms of scheduling jobs and start thinking in terms of scheduling 
> contexts/engines.  There is still some need for scheduling individual 
> jobs but that is only for the purpose of delaying them as needed to 
> resolve dma_fence dependencies.  Once dependencies are resolved, they 
> get shoved onto the context/engine queue and from there the kernel only 
> really manages whole contexts/engines.  This is a major architectural 
> shift, entirely different from the way i915 scheduling works.  It's also 
> different from the historical usage of DRM scheduler which I think is 
> why this all looks a bit funny.
> 
> To justify this architectural shift, let's look at where we're headed.  
> In the glorious future...
> 
>   1. Userspace submits directly to firmware queues.  The kernel has no 
> visibility whatsoever into individual jobs.  At most it can pause/resume 
> FW contexts as needed to handle eviction and memory management.
> 
>   2. Because of 1, apart from handing out the FW queue IDs at the 
> beginning, the kernel can't really juggle them that much.  Depending on 
> FW design, it may be able to pause a client, give its IDs to another, 
> and then resume it later when IDs free up.  What it's not doing is 
> juggling IDs on a job-by-job basis like i915 currently is.
> 
>   3. Long-running compute jobs may not complete for days.  This means 
> that memory management needs to happen in terms of pause/resume of 
> entire contexts/engines using the memory rather than based on waiting 
> for individual jobs to complete or pausing individual jobs until the 
> memory is available.
> 
>   4. Synchronization happens via userspace memory fences (UMF) and the 
> kernel is mostly unaware of most dependencies and when a context/engine 
> is or is not runnable.  Instead, it keeps as many of them minimally 
> active (memory is available, even if it's in system RAM) as possible and 
> lets the FW sort out dependencies.  (There may need to be some facility 
> for sleeping a context until a memory change similar to futex() or 
> poll() for userspace threads.  There are some details TBD.)
> 
> Are there potential problems that will need to be solved here?  Yes.  Is 
> it a good design?  Well, Microsoft has been living in this future for 
> half a decade or better and it's working quite well for them.  It's also 
> the way all modern game consoles work.  It really is just Linux that's 
> stuck with the same old job model we've had since the monumental shift 
> to DRI2.
> 
> To that end, one of the core goals of the Xe project was to make the 
> driver internally behave as close to the above model as possible while 
> keeping the old-school job model as a very thin layer on top.  As the 
> broader ecosystem problems (window-system support for UMF, for instance) 
> are solved, that layer can be peeled back.  The core driver will already 
> be ready for it.
> 
> To that end, the point of the DRM scheduler in Xe isn't to schedule 
> jobs.  It's to resolve syncobj and dma-buf implicit sync dependencies 
> and stuff jobs into their respective context/engine queue once they're 
> ready.  All the actual scheduling happens in firmware and any scheduling 
> the kernel does to deal with contention, oversubscriptions, too many 
> contexts, etc. is between contexts/engines, not individual jobs.  Sure, 
> the individual job visibility is nice, but if we design around it, we'll 
> never get to the glorious future.
> 
> I really need to turn the above (with a bit more detail) into a blog 
> post.... Maybe I'll do that this week.
> 
> In any case, I hope that provides more insight into why Xe is designed 
> the way it is and why I'm pushing back so hard on trying to make it more 
> of a "classic" driver as far as scheduling is concerned.  Are there 
> potential problems here?  Yes, that's why Xe has been labeled a 
> prototype.  Are such radical changes necessary to get to said glorious 
> future?  Yes, I think they are.  Will it be worth it?  I believe so.

Right, that's all solid I think. My takeaway is that frontend priority 
sorting and that stuff isn't needed and that is okay. And that there are 
multiple options to maybe improve the drm scheduler, like the aforementioned 
one of making it deal with out of order completion, or splitting it into 
functional components, or splitting frontend/backend as you suggested. For 
most of them the cost vs benefit is not completely clear, nor is how much 
effort was invested to look into them.

One thing I missed from this explanation is how a drm_scheduler per 
engine class interferes with the high level concepts. And I did not 
manage to pick up on what exactly the TDR problem is in that case. Maybe 
the two are one and the same.

Bottom line is I still have the concern that the conversion to kworkers 
has the potential to regress. Possibly more so for some Xe use cases than 
for other vendors, since they would still be using per physical engine / 
queue scheduler instances.
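
For reference, the structural difference I have in mind is roughly the 
below (simplified sketch only, not the actual patch; work_run and 
drm_sched_pick_job are made up names standing in for the real entity 
selection / job popping logic):

  /* Today: one kthread per drm_gpu_scheduler, picking jobs in a loop. */
  static int drm_sched_main(void *param)
  {
          struct drm_gpu_scheduler *sched = param;
          struct drm_sched_job *job;

          while (!kthread_should_stop()) {
                  wait_event_interruptible(sched->wake_up_worker,
                          (job = drm_sched_pick_job(sched)) ||
                          kthread_should_stop());
                  if (job)
                          sched->ops->run_job(job);
          }
          return 0;
  }

  /* Proposed: the same picking loop in a work item, which returns when
   * no jobs are ready and is re-queued from the wakeup paths.
   */
  static void drm_sched_main_work(struct work_struct *w)
  {
          struct drm_gpu_scheduler *sched =
                  container_of(w, struct drm_gpu_scheduler, work_run);
          struct drm_sched_job *job;

          while ((job = drm_sched_pick_job(sched)))
                  sched->ops->run_job(job);
  }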

And to put my money where my mouth is, I will try to add testing Xe 
inside the full blown ChromeOS environment to my team's plans. It would 
probably also be beneficial if the Xe team could take a look at real 
world behaviour of the extreme transcode use cases too, if the stack is 
ready for that at all. It would be better to know earlier rather than 
later if there is a fundamental issue.

For the patch at hand, and the cover letter, it certainly feels they would 
benefit from recording the past design discussion had with AMD folks, from 
explicitly Cc-ing other drivers, and from recording the theoretical pros and 
cons of threads vs unbound workers as I have tried to highlight them.

Regards,

Tvrtko
Tvrtko Ursulin Jan. 11, 2023, 9:09 a.m. UTC | #26
On 11/01/2023 01:13, Matthew Brost wrote:
> On Tue, Jan 10, 2023 at 04:39:00PM +0000, Matthew Brost wrote:
>> On Tue, Jan 10, 2023 at 11:28:08AM +0000, Tvrtko Ursulin wrote:
>>>
>>>
>>> On 09/01/2023 17:27, Jason Ekstrand wrote:
>>>
>>> [snip]
>>>
>>>>       >>> AFAICT it proposes to have 1:1 between *userspace* created
>>>>      contexts (per
>>>>       >>> context _and_ engine) and drm_sched. I am not sure avoiding
>>>>      invasive changes
>>>>       >>> to the shared code is in the spirit of the overall idea and instead
>>>>       >>> opportunity should be used to look at way to refactor/improve
>>>>      drm_sched.
>>>>
>>>>
>>>> Maybe?  I'm not convinced that what Xe is doing is an abuse at all or
>>>> really needs to drive a re-factor.  (More on that later.)  There's only
>>>> one real issue which is that it fires off potentially a lot of kthreads.
>>>> Even that's not that bad given that kthreads are pretty light and you're
>>>> not likely to have more kthreads than userspace threads which are much
>>>> heavier.  Not ideal, but not the end of the world either.  Definitely
>>>> something we can/should optimize but if we went through with Xe without
>>>> this patch, it would probably be mostly ok.
>>>>
>>>>       >> Yes, it is 1:1 *userspace* engines and drm_sched.
>>>>       >>
>>>>       >> I'm not really prepared to make large changes to DRM scheduler
>>>>      at the
>>>>       >> moment for Xe as they are not really required nor does Boris
>>>>      seem they
>>>>       >> will be required for his work either. I am interested to see
>>>>      what Boris
>>>>       >> comes up with.
>>>>       >>
>>>>       >>> Even on the low level, the idea to replace drm_sched threads
>>>>      with workers
>>>>       >>> has a few problems.
>>>>       >>>
>>>>       >>> To start with, the pattern of:
>>>>       >>>
>>>>       >>>    while (not_stopped) {
>>>>       >>>     keep picking jobs
>>>>       >>>    }
>>>>       >>>
>>>>       >>> Feels fundamentally in disagreement with workers (while
>>>>      obviously fits
>>>>       >>> perfectly with the current kthread design).
>>>>       >>
>>>>       >> The while loop breaks and worker exits if no jobs are ready.
>>>>
>>>>
>>>> I'm not very familiar with workqueues. What are you saying would fit
>>>> better? One scheduling job per work item rather than one big work item
>>>> which handles all available jobs?
>>>
>>> Yes and no, it indeed IMO does not fit to have a work item which is
>>> potentially unbound in runtime. But it is a bit of a moot conceptual mismatch
>>> because it is a worst case / theoretical, and I think due to more fundamental
>>> concerns.
>>>
>>> If we have to go back to the low level side of things, I've picked this
>>> random spot to consolidate what I have already mentioned and perhaps expand.
>>>
>>> To start with, let me pull out some thoughts from workqueue.rst:
>>>
>>> """
>>> Generally, work items are not expected to hog a CPU and consume many cycles.
>>> That means maintaining just enough concurrency to prevent work processing
>>> from stalling should be optimal.
>>> """
>>>
>>> For unbound queues:
>>> """
>>> The responsibility of regulating concurrency level is on the users.
>>> """
>>>
>>> Given the unbound queues will be spawned on demand to service all queued
>>> work items (more interesting when mixing up with the system_unbound_wq), in
>>> the proposed design the number of instantiated worker threads does not
>>> correspond to the number of user threads (as you have elsewhere stated), but
>>> pessimistically to the number of active user contexts. That is the number
>>> which drives the maximum number of not-runnable jobs that can become
>>> runnable at once, and hence spawn that many work items, and in turn unbound
>>> worker threads.
>>>
>>> Several problems there.
>>>
>>> It is fundamentally pointless to have potentially that many more threads
>>> than the number of CPU cores - it simply creates a scheduling storm.
>>>
>>
>> We can use a different work queue if this is an issue, have a FIXME
>> which indicates we should allow the user to pass in the work queue.
>>
>>> Unbound workers have no CPU / cache locality either and no connection with
>>> the CPU scheduler to optimize scheduling patterns. This may matter either on
>>> large systems or on small ones. Whereas the current design allows for
>>> scheduler to notice userspace CPU thread keeps waking up the same drm
>>> scheduler kernel thread, and so it can keep them on the same CPU, the
>>> unbound workers lose that ability and so 2nd CPU might be getting woken up
>>> from low sleep for every submission.
>>>
>>
>> I guess I don't understand kthread vs. workqueue scheduling internals.
>>   
> 
> Looked into this and we are not using unbound workers, rather we are just
> using the system_wq which is indeed bound. Again we can change this so a
> user can just pass in a worker too. After doing a bit of research, bound
> workers allow the scheduler to use locality to avoid that exact
> problem you're raising.
> 
> TL;DR I'm not buying any of these arguments although it is possible I am
> missing something.

Well you told me it's using unbound.. message id 
Y7dEjcuc1arHBTGu@DUT025-TGLU.fm.intel.com:

"""
Right now the system_unbound_wq is used which does have a limit on the
number of threads, right? I do have a FIXME to allow a worker to be
passed in similar to TDR.
"""

With bound workers you will indeed get CPU locality. I am not sure what 
it will do in terms of concurrency. If it will serialize work items to 
fewer spawned workers that will be good for the CT contention issue, but 
may negatively affect latency. And possibly preemption / time slicing 
decisions since the order of submitting to the backend will not be in 
the order of context priority, hence high prio may be submitted right 
after low and immediately trigger preemption.
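
To make it concrete, what I would expect is for the driver to own the 
queue, something along these lines (hypothetical API; submit_wq and 
work_run are made up names, this is not the actual drm_sched interface):

  /* One ordered queue per scheduler: submissions for an engine stay
   * serialized on one worker, instead of fanning out across a shared
   * system workqueue.
   */
  struct workqueue_struct *submit_wq;

  submit_wq = alloc_ordered_workqueue("xe-submit", 0);
  if (!submit_wq)
          return -ENOMEM;

  /* The scheduler would then queue its run-job work here rather than
   * on system_wq.
   */
  queue_work(submit_wq, &ge->sched.work_run);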

Anyway, since you are not buying any arguments on paper perhaps you are 
more open towards testing. If you would adapt gem_wsim for Xe you would 
be able to spawn N simulated transcode sessions on any Gen11+ machine 
and try it out.

For example:

gem_wsim -w benchmarks/wsim/media_load_balance_fhd26u7.wsim -c 36 -r 600

That will run you 36 parallel transcoding streams for 600 
frames each. No client setup needed whatsoever apart from compiling IGT.

In the past that was quite a handy tool to identify scheduling issues, 
or validate changes against. All workloads with the media prefix have 
actually been hand crafted by looking at what real media pipelines do 
with real data. Few years back at least.

It could show you real world behaviour of the kworkers approach and it 
could also enable you to cross reference any power and performance 
changes relative to i915. Background story there is that media servers 
like to fit N streams to a server and if a change comes along which 
suddenly makes only N-1 streams fit before dropping out of realtime, 
that's a big problem.

If you will believe me there is value in that kind of testing I am happy 
to help you add Xe support to the tool, time permitting so possibly 
guidance only at the moment.

Regards,

Tvrtko
Tvrtko Ursulin Jan. 11, 2023, 9:17 a.m. UTC | #27
On 10/01/2023 19:01, Matthew Brost wrote:
> On Tue, Jan 10, 2023 at 04:50:55PM +0000, Tvrtko Ursulin wrote:
>>
>> On 10/01/2023 15:55, Matthew Brost wrote:
>>> On Tue, Jan 10, 2023 at 12:19:35PM +0000, Tvrtko Ursulin wrote:
>>>>
>>>> On 10/01/2023 11:28, Tvrtko Ursulin wrote:
>>>>>
>>>>>
>>>>> On 09/01/2023 17:27, Jason Ekstrand wrote:
>>>>>
>>>>> [snip]
>>>>>
>>>>>>        >>> AFAICT it proposes to have 1:1 between *userspace* created
>>>>>>       contexts (per
>>>>>>        >>> context _and_ engine) and drm_sched. I am not sure avoiding
>>>>>>       invasive changes
>>>>>>        >>> to the shared code is in the spirit of the overall idea and
>>>>>> instead
>>>>>>        >>> opportunity should be used to look at way to refactor/improve
>>>>>>       drm_sched.
>>>>>>
>>>>>>
>>>>>> Maybe?  I'm not convinced that what Xe is doing is an abuse at all
>>>>>> or really needs to drive a re-factor.  (More on that later.)
>>>>>> There's only one real issue which is that it fires off potentially a
>>>>>> lot of kthreads. Even that's not that bad given that kthreads are
>>>>>> pretty light and you're not likely to have more kthreads than
>>>>>> userspace threads which are much heavier.  Not ideal, but not the
>>>>>> end of the world either.  Definitely something we can/should
>>>>>> optimize but if we went through with Xe without this patch, it would
>>>>>> probably be mostly ok.
>>>>>>
>>>>>>        >> Yes, it is 1:1 *userspace* engines and drm_sched.
>>>>>>        >>
>>>>>>        >> I'm not really prepared to make large changes to DRM scheduler
>>>>>>       at the
>>>>>>        >> moment for Xe as they are not really required nor does Boris
>>>>>>       seem they
>>>>>>        >> will be required for his work either. I am interested to see
>>>>>>       what Boris
>>>>>>        >> comes up with.
>>>>>>        >>
>>>>>>        >>> Even on the low level, the idea to replace drm_sched threads
>>>>>>       with workers
>>>>>>        >>> has a few problems.
>>>>>>        >>>
>>>>>>        >>> To start with, the pattern of:
>>>>>>        >>>
>>>>>>        >>>    while (not_stopped) {
>>>>>>        >>>     keep picking jobs
>>>>>>        >>>    }
>>>>>>        >>>
>>>>>>        >>> Feels fundamentally in disagreement with workers (while
>>>>>>       obviously fits
>>>>>>        >>> perfectly with the current kthread design).
>>>>>>        >>
>>>>>>        >> The while loop breaks and worker exits if no jobs are ready.
>>>>>>
>>>>>>
>>>>>> I'm not very familiar with workqueues. What are you saying would fit
>>>>>> better? One scheduling job per work item rather than one big work
>>>>>> item which handles all available jobs?
>>>>>
>>>>> Yes and no, it indeed IMO does not fit to have a work item which is
>>>>> potentially unbound in runtime. But it is a bit of a moot conceptual mismatch
>>>>> because it is a worst case / theoretical, and I think due to more
>>>>> fundamental concerns.
>>>>>
>>>>> If we have to go back to the low level side of things, I've picked this
>>>>> random spot to consolidate what I have already mentioned and perhaps
>>>>> expand.
>>>>>
>>>>> To start with, let me pull out some thoughts from workqueue.rst:
>>>>>
>>>>> """
>>>>> Generally, work items are not expected to hog a CPU and consume many
>>>>> cycles. That means maintaining just enough concurrency to prevent work
>>>>> processing from stalling should be optimal.
>>>>> """
>>>>>
>>>>> For unbound queues:
>>>>> """
>>>>> The responsibility of regulating concurrency level is on the users.
>>>>> """
>>>>>
>>>>> Given the unbound queues will be spawned on demand to service all queued
>>>>> work items (more interesting when mixing up with the system_unbound_wq),
>>>>> in the proposed design the number of instantiated worker threads does
>>>>> not correspond to the number of user threads (as you have elsewhere
>>>>> stated), but pessimistically to the number of active user contexts. That
>>>>> is the number which drives the maximum number of not-runnable jobs that
>>>>> can become runnable at once, and hence spawn that many work items, and
>>>>> in turn unbound worker threads.
>>>>>
>>>>> Several problems there.
>>>>>
>>>>> It is fundamentally pointless to have potentially that many more threads
>>>>> than the number of CPU cores - it simply creates a scheduling storm.
>>>>
>>>> To make matters worse, if I follow the code correctly, all these per user
>>>> context worker thread / work items end up contending on the same lock or
>>>> circular buffer, both are one instance per GPU:
>>>>
>>>> guc_engine_run_job
>>>>    -> submit_engine
>>>>       a) wq_item_append
>>>>           -> wq_wait_for_space
>>>>             -> msleep
>>>
>>> a) is dedicated per xe_engine
>>
>> Hah true, what is it for then? I thought throttling the LRCA ring is done via:
>>
> 
> This is a per guc_id 'work queue' which is used for parallel submission
> (e.g. multiple LRC tail values need to be written atomically by the GuC).
> Again in practice there should always be space.

Speaking of guc id, where does blocking when none are available happen 
in the non-parallel case?

>>    drm_sched_init(&ge->sched, &drm_sched_ops,
>> 		 e->lrc[0].ring.size / MAX_JOB_SIZE_BYTES,
>>
>> Is there something more to throttle other than the ring? It is throttling
>> something using msleeps..
>>
>>> Also you missed the step of programming the ring which is dedicated per xe_engine
>>
>> I was trying to quickly find places which serialize on something in the
>> backend, ringbuffer emission did not seem to do that but maybe I missed
>> something.
>>
> 
> xe_ring_ops vfunc emit_job is called to write the ring.

Right, but does it serialize between different contexts? I didn't spot 
that it does, in which case it wasn't relevant to the sub-story.

>>>
>>>>       b) xe_guc_ct_send
>>>>           -> guc_ct_send
>>>>             -> mutex_lock(&ct->lock);
>>>>             -> later a potential msleep in h2g_has_room
>>>
>>> Techincally there is 1 instance per GT not GPU, yes this is shared but
>>> in practice there will always be space in the CT channel so contention
>>> on the lock should be rare.
>>
>> Yeah I used the term GPU to be more understandable to outside audience.
>>
>> I am somewhat disappointed that the Xe opportunity hasn't been used to
>> improve upon the CT communication bottlenecks. I mean those backoff sleeps
>> and lock contention. I wish there would be a single thread in charge of the
>> CT channel and internal users (other parts of the driver) would be able to
>> send their requests to it in a more efficient manner, with less lock
>> contention and centralized backoff.
>>
> 
> Well the CT backend was more or less a complete rewrite. Mutexes
> actually work rather well to ensure fairness compared to the spin locks
> used in the i915. This code was pretty heavily reviewed by Daniel and
> both of us landed a big mutex for all of the CT code compared to the 3
> or 4 spin locks used in the i915.

Are the "nb" sends gone? But that aside, I wasn't meaning just the 
locking but the high level approach. Never mind.

>>> I haven't read your rather long reply yet, but also FWIW using a
>>> workqueue was suggested by AMD (original authors of the DRM scheduler)
>>> when we ran this design by them.
>>
>> Commit message says nothing about that. ;)
>>
> 
> Yea I missed that, will fix in the next rev. Just dug through my emails
> and Christian suggested a work queue and Andrey also gave some input on
> the DRM scheduler design.
> 
> Also in the next rev I will likely update the run_wq to be passed in by the
> user.

Yes, and IMO that may need to be non-optional.

Regards,

Tvrtko
Matthew Brost Jan. 11, 2023, 5:52 p.m. UTC | #28
On Wed, Jan 11, 2023 at 09:09:45AM +0000, Tvrtko Ursulin wrote:
> 
> On 11/01/2023 01:13, Matthew Brost wrote:
> > On Tue, Jan 10, 2023 at 04:39:00PM +0000, Matthew Brost wrote:
> > > On Tue, Jan 10, 2023 at 11:28:08AM +0000, Tvrtko Ursulin wrote:
> > > > 
> > > > 
> > > > On 09/01/2023 17:27, Jason Ekstrand wrote:
> > > > 
> > > > [snip]
> > > > 
> > > > >       >>> AFAICT it proposes to have 1:1 between *userspace* created
> > > > >      contexts (per
> > > > >       >>> context _and_ engine) and drm_sched. I am not sure avoiding
> > > > >      invasive changes
> > > > >       >>> to the shared code is in the spirit of the overall idea and instead
> > > > >       >>> opportunity should be used to look at way to refactor/improve
> > > > >      drm_sched.
> > > > > 
> > > > > 
> > > > > Maybe?  I'm not convinced that what Xe is doing is an abuse at all or
> > > > > really needs to drive a re-factor.  (More on that later.)  There's only
> > > > > one real issue which is that it fires off potentially a lot of kthreads.
> > > > > Even that's not that bad given that kthreads are pretty light and you're
> > > > > not likely to have more kthreads than userspace threads which are much
> > > > > heavier.  Not ideal, but not the end of the world either.  Definitely
> > > > > something we can/should optimize but if we went through with Xe without
> > > > > this patch, it would probably be mostly ok.
> > > > > 
> > > > >       >> Yes, it is 1:1 *userspace* engines and drm_sched.
> > > > >       >>
> > > > >       >> I'm not really prepared to make large changes to DRM scheduler
> > > > >      at the
> > > > >       >> moment for Xe as they are not really required nor does Boris
> > > > >      seem they
> > > > >       >> will be required for his work either. I am interested to see
> > > > >      what Boris
> > > > >       >> comes up with.
> > > > >       >>
> > > > >       >>> Even on the low level, the idea to replace drm_sched threads
> > > > >      with workers
> > > > >       >>> has a few problems.
> > > > >       >>>
> > > > >       >>> To start with, the pattern of:
> > > > >       >>>
> > > > >       >>>    while (not_stopped) {
> > > > >       >>>     keep picking jobs
> > > > >       >>>    }
> > > > >       >>>
> > > > >       >>> Feels fundamentally in disagreement with workers (while
> > > > >      obviously fits
> > > > >       >>> perfectly with the current kthread design).
> > > > >       >>
> > > > >       >> The while loop breaks and worker exits if no jobs are ready.
> > > > > 
> > > > > 
> > > > > I'm not very familiar with workqueues. What are you saying would fit
> > > > > better? One scheduling job per work item rather than one big work item
> > > > > which handles all available jobs?
> > > > 
> > > > Yes and no, it indeed IMO does not fit to have a work item which is
> > > > potentially unbound in runtime. But it is a bit of a moot conceptual mismatch
> > > > because it is a worst case / theoretical, and I think due to more fundamental
> > > > concerns.
> > > > 
> > > > If we have to go back to the low level side of things, I've picked this
> > > > random spot to consolidate what I have already mentioned and perhaps expand.
> > > > 
> > > > To start with, let me pull out some thoughts from workqueue.rst:
> > > > 
> > > > """
> > > > Generally, work items are not expected to hog a CPU and consume many cycles.
> > > > That means maintaining just enough concurrency to prevent work processing
> > > > from stalling should be optimal.
> > > > """
> > > > 
> > > > For unbound queues:
> > > > """
> > > > The responsibility of regulating concurrency level is on the users.
> > > > """
> > > > 
> > > > Given the unbound queues will be spawned on demand to service all queued
> > > > work items (more interesting when mixing up with the system_unbound_wq), in
> > > > the proposed design the number of instantiated worker threads does not
> > > > correspond to the number of user threads (as you have elsewhere stated), but
> > > > pessimistically to the number of active user contexts. That is the number
> > > > which drives the maximum number of not-runnable jobs that can become
> > > > runnable at once, and hence spawn that many work items, and in turn unbound
> > > > worker threads.
> > > > 
> > > > Several problems there.
> > > > 
> > > > It is fundamentally pointless to have potentially that many more threads
> > > > than the number of CPU cores - it simply creates a scheduling storm.
> > > > 
> > > 
> > > We can use a different work queue if this is an issue, have a FIXME
> > > which indicates we should allow the user to pass in the work queue.
> > > 
> > > > Unbound workers have no CPU / cache locality either and no connection with
> > > > the CPU scheduler to optimize scheduling patterns. This may matter either on
> > > > large systems or on small ones. Whereas the current design allows for
> > > > scheduler to notice userspace CPU thread keeps waking up the same drm
> > > > scheduler kernel thread, and so it can keep them on the same CPU, the
> > > > unbound workers lose that ability and so 2nd CPU might be getting woken up
> > > > from low sleep for every submission.
> > > > 
> > > 
> > > I guess I don't understand kthread vs. workqueue scheduling internals.
> > 
> > Looked into this and we are not using unbound workers, rather we are just
> > using the system_wq which is indeed bound. Again we can change this so a
> > user can just pass in a worker too. After doing a bit of research, bound
> > workers allow the scheduler to use locality to avoid that exact
> > problem you're raising.
> > 
> > TL;DR I'm not buying any of these arguments although it is possible I am
> > missing something.
> 
> Well you told me it's using unbound.. message id
> Y7dEjcuc1arHBTGu@DUT025-TGLU.fm.intel.com:
> 
> """
> Right now the system_unbound_wq is used which does have a limit on the
> number of threads, right? I do have a FIXME to allow a worker to be
> passed in similar to TDR.
> """
> 

Yea, my mistake. A quick look at the code shows we are using system_wq (same
as TDR).

> With bound workers you will indeed get CPU locality. I am not sure what it
> will do in terms of concurrency. If it will serialize work items to fewer
> spawned workers that will be good for the CT contention issue, but may
> negatively affect latency. And possibly preemption / time slicing decisions
> since the order of submitting to the backend will not be in the order of
> context priority, hence high prio may be submitted right after low and
> immediately trigger preemption.
>

We should probably use system_highpri_wq for high priority contexts
(xe_engine).
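
Something like this is what I have in mind (sketch only; the
xe_engine_high_priority helper and work_run field are made up names):

  /* Pick the submit queue based on xe_engine priority so high prio
   * contexts are not queued behind low prio ones on system_wq.
   */
  struct workqueue_struct *wq = xe_engine_high_priority(e) ?
          system_highpri_wq : system_wq;

  queue_work(wq, &e->sched.work_run);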
 
> Anyway, since you are not buying any arguments on paper perhaps you are more
> open towards testing. If you would adapt gem_wsim for Xe you would be able
> to spawn N simulated transcode sessions on any Gen11+ machine and try it
> out.
> 
> For example:
> 
> gem_wsim -w benchmarks/wsim/media_load_balance_fhd26u7.wsim -c 36 -r 600
> 
> That will run you 36 parallel transcoding streams for 600 frames
> each. No client setup needed whatsoever apart from compiling IGT.
> 
> In the past that was quite a handy tool to identify scheduling issues, or
> validate changes against. All workloads with the media prefix have actually
> been hand crafted by looking at what real media pipelines do with real data.
> Few years back at least.
> 

Porting this is non-trivial as this is 2.5k lines of code. Also in Xe we are trending
to use UMD benchmarks to determine if there are performance problems as
in the i915 we had tons of microbenchmarks / IGT benchmarks that we found
meant absolutely nothing. Can't say if this benchmark falls into that
category.

We have VK and compute benchmarks running and haven't found any major issues
yet. The media UMD hasn't been ported because of the VM bind dependency
so I can't say if there are any issues with the media UMD + Xe.

What I can do is hack up xe_exec_threads to really hammer Xe - change it to
128x xe_engines + 8k execs per thread. Each exec is super simple, it
just stores a dword. It creates a thread per hardware engine, so on TGL
this is 5x threads.

Results below:
root@DUT025-TGLU:mbrost# xe_exec_threads --r threads-basic
IGT-Version: 1.26-ge26de4b2 (x86_64) (Linux: 6.1.0-rc1-xe+ x86_64)
Starting subtest: threads-basic
Subtest threads-basic: SUCCESS (1.215s)
root@DUT025-TGLU:mbrost# dumptrace | grep job | wc
  40960  491520 7401728
root@DUT025-TGLU:mbrost# dumptrace | grep engine | wc
    645    7095   82457

So with 640 xe_engines (5x are VM engines) it takes 1.215 seconds test
time to run 40960 execs. That seems to indicate we do not have a
scheduling problem.
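
(That works out to roughly 40960 / 1.215 ≈ 33.7k execs per second
pushed through the 640 xe_engines.)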

This is 8 core (or at least 8 threads) TGL:

root@DUT025-TGLU:mbrost# cat /proc/cpuinfo
...
processor       : 7
vendor_id       : GenuineIntel
cpu family      : 6
model           : 140
model name      : 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
stepping        : 1
microcode       : 0x3a
cpu MHz         : 2344.098
cache size      : 12288 KB
physical id     : 0
siblings        : 8
core id         : 3
cpu cores       : 4
...

Enough data to be convinced there is no issue with this design? I can
also hack up Xe to use fewer GPU schedulers with kthreads but again that
isn't trivial and doesn't seem necessary based on these results.

> It could show you real world behaviour of the kworkers approach and it could
> also enable you to cross reference any power and performance changes
> relative to i915. Background story there is that media servers like to fit N
> streams to a server and if a change comes along which suddenly makes only
> N-1 streams fit before dropping out of realtime, that's a big problem.
> 
> If you will believe me there is value in that kind of testing I am happy to
> help you add Xe support to the tool, time permitting so possibly guidance
> only at the moment.

If we want to port the tool I won't stop you and will provide support if
you struggle with the uAPI, but based on my results above I don't think
this is necessary.

Matt

> 
> Regards,
> 
> Tvrtko
Matthew Brost Jan. 11, 2023, 6:07 p.m. UTC | #29
On Wed, Jan 11, 2023 at 09:17:01AM +0000, Tvrtko Ursulin wrote:
> 
> On 10/01/2023 19:01, Matthew Brost wrote:
> > On Tue, Jan 10, 2023 at 04:50:55PM +0000, Tvrtko Ursulin wrote:
> > > 
> > > On 10/01/2023 15:55, Matthew Brost wrote:
> > > > On Tue, Jan 10, 2023 at 12:19:35PM +0000, Tvrtko Ursulin wrote:
> > > > > 
> > > > > On 10/01/2023 11:28, Tvrtko Ursulin wrote:
> > > > > > 
> > > > > > 
> > > > > > On 09/01/2023 17:27, Jason Ekstrand wrote:
> > > > > > 
> > > > > > [snip]
> > > > > > 
> > > > > > >        >>> AFAICT it proposes to have 1:1 between *userspace* created
> > > > > > >       contexts (per
> > > > > > >        >>> context _and_ engine) and drm_sched. I am not sure avoiding
> > > > > > >       invasive changes
> > > > > > >        >>> to the shared code is in the spirit of the overall idea and
> > > > > > > instead
> > > > > > >        >>> opportunity should be used to look at way to refactor/improve
> > > > > > >       drm_sched.
> > > > > > > 
> > > > > > > 
> > > > > > > Maybe?  I'm not convinced that what Xe is doing is an abuse at all
> > > > > > > or really needs to drive a re-factor.  (More on that later.)
> > > > > > > There's only one real issue which is that it fires off potentially a
> > > > > > > lot of kthreads. Even that's not that bad given that kthreads are
> > > > > > > pretty light and you're not likely to have more kthreads than
> > > > > > > userspace threads which are much heavier.  Not ideal, but not the
> > > > > > > end of the world either.  Definitely something we can/should
> > > > > > > optimize but if we went through with Xe without this patch, it would
> > > > > > > probably be mostly ok.
> > > > > > > 
> > > > > > >        >> Yes, it is 1:1 *userspace* engines and drm_sched.
> > > > > > >        >>
> > > > > > >        >> I'm not really prepared to make large changes to DRM scheduler
> > > > > > >       at the
> > > > > > >        >> moment for Xe as they are not really required nor does Boris
> > > > > > >       seem they
> > > > > > >        >> will be required for his work either. I am interested to see
> > > > > > >       what Boris
> > > > > > >        >> comes up with.
> > > > > > >        >>
> > > > > > >        >>> Even on the low level, the idea to replace drm_sched threads
> > > > > > >       with workers
> > > > > > >        >>> has a few problems.
> > > > > > >        >>>
> > > > > > >        >>> To start with, the pattern of:
> > > > > > >        >>>
> > > > > > >        >>>    while (not_stopped) {
> > > > > > >        >>>     keep picking jobs
> > > > > > >        >>>    }
> > > > > > >        >>>
> > > > > > >        >>> Feels fundamentally in disagreement with workers (while
> > > > > > >       obviously fits
> > > > > > >        >>> perfectly with the current kthread design).
> > > > > > >        >>
> > > > > > >        >> The while loop breaks and worker exits if no jobs are ready.
> > > > > > > 
> > > > > > > 
> > > > > > > I'm not very familiar with workqueues. What are you saying would fit
> > > > > > > better? One scheduling job per work item rather than one big work
> > > > > > > item which handles all available jobs?
> > > > > > 
> > > > > > Yes and no, it indeed IMO does not fit to have a work item which is
> > > > > > potentially unbound in runtime. But it is a bit of a moot conceptual mismatch
> > > > > > because it is a worst case / theoretical, and I think due to more
> > > > > > fundamental concerns.
> > > > > > 
> > > > > > If we have to go back to the low level side of things, I've picked this
> > > > > > random spot to consolidate what I have already mentioned and perhaps
> > > > > > expand.
> > > > > > 
> > > > > > To start with, let me pull out some thoughts from workqueue.rst:
> > > > > > 
> > > > > > """
> > > > > > Generally, work items are not expected to hog a CPU and consume many
> > > > > > cycles. That means maintaining just enough concurrency to prevent work
> > > > > > processing from stalling should be optimal.
> > > > > > """
> > > > > > 
> > > > > > For unbound queues:
> > > > > > """
> > > > > > The responsibility of regulating concurrency level is on the users.
> > > > > > """
> > > > > > 
> > > > > > Given the unbound queues will be spawned on demand to service all queued
> > > > > > work items (more interesting when mixing up with the system_unbound_wq),
> > > > > > in the proposed design the number of instantiated worker threads does
> > > > > > not correspond to the number of user threads (as you have elsewhere
> > > > > > stated), but pessimistically to the number of active user contexts. That
> > > > > > is the number which drives the maximum number of not-runnable jobs that
> > > > > > can become runnable at once, and hence spawn that many work items, and
> > > > > > in turn unbound worker threads.
> > > > > > 
> > > > > > Several problems there.
> > > > > > 
> > > > > > It is fundamentally pointless to have potentially that many more threads
> > > > > > than the number of CPU cores - it simply creates a scheduling storm.
> > > > > 
> > > > > To make matters worse, if I follow the code correctly, all these per user
> > > > > context worker thread / work items end up contending on the same lock or
> > > > > circular buffer, both are one instance per GPU:
> > > > > 
> > > > > guc_engine_run_job
> > > > >    -> submit_engine
> > > > >       a) wq_item_append
> > > > >           -> wq_wait_for_space
> > > > >             -> msleep
> > > > 
> > > > a) is dedicated per xe_engine
> > > 
> > > Hah true, what is it for then? I thought throttling the LRCA ring is done via:
> > > 
> > 
> > This is a per guc_id 'work queue' which is used for parallel submission
> > (e.g. multiple LRC tail values need to be written atomically by the GuC).
> > Again in practice there should always be space.
> 
> Speaking of guc id, where does blocking when none are available happen in
> the non-parallel case?
> 

We have 64k guc_ids on native, 1k guc_ids with 64k VFs. Either way we
think that is more than enough and can just reject xe_engine creation if
we run out of guc_ids. If this proves to be false, we can fix it but the
guc_id stealing in the i915 is rather complicated and hopefully not needed.

We will limit the number of guc_ids allowed per user pid to a reasonable
number to prevent a DoS. Elevated pids (e.g. IGTs) will be able to do
whatever they want.
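
The rejection itself can then be trivial, e.g. (sketch only, names made
up, not the actual code):

  /* Fail xe_engine creation outright when guc_ids are exhausted,
   * rather than stealing ids from idle engines like the i915 does.
   */
  static int guc_id_alloc(struct xe_guc *guc, struct xe_engine *e)
  {
          int ret;

          ret = ida_alloc_max(&guc->guc_ids, GUC_ID_MAX - 1, GFP_KERNEL);
          if (ret < 0)
                  return ret;     /* -ENOSPC: no guc_id, no engine */

          e->guc_id = ret;
          return 0;
  }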

> > >    drm_sched_init(&ge->sched, &drm_sched_ops,
> > > 		 e->lrc[0].ring.size / MAX_JOB_SIZE_BYTES,
> > > 
> > > Is there something more to throttle other than the ring? It is throttling
> > > something using msleeps..
> > > 
> > > > Also you missed the step of programming the ring which is dedicated per xe_engine
> > > 
> > > I was trying to quickly find places which serialize on something in the
> > > backend, ringbuffer emission did not seem to do that but maybe I missed
> > > something.
> > > 
> > 
> > xe_ring_ops vfunc emit_job is called to write the ring.
> 
> Right, but does it serialize between different contexts? I didn't spot that
> it does, in which case it wasn't relevant to the sub-story.
>

Right, just saying this is an additional step that is done in parallel
between xe_engines.
 
> > > > 
> > > > >       b) xe_guc_ct_send
> > > > >           -> guc_ct_send
> > > > >             -> mutex_lock(&ct->lock);
> > > > >             -> later a potential msleep in h2g_has_room
> > > > 
> > > > Techincally there is 1 instance per GT not GPU, yes this is shared but
> > > > in practice there will always be space in the CT channel so contention
> > > > on the lock should be rare.
> > > 
> > > Yeah I used the term GPU to be more understandable to outside audience.
> > > 
> > > I am somewhat disappointed that the Xe opportunity hasn't been used to
> > > improve upon the CT communication bottlenecks. I mean those backoff sleeps
> > > and lock contention. I wish there would be a single thread in charge of the
> > > CT channel and internal users (other parts of the driver) would be able to
> > > send their requests to it in a more efficient manner, with less lock
> > > contention and centralized backoff.
> > > 
> > 
> > Well the CT backend was more or less a complete rewrite. Mutexes
> > actually work rather well to ensure fairness compared to the spin locks
> > used in the i915. This code was pretty heavily reviewed by Daniel and
> > both of us landed a big mutex for all of the CT code compared to the 3
> > or 4 spin locks used in the i915.
> 
> Are the "nb" sends gone? But that aside, I wasn't meaning just the locking
> but the high level approach. Never mind.
>

xe_guc_ct_send is non-blocking, xe_guc_ct_send_block is blocking. I
don't think the latter is used yet.
 
> > > > I haven't read your rather long reply yet, but also FWIW using a
> > > > workqueue was suggested by AMD (original authors of the DRM scheduler)
> > > > when we ran this design by them.
> > > 
> > > Commit message says nothing about that. ;)
> > > 
> > 
> > Yea I missed that, will fix in the next rev. Just dug through my emails
> > and Christian suggested a work queue and Andrey also gave some input on
> > the DRM scheduler design.
> > 
> > Also in the next rev I will likely update the run_wq to be passed in by the
> > user.
> 
> Yes, and IMO that may need to be non-optional.
>

Yea, will fix.

Matt
 
> Regards,
> 
> Tvrtko
John Harrison Jan. 11, 2023, 6:52 p.m. UTC | #30
On 1/11/2023 10:07, Matthew Brost wrote:
> On Wed, Jan 11, 2023 at 09:17:01AM +0000, Tvrtko Ursulin wrote:
>> On 10/01/2023 19:01, Matthew Brost wrote:
>>> On Tue, Jan 10, 2023 at 04:50:55PM +0000, Tvrtko Ursulin wrote:
>>>> On 10/01/2023 15:55, Matthew Brost wrote:
>>>>> On Tue, Jan 10, 2023 at 12:19:35PM +0000, Tvrtko Ursulin wrote:
>>>>>> On 10/01/2023 11:28, Tvrtko Ursulin wrote:
>>>>>>> On 09/01/2023 17:27, Jason Ekstrand wrote:
>>>>>>>
>>>>>>> [snip]
>>>>>>>
>>>>>>>>         >>> AFAICT it proposes to have 1:1 between *userspace* created
>>>>>>>>        contexts (per
>>>>>>>>         >>> context _and_ engine) and drm_sched. I am not sure avoiding
>>>>>>>>        invasive changes
>>>>>>>>         >>> to the shared code is in the spirit of the overall idea and
>>>>>>>> instead
>>>>>>>>         >>> opportunity should be used to look at way to refactor/improve
>>>>>>>>        drm_sched.
>>>>>>>>
>>>>>>>>
>>>>>>>> Maybe?  I'm not convinced that what Xe is doing is an abuse at all
>>>>>>>> or really needs to drive a re-factor.  (More on that later.)
>>>>>>>> There's only one real issue which is that it fires off potentially a
>>>>>>>> lot of kthreads. Even that's not that bad given that kthreads are
>>>>>>>> pretty light and you're not likely to have more kthreads than
>>>>>>>> userspace threads which are much heavier.  Not ideal, but not the
>>>>>>>> end of the world either.  Definitely something we can/should
>>>>>>>> optimize but if we went through with Xe without this patch, it would
>>>>>>>> probably be mostly ok.
>>>>>>>>
>>>>>>>>         >> Yes, it is 1:1 *userspace* engines and drm_sched.
>>>>>>>>         >>
>>>>>>>>         >> I'm not really prepared to make large changes to DRM scheduler
>>>>>>>>        at the
>>>>>>>>         >> moment for Xe as they are not really required nor does Boris
>>>>>>>>        seem they
>>>>>>>>         >> will be required for his work either. I am interested to see
>>>>>>>>        what Boris
>>>>>>>>         >> comes up with.
>>>>>>>>         >>
>>>>>>>>         >>> Even on the low level, the idea to replace drm_sched threads
>>>>>>>>        with workers
>>>>>>>>         >>> has a few problems.
>>>>>>>>         >>>
>>>>>>>>         >>> To start with, the pattern of:
>>>>>>>>         >>>
>>>>>>>>         >>>    while (not_stopped) {
>>>>>>>>         >>>     keep picking jobs
>>>>>>>>         >>>    }
>>>>>>>>         >>>
>>>>>>>>         >>> Feels fundamentally in disagreement with workers (while
>>>>>>>>        obviously fits
>>>>>>>>         >>> perfectly with the current kthread design).
>>>>>>>>         >>
>>>>>>>>         >> The while loop breaks and worker exits if no jobs are ready.
>>>>>>>>
>>>>>>>>
>>>>>>>> I'm not very familiar with workqueues. What are you saying would fit
>>>>>>>> better? One scheduling job per work item rather than one big work
>>>>>>>> item which handles all available jobs?
>>>>>>> Yes and no, it indeed IMO does not fit to have a work item which is
>>>>>>> potentially unbound in runtime. But it is a bit of a moot conceptual mismatch
>>>>>>> because it is a worst case / theoretical, and I think due to more
>>>>>>> fundamental concerns.
>>>>>>>
>>>>>>> If we have to go back to the low level side of things, I've picked this
>>>>>>> random spot to consolidate what I have already mentioned and perhaps
>>>>>>> expand.
>>>>>>>
>>>>>>> To start with, let me pull out some thoughts from workqueue.rst:
>>>>>>>
>>>>>>> """
>>>>>>> Generally, work items are not expected to hog a CPU and consume many
>>>>>>> cycles. That means maintaining just enough concurrency to prevent work
>>>>>>> processing from stalling should be optimal.
>>>>>>> """
>>>>>>>
>>>>>>> For unbound queues:
>>>>>>> """
>>>>>>> The responsibility of regulating concurrency level is on the users.
>>>>>>> """
>>>>>>>
>>>>>>> Given the unbound queues will be spawned on demand to service all queued
>>>>>>> work items (more interesting when mixing up with the system_unbound_wq),
>>>>>>> in the proposed design the number of instantiated worker threads does
>>>>>>> not correspond to the number of user threads (as you have elsewhere
>>>>>>> stated), but pessimistically to the number of active user contexts. That
>>>>>>> is the number which drives the maximum number of not-runnable jobs that
>>>>>>> can become runnable at once, and hence spawn that many work items, and
>>>>>>> in turn unbound worker threads.
>>>>>>>
>>>>>>> Several problems there.
>>>>>>>
>>>>>>> It is fundamentally pointless to have potentially that many more threads
>>>>>>> than the number of CPU cores - it simply creates a scheduling storm.
>>>>>> To make matters worse, if I follow the code correctly, all these per user
>>>>>> context worker thread / work items end up contending on the same lock or
>>>>>> circular buffer, both are one instance per GPU:
>>>>>>
>>>>>> guc_engine_run_job
>>>>>>     -> submit_engine
>>>>>>        a) wq_item_append
>>>>>>            -> wq_wait_for_space
>>>>>>              -> msleep
>>>>> a) is dedicated per xe_engine
>>>> Hah true, what is it for then? I thought throttling the LRCA ring is done via:
>>>>
>>> This is a per guc_id 'work queue' which is used for parallel submission
>>> (e.g. multiple LRC tail values need to be written atomically by the GuC).
>>> Again in practice there should always be space.
>> Speaking of guc id, where does blocking when none are available happen in
>> the non-parallel case?
>>
> We have 64k guc_ids on native, 1k guc_ids with 64k VFs. Either way we
> think that is more than enough and can just reject xe_engine creation if
> we run out of guc_ids. If this proves to be false, we can fix it, but the
> guc_id stealing in the i915 is rather complicated and hopefully not needed.
>
> We will limit the number of guc_ids allowed per user pid to a reasonable
> number to prevent a DoS. Elevated pids (e.g. IGTs) will be able to do
> whatever they want.
What about doorbells? At some point, we will have to start using those
and they are a much more limited resource - 256 total and way less with VFs.

John.
Matthew Brost Jan. 11, 2023, 6:55 p.m. UTC | #31
On Wed, Jan 11, 2023 at 10:52:54AM -0800, John Harrison wrote:
> On 1/11/2023 10:07, Matthew Brost wrote:
> > On Wed, Jan 11, 2023 at 09:17:01AM +0000, Tvrtko Ursulin wrote:
> > > On 10/01/2023 19:01, Matthew Brost wrote:
> > > > On Tue, Jan 10, 2023 at 04:50:55PM +0000, Tvrtko Ursulin wrote:
> > > > > On 10/01/2023 15:55, Matthew Brost wrote:
> > > > > > On Tue, Jan 10, 2023 at 12:19:35PM +0000, Tvrtko Ursulin wrote:
> > > > > > > On 10/01/2023 11:28, Tvrtko Ursulin wrote:
> > > > > > > > On 09/01/2023 17:27, Jason Ekstrand wrote:
> > > > > > > > 
> > > > > > > > [snip]
> > > > > > > > 
> > > > > > > > >         >>> AFAICT it proposes to have 1:1 between *userspace* created
> > > > > > > > >        contexts (per
> > > > > > > > >         >>> context _and_ engine) and drm_sched. I am not sure avoiding
> > > > > > > > >        invasive changes
> > > > > > > > >         >>> to the shared code is in the spirit of the overall idea and
> > > > > > > > > instead
> > > > > > > > >         >>> opportunity should be used to look at way to refactor/improve
> > > > > > > > >        drm_sched.
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > Maybe?  I'm not convinced that what Xe is doing is an abuse at all
> > > > > > > > > or really needs to drive a re-factor.  (More on that later.)
> > > > > > > > > There's only one real issue which is that it fires off potentially a
> > > > > > > > > lot of kthreads. Even that's not that bad given that kthreads are
> > > > > > > > > pretty light and you're not likely to have more kthreads than
> > > > > > > > > userspace threads which are much heavier.  Not ideal, but not the
> > > > > > > > > end of the world either.  Definitely something we can/should
> > > > > > > > > optimize but if we went through with Xe without this patch, it would
> > > > > > > > > probably be mostly ok.
> > > > > > > > > 
> > > > > > > > >         >> Yes, it is 1:1 *userspace* engines and drm_sched.
> > > > > > > > >         >>
> > > > > > > > >         >> I'm not really prepared to make large changes to DRM scheduler
> > > > > > > > >        at the
> > > > > > > > >         >> moment for Xe as they are not really required nor does Boris
> > > > > > > > >        seem they
> > > > > > > > >         >> will be required for his work either. I am interested to see
> > > > > > > > >        what Boris
> > > > > > > > >         >> comes up with.
> > > > > > > > >         >>
> > > > > > > > >         >>> Even on the low level, the idea to replace drm_sched threads
> > > > > > > > >        with workers
> > > > > > > > >         >>> has a few problems.
> > > > > > > > >         >>>
> > > > > > > > >         >>> To start with, the pattern of:
> > > > > > > > >         >>>
> > > > > > > > >         >>>    while (not_stopped) {
> > > > > > > > >         >>>     keep picking jobs
> > > > > > > > >         >>>    }
> > > > > > > > >         >>>
> > > > > > > > >         >>> Feels fundamentally in disagreement with workers (while
> > > > > > > > >        obviously fits
> > > > > > > > >         >>> perfectly with the current kthread design).
> > > > > > > > >         >>
> > > > > > > > >         >> The while loop breaks and the worker exits if no jobs are ready.
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > I'm not very familiar with workqueues. What are you saying would fit
> > > > > > > > > better? One scheduling job per work item rather than one big work
> > > > > > > > > item which handles all available jobs?
> > > > > > > > Yes and no, it indeed IMO does not fit to have a work item which is
> > > > > > > > potentially unbound in runtime. But it is a bit of a moot conceptual
> > > > > > > > mismatch because it is a worst case / theoretical, and I think due to
> > > > > > > > more fundamental concerns.
> > > > > > > > 
> > > > > > > > If we have to go back to the low level side of things, I've picked this
> > > > > > > > random spot to consolidate what I have already mentioned and perhaps
> > > > > > > > expand.
> > > > > > > > 
> > > > > > > > To start with, let me pull out some thoughts from workqueue.rst:
> > > > > > > > 
> > > > > > > > """
> > > > > > > > Generally, work items are not expected to hog a CPU and consume many
> > > > > > > > cycles. That means maintaining just enough concurrency to prevent work
> > > > > > > > processing from stalling should be optimal.
> > > > > > > > """
> > > > > > > > 
> > > > > > > > For unbound queues:
> > > > > > > > """
> > > > > > > > The responsibility of regulating concurrency level is on the users.
> > > > > > > > """
> > > > > > > > 
> > > > > > > > Given the unbound queues will be spawned on demand to service all queued
> > > > > > > > work items (more interesting when mixing up with the system_unbound_wq),
> > > > > > > > in the proposed design the number of instantiated worker threads does
> > > > > > > > not correspond to the number of user threads (as you have elsewhere
> > > > > > > > stated), but pessimistically to the number of active user contexts. That
> > > > > > > > is the number which drives the maximum number of not-runnable jobs that
> > > > > > > > can become runnable at once, and hence spawn that many work items, and
> > > > > > > > in turn unbound worker threads.
> > > > > > > > 
> > > > > > > > Several problems there.
> > > > > > > > 
> > > > > > > > It is fundamentally pointless to have potentially that many more threads
> > > > > > > > than the number of CPU cores - it simply creates a scheduling storm.
> > > > > > > To make matters worse, if I follow the code correctly, all these per user
> > > > > > > context worker thread / work items end up contending on the same lock or
> > > > > > > circular buffer, both are one instance per GPU:
> > > > > > > 
> > > > > > > guc_engine_run_job
> > > > > > >     -> submit_engine
> > > > > > >        a) wq_item_append
> > > > > > >            -> wq_wait_for_space
> > > > > > >              -> msleep
> > > > > > a) is dedicated per xe_engine
> > > > > Hah true, what is it for then? I thought throttling the LRCA ring is done via:
> > > > > 
> > > > This is a per guc_id 'work queue' which is used for parallel submission
> > > > (e.g. multiple LRC tail values need to be written atomically by the GuC).
> > > > Again in practice there should always be space.
> > > Speaking of guc id, where does blocking when none are available happen in
> > > the non-parallel case?
> > > 
> > We have 64k guc_ids on native, 1k guc_ids with 64k VFs. Either way we
> > think that is more than enough and can just reject xe_engine creation if
> > we run out of guc_ids. If this proves to be false, we can fix it, but the
> > guc_id stealing in the i915 is rather complicated and hopefully not needed.
> > 
> > We will limit the number of guc_ids allowed per user pid to a reasonable
> > number to prevent a DoS. Elevated pids (e.g. IGTs) will be able to do
> > whatever they want.
> What about doorbells? At some point, we will have to start using those and
> they are a much more limited resource - 256 total and way less with VFs.
> 

We haven't thought about that one yet; we will figure it out when we
implement doorbell support.

Matt

> John.
>
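A minimal sketch of what such per-pid guc_id accounting could look like, in case it helps make the idea concrete; every name below is hypothetical (this is not actual Xe code), and a real version would need locking around the lookup/update pair:

#include <linux/capability.h>
#include <linux/sched.h>
#include <linux/xarray.h>

#define GUC_ID_PER_PID_MAX 1024 /* assumed "reasonable number" */

static DEFINE_XARRAY(guc_id_counts); /* pid -> number of guc_ids held */

static int guc_id_account(void)
{
        unsigned long pid = task_pid_nr(current);
        void *entry;
        unsigned long count;

        /* Elevated processes (e.g. IGTs) bypass the limit. */
        if (capable(CAP_SYS_ADMIN))
                return 0;

        entry = xa_load(&guc_id_counts, pid);
        count = entry ? xa_to_value(entry) : 0;
        if (count >= GUC_ID_PER_PID_MAX)
                return -EBUSY; /* reject xe_engine creation */

        /* A real version would hold xa_lock() across the load/store. */
        return xa_err(xa_store(&guc_id_counts, pid,
                               xa_mk_value(count + 1), GFP_KERNEL));
}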
Matthew Brost Jan. 11, 2023, 7:40 p.m. UTC | #32
On Wed, Jan 11, 2023 at 08:50:37AM +0000, Tvrtko Ursulin wrote:
> 
> On 10/01/2023 14:08, Jason Ekstrand wrote:
> > On Tue, Jan 10, 2023 at 5:28 AM Tvrtko Ursulin
> > <tvrtko.ursulin@linux.intel.com <mailto:tvrtko.ursulin@linux.intel.com>>
> > wrote:
> > 
> > 
> > 
> >     On 09/01/2023 17:27, Jason Ekstrand wrote:
> > 
> >     [snip]
> > 
> >      >      >>> AFAICT it proposes to have 1:1 between *userspace* created
> >      >     contexts (per
> >      >      >>> context _and_ engine) and drm_sched. I am not sure avoiding
> >      >     invasive changes
> >      >      >>> to the shared code is in the spirit of the overall idea
> >     and instead
> >      >      >>> opportunity should be used to look at way to
> >     refactor/improve
> >      >     drm_sched.
> >      >
> >      >
> >      > Maybe?  I'm not convinced that what Xe is doing is an abuse at
> >     all or
> >      > really needs to drive a re-factor.  (More on that later.)
> > There's only
> >      > one real issue which is that it fires off potentially a lot of
> >     kthreads.
> >      > Even that's not that bad given that kthreads are pretty light and
> >     you're
> >      > not likely to have more kthreads than userspace threads which are
> >     much
> >      > heavier.  Not ideal, but not the end of the world either.
> > Definitely
> >      > something we can/should optimize but if we went through with Xe
> >     without
> >      > this patch, it would probably be mostly ok.
> >      >
> >      >      >> Yes, it is 1:1 *userspace* engines and drm_sched.
> >      >      >>
> >      >      >> I'm not really prepared to make large changes to DRM
> >     scheduler
> >      >     at the
> >      >      >> moment for Xe as they are not really required nor does Boris
> >      >     seem they
> >      >      >> will be required for his work either. I am interested to see
> >      >     what Boris
> >      >      >> comes up with.
> >      >      >>
> >      >      >>> Even on the low level, the idea to replace drm_sched threads
> >      >     with workers
> >      >      >>> has a few problems.
> >      >      >>>
> >      >      >>> To start with, the pattern of:
> >      >      >>>
> >      >      >>>    while (not_stopped) {
> >      >      >>>     keep picking jobs
> >      >      >>>    }
> >      >      >>>
> >      >      >>> Feels fundamentally in disagreement with workers (while
> >      >     obviously fits
> >      >      >>> perfectly with the current kthread design).
> >      >      >>
> >      >      >> The while loop breaks and the worker exits if no jobs are ready.
> >      >
> >      >
> >      > I'm not very familiar with workqueues. What are you saying would fit
> >      > better? One scheduling job per work item rather than one big work
> >     item
> >      > which handles all available jobs?
> > 
> >     Yes and no, it indeed IMO does not fit to have a work item which is
> >     potentially unbound in runtime. But it is a bit of a moot conceptual
> >     mismatch because it is a worst case / theoretical, and I think due to
> >     more fundamental concerns.
> > 
> >     If we have to go back to the low level side of things, I've picked this
> >     random spot to consolidate what I have already mentioned and perhaps
> >     expand.
> > 
> >     To start with, let me pull out some thoughts from workqueue.rst:
> > 
> >     """
> >     Generally, work items are not expected to hog a CPU and consume many
> >     cycles. That means maintaining just enough concurrency to prevent work
> >     processing from stalling should be optimal.
> >     """
> > 
> >     For unbound queues:
> >     """
> >     The responsibility of regulating concurrency level is on the users.
> >     """
> > 
> >     Given the unbound queues will be spawned on demand to service all
> >     queued
> >     work items (more interesting when mixing up with the
> >     system_unbound_wq),
> >     in the proposed design the number of instantiated worker threads does
> >     not correspond to the number of user threads (as you have elsewhere
> >     stated), but pessimistically to the number of active user contexts.
> > 
> > 
> > Those are pretty much the same in practice.  Rather, user threads is
> > typically an upper bound on the number of contexts.  Yes, a single user
> > thread could have a bunch of contexts but basically nothing does that
> > except IGT.  In real-world usage, it's at most one context per user
> > thread.
> 
> Typically is the key here. But I am not sure it is good enough. Consider
> this example - Intel Flex 170:
> 
>  * Delivers up to 36 streams 1080p60 transcode throughput per card.
>  * When scaled to 10 cards in a 4U server configuration, it can support up
> to 360 streams of HEVC/HEVC 1080p60 transcode throughput.
> 
> One transcode stream from my experience typically is 3-4 GPU contexts
> (buffer travels from vcs -> rcs -> vcs, maybe vecs) used from a single CPU
> thread. 4 contexts * 36 streams = 144 active contexts. Multiply by 60fps =
> 8640 jobs submitted and completed per second.
> 

See my reply with my numbers based on running xe_exec_threads: on a TGL we
are getting 33711 jobs per sec with 640 xe_engines. This seems to scale
just fine.

> 144 active contexts in the proposed scheme possibly means 144 kernel
> worker threads spawned (driven by 36 transcode CPU threads). (I don't think
> the pools would scale down given all are constantly pinged at 60fps.)
> 
> And then each of 144 threads goes to grab the single GuC CT mutex. First
> threads are being made schedulable, then put to sleep as mutex contention is
> hit, then woken again as mutexes are getting released, rinse, repeat.
> 
> (And yes this backend contention is there regardless of 1:1:1, it would
> require a different re-design to solve that. But it is just a question
> whether there are 144 contending threads, or just 6 with the thread per
> engine class scheme.)
> 
> Then multiply all by 10 for a 4U server use case and you get 1440 worker
> kthreads, yes 10 more CT locks, but contending on how many CPU cores? Just
> so they can grab a timeslice and maybe content on a mutex as the next step.
>

Same as above, this seems to scale just fine, as I bet the above example
of 33711 jobs per sec is limited by the GuC context switching rather than
by Xe being able to feed the GuC. Also, a server in this configuration
certainly has a much faster CPU than the TGL I was using.

Also did another quick change to use 1280 xe_engines in xe_exec_threads:
root@DUT025-TGLU:igt-gpu-tools# xe_exec_threads --r threads-basic
IGT-Version: 1.26-ge26de4b2 (x86_64) (Linux: 6.1.0-rc1-xe+ x86_64)
Starting subtest: threads-basic
Subtest threads-basic: SUCCESS (1.198s)

More or less the same results as with 640 xe_engines.
 
> This example is where it would hurt on large systems. Imagine only an even
> wider media transcode card...
> 
> Second example is only a single engine class used (3d desktop?) but with a
> bunch of not-runnable jobs queued and waiting on a fence to signal. Implicit
> or explicit dependencies doesn't matter. Then the fence signals and call
> backs run. N work items get scheduled, but they all submit to the same HW
> engine. So we end up with:
> 
>         /-- wi1 --\
>        / ..     .. \
>  cb --+---  wi.. ---+-- rq1 -- .. -- rqN
>        \ ..    ..  /
>         \-- wiN --/
> 
> 
> All that we have achieved is waking up N CPUs to contend on the same lock
> and effectively insert the job into the same single HW queue. I don't see
> any positives there.
>

I've said this before: the CT channel in practice isn't going to be full,
so the section of code protected by the mutex is really, really small.
The mutex really shouldn't ever have contention. Also, does a mutex spin
for a small period of time before going to sleep? I seem to recall some
type of core lock doing this; if we can use a lock that spins for a short
period of time, this argument falls apart.
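For reference, kernel mutexes already spin optimistically while the lock
owner is running on a CPU (CONFIG_MUTEX_SPIN_ON_OWNER), so a short hold
like the one described should rarely hit the sleeping slow path. A rough
sketch of the kind of critical section being argued about, with
hypothetical names rather than the actual Xe CT code:

#include <linux/mutex.h>
#include <linux/types.h>

struct guc_ct {
        struct mutex lock; /* protects the CT ring tail */
        /* ring buffer state elided */
};

/* Stub: copy a few dwords into the CT ring. */
static int ct_write(struct guc_ct *ct, const u32 *action, u32 len)
{
        return 0;
}

static int ct_send(struct guc_ct *ct, const u32 *action, u32 len)
{
        int ret;

        /*
         * The protected region is just the ring write; when contention
         * does happen, the optimistic spin in mutex_lock() should avoid
         * a sleep/wake cycle for holds this short.
         */
        mutex_lock(&ct->lock);
        ret = ct_write(ct, action, len);
        mutex_unlock(&ct->lock);

        return ret;
}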
 
> This example I think can particularly hurt small / low power devices because
> of needless waking up of many cores for no benefit. Granted, I don't have a
> good feel on how common this pattern is in practice.
> 
> > 
> >     That
> >     is the number which drives the maximum number of not-runnable jobs that
> >     can become runnable at once, and hence spawn that many work items, and
> >     in turn unbound worker threads.
> > 
> >     Several problems there.
> > 
> >     It is fundamentally pointless to have potentially that many more
> >     threads
> >     than the number of CPU cores - it simply creates a scheduling storm.
> > 
> >     Unbound workers have no CPU / cache locality either and no connection
> >     with the CPU scheduler to optimize scheduling patterns. This may matter
> >     either on large systems or on small ones. Whereas the current design
> >     allows for scheduler to notice userspace CPU thread keeps waking up the
> >     same drm scheduler kernel thread, and so it can keep them on the same
> >     CPU, the unbound workers lose that ability and so 2nd CPU might be
> >     getting woken up from low sleep for every submission.
> > 
> >     Hence, apart from being a bit of a impedance mismatch, the proposal has
> >     the potential to change performance and power patterns and both large
> >     and small machines.
> > 
> > 
> > Ok, thanks for explaining the issue you're seeing in more detail.  Yes,
> > deferred kwork does appear to mismatch somewhat with what the scheduler
> > needs or at least how it's worked in the past.  How much impact will
> > that mismatch have?  Unclear.
> > 
> >      >      >>> Secondly, it probably demands separate workers (not
> >     optional),
> >      >     otherwise
> >      >      >>> behaviour of shared workqueues has either the potential to
> >      >     explode the number of
> >      >      >>> kernel threads anyway, or add latency.
> >      >      >>>
> >      >      >>
> >      >      >> Right now the system_unbound_wq is used which does have a
> >     limit
> >      >     on the
> >      >      >> number of threads, right? I do have a FIXME to allow a
> >     worker to be
> >      >      >> passed in similar to TDR.
> >      >      >>
> >      >      >> WRT latency, the 1:1 ratio could actually have lower
> >     latency
> >      >     as 2 GPU
> >      >      >> schedulers can be pushing jobs into the backend / cleaning up
> >      >     jobs in
> >      >      >> parallel.
> >      >      >>
> >      >      >
> >      >      > Thought of one more point here on why in Xe we
> >     absolutely want
> >      >     a 1 to
> >      >      > 1 ratio between entity and scheduler - the way we implement
> >      >     timeslicing
> >      >      > for preempt fences.
> >      >      >
> >      >      > Let me try to explain.
> >      >      >
> >      >      > Preempt fences are implemented via the generic messaging
> >      >     interface [1]
> >      >      > with suspend / resume messages. If a suspend message is
> >     received too
> >      >      > soon after calling resume (this is per entity) we simply
> >     sleep in the
> >      >      > suspend call thus giving the entity a timeslice. This
> >     completely
> >      >     falls
> >      >      > apart with a many to 1 relationship as now an entity
> >     waiting for a
> >      >      > timeslice blocks the other entities. Could we work around
> >     this,
> >      >     sure but
> >      >      > just another bunch of code we'd have to add in Xe. Being able to
> >      >     freely sleep
> >      >      > in the backend without affecting other entities is really, really
> >      >     nice IMO
> >      >      > and I bet Xe isn't the only driver that is going to feel
> >     this way.
> >      >      >
> >      >      > Last thing I'll say: regardless of how anyone feels about
> >     Xe using
> >      >     a 1 to
> >      >      > 1 relationship, this patch IMO makes sense, as I hope we can all
> >      >     agree a
> >      >      > workqueue scales better than kthreads.
> >      >
> >      >     I don't know for sure what will scale better and for what use
> >     case,
> >      >     combination of CPU cores vs number of GPU engines to keep
> >     busy vs other
> >      >     system activity. But I wager someone is bound to ask for some
> >      >     numbers to
> >      >     make sure proposal is not negatively affecting any other drivers.
> >      >
> >      >
> >      > Then let them ask.  Waving your hands vaguely in the direction of
> >     the
> >      > rest of DRM and saying "Uh, someone (not me) might object" is
> >     profoundly
> >      > unhelpful.  Sure, someone might.  That's why it's on dri-devel.
> > If you
> >      > think there's someone in particular who might have a useful
> >     opinion on
> >      > this, throw them in the CC so they don't miss the e-mail thread.
> >      >
> >      > Or are you asking for numbers?  If so, what numbers are you
> >     asking for?
> > 
> >     It was a heads up to the Xe team in case people weren't appreciating
> >     how
> >     the proposed change has the potential to influence power and performance
> >     across the board. And nothing in the follow up discussion made me think
> >     it was considered so I don't think it was redundant to raise it.
> > 
> >     In my experience it is typical that such core changes come with some
> >     numbers. Which is in case of drm scheduler is tricky and probably
> >     requires explicitly asking everyone to test (rather than count on
> >     "don't
> >     miss the email thread"). Real products can fail to ship due ten mW here
> >     or there. Like suddenly an extra core prevented from getting into deep
> >     sleep.
> > 
> >     If that was "profoundly unhelpful" so be it.
> > 
> > 
> > With your above explanation, it makes more sense what you're asking.
> > It's still not something Matt is likely to be able to provide on his
> > own.  We need to tag some other folks and ask them to test it out.  We
> > could play around a bit with it on Xe but it's not exactly production
> > grade yet and is going to hit this differently from most.  Likely
> > candidates are probably AMD and Freedreno.
> 
> Whoever is setup to check out power and performance would be good to give it
> a spin, yes.
> 
> PS. I don't think I was asking Matt to test with other devices. To start
> with I think Xe is a team effort. I was asking for more background on the
> design decision since patch 4/20 does not say anything on that angle, nor
> later in the thread it was IMO sufficiently addressed.
> 
> >      > Also, If we're talking about a design that might paint us into an
> >      > Intel-HW-specific hole, that would be one thing.  But we're not.
> > We're
> >      > talking about switching which kernel threading/task mechanism to
> >     use for
> >      > what's really a very generic problem.  The core Xe design works
> >     without
> >      > this patch (just with more kthreads).  If we land this patch or
> >      > something like it and get it wrong and it causes a performance
> >     problem
> >      > for someone down the line, we can revisit it.
> > 
> >     For some definition of "it works" - I really wouldn't suggest
> >     shipping a
> >     kthread per user context at any point.
> > 
> > 
> > You have yet to elaborate on why. What resources is it consuming that's
> > going to be a problem? Are you anticipating CPU affinity problems? Or
> > does it just seem wasteful?
> 
> Well I don't know, commit message says the approach does not scale. :)
>

I don't think we want a user interface to directly be able to create a
kthread; that seems like a bad idea, as Christian pointed out to us
off the list last March.
 
> > I think I largely agree that it's probably unnecessary/wasteful but
> > reducing the number of kthreads seems like a tractable problem to solve
> > regardless of where we put the gpu_scheduler object.  Is this the right
> > solution?  Maybe not.  It was also proposed at one point that we could
> > split the scheduler into two pieces: A scheduler which owns the kthread,
> > and a back-end which targets some HW ring thing where you can have
> > multiple back-ends per scheduler.  That's certainly more invasive from a
> > DRM scheduler internal API PoV but would solve the kthread problem in a
> > way that's more similar to what we have now.
> > 
> >      >     In any case that's a low level question caused by the high
> >     level design
> >      >     decision. So I'd think first focus on the high level - which
> >     is the 1:1
> >      >     mapping of entity to scheduler instance proposal.
> >      >
> >      >     Fundamentally it will be up to the DRM maintainers and the
> >     community to
> >      >     bless your approach. And it is important to stress 1:1 is about
> >      >     userspace contexts, so I believe unlike any other current
> >     scheduler
> >      >     user. And also important to stress this effectively does not
> >     make Xe
> >      >     _really_ use the scheduler that much.
> >      >
> >      >
> >      > I don't think this makes Xe nearly as much of a one-off as you
> >     think it
> >      > does.  I've already told the Asahi team working on Apple M1/2
> >     hardware
> >      > to do it this way and it seems to be a pretty good mapping for
> >     them. I
> >      > believe this is roughly the plan for nouveau as well.  It's not
> >     the way
> >      > it currently works for anyone because most other groups aren't
> >     doing FW
> >      > scheduling yet.  In the world of FW scheduling and hardware
> >     designed to
> >      > support userspace direct-to-FW submit, I think the design makes
> >     perfect
> >      > sense (see below) and I expect we'll see more drivers move in this
> >      > direction as those drivers evolve.  (AMD is doing some customish
> >     thing
> >      > with gpu_scheduler on the front-end somehow. I've not dug
> >     into
> >      > those details.)
> >      >
> >      >     I can only offer my opinion, which is that the two options
> >     mentioned in
> >      >     this thread (either improve drm scheduler to cope with what is
> >      >     required,
> >      >     or split up the code so you can use just the parts of
> >     drm_sched which
> >      >     you want - which is frontend dependency tracking) shouldn't be so
> >      >     readily dismissed, given how I think the idea was for the new
> >     driver to
> >      >     work less in a silo and more in the community (not do kludges to
> >      >     work around stuff because it is thought to be too hard to
> >     improve common
> >      >     code), but fundamentally, "goto previous paragraph" for what I am
> >      >     concerned.
> >      >
> >      >
> >      > Meta comment:  It appears as if you're falling into the standard
> >     i915
> >      > team trap of having an internal discussion about what the community
> >      > discussion might look like instead of actually having the community
> >      > discussion.  If you are seriously concerned about interactions with
> >      > other drivers or with setting common direction, the right
> >     way to
> >      > do that is to break a patch or two out into a separate RFC series
> >     and
> >      > tag a handful of driver maintainers.  Trying to predict the
> >     questions
> >      > other people might ask is pointless. Cc them and ask for their
> >     input
> >      > instead.
> > 
> >     I don't follow you here. It's not an internal discussion - I am raising
> >     my concerns on the design publicly. I am supposed to write a patch to
> >     show something, but am allowed to comment on a RFC series?
> > 
> > 
> > I may have misread your tone a bit.  It felt a bit like too many
> > discussions I've had in the past where people are trying to predict what
> > others will say instead of just asking them.  Reading it again, I was
> > probably jumping to conclusions a bit.  Sorry about that.
> 
> Okay no problem, thanks. In any case we don't have to keep discussing it,
> since I wrote one or two emails ago it is fundamentally on the maintainers
> and community to ack the approach. I only felt like RFC did not explain the
> potential downsides sufficiently so I wanted to probe that area a bit.
> 
> >     It is "drm/sched: Convert drm scheduler to use a work queue rather than
> >     kthread" which should have Cc-ed _everyone_ who use drm scheduler.
> > 
> > 
> > Yeah, it probably should have.  I think that's mostly what I've been
> > trying to say.
> > 
> >      >
> >      >     Regards,
> >      >
> >      >     Tvrtko
> >      >
> >      >     P.S. And as a related side note, there are more areas where
> >     drm_sched
> >      >     could be improved, like for instance priority handling.
> >      >     Take a look at msm_submitqueue_create /
> >     msm_gpu_convert_priority /
> >      >     get_sched_entity to see how msm works around the drm_sched
> >     hardcoded
> >      >     limit of available priority levels, in order to avoid having
> >     to leave a
> >      >     hw capability unused. I suspect msm would be happier if they
> >     could have
> >      >     all priority levels equal in terms of whether they apply only
> >     at the
> >      >     frontend level or completely throughout the pipeline.
> >      >
> >      >      > [1]
> >      >
> >     https://patchwork.freedesktop.org/patch/515857/?series=112189&rev=1
> >     <https://patchwork.freedesktop.org/patch/515857/?series=112189&rev=1>
> >      >
> >  <https://patchwork.freedesktop.org/patch/515857/?series=112189&rev=1
> > <https://patchwork.freedesktop.org/patch/515857/?series=112189&rev=1>>
> >      >      >
> >      >      >>> What would be interesting to learn is whether the option of
> >      >     refactoring
> >      >      >>> drm_sched to deal with out of order completion was
> >     considered
> >      >     and what were
> >      >      >>> the conclusions.
> >      >      >>>
> >      >      >>
> >      >      >> I coded this up a while back when trying to convert the
> >     i915 to
> >      >     the DRM
> >      >      >> scheduler it isn't all that hard either. The free flow
> >     control
> >      >     on the
> >      >      >> ring (e.g. set job limit == SIZE OF RING / MAX JOB SIZE) is
> >      >     really what
> >      >      >> sold me on this design.
> >      >
> >      >
> >      > You're not the only one to suggest supporting out-of-order
> >     completion.
> >      > However, it's tricky and breaks a lot of internal assumptions of the
> >      > scheduler. It also reduces functionality a bit because it can no
> >     longer
> >      > automatically rate-limit HW/FW queues which are often
> >     fixed-size.  (Ok,
> >      > yes, it probably could but it becomes a substantially harder
> >     problem.)
> >      >
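As an aside, the "free flow control" quoted above (job limit == SIZE OF
RING / MAX JOB SIZE) just means sizing drm_sched's hw_submission credit
from the ring geometry. A sketch against the v6.1-era drm_sched_init()
signature, with made-up ring numbers:

#include <linux/jiffies.h>
#include <linux/sizes.h>
#include <drm/gpu_scheduler.h>

#define RING_SIZE        SZ_16K /* hypothetical LRC ring size */
#define MAX_SIZE_PER_JOB SZ_1K  /* assumed worst-case bytes per job */

static int init_engine_sched(struct drm_gpu_scheduler *sched,
                             const struct drm_sched_backend_ops *ops,
                             struct device *dev)
{
        /*
         * At most RING_SIZE / MAX_SIZE_PER_JOB jobs are in flight, so
         * the ring can never overflow: drm_sched stops pushing jobs to
         * the backend once the hw_submission credits are exhausted.
         */
        return drm_sched_init(sched, ops,
                              RING_SIZE / MAX_SIZE_PER_JOB, /* hw_submission */
                              0,                            /* hang_limit */
                              msecs_to_jiffies(5000),       /* timeout */
                              NULL,                         /* timeout_wq */
                              NULL,                         /* score */
                              "engine-sched", dev);
}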
> >      > It also seems like a worse mapping to me.  The goal here is to turn
> >      > submissions on a userspace-facing engine/queue into submissions
> >     to a FW
> >      > queue, sorting out any dma_fence dependencies.  Matt's
> >      > description of saying this is a 1:1 mapping between sched/entity
> >     doesn't
> >      > tell the whole story. It's a 1:1:1 mapping between xe_engine,
> >      > gpu_scheduler, and GuC FW engine.  Why make it a 1:something:1
> >     mapping?
> >      > Why is that better?
> > 
> >     As I have stated before, what I think would fit well for Xe is one
> >     drm_scheduler per engine class. In specific terms on our current
> >     hardware, one drm scheduler instance for render, compute, blitter,
> >     video
> >     and video enhance. Userspace contexts remain scheduler entities.
> > 
> > 
> > And this is where we fairly strongly disagree.  More in a bit.
> > 
> >     That way you avoid the whole kthread/kworker story and you have it
> >     actually use the entity picking code in the scheduler, which may be
> >     useful when the backend is congested.
> > 
> > 
> > What back-end congestion are you referring to here?  Running out of FW
> > queue IDs?  Something else?
> 
> CT channel, number of context ids.
> 
> > 
> >     Yes you have to solve the out of order problem so in my mind that is
> >     something to discuss. What the problem actually is (just TDR?), how
> >     tricky and why etc.
> > 
> >     And yes you lose the handy LRCA ring buffer size management so you'd
> >     have to make those entities not runnable in some other way.
> > 
> >     Regarding the argument you raise below - would any of that make the
> >     frontend / backend separation worse and why? Do you think it is less
> >     natural? If neither is true then all that remains is that it appears extra
> >     work to support out of order completion of entities has been discounted
> >     in favour of an easy but IMO inelegant option.
> > 
> > 
> > Broadly speaking, the kernel needs to stop thinking about GPU scheduling
> > in terms of scheduling jobs and start thinking in terms of scheduling
> > contexts/engines.  There is still some need for scheduling individual
> > jobs but that is only for the purpose of delaying them as needed to
> > resolve dma_fence dependencies.  Once dependencies are resolved, they
> > get shoved onto the context/engine queue and from there the kernel only
> > really manages whole contexts/engines.  This is a major architectural
> > shift, entirely different from the way i915 scheduling works.  It's also
> > different from the historical usage of DRM scheduler which I think is
> > why this all looks a bit funny.
> > 
> > To justify this architectural shift, let's look at where we're headed.
> > In the glorious future...
> > 
> >   1. Userspace submits directly to firmware queues.  The kernel has no
> > visibility whatsoever into individual jobs.  At most it can pause/resume
> > FW contexts as needed to handle eviction and memory management.
> > 
> >   2. Because of 1, apart from handing out the FW queue IDs at the
> > beginning, the kernel can't really juggle them that much.  Depending on
> > FW design, it may be able to pause a client, give its IDs to another,
> > and then resume it later when IDs free up.  What it's not doing is
> > juggling IDs on a job-by-job basis like i915 currently is.
> > 
> >   3. Long-running compute jobs may not complete for days.  This means
> > that memory management needs to happen in terms of pause/resume of
> > entire contexts/engines using the memory rather than based on waiting
> > for individual jobs to complete or pausing individual jobs until the
> > memory is available.
> > 
> >   4. Synchronization happens via userspace memory fences (UMF) and the
> > kernel is mostly unaware of most dependencies and when a context/engine
> > is or is not runnable.  Instead, it keeps as many of them minimally
> > active (memory is available, even if it's in system RAM) as possible and
> > lets the FW sort out dependencies.  (There may need to be some facility
> > for sleeping a context until a memory change similar to futex() or
> > poll() for userspace threads.  There are some details TBD.)
> > 
> > Are there potential problems that will need to be solved here?  Yes.  Is
> > it a good design?  Well, Microsoft has been living in this future for
> > half a decade or better and it's working quite well for them.  It's also
> > the way all modern game consoles work.  It really is just Linux that's
> > stuck with the same old job model we've had since the monumental shift
> > to DRI2.
> > 
> > To that end, one of the core goals of the Xe project was to make the
> > driver internally behave as close to the above model as possible while
> > keeping the old-school job model as a very thin layer on top.  As the
> > broader ecosystem problems (window-system support for UMF, for instance)
> > are solved, that layer can be peeled back.  The core driver will already
> > be ready for it.
> > 
> > To that end, the point of the DRM scheduler in Xe isn't to schedule
> > jobs.  It's to resolve syncobj and dma-buf implicit sync dependencies
> > and stuff jobs into their respective context/engine queue once they're
> > ready.  All the actual scheduling happens in firmware and any scheduling
> > the kernel does to deal with contention, oversubscriptions, too many
> > contexts, etc. is between contexts/engines, not individual jobs.  Sure,
> > the individual job visibility is nice, but if we design around it, we'll
> > never get to the glorious future.
> > 
> > I really need to turn the above (with a bit more detail) into a blog
> > post.... Maybe I'll do that this week.
> > 
> > In any case, I hope that provides more insight into why Xe is designed
> > the way it is and why I'm pushing back so hard on trying to make it more
> > of a "classic" driver as far as scheduling is concerned.  Are there
> > potential problems here?  Yes, that's why Xe has been labeled a
> > prototype.  Are such radical changes necessary to get to said glorious
> > future?  Yes, I think they are.  Will it be worth it?  I believe so.
> 
> Right, that's all solid I think. My takeaway is that frontend priority
> sorting and that stuff isn't needed and that is okay. And that there are
> multiple options to maybe improve drm scheduler, like the aforementioned
> making it deal with out-of-order completion, or splitting it into
> functional components, or splitting frontend/backend as you suggested.
> For most of them the cost vs benefit is more or less unclear, as is how
> much effort was invested in looking into them.
> 
> One thing I missed from this explanation is how drm_scheduler per engine
> class interferes with the high level concepts. And I did not manage to pick
> up on what exactly is the TDR problem in that case. Maybe the two are one
> and the same.
> 
> Bottom line is I still have the concern that conversion to kworkers has an
> opportunity to regress. Possibly more opportunity for some Xe use cases than
> to affect other vendors, since they would still be using per physical engine
> / queue scheduler instances.
> 

We certainly don't want to affect other vendors but I haven't yet heard
any push back from other vendors. I don't think speculating about
potential problems is helpful.

> And to put my money where my mouth is I will try to put testing Xe inside
> the full blown ChromeOS environment in my team plans. It would probably also
> be beneficial if Xe team could take a look at real world behaviour of the
> extreme transcode use cases too. If the stack is ready for that and all. It
> would be better to know earlier rather than later if there is a fundamental
> issue.
>

We don't have a media UMD yet, so it will be tough to test at this point
in time. Also, I am not sure when Xe is going to be POR for a Chrome
product either, so porting Xe into ChromeOS likely isn't a top priority
for your team. I know from experience that porting things into ChromeOS
isn't trivial, as I've supported several of these efforts. Not saying
don't do this, just mentioning the realities of what you are suggesting.

Matt

> For the patch at hand, and the cover letter, it certainly feels it would
> benefit to record the past design discussion had with AMD folks, to
> explicitly copy other drivers, and to record the theoretical pros and cons
> of threads vs unbound workers as I have tried to highlight them.
> 
> Regards,
> 
> Tvrtko
Daniel Vetter Jan. 11, 2023, 9:47 p.m. UTC | #33
On Tue, 10 Jan 2023 at 09:46, Boris Brezillon
<boris.brezillon@collabora.com> wrote:
>
> Hi Daniel,
>
> On Mon, 9 Jan 2023 21:40:21 +0100
> Daniel Vetter <daniel@ffwll.ch> wrote:
>
> > On Mon, Jan 09, 2023 at 06:17:48PM +0100, Boris Brezillon wrote:
> > > Hi Jason,
> > >
> > > On Mon, 9 Jan 2023 09:45:09 -0600
> > > Jason Ekstrand <jason@jlekstrand.net> wrote:
> > >
> > > > On Thu, Jan 5, 2023 at 1:40 PM Matthew Brost <matthew.brost@intel.com>
> > > > wrote:
> > > >
> > > > > On Mon, Jan 02, 2023 at 08:30:19AM +0100, Boris Brezillon wrote:
> > > > > > On Fri, 30 Dec 2022 12:55:08 +0100
> > > > > > Boris Brezillon <boris.brezillon@collabora.com> wrote:
> > > > > >
> > > > > > > On Fri, 30 Dec 2022 11:20:42 +0100
> > > > > > > Boris Brezillon <boris.brezillon@collabora.com> wrote:
> > > > > > >
> > > > > > > > Hello Matthew,
> > > > > > > >
> > > > > > > > On Thu, 22 Dec 2022 14:21:11 -0800
> > > > > > > > Matthew Brost <matthew.brost@intel.com> wrote:
> > > > > > > >
> > > > > > > > > In XE, the new Intel GPU driver, a choice has made to have a 1 to 1
> > > > > > > > > mapping between a drm_gpu_scheduler and drm_sched_entity. At first
> > > > > this
> > > > > > > > > seems a bit odd but let us explain the reasoning below.
> > > > > > > > >
> > > > > > > > > 1. In XE the submission order from multiple drm_sched_entity is not
> > > > > > > > > guaranteed to be the same completion even if targeting the same
> > > > > hardware
> > > > > > > > > engine. This is because in XE we have a firmware scheduler, the
> > > > > GuC,
> > > > > > > > > which allowed to reorder, timeslice, and preempt submissions. If a
> > > > > using
> > > > > > > > > shared drm_gpu_scheduler across multiple drm_sched_entity, the TDR
> > > > > falls
> > > > > > > > > apart as the TDR expects submission order == completion order.
> > > > > Using a
> > > > > > > > > dedicated drm_gpu_scheduler per drm_sched_entity solve this
> > > > > problem.
> > > > > > > >
> > > > > > > > Oh, that's interesting. I've been trying to solve the same sort of
> > > > > > > > issues to support Arm's new Mali GPU which is relying on a
> > > > > FW-assisted
> > > > > > > > scheduling scheme (you give the FW N streams to execute, and it does
> > > > > > > > the scheduling between those N command streams, the kernel driver
> > > > > > > > does timeslice scheduling to update the command streams passed to the
> > > > > > > > FW). I must admit I gave up on using drm_sched at some point, mostly
> > > > > > > > because the integration with drm_sched was painful, but also because
> > > > > I
> > > > > > > > felt trying to bend drm_sched to make it interact with a
> > > > > > > > timeslice-oriented scheduling model wasn't really future proof.
> > > > > Giving
> > > > > > > > drm_sched_entity exlusive access to a drm_gpu_scheduler probably
> > > > > might
> > > > > > > > help for a few things (didn't think it through yet), but I feel it's
> > > > > > > > coming short on other aspects we have to deal with on Arm GPUs.
> > > > > > >
> > > > > > > Ok, so I just had a quick look at the Xe driver and how it
> > > > > > > instantiates the drm_sched_entity and drm_gpu_scheduler, and I think I
> > > > > > > have a better understanding of how you get away with using drm_sched
> > > > > > > while still controlling how scheduling is really done. Here
> > > > > > > drm_gpu_scheduler is just a dummy abstract that let's you use the
> > > > > > > drm_sched job queuing/dep/tracking mechanism. The whole run-queue
> > > > >
> > > > > You nailed it here, we use the DRM scheduler for queuing jobs,
> > > > > dependency tracking and releasing jobs to be scheduled when dependencies
> > > > > are met, and lastly a tracking mechanism of inflights jobs that need to
> > > > > be cleaned up if an error occurs. It doesn't actually do any scheduling
> > > > > aside from the most basic level of not overflowing the submission ring
> > > > > buffer. In this sense, a 1 to 1 relationship between entity and
> > > > > scheduler fits quite well.
> > > > >
> > > >
> > > > Yeah, I think there's an annoying difference between what AMD/NVIDIA/Intel
> > > > want here and what you need for Arm thanks to the number of FW queues
> > > > available. I don't remember the exact number of GuC queues but it's at
> > > > least 1k. This puts it in an entirely different class from what you have on
> > > > Mali. Roughly, there's about three categories here:
> > > >
> > > >  1. Hardware where the kernel is placing jobs on actual HW rings. This is
> > > > old Mali, Intel Haswell and earlier, and probably a bunch of others.
> > > > (Intel BDW+ with execlists is a weird case that doesn't fit in this
> > > > categorization.)
> > > >
> > > >  2. Hardware (or firmware) with a very limited number of queues where
> > > > you're going to have to juggle in the kernel in order to run desktop Linux.
> > > >
> > > >  3. Firmware scheduling with a high queue count. In this case, you don't
> > > > want the kernel scheduling anything. Just throw it at the firmware and let
> > > > it go brrrrr.  If we ever run out of queues (unlikely), the kernel can
> > > > temporarily pause some low-priority contexts and do some juggling or,
> > > > frankly, just fail userspace queue creation and tell the user to close some
> > > > windows.
> > > >
> > > > The existence of this 2nd class is a bit annoying but it's where we are. I
> > > > think it's worth recognizing that Xe and panfrost are in different places
> > > > here and will require different designs. For Xe, we really are just using
> > > > drm/scheduler as a front-end and the firmware does all the real scheduling.
> > > >
> > > > How do we deal with class 2? That's an interesting question.  We may
> > > > eventually want to break that off into a separate discussion and not litter
> > > > the Xe thread but let's keep going here for a bit.  I think there are some
> > > > pretty reasonable solutions but they're going to look a bit different.
> > > >
> > > > The way I did this for Xe with execlists was to keep the 1:1:1 mapping
> > > > between drm_gpu_scheduler, drm_sched_entity, and userspace xe_engine.
> > > > Instead of feeding a GuC ring, though, it would feed a fixed-size execlist
> > > > ring and then there was a tiny kernel which operated entirely in IRQ
> > > > handlers which juggled those execlists by smashing HW registers.  For
> > > > Panfrost, I think we want something slightly different but can borrow some
> > > > ideas here.  In particular, have the schedulers feed kernel-side SW queues
> > > > (they can even be fixed-size if that helps) and then have a kthread which
> > > > juggles those and feeds the limited FW queues.
> > > > enough active contexts to fit them all in FW, I do think it's best to have
> > > > them all active in FW and let it schedule. But with only 31, you need to be
> > > > able to juggle if you run out.
> > >
> > > That's more or less what I do right now, except I don't use the
> > > drm_sched front-end to handle deps or queue jobs (at least not yet). The
> > > kernel-side timeslice-based scheduler juggling with runnable queues
> > > (queues with pending jobs that are not yet resident on a FW slot)
> > > uses a dedicated ordered-workqueue instead of a thread, with scheduler
> > > ticks being handled with a delayed-work (tick happening every X
> > > milliseconds when queues are waiting for a slot). It all seems very
> > > HW/FW-specific though, and I think it's a bit premature to try to
> > > generalize that part, but the dep-tracking logic implemented by
> > > drm_sched looked like something I could easily re-use, hence my
> > > interest in Xe's approach.
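A minimal sketch of the delayed-work tick pattern described there, with
every name hypothetical (this is not the pancsf code):

#include <linux/jiffies.h>
#include <linux/workqueue.h>

#define TICK_PERIOD_MS 10 /* assumed timeslice granularity */

struct fw_sched {
        struct workqueue_struct *wq; /* from alloc_ordered_workqueue() */
        struct delayed_work tick;
        /* per-priority runnable queues and FW slot state elided */
};

/* Stub: check whether any runnable queue lacks a FW slot. */
static bool queues_waiting_for_slot(struct fw_sched *s)
{
        return false;
}

static void fw_sched_tick(struct work_struct *work)
{
        struct delayed_work *dw = to_delayed_work(work);
        struct fw_sched *s = container_of(dw, struct fw_sched, tick);

        /* Rotate queues on/off the limited FW slots here. */

        /* Re-arm only while some queue is still waiting for a slot. */
        if (queues_waiting_for_slot(s))
                queue_delayed_work(s->wq, &s->tick,
                                   msecs_to_jiffies(TICK_PERIOD_MS));
}

Setup would be INIT_DELAYED_WORK(&s->tick, fw_sched_tick) with s->wq
created by alloc_ordered_workqueue(), so ticks are serialized against
the submission work items on the same queue.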
> >
> > So another option for these few fw queue slots schedulers would be to
> > treat them as vram and enlist ttm.
> >
> > Well maybe more enlist ttm and less treat them like vram, but ttm can
> > handle idr (or xarray or whatever you want) and then help you with all the
> > pipelining (and the drm_sched then with sorting out dependencies). If you
> > then also preferentially "evict" low-priority queues you pretty much have
> > the perfect thing.
> >
> > Note that GuC with sriov splits up the id space and together with some
> > restrictions due to multi-engine contexts, media might also need this
> > all.
> >
> > If you're balking at the idea of enlisting ttm just for fw queue
> > management, amdgpu has a shoddy version of id allocation for their vm/tlb
> > index allocation. Might be worth it to instead lift that into some sched
> > helper code.
>
> Would you mind pointing me to the amdgpu code you're mentioning here?
> Still have a hard time seeing what TTM has to do with scheduling, but I
> also don't know much about TTM, so I'll keep digging.

ttm is about moving stuff in&out of a limited space and gives you some
nice tooling for pipelining it all. It doesn't care whether that space
is vram or some limited id space. vmwgfx used ttm as an id manager
iirc.

> > Either way there are two imo rather solid approaches available to sort this
> > out. And once you have that, then there shouldn't be any big difference in
> > driver design between fw with defacto unlimited queue ids, and those with
> > severe restrictions in number of queues.
>
> Honestly, I don't think there's much difference between those two cases
> already. There's just a bunch of additional code to schedule queues on
> FW slots for the limited-number-of-FW-slots case, which, right now, is
> driver specific. The job queuing front-end pretty much achieves what
> drm_sched does already: queuing jobs to entities, checking deps,
> submitting job to HW (in our case, writing to the command stream ring
> buffer). Things start to differ after that point: once a scheduling
> entity has pending jobs, we add it to one of the runnable queues (one
> queue per prio) and kick the kernel-side timeslice-based scheduler to
> re-evaluate, if needed.
>
> I'm all for using generic code when it makes sense, even if that means
> adding this common code when it doesn't exists, but I don't want to be
> dragged into some major refactoring that might take years to land.
> Especially if pancsf is the first
> FW-assisted-scheduler-with-few-FW-slot driver.

I don't see where there's a major refactoring that you're getting dragged into?

Yes there's a huge sprawling discussion right now, but I think that's
just largely people getting confused.

Wrt the actual id assignment stuff, in amdgpu at least it's a few lines
of code. See the amdgpu_vmid_grab stuff for the simplest starting
point.
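In its simplest form that bookkeeping can be as small as an ida over the
FW slot space; a sketch, and deliberately not what amdgpu_vmid_grab does
(which additionally pipelines id reuse against fences):

#include <linux/idr.h>

#define NUM_FW_SLOTS 31 /* e.g. the Mali slot count mentioned earlier */

static DEFINE_IDA(fw_slot_ida);

static int fw_slot_get(void)
{
        /* Returns a free slot id in [0, NUM_FW_SLOTS), or -ENOSPC. */
        return ida_alloc_max(&fw_slot_ida, NUM_FW_SLOTS - 1, GFP_KERNEL);
}

static void fw_slot_put(int slot)
{
        ida_free(&fw_slot_ida, slot);
}

On -ENOSPC the driver would evict a lower-priority queue from its slot
and retry, which is where the eviction/pipelining helpers come in.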

And also, yes, a scheduler frontend for dependency sorting shouldn't
really be that big a thing, so there's not going to be huge amounts of
code sharing in the end. It's the conceptual sharing, and sharing
stuff like drm_sched_entity to eventually build some cross-driver gpu
context stuff on top, that really is going to matter.

Also like I mentioned, at least in some cases i915-guc might also have
a need for fw scheduler slot allocation for a bunch of running things.

Finally I'm a bit confused why you're building a time sharing
scheduler in the kernel if you have one in fw already. Or do I get
that part wrong?
-Daniel

> Here's a link to my WIP branch [1], and here is the scheduler logic
> [2] if you want to have a look. Don't pay too much attention to the
> driver uAPI (it's being redesigned).
>
> Regards,
>
> Boris
>
> [1]https://gitlab.freedesktop.org/bbrezillon/linux/-/tree/pancsf
> [2]https://gitlab.freedesktop.org/bbrezillon/linux/-/blob/pancsf/drivers/gpu/drm/pancsf/pancsf_sched.c
Jason Ekstrand Jan. 11, 2023, 10:18 p.m. UTC | #34
On Wed, Jan 11, 2023 at 2:50 AM Tvrtko Ursulin <
tvrtko.ursulin@linux.intel.com> wrote:

>
> On 10/01/2023 14:08, Jason Ekstrand wrote:
> > On Tue, Jan 10, 2023 at 5:28 AM Tvrtko Ursulin
> > <tvrtko.ursulin@linux.intel.com <mailto:tvrtko.ursulin@linux.intel.com>>
>
> > wrote:
> >
> >
> >
> >     On 09/01/2023 17:27, Jason Ekstrand wrote:
> >
> >     [snip]
> >
> >      >      >>> AFAICT it proposes to have 1:1 between *userspace*
> created
> >      >     contexts (per
> >      >      >>> context _and_ engine) and drm_sched. I am not sure
> avoiding
> >      >     invasive changes
> >      >      >>> to the shared code is in the spirit of the overall idea
> >     and instead
> >      >      >>> opportunity should be used to look at way to
> >     refactor/improve
> >      >     drm_sched.
> >      >
> >      >
> >      > Maybe?  I'm not convinced that what Xe is doing is an abuse at
> >     all or
> >      > really needs to drive a re-factor.  (More on that later.)
> >     There's only
> >      > one real issue which is that it fires off potentially a lot of
> >     kthreads.
> >      > Even that's not that bad given that kthreads are pretty light and
> >     you're
> >      > not likely to have more kthreads than userspace threads which are
> >     much
> >      > heavier.  Not ideal, but not the end of the world either.
> >     Definitely
> >      > something we can/should optimize but if we went through with Xe
> >     without
> >      > this patch, it would probably be mostly ok.
> >      >
> >      >      >> Yes, it is 1:1 *userspace* engines and drm_sched.
> >      >      >>
> >      >      >> I'm not really prepared to make large changes to DRM
> >     scheduler
> >      >     at the
> >      >      >> moment for Xe as they are not really required nor does
> Boris
> >      >     seem they
> >      >      >> will be required for his work either. I am interested to
> see
> >      >     what Boris
> >      >      >> comes up with.
> >      >      >>
> >      >      >>> Even on the low level, the idea to replace drm_sched
> threads
> >      >     with workers
> >      >      >>> has a few problems.
> >      >      >>>
> >      >      >>> To start with, the pattern of:
> >      >      >>>
> >      >      >>>    while (not_stopped) {
> >      >      >>>     keep picking jobs
> >      >      >>>    }
> >      >      >>>
> >      >      >>> Feels fundamentally in disagreement with workers (while
> >      >     obviously fits
> >      >      >>> perfectly with the current kthread design).
> >      >      >>
> >      >      >> The while loop breaks and the worker exits if no jobs are
> ready.
> >      >
> >      >
> >      > I'm not very familiar with workqueues. What are you saying would
> fit
> >      > better? One scheduling job per work item rather than one big work
> >     item
> >      > which handles all available jobs?
> >
> >     Yes and no, it indeed IMO does not fit to have a work item which is
> >     potentially unbound in runtime. But it is a bit of a moot conceptual
> >     mismatch because it is a worst case / theoretical, and I think due to
> >     more fundamental concerns.
> >
> >     If we have to go back to the low level side of things, I've picked
> this
> >     random spot to consolidate what I have already mentioned and perhaps
> >     expand.
> >
> >     To start with, let me pull out some thoughts from workqueue.rst:
> >
> >     """
> >     Generally, work items are not expected to hog a CPU and consume many
> >     cycles. That means maintaining just enough concurrency to prevent
> work
> >     processing from stalling should be optimal.
> >     """
> >
> >     For unbound queues:
> >     """
> >     The responsibility of regulating concurrency level is on the users.
> >     """
> >
> >     Given the unbound queues will be spawned on demand to service all
> >     queued
> >     work items (more interesting when mixing up with the
> >     system_unbound_wq),
> >     in the proposed design the number of instantiated worker threads does
> >     not correspond to the number of user threads (as you have elsewhere
> >     stated), but pessimistically to the number of active user contexts.
> >
> >
> > Those are pretty much the same in practice.  Rather, user threads is
> > typically an upper bound on the number of contexts.  Yes, a single user
> > thread could have a bunch of contexts but basically nothing does that
> > except IGT.  In real-world usage, it's at most one context per user
> thread.
>
> Typically is the key here. But I am not sure it is good enough. Consider
> this example - Intel Flex 170:
>
>   * Delivers up to 36 streams 1080p60 transcode throughput per card.
>   * When scaled to 10 cards in a 4U server configuration, it can support
> up to 360 streams of HEVC/HEVC 1080p60 transcode throughput.
>

I had a feeling it was going to be media.... 
Matthew Brost Jan. 11, 2023, 10:31 p.m. UTC | #35
On Wed, Jan 11, 2023 at 04:18:01PM -0600, Jason Ekstrand wrote:
> On Wed, Jan 11, 2023 at 2:50 AM Tvrtko Ursulin <
> tvrtko.ursulin@linux.intel.com> wrote:
> 
> >
> > On 10/01/2023 14:08, Jason Ekstrand wrote:
> > > On Tue, Jan 10, 2023 at 5:28 AM Tvrtko Ursulin
> > > <tvrtko.ursulin@linux.intel.com <mailto:tvrtko.ursulin@linux.intel.com>>
> >
> > > wrote:
> > >
> > >
> > >
> > >     On 09/01/2023 17:27, Jason Ekstrand wrote:
> > >
> > >     [snip]
> > >
> > > Those are pretty much the same in practice.  Rather, user threads is
> > > typically an upper bound on the number of contexts.  Yes, a single user
> > > thread could have a bunch of contexts but basically nothing does that
> > > except IGT.  In real-world usage, it's at most one context per user
> > thread.
> >
> > Typically is the key here. But I am not sure it is good enough. Consider
> > this example - Intel Flex 170:
> >
> >   * Delivers up to 36 streams 1080p60 transcode throughput per card.
> >   * When scaled to 10 cards in a 4U server configuration, it can support
> > up to 360 streams of HEVC/HEVC 1080p60 transcode throughput.
> >
> 
> I had a feeling it was going to be media.... 
Jason Ekstrand Jan. 11, 2023, 10:56 p.m. UTC | #36
On Wed, Jan 11, 2023 at 4:32 PM Matthew Brost <matthew.brost@intel.com>
wrote:

> On Wed, Jan 11, 2023 at 04:18:01PM -0600, Jason Ekstrand wrote:
> > On Wed, Jan 11, 2023 at 2:50 AM Tvrtko Ursulin <
> > tvrtko.ursulin@linux.intel.com> wrote:
> >
> > >
> > > On 10/01/2023 14:08, Jason Ekstrand wrote:
> > > > On Tue, Jan 10, 2023 at 5:28 AM Tvrtko Ursulin
> > > > <tvrtko.ursulin@linux.intel.com <mailto:
> tvrtko.ursulin@linux.intel.com>>
> > >
> > > > wrote:
> > > >
> > > >
> > > >
> > > >     On 09/01/2023 17:27, Jason Ekstrand wrote:
> > > >
> > > >     [snip]
> > > >
> > > > Those are pretty much the same in practice.  Rather, user threads is
> > > > typically an upper bound on the number of contexts.  Yes, a single user
> > > > thread could have a bunch of contexts but basically nothing does that
> > > > except IGT.  In real-world usage, it's at most one context per user
> > > > thread.
> > >
> > > Typically is the key here. But I am not sure it is good enough. Consider
> > > this example - Intel Flex 170:
> > >
> > >   * Delivers up to 36 streams 1080p60 transcode throughput per card.
> > >   * When scaled to 10 cards in a 4U server configuration, it can support
> > > up to 360 streams of HEVC/HEVC 1080p60 transcode throughput.
> > >
> >
> > I had a feeling it was going to be media.... 
Boris Brezillon Jan. 12, 2023, 9:10 a.m. UTC | #37
Hi Daniel,

On Wed, 11 Jan 2023 22:47:02 +0100
Daniel Vetter <daniel@ffwll.ch> wrote:

> On Tue, 10 Jan 2023 at 09:46, Boris Brezillon
> <boris.brezillon@collabora.com> wrote:
> >
> > Hi Daniel,
> >
> > On Mon, 9 Jan 2023 21:40:21 +0100
> > Daniel Vetter <daniel@ffwll.ch> wrote:
> >  
> > > On Mon, Jan 09, 2023 at 06:17:48PM +0100, Boris Brezillon wrote:  
> > > > Hi Jason,
> > > >
> > > > On Mon, 9 Jan 2023 09:45:09 -0600
> > > > Jason Ekstrand <jason@jlekstrand.net> wrote:
> > > >  
> > > > > On Thu, Jan 5, 2023 at 1:40 PM Matthew Brost <matthew.brost@intel.com>
> > > > > wrote:
> > > > >  
> > > > > > On Mon, Jan 02, 2023 at 08:30:19AM +0100, Boris Brezillon wrote:  
> > > > > > > On Fri, 30 Dec 2022 12:55:08 +0100
> > > > > > > Boris Brezillon <boris.brezillon@collabora.com> wrote:
> > > > > > >  
> > > > > > > > On Fri, 30 Dec 2022 11:20:42 +0100
> > > > > > > > Boris Brezillon <boris.brezillon@collabora.com> wrote:
> > > > > > > >  
> > > > > > > > > Hello Matthew,
> > > > > > > > >
> > > > > > > > > On Thu, 22 Dec 2022 14:21:11 -0800
> > > > > > > > > Matthew Brost <matthew.brost@intel.com> wrote:
> > > > > > > > >  
> > > > > > > > > > In XE, the new Intel GPU driver, a choice has made to have a 1 to 1
> > > > > > > > > > mapping between a drm_gpu_scheduler and drm_sched_entity. At first this
> > > > > > > > > > seems a bit odd but let us explain the reasoning below.
> > > > > > > > > >
> > > > > > > > > > 1. In XE the submission order from multiple drm_sched_entity is not
> > > > > > > > > > guaranteed to be the same completion even if targeting the same hardware
> > > > > > > > > > engine. This is because in XE we have a firmware scheduler, the GuC,
> > > > > > > > > > which allowed to reorder, timeslice, and preempt submissions. If a using
> > > > > > > > > > shared drm_gpu_scheduler across multiple drm_sched_entity, the TDR falls
> > > > > > > > > > apart as the TDR expects submission order == completion order. Using a
> > > > > > > > > > dedicated drm_gpu_scheduler per drm_sched_entity solve this problem.
> > > > > > > > >
> > > > > > > > > Oh, that's interesting. I've been trying to solve the same sort of
> > > > > > > > > issues to support Arm's new Mali GPU which is relying on a FW-assisted
> > > > > > > > > scheduling scheme (you give the FW N streams to execute, and it does
> > > > > > > > > the scheduling between those N command streams, the kernel driver
> > > > > > > > > does timeslice scheduling to update the command streams passed to the
> > > > > > > > > FW). I must admit I gave up on using drm_sched at some point, mostly
> > > > > > > > > > because the integration with drm_sched was painful, but also because I
> > > > > > > > > > felt trying to bend drm_sched to make it interact with a
> > > > > > > > > > timeslice-oriented scheduling model wasn't really future proof. Giving
> > > > > > > > > > drm_sched_entity exclusive access to a drm_gpu_scheduler probably might
> > > > > > > > > > help for a few things (didn't think it through yet), but I feel it's
> > > > > > > > > coming short on other aspects we have to deal with on Arm GPUs.  
> > > > > > > >
> > > > > > > > Ok, so I just had a quick look at the Xe driver and how it
> > > > > > > > instantiates the drm_sched_entity and drm_gpu_scheduler, and I think I
> > > > > > > > have a better understanding of how you get away with using drm_sched
> > > > > > > > while still controlling how scheduling is really done. Here
> > > > > > > > drm_gpu_scheduler is just a dummy abstraction that lets you use the
> > > > > > > > drm_sched job queuing/dep/tracking mechanism. The whole run-queue  
> > > > > >
> > > > > > You nailed it here, we use the DRM scheduler for queuing jobs,
> > > > > > dependency tracking and releasing jobs to be scheduled when dependencies
> > > > > > are met, and lastly a tracking mechanism of in-flight jobs that need to
> > > > > > be cleaned up if an error occurs. It doesn't actually do any scheduling
> > > > > > aside from the most basic level of not overflowing the submission ring
> > > > > > buffer. In this sense, a 1 to 1 relationship between entity and
> > > > > > scheduler fits quite well.
> > > > > >  
> > > > >
> > > > > Yeah, I think there's an annoying difference between what AMD/NVIDIA/Intel
> > > > > want here and what you need for Arm thanks to the number of FW queues
> > > > > available. I don't remember the exact number of GuC queues but it's at
> > > > > least 1k. This puts it in an entirely different class from what you have on
> > > > > Mali. Roughly, there's about three categories here:
> > > > >
> > > > >  1. Hardware where the kernel is placing jobs on actual HW rings. This is
> > > > > old Mali, Intel Haswell and earlier, and probably a bunch of others.
> > > > > (Intel BDW+ with execlists is a weird case that doesn't fit in this
> > > > > categorization.)
> > > > >
> > > > >  2. Hardware (or firmware) with a very limited number of queues where
> > > > > you're going to have to juggle in the kernel in order to run desktop Linux.
> > > > >
> > > > >  3. Firmware scheduling with a high queue count. In this case, you don't
> > > > > want the kernel scheduling anything. Just throw it at the firmware and let
> > > > > it go brrrrr.  If we ever run out of queues (unlikely), the kernel can
> > > > > temporarily pause some low-priority contexts and do some juggling or,
> > > > > frankly, just fail userspace queue creation and tell the user to close some
> > > > > windows.
> > > > >
> > > > > The existence of this 2nd class is a bit annoying but it's where we are. I
> > > > > think it's worth recognizing that Xe and panfrost are in different places
> > > > > here and will require different designs. For Xe, we really are just using
> > > > > drm/scheduler as a front-end and the firmware does all the real scheduling.
> > > > >
> > > > > How do we deal with class 2? That's an interesting question.  We may
> > > > > eventually want to break that off into a separate discussion and not litter
> > > > > the Xe thread but let's keep going here for a bit.  I think there are some
> > > > > pretty reasonable solutions but they're going to look a bit different.
> > > > >
> > > > > The way I did this for Xe with execlists was to keep the 1:1:1 mapping
> > > > > between drm_gpu_scheduler, drm_sched_entity, and userspace xe_engine.
> > > > > Instead of feeding a GuC ring, though, it would feed a fixed-size execlist
> > > > > ring and then there was a tiny kernel which operated entirely in IRQ
> > > > > handlers which juggled those execlists by smashing HW registers.  For
> > > > > Panfrost, I think we want something slightly different but can borrow some
> > > > > ideas here.  In particular, have the schedulers feed kernel-side SW queues
> > > > > (they can even be fixed-size if that helps) and then have a kthread which
> > > > > > juggles those and feeds the limited FW queues.  In the case where you have few
> > > > > enough active contexts to fit them all in FW, I do think it's best to have
> > > > > them all active in FW and let it schedule. But with only 31, you need to be
> > > > > able to juggle if you run out.  
> > > >
> > > > That's more or less what I do right now, except I don't use the
> > > > drm_sched front-end to handle deps or queue jobs (at least not yet). The
> > > > kernel-side timeslice-based scheduler juggling with runnable queues
> > > > (queues with pending jobs that are not yet resident on a FW slot)
> > > > uses a dedicated ordered-workqueue instead of a thread, with scheduler
> > > > ticks being handled with a delayed-work (tick happening every X
> > > > milliseconds when queues are waiting for a slot). It all seems very
> > > > HW/FW-specific though, and I think it's a bit premature to try to
> > > > generalize that part, but the dep-tracking logic implemented by
> > > > drm_sched looked like something I could easily re-use, hence my
> > > > interest in Xe's approach.  
> > >
> > > So another option for these few fw queue slots schedulers would be to
> > > treat them as vram and enlist ttm.
> > >
> > > Well maybe more enlist ttm and less treat them like vram, but ttm can
> > > handle idr (or xarray or whatever you want) and then help you with all the
> > > pipelining (and the drm_sched then with sorting out dependencies). If you
> > > then also preferentially "evict" low-priority queues you pretty much have
> > > the perfect thing.
> > >
> > > Note that GuC with sriov splits up the id space and together with some
> > > restrictions due to multi-engine contexts media needs might also need this
> > > all.
> > >
> > > If you're balking at the idea of enlisting ttm just for fw queue
> > > management, amdgpu has a shoddy version of id allocation for their vm/tlb
> > > index allocation. Might be worth it to instead lift that into some sched
> > > helper code.  
> >
> > Would you mind pointing me to the amdgpu code you're mentioning here?
> > Still have a hard time seeing what TTM has to do with scheduling, but I
> > also don't know much about TTM, so I'll keep digging.  
> 
> ttm is about moving stuff in&out of a limited space and gives you some
> nice tooling for pipelining it all. It doesn't care whether that space
> is vram or some limited id space. vmwgfx used ttm as an id manager
> iirc.

Ok.
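
If it's only about managing the slot id space, I guess the trivial
version would be something like this (a hypothetical sketch using the
ida allocator from linux/idr.h, ignoring all the pipelining/eviction
TTM would bring, and with made-up names):

static DEFINE_IDA(hypo_slot_ida);

/* NUM_FW_SLOTS is the FW limit (8 on the HW I'm testing on). */
static int hypo_slot_get(void)
{
        /* Returns -ENOSPC when all slots are taken, which is where a
         * TTM-style eviction would kick in instead of just failing. */
        return ida_alloc_max(&hypo_slot_ida, NUM_FW_SLOTS - 1, GFP_KERNEL);
}

static void hypo_slot_put(int id)
{
        ida_free(&hypo_slot_ida, id);
}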

> 
> > > Either way there's two imo rather solid approaches available to sort this
> > > out. And once you have that, then there shouldn't be any big difference in
> > > driver design between fw with defacto unlimited queue ids, and those with
> > > severe restrictions in number of queues.  
> >
> > Honestly, I don't think there's much difference between those two cases
> > already. There's just a bunch of additional code to schedule queues on
> > FW slots for the limited-number-of-FW-slots case, which, right now, is
> > driver specific. The job queuing front-end pretty much achieves what
> > drm_sched does already: queuing job to entities, checking deps,
> > submitting job to HW (in our case, writing to the command stream ring
> > buffer). Things start to differ after that point: once a scheduling
> > entity has pending jobs, we add it to one of the runnable queues (one
> > queue per prio) and kick the kernel-side timeslice-based scheduler to
> > re-evaluate, if needed.
> >
> > I'm all for using generic code when it makes sense, even if that means
> > adding this common code when it doesn't exist, but I don't want to be
> > dragged into some major refactoring that might take years to land.
> > Especially if pancsf is the first
> > FW-assisted-scheduler-with-few-FW-slot driver.  
> 
> I don't see where there's a major refactoring that you're getting dragged into?

Oh, no, I'm not saying this is the case just yet, just wanted to make
sure we're on the same page :-).

> 
> Yes there's a huge sprawling discussion right now, but I think that's
> just largely people getting confused.

I definitely am :-).

> 
> Wrt the actual id assignment stuff, in amdgpu at least it's few lines
> of code. See the amdgpu_vmid_grab stuff for the simplest starting
> point.

Ok, thanks for the pointers. I'll have a look and see how I could use
that. I guess that's about getting access to the FW slots with some
sort of priority+FIFO ordering guarantees given by TTM. If that's the
case, I'll have to think about it, because that's a major shift from
what we're doing now, and I'm afraid this could lead to starving
non-resident entities if all resident entities keep receiving new jobs
to execute. Unless we put some sort of barrier when giving access to a
slot, so we evict the entity when it's done executing the stuff it had
when it was given access to this slot. But then, again, there are other
constraints to take into account for the Arm Mali CSF case:

- it's more efficient to update all FW slots at once, because each
  update of a slot might require updating priorities of the other slots
  (FW mandates unique slot priorities, and those priorities depend on
  the entity priority/queue-ordering, see the sketch below)
- context/FW slot switches have a non-negligible cost (FW needs to
  suspend the context and save the state every time there is such a
  switch), so limiting the number of FW slot updates might prove
  important
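
To illustrate the first constraint, the kind of one-pass update I have
in mind looks roughly like this (all names are hypothetical, a sketch
of the idea rather than the actual driver code):

/* FW mandates unique per-slot priorities, so changing one slot
 * potentially renumbers all resident groups. Doing it in a single
 * pass means a single FW notification for the whole update. */
static void hypo_update_all_slot_prios(struct hypo_sched *sched)
{
        struct hypo_group *grp;
        u8 fw_prio = NUM_FW_SLOTS - 1;

        /* resident_list is kept sorted by entity priority/queue order. */
        list_for_each_entry(grp, &sched->resident_list, resident_node)
                grp->slot_prio = fw_prio--;

        hypo_fw_commit_slot_prios(sched);       /* one doorbell to the FW */
}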

> 
> And also yes a scheduler frontend for dependency sorting shouldn't
> really be that big a thing, so there's not going to be huge amounts of
> code sharing in the end.

Agreed.

> It's the conceptual sharing, and sharing
> stuff like drm_sched_entity to eventually build some cross driver gpu
> context stuff on top that really is going to matter.

And I agree with that too.

> 
> Also like I mentioned, at least in some cases i915-guc might also have
> a need for fw scheduler slot allocation for a bunch of running things.

Ok.

> 
> Finally I'm a bit confused why you're building a time sharing
> scheduler in the kernel if you have one in fw already. Or do I get
> that part wrong?

It's here to overcome the low number of FW slots (which is as low as 8
on the HW I'm testing on). If you don't do time-sharing scheduling
kernel-side, you have no guarantee of fairness, since one could keep
queuing jobs to an entity/queue, making it permanently resident,
without ever giving non-resident entities/queues a chance to run. To
sum up, the scheduler is not entirely handled by the FW, it's a mixed
design, where part of it is in the FW (scheduling between the currently
active entities passed to the FW), and the other part is in the kernel
driver (rotating runnable entities on the limited number of FW slots we
have). But overall, it shouldn't make a difference compared to Xe. The
fact some of the scheduling happens kernel-side is completely opaque to
the drm_sched_entity frontend if we go the Xe way (one
drm_gpu_scheduler per drm_sched_entity, real scheduling handled by some
black box, either entirely in the FW, or with shared responsibility
between FW and kernel).
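
For the record, the kernel-side rotation described above boils down to
something like the following tick handler, run as a delayed work on a
dedicated ordered workqueue. Again, every name here is hypothetical, a
sketch of the idea rather than the actual code:

static void hypo_sched_tick(struct work_struct *work)
{
        struct hypo_sched *sched =
                container_of(work, struct hypo_sched, tick_work.work);
        int i;

        for (i = 0; i < NUM_FW_SLOTS; i++) {
                struct hypo_slot *slot = &sched->slots[i];
                struct hypo_entity *next;

                if (!slot->entity || !hypo_timeslice_expired(slot->entity))
                        continue;

                next = list_first_entry_or_null(&sched->runnable,
                                                struct hypo_entity, node);
                if (!next)
                        continue;

                /* Costly FW operation: the context is suspended and its
                 * state saved before the slot is handed over. */
                list_del_init(&next->node);
                hypo_fw_suspend_slot(slot);
                list_add_tail(&slot->entity->node, &sched->runnable);
                slot->entity = next;
                hypo_fw_program_slot(slot, next);
        }

        /* Unique slot priorities are then recomputed in one go (see
         * the earlier sketch). */
        hypo_update_all_slot_prios(sched);

        /* Only rearm the tick while entities are waiting for a slot. */
        if (!list_empty(&sched->runnable))
                queue_delayed_work(sched->wq, &sched->tick_work,
                                   msecs_to_jiffies(HYPO_TICK_PERIOD_MS));
}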

Regards,

Boris
Daniel Vetter Jan. 12, 2023, 9:32 a.m. UTC | #38
On Thu, Jan 12, 2023 at 10:10:53AM +0100, Boris Brezillon wrote:
> Hi Daniel,
> 
> On Wed, 11 Jan 2023 22:47:02 +0100
> Daniel Vetter <daniel@ffwll.ch> wrote:
> 
> > On Tue, 10 Jan 2023 at 09:46, Boris Brezillon
> > <boris.brezillon@collabora.com> wrote:
> > >
> > > Hi Daniel,
> > >
> > > On Mon, 9 Jan 2023 21:40:21 +0100
> > > Daniel Vetter <daniel@ffwll.ch> wrote:
> > >  
> > > > On Mon, Jan 09, 2023 at 06:17:48PM +0100, Boris Brezillon wrote:  
> > > > > [snip]
> > >
> > > I'm all for using generic code when it makes sense, even if that means
> > > adding this common code when it doesn't exist, but I don't want to be
> > > dragged into some major refactoring that might take years to land.
> > > Especially if pancsf is the first
> > > FW-assisted-scheduler-with-few-FW-slot driver.  
> > 
> > I don't see where there's a major refactoring that you're getting dragged into?
> 
> Oh, no, I'm not saying this is the case just yet, just wanted to make
> sure we're on the same page :-).
> 
> > 
> > Yes there's a huge sprawling discussion right now, but I think that's
> > just largely people getting confused.
> 
> I definitely am :-).
> 
> > 
> > Wrt the actual id assignment stuff, in amdgpu at least it's few lines
> > of code. See the amdgpu_vmid_grab stuff for the simplest starting
> > point.
> 
> Ok, thanks for the pointers. I'll have a look and see how I could use
> that. I guess that's about getting access to the FW slots with some
> sort of priority+FIFO ordering guarantees given by TTM. If that's the
> case, I'll have to think about it, because that's a major shift from
> what we're doing now, and I'm afraid this could lead to starving
> non-resident entities if all resident entities keep receiving new jobs
> to execute. Unless we put some sort of barrier when giving access to a
> slot, so we evict the entity when it's done executing the stuff it had
> when it was given access to this slot. But then, again, there are other
> constraints to take into account for the Arm Mali CSF case:
> 
> - it's more efficient to update all FW slots at once, because each
>   update of a slot might require updating priorities of the other slots
>   (FW mandates unique slot priorities, and those priorities depend on
>   the entity priority/queue-ordering)
> - context/FW slot switches have a non-negligible cost (FW needs to
>   suspend the context and save the state every time there is such a
>   switch), so limiting the number of FW slot updates might prove
>   important

I frankly think you're overworrying. When you have 31+ contexts running at
the same time, you have bigger problems. At that point there's two
use-cases:
1. system is overloaded, the user will reach for the reset button anyway
2. temporary situation, all you have to do is be roughly fair enough to get
   through it before case 1 happens.
 
Trying to write a perfect scheduler for this before we have actual
benchmarks that justify the effort seems like pretty serious overkill.
That's why I think the simplest solution is the one we should have:

- drm/sched frontend. If you get into slot exhaustion that alone will
  ensure enough fairness

- LRU list of slots, with dma_fence so you can pipeline/batch up changes
  as needed (but I honestly wouldn't worry about the batching before
  you've shown an actual need for this in some benchmark/workload, even
  piglit shouldn't have this many things running concurrently I think, you
  don't have that many cpu cores). Between drm/sched and the lru you will
  have an emergent scheduler that cycles through all runnable gpu jobs.

- If you want to go fancy, have eviction tricks like skipping currently
  still active gpu contexts with higher priority than the one that you need
  to find a slot for.

- You don't need time slicing in this, not even for compute. compute is
  done with preempt context fences, if you give them a minimum scheduling
  quanta you'll have a very basic round robin scheduler as an emergent
  thing.

Any workload where it matters will be scheduled by the fw directly, with
drm/sched only being the dma_fence dependency sorter. My take is that if
you spend more than a hundred or so lines on slot allocation logic
(excluding the hw code to load/unload a slot) you're probably doing some
serious overengineering.
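
To put a rough line count behind that claim, the slot juggling can be
about as dumb as this (names entirely made up, just to show the shape
of it; assumes slot_lru always contains all the slots, and waits under
the lock for simplicity):

/* Grab a slot for ctx by recycling the least-recently-used one. This
 * is the full-stall variant: instead of pipelining the context switch
 * behind the old slot's last fence, we simply wait for it. */
static struct hypo_slot *hypo_grab_slot(struct hypo_dev *dev,
                                        struct hypo_ctx *ctx)
{
        struct hypo_slot *slot;

        mutex_lock(&dev->slot_lock);
        /* LRU head is either free or the best eviction candidate. */
        slot = list_first_entry(&dev->slot_lru, struct hypo_slot, lru_node);
        list_move_tail(&slot->lru_node, &dev->slot_lru);

        if (slot->ctx != ctx) {
                if (slot->last_fence)
                        dma_fence_wait(slot->last_fence, false);
                if (slot->ctx)
                        hypo_fw_unload_ctx(slot);
                hypo_fw_load_ctx(slot, ctx);
                slot->ctx = ctx;
        }
        mutex_unlock(&dev->slot_lock);

        return slot;
}

Priority-aware eviction, pipelining behind the fence, batching etc. can
all be bolted on top later if a real workload ever shows the need.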

> > And also yes a scheduler frontend for dependency sorting shouldn't
> > really be that big a thing, so there's not going to be huge amounts of
> > code sharing in the end.
> 
> Agreed.
> 
> > It's the conceptual sharing, and sharing
> > stuff like drm_sched_entity to eventually build some cross driver gpu
> > context stuff on top that really is going to matter.
> 
> And I agree with that too.
> 
> > 
> > Also like I mentioned, at least in some cases i915-guc might also have
> > a need for fw scheduler slot allocation for a bunch of running things.
> 
> Ok.
> 
> > 
> > Finally I'm a bit confused why you're building a time sharing
> > scheduler in the kernel if you have one in fw already. Or do I get
> > that part wrong?
> 
> It's here to overcome the low number of FW-slot (which is as low as 8
> on the HW I'm testing on). If you don't do time sharing scheduling
> kernel-side, you have no guarantee of fairness, since one could keep
> queuing jobs to an entity/queue, making it permanently resident,
> without giving a chance to non-resident entities/queues to ever run. To
> sum-up, the scheduler is not entirely handled by the FW, it's a mixed
> design, where part of it is in the FW (scheduling between currently
> active entities passed to the FW), and the other part in the kernel
> driver (rotating runnable entities on the limited amount of FW slots we
> have). But overall, it shouldn't make a difference compared to Xe. The
> fact some of the scheduling happens kernel-side is completely opaque to
> the drm_sched_entity frontend if we go the Xe way (one
> drm_gpu_scheduler per drm_sched_entity, real scheduling is handled by
> some black box, either entirely in the FW, or with shared
> responsibility between FW and kernel).

See above. I don't think you need three schedulers (dma_fence sorting
frontend, kernel round robin, fw round robin) here. I'm pretty sure you do
not _want_ 3 schedulers. And if you just take the 3 pieces above, you will
have a scheduler that's Fair Enough (tm) even when you have more than 31
contexts.

I would frankly not even be surprised if you can get away with full
stalls, so not even the dma_fence pipelining is needed. Even if you stall
out a handful of contexts, there should still be 20+ available for the fw
to schedule and keep the gpu busy. After all, this is still a gpu, there's
only 2 things you need:
- fair enough to avoid completely stalling out some app and the user
  reaching the reset button
- throughput. as long as you can keep enough runnable slots for the fw to
  schedule, it really shouldn't matter how shoddily you push in new stuff.

Cheers, Daniel
Boris Brezillon Jan. 12, 2023, 10:11 a.m. UTC | #39
On Thu, 12 Jan 2023 10:32:18 +0100
Daniel Vetter <daniel@ffwll.ch> wrote:

> On Thu, Jan 12, 2023 at 10:10:53AM +0100, Boris Brezillon wrote:
> > Hi Daniel,
> > 
> > On Wed, 11 Jan 2023 22:47:02 +0100
> > Daniel Vetter <daniel@ffwll.ch> wrote:
> >   
> > > On Tue, 10 Jan 2023 at 09:46, Boris Brezillon
> > > <boris.brezillon@collabora.com> wrote:  
> > > > [snip]
> > > >
> > > > I'm all for using generic code when it makes sense, even if that means
> > > > adding this common code when it doesn't exist, but I don't want to be
> > > > dragged into some major refactoring that might take years to land.
> > > > Especially if pancsf is the first
> > > > FW-assisted-scheduler-with-few-FW-slot driver.    
> > > 
> > > I don't see where there's a major refactoring that you're getting dragged into?  
> > 
> > Oh, no, I'm not saying this is the case just yet, just wanted to make
> > sure we're on the same page :-).
> >   
> > > 
> > > Yes there's a huge sprawling discussion right now, but I think that's
> > > just largely people getting confused.  
> > 
> > I definitely am :-).
> >   
> > > 
> > > Wrt the actual id assignment stuff, in amdgpu at least it's few lines
> > > of code. See the amdgpu_vmid_grab stuff for the simplest starting
> > > point.  
> > 
> > Ok, thanks for the pointers. I'll have a look and see how I could use
> > that. I guess that's about getting access to the FW slots with some
> > sort of priority+FIFO ordering guarantees given by TTM. If that's the
> > case, I'll have to think about it, because that's a major shift from
> > what we're doing now, and I'm afraid this could lead to starving
> > non-resident entities if all resident entities keep receiving new jobs
> > to execute. Unless we put some sort of barrier when giving access to a
> > slot, so we evict the entity when it's done executing the stuff it had
> > when it was given access to this slot. But then, again, there are other
> > constraints to take into account for the Arm Mali CSF case:
> > 
> > - it's more efficient to update all FW slots at once, because each
> >   update of a slot might require updating priorities of the other slots
> >   (FW mandates unique slot priorities, and those priorities depend on
> >   the entity priority/queue-ordering)
> > - context/FW slot switches have a non-negligible cost (FW needs to
> >   suspend the context and save the state every time there is such a
> >   switch), so limiting the number of FW slot updates might prove
> >   important  
> 
> I frankly think you're overworrying. When you have 31+ contexts running at
> the same time, you have bigger problems. At that point there's two
> use-cases:
> 1. system is overloaded, the user will reach for the reset button anyway
> 2. temporary situation, all you have to do is be roughly fair enough to get
>    through it before case 1 happens.
>  
> Trying to write a perfect scheduler for this before we have actual
> benchmarks that justify the effort seems like pretty serious overkill.
> That's why I think the simplest solution is the one we should have:
> 
> - drm/sched frontend. If you get into slot exhaustion that alone will
>   ensure enough fairness

We're talking about the CS ring buffer slots here, right?

> 
> - LRU list of slots, with dma_fence so you can pipeline/batch up changes
>   as needed (but I honestly wouldn't worry about the batching before
>   you've shown an actual need for this in some benchmark/workload, even
>   piglit shouldn't have this many things running concurrently I think, you
>   don't have that many cpu cores). Between drm/sched and the lru you will
>   have an emergent scheduler that cycles through all runnable gpu jobs.
> 
> - If you want to go fancy, have eviction tricks like skipping currently
>   still active gpu contexts with higher priority than the one that you need
>   to find a slot for.
> 
> - You don't need time slicing in this, not even for compute. compute is
>   done with preempt context fences, if you give them a minimum scheduling
>   quanta you'll have a very basic round robin scheduler as an emergent
>   thing.
> 
> Any workload where it matters will be scheduled by the fw directly, with
> drm/sched only being the dma_fence dependency sorter. My take is that if
> you spend more than a hundred or so lines on slot allocation logic
> (excluding the hw code to load/unload a slot) you're probably doing some
> serious overengineering.

Let me see if I got this right:

- we still keep a 1:1 drm_gpu_scheduler:drm_sched_entity approach,
  where hw_submission_limit == available_slots_in_ring_buf
- when ->run_job() is called, we write the RUN_JOB() instruction
  sequence to the next available ringbuf slot and queue the entity to
  the FW-slot queue
  * if a slot is directly available, we program the slot directly
  * if no slots are available, but some slots are done with the jobs
    they were given (last job fence signaled), we evict the LRU entity
    (possibly taking priority into account) and use this slot for the
    new entity
  * if no slots are available and all currently assigned slots
    contain busy entities, we queue the entity to a pending list
    (possibly one list per prio)

I'll need to make sure this still works with the concept of group (it's
not a single queue we schedule, it's a group of queues, meaning that we
have N fences to watch to determine if the slot is busy or not, but
that should be okay).
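
To make that flow concrete, here is a rough C sketch of such a
->run_job() implementation. It only illustrates the steps listed above:
all the pancsf_* types and helpers are made-up stand-ins, only the
drm_sched_job entry point and the 1:1 entity/scheduler setup are real.

struct pancsf_group {
	struct drm_sched_entity entity;	/* 1:1 with its drm_gpu_scheduler */
	struct pancsf_fw_slot *slot;	/* non-NULL when resident on a FW slot */
};

static struct dma_fence *pancsf_run_job(struct drm_sched_job *sched_job)
{
	struct pancsf_group *group =
		container_of(sched_job->entity, struct pancsf_group, entity);

	/*
	 * hw_submission_limit == available ring slots, so the scheduler
	 * already guarantees there is room: just emit the RUN_JOB()
	 * instruction sequence to the next ringbuf slot.
	 */
	pancsf_ring_emit_run_job(group, sched_job);

	if (!group->slot) {
		struct pancsf_fw_slot *slot = pancsf_find_free_slot();

		if (!slot)
			/* evict an LRU group whose last job fence signaled */
			slot = pancsf_evict_idle_lru_group();

		if (slot)
			pancsf_program_slot(slot, group);
		else
			/* all slots host busy groups: per-prio pending list */
			pancsf_queue_runnable(group);
	}

	/* signaled from the IRQ handler when the FW completes the job */
	return pancsf_job_done_fence(sched_job);
}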
Boris Brezillon Jan. 12, 2023, 10:25 a.m. UTC | #40
On Thu, 12 Jan 2023 11:11:03 +0100
Boris Brezillon <boris.brezillon@collabora.com> wrote:

[...]
> Let me see if I got this right:
> 
> - we still keep a 1:1 drm_gpu_scheduler:drm_sched_entity approach,
>   where hw_submission_limit == available_slots_in_ring_buf
> - when ->run_job() is called, we write the RUN_JOB() instruction
>   sequence to the next available ringbuf slot and queue the entity to
>   the FW-slot queue
>   * if a slot is directly available, we program the slot directly
>   * if no slots are available, but some slots are done with the jobs
>     they were given (last job fence signaled), we evict the LRU entity
>     (possibly taking priority into account) and use this slot for the
>     new entity
>   * if no slots are available and all currently assigned slots
>     contain busy entities, we queue the entity to a pending list
>     (possibly one list per prio)
> 
> I'll need to make sure this still works with the concept of group (it's
> not a single queue we schedule, it's a group of queues, meaning that we
> have N fences to watch to determine if the slot is busy or not, but
> that should be okay).

Oh, there's one other thing I forgot to mention: the FW scheduler is
not entirely fair; it does take the slot priority (which has to be
unique across all currently assigned slots) into account when
scheduling groups. So, ideally, we'd want to rotate group priorities
when they share the same drm_sched_priority (probably based on the
position in the LRU).
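
Assuming resident slots sit on an LRU list, that rotation could look
something like the sketch below (hypothetical names again). It also fits
the "update all FW slots at once" constraint, since the whole priority
assignment is recomputed and committed in one go. A real version would
first order by drm_sched_priority and only use the LRU position to break
ties within a priority level.

static void pancsf_rotate_slot_prios(struct pancsf_sched *sched)
{
	struct pancsf_fw_slot *slot;
	u8 fw_prio = PANCSF_MAX_FW_SLOT_PRIO;

	/*
	 * Walk resident slots from most to least recently scheduled and
	 * hand out strictly decreasing FW priorities: unique, as the FW
	 * mandates, and rotating for groups that share the same
	 * drm_sched_priority as the LRU order evolves.
	 */
	list_for_each_entry(slot, &sched->slot_lru, lru_node) {
		if (slot->group)
			pancsf_set_slot_prio(slot, fw_prio--);
	}

	/* one FW update covering all slots */
	pancsf_commit_slot_update(sched);
}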
Boris Brezillon Jan. 12, 2023, 10:30 a.m. UTC | #41
On Thu, 12 Jan 2023 11:11:03 +0100
Boris Brezillon <boris.brezillon@collabora.com> wrote:

[...]
> Let me see if I got this right:
> 
> - we still keep a 1:1 drm_gpu_scheduler:drm_sched_entity approach,
>   where hw_submission_limit == available_slots_in_ring_buf
> - when ->run_job() is called, we write the RUN_JOB() instruction
>   sequence to the next available ringbuf slot and queue the entity to
>   the FW-slot queue
>   * if a slot is directly available, we program the slot directly
>   * if no slots are available, but some slots are done with the jobs
>     they were given (last job fence signaled), we evict the LRU entity
>     (possibly taking priority into account) and use this slot for the
>     new entity
>   * if no slots are available and all currently assigned slots
>     contain busy entities, we queue the entity to a pending list
>     (possibly one list per prio)

Forgot:

   * if the group is already resident, we just move the slot to the
     LRU list head.

> 
> I'll need to make sure this still works with the concept of group (it's
> not a single queue we schedule, it's a group of queues, meaning that we
> have N fences to watch to determine if the slot is busy or not, but
> that should be okay).
Daniel Vetter Jan. 12, 2023, 10:42 a.m. UTC | #42
On Thu, Jan 12, 2023 at 11:25:53AM +0100, Boris Brezillon wrote:
> On Thu, 12 Jan 2023 11:11:03 +0100
> Boris Brezillon <boris.brezillon@collabora.com> wrote:
> 
[...]
> > Let me see if I got this right:
> > 
> > - we still keep a 1:1 drm_gpu_scheduler:drm_sched_entity approach,
> >   where hw_submission_limit == available_slots_in_ring_buf
> > - when ->run_job() is called, we write the RUN_JOB() instruction
> >   sequence to the next available ringbuf slot and queue the entity to
> >   the FW-slot queue
> >   * if a slot is directly available, we program the slot directly
> >   * if no slots are available, but some slots are done with the jobs
> >     they were given (last job fence signaled), we evict the LRU entity
> >     (possibly taking priority into account) and use this slot for the
> >     new entity
> >   * if no slots are available and all currently assigned slots
> >     contain busy entities, we queue the entity to a pending list
> >     (possibly one list per prio)

You could also handle this in ->prepare_job, which is called after all the
default fences have signalled. That allows you to put the "wait for a
previous job to finish/unload" behind a dma_fence, which is how (I think
at least) you can get the round-robin emergent behaviour: If there's no
idle slot, you just pick all the fences from the currently busy job you
want to steal the slot from (with priority and lru taken into account),
let the scheduler wait for that to finish, and then it'll call your
run_job when the slot is already available.

Also if you do the allocation in ->prepare_job with dma_fence and not
run_job, then I think we can sort out fairness issues (if they do pop up)
in the drm/sched code instead of having to think about this in each driver.
Few fw sched slots essentially just make fw scheduling unfairness more
prominent than with other schedulers, but I don't think it's fundamentally
something else really.

If every ctx does that and the lru isn't too busted, they should then form
a nice orderly queue and cycle through the fw scheduler, while still being
able to get some work done. It's essentially the exact same thing that
happens with ttm vram eviction, when you have a total working set where
each process fits in vram individually but the total doesn't, and you
need to cycle things through.
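
A minimal sketch of what that could look like, reusing the same
hypothetical pancsf_* helpers as earlier in the thread (only the
->prepare_job hook itself is real drm/sched API; it is called repeatedly
and each returned fence is waited on until it returns NULL):

static struct dma_fence *
pancsf_prepare_job(struct drm_sched_job *sched_job,
		   struct drm_sched_entity *s_entity)
{
	struct pancsf_group *group =
		container_of(s_entity, struct pancsf_group, entity);
	struct pancsf_fw_slot *slot;

	/* Already resident: nothing to wait for beyond the job deps. */
	if (group->slot)
		return NULL;

	slot = pancsf_find_free_slot();
	if (!slot)
		/* LRU + priority heuristics pick the victim */
		slot = pancsf_pick_victim_slot();

	/* sets group->slot, so the next call returns NULL */
	pancsf_reserve_slot(slot, group);

	/*
	 * Fence gating the slot's availability: for a group this would
	 * aggregate the victim's N queue fences plus the suspend/save
	 * completion (e.g. with a dma_fence_array), or be NULL if the
	 * slot was already idle. The scheduler waits on it before
	 * calling ->run_job().
	 */
	return pancsf_slot_reuse_fence(slot);
}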

> > I'll need to make sure this still works with the concept of group (it's
> > not a single queue we schedule, it's a group of queues, meaning that we
> > have N fences to watch to determine if the slot is busy or not, but
> > that should be okay).
> 
> Oh, there's one other thing I forgot to mention: the FW scheduler is
> not entirely fair; it does take the slot priority (which has to be
> unique across all currently assigned slots) into account when
> scheduling groups. So, ideally, we'd want to rotate group priorities
> when they share the same drm_sched_priority (probably based on the
> position in the LRU).

Hm that will make things a bit more fun I guess, especially with your
constraint to not update this too often. How strict is that priority
difference? If it's a lot, we might need to treat this more like execlists
and less like a real fw scheduler ...
-Daniel
Boris Brezillon Jan. 12, 2023, 12:08 p.m. UTC | #43
On Thu, 12 Jan 2023 11:42:57 +0100
Daniel Vetter <daniel@ffwll.ch> wrote:

> On Thu, Jan 12, 2023 at 11:25:53AM +0100, Boris Brezillon wrote:
> > On Thu, 12 Jan 2023 11:11:03 +0100
> > Boris Brezillon <boris.brezillon@collabora.com> wrote:
> >   
> > > On Thu, 12 Jan 2023 10:32:18 +0100
> > > Daniel Vetter <daniel@ffwll.ch> wrote:
> > >   
> > > > On Thu, Jan 12, 2023 at 10:10:53AM +0100, Boris Brezillon wrote:    
> > > > > Hi Daniel,
> > > > > 
> > > > > On Wed, 11 Jan 2023 22:47:02 +0100
> > > > > Daniel Vetter <daniel@ffwll.ch> wrote:
> > > > >       
> > > > > > On Tue, 10 Jan 2023 at 09:46, Boris Brezillon
> > > > > > <boris.brezillon@collabora.com> wrote:      
> > > > > > >
> > > > > > > Hi Daniel,
> > > > > > >
> > > > > > > On Mon, 9 Jan 2023 21:40:21 +0100
> > > > > > > Daniel Vetter <daniel@ffwll.ch> wrote:
> > > > > > >        
> > > > > > > > On Mon, Jan 09, 2023 at 06:17:48PM +0100, Boris Brezillon wrote:        
> > > > > > > > > Hi Jason,
> > > > > > > > >
> > > > > > > > > On Mon, 9 Jan 2023 09:45:09 -0600
> > > > > > > > > Jason Ekstrand <jason@jlekstrand.net> wrote:
> > > > > > > > >        
> > > > > > > > > > On Thu, Jan 5, 2023 at 1:40 PM Matthew Brost <matthew.brost@intel.com>
> > > > > > > > > > wrote:
> > > > > > > > > >        
> > > > > > > > > > > On Mon, Jan 02, 2023 at 08:30:19AM +0100, Boris Brezillon wrote:        
> > > > > > > > > > > > On Fri, 30 Dec 2022 12:55:08 +0100
> > > > > > > > > > > > Boris Brezillon <boris.brezillon@collabora.com> wrote:
> > > > > > > > > > > >        
> > > > > > > > > > > > > On Fri, 30 Dec 2022 11:20:42 +0100
> > > > > > > > > > > > > Boris Brezillon <boris.brezillon@collabora.com> wrote:
> > > > > > > > > > > > >        
> > > > > > > > > > > > > > Hello Matthew,
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > On Thu, 22 Dec 2022 14:21:11 -0800
> > > > > > > > > > > > > > Matthew Brost <matthew.brost@intel.com> wrote:
> > > > > > > > > > > > > >        
> > > > > > > > > > > > > > > In XE, the new Intel GPU driver, a choice has been made to have a 1 to 1
> > > > > > > > > > > > > > > mapping between a drm_gpu_scheduler and drm_sched_entity. At first this
> > > > > > > > > > > > > > > seems a bit odd but let us explain the reasoning below.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > 1. In XE the submission order from multiple drm_sched_entity is not
> > > > > > > > > > > > > > > guaranteed to match completion order even if targeting the same hardware
> > > > > > > > > > > > > > > engine. This is because in XE we have a firmware scheduler, the GuC,
> > > > > > > > > > > > > > > which is allowed to reorder, timeslice, and preempt submissions. If using a
> > > > > > > > > > > > > > > shared drm_gpu_scheduler across multiple drm_sched_entity, the TDR falls
> > > > > > > > > > > > > > > apart as the TDR expects submission order == completion order. Using a
> > > > > > > > > > > > > > > dedicated drm_gpu_scheduler per drm_sched_entity solves this problem.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Oh, that's interesting. I've been trying to solve the same sort of
> > > > > > > > > > > > > > issues to support Arm's new Mali GPU which is relying on a FW-assisted
> > > > > > > > > > > > > > scheduling scheme (you give the FW N streams to execute, and it does
> > > > > > > > > > > > > > the scheduling between those N command streams, the kernel driver
> > > > > > > > > > > > > > does timeslice scheduling to update the command streams passed to the
> > > > > > > > > > > > > > FW). I must admit I gave up on using drm_sched at some point, mostly
> > > > > > > > > > > > > > because the integration with drm_sched was painful, but also because I
> > > > > > > > > > > > > > felt trying to bend drm_sched to make it interact with a
> > > > > > > > > > > > > > timeslice-oriented scheduling model wasn't really future proof. Giving
> > > > > > > > > > > > > > drm_sched_entity exclusive access to a drm_gpu_scheduler might
> > > > > > > > > > > > > > help for a few things (didn't think it through yet), but I feel it's
> > > > > > > > > > > > > > coming short on other aspects we have to deal with on Arm GPUs.
> > > > > > > > > > > > >
> > > > > > > > > > > > > Ok, so I just had a quick look at the Xe driver and how it
> > > > > > > > > > > > > instantiates the drm_sched_entity and drm_gpu_scheduler, and I think I
> > > > > > > > > > > > > have a better understanding of how you get away with using drm_sched
> > > > > > > > > > > > > while still controlling how scheduling is really done. Here
> > > > > > > > > > > > > drm_gpu_scheduler is just a dummy abstraction that lets you use the
> > > > > > > > > > > > > drm_sched job queuing/dep/tracking mechanism. The whole run-queue        
> > > > > > > > > > >
> > > > > > > > > > > You nailed it here, we use the DRM scheduler for queuing jobs,
> > > > > > > > > > > dependency tracking and releasing jobs to be scheduled when dependencies
> > > > > > > > > > > are met, and lastly a tracking mechanism of in-flight jobs that need to
> > > > > > > > > > > be cleaned up if an error occurs. It doesn't actually do any scheduling
> > > > > > > > > > > aside from the most basic level of not overflowing the submission ring
> > > > > > > > > > > buffer. In this sense, a 1 to 1 relationship between entity and
> > > > > > > > > > > scheduler fits quite well.
> > > > > > > > > > >        
> > > > > > > > > >
> > > > > > > > > > Yeah, I think there's an annoying difference between what AMD/NVIDIA/Intel
> > > > > > > > > > want here and what you need for Arm thanks to the number of FW queues
> > > > > > > > > > available. I don't remember the exact number of GuC queues but it's at
> > > > > > > > > > least 1k. This puts it in an entirely different class from what you have on
> > > > > > > > > > Mali. Roughly, there's about three categories here:
> > > > > > > > > >
> > > > > > > > > >  1. Hardware where the kernel is placing jobs on actual HW rings. This is
> > > > > > > > > > old Mali, Intel Haswell and earlier, and probably a bunch of others.
> > > > > > > > > > (Intel BDW+ with execlists is a weird case that doesn't fit in this
> > > > > > > > > > categorization.)
> > > > > > > > > >
> > > > > > > > > >  2. Hardware (or firmware) with a very limited number of queues where
> > > > > > > > > > you're going to have to juggle in the kernel in order to run desktop Linux.
> > > > > > > > > >
> > > > > > > > > >  3. Firmware scheduling with a high queue count. In this case, you don't
> > > > > > > > > > want the kernel scheduling anything. Just throw it at the firmware and let
> > > > > > > > > > it go brrrrr.  If we ever run out of queues (unlikely), the kernel can
> > > > > > > > > > temporarily pause some low-priority contexts and do some juggling or,
> > > > > > > > > > frankly, just fail userspace queue creation and tell the user to close some
> > > > > > > > > > windows.
> > > > > > > > > >
> > > > > > > > > > The existence of this 2nd class is a bit annoying but it's where we are. I
> > > > > > > > > > think it's worth recognizing that Xe and panfrost are in different places
> > > > > > > > > > here and will require different designs. For Xe, we really are just using
> > > > > > > > > > drm/scheduler as a front-end and the firmware does all the real scheduling.
> > > > > > > > > >
> > > > > > > > > > How do we deal with class 2? That's an interesting question.  We may
> > > > > > > > > > eventually want to break that off into a separate discussion and not litter
> > > > > > > > > > the Xe thread but let's keep going here for a bit.  I think there are some
> > > > > > > > > > pretty reasonable solutions but they're going to look a bit different.
> > > > > > > > > >
> > > > > > > > > > The way I did this for Xe with execlists was to keep the 1:1:1 mapping
> > > > > > > > > > between drm_gpu_scheduler, drm_sched_entity, and userspace xe_engine.
> > > > > > > > > > Instead of feeding a GuC ring, though, it would feed a fixed-size execlist
> > > > > > > > > > ring and then there was a tiny kernel which operated entirely in IRQ
> > > > > > > > > > handlers which juggled those execlists by smashing HW registers.  For
> > > > > > > > > > Panfrost, I think we want something slightly different but can borrow some
> > > > > > > > > > ideas here.  In particular, have the schedulers feed kernel-side SW queues
> > > > > > > > > > (they can even be fixed-size if that helps) and then have a kthread which
> > > > > > > > > > juggles those and feeds the limited FW queues. In the case where you have few
> > > > > > > > > > enough active contexts to fit them all in FW, I do think it's best to have
> > > > > > > > > > them all active in FW and let it schedule. But with only 31, you need to be
> > > > > > > > > > able to juggle if you run out.        
> > > > > > > > >
> > > > > > > > > That's more or less what I do right now, except I don't use the
> > > > > > > > > drm_sched front-end to handle deps or queue jobs (at least not yet). The
> > > > > > > > > kernel-side timeslice-based scheduler juggling with runnable queues
> > > > > > > > > (queues with pending jobs that are not yet resident on a FW slot)
> > > > > > > > > uses a dedicated ordered-workqueue instead of a thread, with scheduler
> > > > > > > > > ticks being handled with a delayed-work (tick happening every X
> > > > > > > > > milliseconds when queues are waiting for a slot). It all seems very
> > > > > > > > > HW/FW-specific though, and I think it's a bit premature to try to
> > > > > > > > > generalize that part, but the dep-tracking logic implemented by
> > > > > > > > > drm_sched looked like something I could easily re-use, hence my
> > > > > > > > > interest in Xe's approach.        
> > > > > > > >
> > > > > > > > So another option for these few fw queue slots schedulers would be to
> > > > > > > > treat them as vram and enlist ttm.
> > > > > > > >
> > > > > > > > Well maybe more enlist ttm and less treat them like vram, but ttm can
> > > > > > > > handle idr (or xarray or whatever you want) and then help you with all the
> > > > > > > > pipelining (and the drm_sched then with sorting out dependencies). If you
> > > > > > > > then also preferentially "evict" low-priority queues you pretty much have
> > > > > > > > the perfect thing.
> > > > > > > >
> > > > > > > > Note that GuC with sriov splits up the id space and together with some
> > > > > > > > restrictions due to multi-engine contexts media needs might also need this
> > > > > > > > all.
> > > > > > > >
> > > > > > > > If you're balking at the idea of enlisting ttm just for fw queue
> > > > > > > > management, amdgpu has a shoddy version of id allocation for their vm/tlb
> > > > > > > > index allocation. Might be worth it to instead lift that into some sched
> > > > > > > > helper code.        
> > > > > > >
> > > > > > > Would you mind pointing me to the amdgpu code you're mentioning here?
> > > > > > > Still have a hard time seeing what TTM has to do with scheduling, but I
> > > > > > > also don't know much about TTM, so I'll keep digging.        
> > > > > > 
> > > > > > ttm is about moving stuff in&out of a limited space and gives you some
> > > > > > nice tooling for pipelining it all. It doesn't care whether that space
> > > > > > is vram or some limited id space. vmwgfx used ttm as an id manager
> > > > > > iirc.      
> > > > > 
> > > > > Ok.
> > > > >       
> > > > > >       
> > > > > > > > Either way there's two imo rather solid approaches available to sort this
> > > > > > > > out. And once you have that, then there shouldn't be any big difference in
> > > > > > > > driver design between fw with de facto unlimited queue ids, and those with
> > > > > > > > severe restrictions in number of queues.        
> > > > > > >
> > > > > > > Honestly, I don't think there's much difference between those two cases
> > > > > > > already. There's just a bunch of additional code to schedule queues on
> > > > > > > FW slots for the limited-number-of-FW-slots case, which, right now, is
> > > > > > > driver specific. The job queuing front-end pretty much achieves what
> > > > > > > drm_sched does already: queuing job to entities, checking deps,
> > > > > > > submitting job to HW (in our case, writing to the command stream ring
> > > > > > > buffer). Things start to differ after that point: once a scheduling
> > > > > > > entity has pending jobs, we add it to one of the runnable queues (one
> > > > > > > queue per prio) and kick the kernel-side timeslice-based scheduler to
> > > > > > > re-evaluate, if needed.
> > > > > > >
> > > > > > > I'm all for using generic code when it makes sense, even if that means
> > > > > > > adding this common code when it doesn't exists, but I don't want to be
> > > > > > > dragged into some major refactoring that might take years to land.
> > > > > > > Especially if pancsf is the first
> > > > > > > FW-assisted-scheduler-with-few-FW-slot driver.        
> > > > > > 
> > > > > > I don't see where there's a major refactoring that you're getting dragged into?      
> > > > > 
> > > > > Oh, no, I'm not saying this is the case just yet, just wanted to make
> > > > > sure we're on the same page :-).
> > > > >       
> > > > > > 
> > > > > > Yes there's a huge sprawling discussion right now, but I think that's
> > > > > > just largely people getting confused.      
> > > > > 
> > > > > I definitely am :-).
> > > > >       
> > > > > > 
> > > > > > Wrt the actual id assignment stuff, in amdgpu at least it's few lines
> > > > > > of code. See the amdgpu_vmid_grab stuff for the simplest starting
> > > > > > point.      
> > > > > 
> > > > > Ok, thanks for the pointers. I'll have a look and see how I could use
> > > > > that. I guess that's about getting access to the FW slots with some
> > > > > sort of priority+FIFO ordering guarantees given by TTM. If that's the
> > > > > case, I'll have to think about it, because that's a major shift from
> > > > > what we're doing now, and I'm afraid this could lead to starving
> > > > > non-resident entities if all resident entities keep receiving new jobs
> > > > > to execute. Unless we put some sort of barrier when giving access to a
> > > > > slot, so we evict the entity when it's done executing the stuff it had
> > > > > when it was given access to this slot. But then, again, there are other
> > > > > constraints to take into account for the Arm Mali CSF case:
> > > > > 
> > > > > - it's more efficient to update all FW slots at once, because each
> > > > >   update of a slot might require updating priorities of the other slots
> > > > >   (FW mandates unique slot priorities, and those priorities depend on
> > > > >   the entity priority/queue-ordering)
> > > > > - context/FW slot switches have a non-negligible cost (FW needs to
> > > > >   suspend the context and save the state every time there is such a
> > > > >   switch), so limiting the number of FW slot updates might prove
> > > > >   important      
> > > > 
> > > > I frankly think you're overworrying. When you have 31+ contexts running at
> > > > the same time, you have bigger problems. At that point there's two
> > > > use-cases:
> > > > 1. system is overloaded, the user will reach for reset button anyway
> > > > 2. temporary situation, all you have to do is be roughly fair enough to get
> > > >    through it before case 1 happens.
> > > >  
> > > > Trying to write a perfect scheduler for this before we have actual
> > > > benchmarks that justify the effort seems like pretty serious overkill.
> > > > That's why I think the simplest solution is the one we should have:
> > > > 
> > > > - drm/sched frontend. If you get into slot exhaustion that alone will
> > > >   ensure enough fairness    
> > > 
> > > We're talking about the CS ring buffer slots here, right?
> > >   
> > > > 
> > > > - LRU list of slots, with dma_fence so you can pipeline/batch up changes
> > > >   as needed (but I honestly wouldn't worry about the batching before
> > > >   you've shown an actual need for this in some benchmark/workload, even
> > > >   piglit shouldn't have this many things running concurrently I think, you
> > > >   don't have that many cpu cores). Between drm/sched and the lru you will
> > > >   have an emergent scheduler that cycles through all runnable gpu jobs.
> > > > 
> > > > - If you want to go fancy, have eviction tricks like skipping currently
> > > >   still active gpu context with higher priority than the one that you need
> > > >   to find a slot for.
> > > > 
> > > > - You don't need time slicing in this, not even for compute. compute is
> > > >   done with preempt context fences, if you give them a minimum scheduling
> > > >   quanta you'll have a very basic round robin scheduler as an emergent
> > > >   thing.
> > > > 
> > > > Any workload where it matters will be scheduled by the fw directly, with
> > > > drm/sched only being the dma_fence dependency sorter. My take is that if you
> > > > spend more than a hundred or so lines with slot allocation logic
> > > > (excluding the hw code to load/unload a slot) you're probably doing some
> > > > serious overengineering.    
> > > 
> > > Let me see if I got this right:
> > > 
> > > - we still keep a 1:1 drm_gpu_scheduler:drm_sched_entity approach,
> > >   where hw_submission_limit == available_slots_in_ring_buf
> > > - when ->run_job() is called, we write the RUN_JOB() instruction
> > >   sequence to the next available ringbuf slot and queue the entity to
> > >   the FW-slot queue
> > >   * if a slot is directly available, we program the slot directly
> > >   * if no slots are available, but some slots are done with the jobs
> > >     they were given (last job fence signaled), we evict the LRU entity
> > >     (possibly taking priority into account) and use this slot for the
> > >     new entity
> > >   * if no slots are available and all currently assigned slots
> > >     contain busy entities, we queue the entity to a pending list
> > >     (possibly one list per prio)  
> 
> You could also handle this in ->prepare_job, which is called after all the
> default fences have signalled. That allows you to put the "wait for a
> previous job to finish/unload" behind a dma_fence, which is how (I think
> at least) you can get the round-robin emergent behaviour: if there's no
> idle slot, you just pick all the fences from the currently busy job you
> want to steal the slot from (with priority and lru taken into account),
> let the scheduler wait for those to finish, and then it'll call your
> run_job when the slot is already available.

Ah, nice! It would also avoid queuing new jobs to a resident entity
when others are waiting for a FW slot, even if, in practice, I'm not
sure we should do that: the context will be suspended when the group is
evicted anyway, and things could keep running in the meantime.
I'll give it a try, thanks for the suggestion!

> 
> Also if you do the allocation in ->prepare_job with dma_fence and not in
> run_job, then I think we can sort out fairness issues (if they do pop up) in
> the drm/sched code instead of having to think about this in each driver.

By allocation, you mean assigning a FW slot ID? If we do this allocation
in ->prepare_job(), couldn't we mess up ordering? Like, a
lower-prio/later-queuing entity being scheduled before its peers,
because there's no guarantee on the job completion order (and thus the
queue idleness order). I mean, completion order depends on the kind of
job being executed by the queues, the time the FW actually lets the
queue execute things and probably other factors. You can use metrics
like the position in the LRU list + the amount of jobs currently
queued to a group to guess which one will be idle first, but that's
just a guess. And I'm not sure I see what doing this slot selection in
->prepare_job() would bring us compared to doing it in ->run_job(),
where we can just pick the least recently used slot.

> A small number of fw sched slots essentially just makes fw scheduling
> unfairness more prominent than elsewhere, but I don't think it's
> fundamentally something else really.
> 
> If every ctx does that and the lru isn't too busted, they should then form
> a nice orderly queue and cycle through the fw scheduler, while still being
> able to get some work done. It's essentially the exact same thing that
> happens with ttm vram eviction, when you have a total working set where
> each process fits in vram individually, but in total they're too big and
> you need to cycle things through.

I see.

> 
> > > I'll need to make sure this still works with the concept of group (it's
> > > not a single queue we schedule, it's a group of queues, meaning that we
> > > have N fences to watch to determine if the slot is busy or not, but
> > > that should be okay).  
> > 
> > Oh, there's one other thing I forgot to mention: the FW scheduler is
> > not entirely fair, it does take the slot priority (which has to be
> > unique across all currently assigned slots) into account when
> > scheduling groups. So, ideally, we'd want to rotate group priorities
> > when they share the same drm_sched_priority (probably based on the
> > position in the LRU).  
> 
> Hm that will make things a bit more fun I guess, especially with your
> constraint to not update this too often. How strict is that priority
> difference? If it's a lot, we might need to treat this more like execlist
> and less like a real fw scheduler ...

Strict as in, if two groups with same priority try to request an
overlapping set of resources (cores or tilers), it can deadlock, so
pretty strict I would say :-).
Daniel Vetter Jan. 12, 2023, 3:38 p.m. UTC | #44
On Thu, 12 Jan 2023 at 13:08, Boris Brezillon
<boris.brezillon@collabora.com> wrote:
> On Thu, 12 Jan 2023 11:42:57 +0100
> Daniel Vetter <daniel@ffwll.ch> wrote:
>
> > On Thu, Jan 12, 2023 at 11:25:53AM +0100, Boris Brezillon wrote:
> > > On Thu, 12 Jan 2023 11:11:03 +0100
> > > Boris Brezillon <boris.brezillon@collabora.com> wrote:
> > >
> > > > On Thu, 12 Jan 2023 10:32:18 +0100
> > > > Daniel Vetter <daniel@ffwll.ch> wrote:
> > > >
> > > > > On Thu, Jan 12, 2023 at 10:10:53AM +0100, Boris Brezillon wrote:
> > > > > > Hi Daniel,
> > > > > >
> > > > > > On Wed, 11 Jan 2023 22:47:02 +0100
> > > > > > Daniel Vetter <daniel@ffwll.ch> wrote:
> > > > > >
> > > > > > > On Tue, 10 Jan 2023 at 09:46, Boris Brezillon
> > > > > > > <boris.brezillon@collabora.com> wrote:
> > > > > > > >
> > > > > > > > Hi Daniel,
> > > > > > > >
> > > > > > > > On Mon, 9 Jan 2023 21:40:21 +0100
> > > > > > > > Daniel Vetter <daniel@ffwll.ch> wrote:
> > > > > > > >
> > > > > > > > > On Mon, Jan 09, 2023 at 06:17:48PM +0100, Boris Brezillon wrote:
> > > > > > > > > > Hi Jason,
> > > > > > > > > >
> > > > > > > > > > On Mon, 9 Jan 2023 09:45:09 -0600
> > > > > > > > > > Jason Ekstrand <jason@jlekstrand.net> wrote:
> > > > > > > > > >
> > > > > > > > > > > On Thu, Jan 5, 2023 at 1:40 PM Matthew Brost <matthew.brost@intel.com>
> > > > > > > > > > > wrote:
> > > > > > > > > > >
> > > > > > > > > > > > On Mon, Jan 02, 2023 at 08:30:19AM +0100, Boris Brezillon wrote:
> > > > > > > > > > > > > On Fri, 30 Dec 2022 12:55:08 +0100
> > > > > > > > > > > > > Boris Brezillon <boris.brezillon@collabora.com> wrote:
> > > > > > > > > > > > >
> > > > > > > > > > > > > > On Fri, 30 Dec 2022 11:20:42 +0100
> > > > > > > > > > > > > > Boris Brezillon <boris.brezillon@collabora.com> wrote:
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > Hello Matthew,
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > On Thu, 22 Dec 2022 14:21:11 -0800
> > > > > > > > > > > > > > > Matthew Brost <matthew.brost@intel.com> wrote:
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > In XE, the new Intel GPU driver, a choice has been made to have a 1 to 1
> > > > > > > > > > > > > > > > mapping between a drm_gpu_scheduler and drm_sched_entity. At first this
> > > > > > > > > > > > > > > > seems a bit odd but let us explain the reasoning below.
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > 1. In XE the submission order from multiple drm_sched_entity is not
> > > > > > > > > > > > > > > > guaranteed to match completion order even if targeting the same hardware
> > > > > > > > > > > > > > > > engine. This is because in XE we have a firmware scheduler, the GuC,
> > > > > > > > > > > > > > > > which is allowed to reorder, timeslice, and preempt submissions. If using a
> > > > > > > > > > > > > > > > shared drm_gpu_scheduler across multiple drm_sched_entity, the TDR falls
> > > > > > > > > > > > > > > > apart as the TDR expects submission order == completion order. Using a
> > > > > > > > > > > > > > > > dedicated drm_gpu_scheduler per drm_sched_entity solves this problem.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > Oh, that's interesting. I've been trying to solve the same sort of
> > > > > > > > > > > > > > > issues to support Arm's new Mali GPU which is relying on a FW-assisted
> > > > > > > > > > > > > > > scheduling scheme (you give the FW N streams to execute, and it does
> > > > > > > > > > > > > > > the scheduling between those N command streams, the kernel driver
> > > > > > > > > > > > > > > does timeslice scheduling to update the command streams passed to the
> > > > > > > > > > > > > > > FW). I must admit I gave up on using drm_sched at some point, mostly
> > > > > > > > > > > > > > > because the integration with drm_sched was painful, but also because I
> > > > > > > > > > > > > > > felt trying to bend drm_sched to make it interact with a
> > > > > > > > > > > > > > > timeslice-oriented scheduling model wasn't really future proof. Giving
> > > > > > > > > > > > > > > drm_sched_entity exclusive access to a drm_gpu_scheduler might
> > > > > > > > > > > > > > > help for a few things (didn't think it through yet), but I feel it's
> > > > > > > > > > > > > > > coming short on other aspects we have to deal with on Arm GPUs.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Ok, so I just had a quick look at the Xe driver and how it
> > > > > > > > > > > > > > instantiates the drm_sched_entity and drm_gpu_scheduler, and I think I
> > > > > > > > > > > > > > have a better understanding of how you get away with using drm_sched
> > > > > > > > > > > > > > while still controlling how scheduling is really done. Here
> > > > > > > > > > > > > > drm_gpu_scheduler is just a dummy abstraction that lets you use the
> > > > > > > > > > > > > > drm_sched job queuing/dep/tracking mechanism. The whole run-queue
> > > > > > > > > > > >
> > > > > > > > > > > > You nailed it here, we use the DRM scheduler for queuing jobs,
> > > > > > > > > > > > dependency tracking and releasing jobs to be scheduled when dependencies
> > > > > > > > > > > > are met, and lastly a tracking mechanism of in-flight jobs that need to
> > > > > > > > > > > > be cleaned up if an error occurs. It doesn't actually do any scheduling
> > > > > > > > > > > > aside from the most basic level of not overflowing the submission ring
> > > > > > > > > > > > buffer. In this sense, a 1 to 1 relationship between entity and
> > > > > > > > > > > > scheduler fits quite well.
> > > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > Yeah, I think there's an annoying difference between what AMD/NVIDIA/Intel
> > > > > > > > > > > want here and what you need for Arm thanks to the number of FW queues
> > > > > > > > > > > available. I don't remember the exact number of GuC queues but it's at
> > > > > > > > > > > least 1k. This puts it in an entirely different class from what you have on
> > > > > > > > > > > Mali. Roughly, there's about three categories here:
> > > > > > > > > > >
> > > > > > > > > > >  1. Hardware where the kernel is placing jobs on actual HW rings. This is
> > > > > > > > > > > old Mali, Intel Haswell and earlier, and probably a bunch of others.
> > > > > > > > > > > (Intel BDW+ with execlists is a weird case that doesn't fit in this
> > > > > > > > > > > categorization.)
> > > > > > > > > > >
> > > > > > > > > > >  2. Hardware (or firmware) with a very limited number of queues where
> > > > > > > > > > > you're going to have to juggle in the kernel in order to run desktop Linux.
> > > > > > > > > > >
> > > > > > > > > > >  3. Firmware scheduling with a high queue count. In this case, you don't
> > > > > > > > > > > want the kernel scheduling anything. Just throw it at the firmware and let
> > > > > > > > > > > it go brrrrr.  If we ever run out of queues (unlikely), the kernel can
> > > > > > > > > > > temporarily pause some low-priority contexts and do some juggling or,
> > > > > > > > > > > frankly, just fail userspace queue creation and tell the user to close some
> > > > > > > > > > > windows.
> > > > > > > > > > >
> > > > > > > > > > > The existence of this 2nd class is a bit annoying but it's where we are. I
> > > > > > > > > > > think it's worth recognizing that Xe and panfrost are in different places
> > > > > > > > > > > here and will require different designs. For Xe, we really are just using
> > > > > > > > > > > drm/scheduler as a front-end and the firmware does all the real scheduling.
> > > > > > > > > > >
> > > > > > > > > > > How do we deal with class 2? That's an interesting question.  We may
> > > > > > > > > > > eventually want to break that off into a separate discussion and not litter
> > > > > > > > > > > the Xe thread but let's keep going here for a bit.  I think there are some
> > > > > > > > > > > pretty reasonable solutions but they're going to look a bit different.
> > > > > > > > > > >
> > > > > > > > > > > The way I did this for Xe with execlists was to keep the 1:1:1 mapping
> > > > > > > > > > > between drm_gpu_scheduler, drm_sched_entity, and userspace xe_engine.
> > > > > > > > > > > Instead of feeding a GuC ring, though, it would feed a fixed-size execlist
> > > > > > > > > > > ring and then there was a tiny kernel which operated entirely in IRQ
> > > > > > > > > > > handlers which juggled those execlists by smashing HW registers.  For
> > > > > > > > > > > Panfrost, I think we want something slightly different but can borrow some
> > > > > > > > > > > ideas here.  In particular, have the schedulers feed kernel-side SW queues
> > > > > > > > > > > (they can even be fixed-size if that helps) and then have a kthread which
> > > > > > > > > > > juggles those and feeds the limited FW queues. In the case where you have few
> > > > > > > > > > > enough active contexts to fit them all in FW, I do think it's best to have
> > > > > > > > > > > them all active in FW and let it schedule. But with only 31, you need to be
> > > > > > > > > > > able to juggle if you run out.
> > > > > > > > > >
> > > > > > > > > > That's more or less what I do right now, except I don't use the
> > > > > > > > > > drm_sched front-end to handle deps or queue jobs (at least not yet). The
> > > > > > > > > > kernel-side timeslice-based scheduler juggling with runnable queues
> > > > > > > > > > (queues with pending jobs that are not yet resident on a FW slot)
> > > > > > > > > > uses a dedicated ordered-workqueue instead of a thread, with scheduler
> > > > > > > > > > ticks being handled with a delayed-work (tick happening every X
> > > > > > > > > > milliseconds when queues are waiting for a slot). It all seems very
> > > > > > > > > > HW/FW-specific though, and I think it's a bit premature to try to
> > > > > > > > > > generalize that part, but the dep-tracking logic implemented by
> > > > > > > > > > drm_sched looked like something I could easily re-use, hence my
> > > > > > > > > > interest in Xe's approach.
> > > > > > > > >
> > > > > > > > > So another option for these few fw queue slots schedulers would be to
> > > > > > > > > treat them as vram and enlist ttm.
> > > > > > > > >
> > > > > > > > > Well maybe more enlist ttm and less treat them like vram, but ttm can
> > > > > > > > > handle idr (or xarray or whatever you want) and then help you with all the
> > > > > > > > > pipelining (and the drm_sched then with sorting out dependencies). If you
> > > > > > > > > then also preferentially "evict" low-priority queues you pretty much have
> > > > > > > > > the perfect thing.
> > > > > > > > >
> > > > > > > > > Note that GuC with sriov splits up the id space and together with some
> > > > > > > > > restrictions due to multi-engine contexts media needs might also need this
> > > > > > > > > all.
> > > > > > > > >
> > > > > > > > > If you're balking at the idea of enlisting ttm just for fw queue
> > > > > > > > > management, amdgpu has a shoddy version of id allocation for their vm/tlb
> > > > > > > > > index allocation. Might be worth it to instead lift that into some sched
> > > > > > > > > helper code.
> > > > > > > >
> > > > > > > > Would you mind pointing me to the amdgpu code you're mentioning here?
> > > > > > > > Still have a hard time seeing what TTM has to do with scheduling, but I
> > > > > > > > also don't know much about TTM, so I'll keep digging.
> > > > > > >
> > > > > > > ttm is about moving stuff in&out of a limited space and gives you some
> > > > > > > nice tooling for pipelining it all. It doesn't care whether that space
> > > > > > > is vram or some limited id space. vmwgfx used ttm as an id manager
> > > > > > > iirc.
> > > > > >
> > > > > > Ok.
> > > > > >
> > > > > > >
> > > > > > > > > Either way there's two imo rather solid approaches available to sort this
> > > > > > > > > out. And once you have that, then there shouldn't be any big difference in
> > > > > > > > > driver design between fw with de facto unlimited queue ids, and those with
> > > > > > > > > severe restrictions in number of queues.
> > > > > > > >
> > > > > > > > Honestly, I don't think there's much difference between those two cases
> > > > > > > > already. There's just a bunch of additional code to schedule queues on
> > > > > > > > FW slots for the limited-number-of-FW-slots case, which, right now, is
> > > > > > > > driver specific. The job queuing front-end pretty much achieves what
> > > > > > > > drm_sched does already: queuing job to entities, checking deps,
> > > > > > > > submitting job to HW (in our case, writing to the command stream ring
> > > > > > > > buffer). Things start to differ after that point: once a scheduling
> > > > > > > > entity has pending jobs, we add it to one of the runnable queues (one
> > > > > > > > queue per prio) and kick the kernel-side timeslice-based scheduler to
> > > > > > > > re-evaluate, if needed.
> > > > > > > >
> > > > > > > > I'm all for using generic code when it makes sense, even if that means
> > > > > > > > adding this common code when it doesn't exists, but I don't want to be
> > > > > > > > dragged into some major refactoring that might take years to land.
> > > > > > > > Especially if pancsf is the first
> > > > > > > > FW-assisted-scheduler-with-few-FW-slot driver.
> > > > > > >
> > > > > > > I don't see where there's a major refactoring that you're getting dragged into?
> > > > > >
> > > > > > Oh, no, I'm not saying this is the case just yet, just wanted to make
> > > > > > sure we're on the same page :-).
> > > > > >
> > > > > > >
> > > > > > > Yes there's a huge sprawling discussion right now, but I think that's
> > > > > > > just largely people getting confused.
> > > > > >
> > > > > > I definitely am :-).
> > > > > >
> > > > > > >
> > > > > > > Wrt the actual id assignment stuff, in amdgpu at least it's few lines
> > > > > > > of code. See the amdgpu_vmid_grab stuff for the simplest starting
> > > > > > > point.
> > > > > >
> > > > > > Ok, thanks for the pointers. I'll have a look and see how I could use
> > > > > > that. I guess that's about getting access to the FW slots with some
> > > > > > sort of priority+FIFO ordering guarantees given by TTM. If that's the
> > > > > > case, I'll have to think about it, because that's a major shift from
> > > > > > what we're doing now, and I'm afraid this could lead to starving
> > > > > > non-resident entities if all resident entities keep receiving new jobs
> > > > > > to execute. Unless we put some sort of barrier when giving access to a
> > > > > > slot, so we evict the entity when it's done executing the stuff it had
> > > > > > when it was given access to this slot. But then, again, there are other
> > > > > > constraints to take into account for the Arm Mali CSF case:
> > > > > >
> > > > > > - it's more efficient to update all FW slots at once, because each
> > > > > >   update of a slot might require updating priorities of the other slots
> > > > > >   (FW mandates unique slot priorities, and those priorities depend on
> > > > > >   the entity priority/queue-ordering)
> > > > > > - context/FW slot switches have a non-negligible cost (FW needs to
> > > > > >   suspend the context and save the state every time there is such a
> > > > > >   switch), so limiting the number of FW slot updates might prove
> > > > > >   important
> > > > >
> > > > > I frankly think you're overworrying. When you have 31+ contexts running at
> > > > > the same time, you have bigger problems. At that point there's two
> > > > > use-cases:
> > > > > 1. system is overloaded, the user will reach for reset button anyway
> > > > > 2. temporary situation, all you have to do is be roughly fair enough to get
> > > > >    through it before case 1 happens.
> > > > >
> > > > > Trying to write a perfect scheduler for this before we have actual
> > > > > benchmarks that justify the effort seems like pretty serious overkill.
> > > > > That's why I think the simplest solution is the one we should have:
> > > > >
> > > > > - drm/sched frontend. If you get into slot exhaustion that alone will
> > > > >   ensure enough fairness
> > > >
> > > > We're talking about the CS ring buffer slots here, right?
> > > >
> > > > >
> > > > > - LRU list of slots, with dma_fence so you can pipeline/batch up changes
> > > > >   as needed (but I honestly wouldn't worry about the batching before
> > > > >   you've shown an actual need for this in some benchmark/workload, even
> > > > >   piglit shouldn't have this many things running concurrently I think, you
> > > > >   don't have that many cpu cores). Between drm/sched and the lru you will
> > > > >   have an emergent scheduler that cycles through all runnable gpu jobs.
> > > > >
> > > > > - If you want to go fancy, have eviction tricks like skipping currently
> > > > >   still active gpu context with higher priority than the one that you need
> > > > >   to find a slot for.
> > > > >
> > > > > - You don't need time slicing in this, not even for compute. compute is
> > > > >   done with preempt context fences, if you give them a minimum scheduling
> > > > >   quanta you'll have a very basic round robin scheduler as an emergent
> > > > >   thing.
> > > > >
> > > > > Any workload where it matters will be scheduled by the fw directly, with
> > > > > drm/sched only being the dma_fence dependency sorter. My take is that if you
> > > > > spend more than a hundred or so lines with slot allocation logic
> > > > > (excluding the hw code to load/unload a slot) you're probably doing some
> > > > > serious overengineering.
> > > >
> > > > Let me see if I got this right:
> > > >
> > > > - we still keep a 1:1 drm_gpu_scheduler:drm_sched_entity approach,
> > > >   where hw_submission_limit == available_slots_in_ring_buf
> > > > - when ->run_job() is called, we write the RUN_JOB() instruction
> > > >   sequence to the next available ringbuf slot and queue the entity to
> > > >   the FW-slot queue
> > > >   * if a slot is directly available, we program the slot directly
> > > >   * if no slots are available, but some slots are done with the jobs
> > > >     they were given (last job fence signaled), we evict the LRU entity
> > > >     (possibly taking priority into account) and use this slot for the
> > > >     new entity
> > > >   * if no slots are available and all currently assigned slots
> > > >     contain busy entities, we queue the entity to a pending list
> > > >     (possibly one list per prio)
> >
> > You could also handle this in ->prepare_job, which is called after all the
> > default fences have signalled. That allows you to put the "wait for a
> > previous job to finish/unload" behind a dma_fence, which is how (I think
> > at least) you can get the round-robin emergent behaviour: if there's no
> > idle slot, you just pick all the fences from the currently busy job you
> > want to steal the slot from (with priority and lru taken into account),
> > let the scheduler wait for those to finish, and then it'll call your
> > run_job when the slot is already available.
>
> Ah, nice! It would also avoid queuing new jobs to a resident entity
> when others are waiting for a FW slot, even if, in practice, I'm not
> sure we should do that: the context will be suspended when the group is
> evicted anyway, and things could keep running in the meantime.
> I'll give it a try, thanks for the suggestion!
>
> >
> > Also if you do the allocation in ->prepare_job with dma_fence and not in
> > run_job, then I think we can sort out fairness issues (if they do pop up) in
> > the drm/sched code instead of having to think about this in each driver.
>
> By allocation, you mean assigning a FW slot ID? If we do this allocation
> in ->prepare_job(), couldn't we mess up ordering? Like, a
> lower-prio/later-queuing entity being scheduled before its peers,
> because there's no guarantee on the job completion order (and thus the
> queue idleness order). I mean, completion order depends on the kind of
> job being executed by the queues, the time the FW actually lets the
> queue execute things and probably other factors. You can use metrics
> like the position in the LRU list + the amount of jobs currently
> queued to a group to guess which one will be idle first, but that's
> just a guess. And I'm not sure I see what doing this slot selection in
> ->prepare_job() would bring us compared to doing it in ->run_job(),
> where we can just pick the least recently used slot.

In ->prepare_job you can let the scheduler code do the stalling (and
ensure fairness); in ->run_job it's your job. The current RFC doesn't
really bother much with getting this very right, but if the scheduler
code tries to make sure it pushes higher-prio stuff in before others,
you should get the right outcome.

The more important functional issue is that you must only allocate the
fw slot after all dependencies have signalled. Otherwise you might get
a nice deadlock, where job A is waiting for the fw slot of B to become
free, and B is waiting for A to finish.

> > A small number of fw sched slots essentially just makes fw scheduling
> > unfairness more prominent than elsewhere, but I don't think it's
> > fundamentally something else really.
> >
> > If every ctx does that and the lru isn't too busted, they should then form
> > a nice orderly queue and cycle through the fw scheduler, while still being
> > able to get some work done. It's essentially the exact same thing that
> > happens with ttm vram eviction, when you have a total working set where
> > each process fits in vram individually, but in total they're too big and
> > you need to cycle things through.
>
> I see.
>
> >
> > > > I'll need to make sure this still works with the concept of group (it's
> > > > not a single queue we schedule, it's a group of queues, meaning that we
> > > > have N fences to watch to determine if the slot is busy or not, but
> > > > that should be okay).
> > >
> > > Oh, there's one other thing I forgot to mention: the FW scheduler is
> > > not entirely fair, it does take the slot priority (which has to be
> > > unique across all currently assigned slots) into account when
> > > scheduling groups. So, ideally, we'd want to rotate group priorities
> > > when they share the same drm_sched_priority (probably based on the
> > > position in the LRU).
> >
> > Hm that will make things a bit more fun I guess, especially with your
> > constraint to not update this too often. How strict is that priority
> > difference? If it's a lot, we might need to treat this more like execlist
> > and less like a real fw scheduler ...
>
> Strict as in, if two groups with same priority try to request an
> overlapping set of resources (cores or tilers), it can deadlock, so
> pretty strict I would say :-).

So it first finishes all the higher priority tasks and only then runs
the next one, so no round-robin? Or am I just confused about what this
is all about? Or is it more that the order in the group determines how
it tries to schedule on the hw, and if the earlier job needs hw that
the later one also needs, then the earlier one has to finish first?
Which would still mean that for these overlapping cases there's just
no round-robin in the fw scheduler at all.
-Daniel
Boris Brezillon Jan. 12, 2023, 4:48 p.m. UTC | #45
On Thu, 12 Jan 2023 16:38:18 +0100
Daniel Vetter <daniel@ffwll.ch> wrote:

> > >
> > > Also if you do the allocation in ->prepare_job with dma_fence and not in
> > > run_job, then I think we can sort out fairness issues (if they do pop up) in
> > > the drm/sched code instead of having to think about this in each driver.
> >
> > By allocation, you mean assigning a FW slot ID? If we do this allocation
> > in ->prepare_job(), couldn't we mess up ordering? Like, a
> > lower-prio/later-queuing entity being scheduled before its peers,
> > because there's no guarantee on the job completion order (and thus the
> > queue idleness order). I mean, completion order depends on the kind of
> > job being executed by the queues, the time the FW actually lets the
> > queue execute things and probably other factors. You can use metrics
> > like the position in the LRU list + the amount of jobs currently
> > queued to a group to guess which one will be idle first, but that's
> > just a guess. And I'm not sure I see what doing this slot selection in  
> > ->prepare_job() would bring us compared to doing it in ->run_job(),  
> > where we can just pick the least recently used slot.  
> 
> In ->prepare_job you can let the scheduler code do the stalling (and
> ensure fairness); in ->run_job it's your job.

Yeah returning a fence in ->prepare_job() to wait for a FW slot to
become idle sounds good. This fence would be signaled when one of the
slots becomes idle. But I'm wondering why we'd want to select the slot
so early. Can't we just do the selection in ->run_job()? After all, if
the fence has been signaled, that means we'll find at least one slot
that's ready when we hit ->run_job(), and we can select it at that
point.
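
In code, the ->run_job() side of what I'm describing would be something
like this (made-up pan_* names again, just to illustrate the point):

static struct dma_fence *pan_queue_run_job(struct drm_sched_job *sched_job)
{
	struct pan_job *job = to_pan_job(sched_job);
	struct pan_device *dev = job->group->dev;
	struct pan_slot *slot;

	/*
	 * The fence returned by ->prepare_job() only signals once a slot
	 * is guaranteed to be idle, so this lookup cannot fail.
	 */
	slot = pan_slot_get_lru(dev);
	pan_slot_assign(slot, job->group);
	pan_slot_kick(slot, job);

	return dma_fence_get(&job->done_fence);
}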

> The current RFC doesn't
> really bother much with getting this very right, but if the scheduler
> code tries to make sure it pushes higher-prio stuff in first before
> others, you should get the right outcome.

Okay, so I'm confused again. We said we had a 1:1
drm_gpu_scheduler:drm_sched_entity mapping, meaning that entities are
isolated from each other. I can see how I could place the dma_fence
returned by ->prepare_job() in a driver-specific per-priority list, so
the driver can pick the highest-prio/first-inserted entry and signal the
associated fence when a slot becomes idle. But I have a hard time
seeing how common code could do that if it doesn't see the other
entities. Right now, drm_gpu_scheduler only selects the best entity
among the registered ones, and there's only one entity per
drm_gpu_scheduler in this case.
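
To make it concrete, the driver-side bookkeeping I have in mind would
look roughly like this (hypothetical names, nothing in drm/sched does
this for us today):

/*
 * Fences returned by ->prepare_job() are queued here, and the slot-idle
 * handler signals the highest-prio/first-inserted entry.
 */
struct pan_slot_waiter {
	struct list_head node;
	struct dma_fence *fence;
};

static void pan_slot_became_idle(struct pan_device *dev)
{
	struct pan_slot_waiter *w;
	int prio;

	for (prio = PAN_PRIO_COUNT - 1; prio >= 0; prio--) {
		w = list_first_entry_or_null(&dev->slot_waiters[prio],
					     struct pan_slot_waiter, node);
		if (!w)
			continue;

		list_del(&w->node);
		dma_fence_signal(w->fence);
		dma_fence_put(w->fence);
		return;
	}
}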

> 
> The more important functional issue is that you must only allocate the
> fw slot after all dependencies have signalled.

Sure, but it doesn't have to be a specific FW slot, it can be any FW
slot, as long as we don't signal more fences than we have slots
available, right?

> Otherwise you might get
> a nice deadlock, where job A is waiting for the fw slot of B to become
> free, and B is waiting for A to finish.

Got that part, and that's ensured by the fact we wait for all
regular deps before returning the FW-slot-available dma_fence in
->prepare_job(). This exact same fence will be signaled when a slot
becomes idle.

> 
> > > A small number of fw sched slots essentially just makes fw scheduling
> > > unfairness more prominent than elsewhere, but I don't think it's
> > > fundamentally something else really.
> > >
> > > If every ctx does that and the lru isn't too busted, they should then form
> > > a nice orderly queue and cycle through the fw scheduler, while still being
> > > able to get some work done. It's essentially the exact same thing that
> > > happens with ttm vram eviction, when you have a total working set where
> > > each process fits in vram individually, but in total they're too big and
> > > you need to cycle things through.  
> >
> > I see.
> >  
> > >  
> > > > > I'll need to make sure this still works with the concept of group (it's
> > > > > not a single queue we schedule, it's a group of queues, meaning that we
> > > > > have N fences to watch to determine if the slot is busy or not, but
> > > > > that should be okay).  
> > > >
> > > > Oh, there's one other thing I forgot to mention: the FW scheduler is
> > > > not entirely fair, it does take the slot priority (which has to be
> > > > unique across all currently assigned slots) into account when
> > > > scheduling groups. So, ideally, we'd want to rotate group priorities
> > > > when they share the same drm_sched_priority (probably based on the
> > > > position in the LRU).  
> > >
> > > Hm that will make things a bit more fun I guess, especially with your
> > > constraint to not update this too often. How strict is that priority
> > > difference? If it's a lot, we might need to treat this more like execlist
> > > and less like a real fw scheduler ...  
> >
> > Strict as in, if two groups with same priority try to request an
> > overlapping set of resources (cores or tilers), it can deadlock, so
> > pretty strict I would say :-).  
> 
> So it first finishes all the higher priority tasks and only then runs
> the next one, so no round-robin? Or am I just confused about what this
> is all about? Or is it more that the order in the group determines how
> it tries to schedule on the hw, and if the earlier job needs hw that
> the later one also needs, then the earlier one has to finish first?
> Which would still mean that for these overlapping cases there's just
> no round-robin in the fw scheduler at all.

Okay, so my understanding is: the FW scheduler always takes the highest
priority when selecting between groups requesting access to a
resource, but if 2 groups want the same resource and have the same
priority, there's no ordering guarantee. The deadlock happens when both
group A and group B claim resources X and Y. Group A might get resource X
and group B might get resource Y, both waiting for the other resource
they claimed. If they have different priorities, one of them would back
off and let the other run; if they have the same priority, neither of them
would, and that's where the deadlock comes from. Note that we don't
control the order resources get acquired from the CS, so there's no way
to avoid this deadlock without assigning different priorities.

And you're right, if you pick different priorities, the only time lower
priority groups get to run is when the highest priority group is
waiting on an asynchronous operation to complete (can be a
compute/frag/tiler job completion, some inter queue synchronization,
waiting for an already acquired resource, ...), or when it's idle. I
suspect queues from different groups can run concurrently if there's
enough command-stream processing slots available, and those groups
request resources that don't overlap, but I'm speculating here. So, no
round-robin if slots are assigned unique priorities. Not even sure
scheduling is time-slice based to be honest, it could be some
cooperative scheduling where groups with the same priorities get to
wait for the currently running group to be blocked to get access to
the HW. In any case, there's no easy way to prevent deadlocks if we
don't assign unique priorities.
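
If it helps, the unique-priority rotation I'm thinking of is as simple
as this (made-up names, and ignoring the update-cost concern mentioned
above):

/*
 * Re-assign unique FW slot priorities, ordered by drm_sched priority
 * first and LRU position second, so two same-priority groups can never
 * tie on an overlapping resource request.
 */
static void pan_slots_update_prios(struct pan_device *dev)
{
	struct pan_group *group;
	unsigned int prio = dev->num_slots;

	list_for_each_entry(group, &dev->resident_groups_sorted, node)
		pan_slot_set_prio(group->slot, --prio);
}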
Tvrtko Ursulin Jan. 12, 2023, 6:21 p.m. UTC | #46
On 11/01/2023 17:52, Matthew Brost wrote:
> On Wed, Jan 11, 2023 at 09:09:45AM +0000, Tvrtko Ursulin wrote:

[snip]

>> Anyway, since you are not buying any arguments on paper perhaps you are more
>> open towards testing. If you would adapt gem_wsim for Xe you would be able
>> to spawn N simulated transcode sessions on any Gen11+ machine and try it
>> out.
>>
>> For example:
>>
>> gem_wsim -w benchmarks/wsim/media_load_balance_fhd26u7.wsim -c 36 -r 600
>>
>> That will run you 36 parallel transcoding sessions streams for 600 frames
>> each. No client setup needed whatsoever apart from compiling IGT.
>>
>> In the past that was quite a handy tool to identify scheduling issues, or
>> validate changes against. All workloads with the media prefix have actually
>> been hand crafted by looking at what real media pipelines do with real data.
>> Few years back at least.
>>
> 
> Porting this is non-trivial as this is 2.5k. Also in Xe we are trending
> towards using UMD benchmarks to determine if there are performance problems,
> as in the i915 we had tons of microbenchmarks / IGT benchmarks that we found
> meant absolutely nothing. Can't say if this benchmark falls into that
> category.

I explained what it does so it was supposed to be obvious it is not a 
micro benchmark.

2.5k what, lines of code? Difficulty of adding Xe support does not scale 
with LOC but with how much it uses the kernel API. You'd essentially 
need to handle context/engine creation and different execbuf.

It's not trivial, no, but it would save you downloading gigabytes of test 
streams, building a bunch of tools and libraries etc, and so overall in 
my experience it *significantly* improves the driver development 
turn-around time.

> We have VK and compute benchmarks running and haven't found any major issues
> yet. The media UMD hasn't been ported because of the VM bind dependency
> so I can't say if there are any issues with the media UMD + Xe.
> 
> What I can do is hack up xe_exec_threads to really hammer Xe - change it to
> 128x xe_engines + 8k execs per thread. Each exec is super simple, it
> just stores a dword. It creates a thread per hardware engine, so on TGL
> this is 5x threads.
> 
> Results below:
> root@DUT025-TGLU:mbrost# xe_exec_threads --r threads-basic
> IGT-Version: 1.26-ge26de4b2 (x86_64) (Linux: 6.1.0-rc1-xe+ x86_64)
> Starting subtest: threads-basic
> Subtest threads-basic: SUCCESS (1.215s)
> root@DUT025-TGLU:mbrost# dumptrace | grep job | wc
>    40960  491520 7401728
> root@DUT025-TGLU:mbrost# dumptrace | grep engine | wc
>      645    7095   82457
> 
> So with 640 xe_engines (5x are VM engines) it takes 1.215 seconds test
> time to run 40960 execs. That seems to indicate we do not have a
> scheduling problem.
> 
> This is 8 core (or at least 8 threads) TGL:
> 
> root@DUT025-TGLU:mbrost# cat /proc/cpuinfo
> ...
> processor       : 7
> vendor_id       : GenuineIntel
> cpu family      : 6
> model           : 140
> model name      : 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
> stepping        : 1
> microcode       : 0x3a
> cpu MHz         : 2344.098
> cache size      : 12288 KB
> physical id     : 0
> siblings        : 8
> core id         : 3
> cpu cores       : 4
> ...
> 
> Enough data to be convinced there is no issue with this design? I can
> also hack up Xe to use fewer GPU schedulers with kthreads but again that
> isn't trivial and doesn't seem necessary based on these results.

Not yet. It's not only about how many somethings per second you can do. 
It is also about what effect it has on the rest of the system.

Anyway I think you said in a different sub-thread that you will move away 
from system_wq, so we can close this one. With that plan at least I don't 
have to worry my mouse will stutter and audio glitch while Xe is 
churning away.

Regards,

Tvrtko
Tvrtko Ursulin Jan. 12, 2023, 6:43 p.m. UTC | #47
On 11/01/2023 19:40, Matthew Brost wrote:
> On Wed, Jan 11, 2023 at 08:50:37AM +0000, Tvrtko Ursulin wrote:

[snip]

>> This example is where it would hurt on large systems. Imagine only an even
>> wider media transcode card...
>>
>> Second example is only a single engine class used (3d desktop?) but with a
>> bunch of not-runnable jobs queued and waiting on a fence to signal. Implicit
>> or explicit dependencies doesn't matter. Then the fence signals and call
>> backs run. N work items get scheduled, but they all submit to the same HW
>> engine. So we end up with:
>>
>>          /-- wi1 --\
>>         / ..     .. \
>>   cb --+---  wi.. ---+-- rq1 -- .. -- rqN
>>         \ ..    ..  /
>>          \-- wiN --/
>>
>>
>> All that we have achieved is waking up N CPUs to contend on the same lock
>> and effectively insert the job into the same single HW queue. I don't see
>> any positives there.
>>
> 
> I've said this before, the CT channel in practice isn't going to be full
> so the section of code protected by the mutex is really, really small.
> The mutex really shouldn't ever have contention. Also, does a mutex spin
> for a small period of time before going to sleep? I seem to recall some
> type of core lock did this; if we can use a lock that spins for a short
> period of time this argument falls apart.

This argument already fell apart when we established it's the system_wq 
and not the unbound one. So a digression only - it did not fall apart 
because of the CT channel never being congested; there would still be 
the question of what the point is of waking up N cpus when there is a 
single work channel in the backend.

You would have been able to bypass all that by inserting work items 
directly, not via the scheduler workers. I thought that was what Jason 
was implying when he mentioned that a better frontend/backend drm 
scheduler split was considered at some point.

Because for 1:1:1, where the GuC is truly 1, it does seem it would work 
better if that sort of a split enabled you to queue directly into the 
backend, bypassing the kthread/worker wait_on/wake_up dance.

Would that work? From drm_sched_entity_push_job directly to the backend 
- not waking up but *calling* the equivalent of drm_sched_main.
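
As a minimal sketch of that idea (xxx_* names invented, not the actual 
drm scheduler API): with a strict 1:1 entity:scheduler mapping there is 
no inter-entity arbitration to do, so the pushing thread could run the 
backend inline instead of waking anything.

static void xxx_sched_push_job_direct(struct drm_sched_job *job)
{
	struct drm_gpu_scheduler *sched = job->sched;

	drm_sched_entity_push_job(job);	/* existing frontend bookkeeping */

	/* Hypothetical backend entry: one pass of the drm_sched_main()
	 * loop body, executed in the caller's context. */
	xxx_sched_run_backend(sched);
}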

>> Right, that's all solid I think. My takeaway is that frontend priority
>> sorting and that stuff isn't needed and that is okay. And that there are
>> multiple options to maybe improve the drm scheduler, like the
>> aforementioned making it deal with out-of-order completion, or splitting
>> it into functional components, or splitting frontend/backend as you
>> suggested. For most of them the cost vs benefit is more or less not
>> completely clear, nor is it clear how much effort was invested to look
>> into them.
>>
>> One thing I missed from this explanation is how a drm_scheduler per
>> engine class interferes with the high-level concepts. And I did not
>> manage to pick up on what exactly the TDR problem is in that case.
>> Maybe the two are one and the same.
>>
>> Bottom line is I still have the concern that the conversion to
>> kworkers has an opportunity to regress. Possibly more opportunity for
>> some Xe use cases than for other vendors, since they would still be
>> using per-physical-engine / queue scheduler instances.
>>
> 
> We certainly don't want to affect other vendors, but I haven't yet
> heard any pushback from them. I don't think speculating about potential
> problems is helpful.

I haven't had any push back on the drm cgroup controller either. :D

>> And to put my money where my mouth is, I will try to put testing Xe
>> inside the full-blown ChromeOS environment into my team's plans. It
>> would probably also be beneficial if the Xe team could take a look at
>> the real-world behaviour of the extreme transcode use cases too, if
>> the stack is ready for that. It would be better to know earlier rather
>> than later if there is a fundamental issue.
>>
> 
> We don't have a media UMD yet, so it will be tough to test at this
> point in time. I'm also not sure when Xe is going to be POR for a
> Chrome product, so porting Xe into ChromeOS likely isn't a top priority
> for your team. I know from experience that porting things into ChromeOS
> isn't trivial, as I've supported several of these efforts. Not saying
> don't do this, just mentioning the realities of what you are
> suggesting.

I know, I only said I'd put it in the plans, not that it will happen 
tomorrow.

Regards,

Tvrtko
John Harrison Jan. 13, 2023, 12:39 a.m. UTC | #48
On 1/11/2023 14:56, Jason Ekstrand wrote:
> On Wed, Jan 11, 2023 at 4:32 PM Matthew Brost 
> <matthew.brost@intel.com> wrote:
>
>     On Wed, Jan 11, 2023 at 04:18:01PM -0600, Jason Ekstrand wrote:
>     > On Wed, Jan 11, 2023 at 2:50 AM Tvrtko Ursulin <
>     > tvrtko.ursulin@linux.intel.com> wrote:
>     >
>     > >
>     [snip]
>     > >
>     > > Typically is the key here. But I am not sure it is good
>     enough. Consider
>     > > this example - Intel Flex 170:
>     > >
>     > >   * Delivers up to 36 streams 1080p60 transcode throughput per
>     card.
>     > >   * When scaled to 10 cards in a 4U server configuration, it
>     can support
>     > > up to 360 streams of HEVC/HEVC 1080p60 transcode throughput.
>     > >
>     >
>     > I had a feeling it was going to be media.... 
Matthew Brost Jan. 18, 2023, 3:06 a.m. UTC | #49
On Thu, Jan 12, 2023 at 04:39:32PM -0800, John Harrison wrote:
> On 1/11/2023 14:56, Jason Ekstrand wrote:
> > On Wed, Jan 11, 2023 at 4:32 PM Matthew Brost <matthew.brost@intel.com>
> > wrote:
> > 
> >     On Wed, Jan 11, 2023 at 04:18:01PM -0600, Jason Ekstrand wrote:
> >     > On Wed, Jan 11, 2023 at 2:50 AM Tvrtko Ursulin <
> >     > tvrtko.ursulin@linux.intel.com> wrote:
> >     >
> >     > >
> >     [snip]
> >     > >
> >     > > Typically is the key here. But I am not sure it is good
> >     enough. Consider
> >     > > this example - Intel Flex 170:
> >     > >
> >     > >   * Delivers up to 36 streams 1080p60 transcode throughput per
> >     card.
> >     > >   * When scaled to 10 cards in a 4U server configuration, it
> >     can support
> >     > > up to 360 streams of HEVC/HEVC 1080p60 transcode throughput.
> >     > >
> >     >
> >     > I had a feeling it was going to be media.... 
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index f60753f97ac5..9c2a10aeb0b3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1489,9 +1489,9 @@  static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)
 	for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
 		struct amdgpu_ring *ring = adev->rings[i];
 
-		if (!ring || !ring->sched.thread)
+		if (!ring || !ring->sched.ready)
 			continue;
-		kthread_park(ring->sched.thread);
+		drm_sched_run_wq_stop(&ring->sched);
 	}
 
 	seq_printf(m, "run ib test:\n");
@@ -1505,9 +1505,9 @@  static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)
 	for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
 		struct amdgpu_ring *ring = adev->rings[i];
 
-		if (!ring || !ring->sched.thread)
+		if (!ring || !ring->sched.ready)
 			continue;
-		kthread_unpark(ring->sched.thread);
+		drm_sched_run_wq_start(&ring->sched);
 	}
 
 	up_write(&adev->reset_domain->sem);
@@ -1727,7 +1727,7 @@  static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
 
 	ring = adev->rings[val];
 
-	if (!ring || !ring->funcs->preempt_ib || !ring->sched.thread)
+	if (!ring || !ring->funcs->preempt_ib || !ring->sched.ready)
 		return -EINVAL;
 
 	/* the last preemption failed */
@@ -1745,7 +1745,7 @@  static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
 		goto pro_end;
 
 	/* stop the scheduler */
-	kthread_park(ring->sched.thread);
+	drm_sched_run_wq_stop(&ring->sched);
 
 	/* preempt the IB */
 	r = amdgpu_ring_preempt_ib(ring);
@@ -1779,7 +1779,7 @@  static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
 
 failure:
 	/* restart the scheduler */
-	kthread_unpark(ring->sched.thread);
+	drm_sched_run_wq_start(&ring->sched);
 
 	up_read(&adev->reset_domain->sem);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 076ae400d099..9552929ccf87 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4577,7 +4577,7 @@  bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 		struct amdgpu_ring *ring = adev->rings[i];
 
-		if (!ring || !ring->sched.thread)
+		if (!ring || !ring->sched.ready)
 			continue;
 
 		spin_lock(&ring->sched.job_list_lock);
@@ -4708,7 +4708,7 @@  int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 		struct amdgpu_ring *ring = adev->rings[i];
 
-		if (!ring || !ring->sched.thread)
+		if (!ring || !ring->sched.ready)
 			continue;
 
 		/*clear job fence from fence drv to avoid force_completion
@@ -5247,7 +5247,7 @@  int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 			struct amdgpu_ring *ring = tmp_adev->rings[i];
 
-			if (!ring || !ring->sched.thread)
+			if (!ring || !ring->sched.ready)
 				continue;
 
 			drm_sched_stop(&ring->sched, job ? &job->base : NULL);
@@ -5321,7 +5321,7 @@  int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 			struct amdgpu_ring *ring = tmp_adev->rings[i];
 
-			if (!ring || !ring->sched.thread)
+			if (!ring || !ring->sched.ready)
 				continue;
 
 			drm_sched_start(&ring->sched, true);
@@ -5648,7 +5648,7 @@  pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 			struct amdgpu_ring *ring = adev->rings[i];
 
-			if (!ring || !ring->sched.thread)
+			if (!ring || !ring->sched.ready)
 				continue;
 
 			drm_sched_stop(&ring->sched, NULL);
@@ -5776,7 +5776,7 @@  void amdgpu_pci_resume(struct pci_dev *pdev)
 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 		struct amdgpu_ring *ring = adev->rings[i];
 
-		if (!ring || !ring->sched.thread)
+		if (!ring || !ring->sched.ready)
 			continue;
 
 		drm_sched_start(&ring->sched, true);
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 27d52ffbb808..8c64045d0692 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -44,7 +44,6 @@ 
  * The jobs in a entity are always scheduled in the order that they were pushed.
  */
 
-#include <linux/kthread.h>
 #include <linux/wait.h>
 #include <linux/sched.h>
 #include <linux/completion.h>
@@ -251,6 +250,53 @@  drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
 	return rb ? rb_entry(rb, struct drm_sched_entity, rb_tree_node) : NULL;
 }
 
+/**
+ * drm_sched_run_wq_stop - stop scheduler run worker
+ *
+ * @sched: scheduler instance to stop run worker
+ */
+void drm_sched_run_wq_stop(struct drm_gpu_scheduler *sched)
+{
+	sched->pause_run_wq = true;
+	smp_wmb();
+
+	cancel_work_sync(&sched->work_run);
+}
+EXPORT_SYMBOL(drm_sched_run_wq_stop);
+
+/**
+ * drm_sched_run_wq_start - start scheduler run worker
+ *
+ * @sched: scheduler instance to start run worker
+ */
+void drm_sched_run_wq_start(struct drm_gpu_scheduler *sched)
+{
+	sched->pause_run_wq = false;
+	smp_wmb();
+
+	queue_work(sched->run_wq, &sched->work_run);
+}
+EXPORT_SYMBOL(drm_sched_run_wq_start);
+
+/**
+ * drm_sched_run_wq_queue - queue scheduler run worker
+ *
+ * @sched: scheduler instance to queue run worker
+ */
+static void drm_sched_run_wq_queue(struct drm_gpu_scheduler *sched)
+{
+	smp_rmb();
+
+	/*
+	 * Try not to schedule work if pause_run_wq set but not the end of world
+	 * if we do as either it will be cancelled by the above
+	 * cancel_work_sync, or drm_sched_main turns into a NOP while
+	 * pause_run_wq is set.
+	 */
+	if (!sched->pause_run_wq)
+		queue_work(sched->run_wq, &sched->work_run);
+}
+
 /**
  * drm_sched_job_done - complete a job
  * @s_job: pointer to the job which is done
@@ -270,7 +316,7 @@  static void drm_sched_job_done(struct drm_sched_job *s_job)
 	dma_fence_get(&s_fence->finished);
 	drm_sched_fence_finished(s_fence);
 	dma_fence_put(&s_fence->finished);
-	wake_up_interruptible(&sched->wake_up_worker);
+	drm_sched_run_wq_queue(sched);
 }
 
 /**
@@ -433,7 +479,7 @@  void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
 {
 	struct drm_sched_job *s_job, *tmp;
 
-	kthread_park(sched->thread);
+	drm_sched_run_wq_stop(sched);
 
 	/*
 	 * Reinsert back the bad job here - now it's safe as
@@ -546,7 +592,7 @@  void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
 		spin_unlock(&sched->job_list_lock);
 	}
 
-	kthread_unpark(sched->thread);
+	drm_sched_run_wq_start(sched);
 }
 EXPORT_SYMBOL(drm_sched_start);
 
@@ -831,7 +877,7 @@  static bool drm_sched_ready(struct drm_gpu_scheduler *sched)
 void drm_sched_wakeup(struct drm_gpu_scheduler *sched)
 {
 	if (drm_sched_ready(sched))
-		wake_up_interruptible(&sched->wake_up_worker);
+		drm_sched_run_wq_queue(sched);
 }
 
 /**
@@ -941,60 +987,42 @@  drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
 }
 EXPORT_SYMBOL(drm_sched_pick_best);
 
-/**
- * drm_sched_blocked - check if the scheduler is blocked
- *
- * @sched: scheduler instance
- *
- * Returns true if blocked, otherwise false.
- */
-static bool drm_sched_blocked(struct drm_gpu_scheduler *sched)
-{
-	if (kthread_should_park()) {
-		kthread_parkme();
-		return true;
-	}
-
-	return false;
-}
-
 /**
  * drm_sched_main - main scheduler thread
  *
  * @param: scheduler instance
- *
- * Returns 0.
  */
-static int drm_sched_main(void *param)
+static void drm_sched_main(struct work_struct *w)
 {
-	struct drm_gpu_scheduler *sched = (struct drm_gpu_scheduler *)param;
+	struct drm_gpu_scheduler *sched =
+		container_of(w, struct drm_gpu_scheduler, work_run);
 	int r;
 
-	sched_set_fifo_low(current);
-
-	while (!kthread_should_stop()) {
-		struct drm_sched_entity *entity = NULL;
+	while (!READ_ONCE(sched->pause_run_wq)) {
+		struct drm_sched_entity *entity;
 		struct drm_sched_fence *s_fence;
 		struct drm_sched_job *sched_job;
 		struct dma_fence *fence;
-		struct drm_sched_job *cleanup_job = NULL;
+		struct drm_sched_job *cleanup_job;
 
-		wait_event_interruptible(sched->wake_up_worker,
-					 (cleanup_job = drm_sched_get_cleanup_job(sched)) ||
-					 (!drm_sched_blocked(sched) &&
-					  (entity = drm_sched_select_entity(sched))) ||
-					 kthread_should_stop());
+		cleanup_job = drm_sched_get_cleanup_job(sched);
+		entity = drm_sched_select_entity(sched);
 
 		if (cleanup_job)
 			sched->ops->free_job(cleanup_job);
 
-		if (!entity)
+		if (!entity) {
+			if (!cleanup_job)
+				break;
 			continue;
+		}
 
 		sched_job = drm_sched_entity_pop_job(entity);
 
 		if (!sched_job) {
 			complete_all(&entity->entity_idle);
+			if (!cleanup_job)
+				break;
 			continue;
 		}
 
@@ -1022,14 +1050,14 @@  static int drm_sched_main(void *param)
 					  r);
 		} else {
 			if (IS_ERR(fence))
-				dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
+				dma_fence_set_error(&s_fence->finished,
+						    PTR_ERR(fence));
 
 			drm_sched_job_done(sched_job);
 		}
 
 		wake_up(&sched->job_scheduled);
 	}
-	return 0;
 }
 
 /**
@@ -1054,35 +1082,28 @@  int drm_sched_init(struct drm_gpu_scheduler *sched,
 		   long timeout, struct workqueue_struct *timeout_wq,
 		   atomic_t *score, const char *name, struct device *dev)
 {
-	int i, ret;
+	int i;
 	sched->ops = ops;
 	sched->hw_submission_limit = hw_submission;
 	sched->name = name;
 	sched->timeout = timeout;
 	sched->timeout_wq = timeout_wq ? : system_wq;
+	sched->run_wq = system_wq;	/* FIXME: Let user pass this in */
 	sched->hang_limit = hang_limit;
 	sched->score = score ? score : &sched->_score;
 	sched->dev = dev;
 	for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_COUNT; i++)
 		drm_sched_rq_init(sched, &sched->sched_rq[i]);
 
-	init_waitqueue_head(&sched->wake_up_worker);
 	init_waitqueue_head(&sched->job_scheduled);
 	INIT_LIST_HEAD(&sched->pending_list);
 	spin_lock_init(&sched->job_list_lock);
 	atomic_set(&sched->hw_rq_count, 0);
 	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
+	INIT_WORK(&sched->work_run, drm_sched_main);
 	atomic_set(&sched->_score, 0);
 	atomic64_set(&sched->job_id_count, 0);
-
-	/* Each scheduler will run on a seperate kernel thread */
-	sched->thread = kthread_run(drm_sched_main, sched, sched->name);
-	if (IS_ERR(sched->thread)) {
-		ret = PTR_ERR(sched->thread);
-		sched->thread = NULL;
-		DRM_DEV_ERROR(sched->dev, "Failed to create scheduler for %s.\n", name);
-		return ret;
-	}
+	sched->pause_run_wq = false;
 
 	sched->ready = true;
 	return 0;
@@ -1101,8 +1122,7 @@  void drm_sched_fini(struct drm_gpu_scheduler *sched)
 	struct drm_sched_entity *s_entity;
 	int i;
 
-	if (sched->thread)
-		kthread_stop(sched->thread);
+	drm_sched_run_wq_stop(sched);
 
 	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
 		struct drm_sched_rq *rq = &sched->sched_rq[i];
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index ca857ec9e7eb..ff50f3c289cd 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -456,17 +456,16 @@  struct drm_sched_backend_ops {
  * @timeout: the time after which a job is removed from the scheduler.
  * @name: name of the ring for which this scheduler is being used.
  * @sched_rq: priority wise array of run queues.
- * @wake_up_worker: the wait queue on which the scheduler sleeps until a job
- *                  is ready to be scheduled.
  * @job_scheduled: once @drm_sched_entity_do_release is called the scheduler
  *                 waits on this wait queue until all the scheduled jobs are
  *                 finished.
  * @hw_rq_count: the number of jobs currently in the hardware queue.
  * @job_id_count: used to assign unique id to the each job.
+ * @run_wq: workqueue used to queue @work_run
  * @timeout_wq: workqueue used to queue @work_tdr
+ * @work_run: schedules jobs and cleans up entities
  * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
  *            timeout interval is over.
- * @thread: the kthread on which the scheduler which run.
  * @pending_list: the list of jobs which are currently in the job queue.
  * @job_list_lock: lock to protect the pending_list.
  * @hang_limit: once the hangs by a job crosses this limit then it is marked
@@ -475,6 +474,7 @@  struct drm_sched_backend_ops {
  * @_score: score used when the driver doesn't provide one
  * @ready: marks if the underlying HW is ready to work
  * @free_guilty: A hit to time out handler to free the guilty job.
+ * @pause_run_wq: pause queuing of @work_run on @run_wq
  * @dev: system &struct device
  *
  * One scheduler is implemented for each hardware ring.
@@ -485,13 +485,13 @@  struct drm_gpu_scheduler {
 	long				timeout;
 	const char			*name;
 	struct drm_sched_rq		sched_rq[DRM_SCHED_PRIORITY_COUNT];
-	wait_queue_head_t		wake_up_worker;
 	wait_queue_head_t		job_scheduled;
 	atomic_t			hw_rq_count;
 	atomic64_t			job_id_count;
+	struct workqueue_struct		*run_wq;
 	struct workqueue_struct		*timeout_wq;
+	struct work_struct		work_run;
 	struct delayed_work		work_tdr;
-	struct task_struct		*thread;
 	struct list_head		pending_list;
 	spinlock_t			job_list_lock;
 	int				hang_limit;
@@ -499,6 +499,7 @@  struct drm_gpu_scheduler {
 	atomic_t                        _score;
 	bool				ready;
 	bool				free_guilty;
+	bool				pause_run_wq;
 	struct device			*dev;
 };
 
@@ -529,6 +530,8 @@  void drm_sched_entity_modify_sched(struct drm_sched_entity *entity,
 
 void drm_sched_job_cleanup(struct drm_sched_job *job);
 void drm_sched_wakeup(struct drm_gpu_scheduler *sched);
+void drm_sched_run_wq_stop(struct drm_gpu_scheduler *sched);
+void drm_sched_run_wq_start(struct drm_gpu_scheduler *sched);
 void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad);
 void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery);
 void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched);