@@ -1298,6 +1298,8 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev);
bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev);
int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
struct amdgpu_job* job);
+int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
+ struct amdgpu_job *job);
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev);
int amdgpu_device_pci_reset(struct amdgpu_device *adev);
bool amdgpu_device_need_post(struct amdgpu_device *adev);
@@ -5033,7 +5033,7 @@ static void amdgpu_device_recheck_guilty_jobs(
* Returns 0 for success or an error on failure.
*/
-int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
+int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
struct amdgpu_job *job)
{
struct list_head device_list, *device_list_handle = NULL;
@@ -5292,6 +5292,37 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
return r;
}
+/**
+ * struct amdgpu_recover_work_struct - one GPU recovery request run as a work item
+ * @base: embedded work item; the work handler gets back to the enclosing
+ *        request from it with container_of()
+ * @adev: device argument forwarded to amdgpu_device_gpu_recover_imp()
+ * @job: job argument forwarded to amdgpu_device_gpu_recover_imp()
+ * @ret: return value of amdgpu_device_gpu_recover_imp(), written by the
+ *       work handler and read back by the submitter
+ */
+struct amdgpu_recover_work_struct {
+ struct work_struct base;
+ struct amdgpu_device *adev;
+ struct amdgpu_job *job;
+ int ret;
+};
+
+/*
+ * Work handler for a queued recovery request: locate the request that
+ * embeds @work, run the recovery implementation with the arguments stored
+ * in it, and record the result in the request for the submitter to read.
+ */
+static void amdgpu_device_queue_gpu_recover_work(struct work_struct *work)
+{
+	struct amdgpu_recover_work_struct *req;
+
+	req = container_of(work, struct amdgpu_recover_work_struct, base);
+	req->ret = amdgpu_device_gpu_recover_imp(req->adev, req->job);
+}
+/**
+ * amdgpu_device_gpu_recover - serialize GPU recovery into the reset domain wq
+ *
+ * @adev: device to recover; the request is queued on adev->reset_domain.wq
+ * @job: job handed through unchanged to amdgpu_device_gpu_recover_imp()
+ *
+ * Wraps the arguments in an on-stack work item, queues it on the reset
+ * domain's single threaded workqueue and blocks until the handler has run,
+ * so the caller still gets the recovery result synchronously.
+ *
+ * NOTE(review): because this flushes the work it queued, it presumably must
+ * not be called from a handler already running on that same workqueue; such
+ * callers should use amdgpu_device_gpu_recover_imp() directly - confirm.
+ *
+ * Returns the value returned by amdgpu_device_gpu_recover_imp(), or -EAGAIN
+ * if the work item could not be queued.
+ */
+int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
+			      struct amdgpu_job *job)
+{
+	struct amdgpu_recover_work_struct work = {.adev = adev, .job = job};
+	int r = -EAGAIN;
+
+	/*
+	 * The work item lives in this stack frame, so it needs the on-stack
+	 * initializer (and the matching destroy below) to keep the work
+	 * debug-object tracking consistent.
+	 */
+	INIT_WORK_ONSTACK(&work.base, amdgpu_device_queue_gpu_recover_work);
+
+	if (queue_work(adev->reset_domain.wq, &work.base)) {
+		/* Do not leave this frame while the handler may still use it */
+		flush_work(&work.base);
+		r = work.ret;
+	}
+
+	/* Required on every exit path, including the failed-to-queue one */
+	destroy_work_on_stack(&work.base);
+
+	return r;
+}
+
/**
* amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot
*
@@ -63,7 +63,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
ti.process_name, ti.tgid, ti.task_name, ti.pid);
if (amdgpu_device_should_recover_gpu(ring->adev)) {
- amdgpu_device_gpu_recover(ring->adev, job);
+ amdgpu_device_gpu_recover_imp(ring->adev, job);
} else {
drm_sched_suspend_timeout(&ring->sched);
if (amdgpu_sriov_vf(adev))