@@ -1156,9 +1156,12 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
struct kfd_process_device *pdd;
int retval = 0;
+	/* gfx1103 APU fails to remove the queue, usually after 10-50 attempts */
+ if (dqm->dev->adev->flags & AMD_IS_APU)
+ goto out;
dqm_lock(dqm);
if (qpd->evicted++ > 0) /* already evicted, do nothing */
- goto out;
+ goto out_unlock;
pdd = qpd_to_pdd(qpd);
@@ -1167,7 +1170,7 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
* Skip queue eviction on process eviction.
*/
if (!pdd->drm_priv)
- goto out;
+ goto out_unlock;
pr_debug_ratelimited("Evicting PASID 0x%x queues\n",
pdd->process->pasid);
@@ -1188,7 +1191,7 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
if (retval) {
dev_err(dev, "Failed to evict queue %d\n",
q->properties.queue_id);
- goto out;
+ goto out_unlock;
}
}
}
@@ -1200,8 +1203,9 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
USE_DEFAULT_GRACE_PERIOD);
-out:
+out_unlock:
dqm_unlock(dqm);
+out:
return retval;
}
@@ -1295,6 +1299,9 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
uint64_t eviction_duration;
int retval = 0;
+	/* gfx1103 APU fails to remove the queue, usually after 10-50 attempts */
+ if (dqm->dev->adev->flags & AMD_IS_APU)
+ goto out;
pdd = qpd_to_pdd(qpd);
dqm_lock(dqm);
@@ -1344,8 +1351,9 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
atomic64_add(eviction_duration, &pdd->evict_duration_counter);
vm_not_acquired:
qpd->evicted = 0;
-out:
+out_unlock:
dqm_unlock(dqm);
+out:
return retval;
}
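
For readability, here is a condensed sketch (not the full function) of what the control flow in evict_process_queues_cpsch() looks like after this change; the per-queue eviction body and intermediate logic are elided and summarized in comments:

/* Condensed control-flow sketch of evict_process_queues_cpsch() after the
 * patch; intermediate logic is elided. APUs bail out before dqm_lock(), so
 * the new out_unlock label is only reached on paths that hold the lock.
 */
static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
				      struct qcm_process_device *qpd)
{
	int retval = 0;

	/* gfx1103 APU: MES REMOVE_QUEUE becomes unreliable, skip eviction */
	if (dqm->dev->adev->flags & AMD_IS_APU)
		goto out;

	dqm_lock(dqm);
	if (qpd->evicted++ > 0)	/* already evicted, do nothing */
		goto out_unlock;

	/* ... evict each active queue; any failure jumps to out_unlock ... */

out_unlock:
	dqm_unlock(dqm);
out:
	return retval;
}

restore_process_queues_cpsch() gets the same treatment: the APU check before the lock, and a separate out_unlock label for the paths that hold the DQM lock.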
AMD gfx1103/M780 iGPU will eventually crash while performing pytorch ML/AI operations on the rocm sdk stack. The crash randomly either lets the Linux desktop recover after killing the app, freezes the desktop, or resets back to the login screen. An easy way to trigger the problem is to build the ML/AI support for the gfx1103 M780 iGPU with the rocm sdk builder and then run the test application in a loop.

Additional trace messages helped to find out that the error always happens in the same location, when the kernel ends up periodically calling the evict_process_queues_cpsch and restore_process_queues_cpsch methods and asks MES to remove and restore the queues in a loop. The crash requires a small but random number of these evict and restore calls (usually around 10-50) before the error happens in the kernel. In the gfx1103 case there are 3 queues that are evicted and restored, and the error always happens on the second one in the list, the queue with doorbell 0x1002. Adding delays does not help to avoid the crash, neither in the test application between calls (1 second) nor in the kernel loop that removes the queues one by one (mdelay(10)).

I tested multiple other GPUs, and a similar error could not be triggered on gfx1010 (rx 5900), gfx1030 (rx 6800), gfx1035 (M680 iGPU) or gfx1102 (RX 7700S). Of these devices only the gfx1102 uses the same codepath for calling the MES firmware.

Testing has mostly been done on 6.12 rc and 6.12 final kernels, but the same problem can also be triggered at least on the 6.8 and 6.11 kernels. After the fix I have run the application in a loop multiple times, over 1000 iterations each, without seeing the error happen again.

Original bug and test case was made by jrl290 in rocm sdk builder bug issue 141.
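The trace lines quoted below come from temporary instrumentation added while debugging, not from this patch. A minimal sketch of that kind of print, assuming the kfd queue structures and a caller string passed in purely for illustration (the helper name, parameters and exact wording here are assumptions), would look roughly like:

/* Hypothetical debug helper used only while bisecting the failure; not part
 * of this patch. Prints one line per MES add/remove, matching the log below.
 */
static void dbg_trace_mes_queue(struct device_queue_manager *dqm,
				struct queue *q, bool added,
				const char *caller)
{
	dev_info(dqm->dev->adev->dev,
		 "%s hardware queue %s MES, doorbell=0x%x, queue: %u, caller: %s\n",
		 added ? "add_queue_mes added" : "remove_queue_mes removed",
		 added ? "to" : "from",
		 q->properties.doorbell_off,
		 q->properties.queue_id,
		 caller);
}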
[ 948.324174] amdgpu 0000:c4:00.0: amdgpu: add_queue_mes added hardware queue to MES, doorbell=0x1202, queue: 2, caller: restore_process_queues_cpsch
[ 948.334344] amdgpu 0000:c4:00.0: amdgpu: add_queue_mes added hardware queue to MES, doorbell=0x1002, queue: 1, caller: restore_process_queues_cpsch
[ 948.344499] amdgpu 0000:c4:00.0: amdgpu: add_queue_mes added hardware queue to MES, doorbell=0x1000, queue: 0, caller: restore_process_queues_cpsch
[ 952.380614] amdgpu 0000:c4:00.0: amdgpu: remove_queue_mes removed hardware queue from MES, doorbell=0x1202, queue: 2, caller: evict_process_queues_cpsch
[ 952.391330] amdgpu 0000:c4:00.0: amdgpu: remove_queue_mes removed hardware queue from MES, doorbell=0x1002, queue: 1, caller: evict_process_queues_cpsch
[ 952.401634] amdgpu 0000:c4:00.0: amdgpu: remove_queue_mes removed hardware queue from MES, doorbell=0x1000, queue: 0, caller: evict_process_queues_cpsch
[ 952.414507] amdgpu 0000:c4:00.0: amdgpu: add_queue_mes added hardware queue to MES, doorbell=0x1202, queue: 2, caller: restore_process_queues_cpsch
[ 952.424618] amdgpu 0000:c4:00.0: amdgpu: add_queue_mes added hardware queue to MES, doorbell=0x1002, queue: 1, caller: restore_process_queues_cpsch
[ 952.434922] amdgpu 0000:c4:00.0: amdgpu: add_queue_mes added hardware queue to MES, doorbell=0x1000, queue: 0, caller: restore_process_queues_cpsch
[ 952.446272] amdgpu 0000:c4:00.0: amdgpu: remove_queue_mes removed hardware queue from MES, doorbell=0x1202, queue: 2, caller: evict_process_queues_cpsch
[ 954.460341] amdgpu 0000:c4:00.0: amdgpu: MES failed to respond to msg=REMOVE_QUEUE
[ 954.460356] amdgpu 0000:c4:00.0: amdgpu: remove_queue_mes failed to remove hardware queue from MES, doorbell=0x1002, queue: 1, caller: evict_process_queues_cpsch
[ 954.460360] amdgpu 0000:c4:00.0: amdgpu: MES might be in unrecoverable state, issue a GPU reset
[ 954.460366] amdgpu 0000:c4:00.0: amdgpu: Failed to evict queue 1
[ 954.460368] amdgpu 0000:c4:00.0: amdgpu: Failed to evict process queues
[ 954.460439] amdgpu 0000:c4:00.0: amdgpu: GPU reset begin!
[ 954.460464] amdgpu 0000:c4:00.0: amdgpu: remove_all_queues_mes: Failed to remove queue 0 for dev 5257
[ 954.460515] amdgpu 0000:c4:00.0: amdgpu: Dumping IP State
[ 954.462637] amdgpu 0000:c4:00.0: amdgpu: Dumping IP State Completed
[ 955.865591] amdgpu: process_termination_cpsch started
[ 955.866432] amdgpu: process_termination_cpsch started
[ 955.866445] amdgpu 0000:c4:00.0: amdgpu: Failed to remove queue 0
[ 956.503043] amdgpu 0000:c4:00.0: amdgpu: MES failed to respond to msg=REMOVE_QUEUE
[ 956.503059] [drm:amdgpu_mes_unmap_legacy_queue [amdgpu]] *ERROR* failed to unmap legacy queue
[ 958.507491] amdgpu 0000:c4:00.0: amdgpu: MES failed to respond to msg=REMOVE_QUEUE
[ 958.507507] [drm:amdgpu_mes_unmap_legacy_queue [amdgpu]] *ERROR* failed to unmap legacy queue
[ 960.512077] amdgpu 0000:c4:00.0: amdgpu: MES failed to respond to msg=REMOVE_QUEUE
[ 960.512093] [drm:amdgpu_mes_unmap_legacy_queue [amdgpu]] *ERROR* failed to unmap legacy queue
[ 960.785816] [drm:gfx_v11_0_hw_fini [amdgpu]] *ERROR* failed to halt cp gfx

Signed-off-by: Mika Laitio <lamikr@gmail.com>
---
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)