@@ -3320,7 +3320,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, unsigned int pasid,
addr /= AMDGPU_GPU_PAGE_SIZE;
if (!amdgpu_noretry && is_compute_context &&
- !svm_range_restore_pages(adev, pasid, addr)) {
+ !svm_range_restore_pages(adev, vm, pasid, addr)) {
amdgpu_bo_unref(&root);
return true;
}
@@ -441,6 +441,7 @@ svm_migrate_vma_to_vram(struct amdgpu_device *adev, struct svm_range *prange,
* svm_migrate_ram_to_vram - migrate svm range from system to device
* @prange: range structure
* @best_loc: the device to migrate to
+ * @mm: the process mm structure
*
* Context: Process context, caller hold mm->mmap_sem and prange->lock and take
* svms srcu read lock.
@@ -448,12 +449,12 @@ svm_migrate_vma_to_vram(struct amdgpu_device *adev, struct svm_range *prange,
* Return:
* 0 - OK, otherwise error code
*/
-int svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc)
+int svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc,
+ struct mm_struct *mm)
{
unsigned long addr, start, end;
struct vm_area_struct *vma;
struct amdgpu_device *adev;
- struct mm_struct *mm;
int r = 0;
if (prange->actual_loc == best_loc) {
@@ -475,8 +476,6 @@ int svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc)
start = prange->it_node.start << PAGE_SHIFT;
end = (prange->it_node.last + 1) << PAGE_SHIFT;
- mm = current->mm;
-
for (addr = start; addr < end;) {
unsigned long next;
@@ -740,12 +739,26 @@ static vm_fault_t svm_migrate_to_ram(struct vm_fault *vmf)
list_for_each_entry(prange, &list, update_list) {
mutex_lock(&prange->mutex);
r = svm_migrate_vram_to_ram(prange, vma->vm_mm);
- mutex_unlock(&prange->mutex);
if (r) {
pr_debug("failed %d migrate [0x%lx 0x%lx] to ram\n", r,
prange->it_node.start, prange->it_node.last);
- goto out_srcu;
+ goto next;
}
+
+ /* xnack off, svm_range_restore_work will update GPU mapping */
+ if (!p->xnack_enabled)
+ goto next;
+
+ /* xnack on, update mapping on GPUs with ACCESS_IN_PLACE */
+ r = svm_range_map_to_gpus(prange, true);
+ if (r)
+ pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx]\n",
+ r, prange->svms, prange->it_node.start,
+ prange->it_node.last);
+next:
+ mutex_unlock(&prange->mutex);
+ if (r)
+ break;
}
out_srcu:
@@ -37,7 +37,8 @@ enum MIGRATION_COPY_DIR {
FROM_VRAM_TO_RAM
};
-int svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc);
+int svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc,
+ struct mm_struct *mm);
int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm);
unsigned long
svm_migrate_addr_to_pfn(struct amdgpu_device *adev, unsigned long addr);
@@ -864,6 +864,9 @@ int kfd_process_gpuid_from_gpuidx(struct kfd_process *p,
int kfd_process_gpuidx_from_gpuid(struct kfd_process *p, uint32_t gpu_id);
int kfd_process_device_from_gpuidx(struct kfd_process *p,
uint32_t gpu_idx, struct kfd_dev **gpu);
+int kfd_process_gpuid_from_kgd(struct kfd_process *p,
+ struct amdgpu_device *adev, uint32_t *gpuid,
+ uint32_t *gpuidx);
void kfd_unref_process(struct kfd_process *p);
int kfd_process_evict_queues(struct kfd_process *p);
int kfd_process_restore_queues(struct kfd_process *p);
@@ -1637,6 +1637,22 @@ int kfd_process_device_from_gpuidx(struct kfd_process *p,
return -EINVAL;
}
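+/* kfd_process_gpuid_from_kgd - look up the gpuid and gpu index of the process
+ * device backed by the given kgd/adev. Returns 0 on success, or -EINVAL if
+ * the process has no pdd for that device.
+ */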
+int
+kfd_process_gpuid_from_kgd(struct kfd_process *p, struct amdgpu_device *adev,
+ uint32_t *gpuid, uint32_t *gpuidx)
+{
+ struct kgd_dev *kgd = (struct kgd_dev *)adev;
+ int i;
+
+ for (i = 0; i < p->n_pdds; i++)
+ if (p->pdds[i] && p->pdds[i]->dev->kgd == kgd) {
+ *gpuid = p->pdds[i]->dev->id;
+ *gpuidx = i;
+ return 0;
+ }
+ return -EINVAL;
+}
+
static void evict_process_worker(struct work_struct *work)
{
int ret;
@@ -1153,7 +1153,7 @@ svm_range_map_to_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm,
return r;
}
-static int svm_range_map_to_gpus(struct svm_range *prange, bool reserve_vm)
+int svm_range_map_to_gpus(struct svm_range *prange, bool reserve_vm)
{
DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
struct kfd_process_device *pdd;
@@ -1170,9 +1170,29 @@ static int svm_range_map_to_gpus(struct svm_range *prange, bool reserve_vm)
else
bo_adev = NULL;
- bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
- MAX_GPU_INSTANCE);
p = container_of(prange->svms, struct kfd_process, svms);
+ if (p->xnack_enabled) {
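+ /* xnack on: only map on GPUs with ACCESS_IN_PLACE; GPUs that only
+ * have ACCESS are mapped later, when they take a retry fault on
+ * the range.
+ */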
+ bitmap_copy(bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE);
+
+ /* If the range was prefetched to a GPU, or migrated to a GPU by a
+ * retry fault, and that GPU has the ACCESS attribute for the range,
+ * create the mapping on that GPU as well.
+ */
+ if (prange->actual_loc) {
+ gpuidx = kfd_process_gpuidx_from_gpuid(p,
+ prange->actual_loc);
+ if (gpuidx < 0) {
+ WARN_ONCE(1, "failed get device by id 0x%x\n",
+ prange->actual_loc);
+ return -EINVAL;
+ }
+ if (test_bit(gpuidx, prange->bitmap_access))
+ bitmap_set(bitmap, gpuidx, 1);
+ }
+ } else {
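+ /* xnack off: map on every GPU that has either the ACCESS or
+ * ACCESS_IN_PLACE attribute for the range.
+ */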
+ bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
+ MAX_GPU_INSTANCE);
+ }
for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
r = kfd_process_device_from_gpuidx(p, gpuidx, &dev);
@@ -1678,16 +1698,77 @@ svm_range_from_addr(struct svm_range_list *svms, unsigned long addr)
return container_of(node, struct svm_range, it_node);
}
+/* svm_range_best_restore_location - decide the best fault restore location
+ * @prange: svm range structure
+ * @adev: the GPU on which the vm fault happened
+ *
+ * This is only called when xnack is on, to decide the best location to restore
+ * the range mapping after a GPU vm fault. The caller uses the best location to
+ * migrate the range if its actual location is not the best location, and then
+ * updates the GPU page table mapping to the best location.
+ *
+ * If the faulting GPU is the range preferred location, best_loc is the
+ * preferred location.
+ * If the faulting GPU is in the range ACCESSIBLE bitmap, best_loc is the
+ * faulting GPU.
+ * If the faulting GPU is in the range ACCESSIBLE_IN_PLACE bitmap, then
+ * if the range actual location is system memory, best_loc is the CPU;
+ * if the faulting GPU is on the same XGMI hive as the actual location GPU,
+ * best_loc is the range actual location; otherwise best_loc is the CPU.
+ * Otherwise the faulting GPU has no access to the range, and best_loc is -1.
+ *
+ * Return:
+ * -1 if the faulting GPU has no access to the range,
+ * 0 for the CPU, or a GPU id otherwise
+ */
+static int32_t
+svm_range_best_restore_location(struct svm_range *prange,
+ struct amdgpu_device *adev)
+{
+ struct amdgpu_device *bo_adev;
+ struct kfd_process *p;
+ int32_t gpuidx;
+ uint32_t gpuid;
+ int r;
+
+ p = container_of(prange->svms, struct kfd_process, svms);
+
+ r = kfd_process_gpuid_from_kgd(p, adev, &gpuid, &gpuidx);
+ if (r < 0) {
+ pr_debug("failed to get gpuid from kgd\n");
+ return -1;
+ }
+
+ if (prange->preferred_loc == gpuid)
+ return prange->preferred_loc;
+
+ if (test_bit(gpuidx, prange->bitmap_access))
+ return gpuid;
+
+ if (test_bit(gpuidx, prange->bitmap_aip)) {
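+ /* ACCESS_IN_PLACE: keep the data where it is if the faulting GPU
+ * can reach it over XGMI, otherwise restore to system memory.
+ */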
+ if (!prange->actual_loc)
+ return 0;
+
+ bo_adev = svm_range_get_adev_by_id(prange, prange->actual_loc);
+ if (amdgpu_xgmi_same_hive(adev, bo_adev))
+ return prange->actual_loc;
+ else
+ return 0;
+ }
+
+ return -1;
+}
+
int
-svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
- uint64_t addr)
+svm_range_restore_pages(struct amdgpu_device *adev, struct amdgpu_vm *vm,
+ unsigned int pasid, uint64_t addr)
{
- int r = 0;
- int srcu_idx;
+ struct amdgpu_device *bo_adev;
struct mm_struct *mm = NULL;
- struct svm_range *prange;
struct svm_range_list *svms;
+ struct svm_range *prange;
struct kfd_process *p;
+ int32_t best_loc;
+ int srcu_idx;
+ int r = 0;
p = kfd_lookup_process_by_pasid(pasid);
if (!p) {
@@ -1706,20 +1787,20 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
pr_debug("failed to find prange svms 0x%p address [0x%llx]\n",
svms, addr);
r = -EFAULT;
- goto unlock_out;
+ goto out_srcu_unlock;
}
if (!atomic_read(&prange->invalid)) {
pr_debug("svms 0x%p [0x%lx %lx] already restored\n",
svms, prange->it_node.start, prange->it_node.last);
- goto unlock_out;
+ goto out_srcu_unlock;
}
mm = get_task_mm(p->lead_thread);
if (!mm) {
pr_debug("svms 0x%p failed to get mm\n", svms);
r = -ESRCH;
- goto unlock_out;
+ goto out_srcu_unlock;
}
mmap_read_lock(mm);
@@ -1729,27 +1810,57 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
*/
mutex_lock(&prange->mutex);
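+ /* Restore sequence: pick the best location for the faulting range,
+ * migrate the pages there if needed, validate them, then rebuild the
+ * GPU mappings.
+ */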
+ best_loc = svm_range_best_restore_location(prange, adev);
+ if (best_loc == -1) {
+ pr_debug("svms %p failed get best restore loc [0x%lx 0x%lx]\n",
+ svms, prange->it_node.start, prange->it_node.last);
+ r = -EACCES;
+ goto out_mmput;
+ }
+
+ pr_debug("svms %p [0x%lx 0x%lx] best restore 0x%x, actual loc 0x%x\n",
+ svms, prange->it_node.start, prange->it_node.last, best_loc,
+ prange->actual_loc);
+
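+ /* Migrate only when the range is not already at the best location;
+ * best_loc == 0 means system memory, anything else is a GPU id.
+ */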
+ if (prange->actual_loc != best_loc) {
+ if (best_loc)
+ r = svm_migrate_ram_to_vram(prange, best_loc, mm);
+ else
+ r = svm_migrate_vram_to_ram(prange, mm);
+ if (r) {
+ pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n",
+ r, svms, prange->it_node.start,
+ prange->it_node.last);
+ goto out_mmput;
+ }
+ }
+
r = svm_range_validate(mm, prange);
if (r) {
- pr_debug("failed %d to validate svms 0x%p [0x%lx 0x%lx]\n", r,
+ pr_debug("failed %d to validate svms %p [0x%lx 0x%lx]\n", r,
svms, prange->it_node.start, prange->it_node.last);
-
- goto mmput_out;
+ goto out_mmput;
}
- pr_debug("restoring svms 0x%p [0x%lx %lx] mapping\n",
- svms, prange->it_node.start, prange->it_node.last);
+ if (prange->svm_bo && prange->mm_nodes)
+ bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev);
+ else
+ bo_adev = NULL;
+
+ pr_debug("restoring svms 0x%p [0x%lx %lx] mapping, bo_adev is %s\n",
+ svms, prange->it_node.start, prange->it_node.last,
+ bo_adev ? "not NULL" : "NULL");
r = svm_range_map_to_gpus(prange, true);
if (r)
- pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpu\n", r,
- svms, prange->it_node.start, prange->it_node.last);
+ pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n",
+ r, svms, prange->it_node.start, prange->it_node.last);
-mmput_out:
+out_mmput:
mutex_unlock(&prange->mutex);
mmap_read_unlock(mm);
mmput(mm);
-unlock_out:
+out_srcu_unlock:
srcu_read_unlock(&svms->srcu, srcu_idx);
kfd_unref_process(p);
@@ -1882,7 +1993,7 @@ svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size,
return 0;
}
-/* svm_range_best_location - decide the best actual location
+/* svm_range_best_prefetch_location - decide the best prefetch location
* @prange: svm range structure
*
* For xnack off:
@@ -1904,7 +2015,8 @@ svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size,
* Return:
* 0 for CPU or GPU id
*/
-static uint32_t svm_range_best_location(struct svm_range *prange)
+static uint32_t
+svm_range_best_prefetch_location(struct svm_range *prange)
{
DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
uint32_t best_loc = prange->prefetch_loc;
@@ -1980,7 +2092,7 @@ svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange,
int r = 0;
*migrated = false;
- best_loc = svm_range_best_location(prange);
+ best_loc = svm_range_best_prefetch_location(prange);
if (best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED ||
best_loc == prange->actual_loc)
@@ -2001,10 +2113,10 @@ svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange,
}
pr_debug("migrate from ram to vram\n");
- r = svm_migrate_ram_to_vram(prange, best_loc);
+ r = svm_migrate_ram_to_vram(prange, best_loc, mm);
} else {
pr_debug("migrate from vram to ram\n");
- r = svm_migrate_vram_to_ram(prange, current->mm);
+ r = svm_migrate_vram_to_ram(prange, mm);
}
if (!r)
@@ -125,8 +125,9 @@ int svm_range_vram_node_new(struct amdgpu_device *adev,
void svm_range_vram_node_free(struct svm_range *prange);
int svm_range_split_by_granularity(struct kfd_process *p, unsigned long addr,
struct list_head *list);
-int svm_range_restore_pages(struct amdgpu_device *adev,
+int svm_range_restore_pages(struct amdgpu_device *adev, struct amdgpu_vm *vm,
unsigned int pasid, uint64_t addr);
int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence);
+int svm_range_map_to_gpus(struct svm_range *prange, bool reserve_vm);
#endif /* KFD_SVM_H_ */