@@ -736,6 +736,8 @@ struct svm_range_list {
struct list_head free_list;
struct mutex free_list_lock;
struct mmu_interval_notifier notifier;
+ atomic_t evicted_ranges;
+ struct delayed_work restore_work;
};
/* Process data */
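
The two fields added above follow a common Linux deferred-work pattern: an atomic counter records that evictions happened, and a delayed_work item later drains the counter and restores the ranges. A minimal, self-contained sketch of that pattern, using hypothetical demo_* names rather than the KFD code, could look like this:

#include <linux/atomic.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>

struct demo_range_list {
	atomic_t evicted_ranges;          /* ranges still waiting for restore */
	struct delayed_work restore_work; /* deferred restore handler */
};

static void demo_restore_work(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct demo_range_list *d =
		container_of(dwork, struct demo_range_list, restore_work);

	/* Nothing to do if no eviction happened since the last run */
	if (!atomic_read(&d->evicted_ranges))
		return;

	/* ... revalidate and remap the evicted ranges here ... */

	atomic_set(&d->evicted_ranges, 0);
}

static void demo_init(struct demo_range_list *d)
{
	atomic_set(&d->evicted_ranges, 0);
	INIT_DELAYED_WORK(&d->restore_work, demo_restore_work);
}

static void demo_evict_one(struct demo_range_list *d)
{
	/* The first eviction schedules the deferred restore */
	if (atomic_inc_return(&d->evicted_ranges) == 1)
		schedule_delayed_work(&d->restore_work, msecs_to_jiffies(1));
}

static void demo_fini(struct demo_range_list *d)
{
	/* Teardown must cancel the work before the structure goes away */
	cancel_delayed_work_sync(&d->restore_work);
}

The patch applies exactly this lifecycle: svm_range_list_init() initializes both fields, svm_range_evict() increments the counter and schedules the work on the first eviction, and kfd_process_notifier_release() cancels the work before the process is destroyed.
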
@@ -1048,6 +1048,7 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
cancel_delayed_work_sync(&p->eviction_work);
cancel_delayed_work_sync(&p->restore_work);
+ cancel_delayed_work_sync(&p->svms.restore_work);
mutex_lock(&p->mutex);
@@ -21,6 +21,7 @@
*/
#include <linux/types.h>
+#include <linux/sched/task.h>
#include "amdgpu_sync.h"
#include "amdgpu_object.h"
#include "amdgpu_vm.h"
@@ -28,6 +29,8 @@
#include "kfd_priv.h"
#include "kfd_svm.h"
+#define AMDGPU_SVM_RANGE_RESTORE_DELAY_MS 1
+
/**
* svm_range_unlink - unlink svm_range from lists and interval tree
* @prange: svm range structure to be removed
@@ -99,6 +102,7 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start,
INIT_LIST_HEAD(&prange->list);
INIT_LIST_HEAD(&prange->update_list);
INIT_LIST_HEAD(&prange->remove_list);
+ atomic_set(&prange->invalid, 0);
svm_range_set_default_attributes(&prange->preferred_loc,
&prange->prefetch_loc,
&prange->granularity, &prange->flags);
@@ -191,6 +195,10 @@ svm_range_validate(struct mm_struct *mm, struct svm_range *prange)
r = svm_range_validate_ram(mm, prange);
+ pr_debug("svms 0x%p [0x%lx 0x%lx] ret %d invalid %d\n", prange->svms,
+ prange->it_node.start, prange->it_node.last,
+ r, atomic_read(&prange->invalid));
+
return r;
}
@@ -757,6 +765,151 @@ static int svm_range_map_to_gpus(struct svm_range *prange, bool reserve_vm)
return r;
}
+static void svm_range_restore_work(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct amdkfd_process_info *process_info;
+ struct svm_range_list *svms;
+ struct svm_range *prange;
+ struct kfd_process *p;
+ struct mm_struct *mm;
+ int evicted_ranges;
+ int srcu_idx;
+ int invalid;
+ int r;
+
+ svms = container_of(dwork, struct svm_range_list, restore_work);
+ evicted_ranges = atomic_read(&svms->evicted_ranges);
+ if (!evicted_ranges)
+ return;
+
+ pr_debug("restore svm ranges\n");
+
+ /* kfd_process_notifier_release cancels this work synchronously, so
+ * kfd_process and mm remain valid for the lifetime of this work item.
+ */
+ p = container_of(svms, struct kfd_process, svms);
+ process_info = p->kgd_process_info;
+ mm = p->mm;
+ if (!mm)
+ return;
+
+ mutex_lock(&process_info->lock);
+ mmap_read_lock(mm);
+ srcu_idx = srcu_read_lock(&svms->srcu);
+
+ list_for_each_entry_rcu(prange, &svms->list, list) {
+ invalid = atomic_read(&prange->invalid);
+ if (!invalid)
+ continue;
+
+ pr_debug("restoring svms 0x%p [0x%lx %lx] invalid %d\n",
+ prange->svms, prange->it_node.start,
+ prange->it_node.last, invalid);
+
+ r = svm_range_validate(mm, prange);
+ if (r) {
+ pr_debug("failed %d to validate [0x%lx 0x%lx]\n", r,
+ prange->it_node.start, prange->it_node.last);
+
+ goto unlock_out;
+ }
+
+ r = svm_range_map_to_gpus(prange, true);
+ if (r) {
+ pr_debug("failed %d to map 0x%lx to gpu\n", r,
+ prange->it_node.start);
+ goto unlock_out;
+ }
+
+ if (atomic_cmpxchg(&prange->invalid, invalid, 0) != invalid)
+ goto unlock_out;
+ }
+
+ if (atomic_cmpxchg(&svms->evicted_ranges, evicted_ranges, 0) !=
+ evicted_ranges)
+ goto unlock_out;
+
+ evicted_ranges = 0;
+
+ r = kgd2kfd_resume_mm(mm);
+ if (r) {
+ /* No recovery from this failure. Probably the CP is
+ * hanging. No point trying again.
+ */
+ pr_debug("failed %d to resume KFD\n", r);
+ }
+
+ pr_debug("restore svm ranges successfully\n");
+
+unlock_out:
+ srcu_read_unlock(&svms->srcu, srcu_idx);
+ mmap_read_unlock(mm);
+ mutex_unlock(&process_info->lock);
+
+ /* If validation failed or new evictions raced in, reschedule another attempt */
+ if (evicted_ranges) {
+ pr_debug("reschedule to restore svm range\n");
+ schedule_delayed_work(&svms->restore_work,
+ msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
+ }
+}
+
+/**
+ * svm_range_evict - evict svm range
+ *
+ * Stop all queues of the process to ensure the GPU doesn't access the memory,
+ * then return so the CPU can evict the buffer and proceed with the CPU page
+ * table update.
+ *
+ * No lock is needed to synchronize the CPU page table invalidation with GPU
+ * execution. If an invalidation happens while the restore work is running,
+ * the restore work restarts so that the latest CPU pages are mapped to the
+ * GPU before the queues are started again.
+ */
+static int
+svm_range_evict(struct svm_range_list *svms, struct mm_struct *mm,
+ unsigned long start, unsigned long last)
+{
+ int invalid, evicted_ranges;
+ int r = 0;
+ struct interval_tree_node *node;
+ struct svm_range *prange;
+
+ svms_lock(svms);
+
+ pr_debug("invalidate svms 0x%p [0x%lx 0x%lx]\n", svms, start, last);
+
+ node = interval_tree_iter_first(&svms->objects, start, last);
+ while (node) {
+ struct interval_tree_node *next;
+
+ prange = container_of(node, struct svm_range, it_node);
+ next = interval_tree_iter_next(node, start, last);
+
+ invalid = atomic_inc_return(&prange->invalid);
+ evicted_ranges = atomic_inc_return(&svms->evicted_ranges);
+ if (evicted_ranges == 1) {
+ pr_debug("evicting svms 0x%p range [0x%lx 0x%lx]\n",
+ prange->svms, prange->it_node.start,
+ prange->it_node.last);
+
+ /* First eviction, stop the queues */
+ r = kgd2kfd_quiesce_mm(mm);
+ if (r)
+ pr_debug("failed to quiesce KFD\n");
+
+ pr_debug("schedule to restore svm %p ranges\n", svms);
+ schedule_delayed_work(&svms->restore_work,
+ msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
+ }
+ node = next;
+ }
+
+ svms_unlock(svms);
+
+ return r;
+}
+
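
The kerneldoc above notes that no lock is needed to synchronize CPU page table invalidation with the restore work. The coordination is a read-then-cmpxchg handshake on the atomic counters: the restore path samples the counter before revalidating and clears it afterwards only if no new eviction raced in; on a lost race it leaves the queues quiesced and tries again. A stripped-down sketch of just that handshake, with hypothetical names rather than the driver code:

#include <linux/atomic.h>
#include <linux/types.h>

static atomic_t demo_evicted = ATOMIC_INIT(0);

/* Eviction side: every CPU invalidation bumps the counter */
static void demo_evict(void)
{
	if (atomic_inc_return(&demo_evicted) == 1) {
		/* first eviction: quiesce the queues, schedule restore work */
	}
}

/* Restore side: returns true if the queues may be resumed */
static bool demo_restore(void)
{
	int seen = atomic_read(&demo_evicted);

	if (!seen)
		return true;	/* nothing was evicted */

	/* ... revalidate and remap ranges without holding a lock ... */

	/*
	 * Clear the counter only if no new eviction raced with the restore.
	 * If the cmpxchg fails, another invalidation arrived and the restore
	 * must run again before the queues are resumed.
	 */
	return atomic_cmpxchg(&demo_evicted, seen, 0) == seen;
}

In the patch, svm_range_restore_work() plays the restore role (atomic_read() at entry, atomic_cmpxchg() before kgd2kfd_resume_mm(), reschedule on failure) and svm_range_evict() plays the eviction role with atomic_inc_return().
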
struct svm_range *svm_range_clone(struct svm_range *old)
{
struct svm_range *new;
@@ -994,6 +1147,11 @@ svm_range_unmap_from_cpu(struct mm_struct *mm, unsigned long start,
* svm_range_cpu_invalidate_pagetables - interval notifier callback
*
* MMU range unmap notifier to remove svm ranges
+ *
+ * If GPU vm fault retry is not enabled, evict the svm range; the restore
+ * work will then update the GPU mapping.
+ * If GPU vm fault retry is enabled, unmap the svm range from the GPU; the
+ * vm fault handler will update the GPU mapping.
*/
static bool
svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
@@ -1009,15 +1167,14 @@ svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
if (range->event == MMU_NOTIFY_RELEASE) {
pr_debug("cpu release range [0x%lx 0x%lx]\n", range->start,
range->end - 1);
- return true;
- }
- if (range->event == MMU_NOTIFY_UNMAP) {
+ } else if (range->event == MMU_NOTIFY_UNMAP) {
pr_debug("mm 0x%p unmap range [0x%lx 0x%lx]\n", range->mm,
start, last);
svm_range_unmap_from_cpu(mni->mm, start, last);
- return true;
+ } else {
+ mmu_interval_set_seq(mni, cur_seq);
+ svm_range_evict(svms, mni->mm, start, last);
}
-
return true;
}
@@ -1045,6 +1202,8 @@ int svm_range_list_init(struct kfd_process *p)
svms->objects = RB_ROOT_CACHED;
mutex_init(&svms->lock);
INIT_LIST_HEAD(&svms->list);
+ atomic_set(&svms->evicted_ranges, 0);
+ INIT_DELAYED_WORK(&svms->restore_work, svm_range_restore_work);
r = init_srcu_struct(&svms->srcu);
if (r) {
pr_debug("failed %d to init srcu\n", r);
@@ -50,6 +50,7 @@
* @prefetch_loc: last prefetch location, 0 for CPU, or GPU id
* @actual_loc: the actual location, 0 for CPU, or GPU id
* @granularity: migration granularity, log2 num pages
+ * @invalid: non-zero means the CPU page table has been invalidated
* @bitmap_access: index bitmap of GPUs which can access the range
* @bitmap_aip: index bitmap of GPUs which can access the range in place
*
@@ -72,6 +73,7 @@ struct svm_range {
uint32_t prefetch_loc;
uint32_t actual_loc;
uint8_t granularity;
+ atomic_t invalid;
DECLARE_BITMAP(bitmap_access, MAX_GPU_INSTANCE);
DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE);
};