@@ -205,6 +205,311 @@ svm_migrate_copy_done(struct amdgpu_device *adev, struct dma_fence *mfence)
return r;
}
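+/*
+ * A brief note on what follows (derived from the code below): return the
+ * VRAM address, in bytes, that corresponds to *offset pages into the buffer
+ * described by the drm_mm_node array, and advance *mm_node and *offset to
+ * the node that contains that offset.
+ */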
+static uint64_t
+svm_migrate_node_physical_addr(struct amdgpu_device *adev,
+ struct drm_mm_node **mm_node, uint64_t *offset)
+{
+ struct drm_mm_node *node = *mm_node;
+ uint64_t pos = *offset;
+
+ if (node->start == AMDGPU_BO_INVALID_OFFSET) {
+ pr_debug("drm node is not validated\n");
+ return 0;
+ }
+
+ pr_debug("vram node start 0x%llx npages 0x%llx\n", node->start,
+ node->size);
+
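+ /* Walk forward to the node that contains the requested page offset */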
+ if (pos >= node->size) {
+ do {
+ pos -= node->size;
+ node++;
+ } while (pos >= node->size);
+
+ *mm_node = node;
+ *offset = pos;
+ }
+
+ return (node->start + pos) << PAGE_SHIFT;
+}
+
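+/*
+ * Convert an address offset within VRAM to the pfn of the corresponding
+ * device page, based on the pgmap resource range registered for this device.
+ */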
+unsigned long
+svm_migrate_addr_to_pfn(struct amdgpu_device *adev, unsigned long addr)
+{
+ return (addr + adev->kfd.dev->pgmap.range.start) >> PAGE_SHIFT;
+}
+
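+/*
+ * Reference and lock the VRAM page chosen as a migration destination; the
+ * migrate_vma core expects destination pages to be locked. The owning
+ * svm_range is stored in zone_device_data so it can be found from the page
+ * later. svm_migrate_put_vram_page undoes this when the copy fails.
+ */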
+static void
+svm_migrate_get_vram_page(struct svm_range *prange, unsigned long pfn)
+{
+ struct page *page;
+
+ page = pfn_to_page(pfn);
+ page->zone_device_data = prange;
+ get_page(page);
+ lock_page(page);
+}
+
+static void
+svm_migrate_put_vram_page(struct amdgpu_device *adev, unsigned long addr)
+{
+ struct page *page;
+
+ page = pfn_to_page(svm_migrate_addr_to_pfn(adev, addr));
+ unlock_page(page);
+ put_page(page);
+}
+
+static int
+svm_migrate_copy_to_vram(struct amdgpu_device *adev, struct svm_range *prange,
+ struct migrate_vma *migrate, struct dma_fence **mfence,
+ dma_addr_t *scratch)
+{
+ uint64_t npages = migrate->cpages;
+ struct device *dev = adev->dev;
+ struct drm_mm_node *node;
+ dma_addr_t *src;
+ uint64_t *dst;
+ uint64_t vram_addr;
+ uint64_t offset;
+ uint64_t i, j;
+ int r = -ENOMEM;
+
+ pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, prange->start,
+ prange->last);
+
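+ /*
+ * The scratch buffer holds npages DMA source addresses followed by
+ * npages VRAM destination addresses.
+ */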
+ src = scratch;
+ dst = (uint64_t *)(scratch + npages);
+
+ r = svm_range_vram_node_new(adev, prange, true);
+ if (r) {
+ pr_debug("failed %d get 0x%llx pages from vram\n", r, npages);
+ goto out;
+ }
+
+ node = prange->ttm_res->mm_node;
+ offset = prange->offset;
+ vram_addr = svm_migrate_node_physical_addr(adev, &node, &offset);
+ if (!vram_addr) {
+ WARN_ONCE(1, "vram node address is 0\n");
+ r = -ENOMEM;
+ goto out;
+ }
+
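+ /*
+ * For every page, record its VRAM destination address in dst[] and the
+ * matching device pfn in migrate->dst[], then gather runs of contiguous
+ * mapped system pages and copy each run with a single SDMA transfer
+ * through the GART; j is the length of the current run.
+ */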
+ for (i = j = 0; i < npages; i++) {
+ struct page *spage;
+
+ dst[i] = vram_addr + (j << PAGE_SHIFT);
+ migrate->dst[i] = svm_migrate_addr_to_pfn(adev, dst[i]);
+ svm_migrate_get_vram_page(prange, migrate->dst[i]);
+
+ migrate->dst[i] = migrate_pfn(migrate->dst[i]);
+ migrate->dst[i] |= MIGRATE_PFN_LOCKED;
+
+ if (migrate->src[i] & MIGRATE_PFN_VALID) {
+ spage = migrate_pfn_to_page(migrate->src[i]);
+ src[i] = dma_map_page(dev, spage, 0, PAGE_SIZE,
+ DMA_TO_DEVICE);
+ r = dma_mapping_error(dev, src[i]);
+ if (r) {
+ pr_debug("failed %d dma_map_page\n", r);
+ goto out_free_vram_pages;
+ }
+ } else {
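+ /*
+ * No valid source page to copy from. Flush the pages gathered
+ * so far with one GART copy (if any), then advance the VRAM
+ * position, moving on to the next VRAM node once the current
+ * one is exhausted.
+ */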
+ if (j) {
+ r = svm_migrate_copy_memory_gart(
+ adev, src + i - j,
+ dst + i - j, j,
+ FROM_RAM_TO_VRAM,
+ mfence);
+ if (r)
+ goto out_free_vram_pages;
+ offset += j;
+ vram_addr = (node->start + offset) << PAGE_SHIFT;
+ j = 0;
+ } else {
+ offset++;
+ vram_addr += PAGE_SIZE;
+ }
+ if (offset >= node->size) {
+ node++;
+ pr_debug("next node size 0x%llx\n", node->size);
+ vram_addr = node->start << PAGE_SHIFT;
+ offset = 0;
+ }
+ continue;
+ }
+
+ pr_debug("dma mapping src to 0x%llx, page_to_pfn 0x%lx\n",
+ src[i] >> PAGE_SHIFT, page_to_pfn(spage));
+
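+ /*
+ * If the destination reaches the end of the current VRAM node,
+ * flush the run and continue in the next node.
+ */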
+ if (j + offset >= node->size - 1 && i < npages - 1) {
+ r = svm_migrate_copy_memory_gart(adev, src + i - j,
+ dst + i - j, j + 1,
+ FROM_RAM_TO_VRAM,
+ mfence);
+ if (r)
+ goto out_free_vram_pages;
+
+ node++;
+ pr_debug("next node size 0x%llx\n", node->size);
+ vram_addr = node->start << PAGE_SHIFT;
+ offset = 0;
+ j = 0;
+ } else {
+ j++;
+ }
+ }
+
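+ /* Flush any remaining pages of the last run */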
+ r = svm_migrate_copy_memory_gart(adev, src + i - j, dst + i - j, j,
+ FROM_RAM_TO_VRAM, mfence);
+
+out_free_vram_pages:
+ if (r) {
+ pr_debug("failed %d to copy memory to vram\n", r);
+ while (i--) {
+ svm_migrate_put_vram_page(adev, dst[i]);
+ migrate->dst[i] = 0;
+ }
+ }
+
+out:
+ return r;
+}
+
+static int
+svm_migrate_vma_to_vram(struct amdgpu_device *adev, struct svm_range *prange,
+ struct vm_area_struct *vma, uint64_t start,
+ uint64_t end)
+{
+ uint64_t npages = (end - start) >> PAGE_SHIFT;
+ struct dma_fence *mfence = NULL;
+ struct migrate_vma migrate;
+ dma_addr_t *scratch;
+ size_t size;
+ void *buf;
+ int r = -ENOMEM;
+ int retry = 0;
+
+ memset(&migrate, 0, sizeof(migrate));
+ migrate.vma = vma;
+ migrate.start = start;
+ migrate.end = end;
+ migrate.flags = MIGRATE_VMA_SELECT_SYSTEM;
+ migrate.pgmap_owner = adev;
+
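+ /*
+ * One allocation backs the migrate src/dst pfn arrays and the scratch
+ * buffer used by svm_migrate_copy_to_vram for DMA and VRAM addresses.
+ */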
+ size = 2 * sizeof(*migrate.src) + sizeof(uint64_t) + sizeof(dma_addr_t);
+ size *= npages;
+ buf = kvmalloc(size, GFP_KERNEL | __GFP_ZERO);
+ if (!buf)
+ goto out;
+
+ migrate.src = buf;
+ migrate.dst = migrate.src + npages;
+ scratch = (dma_addr_t *)(migrate.dst + npages);
+
+retry:
+ r = migrate_vma_setup(&migrate);
+ if (r) {
+ pr_debug("failed %d prepare migrate svms 0x%p [0x%lx 0x%lx]\n",
+ r, prange->svms, prange->start, prange->last);
+ goto out_free;
+ }
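+ /*
+ * migrate_vma_setup may not collect every page in the range, e.g.
+ * when some pages are being freed or migrated concurrently; give it
+ * a few retries before failing.
+ */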
+ if (migrate.cpages != npages) {
+ pr_debug("collect 0x%lx/0x%llx pages, retry\n", migrate.cpages,
+ npages);
+ migrate_vma_finalize(&migrate);
+ if (retry++ >= 3) {
+ r = -ENOMEM;
+ pr_debug("failed %d migrate svms 0x%p [0x%lx 0x%lx]\n",
+ r, prange->svms, prange->start, prange->last);
+ goto out_free;
+ }
+
+ goto retry;
+ }
+
+ if (migrate.cpages) {
+ svm_migrate_copy_to_vram(adev, prange, &migrate, &mfence,
+ scratch);
+ migrate_vma_pages(&migrate);
+ svm_migrate_copy_done(adev, mfence);
+ migrate_vma_finalize(&migrate);
+ }
+
+ svm_range_dma_unmap(adev->dev, scratch, 0, npages);
+ svm_range_free_dma_mappings(prange);
+
+out_free:
+ kvfree(buf);
+out:
+ return r;
+}
+
+/**
+ * svm_migrate_ram_to_vram - migrate svm range from system to device
+ * @prange: range structure
+ * @best_loc: the device to migrate to
+ *
+ * Context: Process context, caller holds mmap read lock, svms lock, prange lock
+ *
+ * Return:
+ * 0 - OK, otherwise error code
+ */
+int svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc)
+{
+ unsigned long addr, start, end;
+ struct vm_area_struct *vma;
+ struct amdgpu_device *adev;
+ struct mm_struct *mm;
+ int r = 0;
+
+ if (prange->actual_loc == best_loc) {
+ pr_debug("svms 0x%p [0x%lx 0x%lx] already on best_loc 0x%x\n",
+ prange->svms, prange->start, prange->last, best_loc);
+ return 0;
+ }
+
+ adev = svm_range_get_adev_by_id(prange, best_loc);
+ if (!adev) {
+ pr_debug("failed to get device by id 0x%x\n", best_loc);
+ return -ENODEV;
+ }
+
+ pr_debug("svms 0x%p [0x%lx 0x%lx] to gpu 0x%x\n", prange->svms,
+ prange->start, prange->last, best_loc);
+
+ mm = current->mm;
+
+ /* FIXME: workaround for page locking bug with invalid pages */
+ svm_range_prefault(prange, mm);
+
+ start = prange->start << PAGE_SHIFT;
+ end = (prange->last + 1) << PAGE_SHIFT;
+
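+ /*
+ * migrate_vma only works within a single VMA, so split the range on
+ * VMA boundaries and migrate one VMA at a time.
+ */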
+ for (addr = start; addr < end;) {
+ unsigned long next;
+
+ vma = find_vma(mm, addr);
+ if (!vma || addr < vma->vm_start)
+ break;
+
+ next = min(vma->vm_end, end);
+ r = svm_migrate_vma_to_vram(adev, prange, vma, addr, next);
+ if (r) {
+ pr_debug("failed to migrate\n");
+ break;
+ }
+ addr = next;
+ }
+
+ if (!r)
+ prange->actual_loc = best_loc;
+
+ return r;
+}
+
static void svm_migrate_page_free(struct page *page)
{
}
@@ -38,6 +38,8 @@ enum MIGRATION_COPY_DIR {
FROM_VRAM_TO_RAM
};
+int svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc);
+
#if defined(CONFIG_DEVICE_PRIVATE)
int svm_migrate_init(struct amdgpu_device *adev);
void svm_migrate_fini(struct amdgpu_device *adev);
@@ -31,6 +31,7 @@
#include "amdgpu_xgmi.h"
#include "kfd_priv.h"
#include "kfd_svm.h"
+#include "kfd_migrate.h"
#define AMDGPU_SVM_RANGE_RESTORE_DELAY_MS 1
@@ -177,8 +178,8 @@ svm_range_dma_map(struct svm_range *prange, unsigned long *bitmap,
return r;
}
-static void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr,
- unsigned long offset, unsigned long npages)
+void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr,
+ unsigned long offset, unsigned long npages)
{
enum dma_data_direction dir = DMA_BIDIRECTIONAL;
int i;
@@ -195,7 +196,7 @@ static void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr,
}
}
-static void svm_range_free_dma_mappings(struct svm_range *prange)
+void svm_range_free_dma_mappings(struct svm_range *prange)
{
struct kfd_process_device *pdd;
dma_addr_t *dma_addr;
@@ -230,6 +231,7 @@ static void svm_range_free(struct svm_range *prange)
svm_range_vram_node_free(prange);
svm_range_free_dma_mappings(prange);
mutex_destroy(&prange->lock);
+ mutex_destroy(&prange->migrate_mutex);
kfree(prange);
}
@@ -266,6 +268,7 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start,
INIT_LIST_HEAD(&prange->deferred_list);
INIT_LIST_HEAD(&prange->child_list);
atomic_set(&prange->invalid, 0);
+ mutex_init(&prange->migrate_mutex);
mutex_init(&prange->lock);
svm_range_set_default_attributes(&prange->preferred_loc,
&prange->prefetch_loc,
@@ -1238,6 +1241,8 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
pr_debug("failed %d to dma map range\n", r);
goto unreserve_out;
}
+
+ prange->validated_once = true;
}
svm_range_lock(prange);
@@ -1329,21 +1334,28 @@ static void svm_range_restore_work(struct work_struct *work)
prange->svms, prange, prange->start, prange->last,
invalid);
+ /*
+ * If the range is migrating, wait until the migration is done.
+ */
+ mutex_lock(&prange->migrate_mutex);
+
r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE,
false, true);
- if (r) {
+ if (r)
pr_debug("failed %d to map 0x%lx to gpus\n", r,
prange->start);
- goto unlock_out;
- }
+
+ mutex_unlock(&prange->migrate_mutex);
+ if (r)
+ goto out_reschedule;
if (atomic_cmpxchg(&prange->invalid, invalid, 0) != invalid)
- goto unlock_out;
+ goto out_reschedule;
}
if (atomic_cmpxchg(&svms->evicted_ranges, evicted_ranges, 0) !=
evicted_ranges)
- goto unlock_out;
+ goto out_reschedule;
evicted_ranges = 0;
@@ -1357,7 +1369,7 @@ static void svm_range_restore_work(struct work_struct *work)
pr_debug("restore svm ranges successfully\n");
-unlock_out:
+out_reschedule:
mutex_unlock(&svms->lock);
mmap_write_unlock(mm);
mutex_unlock(&process_info->lock);
@@ -1649,6 +1661,7 @@ static void svm_range_deferred_list_work(struct work_struct *work)
list_del_init(&prange->deferred_list);
spin_unlock(&svms->deferred_list_lock);
+ mutex_lock(&prange->migrate_mutex);
while (!list_empty(&prange->child_list)) {
struct svm_range *pchild;
@@ -1659,6 +1672,7 @@ static void svm_range_deferred_list_work(struct work_struct *work)
list_del_init(&pchild->child_list);
svm_range_handle_list_op(svms, pchild);
}
+ mutex_unlock(&prange->migrate_mutex);
svm_range_handle_list_op(svms, prange);
mutex_unlock(&svms->lock);
@@ -1957,6 +1971,151 @@ svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size,
return 0;
}
+/* svm_range_best_location - decide the best actual location
+ * @prange: svm range structure
+ *
+ * For xnack off:
+ * If the range maps to a single GPU, the best actual location is the prefetch
+ * location, which can be CPU or GPU.
+ *
+ * If the range maps to multiple GPUs, the prefetch_loc GPU can only be the
+ * best actual location if all those GPUs are connected in the same XGMI hive.
+ * If they are connected over PCIe, the best actual location is always CPU,
+ * because a GPU cannot access the VRAM of other GPUs, assuming PCIe small
+ * BAR (large BAR support is not upstream).
+ *
+ * For xnack on:
+ * The best actual location is the prefetch location. If the GPUs are in the
+ * same XGMI hive, the range is mapped to multiple GPUs; otherwise it is only
+ * mapped to the actual location GPU, and accesses from other GPUs trigger a
+ * VM fault and migration.
+ *
+ * Context: Process context
+ *
+ * Return:
+ * 0 for CPU, otherwise the GPU id
+ */
+static uint32_t svm_range_best_location(struct svm_range *prange)
+{
+ DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
+ uint32_t best_loc = prange->prefetch_loc;
+ struct kfd_process_device *pdd;
+ struct amdgpu_device *bo_adev;
+ struct amdgpu_device *adev;
+ struct kfd_process *p;
+ uint32_t gpuidx;
+
+ p = container_of(prange->svms, struct kfd_process, svms);
+
+ /* xnack on */
+ if (p->xnack_enabled)
+ goto out;
+
+ /* xnack off */
+ if (!best_loc || best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED)
+ goto out;
+
+ bo_adev = svm_range_get_adev_by_id(prange, best_loc);
+ bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
+ MAX_GPU_INSTANCE);
+
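+ /*
+ * With xnack off the range must be accessible by all GPUs that map
+ * it. If any of them is outside the XGMI hive of the prefetch GPU,
+ * fall back to system memory.
+ */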
+ for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
+ pdd = kfd_process_device_from_gpuidx(p, gpuidx);
+ if (!pdd) {
+ pr_debug("failed to get device by idx 0x%x\n", gpuidx);
+ continue;
+ }
+ adev = (struct amdgpu_device *)pdd->dev->kgd;
+
+ if (adev == bo_adev)
+ continue;
+
+ if (!amdgpu_xgmi_same_hive(adev, bo_adev)) {
+ best_loc = 0;
+ break;
+ }
+ }
+
+out:
+ pr_debug("xnack %d svms 0x%p [0x%lx 0x%lx] best loc 0x%x\n",
+ p->xnack_enabled, &p->svms, prange->start, prange->last,
+ best_loc);
+
+ return best_loc;
+}
+
+/* FIXME: This is a workaround for a page locking bug when some pages are
+ * invalid during migration to VRAM
+ */
+void svm_range_prefault(struct svm_range *prange, struct mm_struct *mm)
+{
+ struct hmm_range *hmm_range;
+ int r;
+
+ if (prange->validated_once)
+ return;
+
+ r = amdgpu_hmm_range_get_pages(&prange->notifier, mm, NULL,
+ prange->start << PAGE_SHIFT,
+ prange->npages, &hmm_range,
+ false, true);
+ if (!r) {
+ amdgpu_hmm_range_get_pages_done(hmm_range);
+ prange->validated_once = true;
+ }
+}
+
+/* svm_range_trigger_migration - start page migration if prefetch loc changed
+ * @mm: current process mm_struct
+ * @prange: svm range structure
+ * @migrated: output, true if migration is triggered
+ *
+ * If the range prefetch_loc is a GPU and the actual loc is CPU (0), migrate
+ * the range from ram to vram.
+ * If the range prefetch_loc is CPU (0) and the actual loc is a GPU, migrate
+ * the range from vram to ram.
+ *
+ * If GPU vm fault retry is not enabled, migration interacts with the MMU
+ * notifier and the restore work:
+ * 1. migrate_vma_setup invalidates pages, the MMU notifier callback
+ *    svm_range_evict stops all queues and schedules the restore work
+ * 2. svm_range_restore_work waits for the migration to finish:
+ *    a. svm_range_validate_vram takes prange->migrate_mutex
+ *    b. svm_range_validate_ram HMM get pages waits for the CPU fault handler
+ *       to return
+ * 3. the restore work updates the GPU mappings and resumes all queues.
+ *
+ * Context: Process context
+ *
+ * Return:
+ * 0 - OK, otherwise - error code of migration
+ */
+static int
+svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange,
+ bool *migrated)
+{
+ uint32_t best_loc;
+ int r = 0;
+
+ *migrated = false;
+ best_loc = svm_range_best_location(prange);
+
+ if (best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED ||
+ best_loc == prange->actual_loc)
+ return 0;
+
+ if (best_loc && !prange->actual_loc &&
+ !(prange->flags & KFD_IOCTL_SVM_FLAG_HOST_ACCESS))
+ return 0;
+
+ if (best_loc) {
+ pr_debug("migrate from ram to vram\n");
+ r = svm_migrate_ram_to_vram(prange, best_loc);
+
+ if (!r)
+ *migrated = true;
+ }
+
+ return r;
+}
+
static int
svm_range_set_attr(struct kfd_process *p, uint64_t start, uint64_t size,
uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs)
@@ -2027,13 +2186,29 @@ svm_range_set_attr(struct kfd_process *p, uint64_t start, uint64_t size,
* case because the rollback wouldn't be guaranteed to work either.
*/
list_for_each_entry(prange, &update_list, update_list) {
+ bool migrated;
+
+ mutex_lock(&prange->migrate_mutex);
+
+ r = svm_range_trigger_migration(mm, prange, &migrated);
+ if (r)
+ goto out_unlock_range;
+
+ if (migrated) {
+ pr_debug("restore_work will update mappings of GPUs\n");
+ mutex_unlock(&prange->migrate_mutex);
+ continue;
+ }
+
r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE,
true, true);
- if (r) {
- pr_debug("failed %d to map 0x%lx to gpus\n", r,
- prange->start);
+ if (r)
+ pr_debug("failed %d to map svm range\n", r);
+
+out_unlock_range:
+ mutex_unlock(&prange->migrate_mutex);
+ if (r)
break;
- }
}
svm_range_debug_dump(svms);
@@ -56,6 +56,7 @@ struct svm_work_list_item {
* struct svm_range - shared virtual memory range
*
* @svms: list of svm ranges, structure defined in kfd_process
+ * @migrate_mutex: to serialize range migration, validation and mapping update
* @start: range start address in pages
* @last: range last address in pages
* @it_node: node [start, last] stored in interval tree, start, last are page
@@ -92,6 +93,7 @@ struct svm_work_list_item {
*/
struct svm_range {
struct svm_range_list *svms;
+ struct mutex migrate_mutex;
unsigned long start;
unsigned long last;
struct interval_tree_node it_node;
@@ -120,6 +122,7 @@ struct svm_range {
struct list_head child_list;
DECLARE_BITMAP(bitmap_access, MAX_GPU_INSTANCE);
DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE);
+ bool validated_once;
};
static inline void svm_range_lock(struct svm_range *prange)
@@ -144,5 +147,9 @@ struct amdgpu_device *svm_range_get_adev_by_id(struct svm_range *prange,
int svm_range_vram_node_new(struct amdgpu_device *adev,
struct svm_range *prange, bool clear);
void svm_range_vram_node_free(struct svm_range *prange);
+void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr,
+ unsigned long offset, unsigned long npages);
+void svm_range_free_dma_mappings(struct svm_range *prange);
+void svm_range_prefault(struct svm_range *prange, struct mm_struct *mm);
#endif /* KFD_SVM_H_ */