@@ -45,7 +45,8 @@ static const struct mmu_interval_notifier_ops svm_range_mn_ops = {
* svm_range_unlink - unlink svm_range from lists and interval tree
* @prange: svm range structure to be removed
*
- * Remove the svm range from svms interval tree and link list
+ * Remove the svm_range from the svms and svm_bo lists and the svms
+ * interval tree.
*
* Context: The caller must hold svms->lock
*/
@@ -54,6 +55,12 @@ static void svm_range_unlink(struct svm_range *prange)
pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms,
prange, prange->start, prange->last);
+ if (prange->svm_bo) {
+ spin_lock(&prange->svm_bo->list_lock);
+ list_del(&prange->svm_bo_list);
+ spin_unlock(&prange->svm_bo->list_lock);
+ }
+
list_del(&prange->list);
if (prange->it_node.start != 0 && prange->it_node.last != 0)
interval_tree_remove(&prange->it_node, &prange->svms->objects);
@@ -218,6 +225,7 @@ static void svm_range_free(struct svm_range *prange)
pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, prange,
prange->start, prange->last);
+ svm_range_vram_node_free(prange);
svm_range_free_dma_mappings(prange);
mutex_destroy(&prange->lock);
kfree(prange);
@@ -252,6 +260,7 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start,
INIT_LIST_HEAD(&prange->update_list);
INIT_LIST_HEAD(&prange->remove_list);
INIT_LIST_HEAD(&prange->insert_list);
+ INIT_LIST_HEAD(&prange->svm_bo_list);
INIT_LIST_HEAD(&prange->deferred_list);
INIT_LIST_HEAD(&prange->child_list);
atomic_set(&prange->invalid, 0);
@@ -265,6 +274,210 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start,
return prange;
}
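+/* Take a reference on @svm_bo unless its refcount has already dropped
+ * to zero, i.e. unless a concurrent svm_range_bo_release is tearing it
+ * down.
+ */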
+static bool svm_bo_ref_unless_zero(struct svm_range_bo *svm_bo)
+{
+ if (!svm_bo || !kref_get_unless_zero(&svm_bo->kref))
+ return false;
+
+ return true;
+}
+
+static struct svm_range_bo *svm_range_bo_ref(struct svm_range_bo *svm_bo)
+{
+ if (svm_bo)
+ kref_get(&svm_bo->kref);
+
+ return svm_bo;
+}
+
+static void svm_range_bo_release(struct kref *kref)
+{
+ struct svm_range_bo *svm_bo;
+
+ svm_bo = container_of(kref, struct svm_range_bo, kref);
+ spin_lock(&svm_bo->list_lock);
+ while (!list_empty(&svm_bo->range_list)) {
+ struct svm_range *prange =
+ list_first_entry(&svm_bo->range_list,
+ struct svm_range, svm_bo_list);
+ /* list_del_init tells a concurrent svm_range_vram_node_new when
+ * it's safe to reuse the svm_bo pointer and svm_bo_list head.
+ */
+ list_del_init(&prange->svm_bo_list);
+ spin_unlock(&svm_bo->list_lock);
+
+ pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms,
+ prange->start, prange->last);
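+ /* Clear the back-pointer under prange->lock so readers of
+ * prange->svm_bo see either NULL or a still-valid svm_bo.
+ */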
+ mutex_lock(&prange->lock);
+ prange->svm_bo = NULL;
+ mutex_unlock(&prange->lock);
+
+ spin_lock(&svm_bo->list_lock);
+ }
+ spin_unlock(&svm_bo->list_lock);
+
+ amdgpu_bo_unref(&svm_bo->bo);
+ kfree(svm_bo);
+}
+
+static void svm_range_bo_unref(struct svm_range_bo *svm_bo)
+{
+ if (!svm_bo)
+ return;
+
+ kref_put(&svm_bo->kref, svm_range_bo_release);
+}
+
+static struct svm_range_bo *svm_range_bo_new(void)
+{
+ struct svm_range_bo *svm_bo;
+
+ svm_bo = kzalloc(sizeof(*svm_bo), GFP_KERNEL);
+ if (!svm_bo)
+ return NULL;
+
+ kref_init(&svm_bo->kref);
+ INIT_LIST_HEAD(&svm_bo->range_list);
+ spin_lock_init(&svm_bo->list_lock);
+
+ return svm_bo;
+}
+
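+/**
+ * svm_range_vram_node_new - allocate or reuse the VRAM BO backing a range
+ * @adev: amdgpu device to allocate the VRAM BO on
+ * @prange: svm range structure to be backed by VRAM
+ * @clear: clear the allocated VRAM contents
+ *
+ * Reuse the existing svm_bo if the range still holds a usable reference
+ * to it. Otherwise allocate a new BO, fence it with the process eviction
+ * fence and link the range to it.
+ *
+ * Context: caller must not hold prange->lock
+ * Return: 0 on success, negative errno on failure
+ */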
+int
+svm_range_vram_node_new(struct amdgpu_device *adev, struct svm_range *prange,
+ bool clear)
+{
+ struct amdkfd_process_info *process_info;
+ struct amdgpu_bo_param bp;
+ struct svm_range_bo *svm_bo;
+ struct amdgpu_bo_user *ubo;
+ struct amdgpu_bo *bo;
+ struct kfd_process *p;
+ int r;
+
+ pr_debug("[0x%lx 0x%lx]\n", prange->start, prange->last);
+ mutex_lock(&prange->lock);
+ if (prange->svm_bo) {
+ if (prange->ttm_res) {
+ /* We still have a reference, all is well */
+ mutex_unlock(&prange->lock);
+ return 0;
+ }
+ if (svm_bo_ref_unless_zero(prange->svm_bo)) {
+ /* The BO was still around and we got
+ * a new reference to it
+ */
+ mutex_unlock(&prange->lock);
+ pr_debug("reuse old bo [0x%lx 0x%lx]\n",
+ prange->start, prange->last);
+
+ prange->ttm_res = &prange->svm_bo->bo->tbo.mem;
+ return 0;
+ }
+
+ mutex_unlock(&prange->lock);
+
+ /* We need a new svm_bo. Spin-loop to wait for concurrent
+ * svm_range_bo_release to finish removing this range from
+ * its range list. After this, it is safe to reuse the
+ * svm_bo pointer and svm_bo_list head.
+ */
+ while (!list_empty_careful(&prange->svm_bo_list))
+ ;
+
+ } else {
+ mutex_unlock(&prange->lock);
+ }
+
+ svm_bo = svm_range_bo_new();
+ if (!svm_bo) {
+ pr_debug("failed to alloc svm bo\n");
+ return -ENOMEM;
+ }
+
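+ /* VRAM-only placement with no CPU access; clear the contents on
+ * allocation when the caller asks for it.
+ */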
+ memset(&bp, 0, sizeof(bp));
+ bp.size = prange->npages * PAGE_SIZE;
+ bp.byte_align = PAGE_SIZE;
+ bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
+ bp.flags = AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
+ bp.flags |= clear ? AMDGPU_GEM_CREATE_VRAM_CLEARED : 0;
+ bp.type = ttm_bo_type_device;
+ bp.resv = NULL;
+
+ r = amdgpu_bo_create_user(adev, &bp, &ubo);
+ if (r) {
+ pr_debug("failed %d to create bo\n", r);
+ kfree(svm_bo);
+ return r;
+ }
+ bo = &ubo->bo;
+
+ p = container_of(prange->svms, struct kfd_process, svms);
+ r = amdgpu_bo_reserve(bo, true);
+ if (r) {
+ pr_debug("failed %d to reserve bo\n", r);
+ goto reserve_bo_failed;
+ }
+
+ r = dma_resv_reserve_shared(bo->tbo.base.resv, 1);
+ if (r) {
+ pr_debug("failed %d to reserve bo\n", r);
+ amdgpu_bo_unreserve(bo);
+ goto reserve_bo_failed;
+ }
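+ /* Attach the process eviction fence so KFD can suspend the user
+ * queues before TTM evicts this BO.
+ */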
+ process_info = p->kgd_process_info;
+ amdgpu_bo_fence(bo, &process_info->eviction_fence->base, true);
+
+ amdgpu_bo_unreserve(bo);
+
+ svm_bo->bo = bo;
+ prange->svm_bo = svm_bo;
+ prange->ttm_res = &bo->tbo.mem;
+ prange->offset = 0;
+
+ spin_lock(&svm_bo->list_lock);
+ list_add(&prange->svm_bo_list, &svm_bo->range_list);
+ spin_unlock(&svm_bo->list_lock);
+
+ return 0;
+
+reserve_bo_failed:
+ kfree(svm_bo);
+ amdgpu_bo_unref(&bo);
+ prange->ttm_res = NULL;
+
+ return r;
+}
+
+void svm_range_vram_node_free(struct svm_range *prange)
+{
+ svm_range_bo_unref(prange->svm_bo);
+ prange->ttm_res = NULL;
+}
+
+struct amdgpu_device *
+svm_range_get_adev_by_id(struct svm_range *prange, uint32_t gpu_id)
+{
+ struct kfd_process_device *pdd;
+ struct kfd_process *p;
+ int32_t gpu_idx;
+
+ p = container_of(prange->svms, struct kfd_process, svms);
+
+ gpu_idx = kfd_process_gpuidx_from_gpuid(p, gpu_id);
+ if (gpu_idx < 0) {
+ pr_debug("failed to get device by id 0x%x\n", gpu_id);
+ return NULL;
+ }
+ pdd = kfd_process_device_from_gpuidx(p, gpu_idx);
+ if (!pdd) {
+ pr_debug("failed to get device by idx 0x%x\n", gpu_idx);
+ return NULL;
+ }
+
+ return (struct amdgpu_device *)pdd->dev->kgd;
+}
+
static int svm_range_bo_validate(void *param, struct amdgpu_bo *bo)
{
struct ttm_operation_ctx ctx = { false, false };
@@ -471,6 +684,32 @@ svm_range_split_pages(struct svm_range *new, struct svm_range *old,
return 0;
}
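+/**
+ * svm_range_split_nodes - share the backing VRAM BO with a new split range
+ * @new: the new range that was split off from @old
+ * @old: the old range being adjusted to [@start, @last]
+ * @start: the old range adjust to start address in pages
+ * @last: the old range adjust to last address in pages
+ *
+ * Both ranges keep a reference to the same svm_bo; only their page
+ * offsets into the BO differ.
+ *
+ * Return: 0 on success
+ */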
+static int
+svm_range_split_nodes(struct svm_range *new, struct svm_range *old,
+ uint64_t start, uint64_t last)
+{
+ uint64_t npages = last - start + 1;
+
+ pr_debug("svms 0x%p new prange 0x%p start 0x%lx [0x%llx 0x%llx]\n",
+ new->svms, new, new->start, start, last);
+
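+ /* If @new split off the head of the BO, @old's offset moves up by
+ * @new's page count; if @new split off the tail, it starts npages
+ * (the pages @old keeps) past @old's offset.
+ */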
+ if (new->start == old->start) {
+ new->offset = old->offset;
+ old->offset += new->npages;
+ } else {
+ new->offset = old->offset + npages;
+ }
+
+ new->svm_bo = svm_range_bo_ref(old->svm_bo);
+ new->ttm_res = old->ttm_res;
+
+ spin_lock(&new->svm_bo->list_lock);
+ list_add(&new->svm_bo_list, &new->svm_bo->range_list);
+ spin_unlock(&new->svm_bo->list_lock);
+
+ return 0;
+}
+
/**
* svm_range_split_adjust - split range and adjust
*
@@ -479,7 +718,7 @@ svm_range_split_pages(struct svm_range *new, struct svm_range *old,
* @start: the old range adjust to start address in pages
* @last: the old range adjust to last address in pages
*
- * Copy system memory dma_addr in old range to new
+ * Copy system memory dma_addr or vram ttm_res from the old range to the new
* range from new_start up to size new->npages, the remaining old range is from
* start to last
*
@@ -505,6 +744,12 @@ svm_range_split_adjust(struct svm_range *new, struct svm_range *old,
if (r)
return r;
+ if (old->actual_loc && old->ttm_res) {
+ r = svm_range_split_nodes(new, old, start, last);
+ if (r)
+ return r;
+ }
+
old->npages = last - start + 1;
old->start = start;
old->last = last;
@@ -619,7 +864,8 @@ svm_range_get_pte_flags(struct amdgpu_device *adev, struct svm_range *prange)
uint64_t pte_flags;
pte_flags = AMDGPU_PTE_VALID;
- pte_flags |= AMDGPU_PTE_SYSTEM | AMDGPU_PTE_SNOOPED;
+ if (!prange->ttm_res)
+ pte_flags |= AMDGPU_PTE_SYSTEM | AMDGPU_PTE_SNOOPED;
mapping_flags = AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE;
@@ -639,7 +885,9 @@ svm_range_get_pte_flags(struct amdgpu_device *adev, struct svm_range *prange)
/* Apply ASIC specific mapping flags */
amdgpu_gmc_get_vm_pte(adev, &prange->mapping, &pte_flags);
- pr_debug("PTE flags 0x%llx\n", pte_flags);
+ pr_debug("svms 0x%p [0x%lx 0x%lx] vram %d PTE flags 0x%llx\n",
+ prange->svms, prange->start, prange->last,
+ prange->ttm_res ? 1 : 0, pte_flags);
return pte_flags;
}
@@ -715,13 +963,15 @@ svm_range_map_to_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm,
prange->mapping.start = prange->start;
prange->mapping.last = prange->last;
- prange->mapping.offset = 0;
+ prange->mapping.offset = prange->offset;
pte_flags = svm_range_get_pte_flags(adev, prange);
r = amdgpu_vm_bo_update_mapping(adev, adev, vm, false, false, NULL,
prange->mapping.start,
prange->mapping.last, pte_flags,
- prange->mapping.offset, NULL,
+ prange->mapping.offset,
+ prange->ttm_res ?
+ prange->ttm_res->mm_node : NULL,
dma_addr, &vm->last_update);
if (r) {
pr_debug("failed %d to map to gpu 0x%lx\n", r, prange->start);
@@ -817,6 +1067,11 @@ static int svm_range_reserve_bos(struct svm_validate_context *ctx)
ctx->tv[gpuidx].num_shared = 4;
list_add(&ctx->tv[gpuidx].head, &ctx->validate_list);
}
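+ /* Also reserve the VRAM BO shared by the range, using the list
+ * entry after the per-GPU entries.
+ */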
+ if (ctx->prange->svm_bo && ctx->prange->ttm_res) {
+ ctx->tv[MAX_GPU_INSTANCE].bo = &ctx->prange->svm_bo->bo->tbo;
+ ctx->tv[MAX_GPU_INSTANCE].num_shared = 1;
+ list_add(&ctx->tv[MAX_GPU_INSTANCE].head, &ctx->validate_list);
+ }
r = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->validate_list,
ctx->intr, NULL);
@@ -901,6 +1156,14 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
if (bitmap_empty(ctx.bitmap, MAX_GPU_INSTANCE))
return 0;
+ if (prange->actual_loc && !prange->ttm_res) {
+ /* This should never happen. actual_loc gets set by
+ * svm_migrate_ram_to_vram after allocating a BO.
+ */
+ WARN(1, "VRAM BO missing during validation\n");
+ return -EINVAL;
+ }
+
svm_range_reserve_bos(&ctx);
if (!prange->actual_loc) {
@@ -1098,6 +1361,14 @@ static struct svm_range *svm_range_clone(struct svm_range *old)
if (!new)
return NULL;
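+ /* The clone shares the parent's VRAM BO: take another reference to
+ * it and add the clone to the BO's range list.
+ */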
+ if (old->svm_bo) {
+ new->ttm_res = old->ttm_res;
+ new->offset = old->offset;
+ new->svm_bo = svm_range_bo_ref(old->svm_bo);
+ spin_lock(&new->svm_bo->list_lock);
+ list_add(&new->svm_bo_list, &new->svm_bo->range_list);
+ spin_unlock(&new->svm_bo->list_lock);
+ }
new->flags = old->flags;
new->preferred_loc = old->preferred_loc;
new->prefetch_loc = old->prefetch_loc;
@@ -1507,12 +1778,23 @@ svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
void svm_range_list_fini(struct kfd_process *p)
{
- mutex_destroy(&p->svms.lock);
+ struct svm_range *prange;
+ struct svm_range *next;
pr_debug("pasid 0x%x svms 0x%p\n", p->pasid, &p->svms);
/* Ensure list work is finished before process is destroyed */
flush_work(&p->svms.deferred_list_work);
+
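+ /* No other users are left at this point: unlink and free all
+ * remaining ranges, dropping their VRAM BO references, before the
+ * lock is destroyed.
+ */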
+ list_for_each_entry_safe(prange, next, &p->svms.list, list) {
+ svm_range_unlink(prange);
+ svm_range_remove_notifier(prange);
+ svm_range_free(prange);
+ }
+
+ mutex_destroy(&p->svms.lock);
+
+ pr_debug("pasid 0x%x svms 0x%p done\n", p->pasid, &p->svms);
}
int svm_range_list_init(struct kfd_process *p)
@@ -33,6 +33,13 @@
#include "amdgpu.h"
#include "kfd_priv.h"
+struct svm_range_bo {
+ struct amdgpu_bo *bo;
+ struct kref kref;
+ struct list_head range_list; /* all svm ranges sharing this bo */
+ spinlock_t list_lock; /* protects range_list */
+};
+
enum svm_work_list_ops {
SVM_OP_NULL,
SVM_OP_UNMAP_RANGE,
@@ -60,6 +67,10 @@ struct svm_work_list_item {
* @mapping: bo_va mapping structure to create and update GPU page table
* @npages: number of pages
* @dma_addr: dma mapping address on each GPU for system memory physical page
+ * @ttm_res: vram ttm resource of the backing BO
+ * @offset: range start offset within the BO's mm_nodes
+ * @svm_bo: struct to manage the split amdgpu_bo
+ * @svm_bo_list: list node used to scan all ranges sharing the same svm_bo
* @lock: protect prange start, last, child_list, svm_bo_list
* @saved_flags:save/restore current PF_MEMALLOC flags
* @flags: flags defined as KFD_IOCTL_SVM_FLAG_*
@@ -91,6 +102,10 @@ struct svm_range {
struct amdgpu_bo_va_mapping mapping;
uint64_t npages;
dma_addr_t *dma_addr[MAX_GPU_INSTANCE];
+ struct ttm_resource *ttm_res;
+ uint64_t offset;
+ struct svm_range_bo *svm_bo;
+ struct list_head svm_bo_list;
struct mutex lock;
unsigned int saved_flags;
uint32_t flags;
@@ -124,5 +139,10 @@ void svm_range_list_fini(struct kfd_process *p);
int svm_ioctl(struct kfd_process *p, enum kfd_ioctl_svm_op op, uint64_t start,
uint64_t size, uint32_t nattrs,
struct kfd_ioctl_svm_attribute *attrs);
+struct amdgpu_device *svm_range_get_adev_by_id(struct svm_range *prange,
+ uint32_t id);
+int svm_range_vram_node_new(struct amdgpu_device *adev,
+ struct svm_range *prange, bool clear);
+void svm_range_vram_node_free(struct svm_range *prange);
#endif /* KFD_SVM_H_ */