
[21/23] drm/xe/svm: GPU page fault support

Message ID 20240117221223.18540-22-oak.zeng@intel.com (mailing list archive)
State New, archived
Series XeKmd basic SVM support

Commit Message

Zeng, Oak Jan. 17, 2024, 10:12 p.m. UTC
On a GPU page fault of a virtual address, try to fault the virtual
address range into the GPU page table and let the HW retry the
faulting address.

Right now, we always migrate the whole vma which contains the fault
address to the GPU. This is subject to change once a more sophisticated
migration policy is introduced: deciding whether to migrate memory to
the GPU or map it in place in CPU memory, and at what granularity.

There is a rather complicated locking strategy in this patch. See the
lock design section in xe_svm_doc.h for more details.

Signed-off-by: Oak Zeng <oak.zeng@intel.com>
Cc: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Thomas Hellström <thomas.hellstrom@intel.com>
Cc: Brian Welty <brian.welty@intel.com>
---
 drivers/gpu/drm/xe/xe_gt_pagefault.c |   7 ++
 drivers/gpu/drm/xe/xe_svm.c          | 116 +++++++++++++++++++++++++++
 drivers/gpu/drm/xe/xe_svm.h          |   6 ++
 drivers/gpu/drm/xe/xe_svm_range.c    |  43 ++++++++++
 4 files changed, 172 insertions(+)

Comments

Welty, Brian Jan. 23, 2024, 2:06 a.m. UTC | #1
On 1/17/2024 2:12 PM, Oak Zeng wrote:
> On gpu page fault of a virtual address, try to fault in the virtual
> address range to gpu page table and let HW to retry on the faulty
> address.
> 
> Right now, we always migrate the whole vma which contains the fault
> address to GPU. This is subject to change of a more sophisticated
> migration policy: decide whether to migrate memory to GPU or map
> in place with CPU memory; migration granularity.
> 
> There is rather complicated locking strategy in this patch. See more
> details in xe_svm_doc.h, lock design section.
> 
> Signed-off-by: Oak Zeng <oak.zeng@intel.com>
> Cc: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>
> Cc: Matthew Brost <matthew.brost@intel.com>
> Cc: Thomas Hellström <thomas.hellstrom@intel.com>
> Cc: Brian Welty <brian.welty@intel.com>
> ---
>   drivers/gpu/drm/xe/xe_gt_pagefault.c |   7 ++
>   drivers/gpu/drm/xe/xe_svm.c          | 116 +++++++++++++++++++++++++++
>   drivers/gpu/drm/xe/xe_svm.h          |   6 ++
>   drivers/gpu/drm/xe/xe_svm_range.c    |  43 ++++++++++
>   4 files changed, 172 insertions(+)
> 
> diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> index 467d68f8332e..462603abab8a 100644
> --- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
> +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> @@ -22,6 +22,7 @@
>   #include "xe_pt.h"
>   #include "xe_trace.h"
>   #include "xe_vm.h"
> +#include "xe_svm.h"
>   
>   enum fault_type {
>   	NOT_PRESENT = 0,
> @@ -131,6 +132,11 @@ static int handle_pagefault(struct xe_gt *gt, struct pagefault *pf)
>   	if (!vm || !xe_vm_in_fault_mode(vm))
>   		return -EINVAL;
>   
> +	if (vm->svm) {
> +		ret = xe_svm_handle_gpu_fault(vm, gt, pf);
> +		goto put_vm;
> +	}
> +
>   retry_userptr:
>   	/*
>   	 * TODO: Avoid exclusive lock if VM doesn't have userptrs, or
> @@ -219,6 +225,7 @@ static int handle_pagefault(struct xe_gt *gt, struct pagefault *pf)
>   		if (ret >= 0)
>   			ret = 0;
>   	}
> +put_vm:
>   	xe_vm_put(vm);
>   
>   	return ret;
> diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c
> index 0c13690a19f5..1ade8d7f0ab2 100644
> --- a/drivers/gpu/drm/xe/xe_svm.c
> +++ b/drivers/gpu/drm/xe/xe_svm.c
> @@ -12,6 +12,7 @@
>   #include "xe_svm.h"
>   #include <linux/hmm.h>
>   #include <linux/scatterlist.h>
> +#include <drm/xe_drm.h>
>   #include "xe_pt.h"
>   #include "xe_assert.h"
>   #include "xe_vm_types.h"
> @@ -206,3 +207,118 @@ static int svm_populate_range(struct xe_svm_range *svm_range,
>   		kvfree(pfns);
>   	return ret;
>   }
> +
> +/**
> + * svm_access_allowed() - Determine whether read and/or write access to the vma is allowed
> + *
> + * @write: true means a read and write access; false: read only access
> + */
> +static bool svm_access_allowed(struct vm_area_struct *vma, bool write)
> +{
> +	unsigned long access = VM_READ;
> +
> +	if (write)
> +		access |= VM_WRITE;
> +
> +	return (vma->vm_flags & access) == access;
> +}
> +
> +/**
> + * svm_should_migrate() - Determine whether we should migrate a range to
> + * a destination memory region
> + *
> + * @range: The svm memory range to consider
> + * @dst_region: target destination memory region
> + * @is_atomic_fault: Is the intended migration triggered by an atomic access?
> + * On some platforms, we have to migrate memory to guarantee atomic correctness.
> + */
> +static bool svm_should_migrate(struct xe_svm_range *range,
> +				struct xe_mem_region *dst_region, bool is_atomic_fault)
> +{
> +	return true;
> +}
> +
> +/**
> + * xe_svm_handle_gpu_fault() - gpu page fault handler for svm subsystem
> + *
> + * @vm: The vm of the fault.
> + * @gt: The gt hardware on which the fault happens.
> + * @pf: page fault descriptor
> + *
> + * Work out backing memory for the fault address, migrate memory from
> + * system memory to gpu vram if necessary, and map the fault address to
> + * GPU so GPU HW can retry the last operation which caused the GPU
> + * page fault.
> + */
> +int xe_svm_handle_gpu_fault(struct xe_vm *vm,
> +				struct xe_gt *gt,
> +				struct pagefault *pf)
> +{
> +	u8 access_type = pf->access_type;
> +	u64 page_addr = pf->page_addr;
> +	struct hmm_range hmm_range;
> +	struct vm_area_struct *vma;
> +	struct xe_svm_range *range;
> +	struct mm_struct *mm;
> +	struct xe_svm *svm;
> +	int ret = 0;
> +
> +	svm = vm->svm;
> +	if (!svm)
> +		return -EINVAL;
> +
> +	mm = svm->mm;
> +	mmap_read_lock(mm);
> +	vma = find_vma_intersection(mm, page_addr, page_addr + 4);
> +	if (!vma) {
> +		mmap_read_unlock(mm);
> +		return -ENOENT;
> +	}
> +
> +	if (!svm_access_allowed (vma, access_type != ACCESS_TYPE_READ)) {
> +		mmap_read_unlock(mm);
> +		return -EPERM;
> +	}
> +
> +	range = xe_svm_range_from_addr(svm, page_addr);
> +	if (!range) {
> +		range = xe_svm_range_create(svm, vma);
> +		if (!range) {
> +			mmap_read_unlock(mm);
> +			return -ENOMEM;
> +		}
> +	}
> +
> +	if (svm_should_migrate(range, &gt->tile->mem.vram,
> +						access_type == ACCESS_TYPE_ATOMIC))
> +		/** Migrate whole svm range for now.
> +		 *  This is subject to change once we introduce a migration granularity
> +		 *  parameter for user to select.
> +		 *
> +		 *	Migration is best effort. If we failed to migrate to vram,
> +		 *	we just map that range to gpu in system memory. For cases
> +		 *	such as gpu atomic operation which requires memory to be
> +		 *	resident in vram, we will fault again and retry migration.
> +		 */
> +		svm_migrate_range_to_vram(range, vma, gt->tile);
> +
> +	ret = svm_populate_range(range, &hmm_range, vma->vm_flags & VM_WRITE);
> +	mmap_read_unlock(mm);
> +	/** There is no need to destroy this range. Range can be reused later */
> +	if (ret)
> +		goto free_pfns;
> +
> +	/**FIXME: set the DM, AE flags in PTE*/
> +	ret = xe_bind_svm_range(vm, gt->tile, &hmm_range,
> +		!(vma->vm_flags & VM_WRITE) ? DRM_XE_VM_BIND_FLAG_READONLY : 0);
> +	/** Concurrent cpu page table update happened,
> +	 *  Return successfully so we will retry everything
> +	 *  on next gpu page fault.
> +	 */
> +	if (ret == -EAGAIN)
> +		ret = 0;
> +
> +free_pfns:
> +	kvfree(hmm_range.hmm_pfns);
> +	return ret;
> +}
> diff --git a/drivers/gpu/drm/xe/xe_svm.h b/drivers/gpu/drm/xe/xe_svm.h
> index 659bcb7927d6..a8ff4957a9b8 100644
> --- a/drivers/gpu/drm/xe/xe_svm.h
> +++ b/drivers/gpu/drm/xe/xe_svm.h
> @@ -20,6 +20,7 @@
>   
>   struct xe_vm;
>   struct mm_struct;
> +struct pagefault;
>   
> +#define XE_MAX_SVM_PROCESS 5 /* Maximum of 32 SVM processes */
>   extern DECLARE_HASHTABLE(xe_svm_table, XE_MAX_SVM_PROCESS);
> @@ -94,6 +95,8 @@ bool xe_svm_range_belongs_to_vma(struct mm_struct *mm,
>   void xe_svm_range_unregister_mmu_notifier(struct xe_svm_range *range);
>   int xe_svm_range_register_mmu_notifier(struct xe_svm_range *range);
>   void xe_svm_range_prepare_destroy(struct xe_svm_range *range);
> +struct xe_svm_range *xe_svm_range_create(struct xe_svm *svm,
> +									struct vm_area_struct *vma);
>   
>   int xe_svm_build_sg(struct hmm_range *range, struct sg_table *st);
>   int xe_svm_devm_add(struct xe_tile *tile, struct xe_mem_region *mem);
> @@ -106,4 +109,7 @@ int xe_devm_alloc_pages(struct xe_tile *tile,
>   
>   void xe_devm_free_blocks(struct list_head *blocks);
>   void xe_devm_page_free(struct page *page);
> +int xe_svm_handle_gpu_fault(struct xe_vm *vm,
> +				struct xe_gt *gt,
> +				struct pagefault *pf);
>   #endif
> diff --git a/drivers/gpu/drm/xe/xe_svm_range.c b/drivers/gpu/drm/xe/xe_svm_range.c
> index dfb4660dc26f..05c088dddc2d 100644
> --- a/drivers/gpu/drm/xe/xe_svm_range.c
> +++ b/drivers/gpu/drm/xe/xe_svm_range.c
> @@ -182,3 +182,46 @@ void xe_svm_range_prepare_destroy(struct xe_svm_range *range)
>   	xe_invalidate_svm_range(vm, range->start, length);
>   	xe_svm_range_unregister_mmu_notifier(range);
>   }
> +
> +static void add_range_to_svm(struct xe_svm_range *range)
> +{
> +	range->inode.start = range->start;
> +	range->inode.last = range->end;
> +	mutex_lock(&range->svm->mutex);
> +	interval_tree_insert(&range->inode, &range->svm->range_tree);
> +	mutex_unlock(&range->svm->mutex);
> +}

I have the following question / concern.

I believe we are planning for what we call 'shared allocations' to use 
svm.  But what we call device-only allocations will continue to use
GEM_CREATE, and those are in the BO-centric world.

But you still need the application to have one single managed 
address space, yes?  In other words, how will these co-exist?
It seems you will have collisions.

For example, as hmm_range_fault brings a range from the host into the GPU 
address space, what if it was already allocated and in use by VM_BIND for
a GEM_CREATE allocated buffer?  That is of course an application error, 
but the KMD needs to detect it, and provide one single managed address
space across all allocations from the application....

Continuing on this theme: instead of this interval tree, did you 
consider just using drm_gpuvm as the address space manager?
It probably needs some overhaul so it doesn't assume it is managing only
BO-backed allocations, but it could work....
And it already has all the split/merge support there, which you will 
need when adding hints later?

Wanted to hear your thoughts.

-Brian



> +
> +/**
> + * xe_svm_range_create() - create and initialize a svm range
> + *
> + * @svm: the svm that the range belongs to
> + * @vma: the corresponding vma of the range
> + *
> + * Create the range and add it to the svm's interval tree. Register an mmu
> + * interval notifier for this range.
> + *
> + * Return the pointer to the created svm range,
> + * or NULL on failure.
> + */
> +struct xe_svm_range *xe_svm_range_create(struct xe_svm *svm,
> +									struct vm_area_struct *vma)
> +{
> +	struct xe_svm_range *range = kzalloc(sizeof(*range), GFP_KERNEL);
> +
> +	if (!range)
> +		return NULL;
> +
> +	range->start = vma->vm_start;
> +	range->end = vma->vm_end;
> +	range->vma = vma;
> +	range->svm = svm;
> +
> +	if (xe_svm_range_register_mmu_notifier(range)){
> +		kfree(range);
> +		return NULL;
> +	}
> +
> +	add_range_to_svm(range);
> +	return range;
> +}
Zeng, Oak Jan. 23, 2024, 3:09 a.m. UTC | #2
> -----Original Message-----
> From: Welty, Brian <brian.welty@intel.com>
> Sent: Monday, January 22, 2024 9:06 PM
> To: Zeng, Oak <oak.zeng@intel.com>; dri-devel@lists.freedesktop.org; intel-
> xe@lists.freedesktop.org
> Cc: Bommu, Krishnaiah <krishnaiah.bommu@intel.com>; Ghimiray, Himal Prasad
> <himal.prasad.ghimiray@intel.com>; Thomas.Hellstrom@linux.intel.com;
> Vishwanathapura, Niranjana <niranjana.vishwanathapura@intel.com>; Brost,
> Matthew <matthew.brost@intel.com>
> Subject: Re: [PATCH 21/23] drm/xe/svm: GPU page fault support
> 
> 
> On 1/17/2024 2:12 PM, Oak Zeng wrote:
> > <snip>
> 
> I have following question / concern.
> 
> I believe we are planning for what we call 'shared allocations' to use
> svm.  But what we call device-only allocations, will continue to use
> GEM_CREATE and those are in the BO-centric world.
> 
> But you need to still have the application with one single managed
> address space, yes?  In other words, how will theses co-exist?
> It seems you will have collisions.

Yes, those two types of allocators have to co-exist.


> 
> For example as hmm_range_fault brings a range from host into GPU address
> space,  what if it was already allocated and in use by VM_BIND for
> a GEM_CREATE allocated buffer?    That is of course application error,
> but KMD needs to detect it, and provide one single managed address
> space across all allocations from the application....


This is a very good question. Yes, agreed, we should check for this application error. Fortunately this is doable: all vm_bind virtual address ranges are tracked in the xe_vm/drm_gpuvm struct. In this case, we should iterate the drm_gpuvm rb trees of *all* gpu devices (as an xe_vm is for one device only) to see whether there is a conflict, roughly as sketched below. Will make the change soon.
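
A rough sketch of such a check, assuming the process-wide xe_svm keeps a list of its per-device xe_vm structs (the helper name and the vm_list/svm_link fields are made up for illustration; drm_gpuva_find_first() is the real drm_gpuvm lookup, xe_vm embeds its drm_gpuvm as it does today, and the required gpuvm locking is omitted):

/* needs <drm/drm_gpuvm.h> and <linux/list.h> */
static bool svm_range_conflicts_with_vm_bind(struct xe_svm *svm,
					     u64 start, u64 end)
{
	struct xe_vm *vm;

	/* walk every per-device xe_vm attached to this SVM process */
	list_for_each_entry(vm, &svm->vm_list, svm_link) {
		/* first gpuva overlapping [start, end), i.e. a VM_BIND conflict */
		if (drm_gpuva_find_first(&vm->gpuvm, start, end - start))
			return true;
	}

	return false;
}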


> 
> Continuing on this theme.  Instead of this interval tree, did you
> consider to just use drm_gpuvm as address space manager?
> It probably needs some overhaul, and not to assume it is managing only
> BO backed allocations, but could work....
> And it has all the split/merge support already there, which you will
> need for adding hints later?


Yes, another good point. I discussed the approach of leveraging drm_gpuvm with Matt Brost. The good thing is we can leverage all the range split/merge utilities there.

The difficulty with using drm_gpuvm is that, today, xe_vm/drm_gpuvm are per-device based (see the *dev pointer in each structure), but xe_svm should work across all gpu devices.... So it is hard for xe_svm to inherit from drm_gpuvm...

One approach Matt mentioned is to change drm_gpuvm a little to make it work across gpu devices. I think this should be doable. I looked at the dev pointer in drm_gpuvm; it isn't really used a lot. The dev pointer is used just to print some warning messages, no real logic...

So what we can do is remove the dev pointer from drm_gpuvm and, instead of having xe_vm inherit from drm_gpuvm, keep a drm_gpuvm pointer in xe_vm and let xe_svm inherit from drm_gpuvm. Matt pointed all those ideas out to me. We thought we would make svm work without changing the xekmd base driver and drm as a first step, and try this idea as a second step...

But since you also have this idea, I will start an email thread with the drm_gpuvm designer to query the feasibility. If it turns out to be feasible, I will make it work in one step. Considering this will save some code in the memory hint part, I think it is worth the time to consider it right now.

Thanks,
Oak

> 
> Wanted to hear your thoughts.
> 
> -Brian
Zeng, Oak Jan. 23, 2024, 3:21 a.m. UTC | #3
Hi Danilo and all,

During the work on Intel's SVM code, we came up with the idea of making drm_gpuvm work across multiple gpu devices. See some discussion here: https://lore.kernel.org/dri-devel/PH7PR11MB70049E7E6A2F40BF6282ECC292742@PH7PR11MB7004.namprd11.prod.outlook.com/

The reason we are trying to do this is that, for an SVM (shared virtual memory across the cpu program and all gpu programs on all gpu devices) process, the address space has to span all gpu devices. So if we make drm_gpuvm work across devices, then our SVM code can leverage drm_gpuvm as well.

At first look, it seems feasible because drm_gpuvm doesn't really use the drm_device *drm pointer a lot. This param is used only for printing/warnings. So I think maybe we can delete this drm field from drm_gpuvm.

This way, on a multiple gpu device system, for one process, we can have only one drm_gpuvm instance, instead of multiple drm_gpuvm instances (one for each gpu device).

What do you think?

Thanks,
Oak
Christian König Jan. 23, 2024, 11:13 a.m. UTC | #4
Hi Oak,

Am 23.01.24 um 04:21 schrieb Zeng, Oak:
> Hi Danilo and all,
>
> During the work of Intel's SVM code, we came up the idea of making drm_gpuvm to work across multiple gpu devices. See some discussion here: https://lore.kernel.org/dri-devel/PH7PR11MB70049E7E6A2F40BF6282ECC292742@PH7PR11MB7004.namprd11.prod.outlook.com/
>
> The reason we try to do this is, for a SVM (shared virtual memory across cpu program and all gpu program on all gpu devices) process, the address space has to be across all gpu devices. So if we make drm_gpuvm to work across devices, then our SVM code can leverage drm_gpuvm as well.
>
> At a first look, it seems feasible because drm_gpuvm doesn't really use the drm_device *drm pointer a lot. This param is used only for printing/warning. So I think maybe we can delete this drm field from drm_gpuvm.
>
> This way, on a multiple gpu device system, for one process, we can have only one drm_gpuvm instance, instead of multiple drm_gpuvm instances (one for each gpu device).
>
> What do you think?

Well from the GPUVM side I don't think it would make much difference if 
we have the drm device or not.

But the experience we had with the KFD I think I should mention that we 
should absolutely *not* deal with multiple devices at the same time in 
the UAPI or VM objects inside the driver.

The background is that all the APIs inside the Linux kernel are built 
around the idea that they work with only one device at a time. This 
accounts for both low level APIs like the DMA API as well as pretty high 
level things like for example file system address space etc...

So when you have multiple GPUs you either have an inseparable cluster of 
them, in which case you would also only have one drm_device. Or you have 
separated drm_device which also results in separate drm render nodes and 
separate virtual address spaces and also eventually separate IOMMU 
domains which gives you separate dma_addresses for the same page and so 
separate GPUVM page tables....

It's up to you how to implement it, but I think it's pretty clear that 
you need separate drm_gpuvm objects to manage those.

That you map the same thing in all those virtual address spaces at the 
same address is a completely different optimization problem I think. 
What we could certainly do is to optimize hmm_range_fault by making 
hmm_range a reference counted object and using it for multiple devices 
at the same time if those devices request the same range of an mm_struct.
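
Very rough sketch of what I mean, nothing like this exists today:

/* needs <linux/hmm.h> and <linux/kref.h>; purely illustrative */
struct shared_hmm_range {
	struct kref refcount;
	struct mm_struct *mm;
	struct hmm_range range;		/* hmm_pfns valid for every sharer */
};

static struct shared_hmm_range *shared_hmm_range_get(struct shared_hmm_range *r)
{
	kref_get(&r->refcount);
	return r;
}

static void shared_hmm_range_release(struct kref *kref)
{
	struct shared_hmm_range *r =
		container_of(kref, struct shared_hmm_range, refcount);

	kvfree(r->range.hmm_pfns);
	kfree(r);
}

static void shared_hmm_range_put(struct shared_hmm_range *r)
{
	kref_put(&r->refcount, shared_hmm_range_release);
}

Devices faulting on the same range of the same mm_struct would then look up and reuse an existing shared_hmm_range instead of each doing their own hmm_range_fault() walk.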

I think if you start using the same drm_gpuvm for multiple devices you 
will sooner or later start to run into the same mess we have seen with 
KFD, where we moved more and more functionality from the KFD to the DRM 
render node because we found that a lot of the stuff simply doesn't work 
correctly with a single object to maintain the state.

Just one more point to your original discussion on the xe list: I think 
it's perfectly valid for an application to map something at the same 
address where you already have something else mapped.

Cheers,
Christian.

>
> Thanks,
> Oak
Zeng, Oak Jan. 23, 2024, 7:37 p.m. UTC | #5
Thanks Christian. I have some comment inline below.

Danilo, can you also take a look and give your feedback? Thanks.

> -----Original Message-----
> From: Christian König <christian.koenig@amd.com>
> Sent: Tuesday, January 23, 2024 6:13 AM
> To: Zeng, Oak <oak.zeng@intel.com>; Danilo Krummrich <dakr@redhat.com>;
> Dave Airlie <airlied@redhat.com>; Daniel Vetter <daniel@ffwll.ch>
> Cc: Welty, Brian <brian.welty@intel.com>; dri-devel@lists.freedesktop.org; intel-
> xe@lists.freedesktop.org; Bommu, Krishnaiah <krishnaiah.bommu@intel.com>;
> Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>;
> Thomas.Hellstrom@linux.intel.com; Vishwanathapura, Niranjana
> <niranjana.vishwanathapura@intel.com>; Brost, Matthew
> <matthew.brost@intel.com>
> Subject: Re: Making drm_gpuvm work across gpu devices
> 
> Hi Oak,
> 
> Am 23.01.24 um 04:21 schrieb Zeng, Oak:
> > Hi Danilo and all,
> >
> > During the work of Intel's SVM code, we came up the idea of making
> drm_gpuvm to work across multiple gpu devices. See some discussion here:
> https://lore.kernel.org/dri-
> devel/PH7PR11MB70049E7E6A2F40BF6282ECC292742@PH7PR11MB7004.namprd
> 11.prod.outlook.com/
> >
> > The reason we try to do this is, for a SVM (shared virtual memory across cpu
> program and all gpu program on all gpu devices) process, the address space has
> to be across all gpu devices. So if we make drm_gpuvm to work across devices,
> then our SVM code can leverage drm_gpuvm as well.
> >
> > At a first look, it seems feasible because drm_gpuvm doesn't really use the
> drm_device *drm pointer a lot. This param is used only for printing/warning. So I
> think maybe we can delete this drm field from drm_gpuvm.
> >
> > This way, on a multiple gpu device system, for one process, we can have only
> one drm_gpuvm instance, instead of multiple drm_gpuvm instances (one for
> each gpu device).
> >
> > What do you think?
> 
> Well from the GPUVM side I don't think it would make much difference if
> we have the drm device or not.
> 
> But the experience we had with the KFD I think I should mention that we
> should absolutely *not* deal with multiple devices at the same time in
> the UAPI or VM objects inside the driver.
> 
> The background is that all the APIs inside the Linux kernel are build
> around the idea that they work with only one device at a time. This
> accounts for both low level APIs like the DMA API as well as pretty high
> level things like for example file system address space etc...

Yes, most APIs are per-device based.

One exception I know of is actually the kfd SVM API. If you look at the svm_ioctl function, it is per-process based. Each kfd_process represents a process across N gpu devices. Cc Felix.

It should be said that kfd SVM represents a shared virtual address space across the CPU and all GPU devices on the system. This is by the definition of SVM (shared virtual memory). This is very different from our legacy gpu *device* driver, which works for only one device (i.e., if you want one device to access another device's memory, you have to use dma-buf export/import, etc.).

We have the same design requirement for SVM. For anyone who wants to implement the SVM concept, this is a hard requirement. Since drm now has the drm_gpuvm concept, which strictly speaking is designed for one device, I want to see whether we can extend drm_gpuvm to make it work for both a single device (as used in xe) and multiple devices (as will be used in the SVM code). That is why I brought up this topic.

> 
> So when you have multiple GPUs you either have an inseparable cluster of
> them which case you would also only have one drm_device. Or you have
> separated drm_device which also results in separate drm render nodes and
> separate virtual address spaces and also eventually separate IOMMU
> domains which gives you separate dma_addresses for the same page and so
> separate GPUVM page tables....

I am thinking we can still have each device keep its separate drm_device/render node/iommu domain/gpu page table, just as we have today. I do not plan to change this picture.

But the virtual address space will support two modes of operation: 
1. One drm_gpuvm per device. This is when svm is not in the picture.
2. All devices in the process share one single drm_gpuvm, when svm is in the picture. In the xe driver design, we have to support a mixed use of legacy mode (such as gem_create and vm_bind) and svm (such as malloc'ed memory for gpu submission). So whenever SVM is in the picture, we want one single process address space across all devices. drm_gpuvm doesn't need to be aware of those two operation modes; it is the driver's responsibility to use the appropriate mode.

For example, in mode #1, a driver's vm structure (such as xe_vm) can inherit from drm_gpuvm. In mode #2, a driver's svm structure (xe_svm in this series: https://lore.kernel.org/dri-devel/20240117221223.18540-1-oak.zeng@intel.com/) can inherit from drm_gpuvm, while each xe_vm (still a per-device struct) will just have a pointer to the drm_gpuvm structure. This way, when svm is in play, we build a 1 process : 1 mm_struct : 1 xe_svm : N xe_vm correlation, which means a shared address space across gpu devices, roughly as sketched below.
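
Mode #1 is basically what xe has today (xe_vm embeds its own drm_gpuvm). For mode #2, the layout would look roughly like this (illustrative only, not the real xe definitions; the field names are made up):

/* needs <drm/drm_gpuvm.h>, <linux/mm_types.h> and <linux/list.h> */

/* mode #2: one process-wide VA space manager, owned by the svm object */
struct xe_svm {
	struct drm_gpuvm gpuvm;		/* shared across all gpu devices */
	struct mm_struct *mm;		/* the CPU address space it mirrors */
	struct list_head vm_list;	/* all per-device xe_vm of this process */
};

/* only the VA manager is shared; per-device state stays per device */
struct xe_vm {
	struct drm_gpuvm *gpuvm;	/* -> &xe_svm.gpuvm when svm is active */
	struct list_head svm_link;	/* entry in xe_svm.vm_list */
	/* page tables, PTE flags, dma mappings, ... remain here, per device */
};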

This requires some changes to the drm_gpuvm design:
1. The drm_device *drm pointer: in mode #2 operation, this can be NULL, meaning this drm_gpuvm is not for a specific gpu device.
2. The common dma_resv object: drm_gem_object *r_obj. *Does one dma_resv object allocated/initialized for one device work for all devices*? At first look, dma_resv is just a CPU structure maintaining dma-fences, so I guess it should work for all devices? I definitely need you to comment.


> 
> It's up to you how to implement it, but I think it's pretty clear that
> you need separate drm_gpuvm objects to manage those.

As explained above, I am thinking of one drm_gpuvm object across all devices when SVM is in the picture...

> 
> That you map the same thing in all those virtual address spaces at the
> same address is a completely different optimization problem I think.

Not sure I follow here... the requirement from SVM is that one virtual address points to the same physical backing store. For example, whenever the CPU or any GPU device accesses this virtual address, it refers to the same physical content. Of course the physical backing store can be migrated between host memory and any of the GPUs' device memory, but the content should be consistent.

So we are mapping the same physical content to the same virtual address in either the cpu page table or any gpu device's page table...

> What we could certainly do is to optimize hmm_range_fault by making
> hmm_range a reference counted object and using it for multiple devices
> at the same time if those devices request the same range of an mm_struct.
> 

I don't quite follow. If you are trying to resolve a multiple-device concurrent access problem, I think we should serialize concurrent device faults to one address range. The reason is that, during device fault handling, we might migrate the backing store, so hmm_range->hmm_pfns[] might have changed after one device accesses it.

> I think if you start using the same drm_gpuvm for multiple devices you
> will sooner or later start to run into the same mess we have seen with
> KFD, where we moved more and more functionality from the KFD to the DRM
> render node because we found that a lot of the stuff simply doesn't work
> correctly with a single object to maintain the state.

As I understand it, KFD is designed to work across devices. A single pseudo /dev/kfd device represents all hardware gpu devices. That is why during kfd open many pdds (process device data) are created, one for each hardware device of this process. Yes, the code is a little complicated.

Kfd manages the shared virtual address space in the kfd driver code, like the split, merge, etc. Here I am looking at whether we can leverage the drm_gpuvm code for those functions.

As for the shared virtual address space across gpu devices, it is a hard requirement for the svm/system allocator (aka malloc for gpu programs). We need to make it work either at the driver level or the drm_gpuvm level. drm_gpuvm is better because the work can be shared between drivers.

Thanks a lot,
Oak

> 
> Just one more point to your original discussion on the xe list: I think
> it's perfectly valid for an application to map something at the same
> address you already have something else.
> 
> Cheers,
> Christian.
> 
> >
> > Thanks,
> > Oak
Felix Kuehling Jan. 23, 2024, 8:17 p.m. UTC | #6
On 2024-01-23 14:37, Zeng, Oak wrote:
> Thanks Christian. I have some comment inline below.
>
> Danilo, can you also take a look and give your feedback? Thanks.

Sorry, just catching up with this thread now. I'm also not familiar with 
drm_gpuvm.

Some general observations based on my experience with KFD, amdgpu and 
SVM. With SVM we have a single virtual address space managed in user 
mode (basically using mmap) with attributes per virtual address range 
maintained in the kernel mode driver. Different devices get their 
mappings of pieces of that address space using the same virtual 
addresses. We also support migration to different DEVICE_PRIVATE memory 
spaces.

However, we still have page tables managed per device. Each device can 
have different page table formats and layout (e.g. different GPU 
generations in the same system) and the same memory may be mapped with 
different flags on different devices in order to get the right coherence 
behaviour. We also need to maintain per-device DMA mappings somewhere. 
That means, as far as the device page tables are concerned, we still 
have separate address spaces. SVM only adds a layer on top, which 
coordinates these separate device virtual address spaces so that some 
parts of them provide the appearance of a shared virtual address space.

At some point you need to decide, where you draw the boundary between 
managing a per-process shared virtual address space and managing 
per-device virtual address spaces. In amdgpu that boundary is currently 
where kfd_svm code calls amdgpu_vm code to manage the per-device page 
tables.

In the amdgpu driver, we still have the traditional memory management 
APIs in the render nodes that don't do SVM. They share the device 
virtual address spaces with SVM. We have to be careful that we don't try 
to manage the same device virtual address ranges with these two 
different memory managers. In practice, we let the non-SVM APIs use the 
upper half of the canonical address space, while the lower half can be 
used almost entirely for SVM.
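
To put some made-up numbers on it (purely illustrative, not the actual amdgpu defines):

/* hypothetical 48-bit GPU virtual address layout */
#define GPU_VA_BITS	48
#define GPU_VA_SIZE	(1ULL << GPU_VA_BITS)
#define SVM_VA_START	0ULL			/* lower half: SVM-managed */
#define SVM_VA_END	(GPU_VA_SIZE >> 1)
#define GEM_VA_START	(GPU_VA_SIZE >> 1)	/* upper half: BO / VM_BIND */
#define GEM_VA_END	GPU_VA_SIZE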

Regards,
   Felix


Danilo Krummrich Jan. 23, 2024, 11:56 p.m. UTC | #7
Hi Oak,

On 1/23/24 20:37, Zeng, Oak wrote:
> Thanks Christian. I have some comment inline below.
> 
> Danilo, can you also take a look and give your feedback? Thanks.

I agree with everything Christian already wrote. Except for the KFD parts, which
I'm simply not familiar with, I had exactly the same thoughts after reading your
initial mail.

Please find some more comments below.

> 
>> -----Original Message-----
>> From: Christian König <christian.koenig@amd.com>
>> Sent: Tuesday, January 23, 2024 6:13 AM
>> To: Zeng, Oak <oak.zeng@intel.com>; Danilo Krummrich <dakr@redhat.com>;
>> Dave Airlie <airlied@redhat.com>; Daniel Vetter <daniel@ffwll.ch>
>> Cc: Welty, Brian <brian.welty@intel.com>; dri-devel@lists.freedesktop.org; intel-
>> xe@lists.freedesktop.org; Bommu, Krishnaiah <krishnaiah.bommu@intel.com>;
>> Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>;
>> Thomas.Hellstrom@linux.intel.com; Vishwanathapura, Niranjana
>> <niranjana.vishwanathapura@intel.com>; Brost, Matthew
>> <matthew.brost@intel.com>
>> Subject: Re: Making drm_gpuvm work across gpu devices
>>
>> Hi Oak,
>>
>> Am 23.01.24 um 04:21 schrieb Zeng, Oak:
>>> Hi Danilo and all,
>>>
>>> During the work of Intel's SVM code, we came up the idea of making
>> drm_gpuvm to work across multiple gpu devices. See some discussion here:
>> https://lore.kernel.org/dri-
>> devel/PH7PR11MB70049E7E6A2F40BF6282ECC292742@PH7PR11MB7004.namprd
>> 11.prod.outlook.com/
>>>
>>> The reason we try to do this is, for a SVM (shared virtual memory across cpu
>> program and all gpu program on all gpu devices) process, the address space has
>> to be across all gpu devices. So if we make drm_gpuvm to work across devices,
>> then our SVM code can leverage drm_gpuvm as well.
>>>
>>> At a first look, it seems feasible because drm_gpuvm doesn't really use the
>> drm_device *drm pointer a lot. This param is used only for printing/warning. So I
>> think maybe we can delete this drm field from drm_gpuvm.
>>>
>>> This way, on a multiple gpu device system, for one process, we can have only
>> one drm_gpuvm instance, instead of multiple drm_gpuvm instances (one for
>> each gpu device).
>>>
>>> What do you think?
>>
>> Well from the GPUVM side I don't think it would make much difference if
>> we have the drm device or not.
>>
>> But the experience we had with the KFD I think I should mention that we
>> should absolutely *not* deal with multiple devices at the same time in
>> the UAPI or VM objects inside the driver.
>>
>> The background is that all the APIs inside the Linux kernel are build
>> around the idea that they work with only one device at a time. This
>> accounts for both low level APIs like the DMA API as well as pretty high
>> level things like for example file system address space etc...
> 
> Yes most API are per device based.
> 
> One exception I know is actually the kfd SVM API. If you look at the svm_ioctl function, it is per-process based. Each kfd_process represent a process across N gpu devices. Cc Felix.
> 
> Need to say, kfd SVM represent a shared virtual address space across CPU and all GPU devices on the system. This is by the definition of SVM (shared virtual memory). This is very different from our legacy gpu *device* driver which works for only one device (i.e., if you want one device to access another device's memory, you will have to use dma-buf export/import etc).
> 
> We have the same design requirement of SVM. For anyone who want to implement the SVM concept, this is a hard requirement. Since now drm has the drm_gpuvm concept which strictly speaking is designed for one device, I want to see whether we can extend drm_gpuvm to make it work for both single device (as used in xe) and multipe devices (will be used in the SVM code). That is why I brought up this topic.
> 
>>
>> So when you have multiple GPUs you either have an inseparable cluster of
>> them which case you would also only have one drm_device. Or you have
>> separated drm_device which also results in separate drm render nodes and
>> separate virtual address spaces and also eventually separate IOMMU
>> domains which gives you separate dma_addresses for the same page and so
>> separate GPUVM page tables....
> 
> I am thinking we can still make each device has its separate drm_device/render node/iommu domains/gpu page table. Just as what we have today. I am not plan to change this picture.
> 
> But the virtual address space will support two modes of operation:
> 1. one drm_gpuvm per device. This is when svm is not in the picture
> 2. all devices in the process share one single drm_gpuvm, when svm is in the picture. In xe driver design, we have to support a mixture use of legacy mode (such as gem_create and vm_bind) and svm (such as malloc'ed memory for gpu submission). So whenever SVM is in the picture, we want one single process address space across all devices. Drm_gpuvm doesn't need to be aware of those two operation modes. It is driver's responsibility to use different mode.
> 
> For example, in mode #1, a driver's vm structure (such as xe_vm) can inherit from drm_gpuvm. In mode #2, a driver's svm structure (xe_svm in this series: https://lore.kernel.org/dri-devel/20240117221223.18540-1-oak.zeng@intel.com/) can inherit from drm_gpuvm while each xe_vm (still a per-device based struct) will just have a pointer to the drm_gpuvm structure. This way when svm is in play, we build a 1 process:1 mm_struct:1 xe_svm:N xe_vm correlations which means shared address space across gpu devices.

With a shared GPUVM structure, how do you track actual per device resources such as
page tables? You also need to consider that the page table layout, memory mapping
flags may vary from device to device due to different GPU chipsets or revisions.

Also, if you replace the shared GPUVM structure with a pointer to a shared one,
you may run into all kinds of difficulties due to increasing complexity in terms
of locking, synchronization, lifetime and potential unwind operations in error paths.
I haven't thought it through yet, but I wouldn't be surprised entirely if there are
cases where you simply run into circular dependencies.

Also, looking at the conversation in the linked patch series:

<snip>

>> For example as hmm_range_fault brings a range from host into GPU address
>> space,  what if it was already allocated and in use by VM_BIND for
>> a GEM_CREATE allocated buffer?    That is of course application error,
>> but KMD needs to detect it, and provide one single managed address
>> space across all allocations from the application....

> This is very good question. Yes agree we should check this application error. Fortunately this is doable. All vm_bind virtual address range are tracked in xe_vm/drm_gpuvm struct. In this case, we should iterate the drm_gpuvm's rb tree of *all* gpu devices (as xe_vm is for one device only) to see whether there is a conflict. Will make the change soon.

<snip>

How do you do that if xe_vm->gpuvm is just a pointer to the GPUVM structure within xe_svm?

> 
> This requires some changes of drm_gpuvm design:
> 1. The drm_device *drm pointer, in mode #2 operation, this can be NULL, means this drm_gpuvm is not for specific gpu device
> 2. The common dma_resv object: drm_gem_object *r_obj. *Does one dma_resv object allocated/initialized for one device work for all devices*? From first look, dma_resv is just some CPU structure maintaining dma-fences. So I guess it should work for all devices? I definitely need you to comment.

The general rule is that drivers can share the common dma_resv across GEM objects that
are only mapped within the VM owning the dma_resv, but never within another VM.

Now, your question is whether multiple VMs can share the same common dma_resv. I think
that calls for trouble, since it would create dependencies that simply aren't needed
and might even introduce locking issues.

However, that's optional, you can simply decide to not make use of the common dma_resv
and all the optimizations based on it.

> 
> 
>>
>> It's up to you how to implement it, but I think it's pretty clear that
>> you need separate drm_gpuvm objects to manage those.
> 
> As explained above, I am thinking of one drm_gpuvm object across all devices when SVM is in the picture...
> 
>>
>> That you map the same thing in all those virtual address spaces at the
>> same address is a completely different optimization problem I think.
> 
> Not sure I follow here... the requirement from SVM is, one virtual address points to same physical backing store. For example, whenever CPU or any GPU device access this virtual address, it refers to the same physical content. Of course the physical backing store can be migrated b/t host memory and any of the GPU's device memory, but the content should be consistent.

Technically, multiple different GPUs will have separate virtual address spaces, it's
just that you create mappings within all of them such that the same virtual address
resolves to the same physical content on all of them.

So, having a single GPUVM instance representing all of them might give the illusion of
a single unified address space, but you still need to maintain each device's address
space backing resources, such as page tables, separately.

- Danilo

> 
> So we are mapping same physical content to the same virtual address in either cpu page table or any gpu device's page table...
> 
>> What we could certainly do is to optimize hmm_range_fault by making
>> hmm_range a reference counted object and using it for multiple devices
>> at the same time if those devices request the same range of an mm_struct.
>>
> 
> Not very follow. If you are trying to resolve a multiple devices concurrent access problem, I think we should serialize concurrent device fault to one address range. The reason is, during device fault handling, we might migrate the backing store so hmm_range->hmm_pfns[] might have changed after one device access it.
> 
>> I think if you start using the same drm_gpuvm for multiple devices you
>> will sooner or later start to run into the same mess we have seen with
>> KFD, where we moved more and more functionality from the KFD to the DRM
>> render node because we found that a lot of the stuff simply doesn't work
>> correctly with a single object to maintain the state.
> 
> As I understand it, KFD is designed to work across devices. A single pseudo /dev/kfd device represent all hardware gpu devices. That is why during kfd open, many pdd (process device data) is created, each for one hardware device for this process. Yes the codes are a little complicated.
> 
> Kfd manages the shared virtual address space in the kfd driver codes, like the split, merging etc. Here I am looking whether we can leverage the drm_gpuvm code for those functions.
> 
> As of the shared virtual address space across gpu devices, it is a hard requirement for svm/system allocator (aka malloc for gpu program). We need to make it work either at driver level or drm_gpuvm level. Drm_gpuvm is better because the work can be shared b/t drivers.
> 
> Thanks a lot,
> Oak
> 
>>
>> Just one more point to your original discussion on the xe list: I think
>> it's perfectly valid for an application to map something at the same
>> address you already have something else.
>>
>> Cheers,
>> Christian.
>>
>>>
>>> Thanks,
>>> Oak
>
Zeng, Oak Jan. 24, 2024, 3:57 a.m. UTC | #8
Thanks a lot Danilo.

Maybe I wasn't clear enough. In the solution I proposed, each device still has its own vm/page tables, and each device still needs to manage its mappings, page table flags etc. It is just that in the SVM use case, all devices share one drm_gpuvm instance. As I understand it, drm_gpuvm's main function is va range split and merge, and I don't see why that wouldn't work across gpu devices.
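
Roughly, the layout I have in mind looks like this (just a sketch to illustrate the idea; xe_svm and all the field names here are invented for illustration and are not code from the posted series):

#include <drm/drm_gpuvm.h>
#include <linux/list.h>
#include <linux/mm_types.h>

struct xe_svm {
	struct drm_gpuvm gpuvm;		/* one shared va split/merge state per process */
	struct mm_struct *mm;		/* the CPU address space being shared */
	struct list_head vm_list;	/* per-device xe_vm's sharing this address space */
};

struct xe_vm {
	struct drm_gpuvm *gpuvm;	/* points at &xe_svm->gpuvm in the SVM case */
	/* per-device state stays here: page tables, PTE flags, ... */
};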

But I read more about drm_gpuvm. Its split/merge functions take a drm_gem_object parameter, see drm_gpuvm_sm_map_ops_create and drm_gpuvm_sm_map. Actually the whole of drm_gpuvm is designed for BO-centric drivers; for example, it has a drm_gpuvm_bo concept to keep track of the 1 BO : N gpuva mapping. The whole purpose of leveraging drm_gpuvm was to re-use the va split/merge functions for SVM, but in our SVM implementation there is no buffer object at all. So I don't think our SVM code can leverage drm_gpuvm.
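
For reference, the entry point I mean looks roughly like this (signature paraphrased from include/drm/drm_gpuvm.h; the parameter names may differ slightly):

struct drm_gpuva_ops *
drm_gpuvm_sm_map_ops_create(struct drm_gpuvm *gpuvm,
			    u64 req_addr, u64 req_range,
			    struct drm_gem_object *req_obj, u64 req_offset);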

I will give up this approach, unless Matt or Brian can see a way.

A few replies inline.... @Welty, Brian, I had more thoughts inline on one of your original questions....

> -----Original Message-----
> From: Danilo Krummrich <dakr@redhat.com>
> Sent: Tuesday, January 23, 2024 6:57 PM
> To: Zeng, Oak <oak.zeng@intel.com>; Christian König
> <christian.koenig@amd.com>; Dave Airlie <airlied@redhat.com>; Daniel Vetter
> <daniel@ffwll.ch>; Felix Kuehling <felix.kuehling@amd.com>
> Cc: Welty, Brian <brian.welty@intel.com>; dri-devel@lists.freedesktop.org; intel-
> xe@lists.freedesktop.org; Bommu, Krishnaiah <krishnaiah.bommu@intel.com>;
> Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>;
> Thomas.Hellstrom@linux.intel.com; Vishwanathapura, Niranjana
> <niranjana.vishwanathapura@intel.com>; Brost, Matthew
> <matthew.brost@intel.com>; Gupta, saurabhg <saurabhg.gupta@intel.com>
> Subject: Re: Making drm_gpuvm work across gpu devices
> 
> Hi Oak,
> 
> On 1/23/24 20:37, Zeng, Oak wrote:
> > Thanks Christian. I have some comment inline below.
> >
> > Danilo, can you also take a look and give your feedback? Thanks.
> 
> I agree with everything Christian already wrote. Except for the KFD parts, which
> I'm simply not familiar with, I had exactly the same thoughts after reading your
> initial mail.
> 
> Please find some more comments below.
> 
> >
> >> -----Original Message-----
> >> From: Christian König <christian.koenig@amd.com>
> >> Sent: Tuesday, January 23, 2024 6:13 AM
> >> To: Zeng, Oak <oak.zeng@intel.com>; Danilo Krummrich <dakr@redhat.com>;
> >> Dave Airlie <airlied@redhat.com>; Daniel Vetter <daniel@ffwll.ch>
> >> Cc: Welty, Brian <brian.welty@intel.com>; dri-devel@lists.freedesktop.org;
> intel-
> >> xe@lists.freedesktop.org; Bommu, Krishnaiah
> <krishnaiah.bommu@intel.com>;
> >> Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>;
> >> Thomas.Hellstrom@linux.intel.com; Vishwanathapura, Niranjana
> >> <niranjana.vishwanathapura@intel.com>; Brost, Matthew
> >> <matthew.brost@intel.com>
> >> Subject: Re: Making drm_gpuvm work across gpu devices
> >>
> >> Hi Oak,
> >>
> >> Am 23.01.24 um 04:21 schrieb Zeng, Oak:
> >>> Hi Danilo and all,
> >>>
> >>> During the work of Intel's SVM code, we came up the idea of making
> >> drm_gpuvm to work across multiple gpu devices. See some discussion here:
> >> https://lore.kernel.org/dri-
> >>
> devel/PH7PR11MB70049E7E6A2F40BF6282ECC292742@PH7PR11MB7004.namprd
> >> 11.prod.outlook.com/
> >>>
> >>> The reason we try to do this is, for a SVM (shared virtual memory across cpu
> >> program and all gpu program on all gpu devices) process, the address space
> has
> >> to be across all gpu devices. So if we make drm_gpuvm to work across devices,
> >> then our SVM code can leverage drm_gpuvm as well.
> >>>
> >>> At a first look, it seems feasible because drm_gpuvm doesn't really use the
> >> drm_device *drm pointer a lot. This param is used only for printing/warning.
> So I
> >> think maybe we can delete this drm field from drm_gpuvm.
> >>>
> >>> This way, on a multiple gpu device system, for one process, we can have only
> >> one drm_gpuvm instance, instead of multiple drm_gpuvm instances (one for
> >> each gpu device).
> >>>
> >>> What do you think?
> >>
> >> Well from the GPUVM side I don't think it would make much difference if
> >> we have the drm device or not.
> >>
> >> But the experience we had with the KFD I think I should mention that we
> >> should absolutely *not* deal with multiple devices at the same time in
> >> the UAPI or VM objects inside the driver.
> >>
> >> The background is that all the APIs inside the Linux kernel are build
> >> around the idea that they work with only one device at a time. This
> >> accounts for both low level APIs like the DMA API as well as pretty high
> >> level things like for example file system address space etc...
> >
> > Yes most API are per device based.
> >
> > One exception I know is actually the kfd SVM API. If you look at the svm_ioctl
> function, it is per-process based. Each kfd_process represent a process across N
> gpu devices. Cc Felix.
> >
> > Need to say, kfd SVM represent a shared virtual address space across CPU and
> all GPU devices on the system. This is by the definition of SVM (shared virtual
> memory). This is very different from our legacy gpu *device* driver which works
> for only one device (i.e., if you want one device to access another device's
> memory, you will have to use dma-buf export/import etc).
> >
> > We have the same design requirement of SVM. For anyone who want to
> implement the SVM concept, this is a hard requirement. Since now drm has the
> drm_gpuvm concept which strictly speaking is designed for one device, I want to
> see whether we can extend drm_gpuvm to make it work for both single device
> (as used in xe) and multipe devices (will be used in the SVM code). That is why I
> brought up this topic.
> >
> >>
> >> So when you have multiple GPUs you either have an inseparable cluster of
> >> them which case you would also only have one drm_device. Or you have
> >> separated drm_device which also results in separate drm render nodes and
> >> separate virtual address spaces and also eventually separate IOMMU
> >> domains which gives you separate dma_addresses for the same page and so
> >> separate GPUVM page tables....
> >
> > I am thinking we can still make each device has its separate drm_device/render
> node/iommu domains/gpu page table. Just as what we have today. I am not plan
> to change this picture.
> >
> > But the virtual address space will support two modes of operation:
> > 1. one drm_gpuvm per device. This is when svm is not in the picture
> > 2. all devices in the process share one single drm_gpuvm, when svm is in the
> picture. In xe driver design, we have to support a mixture use of legacy mode
> (such as gem_create and vm_bind) and svm (such as malloc'ed memory for gpu
> submission). So whenever SVM is in the picture, we want one single process
> address space across all devices. Drm_gpuvm doesn't need to be aware of those
> two operation modes. It is driver's responsibility to use different mode.
> >
> > For example, in mode #1, a driver's vm structure (such as xe_vm) can inherit
> from drm_gpuvm. In mode #2, a driver's svm structure (xe_svm in this series:
> https://lore.kernel.org/dri-devel/20240117221223.18540-1-oak.zeng@intel.com/)
> can inherit from drm_gpuvm while each xe_vm (still a per-device based struct)
> will just have a pointer to the drm_gpuvm structure. This way when svm is in play,
> we build a 1 process:1 mm_struct:1 xe_svm:N xe_vm correlations which means
> shared address space across gpu devices.
> 
> With a shared GPUVM structure, how do you track actual per device resources
> such as
> page tables? You also need to consider that the page table layout, memory
> mapping
> flags may vary from device to device due to different GPU chipsets or revisions.

The per-device page tables, flags etc. are still managed on a per-device basis, i.e. in the xe_vm in the xekmd driver.

> 
> Also, if you replace the shared GPUVM structure with a pointer to a shared one,
> you may run into all kinds of difficulties due to increasing complexity in terms
> of locking, synchronization, lifetime and potential unwind operations in error
> paths.
> I haven't thought it through yet, but I wouldn't be surprised entirely if there are
> cases where you simply run into circular dependencies.

Makes sense. I can't see through this without proof-of-concept code either.

> 
> Also, looking at the conversation in the linked patch series:
> 
> <snip>
> 
> >> For example as hmm_range_fault brings a range from host into GPU address
> >> space,  what if it was already allocated and in use by VM_BIND for
> >> a GEM_CREATE allocated buffer?    That is of course application error,
> >> but KMD needs to detect it, and provide one single managed address
> >> space across all allocations from the application....
> 
> > This is very good question. Yes agree we should check this application error.
> Fortunately this is doable. All vm_bind virtual address range are tracked in
> xe_vm/drm_gpuvm struct. In this case, we should iterate the drm_gpuvm's rb
> tree of *all* gpu devices (as xe_vm is for one device only) to see whether there
> is a conflict. Will make the change soon.
> 
> <snip>
> 
> How do you do that if xe_vm->gpuvm is just a pointer to the GPUVM structure
> within xe_svm?

In the proposed approach, we have a single drm_gpuvm instance for one process. Every device's xe_vm points to this drm_gpuvm instance, and the drm_gpuvm's rb tree maintains all the va ranges we have in this process. We can just walk this rb tree to see whether there is a conflict.
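
As a minimal sketch (assuming the drm_gpuva_find_first() helper from drm_gpuvm.h and the hypothetical xe_svm layout mentioned earlier in this thread), the conflict check could be as simple as:

#include <drm/drm_gpuvm.h>

/* Illustrative only: does [addr, addr + range) overlap any existing
 * mapping tracked in the shared drm_gpuvm (e.g. a vm_bind range)?
 */
static bool xe_svm_range_conflicts(struct xe_svm *svm, u64 addr, u64 range)
{
	return drm_gpuva_find_first(&svm->gpuvm, addr, range) != NULL;
}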

But I didn't answer Brian's question completely... In a mixed use of vm_bind and malloc/mmap, the virtual addresses used by vm_bind should first be reserved in user space using mmap. So all valid virtual addresses should be tracked by a Linux kernel vma struct.

Both vm_bind'ed and malloc'ed virtual addresses can cause a gpu page fault. Our fault handler should first check whether the faulting va belongs to a vm_bind mapping and service the fault accordingly; if not, serve the fault in the SVM path; if the SVM path also fails, it is an invalid address. So from the user's perspective, either of these flows works:
ptr = mmap()
vm_bind(ptr, bo)
submit gpu kernel using ptr
Or:
ptr = mmap()
submit gpu kernel using ptr
Whether vm_bind was called decides the gpu fault handler code path (a rough sketch follows below). Hopefully this answers @Welty, Brian's original question.
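
Sketched out (hypothetical helper names only; this is not the code from the posted series):

/* Illustrative dispatch for the two user flows above. */
static int handle_gpu_fault(struct xe_vm *vm, u64 fault_addr)
{
	/* 1. address was vm_bind'ed: service with the BO-backed path */
	if (xe_vm_find_vm_bind_mapping(vm, fault_addr))
		return xe_vm_bind_handle_fault(vm, fault_addr);

	/* 2. otherwise try the SVM path (malloc'ed/mmap'ed memory) */
	if (!xe_svm_handle_fault(vm->svm, fault_addr))
		return 0;

	/* 3. neither path could resolve it: invalid address */
	return -EFAULT;
}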


> 
> >
> > This requires some changes of drm_gpuvm design:
> > 1. The drm_device *drm pointer, in mode #2 operation, this can be NULL,
> means this drm_gpuvm is not for specific gpu device
> > 2. The common dma_resv object: drm_gem_object *r_obj. *Does one
> dma_resv object allocated/initialized for one device work for all devices*? From
> first look, dma_resv is just some CPU structure maintaining dma-fences. So I
> guess it should work for all devices? I definitely need you to comment.
> 
> The general rule is that drivers can share the common dma_resv across GEM
> objects that
> are only mapped within the VM owning the dma_resv, but never within another
> VM.
> 
> Now, your question is whether multiple VMs can share the same common
> dma_resv. I think
> that calls for trouble, since it would create dependencies that simply aren't
> needed
> and might even introduce locking issues.
> 
> However, that's optional, you can simply decide to not make use of the common
> dma_resv
> and all the optimizations based on it.

Ok, got it.
> 
> >
> >
> >>
> >> It's up to you how to implement it, but I think it's pretty clear that
> >> you need separate drm_gpuvm objects to manage those.
> >
> > As explained above, I am thinking of one drm_gpuvm object across all devices
> when SVM is in the picture...
> >
> >>
> >> That you map the same thing in all those virtual address spaces at the
> >> same address is a completely different optimization problem I think.
> >
> > Not sure I follow here... the requirement from SVM is, one virtual address
> points to same physical backing store. For example, whenever CPU or any GPU
> device access this virtual address, it refers to the same physical content. Of
> course the physical backing store can be migrated b/t host memory and any of
> the GPU's device memory, but the content should be consistent.
> 
> Technically, multiple different GPUs will have separate virtual address spaces, it's
> just that you create mappings within all of them such that the same virtual
> address
> resolves to the same physical content on all of them.
> 
> So, having a single GPUVM instance representing all of them might give the
> illusion of
> a single unified address space, but you still need to maintain each device's
> address
> space backing resources, such as page tables, separately.

Yes agreed.

Regards,
Oak
> 
> - Danilo
> 
> >
> > So we are mapping same physical content to the same virtual address in either
> cpu page table or any gpu device's page table...
> >
> >> What we could certainly do is to optimize hmm_range_fault by making
> >> hmm_range a reference counted object and using it for multiple devices
> >> at the same time if those devices request the same range of an mm_struct.
> >>
> >
> > Not very follow. If you are trying to resolve a multiple devices concurrent access
> problem, I think we should serialize concurrent device fault to one address range.
> The reason is, during device fault handling, we might migrate the backing store so
> hmm_range->hmm_pfns[] might have changed after one device access it.
> >
> >> I think if you start using the same drm_gpuvm for multiple devices you
> >> will sooner or later start to run into the same mess we have seen with
> >> KFD, where we moved more and more functionality from the KFD to the DRM
> >> render node because we found that a lot of the stuff simply doesn't work
> >> correctly with a single object to maintain the state.
> >
> > As I understand it, KFD is designed to work across devices. A single pseudo
> /dev/kfd device represent all hardware gpu devices. That is why during kfd open,
> many pdd (process device data) is created, each for one hardware device for this
> process. Yes the codes are a little complicated.
> >
> > Kfd manages the shared virtual address space in the kfd driver codes, like the
> split, merging etc. Here I am looking whether we can leverage the drm_gpuvm
> code for those functions.
> >
> > As of the shared virtual address space across gpu devices, it is a hard
> requirement for svm/system allocator (aka malloc for gpu program). We need to
> make it work either at driver level or drm_gpuvm level. Drm_gpuvm is better
> because the work can be shared b/t drivers.
> >
> > Thanks a lot,
> > Oak
> >
> >>
> >> Just one more point to your original discussion on the xe list: I think
> >> it's perfectly valid for an application to map something at the same
> >> address you already have something else.
> >>
> >> Cheers,
> >> Christian.
> >>
> >>>
> >>> Thanks,
> >>> Oak
> >
Zeng, Oak Jan. 24, 2024, 4:14 a.m. UTC | #9
Danilo,

Maybe before I give up, I should also ask: currently drm_gpuvm is designed for a BO-centric world. Is it easy to make the va range split/merge work on plain va ranges, without a BO? Conceptually this should work, since we are merging/splitting virtual address ranges, which can be decoupled completely from BOs.

> -----Original Message-----
> From: dri-devel <dri-devel-bounces@lists.freedesktop.org> On Behalf Of Zeng,
> Oak
> Sent: Tuesday, January 23, 2024 10:57 PM
> To: Danilo Krummrich <dakr@redhat.com>; Christian König
> <christian.koenig@amd.com>; Dave Airlie <airlied@redhat.com>; Daniel Vetter
> <daniel@ffwll.ch>; Felix Kuehling <felix.kuehling@amd.com>; Welty, Brian
> <brian.welty@intel.com>
> Cc: Brost, Matthew <matthew.brost@intel.com>;
> Thomas.Hellstrom@linux.intel.com; dri-devel@lists.freedesktop.org; Ghimiray,
> Himal Prasad <himal.prasad.ghimiray@intel.com>; Gupta, saurabhg
> <saurabhg.gupta@intel.com>; Bommu, Krishnaiah
> <krishnaiah.bommu@intel.com>; Vishwanathapura, Niranjana
> <niranjana.vishwanathapura@intel.com>; intel-xe@lists.freedesktop.org
> Subject: RE: Making drm_gpuvm work across gpu devices
> 
> Thanks a lot Danilo.
> 
> Maybe I wasn't clear enough. In the solution I proposed, each device still have
> separate vm/page tables. Each device still need to manage the mapping, page
> table flags etc. It is just in svm use case, all devices share one drm_gpuvm
> instance. As I understand it, drm_gpuvm's main function is the va range split and
> merging. I don't see why it doesn't work across gpu devices.
> 
> But I read more about drm_gpuvm. Its split merge function takes a
> drm_gem_object parameter, see drm_gpuvm_sm_map_ops_create and
> drm_gpuvm_sm_map. Actually the whole drm_gpuvm is designed for BO-centric
> driver, for example, it has a drm_gpuvm_bo concept to keep track of the
> 1BO:Ngpuva mapping. The whole purpose of leveraging drm_gpuvm is to re-use
> the va split/merge functions for SVM. But in our SVM implementation, there is no
> buffer object at all. So I don't think our SVM codes can leverage drm_gpuvm.
> 
> I will give up this approach, unless Matt or Brian can see a way.
> 
> A few replies inline.... @Welty, Brian I had more thoughts inline to one of your
> original question....
> 
> > -----Original Message-----
> > From: Danilo Krummrich <dakr@redhat.com>
> > Sent: Tuesday, January 23, 2024 6:57 PM
> > To: Zeng, Oak <oak.zeng@intel.com>; Christian König
> > <christian.koenig@amd.com>; Dave Airlie <airlied@redhat.com>; Daniel Vetter
> > <daniel@ffwll.ch>; Felix Kuehling <felix.kuehling@amd.com>
> > Cc: Welty, Brian <brian.welty@intel.com>; dri-devel@lists.freedesktop.org;
> intel-
> > xe@lists.freedesktop.org; Bommu, Krishnaiah <krishnaiah.bommu@intel.com>;
> > Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>;
> > Thomas.Hellstrom@linux.intel.com; Vishwanathapura, Niranjana
> > <niranjana.vishwanathapura@intel.com>; Brost, Matthew
> > <matthew.brost@intel.com>; Gupta, saurabhg <saurabhg.gupta@intel.com>
> > Subject: Re: Making drm_gpuvm work across gpu devices
> >
> > Hi Oak,
> >
> > On 1/23/24 20:37, Zeng, Oak wrote:
> > > Thanks Christian. I have some comment inline below.
> > >
> > > Danilo, can you also take a look and give your feedback? Thanks.
> >
> > I agree with everything Christian already wrote. Except for the KFD parts, which
> > I'm simply not familiar with, I had exactly the same thoughts after reading your
> > initial mail.
> >
> > Please find some more comments below.
> >
> > >
> > >> -----Original Message-----
> > >> From: Christian König <christian.koenig@amd.com>
> > >> Sent: Tuesday, January 23, 2024 6:13 AM
> > >> To: Zeng, Oak <oak.zeng@intel.com>; Danilo Krummrich
> <dakr@redhat.com>;
> > >> Dave Airlie <airlied@redhat.com>; Daniel Vetter <daniel@ffwll.ch>
> > >> Cc: Welty, Brian <brian.welty@intel.com>; dri-devel@lists.freedesktop.org;
> > intel-
> > >> xe@lists.freedesktop.org; Bommu, Krishnaiah
> > <krishnaiah.bommu@intel.com>;
> > >> Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>;
> > >> Thomas.Hellstrom@linux.intel.com; Vishwanathapura, Niranjana
> > >> <niranjana.vishwanathapura@intel.com>; Brost, Matthew
> > >> <matthew.brost@intel.com>
> > >> Subject: Re: Making drm_gpuvm work across gpu devices
> > >>
> > >> Hi Oak,
> > >>
> > >> Am 23.01.24 um 04:21 schrieb Zeng, Oak:
> > >>> Hi Danilo and all,
> > >>>
> > >>> During the work of Intel's SVM code, we came up the idea of making
> > >> drm_gpuvm to work across multiple gpu devices. See some discussion here:
> > >> https://lore.kernel.org/dri-
> > >>
> >
> devel/PH7PR11MB70049E7E6A2F40BF6282ECC292742@PH7PR11MB7004.namprd
> > >> 11.prod.outlook.com/
> > >>>
> > >>> The reason we try to do this is, for a SVM (shared virtual memory across
> cpu
> > >> program and all gpu program on all gpu devices) process, the address space
> > has
> > >> to be across all gpu devices. So if we make drm_gpuvm to work across
> devices,
> > >> then our SVM code can leverage drm_gpuvm as well.
> > >>>
> > >>> At a first look, it seems feasible because drm_gpuvm doesn't really use the
> > >> drm_device *drm pointer a lot. This param is used only for printing/warning.
> > So I
> > >> think maybe we can delete this drm field from drm_gpuvm.
> > >>>
> > >>> This way, on a multiple gpu device system, for one process, we can have
> only
> > >> one drm_gpuvm instance, instead of multiple drm_gpuvm instances (one
> for
> > >> each gpu device).
> > >>>
> > >>> What do you think?
> > >>
> > >> Well from the GPUVM side I don't think it would make much difference if
> > >> we have the drm device or not.
> > >>
> > >> But the experience we had with the KFD I think I should mention that we
> > >> should absolutely *not* deal with multiple devices at the same time in
> > >> the UAPI or VM objects inside the driver.
> > >>
> > >> The background is that all the APIs inside the Linux kernel are build
> > >> around the idea that they work with only one device at a time. This
> > >> accounts for both low level APIs like the DMA API as well as pretty high
> > >> level things like for example file system address space etc...
> > >
> > > Yes most API are per device based.
> > >
> > > One exception I know is actually the kfd SVM API. If you look at the svm_ioctl
> > function, it is per-process based. Each kfd_process represent a process across N
> > gpu devices. Cc Felix.
> > >
> > > Need to say, kfd SVM represent a shared virtual address space across CPU
> and
> > all GPU devices on the system. This is by the definition of SVM (shared virtual
> > memory). This is very different from our legacy gpu *device* driver which
> works
> > for only one device (i.e., if you want one device to access another device's
> > memory, you will have to use dma-buf export/import etc).
> > >
> > > We have the same design requirement of SVM. For anyone who want to
> > implement the SVM concept, this is a hard requirement. Since now drm has the
> > drm_gpuvm concept which strictly speaking is designed for one device, I want
> to
> > see whether we can extend drm_gpuvm to make it work for both single device
> > (as used in xe) and multipe devices (will be used in the SVM code). That is why I
> > brought up this topic.
> > >
> > >>
> > >> So when you have multiple GPUs you either have an inseparable cluster of
> > >> them which case you would also only have one drm_device. Or you have
> > >> separated drm_device which also results in separate drm render nodes and
> > >> separate virtual address spaces and also eventually separate IOMMU
> > >> domains which gives you separate dma_addresses for the same page and so
> > >> separate GPUVM page tables....
> > >
> > > I am thinking we can still make each device has its separate
> drm_device/render
> > node/iommu domains/gpu page table. Just as what we have today. I am not
> plan
> > to change this picture.
> > >
> > > But the virtual address space will support two modes of operation:
> > > 1. one drm_gpuvm per device. This is when svm is not in the picture
> > > 2. all devices in the process share one single drm_gpuvm, when svm is in the
> > picture. In xe driver design, we have to support a mixture use of legacy mode
> > (such as gem_create and vm_bind) and svm (such as malloc'ed memory for gpu
> > submission). So whenever SVM is in the picture, we want one single process
> > address space across all devices. Drm_gpuvm doesn't need to be aware of
> those
> > two operation modes. It is driver's responsibility to use different mode.
> > >
> > > For example, in mode #1, a driver's vm structure (such as xe_vm) can inherit
> > from drm_gpuvm. In mode #2, a driver's svm structure (xe_svm in this series:
> > https://lore.kernel.org/dri-devel/20240117221223.18540-1-
> oak.zeng@intel.com/)
> > can inherit from drm_gpuvm while each xe_vm (still a per-device based struct)
> > will just have a pointer to the drm_gpuvm structure. This way when svm is in
> play,
> > we build a 1 process:1 mm_struct:1 xe_svm:N xe_vm correlations which means
> > shared address space across gpu devices.
> >
> > With a shared GPUVM structure, how do you track actual per device resources
> > such as
> > page tables? You also need to consider that the page table layout, memory
> > mapping
> > flags may vary from device to device due to different GPU chipsets or revisions.
> 
> The per device page table, flags etc are still managed per-device based, which is
> the xe_vm in the xekmd driver.
> 
> >
> > Also, if you replace the shared GPUVM structure with a pointer to a shared one,
> > you may run into all kinds of difficulties due to increasing complexity in terms
> > of locking, synchronization, lifetime and potential unwind operations in error
> > paths.
> > I haven't thought it through yet, but I wouldn't be surprised entirely if there are
> > cases where you simply run into circular dependencies.
> 
> Make sense, I can't see through this without a prove of concept code either.
> 
> >
> > Also, looking at the conversation in the linked patch series:
> >
> > <snip>
> >
> > >> For example as hmm_range_fault brings a range from host into GPU address
> > >> space,  what if it was already allocated and in use by VM_BIND for
> > >> a GEM_CREATE allocated buffer?    That is of course application error,
> > >> but KMD needs to detect it, and provide one single managed address
> > >> space across all allocations from the application....
> >
> > > This is very good question. Yes agree we should check this application error.
> > Fortunately this is doable. All vm_bind virtual address range are tracked in
> > xe_vm/drm_gpuvm struct. In this case, we should iterate the drm_gpuvm's rb
> > tree of *all* gpu devices (as xe_vm is for one device only) to see whether
> there
> > is a conflict. Will make the change soon.
> >
> > <snip>
> >
> > How do you do that if xe_vm->gpuvm is just a pointer to the GPUVM structure
> > within xe_svm?
> 
> In the proposed approach, we have a single drm_gpuvm instance for one process.
> All device's xe_vm pointing to this drm_gpuvm instance. This drm_gpuvm's rb
> tree maintains all the va range we have in this process. We can just walk this rb
> tree to see if there is a conflict.
> 
> But I didn't answer Brian's question completely... In a mixed use of vm_bind and
> malloc/mmap, the virtual address used by vm_bind should first be reserved in
> user space using mmap. So all valid virtual address should be tracked by linux
> kernel vma_struct.
> 
> Both vm_bind and malloc'ed virtual address can cause a gpu page fault. Our fault
> handler should first see whether this is a vm_bind va and service the fault
> accordingly; if not, then serve the fault in the SVM path; if SVM path also failed, it
> is an invalid address. So from user perspective, user can use:
> Ptr = mmap()
> Vm_bind(ptr, bo)
> Submit gpu kernel using ptr
> Or:
> Ptr = mmap()
> Submit gpu kernel using ptr
> Whether vm_bind is called or not decides the gpu fault handler code path.
> Hopefully this answers @Welty, Brian's original question
> 
> 
> >
> > >
> > > This requires some changes of drm_gpuvm design:
> > > 1. The drm_device *drm pointer, in mode #2 operation, this can be NULL,
> > means this drm_gpuvm is not for specific gpu device
> > > 2. The common dma_resv object: drm_gem_object *r_obj. *Does one
> > dma_resv object allocated/initialized for one device work for all devices*? From
> > first look, dma_resv is just some CPU structure maintaining dma-fences. So I
> > guess it should work for all devices? I definitely need you to comment.
> >
> > The general rule is that drivers can share the common dma_resv across GEM
> > objects that
> > are only mapped within the VM owning the dma_resv, but never within
> another
> > VM.
> >
> > Now, your question is whether multiple VMs can share the same common
> > dma_resv. I think
> > that calls for trouble, since it would create dependencies that simply aren't
> > needed
> > and might even introduce locking issues.
> >
> > However, that's optional, you can simply decide to not make use of the
> common
> > dma_resv
> > and all the optimizations based on it.
> 
> Ok, got it.
> >
> > >
> > >
> > >>
> > >> It's up to you how to implement it, but I think it's pretty clear that
> > >> you need separate drm_gpuvm objects to manage those.
> > >
> > > As explained above, I am thinking of one drm_gpuvm object across all devices
> > when SVM is in the picture...
> > >
> > >>
> > >> That you map the same thing in all those virtual address spaces at the
> > >> same address is a completely different optimization problem I think.
> > >
> > > Not sure I follow here... the requirement from SVM is, one virtual address
> > points to same physical backing store. For example, whenever CPU or any GPU
> > device access this virtual address, it refers to the same physical content. Of
> > course the physical backing store can be migrated b/t host memory and any of
> > the GPU's device memory, but the content should be consistent.
> >
> > Technically, multiple different GPUs will have separate virtual address spaces,
> it's
> > just that you create mappings within all of them such that the same virtual
> > address
> > resolves to the same physical content on all of them.
> >
> > So, having a single GPUVM instance representing all of them might give the
> > illusion of
> > a single unified address space, but you still need to maintain each device's
> > address
> > space backing resources, such as page tables, separately.
> 
> Yes agreed.
> 
> Regards,
> Oak
> >
> > - Danilo
> >
> > >
> > > So we are mapping same physical content to the same virtual address in
> either
> > cpu page table or any gpu device's page table...
> > >
> > >> What we could certainly do is to optimize hmm_range_fault by making
> > >> hmm_range a reference counted object and using it for multiple devices
> > >> at the same time if those devices request the same range of an mm_struct.
> > >>
> > >
> > > Not very follow. If you are trying to resolve a multiple devices concurrent
> access
> > problem, I think we should serialize concurrent device fault to one address
> range.
> > The reason is, during device fault handling, we might migrate the backing store
> so
> > hmm_range->hmm_pfns[] might have changed after one device access it.
> > >
> > >> I think if you start using the same drm_gpuvm for multiple devices you
> > >> will sooner or later start to run into the same mess we have seen with
> > >> KFD, where we moved more and more functionality from the KFD to the
> DRM
> > >> render node because we found that a lot of the stuff simply doesn't work
> > >> correctly with a single object to maintain the state.
> > >
> > > As I understand it, KFD is designed to work across devices. A single pseudo
> > /dev/kfd device represent all hardware gpu devices. That is why during kfd
> open,
> > many pdd (process device data) is created, each for one hardware device for
> this
> > process. Yes the codes are a little complicated.
> > >
> > > Kfd manages the shared virtual address space in the kfd driver codes, like the
> > split, merging etc. Here I am looking whether we can leverage the drm_gpuvm
> > code for those functions.
> > >
> > > As of the shared virtual address space across gpu devices, it is a hard
> > requirement for svm/system allocator (aka malloc for gpu program). We need
> to
> > make it work either at driver level or drm_gpuvm level. Drm_gpuvm is better
> > because the work can be shared b/t drivers.
> > >
> > > Thanks a lot,
> > > Oak
> > >
> > >>
> > >> Just one more point to your original discussion on the xe list: I think
> > >> it's perfectly valid for an application to map something at the same
> > >> address you already have something else.
> > >>
> > >> Cheers,
> > >> Christian.
> > >>
> > >>>
> > >>> Thanks,
> > >>> Oak
> > >
Christian König Jan. 24, 2024, 6:48 a.m. UTC | #10
Am 24.01.24 um 05:14 schrieb Zeng, Oak:
> Danilo,
>
> Maybe before I give up, I should also ask, currently drm_gpuvm is designed for BO-centric world. Is it easy to make the va range split/merge work simply for va range, but without BO? Conceptually this should work as we are merge/splitting virtual address range which can be decoupled completely from BO.

At least AMD GPUs have a similar requirement to manage virtual ranges 
which are not backed by a BO. For example PRT ranges.

I expect that we can still use drm_gpuvm for this and the BO is simply 
NULL in that case.
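
Something like this should be all that's needed from the drm_gpuvm side (just a sketch, assuming drm_gpuvm_sm_map_ops_create() and friends keep accepting a NULL GEM object; untested):

#include <drm/drm_gpuvm.h>

/* Illustrative only: compute the split/merge ops for mapping
 * [addr, addr + range) with no backing BO.
 */
static int map_range_without_bo(struct drm_gpuvm *gpuvm, u64 addr, u64 range)
{
	struct drm_gpuva_ops *ops;
	struct drm_gpuva_op *op;

	ops = drm_gpuvm_sm_map_ops_create(gpuvm, addr, range,
					  NULL /* no BO */, 0);
	if (IS_ERR(ops))
		return PTR_ERR(ops);

	drm_gpuva_for_each_op(op, ops) {
		/* driver updates its page tables and va tracking here */
	}

	drm_gpuva_ops_free(gpuvm, ops);
	return 0;
}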

Regards,
Christian.

>
>> -----Original Message-----
>> From: dri-devel <dri-devel-bounces@lists.freedesktop.org> On Behalf Of Zeng,
>> Oak
>> Sent: Tuesday, January 23, 2024 10:57 PM
>> To: Danilo Krummrich <dakr@redhat.com>; Christian König
>> <christian.koenig@amd.com>; Dave Airlie <airlied@redhat.com>; Daniel Vetter
>> <daniel@ffwll.ch>; Felix Kuehling <felix.kuehling@amd.com>; Welty, Brian
>> <brian.welty@intel.com>
>> Cc: Brost, Matthew <matthew.brost@intel.com>;
>> Thomas.Hellstrom@linux.intel.com; dri-devel@lists.freedesktop.org; Ghimiray,
>> Himal Prasad <himal.prasad.ghimiray@intel.com>; Gupta, saurabhg
>> <saurabhg.gupta@intel.com>; Bommu, Krishnaiah
>> <krishnaiah.bommu@intel.com>; Vishwanathapura, Niranjana
>> <niranjana.vishwanathapura@intel.com>; intel-xe@lists.freedesktop.org
>> Subject: RE: Making drm_gpuvm work across gpu devices
>>
>> Thanks a lot Danilo.
>>
>> Maybe I wasn't clear enough. In the solution I proposed, each device still have
>> separate vm/page tables. Each device still need to manage the mapping, page
>> table flags etc. It is just in svm use case, all devices share one drm_gpuvm
>> instance. As I understand it, drm_gpuvm's main function is the va range split and
>> merging. I don't see why it doesn't work across gpu devices.
>>
>> But I read more about drm_gpuvm. Its split merge function takes a
>> drm_gem_object parameter, see drm_gpuvm_sm_map_ops_create and
>> drm_gpuvm_sm_map. Actually the whole drm_gpuvm is designed for BO-centric
>> driver, for example, it has a drm_gpuvm_bo concept to keep track of the
>> 1BO:Ngpuva mapping. The whole purpose of leveraging drm_gpuvm is to re-use
>> the va split/merge functions for SVM. But in our SVM implementation, there is no
>> buffer object at all. So I don't think our SVM codes can leverage drm_gpuvm.
>>
>> I will give up this approach, unless Matt or Brian can see a way.
>>
>> A few replies inline.... @Welty, Brian I had more thoughts inline to one of your
>> original question....
>>
>>> -----Original Message-----
>>> From: Danilo Krummrich <dakr@redhat.com>
>>> Sent: Tuesday, January 23, 2024 6:57 PM
>>> To: Zeng, Oak <oak.zeng@intel.com>; Christian König
>>> <christian.koenig@amd.com>; Dave Airlie <airlied@redhat.com>; Daniel Vetter
>>> <daniel@ffwll.ch>; Felix Kuehling <felix.kuehling@amd.com>
>>> Cc: Welty, Brian <brian.welty@intel.com>; dri-devel@lists.freedesktop.org;
>> intel-
>>> xe@lists.freedesktop.org; Bommu, Krishnaiah <krishnaiah.bommu@intel.com>;
>>> Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>;
>>> Thomas.Hellstrom@linux.intel.com; Vishwanathapura, Niranjana
>>> <niranjana.vishwanathapura@intel.com>; Brost, Matthew
>>> <matthew.brost@intel.com>; Gupta, saurabhg <saurabhg.gupta@intel.com>
>>> Subject: Re: Making drm_gpuvm work across gpu devices
>>>
>>> Hi Oak,
>>>
>>> On 1/23/24 20:37, Zeng, Oak wrote:
>>>> Thanks Christian. I have some comment inline below.
>>>>
>>>> Danilo, can you also take a look and give your feedback? Thanks.
>>> I agree with everything Christian already wrote. Except for the KFD parts, which
>>> I'm simply not familiar with, I had exactly the same thoughts after reading your
>>> initial mail.
>>>
>>> Please find some more comments below.
>>>
>>>>> -----Original Message-----
>>>>> From: Christian König <christian.koenig@amd.com>
>>>>> Sent: Tuesday, January 23, 2024 6:13 AM
>>>>> To: Zeng, Oak <oak.zeng@intel.com>; Danilo Krummrich
>> <dakr@redhat.com>;
>>>>> Dave Airlie <airlied@redhat.com>; Daniel Vetter <daniel@ffwll.ch>
>>>>> Cc: Welty, Brian <brian.welty@intel.com>; dri-devel@lists.freedesktop.org;
>>> intel-
>>>>> xe@lists.freedesktop.org; Bommu, Krishnaiah
>>> <krishnaiah.bommu@intel.com>;
>>>>> Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>;
>>>>> Thomas.Hellstrom@linux.intel.com; Vishwanathapura, Niranjana
>>>>> <niranjana.vishwanathapura@intel.com>; Brost, Matthew
>>>>> <matthew.brost@intel.com>
>>>>> Subject: Re: Making drm_gpuvm work across gpu devices
>>>>>
>>>>> Hi Oak,
>>>>>
>>>>> Am 23.01.24 um 04:21 schrieb Zeng, Oak:
>>>>>> Hi Danilo and all,
>>>>>>
>>>>>> During the work of Intel's SVM code, we came up the idea of making
>>>>> drm_gpuvm to work across multiple gpu devices. See some discussion here:
>>>>> https://lore.kernel.org/dri-
>>>>>
>> devel/PH7PR11MB70049E7E6A2F40BF6282ECC292742@PH7PR11MB7004.namprd
>>>>> 11.prod.outlook.com/
>>>>>> The reason we try to do this is, for a SVM (shared virtual memory across
>> cpu
>>>>> program and all gpu program on all gpu devices) process, the address space
>>> has
>>>>> to be across all gpu devices. So if we make drm_gpuvm to work across
>> devices,
>>>>> then our SVM code can leverage drm_gpuvm as well.
>>>>>> At a first look, it seems feasible because drm_gpuvm doesn't really use the
>>>>> drm_device *drm pointer a lot. This param is used only for printing/warning.
>>> So I
>>>>> think maybe we can delete this drm field from drm_gpuvm.
>>>>>> This way, on a multiple gpu device system, for one process, we can have
>> only
>>>>> one drm_gpuvm instance, instead of multiple drm_gpuvm instances (one
>> for
>>>>> each gpu device).
>>>>>> What do you think?
>>>>> Well from the GPUVM side I don't think it would make much difference if
>>>>> we have the drm device or not.
>>>>>
>>>>> But the experience we had with the KFD I think I should mention that we
>>>>> should absolutely *not* deal with multiple devices at the same time in
>>>>> the UAPI or VM objects inside the driver.
>>>>>
>>>>> The background is that all the APIs inside the Linux kernel are build
>>>>> around the idea that they work with only one device at a time. This
>>>>> accounts for both low level APIs like the DMA API as well as pretty high
>>>>> level things like for example file system address space etc...
>>>> Yes most API are per device based.
>>>>
>>>> One exception I know is actually the kfd SVM API. If you look at the svm_ioctl
>>> function, it is per-process based. Each kfd_process represent a process across N
>>> gpu devices. Cc Felix.
>>>> Need to say, kfd SVM represent a shared virtual address space across CPU
>> and
>>> all GPU devices on the system. This is by the definition of SVM (shared virtual
>>> memory). This is very different from our legacy gpu *device* driver which
>> works
>>> for only one device (i.e., if you want one device to access another device's
>>> memory, you will have to use dma-buf export/import etc).
>>>> We have the same design requirement of SVM. For anyone who want to
>>> implement the SVM concept, this is a hard requirement. Since now drm has the
>>> drm_gpuvm concept which strictly speaking is designed for one device, I want
>> to
>>> see whether we can extend drm_gpuvm to make it work for both single device
>>> (as used in xe) and multipe devices (will be used in the SVM code). That is why I
>>> brought up this topic.
>>>>> So when you have multiple GPUs you either have an inseparable cluster of
>>>>> them which case you would also only have one drm_device. Or you have
>>>>> separated drm_device which also results in separate drm render nodes and
>>>>> separate virtual address spaces and also eventually separate IOMMU
>>>>> domains which gives you separate dma_addresses for the same page and so
>>>>> separate GPUVM page tables....
>>>> I am thinking we can still make each device has its separate
>> drm_device/render
>>> node/iommu domains/gpu page table. Just as what we have today. I am not
>> plan
>>> to change this picture.
>>>> But the virtual address space will support two modes of operation:
>>>> 1. one drm_gpuvm per device. This is when svm is not in the picture
>>>> 2. all devices in the process share one single drm_gpuvm, when svm is in the
>>> picture. In xe driver design, we have to support a mixture use of legacy mode
>>> (such as gem_create and vm_bind) and svm (such as malloc'ed memory for gpu
>>> submission). So whenever SVM is in the picture, we want one single process
>>> address space across all devices. Drm_gpuvm doesn't need to be aware of
>> those
>>> two operation modes. It is driver's responsibility to use different mode.
>>>> For example, in mode #1, a driver's vm structure (such as xe_vm) can inherit
>>> from drm_gpuvm. In mode #2, a driver's svm structure (xe_svm in this series:
>>> https://lore.kernel.org/dri-devel/20240117221223.18540-1-
>> oak.zeng@intel.com/)
>>> can inherit from drm_gpuvm while each xe_vm (still a per-device based struct)
>>> will just have a pointer to the drm_gpuvm structure. This way when svm is in
>> play,
>>> we build a 1 process:1 mm_struct:1 xe_svm:N xe_vm correlations which means
>>> shared address space across gpu devices.
>>>
>>> With a shared GPUVM structure, how do you track actual per device resources
>>> such as
>>> page tables? You also need to consider that the page table layout, memory
>>> mapping
>>> flags may vary from device to device due to different GPU chipsets or revisions.
>> The per device page table, flags etc are still managed per-device based, which is
>> the xe_vm in the xekmd driver.
>>
>>> Also, if you replace the shared GPUVM structure with a pointer to a shared one,
>>> you may run into all kinds of difficulties due to increasing complexity in terms
>>> of locking, synchronization, lifetime and potential unwind operations in error
>>> paths.
>>> I haven't thought it through yet, but I wouldn't be surprised entirely if there are
>>> cases where you simply run into circular dependencies.
>> Make sense, I can't see through this without a prove of concept code either.
>>
>>> Also, looking at the conversation in the linked patch series:
>>>
>>> <snip>
>>>
>>>>> For example as hmm_range_fault brings a range from host into GPU address
>>>>> space,  what if it was already allocated and in use by VM_BIND for
>>>>> a GEM_CREATE allocated buffer?    That is of course application error,
>>>>> but KMD needs to detect it, and provide one single managed address
>>>>> space across all allocations from the application....
>>>> This is very good question. Yes agree we should check this application error.
>>> Fortunately this is doable. All vm_bind virtual address range are tracked in
>>> xe_vm/drm_gpuvm struct. In this case, we should iterate the drm_gpuvm's rb
>>> tree of *all* gpu devices (as xe_vm is for one device only) to see whether
>> there
>>> is a conflict. Will make the change soon.
>>>
>>> <snip>
>>>
>>> How do you do that if xe_vm->gpuvm is just a pointer to the GPUVM structure
>>> within xe_svm?
>> In the proposed approach, we have a single drm_gpuvm instance for one process.
>> All device's xe_vm pointing to this drm_gpuvm instance. This drm_gpuvm's rb
>> tree maintains all the va range we have in this process. We can just walk this rb
>> tree to see if there is a conflict.
>>
>> But I didn't answer Brian's question completely... In a mixed use of vm_bind and
>> malloc/mmap, the virtual address used by vm_bind should first be reserved in
>> user space using mmap. So all valid virtual address should be tracked by linux
>> kernel vma_struct.
>>
>> Both vm_bind and malloc'ed virtual address can cause a gpu page fault. Our fault
>> handler should first see whether this is a vm_bind va and service the fault
>> accordingly; if not, then serve the fault in the SVM path; if SVM path also failed, it
>> is an invalid address. So from user perspective, user can use:
>> Ptr = mmap()
>> Vm_bind(ptr, bo)
>> Submit gpu kernel using ptr
>> Or:
>> Ptr = mmap()
>> Submit gpu kernel using ptr
>> Whether vm_bind is called or not decides the gpu fault handler code path.
>> Hopefully this answers @Welty, Brian's original question
>>
>>
>>>> This requires some changes of drm_gpuvm design:
>>>> 1. The drm_device *drm pointer, in mode #2 operation, this can be NULL,
>>> means this drm_gpuvm is not for specific gpu device
>>>> 2. The common dma_resv object: drm_gem_object *r_obj. *Does one
>>> dma_resv object allocated/initialized for one device work for all devices*? From
>>> first look, dma_resv is just some CPU structure maintaining dma-fences. So I
>>> guess it should work for all devices? I definitely need you to comment.
>>>
>>> The general rule is that drivers can share the common dma_resv across GEM
>>> objects that
>>> are only mapped within the VM owning the dma_resv, but never within
>> another
>>> VM.
>>>
>>> Now, your question is whether multiple VMs can share the same common
>>> dma_resv. I think
>>> that calls for trouble, since it would create dependencies that simply aren't
>>> needed
>>> and might even introduce locking issues.
>>>
>>> However, that's optional, you can simply decide to not make use of the
>> common
>>> dma_resv
>>> and all the optimizations based on it.
>> Ok, got it.
>>>>
>>>>> It's up to you how to implement it, but I think it's pretty clear that
>>>>> you need separate drm_gpuvm objects to manage those.
>>>> As explained above, I am thinking of one drm_gpuvm object across all devices
>>> when SVM is in the picture...
>>>>> That you map the same thing in all those virtual address spaces at the
>>>>> same address is a completely different optimization problem I think.
>>>> Not sure I follow here... the requirement from SVM is, one virtual address
>>> points to same physical backing store. For example, whenever CPU or any GPU
>>> device access this virtual address, it refers to the same physical content. Of
>>> course the physical backing store can be migrated b/t host memory and any of
>>> the GPU's device memory, but the content should be consistent.
>>>
>>> Technically, multiple different GPUs will have separate virtual address spaces,
>> it's
>>> just that you create mappings within all of them such that the same virtual
>>> address
>>> resolves to the same physical content on all of them.
>>>
>>> So, having a single GPUVM instance representing all of them might give the
>>> illusion of
>>> a single unified address space, but you still need to maintain each device's
>>> address
>>> space backing resources, such as page tables, separately.
>> Yes agreed.
>>
>> Regards,
>> Oak
>>> - Danilo
>>>
>>>> So we are mapping same physical content to the same virtual address in
>> either
>>> cpu page table or any gpu device's page table...
>>>>> What we could certainly do is to optimize hmm_range_fault by making
>>>>> hmm_range a reference counted object and using it for multiple devices
>>>>> at the same time if those devices request the same range of an mm_struct.
>>>>>
>>>> Not very follow. If you are trying to resolve a multiple devices concurrent
>> access
>>> problem, I think we should serialize concurrent device fault to one address
>> range.
>>> The reason is, during device fault handling, we might migrate the backing store
>> so
>>> hmm_range->hmm_pfns[] might have changed after one device access it.
>>>>> I think if you start using the same drm_gpuvm for multiple devices you
>>>>> will sooner or later start to run into the same mess we have seen with
>>>>> KFD, where we moved more and more functionality from the KFD to the
>> DRM
>>>>> render node because we found that a lot of the stuff simply doesn't work
>>>>> correctly with a single object to maintain the state.
>>>> As I understand it, KFD is designed to work across devices. A single pseudo
>>> /dev/kfd device represent all hardware gpu devices. That is why during kfd
>> open,
>>> many pdd (process device data) is created, each for one hardware device for
>> this
>>> process. Yes the codes are a little complicated.
>>>> Kfd manages the shared virtual address space in the kfd driver codes, like the
>>> split, merging etc. Here I am looking whether we can leverage the drm_gpuvm
>>> code for those functions.
>>>> As of the shared virtual address space across gpu devices, it is a hard
>>> requirement for svm/system allocator (aka malloc for gpu program). We need
>> to
>>> make it work either at driver level or drm_gpuvm level. Drm_gpuvm is better
>>> because the work can be shared b/t drivers.
>>>> Thanks a lot,
>>>> Oak
>>>>
>>>>> Just one more point to your original discussion on the xe list: I think
>>>>> it's perfectly valid for an application to map something at the same
>>>>> address you already have something else.
>>>>>
>>>>> Cheers,
>>>>> Christian.
>>>>>
>>>>>> Thanks,
>>>>>> Oak
Christian König Jan. 24, 2024, 8:33 a.m. UTC | #11
Am 23.01.24 um 20:37 schrieb Zeng, Oak:
> [SNIP]
> Yes most API are per device based.
>
> One exception I know is actually the kfd SVM API. If you look at the svm_ioctl function, it is per-process based. Each kfd_process represent a process across N gpu devices.

Yeah and that was a big mistake in my opinion. We should really not do 
that ever again.

> Need to say, kfd SVM represent a shared virtual address space across CPU and all GPU devices on the system. This is by the definition of SVM (shared virtual memory). This is very different from our legacy gpu *device* driver which works for only one device (i.e., if you want one device to access another device's memory, you will have to use dma-buf export/import etc).

Exactly that thinking is what we have currently found to be a blocker for 
virtualization projects. Having SVM as a device-independent feature which 
somehow ties to the process address space turned out to be an extremely 
bad idea.

The background is that this only works for some use cases but not all of 
them.

What's working much better is to just have a mirror functionality which 
says that a range A..B of the process address space is mapped into a 
range C..D of the GPU address space.
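
Just to illustrate (structure and field names invented here, this is not 
code from any driver):

/* Per-device mirror record: range [cpu_start, cpu_start + size) of the
 * process address space is mapped at gpu_start in this device's GPU
 * address space.  Nothing here is tied to a global per-process object.
 */
struct gpu_mirror_range {
	struct mm_struct *mm;			/* CPU address space being mirrored */
	u64 cpu_start;				/* A..B in the text above */
	u64 gpu_start;				/* C..D in the text above */
	u64 size;
	struct mmu_interval_notifier notifier;	/* invalidation hook for this range */
};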

Those ranges can then be used to implement the SVM feature required for 
higher level APIs and not something you need at the UAPI or even inside 
the low level kernel memory management.

When you talk about migrating memory to a device you also do this on a 
per device basis and *not* tied to the process address space. If you 
then get crappy performance because userspace gave contradicting 
information where to migrate memory then that's a bug in userspace and 
not something the kernel should try to prevent somehow.

[SNIP]
>> I think if you start using the same drm_gpuvm for multiple devices you
>> will sooner or later start to run into the same mess we have seen with
>> KFD, where we moved more and more functionality from the KFD to the DRM
>> render node because we found that a lot of the stuff simply doesn't work
>> correctly with a single object to maintain the state.
> As I understand it, KFD is designed to work across devices. A single pseudo /dev/kfd device represent all hardware gpu devices. That is why during kfd open, many pdd (process device data) is created, each for one hardware device for this process.

Yes, I'm perfectly aware of that. And I can only repeat myself that I 
see this design as a rather extreme failure. And I think it's one of the 
reasons why NVidia is so dominant with Cuda.

This whole approach KFD takes was designed with the idea of extending 
the CPU process into the GPUs, but this idea only works for a few use 
cases and is not something we should apply to drivers in general.

Very good examples are virtualization use cases where you end up with 
CPU address != GPU address because the VAs are actually coming from the 
guest VM and not the host process.

SVM is a high level concept of OpenCL, Cuda, ROCm etc. This should not 
have any influence on the design of the kernel UAPI.

If you want to do something similar as KFD for Xe I think you need to 
get explicit permission to do this from Dave and Daniel and maybe even 
Linus.

Regards,
Christian.
Zeng, Oak Jan. 25, 2024, 1:17 a.m. UTC | #12
Hi Christian,

Even though I mentioned the KFD design, I didn’t mean to copy it. I also had a hard time understanding the difficulty KFD has in a virtualization environment.

For us, xekmd doesn't need to know whether it is running on bare metal or in a virtualized environment. Xekmd is always a guest driver. All the virtual addresses used in xekmd are guest virtual addresses. For SVM, we require that all the VF devices share one single address space with the guest CPU program. So a design that works in a bare metal environment automatically works in a virtualized environment. +@Shah, Ankur N<mailto:ankur.n.shah@intel.com> +@Winiarski, Michal<mailto:michal.winiarski@intel.com> to back me up if I am wrong.

Again, a shared virtual address space b/t the cpu and all gpu devices is a hard requirement for our system allocator design (meaning malloc’ed memory, cpu stack variables and globals can be used directly in a gpu program; same requirement as the kfd SVM design). This was aligned with our user space software stack.

For anyone who wants to implement a system allocator, or SVM, this is a hard requirement. I started this thread hoping I could leverage the drm_gpuvm design to manage the shared virtual address space (the address range split/merge code was scary to me and I didn’t want to re-invent it). I guess my takeaway from you and Danilo is that this approach is a NAK. Thomas also mentioned to me that drm_gpuvm is overkill for our svm address range split/merge. So I will make things work first by managing address ranges inside xekmd, and I can re-look at the drm_gpuvm approach in the future.

Maybe a pseudo user program can illustrate our programming model:


fd0 = open(card0)
fd1 = open(card1)
vm0 = xe_vm_create(fd0)   // driver creates the process xe_svm on the process's first vm_create
vm1 = xe_vm_create(fd1)   // driver re-uses the xe_svm created above if called from the same process
queue0 = xe_exec_queue_create(fd0, vm0)
queue1 = xe_exec_queue_create(fd1, vm1)
// check p2p capability by calling the L0 API....
ptr = malloc()            // this replaces bo_create, vm_bind, dma-buf import/export
xe_exec(queue0, ptr)      // submit gpu job which uses ptr, on card0
xe_exec(queue1, ptr)      // submit gpu job which uses ptr, on card1
// gpu page fault handles memory allocation/migration/mapping to gpu

As you can see from the above model, our design is a little different from the KFD design. The user needs to explicitly create a gpuvm (vm0 and vm1 above) for each gpu device. Internally the driver has an xe_svm that represents the shared address space b/t the cpu and multiple gpu devices, but the end user doesn’t see it and doesn’t need to create it. The shared virtual address space is really managed by the linux core mm (through the vma struct, mm_struct etc). From each gpu device’s perspective, it just operates on its own gpuvm, unaware of the existence of the other gpuvms, even though in reality all those gpuvms share the same virtual address space.

See one more comment inline

From: Christian König <christian.koenig@amd.com>
Sent: Wednesday, January 24, 2024 3:33 AM
To: Zeng, Oak <oak.zeng@intel.com>; Danilo Krummrich <dakr@redhat.com>; Dave Airlie <airlied@redhat.com>; Daniel Vetter <daniel@ffwll.ch>; Felix Kuehling <felix.kuehling@amd.com>
Cc: Welty, Brian <brian.welty@intel.com>; dri-devel@lists.freedesktop.org; intel-xe@lists.freedesktop.org; Bommu, Krishnaiah <krishnaiah.bommu@intel.com>; Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>; Thomas.Hellstrom@linux.intel.com; Vishwanathapura, Niranjana <niranjana.vishwanathapura@intel.com>; Brost, Matthew <matthew.brost@intel.com>; Gupta, saurabhg <saurabhg.gupta@intel.com>
Subject: Re: Making drm_gpuvm work across gpu devices

Am 23.01.24 um 20:37 schrieb Zeng, Oak:

[SNIP]



Yes most API are per device based.



One exception I know is actually the kfd SVM API. If you look at the svm_ioctl function, it is per-process based. Each kfd_process represent a process across N gpu devices.

Yeah and that was a big mistake in my opinion. We should really not do that ever again.



Need to say, kfd SVM represent a shared virtual address space across CPU and all GPU devices on the system. This is by the definition of SVM (shared virtual memory). This is very different from our legacy gpu *device* driver which works for only one device (i.e., if you want one device to access another device's memory, you will have to use dma-buf export/import etc).

Exactly that thinking is what we have currently found as blocker for a virtualization projects. Having SVM as device independent feature which somehow ties to the process address space turned out to be an extremely bad idea.

The background is that this only works for some use cases but not all of them.

What's working much better is to just have a mirror functionality which says that a range A..B of the process address space is mapped into a range C..D of the GPU address space.

Those ranges can then be used to implement the SVM feature required for higher level APIs and not something you need at the UAPI or even inside the low level kernel memory management.

When you talk about migrating memory to a device you also do this on a per device basis and *not* tied to the process address space. If you then get crappy performance because userspace gave contradicting information where to migrate memory then that's a bug in userspace and not something the kernel should try to prevent somehow.

[SNIP]


I think if you start using the same drm_gpuvm for multiple devices you

will sooner or later start to run into the same mess we have seen with

KFD, where we moved more and more functionality from the KFD to the DRM

render node because we found that a lot of the stuff simply doesn't work

correctly with a single object to maintain the state.



As I understand it, KFD is designed to work across devices. A single pseudo /dev/kfd device represent all hardware gpu devices. That is why during kfd open, many pdd (process device data) is created, each for one hardware device for this process.

Yes, I'm perfectly aware of that. And I can only repeat myself that I see this design as a rather extreme failure. And I think it's one of the reasons why NVidia is so dominant with Cuda.

This whole approach KFD takes was designed with the idea of extending the CPU process into the GPUs, but this idea only works for a few use cases and is not something we should apply to drivers in general.

A very good example are virtualization use cases where you end up with CPU address != GPU address because the VAs are actually coming from the guest VM and not the host process.


I don’t get the problem here. For us, under virtualization, both the cpu address and the gpu virtual address operated on in xekmd are guest virtual addresses. They can still share the same virtual address space (as SVM requires).

Oak


SVM is a high level concept of OpenCL, Cuda, ROCm etc.. This should not have any influence on the design of the kernel UAPI.

If you want to do something similar as KFD for Xe I think you need to get explicit permission to do this from Dave and Daniel and maybe even Linus.

Regards,
Christian.
David Airlie Jan. 25, 2024, 1:25 a.m. UTC | #13
>
>
> For us, Xekmd doesn't need to know it is running under bare metal or virtualized environment. Xekmd is always a guest driver. All the virtual address used in xekmd is guest virtual address. For SVM, we require all the VF devices share one single shared address space with guest CPU program. So all the design works in bare metal environment can automatically work under virtualized environment. +@Shah, Ankur N +@Winiarski, Michal to backup me if I am wrong.
>
>
>
> Again, shared virtual address space b/t cpu and all gpu devices is a hard requirement for our system allocator design (which means malloc’ed memory, cpu stack variables, globals can be directly used in gpu program. Same requirement as kfd SVM design). This was aligned with our user space software stack.

Just to make a very general point here (I'm hoping you listen to
Christian a bit more and hoping he replies in more detail), but just
because you have a system allocator design done, it doesn't in any way
enforce the requirements on the kernel driver to accept that design.
Bad system design should be pushed back on, not enforced in
implementation stages. It's a trap Intel falls into regularly since
they say well we already agreed this design with the userspace team
and we can't change it now. This isn't acceptable. Design includes
upstream discussion and feedback; if you, say, misdesigned the system
allocator (and I'm not saying you definitely have), and this is
pushing back on that, then you have to go fix your system
architecture.

KFD was an experiment like this, I pushed back on AMD at the start
saying it was likely a bad plan, we let it go and got a lot of
experience in why it was a bad design.

Dave.
Zeng, Oak Jan. 25, 2024, 1:39 a.m. UTC | #14
Thank you Felix for sharing. See a few comments inline

> -----Original Message-----
> From: Felix Kuehling <felix.kuehling@amd.com>
> Sent: Tuesday, January 23, 2024 3:17 PM
> To: Zeng, Oak <oak.zeng@intel.com>; Christian König <christian.koenig@amd.com>;
> Danilo Krummrich <dakr@redhat.com>; Dave Airlie <airlied@redhat.com>; Daniel
> Vetter <daniel@ffwll.ch>
> Cc: Welty, Brian <brian.welty@intel.com>; dri-devel@lists.freedesktop.org; intel-
> xe@lists.freedesktop.org; Bommu, Krishnaiah <krishnaiah.bommu@intel.com>;
> Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>;
> Thomas.Hellstrom@linux.intel.com; Vishwanathapura, Niranjana
> <niranjana.vishwanathapura@intel.com>; Brost, Matthew
> <matthew.brost@intel.com>; Gupta, saurabhg <saurabhg.gupta@intel.com>
> Subject: Re: Making drm_gpuvm work across gpu devices
> 
> On 2024-01-23 14:37, Zeng, Oak wrote:
> > Thanks Christian. I have some comment inline below.
> >
> > Danilo, can you also take a look and give your feedback? Thanks.
> 
> Sorry, just catching up with this thread now. I'm also not familiar with
> drm_gpuvm.
> 
> Some general observations based on my experience with KFD, amdgpu and
> SVM. With SVM we have a single virtual address space managed in user
> mode (basically using mmap) with attributes per virtual address range
> maintained in the kernel mode driver. Different devices get their
> mappings of pieces of that address space using the same virtual
> addresses. We also support migration to different DEVICE_PRIVATE memory
> spaces.

I think one and the same virtual address can be mapped into different devices. For different devices, reading from the same virtual address yields the same content. The driver either maps the page tables to point to the same physical location, or migrates before mapping. I guess you imply this.

> 
> However, we still have page tables managed per device. Each device can
> have different page table formats and layout (e.g. different GPU
> generations in the same system) and the same memory may be mapped with
> different flags on different devices in order to get the right coherence
> behaviour. We also need to maintain per-device DMA mappings somewhere.
> That means, as far as the device page tables are concerned, we still
> have separate address spaces. SVM only adds a layer on top, which
> coordinates these separate device virtual address spaces so that some
> parts of them provide the appearance of a shared virtual address space.
> 

Yes exactly the same understanding.

> At some point you need to decide, where you draw the boundary between
> managing a per-process shared virtual address space and managing
> per-device virtual address spaces. In amdgpu that boundary is currently
> where kfd_svm code calls amdgpu_vm code to manage the per-device page
> tables.

Exactly; in the xe driver it is xe_svm and xe_vm, just with different names.
Zeng, Oak Jan. 25, 2024, 5:25 a.m. UTC | #15
Hi Dave,

Let me step back. When I wrote "shared virtual address space b/t cpu and all gpu devices is a hard requirement for our system allocator design", I meant this is not only Intel's design requirement; rather, it is a common requirement for Intel, AMD and Nvidia. Take a look at the cuda driver API definition of cuMemAllocManaged (search for this API on https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM); it says:

"The pointer is valid on the CPU and on all GPUs in the system that support managed memory."

This means the program virtual address space is shared b/t the CPU and all GPU devices on the system. The system allocator we are discussing is just one step beyond cuMemAllocManaged: it allows malloc'ed memory to be shared b/t the CPU and all GPU devices.
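As a minimal illustration of that promise (standard cuda driver API, error handling omitted; this is only an example of the concept, not part of the Xe series):

#include <cuda.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	CUdevice dev;
	CUcontext ctx;
	CUdeviceptr dptr;

	cuInit(0);
	cuDeviceGet(&dev, 0);
	cuCtxCreate(&ctx, 0, dev);

	/* One allocation, one pointer, valid on the CPU and on every GPU in
	 * the system that supports managed memory. */
	cuMemAllocManaged(&dptr, 4096, CU_MEM_ATTACH_GLOBAL);

	int *p = (int *)(uintptr_t)dptr;
	p[0] = 42;	/* plain CPU store, no explicit copy */

	/* The same dptr value would now be passed unchanged to kernels
	 * launched on card0, card1, etc.; no per-device allocation,
	 * vm_bind or dma-buf import/export is needed. */
	printf("p[0] = %d\n", p[0]);

	cuMemFree(dptr);
	cuCtxDestroy(ctx);
	return 0;
}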

I hope we all agree with this point.

With that, I agree with Christian that in kmd we should make the driver code per-device based instead of managing all devices in one driver instance. Our system allocator (and xekmd in general) design follows this rule: we make xe_vm per-device based - one device is *not* aware of another device's address space, as I explained in the previous email. I started this thread seeking a single drm_gpuvm instance to cover all GPU devices. I gave up this approach (at least for now) per Danilo and Christian's feedback: we will continue to have a per-device drm_gpuvm. I hope this is aligned with Christian, but I will have to wait for Christian's reply to my previous email.

I hope this clarifies things a little.

Regards,
Oak 

> -----Original Message-----
> From: dri-devel <dri-devel-bounces@lists.freedesktop.org> On Behalf Of David
> Airlie
> Sent: Wednesday, January 24, 2024 8:25 PM
> To: Zeng, Oak <oak.zeng@intel.com>
> Cc: Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>;
> Thomas.Hellstrom@linux.intel.com; Winiarski, Michal
> <michal.winiarski@intel.com>; Felix Kuehling <felix.kuehling@amd.com>; Welty,
> Brian <brian.welty@intel.com>; Shah, Ankur N <ankur.n.shah@intel.com>; dri-
> devel@lists.freedesktop.org; intel-xe@lists.freedesktop.org; Gupta, saurabhg
> <saurabhg.gupta@intel.com>; Danilo Krummrich <dakr@redhat.com>; Daniel
> Vetter <daniel@ffwll.ch>; Brost, Matthew <matthew.brost@intel.com>; Bommu,
> Krishnaiah <krishnaiah.bommu@intel.com>; Vishwanathapura, Niranjana
> <niranjana.vishwanathapura@intel.com>; Christian König
> <christian.koenig@amd.com>
> Subject: Re: Making drm_gpuvm work across gpu devices
> 
> >
> >
> > For us, Xekmd doesn't need to know it is running under bare metal or
> virtualized environment. Xekmd is always a guest driver. All the virtual address
> used in xekmd is guest virtual address. For SVM, we require all the VF devices
> share one single shared address space with guest CPU program. So all the design
> works in bare metal environment can automatically work under virtualized
> environment. +@Shah, Ankur N +@Winiarski, Michal to backup me if I am wrong.
> >
> >
> >
> > Again, shared virtual address space b/t cpu and all gpu devices is a hard
> requirement for our system allocator design (which means malloc’ed memory,
> cpu stack variables, globals can be directly used in gpu program. Same
> requirement as kfd SVM design). This was aligned with our user space software
> stack.
> 
> Just to make a very general point here (I'm hoping you listen to
> Christian a bit more and hoping he replies in more detail), but just
> because you have a system allocator design done, it doesn't in any way
> enforce the requirements on the kernel driver to accept that design.
> Bad system design should be pushed back on, not enforced in
> implementation stages. It's a trap Intel falls into regularly since
> they say well we already agreed this design with the userspace team
> and we can't change it now. This isn't acceptable. Design includes
> upstream discussion and feedback, if you say misdesigned the system
> allocator (and I'm not saying you definitely have), and this is
> pushing back on that, then you have to go fix your system
> architecture.
> 
> KFD was an experiment like this, I pushed back on AMD at the start
> saying it was likely a bad plan, we let it go and got a lot of
> experience in why it was a bad design.
> 
> Dave.
周春明(日月) Jan. 25, 2024, 11 a.m. UTC | #16
[snip]
Fd0 = open(card0)
Fd1 = open(card1)
Vm0 =xe_vm_create(fd0) //driver create process xe_svm on the process's first vm_create
Vm1 = xe_vm_create(fd1) //driver re-use xe_svm created above if called from same process
Queue0 = xe_exec_queue_create(fd0, vm0)
Queue1 = xe_exec_queue_create(fd1, vm1)
//check p2p capability calling L0 API….
ptr = malloc()//this replace bo_create, vm_bind, dma-import/export
Xe_exec(queue0, ptr)//submit gpu job which use ptr, on card0
Xe_exec(queue1, ptr)//submit gpu job which use ptr, on card1
//Gpu page fault handles memory allocation/migration/mapping to gpu
[snip]
Hi Oak,
From your sample code, you not only need a va-manager across gpu devices, but also across the cpu, right?
I think you need a UVA (unified va) manager in user space and reserve the range used by drm_gpuvm out of the cpu va space. That way, malloc's va and the gpu va live in the same space and will not conflict. Then, via the HMM mechanism, gpu devices can safely use the VAs passed from HMM.
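For illustration, reserving such a range on the CPU side can be done with a plain anonymous PROT_NONE mapping; the sketch below only shows the idea and is not an existing Xe or drm_gpuvm interface:

#include <stddef.h>
#include <stdio.h>
#include <sys/mman.h>

/* Reserve a large hole in the process address space.  malloc()/mmap()
 * will never hand out addresses inside it, so a user-space UVA manager
 * could sub-allocate gpu va from this range without conflicting with
 * cpu allocations. */
static void *reserve_uva_range(size_t size)
{
	void *base = mmap(NULL, size, PROT_NONE,
			  MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
	return base == MAP_FAILED ? NULL : base;
}

int main(void)
{
	void *base = reserve_uva_range((size_t)1 << 40);	/* 1 TiB */

	printf("reserved va range at %p\n", base);
	return 0;
}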
By the way, I'm not familiar with drm_gpuvm. Traditionally, gpu drivers often put the va-manager in user space, so I'm not sure what benefit we get from a drm_gpuvm invented in kernel space. Can anyone help explain more?
- Chunming
Zeng, Oak Jan. 25, 2024, 4:42 p.m. UTC | #17
Hi Christian,

I got a few more questions inline

From: Christian König <christian.koenig@amd.com>
Sent: Wednesday, January 24, 2024 3:33 AM
To: Zeng, Oak <oak.zeng@intel.com>; Danilo Krummrich <dakr@redhat.com>; Dave Airlie <airlied@redhat.com>; Daniel Vetter <daniel@ffwll.ch>; Felix Kuehling <felix.kuehling@amd.com>
Cc: Welty, Brian <brian.welty@intel.com>; dri-devel@lists.freedesktop.org; intel-xe@lists.freedesktop.org; Bommu, Krishnaiah <krishnaiah.bommu@intel.com>; Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>; Thomas.Hellstrom@linux.intel.com; Vishwanathapura, Niranjana <niranjana.vishwanathapura@intel.com>; Brost, Matthew <matthew.brost@intel.com>; Gupta, saurabhg <saurabhg.gupta@intel.com>
Subject: Re: Making drm_gpuvm work across gpu devices

Am 23.01.24 um 20:37 schrieb Zeng, Oak:

[SNIP]



Yes most API are per device based.



One exception I know is actually the kfd SVM API. If you look at the svm_ioctl function, it is per-process based. Each kfd_process represent a process across N gpu devices.

Yeah and that was a big mistake in my opinion. We should really not do that ever again.



Need to say, kfd SVM represent a shared virtual address space across CPU and all GPU devices on the system. This is by the definition of SVM (shared virtual memory). This is very different from our legacy gpu *device* driver which works for only one device (i.e., if you want one device to access another device's memory, you will have to use dma-buf export/import etc).

Exactly that thinking is what we have currently found as blocker for a virtualization projects. Having SVM as device independent feature which somehow ties to the process address space turned out to be an extremely bad idea.

The background is that this only works for some use cases but not all of them.

What's working much better is to just have a mirror functionality which says that a range A..B of the process address space is mapped into a range C..D of the GPU address space.

Those ranges can then be used to implement the SVM feature required for higher level APIs and not something you need at the UAPI or even inside the low level kernel memory management.


The whole purpose of the HMM design is to create a shared address space b/t the cpu and the gpu program. See here: https://www.kernel.org/doc/Documentation/vm/hmm.rst. Mapping a process address range A..B to a range C..D of the GPU address space is exactly what the HMM document refers to as a “split address space”.



When you talk about migrating memory to a device you also do this on a per device basis and *not* tied to the process address space. If you then get crappy performance because userspace gave contradicting information where to migrate memory then that's a bug in userspace and not something the kernel should try to prevent somehow.

[SNIP]


I think if you start using the same drm_gpuvm for multiple devices you

will sooner or later start to run into the same mess we have seen with

KFD, where we moved more and more functionality from the KFD to the DRM

render node because we found that a lot of the stuff simply doesn't work

correctly with a single object to maintain the state.



As I understand it, KFD is designed to work across devices. A single pseudo /dev/kfd device represent all hardware gpu devices. That is why during kfd open, many pdd (process device data) is created, each for one hardware device for this process.

Yes, I'm perfectly aware of that. And I can only repeat myself that I see this design as a rather extreme failure. And I think it's one of the reasons why NVidia is so dominant with Cuda.

This whole approach KFD takes was designed with the idea of extending the CPU process into the GPUs, but this idea only works for a few use cases and is not something we should apply to drivers in general.

A very good example are virtualization use cases where you end up with CPU address != GPU address because the VAs are actually coming from the guest VM and not the host process.


Are you talking about a general virtualization setup such as SRIOV or GPU device pass-through, or something else?

In a typical virtualization setup, a gpu driver such as xekmd or amdgpu is always a guest driver. In the xekmd case, xekmd doesn’t need to know it is operating in a virtualized environment, so the virtual addresses in the driver are guest virtual addresses. From the kmd driver’s perspective, there is no difference b/t bare metal and virtualized.

Are you talking about a special virtualized setup such as para-virtualization/VirGL? I need more background info to understand why you end up with CPU address != GPU address in SVM….


SVM is a high level concept of OpenCL, Cuda, ROCm etc.. This should not have any influence on the design of the kernel UAPI.


Maybe a terminology problem here. I agree with what you said above. We have also achieved the SVM concept with our BO-centric drivers such as i915 and xekmd.

But we are mainly talking about a system allocator here, i.e., using malloc’ed memory directly in a GPU program, and we want to leverage HMM. A system allocator can be used to implement the same SVM concept as OpenCL/Cuda/ROCm, but SVM can also be implemented with a BO-centric driver.


If you want to do something similar as KFD for Xe I think you need to get explicit permission to do this from Dave and Daniel and maybe even Linus.

If you look at my series https://lore.kernel.org/dri-devel/20231221043812.3783313-1-oak.zeng@intel.com/, I am not doing things similar to KFD.

Regards,
Oak


Regards,
Christian.
Zeng, Oak Jan. 25, 2024, 5 p.m. UTC | #18
Hi Chunming,


From: 周春明(日月) <riyue.zcm@alibaba-inc.com>
Sent: Thursday, January 25, 2024 6:01 AM
To: Zeng, Oak <oak.zeng@intel.com>; Christian König <christian.koenig@amd.com>; Danilo Krummrich <dakr@redhat.com>; Dave Airlie <airlied@redhat.com>; Daniel Vetter <daniel@ffwll.ch>; Felix Kuehling <felix.kuehling@amd.com>; Shah, Ankur N <ankur.n.shah@intel.com>; Winiarski, Michal <michal.winiarski@intel.com>
Cc: Brost, Matthew <matthew.brost@intel.com>; Thomas.Hellstrom@linux.intel.com; Welty, Brian <brian.welty@intel.com>; dri-devel@lists.freedesktop.org; Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>; Gupta, saurabhg <saurabhg.gupta@intel.com>; Bommu, Krishnaiah <krishnaiah.bommu@intel.com>; Vishwanathapura, Niranjana <niranjana.vishwanathapura@intel.com>; intel-xe@lists.freedesktop.org
Subject: Re: Making drm_gpuvm work across gpu devices

[snip]

Fd0 = open(card0)

Fd1 = open(card1)

Vm0 =xe_vm_create(fd0) //driver create process xe_svm on the process's first vm_create

Vm1 = xe_vm_create(fd1) //driver re-use xe_svm created above if called from same process

Queue0 = xe_exec_queue_create(fd0, vm0)

Queue1 = xe_exec_queue_create(fd1, vm1)

//check p2p capability calling L0 API….

ptr = malloc()//this replace bo_create, vm_bind, dma-import/export

Xe_exec(queue0, ptr)//submit gpu job which use ptr, on card0

Xe_exec(queue1, ptr)//submit gpu job which use ptr, on card1

//Gpu page fault handles memory allocation/migration/mapping to gpu
[snip]
Hi Oak,
From your sample code, you not only need va-manager cross gpu devices, but also cpu, right?

No. Per the feedback from Christian and Danilo, I am giving up the idea of making drm_gpuvm work across gpu devices. I might come back to it later, but for now it is not the plan anymore.

I think you need a UVA (unified va) manager in user space and make range of drm_gpuvm reserved from cpu va space. In that way, malloc's va and gpu va are in same space and will not conflict. And then via HMM mechanism, gpu devices can safely use VA passed from HMM.

Under HMM, the GPU and CPU are simply under the same address space: the same virtual address represents the same allocation for both the CPU and the GPUs. See the hmm doc here: https://www.kernel.org/doc/Documentation/vm/hmm.rst. The user space program doesn’t need to reserve any address range; all address ranges are managed by the linux kernel core mm. Today the GPU kmd driver only keeps some structure to store per-address-range memory attributes.
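For reference, the per-device mirroring loop described in that hmm.rst document looks roughly like the sketch below (simplified; driver_lock()/driver_unlock() are placeholders for whatever serializes the device page table updates):

/* Simplified from the mirroring example in Documentation/vm/hmm.rst.
 * Each device runs this for its own page tables. */
int mirror_range(struct mmu_interval_notifier *notifier, struct mm_struct *mm,
		 unsigned long start, unsigned long end, unsigned long *pfns)
{
	struct hmm_range range = {
		.notifier	= notifier,
		.start		= start,
		.end		= end,
		.hmm_pfns	= pfns,
		.default_flags	= HMM_PFN_REQ_FAULT,
	};
	int ret;

again:
	range.notifier_seq = mmu_interval_read_begin(notifier);
	mmap_read_lock(mm);
	ret = hmm_range_fault(&range);
	mmap_read_unlock(mm);
	if (ret) {
		if (ret == -EBUSY)
			goto again;
		return ret;
	}

	driver_lock();
	if (mmu_interval_read_retry(notifier, range.notifier_seq)) {
		driver_unlock();
		goto again;	/* CPU address space changed, refault */
	}
	/* ... write pfns[] into this device's page table ... */
	driver_unlock();
	return 0;
}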

Regards,
Oak

By the way, I'm not familiar with drm_gpuvm, traditionally, gpu driver often put va-manager in user space, not sure what's benefit we can get from drm_gpuvm invented in kernel space. Can anyone help explain more?

- Chunming
Felix Kuehling Jan. 25, 2024, 5:15 p.m. UTC | #19
On 2024-01-24 20:17, Zeng, Oak wrote:
>
> Hi Christian,
>
> Even though I mentioned KFD design, I didn’t mean to copy the KFD 
> design. I also had hard time to understand the difficulty of KFD under 
> virtualization environment.
>
The problem with virtualization is related to virtualization design 
choices. There is a single process that proxies requests from multiple 
processes in one (or more?) VMs to the GPU driver. That means, we need a 
single process with multiple contexts (and address spaces). One proxy 
process on the host must support multiple guest address spaces.

I don't know much more than these very high level requirements, and I 
only found out about those a few weeks ago. Due to my own bias I can't 
comment whether there are bad design choices in the proxy architecture 
or in KFD or both. The way we are considering fixing this, is to enable 
creating multiple KFD contexts in the same process. Each of those 
contexts will still represent a shared virtual address space across 
devices (but not the CPU). Because the device address space is not 
shared with the CPU, we cannot support our SVM API in this situation.

I still believe that it makes sense to have the kernel mode driver aware 
of a shared virtual address space at some level. A per-GPU API and an 
API that doesn't require matching CPU and GPU virtual addresses would 
enable more flexibility at the cost of duplicate information tracking for 
multiple devices and duplicate overhead for things like MMU notifiers 
and interval tree data structures. Having to coordinate multiple devices 
with potentially different address spaces would probably make it more 
awkward to implement memory migration. The added flexibility would go 
mostly unused, except in some very niche applications.

Regards,
   Felix


> For us, Xekmd doesn't need to know it is running under bare metal or 
> virtualized environment. Xekmd is always a guest driver. All the 
> virtual address used in xekmd is guest virtual address. For SVM, we 
> require all the VF devices share one single shared address space with 
> guest CPU program. So all the design works in bare metal environment 
> can automatically work under virtualized environment. +@Shah, Ankur N 
> <mailto:ankur.n.shah@intel.com> +@Winiarski, Michal 
> <mailto:michal.winiarski@intel.com> to backup me if I am wrong.
>
> Again, shared virtual address space b/t cpu and all gpu devices is a 
> hard requirement for our system allocator design (which means 
> malloc’ed memory, cpu stack variables, globals can be directly used in 
> gpu program. Same requirement as kfd SVM design). This was aligned 
> with our user space software stack.
>
> For anyone who want to implement system allocator, or SVM, this is a 
> hard requirement. I started this thread hoping I can leverage the 
> drm_gpuvm design to manage the shared virtual address space (as the 
> address range split/merge function was scary to me and I didn’t want 
> re-invent). I guess my takeaway from this you and Danilo is this 
> approach is a NAK. Thomas also mentioned to me drm_gpuvm is a overkill 
> for our svm address range split/merge. So I will make things work 
> first by manage address range xekmd internally. I can re-look 
> drm-gpuvm approach in the future.
>
> Maybe a pseudo user program can illustrate our programming model:
>
> Fd0 = open(card0)
>
> Fd1 = open(card1)
>
> Vm0 =xe_vm_create(fd0) //driver create process xe_svm on the process's 
> first vm_create
>
> Vm1 = xe_vm_create(fd1) //driver re-use xe_svm created above if called 
> from same process
>
> Queue0 = xe_exec_queue_create(fd0, vm0)
>
> Queue1 = xe_exec_queue_create(fd1, vm1)
>
> //check p2p capability calling L0 API….
>
> ptr = malloc()//this replace bo_create, vm_bind, dma-import/export
>
> Xe_exec(queue0, ptr)//submit gpu job which use ptr, on card0
>
> Xe_exec(queue1, ptr)//submit gpu job which use ptr, on card1
>
> //Gpu page fault handles memory allocation/migration/mapping to gpu
>
> As you can see, from above model, our design is a little bit different 
> than the KFD design. user need to explicitly create gpuvm (vm0 and vm1 
> above) for each gpu device. Driver internally have a xe_svm represent 
> the shared address space b/t cpu and multiple gpu devices. But end 
> user doesn’t see and no need to create xe_svm. The shared virtual 
> address space is really managed by linux core mm (through the vma 
> struct, mm_struct etc). From each gpu device’s perspective, it just 
> operate under its own gpuvm, not aware of the existence of other 
> gpuvm, even though in reality all those gpuvm shares a same virtual 
> address space.
>
> See one more comment inline
>
> *From:*Christian König <christian.koenig@amd.com>
> *Sent:* Wednesday, January 24, 2024 3:33 AM
> *To:* Zeng, Oak <oak.zeng@intel.com>; Danilo Krummrich 
> <dakr@redhat.com>; Dave Airlie <airlied@redhat.com>; Daniel Vetter 
> <daniel@ffwll.ch>; Felix Kuehling <felix.kuehling@amd.com>
> *Cc:* Welty, Brian <brian.welty@intel.com>; 
> dri-devel@lists.freedesktop.org; intel-xe@lists.freedesktop.org; 
> Bommu, Krishnaiah <krishnaiah.bommu@intel.com>; Ghimiray, Himal Prasad 
> <himal.prasad.ghimiray@intel.com>; Thomas.Hellstrom@linux.intel.com; 
> Vishwanathapura, Niranjana <niranjana.vishwanathapura@intel.com>; 
> Brost, Matthew <matthew.brost@intel.com>; Gupta, saurabhg 
> <saurabhg.gupta@intel.com>
> *Subject:* Re: Making drm_gpuvm work across gpu devices
>
> Am 23.01.24 um 20:37 schrieb Zeng, Oak:
>
>     [SNIP]
>
>     Yes most API are per device based.
>
>     One exception I know is actually the kfd SVM API. If you look at the svm_ioctl function, it is per-process based. Each kfd_process represent a process across N gpu devices.
>
>
> Yeah and that was a big mistake in my opinion. We should really not do 
> that ever again.
>
>
>     Need to say, kfd SVM represent a shared virtual address space across CPU and all GPU devices on the system. This is by the definition of SVM (shared virtual memory). This is very different from our legacy gpu *device* driver which works for only one device (i.e., if you want one device to access another device's memory, you will have to use dma-buf export/import etc).
>
>
> Exactly that thinking is what we have currently found as blocker for a 
> virtualization projects. Having SVM as device independent feature 
> which somehow ties to the process address space turned out to be an 
> extremely bad idea.
>
> The background is that this only works for some use cases but not all 
> of them.
>
> What's working much better is to just have a mirror functionality 
> which says that a range A..B of the process address space is mapped 
> into a range C..D of the GPU address space.
>
> Those ranges can then be used to implement the SVM feature required 
> for higher level APIs and not something you need at the UAPI or even 
> inside the low level kernel memory management.
>
> When you talk about migrating memory to a device you also do this on a 
> per device basis and *not* tied to the process address space. If you 
> then get crappy performance because userspace gave contradicting 
> information where to migrate memory then that's a bug in userspace and 
> not something the kernel should try to prevent somehow.
>
> [SNIP]
>
>         I think if you start using the same drm_gpuvm for multiple devices you
>
>         will sooner or later start to run into the same mess we have seen with
>
>         KFD, where we moved more and more functionality from the KFD to the DRM
>
>         render node because we found that a lot of the stuff simply doesn't work
>
>         correctly with a single object to maintain the state.
>
>     As I understand it, KFD is designed to work across devices. A single pseudo /dev/kfd device represent all hardware gpu devices. That is why during kfd open, many pdd (process device data) is created, each for one hardware device for this process.
>
>
> Yes, I'm perfectly aware of that. And I can only repeat myself that I 
> see this design as a rather extreme failure. And I think it's one of 
> the reasons why NVidia is so dominant with Cuda.
>
> This whole approach KFD takes was designed with the idea of extending 
> the CPU process into the GPUs, but this idea only works for a few use 
> cases and is not something we should apply to drivers in general.
>
> A very good example are virtualization use cases where you end up with 
> CPU address != GPU address because the VAs are actually coming from 
> the guest VM and not the host process.
>
> I don’t get the problem here. For us, under virtualization, both the 
> cpu address and gpu virtual address operated in xekmd is guest virtual 
> address. They can still share the same virtual address space (as SVM 
> required)
>
> Oak
>
>
>
> SVM is a high level concept of OpenCL, Cuda, ROCm etc.. This should 
> not have any influence on the design of the kernel UAPI.
>
> If you want to do something similar as KFD for Xe I think you need to 
> get explicit permission to do this from Dave and Daniel and maybe even 
> Linus.
>
> Regards,
> Christian.
>
Daniel Vetter Jan. 25, 2024, 6:32 p.m. UTC | #20
On Wed, Jan 24, 2024 at 09:33:12AM +0100, Christian König wrote:
> Am 23.01.24 um 20:37 schrieb Zeng, Oak:
> > [SNIP]
> > Yes most API are per device based.
> > 
> > One exception I know is actually the kfd SVM API. If you look at the svm_ioctl function, it is per-process based. Each kfd_process represent a process across N gpu devices.
> 
> Yeah and that was a big mistake in my opinion. We should really not do that
> ever again.
> 
> > Need to say, kfd SVM represent a shared virtual address space across CPU and all GPU devices on the system. This is by the definition of SVM (shared virtual memory). This is very different from our legacy gpu *device* driver which works for only one device (i.e., if you want one device to access another device's memory, you will have to use dma-buf export/import etc).
> 
> Exactly that thinking is what we have currently found as blocker for a
> virtualization projects. Having SVM as device independent feature which
> somehow ties to the process address space turned out to be an extremely bad
> idea.
> 
> The background is that this only works for some use cases but not all of
> them.
> 
> What's working much better is to just have a mirror functionality which says
> that a range A..B of the process address space is mapped into a range C..D
> of the GPU address space.
> 
> Those ranges can then be used to implement the SVM feature required for
> higher level APIs and not something you need at the UAPI or even inside the
> low level kernel memory management.
> 
> When you talk about migrating memory to a device you also do this on a per
> device basis and *not* tied to the process address space. If you then get
> crappy performance because userspace gave contradicting information where to
> migrate memory then that's a bug in userspace and not something the kernel
> should try to prevent somehow.
> 
> [SNIP]
> > > I think if you start using the same drm_gpuvm for multiple devices you
> > > will sooner or later start to run into the same mess we have seen with
> > > KFD, where we moved more and more functionality from the KFD to the DRM
> > > render node because we found that a lot of the stuff simply doesn't work
> > > correctly with a single object to maintain the state.
> > As I understand it, KFD is designed to work across devices. A single pseudo /dev/kfd device represent all hardware gpu devices. That is why during kfd open, many pdd (process device data) is created, each for one hardware device for this process.
> 
> Yes, I'm perfectly aware of that. And I can only repeat myself that I see
> this design as a rather extreme failure. And I think it's one of the reasons
> why NVidia is so dominant with Cuda.
> 
> This whole approach KFD takes was designed with the idea of extending the
> CPU process into the GPUs, but this idea only works for a few use cases and
> is not something we should apply to drivers in general.
> 
> A very good example are virtualization use cases where you end up with CPU
> address != GPU address because the VAs are actually coming from the guest VM
> and not the host process.
> 
> SVM is a high level concept of OpenCL, Cuda, ROCm etc.. This should not have
> any influence on the design of the kernel UAPI.
> 
> If you want to do something similar as KFD for Xe I think you need to get
> explicit permission to do this from Dave and Daniel and maybe even Linus.

I think the one and only one exception where an SVM uapi like in kfd makes
sense, is if the _hardware_ itself, not the software stack defined
semantics that you've happened to build on top of that hw, enforces a 1:1
mapping with the cpu process address space.

Which means your hardware is using PASID, IOMMU based translation, PCI-ATS
(address translation services) or whatever your hw calls it and has _no_
device-side pagetables on top. Which from what I've seen all devices with
device-memory have, simply because they need some place to store whether
that memory is currently in device memory or should be translated using
PASID. Currently there's no gpu that works with PASID only, but there are
some on-cpu-die accelerator things that do work like that.

Maybe in the future there will be some accelerators that are fully cpu
cache coherent (including atomics) with something like CXL, and the
on-device memory is managed as normal system memory with struct page as
ZONE_DEVICE and accelerator va -> physical address translation is only
done with PASID ... but for now I haven't seen that, definitely not in
upstream drivers.

And the moment you have some per-device pagetables or per-device memory
management of some sort (like using gpuva mgr) then I'm 100% agreeing with
Christian that the kfd SVM model is too strict and not a great idea.

Cheers, Sima
Zeng, Oak Jan. 25, 2024, 6:37 p.m. UTC | #21
> -----Original Message-----
> From: Felix Kuehling <felix.kuehling@amd.com>
> Sent: Thursday, January 25, 2024 12:16 PM
> To: Zeng, Oak <oak.zeng@intel.com>; Christian König
> <christian.koenig@amd.com>; Danilo Krummrich <dakr@redhat.com>; Dave
> Airlie <airlied@redhat.com>; Daniel Vetter <daniel@ffwll.ch>; Shah, Ankur N
> <ankur.n.shah@intel.com>; Winiarski, Michal <michal.winiarski@intel.com>
> Cc: Welty, Brian <brian.welty@intel.com>; dri-devel@lists.freedesktop.org; intel-
> xe@lists.freedesktop.org; Bommu, Krishnaiah <krishnaiah.bommu@intel.com>;
> Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>;
> Thomas.Hellstrom@linux.intel.com; Vishwanathapura, Niranjana
> <niranjana.vishwanathapura@intel.com>; Brost, Matthew
> <matthew.brost@intel.com>; Gupta, saurabhg <saurabhg.gupta@intel.com>
> Subject: Re: Making drm_gpuvm work across gpu devices
> 
> 
> On 2024-01-24 20:17, Zeng, Oak wrote:
> >
> > Hi Christian,
> >
> > Even though I mentioned KFD design, I didn’t mean to copy the KFD
> > design. I also had hard time to understand the difficulty of KFD under
> > virtualization environment.
> >
> The problem with virtualization is related to virtualization design
> choices. There is a single process that proxies requests from multiple
> processes in one (or more?) VMs to the GPU driver. That means, we need a
> single process with multiple contexts (and address spaces). One proxy
> process on the host must support multiple guest address spaces.

My first response is: why can't processes in the virtual machine open the /dev/kfd device themselves?

Also try to picture why the base amdgpu driver (which is per hardware device) doesn't have this problem... creating multiple contexts under a single amdgpu device, each context servicing one guest process?
> 
> I don't know much more than these very high level requirements, and I
> only found out about those a few weeks ago. Due to my own bias I can't
> comment whether there are bad design choices in the proxy architecture
> or in KFD or both. The way we are considering fixing this, is to enable
> creating multiple KFD contexts in the same process. Each of those
> contexts will still represent a shared virtual address space across
> devices (but not the CPU). Because the device address space is not
> shared with the CPU, we cannot support our SVM API in this situation.
> 

One kfd process, multiple contexts, each context has a shared address space across devices.... I do see some complications 
Zeng, Oak Jan. 25, 2024, 9:02 p.m. UTC | #22
> -----Original Message-----
> From: Daniel Vetter <daniel@ffwll.ch>
> Sent: Thursday, January 25, 2024 1:33 PM
> To: Christian König <christian.koenig@amd.com>
> Cc: Zeng, Oak <oak.zeng@intel.com>; Danilo Krummrich <dakr@redhat.com>;
> Dave Airlie <airlied@redhat.com>; Daniel Vetter <daniel@ffwll.ch>; Felix
> Kuehling <felix.kuehling@amd.com>; Welty, Brian <brian.welty@intel.com>; dri-
> devel@lists.freedesktop.org; intel-xe@lists.freedesktop.org; Bommu, Krishnaiah
> <krishnaiah.bommu@intel.com>; Ghimiray, Himal Prasad
> <himal.prasad.ghimiray@intel.com>; Thomas.Hellstrom@linux.intel.com;
> Vishwanathapura, Niranjana <niranjana.vishwanathapura@intel.com>; Brost,
> Matthew <matthew.brost@intel.com>; Gupta, saurabhg
> <saurabhg.gupta@intel.com>
> Subject: Re: Making drm_gpuvm work across gpu devices
> 
> On Wed, Jan 24, 2024 at 09:33:12AM +0100, Christian König wrote:
> > Am 23.01.24 um 20:37 schrieb Zeng, Oak:
> > > [SNIP]
> > > Yes most API are per device based.
> > >
> > > One exception I know is actually the kfd SVM API. If you look at the svm_ioctl
> function, it is per-process based. Each kfd_process represent a process across N
> gpu devices.
> >
> > Yeah and that was a big mistake in my opinion. We should really not do that
> > ever again.
> >
> > > Need to say, kfd SVM represent a shared virtual address space across CPU
> and all GPU devices on the system. This is by the definition of SVM (shared virtual
> memory). This is very different from our legacy gpu *device* driver which works
> for only one device (i.e., if you want one device to access another device's
> memory, you will have to use dma-buf export/import etc).
> >
> > Exactly that thinking is what we have currently found as blocker for a
> > virtualization projects. Having SVM as device independent feature which
> > somehow ties to the process address space turned out to be an extremely bad
> > idea.
> >
> > The background is that this only works for some use cases but not all of
> > them.
> >
> > What's working much better is to just have a mirror functionality which says
> > that a range A..B of the process address space is mapped into a range C..D
> > of the GPU address space.
> >
> > Those ranges can then be used to implement the SVM feature required for
> > higher level APIs and not something you need at the UAPI or even inside the
> > low level kernel memory management.
> >
> > When you talk about migrating memory to a device you also do this on a per
> > device basis and *not* tied to the process address space. If you then get
> > crappy performance because userspace gave contradicting information where
> to
> > migrate memory then that's a bug in userspace and not something the kernel
> > should try to prevent somehow.
> >
> > [SNIP]
> > > > I think if you start using the same drm_gpuvm for multiple devices you
> > > > will sooner or later start to run into the same mess we have seen with
> > > > KFD, where we moved more and more functionality from the KFD to the
> DRM
> > > > render node because we found that a lot of the stuff simply doesn't work
> > > > correctly with a single object to maintain the state.
> > > As I understand it, KFD is designed to work across devices. A single pseudo
> /dev/kfd device represent all hardware gpu devices. That is why during kfd open,
> many pdd (process device data) is created, each for one hardware device for this
> process.
> >
> > Yes, I'm perfectly aware of that. And I can only repeat myself that I see
> > this design as a rather extreme failure. And I think it's one of the reasons
> > why NVidia is so dominant with Cuda.
> >
> > This whole approach KFD takes was designed with the idea of extending the
> > CPU process into the GPUs, but this idea only works for a few use cases and
> > is not something we should apply to drivers in general.
> >
> > A very good example are virtualization use cases where you end up with CPU
> > address != GPU address because the VAs are actually coming from the guest
> VM
> > and not the host process.
> >
> > SVM is a high level concept of OpenCL, Cuda, ROCm etc.. This should not have
> > any influence on the design of the kernel UAPI.
> >
> > If you want to do something similar as KFD for Xe I think you need to get
> > explicit permission to do this from Dave and Daniel and maybe even Linus.
> 
> I think the one and only one exception where an SVM uapi like in kfd makes
> sense, is if the _hardware_ itself, not the software stack defined
> semantics that you've happened to build on top of that hw, enforces a 1:1
> mapping with the cpu process address space.
> 
> Which means your hardware is using PASID, IOMMU based translation, PCI-ATS
> (address translation services) or whatever your hw calls it and has _no_
> device-side pagetables on top. Which from what I've seen all devices with
> device-memory have, simply because they need some place to store whether
> that memory is currently in device memory or should be translated using
> PASID. Currently there's no gpu that works with PASID only, but there are
> some on-cpu-die accelerator things that do work like that.
> 
> Maybe in the future there will be some accelerators that are fully cpu
> cache coherent (including atomics) with something like CXL, and the
> on-device memory is managed as normal system memory with struct page as
> ZONE_DEVICE and accelerator va -> physical address translation is only
> done with PASID ... but for now I haven't seen that, definitely not in
> upstream drivers.
> 
> And the moment you have some per-device pagetables or per-device memory
> management of some sort (like using gpuva mgr) then I'm 100% agreeing with
> Christian that the kfd SVM model is too strict and not a great idea.
> 


GPU is nothing more than a piece of HW to accelerate part of a program, just like an extra CPU core. From this perspective, a unified virtual address space across the CPU and all GPU devices (and any other accelerators) is always more convenient to program than a split address space between devices.

In reality, GPU programming started from a split address space. HMM is designed to provide a unified virtual address space without the advanced hardware features you listed above.

I am aware that Nvidia's newer hardware platforms such as Grace Hopper natively support the Unified Memory programming model through hardware-based memory coherence among all CPUs and GPUs. For such systems, HMM is not required.

You can think of HMM as a software-based solution to provide a unified address space between the CPU and devices. Both AMD and Nvidia have been providing a unified address space through HMM. I think it is still valuable.
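To make that concrete, here is a rough user-space sketch of what the system-allocator model means for the application (not from this series; submit_gpu_job() is a hypothetical placeholder, not a real API): the program only calls malloc() and hands the same pointer to the GPU, with the kernel handling faults and migration.

/*
 * Rough sketch only, to illustrate the system-allocator model described
 * above. submit_gpu_job() is a hypothetical placeholder, not a real API.
 */
#include <stdlib.h>
#include <string.h>

void submit_gpu_job(void *ptr, size_t size);	/* hypothetical UMD call */

int main(void)
{
	size_t size = 1 << 20;

	/*
	 * With a system allocator (HMM-style SVM), plain malloc'ed memory
	 * is directly valid on the GPU: the same pointer is used by the
	 * CPU and the GPU, and faults/migration are handled by the KMD.
	 */
	char *buf = malloc(size);
	if (!buf)
		return 1;
	memset(buf, 1, size);		/* CPU writes */
	submit_gpu_job(buf, size);	/* GPU consumes the same pointer */

	free(buf);
	return 0;
}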

Regards,
Oak  



> Cheers, Sima
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
Danilo Krummrich Jan. 25, 2024, 10:13 p.m. UTC | #23
On 1/24/24 04:57, Zeng, Oak wrote:
> Thanks a lot Danilo.
> 
> Maybe I wasn't clear enough. In the solution I proposed, each device still has separate vm/page tables. Each device still needs to manage the mappings, page table flags, etc. It is just that in the svm use case, all devices share one drm_gpuvm instance. As I understand it, drm_gpuvm's main function is the va range split and merging. I don't see why it doesn't work across gpu devices.

I'm pretty sure it does work. You can indeed use GPUVM for tracking mappings using
the split and merge feature only, ignoring all other features it provides. However,
I don't think it's a good idea to have a single GPUVM instance to track the memory
mappings of different devices with different page tables, different object life times,
etc.

> 
> But I read more about drm_gpuvm. Its split merge function takes a drm_gem_object parameter, see drm_gpuvm_sm_map_ops_create and drm_gpuvm_sm_map. Actually the whole drm_gpuvm is designed for BO-centric driver, for example, it has a drm_gpuvm_bo concept to keep track of the 1BO:Ngpuva mapping. The whole purpose of leveraging drm_gpuvm is to re-use the va split/merge functions for SVM. But in our SVM implementation, there is no buffer object at all. So I don't think our SVM codes can leverage drm_gpuvm.

That's all optional features. As mentioned above, you can use GPUVM for tracking mappings
using the split and merge feature only. The drm_gem_object parameter in
drm_gpuvm_sm_map_ops_create() can simply be NULL. Afaik, Xe already does that for userptr
stuff. But again, I don't think it's a good idea to track memory mappings of
multiple independent physical devices and driver instances in a single place,
whether you use GPUVM or a custom implementation.
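As a rough sketch of what using only the split/merge part with a NULL GEM object could look like (based on the drm_gpuvm interfaces around v6.7, not part of this series; exact signatures may differ):

/*
 * Sketch only: using drm_gpuvm purely for VA split/merge tracking with
 * no GEM object backing, as described above. Error handling trimmed.
 */
#include <drm/drm_gpuvm.h>

static int svm_track_mapping(struct drm_gpuvm *gpuvm, u64 addr, u64 range)
{
	struct drm_gpuva_ops *ops;
	struct drm_gpuva_op *op;

	/* Compute the map/remap/unmap steps for [addr, addr + range). */
	ops = drm_gpuvm_sm_map_ops_create(gpuvm, addr, range,
					  NULL /* no GEM object */, 0);
	if (IS_ERR(ops))
		return PTR_ERR(ops);

	drm_gpuva_for_each_op(op, ops) {
		switch (op->op) {
		case DRM_GPUVA_OP_MAP:
			/* driver inserts a new gpuva and programs PTEs */
			break;
		case DRM_GPUVA_OP_REMAP:
			/* driver splits an existing gpuva */
			break;
		case DRM_GPUVA_OP_UNMAP:
			/* driver removes the old gpuva */
			break;
		default:
			break;
		}
	}

	drm_gpuva_ops_free(gpuvm, ops);
	return 0;
}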

- Danilo

> 
> I will give up this approach, unless Matt or Brian can see a way.
> 
> A few replies inline.... @Welty, Brian I had more thoughts inline to one of your original question....
> 
>> -----Original Message-----
>> From: Danilo Krummrich <dakr@redhat.com>
>> Sent: Tuesday, January 23, 2024 6:57 PM
>> To: Zeng, Oak <oak.zeng@intel.com>; Christian König
>> <christian.koenig@amd.com>; Dave Airlie <airlied@redhat.com>; Daniel Vetter
>> <daniel@ffwll.ch>; Felix Kuehling <felix.kuehling@amd.com>
>> Cc: Welty, Brian <brian.welty@intel.com>; dri-devel@lists.freedesktop.org; intel-
>> xe@lists.freedesktop.org; Bommu, Krishnaiah <krishnaiah.bommu@intel.com>;
>> Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>;
>> Thomas.Hellstrom@linux.intel.com; Vishwanathapura, Niranjana
>> <niranjana.vishwanathapura@intel.com>; Brost, Matthew
>> <matthew.brost@intel.com>; Gupta, saurabhg <saurabhg.gupta@intel.com>
>> Subject: Re: Making drm_gpuvm work across gpu devices
>>
>> Hi Oak,
>>
>> On 1/23/24 20:37, Zeng, Oak wrote:
>>> Thanks Christian. I have some comment inline below.
>>>
>>> Danilo, can you also take a look and give your feedback? Thanks.
>>
>> I agree with everything Christian already wrote. Except for the KFD parts, which
>> I'm simply not familiar with, I had exactly the same thoughts after reading your
>> initial mail.
>>
>> Please find some more comments below.
>>
>>>
>>>> -----Original Message-----
>>>> From: Christian König <christian.koenig@amd.com>
>>>> Sent: Tuesday, January 23, 2024 6:13 AM
>>>> To: Zeng, Oak <oak.zeng@intel.com>; Danilo Krummrich <dakr@redhat.com>;
>>>> Dave Airlie <airlied@redhat.com>; Daniel Vetter <daniel@ffwll.ch>
>>>> Cc: Welty, Brian <brian.welty@intel.com>; dri-devel@lists.freedesktop.org;
>> intel-
>>>> xe@lists.freedesktop.org; Bommu, Krishnaiah
>> <krishnaiah.bommu@intel.com>;
>>>> Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>;
>>>> Thomas.Hellstrom@linux.intel.com; Vishwanathapura, Niranjana
>>>> <niranjana.vishwanathapura@intel.com>; Brost, Matthew
>>>> <matthew.brost@intel.com>
>>>> Subject: Re: Making drm_gpuvm work across gpu devices
>>>>
>>>> Hi Oak,
>>>>
>>>> Am 23.01.24 um 04:21 schrieb Zeng, Oak:
>>>>> Hi Danilo and all,
>>>>>
>>>>> During the work of Intel's SVM code, we came up the idea of making
>>>> drm_gpuvm to work across multiple gpu devices. See some discussion here:
>>>> https://lore.kernel.org/dri-
>>>>
>> devel/PH7PR11MB70049E7E6A2F40BF6282ECC292742@PH7PR11MB7004.namprd
>>>> 11.prod.outlook.com/
>>>>>
>>>>> The reason we try to do this is, for a SVM (shared virtual memory across cpu
>>>> program and all gpu program on all gpu devices) process, the address space
>> has
>>>> to be across all gpu devices. So if we make drm_gpuvm to work across devices,
>>>> then our SVM code can leverage drm_gpuvm as well.
>>>>>
>>>>> At a first look, it seems feasible because drm_gpuvm doesn't really use the
>>>> drm_device *drm pointer a lot. This param is used only for printing/warning.
>> So I
>>>> think maybe we can delete this drm field from drm_gpuvm.
>>>>>
>>>>> This way, on a multiple gpu device system, for one process, we can have only
>>>> one drm_gpuvm instance, instead of multiple drm_gpuvm instances (one for
>>>> each gpu device).
>>>>>
>>>>> What do you think?
>>>>
>>>> Well from the GPUVM side I don't think it would make much difference if
>>>> we have the drm device or not.
>>>>
>>>> But the experience we had with the KFD I think I should mention that we
>>>> should absolutely *not* deal with multiple devices at the same time in
>>>> the UAPI or VM objects inside the driver.
>>>>
>>>> The background is that all the APIs inside the Linux kernel are build
>>>> around the idea that they work with only one device at a time. This
>>>> accounts for both low level APIs like the DMA API as well as pretty high
>>>> level things like for example file system address space etc...
>>>
>>> Yes most API are per device based.
>>>
>>> One exception I know is actually the kfd SVM API. If you look at the svm_ioctl
>> function, it is per-process based. Each kfd_process represent a process across N
>> gpu devices. Cc Felix.
>>>
>>> Need to say, kfd SVM represent a shared virtual address space across CPU and
>> all GPU devices on the system. This is by the definition of SVM (shared virtual
>> memory). This is very different from our legacy gpu *device* driver which works
>> for only one device (i.e., if you want one device to access another device's
>> memory, you will have to use dma-buf export/import etc).
>>>
>>> We have the same design requirement of SVM. For anyone who want to
>> implement the SVM concept, this is a hard requirement. Since now drm has the
>> drm_gpuvm concept which strictly speaking is designed for one device, I want to
>> see whether we can extend drm_gpuvm to make it work for both single device
>> (as used in xe) and multipe devices (will be used in the SVM code). That is why I
>> brought up this topic.
>>>
>>>>
>>>> So when you have multiple GPUs you either have an inseparable cluster of
>>>> them which case you would also only have one drm_device. Or you have
>>>> separated drm_device which also results in separate drm render nodes and
>>>> separate virtual address spaces and also eventually separate IOMMU
>>>> domains which gives you separate dma_addresses for the same page and so
>>>> separate GPUVM page tables....
>>>
>>> I am thinking we can still make each device has its separate drm_device/render
>> node/iommu domains/gpu page table. Just as what we have today. I am not plan
>> to change this picture.
>>>
>>> But the virtual address space will support two modes of operation:
>>> 1. one drm_gpuvm per device. This is when svm is not in the picture
>>> 2. all devices in the process share one single drm_gpuvm, when svm is in the
>> picture. In xe driver design, we have to support a mixture use of legacy mode
>> (such as gem_create and vm_bind) and svm (such as malloc'ed memory for gpu
>> submission). So whenever SVM is in the picture, we want one single process
>> address space across all devices. Drm_gpuvm doesn't need to be aware of those
>> two operation modes. It is driver's responsibility to use different mode.
>>>
>>> For example, in mode #1, a driver's vm structure (such as xe_vm) can inherit
>> from drm_gpuvm. In mode #2, a driver's svm structure (xe_svm in this series:
>> https://lore.kernel.org/dri-devel/20240117221223.18540-1-oak.zeng@intel.com/)
>> can inherit from drm_gpuvm while each xe_vm (still a per-device based struct)
>> will just have a pointer to the drm_gpuvm structure. This way when svm is in play,
>> we build a 1 process:1 mm_struct:1 xe_svm:N xe_vm correlations which means
>> shared address space across gpu devices.
>>
>> With a shared GPUVM structure, how do you track actual per device resources
>> such as
>> page tables? You also need to consider that the page table layout, memory
>> mapping
>> flags may vary from device to device due to different GPU chipsets or revisions.
> 
> The per device page table, flags etc are still managed per-device based, which is the xe_vm in the xekmd driver.
> 
>>
>> Also, if you replace the shared GPUVM structure with a pointer to a shared one,
>> you may run into all kinds of difficulties due to increasing complexity in terms
>> of locking, synchronization, lifetime and potential unwind operations in error
>> paths.
>> I haven't thought it through yet, but I wouldn't be surprised entirely if there are
>> cases where you simply run into circular dependencies.
> 
> Make sense, I can't see through this without a prove of concept code either.
> 
>>
>> Also, looking at the conversation in the linked patch series:
>>
>> <snip>
>>
>>>> For example as hmm_range_fault brings a range from host into GPU address
>>>> space,  what if it was already allocated and in use by VM_BIND for
>>>> a GEM_CREATE allocated buffer?    That is of course application error,
>>>> but KMD needs to detect it, and provide one single managed address
>>>> space across all allocations from the application....
>>
>>> This is very good question. Yes agree we should check this application error.
>> Fortunately this is doable. All vm_bind virtual address range are tracked in
>> xe_vm/drm_gpuvm struct. In this case, we should iterate the drm_gpuvm's rb
>> tree of *all* gpu devices (as xe_vm is for one device only) to see whether there
>> is a conflict. Will make the change soon.
>>
>> <snip>
>>
>> How do you do that if xe_vm->gpuvm is just a pointer to the GPUVM structure
>> within xe_svm?
> 
> In the proposed approach, we have a single drm_gpuvm instance for one process. All device's xe_vm pointing to this drm_gpuvm instance. This drm_gpuvm's rb tree maintains all the va range we have in this process. We can just walk this rb tree to see if there is a conflict.
> 
> But I didn't answer Brian's question completely... In a mixed use of vm_bind and malloc/mmap, the virtual address used by vm_bind should first be reserved in user space using mmap. So all valid virtual address should be tracked by linux kernel vma_struct.
> 
> Both vm_bind and malloc'ed virtual address can cause a gpu page fault. Our fault handler should first see whether this is a vm_bind va and service the fault accordingly; if not, then serve the fault in the SVM path; if SVM path also failed, it is an invalid address. So from user perspective, user can use:
> Ptr = mmap()
> Vm_bind(ptr, bo)
> Submit gpu kernel using ptr
> Or:
> Ptr = mmap()
> Submit gpu kernel using ptr
> Whether vm_bind is called or not decides the gpu fault handler code path. Hopefully this answers @Welty, Brian's original question
> 
> 
>>
>>>
>>> This requires some changes of drm_gpuvm design:
>>> 1. The drm_device *drm pointer, in mode #2 operation, this can be NULL,
>> means this drm_gpuvm is not for specific gpu device
>>> 2. The common dma_resv object: drm_gem_object *r_obj. *Does one
>> dma_resv object allocated/initialized for one device work for all devices*? From
>> first look, dma_resv is just some CPU structure maintaining dma-fences. So I
>> guess it should work for all devices? I definitely need you to comment.
>>
>> The general rule is that drivers can share the common dma_resv across GEM
>> objects that
>> are only mapped within the VM owning the dma_resv, but never within another
>> VM.
>>
>> Now, your question is whether multiple VMs can share the same common
>> dma_resv. I think
>> that calls for trouble, since it would create dependencies that simply aren't
>> needed
>> and might even introduce locking issues.
>>
>> However, that's optional, you can simply decide to not make use of the common
>> dma_resv
>> and all the optimizations based on it.
> 
> Ok, got it.
>>
>>>
>>>
>>>>
>>>> It's up to you how to implement it, but I think it's pretty clear that
>>>> you need separate drm_gpuvm objects to manage those.
>>>
>>> As explained above, I am thinking of one drm_gpuvm object across all devices
>> when SVM is in the picture...
>>>
>>>>
>>>> That you map the same thing in all those virtual address spaces at the
>>>> same address is a completely different optimization problem I think.
>>>
>>> Not sure I follow here... the requirement from SVM is, one virtual address
>> points to same physical backing store. For example, whenever CPU or any GPU
>> device access this virtual address, it refers to the same physical content. Of
>> course the physical backing store can be migrated b/t host memory and any of
>> the GPU's device memory, but the content should be consistent.
>>
>> Technically, multiple different GPUs will have separate virtual address spaces, it's
>> just that you create mappings within all of them such that the same virtual
>> address
>> resolves to the same physical content on all of them.
>>
>> So, having a single GPUVM instance representing all of them might give the
>> illusion of
>> a single unified address space, but you still need to maintain each device's
>> address
>> space backing resources, such as page tables, separately.
> 
> Yes agreed.
> 
> Regards,
> Oak
>>
>> - Danilo
>>
>>>
>>> So we are mapping same physical content to the same virtual address in either
>> cpu page table or any gpu device's page table...
>>>
>>>> What we could certainly do is to optimize hmm_range_fault by making
>>>> hmm_range a reference counted object and using it for multiple devices
>>>> at the same time if those devices request the same range of an mm_struct.
>>>>
>>>
>>> Not very follow. If you are trying to resolve a multiple devices concurrent access
>> problem, I think we should serialize concurrent device fault to one address range.
>> The reason is, during device fault handling, we might migrate the backing store so
>> hmm_range->hmm_pfns[] might have changed after one device access it.
>>>
>>>> I think if you start using the same drm_gpuvm for multiple devices you
>>>> will sooner or later start to run into the same mess we have seen with
>>>> KFD, where we moved more and more functionality from the KFD to the DRM
>>>> render node because we found that a lot of the stuff simply doesn't work
>>>> correctly with a single object to maintain the state.
>>>
>>> As I understand it, KFD is designed to work across devices. A single pseudo
>> /dev/kfd device represent all hardware gpu devices. That is why during kfd open,
>> many pdd (process device data) is created, each for one hardware device for this
>> process. Yes the codes are a little complicated.
>>>
>>> Kfd manages the shared virtual address space in the kfd driver codes, like the
>> split, merging etc. Here I am looking whether we can leverage the drm_gpuvm
>> code for those functions.
>>>
>>> As of the shared virtual address space across gpu devices, it is a hard
>> requirement for svm/system allocator (aka malloc for gpu program). We need to
>> make it work either at driver level or drm_gpuvm level. Drm_gpuvm is better
>> because the work can be shared b/t drivers.
>>>
>>> Thanks a lot,
>>> Oak
>>>
>>>>
>>>> Just one more point to your original discussion on the xe list: I think
>>>> it's perfectly valid for an application to map something at the same
>>>> address you already have something else.
>>>>
>>>> Cheers,
>>>> Christian.
>>>>
>>>>>
>>>>> Thanks,
>>>>> Oak
>>>
>
Thomas Hellstrom Jan. 26, 2024, 8:21 a.m. UTC | #24
Hi, all

On Thu, 2024-01-25 at 19:32 +0100, Daniel Vetter wrote:
> On Wed, Jan 24, 2024 at 09:33:12AM +0100, Christian König wrote:
> > Am 23.01.24 um 20:37 schrieb Zeng, Oak:
> > > [SNIP]
> > > Yes most API are per device based.
> > > 
> > > One exception I know is actually the kfd SVM API. If you look at
> > > the svm_ioctl function, it is per-process based. Each kfd_process
> > > represent a process across N gpu devices.
> > 
> > Yeah and that was a big mistake in my opinion. We should really not
> > do that
> > ever again.
> > 
> > > Need to say, kfd SVM represent a shared virtual address space
> > > across CPU and all GPU devices on the system. This is by the
> > > definition of SVM (shared virtual memory). This is very different
> > > from our legacy gpu *device* driver which works for only one
> > > device (i.e., if you want one device to access another device's
> > > memory, you will have to use dma-buf export/import etc).
> > 
> > Exactly that thinking is what we have currently found as blocker
> > for a
> > virtualization projects. Having SVM as device independent feature
> > which
> > somehow ties to the process address space turned out to be an
> > extremely bad
> > idea.
> > 
> > The background is that this only works for some use cases but not
> > all of
> > them.
> > 
> > What's working much better is to just have a mirror functionality
> > which says
> > that a range A..B of the process address space is mapped into a
> > range C..D
> > of the GPU address space.
> > 
> > Those ranges can then be used to implement the SVM feature required
> > for
> > higher level APIs and not something you need at the UAPI or even
> > inside the
> > low level kernel memory management.
> > 
> > When you talk about migrating memory to a device you also do this
> > on a per
> > device basis and *not* tied to the process address space. If you
> > then get
> > crappy performance because userspace gave contradicting information
> > where to
> > migrate memory then that's a bug in userspace and not something the
> > kernel
> > should try to prevent somehow.
> > 
> > [SNIP]
> > > > I think if you start using the same drm_gpuvm for multiple
> > > > devices you
> > > > will sooner or later start to run into the same mess we have
> > > > seen with
> > > > KFD, where we moved more and more functionality from the KFD to
> > > > the DRM
> > > > render node because we found that a lot of the stuff simply
> > > > doesn't work
> > > > correctly with a single object to maintain the state.
> > > As I understand it, KFD is designed to work across devices. A
> > > single pseudo /dev/kfd device represent all hardware gpu devices.
> > > That is why during kfd open, many pdd (process device data) is
> > > created, each for one hardware device for this process.
> > 
> > Yes, I'm perfectly aware of that. And I can only repeat myself that
> > I see
> > this design as a rather extreme failure. And I think it's one of
> > the reasons
> > why NVidia is so dominant with Cuda.
> > 
> > This whole approach KFD takes was designed with the idea of
> > extending the
> > CPU process into the GPUs, but this idea only works for a few use
> > cases and
> > is not something we should apply to drivers in general.
> > 
> > A very good example are virtualization use cases where you end up
> > with CPU
> > address != GPU address because the VAs are actually coming from the
> > guest VM
> > and not the host process.
> > 
> > SVM is a high level concept of OpenCL, Cuda, ROCm etc.. This should
> > not have
> > any influence on the design of the kernel UAPI.
> > 
> > If you want to do something similar as KFD for Xe I think you need
> > to get
> > explicit permission to do this from Dave and Daniel and maybe even
> > Linus.
> 
> I think the one and only one exception where an SVM uapi like in kfd
> makes
> sense, is if the _hardware_ itself, not the software stack defined
> semantics that you've happened to build on top of that hw, enforces a
> 1:1
> mapping with the cpu process address space.
> 
> Which means your hardware is using PASID, IOMMU based translation,
> PCI-ATS
> (address translation services) or whatever your hw calls it and has
> _no_
> device-side pagetables on top. Which from what I've seen all devices
> with
> device-memory have, simply because they need some place to store
> whether
> that memory is currently in device memory or should be translated
> using
> PASID. Currently there's no gpu that works with PASID only, but there
> are
> some on-cpu-die accelerator things that do work like that.
> 
> Maybe in the future there will be some accelerators that are fully
> cpu
> cache coherent (including atomics) with something like CXL, and the
> on-device memory is managed as normal system memory with struct page
> as
> ZONE_DEVICE and accelerator va -> physical address translation is
> only
> done with PASID ... but for now I haven't seen that, definitely not
> in
> upstream drivers.
> 
> And the moment you have some per-device pagetables or per-device
> memory
> management of some sort (like using gpuva mgr) then I'm 100% agreeing
> with
> Christian that the kfd SVM model is too strict and not a great idea.
> 
> Cheers, Sima


I'm trying to digest all the comments here. The end goal is to be able
to support something similar to this:

https://developer.nvidia.com/blog/simplifying-gpu-application-development-with-heterogeneous-memory-management/

Christian, If I understand you correctly, you're strongly suggesting
not to try to manage a common virtual address space across different
devices in the kernel, but merely providing building blocks to do so,
like for example a generalized userptr with migration support using
HMM; That way each "mirror" of the CPU mm would be per device and
inserted into the gpu_vm just like any other gpu_vma, and user-space
would dictate the A..B -> C..D mapping by choosing the GPU_VA for the
vma.

Sima, it sounds like you're suggesting to shy away from hmm and not
even attempt to support this except if it can be done using IOMMU sva
on selected hardware?

Could you clarify a bit?

Thanks,
Thomas
Christian König Jan. 26, 2024, 10:09 a.m. UTC | #25
Hi Oak,

you can still use SVM, but it should not be a design criterion for the 
kernel UAPI. In other words, the UAPI should be designed in such a way 
that the GPU virtual address can be equal to the CPU virtual address of 
a buffer, but can also be different to support use cases where this 
isn't the case.

Additionally to what Dave wrote I can summarize a few things I have 
learned while working on the AMD GPU drivers in the last decade or so:

1. Userspace requirements are *not* relevant for UAPI or even more 
general kernel driver design.

2. What should be done is to look at the hardware capabilities and try 
to expose those in a safe manner to userspace.

3. The userspace requirements are then used to validate the kernel 
driver and especially the UAPI design to ensure that nothing was missed.

The consequence of this is that nobody should ever use things like Cuda, 
Vulkan, OpenCL, OpenGL etc. as an argument to propose a certain UAPI design.

What should be done instead is to say: My hardware works in this and 
that way -> we want to expose it like this -> because that enables us to 
implement the high level API in this and that way.

Only this gives then a complete picture of how things interact together 
and allows the kernel community to influence and validate the design.

This doesn't mean that you need to throw away everything, but it gives a 
clear restriction that designs are not nailed in stone and for example 
you can't use something like a waterfall model.

Going to answer on your other questions separately.

Regards,
Christian.

Am 25.01.24 um 06:25 schrieb Zeng, Oak:
> Hi Dave,
>
> Let me step back. When I wrote " shared virtual address space b/t cpu and all gpu devices is a hard requirement for our system allocator design", I meant this is not only Intel's design requirement. Rather this is a common requirement for both Intel, AMD and Nvidia. Take a look at cuda driver API definition of cuMemAllocManaged (search this API on https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM), it said:
>
> "The pointer is valid on the CPU and on all GPUs in the system that support managed memory."
>
> This means the program virtual address space is shared b/t CPU and all GPU devices on the system. The system allocator we are discussing is just one step advanced than cuMemAllocManaged: it allows malloc'ed memory to be shared b/t CPU and all GPU devices.
>
> I hope we all agree with this point.
>
> With that, I agree with Christian that in kmd we should make driver code per-device based instead of managing all devices in one driver instance. Our system allocator (and generally xekmd)design follows this rule: we make xe_vm per device based - one device is *not* aware of other device's address space, as I explained in previous email. I started this email seeking a one drm_gpuvm instance to cover all GPU devices. I gave up this approach (at least for now) per Danilo and Christian's feedback: We will continue to have per device based drm_gpuvm. I hope this is aligned with Christian but I will have to wait for Christian's reply to my previous email.
>
> I hope this clarify thing a little.
>
> Regards,
> Oak
>
>> -----Original Message-----
>> From: dri-devel <dri-devel-bounces@lists.freedesktop.org> On Behalf Of David
>> Airlie
>> Sent: Wednesday, January 24, 2024 8:25 PM
>> To: Zeng, Oak <oak.zeng@intel.com>
>> Cc: Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>;
>> Thomas.Hellstrom@linux.intel.com; Winiarski, Michal
>> <michal.winiarski@intel.com>; Felix Kuehling <felix.kuehling@amd.com>; Welty,
>> Brian <brian.welty@intel.com>; Shah, Ankur N <ankur.n.shah@intel.com>; dri-
>> devel@lists.freedesktop.org; intel-xe@lists.freedesktop.org; Gupta, saurabhg
>> <saurabhg.gupta@intel.com>; Danilo Krummrich <dakr@redhat.com>; Daniel
>> Vetter <daniel@ffwll.ch>; Brost, Matthew <matthew.brost@intel.com>; Bommu,
>> Krishnaiah <krishnaiah.bommu@intel.com>; Vishwanathapura, Niranjana
>> <niranjana.vishwanathapura@intel.com>; Christian König
>> <christian.koenig@amd.com>
>> Subject: Re: Making drm_gpuvm work across gpu devices
>>
>>>
>>> For us, Xekmd doesn't need to know it is running under bare metal or
>> virtualized environment. Xekmd is always a guest driver. All the virtual address
>> used in xekmd is guest virtual address. For SVM, we require all the VF devices
>> share one single shared address space with guest CPU program. So all the design
>> works in bare metal environment can automatically work under virtualized
>> environment. +@Shah, Ankur N +@Winiarski, Michal to backup me if I am wrong.
>>>
>>>
>>> Again, shared virtual address space b/t cpu and all gpu devices is a hard
>> requirement for our system allocator design (which means malloc’ed memory,
>> cpu stack variables, globals can be directly used in gpu program. Same
>> requirement as kfd SVM design). This was aligned with our user space software
>> stack.
>>
>> Just to make a very general point here (I'm hoping you listen to
>> Christian a bit more and hoping he replies in more detail), but just
>> because you have a system allocator design done, it doesn't in any way
>> enforce the requirements on the kernel driver to accept that design.
>> Bad system design should be pushed back on, not enforced in
>> implementation stages. It's a trap Intel falls into regularly since
>> they say well we already agreed this design with the userspace team
>> and we can't change it now. This isn't acceptable. Design includes
>> upstream discussion and feedback, if you say misdesigned the system
>> allocator (and I'm not saying you definitely have), and this is
>> pushing back on that, then you have to go fix your system
>> architecture.
>>
>> KFD was an experiment like this, I pushed back on AMD at the start
>> saying it was likely a bad plan, we let it go and got a lot of
>> experience in why it was a bad design.
>>
>> Dave.
Christian König Jan. 26, 2024, 12:52 p.m. UTC | #26
Am 26.01.24 um 09:21 schrieb Thomas Hellström:
> Hi, all
>
> On Thu, 2024-01-25 at 19:32 +0100, Daniel Vetter wrote:
>> On Wed, Jan 24, 2024 at 09:33:12AM +0100, Christian König wrote:
>>> Am 23.01.24 um 20:37 schrieb Zeng, Oak:
>>>> [SNIP]
>>>> Yes most API are per device based.
>>>>
>>>> One exception I know is actually the kfd SVM API. If you look at
>>>> the svm_ioctl function, it is per-process based. Each kfd_process
>>>> represent a process across N gpu devices.
>>> Yeah and that was a big mistake in my opinion. We should really not
>>> do that
>>> ever again.
>>>
>>>> Need to say, kfd SVM represent a shared virtual address space
>>>> across CPU and all GPU devices on the system. This is by the
>>>> definition of SVM (shared virtual memory). This is very different
>>>> from our legacy gpu *device* driver which works for only one
>>>> device (i.e., if you want one device to access another device's
>>>> memory, you will have to use dma-buf export/import etc).
>>> Exactly that thinking is what we have currently found as blocker
>>> for a
>>> virtualization projects. Having SVM as device independent feature
>>> which
>>> somehow ties to the process address space turned out to be an
>>> extremely bad
>>> idea.
>>>
>>> The background is that this only works for some use cases but not
>>> all of
>>> them.
>>>
>>> What's working much better is to just have a mirror functionality
>>> which says
>>> that a range A..B of the process address space is mapped into a
>>> range C..D
>>> of the GPU address space.
>>>
>>> Those ranges can then be used to implement the SVM feature required
>>> for
>>> higher level APIs and not something you need at the UAPI or even
>>> inside the
>>> low level kernel memory management.
>>>
>>> When you talk about migrating memory to a device you also do this
>>> on a per
>>> device basis and *not* tied to the process address space. If you
>>> then get
>>> crappy performance because userspace gave contradicting information
>>> where to
>>> migrate memory then that's a bug in userspace and not something the
>>> kernel
>>> should try to prevent somehow.
>>>
>>> [SNIP]
>>>>> I think if you start using the same drm_gpuvm for multiple
>>>>> devices you
>>>>> will sooner or later start to run into the same mess we have
>>>>> seen with
>>>>> KFD, where we moved more and more functionality from the KFD to
>>>>> the DRM
>>>>> render node because we found that a lot of the stuff simply
>>>>> doesn't work
>>>>> correctly with a single object to maintain the state.
>>>> As I understand it, KFD is designed to work across devices. A
>>>> single pseudo /dev/kfd device represent all hardware gpu devices.
>>>> That is why during kfd open, many pdd (process device data) is
>>>> created, each for one hardware device for this process.
>>> Yes, I'm perfectly aware of that. And I can only repeat myself that
>>> I see
>>> this design as a rather extreme failure. And I think it's one of
>>> the reasons
>>> why NVidia is so dominant with Cuda.
>>>
>>> This whole approach KFD takes was designed with the idea of
>>> extending the
>>> CPU process into the GPUs, but this idea only works for a few use
>>> cases and
>>> is not something we should apply to drivers in general.
>>>
>>> A very good example are virtualization use cases where you end up
>>> with CPU
>>> address != GPU address because the VAs are actually coming from the
>>> guest VM
>>> and not the host process.
>>>
>>> SVM is a high level concept of OpenCL, Cuda, ROCm etc.. This should
>>> not have
>>> any influence on the design of the kernel UAPI.
>>>
>>> If you want to do something similar as KFD for Xe I think you need
>>> to get
>>> explicit permission to do this from Dave and Daniel and maybe even
>>> Linus.
>> I think the one and only one exception where an SVM uapi like in kfd
>> makes
>> sense, is if the _hardware_ itself, not the software stack defined
>> semantics that you've happened to build on top of that hw, enforces a
>> 1:1
>> mapping with the cpu process address space.
>>
>> Which means your hardware is using PASID, IOMMU based translation,
>> PCI-ATS
>> (address translation services) or whatever your hw calls it and has
>> _no_
>> device-side pagetables on top. Which from what I've seen all devices
>> with
>> device-memory have, simply because they need some place to store
>> whether
>> that memory is currently in device memory or should be translated
>> using
>> PASID. Currently there's no gpu that works with PASID only, but there
>> are
>> some on-cpu-die accelerator things that do work like that.
>>
>> Maybe in the future there will be some accelerators that are fully
>> cpu
>> cache coherent (including atomics) with something like CXL, and the
>> on-device memory is managed as normal system memory with struct page
>> as
>> ZONE_DEVICE and accelerator va -> physical address translation is
>> only
>> done with PASID ... but for now I haven't seen that, definitely not
>> in
>> upstream drivers.
>>
>> And the moment you have some per-device pagetables or per-device
>> memory
>> management of some sort (like using gpuva mgr) then I'm 100% agreeing
>> with
>> Christian that the kfd SVM model is too strict and not a great idea.
>>
>> Cheers, Sima
>
> I'm trying to digest all the comments here, The end goal is to be able
> to support something similar to this here:
>
> https://developer.nvidia.com/blog/simplifying-gpu-application-development-with-heterogeneous-memory-management/
>
> Christian, If I understand you correctly, you're strongly suggesting
> not to try to manage a common virtual address space across different
> devices in the kernel, but merely providing building blocks to do so,
> like for example a generalized userptr with migration support using
> HMM; That way each "mirror" of the CPU mm would be per device and
> inserted into the gpu_vm just like any other gpu_vma, and user-space
> would dictate the A..B -> C..D mapping by choosing the GPU_VA for the
> vma.

Exactly that, yes.

> Sima, it sounds like you're suggesting to shy away from hmm and not
> even attempt to support this except if it can be done using IOMMU sva
> on selected hardware?

I think that comment goes more into the direction of: If you have 
ATS/ATC/PRI capable hardware which exposes the functionality to make 
memory reads and writes directly into the address space of the CPU then 
yes an SVM only interface is ok because the hardware can't do anything 
else. But as long as you have something like GPUVM then please don't 
restrict yourself.

Which I totally agree on as well. The ATS/ATC/PRI combination doesn't 
allow using separate page tables device and CPU and so also not separate 
VAs.

This was one of the reasons why we stopped using this approach for AMD GPUs.

Regards,
Christian.

> Could you clarify a bit?
>
> Thanks,
> Thomas
>
>
>
>
>
>
>
Christian König Jan. 26, 2024, 1:23 p.m. UTC | #27
Am 25.01.24 um 19:37 schrieb Zeng, Oak:
>> -----Original Message-----
>> From: Felix Kuehling<felix.kuehling@amd.com>
>> Sent: Thursday, January 25, 2024 12:16 PM
>> To: Zeng, Oak<oak.zeng@intel.com>; Christian König
>> <christian.koenig@amd.com>; Danilo Krummrich<dakr@redhat.com>; Dave
>> Airlie<airlied@redhat.com>; Daniel Vetter<daniel@ffwll.ch>; Shah, Ankur N
>> <ankur.n.shah@intel.com>; Winiarski, Michal<michal.winiarski@intel.com>
>> Cc: Welty, Brian<brian.welty@intel.com>;dri-devel@lists.freedesktop.org; intel-
>> xe@lists.freedesktop.org; Bommu, Krishnaiah<krishnaiah.bommu@intel.com>;
>> Ghimiray, Himal Prasad<himal.prasad.ghimiray@intel.com>;
>> Thomas.Hellstrom@linux.intel.com; Vishwanathapura, Niranjana
>> <niranjana.vishwanathapura@intel.com>; Brost, Matthew
>> <matthew.brost@intel.com>; Gupta, saurabhg<saurabhg.gupta@intel.com>
>> Subject: Re: Making drm_gpuvm work across gpu devices
>>
>>
>> On 2024-01-24 20:17, Zeng, Oak wrote:
>>> Hi Christian,
>>>
>>> Even though I mentioned KFD design, I didn’t mean to copy the KFD
>>> design. I also had hard time to understand the difficulty of KFD under
>>> virtualization environment.
>>>
>> The problem with virtualization is related to virtualization design
>> choices. There is a single process that proxies requests from multiple
>> processes in one (or more?) VMs to the GPU driver. That means, we need a
>> single process with multiple contexts (and address spaces). One proxy
>> process on the host must support multiple guest address spaces.
> My first response is, why processes on the virtual machine can't open /dev/kfd device itself?

Because it's not using SRIOV, we are using native context and so the KFD 
driver is on the host and not the guest.

> Also try to picture why base amdgpu driver (which is per hardware device based) doesn't have this problem... creating multiple contexts under single amdgpu device, each context servicing one guest process?

Yes, exactly that.

>> I don't know much more than these very high level requirements, and I
>> only found out about those a few weeks ago. Due to my own bias I can't
>> comment whether there are bad design choices in the proxy architecture
>> or in KFD or both. The way we are considering fixing this, is to enable
>> creating multiple KFD contexts in the same process. Each of those
>> contexts will still represent a shared virtual address space across
>> devices (but not the CPU). Because the device address space is not
>> shared with the CPU, we cannot support our SVM API in this situation.
>>
> One kfd process, multiple contexts, each context has a shared address space across devices.... I do see some complications 
Zeng, Oak Jan. 26, 2024, 8:13 p.m. UTC | #28
> -----Original Message-----
> From: Christian König <christian.koenig@amd.com>
> Sent: Friday, January 26, 2024 5:10 AM
> To: Zeng, Oak <oak.zeng@intel.com>; David Airlie <airlied@redhat.com>
> Cc: Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>;
> Thomas.Hellstrom@linux.intel.com; Winiarski, Michal
> <michal.winiarski@intel.com>; Felix Kuehling <felix.kuehling@amd.com>; Welty,
> Brian <brian.welty@intel.com>; Shah, Ankur N <ankur.n.shah@intel.com>; dri-
> devel@lists.freedesktop.org; intel-xe@lists.freedesktop.org; Gupta, saurabhg
> <saurabhg.gupta@intel.com>; Danilo Krummrich <dakr@redhat.com>; Daniel
> Vetter <daniel@ffwll.ch>; Brost, Matthew <matthew.brost@intel.com>; Bommu,
> Krishnaiah <krishnaiah.bommu@intel.com>; Vishwanathapura, Niranjana
> <niranjana.vishwanathapura@intel.com>
> Subject: Re: Making drm_gpuvm work across gpu devices
> 
> Hi Oak,
> 
> you can still use SVM, but it should not be a design criteria for the
> kernel UAPI. In other words the UAPI should be designed in such a way
> that the GPU virtual address can be equal to the CPU virtual address of
> a buffer, but can also be different to support use cases where this
> isn't the case.

Terminology:
SVM: any technology which can achieve a shared virtual address space b/t cpu and devices. The virtual address space can be managed by user space or kernel space. Intel implemented an SVM based on the BO-centric gpu driver (gem-create, vm-bind), where the virtual address space is managed by the UMD.
System allocator: another way of implementing SVM. The user just uses malloc'ed memory for gpu submission. The virtual address space is managed by Linux core mm. In practice, we leverage HMM to implement the system allocator.
This article describes the details of all those different models: https://developer.nvidia.com/blog/simplifying-gpu-application-development-with-heterogeneous-memory-management/

Our programming model allows a mixed use of the system allocator and traditional vm_bind (where the cpu address can != the gpu address). Let me re-post the pseudo code:

	1. Fd0 = open(/"dev/dri/render0")
	2. Fd1 = open("/dev/dri/render1")
	3. Fd3 = open("/dev/dri/xe-svm")
	4. Gpu_Vm0 =xe_vm_create(fd0) 
	5. Gpu_Vm1 = xe_vm_create(fd1) 
	6. Queue0 = xe_exec_queue_create(fd0, gpu_vm0)
	7. Queue1 = xe_exec_queue_create(fd1, gpu_vm1)
	8. ptr = malloc()
	9. bo = xe_bo_create(fd0)
	10. Vm_bind(bo, gpu_vm0, va)//va is from UMD, cpu can access bo with same or different va. It is UMD's responsibility that va doesn't conflict with malloc'ed PTRs.
	11. Xe_exec(queue0, ptr)//submit gpu job which use ptr, on card0
	12. Xe_exec(queue1, ptr)//submit gpu job which use ptr, on card1
	13. Xe_exec(queue0, va)//submit gpu job which use va, on card0

In the code above, the va used in vm_bind (line 10, Intel's API to bind an object to a va for GPU access) can be different from the CPU address the cpu uses to access the same object. But whenever the user uses a malloc'ed ptr for GPU submission (lines 11 and 12, the so-called system allocator), it implies the CPU and GPUs use the same ptr for access.

In the above vm_bind, it is the user/UMD's responsibility to guarantee that the vm_bind va doesn't conflict with any malloc'ed ptr. Otherwise it is treated as a programming error.

I think this design still meets your design restrictions. 
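A minimal sketch of that userspace-side convention (xe_vm_bind() below is a hypothetical UMD wrapper, not the real uAPI): reserve the CPU VA range with an anonymous PROT_NONE mmap first, then use the same address as the GPU VA for vm_bind (line 10 above), so later malloc()/mmap() calls cannot hand out an overlapping pointer.

/*
 * Sketch only: reserve the range in the CPU address space before using
 * it as a vm_bind GPU VA, to avoid conflicts with malloc'ed pointers.
 * xe_vm_bind() is a hypothetical wrapper, not a real uAPI call.
 */
#include <stdint.h>
#include <sys/mman.h>

int xe_vm_bind(int vm_fd, uint32_t bo_handle, uint64_t gpu_va,
	       uint64_t size);			/* hypothetical wrapper */

static void *reserve_and_bind(int vm_fd, uint32_t bo_handle, uint64_t size)
{
	/* Reserve the range in the CPU address space; no backing, no access. */
	void *va = mmap(NULL, size, PROT_NONE,
			MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
	if (va == MAP_FAILED)
		return NULL;

	/* Use the reserved address as the GPU VA for the vm_bind mapping. */
	if (xe_vm_bind(vm_fd, bo_handle, (uint64_t)(uintptr_t)va, size)) {
		munmap(va, size);
		return NULL;
	}
	return va;
}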


> 
> Additionally to what Dave wrote I can summarize a few things I have
> learned while working on the AMD GPU drivers in the last decade or so:
> 
> 1. Userspace requirements are *not* relevant for UAPI or even more
> general kernel driver design.
> 
> 2. What should be done is to look at the hardware capabilities and try
> to expose those in a save manner to userspace.
> 
> 3. The userspace requirements are then used to validate the kernel
> driver and especially the UAPI design to ensure that nothing was missed.
> 
> The consequence of this is that nobody should ever use things like Cuda,
> Vulkan, OpenCL, OpenGL etc.. as argument to propose a certain UAPI design.
> 
> What should be done instead is to say: My hardware works in this and
> that way -> we want to expose it like this -> because that enables us to
> implement the high level API in this and that way.
> 
> Only this gives then a complete picture of how things interact together
> and allows the kernel community to influence and validate the design.

What you described above is mainly bottom-up. I know other people do top-down, or whole-system vertical HW-SW co-design. I don't have a strong opinion here.

Regards,
Oak

> 
> This doesn't mean that you need to throw away everything, but it gives a
> clear restriction that designs are not nailed in stone and for example
> you can't use something like a waterfall model.
> 
> Going to answer on your other questions separately.
> 
> Regards,
> Christian.
> 
> Am 25.01.24 um 06:25 schrieb Zeng, Oak:
> > Hi Dave,
> >
> > Let me step back. When I wrote " shared virtual address space b/t cpu and all
> gpu devices is a hard requirement for our system allocator design", I meant this is
> not only Intel's design requirement. Rather this is a common requirement for
> both Intel, AMD and Nvidia. Take a look at cuda driver API definition of
> cuMemAllocManaged (search this API on https://docs.nvidia.com/cuda/cuda-
> driver-api/group__CUDA__MEM.html#group__CUDA__MEM), it said:
> >
> > "The pointer is valid on the CPU and on all GPUs in the system that support
> managed memory."
> >
> > This means the program virtual address space is shared b/t CPU and all GPU
> devices on the system. The system allocator we are discussing is just one step
> advanced than cuMemAllocManaged: it allows malloc'ed memory to be shared
> b/t CPU and all GPU devices.
> >
> > I hope we all agree with this point.
> >
> > With that, I agree with Christian that in kmd we should make driver code per-
> device based instead of managing all devices in one driver instance. Our system
> allocator (and generally xekmd)design follows this rule: we make xe_vm per
> device based - one device is *not* aware of other device's address space, as I
> explained in previous email. I started this email seeking a one drm_gpuvm
> instance to cover all GPU devices. I gave up this approach (at least for now) per
> Danilo and Christian's feedback: We will continue to have per device based
> drm_gpuvm. I hope this is aligned with Christian but I will have to wait for
> Christian's reply to my previous email.
> >
> > I hope this clarify thing a little.
> >
> > Regards,
> > Oak
> >
> >> -----Original Message-----
> >> From: dri-devel <dri-devel-bounces@lists.freedesktop.org> On Behalf Of
> David
> >> Airlie
> >> Sent: Wednesday, January 24, 2024 8:25 PM
> >> To: Zeng, Oak <oak.zeng@intel.com>
> >> Cc: Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>;
> >> Thomas.Hellstrom@linux.intel.com; Winiarski, Michal
> >> <michal.winiarski@intel.com>; Felix Kuehling <felix.kuehling@amd.com>;
> Welty,
> >> Brian <brian.welty@intel.com>; Shah, Ankur N <ankur.n.shah@intel.com>;
> dri-
> >> devel@lists.freedesktop.org; intel-xe@lists.freedesktop.org; Gupta, saurabhg
> >> <saurabhg.gupta@intel.com>; Danilo Krummrich <dakr@redhat.com>; Daniel
> >> Vetter <daniel@ffwll.ch>; Brost, Matthew <matthew.brost@intel.com>;
> Bommu,
> >> Krishnaiah <krishnaiah.bommu@intel.com>; Vishwanathapura, Niranjana
> >> <niranjana.vishwanathapura@intel.com>; Christian König
> >> <christian.koenig@amd.com>
> >> Subject: Re: Making drm_gpuvm work across gpu devices
> >>
> >>>
> >>> For us, Xekmd doesn't need to know it is running under bare metal or
> >> virtualized environment. Xekmd is always a guest driver. All the virtual address
> >> used in xekmd is guest virtual address. For SVM, we require all the VF devices
> >> share one single shared address space with guest CPU program. So all the
> design
> >> works in bare metal environment can automatically work under virtualized
> >> environment. +@Shah, Ankur N +@Winiarski, Michal to backup me if I am
> wrong.
> >>>
> >>>
> >>> Again, shared virtual address space b/t cpu and all gpu devices is a hard
> >> requirement for our system allocator design (which means malloc’ed memory,
> >> cpu stack variables, globals can be directly used in gpu program. Same
> >> requirement as kfd SVM design). This was aligned with our user space
> software
> >> stack.
> >>
> >> Just to make a very general point here (I'm hoping you listen to
> >> Christian a bit more and hoping he replies in more detail), but just
> >> because you have a system allocator design done, it doesn't in any way
> >> enforce the requirements on the kernel driver to accept that design.
> >> Bad system design should be pushed back on, not enforced in
> >> implementation stages. It's a trap Intel falls into regularly since
> >> they say well we already agreed this design with the userspace team
> >> and we can't change it now. This isn't acceptable. Design includes
> >> upstream discussion and feedback, if you say misdesigned the system
> >> allocator (and I'm not saying you definitely have), and this is
> >> pushing back on that, then you have to go fix your system
> >> architecture.
> >>
> >> KFD was an experiment like this, I pushed back on AMD at the start
> >> saying it was likely a bad plan, we let it go and got a lot of
> >> experience in why it was a bad design.
> >>
> >> Dave.
Zeng, Oak Jan. 27, 2024, 2:21 a.m. UTC | #29
Regarding the idea of expanding userptr to support migration, we explored this idea a long time ago. It provides functionality similar to the system allocator, but its interface is not as convenient. Besides the shared virtual address space, another benefit of a system allocator is that you can offload a cpu program to the gpu more easily; you don't need to call a driver-specific API (such as register_userptr or vm_bind in this case) for memory allocation.

We also scoped the implementation. It turned out to be big, and not as beautiful as hmm. That is why we gave up this approach.

From: Christian König <christian.koenig@amd.com>
Sent: Friday, January 26, 2024 7:52 AM
To: Thomas Hellström <thomas.hellstrom@linux.intel.com>; Daniel Vetter <daniel@ffwll.ch>
Cc: Brost, Matthew <matthew.brost@intel.com>; Felix Kuehling <felix.kuehling@amd.com>; Welty, Brian <brian.welty@intel.com>; Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>; Zeng, Oak <oak.zeng@intel.com>; Gupta, saurabhg <saurabhg.gupta@intel.com>; Danilo Krummrich <dakr@redhat.com>; dri-devel@lists.freedesktop.org; Bommu, Krishnaiah <krishnaiah.bommu@intel.com>; Dave Airlie <airlied@redhat.com>; Vishwanathapura, Niranjana <niranjana.vishwanathapura@intel.com>; intel-xe@lists.freedesktop.org
Subject: Re: Making drm_gpuvm work across gpu devices

Am 26.01.24 um 09:21 schrieb Thomas Hellström:

    Hi, all

    On Thu, 2024-01-25 at 19:32 +0100, Daniel Vetter wrote:

        On Wed, Jan 24, 2024 at 09:33:12AM +0100, Christian König wrote:

            Am 23.01.24 um 20:37 schrieb Zeng, Oak:

                [SNIP]

                Yes most API are per device based.

                One exception I know is actually the kfd SVM API. If you look at
                the svm_ioctl function, it is per-process based. Each kfd_process
                represent a process across N gpu devices.

            Yeah and that was a big mistake in my opinion. We should really not
            do that ever again.

                Need to say, kfd SVM represent a shared virtual address space
                across CPU and all GPU devices on the system. This is by the
                definition of SVM (shared virtual memory). This is very different
                from our legacy gpu *device* driver which works for only one
                device (i.e., if you want one device to access another device's
                memory, you will have to use dma-buf export/import etc).

            Exactly that thinking is what we have currently found as blocker
            for a virtualization projects. Having SVM as device independent
            feature which somehow ties to the process address space turned out
            to be an extremely bad idea.

            The background is that this only works for some use cases but not
            all of them.

            What's working much better is to just have a mirror functionality
            which says that a range A..B of the process address space is mapped
            into a range C..D of the GPU address space.

            Those ranges can then be used to implement the SVM feature required
            for higher level APIs and not something you need at the UAPI or
            even inside the low level kernel memory management.

            When you talk about migrating memory to a device you also do this
            on a per device basis and *not* tied to the process address space.
            If you then get crappy performance because userspace gave
            contradicting information where to migrate memory then that's a bug
            in userspace and not something the kernel should try to prevent
            somehow.

            [SNIP]

                    I think if you start using the same drm_gpuvm for multiple
                    devices you will sooner or later start to run into the same
                    mess we have seen with KFD, where we moved more and more
                    functionality from the KFD to the DRM render node because we
                    found that a lot of the stuff simply doesn't work correctly
                    with a single object to maintain the state.

                As I understand it, KFD is designed to work across devices. A
                single pseudo /dev/kfd device represent all hardware gpu devices.
                That is why during kfd open, many pdd (process device data) is
                created, each for one hardware device for this process.

            Yes, I'm perfectly aware of that. And I can only repeat myself that
            I see this design as a rather extreme failure. And I think it's one
            of the reasons why NVidia is so dominant with Cuda.

            This whole approach KFD takes was designed with the idea of
            extending the CPU process into the GPUs, but this idea only works
            for a few use cases and is not something we should apply to drivers
            in general.

            A very good example are virtualization use cases where you end up
            with CPU address != GPU address because the VAs are actually coming
            from the guest VM and not the host process.

            SVM is a high level concept of OpenCL, Cuda, ROCm etc.. This should
            not have any influence on the design of the kernel UAPI.

            If you want to do something similar as KFD for Xe I think you need
            to get explicit permission to do this from Dave and Daniel and
            maybe even Linus.

        I think the one and only one exception where an SVM uapi like in kfd
        makes sense, is if the _hardware_ itself, not the software stack
        defined semantics that you've happened to build on top of that hw,
        enforces a 1:1 mapping with the cpu process address space.

        Which means your hardware is using PASID, IOMMU based translation,
        PCI-ATS (address translation services) or whatever your hw calls it
        and has _no_ device-side pagetables on top. Which from what I've seen
        all devices with device-memory have, simply because they need some
        place to store whether that memory is currently in device memory or
        should be translated using PASID. Currently there's no gpu that works
        with PASID only, but there are some on-cpu-die accelerator things that
        do work like that.

        Maybe in the future there will be some accelerators that are fully cpu
        cache coherent (including atomics) with something like CXL, and the
        on-device memory is managed as normal system memory with struct page
        as ZONE_DEVICE and accelerator va -> physical address translation is
        only done with PASID ... but for now I haven't seen that, definitely
        not in upstream drivers.

        And the moment you have some per-device pagetables or per-device
        memory management of some sort (like using gpuva mgr) then I'm 100%
        agreeing with Christian that the kfd SVM model is too strict and not a
        great idea.

        Cheers, Sima

    I'm trying to digest all the comments here, The end goal is to be able
    to support something similar to this here:

    https://developer.nvidia.com/blog/simplifying-gpu-application-development-with-heterogeneous-memory-management/

    Christian, If I understand you correctly, you're strongly suggesting
    not to try to manage a common virtual address space across different
    devices in the kernel, but merely providing building blocks to do so,
    like for example a generalized userptr with migration support using
    HMM; That way each "mirror" of the CPU mm would be per device and
    inserted into the gpu_vm just like any other gpu_vma, and user-space
    would dictate the A..B -> C..D mapping by choosing the GPU_VA for the
    vma.

Exactly that, yes.

    Sima, it sounds like you're suggesting to shy away from hmm and not
    even attempt to support this except if it can be done using IOMMU sva
    on selected hardware?

I think that comment goes more into the direction of: If you have
ATS/ATC/PRI capable hardware which exposes the functionality to make memory
reads and writes directly into the address space of the CPU then yes an SVM
only interface is ok because the hardware can't do anything else. But as
long as you have something like GPUVM then please don't restrict yourself.

Which I totally agree on as well. The ATS/ATC/PRI combination doesn't allow
using separate page tables device and CPU and so also not separate VAs.

This was one of the reasons why we stopped using this approach for AMD GPUs.

Regards,
Christian.

    Could you clarify a bit?

    Thanks,
    Thomas
Christian König Jan. 29, 2024, 10:10 a.m. UTC | #30
Am 26.01.24 um 21:13 schrieb Zeng, Oak:
>> -----Original Message-----
>> From: Christian König<christian.koenig@amd.com>
>> Sent: Friday, January 26, 2024 5:10 AM
>> To: Zeng, Oak<oak.zeng@intel.com>; David Airlie<airlied@redhat.com>
>> Cc: Ghimiray, Himal Prasad<himal.prasad.ghimiray@intel.com>;
>> Thomas.Hellstrom@linux.intel.com; Winiarski, Michal
>> <michal.winiarski@intel.com>; Felix Kuehling<felix.kuehling@amd.com>; Welty,
>> Brian<brian.welty@intel.com>; Shah, Ankur N<ankur.n.shah@intel.com>; dri-
>> devel@lists.freedesktop.org;intel-xe@lists.freedesktop.org; Gupta, saurabhg
>> <saurabhg.gupta@intel.com>; Danilo Krummrich<dakr@redhat.com>; Daniel
>> Vetter<daniel@ffwll.ch>; Brost, Matthew<matthew.brost@intel.com>; Bommu,
>> Krishnaiah<krishnaiah.bommu@intel.com>; Vishwanathapura, Niranjana
>> <niranjana.vishwanathapura@intel.com>
>> Subject: Re: Making drm_gpuvm work across gpu devices
>>
>> Hi Oak,
>>
>> you can still use SVM, but it should not be a design criteria for the
>> kernel UAPI. In other words the UAPI should be designed in such a way
>> that the GPU virtual address can be equal to the CPU virtual address of
>> a buffer, but can also be different to support use cases where this
>> isn't the case.
> Terminology:
> SVM: any technology which can achieve a shared virtual address space b/t cpu and devices. The virtual address space can be managed by user space or kernel space. Intel implemented a SVM, based on the BO-centric gpu driver (gem-create, vm-bind) where virtual address space is managed by UMD.
> System allocator: another way of implement SVM. User just use malloc'ed memory for gpu submission. Virtual address space is managed by Linux core mm. In practice, we leverage HMM to implement system allocator.
> This article described details of all those different model:https://developer.nvidia.com/blog/simplifying-gpu-application-development-with-heterogeneous-memory-management/
>
> Our programming model allows a mixture use of system allocator (even though system allocator is ) and traditional vm_bind (where cpu address can != gpu address). Let me re-post the pseudo codes:
>
> 	1. Fd0 = open(/"dev/dri/render0")
> 	2. Fd1 = open("/dev/dri/render1")
> 	3. Fd3 = open("/dev/dri/xe-svm")
> 	4. Gpu_Vm0 =xe_vm_create(fd0)
> 	5. Gpu_Vm1 = xe_vm_create(fd1)
> 	6. Queue0 = xe_exec_queue_create(fd0, gpu_vm0)
> 	7. Queue1 = xe_exec_queue_create(fd1, gpu_vm1)
> 	8. ptr = malloc()
> 	9. bo = xe_bo_create(fd0)
> 	10. Vm_bind(bo, gpu_vm0, va)//va is from UMD, cpu can access bo with same or different va. It is UMD's responsibility that va doesn't conflict with malloc'ed PTRs.
> 	11. Xe_exec(queue0, ptr)//submit gpu job which use ptr, on card0
> 	12. Xe_exec(queue1, ptr)//submit gpu job which use ptr, on card1
> 	13. Xe_exec(queue0, va)//submit gpu job which use va, on card0
>
> In above codes, the va used in vm_bind (line 10, Intel's API to bind an object to a va for GPU access) can be different from the CPU address when cpu access the same object. But whenever user use malloc'ed ptr for GPU submission (line 11, 12, so called system allocator), it implies CPU and GPUs use the same ptr to access.
>
> In above vm_bind, it is user/UMD's responsibility to guarantee that vm_bind va doesn't conflict with malloc'ed ptr. Otherwise it is treated as programming error.
>
> I think this design still meets your design restrictions.

Well why do you need this "Fd3 = open("/dev/dri/xe-svm")" ?

As far as I see fd3 isn't used anywhere. What you can do is to bind 
parts of your process address space to your driver connections (fd1, fd2 
etc..) with a vm_bind(), but this should *not* come because of 
implicitly using some other file descriptor in the process.

As far as I can see this design is exactly what failed so badly with KFD.

Regards,
Christian.

>
>
>> Additionally to what Dave wrote I can summarize a few things I have
>> learned while working on the AMD GPU drivers in the last decade or so:
>>
>> 1. Userspace requirements are *not* relevant for UAPI or even more
>> general kernel driver design.
>>
>> 2. What should be done is to look at the hardware capabilities and try
>> to expose those in a save manner to userspace.
>>
>> 3. The userspace requirements are then used to validate the kernel
>> driver and especially the UAPI design to ensure that nothing was missed.
>>
>> The consequence of this is that nobody should ever use things like Cuda,
>> Vulkan, OpenCL, OpenGL etc.. as argument to propose a certain UAPI design.
>>
>> What should be done instead is to say: My hardware works in this and
>> that way -> we want to expose it like this -> because that enables us to
>> implement the high level API in this and that way.
>>
>> Only this gives then a complete picture of how things interact together
>> and allows the kernel community to influence and validate the design.
> What you described above is mainly bottom up. I know other people do top down, or whole system vertical HW-SW co-design. I don't have strong opinion here.
>
> Regards,
> Oak
>
>> This doesn't mean that you need to throw away everything, but it gives a
>> clear restriction that designs are not nailed in stone and for example
>> you can't use something like a waterfall model.
>>
>> Going to answer on your other questions separately.
>>
>> Regards,
>> Christian.
>>
>> Am 25.01.24 um 06:25 schrieb Zeng, Oak:
>>> Hi Dave,
>>>
>>> Let me step back. When I wrote " shared virtual address space b/t cpu and all
>> gpu devices is a hard requirement for our system allocator design", I meant this is
>> not only Intel's design requirement. Rather this is a common requirement for
>> both Intel, AMD and Nvidia. Take a look at cuda driver API definition of
>> cuMemAllocManaged (search this API onhttps://docs.nvidia.com/cuda/cuda-
>> driver-api/group__CUDA__MEM.html#group__CUDA__MEM), it said:
>>> "The pointer is valid on the CPU and on all GPUs in the system that support
>> managed memory."
>>> This means the program virtual address space is shared b/t CPU and all GPU
>> devices on the system. The system allocator we are discussing is just one step
>> advanced than cuMemAllocManaged: it allows malloc'ed memory to be shared
>> b/t CPU and all GPU devices.
>>> I hope we all agree with this point.
>>>
>>> With that, I agree with Christian that in kmd we should make driver code per-
>> device based instead of managing all devices in one driver instance. Our system
>> allocator (and generally xekmd)design follows this rule: we make xe_vm per
>> device based - one device is *not* aware of other device's address space, as I
>> explained in previous email. I started this email seeking a one drm_gpuvm
>> instance to cover all GPU devices. I gave up this approach (at least for now) per
>> Danilo and Christian's feedback: We will continue to have per device based
>> drm_gpuvm. I hope this is aligned with Christian but I will have to wait for
>> Christian's reply to my previous email.
>>> I hope this clarify thing a little.
>>>
>>> Regards,
>>> Oak
>>>
>>>> -----Original Message-----
>>>> From: dri-devel<dri-devel-bounces@lists.freedesktop.org>  On Behalf Of
>> David
>>>> Airlie
>>>> Sent: Wednesday, January 24, 2024 8:25 PM
>>>> To: Zeng, Oak<oak.zeng@intel.com>
>>>> Cc: Ghimiray, Himal Prasad<himal.prasad.ghimiray@intel.com>;
>>>> Thomas.Hellstrom@linux.intel.com; Winiarski, Michal
>>>> <michal.winiarski@intel.com>; Felix Kuehling<felix.kuehling@amd.com>;
>> Welty,
>>>> Brian<brian.welty@intel.com>; Shah, Ankur N<ankur.n.shah@intel.com>;
>> dri-
>>>> devel@lists.freedesktop.org;intel-xe@lists.freedesktop.org; Gupta, saurabhg
>>>> <saurabhg.gupta@intel.com>; Danilo Krummrich<dakr@redhat.com>; Daniel
>>>> Vetter<daniel@ffwll.ch>; Brost, Matthew<matthew.brost@intel.com>;
>> Bommu,
>>>> Krishnaiah<krishnaiah.bommu@intel.com>; Vishwanathapura, Niranjana
>>>> <niranjana.vishwanathapura@intel.com>; Christian König
>>>> <christian.koenig@amd.com>
>>>> Subject: Re: Making drm_gpuvm work across gpu devices
>>>>
>>>>> For us, Xekmd doesn't need to know it is running under bare metal or
>>>> virtualized environment. Xekmd is always a guest driver. All the virtual address
>>>> used in xekmd is guest virtual address. For SVM, we require all the VF devices
>>>> share one single shared address space with guest CPU program. So all the
>> design
>>>> works in bare metal environment can automatically work under virtualized
>>>> environment. +@Shah, Ankur N +@Winiarski, Michal to backup me if I am
>> wrong.
>>>>>
>>>>> Again, shared virtual address space b/t cpu and all gpu devices is a hard
>>>> requirement for our system allocator design (which means malloc’ed memory,
>>>> cpu stack variables, globals can be directly used in gpu program. Same
>>>> requirement as kfd SVM design). This was aligned with our user space
>> software
>>>> stack.
>>>>
>>>> Just to make a very general point here (I'm hoping you listen to
>>>> Christian a bit more and hoping he replies in more detail), but just
>>>> because you have a system allocator design done, it doesn't in any way
>>>> enforce the requirements on the kernel driver to accept that design.
>>>> Bad system design should be pushed back on, not enforced in
>>>> implementation stages. It's a trap Intel falls into regularly since
>>>> they say well we already agreed this design with the userspace team
>>>> and we can't change it now. This isn't acceptable. Design includes
>>>> upstream discussion and feedback, if you say misdesigned the system
>>>> allocator (and I'm not saying you definitely have), and this is
>>>> pushing back on that, then you have to go fix your system
>>>> architecture.
>>>>
>>>> KFD was an experiment like this, I pushed back on AMD at the start
>>>> saying it was likely a bad plan, we let it go and got a lot of
>>>> experience in why it was a bad design.
>>>>
>>>> Dave.
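
A minimal C sketch of the mixed system-allocator + vm_bind flow from the
pseudo code quoted in the message above. The xe_* helpers are hypothetical
userspace wrappers introduced only for illustration; they are not the
actual Xe uapi or any existing library.

#include <fcntl.h>
#include <stdint.h>
#include <stdlib.h>

/* Hypothetical UMD wrappers, assumed for illustration; not real uapi. */
struct xe_vm;
struct xe_queue;
struct xe_bo;
extern struct xe_vm *xe_vm_create(int fd);
extern struct xe_queue *xe_exec_queue_create(int fd, struct xe_vm *vm);
extern struct xe_bo *xe_bo_create(int fd, size_t size);
extern int xe_vm_bind(struct xe_vm *vm, struct xe_bo *bo, uint64_t gpu_va);
extern int xe_exec(struct xe_queue *q, uint64_t gpu_addr);

void mixed_usage_example(void)
{
	int fd0 = open("/dev/dri/renderD128", O_RDWR);	/* card0 */
	int fd1 = open("/dev/dri/renderD129", O_RDWR);	/* card1 */

	struct xe_vm *vm0 = xe_vm_create(fd0);		/* one GPU VM per device */
	struct xe_vm *vm1 = xe_vm_create(fd1);
	struct xe_queue *q0 = xe_exec_queue_create(fd0, vm0);
	struct xe_queue *q1 = xe_exec_queue_create(fd1, vm1);

	/* System allocator path: plain malloc, CPU VA == GPU VA. */
	void *ptr = malloc(1 << 20);

	/*
	 * Classic path: BO plus vm_bind, GPU VA chosen by the UMD.  The UMD
	 * must make sure gpu_va does not collide with any malloc'ed range.
	 */
	uint64_t gpu_va = 0x100000000ull;
	struct xe_bo *bo = xe_bo_create(fd0, 1 << 20);
	xe_vm_bind(vm0, bo, gpu_va);

	xe_exec(q0, (uintptr_t)ptr);	/* malloc'ed pointer on card0 */
	xe_exec(q1, (uintptr_t)ptr);	/* same pointer on card1 */
	xe_exec(q0, gpu_va);		/* vm_bind'ed address on card0 */
}

The point of the sketch is only that both paths coexist in one process: the
malloc'ed pointer is used unchanged on every device, while the BO path may
use a GPU VA that differs from any CPU address.
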
Christian König Jan. 29, 2024, 10:19 a.m. UTC | #31
Well Daniel and Dave noted it as well, so I'm just repeating it: Your 
design choices are not an argument to get something upstream.

It's the job of the maintainers, and in the end of Linus, to judge whether 
something is acceptable or not.

As far as I can see a good part of this idea has been exercised at length 
with KFD and it turned out to not be the best approach.

So from what I've seen the design you outlined is extremely unlikely to 
go upstream.

Regards,
Christian.

Am 27.01.24 um 03:21 schrieb Zeng, Oak:
>
> Regarding the idea of expanding userptr to support migration, we 
> explored this idea long time ago. It provides similar functions of the 
> system allocator but its interface is not as convenient as system 
> allocator. Besides the shared virtual address space, another benefit 
> of a system allocator is, you can offload cpu program to gpu easier, 
> you don’t need to call driver specific API (such as register_userptr 
> and vm_bind in this case) for memory allocation.
>
> We also scoped the implementation. It turned out to be big, and not as 
> beautiful as hmm. Why we gave up this approach.
>
> *From:*Christian König <christian.koenig@amd.com>
> *Sent:* Friday, January 26, 2024 7:52 AM
> *To:* Thomas Hellström <thomas.hellstrom@linux.intel.com>; Daniel 
> Vetter <daniel@ffwll.ch>
> *Cc:* Brost, Matthew <matthew.brost@intel.com>; Felix Kuehling 
> <felix.kuehling@amd.com>; Welty, Brian <brian.welty@intel.com>; 
> Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>; Zeng, Oak 
> <oak.zeng@intel.com>; Gupta, saurabhg <saurabhg.gupta@intel.com>; 
> Danilo Krummrich <dakr@redhat.com>; dri-devel@lists.freedesktop.org; 
> Bommu, Krishnaiah <krishnaiah.bommu@intel.com>; Dave Airlie 
> <airlied@redhat.com>; Vishwanathapura, Niranjana 
> <niranjana.vishwanathapura@intel.com>; intel-xe@lists.freedesktop.org
> *Subject:* Re: Making drm_gpuvm work across gpu devices
>
> Am 26.01.24 um 09:21 schrieb Thomas Hellström:
>
>     Hi, all
>
>     On Thu, 2024-01-25 at 19:32 +0100, Daniel Vetter wrote:
>
>         On Wed, Jan 24, 2024 at 09:33:12AM +0100, Christian König wrote:
>
>             Am 23.01.24 um 20:37 schrieb Zeng, Oak:
>
>                 [SNIP]
>
>                 Yes most API are per device based.
>
>                 One exception I know is actually the kfd SVM API. If you look at
>
>                 the svm_ioctl function, it is per-process based. Each kfd_process
>
>                 represent a process across N gpu devices.
>
>             Yeah and that was a big mistake in my opinion. We should really not
>
>             do that
>
>             ever again.
>
>                 Need to say, kfd SVM represent a shared virtual address space
>
>                 across CPU and all GPU devices on the system. This is by the
>
>                 definition of SVM (shared virtual memory). This is very different
>
>                 from our legacy gpu *device* driver which works for only one
>
>                 device (i.e., if you want one device to access another device's
>
>                 memory, you will have to use dma-buf export/import etc).
>
>             Exactly that thinking is what we have currently found as blocker
>
>             for a
>
>             virtualization projects. Having SVM as device independent feature
>
>             which
>
>             somehow ties to the process address space turned out to be an
>
>             extremely bad
>
>             idea.
>
>             The background is that this only works for some use cases but not
>
>             all of
>
>             them.
>
>             What's working much better is to just have a mirror functionality
>
>             which says
>
>             that a range A..B of the process address space is mapped into a
>
>             range C..D
>
>             of the GPU address space.
>
>             Those ranges can then be used to implement the SVM feature required
>
>             for
>
>             higher level APIs and not something you need at the UAPI or even
>
>             inside the
>
>             low level kernel memory management.
>
>             When you talk about migrating memory to a device you also do this
>
>             on a per
>
>             device basis and *not* tied to the process address space. If you
>
>             then get
>
>             crappy performance because userspace gave contradicting information
>
>             where to
>
>             migrate memory then that's a bug in userspace and not something the
>
>             kernel
>
>             should try to prevent somehow.
>
>             [SNIP]
>
>                     I think if you start using the same drm_gpuvm for multiple
>
>                     devices you
>
>                     will sooner or later start to run into the same mess we have
>
>                     seen with
>
>                     KFD, where we moved more and more functionality from the KFD to
>
>                     the DRM
>
>                     render node because we found that a lot of the stuff simply
>
>                     doesn't work
>
>                     correctly with a single object to maintain the state.
>
>                 As I understand it, KFD is designed to work across devices. A
>
>                 single pseudo /dev/kfd device represent all hardware gpu devices.
>
>                 That is why during kfd open, many pdd (process device data) is
>
>                 created, each for one hardware device for this process.
>
>             Yes, I'm perfectly aware of that. And I can only repeat myself that
>
>             I see
>
>             this design as a rather extreme failure. And I think it's one of
>
>             the reasons
>
>             why NVidia is so dominant with Cuda.
>
>             This whole approach KFD takes was designed with the idea of
>
>             extending the
>
>             CPU process into the GPUs, but this idea only works for a few use
>
>             cases and
>
>             is not something we should apply to drivers in general.
>
>             A very good example are virtualization use cases where you end up
>
>             with CPU
>
>             address != GPU address because the VAs are actually coming from the
>
>             guest VM
>
>             and not the host process.
>
>             SVM is a high level concept of OpenCL, Cuda, ROCm etc.. This should
>
>             not have
>
>             any influence on the design of the kernel UAPI.
>
>             If you want to do something similar as KFD for Xe I think you need
>
>             to get
>
>             explicit permission to do this from Dave and Daniel and maybe even
>
>             Linus.
>
>         I think the one and only one exception where an SVM uapi like in kfd
>
>         makes
>
>         sense, is if the _hardware_ itself, not the software stack defined
>
>         semantics that you've happened to build on top of that hw, enforces a
>
>         1:1
>
>         mapping with the cpu process address space.
>
>         Which means your hardware is using PASID, IOMMU based translation,
>
>         PCI-ATS
>
>         (address translation services) or whatever your hw calls it and has
>
>         _no_
>
>         device-side pagetables on top. Which from what I've seen all devices
>
>         with
>
>         device-memory have, simply because they need some place to store
>
>         whether
>
>         that memory is currently in device memory or should be translated
>
>         using
>
>         PASID. Currently there's no gpu that works with PASID only, but there
>
>         are
>
>         some on-cpu-die accelerator things that do work like that.
>
>         Maybe in the future there will be some accelerators that are fully
>
>         cpu
>
>         cache coherent (including atomics) with something like CXL, and the
>
>         on-device memory is managed as normal system memory with struct page
>
>         as
>
>         ZONE_DEVICE and accelerator va -> physical address translation is
>
>         only
>
>         done with PASID ... but for now I haven't seen that, definitely not
>
>         in
>
>         upstream drivers.
>
>         And the moment you have some per-device pagetables or per-device
>
>         memory
>
>         management of some sort (like using gpuva mgr) then I'm 100% agreeing
>
>         with
>
>         Christian that the kfd SVM model is too strict and not a great idea.
>
>         Cheers, Sima
>
>     I'm trying to digest all the comments here, The end goal is to be able
>
>     to support something similar to this here:
>
>     https://developer.nvidia.com/blog/simplifying-gpu-application-development-with-heterogeneous-memory-management/
>
>     Christian, If I understand you correctly, you're strongly suggesting
>
>     not to try to manage a common virtual address space across different
>
>     devices in the kernel, but merely providing building blocks to do so,
>
>     like for example a generalized userptr with migration support using
>
>     HMM; That way each "mirror" of the CPU mm would be per device and
>
>     inserted into the gpu_vm just like any other gpu_vma, and user-space
>
>     would dictate the A..B -> C..D mapping by choosing the GPU_VA for the
>
>     vma.
>
>
> Exactly that, yes.
>
>
>     Sima, it sounds like you're suggesting to shy away from hmm and not
>
>     even attempt to support this except if it can be done using IOMMU sva
>
>     on selected hardware?
>
>
> I think that comment goes more into the direction of: If you have 
> ATS/ATC/PRI capable hardware which exposes the functionality to make 
> memory reads and writes directly into the address space of the CPU 
> then yes an SVM only interface is ok because the hardware can't do 
> anything else. But as long as you have something like GPUVM then 
> please don't restrict yourself.
>
> Which I totally agree on as well. The ATS/ATC/PRI combination doesn't 
> allow using separate page tables device and CPU and so also not 
> separate VAs.
>
> This was one of the reasons why we stopped using this approach for AMD 
> GPUs.
>
> Regards,
> Christian.
>
>
>     Could you clarify a bit?
>
>     Thanks,
>
>     Thomas
>
Felix Kuehling Jan. 29, 2024, 3:03 p.m. UTC | #32
On 2024-01-25 13:32, Daniel Vetter wrote:
> On Wed, Jan 24, 2024 at 09:33:12AM +0100, Christian König wrote:
>> Am 23.01.24 um 20:37 schrieb Zeng, Oak:
>>> [SNIP]
>>> Yes most API are per device based.
>>>
>>> One exception I know is actually the kfd SVM API. If you look at the svm_ioctl function, it is per-process based. Each kfd_process represent a process across N gpu devices.
>> Yeah and that was a big mistake in my opinion. We should really not do that
>> ever again.
>>
>>> Need to say, kfd SVM represent a shared virtual address space across CPU and all GPU devices on the system. This is by the definition of SVM (shared virtual memory). This is very different from our legacy gpu *device* driver which works for only one device (i.e., if you want one device to access another device's memory, you will have to use dma-buf export/import etc).
>> Exactly that thinking is what we have currently found as blocker for a
>> virtualization projects. Having SVM as device independent feature which
>> somehow ties to the process address space turned out to be an extremely bad
>> idea.
>>
>> The background is that this only works for some use cases but not all of
>> them.
>>
>> What's working much better is to just have a mirror functionality which says
>> that a range A..B of the process address space is mapped into a range C..D
>> of the GPU address space.
>>
>> Those ranges can then be used to implement the SVM feature required for
>> higher level APIs and not something you need at the UAPI or even inside the
>> low level kernel memory management.
>>
>> When you talk about migrating memory to a device you also do this on a per
>> device basis and *not* tied to the process address space. If you then get
>> crappy performance because userspace gave contradicting information where to
>> migrate memory then that's a bug in userspace and not something the kernel
>> should try to prevent somehow.
>>
>> [SNIP]
>>>> I think if you start using the same drm_gpuvm for multiple devices you
>>>> will sooner or later start to run into the same mess we have seen with
>>>> KFD, where we moved more and more functionality from the KFD to the DRM
>>>> render node because we found that a lot of the stuff simply doesn't work
>>>> correctly with a single object to maintain the state.
>>> As I understand it, KFD is designed to work across devices. A single pseudo /dev/kfd device represent all hardware gpu devices. That is why during kfd open, many pdd (process device data) is created, each for one hardware device for this process.
>> Yes, I'm perfectly aware of that. And I can only repeat myself that I see
>> this design as a rather extreme failure. And I think it's one of the reasons
>> why NVidia is so dominant with Cuda.
>>
>> This whole approach KFD takes was designed with the idea of extending the
>> CPU process into the GPUs, but this idea only works for a few use cases and
>> is not something we should apply to drivers in general.
>>
>> A very good example are virtualization use cases where you end up with CPU
>> address != GPU address because the VAs are actually coming from the guest VM
>> and not the host process.
>>
>> SVM is a high level concept of OpenCL, Cuda, ROCm etc.. This should not have
>> any influence on the design of the kernel UAPI.
>>
>> If you want to do something similar as KFD for Xe I think you need to get
>> explicit permission to do this from Dave and Daniel and maybe even Linus.
> I think the one and only one exception where an SVM uapi like in kfd makes
> sense, is if the _hardware_ itself, not the software stack defined
> semantics that you've happened to build on top of that hw, enforces a 1:1
> mapping with the cpu process address space.
>
> Which means your hardware is using PASID, IOMMU based translation, PCI-ATS
> (address translation services) or whatever your hw calls it and has _no_
> device-side pagetables on top. Which from what I've seen all devices with
> device-memory have, simply because they need some place to store whether
> that memory is currently in device memory or should be translated using
> PASID. Currently there's no gpu that works with PASID only, but there are
> some on-cpu-die accelerator things that do work like that.
>
> Maybe in the future there will be some accelerators that are fully cpu
> cache coherent (including atomics) with something like CXL, and the
> on-device memory is managed as normal system memory with struct page as
> ZONE_DEVICE and accelerator va -> physical address translation is only
> done with PASID ... but for now I haven't seen that, definitely not in
> upstream drivers.
>
> And the moment you have some per-device pagetables or per-device memory
> management of some sort (like using gpuva mgr) then I'm 100% agreeing with
> Christian that the kfd SVM model is too strict and not a great idea.

That basically means, without ATS/PRI+PASID you cannot implement a 
unified memory programming model, where GPUs or accelerators access 
virtual addresses without pre-registering them with an SVM API call.

Unified memory is a feature implemented by the KFD SVM API and used by 
ROCm. This is used e.g. to implement OpenMP USM (unified shared memory). 
It's implemented with recoverable GPU page faults. If the page fault 
interrupt handler cannot assume a shared virtual address space, then 
implementing this feature isn't possible.

Regards,
   Felix


>
> Cheers, Sima
Christian König Jan. 29, 2024, 3:33 p.m. UTC | #33
Am 29.01.24 um 16:03 schrieb Felix Kuehling:
> On 2024-01-25 13:32, Daniel Vetter wrote:
>> On Wed, Jan 24, 2024 at 09:33:12AM +0100, Christian König wrote:
>>> Am 23.01.24 um 20:37 schrieb Zeng, Oak:
>>>> [SNIP]
>>>> Yes most API are per device based.
>>>>
>>>> One exception I know is actually the kfd SVM API. If you look at 
>>>> the svm_ioctl function, it is per-process based. Each kfd_process 
>>>> represent a process across N gpu devices.
>>> Yeah and that was a big mistake in my opinion. We should really not 
>>> do that
>>> ever again.
>>>
>>>> Need to say, kfd SVM represent a shared virtual address space 
>>>> across CPU and all GPU devices on the system. This is by the 
>>>> definition of SVM (shared virtual memory). This is very different 
>>>> from our legacy gpu *device* driver which works for only one device 
>>>> (i.e., if you want one device to access another device's memory, 
>>>> you will have to use dma-buf export/import etc).
>>> Exactly that thinking is what we have currently found as blocker for a
>>> virtualization projects. Having SVM as device independent feature which
>>> somehow ties to the process address space turned out to be an 
>>> extremely bad
>>> idea.
>>>
>>> The background is that this only works for some use cases but not 
>>> all of
>>> them.
>>>
>>> What's working much better is to just have a mirror functionality 
>>> which says
>>> that a range A..B of the process address space is mapped into a 
>>> range C..D
>>> of the GPU address space.
>>>
>>> Those ranges can then be used to implement the SVM feature required for
>>> higher level APIs and not something you need at the UAPI or even 
>>> inside the
>>> low level kernel memory management.
>>>
>>> When you talk about migrating memory to a device you also do this on 
>>> a per
>>> device basis and *not* tied to the process address space. If you 
>>> then get
>>> crappy performance because userspace gave contradicting information 
>>> where to
>>> migrate memory then that's a bug in userspace and not something the 
>>> kernel
>>> should try to prevent somehow.
>>>
>>> [SNIP]
>>>>> I think if you start using the same drm_gpuvm for multiple devices 
>>>>> you
>>>>> will sooner or later start to run into the same mess we have seen 
>>>>> with
>>>>> KFD, where we moved more and more functionality from the KFD to 
>>>>> the DRM
>>>>> render node because we found that a lot of the stuff simply 
>>>>> doesn't work
>>>>> correctly with a single object to maintain the state.
>>>> As I understand it, KFD is designed to work across devices. A 
>>>> single pseudo /dev/kfd device represent all hardware gpu devices. 
>>>> That is why during kfd open, many pdd (process device data) is 
>>>> created, each for one hardware device for this process.
>>> Yes, I'm perfectly aware of that. And I can only repeat myself that 
>>> I see
>>> this design as a rather extreme failure. And I think it's one of the 
>>> reasons
>>> why NVidia is so dominant with Cuda.
>>>
>>> This whole approach KFD takes was designed with the idea of 
>>> extending the
>>> CPU process into the GPUs, but this idea only works for a few use 
>>> cases and
>>> is not something we should apply to drivers in general.
>>>
>>> A very good example are virtualization use cases where you end up 
>>> with CPU
>>> address != GPU address because the VAs are actually coming from the 
>>> guest VM
>>> and not the host process.
>>>
>>> SVM is a high level concept of OpenCL, Cuda, ROCm etc.. This should 
>>> not have
>>> any influence on the design of the kernel UAPI.
>>>
>>> If you want to do something similar as KFD for Xe I think you need 
>>> to get
>>> explicit permission to do this from Dave and Daniel and maybe even 
>>> Linus.
>> I think the one and only one exception where an SVM uapi like in kfd 
>> makes
>> sense, is if the _hardware_ itself, not the software stack defined
>> semantics that you've happened to build on top of that hw, enforces a 
>> 1:1
>> mapping with the cpu process address space.
>>
>> Which means your hardware is using PASID, IOMMU based translation, 
>> PCI-ATS
>> (address translation services) or whatever your hw calls it and has _no_
>> device-side pagetables on top. Which from what I've seen all devices 
>> with
>> device-memory have, simply because they need some place to store whether
>> that memory is currently in device memory or should be translated using
>> PASID. Currently there's no gpu that works with PASID only, but there 
>> are
>> some on-cpu-die accelerator things that do work like that.
>>
>> Maybe in the future there will be some accelerators that are fully cpu
>> cache coherent (including atomics) with something like CXL, and the
>> on-device memory is managed as normal system memory with struct page as
>> ZONE_DEVICE and accelerator va -> physical address translation is only
>> done with PASID ... but for now I haven't seen that, definitely not in
>> upstream drivers.
>>
>> And the moment you have some per-device pagetables or per-device memory
>> management of some sort (like using gpuva mgr) then I'm 100% agreeing 
>> with
>> Christian that the kfd SVM model is too strict and not a great idea.
>
> That basically means, without ATS/PRI+PASID you cannot implement a 
> unified memory programming model, where GPUs or accelerators access 
> virtual addresses without pre-registering them with an SVM API call.
>
> Unified memory is a feature implemented by the KFD SVM API and used by 
> ROCm. This is used e.g. to implement OpenMP USM (unified shared 
> memory). It's implemented with recoverable GPU page faults. If the 
> page fault interrupt handler cannot assume a shared virtual address 
> space, then implementing this feature isn't possible.

Why not? As far as I can see the OpenMP USM is just another funky way of 
userptr handling.

The difference is that with a userptr we assume that we always need to 
request the whole block A..B from a mapping while for page fault based 
handling it can be just any page in between A and B which is requested 
and made available to the GPU address space.

As far as I can see there is absolutely no need for any special SVM 
handling.

Regards,
Christian.

>
> Regards,
>   Felix
>
>
>>
>> Cheers, Sima
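
A rough sketch of the distinction Christian draws in the message above
between userptr-style and fault-driven mirroring of a CPU range A..B. The
mirror_cpu_range() helper is invented for illustration and does not
correspond to actual Xe, amdgpu or HMM code.

#include <stdint.h>

#define PAGE_SIZE 4096ull

/*
 * Hypothetical mirror primitive, assumed for illustration: make
 * [addr, addr + size) of the CPU address space visible to one device's
 * page tables.
 */
extern int mirror_cpu_range(int device_fd, uint64_t addr, uint64_t size);

/* userptr model: everything in A..B is requested before first use. */
int userptr_style(int device_fd, uint64_t a, uint64_t b)
{
	return mirror_cpu_range(device_fd, a, b - a);
}

/*
 * Fault model: only the page containing the faulting address is requested,
 * at the moment the GPU actually touches it.
 */
int fault_style(int device_fd, uint64_t fault_addr)
{
	return mirror_cpu_range(device_fd, fault_addr & ~(PAGE_SIZE - 1),
				PAGE_SIZE);
}
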
Felix Kuehling Jan. 29, 2024, 4:24 p.m. UTC | #34
On 2024-01-29 10:33, Christian König wrote:
> Am 29.01.24 um 16:03 schrieb Felix Kuehling:
>> On 2024-01-25 13:32, Daniel Vetter wrote:
>>> On Wed, Jan 24, 2024 at 09:33:12AM +0100, Christian König wrote:
>>>> Am 23.01.24 um 20:37 schrieb Zeng, Oak:
>>>>> [SNIP]
>>>>> Yes most API are per device based.
>>>>>
>>>>> One exception I know is actually the kfd SVM API. If you look at 
>>>>> the svm_ioctl function, it is per-process based. Each kfd_process 
>>>>> represent a process across N gpu devices.
>>>> Yeah and that was a big mistake in my opinion. We should really not 
>>>> do that
>>>> ever again.
>>>>
>>>>> Need to say, kfd SVM represent a shared virtual address space 
>>>>> across CPU and all GPU devices on the system. This is by the 
>>>>> definition of SVM (shared virtual memory). This is very different 
>>>>> from our legacy gpu *device* driver which works for only one 
>>>>> device (i.e., if you want one device to access another device's 
>>>>> memory, you will have to use dma-buf export/import etc).
>>>> Exactly that thinking is what we have currently found as blocker for a
>>>> virtualization projects. Having SVM as device independent feature 
>>>> which
>>>> somehow ties to the process address space turned out to be an 
>>>> extremely bad
>>>> idea.
>>>>
>>>> The background is that this only works for some use cases but not 
>>>> all of
>>>> them.
>>>>
>>>> What's working much better is to just have a mirror functionality 
>>>> which says
>>>> that a range A..B of the process address space is mapped into a 
>>>> range C..D
>>>> of the GPU address space.
>>>>
>>>> Those ranges can then be used to implement the SVM feature required 
>>>> for
>>>> higher level APIs and not something you need at the UAPI or even 
>>>> inside the
>>>> low level kernel memory management.
>>>>
>>>> When you talk about migrating memory to a device you also do this 
>>>> on a per
>>>> device basis and *not* tied to the process address space. If you 
>>>> then get
>>>> crappy performance because userspace gave contradicting information 
>>>> where to
>>>> migrate memory then that's a bug in userspace and not something the 
>>>> kernel
>>>> should try to prevent somehow.
>>>>
>>>> [SNIP]
>>>>>> I think if you start using the same drm_gpuvm for multiple 
>>>>>> devices you
>>>>>> will sooner or later start to run into the same mess we have seen 
>>>>>> with
>>>>>> KFD, where we moved more and more functionality from the KFD to 
>>>>>> the DRM
>>>>>> render node because we found that a lot of the stuff simply 
>>>>>> doesn't work
>>>>>> correctly with a single object to maintain the state.
>>>>> As I understand it, KFD is designed to work across devices. A 
>>>>> single pseudo /dev/kfd device represent all hardware gpu devices. 
>>>>> That is why during kfd open, many pdd (process device data) is 
>>>>> created, each for one hardware device for this process.
>>>> Yes, I'm perfectly aware of that. And I can only repeat myself that 
>>>> I see
>>>> this design as a rather extreme failure. And I think it's one of 
>>>> the reasons
>>>> why NVidia is so dominant with Cuda.
>>>>
>>>> This whole approach KFD takes was designed with the idea of 
>>>> extending the
>>>> CPU process into the GPUs, but this idea only works for a few use 
>>>> cases and
>>>> is not something we should apply to drivers in general.
>>>>
>>>> A very good example are virtualization use cases where you end up 
>>>> with CPU
>>>> address != GPU address because the VAs are actually coming from the 
>>>> guest VM
>>>> and not the host process.
>>>>
>>>> SVM is a high level concept of OpenCL, Cuda, ROCm etc.. This should 
>>>> not have
>>>> any influence on the design of the kernel UAPI.
>>>>
>>>> If you want to do something similar as KFD for Xe I think you need 
>>>> to get
>>>> explicit permission to do this from Dave and Daniel and maybe even 
>>>> Linus.
>>> I think the one and only one exception where an SVM uapi like in kfd 
>>> makes
>>> sense, is if the _hardware_ itself, not the software stack defined
>>> semantics that you've happened to build on top of that hw, enforces 
>>> a 1:1
>>> mapping with the cpu process address space.
>>>
>>> Which means your hardware is using PASID, IOMMU based translation, 
>>> PCI-ATS
>>> (address translation services) or whatever your hw calls it and has 
>>> _no_
>>> device-side pagetables on top. Which from what I've seen all devices 
>>> with
>>> device-memory have, simply because they need some place to store 
>>> whether
>>> that memory is currently in device memory or should be translated using
>>> PASID. Currently there's no gpu that works with PASID only, but 
>>> there are
>>> some on-cpu-die accelerator things that do work like that.
>>>
>>> Maybe in the future there will be some accelerators that are fully cpu
>>> cache coherent (including atomics) with something like CXL, and the
>>> on-device memory is managed as normal system memory with struct page as
>>> ZONE_DEVICE and accelerator va -> physical address translation is only
>>> done with PASID ... but for now I haven't seen that, definitely not in
>>> upstream drivers.
>>>
>>> And the moment you have some per-device pagetables or per-device memory
>>> management of some sort (like using gpuva mgr) then I'm 100% 
>>> agreeing with
>>> Christian that the kfd SVM model is too strict and not a great idea.
>>
>> That basically means, without ATS/PRI+PASID you cannot implement a 
>> unified memory programming model, where GPUs or accelerators access 
>> virtual addresses without pre-registering them with an SVM API call.
>>
>> Unified memory is a feature implemented by the KFD SVM API and used 
>> by ROCm. This is used e.g. to implement OpenMP USM (unified shared 
>> memory). It's implemented with recoverable GPU page faults. If the 
>> page fault interrupt handler cannot assume a shared virtual address 
>> space, then implementing this feature isn't possible.
>
> Why not? As far as I can see the OpenMP USM is just another funky way 
> of userptr handling.
>
> The difference is that in an userptr we assume that we always need to 
> request the whole block A..B from a mapping while for page fault based 
> handling it can be just any page in between A and B which is requested 
> and made available to the GPU address space.
>
> As far as I can see there is absolutely no need for any special SVM 
> handling.

It does assume a shared virtual address space between CPU and GPUs. 
There are no API calls to tell the driver that address A on the CPU maps 
to address B on GPU1 and address C on GPU2. The KFD SVM API was 
designed to work with this programming model, by augmenting the shared 
virtual address mappings with virtual address range attributes that can 
modify the migration policy and indicate prefetching, prefaulting, etc. 
You could think of it as madvise on steroids.

Regards,
   Felix


>
> Regards,
> Christian.
>
>>
>> Regards,
>>   Felix
>>
>>
>>>
>>> Cheers, Sima
>
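
Felix's "madvise on steroids" comparison above can be made concrete with
the standard madvise() call below: an attribute is attached to a virtual
address range without changing what the range maps to. This is only an
analogy using ordinary Linux calls, not KFD code; the actual per-range SVM
attributes (migration policy, prefetch location, per-GPU access) live in
the kfd_ioctl.h uapi header.

#include <stdlib.h>
#include <sys/mman.h>

void range_attribute_analogy(void)
{
	size_t sz = 64UL << 20;
	void *buf = aligned_alloc(4096, sz);	/* page aligned range */

	if (!buf)
		return;

	/*
	 * Plain madvise: hint how a virtual address range will be used.
	 * The mapping itself is unchanged, only its attributes are.
	 */
	madvise(buf, sz, MADV_WILLNEED);	/* expect access soon */
	madvise(buf, sz, MADV_SEQUENTIAL);	/* expect sequential access */

	free(buf);
}
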
Christian König Jan. 29, 2024, 4:28 p.m. UTC | #35
Am 29.01.24 um 17:24 schrieb Felix Kuehling:
> On 2024-01-29 10:33, Christian König wrote:
>> Am 29.01.24 um 16:03 schrieb Felix Kuehling:
>>> On 2024-01-25 13:32, Daniel Vetter wrote:
>>>> On Wed, Jan 24, 2024 at 09:33:12AM +0100, Christian König wrote:
>>>>> Am 23.01.24 um 20:37 schrieb Zeng, Oak:
>>>>>> [SNIP]
>>>>>> Yes most API are per device based.
>>>>>>
>>>>>> One exception I know is actually the kfd SVM API. If you look at 
>>>>>> the svm_ioctl function, it is per-process based. Each kfd_process 
>>>>>> represent a process across N gpu devices.
>>>>> Yeah and that was a big mistake in my opinion. We should really 
>>>>> not do that
>>>>> ever again.
>>>>>
>>>>>> Need to say, kfd SVM represent a shared virtual address space 
>>>>>> across CPU and all GPU devices on the system. This is by the 
>>>>>> definition of SVM (shared virtual memory). This is very different 
>>>>>> from our legacy gpu *device* driver which works for only one 
>>>>>> device (i.e., if you want one device to access another device's 
>>>>>> memory, you will have to use dma-buf export/import etc).
>>>>> Exactly that thinking is what we have currently found as blocker 
>>>>> for a
>>>>> virtualization projects. Having SVM as device independent feature 
>>>>> which
>>>>> somehow ties to the process address space turned out to be an 
>>>>> extremely bad
>>>>> idea.
>>>>>
>>>>> The background is that this only works for some use cases but not 
>>>>> all of
>>>>> them.
>>>>>
>>>>> What's working much better is to just have a mirror functionality 
>>>>> which says
>>>>> that a range A..B of the process address space is mapped into a 
>>>>> range C..D
>>>>> of the GPU address space.
>>>>>
>>>>> Those ranges can then be used to implement the SVM feature 
>>>>> required for
>>>>> higher level APIs and not something you need at the UAPI or even 
>>>>> inside the
>>>>> low level kernel memory management.
>>>>>
>>>>> When you talk about migrating memory to a device you also do this 
>>>>> on a per
>>>>> device basis and *not* tied to the process address space. If you 
>>>>> then get
>>>>> crappy performance because userspace gave contradicting 
>>>>> information where to
>>>>> migrate memory then that's a bug in userspace and not something 
>>>>> the kernel
>>>>> should try to prevent somehow.
>>>>>
>>>>> [SNIP]
>>>>>>> I think if you start using the same drm_gpuvm for multiple 
>>>>>>> devices you
>>>>>>> will sooner or later start to run into the same mess we have 
>>>>>>> seen with
>>>>>>> KFD, where we moved more and more functionality from the KFD to 
>>>>>>> the DRM
>>>>>>> render node because we found that a lot of the stuff simply 
>>>>>>> doesn't work
>>>>>>> correctly with a single object to maintain the state.
>>>>>> As I understand it, KFD is designed to work across devices. A 
>>>>>> single pseudo /dev/kfd device represent all hardware gpu devices. 
>>>>>> That is why during kfd open, many pdd (process device data) is 
>>>>>> created, each for one hardware device for this process.
>>>>> Yes, I'm perfectly aware of that. And I can only repeat myself 
>>>>> that I see
>>>>> this design as a rather extreme failure. And I think it's one of 
>>>>> the reasons
>>>>> why NVidia is so dominant with Cuda.
>>>>>
>>>>> This whole approach KFD takes was designed with the idea of 
>>>>> extending the
>>>>> CPU process into the GPUs, but this idea only works for a few use 
>>>>> cases and
>>>>> is not something we should apply to drivers in general.
>>>>>
>>>>> A very good example are virtualization use cases where you end up 
>>>>> with CPU
>>>>> address != GPU address because the VAs are actually coming from 
>>>>> the guest VM
>>>>> and not the host process.
>>>>>
>>>>> SVM is a high level concept of OpenCL, Cuda, ROCm etc.. This 
>>>>> should not have
>>>>> any influence on the design of the kernel UAPI.
>>>>>
>>>>> If you want to do something similar as KFD for Xe I think you need 
>>>>> to get
>>>>> explicit permission to do this from Dave and Daniel and maybe even 
>>>>> Linus.
>>>> I think the one and only one exception where an SVM uapi like in 
>>>> kfd makes
>>>> sense, is if the _hardware_ itself, not the software stack defined
>>>> semantics that you've happened to build on top of that hw, enforces 
>>>> a 1:1
>>>> mapping with the cpu process address space.
>>>>
>>>> Which means your hardware is using PASID, IOMMU based translation, 
>>>> PCI-ATS
>>>> (address translation services) or whatever your hw calls it and has 
>>>> _no_
>>>> device-side pagetables on top. Which from what I've seen all 
>>>> devices with
>>>> device-memory have, simply because they need some place to store 
>>>> whether
>>>> that memory is currently in device memory or should be translated 
>>>> using
>>>> PASID. Currently there's no gpu that works with PASID only, but 
>>>> there are
>>>> some on-cpu-die accelerator things that do work like that.
>>>>
>>>> Maybe in the future there will be some accelerators that are fully cpu
>>>> cache coherent (including atomics) with something like CXL, and the
>>>> on-device memory is managed as normal system memory with struct 
>>>> page as
>>>> ZONE_DEVICE and accelerator va -> physical address translation is only
>>>> done with PASID ... but for now I haven't seen that, definitely not in
>>>> upstream drivers.
>>>>
>>>> And the moment you have some per-device pagetables or per-device 
>>>> memory
>>>> management of some sort (like using gpuva mgr) then I'm 100% 
>>>> agreeing with
>>>> Christian that the kfd SVM model is too strict and not a great idea.
>>>
>>> That basically means, without ATS/PRI+PASID you cannot implement a 
>>> unified memory programming model, where GPUs or accelerators access 
>>> virtual addresses without pre-registering them with an SVM API call.
>>>
>>> Unified memory is a feature implemented by the KFD SVM API and used 
>>> by ROCm. This is used e.g. to implement OpenMP USM (unified shared 
>>> memory). It's implemented with recoverable GPU page faults. If the 
>>> page fault interrupt handler cannot assume a shared virtual address 
>>> space, then implementing this feature isn't possible.
>>
>> Why not? As far as I can see the OpenMP USM is just another funky way 
>> of userptr handling.
>>
>> The difference is that in an userptr we assume that we always need to 
>> request the whole block A..B from a mapping while for page fault 
>> based handling it can be just any page in between A and B which is 
>> requested and made available to the GPU address space.
>>
>> As far as I can see there is absolutely no need for any special SVM 
>> handling.
>
> It does assume a shared virtual address space between CPU and GPUs. 
> There are no API calls to tell the driver that address A on the CPU 
> maps to address B on the GPU1 and address C on GPU2. The KFD SVM API 
> was designed to work with this programming model, by augmenting the 
> shared virtual address mappings with virtual address range attributes 
> that can modify the migration policy and indicate prefetching, 
> prefaulting, etc. You could think of it as madvise on steroids.

Yeah, so what? In this case you just say through an IOCTL that CPU range 
A..B should map to GPU range C..D and for A/B and C/D you use the 
maximum of the address space.

There is no restriction that this needs to be accurate in any way. It's just 
that it can be accurate to be more efficient and eventually use only a 
fraction of the address space instead of all of it for some use cases.

So this isn't a blocker, it's just one special use case.

Regards,
Christian.

>
> Regards,
>   Felix
>
>
>>
>> Regards,
>> Christian.
>>
>>>
>>> Regards,
>>>   Felix
>>>
>>>
>>>>
>>>> Cheers, Sima
>>
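
A small sketch of the point Christian makes above: with a range-based
interface, SVM-like behaviour is just the degenerate case of a single
binding that spans the whole address space. struct mirror_bind and the
values below are illustrative only, not an existing ioctl or uapi.

#include <stdint.h>

struct mirror_bind {
	uint64_t cpu_start, cpu_end;	/* process VA range A..B */
	uint64_t gpu_start, gpu_end;	/* GPU VA range C..D */
};

/* SVM-style: mirror everything, so CPU VA == GPU VA by construction. */
const struct mirror_bind bind_everything = {
	.cpu_start = 0, .cpu_end = UINT64_MAX,
	.gpu_start = 0, .gpu_end = UINT64_MAX,
};

/*
 * Range-based: only the region userspace asked for, possibly at a
 * different GPU VA (e.g. addresses coming from a guest VM).
 */
const struct mirror_bind bind_one_range = {
	.cpu_start = 0x7f0000000000ull, .cpu_end = 0x7f0000100000ull,
	.gpu_start = 0x000100000000ull, .gpu_end = 0x000100100000ull,
};
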
Felix Kuehling Jan. 29, 2024, 5:52 p.m. UTC | #36
On 2024-01-29 11:28, Christian König wrote:
> Am 29.01.24 um 17:24 schrieb Felix Kuehling:
>> On 2024-01-29 10:33, Christian König wrote:
>>> Am 29.01.24 um 16:03 schrieb Felix Kuehling:
>>>> On 2024-01-25 13:32, Daniel Vetter wrote:
>>>>> On Wed, Jan 24, 2024 at 09:33:12AM +0100, Christian König wrote:
>>>>>> Am 23.01.24 um 20:37 schrieb Zeng, Oak:
>>>>>>> [SNIP]
>>>>>>> Yes most API are per device based.
>>>>>>>
>>>>>>> One exception I know is actually the kfd SVM API. If you look at 
>>>>>>> the svm_ioctl function, it is per-process based. Each 
>>>>>>> kfd_process represent a process across N gpu devices.
>>>>>> Yeah and that was a big mistake in my opinion. We should really 
>>>>>> not do that
>>>>>> ever again.
>>>>>>
>>>>>>> Need to say, kfd SVM represent a shared virtual address space 
>>>>>>> across CPU and all GPU devices on the system. This is by the 
>>>>>>> definition of SVM (shared virtual memory). This is very 
>>>>>>> different from our legacy gpu *device* driver which works for 
>>>>>>> only one device (i.e., if you want one device to access another 
>>>>>>> device's memory, you will have to use dma-buf export/import etc).
>>>>>> Exactly that thinking is what we have currently found as blocker 
>>>>>> for a
>>>>>> virtualization projects. Having SVM as device independent feature 
>>>>>> which
>>>>>> somehow ties to the process address space turned out to be an 
>>>>>> extremely bad
>>>>>> idea.
>>>>>>
>>>>>> The background is that this only works for some use cases but not 
>>>>>> all of
>>>>>> them.
>>>>>>
>>>>>> What's working much better is to just have a mirror functionality 
>>>>>> which says
>>>>>> that a range A..B of the process address space is mapped into a 
>>>>>> range C..D
>>>>>> of the GPU address space.
>>>>>>
>>>>>> Those ranges can then be used to implement the SVM feature 
>>>>>> required for
>>>>>> higher level APIs and not something you need at the UAPI or even 
>>>>>> inside the
>>>>>> low level kernel memory management.
>>>>>>
>>>>>> When you talk about migrating memory to a device you also do this 
>>>>>> on a per
>>>>>> device basis and *not* tied to the process address space. If you 
>>>>>> then get
>>>>>> crappy performance because userspace gave contradicting 
>>>>>> information where to
>>>>>> migrate memory then that's a bug in userspace and not something 
>>>>>> the kernel
>>>>>> should try to prevent somehow.
>>>>>>
>>>>>> [SNIP]
>>>>>>>> I think if you start using the same drm_gpuvm for multiple 
>>>>>>>> devices you
>>>>>>>> will sooner or later start to run into the same mess we have 
>>>>>>>> seen with
>>>>>>>> KFD, where we moved more and more functionality from the KFD to 
>>>>>>>> the DRM
>>>>>>>> render node because we found that a lot of the stuff simply 
>>>>>>>> doesn't work
>>>>>>>> correctly with a single object to maintain the state.
>>>>>>> As I understand it, KFD is designed to work across devices. A 
>>>>>>> single pseudo /dev/kfd device represent all hardware gpu 
>>>>>>> devices. That is why during kfd open, many pdd (process device 
>>>>>>> data) is created, each for one hardware device for this process.
>>>>>> Yes, I'm perfectly aware of that. And I can only repeat myself 
>>>>>> that I see
>>>>>> this design as a rather extreme failure. And I think it's one of 
>>>>>> the reasons
>>>>>> why NVidia is so dominant with Cuda.
>>>>>>
>>>>>> This whole approach KFD takes was designed with the idea of 
>>>>>> extending the
>>>>>> CPU process into the GPUs, but this idea only works for a few use 
>>>>>> cases and
>>>>>> is not something we should apply to drivers in general.
>>>>>>
>>>>>> A very good example are virtualization use cases where you end up 
>>>>>> with CPU
>>>>>> address != GPU address because the VAs are actually coming from 
>>>>>> the guest VM
>>>>>> and not the host process.
>>>>>>
>>>>>> SVM is a high level concept of OpenCL, Cuda, ROCm etc.. This 
>>>>>> should not have
>>>>>> any influence on the design of the kernel UAPI.
>>>>>>
>>>>>> If you want to do something similar as KFD for Xe I think you 
>>>>>> need to get
>>>>>> explicit permission to do this from Dave and Daniel and maybe 
>>>>>> even Linus.
>>>>> I think the one and only one exception where an SVM uapi like in 
>>>>> kfd makes
>>>>> sense, is if the _hardware_ itself, not the software stack defined
>>>>> semantics that you've happened to build on top of that hw, 
>>>>> enforces a 1:1
>>>>> mapping with the cpu process address space.
>>>>>
>>>>> Which means your hardware is using PASID, IOMMU based translation, 
>>>>> PCI-ATS
>>>>> (address translation services) or whatever your hw calls it and 
>>>>> has _no_
>>>>> device-side pagetables on top. Which from what I've seen all 
>>>>> devices with
>>>>> device-memory have, simply because they need some place to store 
>>>>> whether
>>>>> that memory is currently in device memory or should be translated 
>>>>> using
>>>>> PASID. Currently there's no gpu that works with PASID only, but 
>>>>> there are
>>>>> some on-cpu-die accelerator things that do work like that.
>>>>>
>>>>> Maybe in the future there will be some accelerators that are fully 
>>>>> cpu
>>>>> cache coherent (including atomics) with something like CXL, and the
>>>>> on-device memory is managed as normal system memory with struct 
>>>>> page as
>>>>> ZONE_DEVICE and accelerator va -> physical address translation is 
>>>>> only
>>>>> done with PASID ... but for now I haven't seen that, definitely 
>>>>> not in
>>>>> upstream drivers.
>>>>>
>>>>> And the moment you have some per-device pagetables or per-device 
>>>>> memory
>>>>> management of some sort (like using gpuva mgr) then I'm 100% 
>>>>> agreeing with
>>>>> Christian that the kfd SVM model is too strict and not a great idea.
>>>>
>>>> That basically means, without ATS/PRI+PASID you cannot implement a 
>>>> unified memory programming model, where GPUs or accelerators access 
>>>> virtual addresses without pre-registering them with an SVM API call.
>>>>
>>>> Unified memory is a feature implemented by the KFD SVM API and used 
>>>> by ROCm. This is used e.g. to implement OpenMP USM (unified shared 
>>>> memory). It's implemented with recoverable GPU page faults. If the 
>>>> page fault interrupt handler cannot assume a shared virtual address 
>>>> space, then implementing this feature isn't possible.
>>>
>>> Why not? As far as I can see the OpenMP USM is just another funky 
>>> way of userptr handling.
>>>
>>> The difference is that in an userptr we assume that we always need 
>>> to request the whole block A..B from a mapping while for page fault 
>>> based handling it can be just any page in between A and B which is 
>>> requested and made available to the GPU address space.
>>>
>>> As far as I can see there is absolutely no need for any special SVM 
>>> handling.
>>
>> It does assume a shared virtual address space between CPU and GPUs. 
>> There are no API calls to tell the driver that address A on the CPU 
>> maps to address B on the GPU1 and address C on GPU2. The KFD SVM API 
>> was designed to work with this programming model, by augmenting the 
>> shared virtual address mappings with virtual address range attributes 
>> that can modify the migration policy and indicate prefetching, 
>> prefaulting, etc. You could think of it as madvise on steroids.
>
> Yeah, so what? In this case you just say through an IOCTL that CPU 
> range A..B should map to GPU range C..D and for A/B and C/D you use 
> the maximum of the address space.

What I want is that address range A..B on the CPU matches A..B on the 
GPU, because I'm sharing pointers between CPU and GPU. I can't think of 
any sane user mode using a unified memory programming model, that would 
ever ask KFD to map unified memory mappings to a different address range 
on the GPU. Adding such an ioctl is a complete waste of time, and can 
only serve to add unnecessary complexity.

Regards,
   Felix


>
> There is no restriction that this needs to be accurate in any way. It's 
> just that it can be accurate to be more efficient and eventually use 
> only a fraction of the address space instead of all of it for some use 
> cases.
>
> So this isn't a blocker, it's just one special use case.
>
> Regards,
> Christian.
>
>>
>> Regards,
>>   Felix
>>
>>
>>>
>>> Regards,
>>> Christian.
>>>
>>>>
>>>> Regards,
>>>>   Felix
>>>>
>>>>
>>>>>
>>>>> Cheers, Sima
>>>
>
Christian König Jan. 29, 2024, 7:03 p.m. UTC | #37
Am 29.01.24 um 18:52 schrieb Felix Kuehling:
> On 2024-01-29 11:28, Christian König wrote:
>> Am 29.01.24 um 17:24 schrieb Felix Kuehling:
>>> On 2024-01-29 10:33, Christian König wrote:
>>>> Am 29.01.24 um 16:03 schrieb Felix Kuehling:
>>>>> On 2024-01-25 13:32, Daniel Vetter wrote:
>>>>>> On Wed, Jan 24, 2024 at 09:33:12AM +0100, Christian König wrote:
>>>>>>> Am 23.01.24 um 20:37 schrieb Zeng, Oak:
>>>>>>>> [SNIP]
>>>>>>>> Yes most API are per device based.
>>>>>>>>
>>>>>>>> One exception I know is actually the kfd SVM API. If you look 
>>>>>>>> at the svm_ioctl function, it is per-process based. Each 
>>>>>>>> kfd_process represent a process across N gpu devices.
>>>>>>> Yeah and that was a big mistake in my opinion. We should really 
>>>>>>> not do that
>>>>>>> ever again.
>>>>>>>
>>>>>>>> Need to say, kfd SVM represent a shared virtual address space 
>>>>>>>> across CPU and all GPU devices on the system. This is by the 
>>>>>>>> definition of SVM (shared virtual memory). This is very 
>>>>>>>> different from our legacy gpu *device* driver which works for 
>>>>>>>> only one device (i.e., if you want one device to access another 
>>>>>>>> device's memory, you will have to use dma-buf export/import etc).
>>>>>>> Exactly that thinking is what we have currently found as blocker 
>>>>>>> for a
>>>>>>> virtualization projects. Having SVM as device independent 
>>>>>>> feature which
>>>>>>> somehow ties to the process address space turned out to be an 
>>>>>>> extremely bad
>>>>>>> idea.
>>>>>>>
>>>>>>> The background is that this only works for some use cases but 
>>>>>>> not all of
>>>>>>> them.
>>>>>>>
>>>>>>> What's working much better is to just have a mirror 
>>>>>>> functionality which says
>>>>>>> that a range A..B of the process address space is mapped into a 
>>>>>>> range C..D
>>>>>>> of the GPU address space.
>>>>>>>
>>>>>>> Those ranges can then be used to implement the SVM feature 
>>>>>>> required for
>>>>>>> higher level APIs and not something you need at the UAPI or even 
>>>>>>> inside the
>>>>>>> low level kernel memory management.
>>>>>>>
>>>>>>> When you talk about migrating memory to a device you also do 
>>>>>>> this on a per
>>>>>>> device basis and *not* tied to the process address space. If you 
>>>>>>> then get
>>>>>>> crappy performance because userspace gave contradicting 
>>>>>>> information where to
>>>>>>> migrate memory then that's a bug in userspace and not something 
>>>>>>> the kernel
>>>>>>> should try to prevent somehow.
>>>>>>>
>>>>>>> [SNIP]
>>>>>>>>> I think if you start using the same drm_gpuvm for multiple 
>>>>>>>>> devices you
>>>>>>>>> will sooner or later start to run into the same mess we have 
>>>>>>>>> seen with
>>>>>>>>> KFD, where we moved more and more functionality from the KFD 
>>>>>>>>> to the DRM
>>>>>>>>> render node because we found that a lot of the stuff simply 
>>>>>>>>> doesn't work
>>>>>>>>> correctly with a single object to maintain the state.
>>>>>>>> As I understand it, KFD is designed to work across devices. A 
>>>>>>>> single pseudo /dev/kfd device represent all hardware gpu 
>>>>>>>> devices. That is why during kfd open, many pdd (process device 
>>>>>>>> data) is created, each for one hardware device for this process.
>>>>>>> Yes, I'm perfectly aware of that. And I can only repeat myself 
>>>>>>> that I see
>>>>>>> this design as a rather extreme failure. And I think it's one of 
>>>>>>> the reasons
>>>>>>> why NVidia is so dominant with Cuda.
>>>>>>>
>>>>>>> This whole approach KFD takes was designed with the idea of 
>>>>>>> extending the
>>>>>>> CPU process into the GPUs, but this idea only works for a few 
>>>>>>> use cases and
>>>>>>> is not something we should apply to drivers in general.
>>>>>>>
>>>>>>> A very good example are virtualization use cases where you end 
>>>>>>> up with CPU
>>>>>>> address != GPU address because the VAs are actually coming from 
>>>>>>> the guest VM
>>>>>>> and not the host process.
>>>>>>>
>>>>>>> SVM is a high level concept of OpenCL, Cuda, ROCm etc.. This 
>>>>>>> should not have
>>>>>>> any influence on the design of the kernel UAPI.
>>>>>>>
>>>>>>> If you want to do something similar as KFD for Xe I think you 
>>>>>>> need to get
>>>>>>> explicit permission to do this from Dave and Daniel and maybe 
>>>>>>> even Linus.
>>>>>> I think the one and only one exception where an SVM uapi like in 
>>>>>> kfd makes
>>>>>> sense, is if the _hardware_ itself, not the software stack defined
>>>>>> semantics that you've happened to build on top of that hw, 
>>>>>> enforces a 1:1
>>>>>> mapping with the cpu process address space.
>>>>>>
>>>>>> Which means your hardware is using PASID, IOMMU based 
>>>>>> translation, PCI-ATS
>>>>>> (address translation services) or whatever your hw calls it and 
>>>>>> has _no_
>>>>>> device-side pagetables on top. Which from what I've seen all 
>>>>>> devices with
>>>>>> device-memory have, simply because they need some place to store 
>>>>>> whether
>>>>>> that memory is currently in device memory or should be translated 
>>>>>> using
>>>>>> PASID. Currently there's no gpu that works with PASID only, but 
>>>>>> there are
>>>>>> some on-cpu-die accelerator things that do work like that.
>>>>>>
>>>>>> Maybe in the future there will be some accelerators that are 
>>>>>> fully cpu
>>>>>> cache coherent (including atomics) with something like CXL, and the
>>>>>> on-device memory is managed as normal system memory with struct 
>>>>>> page as
>>>>>> ZONE_DEVICE and accelerator va -> physical address translation is 
>>>>>> only
>>>>>> done with PASID ... but for now I haven't seen that, definitely 
>>>>>> not in
>>>>>> upstream drivers.
>>>>>>
>>>>>> And the moment you have some per-device pagetables or per-device 
>>>>>> memory
>>>>>> management of some sort (like using gpuva mgr) then I'm 100% 
>>>>>> agreeing with
>>>>>> Christian that the kfd SVM model is too strict and not a great idea.
>>>>>
>>>>> That basically means, without ATS/PRI+PASID you cannot implement a 
>>>>> unified memory programming model, where GPUs or accelerators 
>>>>> access virtual addresses without pre-registering them with an SVM 
>>>>> API call.
>>>>>
>>>>> Unified memory is a feature implemented by the KFD SVM API and 
>>>>> used by ROCm. This is used e.g. to implement OpenMP USM (unified 
>>>>> shared memory). It's implemented with recoverable GPU page faults. 
>>>>> If the page fault interrupt handler cannot assume a shared virtual 
>>>>> address space, then implementing this feature isn't possible.
>>>>
>>>> Why not? As far as I can see the OpenMP USM is just another funky 
>>>> way of userptr handling.
>>>>
>>>> The difference is that in an userptr we assume that we always need 
>>>> to request the whole block A..B from a mapping while for page fault 
>>>> based handling it can be just any page in between A and B which is 
>>>> requested and made available to the GPU address space.
>>>>
>>>> As far as I can see there is absolutely no need for any special SVM 
>>>> handling.
>>>
>>> It does assume a shared virtual address space between CPU and GPUs. 
>>> There are no API calls to tell the driver that address A on the CPU 
>>> maps to address B on the GPU1 and address C on GPU2. The KFD SVM API 
>>> was designed to work with this programming model, by augmenting the 
>>> shared virtual address mappings with virtual address range 
>>> attributes that can modify the migration policy and indicate 
>>> prefetching, prefaulting, etc. You could think of it as madvise on 
>>> steroids.
>>
>> Yeah, so what? In this case you just say through an IOCTL that CPU 
>> range A..B should map to GPU range C..D and for A/B and C/D you use 
>> the maximum of the address space.
>
> What I want is that address range A..B on the CPU matches A..B on the 
> GPU, because I'm sharing pointers between CPU and GPU. I can't think 
> of any sane user mode using a unified memory programming model, that 
> would ever ask KFD to map unified memory mappings to a different 
> address range on the GPU. Adding such an ioctl is a complete waste of 
> time, and can only serve to add unnecessary complexity.

This is exactly the use case which happens when the submitting process 
is not the one originally stitching together the command stream.

Basically all native context, virtualization and other proxy use cases 
work like this.

So the CPU address not matching the GPU address is an absolutely 
real use case and should be handled by the GPU VA interface.

Regards,
Christian.



>
> Regards,
>   Felix
>
>
>>
>> There is no restriction that this needs to be accurate in any way. It's 
>> just that it can be accurate to be more efficient and eventually use 
>> only a fraction of the address space instead of all of it for some 
>> use cases.
>>
>> So this isn't a blocker, it's just one special use case.
>>
>> Regards,
>> Christian.
>>
>>>
>>> Regards,
>>>   Felix
>>>
>>>
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>>
>>>>> Regards,
>>>>>   Felix
>>>>>
>>>>>
>>>>>>
>>>>>> Cheers, Sima
>>>>
>>
Zeng, Oak Jan. 29, 2024, 8:09 p.m. UTC | #38
Hi Christian,

Even though this email thread was started to discuss a shared virtual address space b/t multiple GPU devices, I eventually found that you don't even agree with a shared virtual address space b/t CPU and GPU program. So let's forget about multiple GPU devices for now. I will try to explain the shared address space b/t cpu and one gpu.

HMM was designed to solve the GPU programmability problem with a very fundamental assumption, which is that the GPU program shares the same virtual address space with the CPU program; for example, with HMM any CPU pointers (such as malloc'ed memory, stack variables and globals) can be used directly in your GPU shader program. Are you against this design goal? HMM is already part of linux core MM and Linus approved this design. CC'ed Jérôme.

Here is an example of how an application can use the system allocator (hmm), copied from https://developer.nvidia.com/blog/simplifying-gpu-application-development-with-heterogeneous-memory-management/. CC'ed a few Nvidia folks.

void sortfile(FILE* fp, int N) {
  char* data;
  data = (char*)malloc(N);

  fread(data, 1, N, fp);
  qsort<<<...>>>(data, N, 1, cmp);
  cudaDeviceSynchronize();

  use_data(data);
  free(data);
}

As you can see, the malloc'ed ptr is used directly in the GPU program, with no userptr ioctl and no vm_bind. This is the model Intel also wants to support, besides AMD and Nvidia.

Lastly, nouveau in the kernel already supports hmm and the system allocator. It also supports a shared virtual address space b/t CPU and GPU program. All the code is already merged upstream.


See also comments inline to your questions.

I will address your other email separately.

Regards,
Oak

From: Christian König <christian.koenig@amd.com>
Sent: Monday, January 29, 2024 5:11 AM
To: Zeng, Oak <oak.zeng@intel.com>; David Airlie <airlied@redhat.com>
Cc: Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>; Thomas.Hellstrom@linux.intel.com; Winiarski, Michal <michal.winiarski@intel.com>; Felix Kuehling <felix.kuehling@amd.com>; Welty, Brian <brian.welty@intel.com>; Shah, Ankur N <ankur.n.shah@intel.com>; dri-devel@lists.freedesktop.org; intel-xe@lists.freedesktop.org; Gupta, saurabhg <saurabhg.gupta@intel.com>; Danilo Krummrich <dakr@redhat.com>; Daniel Vetter <daniel@ffwll.ch>; Brost, Matthew <matthew.brost@intel.com>; Bommu, Krishnaiah <krishnaiah.bommu@intel.com>; Vishwanathapura, Niranjana <niranjana.vishwanathapura@intel.com>
Subject: Re: Making drm_gpuvm work across gpu devices

Am 26.01.24 um 21:13 schrieb Zeng, Oak:

-----Original Message-----
From: Christian König <christian.koenig@amd.com>
Sent: Friday, January 26, 2024 5:10 AM
To: Zeng, Oak <oak.zeng@intel.com>; David Airlie <airlied@redhat.com>
Cc: Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>; Thomas.Hellstrom@linux.intel.com; Winiarski, Michal <michal.winiarski@intel.com>; Felix Kuehling <felix.kuehling@amd.com>; Welty, Brian <brian.welty@intel.com>; Shah, Ankur N <ankur.n.shah@intel.com>; dri-devel@lists.freedesktop.org; intel-xe@lists.freedesktop.org; Gupta, saurabhg <saurabhg.gupta@intel.com>; Danilo Krummrich <dakr@redhat.com>; Daniel Vetter <daniel@ffwll.ch>; Brost, Matthew <matthew.brost@intel.com>; Bommu, Krishnaiah <krishnaiah.bommu@intel.com>; Vishwanathapura, Niranjana <niranjana.vishwanathapura@intel.com>
Subject: Re: Making drm_gpuvm work across gpu devices



Hi Oak,

you can still use SVM, but it should not be a design criteria for the kernel UAPI. In other words the UAPI should be designed in such a way that the GPU virtual address can be equal to the CPU virtual address of a buffer, but can also be different to support use cases where this isn't the case.



Terminology:

SVM: any technology which can achieve a shared virtual address space b/t cpu and devices. The virtual address space can be managed by user space or kernel space. Intel implemented an SVM, based on the BO-centric gpu driver (gem-create, vm-bind), where the virtual address space is managed by UMD.

System allocator: another way to implement SVM. The user just uses malloc'ed memory for gpu submission. The virtual address space is managed by Linux core mm. In practice, we leverage HMM to implement the system allocator.

This article describes the details of all those different models: https://developer.nvidia.com/blog/simplifying-gpu-application-development-with-heterogeneous-memory-management/

Our programming model allows a mixed use of the system allocator (even though system allocator is ) and traditional vm_bind (where the cpu address can != the gpu address). Let me re-post the pseudo code:



 1. Fd0 = open("/dev/dri/render0")
 2. Fd1 = open("/dev/dri/render1")
 3. Fd3 = open("/dev/dri/xe-svm")
 4. Gpu_Vm0 = xe_vm_create(fd0)
 5. Gpu_Vm1 = xe_vm_create(fd1)
 6. Queue0 = xe_exec_queue_create(fd0, gpu_vm0)
 7. Queue1 = xe_exec_queue_create(fd1, gpu_vm1)
 8. ptr = malloc()
 9. bo = xe_bo_create(fd0)
 10. Vm_bind(bo, gpu_vm0, va) // va is from UMD, cpu can access bo with the same or a different va. It is UMD's responsibility that va doesn't conflict with malloc'ed PTRs.
 11. Xe_exec(queue0, ptr) // submit gpu job which uses ptr, on card0
 12. Xe_exec(queue1, ptr) // submit gpu job which uses ptr, on card1
 13. Xe_exec(queue0, va) // submit gpu job which uses va, on card0


In the above code, the va used in vm_bind (line 10, Intel's API to bind an object to a va for GPU access) can be different from the CPU address when the cpu accesses the same object. But whenever the user uses a malloc'ed ptr for GPU submission (lines 11 and 12, the so-called system allocator), it implies CPU and GPUs use the same ptr for access.

In the above vm_bind, it is the user/UMD's responsibility to guarantee that the vm_bind va doesn't conflict with any malloc'ed ptr. Otherwise it is treated as a programming error.

I think this design still meets your design restrictions.

Well why do you need this "Fd3 = open("/dev/dri/xe-svm")" ?

As far as I see fd3 isn't used anywhere.

We use fd3 for the memory hints ioctls (not shown in the program above). Under the picture of the system allocator, a memory hint is applied to a virtual address range in a process, not to one specific GPU device. So we can't use the per-device fd0 and fd1 for this purpose. For example, you can set the preferred memory location of an address range to be on gpu device1's memory.
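
(Purely to illustrate the shape of such a per-process interface; the struct, ioctl number and helper below are hypothetical sketches, not the actual Xe uapi:)

/* Hypothetical sketch: a memory hint is attached to a CPU virtual address
 * range of the whole process, so it is issued on the process-wide fd3
 * rather than on the per-device fd0/fd1. */
#include <stddef.h>
#include <stdint.h>
#include <sys/ioctl.h>

struct xe_svm_set_mem_hint {            /* hypothetical uapi struct */
        uint64_t start;                 /* CPU VA range start */
        uint64_t size;                  /* range size in bytes */
        uint32_t preferred_device;      /* device whose local memory is preferred */
        uint32_t flags;
};
#define XE_SVM_IOCTL_SET_MEM_HINT _IOW('x', 0x40, struct xe_svm_set_mem_hint) /* hypothetical */

static int set_preferred_location(int fd3, void *ptr, size_t size, uint32_t device)
{
        struct xe_svm_set_mem_hint hint = {
                .start = (uint64_t)(uintptr_t)ptr,
                .size = size,
                .preferred_device = device,
        };
        return ioctl(fd3, XE_SVM_IOCTL_SET_MEM_HINT, &hint);
}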


What you can do is to bind parts of your process address space to your driver connections (fd1, fd2 etc..) with a vm_bind(), but this should *not* come from implicitly using some other file descriptor in the process.


We already have a vm_bind api which is used for a split CPU and GPU virtual address space (meaning the GPU virtual address space can != the CPU virtual address space) for KMD. In this case, it is UMD's responsibility to manage the whole virtual address space. UMD can make CPU VA == GPU VA or CPU VA != GPU VA. It doesn't matter for KMD. We already have this thing working. We also used this approach to achieve a shared virtual address space b/t CPU and GPU, where UMD managed to make CPU VA == GPU VA.

All the discussion in this email thread was triggered by our effort to support the system allocator, which means an application can use CPU pointers directly in a GPU shader program *without* an extra driver IOCTL call. The purpose of this programming model is to further simplify GPU programming across all programming languages. By the definition of the system allocator, the GPU VA is always == the CPU VA.

Our API/xeKmd is designed to work for both of the above two programming models.


As far as I can see this design is exactly what failed so badly with KFD.

Regards,
Christian.











Additionally to what Dave wrote I can summarize a few things I have learned while working on the AMD GPU drivers in the last decade or so:

1. Userspace requirements are *not* relevant for UAPI or even more general kernel driver design.

2. What should be done is to look at the hardware capabilities and try to expose those in a safe manner to userspace.

3. The userspace requirements are then used to validate the kernel driver and especially the UAPI design to ensure that nothing was missed.

The consequence of this is that nobody should ever use things like Cuda, Vulkan, OpenCL, OpenGL etc.. as an argument to propose a certain UAPI design.

What should be done instead is to say: My hardware works in this and that way -> we want to expose it like this -> because that enables us to implement the high level API in this and that way.

Only this gives then a complete picture of how things interact together and allows the kernel community to influence and validate the design.

What you described above is mainly bottom up. I know other people do top down, or whole-system vertical HW-SW co-design. I don't have a strong opinion here.



Regards,

Oak





This doesn't mean that you need to throw away everything, but it gives a clear restriction that designs are not nailed in stone and for example you can't use something like a waterfall model.

Going to answer on your other questions separately.

Regards,
Christian.



Am 25.01.24 um 06:25 schrieb Zeng, Oak:

Hi Dave,

Let me step back. When I wrote "shared virtual address space b/t cpu and all gpu devices is a hard requirement for our system allocator design", I meant this is not only Intel's design requirement. Rather this is a common requirement for both Intel, AMD and Nvidia. Take a look at the cuda driver API definition of cuMemAllocManaged (search this API on https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM), it said:

"The pointer is valid on the CPU and on all GPUs in the system that support managed memory."

This means the program virtual address space is shared b/t CPU and all GPU devices on the system. The system allocator we are discussing is just one step more advanced than cuMemAllocManaged: it allows malloc'ed memory to be shared b/t CPU and all GPU devices.

I hope we all agree with this point.

With that, I agree with Christian that in kmd we should make driver code per-device based instead of managing all devices in one driver instance. Our system allocator (and generally xekmd) design follows this rule: we make xe_vm per device based - one device is *not* aware of another device's address space, as I explained in a previous email. I started this email seeking one drm_gpuvm instance to cover all GPU devices. I gave up this approach (at least for now) per Danilo and Christian's feedback: we will continue to have per device based drm_gpuvm. I hope this is aligned with Christian but I will have to wait for Christian's reply to my previous email.

I hope this clarifies things a little.

Regards,
Oak



-----Original Message-----
From: dri-devel <dri-devel-bounces@lists.freedesktop.org> On Behalf Of David Airlie
Sent: Wednesday, January 24, 2024 8:25 PM
To: Zeng, Oak <oak.zeng@intel.com>
Cc: Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>; Thomas.Hellstrom@linux.intel.com; Winiarski, Michal <michal.winiarski@intel.com>; Felix Kuehling <felix.kuehling@amd.com>; Welty, Brian <brian.welty@intel.com>; Shah, Ankur N <ankur.n.shah@intel.com>; dri-devel@lists.freedesktop.org; intel-xe@lists.freedesktop.org; Gupta, saurabhg <saurabhg.gupta@intel.com>; Danilo Krummrich <dakr@redhat.com>; Daniel Vetter <daniel@ffwll.ch>; Brost, Matthew <matthew.brost@intel.com>; Bommu, Krishnaiah <krishnaiah.bommu@intel.com>; Vishwanathapura, Niranjana <niranjana.vishwanathapura@intel.com>; Christian König <christian.koenig@amd.com>
Subject: Re: Making drm_gpuvm work across gpu devices





For us, Xekmd doesn't need to know it is running under bare metal or virtualized environment. Xekmd is always a guest driver. All the virtual address used in xekmd is guest virtual address. For SVM, we require all the VF devices share one single shared address space with guest CPU program. So all the design works in bare metal environment can automatically work under virtualized environment. +@Shah, Ankur N +@Winiarski, Michal to backup me if I am wrong.

Again, shared virtual address space b/t cpu and all gpu devices is a hard requirement for our system allocator design (which means malloc'ed memory, cpu stack variables, globals can be directly used in gpu program. Same requirement as kfd SVM design). This was aligned with our user space software stack.



Just to make a very general point here (I'm hoping you listen to Christian a bit more and hoping he replies in more detail), but just because you have a system allocator design done, it doesn't in any way enforce the requirements on the kernel driver to accept that design. Bad system design should be pushed back on, not enforced in implementation stages. It's a trap Intel falls into regularly since they say well we already agreed this design with the userspace team and we can't change it now. This isn't acceptable. Design includes upstream discussion and feedback, if you say misdesigned the system allocator (and I'm not saying you definitely have), and this is pushing back on that, then you have to go fix your system architecture.

KFD was an experiment like this, I pushed back on AMD at the start saying it was likely a bad plan, we let it go and got a lot of experience in why it was a bad design.

Dave.
Felix Kuehling Jan. 29, 2024, 8:24 p.m. UTC | #39
On 2024-01-29 14:03, Christian König wrote:
> Am 29.01.24 um 18:52 schrieb Felix Kuehling:
>> On 2024-01-29 11:28, Christian König wrote:
>>> Am 29.01.24 um 17:24 schrieb Felix Kuehling:
>>>> On 2024-01-29 10:33, Christian König wrote:
>>>>> Am 29.01.24 um 16:03 schrieb Felix Kuehling:
>>>>>> On 2024-01-25 13:32, Daniel Vetter wrote:
>>>>>>> On Wed, Jan 24, 2024 at 09:33:12AM +0100, Christian König wrote:
>>>>>>>> Am 23.01.24 um 20:37 schrieb Zeng, Oak:
>>>>>>>>> [SNIP]
>>>>>>>>> Yes most API are per device based.
>>>>>>>>>
>>>>>>>>> One exception I know is actually the kfd SVM API. If you look 
>>>>>>>>> at the svm_ioctl function, it is per-process based. Each 
>>>>>>>>> kfd_process represent a process across N gpu devices.
>>>>>>>> Yeah and that was a big mistake in my opinion. We should really 
>>>>>>>> not do that
>>>>>>>> ever again.
>>>>>>>>
>>>>>>>>> Need to say, kfd SVM represent a shared virtual address space 
>>>>>>>>> across CPU and all GPU devices on the system. This is by the 
>>>>>>>>> definition of SVM (shared virtual memory). This is very 
>>>>>>>>> different from our legacy gpu *device* driver which works for 
>>>>>>>>> only one device (i.e., if you want one device to access 
>>>>>>>>> another device's memory, you will have to use dma-buf 
>>>>>>>>> export/import etc).
>>>>>>>> Exactly that thinking is what we have currently found as 
>>>>>>>> blocker for a
>>>>>>>> virtualization projects. Having SVM as device independent 
>>>>>>>> feature which
>>>>>>>> somehow ties to the process address space turned out to be an 
>>>>>>>> extremely bad
>>>>>>>> idea.
>>>>>>>>
>>>>>>>> The background is that this only works for some use cases but 
>>>>>>>> not all of
>>>>>>>> them.
>>>>>>>>
>>>>>>>> What's working much better is to just have a mirror 
>>>>>>>> functionality which says
>>>>>>>> that a range A..B of the process address space is mapped into a 
>>>>>>>> range C..D
>>>>>>>> of the GPU address space.
>>>>>>>>
>>>>>>>> Those ranges can then be used to implement the SVM feature 
>>>>>>>> required for
>>>>>>>> higher level APIs and not something you need at the UAPI or 
>>>>>>>> even inside the
>>>>>>>> low level kernel memory management.
>>>>>>>>
>>>>>>>> When you talk about migrating memory to a device you also do 
>>>>>>>> this on a per
>>>>>>>> device basis and *not* tied to the process address space. If 
>>>>>>>> you then get
>>>>>>>> crappy performance because userspace gave contradicting 
>>>>>>>> information where to
>>>>>>>> migrate memory then that's a bug in userspace and not something 
>>>>>>>> the kernel
>>>>>>>> should try to prevent somehow.
>>>>>>>>
>>>>>>>> [SNIP]
>>>>>>>>>> I think if you start using the same drm_gpuvm for multiple 
>>>>>>>>>> devices you
>>>>>>>>>> will sooner or later start to run into the same mess we have 
>>>>>>>>>> seen with
>>>>>>>>>> KFD, where we moved more and more functionality from the KFD 
>>>>>>>>>> to the DRM
>>>>>>>>>> render node because we found that a lot of the stuff simply 
>>>>>>>>>> doesn't work
>>>>>>>>>> correctly with a single object to maintain the state.
>>>>>>>>> As I understand it, KFD is designed to work across devices. A 
>>>>>>>>> single pseudo /dev/kfd device represent all hardware gpu 
>>>>>>>>> devices. That is why during kfd open, many pdd (process device 
>>>>>>>>> data) is created, each for one hardware device for this process.
>>>>>>>> Yes, I'm perfectly aware of that. And I can only repeat myself 
>>>>>>>> that I see
>>>>>>>> this design as a rather extreme failure. And I think it's one 
>>>>>>>> of the reasons
>>>>>>>> why NVidia is so dominant with Cuda.
>>>>>>>>
>>>>>>>> This whole approach KFD takes was designed with the idea of 
>>>>>>>> extending the
>>>>>>>> CPU process into the GPUs, but this idea only works for a few 
>>>>>>>> use cases and
>>>>>>>> is not something we should apply to drivers in general.
>>>>>>>>
>>>>>>>> A very good example are virtualization use cases where you end 
>>>>>>>> up with CPU
>>>>>>>> address != GPU address because the VAs are actually coming from 
>>>>>>>> the guest VM
>>>>>>>> and not the host process.
>>>>>>>>
>>>>>>>> SVM is a high level concept of OpenCL, Cuda, ROCm etc.. This 
>>>>>>>> should not have
>>>>>>>> any influence on the design of the kernel UAPI.
>>>>>>>>
>>>>>>>> If you want to do something similar as KFD for Xe I think you 
>>>>>>>> need to get
>>>>>>>> explicit permission to do this from Dave and Daniel and maybe 
>>>>>>>> even Linus.
>>>>>>> I think the one and only one exception where an SVM uapi like in 
>>>>>>> kfd makes
>>>>>>> sense, is if the _hardware_ itself, not the software stack defined
>>>>>>> semantics that you've happened to build on top of that hw, 
>>>>>>> enforces a 1:1
>>>>>>> mapping with the cpu process address space.
>>>>>>>
>>>>>>> Which means your hardware is using PASID, IOMMU based 
>>>>>>> translation, PCI-ATS
>>>>>>> (address translation services) or whatever your hw calls it and 
>>>>>>> has _no_
>>>>>>> device-side pagetables on top. Which from what I've seen all 
>>>>>>> devices with
>>>>>>> device-memory have, simply because they need some place to store 
>>>>>>> whether
>>>>>>> that memory is currently in device memory or should be 
>>>>>>> translated using
>>>>>>> PASID. Currently there's no gpu that works with PASID only, but 
>>>>>>> there are
>>>>>>> some on-cpu-die accelerator things that do work like that.
>>>>>>>
>>>>>>> Maybe in the future there will be some accelerators that are 
>>>>>>> fully cpu
>>>>>>> cache coherent (including atomics) with something like CXL, and the
>>>>>>> on-device memory is managed as normal system memory with struct 
>>>>>>> page as
>>>>>>> ZONE_DEVICE and accelerator va -> physical address translation 
>>>>>>> is only
>>>>>>> done with PASID ... but for now I haven't seen that, definitely 
>>>>>>> not in
>>>>>>> upstream drivers.
>>>>>>>
>>>>>>> And the moment you have some per-device pagetables or per-device 
>>>>>>> memory
>>>>>>> management of some sort (like using gpuva mgr) then I'm 100% 
>>>>>>> agreeing with
>>>>>>> Christian that the kfd SVM model is too strict and not a great 
>>>>>>> idea.
>>>>>>
>>>>>> That basically means, without ATS/PRI+PASID you cannot implement 
>>>>>> a unified memory programming model, where GPUs or accelerators 
>>>>>> access virtual addresses without pre-registering them with an SVM 
>>>>>> API call.
>>>>>>
>>>>>> Unified memory is a feature implemented by the KFD SVM API and 
>>>>>> used by ROCm. This is used e.g. to implement OpenMP USM (unified 
>>>>>> shared memory). It's implemented with recoverable GPU page 
>>>>>> faults. If the page fault interrupt handler cannot assume a 
>>>>>> shared virtual address space, then implementing this feature 
>>>>>> isn't possible.
>>>>>
>>>>> Why not? As far as I can see the OpenMP USM is just another funky 
>>>>> way of userptr handling.
>>>>>
>>>>> The difference is that in an userptr we assume that we always need 
>>>>> to request the whole block A..B from a mapping while for page 
>>>>> fault based handling it can be just any page in between A and B 
>>>>> which is requested and made available to the GPU address space.
>>>>>
>>>>> As far as I can see there is absolutely no need for any special 
>>>>> SVM handling.
>>>>
>>>> It does assume a shared virtual address space between CPU and GPUs. 
>>>> There are no API calls to tell the driver that address A on the CPU 
>>>> maps to address B on the GPU1 and address C on GPU2. The KFD SVM 
>>>> API was designed to work with this programming model, by augmenting 
>>>> the shared virtual address mappings with virtual address range 
>>>> attributes that can modify the migration policy and indicate 
>>>> prefetching, prefaulting, etc. You could think of it as madvise on 
>>>> steroids.
>>>
>>> Yeah, so what? In this case you just say through an IOCTL that CPU 
>>> range A..B should map to GPU range C..D and for A/B and C/D you use 
>>> the maximum of the address space.
>>
>> What I want is that address range A..B on the CPU matches A..B on the 
>> GPU, because I'm sharing pointers between CPU and GPU. I can't think 
>> of any sane user mode using a unified memory programming model, that 
>> would ever ask KFD to map unified memory mappings to a different 
>> address range on the GPU. Adding such an ioctl is a complete waste of 
>> time, and can only serve to add unnecessary complexity.
>
> This is exactly the use case which happens when the submitting process 
> is not the one originally stitching together the command stream.
>
> Basically all native context, virtualization and other proxy use cases 
> work like this.

You cannot use unified memory in this type of environment. A GPU page 
fault would occur in a GPU virtual address space in the host-side proxy 
process. That page fault would need to be resolved to map some memory 
from a process running in a guest? Which guest? Which process? That's 
anyone's guess. There is no way to annotate that because the pointer in 
the fault is coming from a shader program that's running in the guest 
context and completely unaware of the virtualization. There are no API 
calls from the guest before the fault occurs to establish a meaningful 
mapping.

The way this virtualization of the GPU is implemented, with one proxy 
multiplexing multiple clients, is just fundamentally incompatible with a 
unified memory programming model that has to assume a shared virtual 
address space to make sense. You'd need separate proxy processes on the 
host side to handle that. You can't blame this on bad design decisions 
in the SVM API. As I see it, it's just a fundamental limitation of the 
virtualization approach that cannot handle guest applications that want 
to use a unified shared memory programming model.

That's why I suggested to completely disable the SVM API in this 
virtualization scenario when you create a KFD context that's separate 
from the process address space. It is not essential for any 
non-unified-memory functionality. ROCm user mode has fallbacks to work 
without it, because we also need to support older kernels and GPUs that 
didn't support this programming model.

Regards,
   Felix


>
> So that the CPU address doesn't match the GPU address is an absolutely 
> real use case and should be able to be handled by the GPU VA interface.
>
> Regards,
> Christian.
>
>
>
>>
>> Regards,
>>   Felix
>>
>>
>>>
>>> There is no restriction that this needs to be accurate in any way. It's 
>>> just that it can be accurate to be more efficient and eventually use 
>>> only a fraction of the address space instead of all of it for some 
>>> use cases.
>>>
>>> So this isn't a blocker, it's just one special use case.
>>>
>>> Regards,
>>> Christian.
>>>
>>>>
>>>> Regards,
>>>>   Felix
>>>>
>>>>
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>>
>>>>>> Regards,
>>>>>>   Felix
>>>>>>
>>>>>>
>>>>>>>
>>>>>>> Cheers, Sima
>>>>>
>>>
>
Zeng, Oak Jan. 30, 2024, 12:21 a.m. UTC | #40
The example you used to prove that KFD is a design failure is against *any* design which utilizes the system allocator and hmm. The way that one proxy process runs on the host to handle many guest processes doesn't fit into the concept of "shared address space b/t cpu and gpu". The shared address space has to be within one process. Your proxy process represents many guest processes. It is a fundamental conflict.

Also your userptr proposal doesn't solve this problem either:
Imagine you have guest process1 mapping CPU address range A…B to GPU address range C…D
And you have guest process 2 mapping CPU address range A…B to GPU address range C…D, since process 1 and 2 are two different process, it is legal for process 2 to do the exact same mapping.
Now when a gpu shader accesses address C…D, a gpu page fault happens; what does your proxy process do? Which guest process will this fault be directed to and handled by? Unless you have extra information/APIs to tell the proxy process and the GPU HW, there is no way to figure it out.

Compared to the shared virtual address space concept of HMM, the userptr design is nothing new except that it allows the CPU and GPU to use different addresses to access the same object. If you replace the above C…D with A…B, the above description becomes a description of the "problem" of the HMM/shared virtual address design.

Both designs have the same difficulty with your example of the special virtualization environment setup.

As said, we spent effort scoping the userptr solution some time ago. The problems we found enabling userptr with migration were:

  1.  The user interface of userptr is not as convenient as the system allocator. With the userptr solution, the user needs to call userptr_ioctl and vm_bind for *every* single cpu pointer they want to use in a gpu program. With the system allocator, the programmer just uses any cpu pointer directly in the gpu program without any extra driver ioctls.
  2.  We don't see the real benefit of using a different GPU address C…D than the A..B, except if you can prove my above reasoning is wrong. In most use cases, you can make GPU C…D == CPU A…B, so why bother then?
  3.  Looking into implementation details, since hmm fundamentally assumes a shared virtual address space b/t cpu and device, for the userptr solution to leverage hmm, you need to perform an address space conversion every time you call into hmm functions.

In summary, a GPU device is just a piece of HW to accelerate your CPU program. If the HW allows, it is more convenient to use a shared address space b/t cpu and GPU. On old HW (for example, no gpu page fault support, or the gpu only has a very limited address space), we can disable the system allocator/SVM. If you use a different address space on a modern GPU, why don't you use a different address space on different CPU cores?

Regards,
Oak
From: dri-devel <dri-devel-bounces@lists.freedesktop.org> On Behalf Of Christian König
Sent: Monday, January 29, 2024 5:20 AM
To: Zeng, Oak <oak.zeng@intel.com>; Thomas Hellström <thomas.hellstrom@linux.intel.com>; Daniel Vetter <daniel@ffwll.ch>; Dave Airlie <airlied@redhat.com>
Cc: Brost, Matthew <matthew.brost@intel.com>; Felix Kuehling <felix.kuehling@amd.com>; Welty, Brian <brian.welty@intel.com>; dri-devel@lists.freedesktop.org; Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>; Bommu, Krishnaiah <krishnaiah.bommu@intel.com>; Gupta, saurabhg <saurabhg.gupta@intel.com>; Vishwanathapura, Niranjana <niranjana.vishwanathapura@intel.com>; intel-xe@lists.freedesktop.org; Danilo Krummrich <dakr@redhat.com>
Subject: Re: Making drm_gpuvm work across gpu devices

Well Daniel and Dave noted it as well, so I'm just repeating it: Your design choices are not an argument to get something upstream.

It's the job of the maintainers and in the end of Linus to judge if something is acceptable or not.

As far as I can see a good part of this idea has been exercised at length with KFD and it turned out to not be the best approach.

So from what I've seen the design you outlined is extremely unlikely to go upstream.

Regards,
Christian.
Am 27.01.24 um 03:21 schrieb Zeng, Oak:
Regarding the idea of expanding userptr to support migration, we explored this idea a long time ago. It provides similar functionality to the system allocator but its interface is not as convenient. Besides the shared virtual address space, another benefit of a system allocator is that you can offload a cpu program to the gpu more easily; you don't need to call a driver specific API (such as register_userptr and vm_bind in this case) for memory allocation.

We also scoped the implementation. It turned out to be big, and not as beautiful as hmm, which is why we gave up this approach.

From: Christian König <christian.koenig@amd.com>
Sent: Friday, January 26, 2024 7:52 AM
To: Thomas Hellström <thomas.hellstrom@linux.intel.com>; Daniel Vetter <daniel@ffwll.ch>
Cc: Brost, Matthew <matthew.brost@intel.com>; Felix Kuehling <felix.kuehling@amd.com>; Welty, Brian <brian.welty@intel.com>; Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>; Zeng, Oak <oak.zeng@intel.com>; Gupta, saurabhg <saurabhg.gupta@intel.com>; Danilo Krummrich <dakr@redhat.com>; dri-devel@lists.freedesktop.org; Bommu, Krishnaiah <krishnaiah.bommu@intel.com>; Dave Airlie <airlied@redhat.com>; Vishwanathapura, Niranjana <niranjana.vishwanathapura@intel.com>; intel-xe@lists.freedesktop.org
Subject: Re: Making drm_gpuvm work across gpu devices

Am 26.01.24 um 09:21 schrieb Thomas Hellström:



Hi, all

On Thu, 2024-01-25 at 19:32 +0100, Daniel Vetter wrote:
On Wed, Jan 24, 2024 at 09:33:12AM +0100, Christian König wrote:
Am 23.01.24 um 20:37 schrieb Zeng, Oak:
[SNIP]
Yes most API are per device based.

One exception I know is actually the kfd SVM API. If you look at the svm_ioctl function, it is per-process based. Each kfd_process represent a process across N gpu devices.

Yeah and that was a big mistake in my opinion. We should really not do that ever again.

Need to say, kfd SVM represent a shared virtual address space across CPU and all GPU devices on the system. This is by the definition of SVM (shared virtual memory). This is very different from our legacy gpu *device* driver which works for only one device (i.e., if you want one device to access another device's memory, you will have to use dma-buf export/import etc).

Exactly that thinking is what we have currently found as blocker for a virtualization projects. Having SVM as device independent feature which somehow ties to the process address space turned out to be an extremely bad idea.

The background is that this only works for some use cases but not all of them.

What's working much better is to just have a mirror functionality which says that a range A..B of the process address space is mapped into a range C..D of the GPU address space.

Those ranges can then be used to implement the SVM feature required for higher level APIs and not something you need at the UAPI or even inside the low level kernel memory management.

When you talk about migrating memory to a device you also do this on a per device basis and *not* tied to the process address space. If you then get crappy performance because userspace gave contradicting information where to migrate memory then that's a bug in userspace and not something the kernel should try to prevent somehow.

[SNIP]
I think if you start using the same drm_gpuvm for multiple devices you will sooner or later start to run into the same mess we have seen with KFD, where we moved more and more functionality from the KFD to the DRM render node because we found that a lot of the stuff simply doesn't work correctly with a single object to maintain the state.

As I understand it, KFD is designed to work across devices. A single pseudo /dev/kfd device represent all hardware gpu devices. That is why during kfd open, many pdd (process device data) is created, each for one hardware device for this process.

Yes, I'm perfectly aware of that. And I can only repeat myself that I see this design as a rather extreme failure. And I think it's one of the reasons why NVidia is so dominant with Cuda.

This whole approach KFD takes was designed with the idea of extending the CPU process into the GPUs, but this idea only works for a few use cases and is not something we should apply to drivers in general.

A very good example are virtualization use cases where you end up with CPU address != GPU address because the VAs are actually coming from the guest VM and not the host process.

SVM is a high level concept of OpenCL, Cuda, ROCm etc.. This should not have any influence on the design of the kernel UAPI.

If you want to do something similar as KFD for Xe I think you need to get explicit permission to do this from Dave and Daniel and maybe even Linus.

I think the one and only one exception where an SVM uapi like in kfd makes sense, is if the _hardware_ itself, not the software stack defined semantics that you've happened to build on top of that hw, enforces a 1:1 mapping with the cpu process address space.

Which means your hardware is using PASID, IOMMU based translation, PCI-ATS (address translation services) or whatever your hw calls it and has _no_ device-side pagetables on top. Which from what I've seen all devices with device-memory have, simply because they need some place to store whether that memory is currently in device memory or should be translated using PASID. Currently there's no gpu that works with PASID only, but there are some on-cpu-die accelerator things that do work like that.

Maybe in the future there will be some accelerators that are fully cpu cache coherent (including atomics) with something like CXL, and the on-device memory is managed as normal system memory with struct page as ZONE_DEVICE and accelerator va -> physical address translation is only done with PASID ... but for now I haven't seen that, definitely not in upstream drivers.

And the moment you have some per-device pagetables or per-device memory management of some sort (like using gpuva mgr) then I'm 100% agreeing with Christian that the kfd SVM model is too strict and not a great idea.

Cheers, Sima





I'm trying to digest all the comments here, The end goal is to be able to support something similar to this here:

https://developer.nvidia.com/blog/simplifying-gpu-application-development-with-heterogeneous-memory-management/

Christian, If I understand you correctly, you're strongly suggesting not to try to manage a common virtual address space across different devices in the kernel, but merely providing building blocks to do so, like for example a generalized userptr with migration support using HMM; That way each "mirror" of the CPU mm would be per device and inserted into the gpu_vm just like any other gpu_vma, and user-space would dictate the A..B -> C..D mapping by choosing the GPU_VA for the vma.

Exactly that, yes.

Sima, it sounds like you're suggesting to shy away from hmm and not even attempt to support this except if it can be done using IOMMU sva on selected hardware?

I think that comment goes more into the direction of: If you have ATS/ATC/PRI capable hardware which exposes the functionality to make memory reads and writes directly into the address space of the CPU then yes an SVM only interface is ok because the hardware can't do anything else. But as long as you have something like GPUVM then please don't restrict yourself.

Which I totally agree on as well. The ATS/ATC/PRI combination doesn't allow using separate page tables device and CPU and so also not separate VAs.

This was one of the reasons why we stopped using this approach for AMD GPUs.

Regards,
Christian.

Could you clarify a bit?

Thanks,
Thomas
Christian König Jan. 30, 2024, 8:39 a.m. UTC | #41
Am 30.01.24 um 01:21 schrieb Zeng, Oak:
>
> The example you used to prove that KFD is a design failure, is against 
> *any* design which utilize system allocator and hmm. The way that one 
> proxy process running on host to handle many guest processes, doesn’t 
> fit into the concept of “share address space b/t cpu and gpu”. The 
> shared address space has to be within one process. Your proxy process 
> represent many guest processes. It is a fundamental conflict.
>
> Also your userptr proposal doesn't solve this problem either:
>
> Imagine you have guest process1 mapping CPU address range A…B to GPU 
> address range C…D
>
> And you have guest process 2 mapping CPU address range A…B to GPU 
> address range C…D, since process 1 and 2 are two different process, it 
> is legal for process 2 to do the exact same mapping.
>
> Now when gpu shader access address C…D, a gpu page fault happens, what 
> does your proxy process do? Which guest process will this fault be 
> directed to and handled? Except you have extra information/API to tell 
> proxy process and GPU HW, there is no way to figure out.
>

Well yes, as far as I can see the fundamental design issue in the KFD is 
that it ties together CPU and GPU address space. That came from the 
implementation using the ATS/PRI feature to access the CPU address space 
from the GPU.

If you don't do ATS/PRI then you don't have that restriction and you can 
do as many GPU address spaces per CPU process as you want. This is just 
how the hw works.

So in your example above when you have multiple mappings for the range 
A..B you also have multiple GPU address spaces and so can distinguish where 
the page fault is coming from just by looking at the source of it. All 
you then need is userfaultfd() to forward the fault to the client and 
you are pretty much done.
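
(A minimal userspace sketch of that flow, just to make it concrete; the proxy-side bookkeeping is left out and only the userfaultfd(2) plumbing is shown, with error handling omitted:)

#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Register one guest's mirrored range with userfaultfd so faults on it
 * can be forwarded to that guest instead of being resolved locally. */
static int watch_guest_range(void *base, unsigned long len)
{
        int uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
        struct uffdio_api api = { .api = UFFD_API };
        struct uffdio_register reg = {
                .range = { .start = (unsigned long)base, .len = len },
                .mode  = UFFDIO_REGISTER_MODE_MISSING,
        };

        ioctl(uffd, UFFDIO_API, &api);
        ioctl(uffd, UFFDIO_REGISTER, &reg);
        return uffd;
}

static void fault_loop(int uffd)
{
        struct uffd_msg msg;

        while (read(uffd, &msg, sizeof(msg)) == sizeof(msg)) {
                if (msg.event != UFFD_EVENT_PAGEFAULT)
                        continue;
                /* The faulting address identifies the mirror, and therefore
                 * the guest process; a real proxy would forward it to that
                 * guest and resolve the fault with UFFDIO_COPY afterwards. */
                printf("fault at %llx\n",
                       (unsigned long long)msg.arg.pagefault.address);
        }
}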

> Compared to the shared virtual address space concept of HMM, the 
> userptr design is nothing new except it allows CPU and GPU to use 
> different address to access the same object. If you replace above C…D 
> with A…B, above description becomes a description of the “problem” of 
> HMM/shared virtual address design.
>
> Both design has the same difficulty with your example of the special 
> virtualization environment setup.
>
> As said, we spent effort scoped the userptr solution some time ago. 
> The problem we found enabling userptr with migration were:
>
>  1. The user interface of userptr is not as convenient as system
>     allocator. With the userptr solution, user need to call
>     userptr_ioctl and vm_bind for *every* single cpu pointer that he
>     want to use in a gpu program. While with system allocator,
>     programmer just use any cpu pointer directly in gpu program
>     without any extra driver ioctls.
>

And I think exactly that is questionable. Why not at least call it for 
the whole address space once during initialization?

 >     We don’t see the real benefit of using a different Gpu address 
C…D than the A..B, except you can prove my above reasoning is wrong. In 
most use cases, you can make GPU C…D == CPU A…B, why bother then?

Because there are cases where this isn't true. We just recently ran into 
exactly that use case with a customer. It might be that you will never 
need this, but again the approach should generally be that the kernel 
exposes the hardware features and as far as I can see the hardware can 
do this.

And apart from those use cases there is also another good reason for 
this: CPUs are going towards 5 levels of page tables and GPUs are lagging 
behind. It's not unrealistic to run into cases where you can only mirror 
parts of the CPU address space into the GPU address space because of 
hardware restrictions. And in this case you absolutely do want the 
flexibility to have different address ranges.


 >     Looked into implementation details, since hmm fundamentally 
assume a shared virtual address space b/t cpu and device, for the 
userptr solution to leverage hmm, you need to perform an address space 
conversion every time you call into hmm functions.

Correct, but that is trivial. I mean we do nothing else with VMAs 
mapping into the address space of files on the CPU either.

Which is by the way a good analogy. The CPU address space consists of 
anonymous memory and file mappings, where the later covers both real 
files on a file system as well as devices.

The struct address_space in the Linux kernel for example describes a 
file address space and not the CPU address space because the latter is 
just a technical tool to form an execution environment which can access 
the former.

With GPUs it's pretty much the same. You have mappings which can be 
backed by CPU address space using functionalities like HMM as well as 
buffer objects created directly through device drivers.
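
(To make the "trivial" conversion concrete, a rough sketch of a fault path; the xe_userptr_range bookkeeping struct and function name are made up, only hmm_range_fault() and the mmu_interval notifier are the real kernel APIs:)

#include <linux/hmm.h>
#include <linux/mm.h>
#include <linux/mmu_notifier.h>

/* Hypothetical per-mapping bookkeeping: CPU range A..B mirrored at GPU C..D */
struct xe_userptr_range {
        struct mmu_interval_notifier notifier;
        u64 cpu_start;          /* A */
        u64 gpu_start;          /* C */
        u64 length;
};

/* Fault in the single page backing a faulting GPU VA; the only "address
 * space conversion" needed is the offset arithmetic below, because
 * hmm_range_fault() works purely on CPU virtual addresses. */
static int xe_userptr_fault_page(struct xe_userptr_range *r,
                                 struct mm_struct *mm, u64 gpu_addr,
                                 unsigned long *pfn)
{
        u64 cpu_addr = r->cpu_start + (gpu_addr - r->gpu_start);
        struct hmm_range range = {
                .notifier = &r->notifier,
                .start = ALIGN_DOWN(cpu_addr, PAGE_SIZE),
                .end = ALIGN_DOWN(cpu_addr, PAGE_SIZE) + PAGE_SIZE,
                .hmm_pfns = pfn,
                .default_flags = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE,
        };
        int ret;

        range.notifier_seq = mmu_interval_read_begin(&r->notifier);
        mmap_read_lock(mm);
        ret = hmm_range_fault(&range);
        mmap_read_unlock(mm);
        /* a real driver retries when the notifier sequence has changed */
        return ret;
}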

> In summary, GPU device is just a piece of HW to accelerate your CPU 
> program.
>

Well exactly that's not how I see it. CPU accelerators are extensions 
like SSE, AVX, FPUs etc... GPU are accelerators attached as I/O devices.

And that GPUs are separate to the CPU is a benefit which gives them 
advantage over CPU based acceleration approaches.

This obviously makes GPUs harder to program and SVM is a method to 
counter this, but that doesn't make SVM a good design pattern for kernel 
or device driver interfaces.

> If HW allows, it is more convenient to use shared address space b/t 
> cpu and GPU. On old HW (example, no gpu page fault support, or gpu 
> only has a very limited address space), we can disable system 
> allocator/SVM. If you use different address space on modern GPU, why 
> don’t you use different address space on different CPU cores?
>

Quite simple, modern CPUs are homogeneous. From the application point of 
view they still look more or less the same as they did 40 years ago.

GPUs on the other hand look quite a bit different. SVM is now a tool to 
reduce this difference but it doesn't make the differences in execution 
environment go away.

And I can only repeat myself that this is actually a good thing, because 
otherwise GPUs would lose some of their advantage over CPUs.

Regards,
Christian.

> Regards,
>
> Oak
>
> *From:*dri-devel <dri-devel-bounces@lists.freedesktop.org> *On Behalf 
> Of *Christian König
> *Sent:* Monday, January 29, 2024 5:20 AM
> *To:* Zeng, Oak <oak.zeng@intel.com>; Thomas Hellström 
> <thomas.hellstrom@linux.intel.com>; Daniel Vetter <daniel@ffwll.ch>; 
> Dave Airlie <airlied@redhat.com>
> *Cc:* Brost, Matthew <matthew.brost@intel.com>; Felix Kuehling 
> <felix.kuehling@amd.com>; Welty, Brian <brian.welty@intel.com>; 
> dri-devel@lists.freedesktop.org; Ghimiray, Himal Prasad 
> <himal.prasad.ghimiray@intel.com>; Bommu, Krishnaiah 
> <krishnaiah.bommu@intel.com>; Gupta, saurabhg 
> <saurabhg.gupta@intel.com>; Vishwanathapura, Niranjana 
> <niranjana.vishwanathapura@intel.com>; intel-xe@lists.freedesktop.org; 
> Danilo Krummrich <dakr@redhat.com>
> *Subject:* Re: Making drm_gpuvm work across gpu devices
>
> Well Daniel and Dave noted it as well, so I'm just repeating it: Your 
> design choices are not an argument to get something upstream.
>
> It's the job of the maintainers and at the end of the Linus to judge 
> of something is acceptable or not.
>
> As far as I can see a good part of this this idea has been exercised 
> lengthy with KFD and it turned out to not be the best approach.
>
> So from what I've seen the design you outlined is extremely unlikely 
> to go upstream.
>
> Regards,
> Christian.
>
> Am 27.01.24 um 03:21 schrieb Zeng, Oak:
>
>     Regarding the idea of expanding userptr to support migration, we
>     explored this idea long time ago. It provides similar functions of
>     the system allocator but its interface is not as convenient as
>     system allocator. Besides the shared virtual address space,
>     another benefit of a system allocator is, you can offload cpu
>     program to gpu easier, you don’t need to call driver specific API
>     (such as register_userptr and vm_bind in this case) for memory
>     allocation.
>
>     We also scoped the implementation. It turned out to be big, and
>     not as beautiful as hmm. Why we gave up this approach.
>
>     *From:*Christian König <christian.koenig@amd.com>
>     <mailto:christian.koenig@amd.com>
>     *Sent:* Friday, January 26, 2024 7:52 AM
>     *To:* Thomas Hellström <thomas.hellstrom@linux.intel.com>
>     <mailto:thomas.hellstrom@linux.intel.com>; Daniel Vetter
>     <daniel@ffwll.ch> <mailto:daniel@ffwll.ch>
>     *Cc:* Brost, Matthew <matthew.brost@intel.com>
>     <mailto:matthew.brost@intel.com>; Felix Kuehling
>     <felix.kuehling@amd.com> <mailto:felix.kuehling@amd.com>; Welty,
>     Brian <brian.welty@intel.com> <mailto:brian.welty@intel.com>;
>     Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>
>     <mailto:himal.prasad.ghimiray@intel.com>; Zeng, Oak
>     <oak.zeng@intel.com> <mailto:oak.zeng@intel.com>; Gupta, saurabhg
>     <saurabhg.gupta@intel.com> <mailto:saurabhg.gupta@intel.com>;
>     Danilo Krummrich <dakr@redhat.com> <mailto:dakr@redhat.com>;
>     dri-devel@lists.freedesktop.org
>     <mailto:dri-devel@lists.freedesktop.org>; Bommu, Krishnaiah
>     <krishnaiah.bommu@intel.com> <mailto:krishnaiah.bommu@intel.com>;
>     Dave Airlie <airlied@redhat.com> <mailto:airlied@redhat.com>;
>     Vishwanathapura, Niranjana <niranjana.vishwanathapura@intel.com>
>     <mailto:niranjana.vishwanathapura@intel.com>;
>     intel-xe@lists.freedesktop.org <mailto:intel-xe@lists.freedesktop.org>
>     *Subject:* Re: Making drm_gpuvm work across gpu devices
>
>     Am 26.01.24 um 09:21 schrieb Thomas Hellström:
>
>
>         Hi, all
>
>           
>
>         On Thu, 2024-01-25 at 19:32 +0100, Daniel Vetter wrote:
>
>             On Wed, Jan 24, 2024 at 09:33:12AM +0100, Christian König wrote:
>
>                 Am 23.01.24 um 20:37 schrieb Zeng, Oak:
>
>                     [SNIP]
>
>                     Yes most API are per device based.
>
>                       
>
>                     One exception I know is actually the kfd SVM API. If you look at
>
>                     the svm_ioctl function, it is per-process based. Each kfd_process
>
>                     represent a process across N gpu devices.
>
>                   
>
>                 Yeah and that was a big mistake in my opinion. We should really not
>
>                 do that
>
>                 ever again.
>
>                   
>
>                     Need to say, kfd SVM represent a shared virtual address space
>
>                     across CPU and all GPU devices on the system. This is by the
>
>                     definition of SVM (shared virtual memory). This is very different
>
>                     from our legacy gpu *device* driver which works for only one
>
>                     device (i.e., if you want one device to access another device's
>
>                     memory, you will have to use dma-buf export/import etc).
>
>                   
>
>                 Exactly that thinking is what we have currently found as blocker
>
>                 for a
>
>                 virtualization projects. Having SVM as device independent feature
>
>                 which
>
>                 somehow ties to the process address space turned out to be an
>
>                 extremely bad
>
>                 idea.
>
>                   
>
>                 The background is that this only works for some use cases but not
>
>                 all of
>
>                 them.
>
>                   
>
>                 What's working much better is to just have a mirror functionality
>
>                 which says
>
>                 that a range A..B of the process address space is mapped into a
>
>                 range C..D
>
>                 of the GPU address space.
>
>                   
>
>                 Those ranges can then be used to implement the SVM feature required
>
>                 for
>
>                 higher level APIs and not something you need at the UAPI or even
>
>                 inside the
>
>                 low level kernel memory management.
>
>                   
>
>                 When you talk about migrating memory to a device you also do this
>
>                 on a per
>
>                 device basis and *not* tied to the process address space. If you
>
>                 then get
>
>                 crappy performance because userspace gave contradicting information
>
>                 where to
>
>                 migrate memory then that's a bug in userspace and not something the
>
>                 kernel
>
>                 should try to prevent somehow.
>
>                   
>
>                 [SNIP]
>
>                         I think if you start using the same drm_gpuvm for multiple
>
>                         devices you
>
>                         will sooner or later start to run into the same mess we have
>
>                         seen with
>
>                         KFD, where we moved more and more functionality from the KFD to
>
>                         the DRM
>
>                         render node because we found that a lot of the stuff simply
>
>                         doesn't work
>
>                         correctly with a single object to maintain the state.
>
>                     As I understand it, KFD is designed to work across devices. A
>
>                     single pseudo /dev/kfd device represent all hardware gpu devices.
>
>                     That is why during kfd open, many pdd (process device data) is
>
>                     created, each for one hardware device for this process.
>
>                   
>
>                 Yes, I'm perfectly aware of that. And I can only repeat myself that
>
>                 I see
>
>                 this design as a rather extreme failure. And I think it's one of
>
>                 the reasons
>
>                 why NVidia is so dominant with Cuda.
>
>                   
>
>                 This whole approach KFD takes was designed with the idea of
>
>                 extending the
>
>                 CPU process into the GPUs, but this idea only works for a few use
>
>                 cases and
>
>                 is not something we should apply to drivers in general.
>
>                   
>
>                 A very good example are virtualization use cases where you end up
>
>                 with CPU
>
>                 address != GPU address because the VAs are actually coming from the
>
>                 guest VM
>
>                 and not the host process.
>
>                   
>
>                 SVM is a high level concept of OpenCL, Cuda, ROCm etc.. This should
>
>                 not have
>
>                 any influence on the design of the kernel UAPI.
>
>                   
>
>                 If you want to do something similar as KFD for Xe I think you need
>
>                 to get
>
>                 explicit permission to do this from Dave and Daniel and maybe even
>
>                 Linus.
>
>               
>
>             I think the one and only one exception where an SVM uapi like in kfd
>
>             makes
>
>             sense, is if the _hardware_ itself, not the software stack defined
>
>             semantics that you've happened to build on top of that hw, enforces a
>
>             1:1
>
>             mapping with the cpu process address space.
>
>               
>
>             Which means your hardware is using PASID, IOMMU based translation,
>
>             PCI-ATS
>
>             (address translation services) or whatever your hw calls it and has
>
>             _no_
>
>             device-side pagetables on top. Which from what I've seen all devices
>
>             with
>
>             device-memory have, simply because they need some place to store
>
>             whether
>
>             that memory is currently in device memory or should be translated
>
>             using
>
>             PASID. Currently there's no gpu that works with PASID only, but there
>
>             are
>
>             some on-cpu-die accelerator things that do work like that.
>
>               
>
>             Maybe in the future there will be some accelerators that are fully
>
>             cpu
>
>             cache coherent (including atomics) with something like CXL, and the
>
>             on-device memory is managed as normal system memory with struct page
>
>             as
>
>             ZONE_DEVICE and accelerator va -> physical address translation is
>
>             only
>
>             done with PASID ... but for now I haven't seen that, definitely not
>
>             in
>
>             upstream drivers.
>
>               
>
>             And the moment you have some per-device pagetables or per-device
>
>             memory
>
>             management of some sort (like using gpuva mgr) then I'm 100% agreeing
>
>             with
>
>             Christian that the kfd SVM model is too strict and not a great idea.
>
>               
>
>             Cheers, Sima
>
>           
>
>           
>
>         I'm trying to digest all the comments here, The end goal is to be able
>
>         to support something similar to this here:
>
>           
>
>         https://developer.nvidia.com/blog/simplifying-gpu-application-development-with-heterogeneous-memory-management/
>
>           
>
>         Christian, If I understand you correctly, you're strongly suggesting
>
>         not to try to manage a common virtual address space across different
>
>         devices in the kernel, but merely providing building blocks to do so,
>
>         like for example a generalized userptr with migration support using
>
>         HMM; That way each "mirror" of the CPU mm would be per device and
>
>         inserted into the gpu_vm just like any other gpu_vma, and user-space
>
>         would dictate the A..B -> C..D mapping by choosing the GPU_VA for the
>
>         vma.
>
>
>     Exactly that, yes.
>
>
>
>           
>
>         Sima, it sounds like you're suggesting to shy away from hmm and not
>
>         even attempt to support this except if it can be done using IOMMU sva
>
>         on selected hardware?
>
>
>     I think that comment goes more into the direction of: If you have
>     ATS/ATC/PRI capable hardware which exposes the functionality to
>     make memory reads and writes directly into the address space of
>     the CPU then yes an SVM only interface is ok because the hardware
>     can't do anything else. But as long as you have something like
>     GPUVM then please don't restrict yourself.
>
>     Which I totally agree on as well. The ATS/ATC/PRI combination
>     doesn't allow using separate page tables for the device and the
>     CPU, and so also not separate VAs.
>
>     This was one of the reasons why we stopped using this approach for
>     AMD GPUs.
>
>     Regards,
>     Christian.
>
>
>
>         Could you clarify a bit?
>
>           
>
>         Thanks,
>
>         Thomas
>
>           
>
>           
>
>           
>
>           
>
>           
>
>           
>
>           
>
Thomas Hellstrom Jan. 30, 2024, 8:43 a.m. UTC | #42
Hi, Oak,

On 1/30/24 01:21, Zeng, Oak wrote:
>
> The example you used to prove that KFD is a design failure, is against 
> *any* design which utilize system allocator and hmm. The way that one 
> proxy process running on host to handle many guest processes, doesn’t 
> fit into the concept of “share address space b/t cpu and gpu”. The 
> shared address space has to be within one process. Your proxy process 
> represent many guest processes. It is a fundamental conflict.
>
> Also your userptr proposal doesn’t solve this problem either:
>
> Imagine you have guest process1 mapping CPU address range A…B to GPU 
> address range C…D
>
> And you have guest process 2 mapping CPU address range A…B to GPU 
> address range C…D, since process 1 and 2 are two different process, it 
> is legal for process 2 to do the exact same mapping.
>
> Now when gpu shader access address C…D, a gpu page fault happens, what 
> does your proxy process do? Which guest process will this fault be 
> directed to and handled? Except you have extra information/API to tell 
> proxy process and GPU HW, there is no way to figure out.
>
> Compared to the shared virtual address space concept of HMM, the 
> userptr design is nothing new except it allows CPU and GPU to use 
> different address to access the same object. If you replace above C…D 
> with A…B, above description becomes a description of the “problem” of 
> HMM/shared virtual address design.
>
> Both design has the same difficulty with your example of the special 
> virtualization environment setup.
>
> As said, we spent effort scoped the userptr solution some time ago. 
> The problem we found enabling userptr with migration were:
>
>  1. The user interface of userptr is not as convenient as system
>     allocator. With the userptr solution, user need to call
>     userptr_ioctl and vm_bind for *every* single cpu pointer that he
>     want to use in a gpu program. While with system allocator,
>     programmer just use any cpu pointer directly in gpu program
>     without any extra driver ioctls.
>
No, the augmented userptr (let's call it "hmmptr" to distinguish here) 
would typically only be bound once when the VM is created. It's just a 
different way to expose the whole SVM mapping to user-space. It's 
sparsely populated and is not backed by a bo, and it is per-device so 
UMD would have to replicate the SVM setup and attribute settings on each 
device.

>  2. We don’t see the real benefit of using a different Gpu address C…D
>     than the A..B, except you can prove my above reasoning is wrong.
>     In most use cases, you can make GPU C…D == CPU A…B, why bother then?
>  3. Looked into implementation details, since hmm fundamentally assume
>     a shared virtual address space b/t cpu and device, for the userptr
>     solution to leverage hmm, you need perform address space
>     conversion every time you calls into hmm functions.
>
I think much of the focus in this discussion lands on the A..B -> C..D 
mapping. It's just added flexibility with little or no implementation 
cost, although I must admit I'm not fully clear about the actual 
use-case. In a para-virtualized environment like virGL or vmware's 
vmx/renderers I could imagine C..D being the guest virtual addresses, 
including compute kernel pointers, and A..B being the host renderer's 
CPU virtual addresses. (The host creates the VMs, and then this 
translation is needed. I'm not sure para-virtualized SVM exists ATM, but 
forcing A==C, B==D in the uAPI would rule out such a beast in the 
future?)
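
A tiny sketch of what such a per-device mirror range could look like 
(the names are hypothetical, not an existing drm_gpuvm or Xe structure):

#include <linux/types.h>

/* CPU VA [cpu_start, cpu_end) mirrored to GPU VA starting at gpu_start.
 * With cpu_start == gpu_start this collapses to the shared-address-space
 * (SVM) case. */
struct example_mirror_range {
	u64 cpu_start;	/* A */
	u64 cpu_end;	/* B */
	u64 gpu_start;	/* C; D is implied by the range length */
};

static u64 example_cpu_to_gpu_va(const struct example_mirror_range *r,
				 u64 cpu_va)
{
	return r->gpu_start + (cpu_va - r->cpu_start);
}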

/Thomas


> In summary, GPU device is just a piece of HW to accelerate your CPU 
> program. If HW allows, it is more convenient to use shared address 
> space b/t cpu and GPU. On old HW (example, no gpu page fault support, 
> or gpu only has a very limited address space), we can disable system 
> allocator/SVM. If you use different address space on modern GPU, why 
> don’t you use different address space on different CPU cores?
>
> Regards,
>
> Oak
>
> [SNIP]
Zeng, Oak Jan. 30, 2024, 10:29 p.m. UTC | #43
Hi Christian,

The Nvidia Nouveau driver uses exactly the same concept of SVM with HMM: the GPU address in a process is exactly the same as the CPU virtual address. It is already in the upstream Linux kernel. We at Intel just follow the same direction for our customers. Why are we not allowed to?

From: Christian König <christian.koenig@amd.com>
Sent: Tuesday, January 30, 2024 3:40 AM
To: Zeng, Oak <oak.zeng@intel.com>; Thomas Hellström <thomas.hellstrom@linux.intel.com>; Daniel Vetter <daniel@ffwll.ch>; Dave Airlie <airlied@redhat.com>
Cc: Brost, Matthew <matthew.brost@intel.com>; Felix Kuehling <felix.kuehling@amd.com>; Welty, Brian <brian.welty@intel.com>; dri-devel@lists.freedesktop.org; Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>; Bommu, Krishnaiah <krishnaiah.bommu@intel.com>; Gupta, saurabhg <saurabhg.gupta@intel.com>; Vishwanathapura, Niranjana <niranjana.vishwanathapura@intel.com>; intel-xe@lists.freedesktop.org; Danilo Krummrich <dakr@redhat.com>
Subject: Re: Making drm_gpuvm work across gpu devices

On 30.01.24 at 01:21, Zeng, Oak wrote:

The example you used to prove that KFD is a design failure, is against *any* design which utilize system allocator and hmm. The way that one proxy process running on host to handle many guest processes, doesn’t fit into the concept of “share address space b/t cpu and gpu”. The shared address space has to be within one process. Your proxy process represent many guest processes. It is a fundamental conflict.

Also your userptr proposal doesn’t solve this problem either:
Imagine you have guest process1 mapping CPU address range A…B to GPU address range C…D
And you have guest process 2 mapping CPU address range A…B to GPU address range C…D, since process 1 and 2 are two different process, it is legal for process 2 to do the exact same mapping.
Now when gpu shader access address C…D, a gpu page fault happens, what does your proxy process do? Which guest process will this fault be directed to and handled? Except you have extra information/API to tell proxy process and GPU HW, there is no way to figure out.

Well yes, as far as I can see the fundamental design issue in the KFD is that it ties together CPU and GPU address space. That came from the implementation using the ATS/PRI feature to access the CPU address space from the GPU.

If you don't do ATS/PRI then you don't have that restriction and you can do as many GPU address spaces per CPU process as you want. This is just how the hw works.

So in your example above, when you have multiple mappings for the range A..B you also have multiple GPU address spaces and so can distinguish where the page fault is coming from just by looking at its source. All you then need is userfaultfd() to forward the fault to the client and you are pretty much done.
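
A rough userspace sketch of the userfaultfd() part of that idea (error 
handling omitted; the proxy-to-client forwarding itself is left out):

#include <fcntl.h>
#include <stddef.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

/* The proxy registers each guest's range; which client a fault belongs to
 * then follows from which registered range the faulting address hits. */
static int example_watch_range(void *start, size_t len)
{
	int uffd = syscall(SYS_userfaultfd, O_CLOEXEC);
	struct uffdio_api api = { .api = UFFD_API };
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)start, .len = len },
		.mode = UFFDIO_REGISTER_MODE_MISSING,
	};

	ioctl(uffd, UFFDIO_API, &api);
	ioctl(uffd, UFFDIO_REGISTER, &reg);
	return uffd;	/* read struct uffd_msg events from this fd and forward them */
}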



Compared to the shared virtual address space concept of HMM, the userptr design is nothing new except it allows CPU and GPU to use different address to access the same object. If you replace above C…D with A…B, above description becomes a description of the “problem” of HMM/shared virtual address design.

Both design has the same difficulty with your example of the special virtualization environment setup.

As said, we spent effort scoped the userptr solution some time ago. The problem we found enabling userptr with migration were:

  1.  The user interface of userptr is not as convenient as system allocator. With the userptr solution, user need to call userptr_ioctl and vm_bind for *every* single cpu pointer that he want to use in a gpu program. While with system allocator, programmer just use any cpu pointer directly in gpu program without any extra driver ioctls.

And I think exactly that is questionable. Why not at least call it for the whole address space once during initialization?

>     We don’t see the real benefit of using a different Gpu address C…D than the A..B, except you can prove my above reasoning is wrong. In most use cases, you can make GPU C…D == CPU A…B, why bother then?

Because there are cases where this isn't true. We just recently ran into exactly that use case with a customer. It might be that you will never need this, but again the approach should generally be that the kernel exposes the hardware features and as far as I can see the hardware can do this.

And apart from those use cases there is also another good reason for this: CPUs are moving towards 5 levels of page tables and GPUs are lagging behind. It's not unrealistic to run into cases where you can only mirror parts of the CPU address space into the GPU address space because of hardware restrictions. And in this case you absolutely do want the flexibility to have different address ranges.
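
For scale (standard x86-64 figures, used here purely as an illustration): 
4-level paging gives 48 bits of virtual address (256 TiB), 5-level paging 
gives 57 bits (128 PiB), so a GPU limited to, say, 48 VA bits could only 
mirror part of a 5-level CPU address space:

#include <linux/types.h>

/* Illustration only: a CPU pointer above the GPU's VA limit cannot be
 * mirrored 1:1 and would need an A..B -> C..D style remap instead. */
static bool example_gpu_can_mirror_1to1(u64 cpu_va, unsigned int gpu_va_bits)
{
	return cpu_va < (1ull << gpu_va_bits);
}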


>     Looked into implementation details, since hmm fundamentally assume a shared virtual address space b/t cpu and device, for the userptr solution to leverage hmm, you need perform address space conversion every time you calls into hmm functions.

Correct, but that is trivial. I mean we do nothing else with VMAs mapping into the address space of files on the CPU either.

Which is by the way a good analogy. The CPU address space consists of anonymous memory and file mappings, where the latter covers both real files on a file system as well as devices.

The struct address_space in the Linux kernel for example describes a file address space and not the CPU address space, because the latter is just a technical tool to form an execution environment which can access the former.

With GPUs it's pretty much the same. You have mappings which can be backed by CPU address space using functionalities like HMM as well as buffer objects created directly through device drivers.



In summary, GPU device is just a piece of HW to accelerate your CPU program.

Well exactly that's not how I see it. CPU accelerators are extensions like SSE, AVX, FPUs etc... GPUs are accelerators attached as I/O devices.

And that GPUs are separate from the CPU is a benefit which gives them an advantage over CPU based acceleration approaches.

This obviously makes GPUs harder to program and SVM is a method to counter this, but that doesn't make SVM a good design pattern for kernel or device driver interfaces.


If HW allows, it is more convenient to use shared address space b/t cpu and GPU. On old HW (example, no gpu page fault support, or gpu only has a very limited address space), we can disable system allocator/SVM. If you use different address space on modern GPU, why don’t you use different address space on different CPU cores?

Quite simple, modern CPUs are homogeneous. From the application point of view they still look more or less the same as they did 40 years ago.

GPUs on the other hand look quite a bit different. SVM is now a tool to reduce this difference but it doesn't make the differences in execution environment go away.

And I can only repeat myself that this is actually a good thing, because otherwise GPUs would lose some of their advantage over CPUs.

Regards,
Christian.



Regards,
Oak
[SNIP]
David Airlie Jan. 30, 2024, 11:12 p.m. UTC | #44
On Wed, Jan 31, 2024 at 8:29 AM Zeng, Oak <oak.zeng@intel.com> wrote:
>
> Hi Christian,
>
>
>
> Nvidia Nouveau driver uses exactly the same concept of SVM with HMM, GPU address in the same process is exactly the same with CPU virtual address. It is already in upstream Linux kernel. We Intel just follow the same direction for our customers. Why we are not allowed?


Oak, this isn't how upstream works, you don't get to appeal to
customer or internal design. nouveau isn't "NVIDIA"'s and it certainly
isn't something NVIDIA would ever suggest for their customers. We also
likely wouldn't just accept NVIDIA's current solution upstream without
some serious discussions. The implementation in nouveau was more of a
sample HMM use case rather than a serious implementation. I suspect if
we do get down the road of making nouveau an actual compute driver for
SVM etc then it would have to severely change.

Dave.
Daniel Vetter Jan. 31, 2024, 9:15 a.m. UTC | #45
On Wed, Jan 31, 2024 at 09:12:39AM +1000, David Airlie wrote:
> On Wed, Jan 31, 2024 at 8:29 AM Zeng, Oak <oak.zeng@intel.com> wrote:
> >
> > Hi Christian,
> >
> >
> >
> > Nvidia Nouveau driver uses exactly the same concept of SVM with HMM, GPU address in the same process is exactly the same with CPU virtual address. It is already in upstream Linux kernel. We Intel just follow the same direction for our customers. Why we are not allowed?
> 
> 
> Oak, this isn't how upstream works, you don't get to appeal to
> customer or internal design. nouveau isn't "NVIDIA"'s and it certainly
> isn't something NVIDIA would ever suggest for their customers. We also
> likely wouldn't just accept NVIDIA's current solution upstream without
> some serious discussions. The implementation in nouveau was more of a
> sample HMM use case rather than a serious implementation. I suspect if
> we do get down the road of making nouveau an actual compute driver for
> SVM etc then it would have to severely change.

Yeah on the nouveau hmm code specifically my gut feeling impression is
that we didn't really make friends with that among core kernel
maintainers. It's a bit too much just a tech demo to be able to merge the
hmm core apis for nvidia's out-of-tree driver.

Also, a few years of learning and experience gaining happened meanwhile -
you always have to look at an api design in the context of when it was
designed, and that context changes all the time.

Cheers, Sima
Zeng, Oak Jan. 31, 2024, 8:17 p.m. UTC | #46
Hi Sima, Dave,

I am well aware the nouveau driver is not what Nvidia does with its customers. The key question is: can we move forward with the concept of a shared virtual address space b/t CPU and GPU? This is the foundation of HMM. We already have split address space support with other driver APIs. SVM, as its name (shared virtual memory) says, means a shared address space. Are we allowed to implement another driver model that lets SVM work, alongside other APIs supporting split address spaces? Those two schemes can co-exist in harmony. We actually have real use cases that use both models in one application.

Hi Christian, Thomas,

In your scheme, GPU VA can != GPU VA. This does introduce some flexibility. But this scheme alone doesn't solve the problem of the proxy process/para-virtualization. You will still need a second mechanism to partition GPU VA space b/t guest process1 and guest process2 because proxy process (or the host hypervisor whatever you call it) use one single gpu page table for all the guest/client processes. GPU VA for different guest process can't overlap. If this second mechanism exist, we of course can use the same mechanism to partition CPU VA space between guest processes as well, then we can still use shared VA b/t CPU and GPU inside one process, but process1 and process2's address space (for both cpu and gpu) doesn't overlap. This second mechanism is the key to solve the proxy process problem, not the flexibility you introduced. 

In practice, your scheme also have a risk of running out of process space because you have to partition whole address space b/t processes. Apparently allowing each guest process to own the whole process space and using separate GPU/CPU page table for different processes is a better solution than using single page table and partition process space b/t processes.

For Intel GPU, para-virtualization (xenGT, see https://github.com/intel/XenGT-Preview-kernel. It is similar idea of the proxy process in Flex's email. They are all SW-based GPU virtualization technology) is an old project. It is now replaced with HW accelerated SRIOV/system virtualization. XenGT is abandoned long time ago. So agreed your scheme add some flexibility. The question is, do we have a valid use case to use such flexibility? I don't see a single one ATM.

I also pictured into how to implement your scheme. You basically rejected the very foundation of hmm design which is shared address space b/t CPU and GPU. In your scheme, GPU VA = CPU VA + offset. In every single place where driver need to call hmm facilities such as hmm_range_fault, migrate_vma_setup and in mmu notifier call back, you need to offset the GPU VA to get a CPU VA. From application writer's perspective, whenever he want to use a CPU pointer in his GPU program, he add to add that offset. Do you think this is awkward?
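
A minimal sketch of the conversion being described (the offset and the 
wrapper are made up; hmm_range_fault() itself always operates on CPU/mm 
addresses):

#include <linux/types.h>
#include <linux/hmm.h>

/* Hypothetical wrapper: with GPU VA = CPU VA + gpu_offset, every HMM call
 * first has to translate the faulting GPU range back to CPU addresses. */
static int example_fault_in_gpu_range(struct hmm_range *range,
				      u64 gpu_start, u64 gpu_end,
				      u64 gpu_offset)
{
	range->start = gpu_start - gpu_offset;	/* back to the CPU VA */
	range->end = gpu_end - gpu_offset;
	return hmm_range_fault(range);	/* caller holds the mmap read lock */
}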

Finally, to implement SVM, we need to implement some memory hint API which applies to a virtual address range across all GPU devices. For example, user would say, for this virtual address range, I prefer the backing store memory to be on GPU deviceX (because user knows deviceX would use this address range much more than other GPU devices or CPU). It doesn't make sense to me to make such API per device based. For example, if you tell device A that the preferred memory location is device B memory, this doesn't sounds correct to me because in your scheme, device A is not even aware of the existence of device B. right?
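
Purely as a sketch of the shape such a hint could take (hypothetical, not 
a proposed uAPI):

#include <linux/types.h>

/* A cross-device hint applies to a virtual address range of the process,
 * not to a single device's VM object -- which is the point being argued. */
struct example_svm_mem_hint {
	__u64 start;	/* start of the virtual address range the hint covers */
	__u64 length;
	__u32 preferred_location;	/* e.g. a specific GPU, or system memory */
	__u32 flags;
};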

Regards,
Oak
> -----Original Message-----
> From: Daniel Vetter <daniel@ffwll.ch>
> Sent: Wednesday, January 31, 2024 4:15 AM
> To: David Airlie <airlied@redhat.com>
> Cc: Zeng, Oak <oak.zeng@intel.com>; Christian König
> <christian.koenig@amd.com>; Thomas Hellström
> <thomas.hellstrom@linux.intel.com>; Daniel Vetter <daniel@ffwll.ch>; Brost,
> Matthew <matthew.brost@intel.com>; Felix Kuehling
> <felix.kuehling@amd.com>; Welty, Brian <brian.welty@intel.com>; dri-
> devel@lists.freedesktop.org; Ghimiray, Himal Prasad
> <himal.prasad.ghimiray@intel.com>; Bommu, Krishnaiah
> <krishnaiah.bommu@intel.com>; Gupta, saurabhg <saurabhg.gupta@intel.com>;
> Vishwanathapura, Niranjana <niranjana.vishwanathapura@intel.com>; intel-
> xe@lists.freedesktop.org; Danilo Krummrich <dakr@redhat.com>; Shah, Ankur N
> <ankur.n.shah@intel.com>; jglisse@redhat.com; rcampbell@nvidia.com;
> apopple@nvidia.com
> Subject: Re: Making drm_gpuvm work across gpu devices
> 
> [SNIP]
Zeng, Oak Jan. 31, 2024, 8:59 p.m. UTC | #47
Fixed one typo: GPU VA != GPU VA should be GPU VA can != CPU VA

> -----Original Message-----
> From: Zeng, Oak
> Sent: Wednesday, January 31, 2024 3:17 PM
> To: Daniel Vetter <daniel@ffwll.ch>; David Airlie <airlied@redhat.com>
> Cc: Christian König <christian.koenig@amd.com>; Thomas Hellström
> <thomas.hellstrom@linux.intel.com>; Brost, Matthew
> <matthew.brost@intel.com>; Felix Kuehling <felix.kuehling@amd.com>; Welty,
> Brian <brian.welty@intel.com>; dri-devel@lists.freedesktop.org; Ghimiray, Himal
> Prasad <himal.prasad.ghimiray@intel.com>; Bommu, Krishnaiah
> <krishnaiah.bommu@intel.com>; Gupta, saurabhg <saurabhg.gupta@intel.com>;
> Vishwanathapura, Niranjana <niranjana.vishwanathapura@intel.com>; intel-
> xe@lists.freedesktop.org; Danilo Krummrich <dakr@redhat.com>; Shah, Ankur N
> <ankur.n.shah@intel.com>; jglisse@redhat.com; rcampbell@nvidia.com;
> apopple@nvidia.com
> Subject: RE: Making drm_gpuvm work across gpu devices
> 
> Hi Sima, Dave,
> 
> I am well aware nouveau driver is not what Nvidia do with their customer. The
> key argument is, can we move forward with the concept shared virtual address
> space b/t CPU and GPU? This is the foundation of HMM. We already have split
> address space support with other driver API. SVM, from its name, it means
> shared address space. Are we allowed to implement another driver model to
> allow SVM work, along with other APIs supporting split address space? Those two
> scheme can co-exist in harmony. We actually have real use cases to use both
> models in one application.
> 
> Hi Christian, Thomas,
> 
> In your scheme, GPU VA can != CPU VA. This does introduce some flexibility. But
> this scheme alone doesn't solve the problem of the proxy process/para-
> virtualization. You will still need a second mechanism to partition GPU VA space
> b/t guest process1 and guest process2 because proxy process (or the host
> hypervisor whatever you call it) use one single gpu page table for all the
> guest/client processes. GPU VA for different guest process can't overlap. If this
> second mechanism exist, we of course can use the same mechanism to partition
> CPU VA space between guest processes as well, then we can still use shared VA
> b/t CPU and GPU inside one process, but process1 and process2's address space
> (for both cpu and gpu) doesn't overlap. This second mechanism is the key to
> solve the proxy process problem, not the flexibility you introduced.
> 
> In practice, your scheme also have a risk of running out of process space because
> you have to partition whole address space b/t processes. Apparently allowing
> each guest process to own the whole process space and using separate GPU/CPU
> page table for different processes is a better solution than using single page table
> and partition process space b/t processes.
> 
> For Intel GPU, para-virtualization (xenGT, see https://github.com/intel/XenGT-
> Preview-kernel. It is similar idea of the proxy process in Flex's email. They are all
> SW-based GPU virtualization technology) is an old project. It is now replaced with
> HW accelerated SRIOV/system virtualization. XenGT is abandoned long time ago.
> So agreed your scheme add some flexibility. The question is, do we have a valid
> use case to use such flexibility? I don't see a single one ATM.
> 
> I also pictured into how to implement your scheme. You basically rejected the
> very foundation of hmm design which is shared address space b/t CPU and GPU.
> In your scheme, GPU VA = CPU VA + offset. In every single place where driver
> need to call hmm facilities such as hmm_range_fault, migrate_vma_setup and in
> mmu notifier call back, you need to offset the GPU VA to get a CPU VA. From
> application writer's perspective, whenever he want to use a CPU pointer in his
> GPU program, he add to add that offset. Do you think this is awkward?
> 
> Finally, to implement SVM, we need to implement some memory hint API which
> applies to a virtual address range across all GPU devices. For example, user would
> say, for this virtual address range, I prefer the backing store memory to be on
> GPU deviceX (because user knows deviceX would use this address range much
> more than other GPU devices or CPU). It doesn't make sense to me to make such
> API per device based. For example, if you tell device A that the preferred
> memory location is device B memory, this doesn't sounds correct to me because
> in your scheme, device A is not even aware of the existence of device B. right?
> 
> Regards,
> Oak
> > -----Original Message-----
> > From: Daniel Vetter <daniel@ffwll.ch>
> > Sent: Wednesday, January 31, 2024 4:15 AM
> > To: David Airlie <airlied@redhat.com>
> > Cc: Zeng, Oak <oak.zeng@intel.com>; Christian König
> > <christian.koenig@amd.com>; Thomas Hellström
> > <thomas.hellstrom@linux.intel.com>; Daniel Vetter <daniel@ffwll.ch>; Brost,
> > Matthew <matthew.brost@intel.com>; Felix Kuehling
> > <felix.kuehling@amd.com>; Welty, Brian <brian.welty@intel.com>; dri-
> > devel@lists.freedesktop.org; Ghimiray, Himal Prasad
> > <himal.prasad.ghimiray@intel.com>; Bommu, Krishnaiah
> > <krishnaiah.bommu@intel.com>; Gupta, saurabhg
> <saurabhg.gupta@intel.com>;
> > Vishwanathapura, Niranjana <niranjana.vishwanathapura@intel.com>; intel-
> > xe@lists.freedesktop.org; Danilo Krummrich <dakr@redhat.com>; Shah, Ankur
> N
> > <ankur.n.shah@intel.com>; jglisse@redhat.com; rcampbell@nvidia.com;
> > apopple@nvidia.com
> > Subject: Re: Making drm_gpuvm work across gpu devices
> >
> > On Wed, Jan 31, 2024 at 09:12:39AM +1000, David Airlie wrote:
> > > On Wed, Jan 31, 2024 at 8:29 AM Zeng, Oak <oak.zeng@intel.com> wrote:
> > > >
> > > > Hi Christian,
> > > >
> > > >
> > > >
> > > > Nvidia Nouveau driver uses exactly the same concept of SVM with HMM,
> > GPU address in the same process is exactly the same with CPU virtual address.
> It
> > is already in upstream Linux kernel. We Intel just follow the same direction for
> > our customers. Why we are not allowed?
> > >
> > >
> > > Oak, this isn't how upstream works, you don't get to appeal to
> > > customer or internal design. nouveau isn't "NVIDIA"'s and it certainly
> > > isn't something NVIDIA would ever suggest for their customers. We also
> > > likely wouldn't just accept NVIDIA's current solution upstream without
> > > some serious discussions. The implementation in nouveau was more of a
> > > sample HMM use case rather than a serious implementation. I suspect if
> > > we do get down the road of making nouveau an actual compute driver for
> > > SVM etc then it would have to severely change.
> >
> > Yeah on the nouveau hmm code specifically my gut feeling impression is
> > that we didn't really make friends with that among core kernel
> > maintainers. It's a bit too much just a tech demo to be able to merge the
> > hmm core apis for nvidia's out-of-tree driver.
> >
> > Also, a few years of learning and experience gaining happened meanwhile -
> > you always have to look at an api design in the context of when it was
> > designed, and that context changes all the time.
> >
> > Cheers, Sima
> > --
> > Daniel Vetter
> > Software Engineer, Intel Corporation
> > http://blog.ffwll.ch
Christian König Feb. 1, 2024, 8:52 a.m. UTC | #48
Hi Oak,

Am 31.01.24 um 21:17 schrieb Zeng, Oak:
> Hi Sima, Dave,
>
> I am well aware nouveau driver is not what Nvidia do with their customer. The key argument is, can we move forward with the concept shared virtual address space b/t CPU and GPU? This is the foundation of HMM. We already have split address space support with other driver API. SVM, from its name, it means shared address space. Are we allowed to implement another driver model to allow SVM work, along with other APIs supporting split address space? Those two scheme can co-exist in harmony. We actually have real use cases to use both models in one application.
>
> Hi Christian, Thomas,
>
> In your scheme, GPU VA can != CPU VA. This does introduce some flexibility. But this scheme alone doesn't solve the problem of the proxy process/para-virtualization. You will still need a second mechanism to partition GPU VA space b/t guest process1 and guest process2 because proxy process (or the host hypervisor whatever you call it) use one single gpu page table for all the guest/client processes. GPU VA for different guest process can't overlap. If this second mechanism exist, we of course can use the same mechanism to partition CPU VA space between guest processes as well, then we can still use shared VA b/t CPU and GPU inside one process, but process1 and process2's address space (for both cpu and gpu) doesn't overlap. This second mechanism is the key to solve the proxy process problem, not the flexibility you introduced.

That approach was suggested before, but it doesn't work. First of all 
you create a massive security hole when you give the GPU full access to 
the QEMU CPU process which runs the virtualization.

So even if you say CPU VA == GPU VA you still need some kind of 
flexibility otherwise you can't implement this use case securely.

In addition to this, the CPU VAs are usually controlled by the OS and not 
some driver, so to make sure that host and guest VAs don't overlap you 
would need to add some kind of sync between the guest and host OS kernels.

> In practice, your scheme also have a risk of running out of process space because you have to partition whole address space b/t processes. Apparently allowing each guest process to own the whole process space and using separate GPU/CPU page table for different processes is a better solution than using single page table and partition process space b/t processes.

Yeah, that you run out of address space is certainly possible. But as I 
said, CPUs are switching to 5 levels of page tables, and if you look at 
for example a "cat maps | cut -c-4 | sort -u" of a process you will find 
that only a handful of 4GiB segments are actually used, and thanks to 
recoverable page faults you can map those between host and client on 
demand. This gives you at least enough address space to handle a couple 
of thousand clients.

> For Intel GPU, para-virtualization (xenGT, see https://github.com/intel/XenGT-Preview-kernel. It is similar idea of the proxy process in Flex's email. They are all SW-based GPU virtualization technology) is an old project. It is now replaced with HW accelerated SRIOV/system virtualization. XenGT is abandoned long time ago. So agreed your scheme add some flexibility. The question is, do we have a valid use case to use such flexibility? I don't see a single one ATM.

Yeah, we have SRIOV functionality on AMD hw as well, but for some use 
cases it's just too inflexible.

> I also pictured into how to implement your scheme. You basically rejected the very foundation of hmm design which is shared address space b/t CPU and GPU. In your scheme, GPU VA = CPU VA + offset. In every single place where driver need to call hmm facilities such as hmm_range_fault, migrate_vma_setup and in mmu notifier call back, you need to offset the GPU VA to get a CPU VA. From application writer's perspective, whenever he want to use a CPU pointer in his GPU program, he add to add that offset. Do you think this is awkward?

What? This flexibility is there to prevent the application writer from 
having to apply any offset.

> Finally, to implement SVM, we need to implement some memory hint API which applies to a virtual address range across all GPU devices. For example, user would say, for this virtual address range, I prefer the backing store memory to be on GPU deviceX (because user knows deviceX would use this address range much more than other GPU devices or CPU). It doesn't make sense to me to make such API per device based. For example, if you tell device A that the preferred memory location is device B memory, this doesn't sounds correct to me because in your scheme, device A is not even aware of the existence of device B. right?

Correct, and while the additional flexibility is somewhat optional I 
strongly think that not having a centralized approach for device driver 
settings is mandatory.

Going away from the well-defined file descriptor based handling of 
device driver interfaces was one of the worst ideas I've ever seen in 
roughly thirty years of working with Unix-like operating systems. It 
basically broke everything, from reverse lookup handling for mmap() to 
file system privileges for hardware access.

As far as I can see anything which goes into the direction of opening 
/dev/kfd or /dev/xe_svm or something similar and saying that this then 
results in implicit SVM for your render nodes is an absolute no-go 
and would require an explicit acknowledgement from Linus on the design 
to do something like that.

What you can do is to have an IOCTL for the render node file descriptor 
which says this device should do SVM with the current CPU address space 
and another IOCTL which says range A..B is preferred to migrate to this 
device for HMM when the device runs into a page fault.
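
For illustration only, such a pair of per-device interfaces could look
roughly like the sketch below; the struct, flag and ioctl names are made
up and not an existing xe or drm uAPI:

/* Hypothetical sketch: enable SVM on this render node fd and tie it to
 * the current CPU address space (current->mm).
 */
struct drm_xe_svm_enable {
        __u64 vm_id;            /* VM that mirrors the CPU address space */
        __u64 flags;            /* MBZ for now */
};

/* Hypothetical sketch: advise that [start, start + length) should
 * preferably be migrated to this device when it faults on the range.
 */
struct drm_xe_svm_prefer_migrate {
        __u64 vm_id;
        __u64 start;            /* CPU virtual address, page aligned */
        __u64 length;           /* bytes, page aligned */
        __u64 flags;            /* e.g. migrate on fault vs. map in place */
};

#define DRM_IOCTL_XE_SVM_ENABLE \
        DRM_IOW(DRM_COMMAND_BASE + 0x40, struct drm_xe_svm_enable)
#define DRM_IOCTL_XE_SVM_PREFER_MIGRATE \
        DRM_IOW(DRM_COMMAND_BASE + 0x41, struct drm_xe_svm_prefer_migrate)

Because both are ordinary render node ioctls, the "which device prefers
this range" question stays inside the normal per-device, file descriptor
based interface.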

And yes, that obviously means shitty performance for device drivers 
because pages play ping-pong if userspace gives contradicting information 
for migrations, but that is how it is supposed to be.

Everything else which works over the borders of a device driver's scope 
should be implemented as a system call with the relevant review process 
around it.

Regards,
Christian.

>
> Regards,
> Oak
>> -----Original Message-----
>> From: Daniel Vetter <daniel@ffwll.ch>
>> Sent: Wednesday, January 31, 2024 4:15 AM
>> To: David Airlie <airlied@redhat.com>
>> Cc: Zeng, Oak <oak.zeng@intel.com>; Christian König
>> <christian.koenig@amd.com>; Thomas Hellström
>> <thomas.hellstrom@linux.intel.com>; Daniel Vetter <daniel@ffwll.ch>; Brost,
>> Matthew <matthew.brost@intel.com>; Felix Kuehling
>> <felix.kuehling@amd.com>; Welty, Brian <brian.welty@intel.com>; dri-
>> devel@lists.freedesktop.org; Ghimiray, Himal Prasad
>> <himal.prasad.ghimiray@intel.com>; Bommu, Krishnaiah
>> <krishnaiah.bommu@intel.com>; Gupta, saurabhg <saurabhg.gupta@intel.com>;
>> Vishwanathapura, Niranjana <niranjana.vishwanathapura@intel.com>; intel-
>> xe@lists.freedesktop.org; Danilo Krummrich <dakr@redhat.com>; Shah, Ankur N
>> <ankur.n.shah@intel.com>; jglisse@redhat.com; rcampbell@nvidia.com;
>> apopple@nvidia.com
>> Subject: Re: Making drm_gpuvm work across gpu devices
>>
>> On Wed, Jan 31, 2024 at 09:12:39AM +1000, David Airlie wrote:
>>> On Wed, Jan 31, 2024 at 8:29 AM Zeng, Oak <oak.zeng@intel.com> wrote:
>>>> Hi Christian,
>>>>
>>>>
>>>>
>>>> Nvidia Nouveau driver uses exactly the same concept of SVM with HMM,
>> GPU address in the same process is exactly the same with CPU virtual address. It
>> is already in upstream Linux kernel. We Intel just follow the same direction for
>> our customers. Why we are not allowed?
>>>
>>> Oak, this isn't how upstream works, you don't get to appeal to
>>> customer or internal design. nouveau isn't "NVIDIA"'s and it certainly
>>> isn't something NVIDIA would ever suggest for their customers. We also
>>> likely wouldn't just accept NVIDIA's current solution upstream without
>>> some serious discussions. The implementation in nouveau was more of a
>>> sample HMM use case rather than a serious implementation. I suspect if
>>> we do get down the road of making nouveau an actual compute driver for
>>> SVM etc then it would have to severely change.
>> Yeah on the nouveau hmm code specifically my gut feeling impression is
>> that we didn't really make friends with that among core kernel
>> maintainers. It's a bit too much just a tech demo to be able to merge the
>> hmm core apis for nvidia's out-of-tree driver.
>>
>> Also, a few years of learning and experience gaining happened meanwhile -
>> you always have to look at an api design in the context of when it was
>> designed, and that context changes all the time.
>>
>> Cheers, Sima
>> --
>> Daniel Vetter
>> Software Engineer, Intel Corporation
>> http://blog.ffwll.ch
Zeng, Oak Feb. 23, 2024, 8:12 p.m. UTC | #49
Hi Christian,

I'm going back to this old email to ask a question.

Quote from your email:
“Those ranges can then be used to implement the SVM feature required for higher level APIs and not something you need at the UAPI or even inside the low level kernel memory management.”
“SVM is a high level concept of OpenCL, Cuda, ROCm etc.. This should not have any influence on the design of the kernel UAPI.”

There are two category of SVM:

  1.  driver svm allocator: this is implemented in user space, e.g., cudaMallocManaged (cuda) or zeMemAllocShared (L0) or clSVMAlloc (openCL). Intel already has gem_create/vm_bind in xekmd and our umd implemented clSVMAlloc and zeMemAllocShared on top of gem_create/vm_bind. Range A..B of the process address space is mapped into a range C..D of the GPU address space, exactly as you said.
  2.  system svm allocator: This doesn’t introduce an extra driver API for memory allocation. Any valid CPU virtual address can be used directly and transparently in a GPU program without any extra driver API call. Quote from kernel Documentation/vm/hmm.rst: “Any application memory region (private anonymous, shared memory, or regular file backed memory) can be used by a device transparently” and “to share the address space by duplicating the CPU page table in the device page table so the same address points to the same physical memory for any valid main memory address in the process address space”. In the system svm allocator, we don’t need that A..B C..D mapping.

It looks like you were talking of 1). Were you?
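
To make the distinction concrete, here is a minimal userspace
illustration. clSVMAlloc() is the real OpenCL 2.0 entry point for 1);
everything else is deliberately simplified and not tied to a particular
driver:

#include <stdlib.h>
#include <CL/cl.h>

void svm_example(cl_context ctx)
{
        /* 1) driver svm allocator: memory comes from a driver API call;
         * the umd backs it with gem_create/vm_bind under the hood. */
        float *a = clSVMAlloc(ctx, CL_MEM_READ_WRITE, 4096, 0);

        /* 2) system svm allocator: any valid CPU pointer, no driver
         * allocation call at all; the kernel mirrors/migrates the pages
         * on GPU page fault. */
        float *b = malloc(4096);

        /* In both cases a GPU kernel can dereference the pointer
         * directly, because CPU VA == GPU VA. */
        (void)a;
        (void)b;
}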

Oak
From: Christian König <christian.koenig@amd.com>
Sent: Wednesday, January 24, 2024 3:33 AM
To: Zeng, Oak <oak.zeng@intel.com>; Danilo Krummrich <dakr@redhat.com>; Dave Airlie <airlied@redhat.com>; Daniel Vetter <daniel@ffwll.ch>; Felix Kuehling <felix.kuehling@amd.com>
Cc: Welty, Brian <brian.welty@intel.com>; dri-devel@lists.freedesktop.org; intel-xe@lists.freedesktop.org; Bommu, Krishnaiah <krishnaiah.bommu@intel.com>; Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>; Thomas.Hellstrom@linux.intel.com; Vishwanathapura, Niranjana <niranjana.vishwanathapura@intel.com>; Brost, Matthew <matthew.brost@intel.com>; Gupta, saurabhg <saurabhg.gupta@intel.com>
Subject: Re: Making drm_gpuvm work across gpu devices

Am 23.01.24 um 20:37 schrieb Zeng, Oak:

[SNIP]



Yes most API are per device based.



One exception I know is actually the kfd SVM API. If you look at the svm_ioctl function, it is per-process based. Each kfd_process represent a process across N gpu devices.

Yeah and that was a big mistake in my opinion. We should really not do that ever again.



Need to say, kfd SVM represent a shared virtual address space across CPU and all GPU devices on the system. This is by the definition of SVM (shared virtual memory). This is very different from our legacy gpu *device* driver which works for only one device (i.e., if you want one device to access another device's memory, you will have to use dma-buf export/import etc).

Exactly that thinking is what we have currently found as blocker for a virtualization projects. Having SVM as device independent feature which somehow ties to the process address space turned out to be an extremely bad idea.

The background is that this only works for some use cases but not all of them.

What's working much better is to just have a mirror functionality which says that a range A..B of the process address space is mapped into a range C..D of the GPU address space.

Those ranges can then be used to implement the SVM feature required for higher level APIs and not something you need at the UAPI or even inside the low level kernel memory management.

When you talk about migrating memory to a device you also do this on a per device basis and *not* tied to the process address space. If you then get crappy performance because userspace gave contradicting information where to migrate memory then that's a bug in userspace and not something the kernel should try to prevent somehow.

[SNIP]


I think if you start using the same drm_gpuvm for multiple devices you

will sooner or later start to run into the same mess we have seen with

KFD, where we moved more and more functionality from the KFD to the DRM

render node because we found that a lot of the stuff simply doesn't work

correctly with a single object to maintain the state.



As I understand it, KFD is designed to work across devices. A single pseudo /dev/kfd device represent all hardware gpu devices. That is why during kfd open, many pdd (process device data) is created, each for one hardware device for this process.

Yes, I'm perfectly aware of that. And I can only repeat myself that I see this design as a rather extreme failure. And I think it's one of the reasons why NVidia is so dominant with Cuda.

This whole approach KFD takes was designed with the idea of extending the CPU process into the GPUs, but this idea only works for a few use cases and is not something we should apply to drivers in general.

A very good example are virtualization use cases where you end up with CPU address != GPU address because the VAs are actually coming from the guest VM and not the host process.

SVM is a high level concept of OpenCL, Cuda, ROCm etc.. This should not have any influence on the design of the kernel UAPI.

If you want to do something similar as KFD for Xe I think you need to get explicit permission to do this from Dave and Daniel and maybe even Linus.

Regards,
Christian.
Christian König Feb. 27, 2024, 6:54 a.m. UTC | #50
Hi Oak,

Am 23.02.24 um 21:12 schrieb Zeng, Oak:
>
> Hi Christian,
>
> I go back this old email to ask a question.
>

sorry totally missed that one.

> Quote from your email:
>
> “Those ranges can then be used to implement the SVM feature required 
> for higher level APIs and not something you need at the UAPI or even 
> inside the low level kernel memory management.”
>
> “SVM is a high level concept of OpenCL, Cuda, ROCm etc.. This should 
> not have any influence on the design of the kernel UAPI.”
>
> There are two category of SVM:
>
>  1. driver svm allocator: this is implemented in user space,  i.g.,
>     cudaMallocManaged (cuda) or zeMemAllocShared (L0) or
>     clSVMAlloc(openCL). Intel already have gem_create/vm_bind in xekmd
>     and our umd implemented clSVMAlloc and zeMemAllocShared on top of
>     gem_create/vm_bind. Range A..B of the process address space is
>     mapped into a range C..D of the GPU address space, exactly as you
>     said.
>  2. system svm allocator:  This doesn’t introduce extra driver API for
>     memory allocation. Any valid CPU virtual address can be used
>     directly transparently in a GPU program without any extra driver
>     API call. Quote from kernel Documentation/vm/hmm.hst: “Any
>     application memory region (private anonymous, shared memory, or
>     regular file backed memory) can be used by a device transparently”
>     and “to share the address space by duplicating the CPU page table
>     in the device page table so the same address points to the same
>     physical memory for any valid main memory address in the process
>     address space”. In system svm allocator, we don’t need that A..B
>     C..D mapping.
>
> It looks like you were talking of 1). Were you?
>

No, even when you fully mirror the whole address space from a process 
into the GPU you still need to enable this somehow with an IOCTL.

And while enabling this you absolutely should specify to which part of 
the address space this mirroring applies and where it maps to.

I see the system svm allocator as just a special case of the driver 
allocator where not fully backed buffer objects are allocated, but 
rather sparse ones which are filled and migrated on demand.

Regards,
Christian.

> Oak
>
> *From:*Christian König <christian.koenig@amd.com>
> *Sent:* Wednesday, January 24, 2024 3:33 AM
> *To:* Zeng, Oak <oak.zeng@intel.com>; Danilo Krummrich 
> <dakr@redhat.com>; Dave Airlie <airlied@redhat.com>; Daniel Vetter 
> <daniel@ffwll.ch>; Felix Kuehling <felix.kuehling@amd.com>
> *Cc:* Welty, Brian <brian.welty@intel.com>; 
> dri-devel@lists.freedesktop.org; intel-xe@lists.freedesktop.org; 
> Bommu, Krishnaiah <krishnaiah.bommu@intel.com>; Ghimiray, Himal Prasad 
> <himal.prasad.ghimiray@intel.com>; Thomas.Hellstrom@linux.intel.com; 
> Vishwanathapura, Niranjana <niranjana.vishwanathapura@intel.com>; 
> Brost, Matthew <matthew.brost@intel.com>; Gupta, saurabhg 
> <saurabhg.gupta@intel.com>
> *Subject:* Re: Making drm_gpuvm work across gpu devices
>
> Am 23.01.24 um 20:37 schrieb Zeng, Oak:
>
>     [SNIP]
>
>     Yes most API are per device based.
>
>     One exception I know is actually the kfd SVM API. If you look at the svm_ioctl function, it is per-process based. Each kfd_process represent a process across N gpu devices.
>
>
> Yeah and that was a big mistake in my opinion. We should really not do 
> that ever again.
>
>
>     Need to say, kfd SVM represent a shared virtual address space across CPU and all GPU devices on the system. This is by the definition of SVM (shared virtual memory). This is very different from our legacy gpu *device* driver which works for only one device (i.e., if you want one device to access another device's memory, you will have to use dma-buf export/import etc).
>
>
> Exactly that thinking is what we have currently found as blocker for a 
> virtualization projects. Having SVM as device independent feature 
> which somehow ties to the process address space turned out to be an 
> extremely bad idea.
>
> The background is that this only works for some use cases but not all 
> of them.
>
> What's working much better is to just have a mirror functionality 
> which says that a range A..B of the process address space is mapped 
> into a range C..D of the GPU address space.
>
> Those ranges can then be used to implement the SVM feature required 
> for higher level APIs and not something you need at the UAPI or even 
> inside the low level kernel memory management.
>
> When you talk about migrating memory to a device you also do this on a 
> per device basis and *not* tied to the process address space. If you 
> then get crappy performance because userspace gave contradicting 
> information where to migrate memory then that's a bug in userspace and 
> not something the kernel should try to prevent somehow.
>
> [SNIP]
>
>         I think if you start using the same drm_gpuvm for multiple devices you
>
>         will sooner or later start to run into the same mess we have seen with
>
>         KFD, where we moved more and more functionality from the KFD to the DRM
>
>         render node because we found that a lot of the stuff simply doesn't work
>
>         correctly with a single object to maintain the state.
>
>     As I understand it, KFD is designed to work across devices. A single pseudo /dev/kfd device represent all hardware gpu devices. That is why during kfd open, many pdd (process device data) is created, each for one hardware device for this process.
>
>
> Yes, I'm perfectly aware of that. And I can only repeat myself that I 
> see this design as a rather extreme failure. And I think it's one of 
> the reasons why NVidia is so dominant with Cuda.
>
> This whole approach KFD takes was designed with the idea of extending 
> the CPU process into the GPUs, but this idea only works for a few use 
> cases and is not something we should apply to drivers in general.
>
> A very good example are virtualization use cases where you end up with 
> CPU address != GPU address because the VAs are actually coming from 
> the guest VM and not the host process.
>
> SVM is a high level concept of OpenCL, Cuda, ROCm etc.. This should 
> not have any influence on the design of the kernel UAPI.
>
> If you want to do something similar as KFD for Xe I think you need to 
> get explicit permission to do this from Dave and Daniel and maybe even 
> Linus.
>
> Regards,
> Christian.
>
Zeng, Oak Feb. 27, 2024, 3:58 p.m. UTC | #51
From: Christian König <christian.koenig@amd.com>
Sent: Tuesday, February 27, 2024 1:54 AM
To: Zeng, Oak <oak.zeng@intel.com>; Danilo Krummrich <dakr@redhat.com>; Dave Airlie <airlied@redhat.com>; Daniel Vetter <daniel@ffwll.ch>; Felix Kuehling <felix.kuehling@amd.com>; jglisse@redhat.com
Cc: Welty, Brian <brian.welty@intel.com>; dri-devel@lists.freedesktop.org; intel-xe@lists.freedesktop.org; Bommu, Krishnaiah <krishnaiah.bommu@intel.com>; Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>; Thomas.Hellstrom@linux.intel.com; Vishwanathapura, Niranjana <niranjana.vishwanathapura@intel.com>; Brost, Matthew <matthew.brost@intel.com>; Gupta, saurabhg <saurabhg.gupta@intel.com>
Subject: Re: Making drm_gpuvm work across gpu devices

Hi Oak,
Am 23.02.24 um 21:12 schrieb Zeng, Oak:
Hi Christian,

I go back this old email to ask a question.

sorry totally missed that one.



Quote from your email:
“Those ranges can then be used to implement the SVM feature required for higher level APIs and not something you need at the UAPI or even inside the low level kernel memory management.”
“SVM is a high level concept of OpenCL, Cuda, ROCm etc.. This should not have any influence on the design of the kernel UAPI.”

There are two category of SVM:

  1.  driver svm allocator: this is implemented in user space,  i.g., cudaMallocManaged (cuda) or zeMemAllocShared (L0) or clSVMAlloc(openCL). Intel already have gem_create/vm_bind in xekmd and our umd implemented clSVMAlloc and zeMemAllocShared on top of gem_create/vm_bind. Range A..B of the process address space is mapped into a range C..D of the GPU address space, exactly as you said.
  2.  system svm allocator:  This doesn’t introduce extra driver API for memory allocation. Any valid CPU virtual address can be used directly transparently in a GPU program without any extra driver API call. Quote from kernel Documentation/vm/hmm.hst: “Any application memory region (private anonymous, shared memory, or regular file backed memory) can be used by a device transparently” and “to share the address space by duplicating the CPU page table in the device page table so the same address points to the same physical memory for any valid main memory address in the process address space”. In system svm allocator, we don’t need that A..B C..D mapping.

It looks like you were talking of 1). Were you?

No, even when you fully mirror the whole address space from a process into the GPU you still need to enable this somehow with an IOCTL.

And while enabling this you absolutely should specify to which part of the address space this mirroring applies and where it maps to.


Lets say we have a hardware platform where both CPU and GPU support 57bit virtual address range, how do you decide “which part of the address space this mirroring applies”? You have to mirror the whole address space (0~2^57-1), do you? As you designed it, the gigantic vm_bind/mirroring happens at the process initialization time, and at that time, you don’t know which part of the address space will be used for gpu program.


I see the system svm allocator as just a special case of the driver allocator where not fully backed buffer objects are allocated, but rather sparse one which are filled and migrated on demand.

Above statement is true to me. We don’t have BO for system svm allocator. It is a sparse one as we don’t map the whole vma to GPU. Our migration policy decide which pages/how much of the vma is migrated/mapped to GPU page table.

The difference b/t your mind and mine is, you want a gigantic vma (created during the gigantic vm_bind) to be sparsely populated to gpu. While I thought vma (xe_vma in xekmd codes) is a place to save memory attributes (such as caching, user preferred placement etc). All those memory attributes are range based, i.e., user can specify range1 is cached while range2 is uncached. So I don’t see how you can manage it with the gigantic vma.

Regards,
Oak


Regards,
Christian.
Zeng, Oak Feb. 28, 2024, 7:51 p.m. UTC | #52
The mail wasn’t indented/prefaced correctly. Manually formatting it.


From: Christian König <christian.koenig@amd.com<mailto:christian.koenig@amd.com>>
Sent: Tuesday, February 27, 2024 1:54 AM
To: Zeng, Oak <oak.zeng@intel.com<mailto:oak.zeng@intel.com>>; Danilo Krummrich <dakr@redhat.com<mailto:dakr@redhat.com>>; Dave Airlie <airlied@redhat.com<mailto:airlied@redhat.com>>; Daniel Vetter <daniel@ffwll.ch<mailto:daniel@ffwll.ch>>; Felix Kuehling <felix.kuehling@amd.com<mailto:felix.kuehling@amd.com>>; jglisse@redhat.com<mailto:jglisse@redhat.com>
Cc: Welty, Brian <brian.welty@intel.com<mailto:brian.welty@intel.com>>; dri-devel@lists.freedesktop.org<mailto:dri-devel@lists.freedesktop.org>; intel-xe@lists.freedesktop.org<mailto:intel-xe@lists.freedesktop.org>; Bommu, Krishnaiah <krishnaiah.bommu@intel.com<mailto:krishnaiah.bommu@intel.com>>; Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com<mailto:himal.prasad.ghimiray@intel.com>>; Thomas.Hellstrom@linux.intel.com<mailto:Thomas.Hellstrom@linux.intel.com>; Vishwanathapura, Niranjana <niranjana.vishwanathapura@intel.com<mailto:niranjana.vishwanathapura@intel.com>>; Brost, Matthew <matthew.brost@intel.com<mailto:matthew.brost@intel.com>>; Gupta, saurabhg <saurabhg.gupta@intel.com<mailto:saurabhg.gupta@intel.com>>
Subject: Re: Making drm_gpuvm work across gpu devices

Hi Oak,
Am 23.02.24 um 21:12 schrieb Zeng, Oak:
Hi Christian,

I go back this old email to ask a question.

sorry totally missed that one.


Quote from your email:
“Those ranges can then be used to implement the SVM feature required for higher level APIs and not something you need at the UAPI or even inside the low level kernel memory management.”
“SVM is a high level concept of OpenCL, Cuda, ROCm etc.. This should not have any influence on the design of the kernel UAPI.”

There are two category of SVM:

1.       driver svm allocator: this is implemented in user space,  i.g., cudaMallocManaged (cuda) or zeMemAllocShared (L0) or clSVMAlloc(openCL). Intel already have gem_create/vm_bind in xekmd and our umd implemented clSVMAlloc and zeMemAllocShared on top of gem_create/vm_bind. Range A..B of the process address space is mapped into a range C..D of the GPU address space, exactly as you said.

2.       system svm allocator:  This doesn’t introduce extra driver API for memory allocation. Any valid CPU virtual address can be used directly transparently in a GPU program without any extra driver API call. Quote from kernel Documentation/vm/hmm.hst: “Any application memory region (private anonymous, shared memory, or regular file backed memory) can be used by a device transparently” and “to share the address space by duplicating the CPU page table in the device page table so the same address points to the same physical memory for any valid main memory address in the process address space”. In system svm allocator, we don’t need that A..B C..D mapping.

It looks like you were talking of 1). Were you?

No, even when you fully mirror the whole address space from a process into the GPU you still need to enable this somehow with an IOCTL.

And while enabling this you absolutely should specify to which part of the address space this mirroring applies and where it maps to.


[Zeng, Oak]
Let’s say we have a hardware platform where both CPU and GPU support a 57bit virtual address range (57bit is just an example; the statement applies to any address range). How do you decide “which part of the address space this mirroring applies” to? You have to mirror the whole address space [0~2^57-1], don’t you? As you designed it, the gigantic vm_bind/mirroring happens at process initialization time, and at that time, you don’t know which part of the address space will be used for the gpu program. Remember that for the system allocator, *any* valid CPU address can be used for a GPU program. If you add an offset to [0~2^57-1], you get an address out of the 57bit address range. Is this a valid concern?


I see the system svm allocator as just a special case of the driver allocator where not fully backed buffer objects are allocated, but rather sparse one which are filled and migrated on demand.


[Zeng, Oak]
The above statement is true to me. We don’t have a BO for the system svm allocator. It is a sparse one as we can sparsely map the vma to the GPU. Our migration policy decides which pages/how much of the vma is migrated/mapped to the GPU page table.

The difference b/t your view and mine is, you want a gigantic vma (created during the gigantic vm_bind) to be sparsely populated on the gpu, while I thought of a vma (xe_vma in the xekmd code) as a place to save memory attributes (such as caching, user preferred placement etc). All those memory attributes are range based, i.e., the user can specify that range1 is cached while range2 is uncached. So I don’t see how you can manage it with the gigantic vma. Do you split your gigantic vma later to save range based memory attributes?

Regards,
Oak


Regards,
Christian.
Christian König Feb. 29, 2024, 9:41 a.m. UTC | #53
Am 28.02.24 um 20:51 schrieb Zeng, Oak:
>
> The mail wasn’t indent/preface correctly. Manually format it.
>
> *From:*Christian König <christian.koenig@amd.com>
> *Sent:* Tuesday, February 27, 2024 1:54 AM
> *To:* Zeng, Oak <oak.zeng@intel.com>; Danilo Krummrich 
> <dakr@redhat.com>; Dave Airlie <airlied@redhat.com>; Daniel Vetter 
> <daniel@ffwll.ch>; Felix Kuehling <felix.kuehling@amd.com>; 
> jglisse@redhat.com
> *Cc:* Welty, Brian <brian.welty@intel.com>; 
> dri-devel@lists.freedesktop.org; intel-xe@lists.freedesktop.org; 
> Bommu, Krishnaiah <krishnaiah.bommu@intel.com>; Ghimiray, Himal Prasad 
> <himal.prasad.ghimiray@intel.com>; Thomas.Hellstrom@linux.intel.com; 
> Vishwanathapura, Niranjana <niranjana.vishwanathapura@intel.com>; 
> Brost, Matthew <matthew.brost@intel.com>; Gupta, saurabhg 
> <saurabhg.gupta@intel.com>
> *Subject:* Re: Making drm_gpuvm work across gpu devices
>
> Hi Oak,
>
> Am 23.02.24 um 21:12 schrieb Zeng, Oak:
>
>     Hi Christian,
>
>     I go back this old email to ask a question.
>
>
> sorry totally missed that one.
>
>     Quote from your email:
>
>     “Those ranges can then be used to implement the SVM feature
>     required for higher level APIs and not something you need at the
>     UAPI or even inside the low level kernel memory management.”
>
>     “SVM is a high level concept of OpenCL, Cuda, ROCm etc.. This
>     should not have any influence on the design of the kernel UAPI.”
>
>     There are two category of SVM:
>
>     1.driver svm allocator: this is implemented in user space,  i.g.,
>     cudaMallocManaged (cuda) or zeMemAllocShared (L0) or
>     clSVMAlloc(openCL). Intel already have gem_create/vm_bind in xekmd
>     and our umd implemented clSVMAlloc and zeMemAllocShared on top of
>     gem_create/vm_bind. Range A..B of the process address space is
>     mapped into a range C..D of the GPU address space, exactly as you
>     said.
>
>     2.system svm allocator:  This doesn’t introduce extra driver API
>     for memory allocation. Any valid CPU virtual address can be used
>     directly transparently in a GPU program without any extra driver
>     API call. Quote from kernel Documentation/vm/hmm.hst: “Any
>     application memory region (private anonymous, shared memory, or
>     regular file backed memory) can be used by a device transparently”
>     and “to share the address space by duplicating the CPU page table
>     in the device page table so the same address points to the same
>     physical memory for any valid main memory address in the process
>     address space”. In system svm allocator, we don’t need that A..B
>     C..D mapping.
>
>     It looks like you were talking of 1). Were you?
>
>
> No, even when you fully mirror the whole address space from a process 
> into the GPU you still need to enable this somehow with an IOCTL.
>
> And while enabling this you absolutely should specify to which part of 
> the address space this mirroring applies and where it maps to.
>
> */[Zeng, Oak] /*
>
> Lets say we have a hardware platform where both CPU and GPU support 
> 57bit(use it for example. The statement apply to any address range) 
> virtual address range, how do you decide “which part of the address 
> space this mirroring applies”? You have to mirror the whole address 
> space [0~2^57-1], do you? As you designed it, the gigantic 
> vm_bind/mirroring happens at the process initialization time, and at 
> that time, you don’t know which part of the address space will be used 
> for gpu program. Remember for system allocator, *any* valid CPU 
> address can be used for GPU program.  If you add an offset to 
> [0~2^57-1], you get an address out of 57bit address range. Is this a 
> valid concern?
>

Well you can perfectly mirror on demand. You just need something similar 
to userfaultfd() for the GPU. This way you don't need to mirror the full 
address space, but can rather work with large chunks created on demand, 
let's say 1GiB or something like that.
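
As a rough driver-side sketch of that chunked-on-demand idea (all of the
structs and helpers below are made up; only the rounding of the faulting
address to a chunk is the point):

#define MIRROR_CHUNK_SHIFT      30      /* 1 GiB chunks, as an example */
#define MIRROR_CHUNK_SIZE       (1ULL << MIRROR_CHUNK_SHIFT)

/* Hypothetical handler: on a GPU fault, create the mirror for the chunk
 * containing the faulting CPU address instead of the whole address space.
 */
static int mirror_fault_chunk(struct gpu_vm *vm, u64 fault_addr)
{
        u64 start = fault_addr & ~(MIRROR_CHUNK_SIZE - 1);

        if (chunk_already_mirrored(vm, start))
                return 0;

        /* create the GPU-side range on demand; the usual
         * hmm_range_fault()/page table update path then populates it
         * lazily */
        return create_mirror_range(vm, start, MIRROR_CHUNK_SIZE);
}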

The virtual address space is basically just a hardware functionality to 
route memory accesses. While the mirroring approach is a very common use 
case for data-centers and high performance computing there are quite a 
number of different use cases which make use of virtual address space 
in a non "standard" fashion. The native context approach for VMs is just 
one example, databases and emulators are another one.

>
>
> I see the system svm allocator as just a special case of the driver 
> allocator where not fully backed buffer objects are allocated, but 
> rather sparse one which are filled and migrated on demand.
>
> */[Zeng, Oak] /*
>
> Above statement is true to me. We don’t have BO for system svm 
> allocator. It is a sparse one as we can sparsely map vma to GPU. Our 
> migration policy decide which pages/how much of the vma is 
> migrated/mapped to GPU page table.
>
> *//*
>
> The difference b/t your mind and mine is, you want a gigantic vma 
> (created during the gigantic vm_bind) to be sparsely populated to gpu. 
> While I thought vma (xe_vma in xekmd codes) is a place to save memory 
> attributes (such as caching, user preferred placement etc). All those 
> memory attributes are range based, i.e., user can specify range1 is 
> cached while range2 is uncached. So I don’t see how you can manage it 
> with the gigantic vma. Do you split your gigantic vma later to save 
> range based memory attributes?
>

Yes, exactly that. I mean the splitting and eventually merging of ranges 
is a standard functionality of the GPUVM code.

So when you need to store additional attributes per range then I would 
strongly suggest to make use of this splitting and merging functionality 
as well.

So basically an IOCTL which says range A..B of the GPU address space is 
mapped to offset X of the CPU address space with parameters Y (caching, 
migration behavior etc..). That is essentially the same we have for 
mapping GEM objects, the provider of the backing store is just something 
different.
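
Expressed as a sketch (the field names are placeholders; only the shape
of the interface matters):

/* Hypothetical sketch of the "range A..B maps to offset X with
 * parameters Y" ioctl described above; the provider of the backing
 * store is the CPU page table instead of a GEM object.
 */
struct drm_gpu_mirror_map {
        __u64 gpu_start;        /* A: start of the GPU VA range */
        __u64 gpu_end;          /* B: end of the GPU VA range */
        __u64 cpu_offset;       /* X: CPU VA the range mirrors */
        __u32 caching;          /* Y: cached / uncached / ... */
        __u32 migration;        /* Y: preferred placement, fault policy */
};

Giving a sub-range different attributes later then falls out of the
existing GPUVM split/merge machinery, the same way it does for
GEM-backed mappings.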

Regards,
Christian.

> Regards,
>
> Oak
>
>
>
> Regards,
> Christian.
>
Zeng, Oak Feb. 29, 2024, 4:05 p.m. UTC | #54
Hi Christian,

Can you elaborate the mirror on demand/userfaultfd idea?

userfaultfd is a way for user space to take over page fault handling of a user-registered range. At first look, it seems you want a user space page fault handler to mirror a large chunk of memory to the GPU. I would imagine this handler is in the UMD, because the whole purpose of the system svm allocator is to allow the user to use a cpu address (such as a malloc’ed one) in a gpu program without an extra driver api call. So the registration and mirroring of this large chunk can’t be in the user program. With this, I pictured the sequence below:

During process initialization time, the umd registers a large chunk (let’s say 1GiB) of memory using userfaultfd; this includes:

  1.  mem = mmap(NULL, 1GiB, MAP_ANON)
  2.  register range [mem, mem + 1GiB] through userfaultfd
  3.  after that, the umd can wait on page fault events. When a page fault happens, the umd calls vm_bind to mirror the [mem, mem+1GiB] range to the GPU

now in a user program:
                ptr = malloc(size);
                submit a GPU program which uses ptr
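
Roughly in code, the picture above would be something like this;
xe_vm_bind_mirror() is a made-up stand-in for the vm_bind call, the rest
is the standard userfaultfd uAPI:

#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <linux/userfaultfd.h>
#include <unistd.h>

#define CHUNK (1ULL << 30)                      /* 1 GiB */

/* made-up umd helper wrapping the vm_bind ioctl */
extern void xe_vm_bind_mirror(void *addr, size_t len);

static void umd_init_mirror(void)
{
        long uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
        struct uffdio_api api = { .api = UFFD_API };
        void *mem = mmap(NULL, CHUNK, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        struct uffdio_register reg = {
                .range = { .start = (unsigned long)mem, .len = CHUNK },
                .mode  = UFFDIO_REGISTER_MODE_MISSING,
        };
        struct uffd_msg msg;

        ioctl(uffd, UFFDIO_API, &api);          /* steps 1 and 2 */
        ioctl(uffd, UFFDIO_REGISTER, &reg);

        /* step 3: wait for a *CPU* fault in [mem, mem + CHUNK) and then
         * mirror the chunk to the GPU via vm_bind */
        while (read(uffd, &msg, sizeof(msg)) == sizeof(msg)) {
                if (msg.event == UFFD_EVENT_PAGEFAULT)
                        xe_vm_bind_mirror(mem, CHUNK);
        }
}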

This is what I can picture. It doesn’t work because ptr can’t belong to the [mem, mem+1GiB] range. So you can’t vm_bind/mirror ptr on demand to the GPU.

Also, the page fault event in 3) above can’t happen at all. A page fault only happens when the *CPU* accesses mem, but in our case it could be that *only the GPU* touches the memory.

The point is, with the system svm allocator, the user can use *any* valid CPU address in a GPU program. This address can be anything in the range of [0~2^57-1]. This design requirement is quite simple and clean. I don’t see how to solve this with userfaultfd/on-demand mirroring.

Regards,
Oak

From: Christian König <christian.koenig@amd.com>
Sent: Thursday, February 29, 2024 4:41 AM
To: Zeng, Oak <oak.zeng@intel.com>; Danilo Krummrich <dakr@redhat.com>; Dave Airlie <airlied@redhat.com>; Daniel Vetter <daniel@ffwll.ch>; Felix Kuehling <felix.kuehling@amd.com>; jglisse@redhat.com
Cc: Welty, Brian <brian.welty@intel.com>; dri-devel@lists.freedesktop.org; intel-xe@lists.freedesktop.org; Bommu, Krishnaiah <krishnaiah.bommu@intel.com>; Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>; Thomas.Hellstrom@linux.intel.com; Vishwanathapura, Niranjana <niranjana.vishwanathapura@intel.com>; Brost, Matthew <matthew.brost@intel.com>; Gupta, saurabhg <saurabhg.gupta@intel.com>
Subject: Re: Making drm_gpuvm work across gpu devices

Am 28.02.24 um 20:51 schrieb Zeng, Oak:


The mail wasn’t indent/preface correctly. Manually format it.


From: Christian König <christian.koenig@amd.com<mailto:christian.koenig@amd.com>>
Sent: Tuesday, February 27, 2024 1:54 AM
To: Zeng, Oak <oak.zeng@intel.com<mailto:oak.zeng@intel.com>>; Danilo Krummrich <dakr@redhat.com<mailto:dakr@redhat.com>>; Dave Airlie <airlied@redhat.com<mailto:airlied@redhat.com>>; Daniel Vetter <daniel@ffwll.ch<mailto:daniel@ffwll.ch>>; Felix Kuehling <felix.kuehling@amd.com<mailto:felix.kuehling@amd.com>>; jglisse@redhat.com<mailto:jglisse@redhat.com>
Cc: Welty, Brian <brian.welty@intel.com<mailto:brian.welty@intel.com>>; dri-devel@lists.freedesktop.org<mailto:dri-devel@lists.freedesktop.org>; intel-xe@lists.freedesktop.org<mailto:intel-xe@lists.freedesktop.org>; Bommu, Krishnaiah <krishnaiah.bommu@intel.com<mailto:krishnaiah.bommu@intel.com>>; Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com<mailto:himal.prasad.ghimiray@intel.com>>; Thomas.Hellstrom@linux.intel.com<mailto:Thomas.Hellstrom@linux.intel.com>; Vishwanathapura, Niranjana <niranjana.vishwanathapura@intel.com<mailto:niranjana.vishwanathapura@intel.com>>; Brost, Matthew <matthew.brost@intel.com<mailto:matthew.brost@intel.com>>; Gupta, saurabhg <saurabhg.gupta@intel.com<mailto:saurabhg.gupta@intel.com>>
Subject: Re: Making drm_gpuvm work across gpu devices

Hi Oak,
Am 23.02.24 um 21:12 schrieb Zeng, Oak:
Hi Christian,

I go back this old email to ask a question.

sorry totally missed that one.



Quote from your email:
“Those ranges can then be used to implement the SVM feature required for higher level APIs and not something you need at the UAPI or even inside the low level kernel memory management.”
“SVM is a high level concept of OpenCL, Cuda, ROCm etc.. This should not have any influence on the design of the kernel UAPI.”

There are two category of SVM:

1.       driver svm allocator: this is implemented in user space,  i.g., cudaMallocManaged (cuda) or zeMemAllocShared (L0) or clSVMAlloc(openCL). Intel already have gem_create/vm_bind in xekmd and our umd implemented clSVMAlloc and zeMemAllocShared on top of gem_create/vm_bind. Range A..B of the process address space is mapped into a range C..D of the GPU address space, exactly as you said.

2.       system svm allocator:  This doesn’t introduce extra driver API for memory allocation. Any valid CPU virtual address can be used directly transparently in a GPU program without any extra driver API call. Quote from kernel Documentation/vm/hmm.hst: “Any application memory region (private anonymous, shared memory, or regular file backed memory) can be used by a device transparently” and “to share the address space by duplicating the CPU page table in the device page table so the same address points to the same physical memory for any valid main memory address in the process address space”. In system svm allocator, we don’t need that A..B C..D mapping.

It looks like you were talking of 1). Were you?

No, even when you fully mirror the whole address space from a process into the GPU you still need to enable this somehow with an IOCTL.

And while enabling this you absolutely should specify to which part of the address space this mirroring applies and where it maps to.


[Zeng, Oak]
Lets say we have a hardware platform where both CPU and GPU support 57bit(use it for example. The statement apply to any address range) virtual address range, how do you decide “which part of the address space this mirroring applies”? You have to mirror the whole address space [0~2^57-1], do you? As you designed it, the gigantic vm_bind/mirroring happens at the process initialization time, and at that time, you don’t know which part of the address space will be used for gpu program. Remember for system allocator, *any* valid CPU address can be used for GPU program.  If you add an offset to [0~2^57-1], you get an address out of 57bit address range. Is this a valid concern?

Well you can perfectly mirror on demand. You just need something similar to userfaultfd() for the GPU. This way you don't need to mirror the full address space, but can rather work with large chunks created on demand, let's say 1GiB or something like that.

The virtual address space is basically just a hardware functionality to route memory accesses. While the mirroring approach is a very common use case for data-centers and high performance computing there are quite a number of different use cases which makes use of virtual address space in a non "standard" fashion. The native context approach for VMs is just one example, databases and emulators are another one.




I see the system svm allocator as just a special case of the driver allocator where not fully backed buffer objects are allocated, but rather sparse one which are filled and migrated on demand.


[Zeng, Oak]
Above statement is true to me. We don’t have BO for system svm allocator. It is a sparse one as we can sparsely map vma to GPU. Our migration policy decide which pages/how much of the vma is migrated/mapped to GPU page table.

The difference b/t your mind and mine is, you want a gigantic vma (created during the gigantic vm_bind) to be sparsely populated to gpu. While I thought vma (xe_vma in xekmd codes) is a place to save memory attributes (such as caching, user preferred placement etc). All those memory attributes are range based, i.e., user can specify range1 is cached while range2 is uncached. So I don’t see how you can manage it with the gigantic vma. Do you split your gigantic vma later to save range based memory attributes?

Yes, exactly that. I mean the splitting and eventually merging of ranges is a standard functionality of the GPUVM code.

So when you need to store additional attributes per range then I would strongly suggest to make use of this splitting and merging functionality as well.

So basically an IOCTL which says range A..B of the GPU address space is mapped to offset X of the CPU address space with parameters Y (caching, migration behavior etc..). That is essentially the same we have for mapping GEM objects, the provider of the backing store is just something different.

Regards,
Christian.



Regards,
Oak


Regards,
Christian.
Thomas Hellstrom Feb. 29, 2024, 5:12 p.m. UTC | #55
Hi, Christian.

On Thu, 2024-02-29 at 10:41 +0100, Christian König wrote:
> Am 28.02.24 um 20:51 schrieb Zeng, Oak:
> > 
> > The mail wasn’t indent/preface correctly. Manually format it.
> > 
> > *From:*Christian König <christian.koenig@amd.com>
> > *Sent:* Tuesday, February 27, 2024 1:54 AM
> > *To:* Zeng, Oak <oak.zeng@intel.com>; Danilo Krummrich 
> > <dakr@redhat.com>; Dave Airlie <airlied@redhat.com>; Daniel Vetter 
> > <daniel@ffwll.ch>; Felix Kuehling <felix.kuehling@amd.com>; 
> > jglisse@redhat.com
> > *Cc:* Welty, Brian <brian.welty@intel.com>; 
> > dri-devel@lists.freedesktop.org; intel-xe@lists.freedesktop.org; 
> > Bommu, Krishnaiah <krishnaiah.bommu@intel.com>; Ghimiray, Himal
> > Prasad 
> > <himal.prasad.ghimiray@intel.com>;
> > Thomas.Hellstrom@linux.intel.com; 
> > Vishwanathapura, Niranjana <niranjana.vishwanathapura@intel.com>; 
> > Brost, Matthew <matthew.brost@intel.com>; Gupta, saurabhg 
> > <saurabhg.gupta@intel.com>
> > *Subject:* Re: Making drm_gpuvm work across gpu devices
> > 
> > Hi Oak,
> > 
> > Am 23.02.24 um 21:12 schrieb Zeng, Oak:
> > 
> >     Hi Christian,
> > 
> >     I go back this old email to ask a question.
> > 
> > 
> > sorry totally missed that one.
> > 
> >     Quote from your email:
> > 
> >     “Those ranges can then be used to implement the SVM feature
> >     required for higher level APIs and not something you need at
> > the
> >     UAPI or even inside the low level kernel memory management.”
> > 
> >     “SVM is a high level concept of OpenCL, Cuda, ROCm etc.. This
> >     should not have any influence on the design of the kernel
> > UAPI.”
> > 
> >     There are two category of SVM:
> > 
> >     1.driver svm allocator: this is implemented in user space,
> >  i.g.,
> >     cudaMallocManaged (cuda) or zeMemAllocShared (L0) or
> >     clSVMAlloc(openCL). Intel already have gem_create/vm_bind in
> > xekmd
> >     and our umd implemented clSVMAlloc and zeMemAllocShared on top
> > of
> >     gem_create/vm_bind. Range A..B of the process address space is
> >     mapped into a range C..D of the GPU address space, exactly as
> > you
> >     said.
> > 
> >     2.system svm allocator:  This doesn’t introduce extra driver
> > API
> >     for memory allocation. Any valid CPU virtual address can be
> > used
> >     directly transparently in a GPU program without any extra
> > driver
> >     API call. Quote from kernel Documentation/vm/hmm.hst: “Any
> >     application memory region (private anonymous, shared memory, or
> >     regular file backed memory) can be used by a device
> > transparently”
> >     and “to share the address space by duplicating the CPU page
> > table
> >     in the device page table so the same address points to the same
> >     physical memory for any valid main memory address in the
> > process
> >     address space”. In system svm allocator, we don’t need that
> > A..B
> >     C..D mapping.
> > 
> >     It looks like you were talking of 1). Were you?
> > 
> > 
> > No, even when you fully mirror the whole address space from a
> > process 
> > into the GPU you still need to enable this somehow with an IOCTL.
> > 
> > And while enabling this you absolutely should specify to which part
> > of 
> > the address space this mirroring applies and where it maps to.
> > 
> > */[Zeng, Oak] /*
> > 
> > Lets say we have a hardware platform where both CPU and GPU support
> > 57bit(use it for example. The statement apply to any address range)
> > virtual address range, how do you decide “which part of the address
> > space this mirroring applies”? You have to mirror the whole address
> > space [0~2^57-1], do you? As you designed it, the gigantic 
> > vm_bind/mirroring happens at the process initialization time, and
> > at 
> > that time, you don’t know which part of the address space will be
> > used 
> > for gpu program. Remember for system allocator, *any* valid CPU 
> > address can be used for GPU program.  If you add an offset to 
> > [0~2^57-1], you get an address out of 57bit address range. Is this
> > a 
> > valid concern?
> > 
> 
> Well you can perfectly mirror on demand. You just need something
> similar 
> to userfaultfd() for the GPU. This way you don't need to mirror the
> full 
> address space, but can rather work with large chunks created on
> demand, 
> let's say 1GiB or something like that.


What we're looking at as the current design is an augmented userptr
(A..B -> C..D mapping) which is internally sparsely populated in
chunks. The KMD manages the population using gpu pagefaults. We
acknowledge that some parts of this mirror will not have a valid CPU
mapping; that is, there is no vma, so a gpu page-fault that resolves
to such a mirror address will cause an error (rough shape of that
check sketched below). Would you have any concerns / objections
against such an approach?
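
For the no-CPU-mapping case, the check on the GPU fault path would be
roughly the following (the helper name is made up; find_vma() and
mmap_read_lock() are the standard core mm calls):

#include <linux/mm.h>

/* Sketch only: on a GPU fault into the mirror, refuse addresses that
 * have no backing CPU vma instead of trying to populate them.
 */
static int mirror_check_cpu_vma(struct mm_struct *mm, unsigned long addr)
{
        struct vm_area_struct *vma;
        int ret = 0;

        mmap_read_lock(mm);
        vma = find_vma(mm, addr);
        if (!vma || addr < vma->vm_start)
                ret = -EFAULT;  /* no CPU vma: fatal for this fault */
        mmap_read_unlock(mm);

        return ret;
}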

Thanks,
Thomas
Zeng, Oak Feb. 29, 2024, 6:22 p.m. UTC | #56
Hi Christian/Daniel/Dave/Felix/Thomas, and all,

We have been refining our design internally in the past month. Below is our plan. Please let us know if you have any concerns.

1) Remove the pseudo /dev/xe-svm device. All system allocator interfaces will be through /dev/dri/render devices, not a global interface.

2) Unify the userptr and system allocator code. We will treat userptr as a special case of the system allocator without migration capability. We will introduce the hmmptr concept for the system allocator. We will extend the vm_bind API to map a range A..B of the process address space to a range C..D of the GPU address space for hmmptr (a rough sketch of one possible uAPI shape follows after this list). For hmmptr, if a gpu program accesses an address which is not backed by a core mm vma, it is a fatal error.

3) Multiple device support. We have identified p2p use-cases where we might want to leave memory on a foreign device or direct migrations to a foreign device, and therefore might need a global structure that tracks or caches the migration state per process address space. We haven't completely settled this design yet. We will come back when we have more details.

4) We will first work on this code in xekmd, then look at moving some common code to the drm layer so it can also be used by other vendors.
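
A rough sketch of one possible uAPI shape for the hmmptr extension in
point 2 (all names and values below are placeholders, not a committed
interface):

/* Placeholder sketch: bind a CPU VA range A..B to a GPU VA range C..D
 * as an hmmptr; pages are populated/migrated lazily on GPU page fault.
 */
#define XE_VM_BIND_FLAG_HMMPTR  (1 << 4)        /* placeholder value */

struct xe_vm_bind_op_hmmptr {
        __u64 cpu_addr;         /* A: start of the CPU VA range to mirror */
        __u64 gpu_addr;         /* C: start of the GPU VA range */
        __u64 range;            /* length of both ranges, in bytes */
        __u32 flags;            /* XE_VM_BIND_FLAG_HMMPTR, ... */
        __u32 pad;
};

A GPU fault into a part of such a range that has no core mm vma behind
it would then be reported as a fatal error, per point 2 above.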

Thomas and I still have open questions for Christian. We will follow up.

Thanks all for this discussion.

Regards,
Oak

> -----Original Message-----
> From: Christian König <christian.koenig@amd.com>
> Sent: Thursday, February 1, 2024 3:52 AM
> To: Zeng, Oak <oak.zeng@intel.com>; Daniel Vetter <daniel@ffwll.ch>; David
> Airlie <airlied@redhat.com>
> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>; Brost, Matthew
> <matthew.brost@intel.com>; Felix Kuehling <felix.kuehling@amd.com>; Welty,
> Brian <brian.welty@intel.com>; dri-devel@lists.freedesktop.org; Ghimiray, Himal
> Prasad <himal.prasad.ghimiray@intel.com>; Bommu, Krishnaiah
> <krishnaiah.bommu@intel.com>; Gupta, saurabhg <saurabhg.gupta@intel.com>;
> Vishwanathapura, Niranjana <niranjana.vishwanathapura@intel.com>; intel-
> xe@lists.freedesktop.org; Danilo Krummrich <dakr@redhat.com>; Shah, Ankur N
> <ankur.n.shah@intel.com>; jglisse@redhat.com; rcampbell@nvidia.com;
> apopple@nvidia.com
> Subject: Re: Making drm_gpuvm work across gpu devices
> 
> Hi Oak,
> 
> Am 31.01.24 um 21:17 schrieb Zeng, Oak:
> > Hi Sima, Dave,
> >
> > I am well aware nouveau driver is not what Nvidia do with their customer. The
> key argument is, can we move forward with the concept shared virtual address
> space b/t CPU and GPU? This is the foundation of HMM. We already have split
> address space support with other driver API. SVM, from its name, it means
> shared address space. Are we allowed to implement another driver model to
> allow SVM work, along with other APIs supporting split address space? Those two
> scheme can co-exist in harmony. We actually have real use cases to use both
> models in one application.
> >
> > Hi Christian, Thomas,
> >
> > In your scheme, GPU VA can != CPU VA. This does introduce some flexibility.
> But this scheme alone doesn't solve the problem of the proxy process/para-
> virtualization. You will still need a second mechanism to partition GPU VA space
> b/t guest process1 and guest process2 because proxy process (or the host
> hypervisor whatever you call it) use one single gpu page table for all the
> guest/client processes. GPU VA for different guest process can't overlap. If this
> second mechanism exist, we of course can use the same mechanism to partition
> CPU VA space between guest processes as well, then we can still use shared VA
> b/t CPU and GPU inside one process, but process1 and process2's address space
> (for both cpu and gpu) doesn't overlap. This second mechanism is the key to
> solve the proxy process problem, not the flexibility you introduced.
> 
> That approach was suggested before, but it doesn't work. First of all
> you create a massive security hole when you give the GPU full access to
> the QEMU CPU process which runs the virtualization.
> 
> So even if you say CPU VA == GPU VA you still need some kind of
> flexibility otherwise you can't implement this use case securely.
> 
> In addition to this, the CPU VAs are usually controlled by the OS and not
> some driver, so to make sure that host and guest VAs don't overlap you
> would need to add some kind of sync between the guest and host OS kernels.
> 
> > In practice, your scheme also has a risk of running out of process address space,
> because you have to partition the whole address space b/t processes. Apparently
> allowing each guest process to own the whole process space and using separate
> GPU/CPU page tables for different processes is a better solution than using a single
> page table and partitioning the process space b/t processes.
> 
> Yeah that you run out of address space is certainly possible. But as I
> said CPUs are switching to 5 levels of page tables, and if you look at
> for example a "cat maps | cut -c-4 | sort -u" of a process you will find
> that only a handful of 4GiB segments are actually used and thanks to
> recoverable page faults you can map those between host and client on
> demand. This gives you at least enough address space to handle a couple
> of thousand clients.
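(For scale: a 57-bit virtual address space holds 2^57 / 2^32 = 2^25, i.e. roughly 33 million, 4 GiB segments, so a few thousand clients that each actively touch only a handful of 4 GiB segments fit comfortably.)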
> 
> > For Intel GPUs, para-virtualization (xenGT, see https://github.com/intel/XenGT-
> Preview-kernel. It is a similar idea to the proxy process in Felix's email. They are all
> SW-based GPU virtualization technologies) is an old project. It has since been replaced with
> HW accelerated SRIOV/system virtualization. XenGT was abandoned a long time ago.
> So agreed, your scheme adds some flexibility. The question is, do we have a valid
> use case to use such flexibility? I don't see a single one ATM.
> 
> Yeah, we have SRIOV functionality on AMD hw as well, but for some use
> cases it's just too inflexible.
> 
> > I also looked into how to implement your scheme. You basically rejected the
> very foundation of the hmm design, which is a shared address space b/t CPU and GPU.
> In your scheme, GPU VA = CPU VA + offset. In every single place where the driver
> needs to call hmm facilities such as hmm_range_fault, migrate_vma_setup and in the
> mmu notifier callback, you need to offset the GPU VA to get a CPU VA. From the
> application writer's perspective, whenever they want to use a CPU pointer in their
> GPU program, they have to add that offset. Do you think this is awkward?
> 
> What? This flexibility is there to prevent the application writer from having to
> deal with any offset.
> 
> > Finally, to implement SVM, we need to implement some memory hint API
> which applies to a virtual address range across all GPU devices. For example, the user
> would say, for this virtual address range, I prefer the backing store memory to be
> on GPU deviceX (because the user knows deviceX would use this address range
> much more than other GPU devices or the CPU). It doesn't make sense to me to
> make such an API per-device. For example, if you tell device A that the
> preferred memory location is device B memory, this doesn't sound correct to
> me because in your scheme, device A is not even aware of the existence of
> device B. right?
> 
> Correct, and while the additional flexibility is somewhat optional, I
> strongly think that not having a centralized approach for device driver
> settings is mandatory.
> 
> Going away from the well-defined file-descriptor-based handling of
> device driver interfaces was one of the worst ideas I've ever seen in
> roughly thirty years of working with Unix-like operating systems. It
> basically broke everything, from reverse lookup handling for mmap() to
> file system privileges for hardware access.
> 
> As far as I can see anything which goes into the direction of opening
> /dev/kfd or /dev/xe_svm or something similar and saying that this then
> results in implicit SVM for your render nodes is an absolute no-go
> and would require an explicit acknowledgement from Linus on the design
> to do something like that.
> 
> What you can do is to have an IOCTL for the render node file descriptor
> which says this device should do SVM with the current CPU address space
> and another IOCTL which says range A..B is preferred to migrate to this
> device for HMM when the device runs into a page fault.
> 
> And yes that obviously means shitty performance for device drivers
> because pages play ping-pong if userspace gives contradicting information
> for migrations, but that is how it is supposed to be.
> 
> Everything else which works across the borders of a device driver's scope
> should be implemented as a system call with the relevant review process
> around it.
> 
> Regards,
> Christian.
> 
> >
> > Regards,
> > Oak
> >> -----Original Message-----
> >> From: Daniel Vetter <daniel@ffwll.ch>
> >> Sent: Wednesday, January 31, 2024 4:15 AM
> >> To: David Airlie <airlied@redhat.com>
> >> Cc: Zeng, Oak <oak.zeng@intel.com>; Christian König
> >> <christian.koenig@amd.com>; Thomas Hellström
> >> <thomas.hellstrom@linux.intel.com>; Daniel Vetter <daniel@ffwll.ch>; Brost,
> >> Matthew <matthew.brost@intel.com>; Felix Kuehling
> >> <felix.kuehling@amd.com>; Welty, Brian <brian.welty@intel.com>; dri-
> >> devel@lists.freedesktop.org; Ghimiray, Himal Prasad
> >> <himal.prasad.ghimiray@intel.com>; Bommu, Krishnaiah
> >> <krishnaiah.bommu@intel.com>; Gupta, saurabhg
> <saurabhg.gupta@intel.com>;
> >> Vishwanathapura, Niranjana <niranjana.vishwanathapura@intel.com>; intel-
> >> xe@lists.freedesktop.org; Danilo Krummrich <dakr@redhat.com>; Shah,
> Ankur N
> >> <ankur.n.shah@intel.com>; jglisse@redhat.com; rcampbell@nvidia.com;
> >> apopple@nvidia.com
> >> Subject: Re: Making drm_gpuvm work across gpu devices
> >>
> >> On Wed, Jan 31, 2024 at 09:12:39AM +1000, David Airlie wrote:
> >>> On Wed, Jan 31, 2024 at 8:29 AM Zeng, Oak <oak.zeng@intel.com> wrote:
> >>>> Hi Christian,
> >>>>
> >>>>
> >>>>
> >>>> Nvidia Nouveau driver uses exactly the same concept of SVM with HMM:
> >> the GPU address in the same process is exactly the same as the CPU virtual address.
> It
> >> is already in the upstream Linux kernel. We at Intel just follow the same direction for
> >> our customers. Why are we not allowed?
> >>>
> >>> Oak, this isn't how upstream works, you don't get to appeal to
> >>> customer or internal design. nouveau isn't "NVIDIA"'s and it certainly
> >>> isn't something NVIDIA would ever suggest for their customers. We also
> >>> likely wouldn't just accept NVIDIA's current solution upstream without
> >>> some serious discussions. The implementation in nouveau was more of a
> >>> sample HMM use case rather than a serious implementation. I suspect if
> >>> we do get down the road of making nouveau an actual compute driver for
> >>> SVM etc then it would have to severely change.
> >> Yeah on the nouveau hmm code specifically my gut feeling impression is
> >> that we didn't really make friends with that among core kernel
> >> maintainers. It's a bit too much just a tech demo to be able to merge the
> >> hmm core apis for nvidia's out-of-tree driver.
> >>
> >> Also, a few years of learning and experience gaining happened meanwhile -
> >> you always have to look at an api design in the context of when it was
> >> designed, and that context changes all the time.
> >>
> >> Cheers, Sima
> >> --
> >> Daniel Vetter
> >> Software Engineer, Intel Corporation
> >> http://blog.ffwll.ch
Christian König March 1, 2024, 7:01 a.m. UTC | #57
Hi Thomas,

On 29.02.24 at 18:12, Thomas Hellström wrote:
> Hi, Christian.
>
> On Thu, 2024-02-29 at 10:41 +0100, Christian König wrote:
>> On 28.02.24 at 20:51, Zeng, Oak wrote:
>>> The mail wasn’t indented/prefaced correctly; I have manually reformatted it.
>>>
>>> *From:*Christian König <christian.koenig@amd.com>
>>> *Sent:* Tuesday, February 27, 2024 1:54 AM
>>> *To:* Zeng, Oak <oak.zeng@intel.com>; Danilo Krummrich
>>> <dakr@redhat.com>; Dave Airlie <airlied@redhat.com>; Daniel Vetter
>>> <daniel@ffwll.ch>; Felix Kuehling <felix.kuehling@amd.com>;
>>> jglisse@redhat.com
>>> *Cc:* Welty, Brian <brian.welty@intel.com>;
>>> dri-devel@lists.freedesktop.org; intel-xe@lists.freedesktop.org;
>>> Bommu, Krishnaiah <krishnaiah.bommu@intel.com>; Ghimiray, Himal
>>> Prasad
>>> <himal.prasad.ghimiray@intel.com>;
>>> Thomas.Hellstrom@linux.intel.com;
>>> Vishwanathapura, Niranjana <niranjana.vishwanathapura@intel.com>;
>>> Brost, Matthew <matthew.brost@intel.com>; Gupta, saurabhg
>>> <saurabhg.gupta@intel.com>
>>> *Subject:* Re: Making drm_gpuvm work across gpu devices
>>>
>>> Hi Oak,
>>>
>>> On 23.02.24 at 21:12, Zeng, Oak wrote:
>>>
>>>      Hi Christian,
>>>
>>>      I go back this old email to ask a question.
>>>
>>>
>>> sorry totally missed that one.
>>>
>>>      Quote from your email:
>>>
>>>      “Those ranges can then be used to implement the SVM feature
>>>      required for higher level APIs and not something you need at
>>> the
>>>      UAPI or even inside the low level kernel memory management.”
>>>
>>>      “SVM is a high level concept of OpenCL, Cuda, ROCm etc.. This
>>>      should not have any influence on the design of the kernel
>>> UAPI.”
>>>
>>>      There are two categories of SVM:
>>>
>>>      1.driver svm allocator: this is implemented in user space,
>>>   e.g.,
>>>      cudaMallocManaged (cuda) or zeMemAllocShared (L0) or
>>>      clSVMAlloc(openCL). Intel already has gem_create/vm_bind in
>>> xekmd
>>>      and our umd implemented clSVMAlloc and zeMemAllocShared on top
>>> of
>>>      gem_create/vm_bind. Range A..B of the process address space is
>>>      mapped into a range C..D of the GPU address space, exactly as
>>> you
>>>      said.
>>>
>>>      2.system svm allocator:  This doesn’t introduce extra driver
>>> API
>>>      for memory allocation. Any valid CPU virtual address can be
>>> used
>>>      directly transparently in a GPU program without any extra
>>> driver
>>>      API call. Quote from kernel Documentation/vm/hmm.hst: “Any
>>>      application memory region (private anonymous, shared memory, or
>>>      regular file backed memory) can be used by a device
>>> transparently”
>>>      and “to share the address space by duplicating the CPU page
>>> table
>>>      in the device page table so the same address points to the same
>>>      physical memory for any valid main memory address in the
>>> process
>>>      address space”. In system svm allocator, we don’t need that
>>> A..B
>>>      C..D mapping.
>>>
>>>      It looks like you were talking of 1). Were you?
>>>
>>>
>>> No, even when you fully mirror the whole address space from a
>>> process
>>> into the GPU you still need to enable this somehow with an IOCTL.
>>>
>>> And while enabling this you absolutely should specify to which part
>>> of
>>> the address space this mirroring applies and where it maps to.
>>>
>>> [Zeng, Oak]
>>>
>>> Let’s say we have a hardware platform where both CPU and GPU support a
>>> 57-bit (used as an example; the statement applies to any address range)
>>> virtual address range. How do you decide “which part of the address
>>> space this mirroring applies” to? You have to mirror the whole address
>>> space [0~2^57-1], don’t you? As you designed it, the gigantic
>>> vm_bind/mirroring happens at process initialization time, and at
>>> that time you don’t know which part of the address space will be used
>>> for the gpu program. Remember that for the system allocator, *any* valid CPU
>>> address can be used for a GPU program. If you add an offset to
>>> [0~2^57-1], you get an address outside the 57-bit address range. Is this
>>> a valid concern?
>>>
>> Well you can perfectly mirror on demand. You just need something
>> similar
>> to userfaultfd() for the GPU. This way you don't need to mirror the
>> full
>> address space, but can rather work with large chunks created on
>> demand,
>> let's say 1GiB or something like that.
>
> What we're looking at as the current design is an augmented userptr
> (A..B -> C..D mapping) which is internally sparsely populated in
> chunks. KMD manages the population using gpu pagefaults. We acknowledge
> that some parts of this mirror will not have a valid CPU mapping. That
> is, no vma so a gpu page-fault that resolves to such a mirror address
> will cause an error. Would you have any concerns / objections against
> such an approach?

Nope, as far as I can see that sounds like a perfectly valid design to me.

Regards,
Christian.

>
> Thanks,
> Thomas
>
>
>
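For illustration of the chunk-granular, on-demand population discussed above, a minimal sketch could look like the following. All names here (svm_chunk, CHUNK_SHIFT, lookup_or_create_chunk) are invented for this sketch, locking of the tree is left to the caller, and none of this is xe code:

  #include <linux/container_of.h>
  #include <linux/interval_tree.h>
  #include <linux/slab.h>

  #define CHUNK_SHIFT	30			/* e.g. 1 GiB chunks, as suggested */
  #define CHUNK_SIZE	(1UL << CHUNK_SHIFT)

  /* One sparsely created chunk of the mirrored address space. */
  struct svm_chunk {
  	struct interval_tree_node node;		/* [start, last] in the shared CPU/GPU VA */
  };

  /* Called from the GPU fault handler with the tree lock held. */
  static struct svm_chunk *lookup_or_create_chunk(struct rb_root_cached *tree,
  						unsigned long fault_addr)
  {
  	unsigned long start = fault_addr & ~(CHUNK_SIZE - 1);
  	struct interval_tree_node *n;
  	struct svm_chunk *chunk;

  	n = interval_tree_iter_first(tree, fault_addr, fault_addr);
  	if (n)
  		return container_of(n, struct svm_chunk, node);

  	chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
  	if (!chunk)
  		return NULL;

  	chunk->node.start = start;
  	chunk->node.last = start + CHUNK_SIZE - 1;
  	interval_tree_insert(&chunk->node, tree);
  	return chunk;
  }

Only the chunk that actually faulted gets tracked and populated, so the whole address space never has to be mirrored eagerly, and a fault on an address with no CPU vma can simply be reported as an error before any chunk is created.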
Zeng, Oak March 8, 2024, 4:43 a.m. UTC | #58
Hello all,

Since I didn't get a reply to this one, I assume the points below are agreed. But feel free to let us know if you don't agree.

Thanks,
Oak

-----Original Message-----
From: dri-devel <dri-devel-bounces@lists.freedesktop.org> On Behalf Of Zeng, Oak
Sent: Thursday, February 29, 2024 1:23 PM
To: Christian König <christian.koenig@amd.com>; Daniel Vetter <daniel@ffwll.ch>; David Airlie <airlied@redhat.com>
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>; Brost, Matthew <matthew.brost@intel.com>; Felix Kuehling <felix.kuehling@amd.com>; Welty, Brian <brian.welty@intel.com>; dri-devel@lists.freedesktop.org; Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>; Bommu, Krishnaiah <krishnaiah.bommu@intel.com>; Gupta, saurabhg <saurabhg.gupta@intel.com>; Vishwanathapura, Niranjana <niranjana.vishwanathapura@intel.com>; intel-xe@lists.freedesktop.org; Danilo Krummrich <dakr@redhat.com>; Shah, Ankur N <ankur.n.shah@intel.com>; jglisse@redhat.com; rcampbell@nvidia.com; apopple@nvidia.com
Subject: RE: Making drm_gpuvm work across gpu devices

Hi Christian/Daniel/Dave/Felix/Thomas, and all,

We have been refining our design internally over the past month. Below is our plan. Please let us know if you have any concerns.

1) Remove the pseudo /dev/xe-svm device. All system allocator interfaces will go through the /dev/dri/render devices; there is no global interface.

2) Unify the userptr and system allocator code. We will treat userptr as a special case of the system allocator without migration capability. We will introduce the hmmptr concept for the system allocator and extend the vm_bind API to map a range A..B of the process address space to a range C..D of the GPU address space for an hmmptr. For an hmmptr, if a gpu program accesses an address which is not backed by a core mm vma, it is a fatal error.

3) Multiple device support. We have identified p2p use-cases where we might want to leave memory on a foreign device or direct migrations to a foreign device, and therefore might need a global structure that tracks or caches the migration state per process address space. We haven't completely settled on this design yet. We will come back when we have more details.

4) We will first work on this code in xekmd, then look to move some common code to the drm layer so it can also be used by other vendors.

Thomas and I still have open questions for Christian. We will follow up.

Thanks all for this discussion.

Regards,
Oak

Christian König March 8, 2024, 10:07 a.m. UTC | #59
Hi Oak,

sorry, the mail sounded like you didn't expect a reply.

And yes, the approaches outlined in the mail sound really good to me.

Regards,
Christian.

On 08.03.24 at 05:43, Zeng, Oak wrote:
> Hello all,
>
> Since I didn't get a reply for this one, I assume below are agreed. But feel free to let us know if you don't agree.
>
> Thanks,
> Oak
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
index 467d68f8332e..462603abab8a 100644
--- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
+++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
@@ -22,6 +22,7 @@ 
 #include "xe_pt.h"
 #include "xe_trace.h"
 #include "xe_vm.h"
+#include "xe_svm.h"
 
 enum fault_type {
 	NOT_PRESENT = 0,
@@ -131,6 +132,11 @@  static int handle_pagefault(struct xe_gt *gt, struct pagefault *pf)
 	if (!vm || !xe_vm_in_fault_mode(vm))
 		return -EINVAL;
 
+	if (vm->svm) {
+		ret = xe_svm_handle_gpu_fault(vm, gt, pf);
+		goto put_vm;
+	}
+
 retry_userptr:
 	/*
 	 * TODO: Avoid exclusive lock if VM doesn't have userptrs, or
@@ -219,6 +225,7 @@  static int handle_pagefault(struct xe_gt *gt, struct pagefault *pf)
 		if (ret >= 0)
 			ret = 0;
 	}
+put_vm:
 	xe_vm_put(vm);
 
 	return ret;
diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c
index 0c13690a19f5..1ade8d7f0ab2 100644
--- a/drivers/gpu/drm/xe/xe_svm.c
+++ b/drivers/gpu/drm/xe/xe_svm.c
@@ -12,6 +12,7 @@ 
 #include "xe_svm.h"
 #include <linux/hmm.h>
 #include <linux/scatterlist.h>
+#include <drm/xe_drm.h>
 #include "xe_pt.h"
 #include "xe_assert.h"
 #include "xe_vm_types.h"
@@ -206,3 +207,118 @@  static int svm_populate_range(struct xe_svm_range *svm_range,
 		kvfree(pfns);
 	return ret;
 }
+
+/**
+ * svm_access_allowed() - Determine whether read and/or write access to a vma is allowed
+ * @vma: the vma backing the faulting address
+ * @write: true means read and write access; false means read-only access
+ */
+static bool svm_access_allowed(struct vm_area_struct *vma, bool write)
+{
+	unsigned long access = VM_READ;
+
+	if (write)
+		access |= VM_WRITE;
+
+	return (vma->vm_flags & access) == access;
+}
+
+/**
+ * svm_should_migrate() - Determine whether we should migrate a range to
+ * a destination memory region
+ *
+ * @range: The svm memory range to consider
+ * @dst_region: target destination memory region
+ * @is_atomic_fault: Is the intended migration triggered by an atomic access?
+ * On some platforms, we have to migrate memory to vram to guarantee atomic correctness.
+ */
+static bool svm_should_migrate(struct xe_svm_range *range,
+				struct xe_mem_region *dst_region, bool is_atomic_fault)
+{
+	return true;
+}
+
+/**
+ * xe_svm_handle_gpu_fault() - gpu page fault handler for svm subsystem
+ *
+ * @vm: The vm of the fault.
+ * @gt: The gt hardware on which the fault happens.
+ * @pf: page fault descriptor
+ *
+ * Work out backing memory for the fault address, migrate memory from
+ * system memory to gpu vram if necessary, and map the fault address to
+ * the GPU so the GPU HW can retry the last operation which caused the
+ * GPU page fault.
+ */
+int xe_svm_handle_gpu_fault(struct xe_vm *vm,
+				struct xe_gt *gt,
+				struct pagefault *pf)
+{
+	u8 access_type = pf->access_type;
+	u64 page_addr = pf->page_addr;
+	struct hmm_range hmm_range;
+	struct vm_area_struct *vma;
+	struct xe_svm_range *range;
+	struct mm_struct *mm;
+	struct xe_svm *svm;
+	int ret = 0;
+
+	svm = vm->svm;
+	if (!svm)
+		return -EINVAL;
+
+	mm = svm->mm;
+	mmap_read_lock(mm);
+	vma = find_vma_intersection(mm, page_addr, page_addr + 4);
+	if (!vma) {
+		mmap_read_unlock(mm);
+		return -ENOENT;
+	}
+
+	if (!svm_access_allowed(vma, access_type != ACCESS_TYPE_READ)) {
+		mmap_read_unlock(mm);
+		return -EPERM;
+	}
+
+	range = xe_svm_range_from_addr(svm, page_addr);
+	if (!range) {
+		range = xe_svm_range_create(svm, vma);
+		if (!range) {
+			mmap_read_unlock(mm);
+			return -ENOMEM;
+		}
+	}
+
+	if (svm_should_migrate(range, &gt->tile->mem.vram,
+						access_type == ACCESS_TYPE_ATOMIC))
+		/* Migrate the whole svm range for now.
+		 * This is subject to change once we introduce a migration
+		 * granularity parameter for the user to select.
+		 *
+		 * Migration is best effort. If we fail to migrate to vram,
+		 * we just map that range to the gpu in system memory. For
+		 * cases such as a gpu atomic operation which requires memory
+		 * to be resident in vram, we will fault again and retry the
+		 * migration.
+		 */
+		svm_migrate_range_to_vram(range, vma, gt->tile);
+
+	ret = svm_populate_range(range, &hmm_range, vma->vm_flags & VM_WRITE);
+	mmap_read_unlock(mm);
+	/* There is no need to destroy this range; it can be reused later. */
+	if (ret)
+		goto free_pfns;
+
+	/* FIXME: set the DM, AE flags in the PTE */
+	ret = xe_bind_svm_range(vm, gt->tile, &hmm_range,
+		!(vma->vm_flags & VM_WRITE) ? DRM_XE_VM_BIND_FLAG_READONLY : 0);
+	/* A concurrent cpu page table update happened.
+	 * Return success so we will retry everything
+	 * on the next gpu page fault.
+	 */
+	if (ret == -EAGAIN)
+		ret = 0;
+
+free_pfns:
+	kvfree(hmm_range.hmm_pfns);
+	return ret;
+}
diff --git a/drivers/gpu/drm/xe/xe_svm.h b/drivers/gpu/drm/xe/xe_svm.h
index 659bcb7927d6..a8ff4957a9b8 100644
--- a/drivers/gpu/drm/xe/xe_svm.h
+++ b/drivers/gpu/drm/xe/xe_svm.h
@@ -20,6 +20,7 @@ 
 
 struct xe_vm;
 struct mm_struct;
+struct pagefault;
 
 #define XE_MAX_SVM_PROCESS 5 /* Maximumly support 32 SVM process*/
 extern DECLARE_HASHTABLE(xe_svm_table, XE_MAX_SVM_PROCESS);
@@ -94,6 +95,8 @@  bool xe_svm_range_belongs_to_vma(struct mm_struct *mm,
 void xe_svm_range_unregister_mmu_notifier(struct xe_svm_range *range);
 int xe_svm_range_register_mmu_notifier(struct xe_svm_range *range);
 void xe_svm_range_prepare_destroy(struct xe_svm_range *range);
+struct xe_svm_range *xe_svm_range_create(struct xe_svm *svm,
+									struct vm_area_struct *vma);
 
 int xe_svm_build_sg(struct hmm_range *range, struct sg_table *st);
 int xe_svm_devm_add(struct xe_tile *tile, struct xe_mem_region *mem);
@@ -106,4 +109,7 @@  int xe_devm_alloc_pages(struct xe_tile *tile,
 
 void xe_devm_free_blocks(struct list_head *blocks);
 void xe_devm_page_free(struct page *page);
+int xe_svm_handle_gpu_fault(struct xe_vm *vm,
+				struct xe_gt *gt,
+				struct pagefault *pf);
 #endif
diff --git a/drivers/gpu/drm/xe/xe_svm_range.c b/drivers/gpu/drm/xe/xe_svm_range.c
index dfb4660dc26f..05c088dddc2d 100644
--- a/drivers/gpu/drm/xe/xe_svm_range.c
+++ b/drivers/gpu/drm/xe/xe_svm_range.c
@@ -182,3 +182,46 @@  void xe_svm_range_prepare_destroy(struct xe_svm_range *range)
 	xe_invalidate_svm_range(vm, range->start, length);
 	xe_svm_range_unregister_mmu_notifier(range);
 }
+
+static void add_range_to_svm(struct xe_svm_range *range)
+{
+	range->inode.start = range->start;
+	range->inode.last = range->end;
+	mutex_lock(&range->svm->mutex);
+	interval_tree_insert(&range->inode, &range->svm->range_tree);
+	mutex_unlock(&range->svm->mutex);
+}
+
+/**
+ * xe_svm_range_create() - create and initialize a svm range
+ *
+ * @svm: the svm that the range belongs to
+ * @vma: the corresponding vma of the range
+ *
+ * Create the range and add it to the svm's interval tree. Register an
+ * mmu interval notifier for this range.
+ *
+ * Return: pointer to the created svm range,
+ * or NULL on failure
+ */
+struct xe_svm_range *xe_svm_range_create(struct xe_svm *svm,
+									struct vm_area_struct *vma)
+{
+	struct xe_svm_range *range = kzalloc(sizeof(*range), GFP_KERNEL);
+
+	if (!range)
+		return NULL;
+
+	range->start = vma->vm_start;
+	range->end = vma->vm_end;
+	range->vma = vma;
+	range->svm = svm;
+
+	if (xe_svm_range_register_mmu_notifier(range)) {
+		kfree(range);
+		return NULL;
+	}
+
+	add_range_to_svm(range);
+	return range;
+}