[RFC,05/28] drm/gpusvm: Add support for GPU Shared Virtual Memory

Message ID 20240828024901.2582335-6-matthew.brost@intel.com (mailing list archive)
State New, archived
Series: Introduce GPU SVM and Xe SVM implementation

Commit Message

Matthew Brost Aug. 28, 2024, 2:48 a.m. UTC
This patch introduces support for GPU Shared Virtual Memory (SVM) in the
Direct Rendering Manager (DRM) subsystem. SVM allows for seamless
sharing of memory between the CPU and GPU, enhancing performance and
flexibility in GPU computing tasks.

The patch adds the necessary infrastructure for SVM, including data
structures and functions for managing SVM ranges and notifiers. It also
provides mechanisms for allocating, deallocating, and migrating memory
regions between system RAM and GPU VRAM.

This mid-layer is largely inspired by GPUVM.

Cc: Dave Airlie <airlied@redhat.com>
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: <dri-devel@lists.freedesktop.org>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/xe/Makefile     |    3 +-
 drivers/gpu/drm/xe/drm_gpusvm.c | 2174 +++++++++++++++++++++++++++++++
 drivers/gpu/drm/xe/drm_gpusvm.h |  415 ++++++
 3 files changed, 2591 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.c
 create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.h
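
For orientation, a hedged sketch of how a driver might wire up the interface this patch adds; only the drm_gpusvm_init() signature and the required-callback split are taken from the new header, while the driver_* names, the "driver-svm" label, and the chunk/notifier sizes are illustrative assumptions:

static const struct drm_gpusvm_ops driver_gpusvm_ops = {
	.invalidate	   = driver_invalidate,		/* required */
	.populate_vram_pfn = driver_populate_vram_pfn,	/* required for migration */
	.copy_to_vram	   = driver_copy_to_vram,	/* required for migration */
	.copy_to_sram	   = driver_copy_to_sram,	/* required for migration */
};

/* Chunk sizes used for range allocation, powers of 2 in descending order */
static const u64 driver_chunk_sizes[] = { SZ_2M, SZ_64K, SZ_4K };

int driver_svm_init(struct driver_vm *vm)	/* driver_vm is hypothetical */
{
	/* Illustrative values: 48-bit VA space, 512M per notifier */
	return drm_gpusvm_init(&vm->gpusvm, "driver-svm", vm->drm,
			       current->mm, vm /* device private page owner */,
			       0, 1ull << 48, SZ_512M,
			       &driver_gpusvm_ops, driver_chunk_sizes,
			       ARRAY_SIZE(driver_chunk_sizes));
}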

Comments

Daniel Vetter Aug. 28, 2024, 2:31 p.m. UTC | #1
On Tue, Aug 27, 2024 at 07:48:38PM -0700, Matthew Brost wrote:
> +		if (!ctx->mmap_locked) {
> +			/*
> +			 * XXX: HMM locking document indicates only a read-lock
> +			 * is required but there appears to be a window between
> +			 * the MMU_NOTIFY_MIGRATE event triggered in a CPU fault
> +			 * via migrate_vma_setup and the pages actually moving
> +			 * in migrate_vma_finalize in which this code can grab
> +			 * garbage pages. Grabbing the write-lock if the range
> +			 * is attached to vram appears to protect against this
> +			 * race.
> +			 */

This one is really scary, since it means the entire migrate pte trickery
is essentially completely busted. Grabbing the mmap write lock just means
you block out pretty much everything interesting from concurrently
happening.

My gut feeling says we need to figure out what's happening here, because
this looks a bit too fundamental to me.
-Sima
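
For comparison, a minimal sketch of the pattern Documentation/mm/hmm.rst describes: only mmap_read_lock() around hmm_range_fault(), with the interval-notifier sequence count rather than the mmap lock catching concurrent invalidations. driver_fault_range() and driver_commit_pages() are hypothetical names standing in for the snippet above and for the device page-table update:

static int driver_fault_range(struct drm_gpusvm *gpusvm,
			      struct drm_gpusvm_range *range,
			      struct mmu_interval_notifier *notifier,
			      struct mm_struct *mm,
			      struct hmm_range *hmm_range)
{
	int err;

again:
	hmm_range->notifier_seq = mmu_interval_read_begin(notifier);
	mmap_read_lock(mm);
	err = hmm_range_fault(hmm_range);
	mmap_read_unlock(mm);
	if (err == -EBUSY)
		goto again;		/* a real driver would bound this retry */
	if (err)
		return err;

	drm_gpusvm_notifier_lock(gpusvm);
	if (mmu_interval_read_retry(notifier, hmm_range->notifier_seq)) {
		/* An invalidation ran in between; the collected PFNs are stale */
		drm_gpusvm_notifier_unlock(gpusvm);
		goto again;
	}
	driver_commit_pages(range, hmm_range->hmm_pfns);	/* hypothetical */
	drm_gpusvm_notifier_unlock(gpusvm);

	return 0;
}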


> +			if (vram_pages)
> +				mmap_write_lock(mm);
> +			else
> +				mmap_read_lock(mm);
> +		}
> +		err = hmm_range_fault(&hmm_range);
> +		if (!ctx->mmap_locked) {
> +			if (vram_pages)
> +				mmap_write_unlock(mm);
> +			else
> +				mmap_read_unlock(mm);
> +		}
> +
> +		if (err == -EBUSY) {
> +			if (time_after(jiffies, timeout))
> +				break;
> +
> +			hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> +			continue;
> +		}
> +		break;
> +	}
> +	if (!ctx->mmap_locked)
> +		mmput(mm);
> +	if (err)
> +		goto err_free;
> +
> +	pages = (struct page **)pfns;
> +
> +	if (ctx->prefault) {
> +		range->pages = pages;
> +		goto set_seqno;
> +	}
> +
> +map_pages:
> +	if (is_device_private_page(hmm_pfn_to_page(pfns[0]))) {
> +		WARN_ON_ONCE(!range->vram_allocation);
> +
> +		for (i = 0; i < npages; ++i) {
> +			pages[i] = hmm_pfn_to_page(pfns[i]);
> +
> +			if (WARN_ON_ONCE(!is_device_private_page(pages[i]))) {
> +				err = -EOPNOTSUPP;
> +				goto err_free;
> +			}
> +		}
> +
> +		/* Do not race with notifier unmapping pages */
> +		drm_gpusvm_notifier_lock(gpusvm);
> +		range->flags.has_vram_pages = true;
> +		range->pages = pages;
> +		if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
> +			err = -EAGAIN;
> +			__drm_gpusvm_range_unmap_pages(gpusvm, range);
> +		}
> +		drm_gpusvm_notifier_unlock(gpusvm);
> +	} else {
> +		dma_addr_t *dma_addr = (dma_addr_t *)pfns;
> +
> +		for_each_dma_page(i, j, npages, order) {
> +			if (WARN_ON_ONCE(i && order !=
> +					 hmm_pfn_to_map_order(pfns[i]))) {
> +				err = -EOPNOTSUPP;
> +				npages = i;
> +				goto err_unmap;
> +			}
> +			order = hmm_pfn_to_map_order(pfns[i]);
> +
> +			pages[j] = hmm_pfn_to_page(pfns[i]);
> +			if (WARN_ON_ONCE(is_zone_device_page(pages[j]))) {
> +				err = -EOPNOTSUPP;
> +				npages = i;
> +				goto err_unmap;
> +			}
> +
> +			set_page_dirty_lock(pages[j]);
> +			mark_page_accessed(pages[j]);
> +
> +			dma_addr[j] = dma_map_page(gpusvm->drm->dev,
> +						   pages[j], 0,
> +						   PAGE_SIZE << order,
> +						   DMA_BIDIRECTIONAL);
> +			if (dma_mapping_error(gpusvm->drm->dev, dma_addr[j])) {
> +				err = -EFAULT;
> +				npages = i;
> +				goto err_unmap;
> +			}
> +		}
> +
> +		/* Huge pages, reduce memory footprint */
> +		if (order) {
> +			dma_addr = kmalloc_array(j, sizeof(*dma_addr),
> +						 GFP_KERNEL);
> +			if (dma_addr) {
> +				for (i = 0; i < j; ++i)
> +					dma_addr[i] = (dma_addr_t)pfns[i];
> +				kvfree(pfns);
> +				kfree_mapping = true;
> +			} else {
> +				dma_addr = (dma_addr_t *)pfns;
> +			}
> +		}
> +
> +		/* Do not race with notifier unmapping pages */
> +		drm_gpusvm_notifier_lock(gpusvm);
> +		range->order = order;
> +		range->flags.kfree_mapping = kfree_mapping;
> +		range->flags.has_dma_mapping = true;
> +		range->dma_addr = dma_addr;
> +		range->vram_allocation = NULL;
> +		if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
> +			err = -EAGAIN;
> +			__drm_gpusvm_range_unmap_pages(gpusvm, range);
> +		}
> +		drm_gpusvm_notifier_unlock(gpusvm);
> +	}
> +
> +	if (err == -EAGAIN)
> +		goto retry;
> +set_seqno:
> +	range->notifier_seq = hmm_range.notifier_seq;
> +
> +	return 0;
> +
> +err_unmap:
> +	for_each_dma_page(i, j, npages, order)
> +		dma_unmap_page(gpusvm->drm->dev,
> +			       (dma_addr_t)pfns[j],
> +			       PAGE_SIZE << order, DMA_BIDIRECTIONAL);
> +err_free:
> +	if (alloc_pfns)
> +		kvfree(pfns);
> +err_out:
> +	return err;
> +}
> +
> +/**
> + * drm_gpusvm_range_unmap_pages - Unmap pages associated with a GPU SVM range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + * @ctx: GPU SVM context
> + *
> + * This function unmaps pages associated with a GPU SVM range. If @in_notifier
> + * is set, it is assumed that gpusvm->notifier_lock is held in write mode; if it
> + * is clear, it acquires gpusvm->notifier_lock in read mode. Must be called on
> + * each GPU SVM range attached to notifier in gpusvm->ops->invalidate for IOMMU
> + * security model.
> + */
> +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> +				  struct drm_gpusvm_range *range,
> +				  const struct drm_gpusvm_ctx *ctx)
> +{
> +	if (ctx->in_notifier)
> +		lockdep_assert_held_write(&gpusvm->notifier_lock);
> +	else
> +		drm_gpusvm_notifier_lock(gpusvm);
> +
> +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> +
> +	if (!ctx->in_notifier)
> +		drm_gpusvm_notifier_unlock(gpusvm);
> +}
> +
> +/**
> + * drm_gpusvm_migration_put_page - Put a migration page
> + * @page: Pointer to the page to put
> + *
> + * This function unlocks and puts a page.
> + */
> +static void drm_gpusvm_migration_put_page(struct page *page)
> +{
> +	unlock_page(page);
> +	put_page(page);
> +}
> +
> +/**
> + * drm_gpusvm_migration_put_pages - Put migration pages
> + * @npages: Number of pages
> + * @migrate_pfn: Array of migrate page frame numbers
> + *
> + * This function puts an array of pages.
> + */
> +static void drm_gpusvm_migration_put_pages(unsigned long npages,
> +					   unsigned long *migrate_pfn)
> +{
> +	unsigned long i;
> +
> +	for (i = 0; i < npages; ++i) {
> +		if (!migrate_pfn[i])
> +			continue;
> +
> +		drm_gpusvm_migration_put_page(migrate_pfn_to_page(migrate_pfn[i]));
> +		migrate_pfn[i] = 0;
> +	}
> +}
> +
> +/**
> + * drm_gpusvm_get_vram_page - Get a reference to a VRAM page
> + * @page: Pointer to the page
> + * @zdd: Pointer to the GPU SVM zone device data
> + *
> + * This function associates the given page with the specified GPU SVM zone
> + * device data and initializes it for zone device usage.
> + */
> +static void drm_gpusvm_get_vram_page(struct page *page,
> +				     struct drm_gpusvm_zdd *zdd)
> +{
> +	page->zone_device_data = drm_gpusvm_zdd_get(zdd);
> +	zone_device_page_init(page);
> +}
> +
> +/**
> + * drm_gpusvm_migrate_map_pages() - Map migration pages for GPU SVM migration
> + * @dev: The device for which the pages are being mapped
> + * @dma_addr: Array to store DMA addresses corresponding to mapped pages
> + * @migrate_pfn: Array of migrate page frame numbers to map
> + * @npages: Number of pages to map
> + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> + *
> + * This function maps pages of memory for migration usage in GPU SVM. It
> + * iterates over each page frame number provided in @migrate_pfn, maps the
> + * corresponding page, and stores the DMA address in the provided @dma_addr
> + * array.
> + *
> + * Return: 0 on success, -EFAULT if an error occurs during mapping.
> + */
> +static int drm_gpusvm_migrate_map_pages(struct device *dev,
> +					dma_addr_t *dma_addr,
> +					long unsigned int *migrate_pfn,
> +					unsigned long npages,
> +					enum dma_data_direction dir)
> +{
> +	unsigned long i;
> +
> +	for (i = 0; i < npages; ++i) {
> +		struct page *page = migrate_pfn_to_page(migrate_pfn[i]);
> +
> +		if (!page)
> +			continue;
> +
> +		if (WARN_ON_ONCE(is_zone_device_page(page)))
> +			return -EFAULT;
> +
> +		dma_addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir);
> +		if (dma_mapping_error(dev, dma_addr[i]))
> +			return -EFAULT;
> +	}
> +
> +	return 0;
> +}
> +
> +/**
> + * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously mapped for GPU SVM migration
> + * @dev: The device for which the pages were mapped
> + * @dma_addr: Array of DMA addresses corresponding to mapped pages
> + * @npages: Number of pages to unmap
> + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> + *
> + * This function unmaps previously mapped pages of memory for GPU Shared Virtual
> + * Memory (SVM). It iterates over each DMA address provided in @dma_addr, checks
> + * if it's valid and not already unmapped, and unmaps the corresponding page.
> + */
> +static void drm_gpusvm_migrate_unmap_pages(struct device *dev,
> +					   dma_addr_t *dma_addr,
> +					   unsigned long npages,
> +					   enum dma_data_direction dir)
> +{
> +	unsigned long i;
> +
> +	for (i = 0; i < npages; ++i) {
> +		if (!dma_addr[i] || dma_mapping_error(dev, dma_addr[i]))
> +			continue;
> +
> +		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
> +	}
> +}
> +
> +/**
> + * drm_gpusvm_migrate_to_vram - Migrate GPU SVM range to VRAM
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + * @vram_allocation: Driver-private pointer to the VRAM allocation. The caller
> + *                   should hold a reference to the VRAM allocation, which
> + *                   should be dropped via ops->vram_allocation or upon the
> + *                   failure of this function.
> + * @ctx: GPU SVM context
> + *
> + * This function migrates the specified GPU SVM range to VRAM. It performs the
> + * necessary setup and invokes the driver-specific operations for migration to
> + * VRAM. Upon successful return, @vram_allocation can safely reference @range
> + * until ops->vram_release is called, which only happens upon successful return.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> +			       struct drm_gpusvm_range *range,
> +			       void *vram_allocation,
> +			       const struct drm_gpusvm_ctx *ctx)
> +{
> +	u64 start = range->va.start, end = range->va.end;
> +	struct migrate_vma migrate = {
> +		.start		= start,
> +		.end		= end,
> +		.pgmap_owner	= gpusvm->device_private_page_owner,
> +		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
> +	};
> +	struct mm_struct *mm = gpusvm->mm;
> +	unsigned long i, npages = npages_in_range(start, end);
> +	struct vm_area_struct *vas;
> +	struct drm_gpusvm_zdd *zdd = NULL;
> +	struct page **pages;
> +	dma_addr_t *dma_addr;
> +	void *buf;
> +	int err;
> +
> +	if (!range->flags.migrate_vram)
> +		return -EINVAL;
> +
> +	if (!gpusvm->ops->populate_vram_pfn || !gpusvm->ops->copy_to_vram ||
> +	    !gpusvm->ops->copy_to_sram)
> +		return -EOPNOTSUPP;
> +
> +	if (!ctx->mmap_locked) {
> +		if (!mmget_not_zero(mm)) {
> +			err = -EFAULT;
> +			goto err_out;
> +		}
> +		mmap_write_lock(mm);
> +	}
> +
> +	mmap_assert_locked(mm);
> +
> +	vas = vma_lookup(mm, start);
> +	if (!vas) {
> +		err = -ENOENT;
> +		goto err_mmunlock;
> +	}
> +
> +	if (end > vas->vm_end || start < vas->vm_start) {
> +		err = -EINVAL;
> +		goto err_mmunlock;
> +	}
> +
> +	if (!vma_is_anonymous(vas)) {
> +		err = -EBUSY;
> +		goto err_mmunlock;
> +	}
> +
> +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
> +		       sizeof(*pages), GFP_KERNEL);
> +	if (!buf) {
> +		err = -ENOMEM;
> +		goto err_mmunlock;
> +	}
> +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
> +
> +	zdd = drm_gpusvm_zdd_alloc(range);
> +	if (!zdd) {
> +		err = -ENOMEM;
> +		goto err_free;
> +	}
> +
> +	migrate.vma = vas;
> +	migrate.src = buf;
> +	migrate.dst = migrate.src + npages;
> +
> +	err = migrate_vma_setup(&migrate);
> +	if (err)
> +		goto err_free;
> +
> +	/*
> +	 * FIXME: Below cases, !migrate.cpages and migrate.cpages != npages, not
> +	 * always an error. Need to revisit possible cases and how to handle. We
> +	 * could prefault on migrate.cpages != npages via hmm_range_fault.
> +	 */
> +
> +	if (!migrate.cpages) {
> +		err = -EFAULT;
> +		goto err_free;
> +	}
> +
> +	if (migrate.cpages != npages) {
> +		err = -EBUSY;
> +		goto err_finalize;
> +	}
> +
> +	err = gpusvm->ops->populate_vram_pfn(gpusvm, vram_allocation, npages,
> +					     migrate.dst);
> +	if (err)
> +		goto err_finalize;
> +
> +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> +					   migrate.src, npages, DMA_TO_DEVICE);
> +	if (err)
> +		goto err_finalize;
> +
> +	for (i = 0; i < npages; ++i) {
> +		struct page *page = pfn_to_page(migrate.dst[i]);
> +
> +		pages[i] = page;
> +		migrate.dst[i] = migrate_pfn(migrate.dst[i]);
> +		drm_gpusvm_get_vram_page(page, zdd);
> +	}
> +
> +	err = gpusvm->ops->copy_to_vram(gpusvm, pages, dma_addr, npages);
> +	if (err)
> +		goto err_finalize;
> +
> +	/* Upon success bind vram allocation to range and zdd */
> +	range->vram_allocation = vram_allocation;
> +	WRITE_ONCE(zdd->vram_allocation, vram_allocation);	/* Owns ref */
> +
> +err_finalize:
> +	if (err)
> +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> +	migrate_vma_pages(&migrate);
> +	migrate_vma_finalize(&migrate);
> +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
> +				       DMA_TO_DEVICE);
> +err_free:
> +	if (zdd)
> +		drm_gpusvm_zdd_put(zdd);
> +	kvfree(buf);
> +err_mmunlock:
> +	if (!ctx->mmap_locked) {
> +		mmap_write_unlock(mm);
> +		mmput(mm);
> +	}
> +err_out:
> +	return err;
> +}
> +
> +/**
> + * drm_gpusvm_migrate_populate_sram_pfn - Populate SRAM PFNs for a VM area
> + * @vas: Pointer to the VM area structure, can be NULL
> + * @npages: Number of pages to populate
> + * @src_mpfn: Source array of migrate PFNs
> + * @mpfn: Array of migrate PFNs to populate
> + * @addr: Start address for PFN allocation
> + *
> + * This function populates the SRAM migrate page frame numbers (PFNs) for the
> + * specified VM area structure. It allocates and locks pages in the VM area for
> + * SRAM usage. If vas is non-NULL use alloc_page_vma for allocation, if NULL use
> + * alloc_page for allocation.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +static int drm_gpusvm_migrate_populate_sram_pfn(struct vm_area_struct *vas,
> +						unsigned long npages,
> +						unsigned long *src_mpfn,
> +						unsigned long *mpfn, u64 addr)
> +{
> +	unsigned long i;
> +
> +	for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
> +		struct page *page;
> +
> +		if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
> +			continue;
> +
> +		if (vas)
> +			page = alloc_page_vma(GFP_HIGHUSER, vas, addr);
> +		else
> +			page = alloc_page(GFP_HIGHUSER);
> +
> +		if (!page)
> +			return -ENOMEM;
> +
> +		lock_page(page);
> +		mpfn[i] = migrate_pfn(page_to_pfn(page));
> +	}
> +
> +	return 0;
> +}
> +
> +/**
> + * drm_gpusvm_evict_to_sram - Evict GPU SVM range to SRAM
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + *
> + * Similar to __drm_gpusvm_migrate_to_sram but does not require mmap lock and
> + * migration done via migrate_device_* functions. Fallback path as it is
> + * preferred to issue migrations with mmap lock.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +static int drm_gpusvm_evict_to_sram(struct drm_gpusvm *gpusvm,
> +				    struct drm_gpusvm_range *range)
> +{
> +	unsigned long npages;
> +	struct page **pages;
> +	unsigned long *src, *dst;
> +	dma_addr_t *dma_addr;
> +	void *buf;
> +	int i, err = 0;
> +
> +	npages = npages_in_range(range->va.start, range->va.end);
> +
> +	buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr) +
> +		       sizeof(*pages), GFP_KERNEL);
> +	if (!buf) {
> +		err = -ENOMEM;
> +		goto err_out;
> +	}
> +	src = buf;
> +	dst = buf + (sizeof(*src) * npages);
> +	dma_addr = buf + (2 * sizeof(*src) * npages);
> +	pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) * npages;
> +
> +	err = gpusvm->ops->populate_vram_pfn(gpusvm, range->vram_allocation,
> +					     npages, src);
> +	if (err)
> +		goto err_free;
> +
> +	err = migrate_device_vma_range(gpusvm->mm,
> +				       gpusvm->device_private_page_owner, src,
> +				       npages, range->va.start);
> +	if (err)
> +		goto err_free;
> +
> +	err = drm_gpusvm_migrate_populate_sram_pfn(NULL, npages, src, dst, 0);
> +	if (err)
> +		goto err_finalize;
> +
> +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> +					   dst, npages, DMA_BIDIRECTIONAL);
> +	if (err)
> +		goto err_finalize;
> +
> +	for (i = 0; i < npages; ++i)
> +		pages[i] = migrate_pfn_to_page(src[i]);
> +
> +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr, npages);
> +	if (err)
> +		goto err_finalize;
> +
> +err_finalize:
> +	if (err)
> +		drm_gpusvm_migration_put_pages(npages, dst);
> +	migrate_device_pages(src, dst, npages);
> +	migrate_device_finalize(src, dst, npages);
> +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
> +				       DMA_BIDIRECTIONAL);
> +err_free:
> +	kvfree(buf);
> +err_out:
> +
> +	return err;
> +}
> +
> +/**
> + * __drm_gpusvm_migrate_to_sram - Migrate GPU SVM range to SRAM (internal)
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @vas: Pointer to the VM area structure
> + * @page: Pointer to the page for fault handling (can be NULL)
> + * @start: Start address of the migration range
> + * @end: End address of the migration range
> + *
> + * This internal function performs the migration of the specified GPU SVM range
> + * to SRAM. It sets up the migration, populates + dma maps SRAM PFNs, and
> + * invokes the driver-specific operations for migration to SRAM.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +static int __drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> +					struct vm_area_struct *vas,
> +					struct page *page,
> +					u64 start, u64 end)
> +{
> +	struct migrate_vma migrate = {
> +		.vma		= vas,
> +		.pgmap_owner	= gpusvm->device_private_page_owner,
> +		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
> +		.fault_page	= page,
> +	};
> +	unsigned long npages;
> +	struct page **pages;
> +	dma_addr_t *dma_addr;
> +	void *buf;
> +	int i, err = 0;
> +
> +	mmap_assert_locked(gpusvm->mm);
> +
> +	/* Corner where VMA area struct has been partially unmapped */
> +	if (start < vas->vm_start)
> +		start = vas->vm_start;
> +	if (end > vas->vm_end)
> +		end = vas->vm_end;
> +
> +	migrate.start = start;
> +	migrate.end = end;
> +	npages = npages_in_range(start, end);
> +
> +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
> +		       sizeof(*pages), GFP_KERNEL);
> +	if (!buf) {
> +		err = -ENOMEM;
> +		goto err_out;
> +	}
> +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
> +
> +	migrate.vma = vas;
> +	migrate.src = buf;
> +	migrate.dst = migrate.src + npages;
> +
> +	err = migrate_vma_setup(&migrate);
> +	if (err)
> +		goto err_free;
> +
> +	/* Raced with another CPU fault, nothing to do */
> +	if (!migrate.cpages)
> +		goto err_free;
> +
> +	err = drm_gpusvm_migrate_populate_sram_pfn(vas, npages,
> +						   migrate.src, migrate.dst,
> +						   start);
> +	if (err)
> +		goto err_finalize;
> +
> +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> +					   migrate.dst, npages,
> +					   DMA_BIDIRECTIONAL);
> +	if (err)
> +		goto err_finalize;
> +
> +	for (i = 0; i < npages; ++i)
> +		pages[i] = migrate_pfn_to_page(migrate.src[i]);
> +
> +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr, npages);
> +	if (err)
> +		goto err_finalize;
> +
> +err_finalize:
> +	if (err)
> +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> +	migrate_vma_pages(&migrate);
> +	migrate_vma_finalize(&migrate);
> +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
> +				       DMA_BIDIRECTIONAL);
> +err_free:
> +	kvfree(buf);
> +err_out:
> +	mmap_assert_locked(gpusvm->mm);
> +
> +	return err;
> +}
> +
> +/**
> + * drm_gpusvm_migrate_to_sram - Migrate (evict) GPU SVM range to SRAM
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + * @ctx: GPU SVM context
> + *
> + * This function initiates the migration of the specified GPU SVM range to
> + * SRAM. It performs necessary checks and invokes the internal migration
> + * function for actual migration.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> +			       struct drm_gpusvm_range *range,
> +			       const struct drm_gpusvm_ctx *ctx)
> +{
> +	u64 start = range->va.start, end = range->va.end;
> +	struct mm_struct *mm = gpusvm->mm;
> +	struct vm_area_struct *vas;
> +	int err;
> +	bool retry = false;
> +
> +	if (!ctx->mmap_locked) {
> +		if (!mmget_not_zero(mm)) {
> +			err = -EFAULT;
> +			goto err_out;
> +		}
> +		if (ctx->trylock_mmap) {
> +			if (!mmap_read_trylock(mm))  {
> +				err = drm_gpusvm_evict_to_sram(gpusvm, range);
> +				goto err_mmput;
> +			}
> +		} else {
> +			mmap_read_lock(mm);
> +		}
> +	}
> +
> +	mmap_assert_locked(mm);
> +
> +	/*
> +	 * Loop required to find all VMA area structs for the corner case when
> +	 * VRAM backing has been partially unmapped from MM's address space.
> +	 */
> +again:
> +	vas = find_vma(mm, start);
> +	if (!vas) {
> +		if (!retry)
> +			err = -ENOENT;
> +		goto err_mmunlock;
> +	}
> +
> +	if (end <= vas->vm_start || start >= vas->vm_end) {
> +		if (!retry)
> +			err = -EINVAL;
> +		goto err_mmunlock;
> +	}
> +
> +	err = __drm_gpusvm_migrate_to_sram(gpusvm, vas, NULL, start, end);
> +	if (err)
> +		goto err_mmunlock;
> +
> +	if (vas->vm_end < end) {
> +		retry = true;
> +		start = vas->vm_end;
> +		goto again;
> +	}
> +
> +	if (!ctx->mmap_locked) {
> +		mmap_read_unlock(mm);
> +		/*
> +		 * Using mmput_async as this function can be called while
> +		 * holding a dma-resv lock, and a final put can grab the mmap
> +		 * lock, causing a lock inversion.
> +		 */
> +		mmput_async(mm);
> +	}
> +
> +	return 0;
> +
> +err_mmunlock:
> +	if (!ctx->mmap_locked)
> +		mmap_read_unlock(mm);
> +err_mmput:
> +	if (!ctx->mmap_locked)
> +		mmput_async(mm);
> +err_out:
> +	return err;
> +}
> +
> +/**
> + * drm_gpusvm_page_free - Put GPU SVM zone device data associated with a page
> + * @page: Pointer to the page
> + *
> + * This function is a callback used to put the GPU SVM zone device data
> + * associated with a page when it is being released.
> + */
> +static void drm_gpusvm_page_free(struct page *page)
> +{
> +	drm_gpusvm_zdd_put(page->zone_device_data);
> +}
> +
> +/**
> + * drm_gpusvm_migrate_to_ram - Migrate GPU SVM range to RAM (page fault handler)
> + * @vmf: Pointer to the fault information structure
> + *
> + * This function is a page fault handler used to migrate a GPU SVM range to RAM.
> + * It retrieves the GPU SVM range information from the faulting page and invokes
> + * the internal migration function to migrate the range back to RAM.
> + *
> + * Returns:
> + * VM_FAULT_SIGBUS on failure, 0 on success.
> + */
> +static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault *vmf)
> +{
> +	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
> +	int err;
> +
> +	err = __drm_gpusvm_migrate_to_sram(zdd->range->gpusvm,
> +					   vmf->vma, vmf->page,
> +					   zdd->range->va.start,
> +					   zdd->range->va.end);
> +
> +	return err ? VM_FAULT_SIGBUS : 0;
> +}
> +
> +/**
> + * drm_gpusvm_pagemap_ops - Device page map operations for GPU SVM
> + */
> +static const struct dev_pagemap_ops drm_gpusvm_pagemap_ops = {
> +	.page_free = drm_gpusvm_page_free,
> +	.migrate_to_ram = drm_gpusvm_migrate_to_ram,
> +};
> +
> +/**
> + * drm_gpusvm_pagemap_ops_get - Retrieve GPU SVM device page map operations
> + *
> + * Returns:
> + * Pointer to the GPU SVM device page map operations structure.
> + */
> +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void)
> +{
> +	return &drm_gpusvm_pagemap_ops;
> +}
> +
> +/**
> + * drm_gpusvm_has_mapping - Check if GPU SVM has mapping for the given address range
> + * @gpusvm: Pointer to the GPU SVM structure.
> + * @start: Start address
> + * @end: End address
> + *
> + * Returns:
> + * True if GPU SVM has mapping, False otherwise
> + */
> +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start, u64 end)
> +{
> +	struct drm_gpusvm_notifier *notifier;
> +
> +	drm_gpusvm_for_each_notifier(notifier, gpusvm, start, end) {
> +		struct drm_gpusvm_range *range = NULL;
> +
> +		drm_gpusvm_for_each_range(range, notifier, start, end)
> +			return true;
> +	}
> +
> +	return false;
> +}
> diff --git a/drivers/gpu/drm/xe/drm_gpusvm.h b/drivers/gpu/drm/xe/drm_gpusvm.h
> new file mode 100644
> index 000000000000..0ea70f8534a8
> --- /dev/null
> +++ b/drivers/gpu/drm/xe/drm_gpusvm.h
> @@ -0,0 +1,415 @@
> +/* SPDX-License-Identifier: MIT */
> +/*
> + * Copyright © 2024 Intel Corporation
> + */
> +
> +#ifndef __DRM_GPUSVM_H__
> +#define __DRM_GPUSVM_H__
> +
> +#include <linux/kref.h>
> +#include <linux/mmu_notifier.h>
> +#include <linux/workqueue.h>
> +
> +struct dev_pagemap_ops;
> +struct drm_device;
> +struct drm_gpusvm;
> +struct drm_gpusvm_notifier;
> +struct drm_gpusvm_ops;
> +struct drm_gpusvm_range;
> +
> +/**
> + * struct drm_gpusvm_ops - Operations structure for GPU SVM
> + *
> + * This structure defines the operations for GPU Shared Virtual Memory (SVM).
> + * These operations are provided by the GPU driver to manage SVM ranges and
> + * perform operations such as migration between VRAM and system RAM.
> + */
> +struct drm_gpusvm_ops {
> +	/**
> +	 * @notifier_alloc: Allocate a GPU SVM notifier (optional)
> +	 *
> +	 * This function shall allocate a GPU SVM notifier.
> +	 *
> +	 * Returns:
> +	 * Pointer to the allocated GPU SVM notifier on success, NULL on failure.
> +	 */
> +	struct drm_gpusvm_notifier *(*notifier_alloc)(void);
> +
> +	/**
> +	 * @notifier_free: Free a GPU SVM notifier (optional)
> +	 * @notifier: Pointer to the GPU SVM notifier to be freed
> +	 *
> +	 * This function shall free a GPU SVM notifier.
> +	 */
> +	void (*notifier_free)(struct drm_gpusvm_notifier *notifier);
> +
> +	/**
> +	 * @range_alloc: Allocate a GPU SVM range (optional)
> +	 * @gpusvm: Pointer to the GPU SVM
> +	 *
> +	 * This function shall allocate a GPU SVM range.
> +	 *
> +	 * Returns:
> +	 * Pointer to the allocated GPU SVM range on success, NULL on failure.
> +	 */
> +	struct drm_gpusvm_range *(*range_alloc)(struct drm_gpusvm *gpusvm);
> +
> +	/**
> +	 * @range_free: Free a GPU SVM range (optional)
> +	 * @range: Pointer to the GPU SVM range to be freed
> +	 *
> +	 * This function shall free a GPU SVM range.
> +	 */
> +	void (*range_free)(struct drm_gpusvm_range *range);
> +
> +	/**
> +	 * @vram_release: Release VRAM allocation (optional)
> +	 * @vram_allocation: Driver-private pointer to the VRAM allocation
> +	 *
> +	 * This function shall release VRAM allocation and expects to drop a
> +	 * reference to VRAM allocation.
> +	 */
> +	void (*vram_release)(void *vram_allocation);
> +
> +	/**
> +	 * @populate_vram_pfn: Populate VRAM PFN (required for migration)
> +	 * @gpusvm: Pointer to the GPU SVM
> +	 * @vram_allocation: Driver-private pointer to the VRAM allocation
> +	 * @npages: Number of pages to populate
> +	 * @pfn: Array of page frame numbers to populate
> +	 *
> +	 * This function shall populate VRAM page frame numbers (PFN).
> +	 *
> +	 * Returns:
> +	 * 0 on success, a negative error code on failure.
> +	 */
> +	int (*populate_vram_pfn)(struct drm_gpusvm *gpusvm,
> +				 void *vram_allocation,
> +				 unsigned long npages,
> +				 unsigned long *pfn);
> +
> +	/**
> +	 * @copy_to_vram: Copy to VRAM (required for migration)
> +	 * @gpusvm: Pointer to the GPU SVM
> +	 * @pages: Pointer to array of VRAM pages (destination)
> +	 * @dma_addr: Pointer to array of DMA addresses (source)
> +	 * @npages: Number of pages to copy
> +	 *
> +	 * This function shall copy pages to VRAM.
> +	 *
> +	 * Returns:
> +	 * 0 on success, a negative error code on failure.
> +	 */
> +	int (*copy_to_vram)(struct drm_gpusvm *gpusvm,
> +			    struct page **pages,
> +			    dma_addr_t *dma_addr,
> +			    unsigned long npages);
> +
> +	/**
> +	 * @copy_to_sram: Copy to system RAM (required for migration)
> +	 * @gpusvm: Pointer to the GPU SVM
> +	 * @pages: Pointer to array of VRAM pages (source)
> +	 * @dma_addr: Pointer to array of DMA addresses (destination)
> +	 * @npages: Number of pages to copy
> +	 *
> +	 * This function shall copy pages to system RAM.
> +	 *
> +	 * Returns:
> +	 * 0 on success, a negative error code on failure.
> +	 */
> +	int (*copy_to_sram)(struct drm_gpusvm *gpusvm,
> +			    struct page **pages,
> +			    dma_addr_t *dma_addr,
> +			    unsigned long npages);
> +
> +	/**
> +	 * @invalidate: Invalidate GPU SVM notifier (required)
> +	 * @gpusvm: Pointer to the GPU SVM
> +	 * @notifier: Pointer to the GPU SVM notifier
> +	 * @mmu_range: Pointer to the mmu_notifier_range structure
> +	 *
> +	 * This function shall invalidate the GPU page tables. It can safely
> +	 * walk the notifier range RB tree/list in this function. Called while
> +	 * holding the notifier lock.
> +	 */
> +	void (*invalidate)(struct drm_gpusvm *gpusvm,
> +			   struct drm_gpusvm_notifier *notifier,
> +			   const struct mmu_notifier_range *mmu_range);
> +};
> +
> +/**
> + * struct drm_gpusvm_notifier - Structure representing a GPU SVM notifier
> + *
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: MMU interval notifier
> + * @interval: Interval for the notifier
> + * @rb: Red-black tree node for the parent GPU SVM structure notifier tree
> + * @root: Cached root node of the RB tree containing ranges
> + * @range_list: List head of ranges in the same order they appear in the
> + *              interval tree. This is useful to keep iterating over ranges
> + *              while modifying the RB tree.
> + * @flags.removed: Flag indicating whether the MMU interval notifier has been
> + *                 removed
> + *
> + * This structure represents a GPU SVM notifier.
> + */
> +struct drm_gpusvm_notifier {
> +	struct drm_gpusvm *gpusvm;
> +	struct mmu_interval_notifier notifier;
> +	struct {
> +		u64 start;
> +		u64 end;
> +	} interval;
> +	struct {
> +		struct rb_node node;
> +		struct list_head entry;
> +		u64 __subtree_last;
> +	} rb;
> +	struct rb_root_cached root;
> +	struct list_head range_list;
> +	struct {
> +		u32 removed : 1;
> +	} flags;
> +};
> +
> +/**
> + * struct drm_gpusvm_range - Structure representing a GPU SVM range
> + *
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: Pointer to the GPU SVM notifier
> + * @refcount: Reference count for the range
> + * @rb: Red-black tree node for the parent GPU SVM notifier structure range tree
> + * @va: Virtual address range
> + * @notifier_seq: Notifier sequence number of the range's pages
> + * @pages: Pointer to the array of pages (if backing store is in VRAM)
> + * @dma_addr: DMA address array (if backing store is SRAM and DMA mapped)
> + * @vram_allocation: Driver-private pointer to the VRAM allocation
> + * @order: Order of dma mapping. i.e. PAGE_SIZE << order is mapping size
> + * @flags.migrate_vram: Flag indicating whether the range can be migrated to VRAM
> + * @flags.unmapped: Flag indicating if the range has been unmapped
> + * @flags.partial_unmap: Flag indicating if the range has been partially unmapped
> + * @flags.has_vram_pages: Flag indicating if the range has vram pages
> + * @flags.has_dma_mapping: Flag indicating if the range has a DMA mapping
> + * @flags.kfree_mapping: Flag indicating @dma_addr is a compact allocation based
> + *                       on @order which releases via kfree
> + *
> + * This structure represents a GPU SVM range used for tracking memory ranges
> + * mapped in a DRM device.
> + */
> +struct drm_gpusvm_range {
> +	struct drm_gpusvm *gpusvm;
> +	struct drm_gpusvm_notifier *notifier;
> +	struct kref refcount;
> +	struct {
> +		struct rb_node node;
> +		struct list_head entry;
> +		u64 __subtree_last;
> +	} rb;
> +	struct {
> +		u64 start;
> +		u64 end;
> +	} va;
> +	unsigned long notifier_seq;
> +	union {
> +		struct page **pages;
> +		dma_addr_t *dma_addr;
> +	};
> +	void *vram_allocation;
> +	u16 order;
> +	struct {
> +		/* All flags below must be set upon creation */
> +		u16 migrate_vram : 1;
> +		/* All flags below must be set / cleared under notifier lock */
> +		u16 unmapped : 1;
> +		u16 partial_unmap : 1;
> +		u16 has_vram_pages : 1;
> +		u16 has_dma_mapping : 1;
> +		u16 kfree_mapping : 1;
> +	} flags;
> +};
> +
> +/**
> + * struct drm_gpusvm - GPU SVM structure
> + *
> + * @name: Name of the GPU SVM
> + * @drm: Pointer to the DRM device structure
> + * @mm: Pointer to the mm_struct for the address space
> + * @device_private_page_owner: Device private pages owner
> + * @mm_start: Start address of GPU SVM
> + * @mm_range: Range of the GPU SVM
> + * @notifier_size: Size of individual notifiers
> + * @ops: Pointer to the operations structure for GPU SVM
> + * @chunk_sizes: Pointer to the array of chunk sizes used in range allocation.
> + *               Entries should be powers of 2 in descending order.
> + * @num_chunks: Number of chunks
> + * @notifier_lock: Read-write semaphore for protecting notifier operations
> + * @zdd_wq: Workqueue for deferred work on zdd destruction
> + * @root: Cached root node of the Red-Black tree containing GPU SVM notifiers
> + * @notifier_list: List head of notifiers in the same order they appear in
> + *                 the interval tree. This is useful to keep iterating over
> + *                 notifiers while modifying the RB tree.
> + *
> + * This structure represents a GPU SVM (Shared Virtual Memory) used for tracking
> + * memory ranges mapped in a DRM (Direct Rendering Manager) device.
> + *
> + * No reference counting is provided, as this is expected to be embedded in the
> + * driver VM structure along with the struct drm_gpuvm, which handles reference
> + * counting.
> + */
> +struct drm_gpusvm {
> +	const char *name;
> +	struct drm_device *drm;
> +	struct mm_struct *mm;
> +	void *device_private_page_owner;
> +	u64 mm_start;
> +	u64 mm_range;
> +	u64 notifier_size;
> +	const struct drm_gpusvm_ops *ops;
> +	const u64 *chunk_sizes;
> +	int num_chunks;
> +	struct rw_semaphore notifier_lock;
> +	struct workqueue_struct *zdd_wq;
> +	struct rb_root_cached root;
> +	struct list_head notifier_list;
> +};
> +
> +/**
> + * struct drm_gpusvm_ctx - DRM GPU SVM context
> + *
> + * @mmap_locked: mmap lock is locked
> + * @trylock_mmap: trylock mmap lock, used to avoid locking inversions
> + *                (e.g. dma-resv -> mmap lock)
> + * @in_notifier: entering from a MMU notifier
> + * @read_only: operating on read-only memory
> + * @vram_possible: possible to use VRAM
> + * @prefault: prefault pages
> + *
> + * Context that the DRM GPU SVM is operating in (i.e. user arguments).
> + */
> +struct drm_gpusvm_ctx {
> +	u32 mmap_locked :1;
> +	u32 trylock_mmap :1;
> +	u32 in_notifier :1;
> +	u32 read_only :1;
> +	u32 vram_possible :1;
> +	u32 prefault :1;
> +};
> +
> +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> +		    const char *name, struct drm_device *drm,
> +		    struct mm_struct *mm, void *device_private_page_owner,
> +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> +		    const struct drm_gpusvm_ops *ops,
> +		    const u64 *chunk_sizes, int num_chunks);
> +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm);
> +void drm_gpusvm_free(struct drm_gpusvm *gpusvm);
> +
> +struct drm_gpusvm_range *
> +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64 fault_addr,
> +				u64 gpuva_start, u64 gpuva_end,
> +				const struct drm_gpusvm_ctx *ctx);
> +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> +			     struct drm_gpusvm_range *range);
> +
> +struct drm_gpusvm_range *
> +drm_gpusvm_range_get(struct drm_gpusvm_range *range);
> +void drm_gpusvm_range_put(struct drm_gpusvm_range *range);
> +
> +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> +				  struct drm_gpusvm_range *range);
> +
> +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> +			       struct drm_gpusvm_range *range,
> +			       const struct drm_gpusvm_ctx *ctx);
> +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> +				  struct drm_gpusvm_range *range,
> +				  const struct drm_gpusvm_ctx *ctx);
> +
> +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> +			       struct drm_gpusvm_range *range,
> +			       void *vram_allocation,
> +			       const struct drm_gpusvm_ctx *ctx);
> +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> +			       struct drm_gpusvm_range *range,
> +			       const struct drm_gpusvm_ctx *ctx);
> +
> +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void);
> +
> +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start, u64 end);
> +
> +struct drm_gpusvm_range *
> +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64 start, u64 end);
> +
> +/**
> + * drm_gpusvm_notifier_lock - Lock GPU SVM notifier
> + * @gpusvm__: Pointer to the GPU SVM structure.
> + *
> + * Abstract client usage GPU SVM notifier lock, take lock
> + */
> +#define drm_gpusvm_notifier_lock(gpusvm__)	\
> +	down_read(&(gpusvm__)->notifier_lock)
> +
> +/**
> + * drm_gpusvm_notifier_unlock - Unlock GPU SVM notifier
> + * @gpusvm__: Pointer to the GPU SVM structure.
> + *
> + * Abstract client usage GPU SVM notifier lock, drop lock
> + */
> +#define drm_gpusvm_notifier_unlock(gpusvm__)	\
> +	up_read(&(gpusvm__)->notifier_lock)
> +
> +/**
> + * __drm_gpusvm_range_next - Get the next GPU SVM range in the list
> + * @range: a pointer to the current GPU SVM range
> + *
> + * Return: A pointer to the next drm_gpusvm_range if available, or NULL if the
> + *         current range is the last one or if the input range is NULL.
> + */
> +static inline struct drm_gpusvm_range *
> +__drm_gpusvm_range_next(struct drm_gpusvm_range *range)
> +{
> +	if (range && !list_is_last(&range->rb.entry,
> +				   &range->notifier->range_list))
> +		return list_next_entry(range, rb.entry);
> +
> +	return NULL;
> +}
> +
> +/**
> + * drm_gpusvm_for_each_range - Iterate over GPU SVM ranges in a notifier
> + * @range__: Iterator variable for the ranges. If set, it indicates the start of
> + *	     the iterator. If NULL, call drm_gpusvm_range_find() to get the range.
> + * @notifier__: Pointer to the GPU SVM notifier
> + * @start__: Start address of the range
> + * @end__: End address of the range
> + *
> + * This macro is used to iterate over GPU SVM ranges in a notifier. It is safe
> + * to use while holding the driver SVM lock or the notifier lock.
> + */
> +#define drm_gpusvm_for_each_range(range__, notifier__, start__, end__)	\
> +	for ((range__) = (range__) ?:					\
> +	     drm_gpusvm_range_find((notifier__), (start__), (end__));	\
> +	     (range__) && (range__->va.start < (end__));		\
> +	     (range__) = __drm_gpusvm_range_next(range__))
> +
> +/**
> + * drm_gpusvm_range_set_unmapped - Mark a GPU SVM range as unmapped
> + * @range: Pointer to the GPU SVM range structure.
> + * @mmu_range: Pointer to the MMU notifier range structure.
> + *
> + * This function marks a GPU SVM range as unmapped and sets the partial_unmap flag
> + * if the range partially falls within the provided MMU notifier range.
> + */
> +static inline void
> +drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range *range,
> +			      const struct mmu_notifier_range *mmu_range)
> +{
> +	lockdep_assert_held_write(&range->gpusvm->notifier_lock);
> +
> +	range->flags.unmapped = true;
> +	if (range->va.start < mmu_range->start ||
> +	    range->va.end > mmu_range->end)
> +		range->flags.partial_unmap = true;
> +}
> +
> +#endif /* __DRM_GPUSVM_H__ */
> -- 
> 2.34.1
>
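
Taken together, the exported entry points above compose into a GPU page-fault flow roughly like the hedged sketch below; driver_bind_range() is a hypothetical driver hook, and the IS_ERR() error convention for drm_gpusvm_range_find_or_insert() is an assumption rather than something this excerpt defines:

static int driver_handle_pagefault(struct drm_gpusvm *gpusvm, u64 fault_addr,
				   u64 gpuva_start, u64 gpuva_end,
				   void *vram_allocation)
{
	struct drm_gpusvm_ctx ctx = { .vram_possible = true };
	struct drm_gpusvm_range *range;
	int err;

	range = drm_gpusvm_range_find_or_insert(gpusvm, fault_addr,
						gpuva_start, gpuva_end, &ctx);
	if (IS_ERR(range))		/* assumed error convention */
		return PTR_ERR(range);

	/* Best effort: on migration failure fall back to system RAM pages */
	if (range->flags.migrate_vram)
		drm_gpusvm_migrate_to_vram(gpusvm, range, vram_allocation, &ctx);

	err = drm_gpusvm_range_get_pages(gpusvm, range, &ctx);
	if (err)
		return err;	/* e.g. -EAGAIN: caller retries the fault */

	/*
	 * Hypothetical driver hook: program range->dma_addr / range->pages
	 * into the GPU page tables under the notifier lock.
	 */
	return driver_bind_range(gpusvm, range);
}
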
Christian König Aug. 28, 2024, 2:46 p.m. UTC | #2
Am 28.08.24 um 16:31 schrieb Daniel Vetter:
> On Tue, Aug 27, 2024 at 07:48:38PM -0700, Matthew Brost wrote:
>> +		if (!ctx->mmap_locked) {
>> +			/*
>> +			 * XXX: HMM locking document indicates only a read-lock
>> +			 * is required but there appears to be a window between
>> +			 * the MMU_NOTIFY_MIGRATE event triggered in a CPU fault
>> +			 * via migrate_vma_setup and the pages actually moving
>> +			 * in migrate_vma_finalize in which this code can grab
>> +			 * garbage pages. Grabbing the write-lock if the range
>> +			 * is attached to vram appears to protect against this
>> +			 * race.
>> +			 */
> This one is really scary, since it means the entire migrate pte trickery
> is essentially completely busted. Grabbing the mmap write lock just means
> you block out pretty much everything interesting from concurrently
> happening.
>
> My gut feeling says we need to figure out what's happening here, because
> this looks a bit too fundamental to me.

I think I have at least a high-level understanding of what's going on here;
Felix and especially Philip should know more of the details.

In general, grabbing the mm_lock to protect PTEs from changing is complete
nonsense. The mm_lock is there to protect the VMAs and *not* the PTEs!

Even with the write side of the mm_lock taken, it is perfectly possible
that PTEs change. It's just less likely.

We ran into multiple issues as well before we figured out this important
distinction.

Christian.
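
What does serialize PTE changes against the device is the interval-notifier machinery itself; a hedged sketch of the invalidation side (the callback name is illustrative, while the write-mode notifier_lock and ops->invalidate usage follow this series):

static bool driver_notifier_invalidate(struct mmu_interval_notifier *mni,
				       const struct mmu_notifier_range *mmu_range,
				       unsigned long cur_seq)
{
	struct drm_gpusvm_notifier *notifier =
		container_of(mni, struct drm_gpusvm_notifier, notifier);
	struct drm_gpusvm *gpusvm = notifier->gpusvm;

	if (!mmu_notifier_range_blockable(mmu_range))
		return false;

	/* Write side of the lock that hmm_range_fault() users read-lock */
	down_write(&gpusvm->notifier_lock);
	mmu_interval_set_seq(mni, cur_seq);
	gpusvm->ops->invalidate(gpusvm, notifier, mmu_range);
	up_write(&gpusvm->notifier_lock);

	return true;
}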

> -Sima
>
>
>> +			if (vram_pages)
>> +				mmap_write_lock(mm);
>> +			else
>> +				mmap_read_lock(mm);
>> +		}
>> +		err = hmm_range_fault(&hmm_range);
>> +		if (!ctx->mmap_locked) {
>> +			if (vram_pages)
>> +				mmap_write_unlock(mm);
>> +			else
>> +				mmap_read_unlock(mm);
>> +		}
>> +
>> +		if (err == -EBUSY) {
>> +			if (time_after(jiffies, timeout))
>> +				break;
>> +
>> +			hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
>> +			continue;
>> +		}
>> +		break;
>> +	}
>> +	if (!ctx->mmap_locked)
>> +		mmput(mm);
>> +	if (err)
>> +		goto err_free;
>> +
>> +	pages = (struct page **)pfns;
>> +
>> +	if (ctx->prefault) {
>> +		range->pages = pages;
>> +		goto set_seqno;
>> +	}
>> +
>> +map_pages:
>> +	if (is_device_private_page(hmm_pfn_to_page(pfns[0]))) {
>> +		WARN_ON_ONCE(!range->vram_allocation);
>> +
>> +		for (i = 0; i < npages; ++i) {
>> +			pages[i] = hmm_pfn_to_page(pfns[i]);
>> +
>> +			if (WARN_ON_ONCE(!is_device_private_page(pages[i]))) {
>> +				err = -EOPNOTSUPP;
>> +				goto err_free;
>> +			}
>> +		}
>> +
>> +		/* Do not race with notifier unmapping pages */
>> +		drm_gpusvm_notifier_lock(gpusvm);
>> +		range->flags.has_vram_pages = true;
>> +		range->pages = pages;
>> +		if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
>> +			err = -EAGAIN;
>> +			__drm_gpusvm_range_unmap_pages(gpusvm, range);
>> +		}
>> +		drm_gpusvm_notifier_unlock(gpusvm);
>> +	} else {
>> +		dma_addr_t *dma_addr = (dma_addr_t *)pfns;
>> +
>> +		for_each_dma_page(i, j, npages, order) {
>> +			if (WARN_ON_ONCE(i && order !=
>> +					 hmm_pfn_to_map_order(pfns[i]))) {
>> +				err = -EOPNOTSUPP;
>> +				npages = i;
>> +				goto err_unmap;
>> +			}
>> +			order = hmm_pfn_to_map_order(pfns[i]);
>> +
>> +			pages[j] = hmm_pfn_to_page(pfns[i]);
>> +			if (WARN_ON_ONCE(is_zone_device_page(pages[j]))) {
>> +				err = -EOPNOTSUPP;
>> +				npages = i;
>> +				goto err_unmap;
>> +			}
>> +
>> +			set_page_dirty_lock(pages[j]);
>> +			mark_page_accessed(pages[j]);
>> +
>> +			dma_addr[j] = dma_map_page(gpusvm->drm->dev,
>> +						   pages[j], 0,
>> +						   PAGE_SIZE << order,
>> +						   DMA_BIDIRECTIONAL);
>> +			if (dma_mapping_error(gpusvm->drm->dev, dma_addr[j])) {
>> +				err = -EFAULT;
>> +				npages = i;
>> +				goto err_unmap;
>> +			}
>> +		}
>> +
>> +		/* Huge pages, reduce memory footprint */
>> +		if (order) {
>> +			dma_addr = kmalloc_array(j, sizeof(*dma_addr),
>> +						 GFP_KERNEL);
>> +			if (dma_addr) {
>> +				for (i = 0; i < j; ++i)
>> +					dma_addr[i] = (dma_addr_t)pfns[i];
>> +				kvfree(pfns);
>> +				kfree_mapping = true;
>> +			} else {
>> +				dma_addr = (dma_addr_t *)pfns;
>> +			}
>> +		}
>> +
>> +		/* Do not race with notifier unmapping pages */
>> +		drm_gpusvm_notifier_lock(gpusvm);
>> +		range->order = order;
>> +		range->flags.kfree_mapping = kfree_mapping;
>> +		range->flags.has_dma_mapping = true;
>> +		range->dma_addr = dma_addr;
>> +		range->vram_allocation = NULL;
>> +		if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
>> +			err = -EAGAIN;
>> +			__drm_gpusvm_range_unmap_pages(gpusvm, range);
>> +		}
>> +		drm_gpusvm_notifier_unlock(gpusvm);
>> +	}
>> +
>> +	if (err == -EAGAIN)
>> +		goto retry;
>> +set_seqno:
>> +	range->notifier_seq = hmm_range.notifier_seq;
>> +
>> +	return 0;
>> +
>> +err_unmap:
>> +	for_each_dma_page(i, j, npages, order)
>> +		dma_unmap_page(gpusvm->drm->dev,
>> +			       (dma_addr_t)pfns[j],
>> +			       PAGE_SIZE << order, DMA_BIDIRECTIONAL);
>> +err_free:
>> +	if (alloc_pfns)
>> +		kvfree(pfns);
>> +err_out:
>> +	return err;
>> +}
>> +
>> +/**
>> + * drm_gpusvm_range_unmap_pages - Unmap pages associated with a GPU SVM range
>> + * @gpusvm: Pointer to the GPU SVM structure
>> + * @range: Pointer to the GPU SVM range structure
>> + * @ctx: GPU SVM context
>> + *
>> + * This function unmaps pages associated with a GPU SVM range. If @in_notifier
>> + * is set, it is assumed that gpusvm->notifier_lock is held in write mode; if it
>> + * is clear, it acquires gpusvm->notifier_lock in read mode. Must be called on
>> + * each GPU SVM range attached to notifier in gpusvm->ops->invalidate for IOMMU
>> + * security model.
>> + */
>> +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
>> +				  struct drm_gpusvm_range *range,
>> +				  const struct drm_gpusvm_ctx *ctx)
>> +{
>> +	if (ctx->in_notifier)
>> +		lockdep_assert_held_write(&gpusvm->notifier_lock);
>> +	else
>> +		drm_gpusvm_notifier_lock(gpusvm);
>> +
>> +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
>> +
>> +	if (!ctx->in_notifier)
>> +		drm_gpusvm_notifier_unlock(gpusvm);
>> +}
>> +
>> +/**
>> + * drm_gpusvm_migration_put_page - Put a migration page
>> + * @page: Pointer to the page to put
>> + *
>> + * This function unlocks and puts a page.
>> + */
>> +static void drm_gpusvm_migration_put_page(struct page *page)
>> +{
>> +	unlock_page(page);
>> +	put_page(page);
>> +}
>> +
>> +/**
>> + * drm_gpusvm_migration_put_pages - Put migration pages
>> + * @npages: Number of pages
>> + * @migrate_pfn: Array of migrate page frame numbers
>> + *
>> + * This function puts an array of pages.
>> + */
>> +static void drm_gpusvm_migration_put_pages(unsigned long npages,
>> +					   unsigned long *migrate_pfn)
>> +{
>> +	unsigned long i;
>> +
>> +	for (i = 0; i < npages; ++i) {
>> +		if (!migrate_pfn[i])
>> +			continue;
>> +
>> +		drm_gpusvm_migration_put_page(migrate_pfn_to_page(migrate_pfn[i]));
>> +		migrate_pfn[i] = 0;
>> +	}
>> +}
>> +
>> +/**
>> + * drm_gpusvm_get_vram_page - Get a reference to a VRAM page
>> + * @page: Pointer to the page
>> + * @zdd: Pointer to the GPU SVM zone device data
>> + *
>> + * This function associates the given page with the specified GPU SVM zone
>> + * device data and initializes it for zone device usage.
>> + */
>> +static void drm_gpusvm_get_vram_page(struct page *page,
>> +				     struct drm_gpusvm_zdd *zdd)
>> +{
>> +	page->zone_device_data = drm_gpusvm_zdd_get(zdd);
>> +	zone_device_page_init(page);
>> +}
>> +
>> +/**
>> + * drm_gpusvm_migrate_map_pages() - Map migration pages for GPU SVM migration
>> + * @dev: The device for which the pages are being mapped
>> + * @dma_addr: Array to store DMA addresses corresponding to mapped pages
>> + * @migrate_pfn: Array of migrate page frame numbers to map
>> + * @npages: Number of pages to map
>> + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
>> + *
>> + * This function maps pages of memory for migration usage in GPU SVM. It
>> + * iterates over each page frame number provided in @migrate_pfn, maps the
>> + * corresponding page, and stores the DMA address in the provided @dma_addr
>> + * array.
>> + *
>> + * Return: 0 on success, -EFAULT if an error occurs during mapping.
>> + */
>> +static int drm_gpusvm_migrate_map_pages(struct device *dev,
>> +					dma_addr_t *dma_addr,
>> +					long unsigned int *migrate_pfn,
>> +					unsigned long npages,
>> +					enum dma_data_direction dir)
>> +{
>> +	unsigned long i;
>> +
>> +	for (i = 0; i < npages; ++i) {
>> +		struct page *page = migrate_pfn_to_page(migrate_pfn[i]);
>> +
>> +		if (!page)
>> +			continue;
>> +
>> +		if (WARN_ON_ONCE(is_zone_device_page(page)))
>> +			return -EFAULT;
>> +
>> +		dma_addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir);
>> +		if (dma_mapping_error(dev, dma_addr[i]))
>> +			return -EFAULT;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +/**
>> + * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously mapped for GPU SVM migration
>> + * @dev: The device for which the pages were mapped
>> + * @dma_addr: Array of DMA addresses corresponding to mapped pages
>> + * @npages: Number of pages to unmap
>> + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
>> + *
>> + * This function unmaps previously mapped pages of memory for GPU Shared Virtual
>> + * Memory (SVM). It iterates over each DMA address provided in @dma_addr, checks
>> + * if it's valid and not already unmapped, and unmaps the corresponding page.
>> + */
>> +static void drm_gpusvm_migrate_unmap_pages(struct device *dev,
>> +					   dma_addr_t *dma_addr,
>> +					   unsigned long npages,
>> +					   enum dma_data_direction dir)
>> +{
>> +	unsigned long i;
>> +
>> +	for (i = 0; i < npages; ++i) {
>> +		if (!dma_addr[i] || dma_mapping_error(dev, dma_addr[i]))
>> +			continue;
>> +
>> +		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
>> +	}
>> +}
>> +
>> +/**
>> + * drm_gpusvm_migrate_to_vram - Migrate GPU SVM range to VRAM
>> + * @gpusvm: Pointer to the GPU SVM structure
>> + * @range: Pointer to the GPU SVM range structure
>> + * @vram_allocation: Driver-private pointer to the VRAM allocation. The caller
>> + *                   should hold a reference to the VRAM allocation, which
>> + *                   should be dropped via ops->vram_allocation or upon the
>> + *                   failure of this function.
>> + * @ctx: GPU SVM context
>> + *
>> + * This function migrates the specified GPU SVM range to VRAM. It performs the
>> + * necessary setup and invokes the driver-specific operations for migration to
>> + * VRAM. Upon successful return, @vram_allocation can safely reference @range
>> + * until ops->vram_release is called, which only happens upon successful return.
>> + *
>> + * Returns:
>> + * 0 on success, negative error code on failure.
>> + */
>> +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
>> +			       struct drm_gpusvm_range *range,
>> +			       void *vram_allocation,
>> +			       const struct drm_gpusvm_ctx *ctx)
>> +{
>> +	u64 start = range->va.start, end = range->va.end;
>> +	struct migrate_vma migrate = {
>> +		.start		= start,
>> +		.end		= end,
>> +		.pgmap_owner	= gpusvm->device_private_page_owner,
>> +		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
>> +	};
>> +	struct mm_struct *mm = gpusvm->mm;
>> +	unsigned long i, npages = npages_in_range(start, end);
>> +	struct vm_area_struct *vas;
>> +	struct drm_gpusvm_zdd *zdd = NULL;
>> +	struct page **pages;
>> +	dma_addr_t *dma_addr;
>> +	void *buf;
>> +	int err;
>> +
>> +	if (!range->flags.migrate_vram)
>> +		return -EINVAL;
>> +
>> +	if (!gpusvm->ops->populate_vram_pfn || !gpusvm->ops->copy_to_vram ||
>> +	    !gpusvm->ops->copy_to_sram)
>> +		return -EOPNOTSUPP;
>> +
>> +	if (!ctx->mmap_locked) {
>> +		if (!mmget_not_zero(mm)) {
>> +			err = -EFAULT;
>> +			goto err_out;
>> +		}
>> +		mmap_write_lock(mm);
>> +	}
>> +
>> +	mmap_assert_locked(mm);
>> +
>> +	vas = vma_lookup(mm, start);
>> +	if (!vas) {
>> +		err = -ENOENT;
>> +		goto err_mmunlock;
>> +	}
>> +
>> +	if (end > vas->vm_end || start < vas->vm_start) {
>> +		err = -EINVAL;
>> +		goto err_mmunlock;
>> +	}
>> +
>> +	if (!vma_is_anonymous(vas)) {
>> +		err = -EBUSY;
>> +		goto err_mmunlock;
>> +	}
>> +
>> +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
>> +		       sizeof(*pages), GFP_KERNEL);
>> +	if (!buf) {
>> +		err = -ENOMEM;
>> +		goto err_mmunlock;
>> +	}
>> +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
>> +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
>> +
>> +	zdd = drm_gpusvm_zdd_alloc(range);
>> +	if (!zdd) {
>> +		err = -ENOMEM;
>> +		goto err_free;
>> +	}
>> +
>> +	migrate.vma = vas;
>> +	migrate.src = buf;
>> +	migrate.dst = migrate.src + npages;
>> +
>> +	err = migrate_vma_setup(&migrate);
>> +	if (err)
>> +		goto err_free;
>> +
>> +	/*
>> +	 * FIXME: Below cases, !migrate.cpages and migrate.cpages != npages, not
>> +	 * always an error. Need to revisit possible cases and how to handle. We
>> +	 * could prefault on migrate.cpages != npages via hmm_range_fault.
>> +	 */
>> +
>> +	if (!migrate.cpages) {
>> +		err = -EFAULT;
>> +		goto err_free;
>> +	}
>> +
>> +	if (migrate.cpages != npages) {
>> +		err = -EBUSY;
>> +		goto err_finalize;
>> +	}
>> +
>> +	err = gpusvm->ops->populate_vram_pfn(gpusvm, vram_allocation, npages,
>> +					     migrate.dst);
>> +	if (err)
>> +		goto err_finalize;
>> +
>> +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
>> +					   migrate.src, npages, DMA_TO_DEVICE);
>> +	if (err)
>> +		goto err_finalize;
>> +
>> +	for (i = 0; i < npages; ++i) {
>> +		struct page *page = pfn_to_page(migrate.dst[i]);
>> +
>> +		pages[i] = page;
>> +		migrate.dst[i] = migrate_pfn(migrate.dst[i]);
>> +		drm_gpusvm_get_vram_page(page, zdd);
>> +	}
>> +
>> +	err = gpusvm->ops->copy_to_vram(gpusvm, pages, dma_addr, npages);
>> +	if (err)
>> +		goto err_finalize;
>> +
>> +	/* Upon success bind vram allocation to range and zdd */
>> +	range->vram_allocation = vram_allocation;
>> +	WRITE_ONCE(zdd->vram_allocation, vram_allocation);	/* Owns ref */
>> +
>> +err_finalize:
>> +	if (err)
>> +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
>> +	migrate_vma_pages(&migrate);
>> +	migrate_vma_finalize(&migrate);
>> +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
>> +				       DMA_TO_DEVICE);
>> +err_free:
>> +	if (zdd)
>> +		drm_gpusvm_zdd_put(zdd);
>> +	kvfree(buf);
>> +err_mmunlock:
>> +	if (!ctx->mmap_locked) {
>> +		mmap_write_unlock(mm);
>> +		mmput(mm);
>> +	}
>> +err_out:
>> +	return err;
>> +}
>> +
>> +/**
>> + * drm_gpusvm_migrate_populate_sram_pfn - Populate SRAM PFNs for a VM area
>> + * @vas: Pointer to the VM area structure, can be NULL
>> + * @npages: Number of pages to populate
>> + * @src_mpfn: Source array of migrate PFNs
>> + * @mpfn: Array of migrate PFNs to populate
>> + * @addr: Start address for PFN allocation
>> + *
>> + * This function populates the SRAM migrate page frame numbers (PFNs) for the
>> + * specified VM area structure. It allocates and locks pages in the VM area for
>> + * SRAM usage. If vas is non-NULL use alloc_page_vma for allocation, if NULL use
>> + * alloc_page for allocation.
>> + *
>> + * Returns:
>> + * 0 on success, negative error code on failure.
>> + */
>> +static int drm_gpusvm_migrate_populate_sram_pfn(struct vm_area_struct *vas,
>> +						unsigned long npages,
>> +						unsigned long *src_mpfn,
>> +						unsigned long *mpfn, u64 addr)
>> +{
>> +	unsigned long i;
>> +
>> +	for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
>> +		struct page *page;
>> +
>> +		if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
>> +			continue;
>> +
>> +		if (vas)
>> +			page = alloc_page_vma(GFP_HIGHUSER, vas, addr);
>> +		else
>> +			page = alloc_page(GFP_HIGHUSER);
>> +
>> +		if (!page)
>> +			return -ENOMEM;
>> +
>> +		lock_page(page);
>> +		mpfn[i] = migrate_pfn(page_to_pfn(page));
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +/**
>> + * drm_gpusvm_evict_to_sram - Evict GPU SVM range to SRAM
>> + * @gpusvm: Pointer to the GPU SVM structure
>> + * @range: Pointer to the GPU SVM range structure
>> + *
>> + * Similar to __drm_gpusvm_migrate_to_sram but does not require mmap lock and
>> + * migration done via migrate_device_* functions. Fallback path as it is
>> + * preferred to issue migrations with mmap lock.
>> + *
>> + * Returns:
>> + * 0 on success, negative error code on failure.
>> + */
>> +static int drm_gpusvm_evict_to_sram(struct drm_gpusvm *gpusvm,
>> +				    struct drm_gpusvm_range *range)
>> +{
>> +	unsigned long npages;
>> +	struct page **pages;
>> +	unsigned long *src, *dst;
>> +	dma_addr_t *dma_addr;
>> +	void *buf;
>> +	int i, err = 0;
>> +
>> +	npages = npages_in_range(range->va.start, range->va.end);
>> +
>> +	buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr) +
>> +		       sizeof(*pages), GFP_KERNEL);
>> +	if (!buf) {
>> +		err = -ENOMEM;
>> +		goto err_out;
>> +	}
>> +	src = buf;
>> +	dst = buf + (sizeof(*src) * npages);
>> +	dma_addr = buf + (2 * sizeof(*src) * npages);
>> +	pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) * npages;
>> +
>> +	err = gpusvm->ops->populate_vram_pfn(gpusvm, range->vram_allocation,
>> +					     npages, src);
>> +	if (err)
>> +		goto err_free;
>> +
>> +	err = migrate_device_vma_range(gpusvm->mm,
>> +				       gpusvm->device_private_page_owner, src,
>> +				       npages, range->va.start);
>> +	if (err)
>> +		goto err_free;
>> +
>> +	err = drm_gpusvm_migrate_populate_sram_pfn(NULL, npages, src, dst, 0);
>> +	if (err)
>> +		goto err_finalize;
>> +
>> +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
>> +					   dst, npages, DMA_BIDIRECTIONAL);
>> +	if (err)
>> +		goto err_finalize;
>> +
>> +	for (i = 0; i < npages; ++i)
>> +		pages[i] = migrate_pfn_to_page(src[i]);
>> +
>> +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr, npages);
>> +	if (err)
>> +		goto err_finalize;
>> +
>> +err_finalize:
>> +	if (err)
>> +		drm_gpusvm_migration_put_pages(npages, dst);
>> +	migrate_device_pages(src, dst, npages);
>> +	migrate_device_finalize(src, dst, npages);
>> +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
>> +				       DMA_BIDIRECTIONAL);
>> +err_free:
>> +	kvfree(buf);
>> +err_out:
>> +
>> +	return err;
>> +}
>> +
>> +/**
>> + * __drm_gpusvm_migrate_to_sram - Migrate GPU SVM range to SRAM (internal)
>> + * @gpusvm: Pointer to the GPU SVM structure
>> + * @vas: Pointer to the VM area structure
>> + * @page: Pointer to the page for fault handling (can be NULL)
>> + * @start: Start address of the migration range
>> + * @end: End address of the migration range
>> + *
>> + * This internal function performs the migration of the specified GPU SVM range
>> + * to SRAM. It sets up the migration, populates + dma maps SRAM PFNs, and
>> + * invokes the driver-specific operations for migration to SRAM.
>> + *
>> + * Returns:
>> + * 0 on success, negative error code on failure.
>> + */
>> +static int __drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
>> +					struct vm_area_struct *vas,
>> +					struct page *page,
>> +					u64 start, u64 end)
>> +{
>> +	struct migrate_vma migrate = {
>> +		.vma		= vas,
>> +		.pgmap_owner	= gpusvm->device_private_page_owner,
>> +		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
>> +		.fault_page	= page,
>> +	};
>> +	unsigned long npages;
>> +	struct page **pages;
>> +	dma_addr_t *dma_addr;
>> +	void *buf;
>> +	int i, err = 0;
>> +
>> +	mmap_assert_locked(gpusvm->mm);
>> +
>> +	/* Corner where VMA area struct has been partially unmapped */
>> +	if (start < vas->vm_start)
>> +		start = vas->vm_start;
>> +	if (end > vas->vm_end)
>> +		end = vas->vm_end;
>> +
>> +	migrate.start = start;
>> +	migrate.end = end;
>> +	npages = npages_in_range(start, end);
>> +
>> +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
>> +		       sizeof(*pages), GFP_KERNEL);
>> +	if (!buf) {
>> +		err = -ENOMEM;
>> +		goto err_out;
>> +	}
>> +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
>> +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
>> +
>> +	migrate.vma = vas;
>> +	migrate.src = buf;
>> +	migrate.dst = migrate.src + npages;
>> +
>> +	err = migrate_vma_setup(&migrate);
>> +	if (err)
>> +		goto err_free;
>> +
>> +	/* Raced with another CPU fault, nothing to do */
>> +	if (!migrate.cpages)
>> +		goto err_free;
>> +
>> +	err = drm_gpusvm_migrate_populate_sram_pfn(vas, npages,
>> +						   migrate.src, migrate.dst,
>> +						   start);
>> +	if (err)
>> +		goto err_finalize;
>> +
>> +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
>> +					   migrate.dst, npages,
>> +					   DMA_BIDIRECTIONAL);
>> +	if (err)
>> +		goto err_finalize;
>> +
>> +	for (i = 0; i < npages; ++i)
>> +		pages[i] = migrate_pfn_to_page(migrate.src[i]);
>> +
>> +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr, npages);
>> +	if (err)
>> +		goto err_finalize;
>> +
>> +err_finalize:
>> +	if (err)
>> +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
>> +	migrate_vma_pages(&migrate);
>> +	migrate_vma_finalize(&migrate);
>> +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
>> +				       DMA_BIDIRECTIONAL);
>> +err_free:
>> +	kvfree(buf);
>> +err_out:
>> +	mmap_assert_locked(gpusvm->mm);
>> +
>> +	return err;
>> +}
>> +
>> +/**
>> + * drm_gpusvm_migrate_to_sram - Migrate (evict) GPU SVM range to SRAM
>> + * @gpusvm: Pointer to the GPU SVM structure
>> + * @range: Pointer to the GPU SVM range structure
>> + * @ctx: GPU SVM context
>> + *
>> + * This function initiates the migration of the specified GPU SVM range to
>> + * SRAM. It performs necessary checks and invokes the internal migration
>> + * function for actual migration.
>> + *
>> + * Returns:
>> + * 0 on success, negative error code on failure.
>> + */
>> +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
>> +			       struct drm_gpusvm_range *range,
>> +			       const struct drm_gpusvm_ctx *ctx)
>> +{
>> +	u64 start = range->va.start, end = range->va.end;
>> +	struct mm_struct *mm = gpusvm->mm;
>> +	struct vm_area_struct *vas;
>> +	int err;
>> +	bool retry = false;
>> +
>> +	if (!ctx->mmap_locked) {
>> +		if (!mmget_not_zero(mm)) {
>> +			err = -EFAULT;
>> +			goto err_out;
>> +		}
>> +		if (ctx->trylock_mmap) {
>> +			if (!mmap_read_trylock(mm))  {
>> +				err = drm_gpusvm_evict_to_sram(gpusvm, range);
>> +				goto err_mmput;
>> +			}
>> +		} else {
>> +			mmap_read_lock(mm);
>> +		}
>> +	}
>> +
>> +	mmap_assert_locked(mm);
>> +
>> +	/*
>> +	 * Loop required to find all VMA area structs for the corner case when
>> +	 * VRAM backing has been partially unmapped from MM's address space.
>> +	 */
>> +again:
>> +	vas = find_vma(mm, start);
>> +	if (!vas) {
>> +		if (!retry)
>> +			err = -ENOENT;
>> +		goto err_mmunlock;
>> +	}
>> +
>> +	if (end <= vas->vm_start || start >= vas->vm_end) {
>> +		if (!retry)
>> +			err = -EINVAL;
>> +		goto err_mmunlock;
>> +	}
>> +
>> +	err = __drm_gpusvm_migrate_to_sram(gpusvm, vas, NULL, start, end);
>> +	if (err)
>> +		goto err_mmunlock;
>> +
>> +	if (vas->vm_end < end) {
>> +		retry = true;
>> +		start = vas->vm_end;
>> +		goto again;
>> +	}
>> +
>> +	if (!ctx->mmap_locked) {
>> +		mmap_read_unlock(mm);
>> +		/*
>> +		 * Using mmput_async as this function can be called while
>> +		 * holding a dma-resv lock, and a final put can grab the mmap
>> +		 * lock, causing a lock inversion.
>> +		 */
>> +		mmput_async(mm);
>> +	}
>> +
>> +	return 0;
>> +
>> +err_mmunlock:
>> +	if (!ctx->mmap_locked)
>> +		mmap_read_unlock(mm);
>> +err_mmput:
>> +	if (!ctx->mmap_locked)
>> +		mmput_async(mm);
>> +err_out:
>> +	return err;
>> +}
>> +
>> +/**
>> + * drm_gpusvm_page_free - Put GPU SVM zone device data associated with a page
>> + * @page: Pointer to the page
>> + *
>> + * This function is a callback used to put the GPU SVM zone device data
>> + * associated with a page when it is being released.
>> + */
>> +static void drm_gpusvm_page_free(struct page *page)
>> +{
>> +	drm_gpusvm_zdd_put(page->zone_device_data);
>> +}
>> +
>> +/**
>> + * drm_gpusvm_migrate_to_ram - Migrate GPU SVM range to RAM (page fault handler)
>> + * @vmf: Pointer to the fault information structure
>> + *
>> + * This function is a page fault handler used to migrate a GPU SVM range to RAM.
>> + * It retrieves the GPU SVM range information from the faulting page and invokes
>> + * the internal migration function to migrate the range back to RAM.
>> + *
>> + * Returns:
>> + * VM_FAULT_SIGBUS on failure, 0 on success.
>> + */
>> +static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault *vmf)
>> +{
>> +	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
>> +	int err;
>> +
>> +	err = __drm_gpusvm_migrate_to_sram(zdd->range->gpusvm,
>> +					   vmf->vma, vmf->page,
>> +					   zdd->range->va.start,
>> +					   zdd->range->va.end);
>> +
>> +	return err ? VM_FAULT_SIGBUS : 0;
>> +}
>> +
>> +/**
>> + * drm_gpusvm_pagemap_ops - Device page map operations for GPU SVM
>> + */
>> +static const struct dev_pagemap_ops drm_gpusvm_pagemap_ops = {
>> +	.page_free = drm_gpusvm_page_free,
>> +	.migrate_to_ram = drm_gpusvm_migrate_to_ram,
>> +};
>> +
>> +/**
>> + * drm_gpusvm_pagemap_ops_get - Retrieve GPU SVM device page map operations
>> + *
>> + * Returns:
>> + * Pointer to the GPU SVM device page map operations structure.
>> + */
>> +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void)
>> +{
>> +	return &drm_gpusvm_pagemap_ops;
>> +}
>> +
>> +/**
>> + * drm_gpusvm_has_mapping - Check if GPU SVM has mapping for the given address range
>> + * @gpusvm: Pointer to the GPU SVM structure.
>> + * @start: Start address
>> + * @end: End address
>> + *
>> + * Returns:
>> + * True if GPU SVM has mapping, False otherwise
>> + */
>> +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start, u64 end)
>> +{
>> +	struct drm_gpusvm_notifier *notifier;
>> +
>> +	drm_gpusvm_for_each_notifier(notifier, gpusvm, start, end) {
>> +		struct drm_gpusvm_range *range = NULL;
>> +
>> +		drm_gpusvm_for_each_range(range, notifier, start, end)
>> +			return true;
>> +	}
>> +
>> +	return false;
>> +}
>> diff --git a/drivers/gpu/drm/xe/drm_gpusvm.h b/drivers/gpu/drm/xe/drm_gpusvm.h
>> new file mode 100644
>> index 000000000000..0ea70f8534a8
>> --- /dev/null
>> +++ b/drivers/gpu/drm/xe/drm_gpusvm.h
>> @@ -0,0 +1,415 @@
>> +/* SPDX-License-Identifier: MIT */
>> +/*
>> + * Copyright © 2024 Intel Corporation
>> + */
>> +
>> +#ifndef __DRM_GPUSVM_H__
>> +#define __DRM_GPUSVM_H__
>> +
>> +#include <linux/kref.h>
>> +#include <linux/mmu_notifier.h>
>> +#include <linux/workqueue.h>
>> +
>> +struct dev_pagemap_ops;
>> +struct drm_device;
>> +struct drm_gpusvm;
>> +struct drm_gpusvm_notifier;
>> +struct drm_gpusvm_ops;
>> +struct drm_gpusvm_range;
>> +
>> +/**
>> + * struct drm_gpusvm_ops - Operations structure for GPU SVM
>> + *
>> + * This structure defines the operations for GPU Shared Virtual Memory (SVM).
>> + * These operations are provided by the GPU driver to manage SVM ranges and
>> + * perform operations such as migration between VRAM and system RAM.
>> + */
>> +struct drm_gpusvm_ops {
>> +	/**
>> +	 * @notifier_alloc: Allocate a GPU SVM notifier (optional)
>> +	 *
>> +	 * This function shall allocate a GPU SVM notifier.
>> +	 *
>> +	 * Returns:
>> +	 * Pointer to the allocated GPU SVM notifier on success, NULL on failure.
>> +	 */
>> +	struct drm_gpusvm_notifier *(*notifier_alloc)(void);
>> +
>> +	/**
>> +	 * @notifier_free: Free a GPU SVM notifier (optional)
>> +	 * @notifier: Pointer to the GPU SVM notifier to be freed
>> +	 *
>> +	 * This function shall free a GPU SVM notifier.
>> +	 */
>> +	void (*notifier_free)(struct drm_gpusvm_notifier *notifier);
>> +
>> +	/**
>> +	 * @range_alloc: Allocate a GPU SVM range (optional)
>> +	 * @gpusvm: Pointer to the GPU SVM
>> +	 *
>> +	 * This function shall allocate a GPU SVM range.
>> +	 *
>> +	 * Returns:
>> +	 * Pointer to the allocated GPU SVM range on success, NULL on failure.
>> +	 */
>> +	struct drm_gpusvm_range *(*range_alloc)(struct drm_gpusvm *gpusvm);
>> +
>> +	/**
>> +	 * @range_free: Free a GPU SVM range (optional)
>> +	 * @range: Pointer to the GPU SVM range to be freed
>> +	 *
>> +	 * This function shall free a GPU SVM range.
>> +	 */
>> +	void (*range_free)(struct drm_gpusvm_range *range);
>> +
>> +	/**
>> +	 * @vram_release: Release VRAM allocation (optional)
>> +	 * @vram_allocation: Driver-private pointer to the VRAM allocation
>> +	 *
>> +	 * This function shall release VRAM allocation and expects to drop a
>> +	 * reference to VRAM allocation.
>> +	 */
>> +	void (*vram_release)(void *vram_allocation);
>> +
>> +	/**
>> +	 * @populate_vram_pfn: Populate VRAM PFN (required for migration)
>> +	 * @gpusvm: Pointer to the GPU SVM
>> +	 * @vram_allocation: Driver-private pointer to the VRAM allocation
>> +	 * @npages: Number of pages to populate
>> +	 * @pfn: Array of page frame numbers to populate
>> +	 *
>> +	 * This function shall populate VRAM page frame numbers (PFN).
>> +	 *
>> +	 * Returns:
>> +	 * 0 on success, a negative error code on failure.
>> +	 */
>> +	int (*populate_vram_pfn)(struct drm_gpusvm *gpusvm,
>> +				 void *vram_allocation,
>> +				 unsigned long npages,
>> +				 unsigned long *pfn);
>> +
>> +	/**
>> +	 * @copy_to_vram: Copy to VRAM (required for migration)
>> +	 * @gpusvm: Pointer to the GPU SVM
>> +	 * @pages: Pointer to array of VRAM pages (destination)
>> +	 * @dma_addr: Pointer to array of DMA addresses (source)
>> +	 * @npages: Number of pages to copy
>> +	 *
>> +	 * This function shall copy pages to VRAM.
>> +	 *
>> +	 * Returns:
>> +	 * 0 on success, a negative error code on failure.
>> +	 */
>> +	int (*copy_to_vram)(struct drm_gpusvm *gpusvm,
>> +			    struct page **pages,
>> +			    dma_addr_t *dma_addr,
>> +			    unsigned long npages);
>> +
>> +	/**
>> +	 * @copy_to_sram: Copy to system RAM (required for migration)
>> +	 * @gpusvm: Pointer to the GPU SVM
>> +	 * @pages: Pointer to array of VRAM pages (source)
>> +	 * @dma_addr: Pointer to array of DMA addresses (destination)
>> +	 * @npages: Number of pages to copy
>> +	 *
>> +	 * This function shall copy pages to system RAM.
>> +	 *
>> +	 * Returns:
>> +	 * 0 on success, a negative error code on failure.
>> +	 */
>> +	int (*copy_to_sram)(struct drm_gpusvm *gpusvm,
>> +			    struct page **pages,
>> +			    dma_addr_t *dma_addr,
>> +			    unsigned long npages);
>> +
>> +	/**
>> +	 * @invalidate: Invalidate GPU SVM notifier (required)
>> +	 * @gpusvm: Pointer to the GPU SVM
>> +	 * @notifier: Pointer to the GPU SVM notifier
>> +	 * @mmu_range: Pointer to the mmu_notifier_range structure
>> +	 *
>> +	 * This function shall invalidate the GPU page tables. It can safely
>> +	 * walk the notifier range RB tree/list in this function. Called while
>> +	 * holding the notifier lock.
>> +	 */
>> +	void (*invalidate)(struct drm_gpusvm *gpusvm,
>> +			   struct drm_gpusvm_notifier *notifier,
>> +			   const struct mmu_notifier_range *mmu_range);
>> +};
>> +
>> +/**
>> + * struct drm_gpusvm_notifier - Structure representing a GPU SVM notifier
>> + *
>> + * @gpusvm: Pointer to the GPU SVM structure
>> + * @notifier: MMU interval notifier
>> + * @interval: Interval for the notifier
>> + * @rb: Red-black tree node for the parent GPU SVM structure notifier tree
>> + * @root: Cached root node of the RB tree containing ranges
>> + * @range_list: List head of ranges in the same order they appear in the
>> + *              interval tree. This is useful to keep iterating ranges while
>> + *              modifying the RB tree.
>> + * @flags.removed: Flag indicating whether the MMU interval notifier has been
>> + *                 removed
>> + *
>> + * This structure represents a GPU SVM notifier.
>> + */
>> +struct drm_gpusvm_notifier {
>> +	struct drm_gpusvm *gpusvm;
>> +	struct mmu_interval_notifier notifier;
>> +	struct {
>> +		u64 start;
>> +		u64 end;
>> +	} interval;
>> +	struct {
>> +		struct rb_node node;
>> +		struct list_head entry;
>> +		u64 __subtree_last;
>> +	} rb;
>> +	struct rb_root_cached root;
>> +	struct list_head range_list;
>> +	struct {
>> +		u32 removed : 1;
>> +	} flags;
>> +};
>> +
>> +/**
>> + * struct drm_gpusvm_range - Structure representing a GPU SVM range
>> + *
>> + * @gpusvm: Pointer to the GPU SVM structure
>> + * @notifier: Pointer to the GPU SVM notifier
>> + * @refcount: Reference count for the range
>> + * @rb: Red-black tree node for the parent GPU SVM notifier structure range tree
>> + * @va: Virtual address range
>> + * @notifier_seq: Notifier sequence number of the range's pages
>> + * @pages: Pointer to the array of pages (if backing store is in VRAM)
>> + * @dma_addr: DMA address array (if backing store is SRAM and DMA mapped)
>> + * @vram_allocation: Driver-private pointer to the VRAM allocation
>> + * @order: Order of dma mapping. i.e. PAGE_SIZE << order is mapping size
>> + * @flags.migrate_vram: Flag indicating whether the range can be migrated to VRAM
>> + * @flags.unmapped: Flag indicating if the range has been unmapped
>> + * @flags.partial_unmap: Flag indicating if the range has been partially unmapped
>> + * @flags.has_vram_pages: Flag indicating if the range has vram pages
>> + * @flags.has_dma_mapping: Flag indicating if the range has a DMA mapping
>> + * @flags.kfree_mapping: Flag indicating @dma_addr is a compact allocation based
>> + *                       on @order which releases via kfree
>> + *
>> + * This structure represents a GPU SVM range used for tracking memory ranges
>> + * mapped in a DRM device.
>> + */
>> +struct drm_gpusvm_range {
>> +	struct drm_gpusvm *gpusvm;
>> +	struct drm_gpusvm_notifier *notifier;
>> +	struct kref refcount;
>> +	struct {
>> +		struct rb_node node;
>> +		struct list_head entry;
>> +		u64 __subtree_last;
>> +	} rb;
>> +	struct {
>> +		u64 start;
>> +		u64 end;
>> +	} va;
>> +	unsigned long notifier_seq;
>> +	union {
>> +		struct page **pages;
>> +		dma_addr_t *dma_addr;
>> +	};
>> +	void *vram_allocation;
>> +	u16 order;
>> +	struct {
>> +		/* All flags below must be set upon creation */
>> +		u16 migrate_vram : 1;
>> +		/* All flags below must be set / cleared under notifier lock */
>> +		u16 unmapped : 1;
>> +		u16 partial_unmap : 1;
>> +		u16 has_vram_pages : 1;
>> +		u16 has_dma_mapping : 1;
>> +		u16 kfree_mapping : 1;
>> +	} flags;
>> +};
>> +
>> +/**
>> + * struct drm_gpusvm - GPU SVM structure
>> + *
>> + * @name: Name of the GPU SVM
>> + * @drm: Pointer to the DRM device structure
>> + * @mm: Pointer to the mm_struct for the address space
>> + * @device_private_page_owner: Device private pages owner
>> + * @mm_start: Start address of GPU SVM
>> + * @mm_range: Range of the GPU SVM
>> + * @notifier_size: Size of individual notifiers
>> + * @ops: Pointer to the operations structure for GPU SVM
>> + * @chunk_sizes: Pointer to the array of chunk sizes used in range allocation.
>> + *               Entries should be powers of 2 in descending order.
>> + * @num_chunks: Number of chunks
>> + * @notifier_lock: Read-write semaphore for protecting notifier operations
>> + * @zdd_wq: Workqueue for deferred work on zdd destruction
>> + * @root: Cached root node of the Red-Black tree containing GPU SVM notifiers
>> + * @notifier_list: List head of notifiers in the same order they appear in
>> + *                 the interval tree. This is useful to keep iterating
>> + *                 notifiers while modifying the RB tree.
>> + *
>> + * This structure represents a GPU SVM (Shared Virtual Memory) used for tracking
>> + * memory ranges mapped in a DRM (Direct Rendering Manager) device.
>> + *
>> + * No reference counting is provided, as this is expected to be embedded in the
>> + * driver VM structure along with the struct drm_gpuvm, which handles reference
>> + * counting.
>> + */
>> +struct drm_gpusvm {
>> +	const char *name;
>> +	struct drm_device *drm;
>> +	struct mm_struct *mm;
>> +	void *device_private_page_owner;
>> +	u64 mm_start;
>> +	u64 mm_range;
>> +	u64 notifier_size;
>> +	const struct drm_gpusvm_ops *ops;
>> +	const u64 *chunk_sizes;
>> +	int num_chunks;
>> +	struct rw_semaphore notifier_lock;
>> +	struct workqueue_struct *zdd_wq;
>> +	struct rb_root_cached root;
>> +	struct list_head notifier_list;
>> +};
>> +
>> +/**
>> + * struct drm_gpusvm_ctx - DRM GPU SVM context
>> + *
>> + * @mmap_locked: mmap lock is locked
>> + * @trylock_mmap: trylock mmap lock, used to avoid locking inversions
>> + *                (e.g. dma-resv -> mmap lock)
>> + * @in_notifier: entering from a MMU notifier
>> + * @read_only: operating on read-only memory
>> + * @vram_possible: possible to use VRAM
>> + * @prefault: prefault pages
>> + *
>> + * Context that DRM GPUSVM is operating in (i.e. user arguments).
>> + */
>> +struct drm_gpusvm_ctx {
>> +	u32 mmap_locked :1;
>> +	u32 trylock_mmap :1;
>> +	u32 in_notifier :1;
>> +	u32 read_only :1;
>> +	u32 vram_possible :1;
>> +	u32 prefault :1;
>> +};
>> +
>> +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
>> +		    const char *name, struct drm_device *drm,
>> +		    struct mm_struct *mm, void *device_private_page_owner,
>> +		    u64 mm_start, u64 mm_range, u64 notifier_size,
>> +		    const struct drm_gpusvm_ops *ops,
>> +		    const u64 *chunk_sizes, int num_chunks);
>> +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm);
>> +void drm_gpusvm_free(struct drm_gpusvm *gpusvm);
>> +
>> +struct drm_gpusvm_range *
>> +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64 fault_addr,
>> +				u64 gpuva_start, u64 gpuva_end,
>> +				const struct drm_gpusvm_ctx *ctx);
>> +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
>> +			     struct drm_gpusvm_range *range);
>> +
>> +struct drm_gpusvm_range *
>> +drm_gpusvm_range_get(struct drm_gpusvm_range *range);
>> +void drm_gpusvm_range_put(struct drm_gpusvm_range *range);
>> +
>> +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
>> +				  struct drm_gpusvm_range *range);
>> +
>> +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
>> +			       struct drm_gpusvm_range *range,
>> +			       const struct drm_gpusvm_ctx *ctx);
>> +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
>> +				  struct drm_gpusvm_range *range,
>> +				  const struct drm_gpusvm_ctx *ctx);
>> +
>> +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
>> +			       struct drm_gpusvm_range *range,
>> +			       void *vram_allocation,
>> +			       const struct drm_gpusvm_ctx *ctx);
>> +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
>> +			       struct drm_gpusvm_range *range,
>> +			       const struct drm_gpusvm_ctx *ctx);
>> +
>> +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void);
>> +
>> +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start, u64 end);
>> +
>> +struct drm_gpusvm_range *
>> +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64 start, u64 end);
>> +
>> +/**
>> + * drm_gpusvm_notifier_lock - Lock GPU SVM notifier
>> + * @gpusvm__: Pointer to the GPU SVM structure.
>> + *
>> + * Abstract client usage GPU SVM notifier lock, take lock
>> + */
>> +#define drm_gpusvm_notifier_lock(gpusvm__)	\
>> +	down_read(&(gpusvm__)->notifier_lock)
>> +
>> +/**
>> + * drm_gpusvm_notifier_unlock - Unlock GPU SVM notifier
>> + * @gpusvm__: Pointer to the GPU SVM structure.
>> + *
>> + * Abstract client usage GPU SVM notifier lock, drop lock
>> + */
>> +#define drm_gpusvm_notifier_unlock(gpusvm__)	\
>> +	up_read(&(gpusvm__)->notifier_lock)
>> +
>> +/**
>> + * __drm_gpusvm_range_next - Get the next GPU SVM range in the list
>> + * @range: a pointer to the current GPU SVM range
>> + *
>> + * Return: A pointer to the next drm_gpusvm_range if available, or NULL if the
>> + *         current range is the last one or if the input range is NULL.
>> + */
>> +static inline struct drm_gpusvm_range *
>> +__drm_gpusvm_range_next(struct drm_gpusvm_range *range)
>> +{
>> +	if (range && !list_is_last(&range->rb.entry,
>> +				   &range->notifier->range_list))
>> +		return list_next_entry(range, rb.entry);
>> +
>> +	return NULL;
>> +}
>> +
>> +/**
>> + * drm_gpusvm_for_each_range - Iterate over GPU SVM ranges in a notifier
>> + * @range__: Iterator variable for the ranges. If set, it indicates the start of
>> + *	     the iterator. If NULL, call drm_gpusvm_range_find() to get the range.
>> + * @notifier__: Pointer to the GPU SVM notifier
>> + * @start__: Start address of the range
>> + * @end__: End address of the range
>> + *
>> + * This macro is used to iterate over GPU SVM ranges in a notifier. It is safe
>> + * to use while holding the driver SVM lock or the notifier lock.
>> + */
>> +#define drm_gpusvm_for_each_range(range__, notifier__, start__, end__)	\
>> +	for ((range__) = (range__) ?:					\
>> +	     drm_gpusvm_range_find((notifier__), (start__), (end__));	\
>> +	     (range__) && (range__->va.start < (end__));		\
>> +	     (range__) = __drm_gpusvm_range_next(range__))
>> +
>> +/**
>> + * drm_gpusvm_range_set_unmapped - Mark a GPU SVM range as unmapped
>> + * @range: Pointer to the GPU SVM range structure.
>> + * @mmu_range: Pointer to the MMU notifier range structure.
>> + *
>> + * This function marks a GPU SVM range as unmapped and sets the partial_unmap flag
>> + * if the range partially falls within the provided MMU notifier range.
>> + */
>> +static inline void
>> +drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range *range,
>> +			      const struct mmu_notifier_range *mmu_range)
>> +{
>> +	lockdep_assert_held_write(&range->gpusvm->notifier_lock);
>> +
>> +	range->flags.unmapped = true;
>> +	if (range->va.start < mmu_range->start ||
>> +	    range->va.end > mmu_range->end)
>> +		range->flags.partial_unmap = true;
>> +}
>> +
>> +#endif /* __DRM_GPUSVM_H__ */
>> -- 
>> 2.34.1
>>
Matthew Brost Aug. 28, 2024, 3:43 p.m. UTC | #3
On Wed, Aug 28, 2024 at 04:46:24PM +0200, Christian König wrote:
> Am 28.08.24 um 16:31 schrieb Daniel Vetter:
> > On Tue, Aug 27, 2024 at 07:48:38PM -0700, Matthew Brost wrote:
> > > +		if (!ctx->mmap_locked) {
> > > +			/*
> > > +			 * XXX: HMM locking document indicates only a read-lock
> > > +			 * is required but there appears to be a window between
> > > +			 * the MMU_NOTIFY_MIGRATE event triggered in a CPU fault
> > > +			 * via migrate_vma_setup and the pages actually moving
> > > +			 * in migrate_vma_finalize in which this code can grab
> > > +			 * garbage pages. Grabbing the write-lock if the range
> > > +			 * is attached to vram appears to protect against this
> > > +			 * race.
> > > +			 */

Thanks for the comments, replying to both of you inline.

> > This one is really scary, since it means the entire migrate pte trickery
> > is essentially completely busted. Grabbing the mmap write lock just means
> > you block out pretty much everything interesting from concurrently
> > happening.
> > 
> > My gut feeling says we need to figure out what's happening here, because
> > this looks a bit too fundamental to me.

I agree. I haven’t looked into this issue for a couple of months but
really need to understand what is going on.

I should have mentioned this in the cover letter: the goal of this
series was to produce something for review that is stable and supports
UMDs/user applications. It was not intended to be presented as a final
solution. This issue certainly falls into the category of "needs to be
understood and requires a proper fix."

One open question I have is whether the test case that triggers this
issue is even defined behavior. The test creates concurrent access
between the GPU and CPU to the same memory address, resulting in GPU and
CPU faults racing against each other. It’s possible that this is
undefined behavior, so data corruption might be acceptable—i.e., the
kernel can’t crash, but incorrect results might be permissible.

e.g. This is the only defined usage model:

alloc_memory();
start_compute_kernel();
sync_on_compute_kernel_completion();
read_memory();
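
For contrast, the racing case the test exercises looks roughly like this
(pseudo-code, simplified; write_memory() is just shorthand for a CPU store to
the same allocation while the compute kernel is still running):

alloc_memory();
start_compute_kernel();			/* GPU fault migrates range to VRAM */
write_memory();				/* concurrent CPU fault migrates it back */
sync_on_compute_kernel_completion();
read_memory();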

Hopefully, in the next week or so, I'll be heavily engaging with the UMD
teams. Development can then start, and applications will be running soon
after. This will allow us to address issues like this, collect data on
memory usage, and verify some of the assumptions I've made, such as
optimizing for 2M+ allocations.

> 
> I think I have at least a high level understanding what's going on here,
> Felix and especially Philip should know more of the details.
> 

I meant to reach out to AMD for issues like this. So, Felix
(felix.kuehling@amd.com) and Philip (Philip.Yang@amd.com) would be good
contacts?

> In general grabbing the mm_lock to protect PTEs from changing is completely
> nonsense. The mm_lock is to protect the VMAs and *not* the PTEs!
> 

Thanks for the hint. I noticed that the AMD implementation has some
additional locks for migration, which might be how you mitigated this
issue.
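
If that is the direction, I could add a driver-side lock that serializes
migration against page collection for a given range, rather than relying on
the mmap write lock. A minimal sketch of what I have in mind is below; it is
purely illustrative (my_svm_range, migrate_lock, and the wrapper names are
made up, not AMD's code), just to make sure I understand the suggestion:

/* Hypothetical sketch only -- names and structure are illustrative. */
struct my_svm_range {
	struct drm_gpusvm_range base;
	/* Serializes migration against page collection for this range */
	struct mutex migrate_lock;
};

static int my_svm_migrate_range_to_vram(struct drm_gpusvm *gpusvm,
					struct my_svm_range *range,
					void *vram_allocation,
					const struct drm_gpusvm_ctx *ctx)
{
	int err;

	mutex_lock(&range->migrate_lock);
	err = drm_gpusvm_migrate_to_vram(gpusvm, &range->base,
					 vram_allocation, ctx);
	mutex_unlock(&range->migrate_lock);

	return err;
}

static int my_svm_range_get_pages(struct drm_gpusvm *gpusvm,
				  struct my_svm_range *range,
				  const struct drm_gpusvm_ctx *ctx)
{
	int err;

	mutex_lock(&range->migrate_lock);
	err = drm_gpusvm_range_get_pages(gpusvm, &range->base, ctx);
	mutex_unlock(&range->migrate_lock);

	return err;
}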

I must say it is a bit unfortunate that the HMM locking documentation
doesn’t mention this. I believe the documentation needs additional
information, which I can add once we finalize the solution.

Matt 

> Even with the write side of the mm_lock taken it is perfectly possible that
> PTEs change. It's just less likely.
> 
> We ran into multiple issues before we figured out this important distinction
> as well.
> 
> Christian.
> 
> > -Sima
> > 
> > 
> > > +			if (vram_pages)
> > > +				mmap_write_lock(mm);
> > > +			else
> > > +				mmap_read_lock(mm);
> > > +		}
> > > +		err = hmm_range_fault(&hmm_range);
> > > +		if (!ctx->mmap_locked) {
> > > +			if (vram_pages)
> > > +				mmap_write_unlock(mm);
> > > +			else
> > > +				mmap_read_unlock(mm);
> > > +		}
> > > +
> > > +		if (err == -EBUSY) {
> > > +			if (time_after(jiffies, timeout))
> > > +				break;
> > > +
> > > +			hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> > > +			continue;
> > > +		}
> > > +		break;
> > > +	}
> > > +	if (!ctx->mmap_locked)
> > > +		mmput(mm);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	pages = (struct page **)pfns;
> > > +
> > > +	if (ctx->prefault) {
> > > +		range->pages = pages;
> > > +		goto set_seqno;
> > > +	}
> > > +
> > > +map_pages:
> > > +	if (is_device_private_page(hmm_pfn_to_page(pfns[0]))) {
> > > +		WARN_ON_ONCE(!range->vram_allocation);
> > > +
> > > +		for (i = 0; i < npages; ++i) {
> > > +			pages[i] = hmm_pfn_to_page(pfns[i]);
> > > +
> > > +			if (WARN_ON_ONCE(!is_device_private_page(pages[i]))) {
> > > +				err = -EOPNOTSUPP;
> > > +				goto err_free;
> > > +			}
> > > +		}
> > > +
> > > +		/* Do not race with notifier unmapping pages */
> > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > +		range->flags.has_vram_pages = true;
> > > +		range->pages = pages;
> > > +		if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
> > > +			err = -EAGAIN;
> > > +			__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > +		}
> > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > +	} else {
> > > +		dma_addr_t *dma_addr = (dma_addr_t *)pfns;
> > > +
> > > +		for_each_dma_page(i, j, npages, order) {
> > > +			if (WARN_ON_ONCE(i && order !=
> > > +					 hmm_pfn_to_map_order(pfns[i]))) {
> > > +				err = -EOPNOTSUPP;
> > > +				npages = i;
> > > +				goto err_unmap;
> > > +			}
> > > +			order = hmm_pfn_to_map_order(pfns[i]);
> > > +
> > > +			pages[j] = hmm_pfn_to_page(pfns[i]);
> > > +			if (WARN_ON_ONCE(is_zone_device_page(pages[j]))) {
> > > +				err = -EOPNOTSUPP;
> > > +				npages = i;
> > > +				goto err_unmap;
> > > +			}
> > > +
> > > +			set_page_dirty_lock(pages[j]);
> > > +			mark_page_accessed(pages[j]);
> > > +
> > > +			dma_addr[j] = dma_map_page(gpusvm->drm->dev,
> > > +						   pages[j], 0,
> > > +						   PAGE_SIZE << order,
> > > +						   DMA_BIDIRECTIONAL);
> > > +			if (dma_mapping_error(gpusvm->drm->dev, dma_addr[j])) {
> > > +				err = -EFAULT;
> > > +				npages = i;
> > > +				goto err_unmap;
> > > +			}
> > > +		}
> > > +
> > > +		/* Huge pages, reduce memory footprint */
> > > +		if (order) {
> > > +			dma_addr = kmalloc_array(j, sizeof(*dma_addr),
> > > +						 GFP_KERNEL);
> > > +			if (dma_addr) {
> > > +				for (i = 0; i < j; ++i)
> > > +					dma_addr[i] = (dma_addr_t)pfns[i];
> > > +				kvfree(pfns);
> > > +				kfree_mapping = true;
> > > +			} else {
> > > +				dma_addr = (dma_addr_t *)pfns;
> > > +			}
> > > +		}
> > > +
> > > +		/* Do not race with notifier unmapping pages */
> > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > +		range->order = order;
> > > +		range->flags.kfree_mapping = kfree_mapping;
> > > +		range->flags.has_dma_mapping = true;
> > > +		range->dma_addr = dma_addr;
> > > +		range->vram_allocation = NULL;
> > > +		if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
> > > +			err = -EAGAIN;
> > > +			__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > +		}
> > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > +	}
> > > +
> > > +	if (err == -EAGAIN)
> > > +		goto retry;
> > > +set_seqno:
> > > +	range->notifier_seq = hmm_range.notifier_seq;
> > > +
> > > +	return 0;
> > > +
> > > +err_unmap:
> > > +	for_each_dma_page(i, j, npages, order)
> > > +		dma_unmap_page(gpusvm->drm->dev,
> > > +			       (dma_addr_t)pfns[j],
> > > +			       PAGE_SIZE << order, DMA_BIDIRECTIONAL);
> > > +err_free:
> > > +	if (alloc_pfns)
> > > +		kvfree(pfns);
> > > +err_out:
> > > +	return err;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_unmap_pages - Unmap pages associated with a GPU SVM range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + * @ctx: GPU SVM context
> > > + *
> > > + * This function unmaps pages associated with a GPU SVM range. If @in_notifier
> > > + * is set, it is assumed that gpusvm->notifier_lock is held in write mode; if it
> > > + * is clear, it acquires gpusvm->notifier_lock in read mode. Must be called on
> > > + * each GPU SVM range attached to notifier in gpusvm->ops->invalidate for IOMMU
> > > + * security model.
> > > + */
> > > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > > +				  struct drm_gpusvm_range *range,
> > > +				  const struct drm_gpusvm_ctx *ctx)
> > > +{
> > > +	if (ctx->in_notifier)
> > > +		lockdep_assert_held_write(&gpusvm->notifier_lock);
> > > +	else
> > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > +
> > > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > +
> > > +	if (!ctx->in_notifier)
> > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migration_put_page - Put a migration page
> > > + * @page: Pointer to the page to put
> > > + *
> > > + * This function unlocks and puts a page.
> > > + */
> > > +static void drm_gpusvm_migration_put_page(struct page *page)
> > > +{
> > > +	unlock_page(page);
> > > +	put_page(page);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migration_put_pages - Put migration pages
> > > + * @npages: Number of pages
> > > + * @migrate_pfn: Array of migrate page frame numbers
> > > + *
> > > + * This function puts an array of pages.
> > > + */
> > > +static void drm_gpusvm_migration_put_pages(unsigned long npages,
> > > +					   unsigned long *migrate_pfn)
> > > +{
> > > +	unsigned long i;
> > > +
> > > +	for (i = 0; i < npages; ++i) {
> > > +		if (!migrate_pfn[i])
> > > +			continue;
> > > +
> > > +		drm_gpusvm_migration_put_page(migrate_pfn_to_page(migrate_pfn[i]));
> > > +		migrate_pfn[i] = 0;
> > > +	}
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_get_vram_page - Get a reference to a VRAM page
> > > + * @page: Pointer to the page
> > > + * @zdd: Pointer to the GPU SVM zone device data
> > > + *
> > > + * This function associates the given page with the specified GPU SVM zone
> > > + * device data and initializes it for zone device usage.
> > > + */
> > > +static void drm_gpusvm_get_vram_page(struct page *page,
> > > +				     struct drm_gpusvm_zdd *zdd)
> > > +{
> > > +	page->zone_device_data = drm_gpusvm_zdd_get(zdd);
> > > +	zone_device_page_init(page);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_map_pages() - Map migration pages for GPU SVM migration
> > > + * @dev: The device for which the pages are being mapped
> > > + * @dma_addr: Array to store DMA addresses corresponding to mapped pages
> > > + * @migrate_pfn: Array of migrate page frame numbers to map
> > > + * @npages: Number of pages to map
> > > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > > + *
> > > + * This function maps pages of memory for migration usage in GPU SVM. It
> > > + * iterates over each page frame number provided in @migrate_pfn, maps the
> > > + * corresponding page, and stores the DMA address in the provided @dma_addr
> > > + * array.
> > > + *
> > > + * Return: 0 on success, -EFAULT if an error occurs during mapping.
> > > + */
> > > +static int drm_gpusvm_migrate_map_pages(struct device *dev,
> > > +					dma_addr_t *dma_addr,
> > > +					long unsigned int *migrate_pfn,
> > > +					unsigned long npages,
> > > +					enum dma_data_direction dir)
> > > +{
> > > +	unsigned long i;
> > > +
> > > +	for (i = 0; i < npages; ++i) {
> > > +		struct page *page = migrate_pfn_to_page(migrate_pfn[i]);
> > > +
> > > +		if (!page)
> > > +			continue;
> > > +
> > > +		if (WARN_ON_ONCE(is_zone_device_page(page)))
> > > +			return -EFAULT;
> > > +
> > > +		dma_addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir);
> > > +		if (dma_mapping_error(dev, dma_addr[i]))
> > > +			return -EFAULT;
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously mapped for GPU SVM migration
> > > + * @dev: The device for which the pages were mapped
> > > + * @dma_addr: Array of DMA addresses corresponding to mapped pages
> > > + * @npages: Number of pages to unmap
> > > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > > + *
> > > + * This function unmaps previously mapped pages of memory for GPU Shared Virtual
> > > + * Memory (SVM). It iterates over each DMA address provided in @dma_addr, checks
> > > + * if it's valid and not already unmapped, and unmaps the corresponding page.
> > > + */
> > > +static void drm_gpusvm_migrate_unmap_pages(struct device *dev,
> > > +					   dma_addr_t *dma_addr,
> > > +					   unsigned long npages,
> > > +					   enum dma_data_direction dir)
> > > +{
> > > +	unsigned long i;
> > > +
> > > +	for (i = 0; i < npages; ++i) {
> > > +		if (!dma_addr[i] || dma_mapping_error(dev, dma_addr[i]))
> > > +			continue;
> > > +
> > > +		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
> > > +	}
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_to_vram - Migrate GPU SVM range to VRAM
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + * @vram_allocation: Driver-private pointer to the VRAM allocation. The caller
> > > + *                   should hold a reference to the VRAM allocation, which
> > > + *                   should be dropped via ops->vram_release or upon the
> > > + *                   failure of this function.
> > > + * @ctx: GPU SVM context
> > > + *
> > > + * This function migrates the specified GPU SVM range to VRAM. It performs the
> > > + * necessary setup and invokes the driver-specific operations for migration to
> > > + * VRAM. Upon successful return, @vram_allocation can safely reference @range
> > > + * until ops->vram_release is called, which only happens after this function
> > > + * returns successfully.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       void *vram_allocation,
> > > +			       const struct drm_gpusvm_ctx *ctx)
> > > +{
> > > +	u64 start = range->va.start, end = range->va.end;
> > > +	struct migrate_vma migrate = {
> > > +		.start		= start,
> > > +		.end		= end,
> > > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > > +		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
> > > +	};
> > > +	struct mm_struct *mm = gpusvm->mm;
> > > +	unsigned long i, npages = npages_in_range(start, end);
> > > +	struct vm_area_struct *vas;
> > > +	struct drm_gpusvm_zdd *zdd = NULL;
> > > +	struct page **pages;
> > > +	dma_addr_t *dma_addr;
> > > +	void *buf;
> > > +	int err;
> > > +
> > > +	if (!range->flags.migrate_vram)
> > > +		return -EINVAL;
> > > +
> > > +	if (!gpusvm->ops->populate_vram_pfn || !gpusvm->ops->copy_to_vram ||
> > > +	    !gpusvm->ops->copy_to_sram)
> > > +		return -EOPNOTSUPP;
> > > +
> > > +	if (!ctx->mmap_locked) {
> > > +		if (!mmget_not_zero(mm)) {
> > > +			err = -EFAULT;
> > > +			goto err_out;
> > > +		}
> > > +		mmap_write_lock(mm);
> > > +	}
> > > +
> > > +	mmap_assert_locked(mm);
> > > +
> > > +	vas = vma_lookup(mm, start);
> > > +	if (!vas) {
> > > +		err = -ENOENT;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	if (end > vas->vm_end || start < vas->vm_start) {
> > > +		err = -EINVAL;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	if (!vma_is_anonymous(vas)) {
> > > +		err = -EBUSY;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
> > > +		       sizeof(*pages), GFP_KERNEL);
> > > +	if (!buf) {
> > > +		err = -ENOMEM;
> > > +		goto err_mmunlock;
> > > +	}
> > > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
> > > +
> > > +	zdd = drm_gpusvm_zdd_alloc(range);
> > > +	if (!zdd) {
> > > +		err = -ENOMEM;
> > > +		goto err_free;
> > > +	}
> > > +
> > > +	migrate.vma = vas;
> > > +	migrate.src = buf;
> > > +	migrate.dst = migrate.src + npages;
> > > +
> > > +	err = migrate_vma_setup(&migrate);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	/*
> > > +	 * FIXME: Below cases, !migrate.cpages and migrate.cpages != npages, not
> > > +	 * always an error. Need to revisit possible cases and how to handle. We
> > > +	 * could prefault on migrate.cpages != npages via hmm_range_fault.
> > > +	 */
> > > +
> > > +	if (!migrate.cpages) {
> > > +		err = -EFAULT;
> > > +		goto err_free;
> > > +	}
> > > +
> > > +	if (migrate.cpages != npages) {
> > > +		err = -EBUSY;
> > > +		goto err_finalize;
> > > +	}
> > > +
> > > +	err = gpusvm->ops->populate_vram_pfn(gpusvm, vram_allocation, npages,
> > > +					     migrate.dst);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> > > +					   migrate.src, npages, DMA_TO_DEVICE);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	for (i = 0; i < npages; ++i) {
> > > +		struct page *page = pfn_to_page(migrate.dst[i]);
> > > +
> > > +		pages[i] = page;
> > > +		migrate.dst[i] = migrate_pfn(migrate.dst[i]);
> > > +		drm_gpusvm_get_vram_page(page, zdd);
> > > +	}
> > > +
> > > +	err = gpusvm->ops->copy_to_vram(gpusvm, pages, dma_addr, npages);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	/* Upon success bind vram allocation to range and zdd */
> > > +	range->vram_allocation = vram_allocation;
> > > +	WRITE_ONCE(zdd->vram_allocation, vram_allocation);	/* Owns ref */
> > > +
> > > +err_finalize:
> > > +	if (err)
> > > +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> > > +	migrate_vma_pages(&migrate);
> > > +	migrate_vma_finalize(&migrate);
> > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
> > > +				       DMA_TO_DEVICE);
> > > +err_free:
> > > +	if (zdd)
> > > +		drm_gpusvm_zdd_put(zdd);
> > > +	kvfree(buf);
> > > +err_mmunlock:
> > > +	if (!ctx->mmap_locked) {
> > > +		mmap_write_unlock(mm);
> > > +		mmput(mm);
> > > +	}
> > > +err_out:
> > > +	return err;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_populate_sram_pfn - Populate SRAM PFNs for a VM area
> > > + * @vas: Pointer to the VM area structure, can be NULL
> > > + * @npages: Number of pages to populate
> > > + * @src_mpfn: Source array of migrate PFNs
> > > + * @mpfn: Array of migrate PFNs to populate
> > > + * @addr: Start address for PFN allocation
> > > + *
> > > + * This function populates the SRAM migrate page frame numbers (PFNs) for the
> > > + * specified VM area structure. It allocates and locks pages in the VM area for
> > > + * SRAM usage. If vas is non-NULL use alloc_page_vma for allocation, if NULL use
> > > + * alloc_page for allocation.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +static int drm_gpusvm_migrate_populate_sram_pfn(struct vm_area_struct *vas,
> > > +						unsigned long npages,
> > > +						unsigned long *src_mpfn,
> > > +						unsigned long *mpfn, u64 addr)
> > > +{
> > > +	unsigned long i;
> > > +
> > > +	for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
> > > +		struct page *page;
> > > +
> > > +		if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
> > > +			continue;
> > > +
> > > +		if (vas)
> > > +			page = alloc_page_vma(GFP_HIGHUSER, vas, addr);
> > > +		else
> > > +			page = alloc_page(GFP_HIGHUSER);
> > > +
> > > +		if (!page)
> > > +			return -ENOMEM;
> > > +
> > > +		lock_page(page);
> > > +		mpfn[i] = migrate_pfn(page_to_pfn(page));
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_evict_to_sram - Evict GPU SVM range to SRAM
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + *
> > > + * Similar to __drm_gpusvm_migrate_to_sram but does not require mmap lock and
> > > + * migration done via migrate_device_* functions. Fallback path as it is
> > > + * preferred to issue migrations with mmap lock.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +static int drm_gpusvm_evict_to_sram(struct drm_gpusvm *gpusvm,
> > > +				    struct drm_gpusvm_range *range)
> > > +{
> > > +	unsigned long npages;
> > > +	struct page **pages;
> > > +	unsigned long *src, *dst;
> > > +	dma_addr_t *dma_addr;
> > > +	void *buf;
> > > +	int i, err = 0;
> > > +
> > > +	npages = npages_in_range(range->va.start, range->va.end);
> > > +
> > > +	buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr) +
> > > +		       sizeof(*pages), GFP_KERNEL);
> > > +	if (!buf) {
> > > +		err = -ENOMEM;
> > > +		goto err_out;
> > > +	}
> > > +	src = buf;
> > > +	dst = buf + (sizeof(*src) * npages);
> > > +	dma_addr = buf + (2 * sizeof(*src) * npages);
> > > +	pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) * npages;
> > > +
> > > +	err = gpusvm->ops->populate_vram_pfn(gpusvm, range->vram_allocation,
> > > +					     npages, src);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	err = migrate_device_vma_range(gpusvm->mm,
> > > +				       gpusvm->device_private_page_owner, src,
> > > +				       npages, range->va.start);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	err = drm_gpusvm_migrate_populate_sram_pfn(NULL, npages, src, dst, 0);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> > > +					   dst, npages, DMA_BIDIRECTIONAL);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	for (i = 0; i < npages; ++i)
> > > +		pages[i] = migrate_pfn_to_page(src[i]);
> > > +
> > > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr, npages);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +err_finalize:
> > > +	if (err)
> > > +		drm_gpusvm_migration_put_pages(npages, dst);
> > > +	migrate_device_pages(src, dst, npages);
> > > +	migrate_device_finalize(src, dst, npages);
> > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
> > > +				       DMA_BIDIRECTIONAL);
> > > +err_free:
> > > +	kvfree(buf);
> > > +err_out:
> > > +
> > > +	return err;
> > > +}
> > > +
> > > +/**
> > > + * __drm_gpusvm_migrate_to_sram - Migrate GPU SVM range to SRAM (internal)
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @vas: Pointer to the VM area structure
> > > + * @page: Pointer to the page for fault handling (can be NULL)
> > > + * @start: Start address of the migration range
> > > + * @end: End address of the migration range
> > > + *
> > > + * This internal function performs the migration of the specified GPU SVM range
> > > + * to SRAM. It sets up the migration, populates + dma maps SRAM PFNs, and
> > > + * invokes the driver-specific operations for migration to SRAM.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +static int __drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > +					struct vm_area_struct *vas,
> > > +					struct page *page,
> > > +					u64 start, u64 end)
> > > +{
> > > +	struct migrate_vma migrate = {
> > > +		.vma		= vas,
> > > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > > +		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
> > > +		.fault_page	= page,
> > > +	};
> > > +	unsigned long npages;
> > > +	struct page **pages;
> > > +	dma_addr_t *dma_addr;
> > > +	void *buf;
> > > +	int i, err = 0;
> > > +
> > > +	mmap_assert_locked(gpusvm->mm);
> > > +
> > > +	/* Corner where VMA area struct has been partially unmapped */
> > > +	if (start < vas->vm_start)
> > > +		start = vas->vm_start;
> > > +	if (end > vas->vm_end)
> > > +		end = vas->vm_end;
> > > +
> > > +	migrate.start = start;
> > > +	migrate.end = end;
> > > +	npages = npages_in_range(start, end);
> > > +
> > > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
> > > +		       sizeof(*pages), GFP_KERNEL);
> > > +	if (!buf) {
> > > +		err = -ENOMEM;
> > > +		goto err_out;
> > > +	}
> > > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
> > > +
> > > +	migrate.vma = vas;
> > > +	migrate.src = buf;
> > > +	migrate.dst = migrate.src + npages;
> > > +
> > > +	err = migrate_vma_setup(&migrate);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	/* Raced with another CPU fault, nothing to do */
> > > +	if (!migrate.cpages)
> > > +		goto err_free;
> > > +
> > > +	err = drm_gpusvm_migrate_populate_sram_pfn(vas, npages,
> > > +						   migrate.src, migrate.dst,
> > > +						   start);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> > > +					   migrate.dst, npages,
> > > +					   DMA_BIDIRECTIONAL);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	for (i = 0; i < npages; ++i)
> > > +		pages[i] = migrate_pfn_to_page(migrate.src[i]);
> > > +
> > > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr, npages);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +err_finalize:
> > > +	if (err)
> > > +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> > > +	migrate_vma_pages(&migrate);
> > > +	migrate_vma_finalize(&migrate);
> > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
> > > +				       DMA_BIDIRECTIONAL);
> > > +err_free:
> > > +	kvfree(buf);
> > > +err_out:
> > > +	mmap_assert_locked(gpusvm->mm);
> > > +
> > > +	return err;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_to_sram - Migrate (evict) GPU SVM range to SRAM
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + * @ctx: GPU SVM context
> > > + *
> > > + * This function initiates the migration of the specified GPU SVM range to
> > > + * SRAM. It performs necessary checks and invokes the internal migration
> > > + * function for actual migration.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       const struct drm_gpusvm_ctx *ctx)
> > > +{
> > > +	u64 start = range->va.start, end = range->va.end;
> > > +	struct mm_struct *mm = gpusvm->mm;
> > > +	struct vm_area_struct *vas;
> > > +	int err;
> > > +	bool retry = false;
> > > +
> > > +	if (!ctx->mmap_locked) {
> > > +		if (!mmget_not_zero(mm)) {
> > > +			err = -EFAULT;
> > > +			goto err_out;
> > > +		}
> > > +		if (ctx->trylock_mmap) {
> > > +			if (!mmap_read_trylock(mm))  {
> > > +				err = drm_gpusvm_evict_to_sram(gpusvm, range);
> > > +				goto err_mmput;
> > > +			}
> > > +		} else {
> > > +			mmap_read_lock(mm);
> > > +		}
> > > +	}
> > > +
> > > +	mmap_assert_locked(mm);
> > > +
> > > +	/*
> > > +	 * Loop required to find all VMA area structs for the corner case when
> > > +	 * VRAM backing has been partially unmapped from MM's address space.
> > > +	 */
> > > +again:
> > > +	vas = find_vma(mm, start);
> > > +	if (!vas) {
> > > +		if (!retry)
> > > +			err = -ENOENT;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	if (end <= vas->vm_start || start >= vas->vm_end) {
> > > +		if (!retry)
> > > +			err = -EINVAL;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	err = __drm_gpusvm_migrate_to_sram(gpusvm, vas, NULL, start, end);
> > > +	if (err)
> > > +		goto err_mmunlock;
> > > +
> > > +	if (vas->vm_end < end) {
> > > +		retry = true;
> > > +		start = vas->vm_end;
> > > +		goto again;
> > > +	}
> > > +
> > > +	if (!ctx->mmap_locked) {
> > > +		mmap_read_unlock(mm);
> > > +		/*
> > > +		 * Using mmput_async as this function can be called while
> > > +		 * holding a dma-resv lock, and a final put can grab the mmap
> > > +		 * lock, causing a lock inversion.
> > > +		 */
> > > +		mmput_async(mm);
> > > +	}
> > > +
> > > +	return 0;
> > > +
> > > +err_mmunlock:
> > > +	if (!ctx->mmap_locked)
> > > +		mmap_read_unlock(mm);
> > > +err_mmput:
> > > +	if (!ctx->mmap_locked)
> > > +		mmput_async(mm);
> > > +err_out:
> > > +	return err;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_page_free - Put GPU SVM zone device data associated with a page
> > > + * @page: Pointer to the page
> > > + *
> > > + * This function is a callback used to put the GPU SVM zone device data
> > > + * associated with a page when it is being released.
> > > + */
> > > +static void drm_gpusvm_page_free(struct page *page)
> > > +{
> > > +	drm_gpusvm_zdd_put(page->zone_device_data);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_to_ram - Migrate GPU SVM range to RAM (page fault handler)
> > > + * @vmf: Pointer to the fault information structure
> > > + *
> > > + * This function is a page fault handler used to migrate a GPU SVM range to RAM.
> > > + * It retrieves the GPU SVM range information from the faulting page and invokes
> > > + * the internal migration function to migrate the range back to RAM.
> > > + *
> > > + * Returns:
> > > + * VM_FAULT_SIGBUS on failure, 0 on success.
> > > + */
> > > +static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault *vmf)
> > > +{
> > > +	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
> > > +	int err;
> > > +
> > > +	err = __drm_gpusvm_migrate_to_sram(zdd->range->gpusvm,
> > > +					   vmf->vma, vmf->page,
> > > +					   zdd->range->va.start,
> > > +					   zdd->range->va.end);
> > > +
> > > +	return err ? VM_FAULT_SIGBUS : 0;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_pagemap_ops - Device page map operations for GPU SVM
> > > + */
> > > +static const struct dev_pagemap_ops drm_gpusvm_pagemap_ops = {
> > > +	.page_free = drm_gpusvm_page_free,
> > > +	.migrate_to_ram = drm_gpusvm_migrate_to_ram,
> > > +};
> > > +
> > > +/**
> > > + * drm_gpusvm_pagemap_ops_get - Retrieve GPU SVM device page map operations
> > > + *
> > > + * Returns:
> > > + * Pointer to the GPU SVM device page map operations structure.
> > > + */
> > > +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void)
> > > +{
> > > +	return &drm_gpusvm_pagemap_ops;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_has_mapping - Check if GPU SVM has mapping for the given address range
> > > + * @gpusvm: Pointer to the GPU SVM structure.
> > > + * @start: Start address
> > > + * @end: End address
> > > + *
> > > + * Returns:
> > > + * True if GPU SVM has mapping, False otherwise
> > > + */
> > > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start, u64 end)
> > > +{
> > > +	struct drm_gpusvm_notifier *notifier;
> > > +
> > > +	drm_gpusvm_for_each_notifier(notifier, gpusvm, start, end) {
> > > +		struct drm_gpusvm_range *range = NULL;
> > > +
> > > +		drm_gpusvm_for_each_range(range, notifier, start, end)
> > > +			return true;
> > > +	}
> > > +
> > > +	return false;
> > > +}
> > > diff --git a/drivers/gpu/drm/xe/drm_gpusvm.h b/drivers/gpu/drm/xe/drm_gpusvm.h
> > > new file mode 100644
> > > index 000000000000..0ea70f8534a8
> > > --- /dev/null
> > > +++ b/drivers/gpu/drm/xe/drm_gpusvm.h
> > > @@ -0,0 +1,415 @@
> > > +/* SPDX-License-Identifier: MIT */
> > > +/*
> > > + * Copyright © 2024 Intel Corporation
> > > + */
> > > +
> > > +#ifndef __DRM_GPUSVM_H__
> > > +#define __DRM_GPUSVM_H__
> > > +
> > > +#include <linux/kref.h>
> > > +#include <linux/mmu_notifier.h>
> > > +#include <linux/workqueue.h>
> > > +
> > > +struct dev_pagemap_ops;
> > > +struct drm_device;
> > > +struct drm_gpusvm;
> > > +struct drm_gpusvm_notifier;
> > > +struct drm_gpusvm_ops;
> > > +struct drm_gpusvm_range;
> > > +
> > > +/**
> > > + * struct drm_gpusvm_ops - Operations structure for GPU SVM
> > > + *
> > > + * This structure defines the operations for GPU Shared Virtual Memory (SVM).
> > > + * These operations are provided by the GPU driver to manage SVM ranges and
> > > + * perform operations such as migration between VRAM and system RAM.
> > > + */
> > > +struct drm_gpusvm_ops {
> > > +	/**
> > > +	 * @notifier_alloc: Allocate a GPU SVM notifier (optional)
> > > +	 *
> > > +	 * This function shall allocate a GPU SVM notifier.
> > > +	 *
> > > +	 * Returns:
> > > +	 * Pointer to the allocated GPU SVM notifier on success, NULL on failure.
> > > +	 */
> > > +	struct drm_gpusvm_notifier *(*notifier_alloc)(void);
> > > +
> > > +	/**
> > > +	 * @notifier_free: Free a GPU SVM notifier (optional)
> > > +	 * @notifier: Pointer to the GPU SVM notifier to be freed
> > > +	 *
> > > +	 * This function shall free a GPU SVM notifier.
> > > +	 */
> > > +	void (*notifier_free)(struct drm_gpusvm_notifier *notifier);
> > > +
> > > +	/**
> > > +	 * @range_alloc: Allocate a GPU SVM range (optional)
> > > +	 * @gpusvm: Pointer to the GPU SVM
> > > +	 *
> > > +	 * This function shall allocate a GPU SVM range.
> > > +	 *
> > > +	 * Returns:
> > > +	 * Pointer to the allocated GPU SVM range on success, NULL on failure.
> > > +	 */
> > > +	struct drm_gpusvm_range *(*range_alloc)(struct drm_gpusvm *gpusvm);
> > > +
> > > +	/**
> > > +	 * @range_free: Free a GPU SVM range (optional)
> > > +	 * @range: Pointer to the GPU SVM range to be freed
> > > +	 *
> > > +	 * This function shall free a GPU SVM range.
> > > +	 */
> > > +	void (*range_free)(struct drm_gpusvm_range *range);
> > > +
> > > +	/**
> > > +	 * @vram_release: Release VRAM allocation (optional)
> > > +	 * @vram_allocation: Driver-private pointer to the VRAM allocation
> > > +	 *
> > > +	 * This function shall release VRAM allocation and expects to drop a
> > > +	 * reference to VRAM allocation.
> > > +	 */
> > > +	void (*vram_release)(void *vram_allocation);
> > > +
> > > +	/**
> > > +	 * @populate_vram_pfn: Populate VRAM PFN (required for migration)
> > > +	 * @gpusvm: Pointer to the GPU SVM
> > > +	 * @vram_allocation: Driver-private pointer to the VRAM allocation
> > > +	 * @npages: Number of pages to populate
> > > +	 * @pfn: Array of page frame numbers to populate
> > > +	 *
> > > +	 * This function shall populate VRAM page frame numbers (PFN).
> > > +	 *
> > > +	 * Returns:
> > > +	 * 0 on success, a negative error code on failure.
> > > +	 */
> > > +	int (*populate_vram_pfn)(struct drm_gpusvm *gpusvm,
> > > +				 void *vram_allocation,
> > > +				 unsigned long npages,
> > > +				 unsigned long *pfn);
> > > +
> > > +	/**
> > > +	 * @copy_to_vram: Copy to VRAM (required for migration)
> > > +	 * @gpusvm: Pointer to the GPU SVM
> > > +	 * @pages: Pointer to array of VRAM pages (destination)
> > > +	 * @dma_addr: Pointer to array of DMA addresses (source)
> > > +	 * @npages: Number of pages to copy
> > > +	 *
> > > +	 * This function shall copy pages to VRAM.
> > > +	 *
> > > +	 * Returns:
> > > +	 * 0 on success, a negative error code on failure.
> > > +	 */
> > > +	int (*copy_to_vram)(struct drm_gpusvm *gpusvm,
> > > +			    struct page **pages,
> > > +			    dma_addr_t *dma_addr,
> > > +			    unsigned long npages);
> > > +
> > > +	/**
> > > +	 * @copy_to_sram: Copy to system RAM (required for migration)
> > > +	 * @gpusvm: Pointer to the GPU SVM
> > > +	 * @pages: Pointer to array of VRAM pages (source)
> > > +	 * @dma_addr: Pointer to array of DMA addresses (destination)
> > > +	 * @npages: Number of pages to copy
> > > +	 *
> > > +	 * This function shall copy pages to system RAM.
> > > +	 *
> > > +	 * Returns:
> > > +	 * 0 on success, a negative error code on failure.
> > > +	 */
> > > +	int (*copy_to_sram)(struct drm_gpusvm *gpusvm,
> > > +			    struct page **pages,
> > > +			    dma_addr_t *dma_addr,
> > > +			    unsigned long npages);
> > > +
> > > +	/**
> > > +	 * @invalidate: Invalidate GPU SVM notifier (required)
> > > +	 * @gpusvm: Pointer to the GPU SVM
> > > +	 * @notifier: Pointer to the GPU SVM notifier
> > > +	 * @mmu_range: Pointer to the mmu_notifier_range structure
> > > +	 *
> > > +	 * This function shall invalidate the GPU page tables. It can safely
> > > +	 * walk the notifier range RB tree/list in this function. Called while
> > > +	 * holding the notifier lock.
> > > +	 */
> > > +	void (*invalidate)(struct drm_gpusvm *gpusvm,
> > > +			   struct drm_gpusvm_notifier *notifier,
> > > +			   const struct mmu_notifier_range *mmu_range);
> > > +};
> > > +
> > > +/**
> > > + * struct drm_gpusvm_notifier - Structure representing a GPU SVM notifier
> > > + *
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: MMU interval notifier
> > > + * @interval: Interval for the notifier
> > > + * @rb: Red-black tree node for the parent GPU SVM structure notifier tree
> > > + * @root: Cached root node of the RB tree containing ranges
> > > + * @range_list: List head of ranges in the same order they appear in the
> > > + *              interval tree. This is useful for iterating over ranges
> > > + *              while modifying the RB tree.
> > > + * @flags.removed: Flag indicating whether the MMU interval notifier has been
> > > + *                 removed
> > > + *
> > > + * This structure represents a GPU SVM notifier.
> > > + */
> > > +struct drm_gpusvm_notifier {
> > > +	struct drm_gpusvm *gpusvm;
> > > +	struct mmu_interval_notifier notifier;
> > > +	struct {
> > > +		u64 start;
> > > +		u64 end;
> > > +	} interval;
> > > +	struct {
> > > +		struct rb_node node;
> > > +		struct list_head entry;
> > > +		u64 __subtree_last;
> > > +	} rb;
> > > +	struct rb_root_cached root;
> > > +	struct list_head range_list;
> > > +	struct {
> > > +		u32 removed : 1;
> > > +	} flags;
> > > +};
> > > +
> > > +/**
> > > + * struct drm_gpusvm_range - Structure representing a GPU SVM range
> > > + *
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: Pointer to the GPU SVM notifier
> > > + * @refcount: Reference count for the range
> > > + * @rb: Red-black tree node for the parent GPU SVM notifier structure range tree
> > > + * @va: Virtual address range
> > > + * @notifier_seq: Notifier sequence number of the range's pages
> > > + * @pages: Pointer to the array of pages (if backing store is in VRAM)
> > > + * @dma_addr: DMA address array (if backing store is SRAM and DMA mapped)
> > > + * @vram_allocation: Driver-private pointer to the VRAM allocation
> > > + * @order: Order of dma mapping. i.e. PAGE_SIZE << order is mapping size
> > > + * @flags.migrate_vram: Flag indicating whether the range can be migrated to VRAM
> > > + * @flags.unmapped: Flag indicating if the range has been unmapped
> > > + * @flags.partial_unmap: Flag indicating if the range has been partially unmapped
> > > + * @flags.has_vram_pages: Flag indicating if the range has vram pages
> > > + * @flags.has_dma_mapping: Flag indicating if the range has a DMA mapping
> > > + * @flags.kfree_mapping: Flag indicating @dma_addr is a compact allocation based
> > > + *                       on @order which releases via kfree
> > > + *
> > > + * This structure represents a GPU SVM range used for tracking memory ranges
> > > + * mapped in a DRM device.
> > > + */
> > > +struct drm_gpusvm_range {
> > > +	struct drm_gpusvm *gpusvm;
> > > +	struct drm_gpusvm_notifier *notifier;
> > > +	struct kref refcount;
> > > +	struct {
> > > +		struct rb_node node;
> > > +		struct list_head entry;
> > > +		u64 __subtree_last;
> > > +	} rb;
> > > +	struct {
> > > +		u64 start;
> > > +		u64 end;
> > > +	} va;
> > > +	unsigned long notifier_seq;
> > > +	union {
> > > +		struct page **pages;
> > > +		dma_addr_t *dma_addr;
> > > +	};
> > > +	void *vram_allocation;
> > > +	u16 order;
> > > +	struct {
> > > +		/* All flags below must be set upon creation */
> > > +		u16 migrate_vram : 1;
> > > +		/* All flags below must be set / cleared under notifier lock */
> > > +		u16 unmapped : 1;
> > > +		u16 partial_unmap : 1;
> > > +		u16 has_vram_pages : 1;
> > > +		u16 has_dma_mapping : 1;
> > > +		u16 kfree_mapping : 1;
> > > +	} flags;
> > > +};
> > > +
> > > +/**
> > > + * struct drm_gpusvm - GPU SVM structure
> > > + *
> > > + * @name: Name of the GPU SVM
> > > + * @drm: Pointer to the DRM device structure
> > > + * @mm: Pointer to the mm_struct for the address space
> > > + * @device_private_page_owner: Device private pages owner
> > > + * @mm_start: Start address of GPU SVM
> > > + * @mm_range: Range of the GPU SVM
> > > + * @notifier_size: Size of individual notifiers
> > > + * @ops: Pointer to the operations structure for GPU SVM
> > > + * @chunk_sizes: Pointer to the array of chunk sizes used in range allocation.
> > > + *               Entries should be powers of 2 in descending order.
> > > + * @num_chunks: Number of chunks
> > > + * @notifier_lock: Read-write semaphore for protecting notifier operations
> > > + * @zdd_wq: Workqueue for deferred work on zdd destruction
> > > + * @root: Cached root node of the Red-Black tree containing GPU SVM notifiers
> > > + * @notifier_list: List head of notifiers in the same order they appear in
> > > + *                 the interval tree. This is useful for iterating over
> > > + *                 notifiers while modifying the RB tree.
> > > + *
> > > + * This structure represents a GPU SVM (Shared Virtual Memory) used for tracking
> > > + * memory ranges mapped in a DRM (Direct Rendering Manager) device.
> > > + *
> > > + * No reference counting is provided, as this is expected to be embedded in the
> > > + * driver VM structure along with the struct drm_gpuvm, which handles reference
> > > + * counting.
> > > + */
> > > +struct drm_gpusvm {
> > > +	const char *name;
> > > +	struct drm_device *drm;
> > > +	struct mm_struct *mm;
> > > +	void *device_private_page_owner;
> > > +	u64 mm_start;
> > > +	u64 mm_range;
> > > +	u64 notifier_size;
> > > +	const struct drm_gpusvm_ops *ops;
> > > +	const u64 *chunk_sizes;
> > > +	int num_chunks;
> > > +	struct rw_semaphore notifier_lock;
> > > +	struct workqueue_struct *zdd_wq;
> > > +	struct rb_root_cached root;
> > > +	struct list_head notifier_list;
> > > +};
> > > +
> > > +/**
> > > + * struct drm_gpusvm_ctx - DRM GPU SVM context
> > > + *
> > > + * @mmap_locked: mmap lock is locked
> > > + * @trylock_mmap: trylock mmap lock, used to avoid locking inversions
> > > + *                (e.g. dma-resv -> mmap lock)
> > > + * @in_notifier: entering from a MMU notifier
> > > + * @read_only: operating on read-only memory
> > > + * @vram_possible: possible to use VRAM
> > > + * @prefault: prefault pages
> > > + *
> > > + * Context that DRM GPU SVM is operating in (i.e. user arguments).
> > > + */
> > > +struct drm_gpusvm_ctx {
> > > +	u32 mmap_locked :1;
> > > +	u32 trylock_mmap :1;
> > > +	u32 in_notifier :1;
> > > +	u32 read_only :1;
> > > +	u32 vram_possible :1;
> > > +	u32 prefault :1;
> > > +};
> > > +
> > > +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> > > +		    const char *name, struct drm_device *drm,
> > > +		    struct mm_struct *mm, void *device_private_page_owner,
> > > +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> > > +		    const struct drm_gpusvm_ops *ops,
> > > +		    const u64 *chunk_sizes, int num_chunks);
> > > +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm);
> > > +void drm_gpusvm_free(struct drm_gpusvm *gpusvm);
> > > +
> > > +struct drm_gpusvm_range *
> > > +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64 fault_addr,
> > > +				u64 gpuva_start, u64 gpuva_end,
> > > +				const struct drm_gpusvm_ctx *ctx);
> > > +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> > > +			     struct drm_gpusvm_range *range);
> > > +
> > > +struct drm_gpusvm_range *
> > > +drm_gpusvm_range_get(struct drm_gpusvm_range *range);
> > > +void drm_gpusvm_range_put(struct drm_gpusvm_range *range);
> > > +
> > > +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> > > +				  struct drm_gpusvm_range *range);
> > > +
> > > +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       const struct drm_gpusvm_ctx *ctx);
> > > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > > +				  struct drm_gpusvm_range *range,
> > > +				  const struct drm_gpusvm_ctx *ctx);
> > > +
> > > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       void *vram_allocation,
> > > +			       const struct drm_gpusvm_ctx *ctx);
> > > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       const struct drm_gpusvm_ctx *ctx);
> > > +
> > > +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void);
> > > +
> > > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start, u64 end);
> > > +
> > > +struct drm_gpusvm_range *
> > > +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64 start, u64 end);
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_lock - Lock GPU SVM notifier
> > > + * @gpusvm__: Pointer to the GPU SVM structure.
> > > + *
> > > + * Abstracts client usage of the GPU SVM notifier lock; takes the lock.
> > > + */
> > > +#define drm_gpusvm_notifier_lock(gpusvm__)	\
> > > +	down_read(&(gpusvm__)->notifier_lock)
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_unlock - Unlock GPU SVM notifier
> > > + * @gpusvm__: Pointer to the GPU SVM structure.
> > > + *
> > > + * Abstracts client usage of the GPU SVM notifier lock; drops the lock.
> > > + */
> > > +#define drm_gpusvm_notifier_unlock(gpusvm__)	\
> > > +	up_read(&(gpusvm__)->notifier_lock)
> > > +
> > > +/**
> > > + * __drm_gpusvm_range_next - Get the next GPU SVM range in the list
> > > + * @range: a pointer to the current GPU SVM range
> > > + *
> > > + * Return: A pointer to the next drm_gpusvm_range if available, or NULL if the
> > > + *         current range is the last one or if the input range is NULL.
> > > + */
> > > +static inline struct drm_gpusvm_range *
> > > +__drm_gpusvm_range_next(struct drm_gpusvm_range *range)
> > > +{
> > > +	if (range && !list_is_last(&range->rb.entry,
> > > +				   &range->notifier->range_list))
> > > +		return list_next_entry(range, rb.entry);
> > > +
> > > +	return NULL;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_for_each_range - Iterate over GPU SVM ranges in a notifier
> > > + * @range__: Iterator variable for the ranges. If set, it indicates the start of
> > > + *	     the iterator. If NULL, call drm_gpusvm_range_find() to get the range.
> > > + * @notifier__: Pointer to the GPU SVM notifier
> > > + * @start__: Start address of the range
> > > + * @end__: End address of the range
> > > + *
> > > + * This macro is used to iterate over GPU SVM ranges in a notifier. It is safe
> > > + * to use while holding the driver SVM lock or the notifier lock.
> > > + */
> > > +#define drm_gpusvm_for_each_range(range__, notifier__, start__, end__)	\
> > > +	for ((range__) = (range__) ?:					\
> > > +	     drm_gpusvm_range_find((notifier__), (start__), (end__));	\
> > > +	     (range__) && (range__->va.start < (end__));		\
> > > +	     (range__) = __drm_gpusvm_range_next(range__))
> > > +
> > > +/**
> > > + * drm_gpusvm_range_set_unmapped - Mark a GPU SVM range as unmapped
> > > + * @range: Pointer to the GPU SVM range structure.
> > > + * @mmu_range: Pointer to the MMU notifier range structure.
> > > + *
> > > + * This function marks a GPU SVM range as unmapped and sets the partial_unmap flag
> > > + * if the range partially falls within the provided MMU notifier range.
> > > + */
> > > +static inline void
> > > +drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range *range,
> > > +			      const struct mmu_notifier_range *mmu_range)
> > > +{
> > > +	lockdep_assert_held_write(&range->gpusvm->notifier_lock);
> > > +
> > > +	range->flags.unmapped = true;
> > > +	if (range->va.start < mmu_range->start ||
> > > +	    range->va.end > mmu_range->end)
> > > +		range->flags.partial_unmap = true;
> > > +}
> > > +
> > > +#endif /* __DRM_GPUSVM_H__ */
> > > -- 
> > > 2.34.1
> > > 
>
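To make the expected use of the interface above concrete, below is a minimal
sketch of a GPU page-fault handler built on the declarations in drm_gpusvm.h.
Everything prefixed with my_ (my_vm, my_bind_range, the GPU VA bounds) is
hypothetical driver code rather than part of this series, and the ERR_PTR
return convention for drm_gpusvm_range_find_or_insert() is an assumption:

	static int my_handle_gpu_fault(struct my_vm *vm, u64 fault_addr)
	{
		struct drm_gpusvm_ctx ctx = { .vram_possible = true };
		struct drm_gpusvm_range *range;
		int err;

		/* Find or create the SVM range (chunk) backing the fault */
		range = drm_gpusvm_range_find_or_insert(&vm->gpusvm, fault_addr,
							vm->gpuva_start,
							vm->gpuva_end, &ctx);
		if (IS_ERR(range))
			return PTR_ERR(range);

	retry:
		/* Fault in the backing pages and DMA map them */
		err = drm_gpusvm_range_get_pages(&vm->gpusvm, range, &ctx);
		if (err)
			return err;

		/*
		 * Program the GPU page tables under the notifier lock so a
		 * concurrent invalidation is either observed here or will
		 * invalidate the new binding afterwards.
		 */
		drm_gpusvm_notifier_lock(&vm->gpusvm);
		if (!drm_gpusvm_range_pages_valid(&vm->gpusvm, range)) {
			drm_gpusvm_notifier_unlock(&vm->gpusvm);
			goto retry;
		}
		err = my_bind_range(vm, range);	/* hypothetical PT update */
		drm_gpusvm_notifier_unlock(&vm->gpusvm);

		return err;
	}

The validity re-check under drm_gpusvm_notifier_lock() mirrors the
mmu_interval_read_retry() pattern that drm_gpusvm_range_get_pages() uses
internally, which is what is supposed to protect against the pages changing
between the fault and the bind.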
Alex Deucher Aug. 28, 2024, 4:06 p.m. UTC | #4
On Wed, Aug 28, 2024 at 11:53 AM Matthew Brost <matthew.brost@intel.com> wrote:
>
> On Wed, Aug 28, 2024 at 04:46:24PM +0200, Christian König wrote:
> > Am 28.08.24 um 16:31 schrieb Daniel Vetter:
> > > On Tue, Aug 27, 2024 at 07:48:38PM -0700, Matthew Brost wrote:
> > > > +         if (!ctx->mmap_locked) {
> > > > +                 /*
> > > > +                  * XXX: HMM locking document indicates only a read-lock
> > > > +                  * is required but there appears to be a window between
> > > > +                  * the MMU_NOTIFY_MIGRATE event triggered in a CPU fault
> > > > +                  * via migrate_vma_setup and the pages actually moving
> > > > +                  * in migrate_vma_finalize in which this code can grab
> > > > +                  * garbage pages. Grabbing the write-lock if the range
> > > > +                  * is attached to vram appears to protect against this
> > > > +                  * race.
> > > > +                  */
>
> Thanks for the comments, replying to both of you inline.
>
> > > This one is really scary, since it means the entire migrate pte trickery
> > > is essentially completely busted. Grabbing the mmap write lock just means
> > > you block out pretty much everything interesting from concurrently
> > > happening.
> > >
> > > My gut feeling says we need to figure out what's happening here, because
> > > this looks a bit too fundamental to me.
>
> I agree. I haven’t looked into this issue for a couple of months but
> really need to understand what is going on.
>
> I should have mentioned this in the cover letter: the goal of this
> series was to produce something for review that is stable and supports
> UMDs/user applications. It was not intended to be presented as a final
> solution. This issue certainly falls into the category of "needs to be
> understood and requires a proper fix."
>
> One open question I have is whether the test case that triggers this
> issue is even defined behavior. The test creates concurrent access
> between the GPU and CPU to the same memory address, resulting in GPU and
> CPU faults racing against each other. It’s possible that this is
> undefined behavior, so data corruption might be acceptable—i.e., the
> kernel can’t crash, but incorrect results might be permissible.
>
> e.g. This is the only defined usage model:
>
> alloc_memory();
> start_compute_kernel();
> sync_on_compute_kernel_completion();
> read_memory();
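
For contrast, the racy pattern the failing test exercises would look roughly
like the hypothetical sketch below: the CPU stores run while the compute kernel
is still writing the same buffer, so CPU faults (migrating back to SRAM) and
GPU faults (migrating to VRAM) can ping-pong the pages concurrently:

	ptr = alloc_memory();
	start_compute_kernel(ptr);		/* GPU writes ptr[...] */
	for (i = 0; i < n; ++i)
		ptr[i] = i;			/* CPU writes ptr[...] at the same time */
	sync_on_compute_kernel_completion();
	read_memory();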
>
> Hopefully, in the next week or so, I'll be heavily engaging with the UMD
> teams. Development can then start, and applications will be running soon
> after. This will allow us to address issues like this, collect data on
> memory usage, and verify some of the assumptions I've made, such as
> optimizing for 2M+ allocations.
>
> >
> > I think I have at least a high-level understanding of what's going on here;
> > Felix and especially Philip should know more of the details.
> >
>
> I meant to reach out to AMD for issues like this. So, Felix
> (felix.kuehling@amd.com) and Philip (Philip.Yang@amd.com) would be good
> contacts?

Yes.

Alex

>
> > In general, grabbing the mm_lock to protect PTEs from changing is complete
> > nonsense. The mm_lock is to protect the VMAs and *not* the PTEs!
> >
>
> Thanks for the hint. I believe that in the AMD implementation, I noticed
> some additional locks for migration, which might be how you mitigated
> this issue.
>
> I must say it is a bit unfortunate that the HMM locking documentation
> doesn’t mention this. I believe the documentation needs additional
> information, which I can add once we finalize the solution.
>
> Matt
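
For reference, the pattern the HMM documentation describes relies on the
notifier sequence number rather than the mmap lock for deciding whether the
snapshot is still valid. A simplified, hypothetical helper (hmm_range field
setup, timeouts and error paths omitted) would look roughly like:

	static int sketch_get_pages(struct drm_gpusvm_notifier *notifier,
				    struct mm_struct *mm,
				    struct hmm_range *hmm_range)
	{
		int ret;

		hmm_range->notifier = &notifier->notifier;
	again:
		hmm_range->notifier_seq =
			mmu_interval_read_begin(&notifier->notifier);

		mmap_read_lock(mm);		/* read lock is all HMM asks for */
		ret = hmm_range_fault(hmm_range);
		mmap_read_unlock(mm);
		if (ret == -EBUSY)		/* a real caller bounds this retry */
			goto again;
		if (ret)
			return ret;

		drm_gpusvm_notifier_lock(notifier->gpusvm);
		if (mmu_interval_read_retry(&notifier->notifier,
					    hmm_range->notifier_seq)) {
			/* PTEs changed since the snapshot: it is stale, retry */
			drm_gpusvm_notifier_unlock(notifier->gpusvm);
			goto again;
		}
		/* commit hmm_range->hmm_pfns to the GPU page tables here */
		drm_gpusvm_notifier_unlock(notifier->gpusvm);

		return 0;
	}

The mmap read lock only stabilizes the VMA layout for the walk; detecting
concurrently changing PTEs is the job of the mmu_interval_read_begin() /
mmu_interval_read_retry() pair under the driver's notifier lock.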
>
> > Even with the write side of the mm_lock taken, it is perfectly possible that
> > PTEs change. It's just less likely.
> >
> > We ran into multiple issues before we figured out this important distinction
> > as well.
> >
> > Christian.
> >
> > > -Sima
> > >
> > >
> > > > +                 if (vram_pages)
> > > > +                         mmap_write_lock(mm);
> > > > +                 else
> > > > +                         mmap_read_lock(mm);
> > > > +         }
> > > > +         err = hmm_range_fault(&hmm_range);
> > > > +         if (!ctx->mmap_locked) {
> > > > +                 if (vram_pages)
> > > > +                         mmap_write_unlock(mm);
> > > > +                 else
> > > > +                         mmap_read_unlock(mm);
> > > > +         }
> > > > +
> > > > +         if (err == -EBUSY) {
> > > > +                 if (time_after(jiffies, timeout))
> > > > +                         break;
> > > > +
> > > > +                 hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> > > > +                 continue;
> > > > +         }
> > > > +         break;
> > > > + }
> > > > + if (!ctx->mmap_locked)
> > > > +         mmput(mm);
> > > > + if (err)
> > > > +         goto err_free;
> > > > +
> > > > + pages = (struct page **)pfns;
> > > > +
> > > > + if (ctx->prefault) {
> > > > +         range->pages = pages;
> > > > +         goto set_seqno;
> > > > + }
> > > > +
> > > > +map_pages:
> > > > + if (is_device_private_page(hmm_pfn_to_page(pfns[0]))) {
> > > > +         WARN_ON_ONCE(!range->vram_allocation);
> > > > +
> > > > +         for (i = 0; i < npages; ++i) {
> > > > +                 pages[i] = hmm_pfn_to_page(pfns[i]);
> > > > +
> > > > +                 if (WARN_ON_ONCE(!is_device_private_page(pages[i]))) {
> > > > +                         err = -EOPNOTSUPP;
> > > > +                         goto err_free;
> > > > +                 }
> > > > +         }
> > > > +
> > > > +         /* Do not race with notifier unmapping pages */
> > > > +         drm_gpusvm_notifier_lock(gpusvm);
> > > > +         range->flags.has_vram_pages = true;
> > > > +         range->pages = pages;
> > > > +         if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
> > > > +                 err = -EAGAIN;
> > > > +                 __drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > > +         }
> > > > +         drm_gpusvm_notifier_unlock(gpusvm);
> > > > + } else {
> > > > +         dma_addr_t *dma_addr = (dma_addr_t *)pfns;
> > > > +
> > > > +         for_each_dma_page(i, j, npages, order) {
> > > > +                 if (WARN_ON_ONCE(i && order !=
> > > > +                                  hmm_pfn_to_map_order(pfns[i]))) {
> > > > +                         err = -EOPNOTSUPP;
> > > > +                         npages = i;
> > > > +                         goto err_unmap;
> > > > +                 }
> > > > +                 order = hmm_pfn_to_map_order(pfns[i]);
> > > > +
> > > > +                 pages[j] = hmm_pfn_to_page(pfns[i]);
> > > > +                 if (WARN_ON_ONCE(is_zone_device_page(pages[j]))) {
> > > > +                         err = -EOPNOTSUPP;
> > > > +                         npages = i;
> > > > +                         goto err_unmap;
> > > > +                 }
> > > > +
> > > > +                 set_page_dirty_lock(pages[j]);
> > > > +                 mark_page_accessed(pages[j]);
> > > > +
> > > > +                 dma_addr[j] = dma_map_page(gpusvm->drm->dev,
> > > > +                                            pages[j], 0,
> > > > +                                            PAGE_SIZE << order,
> > > > +                                            DMA_BIDIRECTIONAL);
> > > > +                 if (dma_mapping_error(gpusvm->drm->dev, dma_addr[j])) {
> > > > +                         err = -EFAULT;
> > > > +                         npages = i;
> > > > +                         goto err_unmap;
> > > > +                 }
> > > > +         }
> > > > +
> > > > +         /* Huge pages, reduce memory footprint */
> > > > +         if (order) {
> > > > +                 dma_addr = kmalloc_array(j, sizeof(*dma_addr),
> > > > +                                          GFP_KERNEL);
> > > > +                 if (dma_addr) {
> > > > +                         for (i = 0; i < j; ++i)
> > > > +                                 dma_addr[i] = (dma_addr_t)pfns[i];
> > > > +                         kvfree(pfns);
> > > > +                         kfree_mapping = true;
> > > > +                 } else {
> > > > +                         dma_addr = (dma_addr_t *)pfns;
> > > > +                 }
> > > > +         }
> > > > +
> > > > +         /* Do not race with notifier unmapping pages */
> > > > +         drm_gpusvm_notifier_lock(gpusvm);
> > > > +         range->order = order;
> > > > +         range->flags.kfree_mapping = kfree_mapping;
> > > > +         range->flags.has_dma_mapping = true;
> > > > +         range->dma_addr = dma_addr;
> > > > +         range->vram_allocation = NULL;
> > > > +         if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
> > > > +                 err = -EAGAIN;
> > > > +                 __drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > > +         }
> > > > +         drm_gpusvm_notifier_unlock(gpusvm);
> > > > + }
> > > > +
> > > > + if (err == -EAGAIN)
> > > > +         goto retry;
> > > > +set_seqno:
> > > > + range->notifier_seq = hmm_range.notifier_seq;
> > > > +
> > > > + return 0;
> > > > +
> > > > +err_unmap:
> > > > + for_each_dma_page(i, j, npages, order)
> > > > +         dma_unmap_page(gpusvm->drm->dev,
> > > > +                        (dma_addr_t)pfns[j],
> > > > +                        PAGE_SIZE << order, DMA_BIDIRECTIONAL);
> > > > +err_free:
> > > > + if (alloc_pfns)
> > > > +         kvfree(pfns);
> > > > +err_out:
> > > > + return err;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_range_unmap_pages - Unmap pages associated with a GPU SVM range
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @range: Pointer to the GPU SVM range structure
> > > > + * @ctx: GPU SVM context
> > > > + *
> > > > + * This function unmaps pages associated with a GPU SVM range. If @in_notifier
> > > > + * is set, it is assumed that gpusvm->notifier_lock is held in write mode; if it
> > > > + * is clear, it acquires gpusvm->notifier_lock in read mode. Must be called on
> > > > + * each GPU SVM range attached to notifier in gpusvm->ops->invalidate for IOMMU
> > > > + * security model.
> > > > + */
> > > > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > > > +                           struct drm_gpusvm_range *range,
> > > > +                           const struct drm_gpusvm_ctx *ctx)
> > > > +{
> > > > + if (ctx->in_notifier)
> > > > +         lockdep_assert_held_write(&gpusvm->notifier_lock);
> > > > + else
> > > > +         drm_gpusvm_notifier_lock(gpusvm);
> > > > +
> > > > + __drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > > +
> > > > + if (!ctx->in_notifier)
> > > > +         drm_gpusvm_notifier_unlock(gpusvm);
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_migration_put_page - Put a migration page
> > > > + * @page: Pointer to the page to put
> > > > + *
> > > > + * This function unlocks and puts a page.
> > > > + */
> > > > +static void drm_gpusvm_migration_put_page(struct page *page)
> > > > +{
> > > > + unlock_page(page);
> > > > + put_page(page);
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_migration_put_pages - Put migration pages
> > > > + * @npages: Number of pages
> > > > + * @migrate_pfn: Array of migrate page frame numbers
> > > > + *
> > > > + * This function puts an array of pages.
> > > > + */
> > > > +static void drm_gpusvm_migration_put_pages(unsigned long npages,
> > > > +                                    unsigned long *migrate_pfn)
> > > > +{
> > > > + unsigned long i;
> > > > +
> > > > + for (i = 0; i < npages; ++i) {
> > > > +         if (!migrate_pfn[i])
> > > > +                 continue;
> > > > +
> > > > +         drm_gpusvm_migration_put_page(migrate_pfn_to_page(migrate_pfn[i]));
> > > > +         migrate_pfn[i] = 0;
> > > > + }
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_get_vram_page - Get a reference to a VRAM page
> > > > + * @page: Pointer to the page
> > > > + * @zdd: Pointer to the GPU SVM zone device data
> > > > + *
> > > > + * This function associates the given page with the specified GPU SVM zone
> > > > + * device data and initializes it for zone device usage.
> > > > + */
> > > > +static void drm_gpusvm_get_vram_page(struct page *page,
> > > > +                              struct drm_gpusvm_zdd *zdd)
> > > > +{
> > > > + page->zone_device_data = drm_gpusvm_zdd_get(zdd);
> > > > + zone_device_page_init(page);
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_migrate_map_pages() - Map migration pages for GPU SVM migration
> > > > + * @dev: The device for which the pages are being mapped
> > > > + * @dma_addr: Array to store DMA addresses corresponding to mapped pages
> > > > + * @migrate_pfn: Array of migrate page frame numbers to map
> > > > + * @npages: Number of pages to map
> > > > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > > > + *
> > > > + * This function maps pages of memory for migration usage in GPU SVM. It
> > > > + * iterates over each page frame number provided in @migrate_pfn, maps the
> > > > + * corresponding page, and stores the DMA address in the provided @dma_addr
> > > > + * array.
> > > > + *
> > > > + * Return: 0 on success, -EFAULT if an error occurs during mapping.
> > > > + */
> > > > +static int drm_gpusvm_migrate_map_pages(struct device *dev,
> > > > +                                 dma_addr_t *dma_addr,
> > > > +                                 long unsigned int *migrate_pfn,
> > > > +                                 unsigned long npages,
> > > > +                                 enum dma_data_direction dir)
> > > > +{
> > > > + unsigned long i;
> > > > +
> > > > + for (i = 0; i < npages; ++i) {
> > > > +         struct page *page = migrate_pfn_to_page(migrate_pfn[i]);
> > > > +
> > > > +         if (!page)
> > > > +                 continue;
> > > > +
> > > > +         if (WARN_ON_ONCE(is_zone_device_page(page)))
> > > > +                 return -EFAULT;
> > > > +
> > > > +         dma_addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir);
> > > > +         if (dma_mapping_error(dev, dma_addr[i]))
> > > > +                 return -EFAULT;
> > > > + }
> > > > +
> > > > + return 0;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously mapped for GPU SVM migration
> > > > + * @dev: The device for which the pages were mapped
> > > > + * @dma_addr: Array of DMA addresses corresponding to mapped pages
> > > > + * @npages: Number of pages to unmap
> > > > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > > > + *
> > > > + * This function unmaps previously mapped pages of memory for GPU Shared Virtual
> > > > + * Memory (SVM). It iterates over each DMA address provided in @dma_addr, checks
> > > > + * if it's valid and not already unmapped, and unmaps the corresponding page.
> > > > + */
> > > > +static void drm_gpusvm_migrate_unmap_pages(struct device *dev,
> > > > +                                    dma_addr_t *dma_addr,
> > > > +                                    unsigned long npages,
> > > > +                                    enum dma_data_direction dir)
> > > > +{
> > > > + unsigned long i;
> > > > +
> > > > + for (i = 0; i < npages; ++i) {
> > > > +         if (!dma_addr[i] || dma_mapping_error(dev, dma_addr[i]))
> > > > +                 continue;
> > > > +
> > > > +         dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
> > > > + }
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_migrate_to_vram - Migrate GPU SVM range to VRAM
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @range: Pointer to the GPU SVM range structure
> > > > + * @vram_allocation: Driver-private pointer to the VRAM allocation. The caller
> > > > + *                   should hold a reference to the VRAM allocation, which
> > > > + *                   should be dropped via ops->vram_release or upon the
> > > > + *                   failure of this function.
> > > > + * @ctx: GPU SVM context
> > > > + *
> > > > + * This function migrates the specified GPU SVM range to VRAM. It performs the
> > > > + * necessary setup and invokes the driver-specific operations for migration to
> > > > + * VRAM. Upon successful return, @vram_allocation can safely reference @range
> > > > + * until ops->vram_release is called, which only happens upon successful
> > > > + * return of this function.
> > > > + *
> > > > + * Returns:
> > > > + * 0 on success, negative error code on failure.
> > > > + */
> > > > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > > > +                        struct drm_gpusvm_range *range,
> > > > +                        void *vram_allocation,
> > > > +                        const struct drm_gpusvm_ctx *ctx)
> > > > +{
> > > > + u64 start = range->va.start, end = range->va.end;
> > > > + struct migrate_vma migrate = {
> > > > +         .start          = start,
> > > > +         .end            = end,
> > > > +         .pgmap_owner    = gpusvm->device_private_page_owner,
> > > > +         .flags          = MIGRATE_VMA_SELECT_SYSTEM,
> > > > + };
> > > > + struct mm_struct *mm = gpusvm->mm;
> > > > + unsigned long i, npages = npages_in_range(start, end);
> > > > + struct vm_area_struct *vas;
> > > > + struct drm_gpusvm_zdd *zdd = NULL;
> > > > + struct page **pages;
> > > > + dma_addr_t *dma_addr;
> > > > + void *buf;
> > > > + int err;
> > > > +
> > > > + if (!range->flags.migrate_vram)
> > > > +         return -EINVAL;
> > > > +
> > > > + if (!gpusvm->ops->populate_vram_pfn || !gpusvm->ops->copy_to_vram ||
> > > > +     !gpusvm->ops->copy_to_sram)
> > > > +         return -EOPNOTSUPP;
> > > > +
> > > > + if (!ctx->mmap_locked) {
> > > > +         if (!mmget_not_zero(mm)) {
> > > > +                 err = -EFAULT;
> > > > +                 goto err_out;
> > > > +         }
> > > > +         mmap_write_lock(mm);
> > > > + }
> > > > +
> > > > + mmap_assert_locked(mm);
> > > > +
> > > > + vas = vma_lookup(mm, start);
> > > > + if (!vas) {
> > > > +         err = -ENOENT;
> > > > +         goto err_mmunlock;
> > > > + }
> > > > +
> > > > + if (end > vas->vm_end || start < vas->vm_start) {
> > > > +         err = -EINVAL;
> > > > +         goto err_mmunlock;
> > > > + }
> > > > +
> > > > + if (!vma_is_anonymous(vas)) {
> > > > +         err = -EBUSY;
> > > > +         goto err_mmunlock;
> > > > + }
> > > > +
> > > > + buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
> > > > +                sizeof(*pages), GFP_KERNEL);
> > > > + if (!buf) {
> > > > +         err = -ENOMEM;
> > > > +         goto err_mmunlock;
> > > > + }
> > > > + dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > > > + pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
> > > > +
> > > > + zdd = drm_gpusvm_zdd_alloc(range);
> > > > + if (!zdd) {
> > > > +         err = -ENOMEM;
> > > > +         goto err_free;
> > > > + }
> > > > +
> > > > + migrate.vma = vas;
> > > > + migrate.src = buf;
> > > > + migrate.dst = migrate.src + npages;
> > > > +
> > > > + err = migrate_vma_setup(&migrate);
> > > > + if (err)
> > > > +         goto err_free;
> > > > +
> > > > + /*
> > > > +  * FIXME: The cases below, !migrate.cpages and migrate.cpages != npages, are
> > > > +  * not always an error. Need to revisit possible cases and how to handle. We
> > > > +  * could prefault on migrate.cpages != npages via hmm_range_fault.
> > > > +  */
> > > > +
> > > > + if (!migrate.cpages) {
> > > > +         err = -EFAULT;
> > > > +         goto err_free;
> > > > + }
> > > > +
> > > > + if (migrate.cpages != npages) {
> > > > +         err = -EBUSY;
> > > > +         goto err_finalize;
> > > > + }
> > > > +
> > > > + err = gpusvm->ops->populate_vram_pfn(gpusvm, vram_allocation, npages,
> > > > +                                      migrate.dst);
> > > > + if (err)
> > > > +         goto err_finalize;
> > > > +
> > > > + err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> > > > +                                    migrate.src, npages, DMA_TO_DEVICE);
> > > > + if (err)
> > > > +         goto err_finalize;
> > > > +
> > > > + for (i = 0; i < npages; ++i) {
> > > > +         struct page *page = pfn_to_page(migrate.dst[i]);
> > > > +
> > > > +         pages[i] = page;
> > > > +         migrate.dst[i] = migrate_pfn(migrate.dst[i]);
> > > > +         drm_gpusvm_get_vram_page(page, zdd);
> > > > + }
> > > > +
> > > > + err = gpusvm->ops->copy_to_vram(gpusvm, pages, dma_addr, npages);
> > > > + if (err)
> > > > +         goto err_finalize;
> > > > +
> > > > + /* Upon success bind vram allocation to range and zdd */
> > > > + range->vram_allocation = vram_allocation;
> > > > + WRITE_ONCE(zdd->vram_allocation, vram_allocation);      /* Owns ref */
> > > > +
> > > > +err_finalize:
> > > > + if (err)
> > > > +         drm_gpusvm_migration_put_pages(npages, migrate.dst);
> > > > + migrate_vma_pages(&migrate);
> > > > + migrate_vma_finalize(&migrate);
> > > > + drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
> > > > +                                DMA_TO_DEVICE);
> > > > +err_free:
> > > > + if (zdd)
> > > > +         drm_gpusvm_zdd_put(zdd);
> > > > + kvfree(buf);
> > > > +err_mmunlock:
> > > > + if (!ctx->mmap_locked) {
> > > > +         mmap_write_unlock(mm);
> > > > +         mmput(mm);
> > > > + }
> > > > +err_out:
> > > > + return err;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_migrate_populate_sram_pfn - Populate SRAM PFNs for a VM area
> > > > + * @vas: Pointer to the VM area structure, can be NULL
> > > > + * @npages: Number of pages to populate
> > > > + * @src_mpfn: Source array of migrate PFNs
> > > > + * @mpfn: Array of migrate PFNs to populate
> > > > + * @addr: Start address for PFN allocation
> > > > + *
> > > > + * This function populates the SRAM migrate page frame numbers (PFNs) for the
> > > > + * specified VM area structure. It allocates and locks pages in the VM area for
> > > > + * SRAM usage. If @vas is non-NULL, alloc_page_vma() is used for allocation;
> > > > + * if it is NULL, alloc_page() is used.
> > > > + *
> > > > + * Returns:
> > > > + * 0 on success, negative error code on failure.
> > > > + */
> > > > +static int drm_gpusvm_migrate_populate_sram_pfn(struct vm_area_struct *vas,
> > > > +                                         unsigned long npages,
> > > > +                                         unsigned long *src_mpfn,
> > > > +                                         unsigned long *mpfn, u64 addr)
> > > > +{
> > > > + unsigned long i;
> > > > +
> > > > + for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
> > > > +         struct page *page;
> > > > +
> > > > +         if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
> > > > +                 continue;
> > > > +
> > > > +         if (vas)
> > > > +                 page = alloc_page_vma(GFP_HIGHUSER, vas, addr);
> > > > +         else
> > > > +                 page = alloc_page(GFP_HIGHUSER);
> > > > +
> > > > +         if (!page)
> > > > +                 return -ENOMEM;
> > > > +
> > > > +         lock_page(page);
> > > > +         mpfn[i] = migrate_pfn(page_to_pfn(page));
> > > > + }
> > > > +
> > > > + return 0;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_evict_to_sram - Evict GPU SVM range to SRAM
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @range: Pointer to the GPU SVM range structure
> > > > + *
> > > > + * Similar to __drm_gpusvm_migrate_to_sram but does not require the mmap lock;
> > > > + * migration is done via the migrate_device_* functions. This is a fallback
> > > > + * path, as it is preferred to issue migrations with the mmap lock held.
> > > > + *
> > > > + * Returns:
> > > > + * 0 on success, negative error code on failure.
> > > > + */
> > > > +static int drm_gpusvm_evict_to_sram(struct drm_gpusvm *gpusvm,
> > > > +                             struct drm_gpusvm_range *range)
> > > > +{
> > > > + unsigned long npages;
> > > > + struct page **pages;
> > > > + unsigned long *src, *dst;
> > > > + dma_addr_t *dma_addr;
> > > > + void *buf;
> > > > + int i, err = 0;
> > > > +
> > > > + npages = npages_in_range(range->va.start, range->va.end);
> > > > +
> > > > + buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr) +
> > > > +                sizeof(*pages), GFP_KERNEL);
> > > > + if (!buf) {
> > > > +         err = -ENOMEM;
> > > > +         goto err_out;
> > > > + }
> > > > + src = buf;
> > > > + dst = buf + (sizeof(*src) * npages);
> > > > + dma_addr = buf + (2 * sizeof(*src) * npages);
> > > > + pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) * npages;
> > > > +
> > > > + err = gpusvm->ops->populate_vram_pfn(gpusvm, range->vram_allocation,
> > > > +                                      npages, src);
> > > > + if (err)
> > > > +         goto err_free;
> > > > +
> > > > + err = migrate_device_vma_range(gpusvm->mm,
> > > > +                                gpusvm->device_private_page_owner, src,
> > > > +                                npages, range->va.start);
> > > > + if (err)
> > > > +         goto err_free;
> > > > +
> > > > + err = drm_gpusvm_migrate_populate_sram_pfn(NULL, npages, src, dst, 0);
> > > > + if (err)
> > > > +         goto err_finalize;
> > > > +
> > > > + err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> > > > +                                    dst, npages, DMA_BIDIRECTIONAL);
> > > > + if (err)
> > > > +         goto err_finalize;
> > > > +
> > > > + for (i = 0; i < npages; ++i)
> > > > +         pages[i] = migrate_pfn_to_page(src[i]);
> > > > +
> > > > + err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr, npages);
> > > > + if (err)
> > > > +         goto err_finalize;
> > > > +
> > > > +err_finalize:
> > > > + if (err)
> > > > +         drm_gpusvm_migration_put_pages(npages, dst);
> > > > + migrate_device_pages(src, dst, npages);
> > > > + migrate_device_finalize(src, dst, npages);
> > > > + drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
> > > > +                                DMA_BIDIRECTIONAL);
> > > > +err_free:
> > > > + kvfree(buf);
> > > > +err_out:
> > > > +
> > > > + return err;
> > > > +}
> > > > +
> > > > +/**
> > > > + * __drm_gpusvm_migrate_to_sram - Migrate GPU SVM range to SRAM (internal)
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @vas: Pointer to the VM area structure
> > > > + * @page: Pointer to the page for fault handling (can be NULL)
> > > > + * @start: Start address of the migration range
> > > > + * @end: End address of the migration range
> > > > + *
> > > > + * This internal function performs the migration of the specified GPU SVM range
> > > > + * to SRAM. It sets up the migration, populates + dma maps SRAM PFNs, and
> > > > + * invokes the driver-specific operations for migration to SRAM.
> > > > + *
> > > > + * Returns:
> > > > + * 0 on success, negative error code on failure.
> > > > + */
> > > > +static int __drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > > +                                 struct vm_area_struct *vas,
> > > > +                                 struct page *page,
> > > > +                                 u64 start, u64 end)
> > > > +{
> > > > + struct migrate_vma migrate = {
> > > > +         .vma            = vas,
> > > > +         .pgmap_owner    = gpusvm->device_private_page_owner,
> > > > +         .flags          = MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
> > > > +         .fault_page     = page,
> > > > + };
> > > > + unsigned long npages;
> > > > + struct page **pages;
> > > > + dma_addr_t *dma_addr;
> > > > + void *buf;
> > > > + int i, err = 0;
> > > > +
> > > > + mmap_assert_locked(gpusvm->mm);
> > > > +
> > > > + /* Corner where VMA area struct has been partially unmapped */
> > > > + if (start < vas->vm_start)
> > > > +         start = vas->vm_start;
> > > > + if (end > vas->vm_end)
> > > > +         end = vas->vm_end;
> > > > +
> > > > + migrate.start = start;
> > > > + migrate.end = end;
> > > > + npages = npages_in_range(start, end);
> > > > +
> > > > + buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
> > > > +                sizeof(*pages), GFP_KERNEL);
> > > > + if (!buf) {
> > > > +         err = -ENOMEM;
> > > > +         goto err_out;
> > > > + }
> > > > + dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > > > + pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
> > > > +
> > > > + migrate.vma = vas;
> > > > + migrate.src = buf;
> > > > + migrate.dst = migrate.src + npages;
> > > > +
> > > > + err = migrate_vma_setup(&migrate);
> > > > + if (err)
> > > > +         goto err_free;
> > > > +
> > > > + /* Raced with another CPU fault, nothing to do */
> > > > + if (!migrate.cpages)
> > > > +         goto err_free;
> > > > +
> > > > + err = drm_gpusvm_migrate_populate_sram_pfn(vas, npages,
> > > > +                                            migrate.src, migrate.dst,
> > > > +                                            start);
> > > > + if (err)
> > > > +         goto err_finalize;
> > > > +
> > > > + err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> > > > +                                    migrate.dst, npages,
> > > > +                                    DMA_BIDIRECTIONAL);
> > > > + if (err)
> > > > +         goto err_finalize;
> > > > +
> > > > + for (i = 0; i < npages; ++i)
> > > > +         pages[i] = migrate_pfn_to_page(migrate.src[i]);
> > > > +
> > > > + err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr, npages);
> > > > + if (err)
> > > > +         goto err_finalize;
> > > > +
> > > > +err_finalize:
> > > > + if (err)
> > > > +         drm_gpusvm_migration_put_pages(npages, migrate.dst);
> > > > + migrate_vma_pages(&migrate);
> > > > + migrate_vma_finalize(&migrate);
> > > > + drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
> > > > +                                DMA_BIDIRECTIONAL);
> > > > +err_free:
> > > > + kvfree(buf);
> > > > +err_out:
> > > > + mmap_assert_locked(gpusvm->mm);
> > > > +
> > > > + return err;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_migrate_to_sram - Migrate (evict) GPU SVM range to SRAM
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @range: Pointer to the GPU SVM range structure
> > > > + * @ctx: GPU SVM context
> > > > + *
> > > > + * This function initiates the migration of the specified GPU SVM range to
> > > > + * SRAM. It performs necessary checks and invokes the internal migration
> > > > + * function for actual migration.
> > > > + *
> > > > + * Returns:
> > > > + * 0 on success, negative error code on failure.
> > > > + */
> > > > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > > +                        struct drm_gpusvm_range *range,
> > > > +                        const struct drm_gpusvm_ctx *ctx)
> > > > +{
> > > > + u64 start = range->va.start, end = range->va.end;
> > > > + struct mm_struct *mm = gpusvm->mm;
> > > > + struct vm_area_struct *vas;
> > > > + int err;
> > > > + bool retry = false;
> > > > +
> > > > + if (!ctx->mmap_locked) {
> > > > +         if (!mmget_not_zero(mm)) {
> > > > +                 err = -EFAULT;
> > > > +                 goto err_out;
> > > > +         }
> > > > +         if (ctx->trylock_mmap) {
> > > > +                 if (!mmap_read_trylock(mm))  {
> > > > +                         err = drm_gpusvm_evict_to_sram(gpusvm, range);
> > > > +                         goto err_mmput;
> > > > +                 }
> > > > +         } else {
> > > > +                 mmap_read_lock(mm);
> > > > +         }
> > > > + }
> > > > +
> > > > + mmap_assert_locked(mm);
> > > > +
> > > > + /*
> > > > +  * Loop required to find all VMA area structs for the corner case when
> > > > +  * VRAM backing has been partially unmapped from MM's address space.
> > > > +  */
> > > > +again:
> > > > + vas = find_vma(mm, start);
> > > > + if (!vas) {
> > > > +         if (!retry)
> > > > +                 err = -ENOENT;
> > > > +         goto err_mmunlock;
> > > > + }
> > > > +
> > > > + if (end <= vas->vm_start || start >= vas->vm_end) {
> > > > +         if (!retry)
> > > > +                 err = -EINVAL;
> > > > +         goto err_mmunlock;
> > > > + }
> > > > +
> > > > + err = __drm_gpusvm_migrate_to_sram(gpusvm, vas, NULL, start, end);
> > > > + if (err)
> > > > +         goto err_mmunlock;
> > > > +
> > > > + if (vas->vm_end < end) {
> > > > +         retry = true;
> > > > +         start = vas->vm_end;
> > > > +         goto again;
> > > > + }
> > > > +
> > > > + if (!ctx->mmap_locked) {
> > > > +         mmap_read_unlock(mm);
> > > > +         /*
> > > > +          * Using mmput_async as this function can be called while
> > > > +          * holding a dma-resv lock, and a final put can grab the mmap
> > > > +          * lock, causing a lock inversion.
> > > > +          */
> > > > +         mmput_async(mm);
> > > > + }
> > > > +
> > > > + return 0;
> > > > +
> > > > +err_mmunlock:
> > > > + if (!ctx->mmap_locked)
> > > > +         mmap_read_unlock(mm);
> > > > +err_mmput:
> > > > + if (!ctx->mmap_locked)
> > > > +         mmput_async(mm);
> > > > +err_out:
> > > > + return err;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_page_free - Put GPU SVM zone device data associated with a page
> > > > + * @page: Pointer to the page
> > > > + *
> > > > + * This function is a callback used to put the GPU SVM zone device data
> > > > + * associated with a page when it is being released.
> > > > + */
> > > > +static void drm_gpusvm_page_free(struct page *page)
> > > > +{
> > > > + drm_gpusvm_zdd_put(page->zone_device_data);
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_migrate_to_ram - Migrate GPU SVM range to RAM (page fault handler)
> > > > + * @vmf: Pointer to the fault information structure
> > > > + *
> > > > + * This function is a page fault handler used to migrate a GPU SVM range to RAM.
> > > > + * It retrieves the GPU SVM range information from the faulting page and invokes
> > > > + * the internal migration function to migrate the range back to RAM.
> > > > + *
> > > > + * Returns:
> > > > + * VM_FAULT_SIGBUS on failure, 0 on success.
> > > > + */
> > > > +static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault *vmf)
> > > > +{
> > > > + struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
> > > > + int err;
> > > > +
> > > > + err = __drm_gpusvm_migrate_to_sram(zdd->range->gpusvm,
> > > > +                                    vmf->vma, vmf->page,
> > > > +                                    zdd->range->va.start,
> > > > +                                    zdd->range->va.end);
> > > > +
> > > > + return err ? VM_FAULT_SIGBUS : 0;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_pagemap_ops - Device page map operations for GPU SVM
> > > > + */
> > > > +static const struct dev_pagemap_ops drm_gpusvm_pagemap_ops = {
> > > > + .page_free = drm_gpusvm_page_free,
> > > > + .migrate_to_ram = drm_gpusvm_migrate_to_ram,
> > > > +};
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_pagemap_ops_get - Retrieve GPU SVM device page map operations
> > > > + *
> > > > + * Returns:
> > > > + * Pointer to the GPU SVM device page map operations structure.
> > > > + */
> > > > +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void)
> > > > +{
> > > > + return &drm_gpusvm_pagemap_ops;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_has_mapping - Check if GPU SVM has mapping for the given address range
> > > > + * @gpusvm: Pointer to the GPU SVM structure.
> > > > + * @start: Start address
> > > > + * @end: End address
> > > > + *
> > > > + * Returns:
> > > > + * True if GPU SVM has mapping, False otherwise
> > > > + */
> > > > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start, u64 end)
> > > > +{
> > > > + struct drm_gpusvm_notifier *notifier;
> > > > +
> > > > + drm_gpusvm_for_each_notifier(notifier, gpusvm, start, end) {
> > > > +         struct drm_gpusvm_range *range = NULL;
> > > > +
> > > > +         drm_gpusvm_for_each_range(range, notifier, start, end)
> > > > +                 return true;
> > > > + }
> > > > +
> > > > + return false;
> > > > +}
> > > > diff --git a/drivers/gpu/drm/xe/drm_gpusvm.h b/drivers/gpu/drm/xe/drm_gpusvm.h
> > > > new file mode 100644
> > > > index 000000000000..0ea70f8534a8
> > > > --- /dev/null
> > > > +++ b/drivers/gpu/drm/xe/drm_gpusvm.h
> > > > @@ -0,0 +1,415 @@
> > > > +/* SPDX-License-Identifier: MIT */
> > > > +/*
> > > > + * Copyright © 2024 Intel Corporation
> > > > + */
> > > > +
> > > > +#ifndef __DRM_GPUSVM_H__
> > > > +#define __DRM_GPUSVM_H__
> > > > +
> > > > +#include <linux/kref.h>
> > > > +#include <linux/mmu_notifier.h>
> > > > +#include <linux/workqueue.h>
> > > > +
> > > > +struct dev_pagemap_ops;
> > > > +struct drm_device;
> > > > +struct drm_gpusvm;
> > > > +struct drm_gpusvm_notifier;
> > > > +struct drm_gpusvm_ops;
> > > > +struct drm_gpusvm_range;
> > > > +
> > > > +/**
> > > > + * struct drm_gpusvm_ops - Operations structure for GPU SVM
> > > > + *
> > > > + * This structure defines the operations for GPU Shared Virtual Memory (SVM).
> > > > + * These operations are provided by the GPU driver to manage SVM ranges and
> > > > + * perform operations such as migration between VRAM and system RAM.
> > > > + */
> > > > +struct drm_gpusvm_ops {
> > > > + /**
> > > > +  * @notifier_alloc: Allocate a GPU SVM notifier (optional)
> > > > +  *
> > > > +  * This function shall allocate a GPU SVM notifier.
> > > > +  *
> > > > +  * Returns:
> > > > +  * Pointer to the allocated GPU SVM notifier on success, NULL on failure.
> > > > +  */
> > > > + struct drm_gpusvm_notifier *(*notifier_alloc)(void);
> > > > +
> > > > + /**
> > > > +  * @notifier_free: Free a GPU SVM notifier (optional)
> > > > +  * @notifier: Pointer to the GPU SVM notifier to be freed
> > > > +  *
> > > > +  * This function shall free a GPU SVM notifier.
> > > > +  */
> > > > + void (*notifier_free)(struct drm_gpusvm_notifier *notifier);
> > > > +
> > > > + /**
> > > > +  * @range_alloc: Allocate a GPU SVM range (optional)
> > > > +  * @gpusvm: Pointer to the GPU SVM
> > > > +  *
> > > > +  * This function shall allocate a GPU SVM range.
> > > > +  *
> > > > +  * Returns:
> > > > +  * Pointer to the allocated GPU SVM range on success, NULL on failure.
> > > > +  */
> > > > + struct drm_gpusvm_range *(*range_alloc)(struct drm_gpusvm *gpusvm);
> > > > +
> > > > + /**
> > > > +  * @range_free: Free a GPU SVM range (optional)
> > > > +  * @range: Pointer to the GPU SVM range to be freed
> > > > +  *
> > > > +  * This function shall free a GPU SVM range.
> > > > +  */
> > > > + void (*range_free)(struct drm_gpusvm_range *range);
> > > > +
> > > > + /**
> > > > +  * @vram_release: Release VRAM allocation (optional)
> > > > +  * @vram_allocation: Driver-private pointer to the VRAM allocation
> > > > +  *
> > > > +  * This function shall release VRAM allocation and expects to drop a
> > > > +  * reference to VRAM allocation.
> > > > +  */
> > > > + void (*vram_release)(void *vram_allocation);
> > > > +
> > > > + /**
> > > > +  * @populate_vram_pfn: Populate VRAM PFN (required for migration)
> > > > +  * @gpusvm: Pointer to the GPU SVM
> > > > +  * @vram_allocation: Driver-private pointer to the VRAM allocation
> > > > +  * @npages: Number of pages to populate
> > > > +  * @pfn: Array of page frame numbers to populate
> > > > +  *
> > > > +  * This function shall populate VRAM page frame numbers (PFN).
> > > > +  *
> > > > +  * Returns:
> > > > +  * 0 on success, a negative error code on failure.
> > > > +  */
> > > > + int (*populate_vram_pfn)(struct drm_gpusvm *gpusvm,
> > > > +                          void *vram_allocation,
> > > > +                          unsigned long npages,
> > > > +                          unsigned long *pfn);
> > > > +
> > > > + /**
> > > > +  * @copy_to_vram: Copy to VRAM (required for migration)
> > > > +  * @gpusvm: Pointer to the GPU SVM
> > > > +  * @pages: Pointer to array of VRAM pages (destination)
> > > > +  * @dma_addr: Pointer to array of DMA addresses (source)
> > > > +  * @npages: Number of pages to copy
> > > > +  *
> > > > +  * This function shall copy pages to VRAM.
> > > > +  *
> > > > +  * Returns:
> > > > +  * 0 on success, a negative error code on failure.
> > > > +  */
> > > > + int (*copy_to_vram)(struct drm_gpusvm *gpusvm,
> > > > +                     struct page **pages,
> > > > +                     dma_addr_t *dma_addr,
> > > > +                     unsigned long npages);
> > > > +
> > > > + /**
> > > > +  * @copy_to_sram: Copy to system RAM (required for migration)
> > > > +  * @gpusvm: Pointer to the GPU SVM
> > > > +  * @pages: Pointer to array of VRAM pages (source)
> > > > +  * @dma_addr: Pointer to array of DMA addresses (destination)
> > > > +  * @npages: Number of pages to copy
> > > > +  *
> > > > +  * This function shall copy pages to system RAM.
> > > > +  *
> > > > +  * Returns:
> > > > +  * 0 on success, a negative error code on failure.
> > > > +  */
> > > > + int (*copy_to_sram)(struct drm_gpusvm *gpusvm,
> > > > +                     struct page **pages,
> > > > +                     dma_addr_t *dma_addr,
> > > > +                     unsigned long npages);
> > > > +
> > > > + /**
> > > > +  * @invalidate: Invalidate GPU SVM notifier (required)
> > > > +  * @gpusvm: Pointer to the GPU SVM
> > > > +  * @notifier: Pointer to the GPU SVM notifier
> > > > +  * @mmu_range: Pointer to the mmu_notifier_range structure
> > > > +  *
> > > > +  * This function shall invalidate the GPU page tables. It can safely
> > > > +  * walk the notifier range RB tree/list in this function. Called while
> > > > +  * holding the notifier lock.
> > > > +  */
> > > > + void (*invalidate)(struct drm_gpusvm *gpusvm,
> > > > +                    struct drm_gpusvm_notifier *notifier,
> > > > +                    const struct mmu_notifier_range *mmu_range);
> > > > +};
> > > > +
> > > > +/**
> > > > + * struct drm_gpusvm_notifier - Structure representing a GPU SVM notifier
> > > > + *
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @notifier: MMU interval notifier
> > > > + * @interval: Interval for the notifier
> > > > + * @rb: Red-black tree node for the parent GPU SVM structure notifier tree
> > > > + * @root: Cached root node of the RB tree containing ranges
> > > > + * @range_list: List head of ranges in the same order they appear in the
> > > > + *              interval tree. This is useful to keep iterating ranges while
> > > > + *              doing modifications to RB tree.
> > > > + * @flags.removed: Flag indicating whether the MMU interval notifier has been
> > > > + *                 removed
> > > > + *
> > > > + * This structure represents a GPU SVM notifier.
> > > > + */
> > > > +struct drm_gpusvm_notifier {
> > > > + struct drm_gpusvm *gpusvm;
> > > > + struct mmu_interval_notifier notifier;
> > > > + struct {
> > > > +         u64 start;
> > > > +         u64 end;
> > > > + } interval;
> > > > + struct {
> > > > +         struct rb_node node;
> > > > +         struct list_head entry;
> > > > +         u64 __subtree_last;
> > > > + } rb;
> > > > + struct rb_root_cached root;
> > > > + struct list_head range_list;
> > > > + struct {
> > > > +         u32 removed : 1;
> > > > + } flags;
> > > > +};
> > > > +
> > > > +/**
> > > > + * struct drm_gpusvm_range - Structure representing a GPU SVM range
> > > > + *
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @notifier: Pointer to the GPU SVM notifier
> > > > + * @refcount: Reference count for the range
> > > > + * @rb: Red-black tree node for the parent GPU SVM notifier structure range tree
> > > > + * @va: Virtual address range
> > > > + * @notifier_seq: Notifier sequence number of the range's pages
> > > > + * @pages: Pointer to the array of pages (if backing store is in VRAM)
> > > > + * @dma_addr: DMA address array (if backing store is SRAM and DMA mapped)
> > > > + * @vram_allocation: Driver-private pointer to the VRAM allocation
> > > > + * @order: Order of dma mapping. i.e. PAGE_SIZE << order is mapping size
> > > > + * @flags.migrate_vram: Flag indicating whether the range can be migrated to VRAM
> > > > + * @flags.unmapped: Flag indicating if the range has been unmapped
> > > > + * @flags.partial_unmap: Flag indicating if the range has been partially unmapped
> > > > + * @flags.has_vram_pages: Flag indicating if the range has vram pages
> > > > + * @flags.has_dma_mapping: Flag indicating if the range has a DMA mapping
> > > > + * @flags.kfree_mapping: Flag indicating @dma_addr is a compact allocation based
> > > > + *                       on @order which releases via kfree
> > > > + *
> > > > + * This structure represents a GPU SVM range used for tracking memory ranges
> > > > + * mapped in a DRM device.
> > > > + */
> > > > +struct drm_gpusvm_range {
> > > > + struct drm_gpusvm *gpusvm;
> > > > + struct drm_gpusvm_notifier *notifier;
> > > > + struct kref refcount;
> > > > + struct {
> > > > +         struct rb_node node;
> > > > +         struct list_head entry;
> > > > +         u64 __subtree_last;
> > > > + } rb;
> > > > + struct {
> > > > +         u64 start;
> > > > +         u64 end;
> > > > + } va;
> > > > + unsigned long notifier_seq;
> > > > + union {
> > > > +         struct page **pages;
> > > > +         dma_addr_t *dma_addr;
> > > > + };
> > > > + void *vram_allocation;
> > > > + u16 order;
> > > > + struct {
> > > > +         /* All flags below must be set upon creation */
> > > > +         u16 migrate_vram : 1;
> > > > +         /* All flags below must be set / cleared under notifier lock */
> > > > +         u16 unmapped : 1;
> > > > +         u16 partial_unmap : 1;
> > > > +         u16 has_vram_pages : 1;
> > > > +         u16 has_dma_mapping : 1;
> > > > +         u16 kfree_mapping : 1;
> > > > + } flags;
> > > > +};
> > > > +
> > > > +/**
> > > > + * struct drm_gpusvm - GPU SVM structure
> > > > + *
> > > > + * @name: Name of the GPU SVM
> > > > + * @drm: Pointer to the DRM device structure
> > > > + * @mm: Pointer to the mm_struct for the address space
> > > > + * @device_private_page_owner: Device private pages owner
> > > > + * @mm_start: Start address of GPU SVM
> > > > + * @mm_range: Range of the GPU SVM
> > > > + * @notifier_size: Size of individual notifiers
> > > > + * @ops: Pointer to the operations structure for GPU SVM
> > > > + * @chunk_sizes: Pointer to the array of chunk sizes used in range allocation.
> > > > + *               Entries should be powers of 2 in descending order.
> > > > + * @num_chunks: Number of chunks
> > > > + * @notifier_lock: Read-write semaphore for protecting notifier operations
> > > > + * @zdd_wq: Workqueue for deferred work on zdd destruction
> > > > + * @root: Cached root node of the Red-Black tree containing GPU SVM notifiers
> > > > + * @notifier_list: list head of notifiers in the same order they
> > > > + *                 appear in the interval tree. This is useful to keep iterating
> > > > + *                 notifiers while doing modifications to RB tree.
> > > > + *
> > > > + * This structure represents a GPU SVM (Shared Virtual Memory) used for tracking
> > > > + * memory ranges mapped in a DRM (Direct Rendering Manager) device.
> > > > + *
> > > > + * No reference counting is provided, as this is expected to be embedded in the
> > > > + * driver VM structure along with the struct drm_gpuvm, which handles reference
> > > > + * counting.
> > > > + */
> > > > +struct drm_gpusvm {
> > > > + const char *name;
> > > > + struct drm_device *drm;
> > > > + struct mm_struct *mm;
> > > > + void *device_private_page_owner;
> > > > + u64 mm_start;
> > > > + u64 mm_range;
> > > > + u64 notifier_size;
> > > > + const struct drm_gpusvm_ops *ops;
> > > > + const u64 *chunk_sizes;
> > > > + int num_chunks;
> > > > + struct rw_semaphore notifier_lock;
> > > > + struct workqueue_struct *zdd_wq;
> > > > + struct rb_root_cached root;
> > > > + struct list_head notifier_list;
> > > > +};
> > > > +
> > > > +/**
> > > > + * struct drm_gpusvm_ctx - DRM GPU SVM context
> > > > + *
> > > > + * @mmap_locked: mmap lock is locked
> > > > + * @trylock_mmap: trylock mmap lock, used to avoid locking inversions
> > > > + *                (e.g. dma-resv -> mmap lock)
> > > > + * @in_notifier: entering from a MMU notifier
> > > > + * @read_only: operating on read-only memory
> > > > + * @vram_possible: possible to use VRAM
> > > > + * @prefault: prefault pages
> > > > + *
> > > > + * Context that DRM GPU SVM is operating in (i.e. user arguments).
> > > > + */
> > > > +struct drm_gpusvm_ctx {
> > > > + u32 mmap_locked :1;
> > > > + u32 trylock_mmap :1;
> > > > + u32 in_notifier :1;
> > > > + u32 read_only :1;
> > > > + u32 vram_possible :1;
> > > > + u32 prefault :1;
> > > > +};
> > > > +
> > > > +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> > > > +             const char *name, struct drm_device *drm,
> > > > +             struct mm_struct *mm, void *device_private_page_owner,
> > > > +             u64 mm_start, u64 mm_range, u64 notifier_size,
> > > > +             const struct drm_gpusvm_ops *ops,
> > > > +             const u64 *chunk_sizes, int num_chunks);
> > > > +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm);
> > > > +void drm_gpusvm_free(struct drm_gpusvm *gpusvm);
> > > > +
> > > > +struct drm_gpusvm_range *
> > > > +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64 fault_addr,
> > > > +                         u64 gpuva_start, u64 gpuva_end,
> > > > +                         const struct drm_gpusvm_ctx *ctx);
> > > > +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> > > > +                      struct drm_gpusvm_range *range);
> > > > +
> > > > +struct drm_gpusvm_range *
> > > > +drm_gpusvm_range_get(struct drm_gpusvm_range *range);
> > > > +void drm_gpusvm_range_put(struct drm_gpusvm_range *range);
> > > > +
> > > > +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> > > > +                           struct drm_gpusvm_range *range);
> > > > +
> > > > +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> > > > +                        struct drm_gpusvm_range *range,
> > > > +                        const struct drm_gpusvm_ctx *ctx);
> > > > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > > > +                           struct drm_gpusvm_range *range,
> > > > +                           const struct drm_gpusvm_ctx *ctx);
> > > > +
> > > > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > > > +                        struct drm_gpusvm_range *range,
> > > > +                        void *vram_allocation,
> > > > +                        const struct drm_gpusvm_ctx *ctx);
> > > > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > > +                        struct drm_gpusvm_range *range,
> > > > +                        const struct drm_gpusvm_ctx *ctx);
> > > > +
> > > > +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void);
> > > > +
> > > > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start, u64 end);
> > > > +
> > > > +struct drm_gpusvm_range *
> > > > +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64 start, u64 end);
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_notifier_lock - Lock GPU SVM notifier
> > > > + * @gpusvm__: Pointer to the GPU SVM structure.
> > > > + *
> > > > + * Abstracts the client usage of the GPU SVM notifier lock; takes the lock.
> > > > + */
> > > > +#define drm_gpusvm_notifier_lock(gpusvm__)       \
> > > > + down_read(&(gpusvm__)->notifier_lock)
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_notifier_unlock - Unlock GPU SVM notifier
> > > > + * @gpusvm__: Pointer to the GPU SVM structure.
> > > > + *
> > > > + * Abstracts the client usage of the GPU SVM notifier lock; drops the lock.
> > > > + */
> > > > +#define drm_gpusvm_notifier_unlock(gpusvm__)     \
> > > > + up_read(&(gpusvm__)->notifier_lock)
> > > > +
> > > > +/**
> > > > + * __drm_gpusvm_range_next - Get the next GPU SVM range in the list
> > > > + * @range: a pointer to the current GPU SVM range
> > > > + *
> > > > + * Return: A pointer to the next drm_gpusvm_range if available, or NULL if the
> > > > + *         current range is the last one or if the input range is NULL.
> > > > + */
> > > > +static inline struct drm_gpusvm_range *
> > > > +__drm_gpusvm_range_next(struct drm_gpusvm_range *range)
> > > > +{
> > > > + if (range && !list_is_last(&range->rb.entry,
> > > > +                            &range->notifier->range_list))
> > > > +         return list_next_entry(range, rb.entry);
> > > > +
> > > > + return NULL;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_for_each_range - Iterate over GPU SVM ranges in a notifier
> > > > + * @range__: Iterator variable for the ranges. If set, it indicates the start of
> > > > + *            the iterator. If NULL, call drm_gpusvm_range_find() to get the range.
> > > > + * @notifier__: Pointer to the GPU SVM notifier
> > > > + * @start__: Start address of the range
> > > > + * @end__: End address of the range
> > > > + *
> > > > + * This macro is used to iterate over GPU SVM ranges in a notifier. It is safe
> > > > + * to use while holding the driver SVM lock or the notifier lock.
> > > > + */
> > > > +#define drm_gpusvm_for_each_range(range__, notifier__, start__, end__)   \
> > > > + for ((range__) = (range__) ?:                                   \
> > > > +      drm_gpusvm_range_find((notifier__), (start__), (end__));   \
> > > > +      (range__) && (range__->va.start < (end__));                \
> > > > +      (range__) = __drm_gpusvm_range_next(range__))
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_range_set_unmapped - Mark a GPU SVM range as unmapped
> > > > + * @range: Pointer to the GPU SVM range structure.
> > > > + * @mmu_range: Pointer to the MMU notifier range structure.
> > > > + *
> > > > + * This function marks a GPU SVM range as unmapped and sets the partial_unmap flag
> > > > + * if the range partially falls within the provided MMU notifier range.
> > > > + */
> > > > +static inline void
> > > > +drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range *range,
> > > > +                       const struct mmu_notifier_range *mmu_range)
> > > > +{
> > > > + lockdep_assert_held_write(&range->gpusvm->notifier_lock);
> > > > +
> > > > + range->flags.unmapped = true;
> > > > + if (range->va.start < mmu_range->start ||
> > > > +     range->va.end > mmu_range->end)
> > > > +         range->flags.partial_unmap = true;
> > > > +}
> > > > +
> > > > +#endif /* __DRM_GPUSVM_H__ */
> > > > --
> > > > 2.34.1
> > > >
> >
Daniel Vetter Aug. 28, 2024, 4:25 p.m. UTC | #5
On Wed, Aug 28, 2024 at 03:43:48PM +0000, Matthew Brost wrote:
> On Wed, Aug 28, 2024 at 04:46:24PM +0200, Christian König wrote:
> > Am 28.08.24 um 16:31 schrieb Daniel Vetter:
> > > On Tue, Aug 27, 2024 at 07:48:38PM -0700, Matthew Brost wrote:
> > > > +		if (!ctx->mmap_locked) {
> > > > +			/*
> > > > +			 * XXX: HMM locking document indicates only a read-lock
> > > > +			 * is required but there apears to be a window between
> > > > +			 * the MMU_NOTIFY_MIGRATE event triggered in a CPU fault
> > > > +			 * via migrate_vma_setup and the pages actually moving
> > > > +			 * in migrate_vma_finalize in which this code can grab
> > > > +			 * garbage pages. Grabbing the write-lock if the range
> > > > +			 * is attached to vram appears to protect against this
> > > > +			 * race.
> > > > +			 */
> 
> Thanks the comments, replying to both of you inline.
> 
> > > This one is really scary, since it means the entire migrate pte trickery
> > > is essentially completely busted. Grabbing the mmap write lock just means
> > > you block out pretty much everything interesting from concurrently
> > > happening.
> > > 
> > > My gut feeling says we need to figure out what's happening here, because
> > > this looks a bit too fundamental to me.
> 
> I agree. I haven’t looked into this issue for a couple of months but
> really need to understand what is going on.
> 
> I should have mentioned this in the cover letter: the goal of this
> series was to produce something for review that is stable and supports
> UMDs/user applications. It was not intended to be presented as a final
> solution. This issue certainly falls into the category of "needs to be
> understood and requires a proper fix."
> 
> One open question I have is whether the test case that triggers this
> issue is even defined behavior. The test creates concurrent access
> between the GPU and CPU to the same memory address, resulting in GPU and
> CPU faults racing against each other. It’s possible that this is
> undefined behavior, so data corruption might be acceptable—i.e., the
> kernel can’t crash, but incorrect results might be permissible.

Yes this is supposed to be defined, at least from an hmm pov. And core mm/
is ridiculous in how many races it allows, especially around concurrent
fault handling.

It is ofc really slow if every fault results in a migration, but that's a
matter of the application setting stupid memory migration hints for the
gpu.

> e.g. This is the only defined usage model:
> 
> alloc_memory();
> start_compute_kernel();
> sync_on_compute_kernel_completion();
> read_memory();
> 
> Hopefully, in the next week or so, I'll be heavily engaging with the UMD
> teams. Development can then start, and applications will be running soon
> after. This will allow us to address issues like this, collect data on
> memory usage, and verify some of the assumptions I've made, such as
> optimizing for 2M+ allocations.
> 
> > 
> > I think I have at least a high level understanding what's going on here,
> > Felix and especially Philip should know more of the details.
> > 
> 
> I meant to reach out to AMD for issues like this. So, Felix
> (felix.kuehling@amd.com) and Philip (Philip.Yang@amd.com) would be good
> contacts?
> 
> > In general grabbing the mm_lock to protect PTEs from changing is complete
> > nonsense. The mm_lock is to protect the VMAs and *not* the PTEs!
> > 
> 
> Thanks for the hint. I believe that in the AMD implementation, I noticed
> some additional locks for migration, which might be how you mitigated
> this issue.

Yeah, so in general holding mmap_read is indeed pure magical thinking for
preventing pte changes, like Christian points out. It doesn't stop
invalidates, and with the per vma locking it also doesn't stop new valid
ptes from being inserted at least for anon memory.

Except migration pte entries that point at vram pages are special, and are
_only_ resolved while holding mmap_read. Which means holding mmap_write
for the case of looking up our own vram pages with hmm_range_fault
actually prevents issues. And so this duct-tape of holding mmap_write very
much looks like a working hack to plug any races against concurrently
ongoing migrations to system memory due to cpu faults.
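
For reference, the read-lock-only pattern the HMM locking document describes is
roughly the sketch below (illustrative only; driver_lock stands in for whatever
lock the invalidate() callback also takes, i.e. the notifier_lock in this
series). It is essentially what the patch's page-collection loop already does,
minus the conditional mmap_write_lock:

/* assumed includes: <linux/hmm.h>, <linux/mm.h>, <linux/mmu_notifier.h>, <linux/rwsem.h> */
static int faultin_and_program(struct hmm_range *range, struct mm_struct *mm,
			       struct rw_semaphore *driver_lock)
{
	int err;

again:
	range->notifier_seq = mmu_interval_read_begin(range->notifier);

	mmap_read_lock(mm);
	err = hmm_range_fault(range);
	mmap_read_unlock(mm);
	if (err == -EBUSY)
		goto again;
	if (err)
		return err;

	down_write(driver_lock);
	if (mmu_interval_read_retry(range->notifier, range->notifier_seq)) {
		up_write(driver_lock);
		goto again;
	}
	/* ...program the GPU page tables from range->hmm_pfns here... */
	up_write(driver_lock);

	return 0;
}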

An even more fun corner case is multiple concurrent cpu faults on the same
vram page. fork gets you that, or, maybe a bit more reasonably, mremap with
MREMAP_DONTUNMAP | MREMAP_MAYMOVE. I think just hammering the same va with
multiple threads alone isn't enough; it's better to have a private va for
each thread pointing at the same anon memory page, so that you can get
more parallel faults due to finely grained pte locking.

Would be a good testcase to add, if you don't have it yet.
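
To make that concrete, a minimal sketch of such a test (fork-based, since each
child gets its own page tables referencing the same anon pages; gpu_touch() is
a placeholder for whatever UMD call makes the GPU fault the range into VRAM):

#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

#define SZ (2UL * 1024 * 1024)

int main(void)
{
	volatile char *buf = mmap(NULL, SZ, PROT_READ | PROT_WRITE,
				  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	size_t off;
	int i;

	memset((void *)buf, 0xaa, SZ);	/* populate the anon pages */
	/* gpu_touch(buf, SZ); */	/* placeholder: migrate the range to VRAM */

	for (i = 0; i < 8; i++) {
		if (fork() == 0) {
			/* each child CPU-faults the same device-private pages */
			for (off = 0; off < SZ; off += 4096)
				(void)buf[off];
			_exit(0);
		}
	}
	for (off = 0; off < SZ; off += 4096)
		(void)buf[off];
	while (wait(NULL) > 0)
		;
	return 0;
}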

> I must say it is a bit unfortunate that the HMM locking documentation
> doesn’t mention this. I believe the documentation needs additional
> information, which I can add once we finalize the solution.

Yeah, at least from my very cursory look you don't have enough locking.
I've written an in-depth reply to patch 23 with the high-level summary of
my thoughts.

Cheers, Sima

> 
> Matt 
> 
> > Even with the write side of the mm_lock taken it is perfectly possible that
> > PTEs change. It's just less likely.
> > 
> > We ran into multiple issues before we figured out this important distinction
> > as well.
> > 
> > Christian.
> > 
> > > -Sima
> > > 
> > > 
> > > > +			if (vram_pages)
> > > > +				mmap_write_lock(mm);
> > > > +			else
> > > > +				mmap_read_lock(mm);
> > > > +		}
> > > > +		err = hmm_range_fault(&hmm_range);
> > > > +		if (!ctx->mmap_locked) {
> > > > +			if (vram_pages)
> > > > +				mmap_write_unlock(mm);
> > > > +			else
> > > > +				mmap_read_unlock(mm);
> > > > +		}
> > > > +
> > > > +		if (err == -EBUSY) {
> > > > +			if (time_after(jiffies, timeout))
> > > > +				break;
> > > > +
> > > > +			hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> > > > +			continue;
> > > > +		}
> > > > +		break;
> > > > +	}
> > > > +	if (!ctx->mmap_locked)
> > > > +		mmput(mm);
> > > > +	if (err)
> > > > +		goto err_free;
> > > > +
> > > > +	pages = (struct page **)pfns;
> > > > +
> > > > +	if (ctx->prefault) {
> > > > +		range->pages = pages;
> > > > +		goto set_seqno;
> > > > +	}
> > > > +
> > > > +map_pages:
> > > > +	if (is_device_private_page(hmm_pfn_to_page(pfns[0]))) {
> > > > +		WARN_ON_ONCE(!range->vram_allocation);
> > > > +
> > > > +		for (i = 0; i < npages; ++i) {
> > > > +			pages[i] = hmm_pfn_to_page(pfns[i]);
> > > > +
> > > > +			if (WARN_ON_ONCE(!is_device_private_page(pages[i]))) {
> > > > +				err = -EOPNOTSUPP;
> > > > +				goto err_free;
> > > > +			}
> > > > +		}
> > > > +
> > > > +		/* Do not race with notifier unmapping pages */
> > > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > > +		range->flags.has_vram_pages = true;
> > > > +		range->pages = pages;
> > > > +		if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
> > > > +			err = -EAGAIN;
> > > > +			__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > > +		}
> > > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > > +	} else {
> > > > +		dma_addr_t *dma_addr = (dma_addr_t *)pfns;
> > > > +
> > > > +		for_each_dma_page(i, j, npages, order) {
> > > > +			if (WARN_ON_ONCE(i && order !=
> > > > +					 hmm_pfn_to_map_order(pfns[i]))) {
> > > > +				err = -EOPNOTSUPP;
> > > > +				npages = i;
> > > > +				goto err_unmap;
> > > > +			}
> > > > +			order = hmm_pfn_to_map_order(pfns[i]);
> > > > +
> > > > +			pages[j] = hmm_pfn_to_page(pfns[i]);
> > > > +			if (WARN_ON_ONCE(is_zone_device_page(pages[j]))) {
> > > > +				err = -EOPNOTSUPP;
> > > > +				npages = i;
> > > > +				goto err_unmap;
> > > > +			}
> > > > +
> > > > +			set_page_dirty_lock(pages[j]);
> > > > +			mark_page_accessed(pages[j]);
> > > > +
> > > > +			dma_addr[j] = dma_map_page(gpusvm->drm->dev,
> > > > +						   pages[j], 0,
> > > > +						   PAGE_SIZE << order,
> > > > +						   DMA_BIDIRECTIONAL);
> > > > +			if (dma_mapping_error(gpusvm->drm->dev, dma_addr[j])) {
> > > > +				err = -EFAULT;
> > > > +				npages = i;
> > > > +				goto err_unmap;
> > > > +			}
> > > > +		}
> > > > +
> > > > +		/* Huge pages, reduce memory footprint */
> > > > +		if (order) {
> > > > +			dma_addr = kmalloc_array(j, sizeof(*dma_addr),
> > > > +						 GFP_KERNEL);
> > > > +			if (dma_addr) {
> > > > +				for (i = 0; i < j; ++i)
> > > > +					dma_addr[i] = (dma_addr_t)pfns[i];
> > > > +				kvfree(pfns);
> > > > +				kfree_mapping = true;
> > > > +			} else {
> > > > +				dma_addr = (dma_addr_t *)pfns;
> > > > +			}
> > > > +		}
> > > > +
> > > > +		/* Do not race with notifier unmapping pages */
> > > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > > +		range->order = order;
> > > > +		range->flags.kfree_mapping = kfree_mapping;
> > > > +		range->flags.has_dma_mapping = true;
> > > > +		range->dma_addr = dma_addr;
> > > > +		range->vram_allocation = NULL;
> > > > +		if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
> > > > +			err = -EAGAIN;
> > > > +			__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > > +		}
> > > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > > +	}
> > > > +
> > > > +	if (err == -EAGAIN)
> > > > +		goto retry;
> > > > +set_seqno:
> > > > +	range->notifier_seq = hmm_range.notifier_seq;
> > > > +
> > > > +	return 0;
> > > > +
> > > > +err_unmap:
> > > > +	for_each_dma_page(i, j, npages, order)
> > > > +		dma_unmap_page(gpusvm->drm->dev,
> > > > +			       (dma_addr_t)pfns[j],
> > > > +			       PAGE_SIZE << order, DMA_BIDIRECTIONAL);
> > > > +err_free:
> > > > +	if (alloc_pfns)
> > > > +		kvfree(pfns);
> > > > +err_out:
> > > > +	return err;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_range_unmap_pages - Unmap pages associated with a GPU SVM range
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @range: Pointer to the GPU SVM range structure
> > > > + * @ctx: GPU SVM context
> > > > + *
> > > > + * This function unmaps pages associated with a GPU SVM range. If @in_notifier
> > > > + * is set, it is assumed that gpusvm->notifier_lock is held in write mode; if it
> > > > + * is clear, it acquires gpusvm->notifier_lock in read mode. Must be called on
> > > > + * each GPU SVM range attached to notifier in gpusvm->ops->invalidate for IOMMU
> > > > + * security model.
> > > > + */
> > > > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > > > +				  struct drm_gpusvm_range *range,
> > > > +				  const struct drm_gpusvm_ctx *ctx)
> > > > +{
> > > > +	if (ctx->in_notifier)
> > > > +		lockdep_assert_held_write(&gpusvm->notifier_lock);
> > > > +	else
> > > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > > +
> > > > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > > +
> > > > +	if (!ctx->in_notifier)
> > > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_migration_put_page - Put a migration page
> > > > + * @page: Pointer to the page to put
> > > > + *
> > > > + * This function unlocks and puts a page.
> > > > + */
> > > > +static void drm_gpusvm_migration_put_page(struct page *page)
> > > > +{
> > > > +	unlock_page(page);
> > > > +	put_page(page);
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_migration_put_pages - Put migration pages
> > > > + * @npages: Number of pages
> > > > + * @migrate_pfn: Array of migrate page frame numbers
> > > > + *
> > > > + * This function puts an array of pages.
> > > > + */
> > > > +static void drm_gpusvm_migration_put_pages(unsigned long npages,
> > > > +					   unsigned long *migrate_pfn)
> > > > +{
> > > > +	unsigned long i;
> > > > +
> > > > +	for (i = 0; i < npages; ++i) {
> > > > +		if (!migrate_pfn[i])
> > > > +			continue;
> > > > +
> > > > +		drm_gpusvm_migration_put_page(migrate_pfn_to_page(migrate_pfn[i]));
> > > > +		migrate_pfn[i] = 0;
> > > > +	}
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_get_vram_page - Get a reference to a VRAM page
> > > > + * @page: Pointer to the page
> > > > + * @zdd: Pointer to the GPU SVM zone device data
> > > > + *
> > > > + * This function associates the given page with the specified GPU SVM zone
> > > > + * device data and initializes it for zone device usage.
> > > > + */
> > > > +static void drm_gpusvm_get_vram_page(struct page *page,
> > > > +				     struct drm_gpusvm_zdd *zdd)
> > > > +{
> > > > +	page->zone_device_data = drm_gpusvm_zdd_get(zdd);
> > > > +	zone_device_page_init(page);
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_migrate_map_pages() - Map migration pages for GPU SVM migration
> > > > + * @dev: The device for which the pages are being mapped
> > > > + * @dma_addr: Array to store DMA addresses corresponding to mapped pages
> > > > + * @migrate_pfn: Array of migrate page frame numbers to map
> > > > + * @npages: Number of pages to map
> > > > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > > > + *
> > > > + * This function maps pages of memory for migration usage in GPU SVM. It
> > > > + * iterates over each page frame number provided in @migrate_pfn, maps the
> > > > + * corresponding page, and stores the DMA address in the provided @dma_addr
> > > > + * array.
> > > > + *
> > > > + * Return: 0 on success, -EFAULT if an error occurs during mapping.
> > > > + */
> > > > +static int drm_gpusvm_migrate_map_pages(struct device *dev,
> > > > +					dma_addr_t *dma_addr,
> > > > +					long unsigned int *migrate_pfn,
> > > > +					unsigned long npages,
> > > > +					enum dma_data_direction dir)
> > > > +{
> > > > +	unsigned long i;
> > > > +
> > > > +	for (i = 0; i < npages; ++i) {
> > > > +		struct page *page = migrate_pfn_to_page(migrate_pfn[i]);
> > > > +
> > > > +		if (!page)
> > > > +			continue;
> > > > +
> > > > +		if (WARN_ON_ONCE(is_zone_device_page(page)))
> > > > +			return -EFAULT;
> > > > +
> > > > +		dma_addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir);
> > > > +		if (dma_mapping_error(dev, dma_addr[i]))
> > > > +			return -EFAULT;
> > > > +	}
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously mapped for GPU SVM migration
> > > > + * @dev: The device for which the pages were mapped
> > > > + * @dma_addr: Array of DMA addresses corresponding to mapped pages
> > > > + * @npages: Number of pages to unmap
> > > > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > > > + *
> > > > + * This function unmaps previously mapped pages of memory for GPU Shared Virtual
> > > > + * Memory (SVM). It iterates over each DMA address provided in @dma_addr, checks
> > > > + * if it's valid and not already unmapped, and unmaps the corresponding page.
> > > > + */
> > > > +static void drm_gpusvm_migrate_unmap_pages(struct device *dev,
> > > > +					   dma_addr_t *dma_addr,
> > > > +					   unsigned long npages,
> > > > +					   enum dma_data_direction dir)
> > > > +{
> > > > +	unsigned long i;
> > > > +
> > > > +	for (i = 0; i < npages; ++i) {
> > > > +		if (!dma_addr[i] || dma_mapping_error(dev, dma_addr[i]))
> > > > +			continue;
> > > > +
> > > > +		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
> > > > +	}
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_migrate_to_vram - Migrate GPU SVM range to VRAM
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @range: Pointer to the GPU SVM range structure
> > > > + * @vram_allocation: Driver-private pointer to the VRAM allocation. The caller
> > > > + *                   should hold a reference to the VRAM allocation, which
> > > > + *                   should be dropped via ops->vram_release or upon the
> > > > + *                   failure of this function.
> > > > + * @ctx: GPU SVM context
> > > > + *
> > > > + * This function migrates the specified GPU SVM range to VRAM. It performs the
> > > > + * necessary setup and invokes the driver-specific operations for migration to
> > > > + * VRAM. Upon successful return, @vram_allocation can safely reference @range
> > > > + * until ops->vram_release is called, which happens only after a successful return.
> > > > + *
> > > > + * Returns:
> > > > + * 0 on success, negative error code on failure.
> > > > + */
> > > > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > > > +			       struct drm_gpusvm_range *range,
> > > > +			       void *vram_allocation,
> > > > +			       const struct drm_gpusvm_ctx *ctx)
> > > > +{
> > > > +	u64 start = range->va.start, end = range->va.end;
> > > > +	struct migrate_vma migrate = {
> > > > +		.start		= start,
> > > > +		.end		= end,
> > > > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > > > +		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
> > > > +	};
> > > > +	struct mm_struct *mm = gpusvm->mm;
> > > > +	unsigned long i, npages = npages_in_range(start, end);
> > > > +	struct vm_area_struct *vas;
> > > > +	struct drm_gpusvm_zdd *zdd = NULL;
> > > > +	struct page **pages;
> > > > +	dma_addr_t *dma_addr;
> > > > +	void *buf;
> > > > +	int err;
> > > > +
> > > > +	if (!range->flags.migrate_vram)
> > > > +		return -EINVAL;
> > > > +
> > > > +	if (!gpusvm->ops->populate_vram_pfn || !gpusvm->ops->copy_to_vram ||
> > > > +	    !gpusvm->ops->copy_to_sram)
> > > > +		return -EOPNOTSUPP;
> > > > +
> > > > +	if (!ctx->mmap_locked) {
> > > > +		if (!mmget_not_zero(mm)) {
> > > > +			err = -EFAULT;
> > > > +			goto err_out;
> > > > +		}
> > > > +		mmap_write_lock(mm);
> > > > +	}
> > > > +
> > > > +	mmap_assert_locked(mm);
> > > > +
> > > > +	vas = vma_lookup(mm, start);
> > > > +	if (!vas) {
> > > > +		err = -ENOENT;
> > > > +		goto err_mmunlock;
> > > > +	}
> > > > +
> > > > +	if (end > vas->vm_end || start < vas->vm_start) {
> > > > +		err = -EINVAL;
> > > > +		goto err_mmunlock;
> > > > +	}
> > > > +
> > > > +	if (!vma_is_anonymous(vas)) {
> > > > +		err = -EBUSY;
> > > > +		goto err_mmunlock;
> > > > +	}
> > > > +
> > > > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
> > > > +		       sizeof(*pages), GFP_KERNEL);
> > > > +	if (!buf) {
> > > > +		err = -ENOMEM;
> > > > +		goto err_mmunlock;
> > > > +	}
> > > > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > > > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
> > > > +
> > > > +	zdd = drm_gpusvm_zdd_alloc(range);
> > > > +	if (!zdd) {
> > > > +		err = -ENOMEM;
> > > > +		goto err_free;
> > > > +	}
> > > > +
> > > > +	migrate.vma = vas;
> > > > +	migrate.src = buf;
> > > > +	migrate.dst = migrate.src + npages;
> > > > +
> > > > +	err = migrate_vma_setup(&migrate);
> > > > +	if (err)
> > > > +		goto err_free;
> > > > +
> > > > +	/*
> > > > +	 * FIXME: The cases below, !migrate.cpages and migrate.cpages != npages, are
> > > > +	 * not always errors. Need to revisit possible cases and how to handle them. We
> > > > +	 * could prefault on migrate.cpages != npages via hmm_range_fault.
> > > > +	 */
> > > > +
> > > > +	if (!migrate.cpages) {
> > > > +		err = -EFAULT;
> > > > +		goto err_free;
> > > > +	}
> > > > +
> > > > +	if (migrate.cpages != npages) {
> > > > +		err = -EBUSY;
> > > > +		goto err_finalize;
> > > > +	}
> > > > +
> > > > +	err = gpusvm->ops->populate_vram_pfn(gpusvm, vram_allocation, npages,
> > > > +					     migrate.dst);
> > > > +	if (err)
> > > > +		goto err_finalize;
> > > > +
> > > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> > > > +					   migrate.src, npages, DMA_TO_DEVICE);
> > > > +	if (err)
> > > > +		goto err_finalize;
> > > > +
> > > > +	for (i = 0; i < npages; ++i) {
> > > > +		struct page *page = pfn_to_page(migrate.dst[i]);
> > > > +
> > > > +		pages[i] = page;
> > > > +		migrate.dst[i] = migrate_pfn(migrate.dst[i]);
> > > > +		drm_gpusvm_get_vram_page(page, zdd);
> > > > +	}
> > > > +
> > > > +	err = gpusvm->ops->copy_to_vram(gpusvm, pages, dma_addr, npages);
> > > > +	if (err)
> > > > +		goto err_finalize;
> > > > +
> > > > +	/* Upon success bind vram allocation to range and zdd */
> > > > +	range->vram_allocation = vram_allocation;
> > > > +	WRITE_ONCE(zdd->vram_allocation, vram_allocation);	/* Owns ref */
> > > > +
> > > > +err_finalize:
> > > > +	if (err)
> > > > +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> > > > +	migrate_vma_pages(&migrate);
> > > > +	migrate_vma_finalize(&migrate);
> > > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
> > > > +				       DMA_TO_DEVICE);
> > > > +err_free:
> > > > +	if (zdd)
> > > > +		drm_gpusvm_zdd_put(zdd);
> > > > +	kvfree(buf);
> > > > +err_mmunlock:
> > > > +	if (!ctx->mmap_locked) {
> > > > +		mmap_write_unlock(mm);
> > > > +		mmput(mm);
> > > > +	}
> > > > +err_out:
> > > > +	return err;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_migrate_populate_sram_pfn - Populate SRAM PFNs for a VM area
> > > > + * @vas: Pointer to the VM area structure, can be NULL
> > > > + * @npages: Number of pages to populate
> > > > + * @src_mpfn: Source array of migrate PFNs
> > > > + * @mpfn: Array of migrate PFNs to populate
> > > > + * @addr: Start address for PFN allocation
> > > > + *
> > > > + * This function populates the SRAM migrate page frame numbers (PFNs) for the
> > > > + * specified VM area structure. It allocates and locks pages in the VM area for
> > > > + * SRAM usage. If @vas is non-NULL, alloc_page_vma() is used for allocation; if
> > > > + * it is NULL, alloc_page() is used.
> > > > + *
> > > > + * Returns:
> > > > + * 0 on success, negative error code on failure.
> > > > + */
> > > > +static int drm_gpusvm_migrate_populate_sram_pfn(struct vm_area_struct *vas,
> > > > +						unsigned long npages,
> > > > +						unsigned long *src_mpfn,
> > > > +						unsigned long *mpfn, u64 addr)
> > > > +{
> > > > +	unsigned long i;
> > > > +
> > > > +	for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
> > > > +		struct page *page;
> > > > +
> > > > +		if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
> > > > +			continue;
> > > > +
> > > > +		if (vas)
> > > > +			page = alloc_page_vma(GFP_HIGHUSER, vas, addr);
> > > > +		else
> > > > +			page = alloc_page(GFP_HIGHUSER);
> > > > +
> > > > +		if (!page)
> > > > +			return -ENOMEM;
> > > > +
> > > > +		lock_page(page);
> > > > +		mpfn[i] = migrate_pfn(page_to_pfn(page));
> > > > +	}
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_evict_to_sram - Evict GPU SVM range to SRAM
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @range: Pointer to the GPU SVM range structure
> > > > + *
> > > > + * Similar to __drm_gpusvm_migrate_to_sram but does not require the mmap lock;
> > > > + * migration is done via the migrate_device_* functions. This is a fallback path,
> > > > + * as it is preferred to issue migrations with the mmap lock held.
> > > > + *
> > > > + * Returns:
> > > > + * 0 on success, negative error code on failure.
> > > > + */
> > > > +static int drm_gpusvm_evict_to_sram(struct drm_gpusvm *gpusvm,
> > > > +				    struct drm_gpusvm_range *range)
> > > > +{
> > > > +	unsigned long npages;
> > > > +	struct page **pages;
> > > > +	unsigned long *src, *dst;
> > > > +	dma_addr_t *dma_addr;
> > > > +	void *buf;
> > > > +	int i, err = 0;
> > > > +
> > > > +	npages = npages_in_range(range->va.start, range->va.end);
> > > > +
> > > > +	buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr) +
> > > > +		       sizeof(*pages), GFP_KERNEL);
> > > > +	if (!buf) {
> > > > +		err = -ENOMEM;
> > > > +		goto err_out;
> > > > +	}
> > > > +	src = buf;
> > > > +	dst = buf + (sizeof(*src) * npages);
> > > > +	dma_addr = buf + (2 * sizeof(*src) * npages);
> > > > +	pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) * npages;
> > > > +
> > > > +	err = gpusvm->ops->populate_vram_pfn(gpusvm, range->vram_allocation,
> > > > +					     npages, src);
> > > > +	if (err)
> > > > +		goto err_free;
> > > > +
> > > > +	err = migrate_device_vma_range(gpusvm->mm,
> > > > +				       gpusvm->device_private_page_owner, src,
> > > > +				       npages, range->va.start);
> > > > +	if (err)
> > > > +		goto err_free;
> > > > +
> > > > +	err = drm_gpusvm_migrate_populate_sram_pfn(NULL, npages, src, dst, 0);
> > > > +	if (err)
> > > > +		goto err_finalize;
> > > > +
> > > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> > > > +					   dst, npages, DMA_BIDIRECTIONAL);
> > > > +	if (err)
> > > > +		goto err_finalize;
> > > > +
> > > > +	for (i = 0; i < npages; ++i)
> > > > +		pages[i] = migrate_pfn_to_page(src[i]);
> > > > +
> > > > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr, npages);
> > > > +	if (err)
> > > > +		goto err_finalize;
> > > > +
> > > > +err_finalize:
> > > > +	if (err)
> > > > +		drm_gpusvm_migration_put_pages(npages, dst);
> > > > +	migrate_device_pages(src, dst, npages);
> > > > +	migrate_device_finalize(src, dst, npages);
> > > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
> > > > +				       DMA_BIDIRECTIONAL);
> > > > +err_free:
> > > > +	kvfree(buf);
> > > > +err_out:
> > > > +
> > > > +	return err;
> > > > +}
> > > > +
> > > > +/**
> > > > + * __drm_gpusvm_migrate_to_sram - Migrate GPU SVM range to SRAM (internal)
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @vas: Pointer to the VM area structure
> > > > + * @page: Pointer to the page for fault handling (can be NULL)
> > > > + * @start: Start address of the migration range
> > > > + * @end: End address of the migration range
> > > > + *
> > > > + * This internal function performs the migration of the specified GPU SVM range
> > > > + * to SRAM. It sets up the migration, populates + dma maps SRAM PFNs, and
> > > > + * invokes the driver-specific operations for migration to SRAM.
> > > > + *
> > > > + * Returns:
> > > > + * 0 on success, negative error code on failure.
> > > > + */
> > > > +static int __drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > > +					struct vm_area_struct *vas,
> > > > +					struct page *page,
> > > > +					u64 start, u64 end)
> > > > +{
> > > > +	struct migrate_vma migrate = {
> > > > +		.vma		= vas,
> > > > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > > > +		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
> > > > +		.fault_page	= page,
> > > > +	};
> > > > +	unsigned long npages;
> > > > +	struct page **pages;
> > > > +	dma_addr_t *dma_addr;
> > > > +	void *buf;
> > > > +	int i, err = 0;
> > > > +
> > > > +	mmap_assert_locked(gpusvm->mm);
> > > > +
> > > > +	/* Corner case where the VM area struct has been partially unmapped */
> > > > +	if (start < vas->vm_start)
> > > > +		start = vas->vm_start;
> > > > +	if (end > vas->vm_end)
> > > > +		end = vas->vm_end;
> > > > +
> > > > +	migrate.start = start;
> > > > +	migrate.end = end;
> > > > +	npages = npages_in_range(start, end);
> > > > +
> > > > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
> > > > +		       sizeof(*pages), GFP_KERNEL);
> > > > +	if (!buf) {
> > > > +		err = -ENOMEM;
> > > > +		goto err_out;
> > > > +	}
> > > > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > > > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
> > > > +
> > > > +	migrate.vma = vas;
> > > > +	migrate.src = buf;
> > > > +	migrate.dst = migrate.src + npages;
> > > > +
> > > > +	err = migrate_vma_setup(&migrate);
> > > > +	if (err)
> > > > +		goto err_free;
> > > > +
> > > > +	/* Raced with another CPU fault, nothing to do */
> > > > +	if (!migrate.cpages)
> > > > +		goto err_free;
> > > > +
> > > > +	err = drm_gpusvm_migrate_populate_sram_pfn(vas, npages,
> > > > +						   migrate.src, migrate.dst,
> > > > +						   start);
> > > > +	if (err)
> > > > +		goto err_finalize;
> > > > +
> > > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> > > > +					   migrate.dst, npages,
> > > > +					   DMA_BIDIRECTIONAL);
> > > > +	if (err)
> > > > +		goto err_finalize;
> > > > +
> > > > +	for (i = 0; i < npages; ++i)
> > > > +		pages[i] = migrate_pfn_to_page(migrate.src[i]);
> > > > +
> > > > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr, npages);
> > > > +	if (err)
> > > > +		goto err_finalize;
> > > > +
> > > > +err_finalize:
> > > > +	if (err)
> > > > +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> > > > +	migrate_vma_pages(&migrate);
> > > > +	migrate_vma_finalize(&migrate);
> > > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
> > > > +				       DMA_BIDIRECTIONAL);
> > > > +err_free:
> > > > +	kvfree(buf);
> > > > +err_out:
> > > > +	mmap_assert_locked(gpusvm->mm);
> > > > +
> > > > +	return err;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_migrate_to_sram - Migrate (evict) GPU SVM range to SRAM
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @range: Pointer to the GPU SVM range structure
> > > > + * @ctx: GPU SVM context
> > > > + *
> > > > + * This function initiates the migration of the specified GPU SVM range to
> > > > + * SRAM. It performs necessary checks and invokes the internal migration
> > > > + * function for actual migration.
> > > > + *
> > > > + * Returns:
> > > > + * 0 on success, negative error code on failure.
> > > > + */
> > > > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > > +			       struct drm_gpusvm_range *range,
> > > > +			       const struct drm_gpusvm_ctx *ctx)
> > > > +{
> > > > +	u64 start = range->va.start, end = range->va.end;
> > > > +	struct mm_struct *mm = gpusvm->mm;
> > > > +	struct vm_area_struct *vas;
> > > > +	int err;
> > > > +	bool retry = false;
> > > > +
> > > > +	if (!ctx->mmap_locked) {
> > > > +		if (!mmget_not_zero(mm)) {
> > > > +			err = -EFAULT;
> > > > +			goto err_out;
> > > > +		}
> > > > +		if (ctx->trylock_mmap) {
> > > > +			if (!mmap_read_trylock(mm))  {
> > > > +				err = drm_gpusvm_evict_to_sram(gpusvm, range);
> > > > +				goto err_mmput;
> > > > +			}
> > > > +		} else {
> > > > +			mmap_read_lock(mm);
> > > > +		}
> > > > +	}
> > > > +
> > > > +	mmap_assert_locked(mm);
> > > > +
> > > > +	/*
> > > > +	 * Loop required to find all VM area structs for the corner case when
> > > > +	 * the VRAM backing has been partially unmapped from the MM's address space.
> > > > +	 */
> > > > +again:
> > > > +	vas = find_vma(mm, start);
> > > > +	if (!vas) {
> > > > +		if (!retry)
> > > > +			err = -ENOENT;
> > > > +		goto err_mmunlock;
> > > > +	}
> > > > +
> > > > +	if (end <= vas->vm_start || start >= vas->vm_end) {
> > > > +		if (!retry)
> > > > +			err = -EINVAL;
> > > > +		goto err_mmunlock;
> > > > +	}
> > > > +
> > > > +	err = __drm_gpusvm_migrate_to_sram(gpusvm, vas, NULL, start, end);
> > > > +	if (err)
> > > > +		goto err_mmunlock;
> > > > +
> > > > +	if (vas->vm_end < end) {
> > > > +		retry = true;
> > > > +		start = vas->vm_end;
> > > > +		goto again;
> > > > +	}
> > > > +
> > > > +	if (!ctx->mmap_locked) {
> > > > +		mmap_read_unlock(mm);
> > > > +		/*
> > > > +		 * Using mmput_async as this function can be called while
> > > > +		 * holding a dma-resv lock, and a final put can grab the mmap
> > > > +		 * lock, causing a lock inversion.
> > > > +		 */
> > > > +		mmput_async(mm);
> > > > +	}
> > > > +
> > > > +	return 0;
> > > > +
> > > > +err_mmunlock:
> > > > +	if (!ctx->mmap_locked)
> > > > +		mmap_read_unlock(mm);
> > > > +err_mmput:
> > > > +	if (!ctx->mmap_locked)
> > > > +		mmput_async(mm);
> > > > +err_out:
> > > > +	return err;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_page_free - Put GPU SVM zone device data associated with a page
> > > > + * @page: Pointer to the page
> > > > + *
> > > > + * This function is a callback used to put the GPU SVM zone device data
> > > > + * associated with a page when it is being released.
> > > > + */
> > > > +static void drm_gpusvm_page_free(struct page *page)
> > > > +{
> > > > +	drm_gpusvm_zdd_put(page->zone_device_data);
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_migrate_to_ram - Migrate GPU SVM range to RAM (page fault handler)
> > > > + * @vmf: Pointer to the fault information structure
> > > > + *
> > > > + * This function is a page fault handler used to migrate a GPU SVM range to RAM.
> > > > + * It retrieves the GPU SVM range information from the faulting page and invokes
> > > > + * the internal migration function to migrate the range back to RAM.
> > > > + *
> > > > + * Returns:
> > > > + * VM_FAULT_SIGBUS on failure, 0 on success.
> > > > + */
> > > > +static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault *vmf)
> > > > +{
> > > > +	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
> > > > +	int err;
> > > > +
> > > > +	err = __drm_gpusvm_migrate_to_sram(zdd->range->gpusvm,
> > > > +					   vmf->vma, vmf->page,
> > > > +					   zdd->range->va.start,
> > > > +					   zdd->range->va.end);
> > > > +
> > > > +	return err ? VM_FAULT_SIGBUS : 0;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_pagemap_ops - Device page map operations for GPU SVM
> > > > + */
> > > > +static const struct dev_pagemap_ops drm_gpusvm_pagemap_ops = {
> > > > +	.page_free = drm_gpusvm_page_free,
> > > > +	.migrate_to_ram = drm_gpusvm_migrate_to_ram,
> > > > +};
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_pagemap_ops_get - Retrieve GPU SVM device page map operations
> > > > + *
> > > > + * Returns:
> > > > + * Pointer to the GPU SVM device page map operations structure.
> > > > + */
> > > > +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void)
> > > > +{
> > > > +	return &drm_gpusvm_pagemap_ops;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_has_mapping - Check if GPU SVM has mapping for the given address range
> > > > + * @gpusvm: Pointer to the GPU SVM structure.
> > > > + * @start: Start address
> > > > + * @end: End address
> > > > + *
> > > > + * Returns:
> > > > + * True if GPU SVM has mapping, False otherwise
> > > > + */
> > > > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start, u64 end)
> > > > +{
> > > > +	struct drm_gpusvm_notifier *notifier;
> > > > +
> > > > +	drm_gpusvm_for_each_notifier(notifier, gpusvm, start, end) {
> > > > +		struct drm_gpusvm_range *range = NULL;
> > > > +
> > > > +		drm_gpusvm_for_each_range(range, notifier, start, end)
> > > > +			return true;
> > > > +	}
> > > > +
> > > > +	return false;
> > > > +}
> > > > diff --git a/drivers/gpu/drm/xe/drm_gpusvm.h b/drivers/gpu/drm/xe/drm_gpusvm.h
> > > > new file mode 100644
> > > > index 000000000000..0ea70f8534a8
> > > > --- /dev/null
> > > > +++ b/drivers/gpu/drm/xe/drm_gpusvm.h
> > > > @@ -0,0 +1,415 @@
> > > > +/* SPDX-License-Identifier: MIT */
> > > > +/*
> > > > + * Copyright © 2024 Intel Corporation
> > > > + */
> > > > +
> > > > +#ifndef __DRM_GPUSVM_H__
> > > > +#define __DRM_GPUSVM_H__
> > > > +
> > > > +#include <linux/kref.h>
> > > > +#include <linux/mmu_notifier.h>
> > > > +#include <linux/workqueue.h>
> > > > +
> > > > +struct dev_pagemap_ops;
> > > > +struct drm_device;
> > > > +struct drm_gpusvm;
> > > > +struct drm_gpusvm_notifier;
> > > > +struct drm_gpusvm_ops;
> > > > +struct drm_gpusvm_range;
> > > > +
> > > > +/**
> > > > + * struct drm_gpusvm_ops - Operations structure for GPU SVM
> > > > + *
> > > > + * This structure defines the operations for GPU Shared Virtual Memory (SVM).
> > > > + * These operations are provided by the GPU driver to manage SVM ranges and
> > > > + * perform operations such as migration between VRAM and system RAM.
> > > > + */
> > > > +struct drm_gpusvm_ops {
> > > > +	/**
> > > > +	 * @notifier_alloc: Allocate a GPU SVM notifier (optional)
> > > > +	 *
> > > > +	 * This function shall allocate a GPU SVM notifier.
> > > > +	 *
> > > > +	 * Returns:
> > > > +	 * Pointer to the allocated GPU SVM notifier on success, NULL on failure.
> > > > +	 */
> > > > +	struct drm_gpusvm_notifier *(*notifier_alloc)(void);
> > > > +
> > > > +	/**
> > > > +	 * @notifier_free: Free a GPU SVM notifier (optional)
> > > > +	 * @notifier: Pointer to the GPU SVM notifier to be freed
> > > > +	 *
> > > > +	 * This function shall free a GPU SVM notifier.
> > > > +	 */
> > > > +	void (*notifier_free)(struct drm_gpusvm_notifier *notifier);
> > > > +
> > > > +	/**
> > > > +	 * @range_alloc: Allocate a GPU SVM range (optional)
> > > > +	 * @gpusvm: Pointer to the GPU SVM
> > > > +	 *
> > > > +	 * This function shall allocate a GPU SVM range.
> > > > +	 *
> > > > +	 * Returns:
> > > > +	 * Pointer to the allocated GPU SVM range on success, NULL on failure.
> > > > +	 */
> > > > +	struct drm_gpusvm_range *(*range_alloc)(struct drm_gpusvm *gpusvm);
> > > > +
> > > > +	/**
> > > > +	 * @range_free: Free a GPU SVM range (optional)
> > > > +	 * @range: Pointer to the GPU SVM range to be freed
> > > > +	 *
> > > > +	 * This function shall free a GPU SVM range.
> > > > +	 */
> > > > +	void (*range_free)(struct drm_gpusvm_range *range);
> > > > +
> > > > +	/**
> > > > +	 * @vram_release: Release VRAM allocation (optional)
> > > > +	 * @vram_allocation: Driver-private pointer to the VRAM allocation
> > > > +	 *
> > > > +	 * This function shall release VRAM allocation and expects to drop a
> > > > +	 * reference to VRAM allocation.
> > > > +	 */
> > > > +	void (*vram_release)(void *vram_allocation);
> > > > +
> > > > +	/**
> > > > +	 * @populate_vram_pfn: Populate VRAM PFN (required for migration)
> > > > +	 * @gpusvm: Pointer to the GPU SVM
> > > > +	 * @vram_allocation: Driver-private pointer to the VRAM allocation
> > > > +	 * @npages: Number of pages to populate
> > > > +	 * @pfn: Array of page frame numbers to populate
> > > > +	 *
> > > > +	 * This function shall populate VRAM page frame numbers (PFN).
> > > > +	 *
> > > > +	 * Returns:
> > > > +	 * 0 on success, a negative error code on failure.
> > > > +	 */
> > > > +	int (*populate_vram_pfn)(struct drm_gpusvm *gpusvm,
> > > > +				 void *vram_allocation,
> > > > +				 unsigned long npages,
> > > > +				 unsigned long *pfn);
> > > > +
> > > > +	/**
> > > > +	 * @copy_to_vram: Copy to VRAM (required for migration)
> > > > +	 * @gpusvm: Pointer to the GPU SVM
> > > > +	 * @pages: Pointer to array of VRAM pages (destination)
> > > > +	 * @dma_addr: Pointer to array of DMA addresses (source)
> > > > +	 * @npages: Number of pages to copy
> > > > +	 *
> > > > +	 * This function shall copy pages to VRAM.
> > > > +	 *
> > > > +	 * Returns:
> > > > +	 * 0 on success, a negative error code on failure.
> > > > +	 */
> > > > +	int (*copy_to_vram)(struct drm_gpusvm *gpusvm,
> > > > +			    struct page **pages,
> > > > +			    dma_addr_t *dma_addr,
> > > > +			    unsigned long npages);
> > > > +
> > > > +	/**
> > > > +	 * @copy_to_sram: Copy to system RAM (required for migration)
> > > > +	 * @gpusvm: Pointer to the GPU SVM
> > > > +	 * @pages: Pointer to array of VRAM pages (source)
> > > > +	 * @dma_addr: Pointer to array of DMA addresses (destination)
> > > > +	 * @npages: Number of pages to copy
> > > > +	 *
> > > > +	 * This function shall copy pages to system RAM.
> > > > +	 *
> > > > +	 * Returns:
> > > > +	 * 0 on success, a negative error code on failure.
> > > > +	 */
> > > > +	int (*copy_to_sram)(struct drm_gpusvm *gpusvm,
> > > > +			    struct page **pages,
> > > > +			    dma_addr_t *dma_addr,
> > > > +			    unsigned long npages);
> > > > +
> > > > +	/**
> > > > +	 * @invalidate: Invalidate GPU SVM notifier (required)
> > > > +	 * @gpusvm: Pointer to the GPU SVM
> > > > +	 * @notifier: Pointer to the GPU SVM notifier
> > > > +	 * @mmu_range: Pointer to the mmu_notifier_range structure
> > > > +	 *
> > > > +	 * This function shall invalidate the GPU page tables. It can safely
> > > > +	 * walk the notifier range RB tree/list in this function. Called while
> > > > +	 * holding the notifier lock.
> > > > +	 */
> > > > +	void (*invalidate)(struct drm_gpusvm *gpusvm,
> > > > +			   struct drm_gpusvm_notifier *notifier,
> > > > +			   const struct mmu_notifier_range *mmu_range);
> > > > +};
> > > > +
> > > > +/**
> > > > + * struct drm_gpusvm_notifier - Structure representing a GPU SVM notifier
> > > > + *
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @notifier: MMU interval notifier
> > > > + * @interval: Interval for the notifier
> > > > + * @rb: Red-black tree node for the parent GPU SVM structure notifier tree
> > > > + * @root: Cached root node of the RB tree containing ranges
> > > > + * @range_list: List head of ranges in the same order they appear in the
> > > > + *              interval tree. This is useful to keep iterating ranges while
> > > > + *              doing modifications to RB tree.
> > > > + * @flags.removed: Flag indicating whether the MMU interval notifier has been
> > > > + *                 removed
> > > > + *
> > > > + * This structure represents a GPU SVM notifier.
> > > > + */
> > > > +struct drm_gpusvm_notifier {
> > > > +	struct drm_gpusvm *gpusvm;
> > > > +	struct mmu_interval_notifier notifier;
> > > > +	struct {
> > > > +		u64 start;
> > > > +		u64 end;
> > > > +	} interval;
> > > > +	struct {
> > > > +		struct rb_node node;
> > > > +		struct list_head entry;
> > > > +		u64 __subtree_last;
> > > > +	} rb;
> > > > +	struct rb_root_cached root;
> > > > +	struct list_head range_list;
> > > > +	struct {
> > > > +		u32 removed : 1;
> > > > +	} flags;
> > > > +};
> > > > +
> > > > +/**
> > > > + * struct drm_gpusvm_range - Structure representing a GPU SVM range
> > > > + *
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @notifier: Pointer to the GPU SVM notifier
> > > > + * @refcount: Reference count for the range
> > > > + * @rb: Red-black tree node for the parent GPU SVM notifier structure range tree
> > > > + * @va: Virtual address range
> > > > + * @notifier_seq: Notifier sequence number of the range's pages
> > > > + * @pages: Pointer to the array of pages (if backing store is in VRAM)
> > > > + * @dma_addr: DMA address array (if backing store is SRAM and DMA mapped)
> > > > + * @vram_allocation: Driver-private pointer to the VRAM allocation
> > > > + * @order: Order of dma mapping. i.e. PAGE_SIZE << order is mapping size
> > > > + * @flags.migrate_vram: Flag indicating whether the range can be migrated to VRAM
> > > > + * @flags.unmapped: Flag indicating if the range has been unmapped
> > > > + * @flags.partial_unmap: Flag indicating if the range has been partially unmapped
> > > > + * @flags.has_vram_pages: Flag indicating if the range has vram pages
> > > > + * @flags.has_dma_mapping: Flag indicating if the range has a DMA mapping
> > > > + * @flags.kfree_mapping: Flag indicating @dma_addr is a compact allocation based
> > > > + *                       on @order which releases via kfree
> > > > + *
> > > > + * This structure represents a GPU SVM range used for tracking memory ranges
> > > > + * mapped in a DRM device.
> > > > + */
> > > > +struct drm_gpusvm_range {
> > > > +	struct drm_gpusvm *gpusvm;
> > > > +	struct drm_gpusvm_notifier *notifier;
> > > > +	struct kref refcount;
> > > > +	struct {
> > > > +		struct rb_node node;
> > > > +		struct list_head entry;
> > > > +		u64 __subtree_last;
> > > > +	} rb;
> > > > +	struct {
> > > > +		u64 start;
> > > > +		u64 end;
> > > > +	} va;
> > > > +	unsigned long notifier_seq;
> > > > +	union {
> > > > +		struct page **pages;
> > > > +		dma_addr_t *dma_addr;
> > > > +	};
> > > > +	void *vram_allocation;
> > > > +	u16 order;
> > > > +	struct {
> > > > +		/* All flags below must be set upon creation */
> > > > +		u16 migrate_vram : 1;
> > > > +		/* All flags below must be set / cleared under notifier lock */
> > > > +		u16 unmapped : 1;
> > > > +		u16 partial_unmap : 1;
> > > > +		u16 has_vram_pages : 1;
> > > > +		u16 has_dma_mapping : 1;
> > > > +		u16 kfree_mapping : 1;
> > > > +	} flags;
> > > > +};
> > > > +
> > > > +/**
> > > > + * struct drm_gpusvm - GPU SVM structure
> > > > + *
> > > > + * @name: Name of the GPU SVM
> > > > + * @drm: Pointer to the DRM device structure
> > > > + * @mm: Pointer to the mm_struct for the address space
> > > > + * @device_private_page_owner: Device private pages owner
> > > > + * @mm_start: Start address of GPU SVM
> > > > + * @mm_range: Range of the GPU SVM
> > > > + * @notifier_size: Size of individual notifiers
> > > > + * @ops: Pointer to the operations structure for GPU SVM
> > > > + * @chunk_sizes: Pointer to the array of chunk sizes used in range allocation.
> > > > + *               Entries should be powers of 2 in descending order.
> > > > + * @num_chunks: Number of chunks
> > > > + * @notifier_lock: Read-write semaphore for protecting notifier operations
> > > > + * @zdd_wq: Workqueue for deferred work on zdd destruction
> > > > + * @root: Cached root node of the Red-Black tree containing GPU SVM notifiers
> > > > + * @notifier_list: list head containing notifiers in the same order they
> > > > + *                 appear in interval tree. This is useful to keep iterating
> > > > + *                 notifiers while doing modifications to RB tree.
> > > > + *
> > > > + * This structure represents a GPU SVM (Shared Virtual Memory) used for tracking
> > > > + * memory ranges mapped in a DRM (Direct Rendering Manager) device.
> > > > + *
> > > > + * No reference counting is provided, as this is expected to be embedded in the
> > > > + * driver VM structure along with the struct drm_gpuvm, which handles reference
> > > > + * counting.
> > > > + */
> > > > +struct drm_gpusvm {
> > > > +	const char *name;
> > > > +	struct drm_device *drm;
> > > > +	struct mm_struct *mm;
> > > > +	void *device_private_page_owner;
> > > > +	u64 mm_start;
> > > > +	u64 mm_range;
> > > > +	u64 notifier_size;
> > > > +	const struct drm_gpusvm_ops *ops;
> > > > +	const u64 *chunk_sizes;
> > > > +	int num_chunks;
> > > > +	struct rw_semaphore notifier_lock;
> > > > +	struct workqueue_struct *zdd_wq;
> > > > +	struct rb_root_cached root;
> > > > +	struct list_head notifier_list;
> > > > +};
> > > > +
> > > > +/**
> > > > + * struct drm_gpusvm_ctx - DRM GPU SVM context
> > > > + *
> > > > + * @mmap_locked: mmap lock is locked
> > > > + * @trylock_mmap: trylock mmap lock, used to avoid locking inversions
> > > > + *                (e.g. dma-resv -> mmap lock)
> > > > + * @in_notifier: entering from a MMU notifier
> > > > + * @read_only: operating on read-only memory
> > > > + * @vram_possible: possible to use VRAM
> > > > + * @prefault: prefault pages
> > > > + *
> > > > + * Context that DRM GPUSVM is operating in (i.e. user arguments).
> > > > + */
> > > > +struct drm_gpusvm_ctx {
> > > > +	u32 mmap_locked :1;
> > > > +	u32 trylock_mmap :1;
> > > > +	u32 in_notifier :1;
> > > > +	u32 read_only :1;
> > > > +	u32 vram_possible :1;
> > > > +	u32 prefault :1;
> > > > +};
> > > > +
> > > > +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> > > > +		    const char *name, struct drm_device *drm,
> > > > +		    struct mm_struct *mm, void *device_private_page_owner,
> > > > +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> > > > +		    const struct drm_gpusvm_ops *ops,
> > > > +		    const u64 *chunk_sizes, int num_chunks);
> > > > +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm);
> > > > +void drm_gpusvm_free(struct drm_gpusvm *gpusvm);
> > > > +
> > > > +struct drm_gpusvm_range *
> > > > +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64 fault_addr,
> > > > +				u64 gpuva_start, u64 gpuva_end,
> > > > +				const struct drm_gpusvm_ctx *ctx);
> > > > +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> > > > +			     struct drm_gpusvm_range *range);
> > > > +
> > > > +struct drm_gpusvm_range *
> > > > +drm_gpusvm_range_get(struct drm_gpusvm_range *range);
> > > > +void drm_gpusvm_range_put(struct drm_gpusvm_range *range);
> > > > +
> > > > +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> > > > +				  struct drm_gpusvm_range *range);
> > > > +
> > > > +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> > > > +			       struct drm_gpusvm_range *range,
> > > > +			       const struct drm_gpusvm_ctx *ctx);
> > > > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > > > +				  struct drm_gpusvm_range *range,
> > > > +				  const struct drm_gpusvm_ctx *ctx);
> > > > +
> > > > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > > > +			       struct drm_gpusvm_range *range,
> > > > +			       void *vram_allocation,
> > > > +			       const struct drm_gpusvm_ctx *ctx);
> > > > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > > +			       struct drm_gpusvm_range *range,
> > > > +			       const struct drm_gpusvm_ctx *ctx);
> > > > +
> > > > +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void);
> > > > +
> > > > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start, u64 end);
> > > > +
> > > > +struct drm_gpusvm_range *
> > > > +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64 start, u64 end);
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_notifier_lock - Lock GPU SVM notifier
> > > > + * @gpusvm__: Pointer to the GPU SVM structure.
> > > > + *
> > > > + * Abstract client usage GPU SVM notifier lock, take lock
> > > > + */
> > > > +#define drm_gpusvm_notifier_lock(gpusvm__)	\
> > > > +	down_read(&(gpusvm__)->notifier_lock)
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_notifier_unlock - Unlock GPU SVM notifier
> > > > + * @gpusvm__: Pointer to the GPU SVM structure.
> > > > + *
> > > > + * Abstract client usage GPU SVM notifier lock, drop lock
> > > > + */
> > > > +#define drm_gpusvm_notifier_unlock(gpusvm__)	\
> > > > +	up_read(&(gpusvm__)->notifier_lock)
> > > > +
> > > > +/**
> > > > + * __drm_gpusvm_range_next - Get the next GPU SVM range in the list
> > > > + * @range: a pointer to the current GPU SVM range
> > > > + *
> > > > + * Return: A pointer to the next drm_gpusvm_range if available, or NULL if the
> > > > + *         current range is the last one or if the input range is NULL.
> > > > + */
> > > > +static inline struct drm_gpusvm_range *
> > > > +__drm_gpusvm_range_next(struct drm_gpusvm_range *range)
> > > > +{
> > > > +	if (range && !list_is_last(&range->rb.entry,
> > > > +				   &range->notifier->range_list))
> > > > +		return list_next_entry(range, rb.entry);
> > > > +
> > > > +	return NULL;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_for_each_range - Iterate over GPU SVM ranges in a notifier
> > > > + * @range__: Iterator variable for the ranges. If set, it indicates the start of
> > > > + *	     the iterator. If NULL, call drm_gpusvm_range_find() to get the range.
> > > > + * @notifier__: Pointer to the GPU SVM notifier
> > > > + * @start__: Start address of the range
> > > > + * @end__: End address of the range
> > > > + *
> > > > + * This macro is used to iterate over GPU SVM ranges in a notifier. It is safe
> > > > + * to use while holding the driver SVM lock or the notifier lock.
> > > > + */
> > > > +#define drm_gpusvm_for_each_range(range__, notifier__, start__, end__)	\
> > > > +	for ((range__) = (range__) ?:					\
> > > > +	     drm_gpusvm_range_find((notifier__), (start__), (end__));	\
> > > > +	     (range__) && (range__->va.start < (end__));		\
> > > > +	     (range__) = __drm_gpusvm_range_next(range__))
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_range_set_unmapped - Mark a GPU SVM range as unmapped
> > > > + * @range: Pointer to the GPU SVM range structure.
> > > > + * @mmu_range: Pointer to the MMU notifier range structure.
> > > > + *
> > > > + * This function marks a GPU SVM range as unmapped and sets the partial_unmap flag
> > > > + * if the range partially falls within the provided MMU notifier range.
> > > > + */
> > > > +static inline void
> > > > +drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range *range,
> > > > +			      const struct mmu_notifier_range *mmu_range)
> > > > +{
> > > > +	lockdep_assert_held_write(&range->gpusvm->notifier_lock);
> > > > +
> > > > +	range->flags.unmapped = true;
> > > > +	if (range->va.start < mmu_range->start ||
> > > > +	    range->va.end > mmu_range->end)
> > > > +		range->flags.partial_unmap = true;
> > > > +}
> > > > +
> > > > +#endif /* __DRM_GPUSVM_H__ */
> > > > -- 
> > > > 2.34.1
> > > > 
> >
Daniel Vetter Aug. 28, 2024, 6:50 p.m. UTC | #6
On Tue, Aug 27, 2024 at 07:48:38PM -0700, Matthew Brost wrote:
> +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> +			       struct drm_gpusvm_range *range,
> +			       const struct drm_gpusvm_ctx *ctx)
> +{
> +	u64 start = range->va.start, end = range->va.end;
> +	struct mm_struct *mm = gpusvm->mm;
> +	struct vm_area_struct *vas;
> +	int err;
> +	bool retry = false;
> +
> +	if (!ctx->mmap_locked) {
> +		if (!mmget_not_zero(mm)) {
> +			err = -EFAULT;
> +			goto err_out;
> +		}
> +		if (ctx->trylock_mmap) {
> +			if (!mmap_read_trylock(mm))  {
> +				err = drm_gpusvm_evict_to_sram(gpusvm, range);
> +				goto err_mmput;
> +			}
> +		} else {
> +			mmap_read_lock(mm);
> +		}
> +	}
> +
> +	mmap_assert_locked(mm);
> +
> +	/*
> +	 * Loop required to find all VMA area structs for the corner case when
> +	 * VRAM backing has been partially unmapped from MM's address space.
> +	 */
> +again:
> +	vas = find_vma(mm, start);

So a hilarious case that amdkfd gets a bit better but still not entirely
right is that the original vma might be entirely gone, even when you can
still get at the mm of that process. This happens with cow (or shared too
I think) mappings in forked child processes, or also if you play fun
mremap games.

I think that outside of the ->migrate_to_ram callback migration/eviction
to sram cannot assume there's any reasonable vma around and has to
unconditionally go with the drm_gpusvm_evict_to_sram path.

Also in the migrate_to_ram case the vma is essentially nothing more than
informational about which ranges we might need if we prefault a bit (in
case the child changed the vma compared to the original one). So it's good
as a parameter for migrate_vma_setup, but absolutely nothing else.
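To make that concrete, a rough sketch (not tested; it reuses the struct
names from this patch plus a made-up driver helper) of treating the vma
purely as a clamp on the migrate window could look like:

/*
 * Sketch only: assumes the drm_gpusvm_range layout from this patch and that
 * the caller already resolved the range from the faulting device-private
 * page's zone_device_data. The vma is used solely to clamp the window.
 */
#include <linux/migrate.h>
#include <linux/minmax.h>
#include <linux/mm.h>

static void driver_fill_migrate_window(struct vm_fault *vmf,
					struct drm_gpusvm_range *range,
					struct migrate_vma *migrate)
{
	migrate->vma = vmf->vma;	/* informational / window only */
	migrate->start = max_t(unsigned long, vmf->vma->vm_start,
			       range->va.start);
	migrate->end = min_t(unsigned long, vmf->vma->vm_end,
			     range->va.end);
	migrate->pgmap_owner = range->gpusvm->device_private_page_owner;
	migrate->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
	migrate->fault_page = vmf->page;
}

Everything past that (src/dst arrays, the copy, migrate_vma_finalize) would
then be driven off the range, not the vma.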

amdkfd almost gets this right by being entirely based on their svm_range
structures, except they still have the lingering check that the original mm
is still alive. Of course you cannot ever use that memory on the gpu
anymore, but the child process could get very pissed if their memory is
suddenly gone. Also the eviction code has the same issue as yours and
limits itself to vmas that still exist in the original mm, leaving anything
that's orphaned in children or remaps stuck in vram. At least that's my
understanding, I might very well be wrong.

So probably want a bunch of these testcases too to make sure that all
works, and we're not stuck with memory allocations in vram that we can't
move out.
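
As a rough idea of the kind of testcase meant here (plain userspace sketch;
the step that actually pushes the buffer to VRAM is driver-specific and only
marked as a comment):

#include <stdlib.h>
#include <string.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	size_t sz = 2 * 1024 * 1024;
	char *buf = malloc(sz);

	memset(buf, 0xaa, sz);
	/* driver-specific: run GPU work touching buf so the range migrates
	 * to VRAM (e.g. a system-allocator exec), omitted here */

	if (fork() == 0) {
		/* child: CPU writes must be able to migrate the data back to
		 * SRAM even once the parent's original vma/mm is gone */
		for (size_t i = 0; i < sz; i += 4096)
			buf[i] = 0x55;
		_exit(0);
	}

	free(buf);	/* parent drops its mapping first */
	wait(NULL);
	return 0;
}

Variants with MAP_SHARED mappings and mremap would cover the other cases
mentioned above.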
-Sima
Thomas Hellström Aug. 29, 2024, 9:16 a.m. UTC | #7
Hi, Matt. 

Some initial design comments / questions:

On Tue, 2024-08-27 at 19:48 -0700, Matthew Brost wrote:
> This patch introduces support for GPU Shared Virtual Memory (SVM) in
> the
> Direct Rendering Manager (DRM) subsystem. SVM allows for seamless
> sharing of memory between the CPU and GPU, enhancing performance and
> flexibility in GPU computing tasks.
> 
> The patch adds the necessary infrastructure for SVM, including data
> structures and functions for managing SVM ranges and notifiers. It
> also
> provides mechanisms for allocating, deallocating, and migrating
> memory
> regions between system RAM and GPU VRAM.
> 
> This mid-layer is largely inspired by GPUVM.
> 
> Cc: Dave Airlie <airlied@redhat.com>
> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> Cc: Christian König <christian.koenig@amd.com>
> Cc: <dri-devel@lists.freedesktop.org>
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>  drivers/gpu/drm/xe/Makefile     |    3 +-
>  drivers/gpu/drm/xe/drm_gpusvm.c | 2174
> +++++++++++++++++++++++++++++++
>  drivers/gpu/drm/xe/drm_gpusvm.h |  415 ++++++
>  3 files changed, 2591 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.c
>  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.h
> 
> diff --git a/drivers/gpu/drm/xe/Makefile
> b/drivers/gpu/drm/xe/Makefile
> index b9670ae09a9e..b8fc2ee58f1a 100644
> --- a/drivers/gpu/drm/xe/Makefile
> +++ b/drivers/gpu/drm/xe/Makefile
> @@ -25,7 +25,8 @@ $(obj)/generated/%_wa_oob.c
> $(obj)/generated/%_wa_oob.h: $(obj)/xe_gen_wa_oob \
>  
>  # core driver code
>  
> -xe-y += xe_bb.o \
> +xe-y += drm_gpusvm.o \
> +	xe_bb.o \
>  	xe_bo.o \
>  	xe_bo_evict.o \
>  	xe_devcoredump.o \
> diff --git a/drivers/gpu/drm/xe/drm_gpusvm.c
> b/drivers/gpu/drm/xe/drm_gpusvm.c
> new file mode 100644
> index 000000000000..fc1e44e6ae72
> --- /dev/null
> +++ b/drivers/gpu/drm/xe/drm_gpusvm.c
> @@ -0,0 +1,2174 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2024 Intel Corporation
> + *
> + * Authors:
> + *     Matthew Brost <matthew.brost@intel.com>
> + */
> +
> +#include <linux/dma-mapping.h>
> +#include <linux/interval_tree_generic.h>
> +#include <linux/hmm.h>
> +#include <linux/memremap.h>
> +#include <linux/migrate.h>
> +#include <linux/mm_types.h>
> +#include <linux/pagemap.h>
> +#include <linux/slab.h>
> +
> +#include <drm/drm_device.h>
> +#include "drm_gpusvm.h"
> +
> +/**
> + * DOC: Overview
> + *
> + * GPU Shared Virtual Memory (GPU SVM) layer for the Direct
> Rendering Manager (DRM)
> + *
> + * The GPU SVM layer is a component of the DRM framework designed to
> manage shared
> + * virtual memory between the CPU and GPU. It enables efficient data
> exchange and
> + * processing for GPU-accelerated applications by allowing memory
> sharing and
> + * synchronization between the CPU's and GPU's virtual address
> spaces.
> + *
> + * Key GPU SVM Components:
> + * - Notifiers: Used for tracking memory intervals and
> notifying the
> + *		GPU of changes, notifiers are sized based on a GPU
> SVM
> + *		initialization parameter, with a recommendation of
> 512M or
> + *		larger. They maintain a Red-Black tree and a list of
> ranges that
> + *		fall within the notifier interval. Notifiers are
> tracked within
> + *		a GPU SVM Red-Black tree and list and are
> dynamically inserted
> + *		or removed as ranges within the interval are created
> or
> + *		destroyed.

What is the benefit of this extra layer compared to direct insertion of
ranges using mmu_interval_notifier_insert?

IIRC the argument made previously about having wide notifiers was that
the rb tree lookups inside the core were costly and if there were only
a few, then the rb tree lookups within a notifier range could be
replaced with the page-table radix-tree-like lookup, so each lookup
complexity would be O(log(n_notifiers) + page_table_depth).

But now we have first an rb-tree lookup in the core and then an rb-tree
lookup within each notifier, yielding O(log(n_ranges)).

I can see a small benefit in that inserting directly into the core rb-tree
will block on pending / ongoing invalidations, but at the cost of an extra
multiplexing layer.
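
For comparison, the direct-insertion variant would look roughly like the
following sketch (struct driver_range and the GPU PTE zap are made up):

#include <linux/mmu_notifier.h>
#include <linux/types.h>

struct driver_range {
	struct mmu_interval_notifier notifier;
	u64 start, end;
};

static bool driver_range_invalidate(struct mmu_interval_notifier *mni,
				    const struct mmu_notifier_range *mmu_range,
				    unsigned long cur_seq)
{
	struct driver_range *r = container_of(mni, struct driver_range, notifier);

	if (!mmu_notifier_range_blockable(mmu_range))
		return false;

	mmu_interval_set_seq(mni, cur_seq);
	/* zap GPU PTEs for r->start..r->end here */
	return true;
}

static const struct mmu_interval_notifier_ops driver_range_notifier_ops = {
	.invalidate = driver_range_invalidate,
};

/*
 * One interval notifier per range: the only lookup on invalidation is the
 * core mm interval tree, O(log(n_ranges)), with no second per-notifier tree,
 * but insertion has to synchronize against any invalidation in progress.
 */
static int driver_range_register(struct driver_range *r, struct mm_struct *mm)
{
	return mmu_interval_notifier_insert(&r->notifier, mm, r->start,
					    r->end - r->start,
					    &driver_range_notifier_ops);
}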

> + * - Ranges: Represent memory ranges mapped in a DRM device and
> managed
> + *	     by GPU SVM. They are sized based on an array of chunk
> sizes, which
> + *	     is a GPU SVM initialization parameter, and the CPU
> address space.
> + *	     Upon GPU fault, the largest aligned chunk that fits
> within the
> + *	     faulting CPU address space is chosen for the range
> size. Ranges are
> + *	     expected to be dynamically allocated on GPU fault and
> removed on an
> + *	     MMU notifier UNMAP event. As mentioned above, ranges
> are tracked in
> + *	     a notifier's Red-Black tree.

How do ranges and chunks map to
 
a) Prefaulting granularity
b) Migration granularity?

> + * - Operations: Define the interface for driver-specific SVM
> operations such as
> + *		 allocation, page collection, migration,
> invalidations, and VRAM
> + *		 release.
> + *
> + * This layer provides interfaces for allocating, mapping,
> migrating, and
> + * releasing memory ranges between the CPU and GPU. It handles all
> core memory
> + * management interactions (DMA mapping, HMM, and migration) and
> provides
> + * driver-specific virtual functions (vfuncs). This infrastructure
> is sufficient
> + * to build the expected driver components for an SVM implementation
> as detailed
> + * below.
> + *
> + * Expected Driver Components:
> + * - GPU page fault handler: Used to create ranges and notifiers
> based on the
> + *			     fault address, optionally migrate the
> range to
> + *			     VRAM, and create GPU bindings.
> + * - Garbage collector: Used to destroy GPU bindings for ranges.
> Ranges are
> + *			expected to be added to the garbage
> collector upon
> + *			MMU_NOTIFY_UNMAP event.
> + */
> +
> +/**
> + * DOC: Locking
> + *
> + * GPU SVM handles locking for core MM interactions, i.e., it
> locks/unlocks the
> + * mmap lock as needed. Alternatively, if the driver prefers to
> handle the mmap
> + * lock itself, a 'locked' argument is provided to the functions
> that require
> + * the mmap lock. This option may be useful for drivers that need to
> call into
> + * GPU SVM while also holding a dma-resv lock, thus preventing
> locking
> + * inversions between the mmap and dma-resv locks.
> + *
> + * GPU SVM introduces a global notifier lock, which safeguards the
> notifier's
> + * range RB tree and list, as well as the range's DMA mappings and
> sequence
> + * number. GPU SVM manages all necessary locking and unlocking
> operations,
> + * except for the recheck of the range's sequence number
> + * (mmu_interval_read_retry) when the driver is committing GPU
> bindings. This
> + * lock corresponds to the 'driver->update' lock mentioned in the
> HMM
> + * documentation (TODO: Link). Future revisions may transition from
> a GPU SVM
> + * global lock to a per-notifier lock if finer-grained locking is
> deemed
> + * necessary.
> + *
> + * In addition to the locking mentioned above, the driver should
> implement a
> + * lock to safeguard core GPU SVM function calls that modify state,
> such as
> + * drm_gpusvm_range_find_or_insert and drm_gpusvm_range_remove.
> Alternatively,
> + * these core functions can be called within a single kernel thread,
> for
> + * instance, using an ordered work queue. This lock is denoted as
> + * 'driver_svm_lock' in code examples.
> + */
> +
> +/**
> + * DOC: Migration
> + *
> + * The migration support is quite simple, allowing migration between
> SRAM and
> + * VRAM at the range granularity. For example, GPU SVM currently
> does not
> + * support mixing SRAM and VRAM pages within a range. This means
> that upon GPU
> + * fault, the entire range can be migrated to VRAM, and upon CPU
> fault, the
> + * entire range is migrated to SRAM.
> + *
> + * The reasoning for only supporting range granularity is as
> follows: it
> + * simplifies the implementation, and range sizes are driver-defined
> and should
> + * be relatively small.
> + */
> +
> +/**
> + * DOC: Partial Unmapping of Ranges
> + *
> + * Partial unmapping of ranges (e.g., 1M out of 2M is unmapped by
> CPU resulting
> + * in MMU_NOTIFY_UNMAP event) presents several challenges, with the
> main one
> + * being that a subset of the range still has CPU and GPU mappings.
> If the
> + * backing store for the range is in VRAM, a subset of the backing
> store has
> + * references. One option would be to split the range and VRAM
> backing store,
> + * but the implementation for this would be quite complicated. Given
> that
> + * partial unmappings are rare and driver-defined range sizes are
> relatively
> + * small, GPU SVM does not support splitting of ranges.
> + *
> + * With no support for range splitting, upon partial unmapping of a
> range, the
> + * driver is expected to invalidate and destroy the entire range. If
> the range
> + * has VRAM as its backing, the driver is also expected to migrate
> any remaining
> + * pages back to SRAM.

So what happens if we get a one-page invalidation, say protection
change event, or NUMA accounting event, in the middle of a range? Can
we unmap just that single gpu pte covering that range, that is, how do
the ranges map to invalidation granularity? Does this differ between
igfx an dgfx?

Thanks,
Thomas




> + */
> +
> +/**
> + * DOC: Examples
> + *
> + * This section provides two examples of how to build the expected
> driver
> + * components: the GPU page fault handler and the garbage collector.
> A third
> + * example demonstrates a sample invalidation driver vfunc.
> + *
> + * The generic code provided does not include logic for complex
> migration
> + * policies, optimized invalidations, or other potentially required
> driver
> + * locking (e.g., DMA-resv locks).
> + *
> + * 1) GPU page fault handler
> + *
> + *	int driver_bind_range(struct drm_gpusvm *gpusvm, struct
> drm_gpusvm_range *range)
> + *	{
> + *		int err = 0;
> + *
> + *		driver_alloc_and_setup_memory_for_bind(gpusvm,
> range);
> + *
> + *		drm_gpusvm_notifier_lock(gpusvm);
> + *		if (drm_gpusvm_range_pages_valid(range))
> + *			driver_commit_bind(gpusvm, range);
> + *		else
> + *			err = -EAGAIN;
> + *		drm_gpusvm_notifier_unlock(gpusvm);
> + *
> + *		return err;
> + *	}
> + *
> + *	int driver_gpu_fault(struct drm_gpusvm *gpusvm, u64
> fault_addr,
> + *			     u64 gpuva_start, u64 gpuva_end)
> + *	{
> + *		struct drm_gpusvm_ctx ctx = {};
> + *		int err;
> + *
> + *		driver_svm_lock();
> + *	retry:
> + *		// Always process UNMAPs first so view of GPU SVM
> ranges is current
> + *		driver_garbage_collector(gpusvm);
> + *
> + *		range = drm_gpusvm_range_find_or_insert(gpusvm,
> fault_addr,
> + *							gpuva_start,
> gpuva_end,
> + *						        &ctx);
> + *		if (IS_ERR(range)) {
> + *			err = PTR_ERR(range);
> + *			goto unlock;
> + *		}
> + *
> + *		if (driver_migration_policy(range)) {
> + *			bo = driver_alloc_bo();
> + *			err = drm_gpusvm_migrate_to_vram(gpusvm,
> range, bo, &ctx);
> + *			if (err)	// CPU mappings may have
> changed
> + *				goto retry;
> + *		}
> + *
> + *		err = drm_gpusvm_range_get_pages(gpusvm, range,
> &ctx);
> + *		if (err == -EFAULT || err == -EPERM)	// CPU
> mappings changed
> + *			goto retry;
> + *		else if (err)
> + *			goto unlock;
> + *
> + *		err = driver_bind_range(gpusvm, range);
> + *		if (err == -EAGAIN)	// CPU mappings changed
> + *			goto retry
> + *
> + *	unlock:
> + *		driver_svm_unlock();
> + *		return err;
> + *	}
> + *
> + * 2) Garbage Collector.
> + *
> + *	void __driver_garbage_collector(struct drm_gpusvm *gpusvm,
> + *					struct drm_gpusvm_range
> *range)
> + *	{
> + *		struct drm_gpusvm_ctx ctx = {};
> + *
> + *		assert_driver_svm_locked(gpusvm);
> + *
> + *		// Partial unmap, migrate any remaining VRAM pages
> back to SRAM
> + *		if (range->flags.partial_unmap)
> + *			drm_gpusvm_migrate_to_sram(gpusvm, range,
> &ctx);
> + *
> + *		driver_unbind_range(range);
> + *		drm_gpusvm_range_remove(gpusvm, range);
> + *	}
> + *
> + *	void driver_garbage_collector(struct drm_gpusvm *gpusvm)
> + *	{
> + *		assert_driver_svm_locked(gpusvm);
> + *
> + *		for_each_range_in_garbage_collector(gpusvm, range)
> + *			__driver_garbage_collector(gpusvm, range);
> + *	}
> + *
> + * 3) Invalidation driver vfunc.
> + *
> + *	void driver_invalidation(struct drm_gpusvm *gpusvm,
> + *				 struct drm_gpusvm_notifier
> *notifier,
> + *				 const struct mmu_notifier_range
> *mmu_range)
> + *	{
> + *		struct drm_gpusvm_ctx ctx = { .in_notifier = true,
> };
> + *		struct drm_gpusvm_range *range = NULL;
> + *
> + *		driver_invalidate_device_tlb(gpusvm, mmu_range-
> >start, mmu_range->end);
> + *
> + *		drm_gpusvm_for_each_range(range, notifier,
> mmu_range->start,
> + *					  mmu_range->end) {
> + *			drm_gpusvm_range_unmap_pages(gpusvm, range,
> &ctx);
> + *
> + *			if (mmu_range->event != MMU_NOTIFY_UNMAP)
> + *				continue;
> + *
> + *			drm_gpusvm_range_set_unmapped(range,
> mmu_range);
> + *			driver_garbage_collector_add(gpusvm, range);
> + *		}
> + *	}
> + */
> +
> +#define DRM_GPUSVM_RANGE_START(_range)	((_range)->va.start)
> +#define DRM_GPUSVM_RANGE_END(_range)	((_range)->va.end - 1)
> +INTERVAL_TREE_DEFINE(struct drm_gpusvm_range, rb.node, u64,
> rb.__subtree_last,
> +		     DRM_GPUSVM_RANGE_START, DRM_GPUSVM_RANGE_END,
> +		     static __maybe_unused, range);
> +
> +#define DRM_GPUSVM_NOTIFIER_START(_notifier)	((_notifier)-
> >interval.start)
> +#define DRM_GPUSVM_NOTIFIER_END(_notifier)	((_notifier)-
> >interval.end - 1)
> +INTERVAL_TREE_DEFINE(struct drm_gpusvm_notifier, rb.node, u64,
> +		     rb.__subtree_last, DRM_GPUSVM_NOTIFIER_START,
> +		     DRM_GPUSVM_NOTIFIER_END, static __maybe_unused,
> notifier);
> +
> +/**
> + * npages_in_range() - Calculate the number of pages in a given
> range
> + * @start__: The start address of the range
> + * @end__: The end address of the range
> + *
> + * This macro calculates the number of pages in a given memory
> range,
> + * specified by the start and end addresses. It divides the
> difference
> + * between the end and start addresses by the page size (PAGE_SIZE)
> to
> + * determine the number of pages in the range.
> + *
> + * Return: The number of pages in the specified range.
> + */
> +#define npages_in_range(start__, end__)	\
> +	(((end__) - (start__)) >> PAGE_SHIFT)
> +
> +/**
> + * struct drm_gpusvm_zdd - GPU SVM zone device data
> + *
> + * @refcount: Reference count for the zdd
> + * @destroy_work: Work structure for asynchronous zdd destruction
> + * @range: Pointer to the GPU SVM range
> + * @vram_allocation: Driver-private pointer to the VRAM allocation
> + *
> + * This structure serves as a generic wrapper installed in
> + * page->zone_device_data. It provides infrastructure for looking up
> a range
> + * upon CPU page fault and asynchronously releasing VRAM once the
> CPU has no
> + * page references. Asynchronous release is useful because CPU page
> references
> + * can be dropped in IRQ contexts, while releasing VRAM likely
> requires sleeping
> + * locks.
> + */
> +struct drm_gpusvm_zdd {
> +	struct kref refcount;
> +	struct work_struct destroy_work;
> +	struct drm_gpusvm_range *range;
> +	void *vram_allocation;
> +};
> +
> +/**
> + * drm_gpusvm_zdd_destroy_work_func - Work function for destroying a
> zdd
> + * @w: Pointer to the work_struct
> + *
> + * This function releases VRAM, puts GPU SVM range, and frees zdd.
> + */
> +static void drm_gpusvm_zdd_destroy_work_func(struct work_struct *w)
> +{
> +	struct drm_gpusvm_zdd *zdd =
> +		container_of(w, struct drm_gpusvm_zdd,
> destroy_work);
> +	struct drm_gpusvm_range *range = zdd->range;
> +	struct drm_gpusvm *gpusvm = range->gpusvm;
> +
> +	if (gpusvm->ops->vram_release && zdd->vram_allocation)
> +		gpusvm->ops->vram_release(zdd->vram_allocation);
> +	drm_gpusvm_range_put(range);
> +	kfree(zdd);
> +}
> +
> +/**
> + * drm_gpusvm_zdd_alloc - Allocate a zdd structure.
> + * @range: Pointer to the GPU SVM range.
> + *
> + * This function allocates and initializes a new zdd structure. It
> sets up the
> + * reference count, initializes the destroy work, and links the
> provided GPU SVM
> + * range.
> + *
> + * Returns:
> + * Pointer to the allocated zdd on success, ERR_PTR() on failure.
> + */
> +static struct drm_gpusvm_zdd *
> +drm_gpusvm_zdd_alloc(struct drm_gpusvm_range *range)
> +{
> +	struct drm_gpusvm_zdd *zdd;
> +
> +	zdd = kmalloc(sizeof(*zdd), GFP_KERNEL);
> +	if (!zdd)
> +		return NULL;
> +
> +	kref_init(&zdd->refcount);
> +	INIT_WORK(&zdd->destroy_work,
> drm_gpusvm_zdd_destroy_work_func);
> +	zdd->range = drm_gpusvm_range_get(range);
> +	zdd->vram_allocation = NULL;
> +
> +	return zdd;
> +}
> +
> +/**
> + * drm_gpusvm_zdd_get - Get a reference to a zdd structure.
> + * @zdd: Pointer to the zdd structure.
> + *
> + * This function increments the reference count of the provided zdd
> structure.
> + *
> + * Returns: Pointer to the zdd structure.
> + */
> +static struct drm_gpusvm_zdd *drm_gpusvm_zdd_get(struct
> drm_gpusvm_zdd *zdd)
> +{
> +	kref_get(&zdd->refcount);
> +	return zdd;
> +}
> +
> +/**
> + * drm_gpusvm_zdd_destroy - Destroy a zdd structure.
> + * @ref: Pointer to the reference count structure.
> + *
> + * This function queues the destroy_work of the zdd for asynchronous
> destruction.
> + */
> +static void drm_gpusvm_zdd_destroy(struct kref *ref)
> +{
> +	struct drm_gpusvm_zdd *zdd =
> +		container_of(ref, struct drm_gpusvm_zdd, refcount);
> +	struct drm_gpusvm *gpusvm = zdd->range->gpusvm;
> +
> +	queue_work(gpusvm->zdd_wq, &zdd->destroy_work);
> +}
> +
> +/**
> + * drm_gpusvm_zdd_put - Put a zdd reference.
> + * @zdd: Pointer to the zdd structure.
> + *
> + * This function decrements the reference count of the provided zdd
> structure
> + * and schedules its destruction if the count drops to zero.
> + */
> +static void drm_gpusvm_zdd_put(struct drm_gpusvm_zdd *zdd)
> +{
> +	kref_put(&zdd->refcount, drm_gpusvm_zdd_destroy);
> +}
> +
> +/**
> + * drm_gpusvm_range_find - Find GPU SVM range from GPU SVM notifier
> + * @notifier: Pointer to the GPU SVM notifier structure.
> + * @start: Start address of the range
> + * @end: End address of the range
> + *
> + * Return: A pointer to the drm_gpusvm_range if found or NULL
> + */
> +struct drm_gpusvm_range *
> +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64
> start, u64 end)
> +{
> +	return range_iter_first(&notifier->root, start, end - 1);
> +}
> +
> +/**
> + * drm_gpusvm_for_each_range_safe - Safely iterate over GPU SVM
> ranges in a notifier
> + * @range__: Iterator variable for the ranges
> + * @next__: Iterator variable for the ranges temporary storage
> + * @notifier__: Pointer to the GPU SVM notifier
> + * @start__: Start address of the range
> + * @end__: End address of the range
> + *
> + * This macro is used to iterate over GPU SVM ranges in a notifier
> while
> + * removing ranges from it.
> + */
> +#define drm_gpusvm_for_each_range_safe(range__, next__, notifier__,
> start__, end__)	\
> +	for ((range__) = drm_gpusvm_range_find((notifier__),
> (start__), (end__)),	\
> +	     (next__) =
> __drm_gpusvm_range_next(range__);				\
> +	     (range__) && (range__->va.start <
> (end__));				\
> +	     (range__) = (next__), (next__) =
> __drm_gpusvm_range_next(range__))
> +
> +/**
> + * __drm_gpusvm_notifier_next - get the next drm_gpusvm_notifier in
> the list
> + * @notifier: a pointer to the current drm_gpusvm_notifier
> + *
> + * Return: A pointer to the next drm_gpusvm_notifier if available,
> or NULL if
> + *         the current notifier is the last one or if the input
> notifier is
> + *         NULL.
> + */
> +static struct drm_gpusvm_notifier *
> +__drm_gpusvm_notifier_next(struct drm_gpusvm_notifier *notifier)
> +{
> +	if (notifier && !list_is_last(&notifier->rb.entry,
> +				      &notifier->gpusvm-
> >notifier_list))
> +		return list_next_entry(notifier, rb.entry);
> +
> +	return NULL;
> +}
> +
> +/**
> + * drm_gpusvm_for_each_notifier - Iterate over GPU SVM notifiers in
> a gpusvm
> + * @notifier__: Iterator variable for the notifiers
> + * @gpusvm__: Pointer to the GPU SVM structure
> + * @start__: Start address of the notifier
> + * @end__: End address of the notifier
> + *
> + * This macro is used to iterate over GPU SVM notifiers in a gpusvm.
> + */
> +#define drm_gpusvm_for_each_notifier(notifier__, gpusvm__, start__,
> end__)		\
> +	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root,
> (start__), (end__) - 1);	\
> +	     (notifier__) && (notifier__->interval.start <
> (end__));			\
> +	     (notifier__) = __drm_gpusvm_notifier_next(notifier__))
> +
> +/**
> + * drm_gpusvm_for_each_notifier_safe - Safely iterate over GPU SVM
> notifiers in a gpusvm
> + * @notifier__: Iterator variable for the notifiers
> + * @next__: Iterator variable for the notifiers temporary storage
> + * @gpusvm__: Pointer to the GPU SVM structure
> + * @start__: Start address of the notifier
> + * @end__: End address of the notifier
> + *
> + * This macro is used to iterate over GPU SVM notifiers in a gpusvm
> while
> + * removing notifiers from it.
> + */
> +#define drm_gpusvm_for_each_notifier_safe(notifier__, next__,
> gpusvm__, start__, end__)	\
> +	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root,
> (start__), (end__) - 1),	\
> +	     (next__) =
> __drm_gpusvm_notifier_next(notifier__);				\
> +	     (notifier__) && (notifier__->interval.start <
> (end__));			\
> +	     (notifier__) = (next__), (next__) =
> __drm_gpusvm_notifier_next(notifier__))
> +
> +/**
> + * drm_gpusvm_notifier_invalidate - Invalidate a GPU SVM notifier.
> + * @mni: Pointer to the mmu_interval_notifier structure.
> + * @mmu_range: Pointer to the mmu_notifier_range structure.
> + * @cur_seq: Current sequence number.
> + *
> + * This function serves as a generic MMU notifier for GPU SVM. It
> sets the MMU
> + * notifier sequence number and calls the driver invalidate vfunc
> under
> + * gpusvm->notifier_lock.
> + *
> + * Returns:
> + * true if the operation succeeds, false otherwise.
> + */
> +static bool
> +drm_gpusvm_notifier_invalidate(struct mmu_interval_notifier *mni,
> +			       const struct mmu_notifier_range
> *mmu_range,
> +			       unsigned long cur_seq)
> +{
> +	struct drm_gpusvm_notifier *notifier =
> +		container_of(mni, typeof(*notifier), notifier);
> +	struct drm_gpusvm *gpusvm = notifier->gpusvm;
> +
> +	if (!mmu_notifier_range_blockable(mmu_range))
> +		return false;
> +
> +	down_write(&gpusvm->notifier_lock);
> +	mmu_interval_set_seq(mni, cur_seq);
> +	gpusvm->ops->invalidate(gpusvm, notifier, mmu_range);
> +	up_write(&gpusvm->notifier_lock);
> +
> +	return true;
> +}
> +
> +/**
> + * drm_gpusvm_notifier_ops - MMU interval notifier operations for
> GPU SVM
> + */
> +static const struct mmu_interval_notifier_ops
> drm_gpusvm_notifier_ops = {
> +	.invalidate = drm_gpusvm_notifier_invalidate,
> +};
> +
> +/**
> + * drm_gpusvm_init - Initialize the GPU SVM.
> + * @gpusvm: Pointer to the GPU SVM structure.
> + * @name: Name of the GPU SVM.
> + * @drm: Pointer to the DRM device structure.
> + * @mm: Pointer to the mm_struct for the address space.
> + * @device_private_page_owner: Device private pages owner.
> + * @mm_start: Start address of GPU SVM.
> + * @mm_range: Range of the GPU SVM.
> + * @notifier_size: Size of individual notifiers.
> + * @ops: Pointer to the operations structure for GPU SVM.
> + * @chunk_sizes: Pointer to the array of chunk sizes used in range
> allocation.
> + *               Entries should be powers of 2 in descending order
> with last
> + *               entry being SZ_4K.
> + * @num_chunks: Number of chunks.
> + *
> + * This function initializes the GPU SVM.
> + *
> + * Returns:
> + * 0 on success, a negative error code on failure.
> + */
> +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> +		    const char *name, struct drm_device *drm,
> +		    struct mm_struct *mm, void
> *device_private_page_owner,
> +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> +		    const struct drm_gpusvm_ops *ops,
> +		    const u64 *chunk_sizes, int num_chunks)
> +{
> +	if (!ops->invalidate || !num_chunks)
> +		return -EINVAL;
> +
> +	gpusvm->name = name;
> +	gpusvm->drm = drm;
> +	gpusvm->mm = mm;
> +	gpusvm->device_private_page_owner =
> device_private_page_owner;
> +	gpusvm->mm_start = mm_start;
> +	gpusvm->mm_range = mm_range;
> +	gpusvm->notifier_size = notifier_size;
> +	gpusvm->ops = ops;
> +	gpusvm->chunk_sizes = chunk_sizes;
> +	gpusvm->num_chunks = num_chunks;
> +	gpusvm->zdd_wq = system_wq;
> +
> +	mmgrab(mm);
> +	gpusvm->root = RB_ROOT_CACHED;
> +	INIT_LIST_HEAD(&gpusvm->notifier_list);
> +
> +	init_rwsem(&gpusvm->notifier_lock);
> +
> +	fs_reclaim_acquire(GFP_KERNEL);
> +	might_lock(&gpusvm->notifier_lock);
> +	fs_reclaim_release(GFP_KERNEL);
> +
> +	return 0;
> +}
> +
> +/**
> + * drm_gpusvm_notifier_find - Find GPU SVM notifier
> + * @gpusvm__: Pointer to the GPU SVM structure
> + * @fault_addr__: Fault address
> + *
> + * This macro finds the GPU SVM notifier associated with the fault
> address.
> + *
> + * Returns:
> + * Pointer to the GPU SVM notifier on success, NULL otherwise.
> + */
> +#define drm_gpusvm_notifier_find(gpusvm__, fault_addr__)	\
> +	notifier_iter_first(&(gpusvm__)->root, (fault_addr__),	\
> +			    (fault_addr__ + 1))
> +
> +/**
> + * to_drm_gpusvm_notifier - retrieve the container struct for a
> given rbtree node
> + * @node__: a pointer to the rbtree node embedded within a
> drm_gpusvm_notifier struct
> + *
> + * Return: A pointer to the containing drm_gpusvm_notifier
> structure.
> + */
> +#define to_drm_gpusvm_notifier(__node)				\
> +	container_of((__node), struct drm_gpusvm_notifier, rb.node)
> +
> +/**
> + * drm_gpusvm_notifier_insert - Insert GPU SVM notifier
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: Pointer to the GPU SVM notifier structure
> + *
> + * This function inserts the GPU SVM notifier into the GPU SVM RB
> tree and list.
> + */
> +static void drm_gpusvm_notifier_insert(struct drm_gpusvm *gpusvm,
> +				       struct drm_gpusvm_notifier
> *notifier)
> +{
> +	struct rb_node *node;
> +	struct list_head *head;
> +
> +	notifier_insert(notifier, &gpusvm->root);
> +
> +	node = rb_prev(&notifier->rb.node);
> +	if (node)
> +		head = &(to_drm_gpusvm_notifier(node))->rb.entry;
> +	else
> +		head = &gpusvm->notifier_list;
> +
> +	list_add(&notifier->rb.entry, head);
> +}
> +
> +/**
> + * drm_gpusvm_notifier_remove - Remove GPU SVM notifier
> + * @gpusvm__: Pointer to the GPU SVM structure
> + * @notifier__: Pointer to the GPU SVM notifier structure
> + *
> + * This macro removes the GPU SVM notifier from the GPU SVM RB tree
> and list.
> + */
> +#define drm_gpusvm_notifier_remove(gpusvm__, notifier__)	\
> +	notifier_remove((notifier__), &(gpusvm__)->root);	\
> +	list_del(&(notifier__)->rb.entry)
> +
> +/**
> + * drm_gpusvm_fini - Finalize the GPU SVM.
> + * @gpusvm: Pointer to the GPU SVM structure.
> + *
> + * This function finalizes the GPU SVM by cleaning up any remaining
> ranges and
> + * notifiers, and dropping a reference to struct MM.
> + */
> +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm)
> +{
> +	struct drm_gpusvm_notifier *notifier, *next;
> +
> +	drm_gpusvm_for_each_notifier_safe(notifier, next, gpusvm, 0,
> LONG_MAX) {
> +		struct drm_gpusvm_range *range, *__next;
> +
> +		/*
> +		 * Remove notifier first to avoid racing with any
> invalidation
> +		 */
> +		mmu_interval_notifier_remove(&notifier->notifier);
> +		notifier->flags.removed = true;
> +
> +		drm_gpusvm_for_each_range_safe(range, __next,
> notifier, 0,
> +					       LONG_MAX)
> +			drm_gpusvm_range_remove(gpusvm, range);
> +	}
> +
> +	mmdrop(gpusvm->mm);
> +	WARN_ON(!RB_EMPTY_ROOT(&gpusvm->root.rb_root));
> +}
> +
> +/**
> + * drm_gpusvm_notifier_alloc - Allocate GPU SVM notifier
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @fault_addr: Fault address
> + *
> + * This function allocates and initializes the GPU SVM notifier
> structure.
> + *
> + * Returns:
> + * Pointer to the allocated GPU SVM notifier on success, ERR_PTR()
> on failure.
> + */
> +static struct drm_gpusvm_notifier *
> +drm_gpusvm_notifier_alloc(struct drm_gpusvm *gpusvm, u64 fault_addr)
> +{
> +	struct drm_gpusvm_notifier *notifier;
> +
> +	if (gpusvm->ops->notifier_alloc)
> +		notifier = gpusvm->ops->notifier_alloc();
> +	else
> +		notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
> +
> +	if (!notifier)
> +		return ERR_PTR(-ENOMEM);
> +
> +	notifier->gpusvm = gpusvm;
> +	notifier->interval.start = ALIGN_DOWN(fault_addr, gpusvm-
> >notifier_size);
> +	notifier->interval.end = ALIGN(fault_addr + 1, gpusvm-
> >notifier_size);
> +	INIT_LIST_HEAD(&notifier->rb.entry);
> +	notifier->root = RB_ROOT_CACHED;
> +	INIT_LIST_HEAD(&notifier->range_list);
> +
> +	return notifier;
> +}
> +
> +/**
> + * drm_gpusvm_notifier_free - Free GPU SVM notifier
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: Pointer to the GPU SVM notifier structure
> + *
> + * This function frees the GPU SVM notifier structure.
> + */
> +static void drm_gpusvm_notifier_free(struct drm_gpusvm *gpusvm,
> +				     struct drm_gpusvm_notifier
> *notifier)
> +{
> +	WARN_ON(!RB_EMPTY_ROOT(&notifier->root.rb_root));
> +
> +	if (gpusvm->ops->notifier_free)
> +		gpusvm->ops->notifier_free(notifier);
> +	else
> +		kfree(notifier);
> +}
> +
> +/**
> + * to_drm_gpusvm_range - retrieve the container struct for a given
> rbtree node
> + * @node__: a pointer to the rbtree node embedded within a
> drm_gpusvm_range struct
> + *
> + * Return: A pointer to the containing drm_gpusvm_range structure.
> + */
> +#define to_drm_gpusvm_range(node__)	\
> +	container_of((node__), struct drm_gpusvm_range, rb.node)
> +
> +/**
> + * drm_gpusvm_range_insert - Insert GPU SVM range
> + * @notifier: Pointer to the GPU SVM notifier structure
> + * @range: Pointer to the GPU SVM range structure
> + *
> + * This function inserts the GPU SVM range into the notifier RB tree
> and list.
> + */
> +static void drm_gpusvm_range_insert(struct drm_gpusvm_notifier
> *notifier,
> +				    struct drm_gpusvm_range *range)
> +{
> +	struct rb_node *node;
> +	struct list_head *head;
> +
> +	drm_gpusvm_notifier_lock(notifier->gpusvm);
> +	range_insert(range, &notifier->root);
> +
> +	node = rb_prev(&range->rb.node);
> +	if (node)
> +		head = &(to_drm_gpusvm_range(node))->rb.entry;
> +	else
> +		head = &notifier->range_list;
> +
> +	list_add(&range->rb.entry, head);
> +	drm_gpusvm_notifier_unlock(notifier->gpusvm);
> +}
> +
> +/**
> + * __drm_gpusvm_range_remove - Remove GPU SVM range
> + * @notifier__: Pointer to the GPU SVM notifier structure
> + * @range__: Pointer to the GPU SVM range structure
> + *
> + * This macro removes the GPU SVM range from the notifier RB tree
> and list.
> + */
> +#define __drm_gpusvm_range_remove(notifier__, range__)		\
> +	range_remove((range__), &(notifier__)->root);		\
> +	list_del(&(range__)->rb.entry)
> +
> +/**
> + * drm_gpusvm_range_alloc - Allocate GPU SVM range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: Pointer to the GPU SVM notifier structure
> + * @fault_addr: Fault address
> + * @chunk_size: Chunk size
> + * @migrate_vram: Flag indicating whether to migrate VRAM
> + *
> + * This function allocates and initializes the GPU SVM range
> structure.
> + *
> + * Returns:
> + * Pointer to the allocated GPU SVM range on success, ERR_PTR() on
> failure.
> + */
> +static struct drm_gpusvm_range *
> +drm_gpusvm_range_alloc(struct drm_gpusvm *gpusvm,
> +		       struct drm_gpusvm_notifier *notifier,
> +		       u64 fault_addr, u64 chunk_size, bool
> migrate_vram)
> +{
> +	struct drm_gpusvm_range *range;
> +
> +	if (gpusvm->ops->range_alloc)
> +		range = gpusvm->ops->range_alloc(gpusvm);
> +	else
> +		range = kzalloc(sizeof(*range), GFP_KERNEL);
> +
> +	if (!range)
> +		return ERR_PTR(-ENOMEM);
> +
> +	kref_init(&range->refcount);
> +	range->gpusvm = gpusvm;
> +	range->notifier = notifier;
> +	range->va.start = ALIGN_DOWN(fault_addr, chunk_size);
> +	range->va.end = ALIGN(fault_addr + 1, chunk_size);
> +	INIT_LIST_HEAD(&range->rb.entry);
> +	range->notifier_seq = LONG_MAX;
> +	range->flags.migrate_vram = migrate_vram ? 1 : 0;
> +
> +	return range;
> +}
> +
> +/**
> + * drm_gpusvm_check_pages - Check pages
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: Pointer to the GPU SVM notifier structure
> + * @start: Start address
> + * @end: End address
> + *
> + * Check if pages between start and end have been faulted in on the
> CPU. Use to
> + * prevent migration of pages without CPU backing store.
> + *
> + * Returns:
> + * True if pages have been faulted into CPU, False otherwise
> + */
> +static bool drm_gpusvm_check_pages(struct drm_gpusvm *gpusvm,
> +				   struct drm_gpusvm_notifier
> *notifier,
> +				   u64 start, u64 end)
> +{
> +	struct hmm_range hmm_range = {
> +		.default_flags = 0,
> +		.notifier = &notifier->notifier,
> +		.start = start,
> +		.end = end,
> +		.dev_private_owner = gpusvm-
> >device_private_page_owner,
> +	};
> +	unsigned long timeout =
> +		jiffies +
> msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> +	unsigned long *pfns;
> +	unsigned long npages = npages_in_range(start, end);
> +	int err, i;
> +
> +	mmap_assert_locked(gpusvm->mm);
> +
> +	pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
> +	if (!pfns)
> +		return false;
> +
> +	hmm_range.notifier_seq = mmu_interval_read_begin(&notifier-
> >notifier);
> +	hmm_range.hmm_pfns = pfns;
> +
> +	while (true) {
> +		err = hmm_range_fault(&hmm_range);
> +		if (err == -EBUSY) {
> +			if (time_after(jiffies, timeout))
> +				break;
> +
> +			hmm_range.notifier_seq =
> mmu_interval_read_begin(&notifier->notifier);
> +			continue;
> +		}
> +		break;
> +	}
> +	if (err)
> +		goto err_free;
> +
> +	for (i = 0; i < npages; ++i) {
> +		if (!(pfns[i] & HMM_PFN_VALID)) {
> +			err = -EFAULT;
> +			goto err_free;
> +		}
> +	}
> +
> +err_free:
> +	kvfree(pfns);
> +	return err ? false : true;
> +}
> +
> +/**
> + * drm_gpusvm_range_chunk_size - Determine chunk size for GPU SVM
> range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: Pointer to the GPU SVM notifier structure
> + * @vas: Pointer to the virtual memory area structure
> + * @fault_addr: Fault address
> + * @gpuva_start: Start address of GPUVA which mirrors CPU
> + * @gpuva_end: End address of GPUVA which mirrors CPU
> + * @check_pages: Flag indicating whether to check pages
> + *
> + * This function determines the chunk size for the GPU SVM range
> based on the
> + * fault address, GPU SVM chunk sizes, existing GPU SVM ranges, and
> the virtual
> + * memory area boundaries.
> + *
> + * Returns:
> + * Chunk size on success, LONG_MAX on failure.
> + */
> +static u64 drm_gpusvm_range_chunk_size(struct drm_gpusvm *gpusvm,
> +				       struct drm_gpusvm_notifier
> *notifier,
> +				       struct vm_area_struct *vas,
> +				       u64 fault_addr, u64
> gpuva_start,
> +				       u64 gpuva_end, bool
> check_pages)
> +{
> +	u64 start, end;
> +	int i = 0;
> +
> +retry:
> +	for (; i < gpusvm->num_chunks; ++i) {
> +		start = ALIGN_DOWN(fault_addr, gpusvm-
> >chunk_sizes[i]);
> +		end = ALIGN(fault_addr + 1, gpusvm->chunk_sizes[i]);
> +
> +		if (start >= vas->vm_start && end <= vas->vm_end &&
> +		    start >= notifier->interval.start &&
> +		    end <= notifier->interval.end &&
> +		    start >= gpuva_start && end <= gpuva_end)
> +			break;
> +	}
> +
> +	if (i == gpusvm->num_chunks)
> +		return LONG_MAX;
> +
> +	/*
> +	 * If the allocation is more than a page, ensure it does not overlap with
> existing
> +	 * ranges.
> +	 */
> +	if (end - start != SZ_4K) {
> +		struct drm_gpusvm_range *range;
> +
> +		range = drm_gpusvm_range_find(notifier, start, end);
> +		if (range) {
> +			++i;
> +			goto retry;
> +		}
> +
> +		/*
> +		 * XXX: Only create range on pages CPU has faulted
> in. Without
> +		 * this check, or prefault, on BMG
> 'xe_exec_system_allocator --r
> +		 * process-many-malloc' fails. In the failure case,
> each process
> +		 * mallocs 16k but the CPU VMA is ~128k which
> results in 64k SVM
> +		 * ranges. When migrating the SVM ranges, some
> processes fail in
> +		 * drm_gpusvm_migrate_to_vram with 'migrate.cpages
> != npages'
> +		 * and then upon drm_gpusvm_range_get_pages device
> pages from
> +		 * other processes are collected + faulted in which
> creates all
> +		 * sorts of problems. Unsure exactly how this
> happening, also
> +		 * problem goes away if 'xe_exec_system_allocator --
> r
> +		 * process-many-malloc' mallocs at least 64k at a
> time.
> +		 */
> +		if (check_pages &&
> +		    !drm_gpusvm_check_pages(gpusvm, notifier, start,
> end)) {
> +			++i;
> +			goto retry;
> +		}
> +	}
> +
> +	return end - start;
> +}
> +
> +/**
> + * drm_gpusvm_range_find_or_insert - Find or insert GPU SVM range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @fault_addr: Fault address
> + * @gpuva_start: Start address of GPUVA which mirrors CPU
> + * @gpuva_end: End address of GPUVA which mirrors CPU
> + * @ctx: GPU SVM context
> + *
> + * This function finds or inserts a newly allocated GPU SVM range
> based on the
> + * fault address. Caller must hold a lock to protect range lookup
> and insertion.
> + *
> + * Returns:
> + * Pointer to the GPU SVM range on success, ERR_PTR() on failure.
> + */
> +struct drm_gpusvm_range *
> +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64
> fault_addr,
> +				u64 gpuva_start, u64 gpuva_end,
> +				const struct drm_gpusvm_ctx *ctx)
> +{
> +	struct drm_gpusvm_notifier *notifier;
> +	struct drm_gpusvm_range *range;
> +	struct mm_struct *mm = gpusvm->mm;
> +	struct vm_area_struct *vas;
> +	bool notifier_alloc = false;
> +	u64 chunk_size;
> +	int err;
> +	bool migrate_vram;
> +
> +	if (fault_addr < gpusvm->mm_start ||
> +	    fault_addr > gpusvm->mm_start + gpusvm->mm_range) {
> +		err = -EINVAL;
> +		goto err_out;
> +	}
> +
> +	if (!ctx->mmap_locked) {
> +		if (!mmget_not_zero(mm)) {
> +			err = -EFAULT;
> +			goto err_out;
> +		}
> +		mmap_write_lock(mm);
> +	}
> +
> +	mmap_assert_write_locked(mm);
> +
> +	notifier = drm_gpusvm_notifier_find(gpusvm, fault_addr);
> +	if (!notifier) {
> +		notifier = drm_gpusvm_notifier_alloc(gpusvm,
> fault_addr);
> +		if (IS_ERR(notifier)) {
> +			err = PTR_ERR(notifier);
> +			goto err_mmunlock;
> +		}
> +		notifier_alloc = true;
> +		err = mmu_interval_notifier_insert_locked(&notifier-
> >notifier,
> +							  mm,
> notifier->interval.start,
> +							  notifier-
> >interval.end -
> +							  notifier-
> >interval.start,
> +							 
> &drm_gpusvm_notifier_ops);
> +		if (err)
> +			goto err_notifier;
> +	}
> +
> +	vas = vma_lookup(mm, fault_addr);
> +	if (!vas) {
> +		err = -ENOENT;
> +		goto err_notifier_remove;
> +	}
> +
> +	if (!ctx->read_only && !(vas->vm_flags & VM_WRITE)) {
> +		err = -EPERM;
> +		goto err_notifier_remove;
> +	}
> +
> +	range = drm_gpusvm_range_find(notifier, fault_addr,
> fault_addr + 1);
> +	if (range)
> +		goto out_mmunlock;
> +	/*
> +	 * XXX: Short-circuiting migration based on migrate_vma_*
> current
> +	 * limitations. If/when migrate_vma_* add more support, this
> logic will
> +	 * have to change.
> +	 */
> +	migrate_vram = ctx->vram_possible &&
> +		vma_is_anonymous(vas) && !is_vm_hugetlb_page(vas);
> +
> +	chunk_size = drm_gpusvm_range_chunk_size(gpusvm, notifier,
> vas,
> +						 fault_addr,
> gpuva_start,
> +						 gpuva_end,
> migrate_vram &&
> +						 !ctx->prefault);
> +	if (chunk_size == LONG_MAX) {
> +		err = -EINVAL;
> +		goto err_notifier_remove;
> +	}
> +
> +	range = drm_gpusvm_range_alloc(gpusvm, notifier, fault_addr,
> chunk_size,
> +				       migrate_vram);
> +	if (IS_ERR(range)) {
> +		err = PTR_ERR(range);
> +		goto err_notifier_remove;
> +	}
> +
> +	drm_gpusvm_range_insert(notifier, range);
> +	if (notifier_alloc)
> +		drm_gpusvm_notifier_insert(gpusvm, notifier);
> +
> +	if (ctx->prefault) {
> +		struct drm_gpusvm_ctx __ctx = *ctx;
> +
> +		__ctx.mmap_locked = true;
> +		err = drm_gpusvm_range_get_pages(gpusvm, range,
> &__ctx);
> +		if (err)
> +			goto err_range_remove;
> +	}
> +
> +out_mmunlock:
> +	if (!ctx->mmap_locked) {
> +		mmap_write_unlock(mm);
> +		mmput(mm);
> +	}
> +
> +	return range;
> +
> +err_range_remove:
> +	__drm_gpusvm_range_remove(notifier, range);
> +err_notifier_remove:
> +	if (notifier_alloc)
> +		mmu_interval_notifier_remove(&notifier->notifier);
> +err_notifier:
> +	if (notifier_alloc)
> +		drm_gpusvm_notifier_free(gpusvm, notifier);
> +err_mmunlock:
> +	if (!ctx->mmap_locked) {
> +		mmap_write_unlock(mm);
> +		mmput(mm);
> +	}
> +err_out:
> +	return ERR_PTR(err);
> +}
> +
> +/**
> + * for_each_dma_page - iterate over pages in a DMA region
> + * @i__: the current page index in the iteration
> + * @j__: the current page index, log order, in the iteration
> + * @npages__: the total number of pages in the DMA region
> + * @order__: the order of the pages in the DMA region
> + *
> + * This macro iterates over each page in a DMA region. The DMA
> region
> + * is assumed to be composed of 2^@order__ pages, and the macro will
> + * step through the region one block of 2^@order__ pages at a time.
> + */
> +#define for_each_dma_page(i__, j__, npages__, order__)	\
> +	for ((i__) = 0, (j__) = 0; (i__) < (npages__);	\
> +	     (j__)++, (i__) += 0x1 << (order__))
> +
> +/**
> + * __drm_gpusvm_range_unmap_pages - Unmap pages associated with a
> GPU SVM range (internal)
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + *
> + * This function unmaps pages associated with a GPU SVM range.
> Assumes and
> + * asserts correct locking is in place when called.
> + */
> +static void __drm_gpusvm_range_unmap_pages(struct drm_gpusvm
> *gpusvm,
> +					   struct drm_gpusvm_range
> *range)
> +{
> +	lockdep_assert_held(&gpusvm->notifier_lock);
> +
> +	if (range->pages) {
> +		unsigned long i, j, npages = npages_in_range(range-
> >va.start,
> +							     range-
> >va.end);
> +
> +		if (range->flags.has_dma_mapping) {
> +			for_each_dma_page(i, j, npages, range-
> >order)
> +				dma_unmap_page(gpusvm->drm->dev,
> +					       range->dma_addr[j],
> +					       PAGE_SIZE << range-
> >order,
> +					       DMA_BIDIRECTIONAL);
> +		}
> +
> +		range->flags.has_vram_pages = false;
> +		range->flags.has_dma_mapping = false;
> +	}
> +}
> +
> +/**
> + * drm_gpusvm_range_free_pages - Free pages associated with a GPU
> SVM range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + *
> + * This function frees pages associated with a GPU SVM range.
> + */
> +static void drm_gpusvm_range_free_pages(struct drm_gpusvm *gpusvm,
> +					struct drm_gpusvm_range
> *range)
> +{
> +	lockdep_assert_held(&gpusvm->notifier_lock);
> +
> +	if (range->pages) {
> +		if (range->flags.kfree_mapping) {
> +			kfree(range->dma_addr);
> +			range->flags.kfree_mapping = false;
> +			range->pages = NULL;
> +		} else {
> +			kvfree(range->pages);
> +			range->pages = NULL;
> +		}
> +	}
> +}
> +
> +/**
> + * drm_gpusvm_range_remove - Remove GPU SVM range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range to be removed
> + *
> + * This function removes the specified GPU SVM range and also
> removes the parent
> + * GPU SVM notifier if no more ranges remain in the notifier. The
> caller must
> + * hold a lock to protect range and notifier removal.
> + */
> +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> +			     struct drm_gpusvm_range *range)
> +{
> +	struct drm_gpusvm_notifier *notifier;
> +
> +	notifier = drm_gpusvm_notifier_find(gpusvm, range-
> >va.start);
> +	if (WARN_ON_ONCE(!notifier))
> +		return;
> +
> +	drm_gpusvm_notifier_lock(gpusvm);
> +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> +	drm_gpusvm_range_free_pages(gpusvm, range);
> +	__drm_gpusvm_range_remove(notifier, range);
> +	drm_gpusvm_notifier_unlock(gpusvm);
> +
> +	drm_gpusvm_range_put(range);
> +
> +	if (RB_EMPTY_ROOT(&notifier->root.rb_root)) {
> +		if (!notifier->flags.removed)
> +			mmu_interval_notifier_remove(&notifier-
> >notifier);
> +		drm_gpusvm_notifier_remove(gpusvm, notifier);
> +		drm_gpusvm_notifier_free(gpusvm, notifier);
> +	}
> +}
> +
> +/**
> + * drm_gpusvm_range_get - Get a reference to GPU SVM range
> + * @range: Pointer to the GPU SVM range
> + *
> + * This function increments the reference count of the specified GPU
> SVM range.
> + *
> + * Returns:
> + * Pointer to the GPU SVM range.
> + */
> +struct drm_gpusvm_range *
> +drm_gpusvm_range_get(struct drm_gpusvm_range *range)
> +{
> +	kref_get(&range->refcount);
> +
> +	return range;
> +}
> +
> +/**
> + * drm_gpusvm_range_destroy - Destroy GPU SVM range
> + * @refcount: Pointer to the reference counter embedded in the GPU
> SVM range
> + *
> + * This function destroys the specified GPU SVM range when its
> reference count
> + * reaches zero. If a custom range-free function is provided, it is
> invoked to
> + * free the range; otherwise, the range is deallocated using
> kfree().
> + */
> +static void drm_gpusvm_range_destroy(struct kref *refcount)
> +{
> +	struct drm_gpusvm_range *range =
> +		container_of(refcount, struct drm_gpusvm_range,
> refcount);
> +	struct drm_gpusvm *gpusvm = range->gpusvm;
> +
> +	if (gpusvm->ops->range_free)
> +		gpusvm->ops->range_free(range);
> +	else
> +		kfree(range);
> +}
> +
> +/**
> + * drm_gpusvm_range_put - Put a reference to GPU SVM range
> + * @range: Pointer to the GPU SVM range
> + *
> + * This function decrements the reference count of the specified GPU
> SVM range
> + * and frees it when the count reaches zero.
> + */
> +void drm_gpusvm_range_put(struct drm_gpusvm_range *range)
> +{
> +	kref_put(&range->refcount, drm_gpusvm_range_destroy);
> +}
> +
> +/**
> + * drm_gpusvm_range_pages_valid - GPU SVM range pages valid
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + *
> + * This function determines if a GPU SVM range's pages are valid. Expected to
> + * be called while holding gpusvm->notifier_lock and as the last step before
> + * committing a GPU binding.
> + *
> + * Returns:
> + * True if GPU SVM range has valid pages, False otherwise
> + */
> +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> +				  struct drm_gpusvm_range *range)
> +{
> +	lockdep_assert_held(&gpusvm->notifier_lock);
> +
> +	return range->flags.has_vram_pages || range-
> >flags.has_dma_mapping;
> +}
> +
> +/**
> + * drm_gpusvm_range_pages_valid_unlocked - GPU SVM range pages valid
> unlocked
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + *
> + * This function determines if a GPU SVM range's pages are valid. Expected to
> + * be called without holding gpusvm->notifier_lock.
> + *
> + * Returns:
> + * True if GPU SVM range has valid pages, False otherwise
> + */
> +static bool
> +drm_gpusvm_range_pages_valid_unlocked(struct drm_gpusvm *gpusvm,
> +				      struct drm_gpusvm_range
> *range)
> +{
> +	bool pages_valid;
> +
> +	if (!range->pages)
> +		return false;
> +
> +	drm_gpusvm_notifier_lock(gpusvm);
> +	pages_valid = drm_gpusvm_range_pages_valid(gpusvm, range);
> +	if (!pages_valid && range->flags.kfree_mapping) {
> +		kfree(range->dma_addr);
> +		range->flags.kfree_mapping = false;
> +		range->pages = NULL;
> +	}
> +	drm_gpusvm_notifier_unlock(gpusvm);
> +
> +	return pages_valid;
> +}
> +
> +/**
> + * drm_gpusvm_range_get_pages - Get pages for a GPU SVM range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + * @ctx: GPU SVM context
> + *
> + * This function gets pages for a GPU SVM range and ensures they are mapped
> + * for DMA access.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> +			       struct drm_gpusvm_range *range,
> +			       const struct drm_gpusvm_ctx *ctx)
> +{
> +	struct mmu_interval_notifier *notifier = &range->notifier-
> >notifier;
> +	struct hmm_range hmm_range = {
> +		.default_flags = HMM_PFN_REQ_FAULT | (ctx->read_only
> ? 0 :
> +			HMM_PFN_REQ_WRITE),
> +		.notifier = notifier,
> +		.start = range->va.start,
> +		.end = range->va.end,
> +		.dev_private_owner = gpusvm-
> >device_private_page_owner,
> +	};
> +	struct mm_struct *mm = gpusvm->mm;
> +	unsigned long timeout =
> +		jiffies +
> msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> +	unsigned long i, j;
> +	unsigned long npages = npages_in_range(range->va.start,
> range->va.end);
> +	unsigned int order = 0;
> +	unsigned long *pfns;
> +	struct page **pages;
> +	int err = 0;
> +	bool vram_pages = !!range->flags.migrate_vram;
> +	bool alloc_pfns = false, kfree_mapping;
> +
> +retry:
> +	kfree_mapping = false;
> +	hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> +	if (drm_gpusvm_range_pages_valid_unlocked(gpusvm, range))
> +		return 0;
> +
> +	if (range->notifier_seq == hmm_range.notifier_seq && range-
> >pages) {
> +		if (ctx->prefault)
> +			return 0;
> +
> +		pfns = (unsigned long *)range->pages;
> +		pages = range->pages;
> +		goto map_pages;
> +	}
> +
> +	if (!range->pages) {
> +		pfns = kvmalloc_array(npages, sizeof(*pfns),
> GFP_KERNEL);
> +		if (!pfns)
> +			return -ENOMEM;
> +		alloc_pfns = true;
> +	} else {
> +		pfns = (unsigned long *)range->pages;
> +	}
> +
> +	if (!ctx->mmap_locked) {
> +		if (!mmget_not_zero(mm)) {
> +			err = -EFAULT;
> +			goto err_out;
> +		}
> +	}
> +
> +	hmm_range.hmm_pfns = pfns;
> +	while (true) {
> +		/* Must be checked after mmu_interval_read_begin */
> +		if (range->flags.unmapped) {
> +			err = -EFAULT;
> +			break;
> +		}
> +
> +		if (!ctx->mmap_locked) {
> +			/*
> +			 * XXX: HMM locking document indicates only
> a read-lock
> +			 * is required but there appears to be a
> window between
> +			 * the MMU_NOTIFY_MIGRATE event triggered in
> a CPU fault
> +			 * via migrate_vma_setup and the pages
> actually moving
> +			 * in migrate_vma_finalize in which this
> code can grab
> +			 * garbage pages. Grabbing the write-lock if
> the range
> +			 * is attached to vram appears to protect
> against this
> +			 * race.
> +			 */
> +			if (vram_pages)
> +				mmap_write_lock(mm);
> +			else
> +				mmap_read_lock(mm);
> +		}
> +		err = hmm_range_fault(&hmm_range);
> +		if (!ctx->mmap_locked) {
> +			if (vram_pages)
> +				mmap_write_unlock(mm);
> +			else
> +				mmap_read_unlock(mm);
> +		}
> +
> +		if (err == -EBUSY) {
> +			if (time_after(jiffies, timeout))
> +				break;
> +
> +			hmm_range.notifier_seq =
> mmu_interval_read_begin(notifier);
> +			continue;
> +		}
> +		break;
> +	}
> +	if (!ctx->mmap_locked)
> +		mmput(mm);
> +	if (err)
> +		goto err_free;
> +
> +	pages = (struct page **)pfns;
> +
> +	if (ctx->prefault) {
> +		range->pages = pages;
> +		goto set_seqno;
> +	}
> +
> +map_pages:
> +	if (is_device_private_page(hmm_pfn_to_page(pfns[0]))) {
> +		WARN_ON_ONCE(!range->vram_allocation);
> +
> +		for (i = 0; i < npages; ++i) {
> +			pages[i] = hmm_pfn_to_page(pfns[i]);
> +
> +			if
> (WARN_ON_ONCE(!is_device_private_page(pages[i]))) {
> +				err = -EOPNOTSUPP;
> +				goto err_free;
> +			}
> +		}
> +
> +		/* Do not race with notifier unmapping pages */
> +		drm_gpusvm_notifier_lock(gpusvm);
> +		range->flags.has_vram_pages = true;
> +		range->pages = pages;
> +		if (mmu_interval_read_retry(notifier,
> hmm_range.notifier_seq)) {
> +			err = -EAGAIN;
> +			__drm_gpusvm_range_unmap_pages(gpusvm,
> range);
> +		}
> +		drm_gpusvm_notifier_unlock(gpusvm);
> +	} else {
> +		dma_addr_t *dma_addr = (dma_addr_t *)pfns;
> +
> +		for_each_dma_page(i, j, npages, order) {
> +			if (WARN_ON_ONCE(i && order !=
> +					
> hmm_pfn_to_map_order(pfns[i]))) {
> +				err = -EOPNOTSUPP;
> +				npages = i;
> +				goto err_unmap;
> +			}
> +			order = hmm_pfn_to_map_order(pfns[i]);
> +
> +			pages[j] = hmm_pfn_to_page(pfns[i]);
> +			if
> (WARN_ON_ONCE(is_zone_device_page(pages[j]))) {
> +				err = -EOPNOTSUPP;
> +				npages = i;
> +				goto err_unmap;
> +			}
> +
> +			set_page_dirty_lock(pages[j]);
> +			mark_page_accessed(pages[j]);
> +
> +			dma_addr[j] = dma_map_page(gpusvm->drm->dev,
> +						   pages[j], 0,
> +						   PAGE_SIZE <<
> order,
> +						  
> DMA_BIDIRECTIONAL);
> +			if (dma_mapping_error(gpusvm->drm->dev,
> dma_addr[j])) {
> +				err = -EFAULT;
> +				npages = i;
> +				goto err_unmap;
> +			}
> +		}
> +
> +		/* Huge pages, reduce memory footprint */
> +		if (order) {
> +			dma_addr = kmalloc_array(j,
> sizeof(*dma_addr),
> +						 GFP_KERNEL);
> +			if (dma_addr) {
> +				for (i = 0; i < j; ++i)
> +					dma_addr[i] =
> (dma_addr_t)pfns[i];
> +				kvfree(pfns);
> +				kfree_mapping = true;
> +			} else {
> +				dma_addr = (dma_addr_t *)pfns;
> +			}
> +		}
> +
> +		/* Do not race with notifier unmapping pages */
> +		drm_gpusvm_notifier_lock(gpusvm);
> +		range->order = order;
> +		range->flags.kfree_mapping = kfree_mapping;
> +		range->flags.has_dma_mapping = true;
> +		range->dma_addr = dma_addr;
> +		range->vram_allocation = NULL;
> +		if (mmu_interval_read_retry(notifier,
> hmm_range.notifier_seq)) {
> +			err = -EAGAIN;
> +			__drm_gpusvm_range_unmap_pages(gpusvm,
> range);
> +		}
> +		drm_gpusvm_notifier_unlock(gpusvm);
> +	}
> +
> +	if (err == -EAGAIN)
> +		goto retry;
> +set_seqno:
> +	range->notifier_seq = hmm_range.notifier_seq;
> +
> +	return 0;
> +
> +err_unmap:
> +	for_each_dma_page(i, j, npages, order)
> +		dma_unmap_page(gpusvm->drm->dev,
> +			       (dma_addr_t)pfns[j],
> +			       PAGE_SIZE << order,
> DMA_BIDIRECTIONAL);
> +err_free:
> +	if (alloc_pfns)
> +		kvfree(pfns);
> +err_out:
> +	return err;
> +}
> +
> +/**
> + * drm_gpusvm_range_unmap_pages - Unmap pages associated with a GPU SVM range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + * @ctx: GPU SVM context
> + *
> + * This function unmaps pages associated with a GPU SVM range. If
> + * @ctx->in_notifier is set, it is assumed that gpusvm->notifier_lock is held
> + * in write mode; if it is clear, it acquires gpusvm->notifier_lock in read
> + * mode. Must be called on each GPU SVM range attached to a notifier in
> + * gpusvm->ops->invalidate for the IOMMU security model.
> + */
> +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> +				  struct drm_gpusvm_range *range,
> +				  const struct drm_gpusvm_ctx *ctx)
> +{
> +	if (ctx->in_notifier)
> +		lockdep_assert_held_write(&gpusvm->notifier_lock);
> +	else
> +		drm_gpusvm_notifier_lock(gpusvm);
> +
> +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> +
> +	if (!ctx->in_notifier)
> +		drm_gpusvm_notifier_unlock(gpusvm);
> +}
> +
> +/**
> + * drm_gpusvm_migration_put_page - Put a migration page
> + * @page: Pointer to the page to put
> + *
> + * This function unlocks and puts a page.
> + */
> +static void drm_gpusvm_migration_put_page(struct page *page)
> +{
> +	unlock_page(page);
> +	put_page(page);
> +}
> +
> +/**
> + * drm_gpusvm_migration_put_pages - Put migration pages
> + * @npages: Number of pages
> + * @migrate_pfn: Array of migrate page frame numbers
> + *
> + * This function puts an array of pages.
> + */
> +static void drm_gpusvm_migration_put_pages(unsigned long npages,
> +					   unsigned long
> *migrate_pfn)
> +{
> +	unsigned long i;
> +
> +	for (i = 0; i < npages; ++i) {
> +		if (!migrate_pfn[i])
> +			continue;
> +
> +		drm_gpusvm_migration_put_page(migrate_pfn_to_page(mi
> grate_pfn[i]));
> +		migrate_pfn[i] = 0;
> +	}
> +}
> +
> +/**
> + * drm_gpusvm_get_vram_page - Get a reference to a VRAM page
> + * @page: Pointer to the page
> + * @zdd: Pointer to the GPU SVM zone device data
> + *
> + * This function associates the given page with the specified GPU
> SVM zone
> + * device data and initializes it for zone device usage.
> + */
> +static void drm_gpusvm_get_vram_page(struct page *page,
> +				     struct drm_gpusvm_zdd *zdd)
> +{
> +	page->zone_device_data = drm_gpusvm_zdd_get(zdd);
> +	zone_device_page_init(page);
> +}
> +
> +/**
> + * drm_gpusvm_migrate_map_pages() - Map migration pages for GPU SVM
> migration
> + * @dev: The device for which the pages are being mapped
> + * @dma_addr: Array to store DMA addresses corresponding to mapped
> pages
> + * @migrate_pfn: Array of migrate page frame numbers to map
> + * @npages: Number of pages to map
> + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> + *
> + * This function maps pages of memory for migration usage in GPU
> SVM. It
> + * iterates over each page frame number provided in @migrate_pfn,
> maps the
> + * corresponding page, and stores the DMA address in the provided
> @dma_addr
> + * array.
> + *
> + * Return: 0 on success, -EFAULT if an error occurs during mapping.
> + */
> +static int drm_gpusvm_migrate_map_pages(struct device *dev,
> +					dma_addr_t *dma_addr,
> +					long unsigned int
> *migrate_pfn,
> +					unsigned long npages,
> +					enum dma_data_direction dir)
> +{
> +	unsigned long i;
> +
> +	for (i = 0; i < npages; ++i) {
> +		struct page *page =
> migrate_pfn_to_page(migrate_pfn[i]);
> +
> +		if (!page)
> +			continue;
> +
> +		if (WARN_ON_ONCE(is_zone_device_page(page)))
> +			return -EFAULT;
> +
> +		dma_addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE,
> dir);
> +		if (dma_mapping_error(dev, dma_addr[i]))
> +			return -EFAULT;
> +	}
> +
> +	return 0;
> +}
> +
> +/**
> + * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously mapped
> for GPU SVM migration
> + * @dev: The device for which the pages were mapped
> + * @dma_addr: Array of DMA addresses corresponding to mapped pages
> + * @npages: Number of pages to unmap
> + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> + *
> + * This function unmaps previously mapped pages of memory for GPU
> Shared Virtual
> + * Memory (SVM). It iterates over each DMA address provided in
> @dma_addr, checks
> + * if it's valid and not already unmapped, and unmaps the
> corresponding page.
> + */
> +static void drm_gpusvm_migrate_unmap_pages(struct device *dev,
> +					   dma_addr_t *dma_addr,
> +					   unsigned long npages,
> +					   enum dma_data_direction
> dir)
> +{
> +	unsigned long i;
> +
> +	for (i = 0; i < npages; ++i) {
> +		if (!dma_addr[i] || dma_mapping_error(dev,
> dma_addr[i]))
> +			continue;
> +
> +		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
> +	}
> +}
> +
> +/**
> + * drm_gpusvm_migrate_to_vram - Migrate GPU SVM range to VRAM
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + * @vram_allocation: Driver-private pointer to the VRAM allocation. The caller
> + *                   should hold a reference to the VRAM allocation, which
> + *                   should be dropped via ops->vram_release or upon the
> + *                   failure of this function.
> + * @ctx: GPU SVM context
> + *
> + * This function migrates the specified GPU SVM range to VRAM. It performs the
> + * necessary setup and invokes the driver-specific operations for migration to
> + * VRAM. Upon successful return, @vram_allocation can safely reference @range
> + * until ops->vram_release is called, which only happens upon successful
> + * return.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> +			       struct drm_gpusvm_range *range,
> +			       void *vram_allocation,
> +			       const struct drm_gpusvm_ctx *ctx)
> +{
> +	u64 start = range->va.start, end = range->va.end;
> +	struct migrate_vma migrate = {
> +		.start		= start,
> +		.end		= end,
> +		.pgmap_owner	= gpusvm->device_private_page_owner,
> +		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
> +	};
> +	struct mm_struct *mm = gpusvm->mm;
> +	unsigned long i, npages = npages_in_range(start, end);
> +	struct vm_area_struct *vas;
> +	struct drm_gpusvm_zdd *zdd = NULL;
> +	struct page **pages;
> +	dma_addr_t *dma_addr;
> +	void *buf;
> +	int err;
> +
> +	if (!range->flags.migrate_vram)
> +		return -EINVAL;
> +
> +	if (!gpusvm->ops->populate_vram_pfn || !gpusvm->ops-
> >copy_to_vram ||
> +	    !gpusvm->ops->copy_to_sram)
> +		return -EOPNOTSUPP;
> +
> +	if (!ctx->mmap_locked) {
> +		if (!mmget_not_zero(mm)) {
> +			err = -EFAULT;
> +			goto err_out;
> +		}
> +		mmap_write_lock(mm);
> +	}
> +
> +	mmap_assert_locked(mm);
> +
> +	vas = vma_lookup(mm, start);
> +	if (!vas) {
> +		err = -ENOENT;
> +		goto err_mmunlock;
> +	}
> +
> +	if (end > vas->vm_end || start < vas->vm_start) {
> +		err = -EINVAL;
> +		goto err_mmunlock;
> +	}
> +
> +	if (!vma_is_anonymous(vas)) {
> +		err = -EBUSY;
> +		goto err_mmunlock;
> +	}
> +
> +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) +
> sizeof(*dma_addr) +
> +		       sizeof(*pages), GFP_KERNEL);
> +	if (!buf) {
> +		err = -ENOMEM;
> +		goto err_mmunlock;
> +	}
> +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr))
> * npages;
> +
> +	zdd = drm_gpusvm_zdd_alloc(range);
> +	if (!zdd) {
> +		err = -ENOMEM;
> +		goto err_free;
> +	}
> +
> +	migrate.vma = vas;
> +	migrate.src = buf;
> +	migrate.dst = migrate.src + npages;
> +
> +	err = migrate_vma_setup(&migrate);
> +	if (err)
> +		goto err_free;
> +
> +	/*
> +	 * FIXME: The cases below, !migrate.cpages and migrate.cpages != npages,
> +	 * are not always an error. Need to revisit possible cases and how to
> +	 * handle them. We could prefault on migrate.cpages != npages via
> +	 * hmm_range_fault.
> +	 */
> +
> +	if (!migrate.cpages) {
> +		err = -EFAULT;
> +		goto err_free;
> +	}
> +
> +	if (migrate.cpages != npages) {
> +		err = -EBUSY;
> +		goto err_finalize;
> +	}
> +
> +	err = gpusvm->ops->populate_vram_pfn(gpusvm,
> vram_allocation, npages,
> +					     migrate.dst);
> +	if (err)
> +		goto err_finalize;
> +
> +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> dma_addr,
> +					   migrate.src, npages,
> DMA_TO_DEVICE);
> +	if (err)
> +		goto err_finalize;
> +
> +	for (i = 0; i < npages; ++i) {
> +		struct page *page = pfn_to_page(migrate.dst[i]);
> +
> +		pages[i] = page;
> +		migrate.dst[i] = migrate_pfn(migrate.dst[i]);
> +		drm_gpusvm_get_vram_page(page, zdd);
> +	}
> +
> +	err = gpusvm->ops->copy_to_vram(gpusvm, pages, dma_addr,
> npages);
> +	if (err)
> +		goto err_finalize;
> +
> +	/* Upon success bind vram allocation to range and zdd */
> +	range->vram_allocation = vram_allocation;
> +	WRITE_ONCE(zdd->vram_allocation, vram_allocation);	/*
> Owns ref */
> +
> +err_finalize:
> +	if (err)
> +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> +	migrate_vma_pages(&migrate);
> +	migrate_vma_finalize(&migrate);
> +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr,
> npages,
> +				       DMA_TO_DEVICE);
> +err_free:
> +	if (zdd)
> +		drm_gpusvm_zdd_put(zdd);
> +	kvfree(buf);
> +err_mmunlock:
> +	if (!ctx->mmap_locked) {
> +		mmap_write_unlock(mm);
> +		mmput(mm);
> +	}
> +err_out:
> +	return err;
> +}
> +
> +/**
> + * drm_gpusvm_migrate_populate_sram_pfn - Populate SRAM PFNs for a
> VM area
> + * @vas: Pointer to the VM area structure, can be NULL
> + * @npages: Number of pages to populate
> + * @src_mpfn: Source array of migrate PFNs
> + * @mpfn: Array of migrate PFNs to populate
> + * @addr: Start address for PFN allocation
> + *
> + * This function populates the SRAM migrate page frame numbers (PFNs) for the
> + * specified VM area structure. It allocates and locks pages in the VM area
> + * for SRAM usage. If @vas is non-NULL, alloc_page_vma() is used for
> + * allocation; if NULL, alloc_page() is used.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +static int drm_gpusvm_migrate_populate_sram_pfn(struct
> vm_area_struct *vas,
> +						unsigned long
> npages,
> +						unsigned long
> *src_mpfn,
> +						unsigned long *mpfn,
> u64 addr)
> +{
> +	unsigned long i;
> +
> +	for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
> +		struct page *page;
> +
> +		if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
> +			continue;
> +
> +		if (vas)
> +			page = alloc_page_vma(GFP_HIGHUSER, vas,
> addr);
> +		else
> +			page = alloc_page(GFP_HIGHUSER);
> +
> +		if (!page)
> +			return -ENOMEM;
> +
> +		lock_page(page);
> +		mpfn[i] = migrate_pfn(page_to_pfn(page));
> +	}
> +
> +	return 0;
> +}
> +
> +/**
> + * drm_gpusvm_evict_to_sram - Evict GPU SVM range to SRAM
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + *
> + * Similar to __drm_gpusvm_migrate_to_sram but does not require the mmap lock;
> + * migration is done via the migrate_device_* functions. This is a fallback
> + * path, as it is preferred to issue migrations with the mmap lock held.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +static int drm_gpusvm_evict_to_sram(struct drm_gpusvm *gpusvm,
> +				    struct drm_gpusvm_range *range)
> +{
> +	unsigned long npages;
> +	struct page **pages;
> +	unsigned long *src, *dst;
> +	dma_addr_t *dma_addr;
> +	void *buf;
> +	int i, err = 0;
> +
> +	npages = npages_in_range(range->va.start, range->va.end);
> +
> +	buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr)
> +
> +		       sizeof(*pages), GFP_KERNEL);
> +	if (!buf) {
> +		err = -ENOMEM;
> +		goto err_out;
> +	}
> +	src = buf;
> +	dst = buf + (sizeof(*src) * npages);
> +	dma_addr = buf + (2 * sizeof(*src) * npages);
> +	pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) *
> npages;
> +
> +	err = gpusvm->ops->populate_vram_pfn(gpusvm, range-
> >vram_allocation,
> +					     npages, src);
> +	if (err)
> +		goto err_free;
> +
> +	err = migrate_device_vma_range(gpusvm->mm,
> +				       gpusvm-
> >device_private_page_owner, src,
> +				       npages, range->va.start);
> +	if (err)
> +		goto err_free;
> +
> +	err = drm_gpusvm_migrate_populate_sram_pfn(NULL, npages,
> src, dst, 0);
> +	if (err)
> +		goto err_finalize;
> +
> +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> dma_addr,
> +					   dst, npages,
> DMA_BIDIRECTIONAL);
> +	if (err)
> +		goto err_finalize;
> +
> +	for (i = 0; i < npages; ++i)
> +		pages[i] = migrate_pfn_to_page(src[i]);
> +
> +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr,
> npages);
> +	if (err)
> +		goto err_finalize;
> +
> +err_finalize:
> +	if (err)
> +		drm_gpusvm_migration_put_pages(npages, dst);
> +	migrate_device_pages(src, dst, npages);
> +	migrate_device_finalize(src, dst, npages);
> +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr,
> npages,
> +				       DMA_BIDIRECTIONAL);
> +err_free:
> +	kvfree(buf);
> +err_out:
> +
> +	return err;
> +}
> +
> +/**
> + * __drm_gpusvm_migrate_to_sram - Migrate GPU SVM range to SRAM
> (internal)
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @vas: Pointer to the VM area structure
> + * @page: Pointer to the page for fault handling (can be NULL)
> + * @start: Start address of the migration range
> + * @end: End address of the migration range
> + *
> + * This internal function performs the migration of the specified GPU SVM
> + * range to SRAM. It sets up the migration, populates and DMA-maps SRAM PFNs,
> + * and invokes the driver-specific operations for migration to SRAM.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +static int __drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> +					struct vm_area_struct *vas,
> +					struct page *page,
> +					u64 start, u64 end)
> +{
> +	struct migrate_vma migrate = {
> +		.vma		= vas,
> +		.pgmap_owner	= gpusvm->device_private_page_owner,
> +		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
> +		.fault_page	= page,
> +	};
> +	unsigned long npages;
> +	struct page **pages;
> +	dma_addr_t *dma_addr;
> +	void *buf;
> +	int i, err = 0;
> +
> +	mmap_assert_locked(gpusvm->mm);
> +
> +	/* Corner case where the VMA has been partially unmapped */
> +	if (start < vas->vm_start)
> +		start = vas->vm_start;
> +	if (end > vas->vm_end)
> +		end = vas->vm_end;
> +
> +	migrate.start = start;
> +	migrate.end = end;
> +	npages = npages_in_range(start, end);
> +
> +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) +
> sizeof(*dma_addr) +
> +		       sizeof(*pages), GFP_KERNEL);
> +	if (!buf) {
> +		err = -ENOMEM;
> +		goto err_out;
> +	}
> +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr))
> * npages;
> +
> +	migrate.vma = vas;
> +	migrate.src = buf;
> +	migrate.dst = migrate.src + npages;
> +
> +	err = migrate_vma_setup(&migrate);
> +	if (err)
> +		goto err_free;
> +
> +	/* Raced with another CPU fault, nothing to do */
> +	if (!migrate.cpages)
> +		goto err_free;
> +
> +	err = drm_gpusvm_migrate_populate_sram_pfn(vas, npages,
> +						   migrate.src,
> migrate.dst,
> +						   start);
> +	if (err)
> +		goto err_finalize;
> +
> +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> dma_addr,
> +					   migrate.dst, npages,
> +					   DMA_BIDIRECTIONAL);
> +	if (err)
> +		goto err_finalize;
> +
> +	for (i = 0; i < npages; ++i)
> +		pages[i] = migrate_pfn_to_page(migrate.src[i]);
> +
> +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr,
> npages);
> +	if (err)
> +		goto err_finalize;
> +
> +err_finalize:
> +	if (err)
> +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> +	migrate_vma_pages(&migrate);
> +	migrate_vma_finalize(&migrate);
> +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr,
> npages,
> +				       DMA_BIDIRECTIONAL);
> +err_free:
> +	kvfree(buf);
> +err_out:
> +	mmap_assert_locked(gpusvm->mm);
> +
> +	return err;
> +}
> +
> +/**
> + * drm_gpusvm_migrate_to_sram - Migrate (evict) GPU SVM range to SRAM
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + * @ctx: GPU SVM context
> + *
> + * This function initiates the migration of the specified GPU SVM range to
> + * SRAM. It performs necessary checks and invokes the internal migration
> + * function for actual migration.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> +			       struct drm_gpusvm_range *range,
> +			       const struct drm_gpusvm_ctx *ctx)
> +{
> +	u64 start = range->va.start, end = range->va.end;
> +	struct mm_struct *mm = gpusvm->mm;
> +	struct vm_area_struct *vas;
> +	int err;
> +	bool retry = false;
> +
> +	if (!ctx->mmap_locked) {
> +		if (!mmget_not_zero(mm)) {
> +			err = -EFAULT;
> +			goto err_out;
> +		}
> +		if (ctx->trylock_mmap) {
> +			if (!mmap_read_trylock(mm))  {
> +				err =
> drm_gpusvm_evict_to_sram(gpusvm, range);
> +				goto err_mmput;
> +			}
> +		} else {
> +			mmap_read_lock(mm);
> +		}
> +	}
> +
> +	mmap_assert_locked(mm);
> +
> +	/*
> +	 * Loop required to find all VMAs for the corner case when the VRAM
> +	 * backing has been partially unmapped from the MM's address space.
> +	 */
> +again:
> +	vas = find_vma(mm, start);
> +	if (!vas) {
> +		if (!retry)
> +			err = -ENOENT;
> +		goto err_mmunlock;
> +	}
> +
> +	if (end <= vas->vm_start || start >= vas->vm_end) {
> +		if (!retry)
> +			err = -EINVAL;
> +		goto err_mmunlock;
> +	}
> +
> +	err = __drm_gpusvm_migrate_to_sram(gpusvm, vas, NULL, start,
> end);
> +	if (err)
> +		goto err_mmunlock;
> +
> +	if (vas->vm_end < end) {
> +		retry = true;
> +		start = vas->vm_end;
> +		goto again;
> +	}
> +
> +	if (!ctx->mmap_locked) {
> +		mmap_read_unlock(mm);
> +		/*
> +		 * Using mmput_async as this function can be called
> while
> +		 * holding a dma-resv lock, and a final put can grab
> the mmap
> +		 * lock, causing a lock inversion.
> +		 */
> +		mmput_async(mm);
> +	}
> +
> +	return 0;
> +
> +err_mmunlock:
> +	if (!ctx->mmap_locked)
> +		mmap_read_unlock(mm);
> +err_mmput:
> +	if (!ctx->mmap_locked)
> +		mmput_async(mm);
> +err_out:
> +	return err;
> +}
> +
> +/**
> + * drm_gpusvm_page_free - Put GPU SVM zone device data associated
> with a page
> + * @page: Pointer to the page
> + *
> + * This function is a callback used to put the GPU SVM zone device
> data
> + * associated with a page when it is being released.
> + */
> +static void drm_gpusvm_page_free(struct page *page)
> +{
> +	drm_gpusvm_zdd_put(page->zone_device_data);
> +}
> +
> +/**
> + * drm_gpusvm_migrate_to_ram - Migrate GPU SVM range to RAM (page fault handler)
> + * @vmf: Pointer to the fault information structure
> + *
> + * This function is a page fault handler used to migrate a GPU SVM range to
> + * RAM. It retrieves the GPU SVM range information from the faulting page and
> + * invokes the internal migration function to migrate the range back to RAM.
> + *
> + * Returns:
> + * VM_FAULT_SIGBUS on failure, 0 on success.
> + */
> +static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault *vmf)
> +{
> +	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
> +	int err;
> +
> +	err = __drm_gpusvm_migrate_to_sram(zdd->range->gpusvm,
> +					   vmf->vma, vmf->page,
> +					   zdd->range->va.start,
> +					   zdd->range->va.end);
> +
> +	return err ? VM_FAULT_SIGBUS : 0;
> +}
> +
> +/**
> + * drm_gpusvm_pagemap_ops - Device page map operations for GPU SVM
> + */
> +static const struct dev_pagemap_ops drm_gpusvm_pagemap_ops = {
> +	.page_free = drm_gpusvm_page_free,
> +	.migrate_to_ram = drm_gpusvm_migrate_to_ram,
> +};
> +
> +/**
> + * drm_gpusvm_pagemap_ops_get - Retrieve GPU SVM device page map
> operations
> + *
> + * Returns:
> + * Pointer to the GPU SVM device page map operations structure.
> + */
> +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void)
> +{
> +	return &drm_gpusvm_pagemap_ops;
> +}
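> +
> +/*
> + * Editor's sketch (not part of this patch): one possible way a driver could
> + * register its VRAM as device-private memory backed by the pagemap ops above.
> + * The function name, use of the drm_device as page owner, resource sizing and
> + * error handling are illustrative assumptions, not requirements of GPU SVM.
> + */
> +static int driver_vram_pagemap_init(struct drm_device *drm,
> +				    struct dev_pagemap *pagemap,
> +				    unsigned long vram_size)
> +{
> +	struct resource *res;
> +	void *addr;
> +
> +	/* Reserve physical address space for the device-private pages */
> +	res = request_free_mem_region(&iomem_resource, vram_size, "drm-gpusvm");
> +	if (IS_ERR(res))
> +		return PTR_ERR(res);
> +
> +	pagemap->type = MEMORY_DEVICE_PRIVATE;
> +	pagemap->range.start = res->start;
> +	pagemap->range.end = res->end;
> +	pagemap->nr_range = 1;
> +	pagemap->ops = drm_gpusvm_pagemap_ops_get();
> +	/* Must match the device_private_page_owner passed to drm_gpusvm_init() */
> +	pagemap->owner = drm;
> +
> +	addr = devm_memremap_pages(drm->dev, pagemap);
> +	return IS_ERR(addr) ? PTR_ERR(addr) : 0;
> +}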
> +
> +/**
> + * drm_gpusvm_has_mapping - Check if GPU SVM has mapping for the
> given address range
> + * @gpusvm: Pointer to the GPU SVM structure.
> + * @start: Start address
> + * @end: End address
> + *
> + * Returns:
> + * True if GPU SVM has mapping, False otherwise
> + */
> +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start,
> u64 end)
> +{
> +	struct drm_gpusvm_notifier *notifier;
> +
> +	drm_gpusvm_for_each_notifier(notifier, gpusvm, start, end) {
> +		struct drm_gpusvm_range *range = NULL;
> +
> +		drm_gpusvm_for_each_range(range, notifier, start,
> end)
> +			return true;
> +	}
> +
> +	return false;
> +}
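> +
> +/*
> + * Editor's sketch (not part of this patch): a driver might use
> + * drm_gpusvm_has_mapping() to reject a conventional GPU VA bind that would
> + * overlap an existing SVM mapping. The helper below is an illustrative
> + * assumption, not an API defined by this patch.
> + */
> +static int driver_check_svm_conflict(struct drm_gpusvm *gpusvm,
> +				     u64 start, u64 end)
> +{
> +	/* Refuse the bind if any SVM range already covers this VA interval */
> +	return drm_gpusvm_has_mapping(gpusvm, start, end) ? -EBUSY : 0;
> +}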
> diff --git a/drivers/gpu/drm/xe/drm_gpusvm.h
> b/drivers/gpu/drm/xe/drm_gpusvm.h
> new file mode 100644
> index 000000000000..0ea70f8534a8
> --- /dev/null
> +++ b/drivers/gpu/drm/xe/drm_gpusvm.h
> @@ -0,0 +1,415 @@
> +/* SPDX-License-Identifier: MIT */
> +/*
> + * Copyright © 2024 Intel Corporation
> + */
> +
> +#ifndef __DRM_GPUSVM_H__
> +#define __DRM_GPUSVM_H__
> +
> +#include <linux/kref.h>
> +#include <linux/mmu_notifier.h>
> +#include <linux/workqueue.h>
> +
> +struct dev_pagemap_ops;
> +struct drm_device;
> +struct drm_gpusvm;
> +struct drm_gpusvm_notifier;
> +struct drm_gpusvm_ops;
> +struct drm_gpusvm_range;
> +
> +/**
> + * struct drm_gpusvm_ops - Operations structure for GPU SVM
> + *
> + * This structure defines the operations for GPU Shared Virtual
> Memory (SVM).
> + * These operations are provided by the GPU driver to manage SVM
> ranges and
> + * perform operations such as migration between VRAM and system RAM.
> + */
> +struct drm_gpusvm_ops {
> +	/**
> +	 * @notifier_alloc: Allocate a GPU SVM notifier (optional)
> +	 *
> +	 * This function shall allocate a GPU SVM notifier.
> +	 *
> +	 * Returns:
> +	 * Pointer to the allocated GPU SVM notifier on success,
> NULL on failure.
> +	 */
> +	struct drm_gpusvm_notifier *(*notifier_alloc)(void);
> +
> +	/**
> +	 * @notifier_free: Free a GPU SVM notifier (optional)
> +	 * @notifier: Pointer to the GPU SVM notifier to be freed
> +	 *
> +	 * This function shall free a GPU SVM notifier.
> +	 */
> +	void (*notifier_free)(struct drm_gpusvm_notifier *notifier);
> +
> +	/**
> +	 * @range_alloc: Allocate a GPU SVM range (optional)
> +	 * @gpusvm: Pointer to the GPU SVM
> +	 *
> +	 * This function shall allocate a GPU SVM range.
> +	 *
> +	 * Returns:
> +	 * Pointer to the allocated GPU SVM range on success, NULL
> on failure.
> +	 */
> +	struct drm_gpusvm_range *(*range_alloc)(struct drm_gpusvm
> *gpusvm);
> +
> +	/**
> +	 * @range_free: Free a GPU SVM range (optional)
> +	 * @range: Pointer to the GPU SVM range to be freed
> +	 *
> +	 * This function shall free a GPU SVM range.
> +	 */
> +	void (*range_free)(struct drm_gpusvm_range *range);
> +
> +	/**
> +	 * @vram_release: Release VRAM allocation (optional)
> +	 * @vram_allocation: Driver-private pointer to the VRAM
> allocation
> +	 *
> +	 * This function shall release VRAM allocation and expects
> to drop a
> +	 * reference to VRAM allocation.
> +	 */
> +	void (*vram_release)(void *vram_allocation);
> +
> +	/**
> +	 * @populate_vram_pfn: Populate VRAM PFN (required for
> migration)
> +	 * @gpusvm: Pointer to the GPU SVM
> +	 * @vram_allocation: Driver-private pointer to the VRAM
> allocation
> +	 * @npages: Number of pages to populate
> +	 * @pfn: Array of page frame numbers to populate
> +	 *
> +	 * This function shall populate VRAM page frame numbers
> (PFN).
> +	 *
> +	 * Returns:
> +	 * 0 on success, a negative error code on failure.
> +	 */
> +	int (*populate_vram_pfn)(struct drm_gpusvm *gpusvm,
> +				 void *vram_allocation,
> +				 unsigned long npages,
> +				 unsigned long *pfn);
> +
> +	/**
> +	 * @copy_to_vram: Copy to VRAM (required for migration)
> +	 * @gpusvm: Pointer to the GPU SVM
> +	 * @pages: Pointer to array of VRAM pages (destination)
> +	 * @dma_addr: Pointer to array of DMA addresses (source)
> +	 * @npages: Number of pages to copy
> +	 *
> +	 * This function shall copy pages to VRAM.
> +	 *
> +	 * Returns:
> +	 * 0 on success, a negative error code on failure.
> +	 */
> +	int (*copy_to_vram)(struct drm_gpusvm *gpusvm,
> +			    struct page **pages,
> +			    dma_addr_t *dma_addr,
> +			    unsigned long npages);
> +
> +	/**
> +	 * @copy_to_sram: Copy to system RAM (required for
> migration)
> +	 * @gpusvm: Pointer to the GPU SVM
> +	 * @pages: Pointer to array of VRAM pages (source)
> +	 * @dma_addr: Pointer to array of DMA addresses
> (destination)
> +	 * @npages: Number of pages to copy
> +	 *
> +	 * This function shall copy pages to system RAM.
> +	 *
> +	 * Returns:
> +	 * 0 on success, a negative error code on failure.
> +	 */
> +	int (*copy_to_sram)(struct drm_gpusvm *gpusvm,
> +			    struct page **pages,
> +			    dma_addr_t *dma_addr,
> +			    unsigned long npages);
> +
> +	/**
> +	 * @invalidate: Invalidate GPU SVM notifier (required)
> +	 * @gpusvm: Pointer to the GPU SVM
> +	 * @notifier: Pointer to the GPU SVM notifier
> +	 * @mmu_range: Pointer to the mmu_notifier_range structure
> +	 *
> +	 * This function shall invalidate the GPU page tables. It
> can safely
> +	 * walk the notifier range RB tree/list in this function.
> Called while
> +	 * holding the notifier lock.
> +	 */
> +	void (*invalidate)(struct drm_gpusvm *gpusvm,
> +			   struct drm_gpusvm_notifier *notifier,
> +			   const struct mmu_notifier_range
> *mmu_range);
> +};
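> +
> +/*
> + * Editor's sketch (not part of this patch): a minimal ops table. Per the
> + * kerneldoc above, only .invalidate is required; the migration hooks are
> + * required only when VRAM migration is used. The driver_* callbacks named
> + * here are assumed to be implemented elsewhere by the driver.
> + */
> +static const struct drm_gpusvm_ops driver_gpusvm_ops = {
> +	.invalidate		= driver_invalidation,
> +	.vram_release		= driver_vram_release,
> +	.populate_vram_pfn	= driver_populate_vram_pfn,
> +	.copy_to_vram		= driver_copy_to_vram,
> +	.copy_to_sram		= driver_copy_to_sram,
> +};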
> +
> +/**
> + * struct drm_gpusvm_notifier - Structure representing a GPU SVM
> notifier
> + *
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: MMU interval notifier
> + * @interval: Interval for the notifier
> + * @rb: Red-black tree node for the parent GPU SVM structure
> notifier tree
> + * @root: Cached root node of the RB tree containing ranges
> + * @range_list: List head of ranges in the same order they appear in the
> + *              interval tree. This is useful to keep iterating over ranges
> + *              while doing modifications to the RB tree.
> + * @flags.removed: Flag indicating whether the MMU interval notifier
> has been
> + *                 removed
> + *
> + * This structure represents a GPU SVM notifier.
> + */
> +struct drm_gpusvm_notifier {
> +	struct drm_gpusvm *gpusvm;
> +	struct mmu_interval_notifier notifier;
> +	struct {
> +		u64 start;
> +		u64 end;
> +	} interval;
> +	struct {
> +		struct rb_node node;
> +		struct list_head entry;
> +		u64 __subtree_last;
> +	} rb;
> +	struct rb_root_cached root;
> +	struct list_head range_list;
> +	struct {
> +		u32 removed : 1;
> +	} flags;
> +};
> +
> +/**
> + * struct drm_gpusvm_range - Structure representing a GPU SVM range
> + *
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: Pointer to the GPU SVM notifier
> + * @refcount: Reference count for the range
> + * @rb: Red-black tree node for the parent GPU SVM notifier
> structure range tree
> + * @va: Virtual address range
> + * @notifier_seq: Notifier sequence number of the range's pages
> + * @pages: Pointer to the array of pages (if backing store is in
> VRAM)
> + * @dma_addr: DMA address array (if backing store is SRAM and DMA
> mapped)
> + * @vram_allocation: Driver-private pointer to the VRAM allocation
> + * @order: Order of dma mapping. i.e. PAGE_SIZE << order is mapping
> size
> + * @flags.migrate_vram: Flag indicating whether the range can be
> migrated to VRAM
> + * @flags.unmapped: Flag indicating if the range has been unmapped
> + * @flags.partial_unmap: Flag indicating if the range has been
> partially unmapped
> + * @flags.has_vram_pages: Flag indicating if the range has vram
> pages
> + * @flags.has_dma_mapping: Flag indicating if the range has a DMA
> mapping
> + * @flags.kfree_mapping: Flag indicating @dma_addr is a compact allocation
> + *                       based on @order, which is released via kfree()
> + *
> + * This structure represents a GPU SVM range used for tracking
> memory ranges
> + * mapped in a DRM device.
> + */
> +struct drm_gpusvm_range {
> +	struct drm_gpusvm *gpusvm;
> +	struct drm_gpusvm_notifier *notifier;
> +	struct kref refcount;
> +	struct {
> +		struct rb_node node;
> +		struct list_head entry;
> +		u64 __subtree_last;
> +	} rb;
> +	struct {
> +		u64 start;
> +		u64 end;
> +	} va;
> +	unsigned long notifier_seq;
> +	union {
> +		struct page **pages;
> +		dma_addr_t *dma_addr;
> +	};
> +	void *vram_allocation;
> +	u16 order;
> +	struct {
> +		/* All flags below must be set upon creation */
> +		u16 migrate_vram : 1;
> +		/* All flags below must be set / cleared under
> notifier lock */
> +		u16 unmapped : 1;
> +		u16 partial_unmap : 1;
> +		u16 has_vram_pages : 1;
> +		u16 has_dma_mapping : 1;
> +		u16 kfree_mapping : 1;
> +	} flags;
> +};
> +
> +/**
> + * struct drm_gpusvm - GPU SVM structure
> + *
> + * @name: Name of the GPU SVM
> + * @drm: Pointer to the DRM device structure
> + * @mm: Pointer to the mm_struct for the address space
> + * @device_private_page_owner: Device private pages owner
> + * @mm_start: Start address of GPU SVM
> + * @mm_range: Range of the GPU SVM
> + * @notifier_size: Size of individual notifiers
> + * @ops: Pointer to the operations structure for GPU SVM
> + * @chunk_sizes: Pointer to the array of chunk sizes used in range
> allocation.
> + *               Entries should be powers of 2 in descending order.
> + * @num_chunks: Number of chunks
> + * @notifier_lock: Read-write semaphore for protecting notifier
> operations
> + * @zdd_wq: Workqueue for deferred work on zdd destruction
> + * @root: Cached root node of the Red-Black tree containing GPU SVM
> notifiers
> + * @notifier_list: List head of notifiers in the same order they appear in
> + *                 the interval tree. This is useful to keep iterating over
> + *                 notifiers while doing modifications to the RB tree.
> + *
> + * This structure represents a GPU SVM (Shared Virtual Memory) used
> for tracking
> + * memory ranges mapped in a DRM (Direct Rendering Manager) device.
> + *
> + * No reference counting is provided, as this is expected to be
> embedded in the
> + * driver VM structure along with the struct drm_gpuvm, which
> handles reference
> + * counting.
> + */
> +struct drm_gpusvm {
> +	const char *name;
> +	struct drm_device *drm;
> +	struct mm_struct *mm;
> +	void *device_private_page_owner;
> +	u64 mm_start;
> +	u64 mm_range;
> +	u64 notifier_size;
> +	const struct drm_gpusvm_ops *ops;
> +	const u64 *chunk_sizes;
> +	int num_chunks;
> +	struct rw_semaphore notifier_lock;
> +	struct workqueue_struct *zdd_wq;
> +	struct rb_root_cached root;
> +	struct list_head notifier_list;
> +};
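> +
> +/*
> + * Editor's sketch (not part of this patch): as the kerneldoc above notes,
> + * struct drm_gpusvm is expected to be embedded in the driver's VM structure
> + * alongside struct drm_gpuvm, which provides the reference counting. The
> + * structure name and members below are illustrative assumptions.
> + */
> +struct driver_vm {
> +	struct drm_gpuvm gpuvm;		/* handles reference counting */
> +	struct drm_gpusvm svm;		/* embedded GPU SVM state */
> +	/* driver-specific members ... */
> +};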
> +
> +/**
> + * struct drm_gpusvm_ctx - DRM GPU SVM context
> + *
> + * @mmap_locked: mmap lock is locked
> + * @trylock_mmap: trylock the mmap lock, used to avoid locking inversions
> + *                (e.g. dma-resv -> mmap lock)
> + * @in_notifier: entering from an MMU notifier
> + * @read_only: operating on read-only memory
> + * @vram_possible: possible to use VRAM
> + * @prefault: prefault pages
> + *
> + * Context that DRM GPU SVM is operating in (i.e. user arguments).
> + */
> +struct drm_gpusvm_ctx {
> +	u32 mmap_locked :1;
> +	u32 trylock_mmap :1;
> +	u32 in_notifier :1;
> +	u32 read_only :1;
> +	u32 vram_possible :1;
> +	u32 prefault :1;
> +};
> +
> +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> +		    const char *name, struct drm_device *drm,
> +		    struct mm_struct *mm, void
> *device_private_page_owner,
> +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> +		    const struct drm_gpusvm_ops *ops,
> +		    const u64 *chunk_sizes, int num_chunks);
> +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm);
> +void drm_gpusvm_free(struct drm_gpusvm *gpusvm);
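> +
> +/*
> + * Editor's sketch (not part of this patch): initializing GPU SVM with chunk
> + * sizes in descending powers of two and the recommended 512M notifier size.
> + * The driver_vm structure, driver_gpusvm_ops table, use of the drm_device as
> + * page owner, and the 48-bit CPU VA range are illustrative assumptions.
> + */
> +static const u64 driver_chunk_sizes[] = { SZ_2M, SZ_64K, SZ_4K };
> +
> +static int driver_svm_init(struct driver_vm *vm, struct drm_device *drm)
> +{
> +	return drm_gpusvm_init(&vm->svm, "driver-svm", drm, current->mm,
> +			       drm /* device_private_page_owner */,
> +			       0, 1ull << 48 /* assumed CPU VA range */,
> +			       SZ_512M, &driver_gpusvm_ops,
> +			       driver_chunk_sizes,
> +			       ARRAY_SIZE(driver_chunk_sizes));
> +}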
> +
> +struct drm_gpusvm_range *
> +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64
> fault_addr,
> +				u64 gpuva_start, u64 gpuva_end,
> +				const struct drm_gpusvm_ctx *ctx);
> +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> +			     struct drm_gpusvm_range *range);
> +
> +struct drm_gpusvm_range *
> +drm_gpusvm_range_get(struct drm_gpusvm_range *range);
> +void drm_gpusvm_range_put(struct drm_gpusvm_range *range);
> +
> +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> +				  struct drm_gpusvm_range *range);
> +
> +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> +			       struct drm_gpusvm_range *range,
> +			       const struct drm_gpusvm_ctx *ctx);
> +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> +				  struct drm_gpusvm_range *range,
> +				  const struct drm_gpusvm_ctx *ctx);
> +
> +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> +			       struct drm_gpusvm_range *range,
> +			       void *vram_allocation,
> +			       const struct drm_gpusvm_ctx *ctx);
> +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> +			       struct drm_gpusvm_range *range,
> +			       const struct drm_gpusvm_ctx *ctx);
> +
> +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void);
> +
> +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start,
> u64 end);
> +
> +struct drm_gpusvm_range *
> +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64
> start, u64 end);
> +
> +/**
> + * drm_gpusvm_notifier_lock - Lock GPU SVM notifier
> + * @gpusvm__: Pointer to the GPU SVM structure.
> + *
> + * Abstraction over the GPU SVM notifier lock for client usage; takes the lock.
> + */
> +#define drm_gpusvm_notifier_lock(gpusvm__)	\
> +	down_read(&(gpusvm__)->notifier_lock)
> +
> +/**
> + * drm_gpusvm_notifier_unlock - Unlock GPU SVM notifier
> + * @gpusvm__: Pointer to the GPU SVM structure.
> + *
> + * Abstraction over the GPU SVM notifier lock for client usage; drops the lock.
> + */
> +#define drm_gpusvm_notifier_unlock(gpusvm__)	\
> +	up_read(&(gpusvm__)->notifier_lock)
> +
> +/**
> + * __drm_gpusvm_range_next - Get the next GPU SVM range in the list
> + * @range: a pointer to the current GPU SVM range
> + *
> + * Return: A pointer to the next drm_gpusvm_range if available, or
> NULL if the
> + *         current range is the last one or if the input range is
> NULL.
> + */
> +static inline struct drm_gpusvm_range *
> +__drm_gpusvm_range_next(struct drm_gpusvm_range *range)
> +{
> +	if (range && !list_is_last(&range->rb.entry,
> +				   &range->notifier->range_list))
> +		return list_next_entry(range, rb.entry);
> +
> +	return NULL;
> +}
> +
> +/**
> + * drm_gpusvm_for_each_range - Iterate over GPU SVM ranges in a
> notifier
> + * @range__: Iterator variable for the ranges. If set, it indicates
> the start of
> + *	     the iterator. If NULL, call drm_gpusvm_range_find() to
> get the range.
> + * @notifier__: Pointer to the GPU SVM notifier
> + * @start__: Start address of the range
> + * @end__: End address of the range
> + *
> + * This macro is used to iterate over GPU SVM ranges in a notifier.
> It is safe
> + * to use while holding the driver SVM lock or the notifier lock.
> + */
> +#define drm_gpusvm_for_each_range(range__, notifier__, start__,
> end__)	\
> +	for ((range__) = (range__)
> ?:					\
> +	     drm_gpusvm_range_find((notifier__), (start__),
> (end__));	\
> +	     (range__) && (range__->va.start <
> (end__));		\
> +	     (range__) = __drm_gpusvm_range_next(range__))
> +
> +/**
> + * drm_gpusvm_range_set_unmapped - Mark a GPU SVM range as unmapped
> + * @range: Pointer to the GPU SVM range structure.
> + * @mmu_range: Pointer to the MMU notifier range structure.
> + *
> + * This function marks a GPU SVM range as unmapped and sets the
> partial_unmap flag
> + * if the range partially falls within the provided MMU notifier
> range.
> + */
> +static inline void
> +drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range *range,
> +			      const struct mmu_notifier_range
> *mmu_range)
> +{
> +	lockdep_assert_held_write(&range->gpusvm->notifier_lock);
> +
> +	range->flags.unmapped = true;
> +	if (range->va.start < mmu_range->start ||
> +	    range->va.end > mmu_range->end)
> +		range->flags.partial_unmap = true;
> +}
> +
> +#endif /* __DRM_GPUSVM_H__ */
Daniel Vetter Aug. 29, 2024, 9:45 a.m. UTC | #8
On Tue, Aug 27, 2024 at 07:48:38PM -0700, Matthew Brost wrote:
> This patch introduces support for GPU Shared Virtual Memory (SVM) in the
> Direct Rendering Manager (DRM) subsystem. SVM allows for seamless
> sharing of memory between the CPU and GPU, enhancing performance and
> flexibility in GPU computing tasks.
> 
> The patch adds the necessary infrastructure for SVM, including data
> structures and functions for managing SVM ranges and notifiers. It also
> provides mechanisms for allocating, deallocating, and migrating memory
> regions between system RAM and GPU VRAM.
> 
> This mid-layer is largely inspired by GPUVM.
> 
> Cc: Dave Airlie <airlied@redhat.com>
> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> Cc: Christian König <christian.koenig@amd.com>
> Cc: <dri-devel@lists.freedesktop.org>
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>

Still not sure I've got the right race that you paper over with
mmap_write_lock, but I spotted a few things, comments inline.

> ---
>  drivers/gpu/drm/xe/Makefile     |    3 +-
>  drivers/gpu/drm/xe/drm_gpusvm.c | 2174 +++++++++++++++++++++++++++++++
>  drivers/gpu/drm/xe/drm_gpusvm.h |  415 ++++++
>  3 files changed, 2591 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.c
>  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.h
> 
> diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
> index b9670ae09a9e..b8fc2ee58f1a 100644
> --- a/drivers/gpu/drm/xe/Makefile
> +++ b/drivers/gpu/drm/xe/Makefile
> @@ -25,7 +25,8 @@ $(obj)/generated/%_wa_oob.c $(obj)/generated/%_wa_oob.h: $(obj)/xe_gen_wa_oob \
>  
>  # core driver code
>  
> -xe-y += xe_bb.o \
> +xe-y += drm_gpusvm.o \
> +	xe_bb.o \
>  	xe_bo.o \
>  	xe_bo_evict.o \
>  	xe_devcoredump.o \
> diff --git a/drivers/gpu/drm/xe/drm_gpusvm.c b/drivers/gpu/drm/xe/drm_gpusvm.c
> new file mode 100644
> index 000000000000..fc1e44e6ae72
> --- /dev/null
> +++ b/drivers/gpu/drm/xe/drm_gpusvm.c
> @@ -0,0 +1,2174 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2024 Intel Corporation
> + *
> + * Authors:
> + *     Matthew Brost <matthew.brost@intel.com>
> + */
> +
> +#include <linux/dma-mapping.h>
> +#include <linux/interval_tree_generic.h>
> +#include <linux/hmm.h>
> +#include <linux/memremap.h>
> +#include <linux/migrate.h>
> +#include <linux/mm_types.h>
> +#include <linux/pagemap.h>
> +#include <linux/slab.h>
> +
> +#include <drm/drm_device.h>
> +#include "drm_gpusvm.h"
> +
> +/**
> + * DOC: Overview
> + *
> + * GPU Shared Virtual Memory (GPU SVM) layer for the Direct Rendering Manager (DRM)
> + *
> + * The GPU SVM layer is a component of the DRM framework designed to manage shared
> + * virtual memory between the CPU and GPU. It enables efficient data exchange and
> + * processing for GPU-accelerated applications by allowing memory sharing and
> + * synchronization between the CPU's and GPU's virtual address spaces.
> + *
> + * Key GPU SVM Components:
> + * - Notifiers: Used for tracking memory intervals and notifying the
> + *		GPU of changes; notifiers are sized based on a GPU SVM
> + *		initialization parameter, with a recommendation of 512M or
> + *		larger. They maintain a Red-Black tree and a list of ranges that
> + *		fall within the notifier interval. Notifiers are tracked within
> + *		a GPU SVM Red-Black tree and list and are dynamically inserted
> + *		or removed as ranges within the interval are created or
> + *		destroyed.
> + * - Ranges: Represent memory ranges mapped in a DRM device and managed
> + *	     by GPU SVM. They are sized based on an array of chunk sizes, which
> + *	     is a GPU SVM initialization parameter, and the CPU address space.
> + *	     Upon GPU fault, the largest aligned chunk that fits within the
> + *	     faulting CPU address space is chosen for the range size. Ranges are
> + *	     expected to be dynamically allocated on GPU fault and removed on an
> + *	     MMU notifier UNMAP event. As mentioned above, ranges are tracked in
> + *	     a notifier's Red-Black tree.
> + * - Operations: Define the interface for driver-specific SVM operations such as
> + *		 allocation, page collection, migration, invalidations, and VRAM
> + *		 release.
> + *
> + * This layer provides interfaces for allocating, mapping, migrating, and
> + * releasing memory ranges between the CPU and GPU. It handles all core memory
> + * management interactions (DMA mapping, HMM, and migration) and provides
> + * driver-specific virtual functions (vfuncs). This infrastructure is sufficient
> + * to build the expected driver components for an SVM implementation as detailed
> + * below.
> + *
> + * Expected Driver Components:
> + * - GPU page fault handler: Used to create ranges and notifiers based on the
> + *			     fault address, optionally migrate the range to
> + *			     VRAM, and create GPU bindings.
> + * - Garbage collector: Used to destroy GPU bindings for ranges. Ranges are
> + *			expected to be added to the garbage collector upon
> + *			MMU_NOTIFY_UNMAP event.
> + */
> +
> +/**
> + * DOC: Locking
> + *
> + * GPU SVM handles locking for core MM interactions, i.e., it locks/unlocks the
> + * mmap lock as needed. Alternatively, if the driver prefers to handle the mmap
> + * lock itself, a 'locked' argument is provided to the functions that require
> + * the mmap lock. This option may be useful for drivers that need to call into
> + * GPU SVM while also holding a dma-resv lock, thus preventing locking
> + * inversions between the mmap and dma-resv locks.
> + *
> + * GPU SVM introduces a global notifier lock, which safeguards the notifier's
> + * range RB tree and list, as well as the range's DMA mappings and sequence
> + * number. GPU SVM manages all necessary locking and unlocking operations,
> + * except for the recheck of the range's sequence number
> + * (mmu_interval_read_retry) when the driver is committing GPU bindings. This
> + * lock corresponds to the 'driver->update' lock mentioned in the HMM
> + * documentation (TODO: Link). Future revisions may transition from a GPU SVM
> + * global lock to a per-notifier lock if finer-grained locking is deemed
> + * necessary.
> + *
> + * In addition to the locking mentioned above, the driver should implement a
> + * lock to safeguard core GPU SVM function calls that modify state, such as
> + * drm_gpusvm_range_find_or_insert and drm_gpusvm_range_remove. Alternatively,
> + * these core functions can be called within a single kernel thread, for
> + * instance, using an ordered work queue. This lock is denoted as
> + * 'driver_svm_lock' in code examples.

I think this doesn't work, because essentially it forces a single threaded
design. Core mm isn't single threaded, and you cannot lock them all out,
at least not easily.

So I think a design requirement is that gpusvm can cope with migrations to
ram due to cpu faults, migrations for other reasons, gpu fault handling
all concurrently. Currently with the combo of driver_svm_lock + taking
mmap_write_lock you serialize this all a lot, which I think is hiding
design bugs.

> + */
> +
> +/**
> + * DOC: Migration
> + *
> + * The migration support is quite simple, allowing migration between SRAM and
> + * VRAM at the range granularity. For example, GPU SVM currently does not
> + * support mixing SRAM and VRAM pages within a range. This means that upon GPU
> + * fault, the entire range can be migrated to VRAM, and upon CPU fault, the
> + * entire range is migrated to SRAM.
> + *
> + * The reasoning for only supporting range granularity is as follows: it
> + * simplifies the implementation, and range sizes are driver-defined and should
> + * be relatively small.
> + */
> +
> +/**
> + * DOC: Partial Unmapping of Ranges
> + *
> + * Partial unmapping of ranges (e.g., 1M out of 2M is unmapped by CPU resulting
> + * in MMU_NOTIFY_UNMAP event) presents several challenges, with the main one
> + * being that a subset of the range still has CPU and GPU mappings. If the
> + * backing store for the range is in VRAM, a subset of the backing store has
> + * references. One option would be to split the range and VRAM backing store,
> + * but the implementation for this would be quite complicated. Given that
> + * partial unmappings are rare and driver-defined range sizes are relatively
> + * small, GPU SVM does not support splitting of ranges.
> + *
> + * With no support for range splitting, upon partial unmapping of a range, the
> + * driver is expected to invalidate and destroy the entire range. If the range
> + * has VRAM as its backing, the driver is also expected to migrate any remaining
> + * pages back to SRAM.
> + */
> +
> +/**
> + * DOC: Examples
> + *
> + * This section provides two examples of how to build the expected driver
> + * components: the GPU page fault handler and the garbage collector. A third
> + * example demonstrates a sample invalidation driver vfunc.
> + *
> + * The generic code provided does not include logic for complex migration
> + * policies, optimized invalidations, or other potentially required driver
> + * locking (e.g., DMA-resv locks).
> + *
> + * 1) GPU page fault handler
> + *
> + *	int driver_bind_range(struct drm_gpusvm *gpusvm, struct drm_gpusvm_range *range)
> + *	{
> + *		int err = 0;
> + *
> + *		driver_alloc_and_setup_memory_for_bind(gpusvm, range);
> + *
> + *		drm_gpusvm_notifier_lock(gpusvm);
> + *		if (drm_gpusvm_range_pages_valid(range))
> + *			driver_commit_bind(gpusvm, range);
> + *		else
> + *			err = -EAGAIN;
> + *		drm_gpusvm_notifier_unlock(gpusvm);
> + *
> + *		return err;
> + *	}
> + *
> + *	int driver_gpu_fault(struct drm_gpusvm *gpusvm, u64 fault_addr,
> + *			     u64 gpuva_start, u64 gpuva_end)
> + *	{
> + *		struct drm_gpusvm_ctx ctx = {};
> + *		int err;
> + *
> + *		driver_svm_lock();
> + *	retry:
> + *		// Always process UNMAPs first so view of GPU SVM ranges is current
> + *		driver_garbage_collector(gpusvm);
> + *
> + *		range = drm_gpusvm_range_find_or_insert(gpusvm, fault_addr,
> + *							gpuva_start, gpuva_end,
> + *						        &ctx);
> + *		if (IS_ERR(range)) {
> + *			err = PTR_ERR(range);
> + *			goto unlock;
> + *		}
> + *
> + *		if (driver_migration_policy(range)) {
> + *			bo = driver_alloc_bo();
> + *			err = drm_gpusvm_migrate_to_vram(gpusvm, range, bo, &ctx);
> + *			if (err)	// CPU mappings may have changed
> + *				goto retry;
> + *		}
> + *
> + *		err = drm_gpusvm_range_get_pages(gpusvm, range, &ctx);
> + *		if (err == -EFAULT || err == -EPERM)	// CPU mappings changed
> + *			goto retry;
> + *		else if (err)
> + *			goto unlock;
> + *
> + *		err = driver_bind_range(gpusvm, range);
> + *		if (err == -EAGAIN)	// CPU mappings changed
> + *			goto retry;
> + *
> + *	unlock:
> + *		driver_svm_unlock();
> + *		return err;
> + *	}
> + *
> + * 2) Garbage Collector.
> + *
> + *	void __driver_garbage_collector(struct drm_gpusvm *gpusvm,
> + *					struct drm_gpusvm_range *range)
> + *	{
> + *		struct drm_gpusvm_ctx ctx = {};
> + *
> + *		assert_driver_svm_locked(gpusvm);
> + *
> + *		// Partial unmap, migrate any remaining VRAM pages back to SRAM
> + *		if (range->flags.partial_unmap)
> + *			drm_gpusvm_migrate_to_sram(gpusvm, range, &ctx);

Note that the migration back to sram isn't guaranteed to succeed, so you
might still be stuck with a partially migrated range. This might be a case
where hmm gives you vram pfns, but the range you have doesn't have any
vram allocation anymore because you dropped it here.

> + *
> + *		driver_unbind_range(range);
> + *		drm_gpusvm_range_remove(gpusvm, range);
> + *	}
> + *
> + *	void driver_garbage_collector(struct drm_gpusvm *gpusvm)
> + *	{
> + *		assert_driver_svm_locked(gpusvm);
> + *
> + *		for_each_range_in_garbage_collector(gpusvm, range)
> + *			__driver_garbage_collector(gpusvm, range);
> + *	}
> + *
> + * 3) Invalidation driver vfunc.
> + *
> + *	void driver_invalidation(struct drm_gpusvm *gpusvm,
> + *				 struct drm_gpusvm_notifier *notifier,
> + *				 const struct mmu_notifier_range *mmu_range)
> + *	{
> + *		struct drm_gpusvm_ctx ctx = { .in_notifier = true, };
> + *		struct drm_gpusvm_range *range = NULL;
> + *
> + *		driver_invalidate_device_tlb(gpusvm, mmu_range->start, mmu_range->end);
> + *
> + *		drm_gpusvm_for_each_range(range, notifier, mmu_range->start,
> + *					  mmu_range->end) {
> + *			drm_gpusvm_range_unmap_pages(gpusvm, range, &ctx);
> + *
> + *			if (mmu_range->event != MMU_NOTIFY_UNMAP)
> + *				continue;
> + *
> + *			drm_gpusvm_range_set_unmapped(range, mmu_range);
> + *			driver_garbage_collector_add(gpusvm, range);
> + *		}
> + *	}
> + */
> +
> +#define DRM_GPUSVM_RANGE_START(_range)	((_range)->va.start)
> +#define DRM_GPUSVM_RANGE_END(_range)	((_range)->va.end - 1)
> +INTERVAL_TREE_DEFINE(struct drm_gpusvm_range, rb.node, u64, rb.__subtree_last,
> +		     DRM_GPUSVM_RANGE_START, DRM_GPUSVM_RANGE_END,
> +		     static __maybe_unused, range);
> +
> +#define DRM_GPUSVM_NOTIFIER_START(_notifier)	((_notifier)->interval.start)
> +#define DRM_GPUSVM_NOTIFIER_END(_notifier)	((_notifier)->interval.end - 1)
> +INTERVAL_TREE_DEFINE(struct drm_gpusvm_notifier, rb.node, u64,
> +		     rb.__subtree_last, DRM_GPUSVM_NOTIFIER_START,
> +		     DRM_GPUSVM_NOTIFIER_END, static __maybe_unused, notifier);
> +
> +/**
> + * npages_in_range() - Calculate the number of pages in a given range
> + * @start__: The start address of the range
> + * @end__: The end address of the range
> + *
> + * This macro calculates the number of pages in a given memory range,
> + * specified by the start and end addresses. It divides the difference
> + * between the end and start addresses by the page size (PAGE_SIZE) to
> + * determine the number of pages in the range.
> + *
> + * Return: The number of pages in the specified range.
> + */
> +#define npages_in_range(start__, end__)	\
> +	(((end__) - (start__)) >> PAGE_SHIFT)
> +
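
As a quick arithmetic check: with 4 KiB pages (PAGE_SHIFT == 12),
npages_in_range(0x200000, 0x400000) evaluates to 0x200000 >> 12 == 512.
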
> +/**
> + * struct drm_gpusvm_zdd - GPU SVM zone device data
> + *
> + * @refcount: Reference count for the zdd
> + * @destroy_work: Work structure for asynchronous zdd destruction
> + * @range: Pointer to the GPU SVM range
> + * @vram_allocation: Driver-private pointer to the VRAM allocation
> + *
> + * This structure serves as a generic wrapper installed in
> + * page->zone_device_data. It provides infrastructure for looking up a range
> + * upon CPU page fault and asynchronously releasing VRAM once the CPU has no
> + * page references. Asynchronous release is useful because CPU page references
> + * can be dropped in IRQ contexts, while releasing VRAM likely requires sleeping
> + * locks.
> + */
> +struct drm_gpusvm_zdd {
> +	struct kref refcount;
> +	struct work_struct destroy_work;
> +	struct drm_gpusvm_range *range;
> +	void *vram_allocation;
> +};
> +
> +/**
> + * drm_gpusvm_zdd_destroy_work_func - Work function for destroying a zdd
> + * @w: Pointer to the work_struct
> + *
> + * This function releases VRAM, puts GPU SVM range, and frees zdd.
> + */
> +static void drm_gpusvm_zdd_destroy_work_func(struct work_struct *w)
> +{
> +	struct drm_gpusvm_zdd *zdd =
> +		container_of(w, struct drm_gpusvm_zdd, destroy_work);
> +	struct drm_gpusvm_range *range = zdd->range;
> +	struct drm_gpusvm *gpusvm = range->gpusvm;
> +
> +	if (gpusvm->ops->vram_release && zdd->vram_allocation)
> +		gpusvm->ops->vram_release(zdd->vram_allocation);
> +	drm_gpusvm_range_put(range);
> +	kfree(zdd);
> +}
> +
> +/**
> + * drm_gpusvm_zdd_alloc - Allocate a zdd structure.
> + * @range: Pointer to the GPU SVM range.
> + *
> + * This function allocates and initializes a new zdd structure. It sets up the
> + * reference count, initializes the destroy work, and links the provided GPU SVM
> + * range.
> + *
> + * Returns:
> + * Pointer to the allocated zdd on success, NULL on failure.
> + */
> +static struct drm_gpusvm_zdd *
> +drm_gpusvm_zdd_alloc(struct drm_gpusvm_range *range)
> +{
> +	struct drm_gpusvm_zdd *zdd;
> +
> +	zdd = kmalloc(sizeof(*zdd), GFP_KERNEL);
> +	if (!zdd)
> +		return NULL;
> +
> +	kref_init(&zdd->refcount);
> +	INIT_WORK(&zdd->destroy_work, drm_gpusvm_zdd_destroy_work_func);
> +	zdd->range = drm_gpusvm_range_get(range);
> +	zdd->vram_allocation = NULL;
> +
> +	return zdd;
> +}
> +
> +/**
> + * drm_gpusvm_zdd_get - Get a reference to a zdd structure.
> + * @zdd: Pointer to the zdd structure.
> + *
> + * This function increments the reference count of the provided zdd structure.
> + *
> + * Returns: Pointer to the zdd structure.
> + */
> +static struct drm_gpusvm_zdd *drm_gpusvm_zdd_get(struct drm_gpusvm_zdd *zdd)
> +{
> +	kref_get(&zdd->refcount);
> +	return zdd;
> +}
> +
> +/**
> + * drm_gpusvm_zdd_destroy - Destroy a zdd structure.
> + * @ref: Pointer to the reference count structure.
> + *
> + * This function queues the destroy_work of the zdd for asynchronous destruction.
> + */
> +static void drm_gpusvm_zdd_destroy(struct kref *ref)
> +{
> +	struct drm_gpusvm_zdd *zdd =
> +		container_of(ref, struct drm_gpusvm_zdd, refcount);
> +	struct drm_gpusvm *gpusvm = zdd->range->gpusvm;
> +
> +	queue_work(gpusvm->zdd_wq, &zdd->destroy_work);
> +}
> +
> +/**
> + * drm_gpusvm_zdd_put - Put a zdd reference.
> + * @zdd: Pointer to the zdd structure.
> + *
> + * This function decrements the reference count of the provided zdd structure
> + * and schedules its destruction if the count drops to zero.
> + */
> +static void drm_gpusvm_zdd_put(struct drm_gpusvm_zdd *zdd)
> +{
> +	kref_put(&zdd->refcount, drm_gpusvm_zdd_destroy);
> +}
> +
> +/**
> + * drm_gpusvm_range_find - Find GPU SVM range from GPU SVM notifier
> + * @notifier: Pointer to the GPU SVM notifier structure.
> + * @start: Start address of the range
> + * @end: End address of the range
> + *
> + * Return: A pointer to the drm_gpusvm_range if found or NULL
> + */
> +struct drm_gpusvm_range *
> +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64 start, u64 end)
> +{
> +	return range_iter_first(&notifier->root, start, end - 1);
> +}
> +
> +/**
> + * drm_gpusvm_for_each_range_safe - Safely iterate over GPU SVM ranges in a notifier
> + * @range__: Iterator variable for the ranges
> + * @next__: Iterator variable for the ranges temporary storage
> + * @notifier__: Pointer to the GPU SVM notifier
> + * @start__: Start address of the range
> + * @end__: End address of the range
> + *
> + * This macro is used to iterate over GPU SVM ranges in a notifier while
> + * removing ranges from it.
> + */
> +#define drm_gpusvm_for_each_range_safe(range__, next__, notifier__, start__, end__)	\
> +	for ((range__) = drm_gpusvm_range_find((notifier__), (start__), (end__)),	\
> +	     (next__) = __drm_gpusvm_range_next(range__);				\
> +	     (range__) && (range__->va.start < (end__));				\
> +	     (range__) = (next__), (next__) = __drm_gpusvm_range_next(range__))
> +
> +/**
> + * __drm_gpusvm_notifier_next - get the next drm_gpusvm_notifier in the list
> + * @notifier: a pointer to the current drm_gpusvm_notifier
> + *
> + * Return: A pointer to the next drm_gpusvm_notifier if available, or NULL if
> + *         the current notifier is the last one or if the input notifier is
> + *         NULL.
> + */
> +static struct drm_gpusvm_notifier *
> +__drm_gpusvm_notifier_next(struct drm_gpusvm_notifier *notifier)
> +{
> +	if (notifier && !list_is_last(&notifier->rb.entry,
> +				      &notifier->gpusvm->notifier_list))
> +		return list_next_entry(notifier, rb.entry);
> +
> +	return NULL;
> +}
> +
> +/**
> + * drm_gpusvm_for_each_notifier - Iterate over GPU SVM notifiers in a gpusvm
> + * @notifier__: Iterator variable for the notifiers
> + * @gpusvm__: Pointer to the GPU SVM structure
> + * @start__: Start address of the notifier
> + * @end__: End address of the notifier
> + *
> + * This macro is used to iterate over GPU SVM notifiers in a gpusvm.
> + */
> +#define drm_gpusvm_for_each_notifier(notifier__, gpusvm__, start__, end__)		\
> +	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root, (start__), (end__) - 1);	\
> +	     (notifier__) && (notifier__->interval.start < (end__));			\
> +	     (notifier__) = __drm_gpusvm_notifier_next(notifier__))
> +
> +/**
> + * drm_gpusvm_for_each_notifier_safe - Safely iterate over GPU SVM notifiers in a gpusvm
> + * @notifier__: Iterator variable for the notifiers
> + * @next__: Iterator variable for the notifiers temporary storage
> + * @gpusvm__: Pointer to the GPU SVM structure
> + * @start__: Start address of the notifier
> + * @end__: End address of the notifier
> + *
> + * This macro is used to iterate over GPU SVM notifiers in a gpusvm while
> + * removing notifiers from it.
> + */
> +#define drm_gpusvm_for_each_notifier_safe(notifier__, next__, gpusvm__, start__, end__)	\
> +	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root, (start__), (end__) - 1),	\
> +	     (next__) = __drm_gpusvm_notifier_next(notifier__);				\
> +	     (notifier__) && (notifier__->interval.start < (end__));			\
> +	     (notifier__) = (next__), (next__) = __drm_gpusvm_notifier_next(notifier__))
> +
> +/**
> + * drm_gpusvm_notifier_invalidate - Invalidate a GPU SVM notifier.
> + * @mni: Pointer to the mmu_interval_notifier structure.
> + * @mmu_range: Pointer to the mmu_notifier_range structure.
> + * @cur_seq: Current sequence number.
> + *
> + * This function serves as a generic MMU notifier for GPU SVM. It sets the MMU
> + * notifier sequence number and calls the driver invalidate vfunc under
> + * gpusvm->notifier_lock.
> + *
> + * Returns:
> + * true if the operation succeeds, false otherwise.
> + */
> +static bool
> +drm_gpusvm_notifier_invalidate(struct mmu_interval_notifier *mni,
> +			       const struct mmu_notifier_range *mmu_range,
> +			       unsigned long cur_seq)
> +{
> +	struct drm_gpusvm_notifier *notifier =
> +		container_of(mni, typeof(*notifier), notifier);
> +	struct drm_gpusvm *gpusvm = notifier->gpusvm;
> +
> +	if (!mmu_notifier_range_blockable(mmu_range))
> +		return false;
> +
> +	down_write(&gpusvm->notifier_lock);
> +	mmu_interval_set_seq(mni, cur_seq);
> +	gpusvm->ops->invalidate(gpusvm, notifier, mmu_range);
> +	up_write(&gpusvm->notifier_lock);
> +
> +	return true;
> +}
> +
> +/**
> + * drm_gpusvm_notifier_ops - MMU interval notifier operations for GPU SVM
> + */
> +static const struct mmu_interval_notifier_ops drm_gpusvm_notifier_ops = {
> +	.invalidate = drm_gpusvm_notifier_invalidate,
> +};
> +
> +/**
> + * drm_gpusvm_init - Initialize the GPU SVM.
> + * @gpusvm: Pointer to the GPU SVM structure.
> + * @name: Name of the GPU SVM.
> + * @drm: Pointer to the DRM device structure.
> + * @mm: Pointer to the mm_struct for the address space.
> + * @device_private_page_owner: Device private pages owner.
> + * @mm_start: Start address of GPU SVM.
> + * @mm_range: Range of the GPU SVM.
> + * @notifier_size: Size of individual notifiers.
> + * @ops: Pointer to the operations structure for GPU SVM.
> + * @chunk_sizes: Pointer to the array of chunk sizes used in range allocation.
> + *               Entries should be powers of 2 in descending order with last
> + *               entry being SZ_4K.
> + * @num_chunks: Number of chunks.
> + *
> + * This function initializes the GPU SVM.
> + *
> + * Returns:
> + * 0 on success, a negative error code on failure.
> + */
> +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> +		    const char *name, struct drm_device *drm,
> +		    struct mm_struct *mm, void *device_private_page_owner,
> +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> +		    const struct drm_gpusvm_ops *ops,
> +		    const u64 *chunk_sizes, int num_chunks)
> +{
> +	if (!ops->invalidate || !num_chunks)
> +		return -EINVAL;
> +
> +	gpusvm->name = name;
> +	gpusvm->drm = drm;
> +	gpusvm->mm = mm;
> +	gpusvm->device_private_page_owner = device_private_page_owner;
> +	gpusvm->mm_start = mm_start;
> +	gpusvm->mm_range = mm_range;
> +	gpusvm->notifier_size = notifier_size;
> +	gpusvm->ops = ops;
> +	gpusvm->chunk_sizes = chunk_sizes;
> +	gpusvm->num_chunks = num_chunks;
> +	gpusvm->zdd_wq = system_wq;
> +
> +	mmgrab(mm);
> +	gpusvm->root = RB_ROOT_CACHED;
> +	INIT_LIST_HEAD(&gpusvm->notifier_list);
> +
> +	init_rwsem(&gpusvm->notifier_lock);
> +
> +	fs_reclaim_acquire(GFP_KERNEL);
> +	might_lock(&gpusvm->notifier_lock);
> +	fs_reclaim_release(GFP_KERNEL);
> +
> +	return 0;
> +}
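
A hedged usage sketch for reference; the vm/xe containers, the owner cookie
and driver_gpusvm_ops are made-up driver-side names here, and the chunk sizes
just follow the descending-powers-of-2-ending-in-SZ_4K rule documented above:

	static const u64 chunk_sizes[] = { SZ_2M, SZ_64K, SZ_4K };
	int err;

	err = drm_gpusvm_init(&vm->svm, "driver-svm", &xe->drm, current->mm,
			      owner, 0, 1ull << 47, SZ_2M,
			      &driver_gpusvm_ops, chunk_sizes,
			      ARRAY_SIZE(chunk_sizes));
	if (err)
		return err;
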
> +
> +/**
> + * drm_gpusvm_notifier_find - Find GPU SVM notifier
> + * @gpusvm__: Pointer to the GPU SVM structure
> + * @fault_addr__: Fault address
> + *
> + * This macro finds the GPU SVM notifier associated with the fault address.
> + *
> + * Returns:
> + * Pointer to the GPU SVM notifier on success, NULL otherwise.
> + */
> +#define drm_gpusvm_notifier_find(gpusvm__, fault_addr__)	\
> +	notifier_iter_first(&(gpusvm__)->root, (fault_addr__),	\
> +			    ((fault_addr__) + 1))
> +
> +/**
> + * to_drm_gpusvm_notifier - retrieve the container struct for a given rbtree node
> + * @node__: a pointer to the rbtree node embedded within a drm_gpusvm_notifier struct
> + *
> + * Return: A pointer to the containing drm_gpusvm_notifier structure.
> + */
> +#define to_drm_gpusvm_notifier(node__)				\
> +	container_of((node__), struct drm_gpusvm_notifier, rb.node)
> +
> +/**
> + * drm_gpusvm_notifier_insert - Insert GPU SVM notifier
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: Pointer to the GPU SVM notifier structure
> + *
> + * This function inserts the GPU SVM notifier into the GPU SVM RB tree and list.
> + */
> +static void drm_gpusvm_notifier_insert(struct drm_gpusvm *gpusvm,
> +				       struct drm_gpusvm_notifier *notifier)
> +{
> +	struct rb_node *node;
> +	struct list_head *head;
> +
> +	notifier_insert(notifier, &gpusvm->root);
> +
> +	node = rb_prev(&notifier->rb.node);
> +	if (node)
> +		head = &(to_drm_gpusvm_notifier(node))->rb.entry;
> +	else
> +		head = &gpusvm->notifier_list;
> +
> +	list_add(&notifier->rb.entry, head);
> +}
> +
> +/**
> + * drm_gpusvm_notifier_remove - Remove GPU SVM notifier
> + * @gpusvm__: Pointer to the GPU SVM structure
> + * @notifier__: Pointer to the GPU SVM notifier structure
> + *
> + * This macro removes the GPU SVM notifier from the GPU SVM RB tree and list.
> + */
> +#define drm_gpusvm_notifier_remove(gpusvm__, notifier__)	\
> +	notifier_remove((notifier__), &(gpusvm__)->root);	\
> +	list_del(&(notifier__)->rb.entry)
> +
> +/**
> + * drm_gpusvm_fini - Finalize the GPU SVM.
> + * @gpusvm: Pointer to the GPU SVM structure.
> + *
> + * This function finalizes the GPU SVM by cleaning up any remaining ranges and
> + * notifiers, and dropping a reference to the mm_struct.
> + */
> +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm)
> +{
> +	struct drm_gpusvm_notifier *notifier, *next;
> +
> +	drm_gpusvm_for_each_notifier_safe(notifier, next, gpusvm, 0, LONG_MAX) {
> +		struct drm_gpusvm_range *range, *__next;
> +
> +		/*
> +		 * Remove notifier first to avoid racing with any invalidation
> +		 */
> +		mmu_interval_notifier_remove(&notifier->notifier);
> +		notifier->flags.removed = true;
> +
> +		drm_gpusvm_for_each_range_safe(range, __next, notifier, 0,
> +					       LONG_MAX)
> +			drm_gpusvm_range_remove(gpusvm, range);
> +	}
> +
> +	mmdrop(gpusvm->mm);
> +	WARN_ON(!RB_EMPTY_ROOT(&gpusvm->root.rb_root));
> +}
> +
> +/**
> + * drm_gpusvm_notifier_alloc - Allocate GPU SVM notifier
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @fault_addr: Fault address
> + *
> + * This function allocates and initializes the GPU SVM notifier structure.
> + *
> + * Returns:
> + * Pointer to the allocated GPU SVM notifier on success, ERR_PTR() on failure.
> + */
> +static struct drm_gpusvm_notifier *
> +drm_gpusvm_notifier_alloc(struct drm_gpusvm *gpusvm, u64 fault_addr)
> +{
> +	struct drm_gpusvm_notifier *notifier;
> +
> +	if (gpusvm->ops->notifier_alloc)
> +		notifier = gpusvm->ops->notifier_alloc();
> +	else
> +		notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
> +
> +	if (!notifier)
> +		return ERR_PTR(-ENOMEM);
> +
> +	notifier->gpusvm = gpusvm;
> +	notifier->interval.start = ALIGN_DOWN(fault_addr, gpusvm->notifier_size);
> +	notifier->interval.end = ALIGN(fault_addr + 1, gpusvm->notifier_size);
> +	INIT_LIST_HEAD(&notifier->rb.entry);
> +	notifier->root = RB_ROOT_CACHED;
> +	INIT_LIST_HEAD(&notifier->range_list);
> +
> +	return notifier;
> +}
> +
> +/**
> + * drm_gpusvm_notifier_free - Free GPU SVM notifier
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: Pointer to the GPU SVM notifier structure
> + *
> + * This function frees the GPU SVM notifier structure.
> + */
> +static void drm_gpusvm_notifier_free(struct drm_gpusvm *gpusvm,
> +				     struct drm_gpusvm_notifier *notifier)
> +{
> +	WARN_ON(!RB_EMPTY_ROOT(&notifier->root.rb_root));
> +
> +	if (gpusvm->ops->notifier_free)
> +		gpusvm->ops->notifier_free(notifier);
> +	else
> +		kfree(notifier);
> +}
> +
> +/**
> + * to_drm_gpusvm_range - retrieve the container struct for a given rbtree node
> + * @node__: a pointer to the rbtree node embedded within a drm_gpusvm_range struct
> + *
> + * Return: A pointer to the containing drm_gpusvm_range structure.
> + */
> +#define to_drm_gpusvm_range(node__)	\
> +	container_of((node__), struct drm_gpusvm_range, rb.node)
> +
> +/**
> + * drm_gpusvm_range_insert - Insert GPU SVM range
> + * @notifier: Pointer to the GPU SVM notifier structure
> + * @range: Pointer to the GPU SVM range structure
> + *
> + * This function inserts the GPU SVM range into the notifier RB tree and list.
> + */
> +static void drm_gpusvm_range_insert(struct drm_gpusvm_notifier *notifier,
> +				    struct drm_gpusvm_range *range)
> +{
> +	struct rb_node *node;
> +	struct list_head *head;
> +
> +	drm_gpusvm_notifier_lock(notifier->gpusvm);
> +	range_insert(range, &notifier->root);
> +
> +	node = rb_prev(&range->rb.node);
> +	if (node)
> +		head = &(to_drm_gpusvm_range(node))->rb.entry;
> +	else
> +		head = &notifier->range_list;
> +
> +	list_add(&range->rb.entry, head);
> +	drm_gpusvm_notifier_unlock(notifier->gpusvm);
> +}
> +
> +/**
> + * __drm_gpusvm_range_remove - Remove GPU SVM range
> + * @notifier__: Pointer to the GPU SVM notifier structure
> + * @range__: Pointer to the GPU SVM range structure
> + *
> + * This macro removes the GPU SVM range from the notifier RB tree and list.
> + */
> +#define __drm_gpusvm_range_remove(notifier__, range__)		\
> +	range_remove((range__), &(notifier__)->root);		\
> +	list_del(&(range__)->rb.entry)
> +
> +/**
> + * drm_gpusvm_range_alloc - Allocate GPU SVM range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: Pointer to the GPU SVM notifier structure
> + * @fault_addr: Fault address
> + * @chunk_size: Chunk size
> + * @migrate_vram: Flag indicating whether to migrate VRAM
> + *
> + * This function allocates and initializes the GPU SVM range structure.
> + *
> + * Returns:
> + * Pointer to the allocated GPU SVM range on success, ERR_PTR() on failure.
> + */
> +static struct drm_gpusvm_range *
> +drm_gpusvm_range_alloc(struct drm_gpusvm *gpusvm,
> +		       struct drm_gpusvm_notifier *notifier,
> +		       u64 fault_addr, u64 chunk_size, bool migrate_vram)
> +{
> +	struct drm_gpusvm_range *range;
> +
> +	if (gpusvm->ops->range_alloc)
> +		range = gpusvm->ops->range_alloc(gpusvm);
> +	else
> +		range = kzalloc(sizeof(*range), GFP_KERNEL);
> +
> +	if (!range)
> +		return ERR_PTR(-ENOMEM);
> +
> +	kref_init(&range->refcount);
> +	range->gpusvm = gpusvm;
> +	range->notifier = notifier;
> +	range->va.start = ALIGN_DOWN(fault_addr, chunk_size);
> +	range->va.end = ALIGN(fault_addr + 1, chunk_size);
> +	INIT_LIST_HEAD(&range->rb.entry);
> +	range->notifier_seq = LONG_MAX;
> +	range->flags.migrate_vram = migrate_vram ? 1 : 0;
> +
> +	return range;
> +}
> +
> +/**
> + * drm_gpusvm_check_pages - Check pages
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: Pointer to the GPU SVM notifier structure
> + * @start: Start address
> + * @end: End address
> + *
> + * Check if pages between start and end have been faulted in on the CPU. Used
> + * to prevent migration of pages without a CPU backing store.
> + *
> + * Returns:
> + * True if pages have been faulted into CPU, False otherwise
> + */
> +static bool drm_gpusvm_check_pages(struct drm_gpusvm *gpusvm,
> +				   struct drm_gpusvm_notifier *notifier,
> +				   u64 start, u64 end)
> +{
> +	struct hmm_range hmm_range = {
> +		.default_flags = 0,
> +		.notifier = &notifier->notifier,
> +		.start = start,
> +		.end = end,
> +		.dev_private_owner = gpusvm->device_private_page_owner,
> +	};
> +	unsigned long timeout =
> +		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> +	unsigned long *pfns;
> +	unsigned long npages = npages_in_range(start, end);
> +	int err, i;
> +
> +	mmap_assert_locked(gpusvm->mm);
> +
> +	pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
> +	if (!pfns)
> +		return false;
> +
> +	hmm_range.notifier_seq = mmu_interval_read_begin(&notifier->notifier);
> +	hmm_range.hmm_pfns = pfns;
> +
> +	while (true) {
> +		err = hmm_range_fault(&hmm_range);
> +		if (err == -EBUSY) {
> +			if (time_after(jiffies, timeout))
> +				break;
> +
> +			hmm_range.notifier_seq = mmu_interval_read_begin(&notifier->notifier);
> +			continue;
> +		}
> +		break;
> +	}
> +	if (err)
> +		goto err_free;
> +
> +	for (i = 0; i < npages; ++i) {
> +		if (!(pfns[i] & HMM_PFN_VALID)) {
> +			err = -EFAULT;
> +			goto err_free;
> +		}
> +	}
> +
> +err_free:
> +	kvfree(pfns);
> +	return err ? false : true;
> +}
> +
> +/**
> + * drm_gpusvm_range_chunk_size - Determine chunk size for GPU SVM range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: Pointer to the GPU SVM notifier structure
> + * @vas: Pointer to the virtual memory area structure
> + * @fault_addr: Fault address
> + * @gpuva_start: Start address of GPUVA which mirrors CPU
> + * @gpuva_end: End address of GPUVA which mirrors CPU
> + * @check_pages: Flag indicating whether to check pages
> + *
> + * This function determines the chunk size for the GPU SVM range based on the
> + * fault address, GPU SVM chunk sizes, existing GPU SVM ranges, and the virtual
> + * memory area boundaries.
> + *
> + * Returns:
> + * Chunk size on success, LONG_MAX on failure.
> + */
> +static u64 drm_gpusvm_range_chunk_size(struct drm_gpusvm *gpusvm,
> +				       struct drm_gpusvm_notifier *notifier,
> +				       struct vm_area_struct *vas,
> +				       u64 fault_addr, u64 gpuva_start,
> +				       u64 gpuva_end, bool check_pages)
> +{
> +	u64 start, end;
> +	int i = 0;
> +
> +retry:
> +	for (; i < gpusvm->num_chunks; ++i) {
> +		start = ALIGN_DOWN(fault_addr, gpusvm->chunk_sizes[i]);
> +		end = ALIGN(fault_addr + 1, gpusvm->chunk_sizes[i]);
> +
> +		if (start >= vas->vm_start && end <= vas->vm_end &&
> +		    start >= notifier->interval.start &&
> +		    end <= notifier->interval.end &&
> +		    start >= gpuva_start && end <= gpuva_end)
> +			break;
> +	}
> +
> +	if (i == gpusvm->num_chunks)
> +		return LONG_MAX;
> +
> +	/*
> +	 * If the allocation is more than a page, ensure it does not overlap
> +	 * with existing ranges.
> +	 */
> +	if (end - start != SZ_4K) {
> +		struct drm_gpusvm_range *range;
> +
> +		range = drm_gpusvm_range_find(notifier, start, end);
> +		if (range) {
> +			++i;
> +			goto retry;
> +		}
> +
> +		/*
> +		 * XXX: Only create range on pages CPU has faulted in. Without
> +		 * this check, or prefault, on BMG 'xe_exec_system_allocator --r
> +		 * process-many-malloc' fails. In the failure case, each process
> +		 * mallocs 16k but the CPU VMA is ~128k which results in 64k SVM
> +		 * ranges. When migrating the SVM ranges, some processes fail in
> +		 * drm_gpusvm_migrate_to_vram with 'migrate.cpages != npages'
> +		 * and then upon drm_gpusvm_range_get_pages device pages from
> +		 * other processes are collected + faulted in which creates all
> +		 * sorts of problems. Unsure exactly how this is happening; the
> +		 * problem also goes away if 'xe_exec_system_allocator --r
> +		 * process-many-malloc' mallocs at least 64k at a time.
> +		 */
> +		if (check_pages &&
> +		    !drm_gpusvm_check_pages(gpusvm, notifier, start, end)) {
> +			++i;
> +			goto retry;
> +		}
> +	}
> +
> +	return end - start;
> +}
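
Worked example, assuming chunk_sizes = { SZ_2M, SZ_64K, SZ_4K }: for a fault
inside a 1 MiB anonymous VMA the 2M candidate can never pass the
vm_start/vm_end containment check, so the loop falls through to 64K; if an
existing range already overlaps that 64K window (or check_pages fails), the
retry path bumps the index again and SZ_4K is returned.
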
> +
> +/**
> + * drm_gpusvm_range_find_or_insert - Find or insert GPU SVM range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @fault_addr: Fault address
> + * @gpuva_start: Start address of GPUVA which mirrors CPU
> + * @gpuva_end: End address of GPUVA which mirrors CPU
> + * @ctx: GPU SVM context
> + *
> + * This function finds or inserts a newly allocated GPU SVM range based on the
> + * fault address. Caller must hold a lock to protect range lookup and insertion.
> + *
> + * Returns:
> + * Pointer to the GPU SVM range on success, ERR_PTR() on failure.
> + */
> +struct drm_gpusvm_range *
> +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64 fault_addr,
> +				u64 gpuva_start, u64 gpuva_end,
> +				const struct drm_gpusvm_ctx *ctx)
> +{
> +	struct drm_gpusvm_notifier *notifier;
> +	struct drm_gpusvm_range *range;
> +	struct mm_struct *mm = gpusvm->mm;
> +	struct vm_area_struct *vas;
> +	bool notifier_alloc = false;
> +	u64 chunk_size;
> +	int err;
> +	bool migrate_vram;
> +
> +	if (fault_addr < gpusvm->mm_start ||
> +	    fault_addr > gpusvm->mm_start + gpusvm->mm_range) {
> +		err = -EINVAL;
> +		goto err_out;
> +	}
> +
> +	if (!ctx->mmap_locked) {
> +		if (!mmget_not_zero(mm)) {
> +			err = -EFAULT;
> +			goto err_out;
> +		}
> +		mmap_write_lock(mm);
> +	}
> +
> +	mmap_assert_write_locked(mm);
> +
> +	notifier = drm_gpusvm_notifier_find(gpusvm, fault_addr);
> +	if (!notifier) {
> +		notifier = drm_gpusvm_notifier_alloc(gpusvm, fault_addr);
> +		if (IS_ERR(notifier)) {
> +			err = PTR_ERR(notifier);
> +			goto err_mmunlock;
> +		}
> +		notifier_alloc = true;
> +		err = mmu_interval_notifier_insert_locked(&notifier->notifier,
> +							  mm, notifier->interval.start,
> +							  notifier->interval.end -
> +							  notifier->interval.start,
> +							  &drm_gpusvm_notifier_ops);
> +		if (err)
> +			goto err_notifier;
> +	}
> +
> +	vas = vma_lookup(mm, fault_addr);
> +	if (!vas) {
> +		err = -ENOENT;
> +		goto err_notifier_remove;
> +	}
> +
> +	if (!ctx->read_only && !(vas->vm_flags & VM_WRITE)) {
> +		err = -EPERM;
> +		goto err_notifier_remove;
> +	}
> +
> +	range = drm_gpusvm_range_find(notifier, fault_addr, fault_addr + 1);
> +	if (range)
> +		goto out_mmunlock;
> +	/*
> +	 * XXX: Short-circuiting migration based on migrate_vma_* current
> +	 * limitations. If/when migrate_vma_* add more support, this logic will
> +	 * have to change.
> +	 */
> +	migrate_vram = ctx->vram_possible &&
> +		vma_is_anonymous(vas) && !is_vm_hugetlb_page(vas);
> +
> +	chunk_size = drm_gpusvm_range_chunk_size(gpusvm, notifier, vas,
> +						 fault_addr, gpuva_start,
> +						 gpuva_end, migrate_vram &&
> +						 !ctx->prefault);
> +	if (chunk_size == LONG_MAX) {
> +		err = -EINVAL;
> +		goto err_notifier_remove;
> +	}
> +
> +	range = drm_gpusvm_range_alloc(gpusvm, notifier, fault_addr, chunk_size,
> +				       migrate_vram);
> +	if (IS_ERR(range)) {
> +		err = PTR_ERR(range);
> +		goto err_notifier_remove;
> +	}
> +
> +	drm_gpusvm_range_insert(notifier, range);
> +	if (notifier_alloc)
> +		drm_gpusvm_notifier_insert(gpusvm, notifier);
> +
> +	if (ctx->prefault) {
> +		struct drm_gpusvm_ctx __ctx = *ctx;
> +
> +		__ctx.mmap_locked = true;
> +		err = drm_gpusvm_range_get_pages(gpusvm, range, &__ctx);
> +		if (err)
> +			goto err_range_remove;
> +	}
> +
> +out_mmunlock:
> +	if (!ctx->mmap_locked) {
> +		mmap_write_unlock(mm);
> +		mmput(mm);
> +	}
> +
> +	return range;
> +
> +err_range_remove:
> +	__drm_gpusvm_range_remove(notifier, range);
> +err_notifier_remove:
> +	if (notifier_alloc)
> +		mmu_interval_notifier_remove(&notifier->notifier);
> +err_notifier:
> +	if (notifier_alloc)
> +		drm_gpusvm_notifier_free(gpusvm, notifier);
> +err_mmunlock:
> +	if (!ctx->mmap_locked) {
> +		mmap_write_unlock(mm);
> +		mmput(mm);
> +	}
> +err_out:
> +	return ERR_PTR(err);
> +}
> +
> +/**
> + * for_each_dma_page - iterate over pages in a DMA region
> + * @i__: the current page index in the iteration
> + * @j__: the current page index, log order, in the iteration
> + * @npages__: the total number of pages in the DMA region
> + * @order__: the order of the pages in the DMA region
> + *
> + * This macro iterates over each page in a DMA region. The DMA region
> + * is assumed to be composed of 2^@order__ pages, and the macro will
> + * step through the region one block of 2^@order__ pages at a time.
> + */
> +#define for_each_dma_page(i__, j__, npages__, order__)	\
> +	for ((i__) = 0, (j__) = 0; (i__) < (npages__);	\
> +	     (j__)++, (i__) += 0x1 << (order__))
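
For example, with npages == 16 and order == 2 the body runs for
(i, j) = (0, 0), (4, 1), (8, 2), (12, 3), i.e. once per 2^order-page block.
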
> +
> +/**
> + * __drm_gpusvm_range_unmap_pages - Unmap pages associated with a GPU SVM range (internal)
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + *
> + * This function unmaps pages associated with a GPU SVM range. Assumes and
> + * asserts correct locking is in place when called.
> + */
> +static void __drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> +					   struct drm_gpusvm_range *range)
> +{
> +	lockdep_assert_held(&gpusvm->notifier_lock);
> +
> +	if (range->pages) {
> +		unsigned long i, j, npages = npages_in_range(range->va.start,
> +							     range->va.end);
> +
> +		if (range->flags.has_dma_mapping) {
> +			for_each_dma_page(i, j, npages, range->order)
> +				dma_unmap_page(gpusvm->drm->dev,
> +					       range->dma_addr[j],
> +					       PAGE_SIZE << range->order,
> +					       DMA_BIDIRECTIONAL);
> +		}
> +
> +		range->flags.has_vram_pages = false;
> +		range->flags.has_dma_mapping = false;
> +	}
> +}
> +
> +/**
> + * drm_gpusvm_range_free_pages - Free pages associated with a GPU SVM range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + *
> + * This function frees pages associated with a GPU SVM range.
> + */
> +static void drm_gpusvm_range_free_pages(struct drm_gpusvm *gpusvm,
> +					struct drm_gpusvm_range *range)
> +{
> +	lockdep_assert_held(&gpusvm->notifier_lock);
> +
> +	if (range->pages) {
> +		if (range->flags.kfree_mapping) {
> +			kfree(range->dma_addr);
> +			range->flags.kfree_mapping = false;
> +			range->pages = NULL;
> +		} else {
> +			kvfree(range->pages);
> +			range->pages = NULL;
> +		}
> +	}
> +}
> +
> +/**
> + * drm_gpusvm_range_remove - Remove GPU SVM range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range to be removed
> + *
> + * This function removes the specified GPU SVM range and also removes the parent
> + * GPU SVM notifier if no more ranges remain in the notifier. The caller must
> + * hold a lock to protect range and notifier removal.
> + */
> +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> +			     struct drm_gpusvm_range *range)
> +{
> +	struct drm_gpusvm_notifier *notifier;
> +
> +	notifier = drm_gpusvm_notifier_find(gpusvm, range->va.start);
> +	if (WARN_ON_ONCE(!notifier))
> +		return;
> +
> +	drm_gpusvm_notifier_lock(gpusvm);
> +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> +	drm_gpusvm_range_free_pages(gpusvm, range);
> +	__drm_gpusvm_range_remove(notifier, range);
> +	drm_gpusvm_notifier_unlock(gpusvm);
> +
> +	drm_gpusvm_range_put(range);
> +
> +	if (RB_EMPTY_ROOT(&notifier->root.rb_root)) {
> +		if (!notifier->flags.removed)
> +			mmu_interval_notifier_remove(&notifier->notifier);
> +		drm_gpusvm_notifier_remove(gpusvm, notifier);
> +		drm_gpusvm_notifier_free(gpusvm, notifier);
> +	}
> +}
> +
> +/**
> + * drm_gpusvm_range_get - Get a reference to GPU SVM range
> + * @range: Pointer to the GPU SVM range
> + *
> + * This function increments the reference count of the specified GPU SVM range.
> + *
> + * Returns:
> + * Pointer to the GPU SVM range.
> + */
> +struct drm_gpusvm_range *
> +drm_gpusvm_range_get(struct drm_gpusvm_range *range)
> +{
> +	kref_get(&range->refcount);
> +
> +	return range;
> +}
> +
> +/**
> + * drm_gpusvm_range_destroy - Destroy GPU SVM range
> + * @refcount: Pointer to the reference counter embedded in the GPU SVM range
> + *
> + * This function destroys the specified GPU SVM range when its reference count
> + * reaches zero. If a custom range-free function is provided, it is invoked to
> + * free the range; otherwise, the range is deallocated using kfree().
> + */
> +static void drm_gpusvm_range_destroy(struct kref *refcount)
> +{
> +	struct drm_gpusvm_range *range =
> +		container_of(refcount, struct drm_gpusvm_range, refcount);
> +	struct drm_gpusvm *gpusvm = range->gpusvm;
> +
> +	if (gpusvm->ops->range_free)
> +		gpusvm->ops->range_free(range);
> +	else
> +		kfree(range);
> +}
> +
> +/**
> + * drm_gpusvm_range_put - Put a reference to GPU SVM range
> + * @range: Pointer to the GPU SVM range
> + *
> + * This function decrements the reference count of the specified GPU SVM range
> + * and frees it when the count reaches zero.
> + */
> +void drm_gpusvm_range_put(struct drm_gpusvm_range *range)
> +{
> +	kref_put(&range->refcount, drm_gpusvm_range_destroy);
> +}
> +
> +/**
> + * drm_gpusvm_range_pages_valid - GPU SVM range pages valid
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + *
> + * This function determines if a GPU SVM range's pages are valid. Expected to be
> + * called holding gpusvm->notifier_lock and as the last step before committing a
> + * GPU binding.
> + *
> + * Returns:
> + * True if GPU SVM range has valid pages, False otherwise
> + */
> +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> +				  struct drm_gpusvm_range *range)
> +{
> +	lockdep_assert_held(&gpusvm->notifier_lock);
> +
> +	return range->flags.has_vram_pages || range->flags.has_dma_mapping;
> +}
> +
> +/**
> + * drm_gpusvm_range_pages_valid_unlocked - GPU SVM range pages valid unlocked
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + *
> + * This function determines if a GPU SVM range's pages are valid. Expected to be
> + * called without holding gpusvm->notifier_lock.
> + *
> + * Returns:
> + * True if GPU SVM range has valid pages, False otherwise
> + */
> +static bool
> +drm_gpusvm_range_pages_valid_unlocked(struct drm_gpusvm *gpusvm,
> +				      struct drm_gpusvm_range *range)
> +{
> +	bool pages_valid;
> +
> +	if (!range->pages)
> +		return false;
> +
> +	drm_gpusvm_notifier_lock(gpusvm);
> +	pages_valid = drm_gpusvm_range_pages_valid(gpusvm, range);
> +	if (!pages_valid && range->flags.kfree_mapping) {
> +		kfree(range->dma_addr);
> +		range->flags.kfree_mapping = false;
> +		range->pages = NULL;
> +	}
> +	drm_gpusvm_notifier_unlock(gpusvm);
> +
> +	return pages_valid;
> +}
> +
> +/**
> + * drm_gpusvm_range_get_pages - Get pages for a GPU SVM range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + * @ctx: GPU SVM context
> + *
> + * This function gets pages for a GPU SVM range and ensures they are mapped for
> + * DMA access.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> +			       struct drm_gpusvm_range *range,
> +			       const struct drm_gpusvm_ctx *ctx)
> +{
> +	struct mmu_interval_notifier *notifier = &range->notifier->notifier;
> +	struct hmm_range hmm_range = {
> +		.default_flags = HMM_PFN_REQ_FAULT | (ctx->read_only ? 0 :
> +			HMM_PFN_REQ_WRITE),
> +		.notifier = notifier,
> +		.start = range->va.start,
> +		.end = range->va.end,
> +		.dev_private_owner = gpusvm->device_private_page_owner,
> +	};
> +	struct mm_struct *mm = gpusvm->mm;
> +	unsigned long timeout =
> +		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> +	unsigned long i, j;
> +	unsigned long npages = npages_in_range(range->va.start, range->va.end);
> +	unsigned int order = 0;
> +	unsigned long *pfns;
> +	struct page **pages;
> +	int err = 0;
> +	bool vram_pages = !!range->flags.migrate_vram;
> +	bool alloc_pfns = false, kfree_mapping;
> +
> +retry:
> +	kfree_mapping = false;
> +	hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> +	if (drm_gpusvm_range_pages_valid_unlocked(gpusvm, range))
> +		return 0;
> +
> +	if (range->notifier_seq == hmm_range.notifier_seq && range->pages) {
> +		if (ctx->prefault)
> +			return 0;
> +
> +		pfns = (unsigned long *)range->pages;
> +		pages = range->pages;
> +		goto map_pages;
> +	}
> +
> +	if (!range->pages) {
> +		pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
> +		if (!pfns)
> +			return -ENOMEM;
> +		alloc_pfns = true;
> +	} else {
> +		pfns = (unsigned long *)range->pages;
> +	}
> +
> +	if (!ctx->mmap_locked) {
> +		if (!mmget_not_zero(mm)) {
> +			err = -EFAULT;
> +			goto err_out;
> +		}
> +	}
> +
> +	hmm_range.hmm_pfns = pfns;
> +	while (true) {
> +		/* Must be checked after mmu_interval_read_begin */
> +		if (range->flags.unmapped) {
> +			err = -EFAULT;
> +			break;
> +		}
> +
> +		if (!ctx->mmap_locked) {
> +			/*
> +			 * XXX: HMM locking document indicates only a read-lock
> +			 * is required but there appears to be a window between
> +			 * the MMU_NOTIFY_MIGRATE event triggered in a CPU fault
> +			 * via migrate_vma_setup and the pages actually moving
> +			 * in migrate_vma_finalize in which this code can grab
> +			 * garbage pages. Grabbing the write-lock if the range
> +			 * is attached to vram appears to protect against this
> +			 * race.
> +			 */
> +			if (vram_pages)
> +				mmap_write_lock(mm);
> +			else
> +				mmap_read_lock(mm);
> +		}
> +		err = hmm_range_fault(&hmm_range);
> +		if (!ctx->mmap_locked) {
> +			if (vram_pages)
> +				mmap_write_unlock(mm);
> +			else
> +				mmap_read_unlock(mm);
> +		}
> +
> +		if (err == -EBUSY) {
> +			if (time_after(jiffies, timeout))
> +				break;
> +
> +			hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> +			continue;
> +		}
> +		break;
> +	}
> +	if (!ctx->mmap_locked)
> +		mmput(mm);
> +	if (err)
> +		goto err_free;
> +
> +	pages = (struct page **)pfns;
> +
> +	if (ctx->prefault) {
> +		range->pages = pages;
> +		goto set_seqno;
> +	}
> +
> +map_pages:
> +	if (is_device_private_page(hmm_pfn_to_page(pfns[0]))) {
> +		WARN_ON_ONCE(!range->vram_allocation);
> +
> +		for (i = 0; i < npages; ++i) {
> +			pages[i] = hmm_pfn_to_page(pfns[i]);
> +
> +			if (WARN_ON_ONCE(!is_device_private_page(pages[i]))) {
> +				err = -EOPNOTSUPP;
> +				goto err_free;
> +			}
> +		}

You can't do the above, because the pfns you get from hmm come with zero
guarantees: you hold neither a page reference nor the page lock. The only
thing you can do is grab the pagetable lock (or mmu notifier locks) and
check it's still valid before you can touch any state. I think the
range->vram_allocation is probably always valid since you clean that up
under the same lock/thread, but there's a good chance the vram allocation
is otherwise already gone for good. Or you get an inconsistent snapshot.
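
To make that concrete, a minimal sketch of the usual hmm pattern, moving the
pfn-to-page conversion under the notifier lock and compressing the existing
-EAGAIN handling into a direct retry (sketch only, not a drop-in fix):

	drm_gpusvm_notifier_lock(gpusvm);
	if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
		/* Snapshot went stale; drop it and retry the fault. */
		drm_gpusvm_notifier_unlock(gpusvm);
		goto retry;
	}
	/* Only now is it safe to look at the pfns/pages and to rely on
	 * range->vram_allocation still matching this snapshot. */
	for (i = 0; i < npages; ++i)
		pages[i] = hmm_pfn_to_page(pfns[i]);
	range->flags.has_vram_pages = true;
	range->pages = pages;
	drm_gpusvm_notifier_unlock(gpusvm);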

> +
> +		/* Do not race with notifier unmapping pages */
> +		drm_gpusvm_notifier_lock(gpusvm);
> +		range->flags.has_vram_pages = true;
> +		range->pages = pages;
> +		if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
> +			err = -EAGAIN;
> +			__drm_gpusvm_range_unmap_pages(gpusvm, range);
> +		}
> +		drm_gpusvm_notifier_unlock(gpusvm);
> +	} else {
> +		dma_addr_t *dma_addr = (dma_addr_t *)pfns;
> +
> +		for_each_dma_page(i, j, npages, order) {
> +			if (WARN_ON_ONCE(i && order !=
> +					 hmm_pfn_to_map_order(pfns[i]))) {
> +				err = -EOPNOTSUPP;
> +				npages = i;
> +				goto err_unmap;
> +			}
> +			order = hmm_pfn_to_map_order(pfns[i]);
> +
> +			pages[j] = hmm_pfn_to_page(pfns[i]);
> +			if (WARN_ON_ONCE(is_zone_device_page(pages[j]))) {
> +				err = -EOPNOTSUPP;
> +				npages = i;
> +				goto err_unmap;
> +			}
> +
> +			set_page_dirty_lock(pages[j]);
> +			mark_page_accessed(pages[j]);

You can't do these, because you don't hold a page reference. They're also
not needed because hmm_range_fault goes through the full mkwrite dance,
which takes care of these, unlike the gup family of functions.

> +
> +			dma_addr[j] = dma_map_page(gpusvm->drm->dev,
> +						   pages[j], 0,
> +						   PAGE_SIZE << order,
> +						   DMA_BIDIRECTIONAL);
> +			if (dma_mapping_error(gpusvm->drm->dev, dma_addr[j])) {
> +				err = -EFAULT;
> +				npages = i;
> +				goto err_unmap;
> +			}

Aside: dma_map_page is about the only thing that's ok, because it doesn't
do anything harmful and especially doesn't make any assumption about what
that page is.

> +		}
> +
> +		/* Huge pages, reduce memory footprint */
> +		if (order) {
> +			dma_addr = kmalloc_array(j, sizeof(*dma_addr),
> +						 GFP_KERNEL);
> +			if (dma_addr) {
> +				for (i = 0; i < j; ++i)
> +					dma_addr[i] = (dma_addr_t)pfns[i];
> +				kvfree(pfns);
> +				kfree_mapping = true;
> +			} else {
> +				dma_addr = (dma_addr_t *)pfns;
> +			}
> +		}
> +
> +		/* Do not race with notifier unmapping pages */
> +		drm_gpusvm_notifier_lock(gpusvm);
> +		range->order = order;
> +		range->flags.kfree_mapping = kfree_mapping;
> +		range->flags.has_dma_mapping = true;
> +		range->dma_addr = dma_addr;
> +		range->vram_allocation = NULL;
> +		if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
> +			err = -EAGAIN;
> +			__drm_gpusvm_range_unmap_pages(gpusvm, range);
> +		}
> +		drm_gpusvm_notifier_unlock(gpusvm);
> +	}
> +
> +	if (err == -EAGAIN)
> +		goto retry;
> +set_seqno:
> +	range->notifier_seq = hmm_range.notifier_seq;
> +
> +	return 0;
> +
> +err_unmap:
> +	for_each_dma_page(i, j, npages, order)
> +		dma_unmap_page(gpusvm->drm->dev,
> +			       (dma_addr_t)pfns[j],
> +			       PAGE_SIZE << order, DMA_BIDIRECTIONAL);
> +err_free:
> +	if (alloc_pfns)
> +		kvfree(pfns);
> +err_out:
> +	return err;
> +}
> +
> +/**
> + * drm_gpusvm_range_unmap_pages - Unmap pages associated with a GPU SVM range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + * @ctx: GPU SVM context
> + *
> + * This function unmaps pages associated with a GPU SVM range. If @in_notifier
> + * is set, it is assumed that gpusvm->notifier_lock is held in write mode; if it
> + * is clear, it acquires gpusvm->notifier_lock in read mode. Must be called on
> + * each GPU SVM range attached to the notifier in gpusvm->ops->invalidate for
> + * the IOMMU security model.
> + */
> +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> +				  struct drm_gpusvm_range *range,
> +				  const struct drm_gpusvm_ctx *ctx)
> +{
> +	if (ctx->in_notifier)
> +		lockdep_assert_held_write(&gpusvm->notifier_lock);
> +	else
> +		drm_gpusvm_notifier_lock(gpusvm);
> +
> +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> +
> +	if (!ctx->in_notifier)
> +		drm_gpusvm_notifier_unlock(gpusvm);
> +}
> +
> +/**
> + * drm_gpusvm_migration_put_page - Put a migration page
> + * @page: Pointer to the page to put
> + *
> + * This function unlocks and puts a page.
> + */
> +static void drm_gpusvm_migration_put_page(struct page *page)
> +{
> +	unlock_page(page);
> +	put_page(page);
> +}
> +
> +/**
> + * drm_gpusvm_migration_put_pages - Put migration pages
> + * @npages: Number of pages
> + * @migrate_pfn: Array of migrate page frame numbers
> + *
> + * This function puts an array of pages.
> + */
> +static void drm_gpusvm_migration_put_pages(unsigned long npages,
> +					   unsigned long *migrate_pfn)
> +{
> +	unsigned long i;
> +
> +	for (i = 0; i < npages; ++i) {
> +		if (!migrate_pfn[i])
> +			continue;
> +
> +		drm_gpusvm_migration_put_page(migrate_pfn_to_page(migrate_pfn[i]));
> +		migrate_pfn[i] = 0;
> +	}
> +}
> +
> +/**
> + * drm_gpusvm_get_vram_page - Get a reference to a VRAM page
> + * @page: Pointer to the page
> + * @zdd: Pointer to the GPU SVM zone device data
> + *
> + * This function associates the given page with the specified GPU SVM zone
> + * device data and initializes it for zone device usage.
> + */
> +static void drm_gpusvm_get_vram_page(struct page *page,
> +				     struct drm_gpusvm_zdd *zdd)
> +{
> +	page->zone_device_data = drm_gpusvm_zdd_get(zdd);
> +	zone_device_page_init(page);
> +}
> +
> +/**
> + * drm_gpusvm_migrate_map_pages() - Map migration pages for GPU SVM migration
> + * @dev: The device for which the pages are being mapped
> + * @dma_addr: Array to store DMA addresses corresponding to mapped pages
> + * @migrate_pfn: Array of migrate page frame numbers to map
> + * @npages: Number of pages to map
> + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> + *
> + * This function maps pages of memory for migration usage in GPU SVM. It
> + * iterates over each page frame number provided in @migrate_pfn, maps the
> + * corresponding page, and stores the DMA address in the provided @dma_addr
> + * array.
> + *
> + * Return: 0 on success, -EFAULT if an error occurs during mapping.
> + */
> +static int drm_gpusvm_migrate_map_pages(struct device *dev,
> +					dma_addr_t *dma_addr,
> +					unsigned long *migrate_pfn,
> +					unsigned long npages,
> +					enum dma_data_direction dir)
> +{
> +	unsigned long i;
> +
> +	for (i = 0; i < npages; ++i) {
> +		struct page *page = migrate_pfn_to_page(migrate_pfn[i]);
> +
> +		if (!page)
> +			continue;
> +
> +		if (WARN_ON_ONCE(is_zone_device_page(page)))
> +			return -EFAULT;
> +
> +		dma_addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir);
> +		if (dma_mapping_error(dev, dma_addr[i]))
> +			return -EFAULT;
> +	}
> +
> +	return 0;
> +}
> +
> +/**
> + * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously mapped for GPU SVM migration
> + * @dev: The device for which the pages were mapped
> + * @dma_addr: Array of DMA addresses corresponding to mapped pages
> + * @npages: Number of pages to unmap
> + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> + *
> + * This function unmaps previously mapped pages of memory for GPU Shared Virtual
> + * Memory (SVM). It iterates over each DMA address provided in @dma_addr, checks
> + * if it's valid and not already unmapped, and unmaps the corresponding page.
> + */
> +static void drm_gpusvm_migrate_unmap_pages(struct device *dev,
> +					   dma_addr_t *dma_addr,
> +					   unsigned long npages,
> +					   enum dma_data_direction dir)
> +{
> +	unsigned long i;
> +
> +	for (i = 0; i < npages; ++i) {
> +		if (!dma_addr[i] || dma_mapping_error(dev, dma_addr[i]))
> +			continue;
> +
> +		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
> +	}
> +}
> +
> +/**
> + * drm_gpusvm_migrate_to_vram - Migrate GPU SVM range to VRAM
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + * @vram_allocation: Driver-private pointer to the VRAM allocation. The caller
> + *                   should hold a reference to the VRAM allocation, which
> + *                   should be dropped via ops->vram_release or upon the
> + *                   failure of this function.
> + * @ctx: GPU SVM context
> + *
> + * This function migrates the specified GPU SVM range to VRAM. It performs the
> + * necessary setup and invokes the driver-specific operations for migration to
> + * VRAM. Upon successful return, @vram_allocation can safely reference @range
> + * until ops->vram_release is called, which happens only upon successful return.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> +			       struct drm_gpusvm_range *range,
> +			       void *vram_allocation,
> +			       const struct drm_gpusvm_ctx *ctx)
> +{
> +	u64 start = range->va.start, end = range->va.end;
> +	struct migrate_vma migrate = {
> +		.start		= start,
> +		.end		= end,
> +		.pgmap_owner	= gpusvm->device_private_page_owner,
> +		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
> +	};
> +	struct mm_struct *mm = gpusvm->mm;
> +	unsigned long i, npages = npages_in_range(start, end);
> +	struct vm_area_struct *vas;
> +	struct drm_gpusvm_zdd *zdd = NULL;
> +	struct page **pages;
> +	dma_addr_t *dma_addr;
> +	void *buf;
> +	int err;
> +
> +	if (!range->flags.migrate_vram)
> +		return -EINVAL;
> +
> +	if (!gpusvm->ops->populate_vram_pfn || !gpusvm->ops->copy_to_vram ||
> +	    !gpusvm->ops->copy_to_sram)
> +		return -EOPNOTSUPP;
> +
> +	if (!ctx->mmap_locked) {
> +		if (!mmget_not_zero(mm)) {
> +			err = -EFAULT;
> +			goto err_out;
> +		}
> +		mmap_write_lock(mm);
> +	}
> +
> +	mmap_assert_locked(mm);
> +
> +	vas = vma_lookup(mm, start);
> +	if (!vas) {
> +		err = -ENOENT;
> +		goto err_mmunlock;
> +	}
> +
> +	if (end > vas->vm_end || start < vas->vm_start) {
> +		err = -EINVAL;
> +		goto err_mmunlock;
> +	}
> +
> +	if (!vma_is_anonymous(vas)) {
> +		err = -EBUSY;
> +		goto err_mmunlock;
> +	}
> +
> +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
> +		       sizeof(*pages), GFP_KERNEL);
> +	if (!buf) {
> +		err = -ENOMEM;
> +		goto err_mmunlock;
> +	}
> +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
> +
> +	zdd = drm_gpusvm_zdd_alloc(range);
> +	if (!zdd) {
> +		err = -ENOMEM;
> +		goto err_free;
> +	}
> +
> +	migrate.vma = vas;
> +	migrate.src = buf;
> +	migrate.dst = migrate.src + npages;
> +
> +	err = migrate_vma_setup(&migrate);
> +	if (err)
> +		goto err_free;
> +
> +	/*
> +	 * FIXME: The cases below, !migrate.cpages and migrate.cpages != npages,
> +	 * are not always an error. Need to revisit possible cases and how to
> +	 * handle them. We could prefault on migrate.cpages != npages via
> +	 * hmm_range_fault.
> +	 */

Yeah, I think that under contention partial migrations, at least back to
sram due to cpu faults, are pretty much expected. And you need to cope
with that somehow.
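
As a hypothetical caller-side stopgap (sketch only, building on the fault
handler example in the DOC section; it sidesteps the question of what a
partially VRAM-backed range should look like):

	err = drm_gpusvm_migrate_to_vram(gpusvm, range, bo, &ctx);
	if (err == -EBUSY)
		err = 0;	/* partial migration: fall back to SRAM pages */
	if (err)		/* CPU mappings may have changed */
		goto retry;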

> +
> +	if (!migrate.cpages) {
> +		err = -EFAULT;
> +		goto err_free;
> +	}
> +
> +	if (migrate.cpages != npages) {
> +		err = -EBUSY;
> +		goto err_finalize;
> +	}
> +
> +	err = gpusvm->ops->populate_vram_pfn(gpusvm, vram_allocation, npages,
> +					     migrate.dst);
> +	if (err)
> +		goto err_finalize;
> +
> +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> +					   migrate.src, npages, DMA_TO_DEVICE);
> +	if (err)
> +		goto err_finalize;
> +
> +	for (i = 0; i < npages; ++i) {
> +		struct page *page = pfn_to_page(migrate.dst[i]);
> +
> +		pages[i] = page;
> +		migrate.dst[i] = migrate_pfn(migrate.dst[i]);
> +		drm_gpusvm_get_vram_page(page, zdd);
> +	}
> +
> +	err = gpusvm->ops->copy_to_vram(gpusvm, pages, dma_addr, npages);
> +	if (err)
> +		goto err_finalize;
> +
> +	/* Upon success bind vram allocation to range and zdd */
> +	range->vram_allocation = vram_allocation;
> +	WRITE_ONCE(zdd->vram_allocation, vram_allocation);	/* Owns ref */
> +
> +err_finalize:
> +	if (err)
> +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> +	migrate_vma_pages(&migrate);
> +	migrate_vma_finalize(&migrate);
> +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
> +				       DMA_TO_DEVICE);
> +err_free:
> +	if (zdd)
> +		drm_gpusvm_zdd_put(zdd);
> +	kvfree(buf);
> +err_mmunlock:
> +	if (!ctx->mmap_locked) {
> +		mmap_write_unlock(mm);
> +		mmput(mm);
> +	}
> +err_out:
> +	return err;
> +}
> +
> +/**
> + * drm_gpusvm_migrate_populate_sram_pfn - Populate SRAM PFNs for a VM area
> + * @vas: Pointer to the VM area structure, can be NULL
> + * @npages: Number of pages to populate
> + * @src_mpfn: Source array of migrate PFNs
> + * @mpfn: Array of migrate PFNs to populate
> + * @addr: Start address for PFN allocation
> + *
> + * This function populates the SRAM migrate page frame numbers (PFNs) for the
> + * specified VM area structure. It allocates and locks pages in the VM area for
> + * SRAM usage. If @vas is non-NULL, alloc_page_vma() is used for allocation;
> + * otherwise, alloc_page() is used.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +static int drm_gpusvm_migrate_populate_sram_pfn(struct vm_area_struct *vas,
> +						unsigned long npages,
> +						unsigned long *src_mpfn,
> +						unsigned long *mpfn, u64 addr)
> +{
> +	unsigned long i;
> +
> +	for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
> +		struct page *page;
> +
> +		if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
> +			continue;
> +
> +		if (vas)
> +			page = alloc_page_vma(GFP_HIGHUSER, vas, addr);
> +		else
> +			page = alloc_page(GFP_HIGHUSER);
> +
> +		if (!page)
> +			return -ENOMEM;
> +
> +		lock_page(page);
> +		mpfn[i] = migrate_pfn(page_to_pfn(page));
> +	}
> +
> +	return 0;
> +}
> +
> +/**
> + * drm_gpusvm_evict_to_sram - Evict GPU SVM range to SRAM
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + *
> + * Similar to __drm_gpusvm_migrate_to_sram but does not require the mmap lock;
> + * migration is done via the migrate_device_* functions. This is a fallback
> + * path, as it is preferred to issue migrations with the mmap lock held.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +static int drm_gpusvm_evict_to_sram(struct drm_gpusvm *gpusvm,
> +				    struct drm_gpusvm_range *range)
> +{
> +	unsigned long npages;
> +	struct page **pages;
> +	unsigned long *src, *dst;
> +	dma_addr_t *dma_addr;
> +	void *buf;
> +	int i, err = 0;
> +
> +	npages = npages_in_range(range->va.start, range->va.end);
> +
> +	buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr) +
> +		       sizeof(*pages), GFP_KERNEL);
> +	if (!buf) {
> +		err = -ENOMEM;
> +		goto err_out;
> +	}
> +	src = buf;
> +	dst = buf + (sizeof(*src) * npages);
> +	dma_addr = buf + (2 * sizeof(*src) * npages);
> +	pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) * npages;
> +
> +	err = gpusvm->ops->populate_vram_pfn(gpusvm, range->vram_allocation,
> +					     npages, src);
> +	if (err)
> +		goto err_free;
> +
> +	err = migrate_device_vma_range(gpusvm->mm,
> +				       gpusvm->device_private_page_owner, src,
> +				       npages, range->va.start);
> +	if (err)
> +		goto err_free;
> +
> +	err = drm_gpusvm_migrate_populate_sram_pfn(NULL, npages, src, dst, 0);
> +	if (err)
> +		goto err_finalize;
> +
> +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> +					   dst, npages, DMA_BIDIRECTIONAL);
> +	if (err)
> +		goto err_finalize;
> +
> +	for (i = 0; i < npages; ++i)
> +		pages[i] = migrate_pfn_to_page(src[i]);
> +
> +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr, npages);
> +	if (err)
> +		goto err_finalize;
> +
> +err_finalize:
> +	if (err)
> +		drm_gpusvm_migration_put_pages(npages, dst);
> +	migrate_device_pages(src, dst, npages);
> +	migrate_device_finalize(src, dst, npages);
> +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
> +				       DMA_BIDIRECTIONAL);
> +err_free:
> +	kvfree(buf);
> +err_out:
> +
> +	return err;
> +}
> +
> +/**
> + * __drm_gpusvm_migrate_to_sram - Migrate GPU SVM range to SRAM (internal)
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @vas: Pointer to the VM area structure
> + * @page: Pointer to the page for fault handling (can be NULL)
> + * @start: Start address of the migration range
> + * @end: End address of the migration range
> + *
> + * This internal function performs the migration of the specified GPU SVM range
> + * to SRAM. It sets up the migration, populates + dma maps SRAM PFNs, and
> + * invokes the driver-specific operations for migration to SRAM.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +static int __drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> +					struct vm_area_struct *vas,
> +					struct page *page,
> +					u64 start, u64 end)
> +{
> +	struct migrate_vma migrate = {
> +		.vma		= vas,
> +		.pgmap_owner	= gpusvm->device_private_page_owner,
> +		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
> +		.fault_page	= page,
> +	};
> +	unsigned long npages;
> +	struct page **pages;
> +	dma_addr_t *dma_addr;
> +	void *buf;
> +	int i, err = 0;
> +
> +	mmap_assert_locked(gpusvm->mm);

That's the wrong mm, at least for the ->migrate_to_ram path. You might be
called on an anon mapping from a child process. That also means that the
vma you're looking at might have no relationship with anything you're
tracking in your gpusvm.
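
Presumably something like the following (sketch only) is closer to what is
wanted here, asserting on the mm that actually owns the faulting VMA rather
than the gpusvm-tracked one:

	mmap_assert_locked(vas->vm_mm);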

> +
> +	/* Corner case where the VM area struct has been partially unmapped */
> +	if (start < vas->vm_start)
> +		start = vas->vm_start;
> +	if (end > vas->vm_end)
> +		end = vas->vm_end;
> +
> +	migrate.start = start;
> +	migrate.end = end;
> +	npages = npages_in_range(start, end);
> +
> +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
> +		       sizeof(*pages), GFP_KERNEL);
> +	if (!buf) {
> +		err = -ENOMEM;
> +		goto err_out;
> +	}
> +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
> +
> +	migrate.vma = vas;
> +	migrate.src = buf;
> +	migrate.dst = migrate.src + npages;
> +
> +	err = migrate_vma_setup(&migrate);
> +	if (err)
> +		goto err_free;
> +
> +	/* Raced with another CPU fault, nothing to do */
> +	if (!migrate.cpages)
> +		goto err_free;
> +
> +	err = drm_gpusvm_migrate_populate_sram_pfn(vas, npages,
> +						   migrate.src, migrate.dst,
> +						   start);
> +	if (err)
> +		goto err_finalize;
> +
> +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> +					   migrate.dst, npages,
> +					   DMA_BIDIRECTIONAL);
> +	if (err)
> +		goto err_finalize;
> +
> +	for (i = 0; i < npages; ++i)
> +		pages[i] = migrate_pfn_to_page(migrate.src[i]);
> +
> +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr, npages);
> +	if (err)
> +		goto err_finalize;
> +
> +err_finalize:
> +	if (err)
> +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> +	migrate_vma_pages(&migrate);
> +	migrate_vma_finalize(&migrate);
> +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
> +				       DMA_BIDIRECTIONAL);
> +err_free:
> +	kvfree(buf);
> +err_out:
> +	mmap_assert_locked(gpusvm->mm);
> +
> +	return err;
> +}
> +
> +/**
> + * drm_gpusvm_migrate_to_sram - Migrate (evict) GPU SVM range to SRAM
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + * @ctx: GPU SVM context
> + *
> + * This function initiates the migration of the specified GPU SVM range to
> + * SRAM. It performs necessary checks and invokes the internal migration
> + * function for actual migration.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> +			       struct drm_gpusvm_range *range,
> +			       const struct drm_gpusvm_ctx *ctx)
> +{
> +	u64 start = range->va.start, end = range->va.end;
> +	struct mm_struct *mm = gpusvm->mm;
> +	struct vm_area_struct *vas;
> +	int err;
> +	bool retry = false;
> +
> +	if (!ctx->mmap_locked) {
> +		if (!mmget_not_zero(mm)) {
> +			err = -EFAULT;
> +			goto err_out;
> +		}
> +		if (ctx->trylock_mmap) {
> +			if (!mmap_read_trylock(mm))  {
> +				err = drm_gpusvm_evict_to_sram(gpusvm, range);
> +				goto err_mmput;
> +			}
> +		} else {
> +			mmap_read_lock(mm);
> +		}
> +	}
> +
> +	mmap_assert_locked(mm);
> +
> +	/*
> +	 * Loop required to find all VMA area structs for the corner case when
> +	 * VRAM backing has been partially unmapped from MM's address space.
> +	 */
> +again:
> +	vas = find_vma(mm, start);
> +	if (!vas) {
> +		if (!retry)
> +			err = -ENOENT;
> +		goto err_mmunlock;
> +	}
> +
> +	if (end <= vas->vm_start || start >= vas->vm_end) {
> +		if (!retry)
> +			err = -EINVAL;
> +		goto err_mmunlock;
> +	}
> +
> +	err = __drm_gpusvm_migrate_to_sram(gpusvm, vas, NULL, start, end);
> +	if (err)
> +		goto err_mmunlock;
> +
> +	if (vas->vm_end < end) {
> +		retry = true;
> +		start = vas->vm_end;
> +		goto again;
> +	}
> +
> +	if (!ctx->mmap_locked) {
> +		mmap_read_unlock(mm);
> +		/*
> +		 * Using mmput_async as this function can be called while
> +		 * holding a dma-resv lock, and a final put can grab the mmap
> +		 * lock, causing a lock inversion.
> +		 */
> +		mmput_async(mm);
> +	}
> +
> +	return 0;
> +
> +err_mmunlock:
> +	if (!ctx->mmap_locked)
> +		mmap_read_unlock(mm);
> +err_mmput:
> +	if (!ctx->mmap_locked)
> +		mmput_async(mm);
> +err_out:
> +	return err;
> +}
> +
> +/**
> + * drm_gpusvm_page_free - Put GPU SVM zone device data associated with a page
> + * @page: Pointer to the page
> + *
> + * This function is a callback used to put the GPU SVM zone device data
> + * associated with a page when it is being released.
> + */
> +static void drm_gpusvm_page_free(struct page *page)
> +{
> +	drm_gpusvm_zdd_put(page->zone_device_data);
> +}
> +
> +/**
> + * drm_gpusvm_migrate_to_ram - Migrate GPU SVM range to RAM (page fault handler)
> + * @vmf: Pointer to the fault information structure
> + *
> + * This function is a page fault handler used to migrate a GPU SVM range to RAM.
> + * It retrieves the GPU SVM range information from the faulting page and invokes
> + * the internal migration function to migrate the range back to RAM.
> + *
> + * Returns:
> + * VM_FAULT_SIGBUS on failure, 0 on success.
> + */
> +static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault *vmf)
> +{
> +	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
> +	int err;
> +
> +	err = __drm_gpusvm_migrate_to_sram(zdd->range->gpusvm,

So I think zdd->range doesn't work, because even within a single mm the
vma mapping a given piece of anon memory does not need to be unique, you
can duplicate them with mremap.

So all you have here is the physical memory and the vma, which might or
might not be from the same process as gpusvm->mm.

Also the child process scenario means that using mmap_write on the fault
side doesn't stop all cpu faults from migrating stuff back.

Somewhat aside, but I think that means amdkfd's svm_range->migration_mutex
is busted, because it's va based and so misses concurrently ongoing
different mappings moving physical storage around underneath.


Cheers, Sima

> +					   vmf->vma, vmf->page,
> +					   zdd->range->va.start,
> +					   zdd->range->va.end);
> +
> +	return err ? VM_FAULT_SIGBUS : 0;
> +}
> +
> +/**
> + * drm_gpusvm_pagemap_ops - Device page map operations for GPU SVM
> + */
> +static const struct dev_pagemap_ops drm_gpusvm_pagemap_ops = {
> +	.page_free = drm_gpusvm_page_free,
> +	.migrate_to_ram = drm_gpusvm_migrate_to_ram,
> +};
> +
> +/**
> + * drm_gpusvm_pagemap_ops_get - Retrieve GPU SVM device page map operations
> + *
> + * Returns:
> + * Pointer to the GPU SVM device page map operations structure.
> + */
> +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void)
> +{
> +	return &drm_gpusvm_pagemap_ops;
> +}
> +
> +/**
> + * drm_gpusvm_has_mapping - Check if GPU SVM has mapping for the given address range
> + * @gpusvm: Pointer to the GPU SVM structure.
> + * @start: Start address
> + * @end: End address
> + *
> + * Returns:
> + * True if GPU SVM has mapping, False otherwise
> + */
> +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start, u64 end)
> +{
> +	struct drm_gpusvm_notifier *notifier;
> +
> +	drm_gpusvm_for_each_notifier(notifier, gpusvm, start, end) {
> +		struct drm_gpusvm_range *range = NULL;
> +
> +		drm_gpusvm_for_each_range(range, notifier, start, end)
> +			return true;
> +	}
> +
> +	return false;
> +}
> diff --git a/drivers/gpu/drm/xe/drm_gpusvm.h b/drivers/gpu/drm/xe/drm_gpusvm.h
> new file mode 100644
> index 000000000000..0ea70f8534a8
> --- /dev/null
> +++ b/drivers/gpu/drm/xe/drm_gpusvm.h
> @@ -0,0 +1,415 @@
> +/* SPDX-License-Identifier: MIT */
> +/*
> + * Copyright © 2024 Intel Corporation
> + */
> +
> +#ifndef __DRM_GPUSVM_H__
> +#define __DRM_GPUSVM_H__
> +
> +#include <linux/kref.h>
> +#include <linux/mmu_notifier.h>
> +#include <linux/workqueue.h>
> +
> +struct dev_pagemap_ops;
> +struct drm_device;
> +struct drm_gpusvm;
> +struct drm_gpusvm_notifier;
> +struct drm_gpusvm_ops;
> +struct drm_gpusvm_range;
> +
> +/**
> + * struct drm_gpusvm_ops - Operations structure for GPU SVM
> + *
> + * This structure defines the operations for GPU Shared Virtual Memory (SVM).
> + * These operations are provided by the GPU driver to manage SVM ranges and
> + * perform operations such as migration between VRAM and system RAM.
> + */
> +struct drm_gpusvm_ops {
> +	/**
> +	 * @notifier_alloc: Allocate a GPU SVM notifier (optional)
> +	 *
> +	 * This function shall allocate a GPU SVM notifier.
> +	 *
> +	 * Returns:
> +	 * Pointer to the allocated GPU SVM notifier on success, NULL on failure.
> +	 */
> +	struct drm_gpusvm_notifier *(*notifier_alloc)(void);
> +
> +	/**
> +	 * @notifier_free: Free a GPU SVM notifier (optional)
> +	 * @notifier: Pointer to the GPU SVM notifier to be freed
> +	 *
> +	 * This function shall free a GPU SVM notifier.
> +	 */
> +	void (*notifier_free)(struct drm_gpusvm_notifier *notifier);
> +
> +	/**
> +	 * @range_alloc: Allocate a GPU SVM range (optional)
> +	 * @gpusvm: Pointer to the GPU SVM
> +	 *
> +	 * This function shall allocate a GPU SVM range.
> +	 *
> +	 * Returns:
> +	 * Pointer to the allocated GPU SVM range on success, NULL on failure.
> +	 */
> +	struct drm_gpusvm_range *(*range_alloc)(struct drm_gpusvm *gpusvm);
> +
> +	/**
> +	 * @range_free: Free a GPU SVM range (optional)
> +	 * @range: Pointer to the GPU SVM range to be freed
> +	 *
> +	 * This function shall free a GPU SVM range.
> +	 */
> +	void (*range_free)(struct drm_gpusvm_range *range);
> +
> +	/**
> +	 * @vram_release: Release VRAM allocation (optional)
> +	 * @vram_allocation: Driver-private pointer to the VRAM allocation
> +	 *
> +	 * This function shall release VRAM allocation and expects to drop a
> +	 * reference to VRAM allocation.
> +	 */
> +	void (*vram_release)(void *vram_allocation);
> +
> +	/**
> +	 * @populate_vram_pfn: Populate VRAM PFN (required for migration)
> +	 * @gpusvm: Pointer to the GPU SVM
> +	 * @vram_allocation: Driver-private pointer to the VRAM allocation
> +	 * @npages: Number of pages to populate
> +	 * @pfn: Array of page frame numbers to populate
> +	 *
> +	 * This function shall populate VRAM page frame numbers (PFN).
> +	 *
> +	 * Returns:
> +	 * 0 on success, a negative error code on failure.
> +	 */
> +	int (*populate_vram_pfn)(struct drm_gpusvm *gpusvm,
> +				 void *vram_allocation,
> +				 unsigned long npages,
> +				 unsigned long *pfn);
> +
> +	/**
> +	 * @copy_to_vram: Copy to VRAM (required for migration)
> +	 * @gpusvm: Pointer to the GPU SVM
> +	 * @pages: Pointer to array of VRAM pages (destination)
> +	 * @dma_addr: Pointer to array of DMA addresses (source)
> +	 * @npages: Number of pages to copy
> +	 *
> +	 * This function shall copy pages to VRAM.
> +	 *
> +	 * Returns:
> +	 * 0 on success, a negative error code on failure.
> +	 */
> +	int (*copy_to_vram)(struct drm_gpusvm *gpusvm,
> +			    struct page **pages,
> +			    dma_addr_t *dma_addr,
> +			    unsigned long npages);
> +
> +	/**
> +	 * @copy_to_sram: Copy to system RAM (required for migration)
> +	 * @gpusvm: Pointer to the GPU SVM
> +	 * @pages: Pointer to array of VRAM pages (source)
> +	 * @dma_addr: Pointer to array of DMA addresses (destination)
> +	 * @npages: Number of pages to copy
> +	 *
> +	 * This function shall copy pages to system RAM.
> +	 *
> +	 * Returns:
> +	 * 0 on success, a negative error code on failure.
> +	 */
> +	int (*copy_to_sram)(struct drm_gpusvm *gpusvm,
> +			    struct page **pages,
> +			    dma_addr_t *dma_addr,
> +			    unsigned long npages);
> +
> +	/**
> +	 * @invalidate: Invalidate GPU SVM notifier (required)
> +	 * @gpusvm: Pointer to the GPU SVM
> +	 * @notifier: Pointer to the GPU SVM notifier
> +	 * @mmu_range: Pointer to the mmu_notifier_range structure
> +	 *
> +	 * This function shall invalidate the GPU page tables. It can safely
> +	 * walk the notifier range RB tree/list in this function. Called while
> +	 * holding the notifier lock.
> +	 */
> +	void (*invalidate)(struct drm_gpusvm *gpusvm,
> +			   struct drm_gpusvm_notifier *notifier,
> +			   const struct mmu_notifier_range *mmu_range);
> +};
> +
> +/**
> + * struct drm_gpusvm_notifier - Structure representing a GPU SVM notifier
> + *
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: MMU interval notifier
> + * @interval: Interval for the notifier
> + * @rb: Red-black tree node for the parent GPU SVM structure notifier tree
> + * @root: Cached root node of the RB tree containing ranges
> + * @range_list: List head containing ranges in the same order they appear in
> + *              interval tree. This is useful to keep iterating ranges while
> + *              doing modifications to RB tree.
> + * @flags.removed: Flag indicating whether the MMU interval notifier has been
> + *                 removed
> + *
> + * This structure represents a GPU SVM notifier.
> + */
> +struct drm_gpusvm_notifier {
> +	struct drm_gpusvm *gpusvm;
> +	struct mmu_interval_notifier notifier;
> +	struct {
> +		u64 start;
> +		u64 end;
> +	} interval;
> +	struct {
> +		struct rb_node node;
> +		struct list_head entry;
> +		u64 __subtree_last;
> +	} rb;
> +	struct rb_root_cached root;
> +	struct list_head range_list;
> +	struct {
> +		u32 removed : 1;
> +	} flags;
> +};
> +
> +/**
> + * struct drm_gpusvm_range - Structure representing a GPU SVM range
> + *
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: Pointer to the GPU SVM notifier
> + * @refcount: Reference count for the range
> + * @rb: Red-black tree node for the parent GPU SVM notifier structure range tree
> + * @va: Virtual address range
> + * @notifier_seq: Notifier sequence number of the range's pages
> + * @pages: Pointer to the array of pages (if backing store is in VRAM)
> + * @dma_addr: DMA address array (if backing store is SRAM and DMA mapped)
> + * @vram_allocation: Driver-private pointer to the VRAM allocation
> + * @order: Order of dma mapping. i.e. PAGE_SIZE << order is mapping size
> + * @flags.migrate_vram: Flag indicating whether the range can be migrated to VRAM
> + * @flags.unmapped: Flag indicating if the range has been unmapped
> + * @flags.partial_unmap: Flag indicating if the range has been partially unmapped
> + * @flags.has_vram_pages: Flag indicating if the range has vram pages
> + * @flags.has_dma_mapping: Flag indicating if the range has a DMA mapping
> + * @flags.kfree_mapping: Flag indicating @dma_addr is a compact allocation based
> + *                       on @order which releases via kfree
> + *
> + * This structure represents a GPU SVM range used for tracking memory ranges
> + * mapped in a DRM device.
> + */
> +struct drm_gpusvm_range {
> +	struct drm_gpusvm *gpusvm;
> +	struct drm_gpusvm_notifier *notifier;
> +	struct kref refcount;
> +	struct {
> +		struct rb_node node;
> +		struct list_head entry;
> +		u64 __subtree_last;
> +	} rb;
> +	struct {
> +		u64 start;
> +		u64 end;
> +	} va;
> +	unsigned long notifier_seq;
> +	union {
> +		struct page **pages;
> +		dma_addr_t *dma_addr;
> +	};
> +	void *vram_allocation;
> +	u16 order;
> +	struct {
> +		/* All flags below must be set upon creation */
> +		u16 migrate_vram : 1;
> +		/* All flags below must be set / cleared under notifier lock */
> +		u16 unmapped : 1;
> +		u16 partial_unmap : 1;
> +		u16 has_vram_pages : 1;
> +		u16 has_dma_mapping : 1;
> +		u16 kfree_mapping : 1;
> +	} flags;
> +};
> +
> +/**
> + * struct drm_gpusvm - GPU SVM structure
> + *
> + * @name: Name of the GPU SVM
> + * @drm: Pointer to the DRM device structure
> + * @mm: Pointer to the mm_struct for the address space
> + * @device_private_page_owner: Device private pages owner
> + * @mm_start: Start address of GPU SVM
> + * @mm_range: Range of the GPU SVM
> + * @notifier_size: Size of individual notifiers
> + * @ops: Pointer to the operations structure for GPU SVM
> + * @chunk_sizes: Pointer to the array of chunk sizes used in range allocation.
> + *               Entries should be powers of 2 in descending order.
> + * @num_chunks: Number of chunks
> + * @notifier_lock: Read-write semaphore for protecting notifier operations
> + * @zdd_wq: Workqueue for deferred work on zdd destruction
> + * @root: Cached root node of the Red-Black tree containing GPU SVM notifiers
> + * @notifier_list: list head containing notifiers in the same order they
> + *                 appear in interval tree. This is useful to keep iterating
> + *                 notifiers while doing modifications to RB tree.
> + *
> + * This structure represents a GPU SVM (Shared Virtual Memory) used for tracking
> + * memory ranges mapped in a DRM (Direct Rendering Manager) device.
> + *
> + * No reference counting is provided, as this is expected to be embedded in the
> + * driver VM structure along with the struct drm_gpuvm, which handles reference
> + * counting.
> + */
> +struct drm_gpusvm {
> +	const char *name;
> +	struct drm_device *drm;
> +	struct mm_struct *mm;
> +	void *device_private_page_owner;
> +	u64 mm_start;
> +	u64 mm_range;
> +	u64 notifier_size;
> +	const struct drm_gpusvm_ops *ops;
> +	const u64 *chunk_sizes;
> +	int num_chunks;
> +	struct rw_semaphore notifier_lock;
> +	struct workqueue_struct *zdd_wq;
> +	struct rb_root_cached root;
> +	struct list_head notifier_list;
> +};
> +
> +/**
> + * struct drm_gpusvm_ctx - DRM GPU SVM context
> + *
> + * @mmap_locked: mmap lock is locked
> + * @trylock_mmap: trylock mmap lock, used to avoid locking inversions
> + *                (e.g. dma-resv -> mmap lock)
> + * @in_notifier: entering from a MMU notifier
> + * @read_only: operating on read-only memory
> + * @vram_possible: possible to use VRAM
> + * @prefault: prefault pages
> + *
> + * Context that DRM GPUSVM is operating in (i.e. user arguments).
> + */
> +struct drm_gpusvm_ctx {
> +	u32 mmap_locked :1;
> +	u32 trylock_mmap :1;
> +	u32 in_notifier :1;
> +	u32 read_only :1;
> +	u32 vram_possible :1;
> +	u32 prefault :1;
> +};
> +
> +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> +		    const char *name, struct drm_device *drm,
> +		    struct mm_struct *mm, void *device_private_page_owner,
> +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> +		    const struct drm_gpusvm_ops *ops,
> +		    const u64 *chunk_sizes, int num_chunks);
> +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm);
> +void drm_gpusvm_free(struct drm_gpusvm *gpusvm);
> +
> +struct drm_gpusvm_range *
> +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64 fault_addr,
> +				u64 gpuva_start, u64 gpuva_end,
> +				const struct drm_gpusvm_ctx *ctx);
> +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> +			     struct drm_gpusvm_range *range);
> +
> +struct drm_gpusvm_range *
> +drm_gpusvm_range_get(struct drm_gpusvm_range *range);
> +void drm_gpusvm_range_put(struct drm_gpusvm_range *range);
> +
> +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> +				  struct drm_gpusvm_range *range);
> +
> +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> +			       struct drm_gpusvm_range *range,
> +			       const struct drm_gpusvm_ctx *ctx);
> +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> +				  struct drm_gpusvm_range *range,
> +				  const struct drm_gpusvm_ctx *ctx);
> +
> +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> +			       struct drm_gpusvm_range *range,
> +			       void *vram_allocation,
> +			       const struct drm_gpusvm_ctx *ctx);
> +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> +			       struct drm_gpusvm_range *range,
> +			       const struct drm_gpusvm_ctx *ctx);
> +
> +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void);
> +
> +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start, u64 end);
> +
> +struct drm_gpusvm_range *
> +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64 start, u64 end);
> +
> +/**
> + * drm_gpusvm_notifier_lock - Lock GPU SVM notifier
> + * @gpusvm__: Pointer to the GPU SVM structure.
> + *
> + * Abstract client usage GPU SVM notifier lock, take lock
> + */
> +#define drm_gpusvm_notifier_lock(gpusvm__)	\
> +	down_read(&(gpusvm__)->notifier_lock)
> +
> +/**
> + * drm_gpusvm_notifier_unlock - Unlock GPU SVM notifier
> + * @gpusvm__: Pointer to the GPU SVM structure.
> + *
> + * Abstract client usage GPU SVM notifier lock, drop lock
> + */
> +#define drm_gpusvm_notifier_unlock(gpusvm__)	\
> +	up_read(&(gpusvm__)->notifier_lock)
> +
> +/**
> + * __drm_gpusvm_range_next - Get the next GPU SVM range in the list
> + * @range: a pointer to the current GPU SVM range
> + *
> + * Return: A pointer to the next drm_gpusvm_range if available, or NULL if the
> + *         current range is the last one or if the input range is NULL.
> + */
> +static inline struct drm_gpusvm_range *
> +__drm_gpusvm_range_next(struct drm_gpusvm_range *range)
> +{
> +	if (range && !list_is_last(&range->rb.entry,
> +				   &range->notifier->range_list))
> +		return list_next_entry(range, rb.entry);
> +
> +	return NULL;
> +}
> +
> +/**
> + * drm_gpusvm_for_each_range - Iterate over GPU SVM ranges in a notifier
> + * @range__: Iterator variable for the ranges. If set, it indicates the start of
> + *	     the iterator. If NULL, call drm_gpusvm_range_find() to get the range.
> + * @notifier__: Pointer to the GPU SVM notifier
> + * @start__: Start address of the range
> + * @end__: End address of the range
> + *
> + * This macro is used to iterate over GPU SVM ranges in a notifier. It is safe
> + * to use while holding the driver SVM lock or the notifier lock.
> + */
> +#define drm_gpusvm_for_each_range(range__, notifier__, start__, end__)	\
> +	for ((range__) = (range__) ?:					\
> +	     drm_gpusvm_range_find((notifier__), (start__), (end__));	\
> +	     (range__) && (range__->va.start < (end__));		\
> +	     (range__) = __drm_gpusvm_range_next(range__))
> +
> +/**
> + * drm_gpusvm_range_set_unmapped - Mark a GPU SVM range as unmapped
> + * @range: Pointer to the GPU SVM range structure.
> + * @mmu_range: Pointer to the MMU notifier range structure.
> + *
> + * This function marks a GPU SVM range as unmapped and sets the partial_unmap flag
> + * if the range partially falls within the provided MMU notifier range.
> + */
> +static inline void
> +drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range *range,
> +			      const struct mmu_notifier_range *mmu_range)
> +{
> +	lockdep_assert_held_write(&range->gpusvm->notifier_lock);
> +
> +	range->flags.unmapped = true;
> +	if (range->va.start < mmu_range->start ||
> +	    range->va.end > mmu_range->end)
> +		range->flags.partial_unmap = true;
> +}
> +
> +#endif /* __DRM_GPUSVM_H__ */
> -- 
> 2.34.1
>
Matthew Brost Aug. 29, 2024, 4:40 p.m. UTC | #9
On Wed, Aug 28, 2024 at 06:25:18PM +0200, Daniel Vetter wrote:
> On Wed, Aug 28, 2024 at 03:43:48PM +0000, Matthew Brost wrote:
> > On Wed, Aug 28, 2024 at 04:46:24PM +0200, Christian König wrote:
> > > Am 28.08.24 um 16:31 schrieb Daniel Vetter:
> > > > On Tue, Aug 27, 2024 at 07:48:38PM -0700, Matthew Brost wrote:
> > > > > +		if (!ctx->mmap_locked) {
> > > > > +			/*
> > > > > +			 * XXX: HMM locking document indicates only a read-lock
> > > > > +			 * is required but there apears to be a window between
> > > > > +			 * the MMU_NOTIFY_MIGRATE event triggered in a CPU fault
> > > > > +			 * via migrate_vma_setup and the pages actually moving
> > > > > +			 * in migrate_vma_finalize in which this code can grab
> > > > > +			 * garbage pages. Grabbing the write-lock if the range
> > > > > +			 * is attached to vram appears to protect against this
> > > > > +			 * race.
> > > > > +			 */
> > 
> > Thanks the comments, replying to both of you inline.
> > 
> > > > This one is really scary, since it means the entire migrate pte trickery
> > > > is essentially completely busted. Grabbing the mmap write lock just means
> > > > you block out pretty much everything interesting from concurrently
> > > > happening.
> > > > 
> > > > My gut feeling says we need to figure out what's happening here, because
> > > > this looks a bit too fundamental to me.
> > 
> > I agree. I haven’t looked into this issue for a couple of months but
> > really need to understand what is going on.
> > 
> > I should have mentioned this in the cover letter: the goal of this
> > series was to produce something for review that is stable and supports
> > UMDs/user applications. It was not intended to be presented as a final
> > solution. This issue certainly falls into the category of "needs to be
> > understood and requires a proper fix."
> > 
> > One open question I have is whether the test case that triggers this
> > issue is even defined behavior. The test creates concurrent access
> > between the GPU and CPU to the same memory address, resulting in GPU and
> > CPU faults racing against each other. It’s possible that this is
> > undefined behavior, so data corruption might be acceptable—i.e., the
> > kernel can’t crash, but incorrect results might be permissible.
> 
> Yes this is supposed to be defined, at least from an hmm pov. And core mm/
> is ridiculous in how many races it allows, especially around concurrent
> fault handling.
> 
> It is ofc really slow if every fault results in a migration, but that's a
> matter of the application setting stupid memory migration hints for the
> gpu.
> 
> > e.g. This is the only defined usage model:
> > 
> > alloc_memory();
> > start_compute_kernel();
> > sync_on_compute_kernel_completion();
> > read_memory();
> > 
> > Hopefully, in the next week or so, I'll be heavily engaging with the UMD
> > teams. Development can then start, and applications will be running soon
> > after. This will allow us to address issues like this, collect data on
> > memory usage, and verify some of the assumptions I've made, such as
> > optimizing for 2M+ allocations.
> > 
> > > 
> > > I think I have at least a high-level understanding of what's going on here;
> > > Felix and especially Philip should know more of the details.
> > > 
> > 
> > I meant to reach out to AMD for issues like this. So, Felix
> > (felix.kuehling@amd.com) and Philip (Philip.Yang@amd.com) would be good
> > contacts?
> > 
> > > In general grabbing the mm_lock to protect PTEs from changing is complete
> > > nonsense. The mm_lock is to protect the VMAs and *not* the PTEs!
> > > 
> > 
> > Thanks for the hint. I believe that in the AMD implementation, I noticed
> > some additional locks for migration, which might be how you mitigated
> > this issue.
> 
> Yeah, so in general holding mmap_read is indeed pure magic thinking for
> preventing pte changes, like Christian points out. It doesn't stop
> invalidates, and with the per vma locking it also doesn't stop new valid

Invalidations happening in parallel with migrations, get pages, or
bindings should be fine. The notifier lock usage should make all of this
safe.

> ptes from being inserted at least for anon memory.
> 
> Except migration pte entries that point at vram pages are special, and are
> _only_ resolved while holding mmap_read. Which means holding mmap_write
> for the case of looking up our own vram pages with hmm_range_fault
> actually prevents issues. And so this duct-tape of holding mmap_write very
> much looks like a working hack to plug any races against concurrently
> ongoing migrations to system memory due to cpu faults.
> 

Agree holding mmap_write is a hack. Looking at AMD's comment, 'To serialize
concurrent migrations or validations of the same range, the
prange->migrate_mutex must be held.', it seems I could drop the mmap write
lock abuse and use something like this here. This would likely be an inner
lock of the mmap lock.

Does this seem like a reasonable thing to explore?
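
To make that concrete, the rough shape I have in mind (untested sketch, the
field name and placement below are made up by me and not part of this series):

	/* in struct drm_gpusvm_range */
	struct mutex migrate_mutex;	/* serialize migrate / validate of this range */

	/* drm_gpusvm_migrate_to_vram(), instead of mmap_write_lock(mm) */
	mmap_read_lock(mm);
	mutex_lock(&range->migrate_mutex);	/* inner lock wrt mmap lock */
	err = migrate_vma_setup(&migrate);
	/* ... populate_vram_pfn / copy_to_vram / migrate_vma_finalize ... */
	mutex_unlock(&range->migrate_mutex);
	mmap_read_unlock(mm);

with the same mutex taken around hmm_range_fault in get_pages and around the
migrate_vma_setup() -> migrate_vma_finalize() sequence in the migrate_to_ram
path (where the fault handler already holds the mmap lock). Whether a
per-range, va-based lock is actually sufficient is part of what I need to
figure out.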

> An even more fun corner case is multiple concurrent cpu faults on the same
> vram page. fork gets you that, or maybe a bit more reasonable mremap with

My understanding is that memory shared between processes cannot be migrated
due to current limitations in the migrate layer.

e.g. mmap called with MAP_SHARED is not eligible for migration.

Unsure what the behavior is if fork() is called on a process with memory in
VRAM and the child then tries to access it. Maybe fork() is different from
MAP_SHARED in that parent / child processes can share memory in VRAM?

Also, I'm really unsure what would happen if user space calls fork() while
it has an Xe VM open and then tries to use it too. Before commenting more on
this, I need to play around with test cases like this to educate myself.

My current tests don't use mremap; agree that would be a good addition.
Again, before commenting more here, let me add more test cases to educate
myself.

> MREMAP_DONTUNMAP | MREMAP_MAYMOVE. I think just hammering the same va with
> multiple threads alone isn't enough, it's better to have a private va for

I do have test cases where multiple CPU faults from threads hammer the
same memory. I found some bugs in my initial code, but as far as I can tell
multiple CPU faults in parallel do occur in my testing and do work.

> each thread pointing at the same anon memory page, so that you can get

You are losing me here - 'private va for each thread pointing at the
same anon memory page'. Is this a fork() case where the parent allocates
memory and then all children try to read in parallel?

> more parallel faults due to finely grained pte locking.
> 
> Would be a good testcase to add, if you don't have it yet.
>

See above - agreed, these are good test cases which I hadn't considered, and
I will expand my suite to include them. Thanks for the tip - IMO testing is
as important as, or even more important than, the KMD design, and I need to
ensure I have all possible uses covered.
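
For reference, the rough shape of what I'm planning to add (sketch only,
gpu_touch() is a stand-in for the part of the test that faults the buffer
into vram; an mremap(MREMAP_MAYMOVE | MREMAP_DONTUNMAP) variant would be
layered on top of this):

	#include <pthread.h>
	#include <sys/mman.h>
	#include <sys/wait.h>
	#include <unistd.h>

	#define SZ	(2 << 20)

	static char *buf;

	static void gpu_touch(char *p)
	{
		/* stand-in: submit a gpu job that touches p[0..SZ) */
		(void)p;
	}

	static void *hammer(void *arg)
	{
		for (int i = 0; i < SZ; i += 4096)
			buf[i]++;		/* cpu faults racing gpu access */
		return NULL;
	}

	int main(void)
	{
		pthread_t t[8];

		buf = mmap(NULL, SZ, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		gpu_touch(buf);			/* migrate to vram */

		for (int i = 0; i < 4; i++)	/* faults from other mms */
			if (fork() == 0) {
				hammer(NULL);
				_exit(0);
			}

		for (int i = 0; i < 8; i++)	/* faults from threads, one mm */
			pthread_create(&t[i], NULL, hammer, NULL);
		for (int i = 0; i < 8; i++)
			pthread_join(t[i], NULL);

		while (wait(NULL) > 0)
			;
		return 0;
	}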

> > I must say it is a bit unfortunate that the HMM locking documentation
> > doesn’t mention this. I believe the documentation needs additional
> > information, which I can add once we finalize the solution.
> 
> Yeah, at least from my very cursory look you don't have enough locking.
> I've written an in-depth reply to patch 23 with the high-level summary of
> my thoughts.
>

Will look and reply there.

Matt

> Cheers, Sima
> 
> > 
> > Matt 
> > 
> > > Even with the write side of the mm_lock taken it is perfectly possible that
> > > PTEs change. It's just less likely.
> > > 
> > > We ran into multiple issues before we figured out this important distinction
> > > as well.
> > > 
> > > Christian.
> > > 
> > > > -Sima
> > > > 
> > > > 
> > > > > +			if (vram_pages)
> > > > > +				mmap_write_lock(mm);
> > > > > +			else
> > > > > +				mmap_read_lock(mm);
> > > > > +		}
> > > > > +		err = hmm_range_fault(&hmm_range);
> > > > > +		if (!ctx->mmap_locked) {
> > > > > +			if (vram_pages)
> > > > > +				mmap_write_unlock(mm);
> > > > > +			else
> > > > > +				mmap_read_unlock(mm);
> > > > > +		}
> > > > > +
> > > > > +		if (err == -EBUSY) {
> > > > > +			if (time_after(jiffies, timeout))
> > > > > +				break;
> > > > > +
> > > > > +			hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> > > > > +			continue;
> > > > > +		}
> > > > > +		break;
> > > > > +	}
> > > > > +	if (!ctx->mmap_locked)
> > > > > +		mmput(mm);
> > > > > +	if (err)
> > > > > +		goto err_free;
> > > > > +
> > > > > +	pages = (struct page **)pfns;
> > > > > +
> > > > > +	if (ctx->prefault) {
> > > > > +		range->pages = pages;
> > > > > +		goto set_seqno;
> > > > > +	}
> > > > > +
> > > > > +map_pages:
> > > > > +	if (is_device_private_page(hmm_pfn_to_page(pfns[0]))) {
> > > > > +		WARN_ON_ONCE(!range->vram_allocation);
> > > > > +
> > > > > +		for (i = 0; i < npages; ++i) {
> > > > > +			pages[i] = hmm_pfn_to_page(pfns[i]);
> > > > > +
> > > > > +			if (WARN_ON_ONCE(!is_device_private_page(pages[i]))) {
> > > > > +				err = -EOPNOTSUPP;
> > > > > +				goto err_free;
> > > > > +			}
> > > > > +		}
> > > > > +
> > > > > +		/* Do not race with notifier unmapping pages */
> > > > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > > > +		range->flags.has_vram_pages = true;
> > > > > +		range->pages = pages;
> > > > > +		if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
> > > > > +			err = -EAGAIN;
> > > > > +			__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > > > +		}
> > > > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > > > +	} else {
> > > > > +		dma_addr_t *dma_addr = (dma_addr_t *)pfns;
> > > > > +
> > > > > +		for_each_dma_page(i, j, npages, order) {
> > > > > +			if (WARN_ON_ONCE(i && order !=
> > > > > +					 hmm_pfn_to_map_order(pfns[i]))) {
> > > > > +				err = -EOPNOTSUPP;
> > > > > +				npages = i;
> > > > > +				goto err_unmap;
> > > > > +			}
> > > > > +			order = hmm_pfn_to_map_order(pfns[i]);
> > > > > +
> > > > > +			pages[j] = hmm_pfn_to_page(pfns[i]);
> > > > > +			if (WARN_ON_ONCE(is_zone_device_page(pages[j]))) {
> > > > > +				err = -EOPNOTSUPP;
> > > > > +				npages = i;
> > > > > +				goto err_unmap;
> > > > > +			}
> > > > > +
> > > > > +			set_page_dirty_lock(pages[j]);
> > > > > +			mark_page_accessed(pages[j]);
> > > > > +
> > > > > +			dma_addr[j] = dma_map_page(gpusvm->drm->dev,
> > > > > +						   pages[j], 0,
> > > > > +						   PAGE_SIZE << order,
> > > > > +						   DMA_BIDIRECTIONAL);
> > > > > +			if (dma_mapping_error(gpusvm->drm->dev, dma_addr[j])) {
> > > > > +				err = -EFAULT;
> > > > > +				npages = i;
> > > > > +				goto err_unmap;
> > > > > +			}
> > > > > +		}
> > > > > +
> > > > > +		/* Huge pages, reduce memory footprint */
> > > > > +		if (order) {
> > > > > +			dma_addr = kmalloc_array(j, sizeof(*dma_addr),
> > > > > +						 GFP_KERNEL);
> > > > > +			if (dma_addr) {
> > > > > +				for (i = 0; i < j; ++i)
> > > > > +					dma_addr[i] = (dma_addr_t)pfns[i];
> > > > > +				kvfree(pfns);
> > > > > +				kfree_mapping = true;
> > > > > +			} else {
> > > > > +				dma_addr = (dma_addr_t *)pfns;
> > > > > +			}
> > > > > +		}
> > > > > +
> > > > > +		/* Do not race with notifier unmapping pages */
> > > > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > > > +		range->order = order;
> > > > > +		range->flags.kfree_mapping = kfree_mapping;
> > > > > +		range->flags.has_dma_mapping = true;
> > > > > +		range->dma_addr = dma_addr;
> > > > > +		range->vram_allocation = NULL;
> > > > > +		if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
> > > > > +			err = -EAGAIN;
> > > > > +			__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > > > +		}
> > > > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > > > +	}
> > > > > +
> > > > > +	if (err == -EAGAIN)
> > > > > +		goto retry;
> > > > > +set_seqno:
> > > > > +	range->notifier_seq = hmm_range.notifier_seq;
> > > > > +
> > > > > +	return 0;
> > > > > +
> > > > > +err_unmap:
> > > > > +	for_each_dma_page(i, j, npages, order)
> > > > > +		dma_unmap_page(gpusvm->drm->dev,
> > > > > +			       (dma_addr_t)pfns[j],
> > > > > +			       PAGE_SIZE << order, DMA_BIDIRECTIONAL);
> > > > > +err_free:
> > > > > +	if (alloc_pfns)
> > > > > +		kvfree(pfns);
> > > > > +err_out:
> > > > > +	return err;
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_range_unmap_pages - Unmap pages associated with a GPU SVM range
> > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > + * @range: Pointer to the GPU SVM range structure
> > > > > + * @ctx: GPU SVM context
> > > > > + *
> > > > > + * This function unmaps pages associated with a GPU SVM range. If @in_notifier
> > > > > + * is set, it is assumed that gpusvm->notifier_lock is held in write mode; if it
> > > > > + * is clear, it acquires gpusvm->notifier_lock in read mode. Must be called on
> > > > > + * each GPU SVM range attached to notifier in gpusvm->ops->invalidate for IOMMU
> > > > > + * security model.
> > > > > + */
> > > > > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > > > > +				  struct drm_gpusvm_range *range,
> > > > > +				  const struct drm_gpusvm_ctx *ctx)
> > > > > +{
> > > > > +	if (ctx->in_notifier)
> > > > > +		lockdep_assert_held_write(&gpusvm->notifier_lock);
> > > > > +	else
> > > > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > > > +
> > > > > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > > > +
> > > > > +	if (!ctx->in_notifier)
> > > > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_migration_put_page - Put a migration page
> > > > > + * @page: Pointer to the page to put
> > > > > + *
> > > > > + * This function unlocks and puts a page.
> > > > > + */
> > > > > +static void drm_gpusvm_migration_put_page(struct page *page)
> > > > > +{
> > > > > +	unlock_page(page);
> > > > > +	put_page(page);
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_migration_put_pages - Put migration pages
> > > > > + * @npages: Number of pages
> > > > > + * @migrate_pfn: Array of migrate page frame numbers
> > > > > + *
> > > > > + * This function puts an array of pages.
> > > > > + */
> > > > > +static void drm_gpusvm_migration_put_pages(unsigned long npages,
> > > > > +					   unsigned long *migrate_pfn)
> > > > > +{
> > > > > +	unsigned long i;
> > > > > +
> > > > > +	for (i = 0; i < npages; ++i) {
> > > > > +		if (!migrate_pfn[i])
> > > > > +			continue;
> > > > > +
> > > > > +		drm_gpusvm_migration_put_page(migrate_pfn_to_page(migrate_pfn[i]));
> > > > > +		migrate_pfn[i] = 0;
> > > > > +	}
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_get_vram_page - Get a reference to a VRAM page
> > > > > + * @page: Pointer to the page
> > > > > + * @zdd: Pointer to the GPU SVM zone device data
> > > > > + *
> > > > > + * This function associates the given page with the specified GPU SVM zone
> > > > > + * device data and initializes it for zone device usage.
> > > > > + */
> > > > > +static void drm_gpusvm_get_vram_page(struct page *page,
> > > > > +				     struct drm_gpusvm_zdd *zdd)
> > > > > +{
> > > > > +	page->zone_device_data = drm_gpusvm_zdd_get(zdd);
> > > > > +	zone_device_page_init(page);
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_migrate_map_pages() - Map migration pages for GPU SVM migration
> > > > > + * @dev: The device for which the pages are being mapped
> > > > > + * @dma_addr: Array to store DMA addresses corresponding to mapped pages
> > > > > + * @migrate_pfn: Array of migrate page frame numbers to map
> > > > > + * @npages: Number of pages to map
> > > > > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > > > > + *
> > > > > + * This function maps pages of memory for migration usage in GPU SVM. It
> > > > > + * iterates over each page frame number provided in @migrate_pfn, maps the
> > > > > + * corresponding page, and stores the DMA address in the provided @dma_addr
> > > > > + * array.
> > > > > + *
> > > > > + * Return: 0 on success, -EFAULT if an error occurs during mapping.
> > > > > + */
> > > > > +static int drm_gpusvm_migrate_map_pages(struct device *dev,
> > > > > +					dma_addr_t *dma_addr,
> > > > > +					long unsigned int *migrate_pfn,
> > > > > +					unsigned long npages,
> > > > > +					enum dma_data_direction dir)
> > > > > +{
> > > > > +	unsigned long i;
> > > > > +
> > > > > +	for (i = 0; i < npages; ++i) {
> > > > > +		struct page *page = migrate_pfn_to_page(migrate_pfn[i]);
> > > > > +
> > > > > +		if (!page)
> > > > > +			continue;
> > > > > +
> > > > > +		if (WARN_ON_ONCE(is_zone_device_page(page)))
> > > > > +			return -EFAULT;
> > > > > +
> > > > > +		dma_addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir);
> > > > > +		if (dma_mapping_error(dev, dma_addr[i]))
> > > > > +			return -EFAULT;
> > > > > +	}
> > > > > +
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously mapped for GPU SVM migration
> > > > > + * @dev: The device for which the pages were mapped
> > > > > + * @dma_addr: Array of DMA addresses corresponding to mapped pages
> > > > > + * @npages: Number of pages to unmap
> > > > > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > > > > + *
> > > > > + * This function unmaps previously mapped pages of memory for GPU Shared Virtual
> > > > > + * Memory (SVM). It iterates over each DMA address provided in @dma_addr, checks
> > > > > + * if it's valid and not already unmapped, and unmaps the corresponding page.
> > > > > + */
> > > > > +static void drm_gpusvm_migrate_unmap_pages(struct device *dev,
> > > > > +					   dma_addr_t *dma_addr,
> > > > > +					   unsigned long npages,
> > > > > +					   enum dma_data_direction dir)
> > > > > +{
> > > > > +	unsigned long i;
> > > > > +
> > > > > +	for (i = 0; i < npages; ++i) {
> > > > > +		if (!dma_addr[i] || dma_mapping_error(dev, dma_addr[i]))
> > > > > +			continue;
> > > > > +
> > > > > +		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
> > > > > +	}
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_migrate_to_vram - Migrate GPU SVM range to VRAM
> > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > + * @range: Pointer to the GPU SVM range structure
> > > > > + * @vram_allocation: Driver-private pointer to the VRAM allocation. The caller
> > > > > + *                   should hold a reference to the VRAM allocation, which
> > > > > + *                   should be dropped via ops->vram_allocation or upon the
> > > > > + *                   failure of this function.
> > > > > + * @ctx: GPU SVM context
> > > > > + *
> > > > > + * This function migrates the specified GPU SVM range to VRAM. It performs the
> > > > > + * necessary setup and invokes the driver-specific operations for migration to
> > > > > + * VRAM. Upon successful return, @vram_allocation can safely reference @range
> > > > > + * until ops->vram_release is called which only upon successful return.
> > > > > + *
> > > > > + * Returns:
> > > > > + * 0 on success, negative error code on failure.
> > > > > + */
> > > > > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > > > > +			       struct drm_gpusvm_range *range,
> > > > > +			       void *vram_allocation,
> > > > > +			       const struct drm_gpusvm_ctx *ctx)
> > > > > +{
> > > > > +	u64 start = range->va.start, end = range->va.end;
> > > > > +	struct migrate_vma migrate = {
> > > > > +		.start		= start,
> > > > > +		.end		= end,
> > > > > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > > > > +		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
> > > > > +	};
> > > > > +	struct mm_struct *mm = gpusvm->mm;
> > > > > +	unsigned long i, npages = npages_in_range(start, end);
> > > > > +	struct vm_area_struct *vas;
> > > > > +	struct drm_gpusvm_zdd *zdd = NULL;
> > > > > +	struct page **pages;
> > > > > +	dma_addr_t *dma_addr;
> > > > > +	void *buf;
> > > > > +	int err;
> > > > > +
> > > > > +	if (!range->flags.migrate_vram)
> > > > > +		return -EINVAL;
> > > > > +
> > > > > +	if (!gpusvm->ops->populate_vram_pfn || !gpusvm->ops->copy_to_vram ||
> > > > > +	    !gpusvm->ops->copy_to_sram)
> > > > > +		return -EOPNOTSUPP;
> > > > > +
> > > > > +	if (!ctx->mmap_locked) {
> > > > > +		if (!mmget_not_zero(mm)) {
> > > > > +			err = -EFAULT;
> > > > > +			goto err_out;
> > > > > +		}
> > > > > +		mmap_write_lock(mm);
> > > > > +	}
> > > > > +
> > > > > +	mmap_assert_locked(mm);
> > > > > +
> > > > > +	vas = vma_lookup(mm, start);
> > > > > +	if (!vas) {
> > > > > +		err = -ENOENT;
> > > > > +		goto err_mmunlock;
> > > > > +	}
> > > > > +
> > > > > +	if (end > vas->vm_end || start < vas->vm_start) {
> > > > > +		err = -EINVAL;
> > > > > +		goto err_mmunlock;
> > > > > +	}
> > > > > +
> > > > > +	if (!vma_is_anonymous(vas)) {
> > > > > +		err = -EBUSY;
> > > > > +		goto err_mmunlock;
> > > > > +	}
> > > > > +
> > > > > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
> > > > > +		       sizeof(*pages), GFP_KERNEL);
> > > > > +	if (!buf) {
> > > > > +		err = -ENOMEM;
> > > > > +		goto err_mmunlock;
> > > > > +	}
> > > > > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > > > > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
> > > > > +
> > > > > +	zdd = drm_gpusvm_zdd_alloc(range);
> > > > > +	if (!zdd) {
> > > > > +		err = -ENOMEM;
> > > > > +		goto err_free;
> > > > > +	}
> > > > > +
> > > > > +	migrate.vma = vas;
> > > > > +	migrate.src = buf;
> > > > > +	migrate.dst = migrate.src + npages;
> > > > > +
> > > > > +	err = migrate_vma_setup(&migrate);
> > > > > +	if (err)
> > > > > +		goto err_free;
> > > > > +
> > > > > +	/*
> > > > > +	 * FIXME: Below cases, !migrate.cpages and migrate.cpages != npages, not
> > > > > +	 * always an error. Need to revisit possible cases and how to handle. We
> > > > > +	 * could prefault on migrate.cpages != npages via hmm_range_fault.
> > > > > +	 */
> > > > > +
> > > > > +	if (!migrate.cpages) {
> > > > > +		err = -EFAULT;
> > > > > +		goto err_free;
> > > > > +	}
> > > > > +
> > > > > +	if (migrate.cpages != npages) {
> > > > > +		err = -EBUSY;
> > > > > +		goto err_finalize;
> > > > > +	}
> > > > > +
> > > > > +	err = gpusvm->ops->populate_vram_pfn(gpusvm, vram_allocation, npages,
> > > > > +					     migrate.dst);
> > > > > +	if (err)
> > > > > +		goto err_finalize;
> > > > > +
> > > > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> > > > > +					   migrate.src, npages, DMA_TO_DEVICE);
> > > > > +	if (err)
> > > > > +		goto err_finalize;
> > > > > +
> > > > > +	for (i = 0; i < npages; ++i) {
> > > > > +		struct page *page = pfn_to_page(migrate.dst[i]);
> > > > > +
> > > > > +		pages[i] = page;
> > > > > +		migrate.dst[i] = migrate_pfn(migrate.dst[i]);
> > > > > +		drm_gpusvm_get_vram_page(page, zdd);
> > > > > +	}
> > > > > +
> > > > > +	err = gpusvm->ops->copy_to_vram(gpusvm, pages, dma_addr, npages);
> > > > > +	if (err)
> > > > > +		goto err_finalize;
> > > > > +
> > > > > +	/* Upon success bind vram allocation to range and zdd */
> > > > > +	range->vram_allocation = vram_allocation;
> > > > > +	WRITE_ONCE(zdd->vram_allocation, vram_allocation);	/* Owns ref */
> > > > > +
> > > > > +err_finalize:
> > > > > +	if (err)
> > > > > +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> > > > > +	migrate_vma_pages(&migrate);
> > > > > +	migrate_vma_finalize(&migrate);
> > > > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
> > > > > +				       DMA_TO_DEVICE);
> > > > > +err_free:
> > > > > +	if (zdd)
> > > > > +		drm_gpusvm_zdd_put(zdd);
> > > > > +	kvfree(buf);
> > > > > +err_mmunlock:
> > > > > +	if (!ctx->mmap_locked) {
> > > > > +		mmap_write_unlock(mm);
> > > > > +		mmput(mm);
> > > > > +	}
> > > > > +err_out:
> > > > > +	return err;
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_migrate_populate_sram_pfn - Populate SRAM PFNs for a VM area
> > > > > + * @vas: Pointer to the VM area structure, can be NULL
> > > > > + * @npages: Number of pages to populate
> > > > > + * @src_mpfn: Source array of migrate PFNs
> > > > > + * @mpfn: Array of migrate PFNs to populate
> > > > > + * @addr: Start address for PFN allocation
> > > > > + *
> > > > > + * This function populates the SRAM migrate page frame numbers (PFNs) for the
> > > > > + * specified VM area structure. It allocates and locks pages in the VM area for
> > > > > + * SRAM usage. If vas is non-NULL use alloc_page_vma for allocation, if NULL use
> > > > > + * alloc_page for allocation.
> > > > > + *
> > > > > + * Returns:
> > > > > + * 0 on success, negative error code on failure.
> > > > > + */
> > > > > +static int drm_gpusvm_migrate_populate_sram_pfn(struct vm_area_struct *vas,
> > > > > +						unsigned long npages,
> > > > > +						unsigned long *src_mpfn,
> > > > > +						unsigned long *mpfn, u64 addr)
> > > > > +{
> > > > > +	unsigned long i;
> > > > > +
> > > > > +	for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
> > > > > +		struct page *page;
> > > > > +
> > > > > +		if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
> > > > > +			continue;
> > > > > +
> > > > > +		if (vas)
> > > > > +			page = alloc_page_vma(GFP_HIGHUSER, vas, addr);
> > > > > +		else
> > > > > +			page = alloc_page(GFP_HIGHUSER);
> > > > > +
> > > > > +		if (!page)
> > > > > +			return -ENOMEM;
> > > > > +
> > > > > +		lock_page(page);
> > > > > +		mpfn[i] = migrate_pfn(page_to_pfn(page));
> > > > > +	}
> > > > > +
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_evict_to_sram - Evict GPU SVM range to SRAM
> > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > + * @range: Pointer to the GPU SVM range structure
> > > > > + *
> > > > > + * Similar to __drm_gpusvm_migrate_to_sram but does not require mmap lock and
> > > > > + * migration done via migrate_device_* functions. Fallback path as it is
> > > > > + * preferred to issue migrations with mmap lock.
> > > > > + *
> > > > > + * Returns:
> > > > > + * 0 on success, negative error code on failure.
> > > > > + */
> > > > > +static int drm_gpusvm_evict_to_sram(struct drm_gpusvm *gpusvm,
> > > > > +				    struct drm_gpusvm_range *range)
> > > > > +{
> > > > > +	unsigned long npages;
> > > > > +	struct page **pages;
> > > > > +	unsigned long *src, *dst;
> > > > > +	dma_addr_t *dma_addr;
> > > > > +	void *buf;
> > > > > +	int i, err = 0;
> > > > > +
> > > > > +	npages = npages_in_range(range->va.start, range->va.end);
> > > > > +
> > > > > +	buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr) +
> > > > > +		       sizeof(*pages), GFP_KERNEL);
> > > > > +	if (!buf) {
> > > > > +		err = -ENOMEM;
> > > > > +		goto err_out;
> > > > > +	}
> > > > > +	src = buf;
> > > > > +	dst = buf + (sizeof(*src) * npages);
> > > > > +	dma_addr = buf + (2 * sizeof(*src) * npages);
> > > > > +	pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) * npages;
> > > > > +
> > > > > +	err = gpusvm->ops->populate_vram_pfn(gpusvm, range->vram_allocation,
> > > > > +					     npages, src);
> > > > > +	if (err)
> > > > > +		goto err_free;
> > > > > +
> > > > > +	err = migrate_device_vma_range(gpusvm->mm,
> > > > > +				       gpusvm->device_private_page_owner, src,
> > > > > +				       npages, range->va.start);
> > > > > +	if (err)
> > > > > +		goto err_free;
> > > > > +
> > > > > +	err = drm_gpusvm_migrate_populate_sram_pfn(NULL, npages, src, dst, 0);
> > > > > +	if (err)
> > > > > +		goto err_finalize;
> > > > > +
> > > > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> > > > > +					   dst, npages, DMA_BIDIRECTIONAL);
> > > > > +	if (err)
> > > > > +		goto err_finalize;
> > > > > +
> > > > > +	for (i = 0; i < npages; ++i)
> > > > > +		pages[i] = migrate_pfn_to_page(src[i]);
> > > > > +
> > > > > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr, npages);
> > > > > +	if (err)
> > > > > +		goto err_finalize;
> > > > > +
> > > > > +err_finalize:
> > > > > +	if (err)
> > > > > +		drm_gpusvm_migration_put_pages(npages, dst);
> > > > > +	migrate_device_pages(src, dst, npages);
> > > > > +	migrate_device_finalize(src, dst, npages);
> > > > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
> > > > > +				       DMA_BIDIRECTIONAL);
> > > > > +err_free:
> > > > > +	kvfree(buf);
> > > > > +err_out:
> > > > > +
> > > > > +	return err;
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * __drm_gpusvm_migrate_to_sram - Migrate GPU SVM range to SRAM (internal)
> > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > + * @vas: Pointer to the VM area structure
> > > > > + * @page: Pointer to the page for fault handling (can be NULL)
> > > > > + * @start: Start address of the migration range
> > > > > + * @end: End address of the migration range
> > > > > + *
> > > > > + * This internal function performs the migration of the specified GPU SVM range
> > > > > + * to SRAM. It sets up the migration, populates + dma maps SRAM PFNs, and
> > > > > + * invokes the driver-specific operations for migration to SRAM.
> > > > > + *
> > > > > + * Returns:
> > > > > + * 0 on success, negative error code on failure.
> > > > > + */
> > > > > +static int __drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > > > +					struct vm_area_struct *vas,
> > > > > +					struct page *page,
> > > > > +					u64 start, u64 end)
> > > > > +{
> > > > > +	struct migrate_vma migrate = {
> > > > > +		.vma		= vas,
> > > > > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > > > > +		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
> > > > > +		.fault_page	= page,
> > > > > +	};
> > > > > +	unsigned long npages;
> > > > > +	struct page **pages;
> > > > > +	dma_addr_t *dma_addr;
> > > > > +	void *buf;
> > > > > +	int i, err = 0;
> > > > > +
> > > > > +	mmap_assert_locked(gpusvm->mm);
> > > > > +
> > > > > +	/* Corner where VMA area struct has been partially unmapped */
> > > > > +	if (start < vas->vm_start)
> > > > > +		start = vas->vm_start;
> > > > > +	if (end > vas->vm_end)
> > > > > +		end = vas->vm_end;
> > > > > +
> > > > > +	migrate.start = start;
> > > > > +	migrate.end = end;
> > > > > +	npages = npages_in_range(start, end);
> > > > > +
> > > > > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
> > > > > +		       sizeof(*pages), GFP_KERNEL);
> > > > > +	if (!buf) {
> > > > > +		err = -ENOMEM;
> > > > > +		goto err_out;
> > > > > +	}
> > > > > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > > > > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
> > > > > +
> > > > > +	migrate.vma = vas;
> > > > > +	migrate.src = buf;
> > > > > +	migrate.dst = migrate.src + npages;
> > > > > +
> > > > > +	err = migrate_vma_setup(&migrate);
> > > > > +	if (err)
> > > > > +		goto err_free;
> > > > > +
> > > > > +	/* Raced with another CPU fault, nothing to do */
> > > > > +	if (!migrate.cpages)
> > > > > +		goto err_free;
> > > > > +
> > > > > +	err = drm_gpusvm_migrate_populate_sram_pfn(vas, npages,
> > > > > +						   migrate.src, migrate.dst,
> > > > > +						   start);
> > > > > +	if (err)
> > > > > +		goto err_finalize;
> > > > > +
> > > > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> > > > > +					   migrate.dst, npages,
> > > > > +					   DMA_BIDIRECTIONAL);
> > > > > +	if (err)
> > > > > +		goto err_finalize;
> > > > > +
> > > > > +	for (i = 0; i < npages; ++i)
> > > > > +		pages[i] = migrate_pfn_to_page(migrate.src[i]);
> > > > > +
> > > > > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr, npages);
> > > > > +	if (err)
> > > > > +		goto err_finalize;
> > > > > +
> > > > > +err_finalize:
> > > > > +	if (err)
> > > > > +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> > > > > +	migrate_vma_pages(&migrate);
> > > > > +	migrate_vma_finalize(&migrate);
> > > > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
> > > > > +				       DMA_BIDIRECTIONAL);
> > > > > +err_free:
> > > > > +	kvfree(buf);
> > > > > +err_out:
> > > > > +	mmap_assert_locked(gpusvm->mm);
> > > > > +
> > > > > +	return err;
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_migrate_to_sram - Migrate (evict) GPU SVM range to SRAM
> > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > + * @range: Pointer to the GPU SVM range structure
> > > > > + * @ctx: GPU SVM context
> > > > > + *
> > > > > + * This function initiates the migration of the specified GPU SVM range to
> > > > > + * SRAM. It performs necessary checks and invokes the internal migration
> > > > > + * function for actual migration.
> > > > > + *
> > > > > + * Returns:
> > > > > + * 0 on success, negative error code on failure.
> > > > > + */
> > > > > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > > > +			       struct drm_gpusvm_range *range,
> > > > > +			       const struct drm_gpusvm_ctx *ctx)
> > > > > +{
> > > > > +	u64 start = range->va.start, end = range->va.end;
> > > > > +	struct mm_struct *mm = gpusvm->mm;
> > > > > +	struct vm_area_struct *vas;
> > > > > +	int err;
> > > > > +	bool retry = false;
> > > > > +
> > > > > +	if (!ctx->mmap_locked) {
> > > > > +		if (!mmget_not_zero(mm)) {
> > > > > +			err = -EFAULT;
> > > > > +			goto err_out;
> > > > > +		}
> > > > > +		if (ctx->trylock_mmap) {
> > > > > +			if (!mmap_read_trylock(mm))  {
> > > > > +				err = drm_gpusvm_evict_to_sram(gpusvm, range);
> > > > > +				goto err_mmput;
> > > > > +			}
> > > > > +		} else {
> > > > > +			mmap_read_lock(mm);
> > > > > +		}
> > > > > +	}
> > > > > +
> > > > > +	mmap_assert_locked(mm);
> > > > > +
> > > > > +	/*
> > > > > +	 * Loop required to find all VMA area structs for the corner case when
> > > > > +	 * VRAM backing has been partially unmapped from MM's address space.
> > > > > +	 */
> > > > > +again:
> > > > > +	vas = find_vma(mm, start);
> > > > > +	if (!vas) {
> > > > > +		if (!retry)
> > > > > +			err = -ENOENT;
> > > > > +		goto err_mmunlock;
> > > > > +	}
> > > > > +
> > > > > +	if (end <= vas->vm_start || start >= vas->vm_end) {
> > > > > +		if (!retry)
> > > > > +			err = -EINVAL;
> > > > > +		goto err_mmunlock;
> > > > > +	}
> > > > > +
> > > > > +	err = __drm_gpusvm_migrate_to_sram(gpusvm, vas, NULL, start, end);
> > > > > +	if (err)
> > > > > +		goto err_mmunlock;
> > > > > +
> > > > > +	if (vas->vm_end < end) {
> > > > > +		retry = true;
> > > > > +		start = vas->vm_end;
> > > > > +		goto again;
> > > > > +	}
> > > > > +
> > > > > +	if (!ctx->mmap_locked) {
> > > > > +		mmap_read_unlock(mm);
> > > > > +		/*
> > > > > +		 * Using mmput_async as this function can be called while
> > > > > +		 * holding a dma-resv lock, and a final put can grab the mmap
> > > > > +		 * lock, causing a lock inversion.
> > > > > +		 */
> > > > > +		mmput_async(mm);
> > > > > +	}
> > > > > +
> > > > > +	return 0;
> > > > > +
> > > > > +err_mmunlock:
> > > > > +	if (!ctx->mmap_locked)
> > > > > +		mmap_read_unlock(mm);
> > > > > +err_mmput:
> > > > > +	if (!ctx->mmap_locked)
> > > > > +		mmput_async(mm);
> > > > > +err_out:
> > > > > +	return err;
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_page_free - Put GPU SVM zone device data associated with a page
> > > > > + * @page: Pointer to the page
> > > > > + *
> > > > > + * This function is a callback used to put the GPU SVM zone device data
> > > > > + * associated with a page when it is being released.
> > > > > + */
> > > > > +static void drm_gpusvm_page_free(struct page *page)
> > > > > +{
> > > > > +	drm_gpusvm_zdd_put(page->zone_device_data);
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_migrate_to_ram - Migrate GPU SVM range to RAM (page fault handler)
> > > > > + * @vmf: Pointer to the fault information structure
> > > > > + *
> > > > > + * This function is a page fault handler used to migrate a GPU SVM range to RAM.
> > > > > + * It retrieves the GPU SVM range information from the faulting page and invokes
> > > > > + * the internal migration function to migrate the range back to RAM.
> > > > > + *
> > > > > + * Returns:
> > > > > + * VM_FAULT_SIGBUS on failure, 0 on success.
> > > > > + */
> > > > > +static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault *vmf)
> > > > > +{
> > > > > +	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
> > > > > +	int err;
> > > > > +
> > > > > +	err = __drm_gpusvm_migrate_to_sram(zdd->range->gpusvm,
> > > > > +					   vmf->vma, vmf->page,
> > > > > +					   zdd->range->va.start,
> > > > > +					   zdd->range->va.end);
> > > > > +
> > > > > +	return err ? VM_FAULT_SIGBUS : 0;
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_pagemap_ops - Device page map operations for GPU SVM
> > > > > + */
> > > > > +static const struct dev_pagemap_ops drm_gpusvm_pagemap_ops = {
> > > > > +	.page_free = drm_gpusvm_page_free,
> > > > > +	.migrate_to_ram = drm_gpusvm_migrate_to_ram,
> > > > > +};
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_pagemap_ops_get - Retrieve GPU SVM device page map operations
> > > > > + *
> > > > > + * Returns:
> > > > > + * Pointer to the GPU SVM device page map operations structure.
> > > > > + */
> > > > > +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void)
> > > > > +{
> > > > > +	return &drm_gpusvm_pagemap_ops;
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_has_mapping - Check if GPU SVM has mapping for the given address range
> > > > > + * @gpusvm: Pointer to the GPU SVM structure.
> > > > > + * @start: Start address
> > > > > + * @end: End address
> > > > > + *
> > > > > + * Returns:
> > > > > + * True if GPU SVM has mapping, False otherwise
> > > > > + */
> > > > > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start, u64 end)
> > > > > +{
> > > > > +	struct drm_gpusvm_notifier *notifier;
> > > > > +
> > > > > +	drm_gpusvm_for_each_notifier(notifier, gpusvm, start, end) {
> > > > > +		struct drm_gpusvm_range *range = NULL;
> > > > > +
> > > > > +		drm_gpusvm_for_each_range(range, notifier, start, end)
> > > > > +			return true;
> > > > > +	}
> > > > > +
> > > > > +	return false;
> > > > > +}
> > > > > diff --git a/drivers/gpu/drm/xe/drm_gpusvm.h b/drivers/gpu/drm/xe/drm_gpusvm.h
> > > > > new file mode 100644
> > > > > index 000000000000..0ea70f8534a8
> > > > > --- /dev/null
> > > > > +++ b/drivers/gpu/drm/xe/drm_gpusvm.h
> > > > > @@ -0,0 +1,415 @@
> > > > > +/* SPDX-License-Identifier: MIT */
> > > > > +/*
> > > > > + * Copyright © 2024 Intel Corporation
> > > > > + */
> > > > > +
> > > > > +#ifndef __DRM_GPUSVM_H__
> > > > > +#define __DRM_GPUSVM_H__
> > > > > +
> > > > > +#include <linux/kref.h>
> > > > > +#include <linux/mmu_notifier.h>
> > > > > +#include <linux/workqueue.h>
> > > > > +
> > > > > +struct dev_pagemap_ops;
> > > > > +struct drm_device;
> > > > > +struct drm_gpusvm;
> > > > > +struct drm_gpusvm_notifier;
> > > > > +struct drm_gpusvm_ops;
> > > > > +struct drm_gpusvm_range;
> > > > > +
> > > > > +/**
> > > > > + * struct drm_gpusvm_ops - Operations structure for GPU SVM
> > > > > + *
> > > > > + * This structure defines the operations for GPU Shared Virtual Memory (SVM).
> > > > > + * These operations are provided by the GPU driver to manage SVM ranges and
> > > > > + * perform operations such as migration between VRAM and system RAM.
> > > > > + */
> > > > > +struct drm_gpusvm_ops {
> > > > > +	/**
> > > > > +	 * @notifier_alloc: Allocate a GPU SVM notifier (optional)
> > > > > +	 *
> > > > > +	 * This function shall allocate a GPU SVM notifier.
> > > > > +	 *
> > > > > +	 * Returns:
> > > > > +	 * Pointer to the allocated GPU SVM notifier on success, NULL on failure.
> > > > > +	 */
> > > > > +	struct drm_gpusvm_notifier *(*notifier_alloc)(void);
> > > > > +
> > > > > +	/**
> > > > > +	 * @notifier_free: Free a GPU SVM notifier (optional)
> > > > > +	 * @notifier: Pointer to the GPU SVM notifier to be freed
> > > > > +	 *
> > > > > +	 * This function shall free a GPU SVM notifier.
> > > > > +	 */
> > > > > +	void (*notifier_free)(struct drm_gpusvm_notifier *notifier);
> > > > > +
> > > > > +	/**
> > > > > +	 * @range_alloc: Allocate a GPU SVM range (optional)
> > > > > +	 * @gpusvm: Pointer to the GPU SVM
> > > > > +	 *
> > > > > +	 * This function shall allocate a GPU SVM range.
> > > > > +	 *
> > > > > +	 * Returns:
> > > > > +	 * Pointer to the allocated GPU SVM range on success, NULL on failure.
> > > > > +	 */
> > > > > +	struct drm_gpusvm_range *(*range_alloc)(struct drm_gpusvm *gpusvm);
> > > > > +
> > > > > +	/**
> > > > > +	 * @range_free: Free a GPU SVM range (optional)
> > > > > +	 * @range: Pointer to the GPU SVM range to be freed
> > > > > +	 *
> > > > > +	 * This function shall free a GPU SVM range.
> > > > > +	 */
> > > > > +	void (*range_free)(struct drm_gpusvm_range *range);
> > > > > +
> > > > > +	/**
> > > > > +	 * @vram_release: Release VRAM allocation (optional)
> > > > > +	 * @vram_allocation: Driver-private pointer to the VRAM allocation
> > > > > +	 *
> > > > > +	 * This function shall release the VRAM allocation and is expected
> > > > > +	 * to drop a reference to the VRAM allocation.
> > > > > +	 */
> > > > > +	void (*vram_release)(void *vram_allocation);
> > > > > +
> > > > > +	/**
> > > > > +	 * @populate_vram_pfn: Populate VRAM PFN (required for migration)
> > > > > +	 * @gpusvm: Pointer to the GPU SVM
> > > > > +	 * @vram_allocation: Driver-private pointer to the VRAM allocation
> > > > > +	 * @npages: Number of pages to populate
> > > > > +	 * @pfn: Array of page frame numbers to populate
> > > > > +	 *
> > > > > +	 * This function shall populate VRAM page frame numbers (PFN).
> > > > > +	 *
> > > > > +	 * Returns:
> > > > > +	 * 0 on success, a negative error code on failure.
> > > > > +	 */
> > > > > +	int (*populate_vram_pfn)(struct drm_gpusvm *gpusvm,
> > > > > +				 void *vram_allocation,
> > > > > +				 unsigned long npages,
> > > > > +				 unsigned long *pfn);
> > > > > +
> > > > > +	/**
> > > > > +	 * @copy_to_vram: Copy to VRAM (required for migration)
> > > > > +	 * @gpusvm: Pointer to the GPU SVM
> > > > > +	 * @pages: Pointer to array of VRAM pages (destination)
> > > > > +	 * @dma_addr: Pointer to array of DMA addresses (source)
> > > > > +	 * @npages: Number of pages to copy
> > > > > +	 *
> > > > > +	 * This function shall copy pages to VRAM.
> > > > > +	 *
> > > > > +	 * Returns:
> > > > > +	 * 0 on success, a negative error code on failure.
> > > > > +	 */
> > > > > +	int (*copy_to_vram)(struct drm_gpusvm *gpusvm,
> > > > > +			    struct page **pages,
> > > > > +			    dma_addr_t *dma_addr,
> > > > > +			    unsigned long npages);
> > > > > +
> > > > > +	/**
> > > > > +	 * @copy_to_sram: Copy to system RAM (required for migration)
> > > > > +	 * @gpusvm: Pointer to the GPU SVM
> > > > > +	 * @pages: Pointer to array of VRAM pages (source)
> > > > > +	 * @dma_addr: Pointer to array of DMA addresses (destination)
> > > > > +	 * @npages: Number of pages to copy
> > > > > +	 *
> > > > > +	 * This function shall copy pages to system RAM.
> > > > > +	 *
> > > > > +	 * Returns:
> > > > > +	 * 0 on success, a negative error code on failure.
> > > > > +	 */
> > > > > +	int (*copy_to_sram)(struct drm_gpusvm *gpusvm,
> > > > > +			    struct page **pages,
> > > > > +			    dma_addr_t *dma_addr,
> > > > > +			    unsigned long npages);
> > > > > +
> > > > > +	/**
> > > > > +	 * @invalidate: Invalidate GPU SVM notifier (required)
> > > > > +	 * @gpusvm: Pointer to the GPU SVM
> > > > > +	 * @notifier: Pointer to the GPU SVM notifier
> > > > > +	 * @mmu_range: Pointer to the mmu_notifier_range structure
> > > > > +	 *
> > > > > +	 * This function shall invalidate the GPU page tables. It can safely
> > > > > +	 * walk the notifier range RB tree/list in this function. Called while
> > > > > +	 * holding the notifier lock.
> > > > > +	 */
> > > > > +	void (*invalidate)(struct drm_gpusvm *gpusvm,
> > > > > +			   struct drm_gpusvm_notifier *notifier,
> > > > > +			   const struct mmu_notifier_range *mmu_range);
> > > > > +};
> > > > > +
> > > > > +/**
> > > > > + * struct drm_gpusvm_notifier - Structure representing a GPU SVM notifier
> > > > > + *
> > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > + * @notifier: MMU interval notifier
> > > > > + * @interval: Interval for the notifier
> > > > > + * @rb: Red-black tree node for the parent GPU SVM structure notifier tree
> > > > > + * @root: Cached root node of the RB tree containing ranges
> > > > > + * @range_list: List head of ranges in the same order they appear in the
> > > > > + *              interval tree. This is useful to keep iterating ranges while
> > > > > + *              doing modifications to the RB tree.
> > > > > + * @flags.removed: Flag indicating whether the MMU interval notifier has been
> > > > > + *                 removed
> > > > > + *
> > > > > + * This structure represents a GPU SVM notifier.
> > > > > + */
> > > > > +struct drm_gpusvm_notifier {
> > > > > +	struct drm_gpusvm *gpusvm;
> > > > > +	struct mmu_interval_notifier notifier;
> > > > > +	struct {
> > > > > +		u64 start;
> > > > > +		u64 end;
> > > > > +	} interval;
> > > > > +	struct {
> > > > > +		struct rb_node node;
> > > > > +		struct list_head entry;
> > > > > +		u64 __subtree_last;
> > > > > +	} rb;
> > > > > +	struct rb_root_cached root;
> > > > > +	struct list_head range_list;
> > > > > +	struct {
> > > > > +		u32 removed : 1;
> > > > > +	} flags;
> > > > > +};
> > > > > +
> > > > > +/**
> > > > > + * struct drm_gpusvm_range - Structure representing a GPU SVM range
> > > > > + *
> > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > + * @notifier: Pointer to the GPU SVM notifier
> > > > > + * @refcount: Reference count for the range
> > > > > + * @rb: Red-black tree node for the parent GPU SVM notifier structure range tree
> > > > > + * @va: Virtual address range
> > > > > + * @notifier_seq: Notifier sequence number of the range's pages
> > > > > + * @pages: Pointer to the array of pages (if backing store is in VRAM)
> > > > > + * @dma_addr: DMA address array (if backing store is SRAM and DMA mapped)
> > > > > + * @vram_allocation: Driver-private pointer to the VRAM allocation
> > > > > + * @order: Order of DMA mapping, i.e. PAGE_SIZE << order is the mapping size
> > > > > + * @flags.migrate_vram: Flag indicating whether the range can be migrated to VRAM
> > > > > + * @flags.unmapped: Flag indicating if the range has been unmapped
> > > > > + * @flags.partial_unmap: Flag indicating if the range has been partially unmapped
> > > > > + * @flags.has_vram_pages: Flag indicating if the range has vram pages
> > > > > + * @flags.has_dma_mapping: Flag indicating if the range has a DMA mapping
> > > > > + * @flags.kfree_mapping: Flag indicating @dma_addr is a compact allocation based
> > > > > + *                       on @order which is released via kfree
> > > > > + *
> > > > > + * This structure represents a GPU SVM range used for tracking memory ranges
> > > > > + * mapped in a DRM device.
> > > > > + */
> > > > > +struct drm_gpusvm_range {
> > > > > +	struct drm_gpusvm *gpusvm;
> > > > > +	struct drm_gpusvm_notifier *notifier;
> > > > > +	struct kref refcount;
> > > > > +	struct {
> > > > > +		struct rb_node node;
> > > > > +		struct list_head entry;
> > > > > +		u64 __subtree_last;
> > > > > +	} rb;
> > > > > +	struct {
> > > > > +		u64 start;
> > > > > +		u64 end;
> > > > > +	} va;
> > > > > +	unsigned long notifier_seq;
> > > > > +	union {
> > > > > +		struct page **pages;
> > > > > +		dma_addr_t *dma_addr;
> > > > > +	};
> > > > > +	void *vram_allocation;
> > > > > +	u16 order;
> > > > > +	struct {
> > > > > +		/* All flags below must be set upon creation */
> > > > > +		u16 migrate_vram : 1;
> > > > > +		/* All flags below must be set / cleared under notifier lock */
> > > > > +		u16 unmapped : 1;
> > > > > +		u16 partial_unmap : 1;
> > > > > +		u16 has_vram_pages : 1;
> > > > > +		u16 has_dma_mapping : 1;
> > > > > +		u16 kfree_mapping : 1;
> > > > > +	} flags;
> > > > > +};
> > > > > +
> > > > > +/**
> > > > > + * struct drm_gpusvm - GPU SVM structure
> > > > > + *
> > > > > + * @name: Name of the GPU SVM
> > > > > + * @drm: Pointer to the DRM device structure
> > > > > + * @mm: Pointer to the mm_struct for the address space
> > > > > + * @device_private_page_owner: Device private pages owner
> > > > > + * @mm_start: Start address of GPU SVM
> > > > > + * @mm_range: Range of the GPU SVM
> > > > > + * @notifier_size: Size of individual notifiers
> > > > > + * @ops: Pointer to the operations structure for GPU SVM
> > > > > + * @chunk_sizes: Pointer to the array of chunk sizes used in range allocation.
> > > > > + *               Entries should be powers of 2 in descending order.
> > > > > + * @num_chunks: Number of chunks
> > > > > + * @notifier_lock: Read-write semaphore for protecting notifier operations
> > > > > + * @zdd_wq: Workqueue for deferred work on zdd destruction
> > > > > + * @root: Cached root node of the Red-Black tree containing GPU SVM notifiers
> > > > > + * @notifier_list: List head of notifiers in the same order they appear in
> > > > > + *                 the interval tree. This is useful to keep iterating
> > > > > + *                 notifiers while doing modifications to the RB tree.
> > > > > + *
> > > > > + * This structure represents a GPU SVM (Shared Virtual Memory) used for tracking
> > > > > + * memory ranges mapped in a DRM (Direct Rendering Manager) device.
> > > > > + *
> > > > > + * No reference counting is provided, as this is expected to be embedded in the
> > > > > + * driver VM structure along with the struct drm_gpuvm, which handles reference
> > > > > + * counting.
> > > > > + */
> > > > > +struct drm_gpusvm {
> > > > > +	const char *name;
> > > > > +	struct drm_device *drm;
> > > > > +	struct mm_struct *mm;
> > > > > +	void *device_private_page_owner;
> > > > > +	u64 mm_start;
> > > > > +	u64 mm_range;
> > > > > +	u64 notifier_size;
> > > > > +	const struct drm_gpusvm_ops *ops;
> > > > > +	const u64 *chunk_sizes;
> > > > > +	int num_chunks;
> > > > > +	struct rw_semaphore notifier_lock;
> > > > > +	struct workqueue_struct *zdd_wq;
> > > > > +	struct rb_root_cached root;
> > > > > +	struct list_head notifier_list;
> > > > > +};
> > > > > +
> > > > > +/**
> > > > > + * struct drm_gpusvm_ctx - DRM GPU SVM context
> > > > > + *
> > > > > + * @mmap_locked: mmap lock is locked
> > > > > + * @trylock_mmap: trylock mmap lock, used to avoid locking inversions
> > > > > + *                (e.g. dma-resv -> mmap lock)
> > > > > + * @in_notifier: entering from an MMU notifier
> > > > > + * @read_only: operating on read-only memory
> > > > > + * @vram_possible: possible to use VRAM
> > > > > + * @prefault: prefault pages
> > > > > + *
> > > > > + * Context that DRM GPUSVM is operating in (i.e. user arguments).
> > > > > + */
> > > > > +struct drm_gpusvm_ctx {
> > > > > +	u32 mmap_locked :1;
> > > > > +	u32 trylock_mmap :1;
> > > > > +	u32 in_notifier :1;
> > > > > +	u32 read_only :1;
> > > > > +	u32 vram_possible :1;
> > > > > +	u32 prefault :1;
> > > > > +};
> > > > > +
> > > > > +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> > > > > +		    const char *name, struct drm_device *drm,
> > > > > +		    struct mm_struct *mm, void *device_private_page_owner,
> > > > > +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> > > > > +		    const struct drm_gpusvm_ops *ops,
> > > > > +		    const u64 *chunk_sizes, int num_chunks);
> > > > > +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm);
> > > > > +void drm_gpusvm_free(struct drm_gpusvm *gpusvm);
> > > > > +
> > > > > +struct drm_gpusvm_range *
> > > > > +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64 fault_addr,
> > > > > +				u64 gpuva_start, u64 gpuva_end,
> > > > > +				const struct drm_gpusvm_ctx *ctx);
> > > > > +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> > > > > +			     struct drm_gpusvm_range *range);
> > > > > +
> > > > > +struct drm_gpusvm_range *
> > > > > +drm_gpusvm_range_get(struct drm_gpusvm_range *range);
> > > > > +void drm_gpusvm_range_put(struct drm_gpusvm_range *range);
> > > > > +
> > > > > +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> > > > > +				  struct drm_gpusvm_range *range);
> > > > > +
> > > > > +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> > > > > +			       struct drm_gpusvm_range *range,
> > > > > +			       const struct drm_gpusvm_ctx *ctx);
> > > > > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > > > > +				  struct drm_gpusvm_range *range,
> > > > > +				  const struct drm_gpusvm_ctx *ctx);
> > > > > +
> > > > > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > > > > +			       struct drm_gpusvm_range *range,
> > > > > +			       void *vram_allocation,
> > > > > +			       const struct drm_gpusvm_ctx *ctx);
> > > > > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > > > +			       struct drm_gpusvm_range *range,
> > > > > +			       const struct drm_gpusvm_ctx *ctx);
> > > > > +
> > > > > +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void);
> > > > > +
> > > > > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start, u64 end);
> > > > > +
> > > > > +struct drm_gpusvm_range *
> > > > > +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64 start, u64 end);
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_notifier_lock - Lock GPU SVM notifier
> > > > > + * @gpusvm__: Pointer to the GPU SVM structure.
> > > > > + *
> > > > > + * Abstracts client usage of the GPU SVM notifier lock; takes the lock.
> > > > > + */
> > > > > +#define drm_gpusvm_notifier_lock(gpusvm__)	\
> > > > > +	down_read(&(gpusvm__)->notifier_lock)
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_notifier_unlock - Unlock GPU SVM notifier
> > > > > + * @gpusvm__: Pointer to the GPU SVM structure.
> > > > > + *
> > > > > + * Abstracts client usage of the GPU SVM notifier lock; drops the lock.
> > > > > + */
> > > > > +#define drm_gpusvm_notifier_unlock(gpusvm__)	\
> > > > > +	up_read(&(gpusvm__)->notifier_lock)
> > > > > +
> > > > > +/**
> > > > > + * __drm_gpusvm_range_next - Get the next GPU SVM range in the list
> > > > > + * @range: a pointer to the current GPU SVM range
> > > > > + *
> > > > > + * Return: A pointer to the next drm_gpusvm_range if available, or NULL if the
> > > > > + *         current range is the last one or if the input range is NULL.
> > > > > + */
> > > > > +static inline struct drm_gpusvm_range *
> > > > > +__drm_gpusvm_range_next(struct drm_gpusvm_range *range)
> > > > > +{
> > > > > +	if (range && !list_is_last(&range->rb.entry,
> > > > > +				   &range->notifier->range_list))
> > > > > +		return list_next_entry(range, rb.entry);
> > > > > +
> > > > > +	return NULL;
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_for_each_range - Iterate over GPU SVM ranges in a notifier
> > > > > + * @range__: Iterator variable for the ranges. If set, it indicates the start of
> > > > > + *	     the iterator. If NULL, call drm_gpusvm_range_find() to get the range.
> > > > > + * @notifier__: Pointer to the GPU SVM notifier
> > > > > + * @start__: Start address of the range
> > > > > + * @end__: End address of the range
> > > > > + *
> > > > > + * This macro is used to iterate over GPU SVM ranges in a notifier. It is safe
> > > > > + * to use while holding the driver SVM lock or the notifier lock.
> > > > > + */
> > > > > +#define drm_gpusvm_for_each_range(range__, notifier__, start__, end__)	\
> > > > > +	for ((range__) = (range__) ?:					\
> > > > > +	     drm_gpusvm_range_find((notifier__), (start__), (end__));	\
> > > > > +	     (range__) && (range__->va.start < (end__));		\
> > > > > +	     (range__) = __drm_gpusvm_range_next(range__))
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_range_set_unmapped - Mark a GPU SVM range as unmapped
> > > > > + * @range: Pointer to the GPU SVM range structure.
> > > > > + * @mmu_range: Pointer to the MMU notifier range structure.
> > > > > + *
> > > > > + * This function marks a GPU SVM range as unmapped and sets the partial_unmap flag
> > > > > + * if the range partially falls within the provided MMU notifier range.
> > > > > + */
> > > > > +static inline void
> > > > > +drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range *range,
> > > > > +			      const struct mmu_notifier_range *mmu_range)
> > > > > +{
> > > > > +	lockdep_assert_held_write(&range->gpusvm->notifier_lock);
> > > > > +
> > > > > +	range->flags.unmapped = true;
> > > > > +	if (range->va.start < mmu_range->start ||
> > > > > +	    range->va.end > mmu_range->end)
> > > > > +		range->flags.partial_unmap = true;
> > > > > +}
> > > > > +
> > > > > +#endif /* __DRM_GPUSVM_H__ */
> > > > > -- 
> > > > > 2.34.1
> > > > > 
> > > 
> 
> -- 
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
Matthew Brost Aug. 29, 2024, 4:49 p.m. UTC | #10
On Wed, Aug 28, 2024 at 08:50:02PM +0200, Daniel Vetter wrote:
> On Tue, Aug 27, 2024 at 07:48:38PM -0700, Matthew Brost wrote:
> > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       const struct drm_gpusvm_ctx *ctx)
> > +{
> > +	u64 start = range->va.start, end = range->va.end;
> > +	struct mm_struct *mm = gpusvm->mm;
> > +	struct vm_area_struct *vas;
> > +	int err;
> > +	bool retry = false;
> > +
> > +	if (!ctx->mmap_locked) {
> > +		if (!mmget_not_zero(mm)) {
> > +			err = -EFAULT;
> > +			goto err_out;
> > +		}
> > +		if (ctx->trylock_mmap) {
> > +			if (!mmap_read_trylock(mm))  {
> > +				err = drm_gpusvm_evict_to_sram(gpusvm, range);
> > +				goto err_mmput;
> > +			}
> > +		} else {
> > +			mmap_read_lock(mm);
> > +		}
> > +	}
> > +
> > +	mmap_assert_locked(mm);
> > +
> > +	/*
> > +	 * Loop required to find all VMA area structs for the corner case when
> > +	 * VRAM backing has been partially unmapped from MM's address space.
> > +	 */
> > +again:
> > +	vas = find_vma(mm, start);
> 
> So a hilarious case that amdkfd gets a bit better but still not entirely
> is that the original vma might be entirely gone. Even when you can still get
> at the mm of that process. This happens with cow (or shared too I think)
> mappings in forked child processes, or also if you play fun mremap games.
> 
> I think that outside of the ->migrate_to_ram callback migration/eviction
> to sram cannot assume there's any reasonable vma around and has to
> unconditionally go with the drm_gpusvm_evict_to_sram path.
> 

See my response here [1]. Let me drop the whole trylock thing and
convert to an 'evict' flag which calls drm_gpusvm_evict_to_sram in
places where Xe needs to evict VRAM. Or maybe just export that function
and call it directly (rough sketch below). That way the only place the VMA
is looked up for VRAM -> SRAM is upon CPU page fault.

[1] https://patchwork.freedesktop.org/patch/610955/?series=137870&rev=1#comment_1111164
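
Roughly what I have in mind (untested sketch; xe_svm_evict_range is a
made-up wrapper name, drm_gpusvm_evict_to_sram is the existing helper in
this patch, just exported):

	/* Export the existing helper so Xe can evict without a VMA lookup */
	int drm_gpusvm_evict_to_sram(struct drm_gpusvm *gpusvm,
				     struct drm_gpusvm_range *range);

	/* Xe eviction path: no mmap lock and no find_vma() required */
	static int xe_svm_evict_range(struct drm_gpusvm *gpusvm,
				      struct drm_gpusvm_range *range)
	{
		return drm_gpusvm_evict_to_sram(gpusvm, range);
	}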

> Also in the migrate_to_ram case the vma is essentially nothing more than
> informational about which ranges we might need if we prefault a bit (in
> case the child changed the vma compared to the original one). So it's good
> as a parameter for migrate_vma_setup, but absolutely nothing else.
> 
> amdkfd almost gets this right by being entirely based on their svm_range
> structures, except they still have the lingering check that the original mm
> is still alive. Of course you cannot ever use that memory on the gpu
> anymore, but the child process could get very pissed if their memory is
> suddenly gone. Also the eviction code has the same issue as yours and
> limits itself to vma that still exist in the original mm, leaving anything
> that's orphaned in children or remaps stuck in vram. At least that's my
> understanding, I might very well be wrong.
> 
> So probably want a bunch of these testcases too to make sure that all
> works, and we're not stuck with memory allocations in vram that we can't
> move out.

When writing some additional test cases, let me add hooks in my IGTs to
be able to verify we are not orphaning VRAM too.

Matt

> -Sima
> -- 
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
Matthew Brost Aug. 29, 2024, 5:27 p.m. UTC | #11
On Thu, Aug 29, 2024 at 11:45:08AM +0200, Daniel Vetter wrote:
> On Tue, Aug 27, 2024 at 07:48:38PM -0700, Matthew Brost wrote:
> > This patch introduces support for GPU Shared Virtual Memory (SVM) in the
> > Direct Rendering Manager (DRM) subsystem. SVM allows for seamless
> > sharing of memory between the CPU and GPU, enhancing performance and
> > flexibility in GPU computing tasks.
> > 
> > The patch adds the necessary infrastructure for SVM, including data
> > structures and functions for managing SVM ranges and notifiers. It also
> > provides mechanisms for allocating, deallocating, and migrating memory
> > regions between system RAM and GPU VRAM.
> > 
> > This mid-layer is largely inspired by GPUVM.
> > 
> > Cc: Dave Airlie <airlied@redhat.com>
> > Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > Cc: Christian König <christian.koenig@amd.com>
> > Cc: <dri-devel@lists.freedesktop.org>
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> 
> Still not sure I've got the right race that you paper over with
> mmap_write_lock, but I spotted a few things, comments inline.
> 

I've replied to this issue several times, let's table the
mmap_write_lock issue in this reply - a lot of other things to get
through. Current thinking is to try to add a range->migrate_lock like AMD,
which I state here [1]; rough sketch below. Let's continue discussing the
mmap lock issue
there if possible.

[1] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1111169
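
Rough idea of the migrate_lock direction (untested sketch; the field name
and exact placement are placeholders, not final API):

	/* New member in struct drm_gpusvm_range, serializing VRAM <-> SRAM
	 * migration for the range */
	struct mutex migrate_lock;

	/* GPU fault path */
	mutex_lock(&range->migrate_lock);
	err = drm_gpusvm_migrate_to_vram(gpusvm, range, vram_allocation, &ctx);
	mutex_unlock(&range->migrate_lock);

	/* CPU fault / eviction paths would take the same lock around the
	 * migration to SRAM, so a fault racing with a GPU-side migration
	 * can't observe the range mid-migration - which is what the
	 * mmap_write_lock was papering over. */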

> > ---
> >  drivers/gpu/drm/xe/Makefile     |    3 +-
> >  drivers/gpu/drm/xe/drm_gpusvm.c | 2174 +++++++++++++++++++++++++++++++
> >  drivers/gpu/drm/xe/drm_gpusvm.h |  415 ++++++
> >  3 files changed, 2591 insertions(+), 1 deletion(-)
> >  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.c
> >  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.h
> > 
> > diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
> > index b9670ae09a9e..b8fc2ee58f1a 100644
> > --- a/drivers/gpu/drm/xe/Makefile
> > +++ b/drivers/gpu/drm/xe/Makefile
> > @@ -25,7 +25,8 @@ $(obj)/generated/%_wa_oob.c $(obj)/generated/%_wa_oob.h: $(obj)/xe_gen_wa_oob \
> >  
> >  # core driver code
> >  
> > -xe-y += xe_bb.o \
> > +xe-y += drm_gpusvm.o \
> > +	xe_bb.o \
> >  	xe_bo.o \
> >  	xe_bo_evict.o \
> >  	xe_devcoredump.o \
> > diff --git a/drivers/gpu/drm/xe/drm_gpusvm.c b/drivers/gpu/drm/xe/drm_gpusvm.c
> > new file mode 100644
> > index 000000000000..fc1e44e6ae72
> > --- /dev/null
> > +++ b/drivers/gpu/drm/xe/drm_gpusvm.c
> > @@ -0,0 +1,2174 @@
> > +// SPDX-License-Identifier: MIT
> > +/*
> > + * Copyright © 2024 Intel Corporation
> > + *
> > + * Authors:
> > + *     Matthew Brost <matthew.brost@intel.com>
> > + */
> > +
> > +#include <linux/dma-mapping.h>
> > +#include <linux/interval_tree_generic.h>
> > +#include <linux/hmm.h>
> > +#include <linux/memremap.h>
> > +#include <linux/migrate.h>
> > +#include <linux/mm_types.h>
> > +#include <linux/pagemap.h>
> > +#include <linux/slab.h>
> > +
> > +#include <drm/drm_device.h>
> > +#include "drm_gpusvm.h"
> > +
> > +/**
> > + * DOC: Overview
> > + *
> > + * GPU Shared Virtual Memory (GPU SVM) layer for the Direct Rendering Manager (DRM)
> > + *
> > + * The GPU SVM layer is a component of the DRM framework designed to manage shared
> > + * virtual memory between the CPU and GPU. It enables efficient data exchange and
> > + * processing for GPU-accelerated applications by allowing memory sharing and
> > + * synchronization between the CPU's and GPU's virtual address spaces.
> > + *
> > + * Key GPU SVM Components:
> > + * - Notifiers: Used for tracking memory intervals and notifying the
> > + *		GPU of changes, notifiers are sized based on a GPU SVM
> > + *		initialization parameter, with a recommendation of 512M or
> > + *		larger. They maintain a Red-Black tree and a list of ranges that
> > + *		fall within the notifier interval. Notifiers are tracked within
> > + *		a GPU SVM Red-Black tree and list and are dynamically inserted
> > + *		or removed as ranges within the interval are created or
> > + *		destroyed.
> > + * - Ranges: Represent memory ranges mapped in a DRM device and managed
> > + *	     by GPU SVM. They are sized based on an array of chunk sizes, which
> > + *	     is a GPU SVM initialization parameter, and the CPU address space.
> > + *	     Upon GPU fault, the largest aligned chunk that fits within the
> > + *	     faulting CPU address space is chosen for the range size. Ranges are
> > + *	     expected to be dynamically allocated on GPU fault and removed on an
> > + *	     MMU notifier UNMAP event. As mentioned above, ranges are tracked in
> > + *	     a notifier's Red-Black tree.
> > + * - Operations: Define the interface for driver-specific SVM operations such as
> > + *		 allocation, page collection, migration, invalidations, and VRAM
> > + *		 release.
> > + *
> > + * This layer provides interfaces for allocating, mapping, migrating, and
> > + * releasing memory ranges between the CPU and GPU. It handles all core memory
> > + * management interactions (DMA mapping, HMM, and migration) and provides
> > + * driver-specific virtual functions (vfuncs). This infrastructure is sufficient
> > + * to build the expected driver components for an SVM implementation as detailed
> > + * below.
> > + *
> > + * Expected Driver Components:
> > + * - GPU page fault handler: Used to create ranges and notifiers based on the
> > + *			     fault address, optionally migrate the range to
> > + *			     VRAM, and create GPU bindings.
> > + * - Garbage collector: Used to destroy GPU bindings for ranges. Ranges are
> > + *			expected to be added to the garbage collector upon
> > + *			MMU_NOTIFY_UNMAP event.
> > + */
> > +
> > +/**
> > + * DOC: Locking
> > + *
> > + * GPU SVM handles locking for core MM interactions, i.e., it locks/unlocks the
> > + * mmap lock as needed. Alternatively, if the driver prefers to handle the mmap
> > + * lock itself, a 'locked' argument is provided to the functions that require
> > + * the mmap lock. This option may be useful for drivers that need to call into
> > + * GPU SVM while also holding a dma-resv lock, thus preventing locking
> > + * inversions between the mmap and dma-resv locks.
> > + *
> > + * GPU SVM introduces a global notifier lock, which safeguards the notifier's
> > + * range RB tree and list, as well as the range's DMA mappings and sequence
> > + * number. GPU SVM manages all necessary locking and unlocking operations,
> > + * except for the recheck of the range's sequence number
> > + * (mmu_interval_read_retry) when the driver is committing GPU bindings. This
> > + * lock corresponds to the 'driver->update' lock mentioned in the HMM
> > + * documentation (TODO: Link). Future revisions may transition from a GPU SVM
> > + * global lock to a per-notifier lock if finer-grained locking is deemed
> > + * necessary.
> > + *
> > + * In addition to the locking mentioned above, the driver should implement a
> > + * lock to safeguard core GPU SVM function calls that modify state, such as
> > + * drm_gpusvm_range_find_or_insert and drm_gpusvm_range_remove. Alternatively,
> > + * these core functions can be called within a single kernel thread, for
> > + * instance, using an ordered work queue. This lock is denoted as
> > + * 'driver_svm_lock' in code examples.
> 
> I think this doesn't work, because essentially it forces a single threaded
> design. Core mm isn't single threaded, and you cannot lock them all out,
> at least not easily.
> 
> So I think a design requirement is that gpusvm can cope with migrations to
> ram due to cpu faults, migrations for other reasons, and gpu fault handling,
> all concurrently. Currently with the combo of driver_svm_lock + taking
> mmap_write_lock you serialize this all a lot, which I think is hiding
> design bugs.

See above, mmap_write_lock is wrong; I will work on other solutions.
driver_svm_lock is a per GPUSVM lock which in Xe maps to an existing per
GPUVM lock. All of Xe's binding code requires this lock. This is only
taken in the path of GPU faults, so only 1 GPU fault per VM can
be serviced at a time. Agreed, cpu faults and migrations for other reasons
can happen in parallel with a GPU fault. Once we drop the mmap write
lock hack, this can freely happen.

> 
> > + */
> > +
> > +/**
> > + * DOC: Migration
> > + *
> > + * The migration support is quite simple, allowing migration between SRAM and
> > + * VRAM at the range granularity. For example, GPU SVM currently does not
> > + * support mixing SRAM and VRAM pages within a range. This means that upon GPU
> > + * fault, the entire range can be migrated to VRAM, and upon CPU fault, the
> > + * entire range is migrated to SRAM.
> > + *
> > + * The reasoning for only supporting range granularity is as follows: it
> > + * simplifies the implementation, and range sizes are driver-defined and should
> > + * be relatively small.
> > + */
> > +
> > +/**
> > + * DOC: Partial Unmapping of Ranges
> > + *
> > + * Partial unmapping of ranges (e.g., 1M out of 2M is unmapped by CPU resulting
> > + * in MMU_NOTIFY_UNMAP event) presents several challenges, with the main one
> > + * being that a subset of the range still has CPU and GPU mappings. If the
> > + * backing store for the range is in VRAM, a subset of the backing store has
> > + * references. One option would be to split the range and VRAM backing store,
> > + * but the implementation for this would be quite complicated. Given that
> > + * partial unmappings are rare and driver-defined range sizes are relatively
> > + * small, GPU SVM does not support splitting of ranges.
> > + *
> > + * With no support for range splitting, upon partial unmapping of a range, the
> > + * driver is expected to invalidate and destroy the entire range. If the range
> > + * has VRAM as its backing, the driver is also expected to migrate any remaining
> > + * pages back to SRAM.
> > + */
> > +
> > +/**
> > + * DOC: Examples
> > + *
> > + * This section provides two examples of how to build the expected driver
> > + * components: the GPU page fault handler and the garbage collector. A third
> > + * example demonstrates a sample invalidation driver vfunc.
> > + *
> > + * The generic code provided does not include logic for complex migration
> > + * policies, optimized invalidations, or other potentially required driver
> > + * locking (e.g., DMA-resv locks).
> > + *
> > + * 1) GPU page fault handler
> > + *
> > + *	int driver_bind_range(struct drm_gpusvm *gpusvm, struct drm_gpusvm_range *range)
> > + *	{
> > + *		int err = 0;
> > + *
> > + *		driver_alloc_and_setup_memory_for_bind(gpusvm, range);
> > + *
> > + *		drm_gpusvm_notifier_lock(gpusvm);
> > + *		if (drm_gpusvm_range_pages_valid(range))
> > + *			driver_commit_bind(gpusvm, range);
> > + *		else
> > + *			err = -EAGAIN;
> > + *		drm_gpusvm_notifier_unlock(gpusvm);
> > + *
> > + *		return err;
> > + *	}
> > + *
> > + *	int driver_gpu_fault(struct drm_gpusvm *gpusvm, u64 fault_addr,
> > + *			     u64 gpuva_start, u64 gpuva_end)
> > + *	{
> > + *		struct drm_gpusvm_ctx ctx = {};
> > + *		int err;
> > + *
> > + *		driver_svm_lock();
> > + *	retry:
> > + *		// Always process UNMAPs first so view of GPU SVM ranges is current
> > + *		driver_garbage_collector(gpusvm);
> > + *
> > + *		range = drm_gpusvm_range_find_or_insert(gpusvm, fault_addr,
> > + *							gpuva_start, gpuva_end,
> > + *						        &ctx);
> > + *		if (IS_ERR(range)) {
> > + *			err = PTR_ERR(range);
> > + *			goto unlock;
> > + *		}
> > + *
> > + *		if (driver_migration_policy(range)) {
> > + *			bo = driver_alloc_bo();
> > + *			err = drm_gpusvm_migrate_to_vram(gpusvm, range, bo, &ctx);
> > + *			if (err)	// CPU mappings may have changed
> > + *				goto retry;
> > + *		}
> > + *
> > + *		err = drm_gpusvm_range_get_pages(gpusvm, range, &ctx);
> > + *		if (err == -EFAULT || err == -EPERM)	// CPU mappings changed
> > + *			goto retry;
> > + *		else if (err)
> > + *			goto unlock;
> > + *
> > + *		err = driver_bind_range(gpusvm, range);
> > + *		if (err == -EAGAIN)	// CPU mappings changed
> > + *			goto retry
> > + *
> > + *	unlock:
> > + *		driver_svm_unlock();
> > + *		return err;
> > + *	}
> > + *
> > + * 2) Garbage Collector.
> > + *
> > + *	void __driver_garbage_collector(struct drm_gpusvm *gpusvm,
> > + *					struct drm_gpusvm_range *range)
> > + *	{
> > + *		struct drm_gpusvm_ctx ctx = {};
> > + *
> > + *		assert_driver_svm_locked(gpusvm);
> > + *
> > + *		// Partial unmap, migrate any remaining VRAM pages back to SRAM
> > + *		if (range->flags.partial_unmap)
> > + *			drm_gpusvm_migrate_to_sram(gpusvm, range, &ctx);
> 
> Note that the migration back to sram isn't guaranteed to succeed, so you
> might still be stuck with a partially migrated range. This might be a case
> where hmm gives you vram pfns, but the range you have doesn't have any
> vram allocation anymore because you dropped it here. Not sure tbh.
>

HMM isn't in the picture here, nor is a VMA, once the
drm_gpusvm_evict_to_sram path is always taken as discussed here [2]. I
might have a corner case BO refcounting / TTM resource lookup bug
somewhere in here which needs to be resolved though (e.g. eviction
racing with this code path); will try to close on that.

[2] https://patchwork.freedesktop.org/patch/610955/?series=137870&rev=1#comment_1111164
 
> > + *
> > + *		driver_unbind_range(range);
> > + *		drm_gpusvm_range_remove(gpusvm, range);
> > + *	}
> > + *
> > + *	void driver_garbage_collector(struct drm_gpusvm *gpusvm)
> > + *	{
> > + *		assert_driver_svm_locked(gpusvm);
> > + *
> > + *		for_each_range_in_garbage_collector(gpusvm, range)
> > + *			__driver_garbage_collector(gpusvm, range);
> > + *	}
> > + *
> > + * 3) Invalidation driver vfunc.
> > + *
> > + *	void driver_invalidation(struct drm_gpusvm *gpusvm,
> > + *				 struct drm_gpusvm_notifier *notifier,
> > + *				 const struct mmu_notifier_range *mmu_range)
> > + *	{
> > + *		struct drm_gpusvm_ctx ctx = { .in_notifier = true, };
> > + *		struct drm_gpusvm_range *range = NULL;
> > + *
> > + *		driver_invalidate_device_tlb(gpusvm, mmu_range->start, mmu_range->end);
> > + *
> > + *		drm_gpusvm_for_each_range(range, notifier, mmu_range->start,
> > + *					  mmu_range->end) {
> > + *			drm_gpusvm_range_unmap_pages(gpusvm, range, &ctx);
> > + *
> > + *			if (mmu_range->event != MMU_NOTIFY_UNMAP)
> > + *				continue;
> > + *
> > + *			drm_gpusvm_range_set_unmapped(range, mmu_range);
> > + *			driver_garbage_collector_add(gpusvm, range);
> > + *		}
> > + *	}
> > + */
> > +
> > +#define DRM_GPUSVM_RANGE_START(_range)	((_range)->va.start)
> > +#define DRM_GPUSVM_RANGE_END(_range)	((_range)->va.end - 1)
> > +INTERVAL_TREE_DEFINE(struct drm_gpusvm_range, rb.node, u64, rb.__subtree_last,
> > +		     DRM_GPUSVM_RANGE_START, DRM_GPUSVM_RANGE_END,
> > +		     static __maybe_unused, range);
> > +
> > +#define DRM_GPUSVM_NOTIFIER_START(_notifier)	((_notifier)->interval.start)
> > +#define DRM_GPUSVM_NOTIFIER_END(_notifier)	((_notifier)->interval.end - 1)
> > +INTERVAL_TREE_DEFINE(struct drm_gpusvm_notifier, rb.node, u64,
> > +		     rb.__subtree_last, DRM_GPUSVM_NOTIFIER_START,
> > +		     DRM_GPUSVM_NOTIFIER_END, static __maybe_unused, notifier);
> > +
> > +/**
> > + * npages_in_range() - Calculate the number of pages in a given range
> > + * @start__: The start address of the range
> > + * @end__: The end address of the range
> > + *
> > + * This macro calculates the number of pages in a given memory range,
> > + * specified by the start and end addresses. It divides the difference
> > + * between the end and start addresses by the page size (PAGE_SIZE) to
> > + * determine the number of pages in the range.
> > + *
> > + * Return: The number of pages in the specified range.
> > + */
> > +#define npages_in_range(start__, end__)	\
> > +	(((end__) - (start__)) >> PAGE_SHIFT)
> > +
> > +/**
> > + * struct drm_gpusvm_zdd - GPU SVM zone device data
> > + *
> > + * @refcount: Reference count for the zdd
> > + * @destroy_work: Work structure for asynchronous zdd destruction
> > + * @range: Pointer to the GPU SVM range
> > + * @vram_allocation: Driver-private pointer to the VRAM allocation
> > + *
> > + * This structure serves as a generic wrapper installed in
> > + * page->zone_device_data. It provides infrastructure for looking up a range
> > + * upon CPU page fault and asynchronously releasing VRAM once the CPU has no
> > + * page references. Asynchronous release is useful because CPU page references
> > + * can be dropped in IRQ contexts, while releasing VRAM likely requires sleeping
> > + * locks.
> > + */
> > +struct drm_gpusvm_zdd {
> > +	struct kref refcount;
> > +	struct work_struct destroy_work;
> > +	struct drm_gpusvm_range *range;
> > +	void *vram_allocation;
> > +};
> > +
> > +/**
> > + * drm_gpusvm_zdd_destroy_work_func - Work function for destroying a zdd
> > + * @w: Pointer to the work_struct
> > + *
> > + * This function releases VRAM, puts GPU SVM range, and frees zdd.
> > + */
> > +static void drm_gpusvm_zdd_destroy_work_func(struct work_struct *w)
> > +{
> > +	struct drm_gpusvm_zdd *zdd =
> > +		container_of(w, struct drm_gpusvm_zdd, destroy_work);
> > +	struct drm_gpusvm_range *range = zdd->range;
> > +	struct drm_gpusvm *gpusvm = range->gpusvm;
> > +
> > +	if (gpusvm->ops->vram_release && zdd->vram_allocation)
> > +		gpusvm->ops->vram_release(zdd->vram_allocation);
> > +	drm_gpusvm_range_put(range);
> > +	kfree(zdd);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_zdd_alloc - Allocate a zdd structure.
> > + * @range: Pointer to the GPU SVM range.
> > + *
> > + * This function allocates and initializes a new zdd structure. It sets up the
> > + * reference count, initializes the destroy work, and links the provided GPU SVM
> > + * range.
> > + *
> > + * Returns:
> > + * Pointer to the allocated zdd on success, NULL on failure.
> > + */
> > +static struct drm_gpusvm_zdd *
> > +drm_gpusvm_zdd_alloc(struct drm_gpusvm_range *range)
> > +{
> > +	struct drm_gpusvm_zdd *zdd;
> > +
> > +	zdd = kmalloc(sizeof(*zdd), GFP_KERNEL);
> > +	if (!zdd)
> > +		return NULL;
> > +
> > +	kref_init(&zdd->refcount);
> > +	INIT_WORK(&zdd->destroy_work, drm_gpusvm_zdd_destroy_work_func);
> > +	zdd->range = drm_gpusvm_range_get(range);
> > +	zdd->vram_allocation = NULL;
> > +
> > +	return zdd;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_zdd_get - Get a reference to a zdd structure.
> > + * @zdd: Pointer to the zdd structure.
> > + *
> > + * This function increments the reference count of the provided zdd structure.
> > + *
> > + * Returns: Pointer to the zdd structure.
> > + */
> > +static struct drm_gpusvm_zdd *drm_gpusvm_zdd_get(struct drm_gpusvm_zdd *zdd)
> > +{
> > +	kref_get(&zdd->refcount);
> > +	return zdd;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_zdd_destroy - Destroy a zdd structure.
> > + * @ref: Pointer to the reference count structure.
> > + *
> > + * This function queues the destroy_work of the zdd for asynchronous destruction.
> > + */
> > +static void drm_gpusvm_zdd_destroy(struct kref *ref)
> > +{
> > +	struct drm_gpusvm_zdd *zdd =
> > +		container_of(ref, struct drm_gpusvm_zdd, refcount);
> > +	struct drm_gpusvm *gpusvm = zdd->range->gpusvm;
> > +
> > +	queue_work(gpusvm->zdd_wq, &zdd->destroy_work);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_zdd_put - Put a zdd reference.
> > + * @zdd: Pointer to the zdd structure.
> > + *
> > + * This function decrements the reference count of the provided zdd structure
> > + * and schedules its destruction if the count drops to zero.
> > + */
> > +static void drm_gpusvm_zdd_put(struct drm_gpusvm_zdd *zdd)
> > +{
> > +	kref_put(&zdd->refcount, drm_gpusvm_zdd_destroy);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_find - Find GPU SVM range from GPU SVM notifier
> > + * @notifier: Pointer to the GPU SVM notifier structure.
> > + * @start: Start address of the range
> > + * @end: End address of the range
> > + *
> > + * Return: A pointer to the drm_gpusvm_range if found or NULL
> > + */
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64 start, u64 end)
> > +{
> > +	return range_iter_first(&notifier->root, start, end - 1);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_for_each_range_safe - Safely iterate over GPU SVM ranges in a notifier
> > + * @range__: Iterator variable for the ranges
> > + * @next__: Iterator variable for the ranges' temporary storage
> > + * @notifier__: Pointer to the GPU SVM notifier
> > + * @start__: Start address of the range
> > + * @end__: End address of the range
> > + *
> > + * This macro is used to iterate over GPU SVM ranges in a notifier while
> > + * removing ranges from it.
> > + */
> > +#define drm_gpusvm_for_each_range_safe(range__, next__, notifier__, start__, end__)	\
> > +	for ((range__) = drm_gpusvm_range_find((notifier__), (start__), (end__)),	\
> > +	     (next__) = __drm_gpusvm_range_next(range__);				\
> > +	     (range__) && (range__->va.start < (end__));				\
> > +	     (range__) = (next__), (next__) = __drm_gpusvm_range_next(range__))
> > +
> > +/**
> > + * __drm_gpusvm_notifier_next - get the next drm_gpusvm_notifier in the list
> > + * @notifier: a pointer to the current drm_gpusvm_notifier
> > + *
> > + * Return: A pointer to the next drm_gpusvm_notifier if available, or NULL if
> > + *         the current notifier is the last one or if the input notifier is
> > + *         NULL.
> > + */
> > +static struct drm_gpusvm_notifier *
> > +__drm_gpusvm_notifier_next(struct drm_gpusvm_notifier *notifier)
> > +{
> > +	if (notifier && !list_is_last(&notifier->rb.entry,
> > +				      &notifier->gpusvm->notifier_list))
> > +		return list_next_entry(notifier, rb.entry);
> > +
> > +	return NULL;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_for_each_notifier - Iterate over GPU SVM notifiers in a gpusvm
> > + * @notifier__: Iterator variable for the notifiers
> > + * @gpusvm__: Pointer to the GPU SVM structure
> > + * @start__: Start address of the notifier
> > + * @end__: End address of the notifier
> > + *
> > + * This macro is used to iterate over GPU SVM notifiers in a gpusvm.
> > + */
> > +#define drm_gpusvm_for_each_notifier(notifier__, gpusvm__, start__, end__)		\
> > +	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root, (start__), (end__) - 1);	\
> > +	     (notifier__) && (notifier__->interval.start < (end__));			\
> > +	     (notifier__) = __drm_gpusvm_notifier_next(notifier__))
> > +
> > +/**
> > + * drm_gpusvm_for_each_notifier_safe - Safely iterate over GPU SVM notifiers in a gpusvm
> > + * @notifier__: Iterator variable for the notifiers
> > + * @next__: Iterator variable for the notifiers' temporary storage
> > + * @gpusvm__: Pointer to the GPU SVM structure
> > + * @start__: Start address of the notifier
> > + * @end__: End address of the notifier
> > + *
> > + * This macro is used to iterate over GPU SVM notifiers in a gpusvm while
> > + * removing notifiers from it.
> > + */
> > +#define drm_gpusvm_for_each_notifier_safe(notifier__, next__, gpusvm__, start__, end__)	\
> > +	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root, (start__), (end__) - 1),	\
> > +	     (next__) = __drm_gpusvm_notifier_next(notifier__);				\
> > +	     (notifier__) && (notifier__->interval.start < (end__));			\
> > +	     (notifier__) = (next__), (next__) = __drm_gpusvm_notifier_next(notifier__))
> > +
> > +/**
> > + * drm_gpusvm_notifier_invalidate - Invalidate a GPU SVM notifier.
> > + * @mni: Pointer to the mmu_interval_notifier structure.
> > + * @mmu_range: Pointer to the mmu_notifier_range structure.
> > + * @cur_seq: Current sequence number.
> > + *
> > + * This function serves as a generic MMU notifier for GPU SVM. It sets the MMU
> > + * notifier sequence number and calls the driver invalidate vfunc under
> > + * gpusvm->notifier_lock.
> > + *
> > + * Returns:
> > + * true if the operation succeeds, false otherwise.
> > + */
> > +static bool
> > +drm_gpusvm_notifier_invalidate(struct mmu_interval_notifier *mni,
> > +			       const struct mmu_notifier_range *mmu_range,
> > +			       unsigned long cur_seq)
> > +{
> > +	struct drm_gpusvm_notifier *notifier =
> > +		container_of(mni, typeof(*notifier), notifier);
> > +	struct drm_gpusvm *gpusvm = notifier->gpusvm;
> > +
> > +	if (!mmu_notifier_range_blockable(mmu_range))
> > +		return false;
> > +
> > +	down_write(&gpusvm->notifier_lock);
> > +	mmu_interval_set_seq(mni, cur_seq);
> > +	gpusvm->ops->invalidate(gpusvm, notifier, mmu_range);
> > +	up_write(&gpusvm->notifier_lock);
> > +
> > +	return true;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_notifier_ops - MMU interval notifier operations for GPU SVM
> > + */
> > +static const struct mmu_interval_notifier_ops drm_gpusvm_notifier_ops = {
> > +	.invalidate = drm_gpusvm_notifier_invalidate,
> > +};
> > +
> > +/**
> > + * drm_gpusvm_init - Initialize the GPU SVM.
> > + * @gpusvm: Pointer to the GPU SVM structure.
> > + * @name: Name of the GPU SVM.
> > + * @drm: Pointer to the DRM device structure.
> > + * @mm: Pointer to the mm_struct for the address space.
> > + * @device_private_page_owner: Device private pages owner.
> > + * @mm_start: Start address of GPU SVM.
> > + * @mm_range: Range of the GPU SVM.
> > + * @notifier_size: Size of individual notifiers.
> > + * @ops: Pointer to the operations structure for GPU SVM.
> > + * @chunk_sizes: Pointer to the array of chunk sizes used in range allocation.
> > + *               Entries should be powers of 2 in descending order with last
> > + *               entry being SZ_4K.
> > + * @num_chunks: Number of chunks.
> > + *
> > + * This function initializes the GPU SVM.
> > + *
> > + * Returns:
> > + * 0 on success, a negative error code on failure.
> > + */
> > +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> > +		    const char *name, struct drm_device *drm,
> > +		    struct mm_struct *mm, void *device_private_page_owner,
> > +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> > +		    const struct drm_gpusvm_ops *ops,
> > +		    const u64 *chunk_sizes, int num_chunks)
> > +{
> > +	if (!ops->invalidate || !num_chunks)
> > +		return -EINVAL;
> > +
> > +	gpusvm->name = name;
> > +	gpusvm->drm = drm;
> > +	gpusvm->mm = mm;
> > +	gpusvm->device_private_page_owner = device_private_page_owner;
> > +	gpusvm->mm_start = mm_start;
> > +	gpusvm->mm_range = mm_range;
> > +	gpusvm->notifier_size = notifier_size;
> > +	gpusvm->ops = ops;
> > +	gpusvm->chunk_sizes = chunk_sizes;
> > +	gpusvm->num_chunks = num_chunks;
> > +	gpusvm->zdd_wq = system_wq;
> > +
> > +	mmgrab(mm);
> > +	gpusvm->root = RB_ROOT_CACHED;
> > +	INIT_LIST_HEAD(&gpusvm->notifier_list);
> > +
> > +	init_rwsem(&gpusvm->notifier_lock);
> > +
> > +	fs_reclaim_acquire(GFP_KERNEL);
> > +	might_lock(&gpusvm->notifier_lock);
> > +	fs_reclaim_release(GFP_KERNEL);
> > +
> > +	return 0;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_notifier_find - Find GPU SVM notifier
> > + * @gpusvm__: Pointer to the GPU SVM structure
> > + * @fault_addr__: Fault address
> > + *
> > + * This macro finds the GPU SVM notifier associated with the fault address.
> > + *
> > + * Returns:
> > + * Pointer to the GPU SVM notifier on success, NULL otherwise.
> > + */
> > +#define drm_gpusvm_notifier_find(gpusvm__, fault_addr__)	\
> > +	notifier_iter_first(&(gpusvm__)->root, (fault_addr__),	\
> > +			    (fault_addr__ + 1))
> > +
> > +/**
> > + * to_drm_gpusvm_notifier - retrieve the container struct for a given rbtree node
> > + * @node__: a pointer to the rbtree node embedded within a drm_gpusvm_notifier struct
> > + *
> > + * Return: A pointer to the containing drm_gpusvm_notifier structure.
> > + */
> > +#define to_drm_gpusvm_notifier(node__)				\
> > +	container_of((node__), struct drm_gpusvm_notifier, rb.node)
> > +
> > +/**
> > + * drm_gpusvm_notifier_insert - Insert GPU SVM notifier
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + *
> > + * This function inserts the GPU SVM notifier into the GPU SVM RB tree and list.
> > + */
> > +static void drm_gpusvm_notifier_insert(struct drm_gpusvm *gpusvm,
> > +				       struct drm_gpusvm_notifier *notifier)
> > +{
> > +	struct rb_node *node;
> > +	struct list_head *head;
> > +
> > +	notifier_insert(notifier, &gpusvm->root);
> > +
> > +	node = rb_prev(&notifier->rb.node);
> > +	if (node)
> > +		head = &(to_drm_gpusvm_notifier(node))->rb.entry;
> > +	else
> > +		head = &gpusvm->notifier_list;
> > +
> > +	list_add(&notifier->rb.entry, head);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_notifier_remove - Remove GPU SVM notifier
> > + * @gpusvm__: Pointer to the GPU SVM structure
> > + * @notifier__: Pointer to the GPU SVM notifier structure
> > + *
> > + * This macro removes the GPU SVM notifier from the GPU SVM RB tree and list.
> > + */
> > +#define drm_gpusvm_notifier_remove(gpusvm__, notifier__)	\
> > +	notifier_remove((notifier__), &(gpusvm__)->root);	\
> > +	list_del(&(notifier__)->rb.entry)
> > +
> > +/**
> > + * drm_gpusvm_fini - Finalize the GPU SVM.
> > + * @gpusvm: Pointer to the GPU SVM structure.
> > + *
> > + * This function finalizes the GPU SVM by cleaning up any remaining ranges and
> > + * notifiers, and dropping a reference to struct MM.
> > + */
> > +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm)
> > +{
> > +	struct drm_gpusvm_notifier *notifier, *next;
> > +
> > +	drm_gpusvm_for_each_notifier_safe(notifier, next, gpusvm, 0, LONG_MAX) {
> > +		struct drm_gpusvm_range *range, *__next;
> > +
> > +		/*
> > +		 * Remove notifier first to avoid racing with any invalidation
> > +		 */
> > +		mmu_interval_notifier_remove(&notifier->notifier);
> > +		notifier->flags.removed = true;
> > +
> > +		drm_gpusvm_for_each_range_safe(range, __next, notifier, 0,
> > +					       LONG_MAX)
> > +			drm_gpusvm_range_remove(gpusvm, range);
> > +	}
> > +
> > +	mmdrop(gpusvm->mm);
> > +	WARN_ON(!RB_EMPTY_ROOT(&gpusvm->root.rb_root));
> > +}
> > +
> > +/**
> > + * drm_gpusvm_notifier_alloc - Allocate GPU SVM notifier
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @fault_addr: Fault address
> > + *
> > + * This function allocates and initializes the GPU SVM notifier structure.
> > + *
> > + * Returns:
> > + * Pointer to the allocated GPU SVM notifier on success, ERR_PTR() on failure.
> > + */
> > +static struct drm_gpusvm_notifier *
> > +drm_gpusvm_notifier_alloc(struct drm_gpusvm *gpusvm, u64 fault_addr)
> > +{
> > +	struct drm_gpusvm_notifier *notifier;
> > +
> > +	if (gpusvm->ops->notifier_alloc)
> > +		notifier = gpusvm->ops->notifier_alloc();
> > +	else
> > +		notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
> > +
> > +	if (!notifier)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	notifier->gpusvm = gpusvm;
> > +	notifier->interval.start = ALIGN_DOWN(fault_addr, gpusvm->notifier_size);
> > +	notifier->interval.end = ALIGN(fault_addr + 1, gpusvm->notifier_size);
> > +	INIT_LIST_HEAD(&notifier->rb.entry);
> > +	notifier->root = RB_ROOT_CACHED;
> > +	INIT_LIST_HEAD(&notifier->range_list);
> > +
> > +	return notifier;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_notifier_free - Free GPU SVM notifier
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + *
> > + * This function frees the GPU SVM notifier structure.
> > + */
> > +static void drm_gpusvm_notifier_free(struct drm_gpusvm *gpusvm,
> > +				     struct drm_gpusvm_notifier *notifier)
> > +{
> > +	WARN_ON(!RB_EMPTY_ROOT(&notifier->root.rb_root));
> > +
> > +	if (gpusvm->ops->notifier_free)
> > +		gpusvm->ops->notifier_free(notifier);
> > +	else
> > +		kfree(notifier);
> > +}
> > +
> > +/**
> > + * to_drm_gpusvm_range - retrieve the container struct for a given rbtree node
> > + * @node__: a pointer to the rbtree node embedded within a drm_gpusvm_range struct
> > + *
> > + * Return: A pointer to the containing drm_gpusvm_range structure.
> > + */
> > +#define to_drm_gpusvm_range(node__)	\
> > +	container_of((node__), struct drm_gpusvm_range, rb.node)
> > +
> > +/**
> > + * drm_gpusvm_range_insert - Insert GPU SVM range
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * This function inserts the GPU SVM range into the notifier RB tree and list.
> > + */
> > +static void drm_gpusvm_range_insert(struct drm_gpusvm_notifier *notifier,
> > +				    struct drm_gpusvm_range *range)
> > +{
> > +	struct rb_node *node;
> > +	struct list_head *head;
> > +
> > +	drm_gpusvm_notifier_lock(notifier->gpusvm);
> > +	range_insert(range, &notifier->root);
> > +
> > +	node = rb_prev(&range->rb.node);
> > +	if (node)
> > +		head = &(to_drm_gpusvm_range(node))->rb.entry;
> > +	else
> > +		head = &notifier->range_list;
> > +
> > +	list_add(&range->rb.entry, head);
> > +	drm_gpusvm_notifier_unlock(notifier->gpusvm);
> > +}
> > +
> > +/**
> > + * __drm_gpusvm_range_remove - Remove GPU SVM range
> > + * @notifier__: Pointer to the GPU SVM notifier structure
> > + * @range__: Pointer to the GPU SVM range structure
> > + *
> > + * This macro removes the GPU SVM range from the notifier RB tree and list.
> > + */
> > +#define __drm_gpusvm_range_remove(notifier__, range__)		\
> > +	range_remove((range__), &(notifier__)->root);		\
> > +	list_del(&(range__)->rb.entry)
> > +
> > +/**
> > + * drm_gpusvm_range_alloc - Allocate GPU SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + * @fault_addr: Fault address
> > + * @chunk_size: Chunk size
> > + * @migrate_vram: Flag indicating whether to migrate VRAM
> > + *
> > + * This function allocates and initializes the GPU SVM range structure.
> > + *
> > + * Returns:
> > + * Pointer to the allocated GPU SVM range on success, ERR_PTR() on failure.
> > + */
> > +static struct drm_gpusvm_range *
> > +drm_gpusvm_range_alloc(struct drm_gpusvm *gpusvm,
> > +		       struct drm_gpusvm_notifier *notifier,
> > +		       u64 fault_addr, u64 chunk_size, bool migrate_vram)
> > +{
> > +	struct drm_gpusvm_range *range;
> > +
> > +	if (gpusvm->ops->range_alloc)
> > +		range = gpusvm->ops->range_alloc(gpusvm);
> > +	else
> > +		range = kzalloc(sizeof(*range), GFP_KERNEL);
> > +
> > +	if (!range)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	kref_init(&range->refcount);
> > +	range->gpusvm = gpusvm;
> > +	range->notifier = notifier;
> > +	range->va.start = ALIGN_DOWN(fault_addr, chunk_size);
> > +	range->va.end = ALIGN(fault_addr + 1, chunk_size);
> > +	INIT_LIST_HEAD(&range->rb.entry);
> > +	range->notifier_seq = LONG_MAX;
> > +	range->flags.migrate_vram = migrate_vram ? 1 : 0;
> > +
> > +	return range;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_check_pages - Check pages
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + * @start: Start address
> > + * @end: End address
> > + *
> > + * Check if pages between start and end have been faulted in on the CPU. Used to
> > + * prevent migration of pages without CPU backing store.
> > + *
> > + * Returns:
> > + * True if pages have been faulted into CPU, False otherwise
> > + */
> > +static bool drm_gpusvm_check_pages(struct drm_gpusvm *gpusvm,
> > +				   struct drm_gpusvm_notifier *notifier,
> > +				   u64 start, u64 end)
> > +{
> > +	struct hmm_range hmm_range = {
> > +		.default_flags = 0,
> > +		.notifier = &notifier->notifier,
> > +		.start = start,
> > +		.end = end,
> > +		.dev_private_owner = gpusvm->device_private_page_owner,
> > +	};
> > +	unsigned long timeout =
> > +		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> > +	unsigned long *pfns;
> > +	unsigned long npages = npages_in_range(start, end);
> > +	int err, i;
> > +
> > +	mmap_assert_locked(gpusvm->mm);
> > +
> > +	pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
> > +	if (!pfns)
> > +		return false;
> > +
> > +	hmm_range.notifier_seq = mmu_interval_read_begin(&notifier->notifier);
> > +	hmm_range.hmm_pfns = pfns;
> > +
> > +	while (true) {
> > +		err = hmm_range_fault(&hmm_range);
> > +		if (err == -EBUSY) {
> > +			if (time_after(jiffies, timeout))
> > +				break;
> > +
> > +			hmm_range.notifier_seq = mmu_interval_read_begin(&notifier->notifier);
> > +			continue;
> > +		}
> > +		break;
> > +	}
> > +	if (err)
> > +		goto err_free;
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		if (!(pfns[i] & HMM_PFN_VALID)) {
> > +			err = -EFAULT;
> > +			goto err_free;
> > +		}
> > +	}
> > +
> > +err_free:
> > +	kvfree(pfns);
> > +	return err ? false : true;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_chunk_size - Determine chunk size for GPU SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + * @vas: Pointer to the virtual memory area structure
> > + * @fault_addr: Fault address
> > + * @gpuva_start: Start address of GPUVA which mirrors CPU
> > + * @gpuva_end: End address of GPUVA which mirrors CPU
> > + * @check_pages: Flag indicating whether to check pages
> > + *
> > + * This function determines the chunk size for the GPU SVM range based on the
> > + * fault address, GPU SVM chunk sizes, existing GPU SVM ranges, and the virtual
> > + * memory area boundaries.
> > + *
> > + * Returns:
> > + * Chunk size on success, LONG_MAX on failure.
> > + */
> > +static u64 drm_gpusvm_range_chunk_size(struct drm_gpusvm *gpusvm,
> > +				       struct drm_gpusvm_notifier *notifier,
> > +				       struct vm_area_struct *vas,
> > +				       u64 fault_addr, u64 gpuva_start,
> > +				       u64 gpuva_end, bool check_pages)
> > +{
> > +	u64 start, end;
> > +	int i = 0;
> > +
> > +retry:
> > +	for (; i < gpusvm->num_chunks; ++i) {
> > +		start = ALIGN_DOWN(fault_addr, gpusvm->chunk_sizes[i]);
> > +		end = ALIGN(fault_addr + 1, gpusvm->chunk_sizes[i]);
> > +
> > +		if (start >= vas->vm_start && end <= vas->vm_end &&
> > +		    start >= notifier->interval.start &&
> > +		    end <= notifier->interval.end &&
> > +		    start >= gpuva_start && end <= gpuva_end)
> > +			break;
> > +	}
> > +
> > +	if (i == gpusvm->num_chunks)
> > +		return LONG_MAX;
> > +
> > +	/*
> > +	 * If allocating more than one page, ensure the allocation does not
> > +	 * overlap with existing ranges.
> > +	 */
> > +	if (end - start != SZ_4K) {
> > +		struct drm_gpusvm_range *range;
> > +
> > +		range = drm_gpusvm_range_find(notifier, start, end);
> > +		if (range) {
> > +			++i;
> > +			goto retry;
> > +		}
> > +
> > +		/*
> > +		 * XXX: Only create range on pages CPU has faulted in. Without
> > +		 * this check, or prefault, on BMG 'xe_exec_system_allocator --r
> > +		 * process-many-malloc' fails. In the failure case, each process
> > +		 * mallocs 16k but the CPU VMA is ~128k which results in 64k SVM
> > +		 * ranges. When migrating the SVM ranges, some processes fail in
> > +		 * drm_gpusvm_migrate_to_vram with 'migrate.cpages != npages'
> > +		 * and then upon drm_gpusvm_range_get_pages device pages from
> > +		 * other processes are collected + faulted in which creates all
> > +		 * sorts of problems. Unsure exactly how this is happening; the
> > +		 * problem also goes away if 'xe_exec_system_allocator --r
> > +		 * process-many-malloc' mallocs at least 64k at a time.
> > +		 */
> > +		if (check_pages &&
> > +		    !drm_gpusvm_check_pages(gpusvm, notifier, start, end)) {
> > +			++i;
> > +			goto retry;
> > +		}
> > +	}
> > +
> > +	return end - start;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_find_or_insert - Find or insert GPU SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @fault_addr: Fault address
> > + * @gpuva_start: Start address of GPUVA which mirrors CPU
> > + * @gpuva_end: End address of GPUVA which mirrors CPU
> > + * @ctx: GPU SVM context
> > + *
> > + * This function finds or inserts a newly allocated GPU SVM range based on the
> > + * fault address. Caller must hold a lock to protect range lookup and insertion.
> > + *
> > + * Returns:
> > + * Pointer to the GPU SVM range on success, ERR_PTR() on failure.
> > + */
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64 fault_addr,
> > +				u64 gpuva_start, u64 gpuva_end,
> > +				const struct drm_gpusvm_ctx *ctx)
> > +{
> > +	struct drm_gpusvm_notifier *notifier;
> > +	struct drm_gpusvm_range *range;
> > +	struct mm_struct *mm = gpusvm->mm;
> > +	struct vm_area_struct *vas;
> > +	bool notifier_alloc = false;
> > +	u64 chunk_size;
> > +	int err;
> > +	bool migrate_vram;
> > +
> > +	if (fault_addr < gpusvm->mm_start ||
> > +	    fault_addr > gpusvm->mm_start + gpusvm->mm_range) {
> > +		err = -EINVAL;
> > +		goto err_out;
> > +	}
> > +
> > +	if (!ctx->mmap_locked) {
> > +		if (!mmget_not_zero(mm)) {
> > +			err = -EFAULT;
> > +			goto err_out;
> > +		}
> > +		mmap_write_lock(mm);
> > +	}
> > +
> > +	mmap_assert_write_locked(mm);
> > +
> > +	notifier = drm_gpusvm_notifier_find(gpusvm, fault_addr);
> > +	if (!notifier) {
> > +		notifier = drm_gpusvm_notifier_alloc(gpusvm, fault_addr);
> > +		if (IS_ERR(notifier)) {
> > +			err = PTR_ERR(notifier);
> > +			goto err_mmunlock;
> > +		}
> > +		notifier_alloc = true;
> > +		err = mmu_interval_notifier_insert_locked(&notifier->notifier,
> > +							  mm, notifier->interval.start,
> > +							  notifier->interval.end -
> > +							  notifier->interval.start,
> > +							  &drm_gpusvm_notifier_ops);
> > +		if (err)
> > +			goto err_notifier;
> > +	}
> > +
> > +	vas = vma_lookup(mm, fault_addr);
> > +	if (!vas) {
> > +		err = -ENOENT;
> > +		goto err_notifier_remove;
> > +	}
> > +
> > +	if (!ctx->read_only && !(vas->vm_flags & VM_WRITE)) {
> > +		err = -EPERM;
> > +		goto err_notifier_remove;
> > +	}
> > +
> > +	range = drm_gpusvm_range_find(notifier, fault_addr, fault_addr + 1);
> > +	if (range)
> > +		goto out_mmunlock;
> > +	/*
> > +	 * XXX: Short-circuiting migration based on migrate_vma_* current
> > +	 * limitations. If/when migrate_vma_* add more support, this logic will
> > +	 * have to change.
> > +	 */
> > +	migrate_vram = ctx->vram_possible &&
> > +		vma_is_anonymous(vas) && !is_vm_hugetlb_page(vas);
> > +
> > +	chunk_size = drm_gpusvm_range_chunk_size(gpusvm, notifier, vas,
> > +						 fault_addr, gpuva_start,
> > +						 gpuva_end, migrate_vram &&
> > +						 !ctx->prefault);
> > +	if (chunk_size == LONG_MAX) {
> > +		err = -EINVAL;
> > +		goto err_notifier_remove;
> > +	}
> > +
> > +	range = drm_gpusvm_range_alloc(gpusvm, notifier, fault_addr, chunk_size,
> > +				       migrate_vram);
> > +	if (IS_ERR(range)) {
> > +		err = PTR_ERR(range);
> > +		goto err_notifier_remove;
> > +	}
> > +
> > +	drm_gpusvm_range_insert(notifier, range);
> > +	if (notifier_alloc)
> > +		drm_gpusvm_notifier_insert(gpusvm, notifier);
> > +
> > +	if (ctx->prefault) {
> > +		struct drm_gpusvm_ctx __ctx = *ctx;
> > +
> > +		__ctx.mmap_locked = true;
> > +		err = drm_gpusvm_range_get_pages(gpusvm, range, &__ctx);
> > +		if (err)
> > +			goto err_range_remove;
> > +	}
> > +
> > +out_mmunlock:
> > +	if (!ctx->mmap_locked) {
> > +		mmap_write_unlock(mm);
> > +		mmput(mm);
> > +	}
> > +
> > +	return range;
> > +
> > +err_range_remove:
> > +	__drm_gpusvm_range_remove(notifier, range);
> > +err_notifier_remove:
> > +	if (notifier_alloc)
> > +		mmu_interval_notifier_remove(&notifier->notifier);
> > +err_notifier:
> > +	if (notifier_alloc)
> > +		drm_gpusvm_notifier_free(gpusvm, notifier);
> > +err_mmunlock:
> > +	if (!ctx->mmap_locked) {
> > +		mmap_write_unlock(mm);
> > +		mmput(mm);
> > +	}
> > +err_out:
> > +	return ERR_PTR(err);
> > +}
> > +
> > +/**
> > + * for_each_dma_page - iterate over pages in a DMA region
> > + * @i__: the current page index in the iteration
> > + * @j__: the current page index, log order, in the iteration
> > + * @npages__: the total number of pages in the DMA region
> > + * @order__: the order of the pages in the DMA region
> > + *
> > + * This macro iterates over each page in a DMA region. The DMA region
> > + * is assumed to be composed of 2^@order__ pages, and the macro will
> > + * step through the region one block of 2^@order__ pages at a time.
> > + */
> > +#define for_each_dma_page(i__, j__, npages__, order__)	\
> > +	for ((i__) = 0, (j__) = 0; (i__) < (npages__);	\
> > +	     (j__)++, (i__) += 0x1 << (order__))
> > +
> > +/**
> > + * __drm_gpusvm_range_unmap_pages - Unmap pages associated with a GPU SVM range (internal)
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * This function unmaps pages associated with a GPU SVM range. It assumes and
> > + * asserts that the correct locking is in place when called.
> > + */
> > +static void __drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > +					   struct drm_gpusvm_range *range)
> > +{
> > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > +
> > +	if (range->pages) {
> > +		unsigned long i, j, npages = npages_in_range(range->va.start,
> > +							     range->va.end);
> > +
> > +		if (range->flags.has_dma_mapping) {
> > +			for_each_dma_page(i, j, npages, range->order)
> > +				dma_unmap_page(gpusvm->drm->dev,
> > +					       range->dma_addr[j],
> > +					       PAGE_SIZE << range->order,
> > +					       DMA_BIDIRECTIONAL);
> > +		}
> > +
> > +		range->flags.has_vram_pages = false;
> > +		range->flags.has_dma_mapping = false;
> > +	}
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_free_pages - Free pages associated with a GPU SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * This function frees pages associated with a GPU SVM range.
> > + */
> > +static void drm_gpusvm_range_free_pages(struct drm_gpusvm *gpusvm,
> > +					struct drm_gpusvm_range *range)
> > +{
> > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > +
> > +	if (range->pages) {
> > +		if (range->flags.kfree_mapping) {
> > +			kfree(range->dma_addr);
> > +			range->flags.kfree_mapping = false;
> > +			range->pages = NULL;
> > +		} else {
> > +			kvfree(range->pages);
> > +			range->pages = NULL;
> > +		}
> > +	}
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_remove - Remove GPU SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range to be removed
> > + *
> > + * This function removes the specified GPU SVM range and also removes the parent
> > + * GPU SVM notifier if no more ranges remain in the notifier. The caller must
> > + * hold a lock to protect range and notifier removal.
> > + */
> > +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> > +			     struct drm_gpusvm_range *range)
> > +{
> > +	struct drm_gpusvm_notifier *notifier;
> > +
> > +	notifier = drm_gpusvm_notifier_find(gpusvm, range->va.start);
> > +	if (WARN_ON_ONCE(!notifier))
> > +		return;
> > +
> > +	drm_gpusvm_notifier_lock(gpusvm);
> > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > +	drm_gpusvm_range_free_pages(gpusvm, range);
> > +	__drm_gpusvm_range_remove(notifier, range);
> > +	drm_gpusvm_notifier_unlock(gpusvm);
> > +
> > +	drm_gpusvm_range_put(range);
> > +
> > +	if (RB_EMPTY_ROOT(&notifier->root.rb_root)) {
> > +		if (!notifier->flags.removed)
> > +			mmu_interval_notifier_remove(&notifier->notifier);
> > +		drm_gpusvm_notifier_remove(gpusvm, notifier);
> > +		drm_gpusvm_notifier_free(gpusvm, notifier);
> > +	}
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_get - Get a reference to GPU SVM range
> > + * @range: Pointer to the GPU SVM range
> > + *
> > + * This function increments the reference count of the specified GPU SVM range.
> > + *
> > + * Returns:
> > + * Pointer to the GPU SVM range.
> > + */
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_get(struct drm_gpusvm_range *range)
> > +{
> > +	kref_get(&range->refcount);
> > +
> > +	return range;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_destroy - Destroy GPU SVM range
> > + * @refcount: Pointer to the reference counter embedded in the GPU SVM range
> > + *
> > + * This function destroys the specified GPU SVM range when its reference count
> > + * reaches zero. If a custom range-free function is provided, it is invoked to
> > + * free the range; otherwise, the range is deallocated using kfree().
> > + */
> > +static void drm_gpusvm_range_destroy(struct kref *refcount)
> > +{
> > +	struct drm_gpusvm_range *range =
> > +		container_of(refcount, struct drm_gpusvm_range, refcount);
> > +	struct drm_gpusvm *gpusvm = range->gpusvm;
> > +
> > +	if (gpusvm->ops->range_free)
> > +		gpusvm->ops->range_free(range);
> > +	else
> > +		kfree(range);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_put - Put a reference to GPU SVM range
> > + * @range: Pointer to the GPU SVM range
> > + *
> > + * This function decrements the reference count of the specified GPU SVM range
> > + * and frees it when the count reaches zero.
> > + */
> > +void drm_gpusvm_range_put(struct drm_gpusvm_range *range)
> > +{
> > +	kref_put(&range->refcount, drm_gpusvm_range_destroy);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_pages_valid - GPU SVM range pages valid
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * This function determines if a GPU SVM range's pages are valid. Expected to be
> > + * called holding gpusvm->notifier_lock and as the last step before committing a
> > + * GPU binding.
> > + *
> > + * Returns:
> > + * True if GPU SVM range has valid pages, False otherwise
> > + */
> > +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> > +				  struct drm_gpusvm_range *range)
> > +{
> > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > +
> > +	return range->flags.has_vram_pages || range->flags.has_dma_mapping;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_pages_valid_unlocked - GPU SVM range pages valid unlocked
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * This function determines if a GPU SVM range's pages are valid. Expected to be
> > + * called without holding gpusvm->notifier_lock.
> > + *
> > + * Returns:
> > + * True if GPU SVM range has valid pages, False otherwise
> > + */
> > +static bool
> > +drm_gpusvm_range_pages_valid_unlocked(struct drm_gpusvm *gpusvm,
> > +				      struct drm_gpusvm_range *range)
> > +{
> > +	bool pages_valid;
> > +
> > +	if (!range->pages)
> > +		return false;
> > +
> > +	drm_gpusvm_notifier_lock(gpusvm);
> > +	pages_valid = drm_gpusvm_range_pages_valid(gpusvm, range);
> > +	if (!pages_valid && range->flags.kfree_mapping) {
> > +		kfree(range->dma_addr);
> > +		range->flags.kfree_mapping = false;
> > +		range->pages = NULL;
> > +	}
> > +	drm_gpusvm_notifier_unlock(gpusvm);
> > +
> > +	return pages_valid;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_get_pages - Get pages for a GPU SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + * @ctx: GPU SVM context
> > + *
> > + * This function gets pages for a GPU SVM range and ensures they are mapped for
> > + * DMA access.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       const struct drm_gpusvm_ctx *ctx)
> > +{
> > +	struct mmu_interval_notifier *notifier = &range->notifier->notifier;
> > +	struct hmm_range hmm_range = {
> > +		.default_flags = HMM_PFN_REQ_FAULT | (ctx->read_only ? 0 :
> > +			HMM_PFN_REQ_WRITE),
> > +		.notifier = notifier,
> > +		.start = range->va.start,
> > +		.end = range->va.end,
> > +		.dev_private_owner = gpusvm->device_private_page_owner,
> > +	};
> > +	struct mm_struct *mm = gpusvm->mm;
> > +	unsigned long timeout =
> > +		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> > +	unsigned long i, j;
> > +	unsigned long npages = npages_in_range(range->va.start, range->va.end);
> > +	unsigned int order = 0;
> > +	unsigned long *pfns;
> > +	struct page **pages;
> > +	int err = 0;
> > +	bool vram_pages = !!range->flags.migrate_vram;
> > +	bool alloc_pfns = false, kfree_mapping;
> > +
> > +retry:
> > +	kfree_mapping = false;
> > +	hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> > +	if (drm_gpusvm_range_pages_valid_unlocked(gpusvm, range))
> > +		return 0;
> > +
> > +	if (range->notifier_seq == hmm_range.notifier_seq && range->pages) {
> > +		if (ctx->prefault)
> > +			return 0;
> > +
> > +		pfns = (unsigned long *)range->pages;
> > +		pages = range->pages;
> > +		goto map_pages;
> > +	}
> > +
> > +	if (!range->pages) {
> > +		pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
> > +		if (!pfns)
> > +			return -ENOMEM;
> > +		alloc_pfns = true;
> > +	} else {
> > +		pfns = (unsigned long *)range->pages;
> > +	}
> > +
> > +	if (!ctx->mmap_locked) {
> > +		if (!mmget_not_zero(mm)) {
> > +			err = -EFAULT;
> > +			goto err_out;
> > +		}
> > +	}
> > +
> > +	hmm_range.hmm_pfns = pfns;
> > +	while (true) {
> > +		/* Must be checked after mmu_interval_read_begin */
> > +		if (range->flags.unmapped) {
> > +			err = -EFAULT;
> > +			break;
> > +		}
> > +
> > +		if (!ctx->mmap_locked) {
> > +			/*
> > +			 * XXX: HMM locking document indicates only a read-lock
> > +			 * is required but there appears to be a window between
> > +			 * the MMU_NOTIFY_MIGRATE event triggered in a CPU fault
> > +			 * via migrate_vma_setup and the pages actually moving
> > +			 * in migrate_vma_finalize in which this code can grab
> > +			 * garbage pages. Grabbing the write-lock if the range
> > +			 * is attached to vram appears to protect against this
> > +			 * race.
> > +			 */
> > +			if (vram_pages)
> > +				mmap_write_lock(mm);
> > +			else
> > +				mmap_read_lock(mm);
> > +		}
> > +		err = hmm_range_fault(&hmm_range);
> > +		if (!ctx->mmap_locked) {
> > +			if (vram_pages)
> > +				mmap_write_unlock(mm);
> > +			else
> > +				mmap_read_unlock(mm);
> > +		}
> > +
> > +		if (err == -EBUSY) {
> > +			if (time_after(jiffies, timeout))
> > +				break;
> > +
> > +			hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> > +			continue;
> > +		}
> > +		break;
> > +	}
> > +	if (!ctx->mmap_locked)
> > +		mmput(mm);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	pages = (struct page **)pfns;
> > +
> > +	if (ctx->prefault) {
> > +		range->pages = pages;
> > +		goto set_seqno;
> > +	}
> > +
> > +map_pages:
> > +	if (is_device_private_page(hmm_pfn_to_page(pfns[0]))) {
> > +		WARN_ON_ONCE(!range->vram_allocation);
> > +
> > +		for (i = 0; i < npages; ++i) {
> > +			pages[i] = hmm_pfn_to_page(pfns[i]);
> > +
> > +			if (WARN_ON_ONCE(!is_device_private_page(pages[i]))) {
> > +				err = -EOPNOTSUPP;
> > +				goto err_free;
> > +			}
> > +		}
> 
> You can't do the above, because the pfn you get from hmm come with zero
> guarantees, you neither hold a page reference nor the page lock. The only
> thing you can do is grab the pagetable lock (or mmu notifier locks) and
> check it's still valid, before you can touch any state. I think the
> range->vram_allocation is probably always valid since you clean that up
> under the same lock/thread, but there's good chances the vram allocation
> is otherwise already gone for good. Or you get an inconsistent snapshot.
> 

I haven't seen this pop in my testing yet, which is fairly thorough. My
thinking was that, with migration always enforced at range granularity, we'd
never get mixed mappings from the core since migration is completely under
the driver's control. Maybe I'm not understanding what you are saying
here...
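
Just to make sure I follow what you're suggesting: is it roughly the below
(untested sketch, reusing the helpers / names from this patch), i.e. only
dereference the hmm pfns after taking the notifier lock and re-checking the
seqno so the snapshot can't be stale?

	drm_gpusvm_notifier_lock(gpusvm);
	if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
		drm_gpusvm_notifier_unlock(gpusvm);
		err = -EAGAIN;
		goto retry;
	}

	for (i = 0; i < npages; ++i) {
		pages[i] = hmm_pfn_to_page(pfns[i]);
		if (WARN_ON_ONCE(!is_device_private_page(pages[i]))) {
			err = -EOPNOTSUPP;
			break;
		}
	}

	if (!err) {
		range->flags.has_vram_pages = true;
		range->pages = pages;
	}
	drm_gpusvm_notifier_unlock(gpusvm);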

> > +
> > +		/* Do not race with notifier unmapping pages */
> > +		drm_gpusvm_notifier_lock(gpusvm);
> > +		range->flags.has_vram_pages = true;
> > +		range->pages = pages;
> > +		if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
> > +			err = -EAGAIN;
> > +			__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > +		}
> > +		drm_gpusvm_notifier_unlock(gpusvm);
> > +	} else {
> > +		dma_addr_t *dma_addr = (dma_addr_t *)pfns;
> > +
> > +		for_each_dma_page(i, j, npages, order) {
> > +			if (WARN_ON_ONCE(i && order !=
> > +					 hmm_pfn_to_map_order(pfns[i]))) {
> > +				err = -EOPNOTSUPP;
> > +				npages = i;
> > +				goto err_unmap;
> > +			}
> > +			order = hmm_pfn_to_map_order(pfns[i]);
> > +
> > +			pages[j] = hmm_pfn_to_page(pfns[i]);
> > +			if (WARN_ON_ONCE(is_zone_device_page(pages[j]))) {
> > +				err = -EOPNOTSUPP;
> > +				npages = i;
> > +				goto err_unmap;
> > +			}
> > +
> > +			set_page_dirty_lock(pages[j]);
> > +			mark_page_accessed(pages[j]);
> 
> You can't do these, because you don't hold a page reference. They're also
> not needed because hmm_range_fault goes thorugh the full mkwrite dance,
> which takes care of these, unlike the gup family of functions.
>

This is a leftover from our existing userptr code and it does appear to
be incorrect. Let me remove this and fix up our userptr code while I'm at
it.
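
Concretely I think that loop body just loses those two calls and otherwise
stays as is, i.e. something like (sketch):

	order = hmm_pfn_to_map_order(pfns[i]);

	pages[j] = hmm_pfn_to_page(pfns[i]);
	if (WARN_ON_ONCE(is_zone_device_page(pages[j]))) {
		err = -EOPNOTSUPP;
		npages = i;
		goto err_unmap;
	}

	/*
	 * No set_page_dirty_lock() / mark_page_accessed() here --
	 * hmm_range_fault() has already done the mkwrite dance.
	 */
	dma_addr[j] = dma_map_page(gpusvm->drm->dev, pages[j], 0,
				   PAGE_SIZE << order, DMA_BIDIRECTIONAL);

with the same change mirrored in the existing xe userptr code.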
 
> > +
> > +			dma_addr[j] = dma_map_page(gpusvm->drm->dev,
> > +						   pages[j], 0,
> > +						   PAGE_SIZE << order,
> > +						   DMA_BIDIRECTIONAL);
> > +			if (dma_mapping_error(gpusvm->drm->dev, dma_addr[j])) {
> > +				err = -EFAULT;
> > +				npages = i;
> > +				goto err_unmap;
> > +			}
> 
> Aside: dma_map_page is about the only thing that's ok, because it doesn't
> do anything harmful and especially doesn't make any assumption about what
> that page is.
> 

+1

> > +		}
> > +
> > +		/* Huge pages, reduce memory footprint */
> > +		if (order) {
> > +			dma_addr = kmalloc_array(j, sizeof(*dma_addr),
> > +						 GFP_KERNEL);
> > +			if (dma_addr) {
> > +				for (i = 0; i < j; ++i)
> > +					dma_addr[i] = (dma_addr_t)pfns[i];
> > +				kvfree(pfns);
> > +				kfree_mapping = true;
> > +			} else {
> > +				dma_addr = (dma_addr_t *)pfns;
> > +			}
> > +		}
> > +
> > +		/* Do not race with notifier unmapping pages */
> > +		drm_gpusvm_notifier_lock(gpusvm);
> > +		range->order = order;
> > +		range->flags.kfree_mapping = kfree_mapping;
> > +		range->flags.has_dma_mapping = true;
> > +		range->dma_addr = dma_addr;
> > +		range->vram_allocation = NULL;
> > +		if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
> > +			err = -EAGAIN;
> > +			__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > +		}
> > +		drm_gpusvm_notifier_unlock(gpusvm);
> > +	}
> > +
> > +	if (err == -EAGAIN)
> > +		goto retry;
> > +set_seqno:
> > +	range->notifier_seq = hmm_range.notifier_seq;
> > +
> > +	return 0;
> > +
> > +err_unmap:
> > +	for_each_dma_page(i, j, npages, order)
> > +		dma_unmap_page(gpusvm->drm->dev,
> > +			       (dma_addr_t)pfns[j],
> > +			       PAGE_SIZE << order, DMA_BIDIRECTIONAL);
> > +err_free:
> > +	if (alloc_pfns)
> > +		kvfree(pfns);
> > +err_out:
> > +	return err;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_unmap_pages - Unmap pages associated with a GPU SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + * @ctx: GPU SVM context
> > + *
> > + * This function unmaps pages associated with a GPU SVM range. If @in_notifier
> > + * is set, it is assumed that gpusvm->notifier_lock is held in write mode; if it
> > + * is clear, it acquires gpusvm->notifier_lock in read mode. Must be called on
> > + * each GPU SVM range attached to notifier in gpusvm->ops->invalidate for IOMMU
> > + * security model.
> > + */
> > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > +				  struct drm_gpusvm_range *range,
> > +				  const struct drm_gpusvm_ctx *ctx)
> > +{
> > +	if (ctx->in_notifier)
> > +		lockdep_assert_held_write(&gpusvm->notifier_lock);
> > +	else
> > +		drm_gpusvm_notifier_lock(gpusvm);
> > +
> > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > +
> > +	if (!ctx->in_notifier)
> > +		drm_gpusvm_notifier_unlock(gpusvm);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migration_put_page - Put a migration page
> > + * @page: Pointer to the page to put
> > + *
> > + * This function unlocks and puts a page.
> > + */
> > +static void drm_gpusvm_migration_put_page(struct page *page)
> > +{
> > +	unlock_page(page);
> > +	put_page(page);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migration_put_pages - Put migration pages
> > + * @npages: Number of pages
> > + * @migrate_pfn: Array of migrate page frame numbers
> > + *
> > + * This function puts an array of pages.
> > + */
> > +static void drm_gpusvm_migration_put_pages(unsigned long npages,
> > +					   unsigned long *migrate_pfn)
> > +{
> > +	unsigned long i;
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		if (!migrate_pfn[i])
> > +			continue;
> > +
> > +		drm_gpusvm_migration_put_page(migrate_pfn_to_page(migrate_pfn[i]));
> > +		migrate_pfn[i] = 0;
> > +	}
> > +}
> > +
> > +/**
> > + * drm_gpusvm_get_vram_page - Get a reference to a VRAM page
> > + * @page: Pointer to the page
> > + * @zdd: Pointer to the GPU SVM zone device data
> > + *
> > + * This function associates the given page with the specified GPU SVM zone
> > + * device data and initializes it for zone device usage.
> > + */
> > +static void drm_gpusvm_get_vram_page(struct page *page,
> > +				     struct drm_gpusvm_zdd *zdd)
> > +{
> > +	page->zone_device_data = drm_gpusvm_zdd_get(zdd);
> > +	zone_device_page_init(page);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_map_pages() - Map migration pages for GPU SVM migration
> > + * @dev: The device for which the pages are being mapped
> > + * @dma_addr: Array to store DMA addresses corresponding to mapped pages
> > + * @migrate_pfn: Array of migrate page frame numbers to map
> > + * @npages: Number of pages to map
> > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > + *
> > + * This function maps pages of memory for migration usage in GPU SVM. It
> > + * iterates over each page frame number provided in @migrate_pfn, maps the
> > + * corresponding page, and stores the DMA address in the provided @dma_addr
> > + * array.
> > + *
> > + * Return: 0 on success, -EFAULT if an error occurs during mapping.
> > + */
> > +static int drm_gpusvm_migrate_map_pages(struct device *dev,
> > +					dma_addr_t *dma_addr,
> > +					long unsigned int *migrate_pfn,
> > +					unsigned long npages,
> > +					enum dma_data_direction dir)
> > +{
> > +	unsigned long i;
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		struct page *page = migrate_pfn_to_page(migrate_pfn[i]);
> > +
> > +		if (!page)
> > +			continue;
> > +
> > +		if (WARN_ON_ONCE(is_zone_device_page(page)))
> > +			return -EFAULT;
> > +
> > +		dma_addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir);
> > +		if (dma_mapping_error(dev, dma_addr[i]))
> > +			return -EFAULT;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously mapped for GPU SVM migration
> > + * @dev: The device for which the pages were mapped
> > + * @dma_addr: Array of DMA addresses corresponding to mapped pages
> > + * @npages: Number of pages to unmap
> > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > + *
> > + * This function unmaps previously mapped pages of memory for GPU Shared Virtual
> > + * Memory (SVM). It iterates over each DMA address provided in @dma_addr, checks
> > + * if it's valid and not already unmapped, and unmaps the corresponding page.
> > + */
> > +static void drm_gpusvm_migrate_unmap_pages(struct device *dev,
> > +					   dma_addr_t *dma_addr,
> > +					   unsigned long npages,
> > +					   enum dma_data_direction dir)
> > +{
> > +	unsigned long i;
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		if (!dma_addr[i] || dma_mapping_error(dev, dma_addr[i]))
> > +			continue;
> > +
> > +		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
> > +	}
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_to_vram - Migrate GPU SVM range to VRAM
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + * @vram_allocation: Driver-private pointer to the VRAM allocation. The caller
> > + *                   should hold a reference to the VRAM allocation, which
> > + *                   should be dropped via ops->vram_release or upon the
> > + *                   failure of this function.
> > + * @ctx: GPU SVM context
> > + *
> > + * This function migrates the specified GPU SVM range to VRAM. It performs the
> > + * necessary setup and invokes the driver-specific operations for migration to
> > + * VRAM. Upon successful return, @vram_allocation can safely reference @range
> > + * until ops->vram_release is called (which is only done upon successful
> > + * return).
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       void *vram_allocation,
> > +			       const struct drm_gpusvm_ctx *ctx)
> > +{
> > +	u64 start = range->va.start, end = range->va.end;
> > +	struct migrate_vma migrate = {
> > +		.start		= start,
> > +		.end		= end,
> > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > +		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
> > +	};
> > +	struct mm_struct *mm = gpusvm->mm;
> > +	unsigned long i, npages = npages_in_range(start, end);
> > +	struct vm_area_struct *vas;
> > +	struct drm_gpusvm_zdd *zdd = NULL;
> > +	struct page **pages;
> > +	dma_addr_t *dma_addr;
> > +	void *buf;
> > +	int err;
> > +
> > +	if (!range->flags.migrate_vram)
> > +		return -EINVAL;
> > +
> > +	if (!gpusvm->ops->populate_vram_pfn || !gpusvm->ops->copy_to_vram ||
> > +	    !gpusvm->ops->copy_to_sram)
> > +		return -EOPNOTSUPP;
> > +
> > +	if (!ctx->mmap_locked) {
> > +		if (!mmget_not_zero(mm)) {
> > +			err = -EFAULT;
> > +			goto err_out;
> > +		}
> > +		mmap_write_lock(mm);
> > +	}
> > +
> > +	mmap_assert_locked(mm);
> > +
> > +	vas = vma_lookup(mm, start);
> > +	if (!vas) {
> > +		err = -ENOENT;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	if (end > vas->vm_end || start < vas->vm_start) {
> > +		err = -EINVAL;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	if (!vma_is_anonymous(vas)) {
> > +		err = -EBUSY;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
> > +		       sizeof(*pages), GFP_KERNEL);
> > +	if (!buf) {
> > +		err = -ENOMEM;
> > +		goto err_mmunlock;
> > +	}
> > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
> > +
> > +	zdd = drm_gpusvm_zdd_alloc(range);
> > +	if (!zdd) {
> > +		err = -ENOMEM;
> > +		goto err_free;
> > +	}
> > +
> > +	migrate.vma = vas;
> > +	migrate.src = buf;
> > +	migrate.dst = migrate.src + npages;
> > +
> > +	err = migrate_vma_setup(&migrate);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	/*
> > +	 * FIXME: Below cases, !migrate.cpages and migrate.cpages != npages, not
> > +	 * always an error. Need to revisit possible cases and how to handle. We
> > +	 * could prefault on migrate.cpages != npages via hmm_range_fault.

This is a bit stale; I can update this comment.

> > +	 */
> 
> Yeah I think especially under contention partial migrations, at least back
> to sram due to cpu faults, are pretty much expected. And you need to cope
> somehow.
> 

I have seen these pop if the IGT calls mlock on the memory. My thinking
is that migration to VRAM is basically optional: if an error occurs we
fall back to leaving the range in SRAM rather than doing a partial
migration. This is what currently happens, so it is coped with.

If the memory is marked as must-be-in-VRAM (NIY), well then the user
program has done something wrong and we can kill the app (akin to a
segfault).
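
i.e. on the driver side the fault handler treats VRAM migration as purely
best effort, roughly (sketch, not the literal Xe code):

	if (range->flags.migrate_vram) {
		err = drm_gpusvm_migrate_to_vram(gpusvm, range,
						 vram_allocation, ctx);
		if (err)
			err = 0;	/* best effort: leave the range in SRAM */
	}

	err = drm_gpusvm_range_get_pages(gpusvm, range, ctx);

so a failed migration just means the range ends up mapped from SRAM pages
instead.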

> > +
> > +	if (!migrate.cpages) {
> > +		err = -EFAULT;
> > +		goto err_free;
> > +	}
> > +
> > +	if (migrate.cpages != npages) {
> > +		err = -EBUSY;
> > +		goto err_finalize;
> > +	}
> > +
> > +	err = gpusvm->ops->populate_vram_pfn(gpusvm, vram_allocation, npages,
> > +					     migrate.dst);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> > +					   migrate.src, npages, DMA_TO_DEVICE);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		struct page *page = pfn_to_page(migrate.dst[i]);
> > +
> > +		pages[i] = page;
> > +		migrate.dst[i] = migrate_pfn(migrate.dst[i]);
> > +		drm_gpusvm_get_vram_page(page, zdd);
> > +	}
> > +
> > +	err = gpusvm->ops->copy_to_vram(gpusvm, pages, dma_addr, npages);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	/* Upon success bind vram allocation to range and zdd */
> > +	range->vram_allocation = vram_allocation;
> > +	WRITE_ONCE(zdd->vram_allocation, vram_allocation);	/* Owns ref */
> > +
> > +err_finalize:
> > +	if (err)
> > +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> > +	migrate_vma_pages(&migrate);
> > +	migrate_vma_finalize(&migrate);
> > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
> > +				       DMA_TO_DEVICE);
> > +err_free:
> > +	if (zdd)
> > +		drm_gpusvm_zdd_put(zdd);
> > +	kvfree(buf);
> > +err_mmunlock:
> > +	if (!ctx->mmap_locked) {
> > +		mmap_write_unlock(mm);
> > +		mmput(mm);
> > +	}
> > +err_out:
> > +	return err;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_populate_sram_pfn - Populate SRAM PFNs for a VM area
> > + * @vas: Pointer to the VM area structure, can be NULL
> > + * @npages: Number of pages to populate
> > + * @src_mpfn: Source array of migrate PFNs
> > + * @mpfn: Array of migrate PFNs to populate
> > + * @addr: Start address for PFN allocation
> > + *
> > + * This function populates the SRAM migrate page frame numbers (PFNs) for the
> > + * specified VM area structure. It allocates and locks pages in the VM area for
> > + * SRAM usage. If @vas is non-NULL, alloc_page_vma() is used for allocation;
> > + * otherwise alloc_page() is used.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +static int drm_gpusvm_migrate_populate_sram_pfn(struct vm_area_struct *vas,
> > +						unsigned long npages,
> > +						unsigned long *src_mpfn,
> > +						unsigned long *mpfn, u64 addr)
> > +{
> > +	unsigned long i;
> > +
> > +	for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
> > +		struct page *page;
> > +
> > +		if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
> > +			continue;
> > +
> > +		if (vas)
> > +			page = alloc_page_vma(GFP_HIGHUSER, vas, addr);
> > +		else
> > +			page = alloc_page(GFP_HIGHUSER);
> > +
> > +		if (!page)
> > +			return -ENOMEM;
> > +
> > +		lock_page(page);
> > +		mpfn[i] = migrate_pfn(page_to_pfn(page));
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_evict_to_sram - Evict GPU SVM range to SRAM
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * Similar to __drm_gpusvm_migrate_to_sram but does not require the mmap lock;
> > + * migration is done via the migrate_device_* functions. This is a fallback
> > + * path, as it is preferred to issue migrations with the mmap lock held.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +static int drm_gpusvm_evict_to_sram(struct drm_gpusvm *gpusvm,
> > +				    struct drm_gpusvm_range *range)
> > +{
> > +	unsigned long npages;
> > +	struct page **pages;
> > +	unsigned long *src, *dst;
> > +	dma_addr_t *dma_addr;
> > +	void *buf;
> > +	int i, err = 0;
> > +
> > +	npages = npages_in_range(range->va.start, range->va.end);
> > +
> > +	buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr) +
> > +		       sizeof(*pages), GFP_KERNEL);
> > +	if (!buf) {
> > +		err = -ENOMEM;
> > +		goto err_out;
> > +	}
> > +	src = buf;
> > +	dst = buf + (sizeof(*src) * npages);
> > +	dma_addr = buf + (2 * sizeof(*src) * npages);
> > +	pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) * npages;
> > +
> > +	err = gpusvm->ops->populate_vram_pfn(gpusvm, range->vram_allocation,
> > +					     npages, src);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	err = migrate_device_vma_range(gpusvm->mm,
> > +				       gpusvm->device_private_page_owner, src,
> > +				       npages, range->va.start);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	err = drm_gpusvm_migrate_populate_sram_pfn(NULL, npages, src, dst, 0);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> > +					   dst, npages, DMA_BIDIRECTIONAL);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	for (i = 0; i < npages; ++i)
> > +		pages[i] = migrate_pfn_to_page(src[i]);
> > +
> > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr, npages);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +err_finalize:
> > +	if (err)
> > +		drm_gpusvm_migration_put_pages(npages, dst);
> > +	migrate_device_pages(src, dst, npages);
> > +	migrate_device_finalize(src, dst, npages);
> > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
> > +				       DMA_BIDIRECTIONAL);
> > +err_free:
> > +	kvfree(buf);
> > +err_out:
> > +
> > +	return err;
> > +}
> > +
> > +/**
> > + * __drm_gpusvm_migrate_to_sram - Migrate GPU SVM range to SRAM (internal)
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @vas: Pointer to the VM area structure
> > + * @page: Pointer to the page for fault handling (can be NULL)
> > + * @start: Start address of the migration range
> > + * @end: End address of the migration range
> > + *
> > + * This internal function performs the migration of the specified GPU SVM range
> > + * to SRAM. It sets up the migration, populates + dma maps SRAM PFNs, and
> > + * invokes the driver-specific operations for migration to SRAM.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +static int __drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > +					struct vm_area_struct *vas,
> > +					struct page *page,
> > +					u64 start, u64 end)
> > +{
> > +	struct migrate_vma migrate = {
> > +		.vma		= vas,
> > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > +		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
> > +		.fault_page	= page,
> > +	};
> > +	unsigned long npages;
> > +	struct page **pages;
> > +	dma_addr_t *dma_addr;
> > +	void *buf;
> > +	int i, err = 0;
> > +
> > +	mmap_assert_locked(gpusvm->mm);
> 
> That's the wrong mm, at least for the ->migrate_to_ram path. You might be
> called on a anon mapping from a child process. That also means that the
> vma you're looking at might have no relationship with anythign you're
> tracking in your gpusvm.
>

Hmm, as discussed in [3], I haven't added tests with child processes yet.
Let me do that and update the design as needed. This likely isn't
correct, as you say.

[3] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1111169 
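
The test I have in mind is roughly the below (sketch only, not an actual IGT
yet; submit_and_wait_gpu_fault() / check_buffer() are hypothetical helpers):

	size_t sz = 16 * 4096;
	char *ptr = mmap(NULL, sz, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	memset(ptr, 0xaa, sz);			/* populate in the parent */
	submit_and_wait_gpu_fault(fd, ptr, sz);	/* range migrated to VRAM */

	if (fork() == 0) {
		/* Child CPU fault on the CoW anon mapping -> migrate_to_ram */
		ptr[0] = 0x55;
		_exit(0);
	}
	wait(NULL);

	check_buffer(fd, ptr, sz);	/* both CPU and GPU views still sane */

plus variants where the child (rather than the parent) submits the GPU work.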
 
> > +
> > +	/* Corner where VMA area struct has been partially unmapped */
> > +	if (start < vas->vm_start)
> > +		start = vas->vm_start;
> > +	if (end > vas->vm_end)
> > +		end = vas->vm_end;
> > +
> > +	migrate.start = start;
> > +	migrate.end = end;
> > +	npages = npages_in_range(start, end);
> > +
> > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
> > +		       sizeof(*pages), GFP_KERNEL);
> > +	if (!buf) {
> > +		err = -ENOMEM;
> > +		goto err_out;
> > +	}
> > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
> > +
> > +	migrate.vma = vas;
> > +	migrate.src = buf;
> > +	migrate.dst = migrate.src + npages;
> > +
> > +	err = migrate_vma_setup(&migrate);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	/* Raced with another CPU fault, nothing to do */
> > +	if (!migrate.cpages)
> > +		goto err_free;
> > +
> > +	err = drm_gpusvm_migrate_populate_sram_pfn(vas, npages,
> > +						   migrate.src, migrate.dst,
> > +						   start);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> > +					   migrate.dst, npages,
> > +					   DMA_BIDIRECTIONAL);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	for (i = 0; i < npages; ++i)
> > +		pages[i] = migrate_pfn_to_page(migrate.src[i]);
> > +
> > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr, npages);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +err_finalize:
> > +	if (err)
> > +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> > +	migrate_vma_pages(&migrate);
> > +	migrate_vma_finalize(&migrate);
> > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
> > +				       DMA_BIDIRECTIONAL);
> > +err_free:
> > +	kvfree(buf);
> > +err_out:
> > +	mmap_assert_locked(gpusvm->mm);
> > +
> > +	return err;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_to_sram - Migrate (evict) GPU SVM range to SRAM
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + * @ctx: GPU SVM context
> > + *
> > + * This function initiates the migration of the specified GPU SVM range to
> > + * SRAM. It performs necessary checks and invokes the internal migration
> > + * function for actual migration.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       const struct drm_gpusvm_ctx *ctx)
> > +{
> > +	u64 start = range->va.start, end = range->va.end;
> > +	struct mm_struct *mm = gpusvm->mm;
> > +	struct vm_area_struct *vas;
> > +	int err;
> > +	bool retry = false;
> > +
> > +	if (!ctx->mmap_locked) {
> > +		if (!mmget_not_zero(mm)) {
> > +			err = -EFAULT;
> > +			goto err_out;
> > +		}
> > +		if (ctx->trylock_mmap) {
> > +			if (!mmap_read_trylock(mm))  {
> > +				err = drm_gpusvm_evict_to_sram(gpusvm, range);
> > +				goto err_mmput;
> > +			}
> > +		} else {
> > +			mmap_read_lock(mm);
> > +		}
> > +	}
> > +
> > +	mmap_assert_locked(mm);
> > +
> > +	/*
> > +	 * Loop required to find all VMA area structs for the corner case when
> > +	 * VRAM backing has been partially unmapped from MM's address space.
> > +	 */
> > +again:
> > +	vas = find_vma(mm, start);
> > +	if (!vas) {
> > +		if (!retry)
> > +			err = -ENOENT;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	if (end <= vas->vm_start || start >= vas->vm_end) {
> > +		if (!retry)
> > +			err = -EINVAL;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	err = __drm_gpusvm_migrate_to_sram(gpusvm, vas, NULL, start, end);
> > +	if (err)
> > +		goto err_mmunlock;
> > +
> > +	if (vas->vm_end < end) {
> > +		retry = true;
> > +		start = vas->vm_end;
> > +		goto again;
> > +	}
> > +
> > +	if (!ctx->mmap_locked) {
> > +		mmap_read_unlock(mm);
> > +		/*
> > +		 * Using mmput_async as this function can be called while
> > +		 * holding a dma-resv lock, and a final put can grab the mmap
> > +		 * lock, causing a lock inversion.
> > +		 */
> > +		mmput_async(mm);
> > +	}
> > +
> > +	return 0;
> > +
> > +err_mmunlock:
> > +	if (!ctx->mmap_locked)
> > +		mmap_read_unlock(mm);
> > +err_mmput:
> > +	if (!ctx->mmap_locked)
> > +		mmput_async(mm);
> > +err_out:
> > +	return err;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_page_free - Put GPU SVM zone device data associated with a page
> > + * @page: Pointer to the page
> > + *
> > + * This function is a callback used to put the GPU SVM zone device data
> > + * associated with a page when it is being released.
> > + */
> > +static void drm_gpusvm_page_free(struct page *page)
> > +{
> > +	drm_gpusvm_zdd_put(page->zone_device_data);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_to_ram - Migrate GPU SVM range to RAM (page fault handler)
> > + * @vmf: Pointer to the fault information structure
> > + *
> > + * This function is a page fault handler used to migrate a GPU SVM range to RAM.
> > + * It retrieves the GPU SVM range information from the faulting page and invokes
> > + * the internal migration function to migrate the range back to RAM.
> > + *
> > + * Returns:
> > + * VM_FAULT_SIGBUS on failure, 0 on success.
> > + */
> > +static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault *vmf)
> > +{
> > +	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
> > +	int err;
> > +
> > +	err = __drm_gpusvm_migrate_to_sram(zdd->range->gpusvm,
> 
> So I think zdd->range doesn't work, because even within a single mm the
> vma mapping a given piece of anon memory does not need to be unique, you
> can duplicate them with mremap.
> 

This is attached to a page, not a VMA. Both AMD and Nvidia drivers use a
similar lookup mechanism.

> So all you have here is the physical memory and the vma, which might or
> might not be from the same process as gpusvm->mm.
> 
> Also the child process scenario means you using mmap_write on the fault
> side doesn't stop all cpu faults migrating stuff back.
> 
> Somewhat aside, but I think that means amdkfd's svm_range->migration_mutex
> is busted, because it's va based and so misses concurrently ongoing
> different mappings moving physical storage around underneath.
>

I think all of the above falls into the fork() + child-process issues
which you have raised. Until I test this out I can't speak to it with any
level of confidence, so I won't. Thanks for raising this issue; let me
write test cases as discussed and educate myself. Once I do that, we can
engage in further discussions.

Matt

> 
> Cheers, Sima
> 
> > +					   vmf->vma, vmf->page,
> > +					   zdd->range->va.start,
> > +					   zdd->range->va.end);
> > +
> > +	return err ? VM_FAULT_SIGBUS : 0;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_pagemap_ops - Device page map operations for GPU SVM
> > + */
> > +static const struct dev_pagemap_ops drm_gpusvm_pagemap_ops = {
> > +	.page_free = drm_gpusvm_page_free,
> > +	.migrate_to_ram = drm_gpusvm_migrate_to_ram,
> > +};
> > +
> > +/**
> > + * drm_gpusvm_pagemap_ops_get - Retrieve GPU SVM device page map operations
> > + *
> > + * Returns:
> > + * Pointer to the GPU SVM device page map operations structure.
> > + */
> > +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void)
> > +{
> > +	return &drm_gpusvm_pagemap_ops;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_has_mapping - Check if GPU SVM has mapping for the given address range
> > + * @gpusvm: Pointer to the GPU SVM structure.
> > + * @start: Start address
> > + * @end: End address
> > + *
> > + * Returns:
> > + * True if GPU SVM has mapping, False otherwise
> > + */
> > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start, u64 end)
> > +{
> > +	struct drm_gpusvm_notifier *notifier;
> > +
> > +	drm_gpusvm_for_each_notifier(notifier, gpusvm, start, end) {
> > +		struct drm_gpusvm_range *range = NULL;
> > +
> > +		drm_gpusvm_for_each_range(range, notifier, start, end)
> > +			return true;
> > +	}
> > +
> > +	return false;
> > +}
> > diff --git a/drivers/gpu/drm/xe/drm_gpusvm.h b/drivers/gpu/drm/xe/drm_gpusvm.h
> > new file mode 100644
> > index 000000000000..0ea70f8534a8
> > --- /dev/null
> > +++ b/drivers/gpu/drm/xe/drm_gpusvm.h
> > @@ -0,0 +1,415 @@
> > +/* SPDX-License-Identifier: MIT */
> > +/*
> > + * Copyright © 2024 Intel Corporation
> > + */
> > +
> > +#ifndef __DRM_GPUSVM_H__
> > +#define __DRM_GPUSVM_H__
> > +
> > +#include <linux/kref.h>
> > +#include <linux/mmu_notifier.h>
> > +#include <linux/workqueue.h>
> > +
> > +struct dev_pagemap_ops;
> > +struct drm_device;
> > +struct drm_gpusvm;
> > +struct drm_gpusvm_notifier;
> > +struct drm_gpusvm_ops;
> > +struct drm_gpusvm_range;
> > +
> > +/**
> > + * struct drm_gpusvm_ops - Operations structure for GPU SVM
> > + *
> > + * This structure defines the operations for GPU Shared Virtual Memory (SVM).
> > + * These operations are provided by the GPU driver to manage SVM ranges and
> > + * perform operations such as migration between VRAM and system RAM.
> > + */
> > +struct drm_gpusvm_ops {
> > +	/**
> > +	 * @notifier_alloc: Allocate a GPU SVM notifier (optional)
> > +	 *
> > +	 * This function shall allocate a GPU SVM notifier.
> > +	 *
> > +	 * Returns:
> > +	 * Pointer to the allocated GPU SVM notifier on success, NULL on failure.
> > +	 */
> > +	struct drm_gpusvm_notifier *(*notifier_alloc)(void);
> > +
> > +	/**
> > +	 * @notifier_free: Free a GPU SVM notifier (optional)
> > +	 * @notifier: Pointer to the GPU SVM notifier to be freed
> > +	 *
> > +	 * This function shall free a GPU SVM notifier.
> > +	 */
> > +	void (*notifier_free)(struct drm_gpusvm_notifier *notifier);
> > +
> > +	/**
> > +	 * @range_alloc: Allocate a GPU SVM range (optional)
> > +	 * @gpusvm: Pointer to the GPU SVM
> > +	 *
> > +	 * This function shall allocate a GPU SVM range.
> > +	 *
> > +	 * Returns:
> > +	 * Pointer to the allocated GPU SVM range on success, NULL on failure.
> > +	 */
> > +	struct drm_gpusvm_range *(*range_alloc)(struct drm_gpusvm *gpusvm);
> > +
> > +	/**
> > +	 * @range_free: Free a GPU SVM range (optional)
> > +	 * @range: Pointer to the GPU SVM range to be freed
> > +	 *
> > +	 * This function shall free a GPU SVM range.
> > +	 */
> > +	void (*range_free)(struct drm_gpusvm_range *range);
> > +
> > +	/**
> > +	 * @vram_release: Release VRAM allocation (optional)
> > +	 * @vram_allocation: Driver-private pointer to the VRAM allocation
> > +	 *
> > +	 * This function shall release the VRAM allocation and is expected to
> > +	 * drop a reference to the VRAM allocation.
> > +	 */
> > +	void (*vram_release)(void *vram_allocation);
> > +
> > +	/**
> > +	 * @populate_vram_pfn: Populate VRAM PFN (required for migration)
> > +	 * @gpusvm: Pointer to the GPU SVM
> > +	 * @vram_allocation: Driver-private pointer to the VRAM allocation
> > +	 * @npages: Number of pages to populate
> > +	 * @pfn: Array of page frame numbers to populate
> > +	 *
> > +	 * This function shall populate VRAM page frame numbers (PFN).
> > +	 *
> > +	 * Returns:
> > +	 * 0 on success, a negative error code on failure.
> > +	 */
> > +	int (*populate_vram_pfn)(struct drm_gpusvm *gpusvm,
> > +				 void *vram_allocation,
> > +				 unsigned long npages,
> > +				 unsigned long *pfn);
> > +
> > +	/**
> > +	 * @copy_to_vram: Copy to VRAM (required for migration)
> > +	 * @gpusvm: Pointer to the GPU SVM
> > +	 * @pages: Pointer to array of VRAM pages (destination)
> > +	 * @dma_addr: Pointer to array of DMA addresses (source)
> > +	 * @npages: Number of pages to copy
> > +	 *
> > +	 * This function shall copy pages to VRAM.
> > +	 *
> > +	 * Returns:
> > +	 * 0 on success, a negative error code on failure.
> > +	 */
> > +	int (*copy_to_vram)(struct drm_gpusvm *gpusvm,
> > +			    struct page **pages,
> > +			    dma_addr_t *dma_addr,
> > +			    unsigned long npages);
> > +
> > +	/**
> > +	 * @copy_to_sram: Copy to system RAM (required for migration)
> > +	 * @gpusvm: Pointer to the GPU SVM
> > +	 * @pages: Pointer to array of VRAM pages (source)
> > +	 * @dma_addr: Pointer to array of DMA addresses (destination)
> > +	 * @npages: Number of pages to copy
> > +	 *
> > +	 * This function shall copy pages to system RAM.
> > +	 *
> > +	 * Returns:
> > +	 * 0 on success, a negative error code on failure.
> > +	 */
> > +	int (*copy_to_sram)(struct drm_gpusvm *gpusvm,
> > +			    struct page **pages,
> > +			    dma_addr_t *dma_addr,
> > +			    unsigned long npages);
> > +
> > +	/**
> > +	 * @invalidate: Invalidate GPU SVM notifier (required)
> > +	 * @gpusvm: Pointer to the GPU SVM
> > +	 * @notifier: Pointer to the GPU SVM notifier
> > +	 * @mmu_range: Pointer to the mmu_notifier_range structure
> > +	 *
> > +	 * This function shall invalidate the GPU page tables. It can safely
> > +	 * walk the notifier range RB tree/list in this function. Called while
> > +	 * holding the notifier lock.
> > +	 */
> > +	void (*invalidate)(struct drm_gpusvm *gpusvm,
> > +			   struct drm_gpusvm_notifier *notifier,
> > +			   const struct mmu_notifier_range *mmu_range);
> > +};
> > +
> > +/**
> > + * struct drm_gpusvm_notifier - Structure representing a GPU SVM notifier
> > + *
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: MMU interval notifier
> > + * @interval: Interval for the notifier
> > + * @rb: Red-black tree node for the parent GPU SVM structure notifier tree
> > + * @root: Cached root node of the RB tree containing ranges
> > + * @range_list: List head of ranges in the same order they appear in the
> > + *              interval tree. This is useful to keep iterating over ranges
> > + *              while modifying the RB tree.
> > + * @flags.removed: Flag indicating whether the MMU interval notifier has been
> > + *                 removed
> > + *
> > + * This structure represents a GPU SVM notifier.
> > + */
> > +struct drm_gpusvm_notifier {
> > +	struct drm_gpusvm *gpusvm;
> > +	struct mmu_interval_notifier notifier;
> > +	struct {
> > +		u64 start;
> > +		u64 end;
> > +	} interval;
> > +	struct {
> > +		struct rb_node node;
> > +		struct list_head entry;
> > +		u64 __subtree_last;
> > +	} rb;
> > +	struct rb_root_cached root;
> > +	struct list_head range_list;
> > +	struct {
> > +		u32 removed : 1;
> > +	} flags;
> > +};
> > +
> > +/**
> > + * struct drm_gpusvm_range - Structure representing a GPU SVM range
> > + *
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier
> > + * @refcount: Reference count for the range
> > + * @rb: Red-black tree node for the parent GPU SVM notifier structure range tree
> > + * @va: Virtual address range
> > + * @notifier_seq: Notifier sequence number of the range's pages
> > + * @pages: Pointer to the array of pages (if backing store is in VRAM)
> > + * @dma_addr: DMA address array (if backing store is SRAM and DMA mapped)
> > + * @vram_allocation: Driver-private pointer to the VRAM allocation
> > + * @order: Order of dma mapping. i.e. PAGE_SIZE << order is mapping size
> > + * @flags.migrate_vram: Flag indicating whether the range can be migrated to VRAM
> > + * @flags.unmapped: Flag indicating if the range has been unmapped
> > + * @flags.partial_unmap: Flag indicating if the range has been partially unmapped
> > + * @flags.has_vram_pages: Flag indicating if the range has vram pages
> > + * @flags.has_dma_mapping: Flag indicating if the range has a DMA mapping
> > + * @flags.kfree_mapping: Flag indicating @dma_addr is a compact allocation based
> > + *                       on @order which releases via kfree
> > + *
> > + * This structure represents a GPU SVM range used for tracking memory ranges
> > + * mapped in a DRM device.
> > + */
> > +struct drm_gpusvm_range {
> > +	struct drm_gpusvm *gpusvm;
> > +	struct drm_gpusvm_notifier *notifier;
> > +	struct kref refcount;
> > +	struct {
> > +		struct rb_node node;
> > +		struct list_head entry;
> > +		u64 __subtree_last;
> > +	} rb;
> > +	struct {
> > +		u64 start;
> > +		u64 end;
> > +	} va;
> > +	unsigned long notifier_seq;
> > +	union {
> > +		struct page **pages;
> > +		dma_addr_t *dma_addr;
> > +	};
> > +	void *vram_allocation;
> > +	u16 order;
> > +	struct {
> > +		/* All flags below must be set upon creation */
> > +		u16 migrate_vram : 1;
> > +		/* All flags below must be set / cleared under notifier lock */
> > +		u16 unmapped : 1;
> > +		u16 partial_unmap : 1;
> > +		u16 has_vram_pages : 1;
> > +		u16 has_dma_mapping : 1;
> > +		u16 kfree_mapping : 1;
> > +	} flags;
> > +};
> > +
> > +/**
> > + * struct drm_gpusvm - GPU SVM structure
> > + *
> > + * @name: Name of the GPU SVM
> > + * @drm: Pointer to the DRM device structure
> > + * @mm: Pointer to the mm_struct for the address space
> > + * @device_private_page_owner: Device private pages owner
> > + * @mm_start: Start address of GPU SVM
> > + * @mm_range: Range of the GPU SVM
> > + * @notifier_size: Size of individual notifiers
> > + * @ops: Pointer to the operations structure for GPU SVM
> > + * @chunk_sizes: Pointer to the array of chunk sizes used in range allocation.
> > + *               Entries should be powers of 2 in descending order.
> > + * @num_chunks: Number of chunks
> > + * @notifier_lock: Read-write semaphore for protecting notifier operations
> > + * @zdd_wq: Workqueue for deferred work on zdd destruction
> > + * @root: Cached root node of the Red-Black tree containing GPU SVM notifiers
> > + * @notifier_list: list head containing notifiers in the same order they
> > + *                 appear in interval tree. This is useful to keep iterating
> > + *                 notifiers while doing modifications to RB tree.
> > + *
> > + * This structure represents a GPU SVM (Shared Virtual Memory) used for tracking
> > + * memory ranges mapped in a DRM (Direct Rendering Manager) device.
> > + *
> > + * No reference counting is provided, as this is expected to be embedded in the
> > + * driver VM structure along with the struct drm_gpuvm, which handles reference
> > + * counting.
> > + */
> > +struct drm_gpusvm {
> > +	const char *name;
> > +	struct drm_device *drm;
> > +	struct mm_struct *mm;
> > +	void *device_private_page_owner;
> > +	u64 mm_start;
> > +	u64 mm_range;
> > +	u64 notifier_size;
> > +	const struct drm_gpusvm_ops *ops;
> > +	const u64 *chunk_sizes;
> > +	int num_chunks;
> > +	struct rw_semaphore notifier_lock;
> > +	struct workqueue_struct *zdd_wq;
> > +	struct rb_root_cached root;
> > +	struct list_head notifier_list;
> > +};
> > +
> > +/**
> > + * struct drm_gpusvm_ctx - DRM GPU SVM context
> > + *
> > + * @mmap_locked: mmap lock is locked
> > + * @trylock_mmap: trylock mmap lock, used to avoid locking inversions
> > + *                (e.g. dma-resv -> mmap lock)
> > + * @in_notifier: entering from a MMU notifier
> > + * @read_only: operating on read-only memory
> > + * @vram_possible: possible to use VRAM
> > + * @prefault: prefault pages
> > + *
> > + * Context that DRM GPU SVM is operating in (i.e. user arguments).
> > + */
> > +struct drm_gpusvm_ctx {
> > +	u32 mmap_locked :1;
> > +	u32 trylock_mmap :1;
> > +	u32 in_notifier :1;
> > +	u32 read_only :1;
> > +	u32 vram_possible :1;
> > +	u32 prefault :1;
> > +};
> > +
> > +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> > +		    const char *name, struct drm_device *drm,
> > +		    struct mm_struct *mm, void *device_private_page_owner,
> > +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> > +		    const struct drm_gpusvm_ops *ops,
> > +		    const u64 *chunk_sizes, int num_chunks);
> > +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm);
> > +void drm_gpusvm_free(struct drm_gpusvm *gpusvm);
> > +
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64 fault_addr,
> > +				u64 gpuva_start, u64 gpuva_end,
> > +				const struct drm_gpusvm_ctx *ctx);
> > +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> > +			     struct drm_gpusvm_range *range);
> > +
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_get(struct drm_gpusvm_range *range);
> > +void drm_gpusvm_range_put(struct drm_gpusvm_range *range);
> > +
> > +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> > +				  struct drm_gpusvm_range *range);
> > +
> > +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       const struct drm_gpusvm_ctx *ctx);
> > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > +				  struct drm_gpusvm_range *range,
> > +				  const struct drm_gpusvm_ctx *ctx);
> > +
> > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       void *vram_allocation,
> > +			       const struct drm_gpusvm_ctx *ctx);
> > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       const struct drm_gpusvm_ctx *ctx);
> > +
> > +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void);
> > +
> > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start, u64 end);
> > +
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64 start, u64 end);
> > +
> > +/**
> > + * drm_gpusvm_notifier_lock - Lock GPU SVM notifier
> > + * @gpusvm__: Pointer to the GPU SVM structure.
> > + *
> > + * Abstract client usage GPU SVM notifier lock, take lock
> > + */
> > +#define drm_gpusvm_notifier_lock(gpusvm__)	\
> > +	down_read(&(gpusvm__)->notifier_lock)
> > +
> > +/**
> > + * drm_gpusvm_notifier_unlock - Unlock GPU SVM notifier
> > + * @gpusvm__: Pointer to the GPU SVM structure.
> > + *
> > + * Abstract client usage GPU SVM notifier lock, drop lock
> > + */
> > +#define drm_gpusvm_notifier_unlock(gpusvm__)	\
> > +	up_read(&(gpusvm__)->notifier_lock)
> > +
> > +/**
> > + * __drm_gpusvm_range_next - Get the next GPU SVM range in the list
> > + * @range: a pointer to the current GPU SVM range
> > + *
> > + * Return: A pointer to the next drm_gpusvm_range if available, or NULL if the
> > + *         current range is the last one or if the input range is NULL.
> > + */
> > +static inline struct drm_gpusvm_range *
> > +__drm_gpusvm_range_next(struct drm_gpusvm_range *range)
> > +{
> > +	if (range && !list_is_last(&range->rb.entry,
> > +				   &range->notifier->range_list))
> > +		return list_next_entry(range, rb.entry);
> > +
> > +	return NULL;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_for_each_range - Iterate over GPU SVM ranges in a notifier
> > + * @range__: Iterator variable for the ranges. If set, it indicates the start of
> > + *	     the iterator. If NULL, call drm_gpusvm_range_find() to get the range.
> > + * @notifier__: Pointer to the GPU SVM notifier
> > + * @start__: Start address of the range
> > + * @end__: End address of the range
> > + *
> > + * This macro is used to iterate over GPU SVM ranges in a notifier. It is safe
> > + * to use while holding the driver SVM lock or the notifier lock.
> > + */
> > +#define drm_gpusvm_for_each_range(range__, notifier__, start__, end__)	\
> > +	for ((range__) = (range__) ?:					\
> > +	     drm_gpusvm_range_find((notifier__), (start__), (end__));	\
> > +	     (range__) && (range__->va.start < (end__));		\
> > +	     (range__) = __drm_gpusvm_range_next(range__))
> > +
> > +/**
> > + * drm_gpusvm_range_set_unmapped - Mark a GPU SVM range as unmapped
> > + * @range: Pointer to the GPU SVM range structure.
> > + * @mmu_range: Pointer to the MMU notifier range structure.
> > + *
> > + * This function marks a GPU SVM range as unmapped and sets the partial_unmap flag
> > + * if the range partially falls within the provided MMU notifier range.
> > + */
> > +static inline void
> > +drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range *range,
> > +			      const struct mmu_notifier_range *mmu_range)
> > +{
> > +	lockdep_assert_held_write(&range->gpusvm->notifier_lock);
> > +
> > +	range->flags.unmapped = true;
> > +	if (range->va.start < mmu_range->start ||
> > +	    range->va.end > mmu_range->end)
> > +		range->flags.partial_unmap = true;
> > +}
> > +
> > +#endif /* __DRM_GPUSVM_H__ */
> > -- 
> > 2.34.1
> > 
> 
> -- 
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
Matthew Brost Aug. 29, 2024, 5:45 p.m. UTC | #12
On Thu, Aug 29, 2024 at 11:16:49AM +0200, Thomas Hellström wrote:
> Hi, Matt. 
> 
> Some initial design comments / questions:
> 
> On Tue, 2024-08-27 at 19:48 -0700, Matthew Brost wrote:
> > This patch introduces support for GPU Shared Virtual Memory (SVM) in
> > the
> > Direct Rendering Manager (DRM) subsystem. SVM allows for seamless
> > sharing of memory between the CPU and GPU, enhancing performance and
> > flexibility in GPU computing tasks.
> > 
> > The patch adds the necessary infrastructure for SVM, including data
> > structures and functions for managing SVM ranges and notifiers. It
> > also
> > provides mechanisms for allocating, deallocating, and migrating
> > memory
> > regions between system RAM and GPU VRAM.
> > 
> > This mid-layer is largely inspired by GPUVM.
> > 
> > Cc: Dave Airlie <airlied@redhat.com>
> > Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > Cc: Christian König <christian.koenig@amd.com>
> > Cc: <dri-devel@lists.freedesktop.org>
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> >  drivers/gpu/drm/xe/Makefile     |    3 +-
> >  drivers/gpu/drm/xe/drm_gpusvm.c | 2174
> > +++++++++++++++++++++++++++++++
> >  drivers/gpu/drm/xe/drm_gpusvm.h |  415 ++++++
> >  3 files changed, 2591 insertions(+), 1 deletion(-)
> >  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.c
> >  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.h
> > 
> > diff --git a/drivers/gpu/drm/xe/Makefile
> > b/drivers/gpu/drm/xe/Makefile
> > index b9670ae09a9e..b8fc2ee58f1a 100644
> > --- a/drivers/gpu/drm/xe/Makefile
> > +++ b/drivers/gpu/drm/xe/Makefile
> > @@ -25,7 +25,8 @@ $(obj)/generated/%_wa_oob.c
> > $(obj)/generated/%_wa_oob.h: $(obj)/xe_gen_wa_oob \
> >  
> >  # core driver code
> >  
> > -xe-y += xe_bb.o \
> > +xe-y += drm_gpusvm.o \
> > +	xe_bb.o \
> >  	xe_bo.o \
> >  	xe_bo_evict.o \
> >  	xe_devcoredump.o \
> > diff --git a/drivers/gpu/drm/xe/drm_gpusvm.c
> > b/drivers/gpu/drm/xe/drm_gpusvm.c
> > new file mode 100644
> > index 000000000000..fc1e44e6ae72
> > --- /dev/null
> > +++ b/drivers/gpu/drm/xe/drm_gpusvm.c
> > @@ -0,0 +1,2174 @@
> > +// SPDX-License-Identifier: MIT
> > +/*
> > + * Copyright © 2024 Intel Corporation
> > + *
> > + * Authors:
> > + *     Matthew Brost <matthew.brost@intel.com>
> > + */
> > +
> > +#include <linux/dma-mapping.h>
> > +#include <linux/interval_tree_generic.h>
> > +#include <linux/hmm.h>
> > +#include <linux/memremap.h>
> > +#include <linux/migrate.h>
> > +#include <linux/mm_types.h>
> > +#include <linux/pagemap.h>
> > +#include <linux/slab.h>
> > +
> > +#include <drm/drm_device.h>
> > +#include "drm_gpusvm.h"
> > +
> > +/**
> > + * DOC: Overview
> > + *
> > + * GPU Shared Virtual Memory (GPU SVM) layer for the Direct
> > Rendering Manager (DRM)
> > + *
> > + * The GPU SVM layer is a component of the DRM framework designed to
> > manage shared
> > + * virtual memory between the CPU and GPU. It enables efficient data
> > exchange and
> > + * processing for GPU-accelerated applications by allowing memory
> > sharing and
> > + * synchronization between the CPU's and GPU's virtual address
> > spaces.
> > + *
> > + * Key GPU SVM Components:
> > + * - Notifiers: Notifiers: Used for tracking memory intervals and
> > notifying the
> > + *		GPU of changes, notifiers are sized based on a GPU
> > SVM
> > + *		initialization parameter, with a recommendation of
> > 512M or
> > + *		larger. They maintain a Red-BlacK tree and a list of
> > ranges that
> > + *		fall within the notifier interval. Notifiers are
> > tracked within
> > + *		a GPU SVM Red-BlacK tree and list and are
> > dynamically inserted
> > + *		or removed as ranges within the interval are created
> > or
> > + *		destroyed.
> 
> What is the benefit of this extra layer compared to direct insertion of
> ranges using mmu_interval_notifier_insert?
> 
> IIRC the argument made previously about having wide notifiers was that
> the rb tree lookups inside the core were costly and if there were only
> a few, then the rb tree lookups within a notifier range could be
> replaced with the page-table radix-tree-like lookup, so each lookup
> complexity would be O(log(n_notifiers) + page_table_depth).
> 
> But now we have first an rb-tree lookup in the core and then an rb-tree
> lookup within each notifier yielding O(log(n_ranges))
> 
> I can see a small benefit in that inserting directly into the core rb-
> tree will block pending ongoing invalidations, but at a cost of an
> extra multiplexing layer.
> 

So when the notifier is triggered, the search covers a smaller range. In
a perfect world I'd eventually like to drop the SVM range completely.
There are a lot of changes required in Xe to make that possible, and I'm
not entirely convinced it is possible or that the ROI is worth it
(additional complexity vs. perf benefit). For now, this was a relatively
simple way to get SVM working (it mirrors both AMD's and Nvidia's
implementations wrt having a range concept) but it is also flexible in
the sense that the notifier size can easily be tweaked via a modparam
[1], following Jason's suggestion of larger notifiers.

[1] https://patchwork.freedesktop.org/patch/611007/?series=137870&rev=1
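
To make that concrete, here is a rough sketch of the driver-side glue
(the driver_* names, the modparam, and the VA range are made up for
illustration and are not part of this patch):

	/* Hypothetical driver init, assuming the drm_gpusvm_init()
	 * signature from this patch. */
	static unsigned long notifier_size = SZ_512M;
	module_param(notifier_size, ulong, 0600);

	/* Powers of 2, descending, last entry SZ_4K as required */
	static const u64 driver_chunk_sizes[] = { SZ_2M, SZ_64K, SZ_4K };

	static int driver_svm_init(struct driver_vm *vm)
	{
		return drm_gpusvm_init(&vm->svm, "driver-svm", vm->drm,
				       current->mm, vm,
				       0, TASK_SIZE, notifier_size,
				       &driver_gpusvm_ops,
				       driver_chunk_sizes,
				       ARRAY_SIZE(driver_chunk_sizes));
	}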

> > + * - Ranges: Represent memory ranges mapped in a DRM device and
> > managed
> > + *	     by GPU SVM. They are sized based on an array of chunk
> > sizes, which
> > + *	     is a GPU SVM initialization parameter, and the CPU
> > address space.
> > + *	     Upon GPU fault, the largest aligned chunk that fits
> > within the
> > + *	     faulting CPU address space is chosen for the range
> > size. Ranges are
> > + *	     expected to be dynamically allocated on GPU fault and
> > removed on an
> > + *	     MMU notifier UNMAP event. As mentioned above, ranges
> > are tracked in
> > + *	     a notifier's Red-Black tree.
> 
> How do ranges and chunks map to
>  
> a) Prefaulting granularity
> b) Migration granularity?
> 
> > + * - Operations: Define the interface for driver-specific SVM
> > operations such as
> > + *		 allocation, page collection, migration,
> > invalidations, and VRAM
> > + *		 release.
> > + *
> > + * This layer provides interfaces for allocating, mapping,
> > migrating, and
> > + * releasing memory ranges between the CPU and GPU. It handles all
> > core memory
> > + * management interactions (DMA mapping, HMM, and migration) and
> > provides
> > + * driver-specific virtual functions (vfuncs). This infrastructure
> > is sufficient
> > + * to build the expected driver components for an SVM implementation
> > as detailed
> > + * below.
> > + *
> > + * Expected Driver Components:
> > + * - GPU page fault handler: Used to create ranges and notifiers
> > based on the
> > + *			     fault address, optionally migrate the
> > range to
> > + *			     VRAM, and create GPU bindings.
> > + * - Garbage collector: Used to destroy GPU bindings for ranges.
> > Ranges are
> > + *			expected to be added to the garbage
> > collector upon
> > + *			MMU_NOTIFY_UNMAP event.
> > + */
> > +
> > +/**
> > + * DOC: Locking
> > + *
> > + * GPU SVM handles locking for core MM interactions, i.e., it
> > locks/unlocks the
> > + * mmap lock as needed. Alternatively, if the driver prefers to
> > handle the mmap
> > + * lock itself, a 'locked' argument is provided to the functions
> > that require
> > + * the mmap lock. This option may be useful for drivers that need to
> > call into
> > + * GPU SVM while also holding a dma-resv lock, thus preventing
> > locking
> > + * inversions between the mmap and dma-resv locks.
> > + *
> > + * GPU SVM introduces a global notifier lock, which safeguards the
> > notifier's
> > + * range RB tree and list, as well as the range's DMA mappings and
> > sequence
> > + * number. GPU SVM manages all necessary locking and unlocking
> > operations,
> > + * except for the recheck of the range's sequence number
> > + * (mmu_interval_read_retry) when the driver is committing GPU
> > bindings. This
> > + * lock corresponds to the 'driver->update' lock mentioned in the
> > HMM
> > + * documentation (TODO: Link). Future revisions may transition from
> > a GPU SVM
> > + * global lock to a per-notifier lock if finer-grained locking is
> > deemed
> > + * necessary.
> > + *
> > + * In addition to the locking mentioned above, the driver should
> > implement a
> > + * lock to safeguard core GPU SVM function calls that modify state,
> > such as
> > + * drm_gpusvm_range_find_or_insert and drm_gpusvm_range_remove.
> > Alternatively,
> > + * these core functions can be called within a single kernel thread,
> > for
> > + * instance, using an ordered work queue. This lock is denoted as
> > + * 'driver_svm_lock' in code examples.
> > + */
> > +
> > +/**
> > + * DOC: Migration
> > + *
> > + * The migration support is quite simple, allowing migration between
> > SRAM and
> > + * VRAM at the range granularity. For example, GPU SVM currently
> > does not
> > + * support mixing SRAM and VRAM pages within a range. This means
> > that upon GPU
> > + * fault, the entire range can be migrated to VRAM, and upon CPU
> > fault, the
> > + * entire range is migrated to SRAM.
> > + *
> > + * The reasoning for only supporting range granularity is as
> > follows: it
> > + * simplifies the implementation, and range sizes are driver-defined
> > and should
> > + * be relatively small.
> > + */
> > +
> > +/**
> > + * DOC: Partial Unmapping of Ranges
> > + *
> > + * Partial unmapping of ranges (e.g., 1M out of 2M is unmapped by
> > CPU resulting
> > + * in MMU_NOTIFY_UNMAP event) presents several challenges, with the
> > main one
> > + * being that a subset of the range still has CPU and GPU mappings.
> > If the
> > + * backing store for the range is in VRAM, a subset of the backing
> > store has
> > + * references. One option would be to split the range and VRAM
> > backing store,
> > + * but the implementation for this would be quite complicated. Given
> > that
> > + * partial unmappings are rare and driver-defined range sizes are
> > relatively
> > + * small, GPU SVM does not support splitting of ranges.
> > + *
> > + * With no support for range splitting, upon partial unmapping of a
> > range, the
> > + * driver is expected to invalidate and destroy the entire range. If
> > the range
> > + * has VRAM as its backing, the driver is also expected to migrate
> > any remaining
> > + * pages back to SRAM.
> 
> So what happens if we get a one-page invalidation, say protection
> change event, or NUMA accounting event, in the middle of a range? Can
> we unmap just that single gpu pte covering that range, that is, how do
> the ranges map to invalidation granularity? Does this differ between
> igfx and dgfx?

Well, the idea of chunks is that a range should be 1 GPU page (the chunk
array in Xe is 4k, 64k, and 2M). The design is flexible enough that this
doesn't have to be true, but it is optimized around the assumption that
each range is most likely 1 GPU page. If that isn't true, then all GPU
pages in the range are invalidated, which isn't ideal but keeps things
simple, which IMO far outweighs the potential benefits. In theory a
driver could implement splitting / partial invalidations too with a
couple of updates to GPUSVM, but that would likely largely be a driver
implementation rather than GPUSVM.

No difference between igfx and dgfx.
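
To illustrate the "1 GPU page per range" point, here is a stripped-down
sketch of the chunk selection done in drm_gpusvm_range_chunk_size()
(the notifier / GPUVA bounds and the check_pages logic from the real
code are dropped, so treat this as an illustration only):

	static const u64 chunk_sizes[] = { SZ_2M, SZ_64K, SZ_4K };

	static u64 pick_chunk(u64 fault_addr, u64 vma_start, u64 vma_end)
	{
		int i;

		for (i = 0; i < ARRAY_SIZE(chunk_sizes); ++i) {
			u64 start = ALIGN_DOWN(fault_addr, chunk_sizes[i]);
			u64 end = ALIGN(fault_addr + 1, chunk_sizes[i]);

			/* Largest aligned chunk fully inside the CPU VMA
			 * becomes the range, i.e. (usually) 1 GPU page. */
			if (start >= vma_start && end <= vma_end)
				return end - start;
		}

		return LONG_MAX;	/* no chunk fits */
	}

So a 2M-aligned anonymous mapping ends up as a single 2M range backed by
one 2M GPU page, and an invalidation anywhere inside it tears down just
that one PTE; it is only when a range ends up covering multiple smaller
GPU pages that invalidating the whole range is wasteful.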

You bring up a good point about protection changes; I likely haven't
fully gotten that part of the implementation correct either. I can add
this to my TODO list and also update my IGTs to do things like this.

Matt

> 
> Thanks,
> Thomas
> 
> 
> 
> 
> > + */
> > +
> > +/**
> > + * DOC: Examples
> > + *
> > + * This section provides two examples of how to build the expected
> > driver
> > + * components: the GPU page fault handler and the garbage collector.
> > A third
> > + * example demonstrates a sample invalidation driver vfunc.
> > + *
> > + * The generic code provided does not include logic for complex
> > migration
> > + * policies, optimized invalidations, or other potentially required
> > driver
> > + * locking (e.g., DMA-resv locks).
> > + *
> > + * 1) GPU page fault handler
> > + *
> > + *	int driver_bind_range(struct drm_gpusvm *gpusvm, struct
> > drm_gpusvm_range *range)
> > + *	{
> > + *		int err = 0;
> > + *
> > + *		driver_alloc_and_setup_memory_for_bind(gpusvm,
> > range);
> > + *
> > + *		drm_gpusvm_notifier_lock(gpusvm);
> > + *		if (drm_gpusvm_range_pages_valid(range))
> > + *			driver_commit_bind(gpusvm, range);
> > + *		else
> > + *			err = -EAGAIN;
> > + *		drm_gpusvm_notifier_unlock(gpusvm);
> > + *
> > + *		return err;
> > + *	}
> > + *
> > + *	int driver_gpu_fault(struct drm_gpusvm *gpusvm, u64
> > fault_addr,
> > + *			     u64 gpuva_start, u64 gpuva_end)
> > + *	{
> > + *		struct drm_gpusvm_ctx ctx = {};
> > + *		int err;
> > + *
> > + *		driver_svm_lock();
> > + *	retry:
> > + *		// Always process UNMAPs first so view of GPU SVM
> > ranges is current
> > + *		driver_garbage_collector(gpusvm);
> > + *
> > + *		range = drm_gpusvm_range_find_or_insert(gpusvm,
> > fault_addr,
> > + *							gpuva_start,
> > gpuva_end,
> > + *						        &ctx);
> > + *		if (IS_ERR(range)) {
> > + *			err = PTR_ERR(range);
> > + *			goto unlock;
> > + *		}
> > + *
> > + *		if (driver_migration_policy(range)) {
> > + *			bo = driver_alloc_bo();
> > + *			err = drm_gpusvm_migrate_to_vram(gpusvm,
> > range, bo, &ctx);
> > + *			if (err)	// CPU mappings may have
> > changed
> > + *				goto retry;
> > + *		}
> > + *
> > + *		err = drm_gpusvm_range_get_pages(gpusvm, range,
> > &ctx);
> > + *		if (err == -EFAULT || err == -EPERM)	// CPU
> > mappings changed
> > + *			goto retry;
> > + *		else if (err)
> > + *			goto unlock;
> > + *
> > + *		err = driver_bind_range(gpusvm, range);
> > + *		if (err == -EAGAIN)	// CPU mappings changed
> > + *			goto retry
> > + *
> > + *	unlock:
> > + *		driver_svm_unlock();
> > + *		return err;
> > + *	}
> > + *
> > + * 2) Garbage Collector.
> > + *
> > + *	void __driver_garbage_collector(struct drm_gpusvm *gpusvm,
> > + *					struct drm_gpusvm_range
> > *range)
> > + *	{
> > + *		struct drm_gpusvm_ctx ctx = {};
> > + *
> > + *		assert_driver_svm_locked(gpusvm);
> > + *
> > + *		// Partial unmap, migrate any remaining VRAM pages
> > back to SRAM
> > + *		if (range->flags.partial_unmap)
> > + *			drm_gpusvm_migrate_to_sram(gpusvm, range,
> > &ctx);
> > + *
> > + *		driver_unbind_range(range);
> > + *		drm_gpusvm_range_remove(gpusvm, range);
> > + *	}
> > + *
> > + *	void driver_garbage_collector(struct drm_gpusvm *gpusvm)
> > + *	{
> > + *		assert_driver_svm_locked(gpusvm);
> > + *
> > + *		for_each_range_in_garbage_collector(gpusvm, range)
> > + *			__driver_garbage_collector(gpusvm, range);
> > + *	}
> > + *
> > + * 3) Invalidation driver vfunc.
> > + *
> > + *	void driver_invalidation(struct drm_gpusvm *gpusvm,
> > + *				 struct drm_gpusvm_notifier
> > *notifier,
> > + *				 const struct mmu_notifier_range
> > *mmu_range)
> > + *	{
> > + *		struct drm_gpusvm_ctx ctx = { .in_notifier = true,
> > };
> > + *		struct drm_gpusvm_range *range = NULL;
> > + *
> > + *		driver_invalidate_device_tlb(gpusvm, mmu_range-
> > >start, mmu_range->end);
> > + *
> > + *		drm_gpusvm_for_each_range(range, notifier,
> > mmu_range->start,
> > + *					  mmu_range->end) {
> > + *			drm_gpusvm_range_unmap_pages(gpusvm, range,
> > &ctx);
> > + *
> > + *			if (mmu_range->event != MMU_NOTIFY_UNMAP)
> > + *				continue;
> > + *
> > + *			drm_gpusvm_range_set_unmapped(range,
> > mmu_range);
> > + *			driver_garbage_collector_add(gpusvm, range);
> > + *		}
> > + *	}
> > + */
> > +
> > +#define DRM_GPUSVM_RANGE_START(_range)	((_range)->va.start)
> > +#define DRM_GPUSVM_RANGE_END(_range)	((_range)->va.end - 1)
> > +INTERVAL_TREE_DEFINE(struct drm_gpusvm_range, rb.node, u64,
> > rb.__subtree_last,
> > +		     DRM_GPUSVM_RANGE_START, DRM_GPUSVM_RANGE_END,
> > +		     static __maybe_unused, range);
> > +
> > +#define DRM_GPUSVM_NOTIFIER_START(_notifier)	((_notifier)-
> > >interval.start)
> > +#define DRM_GPUSVM_NOTIFIER_END(_notifier)	((_notifier)-
> > >interval.end - 1)
> > +INTERVAL_TREE_DEFINE(struct drm_gpusvm_notifier, rb.node, u64,
> > +		     rb.__subtree_last, DRM_GPUSVM_NOTIFIER_START,
> > +		     DRM_GPUSVM_NOTIFIER_END, static __maybe_unused,
> > notifier);
> > +
> > +/**
> > + * npages_in_range() - Calculate the number of pages in a given
> > range
> > + * @start__: The start address of the range
> > + * @end__: The end address of the range
> > + *
> > + * This macro calculates the number of pages in a given memory
> > range,
> > + * specified by the start and end addresses. It divides the
> > difference
> > + * between the end and start addresses by the page size (PAGE_SIZE)
> > to
> > + * determine the number of pages in the range.
> > + *
> > + * Return: The number of pages in the specified range.
> > + */
> > +#define npages_in_range(start__, end__)	\
> > +	(((end__) - (start__)) >> PAGE_SHIFT)
> > +
> > +/**
> > + * struct drm_gpusvm_zdd - GPU SVM zone device data
> > + *
> > + * @refcount: Reference count for the zdd
> > + * @destroy_work: Work structure for asynchronous zdd destruction
> > + * @range: Pointer to the GPU SVM range
> > + * @vram_allocation: Driver-private pointer to the VRAM allocation
> > + *
> > + * This structure serves as a generic wrapper installed in
> > + * page->zone_device_data. It provides infrastructure for looking up
> > a range
> > + * upon CPU page fault and asynchronously releasing VRAM once the
> > CPU has no
> > + * page references. Asynchronous release is useful because CPU page
> > references
> > + * can be dropped in IRQ contexts, while releasing VRAM likely
> > requires sleeping
> > + * locks.
> > + */
> > +struct drm_gpusvm_zdd {
> > +	struct kref refcount;
> > +	struct work_struct destroy_work;
> > +	struct drm_gpusvm_range *range;
> > +	void *vram_allocation;
> > +};
> > +
> > +/**
> > + * drm_gpusvm_zdd_destroy_work_func - Work function for destroying a
> > zdd
> > + * @w: Pointer to the work_struct
> > + *
> > + * This function releases VRAM, puts GPU SVM range, and frees zdd.
> > + */
> > +static void drm_gpusvm_zdd_destroy_work_func(struct work_struct *w)
> > +{
> > +	struct drm_gpusvm_zdd *zdd =
> > +		container_of(w, struct drm_gpusvm_zdd,
> > destroy_work);
> > +	struct drm_gpusvm_range *range = zdd->range;
> > +	struct drm_gpusvm *gpusvm = range->gpusvm;
> > +
> > +	if (gpusvm->ops->vram_release && zdd->vram_allocation)
> > +		gpusvm->ops->vram_release(zdd->vram_allocation);
> > +	drm_gpusvm_range_put(range);
> > +	kfree(zdd);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_zdd_alloc - Allocate a zdd structure.
> > + * @range: Pointer to the GPU SVM range.
> > + *
> > + * This function allocates and initializes a new zdd structure. It
> > sets up the
> > + * reference count, initializes the destroy work, and links the
> > provided GPU SVM
> > + * range.
> > + *
> > + * Returns:
> > + * Pointer to the allocated zdd on success, ERR_PTR() on failure.
> > + */
> > +static struct drm_gpusvm_zdd *
> > +drm_gpusvm_zdd_alloc(struct drm_gpusvm_range *range)
> > +{
> > +	struct drm_gpusvm_zdd *zdd;
> > +
> > +	zdd = kmalloc(sizeof(*zdd), GFP_KERNEL);
> > +	if (!zdd)
> > +		return NULL;
> > +
> > +	kref_init(&zdd->refcount);
> > +	INIT_WORK(&zdd->destroy_work,
> > drm_gpusvm_zdd_destroy_work_func);
> > +	zdd->range = drm_gpusvm_range_get(range);
> > +	zdd->vram_allocation = NULL;
> > +
> > +	return zdd;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_zdd_get - Get a reference to a zdd structure.
> > + * @zdd: Pointer to the zdd structure.
> > + *
> > + * This function increments the reference count of the provided zdd
> > structure.
> > + *
> > + * Returns: Pointer to the zdd structure.
> > + */
> > +static struct drm_gpusvm_zdd *drm_gpusvm_zdd_get(struct
> > drm_gpusvm_zdd *zdd)
> > +{
> > +	kref_get(&zdd->refcount);
> > +	return zdd;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_zdd_destroy - Destroy a zdd structure.
> > + * @ref: Pointer to the reference count structure.
> > + *
> > + * This function queues the destroy_work of the zdd for asynchronous
> > destruction.
> > + */
> > +static void drm_gpusvm_zdd_destroy(struct kref *ref)
> > +{
> > +	struct drm_gpusvm_zdd *zdd =
> > +		container_of(ref, struct drm_gpusvm_zdd, refcount);
> > +	struct drm_gpusvm *gpusvm = zdd->range->gpusvm;
> > +
> > +	queue_work(gpusvm->zdd_wq, &zdd->destroy_work);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_zdd_put - Put a zdd reference.
> > + * @zdd: Pointer to the zdd structure.
> > + *
> > + * This function decrements the reference count of the provided zdd
> > structure
> > + * and schedules its destruction if the count drops to zero.
> > + */
> > +static void drm_gpusvm_zdd_put(struct drm_gpusvm_zdd *zdd)
> > +{
> > +	kref_put(&zdd->refcount, drm_gpusvm_zdd_destroy);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_find - Find GPU SVM range from GPU SVM notifier
> > + * @notifier: Pointer to the GPU SVM notifier structure.
> > + * @start: Start address of the range
> > + * @end: End address of the range
> > + *
> > + * Return: A pointer to the drm_gpusvm_range if found or NULL
> > + */
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64
> > start, u64 end)
> > +{
> > +	return range_iter_first(&notifier->root, start, end - 1);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_for_each_range_safe - Safely iterate over GPU SVM
> > ranges in a notifier
> > + * @range__: Iterator variable for the ranges
> > + * @next__: Iterator variable for the ranges temporary storage
> > + * @notifier__: Pointer to the GPU SVM notifier
> > + * @start__: Start address of the range
> > + * @end__: End address of the range
> > + *
> > + * This macro is used to iterate over GPU SVM ranges in a notifier
> > while
> > + * removing ranges from it.
> > + */
> > +#define drm_gpusvm_for_each_range_safe(range__, next__, notifier__,
> > start__, end__)	\
> > +	for ((range__) = drm_gpusvm_range_find((notifier__),
> > (start__), (end__)),	\
> > +	     (next__) =
> > __drm_gpusvm_range_next(range__);				\
> > +	     (range__) && (range__->va.start <
> > (end__));				\
> > +	     (range__) = (next__), (next__) =
> > __drm_gpusvm_range_next(range__))
> > +
> > +/**
> > + * __drm_gpusvm_notifier_next - get the next drm_gpusvm_notifier in
> > the list
> > + * @notifier: a pointer to the current drm_gpusvm_notifier
> > + *
> > + * Return: A pointer to the next drm_gpusvm_notifier if available,
> > or NULL if
> > + *         the current notifier is the last one or if the input
> > notifier is
> > + *         NULL.
> > + */
> > +static struct drm_gpusvm_notifier *
> > +__drm_gpusvm_notifier_next(struct drm_gpusvm_notifier *notifier)
> > +{
> > +	if (notifier && !list_is_last(&notifier->rb.entry,
> > +				      &notifier->gpusvm-
> > >notifier_list))
> > +		return list_next_entry(notifier, rb.entry);
> > +
> > +	return NULL;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_for_each_notifier - Iterate over GPU SVM notifiers in
> > a gpusvm
> > + * @notifier__: Iterator variable for the notifiers
> > + * @gpusvm__: Pointer to the GPU SVM structure
> > + * @start__: Start address of the notifier
> > + * @end__: End address of the notifier
> > + *
> > + * This macro is used to iterate over GPU SVM notifiers in a gpusvm.
> > + */
> > +#define drm_gpusvm_for_each_notifier(notifier__, gpusvm__, start__,
> > end__)		\
> > +	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root,
> > (start__), (end__) - 1);	\
> > +	     (notifier__) && (notifier__->interval.start <
> > (end__));			\
> > +	     (notifier__) = __drm_gpusvm_notifier_next(notifier__))
> > +
> > +/**
> > + * drm_gpusvm_for_each_notifier_safe - Safely iterate over GPU SVM
> > notifiers in a gpusvm
> > + * @notifier__: Iterator variable for the notifiers
> > + * @next__: Iterator variable for the notifiers temporary storage
> > + * @gpusvm__: Pointer to the GPU SVM structure
> > + * @start__: Start address of the notifier
> > + * @end__: End address of the notifier
> > + *
> > + * This macro is used to iterate over GPU SVM notifiers in a gpusvm
> > while
> > + * removing notifiers from it.
> > + */
> > +#define drm_gpusvm_for_each_notifier_safe(notifier__, next__,
> > gpusvm__, start__, end__)	\
> > +	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root,
> > (start__), (end__) - 1),	\
> > +	     (next__) =
> > __drm_gpusvm_notifier_next(notifier__);				\
> > +	     (notifier__) && (notifier__->interval.start <
> > (end__));			\
> > +	     (notifier__) = (next__), (next__) =
> > __drm_gpusvm_notifier_next(notifier__))
> > +
> > +/**
> > + * drm_gpusvm_notifier_invalidate - Invalidate a GPU SVM notifier.
> > + * @mni: Pointer to the mmu_interval_notifier structure.
> > + * @mmu_range: Pointer to the mmu_notifier_range structure.
> > + * @cur_seq: Current sequence number.
> > + *
> > + * This function serves as a generic MMU notifier for GPU SVM. It
> > sets the MMU
> > + * notifier sequence number and calls the driver invalidate vfunc
> > under
> > + * gpusvm->notifier_lock.
> > + *
> > + * Returns:
> > + * true if the operation succeeds, false otherwise.
> > + */
> > +static bool
> > +drm_gpusvm_notifier_invalidate(struct mmu_interval_notifier *mni,
> > +			       const struct mmu_notifier_range
> > *mmu_range,
> > +			       unsigned long cur_seq)
> > +{
> > +	struct drm_gpusvm_notifier *notifier =
> > +		container_of(mni, typeof(*notifier), notifier);
> > +	struct drm_gpusvm *gpusvm = notifier->gpusvm;
> > +
> > +	if (!mmu_notifier_range_blockable(mmu_range))
> > +		return false;
> > +
> > +	down_write(&gpusvm->notifier_lock);
> > +	mmu_interval_set_seq(mni, cur_seq);
> > +	gpusvm->ops->invalidate(gpusvm, notifier, mmu_range);
> > +	up_write(&gpusvm->notifier_lock);
> > +
> > +	return true;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_notifier_ops - MMU interval notifier operations for
> > GPU SVM
> > + */
> > +static const struct mmu_interval_notifier_ops
> > drm_gpusvm_notifier_ops = {
> > +	.invalidate = drm_gpusvm_notifier_invalidate,
> > +};
> > +
> > +/**
> > + * drm_gpusvm_init - Initialize the GPU SVM.
> > + * @gpusvm: Pointer to the GPU SVM structure.
> > + * @name: Name of the GPU SVM.
> > + * @drm: Pointer to the DRM device structure.
> > + * @mm: Pointer to the mm_struct for the address space.
> > + * @device_private_page_owner: Device private pages owner.
> > + * @mm_start: Start address of GPU SVM.
> > + * @mm_range: Range of the GPU SVM.
> > + * @notifier_size: Size of individual notifiers.
> > + * @ops: Pointer to the operations structure for GPU SVM.
> > + * @chunk_sizes: Pointer to the array of chunk sizes used in range
> > allocation.
> > + *               Entries should be powers of 2 in descending order
> > with last
> > + *               entry being SZ_4K.
> > + * @num_chunks: Number of chunks.
> > + *
> > + * This function initializes the GPU SVM.
> > + *
> > + * Returns:
> > + * 0 on success, a negative error code on failure.
> > + */
> > +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> > +		    const char *name, struct drm_device *drm,
> > +		    struct mm_struct *mm, void
> > *device_private_page_owner,
> > +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> > +		    const struct drm_gpusvm_ops *ops,
> > +		    const u64 *chunk_sizes, int num_chunks)
> > +{
> > +	if (!ops->invalidate || !num_chunks)
> > +		return -EINVAL;
> > +
> > +	gpusvm->name = name;
> > +	gpusvm->drm = drm;
> > +	gpusvm->mm = mm;
> > +	gpusvm->device_private_page_owner =
> > device_private_page_owner;
> > +	gpusvm->mm_start = mm_start;
> > +	gpusvm->mm_range = mm_range;
> > +	gpusvm->notifier_size = notifier_size;
> > +	gpusvm->ops = ops;
> > +	gpusvm->chunk_sizes = chunk_sizes;
> > +	gpusvm->num_chunks = num_chunks;
> > +	gpusvm->zdd_wq = system_wq;
> > +
> > +	mmgrab(mm);
> > +	gpusvm->root = RB_ROOT_CACHED;
> > +	INIT_LIST_HEAD(&gpusvm->notifier_list);
> > +
> > +	init_rwsem(&gpusvm->notifier_lock);
> > +
> > +	fs_reclaim_acquire(GFP_KERNEL);
> > +	might_lock(&gpusvm->notifier_lock);
> > +	fs_reclaim_release(GFP_KERNEL);
> > +
> > +	return 0;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_notifier_find - Find GPU SVM notifier
> > + * @gpusvm__: Pointer to the GPU SVM structure
> > + * @fault_addr__: Fault address
> > + *
> > + * This macro finds the GPU SVM notifier associated with the fault
> > address.
> > + *
> > + * Returns:
> > + * Pointer to the GPU SVM notifier on success, NULL otherwise.
> > + */
> > +#define drm_gpusvm_notifier_find(gpusvm__, fault_addr__)	\
> > +	notifier_iter_first(&(gpusvm__)->root, (fault_addr__),	\
> > +			    (fault_addr__ + 1))
> > +
> > +/**
> > + * to_drm_gpusvm_notifier - retrieve the container struct for a
> > given rbtree node
> > + * @node__: a pointer to the rbtree node embedded within a
> > drm_gpusvm_notifier struct
> > + *
> > + * Return: A pointer to the containing drm_gpusvm_notifier
> > structure.
> > + */
> > +#define to_drm_gpusvm_notifier(__node)				\
> > +	container_of((__node), struct drm_gpusvm_notifier, rb.node)
> > +
> > +/**
> > + * drm_gpusvm_notifier_insert - Insert GPU SVM notifier
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + *
> > + * This function inserts the GPU SVM notifier into the GPU SVM RB
> > tree and list.
> > + */
> > +static void drm_gpusvm_notifier_insert(struct drm_gpusvm *gpusvm,
> > +				       struct drm_gpusvm_notifier
> > *notifier)
> > +{
> > +	struct rb_node *node;
> > +	struct list_head *head;
> > +
> > +	notifier_insert(notifier, &gpusvm->root);
> > +
> > +	node = rb_prev(&notifier->rb.node);
> > +	if (node)
> > +		head = &(to_drm_gpusvm_notifier(node))->rb.entry;
> > +	else
> > +		head = &gpusvm->notifier_list;
> > +
> > +	list_add(&notifier->rb.entry, head);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_notifier_remove - Remove GPU SVM notifier
> > + * @gpusvm__: Pointer to the GPU SVM structure
> > + * @notifier__: Pointer to the GPU SVM notifier structure
> > + *
> > + * This macro removes the GPU SVM notifier from the GPU SVM RB tree
> > and list.
> > + */
> > +#define drm_gpusvm_notifier_remove(gpusvm__, notifier__)	\
> > +	notifier_remove((notifier__), &(gpusvm__)->root);	\
> > +	list_del(&(notifier__)->rb.entry)
> > +
> > +/**
> > + * drm_gpusvm_fini - Finalize the GPU SVM.
> > + * @gpusvm: Pointer to the GPU SVM structure.
> > + *
> > + * This function finalizes the GPU SVM by cleaning up any remaining
> > ranges and
> > + * notifiers, and dropping a reference to struct MM.
> > + */
> > +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm)
> > +{
> > +	struct drm_gpusvm_notifier *notifier, *next;
> > +
> > +	drm_gpusvm_for_each_notifier_safe(notifier, next, gpusvm, 0,
> > LONG_MAX) {
> > +		struct drm_gpusvm_range *range, *__next;
> > +
> > +		/*
> > +		 * Remove notifier first to avoid racing with any
> > invalidation
> > +		 */
> > +		mmu_interval_notifier_remove(&notifier->notifier);
> > +		notifier->flags.removed = true;
> > +
> > +		drm_gpusvm_for_each_range_safe(range, __next,
> > notifier, 0,
> > +					       LONG_MAX)
> > +			drm_gpusvm_range_remove(gpusvm, range);
> > +	}
> > +
> > +	mmdrop(gpusvm->mm);
> > +	WARN_ON(!RB_EMPTY_ROOT(&gpusvm->root.rb_root));
> > +}
> > +
> > +/**
> > + * drm_gpusvm_notifier_alloc - Allocate GPU SVM notifier
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @fault_addr: Fault address
> > + *
> > + * This function allocates and initializes the GPU SVM notifier
> > structure.
> > + *
> > + * Returns:
> > + * Pointer to the allocated GPU SVM notifier on success, ERR_PTR()
> > on failure.
> > + */
> > +static struct drm_gpusvm_notifier *
> > +drm_gpusvm_notifier_alloc(struct drm_gpusvm *gpusvm, u64 fault_addr)
> > +{
> > +	struct drm_gpusvm_notifier *notifier;
> > +
> > +	if (gpusvm->ops->notifier_alloc)
> > +		notifier = gpusvm->ops->notifier_alloc();
> > +	else
> > +		notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
> > +
> > +	if (!notifier)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	notifier->gpusvm = gpusvm;
> > +	notifier->interval.start = ALIGN_DOWN(fault_addr, gpusvm-
> > >notifier_size);
> > +	notifier->interval.end = ALIGN(fault_addr + 1, gpusvm-
> > >notifier_size);
> > +	INIT_LIST_HEAD(&notifier->rb.entry);
> > +	notifier->root = RB_ROOT_CACHED;
> > +	INIT_LIST_HEAD(&notifier->range_list);
> > +
> > +	return notifier;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_notifier_free - Free GPU SVM notifier
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + *
> > + * This function frees the GPU SVM notifier structure.
> > + */
> > +static void drm_gpusvm_notifier_free(struct drm_gpusvm *gpusvm,
> > +				     struct drm_gpusvm_notifier
> > *notifier)
> > +{
> > +	WARN_ON(!RB_EMPTY_ROOT(&notifier->root.rb_root));
> > +
> > +	if (gpusvm->ops->notifier_free)
> > +		gpusvm->ops->notifier_free(notifier);
> > +	else
> > +		kfree(notifier);
> > +}
> > +
> > +/**
> > + * to_drm_gpusvm_range - retrieve the container struct for a given
> > rbtree node
> > + * @node__: a pointer to the rbtree node embedded within a
> > drm_gpusvm_range struct
> > + *
> > + * Return: A pointer to the containing drm_gpusvm_range structure.
> > + */
> > +#define to_drm_gpusvm_range(node__)	\
> > +	container_of((node__), struct drm_gpusvm_range, rb.node)
> > +
> > +/**
> > + * drm_gpusvm_range_insert - Insert GPU SVM range
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * This function inserts the GPU SVM range into the notifier RB tree
> > and list.
> > + */
> > +static void drm_gpusvm_range_insert(struct drm_gpusvm_notifier
> > *notifier,
> > +				    struct drm_gpusvm_range *range)
> > +{
> > +	struct rb_node *node;
> > +	struct list_head *head;
> > +
> > +	drm_gpusvm_notifier_lock(notifier->gpusvm);
> > +	range_insert(range, &notifier->root);
> > +
> > +	node = rb_prev(&range->rb.node);
> > +	if (node)
> > +		head = &(to_drm_gpusvm_range(node))->rb.entry;
> > +	else
> > +		head = &notifier->range_list;
> > +
> > +	list_add(&range->rb.entry, head);
> > +	drm_gpusvm_notifier_unlock(notifier->gpusvm);
> > +}
> > +
> > +/**
> > + * __drm_gpusvm_range_remove - Remove GPU SVM range
> > + * @notifier__: Pointer to the GPU SVM notifier structure
> > + * @range__: Pointer to the GPU SVM range structure
> > + *
> > + * This macro removes the GPU SVM range from the notifier RB tree
> > and list.
> > + */
> > +#define __drm_gpusvm_range_remove(notifier__, range__)		\
> > +	range_remove((range__), &(notifier__)->root);		\
> > +	list_del(&(range__)->rb.entry)
> > +
> > +/**
> > + * drm_gpusvm_range_alloc - Allocate GPU SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + * @fault_addr: Fault address
> > + * @chunk_size: Chunk size
> > + * @migrate_vram: Flag indicating whether to migrate VRAM
> > + *
> > + * This function allocates and initializes the GPU SVM range
> > structure.
> > + *
> > + * Returns:
> > + * Pointer to the allocated GPU SVM range on success, ERR_PTR() on
> > failure.
> > + */
> > +static struct drm_gpusvm_range *
> > +drm_gpusvm_range_alloc(struct drm_gpusvm *gpusvm,
> > +		       struct drm_gpusvm_notifier *notifier,
> > +		       u64 fault_addr, u64 chunk_size, bool
> > migrate_vram)
> > +{
> > +	struct drm_gpusvm_range *range;
> > +
> > +	if (gpusvm->ops->range_alloc)
> > +		range = gpusvm->ops->range_alloc(gpusvm);
> > +	else
> > +		range = kzalloc(sizeof(*range), GFP_KERNEL);
> > +
> > +	if (!range)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	kref_init(&range->refcount);
> > +	range->gpusvm = gpusvm;
> > +	range->notifier = notifier;
> > +	range->va.start = ALIGN_DOWN(fault_addr, chunk_size);
> > +	range->va.end = ALIGN(fault_addr + 1, chunk_size);
> > +	INIT_LIST_HEAD(&range->rb.entry);
> > +	range->notifier_seq = LONG_MAX;
> > +	range->flags.migrate_vram = migrate_vram ? 1 : 0;
> > +
> > +	return range;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_check_pages - Check pages
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + * @start: Start address
> > + * @end: End address
> > + *
> > + * Check if pages between start and end have been faulted in on the
> > CPU. Use to
> > + * prevent migration of pages without CPU backing store.
> > + *
> > + * Returns:
> > + * True if pages have been faulted into CPU, False otherwise
> > + */
> > +static bool drm_gpusvm_check_pages(struct drm_gpusvm *gpusvm,
> > +				   struct drm_gpusvm_notifier
> > *notifier,
> > +				   u64 start, u64 end)
> > +{
> > +	struct hmm_range hmm_range = {
> > +		.default_flags = 0,
> > +		.notifier = &notifier->notifier,
> > +		.start = start,
> > +		.end = end,
> > +		.dev_private_owner = gpusvm-
> > >device_private_page_owner,
> > +	};
> > +	unsigned long timeout =
> > +		jiffies +
> > msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> > +	unsigned long *pfns;
> > +	unsigned long npages = npages_in_range(start, end);
> > +	int err, i;
> > +
> > +	mmap_assert_locked(gpusvm->mm);
> > +
> > +	pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
> > +	if (!pfns)
> > +		return false;
> > +
> > +	hmm_range.notifier_seq = mmu_interval_read_begin(&notifier-
> > >notifier);
> > +	hmm_range.hmm_pfns = pfns;
> > +
> > +	while (true) {
> > +		err = hmm_range_fault(&hmm_range);
> > +		if (err == -EBUSY) {
> > +			if (time_after(jiffies, timeout))
> > +				break;
> > +
> > +			hmm_range.notifier_seq =
> > mmu_interval_read_begin(&notifier->notifier);
> > +			continue;
> > +		}
> > +		break;
> > +	}
> > +	if (err)
> > +		goto err_free;
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		if (!(pfns[i] & HMM_PFN_VALID)) {
> > +			err = -EFAULT;
> > +			goto err_free;
> > +		}
> > +	}
> > +
> > +err_free:
> > +	kvfree(pfns);
> > +	return err ? false : true;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_chunk_size - Determine chunk size for GPU SVM
> > range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + * @vas: Pointer to the virtual memory area structure
> > + * @fault_addr: Fault address
> > + * @gpuva_start: Start address of GPUVA which mirrors CPU
> > + * @gpuva_end: End address of GPUVA which mirrors CPU
> > + * @check_pages: Flag indicating whether to check pages
> > + *
> > + * This function determines the chunk size for the GPU SVM range
> > based on the
> > + * fault address, GPU SVM chunk sizes, existing GPU SVM ranges, and
> > the virtual
> > + * memory area boundaries.
> > + *
> > + * Returns:
> > + * Chunk size on success, LONG_MAX on failure.
> > + */
> > +static u64 drm_gpusvm_range_chunk_size(struct drm_gpusvm *gpusvm,
> > +				       struct drm_gpusvm_notifier
> > *notifier,
> > +				       struct vm_area_struct *vas,
> > +				       u64 fault_addr, u64
> > gpuva_start,
> > +				       u64 gpuva_end, bool
> > check_pages)
> > +{
> > +	u64 start, end;
> > +	int i = 0;
> > +
> > +retry:
> > +	for (; i < gpusvm->num_chunks; ++i) {
> > +		start = ALIGN_DOWN(fault_addr, gpusvm-
> > >chunk_sizes[i]);
> > +		end = ALIGN(fault_addr + 1, gpusvm->chunk_sizes[i]);
> > +
> > +		if (start >= vas->vm_start && end <= vas->vm_end &&
> > +		    start >= notifier->interval.start &&
> > +		    end <= notifier->interval.end &&
> > +		    start >= gpuva_start && end <= gpuva_end)
> > +			break;
> > +	}
> > +
> > +	if (i == gpusvm->num_chunks)
> > +		return LONG_MAX;
> > +
> > +	/*
> > +	 * If allocating more than a page, ensure not to overlap with
> > existing
> > +	 * ranges.
> > +	 */
> > +	if (end - start != SZ_4K) {
> > +		struct drm_gpusvm_range *range;
> > +
> > +		range = drm_gpusvm_range_find(notifier, start, end);
> > +		if (range) {
> > +			++i;
> > +			goto retry;
> > +		}
> > +
> > +		/*
> > +		 * XXX: Only create range on pages CPU has faulted
> > in. Without
> > +		 * this check, or prefault, on BMG
> > 'xe_exec_system_allocator --r
> > +		 * process-many-malloc' fails. In the failure case,
> > each process
> > +		 * mallocs 16k but the CPU VMA is ~128k which
> > results in 64k SVM
> > +		 * ranges. When migrating the SVM ranges, some
> > processes fail in
> > +		 * drm_gpusvm_migrate_to_vram with 'migrate.cpages
> > != npages'
> > +		 * and then upon drm_gpusvm_range_get_pages device
> > pages from
> > +		 * other processes are collected + faulted in which
> > creates all
> > +		 * sorts of problems. Unsure exactly how this
> > happening, also
> > +		 * problem goes away if 'xe_exec_system_allocator --
> > r
> > +		 * process-many-malloc' mallocs at least 64k at a
> > time.
> > +		 */
> > +		if (check_pages &&
> > +		    !drm_gpusvm_check_pages(gpusvm, notifier, start,
> > end)) {
> > +			++i;
> > +			goto retry;
> > +		}
> > +	}
> > +
> > +	return end - start;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_find_or_insert - Find or insert GPU SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @fault_addr: Fault address
> > + * @gpuva_start: Start address of GPUVA which mirrors CPU
> > + * @gpuva_end: End address of GPUVA which mirrors CPU
> > + * @ctx: GPU SVM context
> > + *
> > + * This function finds or inserts a newly allocated GPU SVM range
> > based on the
> > + * fault address. Caller must hold a lock to protect range lookup
> > and insertion.
> > + *
> > + * Returns:
> > + * Pointer to the GPU SVM range on success, ERR_PTR() on failure.
> > + */
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64
> > fault_addr,
> > +				u64 gpuva_start, u64 gpuva_end,
> > +				const struct drm_gpusvm_ctx *ctx)
> > +{
> > +	struct drm_gpusvm_notifier *notifier;
> > +	struct drm_gpusvm_range *range;
> > +	struct mm_struct *mm = gpusvm->mm;
> > +	struct vm_area_struct *vas;
> > +	bool notifier_alloc = false;
> > +	u64 chunk_size;
> > +	int err;
> > +	bool migrate_vram;
> > +
> > +	if (fault_addr < gpusvm->mm_start ||
> > +	    fault_addr > gpusvm->mm_start + gpusvm->mm_range) {
> > +		err = -EINVAL;
> > +		goto err_out;
> > +	}
> > +
> > +	if (!ctx->mmap_locked) {
> > +		if (!mmget_not_zero(mm)) {
> > +			err = -EFAULT;
> > +			goto err_out;
> > +		}
> > +		mmap_write_lock(mm);
> > +	}
> > +
> > +	mmap_assert_write_locked(mm);
> > +
> > +	notifier = drm_gpusvm_notifier_find(gpusvm, fault_addr);
> > +	if (!notifier) {
> > +		notifier = drm_gpusvm_notifier_alloc(gpusvm,
> > fault_addr);
> > +		if (IS_ERR(notifier)) {
> > +			err = PTR_ERR(notifier);
> > +			goto err_mmunlock;
> > +		}
> > +		notifier_alloc = true;
> > +		err = mmu_interval_notifier_insert_locked(&notifier-
> > >notifier,
> > +							  mm,
> > notifier->interval.start,
> > +							  notifier-
> > >interval.end -
> > +							  notifier-
> > >interval.start,
> > +							 
> > &drm_gpusvm_notifier_ops);
> > +		if (err)
> > +			goto err_notifier;
> > +	}
> > +
> > +	vas = vma_lookup(mm, fault_addr);
> > +	if (!vas) {
> > +		err = -ENOENT;
> > +		goto err_notifier_remove;
> > +	}
> > +
> > +	if (!ctx->read_only && !(vas->vm_flags & VM_WRITE)) {
> > +		err = -EPERM;
> > +		goto err_notifier_remove;
> > +	}
> > +
> > +	range = drm_gpusvm_range_find(notifier, fault_addr,
> > fault_addr + 1);
> > +	if (range)
> > +		goto out_mmunlock;
> > +	/*
> > +	 * XXX: Short-circuiting migration based on migrate_vma_*
> > current
> > +	 * limitations. If/when migrate_vma_* add more support, this
> > logic will
> > +	 * have to change.
> > +	 */
> > +	migrate_vram = ctx->vram_possible &&
> > +		vma_is_anonymous(vas) && !is_vm_hugetlb_page(vas);
> > +
> > +	chunk_size = drm_gpusvm_range_chunk_size(gpusvm, notifier,
> > vas,
> > +						 fault_addr,
> > gpuva_start,
> > +						 gpuva_end,
> > migrate_vram &&
> > +						 !ctx->prefault);
> > +	if (chunk_size == LONG_MAX) {
> > +		err = -EINVAL;
> > +		goto err_notifier_remove;
> > +	}
> > +
> > +	range = drm_gpusvm_range_alloc(gpusvm, notifier, fault_addr,
> > chunk_size,
> > +				       migrate_vram);
> > +	if (IS_ERR(range)) {
> > +		err = PTR_ERR(range);
> > +		goto err_notifier_remove;
> > +	}
> > +
> > +	drm_gpusvm_range_insert(notifier, range);
> > +	if (notifier_alloc)
> > +		drm_gpusvm_notifier_insert(gpusvm, notifier);
> > +
> > +	if (ctx->prefault) {
> > +		struct drm_gpusvm_ctx __ctx = *ctx;
> > +
> > +		__ctx.mmap_locked = true;
> > +		err = drm_gpusvm_range_get_pages(gpusvm, range,
> > &__ctx);
> > +		if (err)
> > +			goto err_range_remove;
> > +	}
> > +
> > +out_mmunlock:
> > +	if (!ctx->mmap_locked) {
> > +		mmap_write_unlock(mm);
> > +		mmput(mm);
> > +	}
> > +
> > +	return range;
> > +
> > +err_range_remove:
> > +	__drm_gpusvm_range_remove(notifier, range);
> > +err_notifier_remove:
> > +	if (notifier_alloc)
> > +		mmu_interval_notifier_remove(&notifier->notifier);
> > +err_notifier:
> > +	if (notifier_alloc)
> > +		drm_gpusvm_notifier_free(gpusvm, notifier);
> > +err_mmunlock:
> > +	if (!ctx->mmap_locked) {
> > +		mmap_write_unlock(mm);
> > +		mmput(mm);
> > +	}
> > +err_out:
> > +	return ERR_PTR(err);
> > +}
> > +
> > +/**
> > + * for_each_dma_page - iterate over pages in a DMA region
> > + * @i__: the current page index in the iteration
> > + * @j__: the current page index, log order, in the iteration
> > + * @npages__: the total number of pages in the DMA region
> > + * @order__: the order of the pages in the DMA region
> > + *
> > + * This macro iterates over each page in a DMA region. The DMA
> > region
> > + * is assumed to be composed of 2^@order__ pages, and the macro will
> > + * step through the region one block of 2^@order__ pages at a time.
> > + */
> > +#define for_each_dma_page(i__, j__, npages__, order__)	\
> > +	for ((i__) = 0, (j__) = 0; (i__) < (npages__);	\
> > +	     (j__)++, (i__) += 0x1 << (order__))
> > +
> > +/**
> > + * __drm_gpusvm_range_unmap_pages - Unmap pages associated with a
> > GPU SVM range (internal)
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * This function unmaps pages associated with a GPU SVM range.
> > Assumes and
> > + * asserts correct locking is in place when called.
> > + */
> > +static void __drm_gpusvm_range_unmap_pages(struct drm_gpusvm
> > *gpusvm,
> > +					   struct drm_gpusvm_range
> > *range)
> > +{
> > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > +
> > +	if (range->pages) {
> > +		unsigned long i, j, npages = npages_in_range(range-
> > >va.start,
> > +							     range-
> > >va.end);
> > +
> > +		if (range->flags.has_dma_mapping) {
> > +			for_each_dma_page(i, j, npages, range-
> > >order)
> > +				dma_unmap_page(gpusvm->drm->dev,
> > +					       range->dma_addr[j],
> > +					       PAGE_SIZE << range-
> > >order,
> > +					       DMA_BIDIRECTIONAL);
> > +		}
> > +
> > +		range->flags.has_vram_pages = false;
> > +		range->flags.has_dma_mapping = false;
> > +	}
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_free_pages - Free pages associated with a GPU
> > SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * This function frees pages associated with a GPU SVM range.
> > + */
> > +static void drm_gpusvm_range_free_pages(struct drm_gpusvm *gpusvm,
> > +					struct drm_gpusvm_range
> > *range)
> > +{
> > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > +
> > +	if (range->pages) {
> > +		if (range->flags.kfree_mapping) {
> > +			kfree(range->dma_addr);
> > +			range->flags.kfree_mapping = false;
> > +			range->pages = NULL;
> > +		} else {
> > +			kvfree(range->pages);
> > +			range->pages = NULL;
> > +		}
> > +	}
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_remove - Remove GPU SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range to be removed
> > + *
> > + * This function removes the specified GPU SVM range and also
> > removes the parent
> > + * GPU SVM notifier if no more ranges remain in the notifier. The
> > caller must
> > + * hold a lock to protect range and notifier removal.
> > + */
> > +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> > +			     struct drm_gpusvm_range *range)
> > +{
> > +	struct drm_gpusvm_notifier *notifier;
> > +
> > +	notifier = drm_gpusvm_notifier_find(gpusvm, range-
> > >va.start);
> > +	if (WARN_ON_ONCE(!notifier))
> > +		return;
> > +
> > +	drm_gpusvm_notifier_lock(gpusvm);
> > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > +	drm_gpusvm_range_free_pages(gpusvm, range);
> > +	__drm_gpusvm_range_remove(notifier, range);
> > +	drm_gpusvm_notifier_unlock(gpusvm);
> > +
> > +	drm_gpusvm_range_put(range);
> > +
> > +	if (RB_EMPTY_ROOT(&notifier->root.rb_root)) {
> > +		if (!notifier->flags.removed)
> > +			mmu_interval_notifier_remove(&notifier-
> > >notifier);
> > +		drm_gpusvm_notifier_remove(gpusvm, notifier);
> > +		drm_gpusvm_notifier_free(gpusvm, notifier);
> > +	}
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_get - Get a reference to GPU SVM range
> > + * @range: Pointer to the GPU SVM range
> > + *
> > + * This function increments the reference count of the specified GPU
> > SVM range.
> > + *
> > + * Returns:
> > + * Pointer to the GPU SVM range.
> > + */
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_get(struct drm_gpusvm_range *range)
> > +{
> > +	kref_get(&range->refcount);
> > +
> > +	return range;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_destroy - Destroy GPU SVM range
> > + * @refcount: Pointer to the reference counter embedded in the GPU
> > SVM range
> > + *
> > + * This function destroys the specified GPU SVM range when its
> > reference count
> > + * reaches zero. If a custom range-free function is provided, it is
> > invoked to
> > + * free the range; otherwise, the range is deallocated using
> > kfree().
> > + */
> > +static void drm_gpusvm_range_destroy(struct kref *refcount)
> > +{
> > +	struct drm_gpusvm_range *range =
> > +		container_of(refcount, struct drm_gpusvm_range,
> > refcount);
> > +	struct drm_gpusvm *gpusvm = range->gpusvm;
> > +
> > +	if (gpusvm->ops->range_free)
> > +		gpusvm->ops->range_free(range);
> > +	else
> > +		kfree(range);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_put - Put a reference to GPU SVM range
> > + * @range: Pointer to the GPU SVM range
> > + *
> > + * This function decrements the reference count of the specified GPU
> > SVM range
> > + * and frees it when the count reaches zero.
> > + */
> > +void drm_gpusvm_range_put(struct drm_gpusvm_range *range)
> > +{
> > +	kref_put(&range->refcount, drm_gpusvm_range_destroy);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_pages_valid - GPU SVM range pages valid
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * This function determines if a GPU SVM range's pages are valid.
> > Expected to be
> > + * called holding gpusvm->notifier_lock and as the last step before
> > committing a
> > + * GPU binding.
> > + *
> > + * Returns:
> > + * True if GPU SVM range has valid pages, False otherwise
> > + */
> > +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> > +				  struct drm_gpusvm_range *range)
> > +{
> > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > +
> > +	return range->flags.has_vram_pages || range-
> > >flags.has_dma_mapping;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_pages_valid_unlocked - GPU SVM range pages valid
> > unlocked
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * This function determines if a GPU SVM range's pages are valid.
> > Expected to be
> > + * called without holding gpusvm->notifier_lock.
> > + *
> > + * Returns:
> > + * True if GPU SVM range has valid pages, False otherwise
> > + */
> > +static bool
> > +drm_gpusvm_range_pages_valid_unlocked(struct drm_gpusvm *gpusvm,
> > +				      struct drm_gpusvm_range
> > *range)
> > +{
> > +	bool pages_valid;
> > +
> > +	if (!range->pages)
> > +		return false;
> > +
> > +	drm_gpusvm_notifier_lock(gpusvm);
> > +	pages_valid = drm_gpusvm_range_pages_valid(gpusvm, range);
> > +	if (!pages_valid && range->flags.kfree_mapping) {
> > +		kfree(range->dma_addr);
> > +		range->flags.kfree_mapping = false;
> > +		range->pages = NULL;
> > +	}
> > +	drm_gpusvm_notifier_unlock(gpusvm);
> > +
> > +	return pages_valid;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_get_pages - Get pages for a GPU SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + * @ctx: GPU SVM context
> > + *
> > + * This function gets pages for a GPU SVM range and ensures they are
> > mapped for
> > + * DMA access.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       const struct drm_gpusvm_ctx *ctx)
> > +{
> > +	struct mmu_interval_notifier *notifier = &range->notifier-
> > >notifier;
> > +	struct hmm_range hmm_range = {
> > +		.default_flags = HMM_PFN_REQ_FAULT | (ctx->read_only
> > ? 0 :
> > +			HMM_PFN_REQ_WRITE),
> > +		.notifier = notifier,
> > +		.start = range->va.start,
> > +		.end = range->va.end,
> > +		.dev_private_owner = gpusvm-
> > >device_private_page_owner,
> > +	};
> > +	struct mm_struct *mm = gpusvm->mm;
> > +	unsigned long timeout =
> > +		jiffies +
> > msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> > +	unsigned long i, j;
> > +	unsigned long npages = npages_in_range(range->va.start,
> > range->va.end);
> > +	unsigned int order = 0;
> > +	unsigned long *pfns;
> > +	struct page **pages;
> > +	int err = 0;
> > +	bool vram_pages = !!range->flags.migrate_vram;
> > +	bool alloc_pfns = false, kfree_mapping;
> > +
> > +retry:
> > +	kfree_mapping = false;
> > +	hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> > +	if (drm_gpusvm_range_pages_valid_unlocked(gpusvm, range))
> > +		return 0;
> > +
> > +	if (range->notifier_seq == hmm_range.notifier_seq && range-
> > >pages) {
> > +		if (ctx->prefault)
> > +			return 0;
> > +
> > +		pfns = (unsigned long *)range->pages;
> > +		pages = range->pages;
> > +		goto map_pages;
> > +	}
> > +
> > +	if (!range->pages) {
> > +		pfns = kvmalloc_array(npages, sizeof(*pfns),
> > GFP_KERNEL);
> > +		if (!pfns)
> > +			return -ENOMEM;
> > +		alloc_pfns = true;
> > +	} else {
> > +		pfns = (unsigned long *)range->pages;
> > +	}
> > +
> > +	if (!ctx->mmap_locked) {
> > +		if (!mmget_not_zero(mm)) {
> > +			err = -EFAULT;
> > +			goto err_out;
> > +		}
> > +	}
> > +
> > +	hmm_range.hmm_pfns = pfns;
> > +	while (true) {
> > +		/* Must be checked after mmu_interval_read_begin */
> > +		if (range->flags.unmapped) {
> > +			err = -EFAULT;
> > +			break;
> > +		}
> > +
> > +		if (!ctx->mmap_locked) {
> > +			/*
> > +			 * XXX: HMM locking document indicates only
> > a read-lock
> > +			 * is required but there appears to be a
> > window between
> > +			 * the MMU_NOTIFY_MIGRATE event triggered in
> > a CPU fault
> > +			 * via migrate_vma_setup and the pages
> > actually moving
> > +			 * in migrate_vma_finalize in which this
> > code can grab
> > +			 * garbage pages. Grabbing the write-lock if
> > the range
> > +			 * is attached to vram appears to protect
> > against this
> > +			 * race.
> > +			 */
> > +			if (vram_pages)
> > +				mmap_write_lock(mm);
> > +			else
> > +				mmap_read_lock(mm);
> > +		}
> > +		err = hmm_range_fault(&hmm_range);
> > +		if (!ctx->mmap_locked) {
> > +			if (vram_pages)
> > +				mmap_write_unlock(mm);
> > +			else
> > +				mmap_read_unlock(mm);
> > +		}
> > +
> > +		if (err == -EBUSY) {
> > +			if (time_after(jiffies, timeout))
> > +				break;
> > +
> > +			hmm_range.notifier_seq =
> > mmu_interval_read_begin(notifier);
> > +			continue;
> > +		}
> > +		break;
> > +	}
> > +	if (!ctx->mmap_locked)
> > +		mmput(mm);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	pages = (struct page **)pfns;
> > +
> > +	if (ctx->prefault) {
> > +		range->pages = pages;
> > +		goto set_seqno;
> > +	}
> > +
> > +map_pages:
> > +	if (is_device_private_page(hmm_pfn_to_page(pfns[0]))) {
> > +		WARN_ON_ONCE(!range->vram_allocation);
> > +
> > +		for (i = 0; i < npages; ++i) {
> > +			pages[i] = hmm_pfn_to_page(pfns[i]);
> > +
> > +			if
> > (WARN_ON_ONCE(!is_device_private_page(pages[i]))) {
> > +				err = -EOPNOTSUPP;
> > +				goto err_free;
> > +			}
> > +		}
> > +
> > +		/* Do not race with notifier unmapping pages */
> > +		drm_gpusvm_notifier_lock(gpusvm);
> > +		range->flags.has_vram_pages = true;
> > +		range->pages = pages;
> > +		if (mmu_interval_read_retry(notifier,
> > hmm_range.notifier_seq)) {
> > +			err = -EAGAIN;
> > +			__drm_gpusvm_range_unmap_pages(gpusvm,
> > range);
> > +		}
> > +		drm_gpusvm_notifier_unlock(gpusvm);
> > +	} else {
> > +		dma_addr_t *dma_addr = (dma_addr_t *)pfns;
> > +
> > +		for_each_dma_page(i, j, npages, order) {
> > +			if (WARN_ON_ONCE(i && order !=
> > +					
> > hmm_pfn_to_map_order(pfns[i]))) {
> > +				err = -EOPNOTSUPP;
> > +				npages = i;
> > +				goto err_unmap;
> > +			}
> > +			order = hmm_pfn_to_map_order(pfns[i]);
> > +
> > +			pages[j] = hmm_pfn_to_page(pfns[i]);
> > +			if
> > (WARN_ON_ONCE(is_zone_device_page(pages[j]))) {
> > +				err = -EOPNOTSUPP;
> > +				npages = i;
> > +				goto err_unmap;
> > +			}
> > +
> > +			set_page_dirty_lock(pages[j]);
> > +			mark_page_accessed(pages[j]);
> > +
> > +			dma_addr[j] = dma_map_page(gpusvm->drm->dev,
> > +						   pages[j], 0,
> > +						   PAGE_SIZE <<
> > order,
> > +						  
> > DMA_BIDIRECTIONAL);
> > +			if (dma_mapping_error(gpusvm->drm->dev,
> > dma_addr[j])) {
> > +				err = -EFAULT;
> > +				npages = i;
> > +				goto err_unmap;
> > +			}
> > +		}
> > +
> > +		/* Huge pages, reduce memory footprint */
> > +		if (order) {
> > +			dma_addr = kmalloc_array(j,
> > sizeof(*dma_addr),
> > +						 GFP_KERNEL);
> > +			if (dma_addr) {
> > +				for (i = 0; i < j; ++i)
> > +					dma_addr[i] =
> > (dma_addr_t)pfns[i];
> > +				kvfree(pfns);
> > +				kfree_mapping = true;
> > +			} else {
> > +				dma_addr = (dma_addr_t *)pfns;
> > +			}
> > +		}
> > +
> > +		/* Do not race with notifier unmapping pages */
> > +		drm_gpusvm_notifier_lock(gpusvm);
> > +		range->order = order;
> > +		range->flags.kfree_mapping = kfree_mapping;
> > +		range->flags.has_dma_mapping = true;
> > +		range->dma_addr = dma_addr;
> > +		range->vram_allocation = NULL;
> > +		if (mmu_interval_read_retry(notifier,
> > hmm_range.notifier_seq)) {
> > +			err = -EAGAIN;
> > +			__drm_gpusvm_range_unmap_pages(gpusvm,
> > range);
> > +		}
> > +		drm_gpusvm_notifier_unlock(gpusvm);
> > +	}
> > +
> > +	if (err == -EAGAIN)
> > +		goto retry;
> > +set_seqno:
> > +	range->notifier_seq = hmm_range.notifier_seq;
> > +
> > +	return 0;
> > +
> > +err_unmap:
> > +	for_each_dma_page(i, j, npages, order)
> > +		dma_unmap_page(gpusvm->drm->dev,
> > +			       (dma_addr_t)pfns[j],
> > +			       PAGE_SIZE << order,
> > DMA_BIDIRECTIONAL);
> > +err_free:
> > +	if (alloc_pfns)
> > +		kvfree(pfns);
> > +err_out:
> > +	return err;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_unmap_pages - Unmap pages associated with a GPU
> > SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + * @ctx: GPU SVM context
> > + *
> > + * This function unmaps pages associated with a GPU SVM range. If
> > @in_notifier
> > + * is set, it is assumed that gpusvm->notifier_lock is held in write
> > mode; if it
> > + * is clear, it acquires gpusvm->notifier_lock in read mode. Must be
> > called on
> > + * each GPU SVM range attached to notifier in gpusvm->ops-
> > >invalidate for IOMMU
> > + * security model.
> > + */
> > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > +				  struct drm_gpusvm_range *range,
> > +				  const struct drm_gpusvm_ctx *ctx)
> > +{
> > +	if (ctx->in_notifier)
> > +		lockdep_assert_held_write(&gpusvm->notifier_lock);
> > +	else
> > +		drm_gpusvm_notifier_lock(gpusvm);
> > +
> > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > +
> > +	if (!ctx->in_notifier)
> > +		drm_gpusvm_notifier_unlock(gpusvm);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migration_put_page - Put a migration page
> > + * @page: Pointer to the page to put
> > + *
> > + * This function unlocks and puts a page.
> > + */
> > +static void drm_gpusvm_migration_put_page(struct page *page)
> > +{
> > +	unlock_page(page);
> > +	put_page(page);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migration_put_pages - Put migration pages
> > + * @npages: Number of pages
> > + * @migrate_pfn: Array of migrate page frame numbers
> > + *
> > + * This function puts an array of pages.
> > + */
> > +static void drm_gpusvm_migration_put_pages(unsigned long npages,
> > +					   unsigned long
> > *migrate_pfn)
> > +{
> > +	unsigned long i;
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		if (!migrate_pfn[i])
> > +			continue;
> > +
> > +		drm_gpusvm_migration_put_page(migrate_pfn_to_page(mi
> > grate_pfn[i]));
> > +		migrate_pfn[i] = 0;
> > +	}
> > +}
> > +
> > +/**
> > + * drm_gpusvm_get_vram_page - Get a reference to a VRAM page
> > + * @page: Pointer to the page
> > + * @zdd: Pointer to the GPU SVM zone device data
> > + *
> > + * This function associates the given page with the specified GPU
> > SVM zone
> > + * device data and initializes it for zone device usage.
> > + */
> > +static void drm_gpusvm_get_vram_page(struct page *page,
> > +				     struct drm_gpusvm_zdd *zdd)
> > +{
> > +	page->zone_device_data = drm_gpusvm_zdd_get(zdd);
> > +	zone_device_page_init(page);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_map_pages() - Map migration pages for GPU SVM
> > migration
> > + * @dev: The device for which the pages are being mapped
> > + * @dma_addr: Array to store DMA addresses corresponding to mapped
> > pages
> > + * @migrate_pfn: Array of migrate page frame numbers to map
> > + * @npages: Number of pages to map
> > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > + *
> > + * This function maps pages of memory for migration usage in GPU
> > SVM. It
> > + * iterates over each page frame number provided in @migrate_pfn,
> > maps the
> > + * corresponding page, and stores the DMA address in the provided
> > @dma_addr
> > + * array.
> > + *
> > + * Return: 0 on success, -EFAULT if an error occurs during mapping.
> > + */
> > +static int drm_gpusvm_migrate_map_pages(struct device *dev,
> > +					dma_addr_t *dma_addr,
> > +					long unsigned int
> > *migrate_pfn,
> > +					unsigned long npages,
> > +					enum dma_data_direction dir)
> > +{
> > +	unsigned long i;
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		struct page *page =
> > migrate_pfn_to_page(migrate_pfn[i]);
> > +
> > +		if (!page)
> > +			continue;
> > +
> > +		if (WARN_ON_ONCE(is_zone_device_page(page)))
> > +			return -EFAULT;
> > +
> > +		dma_addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE,
> > dir);
> > +		if (dma_mapping_error(dev, dma_addr[i]))
> > +			return -EFAULT;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously mapped
> > for GPU SVM migration
> > + * @dev: The device for which the pages were mapped
> > + * @dma_addr: Array of DMA addresses corresponding to mapped pages
> > + * @npages: Number of pages to unmap
> > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > + *
> > + * This function unmaps previously mapped pages of memory for GPU
> > Shared Virtual
> > + * Memory (SVM). It iterates over each DMA address provided in
> > @dma_addr, checks
> > + * if it's valid and not already unmapped, and unmaps the
> > corresponding page.
> > + */
> > +static void drm_gpusvm_migrate_unmap_pages(struct device *dev,
> > +					   dma_addr_t *dma_addr,
> > +					   unsigned long npages,
> > +					   enum dma_data_direction
> > dir)
> > +{
> > +	unsigned long i;
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		if (!dma_addr[i] || dma_mapping_error(dev,
> > dma_addr[i]))
> > +			continue;
> > +
> > +		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
> > +	}
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_to_vram - Migrate GPU SVM range to VRAM
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + * @vram_allocation: Driver-private pointer to the VRAM allocation.
> > The caller
> > + *                   should hold a reference to the VRAM allocation,
> > which
> > + *                   should be dropped via ops->vram_release or
> > upon the
> > + *                   failure of this function.
> > + * @ctx: GPU SVM context
> > + *
> > + * This function migrates the specified GPU SVM range to VRAM. It
> > performs the
> > + * necessary setup and invokes the driver-specific operations for
> > migration to
> > + * VRAM. Upon successful return, @vram_allocation can safely
> > reference @range
> > + * until ops->vram_release is called, which happens only upon
> > successful return.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       void *vram_allocation,
> > +			       const struct drm_gpusvm_ctx *ctx)
> > +{
> > +	u64 start = range->va.start, end = range->va.end;
> > +	struct migrate_vma migrate = {
> > +		.start		= start,
> > +		.end		= end,
> > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > +		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
> > +	};
> > +	struct mm_struct *mm = gpusvm->mm;
> > +	unsigned long i, npages = npages_in_range(start, end);
> > +	struct vm_area_struct *vas;
> > +	struct drm_gpusvm_zdd *zdd = NULL;
> > +	struct page **pages;
> > +	dma_addr_t *dma_addr;
> > +	void *buf;
> > +	int err;
> > +
> > +	if (!range->flags.migrate_vram)
> > +		return -EINVAL;
> > +
> > +	if (!gpusvm->ops->populate_vram_pfn || !gpusvm->ops-
> > >copy_to_vram ||
> > +	    !gpusvm->ops->copy_to_sram)
> > +		return -EOPNOTSUPP;
> > +
> > +	if (!ctx->mmap_locked) {
> > +		if (!mmget_not_zero(mm)) {
> > +			err = -EFAULT;
> > +			goto err_out;
> > +		}
> > +		mmap_write_lock(mm);
> > +	}
> > +
> > +	mmap_assert_locked(mm);
> > +
> > +	vas = vma_lookup(mm, start);
> > +	if (!vas) {
> > +		err = -ENOENT;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	if (end > vas->vm_end || start < vas->vm_start) {
> > +		err = -EINVAL;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	if (!vma_is_anonymous(vas)) {
> > +		err = -EBUSY;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) +
> > sizeof(*dma_addr) +
> > +		       sizeof(*pages), GFP_KERNEL);
> > +	if (!buf) {
> > +		err = -ENOMEM;
> > +		goto err_mmunlock;
> > +	}
> > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr))
> > * npages;
> > +
> > +	zdd = drm_gpusvm_zdd_alloc(range);
> > +	if (!zdd) {
> > +		err = -ENOMEM;
> > +		goto err_free;
> > +	}
> > +
> > +	migrate.vma = vas;
> > +	migrate.src = buf;
> > +	migrate.dst = migrate.src + npages;
> > +
> > +	err = migrate_vma_setup(&migrate);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	/*
> > +	 * FIXME: Below cases, !migrate.cpages and migrate.cpages !=
> > npages, not
> > +	 * always an error. Need to revisit possible cases and how
> > to handle. We
> > +	 * could prefault on migrate.cpages != npages via
> > hmm_range_fault.
> > +	 */
> > +
> > +	if (!migrate.cpages) {
> > +		err = -EFAULT;
> > +		goto err_free;
> > +	}
> > +
> > +	if (migrate.cpages != npages) {
> > +		err = -EBUSY;
> > +		goto err_finalize;
> > +	}
> > +
> > +	err = gpusvm->ops->populate_vram_pfn(gpusvm,
> > vram_allocation, npages,
> > +					     migrate.dst);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > dma_addr,
> > +					   migrate.src, npages,
> > DMA_TO_DEVICE);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		struct page *page = pfn_to_page(migrate.dst[i]);
> > +
> > +		pages[i] = page;
> > +		migrate.dst[i] = migrate_pfn(migrate.dst[i]);
> > +		drm_gpusvm_get_vram_page(page, zdd);
> > +	}
> > +
> > +	err = gpusvm->ops->copy_to_vram(gpusvm, pages, dma_addr,
> > npages);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	/* Upon success bind vram allocation to range and zdd */
> > +	range->vram_allocation = vram_allocation;
> > +	WRITE_ONCE(zdd->vram_allocation, vram_allocation);	/*
> > Owns ref */
> > +
> > +err_finalize:
> > +	if (err)
> > +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> > +	migrate_vma_pages(&migrate);
> > +	migrate_vma_finalize(&migrate);
> > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr,
> > npages,
> > +				       DMA_TO_DEVICE);
> > +err_free:
> > +	if (zdd)
> > +		drm_gpusvm_zdd_put(zdd);
> > +	kvfree(buf);
> > +err_mmunlock:
> > +	if (!ctx->mmap_locked) {
> > +		mmap_write_unlock(mm);
> > +		mmput(mm);
> > +	}
> > +err_out:
> > +	return err;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_populate_sram_pfn - Populate SRAM PFNs for a
> > VM area
> > + * @vas: Pointer to the VM area structure, can be NULL
> > + * @npages: Number of pages to populate
> > + * @src_mpfn: Source array of migrate PFNs
> > + * @mpfn: Array of migrate PFNs to populate
> > + * @addr: Start address for PFN allocation
> > + *
> > + * This function populates the SRAM migrate page frame numbers
> > (PFNs) for the
> > + * specified VM area structure. It allocates and locks pages in the
> > VM area for
> > + * SRAM usage. If @vas is non-NULL, alloc_page_vma() is used for
> > allocation;
> > + * otherwise, alloc_page() is used.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +static int drm_gpusvm_migrate_populate_sram_pfn(struct
> > vm_area_struct *vas,
> > +						unsigned long
> > npages,
> > +						unsigned long
> > *src_mpfn,
> > +						unsigned long *mpfn,
> > u64 addr)
> > +{
> > +	unsigned long i;
> > +
> > +	for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
> > +		struct page *page;
> > +
> > +		if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
> > +			continue;
> > +
> > +		if (vas)
> > +			page = alloc_page_vma(GFP_HIGHUSER, vas,
> > addr);
> > +		else
> > +			page = alloc_page(GFP_HIGHUSER);
> > +
> > +		if (!page)
> > +			return -ENOMEM;
> > +
> > +		lock_page(page);
> > +		mpfn[i] = migrate_pfn(page_to_pfn(page));
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_evict_to_sram - Evict GPU SVM range to SRAM
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * Similar to __drm_gpusvm_migrate_to_sram but does not require the mmap
> > lock;
> > + * migration is done via the migrate_device_* functions. Fallback path, as
> > it is
> > + * preferred to issue migrations with the mmap lock held.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +static int drm_gpusvm_evict_to_sram(struct drm_gpusvm *gpusvm,
> > +				    struct drm_gpusvm_range *range)
> > +{
> > +	unsigned long npages;
> > +	struct page **pages;
> > +	unsigned long *src, *dst;
> > +	dma_addr_t *dma_addr;
> > +	void *buf;
> > +	int i, err = 0;
> > +
> > +	npages = npages_in_range(range->va.start, range->va.end);
> > +
> > +	buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr)
> > +
> > +		       sizeof(*pages), GFP_KERNEL);
> > +	if (!buf) {
> > +		err = -ENOMEM;
> > +		goto err_out;
> > +	}
> > +	src = buf;
> > +	dst = buf + (sizeof(*src) * npages);
> > +	dma_addr = buf + (2 * sizeof(*src) * npages);
> > +	pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) *
> > npages;
> > +
> > +	err = gpusvm->ops->populate_vram_pfn(gpusvm, range-
> > >vram_allocation,
> > +					     npages, src);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	err = migrate_device_vma_range(gpusvm->mm,
> > +				       gpusvm-
> > >device_private_page_owner, src,
> > +				       npages, range->va.start);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	err = drm_gpusvm_migrate_populate_sram_pfn(NULL, npages,
> > src, dst, 0);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > dma_addr,
> > +					   dst, npages,
> > DMA_BIDIRECTIONAL);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	for (i = 0; i < npages; ++i)
> > +		pages[i] = migrate_pfn_to_page(src[i]);
> > +
> > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr,
> > npages);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +err_finalize:
> > +	if (err)
> > +		drm_gpusvm_migration_put_pages(npages, dst);
> > +	migrate_device_pages(src, dst, npages);
> > +	migrate_device_finalize(src, dst, npages);
> > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr,
> > npages,
> > +				       DMA_BIDIRECTIONAL);
> > +err_free:
> > +	kvfree(buf);
> > +err_out:
> > +
> > +	return err;
> > +}
> > +
> > +/**
> > + * __drm_gpusvm_migrate_to_sram - Migrate GPU SVM range to SRAM
> > (internal)
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @vas: Pointer to the VM area structure
> > + * @page: Pointer to the page for fault handling (can be NULL)
> > + * @start: Start address of the migration range
> > + * @end: End address of the migration range
> > + *
> > + * This internal function performs the migration of the specified
> > GPU SVM range
> > + * to SRAM. It sets up the migration, populates + dma maps SRAM
> > PFNs, and
> > + * invokes the driver-specific operations for migration to SRAM.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +static int __drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > +					struct vm_area_struct *vas,
> > +					struct page *page,
> > +					u64 start, u64 end)
> > +{
> > +	struct migrate_vma migrate = {
> > +		.vma		= vas,
> > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > +		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
> > +		.fault_page	= page,
> > +	};
> > +	unsigned long npages;
> > +	struct page **pages;
> > +	dma_addr_t *dma_addr;
> > +	void *buf;
> > +	int i, err = 0;
> > +
> > +	mmap_assert_locked(gpusvm->mm);
> > +
> > +	/* Corner case where the VM area struct has been partially unmapped
> > */
> > +	if (start < vas->vm_start)
> > +		start = vas->vm_start;
> > +	if (end > vas->vm_end)
> > +		end = vas->vm_end;
> > +
> > +	migrate.start = start;
> > +	migrate.end = end;
> > +	npages = npages_in_range(start, end);
> > +
> > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) +
> > sizeof(*dma_addr) +
> > +		       sizeof(*pages), GFP_KERNEL);
> > +	if (!buf) {
> > +		err = -ENOMEM;
> > +		goto err_out;
> > +	}
> > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr))
> > * npages;
> > +
> > +	migrate.vma = vas;
> > +	migrate.src = buf;
> > +	migrate.dst = migrate.src + npages;
> > +
> > +	err = migrate_vma_setup(&migrate);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	/* Raced with another CPU fault, nothing to do */
> > +	if (!migrate.cpages)
> > +		goto err_free;
> > +
> > +	err = drm_gpusvm_migrate_populate_sram_pfn(vas, npages,
> > +						   migrate.src,
> > migrate.dst,
> > +						   start);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > dma_addr,
> > +					   migrate.dst, npages,
> > +					   DMA_BIDIRECTIONAL);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	for (i = 0; i < npages; ++i)
> > +		pages[i] = migrate_pfn_to_page(migrate.src[i]);
> > +
> > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr,
> > npages);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +err_finalize:
> > +	if (err)
> > +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> > +	migrate_vma_pages(&migrate);
> > +	migrate_vma_finalize(&migrate);
> > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr,
> > npages,
> > +				       DMA_BIDIRECTIONAL);
> > +err_free:
> > +	kvfree(buf);
> > +err_out:
> > +	mmap_assert_locked(gpusvm->mm);
> > +
> > +	return err;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_to_sram - Migrate (evict) GPU SVM range to
> > SRAM
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + * @ctx: GPU SVM context
> > + *
> > + * This function initiates the migration of the specified GPU SVM
> > range to
> > + * SRAM. It performs necessary checks and invokes the internal
> > migration
> > + * function for actual migration.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       const struct drm_gpusvm_ctx *ctx)
> > +{
> > +	u64 start = range->va.start, end = range->va.end;
> > +	struct mm_struct *mm = gpusvm->mm;
> > +	struct vm_area_struct *vas;
> > +	int err;
> > +	bool retry = false;
> > +
> > +	if (!ctx->mmap_locked) {
> > +		if (!mmget_not_zero(mm)) {
> > +			err = -EFAULT;
> > +			goto err_out;
> > +		}
> > +		if (ctx->trylock_mmap) {
> > +			if (!mmap_read_trylock(mm))  {
> > +				err =
> > drm_gpusvm_evict_to_sram(gpusvm, range);
> > +				goto err_mmput;
> > +			}
> > +		} else {
> > +			mmap_read_lock(mm);
> > +		}
> > +	}
> > +
> > +	mmap_assert_locked(mm);
> > +
> > +	/*
> > +	 * Loop required to find all VM area structs for the corner
> > case when
> > +	 * VRAM backing has been partially unmapped from MM's
> > address space.
> > +	 */
> > +again:
> > +	vas = find_vma(mm, start);
> > +	if (!vas) {
> > +		if (!retry)
> > +			err = -ENOENT;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	if (end <= vas->vm_start || start >= vas->vm_end) {
> > +		if (!retry)
> > +			err = -EINVAL;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	err = __drm_gpusvm_migrate_to_sram(gpusvm, vas, NULL, start,
> > end);
> > +	if (err)
> > +		goto err_mmunlock;
> > +
> > +	if (vas->vm_end < end) {
> > +		retry = true;
> > +		start = vas->vm_end;
> > +		goto again;
> > +	}
> > +
> > +	if (!ctx->mmap_locked) {
> > +		mmap_read_unlock(mm);
> > +		/*
> > +		 * Using mmput_async as this function can be called
> > while
> > +		 * holding a dma-resv lock, and a final put can grab
> > the mmap
> > +		 * lock, causing a lock inversion.
> > +		 */
> > +		mmput_async(mm);
> > +	}
> > +
> > +	return 0;
> > +
> > +err_mmunlock:
> > +	if (!ctx->mmap_locked)
> > +		mmap_read_unlock(mm);
> > +err_mmput:
> > +	if (!ctx->mmap_locked)
> > +		mmput_async(mm);
> > +err_out:
> > +	return err;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_page_free - Put GPU SVM zone device data associated
> > with a page
> > + * @page: Pointer to the page
> > + *
> > + * This function is a callback used to put the GPU SVM zone device
> > data
> > + * associated with a page when it is being released.
> > + */
> > +static void drm_gpusvm_page_free(struct page *page)
> > +{
> > +	drm_gpusvm_zdd_put(page->zone_device_data);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_to_ram - Migrate GPU SVM range to RAM (page
> > fault handler)
> > + * @vmf: Pointer to the fault information structure
> > + *
> > + * This function is a page fault handler used to migrate a GPU SVM
> > range to RAM.
> > + * It retrieves the GPU SVM range information from the faulting page
> > and invokes
> > + * the internal migration function to migrate the range back to RAM.
> > + *
> > + * Returns:
> > + * VM_FAULT_SIGBUS on failure, 0 on success.
> > + */
> > +static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault *vmf)
> > +{
> > +	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
> > +	int err;
> > +
> > +	err = __drm_gpusvm_migrate_to_sram(zdd->range->gpusvm,
> > +					   vmf->vma, vmf->page,
> > +					   zdd->range->va.start,
> > +					   zdd->range->va.end);
> > +
> > +	return err ? VM_FAULT_SIGBUS : 0;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_pagemap_ops - Device page map operations for GPU SVM
> > + */
> > +static const struct dev_pagemap_ops drm_gpusvm_pagemap_ops = {
> > +	.page_free = drm_gpusvm_page_free,
> > +	.migrate_to_ram = drm_gpusvm_migrate_to_ram,
> > +};
> > +
> > +/**
> > + * drm_gpusvm_pagemap_ops_get - Retrieve GPU SVM device page map
> > operations
> > + *
> > + * Returns:
> > + * Pointer to the GPU SVM device page map operations structure.
> > + */
> > +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void)
> > +{
> > +	return &drm_gpusvm_pagemap_ops;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_has_mapping - Check if GPU SVM has mapping for the
> > given address range
> > + * @gpusvm: Pointer to the GPU SVM structure.
> > + * @start: Start address
> > + * @end: End address
> > + *
> > + * Returns:
> > + * True if GPU SVM has mapping, False otherwise
> > + */
> > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start,
> > u64 end)
> > +{
> > +	struct drm_gpusvm_notifier *notifier;
> > +
> > +	drm_gpusvm_for_each_notifier(notifier, gpusvm, start, end) {
> > +		struct drm_gpusvm_range *range = NULL;
> > +
> > +		drm_gpusvm_for_each_range(range, notifier, start,
> > end)
> > +			return true;
> > +	}
> > +
> > +	return false;
> > +}
> > diff --git a/drivers/gpu/drm/xe/drm_gpusvm.h
> > b/drivers/gpu/drm/xe/drm_gpusvm.h
> > new file mode 100644
> > index 000000000000..0ea70f8534a8
> > --- /dev/null
> > +++ b/drivers/gpu/drm/xe/drm_gpusvm.h
> > @@ -0,0 +1,415 @@
> > +/* SPDX-License-Identifier: MIT */
> > +/*
> > + * Copyright © 2024 Intel Corporation
> > + */
> > +
> > +#ifndef __DRM_GPUSVM_H__
> > +#define __DRM_GPUSVM_H__
> > +
> > +#include <linux/kref.h>
> > +#include <linux/mmu_notifier.h>
> > +#include <linux/workqueue.h>
> > +
> > +struct dev_pagemap_ops;
> > +struct drm_device;
> > +struct drm_gpusvm;
> > +struct drm_gpusvm_notifier;
> > +struct drm_gpusvm_ops;
> > +struct drm_gpusvm_range;
> > +
> > +/**
> > + * struct drm_gpusvm_ops - Operations structure for GPU SVM
> > + *
> > + * This structure defines the operations for GPU Shared Virtual
> > Memory (SVM).
> > + * These operations are provided by the GPU driver to manage SVM
> > ranges and
> > + * perform operations such as migration between VRAM and system RAM.
> > + */
> > +struct drm_gpusvm_ops {
> > +	/**
> > +	 * @notifier_alloc: Allocate a GPU SVM notifier (optional)
> > +	 *
> > +	 * This function shall allocate a GPU SVM notifier.
> > +	 *
> > +	 * Returns:
> > +	 * Pointer to the allocated GPU SVM notifier on success,
> > NULL on failure.
> > +	 */
> > +	struct drm_gpusvm_notifier *(*notifier_alloc)(void);
> > +
> > +	/**
> > +	 * @notifier_free: Free a GPU SVM notifier (optional)
> > +	 * @notifier: Pointer to the GPU SVM notifier to be freed
> > +	 *
> > +	 * This function shall free a GPU SVM notifier.
> > +	 */
> > +	void (*notifier_free)(struct drm_gpusvm_notifier *notifier);
> > +
> > +	/**
> > +	 * @range_alloc: Allocate a GPU SVM range (optional)
> > +	 * @gpusvm: Pointer to the GPU SVM
> > +	 *
> > +	 * This function shall allocate a GPU SVM range.
> > +	 *
> > +	 * Returns:
> > +	 * Pointer to the allocated GPU SVM range on success, NULL
> > on failure.
> > +	 */
> > +	struct drm_gpusvm_range *(*range_alloc)(struct drm_gpusvm
> > *gpusvm);
> > +
> > +	/**
> > +	 * @range_free: Free a GPU SVM range (optional)
> > +	 * @range: Pointer to the GPU SVM range to be freed
> > +	 *
> > +	 * This function shall free a GPU SVM range.
> > +	 */
> > +	void (*range_free)(struct drm_gpusvm_range *range);
> > +
> > +	/**
> > +	 * @vram_release: Release VRAM allocation (optional)
> > +	 * @vram_allocation: Driver-private pointer to the VRAM
> > allocation
> > +	 *
> > +	 * This function shall release the VRAM allocation and is expected
> > to drop a
> > +	 * reference to the VRAM allocation.
> > +	 */
> > +	void (*vram_release)(void *vram_allocation);
> > +
> > +	/**
> > +	 * @populate_vram_pfn: Populate VRAM PFN (required for
> > migration)
> > +	 * @gpusvm: Pointer to the GPU SVM
> > +	 * @vram_allocation: Driver-private pointer to the VRAM
> > allocation
> > +	 * @npages: Number of pages to populate
> > +	 * @pfn: Array of page frame numbers to populate
> > +	 *
> > +	 * This function shall populate VRAM page frame numbers
> > (PFN).
> > +	 *
> > +	 * Returns:
> > +	 * 0 on success, a negative error code on failure.
> > +	 */
> > +	int (*populate_vram_pfn)(struct drm_gpusvm *gpusvm,
> > +				 void *vram_allocation,
> > +				 unsigned long npages,
> > +				 unsigned long *pfn);
> > +
> > +	/**
> > +	 * @copy_to_vram: Copy to VRAM (required for migration)
> > +	 * @gpusvm: Pointer to the GPU SVM
> > +	 * @pages: Pointer to array of VRAM pages (destination)
> > +	 * @dma_addr: Pointer to array of DMA addresses (source)
> > +	 * @npages: Number of pages to copy
> > +	 *
> > +	 * This function shall copy pages to VRAM.
> > +	 *
> > +	 * Returns:
> > +	 * 0 on success, a negative error code on failure.
> > +	 */
> > +	int (*copy_to_vram)(struct drm_gpusvm *gpusvm,
> > +			    struct page **pages,
> > +			    dma_addr_t *dma_addr,
> > +			    unsigned long npages);
> > +
> > +	/**
> > +	 * @copy_to_sram: Copy to system RAM (required for
> > migration)
> > +	 * @gpusvm: Pointer to the GPU SVM
> > +	 * @pages: Pointer to array of VRAM pages (source)
> > +	 * @dma_addr: Pointer to array of DMA addresses
> > (destination)
> > +	 * @npages: Number of pages to copy
> > +	 *
> > +	 * This function shall copy pages to system RAM.
> > +	 *
> > +	 * Returns:
> > +	 * 0 on success, a negative error code on failure.
> > +	 */
> > +	int (*copy_to_sram)(struct drm_gpusvm *gpusvm,
> > +			    struct page **pages,
> > +			    dma_addr_t *dma_addr,
> > +			    unsigned long npages);
> > +
> > +	/**
> > +	 * @invalidate: Invalidate GPU SVM notifier (required)
> > +	 * @gpusvm: Pointer to the GPU SVM
> > +	 * @notifier: Pointer to the GPU SVM notifier
> > +	 * @mmu_range: Pointer to the mmu_notifier_range structure
> > +	 *
> > +	 * This function shall invalidate the GPU page tables. It
> > can safely
> > +	 * walk the notifier range RB tree/list in this function.
> > Called while
> > +	 * holding the notifier lock.
> > +	 */
> > +	void (*invalidate)(struct drm_gpusvm *gpusvm,
> > +			   struct drm_gpusvm_notifier *notifier,
> > +			   const struct mmu_notifier_range
> > *mmu_range);
> > +};
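
A minimal sketch of what a driver-side ops table could look like (the my_*
symbols below are hypothetical, not part of this series):

static const struct drm_gpusvm_ops my_gpusvm_ops = {
	/* Required: zap GPU PTEs for the invalidated interval. */
	.invalidate		= my_gpusvm_invalidate,
	/* Required only if migration to/from VRAM is supported. */
	.populate_vram_pfn	= my_populate_vram_pfn,
	.copy_to_vram		= my_copy_to_vram,
	.copy_to_sram		= my_copy_to_sram,
	.vram_release		= my_vram_release,
	/* Optional: embed ranges in a larger driver structure. */
	.range_alloc		= my_range_alloc,
	.range_free		= my_range_free,
};
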
> > +
> > +/**
> > + * struct drm_gpusvm_notifier - Structure representing a GPU SVM
> > notifier
> > + *
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: MMU interval notifier
> > + * @interval: Interval for the notifier
> > + * @rb: Red-black tree node for the parent GPU SVM structure
> > notifier tree
> > + * @root: Cached root node of the RB tree containing ranges
> > + * @range_list: List head of ranges in the same order
> > they appear in the
> > + *              interval tree. This is useful to keep iterating over
> > ranges while
> > + *              modifying the RB tree.
> > + * @flags.removed: Flag indicating whether the MMU interval notifier
> > has been
> > + *                 removed
> > + *
> > + * This structure represents a GPU SVM notifier.
> > + */
> > +struct drm_gpusvm_notifier {
> > +	struct drm_gpusvm *gpusvm;
> > +	struct mmu_interval_notifier notifier;
> > +	struct {
> > +		u64 start;
> > +		u64 end;
> > +	} interval;
> > +	struct {
> > +		struct rb_node node;
> > +		struct list_head entry;
> > +		u64 __subtree_last;
> > +	} rb;
> > +	struct rb_root_cached root;
> > +	struct list_head range_list;
> > +	struct {
> > +		u32 removed : 1;
> > +	} flags;
> > +};
> > +
> > +/**
> > + * struct drm_gpusvm_range - Structure representing a GPU SVM range
> > + *
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier
> > + * @refcount: Reference count for the range
> > + * @rb: Red-black tree node for the parent GPU SVM notifier
> > structure range tree
> > + * @va: Virtual address range
> > + * @notifier_seq: Notifier sequence number of the range's pages
> > + * @pages: Pointer to the array of pages (if backing store is in
> > VRAM)
> > + * @dma_addr: DMA address array (if backing store is SRAM and DMA
> > mapped)
> > + * @vram_allocation: Driver-private pointer to the VRAM allocation
> > + * @order: Order of dma mapping. i.e. PAGE_SIZE << order is mapping
> > size
> > + * @flags.migrate_vram: Flag indicating whether the range can be
> > migrated to VRAM
> > + * @flags.unmapped: Flag indicating if the range has been unmapped
> > + * @flags.partial_unmap: Flag indicating if the range has been
> > partially unmapped
> > + * @flags.has_vram_pages: Flag indicating if the range has vram
> > pages
> > + * @flags.has_dma_mapping: Flag indicating if the range has a DMA
> > mapping
> > + * @flags.kfree_mapping: Flag indicating @dma_addr is a compact
> > allocation based
> > + *                       on @order which releases via kfree
> > + *
> > + * This structure represents a GPU SVM range used for tracking
> > memory ranges
> > + * mapped in a DRM device.
> > + */
> > +struct drm_gpusvm_range {
> > +	struct drm_gpusvm *gpusvm;
> > +	struct drm_gpusvm_notifier *notifier;
> > +	struct kref refcount;
> > +	struct {
> > +		struct rb_node node;
> > +		struct list_head entry;
> > +		u64 __subtree_last;
> > +	} rb;
> > +	struct {
> > +		u64 start;
> > +		u64 end;
> > +	} va;
> > +	unsigned long notifier_seq;
> > +	union {
> > +		struct page **pages;
> > +		dma_addr_t *dma_addr;
> > +	};
> > +	void *vram_allocation;
> > +	u16 order;
> > +	struct {
> > +		/* All flags below must be set upon creation */
> > +		u16 migrate_vram : 1;
> > +		/* All flags below must be set / cleared under
> > notifier lock */
> > +		u16 unmapped : 1;
> > +		u16 partial_unmap : 1;
> > +		u16 has_vram_pages : 1;
> > +		u16 has_dma_mapping : 1;
> > +		u16 kfree_mapping : 1;
> > +	} flags;
> > +};
> > +
> > +/**
> > + * struct drm_gpusvm - GPU SVM structure
> > + *
> > + * @name: Name of the GPU SVM
> > + * @drm: Pointer to the DRM device structure
> > + * @mm: Pointer to the mm_struct for the address space
> > + * @device_private_page_owner: Device private pages owner
> > + * @mm_start: Start address of GPU SVM
> > + * @mm_range: Range of the GPU SVM
> > + * @notifier_size: Size of individual notifiers
> > + * @ops: Pointer to the operations structure for GPU SVM
> > + * @chunk_sizes: Pointer to the array of chunk sizes used in range
> > allocation.
> > + *               Entries should be powers of 2 in descending order.
> > + * @num_chunks: Number of chunks
> > + * @notifier_lock: Read-write semaphore for protecting notifier
> > operations
> > + * @zdd_wq: Workqueue for deferred work on zdd destruction
> > + * @root: Cached root node of the Red-Black tree containing GPU SVM
> > notifiers
> > + * @notifier_list: List head of notifiers in the same
> > order they
> > + *                 appear in the interval tree. This is useful to keep
> > iterating over
> > + *                 notifiers while modifying the RB tree.
> > + *
> > + * This structure represents a GPU SVM (Shared Virtual Memory) used
> > for tracking
> > + * memory ranges mapped in a DRM (Direct Rendering Manager) device.
> > + *
> > + * No reference counting is provided, as this is expected to be
> > embedded in the
> > + * driver VM structure along with the struct drm_gpuvm, which
> > handles reference
> > + * counting.
> > + */
> > +struct drm_gpusvm {
> > +	const char *name;
> > +	struct drm_device *drm;
> > +	struct mm_struct *mm;
> > +	void *device_private_page_owner;
> > +	u64 mm_start;
> > +	u64 mm_range;
> > +	u64 notifier_size;
> > +	const struct drm_gpusvm_ops *ops;
> > +	const u64 *chunk_sizes;
> > +	int num_chunks;
> > +	struct rw_semaphore notifier_lock;
> > +	struct workqueue_struct *zdd_wq;
> > +	struct rb_root_cached root;
> > +	struct list_head notifier_list;
> > +};
> > +
> > +/**
> > + * struct drm_gpusvm_ctx - DRM GPU SVM context
> > + *
> > + * @mmap_locked: mmap lock is locked
> > + * @trylock_mmap: trylock mmap lock, used to avoid locking
> > inversions
> > + *                (e.g. dma-resv -> mmap lock)
> > + * @in_notifier: entering from a MMU notifier
> > + * @read_only: operating on read-only memory
> > + * @vram_possible: possible to use VRAM
> > + * @prefault: prefault pages
> > + *
> > + * Context that DRM GPU SVM is operating in (i.e. user arguments).
> > + */
> > +struct drm_gpusvm_ctx {
> > +	u32 mmap_locked :1;
> > +	u32 trylock_mmap :1;
> > +	u32 in_notifier :1;
> > +	u32 read_only :1;
> > +	u32 vram_possible :1;
> > +	u32 prefault :1;
> > +};
> > +
> > +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> > +		    const char *name, struct drm_device *drm,
> > +		    struct mm_struct *mm, void
> > *device_private_page_owner,
> > +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> > +		    const struct drm_gpusvm_ops *ops,
> > +		    const u64 *chunk_sizes, int num_chunks);
> > +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm);
> > +void drm_gpusvm_free(struct drm_gpusvm *gpusvm);
> > +
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64
> > fault_addr,
> > +				u64 gpuva_start, u64 gpuva_end,
> > +				const struct drm_gpusvm_ctx *ctx);
> > +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> > +			     struct drm_gpusvm_range *range);
> > +
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_get(struct drm_gpusvm_range *range);
> > +void drm_gpusvm_range_put(struct drm_gpusvm_range *range);
> > +
> > +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> > +				  struct drm_gpusvm_range *range);
> > +
> > +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       const struct drm_gpusvm_ctx *ctx);
> > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > +				  struct drm_gpusvm_range *range,
> > +				  const struct drm_gpusvm_ctx *ctx);
> > +
> > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       void *vram_allocation,
> > +			       const struct drm_gpusvm_ctx *ctx);
> > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       const struct drm_gpusvm_ctx *ctx);
> > +
> > +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void);
> > +
> > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start,
> > u64 end);
> > +
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64
> > start, u64 end);
> > +
> > +/**
> > + * drm_gpusvm_notifier_lock - Lock GPU SVM notifier
> > + * @gpusvm__: Pointer to the GPU SVM structure.
> > + *
> > + * Abstract client usage GPU SVM notifier lock, take lock
> > + */
> > +#define drm_gpusvm_notifier_lock(gpusvm__)	\
> > +	down_read(&(gpusvm__)->notifier_lock)
> > +
> > +/**
> > + * drm_gpusvm_notifier_unlock - Unlock GPU SVM notifier
> > + * @gpusvm__: Pointer to the GPU SVM structure.
> > + *
> > + * Abstract client usage GPU SVM notifier lock, drop lock
> > + */
> > +#define drm_gpusvm_notifier_unlock(gpusvm__)	\
> > +	up_read(&(gpusvm__)->notifier_lock)
> > +
> > +/**
> > + * __drm_gpusvm_range_next - Get the next GPU SVM range in the list
> > + * @range: a pointer to the current GPU SVM range
> > + *
> > + * Return: A pointer to the next drm_gpusvm_range if available, or
> > NULL if the
> > + *         current range is the last one or if the input range is
> > NULL.
> > + */
> > +static inline struct drm_gpusvm_range *
> > +__drm_gpusvm_range_next(struct drm_gpusvm_range *range)
> > +{
> > +	if (range && !list_is_last(&range->rb.entry,
> > +				   &range->notifier->range_list))
> > +		return list_next_entry(range, rb.entry);
> > +
> > +	return NULL;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_for_each_range - Iterate over GPU SVM ranges in a
> > notifier
> > + * @range__: Iterator variable for the ranges. If set, it indicates
> > the start of
> > + *	     the iterator. If NULL, call drm_gpusvm_range_find() to
> > get the range.
> > + * @notifier__: Pointer to the GPU SVM notifier
> > + * @start__: Start address of the range
> > + * @end__: End address of the range
> > + *
> > + * This macro is used to iterate over GPU SVM ranges in a notifier.
> > It is safe
> > + * to use while holding the driver SVM lock or the notifier lock.
> > + */
> > +#define drm_gpusvm_for_each_range(range__, notifier__, start__,
> > end__)	\
> > +	for ((range__) = (range__)
> > ?:					\
> > +	     drm_gpusvm_range_find((notifier__), (start__),
> > (end__));	\
> > +	     (range__) && (range__->va.start <
> > (end__));		\
> > +	     (range__) = __drm_gpusvm_range_next(range__))
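
A rough sketch of how a driver's ops->invalidate hook might use this iterator
together with drm_gpusvm_range_unmap_pages() and
drm_gpusvm_range_set_unmapped(); the my_* helpers are hypothetical:

static void my_gpusvm_invalidate(struct drm_gpusvm *gpusvm,
				 struct drm_gpusvm_notifier *notifier,
				 const struct mmu_notifier_range *mmu_range)
{
	struct drm_gpusvm_ctx ctx = { .in_notifier = true, };
	struct drm_gpusvm_range *range = NULL;

	/* Zap GPU page tables for the invalidated interval (driver specific). */
	my_zap_gpu_ptes(gpusvm, mmu_range->start, mmu_range->end);

	/* The notifier lock is held here, so walking the range list is safe. */
	drm_gpusvm_for_each_range(range, notifier, mmu_range->start,
				  mmu_range->end) {
		drm_gpusvm_range_unmap_pages(gpusvm, range, &ctx);

		if (mmu_range->event == MMU_NOTIFY_UNMAP) {
			drm_gpusvm_range_set_unmapped(range, mmu_range);
			my_garbage_collector_add(gpusvm, range);
		}
	}
}
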
> > +
> > +/**
> > + * drm_gpusvm_range_set_unmapped - Mark a GPU SVM range as unmapped
> > + * @range: Pointer to the GPU SVM range structure.
> > + * @mmu_range: Pointer to the MMU notifier range structure.
> > + *
> > + * This function marks a GPU SVM range as unmapped and sets the
> > partial_unmap flag
> > + * if the range partially falls within the provided MMU notifier
> > range.
> > + */
> > +static inline void
> > +drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range *range,
> > +			      const struct mmu_notifier_range
> > *mmu_range)
> > +{
> > +	lockdep_assert_held_write(&range->gpusvm->notifier_lock);
> > +
> > +	range->flags.unmapped = true;
> > +	if (range->va.start < mmu_range->start ||
> > +	    range->va.end > mmu_range->end)
> > +		range->flags.partial_unmap = true;
> > +}
> > +
> > +#endif /* __DRM_GPUSVM_H__ */
>
Matthew Brost Aug. 29, 2024, 6:13 p.m. UTC | #13
On Thu, Aug 29, 2024 at 05:45:07PM +0000, Matthew Brost wrote:
> On Thu, Aug 29, 2024 at 11:16:49AM +0200, Thomas Hellström wrote:
> > Hi, Matt. 
> > 
> > Some initial design comments / questions:
> > 
> > On Tue, 2024-08-27 at 19:48 -0700, Matthew Brost wrote:
> > > This patch introduces support for GPU Shared Virtual Memory (SVM) in
> > > the
> > > Direct Rendering Manager (DRM) subsystem. SVM allows for seamless
> > > sharing of memory between the CPU and GPU, enhancing performance and
> > > flexibility in GPU computing tasks.
> > > 
> > > The patch adds the necessary infrastructure for SVM, including data
> > > structures and functions for managing SVM ranges and notifiers. It
> > > also
> > > provides mechanisms for allocating, deallocating, and migrating
> > > memory
> > > regions between system RAM and GPU VRAM.
> > > 
> > > This mid-layer is largely inspired by GPUVM.
> > > 
> > > Cc: Dave Airlie <airlied@redhat.com>
> > > Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > > Cc: Christian König <christian.koenig@amd.com>
> > > Cc: <dri-devel@lists.freedesktop.org>
> > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > ---
> > >  drivers/gpu/drm/xe/Makefile     |    3 +-
> > >  drivers/gpu/drm/xe/drm_gpusvm.c | 2174
> > > +++++++++++++++++++++++++++++++
> > >  drivers/gpu/drm/xe/drm_gpusvm.h |  415 ++++++
> > >  3 files changed, 2591 insertions(+), 1 deletion(-)
> > >  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.c
> > >  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.h
> > > 
> > > diff --git a/drivers/gpu/drm/xe/Makefile
> > > b/drivers/gpu/drm/xe/Makefile
> > > index b9670ae09a9e..b8fc2ee58f1a 100644
> > > --- a/drivers/gpu/drm/xe/Makefile
> > > +++ b/drivers/gpu/drm/xe/Makefile
> > > @@ -25,7 +25,8 @@ $(obj)/generated/%_wa_oob.c
> > > $(obj)/generated/%_wa_oob.h: $(obj)/xe_gen_wa_oob \
> > >  
> > >  # core driver code
> > >  
> > > -xe-y += xe_bb.o \
> > > +xe-y += drm_gpusvm.o \
> > > +	xe_bb.o \
> > >  	xe_bo.o \
> > >  	xe_bo_evict.o \
> > >  	xe_devcoredump.o \
> > > diff --git a/drivers/gpu/drm/xe/drm_gpusvm.c
> > > b/drivers/gpu/drm/xe/drm_gpusvm.c
> > > new file mode 100644
> > > index 000000000000..fc1e44e6ae72
> > > --- /dev/null
> > > +++ b/drivers/gpu/drm/xe/drm_gpusvm.c
> > > @@ -0,0 +1,2174 @@
> > > +// SPDX-License-Identifier: MIT
> > > +/*
> > > + * Copyright © 2024 Intel Corporation
> > > + *
> > > + * Authors:
> > > + *     Matthew Brost <matthew.brost@intel.com>
> > > + */
> > > +
> > > +#include <linux/dma-mapping.h>
> > > +#include <linux/interval_tree_generic.h>
> > > +#include <linux/hmm.h>
> > > +#include <linux/memremap.h>
> > > +#include <linux/migrate.h>
> > > +#include <linux/mm_types.h>
> > > +#include <linux/pagemap.h>
> > > +#include <linux/slab.h>
> > > +
> > > +#include <drm/drm_device.h>
> > > +#include "drm_gpusvm.h"
> > > +
> > > +/**
> > > + * DOC: Overview
> > > + *
> > > + * GPU Shared Virtual Memory (GPU SVM) layer for the Direct
> > > Rendering Manager (DRM)
> > > + *
> > > + * The GPU SVM layer is a component of the DRM framework designed to
> > > manage shared
> > > + * virtual memory between the CPU and GPU. It enables efficient data
> > > exchange and
> > > + * processing for GPU-accelerated applications by allowing memory
> > > sharing and
> > > + * synchronization between the CPU's and GPU's virtual address
> > > spaces.
> > > + *
> > > + * Key GPU SVM Components:
> > > + * - Notifiers: Used for tracking memory intervals and
> > > notifying the
> > > + *		GPU of changes, notifiers are sized based on a GPU
> > > SVM
> > > + *		initialization parameter, with a recommendation of
> > > 512M or
> > > + *		larger. They maintain a Red-Black tree and a list of
> > > ranges that
> > > + *		fall within the notifier interval. Notifiers are
> > > tracked within
> > > + *		a GPU SVM Red-Black tree and list and are
> > > dynamically inserted
> > > + *		or removed as ranges within the interval are created
> > > or
> > > + *		destroyed.
> > 
> > What is the benefit of this extra layer compared to direct insertion of
> > ranges using mmu_interval_notifier_insert?
> > 
> > IIRC the argument made previously about having wide notifiers was that
> > the rb tree lookups inside the core were costly and if there were only
> > a few, then the rb tree lookups within a notifier range could be
> > replaced with the page-table radix-tree-like lookup, so each lookup
> > complexity would be O(log(n_notifiers) + page_table_depth).
> > 
> > But now we have first an rb-tree lookup in the core and then an rb-tree
> > lookup within each notifier yielding O(log(n_ranges))
> > 
> > I can see a small benefit in that inserting directly into the core rb-
> > tree will block pending ongoing invalidations, but at a cost of an
> > extra multiplexing layer.
> > 
> 
> So when the notifier is triggered the search is a smaller range. In a
> perfect world eventually I'd like to drop the SVM range completely.
> There are a lot of changes required in Xe to make that possible, and I'm not
> entirely convinced it is possible or that the ROI is worth it (additional
> complexity vs. perf benefit). For now, this was a relatively simple way
> to get SVM working (mirrors both AMD's and Nvidia's implementations with
> respect to having a range concept) but is also flexible in that the notifier
> size can be easily tweaked via a modparam [1] following Jason's
> suggestion of larger notifiers.
> 
> [1] https://patchwork.freedesktop.org/patch/611007/?series=137870&rev=1
> 

Sorry for the double reply. Also worth noting that attaching ranges to the
notifiers easily lets us know when a notifier can be removed (no
ranges attached).

Lastly, if it isn't clear via [1], we can use one notifier for the entire
VA space; this design works for that too.
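
Roughly like this (the my_* names and values are made up, just to illustrate
the knob):

static int my_svm_init(struct my_vm *vm)
{
	/* One notifier spanning the whole VA range; chunk sizes illustrative. */
	static const u64 chunk_sizes[] = { SZ_2M, SZ_64K, SZ_4K };

	return drm_gpusvm_init(&vm->svm, "my-svm", vm->drm, vm->mm,
			       vm->pagemap_owner, vm->start, vm->size,
			       vm->size /* notifier_size == entire VA space */,
			       &my_gpusvm_ops, chunk_sizes,
			       ARRAY_SIZE(chunk_sizes));
}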

Matt

> > > + * - Ranges: Represent memory ranges mapped in a DRM device and
> > > managed
> > > + *	     by GPU SVM. They are sized based on an array of chunk
> > > sizes, which
> > > + *	     is a GPU SVM initialization parameter, and the CPU
> > > address space.
> > > + *	     Upon GPU fault, the largest aligned chunk that fits
> > > within the
> > > + *	     faulting CPU address space is chosen for the range
> > > size. Ranges are
> > > + *	     expected to be dynamically allocated on GPU fault and
> > > removed on an
> > > + *	     MMU notifier UNMAP event. As mentioned above, ranges
> > > are tracked in
> > > + *	     a notifier's Red-Black tree.
> > 
> > How do ranges and chunks map to
> >  
> > a) Prefaulting granularity
> > b) Migration granularity?
> > 
> > > + * - Operations: Define the interface for driver-specific SVM
> > > operations such as
> > > + *		 allocation, page collection, migration,
> > > invalidations, and VRAM
> > > + *		 release.
> > > + *
> > > + * This layer provides interfaces for allocating, mapping,
> > > migrating, and
> > > + * releasing memory ranges between the CPU and GPU. It handles all
> > > core memory
> > > + * management interactions (DMA mapping, HMM, and migration) and
> > > provides
> > > + * driver-specific virtual functions (vfuncs). This infrastructure
> > > is sufficient
> > > + * to build the expected driver components for an SVM implementation
> > > as detailed
> > > + * below.
> > > + *
> > > + * Expected Driver Components:
> > > + * - GPU page fault handler: Used to create ranges and notifiers
> > > based on the
> > > + *			     fault address, optionally migrate the
> > > range to
> > > + *			     VRAM, and create GPU bindings.
> > > + * - Garbage collector: Used to destroy GPU bindings for ranges.
> > > Ranges are
> > > + *			expected to be added to the garbage
> > > collector upon
> > > + *			MMU_NOTIFY_UNMAP event.
> > > + */
> > > +
> > > +/**
> > > + * DOC: Locking
> > > + *
> > > + * GPU SVM handles locking for core MM interactions, i.e., it
> > > locks/unlocks the
> > > + * mmap lock as needed. Alternatively, if the driver prefers to
> > > handle the mmap
> > > + * lock itself, a 'locked' argument is provided to the functions
> > > that require
> > > + * the mmap lock. This option may be useful for drivers that need to
> > > call into
> > > + * GPU SVM while also holding a dma-resv lock, thus preventing
> > > locking
> > > + * inversions between the mmap and dma-resv locks.
> > > + *
> > > + * GPU SVM introduces a global notifier lock, which safeguards the
> > > notifier's
> > > + * range RB tree and list, as well as the range's DMA mappings and
> > > sequence
> > > + * number. GPU SVM manages all necessary locking and unlocking
> > > operations,
> > > + * except for the recheck of the range's sequence number
> > > + * (mmu_interval_read_retry) when the driver is committing GPU
> > > bindings. This
> > > + * lock corresponds to the 'driver->update' lock mentioned in the
> > > HMM
> > > + * documentation (TODO: Link). Future revisions may transition from
> > > a GPU SVM
> > > + * global lock to a per-notifier lock if finer-grained locking is
> > > deemed
> > > + * necessary.
> > > + *
> > > + * In addition to the locking mentioned above, the driver should
> > > implement a
> > > + * lock to safeguard core GPU SVM function calls that modify state,
> > > such as
> > > + * drm_gpusvm_range_find_or_insert and drm_gpusvm_range_remove.
> > > Alternatively,
> > > + * these core functions can be called within a single kernel thread,
> > > for
> > > + * instance, using an ordered work queue. This lock is denoted as
> > > + * 'driver_svm_lock' in code examples.
> > > + */
> > > +
> > > +/**
> > > + * DOC: Migration
> > > + *
> > > + * The migration support is quite simple, allowing migration between
> > > SRAM and
> > > + * VRAM at the range granularity. For example, GPU SVM currently
> > > does not
> > > + * support mixing SRAM and VRAM pages within a range. This means
> > > that upon GPU
> > > + * fault, the entire range can be migrated to VRAM, and upon CPU
> > > fault, the
> > > + * entire range is migrated to SRAM.
> > > + *
> > > + * The reasoning for only supporting range granularity is as
> > > follows: it
> > > + * simplifies the implementation, and range sizes are driver-defined
> > > and should
> > > + * be relatively small.
> > > + */
> > > +
> > > +/**
> > > + * DOC: Partial Unmapping of Ranges
> > > + *
> > > + * Partial unmapping of ranges (e.g., 1M out of 2M is unmapped by
> > > CPU resulting
> > > + * in MMU_NOTIFY_UNMAP event) presents several challenges, with the
> > > main one
> > > + * being that a subset of the range still has CPU and GPU mappings.
> > > If the
> > > + * backing store for the range is in VRAM, a subset of the backing
> > > store has
> > > + * references. One option would be to split the range and VRAM
> > > backing store,
> > > + * but the implementation for this would be quite complicated. Given
> > > that
> > > + * partial unmappings are rare and driver-defined range sizes are
> > > relatively
> > > + * small, GPU SVM does not support splitting of ranges.
> > > + *
> > > + * With no support for range splitting, upon partial unmapping of a
> > > range, the
> > > + * driver is expected to invalidate and destroy the entire range. If
> > > the range
> > > + * has VRAM as its backing, the driver is also expected to migrate
> > > any remaining
> > > + * pages back to SRAM.
> > 
> > So what happens if we get a one-page invalidation, say protection
> > change event, or NUMA accounting event, in the middle of a range? Can
> > we unmap just that single gpu pte covering that range, that is, how do
> > the ranges map to invalidation granularity? Does this differ between
> > igfx and dgfx?
> 
> Well, the idea of chunks is that ranges should be 1 GPU page (the chunk
> array in Xe is 4k, 64k, and 2M). The design is flexible enough that this
> doesn't have to be true, but it is optimized for the assumption that each
> range is most likely 1 GPU page. If this isn't true, then all GPU pages in
> the range are invalidated, which isn't ideal but keeps things simple,
> which IMO far outweighs the potential benefits. In theory a driver could
> implement splitting / partial invalidations too with a couple of updates
> to GPUSVM, but that would likely largely be a driver implementation rather
> than GPUSVM.
> 
> No difference between igfx and dgfx.
> 
> You bring up a good point about protection changes; I likely haven't
> fully gotten that part of the implementation correct either. I can add this
> to my TODO list and also update my IGTs to do things like this.
> 
> Matt
> 
> > 
> > Thanks,
> > Thomas
> > 
> > 
> > 
> > 
> > > + */
> > > +
> > > +/**
> > > + * DOC: Examples
> > > + *
> > > + * This section provides two examples of how to build the expected
> > > driver
> > > + * components: the GPU page fault handler and the garbage collector.
> > > A third
> > > + * example demonstrates a sample invalidation driver vfunc.
> > > + *
> > > + * The generic code provided does not include logic for complex
> > > migration
> > > + * policies, optimized invalidations, or other potentially required
> > > driver
> > > + * locking (e.g., DMA-resv locks).
> > > + *
> > > + * 1) GPU page fault handler
> > > + *
> > > + *	int driver_bind_range(struct drm_gpusvm *gpusvm, struct
> > > drm_gpusvm_range *range)
> > > + *	{
> > > + *		int err = 0;
> > > + *
> > > + *		driver_alloc_and_setup_memory_for_bind(gpusvm,
> > > range);
> > > + *
> > > + *		drm_gpusvm_notifier_lock(gpusvm);
> > > + *		if (drm_gpusvm_range_pages_valid(range))
> > > + *			driver_commit_bind(gpusvm, range);
> > > + *		else
> > > + *			err = -EAGAIN;
> > > + *		drm_gpusvm_notifier_unlock(gpusvm);
> > > + *
> > > + *		return err;
> > > + *	}
> > > + *
> > > + *	int driver_gpu_fault(struct drm_gpusvm *gpusvm, u64
> > > fault_addr,
> > > + *			     u64 gpuva_start, u64 gpuva_end)
> > > + *	{
> > > + *		struct drm_gpusvm_ctx ctx = {};
> > > + *		int err;
> > > + *
> > > + *		driver_svm_lock();
> > > + *	retry:
> > > + *		// Always process UNMAPs first so view of GPU SVM
> > > ranges is current
> > > + *		driver_garbage_collector(gpusvm);
> > > + *
> > > + *		range = drm_gpusvm_range_find_or_insert(gpusvm,
> > > fault_addr,
> > > + *							gpuva_start,
> > > gpuva_end,
> > > + *						        &ctx);
> > > + *		if (IS_ERR(range)) {
> > > + *			err = PTR_ERR(range);
> > > + *			goto unlock;
> > > + *		}
> > > + *
> > > + *		if (driver_migration_policy(range)) {
> > > + *			bo = driver_alloc_bo();
> > > + *			err = drm_gpusvm_migrate_to_vram(gpusvm,
> > > range, bo, &ctx);
> > > + *			if (err)	// CPU mappings may have
> > > changed
> > > + *				goto retry;
> > > + *		}
> > > + *
> > > + *		err = drm_gpusvm_range_get_pages(gpusvm, range,
> > > &ctx);
> > > + *		if (err == -EFAULT || err == -EPERM)	// CPU
> > > mappings changed
> > > + *			goto retry;
> > > + *		else if (err)
> > > + *			goto unlock;
> > > + *
> > > + *		err = driver_bind_range(gpusvm, range);
> > > + *		if (err == -EAGAIN)	// CPU mappings changed
> > > + *			goto retry
> > > + *
> > > + *	unlock:
> > > + *		driver_svm_unlock();
> > > + *		return err;
> > > + *	}
> > > + *
> > > + * 2) Garbage Collector.
> > > + *
> > > + *	void __driver_garbage_collector(struct drm_gpusvm *gpusvm,
> > > + *					struct drm_gpusvm_range
> > > *range)
> > > + *	{
> > > + *		struct drm_gpusvm_ctx ctx = {};
> > > + *
> > > + *		assert_driver_svm_locked(gpusvm);
> > > + *
> > > + *		// Partial unmap, migrate any remaining VRAM pages
> > > back to SRAM
> > > + *		if (range->flags.partial_unmap)
> > > + *			drm_gpusvm_migrate_to_sram(gpusvm, range,
> > > &ctx);
> > > + *
> > > + *		driver_unbind_range(range);
> > > + *		drm_gpusvm_range_remove(gpusvm, range);
> > > + *	}
> > > + *
> > > + *	void driver_garbage_collector(struct drm_gpusvm *gpusvm)
> > > + *	{
> > > + *		assert_driver_svm_locked(gpusvm);
> > > + *
> > > + *		for_each_range_in_garbage_collector(gpusvm, range)
> > > + *			__driver_garbage_collector(gpusvm, range);
> > > + *	}
> > > + *
> > > + * 3) Invalidation driver vfunc.
> > > + *
> > > + *	void driver_invalidation(struct drm_gpusvm *gpusvm,
> > > + *				 struct drm_gpusvm_notifier
> > > *notifier,
> > > + *				 const struct mmu_notifier_range
> > > *mmu_range)
> > > + *	{
> > > + *		struct drm_gpusvm_ctx ctx = { .in_notifier = true,
> > > };
> > > + *		struct drm_gpusvm_range *range = NULL;
> > > + *
> > > + *		driver_invalidate_device_tlb(gpusvm, mmu_range-
> > > >start, mmu_range->end);
> > > + *
> > > + *		drm_gpusvm_for_each_range(range, notifier,
> > > mmu_range->start,
> > > + *					  mmu_range->end) {
> > > + *			drm_gpusvm_range_unmap_pages(gpusvm, range,
> > > &ctx);
> > > + *
> > > + *			if (mmu_range->event != MMU_NOTIFY_UNMAP)
> > > + *				continue;
> > > + *
> > > + *			drm_gpusvm_range_set_unmapped(range,
> > > mmu_range);
> > > + *			driver_garbage_collector_add(gpusvm, range);
> > > + *		}
> > > + *	}
> > > + */
> > > +
> > > +#define DRM_GPUSVM_RANGE_START(_range)	((_range)->va.start)
> > > +#define DRM_GPUSVM_RANGE_END(_range)	((_range)->va.end - 1)
> > > +INTERVAL_TREE_DEFINE(struct drm_gpusvm_range, rb.node, u64,
> > > rb.__subtree_last,
> > > +		     DRM_GPUSVM_RANGE_START, DRM_GPUSVM_RANGE_END,
> > > +		     static __maybe_unused, range);
> > > +
> > > +#define DRM_GPUSVM_NOTIFIER_START(_notifier)	((_notifier)-
> > > >interval.start)
> > > +#define DRM_GPUSVM_NOTIFIER_END(_notifier)	((_notifier)-
> > > >interval.end - 1)
> > > +INTERVAL_TREE_DEFINE(struct drm_gpusvm_notifier, rb.node, u64,
> > > +		     rb.__subtree_last, DRM_GPUSVM_NOTIFIER_START,
> > > +		     DRM_GPUSVM_NOTIFIER_END, static __maybe_unused,
> > > notifier);
> > > +
> > > +/**
> > > + * npages_in_range() - Calculate the number of pages in a given
> > > range
> > > + * @start__: The start address of the range
> > > + * @end__: The end address of the range
> > > + *
> > > + * This macro calculates the number of pages in a given memory
> > > range,
> > > + * specified by the start and end addresses. It divides the
> > > difference
> > > + * between the end and start addresses by the page size (PAGE_SIZE)
> > > to
> > > + * determine the number of pages in the range.
> > > + *
> > > + * Return: The number of pages in the specified range.
> > > + */
> > > +#define npages_in_range(start__, end__)	\
> > > +	(((end__) - (start__)) >> PAGE_SHIFT)
> > > +
> > > +/**
> > > + * struct drm_gpusvm_zdd - GPU SVM zone device data
> > > + *
> > > + * @refcount: Reference count for the zdd
> > > + * @destroy_work: Work structure for asynchronous zdd destruction
> > > + * @range: Pointer to the GPU SVM range
> > > + * @vram_allocation: Driver-private pointer to the VRAM allocation
> > > + *
> > > + * This structure serves as a generic wrapper installed in
> > > + * page->zone_device_data. It provides infrastructure for looking up
> > > a range
> > > + * upon CPU page fault and asynchronously releasing VRAM once the
> > > CPU has no
> > > + * page references. Asynchronous release is useful because CPU page
> > > references
> > > + * can be dropped in IRQ contexts, while releasing VRAM likely
> > > requires sleeping
> > > + * locks.
> > > + */
> > > +struct drm_gpusvm_zdd {
> > > +	struct kref refcount;
> > > +	struct work_struct destroy_work;
> > > +	struct drm_gpusvm_range *range;
> > > +	void *vram_allocation;
> > > +};
> > > +
> > > +/**
> > > + * drm_gpusvm_zdd_destroy_work_func - Work function for destroying a
> > > zdd
> > > + * @w: Pointer to the work_struct
> > > + *
> > > + * This function releases VRAM, puts GPU SVM range, and frees zdd.
> > > + */
> > > +static void drm_gpusvm_zdd_destroy_work_func(struct work_struct *w)
> > > +{
> > > +	struct drm_gpusvm_zdd *zdd =
> > > +		container_of(w, struct drm_gpusvm_zdd,
> > > destroy_work);
> > > +	struct drm_gpusvm_range *range = zdd->range;
> > > +	struct drm_gpusvm *gpusvm = range->gpusvm;
> > > +
> > > +	if (gpusvm->ops->vram_release && zdd->vram_allocation)
> > > +		gpusvm->ops->vram_release(zdd->vram_allocation);
> > > +	drm_gpusvm_range_put(range);
> > > +	kfree(zdd);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_zdd_alloc - Allocate a zdd structure.
> > > + * @range: Pointer to the GPU SVM range.
> > > + *
> > > + * This function allocates and initializes a new zdd structure. It
> > > sets up the
> > > + * reference count, initializes the destroy work, and links the
> > > provided GPU SVM
> > > + * range.
> > > + *
> > > + * Returns:
> > > + * Pointer to the allocated zdd on success, NULL on failure.
> > > + */
> > > +static struct drm_gpusvm_zdd *
> > > +drm_gpusvm_zdd_alloc(struct drm_gpusvm_range *range)
> > > +{
> > > +	struct drm_gpusvm_zdd *zdd;
> > > +
> > > +	zdd = kmalloc(sizeof(*zdd), GFP_KERNEL);
> > > +	if (!zdd)
> > > +		return NULL;
> > > +
> > > +	kref_init(&zdd->refcount);
> > > +	INIT_WORK(&zdd->destroy_work,
> > > drm_gpusvm_zdd_destroy_work_func);
> > > +	zdd->range = drm_gpusvm_range_get(range);
> > > +	zdd->vram_allocation = NULL;
> > > +
> > > +	return zdd;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_zdd_get - Get a reference to a zdd structure.
> > > + * @zdd: Pointer to the zdd structure.
> > > + *
> > > + * This function increments the reference count of the provided zdd
> > > structure.
> > > + *
> > > + * Returns: Pointer to the zdd structure.
> > > + */
> > > +static struct drm_gpusvm_zdd *drm_gpusvm_zdd_get(struct
> > > drm_gpusvm_zdd *zdd)
> > > +{
> > > +	kref_get(&zdd->refcount);
> > > +	return zdd;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_zdd_destroy - Destroy a zdd structure.
> > > + * @ref: Pointer to the reference count structure.
> > > + *
> > > + * This function queues the destroy_work of the zdd for asynchronous
> > > destruction.
> > > + */
> > > +static void drm_gpusvm_zdd_destroy(struct kref *ref)
> > > +{
> > > +	struct drm_gpusvm_zdd *zdd =
> > > +		container_of(ref, struct drm_gpusvm_zdd, refcount);
> > > +	struct drm_gpusvm *gpusvm = zdd->range->gpusvm;
> > > +
> > > +	queue_work(gpusvm->zdd_wq, &zdd->destroy_work);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_zdd_put - Put a zdd reference.
> > > + * @zdd: Pointer to the zdd structure.
> > > + *
> > > + * This function decrements the reference count of the provided zdd
> > > structure
> > > + * and schedules its destruction if the count drops to zero.
> > > + */
> > > +static void drm_gpusvm_zdd_put(struct drm_gpusvm_zdd *zdd)
> > > +{
> > > +	kref_put(&zdd->refcount, drm_gpusvm_zdd_destroy);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_find - Find GPU SVM range from GPU SVM notifier
> > > + * @notifier: Pointer to the GPU SVM notifier structure.
> > > + * @start: Start address of the range
> > > + * @end: End address of the range
> > > + *
> > > + * Return: A pointer to the drm_gpusvm_range if found or NULL
> > > + */
> > > +struct drm_gpusvm_range *
> > > +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64
> > > start, u64 end)
> > > +{
> > > +	return range_iter_first(&notifier->root, start, end - 1);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_for_each_range_safe - Safely iterate over GPU SVM
> > > ranges in a notifier
> > > + * @range__: Iterator variable for the ranges
> > > + * @next__: Iterator variable for the ranges temporary storage
> > > + * @notifier__: Pointer to the GPU SVM notifier
> > > + * @start__: Start address of the range
> > > + * @end__: End address of the range
> > > + *
> > > + * This macro is used to iterate over GPU SVM ranges in a notifier
> > > while
> > > + * removing ranges from it.
> > > + */
> > > +#define drm_gpusvm_for_each_range_safe(range__, next__, notifier__,
> > > start__, end__)	\
> > > +	for ((range__) = drm_gpusvm_range_find((notifier__),
> > > (start__), (end__)),	\
> > > +	     (next__) =
> > > __drm_gpusvm_range_next(range__);				\
> > > +	     (range__) && (range__->va.start <
> > > (end__));				\
> > > +	     (range__) = (next__), (next__) =
> > > __drm_gpusvm_range_next(range__))
> > > +
> > > +/**
> > > + * __drm_gpusvm_notifier_next - get the next drm_gpusvm_notifier in
> > > the list
> > > + * @notifier: a pointer to the current drm_gpusvm_notifier
> > > + *
> > > + * Return: A pointer to the next drm_gpusvm_notifier if available,
> > > or NULL if
> > > + *         the current notifier is the last one or if the input
> > > notifier is
> > > + *         NULL.
> > > + */
> > > +static struct drm_gpusvm_notifier *
> > > +__drm_gpusvm_notifier_next(struct drm_gpusvm_notifier *notifier)
> > > +{
> > > +	if (notifier && !list_is_last(&notifier->rb.entry,
> > > +				      &notifier->gpusvm-
> > > >notifier_list))
> > > +		return list_next_entry(notifier, rb.entry);
> > > +
> > > +	return NULL;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_for_each_notifier - Iterate over GPU SVM notifiers in
> > > a gpusvm
> > > + * @notifier__: Iterator variable for the notifiers
> > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > + * @start__: Start address of the notifier
> > > + * @end__: End address of the notifier
> > > + *
> > > + * This macro is used to iterate over GPU SVM notifiers in a gpusvm.
> > > + */
> > > +#define drm_gpusvm_for_each_notifier(notifier__, gpusvm__, start__,
> > > end__)		\
> > > +	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root,
> > > (start__), (end__) - 1);	\
> > > +	     (notifier__) && (notifier__->interval.start <
> > > (end__));			\
> > > +	     (notifier__) = __drm_gpusvm_notifier_next(notifier__))
> > > +
> > > +/**
> > > + * drm_gpusvm_for_each_notifier_safe - Safely iterate over GPU SVM
> > > notifiers in a gpusvm
> > > + * @notifier__: Iterator variable for the notifiers
> > > + * @next__: Iterator variable for the notifiers temporary storage
> > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > + * @start__: Start address of the notifier
> > > + * @end__: End address of the notifier
> > > + *
> > > + * This macro is used to iterate over GPU SVM notifiers in a gpusvm
> > > while
> > > + * removing notifiers from it.
> > > + */
> > > +#define drm_gpusvm_for_each_notifier_safe(notifier__, next__,
> > > gpusvm__, start__, end__)	\
> > > +	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root,
> > > (start__), (end__) - 1),	\
> > > +	     (next__) =
> > > __drm_gpusvm_notifier_next(notifier__);				\
> > > +	     (notifier__) && (notifier__->interval.start <
> > > (end__));			\
> > > +	     (notifier__) = (next__), (next__) =
> > > __drm_gpusvm_notifier_next(notifier__))
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_invalidate - Invalidate a GPU SVM notifier.
> > > + * @mni: Pointer to the mmu_interval_notifier structure.
> > > + * @mmu_range: Pointer to the mmu_notifier_range structure.
> > > + * @cur_seq: Current sequence number.
> > > + *
> > > + * This function serves as a generic MMU notifier for GPU SVM. It
> > > sets the MMU
> > > + * notifier sequence number and calls the driver invalidate vfunc
> > > under
> > > + * gpusvm->notifier_lock.
> > > + *
> > > + * Returns:
> > > + * true if the operation succeeds, false otherwise.
> > > + */
> > > +static bool
> > > +drm_gpusvm_notifier_invalidate(struct mmu_interval_notifier *mni,
> > > +			       const struct mmu_notifier_range
> > > *mmu_range,
> > > +			       unsigned long cur_seq)
> > > +{
> > > +	struct drm_gpusvm_notifier *notifier =
> > > +		container_of(mni, typeof(*notifier), notifier);
> > > +	struct drm_gpusvm *gpusvm = notifier->gpusvm;
> > > +
> > > +	if (!mmu_notifier_range_blockable(mmu_range))
> > > +		return false;
> > > +
> > > +	down_write(&gpusvm->notifier_lock);
> > > +	mmu_interval_set_seq(mni, cur_seq);
> > > +	gpusvm->ops->invalidate(gpusvm, notifier, mmu_range);
> > > +	up_write(&gpusvm->notifier_lock);
> > > +
> > > +	return true;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_ops - MMU interval notifier operations for
> > > GPU SVM
> > > + */
> > > +static const struct mmu_interval_notifier_ops
> > > drm_gpusvm_notifier_ops = {
> > > +	.invalidate = drm_gpusvm_notifier_invalidate,
> > > +};
> > > +
> > > +/**
> > > + * drm_gpusvm_init - Initialize the GPU SVM.
> > > + * @gpusvm: Pointer to the GPU SVM structure.
> > > + * @name: Name of the GPU SVM.
> > > + * @drm: Pointer to the DRM device structure.
> > > + * @mm: Pointer to the mm_struct for the address space.
> > > + * @device_private_page_owner: Device private pages owner.
> > > + * @mm_start: Start address of GPU SVM.
> > > + * @mm_range: Range of the GPU SVM.
> > > + * @notifier_size: Size of individual notifiers.
> > > + * @ops: Pointer to the operations structure for GPU SVM.
> > > + * @chunk_sizes: Pointer to the array of chunk sizes used in range
> > > allocation.
> > > + *               Entries should be powers of 2 in descending order
> > > with last
> > > + *               entry being SZ_4K.
> > > + * @num_chunks: Number of chunks.
> > > + *
> > > + * This function initializes the GPU SVM.
> > > + *
> > > + * Returns:
> > > + * 0 on success, a negative error code on failure.
> > > + */
> > > +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> > > +		    const char *name, struct drm_device *drm,
> > > +		    struct mm_struct *mm, void
> > > *device_private_page_owner,
> > > +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> > > +		    const struct drm_gpusvm_ops *ops,
> > > +		    const u64 *chunk_sizes, int num_chunks)
> > > +{
> > > +	if (!ops->invalidate || !num_chunks)
> > > +		return -EINVAL;
> > > +
> > > +	gpusvm->name = name;
> > > +	gpusvm->drm = drm;
> > > +	gpusvm->mm = mm;
> > > +	gpusvm->device_private_page_owner =
> > > device_private_page_owner;
> > > +	gpusvm->mm_start = mm_start;
> > > +	gpusvm->mm_range = mm_range;
> > > +	gpusvm->notifier_size = notifier_size;
> > > +	gpusvm->ops = ops;
> > > +	gpusvm->chunk_sizes = chunk_sizes;
> > > +	gpusvm->num_chunks = num_chunks;
> > > +	gpusvm->zdd_wq = system_wq;
> > > +
> > > +	mmgrab(mm);
> > > +	gpusvm->root = RB_ROOT_CACHED;
> > > +	INIT_LIST_HEAD(&gpusvm->notifier_list);
> > > +
> > > +	init_rwsem(&gpusvm->notifier_lock);
> > > +
> > > +	fs_reclaim_acquire(GFP_KERNEL);
> > > +	might_lock(&gpusvm->notifier_lock);
> > > +	fs_reclaim_release(GFP_KERNEL);
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_find - Find GPU SVM notifier
> > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > + * @fault_addr__: Fault address
> > > + *
> > > + * This macro finds the GPU SVM notifier associated with the fault
> > > address.
> > > + *
> > > + * Returns:
> > > + * Pointer to the GPU SVM notifier on success, NULL otherwise.
> > > + */
> > > +#define drm_gpusvm_notifier_find(gpusvm__, fault_addr__)	\
> > > +	notifier_iter_first(&(gpusvm__)->root, (fault_addr__),	\
> > > +			    (fault_addr__ + 1))
> > > +
> > > +/**
> > > + * to_drm_gpusvm_notifier - retrieve the container struct for a
> > > given rbtree node
> > > + * @node__: a pointer to the rbtree node embedded within a
> > > drm_gpusvm_notifier struct
> > > + *
> > > + * Return: A pointer to the containing drm_gpusvm_notifier
> > > structure.
> > > + */
> > > +#define to_drm_gpusvm_notifier(__node)				\
> > > +	container_of((__node), struct drm_gpusvm_notifier, rb.node)
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_insert - Insert GPU SVM notifier
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > + *
> > > + * This function inserts the GPU SVM notifier into the GPU SVM RB
> > > tree and list.
> > > + */
> > > +static void drm_gpusvm_notifier_insert(struct drm_gpusvm *gpusvm,
> > > +				       struct drm_gpusvm_notifier
> > > *notifier)
> > > +{
> > > +	struct rb_node *node;
> > > +	struct list_head *head;
> > > +
> > > +	notifier_insert(notifier, &gpusvm->root);
> > > +
> > > +	node = rb_prev(&notifier->rb.node);
> > > +	if (node)
> > > +		head = &(to_drm_gpusvm_notifier(node))->rb.entry;
> > > +	else
> > > +		head = &gpusvm->notifier_list;
> > > +
> > > +	list_add(&notifier->rb.entry, head);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_remove - Remove GPU SVM notifier
> > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > + * @notifier__: Pointer to the GPU SVM notifier structure
> > > + *
> > > + * This macro removes the GPU SVM notifier from the GPU SVM RB tree
> > > and list.
> > > + */
> > > +#define drm_gpusvm_notifier_remove(gpusvm__, notifier__)	\
> > > +	notifier_remove((notifier__), &(gpusvm__)->root);	\
> > > +	list_del(&(notifier__)->rb.entry)
> > > +
> > > +/**
> > > + * drm_gpusvm_fini - Finalize the GPU SVM.
> > > + * @gpusvm: Pointer to the GPU SVM structure.
> > > + *
> > > + * This function finalizes the GPU SVM by cleaning up any remaining
> > > ranges and
> > > + * notifiers, and dropping a reference to struct MM.
> > > + */
> > > +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm)
> > > +{
> > > +	struct drm_gpusvm_notifier *notifier, *next;
> > > +
> > > +	drm_gpusvm_for_each_notifier_safe(notifier, next, gpusvm, 0,
> > > LONG_MAX) {
> > > +		struct drm_gpusvm_range *range, *__next;
> > > +
> > > +		/*
> > > +		 * Remove notifier first to avoid racing with any
> > > invalidation
> > > +		 */
> > > +		mmu_interval_notifier_remove(&notifier->notifier);
> > > +		notifier->flags.removed = true;
> > > +
> > > +		drm_gpusvm_for_each_range_safe(range, __next,
> > > notifier, 0,
> > > +					       LONG_MAX)
> > > +			drm_gpusvm_range_remove(gpusvm, range);
> > > +	}
> > > +
> > > +	mmdrop(gpusvm->mm);
> > > +	WARN_ON(!RB_EMPTY_ROOT(&gpusvm->root.rb_root));
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_alloc - Allocate GPU SVM notifier
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @fault_addr: Fault address
> > > + *
> > > + * This function allocates and initializes the GPU SVM notifier
> > > structure.
> > > + *
> > > + * Returns:
> > > + * Pointer to the allocated GPU SVM notifier on success, ERR_PTR()
> > > on failure.
> > > + */
> > > +static struct drm_gpusvm_notifier *
> > > +drm_gpusvm_notifier_alloc(struct drm_gpusvm *gpusvm, u64 fault_addr)
> > > +{
> > > +	struct drm_gpusvm_notifier *notifier;
> > > +
> > > +	if (gpusvm->ops->notifier_alloc)
> > > +		notifier = gpusvm->ops->notifier_alloc();
> > > +	else
> > > +		notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
> > > +
> > > +	if (!notifier)
> > > +		return ERR_PTR(-ENOMEM);
> > > +
> > > +	notifier->gpusvm = gpusvm;
> > > +	notifier->interval.start = ALIGN_DOWN(fault_addr, gpusvm-
> > > >notifier_size);
> > > +	notifier->interval.end = ALIGN(fault_addr + 1, gpusvm-
> > > >notifier_size);
> > > +	INIT_LIST_HEAD(&notifier->rb.entry);
> > > +	notifier->root = RB_ROOT_CACHED;
> > > +	INIT_LIST_HEAD(&notifier->range_list);
> > > +
> > > +	return notifier;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_free - Free GPU SVM notifier
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > + *
> > > + * This function frees the GPU SVM notifier structure.
> > > + */
> > > +static void drm_gpusvm_notifier_free(struct drm_gpusvm *gpusvm,
> > > +				     struct drm_gpusvm_notifier
> > > *notifier)
> > > +{
> > > +	WARN_ON(!RB_EMPTY_ROOT(&notifier->root.rb_root));
> > > +
> > > +	if (gpusvm->ops->notifier_free)
> > > +		gpusvm->ops->notifier_free(notifier);
> > > +	else
> > > +		kfree(notifier);
> > > +}
> > > +
> > > +/**
> > > + * to_drm_gpusvm_range - retrieve the container struct for a given
> > > rbtree node
> > > + * @node__: a pointer to the rbtree node embedded within a
> > > drm_gpusvm_range struct
> > > + *
> > > + * Return: A pointer to the containing drm_gpusvm_range structure.
> > > + */
> > > +#define to_drm_gpusvm_range(node__)	\
> > > +	container_of((node__), struct drm_gpusvm_range, rb.node)
> > > +
> > > +/**
> > > + * drm_gpusvm_range_insert - Insert GPU SVM range
> > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + *
> > > + * This function inserts the GPU SVM range into the notifier RB tree
> > > and list.
> > > + */
> > > +static void drm_gpusvm_range_insert(struct drm_gpusvm_notifier
> > > *notifier,
> > > +				    struct drm_gpusvm_range *range)
> > > +{
> > > +	struct rb_node *node;
> > > +	struct list_head *head;
> > > +
> > > +	drm_gpusvm_notifier_lock(notifier->gpusvm);
> > > +	range_insert(range, &notifier->root);
> > > +
> > > +	node = rb_prev(&range->rb.node);
> > > +	if (node)
> > > +		head = &(to_drm_gpusvm_range(node))->rb.entry;
> > > +	else
> > > +		head = &notifier->range_list;
> > > +
> > > +	list_add(&range->rb.entry, head);
> > > +	drm_gpusvm_notifier_unlock(notifier->gpusvm);
> > > +}
> > > +
> > > +/**
> > > + * __drm_gpusvm_range_remove - Remove GPU SVM range
> > > + * @notifier__: Pointer to the GPU SVM notifier structure
> > > + * @range__: Pointer to the GPU SVM range structure
> > > + *
> > > + * This macro removes the GPU SVM range from the notifier RB tree
> > > and list.
> > > + */
> > > +#define __drm_gpusvm_range_remove(notifier__, range__)		\
> > > +	range_remove((range__), &(notifier__)->root);		\
> > > +	list_del(&(range__)->rb.entry)
> > > +
> > > +/**
> > > + * drm_gpusvm_range_alloc - Allocate GPU SVM range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > + * @fault_addr: Fault address
> > > + * @chunk_size: Chunk size
> > > + * @migrate_vram: Flag indicating whether to migrate VRAM
> > > + *
> > > + * This function allocates and initializes the GPU SVM range
> > > structure.
> > > + *
> > > + * Returns:
> > > + * Pointer to the allocated GPU SVM range on success, ERR_PTR() on
> > > failure.
> > > + */
> > > +static struct drm_gpusvm_range *
> > > +drm_gpusvm_range_alloc(struct drm_gpusvm *gpusvm,
> > > +		       struct drm_gpusvm_notifier *notifier,
> > > +		       u64 fault_addr, u64 chunk_size, bool
> > > migrate_vram)
> > > +{
> > > +	struct drm_gpusvm_range *range;
> > > +
> > > +	if (gpusvm->ops->range_alloc)
> > > +		range = gpusvm->ops->range_alloc(gpusvm);
> > > +	else
> > > +		range = kzalloc(sizeof(*range), GFP_KERNEL);
> > > +
> > > +	if (!range)
> > > +		return ERR_PTR(-ENOMEM);
> > > +
> > > +	kref_init(&range->refcount);
> > > +	range->gpusvm = gpusvm;
> > > +	range->notifier = notifier;
> > > +	range->va.start = ALIGN_DOWN(fault_addr, chunk_size);
> > > +	range->va.end = ALIGN(fault_addr + 1, chunk_size);
> > > +	INIT_LIST_HEAD(&range->rb.entry);
> > > +	range->notifier_seq = LONG_MAX;
> > > +	range->flags.migrate_vram = migrate_vram ? 1 : 0;
> > > +
> > > +	return range;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_check_pages - Check pages
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > + * @start: Start address
> > > + * @end: End address
> > > + *
> > > + * Check if pages between start and end have been faulted in on the
> > > CPU. Used to
> > > + * prevent migration of pages without CPU backing store.
> > > + *
> > > + * Returns:
> > > + * True if pages have been faulted into CPU, False otherwise
> > > + */
> > > +static bool drm_gpusvm_check_pages(struct drm_gpusvm *gpusvm,
> > > +				   struct drm_gpusvm_notifier
> > > *notifier,
> > > +				   u64 start, u64 end)
> > > +{
> > > +	struct hmm_range hmm_range = {
> > > +		.default_flags = 0,
> > > +		.notifier = &notifier->notifier,
> > > +		.start = start,
> > > +		.end = end,
> > > +		.dev_private_owner = gpusvm-
> > > >device_private_page_owner,
> > > +	};
> > > +	unsigned long timeout =
> > > +		jiffies +
> > > msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> > > +	unsigned long *pfns;
> > > +	unsigned long npages = npages_in_range(start, end);
> > > +	int err, i;
> > > +
> > > +	mmap_assert_locked(gpusvm->mm);
> > > +
> > > +	pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
> > > +	if (!pfns)
> > > +		return false;
> > > +
> > > +	hmm_range.notifier_seq = mmu_interval_read_begin(&notifier-
> > > >notifier);
> > > +	hmm_range.hmm_pfns = pfns;
> > > +
> > > +	while (true) {
> > > +		err = hmm_range_fault(&hmm_range);
> > > +		if (err == -EBUSY) {
> > > +			if (time_after(jiffies, timeout))
> > > +				break;
> > > +
> > > +			hmm_range.notifier_seq =
> > > mmu_interval_read_begin(&notifier->notifier);
> > > +			continue;
> > > +		}
> > > +		break;
> > > +	}
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	for (i = 0; i < npages; ++i) {
> > > +		if (!(pfns[i] & HMM_PFN_VALID)) {
> > > +			err = -EFAULT;
> > > +			goto err_free;
> > > +		}
> > > +	}
> > > +
> > > +err_free:
> > > +	kvfree(pfns);
> > > +	return err ? false : true;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_chunk_size - Determine chunk size for GPU SVM
> > > range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > + * @vas: Pointer to the virtual memory area structure
> > > + * @fault_addr: Fault address
> > > + * @gpuva_start: Start address of GPUVA which mirrors CPU
> > > + * @gpuva_end: End address of GPUVA which mirrors CPU
> > > + * @check_pages: Flag indicating whether to check pages
> > > + *
> > > + * This function determines the chunk size for the GPU SVM range
> > > based on the
> > > + * fault address, GPU SVM chunk sizes, existing GPU SVM ranges, and
> > > the virtual
> > > + * memory area boundaries.
> > > + *
> > > + * Returns:
> > > + * Chunk size on success, LONG_MAX on failure.
> > > + */
> > > +static u64 drm_gpusvm_range_chunk_size(struct drm_gpusvm *gpusvm,
> > > +				       struct drm_gpusvm_notifier
> > > *notifier,
> > > +				       struct vm_area_struct *vas,
> > > +				       u64 fault_addr, u64
> > > gpuva_start,
> > > +				       u64 gpuva_end, bool
> > > check_pages)
> > > +{
> > > +	u64 start, end;
> > > +	int i = 0;
> > > +
> > > +retry:
> > > +	for (; i < gpusvm->num_chunks; ++i) {
> > > +		start = ALIGN_DOWN(fault_addr, gpusvm-
> > > >chunk_sizes[i]);
> > > +		end = ALIGN(fault_addr + 1, gpusvm->chunk_sizes[i]);
> > > +
> > > +		if (start >= vas->vm_start && end <= vas->vm_end &&
> > > +		    start >= notifier->interval.start &&
> > > +		    end <= notifier->interval.end &&
> > > +		    start >= gpuva_start && end <= gpuva_end)
> > > +			break;
> > > +	}
> > > +
> > > +	if (i == gpusvm->num_chunks)
> > > +		return LONG_MAX;
> > > +
> > > +	/*
> > > +	 * If allocating more than a page, ensure it does not overlap with
> > > existing
> > > +	 * ranges.
> > > +	 */
> > > +	if (end - start != SZ_4K) {
> > > +		struct drm_gpusvm_range *range;
> > > +
> > > +		range = drm_gpusvm_range_find(notifier, start, end);
> > > +		if (range) {
> > > +			++i;
> > > +			goto retry;
> > > +		}
> > > +
> > > +		/*
> > > +		 * XXX: Only create range on pages CPU has faulted
> > > in. Without
> > > +		 * this check, or prefault, on BMG
> > > 'xe_exec_system_allocator --r
> > > +		 * process-many-malloc' fails. In the failure case,
> > > each process
> > > +		 * mallocs 16k but the CPU VMA is ~128k which
> > > results in 64k SVM
> > > +		 * ranges. When migrating the SVM ranges, some
> > > processes fail in
> > > +		 * drm_gpusvm_migrate_to_vram with 'migrate.cpages
> > > != npages'
> > > +		 * and then upon drm_gpusvm_range_get_pages device
> > > pages from
> > > +		 * other processes are collected + faulted in which
> > > creates all
> > > +		 * sorts of problems. Unsure exactly how this
> > > is happening; also the
> > > +		 * problem goes away if 'xe_exec_system_allocator --
> > > r
> > > +		 * process-many-malloc' mallocs at least 64k at a
> > > time.
> > > +		 */
> > > +		if (check_pages &&
> > > +		    !drm_gpusvm_check_pages(gpusvm, notifier, start,
> > > end)) {
> > > +			++i;
> > > +			goto retry;
> > > +		}
> > > +	}
> > > +
> > > +	return end - start;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_find_or_insert - Find or insert GPU SVM range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @fault_addr: Fault address
> > > + * @gpuva_start: Start address of GPUVA which mirrors CPU
> > > + * @gpuva_end: End address of GPUVA which mirrors CPU
> > > + * @ctx: GPU SVM context
> > > + *
> > > + * This function finds or inserts a newly allocated GPU SVM range
> > > based on the
> > > + * fault address. Caller must hold a lock to protect range lookup
> > > and insertion.
> > > + *
> > > + * Returns:
> > > + * Pointer to the GPU SVM range on success, ERR_PTR() on failure.
> > > + */
> > > +struct drm_gpusvm_range *
> > > +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64
> > > fault_addr,
> > > +				u64 gpuva_start, u64 gpuva_end,
> > > +				const struct drm_gpusvm_ctx *ctx)
> > > +{
> > > +	struct drm_gpusvm_notifier *notifier;
> > > +	struct drm_gpusvm_range *range;
> > > +	struct mm_struct *mm = gpusvm->mm;
> > > +	struct vm_area_struct *vas;
> > > +	bool notifier_alloc = false;
> > > +	u64 chunk_size;
> > > +	int err;
> > > +	bool migrate_vram;
> > > +
> > > +	if (fault_addr < gpusvm->mm_start ||
> > > +	    fault_addr > gpusvm->mm_start + gpusvm->mm_range) {
> > > +		err = -EINVAL;
> > > +		goto err_out;
> > > +	}
> > > +
> > > +	if (!ctx->mmap_locked) {
> > > +		if (!mmget_not_zero(mm)) {
> > > +			err = -EFAULT;
> > > +			goto err_out;
> > > +		}
> > > +		mmap_write_lock(mm);
> > > +	}
> > > +
> > > +	mmap_assert_write_locked(mm);
> > > +
> > > +	notifier = drm_gpusvm_notifier_find(gpusvm, fault_addr);
> > > +	if (!notifier) {
> > > +		notifier = drm_gpusvm_notifier_alloc(gpusvm,
> > > fault_addr);
> > > +		if (IS_ERR(notifier)) {
> > > +			err = PTR_ERR(notifier);
> > > +			goto err_mmunlock;
> > > +		}
> > > +		notifier_alloc = true;
> > > +		err = mmu_interval_notifier_insert_locked(&notifier-
> > > >notifier,
> > > +							  mm,
> > > notifier->interval.start,
> > > +							  notifier-
> > > >interval.end -
> > > +							  notifier-
> > > >interval.start,
> > > +							 
> > > &drm_gpusvm_notifier_ops);
> > > +		if (err)
> > > +			goto err_notifier;
> > > +	}
> > > +
> > > +	vas = vma_lookup(mm, fault_addr);
> > > +	if (!vas) {
> > > +		err = -ENOENT;
> > > +		goto err_notifier_remove;
> > > +	}
> > > +
> > > +	if (!ctx->read_only && !(vas->vm_flags & VM_WRITE)) {
> > > +		err = -EPERM;
> > > +		goto err_notifier_remove;
> > > +	}
> > > +
> > > +	range = drm_gpusvm_range_find(notifier, fault_addr,
> > > fault_addr + 1);
> > > +	if (range)
> > > +		goto out_mmunlock;
> > > +	/*
> > > +	 * XXX: Short-circuiting migration based on migrate_vma_*
> > > current
> > > +	 * limitations. If/when migrate_vma_* add more support, this
> > > logic will
> > > +	 * have to change.
> > > +	 */
> > > +	migrate_vram = ctx->vram_possible &&
> > > +		vma_is_anonymous(vas) && !is_vm_hugetlb_page(vas);
> > > +
> > > +	chunk_size = drm_gpusvm_range_chunk_size(gpusvm, notifier,
> > > vas,
> > > +						 fault_addr,
> > > gpuva_start,
> > > +						 gpuva_end,
> > > migrate_vram &&
> > > +						 !ctx->prefault);
> > > +	if (chunk_size == LONG_MAX) {
> > > +		err = -EINVAL;
> > > +		goto err_notifier_remove;
> > > +	}
> > > +
> > > +	range = drm_gpusvm_range_alloc(gpusvm, notifier, fault_addr,
> > > chunk_size,
> > > +				       migrate_vram);
> > > +	if (IS_ERR(range)) {
> > > +		err = PTR_ERR(range);
> > > +		goto err_notifier_remove;
> > > +	}
> > > +
> > > +	drm_gpusvm_range_insert(notifier, range);
> > > +	if (notifier_alloc)
> > > +		drm_gpusvm_notifier_insert(gpusvm, notifier);
> > > +
> > > +	if (ctx->prefault) {
> > > +		struct drm_gpusvm_ctx __ctx = *ctx;
> > > +
> > > +		__ctx.mmap_locked = true;
> > > +		err = drm_gpusvm_range_get_pages(gpusvm, range,
> > > &__ctx);
> > > +		if (err)
> > > +			goto err_range_remove;
> > > +	}
> > > +
> > > +out_mmunlock:
> > > +	if (!ctx->mmap_locked) {
> > > +		mmap_write_unlock(mm);
> > > +		mmput(mm);
> > > +	}
> > > +
> > > +	return range;
> > > +
> > > +err_range_remove:
> > > +	__drm_gpusvm_range_remove(notifier, range);
> > > +err_notifier_remove:
> > > +	if (notifier_alloc)
> > > +		mmu_interval_notifier_remove(&notifier->notifier);
> > > +err_notifier:
> > > +	if (notifier_alloc)
> > > +		drm_gpusvm_notifier_free(gpusvm, notifier);
> > > +err_mmunlock:
> > > +	if (!ctx->mmap_locked) {
> > > +		mmap_write_unlock(mm);
> > > +		mmput(mm);
> > > +	}
> > > +err_out:
> > > +	return ERR_PTR(err);
> > > +}
> > > +
> > > +/**
> > > + * for_each_dma_page - iterate over pages in a DMA region
> > > + * @i__: the current page index in the iteration
> > > + * @j__: the current page index, log order, in the iteration
> > > + * @npages__: the total number of pages in the DMA region
> > > + * @order__: the order of the pages in the DMA region
> > > + *
> > > + * This macro iterates over each page in a DMA region. The DMA
> > > region
> > > + * is assumed to be composed of 2^@order__ pages, and the macro will
> > > + * step through the region one block of 2^@order__ pages at a time.
> > > + */
> > > +#define for_each_dma_page(i__, j__, npages__, order__)	\
> > > +	for ((i__) = 0, (j__) = 0; (i__) < (npages__);	\
> > > +	     (j__)++, (i__) += 0x1 << (order__))
> > > +
> > > +/**
> > > + * __drm_gpusvm_range_unmap_pages - Unmap pages associated with a
> > > GPU SVM range (internal)
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + *
> > > + * This function unmaps pages associated with a GPU SVM range.
> > > Assumes and
> > > + * asserts correct locking is in place when called.
> > > + */
> > > +static void __drm_gpusvm_range_unmap_pages(struct drm_gpusvm
> > > *gpusvm,
> > > +					   struct drm_gpusvm_range
> > > *range)
> > > +{
> > > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > > +
> > > +	if (range->pages) {
> > > +		unsigned long i, j, npages = npages_in_range(range-
> > > >va.start,
> > > +							     range-
> > > >va.end);
> > > +
> > > +		if (range->flags.has_dma_mapping) {
> > > +			for_each_dma_page(i, j, npages, range-
> > > >order)
> > > +				dma_unmap_page(gpusvm->drm->dev,
> > > +					       range->dma_addr[j],
> > > +					       PAGE_SIZE << range-
> > > >order,
> > > +					       DMA_BIDIRECTIONAL);
> > > +		}
> > > +
> > > +		range->flags.has_vram_pages = false;
> > > +		range->flags.has_dma_mapping = false;
> > > +	}
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_free_pages - Free pages associated with a GPU
> > > SVM range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + *
> > > + * This function frees pages associated with a GPU SVM range.
> > > + */
> > > +static void drm_gpusvm_range_free_pages(struct drm_gpusvm *gpusvm,
> > > +					struct drm_gpusvm_range
> > > *range)
> > > +{
> > > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > > +
> > > +	if (range->pages) {
> > > +		if (range->flags.kfree_mapping) {
> > > +			kfree(range->dma_addr);
> > > +			range->flags.kfree_mapping = false;
> > > +			range->pages = NULL;
> > > +		} else {
> > > +			kvfree(range->pages);
> > > +			range->pages = NULL;
> > > +		}
> > > +	}
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_remove - Remove GPU SVM range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range to be removed
> > > + *
> > > + * This function removes the specified GPU SVM range and also
> > > removes the parent
> > > + * GPU SVM notifier if no more ranges remain in the notifier. The
> > > caller must
> > > + * hold a lock to protect range and notifier removal.
> > > + */
> > > +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> > > +			     struct drm_gpusvm_range *range)
> > > +{
> > > +	struct drm_gpusvm_notifier *notifier;
> > > +
> > > +	notifier = drm_gpusvm_notifier_find(gpusvm, range-
> > > >va.start);
> > > +	if (WARN_ON_ONCE(!notifier))
> > > +		return;
> > > +
> > > +	drm_gpusvm_notifier_lock(gpusvm);
> > > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > +	drm_gpusvm_range_free_pages(gpusvm, range);
> > > +	__drm_gpusvm_range_remove(notifier, range);
> > > +	drm_gpusvm_notifier_unlock(gpusvm);
> > > +
> > > +	drm_gpusvm_range_put(range);
> > > +
> > > +	if (RB_EMPTY_ROOT(&notifier->root.rb_root)) {
> > > +		if (!notifier->flags.removed)
> > > +			mmu_interval_notifier_remove(&notifier-
> > > >notifier);
> > > +		drm_gpusvm_notifier_remove(gpusvm, notifier);
> > > +		drm_gpusvm_notifier_free(gpusvm, notifier);
> > > +	}
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_get - Get a reference to GPU SVM range
> > > + * @range: Pointer to the GPU SVM range
> > > + *
> > > + * This function increments the reference count of the specified GPU
> > > SVM range.
> > > + *
> > > + * Returns:
> > > + * Pointer to the GPU SVM range.
> > > + */
> > > +struct drm_gpusvm_range *
> > > +drm_gpusvm_range_get(struct drm_gpusvm_range *range)
> > > +{
> > > +	kref_get(&range->refcount);
> > > +
> > > +	return range;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_destroy - Destroy GPU SVM range
> > > + * @refcount: Pointer to the reference counter embedded in the GPU
> > > SVM range
> > > + *
> > > + * This function destroys the specified GPU SVM range when its
> > > reference count
> > > + * reaches zero. If a custom range-free function is provided, it is
> > > invoked to
> > > + * free the range; otherwise, the range is deallocated using
> > > kfree().
> > > + */
> > > +static void drm_gpusvm_range_destroy(struct kref *refcount)
> > > +{
> > > +	struct drm_gpusvm_range *range =
> > > +		container_of(refcount, struct drm_gpusvm_range,
> > > refcount);
> > > +	struct drm_gpusvm *gpusvm = range->gpusvm;
> > > +
> > > +	if (gpusvm->ops->range_free)
> > > +		gpusvm->ops->range_free(range);
> > > +	else
> > > +		kfree(range);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_put - Put a reference to GPU SVM range
> > > + * @range: Pointer to the GPU SVM range
> > > + *
> > > + * This function decrements the reference count of the specified GPU
> > > SVM range
> > > + * and frees it when the count reaches zero.
> > > + */
> > > +void drm_gpusvm_range_put(struct drm_gpusvm_range *range)
> > > +{
> > > +	kref_put(&range->refcount, drm_gpusvm_range_destroy);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_pages_valid - GPU SVM range pages valid
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + *
> > > + * This function determines if a GPU SVM range's pages are valid.
> > > Expected to be
> > > + * called holding gpusvm->notifier_lock and as the last step before
> > > committing a
> > > + * GPU binding.
> > > + *
> > > + * Returns:
> > > + * True if GPU SVM range has valid pages, False otherwise
> > > + */
> > > +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> > > +				  struct drm_gpusvm_range *range)
> > > +{
> > > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > > +
> > > +	return range->flags.has_vram_pages || range-
> > > >flags.has_dma_mapping;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_pages_valid_unlocked - GPU SVM range pages valid
> > > unlocked
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + *
> > > + * This function determines if a GPU SVM range's pages are valid.
> > > Expected to be
> > > + * called without holding gpusvm->notifier_lock.
> > > + *
> > > + * Returns:
> > > + * True if GPU SVM range has valid pages, False otherwise
> > > + */
> > > +static bool
> > > +drm_gpusvm_range_pages_valid_unlocked(struct drm_gpusvm *gpusvm,
> > > +				      struct drm_gpusvm_range
> > > *range)
> > > +{
> > > +	bool pages_valid;
> > > +
> > > +	if (!range->pages)
> > > +		return false;
> > > +
> > > +	drm_gpusvm_notifier_lock(gpusvm);
> > > +	pages_valid = drm_gpusvm_range_pages_valid(gpusvm, range);
> > > +	if (!pages_valid && range->flags.kfree_mapping) {
> > > +		kfree(range->dma_addr);
> > > +		range->flags.kfree_mapping = false;
> > > +		range->pages = NULL;
> > > +	}
> > > +	drm_gpusvm_notifier_unlock(gpusvm);
> > > +
> > > +	return pages_valid;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_get_pages - Get pages for a GPU SVM range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + * @ctx: GPU SVM context
> > > + *
> > > + * This function gets pages for a GPU SVM range and ensures they are
> > > mapped for
> > > + * DMA access.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       const struct drm_gpusvm_ctx *ctx)
> > > +{
> > > +	struct mmu_interval_notifier *notifier = &range->notifier-
> > > >notifier;
> > > +	struct hmm_range hmm_range = {
> > > +		.default_flags = HMM_PFN_REQ_FAULT | (ctx->read_only
> > > ? 0 :
> > > +			HMM_PFN_REQ_WRITE),
> > > +		.notifier = notifier,
> > > +		.start = range->va.start,
> > > +		.end = range->va.end,
> > > +		.dev_private_owner = gpusvm-
> > > >device_private_page_owner,
> > > +	};
> > > +	struct mm_struct *mm = gpusvm->mm;
> > > +	unsigned long timeout =
> > > +		jiffies +
> > > msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> > > +	unsigned long i, j;
> > > +	unsigned long npages = npages_in_range(range->va.start,
> > > range->va.end);
> > > +	unsigned int order = 0;
> > > +	unsigned long *pfns;
> > > +	struct page **pages;
> > > +	int err = 0;
> > > +	bool vram_pages = !!range->flags.migrate_vram;
> > > +	bool alloc_pfns = false, kfree_mapping;
> > > +
> > > +retry:
> > > +	kfree_mapping = false;
> > > +	hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> > > +	if (drm_gpusvm_range_pages_valid_unlocked(gpusvm, range))
> > > +		return 0;
> > > +
> > > +	if (range->notifier_seq == hmm_range.notifier_seq && range-
> > > >pages) {
> > > +		if (ctx->prefault)
> > > +			return 0;
> > > +
> > > +		pfns = (unsigned long *)range->pages;
> > > +		pages = range->pages;
> > > +		goto map_pages;
> > > +	}
> > > +
> > > +	if (!range->pages) {
> > > +		pfns = kvmalloc_array(npages, sizeof(*pfns),
> > > GFP_KERNEL);
> > > +		if (!pfns)
> > > +			return -ENOMEM;
> > > +		alloc_pfns = true;
> > > +	} else {
> > > +		pfns = (unsigned long *)range->pages;
> > > +	}
> > > +
> > > +	if (!ctx->mmap_locked) {
> > > +		if (!mmget_not_zero(mm)) {
> > > +			err = -EFAULT;
> > > +			goto err_out;
> > > +		}
> > > +	}
> > > +
> > > +	hmm_range.hmm_pfns = pfns;
> > > +	while (true) {
> > > +		/* Must be checked after mmu_interval_read_begin */
> > > +		if (range->flags.unmapped) {
> > > +			err = -EFAULT;
> > > +			break;
> > > +		}
> > > +
> > > +		if (!ctx->mmap_locked) {
> > > +			/*
> > > +			 * XXX: HMM locking document indicates only
> > > a read-lock
> > > +			 * is required but there appears to be a
> > > window between
> > > +			 * the MMU_NOTIFY_MIGRATE event triggered in
> > > a CPU fault
> > > +			 * via migrate_vma_setup and the pages
> > > actually moving
> > > +			 * in migrate_vma_finalize in which this
> > > code can grab
> > > +			 * garbage pages. Grabbing the write-lock if
> > > the range
> > > +			 * is attached to vram appears to protect
> > > against this
> > > +			 * race.
> > > +			 */
> > > +			if (vram_pages)
> > > +				mmap_write_lock(mm);
> > > +			else
> > > +				mmap_read_lock(mm);
> > > +		}
> > > +		err = hmm_range_fault(&hmm_range);
> > > +		if (!ctx->mmap_locked) {
> > > +			if (vram_pages)
> > > +				mmap_write_unlock(mm);
> > > +			else
> > > +				mmap_read_unlock(mm);
> > > +		}
> > > +
> > > +		if (err == -EBUSY) {
> > > +			if (time_after(jiffies, timeout))
> > > +				break;
> > > +
> > > +			hmm_range.notifier_seq =
> > > mmu_interval_read_begin(notifier);
> > > +			continue;
> > > +		}
> > > +		break;
> > > +	}
> > > +	if (!ctx->mmap_locked)
> > > +		mmput(mm);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	pages = (struct page **)pfns;
> > > +
> > > +	if (ctx->prefault) {
> > > +		range->pages = pages;
> > > +		goto set_seqno;
> > > +	}
> > > +
> > > +map_pages:
> > > +	if (is_device_private_page(hmm_pfn_to_page(pfns[0]))) {
> > > +		WARN_ON_ONCE(!range->vram_allocation);
> > > +
> > > +		for (i = 0; i < npages; ++i) {
> > > +			pages[i] = hmm_pfn_to_page(pfns[i]);
> > > +
> > > +			if
> > > (WARN_ON_ONCE(!is_device_private_page(pages[i]))) {
> > > +				err = -EOPNOTSUPP;
> > > +				goto err_free;
> > > +			}
> > > +		}
> > > +
> > > +		/* Do not race with notifier unmapping pages */
> > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > +		range->flags.has_vram_pages = true;
> > > +		range->pages = pages;
> > > +		if (mmu_interval_read_retry(notifier,
> > > hmm_range.notifier_seq)) {
> > > +			err = -EAGAIN;
> > > +			__drm_gpusvm_range_unmap_pages(gpusvm,
> > > range);
> > > +		}
> > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > +	} else {
> > > +		dma_addr_t *dma_addr = (dma_addr_t *)pfns;
> > > +
> > > +		for_each_dma_page(i, j, npages, order) {
> > > +			if (WARN_ON_ONCE(i && order !=
> > > +					
> > > hmm_pfn_to_map_order(pfns[i]))) {
> > > +				err = -EOPNOTSUPP;
> > > +				npages = i;
> > > +				goto err_unmap;
> > > +			}
> > > +			order = hmm_pfn_to_map_order(pfns[i]);
> > > +
> > > +			pages[j] = hmm_pfn_to_page(pfns[i]);
> > > +			if
> > > (WARN_ON_ONCE(is_zone_device_page(pages[j]))) {
> > > +				err = -EOPNOTSUPP;
> > > +				npages = i;
> > > +				goto err_unmap;
> > > +			}
> > > +
> > > +			set_page_dirty_lock(pages[j]);
> > > +			mark_page_accessed(pages[j]);
> > > +
> > > +			dma_addr[j] = dma_map_page(gpusvm->drm->dev,
> > > +						   pages[j], 0,
> > > +						   PAGE_SIZE <<
> > > order,
> > > +						  
> > > DMA_BIDIRECTIONAL);
> > > +			if (dma_mapping_error(gpusvm->drm->dev,
> > > dma_addr[j])) {
> > > +				err = -EFAULT;
> > > +				npages = i;
> > > +				goto err_unmap;
> > > +			}
> > > +		}
> > > +
> > > +		/* Huge pages, reduce memory footprint */
> > > +		if (order) {
> > > +			dma_addr = kmalloc_array(j,
> > > sizeof(*dma_addr),
> > > +						 GFP_KERNEL);
> > > +			if (dma_addr) {
> > > +				for (i = 0; i < j; ++i)
> > > +					dma_addr[i] =
> > > (dma_addr_t)pfns[i];
> > > +				kvfree(pfns);
> > > +				kfree_mapping = true;
> > > +			} else {
> > > +				dma_addr = (dma_addr_t *)pfns;
> > > +			}
> > > +		}
> > > +
> > > +		/* Do not race with notifier unmapping pages */
> > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > +		range->order = order;
> > > +		range->flags.kfree_mapping = kfree_mapping;
> > > +		range->flags.has_dma_mapping = true;
> > > +		range->dma_addr = dma_addr;
> > > +		range->vram_allocation = NULL;
> > > +		if (mmu_interval_read_retry(notifier,
> > > hmm_range.notifier_seq)) {
> > > +			err = -EAGAIN;
> > > +			__drm_gpusvm_range_unmap_pages(gpusvm,
> > > range);
> > > +		}
> > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > +	}
> > > +
> > > +	if (err == -EAGAIN)
> > > +		goto retry;
> > > +set_seqno:
> > > +	range->notifier_seq = hmm_range.notifier_seq;
> > > +
> > > +	return 0;
> > > +
> > > +err_unmap:
> > > +	for_each_dma_page(i, j, npages, order)
> > > +		dma_unmap_page(gpusvm->drm->dev,
> > > +			       (dma_addr_t)pfns[j],
> > > +			       PAGE_SIZE << order,
> > > DMA_BIDIRECTIONAL);
> > > +err_free:
> > > +	if (alloc_pfns)
> > > +		kvfree(pfns);
> > > +err_out:
> > > +	return err;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_unmap_pages - Unmap pages associated with a GPU
> > > SVM range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + * @ctx: GPU SVM context
> > > + *
> > > + * This function unmaps pages associated with a GPU SVM range. If
> > > @in_notifier
> > > + * is set, it is assumed that gpusvm->notifier_lock is held in write
> > > mode; if it
> > > + * is clear, it acquires gpusvm->notifier_lock in read mode. Must be
> > > called on
> > > + * each GPU SVM range attached to notifier in gpusvm->ops-
> > > >invalidate for IOMMU
> > > + * security model.
> > > + */
> > > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > > +				  struct drm_gpusvm_range *range,
> > > +				  const struct drm_gpusvm_ctx *ctx)
> > > +{
> > > +	if (ctx->in_notifier)
> > > +		lockdep_assert_held_write(&gpusvm->notifier_lock);
> > > +	else
> > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > +
> > > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > +
> > > +	if (!ctx->in_notifier)
> > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > +}
> > > +
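
As a cross-reference, a minimal sketch of the intended call site, matching the
invalidation vfunc in the DOC: Examples section of this file;
driver_invalidate_device_tlb() and driver_garbage_collector_add() are
hypothetical driver helpers:

	static void driver_invalidate(struct drm_gpusvm *gpusvm,
				      struct drm_gpusvm_notifier *notifier,
				      const struct mmu_notifier_range *mmu_range)
	{
		struct drm_gpusvm_ctx ctx = { .in_notifier = true, };
		struct drm_gpusvm_range *range = NULL;

		/* Hypothetical: zap GPU PTEs / TLBs covering the invalidated interval */
		driver_invalidate_device_tlb(gpusvm, mmu_range->start, mmu_range->end);

		/* ops->invalidate is called with gpusvm->notifier_lock held in write mode */
		drm_gpusvm_for_each_range(range, notifier, mmu_range->start,
					  mmu_range->end) {
			drm_gpusvm_range_unmap_pages(gpusvm, range, &ctx);

			if (mmu_range->event != MMU_NOTIFY_UNMAP)
				continue;

			drm_gpusvm_range_set_unmapped(range, mmu_range);
			driver_garbage_collector_add(gpusvm, range);	/* hypothetical */
		}
	}
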
> > > +/**
> > > + * drm_gpusvm_migration_put_page - Put a migration page
> > > + * @page: Pointer to the page to put
> > > + *
> > > + * This function unlocks and puts a page.
> > > + */
> > > +static void drm_gpusvm_migration_put_page(struct page *page)
> > > +{
> > > +	unlock_page(page);
> > > +	put_page(page);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migration_put_pages - Put migration pages
> > > + * @npages: Number of pages
> > > + * @migrate_pfn: Array of migrate page frame numbers
> > > + *
> > > + * This function puts an array of pages.
> > > + */
> > > +static void drm_gpusvm_migration_put_pages(unsigned long npages,
> > > +					   unsigned long
> > > *migrate_pfn)
> > > +{
> > > +	unsigned long i;
> > > +
> > > +	for (i = 0; i < npages; ++i) {
> > > +		if (!migrate_pfn[i])
> > > +			continue;
> > > +
> > > +		drm_gpusvm_migration_put_page(migrate_pfn_to_page(migrate_pfn[i]));
> > > +		migrate_pfn[i] = 0;
> > > +	}
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_get_vram_page - Get a reference to a VRAM page
> > > + * @page: Pointer to the page
> > > + * @zdd: Pointer to the GPU SVM zone device data
> > > + *
> > > + * This function associates the given page with the specified GPU
> > > SVM zone
> > > + * device data and initializes it for zone device usage.
> > > + */
> > > +static void drm_gpusvm_get_vram_page(struct page *page,
> > > +				     struct drm_gpusvm_zdd *zdd)
> > > +{
> > > +	page->zone_device_data = drm_gpusvm_zdd_get(zdd);
> > > +	zone_device_page_init(page);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_map_pages() - Map migration pages for GPU SVM
> > > migration
> > > + * @dev: The device for which the pages are being mapped
> > > + * @dma_addr: Array to store DMA addresses corresponding to mapped
> > > pages
> > > + * @migrate_pfn: Array of migrate page frame numbers to map
> > > + * @npages: Number of pages to map
> > > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > > + *
> > > + * This function maps pages of memory for migration usage in GPU
> > > SVM. It
> > > + * iterates over each page frame number provided in @migrate_pfn,
> > > maps the
> > > + * corresponding page, and stores the DMA address in the provided
> > > @dma_addr
> > > + * array.
> > > + *
> > > + * Return: 0 on success, -EFAULT if an error occurs during mapping.
> > > + */
> > > +static int drm_gpusvm_migrate_map_pages(struct device *dev,
> > > +					dma_addr_t *dma_addr,
> > > +					long unsigned int
> > > *migrate_pfn,
> > > +					unsigned long npages,
> > > +					enum dma_data_direction dir)
> > > +{
> > > +	unsigned long i;
> > > +
> > > +	for (i = 0; i < npages; ++i) {
> > > +		struct page *page =
> > > migrate_pfn_to_page(migrate_pfn[i]);
> > > +
> > > +		if (!page)
> > > +			continue;
> > > +
> > > +		if (WARN_ON_ONCE(is_zone_device_page(page)))
> > > +			return -EFAULT;
> > > +
> > > +		dma_addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE,
> > > dir);
> > > +		if (dma_mapping_error(dev, dma_addr[i]))
> > > +			return -EFAULT;
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously mapped
> > > for GPU SVM migration
> > > + * @dev: The device for which the pages were mapped
> > > + * @dma_addr: Array of DMA addresses corresponding to mapped pages
> > > + * @npages: Number of pages to unmap
> > > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > > + *
> > > + * This function unmaps previously mapped pages of memory for GPU
> > > Shared Virtual
> > > + * Memory (SVM). It iterates over each DMA address provided in
> > > @dma_addr, checks
> > > + * if it's valid and not already unmapped, and unmaps the
> > > corresponding page.
> > > + */
> > > +static void drm_gpusvm_migrate_unmap_pages(struct device *dev,
> > > +					   dma_addr_t *dma_addr,
> > > +					   unsigned long npages,
> > > +					   enum dma_data_direction
> > > dir)
> > > +{
> > > +	unsigned long i;
> > > +
> > > +	for (i = 0; i < npages; ++i) {
> > > +		if (!dma_addr[i] || dma_mapping_error(dev,
> > > dma_addr[i]))
> > > +			continue;
> > > +
> > > +		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
> > > +	}
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_to_vram - Migrate GPU SVM range to VRAM
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + * @vram_allocation: Driver-private pointer to the VRAM allocation. The caller
> > > + *                   should hold a reference to the VRAM allocation, which
> > > + *                   should be dropped via ops->vram_release or upon the
> > > + *                   failure of this function.
> > > + * @ctx: GPU SVM context
> > > + *
> > > + * This function migrates the specified GPU SVM range to VRAM. It performs the
> > > + * necessary setup and invokes the driver-specific operations for migration to
> > > + * VRAM. Upon successful return, @vram_allocation can safely reference @range
> > > + * until ops->vram_release is called, which only happens upon successful return.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       void *vram_allocation,
> > > +			       const struct drm_gpusvm_ctx *ctx)
> > > +{
> > > +	u64 start = range->va.start, end = range->va.end;
> > > +	struct migrate_vma migrate = {
> > > +		.start		= start,
> > > +		.end		= end,
> > > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > > +		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
> > > +	};
> > > +	struct mm_struct *mm = gpusvm->mm;
> > > +	unsigned long i, npages = npages_in_range(start, end);
> > > +	struct vm_area_struct *vas;
> > > +	struct drm_gpusvm_zdd *zdd = NULL;
> > > +	struct page **pages;
> > > +	dma_addr_t *dma_addr;
> > > +	void *buf;
> > > +	int err;
> > > +
> > > +	if (!range->flags.migrate_vram)
> > > +		return -EINVAL;
> > > +
> > > +	if (!gpusvm->ops->populate_vram_pfn || !gpusvm->ops-
> > > >copy_to_vram ||
> > > +	    !gpusvm->ops->copy_to_sram)
> > > +		return -EOPNOTSUPP;
> > > +
> > > +	if (!ctx->mmap_locked) {
> > > +		if (!mmget_not_zero(mm)) {
> > > +			err = -EFAULT;
> > > +			goto err_out;
> > > +		}
> > > +		mmap_write_lock(mm);
> > > +	}
> > > +
> > > +	mmap_assert_locked(mm);
> > > +
> > > +	vas = vma_lookup(mm, start);
> > > +	if (!vas) {
> > > +		err = -ENOENT;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	if (end > vas->vm_end || start < vas->vm_start) {
> > > +		err = -EINVAL;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	if (!vma_is_anonymous(vas)) {
> > > +		err = -EBUSY;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) +
> > > sizeof(*dma_addr) +
> > > +		       sizeof(*pages), GFP_KERNEL);
> > > +	if (!buf) {
> > > +		err = -ENOMEM;
> > > +		goto err_mmunlock;
> > > +	}
> > > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr))
> > > * npages;
> > > +
> > > +	zdd = drm_gpusvm_zdd_alloc(range);
> > > +	if (!zdd) {
> > > +		err = -ENOMEM;
> > > +		goto err_free;
> > > +	}
> > > +
> > > +	migrate.vma = vas;
> > > +	migrate.src = buf;
> > > +	migrate.dst = migrate.src + npages;
> > > +
> > > +	err = migrate_vma_setup(&migrate);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	/*
> > > +	 * FIXME: Below cases, !migrate.cpages and migrate.cpages !=
> > > npages, not
> > > +	 * always an error. Need to revisit possible cases and how
> > > to handle. We
> > > +	 * could prefault on migrate.cpages != npages via
> > > hmm_range_fault.
> > > +	 */
> > > +
> > > +	if (!migrate.cpages) {
> > > +		err = -EFAULT;
> > > +		goto err_free;
> > > +	}
> > > +
> > > +	if (migrate.cpages != npages) {
> > > +		err = -EBUSY;
> > > +		goto err_finalize;
> > > +	}
> > > +
> > > +	err = gpusvm->ops->populate_vram_pfn(gpusvm,
> > > vram_allocation, npages,
> > > +					     migrate.dst);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > > dma_addr,
> > > +					   migrate.src, npages,
> > > DMA_TO_DEVICE);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	for (i = 0; i < npages; ++i) {
> > > +		struct page *page = pfn_to_page(migrate.dst[i]);
> > > +
> > > +		pages[i] = page;
> > > +		migrate.dst[i] = migrate_pfn(migrate.dst[i]);
> > > +		drm_gpusvm_get_vram_page(page, zdd);
> > > +	}
> > > +
> > > +	err = gpusvm->ops->copy_to_vram(gpusvm, pages, dma_addr,
> > > npages);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	/* Upon success bind vram allocation to range and zdd */
> > > +	range->vram_allocation = vram_allocation;
> > > +	WRITE_ONCE(zdd->vram_allocation, vram_allocation);	/*
> > > Owns ref */
> > > +
> > > +err_finalize:
> > > +	if (err)
> > > +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> > > +	migrate_vma_pages(&migrate);
> > > +	migrate_vma_finalize(&migrate);
> > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr,
> > > npages,
> > > +				       DMA_TO_DEVICE);
> > > +err_free:
> > > +	if (zdd)
> > > +		drm_gpusvm_zdd_put(zdd);
> > > +	kvfree(buf);
> > > +err_mmunlock:
> > > +	if (!ctx->mmap_locked) {
> > > +		mmap_write_unlock(mm);
> > > +		mmput(mm);
> > > +	}
> > > +err_out:
> > > +	return err;
> > > +}
> > > +
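
For usage context, a hedged sketch of how a GPU fault handler might call this,
condensed from the DOC: Examples section above; driver_migration_policy(),
driver_alloc_bo() and driver_put_bo() are hypothetical driver helpers, and the
BO acts as the opaque @vram_allocation:

	static int driver_fault_migrate_to_vram(struct drm_gpusvm *gpusvm,
						struct drm_gpusvm_range *range,
						const struct drm_gpusvm_ctx *ctx)
	{
		void *bo;
		int err;

		if (!driver_migration_policy(range))
			return 0;

		bo = driver_alloc_bo();		/* caller's reference to the allocation */
		if (!bo)
			return -ENOMEM;

		err = drm_gpusvm_migrate_to_vram(gpusvm, range, bo, ctx);
		if (err)
			driver_put_bo(bo);	/* on failure the caller drops its reference */

		/* Non-zero usually means CPU mappings changed; caller retries the fault */
		return err;
	}
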
> > > +/**
> > > + * drm_gpusvm_migrate_populate_sram_pfn - Populate SRAM PFNs for a
> > > VM area
> > > + * @vas: Pointer to the VM area structure, can be NULL
> > > + * @npages: Number of pages to populate
> > > + * @src_mpfn: Source array of migrate PFNs
> > > + * @mpfn: Array of migrate PFNs to populate
> > > + * @addr: Start address for PFN allocation
> > > + *
> > > + * This function populates the SRAM migrate page frame numbers
> > > (PFNs) for the
> > > + * specified VM area structure. It allocates and locks pages in the VM area for
> > > + * SRAM usage. If @vas is non-NULL, alloc_page_vma() is used for allocation;
> > > + * otherwise, alloc_page() is used.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +static int drm_gpusvm_migrate_populate_sram_pfn(struct
> > > vm_area_struct *vas,
> > > +						unsigned long
> > > npages,
> > > +						unsigned long
> > > *src_mpfn,
> > > +						unsigned long *mpfn,
> > > u64 addr)
> > > +{
> > > +	unsigned long i;
> > > +
> > > +	for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
> > > +		struct page *page;
> > > +
> > > +		if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
> > > +			continue;
> > > +
> > > +		if (vas)
> > > +			page = alloc_page_vma(GFP_HIGHUSER, vas,
> > > addr);
> > > +		else
> > > +			page = alloc_page(GFP_HIGHUSER);
> > > +
> > > +		if (!page)
> > > +			return -ENOMEM;
> > > +
> > > +		lock_page(page);
> > > +		mpfn[i] = migrate_pfn(page_to_pfn(page));
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_evict_to_sram - Evict GPU SVM range to SRAM
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + *
> > > + * Similar to __drm_gpusvm_migrate_to_sram but does not require the mmap lock;
> > > + * migration is done via the migrate_device_* functions. This is a fallback
> > > + * path, as it is preferred to issue migrations with the mmap lock held.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +static int drm_gpusvm_evict_to_sram(struct drm_gpusvm *gpusvm,
> > > +				    struct drm_gpusvm_range *range)
> > > +{
> > > +	unsigned long npages;
> > > +	struct page **pages;
> > > +	unsigned long *src, *dst;
> > > +	dma_addr_t *dma_addr;
> > > +	void *buf;
> > > +	int i, err = 0;
> > > +
> > > +	npages = npages_in_range(range->va.start, range->va.end);
> > > +
> > > +	buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr) +
> > > +		       sizeof(*pages), GFP_KERNEL);
> > > +	if (!buf) {
> > > +		err = -ENOMEM;
> > > +		goto err_out;
> > > +	}
> > > +	src = buf;
> > > +	dst = buf + (sizeof(*src) * npages);
> > > +	dma_addr = buf + (2 * sizeof(*src) * npages);
> > > +	pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) *
> > > npages;
> > > +
> > > +	err = gpusvm->ops->populate_vram_pfn(gpusvm, range-
> > > >vram_allocation,
> > > +					     npages, src);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	err = migrate_device_vma_range(gpusvm->mm,
> > > +				       gpusvm-
> > > >device_private_page_owner, src,
> > > +				       npages, range->va.start);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	err = drm_gpusvm_migrate_populate_sram_pfn(NULL, npages,
> > > src, dst, 0);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > > dma_addr,
> > > +					   dst, npages,
> > > DMA_BIDIRECTIONAL);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	for (i = 0; i < npages; ++i)
> > > +		pages[i] = migrate_pfn_to_page(src[i]);
> > > +
> > > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr,
> > > npages);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +err_finalize:
> > > +	if (err)
> > > +		drm_gpusvm_migration_put_pages(npages, dst);
> > > +	migrate_device_pages(src, dst, npages);
> > > +	migrate_device_finalize(src, dst, npages);
> > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr,
> > > npages,
> > > +				       DMA_BIDIRECTIONAL);
> > > +err_free:
> > > +	kvfree(buf);
> > > +err_out:
> > > +
> > > +	return err;
> > > +}
> > > +
> > > +/**
> > > + * __drm_gpusvm_migrate_to_sram - Migrate GPU SVM range to SRAM
> > > (internal)
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @vas: Pointer to the VM area structure
> > > + * @page: Pointer to the page for fault handling (can be NULL)
> > > + * @start: Start address of the migration range
> > > + * @end: End address of the migration range
> > > + *
> > > + * This internal function performs the migration of the specified
> > > GPU SVM range
> > > + * to SRAM. It sets up the migration, populates + dma maps SRAM
> > > PFNs, and
> > > + * invokes the driver-specific operations for migration to SRAM.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +static int __drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > +					struct vm_area_struct *vas,
> > > +					struct page *page,
> > > +					u64 start, u64 end)
> > > +{
> > > +	struct migrate_vma migrate = {
> > > +		.vma		= vas,
> > > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > > +		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
> > > +		.fault_page	= page,
> > > +	};
> > > +	unsigned long npages;
> > > +	struct page **pages;
> > > +	dma_addr_t *dma_addr;
> > > +	void *buf;
> > > +	int i, err = 0;
> > > +
> > > +	mmap_assert_locked(gpusvm->mm);
> > > +
> > > +	/* Corner where VMA area struct has been partially unmapped
> > > */
> > > +	if (start < vas->vm_start)
> > > +		start = vas->vm_start;
> > > +	if (end > vas->vm_end)
> > > +		end = vas->vm_end;
> > > +
> > > +	migrate.start = start;
> > > +	migrate.end = end;
> > > +	npages = npages_in_range(start, end);
> > > +
> > > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) +
> > > sizeof(*dma_addr) +
> > > +		       sizeof(*pages), GFP_KERNEL);
> > > +	if (!buf) {
> > > +		err = -ENOMEM;
> > > +		goto err_out;
> > > +	}
> > > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr))
> > > * npages;
> > > +
> > > +	migrate.vma = vas;
> > > +	migrate.src = buf;
> > > +	migrate.dst = migrate.src + npages;
> > > +
> > > +	err = migrate_vma_setup(&migrate);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	/* Raced with another CPU fault, nothing to do */
> > > +	if (!migrate.cpages)
> > > +		goto err_free;
> > > +
> > > +	err = drm_gpusvm_migrate_populate_sram_pfn(vas, npages,
> > > +						   migrate.src,
> > > migrate.dst,
> > > +						   start);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > > dma_addr,
> > > +					   migrate.dst, npages,
> > > +					   DMA_BIDIRECTIONAL);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	for (i = 0; i < npages; ++i)
> > > +		pages[i] = migrate_pfn_to_page(migrate.src[i]);
> > > +
> > > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr,
> > > npages);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +err_finalize:
> > > +	if (err)
> > > +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> > > +	migrate_vma_pages(&migrate);
> > > +	migrate_vma_finalize(&migrate);
> > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr,
> > > npages,
> > > +				       DMA_BIDIRECTIONAL);
> > > +err_free:
> > > +	kvfree(buf);
> > > +err_out:
> > > +	mmap_assert_locked(gpusvm->mm);
> > > +
> > > +	return err;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_to_sram - Migrate (evict) GPU SVM range to
> > > SRAM
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + * @ctx: GPU SVM context
> > > + *
> > > + * This function initiates the migration of the specified GPU SVM
> > > range to
> > > + * SRAM. It performs necessary checks and invokes the internal
> > > migration
> > > + * function for actual migration.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       const struct drm_gpusvm_ctx *ctx)
> > > +{
> > > +	u64 start = range->va.start, end = range->va.end;
> > > +	struct mm_struct *mm = gpusvm->mm;
> > > +	struct vm_area_struct *vas;
> > > +	int err;
> > > +	bool retry = false;
> > > +
> > > +	if (!ctx->mmap_locked) {
> > > +		if (!mmget_not_zero(mm)) {
> > > +			err = -EFAULT;
> > > +			goto err_out;
> > > +		}
> > > +		if (ctx->trylock_mmap) {
> > > +			if (!mmap_read_trylock(mm))  {
> > > +				err =
> > > drm_gpusvm_evict_to_sram(gpusvm, range);
> > > +				goto err_mmput;
> > > +			}
> > > +		} else {
> > > +			mmap_read_lock(mm);
> > > +		}
> > > +	}
> > > +
> > > +	mmap_assert_locked(mm);
> > > +
> > > +	/*
> > > +	 * Loop required to find all VMA area structs for the corner
> > > case when
> > > +	 * VRAM backing has been partially unmapped from MM's
> > > address space.
> > > +	 */
> > > +again:
> > > +	vas = find_vma(mm, start);
> > > +	if (!vas) {
> > > +		if (!retry)
> > > +			err = -ENOENT;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	if (end <= vas->vm_start || start >= vas->vm_end) {
> > > +		if (!retry)
> > > +			err = -EINVAL;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	err = __drm_gpusvm_migrate_to_sram(gpusvm, vas, NULL, start,
> > > end);
> > > +	if (err)
> > > +		goto err_mmunlock;
> > > +
> > > +	if (vas->vm_end < end) {
> > > +		retry = true;
> > > +		start = vas->vm_end;
> > > +		goto again;
> > > +	}
> > > +
> > > +	if (!ctx->mmap_locked) {
> > > +		mmap_read_unlock(mm);
> > > +		/*
> > > +		 * Using mmput_async as this function can be called
> > > while
> > > +		 * holding a dma-resv lock, and a final put can grab
> > > the mmap
> > > +		 * lock, causing a lock inversion.
> > > +		 */
> > > +		mmput_async(mm);
> > > +	}
> > > +
> > > +	return 0;
> > > +
> > > +err_mmunlock:
> > > +	if (!ctx->mmap_locked)
> > > +		mmap_read_unlock(mm);
> > > +err_mmput:
> > > +	if (!ctx->mmap_locked)
> > > +		mmput_async(mm);
> > > +err_out:
> > > +	return err;
> > > +}
> > > +
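
For context, a minimal sketch of the garbage-collector usage from the
DOC: Examples section, where a partially unmapped range is migrated back to
SRAM before its GPU binding is destroyed; driver_unbind_range() is a
hypothetical driver helper:

	static void __driver_garbage_collector(struct drm_gpusvm *gpusvm,
					       struct drm_gpusvm_range *range)
	{
		struct drm_gpusvm_ctx ctx = {};

		/* Partial unmap: migrate any remaining VRAM pages back to SRAM */
		if (range->flags.partial_unmap)
			drm_gpusvm_migrate_to_sram(gpusvm, range, &ctx);

		driver_unbind_range(range);
		drm_gpusvm_range_remove(gpusvm, range);
	}
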
> > > +/**
> > > + * drm_gpusvm_page_free - Put GPU SVM zone device data associated
> > > with a page
> > > + * @page: Pointer to the page
> > > + *
> > > + * This function is a callback used to put the GPU SVM zone device
> > > data
> > > + * associated with a page when it is being released.
> > > + */
> > > +static void drm_gpusvm_page_free(struct page *page)
> > > +{
> > > +	drm_gpusvm_zdd_put(page->zone_device_data);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_to_ram - Migrate GPU SVM range to RAM (page
> > > fault handler)
> > > + * @vmf: Pointer to the fault information structure
> > > + *
> > > + * This function is a page fault handler used to migrate a GPU SVM
> > > range to RAM.
> > > + * It retrieves the GPU SVM range information from the faulting page
> > > and invokes
> > > + * the internal migration function to migrate the range back to RAM.
> > > + *
> > > + * Returns:
> > > + * VM_FAULT_SIGBUS on failure, 0 on success.
> > > + */
> > > +static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault *vmf)
> > > +{
> > > +	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
> > > +	int err;
> > > +
> > > +	err = __drm_gpusvm_migrate_to_sram(zdd->range->gpusvm,
> > > +					   vmf->vma, vmf->page,
> > > +					   zdd->range->va.start,
> > > +					   zdd->range->va.end);
> > > +
> > > +	return err ? VM_FAULT_SIGBUS : 0;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_pagemap_ops - Device page map operations for GPU SVM
> > > + */
> > > +static const struct dev_pagemap_ops drm_gpusvm_pagemap_ops = {
> > > +	.page_free = drm_gpusvm_page_free,
> > > +	.migrate_to_ram = drm_gpusvm_migrate_to_ram,
> > > +};
> > > +
> > > +/**
> > > + * drm_gpusvm_pagemap_ops_get - Retrieve GPU SVM device page map
> > > operations
> > > + *
> > > + * Returns:
> > > + * Pointer to the GPU SVM device page map operations structure.
> > > + */
> > > +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void)
> > > +{
> > > +	return &drm_gpusvm_pagemap_ops;
> > > +}
> > > +
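
A minimal sketch of how a driver might plug these ops into its device-private
pagemap at probe time. devm_memremap_pages() and struct dev_pagemap are the
standard kernel interfaces; the resource and owner handling here are
assumptions about the driver side:

	static int driver_pagemap_init(struct drm_device *drm, struct resource *res,
				       void *owner)
	{
		struct dev_pagemap *pagemap;
		void *addr;

		pagemap = devm_kzalloc(drm->dev, sizeof(*pagemap), GFP_KERNEL);
		if (!pagemap)
			return -ENOMEM;

		pagemap->type = MEMORY_DEVICE_PRIVATE;
		pagemap->range.start = res->start;
		pagemap->range.end = res->end;
		pagemap->nr_range = 1;
		pagemap->ops = drm_gpusvm_pagemap_ops_get();
		pagemap->owner = owner;	/* must match gpusvm->device_private_page_owner */

		addr = devm_memremap_pages(drm->dev, pagemap);
		return IS_ERR(addr) ? PTR_ERR(addr) : 0;
	}
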
> > > +/**
> > > + * drm_gpusvm_has_mapping - Check if GPU SVM has mapping for the
> > > given address range
> > > + * @gpusvm: Pointer to the GPU SVM structure.
> > > + * @start: Start address
> > > + * @end: End address
> > > + *
> > > + * Returns:
> > > + * True if GPU SVM has mapping, False otherwise
> > > + */
> > > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start,
> > > u64 end)
> > > +{
> > > +	struct drm_gpusvm_notifier *notifier;
> > > +
> > > +	drm_gpusvm_for_each_notifier(notifier, gpusvm, start, end) {
> > > +		struct drm_gpusvm_range *range = NULL;
> > > +
> > > +		drm_gpusvm_for_each_range(range, notifier, start,
> > > end)
> > > +			return true;
> > > +	}
> > > +
> > > +	return false;
> > > +}
> > > diff --git a/drivers/gpu/drm/xe/drm_gpusvm.h
> > > b/drivers/gpu/drm/xe/drm_gpusvm.h
> > > new file mode 100644
> > > index 000000000000..0ea70f8534a8
> > > --- /dev/null
> > > +++ b/drivers/gpu/drm/xe/drm_gpusvm.h
> > > @@ -0,0 +1,415 @@
> > > +/* SPDX-License-Identifier: MIT */
> > > +/*
> > > + * Copyright © 2024 Intel Corporation
> > > + */
> > > +
> > > +#ifndef __DRM_GPUSVM_H__
> > > +#define __DRM_GPUSVM_H__
> > > +
> > > +#include <linux/kref.h>
> > > +#include <linux/mmu_notifier.h>
> > > +#include <linux/workqueue.h>
> > > +
> > > +struct dev_pagemap_ops;
> > > +struct drm_device;
> > > +struct drm_gpusvm;
> > > +struct drm_gpusvm_notifier;
> > > +struct drm_gpusvm_ops;
> > > +struct drm_gpusvm_range;
> > > +
> > > +/**
> > > + * struct drm_gpusvm_ops - Operations structure for GPU SVM
> > > + *
> > > + * This structure defines the operations for GPU Shared Virtual
> > > Memory (SVM).
> > > + * These operations are provided by the GPU driver to manage SVM
> > > ranges and
> > > + * perform operations such as migration between VRAM and system RAM.
> > > + */
> > > +struct drm_gpusvm_ops {
> > > +	/**
> > > +	 * @notifier_alloc: Allocate a GPU SVM notifier (optional)
> > > +	 *
> > > +	 * This function shall allocate a GPU SVM notifier.
> > > +	 *
> > > +	 * Returns:
> > > +	 * Pointer to the allocated GPU SVM notifier on success,
> > > NULL on failure.
> > > +	 */
> > > +	struct drm_gpusvm_notifier *(*notifier_alloc)(void);
> > > +
> > > +	/**
> > > +	 * @notifier_free: Free a GPU SVM notifier (optional)
> > > +	 * @notifier: Pointer to the GPU SVM notifier to be freed
> > > +	 *
> > > +	 * This function shall free a GPU SVM notifier.
> > > +	 */
> > > +	void (*notifier_free)(struct drm_gpusvm_notifier *notifier);
> > > +
> > > +	/**
> > > +	 * @range_alloc: Allocate a GPU SVM range (optional)
> > > +	 * @gpusvm: Pointer to the GPU SVM
> > > +	 *
> > > +	 * This function shall allocate a GPU SVM range.
> > > +	 *
> > > +	 * Returns:
> > > +	 * Pointer to the allocated GPU SVM range on success, NULL
> > > on failure.
> > > +	 */
> > > +	struct drm_gpusvm_range *(*range_alloc)(struct drm_gpusvm
> > > *gpusvm);
> > > +
> > > +	/**
> > > +	 * @range_free: Free a GPU SVM range (optional)
> > > +	 * @range: Pointer to the GPU SVM range to be freed
> > > +	 *
> > > +	 * This function shall free a GPU SVM range.
> > > +	 */
> > > +	void (*range_free)(struct drm_gpusvm_range *range);
> > > +
> > > +	/**
> > > +	 * @vram_release: Release VRAM allocation (optional)
> > > +	 * @vram_allocation: Driver-private pointer to the VRAM
> > > allocation
> > > +	 *
> > > +	 * This function shall release VRAM allocation and expects
> > > to drop a
> > > +	 * reference to VRAM allocation.
> > > +	 */
> > > +	void (*vram_release)(void *vram_allocation);
> > > +
> > > +	/**
> > > +	 * @populate_vram_pfn: Populate VRAM PFN (required for
> > > migration)
> > > +	 * @gpusvm: Pointer to the GPU SVM
> > > +	 * @vram_allocation: Driver-private pointer to the VRAM
> > > allocation
> > > +	 * @npages: Number of pages to populate
> > > +	 * @pfn: Array of page frame numbers to populate
> > > +	 *
> > > +	 * This function shall populate VRAM page frame numbers
> > > (PFN).
> > > +	 *
> > > +	 * Returns:
> > > +	 * 0 on success, a negative error code on failure.
> > > +	 */
> > > +	int (*populate_vram_pfn)(struct drm_gpusvm *gpusvm,
> > > +				 void *vram_allocation,
> > > +				 unsigned long npages,
> > > +				 unsigned long *pfn);
> > > +
> > > +	/**
> > > +	 * @copy_to_vram: Copy to VRAM (required for migration)
> > > +	 * @gpusvm: Pointer to the GPU SVM
> > > +	 * @pages: Pointer to array of VRAM pages (destination)
> > > +	 * @dma_addr: Pointer to array of DMA addresses (source)
> > > +	 * @npages: Number of pages to copy
> > > +	 *
> > > +	 * This function shall copy pages to VRAM.
> > > +	 *
> > > +	 * Returns:
> > > +	 * 0 on success, a negative error code on failure.
> > > +	 */
> > > +	int (*copy_to_vram)(struct drm_gpusvm *gpusvm,
> > > +			    struct page **pages,
> > > +			    dma_addr_t *dma_addr,
> > > +			    unsigned long npages);
> > > +
> > > +	/**
> > > +	 * @copy_to_sram: Copy to system RAM (required for
> > > migration)
> > > +	 * @gpusvm: Pointer to the GPU SVM
> > > +	 * @pages: Pointer to array of VRAM pages (source)
> > > +	 * @dma_addr: Pointer to array of DMA addresses
> > > (destination)
> > > +	 * @npages: Number of pages to copy
> > > +	 *
> > > +	 * This function shall copy pages to system RAM.
> > > +	 *
> > > +	 * Returns:
> > > +	 * 0 on success, a negative error code on failure.
> > > +	 */
> > > +	int (*copy_to_sram)(struct drm_gpusvm *gpusvm,
> > > +			    struct page **pages,
> > > +			    dma_addr_t *dma_addr,
> > > +			    unsigned long npages);
> > > +
> > > +	/**
> > > +	 * @invalidate: Invalidate GPU SVM notifier (required)
> > > +	 * @gpusvm: Pointer to the GPU SVM
> > > +	 * @notifier: Pointer to the GPU SVM notifier
> > > +	 * @mmu_range: Pointer to the mmu_notifier_range structure
> > > +	 *
> > > +	 * This function shall invalidate the GPU page tables. It
> > > can safely
> > > +	 * walk the notifier range RB tree/list in this function.
> > > Called while
> > > +	 * holding the notifier lock.
> > > +	 */
> > > +	void (*invalidate)(struct drm_gpusvm *gpusvm,
> > > +			   struct drm_gpusvm_notifier *notifier,
> > > +			   const struct mmu_notifier_range
> > > *mmu_range);
> > > +};
> > > +
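
To make the contract concrete, a hedged sketch of a driver-side ops table;
every driver_* symbol here is hypothetical, and the optional notifier/range
alloc and free hooks are simply left NULL:

	static const struct drm_gpusvm_ops driver_gpusvm_ops = {
		.vram_release		= driver_vram_release,
		.populate_vram_pfn	= driver_populate_vram_pfn,
		.copy_to_vram		= driver_copy_to_vram,	/* e.g. blitter / DMA copy */
		.copy_to_sram		= driver_copy_to_sram,
		.invalidate		= driver_invalidate,	/* required: zap GPU PTEs */
	};
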
> > > +/**
> > > + * struct drm_gpusvm_notifier - Structure representing a GPU SVM
> > > notifier
> > > + *
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: MMU interval notifier
> > > + * @interval: Interval for the notifier
> > > + * @rb: Red-black tree node for the parent GPU SVM structure
> > > notifier tree
> > > + * @root: Cached root node of the RB tree containing ranges
> > > + * @range_list: List head of ranges in the same order they appear in the
> > > + *              interval tree. This is useful to keep iterating over ranges
> > > + *              while doing modifications to the RB tree.
> > > + * @flags.removed: Flag indicating whether the MMU interval notifier
> > > has been
> > > + *                 removed
> > > + *
> > > + * This structure represents a GPU SVM notifier.
> > > + */
> > > +struct drm_gpusvm_notifier {
> > > +	struct drm_gpusvm *gpusvm;
> > > +	struct mmu_interval_notifier notifier;
> > > +	struct {
> > > +		u64 start;
> > > +		u64 end;
> > > +	} interval;
> > > +	struct {
> > > +		struct rb_node node;
> > > +		struct list_head entry;
> > > +		u64 __subtree_last;
> > > +	} rb;
> > > +	struct rb_root_cached root;
> > > +	struct list_head range_list;
> > > +	struct {
> > > +		u32 removed : 1;
> > > +	} flags;
> > > +};
> > > +
> > > +/**
> > > + * struct drm_gpusvm_range - Structure representing a GPU SVM range
> > > + *
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: Pointer to the GPU SVM notifier
> > > + * @refcount: Reference count for the range
> > > + * @rb: Red-black tree node for the parent GPU SVM notifier
> > > structure range tree
> > > + * @va: Virtual address range
> > > + * @notifier_seq: Notifier sequence number of the range's pages
> > > + * @pages: Pointer to the array of pages (if backing store is in
> > > VRAM)
> > > + * @dma_addr: DMA address array (if backing store is SRAM and DMA
> > > mapped)
> > > + * @vram_allocation: Driver-private pointer to the VRAM allocation
> > > + * @order: Order of dma mapping. i.e. PAGE_SIZE << order is mapping
> > > size
> > > + * @flags.migrate_vram: Flag indicating whether the range can be
> > > migrated to VRAM
> > > + * @flags.unmapped: Flag indicating if the range has been unmapped
> > > + * @flags.partial_unmap: Flag indicating if the range has been
> > > partially unmapped
> > > + * @flags.has_vram_pages: Flag indicating if the range has vram
> > > pages
> > > + * @flags.has_dma_mapping: Flag indicating if the range has a DMA
> > > mapping
> > > + * @flags.kfree_mapping: Flag indicating @dma_addr is a compact
> > > allocation based
> > > + *                       on @order which releases via kfree
> > > + *
> > > + * This structure represents a GPU SVM range used for tracking
> > > memory ranges
> > > + * mapped in a DRM device.
> > > + */
> > > +struct drm_gpusvm_range {
> > > +	struct drm_gpusvm *gpusvm;
> > > +	struct drm_gpusvm_notifier *notifier;
> > > +	struct kref refcount;
> > > +	struct {
> > > +		struct rb_node node;
> > > +		struct list_head entry;
> > > +		u64 __subtree_last;
> > > +	} rb;
> > > +	struct {
> > > +		u64 start;
> > > +		u64 end;
> > > +	} va;
> > > +	unsigned long notifier_seq;
> > > +	union {
> > > +		struct page **pages;
> > > +		dma_addr_t *dma_addr;
> > > +	};
> > > +	void *vram_allocation;
> > > +	u16 order;
> > > +	struct {
> > > +		/* All flags below must be set upon creation */
> > > +		u16 migrate_vram : 1;
> > > +		/* All flags below must be set / cleared under
> > > notifier lock */
> > > +		u16 unmapped : 1;
> > > +		u16 partial_unmap : 1;
> > > +		u16 has_vram_pages : 1;
> > > +		u16 has_dma_mapping : 1;
> > > +		u16 kfree_mapping : 1;
> > > +	} flags;
> > > +};
> > > +
> > > +/**
> > > + * struct drm_gpusvm - GPU SVM structure
> > > + *
> > > + * @name: Name of the GPU SVM
> > > + * @drm: Pointer to the DRM device structure
> > > + * @mm: Pointer to the mm_struct for the address space
> > > + * @device_private_page_owner: Device private pages owner
> > > + * @mm_start: Start address of GPU SVM
> > > + * @mm_range: Range of the GPU SVM
> > > + * @notifier_size: Size of individual notifiers
> > > + * @ops: Pointer to the operations structure for GPU SVM
> > > + * @chunk_sizes: Pointer to the array of chunk sizes used in range
> > > allocation.
> > > + *               Entries should be powers of 2 in descending order.
> > > + * @num_chunks: Number of chunks
> > > + * @notifier_lock: Read-write semaphore for protecting notifier
> > > operations
> > > + * @zdd_wq: Workqueue for deferred work on zdd destruction
> > > + * @root: Cached root node of the Red-Black tree containing GPU SVM
> > > notifiers
> > > + * @notifier_list: list head of notifiers in the same order they appear in
> > > + *                 the interval tree. This is useful to keep iterating over
> > > + *                 notifiers while doing modifications to the RB tree.
> > > + *
> > > + * This structure represents a GPU SVM (Shared Virtual Memory) used
> > > for tracking
> > > + * memory ranges mapped in a DRM (Direct Rendering Manager) device.
> > > + *
> > > + * No reference counting is provided, as this is expected to be
> > > embedded in the
> > > + * driver VM structure along with the struct drm_gpuvm, which
> > > handles reference
> > > + * counting.
> > > + */
> > > +struct drm_gpusvm {
> > > +	const char *name;
> > > +	struct drm_device *drm;
> > > +	struct mm_struct *mm;
> > > +	void *device_private_page_owner;
> > > +	u64 mm_start;
> > > +	u64 mm_range;
> > > +	u64 notifier_size;
> > > +	const struct drm_gpusvm_ops *ops;
> > > +	const u64 *chunk_sizes;
> > > +	int num_chunks;
> > > +	struct rw_semaphore notifier_lock;
> > > +	struct workqueue_struct *zdd_wq;
> > > +	struct rb_root_cached root;
> > > +	struct list_head notifier_list;
> > > +};
> > > +
> > > +/**
> > > + * struct drm_gpusvm_ctx - DRM GPU SVM context
> > > + *
> > > + * @mmap_locked: mmap lock is locked
> > > + * @trylock_mmap: trylock mmap lock, used to avoid locking inversions
> > > + *                (e.g. dma-resv -> mmap lock)
> > > + * @in_notifier: entering from a MMU notifier
> > > + * @read_only: operating on read-only memory
> > > + * @vram_possible: possible to use VRAM
> > > + * @prefault: prefault pages
> > > + *
> > > + * Context that DRM GPU SVM is operating in (i.e. user arguments).
> > > + */
> > > +struct drm_gpusvm_ctx {
> > > +	u32 mmap_locked :1;
> > > +	u32 trylock_mmap :1;
> > > +	u32 in_notifier :1;
> > > +	u32 read_only :1;
> > > +	u32 vram_possible :1;
> > > +	u32 prefault :1;
> > > +};
> > > +
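
For illustration, a hedged sketch of typical flag combinations for the call
paths described in the kernel-doc above; the exact policy is up to the driver:

	/* GPU fault handler: GPU SVM takes the mmap lock itself, VRAM is usable */
	static const struct drm_gpusvm_ctx fault_ctx = { .vram_possible = 1 };

	/* ops->invalidate: gpusvm->notifier_lock is already held in write mode */
	static const struct drm_gpusvm_ctx notifier_ctx = { .in_notifier = 1 };

	/* Eviction while holding a dma-resv lock: avoid mmap/dma-resv inversion */
	static const struct drm_gpusvm_ctx evict_ctx = { .trylock_mmap = 1 };
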
> > > +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> > > +		    const char *name, struct drm_device *drm,
> > > +		    struct mm_struct *mm, void
> > > *device_private_page_owner,
> > > +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> > > +		    const struct drm_gpusvm_ops *ops,
> > > +		    const u64 *chunk_sizes, int num_chunks);
> > > +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm);
> > > +void drm_gpusvm_free(struct drm_gpusvm *gpusvm);
> > > +
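
A minimal initialization sketch, assuming an Xe-like configuration (2M/64K/4K
chunks in descending order and the 512M notifier size recommended in the
overview); struct driver_vm and its members are assumptions, and
driver_gpusvm_ops is the hypothetical ops table sketched earlier:

	static const u64 driver_chunk_sizes[] = { SZ_2M, SZ_64K, SZ_4K };

	int driver_svm_init(struct driver_vm *vm)
	{
		return drm_gpusvm_init(&vm->svm, "driver-svm", vm->drm,
				       current->mm, vm->pagemap_owner,
				       0, 1ull << 47,	/* assumed GPU VA window */
				       SZ_512M, &driver_gpusvm_ops,
				       driver_chunk_sizes,
				       ARRAY_SIZE(driver_chunk_sizes));
	}
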
> > > +struct drm_gpusvm_range *
> > > +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64
> > > fault_addr,
> > > +				u64 gpuva_start, u64 gpuva_end,
> > > +				const struct drm_gpusvm_ctx *ctx);
> > > +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> > > +			     struct drm_gpusvm_range *range);
> > > +
> > > +struct drm_gpusvm_range *
> > > +drm_gpusvm_range_get(struct drm_gpusvm_range *range);
> > > +void drm_gpusvm_range_put(struct drm_gpusvm_range *range);
> > > +
> > > +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> > > +				  struct drm_gpusvm_range *range);
> > > +
> > > +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       const struct drm_gpusvm_ctx *ctx);
> > > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > > +				  struct drm_gpusvm_range *range,
> > > +				  const struct drm_gpusvm_ctx *ctx);
> > > +
> > > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       void *vram_allocation,
> > > +			       const struct drm_gpusvm_ctx *ctx);
> > > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       const struct drm_gpusvm_ctx *ctx);
> > > +
> > > +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void);
> > > +
> > > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start,
> > > u64 end);
> > > +
> > > +struct drm_gpusvm_range *
> > > +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64
> > > start, u64 end);
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_lock - Lock GPU SVM notifier
> > > + * @gpusvm__: Pointer to the GPU SVM structure.
> > > + *
> > > + * Abstract client usage GPU SVM notifier lock, take lock
> > > + */
> > > +#define drm_gpusvm_notifier_lock(gpusvm__)	\
> > > +	down_read(&(gpusvm__)->notifier_lock)
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_unlock - Unlock GPU SVM notifier
> > > + * @gpusvm__: Pointer to the GPU SVM structure.
> > > + *
> > > + * Abstract client usage GPU SVM notifier lock, drop lock
> > > + */
> > > +#define drm_gpusvm_notifier_unlock(gpusvm__)	\
> > > +	up_read(&(gpusvm__)->notifier_lock)
> > > +
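
These wrappers are what the bind path uses for the final validity check before
committing GPU page tables; a condensed sketch of that pattern from the
DOC: Examples section, with driver_commit_bind() hypothetical:

	static int driver_bind_range(struct drm_gpusvm *gpusvm,
				     struct drm_gpusvm_range *range)
	{
		int err = 0;

		drm_gpusvm_notifier_lock(gpusvm);
		if (drm_gpusvm_range_pages_valid(gpusvm, range))
			driver_commit_bind(gpusvm, range);	/* write GPU PTEs */
		else
			err = -EAGAIN;				/* caller retries the fault */
		drm_gpusvm_notifier_unlock(gpusvm);

		return err;
	}
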
> > > +/**
> > > + * __drm_gpusvm_range_next - Get the next GPU SVM range in the list
> > > + * @range: a pointer to the current GPU SVM range
> > > + *
> > > + * Return: A pointer to the next drm_gpusvm_range if available, or
> > > NULL if the
> > > + *         current range is the last one or if the input range is
> > > NULL.
> > > + */
> > > +static inline struct drm_gpusvm_range *
> > > +__drm_gpusvm_range_next(struct drm_gpusvm_range *range)
> > > +{
> > > +	if (range && !list_is_last(&range->rb.entry,
> > > +				   &range->notifier->range_list))
> > > +		return list_next_entry(range, rb.entry);
> > > +
> > > +	return NULL;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_for_each_range - Iterate over GPU SVM ranges in a
> > > notifier
> > > + * @range__: Iterator variable for the ranges. If set, it indicates
> > > the start of
> > > + *	     the iterator. If NULL, call drm_gpusvm_range_find() to
> > > get the range.
> > > + * @notifier__: Pointer to the GPU SVM notifier
> > > + * @start__: Start address of the range
> > > + * @end__: End address of the range
> > > + *
> > > + * This macro is used to iterate over GPU SVM ranges in a notifier.
> > > It is safe
> > > + * to use while holding the driver SVM lock or the notifier lock.
> > > + */
> > > +#define drm_gpusvm_for_each_range(range__, notifier__, start__, end__)	\
> > > +	for ((range__) = (range__) ?:					\
> > > +	     drm_gpusvm_range_find((notifier__), (start__), (end__));	\
> > > +	     (range__) && (range__->va.start < (end__));		\
> > > +	     (range__) = __drm_gpusvm_range_next(range__))
> > > +
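
A small, purely illustrative usage sketch; the caller is assumed to hold
either the driver SVM lock or the notifier lock as noted above:

	static unsigned long driver_count_svm_pages(struct drm_gpusvm_notifier *notifier,
						    u64 start, u64 end)
	{
		struct drm_gpusvm_range *range = NULL;
		unsigned long npages = 0;

		drm_gpusvm_for_each_range(range, notifier, start, end)
			npages += (range->va.end - range->va.start) >> PAGE_SHIFT;

		return npages;
	}
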
> > > +/**
> > > + * drm_gpusvm_range_set_unmapped - Mark a GPU SVM range as unmapped
> > > + * @range: Pointer to the GPU SVM range structure.
> > > + * @mmu_range: Pointer to the MMU notifier range structure.
> > > + *
> > > + * This function marks a GPU SVM range as unmapped and sets the partial_unmap
> > > + * flag if the range partially falls within the provided MMU notifier range.
> > > + */
> > > +static inline void
> > > +drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range *range,
> > > +			      const struct mmu_notifier_range
> > > *mmu_range)
> > > +{
> > > +	lockdep_assert_held_write(&range->gpusvm->notifier_lock);
> > > +
> > > +	range->flags.unmapped = true;
> > > +	if (range->va.start < mmu_range->start ||
> > > +	    range->va.end > mmu_range->end)
> > > +		range->flags.partial_unmap = true;
> > > +}
> > > +
> > > +#endif /* __DRM_GPUSVM_H__ */
> >
Thomas Hellström Aug. 29, 2024, 7:18 p.m. UTC | #14
Hi, Matthew,

On Thu, 2024-08-29 at 17:45 +0000, Matthew Brost wrote:
> On Thu, Aug 29, 2024 at 11:16:49AM +0200, Thomas Hellström wrote:
> > Hi, Matt. 
> > 
> > Some initial design comments / questions:
> > 
> > On Tue, 2024-08-27 at 19:48 -0700, Matthew Brost wrote:
> > > This patch introduces support for GPU Shared Virtual Memory (SVM)
> > > in
> > > the
> > > Direct Rendering Manager (DRM) subsystem. SVM allows for seamless
> > > sharing of memory between the CPU and GPU, enhancing performance
> > > and
> > > flexibility in GPU computing tasks.
> > > 
> > > The patch adds the necessary infrastructure for SVM, including
> > > data
> > > structures and functions for managing SVM ranges and notifiers.
> > > It
> > > also
> > > provides mechanisms for allocating, deallocating, and migrating
> > > memory
> > > regions between system RAM and GPU VRAM.
> > > 
> > > This mid-layer is largely inspired by GPUVM.
> > > 
> > > Cc: Dave Airlie <airlied@redhat.com>
> > > Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > > Cc: Christian König <christian.koenig@amd.com>
> > > Cc: <dri-devel@lists.freedesktop.org>
> > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > ---
> > >  drivers/gpu/drm/xe/Makefile     |    3 +-
> > >  drivers/gpu/drm/xe/drm_gpusvm.c | 2174
> > > +++++++++++++++++++++++++++++++
> > >  drivers/gpu/drm/xe/drm_gpusvm.h |  415 ++++++
> > >  3 files changed, 2591 insertions(+), 1 deletion(-)
> > >  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.c
> > >  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.h
> > > 
> > > diff --git a/drivers/gpu/drm/xe/Makefile
> > > b/drivers/gpu/drm/xe/Makefile
> > > index b9670ae09a9e..b8fc2ee58f1a 100644
> > > --- a/drivers/gpu/drm/xe/Makefile
> > > +++ b/drivers/gpu/drm/xe/Makefile
> > > @@ -25,7 +25,8 @@ $(obj)/generated/%_wa_oob.c
> > > $(obj)/generated/%_wa_oob.h: $(obj)/xe_gen_wa_oob \
> > >  
> > >  # core driver code
> > >  
> > > -xe-y += xe_bb.o \
> > > +xe-y += drm_gpusvm.o \
> > > +	xe_bb.o \
> > >  	xe_bo.o \
> > >  	xe_bo_evict.o \
> > >  	xe_devcoredump.o \
> > > diff --git a/drivers/gpu/drm/xe/drm_gpusvm.c
> > > b/drivers/gpu/drm/xe/drm_gpusvm.c
> > > new file mode 100644
> > > index 000000000000..fc1e44e6ae72
> > > --- /dev/null
> > > +++ b/drivers/gpu/drm/xe/drm_gpusvm.c
> > > @@ -0,0 +1,2174 @@
> > > +// SPDX-License-Identifier: MIT
> > > +/*
> > > + * Copyright © 2024 Intel Corporation
> > > + *
> > > + * Authors:
> > > + *     Matthew Brost <matthew.brost@intel.com>
> > > + */
> > > +
> > > +#include <linux/dma-mapping.h>
> > > +#include <linux/interval_tree_generic.h>
> > > +#include <linux/hmm.h>
> > > +#include <linux/memremap.h>
> > > +#include <linux/migrate.h>
> > > +#include <linux/mm_types.h>
> > > +#include <linux/pagemap.h>
> > > +#include <linux/slab.h>
> > > +
> > > +#include <drm/drm_device.h>
> > > +#include "drm_gpusvm.h"
> > > +
> > > +/**
> > > + * DOC: Overview
> > > + *
> > > + * GPU Shared Virtual Memory (GPU SVM) layer for the Direct
> > > Rendering Manager (DRM)
> > > + *
> > > + * The GPU SVM layer is a component of the DRM framework
> > > designed to
> > > manage shared
> > > + * virtual memory between the CPU and GPU. It enables efficient
> > > data
> > > exchange and
> > > + * processing for GPU-accelerated applications by allowing
> > > memory
> > > sharing and
> > > + * synchronization between the CPU's and GPU's virtual address
> > > spaces.
> > > + *
> > > + * Key GPU SVM Components:
> > > + * - Notifiers: Used for tracking memory intervals and notifying the
> > > + *		GPU of changes, notifiers are sized based on a GPU SVM
> > > + *		initialization parameter, with a recommendation of 512M or
> > > + *		larger. They maintain a Red-Black tree and a list of ranges that
> > > + *		fall within the notifier interval. Notifiers are tracked within
> > > + *		a GPU SVM Red-Black tree and list and are dynamically inserted
> > > + *		or removed as ranges within the interval are created or
> > > + *		destroyed.
> > 
> > What is the benefit of this extra layer compared to direct
> > insertion of
> > ranges using mmu_interval_notifier_insert?
> > 
> > IIRC the argument made previously about having wide notifiers was
> > that
> > the rb tree lookups inside the core were costly and if there were
> > only
> > a few, then the rb tree lookups within a notifier range could be
> > replaced with the page-table radix-tree-like lookup, so each lookup
> > complexity would be O(log(n_notifiers) + page_table_depth).
> > 
> > But now we have first an rb-tree lookup in the core and then an rb-
> > tree
> > lookup within each notifier yeilding O(log(n_ranges))
> > 
> > I can see a small benefit in that inserting directly into the core
> > rb-
> > tree will block pending ongoing invalidations, but at a cost of an
> > extra multiplexing layer.
> > 
> 
> So when the notifier is triggered the search is a smaller range. In a
> perfect world eventually I'd like to drop the SVM range completely.
> There are a lot of changes required in Xe to make that possible, and I'm
> not entirely convinced it is possible or that the ROI is worth it
> (additional complexity vs. perf benefit). For now, this was a relatively
> simple way to get SVM working (it mirrors both AMD's and Nvidia's
> implementations with respect to having a range concept) but is also
> flexible in the sense that the notifier size can be easily tweaked via a
> modparam [1], following Jason's suggestion of larger notifiers.
> 
> [1]
> https://patchwork.freedesktop.org/patch/611007/?series=137870&rev=1

What I meant was the core is already implementing the "one notifier for
the whole range", since your notifier duplicates the
mmu_interval_notifier functionality.

The mmu_interval_notifier first does an rbtree search to get to the
notifier, and then drm_gpusvm does an rbtree search to get to the
range.

If the svm notifier layer is skipped, mmu_interval_notifier has to
perform a wider rbtree search to get to the range. The point is, the
complexity is the same for both approaches so there is no point in
adding a svm notifier layer for that reason. The width of the notifier
just adjusts the relative size of the two rbtree searches, so from that
point of view the drm_gpusvm does not offer any benefit over inserting
the ranges into the mmu_interval_notifier directly (except that the
mmu_interval_notifier is slightly more heavyweight).

As I understand it, Jasons comments were based on the assumption that
the drm_gpusvm search would be radix tree based, and hence with less
complexity than the rbtree search, and therefore providing a clear
benefit the larger they could be.

I.e. just calling something similar to xe_vm_invalidate_xxx over the
whole range, which will just skip subranges that are not populated.

/Thomas

> 
> > > + * - Ranges: Represent memory ranges mapped in a DRM device and managed
> > > + *	     by GPU SVM. They are sized based on an array of chunk sizes, which
> > > + *	     is a GPU SVM initialization parameter, and the CPU address space.
> > > + *	     Upon GPU fault, the largest aligned chunk that fits within the
> > > + *	     faulting CPU address space is chosen for the range size. Ranges are
> > > + *	     expected to be dynamically allocated on GPU fault and removed on an
> > > + *	     MMU notifier UNMAP event. As mentioned above, ranges are tracked in
> > > + *	     a notifier's Red-Black tree.
> > 
> > How do ranges and chunks map to
> >  
> > a) Prefaulting granularity
> > b) Migration granularity?
> > 
> > > + * - Operations: Define the interface for driver-specific SVM
> > > operations such as
> > > + *		 allocation, page collection, migration,
> > > invalidations, and VRAM
> > > + *		 release.
> > > + *
> > > + * This layer provides interfaces for allocating, mapping,
> > > migrating, and
> > > + * releasing memory ranges between the CPU and GPU. It handles
> > > all
> > > core memory
> > > + * management interactions (DMA mapping, HMM, and migration) and
> > > provides
> > > + * driver-specific virtual functions (vfuncs). This
> > > infrastructure
> > > is sufficient
> > > + * to build the expected driver components for an SVM
> > > implementation
> > > as detailed
> > > + * below.
> > > + *
> > > + * Expected Driver Components:
> > > + * - GPU page fault handler: Used to create ranges and notifiers
> > > based on the
> > > + *			     fault address, optionally migrate
> > > the
> > > range to
> > > + *			     VRAM, and create GPU bindings.
> > > + * - Garbage collector: Used to destroy GPU bindings for ranges.
> > > Ranges are
> > > + *			expected to be added to the garbage
> > > collector upon
> > > + *			MMU_NOTIFY_UNMAP event.
> > > + */
> > > +
> > > +/**
> > > + * DOC: Locking
> > > + *
> > > + * GPU SVM handles locking for core MM interactions, i.e., it
> > > locks/unlocks the
> > > + * mmap lock as needed. Alternatively, if the driver prefers to
> > > handle the mmap
> > > + * lock itself, a 'locked' argument is provided to the functions
> > > that require
> > > + * the mmap lock. This option may be useful for drivers that
> > > need to
> > > call into
> > > + * GPU SVM while also holding a dma-resv lock, thus preventing
> > > locking
> > > + * inversions between the mmap and dma-resv locks.
> > > + *
> > > + * GPU SVM introduces a global notifier lock, which safeguards
> > > the
> > > notifier's
> > > + * range RB tree and list, as well as the range's DMA mappings
> > > and
> > > sequence
> > > + * number. GPU SVM manages all necessary locking and unlocking
> > > operations,
> > > + * except for the recheck of the range's sequence number
> > > + * (mmu_interval_read_retry) when the driver is committing GPU
> > > bindings. This
> > > + * lock corresponds to the 'driver->update' lock mentioned in
> > > the
> > > HMM
> > > + * documentation (TODO: Link). Future revisions may transition
> > > from
> > > a GPU SVM
> > > + * global lock to a per-notifier lock if finer-grained locking
> > > is
> > > deemed
> > > + * necessary.
> > > + *
> > > + * In addition to the locking mentioned above, the driver should
> > > implement a
> > > + * lock to safeguard core GPU SVM function calls that modify
> > > state,
> > > such as
> > > + * drm_gpusvm_range_find_or_insert and drm_gpusvm_range_remove.
> > > Alternatively,
> > > + * these core functions can be called within a single kernel
> > > thread,
> > > for
> > > + * instance, using an ordered work queue. This lock is denoted
> > > as
> > > + * 'driver_svm_lock' in code examples.
> > > + */
> > > +
> > > +/**
> > > + * DOC: Migration
> > > + *
> > > + * The migration support is quite simple, allowing migration
> > > between
> > > SRAM and
> > > + * VRAM at the range granularity. For example, GPU SVM currently
> > > does not
> > > + * support mixing SRAM and VRAM pages within a range. This means
> > > that upon GPU
> > > + * fault, the entire range can be migrated to VRAM, and upon CPU
> > > fault, the
> > > + * entire range is migrated to SRAM.
> > > + *
> > > + * The reasoning for only supporting range granularity is as
> > > follows: it
> > > + * simplifies the implementation, and range sizes are driver-
> > > defined
> > > and should
> > > + * be relatively small.
> > > + */
> > > +
> > > +/**
> > > + * DOC: Partial Unmapping of Ranges
> > > + *
> > > + * Partial unmapping of ranges (e.g., 1M out of 2M is unmapped
> > > by
> > > CPU resulting
> > > + * in MMU_NOTIFY_UNMAP event) presents several challenges, with
> > > the
> > > main one
> > > + * being that a subset of the range still has CPU and GPU
> > > mappings.
> > > If the
> > > + * backing store for the range is in VRAM, a subset of the
> > > backing
> > > store has
> > > + * references. One option would be to split the range and VRAM
> > > backing store,
> > > + * but the implementation for this would be quite complicated.
> > > Given
> > > that
> > > + * partial unmappings are rare and driver-defined range sizes
> > > are
> > > relatively
> > > + * small, GPU SVM does not support splitting of ranges.
> > > + *
> > > + * With no support for range splitting, upon partial unmapping
> > > of a
> > > range, the
> > > + * driver is expected to invalidate and destroy the entire
> > > range. If
> > > the range
> > > + * has VRAM as its backing, the driver is also expected to
> > > migrate
> > > any remaining
> > > + * pages back to SRAM.
> > 
> > So what happens if we get a one-page invalidation, say protection
> > change event, or NUMA accounting event, in the middle of a range? Can
> > we unmap just that single gpu pte covering that range, that is, how do
> > the ranges map to invalidation granularity? Does this differ between
> > igfx and dgfx?
> 
> Well the idea of chunks is ranges should be 1 GPU page (the chunk array
> in Xe is 4k, 64k, and 2M). The design is flexible enough that this
> doesn't have to be true, but it is optimized for the assumption that
> each range is most likely 1 GPU page. If this isn't true, then all GPU
> pages in the range are invalidated, which isn't ideal but keeps it
> simple, which IMO far outweighs the potential benefits. In theory a
> driver could implement splitting / partial invalidations too with a
> couple of updates to GPUSVM, but that would likely largely be a driver
> implementation rather than GPUSVM.
> 
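
(For concreteness: the chunk array referred to above is the descending list of
sizes handed to drm_gpusvm_init(); with the Xe defaults mentioned here that
would look like

	static const u64 chunk_sizes[] = { SZ_2M, SZ_64K, SZ_4K };

passed as (chunk_sizes, ARRAY_SIZE(chunk_sizes)), with the largest aligned
entry that fits the faulting CPU address space chosen as the range size.)
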
> No difference between igfx and dgfx.
> 
> You bring up a good point about protection changes; I likely haven't
> fully gotten that part of the implementation correct either. I can add
> this to my TODO list and also update my IGTs to exercise cases like this.
> 
> Matt
> 
> > 
> > Thanks,
> > Thomas
> > 
> > 
> > 
> > 
> > > + */
> > > +
> > > +/**
> > > + * DOC: Examples
> > > + *
> > > + * This section provides two examples of how to build the
> > > expected
> > > driver
> > > + * components: the GPU page fault handler and the garbage
> > > collector.
> > > A third
> > > + * example demonstrates a sample invalidation driver vfunc.
> > > + *
> > > + * The generic code provided does not include logic for complex
> > > migration
> > > + * policies, optimized invalidations, or other potentially
> > > required
> > > driver
> > > + * locking (e.g., DMA-resv locks).
> > > + *
> > > + * 1) GPU page fault handler
> > > + *
> > > + *	int driver_bind_range(struct drm_gpusvm *gpusvm, struct
> > > drm_gpusvm_range *range)
> > > + *	{
> > > + *		int err = 0;
> > > + *
> > > + *		driver_alloc_and_setup_memory_for_bind(gpusvm,
> > > range);
> > > + *
> > > + *		drm_gpusvm_notifier_lock(gpusvm);
> > > + *		if (drm_gpusvm_range_pages_valid(range))
> > > + *			driver_commit_bind(gpusvm, range);
> > > + *		else
> > > + *			err = -EAGAIN;
> > > + *		drm_gpusvm_notifier_unlock(gpusvm);
> > > + *
> > > + *		return err;
> > > + *	}
> > > + *
> > > + *	int driver_gpu_fault(struct drm_gpusvm *gpusvm, u64
> > > fault_addr,
> > > + *			     u64 gpuva_start, u64 gpuva_end)
> > > + *	{
> > > + *		struct drm_gpusvm_ctx ctx = {};
> > > + *		int err;
> > > + *
> > > + *		driver_svm_lock();
> > > + *	retry:
> > > + *		// Always process UNMAPs first so view of GPU
> > > SVM
> > > ranges is current
> > > + *		driver_garbage_collector(gpusvm);
> > > + *
> > > + *		range = drm_gpusvm_range_find_or_insert(gpusvm,
> > > fault_addr,
> > > +
> > > *							gpuva_start,
> > > gpuva_end,
> > > + *						        &ctx);
> > > + *		if (IS_ERR(range)) {
> > > + *			err = PTR_ERR(range);
> > > + *			goto unlock;
> > > + *		}
> > > + *
> > > + *		if (driver_migration_policy(range)) {
> > > + *			bo = driver_alloc_bo();
> > > + *			err = drm_gpusvm_migrate_to_vram(gpusvm,
> > > range, bo, &ctx);
> > > + *			if (err)	// CPU mappings may have
> > > changed
> > > + *				goto retry;
> > > + *		}
> > > + *
> > > + *		err = drm_gpusvm_range_get_pages(gpusvm, range,
> > > &ctx);
> > > + *		if (err == -EFAULT || err == -EPERM)	// CPU
> > > mappings changed
> > > + *			goto retry;
> > > + *		else if (err)
> > > + *			goto unlock;
> > > + *
> > > + *		err = driver_bind_range(gpusvm, range);
> > > + *		if (err == -EAGAIN)	// CPU mappings changed
> > > + *			goto retry
> > > + *
> > > + *	unlock:
> > > + *		driver_svm_unlock();
> > > + *		return err;
> > > + *	}
> > > + *
> > > + * 2) Garbage Collector.
> > > + *
> > > + *	void __driver_garbage_collector(struct drm_gpusvm
> > > *gpusvm,
> > > + *					struct drm_gpusvm_range
> > > *range)
> > > + *	{
> > > + *		struct drm_gpusvm_ctx ctx = {};
> > > + *
> > > + *		assert_driver_svm_locked(gpusvm);
> > > + *
> > > + *		// Partial unmap, migrate any remaining VRAM
> > > pages
> > > back to SRAM
> > > + *		if (range->flags.partial_unmap)
> > > + *			drm_gpusvm_migrate_to_sram(gpusvm,
> > > range,
> > > &ctx);
> > > + *
> > > + *		driver_unbind_range(range);
> > > + *		drm_gpusvm_range_remove(gpusvm, range);
> > > + *	}
> > > + *
> > > + *	void driver_garbage_collector(struct drm_gpusvm *gpusvm)
> > > + *	{
> > > + *		assert_driver_svm_locked(gpusvm);
> > > + *
> > > + *		for_each_range_in_garbage_collector(gpusvm, range)
> > > + *			__driver_garbage_collector(gpusvm, range);
> > > + *	}
> > > + *
> > > + * 3) Invalidation driver vfunc.
> > > + *
> > > + *	void driver_invalidation(struct drm_gpusvm *gpusvm,
> > > + *				 struct drm_gpusvm_notifier *notifier,
> > > + *				 const struct mmu_notifier_range *mmu_range)
> > > + *	{
> > > + *		struct drm_gpusvm_ctx ctx = { .in_notifier = true, };
> > > + *		struct drm_gpusvm_range *range = NULL;
> > > + *
> > > + *		driver_invalidate_device_tlb(gpusvm, mmu_range->start, mmu_range->end);
> > > + *
> > > + *		drm_gpusvm_for_each_range(range, notifier, mmu_range->start,
> > > + *					  mmu_range->end) {
> > > + *			drm_gpusvm_range_unmap_pages(gpusvm, range, &ctx);
> > > + *
> > > + *			if (mmu_range->event != MMU_NOTIFY_UNMAP)
> > > + *				continue;
> > > + *
> > > + *			drm_gpusvm_range_set_unmapped(range, mmu_range);
> > > + *			driver_garbage_collector_add(gpusvm, range);
> > > + *		}
> > > + *	}
> > > + */
> > > +
> > > +#define DRM_GPUSVM_RANGE_START(_range)	((_range)->va.start)
> > > +#define DRM_GPUSVM_RANGE_END(_range)	((_range)->va.end - 1)
> > > +INTERVAL_TREE_DEFINE(struct drm_gpusvm_range, rb.node, u64,
> > > rb.__subtree_last,
> > > +		     DRM_GPUSVM_RANGE_START,
> > > DRM_GPUSVM_RANGE_END,
> > > +		     static __maybe_unused, range);
> > > +
> > > +#define DRM_GPUSVM_NOTIFIER_START(_notifier)	((_notifier)->interval.start)
> > > +#define DRM_GPUSVM_NOTIFIER_END(_notifier)	((_notifier)->interval.end - 1)
> > > +INTERVAL_TREE_DEFINE(struct drm_gpusvm_notifier, rb.node, u64,
> > > +		     rb.__subtree_last,
> > > DRM_GPUSVM_NOTIFIER_START,
> > > +		     DRM_GPUSVM_NOTIFIER_END, static
> > > __maybe_unused,
> > > notifier);
> > > +
> > > +/**
> > > + * npages_in_range() - Calculate the number of pages in a given
> > > range
> > > + * @start__: The start address of the range
> > > + * @end__: The end address of the range
> > > + *
> > > + * This macro calculates the number of pages in a given memory
> > > range,
> > > + * specified by the start and end addresses. It divides the
> > > difference
> > > + * between the end and start addresses by the page size
> > > (PAGE_SIZE)
> > > to
> > > + * determine the number of pages in the range.
> > > + *
> > > + * Return: The number of pages in the specified range.
> > > + */
> > > +#define npages_in_range(start__, end__)	\
> > > +	(((end__) - (start__)) >> PAGE_SHIFT)
> > > +
> > > +/**
> > > + * struct drm_gpusvm_zdd - GPU SVM zone device data
> > > + *
> > > + * @refcount: Reference count for the zdd
> > > + * @destroy_work: Work structure for asynchronous zdd
> > > destruction
> > > + * @range: Pointer to the GPU SVM range
> > > + * @vram_allocation: Driver-private pointer to the VRAM
> > > allocation
> > > + *
> > > + * This structure serves as a generic wrapper installed in
> > > + * page->zone_device_data. It provides infrastructure for
> > > looking up
> > > a range
> > > + * upon CPU page fault and asynchronously releasing VRAM once
> > > the
> > > CPU has no
> > > + * page references. Asynchronous release is useful because CPU
> > > page
> > > references
> > > + * can be dropped in IRQ contexts, while releasing VRAM likely
> > > requires sleeping
> > > + * locks.
> > > + */
> > > +struct drm_gpusvm_zdd {
> > > +	struct kref refcount;
> > > +	struct work_struct destroy_work;
> > > +	struct drm_gpusvm_range *range;
> > > +	void *vram_allocation;
> > > +};
> > > +
> > > +/**
> > > + * drm_gpusvm_zdd_destroy_work_func - Work function for
> > > destroying a
> > > zdd
> > > + * @w: Pointer to the work_struct
> > > + *
> > > + * This function releases VRAM, puts GPU SVM range, and frees
> > > zdd.
> > > + */
> > > +static void drm_gpusvm_zdd_destroy_work_func(struct work_struct
> > > *w)
> > > +{
> > > +	struct drm_gpusvm_zdd *zdd =
> > > +		container_of(w, struct drm_gpusvm_zdd,
> > > destroy_work);
> > > +	struct drm_gpusvm_range *range = zdd->range;
> > > +	struct drm_gpusvm *gpusvm = range->gpusvm;
> > > +
> > > +	if (gpusvm->ops->vram_release && zdd->vram_allocation)
> > > +		gpusvm->ops->vram_release(zdd->vram_allocation);
> > > +	drm_gpusvm_range_put(range);
> > > +	kfree(zdd);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_zdd_alloc - Allocate a zdd structure.
> > > + * @range: Pointer to the GPU SVM range.
> > > + *
> > > + * This function allocates and initializes a new zdd structure.
> > > It
> > > sets up the
> > > + * reference count, initializes the destroy work, and links the
> > > provided GPU SVM
> > > + * range.
> > > + *
> > > + * Returns:
> > > + * Pointer to the allocated zdd on success, ERR_PTR() on
> > > failure.
> > > + */
> > > +static struct drm_gpusvm_zdd *
> > > +drm_gpusvm_zdd_alloc(struct drm_gpusvm_range *range)
> > > +{
> > > +	struct drm_gpusvm_zdd *zdd;
> > > +
> > > +	zdd = kmalloc(sizeof(*zdd), GFP_KERNEL);
> > > +	if (!zdd)
> > > +		return NULL;
> > > +
> > > +	kref_init(&zdd->refcount);
> > > +	INIT_WORK(&zdd->destroy_work,
> > > drm_gpusvm_zdd_destroy_work_func);
> > > +	zdd->range = drm_gpusvm_range_get(range);
> > > +	zdd->vram_allocation = NULL;
> > > +
> > > +	return zdd;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_zdd_get - Get a reference to a zdd structure.
> > > + * @zdd: Pointer to the zdd structure.
> > > + *
> > > + * This function increments the reference count of the provided
> > > zdd
> > > structure.
> > > + *
> > > + * Returns: Pointer to the zdd structure.
> > > + */
> > > +static struct drm_gpusvm_zdd *drm_gpusvm_zdd_get(struct
> > > drm_gpusvm_zdd *zdd)
> > > +{
> > > +	kref_get(&zdd->refcount);
> > > +	return zdd;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_zdd_destroy - Destroy a zdd structure.
> > > + * @ref: Pointer to the reference count structure.
> > > + *
> > > + * This function queues the destroy_work of the zdd for
> > > asynchronous
> > > destruction.
> > > + */
> > > +static void drm_gpusvm_zdd_destroy(struct kref *ref)
> > > +{
> > > +	struct drm_gpusvm_zdd *zdd =
> > > +		container_of(ref, struct drm_gpusvm_zdd,
> > > refcount);
> > > +	struct drm_gpusvm *gpusvm = zdd->range->gpusvm;
> > > +
> > > +	queue_work(gpusvm->zdd_wq, &zdd->destroy_work);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_zdd_put - Put a zdd reference.
> > > + * @zdd: Pointer to the zdd structure.
> > > + *
> > > + * This function decrements the reference count of the provided
> > > zdd
> > > structure
> > > + * and schedules its destruction if the count drops to zero.
> > > + */
> > > +static void drm_gpusvm_zdd_put(struct drm_gpusvm_zdd *zdd)
> > > +{
> > > +	kref_put(&zdd->refcount, drm_gpusvm_zdd_destroy);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_find - Find GPU SVM range from GPU SVM
> > > notifier
> > > + * @notifier: Pointer to the GPU SVM notifier structure.
> > > + * @start: Start address of the range
> > > + * @end: End address of the range
> > > + *
> > > + * Return: A pointer to the drm_gpusvm_range if found or NULL
> > > + */
> > > +struct drm_gpusvm_range *
> > > +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64
> > > start, u64 end)
> > > +{
> > > +	return range_iter_first(&notifier->root, start, end -
> > > 1);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_for_each_range_safe - Safely iterate over GPU SVM
> > > ranges in a notifier
> > > + * @range__: Iterator variable for the ranges
> > > + * @next__: Iterator variable for the ranges temporary storage
> > > + * @notifier__: Pointer to the GPU SVM notifier
> > > + * @start__: Start address of the range
> > > + * @end__: End address of the range
> > > + *
> > > + * This macro is used to iterate over GPU SVM ranges in a
> > > notifier
> > > while
> > > + * removing ranges from it.
> > > + */
> > > +#define drm_gpusvm_for_each_range_safe(range__, next__, notifier__, start__, end__)	\
> > > +	for ((range__) = drm_gpusvm_range_find((notifier__), (start__), (end__)),	\
> > > +	     (next__) = __drm_gpusvm_range_next(range__);				\
> > > +	     (range__) && (range__->va.start < (end__));				\
> > > +	     (range__) = (next__), (next__) = __drm_gpusvm_range_next(range__))
> > > +
> > > +/**
> > > + * __drm_gpusvm_notifier_next - get the next drm_gpusvm_notifier
> > > in
> > > the list
> > > + * @notifier: a pointer to the current drm_gpusvm_notifier
> > > + *
> > > + * Return: A pointer to the next drm_gpusvm_notifier if
> > > available,
> > > or NULL if
> > > + *         the current notifier is the last one or if the input
> > > notifier is
> > > + *         NULL.
> > > + */
> > > +static struct drm_gpusvm_notifier *
> > > +__drm_gpusvm_notifier_next(struct drm_gpusvm_notifier *notifier)
> > > +{
> > > +	if (notifier && !list_is_last(&notifier->rb.entry,
> > > +				      &notifier->gpusvm->notifier_list))
> > > +		return list_next_entry(notifier, rb.entry);
> > > +
> > > +	return NULL;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_for_each_notifier - Iterate over GPU SVM notifiers
> > > in
> > > a gpusvm
> > > + * @notifier__: Iterator variable for the notifiers
> > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > + * @start__: Start address of the notifier
> > > + * @end__: End address of the notifier
> > > + *
> > > + * This macro is used to iterate over GPU SVM notifiers in a
> > > gpusvm.
> > > + */
> > > +#define drm_gpusvm_for_each_notifier(notifier__, gpusvm__, start__, end__)		\
> > > +	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root, (start__), (end__) - 1);	\
> > > +	     (notifier__) && (notifier__->interval.start < (end__));			\
> > > +	     (notifier__) = __drm_gpusvm_notifier_next(notifier__))
> > > +
> > > +/**
> > > + * drm_gpusvm_for_each_notifier_safe - Safely iterate over GPU
> > > SVM
> > > notifiers in a gpusvm
> > > + * @notifier__: Iterator variable for the notifiers
> > > + * @next__: Iterator variable for the notifiers temporary storage
> > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > + * @start__: Start address of the notifier
> > > + * @end__: End address of the notifier
> > > + *
> > > + * This macro is used to iterate over GPU SVM notifiers in a
> > > gpusvm
> > > while
> > > + * removing notifiers from it.
> > > + */
> > > +#define drm_gpusvm_for_each_notifier_safe(notifier__, next__, gpusvm__, start__, end__)	\
> > > +	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root, (start__), (end__) - 1),	\
> > > +	     (next__) = __drm_gpusvm_notifier_next(notifier__);				\
> > > +	     (notifier__) && (notifier__->interval.start < (end__));			\
> > > +	     (notifier__) = (next__), (next__) = __drm_gpusvm_notifier_next(notifier__))
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_invalidate - Invalidate a GPU SVM
> > > notifier.
> > > + * @mni: Pointer to the mmu_interval_notifier structure.
> > > + * @mmu_range: Pointer to the mmu_notifier_range structure.
> > > + * @cur_seq: Current sequence number.
> > > + *
> > > + * This function serves as a generic MMU notifier for GPU SVM.
> > > It
> > > sets the MMU
> > > + * notifier sequence number and calls the driver invalidate
> > > vfunc
> > > under
> > > + * gpusvm->notifier_lock.
> > > + *
> > > + * Returns:
> > > + * true if the operation succeeds, false otherwise.
> > > + */
> > > +static bool
> > > +drm_gpusvm_notifier_invalidate(struct mmu_interval_notifier
> > > *mni,
> > > +			       const struct mmu_notifier_range
> > > *mmu_range,
> > > +			       unsigned long cur_seq)
> > > +{
> > > +	struct drm_gpusvm_notifier *notifier =
> > > +		container_of(mni, typeof(*notifier), notifier);
> > > +	struct drm_gpusvm *gpusvm = notifier->gpusvm;
> > > +
> > > +	if (!mmu_notifier_range_blockable(mmu_range))
> > > +		return false;
> > > +
> > > +	down_write(&gpusvm->notifier_lock);
> > > +	mmu_interval_set_seq(mni, cur_seq);
> > > +	gpusvm->ops->invalidate(gpusvm, notifier, mmu_range);
> > > +	up_write(&gpusvm->notifier_lock);
> > > +
> > > +	return true;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_ops - MMU interval notifier operations
> > > for
> > > GPU SVM
> > > + */
> > > +static const struct mmu_interval_notifier_ops
> > > drm_gpusvm_notifier_ops = {
> > > +	.invalidate = drm_gpusvm_notifier_invalidate,
> > > +};
> > > +
> > > +/**
> > > + * drm_gpusvm_init - Initialize the GPU SVM.
> > > + * @gpusvm: Pointer to the GPU SVM structure.
> > > + * @name: Name of the GPU SVM.
> > > + * @drm: Pointer to the DRM device structure.
> > > + * @mm: Pointer to the mm_struct for the address space.
> > > + * @device_private_page_owner: Device private pages owner.
> > > + * @mm_start: Start address of GPU SVM.
> > > + * @mm_range: Range of the GPU SVM.
> > > + * @notifier_size: Size of individual notifiers.
> > > + * @ops: Pointer to the operations structure for GPU SVM.
> > > + * @chunk_sizes: Pointer to the array of chunk sizes used in
> > > range
> > > allocation.
> > > + *               Entries should be powers of 2 in descending
> > > order
> > > with last
> > > + *               entry being SZ_4K.
> > > + * @num_chunks: Number of chunks.
> > > + *
> > > + * This function initializes the GPU SVM.
> > > + *
> > > + * Returns:
> > > + * 0 on success, a negative error code on failure.
> > > + */
> > > +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> > > +		    const char *name, struct drm_device *drm,
> > > +		    struct mm_struct *mm, void
> > > *device_private_page_owner,
> > > +		    u64 mm_start, u64 mm_range, u64
> > > notifier_size,
> > > +		    const struct drm_gpusvm_ops *ops,
> > > +		    const u64 *chunk_sizes, int num_chunks)
> > > +{
> > > +	if (!ops->invalidate || !num_chunks)
> > > +		return -EINVAL;
> > > +
> > > +	gpusvm->name = name;
> > > +	gpusvm->drm = drm;
> > > +	gpusvm->mm = mm;
> > > +	gpusvm->device_private_page_owner =
> > > device_private_page_owner;
> > > +	gpusvm->mm_start = mm_start;
> > > +	gpusvm->mm_range = mm_range;
> > > +	gpusvm->notifier_size = notifier_size;
> > > +	gpusvm->ops = ops;
> > > +	gpusvm->chunk_sizes = chunk_sizes;
> > > +	gpusvm->num_chunks = num_chunks;
> > > +	gpusvm->zdd_wq = system_wq;
> > > +
> > > +	mmgrab(mm);
> > > +	gpusvm->root = RB_ROOT_CACHED;
> > > +	INIT_LIST_HEAD(&gpusvm->notifier_list);
> > > +
> > > +	init_rwsem(&gpusvm->notifier_lock);
> > > +
> > > +	fs_reclaim_acquire(GFP_KERNEL);
> > > +	might_lock(&gpusvm->notifier_lock);
> > > +	fs_reclaim_release(GFP_KERNEL);
> > > +
> > > +	return 0;
> > > +}
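
Side note for driver writers following along: a minimal sketch of the
expected init/fini pairing. Everything prefixed my_ below is hypothetical
(not part of this patch) and the notifier/chunk sizes are illustrative
only; the hard requirements from drm_gpusvm_init() are a non-NULL
ops->invalidate and a descending, power-of-two chunk_sizes array ending
in SZ_4K.

	static const u64 my_chunk_sizes[] = { SZ_2M, SZ_64K, SZ_4K };

	int my_svm_init(struct my_vm *vm)
	{
		/* Mirror the CPU address space; one notifier spans 512M of VA */
		return drm_gpusvm_init(&vm->svm, "my-svm", vm->drm,
				       current->mm,
				       NULL, /* no device-private page owner */
				       0, 1ull << 47, SZ_512M,
				       &my_svm_ops, my_chunk_sizes,
				       ARRAY_SIZE(my_chunk_sizes));
	}

	void my_svm_fini(struct my_vm *vm)
	{
		/* Tears down remaining ranges/notifiers and drops the mm ref */
		drm_gpusvm_fini(&vm->svm);
	}
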
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_find - Find GPU SVM notifier
> > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > + * @fault_addr__: Fault address
> > > + *
> > > + * This macro finds the GPU SVM notifier associated with the
> > > fault
> > > address.
> > > + *
> > > + * Returns:
> > > + * Pointer to the GPU SVM notifier on success, NULL otherwise.
> > > + */
> > > +#define drm_gpusvm_notifier_find(gpusvm__, fault_addr__)	\
> > > +	notifier_iter_first(&(gpusvm__)->root, (fault_addr__),	\
> > > +			    (fault_addr__ + 1))
> > > +
> > > +/**
> > > + * to_drm_gpusvm_notifier - retrieve the container struct for a
> > > given rbtree node
> > > + * @node__: a pointer to the rbtree node embedded within a
> > > drm_gpusvm_notifier struct
> > > + *
> > > + * Return: A pointer to the containing drm_gpusvm_notifier
> > > structure.
> > > + */
> > > +#define to_drm_gpusvm_notifier(__node)				\
> > > +	container_of((__node), struct drm_gpusvm_notifier, rb.node)
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_insert - Insert GPU SVM notifier
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > + *
> > > + * This function inserts the GPU SVM notifier into the GPU SVM
> > > RB
> > > tree and list.
> > > + */
> > > +static void drm_gpusvm_notifier_insert(struct drm_gpusvm
> > > *gpusvm,
> > > +				       struct
> > > drm_gpusvm_notifier
> > > *notifier)
> > > +{
> > > +	struct rb_node *node;
> > > +	struct list_head *head;
> > > +
> > > +	notifier_insert(notifier, &gpusvm->root);
> > > +
> > > +	node = rb_prev(&notifier->rb.node);
> > > +	if (node)
> > > +		head = &(to_drm_gpusvm_notifier(node))->rb.entry;
> > > +	else
> > > +		head = &gpusvm->notifier_list;
> > > +
> > > +	list_add(&notifier->rb.entry, head);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_remove - Remove GPU SVM notifier
> > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > + * @notifier__: Pointer to the GPU SVM notifier structure
> > > + *
> > > + * This macro removes the GPU SVM notifier from the GPU SVM RB
> > > tree
> > > and list.
> > > + */
> > > +#define drm_gpusvm_notifier_remove(gpusvm__, notifier__)	\
> > > +	notifier_remove((notifier__), &(gpusvm__)->root);	\
> > > +	list_del(&(notifier__)->rb.entry)
> > > +
> > > +/**
> > > + * drm_gpusvm_fini - Finalize the GPU SVM.
> > > + * @gpusvm: Pointer to the GPU SVM structure.
> > > + *
> > > + * This function finalizes the GPU SVM by cleaning up any
> > > remaining
> > > ranges and
> > > + * notifiers, and dropping a reference to struct MM.
> > > + */
> > > +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm)
> > > +{
> > > +	struct drm_gpusvm_notifier *notifier, *next;
> > > +
> > > +	drm_gpusvm_for_each_notifier_safe(notifier, next,
> > > gpusvm, 0,
> > > LONG_MAX) {
> > > +		struct drm_gpusvm_range *range, *__next;
> > > +
> > > +		/*
> > > +		 * Remove notifier first to avoid racing with
> > > any
> > > invalidation
> > > +		 */
> > > +		mmu_interval_notifier_remove(&notifier->notifier);
> > > +		notifier->flags.removed = true;
> > > +
> > > +		drm_gpusvm_for_each_range_safe(range, __next,
> > > notifier, 0,
> > > +					       LONG_MAX)
> > > +			drm_gpusvm_range_remove(gpusvm, range);
> > > +	}
> > > +
> > > +	mmdrop(gpusvm->mm);
> > > +	WARN_ON(!RB_EMPTY_ROOT(&gpusvm->root.rb_root));
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_alloc - Allocate GPU SVM notifier
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @fault_addr: Fault address
> > > + *
> > > + * This function allocates and initializes the GPU SVM notifier
> > > structure.
> > > + *
> > > + * Returns:
> > > + * Pointer to the allocated GPU SVM notifier on success,
> > > ERR_PTR()
> > > on failure.
> > > + */
> > > +static struct drm_gpusvm_notifier *
> > > +drm_gpusvm_notifier_alloc(struct drm_gpusvm *gpusvm, u64
> > > fault_addr)
> > > +{
> > > +	struct drm_gpusvm_notifier *notifier;
> > > +
> > > +	if (gpusvm->ops->notifier_alloc)
> > > +		notifier = gpusvm->ops->notifier_alloc();
> > > +	else
> > > +		notifier = kzalloc(sizeof(*notifier),
> > > GFP_KERNEL);
> > > +
> > > +	if (!notifier)
> > > +		return ERR_PTR(-ENOMEM);
> > > +
> > > +	notifier->gpusvm = gpusvm;
> > > +	notifier->interval.start = ALIGN_DOWN(fault_addr, gpusvm->notifier_size);
> > > +	notifier->interval.end = ALIGN(fault_addr + 1, gpusvm->notifier_size);
> > > +	INIT_LIST_HEAD(&notifier->rb.entry);
> > > +	notifier->root = RB_ROOT_CACHED;
> > > +	INIT_LIST_HEAD(&notifier->range_list);
> > > +
> > > +	return notifier;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_free - Free GPU SVM notifier
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > + *
> > > + * This function frees the GPU SVM notifier structure.
> > > + */
> > > +static void drm_gpusvm_notifier_free(struct drm_gpusvm *gpusvm,
> > > +				     struct drm_gpusvm_notifier
> > > *notifier)
> > > +{
> > > +	WARN_ON(!RB_EMPTY_ROOT(&notifier->root.rb_root));
> > > +
> > > +	if (gpusvm->ops->notifier_free)
> > > +		gpusvm->ops->notifier_free(notifier);
> > > +	else
> > > +		kfree(notifier);
> > > +}
> > > +
> > > +/**
> > > + * to_drm_gpusvm_range - retrieve the container struct for a
> > > given
> > > rbtree node
> > > + * @node__: a pointer to the rbtree node embedded within a
> > > drm_gpusvm_range struct
> > > + *
> > > + * Return: A pointer to the containing drm_gpusvm_range
> > > structure.
> > > + */
> > > +#define to_drm_gpusvm_range(node__)	\
> > > +	container_of((node__), struct drm_gpusvm_range, rb.node)
> > > +
> > > +/**
> > > + * drm_gpusvm_range_insert - Insert GPU SVM range
> > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + *
> > > + * This function inserts the GPU SVM range into the notifier RB
> > > tree
> > > and list.
> > > + */
> > > +static void drm_gpusvm_range_insert(struct drm_gpusvm_notifier
> > > *notifier,
> > > +				    struct drm_gpusvm_range
> > > *range)
> > > +{
> > > +	struct rb_node *node;
> > > +	struct list_head *head;
> > > +
> > > +	drm_gpusvm_notifier_lock(notifier->gpusvm);
> > > +	range_insert(range, &notifier->root);
> > > +
> > > +	node = rb_prev(&range->rb.node);
> > > +	if (node)
> > > +		head = &(to_drm_gpusvm_range(node))->rb.entry;
> > > +	else
> > > +		head = &notifier->range_list;
> > > +
> > > +	list_add(&range->rb.entry, head);
> > > +	drm_gpusvm_notifier_unlock(notifier->gpusvm);
> > > +}
> > > +
> > > +/**
> > > + * __drm_gpusvm_range_remove - Remove GPU SVM range
> > > + * @notifier__: Pointer to the GPU SVM notifier structure
> > > + * @range__: Pointer to the GPU SVM range structure
> > > + *
> > > + * This macro removes the GPU SVM range from the notifier RB
> > > tree
> > > and list.
> > > + */
> > > +#define __drm_gpusvm_range_remove(notifier__, range__)		\
> > > +	range_remove((range__), &(notifier__)->root);		\
> > > +	list_del(&(range__)->rb.entry)
> > > +
> > > +/**
> > > + * drm_gpusvm_range_alloc - Allocate GPU SVM range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > + * @fault_addr: Fault address
> > > + * @chunk_size: Chunk size
> > > + * @migrate_vram: Flag indicating whether to migrate VRAM
> > > + *
> > > + * This function allocates and initializes the GPU SVM range
> > > structure.
> > > + *
> > > + * Returns:
> > > + * Pointer to the allocated GPU SVM range on success, ERR_PTR()
> > > on
> > > failure.
> > > + */
> > > +static struct drm_gpusvm_range *
> > > +drm_gpusvm_range_alloc(struct drm_gpusvm *gpusvm,
> > > +		       struct drm_gpusvm_notifier *notifier,
> > > +		       u64 fault_addr, u64 chunk_size, bool
> > > migrate_vram)
> > > +{
> > > +	struct drm_gpusvm_range *range;
> > > +
> > > +	if (gpusvm->ops->range_alloc)
> > > +		range = gpusvm->ops->range_alloc(gpusvm);
> > > +	else
> > > +		range = kzalloc(sizeof(*range), GFP_KERNEL);
> > > +
> > > +	if (!range)
> > > +		return ERR_PTR(-ENOMEM);
> > > +
> > > +	kref_init(&range->refcount);
> > > +	range->gpusvm = gpusvm;
> > > +	range->notifier = notifier;
> > > +	range->va.start = ALIGN_DOWN(fault_addr, chunk_size);
> > > +	range->va.end = ALIGN(fault_addr + 1, chunk_size);
> > > +	INIT_LIST_HEAD(&range->rb.entry);
> > > +	range->notifier_seq = LONG_MAX;
> > > +	range->flags.migrate_vram = migrate_vram ? 1 : 0;
> > > +
> > > +	return range;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_check_pages - Check pages
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > + * @start: Start address
> > > + * @end: End address
> > > + *
> > > + * Check if pages between start and end have been faulted in on the
> > > + * CPU. Used to prevent migration of pages without CPU backing store.
> > > + *
> > > + * Returns:
> > > + * True if pages have been faulted into CPU, False otherwise
> > > + */
> > > +static bool drm_gpusvm_check_pages(struct drm_gpusvm *gpusvm,
> > > +				   struct drm_gpusvm_notifier
> > > *notifier,
> > > +				   u64 start, u64 end)
> > > +{
> > > +	struct hmm_range hmm_range = {
> > > +		.default_flags = 0,
> > > +		.notifier = &notifier->notifier,
> > > +		.start = start,
> > > +		.end = end,
> > > +		.dev_private_owner = gpusvm->device_private_page_owner,
> > > +	};
> > > +	unsigned long timeout =
> > > +		jiffies +
> > > msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> > > +	unsigned long *pfns;
> > > +	unsigned long npages = npages_in_range(start, end);
> > > +	int err, i;
> > > +
> > > +	mmap_assert_locked(gpusvm->mm);
> > > +
> > > +	pfns = kvmalloc_array(npages, sizeof(*pfns),
> > > GFP_KERNEL);
> > > +	if (!pfns)
> > > +		return false;
> > > +
> > > +	hmm_range.notifier_seq = mmu_interval_read_begin(&notifier->notifier);
> > > +	hmm_range.hmm_pfns = pfns;
> > > +
> > > +	while (true) {
> > > +		err = hmm_range_fault(&hmm_range);
> > > +		if (err == -EBUSY) {
> > > +			if (time_after(jiffies, timeout))
> > > +				break;
> > > +
> > > +			hmm_range.notifier_seq =
> > > mmu_interval_read_begin(&notifier->notifier);
> > > +			continue;
> > > +		}
> > > +		break;
> > > +	}
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	for (i = 0; i < npages; ++i) {
> > > +		if (!(pfns[i] & HMM_PFN_VALID)) {
> > > +			err = -EFAULT;
> > > +			goto err_free;
> > > +		}
> > > +	}
> > > +
> > > +err_free:
> > > +	kvfree(pfns);
> > > +	return err ? false : true;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_chunk_size - Determine chunk size for GPU
> > > SVM
> > > range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > + * @vas: Pointer to the virtual memory area structure
> > > + * @fault_addr: Fault address
> > > + * @gpuva_start: Start address of GPUVA which mirrors CPU
> > > + * @gpuva_end: End address of GPUVA which mirrors CPU
> > > + * @check_pages: Flag indicating whether to check pages
> > > + *
> > > + * This function determines the chunk size for the GPU SVM range
> > > based on the
> > > + * fault address, GPU SVM chunk sizes, existing GPU SVM ranges,
> > > and
> > > the virtual
> > > + * memory area boundaries.
> > > + *
> > > + * Returns:
> > > + * Chunk size on success, LONG_MAX on failure.
> > > + */
> > > +static u64 drm_gpusvm_range_chunk_size(struct drm_gpusvm *gpusvm,
> > > +				       struct drm_gpusvm_notifier *notifier,
> > > +				       struct vm_area_struct *vas,
> > > +				       u64 fault_addr, u64 gpuva_start,
> > > +				       u64 gpuva_end, bool check_pages)
> > > +{
> > > +	u64 start, end;
> > > +	int i = 0;
> > > +
> > > +retry:
> > > +	for (; i < gpusvm->num_chunks; ++i) {
> > > +		start = ALIGN_DOWN(fault_addr, gpusvm->chunk_sizes[i]);
> > > +		end = ALIGN(fault_addr + 1, gpusvm->chunk_sizes[i]);
> > > +
> > > +		if (start >= vas->vm_start && end <= vas->vm_end
> > > &&
> > > +		    start >= notifier->interval.start &&
> > > +		    end <= notifier->interval.end &&
> > > +		    start >= gpuva_start && end <= gpuva_end)
> > > +			break;
> > > +	}
> > > +
> > > +	if (i == gpusvm->num_chunks)
> > > +		return LONG_MAX;
> > > +
> > > +	/*
> > > +	 * If allocating more than a page, ensure the range does not overlap
> > > +	 * with existing ranges.
> > > +	 */
> > > +	if (end - start != SZ_4K) {
> > > +		struct drm_gpusvm_range *range;
> > > +
> > > +		range = drm_gpusvm_range_find(notifier, start,
> > > end);
> > > +		if (range) {
> > > +			++i;
> > > +			goto retry;
> > > +		}
> > > +
> > > +		/*
> > > +		 * XXX: Only create range on pages CPU has faulted
> > > +		 * in. Without this check, or prefault, on BMG
> > > +		 * 'xe_exec_system_allocator --r process-many-malloc'
> > > +		 * fails. In the failure case, each process mallocs
> > > +		 * 16k but the CPU VMA is ~128k which results in 64k
> > > +		 * SVM ranges. When migrating the SVM ranges, some
> > > +		 * processes fail in drm_gpusvm_migrate_to_vram with
> > > +		 * 'migrate.cpages != npages' and then upon
> > > +		 * drm_gpusvm_range_get_pages device pages from other
> > > +		 * processes are collected + faulted in which creates
> > > +		 * all sorts of problems. Unsure exactly how this is
> > > +		 * happening; the problem also goes away if
> > > +		 * 'xe_exec_system_allocator --r process-many-malloc'
> > > +		 * mallocs at least 64k at a time.
> > > +		 */
> > > +		if (check_pages &&
> > > +		    !drm_gpusvm_check_pages(gpusvm, notifier,
> > > start,
> > > end)) {
> > > +			++i;
> > > +			goto retry;
> > > +		}
> > > +	}
> > > +
> > > +	return end - start;
> > > +}
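
To make the selection above concrete (numbers are illustrative only):
with chunk_sizes = { SZ_2M, SZ_64K, SZ_4K } and a fault at 0x201000
inside an anonymous VMA spanning [0x200000, 0x210000), the 2M candidate
[0x200000, 0x400000) overruns the VMA, so the 64K candidate
[0x200000, 0x210000) is picked, assuming the notifier and GPUVA bounds
also cover it; only if that candidate overlapped an existing range or
failed the check_pages test would it fall back to a single 4K page.
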
> > > +
> > > +/**
> > > + * drm_gpusvm_range_find_or_insert - Find or insert GPU SVM
> > > range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @fault_addr: Fault address
> > > + * @gpuva_start: Start address of GPUVA which mirrors CPU
> > > + * @gpuva_end: End address of GPUVA which mirrors CPU
> > > + * @ctx: GPU SVM context
> > > + *
> > > + * This function finds or inserts a newly allocated GPU SVM range based on
> > > + * the fault address. Caller must hold a lock to protect range lookup and
> > > + * insertion.
> > > + *
> > > + * Returns:
> > > + * Pointer to the GPU SVM range on success, ERR_PTR() on
> > > failure.
> > > + */
> > > +struct drm_gpusvm_range *
> > > +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64
> > > fault_addr,
> > > +				u64 gpuva_start, u64 gpuva_end,
> > > +				const struct drm_gpusvm_ctx
> > > *ctx)
> > > +{
> > > +	struct drm_gpusvm_notifier *notifier;
> > > +	struct drm_gpusvm_range *range;
> > > +	struct mm_struct *mm = gpusvm->mm;
> > > +	struct vm_area_struct *vas;
> > > +	bool notifier_alloc = false;
> > > +	u64 chunk_size;
> > > +	int err;
> > > +	bool migrate_vram;
> > > +
> > > +	if (fault_addr < gpusvm->mm_start ||
> > > +	    fault_addr > gpusvm->mm_start + gpusvm->mm_range) {
> > > +		err = -EINVAL;
> > > +		goto err_out;
> > > +	}
> > > +
> > > +	if (!ctx->mmap_locked) {
> > > +		if (!mmget_not_zero(mm)) {
> > > +			err = -EFAULT;
> > > +			goto err_out;
> > > +		}
> > > +		mmap_write_lock(mm);
> > > +	}
> > > +
> > > +	mmap_assert_write_locked(mm);
> > > +
> > > +	notifier = drm_gpusvm_notifier_find(gpusvm, fault_addr);
> > > +	if (!notifier) {
> > > +		notifier = drm_gpusvm_notifier_alloc(gpusvm,
> > > fault_addr);
> > > +		if (IS_ERR(notifier)) {
> > > +			err = PTR_ERR(notifier);
> > > +			goto err_mmunlock;
> > > +		}
> > > +		notifier_alloc = true;
> > > +		err = mmu_interval_notifier_insert_locked(&notifier->notifier,
> > > +							  mm, notifier->interval.start,
> > > +							  notifier->interval.end -
> > > +							  notifier->interval.start,
> > > +							  &drm_gpusvm_notifier_ops);
> > > +		if (err)
> > > +			goto err_notifier;
> > > +	}
> > > +
> > > +	vas = vma_lookup(mm, fault_addr);
> > > +	if (!vas) {
> > > +		err = -ENOENT;
> > > +		goto err_notifier_remove;
> > > +	}
> > > +
> > > +	if (!ctx->read_only && !(vas->vm_flags & VM_WRITE)) {
> > > +		err = -EPERM;
> > > +		goto err_notifier_remove;
> > > +	}
> > > +
> > > +	range = drm_gpusvm_range_find(notifier, fault_addr,
> > > fault_addr + 1);
> > > +	if (range)
> > > +		goto out_mmunlock;
> > > +	/*
> > > +	 * XXX: Short-circuiting migration based on
> > > migrate_vma_*
> > > current
> > > +	 * limitations. If/when migrate_vma_* add more support,
> > > this
> > > logic will
> > > +	 * have to change.
> > > +	 */
> > > +	migrate_vram = ctx->vram_possible &&
> > > +		vma_is_anonymous(vas) &&
> > > !is_vm_hugetlb_page(vas);
> > > +
> > > +	chunk_size = drm_gpusvm_range_chunk_size(gpusvm, notifier, vas,
> > > +						 fault_addr, gpuva_start,
> > > +						 gpuva_end, migrate_vram &&
> > > +						 !ctx->prefault);
> > > +	if (chunk_size == LONG_MAX) {
> > > +		err = -EINVAL;
> > > +		goto err_notifier_remove;
> > > +	}
> > > +
> > > +	range = drm_gpusvm_range_alloc(gpusvm, notifier,
> > > fault_addr,
> > > chunk_size,
> > > +				       migrate_vram);
> > > +	if (IS_ERR(range)) {
> > > +		err = PTR_ERR(range);
> > > +		goto err_notifier_remove;
> > > +	}
> > > +
> > > +	drm_gpusvm_range_insert(notifier, range);
> > > +	if (notifier_alloc)
> > > +		drm_gpusvm_notifier_insert(gpusvm, notifier);
> > > +
> > > +	if (ctx->prefault) {
> > > +		struct drm_gpusvm_ctx __ctx = *ctx;
> > > +
> > > +		__ctx.mmap_locked = true;
> > > +		err = drm_gpusvm_range_get_pages(gpusvm, range,
> > > &__ctx);
> > > +		if (err)
> > > +			goto err_range_remove;
> > > +	}
> > > +
> > > +out_mmunlock:
> > > +	if (!ctx->mmap_locked) {
> > > +		mmap_write_unlock(mm);
> > > +		mmput(mm);
> > > +	}
> > > +
> > > +	return range;
> > > +
> > > +err_range_remove:
> > > +	__drm_gpusvm_range_remove(notifier, range);
> > > +err_notifier_remove:
> > > +	if (notifier_alloc)
> > > +		mmu_interval_notifier_remove(&notifier->notifier);
> > > +err_notifier:
> > > +	if (notifier_alloc)
> > > +		drm_gpusvm_notifier_free(gpusvm, notifier);
> > > +err_mmunlock:
> > > +	if (!ctx->mmap_locked) {
> > > +		mmap_write_unlock(mm);
> > > +		mmput(mm);
> > > +	}
> > > +err_out:
> > > +	return ERR_PTR(err);
> > > +}
> > > +
> > > +/**
> > > + * for_each_dma_page - iterate over pages in a DMA region
> > > + * @i__: the current page index in the iteration
> > > + * @j__: the current page index, log order, in the iteration
> > > + * @npages__: the total number of pages in the DMA region
> > > + * @order__: the order of the pages in the DMA region
> > > + *
> > > + * This macro iterates over each page in a DMA region. The DMA
> > > region
> > > + * is assumed to be composed of 2^@order__ pages, and the macro
> > > will
> > > + * step through the region one block of 2^@order__ pages at a
> > > time.
> > > + */
> > > +#define for_each_dma_page(i__, j__, npages__, order__)	\
> > > +	for ((i__) = 0, (j__) = 0; (i__) < (npages__);	\
> > > +	     (j__)++, (i__) += 0x1 << (order__))
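
Concretely (illustrative values): with npages__ = 8 and order__ = 1,
i__ steps 0, 2, 4, 6 while j__ steps 0, 1, 2, 3, i.e. i__ indexes the
PFN array at device-page granularity while j__ counts the 2^order
blocks, which is why j__ is used to index the compacted dma_addr array
in __drm_gpusvm_range_unmap_pages() and drm_gpusvm_range_get_pages().
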
> > > +
> > > +/**
> > > + * __drm_gpusvm_range_unmap_pages - Unmap pages associated with
> > > a
> > > GPU SVM range (internal)
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + *
> > > + * This function unmaps pages associated with a GPU SVM range. Assumes and
> > > + * asserts correct locking is in place when called.
> > > + */
> > > +static void __drm_gpusvm_range_unmap_pages(struct drm_gpusvm
> > > *gpusvm,
> > > +					   struct
> > > drm_gpusvm_range
> > > *range)
> > > +{
> > > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > > +
> > > +	if (range->pages) {
> > > +		unsigned long i, j, npages = npages_in_range(range->va.start,
> > > +							     range->va.end);
> > > +
> > > +		if (range->flags.has_dma_mapping) {
> > > +			for_each_dma_page(i, j, npages, range->order)
> > > +				dma_unmap_page(gpusvm->drm->dev,
> > > +					       range->dma_addr[j],
> > > +					       PAGE_SIZE << range->order,
> > > +					       DMA_BIDIRECTIONAL);
> > > +		}
> > > +
> > > +		range->flags.has_vram_pages = false;
> > > +		range->flags.has_dma_mapping = false;
> > > +	}
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_free_pages - Free pages associated with a
> > > GPU
> > > SVM range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + *
> > > + * This function frees pages associated with a GPU SVM range.
> > > + */
> > > +static void drm_gpusvm_range_free_pages(struct drm_gpusvm
> > > *gpusvm,
> > > +					struct drm_gpusvm_range
> > > *range)
> > > +{
> > > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > > +
> > > +	if (range->pages) {
> > > +		if (range->flags.kfree_mapping) {
> > > +			kfree(range->dma_addr);
> > > +			range->flags.kfree_mapping = false;
> > > +			range->pages = NULL;
> > > +		} else {
> > > +			kvfree(range->pages);
> > > +			range->pages = NULL;
> > > +		}
> > > +	}
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_remove - Remove GPU SVM range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range to be removed
> > > + *
> > > + * This function removes the specified GPU SVM range and also
> > > removes the parent
> > > + * GPU SVM notifier if no more ranges remain in the notifier.
> > > The
> > > caller must
> > > + * hold a lock to protect range and notifier removal.
> > > + */
> > > +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> > > +			     struct drm_gpusvm_range *range)
> > > +{
> > > +	struct drm_gpusvm_notifier *notifier;
> > > +
> > > +	notifier = drm_gpusvm_notifier_find(gpusvm, range->va.start);
> > > +	if (WARN_ON_ONCE(!notifier))
> > > +		return;
> > > +
> > > +	drm_gpusvm_notifier_lock(gpusvm);
> > > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > +	drm_gpusvm_range_free_pages(gpusvm, range);
> > > +	__drm_gpusvm_range_remove(notifier, range);
> > > +	drm_gpusvm_notifier_unlock(gpusvm);
> > > +
> > > +	drm_gpusvm_range_put(range);
> > > +
> > > +	if (RB_EMPTY_ROOT(&notifier->root.rb_root)) {
> > > +		if (!notifier->flags.removed)
> > > +			mmu_interval_notifier_remove(&notifier->notifier);
> > > +		drm_gpusvm_notifier_remove(gpusvm, notifier);
> > > +		drm_gpusvm_notifier_free(gpusvm, notifier);
> > > +	}
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_get - Get a reference to GPU SVM range
> > > + * @range: Pointer to the GPU SVM range
> > > + *
> > > + * This function increments the reference count of the specified
> > > GPU
> > > SVM range.
> > > + *
> > > + * Returns:
> > > + * Pointer to the GPU SVM range.
> > > + */
> > > +struct drm_gpusvm_range *
> > > +drm_gpusvm_range_get(struct drm_gpusvm_range *range)
> > > +{
> > > +	kref_get(&range->refcount);
> > > +
> > > +	return range;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_destroy - Destroy GPU SVM range
> > > + * @refcount: Pointer to the reference counter embedded in the
> > > GPU
> > > SVM range
> > > + *
> > > + * This function destroys the specified GPU SVM range when its
> > > reference count
> > > + * reaches zero. If a custom range-free function is provided, it
> > > is
> > > invoked to
> > > + * free the range; otherwise, the range is deallocated using
> > > kfree().
> > > + */
> > > +static void drm_gpusvm_range_destroy(struct kref *refcount)
> > > +{
> > > +	struct drm_gpusvm_range *range =
> > > +		container_of(refcount, struct drm_gpusvm_range,
> > > refcount);
> > > +	struct drm_gpusvm *gpusvm = range->gpusvm;
> > > +
> > > +	if (gpusvm->ops->range_free)
> > > +		gpusvm->ops->range_free(range);
> > > +	else
> > > +		kfree(range);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_put - Put a reference to GPU SVM range
> > > + * @range: Pointer to the GPU SVM range
> > > + *
> > > + * This function decrements the reference count of the specified
> > > GPU
> > > SVM range
> > > + * and frees it when the count reaches zero.
> > > + */
> > > +void drm_gpusvm_range_put(struct drm_gpusvm_range *range)
> > > +{
> > > +	kref_put(&range->refcount, drm_gpusvm_range_destroy);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_pages_valid - GPU SVM range pages valid
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + *
> > > + * This function determines if a GPU SVM range's pages are valid. Expected to
> > > + * be called holding gpusvm->notifier_lock and as the last step before
> > > + * committing a GPU binding.
> > > + *
> > > + * Returns:
> > > + * True if GPU SVM range has valid pages, False otherwise
> > > + */
> > > +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> > > +				  struct drm_gpusvm_range
> > > *range)
> > > +{
> > > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > > +
> > > +	return range->flags.has_vram_pages || range->flags.has_dma_mapping;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_pages_valid_unlocked - GPU SVM range pages
> > > valid
> > > unlocked
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + *
> > > + * This function determines if a GPU SVM range's pages are valid. Expected to
> > > + * be called without holding gpusvm->notifier_lock.
> > > + *
> > > + * Returns:
> > > + * True if GPU SVM range has valid pages, False otherwise
> > > + */
> > > +static bool
> > > +drm_gpusvm_range_pages_valid_unlocked(struct drm_gpusvm *gpusvm,
> > > +				      struct drm_gpusvm_range
> > > *range)
> > > +{
> > > +	bool pages_valid;
> > > +
> > > +	if (!range->pages)
> > > +		return false;
> > > +
> > > +	drm_gpusvm_notifier_lock(gpusvm);
> > > +	pages_valid = drm_gpusvm_range_pages_valid(gpusvm,
> > > range);
> > > +	if (!pages_valid && range->flags.kfree_mapping) {
> > > +		kfree(range->dma_addr);
> > > +		range->flags.kfree_mapping = false;
> > > +		range->pages = NULL;
> > > +	}
> > > +	drm_gpusvm_notifier_unlock(gpusvm);
> > > +
> > > +	return pages_valid;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_get_pages - Get pages for a GPU SVM range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + * @ctx: GPU SVM context
> > > + *
> > > + * This function gets pages for a GPU SVM range and ensures they
> > > are
> > > mapped for
> > > + * DMA access.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       const struct drm_gpusvm_ctx *ctx)
> > > +{
> > > +	struct mmu_interval_notifier *notifier = &range->notifier->notifier;
> > > +	struct hmm_range hmm_range = {
> > > +		.default_flags = HMM_PFN_REQ_FAULT | (ctx->read_only ? 0 :
> > > +			HMM_PFN_REQ_WRITE),
> > > +		.notifier = notifier,
> > > +		.start = range->va.start,
> > > +		.end = range->va.end,
> > > +		.dev_private_owner = gpusvm->device_private_page_owner,
> > > +	};
> > > +	struct mm_struct *mm = gpusvm->mm;
> > > +	unsigned long timeout =
> > > +		jiffies +
> > > msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> > > +	unsigned long i, j;
> > > +	unsigned long npages = npages_in_range(range->va.start,
> > > range->va.end);
> > > +	unsigned int order = 0;
> > > +	unsigned long *pfns;
> > > +	struct page **pages;
> > > +	int err = 0;
> > > +	bool vram_pages = !!range->flags.migrate_vram;
> > > +	bool alloc_pfns = false, kfree_mapping;
> > > +
> > > +retry:
> > > +	kfree_mapping = false;
> > > +	hmm_range.notifier_seq =
> > > mmu_interval_read_begin(notifier);
> > > +	if (drm_gpusvm_range_pages_valid_unlocked(gpusvm,
> > > range))
> > > +		return 0;
> > > +
> > > +	if (range->notifier_seq == hmm_range.notifier_seq && range->pages) {
> > > +		if (ctx->prefault)
> > > +			return 0;
> > > +
> > > +		pfns = (unsigned long *)range->pages;
> > > +		pages = range->pages;
> > > +		goto map_pages;
> > > +	}
> > > +
> > > +	if (!range->pages) {
> > > +		pfns = kvmalloc_array(npages, sizeof(*pfns),
> > > GFP_KERNEL);
> > > +		if (!pfns)
> > > +			return -ENOMEM;
> > > +		alloc_pfns = true;
> > > +	} else {
> > > +		pfns = (unsigned long *)range->pages;
> > > +	}
> > > +
> > > +	if (!ctx->mmap_locked) {
> > > +		if (!mmget_not_zero(mm)) {
> > > +			err = -EFAULT;
> > > +			goto err_out;
> > > +		}
> > > +	}
> > > +
> > > +	hmm_range.hmm_pfns = pfns;
> > > +	while (true) {
> > > +		/* Must be checked after mmu_interval_read_begin
> > > */
> > > +		if (range->flags.unmapped) {
> > > +			err = -EFAULT;
> > > +			break;
> > > +		}
> > > +
> > > +		if (!ctx->mmap_locked) {
> > > +			/*
> > > +			 * XXX: HMM locking document indicates only a read-lock
> > > +			 * is required but there appears to be a window between
> > > +			 * the MMU_NOTIFY_MIGRATE event triggered in a CPU fault
> > > +			 * via migrate_vma_setup and the pages actually moving
> > > +			 * in migrate_vma_finalize in which this code can grab
> > > +			 * garbage pages. Grabbing the write-lock if the range
> > > +			 * is attached to vram appears to protect against this
> > > +			 * race.
> > > +			 */
> > > +			if (vram_pages)
> > > +				mmap_write_lock(mm);
> > > +			else
> > > +				mmap_read_lock(mm);
> > > +		}
> > > +		err = hmm_range_fault(&hmm_range);
> > > +		if (!ctx->mmap_locked) {
> > > +			if (vram_pages)
> > > +				mmap_write_unlock(mm);
> > > +			else
> > > +				mmap_read_unlock(mm);
> > > +		}
> > > +
> > > +		if (err == -EBUSY) {
> > > +			if (time_after(jiffies, timeout))
> > > +				break;
> > > +
> > > +			hmm_range.notifier_seq =
> > > mmu_interval_read_begin(notifier);
> > > +			continue;
> > > +		}
> > > +		break;
> > > +	}
> > > +	if (!ctx->mmap_locked)
> > > +		mmput(mm);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	pages = (struct page **)pfns;
> > > +
> > > +	if (ctx->prefault) {
> > > +		range->pages = pages;
> > > +		goto set_seqno;
> > > +	}
> > > +
> > > +map_pages:
> > > +	if (is_device_private_page(hmm_pfn_to_page(pfns[0]))) {
> > > +		WARN_ON_ONCE(!range->vram_allocation);
> > > +
> > > +		for (i = 0; i < npages; ++i) {
> > > +			pages[i] = hmm_pfn_to_page(pfns[i]);
> > > +
> > > +			if (WARN_ON_ONCE(!is_device_private_page(pages[i]))) {
> > > +				err = -EOPNOTSUPP;
> > > +				goto err_free;
> > > +			}
> > > +		}
> > > +
> > > +		/* Do not race with notifier unmapping pages */
> > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > +		range->flags.has_vram_pages = true;
> > > +		range->pages = pages;
> > > +		if (mmu_interval_read_retry(notifier,
> > > hmm_range.notifier_seq)) {
> > > +			err = -EAGAIN;
> > > +			__drm_gpusvm_range_unmap_pages(gpusvm,
> > > range);
> > > +		}
> > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > +	} else {
> > > +		dma_addr_t *dma_addr = (dma_addr_t *)pfns;
> > > +
> > > +		for_each_dma_page(i, j, npages, order) {
> > > +			if (WARN_ON_ONCE(i && order !=
> > > +					 hmm_pfn_to_map_order(pfns[i]))) {
> > > +				err = -EOPNOTSUPP;
> > > +				npages = i;
> > > +				goto err_unmap;
> > > +			}
> > > +			order = hmm_pfn_to_map_order(pfns[i]);
> > > +
> > > +			pages[j] = hmm_pfn_to_page(pfns[i]);
> > > +			if (WARN_ON_ONCE(is_zone_device_page(pages[j]))) {
> > > +				err = -EOPNOTSUPP;
> > > +				npages = i;
> > > +				goto err_unmap;
> > > +			}
> > > +
> > > +			set_page_dirty_lock(pages[j]);
> > > +			mark_page_accessed(pages[j]);
> > > +
> > > +			dma_addr[j] = dma_map_page(gpusvm->drm->dev,
> > > +						   pages[j], 0,
> > > +						   PAGE_SIZE << order,
> > > +						   DMA_BIDIRECTIONAL);
> > > +			if (dma_mapping_error(gpusvm->drm->dev, dma_addr[j])) {
> > > +				err = -EFAULT;
> > > +				npages = i;
> > > +				goto err_unmap;
> > > +			}
> > > +		}
> > > +
> > > +		/* Huge pages, reduce memory footprint */
> > > +		if (order) {
> > > +			dma_addr = kmalloc_array(j, sizeof(*dma_addr),
> > > +						 GFP_KERNEL);
> > > +			if (dma_addr) {
> > > +				for (i = 0; i < j; ++i)
> > > +					dma_addr[i] = (dma_addr_t)pfns[i];
> > > +				kvfree(pfns);
> > > +				kfree_mapping = true;
> > > +			} else {
> > > +				dma_addr = (dma_addr_t *)pfns;
> > > +			}
> > > +		}
> > > +
> > > +		/* Do not race with notifier unmapping pages */
> > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > +		range->order = order;
> > > +		range->flags.kfree_mapping = kfree_mapping;
> > > +		range->flags.has_dma_mapping = true;
> > > +		range->dma_addr = dma_addr;
> > > +		range->vram_allocation = NULL;
> > > +		if (mmu_interval_read_retry(notifier,
> > > hmm_range.notifier_seq)) {
> > > +			err = -EAGAIN;
> > > +			__drm_gpusvm_range_unmap_pages(gpusvm,
> > > range);
> > > +		}
> > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > +	}
> > > +
> > > +	if (err == -EAGAIN)
> > > +		goto retry;
> > > +set_seqno:
> > > +	range->notifier_seq = hmm_range.notifier_seq;
> > > +
> > > +	return 0;
> > > +
> > > +err_unmap:
> > > +	for_each_dma_page(i, j, npages, order)
> > > +		dma_unmap_page(gpusvm->drm->dev,
> > > +			       (dma_addr_t)pfns[j],
> > > +			       PAGE_SIZE << order,
> > > DMA_BIDIRECTIONAL);
> > > +err_free:
> > > +	if (alloc_pfns)
> > > +		kvfree(pfns);
> > > +err_out:
> > > +	return err;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_unmap_pages - Unmap pages associated with a
> > > GPU
> > > SVM range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + * @ctx: GPU SVM context
> > > + *
> > > + * This function unmaps pages associated with a GPU SVM range.
> > > If
> > > @in_notifier
> > > + * is set, it is assumed that gpusvm->notifier_lock is held in
> > > write
> > > mode; if it
> > > + * is clear, it acquires gpusvm->notifier_lock in read mode.
> > > Must be
> > > called on
> > > + * each GPU SVM range attached to notifier in gpusvm->ops->invalidate for IOMMU
> > > + * security model.
> > > + */
> > > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > > +				  struct drm_gpusvm_range
> > > *range,
> > > +				  const struct drm_gpusvm_ctx
> > > *ctx)
> > > +{
> > > +	if (ctx->in_notifier)
> > > +		lockdep_assert_held_write(&gpusvm->notifier_lock);
> > > +	else
> > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > +
> > > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > +
> > > +	if (!ctx->in_notifier)
> > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migration_put_page - Put a migration page
> > > + * @page: Pointer to the page to put
> > > + *
> > > + * This function unlocks and puts a page.
> > > + */
> > > +static void drm_gpusvm_migration_put_page(struct page *page)
> > > +{
> > > +	unlock_page(page);
> > > +	put_page(page);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migration_put_pages - Put migration pages
> > > + * @npages: Number of pages
> > > + * @migrate_pfn: Array of migrate page frame numbers
> > > + *
> > > + * This function puts an array of pages.
> > > + */
> > > +static void drm_gpusvm_migration_put_pages(unsigned long npages,
> > > +					   unsigned long
> > > *migrate_pfn)
> > > +{
> > > +	unsigned long i;
> > > +
> > > +	for (i = 0; i < npages; ++i) {
> > > +		if (!migrate_pfn[i])
> > > +			continue;
> > > +
> > > +		drm_gpusvm_migration_put_page(migrate_pfn_to_page(migrate_pfn[i]));
> > > +		migrate_pfn[i] = 0;
> > > +	}
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_get_vram_page - Get a reference to a VRAM page
> > > + * @page: Pointer to the page
> > > + * @zdd: Pointer to the GPU SVM zone device data
> > > + *
> > > + * This function associates the given page with the specified
> > > GPU
> > > SVM zone
> > > + * device data and initializes it for zone device usage.
> > > + */
> > > +static void drm_gpusvm_get_vram_page(struct page *page,
> > > +				     struct drm_gpusvm_zdd *zdd)
> > > +{
> > > +	page->zone_device_data = drm_gpusvm_zdd_get(zdd);
> > > +	zone_device_page_init(page);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_map_pages() - Map migration pages for GPU
> > > SVM
> > > migration
> > > + * @dev: The device for which the pages are being mapped
> > > + * @dma_addr: Array to store DMA addresses corresponding to
> > > mapped
> > > pages
> > > + * @migrate_pfn: Array of migrate page frame numbers to map
> > > + * @npages: Number of pages to map
> > > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > > + *
> > > + * This function maps pages of memory for migration usage in GPU
> > > SVM. It
> > > + * iterates over each page frame number provided in
> > > @migrate_pfn,
> > > maps the
> > > + * corresponding page, and stores the DMA address in the
> > > provided
> > > @dma_addr
> > > + * array.
> > > + *
> > > + * Return: 0 on success, -EFAULT if an error occurs during
> > > mapping.
> > > + */
> > > +static int drm_gpusvm_migrate_map_pages(struct device *dev,
> > > +					dma_addr_t *dma_addr,
> > > +					long unsigned int *migrate_pfn,
> > > +					unsigned long npages,
> > > +					enum dma_data_direction dir)
> > > +{
> > > +	unsigned long i;
> > > +
> > > +	for (i = 0; i < npages; ++i) {
> > > +		struct page *page =
> > > migrate_pfn_to_page(migrate_pfn[i]);
> > > +
> > > +		if (!page)
> > > +			continue;
> > > +
> > > +		if (WARN_ON_ONCE(is_zone_device_page(page)))
> > > +			return -EFAULT;
> > > +
> > > +		dma_addr[i] = dma_map_page(dev, page, 0,
> > > PAGE_SIZE,
> > > dir);
> > > +		if (dma_mapping_error(dev, dma_addr[i]))
> > > +			return -EFAULT;
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously
> > > mapped
> > > for GPU SVM migration
> > > + * @dev: The device for which the pages were mapped
> > > + * @dma_addr: Array of DMA addresses corresponding to mapped
> > > pages
> > > + * @npages: Number of pages to unmap
> > > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > > + *
> > > + * This function unmaps previously mapped pages of memory for
> > > GPU
> > > Shared Virtual
> > > + * Memory (SVM). It iterates over each DMA address provided in
> > > @dma_addr, checks
> > > + * if it's valid and not already unmapped, and unmaps the
> > > corresponding page.
> > > + */
> > > +static void drm_gpusvm_migrate_unmap_pages(struct device *dev,
> > > +					   dma_addr_t *dma_addr,
> > > +					   unsigned long npages,
> > > +					   enum dma_data_direction dir)
> > > +{
> > > +	unsigned long i;
> > > +
> > > +	for (i = 0; i < npages; ++i) {
> > > +		if (!dma_addr[i] || dma_mapping_error(dev,
> > > dma_addr[i]))
> > > +			continue;
> > > +
> > > +		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE,
> > > dir);
> > > +	}
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_to_vram - Migrate GPU SVM range to VRAM
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + * @vram_allocation: Driver-private pointer to the VRAM allocation. The caller
> > > + *                   should hold a reference to the VRAM allocation, which
> > > + *                   should be dropped via ops->vram_release or upon the
> > > + *                   failure of this function.
> > > + * @ctx: GPU SVM context
> > > + *
> > > + * This function migrates the specified GPU SVM range to VRAM. It performs the
> > > + * necessary setup and invokes the driver-specific operations for migration to
> > > + * VRAM. Upon successful return, @vram_allocation can safely reference @range
> > > + * until ops->vram_release is called, which only happens upon successful return.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       void *vram_allocation,
> > > +			       const struct drm_gpusvm_ctx *ctx)
> > > +{
> > > +	u64 start = range->va.start, end = range->va.end;
> > > +	struct migrate_vma migrate = {
> > > +		.start		= start,
> > > +		.end		= end,
> > > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > > +		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
> > > +	};
> > > +	struct mm_struct *mm = gpusvm->mm;
> > > +	unsigned long i, npages = npages_in_range(start, end);
> > > +	struct vm_area_struct *vas;
> > > +	struct drm_gpusvm_zdd *zdd = NULL;
> > > +	struct page **pages;
> > > +	dma_addr_t *dma_addr;
> > > +	void *buf;
> > > +	int err;
> > > +
> > > +	if (!range->flags.migrate_vram)
> > > +		return -EINVAL;
> > > +
> > > +	if (!gpusvm->ops->populate_vram_pfn || !gpusvm->ops->copy_to_vram ||
> > > +	    !gpusvm->ops->copy_to_sram)
> > > +		return -EOPNOTSUPP;
> > > +
> > > +	if (!ctx->mmap_locked) {
> > > +		if (!mmget_not_zero(mm)) {
> > > +			err = -EFAULT;
> > > +			goto err_out;
> > > +		}
> > > +		mmap_write_lock(mm);
> > > +	}
> > > +
> > > +	mmap_assert_locked(mm);
> > > +
> > > +	vas = vma_lookup(mm, start);
> > > +	if (!vas) {
> > > +		err = -ENOENT;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	if (end > vas->vm_end || start < vas->vm_start) {
> > > +		err = -EINVAL;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	if (!vma_is_anonymous(vas)) {
> > > +		err = -EBUSY;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
> > > +		       sizeof(*pages), GFP_KERNEL);
> > > +	if (!buf) {
> > > +		err = -ENOMEM;
> > > +		goto err_mmunlock;
> > > +	}
> > > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
> > > +
> > > +	zdd = drm_gpusvm_zdd_alloc(range);
> > > +	if (!zdd) {
> > > +		err = -ENOMEM;
> > > +		goto err_free;
> > > +	}
> > > +
> > > +	migrate.vma = vas;
> > > +	migrate.src = buf;
> > > +	migrate.dst = migrate.src + npages;
> > > +
> > > +	err = migrate_vma_setup(&migrate);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	/*
> > > +	 * FIXME: The cases below, !migrate.cpages and migrate.cpages != npages,
> > > +	 * are not always errors. Need to revisit possible cases and how to
> > > +	 * handle them. We could prefault on migrate.cpages != npages via
> > > +	 * hmm_range_fault.
> > > +	 */
> > > +
> > > +	if (!migrate.cpages) {
> > > +		err = -EFAULT;
> > > +		goto err_free;
> > > +	}
> > > +
> > > +	if (migrate.cpages != npages) {
> > > +		err = -EBUSY;
> > > +		goto err_finalize;
> > > +	}
> > > +
> > > +	err = gpusvm->ops->populate_vram_pfn(gpusvm, vram_allocation, npages,
> > > +					     migrate.dst);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> > > +					   migrate.src, npages, DMA_TO_DEVICE);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	for (i = 0; i < npages; ++i) {
> > > +		struct page *page = pfn_to_page(migrate.dst[i]);
> > > +
> > > +		pages[i] = page;
> > > +		migrate.dst[i] = migrate_pfn(migrate.dst[i]);
> > > +		drm_gpusvm_get_vram_page(page, zdd);
> > > +	}
> > > +
> > > +	err = gpusvm->ops->copy_to_vram(gpusvm, pages, dma_addr, npages);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	/* Upon success bind vram allocation to range and zdd */
> > > +	range->vram_allocation = vram_allocation;
> > > +	WRITE_ONCE(zdd->vram_allocation, vram_allocation);	/* Owns ref */
> > > +
> > > +err_finalize:
> > > +	if (err)
> > > +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> > > +	migrate_vma_pages(&migrate);
> > > +	migrate_vma_finalize(&migrate);
> > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
> > > +				       DMA_TO_DEVICE);
> > > +err_free:
> > > +	if (zdd)
> > > +		drm_gpusvm_zdd_put(zdd);
> > > +	kvfree(buf);
> > > +err_mmunlock:
> > > +	if (!ctx->mmap_locked) {
> > > +		mmap_write_unlock(mm);
> > > +		mmput(mm);
> > > +	}
> > > +err_out:
> > > +	return err;
> > > +}
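
To make the driver side of this flow concrete, here is a hedged sketch of the
two hooks this function relies on, assuming a physically contiguous VRAM
allocation. Every driver_* name is hypothetical; the real implementation for
Xe comes in later patches of this series.

/*
 * Sketch of ->populate_vram_pfn(): fill the migrate destination array with
 * raw device-private PFNs backing the (here assumed contiguous) allocation.
 * driver_allocation_base_pfn() is a hypothetical helper.
 */
static int driver_populate_vram_pfn(struct drm_gpusvm *gpusvm,
				    void *vram_allocation,
				    unsigned long npages,
				    unsigned long *pfn)
{
	unsigned long base_pfn = driver_allocation_base_pfn(vram_allocation);
	unsigned long i;

	for (i = 0; i < npages; ++i)
		pfn[i] = base_pfn + i;

	return 0;
}

/*
 * Sketch of ->copy_to_vram(): @dma_addr are the dma-mapped system pages
 * (source), @pages are the VRAM pages chosen above (destination). A real
 * driver would batch this into a single copy job; driver_copy_job_*() and
 * driver_page_to_vram_addr() are hypothetical.
 */
static int driver_copy_to_vram(struct drm_gpusvm *gpusvm,
			       struct page **pages,
			       dma_addr_t *dma_addr,
			       unsigned long npages)
{
	struct driver_copy_job *job = driver_copy_job_create(gpusvm->drm);
	unsigned long i;

	if (!job)
		return -ENOMEM;

	for (i = 0; i < npages; ++i)
		driver_copy_job_add(job, dma_addr[i] /* src */,
				    driver_page_to_vram_addr(pages[i]) /* dst */);

	return driver_copy_job_submit_and_wait(job);
}
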
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_populate_sram_pfn - Populate SRAM PFNs for a VM area
> > > + * @vas: Pointer to the VM area structure, can be NULL
> > > + * @npages: Number of pages to populate
> > > + * @src_mpfn: Source array of migrate PFNs
> > > + * @mpfn: Array of migrate PFNs to populate
> > > + * @addr: Start address for PFN allocation
> > > + *
> > > + * This function populates the SRAM migrate page frame numbers (PFNs) for the
> > > + * specified VM area structure. It allocates and locks pages in the VM area for
> > > + * SRAM usage. If @vas is non-NULL, alloc_page_vma() is used for allocation;
> > > + * if NULL, alloc_page() is used.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +static int drm_gpusvm_migrate_populate_sram_pfn(struct vm_area_struct *vas,
> > > +						unsigned long npages,
> > > +						unsigned long *src_mpfn,
> > > +						unsigned long *mpfn,
> > > +						u64 addr)
> > > +{
> > > +	unsigned long i;
> > > +
> > > +	for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
> > > +		struct page *page;
> > > +
> > > +		if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
> > > +			continue;
> > > +
> > > +		if (vas)
> > > +			page = alloc_page_vma(GFP_HIGHUSER, vas,
> > > addr);
> > > +		else
> > > +			page = alloc_page(GFP_HIGHUSER);
> > > +
> > > +		if (!page)
> > > +			return -ENOMEM;
> > > +
> > > +		lock_page(page);
> > > +		mpfn[i] = migrate_pfn(page_to_pfn(page));
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_evict_to_sram - Evict GPU SVM range to SRAM
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + *
> > > + * Similar to __drm_gpusvm_migrate_to_sram but does not require the mmap lock;
> > > + * migration is done via the migrate_device_* functions. This is a fallback
> > > + * path, as it is preferred to issue migrations with the mmap lock held.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +static int drm_gpusvm_evict_to_sram(struct drm_gpusvm *gpusvm,
> > > +				    struct drm_gpusvm_range *range)
> > > +{
> > > +	unsigned long npages;
> > > +	struct page **pages;
> > > +	unsigned long *src, *dst;
> > > +	dma_addr_t *dma_addr;
> > > +	void *buf;
> > > +	int i, err = 0;
> > > +
> > > +	npages = npages_in_range(range->va.start, range->va.end);
> > > +
> > > +	buf = kvcalloc(npages, 2 * sizeof(*src) +
> > > sizeof(*dma_addr)
> > > +
> > > +		       sizeof(*pages), GFP_KERNEL);
> > > +	if (!buf) {
> > > +		err = -ENOMEM;
> > > +		goto err_out;
> > > +	}
> > > +	src = buf;
> > > +	dst = buf + (sizeof(*src) * npages);
> > > +	dma_addr = buf + (2 * sizeof(*src) * npages);
> > > +	pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) *
> > > npages;
> > > +
> > > +	err = gpusvm->ops->populate_vram_pfn(gpusvm, range->vram_allocation,
> > > +					     npages, src);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	err = migrate_device_vma_range(gpusvm->mm,
> > > +				       gpusvm->device_private_page_owner, src,
> > > +				       npages, range->va.start);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	err = drm_gpusvm_migrate_populate_sram_pfn(NULL, npages,
> > > src, dst, 0);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > > dma_addr,
> > > +					   dst, npages,
> > > DMA_BIDIRECTIONAL);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	for (i = 0; i < npages; ++i)
> > > +		pages[i] = migrate_pfn_to_page(src[i]);
> > > +
> > > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr,
> > > npages);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +err_finalize:
> > > +	if (err)
> > > +		drm_gpusvm_migration_put_pages(npages, dst);
> > > +	migrate_device_pages(src, dst, npages);
> > > +	migrate_device_finalize(src, dst, npages);
> > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev,
> > > dma_addr,
> > > npages,
> > > +				       DMA_BIDIRECTIONAL);
> > > +err_free:
> > > +	kvfree(buf);
> > > +err_out:
> > > +
> > > +	return err;
> > > +}
> > > +
> > > +/**
> > > + * __drm_gpusvm_migrate_to_sram - Migrate GPU SVM range to SRAM (internal)
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @vas: Pointer to the VM area structure
> > > + * @page: Pointer to the page for fault handling (can be NULL)
> > > + * @start: Start address of the migration range
> > > + * @end: End address of the migration range
> > > + *
> > > + * This internal function performs the migration of the specified GPU SVM range
> > > + * to SRAM. It sets up the migration, populates + dma maps SRAM PFNs, and
> > > + * invokes the driver-specific operations for migration to SRAM.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +static int __drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > +					struct vm_area_struct *vas,
> > > +					struct page *page,
> > > +					u64 start, u64 end)
> > > +{
> > > +	struct migrate_vma migrate = {
> > > +		.vma		= vas,
> > > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > > +		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
> > > +		.fault_page	= page,
> > > +	};
> > > +	unsigned long npages;
> > > +	struct page **pages;
> > > +	dma_addr_t *dma_addr;
> > > +	void *buf;
> > > +	int i, err = 0;
> > > +
> > > +	mmap_assert_locked(gpusvm->mm);
> > > +
> > > +	/* Corner where VMA area struct has been partially
> > > unmapped
> > > */
> > > +	if (start < vas->vm_start)
> > > +		start = vas->vm_start;
> > > +	if (end > vas->vm_end)
> > > +		end = vas->vm_end;
> > > +
> > > +	migrate.start = start;
> > > +	migrate.end = end;
> > > +	npages = npages_in_range(start, end);
> > > +
> > > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) +
> > > sizeof(*dma_addr) +
> > > +		       sizeof(*pages), GFP_KERNEL);
> > > +	if (!buf) {
> > > +		err = -ENOMEM;
> > > +		goto err_out;
> > > +	}
> > > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > > +	pages = buf + (2 * sizeof(*migrate.src) +
> > > sizeof(*dma_addr))
> > > * npages;
> > > +
> > > +	migrate.vma = vas;
> > > +	migrate.src = buf;
> > > +	migrate.dst = migrate.src + npages;
> > > +
> > > +	err = migrate_vma_setup(&migrate);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	/* Raced with another CPU fault, nothing to do */
> > > +	if (!migrate.cpages)
> > > +		goto err_free;
> > > +
> > > +	err = drm_gpusvm_migrate_populate_sram_pfn(vas, npages,
> > > +						   migrate.src,
> > > migrate.dst,
> > > +						   start);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > > dma_addr,
> > > +					   migrate.dst, npages,
> > > +					   DMA_BIDIRECTIONAL);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	for (i = 0; i < npages; ++i)
> > > +		pages[i] = migrate_pfn_to_page(migrate.src[i]);
> > > +
> > > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr,
> > > npages);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +err_finalize:
> > > +	if (err)
> > > +		drm_gpusvm_migration_put_pages(npages,
> > > migrate.dst);
> > > +	migrate_vma_pages(&migrate);
> > > +	migrate_vma_finalize(&migrate);
> > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev,
> > > dma_addr,
> > > npages,
> > > +				       DMA_BIDIRECTIONAL);
> > > +err_free:
> > > +	kvfree(buf);
> > > +err_out:
> > > +	mmap_assert_locked(gpusvm->mm);
> > > +
> > > +	return err;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_to_sram - Migrate (evict) GPU SVM range to SRAM
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + * @ctx: GPU SVM context
> > > + *
> > > + * This function initiates the migration of the specified GPU SVM range to
> > > + * SRAM. It performs necessary checks and invokes the internal migration
> > > + * function for actual migration.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       const struct drm_gpusvm_ctx *ctx)
> > > +{
> > > +	u64 start = range->va.start, end = range->va.end;
> > > +	struct mm_struct *mm = gpusvm->mm;
> > > +	struct vm_area_struct *vas;
> > > +	int err;
> > > +	bool retry = false;
> > > +
> > > +	if (!ctx->mmap_locked) {
> > > +		if (!mmget_not_zero(mm)) {
> > > +			err = -EFAULT;
> > > +			goto err_out;
> > > +		}
> > > +		if (ctx->trylock_mmap) {
> > > +			if (!mmap_read_trylock(mm))  {
> > > +				err =
> > > drm_gpusvm_evict_to_sram(gpusvm, range);
> > > +				goto err_mmput;
> > > +			}
> > > +		} else {
> > > +			mmap_read_lock(mm);
> > > +		}
> > > +	}
> > > +
> > > +	mmap_assert_locked(mm);
> > > +
> > > +	/*
> > > +	 * Loop required to find all VMA area structs for the
> > > corner
> > > case when
> > > +	 * VRAM backing has been partially unmapped from MM's
> > > address space.
> > > +	 */
> > > +again:
> > > +	vas = find_vma(mm, start);
> > > +	if (!vas) {
> > > +		if (!retry)
> > > +			err = -ENOENT;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	if (end <= vas->vm_start || start >= vas->vm_end) {
> > > +		if (!retry)
> > > +			err = -EINVAL;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	err = __drm_gpusvm_migrate_to_sram(gpusvm, vas, NULL,
> > > start,
> > > end);
> > > +	if (err)
> > > +		goto err_mmunlock;
> > > +
> > > +	if (vas->vm_end < end) {
> > > +		retry = true;
> > > +		start = vas->vm_end;
> > > +		goto again;
> > > +	}
> > > +
> > > +	if (!ctx->mmap_locked) {
> > > +		mmap_read_unlock(mm);
> > > +		/*
> > > +		 * Using mmput_async as this function can be
> > > called
> > > while
> > > +		 * holding a dma-resv lock, and a final put can
> > > grab
> > > the mmap
> > > +		 * lock, causing a lock inversion.
> > > +		 */
> > > +		mmput_async(mm);
> > > +	}
> > > +
> > > +	return 0;
> > > +
> > > +err_mmunlock:
> > > +	if (!ctx->mmap_locked)
> > > +		mmap_read_unlock(mm);
> > > +err_mmput:
> > > +	if (!ctx->mmap_locked)
> > > +		mmput_async(mm);
> > > +err_out:
> > > +	return err;
> > > +}
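
A usage sketch for the trylock_mmap context (and the drm_gpusvm_evict_to_sram()
fallback above): a driver eviction path that already holds a dma-resv lock
cannot take the mmap lock unconditionally without risking an inversion against
mmap -> dma-resv ordering elsewhere, so it would do something like the
following. driver_vram_allocation_to_range() is a hypothetical lookup helper.

/*
 * Sketch: evicting SVM-backed VRAM while a dma-resv lock is held. With
 * trylock_mmap set, GPU SVM falls back to drm_gpusvm_evict_to_sram() when
 * the mmap trylock fails.
 */
static int driver_evict_svm_vram(struct drm_gpusvm *gpusvm,
				 void *vram_allocation)
{
	struct drm_gpusvm_ctx ctx = { .trylock_mmap = true, };
	struct drm_gpusvm_range *range =
		driver_vram_allocation_to_range(vram_allocation);

	return drm_gpusvm_migrate_to_sram(gpusvm, range, &ctx);
}
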
> > > +
> > > +/**
> > > + * drm_gpusvm_page_free - Put GPU SVM zone device data associated with a page
> > > + * @page: Pointer to the page
> > > + *
> > > + * This function is a callback used to put the GPU SVM zone device data
> > > + * associated with a page when it is being released.
> > > + */
> > > +static void drm_gpusvm_page_free(struct page *page)
> > > +{
> > > +	drm_gpusvm_zdd_put(page->zone_device_data);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_to_ram - Migrate GPU SVM range to RAM (page fault handler)
> > > + * @vmf: Pointer to the fault information structure
> > > + *
> > > + * This function is a page fault handler used to migrate a GPU SVM range to RAM.
> > > + * It retrieves the GPU SVM range information from the faulting page and invokes
> > > + * the internal migration function to migrate the range back to RAM.
> > > + *
> > > + * Returns:
> > > + * VM_FAULT_SIGBUS on failure, 0 on success.
> > > + */
> > > +static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault *vmf)
> > > +{
> > > +	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
> > > +	int err;
> > > +
> > > +	err = __drm_gpusvm_migrate_to_sram(zdd->range->gpusvm,
> > > +					   vmf->vma, vmf->page,
> > > +					   zdd->range->va.start,
> > > +					   zdd->range->va.end);
> > > +
> > > +	return err ? VM_FAULT_SIGBUS : 0;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_pagemap_ops - Device page map operations for GPU SVM
> > > + */
> > > +static const struct dev_pagemap_ops drm_gpusvm_pagemap_ops = {
> > > +	.page_free = drm_gpusvm_page_free,
> > > +	.migrate_to_ram = drm_gpusvm_migrate_to_ram,
> > > +};
> > > +
> > > +/**
> > > + * drm_gpusvm_pagemap_ops_get - Retrieve GPU SVM device page map operations
> > > + *
> > > + * Returns:
> > > + * Pointer to the GPU SVM device page map operations structure.
> > > + */
> > > +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void)
> > > +{
> > > +	return &drm_gpusvm_pagemap_ops;
> > > +}
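
As a usage sketch, a driver would typically expose its VRAM as device-private
memory with these ops roughly as below. struct driver_vram and its members are
hypothetical; error handling is trimmed.

static int driver_vram_pagemap_init(struct drm_device *drm,
				    struct driver_vram *vram)
{
	struct resource *res;
	void *addr;

	/* Reserve a CPU-invisible physical range to stand in for VRAM pages */
	res = devm_request_free_mem_region(drm->dev, &iomem_resource,
					   vram->usable_size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	vram->pagemap.type = MEMORY_DEVICE_PRIVATE;
	vram->pagemap.range.start = res->start;
	vram->pagemap.range.end = res->end;
	vram->pagemap.nr_range = 1;
	vram->pagemap.ops = drm_gpusvm_pagemap_ops_get();
	/* Must match the device_private_page_owner later passed to drm_gpusvm_init() */
	vram->pagemap.owner = drm;

	addr = devm_memremap_pages(drm->dev, &vram->pagemap);
	return PTR_ERR_OR_ZERO(addr);
}
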
> > > +
> > > +/**
> > > + * drm_gpusvm_has_mapping - Check if GPU SVM has mapping for the given address range
> > > + * @gpusvm: Pointer to the GPU SVM structure.
> > > + * @start: Start address
> > > + * @end: End address
> > > + *
> > > + * Returns:
> > > + * True if GPU SVM has mapping, False otherwise
> > > + */
> > > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start, u64 end)
> > > +{
> > > +	struct drm_gpusvm_notifier *notifier;
> > > +
> > > +	drm_gpusvm_for_each_notifier(notifier, gpusvm, start, end) {
> > > +		struct drm_gpusvm_range *range = NULL;
> > > +
> > > +		drm_gpusvm_for_each_range(range, notifier, start, end)
> > > +			return true;
> > > +	}
> > > +
> > > +	return false;
> > > +}
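
A small usage sketch: a driver could consult this helper before allowing a
conventional (BO / userptr) binding to be created on top of virtual addresses
the GPU already maps through SVM. struct driver_vm and driver_vm_to_gpusvm()
are hypothetical.

static int driver_check_no_svm_overlap(struct driver_vm *vm, u64 start, u64 end)
{
	/* Refuse to stack a regular binding on top of live SVM mappings */
	if (drm_gpusvm_has_mapping(driver_vm_to_gpusvm(vm), start, end))
		return -EBUSY;

	return 0;
}
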
> > > diff --git a/drivers/gpu/drm/xe/drm_gpusvm.h
> > > b/drivers/gpu/drm/xe/drm_gpusvm.h
> > > new file mode 100644
> > > index 000000000000..0ea70f8534a8
> > > --- /dev/null
> > > +++ b/drivers/gpu/drm/xe/drm_gpusvm.h
> > > @@ -0,0 +1,415 @@
> > > +/* SPDX-License-Identifier: MIT */
> > > +/*
> > > + * Copyright © 2024 Intel Corporation
> > > + */
> > > +
> > > +#ifndef __DRM_GPUSVM_H__
> > > +#define __DRM_GPUSVM_H__
> > > +
> > > +#include <linux/kref.h>
> > > +#include <linux/mmu_notifier.h>
> > > +#include <linux/workqueue.h>
> > > +
> > > +struct dev_pagemap_ops;
> > > +struct drm_device;
> > > +struct drm_gpusvm;
> > > +struct drm_gpusvm_notifier;
> > > +struct drm_gpusvm_ops;
> > > +struct drm_gpusvm_range;
> > > +
> > > +/**
> > > + * struct drm_gpusvm_ops - Operations structure for GPU SVM
> > > + *
> > > + * This structure defines the operations for GPU Shared Virtual Memory (SVM).
> > > + * These operations are provided by the GPU driver to manage SVM ranges and
> > > + * perform operations such as migration between VRAM and system RAM.
> > > + */
> > > +struct drm_gpusvm_ops {
> > > +	/**
> > > +	 * @notifier_alloc: Allocate a GPU SVM notifier
> > > (optional)
> > > +	 *
> > > +	 * This function shall allocate a GPU SVM notifier.
> > > +	 *
> > > +	 * Returns:
> > > +	 * Pointer to the allocated GPU SVM notifier on success,
> > > NULL on failure.
> > > +	 */
> > > +	struct drm_gpusvm_notifier *(*notifier_alloc)(void);
> > > +
> > > +	/**
> > > +	 * @notifier_free: Free a GPU SVM notifier (optional)
> > > +	 * @notifier: Pointer to the GPU SVM notifier to be
> > > freed
> > > +	 *
> > > +	 * This function shall free a GPU SVM notifier.
> > > +	 */
> > > +	void (*notifier_free)(struct drm_gpusvm_notifier
> > > *notifier);
> > > +
> > > +	/**
> > > +	 * @range_alloc: Allocate a GPU SVM range (optional)
> > > +	 * @gpusvm: Pointer to the GPU SVM
> > > +	 *
> > > +	 * This function shall allocate a GPU SVM range.
> > > +	 *
> > > +	 * Returns:
> > > +	 * Pointer to the allocated GPU SVM range on success,
> > > NULL
> > > on failure.
> > > +	 */
> > > +	struct drm_gpusvm_range *(*range_alloc)(struct
> > > drm_gpusvm
> > > *gpusvm);
> > > +
> > > +	/**
> > > +	 * @range_free: Free a GPU SVM range (optional)
> > > +	 * @range: Pointer to the GPU SVM range to be freed
> > > +	 *
> > > +	 * This function shall free a GPU SVM range.
> > > +	 */
> > > +	void (*range_free)(struct drm_gpusvm_range *range);
> > > +
> > > +	/**
> > > +	 * @vram_release: Release VRAM allocation (optional)
> > > +	 * @vram_allocation: Driver-private pointer to the VRAM
> > > allocation
> > > +	 *
> > > +	 * This function shall release VRAM allocation and
> > > expects
> > > to drop a
> > > +	 * reference to VRAM allocation.
> > > +	 */
> > > +	void (*vram_release)(void *vram_allocation);
> > > +
> > > +	/**
> > > +	 * @populate_vram_pfn: Populate VRAM PFN (required for
> > > migration)
> > > +	 * @gpusvm: Pointer to the GPU SVM
> > > +	 * @vram_allocation: Driver-private pointer to the VRAM
> > > allocation
> > > +	 * @npages: Number of pages to populate
> > > +	 * @pfn: Array of page frame numbers to populate
> > > +	 *
> > > +	 * This function shall populate VRAM page frame numbers
> > > (PFN).
> > > +	 *
> > > +	 * Returns:
> > > +	 * 0 on success, a negative error code on failure.
> > > +	 */
> > > +	int (*populate_vram_pfn)(struct drm_gpusvm *gpusvm,
> > > +				 void *vram_allocation,
> > > +				 unsigned long npages,
> > > +				 unsigned long *pfn);
> > > +
> > > +	/**
> > > +	 * @copy_to_vram: Copy to VRAM (required for migration)
> > > +	 * @gpusvm: Pointer to the GPU SVM
> > > +	 * @pages: Pointer to array of VRAM pages (destination)
> > > +	 * @dma_addr: Pointer to array of DMA addresses (source)
> > > +	 * @npages: Number of pages to copy
> > > +	 *
> > > +	 * This function shall copy pages to VRAM.
> > > +	 *
> > > +	 * Returns:
> > > +	 * 0 on success, a negative error code on failure.
> > > +	 */
> > > +	int (*copy_to_vram)(struct drm_gpusvm *gpusvm,
> > > +			    struct page **pages,
> > > +			    dma_addr_t *dma_addr,
> > > +			    unsigned long npages);
> > > +
> > > +	/**
> > > +	 * @copy_to_sram: Copy to system RAM (required for
> > > migration)
> > > +	 * @gpusvm: Pointer to the GPU SVM
> > > +	 * @pages: Pointer to array of VRAM pages (source)
> > > +	 * @dma_addr: Pointer to array of DMA addresses
> > > (destination)
> > > +	 * @npages: Number of pages to copy
> > > +	 *
> > > +	 * This function shall copy pages to system RAM.
> > > +	 *
> > > +	 * Returns:
> > > +	 * 0 on success, a negative error code on failure.
> > > +	 */
> > > +	int (*copy_to_sram)(struct drm_gpusvm *gpusvm,
> > > +			    struct page **pages,
> > > +			    dma_addr_t *dma_addr,
> > > +			    unsigned long npages);
> > > +
> > > +	/**
> > > +	 * @invalidate: Invalidate GPU SVM notifier (required)
> > > +	 * @gpusvm: Pointer to the GPU SVM
> > > +	 * @notifier: Pointer to the GPU SVM notifier
> > > +	 * @mmu_range: Pointer to the mmu_notifier_range
> > > structure
> > > +	 *
> > > +	 * This function shall invalidate the GPU page tables.
> > > It
> > > can safely
> > > +	 * walk the notifier range RB tree/list in this
> > > function.
> > > Called while
> > > +	 * holding the notifier lock.
> > > +	 */
> > > +	void (*invalidate)(struct drm_gpusvm *gpusvm,
> > > +			   struct drm_gpusvm_notifier *notifier,
> > > +			   const struct mmu_notifier_range
> > > *mmu_range);
> > > +};
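
Tying the callbacks together, a driver's ops table could look like the sketch
below; every driver_* function is hypothetical. Per the descriptions above,
only .invalidate is strictly required, the migration hooks are needed only
when VRAM migration is used, and the alloc/free hooks are optional.

static const struct drm_gpusvm_ops driver_gpusvm_ops = {
	.notifier_alloc		= driver_notifier_alloc,
	.notifier_free		= driver_notifier_free,
	.range_alloc		= driver_range_alloc,
	.range_free		= driver_range_free,
	.vram_release		= driver_vram_release,
	.populate_vram_pfn	= driver_populate_vram_pfn,
	.copy_to_vram		= driver_copy_to_vram,
	.copy_to_sram		= driver_copy_to_sram,
	.invalidate		= driver_invalidate,
};
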
> > > +
> > > +/**
> > > + * struct drm_gpusvm_notifier - Structure representing a GPU SVM
> > > notifier
> > > + *
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: MMU interval notifier
> > > + * @interval: Interval for the notifier
> > > + * @rb: Red-black tree node for the parent GPU SVM structure
> > > notifier tree
> > > + * @root: Cached root node of the RB tree containing ranges
> > > + * @range_list: List head containing of ranges in the same order
> > > they appear in
> > > + *              interval tree. This is useful to keep iterating
> > > ranges while
> > > + *              doing modifications to RB tree.
> > > + * @flags.removed: Flag indicating whether the MMU interval
> > > notifier
> > > has been
> > > + *                 removed
> > > + *
> > > + * This structure represents a GPU SVM notifier.
> > > + */
> > > +struct drm_gpusvm_notifier {
> > > +	struct drm_gpusvm *gpusvm;
> > > +	struct mmu_interval_notifier notifier;
> > > +	struct {
> > > +		u64 start;
> > > +		u64 end;
> > > +	} interval;
> > > +	struct {
> > > +		struct rb_node node;
> > > +		struct list_head entry;
> > > +		u64 __subtree_last;
> > > +	} rb;
> > > +	struct rb_root_cached root;
> > > +	struct list_head range_list;
> > > +	struct {
> > > +		u32 removed : 1;
> > > +	} flags;
> > > +};
> > > +
> > > +/**
> > > + * struct drm_gpusvm_range - Structure representing a GPU SVM
> > > range
> > > + *
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: Pointer to the GPU SVM notifier
> > > + * @refcount: Reference count for the range
> > > + * @rb: Red-black tree node for the parent GPU SVM notifier
> > > structure range tree
> > > + * @va: Virtual address range
> > > + * @notifier_seq: Notifier sequence number of the range's pages
> > > + * @pages: Pointer to the array of pages (if backing store is in
> > > VRAM)
> > > + * @dma_addr: DMA address array (if backing store is SRAM and
> > > DMA
> > > mapped)
> > > + * @vram_allocation: Driver-private pointer to the VRAM
> > > allocation
> > > + * @order: Order of dma mapping. i.e. PAGE_SIZE << order is
> > > mapping
> > > size
> > > + * @flags.migrate_vram: Flag indicating whether the range can be
> > > migrated to VRAM
> > > + * @flags.unmapped: Flag indicating if the range has been
> > > unmapped
> > > + * @flags.partial_unmap: Flag indicating if the range has been
> > > partially unmapped
> > > + * @flags.has_vram_pages: Flag indicating if the range has vram
> > > pages
> > > + * @flags.has_dma_mapping: Flag indicating if the range has a
> > > DMA
> > > mapping
> > > + * @flags.kfree_mapping: Flag indicating @dma_addr is a compact
> > > allocation based
> > > + *                       on @order which releases via kfree
> > > + *
> > > + * This structure represents a GPU SVM range used for tracking
> > > memory ranges
> > > + * mapped in a DRM device.
> > > + */
> > > +struct drm_gpusvm_range {
> > > +	struct drm_gpusvm *gpusvm;
> > > +	struct drm_gpusvm_notifier *notifier;
> > > +	struct kref refcount;
> > > +	struct {
> > > +		struct rb_node node;
> > > +		struct list_head entry;
> > > +		u64 __subtree_last;
> > > +	} rb;
> > > +	struct {
> > > +		u64 start;
> > > +		u64 end;
> > > +	} va;
> > > +	unsigned long notifier_seq;
> > > +	union {
> > > +		struct page **pages;
> > > +		dma_addr_t *dma_addr;
> > > +	};
> > > +	void *vram_allocation;
> > > +	u16 order;
> > > +	struct {
> > > +		/* All flags below must be set upon creation */
> > > +		u16 migrate_vram : 1;
> > > +		/* All flags below must be set / cleared under
> > > notifier lock */
> > > +		u16 unmapped : 1;
> > > +		u16 partial_unmap : 1;
> > > +		u16 has_vram_pages : 1;
> > > +		u16 has_dma_mapping : 1;
> > > +		u16 kfree_mapping : 1;
> > > +	} flags;
> > > +};
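
The optional range_alloc/range_free hooks above are intended to let a driver
embed this structure in its own range object, similar to the GPUVM embedding
pattern; a hedged sketch (struct driver_svm_range and its members are
hypothetical):

struct driver_svm_range {
	struct drm_gpusvm_range base;
	struct list_head garbage_collector_link;
};

static struct drm_gpusvm_range *driver_range_alloc(struct drm_gpusvm *gpusvm)
{
	struct driver_svm_range *range = kzalloc(sizeof(*range), GFP_KERNEL);

	if (!range)
		return NULL;

	INIT_LIST_HEAD(&range->garbage_collector_link);
	return &range->base;	/* core only sees the embedded base */
}

static void driver_range_free(struct drm_gpusvm_range *range)
{
	kfree(container_of(range, struct driver_svm_range, base));
}
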
> > > +
> > > +/**
> > > + * struct drm_gpusvm - GPU SVM structure
> > > + *
> > > + * @name: Name of the GPU SVM
> > > + * @drm: Pointer to the DRM device structure
> > > + * @mm: Pointer to the mm_struct for the address space
> > > + * @device_private_page_owner: Device private pages owner
> > > + * @mm_start: Start address of GPU SVM
> > > + * @mm_range: Range of the GPU SVM
> > > + * @notifier_size: Size of individual notifiers
> > > + * @ops: Pointer to the operations structure for GPU SVM
> > > + * @chunk_sizes: Pointer to the array of chunk sizes used in
> > > range
> > > allocation.
> > > + *               Entries should be powers of 2 in descending
> > > order.
> > > + * @num_chunks: Number of chunks
> > > + * @notifier_lock: Read-write semaphore for protecting notifier
> > > operations
> > > + * @zdd_wq: Workqueue for deferred work on zdd destruction
> > > + * @root: Cached root node of the Red-Black tree containing GPU
> > > SVM
> > > notifiers
> > > + * @notifier_list: list head containing of notifiers in the same
> > > order they
> > > + *                 appear in interval tree. This is useful to
> > > keep
> > > iterating
> > > + *                 notifiers while doing modifications to RB
> > > tree.
> > > + *
> > > + * This structure represents a GPU SVM (Shared Virtual Memory)
> > > used
> > > for tracking
> > > + * memory ranges mapped in a DRM (Direct Rendering Manager)
> > > device.
> > > + *
> > > + * No reference counting is provided, as this is expected to be
> > > embedded in the
> > > + * driver VM structure along with the struct drm_gpuvm, which
> > > handles reference
> > > + * counting.
> > > + */
> > > +struct drm_gpusvm {
> > > +	const char *name;
> > > +	struct drm_device *drm;
> > > +	struct mm_struct *mm;
> > > +	void *device_private_page_owner;
> > > +	u64 mm_start;
> > > +	u64 mm_range;
> > > +	u64 notifier_size;
> > > +	const struct drm_gpusvm_ops *ops;
> > > +	const u64 *chunk_sizes;
> > > +	int num_chunks;
> > > +	struct rw_semaphore notifier_lock;
> > > +	struct workqueue_struct *zdd_wq;
> > > +	struct rb_root_cached root;
> > > +	struct list_head notifier_list;
> > > +};
> > > +
> > > +/**
> > > + * struct drm_gpusvm_ctx - DRM GPU SVM context
> > > + *
> > > + * @mmap_locked: mmap lock is locked
> > > + * @trylock_mmap: trylock mmap lock, used to avoid locking inversions
> > > + *                (e.g. dma-resv -> mmap lock)
> > > + * @in_notifier: entering from a MMU notifier
> > > + * @read_only: operating on read-only memory
> > > + * @vram_possible: possible to use VRAM
> > > + * @prefault: prefault pages
> > > + *
> > > + * Context that DRM GPUSVM is operating in (i.e. user arguments).
> > > + */
> > > +struct drm_gpusvm_ctx {
> > > +	u32 mmap_locked :1;
> > > +	u32 trylock_mmap :1;
> > > +	u32 in_notifier :1;
> > > +	u32 read_only :1;
> > > +	u32 vram_possible :1;
> > > +	u32 prefault :1;
> > > +};
> > > +
> > > +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> > > +		    const char *name, struct drm_device *drm,
> > > +		    struct mm_struct *mm, void
> > > *device_private_page_owner,
> > > +		    u64 mm_start, u64 mm_range, u64
> > > notifier_size,
> > > +		    const struct drm_gpusvm_ops *ops,
> > > +		    const u64 *chunk_sizes, int num_chunks);
> > > +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm);
> > > +void drm_gpusvm_free(struct drm_gpusvm *gpusvm);
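
A hedged initialization sketch, using illustrative sizes (512M notifiers,
2M/64K/4K chunks in descending order, a 48-bit CPU VA range) and the
hypothetical driver_gpusvm_ops table sketched earlier; it assumes the call is
made from the process that owns the address space:

static const u64 driver_chunk_sizes[] = { SZ_2M, SZ_64K, SZ_4K };

static int driver_svm_init(struct driver_vm *vm, struct drm_device *drm)
{
	/* One GPU SVM instance per driver VM, embedded in the VM structure */
	return drm_gpusvm_init(&vm->svm, "driver-svm", drm, current->mm,
			       drm /* device_private_page_owner */,
			       0, 1ull << 48 /* illustrative CPU VA range */,
			       SZ_512M, &driver_gpusvm_ops,
			       driver_chunk_sizes,
			       ARRAY_SIZE(driver_chunk_sizes));
}
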
> > > +
> > > +struct drm_gpusvm_range *
> > > +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64
> > > fault_addr,
> > > +				u64 gpuva_start, u64 gpuva_end,
> > > +				const struct drm_gpusvm_ctx
> > > *ctx);
> > > +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> > > +			     struct drm_gpusvm_range *range);
> > > +
> > > +struct drm_gpusvm_range *
> > > +drm_gpusvm_range_get(struct drm_gpusvm_range *range);
> > > +void drm_gpusvm_range_put(struct drm_gpusvm_range *range);
> > > +
> > > +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> > > +				  struct drm_gpusvm_range
> > > *range);
> > > +
> > > +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       const struct drm_gpusvm_ctx
> > > *ctx);
> > > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > > +				  struct drm_gpusvm_range
> > > *range,
> > > +				  const struct drm_gpusvm_ctx
> > > *ctx);
> > > +
> > > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       void *vram_allocation,
> > > +			       const struct drm_gpusvm_ctx
> > > *ctx);
> > > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       const struct drm_gpusvm_ctx
> > > *ctx);
> > > +
> > > +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void);
> > > +
> > > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64
> > > start,
> > > u64 end);
> > > +
> > > +struct drm_gpusvm_range *
> > > +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64
> > > start, u64 end);
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_lock - Lock GPU SVM notifier
> > > + * @gpusvm__: Pointer to the GPU SVM structure.
> > > + *
> > > + * Abstract client usage GPU SVM notifier lock, take lock
> > > + */
> > > +#define drm_gpusvm_notifier_lock(gpusvm__)	\
> > > +	down_read(&(gpusvm__)->notifier_lock)
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_unlock - Unlock GPU SVM notifier
> > > + * @gpusvm__: Pointer to the GPU SVM structure.
> > > + *
> > > + * Abstract client usage GPU SVM notifier lock, drop lock
> > > + */
> > > +#define drm_gpusvm_notifier_unlock(gpusvm__)	\
> > > +	up_read(&(gpusvm__)->notifier_lock)
> > > +
> > > +/**
> > > + * __drm_gpusvm_range_next - Get the next GPU SVM range in the list
> > > + * @range: a pointer to the current GPU SVM range
> > > + *
> > > + * Return: A pointer to the next drm_gpusvm_range if available, or NULL if the
> > > + *         current range is the last one or if the input range is NULL.
> > > + */
> > > +static inline struct drm_gpusvm_range *
> > > +__drm_gpusvm_range_next(struct drm_gpusvm_range *range)
> > > +{
> > > +	if (range && !list_is_last(&range->rb.entry,
> > > +				   &range->notifier->range_list))
> > > +		return list_next_entry(range, rb.entry);
> > > +
> > > +	return NULL;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_for_each_range - Iterate over GPU SVM ranges in a notifier
> > > + * @range__: Iterator variable for the ranges. If set, it indicates the start of
> > > + *	     the iterator. If NULL, call drm_gpusvm_range_find() to get the range.
> > > + * @notifier__: Pointer to the GPU SVM notifier
> > > + * @start__: Start address of the range
> > > + * @end__: End address of the range
> > > + *
> > > + * This macro is used to iterate over GPU SVM ranges in a notifier. It is safe
> > > + * to use while holding the driver SVM lock or the notifier lock.
> > > + */
> > > +#define drm_gpusvm_for_each_range(range__, notifier__, start__, end__)	\
> > > +	for ((range__) = (range__) ?:					\
> > > +	     drm_gpusvm_range_find((notifier__), (start__), (end__));	\
> > > +	     (range__) && (range__->va.start < (end__));		\
> > > +	     (range__) = __drm_gpusvm_range_next(range__))
> > > +
> > > +/**
> > > + * drm_gpusvm_range_set_unmapped - Mark a GPU SVM range as unmapped
> > > + * @range: Pointer to the GPU SVM range structure.
> > > + * @mmu_range: Pointer to the MMU notifier range structure.
> > > + *
> > > + * This function marks a GPU SVM range as unmapped and sets the partial_unmap
> > > + * flag if the range partially falls within the provided MMU notifier range.
> > > + */
> > > +static inline void
> > > +drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range *range,
> > > +			      const struct mmu_notifier_range *mmu_range)
> > > +{
> > > +	lockdep_assert_held_write(&range->gpusvm->notifier_lock);
> > > +
> > > +	range->flags.unmapped = true;
> > > +	if (range->va.start < mmu_range->start ||
> > > +	    range->va.end > mmu_range->end)
> > > +		range->flags.partial_unmap = true;
> > > +}
> > > +
> > > +#endif /* __DRM_GPUSVM_H__ */
> >
Matthew Brost Aug. 29, 2024, 8:56 p.m. UTC | #15
On Thu, Aug 29, 2024 at 09:18:29PM +0200, Thomas Hellström wrote:
> Hi, Matthew,
> 
> On Thu, 2024-08-29 at 17:45 +0000, Matthew Brost wrote:
> > On Thu, Aug 29, 2024 at 11:16:49AM +0200, Thomas Hellström wrote:
> > > Hi, Matt. 
> > > 
> > > Some initial design comments / questions:
> > > 
> > > On Tue, 2024-08-27 at 19:48 -0700, Matthew Brost wrote:
> > > > This patch introduces support for GPU Shared Virtual Memory (SVM)
> > > > in
> > > > the
> > > > Direct Rendering Manager (DRM) subsystem. SVM allows for seamless
> > > > sharing of memory between the CPU and GPU, enhancing performance
> > > > and
> > > > flexibility in GPU computing tasks.
> > > > 
> > > > The patch adds the necessary infrastructure for SVM, including
> > > > data
> > > > structures and functions for managing SVM ranges and notifiers.
> > > > It
> > > > also
> > > > provides mechanisms for allocating, deallocating, and migrating
> > > > memory
> > > > regions between system RAM and GPU VRAM.
> > > > 
> > > > This mid-layer is largely inspired by GPUVM.
> > > > 
> > > > Cc: Dave Airlie <airlied@redhat.com>
> > > > Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > > > Cc: Christian König <christian.koenig@amd.com>
> > > > Cc: <dri-devel@lists.freedesktop.org>
> > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > > ---
> > > >  drivers/gpu/drm/xe/Makefile     |    3 +-
> > > >  drivers/gpu/drm/xe/drm_gpusvm.c | 2174
> > > > +++++++++++++++++++++++++++++++
> > > >  drivers/gpu/drm/xe/drm_gpusvm.h |  415 ++++++
> > > >  3 files changed, 2591 insertions(+), 1 deletion(-)
> > > >  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.c
> > > >  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.h
> > > > 
> > > > diff --git a/drivers/gpu/drm/xe/Makefile
> > > > b/drivers/gpu/drm/xe/Makefile
> > > > index b9670ae09a9e..b8fc2ee58f1a 100644
> > > > --- a/drivers/gpu/drm/xe/Makefile
> > > > +++ b/drivers/gpu/drm/xe/Makefile
> > > > @@ -25,7 +25,8 @@ $(obj)/generated/%_wa_oob.c
> > > > $(obj)/generated/%_wa_oob.h: $(obj)/xe_gen_wa_oob \
> > > >  
> > > >  # core driver code
> > > >  
> > > > -xe-y += xe_bb.o \
> > > > +xe-y += drm_gpusvm.o \
> > > > +	xe_bb.o \
> > > >  	xe_bo.o \
> > > >  	xe_bo_evict.o \
> > > >  	xe_devcoredump.o \
> > > > diff --git a/drivers/gpu/drm/xe/drm_gpusvm.c
> > > > b/drivers/gpu/drm/xe/drm_gpusvm.c
> > > > new file mode 100644
> > > > index 000000000000..fc1e44e6ae72
> > > > --- /dev/null
> > > > +++ b/drivers/gpu/drm/xe/drm_gpusvm.c
> > > > @@ -0,0 +1,2174 @@
> > > > +// SPDX-License-Identifier: MIT
> > > > +/*
> > > > + * Copyright © 2024 Intel Corporation
> > > > + *
> > > > + * Authors:
> > > > + *     Matthew Brost <matthew.brost@intel.com>
> > > > + */
> > > > +
> > > > +#include <linux/dma-mapping.h>
> > > > +#include <linux/interval_tree_generic.h>
> > > > +#include <linux/hmm.h>
> > > > +#include <linux/memremap.h>
> > > > +#include <linux/migrate.h>
> > > > +#include <linux/mm_types.h>
> > > > +#include <linux/pagemap.h>
> > > > +#include <linux/slab.h>
> > > > +
> > > > +#include <drm/drm_device.h>
> > > > +#include "drm_gpusvm.h"
> > > > +
> > > > +/**
> > > > + * DOC: Overview
> > > > + *
> > > > + * GPU Shared Virtual Memory (GPU SVM) layer for the Direct
> > > > Rendering Manager (DRM)
> > > > + *
> > > > + * The GPU SVM layer is a component of the DRM framework
> > > > designed to
> > > > manage shared
> > > > + * virtual memory between the CPU and GPU. It enables efficient
> > > > data
> > > > exchange and
> > > > + * processing for GPU-accelerated applications by allowing
> > > > memory
> > > > sharing and
> > > > + * synchronization between the CPU's and GPU's virtual address
> > > > spaces.
> > > > + *
> > > > + * Key GPU SVM Components:
> > > > + * - Notifiers: Notifiers: Used for tracking memory intervals
> > > > and
> > > > notifying the
> > > > + *		GPU of changes, notifiers are sized based on a
> > > > GPU
> > > > SVM
> > > > + *		initialization parameter, with a recommendation
> > > > of
> > > > 512M or
> > > > + *		larger. They maintain a Red-BlacK tree and a
> > > > list of
> > > > ranges that
> > > > + *		fall within the notifier interval. Notifiers are
> > > > tracked within
> > > > + *		a GPU SVM Red-BlacK tree and list and are
> > > > dynamically inserted
> > > > + *		or removed as ranges within the interval are
> > > > created
> > > > or
> > > > + *		destroyed.
> > > 
> > > What is the benefit of this extra layer compared to direct
> > > insertion of
> > > ranges using mmu_interval_notifier_insert?
> > > 
> > > IIRC the argument made previously about having wide notifiers was
> > > that
> > > the rb tree lookups inside the core were costly and if there were
> > > only
> > > a few, then the rb tree lookups within a notifier range could be
> > > replaced with the page-table radix-tree-like lookup, so each lookup
> > > complexity would be O(log(n_notifiers) + page_table_depth).
> > > 
> > > But now we have first an rb-tree lookup in the core and then an rb-
> > > tree
> > > lookup within each notifier yielding O(log(n_ranges))
> > > 
> > > I can see a small benefit in that inserting directly into the core
> > > rb-
> > > tree will block pending ongoing invalidations, but at a cost of an
> > > extra multiplexing layer.
> > > 
> > 
> > So when the notifier is triggered the search is a smaller range. In a
> > perfect world eventually I'd like to drop the SVM range completely.
> > There is a lot of changes required in Xe to make that possible and
> > not
> > entirely convinced it is possible and the ROI is worth it (additional
> > complexity vs. perf benefit). For now, this was a relatively simple
> > way
> > to get SVM working (mirrors boths AMD's and Nvidia's implement wrt to
> > having a range concept) but also is flexible in the sense the
> > notifier
> > size can be easily tweaked via a modparam [1] following Jason's
> > suggestion of larger notifiers.
> > 
> > [1]
> > https://patchwork.freedesktop.org/patch/611007/?series=137870&rev=1
> 
> What I meant was the core is already implementing the "one notifier for
> the whole range", since your notifier duplicates the
> mmu_interval_notifier functionality.
> 
> The mmu_interval_notifier first does an rbtree search to get to the
> notifier, and then drm_gpusvm does an rbtree search to get to the
> range.

Yes.

> 
> If the svm notifier layer is skipped, mmu_interval_notifier has to
> perform a wider rbtree search to get to the range. The point is, the
> complexity is the same for both approaches so there is no point in
> adding a svm notifier layer for that reason. The width of the notifier
> just adjust the relative size of the two rbtree searches, so from that
> point of view the drm_gpusvm does not offer any benefit from inserting
> the ranges into the mmu_interval_notifier directly (except that the
> mmu_interval_notifier is slightly more heavyweight).
> 

I think a large part of it was to avoid inserting / removing many
notifiers, as that was expensive. Agreed, the search is not fundamentally
faster the way I have this coded; it just avoids heavy insertion /
removal of notifiers.

> As I understand it, Jasons comments were based on the assumption that
> the drm_gpusvm search would be radix tree based, and hence with less
> complexity than the rbtree search, and therefore providing a clear
> benefit the larger they could be.
> 
> I.e. just calling something similar to xe_vm_invalidate_xxx over the
> whole range, which will just skip subranges that are not populated.
> 

As stated, I think eventually removing the SVM range is a good longterm
goal.

I almost coded that in this initial series but ran into a number of
issues which make it complex. To get something working in the simplest
way possible, enabling further test development, constructive upstream
discussions (which appear to be happening), UMD / application
development, and other upper-layer KMD development, I stuck with this
approach.

I think for any solution which requires a SVM range (fwiw both AMD and
Nvidia have a similar concept), attaching the ranges to a larger
notifier makes sense and is better than 1 notifier per range.

Issues with removing an SVM range:

- Xe bind code stores invalidation / present state in the VMA; this would
  need to be moved to the radix tree. I have a Jira open for that work,
  which I believe other developers are going to own.
- Where would the dma mapping / device pages be stored?
	- In the radix tree? What if ATS is enabled? We don't have a
	  driver-owned radix tree. How do we reasonably connect a
	  driver-owned radix tree to a common GPUSVM layer?
	- In the notifier? What if the notifier is sparsely populated?
	  We would be wasting huge amounts of memory. What if the
	  notifier is configured to span the entire virtual address
	  space?
- How does the garbage collector work? We can't allocate memory in the
  notifier, so we don't have anything to add to the garbage collector. We
  also can't directly modify page tables there, given the required locks
  can't be taken in the path of reclaim.
- How do we deal with fault storms (e.g. tons of faults hitting the same
  SVM range in a row)? Without an SVM range there is no easy way to know
  if the mapping is valid so that the GPU page fault handler can be
  short-circuited.
- Do we have a notifier seqno for every PTE?

I feel like I'm missing a few, and likely more issues would arise when
implementing this too.

To be clear, I'm not saying we shouldn't try to do this; all of the above
issues are likely workable, but doing all of this upfront is akin to
running before we can walk. I'd rather solve the fundamental locking
issues first and have robust testing in place and passing, plus UMDs /
apps running, before trying to rework this. Performance numbers for this
would also be helpful.

Matt

> /Thomas
> 
> > 
> > > > + * - Ranges: Represent memory ranges mapped in a DRM device and
> > > > managed
> > > > + *	     by GPU SVM. They are sized based on an array of
> > > > chunk
> > > > sizes, which
> > > > + *	     is a GPU SVM initialization parameter, and the CPU
> > > > address space.
> > > > + *	     Upon GPU fault, the largest aligned chunk that fits
> > > > within the
> > > > + *	     faulting CPU address space is chosen for the range
> > > > size. Ranges are
> > > > + *	     expected to be dynamically allocated on GPU fault
> > > > and
> > > > removed on an
> > > > + *	     MMU notifier UNMAP event. As mentioned above,
> > > > ranges
> > > > are tracked in
> > > > + *	     a notifier's Red-Black tree.
> > > 
> > > How do ranges and chunks map to
> > >  
> > > a) Prefaulting granularity
> > > b) Migration granularity?
> > > 
> > > > + * - Operations: Define the interface for driver-specific SVM
> > > > operations such as
> > > > + *		 allocation, page collection, migration,
> > > > invalidations, and VRAM
> > > > + *		 release.
> > > > + *
> > > > + * This layer provides interfaces for allocating, mapping,
> > > > migrating, and
> > > > + * releasing memory ranges between the CPU and GPU. It handles
> > > > all
> > > > core memory
> > > > + * management interactions (DMA mapping, HMM, and migration) and
> > > > provides
> > > > + * driver-specific virtual functions (vfuncs). This
> > > > infrastructure
> > > > is sufficient
> > > > + * to build the expected driver components for an SVM
> > > > implementation
> > > > as detailed
> > > > + * below.
> > > > + *
> > > > + * Expected Driver Components:
> > > > + * - GPU page fault handler: Used to create ranges and notifiers
> > > > based on the
> > > > + *			     fault address, optionally migrate
> > > > the
> > > > range to
> > > > + *			     VRAM, and create GPU bindings.
> > > > + * - Garbage collector: Used to destroy GPU bindings for ranges.
> > > > Ranges are
> > > > + *			expected to be added to the garbage
> > > > collector upon
> > > > + *			MMU_NOTIFY_UNMAP event.
> > > > + */
> > > > +
> > > > +/**
> > > > + * DOC: Locking
> > > > + *
> > > > + * GPU SVM handles locking for core MM interactions, i.e., it
> > > > locks/unlocks the
> > > > + * mmap lock as needed. Alternatively, if the driver prefers to
> > > > handle the mmap
> > > > + * lock itself, a 'locked' argument is provided to the functions
> > > > that require
> > > > + * the mmap lock. This option may be useful for drivers that
> > > > need to
> > > > call into
> > > > + * GPU SVM while also holding a dma-resv lock, thus preventing
> > > > locking
> > > > + * inversions between the mmap and dma-resv locks.
> > > > + *
> > > > + * GPU SVM introduces a global notifier lock, which safeguards
> > > > the
> > > > notifier's
> > > > + * range RB tree and list, as well as the range's DMA mappings
> > > > and
> > > > sequence
> > > > + * number. GPU SVM manages all necessary locking and unlocking
> > > > operations,
> > > > + * except for the recheck of the range's sequence number
> > > > + * (mmu_interval_read_retry) when the driver is committing GPU
> > > > bindings. This
> > > > + * lock corresponds to the 'driver->update' lock mentioned in
> > > > the
> > > > HMM
> > > > + * documentation (TODO: Link). Future revisions may transition
> > > > from
> > > > a GPU SVM
> > > > + * global lock to a per-notifier lock if finer-grained locking
> > > > is
> > > > deemed
> > > > + * necessary.
> > > > + *
> > > > + * In addition to the locking mentioned above, the driver should
> > > > implement a
> > > > + * lock to safeguard core GPU SVM function calls that modify
> > > > state,
> > > > such as
> > > > + * drm_gpusvm_range_find_or_insert and drm_gpusvm_range_remove.
> > > > Alternatively,
> > > > + * these core functions can be called within a single kernel
> > > > thread,
> > > > for
> > > > + * instance, using an ordered work queue. This lock is denoted
> > > > as
> > > > + * 'driver_svm_lock' in code examples.
> > > > + */
> > > > +
> > > > +/**
> > > > + * DOC: Migration
> > > > + *
> > > > + * The migration support is quite simple, allowing migration
> > > > between
> > > > SRAM and
> > > > + * VRAM at the range granularity. For example, GPU SVM currently
> > > > does not
> > > > + * support mixing SRAM and VRAM pages within a range. This means
> > > > that upon GPU
> > > > + * fault, the entire range can be migrated to VRAM, and upon CPU
> > > > fault, the
> > > > + * entire range is migrated to SRAM.
> > > > + *
> > > > + * The reasoning for only supporting range granularity is as
> > > > follows: it
> > > > + * simplifies the implementation, and range sizes are driver-
> > > > defined
> > > > and should
> > > > + * be relatively small.
> > > > + */
> > > > +
> > > > +/**
> > > > + * DOC: Partial Unmapping of Ranges
> > > > + *
> > > > + * Partial unmapping of ranges (e.g., 1M out of 2M is unmapped
> > > > by
> > > > CPU resulting
> > > > + * in MMU_NOTIFY_UNMAP event) presents several challenges, with
> > > > the
> > > > main one
> > > > + * being that a subset of the range still has CPU and GPU
> > > > mappings.
> > > > If the
> > > > + * backing store for the range is in VRAM, a subset of the
> > > > backing
> > > > store has
> > > > + * references. One option would be to split the range and VRAM
> > > > backing store,
> > > > + * but the implementation for this would be quite complicated.
> > > > Given
> > > > that
> > > > + * partial unmappings are rare and driver-defined range sizes
> > > > are
> > > > relatively
> > > > + * small, GPU SVM does not support splitting of ranges.
> > > > + *
> > > > + * With no support for range splitting, upon partial unmapping
> > > > of a
> > > > range, the
> > > > + * driver is expected to invalidate and destroy the entire
> > > > range. If
> > > > the range
> > > > + * has VRAM as its backing, the driver is also expected to
> > > > migrate
> > > > any remaining
> > > > + * pages back to SRAM.
> > > 
> > > So what happens if we get a one-page invalidation, say protection
> > > change event, or NUMA accounting event, in the middle of a range?
> > > Can
> > > we unmap just that single gpu pte covering that range, that is, how
> > > do
> > > the ranges map to invalidation granularity? Does this differ
> > > between
> > > igfx an dgfx?
> > 
> > Well the idea of chunks is ranges should be 1 GPU page (the chunk
> > array
> > in Xe is 4k, 64k, and 2M). The design is flexible enough that doesn't
> > have to true but optimized for the thinking each range is most likely
> > 1
> > GPU page. If this isn't true, then all GPU pages in the range are
> > invalidated which isn't ideal but keeps it simple which IMO far out
> > weighs the potential benefits. In theory a driver could implement
> > spliting / partial invalidaions too with a couple of updates to
> > GPUSVM
> > but would likely largely be a driver implementation rather than
> > GPUSVM.
> > 
> > No difference between igfx an dgfx.
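
(As a reading aid for the chunk discussion above, the "largest aligned chunk
that fits" selection could be sketched roughly as below; the actual Xe logic
also clamps against the GPU VA reservation and notifier bounds, so treat this
purely as an illustration.)

static u64 sketch_pick_chunk_size(const u64 *chunk_sizes, int num_chunks,
				  u64 fault_addr, u64 vma_start, u64 vma_end)
{
	int i;

	/* chunk_sizes is expected to be sorted in descending order */
	for (i = 0; i < num_chunks; ++i) {
		u64 start = ALIGN_DOWN(fault_addr, chunk_sizes[i]);

		if (start >= vma_start && start + chunk_sizes[i] <= vma_end)
			return chunk_sizes[i];
	}

	return PAGE_SIZE;
}
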
> > 
> > You bring up a good point about protection changes, I likely haven't
> > fully gotten that part of implementation correct either. I can add
> > this
> > to my TODO list and also update my IGTs to do things like this.
> > 
> > Matt
> > 
> > > 
> > > Thanks,
> > > Thomas
> > > 
> > > 
> > > 
> > > 
> > > > + */
> > > > +
> > > > +/**
> > > > + * DOC: Examples
> > > > + *
> > > > + * This section provides two examples of how to build the
> > > > expected
> > > > driver
> > > > + * components: the GPU page fault handler and the garbage
> > > > collector.
> > > > A third
> > > > + * example demonstrates a sample invalidation driver vfunc.
> > > > + *
> > > > + * The generic code provided does not include logic for complex
> > > > migration
> > > > + * policies, optimized invalidations, or other potentially
> > > > required
> > > > driver
> > > > + * locking (e.g., DMA-resv locks).
> > > > + *
> > > > + * 1) GPU page fault handler
> > > > + *
> > > > + *	int driver_bind_range(struct drm_gpusvm *gpusvm, struct
> > > > drm_gpusvm_range *range)
> > > > + *	{
> > > > + *		int err = 0;
> > > > + *
> > > > + *		driver_alloc_and_setup_memory_for_bind(gpusvm,
> > > > range);
> > > > + *
> > > > + *		drm_gpusvm_notifier_lock(gpusvm);
> > > > + *		if (drm_gpusvm_range_pages_valid(range))
> > > > + *			driver_commit_bind(gpusvm, range);
> > > > + *		else
> > > > + *			err = -EAGAIN;
> > > > + *		drm_gpusvm_notifier_unlock(gpusvm);
> > > > + *
> > > > + *		return err;
> > > > + *	}
> > > > + *
> > > > + *	int driver_gpu_fault(struct drm_gpusvm *gpusvm, u64
> > > > fault_addr,
> > > > + *			     u64 gpuva_start, u64 gpuva_end)
> > > > + *	{
> > > > + *		struct drm_gpusvm_ctx ctx = {};
> > > > + *		int err;
> > > > + *
> > > > + *		driver_svm_lock();
> > > > + *	retry:
> > > > + *		// Always process UNMAPs first so view of GPU
> > > > SVM
> > > > ranges is current
> > > > + *		driver_garbage_collector(gpusvm);
> > > > + *
> > > > + *		range = drm_gpusvm_range_find_or_insert(gpusvm,
> > > > fault_addr,
> > > > +
> > > > *							gpuva_start,
> > > > gpuva_end,
> > > > + *						        &ctx);
> > > > + *		if (IS_ERR(range)) {
> > > > + *			err = PTR_ERR(range);
> > > > + *			goto unlock;
> > > > + *		}
> > > > + *
> > > > + *		if (driver_migration_policy(range)) {
> > > > + *			bo = driver_alloc_bo();
> > > > + *			err = drm_gpusvm_migrate_to_vram(gpusvm,
> > > > range, bo, &ctx);
> > > > + *			if (err)	// CPU mappings may have
> > > > changed
> > > > + *				goto retry;
> > > > + *		}
> > > > + *
> > > > + *		err = drm_gpusvm_range_get_pages(gpusvm, range,
> > > > &ctx);
> > > > + *		if (err == -EFAULT || err == -EPERM)	// CPU
> > > > mappings changed
> > > > + *			goto retry;
> > > > + *		else if (err)
> > > > + *			goto unlock;
> > > > + *
> > > > + *		err = driver_bind_range(gpusvm, range);
> > > > + *		if (err == -EAGAIN)	// CPU mappings changed
> > > > + *			goto retry
> > > > + *
> > > > + *	unlock:
> > > > + *		driver_svm_unlock();
> > > > + *		return err;
> > > > + *	}
> > > > + *
> > > > + * 2) Garbage Collector.
> > > > + *
> > > > + *	void __driver_garbage_collector(struct drm_gpusvm
> > > > *gpusvm,
> > > > + *					struct drm_gpusvm_range
> > > > *range)
> > > > + *	{
> > > > + *		struct drm_gpusvm_ctx ctx = {};
> > > > + *
> > > > + *		assert_driver_svm_locked(gpusvm);
> > > > + *
> > > > + *		// Partial unmap, migrate any remaining VRAM
> > > > pages
> > > > back to SRAM
> > > > + *		if (range->flags.partial_unmap)
> > > > + *			drm_gpusvm_migrate_to_sram(gpusvm,
> > > > range,
> > > > &ctx);
> > > > + *
> > > > + *		driver_unbind_range(range);
> > > > + *		drm_gpusvm_range_remove(gpusvm, range);
> > > > + *	}
> > > > + *
> > > > + *	void driver_garbage_collector(struct drm_gpusvm *gpusvm)
> > > > + *	{
> > > > + *		assert_driver_svm_locked(gpusvm);
> > > > + *
> > > > + *		for_each_range_in_garbage_collector(gpusvm,
> > > > range)
> > > > + *			__driver_garbage_collector(gpusvm,
> > > > range);
> > > > + *	}
> > > > + *
> > > > + * 3) Invalidation driver vfunc.
> > > > + *
> > > > + *	void driver_invalidation(struct drm_gpusvm *gpusvm,
> > > > + *				 struct drm_gpusvm_notifier
> > > > *notifier,
> > > > + *				 const struct mmu_notifier_range
> > > > *mmu_range)
> > > > + *	{
> > > > + *		struct drm_gpusvm_ctx ctx = { .in_notifier =
> > > > true,
> > > > };
> > > > + *		struct drm_gpusvm_range *range = NULL;
> > > > + *
> > > > + *		driver_invalidate_device_tlb(gpusvm, mmu_range-
> > > > > start, mmu_range->end);
> > > > + *
> > > > + *		drm_gpusvm_for_each_range(range, notifier,
> > > > mmu_range->start,
> > > > + *					  mmu_range->end) {
> > > > + *			drm_gpusvm_range_unmap_pages(gpusvm,
> > > > range,
> > > > &ctx);
> > > > + *
> > > > + *			if (mmu_range->event !=
> > > > MMU_NOTIFY_UNMAP)
> > > > + *				continue;
> > > > + *
> > > > + *			drm_gpusvm_range_set_unmapped(range,
> > > > mmu_range);
> > > > + *			driver_garbage_collector_add(gpusvm,
> > > > range);
> > > > + *		}
> > > > + *	}
> > > > + */
> > > > +
> > > > +#define DRM_GPUSVM_RANGE_START(_range)	((_range)->va.start)
> > > > +#define DRM_GPUSVM_RANGE_END(_range)	((_range)->va.end - 1)
> > > > +INTERVAL_TREE_DEFINE(struct drm_gpusvm_range, rb.node, u64,
> > > > rb.__subtree_last,
> > > > +		     DRM_GPUSVM_RANGE_START,
> > > > DRM_GPUSVM_RANGE_END,
> > > > +		     static __maybe_unused, range);
> > > > +
> > > > +#define DRM_GPUSVM_NOTIFIER_START(_notifier)	((_notifier)-
> > > > > interval.start)
> > > > +#define DRM_GPUSVM_NOTIFIER_END(_notifier)	((_notifier)-
> > > > > interval.end - 1)
> > > > +INTERVAL_TREE_DEFINE(struct drm_gpusvm_notifier, rb.node, u64,
> > > > +		     rb.__subtree_last,
> > > > DRM_GPUSVM_NOTIFIER_START,
> > > > +		     DRM_GPUSVM_NOTIFIER_END, static
> > > > __maybe_unused,
> > > > notifier);
> > > > +
> > > > +/**
> > > > + * npages_in_range() - Calculate the number of pages in a given
> > > > range
> > > > + * @start__: The start address of the range
> > > > + * @end__: The end address of the range
> > > > + *
> > > > + * This macro calculates the number of pages in a given memory
> > > > range,
> > > > + * specified by the start and end addresses. It divides the
> > > > difference
> > > > + * between the end and start addresses by the page size
> > > > (PAGE_SIZE)
> > > > to
> > > > + * determine the number of pages in the range.
> > > > + *
> > > > + * Return: The number of pages in the specified range.
> > > > + */
> > > > +#define npages_in_range(start__, end__)	\
> > > > +	(((end__) - (start__)) >> PAGE_SHIFT)
> > > > +
> > > > +/**
> > > > + * struct drm_gpusvm_zdd - GPU SVM zone device data
> > > > + *
> > > > + * @refcount: Reference count for the zdd
> > > > + * @destroy_work: Work structure for asynchronous zdd
> > > > destruction
> > > > + * @range: Pointer to the GPU SVM range
> > > > + * @vram_allocation: Driver-private pointer to the VRAM
> > > > allocation
> > > > + *
> > > > + * This structure serves as a generic wrapper installed in
> > > > + * page->zone_device_data. It provides infrastructure for
> > > > looking up
> > > > a range
> > > > + * upon CPU page fault and asynchronously releasing VRAM once
> > > > the
> > > > CPU has no
> > > > + * page references. Asynchronous release is useful because CPU
> > > > page
> > > > references
> > > > + * can be dropped in IRQ contexts, while releasing VRAM likely
> > > > requires sleeping
> > > > + * locks.
> > > > + */
> > > > +struct drm_gpusvm_zdd {
> > > > +	struct kref refcount;
> > > > +	struct work_struct destroy_work;
> > > > +	struct drm_gpusvm_range *range;
> > > > +	void *vram_allocation;
> > > > +};
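
For anyone else reading along: the zdd is what lets the CPU fault path get back
from a device-private page to the SVM machinery. A minimal sketch of a driver's
dev_pagemap_ops.migrate_to_ram() hook under that assumption, not part of the
patch (driver_evict_to_sram() is a made-up placeholder; only page->zone_device_data
and the zdd layout come from this file):

	static vm_fault_t driver_migrate_to_ram(struct vm_fault *vmf)
	{
		/* Installed by drm_gpusvm_get_vram_page() further down */
		struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
		struct drm_gpusvm_range *range = zdd->range;

		/* Hypothetical driver helper copying VRAM back to SRAM */
		return driver_evict_to_sram(range->gpusvm, range,
					    zdd->vram_allocation, vmf);
	}
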
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_zdd_destroy_work_func - Work function for
> > > > destroying a
> > > > zdd
> > > > + * @w: Pointer to the work_struct
> > > > + *
> > > > + * This function releases VRAM, puts GPU SVM range, and frees
> > > > zdd.
> > > > + */
> > > > +static void drm_gpusvm_zdd_destroy_work_func(struct work_struct
> > > > *w)
> > > > +{
> > > > +	struct drm_gpusvm_zdd *zdd =
> > > > +		container_of(w, struct drm_gpusvm_zdd,
> > > > destroy_work);
> > > > +	struct drm_gpusvm_range *range = zdd->range;
> > > > +	struct drm_gpusvm *gpusvm = range->gpusvm;
> > > > +
> > > > +	if (gpusvm->ops->vram_release && zdd->vram_allocation)
> > > > +		gpusvm->ops->vram_release(zdd->vram_allocation);
> > > > +	drm_gpusvm_range_put(range);
> > > > +	kfree(zdd);
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_zdd_alloc - Allocate a zdd structure.
> > > > + * @range: Pointer to the GPU SVM range.
> > > > + *
> > > > + * This function allocates and initializes a new zdd structure.
> > > > It
> > > > sets up the
> > > > + * reference count, initializes the destroy work, and links the
> > > > provided GPU SVM
> > > > + * range.
> > > > + *
> > > > + * Returns:
> > > > + * Pointer to the allocated zdd on success, NULL on failure.
> > > > + */
> > > > +static struct drm_gpusvm_zdd *
> > > > +drm_gpusvm_zdd_alloc(struct drm_gpusvm_range *range)
> > > > +{
> > > > +	struct drm_gpusvm_zdd *zdd;
> > > > +
> > > > +	zdd = kmalloc(sizeof(*zdd), GFP_KERNEL);
> > > > +	if (!zdd)
> > > > +		return NULL;
> > > > +
> > > > +	kref_init(&zdd->refcount);
> > > > +	INIT_WORK(&zdd->destroy_work,
> > > > drm_gpusvm_zdd_destroy_work_func);
> > > > +	zdd->range = drm_gpusvm_range_get(range);
> > > > +	zdd->vram_allocation = NULL;
> > > > +
> > > > +	return zdd;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_zdd_get - Get a reference to a zdd structure.
> > > > + * @zdd: Pointer to the zdd structure.
> > > > + *
> > > > + * This function increments the reference count of the provided
> > > > zdd
> > > > structure.
> > > > + *
> > > > + * Returns: Pointer to the zdd structure.
> > > > + */
> > > > +static struct drm_gpusvm_zdd *drm_gpusvm_zdd_get(struct
> > > > drm_gpusvm_zdd *zdd)
> > > > +{
> > > > +	kref_get(&zdd->refcount);
> > > > +	return zdd;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_zdd_destroy - Destroy a zdd structure.
> > > > + * @ref: Pointer to the reference count structure.
> > > > + *
> > > > + * This function queues the destroy_work of the zdd for
> > > > asynchronous
> > > > destruction.
> > > > + */
> > > > +static void drm_gpusvm_zdd_destroy(struct kref *ref)
> > > > +{
> > > > +	struct drm_gpusvm_zdd *zdd =
> > > > +		container_of(ref, struct drm_gpusvm_zdd,
> > > > refcount);
> > > > +	struct drm_gpusvm *gpusvm = zdd->range->gpusvm;
> > > > +
> > > > +	queue_work(gpusvm->zdd_wq, &zdd->destroy_work);
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_zdd_put - Put a zdd reference.
> > > > + * @zdd: Pointer to the zdd structure.
> > > > + *
> > > > + * This function decrements the reference count of the provided
> > > > zdd
> > > > structure
> > > > + * and schedules its destruction if the count drops to zero.
> > > > + */
> > > > +static void drm_gpusvm_zdd_put(struct drm_gpusvm_zdd *zdd)
> > > > +{
> > > > +	kref_put(&zdd->refcount, drm_gpusvm_zdd_destroy);
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_range_find - Find GPU SVM range from GPU SVM
> > > > notifier
> > > > + * @notifier: Pointer to the GPU SVM notifier structure.
> > > > + * @start: Start address of the range
> > > > + * @end: End address of the range
> > > > + *
> > > > + * Return: A pointer to the drm_gpusvm_range if found or NULL
> > > > + */
> > > > +struct drm_gpusvm_range *
> > > > +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64 start, u64 end)
> > > > +{
> > > > +	return range_iter_first(&notifier->root, start, end - 1);
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_for_each_range_safe - Safely iterate over GPU SVM ranges in a notifier
> > > > + * @range__: Iterator variable for the ranges
> > > > + * @next__: Iterator variable for the ranges temporary storage
> > > > + * @notifier__: Pointer to the GPU SVM notifier
> > > > + * @start__: Start address of the range
> > > > + * @end__: End address of the range
> > > > + *
> > > > + * This macro is used to iterate over GPU SVM ranges in a notifier while
> > > > + * removing ranges from it.
> > > > + */
> > > > +#define drm_gpusvm_for_each_range_safe(range__, next__, notifier__, start__, end__)	\
> > > > +	for ((range__) = drm_gpusvm_range_find((notifier__), (start__), (end__)),	\
> > > > +	     (next__) = __drm_gpusvm_range_next(range__);				\
> > > > +	     (range__) && (range__->va.start < (end__));				\
> > > > +	     (range__) = (next__), (next__) = __drm_gpusvm_range_next(range__))
> > > > +
> > > > +/**
> > > > + * __drm_gpusvm_notifier_next - get the next drm_gpusvm_notifier
> > > > in
> > > > the list
> > > > + * @notifier: a pointer to the current drm_gpusvm_notifier
> > > > + *
> > > > + * Return: A pointer to the next drm_gpusvm_notifier if
> > > > available,
> > > > or NULL if
> > > > + *         the current notifier is the last one or if the input
> > > > notifier is
> > > > + *         NULL.
> > > > + */
> > > > +static struct drm_gpusvm_notifier *
> > > > +__drm_gpusvm_notifier_next(struct drm_gpusvm_notifier *notifier)
> > > > +{
> > > > +	if (notifier && !list_is_last(&notifier->rb.entry,
> > > > +				      &notifier->gpusvm->notifier_list))
> > > > +		return list_next_entry(notifier, rb.entry);
> > > > +
> > > > +	return NULL;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_for_each_notifier - Iterate over GPU SVM notifiers in a gpusvm
> > > > + * @notifier__: Iterator variable for the notifiers
> > > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > > + * @start__: Start address of the notifier
> > > > + * @end__: End address of the notifier
> > > > + *
> > > > + * This macro is used to iterate over GPU SVM notifiers in a gpusvm.
> > > > + */
> > > > +#define drm_gpusvm_for_each_notifier(notifier__, gpusvm__, start__, end__)		\
> > > > +	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root, (start__), (end__) - 1);	\
> > > > +	     (notifier__) && (notifier__->interval.start < (end__));			\
> > > > +	     (notifier__) = __drm_gpusvm_notifier_next(notifier__))
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_for_each_notifier_safe - Safely iterate over GPU SVM notifiers in a gpusvm
> > > > + * @notifier__: Iterator variable for the notifiers
> > > > + * @next__: Iterator variable for the notifiers temporary storage
> > > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > > + * @start__: Start address of the notifier
> > > > + * @end__: End address of the notifier
> > > > + *
> > > > + * This macro is used to iterate over GPU SVM notifiers in a gpusvm while
> > > > + * removing notifiers from it.
> > > > + */
> > > > +#define drm_gpusvm_for_each_notifier_safe(notifier__, next__, gpusvm__, start__, end__)	\
> > > > +	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root, (start__), (end__) - 1),	\
> > > > +	     (next__) = __drm_gpusvm_notifier_next(notifier__);				\
> > > > +	     (notifier__) && (notifier__->interval.start < (end__));			\
> > > > +	     (notifier__) = (next__), (next__) = __drm_gpusvm_notifier_next(notifier__))
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_notifier_invalidate - Invalidate a GPU SVM
> > > > notifier.
> > > > + * @mni: Pointer to the mmu_interval_notifier structure.
> > > > + * @mmu_range: Pointer to the mmu_notifier_range structure.
> > > > + * @cur_seq: Current sequence number.
> > > > + *
> > > > + * This function serves as a generic MMU notifier for GPU SVM.
> > > > It
> > > > sets the MMU
> > > > + * notifier sequence number and calls the driver invalidate
> > > > vfunc
> > > > under
> > > > + * gpusvm->notifier_lock.
> > > > + *
> > > > + * Returns:
> > > > + * true if the operation succeeds, false otherwise.
> > > > + */
> > > > +static bool
> > > > +drm_gpusvm_notifier_invalidate(struct mmu_interval_notifier *mni,
> > > > +			       const struct mmu_notifier_range *mmu_range,
> > > > +			       unsigned long cur_seq)
> > > > +{
> > > > +	struct drm_gpusvm_notifier *notifier =
> > > > +		container_of(mni, typeof(*notifier), notifier);
> > > > +	struct drm_gpusvm *gpusvm = notifier->gpusvm;
> > > > +
> > > > +	if (!mmu_notifier_range_blockable(mmu_range))
> > > > +		return false;
> > > > +
> > > > +	down_write(&gpusvm->notifier_lock);
> > > > +	mmu_interval_set_seq(mni, cur_seq);
> > > > +	gpusvm->ops->invalidate(gpusvm, notifier, mmu_range);
> > > > +	up_write(&gpusvm->notifier_lock);
> > > > +
> > > > +	return true;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_notifier_ops - MMU interval notifier operations
> > > > for
> > > > GPU SVM
> > > > + */
> > > > +static const struct mmu_interval_notifier_ops
> > > > drm_gpusvm_notifier_ops = {
> > > > +	.invalidate = drm_gpusvm_notifier_invalidate,
> > > > +};
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_init - Initialize the GPU SVM.
> > > > + * @gpusvm: Pointer to the GPU SVM structure.
> > > > + * @name: Name of the GPU SVM.
> > > > + * @drm: Pointer to the DRM device structure.
> > > > + * @mm: Pointer to the mm_struct for the address space.
> > > > + * @device_private_page_owner: Device private pages owner.
> > > > + * @mm_start: Start address of GPU SVM.
> > > > + * @mm_range: Range of the GPU SVM.
> > > > + * @notifier_size: Size of individual notifiers.
> > > > + * @ops: Pointer to the operations structure for GPU SVM.
> > > > + * @chunk_sizes: Pointer to the array of chunk sizes used in
> > > > range
> > > > allocation.
> > > > + *               Entries should be powers of 2 in descending
> > > > order
> > > > with last
> > > > + *               entry being SZ_4K.
> > > > + * @num_chunks: Number of chunks.
> > > > + *
> > > > + * This function initializes the GPU SVM.
> > > > + *
> > > > + * Returns:
> > > > + * 0 on success, a negative error code on failure.
> > > > + */
> > > > +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> > > > +		    const char *name, struct drm_device *drm,
> > > > +		    struct mm_struct *mm, void *device_private_page_owner,
> > > > +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> > > > +		    const struct drm_gpusvm_ops *ops,
> > > > +		    const u64 *chunk_sizes, int num_chunks)
> > > > +{
> > > > +	if (!ops->invalidate || !num_chunks)
> > > > +		return -EINVAL;
> > > > +
> > > > +	gpusvm->name = name;
> > > > +	gpusvm->drm = drm;
> > > > +	gpusvm->mm = mm;
> > > > +	gpusvm->device_private_page_owner =
> > > > device_private_page_owner;
> > > > +	gpusvm->mm_start = mm_start;
> > > > +	gpusvm->mm_range = mm_range;
> > > > +	gpusvm->notifier_size = notifier_size;
> > > > +	gpusvm->ops = ops;
> > > > +	gpusvm->chunk_sizes = chunk_sizes;
> > > > +	gpusvm->num_chunks = num_chunks;
> > > > +	gpusvm->zdd_wq = system_wq;
> > > > +
> > > > +	mmgrab(mm);
> > > > +	gpusvm->root = RB_ROOT_CACHED;
> > > > +	INIT_LIST_HEAD(&gpusvm->notifier_list);
> > > > +
> > > > +	init_rwsem(&gpusvm->notifier_lock);
> > > > +
> > > > +	fs_reclaim_acquire(GFP_KERNEL);
> > > > +	might_lock(&gpusvm->notifier_lock);
> > > > +	fs_reclaim_release(GFP_KERNEL);
> > > > +
> > > > +	return 0;
> > > > +}
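
Usage note while reviewing: a driver would wire this up roughly as below. This is
my own sketch, not from the patch; my_dev, my_svm_invalidate and the chunk array
are placeholders, only the drm_gpusvm_init() signature is taken from this file:

	/* Descending powers of two, last entry must be SZ_4K */
	static const u64 my_chunk_sizes[] = { SZ_2M, SZ_64K, SZ_4K };

	static const struct drm_gpusvm_ops my_svm_ops = {
		.invalidate = my_svm_invalidate,	/* required */
	};

	int my_svm_init(struct my_dev *dev)
	{
		return drm_gpusvm_init(&dev->gpusvm, "my-svm", &dev->drm,
				       current->mm, dev->dpagemap_owner,
				       0, 1ull << 47,	/* mm_start, mm_range */
				       SZ_512M,		/* notifier granularity */
				       &my_svm_ops, my_chunk_sizes,
				       ARRAY_SIZE(my_chunk_sizes));
	}
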
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_notifier_find - Find GPU SVM notifier
> > > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > > + * @fault_addr__: Fault address
> > > > + *
> > > > + * This macro finds the GPU SVM notifier associated with the
> > > > fault
> > > > address.
> > > > + *
> > > > + * Returns:
> > > > + * Pointer to the GPU SVM notifier on success, NULL otherwise.
> > > > + */
> > > > +#define drm_gpusvm_notifier_find(gpusvm__, fault_addr__)	\
> > > > +	notifier_iter_first(&(gpusvm__)->root, (fault_addr__),	\
> > > > +			    (fault_addr__ + 1))
> > > > +
> > > > +/**
> > > > + * to_drm_gpusvm_notifier - retrieve the container struct for a
> > > > given rbtree node
> > > > + * @node__: a pointer to the rbtree node embedded within a
> > > > drm_gpusvm_notifier struct
> > > > + *
> > > > + * Return: A pointer to the containing drm_gpusvm_notifier
> > > > structure.
> > > > + */
> > > > +#define to_drm_gpusvm_notifier(__node)				\
> > > > +	container_of((__node), struct drm_gpusvm_notifier, rb.node)
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_notifier_insert - Insert GPU SVM notifier
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > > + *
> > > > + * This function inserts the GPU SVM notifier into the GPU SVM
> > > > RB
> > > > tree and list.
> > > > + */
> > > > +static void drm_gpusvm_notifier_insert(struct drm_gpusvm *gpusvm,
> > > > +				       struct drm_gpusvm_notifier *notifier)
> > > > +{
> > > > +	struct rb_node *node;
> > > > +	struct list_head *head;
> > > > +
> > > > +	notifier_insert(notifier, &gpusvm->root);
> > > > +
> > > > +	node = rb_prev(&notifier->rb.node);
> > > > +	if (node)
> > > > +		head = &(to_drm_gpusvm_notifier(node))->rb.entry;
> > > > +	else
> > > > +		head = &gpusvm->notifier_list;
> > > > +
> > > > +	list_add(&notifier->rb.entry, head);
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_notifier_remove - Remove GPU SVM notifier
> > > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > > + * @notifier__: Pointer to the GPU SVM notifier structure
> > > > + *
> > > > + * This macro removes the GPU SVM notifier from the GPU SVM RB tree and list.
> > > > + */
> > > > +#define drm_gpusvm_notifier_remove(gpusvm__, notifier__)	\
> > > > +	notifier_remove((notifier__), &(gpusvm__)->root);	\
> > > > +	list_del(&(notifier__)->rb.entry)
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_fini - Finalize the GPU SVM.
> > > > + * @gpusvm: Pointer to the GPU SVM structure.
> > > > + *
> > > > + * This function finalizes the GPU SVM by cleaning up any
> > > > remaining
> > > > ranges and
> > > > + * notifiers, and dropping a reference to struct MM.
> > > > + */
> > > > +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm)
> > > > +{
> > > > +	struct drm_gpusvm_notifier *notifier, *next;
> > > > +
> > > > +	drm_gpusvm_for_each_notifier_safe(notifier, next, gpusvm, 0, LONG_MAX) {
> > > > +		struct drm_gpusvm_range *range, *__next;
> > > > +
> > > > +		/*
> > > > +		 * Remove notifier first to avoid racing with any invalidation
> > > > +		 */
> > > > +		mmu_interval_notifier_remove(&notifier->notifier);
> > > > +		notifier->flags.removed = true;
> > > > +
> > > > +		drm_gpusvm_for_each_range_safe(range, __next, notifier, 0,
> > > > +					       LONG_MAX)
> > > > +			drm_gpusvm_range_remove(gpusvm, range);
> > > > +	}
> > > > +
> > > > +	mmdrop(gpusvm->mm);
> > > > +	WARN_ON(!RB_EMPTY_ROOT(&gpusvm->root.rb_root));
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_notifier_alloc - Allocate GPU SVM notifier
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @fault_addr: Fault address
> > > > + *
> > > > + * This function allocates and initializes the GPU SVM notifier
> > > > structure.
> > > > + *
> > > > + * Returns:
> > > > + * Pointer to the allocated GPU SVM notifier on success,
> > > > ERR_PTR()
> > > > on failure.
> > > > + */
> > > > +static struct drm_gpusvm_notifier *
> > > > +drm_gpusvm_notifier_alloc(struct drm_gpusvm *gpusvm, u64 fault_addr)
> > > > +{
> > > > +	struct drm_gpusvm_notifier *notifier;
> > > > +
> > > > +	if (gpusvm->ops->notifier_alloc)
> > > > +		notifier = gpusvm->ops->notifier_alloc();
> > > > +	else
> > > > +		notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
> > > > +
> > > > +	if (!notifier)
> > > > +		return ERR_PTR(-ENOMEM);
> > > > +
> > > > +	notifier->gpusvm = gpusvm;
> > > > +	notifier->interval.start = ALIGN_DOWN(fault_addr, gpusvm->notifier_size);
> > > > +	notifier->interval.end = ALIGN(fault_addr + 1, gpusvm->notifier_size);
> > > > +	INIT_LIST_HEAD(&notifier->rb.entry);
> > > > +	notifier->root = RB_ROOT_CACHED;
> > > > +	INIT_LIST_HEAD(&notifier->range_list);
> > > > +
> > > > +	return notifier;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_notifier_free - Free GPU SVM notifier
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > > + *
> > > > + * This function frees the GPU SVM notifier structure.
> > > > + */
> > > > +static void drm_gpusvm_notifier_free(struct drm_gpusvm *gpusvm,
> > > > +				     struct drm_gpusvm_notifier
> > > > *notifier)
> > > > +{
> > > > +	WARN_ON(!RB_EMPTY_ROOT(&notifier->root.rb_root));
> > > > +
> > > > +	if (gpusvm->ops->notifier_free)
> > > > +		gpusvm->ops->notifier_free(notifier);
> > > > +	else
> > > > +		kfree(notifier);
> > > > +}
> > > > +
> > > > +/**
> > > > + * to_drm_gpusvm_range - retrieve the container struct for a
> > > > given
> > > > rbtree node
> > > > + * @node__: a pointer to the rbtree node embedded within a
> > > > drm_gpusvm_range struct
> > > > + *
> > > > + * Return: A pointer to the containing drm_gpusvm_range
> > > > structure.
> > > > + */
> > > > +#define to_drm_gpusvm_range(node__)	\
> > > > +	container_of((node__), struct drm_gpusvm_range, rb.node)
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_range_insert - Insert GPU SVM range
> > > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > > + * @range: Pointer to the GPU SVM range structure
> > > > + *
> > > > + * This function inserts the GPU SVM range into the notifier RB
> > > > tree
> > > > and list.
> > > > + */
> > > > +static void drm_gpusvm_range_insert(struct drm_gpusvm_notifier *notifier,
> > > > +				    struct drm_gpusvm_range *range)
> > > > +{
> > > > +	struct rb_node *node;
> > > > +	struct list_head *head;
> > > > +
> > > > +	drm_gpusvm_notifier_lock(notifier->gpusvm);
> > > > +	range_insert(range, &notifier->root);
> > > > +
> > > > +	node = rb_prev(&range->rb.node);
> > > > +	if (node)
> > > > +		head = &(to_drm_gpusvm_range(node))->rb.entry;
> > > > +	else
> > > > +		head = &notifier->range_list;
> > > > +
> > > > +	list_add(&range->rb.entry, head);
> > > > +	drm_gpusvm_notifier_unlock(notifier->gpusvm);
> > > > +}
> > > > +
> > > > +/**
> > > > + * __drm_gpusvm_range_remove - Remove GPU SVM range
> > > > + * @notifier__: Pointer to the GPU SVM notifier structure
> > > > + * @range__: Pointer to the GPU SVM range structure
> > > > + *
> > > > + * This macro removes the GPU SVM range from the notifier RB
> > > > tree
> > > > and list.
> > > > + */
> > > > +#define __drm_gpusvm_range_remove(notifier__, range__)		\
> > > > +	range_remove((range__), &(notifier__)->root);		\
> > > > +	list_del(&(range__)->rb.entry)
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_range_alloc - Allocate GPU SVM range
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > > + * @fault_addr: Fault address
> > > > + * @chunk_size: Chunk size
> > > > + * @migrate_vram: Flag indicating whether to migrate VRAM
> > > > + *
> > > > + * This function allocates and initializes the GPU SVM range
> > > > structure.
> > > > + *
> > > > + * Returns:
> > > > + * Pointer to the allocated GPU SVM range on success, ERR_PTR()
> > > > on
> > > > failure.
> > > > + */
> > > > +static struct drm_gpusvm_range *
> > > > +drm_gpusvm_range_alloc(struct drm_gpusvm *gpusvm,
> > > > +		       struct drm_gpusvm_notifier *notifier,
> > > > +		       u64 fault_addr, u64 chunk_size, bool
> > > > migrate_vram)
> > > > +{
> > > > +	struct drm_gpusvm_range *range;
> > > > +
> > > > +	if (gpusvm->ops->range_alloc)
> > > > +		range = gpusvm->ops->range_alloc(gpusvm);
> > > > +	else
> > > > +		range = kzalloc(sizeof(*range), GFP_KERNEL);
> > > > +
> > > > +	if (!range)
> > > > +		return ERR_PTR(-ENOMEM);
> > > > +
> > > > +	kref_init(&range->refcount);
> > > > +	range->gpusvm = gpusvm;
> > > > +	range->notifier = notifier;
> > > > +	range->va.start = ALIGN_DOWN(fault_addr, chunk_size);
> > > > +	range->va.end = ALIGN(fault_addr + 1, chunk_size);
> > > > +	INIT_LIST_HEAD(&range->rb.entry);
> > > > +	range->notifier_seq = LONG_MAX;
> > > > +	range->flags.migrate_vram = migrate_vram ? 1 : 0;
> > > > +
> > > > +	return range;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_check_pages - Check pages
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > > + * @start: Start address
> > > > + * @end: End address
> > > > + *
> > > > + * Check if pages between start and end have been faulted in on
> > > > the
> > > > CPU. Use to
> > > > + * prevent migration of pages without CPU backing store.
> > > > + *
> > > > + * Returns:
> > > > + * True if pages have been faulted into CPU, False otherwise
> > > > + */
> > > > +static bool drm_gpusvm_check_pages(struct drm_gpusvm *gpusvm,
> > > > +				   struct drm_gpusvm_notifier *notifier,
> > > > +				   u64 start, u64 end)
> > > > +{
> > > > +	struct hmm_range hmm_range = {
> > > > +		.default_flags = 0,
> > > > +		.notifier = &notifier->notifier,
> > > > +		.start = start,
> > > > +		.end = end,
> > > > +		.dev_private_owner = gpusvm->device_private_page_owner,
> > > > +	};
> > > > +	unsigned long timeout =
> > > > +		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> > > > +	unsigned long *pfns;
> > > > +	unsigned long npages = npages_in_range(start, end);
> > > > +	int err, i;
> > > > +
> > > > +	mmap_assert_locked(gpusvm->mm);
> > > > +
> > > > +	pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
> > > > +	if (!pfns)
> > > > +		return false;
> > > > +
> > > > +	hmm_range.notifier_seq = mmu_interval_read_begin(&notifier->notifier);
> > > > +	hmm_range.hmm_pfns = pfns;
> > > > +
> > > > +	while (true) {
> > > > +		err = hmm_range_fault(&hmm_range);
> > > > +		if (err == -EBUSY) {
> > > > +			if (time_after(jiffies, timeout))
> > > > +				break;
> > > > +
> > > > +			hmm_range.notifier_seq = mmu_interval_read_begin(&notifier->notifier);
> > > > +			continue;
> > > > +		}
> > > > +		break;
> > > > +	}
> > > > +	if (err)
> > > > +		goto err_free;
> > > > +
> > > > +	for (i = 0; i < npages; ++i) {
> > > > +		if (!(pfns[i] & HMM_PFN_VALID)) {
> > > > +			err = -EFAULT;
> > > > +			goto err_free;
> > > > +		}
> > > > +	}
> > > > +
> > > > +err_free:
> > > > +	kvfree(pfns);
> > > > +	return err ? false : true;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_range_chunk_size - Determine chunk size for GPU
> > > > SVM
> > > > range
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > > + * @vas: Pointer to the virtual memory area structure
> > > > + * @fault_addr: Fault address
> > > > + * @gpuva_start: Start address of GPUVA which mirrors CPU
> > > > + * @gpuva_end: End address of GPUVA which mirrors CPU
> > > > + * @check_pages: Flag indicating whether to check pages
> > > > + *
> > > > + * This function determines the chunk size for the GPU SVM range
> > > > based on the
> > > > + * fault address, GPU SVM chunk sizes, existing GPU SVM ranges,
> > > > and
> > > > the virtual
> > > > + * memory area boundaries.
> > > > + *
> > > > + * Returns:
> > > > + * Chunk size on success, LONG_MAX on failure.
> > > > + */
> > > > +static u64 drm_gpusvm_range_chunk_size(struct drm_gpusvm *gpusvm,
> > > > +				       struct drm_gpusvm_notifier *notifier,
> > > > +				       struct vm_area_struct *vas,
> > > > +				       u64 fault_addr, u64 gpuva_start,
> > > > +				       u64 gpuva_end, bool check_pages)
> > > > +{
> > > > +	u64 start, end;
> > > > +	int i = 0;
> > > > +
> > > > +retry:
> > > > +	for (; i < gpusvm->num_chunks; ++i) {
> > > > +		start = ALIGN_DOWN(fault_addr, gpusvm->chunk_sizes[i]);
> > > > +		end = ALIGN(fault_addr + 1, gpusvm->chunk_sizes[i]);
> > > > +
> > > > +		if (start >= vas->vm_start && end <= vas->vm_end &&
> > > > +		    start >= notifier->interval.start &&
> > > > +		    end <= notifier->interval.end &&
> > > > +		    start >= gpuva_start && end <= gpuva_end)
> > > > +			break;
> > > > +	}
> > > > +
> > > > +	if (i == gpusvm->num_chunks)
> > > > +		return LONG_MAX;
> > > > +
> > > > +	/*
> > > > +	 * If the allocation is larger than a page, ensure it does not
> > > > +	 * overlap with existing ranges.
> > > > +	 */
> > > > +	if (end - start != SZ_4K) {
> > > > +		struct drm_gpusvm_range *range;
> > > > +
> > > > +		range = drm_gpusvm_range_find(notifier, start, end);
> > > > +		if (range) {
> > > > +			++i;
> > > > +			goto retry;
> > > > +		}
> > > > +
> > > > +		/*
> > > > +		 * XXX: Only create a range over pages the CPU has faulted in.
> > > > +		 * Without this check, or prefault, on BMG
> > > > +		 * 'xe_exec_system_allocator --r process-many-malloc' fails.
> > > > +		 * In the failure case, each process mallocs 16k but the CPU
> > > > +		 * VMA is ~128k, which results in 64k SVM ranges. When
> > > > +		 * migrating the SVM ranges, some processes fail in
> > > > +		 * drm_gpusvm_migrate_to_vram with 'migrate.cpages != npages'
> > > > +		 * and then upon drm_gpusvm_range_get_pages device pages from
> > > > +		 * other processes are collected + faulted in, which creates
> > > > +		 * all sorts of problems. Unsure exactly how this is
> > > > +		 * happening; the problem also goes away if
> > > > +		 * 'xe_exec_system_allocator --r process-many-malloc' mallocs
> > > > +		 * at least 64k at a time.
> > > > +		 */
> > > > +		if (check_pages &&
> > > > +		    !drm_gpusvm_check_pages(gpusvm, notifier, start, end)) {
> > > > +			++i;
> > > > +			goto retry;
> > > > +		}
> > > > +	}
> > > > +
> > > > +	return end - start;
> > > > +}
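
To make the chunk walk concrete (my own worked example, not from the patch): with
chunk_sizes[] = { SZ_2M, SZ_64K, SZ_4K }, a fault in a 128K anonymous VMA would
typically reject SZ_2M (the aligned range spills out of the VMA), take SZ_64K if
no existing range overlaps and check_pages passes, and only fall back to SZ_4K
otherwise:

	/* Illustration only, addresses picked arbitrarily */
	u64 fault_addr = 0x7f1200011000ull;

	ALIGN_DOWN(fault_addr, SZ_2M);	/* 0x7f1200000000 - may fall outside the VMA */
	ALIGN_DOWN(fault_addr, SZ_64K);	/* 0x7f1200010000 - usually the winner */
	ALIGN_DOWN(fault_addr, SZ_4K);	/* 0x7f1200011000 - last-resort 4K range */
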
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_range_find_or_insert - Find or insert GPU SVM
> > > > range
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @fault_addr: Fault address
> > > > + * @gpuva_start: Start address of GPUVA which mirrors CPU
> > > > + * @gpuva_end: End address of GPUVA which mirrors CPU
> > > > + * @ctx: GPU SVM context
> > > > + *
> > > > + * This function finds or inserts a newly allocated GPU SVM range based on the
> > > > + * fault address. Caller must hold a lock to protect range lookup and insertion.
> > > > + *
> > > > + * Returns:
> > > > + * Pointer to the GPU SVM range on success, ERR_PTR() on
> > > > failure.
> > > > + */
> > > > +struct drm_gpusvm_range *
> > > > +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64 fault_addr,
> > > > +				u64 gpuva_start, u64 gpuva_end,
> > > > +				const struct drm_gpusvm_ctx *ctx)
> > > > +{
> > > > +	struct drm_gpusvm_notifier *notifier;
> > > > +	struct drm_gpusvm_range *range;
> > > > +	struct mm_struct *mm = gpusvm->mm;
> > > > +	struct vm_area_struct *vas;
> > > > +	bool notifier_alloc = false;
> > > > +	u64 chunk_size;
> > > > +	int err;
> > > > +	bool migrate_vram;
> > > > +
> > > > +	if (fault_addr < gpusvm->mm_start ||
> > > > +	    fault_addr > gpusvm->mm_start + gpusvm->mm_range) {
> > > > +		err = -EINVAL;
> > > > +		goto err_out;
> > > > +	}
> > > > +
> > > > +	if (!ctx->mmap_locked) {
> > > > +		if (!mmget_not_zero(mm)) {
> > > > +			err = -EFAULT;
> > > > +			goto err_out;
> > > > +		}
> > > > +		mmap_write_lock(mm);
> > > > +	}
> > > > +
> > > > +	mmap_assert_write_locked(mm);
> > > > +
> > > > +	notifier = drm_gpusvm_notifier_find(gpusvm, fault_addr);
> > > > +	if (!notifier) {
> > > > +		notifier = drm_gpusvm_notifier_alloc(gpusvm,
> > > > fault_addr);
> > > > +		if (IS_ERR(notifier)) {
> > > > +			err = PTR_ERR(notifier);
> > > > +			goto err_mmunlock;
> > > > +		}
> > > > +		notifier_alloc = true;
> > > > +		err = mmu_interval_notifier_insert_locked(&notifier->notifier,
> > > > +							  mm, notifier->interval.start,
> > > > +							  notifier->interval.end -
> > > > +							  notifier->interval.start,
> > > > +							  &drm_gpusvm_notifier_ops);
> > > > +		if (err)
> > > > +			goto err_notifier;
> > > > +	}
> > > > +
> > > > +	vas = vma_lookup(mm, fault_addr);
> > > > +	if (!vas) {
> > > > +		err = -ENOENT;
> > > > +		goto err_notifier_remove;
> > > > +	}
> > > > +
> > > > +	if (!ctx->read_only && !(vas->vm_flags & VM_WRITE)) {
> > > > +		err = -EPERM;
> > > > +		goto err_notifier_remove;
> > > > +	}
> > > > +
> > > > +	range = drm_gpusvm_range_find(notifier, fault_addr,
> > > > fault_addr + 1);
> > > > +	if (range)
> > > > +		goto out_mmunlock;
> > > > +	/*
> > > > +	 * XXX: Short-circuiting migration based on migrate_vma_* current
> > > > +	 * limitations. If/when migrate_vma_* add more support, this logic will
> > > > +	 * have to change.
> > > > +	 */
> > > > +	migrate_vram = ctx->vram_possible &&
> > > > +		vma_is_anonymous(vas) && !is_vm_hugetlb_page(vas);
> > > > +
> > > > +	chunk_size = drm_gpusvm_range_chunk_size(gpusvm, notifier, vas,
> > > > +						 fault_addr, gpuva_start,
> > > > +						 gpuva_end, migrate_vram &&
> > > > +						 !ctx->prefault);
> > > > +	if (chunk_size == LONG_MAX) {
> > > > +		err = -EINVAL;
> > > > +		goto err_notifier_remove;
> > > > +	}
> > > > +
> > > > +	range = drm_gpusvm_range_alloc(gpusvm, notifier,
> > > > fault_addr,
> > > > chunk_size,
> > > > +				       migrate_vram);
> > > > +	if (IS_ERR(range)) {
> > > > +		err = PTR_ERR(range);
> > > > +		goto err_notifier_remove;
> > > > +	}
> > > > +
> > > > +	drm_gpusvm_range_insert(notifier, range);
> > > > +	if (notifier_alloc)
> > > > +		drm_gpusvm_notifier_insert(gpusvm, notifier);
> > > > +
> > > > +	if (ctx->prefault) {
> > > > +		struct drm_gpusvm_ctx __ctx = *ctx;
> > > > +
> > > > +		__ctx.mmap_locked = true;
> > > > +		err = drm_gpusvm_range_get_pages(gpusvm, range,
> > > > &__ctx);
> > > > +		if (err)
> > > > +			goto err_range_remove;
> > > > +	}
> > > > +
> > > > +out_mmunlock:
> > > > +	if (!ctx->mmap_locked) {
> > > > +		mmap_write_unlock(mm);
> > > > +		mmput(mm);
> > > > +	}
> > > > +
> > > > +	return range;
> > > > +
> > > > +err_range_remove:
> > > > +	__drm_gpusvm_range_remove(notifier, range);
> > > > +err_notifier_remove:
> > > > +	if (notifier_alloc)
> > > > +		mmu_interval_notifier_remove(&notifier->notifier);
> > > > +err_notifier:
> > > > +	if (notifier_alloc)
> > > > +		drm_gpusvm_notifier_free(gpusvm, notifier);
> > > > +err_mmunlock:
> > > > +	if (!ctx->mmap_locked) {
> > > > +		mmap_write_unlock(mm);
> > > > +		mmput(mm);
> > > > +	}
> > > > +err_out:
> > > > +	return ERR_PTR(err);
> > > > +}
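
For context, here is how I read the intended call flow from a GPU page fault, as a
sketch only: driver_svm_lock/unlock, driver_bind_range() and my_alloc_vram() are
placeholders, the drm_gpusvm_* calls are the ones defined in this file, and the
caller is expected to retry on -EAGAIN:

	int my_gpu_fault(struct my_vm *vm, u64 fault_addr)
	{
		struct drm_gpusvm_ctx ctx = { .vram_possible = true, };
		struct drm_gpusvm_range *range;
		int err;

		driver_svm_lock(vm);
		range = drm_gpusvm_range_find_or_insert(&vm->gpusvm, fault_addr,
							vm->gpuva_start,
							vm->gpuva_end, &ctx);
		if (IS_ERR(range)) {
			err = PTR_ERR(range);
			goto unlock;
		}

		/* Best effort; SRAM pages are used if migration fails */
		if (range->flags.migrate_vram)
			drm_gpusvm_migrate_to_vram(&vm->gpusvm, range,
						   my_alloc_vram(vm, range), &ctx);

		err = drm_gpusvm_range_get_pages(&vm->gpusvm, range, &ctx);
		if (!err)
			err = driver_bind_range(vm, range);
	unlock:
		driver_svm_unlock(vm);
		return err;
	}
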
> > > > +
> > > > +/**
> > > > + * for_each_dma_page - iterate over pages in a DMA region
> > > > + * @i__: the current page index in the iteration
> > > > + * @j__: the current page index, log order, in the iteration
> > > > + * @npages__: the total number of pages in the DMA region
> > > > + * @order__: the order of the pages in the DMA region
> > > > + *
> > > > + * This macro iterates over each page in a DMA region. The DMA
> > > > region
> > > > + * is assumed to be composed of 2^@order__ pages, and the macro
> > > > will
> > > > + * step through the region one block of 2^@order__ pages at a
> > > > time.
> > > > + */
> > > > +#define for_each_dma_page(i__, j__, npages__, order__)	\
> > > > +	for ((i__) = 0, (j__) = 0; (i__) < (npages__);	\
> > > > +	     (j__)++, (i__) += 0x1 << (order__))
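
Small worked example of the stepping, in case it helps other reviewers (my own
illustration, not part of the patch): with npages = 1024 and order = 9 (2M
mappings), i visits 0 and 512 while j visits 0 and 1, so only two dma_addr[]
entries are needed, which is what the kmalloc_array() shrink in
drm_gpusvm_range_get_pages() relies on:

	unsigned long i, j;

	for_each_dma_page(i, j, 1024, 9)
		pr_info("block %lu starts at page index %lu\n", j, i);	/* (0,0), (1,512) */
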
> > > > +
> > > > +/**
> > > > + * __drm_gpusvm_range_unmap_pages - Unmap pages associated with
> > > > a
> > > > GPU SVM range (internal)
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @range: Pointer to the GPU SVM range structure
> > > > + *
> > > > + * This function unmaps pages associated with a GPU SVM range. Assumes and
> > > > + * asserts correct locking is in place when called.
> > > > + */
> > > > +static void __drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > > > +					   struct drm_gpusvm_range *range)
> > > > +{
> > > > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > > > +
> > > > +	if (range->pages) {
> > > > +		unsigned long i, j, npages = npages_in_range(range->va.start,
> > > > +							     range->va.end);
> > > > +
> > > > +		if (range->flags.has_dma_mapping) {
> > > > +			for_each_dma_page(i, j, npages, range->order)
> > > > +				dma_unmap_page(gpusvm->drm->dev,
> > > > +					       range->dma_addr[j],
> > > > +					       PAGE_SIZE << range->order,
> > > > +					       DMA_BIDIRECTIONAL);
> > > > +		}
> > > > +
> > > > +		range->flags.has_vram_pages = false;
> > > > +		range->flags.has_dma_mapping = false;
> > > > +	}
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_range_free_pages - Free pages associated with a
> > > > GPU
> > > > SVM range
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @range: Pointer to the GPU SVM range structure
> > > > + *
> > > > + * This function frees pages associated with a GPU SVM range.
> > > > + */
> > > > +static void drm_gpusvm_range_free_pages(struct drm_gpusvm *gpusvm,
> > > > +					struct drm_gpusvm_range *range)
> > > > +{
> > > > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > > > +
> > > > +	if (range->pages) {
> > > > +		if (range->flags.kfree_mapping) {
> > > > +			kfree(range->dma_addr);
> > > > +			range->flags.kfree_mapping = false;
> > > > +			range->pages = NULL;
> > > > +		} else {
> > > > +			kvfree(range->pages);
> > > > +			range->pages = NULL;
> > > > +		}
> > > > +	}
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_range_remove - Remove GPU SVM range
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @range: Pointer to the GPU SVM range to be removed
> > > > + *
> > > > + * This function removes the specified GPU SVM range and also
> > > > removes the parent
> > > > + * GPU SVM notifier if no more ranges remain in the notifier.
> > > > The
> > > > caller must
> > > > + * hold a lock to protect range and notifier removal.
> > > > + */
> > > > +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> > > > +			     struct drm_gpusvm_range *range)
> > > > +{
> > > > +	struct drm_gpusvm_notifier *notifier;
> > > > +
> > > > +	notifier = drm_gpusvm_notifier_find(gpusvm, range->va.start);
> > > > +	if (WARN_ON_ONCE(!notifier))
> > > > +		return;
> > > > +
> > > > +	drm_gpusvm_notifier_lock(gpusvm);
> > > > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > > +	drm_gpusvm_range_free_pages(gpusvm, range);
> > > > +	__drm_gpusvm_range_remove(notifier, range);
> > > > +	drm_gpusvm_notifier_unlock(gpusvm);
> > > > +
> > > > +	drm_gpusvm_range_put(range);
> > > > +
> > > > +	if (RB_EMPTY_ROOT(&notifier->root.rb_root)) {
> > > > +		if (!notifier->flags.removed)
> > > > +			mmu_interval_notifier_remove(&notifier->notifier);
> > > > +		drm_gpusvm_notifier_remove(gpusvm, notifier);
> > > > +		drm_gpusvm_notifier_free(gpusvm, notifier);
> > > > +	}
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_range_get - Get a reference to GPU SVM range
> > > > + * @range: Pointer to the GPU SVM range
> > > > + *
> > > > + * This function increments the reference count of the specified
> > > > GPU
> > > > SVM range.
> > > > + *
> > > > + * Returns:
> > > > + * Pointer to the GPU SVM range.
> > > > + */
> > > > +struct drm_gpusvm_range *
> > > > +drm_gpusvm_range_get(struct drm_gpusvm_range *range)
> > > > +{
> > > > +	kref_get(&range->refcount);
> > > > +
> > > > +	return range;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_range_destroy - Destroy GPU SVM range
> > > > + * @refcount: Pointer to the reference counter embedded in the
> > > > GPU
> > > > SVM range
> > > > + *
> > > > + * This function destroys the specified GPU SVM range when its
> > > > reference count
> > > > + * reaches zero. If a custom range-free function is provided, it
> > > > is
> > > > invoked to
> > > > + * free the range; otherwise, the range is deallocated using
> > > > kfree().
> > > > + */
> > > > +static void drm_gpusvm_range_destroy(struct kref *refcount)
> > > > +{
> > > > +	struct drm_gpusvm_range *range =
> > > > +		container_of(refcount, struct drm_gpusvm_range,
> > > > refcount);
> > > > +	struct drm_gpusvm *gpusvm = range->gpusvm;
> > > > +
> > > > +	if (gpusvm->ops->range_free)
> > > > +		gpusvm->ops->range_free(range);
> > > > +	else
> > > > +		kfree(range);
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_range_put - Put a reference to GPU SVM range
> > > > + * @range: Pointer to the GPU SVM range
> > > > + *
> > > > + * This function decrements the reference count of the specified
> > > > GPU
> > > > SVM range
> > > > + * and frees it when the count reaches zero.
> > > > + */
> > > > +void drm_gpusvm_range_put(struct drm_gpusvm_range *range)
> > > > +{
> > > > +	kref_put(&range->refcount, drm_gpusvm_range_destroy);
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_range_pages_valid - GPU SVM range pages valid
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @range: Pointer to the GPU SVM range structure
> > > > + *
> > > > + * This function determines if a GPU SVM range's pages are valid. Expected to be
> > > > + * called holding gpusvm->notifier_lock and as the last step before committing a
> > > > + * GPU binding.
> > > > + *
> > > > + * Returns:
> > > > + * True if GPU SVM range has valid pages, False otherwise
> > > > + */
> > > > +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> > > > +				  struct drm_gpusvm_range *range)
> > > > +{
> > > > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > > > +
> > > > +	return range->flags.has_vram_pages || range->flags.has_dma_mapping;
> > > > +}
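
Spelling out the "last step before committing a GPU binding" pattern the kerneldoc
refers to, as a sketch with placeholders (driver_commit_gpu_pagetables(), the
again label and the surrounding driver lock are mine; the drm_gpusvm_* calls are
from this file):

	err = drm_gpusvm_range_get_pages(&vm->gpusvm, range, &ctx);
	if (err)
		goto out;

	drm_gpusvm_notifier_lock(&vm->gpusvm);
	if (!drm_gpusvm_range_pages_valid(&vm->gpusvm, range)) {
		/* Raced with an invalidation - retry from get_pages */
		drm_gpusvm_notifier_unlock(&vm->gpusvm);
		goto again;
	}
	driver_commit_gpu_pagetables(vm, range);
	drm_gpusvm_notifier_unlock(&vm->gpusvm);
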
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_range_pages_valid_unlocked - GPU SVM range pages
> > > > valid
> > > > unlocked
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @range: Pointer to the GPU SVM range structure
> > > > + *
> > > > + * This function determines if a GPU SVM range's pages are valid. Expected to be
> > > > + * called without holding gpusvm->notifier_lock.
> > > > + *
> > > > + * Returns:
> > > > + * True if GPU SVM range has valid pages, False otherwise
> > > > + */
> > > > +static bool
> > > > +drm_gpusvm_range_pages_valid_unlocked(struct drm_gpusvm *gpusvm,
> > > > +				      struct drm_gpusvm_range
> > > > *range)
> > > > +{
> > > > +	bool pages_valid;
> > > > +
> > > > +	if (!range->pages)
> > > > +		return false;
> > > > +
> > > > +	drm_gpusvm_notifier_lock(gpusvm);
> > > > +	pages_valid = drm_gpusvm_range_pages_valid(gpusvm,
> > > > range);
> > > > +	if (!pages_valid && range->flags.kfree_mapping) {
> > > > +		kfree(range->dma_addr);
> > > > +		range->flags.kfree_mapping = false;
> > > > +		range->pages = NULL;
> > > > +	}
> > > > +	drm_gpusvm_notifier_unlock(gpusvm);
> > > > +
> > > > +	return pages_valid;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_range_get_pages - Get pages for a GPU SVM range
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @range: Pointer to the GPU SVM range structure
> > > > + * @ctx: GPU SVM context
> > > > + *
> > > > + * This function gets pages for a GPU SVM range and ensures they
> > > > are
> > > > mapped for
> > > > + * DMA access.
> > > > + *
> > > > + * Returns:
> > > > + * 0 on success, negative error code on failure.
> > > > + */
> > > > +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> > > > +			       struct drm_gpusvm_range *range,
> > > > +			       const struct drm_gpusvm_ctx *ctx)
> > > > +{
> > > > +	struct mmu_interval_notifier *notifier = &range->notifier->notifier;
> > > > +	struct hmm_range hmm_range = {
> > > > +		.default_flags = HMM_PFN_REQ_FAULT | (ctx->read_only ? 0 :
> > > > +			HMM_PFN_REQ_WRITE),
> > > > +		.notifier = notifier,
> > > > +		.start = range->va.start,
> > > > +		.end = range->va.end,
> > > > +		.dev_private_owner = gpusvm->device_private_page_owner,
> > > > +	};
> > > > +	struct mm_struct *mm = gpusvm->mm;
> > > > +	unsigned long timeout =
> > > > +		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> > > > +	unsigned long i, j;
> > > > +	unsigned long npages = npages_in_range(range->va.start, range->va.end);
> > > > +	unsigned int order = 0;
> > > > +	unsigned long *pfns;
> > > > +	struct page **pages;
> > > > +	int err = 0;
> > > > +	bool vram_pages = !!range->flags.migrate_vram;
> > > > +	bool alloc_pfns = false, kfree_mapping;
> > > > +
> > > > +retry:
> > > > +	kfree_mapping = false;
> > > > +	hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> > > > +	if (drm_gpusvm_range_pages_valid_unlocked(gpusvm, range))
> > > > +		return 0;
> > > > +
> > > > +	if (range->notifier_seq == hmm_range.notifier_seq && range->pages) {
> > > > +		if (ctx->prefault)
> > > > +			return 0;
> > > > +
> > > > +		pfns = (unsigned long *)range->pages;
> > > > +		pages = range->pages;
> > > > +		goto map_pages;
> > > > +	}
> > > > +
> > > > +	if (!range->pages) {
> > > > +		pfns = kvmalloc_array(npages, sizeof(*pfns),
> > > > GFP_KERNEL);
> > > > +		if (!pfns)
> > > > +			return -ENOMEM;
> > > > +		alloc_pfns = true;
> > > > +	} else {
> > > > +		pfns = (unsigned long *)range->pages;
> > > > +	}
> > > > +
> > > > +	if (!ctx->mmap_locked) {
> > > > +		if (!mmget_not_zero(mm)) {
> > > > +			err = -EFAULT;
> > > > +			goto err_out;
> > > > +		}
> > > > +	}
> > > > +
> > > > +	hmm_range.hmm_pfns = pfns;
> > > > +	while (true) {
> > > > +		/* Must be checked after mmu_interval_read_begin
> > > > */
> > > > +		if (range->flags.unmapped) {
> > > > +			err = -EFAULT;
> > > > +			break;
> > > > +		}
> > > > +
> > > > +		if (!ctx->mmap_locked) {
> > > > +			/*
> > > > +			 * XXX: HMM locking document indicates only a read-lock
> > > > +			 * is required but there appears to be a window between
> > > > +			 * the MMU_NOTIFY_MIGRATE event triggered in a CPU fault
> > > > +			 * via migrate_vma_setup and the pages actually moving
> > > > +			 * in migrate_vma_finalize in which this code can grab
> > > > +			 * garbage pages. Grabbing the write-lock if the range
> > > > +			 * is attached to vram appears to protect against this
> > > > +			 * race.
> > > > +			 */
> > > > +			if (vram_pages)
> > > > +				mmap_write_lock(mm);
> > > > +			else
> > > > +				mmap_read_lock(mm);
> > > > +		}
> > > > +		err = hmm_range_fault(&hmm_range);
> > > > +		if (!ctx->mmap_locked) {
> > > > +			if (vram_pages)
> > > > +				mmap_write_unlock(mm);
> > > > +			else
> > > > +				mmap_read_unlock(mm);
> > > > +		}
> > > > +
> > > > +		if (err == -EBUSY) {
> > > > +			if (time_after(jiffies, timeout))
> > > > +				break;
> > > > +
> > > > +			hmm_range.notifier_seq =
> > > > mmu_interval_read_begin(notifier);
> > > > +			continue;
> > > > +		}
> > > > +		break;
> > > > +	}
> > > > +	if (!ctx->mmap_locked)
> > > > +		mmput(mm);
> > > > +	if (err)
> > > > +		goto err_free;
> > > > +
> > > > +	pages = (struct page **)pfns;
> > > > +
> > > > +	if (ctx->prefault) {
> > > > +		range->pages = pages;
> > > > +		goto set_seqno;
> > > > +	}
> > > > +
> > > > +map_pages:
> > > > +	if (is_device_private_page(hmm_pfn_to_page(pfns[0]))) {
> > > > +		WARN_ON_ONCE(!range->vram_allocation);
> > > > +
> > > > +		for (i = 0; i < npages; ++i) {
> > > > +			pages[i] = hmm_pfn_to_page(pfns[i]);
> > > > +
> > > > +			if (WARN_ON_ONCE(!is_device_private_page(pages[i]))) {
> > > > +				err = -EOPNOTSUPP;
> > > > +				goto err_free;
> > > > +			}
> > > > +		}
> > > > +
> > > > +		/* Do not race with notifier unmapping pages */
> > > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > > +		range->flags.has_vram_pages = true;
> > > > +		range->pages = pages;
> > > > +		if (mmu_interval_read_retry(notifier,
> > > > hmm_range.notifier_seq)) {
> > > > +			err = -EAGAIN;
> > > > +			__drm_gpusvm_range_unmap_pages(gpusvm,
> > > > range);
> > > > +		}
> > > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > > +	} else {
> > > > +		dma_addr_t *dma_addr = (dma_addr_t *)pfns;
> > > > +
> > > > +		for_each_dma_page(i, j, npages, order) {
> > > > +			if (WARN_ON_ONCE(i && order !=
> > > > +					 hmm_pfn_to_map_order(pfns[i]))) {
> > > > +				err = -EOPNOTSUPP;
> > > > +				npages = i;
> > > > +				goto err_unmap;
> > > > +			}
> > > > +			order = hmm_pfn_to_map_order(pfns[i]);
> > > > +
> > > > +			pages[j] = hmm_pfn_to_page(pfns[i]);
> > > > +			if (WARN_ON_ONCE(is_zone_device_page(pages[j]))) {
> > > > +				err = -EOPNOTSUPP;
> > > > +				npages = i;
> > > > +				goto err_unmap;
> > > > +			}
> > > > +
> > > > +			set_page_dirty_lock(pages[j]);
> > > > +			mark_page_accessed(pages[j]);
> > > > +
> > > > +			dma_addr[j] = dma_map_page(gpusvm->drm->dev,
> > > > +						   pages[j], 0,
> > > > +						   PAGE_SIZE << order,
> > > > +						   DMA_BIDIRECTIONAL);
> > > > +			if (dma_mapping_error(gpusvm->drm->dev, dma_addr[j])) {
> > > > +				err = -EFAULT;
> > > > +				npages = i;
> > > > +				goto err_unmap;
> > > > +			}
> > > > +		}
> > > > +
> > > > +		/* Huge pages, reduce memory footprint */
> > > > +		if (order) {
> > > > +			dma_addr = kmalloc_array(j,
> > > > sizeof(*dma_addr),
> > > > +						 GFP_KERNEL);
> > > > +			if (dma_addr) {
> > > > +				for (i = 0; i < j; ++i)
> > > > +					dma_addr[i] =
> > > > (dma_addr_t)pfns[i];
> > > > +				kvfree(pfns);
> > > > +				kfree_mapping = true;
> > > > +			} else {
> > > > +				dma_addr = (dma_addr_t *)pfns;
> > > > +			}
> > > > +		}
> > > > +
> > > > +		/* Do not race with notifier unmapping pages */
> > > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > > +		range->order = order;
> > > > +		range->flags.kfree_mapping = kfree_mapping;
> > > > +		range->flags.has_dma_mapping = true;
> > > > +		range->dma_addr = dma_addr;
> > > > +		range->vram_allocation = NULL;
> > > > +		if (mmu_interval_read_retry(notifier,
> > > > hmm_range.notifier_seq)) {
> > > > +			err = -EAGAIN;
> > > > +			__drm_gpusvm_range_unmap_pages(gpusvm,
> > > > range);
> > > > +		}
> > > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > > +	}
> > > > +
> > > > +	if (err == -EAGAIN)
> > > > +		goto retry;
> > > > +set_seqno:
> > > > +	range->notifier_seq = hmm_range.notifier_seq;
> > > > +
> > > > +	return 0;
> > > > +
> > > > +err_unmap:
> > > > +	for_each_dma_page(i, j, npages, order)
> > > > +		dma_unmap_page(gpusvm->drm->dev,
> > > > +			       (dma_addr_t)pfns[j],
> > > > +			       PAGE_SIZE << order,
> > > > DMA_BIDIRECTIONAL);
> > > > +err_free:
> > > > +	if (alloc_pfns)
> > > > +		kvfree(pfns);
> > > > +err_out:
> > > > +	return err;
> > > > +}
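
A note on consuming the result, as I understand it: in the SRAM case the pfns
buffer is reused to hold dma_addr_t entries, one per 2^order block, so a driver
building PTEs would walk it roughly like this (sketch only; driver_emit_pte() is
a placeholder, while range->dma_addr, range->order and for_each_dma_page() are
from this patch):

	unsigned long i, j;
	unsigned long npages = (range->va.end - range->va.start) >> PAGE_SHIFT;

	for_each_dma_page(i, j, npages, range->order)
		driver_emit_pte(vm, range->va.start + i * PAGE_SIZE,
				range->dma_addr[j],
				PAGE_SIZE << range->order);
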
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_range_unmap_pages - Unmap pages associated with a
> > > > GPU
> > > > SVM range
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @range: Pointer to the GPU SVM range structure
> > > > + * @ctx: GPU SVM context
> > > > + *
> > > > + * This function unmaps pages associated with a GPU SVM range. If @in_notifier
> > > > + * is set, it is assumed that gpusvm->notifier_lock is held in write mode; if it
> > > > + * is clear, it acquires gpusvm->notifier_lock in read mode. Must be called on
> > > > + * each GPU SVM range attached to notifier in gpusvm->ops->invalidate for IOMMU
> > > > + * security model.
> > > > + */
> > > > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > > > +				  struct drm_gpusvm_range *range,
> > > > +				  const struct drm_gpusvm_ctx *ctx)
> > > > +{
> > > > +	if (ctx->in_notifier)
> > > > +		lockdep_assert_held_write(&gpusvm->notifier_lock);
> > > > +	else
> > > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > > +
> > > > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > > +
> > > > +	if (!ctx->in_notifier)
> > > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_migration_put_page - Put a migration page
> > > > + * @page: Pointer to the page to put
> > > > + *
> > > > + * This function unlocks and puts a page.
> > > > + */
> > > > +static void drm_gpusvm_migration_put_page(struct page *page)
> > > > +{
> > > > +	unlock_page(page);
> > > > +	put_page(page);
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_migration_put_pages - Put migration pages
> > > > + * @npages: Number of pages
> > > > + * @migrate_pfn: Array of migrate page frame numbers
> > > > + *
> > > > + * This function puts an array of pages.
> > > > + */
> > > > +static void drm_gpusvm_migration_put_pages(unsigned long npages,
> > > > +					   unsigned long
> > > > *migrate_pfn)
> > > > +{
> > > > +	unsigned long i;
> > > > +
> > > > +	for (i = 0; i < npages; ++i) {
> > > > +		if (!migrate_pfn[i])
> > > > +			continue;
> > > > +
> > > > +		drm_gpusvm_migration_put_page(migrate_pfn_to_page(migrate_pfn[i]));
> > > > +		migrate_pfn[i] = 0;
> > > > +	}
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_get_vram_page - Get a reference to a VRAM page
> > > > + * @page: Pointer to the page
> > > > + * @zdd: Pointer to the GPU SVM zone device data
> > > > + *
> > > > + * This function associates the given page with the specified
> > > > GPU
> > > > SVM zone
> > > > + * device data and initializes it for zone device usage.
> > > > + */
> > > > +static void drm_gpusvm_get_vram_page(struct page *page,
> > > > +				     struct drm_gpusvm_zdd *zdd)
> > > > +{
> > > > +	page->zone_device_data = drm_gpusvm_zdd_get(zdd);
> > > > +	zone_device_page_init(page);
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_migrate_map_pages() - Map migration pages for GPU
> > > > SVM
> > > > migration
> > > > + * @dev: The device for which the pages are being mapped
> > > > + * @dma_addr: Array to store DMA addresses corresponding to
> > > > mapped
> > > > pages
> > > > + * @migrate_pfn: Array of migrate page frame numbers to map
> > > > + * @npages: Number of pages to map
> > > > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > > > + *
> > > > + * This function maps pages of memory for migration usage in GPU
> > > > SVM. It
> > > > + * iterates over each page frame number provided in
> > > > @migrate_pfn,
> > > > maps the
> > > > + * corresponding page, and stores the DMA address in the
> > > > provided
> > > > @dma_addr
> > > > + * array.
> > > > + *
> > > > + * Return: 0 on success, -EFAULT if an error occurs during
> > > > mapping.
> > > > + */
> > > > +static int drm_gpusvm_migrate_map_pages(struct device *dev,
> > > > +					dma_addr_t *dma_addr,
> > > > +					unsigned long *migrate_pfn,
> > > > +					unsigned long npages,
> > > > +					enum dma_data_direction dir)
> > > > +{
> > > > +	unsigned long i;
> > > > +
> > > > +	for (i = 0; i < npages; ++i) {
> > > > +		struct page *page =
> > > > migrate_pfn_to_page(migrate_pfn[i]);
> > > > +
> > > > +		if (!page)
> > > > +			continue;
> > > > +
> > > > +		if (WARN_ON_ONCE(is_zone_device_page(page)))
> > > > +			return -EFAULT;
> > > > +
> > > > +		dma_addr[i] = dma_map_page(dev, page, 0,
> > > > PAGE_SIZE,
> > > > dir);
> > > > +		if (dma_mapping_error(dev, dma_addr[i]))
> > > > +			return -EFAULT;
> > > > +	}
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously
> > > > mapped
> > > > for GPU SVM migration
> > > > + * @dev: The device for which the pages were mapped
> > > > + * @dma_addr: Array of DMA addresses corresponding to mapped
> > > > pages
> > > > + * @npages: Number of pages to unmap
> > > > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > > > + *
> > > > + * This function unmaps previously mapped pages of memory for
> > > > GPU
> > > > Shared Virtual
> > > > + * Memory (SVM). It iterates over each DMA address provided in
> > > > @dma_addr, checks
> > > > + * if it's valid and not already unmapped, and unmaps the
> > > > corresponding page.
> > > > + */
> > > > +static void drm_gpusvm_migrate_unmap_pages(struct device *dev,
> > > > +					   dma_addr_t *dma_addr,
> > > > +					   unsigned long npages,
> > > > +					   enum dma_data_direction dir)
> > > > +{
> > > > +	unsigned long i;
> > > > +
> > > > +	for (i = 0; i < npages; ++i) {
> > > > +		if (!dma_addr[i] || dma_mapping_error(dev,
> > > > dma_addr[i]))
> > > > +			continue;
> > > > +
> > > > +		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE,
> > > > dir);
> > > > +	}
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_migrate_to_vram - Migrate GPU SVM range to VRAM
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @range: Pointer to the GPU SVM range structure
> > > > + * @vram_allocation: Driver-private pointer to the VRAM allocation. The caller
> > > > + *                   should hold a reference to the VRAM allocation, which
> > > > + *                   should be dropped via ops->vram_release or upon the
> > > > + *                   failure of this function.
> > > > + * @ctx: GPU SVM context
> > > > + *
> > > > + * This function migrates the specified GPU SVM range to VRAM. It performs the
> > > > + * necessary setup and invokes the driver-specific operations for migration to
> > > > + * VRAM. Upon successful return, @vram_allocation can safely reference @range
> > > > + * until ops->vram_release is called, which only happens upon successful return.
> > > > + *
> > > > + * Returns:
> > > > + * 0 on success, negative error code on failure.
> > > > + */
> > > > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > > > +			       struct drm_gpusvm_range *range,
> > > > +			       void *vram_allocation,
> > > > +			       const struct drm_gpusvm_ctx *ctx)
> > > > +{
> > > > +	u64 start = range->va.start, end = range->va.end;
> > > > +	struct migrate_vma migrate = {
> > > > +		.start		= start,
> > > > +		.end		= end,
> > > > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > > > +		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
> > > > +	};
> > > > +	struct mm_struct *mm = gpusvm->mm;
> > > > +	unsigned long i, npages = npages_in_range(start, end);
> > > > +	struct vm_area_struct *vas;
> > > > +	struct drm_gpusvm_zdd *zdd = NULL;
> > > > +	struct page **pages;
> > > > +	dma_addr_t *dma_addr;
> > > > +	void *buf;
> > > > +	int err;
> > > > +
> > > > +	if (!range->flags.migrate_vram)
> > > > +		return -EINVAL;
> > > > +
> > > > +	if (!gpusvm->ops->populate_vram_pfn || !gpusvm->ops->copy_to_vram ||
> > > > +	    !gpusvm->ops->copy_to_sram)
> > > > +		return -EOPNOTSUPP;
> > > > +
> > > > +	if (!ctx->mmap_locked) {
> > > > +		if (!mmget_not_zero(mm)) {
> > > > +			err = -EFAULT;
> > > > +			goto err_out;
> > > > +		}
> > > > +		mmap_write_lock(mm);
> > > > +	}
> > > > +
> > > > +	mmap_assert_locked(mm);
> > > > +
> > > > +	vas = vma_lookup(mm, start);
> > > > +	if (!vas) {
> > > > +		err = -ENOENT;
> > > > +		goto err_mmunlock;
> > > > +	}
> > > > +
> > > > +	if (end > vas->vm_end || start < vas->vm_start) {
> > > > +		err = -EINVAL;
> > > > +		goto err_mmunlock;
> > > > +	}
> > > > +
> > > > +	if (!vma_is_anonymous(vas)) {
> > > > +		err = -EBUSY;
> > > > +		goto err_mmunlock;
> > > > +	}
> > > > +
> > > > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
> > > > +		       sizeof(*pages), GFP_KERNEL);
> > > > +	if (!buf) {
> > > > +		err = -ENOMEM;
> > > > +		goto err_mmunlock;
> > > > +	}
> > > > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > > > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
> > > > +
> > > > +	zdd = drm_gpusvm_zdd_alloc(range);
> > > > +	if (!zdd) {
> > > > +		err = -ENOMEM;
> > > > +		goto err_free;
> > > > +	}
> > > > +
> > > > +	migrate.vma = vas;
> > > > +	migrate.src = buf;
> > > > +	migrate.dst = migrate.src + npages;
> > > > +
> > > > +	err = migrate_vma_setup(&migrate);
> > > > +	if (err)
> > > > +		goto err_free;
> > > > +
> > > > +	/*
> > > > +	 * FIXME: The cases below, !migrate.cpages and migrate.cpages !=
> > > > +	 * npages, are not always an error. Need to revisit possible cases
> > > > +	 * and how to handle them. We could prefault on migrate.cpages !=
> > > > +	 * npages via hmm_range_fault.
> > > > +	 */
> > > > +
> > > > +	if (!migrate.cpages) {
> > > > +		err = -EFAULT;
> > > > +		goto err_free;
> > > > +	}
> > > > +
> > > > +	if (migrate.cpages != npages) {
> > > > +		err = -EBUSY;
> > > > +		goto err_finalize;
> > > > +	}
> > > > +
> > > > +	err = gpusvm->ops->populate_vram_pfn(gpusvm,
> > > > vram_allocation, npages,
> > > > +					     migrate.dst);
> > > > +	if (err)
> > > > +		goto err_finalize;
> > > > +
> > > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > > > dma_addr,
> > > > +					   migrate.src, npages,
> > > > DMA_TO_DEVICE);
> > > > +	if (err)
> > > > +		goto err_finalize;
> > > > +
> > > > +	for (i = 0; i < npages; ++i) {
> > > > +		struct page *page = pfn_to_page(migrate.dst[i]);
> > > > +
> > > > +		pages[i] = page;
> > > > +		migrate.dst[i] = migrate_pfn(migrate.dst[i]);
> > > > +		drm_gpusvm_get_vram_page(page, zdd);
> > > > +	}
> > > > +
> > > > +	err = gpusvm->ops->copy_to_vram(gpusvm, pages, dma_addr,
> > > > npages);
> > > > +	if (err)
> > > > +		goto err_finalize;
> > > > +
> > > > +	/* Upon success bind vram allocation to range and zdd */
> > > > +	range->vram_allocation = vram_allocation;
> > > > +	WRITE_ONCE(zdd->vram_allocation, vram_allocation);	/* Owns ref */
> > > > +
> > > > +err_finalize:
> > > > +	if (err)
> > > > +		drm_gpusvm_migration_put_pages(npages,
> > > > migrate.dst);
> > > > +	migrate_vma_pages(&migrate);
> > > > +	migrate_vma_finalize(&migrate);
> > > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev,
> > > > dma_addr,
> > > > npages,
> > > > +				       DMA_TO_DEVICE);
> > > > +err_free:
> > > > +	if (zdd)
> > > > +		drm_gpusvm_zdd_put(zdd);
> > > > +	kvfree(buf);
> > > > +err_mmunlock:
> > > > +	if (!ctx->mmap_locked) {
> > > > +		mmap_write_unlock(mm);
> > > > +		mmput(mm);
> > > > +	}
> > > > +err_out:
> > > > +	return err;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_migrate_populate_sram_pfn - Populate SRAM PFNs for
> > > > a
> > > > VM area
> > > > + * @vas: Pointer to the VM area structure, can be NULL
> > > > + * @npages: Number of pages to populate
> > > > + * @src_mpfn: Source array of migrate PFNs
> > > > + * @mpfn: Array of migrate PFNs to populate
> > > > + * @addr: Start address for PFN allocation
> > > > + *
> > > > + * This function populates the SRAM migrate page frame numbers
> > > > (PFNs) for the
> > > > + * specified VM area structure. It allocates and locks pages in
> > > > the
> > > > VM area for
> > > > + * SRAM usage. If @vas is non-NULL, alloc_page_vma() is used for the
> > > > + * allocation; if NULL, alloc_page() is used.
> > > > + *
> > > > + * Returns:
> > > > + * 0 on success, negative error code on failure.
> > > > + */
> > > > +static int drm_gpusvm_migrate_populate_sram_pfn(struct vm_area_struct *vas,
> > > > +						unsigned long npages,
> > > > +						unsigned long *src_mpfn,
> > > > +						unsigned long *mpfn, u64 addr)
> > > > +{
> > > > +	unsigned long i;
> > > > +
> > > > +	for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
> > > > +		struct page *page;
> > > > +
> > > > +		if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
> > > > +			continue;
> > > > +
> > > > +		if (vas)
> > > > +			page = alloc_page_vma(GFP_HIGHUSER, vas,
> > > > addr);
> > > > +		else
> > > > +			page = alloc_page(GFP_HIGHUSER);
> > > > +
> > > > +		if (!page)
> > > > +			return -ENOMEM;
> > > > +
> > > > +		lock_page(page);
> > > > +		mpfn[i] = migrate_pfn(page_to_pfn(page));
> > > > +	}
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_evict_to_sram - Evict GPU SVM range to SRAM
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @range: Pointer to the GPU SVM range structure
> > > > + *
> > > > + * Similar to __drm_gpusvm_migrate_to_sram but does not require the mmap
> > > > + * lock; migration is done via the migrate_device_* functions. This is a
> > > > + * fallback path, as it is preferred to issue migrations with the mmap
> > > > + * lock held.
> > > > + *
> > > > + * Returns:
> > > > + * 0 on success, negative error code on failure.
> > > > + */
> > > > +static int drm_gpusvm_evict_to_sram(struct drm_gpusvm *gpusvm,
> > > > +				    struct drm_gpusvm_range
> > > > *range)
> > > > +{
> > > > +	unsigned long npages;
> > > > +	struct page **pages;
> > > > +	unsigned long *src, *dst;
> > > > +	dma_addr_t *dma_addr;
> > > > +	void *buf;
> > > > +	int i, err = 0;
> > > > +
> > > > +	npages = npages_in_range(range->va.start, range->va.end);
> > > > +
> > > > +	buf = kvcalloc(npages, 2 * sizeof(*src) +
> > > > sizeof(*dma_addr)
> > > > +
> > > > +		       sizeof(*pages), GFP_KERNEL);
> > > > +	if (!buf) {
> > > > +		err = -ENOMEM;
> > > > +		goto err_out;
> > > > +	}
> > > > +	src = buf;
> > > > +	dst = buf + (sizeof(*src) * npages);
> > > > +	dma_addr = buf + (2 * sizeof(*src) * npages);
> > > > +	pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) *
> > > > npages;
> > > > +
> > > > +	err = gpusvm->ops->populate_vram_pfn(gpusvm, range->vram_allocation,
> > > > +					     npages, src);
> > > > +	if (err)
> > > > +		goto err_free;
> > > > +
> > > > +	err = migrate_device_vma_range(gpusvm->mm,
> > > > +				       gpusvm->device_private_page_owner, src,
> > > > +				       npages, range->va.start);
> > > > +	if (err)
> > > > +		goto err_free;
> > > > +
> > > > +	err = drm_gpusvm_migrate_populate_sram_pfn(NULL, npages,
> > > > src, dst, 0);
> > > > +	if (err)
> > > > +		goto err_finalize;
> > > > +
> > > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > > > dma_addr,
> > > > +					   dst, npages,
> > > > DMA_BIDIRECTIONAL);
> > > > +	if (err)
> > > > +		goto err_finalize;
> > > > +
> > > > +	for (i = 0; i < npages; ++i)
> > > > +		pages[i] = migrate_pfn_to_page(src[i]);
> > > > +
> > > > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr,
> > > > npages);
> > > > +	if (err)
> > > > +		goto err_finalize;
> > > > +
> > > > +err_finalize:
> > > > +	if (err)
> > > > +		drm_gpusvm_migration_put_pages(npages, dst);
> > > > +	migrate_device_pages(src, dst, npages);
> > > > +	migrate_device_finalize(src, dst, npages);
> > > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev,
> > > > dma_addr,
> > > > npages,
> > > > +				       DMA_BIDIRECTIONAL);
> > > > +err_free:
> > > > +	kvfree(buf);
> > > > +err_out:
> > > > +
> > > > +	return err;
> > > > +}
> > > > +
> > > > +/**
> > > > + * __drm_gpusvm_migrate_to_sram - Migrate GPU SVM range to SRAM
> > > > (internal)
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @vas: Pointer to the VM area structure
> > > > + * @page: Pointer to the page for fault handling (can be NULL)
> > > > + * @start: Start address of the migration range
> > > > + * @end: End address of the migration range
> > > > + *
> > > > + * This internal function performs the migration of the
> > > > specified
> > > > GPU SVM range
> > > > + * to SRAM. It sets up the migration, populates + dma maps SRAM
> > > > PFNs, and
> > > > + * invokes the driver-specific operations for migration to SRAM.
> > > > + *
> > > > + * Returns:
> > > > + * 0 on success, negative error code on failure.
> > > > + */
> > > > +static int __drm_gpusvm_migrate_to_sram(struct drm_gpusvm
> > > > *gpusvm,
> > > > +					struct vm_area_struct
> > > > *vas,
> > > > +					struct page *page,
> > > > +					u64 start, u64 end)
> > > > +{
> > > > +	struct migrate_vma migrate = {
> > > > +		.vma		= vas,
> > > > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > > > +		.flags		=
> > > > MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
> > > > +		.fault_page	= page,
> > > > +	};
> > > > +	unsigned long npages;
> > > > +	struct page **pages;
> > > > +	dma_addr_t *dma_addr;
> > > > +	void *buf;
> > > > +	int i, err = 0;
> > > > +
> > > > +	mmap_assert_locked(gpusvm->mm);
> > > > +
> > > > +	/* Corner case where the VM area struct has been partially unmapped */
> > > > +	if (start < vas->vm_start)
> > > > +		start = vas->vm_start;
> > > > +	if (end > vas->vm_end)
> > > > +		end = vas->vm_end;
> > > > +
> > > > +	migrate.start = start;
> > > > +	migrate.end = end;
> > > > +	npages = npages_in_range(start, end);
> > > > +
> > > > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) +
> > > > sizeof(*dma_addr) +
> > > > +		       sizeof(*pages), GFP_KERNEL);
> > > > +	if (!buf) {
> > > > +		err = -ENOMEM;
> > > > +		goto err_out;
> > > > +	}
> > > > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > > > +	pages = buf + (2 * sizeof(*migrate.src) +
> > > > sizeof(*dma_addr))
> > > > * npages;
> > > > +
> > > > +	migrate.vma = vas;
> > > > +	migrate.src = buf;
> > > > +	migrate.dst = migrate.src + npages;
> > > > +
> > > > +	err = migrate_vma_setup(&migrate);
> > > > +	if (err)
> > > > +		goto err_free;
> > > > +
> > > > +	/* Raced with another CPU fault, nothing to do */
> > > > +	if (!migrate.cpages)
> > > > +		goto err_free;
> > > > +
> > > > +	err = drm_gpusvm_migrate_populate_sram_pfn(vas, npages,
> > > > +						   migrate.src,
> > > > migrate.dst,
> > > > +						   start);
> > > > +	if (err)
> > > > +		goto err_finalize;
> > > > +
> > > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > > > dma_addr,
> > > > +					   migrate.dst, npages,
> > > > +					   DMA_BIDIRECTIONAL);
> > > > +	if (err)
> > > > +		goto err_finalize;
> > > > +
> > > > +	for (i = 0; i < npages; ++i)
> > > > +		pages[i] = migrate_pfn_to_page(migrate.src[i]);
> > > > +
> > > > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr,
> > > > npages);
> > > > +	if (err)
> > > > +		goto err_finalize;
> > > > +
> > > > +err_finalize:
> > > > +	if (err)
> > > > +		drm_gpusvm_migration_put_pages(npages,
> > > > migrate.dst);
> > > > +	migrate_vma_pages(&migrate);
> > > > +	migrate_vma_finalize(&migrate);
> > > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev,
> > > > dma_addr,
> > > > npages,
> > > > +				       DMA_BIDIRECTIONAL);
> > > > +err_free:
> > > > +	kvfree(buf);
> > > > +err_out:
> > > > +	mmap_assert_locked(gpusvm->mm);
> > > > +
> > > > +	return err;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_migrate_to_sram - Migrate (evict) GPU SVM range to
> > > > SRAM
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @range: Pointer to the GPU SVM range structure
> > > > + * @ctx: GPU SVM context
> > > > + *
> > > > + * This function initiates the migration of the specified GPU
> > > > SVM
> > > > range to
> > > > + * SRAM. It performs necessary checks and invokes the internal
> > > > migration
> > > > + * function for actual migration.
> > > > + *
> > > > + * Returns:
> > > > + * 0 on success, negative error code on failure.
> > > > + */
> > > > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > > +			       struct drm_gpusvm_range *range,
> > > > +			       const struct drm_gpusvm_ctx *ctx)
> > > > +{
> > > > +	u64 start = range->va.start, end = range->va.end;
> > > > +	struct mm_struct *mm = gpusvm->mm;
> > > > +	struct vm_area_struct *vas;
> > > > +	int err;
> > > > +	bool retry = false;
> > > > +
> > > > +	if (!ctx->mmap_locked) {
> > > > +		if (!mmget_not_zero(mm)) {
> > > > +			err = -EFAULT;
> > > > +			goto err_out;
> > > > +		}
> > > > +		if (ctx->trylock_mmap) {
> > > > +			if (!mmap_read_trylock(mm))  {
> > > > +				err =
> > > > drm_gpusvm_evict_to_sram(gpusvm, range);
> > > > +				goto err_mmput;
> > > > +			}
> > > > +		} else {
> > > > +			mmap_read_lock(mm);
> > > > +		}
> > > > +	}
> > > > +
> > > > +	mmap_assert_locked(mm);
> > > > +
> > > > +	/*
> > > > +	 * Loop required to find all VM area structs for the corner case
> > > > +	 * when the VRAM backing has been partially unmapped from the MM's
> > > > +	 * address space.
> > > > +	 */
> > > > +again:
> > > > +	vas = find_vma(mm, start);
> > > > +	if (!vas) {
> > > > +		if (!retry)
> > > > +			err = -ENOENT;
> > > > +		goto err_mmunlock;
> > > > +	}
> > > > +
> > > > +	if (end <= vas->vm_start || start >= vas->vm_end) {
> > > > +		if (!retry)
> > > > +			err = -EINVAL;
> > > > +		goto err_mmunlock;
> > > > +	}
> > > > +
> > > > +	err = __drm_gpusvm_migrate_to_sram(gpusvm, vas, NULL,
> > > > start,
> > > > end);
> > > > +	if (err)
> > > > +		goto err_mmunlock;
> > > > +
> > > > +	if (vas->vm_end < end) {
> > > > +		retry = true;
> > > > +		start = vas->vm_end;
> > > > +		goto again;
> > > > +	}
> > > > +
> > > > +	if (!ctx->mmap_locked) {
> > > > +		mmap_read_unlock(mm);
> > > > +		/*
> > > > +		 * Using mmput_async as this function can be
> > > > called
> > > > while
> > > > +		 * holding a dma-resv lock, and a final put can
> > > > grab
> > > > the mmap
> > > > +		 * lock, causing a lock inversion.
> > > > +		 */
> > > > +		mmput_async(mm);
> > > > +	}
> > > > +
> > > > +	return 0;
> > > > +
> > > > +err_mmunlock:
> > > > +	if (!ctx->mmap_locked)
> > > > +		mmap_read_unlock(mm);
> > > > +err_mmput:
> > > > +	if (!ctx->mmap_locked)
> > > > +		mmput_async(mm);
> > > > +err_out:
> > > > +	return err;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_page_free - Put GPU SVM zone device data
> > > > associated
> > > > with a page
> > > > + * @page: Pointer to the page
> > > > + *
> > > > + * This function is a callback used to put the GPU SVM zone
> > > > device
> > > > data
> > > > + * associated with a page when it is being released.
> > > > + */
> > > > +static void drm_gpusvm_page_free(struct page *page)
> > > > +{
> > > > +	drm_gpusvm_zdd_put(page->zone_device_data);
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_migrate_to_ram - Migrate GPU SVM range to RAM
> > > > (page
> > > > fault handler)
> > > > + * @vmf: Pointer to the fault information structure
> > > > + *
> > > > + * This function is a page fault handler used to migrate a GPU
> > > > SVM
> > > > range to RAM.
> > > > + * It retrieves the GPU SVM range information from the faulting
> > > > page
> > > > and invokes
> > > > + * the internal migration function to migrate the range back to
> > > > RAM.
> > > > + *
> > > > + * Returns:
> > > > + * VM_FAULT_SIGBUS on failure, 0 on success.
> > > > + */
> > > > +static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault
> > > > *vmf)
> > > > +{
> > > > +	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
> > > > +	int err;
> > > > +
> > > > +	err = __drm_gpusvm_migrate_to_sram(zdd->range->gpusvm,
> > > > +					   vmf->vma, vmf->page,
> > > > +					   zdd->range->va.start,
> > > > +					   zdd->range->va.end);
> > > > +
> > > > +	return err ? VM_FAULT_SIGBUS : 0;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_pagemap_ops - Device page map operations for GPU
> > > > SVM
> > > > + */
> > > > +static const struct dev_pagemap_ops drm_gpusvm_pagemap_ops = {
> > > > +	.page_free = drm_gpusvm_page_free,
> > > > +	.migrate_to_ram = drm_gpusvm_migrate_to_ram,
> > > > +};
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_pagemap_ops_get - Retrieve GPU SVM device page map
> > > > operations
> > > > + *
> > > > + * Returns:
> > > > + * Pointer to the GPU SVM device page map operations structure.
> > > > + */
> > > > +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void)
> > > > +{
> > > > +	return &drm_gpusvm_pagemap_ops;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_has_mapping - Check if GPU SVM has mapping for the
> > > > given address range
> > > > + * @gpusvm: Pointer to the GPU SVM structure.
> > > > + * @start: Start address
> > > > + * @end: End address
> > > > + *
> > > > + * Returns:
> > > > + * True if GPU SVM has mapping, False otherwise
> > > > + */
> > > > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64
> > > > start,
> > > > u64 end)
> > > > +{
> > > > +	struct drm_gpusvm_notifier *notifier;
> > > > +
> > > > +	drm_gpusvm_for_each_notifier(notifier, gpusvm, start,
> > > > end) {
> > > > +		struct drm_gpusvm_range *range = NULL;
> > > > +
> > > > +		drm_gpusvm_for_each_range(range, notifier,
> > > > start,
> > > > end)
> > > > +			return true;
> > > > +	}
> > > > +
> > > > +	return false;
> > > > +}
> > > > diff --git a/drivers/gpu/drm/xe/drm_gpusvm.h
> > > > b/drivers/gpu/drm/xe/drm_gpusvm.h
> > > > new file mode 100644
> > > > index 000000000000..0ea70f8534a8
> > > > --- /dev/null
> > > > +++ b/drivers/gpu/drm/xe/drm_gpusvm.h
> > > > @@ -0,0 +1,415 @@
> > > > +/* SPDX-License-Identifier: MIT */
> > > > +/*
> > > > + * Copyright © 2024 Intel Corporation
> > > > + */
> > > > +
> > > > +#ifndef __DRM_GPUSVM_H__
> > > > +#define __DRM_GPUSVM_H__
> > > > +
> > > > +#include <linux/kref.h>
> > > > +#include <linux/mmu_notifier.h>
> > > > +#include <linux/workqueue.h>
> > > > +
> > > > +struct dev_pagemap_ops;
> > > > +struct drm_device;
> > > > +struct drm_gpusvm;
> > > > +struct drm_gpusvm_notifier;
> > > > +struct drm_gpusvm_ops;
> > > > +struct drm_gpusvm_range;
> > > > +
> > > > +/**
> > > > + * struct drm_gpusvm_ops - Operations structure for GPU SVM
> > > > + *
> > > > + * This structure defines the operations for GPU Shared Virtual
> > > > Memory (SVM).
> > > > + * These operations are provided by the GPU driver to manage SVM
> > > > ranges and
> > > > + * perform operations such as migration between VRAM and system
> > > > RAM.
> > > > + */
> > > > +struct drm_gpusvm_ops {
> > > > +	/**
> > > > +	 * @notifier_alloc: Allocate a GPU SVM notifier
> > > > (optional)
> > > > +	 *
> > > > +	 * This function shall allocate a GPU SVM notifier.
> > > > +	 *
> > > > +	 * Returns:
> > > > +	 * Pointer to the allocated GPU SVM notifier on success,
> > > > NULL on failure.
> > > > +	 */
> > > > +	struct drm_gpusvm_notifier *(*notifier_alloc)(void);
> > > > +
> > > > +	/**
> > > > +	 * @notifier_free: Free a GPU SVM notifier (optional)
> > > > +	 * @notifier: Pointer to the GPU SVM notifier to be
> > > > freed
> > > > +	 *
> > > > +	 * This function shall free a GPU SVM notifier.
> > > > +	 */
> > > > +	void (*notifier_free)(struct drm_gpusvm_notifier
> > > > *notifier);
> > > > +
> > > > +	/**
> > > > +	 * @range_alloc: Allocate a GPU SVM range (optional)
> > > > +	 * @gpusvm: Pointer to the GPU SVM
> > > > +	 *
> > > > +	 * This function shall allocate a GPU SVM range.
> > > > +	 *
> > > > +	 * Returns:
> > > > +	 * Pointer to the allocated GPU SVM range on success,
> > > > NULL
> > > > on failure.
> > > > +	 */
> > > > +	struct drm_gpusvm_range *(*range_alloc)(struct
> > > > drm_gpusvm
> > > > *gpusvm);
> > > > +
> > > > +	/**
> > > > +	 * @range_free: Free a GPU SVM range (optional)
> > > > +	 * @range: Pointer to the GPU SVM range to be freed
> > > > +	 *
> > > > +	 * This function shall free a GPU SVM range.
> > > > +	 */
> > > > +	void (*range_free)(struct drm_gpusvm_range *range);
> > > > +
> > > > +	/**
> > > > +	 * @vram_release: Release VRAM allocation (optional)
> > > > +	 * @vram_allocation: Driver-private pointer to the VRAM
> > > > allocation
> > > > +	 *
> > > > +	 * This function shall release VRAM allocation and
> > > > expects
> > > > to drop a
> > > > +	 * reference to VRAM allocation.
> > > > +	 */
> > > > +	void (*vram_release)(void *vram_allocation);
> > > > +
> > > > +	/**
> > > > +	 * @populate_vram_pfn: Populate VRAM PFN (required for
> > > > migration)
> > > > +	 * @gpusvm: Pointer to the GPU SVM
> > > > +	 * @vram_allocation: Driver-private pointer to the VRAM
> > > > allocation
> > > > +	 * @npages: Number of pages to populate
> > > > +	 * @pfn: Array of page frame numbers to populate
> > > > +	 *
> > > > +	 * This function shall populate VRAM page frame numbers
> > > > (PFN).
> > > > +	 *
> > > > +	 * Returns:
> > > > +	 * 0 on success, a negative error code on failure.
> > > > +	 */
> > > > +	int (*populate_vram_pfn)(struct drm_gpusvm *gpusvm,
> > > > +				 void *vram_allocation,
> > > > +				 unsigned long npages,
> > > > +				 unsigned long *pfn);
> > > > +
> > > > +	/**
> > > > +	 * @copy_to_vram: Copy to VRAM (required for migration)
> > > > +	 * @gpusvm: Pointer to the GPU SVM
> > > > +	 * @pages: Pointer to array of VRAM pages (destination)
> > > > +	 * @dma_addr: Pointer to array of DMA addresses (source)
> > > > +	 * @npages: Number of pages to copy
> > > > +	 *
> > > > +	 * This function shall copy pages to VRAM.
> > > > +	 *
> > > > +	 * Returns:
> > > > +	 * 0 on success, a negative error code on failure.
> > > > +	 */
> > > > +	int (*copy_to_vram)(struct drm_gpusvm *gpusvm,
> > > > +			    struct page **pages,
> > > > +			    dma_addr_t *dma_addr,
> > > > +			    unsigned long npages);
> > > > +
> > > > +	/**
> > > > +	 * @copy_to_sram: Copy to system RAM (required for
> > > > migration)
> > > > +	 * @gpusvm: Pointer to the GPU SVM
> > > > +	 * @pages: Pointer to array of VRAM pages (source)
> > > > +	 * @dma_addr: Pointer to array of DMA addresses
> > > > (destination)
> > > > +	 * @npages: Number of pages to copy
> > > > +	 *
> > > > +	 * This function shall copy pages to system RAM.
> > > > +	 *
> > > > +	 * Returns:
> > > > +	 * 0 on success, a negative error code on failure.
> > > > +	 */
> > > > +	int (*copy_to_sram)(struct drm_gpusvm *gpusvm,
> > > > +			    struct page **pages,
> > > > +			    dma_addr_t *dma_addr,
> > > > +			    unsigned long npages);
> > > > +
> > > > +	/**
> > > > +	 * @invalidate: Invalidate GPU SVM notifier (required)
> > > > +	 * @gpusvm: Pointer to the GPU SVM
> > > > +	 * @notifier: Pointer to the GPU SVM notifier
> > > > +	 * @mmu_range: Pointer to the mmu_notifier_range
> > > > structure
> > > > +	 *
> > > > +	 * This function shall invalidate the GPU page tables.
> > > > It
> > > > can safely
> > > > +	 * walk the notifier range RB tree/list in this
> > > > function.
> > > > Called while
> > > > +	 * holding the notifier lock.
> > > > +	 */
> > > > +	void (*invalidate)(struct drm_gpusvm *gpusvm,
> > > > +			   struct drm_gpusvm_notifier *notifier,
> > > > +			   const struct mmu_notifier_range
> > > > *mmu_range);
> > > > +};
> > > > +
> > > > +/**
> > > > + * struct drm_gpusvm_notifier - Structure representing a GPU SVM
> > > > notifier
> > > > + *
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @notifier: MMU interval notifier
> > > > + * @interval: Interval for the notifier
> > > > + * @rb: Red-black tree node for the parent GPU SVM structure
> > > > notifier tree
> > > > + * @root: Cached root node of the RB tree containing ranges
> > > > + * @range_list: List head of ranges in the same order they appear in the
> > > > + *              interval tree. This is useful to keep iterating over ranges
> > > > + *              while doing modifications to the RB tree.
> > > > + * @flags.removed: Flag indicating whether the MMU interval
> > > > notifier
> > > > has been
> > > > + *                 removed
> > > > + *
> > > > + * This structure represents a GPU SVM notifier.
> > > > + */
> > > > +struct drm_gpusvm_notifier {
> > > > +	struct drm_gpusvm *gpusvm;
> > > > +	struct mmu_interval_notifier notifier;
> > > > +	struct {
> > > > +		u64 start;
> > > > +		u64 end;
> > > > +	} interval;
> > > > +	struct {
> > > > +		struct rb_node node;
> > > > +		struct list_head entry;
> > > > +		u64 __subtree_last;
> > > > +	} rb;
> > > > +	struct rb_root_cached root;
> > > > +	struct list_head range_list;
> > > > +	struct {
> > > > +		u32 removed : 1;
> > > > +	} flags;
> > > > +};
> > > > +
> > > > +/**
> > > > + * struct drm_gpusvm_range - Structure representing a GPU SVM
> > > > range
> > > > + *
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @notifier: Pointer to the GPU SVM notifier
> > > > + * @refcount: Reference count for the range
> > > > + * @rb: Red-black tree node for the parent GPU SVM notifier
> > > > structure range tree
> > > > + * @va: Virtual address range
> > > > + * @notifier_seq: Notifier sequence number of the range's pages
> > > > + * @pages: Pointer to the array of pages (if backing store is in
> > > > VRAM)
> > > > + * @dma_addr: DMA address array (if backing store is SRAM and
> > > > DMA
> > > > mapped)
> > > > + * @vram_allocation: Driver-private pointer to the VRAM
> > > > allocation
> > > > + * @order: Order of dma mapping. i.e. PAGE_SIZE << order is
> > > > mapping
> > > > size
> > > > + * @flags.migrate_vram: Flag indicating whether the range can be
> > > > migrated to VRAM
> > > > + * @flags.unmapped: Flag indicating if the range has been
> > > > unmapped
> > > > + * @flags.partial_unmap: Flag indicating if the range has been
> > > > partially unmapped
> > > > + * @flags.has_vram_pages: Flag indicating if the range has vram
> > > > pages
> > > > + * @flags.has_dma_mapping: Flag indicating if the range has a
> > > > DMA
> > > > mapping
> > > > + * @flags.kfree_mapping: Flag indicating @dma_addr is a compact
> > > > allocation based
> > > > + *                       on @order which releases via kfree
> > > > + *
> > > > + * This structure represents a GPU SVM range used for tracking
> > > > memory ranges
> > > > + * mapped in a DRM device.
> > > > + */
> > > > +struct drm_gpusvm_range {
> > > > +	struct drm_gpusvm *gpusvm;
> > > > +	struct drm_gpusvm_notifier *notifier;
> > > > +	struct kref refcount;
> > > > +	struct {
> > > > +		struct rb_node node;
> > > > +		struct list_head entry;
> > > > +		u64 __subtree_last;
> > > > +	} rb;
> > > > +	struct {
> > > > +		u64 start;
> > > > +		u64 end;
> > > > +	} va;
> > > > +	unsigned long notifier_seq;
> > > > +	union {
> > > > +		struct page **pages;
> > > > +		dma_addr_t *dma_addr;
> > > > +	};
> > > > +	void *vram_allocation;
> > > > +	u16 order;
> > > > +	struct {
> > > > +		/* All flags below must be set upon creation */
> > > > +		u16 migrate_vram : 1;
> > > > +		/* All flags below must be set / cleared under
> > > > notifier lock */
> > > > +		u16 unmapped : 1;
> > > > +		u16 partial_unmap : 1;
> > > > +		u16 has_vram_pages : 1;
> > > > +		u16 has_dma_mapping : 1;
> > > > +		u16 kfree_mapping : 1;
> > > > +	} flags;
> > > > +};
> > > > +
> > > > +/**
> > > > + * struct drm_gpusvm - GPU SVM structure
> > > > + *
> > > > + * @name: Name of the GPU SVM
> > > > + * @drm: Pointer to the DRM device structure
> > > > + * @mm: Pointer to the mm_struct for the address space
> > > > + * @device_private_page_owner: Device private pages owner
> > > > + * @mm_start: Start address of GPU SVM
> > > > + * @mm_range: Range of the GPU SVM
> > > > + * @notifier_size: Size of individual notifiers
> > > > + * @ops: Pointer to the operations structure for GPU SVM
> > > > + * @chunk_sizes: Pointer to the array of chunk sizes used in
> > > > range
> > > > allocation.
> > > > + *               Entries should be powers of 2 in descending
> > > > order.
> > > > + * @num_chunks: Number of chunks
> > > > + * @notifier_lock: Read-write semaphore for protecting notifier
> > > > operations
> > > > + * @zdd_wq: Workqueue for deferred work on zdd destruction
> > > > + * @root: Cached root node of the Red-Black tree containing GPU
> > > > SVM
> > > > notifiers
> > > > + * @notifier_list: List head of notifiers in the same order they appear in
> > > > + *                 the interval tree. This is useful to keep iterating over
> > > > + *                 notifiers while doing modifications to the RB tree.
> > > > + *
> > > > + * This structure represents a GPU SVM (Shared Virtual Memory)
> > > > used
> > > > for tracking
> > > > + * memory ranges mapped in a DRM (Direct Rendering Manager)
> > > > device.
> > > > + *
> > > > + * No reference counting is provided, as this is expected to be
> > > > embedded in the
> > > > + * driver VM structure along with the struct drm_gpuvm, which
> > > > handles reference
> > > > + * counting.
> > > > + */
> > > > +struct drm_gpusvm {
> > > > +	const char *name;
> > > > +	struct drm_device *drm;
> > > > +	struct mm_struct *mm;
> > > > +	void *device_private_page_owner;
> > > > +	u64 mm_start;
> > > > +	u64 mm_range;
> > > > +	u64 notifier_size;
> > > > +	const struct drm_gpusvm_ops *ops;
> > > > +	const u64 *chunk_sizes;
> > > > +	int num_chunks;
> > > > +	struct rw_semaphore notifier_lock;
> > > > +	struct workqueue_struct *zdd_wq;
> > > > +	struct rb_root_cached root;
> > > > +	struct list_head notifier_list;
> > > > +};
> > > > +
> > > > +/**
> > > > + * struct drm_gpusvm_ctx - DRM GPU SVM context
> > > > + *
> > > > + * @mmap_locked: mmap lock is locked
> > > > + * @trylock_mmap: trylock mmap lock, used to avoid locking inversions
> > > > + *                (e.g. dma-resv -> mmap lock)
> > > > + * @in_notifier: entering from a MMU notifier
> > > > + * @read_only: operating on read-only memory
> > > > + * @vram_possible: possible to use VRAM
> > > > + * @prefault: prefault pages
> > > > + *
> > > > + * Context in which DRM GPU SVM is operating (i.e. user arguments).
> > > > + */
> > > > +struct drm_gpusvm_ctx {
> > > > +	u32 mmap_locked :1;
> > > > +	u32 trylock_mmap :1;
> > > > +	u32 in_notifier :1;
> > > > +	u32 read_only :1;
> > > > +	u32 vram_possible :1;
> > > > +	u32 prefault :1;
> > > > +};
> > > > +
> > > > +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> > > > +		    const char *name, struct drm_device *drm,
> > > > +		    struct mm_struct *mm, void
> > > > *device_private_page_owner,
> > > > +		    u64 mm_start, u64 mm_range, u64
> > > > notifier_size,
> > > > +		    const struct drm_gpusvm_ops *ops,
> > > > +		    const u64 *chunk_sizes, int num_chunks);
> > > > +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm);
> > > > +void drm_gpusvm_free(struct drm_gpusvm *gpusvm);
> > > > +
> > > > +struct drm_gpusvm_range *
> > > > +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64
> > > > fault_addr,
> > > > +				u64 gpuva_start, u64 gpuva_end,
> > > > +				const struct drm_gpusvm_ctx
> > > > *ctx);
> > > > +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> > > > +			     struct drm_gpusvm_range *range);
> > > > +
> > > > +struct drm_gpusvm_range *
> > > > +drm_gpusvm_range_get(struct drm_gpusvm_range *range);
> > > > +void drm_gpusvm_range_put(struct drm_gpusvm_range *range);
> > > > +
> > > > +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> > > > +				  struct drm_gpusvm_range
> > > > *range);
> > > > +
> > > > +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> > > > +			       struct drm_gpusvm_range *range,
> > > > +			       const struct drm_gpusvm_ctx
> > > > *ctx);
> > > > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > > > +				  struct drm_gpusvm_range
> > > > *range,
> > > > +				  const struct drm_gpusvm_ctx
> > > > *ctx);
> > > > +
> > > > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > > > +			       struct drm_gpusvm_range *range,
> > > > +			       void *vram_allocation,
> > > > +			       const struct drm_gpusvm_ctx
> > > > *ctx);
> > > > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > > +			       struct drm_gpusvm_range *range,
> > > > +			       const struct drm_gpusvm_ctx
> > > > *ctx);
> > > > +
> > > > +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void);
> > > > +
> > > > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64
> > > > start,
> > > > u64 end);
> > > > +
> > > > +struct drm_gpusvm_range *
> > > > +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64
> > > > start, u64 end);
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_notifier_lock - Lock GPU SVM notifier
> > > > + * @gpusvm__: Pointer to the GPU SVM structure.
> > > > + *
> > > > + * Abstract client usage GPU SVM notifier lock, take lock
> > > > + */
> > > > +#define drm_gpusvm_notifier_lock(gpusvm__)	\
> > > > +	down_read(&(gpusvm__)->notifier_lock)
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_notifier_unlock - Unlock GPU SVM notifier
> > > > + * @gpusvm__: Pointer to the GPU SVM structure.
> > > > + *
> > > > + * Abstract client usage GPU SVM notifier lock, drop lock
> > > > + */
> > > > +#define drm_gpusvm_notifier_unlock(gpusvm__)	\
> > > > +	up_read(&(gpusvm__)->notifier_lock)
> > > > +
> > > > +/**
> > > > + * __drm_gpusvm_range_next - Get the next GPU SVM range in the
> > > > list
> > > > + * @range: a pointer to the current GPU SVM range
> > > > + *
> > > > + * Return: A pointer to the next drm_gpusvm_range if available,
> > > > or
> > > > NULL if the
> > > > + *         current range is the last one or if the input range
> > > > is
> > > > NULL.
> > > > + */
> > > > +static inline struct drm_gpusvm_range *
> > > > +__drm_gpusvm_range_next(struct drm_gpusvm_range *range)
> > > > +{
> > > > +	if (range && !list_is_last(&range->rb.entry,
> > > > +				   &range->notifier->range_list))
> > > > +		return list_next_entry(range, rb.entry);
> > > > +
> > > > +	return NULL;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_for_each_range - Iterate over GPU SVM ranges in a
> > > > notifier
> > > > + * @range__: Iterator variable for the ranges. If set, it
> > > > indicates
> > > > the start of
> > > > + *	     the iterator. If NULL, call drm_gpusvm_range_find()
> > > > to
> > > > get the range.
> > > > + * @notifier__: Pointer to the GPU SVM notifier
> > > > + * @start__: Start address of the range
> > > > + * @end__: End address of the range
> > > > + *
> > > > + * This macro is used to iterate over GPU SVM ranges in a
> > > > notifier.
> > > > It is safe
> > > > + * to use while holding the driver SVM lock or the notifier
> > > > lock.
> > > > + */
> > > > +#define drm_gpusvm_for_each_range(range__, notifier__, start__,
> > > > end__)	\
> > > > +	for ((range__) = (range__)
> > > > ?:					\
> > > > +	     drm_gpusvm_range_find((notifier__), (start__),
> > > > (end__));	\
> > > > +	     (range__) && (range__->va.start <
> > > > (end__));		\
> > > > +	     (range__) = __drm_gpusvm_range_next(range__))
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_range_set_unmapped - Mark a GPU SVM range as
> > > > unmapped
> > > > + * @range: Pointer to the GPU SVM range structure.
> > > > + * @mmu_range: Pointer to the MMU notifier range structure.
> > > > + *
> > > > + * This function marks a GPU SVM range as unmapped and sets the
> > > > partial_unmap flag
> > > > + * if the range partially falls within the provided MMU notifier
> > > > range.
> > > > + */
> > > > +static inline void
> > > > +drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range *range,
> > > > +			      const struct mmu_notifier_range
> > > > *mmu_range)
> > > > +{
> > > > +	lockdep_assert_held_write(&range->gpusvm->notifier_lock);
> > > > +
> > > > +	range->flags.unmapped = true;
> > > > +	if (range->va.start < mmu_range->start ||
> > > > +	    range->va.end > mmu_range->end)
> > > > +		range->flags.partial_unmap = true;
> > > > +}
> > > > +
> > > > +#endif /* __DRM_GPUSVM_H__ */
> > > 
>
Matthew Brost Aug. 30, 2024, 1:35 a.m. UTC | #16
On Thu, Aug 29, 2024 at 11:16:49AM +0200, Thomas Hellström wrote:
> Hi, Matt. 
> 
> Some initial design comments / questions:
> 

Hi, Thomas. Missed one question in initial reply.

> On Tue, 2024-08-27 at 19:48 -0700, Matthew Brost wrote:
> > This patch introduces support for GPU Shared Virtual Memory (SVM) in
> > the
> > Direct Rendering Manager (DRM) subsystem. SVM allows for seamless
> > sharing of memory between the CPU and GPU, enhancing performance and
> > flexibility in GPU computing tasks.
> > 
> > The patch adds the necessary infrastructure for SVM, including data
> > structures and functions for managing SVM ranges and notifiers. It
> > also
> > provides mechanisms for allocating, deallocating, and migrating
> > memory
> > regions between system RAM and GPU VRAM.
> > 
> > This mid-layer is largely inspired by GPUVM.
> > 
> > Cc: Dave Airlie <airlied@redhat.com>
> > Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > Cc: Christian König <christian.koenig@amd.com>
> > Cc: <dri-devel@lists.freedesktop.org>
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> >  drivers/gpu/drm/xe/Makefile     |    3 +-
> >  drivers/gpu/drm/xe/drm_gpusvm.c | 2174
> > +++++++++++++++++++++++++++++++
> >  drivers/gpu/drm/xe/drm_gpusvm.h |  415 ++++++
> >  3 files changed, 2591 insertions(+), 1 deletion(-)
> >  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.c
> >  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.h
> > 
> > diff --git a/drivers/gpu/drm/xe/Makefile
> > b/drivers/gpu/drm/xe/Makefile
> > index b9670ae09a9e..b8fc2ee58f1a 100644
> > --- a/drivers/gpu/drm/xe/Makefile
> > +++ b/drivers/gpu/drm/xe/Makefile
> > @@ -25,7 +25,8 @@ $(obj)/generated/%_wa_oob.c
> > $(obj)/generated/%_wa_oob.h: $(obj)/xe_gen_wa_oob \
> >  
> >  # core driver code
> >  
> > -xe-y += xe_bb.o \
> > +xe-y += drm_gpusvm.o \
> > +	xe_bb.o \
> >  	xe_bo.o \
> >  	xe_bo_evict.o \
> >  	xe_devcoredump.o \
> > diff --git a/drivers/gpu/drm/xe/drm_gpusvm.c
> > b/drivers/gpu/drm/xe/drm_gpusvm.c
> > new file mode 100644
> > index 000000000000..fc1e44e6ae72
> > --- /dev/null
> > +++ b/drivers/gpu/drm/xe/drm_gpusvm.c
> > @@ -0,0 +1,2174 @@
> > +// SPDX-License-Identifier: MIT
> > +/*
> > + * Copyright © 2024 Intel Corporation
> > + *
> > + * Authors:
> > + *     Matthew Brost <matthew.brost@intel.com>
> > + */
> > +
> > +#include <linux/dma-mapping.h>
> > +#include <linux/interval_tree_generic.h>
> > +#include <linux/hmm.h>
> > +#include <linux/memremap.h>
> > +#include <linux/migrate.h>
> > +#include <linux/mm_types.h>
> > +#include <linux/pagemap.h>
> > +#include <linux/slab.h>
> > +
> > +#include <drm/drm_device.h>
> > +#include "drm_gpusvm.h"
> > +
> > +/**
> > + * DOC: Overview
> > + *
> > + * GPU Shared Virtual Memory (GPU SVM) layer for the Direct
> > Rendering Manager (DRM)
> > + *
> > + * The GPU SVM layer is a component of the DRM framework designed to
> > manage shared
> > + * virtual memory between the CPU and GPU. It enables efficient data
> > exchange and
> > + * processing for GPU-accelerated applications by allowing memory
> > sharing and
> > + * synchronization between the CPU's and GPU's virtual address
> > spaces.
> > + *
> > + * Key GPU SVM Components:
> > + * - Notifiers: Notifiers: Used for tracking memory intervals and
> > notifying the
> > + *		GPU of changes, notifiers are sized based on a GPU
> > SVM
> > + *		initialization parameter, with a recommendation of
> > 512M or
> > + *		larger. They maintain a Red-BlacK tree and a list of
> > ranges that
> > + *		fall within the notifier interval. Notifiers are
> > tracked within
> > + *		a GPU SVM Red-BlacK tree and list and are
> > dynamically inserted
> > + *		or removed as ranges within the interval are created
> > or
> > + *		destroyed.
> 
> What is the benefit of this extra layer compared to direct insertion of
> ranges using mmu_interval_notifier_insert?
> 
> IIRC the argument made previously about having wide notifiers was that
> the rb tree lookups inside the core were costly and if there were only
> a few, then the rb tree lookups within a notifier range could be
> replaced with the page-table radix-tree-like lookup, so each lookup
> complexity would be O(log(n_notifiers) + page_table_depth).
> 
> But now we first have an rb-tree lookup in the core and then an rb-tree
> lookup within each notifier, yielding O(log(n_ranges)).
> 
> I can see a small benefit in that inserting directly into the core rb-
> tree will block pending ongoing invalidations, but at a cost of an
> extra multiplexing layer.
> 
> > + * - Ranges: Represent memory ranges mapped in a DRM device and
> > managed
> > + *	     by GPU SVM. They are sized based on an array of chunk
> > sizes, which
> > + *	     is a GPU SVM initialization parameter, and the CPU
> > address space.
> > + *	     Upon GPU fault, the largest aligned chunk that fits
> > within the
> > + *	     faulting CPU address space is chosen for the range
> > size. Ranges are
> > + *	     expected to be dynamically allocated on GPU fault and
> > removed on an
> > + *	     MMU notifier UNMAP event. As mentioned above, ranges
> > are tracked in
> > + *	     a notifier's Red-Black tree.
> 
> How do ranges and chunks map to
>  
> a) Prefaulting granularity

Well, we haven't implemented prefetch yet, but my initial thinking is that
prefetch is an IOCTL which would create N ranges based on chunk size. As an
optimization we can likely make prefetch interruptible (e.g. stop prefetch
if a GPU fault occurs so that fault is serviced first) and hopefully make
prefetch parallel rather than a completely serial operation like a GPU
fault. We can start with a simple serial implementation built on top of the
faulting code though, as sketched below.
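
To make that a bit more concrete, here is a rough sketch of what the serial
version could look like against the interfaces in this patch. The driver_*
helpers are the same hypothetical ones used in the DOC examples
(driver_gpu_fault_pending() is made up here), and I'm assuming the
ctx.prefault flag is intended for exactly this path:

static int driver_prefetch(struct drm_gpusvm *gpusvm, u64 start, u64 end,
			   u64 gpuva_start, u64 gpuva_end)
{
	struct drm_gpusvm_ctx ctx = { .prefault = true, };
	u64 addr = start;
	int err = 0;

	driver_svm_lock();
	while (addr < end) {
		struct drm_gpusvm_range *range;

		/* Largest aligned chunk that fits is picked internally */
		range = drm_gpusvm_range_find_or_insert(gpusvm, addr,
							gpuva_start, gpuva_end,
							&ctx);
		if (IS_ERR(range)) {
			err = PTR_ERR(range);
			break;
		}

		/* Migration failure is not fatal for prefetch, SRAM pages still work */
		if (driver_migration_policy(range))
			drm_gpusvm_migrate_to_vram(gpusvm, range,
						   driver_alloc_bo(), &ctx);

		err = drm_gpusvm_range_get_pages(gpusvm, range, &ctx);
		if (err)
			break;

		err = driver_bind_range(gpusvm, range);
		if (err)
			break;

		/* Advance by whatever chunk size was chosen for this range */
		addr = range->va.end;

		/* Interruptible variant: bail if a real GPU fault is pending */
		if (driver_gpu_fault_pending(gpusvm)) {
			err = -EAGAIN;
			break;
		}
	}
	driver_svm_unlock();

	return err;
}

Making it parallel would then, I think, mostly be a matter of farming the
per-range work out to a workqueue, since the ranges are independent once
created.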

As a further optimization it might be advantageous to trigger prefetch upon
a GPU fault too - e.g. service the fault for 1 range only, then trigger
prefetch for N ranges asynchronously. This could essentially be thought of
as a fault triggering a prefetch IOCTL. This would likely be controlled by a
madvise setting or perhaps a global modparam, roughly as below.
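
Something along these lines is what I have in mind for that variant (all of
the driver_vm / prefetch_work names here are made up, it's just to
illustrate the shape):

static bool prefetch_on_fault = true;	/* or a per-VM madvise-style setting */
module_param(prefetch_on_fault, bool, 0600);

static void driver_gpu_fault_serviced(struct driver_vm *vm, u64 fault_addr)
{
	if (!prefetch_on_fault)
		return;

	/* Kick prefetch of the N surrounding ranges outside the fault path */
	vm->prefetch_addr = fault_addr;
	queue_work(vm->svm_wq, &vm->prefetch_work);
}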

All of the above is likely to be done in the tuning phase, when we have UMDs
/ apps running and performance data to base it on. Or at least this is my
thinking.

We have Jira issues open for all of this, and I believe other engineers on
the team will own the implementation.

> b) Migration granularity?
> 

A chunk is the size of the range and the migration granularity. We choose
the largest chunk for a range that fits within both the GPU VMA and the CPU
VMA in an aligned manner. The chunks I'm currently using in Xe map to a
single GPU page (4k, 64k, or 2M). As I've mentioned this is flexible, so it
doesn't have to be 1 GPU page, but I started there as it makes a bit of
sense. Roughly, the setup looks like the sketch below.
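
For reference, the wiring I have in mind looks roughly like this - only
drm_gpusvm_init() and the 512M notifier-size recommendation come from this
patch, the driver_* / vm->* names are illustrative, and the 48-bit VA span
is just an example:

/* Driver vfuncs (copy_to_vram, invalidate, ...) go here */
static const struct drm_gpusvm_ops driver_gpusvm_ops = { /* ... */ };

/* Chunk sizes map 1:1 to Xe GPU page sizes, descending as drm_gpusvm_init()
 * expects.
 */
static const u64 driver_chunk_sizes[] = { SZ_2M, SZ_64K, SZ_4K };

static int driver_svm_init(struct driver_vm *vm)
{
	return drm_gpusvm_init(&vm->svm, "driver-svm", vm->drm,
			       current->mm, vm->dpagemap_owner,
			       0, 1ull << 48,	/* CPU VA span, example only */
			       SZ_512M,		/* recommended notifier size */
			       &driver_gpusvm_ops,
			       driver_chunk_sizes,
			       ARRAY_SIZE(driver_chunk_sizes));
}

So a 2M-aligned 2M fault ends up as one 2M range / one 2M migration, and
anything smaller falls back to 64K or 4K ranges.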

Matt

> > + * - Operations: Define the interface for driver-specific SVM
> > operations such as
> > + *		 allocation, page collection, migration,
> > invalidations, and VRAM
> > + *		 release.
> > + *
> > + * This layer provides interfaces for allocating, mapping,
> > migrating, and
> > + * releasing memory ranges between the CPU and GPU. It handles all
> > core memory
> > + * management interactions (DMA mapping, HMM, and migration) and
> > provides
> > + * driver-specific virtual functions (vfuncs). This infrastructure
> > is sufficient
> > + * to build the expected driver components for an SVM implementation
> > as detailed
> > + * below.
> > + *
> > + * Expected Driver Components:
> > + * - GPU page fault handler: Used to create ranges and notifiers
> > based on the
> > + *			     fault address, optionally migrate the
> > range to
> > + *			     VRAM, and create GPU bindings.
> > + * - Garbage collector: Used to destroy GPU bindings for ranges.
> > Ranges are
> > + *			expected to be added to the garbage
> > collector upon
> > + *			MMU_NOTIFY_UNMAP event.
> > + */
> > +
> > +/**
> > + * DOC: Locking
> > + *
> > + * GPU SVM handles locking for core MM interactions, i.e., it
> > locks/unlocks the
> > + * mmap lock as needed. Alternatively, if the driver prefers to
> > handle the mmap
> > + * lock itself, a 'locked' argument is provided to the functions
> > that require
> > + * the mmap lock. This option may be useful for drivers that need to
> > call into
> > + * GPU SVM while also holding a dma-resv lock, thus preventing
> > locking
> > + * inversions between the mmap and dma-resv locks.
> > + *
> > + * GPU SVM introduces a global notifier lock, which safeguards the
> > notifier's
> > + * range RB tree and list, as well as the range's DMA mappings and
> > sequence
> > + * number. GPU SVM manages all necessary locking and unlocking
> > operations,
> > + * except for the recheck of the range's sequence number
> > + * (mmu_interval_read_retry) when the driver is committing GPU
> > bindings. This
> > + * lock corresponds to the 'driver->update' lock mentioned in the
> > HMM
> > + * documentation (TODO: Link). Future revisions may transition from
> > a GPU SVM
> > + * global lock to a per-notifier lock if finer-grained locking is
> > deemed
> > + * necessary.
> > + *
> > + * In addition to the locking mentioned above, the driver should
> > implement a
> > + * lock to safeguard core GPU SVM function calls that modify state,
> > such as
> > + * drm_gpusvm_range_find_or_insert and drm_gpusvm_range_remove.
> > Alternatively,
> > + * these core functions can be called within a single kernel thread,
> > for
> > + * instance, using an ordered work queue. This lock is denoted as
> > + * 'driver_svm_lock' in code examples.
> > + */
> > +
> > +/**
> > + * DOC: Migration
> > + *
> > + * The migration support is quite simple, allowing migration between
> > SRAM and
> > + * VRAM at the range granularity. For example, GPU SVM currently
> > does not
> > + * support mixing SRAM and VRAM pages within a range. This means
> > that upon GPU
> > + * fault, the entire range can be migrated to VRAM, and upon CPU
> > fault, the
> > + * entire range is migrated to SRAM.
> > + *
> > + * The reasoning for only supporting range granularity is as
> > follows: it
> > + * simplifies the implementation, and range sizes are driver-defined
> > and should
> > + * be relatively small.
> > + */
> > +
> > +/**
> > + * DOC: Partial Unmapping of Ranges
> > + *
> > + * Partial unmapping of ranges (e.g., 1M out of 2M is unmapped by
> > CPU resulting
> > + * in MMU_NOTIFY_UNMAP event) presents several challenges, with the
> > main one
> > + * being that a subset of the range still has CPU and GPU mappings.
> > If the
> > + * backing store for the range is in VRAM, a subset of the backing
> > store has
> > + * references. One option would be to split the range and VRAM
> > backing store,
> > + * but the implementation for this would be quite complicated. Given
> > that
> > + * partial unmappings are rare and driver-defined range sizes are
> > relatively
> > + * small, GPU SVM does not support splitting of ranges.
> > + *
> > + * With no support for range splitting, upon partial unmapping of a
> > range, the
> > + * driver is expected to invalidate and destroy the entire range. If
> > the range
> > + * has VRAM as its backing, the driver is also expected to migrate
> > any remaining
> > + * pages back to SRAM.
> 
> So what happens if we get a one-page invalidation, say protection
> change event, or NUMA accounting event, in the middle of a range? Can
> we unmap just that single gpu pte covering that range, that is, how do
> the ranges map to invalidation granularity? Does this differ between
> igfx and dgfx?
> 
> Thanks,
> Thomas
> 
> 
> 
> 
> > + */
> > +
> > +/**
> > + * DOC: Examples
> > + *
> > + * This section provides two examples of how to build the expected
> > driver
> > + * components: the GPU page fault handler and the garbage collector.
> > A third
> > + * example demonstrates a sample invalidation driver vfunc.
> > + *
> > + * The generic code provided does not include logic for complex
> > migration
> > + * policies, optimized invalidations, or other potentially required
> > driver
> > + * locking (e.g., DMA-resv locks).
> > + *
> > + * 1) GPU page fault handler
> > + *
> > + *	int driver_bind_range(struct drm_gpusvm *gpusvm, struct
> > drm_gpusvm_range *range)
> > + *	{
> > + *		int err = 0;
> > + *
> > + *		driver_alloc_and_setup_memory_for_bind(gpusvm,
> > range);
> > + *
> > + *		drm_gpusvm_notifier_lock(gpusvm);
> > + *		if (drm_gpusvm_range_pages_valid(range))
> > + *			driver_commit_bind(gpusvm, range);
> > + *		else
> > + *			err = -EAGAIN;
> > + *		drm_gpusvm_notifier_unlock(gpusvm);
> > + *
> > + *		return err;
> > + *	}
> > + *
> > + *	int driver_gpu_fault(struct drm_gpusvm *gpusvm, u64
> > fault_addr,
> > + *			     u64 gpuva_start, u64 gpuva_end)
> > + *	{
> > + *		struct drm_gpusvm_ctx ctx = {};
> > + *		int err;
> > + *
> > + *		driver_svm_lock();
> > + *	retry:
> > + *		// Always process UNMAPs first so view of GPU SVM
> > ranges is current
> > + *		driver_garbage_collector(gpusvm);
> > + *
> > + *		range = drm_gpusvm_range_find_or_insert(gpusvm,
> > fault_addr,
> > + *							gpuva_start,
> > gpuva_end,
> > + *						        &ctx);
> > + *		if (IS_ERR(range)) {
> > + *			err = PTR_ERR(range);
> > + *			goto unlock;
> > + *		}
> > + *
> > + *		if (driver_migration_policy(range)) {
> > + *			bo = driver_alloc_bo();
> > + *			err = drm_gpusvm_migrate_to_vram(gpusvm,
> > range, bo, &ctx);
> > + *			if (err)	// CPU mappings may have
> > changed
> > + *				goto retry;
> > + *		}
> > + *
> > + *		err = drm_gpusvm_range_get_pages(gpusvm, range,
> > &ctx);
> > + *		if (err == -EFAULT || err == -EPERM)	// CPU
> > mappings changed
> > + *			goto retry;
> > + *		else if (err)
> > + *			goto unlock;
> > + *
> > + *		err = driver_bind_range(gpusvm, range);
> > + *		if (err == -EAGAIN)	// CPU mappings changed
> > + *			goto retry
> > + *
> > + *	unlock:
> > + *		driver_svm_unlock();
> > + *		return err;
> > + *	}
> > + *
> > + * 2) Garbage Collector.
> > + *
> > + *	void __driver_garbage_collector(struct drm_gpusvm *gpusvm,
> > + *					struct drm_gpusvm_range
> > *range)
> > + *	{
> > + *		struct drm_gpusvm_ctx ctx = {};
> > + *
> > + *		assert_driver_svm_locked(gpusvm);
> > + *
> > + *		// Partial unmap, migrate any remaining VRAM pages
> > back to SRAM
> > + *		if (range->flags.partial_unmap)
> > + *			drm_gpusvm_migrate_to_sram(gpusvm, range,
> > &ctx);
> > + *
> > + *		driver_unbind_range(range);
> > + *		drm_gpusvm_range_remove(gpusvm, range);
> > + *	}
> > + *
> > + *	void driver_garbage_collector(struct drm_gpusvm *gpusvm)
> > + *	{
> > + *		assert_driver_svm_locked(gpusvm);
> > + *
> > + *		for_each_range_in_garbage_collector(gpusvm, range)
> > + *			__driver_garbage_collector(gpusvm, range);
> > + *	}
> > + *
> > + * 3) Invalidation driver vfunc.
> > + *
> > + *	void driver_invalidation(struct drm_gpusvm *gpusvm,
> > + *				 struct drm_gpusvm_notifier
> > *notifier,
> > + *				 const struct mmu_notifier_range
> > *mmu_range)
> > + *	{
> > + *		struct drm_gpusvm_ctx ctx = { .in_notifier = true,
> > };
> > + *		struct drm_gpusvm_range *range = NULL;
> > + *
> > + *		driver_invalidate_device_tlb(gpusvm, mmu_range->start, mmu_range->end);
> > + *
> > + *		drm_gpusvm_for_each_range(range, notifier,
> > mmu_range->start,
> > + *					  mmu_range->end) {
> > + *			drm_gpusvm_range_unmap_pages(gpusvm, range,
> > &ctx);
> > + *
> > + *			if (mmu_range->event != MMU_NOTIFY_UNMAP)
> > + *				continue;
> > + *
> > + *			drm_gpusvm_range_set_unmapped(range,
> > mmu_range);
> > + *			driver_garbage_collector_add(gpusvm, range);
> > + *		}
> > + *	}
> > + */
> > +
> > +#define DRM_GPUSVM_RANGE_START(_range)	((_range)->va.start)
> > +#define DRM_GPUSVM_RANGE_END(_range)	((_range)->va.end - 1)
> > +INTERVAL_TREE_DEFINE(struct drm_gpusvm_range, rb.node, u64,
> > rb.__subtree_last,
> > +		     DRM_GPUSVM_RANGE_START, DRM_GPUSVM_RANGE_END,
> > +		     static __maybe_unused, range);
> > +
> > +#define DRM_GPUSVM_NOTIFIER_START(_notifier)	((_notifier)->interval.start)
> > +#define DRM_GPUSVM_NOTIFIER_END(_notifier)	((_notifier)->interval.end - 1)
> > +INTERVAL_TREE_DEFINE(struct drm_gpusvm_notifier, rb.node, u64,
> > +		     rb.__subtree_last, DRM_GPUSVM_NOTIFIER_START,
> > +		     DRM_GPUSVM_NOTIFIER_END, static __maybe_unused,
> > notifier);
> > +
> > +/**
> > + * npages_in_range() - Calculate the number of pages in a given
> > range
> > + * @start__: The start address of the range
> > + * @end__: The end address of the range
> > + *
> > + * This macro calculates the number of pages in a given memory
> > range,
> > + * specified by the start and end addresses. It divides the
> > difference
> > + * between the end and start addresses by the page size (PAGE_SIZE)
> > to
> > + * determine the number of pages in the range.
> > + *
> > + * Return: The number of pages in the specified range.
> > + */
> > +#define npages_in_range(start__, end__)	\
> > +	(((end__) - (start__)) >> PAGE_SHIFT)
> > +
> > +/**
> > + * struct drm_gpusvm_zdd - GPU SVM zone device data
> > + *
> > + * @refcount: Reference count for the zdd
> > + * @destroy_work: Work structure for asynchronous zdd destruction
> > + * @range: Pointer to the GPU SVM range
> > + * @vram_allocation: Driver-private pointer to the VRAM allocation
> > + *
> > + * This structure serves as a generic wrapper installed in
> > + * page->zone_device_data. It provides infrastructure for looking up
> > a range
> > + * upon CPU page fault and asynchronously releasing VRAM once the
> > CPU has no
> > + * page references. Asynchronous release is useful because CPU page
> > references
> > + * can be dropped in IRQ contexts, while releasing VRAM likely
> > requires sleeping
> > + * locks.
> > + */
> > +struct drm_gpusvm_zdd {
> > +	struct kref refcount;
> > +	struct work_struct destroy_work;
> > +	struct drm_gpusvm_range *range;
> > +	void *vram_allocation;
> > +};
> > +
> > +/**
> > + * drm_gpusvm_zdd_destroy_work_func - Work function for destroying a
> > zdd
> > + * @w: Pointer to the work_struct
> > + *
> > + * This function releases VRAM, puts GPU SVM range, and frees zdd.
> > + */
> > +static void drm_gpusvm_zdd_destroy_work_func(struct work_struct *w)
> > +{
> > +	struct drm_gpusvm_zdd *zdd =
> > +		container_of(w, struct drm_gpusvm_zdd,
> > destroy_work);
> > +	struct drm_gpusvm_range *range = zdd->range;
> > +	struct drm_gpusvm *gpusvm = range->gpusvm;
> > +
> > +	if (gpusvm->ops->vram_release && zdd->vram_allocation)
> > +		gpusvm->ops->vram_release(zdd->vram_allocation);
> > +	drm_gpusvm_range_put(range);
> > +	kfree(zdd);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_zdd_alloc - Allocate a zdd structure.
> > + * @range: Pointer to the GPU SVM range.
> > + *
> > + * This function allocates and initializes a new zdd structure. It
> > sets up the
> > + * reference count, initializes the destroy work, and links the
> > provided GPU SVM
> > + * range.
> > + *
> > + * Returns:
> > + * Pointer to the allocated zdd on success, ERR_PTR() on failure.
> > + */
> > +static struct drm_gpusvm_zdd *
> > +drm_gpusvm_zdd_alloc(struct drm_gpusvm_range *range)
> > +{
> > +	struct drm_gpusvm_zdd *zdd;
> > +
> > +	zdd = kmalloc(sizeof(*zdd), GFP_KERNEL);
> > +	if (!zdd)
> > +		return NULL;
> > +
> > +	kref_init(&zdd->refcount);
> > +	INIT_WORK(&zdd->destroy_work,
> > drm_gpusvm_zdd_destroy_work_func);
> > +	zdd->range = drm_gpusvm_range_get(range);
> > +	zdd->vram_allocation = NULL;
> > +
> > +	return zdd;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_zdd_get - Get a reference to a zdd structure.
> > + * @zdd: Pointer to the zdd structure.
> > + *
> > + * This function increments the reference count of the provided zdd
> > structure.
> > + *
> > + * Returns: Pointer to the zdd structure.
> > + */
> > +static struct drm_gpusvm_zdd *drm_gpusvm_zdd_get(struct
> > drm_gpusvm_zdd *zdd)
> > +{
> > +	kref_get(&zdd->refcount);
> > +	return zdd;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_zdd_destroy - Destroy a zdd structure.
> > + * @ref: Pointer to the reference count structure.
> > + *
> > + * This function queues the destroy_work of the zdd for asynchronous
> > destruction.
> > + */
> > +static void drm_gpusvm_zdd_destroy(struct kref *ref)
> > +{
> > +	struct drm_gpusvm_zdd *zdd =
> > +		container_of(ref, struct drm_gpusvm_zdd, refcount);
> > +	struct drm_gpusvm *gpusvm = zdd->range->gpusvm;
> > +
> > +	queue_work(gpusvm->zdd_wq, &zdd->destroy_work);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_zdd_put - Put a zdd reference.
> > + * @zdd: Pointer to the zdd structure.
> > + *
> > + * This function decrements the reference count of the provided zdd
> > structure
> > + * and schedules its destruction if the count drops to zero.
> > + */
> > +static void drm_gpusvm_zdd_put(struct drm_gpusvm_zdd *zdd)
> > +{
> > +	kref_put(&zdd->refcount, drm_gpusvm_zdd_destroy);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_find - Find GPU SVM range from GPU SVM notifier
> > + * @notifier: Pointer to the GPU SVM notifier structure.
> > + * @start: Start address of the range
> > + * @end: End address of the range
> > + *
> > + * Return: A pointer to the drm_gpusvm_range if found or NULL
> > + */
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64
> > start, u64 end)
> > +{
> > +	return range_iter_first(&notifier->root, start, end - 1);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_for_each_range_safe - Safely iterate over GPU SVM
> > ranges in a notifier
> > + * @range__: Iterator variable for the ranges
> > + * @next__: Iterator variable for the ranges temporary storage
> > + * @notifier__: Pointer to the GPU SVM notifier
> > + * @start__: Start address of the range
> > + * @end__: End address of the range
> > + *
> > + * This macro is used to iterate over GPU SVM ranges in a notifier
> > while
> > + * removing ranges from it.
> > + */
> > +#define drm_gpusvm_for_each_range_safe(range__, next__, notifier__,
> > start__, end__)	\
> > +	for ((range__) = drm_gpusvm_range_find((notifier__),
> > (start__), (end__)),	\
> > +	     (next__) =
> > __drm_gpusvm_range_next(range__);				\
> > +	     (range__) && (range__->va.start <
> > (end__));				\
> > +	     (range__) = (next__), (next__) =
> > __drm_gpusvm_range_next(range__))
> > +
> > +/**
> > + * __drm_gpusvm_notifier_next - get the next drm_gpusvm_notifier in
> > the list
> > + * @notifier: a pointer to the current drm_gpusvm_notifier
> > + *
> > + * Return: A pointer to the next drm_gpusvm_notifier if available,
> > or NULL if
> > + *         the current notifier is the last one or if the input
> > notifier is
> > + *         NULL.
> > + */
> > +static struct drm_gpusvm_notifier *
> > +__drm_gpusvm_notifier_next(struct drm_gpusvm_notifier *notifier)
> > +{
> > +	if (notifier && !list_is_last(&notifier->rb.entry,
> > +				      &notifier->gpusvm-
> > >notifier_list))
> > +		return list_next_entry(notifier, rb.entry);
> > +
> > +	return NULL;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_for_each_notifier - Iterate over GPU SVM notifiers in
> > a gpusvm
> > + * @notifier__: Iterator variable for the notifiers
> > + * @gpusvm__: Pointer to the GPU SVM structure
> > + * @start__: Start address of the notifier
> > + * @end__: End address of the notifier
> > + *
> > + * This macro is used to iterate over GPU SVM notifiers in a gpusvm.
> > + */
> > +#define drm_gpusvm_for_each_notifier(notifier__, gpusvm__, start__,
> > end__)		\
> > +	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root,
> > (start__), (end__) - 1);	\
> > +	     (notifier__) && (notifier__->interval.start <
> > (end__));			\
> > +	     (notifier__) = __drm_gpusvm_notifier_next(notifier__))
> > +
> > +/**
> > + * drm_gpusvm_for_each_notifier_safe - Safely iterate over GPU SVM
> > notifiers in a gpusvm
> > + * @notifier__: Iterator variable for the notifiers
> > + * @next__: Iterator variable for the notifiers temporary storage
> > + * @gpusvm__: Pointer to the GPU SVM structure
> > + * @start__: Start address of the notifier
> > + * @end__: End address of the notifier
> > + *
> > + * This macro is used to iterate over GPU SVM notifiers in a gpusvm
> > while
> > + * removing notifiers from it.
> > + */
> > +#define drm_gpusvm_for_each_notifier_safe(notifier__, next__,
> > gpusvm__, start__, end__)	\
> > +	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root,
> > (start__), (end__) - 1),	\
> > +	     (next__) =
> > __drm_gpusvm_notifier_next(notifier__);				\
> > +	     (notifier__) && (notifier__->interval.start <
> > (end__));			\
> > +	     (notifier__) = (next__), (next__) =
> > __drm_gpusvm_notifier_next(notifier__))
> > +
> > +/**
> > + * drm_gpusvm_notifier_invalidate - Invalidate a GPU SVM notifier.
> > + * @mni: Pointer to the mmu_interval_notifier structure.
> > + * @mmu_range: Pointer to the mmu_notifier_range structure.
> > + * @cur_seq: Current sequence number.
> > + *
> > + * This function serves as a generic MMU notifier for GPU SVM. It
> > sets the MMU
> > + * notifier sequence number and calls the driver invalidate vfunc
> > under
> > + * gpusvm->notifier_lock.
> > + *
> > + * Returns:
> > + * true if the operation succeeds, false otherwise.
> > + */
> > +static bool
> > +drm_gpusvm_notifier_invalidate(struct mmu_interval_notifier *mni,
> > +			       const struct mmu_notifier_range
> > *mmu_range,
> > +			       unsigned long cur_seq)
> > +{
> > +	struct drm_gpusvm_notifier *notifier =
> > +		container_of(mni, typeof(*notifier), notifier);
> > +	struct drm_gpusvm *gpusvm = notifier->gpusvm;
> > +
> > +	if (!mmu_notifier_range_blockable(mmu_range))
> > +		return false;
> > +
> > +	down_write(&gpusvm->notifier_lock);
> > +	mmu_interval_set_seq(mni, cur_seq);
> > +	gpusvm->ops->invalidate(gpusvm, notifier, mmu_range);
> > +	up_write(&gpusvm->notifier_lock);
> > +
> > +	return true;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_notifier_ops - MMU interval notifier operations for
> > GPU SVM
> > + */
> > +static const struct mmu_interval_notifier_ops
> > drm_gpusvm_notifier_ops = {
> > +	.invalidate = drm_gpusvm_notifier_invalidate,
> > +};
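
As an aside for readers following along: the invalidate vfunc is called here
with gpusvm->notifier_lock held in write mode, and drm_gpusvm_range_unmap_pages()
later in this file is documented as mandatory for every range in the notifier.
A rough driver-side sketch (my_driver_zap_ptes() is made up; the authoritative
vfunc prototype is the one in drm_gpusvm.h):

	static void my_driver_invalidate(struct drm_gpusvm *gpusvm,
					 struct drm_gpusvm_notifier *notifier,
					 const struct mmu_notifier_range *mmu_range)
	{
		struct drm_gpusvm_ctx ctx = { .in_notifier = true, };
		struct drm_gpusvm_range *range = NULL;

		/* Zap GPU PTEs covering the invalidated span */
		my_driver_zap_ptes(gpusvm, mmu_range->start, mmu_range->end);

		/* Required for the IOMMU security model: drop DMA mappings */
		drm_gpusvm_for_each_range(range, notifier, mmu_range->start,
					  mmu_range->end)
			drm_gpusvm_range_unmap_pages(gpusvm, range, &ctx);
	}
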
> > +
> > +/**
> > + * drm_gpusvm_init - Initialize the GPU SVM.
> > + * @gpusvm: Pointer to the GPU SVM structure.
> > + * @name: Name of the GPU SVM.
> > + * @drm: Pointer to the DRM device structure.
> > + * @mm: Pointer to the mm_struct for the address space.
> > + * @device_private_page_owner: Device private pages owner.
> > + * @mm_start: Start address of GPU SVM.
> > + * @mm_range: Range of the GPU SVM.
> > + * @notifier_size: Size of individual notifiers.
> > + * @ops: Pointer to the operations structure for GPU SVM.
> > + * @chunk_sizes: Pointer to the array of chunk sizes used in range
> > allocation.
> > + *               Entries should be powers of 2 in descending order
> > with last
> > + *               entry being SZ_4K.
> > + * @num_chunks: Number of chunks.
> > + *
> > + * This function initializes the GPU SVM.
> > + *
> > + * Returns:
> > + * 0 on success, a negative error code on failure.
> > + */
> > +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> > +		    const char *name, struct drm_device *drm,
> > +		    struct mm_struct *mm, void
> > *device_private_page_owner,
> > +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> > +		    const struct drm_gpusvm_ops *ops,
> > +		    const u64 *chunk_sizes, int num_chunks)
> > +{
> > +	if (!ops->invalidate || !num_chunks)
> > +		return -EINVAL;
> > +
> > +	gpusvm->name = name;
> > +	gpusvm->drm = drm;
> > +	gpusvm->mm = mm;
> > +	gpusvm->device_private_page_owner =
> > device_private_page_owner;
> > +	gpusvm->mm_start = mm_start;
> > +	gpusvm->mm_range = mm_range;
> > +	gpusvm->notifier_size = notifier_size;
> > +	gpusvm->ops = ops;
> > +	gpusvm->chunk_sizes = chunk_sizes;
> > +	gpusvm->num_chunks = num_chunks;
> > +	gpusvm->zdd_wq = system_wq;
> > +
> > +	mmgrab(mm);
> > +	gpusvm->root = RB_ROOT_CACHED;
> > +	INIT_LIST_HEAD(&gpusvm->notifier_list);
> > +
> > +	init_rwsem(&gpusvm->notifier_lock);
> > +
> > +	fs_reclaim_acquire(GFP_KERNEL);
> > +	might_lock(&gpusvm->notifier_lock);
> > +	fs_reclaim_release(GFP_KERNEL);
> > +
> > +	return 0;
> > +}
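
To make the init constraints above concrete, a minimal driver-side setup could
look like the following; struct my_vm, the ops table contents and the 512M /
47-bit numbers are purely illustrative:

	static const struct drm_gpusvm_ops my_ops = {
		.invalidate = my_driver_invalidate,	/* sketched above */
	};

	/* Powers of two in descending order, last entry must be SZ_4K */
	static const u64 my_chunk_sizes[] = { SZ_2M, SZ_64K, SZ_4K };

	int my_driver_svm_init(struct my_vm *vm, struct drm_device *drm)
	{
		return drm_gpusvm_init(&vm->svm, "my-gpusvm", drm, current->mm,
				       vm /* device-private page owner */,
				       0, 1ull << 47 /* mirrored CPU VA span */,
				       SZ_512M /* notifier granularity */,
				       &my_ops, my_chunk_sizes,
				       ARRAY_SIZE(my_chunk_sizes));
	}
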
> > +
> > +/**
> > + * drm_gpusvm_notifier_find - Find GPU SVM notifier
> > + * @gpusvm__: Pointer to the GPU SVM structure
> > + * @fault_addr__: Fault address
> > + *
> > + * This macro finds the GPU SVM notifier associated with the fault
> > address.
> > + *
> > + * Returns:
> > + * Pointer to the GPU SVM notifier on success, NULL otherwise.
> > + */
> > +#define drm_gpusvm_notifier_find(gpusvm__, fault_addr__)	\
> > +	notifier_iter_first(&(gpusvm__)->root, (fault_addr__),	\
> > +			    (fault_addr__ + 1))
> > +
> > +/**
> > + * to_drm_gpusvm_notifier - retrieve the container struct for a
> > given rbtree node
> > + * @node__: a pointer to the rbtree node embedded within a
> > drm_gpusvm_notifier struct
> > + *
> > + * Return: A pointer to the containing drm_gpusvm_notifier
> > structure.
> > + */
> > +#define to_drm_gpusvm_notifier(node__)				\
> > +	container_of((node__), struct drm_gpusvm_notifier, rb.node)
> > +
> > +/**
> > + * drm_gpusvm_notifier_insert - Insert GPU SVM notifier
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + *
> > + * This function inserts the GPU SVM notifier into the GPU SVM RB
> > tree and list.
> > + */
> > +static void drm_gpusvm_notifier_insert(struct drm_gpusvm *gpusvm,
> > +				       struct drm_gpusvm_notifier
> > *notifier)
> > +{
> > +	struct rb_node *node;
> > +	struct list_head *head;
> > +
> > +	notifier_insert(notifier, &gpusvm->root);
> > +
> > +	node = rb_prev(&notifier->rb.node);
> > +	if (node)
> > +		head = &(to_drm_gpusvm_notifier(node))->rb.entry;
> > +	else
> > +		head = &gpusvm->notifier_list;
> > +
> > +	list_add(&notifier->rb.entry, head);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_notifier_remove - Remove GPU SVM notifier
> > + * @gpusvm__: Pointer to the GPU SVM structure
> > + * @notifier__: Pointer to the GPU SVM notifier structure
> > + *
> > + * This macro removes the GPU SVM notifier from the GPU SVM RB tree
> > and list.
> > + */
> > +#define drm_gpusvm_notifier_remove(gpusvm__, notifier__)	\
> > +	do {							\
> > +		notifier_remove((notifier__), &(gpusvm__)->root); \
> > +		list_del(&(notifier__)->rb.entry);		\
> > +	} while (0)
> > +
> > +/**
> > + * drm_gpusvm_fini - Finalize the GPU SVM.
> > + * @gpusvm: Pointer to the GPU SVM structure.
> > + *
> > + * This function finalizes the GPU SVM by cleaning up any remaining
> > ranges and
> > + * notifiers, and dropping a reference to struct MM.
> > + */
> > +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm)
> > +{
> > +	struct drm_gpusvm_notifier *notifier, *next;
> > +
> > +	drm_gpusvm_for_each_notifier_safe(notifier, next, gpusvm, 0,
> > LONG_MAX) {
> > +		struct drm_gpusvm_range *range, *__next;
> > +
> > +		/*
> > +		 * Remove notifier first to avoid racing with any
> > invalidation
> > +		 */
> > +		mmu_interval_notifier_remove(&notifier->notifier);
> > +		notifier->flags.removed = true;
> > +
> > +		drm_gpusvm_for_each_range_safe(range, __next,
> > notifier, 0,
> > +					       LONG_MAX)
> > +			drm_gpusvm_range_remove(gpusvm, range);
> > +	}
> > +
> > +	mmdrop(gpusvm->mm);
> > +	WARN_ON(!RB_EMPTY_ROOT(&gpusvm->root.rb_root));
> > +}
> > +
> > +/**
> > + * drm_gpusvm_notifier_alloc - Allocate GPU SVM notifier
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @fault_addr: Fault address
> > + *
> > + * This function allocates and initializes the GPU SVM notifier
> > structure.
> > + *
> > + * Returns:
> > + * Pointer to the allocated GPU SVM notifier on success, ERR_PTR()
> > on failure.
> > + */
> > +static struct drm_gpusvm_notifier *
> > +drm_gpusvm_notifier_alloc(struct drm_gpusvm *gpusvm, u64 fault_addr)
> > +{
> > +	struct drm_gpusvm_notifier *notifier;
> > +
> > +	if (gpusvm->ops->notifier_alloc)
> > +		notifier = gpusvm->ops->notifier_alloc();
> > +	else
> > +		notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
> > +
> > +	if (!notifier)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	notifier->gpusvm = gpusvm;
> > +	notifier->interval.start = ALIGN_DOWN(fault_addr, gpusvm-
> > >notifier_size);
> > +	notifier->interval.end = ALIGN(fault_addr + 1, gpusvm-
> > >notifier_size);
> > +	INIT_LIST_HEAD(&notifier->rb.entry);
> > +	notifier->root = RB_ROOT_CACHED;
> > +	INIT_LIST_HEAD(&notifier->range_list);
> > +
> > +	return notifier;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_notifier_free - Free GPU SVM notifier
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + *
> > + * This function frees the GPU SVM notifier structure.
> > + */
> > +static void drm_gpusvm_notifier_free(struct drm_gpusvm *gpusvm,
> > +				     struct drm_gpusvm_notifier
> > *notifier)
> > +{
> > +	WARN_ON(!RB_EMPTY_ROOT(&notifier->root.rb_root));
> > +
> > +	if (gpusvm->ops->notifier_free)
> > +		gpusvm->ops->notifier_free(notifier);
> > +	else
> > +		kfree(notifier);
> > +}
> > +
> > +/**
> > + * to_drm_gpusvm_range - retrieve the container struct for a given
> > rbtree node
> > + * @node__: a pointer to the rbtree node embedded within a
> > drm_gpusvm_range struct
> > + *
> > + * Return: A pointer to the containing drm_gpusvm_range structure.
> > + */
> > +#define to_drm_gpusvm_range(node__)	\
> > +	container_of((node__), struct drm_gpusvm_range, rb.node)
> > +
> > +/**
> > + * drm_gpusvm_range_insert - Insert GPU SVM range
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * This function inserts the GPU SVM range into the notifier RB tree
> > and list.
> > + */
> > +static void drm_gpusvm_range_insert(struct drm_gpusvm_notifier
> > *notifier,
> > +				    struct drm_gpusvm_range *range)
> > +{
> > +	struct rb_node *node;
> > +	struct list_head *head;
> > +
> > +	drm_gpusvm_notifier_lock(notifier->gpusvm);
> > +	range_insert(range, &notifier->root);
> > +
> > +	node = rb_prev(&range->rb.node);
> > +	if (node)
> > +		head = &(to_drm_gpusvm_range(node))->rb.entry;
> > +	else
> > +		head = &notifier->range_list;
> > +
> > +	list_add(&range->rb.entry, head);
> > +	drm_gpusvm_notifier_unlock(notifier->gpusvm);
> > +}
> > +
> > +/**
> > + * __drm_gpusvm_range_remove - Remove GPU SVM range
> > + * @notifier__: Pointer to the GPU SVM notifier structure
> > + * @range__: Pointer to the GPU SVM range structure
> > + *
> > + * This macro removes the GPU SVM range from the notifier RB tree
> > and list.
> > + */
> > +#define __drm_gpusvm_range_remove(notifier__, range__)		\
> > +	do {							\
> > +		range_remove((range__), &(notifier__)->root);	\
> > +		list_del(&(range__)->rb.entry);			\
> > +	} while (0)
> > +
> > +/**
> > + * drm_gpusvm_range_alloc - Allocate GPU SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + * @fault_addr: Fault address
> > + * @chunk_size: Chunk size
> > + * @migrate_vram: Flag indicating whether to migrate VRAM
> > + *
> > + * This function allocates and initializes the GPU SVM range
> > structure.
> > + *
> > + * Returns:
> > + * Pointer to the allocated GPU SVM range on success, ERR_PTR() on
> > failure.
> > + */
> > +static struct drm_gpusvm_range *
> > +drm_gpusvm_range_alloc(struct drm_gpusvm *gpusvm,
> > +		       struct drm_gpusvm_notifier *notifier,
> > +		       u64 fault_addr, u64 chunk_size, bool
> > migrate_vram)
> > +{
> > +	struct drm_gpusvm_range *range;
> > +
> > +	if (gpusvm->ops->range_alloc)
> > +		range = gpusvm->ops->range_alloc(gpusvm);
> > +	else
> > +		range = kzalloc(sizeof(*range), GFP_KERNEL);
> > +
> > +	if (!range)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	kref_init(&range->refcount);
> > +	range->gpusvm = gpusvm;
> > +	range->notifier = notifier;
> > +	range->va.start = ALIGN_DOWN(fault_addr, chunk_size);
> > +	range->va.end = ALIGN(fault_addr + 1, chunk_size);
> > +	INIT_LIST_HEAD(&range->rb.entry);
> > +	range->notifier_seq = LONG_MAX;
> > +	range->flags.migrate_vram = migrate_vram ? 1 : 0;
> > +
> > +	return range;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_check_pages - Check pages
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + * @start: Start address
> > + * @end: End address
> > + *
> > + * Check if pages between start and end have been faulted in on the
> > CPU. Used to
> > + * prevent migration of pages without CPU backing store.
> > + *
> > + * Returns:
> > + * True if pages have been faulted into CPU, False otherwise
> > + */
> > +static bool drm_gpusvm_check_pages(struct drm_gpusvm *gpusvm,
> > +				   struct drm_gpusvm_notifier
> > *notifier,
> > +				   u64 start, u64 end)
> > +{
> > +	struct hmm_range hmm_range = {
> > +		.default_flags = 0,
> > +		.notifier = &notifier->notifier,
> > +		.start = start,
> > +		.end = end,
> > +		.dev_private_owner = gpusvm-
> > >device_private_page_owner,
> > +	};
> > +	unsigned long timeout =
> > +		jiffies +
> > msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> > +	unsigned long *pfns;
> > +	unsigned long npages = npages_in_range(start, end);
> > +	int err, i;
> > +
> > +	mmap_assert_locked(gpusvm->mm);
> > +
> > +	pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
> > +	if (!pfns)
> > +		return false;
> > +
> > +	hmm_range.notifier_seq = mmu_interval_read_begin(&notifier-
> > >notifier);
> > +	hmm_range.hmm_pfns = pfns;
> > +
> > +	while (true) {
> > +		err = hmm_range_fault(&hmm_range);
> > +		if (err == -EBUSY) {
> > +			if (time_after(jiffies, timeout))
> > +				break;
> > +
> > +			hmm_range.notifier_seq =
> > mmu_interval_read_begin(&notifier->notifier);
> > +			continue;
> > +		}
> > +		break;
> > +	}
> > +	if (err)
> > +		goto err_free;
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		if (!(pfns[i] & HMM_PFN_VALID)) {
> > +			err = -EFAULT;
> > +			goto err_free;
> > +		}
> > +	}
> > +
> > +err_free:
> > +	kvfree(pfns);
> > +	return err ? false : true;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_chunk_size - Determine chunk size for GPU SVM
> > range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + * @vas: Pointer to the virtual memory area structure
> > + * @fault_addr: Fault address
> > + * @gpuva_start: Start address of GPUVA which mirrors CPU
> > + * @gpuva_end: End address of GPUVA which mirrors CPU
> > + * @check_pages: Flag indicating whether to check pages
> > + *
> > + * This function determines the chunk size for the GPU SVM range
> > based on the
> > + * fault address, GPU SVM chunk sizes, existing GPU SVM ranges, and
> > the virtual
> > + * memory area boundaries.
> > + *
> > + * Returns:
> > + * Chunk size on success, LONG_MAX on failure.
> > + */
> > +static u64 drm_gpusvm_range_chunk_size(struct drm_gpusvm *gpusvm,
> > +				       struct drm_gpusvm_notifier
> > *notifier,
> > +				       struct vm_area_struct *vas,
> > +				       u64 fault_addr, u64
> > gpuva_start,
> > +				       u64 gpuva_end, bool
> > check_pages)
> > +{
> > +	u64 start, end;
> > +	int i = 0;
> > +
> > +retry:
> > +	for (; i < gpusvm->num_chunks; ++i) {
> > +		start = ALIGN_DOWN(fault_addr, gpusvm-
> > >chunk_sizes[i]);
> > +		end = ALIGN(fault_addr + 1, gpusvm->chunk_sizes[i]);
> > +
> > +		if (start >= vas->vm_start && end <= vas->vm_end &&
> > +		    start >= notifier->interval.start &&
> > +		    end <= notifier->interval.end &&
> > +		    start >= gpuva_start && end <= gpuva_end)
> > +			break;
> > +	}
> > +
> > +	if (i == gpusvm->num_chunks)
> > +		return LONG_MAX;
> > +
> > +	/*
> > +	 * If allocating more than a page, ensure the range does not
> > overlap with existing
> > +	 * ranges.
> > +	 */
> > +	if (end - start != SZ_4K) {
> > +		struct drm_gpusvm_range *range;
> > +
> > +		range = drm_gpusvm_range_find(notifier, start, end);
> > +		if (range) {
> > +			++i;
> > +			goto retry;
> > +		}
> > +
> > +		/*
> > +		 * XXX: Only create range on pages CPU has faulted
> > in. Without
> > +		 * this check, or prefault, on BMG
> > 'xe_exec_system_allocator --r
> > +		 * process-many-malloc' fails. In the failure case,
> > each process
> > +		 * mallocs 16k but the CPU VMA is ~128k which
> > results in 64k SVM
> > +		 * ranges. When migrating the SVM ranges, some
> > processes fail in
> > +		 * drm_gpusvm_migrate_to_vram with 'migrate.cpages
> > != npages'
> > +		 * and then upon drm_gpusvm_range_get_pages device
> > pages from
> > +		 * other processes are collected + faulted in which
> > creates all
> > +		 * sorts of problems. Unsure exactly how this
> > happening, also
> > +		 * problem goes away if 'xe_exec_system_allocator --
> > r
> > +		 * process-many-malloc' mallocs at least 64k at a
> > time.
> > +		 */
> > +		if (check_pages &&
> > +		    !drm_gpusvm_check_pages(gpusvm, notifier, start,
> > end)) {
> > +			++i;
> > +			goto retry;
> > +		}
> > +	}
> > +
> > +	return end - start;
> > +}
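
A quick worked example of the selection above: with chunk sizes { SZ_2M,
SZ_64K, SZ_4K } and a fault at 0x101000 inside a VMA covering [0x100000,
0x140000), the 2M candidate ALIGN_DOWN(0x101000, SZ_2M) = 0 fails the
vm_start check, while the 64K candidate [0x100000, 0x110000) fits, so
(assuming it also fits the notifier interval and the GPUVA bounds, and the
check_pages test passes) a 64K range is created.
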
> > +
> > +/**
> > + * drm_gpusvm_range_find_or_insert - Find or insert GPU SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @fault_addr: Fault address
> > + * @gpuva_start: Start address of GPUVA which mirrors CPU
> > + * @gpuva_end: End address of GPUVA which mirrors CPU
> > + * @ctx: GPU SVM context
> > + *
> > + * This function finds or inserts a newly allocated GPU SVM range
> > based on the
> > + * fault address. Caller must hold a lock to protect range lookup
> > and insertion.
> > + *
> > + * Returns:
> > + * Pointer to the GPU SVM range on success, ERR_PTR() on failure.
> > + */
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64
> > fault_addr,
> > +				u64 gpuva_start, u64 gpuva_end,
> > +				const struct drm_gpusvm_ctx *ctx)
> > +{
> > +	struct drm_gpusvm_notifier *notifier;
> > +	struct drm_gpusvm_range *range;
> > +	struct mm_struct *mm = gpusvm->mm;
> > +	struct vm_area_struct *vas;
> > +	bool notifier_alloc = false;
> > +	u64 chunk_size;
> > +	int err;
> > +	bool migrate_vram;
> > +
> > +	if (fault_addr < gpusvm->mm_start ||
> > +	    fault_addr > gpusvm->mm_start + gpusvm->mm_range) {
> > +		err = -EINVAL;
> > +		goto err_out;
> > +	}
> > +
> > +	if (!ctx->mmap_locked) {
> > +		if (!mmget_not_zero(mm)) {
> > +			err = -EFAULT;
> > +			goto err_out;
> > +		}
> > +		mmap_write_lock(mm);
> > +	}
> > +
> > +	mmap_assert_write_locked(mm);
> > +
> > +	notifier = drm_gpusvm_notifier_find(gpusvm, fault_addr);
> > +	if (!notifier) {
> > +		notifier = drm_gpusvm_notifier_alloc(gpusvm,
> > fault_addr);
> > +		if (IS_ERR(notifier)) {
> > +			err = PTR_ERR(notifier);
> > +			goto err_mmunlock;
> > +		}
> > +		notifier_alloc = true;
> > +		err = mmu_interval_notifier_insert_locked(&notifier-
> > >notifier,
> > +							  mm,
> > notifier->interval.start,
> > +							  notifier-
> > >interval.end -
> > +							  notifier-
> > >interval.start,
> > +							 
> > &drm_gpusvm_notifier_ops);
> > +		if (err)
> > +			goto err_notifier;
> > +	}
> > +
> > +	vas = vma_lookup(mm, fault_addr);
> > +	if (!vas) {
> > +		err = -ENOENT;
> > +		goto err_notifier_remove;
> > +	}
> > +
> > +	if (!ctx->read_only && !(vas->vm_flags & VM_WRITE)) {
> > +		err = -EPERM;
> > +		goto err_notifier_remove;
> > +	}
> > +
> > +	range = drm_gpusvm_range_find(notifier, fault_addr,
> > fault_addr + 1);
> > +	if (range)
> > +		goto out_mmunlock;
> > +	/*
> > +	 * XXX: Short-circuiting migration based on migrate_vma_*
> > current
> > +	 * limitations. If/when migrate_vma_* add more support, this
> > logic will
> > +	 * have to change.
> > +	 */
> > +	migrate_vram = ctx->vram_possible &&
> > +		vma_is_anonymous(vas) && !is_vm_hugetlb_page(vas);
> > +
> > +	chunk_size = drm_gpusvm_range_chunk_size(gpusvm, notifier,
> > vas,
> > +						 fault_addr,
> > gpuva_start,
> > +						 gpuva_end,
> > migrate_vram &&
> > +						 !ctx->prefault);
> > +	if (chunk_size == LONG_MAX) {
> > +		err = -EINVAL;
> > +		goto err_notifier_remove;
> > +	}
> > +
> > +	range = drm_gpusvm_range_alloc(gpusvm, notifier, fault_addr,
> > chunk_size,
> > +				       migrate_vram);
> > +	if (IS_ERR(range)) {
> > +		err = PTR_ERR(range);
> > +		goto err_notifier_remove;
> > +	}
> > +
> > +	drm_gpusvm_range_insert(notifier, range);
> > +	if (notifier_alloc)
> > +		drm_gpusvm_notifier_insert(gpusvm, notifier);
> > +
> > +	if (ctx->prefault) {
> > +		struct drm_gpusvm_ctx __ctx = *ctx;
> > +
> > +		__ctx.mmap_locked = true;
> > +		err = drm_gpusvm_range_get_pages(gpusvm, range,
> > &__ctx);
> > +		if (err)
> > +			goto err_range_remove;
> > +	}
> > +
> > +out_mmunlock:
> > +	if (!ctx->mmap_locked) {
> > +		mmap_write_unlock(mm);
> > +		mmput(mm);
> > +	}
> > +
> > +	return range;
> > +
> > +err_range_remove:
> > +	__drm_gpusvm_range_remove(notifier, range);
> > +err_notifier_remove:
> > +	if (notifier_alloc)
> > +		mmu_interval_notifier_remove(&notifier->notifier);
> > +err_notifier:
> > +	if (notifier_alloc)
> > +		drm_gpusvm_notifier_free(gpusvm, notifier);
> > +err_mmunlock:
> > +	if (!ctx->mmap_locked) {
> > +		mmap_write_unlock(mm);
> > +		mmput(mm);
> > +	}
> > +err_out:
> > +	return ERR_PTR(err);
> > +}
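
For reviewers trying to picture the call flow, the intended GPU page-fault
path built on top of this looks roughly like below; my_driver_vram_alloc(),
my_driver_vram_put() and my_driver_bind_range() are made-up driver helpers,
and I'm assuming the notifier_lock helpers are exported via drm_gpusvm.h:

	int my_driver_handle_gpu_fault(struct drm_gpusvm *gpusvm, u64 fault_addr,
				       u64 gpuva_start, u64 gpuva_end)
	{
		struct drm_gpusvm_ctx ctx = { .vram_possible = true, };
		struct drm_gpusvm_range *range;
		int err;

		range = drm_gpusvm_range_find_or_insert(gpusvm, fault_addr,
							gpuva_start, gpuva_end,
							&ctx);
		if (IS_ERR(range))
			return PTR_ERR(range);

		if (range->flags.migrate_vram) {
			void *vram = my_driver_vram_alloc(range);

			/* Best effort: on failure fall back to SRAM pages */
			if (vram && drm_gpusvm_migrate_to_vram(gpusvm, range,
							       vram, &ctx))
				my_driver_vram_put(vram);
		}

	again:
		err = drm_gpusvm_range_get_pages(gpusvm, range, &ctx);
		if (err)
			return err;

		drm_gpusvm_notifier_lock(gpusvm);
		if (!drm_gpusvm_range_pages_valid(gpusvm, range)) {
			/* Raced with an invalidation, start over */
			drm_gpusvm_notifier_unlock(gpusvm);
			goto again;
		}
		err = my_driver_bind_range(gpusvm, range);	/* commit GPU PTEs */
		drm_gpusvm_notifier_unlock(gpusvm);

		return err;
	}
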
> > +
> > +/**
> > + * for_each_dma_page - iterate over pages in a DMA region
> > + * @i__: the current page index in the iteration
> > + * @j__: the current page index, log order, in the iteration
> > + * @npages__: the total number of pages in the DMA region
> > + * @order__: the order of the pages in the DMA region
> > + *
> > + * This macro iterates over each page in a DMA region. The DMA
> > region
> > + * is assumed to be composed of 2^@order__ pages, and the macro will
> > + * step through the region one block of 2^@order__ pages at a time.
> > + */
> > +#define for_each_dma_page(i__, j__, npages__, order__)	\
> > +	for ((i__) = 0, (j__) = 0; (i__) < (npages__);	\
> > +	     (j__)++, (i__) += 0x1 << (order__))
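
In other words, @i__ walks page indices in steps of 1 << @order__ while @j__
counts blocks: for npages = 512 with order = 9 the body runs once (one 2M
block), with order = 4 it runs 32 times (64K blocks), and with order = 0 it
degenerates to a plain per-page loop.
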
> > +
> > +/**
> > + * __drm_gpusvm_range_unmap_pages - Unmap pages associated with a
> > GPU SVM range (internal)
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * This function unmaps pages associated with a GPU SVM range.
> > Assumes and
> > + * asserts correct locking is in place when called.
> > + */
> > +static void __drm_gpusvm_range_unmap_pages(struct drm_gpusvm
> > *gpusvm,
> > +					   struct drm_gpusvm_range
> > *range)
> > +{
> > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > +
> > +	if (range->pages) {
> > +		unsigned long i, j, npages = npages_in_range(range-
> > >va.start,
> > +							     range-
> > >va.end);
> > +
> > +		if (range->flags.has_dma_mapping) {
> > +			for_each_dma_page(i, j, npages, range-
> > >order)
> > +				dma_unmap_page(gpusvm->drm->dev,
> > +					       range->dma_addr[j],
> > +					       PAGE_SIZE << range-
> > >order,
> > +					       DMA_BIDIRECTIONAL);
> > +		}
> > +
> > +		range->flags.has_vram_pages = false;
> > +		range->flags.has_dma_mapping = false;
> > +	}
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_free_pages - Free pages associated with a GPU
> > SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * This function frees pages associated with a GPU SVM range.
> > + */
> > +static void drm_gpusvm_range_free_pages(struct drm_gpusvm *gpusvm,
> > +					struct drm_gpusvm_range
> > *range)
> > +{
> > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > +
> > +	if (range->pages) {
> > +		if (range->flags.kfree_mapping) {
> > +			kfree(range->dma_addr);
> > +			range->flags.kfree_mapping = false;
> > +			range->pages = NULL;
> > +		} else {
> > +			kvfree(range->pages);
> > +			range->pages = NULL;
> > +		}
> > +	}
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_remove - Remove GPU SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range to be removed
> > + *
> > + * This function removes the specified GPU SVM range and also
> > removes the parent
> > + * GPU SVM notifier if no more ranges remain in the notifier. The
> > caller must
> > + * hold a lock to protect range and notifier removal.
> > + */
> > +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> > +			     struct drm_gpusvm_range *range)
> > +{
> > +	struct drm_gpusvm_notifier *notifier;
> > +
> > +	notifier = drm_gpusvm_notifier_find(gpusvm, range-
> > >va.start);
> > +	if (WARN_ON_ONCE(!notifier))
> > +		return;
> > +
> > +	drm_gpusvm_notifier_lock(gpusvm);
> > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > +	drm_gpusvm_range_free_pages(gpusvm, range);
> > +	__drm_gpusvm_range_remove(notifier, range);
> > +	drm_gpusvm_notifier_unlock(gpusvm);
> > +
> > +	drm_gpusvm_range_put(range);
> > +
> > +	if (RB_EMPTY_ROOT(&notifier->root.rb_root)) {
> > +		if (!notifier->flags.removed)
> > +			mmu_interval_notifier_remove(&notifier-
> > >notifier);
> > +		drm_gpusvm_notifier_remove(gpusvm, notifier);
> > +		drm_gpusvm_notifier_free(gpusvm, notifier);
> > +	}
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_get - Get a reference to GPU SVM range
> > + * @range: Pointer to the GPU SVM range
> > + *
> > + * This function increments the reference count of the specified GPU
> > SVM range.
> > + *
> > + * Returns:
> > + * Pointer to the GPU SVM range.
> > + */
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_get(struct drm_gpusvm_range *range)
> > +{
> > +	kref_get(&range->refcount);
> > +
> > +	return range;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_destroy - Destroy GPU SVM range
> > + * @refcount: Pointer to the reference counter embedded in the GPU
> > SVM range
> > + *
> > + * This function destroys the specified GPU SVM range when its
> > reference count
> > + * reaches zero. If a custom range-free function is provided, it is
> > invoked to
> > + * free the range; otherwise, the range is deallocated using
> > kfree().
> > + */
> > +static void drm_gpusvm_range_destroy(struct kref *refcount)
> > +{
> > +	struct drm_gpusvm_range *range =
> > +		container_of(refcount, struct drm_gpusvm_range,
> > refcount);
> > +	struct drm_gpusvm *gpusvm = range->gpusvm;
> > +
> > +	if (gpusvm->ops->range_free)
> > +		gpusvm->ops->range_free(range);
> > +	else
> > +		kfree(range);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_put - Put a reference to GPU SVM range
> > + * @range: Pointer to the GPU SVM range
> > + *
> > + * This function decrements the reference count of the specified GPU
> > SVM range
> > + * and frees it when the count reaches zero.
> > + */
> > +void drm_gpusvm_range_put(struct drm_gpusvm_range *range)
> > +{
> > +	kref_put(&range->refcount, drm_gpusvm_range_destroy);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_pages_valid - GPU SVM range pages valid
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * This function determines if a GPU SVM range's pages are valid.
> > Expected to be
> > + * called holding gpusvm->notifier_lock and as the last step before
> > committing a
> > + * GPU binding.
> > + *
> > + * Returns:
> > + * True if GPU SVM range has valid pages, False otherwise
> > + */
> > +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> > +				  struct drm_gpusvm_range *range)
> > +{
> > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > +
> > +	return range->flags.has_vram_pages || range-
> > >flags.has_dma_mapping;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_pages_valid_unlocked - GPU SVM range pages valid
> > unlocked
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * This function determines if a GPU SVM range's pages are valid.
> > Expected to be
> > + * called without holding gpusvm->notifier_lock.
> > + *
> > + * Returns:
> > + * True if GPU SVM range has valid pages, False otherwise
> > + */
> > +static bool
> > +drm_gpusvm_range_pages_valid_unlocked(struct drm_gpusvm *gpusvm,
> > +				      struct drm_gpusvm_range
> > *range)
> > +{
> > +	bool pages_valid;
> > +
> > +	if (!range->pages)
> > +		return false;
> > +
> > +	drm_gpusvm_notifier_lock(gpusvm);
> > +	pages_valid = drm_gpusvm_range_pages_valid(gpusvm, range);
> > +	if (!pages_valid && range->flags.kfree_mapping) {
> > +		kfree(range->dma_addr);
> > +		range->flags.kfree_mapping = false;
> > +		range->pages = NULL;
> > +	}
> > +	drm_gpusvm_notifier_unlock(gpusvm);
> > +
> > +	return pages_valid;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_get_pages - Get pages for a GPU SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + * @ctx: GPU SVM context
> > + *
> > + * This function gets pages for a GPU SVM range and ensures they are
> > mapped for
> > + * DMA access.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       const struct drm_gpusvm_ctx *ctx)
> > +{
> > +	struct mmu_interval_notifier *notifier = &range->notifier-
> > >notifier;
> > +	struct hmm_range hmm_range = {
> > +		.default_flags = HMM_PFN_REQ_FAULT | (ctx->read_only
> > ? 0 :
> > +			HMM_PFN_REQ_WRITE),
> > +		.notifier = notifier,
> > +		.start = range->va.start,
> > +		.end = range->va.end,
> > +		.dev_private_owner = gpusvm-
> > >device_private_page_owner,
> > +	};
> > +	struct mm_struct *mm = gpusvm->mm;
> > +	unsigned long timeout =
> > +		jiffies +
> > msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> > +	unsigned long i, j;
> > +	unsigned long npages = npages_in_range(range->va.start,
> > range->va.end);
> > +	unsigned int order = 0;
> > +	unsigned long *pfns;
> > +	struct page **pages;
> > +	int err = 0;
> > +	bool vram_pages = !!range->flags.migrate_vram;
> > +	bool alloc_pfns = false, kfree_mapping;
> > +
> > +retry:
> > +	kfree_mapping = false;
> > +	hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> > +	if (drm_gpusvm_range_pages_valid_unlocked(gpusvm, range))
> > +		return 0;
> > +
> > +	if (range->notifier_seq == hmm_range.notifier_seq && range-
> > >pages) {
> > +		if (ctx->prefault)
> > +			return 0;
> > +
> > +		pfns = (unsigned long *)range->pages;
> > +		pages = range->pages;
> > +		goto map_pages;
> > +	}
> > +
> > +	if (!range->pages) {
> > +		pfns = kvmalloc_array(npages, sizeof(*pfns),
> > GFP_KERNEL);
> > +		if (!pfns)
> > +			return -ENOMEM;
> > +		alloc_pfns = true;
> > +	} else {
> > +		pfns = (unsigned long *)range->pages;
> > +	}
> > +
> > +	if (!ctx->mmap_locked) {
> > +		if (!mmget_not_zero(mm)) {
> > +			err = -EFAULT;
> > +			goto err_out;
> > +		}
> > +	}
> > +
> > +	hmm_range.hmm_pfns = pfns;
> > +	while (true) {
> > +		/* Must be checked after mmu_interval_read_begin */
> > +		if (range->flags.unmapped) {
> > +			err = -EFAULT;
> > +			break;
> > +		}
> > +
> > +		if (!ctx->mmap_locked) {
> > +			/*
> > +			 * XXX: HMM locking document indicates only
> > a read-lock
> > +			 * is required but there appears to be a
> > window between
> > +			 * the MMU_NOTIFY_MIGRATE event triggered in
> > a CPU fault
> > +			 * via migrate_vma_setup and the pages
> > actually moving
> > +			 * in migrate_vma_finalize in which this
> > code can grab
> > +			 * garbage pages. Grabbing the write-lock if
> > the range
> > +			 * is attached to vram appears to protect
> > against this
> > +			 * race.
> > +			 */
> > +			if (vram_pages)
> > +				mmap_write_lock(mm);
> > +			else
> > +				mmap_read_lock(mm);
> > +		}
> > +		err = hmm_range_fault(&hmm_range);
> > +		if (!ctx->mmap_locked) {
> > +			if (vram_pages)
> > +				mmap_write_unlock(mm);
> > +			else
> > +				mmap_read_unlock(mm);
> > +		}
> > +
> > +		if (err == -EBUSY) {
> > +			if (time_after(jiffies, timeout))
> > +				break;
> > +
> > +			hmm_range.notifier_seq =
> > mmu_interval_read_begin(notifier);
> > +			continue;
> > +		}
> > +		break;
> > +	}
> > +	if (!ctx->mmap_locked)
> > +		mmput(mm);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	pages = (struct page **)pfns;
> > +
> > +	if (ctx->prefault) {
> > +		range->pages = pages;
> > +		goto set_seqno;
> > +	}
> > +
> > +map_pages:
> > +	if (is_device_private_page(hmm_pfn_to_page(pfns[0]))) {
> > +		WARN_ON_ONCE(!range->vram_allocation);
> > +
> > +		for (i = 0; i < npages; ++i) {
> > +			pages[i] = hmm_pfn_to_page(pfns[i]);
> > +
> > +			if
> > (WARN_ON_ONCE(!is_device_private_page(pages[i]))) {
> > +				err = -EOPNOTSUPP;
> > +				goto err_free;
> > +			}
> > +		}
> > +
> > +		/* Do not race with notifier unmapping pages */
> > +		drm_gpusvm_notifier_lock(gpusvm);
> > +		range->flags.has_vram_pages = true;
> > +		range->pages = pages;
> > +		if (mmu_interval_read_retry(notifier,
> > hmm_range.notifier_seq)) {
> > +			err = -EAGAIN;
> > +			__drm_gpusvm_range_unmap_pages(gpusvm,
> > range);
> > +		}
> > +		drm_gpusvm_notifier_unlock(gpusvm);
> > +	} else {
> > +		dma_addr_t *dma_addr = (dma_addr_t *)pfns;
> > +
> > +		for_each_dma_page(i, j, npages, order) {
> > +			if (WARN_ON_ONCE(i && order !=
> > +					
> > hmm_pfn_to_map_order(pfns[i]))) {
> > +				err = -EOPNOTSUPP;
> > +				npages = i;
> > +				goto err_unmap;
> > +			}
> > +			order = hmm_pfn_to_map_order(pfns[i]);
> > +
> > +			pages[j] = hmm_pfn_to_page(pfns[i]);
> > +			if
> > (WARN_ON_ONCE(is_zone_device_page(pages[j]))) {
> > +				err = -EOPNOTSUPP;
> > +				npages = i;
> > +				goto err_unmap;
> > +			}
> > +
> > +			set_page_dirty_lock(pages[j]);
> > +			mark_page_accessed(pages[j]);
> > +
> > +			dma_addr[j] = dma_map_page(gpusvm->drm->dev,
> > +						   pages[j], 0,
> > +						   PAGE_SIZE <<
> > order,
> > +						  
> > DMA_BIDIRECTIONAL);
> > +			if (dma_mapping_error(gpusvm->drm->dev,
> > dma_addr[j])) {
> > +				err = -EFAULT;
> > +				npages = i;
> > +				goto err_unmap;
> > +			}
> > +		}
> > +
> > +		/* Huge pages, reduce memory footprint */
> > +		if (order) {
> > +			dma_addr = kmalloc_array(j,
> > sizeof(*dma_addr),
> > +						 GFP_KERNEL);
> > +			if (dma_addr) {
> > +				for (i = 0; i < j; ++i)
> > +					dma_addr[i] =
> > (dma_addr_t)pfns[i];
> > +				kvfree(pfns);
> > +				kfree_mapping = true;
> > +			} else {
> > +				dma_addr = (dma_addr_t *)pfns;
> > +			}
> > +		}
> > +
> > +		/* Do not race with notifier unmapping pages */
> > +		drm_gpusvm_notifier_lock(gpusvm);
> > +		range->order = order;
> > +		range->flags.kfree_mapping = kfree_mapping;
> > +		range->flags.has_dma_mapping = true;
> > +		range->dma_addr = dma_addr;
> > +		range->vram_allocation = NULL;
> > +		if (mmu_interval_read_retry(notifier,
> > hmm_range.notifier_seq)) {
> > +			err = -EAGAIN;
> > +			__drm_gpusvm_range_unmap_pages(gpusvm,
> > range);
> > +		}
> > +		drm_gpusvm_notifier_unlock(gpusvm);
> > +	}
> > +
> > +	if (err == -EAGAIN)
> > +		goto retry;
> > +set_seqno:
> > +	range->notifier_seq = hmm_range.notifier_seq;
> > +
> > +	return 0;
> > +
> > +err_unmap:
> > +	for_each_dma_page(i, j, npages, order)
> > +		dma_unmap_page(gpusvm->drm->dev,
> > +			       (dma_addr_t)pfns[j],
> > +			       PAGE_SIZE << order,
> > DMA_BIDIRECTIONAL);
> > +err_free:
> > +	if (alloc_pfns)
> > +		kvfree(pfns);
> > +err_out:
> > +	return err;
> > +}
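
One note on consuming the result when the huge-page compaction above kicks
in: range->dma_addr then holds one entry per 2^range->order block, each
covering PAGE_SIZE << range->order bytes. A driver PTE-programming loop
mirrors for_each_dma_page(); struct my_vm and my_driver_write_ptes() are
hypothetical:

	static void my_driver_bind_dma(struct my_vm *vm,
				       struct drm_gpusvm_range *range)
	{
		unsigned long npages =
			(range->va.end - range->va.start) >> PAGE_SHIFT;
		unsigned long i, j;

		for (i = 0, j = 0; i < npages; i += 0x1 << range->order, ++j)
			my_driver_write_ptes(vm,
					     range->va.start + i * PAGE_SIZE,
					     range->dma_addr[j],
					     PAGE_SIZE << range->order);
	}
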
> > +
> > +/**
> > + * drm_gpusvm_range_unmap_pages - Unmap pages associated with a GPU
> > SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + * @ctx: GPU SVM context
> > + *
> > + * This function unmaps pages associated with a GPU SVM range. If
> > @in_notifier
> > + * is set, it is assumed that gpusvm->notifier_lock is held in write
> > mode; if it
> > + * is clear, it acquires gpusvm->notifier_lock in read mode. Must be
> > called on
> > + * each GPU SVM range attached to notifier in gpusvm->ops-
> > >invalidate for IOMMU
> > + * security model.
> > + */
> > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > +				  struct drm_gpusvm_range *range,
> > +				  const struct drm_gpusvm_ctx *ctx)
> > +{
> > +	if (ctx->in_notifier)
> > +		lockdep_assert_held_write(&gpusvm->notifier_lock);
> > +	else
> > +		drm_gpusvm_notifier_lock(gpusvm);
> > +
> > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > +
> > +	if (!ctx->in_notifier)
> > +		drm_gpusvm_notifier_unlock(gpusvm);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migration_put_page - Put a migration page
> > + * @page: Pointer to the page to put
> > + *
> > + * This function unlocks and puts a page.
> > + */
> > +static void drm_gpusvm_migration_put_page(struct page *page)
> > +{
> > +	unlock_page(page);
> > +	put_page(page);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migration_put_pages - Put migration pages
> > + * @npages: Number of pages
> > + * @migrate_pfn: Array of migrate page frame numbers
> > + *
> > + * This function puts an array of pages.
> > + */
> > +static void drm_gpusvm_migration_put_pages(unsigned long npages,
> > +					   unsigned long
> > *migrate_pfn)
> > +{
> > +	unsigned long i;
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		if (!migrate_pfn[i])
> > +			continue;
> > +
> > +		drm_gpusvm_migration_put_page(migrate_pfn_to_page(mi
> > grate_pfn[i]));
> > +		migrate_pfn[i] = 0;
> > +	}
> > +}
> > +
> > +/**
> > + * drm_gpusvm_get_vram_page - Get a reference to a VRAM page
> > + * @page: Pointer to the page
> > + * @zdd: Pointer to the GPU SVM zone device data
> > + *
> > + * This function associates the given page with the specified GPU
> > SVM zone
> > + * device data and initializes it for zone device usage.
> > + */
> > +static void drm_gpusvm_get_vram_page(struct page *page,
> > +				     struct drm_gpusvm_zdd *zdd)
> > +{
> > +	page->zone_device_data = drm_gpusvm_zdd_get(zdd);
> > +	zone_device_page_init(page);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_map_pages() - Map migration pages for GPU SVM
> > migration
> > + * @dev: The device for which the pages are being mapped
> > + * @dma_addr: Array to store DMA addresses corresponding to mapped
> > pages
> > + * @migrate_pfn: Array of migrate page frame numbers to map
> > + * @npages: Number of pages to map
> > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > + *
> > + * This function maps pages of memory for migration usage in GPU
> > SVM. It
> > + * iterates over each page frame number provided in @migrate_pfn,
> > maps the
> > + * corresponding page, and stores the DMA address in the provided
> > @dma_addr
> > + * array.
> > + *
> > + * Return: 0 on success, -EFAULT if an error occurs during mapping.
> > + */
> > +static int drm_gpusvm_migrate_map_pages(struct device *dev,
> > +					dma_addr_t *dma_addr,
> > +					unsigned long
> > *migrate_pfn,
> > +					unsigned long npages,
> > +					enum dma_data_direction dir)
> > +{
> > +	unsigned long i;
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		struct page *page =
> > migrate_pfn_to_page(migrate_pfn[i]);
> > +
> > +		if (!page)
> > +			continue;
> > +
> > +		if (WARN_ON_ONCE(is_zone_device_page(page)))
> > +			return -EFAULT;
> > +
> > +		dma_addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE,
> > dir);
> > +		if (dma_mapping_error(dev, dma_addr[i]))
> > +			return -EFAULT;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously mapped
> > for GPU SVM migration
> > + * @dev: The device for which the pages were mapped
> > + * @dma_addr: Array of DMA addresses corresponding to mapped pages
> > + * @npages: Number of pages to unmap
> > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > + *
> > + * This function unmaps previously mapped pages of memory for GPU
> > Shared Virtual
> > + * Memory (SVM). It iterates over each DMA address provided in
> > @dma_addr, checks
> > + * if it's valid and not already unmapped, and unmaps the
> > corresponding page.
> > + */
> > +static void drm_gpusvm_migrate_unmap_pages(struct device *dev,
> > +					   dma_addr_t *dma_addr,
> > +					   unsigned long npages,
> > +					   enum dma_data_direction
> > dir)
> > +{
> > +	unsigned long i;
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		if (!dma_addr[i] || dma_mapping_error(dev,
> > dma_addr[i]))
> > +			continue;
> > +
> > +		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
> > +	}
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_to_vram - Migrate GPU SVM range to VRAM
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + * @vram_allocation: Driver-private pointer to the VRAM allocation.
> > The caller
> > + *                   should hold a reference to the VRAM allocation,
> > which
> > + *                   should be dropped via ops->vram_release or
> > upon the
> > + *                   failure of this function.
> > + * @ctx: GPU SVM context
> > + *
> > + * This function migrates the specified GPU SVM range to VRAM. It
> > performs the
> > + * necessary setup and invokes the driver-specific operations for
> > migration to
> > + * VRAM. Upon successful return, @vram_allocation can safely
> > reference @range
> > + * until ops->vram_release is called, which only happens upon successful
> > return.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       void *vram_allocation,
> > +			       const struct drm_gpusvm_ctx *ctx)
> > +{
> > +	u64 start = range->va.start, end = range->va.end;
> > +	struct migrate_vma migrate = {
> > +		.start		= start,
> > +		.end		= end,
> > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > +		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
> > +	};
> > +	struct mm_struct *mm = gpusvm->mm;
> > +	unsigned long i, npages = npages_in_range(start, end);
> > +	struct vm_area_struct *vas;
> > +	struct drm_gpusvm_zdd *zdd = NULL;
> > +	struct page **pages;
> > +	dma_addr_t *dma_addr;
> > +	void *buf;
> > +	int err;
> > +
> > +	if (!range->flags.migrate_vram)
> > +		return -EINVAL;
> > +
> > +	if (!gpusvm->ops->populate_vram_pfn || !gpusvm->ops-
> > >copy_to_vram ||
> > +	    !gpusvm->ops->copy_to_sram)
> > +		return -EOPNOTSUPP;
> > +
> > +	if (!ctx->mmap_locked) {
> > +		if (!mmget_not_zero(mm)) {
> > +			err = -EFAULT;
> > +			goto err_out;
> > +		}
> > +		mmap_write_lock(mm);
> > +	}
> > +
> > +	mmap_assert_locked(mm);
> > +
> > +	vas = vma_lookup(mm, start);
> > +	if (!vas) {
> > +		err = -ENOENT;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	if (end > vas->vm_end || start < vas->vm_start) {
> > +		err = -EINVAL;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	if (!vma_is_anonymous(vas)) {
> > +		err = -EBUSY;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) +
> > sizeof(*dma_addr) +
> > +		       sizeof(*pages), GFP_KERNEL);
> > +	if (!buf) {
> > +		err = -ENOMEM;
> > +		goto err_mmunlock;
> > +	}
> > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr))
> > * npages;
> > +
> > +	zdd = drm_gpusvm_zdd_alloc(range);
> > +	if (!zdd) {
> > +		err = -ENOMEM;
> > +		goto err_free;
> > +	}
> > +
> > +	migrate.vma = vas;
> > +	migrate.src = buf;
> > +	migrate.dst = migrate.src + npages;
> > +
> > +	err = migrate_vma_setup(&migrate);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	/*
> > +	 * FIXME: Below cases, !migrate.cpages and migrate.cpages !=
> > npages, not
> > +	 * always an error. Need to revisit possible cases and how
> > to handle. We
> > +	 * could prefault on migrate.cpages != npages via
> > hmm_range_fault.
> > +	 */
> > +
> > +	if (!migrate.cpages) {
> > +		err = -EFAULT;
> > +		goto err_free;
> > +	}
> > +
> > +	if (migrate.cpages != npages) {
> > +		err = -EBUSY;
> > +		goto err_finalize;
> > +	}
> > +
> > +	err = gpusvm->ops->populate_vram_pfn(gpusvm,
> > vram_allocation, npages,
> > +					     migrate.dst);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > dma_addr,
> > +					   migrate.src, npages,
> > DMA_TO_DEVICE);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		struct page *page = pfn_to_page(migrate.dst[i]);
> > +
> > +		pages[i] = page;
> > +		migrate.dst[i] = migrate_pfn(migrate.dst[i]);
> > +		drm_gpusvm_get_vram_page(page, zdd);
> > +	}
> > +
> > +	err = gpusvm->ops->copy_to_vram(gpusvm, pages, dma_addr,
> > npages);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	/* Upon success bind vram allocation to range and zdd */
> > +	range->vram_allocation = vram_allocation;
> > +	WRITE_ONCE(zdd->vram_allocation, vram_allocation);	/*
> > Owns ref */
> > +
> > +err_finalize:
> > +	if (err)
> > +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> > +	migrate_vma_pages(&migrate);
> > +	migrate_vma_finalize(&migrate);
> > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr,
> > npages,
> > +				       DMA_TO_DEVICE);
> > +err_free:
> > +	if (zdd)
> > +		drm_gpusvm_zdd_put(zdd);
> > +	kvfree(buf);
> > +err_mmunlock:
> > +	if (!ctx->mmap_locked) {
> > +		mmap_write_unlock(mm);
> > +		mmput(mm);
> > +	}
> > +err_out:
> > +	return err;
> > +}
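
Stating the backend contract explicitly, since this function bails with
-EOPNOTSUPP otherwise: the driver has to provide populate_vram_pfn,
copy_to_vram and copy_to_sram (plus vram_release for the zdd teardown). The
shapes below are what I infer from the call sites in this file; the
authoritative prototypes are the ones in drm_gpusvm.h:

	/* Fill @pfn with the PFNs backing @vram_allocation */
	int my_populate_vram_pfn(struct drm_gpusvm *gpusvm, void *vram_allocation,
				 unsigned long npages, unsigned long *pfn);
	/* Copy dma-mapped system pages into the VRAM device pages */
	int my_copy_to_vram(struct drm_gpusvm *gpusvm, struct page **pages,
			    dma_addr_t *dma_addr, unsigned long npages);
	/* Copy VRAM device pages back out to dma-mapped system pages */
	int my_copy_to_sram(struct drm_gpusvm *gpusvm, struct page **pages,
			    dma_addr_t *dma_addr, unsigned long npages);
	/* Drop the driver reference when the zdd is destroyed */
	void my_vram_release(void *vram_allocation);
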
> > +
> > +/**
> > + * drm_gpusvm_migrate_populate_sram_pfn - Populate SRAM PFNs for a
> > VM area
> > + * @vas: Pointer to the VM area structure, can be NULL
> > + * @npages: Number of pages to populate
> > + * @src_mpfn: Source array of migrate PFNs
> > + * @mpfn: Array of migrate PFNs to populate
> > + * @addr: Start address for PFN allocation
> > + *
> > + * This function populates the SRAM migrate page frame numbers
> > (PFNs) for the
> > + * specified VM area structure. It allocates and locks pages in the
> > VM area for
> > + * SRAM usage. If @vas is non-NULL, alloc_page_vma() is used for
> > allocation;
> > + * otherwise alloc_page() is used.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +static int drm_gpusvm_migrate_populate_sram_pfn(struct
> > vm_area_struct *vas,
> > +						unsigned long
> > npages,
> > +						unsigned long
> > *src_mpfn,
> > +						unsigned long *mpfn,
> > u64 addr)
> > +{
> > +	unsigned long i;
> > +
> > +	for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
> > +		struct page *page;
> > +
> > +		if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
> > +			continue;
> > +
> > +		if (vas)
> > +			page = alloc_page_vma(GFP_HIGHUSER, vas,
> > addr);
> > +		else
> > +			page = alloc_page(GFP_HIGHUSER);
> > +
> > +		if (!page)
> > +			return -ENOMEM;
> > +
> > +		lock_page(page);
> > +		mpfn[i] = migrate_pfn(page_to_pfn(page));
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_evict_to_sram - Evict GPU SVM range to SRAM
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * Similar to __drm_gpusvm_migrate_to_sram but does not require the
> > mmap lock;
> > + * migration is done via the migrate_device_* functions. This is a
> > fallback
> > + * path, as it is preferred to issue migrations with the mmap lock
> > held.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +static int drm_gpusvm_evict_to_sram(struct drm_gpusvm *gpusvm,
> > +				    struct drm_gpusvm_range *range)
> > +{
> > +	unsigned long npages;
> > +	struct page **pages;
> > +	unsigned long *src, *dst;
> > +	dma_addr_t *dma_addr;
> > +	void *buf;
> > +	int i, err = 0;
> > +
> > +	npages = npages_in_range(range->va.start, range->va.end);
> > +
> > +	buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr)
> > +
> > +		       sizeof(*pages), GFP_KERNEL);
> > +	if (!buf) {
> > +		err = -ENOMEM;
> > +		goto err_out;
> > +	}
> > +	src = buf;
> > +	dst = buf + (sizeof(*src) * npages);
> > +	dma_addr = buf + (2 * sizeof(*src) * npages);
> > +	pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) *
> > npages;
> > +
> > +	err = gpusvm->ops->populate_vram_pfn(gpusvm, range-
> > >vram_allocation,
> > +					     npages, src);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	err = migrate_device_vma_range(gpusvm->mm,
> > +				       gpusvm-
> > >device_private_page_owner, src,
> > +				       npages, range->va.start);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	err = drm_gpusvm_migrate_populate_sram_pfn(NULL, npages,
> > src, dst, 0);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > dma_addr,
> > +					   dst, npages,
> > DMA_BIDIRECTIONAL);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	for (i = 0; i < npages; ++i)
> > +		pages[i] = migrate_pfn_to_page(src[i]);
> > +
> > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr,
> > npages);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +err_finalize:
> > +	if (err)
> > +		drm_gpusvm_migration_put_pages(npages, dst);
> > +	migrate_device_pages(src, dst, npages);
> > +	migrate_device_finalize(src, dst, npages);
> > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr,
> > npages,
> > +				       DMA_BIDIRECTIONAL);
> > +err_free:
> > +	kvfree(buf);
> > +err_out:
> > +
> > +	return err;
> > +}
> > +
> > +/**
> > + * __drm_gpusvm_migrate_to_sram - Migrate GPU SVM range to SRAM
> > (internal)
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @vas: Pointer to the VM area structure
> > + * @page: Pointer to the page for fault handling (can be NULL)
> > + * @start: Start address of the migration range
> > + * @end: End address of the migration range
> > + *
> > + * This internal function performs the migration of the specified
> > GPU SVM range
> > + * to SRAM. It sets up the migration, populates + dma maps SRAM
> > PFNs, and
> > + * invokes the driver-specific operations for migration to SRAM.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +static int __drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > +					struct vm_area_struct *vas,
> > +					struct page *page,
> > +					u64 start, u64 end)
> > +{
> > +	struct migrate_vma migrate = {
> > +		.vma		= vas,
> > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > +		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
> > +		.fault_page	= page,
> > +	};
> > +	unsigned long npages;
> > +	struct page **pages;
> > +	dma_addr_t *dma_addr;
> > +	void *buf;
> > +	int i, err = 0;
> > +
> > +	mmap_assert_locked(gpusvm->mm);
> > +
> > +	/* Corner case where the VM area struct has been partially unmapped
> > */
> > +	if (start < vas->vm_start)
> > +		start = vas->vm_start;
> > +	if (end > vas->vm_end)
> > +		end = vas->vm_end;
> > +
> > +	migrate.start = start;
> > +	migrate.end = end;
> > +	npages = npages_in_range(start, end);
> > +
> > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) +
> > sizeof(*dma_addr) +
> > +		       sizeof(*pages), GFP_KERNEL);
> > +	if (!buf) {
> > +		err = -ENOMEM;
> > +		goto err_out;
> > +	}
> > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr))
> > * npages;
> > +
> > +	migrate.vma = vas;
> > +	migrate.src = buf;
> > +	migrate.dst = migrate.src + npages;
> > +
> > +	err = migrate_vma_setup(&migrate);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	/* Raced with another CPU fault, nothing to do */
> > +	if (!migrate.cpages)
> > +		goto err_free;
> > +
> > +	err = drm_gpusvm_migrate_populate_sram_pfn(vas, npages,
> > +						   migrate.src,
> > migrate.dst,
> > +						   start);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > dma_addr,
> > +					   migrate.dst, npages,
> > +					   DMA_BIDIRECTIONAL);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	for (i = 0; i < npages; ++i)
> > +		pages[i] = migrate_pfn_to_page(migrate.src[i]);
> > +
> > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr,
> > npages);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +err_finalize:
> > +	if (err)
> > +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> > +	migrate_vma_pages(&migrate);
> > +	migrate_vma_finalize(&migrate);
> > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr,
> > npages,
> > +				       DMA_BIDIRECTIONAL);
> > +err_free:
> > +	kvfree(buf);
> > +err_out:
> > +	mmap_assert_locked(gpusvm->mm);
> > +
> > +	return err;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_to_sram - Migrate (evict) GPU SVM range to
> > SRAM
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + * @ctx: GPU SVM context
> > + *
> > + * This function initiates the migration of the specified GPU SVM
> > range to
> > + * SRAM. It performs necessary checks and invokes the internal
> > migration
> > + * function for actual migration.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       const struct drm_gpusvm_ctx *ctx)
> > +{
> > +	u64 start = range->va.start, end = range->va.end;
> > +	struct mm_struct *mm = gpusvm->mm;
> > +	struct vm_area_struct *vas;
> > +	int err;
> > +	bool retry = false;
> > +
> > +	if (!ctx->mmap_locked) {
> > +		if (!mmget_not_zero(mm)) {
> > +			err = -EFAULT;
> > +			goto err_out;
> > +		}
> > +		if (ctx->trylock_mmap) {
> > +			if (!mmap_read_trylock(mm))  {
> > +				err =
> > drm_gpusvm_evict_to_sram(gpusvm, range);
> > +				goto err_mmput;
> > +			}
> > +		} else {
> > +			mmap_read_lock(mm);
> > +		}
> > +	}
> > +
> > +	mmap_assert_locked(mm);
> > +
> > +	/*
> > +	 * Loop required to find all VM area structs for the corner
> > case when
> > +	 * VRAM backing has been partially unmapped from MM's
> > address space.
> > +	 */
> > +again:
> > +	vas = find_vma(mm, start);
> > +	if (!vas) {
> > +		if (!retry)
> > +			err = -ENOENT;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	if (end <= vas->vm_start || start >= vas->vm_end) {
> > +		if (!retry)
> > +			err = -EINVAL;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	err = __drm_gpusvm_migrate_to_sram(gpusvm, vas, NULL, start,
> > end);
> > +	if (err)
> > +		goto err_mmunlock;
> > +
> > +	if (vas->vm_end < end) {
> > +		retry = true;
> > +		start = vas->vm_end;
> > +		goto again;
> > +	}
> > +
> > +	if (!ctx->mmap_locked) {
> > +		mmap_read_unlock(mm);
> > +		/*
> > +		 * Using mmput_async as this function can be called
> > while
> > +		 * holding a dma-resv lock, and a final put can grab
> > the mmap
> > +		 * lock, causing a lock inversion.
> > +		 */
> > +		mmput_async(mm);
> > +	}
> > +
> > +	return 0;
> > +
> > +err_mmunlock:
> > +	if (!ctx->mmap_locked)
> > +		mmap_read_unlock(mm);
> > +err_mmput:
> > +	if (!ctx->mmap_locked)
> > +		mmput_async(mm);
> > +err_out:
> > +	return err;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_page_free - Put GPU SVM zone device data associated with a page
> > + * @page: Pointer to the page
> > + *
> > + * This function is a callback used to put the GPU SVM zone device data
> > + * associated with a page when it is being released.
> > + */
> > +static void drm_gpusvm_page_free(struct page *page)
> > +{
> > +	drm_gpusvm_zdd_put(page->zone_device_data);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_to_ram - Migrate GPU SVM range to RAM (page fault handler)
> > + * @vmf: Pointer to the fault information structure
> > + *
> > + * This function is a page fault handler used to migrate a GPU SVM range to RAM.
> > + * It retrieves the GPU SVM range information from the faulting page and invokes
> > + * the internal migration function to migrate the range back to RAM.
> > + *
> > + * Returns:
> > + * VM_FAULT_SIGBUS on failure, 0 on success.
> > + */
> > +static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault *vmf)
> > +{
> > +	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
> > +	int err;
> > +
> > +	err = __drm_gpusvm_migrate_to_sram(zdd->range->gpusvm,
> > +					   vmf->vma, vmf->page,
> > +					   zdd->range->va.start,
> > +					   zdd->range->va.end);
> > +
> > +	return err ? VM_FAULT_SIGBUS : 0;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_pagemap_ops - Device page map operations for GPU SVM
> > + */
> > +static const struct dev_pagemap_ops drm_gpusvm_pagemap_ops = {
> > +	.page_free = drm_gpusvm_page_free,
> > +	.migrate_to_ram = drm_gpusvm_migrate_to_ram,
> > +};
> > +
> > +/**
> > + * drm_gpusvm_pagemap_ops_get - Retrieve GPU SVM device page map operations
> > + *
> > + * Returns:
> > + * Pointer to the GPU SVM device page map operations structure.
> > + */
> > +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void)
> > +{
> > +	return &drm_gpusvm_pagemap_ops;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_has_mapping - Check if GPU SVM has mapping for the given address range
> > + * @gpusvm: Pointer to the GPU SVM structure.
> > + * @start: Start address
> > + * @end: End address
> > + *
> > + * Returns:
> > + * True if GPU SVM has mapping, False otherwise
> > + */
> > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start, u64 end)
> > +{
> > +	struct drm_gpusvm_notifier *notifier;
> > +
> > +	drm_gpusvm_for_each_notifier(notifier, gpusvm, start, end) {
> > +		struct drm_gpusvm_range *range = NULL;
> > +
> > +		drm_gpusvm_for_each_range(range, notifier, start, end)
> > +			return true;
> > +	}
> > +
> > +	return false;
> > +}
> > diff --git a/drivers/gpu/drm/xe/drm_gpusvm.h b/drivers/gpu/drm/xe/drm_gpusvm.h
> > new file mode 100644
> > index 000000000000..0ea70f8534a8
> > --- /dev/null
> > +++ b/drivers/gpu/drm/xe/drm_gpusvm.h
> > @@ -0,0 +1,415 @@
> > +/* SPDX-License-Identifier: MIT */
> > +/*
> > + * Copyright © 2024 Intel Corporation
> > + */
> > +
> > +#ifndef __DRM_GPUSVM_H__
> > +#define __DRM_GPUSVM_H__
> > +
> > +#include <linux/kref.h>
> > +#include <linux/mmu_notifier.h>
> > +#include <linux/workqueue.h>
> > +
> > +struct dev_pagemap_ops;
> > +struct drm_device;
> > +struct drm_gpusvm;
> > +struct drm_gpusvm_notifier;
> > +struct drm_gpusvm_ops;
> > +struct drm_gpusvm_range;
> > +
> > +/**
> > + * struct drm_gpusvm_ops - Operations structure for GPU SVM
> > + *
> > + * This structure defines the operations for GPU Shared Virtual Memory (SVM).
> > + * These operations are provided by the GPU driver to manage SVM ranges and
> > + * perform operations such as migration between VRAM and system RAM.
> > + */
> > +struct drm_gpusvm_ops {
> > +	/**
> > +	 * @notifier_alloc: Allocate a GPU SVM notifier (optional)
> > +	 *
> > +	 * This function shall allocate a GPU SVM notifier.
> > +	 *
> > +	 * Returns:
> > +	 * Pointer to the allocated GPU SVM notifier on success, NULL on failure.
> > +	 */
> > +	struct drm_gpusvm_notifier *(*notifier_alloc)(void);
> > +
> > +	/**
> > +	 * @notifier_free: Free a GPU SVM notifier (optional)
> > +	 * @notifier: Pointer to the GPU SVM notifier to be freed
> > +	 *
> > +	 * This function shall free a GPU SVM notifier.
> > +	 */
> > +	void (*notifier_free)(struct drm_gpusvm_notifier *notifier);
> > +
> > +	/**
> > +	 * @range_alloc: Allocate a GPU SVM range (optional)
> > +	 * @gpusvm: Pointer to the GPU SVM
> > +	 *
> > +	 * This function shall allocate a GPU SVM range.
> > +	 *
> > +	 * Returns:
> > +	 * Pointer to the allocated GPU SVM range on success, NULL on failure.
> > +	 */
> > +	struct drm_gpusvm_range *(*range_alloc)(struct drm_gpusvm *gpusvm);
> > +
> > +	/**
> > +	 * @range_free: Free a GPU SVM range (optional)
> > +	 * @range: Pointer to the GPU SVM range to be freed
> > +	 *
> > +	 * This function shall free a GPU SVM range.
> > +	 */
> > +	void (*range_free)(struct drm_gpusvm_range *range);
> > +
> > +	/**
> > +	 * @vram_release: Release VRAM allocation (optional)
> > +	 * @vram_allocation: Driver-private pointer to the VRAM allocation
> > +	 *
> > +	 * This function shall release VRAM allocation and expects to drop a
> > +	 * reference to VRAM allocation.
> > +	 */
> > +	void (*vram_release)(void *vram_allocation);
> > +
> > +	/**
> > +	 * @populate_vram_pfn: Populate VRAM PFN (required for migration)
> > +	 * @gpusvm: Pointer to the GPU SVM
> > +	 * @vram_allocation: Driver-private pointer to the VRAM allocation
> > +	 * @npages: Number of pages to populate
> > +	 * @pfn: Array of page frame numbers to populate
> > +	 *
> > +	 * This function shall populate VRAM page frame numbers (PFN).
> > +	 *
> > +	 * Returns:
> > +	 * 0 on success, a negative error code on failure.
> > +	 */
> > +	int (*populate_vram_pfn)(struct drm_gpusvm *gpusvm,
> > +				 void *vram_allocation,
> > +				 unsigned long npages,
> > +				 unsigned long *pfn);
> > +
> > +	/**
> > +	 * @copy_to_vram: Copy to VRAM (required for migration)
> > +	 * @gpusvm: Pointer to the GPU SVM
> > +	 * @pages: Pointer to array of VRAM pages (destination)
> > +	 * @dma_addr: Pointer to array of DMA addresses (source)
> > +	 * @npages: Number of pages to copy
> > +	 *
> > +	 * This function shall copy pages to VRAM.
> > +	 *
> > +	 * Returns:
> > +	 * 0 on success, a negative error code on failure.
> > +	 */
> > +	int (*copy_to_vram)(struct drm_gpusvm *gpusvm,
> > +			    struct page **pages,
> > +			    dma_addr_t *dma_addr,
> > +			    unsigned long npages);
> > +
> > +	/**
> > +	 * @copy_to_sram: Copy to system RAM (required for migration)
> > +	 * @gpusvm: Pointer to the GPU SVM
> > +	 * @pages: Pointer to array of VRAM pages (source)
> > +	 * @dma_addr: Pointer to array of DMA addresses (destination)
> > +	 * @npages: Number of pages to copy
> > +	 *
> > +	 * This function shall copy pages to system RAM.
> > +	 *
> > +	 * Returns:
> > +	 * 0 on success, a negative error code on failure.
> > +	 */
> > +	int (*copy_to_sram)(struct drm_gpusvm *gpusvm,
> > +			    struct page **pages,
> > +			    dma_addr_t *dma_addr,
> > +			    unsigned long npages);
> > +
> > +	/**
> > +	 * @invalidate: Invalidate GPU SVM notifier (required)
> > +	 * @gpusvm: Pointer to the GPU SVM
> > +	 * @notifier: Pointer to the GPU SVM notifier
> > +	 * @mmu_range: Pointer to the mmu_notifier_range structure
> > +	 *
> > +	 * This function shall invalidate the GPU page tables. It can safely
> > +	 * walk the notifier range RB tree/list in this function. Called while
> > +	 * holding the notifier lock.
> > +	 */
> > +	void (*invalidate)(struct drm_gpusvm *gpusvm,
> > +			   struct drm_gpusvm_notifier *notifier,
> > +			   const struct mmu_notifier_range *mmu_range);
> > +};
> > +
> > +/**
> > + * struct drm_gpusvm_notifier - Structure representing a GPU SVM notifier
> > + *
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: MMU interval notifier
> > + * @interval: Interval for the notifier
> > + * @rb: Red-black tree node for the parent GPU SVM structure notifier tree
> > + * @root: Cached root node of the RB tree containing ranges
> > + * @range_list: List head containing of ranges in the same order they appear in
> > + *              interval tree. This is useful to keep iterating ranges while
> > + *              doing modifications to RB tree.
> > + * @flags.removed: Flag indicating whether the MMU interval notifier has been
> > + *                 removed
> > + *
> > + * This structure represents a GPU SVM notifier.
> > + */
> > +struct drm_gpusvm_notifier {
> > +	struct drm_gpusvm *gpusvm;
> > +	struct mmu_interval_notifier notifier;
> > +	struct {
> > +		u64 start;
> > +		u64 end;
> > +	} interval;
> > +	struct {
> > +		struct rb_node node;
> > +		struct list_head entry;
> > +		u64 __subtree_last;
> > +	} rb;
> > +	struct rb_root_cached root;
> > +	struct list_head range_list;
> > +	struct {
> > +		u32 removed : 1;
> > +	} flags;
> > +};
> > +
> > +/**
> > + * struct drm_gpusvm_range - Structure representing a GPU SVM range
> > + *
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier
> > + * @refcount: Reference count for the range
> > + * @rb: Red-black tree node for the parent GPU SVM notifier structure range tree
> > + * @va: Virtual address range
> > + * @notifier_seq: Notifier sequence number of the range's pages
> > + * @pages: Pointer to the array of pages (if backing store is in VRAM)
> > + * @dma_addr: DMA address array (if backing store is SRAM and DMA mapped)
> > + * @vram_allocation: Driver-private pointer to the VRAM allocation
> > + * @order: Order of dma mapping. i.e. PAGE_SIZE << order is mapping size
> > + * @flags.migrate_vram: Flag indicating whether the range can be migrated to VRAM
> > + * @flags.unmapped: Flag indicating if the range has been unmapped
> > + * @flags.partial_unmap: Flag indicating if the range has been partially unmapped
> > + * @flags.has_vram_pages: Flag indicating if the range has vram pages
> > + * @flags.has_dma_mapping: Flag indicating if the range has a DMA mapping
> > + * @flags.kfree_mapping: Flag indicating @dma_addr is a compact allocation based
> > + *                       on @order which releases via kfree
> > + *
> > + * This structure represents a GPU SVM range used for tracking memory ranges
> > + * mapped in a DRM device.
> > + */
> > +struct drm_gpusvm_range {
> > +	struct drm_gpusvm *gpusvm;
> > +	struct drm_gpusvm_notifier *notifier;
> > +	struct kref refcount;
> > +	struct {
> > +		struct rb_node node;
> > +		struct list_head entry;
> > +		u64 __subtree_last;
> > +	} rb;
> > +	struct {
> > +		u64 start;
> > +		u64 end;
> > +	} va;
> > +	unsigned long notifier_seq;
> > +	union {
> > +		struct page **pages;
> > +		dma_addr_t *dma_addr;
> > +	};
> > +	void *vram_allocation;
> > +	u16 order;
> > +	struct {
> > +		/* All flags below must be set upon creation */
> > +		u16 migrate_vram : 1;
> > +		/* All flags below must be set / cleared under notifier lock */
> > +		u16 unmapped : 1;
> > +		u16 partial_unmap : 1;
> > +		u16 has_vram_pages : 1;
> > +		u16 has_dma_mapping : 1;
> > +		u16 kfree_mapping : 1;
> > +	} flags;
> > +};
> > +
> > +/**
> > + * struct drm_gpusvm - GPU SVM structure
> > + *
> > + * @name: Name of the GPU SVM
> > + * @drm: Pointer to the DRM device structure
> > + * @mm: Pointer to the mm_struct for the address space
> > + * @device_private_page_owner: Device private pages owner
> > + * @mm_start: Start address of GPU SVM
> > + * @mm_range: Range of the GPU SVM
> > + * @notifier_size: Size of individual notifiers
> > + * @ops: Pointer to the operations structure for GPU SVM
> > + * @chunk_sizes: Pointer to the array of chunk sizes used in range allocation.
> > + *               Entries should be powers of 2 in descending order.
> > + * @num_chunks: Number of chunks
> > + * @notifier_lock: Read-write semaphore for protecting notifier operations
> > + * @zdd_wq: Workqueue for deferred work on zdd destruction
> > + * @root: Cached root node of the Red-Black tree containing GPU SVM notifiers
> > + * @notifier_list: list head containing of notifiers in the same order they
> > + *                 appear in interval tree. This is useful to keep iterating
> > + *                 notifiers while doing modifications to RB tree.
> > + *
> > + * This structure represents a GPU SVM (Shared Virtual Memory) used for tracking
> > + * memory ranges mapped in a DRM (Direct Rendering Manager) device.
> > + *
> > + * No reference counting is provided, as this is expected to be embedded in the
> > + * driver VM structure along with the struct drm_gpuvm, which handles reference
> > + * counting.
> > + */
> > +struct drm_gpusvm {
> > +	const char *name;
> > +	struct drm_device *drm;
> > +	struct mm_struct *mm;
> > +	void *device_private_page_owner;
> > +	u64 mm_start;
> > +	u64 mm_range;
> > +	u64 notifier_size;
> > +	const struct drm_gpusvm_ops *ops;
> > +	const u64 *chunk_sizes;
> > +	int num_chunks;
> > +	struct rw_semaphore notifier_lock;
> > +	struct workqueue_struct *zdd_wq;
> > +	struct rb_root_cached root;
> > +	struct list_head notifier_list;
> > +};
> > +
> > +/**
> > + * struct drm_gpusvm_ctx - DRM GPU SVM context
> > + *
> > + * @mmap_locked: mmap lock is locked
> > + * @trylock_mmap: trylock mmap lock, used to avoid locking inversions
> > + *                (e.g. dma-resv -> mmap lock)
> > + * @in_notifier: entering from a MMU notifier
> > + * @read_only: operating on read-only memory
> > + * @vram_possible: possible to use VRAM
> > + * @prefault: prefault pages
> > + *
> > + * Context that DRM GPUSVM is operating in (i.e. user arguments).
> > + */
> > +struct drm_gpusvm_ctx {
> > +	u32 mmap_locked :1;
> > +	u32 trylock_mmap :1;
> > +	u32 in_notifier :1;
> > +	u32 read_only :1;
> > +	u32 vram_possible :1;
> > +	u32 prefault :1;
> > +};
> > +
> > +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> > +		    const char *name, struct drm_device *drm,
> > +		    struct mm_struct *mm, void *device_private_page_owner,
> > +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> > +		    const struct drm_gpusvm_ops *ops,
> > +		    const u64 *chunk_sizes, int num_chunks);
> > +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm);
> > +void drm_gpusvm_free(struct drm_gpusvm *gpusvm);
> > +
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64 fault_addr,
> > +				u64 gpuva_start, u64 gpuva_end,
> > +				const struct drm_gpusvm_ctx *ctx);
> > +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> > +			     struct drm_gpusvm_range *range);
> > +
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_get(struct drm_gpusvm_range *range);
> > +void drm_gpusvm_range_put(struct drm_gpusvm_range *range);
> > +
> > +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> > +				  struct drm_gpusvm_range *range);
> > +
> > +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       const struct drm_gpusvm_ctx *ctx);
> > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > +				  struct drm_gpusvm_range *range,
> > +				  const struct drm_gpusvm_ctx *ctx);
> > +
> > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       void *vram_allocation,
> > +			       const struct drm_gpusvm_ctx *ctx);
> > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       const struct drm_gpusvm_ctx *ctx);
> > +
> > +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void);
> > +
> > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start, u64 end);
> > +
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64 start, u64 end);
> > +
> > +/**
> > + * drm_gpusvm_notifier_lock - Lock GPU SVM notifier
> > + * @gpusvm__: Pointer to the GPU SVM structure.
> > + *
> > + * Abstract client usage GPU SVM notifier lock, take lock
> > + */
> > +#define drm_gpusvm_notifier_lock(gpusvm__)	\
> > +	down_read(&(gpusvm__)->notifier_lock)
> > +
> > +/**
> > + * drm_gpusvm_notifier_unlock - Unlock GPU SVM notifier
> > + * @gpusvm__: Pointer to the GPU SVM structure.
> > + *
> > + * Abstract client usage GPU SVM notifier lock, drop lock
> > + */
> > +#define drm_gpusvm_notifier_unlock(gpusvm__)	\
> > +	up_read(&(gpusvm__)->notifier_lock)
> > +
> > +/**
> > + * __drm_gpusvm_range_next - Get the next GPU SVM range in the list
> > + * @range: a pointer to the current GPU SVM range
> > + *
> > + * Return: A pointer to the next drm_gpusvm_range if available, or NULL if the
> > + *         current range is the last one or if the input range is NULL.
> > + */
> > +static inline struct drm_gpusvm_range *
> > +__drm_gpusvm_range_next(struct drm_gpusvm_range *range)
> > +{
> > +	if (range && !list_is_last(&range->rb.entry,
> > +				   &range->notifier->range_list))
> > +		return list_next_entry(range, rb.entry);
> > +
> > +	return NULL;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_for_each_range - Iterate over GPU SVM ranges in a notifier
> > + * @range__: Iterator variable for the ranges. If set, it indicates the start of
> > + *	     the iterator. If NULL, call drm_gpusvm_range_find() to get the range.
> > + * @notifier__: Pointer to the GPU SVM notifier
> > + * @start__: Start address of the range
> > + * @end__: End address of the range
> > + *
> > + * This macro is used to iterate over GPU SVM ranges in a notifier. It is safe
> > + * to use while holding the driver SVM lock or the notifier lock.
> > + */
> > +#define drm_gpusvm_for_each_range(range__, notifier__, start__, end__)	\
> > +	for ((range__) = (range__) ?:					\
> > +	     drm_gpusvm_range_find((notifier__), (start__), (end__));	\
> > +	     (range__) && (range__->va.start < (end__));		\
> > +	     (range__) = __drm_gpusvm_range_next(range__))
> > +
> > +/**
> > + * drm_gpusvm_range_set_unmapped - Mark a GPU SVM range as unmapped
> > + * @range: Pointer to the GPU SVM range structure.
> > + * @mmu_range: Pointer to the MMU notifier range structure.
> > + *
> > + * This function marks a GPU SVM range as unmapped and sets the partial_unmap flag
> > + * if the range partially falls within the provided MMU notifier range.
> > + */
> > +static inline void
> > +drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range *range,
> > +			      const struct mmu_notifier_range *mmu_range)
> > +{
> > +	lockdep_assert_held_write(&range->gpusvm->notifier_lock);
> > +
> > +	range->flags.unmapped = true;
> > +	if (range->va.start < mmu_range->start ||
> > +	    range->va.end > mmu_range->end)
> > +		range->flags.partial_unmap = true;
> > +}
> > +
> > +#endif /* __DRM_GPUSVM_H__ */
>
Matthew Brost Aug. 30, 2024, 5 a.m. UTC | #17
On Wed, Aug 28, 2024 at 04:31:19PM +0200, Daniel Vetter wrote:
> On Tue, Aug 27, 2024 at 07:48:38PM -0700, Matthew Brost wrote:
> > +		if (!ctx->mmap_locked) {
> > +			/*
> > +			 * XXX: HMM locking document indicates only a read-lock
> > +			 * is required but there apears to be a window between
> > +			 * the MMU_NOTIFY_MIGRATE event triggered in a CPU fault
> > +			 * via migrate_vma_setup and the pages actually moving
> > +			 * in migrate_vma_finalize in which this code can grab
> > +			 * garbage pages. Grabbing the write-lock if the range
> > +			 * is attached to vram appears to protect against this
> > +			 * race.
> > +			 */
> 
> This one is really scary, since it means the entire migrate pte trickery
> is essentially completely busted. Grabbing the mmap write lock just means
> you block out pretty much everything interesting from concurrently
> happening.
> 
> My gut feeling says we need to figure out what's happening here, because
> this looks a bit too fundamental to me.
> -Sima
> 

Sima,

I’ve already replied to this.

We’ve discussed the mmap write hack extensively, so I’m not quite sure
where to put this. The reply chain is quickly becoming a mess. However,
I’ve looked into this and collected some data points based on your
feedback.

I’ve pushed a branch [1] with the updated code.

The first new commit [2] removes the mmap write lock hack and addresses
an issue related to VRAM migrations, which couldn’t collect all VRAM
pages without this hack.

With this commit [2], xe_exec_system_allocator --r twice*race* fails
quite regularly, perhaps 25% of the time. This test is a
single-thread/process test that races CPU and GPU faults with migration.

It fails with the following dmesg:

[   68.473007] WARNING: CPU: 12 PID: 1643 at drivers/gpu/drm/xe/drm_gpusvm.c:1407 drm_gpusvm_range_get_pages+0xbda/0x1480 [xe]
...
[   68.473836] xe 0000:03:00.0: [drm:pf_queue_work_func [xe]] Fault response: Unsuccessful -95
[   68.474024] xe 0000:03:00.0: [drm:xe_guc_exec_queue_memory_cat_error_handler [xe]] GT1: Engine memory cat error: engine_class=vecs, logical_mask: 0x2, guc_id=0
[   68.474163] xe 0000:03:00.0: [drm] exec queue reset detected
[   68.474696] xe 0000:03:00.0: [drm] GT1: Engine reset: engine_class=vecs, logical_mask: 0x2, guc_id=0

This means hmm_range_fault collects a mix of SRAM and VRAM pages, which
my design aims to avoid. Perhaps allowing a mix of SRAM and VRAM pages
in my design might work, but I highly doubt it based on AMD's
range->migration_mutex and my inspection of the migration layer.
Allowing mixed mappings would introduce significant complexity, so I’d
prefer to avoid this if possible. Additionally, allowing mixed mappings
would eliminate the use of huge GPU pages when a race like this occurs.

I also implemented a retry loop to see if the system stabilizes with
either only SRAM or VRAM pages. Unfortunately, it results in a
continuous loop of drm_gpusvm_range_get_pages / hmm_range_fault until
the test case kills the MM due to a timeout.
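
For illustration, the mixed-result check in that retry experiment was
roughly the following (sketch only, not the exact code in [1]; the helper
name is made up):

static bool sketch_pfns_uniform(const unsigned long *pfns,
				unsigned long npages)
{
	/* Sketch: assumes all entries are HMM_PFN_VALID at this point. */
	bool vram = is_device_private_page(hmm_pfn_to_page(pfns[0]));
	unsigned long i;

	for (i = 1; i < npages; ++i)
		if (is_device_private_page(hmm_pfn_to_page(pfns[i])) != vram)
			return false;

	return true;
}

Whenever this returned false, the loop went back to
mmu_interval_read_begin() / hmm_range_fault(), and in the racing test it
never converged to a uniform result.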

Next, I added a lock similar to AMD's range->migration_lock, but using
an rwsem [3]. The semantics are to allow read access for CPU access and
write access for GPU access, thus enabling parallel CPU page faults for
the same range, which matches existing core semantics. This provides
finer granularity compared to using the mmap write lock; it only
disallows CPU and GPU servicing in parallel for a given range, rather
than the entire MM. It also aligns with AMD’s approach. I haven’t
checked Nvidia’s approach wrt this locking but can do so if you think it
would be helpful.
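
In other words, the intended usage is roughly the following (sketch of the
idea in [3], not the exact code; range->migration_lock is a hypothetical
rwsem field added to drm_gpusvm_range for this illustration):

/* CPU fault path (migrate_to_ram): shared lock, parallel CPU faults OK. */
static vm_fault_t sketch_migrate_to_ram(struct vm_fault *vmf)
{
	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
	struct drm_gpusvm_range *range = zdd->range;
	int err;

	down_read(&range->migration_lock);
	err = __drm_gpusvm_migrate_to_sram(range->gpusvm, vmf->vma, vmf->page,
					   range->va.start, range->va.end);
	up_read(&range->migration_lock);

	return err ? VM_FAULT_SIGBUS : 0;
}

/* GPU fault path: exclusive lock around migrate-to-VRAM + get-pages so it
 * cannot race the CPU path above. Error handling simplified. */
static int sketch_gpu_fault_service(struct drm_gpusvm *gpusvm,
				    struct drm_gpusvm_range *range,
				    void *vram_allocation,
				    const struct drm_gpusvm_ctx *ctx)
{
	int err;

	down_write(&range->migration_lock);
	err = drm_gpusvm_migrate_to_vram(gpusvm, range, vram_allocation, ctx);
	if (!err)
		err = drm_gpusvm_range_get_pages(gpusvm, range, ctx);
	up_write(&range->migration_lock);

	return err;
}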

Matt

[1] https://gitlab.freedesktop.org/mbrost/xe-kernel-driver-svm-post-8-27-24/-/commits/mmap_write_lock
[2] https://gitlab.freedesktop.org/mbrost/xe-kernel-driver-svm-post-8-27-24/-/commit/6cf67d98c719ffbb4ac6124a7cb81d797a5bad9f
[3] https://gitlab.freedesktop.org/mbrost/xe-kernel-driver-svm-post-8-27-24/-/commit/2b62075d193265b2c1634ecfd0497dffd2e18c13

> 
> > +			if (vram_pages)
> > +				mmap_write_lock(mm);
> > +			else
> > +				mmap_read_lock(mm);
> > +		}
> > +		err = hmm_range_fault(&hmm_range);
> > +		if (!ctx->mmap_locked) {
> > +			if (vram_pages)
> > +				mmap_write_unlock(mm);
> > +			else
> > +				mmap_read_unlock(mm);
> > +		}
> > +
> > +		if (err == -EBUSY) {
> > +			if (time_after(jiffies, timeout))
> > +				break;
> > +
> > +			hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> > +			continue;
> > +		}
> > +		break;
> > +	}
> > +	if (!ctx->mmap_locked)
> > +		mmput(mm);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	pages = (struct page **)pfns;
> > +
> > +	if (ctx->prefault) {
> > +		range->pages = pages;
> > +		goto set_seqno;
> > +	}
> > +
> > +map_pages:
> > +	if (is_device_private_page(hmm_pfn_to_page(pfns[0]))) {
> > +		WARN_ON_ONCE(!range->vram_allocation);
> > +
> > +		for (i = 0; i < npages; ++i) {
> > +			pages[i] = hmm_pfn_to_page(pfns[i]);
> > +
> > +			if (WARN_ON_ONCE(!is_device_private_page(pages[i]))) {
> > +				err = -EOPNOTSUPP;
> > +				goto err_free;
> > +			}
> > +		}
> > +
> > +		/* Do not race with notifier unmapping pages */
> > +		drm_gpusvm_notifier_lock(gpusvm);
> > +		range->flags.has_vram_pages = true;
> > +		range->pages = pages;
> > +		if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
> > +			err = -EAGAIN;
> > +			__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > +		}
> > +		drm_gpusvm_notifier_unlock(gpusvm);
> > +	} else {
> > +		dma_addr_t *dma_addr = (dma_addr_t *)pfns;
> > +
> > +		for_each_dma_page(i, j, npages, order) {
> > +			if (WARN_ON_ONCE(i && order !=
> > +					 hmm_pfn_to_map_order(pfns[i]))) {
> > +				err = -EOPNOTSUPP;
> > +				npages = i;
> > +				goto err_unmap;
> > +			}
> > +			order = hmm_pfn_to_map_order(pfns[i]);
> > +
> > +			pages[j] = hmm_pfn_to_page(pfns[i]);
> > +			if (WARN_ON_ONCE(is_zone_device_page(pages[j]))) {
> > +				err = -EOPNOTSUPP;
> > +				npages = i;
> > +				goto err_unmap;
> > +			}
> > +
> > +			set_page_dirty_lock(pages[j]);
> > +			mark_page_accessed(pages[j]);
> > +
> > +			dma_addr[j] = dma_map_page(gpusvm->drm->dev,
> > +						   pages[j], 0,
> > +						   PAGE_SIZE << order,
> > +						   DMA_BIDIRECTIONAL);
> > +			if (dma_mapping_error(gpusvm->drm->dev, dma_addr[j])) {
> > +				err = -EFAULT;
> > +				npages = i;
> > +				goto err_unmap;
> > +			}
> > +		}
> > +
> > +		/* Huge pages, reduce memory footprint */
> > +		if (order) {
> > +			dma_addr = kmalloc_array(j, sizeof(*dma_addr),
> > +						 GFP_KERNEL);
> > +			if (dma_addr) {
> > +				for (i = 0; i < j; ++i)
> > +					dma_addr[i] = (dma_addr_t)pfns[i];
> > +				kvfree(pfns);
> > +				kfree_mapping = true;
> > +			} else {
> > +				dma_addr = (dma_addr_t *)pfns;
> > +			}
> > +		}
> > +
> > +		/* Do not race with notifier unmapping pages */
> > +		drm_gpusvm_notifier_lock(gpusvm);
> > +		range->order = order;
> > +		range->flags.kfree_mapping = kfree_mapping;
> > +		range->flags.has_dma_mapping = true;
> > +		range->dma_addr = dma_addr;
> > +		range->vram_allocation = NULL;
> > +		if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
> > +			err = -EAGAIN;
> > +			__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > +		}
> > +		drm_gpusvm_notifier_unlock(gpusvm);
> > +	}
> > +
> > +	if (err == -EAGAIN)
> > +		goto retry;
> > +set_seqno:
> > +	range->notifier_seq = hmm_range.notifier_seq;
> > +
> > +	return 0;
> > +
> > +err_unmap:
> > +	for_each_dma_page(i, j, npages, order)
> > +		dma_unmap_page(gpusvm->drm->dev,
> > +			       (dma_addr_t)pfns[j],
> > +			       PAGE_SIZE << order, DMA_BIDIRECTIONAL);
> > +err_free:
> > +	if (alloc_pfns)
> > +		kvfree(pfns);
> > +err_out:
> > +	return err;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_unmap_pages - Unmap pages associated with a GPU SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + * @ctx: GPU SVM context
> > + *
> > + * This function unmaps pages associated with a GPU SVM range. If @in_notifier
> > + * is set, it is assumed that gpusvm->notifier_lock is held in write mode; if it
> > + * is clear, it acquires gpusvm->notifier_lock in read mode. Must be called on
> > + * each GPU SVM range attached to notifier in gpusvm->ops->invalidate for IOMMU
> > + * security model.
> > + */
> > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > +				  struct drm_gpusvm_range *range,
> > +				  const struct drm_gpusvm_ctx *ctx)
> > +{
> > +	if (ctx->in_notifier)
> > +		lockdep_assert_held_write(&gpusvm->notifier_lock);
> > +	else
> > +		drm_gpusvm_notifier_lock(gpusvm);
> > +
> > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > +
> > +	if (!ctx->in_notifier)
> > +		drm_gpusvm_notifier_unlock(gpusvm);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migration_put_page - Put a migration page
> > + * @page: Pointer to the page to put
> > + *
> > + * This function unlocks and puts a page.
> > + */
> > +static void drm_gpusvm_migration_put_page(struct page *page)
> > +{
> > +	unlock_page(page);
> > +	put_page(page);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migration_put_pages - Put migration pages
> > + * @npages: Number of pages
> > + * @migrate_pfn: Array of migrate page frame numbers
> > + *
> > + * This function puts an array of pages.
> > + */
> > +static void drm_gpusvm_migration_put_pages(unsigned long npages,
> > +					   unsigned long *migrate_pfn)
> > +{
> > +	unsigned long i;
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		if (!migrate_pfn[i])
> > +			continue;
> > +
> > +		drm_gpusvm_migration_put_page(migrate_pfn_to_page(migrate_pfn[i]));
> > +		migrate_pfn[i] = 0;
> > +	}
> > +}
> > +
> > +/**
> > + * drm_gpusvm_get_vram_page - Get a reference to a VRAM page
> > + * @page: Pointer to the page
> > + * @zdd: Pointer to the GPU SVM zone device data
> > + *
> > + * This function associates the given page with the specified GPU SVM zone
> > + * device data and initializes it for zone device usage.
> > + */
> > +static void drm_gpusvm_get_vram_page(struct page *page,
> > +				     struct drm_gpusvm_zdd *zdd)
> > +{
> > +	page->zone_device_data = drm_gpusvm_zdd_get(zdd);
> > +	zone_device_page_init(page);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_map_pages() - Map migration pages for GPU SVM migration
> > + * @dev: The device for which the pages are being mapped
> > + * @dma_addr: Array to store DMA addresses corresponding to mapped pages
> > + * @migrate_pfn: Array of migrate page frame numbers to map
> > + * @npages: Number of pages to map
> > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > + *
> > + * This function maps pages of memory for migration usage in GPU SVM. It
> > + * iterates over each page frame number provided in @migrate_pfn, maps the
> > + * corresponding page, and stores the DMA address in the provided @dma_addr
> > + * array.
> > + *
> > + * Return: 0 on success, -EFAULT if an error occurs during mapping.
> > + */
> > +static int drm_gpusvm_migrate_map_pages(struct device *dev,
> > +					dma_addr_t *dma_addr,
> > +					long unsigned int *migrate_pfn,
> > +					unsigned long npages,
> > +					enum dma_data_direction dir)
> > +{
> > +	unsigned long i;
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		struct page *page = migrate_pfn_to_page(migrate_pfn[i]);
> > +
> > +		if (!page)
> > +			continue;
> > +
> > +		if (WARN_ON_ONCE(is_zone_device_page(page)))
> > +			return -EFAULT;
> > +
> > +		dma_addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir);
> > +		if (dma_mapping_error(dev, dma_addr[i]))
> > +			return -EFAULT;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously mapped for GPU SVM migration
> > + * @dev: The device for which the pages were mapped
> > + * @dma_addr: Array of DMA addresses corresponding to mapped pages
> > + * @npages: Number of pages to unmap
> > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > + *
> > + * This function unmaps previously mapped pages of memory for GPU Shared Virtual
> > + * Memory (SVM). It iterates over each DMA address provided in @dma_addr, checks
> > + * if it's valid and not already unmapped, and unmaps the corresponding page.
> > + */
> > +static void drm_gpusvm_migrate_unmap_pages(struct device *dev,
> > +					   dma_addr_t *dma_addr,
> > +					   unsigned long npages,
> > +					   enum dma_data_direction dir)
> > +{
> > +	unsigned long i;
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		if (!dma_addr[i] || dma_mapping_error(dev, dma_addr[i]))
> > +			continue;
> > +
> > +		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
> > +	}
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_to_vram - Migrate GPU SVM range to VRAM
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + * @vram_allocation: Driver-private pointer to the VRAM allocation. The caller
> > + *                   should hold a reference to the VRAM allocation, which
> > + *                   should be dropped via ops->vram_release or upon the
> > + *                   failure of this function.
> > + * @ctx: GPU SVM context
> > + *
> > + * This function migrates the specified GPU SVM range to VRAM. It performs the
> > + * necessary setup and invokes the driver-specific operations for migration to
> > + * VRAM. Upon successful return, @vram_allocation can safely reference @range
> > + * until ops->vram_release is called, which only happens upon successful return.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       void *vram_allocation,
> > +			       const struct drm_gpusvm_ctx *ctx)
> > +{
> > +	u64 start = range->va.start, end = range->va.end;
> > +	struct migrate_vma migrate = {
> > +		.start		= start,
> > +		.end		= end,
> > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > +		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
> > +	};
> > +	struct mm_struct *mm = gpusvm->mm;
> > +	unsigned long i, npages = npages_in_range(start, end);
> > +	struct vm_area_struct *vas;
> > +	struct drm_gpusvm_zdd *zdd = NULL;
> > +	struct page **pages;
> > +	dma_addr_t *dma_addr;
> > +	void *buf;
> > +	int err;
> > +
> > +	if (!range->flags.migrate_vram)
> > +		return -EINVAL;
> > +
> > +	if (!gpusvm->ops->populate_vram_pfn || !gpusvm->ops->copy_to_vram ||
> > +	    !gpusvm->ops->copy_to_sram)
> > +		return -EOPNOTSUPP;
> > +
> > +	if (!ctx->mmap_locked) {
> > +		if (!mmget_not_zero(mm)) {
> > +			err = -EFAULT;
> > +			goto err_out;
> > +		}
> > +		mmap_write_lock(mm);
> > +	}
> > +
> > +	mmap_assert_locked(mm);
> > +
> > +	vas = vma_lookup(mm, start);
> > +	if (!vas) {
> > +		err = -ENOENT;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	if (end > vas->vm_end || start < vas->vm_start) {
> > +		err = -EINVAL;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	if (!vma_is_anonymous(vas)) {
> > +		err = -EBUSY;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
> > +		       sizeof(*pages), GFP_KERNEL);
> > +	if (!buf) {
> > +		err = -ENOMEM;
> > +		goto err_mmunlock;
> > +	}
> > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
> > +
> > +	zdd = drm_gpusvm_zdd_alloc(range);
> > +	if (!zdd) {
> > +		err = -ENOMEM;
> > +		goto err_free;
> > +	}
> > +
> > +	migrate.vma = vas;
> > +	migrate.src = buf;
> > +	migrate.dst = migrate.src + npages;
> > +
> > +	err = migrate_vma_setup(&migrate);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	/*
> > +	 * FIXME: Below cases, !migrate.cpages and migrate.cpages != npages, not
> > +	 * always an error. Need to revisit possible cases and how to handle. We
> > +	 * could prefault on migrate.cpages != npages via hmm_range_fault.
> > +	 */
> > +
> > +	if (!migrate.cpages) {
> > +		err = -EFAULT;
> > +		goto err_free;
> > +	}
> > +
> > +	if (migrate.cpages != npages) {
> > +		err = -EBUSY;
> > +		goto err_finalize;
> > +	}
> > +
> > +	err = gpusvm->ops->populate_vram_pfn(gpusvm, vram_allocation, npages,
> > +					     migrate.dst);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> > +					   migrate.src, npages, DMA_TO_DEVICE);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		struct page *page = pfn_to_page(migrate.dst[i]);
> > +
> > +		pages[i] = page;
> > +		migrate.dst[i] = migrate_pfn(migrate.dst[i]);
> > +		drm_gpusvm_get_vram_page(page, zdd);
> > +	}
> > +
> > +	err = gpusvm->ops->copy_to_vram(gpusvm, pages, dma_addr, npages);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	/* Upon success bind vram allocation to range and zdd */
> > +	range->vram_allocation = vram_allocation;
> > +	WRITE_ONCE(zdd->vram_allocation, vram_allocation);	/* Owns ref */
> > +
> > +err_finalize:
> > +	if (err)
> > +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> > +	migrate_vma_pages(&migrate);
> > +	migrate_vma_finalize(&migrate);
> > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
> > +				       DMA_TO_DEVICE);
> > +err_free:
> > +	if (zdd)
> > +		drm_gpusvm_zdd_put(zdd);
> > +	kvfree(buf);
> > +err_mmunlock:
> > +	if (!ctx->mmap_locked) {
> > +		mmap_write_unlock(mm);
> > +		mmput(mm);
> > +	}
> > +err_out:
> > +	return err;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_populate_sram_pfn - Populate SRAM PFNs for a VM area
> > + * @vas: Pointer to the VM area structure, can be NULL
> > + * @npages: Number of pages to populate
> > + * @src_mpfn: Source array of migrate PFNs
> > + * @mpfn: Array of migrate PFNs to populate
> > + * @addr: Start address for PFN allocation
> > + *
> > + * This function populates the SRAM migrate page frame numbers (PFNs) for the
> > + * specified VM area structure. It allocates and locks pages in the VM area for
> > + * SRAM usage. If vas is non-NULL use alloc_page_vma for allocation, if NULL use
> > + * alloc_page for allocation.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +static int drm_gpusvm_migrate_populate_sram_pfn(struct vm_area_struct *vas,
> > +						unsigned long npages,
> > +						unsigned long *src_mpfn,
> > +						unsigned long *mpfn, u64 addr)
> > +{
> > +	unsigned long i;
> > +
> > +	for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
> > +		struct page *page;
> > +
> > +		if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
> > +			continue;
> > +
> > +		if (vas)
> > +			page = alloc_page_vma(GFP_HIGHUSER, vas, addr);
> > +		else
> > +			page = alloc_page(GFP_HIGHUSER);
> > +
> > +		if (!page)
> > +			return -ENOMEM;
> > +
> > +		lock_page(page);
> > +		mpfn[i] = migrate_pfn(page_to_pfn(page));
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_evict_to_sram - Evict GPU SVM range to SRAM
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * Similar to __drm_gpusvm_migrate_to_sram but does not require mmap lock and
> > + * migration is done via migrate_device_* functions. Fallback path as it is
> > + * preferred to issue migrations with mmap lock.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +static int drm_gpusvm_evict_to_sram(struct drm_gpusvm *gpusvm,
> > +				    struct drm_gpusvm_range *range)
> > +{
> > +	unsigned long npages;
> > +	struct page **pages;
> > +	unsigned long *src, *dst;
> > +	dma_addr_t *dma_addr;
> > +	void *buf;
> > +	int i, err = 0;
> > +
> > +	npages = npages_in_range(range->va.start, range->va.end);
> > +
> > +	buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr) +
> > +		       sizeof(*pages), GFP_KERNEL);
> > +	if (!buf) {
> > +		err = -ENOMEM;
> > +		goto err_out;
> > +	}
> > +	src = buf;
> > +	dst = buf + (sizeof(*src) * npages);
> > +	dma_addr = buf + (2 * sizeof(*src) * npages);
> > +	pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) * npages;
> > +
> > +	err = gpusvm->ops->populate_vram_pfn(gpusvm, range->vram_allocation,
> > +					     npages, src);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	err = migrate_device_vma_range(gpusvm->mm,
> > +				       gpusvm->device_private_page_owner, src,
> > +				       npages, range->va.start);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	err = drm_gpusvm_migrate_populate_sram_pfn(NULL, npages, src, dst, 0);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> > +					   dst, npages, DMA_BIDIRECTIONAL);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	for (i = 0; i < npages; ++i)
> > +		pages[i] = migrate_pfn_to_page(src[i]);
> > +
> > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr, npages);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +err_finalize:
> > +	if (err)
> > +		drm_gpusvm_migration_put_pages(npages, dst);
> > +	migrate_device_pages(src, dst, npages);
> > +	migrate_device_finalize(src, dst, npages);
> > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
> > +				       DMA_BIDIRECTIONAL);
> > +err_free:
> > +	kvfree(buf);
> > +err_out:
> > +
> > +	return err;
> > +}
> > +
> > +/**
> > + * __drm_gpusvm_migrate_to_sram - Migrate GPU SVM range to SRAM (internal)
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @vas: Pointer to the VM area structure
> > + * @page: Pointer to the page for fault handling (can be NULL)
> > + * @start: Start address of the migration range
> > + * @end: End address of the migration range
> > + *
> > + * This internal function performs the migration of the specified GPU SVM range
> > + * to SRAM. It sets up the migration, populates + dma maps SRAM PFNs, and
> > + * invokes the driver-specific operations for migration to SRAM.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +static int __drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > +					struct vm_area_struct *vas,
> > +					struct page *page,
> > +					u64 start, u64 end)
> > +{
> > +	struct migrate_vma migrate = {
> > +		.vma		= vas,
> > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > +		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
> > +		.fault_page	= page,
> > +	};
> > +	unsigned long npages;
> > +	struct page **pages;
> > +	dma_addr_t *dma_addr;
> > +	void *buf;
> > +	int i, err = 0;
> > +
> > +	mmap_assert_locked(gpusvm->mm);
> > +
> > +	/* Corner case where the VMA area struct has been partially unmapped */
> > +	if (start < vas->vm_start)
> > +		start = vas->vm_start;
> > +	if (end > vas->vm_end)
> > +		end = vas->vm_end;
> > +
> > +	migrate.start = start;
> > +	migrate.end = end;
> > +	npages = npages_in_range(start, end);
> > +
> > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
> > +		       sizeof(*pages), GFP_KERNEL);
> > +	if (!buf) {
> > +		err = -ENOMEM;
> > +		goto err_out;
> > +	}
> > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
> > +
> > +	migrate.vma = vas;
> > +	migrate.src = buf;
> > +	migrate.dst = migrate.src + npages;
> > +
> > +	err = migrate_vma_setup(&migrate);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	/* Raced with another CPU fault, nothing to do */
> > +	if (!migrate.cpages)
> > +		goto err_free;
> > +
> > +	err = drm_gpusvm_migrate_populate_sram_pfn(vas, npages,
> > +						   migrate.src, migrate.dst,
> > +						   start);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> > +					   migrate.dst, npages,
> > +					   DMA_BIDIRECTIONAL);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	for (i = 0; i < npages; ++i)
> > +		pages[i] = migrate_pfn_to_page(migrate.src[i]);
> > +
> > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr, npages);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +err_finalize:
> > +	if (err)
> > +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> > +	migrate_vma_pages(&migrate);
> > +	migrate_vma_finalize(&migrate);
> > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
> > +				       DMA_BIDIRECTIONAL);
> > +err_free:
> > +	kvfree(buf);
> > +err_out:
> > +	mmap_assert_locked(gpusvm->mm);
> > +
> > +	return err;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_to_sram - Migrate (evict) GPU SVM range to SRAM
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + * @ctx: GPU SVM context
> > + *
> > + * This function initiates the migration of the specified GPU SVM range to
> > + * SRAM. It performs necessary checks and invokes the internal migration
> > + * function for actual migration.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       const struct drm_gpusvm_ctx *ctx)
> > +{
> > +	u64 start = range->va.start, end = range->va.end;
> > +	struct mm_struct *mm = gpusvm->mm;
> > +	struct vm_area_struct *vas;
> > +	int err;
> > +	bool retry = false;
> > +
> > +	if (!ctx->mmap_locked) {
> > +		if (!mmget_not_zero(mm)) {
> > +			err = -EFAULT;
> > +			goto err_out;
> > +		}
> > +		if (ctx->trylock_mmap) {
> > +			if (!mmap_read_trylock(mm))  {
> > +				err = drm_gpusvm_evict_to_sram(gpusvm, range);
> > +				goto err_mmput;
> > +			}
> > +		} else {
> > +			mmap_read_lock(mm);
> > +		}
> > +	}
> > +
> > +	mmap_assert_locked(mm);
> > +
> > +	/*
> > +	 * Loop required to find all VMA area structs for the corner case when
> > +	 * VRAM backing has been partially unmapped from MM's address space.
> > +	 */
> > +again:
> > +	vas = find_vma(mm, start);
> > +	if (!vas) {
> > +		if (!retry)
> > +			err = -ENOENT;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	if (end <= vas->vm_start || start >= vas->vm_end) {
> > +		if (!retry)
> > +			err = -EINVAL;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	err = __drm_gpusvm_migrate_to_sram(gpusvm, vas, NULL, start, end);
> > +	if (err)
> > +		goto err_mmunlock;
> > +
> > +	if (vas->vm_end < end) {
> > +		retry = true;
> > +		start = vas->vm_end;
> > +		goto again;
> > +	}
> > +
> > +	if (!ctx->mmap_locked) {
> > +		mmap_read_unlock(mm);
> > +		/*
> > +		 * Using mmput_async as this function can be called while
> > +		 * holding a dma-resv lock, and a final put can grab the mmap
> > +		 * lock, causing a lock inversion.
> > +		 */
> > +		mmput_async(mm);
> > +	}
> > +
> > +	return 0;
> > +
> > +err_mmunlock:
> > +	if (!ctx->mmap_locked)
> > +		mmap_read_unlock(mm);
> > +err_mmput:
> > +	if (!ctx->mmap_locked)
> > +		mmput_async(mm);
> > +err_out:
> > +	return err;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_page_free - Put GPU SVM zone device data associated with a page
> > + * @page: Pointer to the page
> > + *
> > + * This function is a callback used to put the GPU SVM zone device data
> > + * associated with a page when it is being released.
> > + */
> > +static void drm_gpusvm_page_free(struct page *page)
> > +{
> > +	drm_gpusvm_zdd_put(page->zone_device_data);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_to_ram - Migrate GPU SVM range to RAM (page fault handler)
> > + * @vmf: Pointer to the fault information structure
> > + *
> > + * This function is a page fault handler used to migrate a GPU SVM range to RAM.
> > + * It retrieves the GPU SVM range information from the faulting page and invokes
> > + * the internal migration function to migrate the range back to RAM.
> > + *
> > + * Returns:
> > + * VM_FAULT_SIGBUS on failure, 0 on success.
> > + */
> > +static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault *vmf)
> > +{
> > +	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
> > +	int err;
> > +
> > +	err = __drm_gpusvm_migrate_to_sram(zdd->range->gpusvm,
> > +					   vmf->vma, vmf->page,
> > +					   zdd->range->va.start,
> > +					   zdd->range->va.end);
> > +
> > +	return err ? VM_FAULT_SIGBUS : 0;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_pagemap_ops - Device page map operations for GPU SVM
> > + */
> > +static const struct dev_pagemap_ops drm_gpusvm_pagemap_ops = {
> > +	.page_free = drm_gpusvm_page_free,
> > +	.migrate_to_ram = drm_gpusvm_migrate_to_ram,
> > +};
> > +
> > +/**
> > + * drm_gpusvm_pagemap_ops_get - Retrieve GPU SVM device page map operations
> > + *
> > + * Returns:
> > + * Pointer to the GPU SVM device page map operations structure.
> > + */
> > +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void)
> > +{
> > +	return &drm_gpusvm_pagemap_ops;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_has_mapping - Check if GPU SVM has mapping for the given address range
> > + * @gpusvm: Pointer to the GPU SVM structure.
> > + * @start: Start address
> > + * @end: End address
> > + *
> > + * Returns:
> > + * True if GPU SVM has mapping, False otherwise
> > + */
> > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start, u64 end)
> > +{
> > +	struct drm_gpusvm_notifier *notifier;
> > +
> > +	drm_gpusvm_for_each_notifier(notifier, gpusvm, start, end) {
> > +		struct drm_gpusvm_range *range = NULL;
> > +
> > +		drm_gpusvm_for_each_range(range, notifier, start, end)
> > +			return true;
> > +	}
> > +
> > +	return false;
> > +}
> > diff --git a/drivers/gpu/drm/xe/drm_gpusvm.h b/drivers/gpu/drm/xe/drm_gpusvm.h
> > new file mode 100644
> > index 000000000000..0ea70f8534a8
> > --- /dev/null
> > +++ b/drivers/gpu/drm/xe/drm_gpusvm.h
> > @@ -0,0 +1,415 @@
> > +/* SPDX-License-Identifier: MIT */
> > +/*
> > + * Copyright © 2024 Intel Corporation
> > + */
> > +
> > +#ifndef __DRM_GPUSVM_H__
> > +#define __DRM_GPUSVM_H__
> > +
> > +#include <linux/kref.h>
> > +#include <linux/mmu_notifier.h>
> > +#include <linux/workqueue.h>
> > +
> > +struct dev_pagemap_ops;
> > +struct drm_device;
> > +struct drm_gpusvm;
> > +struct drm_gpusvm_notifier;
> > +struct drm_gpusvm_ops;
> > +struct drm_gpusvm_range;
> > +
> > +/**
> > + * struct drm_gpusvm_ops - Operations structure for GPU SVM
> > + *
> > + * This structure defines the operations for GPU Shared Virtual Memory (SVM).
> > + * These operations are provided by the GPU driver to manage SVM ranges and
> > + * perform operations such as migration between VRAM and system RAM.
> > + */
> > +struct drm_gpusvm_ops {
> > +	/**
> > +	 * @notifier_alloc: Allocate a GPU SVM notifier (optional)
> > +	 *
> > +	 * This function shall allocate a GPU SVM notifier.
> > +	 *
> > +	 * Returns:
> > +	 * Pointer to the allocated GPU SVM notifier on success, NULL on failure.
> > +	 */
> > +	struct drm_gpusvm_notifier *(*notifier_alloc)(void);
> > +
> > +	/**
> > +	 * @notifier_free: Free a GPU SVM notifier (optional)
> > +	 * @notifier: Pointer to the GPU SVM notifier to be freed
> > +	 *
> > +	 * This function shall free a GPU SVM notifier.
> > +	 */
> > +	void (*notifier_free)(struct drm_gpusvm_notifier *notifier);
> > +
> > +	/**
> > +	 * @range_alloc: Allocate a GPU SVM range (optional)
> > +	 * @gpusvm: Pointer to the GPU SVM
> > +	 *
> > +	 * This function shall allocate a GPU SVM range.
> > +	 *
> > +	 * Returns:
> > +	 * Pointer to the allocated GPU SVM range on success, NULL on failure.
> > +	 */
> > +	struct drm_gpusvm_range *(*range_alloc)(struct drm_gpusvm *gpusvm);
> > +
> > +	/**
> > +	 * @range_free: Free a GPU SVM range (optional)
> > +	 * @range: Pointer to the GPU SVM range to be freed
> > +	 *
> > +	 * This function shall free a GPU SVM range.
> > +	 */
> > +	void (*range_free)(struct drm_gpusvm_range *range);
> > +
> > +	/**
> > +	 * @vram_release: Release VRAM allocation (optional)
> > +	 * @vram_allocation: Driver-private pointer to the VRAM allocation
> > +	 *
> > +	 * This function shall release VRAM allocation and expects to drop a
> > +	 * reference to VRAM allocation.
> > +	 */
> > +	void (*vram_release)(void *vram_allocation);
> > +
> > +	/**
> > +	 * @populate_vram_pfn: Populate VRAM PFN (required for migration)
> > +	 * @gpusvm: Pointer to the GPU SVM
> > +	 * @vram_allocation: Driver-private pointer to the VRAM allocation
> > +	 * @npages: Number of pages to populate
> > +	 * @pfn: Array of page frame numbers to populate
> > +	 *
> > +	 * This function shall populate VRAM page frame numbers (PFN).
> > +	 *
> > +	 * Returns:
> > +	 * 0 on success, a negative error code on failure.
> > +	 */
> > +	int (*populate_vram_pfn)(struct drm_gpusvm *gpusvm,
> > +				 void *vram_allocation,
> > +				 unsigned long npages,
> > +				 unsigned long *pfn);
> > +
> > +	/**
> > +	 * @copy_to_vram: Copy to VRAM (required for migration)
> > +	 * @gpusvm: Pointer to the GPU SVM
> > +	 * @pages: Pointer to array of VRAM pages (destination)
> > +	 * @dma_addr: Pointer to array of DMA addresses (source)
> > +	 * @npages: Number of pages to copy
> > +	 *
> > +	 * This function shall copy pages to VRAM.
> > +	 *
> > +	 * Returns:
> > +	 * 0 on success, a negative error code on failure.
> > +	 */
> > +	int (*copy_to_vram)(struct drm_gpusvm *gpusvm,
> > +			    struct page **pages,
> > +			    dma_addr_t *dma_addr,
> > +			    unsigned long npages);
> > +
> > +	/**
> > +	 * @copy_to_sram: Copy to system RAM (required for migration)
> > +	 * @gpusvm: Pointer to the GPU SVM
> > +	 * @pages: Pointer to array of VRAM pages (source)
> > +	 * @dma_addr: Pointer to array of DMA addresses (destination)
> > +	 * @npages: Number of pages to copy
> > +	 *
> > +	 * This function shall copy pages to system RAM.
> > +	 *
> > +	 * Returns:
> > +	 * 0 on success, a negative error code on failure.
> > +	 */
> > +	int (*copy_to_sram)(struct drm_gpusvm *gpusvm,
> > +			    struct page **pages,
> > +			    dma_addr_t *dma_addr,
> > +			    unsigned long npages);
> > +
> > +	/**
> > +	 * @invalidate: Invalidate GPU SVM notifier (required)
> > +	 * @gpusvm: Pointer to the GPU SVM
> > +	 * @notifier: Pointer to the GPU SVM notifier
> > +	 * @mmu_range: Pointer to the mmu_notifier_range structure
> > +	 *
> > +	 * This function shall invalidate the GPU page tables. It can safely
> > +	 * walk the notifier range RB tree/list in this function. Called while
> > +	 * holding the notifier lock.
> > +	 */
> > +	void (*invalidate)(struct drm_gpusvm *gpusvm,
> > +			   struct drm_gpusvm_notifier *notifier,
> > +			   const struct mmu_notifier_range *mmu_range);
> > +};
> > +
> > +/**
> > + * struct drm_gpusvm_notifier - Structure representing a GPU SVM notifier
> > + *
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: MMU interval notifier
> > + * @interval: Interval for the notifier
> > + * @rb: Red-black tree node for the parent GPU SVM structure notifier tree
> > + * @root: Cached root node of the RB tree containing ranges
> > + * @range_list: List head containing of ranges in the same order they appear in
> > + *              interval tree. This is useful to keep iterating ranges while
> > + *              doing modifications to RB tree.
> > + * @flags.removed: Flag indicating whether the MMU interval notifier has been
> > + *                 removed
> > + *
> > + * This structure represents a GPU SVM notifier.
> > + */
> > +struct drm_gpusvm_notifier {
> > +	struct drm_gpusvm *gpusvm;
> > +	struct mmu_interval_notifier notifier;
> > +	struct {
> > +		u64 start;
> > +		u64 end;
> > +	} interval;
> > +	struct {
> > +		struct rb_node node;
> > +		struct list_head entry;
> > +		u64 __subtree_last;
> > +	} rb;
> > +	struct rb_root_cached root;
> > +	struct list_head range_list;
> > +	struct {
> > +		u32 removed : 1;
> > +	} flags;
> > +};
> > +
> > +/**
> > + * struct drm_gpusvm_range - Structure representing a GPU SVM range
> > + *
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier
> > + * @refcount: Reference count for the range
> > + * @rb: Red-black tree node for the parent GPU SVM notifier structure range tree
> > + * @va: Virtual address range
> > + * @notifier_seq: Notifier sequence number of the range's pages
> > + * @pages: Pointer to the array of pages (if backing store is in VRAM)
> > + * @dma_addr: DMA address array (if backing store is SRAM and DMA mapped)
> > + * @vram_allocation: Driver-private pointer to the VRAM allocation
> > + * @order: Order of dma mapping. i.e. PAGE_SIZE << order is mapping size
> > + * @flags.migrate_vram: Flag indicating whether the range can be migrated to VRAM
> > + * @flags.unmapped: Flag indicating if the range has been unmapped
> > + * @flags.partial_unmap: Flag indicating if the range has been partially unmapped
> > + * @flags.has_vram_pages: Flag indicating if the range has vram pages
> > + * @flags.has_dma_mapping: Flag indicating if the range has a DMA mapping
> > + * @flags.kfree_mapping: Flag indicating @dma_addr is a compact allocation based
> > + *                       on @order which releases via kfree
> > + *
> > + * This structure represents a GPU SVM range used for tracking memory ranges
> > + * mapped in a DRM device.
> > + */
> > +struct drm_gpusvm_range {
> > +	struct drm_gpusvm *gpusvm;
> > +	struct drm_gpusvm_notifier *notifier;
> > +	struct kref refcount;
> > +	struct {
> > +		struct rb_node node;
> > +		struct list_head entry;
> > +		u64 __subtree_last;
> > +	} rb;
> > +	struct {
> > +		u64 start;
> > +		u64 end;
> > +	} va;
> > +	unsigned long notifier_seq;
> > +	union {
> > +		struct page **pages;
> > +		dma_addr_t *dma_addr;
> > +	};
> > +	void *vram_allocation;
> > +	u16 order;
> > +	struct {
> > +		/* All flags below must be set upon creation */
> > +		u16 migrate_vram : 1;
> > +		/* All flags below must be set / cleared under notifier lock */
> > +		u16 unmapped : 1;
> > +		u16 partial_unmap : 1;
> > +		u16 has_vram_pages : 1;
> > +		u16 has_dma_mapping : 1;
> > +		u16 kfree_mapping : 1;
> > +	} flags;
> > +};
> > +
> > +/**
> > + * struct drm_gpusvm - GPU SVM structure
> > + *
> > + * @name: Name of the GPU SVM
> > + * @drm: Pointer to the DRM device structure
> > + * @mm: Pointer to the mm_struct for the address space
> > + * @device_private_page_owner: Device private pages owner
> > + * @mm_start: Start address of GPU SVM
> > + * @mm_range: Range of the GPU SVM
> > + * @notifier_size: Size of individual notifiers
> > + * @ops: Pointer to the operations structure for GPU SVM
> > + * @chunk_sizes: Pointer to the array of chunk sizes used in range allocation.
> > + *               Entries should be powers of 2 in descending order.
> > + * @num_chunks: Number of chunks
> > + * @notifier_lock: Read-write semaphore for protecting notifier operations
> > + * @zdd_wq: Workqueue for deferred work on zdd destruction
> > + * @root: Cached root node of the Red-Black tree containing GPU SVM notifiers
> > + * @notifier_list: List head of notifiers in the same order they appear in
> > + *                 the interval tree. This is useful to keep iterating over
> > + *                 notifiers while doing modifications to the RB tree.
> > + *
> > + * This structure represents a GPU SVM (Shared Virtual Memory) used for tracking
> > + * memory ranges mapped in a DRM (Direct Rendering Manager) device.
> > + *
> > + * No reference counting is provided, as this is expected to be embedded in the
> > + * driver VM structure along with the struct drm_gpuvm, which handles reference
> > + * counting.
> > + */
> > +struct drm_gpusvm {
> > +	const char *name;
> > +	struct drm_device *drm;
> > +	struct mm_struct *mm;
> > +	void *device_private_page_owner;
> > +	u64 mm_start;
> > +	u64 mm_range;
> > +	u64 notifier_size;
> > +	const struct drm_gpusvm_ops *ops;
> > +	const u64 *chunk_sizes;
> > +	int num_chunks;
> > +	struct rw_semaphore notifier_lock;
> > +	struct workqueue_struct *zdd_wq;
> > +	struct rb_root_cached root;
> > +	struct list_head notifier_list;
> > +};
> > +
> > +/**
> > + * struct drm_gpusvm_ctx - DRM GPU SVM context
> > + *
> > + * @mmap_locked: mmap lock is locked
> > + * @trylock_mmap: trylock mmap lock, used to avoid locking inversions
> > + *                (e.g. dma-resv -> mmap lock)
> > + * @in_notifier: entering from a MMU notifier
> > + * @read_only: operating on read-only memory
> > + * @vram_possible: possible to use VRAM
> > + * @prefault: prefault pages
> > + *
> > + * Context that DRM GPUSVM is operating in (i.e. user arguments).
> > + */
> > +struct drm_gpusvm_ctx {
> > +	u32 mmap_locked :1;
> > +	u32 trylock_mmap :1;
> > +	u32 in_notifier :1;
> > +	u32 read_only :1;
> > +	u32 vram_possible :1;
> > +	u32 prefault :1;
> > +};
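
A small readability suggestion here: an example of how the two common
entry points are expected to fill this context in would help first-time
readers. Something like the sketch below (the IS_DGFX() check and the
ctx variable names are made up, only the flag fields come from this
patch):

	/* GPU page fault handler: let GPU SVM handle mmap locking. */
	struct drm_gpusvm_ctx fault_ctx = {
		.vram_possible = IS_DGFX(xe),	/* hypothetical driver policy */
	};

	/* Invalidation callback: called with the notifier lock held. */
	struct drm_gpusvm_ctx invalidate_ctx = {
		.in_notifier = true,
	};
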
> > +
> > +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> > +		    const char *name, struct drm_device *drm,
> > +		    struct mm_struct *mm, void *device_private_page_owner,
> > +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> > +		    const struct drm_gpusvm_ops *ops,
> > +		    const u64 *chunk_sizes, int num_chunks);
> > +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm);
> > +void drm_gpusvm_free(struct drm_gpusvm *gpusvm);
> > +
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64 fault_addr,
> > +				u64 gpuva_start, u64 gpuva_end,
> > +				const struct drm_gpusvm_ctx *ctx);
> > +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> > +			     struct drm_gpusvm_range *range);
> > +
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_get(struct drm_gpusvm_range *range);
> > +void drm_gpusvm_range_put(struct drm_gpusvm_range *range);
> > +
> > +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> > +				  struct drm_gpusvm_range *range);
> > +
> > +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       const struct drm_gpusvm_ctx *ctx);
> > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > +				  struct drm_gpusvm_range *range,
> > +				  const struct drm_gpusvm_ctx *ctx);
> > +
> > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       void *vram_allocation,
> > +			       const struct drm_gpusvm_ctx *ctx);
> > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       const struct drm_gpusvm_ctx *ctx);
> > +
> > +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void);
> > +
> > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start, u64 end);
> > +
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64 start, u64 end);
> > +
> > +/**
> > + * drm_gpusvm_notifier_lock - Lock GPU SVM notifier
> > + * @gpusvm__: Pointer to the GPU SVM structure.
> > + *
> > + * Abstract client usage GPU SVM notifier lock, take lock
> > + */
> > +#define drm_gpusvm_notifier_lock(gpusvm__)	\
> > +	down_read(&(gpusvm__)->notifier_lock)
> > +
> > +/**
> > + * drm_gpusvm_notifier_unlock - Unlock GPU SVM notifier
> > + * @gpusvm__: Pointer to the GPU SVM structure.
> > + *
> > + * Abstract client usage GPU SVM notifier lock, drop lock
> > + */
> > +#define drm_gpusvm_notifier_unlock(gpusvm__)	\
> > +	up_read(&(gpusvm__)->notifier_lock)
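
Since the locking doc says the mmu_interval_read_retry() check is the
one piece left to the driver, maybe also show the expected pairing with
these helpers. A minimal sketch, assuming a driver-side commit helper
(driver_commit_bind() is made up):

	drm_gpusvm_notifier_lock(gpusvm);
	if (mmu_interval_read_retry(&range->notifier->notifier,
				    range->notifier_seq)) {
		drm_gpusvm_notifier_unlock(gpusvm);
		return -EAGAIN;	/* pages changed, collect them again */
	}
	driver_commit_bind(gpusvm, range);
	drm_gpusvm_notifier_unlock(gpusvm);
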
> > +
> > +/**
> > + * __drm_gpusvm_range_next - Get the next GPU SVM range in the list
> > + * @range: a pointer to the current GPU SVM range
> > + *
> > + * Return: A pointer to the next drm_gpusvm_range if available, or NULL if the
> > + *         current range is the last one or if the input range is NULL.
> > + */
> > +static inline struct drm_gpusvm_range *
> > +__drm_gpusvm_range_next(struct drm_gpusvm_range *range)
> > +{
> > +	if (range && !list_is_last(&range->rb.entry,
> > +				   &range->notifier->range_list))
> > +		return list_next_entry(range, rb.entry);
> > +
> > +	return NULL;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_for_each_range - Iterate over GPU SVM ranges in a notifier
> > + * @range__: Iterator variable for the ranges. If set, it indicates the start of
> > + *	     the iterator. If NULL, call drm_gpusvm_range_find() to get the range.
> > + * @notifier__: Pointer to the GPU SVM notifier
> > + * @start__: Start address of the range
> > + * @end__: End address of the range
> > + *
> > + * This macro is used to iterate over GPU SVM ranges in a notifier. It is safe
> > + * to use while holding the driver SVM lock or the notifier lock.
> > + */
> > +#define drm_gpusvm_for_each_range(range__, notifier__, start__, end__)	\
> > +	for ((range__) = (range__) ?:					\
> > +	     drm_gpusvm_range_find((notifier__), (start__), (end__));	\
> > +	     (range__) && (range__->va.start < (end__));		\
> > +	     (range__) = __drm_gpusvm_range_next(range__))
> > +
> > +/**
> > + * drm_gpusvm_range_set_unmapped - Mark a GPU SVM range as unmapped
> > + * @range: Pointer to the GPU SVM range structure.
> > + * @mmu_range: Pointer to the MMU notifier range structure.
> > + *
> > + * This function marks a GPU SVM range as unmapped and sets the partial_unmap flag
> > + * if the range partially falls within the provided MMU notifier range.
> > + */
> > +static inline void
> > +drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range *range,
> > +			      const struct mmu_notifier_range *mmu_range)
> > +{
> > +	lockdep_assert_held_write(&range->gpusvm->notifier_lock);
> > +
> > +	range->flags.unmapped = true;
> > +	if (range->va.start < mmu_range->start ||
> > +	    range->va.end > mmu_range->end)
> > +		range->flags.partial_unmap = true;
> > +}
> > +
> > +#endif /* __DRM_GPUSVM_H__ */
> > -- 
> > 2.34.1
> > 
> 
> -- 
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
Thomas Hellström Aug. 30, 2024, 8:18 a.m. UTC | #18
Hi, Matthew,

On Thu, 2024-08-29 at 20:56 +0000, Matthew Brost wrote:
> On Thu, Aug 29, 2024 at 09:18:29PM +0200, Thomas Hellström wrote:
> > Hi, Matthew,
> > 
> > On Thu, 2024-08-29 at 17:45 +0000, Matthew Brost wrote:
> > > On Thu, Aug 29, 2024 at 11:16:49AM +0200, Thomas Hellström wrote:
> > > > Hi, Matt. 
> > > > 
> > > > Some initial design comments / questions:
> > > > 
> > > > On Tue, 2024-08-27 at 19:48 -0700, Matthew Brost wrote:
> > > > > This patch introduces support for GPU Shared Virtual Memory
> > > > > (SVM)
> > > > > in
> > > > > the
> > > > > Direct Rendering Manager (DRM) subsystem. SVM allows for
> > > > > seamless
> > > > > sharing of memory between the CPU and GPU, enhancing
> > > > > performance
> > > > > and
> > > > > flexibility in GPU computing tasks.
> > > > > 
> > > > > The patch adds the necessary infrastructure for SVM,
> > > > > including
> > > > > data
> > > > > structures and functions for managing SVM ranges and
> > > > > notifiers.
> > > > > It
> > > > > also
> > > > > provides mechanisms for allocating, deallocating, and
> > > > > migrating
> > > > > memory
> > > > > regions between system RAM and GPU VRAM.
> > > > > 
> > > > > This mid-layer is largely inspired by GPUVM.
> > > > > 
> > > > > Cc: Dave Airlie <airlied@redhat.com>
> > > > > Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > > > > Cc: Christian König <christian.koenig@amd.com>
> > > > > Cc: <dri-devel@lists.freedesktop.org>
> > > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > > > ---
> > > > >  drivers/gpu/drm/xe/Makefile     |    3 +-
> > > > >  drivers/gpu/drm/xe/drm_gpusvm.c | 2174
> > > > > +++++++++++++++++++++++++++++++
> > > > >  drivers/gpu/drm/xe/drm_gpusvm.h |  415 ++++++
> > > > >  3 files changed, 2591 insertions(+), 1 deletion(-)
> > > > >  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.c
> > > > >  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.h
> > > > > 
> > > > > diff --git a/drivers/gpu/drm/xe/Makefile
> > > > > b/drivers/gpu/drm/xe/Makefile
> > > > > index b9670ae09a9e..b8fc2ee58f1a 100644
> > > > > --- a/drivers/gpu/drm/xe/Makefile
> > > > > +++ b/drivers/gpu/drm/xe/Makefile
> > > > > @@ -25,7 +25,8 @@ $(obj)/generated/%_wa_oob.c
> > > > > $(obj)/generated/%_wa_oob.h: $(obj)/xe_gen_wa_oob \
> > > > >  
> > > > >  # core driver code
> > > > >  
> > > > > -xe-y += xe_bb.o \
> > > > > +xe-y += drm_gpusvm.o \
> > > > > +	xe_bb.o \
> > > > >  	xe_bo.o \
> > > > >  	xe_bo_evict.o \
> > > > >  	xe_devcoredump.o \
> > > > > diff --git a/drivers/gpu/drm/xe/drm_gpusvm.c
> > > > > b/drivers/gpu/drm/xe/drm_gpusvm.c
> > > > > new file mode 100644
> > > > > index 000000000000..fc1e44e6ae72
> > > > > --- /dev/null
> > > > > +++ b/drivers/gpu/drm/xe/drm_gpusvm.c
> > > > > @@ -0,0 +1,2174 @@
> > > > > +// SPDX-License-Identifier: MIT
> > > > > +/*
> > > > > + * Copyright © 2024 Intel Corporation
> > > > > + *
> > > > > + * Authors:
> > > > > + *     Matthew Brost <matthew.brost@intel.com>
> > > > > + */
> > > > > +
> > > > > +#include <linux/dma-mapping.h>
> > > > > +#include <linux/interval_tree_generic.h>
> > > > > +#include <linux/hmm.h>
> > > > > +#include <linux/memremap.h>
> > > > > +#include <linux/migrate.h>
> > > > > +#include <linux/mm_types.h>
> > > > > +#include <linux/pagemap.h>
> > > > > +#include <linux/slab.h>
> > > > > +
> > > > > +#include <drm/drm_device.h>
> > > > > +#include "drm_gpusvm.h"
> > > > > +
> > > > > +/**
> > > > > + * DOC: Overview
> > > > > + *
> > > > > + * GPU Shared Virtual Memory (GPU SVM) layer for the Direct
> > > > > Rendering Manager (DRM)
> > > > > + *
> > > > > + * The GPU SVM layer is a component of the DRM framework
> > > > > designed to
> > > > > manage shared
> > > > > + * virtual memory between the CPU and GPU. It enables
> > > > > efficient
> > > > > data
> > > > > exchange and
> > > > > + * processing for GPU-accelerated applications by allowing
> > > > > memory
> > > > > sharing and
> > > > > + * synchronization between the CPU's and GPU's virtual
> > > > > address
> > > > > spaces.
> > > > > + *
> > > > > + * Key GPU SVM Components:
> > > > > + * - Notifiers: Used for tracking memory
> > > > > intervals
> > > > > and
> > > > > notifying the
> > > > > + *		GPU of changes, notifiers are sized based on
> > > > > a
> > > > > GPU
> > > > > SVM
> > > > > + *		initialization parameter, with a
> > > > > recommendation
> > > > > of
> > > > > 512M or
> > > > > + *		larger. They maintain a Red-Black tree and a
> > > > > list of
> > > > > ranges that
> > > > > + *		fall within the notifier interval. Notifiers
> > > > > are
> > > > > tracked within
> > > > > + *		a GPU SVM Red-Black tree and list and are
> > > > > dynamically inserted
> > > > > + *		or removed as ranges within the interval are
> > > > > created
> > > > > or
> > > > > + *		destroyed.
> > > > 
> > > > What is the benefit of this extra layer compared to direct
> > > > insertion of
> > > > ranges using mmu_interval_notifier_insert?
> > > > 
> > > > IIRC the argument made previously about having wide notifiers
> > > > was
> > > > that
> > > > the rb tree lookups inside the core were costly and if there
> > > > were
> > > > only
> > > > a few, then the rb tree lookups within a notifier range could
> > > > be
> > > > replaced with the page-table radix-tree-like lookup, so each
> > > > lookup
> > > > complexity would be O(log(n_notifiers) + page_table_depth).
> > > > 
> > > > But now we have first an rb-tree lookup in the core and then an
> > > > rb-
> > > > tree
> > > > lookup within each notifier yielding O(log(n_ranges))
> > > > 
> > > > I can see a small benefit in that inserting directly into the
> > > > core
> > > > rb-
> > > > tree will block pending ongoing invalidations, but at a cost of
> > > > an
> > > > extra multiplexing layer.
> > > > 
> > > 
> > > So when the notifier is triggered the search is a smaller range.
> > > In a
> > > perfect world eventually I'd like to drop the SVM range
> > > completely.
> > > There are a lot of changes required in Xe to make that possible and
> > > I'm not entirely convinced it is possible or that the ROI is worth it
> > > (additional complexity vs. perf benefit). For now, this was a
> > > relatively simple way to get SVM working (mirrors both AMD's and
> > > Nvidia's implementations wrt having a range concept) but is also
> > > flexible in the sense that the
> > > notifier
> > > size can be easily tweaked via a modparam [1] following Jason's
> > > suggestion of larger notifiers.
> > > 
> > > [1]
> > > https://patchwork.freedesktop.org/patch/611007/?series=137870&rev=1
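
For readers following along, the knob being discussed is just the
notifier_size argument to drm_gpusvm_init(); a minimal sketch of the
plumbing (the modparam name and the Xe-side call site below are made
up):

	static unsigned int svm_notifier_size = SZ_512M;
	module_param(svm_notifier_size, uint, 0600);

	err = drm_gpusvm_init(&vm->svm.gpusvm, "Xe SVM", &xe->drm,
			      current->mm, NULL /* devm owner */,
			      0, vm->size, svm_notifier_size,
			      &gpusvm_ops, chunk_sizes,
			      ARRAY_SIZE(chunk_sizes));
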
> > 
> > What I meant was the core is already implementing the "one notifier
> > for
> > the whole range", since your notifier duplicates the
> > mmu_interval_notifier functionality.
> > 
> > The mmu_interval_notifier first does an rbtree search to get to the
> > notifier, and then drm_gpusvm does an rbtree search to get to the
> > range.
> 
> Yes.
> 
> > 
> > If the svm notifier layer is skipped, mmu_interval_notifier has to
> > perform a wider rbtree search to get to the range. The point is,
> > the
> > complexity is the same for both approaches so there is no point in
> > adding a svm notifier layer for that reason. The width of the
> > notifier
> > just adjust the relative size of the two rbtree searches, so from
> > that
> > point of view the drm_gpusvm does not offer any benefit from
> > inserting
> > the ranges into the mmu_interval_notifier directly (except that the
> > mmu_interval_notifier is slightly more heavyweight).
> > 
> 
> I think a large part of it was to avoid inserting / removing many
> notifiers as that was expensive. Agree the search is not
> fundamentally
> faster the way I have this coded. It just avoids heavy inserting /
> removing of notifiers.

So I specifically asked Jason about the performance problem of using
many notifiers vs using a single one, and he responded that the problem
is slowing down the core mm on invalidations, if the RB tree gets too
large to walk. He also mentioned that we should consider core
invalidation performance before faulting performance, because the latter
is so slow anyway that we must have the driver stack avoid gpu faults
using user-space prefetching and similar techniques.

In particular, inserting into and removing from the mmu_interval tree is
not costly in terms of locking, but because of correctness requirements
insertion might block on ongoing invalidations.

So basically what I'm trying to say is that as long as we're using SVM
ranges in the way we do (I'm not saying that is wrong at this point,
and I agree that could be fine-tuned later), the benefit of an extra
notifier layer is questionable compared to directly inserting the
ranges into the mmu_interval_tree. Hence my question: given those
considerations, why this additional layer?
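
To make the alternative concrete, what I have in mind is roughly the
below, i.e. each range carrying its own mmu_interval_notifier instead of
hanging off a gpusvm-level notifier (sketch only, the driver struct and
callback names are made up):

	struct driver_svm_range {
		struct mmu_interval_notifier notifier;
		u64 start, end;
	};

	static const struct mmu_interval_notifier_ops driver_range_ops = {
		.invalidate = driver_range_invalidate,
	};

	/* On GPU fault, one interval notifier per range: */
	err = mmu_interval_notifier_insert(&range->notifier, mm,
					   range->start,
					   range->end - range->start,
					   &driver_range_ops);

The insert is of course where the potential blocking on ongoing
invalidations comes in, which is the trade-off mentioned above.
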

Anyway, a more detailed review of the code perhaps helps clear this
up.

> 
> > As I understand it, Jason's comments were based on the assumption
> > that
> > the drm_gpusvm search would be radix tree based, and hence with
> > less
> > complexity than the rbtree search, and therefore providing a clear
> > benefit the larger they could be.
> > 
> > I.e. just calling something similar to xe_vm_invalidate_xxx over
> > the
> > whole range, which will just skip subranges that are not populated.
> > 
> 
> As stated, I think eventually removing the SVM range is a good
> long-term
> goal.
> 
> I almost coded that in this initial series but ran into a number of
> issues which make this complex. To get something working in the
> simplest way possible, and to enable further test development,
> constructive upstream discussions (which appear to be happening),
> UMD / application development, and other upper-layer KMD development,
> I stuck with this approach.
> 
> I think for any solution which requires a SVM range (fwiw both AMD
> and
> Nvidia have a similar concept), attaching the ranges to a larger
> notifier makes sense and is better than 1 notifier per range.
> 
> Issues with removing a SVM range:
> 
> - Xe bind code stores invalidation / present state in the VMA; this
>   would need to be moved to the radix tree. I have a Jira open for that
>   work which I believe other developers are going to own.
> - Where would the dma mapping / device pages be stored?
> 	- In the radix tree? What if ATS is enabled? We don't have a
> 	  driver-owned radix tree. How do we reasonably connect a
> 	  driver-owned radix tree to a common GPUSVM layer?
> 	- In the notifier? What if the notifier is sparsely populated?
> 	  We would be wasting huge amounts of memory. What if the
> 	  notifier is configured to span the entire virtual address
> 	  space?
> - How does the garbage collector work? We can't allocate memory in the
>   notifier so we don't have anything to add to the garbage collector. We
>   can't directly modify page tables either, given the required locks
>   can't be taken in the path of reclaim.
> - How do we deal with fault storms (e.g. tons of faults hitting the same
>   SVM range in a row)? Without an SVM range there is no way to know if a
>   mapping is valid, and the GPU page fault handler can't be
>   short-circuited.
> - Do we have a notifier seqno for every PTE?
> 
> I feel like I'm missing a few and likely more issues would arise
> when
> implementing this too.
> 
> To be clear, I'm saying we shouldn't try to do this now; all of the
> above issues are likely workable, but doing all this upfront is akin to
> running before we can walk. I'd rather solve the fundamental locking
> issues first, have robust testing in place + passing, and UMDs / apps
> running before trying to rework this one. Performance numbers for this
> would also be helpful.
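
On the fault-storm point, agreed that with a range object the fault
handler can bail out early. Roughly something like the below under the
notifier lock, using the helpers from this patch (sketch only, the
driver would additionally check its own binding state):

	range = drm_gpusvm_range_find_or_insert(gpusvm, fault_addr,
						gpuva_start, gpuva_end,
						&ctx);
	if (IS_ERR(range))
		return PTR_ERR(range);

	/* Short-circuit: a previous fault already populated this range. */
	drm_gpusvm_notifier_lock(gpusvm);
	if (drm_gpusvm_range_pages_valid(gpusvm, range)) {
		drm_gpusvm_notifier_unlock(gpusvm);
		return 0;
	}
	drm_gpusvm_notifier_unlock(gpusvm);
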






> 
> Matt
> 
> > /Thomas
> > 
> > > 
> > > > > + * - Ranges: Represent memory ranges mapped in a DRM device
> > > > > and
> > > > > managed
> > > > > + *	     by GPU SVM. They are sized based on an array of
> > > > > chunk
> > > > > sizes, which
> > > > > + *	     is a GPU SVM initialization parameter, and the
> > > > > CPU
> > > > > address space.
> > > > > + *	     Upon GPU fault, the largest aligned chunk that
> > > > > fits
> > > > > within the
> > > > > + *	     faulting CPU address space is chosen for the
> > > > > range
> > > > > size. Ranges are
> > > > > + *	     expected to be dynamically allocated on GPU
> > > > > fault
> > > > > and
> > > > > removed on an
> > > > > + *	     MMU notifier UNMAP event. As mentioned above,
> > > > > ranges
> > > > > are tracked in
> > > > > + *	     a notifier's Red-Black tree.
> > > > 
> > > > How do ranges and chunks map to
> > > >  
> > > > a) Prefaulting granularity
> > > > b) Migration granularity?
> > > > 
> > > > > + * - Operations: Define the interface for driver-specific
> > > > > SVM
> > > > > operations such as
> > > > > + *		 allocation, page collection, migration,
> > > > > invalidations, and VRAM
> > > > > + *		 release.
> > > > > + *
> > > > > + * This layer provides interfaces for allocating, mapping,
> > > > > migrating, and
> > > > > + * releasing memory ranges between the CPU and GPU. It
> > > > > handles
> > > > > all
> > > > > core memory
> > > > > + * management interactions (DMA mapping, HMM, and migration)
> > > > > and
> > > > > provides
> > > > > + * driver-specific virtual functions (vfuncs). This
> > > > > infrastructure
> > > > > is sufficient
> > > > > + * to build the expected driver components for an SVM
> > > > > implementation
> > > > > as detailed
> > > > > + * below.
> > > > > + *
> > > > > + * Expected Driver Components:
> > > > > + * - GPU page fault handler: Used to create ranges and
> > > > > notifiers
> > > > > based on the
> > > > > + *			     fault address, optionally
> > > > > migrate
> > > > > the
> > > > > range to
> > > > > + *			     VRAM, and create GPU bindings.
> > > > > + * - Garbage collector: Used to destroy GPU bindings for
> > > > > ranges.
> > > > > Ranges are
> > > > > + *			expected to be added to the garbage
> > > > > collector upon
> > > > > + *			MMU_NOTIFY_UNMAP event.
> > > > > + */
> > > > > +
> > > > > +/**
> > > > > + * DOC: Locking
> > > > > + *
> > > > > + * GPU SVM handles locking for core MM interactions, i.e.,
> > > > > it
> > > > > locks/unlocks the
> > > > > + * mmap lock as needed. Alternatively, if the driver prefers
> > > > > to
> > > > > handle the mmap
> > > > > + * lock itself, a 'locked' argument is provided to the
> > > > > functions
> > > > > that require
> > > > > + * the mmap lock. This option may be useful for drivers that
> > > > > need to
> > > > > call into
> > > > > + * GPU SVM while also holding a dma-resv lock, thus
> > > > > preventing
> > > > > locking
> > > > > + * inversions between the mmap and dma-resv locks.
> > > > > + *
> > > > > + * GPU SVM introduces a global notifier lock, which
> > > > > safeguards
> > > > > the
> > > > > notifier's
> > > > > + * range RB tree and list, as well as the range's DMA
> > > > > mappings
> > > > > and
> > > > > sequence
> > > > > + * number. GPU SVM manages all necessary locking and
> > > > > unlocking
> > > > > operations,
> > > > > + * except for the recheck of the range's sequence number
> > > > > + * (mmu_interval_read_retry) when the driver is committing
> > > > > GPU
> > > > > bindings. This
> > > > > + * lock corresponds to the 'driver->update' lock mentioned
> > > > > in
> > > > > the
> > > > > HMM
> > > > > + * documentation (TODO: Link). Future revisions may
> > > > > transition
> > > > > from
> > > > > a GPU SVM
> > > > > + * global lock to a per-notifier lock if finer-grained
> > > > > locking
> > > > > is
> > > > > deemed
> > > > > + * necessary.
> > > > > + *
> > > > > + * In addition to the locking mentioned above, the driver
> > > > > should
> > > > > implement a
> > > > > + * lock to safeguard core GPU SVM function calls that modify
> > > > > state,
> > > > > such as
> > > > > + * drm_gpusvm_range_find_or_insert and
> > > > > drm_gpusvm_range_remove.
> > > > > Alternatively,
> > > > > + * these core functions can be called within a single kernel
> > > > > thread,
> > > > > for
> > > > > + * instance, using an ordered work queue. This lock is
> > > > > denoted
> > > > > as
> > > > > + * 'driver_svm_lock' in code examples.
> > > > > + */
> > > > > +
> > > > > +/**
> > > > > + * DOC: Migration
> > > > > + *
> > > > > + * The migration support is quite simple, allowing migration
> > > > > between
> > > > > SRAM and
> > > > > + * VRAM at the range granularity. For example, GPU SVM
> > > > > currently
> > > > > does not
> > > > > + * support mixing SRAM and VRAM pages within a range. This
> > > > > means
> > > > > that upon GPU
> > > > > + * fault, the entire range can be migrated to VRAM, and upon
> > > > > CPU
> > > > > fault, the
> > > > > + * entire range is migrated to SRAM.
> > > > > + *
> > > > > + * The reasoning for only supporting range granularity is as
> > > > > follows: it
> > > > > + * simplifies the implementation, and range sizes are
> > > > > driver-
> > > > > defined
> > > > > and should
> > > > > + * be relatively small.
> > > > > + */
> > > > > +
> > > > > +/**
> > > > > + * DOC: Partial Unmapping of Ranges
> > > > > + *
> > > > > + * Partial unmapping of ranges (e.g., 1M out of 2M is
> > > > > unmapped
> > > > > by
> > > > > CPU resulting
> > > > > + * in MMU_NOTIFY_UNMAP event) presents several challenges,
> > > > > with
> > > > > the
> > > > > main one
> > > > > + * being that a subset of the range still has CPU and GPU
> > > > > mappings.
> > > > > If the
> > > > > + * backing store for the range is in VRAM, a subset of the
> > > > > backing
> > > > > store has
> > > > > + * references. One option would be to split the range and
> > > > > VRAM
> > > > > backing store,
> > > > > + * but the implementation for this would be quite
> > > > > complicated.
> > > > > Given
> > > > > that
> > > > > + * partial unmappings are rare and driver-defined range
> > > > > sizes
> > > > > are
> > > > > relatively
> > > > > + * small, GPU SVM does not support splitting of ranges.
> > > > > + *
> > > > > + * With no support for range splitting, upon partial
> > > > > unmapping
> > > > > of a
> > > > > range, the
> > > > > + * driver is expected to invalidate and destroy the entire
> > > > > range. If
> > > > > the range
> > > > > + * has VRAM as its backing, the driver is also expected to
> > > > > migrate
> > > > > any remaining
> > > > > + * pages back to SRAM.
> > > > 
> > > > So what happens if we get a one-page invalidation, say
> > > > protection
> > > > change event, or NUMA accounting event, in the middle of a
> > > > range?
> > > > Can
> > > > we unmap just that single gpu pte covering that range, that is,
> > > > how
> > > > do
> > > > the ranges map to invalidation granularity? Does this differ
> > > > between
> > > > igfx and dgfx?
> > > 
> > > Well the idea of chunks is that ranges should be 1 GPU page (the
> > > chunk array in Xe is 4k, 64k, and 2M). The design is flexible enough
> > > that this doesn't have to be true, but it is optimized for the
> > > assumption that each range is most likely 1 GPU page. If this isn't
> > > true, then all GPU pages in the range are invalidated, which isn't
> > > ideal but keeps it simple, which IMO far outweighs the potential
> > > benefits. In theory a driver could implement splitting / partial
> > > invalidations too with a couple of updates to GPUSVM, but that would
> > > likely largely be a driver implementation rather than GPUSVM.
> > > 
> > > No difference between igfx and dgfx.
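
It might be worth spelling that chunk selection out in the
documentation. My understanding of the "largest aligned chunk that
fits" rule from the overview, as a sketch (the helper name is made up,
vas_start/vas_end being the CPU VMA bounds clamped to the GPU VA range):

	static u64 pick_chunk_size(struct drm_gpusvm *gpusvm, u64 fault_addr,
				   u64 vas_start, u64 vas_end)
	{
		int i;

		/* chunk_sizes[] is descending, e.g. { SZ_2M, SZ_64K, SZ_4K } */
		for (i = 0; i < gpusvm->num_chunks; ++i) {
			u64 chunk = gpusvm->chunk_sizes[i];
			u64 start = ALIGN_DOWN(fault_addr, chunk);
			u64 end = ALIGN(fault_addr + 1, chunk);

			if (start >= vas_start && end <= vas_end)
				return chunk;
		}

		return SZ_4K;
	}
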
> > > 
> > > You bring up a good point about protection changes; I likely haven't
> > > fully gotten that part of the implementation correct either. I can
> > > add this to my TODO list and also update my IGTs to do things like
> > > this.
> > > 
> > > Matt
> > > 
> > > > 
> > > > Thanks,
> > > > Thomas
> > > > 
> > > > 
> > > > 
> > > > 
> > > > > + */
> > > > > +
> > > > > +/**
> > > > > + * DOC: Examples
> > > > > + *
> > > > > + * This section provides two examples of how to build the
> > > > > expected
> > > > > driver
> > > > > + * components: the GPU page fault handler and the garbage
> > > > > collector.
> > > > > A third
> > > > > + * example demonstrates a sample invalidation driver vfunc.
> > > > > + *
> > > > > + * The generic code provided does not include logic for
> > > > > complex
> > > > > migration
> > > > > + * policies, optimized invalidations, or other potentially
> > > > > required
> > > > > driver
> > > > > + * locking (e.g., DMA-resv locks).
> > > > > + *
> > > > > + * 1) GPU page fault handler
> > > > > + *
> > > > > + *	int driver_bind_range(struct drm_gpusvm *gpusvm,
> > > > > struct
> > > > > drm_gpusvm_range *range)
> > > > > + *	{
> > > > > + *		int err = 0;
> > > > > + *
> > > > > +
> > > > > *		driver_alloc_and_setup_memory_for_bind(gpusvm,
> > > > > range);
> > > > > + *
> > > > > + *		drm_gpusvm_notifier_lock(gpusvm);
> > > > > + *		if (drm_gpusvm_range_pages_valid(range))
> > > > > + *			driver_commit_bind(gpusvm, range);
> > > > > + *		else
> > > > > + *			err = -EAGAIN;
> > > > > + *		drm_gpusvm_notifier_unlock(gpusvm);
> > > > > + *
> > > > > + *		return err;
> > > > > + *	}
> > > > > + *
> > > > > + *	int driver_gpu_fault(struct drm_gpusvm *gpusvm, u64
> > > > > fault_addr,
> > > > > + *			     u64 gpuva_start, u64 gpuva_end)
> > > > > + *	{
> > > > > + *		struct drm_gpusvm_ctx ctx = {};
> > > > > + *		int err;
> > > > > + *
> > > > > + *		driver_svm_lock();
> > > > > + *	retry:
> > > > > + *		// Always process UNMAPs first so view of
> > > > > GPU
> > > > > SVM
> > > > > ranges is current
> > > > > + *		driver_garbage_collector(gpusvm);
> > > > > + *
> > > > > + *		range =
> > > > > drm_gpusvm_range_find_or_insert(gpusvm,
> > > > > fault_addr,
> > > > > +
> > > > > *							gpuv
> > > > > a_start,
> > > > > gpuva_end,
> > > > > + *						       
> > > > > &ctx);
> > > > > + *		if (IS_ERR(range)) {
> > > > > + *			err = PTR_ERR(range);
> > > > > + *			goto unlock;
> > > > > + *		}
> > > > > + *
> > > > > + *		if (driver_migration_policy(range)) {
> > > > > + *			bo = driver_alloc_bo();
> > > > > + *			err =
> > > > > drm_gpusvm_migrate_to_vram(gpusvm,
> > > > > range, bo, &ctx);
> > > > > + *			if (err)	// CPU mappings may
> > > > > have
> > > > > changed
> > > > > + *				goto retry;
> > > > > + *		}
> > > > > + *
> > > > > + *		err = drm_gpusvm_range_get_pages(gpusvm,
> > > > > range,
> > > > > &ctx);
> > > > > + *		if (err == -EFAULT || err == -EPERM)	//
> > > > > CPU
> > > > > mappings changed
> > > > > + *			goto retry;
> > > > > + *		else if (err)
> > > > > + *			goto unlock;
> > > > > + *
> > > > > + *		err = driver_bind_range(gpusvm, range);
> > > > > + *		if (err == -EAGAIN)	// CPU mappings
> > > > > changed
> > > > > + *			goto retry
> > > > > + *
> > > > > + *	unlock:
> > > > > + *		driver_svm_unlock();
> > > > > + *		return err;
> > > > > + *	}
> > > > > + *
> > > > > + * 2) Garbage Collector.
> > > > > + *
> > > > > + *	void __driver_garbage_collector(struct drm_gpusvm
> > > > > *gpusvm,
> > > > > + *					struct
> > > > > drm_gpusvm_range
> > > > > *range)
> > > > > + *	{
> > > > > + *		struct drm_gpusvm_ctx ctx = {};
> > > > > + *
> > > > > + *		assert_driver_svm_locked(gpusvm);
> > > > > + *
> > > > > + *		// Partial unmap, migrate any remaining VRAM
> > > > > pages
> > > > > back to SRAM
> > > > > + *		if (range->flags.partial_unmap)
> > > > > + *			drm_gpusvm_migrate_to_sram(gpusvm,
> > > > > range,
> > > > > &ctx);
> > > > > + *
> > > > > + *		driver_unbind_range(range);
> > > > > + *		drm_gpusvm_range_remove(gpusvm, range);
> > > > > + *	}
> > > > > + *
> > > > > + *	void driver_garbage_collector(struct drm_gpusvm
> > > > > *gpusvm)
> > > > > + *	{
> > > > > + *		assert_driver_svm_locked(gpusvm);
> > > > > + *
> > > > > + *		for_each_range_in_garbage_collector(gpusvm,
> > > > > range)
> > > > > + *			__driver_garbage_collector(gpusvm,
> > > > > range);
> > > > > + *	}
> > > > > + *
> > > > > + * 3) Invalidation driver vfunc.
> > > > > + *
> > > > > + *	void driver_invalidation(struct drm_gpusvm *gpusvm,
> > > > > + *				 struct drm_gpusvm_notifier
> > > > > *notifier,
> > > > > + *				 const struct
> > > > > mmu_notifier_range
> > > > > *mmu_range)
> > > > > + *	{
> > > > > + *		struct drm_gpusvm_ctx ctx = { .in_notifier =
> > > > > true,
> > > > > };
> > > > > + *		struct drm_gpusvm_range *range = NULL;
> > > > > + *
> > > > > + *		driver_invalidate_device_tlb(gpusvm,
> > > > > mmu_range-
> > > > > > start, mmu_range->end);
> > > > > + *
> > > > > + *		drm_gpusvm_for_each_range(range, notifier,
> > > > > mmu_range->start,
> > > > > + *					  mmu_range->end) {
> > > > > + *			drm_gpusvm_range_unmap_pages(gpusvm,
> > > > > range,
> > > > > &ctx);
> > > > > + *
> > > > > + *			if (mmu_range->event !=
> > > > > MMU_NOTIFY_UNMAP)
> > > > > + *				continue;
> > > > > + *
> > > > > + *			drm_gpusvm_range_set_unmapped(range,
> > > > > mmu_range);
> > > > > + *			driver_garbage_collector_add(gpusvm,
> > > > > range);
> > > > > + *		}
> > > > > + *	}
> > > > > + */
> > > > > +
> > > > > +#define DRM_GPUSVM_RANGE_START(_range)	((_range)->va.start)
> > > > > +#define DRM_GPUSVM_RANGE_END(_range)	((_range)->va.end -
> > > > > 1)
> > > > > +INTERVAL_TREE_DEFINE(struct drm_gpusvm_range, rb.node, u64,
> > > > > rb.__subtree_last,
> > > > > +		     DRM_GPUSVM_RANGE_START,
> > > > > DRM_GPUSVM_RANGE_END,
> > > > > +		     static __maybe_unused, range);
> > > > > +
> > > > > +#define
> > > > > DRM_GPUSVM_NOTIFIER_START(_notifier)	((_notifier)-
> > > > > > interval.start)
> > > > > +#define
> > > > > DRM_GPUSVM_NOTIFIER_END(_notifier)	((_notifier)-
> > > > > > interval.end - 1)
> > > > > +INTERVAL_TREE_DEFINE(struct drm_gpusvm_notifier, rb.node,
> > > > > u64,
> > > > > +		     rb.__subtree_last,
> > > > > DRM_GPUSVM_NOTIFIER_START,
> > > > > +		     DRM_GPUSVM_NOTIFIER_END, static
> > > > > __maybe_unused,
> > > > > notifier);
> > > > > +
> > > > > +/**
> > > > > + * npages_in_range() - Calculate the number of pages in a
> > > > > given
> > > > > range
> > > > > + * @start__: The start address of the range
> > > > > + * @end__: The end address of the range
> > > > > + *
> > > > > + * This macro calculates the number of pages in a given
> > > > > memory
> > > > > range,
> > > > > + * specified by the start and end addresses. It divides the
> > > > > difference
> > > > > + * between the end and start addresses by the page size
> > > > > (PAGE_SIZE)
> > > > > to
> > > > > + * determine the number of pages in the range.
> > > > > + *
> > > > > + * Return: The number of pages in the specified range.
> > > > > + */
> > > > > +#define npages_in_range(start__, end__)	\
> > > > > +	(((end__) - (start__)) >> PAGE_SHIFT)
> > > > > +
> > > > > +/**
> > > > > + * struct drm_gpusvm_zdd - GPU SVM zone device data
> > > > > + *
> > > > > + * @refcount: Reference count for the zdd
> > > > > + * @destroy_work: Work structure for asynchronous zdd
> > > > > destruction
> > > > > + * @range: Pointer to the GPU SVM range
> > > > > + * @vram_allocation: Driver-private pointer to the VRAM
> > > > > allocation
> > > > > + *
> > > > > + * This structure serves as a generic wrapper installed in
> > > > > + * page->zone_device_data. It provides infrastructure for
> > > > > looking up
> > > > > a range
> > > > > + * upon CPU page fault and asynchronously releasing VRAM
> > > > > once
> > > > > the
> > > > > CPU has no
> > > > > + * page references. Asynchronous release is useful because
> > > > > CPU
> > > > > page
> > > > > references
> > > > > + * can be dropped in IRQ contexts, while releasing VRAM
> > > > > likely
> > > > > requires sleeping
> > > > > + * locks.
> > > > > + */
> > > > > +struct drm_gpusvm_zdd {
> > > > > +	struct kref refcount;
> > > > > +	struct work_struct destroy_work;
> > > > > +	struct drm_gpusvm_range *range;
> > > > > +	void *vram_allocation;
> > > > > +};
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_zdd_destroy_work_func - Work function for
> > > > > destroying a
> > > > > zdd
> > > > > + * @w: Pointer to the work_struct
> > > > > + *
> > > > > + * This function releases VRAM, puts GPU SVM range, and
> > > > > frees
> > > > > zdd.
> > > > > + */
> > > > > +static void drm_gpusvm_zdd_destroy_work_func(struct
> > > > > work_struct
> > > > > *w)
> > > > > +{
> > > > > +	struct drm_gpusvm_zdd *zdd =
> > > > > +		container_of(w, struct drm_gpusvm_zdd,
> > > > > destroy_work);
> > > > > +	struct drm_gpusvm_range *range = zdd->range;
> > > > > +	struct drm_gpusvm *gpusvm = range->gpusvm;
> > > > > +
> > > > > +	if (gpusvm->ops->vram_release && zdd-
> > > > > >vram_allocation)
> > > > > +		gpusvm->ops->vram_release(zdd-
> > > > > >vram_allocation);
> > > > > +	drm_gpusvm_range_put(range);
> > > > > +	kfree(zdd);
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_zdd_alloc - Allocate a zdd structure.
> > > > > + * @range: Pointer to the GPU SVM range.
> > > > > + *
> > > > > + * This function allocates and initializes a new zdd
> > > > > structure.
> > > > > It
> > > > > sets up the
> > > > > + * reference count, initializes the destroy work, and links
> > > > > the
> > > > > provided GPU SVM
> > > > > + * range.
> > > > > + *
> > > > > + * Returns:
> > > > > + * Pointer to the allocated zdd on success, ERR_PTR() on
> > > > > failure.
> > > > > + */
> > > > > +static struct drm_gpusvm_zdd *
> > > > > +drm_gpusvm_zdd_alloc(struct drm_gpusvm_range *range)
> > > > > +{
> > > > > +	struct drm_gpusvm_zdd *zdd;
> > > > > +
> > > > > +	zdd = kmalloc(sizeof(*zdd), GFP_KERNEL);
> > > > > +	if (!zdd)
> > > > > +		return NULL;
> > > > > +
> > > > > +	kref_init(&zdd->refcount);
> > > > > +	INIT_WORK(&zdd->destroy_work,
> > > > > drm_gpusvm_zdd_destroy_work_func);
> > > > > +	zdd->range = drm_gpusvm_range_get(range);
> > > > > +	zdd->vram_allocation = NULL;
> > > > > +
> > > > > +	return zdd;
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_zdd_get - Get a reference to a zdd structure.
> > > > > + * @zdd: Pointer to the zdd structure.
> > > > > + *
> > > > > + * This function increments the reference count of the
> > > > > provided
> > > > > zdd
> > > > > structure.
> > > > > + *
> > > > > + * Returns: Pointer to the zdd structure.
> > > > > + */
> > > > > +static struct drm_gpusvm_zdd *drm_gpusvm_zdd_get(struct
> > > > > drm_gpusvm_zdd *zdd)
> > > > > +{
> > > > > +	kref_get(&zdd->refcount);
> > > > > +	return zdd;
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_zdd_destroy - Destroy a zdd structure.
> > > > > + * @ref: Pointer to the reference count structure.
> > > > > + *
> > > > > + * This function queues the destroy_work of the zdd for
> > > > > asynchronous
> > > > > destruction.
> > > > > + */
> > > > > +static void drm_gpusvm_zdd_destroy(struct kref *ref)
> > > > > +{
> > > > > +	struct drm_gpusvm_zdd *zdd =
> > > > > +		container_of(ref, struct drm_gpusvm_zdd,
> > > > > refcount);
> > > > > +	struct drm_gpusvm *gpusvm = zdd->range->gpusvm;
> > > > > +
> > > > > +	queue_work(gpusvm->zdd_wq, &zdd->destroy_work);
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_zdd_put - Put a zdd reference.
> > > > > + * @zdd: Pointer to the zdd structure.
> > > > > + *
> > > > > + * This function decrements the reference count of the
> > > > > provided
> > > > > zdd
> > > > > structure
> > > > > + * and schedules its destruction if the count drops to zero.
> > > > > + */
> > > > > +static void drm_gpusvm_zdd_put(struct drm_gpusvm_zdd *zdd)
> > > > > +{
> > > > > +	kref_put(&zdd->refcount, drm_gpusvm_zdd_destroy);
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_range_find - Find GPU SVM range from GPU SVM
> > > > > notifier
> > > > > + * @notifier: Pointer to the GPU SVM notifier structure.
> > > > > + * @start: Start address of the range
> > > > > + * @end: End address of the range
> > > > > + *
> > > > > + * Return: A pointer to the drm_gpusvm_range if found or
> > > > > NULL
> > > > > + */
> > > > > +struct drm_gpusvm_range *
> > > > > +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier,
> > > > > u64
> > > > > start, u64 end)
> > > > > +{
> > > > > +	return range_iter_first(&notifier->root, start, end
> > > > > -
> > > > > 1);
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_for_each_range_safe - Safely iterate over GPU
> > > > > SVM
> > > > > ranges in a notifier
> > > > > + * @range__: Iterator variable for the ranges
> > > > > + * @next__: Iterator variable for the ranges temporary
> > > > > storage
> > > > > + * @notifier__: Pointer to the GPU SVM notifier
> > > > > + * @start__: Start address of the range
> > > > > + * @end__: End address of the range
> > > > > + *
> > > > > + * This macro is used to iterate over GPU SVM ranges in a
> > > > > notifier
> > > > > while
> > > > > + * removing ranges from it.
> > > > > + */
> > > > > +#define drm_gpusvm_for_each_range_safe(range__, next__,
> > > > > notifier__,
> > > > > start__, end__)	\
> > > > > +	for ((range__) = drm_gpusvm_range_find((notifier__),
> > > > > (start__), (end__)),	\
> > > > > +	     (next__) =
> > > > > __drm_gpusvm_range_next(range__);			
> > > > > 	\
> > > > > +	     (range__) && (range__->va.start <
> > > > > (end__));				\
> > > > > +	     (range__) = (next__), (next__) =
> > > > > __drm_gpusvm_range_next(range__))
> > > > > +
> > > > > +/**
> > > > > + * __drm_gpusvm_notifier_next - get the next
> > > > > drm_gpusvm_notifier
> > > > > in
> > > > > the list
> > > > > + * @notifier: a pointer to the current drm_gpusvm_notifier
> > > > > + *
> > > > > + * Return: A pointer to the next drm_gpusvm_notifier if
> > > > > available,
> > > > > or NULL if
> > > > > + *         the current notifier is the last one or if the
> > > > > input
> > > > > notifier is
> > > > > + *         NULL.
> > > > > + */
> > > > > +static struct drm_gpusvm_notifier *
> > > > > +__drm_gpusvm_notifier_next(struct drm_gpusvm_notifier
> > > > > *notifier)
> > > > > +{
> > > > > +	if (notifier && !list_is_last(&notifier->rb.entry,
> > > > > +				      &notifier->gpusvm-
> > > > > > notifier_list))
> > > > > +		return list_next_entry(notifier, rb.entry);
> > > > > +
> > > > > +	return NULL;
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_for_each_notifier - Iterate over GPU SVM
> > > > > notifiers
> > > > > in
> > > > > a gpusvm
> > > > > + * @notifier__: Iterator variable for the notifiers
> > > > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > > > + * @start__: Start address of the notifier
> > > > > + * @end__: End address of the notifier
> > > > > + *
> > > > > + * This macro is used to iterate over GPU SVM notifiers in a
> > > > > gpusvm.
> > > > > + */
> > > > > +#define drm_gpusvm_for_each_notifier(notifier__, gpusvm__,
> > > > > start__,
> > > > > end__)		\
> > > > > +	for ((notifier__) = notifier_iter_first(&(gpusvm__)-
> > > > > > root,
> > > > > (start__), (end__) - 1);	\
> > > > > +	     (notifier__) && (notifier__->interval.start <
> > > > > (end__));			\
> > > > > +	     (notifier__) =
> > > > > __drm_gpusvm_notifier_next(notifier__))
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_for_each_notifier_safe - Safely iterate over
> > > > > GPU
> > > > > SVM
> > > > > notifiers in a gpusvm
> > > > > + * @notifier__: Iterator variable for the notifiers
> > > > > + * @next__: Iterator variable for the notifiers temporary
> > > > > storage
> > > > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > > > + * @start__: Start address of the notifier
> > > > > + * @end__: End address of the notifier
> > > > > + *
> > > > > + * This macro is used to iterate over GPU SVM notifiers in a
> > > > > gpusvm
> > > > > while
> > > > > + * removing notifiers from it.
> > > > > + */
> > > > > +#define drm_gpusvm_for_each_notifier_safe(notifier__,
> > > > > next__,
> > > > > gpusvm__, start__, end__)	\
> > > > > +	for ((notifier__) = notifier_iter_first(&(gpusvm__)-
> > > > > > root,
> > > > > (start__), (end__) - 1),	\
> > > > > +	     (next__) =
> > > > > __drm_gpusvm_notifier_next(notifier__);			
> > > > > 	\
> > > > > +	     (notifier__) && (notifier__->interval.start <
> > > > > (end__));			\
> > > > > +	     (notifier__) = (next__), (next__) =
> > > > > __drm_gpusvm_notifier_next(notifier__))
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_notifier_invalidate - Invalidate a GPU SVM
> > > > > notifier.
> > > > > + * @mni: Pointer to the mmu_interval_notifier structure.
> > > > > + * @mmu_range: Pointer to the mmu_notifier_range structure.
> > > > > + * @cur_seq: Current sequence number.
> > > > > + *
> > > > > + * This function serves as a generic MMU notifier for GPU
> > > > > SVM.
> > > > > It
> > > > > sets the MMU
> > > > > + * notifier sequence number and calls the driver invalidate
> > > > > vfunc
> > > > > under
> > > > > + * gpusvm->notifier_lock.
> > > > > + *
> > > > > + * Returns:
> > > > > + * true if the operation succeeds, false otherwise.
> > > > > + */
> > > > > +static bool
> > > > > +drm_gpusvm_notifier_invalidate(struct mmu_interval_notifier
> > > > > *mni,
> > > > > +			       const struct
> > > > > mmu_notifier_range
> > > > > *mmu_range,
> > > > > +			       unsigned long cur_seq)
> > > > > +{
> > > > > +	struct drm_gpusvm_notifier *notifier =
> > > > > +		container_of(mni, typeof(*notifier),
> > > > > notifier);
> > > > > +	struct drm_gpusvm *gpusvm = notifier->gpusvm;
> > > > > +
> > > > > +	if (!mmu_notifier_range_blockable(mmu_range))
> > > > > +		return false;
> > > > > +
> > > > > +	down_write(&gpusvm->notifier_lock);
> > > > > +	mmu_interval_set_seq(mni, cur_seq);
> > > > > +	gpusvm->ops->invalidate(gpusvm, notifier,
> > > > > mmu_range);
> > > > > +	up_write(&gpusvm->notifier_lock);
> > > > > +
> > > > > +	return true;
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_notifier_ops - MMU interval notifier
> > > > > operations
> > > > > for
> > > > > GPU SVM
> > > > > + */
> > > > > +static const struct mmu_interval_notifier_ops
> > > > > drm_gpusvm_notifier_ops = {
> > > > > +	.invalidate = drm_gpusvm_notifier_invalidate,
> > > > > +};
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_init - Initialize the GPU SVM.
> > > > > + * @gpusvm: Pointer to the GPU SVM structure.
> > > > > + * @name: Name of the GPU SVM.
> > > > > + * @drm: Pointer to the DRM device structure.
> > > > > + * @mm: Pointer to the mm_struct for the address space.
> > > > > + * @device_private_page_owner: Device private pages owner.
> > > > > + * @mm_start: Start address of GPU SVM.
> > > > > + * @mm_range: Range of the GPU SVM.
> > > > > + * @notifier_size: Size of individual notifiers.
> > > > > + * @ops: Pointer to the operations structure for GPU SVM.
> > > > > + * @chunk_sizes: Pointer to the array of chunk sizes used in
> > > > > range
> > > > > allocation.
> > > > > + *               Entries should be powers of 2 in descending
> > > > > order
> > > > > with last
> > > > > + *               entry being SZ_4K.
> > > > > + * @num_chunks: Number of chunks.
> > > > > + *
> > > > > + * This function initializes the GPU SVM.
> > > > > + *
> > > > > + * Returns:
> > > > > + * 0 on success, a negative error code on failure.
> > > > > + */
> > > > > +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> > > > > +		    const char *name, struct drm_device
> > > > > *drm,
> > > > > +		    struct mm_struct *mm, void
> > > > > *device_private_page_owner,
> > > > > +		    u64 mm_start, u64 mm_range, u64
> > > > > notifier_size,
> > > > > +		    const struct drm_gpusvm_ops *ops,
> > > > > +		    const u64 *chunk_sizes, int num_chunks)
> > > > > +{
> > > > > +	if (!ops->invalidate || !num_chunks)
> > > > > +		return -EINVAL;
> > > > > +
> > > > > +	gpusvm->name = name;
> > > > > +	gpusvm->drm = drm;
> > > > > +	gpusvm->mm = mm;
> > > > > +	gpusvm->device_private_page_owner =
> > > > > device_private_page_owner;
> > > > > +	gpusvm->mm_start = mm_start;
> > > > > +	gpusvm->mm_range = mm_range;
> > > > > +	gpusvm->notifier_size = notifier_size;
> > > > > +	gpusvm->ops = ops;
> > > > > +	gpusvm->chunk_sizes = chunk_sizes;
> > > > > +	gpusvm->num_chunks = num_chunks;
> > > > > +	gpusvm->zdd_wq = system_wq;
> > > > > +
> > > > > +	mmgrab(mm);
> > > > > +	gpusvm->root = RB_ROOT_CACHED;
> > > > > +	INIT_LIST_HEAD(&gpusvm->notifier_list);
> > > > > +
> > > > > +	init_rwsem(&gpusvm->notifier_lock);
> > > > > +
> > > > > +	fs_reclaim_acquire(GFP_KERNEL);
> > > > > +	might_lock(&gpusvm->notifier_lock);
> > > > > +	fs_reclaim_release(GFP_KERNEL);
> > > > > +
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_notifier_find - Find GPU SVM notifier
> > > > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > > > + * @fault_addr__: Fault address
> > > > > + *
> > > > > + * This macro finds the GPU SVM notifier associated with the
> > > > > fault
> > > > > address.
> > > > > + *
> > > > > + * Returns:
> > > > > + * Pointer to the GPU SVM notifier on success, NULL
> > > > > otherwise.
> > > > > + */
> > > > > +#define drm_gpusvm_notifier_find(gpusvm__,
> > > > > fault_addr__)	\
> > > > > +	notifier_iter_first(&(gpusvm__)->root,
> > > > > (fault_addr__),	\
> > > > > +			    (fault_addr__ + 1))
> > > > > +
> > > > > +/**
> > > > > + * to_drm_gpusvm_notifier - retrieve the container struct
> > > > > for a
> > > > > given rbtree node
> > > > > + * @node__: a pointer to the rbtree node embedded within a
> > > > > drm_gpusvm_notifier struct
> > > > > + *
> > > > > + * Return: A pointer to the containing drm_gpusvm_notifier
> > > > > structure.
> > > > > + */
> > > > > +#define
> > > > > to_drm_gpusvm_notifier(__node)				\
> > > > > +	container_of((__node), struct drm_gpusvm_notifier,
> > > > > rb.node)
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_notifier_insert - Insert GPU SVM notifier
> > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > > > + *
> > > > > + * This function inserts the GPU SVM notifier into the GPU
> > > > > SVM
> > > > > RB
> > > > > tree and list.
> > > > > + */
> > > > > +static void drm_gpusvm_notifier_insert(struct drm_gpusvm
> > > > > *gpusvm,
> > > > > +				       struct
> > > > > drm_gpusvm_notifier
> > > > > *notifier)
> > > > > +{
> > > > > +	struct rb_node *node;
> > > > > +	struct list_head *head;
> > > > > +
> > > > > +	notifier_insert(notifier, &gpusvm->root);
> > > > > +
> > > > > +	node = rb_prev(&notifier->rb.node);
> > > > > +	if (node)
> > > > > +		head = &(to_drm_gpusvm_notifier(node))-
> > > > > > rb.entry;
> > > > > +	else
> > > > > +		head = &gpusvm->notifier_list;
> > > > > +
> > > > > +	list_add(&notifier->rb.entry, head);
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_notifier_remove - Remove GPU SVM notifier
> > > > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > > > + * @notifier__: Pointer to the GPU SVM notifier structure
> > > > > + *
> > > > > + * This macro removes the GPU SVM notifier from the GPU SVM
> > > > > RB
> > > > > tree
> > > > > and list.
> > > > > + */
> > > > > +#define drm_gpusvm_notifier_remove(gpusvm__,
> > > > > notifier__)	\
> > > > > +	notifier_remove((notifier__), &(gpusvm__)-
> > > > > > root);	\
> > > > > +	list_del(&(notifier__)->rb.entry)
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_fini - Finalize the GPU SVM.
> > > > > + * @gpusvm: Pointer to the GPU SVM structure.
> > > > > + *
> > > > > + * This function finalizes the GPU SVM by cleaning up any
> > > > > remaining
> > > > > ranges and
> > > > > + * notifiers, and dropping a reference to struct MM.
> > > > > + */
> > > > > +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm)
> > > > > +{
> > > > > +	struct drm_gpusvm_notifier *notifier, *next;
> > > > > +
> > > > > +	drm_gpusvm_for_each_notifier_safe(notifier, next,
> > > > > gpusvm, 0,
> > > > > LONG_MAX) {
> > > > > +		struct drm_gpusvm_range *range, *__next;
> > > > > +
> > > > > +		/*
> > > > > +		 * Remove notifier first to avoid racing
> > > > > with
> > > > > any
> > > > > invalidation
> > > > > +		 */
> > > > > +		mmu_interval_notifier_remove(&notifier-
> > > > > > notifier);
> > > > > +		notifier->flags.removed = true;
> > > > > +
> > > > > +		drm_gpusvm_for_each_range_safe(range,
> > > > > __next,
> > > > > notifier, 0,
> > > > > +					       LONG_MAX)
> > > > > +			drm_gpusvm_range_remove(gpusvm,
> > > > > range);
> > > > > +	}
> > > > > +
> > > > > +	mmdrop(gpusvm->mm);
> > > > > +	WARN_ON(!RB_EMPTY_ROOT(&gpusvm->root.rb_root));
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_notifier_alloc - Allocate GPU SVM notifier
> > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > + * @fault_addr: Fault address
> > > > > + *
> > > > > + * This function allocates and initializes the GPU SVM
> > > > > notifier
> > > > > structure.
> > > > > + *
> > > > > + * Returns:
> > > > > + * Pointer to the allocated GPU SVM notifier on success,
> > > > > ERR_PTR()
> > > > > on failure.
> > > > > + */
> > > > > +static struct drm_gpusvm_notifier *
> > > > > +drm_gpusvm_notifier_alloc(struct drm_gpusvm *gpusvm, u64
> > > > > fault_addr)
> > > > > +{
> > > > > +	struct drm_gpusvm_notifier *notifier;
> > > > > +
> > > > > +	if (gpusvm->ops->notifier_alloc)
> > > > > +		notifier = gpusvm->ops->notifier_alloc();
> > > > > +	else
> > > > > +		notifier = kzalloc(sizeof(*notifier),
> > > > > GFP_KERNEL);
> > > > > +
> > > > > +	if (!notifier)
> > > > > +		return ERR_PTR(-ENOMEM);
> > > > > +
> > > > > +	notifier->gpusvm = gpusvm;
> > > > > +	notifier->interval.start = ALIGN_DOWN(fault_addr,
> > > > > gpusvm-
> > > > > > notifier_size);
> > > > > +	notifier->interval.end = ALIGN(fault_addr + 1,
> > > > > gpusvm-
> > > > > > notifier_size);
> > > > > +	INIT_LIST_HEAD(&notifier->rb.entry);
> > > > > +	notifier->root = RB_ROOT_CACHED;
> > > > > +	INIT_LIST_HEAD(&notifier->range_list);
> > > > > +
> > > > > +	return notifier;
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_notifier_free - Free GPU SVM notifier
> > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > > > + *
> > > > > + * This function frees the GPU SVM notifier structure.
> > > > > + */
> > > > > +static void drm_gpusvm_notifier_free(struct drm_gpusvm
> > > > > *gpusvm,
> > > > > +				     struct
> > > > > drm_gpusvm_notifier
> > > > > *notifier)
> > > > > +{
> > > > > +	WARN_ON(!RB_EMPTY_ROOT(&notifier->root.rb_root));
> > > > > +
> > > > > +	if (gpusvm->ops->notifier_free)
> > > > > +		gpusvm->ops->notifier_free(notifier);
> > > > > +	else
> > > > > +		kfree(notifier);
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * to_drm_gpusvm_range - retrieve the container struct for a
> > > > > given
> > > > > rbtree node
> > > > > + * @node__: a pointer to the rbtree node embedded within a
> > > > > drm_gpusvm_range struct
> > > > > + *
> > > > > + * Return: A pointer to the containing drm_gpusvm_range
> > > > > structure.
> > > > > + */
> > > > > +#define to_drm_gpusvm_range(node__)	\
> > > > > +	container_of((node__), struct drm_gpusvm_range,
> > > > > rb.node)
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_range_insert - Insert GPU SVM range
> > > > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > > > + * @range: Pointer to the GPU SVM range structure
> > > > > + *
> > > > > + * This function inserts the GPU SVM range into the notifier
> > > > > RB
> > > > > tree
> > > > > and list.
> > > > > + */
> > > > > +static void drm_gpusvm_range_insert(struct
> > > > > drm_gpusvm_notifier
> > > > > *notifier,
> > > > > +				    struct drm_gpusvm_range
> > > > > *range)
> > > > > +{
> > > > > +	struct rb_node *node;
> > > > > +	struct list_head *head;
> > > > > +
> > > > > +	drm_gpusvm_notifier_lock(notifier->gpusvm);
> > > > > +	range_insert(range, &notifier->root);
> > > > > +
> > > > > +	node = rb_prev(&range->rb.node);
> > > > > +	if (node)
> > > > > +		head = &(to_drm_gpusvm_range(node))-
> > > > > >rb.entry;
> > > > > +	else
> > > > > +		head = &notifier->range_list;
> > > > > +
> > > > > +	list_add(&range->rb.entry, head);
> > > > > +	drm_gpusvm_notifier_unlock(notifier->gpusvm);
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * __drm_gpusvm_range_remove - Remove GPU SVM range
> > > > > + * @notifier__: Pointer to the GPU SVM notifier structure
> > > > > + * @range__: Pointer to the GPU SVM range structure
> > > > > + *
> > > > > + * This macro removes the GPU SVM range from the notifier RB
> > > > > tree
> > > > > and list.
> > > > > + */
> > > > > +#define __drm_gpusvm_range_remove(notifier__,
> > > > > range__)		\
> > > > > +	range_remove((range__), &(notifier__)-
> > > > > > root);		\
> > > > > +	list_del(&(range__)->rb.entry)
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_range_alloc - Allocate GPU SVM range
> > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > > > + * @fault_addr: Fault address
> > > > > + * @chunk_size: Chunk size
> > > > > + * @migrate_vram: Flag indicating whether to migrate VRAM
> > > > > + *
> > > > > + * This function allocates and initializes the GPU SVM range structure.
> > > > > + *
> > > > > + * Returns:
> > > > > + * Pointer to the allocated GPU SVM range on success, ERR_PTR() on failure.
> > > > > + */
> > > > > +static struct drm_gpusvm_range *
> > > > > +drm_gpusvm_range_alloc(struct drm_gpusvm *gpusvm,
> > > > > +		       struct drm_gpusvm_notifier *notifier,
> > > > > +		       u64 fault_addr, u64 chunk_size, bool migrate_vram)
> > > > > +{
> > > > > +	struct drm_gpusvm_range *range;
> > > > > +
> > > > > +	if (gpusvm->ops->range_alloc)
> > > > > +		range = gpusvm->ops->range_alloc(gpusvm);
> > > > > +	else
> > > > > +		range = kzalloc(sizeof(*range), GFP_KERNEL);
> > > > > +
> > > > > +	if (!range)
> > > > > +		return ERR_PTR(-ENOMEM);
> > > > > +
> > > > > +	kref_init(&range->refcount);
> > > > > +	range->gpusvm = gpusvm;
> > > > > +	range->notifier = notifier;
> > > > > +	range->va.start = ALIGN_DOWN(fault_addr, chunk_size);
> > > > > +	range->va.end = ALIGN(fault_addr + 1, chunk_size);
> > > > > +	INIT_LIST_HEAD(&range->rb.entry);
> > > > > +	range->notifier_seq = LONG_MAX;
> > > > > +	range->flags.migrate_vram = migrate_vram ? 1 : 0;
> > > > > +
> > > > > +	return range;
> > > > > +}
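As an aside for readers following along: the range bounds above are just a chunk-aligned window around the faulting address. A minimal sketch of the arithmetic, with made-up values (the 2M chunk and the fault address are illustrative, not from the patch):

	/* Illustrative only: how ALIGN_DOWN()/ALIGN() place the range */
	u64 fault_addr = 0x7f1234567000;		/* faulting CPU VA */
	u64 chunk_size = SZ_2M;				/* one of gpusvm->chunk_sizes[] */
	u64 start = ALIGN_DOWN(fault_addr, chunk_size);	/* 0x7f1234400000 */
	u64 end = ALIGN(fault_addr + 1, chunk_size);	/* 0x7f1234600000 */
	/* range->va then spans exactly the one chunk containing fault_addr */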
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_check_pages - Check pages
> > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > > > + * @start: Start address
> > > > > + * @end: End address
> > > > > + *
> > > > > + * Check if pages between start and end have been faulted in on the CPU. Used to
> > > > > + * prevent migration of pages without CPU backing store.
> > > > > + *
> > > > > + * Returns:
> > > > > + * True if pages have been faulted into CPU, False otherwise
> > > > > + */
> > > > > +static bool drm_gpusvm_check_pages(struct drm_gpusvm *gpusvm,
> > > > > +				   struct drm_gpusvm_notifier *notifier,
> > > > > +				   u64 start, u64 end)
> > > > > +{
> > > > > +	struct hmm_range hmm_range = {
> > > > > +		.default_flags = 0,
> > > > > +		.notifier = &notifier->notifier,
> > > > > +		.start = start,
> > > > > +		.end = end,
> > > > > +		.dev_private_owner = gpusvm->device_private_page_owner,
> > > > > +	};
> > > > > +	unsigned long timeout =
> > > > > +		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> > > > > +	unsigned long *pfns;
> > > > > +	unsigned long npages = npages_in_range(start, end);
> > > > > +	int err, i;
> > > > > +
> > > > > +	mmap_assert_locked(gpusvm->mm);
> > > > > +
> > > > > +	pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
> > > > > +	if (!pfns)
> > > > > +		return false;
> > > > > +
> > > > > +	hmm_range.notifier_seq = mmu_interval_read_begin(&notifier->notifier);
> > > > > +	hmm_range.hmm_pfns = pfns;
> > > > > +
> > > > > +	while (true) {
> > > > > +		err = hmm_range_fault(&hmm_range);
> > > > > +		if (err == -EBUSY) {
> > > > > +			if (time_after(jiffies, timeout))
> > > > > +				break;
> > > > > +
> > > > > +			hmm_range.notifier_seq = mmu_interval_read_begin(&notifier->notifier);
> > > > > +			continue;
> > > > > +		}
> > > > > +		break;
> > > > > +	}
> > > > > +	if (err)
> > > > > +		goto err_free;
> > > > > +
> > > > > +	for (i = 0; i < npages; ++i) {
> > > > > +		if (!(pfns[i] & HMM_PFN_VALID)) {
> > > > > +			err = -EFAULT;
> > > > > +			goto err_free;
> > > > > +		}
> > > > > +	}
> > > > > +
> > > > > +err_free:
> > > > > +	kvfree(pfns);
> > > > > +	return err ? false : true;
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_range_chunk_size - Determine chunk size for GPU SVM range
> > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > > > + * @vas: Pointer to the virtual memory area structure
> > > > > + * @fault_addr: Fault address
> > > > > + * @gpuva_start: Start address of GPUVA which mirrors CPU
> > > > > + * @gpuva_end: End address of GPUVA which mirrors CPU
> > > > > + * @check_pages: Flag indicating whether to check pages
> > > > > + *
> > > > > + * This function determines the chunk size for the GPU SVM range based on the
> > > > > + * fault address, GPU SVM chunk sizes, existing GPU SVM ranges, and the virtual
> > > > > + * memory area boundaries.
> > > > > + *
> > > > > + * Returns:
> > > > > + * Chunk size on success, LONG_MAX on failure.
> > > > > + */
> > > > > +static u64 drm_gpusvm_range_chunk_size(struct drm_gpusvm *gpusvm,
> > > > > +				       struct drm_gpusvm_notifier *notifier,
> > > > > +				       struct vm_area_struct *vas,
> > > > > +				       u64 fault_addr, u64 gpuva_start,
> > > > > +				       u64 gpuva_end, bool check_pages)
> > > > > +{
> > > > > +	u64 start, end;
> > > > > +	int i = 0;
> > > > > +
> > > > > +retry:
> > > > > +	for (; i < gpusvm->num_chunks; ++i) {
> > > > > +		start = ALIGN_DOWN(fault_addr, gpusvm->chunk_sizes[i]);
> > > > > +		end = ALIGN(fault_addr + 1, gpusvm->chunk_sizes[i]);
> > > > > +
> > > > > +		if (start >= vas->vm_start && end <= vas->vm_end &&
> > > > > +		    start >= notifier->interval.start &&
> > > > > +		    end <= notifier->interval.end &&
> > > > > +		    start >= gpuva_start && end <= gpuva_end)
> > > > > +			break;
> > > > > +	}
> > > > > +
> > > > > +	if (i == gpusvm->num_chunks)
> > > > > +		return LONG_MAX;
> > > > > +
> > > > > +	/*
> > > > > +	 * If allocating more than a page, ensure not to overlap with existing
> > > > > +	 * ranges.
> > > > > +	 */
> > > > > +	if (end - start != SZ_4K) {
> > > > > +		struct drm_gpusvm_range *range;
> > > > > +
> > > > > +		range = drm_gpusvm_range_find(notifier, start, end);
> > > > > +		if (range) {
> > > > > +			++i;
> > > > > +			goto retry;
> > > > > +		}
> > > > > +
> > > > > +		/*
> > > > > +		 * XXX: Only create range on pages CPU has faulted in. Without
> > > > > +		 * this check, or prefault, on BMG 'xe_exec_system_allocator --r
> > > > > +		 * process-many-malloc' fails. In the failure case, each process
> > > > > +		 * mallocs 16k but the CPU VMA is ~128k which results in 64k SVM
> > > > > +		 * ranges. When migrating the SVM ranges, some processes fail in
> > > > > +		 * drm_gpusvm_migrate_to_vram with 'migrate.cpages != npages'
> > > > > +		 * and then upon drm_gpusvm_range_get_pages device pages from
> > > > > +		 * other processes are collected + faulted in which creates all
> > > > > +		 * sorts of problems. Unsure exactly how this is happening; the
> > > > > +		 * problem also goes away if 'xe_exec_system_allocator --r
> > > > > +		 * process-many-malloc' mallocs at least 64k at a time.
> > > > > +		 */
> > > > > +		if (check_pages &&
> > > > > +		    !drm_gpusvm_check_pages(gpusvm, notifier, start, end)) {
> > > > > +			++i;
> > > > > +			goto retry;
> > > > > +		}
> > > > > +	}
> > > > > +
> > > > > +	return end - start;
> > > > > +}
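To make the selection loop above concrete with an illustrative (not Xe's actual) chunk array: the loop walks the registered sizes from index 0 and returns the first aligned window that fits inside the VMA, the notifier interval and the GPUVA, then retries with the next entry whenever the window overlaps an existing range or, with check_pages set, covers CPU pages that have not been faulted in yet. A hedged sketch of the driver-side setup this assumes, largest size first:

	/* Illustrative only: chunk sizes a driver might register at init */
	static const u64 chunk_sizes[] = { SZ_2M, SZ_64K, SZ_4K };
	/* drm_gpusvm_range_chunk_size() then scans this array from index 0,
	 * falling back one step at a time on the retry paths above. */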
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_range_find_or_insert - Find or insert GPU SVM
> > > > > range
> > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > + * @fault_addr: Fault address
> > > > > + * @gpuva_start: Start address of GPUVA which mirrors CPU
> > > > > + * @gpuva_end: End address of GPUVA which mirrors CPU
> > > > > + * @ctx: GPU SVM context
> > > > > + *
> > > > > + * This function finds or inserts a newly allocated GPU SVM range based on the
> > > > > + * fault address. Caller must hold a lock to protect range lookup and insertion.
> > > > > + *
> > > > > + * Returns:
> > > > > + * Pointer to the GPU SVM range on success, ERR_PTR() on failure.
> > > > > + */
> > > > > +struct drm_gpusvm_range *
> > > > > +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64 fault_addr,
> > > > > +				u64 gpuva_start, u64 gpuva_end,
> > > > > +				const struct drm_gpusvm_ctx *ctx)
> > > > > +{
> > > > > +	struct drm_gpusvm_notifier *notifier;
> > > > > +	struct drm_gpusvm_range *range;
> > > > > +	struct mm_struct *mm = gpusvm->mm;
> > > > > +	struct vm_area_struct *vas;
> > > > > +	bool notifier_alloc = false;
> > > > > +	u64 chunk_size;
> > > > > +	int err;
> > > > > +	bool migrate_vram;
> > > > > +
> > > > > +	if (fault_addr < gpusvm->mm_start ||
> > > > > +	    fault_addr > gpusvm->mm_start + gpusvm->mm_range) {
> > > > > +		err = -EINVAL;
> > > > > +		goto err_out;
> > > > > +	}
> > > > > +
> > > > > +	if (!ctx->mmap_locked) {
> > > > > +		if (!mmget_not_zero(mm)) {
> > > > > +			err = -EFAULT;
> > > > > +			goto err_out;
> > > > > +		}
> > > > > +		mmap_write_lock(mm);
> > > > > +	}
> > > > > +
> > > > > +	mmap_assert_write_locked(mm);
> > > > > +
> > > > > +	notifier = drm_gpusvm_notifier_find(gpusvm, fault_addr);
> > > > > +	if (!notifier) {
> > > > > +		notifier = drm_gpusvm_notifier_alloc(gpusvm, fault_addr);
> > > > > +		if (IS_ERR(notifier)) {
> > > > > +			err = PTR_ERR(notifier);
> > > > > +			goto err_mmunlock;
> > > > > +		}
> > > > > +		notifier_alloc = true;
> > > > > +		err = mmu_interval_notifier_insert_locked(&notifier->notifier,
> > > > > +							  mm, notifier->interval.start,
> > > > > +							  notifier->interval.end -
> > > > > +							  notifier->interval.start,
> > > > > +							  &drm_gpusvm_notifier_ops);
> > > > > +		if (err)
> > > > > +			goto err_notifier;
> > > > > +	}
> > > > > +
> > > > > +	vas = vma_lookup(mm, fault_addr);
> > > > > +	if (!vas) {
> > > > > +		err = -ENOENT;
> > > > > +		goto err_notifier_remove;
> > > > > +	}
> > > > > +
> > > > > +	if (!ctx->read_only && !(vas->vm_flags & VM_WRITE)) {
> > > > > +		err = -EPERM;
> > > > > +		goto err_notifier_remove;
> > > > > +	}
> > > > > +
> > > > > +	range = drm_gpusvm_range_find(notifier, fault_addr, fault_addr + 1);
> > > > > +	if (range)
> > > > > +		goto out_mmunlock;
> > > > > +	/*
> > > > > +	 * XXX: Short-circuiting migration based on migrate_vma_* current
> > > > > +	 * limitations. If/when migrate_vma_* add more support, this logic will
> > > > > +	 * have to change.
> > > > > +	 */
> > > > > +	migrate_vram = ctx->vram_possible &&
> > > > > +		vma_is_anonymous(vas) && !is_vm_hugetlb_page(vas);
> > > > > +
> > > > > +	chunk_size = drm_gpusvm_range_chunk_size(gpusvm, notifier, vas,
> > > > > +						 fault_addr, gpuva_start,
> > > > > +						 gpuva_end, migrate_vram &&
> > > > > +						 !ctx->prefault);
> > > > > +	if (chunk_size == LONG_MAX) {
> > > > > +		err = -EINVAL;
> > > > > +		goto err_notifier_remove;
> > > > > +	}
> > > > > +
> > > > > +	range = drm_gpusvm_range_alloc(gpusvm, notifier, fault_addr, chunk_size,
> > > > > +				       migrate_vram);
> > > > > +	if (IS_ERR(range)) {
> > > > > +		err = PTR_ERR(range);
> > > > > +		goto err_notifier_remove;
> > > > > +	}
> > > > > +
> > > > > +	drm_gpusvm_range_insert(notifier, range);
> > > > > +	if (notifier_alloc)
> > > > > +		drm_gpusvm_notifier_insert(gpusvm, notifier);
> > > > > +
> > > > > +	if (ctx->prefault) {
> > > > > +		struct drm_gpusvm_ctx __ctx = *ctx;
> > > > > +
> > > > > +		__ctx.mmap_locked = true;
> > > > > +		err = drm_gpusvm_range_get_pages(gpusvm, range, &__ctx);
> > > > > +		if (err)
> > > > > +			goto err_range_remove;
> > > > > +	}
> > > > > +
> > > > > +out_mmunlock:
> > > > > +	if (!ctx->mmap_locked) {
> > > > > +		mmap_write_unlock(mm);
> > > > > +		mmput(mm);
> > > > > +	}
> > > > > +
> > > > > +	return range;
> > > > > +
> > > > > +err_range_remove:
> > > > > +	__drm_gpusvm_range_remove(notifier, range);
> > > > > +err_notifier_remove:
> > > > > +	if (notifier_alloc)
> > > > > +		mmu_interval_notifier_remove(&notifier->notifier);
> > > > > +err_notifier:
> > > > > +	if (notifier_alloc)
> > > > > +		drm_gpusvm_notifier_free(gpusvm, notifier);
> > > > > +err_mmunlock:
> > > > > +	if (!ctx->mmap_locked) {
> > > > > +		mmap_write_unlock(mm);
> > > > > +		mmput(mm);
> > > > > +	}
> > > > > +err_out:
> > > > > +	return ERR_PTR(err);
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * for_each_dma_page - iterate over pages in a DMA region
> > > > > + * @i__: the current page index in the iteration
> > > > > + * @j__: the current page index, log order, in the iteration
> > > > > + * @npages__: the total number of pages in the DMA region
> > > > > + * @order__: the order of the pages in the DMA region
> > > > > + *
> > > > > + * This macro iterates over each page in a DMA region. The DMA region
> > > > > + * is assumed to be composed of 2^@order__ pages, and the macro will
> > > > > + * step through the region one block of 2^@order__ pages at a time.
> > > > > + */
> > > > > +#define for_each_dma_page(i__, j__, npages__, order__)	\
> > > > > +	for ((i__) = 0, (j__) = 0; (i__) < (npages__);	\
> > > > > +	     (j__)++, (i__) += 0x1 << (order__))
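A quick illustration of the stepping above, with made-up values: for npages = 1024 and order = 9 (2^9 = 512 pages per DMA mapping), the page index i visits 0 and 512 while the mapping index j counts 0 and 1:

	/* Illustrative walk of for_each_dma_page(), not part of the patch */
	unsigned long i, j, npages = 1024;
	unsigned int order = 9;			/* 512 pages per DMA mapping */

	for_each_dma_page(i, j, npages, order) {
		/* iteration 0: i = 0,   j = 0 */
		/* iteration 1: i = 512, j = 1 */
	}
	/* afterwards j == npages >> order, the number of DMA mappings */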
> > > > > +
> > > > > +/**
> > > > > + * __drm_gpusvm_range_unmap_pages - Unmap pages associated with a GPU SVM range (internal)
> > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > + * @range: Pointer to the GPU SVM range structure
> > > > > + *
> > > > > + * This function unmaps pages associated with a GPU SVM range. Assumes and
> > > > > + * asserts correct locking is in place when called.
> > > > > + */
> > > > > +static void __drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > > > > +					   struct drm_gpusvm_range *range)
> > > > > +{
> > > > > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > > > > +
> > > > > +	if (range->pages) {
> > > > > +		unsigned long i, j, npages = npages_in_range(range->va.start,
> > > > > +							     range->va.end);
> > > > > +
> > > > > +		if (range->flags.has_dma_mapping) {
> > > > > +			for_each_dma_page(i, j, npages, range->order)
> > > > > +				dma_unmap_page(gpusvm->drm->dev,
> > > > > +					       range->dma_addr[j],
> > > > > +					       PAGE_SIZE << range->order,
> > > > > +					       DMA_BIDIRECTIONAL);
> > > > > +		}
> > > > > +
> > > > > +		range->flags.has_vram_pages = false;
> > > > > +		range->flags.has_dma_mapping = false;
> > > > > +	}
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_range_free_pages - Free pages associated with a GPU SVM range
> > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > + * @range: Pointer to the GPU SVM range structure
> > > > > + *
> > > > > + * This function frees pages associated with a GPU SVM range.
> > > > > + */
> > > > > +static void drm_gpusvm_range_free_pages(struct drm_gpusvm *gpusvm,
> > > > > +					struct drm_gpusvm_range *range)
> > > > > +{
> > > > > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > > > > +
> > > > > +	if (range->pages) {
> > > > > +		if (range->flags.kfree_mapping) {
> > > > > +			kfree(range->dma_addr);
> > > > > +			range->flags.kfree_mapping = false;
> > > > > +			range->pages = NULL;
> > > > > +		} else {
> > > > > +			kvfree(range->pages);
> > > > > +			range->pages = NULL;
> > > > > +		}
> > > > > +	}
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_range_remove - Remove GPU SVM range
> > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > + * @range: Pointer to the GPU SVM range to be removed
> > > > > + *
> > > > > + * This function removes the specified GPU SVM range and also removes the parent
> > > > > + * GPU SVM notifier if no more ranges remain in the notifier. The caller must
> > > > > + * hold a lock to protect range and notifier removal.
> > > > > + */
> > > > > +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> > > > > +			     struct drm_gpusvm_range *range)
> > > > > +{
> > > > > +	struct drm_gpusvm_notifier *notifier;
> > > > > +
> > > > > +	notifier = drm_gpusvm_notifier_find(gpusvm, range->va.start);
> > > > > +	if (WARN_ON_ONCE(!notifier))
> > > > > +		return;
> > > > > +
> > > > > +	drm_gpusvm_notifier_lock(gpusvm);
> > > > > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > > > +	drm_gpusvm_range_free_pages(gpusvm, range);
> > > > > +	__drm_gpusvm_range_remove(notifier, range);
> > > > > +	drm_gpusvm_notifier_unlock(gpusvm);
> > > > > +
> > > > > +	drm_gpusvm_range_put(range);
> > > > > +
> > > > > +	if (RB_EMPTY_ROOT(&notifier->root.rb_root)) {
> > > > > +		if (!notifier->flags.removed)
> > > > > +			mmu_interval_notifier_remove(&notifier->notifier);
> > > > > +		drm_gpusvm_notifier_remove(gpusvm, notifier);
> > > > > +		drm_gpusvm_notifier_free(gpusvm, notifier);
> > > > > +	}
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_range_get - Get a reference to GPU SVM range
> > > > > + * @range: Pointer to the GPU SVM range
> > > > > + *
> > > > > + * This function increments the reference count of the specified GPU SVM range.
> > > > > + *
> > > > > + * Returns:
> > > > > + * Pointer to the GPU SVM range.
> > > > > + */
> > > > > +struct drm_gpusvm_range *
> > > > > +drm_gpusvm_range_get(struct drm_gpusvm_range *range)
> > > > > +{
> > > > > +	kref_get(&range->refcount);
> > > > > +
> > > > > +	return range;
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_range_destroy - Destroy GPU SVM range
> > > > > + * @refcount: Pointer to the reference counter embedded in the GPU SVM range
> > > > > + *
> > > > > + * This function destroys the specified GPU SVM range when its reference count
> > > > > + * reaches zero. If a custom range-free function is provided, it is invoked to
> > > > > + * free the range; otherwise, the range is deallocated using kfree().
> > > > > + */
> > > > > +static void drm_gpusvm_range_destroy(struct kref *refcount)
> > > > > +{
> > > > > +	struct drm_gpusvm_range *range =
> > > > > +		container_of(refcount, struct drm_gpusvm_range, refcount);
> > > > > +	struct drm_gpusvm *gpusvm = range->gpusvm;
> > > > > +
> > > > > +	if (gpusvm->ops->range_free)
> > > > > +		gpusvm->ops->range_free(range);
> > > > > +	else
> > > > > +		kfree(range);
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_range_put - Put a reference to GPU SVM range
> > > > > + * @range: Pointer to the GPU SVM range
> > > > > + *
> > > > > + * This function decrements the reference count of the specified GPU SVM range
> > > > > + * and frees it when the count reaches zero.
> > > > > + */
> > > > > +void drm_gpusvm_range_put(struct drm_gpusvm_range *range)
> > > > > +{
> > > > > +	kref_put(&range->refcount, drm_gpusvm_range_destroy);
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_range_pages_valid - GPU SVM range pages valid
> > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > + * @range: Pointer to the GPU SVM range structure
> > > > > + *
> > > > > + * This function determines if a GPU SVM range's pages are valid. Expected to be
> > > > > + * called holding gpusvm->notifier_lock and as the last step before committing a
> > > > > + * GPU binding.
> > > > > + *
> > > > > + * Returns:
> > > > > + * True if GPU SVM range has valid pages, False otherwise
> > > > > + */
> > > > > +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> > > > > +				  struct drm_gpusvm_range *range)
> > > > > +{
> > > > > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > > > > +
> > > > > +	return range->flags.has_vram_pages || range->flags.has_dma_mapping;
> > > > > +}
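For context, this is the check a driver is expected to make right before committing a GPU bind. A minimal sketch under the notifier lock; xe_vm_bind_range() is an illustrative driver-side hook, not something defined in this patch:

	/* Hedged sketch: commit a GPU bind only while pages remain valid */
	drm_gpusvm_notifier_lock(gpusvm);
	if (!drm_gpusvm_range_pages_valid(gpusvm, range)) {
		drm_gpusvm_notifier_unlock(gpusvm);
		return -EAGAIN;			/* caller refaults the range */
	}
	err = xe_vm_bind_range(vm, range);	/* illustrative driver hook */
	drm_gpusvm_notifier_unlock(gpusvm);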
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_range_pages_valid_unlocked - GPU SVM range pages valid unlocked
> > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > + * @range: Pointer to the GPU SVM range structure
> > > > > + *
> > > > > + * This function determines if a GPU SVM range's pages are valid. Expected to be
> > > > > + * called without holding gpusvm->notifier_lock.
> > > > > + *
> > > > > + * Returns:
> > > > > + * True if GPU SVM range has valid pages, False otherwise
> > > > > + */
> > > > > +static bool
> > > > > +drm_gpusvm_range_pages_valid_unlocked(struct drm_gpusvm *gpusvm,
> > > > > +				      struct drm_gpusvm_range *range)
> > > > > +{
> > > > > +	bool pages_valid;
> > > > > +
> > > > > +	if (!range->pages)
> > > > > +		return false;
> > > > > +
> > > > > +	drm_gpusvm_notifier_lock(gpusvm);
> > > > > +	pages_valid = drm_gpusvm_range_pages_valid(gpusvm, range);
> > > > > +	if (!pages_valid && range->flags.kfree_mapping) {
> > > > > +		kfree(range->dma_addr);
> > > > > +		range->flags.kfree_mapping = false;
> > > > > +		range->pages = NULL;
> > > > > +	}
> > > > > +	drm_gpusvm_notifier_unlock(gpusvm);
> > > > > +
> > > > > +	return pages_valid;
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_range_get_pages - Get pages for a GPU SVM range
> > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > + * @range: Pointer to the GPU SVM range structure
> > > > > + * @ctx: GPU SVM context
> > > > > + *
> > > > > + * This function gets pages for a GPU SVM range and ensures they are mapped for
> > > > > + * DMA access.
> > > > > + *
> > > > > + * Returns:
> > > > > + * 0 on success, negative error code on failure.
> > > > > + */
> > > > > +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> > > > > +			       struct drm_gpusvm_range *range,
> > > > > +			       const struct drm_gpusvm_ctx *ctx)
> > > > > +{
> > > > > +	struct mmu_interval_notifier *notifier = &range->notifier->notifier;
> > > > > +	struct hmm_range hmm_range = {
> > > > > +		.default_flags = HMM_PFN_REQ_FAULT | (ctx->read_only ? 0 :
> > > > > +			HMM_PFN_REQ_WRITE),
> > > > > +		.notifier = notifier,
> > > > > +		.start = range->va.start,
> > > > > +		.end = range->va.end,
> > > > > +		.dev_private_owner = gpusvm->device_private_page_owner,
> > > > > +	};
> > > > > +	struct mm_struct *mm = gpusvm->mm;
> > > > > +	unsigned long timeout =
> > > > > +		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> > > > > +	unsigned long i, j;
> > > > > +	unsigned long npages = npages_in_range(range->va.start, range->va.end);
> > > > > +	unsigned int order = 0;
> > > > > +	unsigned long *pfns;
> > > > > +	struct page **pages;
> > > > > +	int err = 0;
> > > > > +	bool vram_pages = !!range->flags.migrate_vram;
> > > > > +	bool alloc_pfns = false, kfree_mapping;
> > > > > +
> > > > > +retry:
> > > > > +	kfree_mapping = false;
> > > > > +	hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> > > > > +	if (drm_gpusvm_range_pages_valid_unlocked(gpusvm, range))
> > > > > +		return 0;
> > > > > +
> > > > > +	if (range->notifier_seq == hmm_range.notifier_seq && range->pages) {
> > > > > +		if (ctx->prefault)
> > > > > +			return 0;
> > > > > +
> > > > > +		pfns = (unsigned long *)range->pages;
> > > > > +		pages = range->pages;
> > > > > +		goto map_pages;
> > > > > +	}
> > > > > +
> > > > > +	if (!range->pages) {
> > > > > +		pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
> > > > > +		if (!pfns)
> > > > > +			return -ENOMEM;
> > > > > +		alloc_pfns = true;
> > > > > +	} else {
> > > > > +		pfns = (unsigned long *)range->pages;
> > > > > +	}
> > > > > +
> > > > > +	if (!ctx->mmap_locked) {
> > > > > +		if (!mmget_not_zero(mm)) {
> > > > > +			err = -EFAULT;
> > > > > +			goto err_out;
> > > > > +		}
> > > > > +	}
> > > > > +
> > > > > +	hmm_range.hmm_pfns = pfns;
> > > > > +	while (true) {
> > > > > +		/* Must be checked after mmu_interval_read_begin */
> > > > > +		if (range->flags.unmapped) {
> > > > > +			err = -EFAULT;
> > > > > +			break;
> > > > > +		}
> > > > > +
> > > > > +		if (!ctx->mmap_locked) {
> > > > > +			/*
> > > > > +			 * XXX: HMM locking document indicates only a read-lock
> > > > > +			 * is required but there appears to be a window between
> > > > > +			 * the MMU_NOTIFY_MIGRATE event triggered in a CPU fault
> > > > > +			 * via migrate_vma_setup and the pages actually moving
> > > > > +			 * in migrate_vma_finalize in which this code can grab
> > > > > +			 * garbage pages. Grabbing the write-lock if the range
> > > > > +			 * is attached to vram appears to protect against this
> > > > > +			 * race.
> > > > > +			 */
> > > > > +			if (vram_pages)
> > > > > +				mmap_write_lock(mm);
> > > > > +			else
> > > > > +				mmap_read_lock(mm);
> > > > > +		}
> > > > > +		err = hmm_range_fault(&hmm_range);
> > > > > +		if (!ctx->mmap_locked) {
> > > > > +			if (vram_pages)
> > > > > +				mmap_write_unlock(mm);
> > > > > +			else
> > > > > +				mmap_read_unlock(mm);
> > > > > +		}
> > > > > +
> > > > > +		if (err == -EBUSY) {
> > > > > +			if (time_after(jiffies, timeout))
> > > > > +				break;
> > > > > +
> > > > > +			hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> > > > > +			continue;
> > > > > +		}
> > > > > +		break;
> > > > > +	}
> > > > > +	if (!ctx->mmap_locked)
> > > > > +		mmput(mm);
> > > > > +	if (err)
> > > > > +		goto err_free;
> > > > > +
> > > > > +	pages = (struct page **)pfns;
> > > > > +
> > > > > +	if (ctx->prefault) {
> > > > > +		range->pages = pages;
> > > > > +		goto set_seqno;
> > > > > +	}
> > > > > +
> > > > > +map_pages:
> > > > > +	if (is_device_private_page(hmm_pfn_to_page(pfns[0]))) {
> > > > > +		WARN_ON_ONCE(!range->vram_allocation);
> > > > > +
> > > > > +		for (i = 0; i < npages; ++i) {
> > > > > +			pages[i] = hmm_pfn_to_page(pfns[i]);
> > > > > +
> > > > > +			if (WARN_ON_ONCE(!is_device_private_page(pages[i]))) {
> > > > > +				err = -EOPNOTSUPP;
> > > > > +				goto err_free;
> > > > > +			}
> > > > > +		}
> > > > > +
> > > > > +		/* Do not race with notifier unmapping pages */
> > > > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > > > +		range->flags.has_vram_pages = true;
> > > > > +		range->pages = pages;
> > > > > +		if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
> > > > > +			err = -EAGAIN;
> > > > > +			__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > > > +		}
> > > > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > > > +	} else {
> > > > > +		dma_addr_t *dma_addr = (dma_addr_t *)pfns;
> > > > > +
> > > > > +		for_each_dma_page(i, j, npages, order) {
> > > > > +			if (WARN_ON_ONCE(i && order !=
> > > > > +					 hmm_pfn_to_map_order(pfns[i]))) {
> > > > > +				err = -EOPNOTSUPP;
> > > > > +				npages = i;
> > > > > +				goto err_unmap;
> > > > > +			}
> > > > > +			order = hmm_pfn_to_map_order(pfns[i]);
> > > > > +
> > > > > +			pages[j] = hmm_pfn_to_page(pfns[i]);
> > > > > +			if (WARN_ON_ONCE(is_zone_device_page(pages[j]))) {
> > > > > +				err = -EOPNOTSUPP;
> > > > > +				npages = i;
> > > > > +				goto err_unmap;
> > > > > +			}
> > > > > +
> > > > > +			set_page_dirty_lock(pages[j]);
> > > > > +			mark_page_accessed(pages[j]);
> > > > > +
> > > > > +			dma_addr[j] = dma_map_page(gpusvm->drm->dev,
> > > > > +						   pages[j], 0,
> > > > > +						   PAGE_SIZE << order,
> > > > > +						   DMA_BIDIRECTIONAL);
> > > > > +			if (dma_mapping_error(gpusvm->drm->dev, dma_addr[j])) {
> > > > > +				err = -EFAULT;
> > > > > +				npages = i;
> > > > > +				goto err_unmap;
> > > > > +			}
> > > > > +		}
> > > > > +
> > > > > +		/* Huge pages, reduce memory footprint */
> > > > > +		if (order) {
> > > > > +			dma_addr = kmalloc_array(j, sizeof(*dma_addr),
> > > > > +						 GFP_KERNEL);
> > > > > +			if (dma_addr) {
> > > > > +				for (i = 0; i < j; ++i)
> > > > > +					dma_addr[i] = (dma_addr_t)pfns[i];
> > > > > +				kvfree(pfns);
> > > > > +				kfree_mapping = true;
> > > > > +			} else {
> > > > > +				dma_addr = (dma_addr_t *)pfns;
> > > > > +			}
> > > > > +		}
> > > > > +
> > > > > +		/* Do not race with notifier unmapping pages */
> > > > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > > > +		range->order = order;
> > > > > +		range->flags.kfree_mapping = kfree_mapping;
> > > > > +		range->flags.has_dma_mapping = true;
> > > > > +		range->dma_addr = dma_addr;
> > > > > +		range->vram_allocation = NULL;
> > > > > +		if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
> > > > > +			err = -EAGAIN;
> > > > > +			__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > > > +		}
> > > > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > > > +	}
> > > > > +
> > > > > +	if (err == -EAGAIN)
> > > > > +		goto retry;
> > > > > +set_seqno:
> > > > > +	range->notifier_seq = hmm_range.notifier_seq;
> > > > > +
> > > > > +	return 0;
> > > > > +
> > > > > +err_unmap:
> > > > > +	for_each_dma_page(i, j, npages, order)
> > > > > +		dma_unmap_page(gpusvm->drm->dev,
> > > > > +			       (dma_addr_t)pfns[j],
> > > > > +			       PAGE_SIZE << order, DMA_BIDIRECTIONAL);
> > > > > +err_free:
> > > > > +	if (alloc_pfns)
> > > > > +		kvfree(pfns);
> > > > > +err_out:
> > > > > +	return err;
> > > > > +}
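Tying the above together, a GPU page-fault handler built on this API would roughly look as follows. This is a hedged sketch: the ctx flags used and the vram_allocation pointer are assumptions for illustration, not lifted from the Xe patches later in the series.

	/* Hedged sketch of a driver GPU page-fault path using this mid-layer */
	struct drm_gpusvm_ctx ctx = { .vram_possible = true };
	struct drm_gpusvm_range *range;
	int err;

	range = drm_gpusvm_range_find_or_insert(gpusvm, fault_addr,
						gpuva_start, gpuva_end, &ctx);
	if (IS_ERR(range))
		return PTR_ERR(range);

	if (range->flags.migrate_vram) {
		/* Best effort; on failure fall back to system RAM pages */
		err = drm_gpusvm_migrate_to_vram(gpusvm, range,
						 vram_allocation, &ctx);
		if (err)
			ctx.vram_possible = false;
	}

	/* -EAGAIN retries are handled internally; hard errors bubble up */
	return drm_gpusvm_range_get_pages(gpusvm, range, &ctx);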
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_range_unmap_pages - Unmap pages associated with a GPU SVM range
> > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > + * @range: Pointer to the GPU SVM range structure
> > > > > + * @ctx: GPU SVM context
> > > > > + *
> > > > > + * This function unmaps pages associated with a GPU SVM range. If @in_notifier
> > > > > + * is set, it is assumed that gpusvm->notifier_lock is held in write mode; if it
> > > > > + * is clear, it acquires gpusvm->notifier_lock in read mode. Must be called on
> > > > > + * each GPU SVM range attached to notifier in gpusvm->ops->invalidate for IOMMU
> > > > > + * security model.
> > > > > + */
> > > > > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > > > > +				  struct drm_gpusvm_range *range,
> > > > > +				  const struct drm_gpusvm_ctx *ctx)
> > > > > +{
> > > > > +	if (ctx->in_notifier)
> > > > > +		lockdep_assert_held_write(&gpusvm->notifier_lock);
> > > > > +	else
> > > > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > > > +
> > > > > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > > > +
> > > > > +	if (!ctx->in_notifier)
> > > > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_migration_put_page - Put a migration page
> > > > > + * @page: Pointer to the page to put
> > > > > + *
> > > > > + * This function unlocks and puts a page.
> > > > > + */
> > > > > +static void drm_gpusvm_migration_put_page(struct page *page)
> > > > > +{
> > > > > +	unlock_page(page);
> > > > > +	put_page(page);
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_migration_put_pages - Put migration pages
> > > > > + * @npages: Number of pages
> > > > > + * @migrate_pfn: Array of migrate page frame numbers
> > > > > + *
> > > > > + * This function puts an array of pages.
> > > > > + */
> > > > > +static void drm_gpusvm_migration_put_pages(unsigned long npages,
> > > > > +					   unsigned long *migrate_pfn)
> > > > > +{
> > > > > +	unsigned long i;
> > > > > +
> > > > > +	for (i = 0; i < npages; ++i) {
> > > > > +		if (!migrate_pfn[i])
> > > > > +			continue;
> > > > > +
> > > > > +		drm_gpusvm_migration_put_page(migrate_pfn_to_page(migrate_pfn[i]));
> > > > > +		migrate_pfn[i] = 0;
> > > > > +	}
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_get_vram_page - Get a reference to a VRAM page
> > > > > + * @page: Pointer to the page
> > > > > + * @zdd: Pointer to the GPU SVM zone device data
> > > > > + *
> > > > > + * This function associates the given page with the specified GPU SVM zone
> > > > > + * device data and initializes it for zone device usage.
> > > > > + */
> > > > > +static void drm_gpusvm_get_vram_page(struct page *page,
> > > > > +				     struct drm_gpusvm_zdd *zdd)
> > > > > +{
> > > > > +	page->zone_device_data = drm_gpusvm_zdd_get(zdd);
> > > > > +	zone_device_page_init(page);
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_migrate_map_pages() - Map migration pages for GPU SVM migration
> > > > > + * @dev: The device for which the pages are being mapped
> > > > > + * @dma_addr: Array to store DMA addresses corresponding to mapped pages
> > > > > + * @migrate_pfn: Array of migrate page frame numbers to map
> > > > > + * @npages: Number of pages to map
> > > > > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > > > > + *
> > > > > + * This function maps pages of memory for migration usage in GPU SVM. It
> > > > > + * iterates over each page frame number provided in @migrate_pfn, maps the
> > > > > + * corresponding page, and stores the DMA address in the provided @dma_addr
> > > > > + * array.
> > > > > + *
> > > > > + * Return: 0 on success, -EFAULT if an error occurs during mapping.
> > > > > + */
> > > > > +static int drm_gpusvm_migrate_map_pages(struct device *dev,
> > > > > +					dma_addr_t *dma_addr,
> > > > > +					long unsigned int *migrate_pfn,
> > > > > +					unsigned long npages,
> > > > > +					enum dma_data_direction dir)
> > > > > +{
> > > > > +	unsigned long i;
> > > > > +
> > > > > +	for (i = 0; i < npages; ++i) {
> > > > > +		struct page *page = migrate_pfn_to_page(migrate_pfn[i]);
> > > > > +
> > > > > +		if (!page)
> > > > > +			continue;
> > > > > +
> > > > > +		if (WARN_ON_ONCE(is_zone_device_page(page)))
> > > > > +			return -EFAULT;
> > > > > +
> > > > > +		dma_addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir);
> > > > > +		if (dma_mapping_error(dev, dma_addr[i]))
> > > > > +			return -EFAULT;
> > > > > +	}
> > > > > +
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously mapped for GPU SVM migration
> > > > > + * @dev: The device for which the pages were mapped
> > > > > + * @dma_addr: Array of DMA addresses corresponding to mapped pages
> > > > > + * @npages: Number of pages to unmap
> > > > > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > > > > + *
> > > > > + * This function unmaps previously mapped pages of memory for GPU Shared Virtual
> > > > > + * Memory (SVM). It iterates over each DMA address provided in @dma_addr, checks
> > > > > + * if it's valid and not already unmapped, and unmaps the corresponding page.
> > > > > + */
> > > > > +static void drm_gpusvm_migrate_unmap_pages(struct device *dev,
> > > > > +					   dma_addr_t *dma_addr,
> > > > > +					   unsigned long npages,
> > > > > +					   enum dma_data_direction dir)
> > > > > +{
> > > > > +	unsigned long i;
> > > > > +
> > > > > +	for (i = 0; i < npages; ++i) {
> > > > > +		if (!dma_addr[i] || dma_mapping_error(dev, dma_addr[i]))
> > > > > +			continue;
> > > > > +
> > > > > +		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
> > > > > +	}
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_migrate_to_vram - Migrate GPU SVM range to VRAM
> > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > + * @range: Pointer to the GPU SVM range structure
> > > > > + * @vram_allocation: Driver-private pointer to the VRAM allocation. The caller
> > > > > + *                   should hold a reference to the VRAM allocation, which
> > > > > + *                   should be dropped via ops->vram_release or upon the
> > > > > + *                   failure of this function.
> > > > > + * @ctx: GPU SVM context
> > > > > + *
> > > > > + * This function migrates the specified GPU SVM range to VRAM. It performs the
> > > > > + * necessary setup and invokes the driver-specific operations for migration to
> > > > > + * VRAM. Upon successful return, @vram_allocation can safely reference @range
> > > > > + * until ops->vram_release is called, which only happens after a successful
> > > > > + * return.
> > > > > + *
> > > > > + * Returns:
> > > > > + * 0 on success, negative error code on failure.
> > > > > + */
> > > > > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > > > > +			       struct drm_gpusvm_range *range,
> > > > > +			       void *vram_allocation,
> > > > > +			       const struct drm_gpusvm_ctx *ctx)
> > > > > +{
> > > > > +	u64 start = range->va.start, end = range->va.end;
> > > > > +	struct migrate_vma migrate = {
> > > > > +		.start		= start,
> > > > > +		.end		= end,
> > > > > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > > > > +		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
> > > > > +	};
> > > > > +	struct mm_struct *mm = gpusvm->mm;
> > > > > +	unsigned long i, npages = npages_in_range(start, end);
> > > > > +	struct vm_area_struct *vas;
> > > > > +	struct drm_gpusvm_zdd *zdd = NULL;
> > > > > +	struct page **pages;
> > > > > +	dma_addr_t *dma_addr;
> > > > > +	void *buf;
> > > > > +	int err;
> > > > > +
> > > > > +	if (!range->flags.migrate_vram)
> > > > > +		return -EINVAL;
> > > > > +
> > > > > +	if (!gpusvm->ops->populate_vram_pfn || !gpusvm->ops->copy_to_vram ||
> > > > > +	    !gpusvm->ops->copy_to_sram)
> > > > > +		return -EOPNOTSUPP;
> > > > > +
> > > > > +	if (!ctx->mmap_locked) {
> > > > > +		if (!mmget_not_zero(mm)) {
> > > > > +			err = -EFAULT;
> > > > > +			goto err_out;
> > > > > +		}
> > > > > +		mmap_write_lock(mm);
> > > > > +	}
> > > > > +
> > > > > +	mmap_assert_locked(mm);
> > > > > +
> > > > > +	vas = vma_lookup(mm, start);
> > > > > +	if (!vas) {
> > > > > +		err = -ENOENT;
> > > > > +		goto err_mmunlock;
> > > > > +	}
> > > > > +
> > > > > +	if (end > vas->vm_end || start < vas->vm_start) {
> > > > > +		err = -EINVAL;
> > > > > +		goto err_mmunlock;
> > > > > +	}
> > > > > +
> > > > > +	if (!vma_is_anonymous(vas)) {
> > > > > +		err = -EBUSY;
> > > > > +		goto err_mmunlock;
> > > > > +	}
> > > > > +
> > > > > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
> > > > > +		       sizeof(*pages), GFP_KERNEL);
> > > > > +	if (!buf) {
> > > > > +		err = -ENOMEM;
> > > > > +		goto err_mmunlock;
> > > > > +	}
> > > > > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > > > > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
> > > > > +
> > > > > +	zdd = drm_gpusvm_zdd_alloc(range);
> > > > > +	if (!zdd) {
> > > > > +		err = -ENOMEM;
> > > > > +		goto err_free;
> > > > > +	}
> > > > > +
> > > > > +	migrate.vma = vas;
> > > > > +	migrate.src = buf;
> > > > > +	migrate.dst = migrate.src + npages;
> > > > > +
> > > > > +	err = migrate_vma_setup(&migrate);
> > > > > +	if (err)
> > > > > +		goto err_free;
> > > > > +
> > > > > +	/*
> > > > > +	 * FIXME: Below cases, !migrate.cpages and migrate.cpages != npages, not
> > > > > +	 * always an error. Need to revisit possible cases and how to handle. We
> > > > > +	 * could prefault on migrate.cpages != npages via hmm_range_fault.
> > > > > +	 */
> > > > > +
> > > > > +	if (!migrate.cpages) {
> > > > > +		err = -EFAULT;
> > > > > +		goto err_free;
> > > > > +	}
> > > > > +
> > > > > +	if (migrate.cpages != npages) {
> > > > > +		err = -EBUSY;
> > > > > +		goto err_finalize;
> > > > > +	}
> > > > > +
> > > > > +	err = gpusvm->ops->populate_vram_pfn(gpusvm, vram_allocation, npages,
> > > > > +					     migrate.dst);
> > > > > +	if (err)
> > > > > +		goto err_finalize;
> > > > > +
> > > > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> > > > > +					   migrate.src, npages, DMA_TO_DEVICE);
> > > > > +	if (err)
> > > > > +		goto err_finalize;
> > > > > +
> > > > > +	for (i = 0; i < npages; ++i) {
> > > > > +		struct page *page = pfn_to_page(migrate.dst[i]);
> > > > > +
> > > > > +		pages[i] = page;
> > > > > +		migrate.dst[i] = migrate_pfn(migrate.dst[i]);
> > > > > +		drm_gpusvm_get_vram_page(page, zdd);
> > > > > +	}
> > > > > +
> > > > > +	err = gpusvm->ops->copy_to_vram(gpusvm, pages, dma_addr, npages);
> > > > > +	if (err)
> > > > > +		goto err_finalize;
> > > > > +
> > > > > +	/* Upon success bind vram allocation to range and zdd */
> > > > > +	range->vram_allocation = vram_allocation;
> > > > > +	WRITE_ONCE(zdd->vram_allocation, vram_allocation);	/* Owns ref */
> > > > > +
> > > > > +err_finalize:
> > > > > +	if (err)
> > > > > +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> > > > > +	migrate_vma_pages(&migrate);
> > > > > +	migrate_vma_finalize(&migrate);
> > > > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
> > > > > +				       DMA_TO_DEVICE);
> > > > > +err_free:
> > > > > +	if (zdd)
> > > > > +		drm_gpusvm_zdd_put(zdd);
> > > > > +	kvfree(buf);
> > > > > +err_mmunlock:
> > > > > +	if (!ctx->mmap_locked) {
> > > > > +		mmap_write_unlock(mm);
> > > > > +		mmput(mm);
> > > > > +	}
> > > > > +err_out:
> > > > > +	return err;
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_migrate_populate_sram_pfn - Populate SRAM PFNs for a VM area
> > > > > + * @vas: Pointer to the VM area structure, can be NULL
> > > > > + * @npages: Number of pages to populate
> > > > > + * @src_mpfn: Source array of migrate PFNs
> > > > > + * @mpfn: Array of migrate PFNs to populate
> > > > > + * @addr: Start address for PFN allocation
> > > > > + *
> > > > > + * This function populates the SRAM migrate page frame numbers (PFNs) for the
> > > > > + * specified VM area structure. It allocates and locks pages in the VM area for
> > > > > + * SRAM usage. If vas is non-NULL use alloc_page_vma for allocation, if NULL use
> > > > > + * alloc_page for allocation.
> > > > > + *
> > > > > + * Returns:
> > > > > + * 0 on success, negative error code on failure.
> > > > > + */
> > > > > +static int drm_gpusvm_migrate_populate_sram_pfn(struct vm_area_struct *vas,
> > > > > +						unsigned long npages,
> > > > > +						unsigned long *src_mpfn,
> > > > > +						unsigned long *mpfn, u64 addr)
> > > > > +{
> > > > > +	unsigned long i;
> > > > > +
> > > > > +	for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
> > > > > +		struct page *page;
> > > > > +
> > > > > +		if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
> > > > > +			continue;
> > > > > +
> > > > > +		if (vas)
> > > > > +			page = alloc_page_vma(GFP_HIGHUSER, vas, addr);
> > > > > +		else
> > > > > +			page = alloc_page(GFP_HIGHUSER);
> > > > > +
> > > > > +		if (!page)
> > > > > +			return -ENOMEM;
> > > > > +
> > > > > +		lock_page(page);
> > > > > +		mpfn[i] = migrate_pfn(page_to_pfn(page));
> > > > > +	}
> > > > > +
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_evict_to_sram - Evict GPU SVM range to SRAM
> > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > + * @range: Pointer to the GPU SVM range structure
> > > > > + *
> > > > > + * Similar to __drm_gpusvm_migrate_to_sram but does not require mmap lock and
> > > > > + * migration done via migrate_device_* functions. Fallback path as it is
> > > > > + * preferred to issue migrations with mmap lock.
> > > > > + *
> > > > > + * Returns:
> > > > > + * 0 on success, negative error code on failure.
> > > > > + */
> > > > > +static int drm_gpusvm_evict_to_sram(struct drm_gpusvm *gpusvm,
> > > > > +				    struct drm_gpusvm_range *range)
> > > > > +{
> > > > > +	unsigned long npages;
> > > > > +	struct page **pages;
> > > > > +	unsigned long *src, *dst;
> > > > > +	dma_addr_t *dma_addr;
> > > > > +	void *buf;
> > > > > +	int i, err = 0;
> > > > > +
> > > > > +	npages = npages_in_range(range->va.start, range->va.end);
> > > > > +
> > > > > +	buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr) +
> > > > > +		       sizeof(*pages), GFP_KERNEL);
> > > > > +	if (!buf) {
> > > > > +		err = -ENOMEM;
> > > > > +		goto err_out;
> > > > > +	}
> > > > > +	src = buf;
> > > > > +	dst = buf + (sizeof(*src) * npages);
> > > > > +	dma_addr = buf + (2 * sizeof(*src) * npages);
> > > > > +	pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) * npages;
> > > > > +
> > > > > +	err = gpusvm->ops->populate_vram_pfn(gpusvm, range->vram_allocation,
> > > > > +					     npages, src);
> > > > > +	if (err)
> > > > > +		goto err_free;
> > > > > +
> > > > > +	err = migrate_device_vma_range(gpusvm->mm,
> > > > > +				       gpusvm->device_private_page_owner, src,
> > > > > +				       npages, range->va.start);
> > > > > +	if (err)
> > > > > +		goto err_free;
> > > > > +
> > > > > +	err = drm_gpusvm_migrate_populate_sram_pfn(NULL, npages, src, dst, 0);
> > > > > +	if (err)
> > > > > +		goto err_finalize;
> > > > > +
> > > > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> > > > > +					   dst, npages, DMA_BIDIRECTIONAL);
> > > > > +	if (err)
> > > > > +		goto err_finalize;
> > > > > +
> > > > > +	for (i = 0; i < npages; ++i)
> > > > > +		pages[i] = migrate_pfn_to_page(src[i]);
> > > > > +
> > > > > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr, npages);
> > > > > +	if (err)
> > > > > +		goto err_finalize;
> > > > > +
> > > > > +err_finalize:
> > > > > +	if (err)
> > > > > +		drm_gpusvm_migration_put_pages(npages, dst);
> > > > > +	migrate_device_pages(src, dst, npages);
> > > > > +	migrate_device_finalize(src, dst, npages);
> > > > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
> > > > > +				       DMA_BIDIRECTIONAL);
> > > > > +err_free:
> > > > > +	kvfree(buf);
> > > > > +err_out:
> > > > > +
> > > > > +	return err;
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * __drm_gpusvm_migrate_to_sram - Migrate GPU SVM range to SRAM (internal)
> > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > + * @vas: Pointer to the VM area structure
> > > > > + * @page: Pointer to the page for fault handling (can be NULL)
> > > > > + * @start: Start address of the migration range
> > > > > + * @end: End address of the migration range
> > > > > + *
> > > > > + * This internal function performs the migration of the specified GPU SVM range
> > > > > + * to SRAM. It sets up the migration, populates + dma maps SRAM PFNs, and
> > > > > + * invokes the driver-specific operations for migration to SRAM.
> > > > > + *
> > > > > + * Returns:
> > > > > + * 0 on success, negative error code on failure.
> > > > > + */
> > > > > +static int __drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > > > +					struct vm_area_struct *vas,
> > > > > +					struct page *page,
> > > > > +					u64 start, u64 end)
> > > > > +{
> > > > > +	struct migrate_vma migrate = {
> > > > > +		.vma		= vas,
> > > > > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > > > > +		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
> > > > > +		.fault_page	= page,
> > > > > +	};
> > > > > +	unsigned long npages;
> > > > > +	struct page **pages;
> > > > > +	dma_addr_t *dma_addr;
> > > > > +	void *buf;
> > > > > +	int i, err = 0;
> > > > > +
> > > > > +	mmap_assert_locked(gpusvm->mm);
> > > > > +
> > > > > +	/* Corner where VMA area struct has been partially unmapped */
> > > > > +	if (start < vas->vm_start)
> > > > > +		start = vas->vm_start;
> > > > > +	if (end > vas->vm_end)
> > > > > +		end = vas->vm_end;
> > > > > +
> > > > > +	migrate.start = start;
> > > > > +	migrate.end = end;
> > > > > +	npages = npages_in_range(start, end);
> > > > > +
> > > > > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
> > > > > +		       sizeof(*pages), GFP_KERNEL);
> > > > > +	if (!buf) {
> > > > > +		err = -ENOMEM;
> > > > > +		goto err_out;
> > > > > +	}
> > > > > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > > > > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
> > > > > +
> > > > > +	migrate.vma = vas;
> > > > > +	migrate.src = buf;
> > > > > +	migrate.dst = migrate.src + npages;
> > > > > +
> > > > > +	err = migrate_vma_setup(&migrate);
> > > > > +	if (err)
> > > > > +		goto err_free;
> > > > > +
> > > > > +	/* Raced with another CPU fault, nothing to do */
> > > > > +	if (!migrate.cpages)
> > > > > +		goto err_free;
> > > > > +
> > > > > +	err = drm_gpusvm_migrate_populate_sram_pfn(vas, npages,
> > > > > +						   migrate.src, migrate.dst,
> > > > > +						   start);
> > > > > +	if (err)
> > > > > +		goto err_finalize;
> > > > > +
> > > > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> > > > > +					   migrate.dst, npages,
> > > > > +					   DMA_BIDIRECTIONAL);
> > > > > +	if (err)
> > > > > +		goto err_finalize;
> > > > > +
> > > > > +	for (i = 0; i < npages; ++i)
> > > > > +		pages[i] = migrate_pfn_to_page(migrate.src[i]);
> > > > > +
> > > > > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr, npages);
> > > > > +	if (err)
> > > > > +		goto err_finalize;
> > > > > +
> > > > > +err_finalize:
> > > > > +	if (err)
> > > > > +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> > > > > +	migrate_vma_pages(&migrate);
> > > > > +	migrate_vma_finalize(&migrate);
> > > > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
> > > > > +				       DMA_BIDIRECTIONAL);
> > > > > +err_free:
> > > > > +	kvfree(buf);
> > > > > +err_out:
> > > > > +	mmap_assert_locked(gpusvm->mm);
> > > > > +
> > > > > +	return err;
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_migrate_to_sram - Migrate (evict) GPU SVM range to SRAM
> > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > + * @range: Pointer to the GPU SVM range structure
> > > > > + * @ctx: GPU SVM context
> > > > > + *
> > > > > + * This function initiates the migration of the specified GPU SVM range to
> > > > > + * SRAM. It performs necessary checks and invokes the internal migration
> > > > > + * function for actual migration.
> > > > > + *
> > > > > + * Returns:
> > > > > + * 0 on success, negative error code on failure.
> > > > > + */
> > > > > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > > > +			       struct drm_gpusvm_range *range,
> > > > > +			       const struct drm_gpusvm_ctx *ctx)
> > > > > +{
> > > > > +	u64 start = range->va.start, end = range->va.end;
> > > > > +	struct mm_struct *mm = gpusvm->mm;
> > > > > +	struct vm_area_struct *vas;
> > > > > +	int err;
> > > > > +	bool retry = false;
> > > > > +
> > > > > +	if (!ctx->mmap_locked) {
> > > > > +		if (!mmget_not_zero(mm)) {
> > > > > +			err = -EFAULT;
> > > > > +			goto err_out;
> > > > > +		}
> > > > > +		if (ctx->trylock_mmap) {
> > > > > +			if (!mmap_read_trylock(mm)) {
> > > > > +				err = drm_gpusvm_evict_to_sram(gpusvm, range);
> > > > > +				goto err_mmput;
> > > > > +			}
> > > > > +		} else {
> > > > > +			mmap_read_lock(mm);
> > > > > +		}
> > > > > +	}
> > > > > +
> > > > > +	mmap_assert_locked(mm);
> > > > > +
> > > > > +	/*
> > > > > +	 * Loop required to find all VMA area structs for the corner case when
> > > > > +	 * VRAM backing has been partially unmapped from MM's address space.
> > > > > +	 */
> > > > > +again:
> > > > > +	vas = find_vma(mm, start);
> > > > > +	if (!vas) {
> > > > > +		if (!retry)
> > > > > +			err = -ENOENT;
> > > > > +		goto err_mmunlock;
> > > > > +	}
> > > > > +
> > > > > +	if (end <= vas->vm_start || start >= vas->vm_end) {
> > > > > +		if (!retry)
> > > > > +			err = -EINVAL;
> > > > > +		goto err_mmunlock;
> > > > > +	}
> > > > > +
> > > > > +	err = __drm_gpusvm_migrate_to_sram(gpusvm, vas, NULL, start, end);
> > > > > +	if (err)
> > > > > +		goto err_mmunlock;
> > > > > +
> > > > > +	if (vas->vm_end < end) {
> > > > > +		retry = true;
> > > > > +		start = vas->vm_end;
> > > > > +		goto again;
> > > > > +	}
> > > > > +
> > > > > +	if (!ctx->mmap_locked) {
> > > > > +		mmap_read_unlock(mm);
> > > > > +		/*
> > > > > +		 * Using mmput_async as this function can be called while
> > > > > +		 * holding a dma-resv lock, and a final put can grab the mmap
> > > > > +		 * lock, causing a lock inversion.
> > > > > +		 */
> > > > > +		mmput_async(mm);
> > > > > +	}
> > > > > +
> > > > > +	return 0;
> > > > > +
> > > > > +err_mmunlock:
> > > > > +	if (!ctx->mmap_locked)
> > > > > +		mmap_read_unlock(mm);
> > > > > +err_mmput:
> > > > > +	if (!ctx->mmap_locked)
> > > > > +		mmput_async(mm);
> > > > > +err_out:
> > > > > +	return err;
> > > > > +}
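For completeness, an eviction caller (for instance one that already holds a dma-resv lock) would typically set the trylock flag so the mmap lock ordering is preserved; a minimal hedged sketch using only the ctx flag and functions defined above:

	/* Hedged sketch: evicting VRAM backing from a path holding dma-resv */
	struct drm_gpusvm_ctx ctx = { .trylock_mmap = true };
	int err;

	err = drm_gpusvm_migrate_to_sram(gpusvm, range, &ctx);
	/* If mmap_read_trylock() fails, the call falls back internally to
	 * drm_gpusvm_evict_to_sram(), which migrates via migrate_device_*
	 * without taking the mmap lock. */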
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_page_free - Put GPU SVM zone device data associated with a page
> > > > > + * @page: Pointer to the page
> > > > > + *
> > > > > + * This function is a callback used to put the GPU SVM zone device data
> > > > > + * associated with a page when it is being released.
> > > > > + */
> > > > > +static void drm_gpusvm_page_free(struct page *page)
> > > > > +{
> > > > > +	drm_gpusvm_zdd_put(page->zone_device_data);
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_migrate_to_ram - Migrate GPU SVM range to RAM (page fault handler)
> > > > > + * @vmf: Pointer to the fault information structure
> > > > > + *
> > > > > + * This function is a page fault handler used to migrate a GPU SVM range to RAM.
> > > > > + * It retrieves the GPU SVM range information from the faulting page and invokes
> > > > > + * the internal migration function to migrate the range back to RAM.
> > > > > + *
> > > > > + * Returns:
> > > > > + * VM_FAULT_SIGBUS on failure, 0 on success.
> > > > > + */
> > > > > +static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault *vmf)
> > > > > +{
> > > > > +	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
> > > > > +	int err;
> > > > > +
> > > > > +	err = __drm_gpusvm_migrate_to_sram(zdd->range->gpusvm,
> > > > > +					   vmf->vma, vmf->page,
> > > > > +					   zdd->range->va.start,
> > > > > +					   zdd->range->va.end);
> > > > > +
> > > > > +	return err ? VM_FAULT_SIGBUS : 0;
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_pagemap_ops - Device page map operations for GPU SVM
> > > > > + */
> > > > > +static const struct dev_pagemap_ops drm_gpusvm_pagemap_ops = {
> > > > > +	.page_free = drm_gpusvm_page_free,
> > > > > +	.migrate_to_ram = drm_gpusvm_migrate_to_ram,
> > > > > +};
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_pagemap_ops_get - Retrieve GPU SVM device page map operations
> > > > > + *
> > > > > + * Returns:
> > > > > + * Pointer to the GPU SVM device page map operations structure.
> > > > > + */
> > > > > +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void)
> > > > > +{
> > > > > +	return &drm_gpusvm_pagemap_ops;
> > > > > +}
> > > > > +
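As an aside, a minimal sketch of how a driver could plug the returned ops into a
dev_pagemap when onlining VRAM as device-private memory. driver_devm_add_vram and
its error handling are made up for illustration; only drm_gpusvm_pagemap_ops_get()
comes from this patch.

	/* Hypothetical glue code, not part of this patch (needs linux/memremap.h
	 * and linux/ioport.h). */
	static int driver_devm_add_vram(struct drm_device *drm,
					struct dev_pagemap *pagemap,
					resource_size_t size)
	{
		struct resource *res;
		void *addr;

		res = devm_request_free_mem_region(drm->dev, &iomem_resource, size);
		if (IS_ERR(res))
			return PTR_ERR(res);

		pagemap->type = MEMORY_DEVICE_PRIVATE;
		pagemap->range.start = res->start;
		pagemap->range.end = res->end;
		pagemap->nr_range = 1;
		pagemap->ops = drm_gpusvm_pagemap_ops_get();
		pagemap->owner = drm;	/* must match device_private_page_owner */

		addr = devm_memremap_pages(drm->dev, pagemap);
		return PTR_ERR_OR_ZERO(addr);
	}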
> > > > > +/**
> > > > > + * drm_gpusvm_has_mapping - Check if GPU SVM has mapping for the given address range
> > > > > + * @gpusvm: Pointer to the GPU SVM structure.
> > > > > + * @start: Start address
> > > > > + * @end: End address
> > > > > + *
> > > > > + * Returns:
> > > > > + * True if GPU SVM has mapping, False otherwise
> > > > > + */
> > > > > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start, u64 end)
> > > > > +{
> > > > > +	struct drm_gpusvm_notifier *notifier;
> > > > > +
> > > > > +	drm_gpusvm_for_each_notifier(notifier, gpusvm, start, end) {
> > > > > +		struct drm_gpusvm_range *range = NULL;
> > > > > +
> > > > > +		drm_gpusvm_for_each_range(range, notifier, start, end)
> > > > > +			return true;
> > > > > +	}
> > > > > +
> > > > > +	return false;
> > > > > +}
> > > > > diff --git a/drivers/gpu/drm/xe/drm_gpusvm.h b/drivers/gpu/drm/xe/drm_gpusvm.h
> > > > > new file mode 100644
> > > > > index 000000000000..0ea70f8534a8
> > > > > --- /dev/null
> > > > > +++ b/drivers/gpu/drm/xe/drm_gpusvm.h
> > > > > @@ -0,0 +1,415 @@
> > > > > +/* SPDX-License-Identifier: MIT */
> > > > > +/*
> > > > > + * Copyright © 2024 Intel Corporation
> > > > > + */
> > > > > +
> > > > > +#ifndef __DRM_GPUSVM_H__
> > > > > +#define __DRM_GPUSVM_H__
> > > > > +
> > > > > +#include <linux/kref.h>
> > > > > +#include <linux/mmu_notifier.h>
> > > > > +#include <linux/workqueue.h>
> > > > > +
> > > > > +struct dev_pagemap_ops;
> > > > > +struct drm_device;
> > > > > +struct drm_gpusvm;
> > > > > +struct drm_gpusvm_notifier;
> > > > > +struct drm_gpusvm_ops;
> > > > > +struct drm_gpusvm_range;
> > > > > +
> > > > > +/**
> > > > > + * struct drm_gpusvm_ops - Operations structure for GPU SVM
> > > > > + *
> > > > > + * This structure defines the operations for GPU Shared Virtual Memory (SVM).
> > > > > + * These operations are provided by the GPU driver to manage SVM ranges and
> > > > > + * perform operations such as migration between VRAM and system RAM.
> > > > > + */
> > > > > +struct drm_gpusvm_ops {
> > > > > +	/**
> > > > > +	 * @notifier_alloc: Allocate a GPU SVM notifier (optional)
> > > > > +	 *
> > > > > +	 * This function shall allocate a GPU SVM notifier.
> > > > > +	 *
> > > > > +	 * Returns:
> > > > > +	 * Pointer to the allocated GPU SVM notifier on success, NULL on failure.
> > > > > +	 */
> > > > > +	struct drm_gpusvm_notifier *(*notifier_alloc)(void);
> > > > > +
> > > > > +	/**
> > > > > +	 * @notifier_free: Free a GPU SVM notifier (optional)
> > > > > +	 * @notifier: Pointer to the GPU SVM notifier to be freed
> > > > > +	 *
> > > > > +	 * This function shall free a GPU SVM notifier.
> > > > > +	 */
> > > > > +	void (*notifier_free)(struct drm_gpusvm_notifier *notifier);
> > > > > +
> > > > > +	/**
> > > > > +	 * @range_alloc: Allocate a GPU SVM range (optional)
> > > > > +	 * @gpusvm: Pointer to the GPU SVM
> > > > > +	 *
> > > > > +	 * This function shall allocate a GPU SVM range.
> > > > > +	 *
> > > > > +	 * Returns:
> > > > > +	 * Pointer to the allocated GPU SVM range on success, NULL on failure.
> > > > > +	 */
> > > > > +	struct drm_gpusvm_range *(*range_alloc)(struct drm_gpusvm *gpusvm);
> > > > > +
> > > > > +	/**
> > > > > +	 * @range_free: Free a GPU SVM range (optional)
> > > > > +	 * @range: Pointer to the GPU SVM range to be freed
> > > > > +	 *
> > > > > +	 * This function shall free a GPU SVM range.
> > > > > +	 */
> > > > > +	void (*range_free)(struct drm_gpusvm_range *range);
> > > > > +
> > > > > +	/**
> > > > > +	 * @vram_release: Release VRAM allocation (optional)
> > > > > +	 * @vram_allocation: Driver-private pointer to the VRAM allocation
> > > > > +	 *
> > > > > +	 * This function shall release VRAM allocation and expects to drop a
> > > > > +	 * reference to VRAM allocation.
> > > > > +	 */
> > > > > +	void (*vram_release)(void *vram_allocation);
> > > > > +
> > > > > +	/**
> > > > > +	 * @populate_vram_pfn: Populate VRAM PFN (required for migration)
> > > > > +	 * @gpusvm: Pointer to the GPU SVM
> > > > > +	 * @vram_allocation: Driver-private pointer to the VRAM allocation
> > > > > +	 * @npages: Number of pages to populate
> > > > > +	 * @pfn: Array of page frame numbers to populate
> > > > > +	 *
> > > > > +	 * This function shall populate VRAM page frame numbers (PFN).
> > > > > +	 *
> > > > > +	 * Returns:
> > > > > +	 * 0 on success, a negative error code on failure.
> > > > > +	 */
> > > > > +	int (*populate_vram_pfn)(struct drm_gpusvm *gpusvm,
> > > > > +				 void *vram_allocation,
> > > > > +				 unsigned long npages,
> > > > > +				 unsigned long *pfn);
> > > > > +
> > > > > +	/**
> > > > > +	 * @copy_to_vram: Copy to VRAM (required for migration)
> > > > > +	 * @gpusvm: Pointer to the GPU SVM
> > > > > +	 * @pages: Pointer to array of VRAM pages (destination)
> > > > > +	 * @dma_addr: Pointer to array of DMA addresses (source)
> > > > > +	 * @npages: Number of pages to copy
> > > > > +	 *
> > > > > +	 * This function shall copy pages to VRAM.
> > > > > +	 *
> > > > > +	 * Returns:
> > > > > +	 * 0 on success, a negative error code on failure.
> > > > > +	 */
> > > > > +	int (*copy_to_vram)(struct drm_gpusvm *gpusvm,
> > > > > +			    struct page **pages,
> > > > > +			    dma_addr_t *dma_addr,
> > > > > +			    unsigned long npages);
> > > > > +
> > > > > +	/**
> > > > > +	 * @copy_to_sram: Copy to system RAM (required for migration)
> > > > > +	 * @gpusvm: Pointer to the GPU SVM
> > > > > +	 * @pages: Pointer to array of VRAM pages (source)
> > > > > +	 * @dma_addr: Pointer to array of DMA addresses (destination)
> > > > > +	 * @npages: Number of pages to copy
> > > > > +	 *
> > > > > +	 * This function shall copy pages to system RAM.
> > > > > +	 *
> > > > > +	 * Returns:
> > > > > +	 * 0 on success, a negative error code on failure.
> > > > > +	 */
> > > > > +	int (*copy_to_sram)(struct drm_gpusvm *gpusvm,
> > > > > +			    struct page **pages,
> > > > > +			    dma_addr_t *dma_addr,
> > > > > +			    unsigned long npages);
> > > > > +
> > > > > +	/**
> > > > > +	 * @invalidate: Invalidate GPU SVM notifier (required)
> > > > > +	 * @gpusvm: Pointer to the GPU SVM
> > > > > +	 * @notifier: Pointer to the GPU SVM notifier
> > > > > +	 * @mmu_range: Pointer to the mmu_notifier_range structure
> > > > > +	 *
> > > > > +	 * This function shall invalidate the GPU page tables. It can safely
> > > > > +	 * walk the notifier range RB tree/list in this function. Called while
> > > > > +	 * holding the notifier lock.
> > > > > +	 */
> > > > > +	void (*invalidate)(struct drm_gpusvm *gpusvm,
> > > > > +			   struct drm_gpusvm_notifier *notifier,
> > > > > +			   const struct mmu_notifier_range *mmu_range);
> > > > > +};
> > > > > +
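To make the required vs. optional split above concrete, a sketch of what a minimal
ops table could look like for a driver that supports migration; all driver_*
symbols are placeholders, not functions from this series.

	/* Hypothetical minimal ops table: the optional notifier/range alloc and
	 * free hooks are simply omitted; only the migration-related hooks and
	 * the mandatory invalidate hook are wired up. */
	static const struct drm_gpusvm_ops driver_gpusvm_ops = {
		.vram_release	   = driver_vram_release,
		.populate_vram_pfn = driver_populate_vram_pfn,
		.copy_to_vram	   = driver_copy_to_vram,
		.copy_to_sram	   = driver_copy_to_sram,
		.invalidate	   = driver_invalidate,
	};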
> > > > > +/**
> > > > > + * struct drm_gpusvm_notifier - Structure representing a GPU SVM notifier
> > > > > + *
> > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > + * @notifier: MMU interval notifier
> > > > > + * @interval: Interval for the notifier
> > > > > + * @rb: Red-black tree node for the parent GPU SVM structure notifier tree
> > > > > + * @root: Cached root node of the RB tree containing ranges
> > > > > + * @range_list: List head of ranges in the same order they appear in the
> > > > > + *              interval tree. This is useful to keep iterating ranges while
> > > > > + *              doing modifications to the RB tree.
> > > > > + * @flags.removed: Flag indicating whether the MMU interval notifier has been
> > > > > + *                 removed
> > > > > + *
> > > > > + * This structure represents a GPU SVM notifier.
> > > > > + */
> > > > > +struct drm_gpusvm_notifier {
> > > > > +	struct drm_gpusvm *gpusvm;
> > > > > +	struct mmu_interval_notifier notifier;
> > > > > +	struct {
> > > > > +		u64 start;
> > > > > +		u64 end;
> > > > > +	} interval;
> > > > > +	struct {
> > > > > +		struct rb_node node;
> > > > > +		struct list_head entry;
> > > > > +		u64 __subtree_last;
> > > > > +	} rb;
> > > > > +	struct rb_root_cached root;
> > > > > +	struct list_head range_list;
> > > > > +	struct {
> > > > > +		u32 removed : 1;
> > > > > +	} flags;
> > > > > +};
> > > > > +
> > > > > +/**
> > > > > + * struct drm_gpusvm_range - Structure representing a GPU SVM range
> > > > > + *
> > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > + * @notifier: Pointer to the GPU SVM notifier
> > > > > + * @refcount: Reference count for the range
> > > > > + * @rb: Red-black tree node for the parent GPU SVM notifier structure range tree
> > > > > + * @va: Virtual address range
> > > > > + * @notifier_seq: Notifier sequence number of the range's pages
> > > > > + * @pages: Pointer to the array of pages (if backing store is in VRAM)
> > > > > + * @dma_addr: DMA address array (if backing store is SRAM and DMA mapped)
> > > > > + * @vram_allocation: Driver-private pointer to the VRAM allocation
> > > > > + * @order: Order of dma mapping. i.e. PAGE_SIZE << order is mapping size
> > > > > + * @flags.migrate_vram: Flag indicating whether the range can be migrated to VRAM
> > > > > + * @flags.unmapped: Flag indicating if the range has been unmapped
> > > > > + * @flags.partial_unmap: Flag indicating if the range has been partially unmapped
> > > > > + * @flags.has_vram_pages: Flag indicating if the range has vram pages
> > > > > + * @flags.has_dma_mapping: Flag indicating if the range has a DMA mapping
> > > > > + * @flags.kfree_mapping: Flag indicating @dma_addr is a compact allocation based
> > > > > + *                       on @order which releases via kfree
> > > > > + *
> > > > > + * This structure represents a GPU SVM range used for tracking memory ranges
> > > > > + * mapped in a DRM device.
> > > > > + */
> > > > > +struct drm_gpusvm_range {
> > > > > +	struct drm_gpusvm *gpusvm;
> > > > > +	struct drm_gpusvm_notifier *notifier;
> > > > > +	struct kref refcount;
> > > > > +	struct {
> > > > > +		struct rb_node node;
> > > > > +		struct list_head entry;
> > > > > +		u64 __subtree_last;
> > > > > +	} rb;
> > > > > +	struct {
> > > > > +		u64 start;
> > > > > +		u64 end;
> > > > > +	} va;
> > > > > +	unsigned long notifier_seq;
> > > > > +	union {
> > > > > +		struct page **pages;
> > > > > +		dma_addr_t *dma_addr;
> > > > > +	};
> > > > > +	void *vram_allocation;
> > > > > +	u16 order;
> > > > > +	struct {
> > > > > +		/* All flags below must be set upon creation */
> > > > > +		u16 migrate_vram : 1;
> > > > > +		/* All flags below must be set / cleared under notifier lock */
> > > > > +		u16 unmapped : 1;
> > > > > +		u16 partial_unmap : 1;
> > > > > +		u16 has_vram_pages : 1;
> > > > > +		u16 has_dma_mapping : 1;
> > > > > +		u16 kfree_mapping : 1;
> > > > > +	} flags;
> > > > > +};
> > > > > +
> > > > > +/**
> > > > > + * struct drm_gpusvm - GPU SVM structure
> > > > > + *
> > > > > + * @name: Name of the GPU SVM
> > > > > + * @drm: Pointer to the DRM device structure
> > > > > + * @mm: Pointer to the mm_struct for the address space
> > > > > + * @device_private_page_owner: Device private pages owner
> > > > > + * @mm_start: Start address of GPU SVM
> > > > > + * @mm_range: Range of the GPU SVM
> > > > > + * @notifier_size: Size of individual notifiers
> > > > > + * @ops: Pointer to the operations structure for GPU SVM
> > > > > + * @chunk_sizes: Pointer to the array of chunk sizes used in range allocation.
> > > > > + *               Entries should be powers of 2 in descending order.
> > > > > + * @num_chunks: Number of chunks
> > > > > + * @notifier_lock: Read-write semaphore for protecting notifier operations
> > > > > + * @zdd_wq: Workqueue for deferred work on zdd destruction
> > > > > + * @root: Cached root node of the Red-Black tree containing GPU SVM notifiers
> > > > > + * @notifier_list: list head of notifiers in the same order they appear in the
> > > > > + *                 interval tree. This is useful to keep iterating notifiers
> > > > > + *                 while doing modifications to the RB tree.
> > > > > + *
> > > > > + * This structure represents a GPU SVM (Shared Virtual Memory) used for tracking
> > > > > + * memory ranges mapped in a DRM (Direct Rendering Manager) device.
> > > > > + *
> > > > > + * No reference counting is provided, as this is expected to be embedded in the
> > > > > + * driver VM structure along with the struct drm_gpuvm, which handles reference
> > > > > + * counting.
> > > > > + */
> > > > > +struct drm_gpusvm {
> > > > > +	const char *name;
> > > > > +	struct drm_device *drm;
> > > > > +	struct mm_struct *mm;
> > > > > +	void *device_private_page_owner;
> > > > > +	u64 mm_start;
> > > > > +	u64 mm_range;
> > > > > +	u64 notifier_size;
> > > > > +	const struct drm_gpusvm_ops *ops;
> > > > > +	const u64 *chunk_sizes;
> > > > > +	int num_chunks;
> > > > > +	struct rw_semaphore notifier_lock;
> > > > > +	struct workqueue_struct *zdd_wq;
> > > > > +	struct rb_root_cached root;
> > > > > +	struct list_head notifier_list;
> > > > > +};
> > > > > +
> > > > > +/**
> > > > > + * struct drm_gpusvm_ctx - DRM GPU SVM context
> > > > > + *
> > > > > + * @mmap_locked: mmap lock is locked
> > > > > + * @trylock_mmap: trylock mmap lock, used to avoid locking inversions
> > > > > + *                (e.g. dma-resv -> mmap lock)
> > > > > + * @in_notifier: entering from a MMU notifier
> > > > > + * @read_only: operating on read-only memory
> > > > > + * @vram_possible: possible to use VRAM
> > > > > + * @prefault: prefault pages
> > > > > + *
> > > > > + * Context that DRM GPUSVM is operating in (i.e. user arguments).
> > > > > + */
> > > > > +struct drm_gpusvm_ctx {
> > > > > +	u32 mmap_locked :1;
> > > > > +	u32 trylock_mmap :1;
> > > > > +	u32 in_notifier :1;
> > > > > +	u32 read_only :1;
> > > > > +	u32 vram_possible :1;
> > > > > +	u32 prefault :1;
> > > > > +};
> > > > > +
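For a quick illustration of how these flags tend to be combined at the two main
call sites; the split below is a guess at common usage, not something the patch
mandates.

	/* Hypothetical contexts: GPU fault handler vs. MMU notifier callback. */
	static const struct drm_gpusvm_ctx fault_ctx = {
		.vram_possible = true,	/* fault handler may migrate the range to VRAM */
	};

	static const struct drm_gpusvm_ctx notifier_ctx = {
		.in_notifier = true,	/* called from the MMU notifier invalidate path */
	};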
> > > > > +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> > > > > +		    const char *name, struct drm_device *drm,
> > > > > +		    struct mm_struct *mm, void *device_private_page_owner,
> > > > > +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> > > > > +		    const struct drm_gpusvm_ops *ops,
> > > > > +		    const u64 *chunk_sizes, int num_chunks);
> > > > > +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm);
> > > > > +void drm_gpusvm_free(struct drm_gpusvm *gpusvm);
> > > > > +
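For reference, a sketch of what an init call could look like with the values
mentioned elsewhere in the series (512M notifiers, 2M/64K/4K chunks); vm,
driver_gpusvm_ops and the address-space bounds are made up for illustration.

	/* Hypothetical init call; chunk sizes must be powers of 2 in
	 * descending order. */
	static const u64 driver_chunk_sizes[] = { SZ_2M, SZ_64K, SZ_4K };

	err = drm_gpusvm_init(&vm->svm, "driver-svm", drm, current->mm,
			      drm,			/* device_private_page_owner */
			      0, 1ull << 47,		/* mm_start, mm_range */
			      SZ_512M,			/* notifier_size */
			      &driver_gpusvm_ops,
			      driver_chunk_sizes,
			      ARRAY_SIZE(driver_chunk_sizes));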
> > > > > +struct drm_gpusvm_range *
> > > > > +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64 fault_addr,
> > > > > +				u64 gpuva_start, u64 gpuva_end,
> > > > > +				const struct drm_gpusvm_ctx *ctx);
> > > > > +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> > > > > +			     struct drm_gpusvm_range *range);
> > > > > +
> > > > > +struct drm_gpusvm_range *
> > > > > +drm_gpusvm_range_get(struct drm_gpusvm_range *range);
> > > > > +void drm_gpusvm_range_put(struct drm_gpusvm_range *range);
> > > > > +
> > > > > +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> > > > > +				  struct drm_gpusvm_range *range);
> > > > > +
> > > > > +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> > > > > +			       struct drm_gpusvm_range *range,
> > > > > +			       const struct drm_gpusvm_ctx *ctx);
> > > > > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > > > > +				  struct drm_gpusvm_range *range,
> > > > > +				  const struct drm_gpusvm_ctx *ctx);
> > > > > +
> > > > > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > > > > +			       struct drm_gpusvm_range *range,
> > > > > +			       void *vram_allocation,
> > > > > +			       const struct drm_gpusvm_ctx *ctx);
> > > > > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > > > +			       struct drm_gpusvm_range *range,
> > > > > +			       const struct drm_gpusvm_ctx *ctx);
> > > > > +
> > > > > +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void);
> > > > > +
> > > > > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start, u64 end);
> > > > > +
> > > > > +struct drm_gpusvm_range *
> > > > > +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64 start, u64 end);
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_notifier_lock - Lock GPU SVM notifier
> > > > > + * @gpusvm__: Pointer to the GPU SVM structure.
> > > > > + *
> > > > > + * Abstract client usage GPU SVM notifier lock, take lock
> > > > > + */
> > > > > +#define drm_gpusvm_notifier_lock(gpusvm__)	\
> > > > > +	down_read(&(gpusvm__)->notifier_lock)
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_notifier_unlock - Unlock GPU SVM notifier
> > > > > + * @gpusvm__: Pointer to the GPU SVM structure.
> > > > > + *
> > > > > + * Abstract client usage GPU SVM notifier lock, drop lock
> > > > > + */
> > > > > +#define drm_gpusvm_notifier_unlock(gpusvm__)	\
> > > > > +	up_read(&(gpusvm__)->notifier_lock)
> > > > > +
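A small usage sketch of the locking pattern these wrappers are meant for, modelled
on the bind example in the DOC comment; driver_commit_bind is a placeholder.

	/* Hypothetical commit path: recheck page validity under the notifier lock. */
	drm_gpusvm_notifier_lock(gpusvm);
	if (drm_gpusvm_range_pages_valid(gpusvm, range))
		driver_commit_bind(gpusvm, range);
	else
		err = -EAGAIN;	/* pages were invalidated, caller retries */
	drm_gpusvm_notifier_unlock(gpusvm);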
> > > > > +/**
> > > > > + * __drm_gpusvm_range_next - Get the next GPU SVM range in the list
> > > > > + * @range: a pointer to the current GPU SVM range
> > > > > + *
> > > > > + * Return: A pointer to the next drm_gpusvm_range if available, or NULL if the
> > > > > + *         current range is the last one or if the input range is NULL.
> > > > > + */
> > > > > +static inline struct drm_gpusvm_range *
> > > > > +__drm_gpusvm_range_next(struct drm_gpusvm_range *range)
> > > > > +{
> > > > > +	if (range && !list_is_last(&range->rb.entry,
> > > > > +				   &range->notifier->range_list))
> > > > > +		return list_next_entry(range, rb.entry);
> > > > > +
> > > > > +	return NULL;
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * drm_gpusvm_for_each_range - Iterate over GPU SVM ranges in a notifier
> > > > > + * @range__: Iterator variable for the ranges. If set, it indicates the start of
> > > > > + *	     the iterator. If NULL, call drm_gpusvm_range_find() to get the range.
> > > > > + * @notifier__: Pointer to the GPU SVM notifier
> > > > > + * @start__: Start address of the range
> > > > > + * @end__: End address of the range
> > > > > + *
> > > > > + * This macro is used to iterate over GPU SVM ranges in a notifier. It is safe
> > > > > + * to use while holding the driver SVM lock or the notifier lock.
> > > > > + */
> > > > > +#define drm_gpusvm_for_each_range(range__, notifier__, start__, end__)	\
> > > > > +	for ((range__) = (range__) ?:					\
> > > > > +	     drm_gpusvm_range_find((notifier__), (start__), (end__));	\
> > > > > +	     (range__) && (range__->va.start < (end__));		\
> > > > > +	     (range__) = __drm_gpusvm_range_next(range__))
> > > > > +
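As a usage illustration (mirroring the invalidation example in the DOC comment),
walking all ranges that overlap an invalidated interval while the notifier lock
is held:

	/* Hypothetical walk over ranges overlapping the invalidated interval. */
	struct drm_gpusvm_range *range = NULL;

	drm_gpusvm_for_each_range(range, notifier, mmu_range->start, mmu_range->end)
		drm_gpusvm_range_unmap_pages(gpusvm, range, &ctx);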
> > > > > +/**
> > > > > + * drm_gpusvm_range_set_unmapped - Mark a GPU SVM range as unmapped
> > > > > + * @range: Pointer to the GPU SVM range structure.
> > > > > + * @mmu_range: Pointer to the MMU notifier range structure.
> > > > > + *
> > > > > + * This function marks a GPU SVM range as unmapped and sets the partial_unmap flag
> > > > > + * if the range partially falls within the provided MMU notifier range.
> > > > > + */
> > > > > +static inline void
> > > > > +drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range *range,
> > > > > +			      const struct mmu_notifier_range *mmu_range)
> > > > > +{
> > > > > +	lockdep_assert_held_write(&range->gpusvm->notifier_lock);
> > > > > +
> > > > > +	range->flags.unmapped = true;
> > > > > +	if (range->va.start < mmu_range->start ||
> > > > > +	    range->va.end > mmu_range->end)
> > > > > +		range->flags.partial_unmap = true;
> > > > > +}
> > > > > +
> > > > > +#endif /* __DRM_GPUSVM_H__ */
> > > > 
> >
Thomas Hellström Aug. 30, 2024, 9:16 a.m. UTC | #19
Hi, Matthew

On Tue, 2024-08-27 at 19:48 -0700, Matthew Brost wrote:
> +/**
> + * DOC: Overview
> + *
> + * GPU Shared Virtual Memory (GPU SVM) layer for the Direct Rendering Manager (DRM)
> + *
> + * The GPU SVM layer is a component of the DRM framework designed to manage shared
> + * virtual memory between the CPU and GPU. It enables efficient data exchange and
> + * processing for GPU-accelerated applications by allowing memory sharing and
> + * synchronization between the CPU's and GPU's virtual address spaces.
> + *
> + * Key GPU SVM Components:
> + * - Notifiers: Used for tracking memory intervals and notifying the
> + *		GPU of changes, notifiers are sized based on a GPU SVM
> + *		initialization parameter, with a recommendation of 512M or
> + *		larger. They maintain a Red-Black tree and a list of ranges that
> + *		fall within the notifier interval. Notifiers are tracked within
> + *		a GPU SVM Red-Black tree and list and are dynamically inserted
> + *		or removed as ranges within the interval are created or
> + *		destroyed.
> + * - Ranges: Represent memory ranges mapped in a DRM device and managed
> + *	     by GPU SVM. They are sized based on an array of chunk sizes, which
> + *	     is a GPU SVM initialization parameter, and the CPU address space.
> + *	     Upon GPU fault, the largest aligned chunk that fits within the
> + *	     faulting CPU address space is chosen for the range size. Ranges are
> + *	     expected to be dynamically allocated on GPU fault and removed on an
> + *	     MMU notifier UNMAP event. As mentioned above, ranges are tracked in
> + *	     a notifier's Red-Black tree.
> + * - Operations: Define the interface for driver-specific SVM operations such as
> + *		 allocation, page collection, migration, invalidations, and VRAM
> + *		 release.
> + *

Another question, since ranges, as I understand it, are per gpuvm and
per cpu mm, whereas migration is per device and per cpu_mm, (we might
have multiple gpuvms mapping the same cpu_mm), I figure the gpu_svm is
per gpuvm, but that makes migration currently inconsistent, right?

/Thomas
Thomas Hellström Aug. 30, 2024, 9:57 a.m. UTC | #20
Hi, Matthew,

Agreed the below might not be important just now, but some ideas:

On Thu, 2024-08-29 at 20:56 +0000, Matthew Brost wrote:
> Issues with removing a SVM range:
> 
> - Xe bind code stores invalidation / present state in VMA, this would
>   need to be moved to the radix tree. I have Jira open for that work
>   which I believe other developers are going to own.

Yeah, although we shouldn't *design* around xe bind-code and page-table
code shortcomings.


> - Where would the dma mapping / device pages be stored?
> 	- In the radix tree? What if ATS is enabled? We don't have a
> 	  driver owned radix tree. How do we reasonably connect a
> driver
> 	  owned radix to a common GPUSVM layer?

With ATS you mean IOMMU SVA, right? I think we could assume that any
user of this code also has a gpu page-table since otherwise they
couldn't be using VRAM and a simpler solution would be in place. 

But to that specific question, drm_gpusvm state would live in a
drm_gpusvm radix tree and driver-specific stuff in the driver tree. A
helper based approach would then call drm_gpusvm_unmap_dma(range),
whereas a middle layer would just traverse the tree and unmap.

> 	- In the notifier? What is the notifier is sparsely
> populated?
> 	  We would be wasting huge amounts of memory. What is the
> 	  notifier is configured to span the entire virtual address
> 	  space?

Let's assume you use a fake page-table like in xe_pt_walk.c as your
"radix tree", adapted to relevant page-sizes, sparsity is not a
problem.

> - How does the garbage collector work? We can't allocate memory in
> the
>   notifier so we don't anything to add to the garbage collector. We
>   can't directly modify page tables given you need lock in the path
> of
>   reclaim.

The garbage collector would operate on the whole invalidated range. In
the case of xe, upon zapping under reclaim you mark individual page-
table bos that are to be removed as "invalid", the garbage collector
walks the range removing the "invalid" entries. Subsequent (re-binding)
avoids the "invalid" entries, (perhaps even helps removing them) and
can thus race with the garbage collector. Hence, any ranges implied by
the page-table code are eliminated.
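To make that concrete, a rough and purely illustrative sketch of such a range-free
collector; every driver_pt_* name below is invented for this discussion.

	/* Illustrative only: walk the invalidated interval and drop page-table
	 * entries that the notifier marked invalid under reclaim. Re-binding
	 * skips invalid entries, so it can race safely with this walk. */
	static void driver_garbage_collect(struct driver_vm *vm, u64 start, u64 end)
	{
		struct driver_pt_entry *entry;

		driver_pt_for_each_entry(entry, vm, start, end) {
			if (!entry->invalid)
				continue;

			driver_pt_remove_entry(vm, entry);
		}
	}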

> - How do we deal with fault storms (e.g. tons of faults hitting the
> same
>   SVM range in a row)? Without a SVM range no every to know if
> mapping
>   is valid and GPU page handler can be short circuited.

Perhaps look at page-table tree and check whether the gpu_pte causing
the fault is valid.

> - Do we have notifier seqno for every PTE?

I'd say no. With this approach it makes sense to have a wide notifier.
The seqno now only affects binding of new gpu_ptes, so the problem with
a wide notifier becomes that if invalidation occurs to *any* part of
the notifier while we're in the read section during binding, we need to
rerun the binding. Adding more notifiers to mitigate that would be to
optimize faulting performance over core invalidation performance which
Jason asked us to avoid.

/Thomas
Matthew Brost Aug. 30, 2024, 1:47 p.m. UTC | #21
On Fri, Aug 30, 2024 at 11:57:33AM +0200, Thomas Hellström wrote:
> Hi, Matthew,
> 
> Agreed the below might not be important just now, but some ideas:
> 
> On Thu, 2024-08-29 at 20:56 +0000, Matthew Brost wrote:
> > Issues with removing a SVM range:
> > 
> > - Xe bind code stores invalidation / present state in VMA, this would
> >   need to be moved to the radix tree. I have Jira open for that work
> >   which I believe other developers are going to own.
> 
> Yeah, although we shouldn't *design* around xe bind-code and page-table
> code shortcomings.
> 

I'm thinking this one certainly should be fixed sooner rather than
later which would be helpful.

But let's also consider the case where we get a bunch of individual page
invalidates serially for an entire range (I can't remember when this
happens but I have seen it in my testing, will look into this more to
figure out exactly when). If we invalidate 1 page at a time in the radix
tree, each invalidation could potentially result in a TLB invalidation
interaction with the hardware in cases where larger GPU pages are not
being used. A TLB invalidation is going to be vastly slower than any CPU
operation (e.g. RB search, radix tree walk). If we key on a range and
invalidate the entire range once on the first invalidation, this may end
up being significantly faster.

Above is pure speculation though; a lot of what both of us are saying
is... So that's another reason I'd like to get apps running to do profiling.
It would be nice to make design decisions based on data, not speculation.

> 
> > - Where would the dma mapping / device pages be stored?
> > 	- In the radix tree? What if ATS is enabled? We don't have a
> > 	  driver owned radix tree. How do we reasonably connect a
> > driver
> > 	  owned radix to a common GPUSVM layer?
> 
> With ATS you mean IOMMU SVA, right? I think we could assume that any
> user of this code also has a gpu page-table since otherwise they
> couldn't be using VRAM and a simpler solution would be in place. 
>

Fair point.

> But to that specific question, drm_gpusvm state would live in a
> drm_gpusvm radix tree and driver-specific stuff in the driver tree. A
> helper based approach would then call drm_gpusvm_unmap_dma(range),
> whereas a middle layer would just traverse the tree and unmap.
> 

Let me consider this. Open to all options.

> > 	- In the notifier? What is the notifier is sparsely
> > populated?
> > 	  We would be wasting huge amounts of memory. What is the
> > 	  notifier is configured to span the entire virtual address
> > 	  space?
> 
> Let's assume you use a fake page-table like in xe_pt_walk.c as your
> "radix tree", adapted to relevant page-sizes, sparsity is not a
> problem.
>

Ok, makes sense I think.

> > - How does the garbage collector work? We can't allocate memory in
> > the
> >   notifier so we don't anything to add to the garbage collector. We
> >   can't directly modify page tables given you need lock in the path
> > of
> >   reclaim.
> 
> The garbage collector would operate on the whole invalidated range. In
> the case of xe, upon zapping under reclaim you mark individual page-
> table bos that are to be removed as "invalid", the garbage collector
> walks the range removing the "invalid" entries. Subsequent (re-binding)
> avoids the "invalid" entries, (perhaps even helps removing them) and
> can thus race with the garbage collector. Hence, any ranges implied by
> the page-table code are elimitated.
> 

This is pretty much what I came up with too if we didn't have a SVM
range.

> > - How do we deal with fault storms (e.g. tons of faults hitting the
> > same
> >   SVM range in a row)? Without a SVM range no every to know if
> > mapping
> >   is valid and GPU page handler can be short circuited.
> 
> Perhaps look at page-table tree and check whether the gpu_pte causing
> the fault is valid.
> 

Came up with the same thing.

> > - Do we have notifier seqno for every PTE?
> 
> I'd say no. With this approach it makes sense to have a wide notifier.
> The seqno now only affects binding of new gpu_ptes, so the problem with
> a wide notifier becomes that if invalidation occurs to *any* part of
> the notifier while we're in the read section during binding, we need to

I have avoided this with drm_gpusvm_range_pages_valid. This isn't just
an optimization; it is actually required for the 2 tile case to be able to
safely know when dma pages can be unmapped (i.e. you can't dma unmap
pages if either tile has a valid mapping).

Matt

> rerun the binding. Adding more notifiers to mitigate that would be to
> optimize faulting performance over core invalidation performance which
> Jason asked us to avoid.
> 
> /Thomas
> 
> 
>
Matthew Brost Aug. 30, 2024, 1:58 p.m. UTC | #22
On Fri, Aug 30, 2024 at 10:18:58AM +0200, Thomas Hellström wrote:
> Hi, Matthew,
> 
> On Thu, 2024-08-29 at 20:56 +0000, Matthew Brost wrote:
> > On Thu, Aug 29, 2024 at 09:18:29PM +0200, Thomas Hellström wrote:
> > > Hi, Matthew,
> > > 
> > > On Thu, 2024-08-29 at 17:45 +0000, Matthew Brost wrote:
> > > > On Thu, Aug 29, 2024 at 11:16:49AM +0200, Thomas Hellström wrote:
> > > > > Hi, Matt. 
> > > > > 
> > > > > Some initial design comments / questions:
> > > > > 
> > > > > On Tue, 2024-08-27 at 19:48 -0700, Matthew Brost wrote:
> > > > > > This patch introduces support for GPU Shared Virtual Memory
> > > > > > (SVM)
> > > > > > in
> > > > > > the
> > > > > > Direct Rendering Manager (DRM) subsystem. SVM allows for
> > > > > > seamless
> > > > > > sharing of memory between the CPU and GPU, enhancing
> > > > > > performance
> > > > > > and
> > > > > > flexibility in GPU computing tasks.
> > > > > > 
> > > > > > The patch adds the necessary infrastructure for SVM,
> > > > > > including
> > > > > > data
> > > > > > structures and functions for managing SVM ranges and
> > > > > > notifiers.
> > > > > > It
> > > > > > also
> > > > > > provides mechanisms for allocating, deallocating, and
> > > > > > migrating
> > > > > > memory
> > > > > > regions between system RAM and GPU VRAM.
> > > > > > 
> > > > > > This mid-layer is largely inspired by GPUVM.
> > > > > > 
> > > > > > Cc: Dave Airlie <airlied@redhat.com>
> > > > > > Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > > > > > Cc: Christian König <christian.koenig@amd.com>
> > > > > > Cc: <dri-devel@lists.freedesktop.org>
> > > > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > > > > ---
> > > > > >  drivers/gpu/drm/xe/Makefile     |    3 +-
> > > > > >  drivers/gpu/drm/xe/drm_gpusvm.c | 2174
> > > > > > +++++++++++++++++++++++++++++++
> > > > > >  drivers/gpu/drm/xe/drm_gpusvm.h |  415 ++++++
> > > > > >  3 files changed, 2591 insertions(+), 1 deletion(-)
> > > > > >  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.c
> > > > > >  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.h
> > > > > > 
> > > > > > diff --git a/drivers/gpu/drm/xe/Makefile
> > > > > > b/drivers/gpu/drm/xe/Makefile
> > > > > > index b9670ae09a9e..b8fc2ee58f1a 100644
> > > > > > --- a/drivers/gpu/drm/xe/Makefile
> > > > > > +++ b/drivers/gpu/drm/xe/Makefile
> > > > > > @@ -25,7 +25,8 @@ $(obj)/generated/%_wa_oob.c
> > > > > > $(obj)/generated/%_wa_oob.h: $(obj)/xe_gen_wa_oob \
> > > > > >  
> > > > > >  # core driver code
> > > > > >  
> > > > > > -xe-y += xe_bb.o \
> > > > > > +xe-y += drm_gpusvm.o \
> > > > > > +	xe_bb.o \
> > > > > >  	xe_bo.o \
> > > > > >  	xe_bo_evict.o \
> > > > > >  	xe_devcoredump.o \
> > > > > > diff --git a/drivers/gpu/drm/xe/drm_gpusvm.c
> > > > > > b/drivers/gpu/drm/xe/drm_gpusvm.c
> > > > > > new file mode 100644
> > > > > > index 000000000000..fc1e44e6ae72
> > > > > > --- /dev/null
> > > > > > +++ b/drivers/gpu/drm/xe/drm_gpusvm.c
> > > > > > @@ -0,0 +1,2174 @@
> > > > > > +// SPDX-License-Identifier: MIT
> > > > > > +/*
> > > > > > + * Copyright © 2024 Intel Corporation
> > > > > > + *
> > > > > > + * Authors:
> > > > > > + *     Matthew Brost <matthew.brost@intel.com>
> > > > > > + */
> > > > > > +
> > > > > > +#include <linux/dma-mapping.h>
> > > > > > +#include <linux/interval_tree_generic.h>
> > > > > > +#include <linux/hmm.h>
> > > > > > +#include <linux/memremap.h>
> > > > > > +#include <linux/migrate.h>
> > > > > > +#include <linux/mm_types.h>
> > > > > > +#include <linux/pagemap.h>
> > > > > > +#include <linux/slab.h>
> > > > > > +
> > > > > > +#include <drm/drm_device.h>
> > > > > > +#include "drm_gpusvm.h"
> > > > > > +
> > > > > > +/**
> > > > > > + * DOC: Overview
> > > > > > + *
> > > > > > + * GPU Shared Virtual Memory (GPU SVM) layer for the Direct
> > > > > > Rendering Manager (DRM)
> > > > > > + *
> > > > > > + * The GPU SVM layer is a component of the DRM framework
> > > > > > designed to
> > > > > > manage shared
> > > > > > + * virtual memory between the CPU and GPU. It enables
> > > > > > efficient
> > > > > > data
> > > > > > exchange and
> > > > > > + * processing for GPU-accelerated applications by allowing
> > > > > > memory
> > > > > > sharing and
> > > > > > + * synchronization between the CPU's and GPU's virtual
> > > > > > address
> > > > > > spaces.
> > > > > > + *
> > > > > > + * Key GPU SVM Components:
> > > > > > + * - Notifiers: Notifiers: Used for tracking memory
> > > > > > intervals
> > > > > > and
> > > > > > notifying the
> > > > > > + *		GPU of changes, notifiers are sized based on
> > > > > > a
> > > > > > GPU
> > > > > > SVM
> > > > > > + *		initialization parameter, with a
> > > > > > recommendation
> > > > > > of
> > > > > > 512M or
> > > > > > + *		larger. They maintain a Red-BlacK tree and a
> > > > > > list of
> > > > > > ranges that
> > > > > > + *		fall within the notifier interval. Notifiers
> > > > > > are
> > > > > > tracked within
> > > > > > + *		a GPU SVM Red-BlacK tree and list and are
> > > > > > dynamically inserted
> > > > > > + *		or removed as ranges within the interval are
> > > > > > created
> > > > > > or
> > > > > > + *		destroyed.
> > > > > 
> > > > > What is the benefit of this extra layer compared to direct
> > > > > insertion of
> > > > > ranges using mmu_interval_notifier_insert?
> > > > > 
> > > > > IIRC the argument made previously about having wide notifiers
> > > > > was
> > > > > that
> > > > > the rb tree lookups inside the core were costly and if there
> > > > > were
> > > > > only
> > > > > a few, then the rb tree lookups within a notifier range could
> > > > > be
> > > > > replaced with the page-table radix-tree-like lookup, so each
> > > > > lookup
> > > > > complexity would be O(log(n_notifiers) + page_table_depth).
> > > > > 
> > > > > But now we have first an rb-tree lookup in the core and then an
> > > > > rb-
> > > > > tree
> > > > > lookup within each notifier yeilding O(log(n_ranges))
> > > > > 
> > > > > I can see a small benefit in that inserting directly into the
> > > > > core
> > > > > rb-
> > > > > tree will block pending ongoing invalidations, but at a cost of
> > > > > an
> > > > > extra multiplexing layer.
> > > > > 
> > > > 
> > > > So when the notifier is triggered the search is a smaller range.
> > > > In a
> > > > perfect world eventually I'd like to drop the SVM range
> > > > completely.
> > > > There is a lot of changes required in Xe to make that possible
> > > > and
> > > > not
> > > > entirely convinced it is possible and the ROI is worth it
> > > > (additional
> > > > complexity vs. perf benefit). For now, this was a relatively
> > > > simple
> > > > way
> > > > to get SVM working (mirrors boths AMD's and Nvidia's implement
> > > > wrt to
> > > > having a range concept) but also is flexible in the sense the
> > > > notifier
> > > > size can be easily tweaked via a modparam [1] following Jason's
> > > > suggestion of larger notifiers.
> > > > 
> > > > [1]
> > > > https://patchwork.freedesktop.org/patch/611007/?series=137870&rev=1
> > > 
> > > What I meant was the core is already implementing the "one notifier
> > > for
> > > the whole range", since your notifier duplicates the
> > > mmu_interval_notifier functionality.
> > > 
> > > The mmu_interval_notifier first does an rbtree search to get to the
> > > notifier, and then drm_gpusvm does an rbtree search to get to the
> > > range.
> > 
> > Yes.
> > 
> > > 
> > > If the svm notifier layer is skipped, mmu_interval_notifier has to
> > > perform a wider rbtree search to get to the range. The point is,
> > > the
> > > complexity is the same for both approaches so there is no point in
> > > adding a svm notifier layer for that reason. The width of the
> > > notifier
> > > just adjust the relative size of the two rbtree searches, so from
> > > that
> > > point of view the drm_gpusvm does not offer any benefit from
> > > inserting
> > > the ranges into the mmu_interval_notifier directly (except that the
> > > mmu_interval_notifier is slightly more heavyweight).
> > > 
> > 
> > I think a large part of it was to avoid inserting / removing many
> > notifiers as that was expensive. Agree the search is not
> > fundamentally
> > faster the way I have this coded. It just avoids heavy inserting /
> > removing of notifiers.
> 
> So I specifically asked Jason about the performance problem about using
> many notifiers vs using a single one, and he responded that the problem
> is slowing down the core mm on invalidations, if the RB tree gets too
> large to walk. He also mentioned that we should consider core
> invalidation performance before faulting performance because the latter
> is so slow anyway we must have the driver stack avoid gpu faults using
> user-space prefetching and similar techniques.
> 
> In particular inserting and removing into the mmu_interval tree is not
> costly in terms of locking but because of correctness requirements
> insertion might block on ongoing validations.
> 
> So basically what I'm trying to say is that as long as we're using SVM
> ranges in the way we do (I'm not saying that is wrong at this point,

If you have been following the mmap write discussions at all, one
potential fix for removing that hack is a per range migrate mutex [1].
This also needs to be considered when / if we try to drop the range
concept.

[1] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1111296

> and I agree that could be fine-tuned later), The benefit of an extra
> notifier layer is questionable compared to directly inserting the
> ranges into the mmu_interval_tree. So hence my questions, given those
> considerations why this additional layer?
> 

One thing we could do fairly easily, if you think this is questionable, is
have an option to size the notifier to the range size and wire this to the
notifier size modparam [2]. Again, once we have apps running it would be
fairly easy to profile this and see if there is benefit to this large
notifier scheme. If there really is none, perhaps then we consider ripping
this out.

[2] https://patchwork.freedesktop.org/patch/611007/?series=137870&rev=1

Matt

> Anyway, a more detailed review of the code perhaps helps clear this
> out.
> 
> > 
> > > As I understand it, Jasons comments were based on the assumption
> > > that
> > > the drm_gpusvm search would be radix tree based, and hence with
> > > less
> > > complexity than the rbtree search, and therefore providing a clear
> > > benefit the larger they could be.
> > > 
> > > I.e. just calling something similar to xe_vm_invalidate_xxx over
> > > the
> > > whole range, which will just skip subranges that are not populated.
> > > 
> > 
> > As stated, I think eventually removing the SVM range is a good
> > longterm
> > goal.
> > 
> > I almost coded that in this initial series but ran into a number of
> > issues which make this complex and to get something working in
> > simplest
> > way possible to enable further test development, start constructive
> > upstream discussions which appear to be happening, UMDs / application
> > development, and other up[er layer KMD development I stuck with this
> > approach.
> > 
> > I think for any solution which requires a SVM range (fwiw both AMD
> > and
> > Nvidia have a similar concept), attaching the ranges to a larger
> > notifier makes sense and is better than 1 notifier per range.
> > 
> > Issues with removing a SVM range:
> > 
> > - Xe bind code stores invalidation / present state in VMA, this would
> >   need to be moved to the radix tree. I have Jira open for that work
> >   which I believe other developers are going to own.
> > - Where would the dma mapping / device pages be stored?
> > 	- In the radix tree? What if ATS is enabled? We don't have a
> > 	  driver owned radix tree. How do we reasonably connect a
> > driver
> > 	  owned radix to a common GPUSVM layer?
> > 	- In the notifier? What is the notifier is sparsely
> > populated?
> > 	  We would be wasting huge amounts of memory. What is the
> > 	  notifier is configured to span the entire virtual address
> > 	  space?
> > - How does the garbage collector work? We can't allocate memory in
> > the
> >   notifier so we don't anything to add to the garbage collector. We
> >   can't directly modify page tables given you need lock in the path
> > of
> >   reclaim.
> > - How do we deal with fault storms (e.g. tons of faults hitting the
> > same
> >   SVM range in a row)? Without a SVM range no every to know if
> > mapping
> >   is valid and GPU page handler can be short circuited.
> > - Do we have notifier seqno for every PTE?
> > 
> > I feel like I'm missing a few and likely more issues would arrise
> > when
> > implementing this too.
> > 
> > To be clear, I'm saying we shouldn't try to do this and all of the
> > above
> > issues are likely workable but doing all this upfront is akin running
> > before we can walk. I rather solve of fundamental locking issues
> > first,
> > have robust testing in place + passing and UMDs / apps running before
> > trying to rework this one. Performance numbers for this would also be
> > helpful too.
> 
> 
> 
> 
> 
> 
> > 
> > Matt
> > 
> > > /Thomas
> > > 
> > > > 
> > > > > > + * - Ranges: Represent memory ranges mapped in a DRM device
> > > > > > and
> > > > > > managed
> > > > > > + *	     by GPU SVM. They are sized based on an array of
> > > > > > chunk
> > > > > > sizes, which
> > > > > > + *	     is a GPU SVM initialization parameter, and the
> > > > > > CPU
> > > > > > address space.
> > > > > > + *	     Upon GPU fault, the largest aligned chunk that
> > > > > > fits
> > > > > > within the
> > > > > > + *	     faulting CPU address space is chosen for the
> > > > > > range
> > > > > > size. Ranges are
> > > > > > + *	     expected to be dynamically allocated on GPU
> > > > > > fault
> > > > > > and
> > > > > > removed on an
> > > > > > + *	     MMU notifier UNMAP event. As mentioned above,
> > > > > > ranges
> > > > > > are tracked in
> > > > > > + *	     a notifier's Red-Black tree.
> > > > > 
> > > > > How do ranges and chunks map to
> > > > >  
> > > > > a) Prefaulting granularity
> > > > > b) Migration granularity?
> > > > > 
> > > > > > + * - Operations: Define the interface for driver-specific
> > > > > > SVM
> > > > > > operations such as
> > > > > > + *		 allocation, page collection, migration,
> > > > > > invalidations, and VRAM
> > > > > > + *		 release.
> > > > > > + *
> > > > > > + * This layer provides interfaces for allocating, mapping,
> > > > > > migrating, and
> > > > > > + * releasing memory ranges between the CPU and GPU. It
> > > > > > handles
> > > > > > all
> > > > > > core memory
> > > > > > + * management interactions (DMA mapping, HMM, and migration)
> > > > > > and
> > > > > > provides
> > > > > > + * driver-specific virtual functions (vfuncs). This
> > > > > > infrastructure
> > > > > > is sufficient
> > > > > > + * to build the expected driver components for an SVM
> > > > > > implementation
> > > > > > as detailed
> > > > > > + * below.
> > > > > > + *
> > > > > > + * Expected Driver Components:
> > > > > > + * - GPU page fault handler: Used to create ranges and
> > > > > > notifiers
> > > > > > based on the
> > > > > > + *			     fault address, optionally
> > > > > > migrate
> > > > > > the
> > > > > > range to
> > > > > > + *			     VRAM, and create GPU bindings.
> > > > > > + * - Garbage collector: Used to destroy GPU bindings for
> > > > > > ranges.
> > > > > > Ranges are
> > > > > > + *			expected to be added to the garbage
> > > > > > collector upon
> > > > > > + *			MMU_NOTIFY_UNMAP event.
> > > > > > + */
> > > > > > +
> > > > > > +/**
> > > > > > + * DOC: Locking
> > > > > > + *
> > > > > > + * GPU SVM handles locking for core MM interactions, i.e.,
> > > > > > it
> > > > > > locks/unlocks the
> > > > > > + * mmap lock as needed. Alternatively, if the driver prefers
> > > > > > to
> > > > > > handle the mmap
> > > > > > + * lock itself, a 'locked' argument is provided to the
> > > > > > functions
> > > > > > that require
> > > > > > + * the mmap lock. This option may be useful for drivers that
> > > > > > need to
> > > > > > call into
> > > > > > + * GPU SVM while also holding a dma-resv lock, thus
> > > > > > preventing
> > > > > > locking
> > > > > > + * inversions between the mmap and dma-resv locks.
> > > > > > + *
> > > > > > + * GPU SVM introduces a global notifier lock, which
> > > > > > safeguards
> > > > > > the
> > > > > > notifier's
> > > > > > + * range RB tree and list, as well as the range's DMA
> > > > > > mappings
> > > > > > and
> > > > > > sequence
> > > > > > + * number. GPU SVM manages all necessary locking and
> > > > > > unlocking
> > > > > > operations,
> > > > > > + * except for the recheck of the range's sequence number
> > > > > > + * (mmu_interval_read_retry) when the driver is committing
> > > > > > GPU
> > > > > > bindings. This
> > > > > > + * lock corresponds to the 'driver->update' lock mentioned
> > > > > > in
> > > > > > the
> > > > > > HMM
> > > > > > + * documentation (TODO: Link). Future revisions may
> > > > > > transition
> > > > > > from
> > > > > > a GPU SVM
> > > > > > + * global lock to a per-notifier lock if finer-grained
> > > > > > locking
> > > > > > is
> > > > > > deemed
> > > > > > + * necessary.
> > > > > > + *
> > > > > > + * In addition to the locking mentioned above, the driver
> > > > > > should
> > > > > > implement a
> > > > > > + * lock to safeguard core GPU SVM function calls that modify
> > > > > > state,
> > > > > > such as
> > > > > > + * drm_gpusvm_range_find_or_insert and
> > > > > > drm_gpusvm_range_remove.
> > > > > > Alternatively,
> > > > > > + * these core functions can be called within a single kernel
> > > > > > thread,
> > > > > > for
> > > > > > + * instance, using an ordered work queue. This lock is
> > > > > > denoted
> > > > > > as
> > > > > > + * 'driver_svm_lock' in code examples.
> > > > > > + */
> > > > > > +
> > > > > > +/**
> > > > > > + * DOC: Migration
> > > > > > + *
> > > > > > + * The migration support is quite simple, allowing migration
> > > > > > between
> > > > > > SRAM and
> > > > > > + * VRAM at the range granularity. For example, GPU SVM
> > > > > > currently
> > > > > > does not
> > > > > > + * support mixing SRAM and VRAM pages within a range. This
> > > > > > means
> > > > > > that upon GPU
> > > > > > + * fault, the entire range can be migrated to VRAM, and upon
> > > > > > CPU
> > > > > > fault, the
> > > > > > + * entire range is migrated to SRAM.
> > > > > > + *
> > > > > > + * The reasoning for only supporting range granularity is as
> > > > > > follows: it
> > > > > > + * simplifies the implementation, and range sizes are
> > > > > > driver-
> > > > > > defined
> > > > > > and should
> > > > > > + * be relatively small.
> > > > > > + */
> > > > > > +
> > > > > > +/**
> > > > > > + * DOC: Partial Unmapping of Ranges
> > > > > > + *
> > > > > > + * Partial unmapping of ranges (e.g., 1M out of 2M is
> > > > > > unmapped
> > > > > > by
> > > > > > CPU resulting
> > > > > > + * in MMU_NOTIFY_UNMAP event) presents several challenges,
> > > > > > with
> > > > > > the
> > > > > > main one
> > > > > > + * being that a subset of the range still has CPU and GPU
> > > > > > mappings.
> > > > > > If the
> > > > > > + * backing store for the range is in VRAM, a subset of the
> > > > > > backing
> > > > > > store has
> > > > > > + * references. One option would be to split the range and
> > > > > > VRAM
> > > > > > backing store,
> > > > > > + * but the implementation for this would be quite
> > > > > > complicated.
> > > > > > Given
> > > > > > that
> > > > > > + * partial unmappings are rare and driver-defined range
> > > > > > sizes
> > > > > > are
> > > > > > relatively
> > > > > > + * small, GPU SVM does not support splitting of ranges.
> > > > > > + *
> > > > > > + * With no support for range splitting, upon partial
> > > > > > unmapping
> > > > > > of a
> > > > > > range, the
> > > > > > + * driver is expected to invalidate and destroy the entire
> > > > > > range. If
> > > > > > the range
> > > > > > + * has VRAM as its backing, the driver is also expected to
> > > > > > migrate
> > > > > > any remaining
> > > > > > + * pages back to SRAM.
> > > > > 
> > > > > So what happens if we get a one-page invalidation, say
> > > > > protection
> > > > > change event, or NUMA accounting event, in the middle of a
> > > > > range?
> > > > > Can
> > > > > we unmap just that single gpu pte covering that range, that is,
> > > > > how
> > > > > do
> > > > > the ranges map to invalidation granularity? Does this differ
> > > > > between
> > > > > igfx an dgfx?
> > > > 
> > > > Well the idea of chunks is ranges should be 1 GPU page (the chunk
> > > > array
> > > > in Xe is 4k, 64k, and 2M). The design is flexible enough that
> > > > doesn't
> > > > have to true but optimized for the thinking each range is most
> > > > likely
> > > > 1
> > > > GPU page. If this isn't true, then all GPU pages in the range are
> > > > invalidated which isn't ideal but keeps it simple which IMO far
> > > > out
> > > > weighs the potential benefits. In theory a driver could implement
> > > > spliting / partial invalidaions too with a couple of updates to
> > > > GPUSVM
> > > > but would likely largely be a driver implementation rather than
> > > > GPUSVM.
> > > > 
> > > > No difference between igfx an dgfx.
> > > > 
> > > > You bring up a good point about protection changes, I likely
> > > > haven't
> > > > fully gotten that part of implementation correct either. I can
> > > > add
> > > > this
> > > > to my TODO list and also update my IGTs to do things like this.
> > > > 
> > > > Matt
> > > > 
> > > > > 
> > > > > Thanks,
> > > > > Thomas
> > > > > 
> > > > > 
> > > > > 
> > > > > 
> > > > > > + */
> > > > > > +
> > > > > > +/**
> > > > > > + * DOC: Examples
> > > > > > + *
> > > > > > + * This section provides two examples of how to build the
> > > > > > expected
> > > > > > driver
> > > > > > + * components: the GPU page fault handler and the garbage
> > > > > > collector.
> > > > > > A third
> > > > > > + * example demonstrates a sample invalidation driver vfunc.
> > > > > > + *
> > > > > > + * The generic code provided does not include logic for
> > > > > > complex
> > > > > > migration
> > > > > > + * policies, optimized invalidations, or other potentially
> > > > > > required
> > > > > > driver
> > > > > > + * locking (e.g., DMA-resv locks).
> > > > > > + *
> > > > > > + * 1) GPU page fault handler
> > > > > > + *
> > > > > > + *	int driver_bind_range(struct drm_gpusvm *gpusvm,
> > > > > > struct
> > > > > > drm_gpusvm_range *range)
> > > > > > + *	{
> > > > > > + *		int err = 0;
> > > > > > + *
> > > > > > + *		driver_alloc_and_setup_memory_for_bind(gpusvm, range);
> > > > > > + *
> > > > > > + *		drm_gpusvm_notifier_lock(gpusvm);
> > > > > > + *		if (drm_gpusvm_range_pages_valid(range))
> > > > > > + *			driver_commit_bind(gpusvm, range);
> > > > > > + *		else
> > > > > > + *			err = -EAGAIN;
> > > > > > + *		drm_gpusvm_notifier_unlock(gpusvm);
> > > > > > + *
> > > > > > + *		return err;
> > > > > > + *	}
> > > > > > + *
> > > > > > + *	int driver_gpu_fault(struct drm_gpusvm *gpusvm, u64 fault_addr,
> > > > > > + *			     u64 gpuva_start, u64 gpuva_end)
> > > > > > + *	{
> > > > > > + *		struct drm_gpusvm_ctx ctx = {};
> > > > > > + *		int err;
> > > > > > + *
> > > > > > + *		driver_svm_lock();
> > > > > > + *	retry:
> > > > > > + *		// Always process UNMAPs first so view of GPU SVM ranges is current
> > > > > > + *		driver_garbage_collector(gpusvm);
> > > > > > + *
> > > > > > + *		range = drm_gpusvm_range_find_or_insert(gpusvm, fault_addr,
> > > > > > + *							gpuva_start, gpuva_end,
> > > > > > + *							&ctx);
> > > > > > + *		if (IS_ERR(range)) {
> > > > > > + *			err = PTR_ERR(range);
> > > > > > + *			goto unlock;
> > > > > > + *		}
> > > > > > + *
> > > > > > + *		if (driver_migration_policy(range)) {
> > > > > > + *			bo = driver_alloc_bo();
> > > > > > + *			err = drm_gpusvm_migrate_to_vram(gpusvm, range, bo, &ctx);
> > > > > > + *			if (err)	// CPU mappings may have changed
> > > > > > + *				goto retry;
> > > > > > + *		}
> > > > > > + *
> > > > > > + *		err = drm_gpusvm_range_get_pages(gpusvm, range, &ctx);
> > > > > > + *		if (err == -EFAULT || err == -EPERM)	// CPU mappings changed
> > > > > > + *			goto retry;
> > > > > > + *		else if (err)
> > > > > > + *			goto unlock;
> > > > > > + *
> > > > > > + *		err = driver_bind_range(gpusvm, range);
> > > > > > + *		if (err == -EAGAIN)	// CPU mappings changed
> > > > > > + *			goto retry
> > > > > > + *
> > > > > > + *	unlock:
> > > > > > + *		driver_svm_unlock();
> > > > > > + *		return err;
> > > > > > + *	}
> > > > > > + *
> > > > > > + * 2) Garbage Collector.
> > > > > > + *
> > > > > > + *	void __driver_garbage_collector(struct drm_gpusvm
> > > > > > *gpusvm,
> > > > > > + *					struct
> > > > > > drm_gpusvm_range
> > > > > > *range)
> > > > > > + *	{
> > > > > > + *		struct drm_gpusvm_ctx ctx = {};
> > > > > > + *
> > > > > > + *		assert_driver_svm_locked(gpusvm);
> > > > > > + *
> > > > > > + *		// Partial unmap, migrate any remaining VRAM
> > > > > > pages
> > > > > > back to SRAM
> > > > > > + *		if (range->flags.partial_unmap)
> > > > > > + *			drm_gpusvm_migrate_to_sram(gpusvm,
> > > > > > range,
> > > > > > &ctx);
> > > > > > + *
> > > > > > + *		driver_unbind_range(range);
> > > > > > + *		drm_gpusvm_range_remove(gpusvm, range);
> > > > > > + *	}
> > > > > > + *
> > > > > > + *	void driver_garbage_collector(struct drm_gpusvm
> > > > > > *gpusvm)
> > > > > > + *	{
> > > > > > + *		assert_driver_svm_locked(gpusvm);
> > > > > > + *
> > > > > > + *		for_each_range_in_garbage_collector(gpusvm,
> > > > > > range)
> > > > > > + *			__driver_garbage_collector(gpusvm,
> > > > > > range);
> > > > > > + *	}
> > > > > > + *
> > > > > > + * 3) Invalidation driver vfunc.
> > > > > > + *
> > > > > > + *	void driver_invalidation(struct drm_gpusvm *gpusvm,
> > > > > > + *				 struct drm_gpusvm_notifier
> > > > > > *notifier,
> > > > > > + *				 const struct
> > > > > > mmu_notifier_range
> > > > > > *mmu_range)
> > > > > > + *	{
> > > > > > + *		struct drm_gpusvm_ctx ctx = { .in_notifier =
> > > > > > true,
> > > > > > };
> > > > > > + *		struct drm_gpusvm_range *range = NULL;
> > > > > > + *
> > > > > > + *		driver_invalidate_device_tlb(gpusvm,
> > > > > > mmu_range-
> > > > > > > start, mmu_range->end);
> > > > > > + *
> > > > > > + *		drm_gpusvm_for_each_range(range, notifier,
> > > > > > mmu_range->start,
> > > > > > + *					  mmu_range->end) {
> > > > > > + *			drm_gpusvm_range_unmap_pages(gpusvm,
> > > > > > range,
> > > > > > &ctx);
> > > > > > + *
> > > > > > + *			if (mmu_range->event !=
> > > > > > MMU_NOTIFY_UNMAP)
> > > > > > + *				continue;
> > > > > > + *
> > > > > > + *			drm_gpusvm_range_set_unmapped(range,
> > > > > > mmu_range);
> > > > > > + *			driver_garbage_collector_add(gpusvm,
> > > > > > range);
> > > > > > + *		}
> > > > > > + *	}
> > > > > > + */
> > > > > > +
> > > > > > +#define DRM_GPUSVM_RANGE_START(_range)	((_range)->va.start)
> > > > > > +#define DRM_GPUSVM_RANGE_END(_range)	((_range)->va.end -
> > > > > > 1)
> > > > > > +INTERVAL_TREE_DEFINE(struct drm_gpusvm_range, rb.node, u64,
> > > > > > rb.__subtree_last,
> > > > > > +		     DRM_GPUSVM_RANGE_START,
> > > > > > DRM_GPUSVM_RANGE_END,
> > > > > > +		     static __maybe_unused, range);
> > > > > > +
> > > > > > +#define
> > > > > > DRM_GPUSVM_NOTIFIER_START(_notifier)	((_notifier)-
> > > > > > > interval.start)
> > > > > > +#define
> > > > > > DRM_GPUSVM_NOTIFIER_END(_notifier)	((_notifier)-
> > > > > > > interval.end - 1)
> > > > > > +INTERVAL_TREE_DEFINE(struct drm_gpusvm_notifier, rb.node,
> > > > > > u64,
> > > > > > +		     rb.__subtree_last,
> > > > > > DRM_GPUSVM_NOTIFIER_START,
> > > > > > +		     DRM_GPUSVM_NOTIFIER_END, static
> > > > > > __maybe_unused,
> > > > > > notifier);
> > > > > > +
> > > > > > +/**
> > > > > > + * npages_in_range() - Calculate the number of pages in a
> > > > > > given
> > > > > > range
> > > > > > + * @start__: The start address of the range
> > > > > > + * @end__: The end address of the range
> > > > > > + *
> > > > > > + * This macro calculates the number of pages in a given
> > > > > > memory
> > > > > > range,
> > > > > > + * specified by the start and end addresses. It divides the
> > > > > > difference
> > > > > > + * between the end and start addresses by the page size
> > > > > > (PAGE_SIZE)
> > > > > > to
> > > > > > + * determine the number of pages in the range.
> > > > > > + *
> > > > > > + * Return: The number of pages in the specified range.
> > > > > > + */
> > > > > > +#define npages_in_range(start__, end__)	\
> > > > > > +	(((end__) - (start__)) >> PAGE_SHIFT)
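A quick worked example of the macro above, assuming 4 KiB pages
(PAGE_SHIFT == 12); the base address is made up purely for illustration:

	unsigned long start = 0x100000000UL;
	unsigned long end = start + SZ_2M;
	unsigned long npages = npages_in_range(start, end); /* SZ_2M >> 12 == 512 */
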
> > > > > > +
> > > > > > +/**
> > > > > > + * struct drm_gpusvm_zdd - GPU SVM zone device data
> > > > > > + *
> > > > > > + * @refcount: Reference count for the zdd
> > > > > > + * @destroy_work: Work structure for asynchronous zdd
> > > > > > destruction
> > > > > > + * @range: Pointer to the GPU SVM range
> > > > > > + * @vram_allocation: Driver-private pointer to the VRAM
> > > > > > allocation
> > > > > > + *
> > > > > > + * This structure serves as a generic wrapper installed in
> > > > > > + * page->zone_device_data. It provides infrastructure for
> > > > > > looking up
> > > > > > a range
> > > > > > + * upon CPU page fault and asynchronously releasing VRAM
> > > > > > once
> > > > > > the
> > > > > > CPU has no
> > > > > > + * page references. Asynchronous release is useful because
> > > > > > CPU
> > > > > > page
> > > > > > references
> > > > > > + * can be dropped in IRQ contexts, while releasing VRAM
> > > > > > likely
> > > > > > requires sleeping
> > > > > > + * locks.
> > > > > > + */
> > > > > > +struct drm_gpusvm_zdd {
> > > > > > +	struct kref refcount;
> > > > > > +	struct work_struct destroy_work;
> > > > > > +	struct drm_gpusvm_range *range;
> > > > > > +	void *vram_allocation;
> > > > > > +};
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_zdd_destroy_work_func - Work function for
> > > > > > destroying a
> > > > > > zdd
> > > > > > + * @w: Pointer to the work_struct
> > > > > > + *
> > > > > > + * This function releases VRAM, puts GPU SVM range, and
> > > > > > frees
> > > > > > zdd.
> > > > > > + */
> > > > > > +static void drm_gpusvm_zdd_destroy_work_func(struct
> > > > > > work_struct
> > > > > > *w)
> > > > > > +{
> > > > > > +	struct drm_gpusvm_zdd *zdd =
> > > > > > +		container_of(w, struct drm_gpusvm_zdd,
> > > > > > destroy_work);
> > > > > > +	struct drm_gpusvm_range *range = zdd->range;
> > > > > > +	struct drm_gpusvm *gpusvm = range->gpusvm;
> > > > > > +
> > > > > > +	if (gpusvm->ops->vram_release && zdd-
> > > > > > >vram_allocation)
> > > > > > +		gpusvm->ops->vram_release(zdd-
> > > > > > >vram_allocation);
> > > > > > +	drm_gpusvm_range_put(range);
> > > > > > +	kfree(zdd);
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_zdd_alloc - Allocate a zdd structure.
> > > > > > + * @range: Pointer to the GPU SVM range.
> > > > > > + *
> > > > > > + * This function allocates and initializes a new zdd
> > > > > > structure.
> > > > > > It
> > > > > > sets up the
> > > > > > + * reference count, initializes the destroy work, and links
> > > > > > the
> > > > > > provided GPU SVM
> > > > > > + * range.
> > > > > > + *
> > > > > > + * Returns:
> > > > > > + * Pointer to the allocated zdd on success, ERR_PTR() on
> > > > > > failure.
> > > > > > + */
> > > > > > +static struct drm_gpusvm_zdd *
> > > > > > +drm_gpusvm_zdd_alloc(struct drm_gpusvm_range *range)
> > > > > > +{
> > > > > > +	struct drm_gpusvm_zdd *zdd;
> > > > > > +
> > > > > > +	zdd = kmalloc(sizeof(*zdd), GFP_KERNEL);
> > > > > > +	if (!zdd)
> > > > > > +		return NULL;
> > > > > > +
> > > > > > +	kref_init(&zdd->refcount);
> > > > > > +	INIT_WORK(&zdd->destroy_work,
> > > > > > drm_gpusvm_zdd_destroy_work_func);
> > > > > > +	zdd->range = drm_gpusvm_range_get(range);
> > > > > > +	zdd->vram_allocation = NULL;
> > > > > > +
> > > > > > +	return zdd;
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_zdd_get - Get a reference to a zdd structure.
> > > > > > + * @zdd: Pointer to the zdd structure.
> > > > > > + *
> > > > > > + * This function increments the reference count of the
> > > > > > provided
> > > > > > zdd
> > > > > > structure.
> > > > > > + *
> > > > > > + * Returns: Pointer to the zdd structure.
> > > > > > + */
> > > > > > +static struct drm_gpusvm_zdd *drm_gpusvm_zdd_get(struct
> > > > > > drm_gpusvm_zdd *zdd)
> > > > > > +{
> > > > > > +	kref_get(&zdd->refcount);
> > > > > > +	return zdd;
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_zdd_destroy - Destroy a zdd structure.
> > > > > > + * @ref: Pointer to the reference count structure.
> > > > > > + *
> > > > > > + * This function queues the destroy_work of the zdd for
> > > > > > asynchronous
> > > > > > destruction.
> > > > > > + */
> > > > > > +static void drm_gpusvm_zdd_destroy(struct kref *ref)
> > > > > > +{
> > > > > > +	struct drm_gpusvm_zdd *zdd =
> > > > > > +		container_of(ref, struct drm_gpusvm_zdd,
> > > > > > refcount);
> > > > > > +	struct drm_gpusvm *gpusvm = zdd->range->gpusvm;
> > > > > > +
> > > > > > +	queue_work(gpusvm->zdd_wq, &zdd->destroy_work);
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_zdd_put - Put a zdd reference.
> > > > > > + * @zdd: Pointer to the zdd structure.
> > > > > > + *
> > > > > > + * This function decrements the reference count of the
> > > > > > provided
> > > > > > zdd
> > > > > > structure
> > > > > > + * and schedules its destruction if the count drops to zero.
> > > > > > + */
> > > > > > +static void drm_gpusvm_zdd_put(struct drm_gpusvm_zdd *zdd)
> > > > > > +{
> > > > > > +	kref_put(&zdd->refcount, drm_gpusvm_zdd_destroy);
> > > > > > +}
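To make the zdd lifecycle above concrete, here is a minimal sketch of a
driver's device-private CPU fault handler looking up the range through the
zdd; the handler and migration helper names are hypothetical, only
page->zone_device_data and the zdd/range fields come from the code above:

	static vm_fault_t driver_devmem_migrate_to_ram(struct vm_fault *vmf)
	{
		struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
		struct drm_gpusvm_range *range = zdd->range;

		/* Driver policy: move the faulting range back to SRAM */
		return driver_migrate_range_to_sram(range->gpusvm, range, vmf);
	}
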
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_range_find - Find GPU SVM range from GPU SVM
> > > > > > notifier
> > > > > > + * @notifier: Pointer to the GPU SVM notifier structure.
> > > > > > + * @start: Start address of the range
> > > > > > + * @end: End address of the range
> > > > > > + *
> > > > > > + * Return: A pointer to the drm_gpusvm_range if found or
> > > > > > NULL
> > > > > > + */
> > > > > > +struct drm_gpusvm_range *
> > > > > > +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier,
> > > > > > u64
> > > > > > start, u64 end)
> > > > > > +{
> > > > > > +	return range_iter_first(&notifier->root, start, end
> > > > > > -
> > > > > > 1);
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_for_each_range_safe - Safely iterate over GPU
> > > > > > SVM
> > > > > > ranges in a notifier
> > > > > > + * @range__: Iterator variable for the ranges
> > > > > > + * @next__: Iterator variable for the ranges temporary storage
> > > > > > + * @notifier__: Pointer to the GPU SVM notifier
> > > > > > + * @start__: Start address of the range
> > > > > > + * @end__: End address of the range
> > > > > > + *
> > > > > > + * This macro is used to iterate over GPU SVM ranges in a
> > > > > > notifier
> > > > > > while
> > > > > > + * removing ranges from it.
> > > > > > + */
> > > > > > +#define drm_gpusvm_for_each_range_safe(range__, next__,
> > > > > > notifier__,
> > > > > > start__, end__)	\
> > > > > > +	for ((range__) = drm_gpusvm_range_find((notifier__),
> > > > > > (start__), (end__)),	\
> > > > > > +	     (next__) =
> > > > > > __drm_gpusvm_range_next(range__);			
> > > > > > 	\
> > > > > > +	     (range__) && (range__->va.start <
> > > > > > (end__));				\
> > > > > > +	     (range__) = (next__), (next__) =
> > > > > > __drm_gpusvm_range_next(range__))
> > > > > > +
> > > > > > +/**
> > > > > > + * __drm_gpusvm_notifier_next - get the next
> > > > > > drm_gpusvm_notifier
> > > > > > in
> > > > > > the list
> > > > > > + * @notifier: a pointer to the current drm_gpusvm_notifier
> > > > > > + *
> > > > > > + * Return: A pointer to the next drm_gpusvm_notifier if
> > > > > > available,
> > > > > > or NULL if
> > > > > > + *         the current notifier is the last one or if the
> > > > > > input
> > > > > > notifier is
> > > > > > + *         NULL.
> > > > > > + */
> > > > > > +static struct drm_gpusvm_notifier *
> > > > > > +__drm_gpusvm_notifier_next(struct drm_gpusvm_notifier
> > > > > > *notifier)
> > > > > > +{
> > > > > > +	if (notifier && !list_is_last(&notifier->rb.entry,
> > > > > > +				      &notifier->gpusvm-
> > > > > > > notifier_list))
> > > > > > +		return list_next_entry(notifier, rb.entry);
> > > > > > +
> > > > > > +	return NULL;
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_for_each_notifier - Iterate over GPU SVM
> > > > > > notifiers
> > > > > > in
> > > > > > a gpusvm
> > > > > > + * @notifier__: Iterator variable for the notifiers
> > > > > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > > > > + * @start__: Start address of the notifier
> > > > > > + * @end__: End address of the notifier
> > > > > > + *
> > > > > > + * This macro is used to iterate over GPU SVM notifiers in a
> > > > > > gpusvm.
> > > > > > + */
> > > > > > +#define drm_gpusvm_for_each_notifier(notifier__, gpusvm__,
> > > > > > start__,
> > > > > > end__)		\
> > > > > > +	for ((notifier__) = notifier_iter_first(&(gpusvm__)-
> > > > > > > root,
> > > > > > (start__), (end__) - 1);	\
> > > > > > +	     (notifier__) && (notifier__->interval.start <
> > > > > > (end__));			\
> > > > > > +	     (notifier__) =
> > > > > > __drm_gpusvm_notifier_next(notifier__))
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_for_each_notifier_safe - Safely iterate over
> > > > > > GPU
> > > > > > SVM
> > > > > > notifiers in a gpusvm
> > > > > > + * @notifier__: Iterator variable for the notifiers
> > > > > > + * @next__: Iterator variable for the notifiers temporary storage
> > > > > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > > > > + * @start__: Start address of the notifier
> > > > > > + * @end__: End address of the notifier
> > > > > > + *
> > > > > > + * This macro is used to iterate over GPU SVM notifiers in a
> > > > > > gpusvm
> > > > > > while
> > > > > > + * removing notifiers from it.
> > > > > > + */
> > > > > > +#define drm_gpusvm_for_each_notifier_safe(notifier__,
> > > > > > next__,
> > > > > > gpusvm__, start__, end__)	\
> > > > > > +	for ((notifier__) = notifier_iter_first(&(gpusvm__)-
> > > > > > > root,
> > > > > > (start__), (end__) - 1),	\
> > > > > > +	     (next__) =
> > > > > > __drm_gpusvm_notifier_next(notifier__);			
> > > > > > 	\
> > > > > > +	     (notifier__) && (notifier__->interval.start <
> > > > > > (end__));			\
> > > > > > +	     (notifier__) = (next__), (next__) =
> > > > > > __drm_gpusvm_notifier_next(notifier__))
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_notifier_invalidate - Invalidate a GPU SVM
> > > > > > notifier.
> > > > > > + * @mni: Pointer to the mmu_interval_notifier structure.
> > > > > > + * @mmu_range: Pointer to the mmu_notifier_range structure.
> > > > > > + * @cur_seq: Current sequence number.
> > > > > > + *
> > > > > > + * This function serves as a generic MMU notifier for GPU
> > > > > > SVM.
> > > > > > It
> > > > > > sets the MMU
> > > > > > + * notifier sequence number and calls the driver invalidate
> > > > > > vfunc
> > > > > > under
> > > > > > + * gpusvm->notifier_lock.
> > > > > > + *
> > > > > > + * Returns:
> > > > > > + * true if the operation succeeds, false otherwise.
> > > > > > + */
> > > > > > +static bool
> > > > > > +drm_gpusvm_notifier_invalidate(struct mmu_interval_notifier
> > > > > > *mni,
> > > > > > +			       const struct
> > > > > > mmu_notifier_range
> > > > > > *mmu_range,
> > > > > > +			       unsigned long cur_seq)
> > > > > > +{
> > > > > > +	struct drm_gpusvm_notifier *notifier =
> > > > > > +		container_of(mni, typeof(*notifier),
> > > > > > notifier);
> > > > > > +	struct drm_gpusvm *gpusvm = notifier->gpusvm;
> > > > > > +
> > > > > > +	if (!mmu_notifier_range_blockable(mmu_range))
> > > > > > +		return false;
> > > > > > +
> > > > > > +	down_write(&gpusvm->notifier_lock);
> > > > > > +	mmu_interval_set_seq(mni, cur_seq);
> > > > > > +	gpusvm->ops->invalidate(gpusvm, notifier,
> > > > > > mmu_range);
> > > > > > +	up_write(&gpusvm->notifier_lock);
> > > > > > +
> > > > > > +	return true;
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_notifier_ops - MMU interval notifier
> > > > > > operations
> > > > > > for
> > > > > > GPU SVM
> > > > > > + */
> > > > > > +static const struct mmu_interval_notifier_ops
> > > > > > drm_gpusvm_notifier_ops = {
> > > > > > +	.invalidate = drm_gpusvm_notifier_invalidate,
> > > > > > +};
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_init - Initialize the GPU SVM.
> > > > > > + * @gpusvm: Pointer to the GPU SVM structure.
> > > > > > + * @name: Name of the GPU SVM.
> > > > > > + * @drm: Pointer to the DRM device structure.
> > > > > > + * @mm: Pointer to the mm_struct for the address space.
> > > > > > + * @device_private_page_owner: Device private pages owner.
> > > > > > + * @mm_start: Start address of GPU SVM.
> > > > > > + * @mm_range: Range of the GPU SVM.
> > > > > > + * @notifier_size: Size of individual notifiers.
> > > > > > + * @ops: Pointer to the operations structure for GPU SVM.
> > > > > > + * @chunk_sizes: Pointer to the array of chunk sizes used in
> > > > > > range
> > > > > > allocation.
> > > > > > + *               Entries should be powers of 2 in descending
> > > > > > order
> > > > > > with last
> > > > > > + *               entry being SZ_4K.
> > > > > > + * @num_chunks: Number of chunks.
> > > > > > + *
> > > > > > + * This function initializes the GPU SVM.
> > > > > > + *
> > > > > > + * Returns:
> > > > > > + * 0 on success, a negative error code on failure.
> > > > > > + */
> > > > > > +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> > > > > > +		    const char *name, struct drm_device
> > > > > > *drm,
> > > > > > +		    struct mm_struct *mm, void
> > > > > > *device_private_page_owner,
> > > > > > +		    u64 mm_start, u64 mm_range, u64
> > > > > > notifier_size,
> > > > > > +		    const struct drm_gpusvm_ops *ops,
> > > > > > +		    const u64 *chunk_sizes, int num_chunks)
> > > > > > +{
> > > > > > +	if (!ops->invalidate || !num_chunks)
> > > > > > +		return -EINVAL;
> > > > > > +
> > > > > > +	gpusvm->name = name;
> > > > > > +	gpusvm->drm = drm;
> > > > > > +	gpusvm->mm = mm;
> > > > > > +	gpusvm->device_private_page_owner =
> > > > > > device_private_page_owner;
> > > > > > +	gpusvm->mm_start = mm_start;
> > > > > > +	gpusvm->mm_range = mm_range;
> > > > > > +	gpusvm->notifier_size = notifier_size;
> > > > > > +	gpusvm->ops = ops;
> > > > > > +	gpusvm->chunk_sizes = chunk_sizes;
> > > > > > +	gpusvm->num_chunks = num_chunks;
> > > > > > +	gpusvm->zdd_wq = system_wq;
> > > > > > +
> > > > > > +	mmgrab(mm);
> > > > > > +	gpusvm->root = RB_ROOT_CACHED;
> > > > > > +	INIT_LIST_HEAD(&gpusvm->notifier_list);
> > > > > > +
> > > > > > +	init_rwsem(&gpusvm->notifier_lock);
> > > > > > +
> > > > > > +	fs_reclaim_acquire(GFP_KERNEL);
> > > > > > +	might_lock(&gpusvm->notifier_lock);
> > > > > > +	fs_reclaim_release(GFP_KERNEL);
> > > > > > +
> > > > > > +	return 0;
> > > > > > +}
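For orientation, a minimal sketch of a driver-side init call, assuming
hypothetical ops, owner pointer, and policy values; per the kerneldoc the
chunk sizes must be powers of 2 in descending order ending in SZ_4K:

	static const u64 driver_chunk_sizes[] = { SZ_2M, SZ_64K, SZ_4K };

	err = drm_gpusvm_init(&vm->svm, "driver-svm", &vm->drm_dev,
			      current->mm, driver_owner,
			      0, 1ull << 47,	/* mm_start, mm_range */
			      SZ_512M,		/* notifier_size */
			      &driver_gpusvm_ops,
			      driver_chunk_sizes,
			      ARRAY_SIZE(driver_chunk_sizes));

On teardown the driver would pair this with drm_gpusvm_fini() once all
ranges have been unbound and removed.
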
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_notifier_find - Find GPU SVM notifier
> > > > > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > > > > + * @fault_addr__: Fault address
> > > > > > + *
> > > > > > + * This macro finds the GPU SVM notifier associated with the
> > > > > > fault
> > > > > > address.
> > > > > > + *
> > > > > > + * Returns:
> > > > > > + * Pointer to the GPU SVM notifier on success, NULL
> > > > > > otherwise.
> > > > > > + */
> > > > > > +#define drm_gpusvm_notifier_find(gpusvm__,
> > > > > > fault_addr__)	\
> > > > > > +	notifier_iter_first(&(gpusvm__)->root,
> > > > > > (fault_addr__),	\
> > > > > > +			    (fault_addr__ + 1))
> > > > > > +
> > > > > > +/**
> > > > > > + * to_drm_gpusvm_notifier - retrieve the container struct
> > > > > > for a
> > > > > > given rbtree node
> > > > > > + * @node__: a pointer to the rbtree node embedded within a
> > > > > > drm_gpusvm_notifier struct
> > > > > > + *
> > > > > > + * Return: A pointer to the containing drm_gpusvm_notifier
> > > > > > structure.
> > > > > > + */
> > > > > > +#define
> > > > > > to_drm_gpusvm_notifier(__node)				\
> > > > > > +	container_of((__node), struct drm_gpusvm_notifier,
> > > > > > rb.node)
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_notifier_insert - Insert GPU SVM notifier
> > > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > > > > + *
> > > > > > + * This function inserts the GPU SVM notifier into the GPU
> > > > > > SVM
> > > > > > RB
> > > > > > tree and list.
> > > > > > + */
> > > > > > +static void drm_gpusvm_notifier_insert(struct drm_gpusvm
> > > > > > *gpusvm,
> > > > > > +				       struct
> > > > > > drm_gpusvm_notifier
> > > > > > *notifier)
> > > > > > +{
> > > > > > +	struct rb_node *node;
> > > > > > +	struct list_head *head;
> > > > > > +
> > > > > > +	notifier_insert(notifier, &gpusvm->root);
> > > > > > +
> > > > > > +	node = rb_prev(&notifier->rb.node);
> > > > > > +	if (node)
> > > > > > +		head = &(to_drm_gpusvm_notifier(node))-
> > > > > > > rb.entry;
> > > > > > +	else
> > > > > > +		head = &gpusvm->notifier_list;
> > > > > > +
> > > > > > +	list_add(&notifier->rb.entry, head);
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_notifier_remove - Remove GPU SVM notifier
> > > > > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > > > > + * @notifier__: Pointer to the GPU SVM notifier structure
> > > > > > + *
> > > > > > + * This macro removes the GPU SVM notifier from the GPU SVM
> > > > > > RB
> > > > > > tree
> > > > > > and list.
> > > > > > + */
> > > > > > +#define drm_gpusvm_notifier_remove(gpusvm__,
> > > > > > notifier__)	\
> > > > > > +	notifier_remove((notifier__), &(gpusvm__)-
> > > > > > > root);	\
> > > > > > +	list_del(&(notifier__)->rb.entry)
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_fini - Finalize the GPU SVM.
> > > > > > + * @gpusvm: Pointer to the GPU SVM structure.
> > > > > > + *
> > > > > > + * This function finalizes the GPU SVM by cleaning up any
> > > > > > remaining
> > > > > > ranges and
> > > > > > + * notifiers, and dropping a reference to struct MM.
> > > > > > + */
> > > > > > +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm)
> > > > > > +{
> > > > > > +	struct drm_gpusvm_notifier *notifier, *next;
> > > > > > +
> > > > > > +	drm_gpusvm_for_each_notifier_safe(notifier, next,
> > > > > > gpusvm, 0,
> > > > > > LONG_MAX) {
> > > > > > +		struct drm_gpusvm_range *range, *__next;
> > > > > > +
> > > > > > +		/*
> > > > > > +		 * Remove notifier first to avoid racing
> > > > > > with
> > > > > > any
> > > > > > invalidation
> > > > > > +		 */
> > > > > > +		mmu_interval_notifier_remove(&notifier-
> > > > > > > notifier);
> > > > > > +		notifier->flags.removed = true;
> > > > > > +
> > > > > > +		drm_gpusvm_for_each_range_safe(range,
> > > > > > __next,
> > > > > > notifier, 0,
> > > > > > +					       LONG_MAX)
> > > > > > +			drm_gpusvm_range_remove(gpusvm,
> > > > > > range);
> > > > > > +	}
> > > > > > +
> > > > > > +	mmdrop(gpusvm->mm);
> > > > > > +	WARN_ON(!RB_EMPTY_ROOT(&gpusvm->root.rb_root));
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_notifier_alloc - Allocate GPU SVM notifier
> > > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > > + * @fault_addr: Fault address
> > > > > > + *
> > > > > > + * This function allocates and initializes the GPU SVM
> > > > > > notifier
> > > > > > structure.
> > > > > > + *
> > > > > > + * Returns:
> > > > > > + * Pointer to the allocated GPU SVM notifier on success,
> > > > > > ERR_PTR()
> > > > > > on failure.
> > > > > > + */
> > > > > > +static struct drm_gpusvm_notifier *
> > > > > > +drm_gpusvm_notifier_alloc(struct drm_gpusvm *gpusvm, u64
> > > > > > fault_addr)
> > > > > > +{
> > > > > > +	struct drm_gpusvm_notifier *notifier;
> > > > > > +
> > > > > > +	if (gpusvm->ops->notifier_alloc)
> > > > > > +		notifier = gpusvm->ops->notifier_alloc();
> > > > > > +	else
> > > > > > +		notifier = kzalloc(sizeof(*notifier),
> > > > > > GFP_KERNEL);
> > > > > > +
> > > > > > +	if (!notifier)
> > > > > > +		return ERR_PTR(-ENOMEM);
> > > > > > +
> > > > > > +	notifier->gpusvm = gpusvm;
> > > > > > +	notifier->interval.start = ALIGN_DOWN(fault_addr,
> > > > > > gpusvm-
> > > > > > > notifier_size);
> > > > > > +	notifier->interval.end = ALIGN(fault_addr + 1,
> > > > > > gpusvm-
> > > > > > > notifier_size);
> > > > > > +	INIT_LIST_HEAD(&notifier->rb.entry);
> > > > > > +	notifier->root = RB_ROOT_CACHED;
> > > > > > +	INIT_LIST_HEAD(&notifier->range_list);
> > > > > > +
> > > > > > +	return notifier;
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_notifier_free - Free GPU SVM notifier
> > > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > > > > + *
> > > > > > + * This function frees the GPU SVM notifier structure.
> > > > > > + */
> > > > > > +static void drm_gpusvm_notifier_free(struct drm_gpusvm
> > > > > > *gpusvm,
> > > > > > +				     struct
> > > > > > drm_gpusvm_notifier
> > > > > > *notifier)
> > > > > > +{
> > > > > > +	WARN_ON(!RB_EMPTY_ROOT(&notifier->root.rb_root));
> > > > > > +
> > > > > > +	if (gpusvm->ops->notifier_free)
> > > > > > +		gpusvm->ops->notifier_free(notifier);
> > > > > > +	else
> > > > > > +		kfree(notifier);
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * to_drm_gpusvm_range - retrieve the container struct for a
> > > > > > given
> > > > > > rbtree node
> > > > > > + * @node__: a pointer to the rbtree node embedded within a
> > > > > > drm_gpusvm_range struct
> > > > > > + *
> > > > > > + * Return: A pointer to the containing drm_gpusvm_range
> > > > > > structure.
> > > > > > + */
> > > > > > +#define to_drm_gpusvm_range(node__)	\
> > > > > > +	container_of((node__), struct drm_gpusvm_range,
> > > > > > rb.node)
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_range_insert - Insert GPU SVM range
> > > > > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > > > > + * @range: Pointer to the GPU SVM range structure
> > > > > > + *
> > > > > > + * This function inserts the GPU SVM range into the notifier
> > > > > > RB
> > > > > > tree
> > > > > > and list.
> > > > > > + */
> > > > > > +static void drm_gpusvm_range_insert(struct
> > > > > > drm_gpusvm_notifier
> > > > > > *notifier,
> > > > > > +				    struct drm_gpusvm_range
> > > > > > *range)
> > > > > > +{
> > > > > > +	struct rb_node *node;
> > > > > > +	struct list_head *head;
> > > > > > +
> > > > > > +	drm_gpusvm_notifier_lock(notifier->gpusvm);
> > > > > > +	range_insert(range, &notifier->root);
> > > > > > +
> > > > > > +	node = rb_prev(&range->rb.node);
> > > > > > +	if (node)
> > > > > > +		head = &(to_drm_gpusvm_range(node))-
> > > > > > >rb.entry;
> > > > > > +	else
> > > > > > +		head = &notifier->range_list;
> > > > > > +
> > > > > > +	list_add(&range->rb.entry, head);
> > > > > > +	drm_gpusvm_notifier_unlock(notifier->gpusvm);
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * __drm_gpusvm_range_remove - Remove GPU SVM range
> > > > > > + * @notifier__: Pointer to the GPU SVM notifier structure
> > > > > > + * @range__: Pointer to the GPU SVM range structure
> > > > > > + *
> > > > > > + * This macro removes the GPU SVM range from the notifier RB
> > > > > > tree
> > > > > > and list.
> > > > > > + */
> > > > > > +#define __drm_gpusvm_range_remove(notifier__,
> > > > > > range__)		\
> > > > > > +	range_remove((range__), &(notifier__)-
> > > > > > > root);		\
> > > > > > +	list_del(&(range__)->rb.entry)
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_range_alloc - Allocate GPU SVM range
> > > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > > > > + * @fault_addr: Fault address
> > > > > > + * @chunk_size: Chunk size
> > > > > > + * @migrate_vram: Flag indicating whether to migrate VRAM
> > > > > > + *
> > > > > > + * This function allocates and initializes the GPU SVM range
> > > > > > structure.
> > > > > > + *
> > > > > > + * Returns:
> > > > > > + * Pointer to the allocated GPU SVM range on success,
> > > > > > ERR_PTR()
> > > > > > on
> > > > > > failure.
> > > > > > + */
> > > > > > +static struct drm_gpusvm_range *
> > > > > > +drm_gpusvm_range_alloc(struct drm_gpusvm *gpusvm,
> > > > > > +		       struct drm_gpusvm_notifier *notifier,
> > > > > > +		       u64 fault_addr, u64 chunk_size, bool
> > > > > > migrate_vram)
> > > > > > +{
> > > > > > +	struct drm_gpusvm_range *range;
> > > > > > +
> > > > > > +	if (gpusvm->ops->range_alloc)
> > > > > > +		range = gpusvm->ops->range_alloc(gpusvm);
> > > > > > +	else
> > > > > > +		range = kzalloc(sizeof(*range), GFP_KERNEL);
> > > > > > +
> > > > > > +	if (!range)
> > > > > > +		return ERR_PTR(-ENOMEM);
> > > > > > +
> > > > > > +	kref_init(&range->refcount);
> > > > > > +	range->gpusvm = gpusvm;
> > > > > > +	range->notifier = notifier;
> > > > > > +	range->va.start = ALIGN_DOWN(fault_addr,
> > > > > > chunk_size);
> > > > > > +	range->va.end = ALIGN(fault_addr + 1, chunk_size);
> > > > > > +	INIT_LIST_HEAD(&range->rb.entry);
> > > > > > +	range->notifier_seq = LONG_MAX;
> > > > > > +	range->flags.migrate_vram = migrate_vram ? 1 : 0;
> > > > > > +
> > > > > > +	return range;
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_check_pages - Check pages
> > > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > > > > + * @start: Start address
> > > > > > + * @end: End address
> > > > > > + *
> > > > > > + * Check if pages between start and end have been faulted in
> > > > > > + * on the CPU. Used to prevent migration of pages without a
> > > > > > + * CPU backing store.
> > > > > > + *
> > > > > > + * Returns:
> > > > > > + * True if pages have been faulted into CPU, False otherwise
> > > > > > + */
> > > > > > +static bool drm_gpusvm_check_pages(struct drm_gpusvm
> > > > > > *gpusvm,
> > > > > > +				   struct
> > > > > > drm_gpusvm_notifier
> > > > > > *notifier,
> > > > > > +				   u64 start, u64 end)
> > > > > > +{
> > > > > > +	struct hmm_range hmm_range = {
> > > > > > +		.default_flags = 0,
> > > > > > +		.notifier = &notifier->notifier,
> > > > > > +		.start = start,
> > > > > > +		.end = end,
> > > > > > +		.dev_private_owner = gpusvm-
> > > > > > > device_private_page_owner,
> > > > > > +	};
> > > > > > +	unsigned long timeout =
> > > > > > +		jiffies +
> > > > > > msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> > > > > > +	unsigned long *pfns;
> > > > > > +	unsigned long npages = npages_in_range(start, end);
> > > > > > +	int err, i;
> > > > > > +
> > > > > > +	mmap_assert_locked(gpusvm->mm);
> > > > > > +
> > > > > > +	pfns = kvmalloc_array(npages, sizeof(*pfns),
> > > > > > GFP_KERNEL);
> > > > > > +	if (!pfns)
> > > > > > +		return false;
> > > > > > +
> > > > > > +	hmm_range.notifier_seq =
> > > > > > mmu_interval_read_begin(&notifier-
> > > > > > > notifier);
> > > > > > +	hmm_range.hmm_pfns = pfns;
> > > > > > +
> > > > > > +	while (true) {
> > > > > > +		err = hmm_range_fault(&hmm_range);
> > > > > > +		if (err == -EBUSY) {
> > > > > > +			if (time_after(jiffies, timeout))
> > > > > > +				break;
> > > > > > +
> > > > > > +			hmm_range.notifier_seq =
> > > > > > mmu_interval_read_begin(&notifier->notifier);
> > > > > > +			continue;
> > > > > > +		}
> > > > > > +		break;
> > > > > > +	}
> > > > > > +	if (err)
> > > > > > +		goto err_free;
> > > > > > +
> > > > > > +	for (i = 0; i < npages; ++i) {
> > > > > > +		if (!(pfns[i] & HMM_PFN_VALID)) {
> > > > > > +			err = -EFAULT;
> > > > > > +			goto err_free;
> > > > > > +		}
> > > > > > +	}
> > > > > > +
> > > > > > +err_free:
> > > > > > +	kvfree(pfns);
> > > > > > +	return err ? false : true;
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_range_chunk_size - Determine chunk size for
> > > > > > GPU
> > > > > > SVM
> > > > > > range
> > > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > > > > + * @vas: Pointer to the virtual memory area structure
> > > > > > + * @fault_addr: Fault address
> > > > > > + * @gpuva_start: Start address of GPUVA which mirrors CPU
> > > > > > + * @gpuva_end: End address of GPUVA which mirrors CPU
> > > > > > + * @check_pages: Flag indicating whether to check pages
> > > > > > + *
> > > > > > + * This function determines the chunk size for the GPU SVM
> > > > > > range
> > > > > > based on the
> > > > > > + * fault address, GPU SVM chunk sizes, existing GPU SVM
> > > > > > ranges,
> > > > > > and
> > > > > > the virtual
> > > > > > + * memory area boundaries.
> > > > > > + *
> > > > > > + * Returns:
> > > > > > + * Chunk size on success, LONG_MAX on failure.
> > > > > > + */
> > > > > > +static u64 drm_gpusvm_range_chunk_size(struct drm_gpusvm
> > > > > > *gpusvm,
> > > > > > +				       struct
> > > > > > drm_gpusvm_notifier
> > > > > > *notifier,
> > > > > > +				       struct vm_area_struct
> > > > > > *vas,
> > > > > > +				       u64 fault_addr, u64
> > > > > > gpuva_start,
> > > > > > +				       u64 gpuva_end, bool
> > > > > > check_pages)
> > > > > > +{
> > > > > > +	u64 start, end;
> > > > > > +	int i = 0;
> > > > > > +
> > > > > > +retry:
> > > > > > +	for (; i < gpusvm->num_chunks; ++i) {
> > > > > > +		start = ALIGN_DOWN(fault_addr, gpusvm-
> > > > > > > chunk_sizes[i]);
> > > > > > +		end = ALIGN(fault_addr + 1, gpusvm-
> > > > > > > chunk_sizes[i]);
> > > > > > +
> > > > > > +		if (start >= vas->vm_start && end <= vas-
> > > > > > >vm_end
> > > > > > &&
> > > > > > +		    start >= notifier->interval.start &&
> > > > > > +		    end <= notifier->interval.end &&
> > > > > > +		    start >= gpuva_start && end <=
> > > > > > gpuva_end)
> > > > > > +			break;
> > > > > > +	}
> > > > > > +
> > > > > > +	if (i == gpusvm->num_chunks)
> > > > > > +		return LONG_MAX;
> > > > > > +
> > > > > > +	/*
> > > > > > +	 * If allocating more than a page, ensure it does not
> > > > > > +	 * overlap with existing ranges.
> > > > > > +	 */
> > > > > > +	if (end - start != SZ_4K) {
> > > > > > +		struct drm_gpusvm_range *range;
> > > > > > +
> > > > > > +		range = drm_gpusvm_range_find(notifier,
> > > > > > start,
> > > > > > end);
> > > > > > +		if (range) {
> > > > > > +			++i;
> > > > > > +			goto retry;
> > > > > > +		}
> > > > > > +
> > > > > > +		/*
> > > > > > +		 * XXX: Only create range on pages CPU has
> > > > > > faulted
> > > > > > in. Without
> > > > > > +		 * this check, or prefault, on BMG
> > > > > > 'xe_exec_system_allocator --r
> > > > > > +		 * process-many-malloc' fails. In the
> > > > > > failure
> > > > > > case,
> > > > > > each process
> > > > > > +		 * mallocs 16k but the CPU VMA is ~128k
> > > > > > which
> > > > > > results in 64k SVM
> > > > > > +		 * ranges. When migrating the SVM ranges,
> > > > > > some
> > > > > > processes fail in
> > > > > > +		 * drm_gpusvm_migrate_to_vram with
> > > > > > 'migrate.cpages
> > > > > > != npages'
> > > > > > +		 * and then upon drm_gpusvm_range_get_pages
> > > > > > device
> > > > > > pages from
> > > > > > +		 * other processes are collected + faulted
> > > > > > in
> > > > > > which
> > > > > > creates all
> > > > > > +		 * sorts of problems. Unsure exactly how this is
> > > > > > +		 * happening; the problem also goes away if
> > > > > > +		 * 'xe_exec_system_allocator --r process-many-malloc'
> > > > > > +		 * mallocs at least 64k at a time.
> > > > > > +		 */
> > > > > > +		if (check_pages &&
> > > > > > +		    !drm_gpusvm_check_pages(gpusvm,
> > > > > > notifier,
> > > > > > start,
> > > > > > end)) {
> > > > > > +			++i;
> > > > > > +			goto retry;
> > > > > > +		}
> > > > > > +	}
> > > > > > +
> > > > > > +	return end - start;
> > > > > > +}
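To make the selection loop above concrete with an illustrative
configuration (chunk_sizes = { SZ_2M, SZ_64K, SZ_4K }): for a fault at
0x7f0000031000 inside a 128 KiB anonymous VMA spanning
0x7f0000020000-0x7f0000040000, the 2 MiB candidate
(0x7f0000000000-0x7f0000200000) fails the vas->vm_start/vm_end containment
check, while the 64 KiB candidate (0x7f0000030000-0x7f0000040000) passes,
so (assuming the notifier and GPUVA bounds also cover it and no existing
range overlaps) chunk_size comes back as SZ_64K and the new range spans
exactly that 64 KiB.
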
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_range_find_or_insert - Find or insert GPU SVM
> > > > > > range
> > > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > > + * @fault_addr: Fault address
> > > > > > + * @gpuva_start: Start address of GPUVA which mirrors CPU
> > > > > > + * @gpuva_end: End address of GPUVA which mirrors CPU
> > > > > > + * @ctx: GPU SVM context
> > > > > > + *
> > > > > > + * This function finds or inserts a newly allocated GPU SVM range
> > > > > > + * based on the fault address. The caller must hold a lock to
> > > > > > + * protect range lookup and insertion.
> > > > > > + *
> > > > > > + * Returns:
> > > > > > + * Pointer to the GPU SVM range on success, ERR_PTR() on
> > > > > > failure.
> > > > > > + */
> > > > > > +struct drm_gpusvm_range *
> > > > > > +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm,
> > > > > > u64
> > > > > > fault_addr,
> > > > > > +				u64 gpuva_start, u64
> > > > > > gpuva_end,
> > > > > > +				const struct drm_gpusvm_ctx
> > > > > > *ctx)
> > > > > > +{
> > > > > > +	struct drm_gpusvm_notifier *notifier;
> > > > > > +	struct drm_gpusvm_range *range;
> > > > > > +	struct mm_struct *mm = gpusvm->mm;
> > > > > > +	struct vm_area_struct *vas;
> > > > > > +	bool notifier_alloc = false;
> > > > > > +	u64 chunk_size;
> > > > > > +	int err;
> > > > > > +	bool migrate_vram;
> > > > > > +
> > > > > > +	if (fault_addr < gpusvm->mm_start ||
> > > > > > +	    fault_addr > gpusvm->mm_start + gpusvm-
> > > > > > >mm_range) {
> > > > > > +		err = -EINVAL;
> > > > > > +		goto err_out;
> > > > > > +	}
> > > > > > +
> > > > > > +	if (!ctx->mmap_locked) {
> > > > > > +		if (!mmget_not_zero(mm)) {
> > > > > > +			err = -EFAULT;
> > > > > > +			goto err_out;
> > > > > > +		}
> > > > > > +		mmap_write_lock(mm);
> > > > > > +	}
> > > > > > +
> > > > > > +	mmap_assert_write_locked(mm);
> > > > > > +
> > > > > > +	notifier = drm_gpusvm_notifier_find(gpusvm,
> > > > > > fault_addr);
> > > > > > +	if (!notifier) {
> > > > > > +		notifier = drm_gpusvm_notifier_alloc(gpusvm,
> > > > > > fault_addr);
> > > > > > +		if (IS_ERR(notifier)) {
> > > > > > +			err = PTR_ERR(notifier);
> > > > > > +			goto err_mmunlock;
> > > > > > +		}
> > > > > > +		notifier_alloc = true;
> > > > > > +		err =
> > > > > > mmu_interval_notifier_insert_locked(&notifier-
> > > > > > > notifier,
> > > > > > +							 
> > > > > > mm,
> > > > > > notifier->interval.start,
> > > > > > +							 
> > > > > > notifier-
> > > > > > > interval.end -
> > > > > > +							 
> > > > > > notifier-
> > > > > > > interval.start,
> > > > > > +							 
> > > > > > &drm_gpusvm_notifier_ops);
> > > > > > +		if (err)
> > > > > > +			goto err_notifier;
> > > > > > +	}
> > > > > > +
> > > > > > +	vas = vma_lookup(mm, fault_addr);
> > > > > > +	if (!vas) {
> > > > > > +		err = -ENOENT;
> > > > > > +		goto err_notifier_remove;
> > > > > > +	}
> > > > > > +
> > > > > > +	if (!ctx->read_only && !(vas->vm_flags & VM_WRITE))
> > > > > > {
> > > > > > +		err = -EPERM;
> > > > > > +		goto err_notifier_remove;
> > > > > > +	}
> > > > > > +
> > > > > > +	range = drm_gpusvm_range_find(notifier, fault_addr,
> > > > > > fault_addr + 1);
> > > > > > +	if (range)
> > > > > > +		goto out_mmunlock;
> > > > > > +	/*
> > > > > > +	 * XXX: Short-circuiting migration based on
> > > > > > migrate_vma_*
> > > > > > current
> > > > > > +	 * limitations. If/when migrate_vma_* add more
> > > > > > support,
> > > > > > this
> > > > > > logic will
> > > > > > +	 * have to change.
> > > > > > +	 */
> > > > > > +	migrate_vram = ctx->vram_possible &&
> > > > > > +		vma_is_anonymous(vas) &&
> > > > > > !is_vm_hugetlb_page(vas);
> > > > > > +
> > > > > > +	chunk_size = drm_gpusvm_range_chunk_size(gpusvm,
> > > > > > notifier,
> > > > > > vas,
> > > > > > +						 fault_addr,
> > > > > > gpuva_start,
> > > > > > +						 gpuva_end,
> > > > > > migrate_vram &&
> > > > > > +						 !ctx-
> > > > > > > prefault);
> > > > > > +	if (chunk_size == LONG_MAX) {
> > > > > > +		err = -EINVAL;
> > > > > > +		goto err_notifier_remove;
> > > > > > +	}
> > > > > > +
> > > > > > +	range = drm_gpusvm_range_alloc(gpusvm, notifier,
> > > > > > fault_addr,
> > > > > > chunk_size,
> > > > > > +				       migrate_vram);
> > > > > > +	if (IS_ERR(range)) {
> > > > > > +		err = PTR_ERR(range);
> > > > > > +		goto err_notifier_remove;
> > > > > > +	}
> > > > > > +
> > > > > > +	drm_gpusvm_range_insert(notifier, range);
> > > > > > +	if (notifier_alloc)
> > > > > > +		drm_gpusvm_notifier_insert(gpusvm,
> > > > > > notifier);
> > > > > > +
> > > > > > +	if (ctx->prefault) {
> > > > > > +		struct drm_gpusvm_ctx __ctx = *ctx;
> > > > > > +
> > > > > > +		__ctx.mmap_locked = true;
> > > > > > +		err = drm_gpusvm_range_get_pages(gpusvm,
> > > > > > range,
> > > > > > &__ctx);
> > > > > > +		if (err)
> > > > > > +			goto err_range_remove;
> > > > > > +	}
> > > > > > +
> > > > > > +out_mmunlock:
> > > > > > +	if (!ctx->mmap_locked) {
> > > > > > +		mmap_write_unlock(mm);
> > > > > > +		mmput(mm);
> > > > > > +	}
> > > > > > +
> > > > > > +	return range;
> > > > > > +
> > > > > > +err_range_remove:
> > > > > > +	__drm_gpusvm_range_remove(notifier, range);
> > > > > > +err_notifier_remove:
> > > > > > +	if (notifier_alloc)
> > > > > > +		mmu_interval_notifier_remove(&notifier-
> > > > > > > notifier);
> > > > > > +err_notifier:
> > > > > > +	if (notifier_alloc)
> > > > > > +		drm_gpusvm_notifier_free(gpusvm, notifier);
> > > > > > +err_mmunlock:
> > > > > > +	if (!ctx->mmap_locked) {
> > > > > > +		mmap_write_unlock(mm);
> > > > > > +		mmput(mm);
> > > > > > +	}
> > > > > > +err_out:
> > > > > > +	return ERR_PTR(err);
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * for_each_dma_page - iterate over pages in a DMA region
> > > > > > + * @i__: the current page index in the iteration
> > > > > > + * @j__: the current page index, log order, in the iteration
> > > > > > + * @npages__: the total number of pages in the DMA region
> > > > > > + * @order__: the order of the pages in the DMA region
> > > > > > + *
> > > > > > + * This macro iterates over each page in a DMA region. The
> > > > > > DMA
> > > > > > region
> > > > > > + * is assumed to be composed of 2^@order__ pages, and the
> > > > > > macro
> > > > > > will
> > > > > > + * step through the region one block of 2^@order__ pages at
> > > > > > a
> > > > > > time.
> > > > > > + */
> > > > > > +#define for_each_dma_page(i__, j__, npages__, order__)	\
> > > > > > +	for ((i__) = 0, (j__) = 0; (i__) < (npages__);	\
> > > > > > +	     (j__)++, (i__) += 0x1 << (order__))
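As an illustration of the stride above: with npages = 512 and order = 9
(the whole range backed by a single 2 MiB huge page), the body runs once
with i == 0 and j == 0, after which i jumps to 512 and the loop ends; with
order = 0 it degenerates to a per-page walk where i == j on every
iteration. The block count j is what later lets the "Huge pages, reduce
memory footprint" path in drm_gpusvm_range_get_pages() shrink the dma_addr
array from npages entries down to j entries.
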
> > > > > > +
> > > > > > +/**
> > > > > > + * __drm_gpusvm_range_unmap_pages - Unmap pages associated
> > > > > > with
> > > > > > a
> > > > > > GPU SVM range (internal)
> > > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > > + * @range: Pointer to the GPU SVM range structure
> > > > > > + *
> > > > > > + * This function unmaps pages associated with a GPU SVM range.
> > > > > > + * Assumes and asserts correct locking is in place when called.
> > > > > > + */
> > > > > > +static void __drm_gpusvm_range_unmap_pages(struct drm_gpusvm
> > > > > > *gpusvm,
> > > > > > +					   struct
> > > > > > drm_gpusvm_range
> > > > > > *range)
> > > > > > +{
> > > > > > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > > > > > +
> > > > > > +	if (range->pages) {
> > > > > > +		unsigned long i, j, npages =
> > > > > > npages_in_range(range-
> > > > > > > va.start,
> > > > > > +							    
> > > > > > range-
> > > > > > > va.end);
> > > > > > +
> > > > > > +		if (range->flags.has_dma_mapping) {
> > > > > > +			for_each_dma_page(i, j, npages,
> > > > > > range-
> > > > > > > order)
> > > > > > +				dma_unmap_page(gpusvm->drm-
> > > > > > >dev,
> > > > > > +					       range-
> > > > > > > dma_addr[j],
> > > > > > +					       PAGE_SIZE <<
> > > > > > range-
> > > > > > > order,
> > > > > > +					      
> > > > > > DMA_BIDIRECTIONAL);
> > > > > > +		}
> > > > > > +
> > > > > > +		range->flags.has_vram_pages = false;
> > > > > > +		range->flags.has_dma_mapping = false;
> > > > > > +	}
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_range_free_pages - Free pages associated with
> > > > > > a
> > > > > > GPU
> > > > > > SVM range
> > > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > > + * @range: Pointer to the GPU SVM range structure
> > > > > > + *
> > > > > > + * This function frees pages associated with a GPU SVM range.
> > > > > > + */
> > > > > > +static void drm_gpusvm_range_free_pages(struct drm_gpusvm
> > > > > > *gpusvm,
> > > > > > +					struct
> > > > > > drm_gpusvm_range
> > > > > > *range)
> > > > > > +{
> > > > > > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > > > > > +
> > > > > > +	if (range->pages) {
> > > > > > +		if (range->flags.kfree_mapping) {
> > > > > > +			kfree(range->dma_addr);
> > > > > > +			range->flags.kfree_mapping = false;
> > > > > > +			range->pages = NULL;
> > > > > > +		} else {
> > > > > > +			kvfree(range->pages);
> > > > > > +			range->pages = NULL;
> > > > > > +		}
> > > > > > +	}
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_range_remove - Remove GPU SVM range
> > > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > > + * @range: Pointer to the GPU SVM range to be removed
> > > > > > + *
> > > > > > + * This function removes the specified GPU SVM range and
> > > > > > also
> > > > > > removes the parent
> > > > > > + * GPU SVM notifier if no more ranges remain in the
> > > > > > notifier.
> > > > > > The
> > > > > > caller must
> > > > > > + * hold a lock to protect range and notifier removal.
> > > > > > + */
> > > > > > +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> > > > > > +			     struct drm_gpusvm_range *range)
> > > > > > +{
> > > > > > +	struct drm_gpusvm_notifier *notifier;
> > > > > > +
> > > > > > +	notifier = drm_gpusvm_notifier_find(gpusvm, range-
> > > > > > > va.start);
> > > > > > +	if (WARN_ON_ONCE(!notifier))
> > > > > > +		return;
> > > > > > +
> > > > > > +	drm_gpusvm_notifier_lock(gpusvm);
> > > > > > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > > > > +	drm_gpusvm_range_free_pages(gpusvm, range);
> > > > > > +	__drm_gpusvm_range_remove(notifier, range);
> > > > > > +	drm_gpusvm_notifier_unlock(gpusvm);
> > > > > > +
> > > > > > +	drm_gpusvm_range_put(range);
> > > > > > +
> > > > > > +	if (RB_EMPTY_ROOT(&notifier->root.rb_root)) {
> > > > > > +		if (!notifier->flags.removed)
> > > > > > +			mmu_interval_notifier_remove(&notifi
> > > > > > er-
> > > > > > > notifier);
> > > > > > +		drm_gpusvm_notifier_remove(gpusvm,
> > > > > > notifier);
> > > > > > +		drm_gpusvm_notifier_free(gpusvm, notifier);
> > > > > > +	}
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_range_get - Get a reference to GPU SVM range
> > > > > > + * @range: Pointer to the GPU SVM range
> > > > > > + *
> > > > > > + * This function increments the reference count of the
> > > > > > specified
> > > > > > GPU
> > > > > > SVM range.
> > > > > > + *
> > > > > > + * Returns:
> > > > > > + * Pointer to the GPU SVM range.
> > > > > > + */
> > > > > > +struct drm_gpusvm_range *
> > > > > > +drm_gpusvm_range_get(struct drm_gpusvm_range *range)
> > > > > > +{
> > > > > > +	kref_get(&range->refcount);
> > > > > > +
> > > > > > +	return range;
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_range_destroy - Destroy GPU SVM range
> > > > > > + * @refcount: Pointer to the reference counter embedded in
> > > > > > the
> > > > > > GPU
> > > > > > SVM range
> > > > > > + *
> > > > > > + * This function destroys the specified GPU SVM range when
> > > > > > its
> > > > > > reference count
> > > > > > + * reaches zero. If a custom range-free function is
> > > > > > provided, it
> > > > > > is
> > > > > > invoked to
> > > > > > + * free the range; otherwise, the range is deallocated using
> > > > > > kfree().
> > > > > > + */
> > > > > > +static void drm_gpusvm_range_destroy(struct kref *refcount)
> > > > > > +{
> > > > > > +	struct drm_gpusvm_range *range =
> > > > > > +		container_of(refcount, struct
> > > > > > drm_gpusvm_range,
> > > > > > refcount);
> > > > > > +	struct drm_gpusvm *gpusvm = range->gpusvm;
> > > > > > +
> > > > > > +	if (gpusvm->ops->range_free)
> > > > > > +		gpusvm->ops->range_free(range);
> > > > > > +	else
> > > > > > +		kfree(range);
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_range_put - Put a reference to GPU SVM range
> > > > > > + * @range: Pointer to the GPU SVM range
> > > > > > + *
> > > > > > + * This function decrements the reference count of the
> > > > > > specified
> > > > > > GPU
> > > > > > SVM range
> > > > > > + * and frees it when the count reaches zero.
> > > > > > + */
> > > > > > +void drm_gpusvm_range_put(struct drm_gpusvm_range *range)
> > > > > > +{
> > > > > > +	kref_put(&range->refcount,
> > > > > > drm_gpusvm_range_destroy);
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_range_pages_valid - GPU SVM range pages valid
> > > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > > + * @range: Pointer to the GPU SVM range structure
> > > > > > + *
> > > > > > + * This function determines if a GPU SVM range's pages are valid.
> > > > > > + * Expected to be called holding gpusvm->notifier_lock and as the
> > > > > > + * last step before committing a GPU binding.
> > > > > > + *
> > > > > > + * Returns:
> > > > > > + * True if GPU SVM range has valid pages, False otherwise
> > > > > > + */
> > > > > > +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> > > > > > +				  struct drm_gpusvm_range
> > > > > > *range)
> > > > > > +{
> > > > > > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > > > > > +
> > > > > > +	return range->flags.has_vram_pages || range-
> > > > > > > flags.has_dma_mapping;
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_range_pages_valid_unlocked - GPU SVM range
> > > > > > pages
> > > > > > valid
> > > > > > unlocked
> > > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > > + * @range: Pointer to the GPU SVM range structure
> > > > > > + *
> > > > > > + * This function determines if a GPU SVM range's pages are valid.
> > > > > > + * Expected to be called without holding gpusvm->notifier_lock.
> > > > > > + *
> > > > > > + * Returns:
> > > > > > + * True if GPU SVM range has valid pages, False otherwise
> > > > > > + */
> > > > > > +static bool
> > > > > > +drm_gpusvm_range_pages_valid_unlocked(struct drm_gpusvm
> > > > > > *gpusvm,
> > > > > > +				      struct
> > > > > > drm_gpusvm_range
> > > > > > *range)
> > > > > > +{
> > > > > > +	bool pages_valid;
> > > > > > +
> > > > > > +	if (!range->pages)
> > > > > > +		return false;
> > > > > > +
> > > > > > +	drm_gpusvm_notifier_lock(gpusvm);
> > > > > > +	pages_valid = drm_gpusvm_range_pages_valid(gpusvm,
> > > > > > range);
> > > > > > +	if (!pages_valid && range->flags.kfree_mapping) {
> > > > > > +		kfree(range->dma_addr);
> > > > > > +		range->flags.kfree_mapping = false;
> > > > > > +		range->pages = NULL;
> > > > > > +	}
> > > > > > +	drm_gpusvm_notifier_unlock(gpusvm);
> > > > > > +
> > > > > > +	return pages_valid;
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_range_get_pages - Get pages for a GPU SVM
> > > > > > range
> > > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > > + * @range: Pointer to the GPU SVM range structure
> > > > > > + * @ctx: GPU SVM context
> > > > > > + *
> > > > > > + * This function gets pages for a GPU SVM range and ensures
> > > > > > they
> > > > > > are
> > > > > > mapped for
> > > > > > + * DMA access.
> > > > > > + *
> > > > > > + * Returns:
> > > > > > + * 0 on success, negative error code on failure.
> > > > > > + */
> > > > > > +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> > > > > > +			       struct drm_gpusvm_range
> > > > > > *range,
> > > > > > +			       const struct drm_gpusvm_ctx
> > > > > > *ctx)
> > > > > > +{
> > > > > > +	struct mmu_interval_notifier *notifier = &range-
> > > > > > > notifier-
> > > > > > > notifier;
> > > > > > +	struct hmm_range hmm_range = {
> > > > > > +		.default_flags = HMM_PFN_REQ_FAULT | (ctx-
> > > > > > > read_only
> > > > > > ? 0 :
> > > > > > +			HMM_PFN_REQ_WRITE),
> > > > > > +		.notifier = notifier,
> > > > > > +		.start = range->va.start,
> > > > > > +		.end = range->va.end,
> > > > > > +		.dev_private_owner = gpusvm-
> > > > > > > device_private_page_owner,
> > > > > > +	};
> > > > > > +	struct mm_struct *mm = gpusvm->mm;
> > > > > > +	unsigned long timeout =
> > > > > > +		jiffies +
> > > > > > msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> > > > > > +	unsigned long i, j;
> > > > > > +	unsigned long npages = npages_in_range(range-
> > > > > > >va.start,
> > > > > > range->va.end);
> > > > > > +	unsigned int order = 0;
> > > > > > +	unsigned long *pfns;
> > > > > > +	struct page **pages;
> > > > > > +	int err = 0;
> > > > > > +	bool vram_pages = !!range->flags.migrate_vram;
> > > > > > +	bool alloc_pfns = false, kfree_mapping;
> > > > > > +
> > > > > > +retry:
> > > > > > +	kfree_mapping = false;
> > > > > > +	hmm_range.notifier_seq =
> > > > > > mmu_interval_read_begin(notifier);
> > > > > > +	if (drm_gpusvm_range_pages_valid_unlocked(gpusvm,
> > > > > > range))
> > > > > > +		return 0;
> > > > > > +
> > > > > > +	if (range->notifier_seq == hmm_range.notifier_seq &&
> > > > > > range-
> > > > > > > pages) {
> > > > > > +		if (ctx->prefault)
> > > > > > +			return 0;
> > > > > > +
> > > > > > +		pfns = (unsigned long *)range->pages;
> > > > > > +		pages = range->pages;
> > > > > > +		goto map_pages;
> > > > > > +	}
> > > > > > +
> > > > > > +	if (!range->pages) {
> > > > > > +		pfns = kvmalloc_array(npages, sizeof(*pfns),
> > > > > > GFP_KERNEL);
> > > > > > +		if (!pfns)
> > > > > > +			return -ENOMEM;
> > > > > > +		alloc_pfns = true;
> > > > > > +	} else {
> > > > > > +		pfns = (unsigned long *)range->pages;
> > > > > > +	}
> > > > > > +
> > > > > > +	if (!ctx->mmap_locked) {
> > > > > > +		if (!mmget_not_zero(mm)) {
> > > > > > +			err = -EFAULT;
> > > > > > +			goto err_out;
> > > > > > +		}
> > > > > > +	}
> > > > > > +
> > > > > > +	hmm_range.hmm_pfns = pfns;
> > > > > > +	while (true) {
> > > > > > +		/* Must be checked after
> > > > > > mmu_interval_read_begin
> > > > > > */
> > > > > > +		if (range->flags.unmapped) {
> > > > > > +			err = -EFAULT;
> > > > > > +			break;
> > > > > > +		}
> > > > > > +
> > > > > > +		if (!ctx->mmap_locked) {
> > > > > > +			/*
> > > > > > +			 * XXX: HMM locking document
> > > > > > indicates
> > > > > > only
> > > > > > a read-lock
> > > > > > +			 * is required but there appears to
> > > > > > be a
> > > > > > window between
> > > > > > +			 * the MMU_NOTIFY_MIGRATE event
> > > > > > triggered in
> > > > > > a CPU fault
> > > > > > +			 * via migrate_vma_setup and the
> > > > > > pages
> > > > > > actually moving
> > > > > > +			 * in migrate_vma_finalize in which
> > > > > > this
> > > > > > code can grab
> > > > > > +			 * garbage pages. Grabbing the
> > > > > > write-
> > > > > > lock if
> > > > > > the range
> > > > > > +			 * is attached to vram appears to
> > > > > > protect
> > > > > > against this
> > > > > > +			 * race.
> > > > > > +			 */
> > > > > > +			if (vram_pages)
> > > > > > +				mmap_write_lock(mm);
> > > > > > +			else
> > > > > > +				mmap_read_lock(mm);
> > > > > > +		}
> > > > > > +		err = hmm_range_fault(&hmm_range);
> > > > > > +		if (!ctx->mmap_locked) {
> > > > > > +			if (vram_pages)
> > > > > > +				mmap_write_unlock(mm);
> > > > > > +			else
> > > > > > +				mmap_read_unlock(mm);
> > > > > > +		}
> > > > > > +
> > > > > > +		if (err == -EBUSY) {
> > > > > > +			if (time_after(jiffies, timeout))
> > > > > > +				break;
> > > > > > +
> > > > > > +			hmm_range.notifier_seq =
> > > > > > mmu_interval_read_begin(notifier);
> > > > > > +			continue;
> > > > > > +		}
> > > > > > +		break;
> > > > > > +	}
> > > > > > +	if (!ctx->mmap_locked)
> > > > > > +		mmput(mm);
> > > > > > +	if (err)
> > > > > > +		goto err_free;
> > > > > > +
> > > > > > +	pages = (struct page **)pfns;
> > > > > > +
> > > > > > +	if (ctx->prefault) {
> > > > > > +		range->pages = pages;
> > > > > > +		goto set_seqno;
> > > > > > +	}
> > > > > > +
> > > > > > +map_pages:
> > > > > > +	if
> > > > > > (is_device_private_page(hmm_pfn_to_page(pfns[0]))) {
> > > > > > +		WARN_ON_ONCE(!range->vram_allocation);
> > > > > > +
> > > > > > +		for (i = 0; i < npages; ++i) {
> > > > > > +			pages[i] = hmm_pfn_to_page(pfns[i]);
> > > > > > +
> > > > > > +			if
> > > > > > (WARN_ON_ONCE(!is_device_private_page(pages[i]))) {
> > > > > > +				err = -EOPNOTSUPP;
> > > > > > +				goto err_free;
> > > > > > +			}
> > > > > > +		}
> > > > > > +
> > > > > > +		/* Do not race with notifier unmapping pages */
> > > > > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > > > > +		range->flags.has_vram_pages = true;
> > > > > > +		range->pages = pages;
> > > > > > +		if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
> > > > > > +			err = -EAGAIN;
> > > > > > +			__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > > > > +		}
> > > > > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > > > > +	} else {
> > > > > > +		dma_addr_t *dma_addr = (dma_addr_t *)pfns;
> > > > > > +
> > > > > > +		for_each_dma_page(i, j, npages, order) {
> > > > > > +			if (WARN_ON_ONCE(i && order !=
> > > > > > +					 hmm_pfn_to_map_order(pfns[i]))) {
> > > > > > +				err = -EOPNOTSUPP;
> > > > > > +				npages = i;
> > > > > > +				goto err_unmap;
> > > > > > +			}
> > > > > > +			order = hmm_pfn_to_map_order(pfns[i]);
> > > > > > +
> > > > > > +			pages[j] = hmm_pfn_to_page(pfns[i]);
> > > > > > +			if (WARN_ON_ONCE(is_zone_device_page(pages[j]))) {
> > > > > > +				err = -EOPNOTSUPP;
> > > > > > +				npages = i;
> > > > > > +				goto err_unmap;
> > > > > > +			}
> > > > > > +
> > > > > > +			set_page_dirty_lock(pages[j]);
> > > > > > +			mark_page_accessed(pages[j]);
> > > > > > +
> > > > > > +			dma_addr[j] = dma_map_page(gpusvm->drm->dev,
> > > > > > +						   pages[j], 0,
> > > > > > +						   PAGE_SIZE << order,
> > > > > > +						   DMA_BIDIRECTIONAL);
> > > > > > +			if (dma_mapping_error(gpusvm->drm->dev, dma_addr[j])) {
> > > > > > +				err = -EFAULT;
> > > > > > +				npages = i;
> > > > > > +				goto err_unmap;
> > > > > > +			}
> > > > > > +		}
> > > > > > +
> > > > > > +		/* Huge pages, reduce memory footprint */
> > > > > > +		if (order) {
> > > > > > +			dma_addr = kmalloc_array(j, sizeof(*dma_addr),
> > > > > > +						 GFP_KERNEL);
> > > > > > +			if (dma_addr) {
> > > > > > +				for (i = 0; i < j; ++i)
> > > > > > +					dma_addr[i] = (dma_addr_t)pfns[i];
> > > > > > +				kvfree(pfns);
> > > > > > +				kfree_mapping = true;
> > > > > > +			} else {
> > > > > > +				dma_addr = (dma_addr_t *)pfns;
> > > > > > +			}
> > > > > > +		}
> > > > > > +
> > > > > > +		/* Do not race with notifier unmapping pages */
> > > > > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > > > > +		range->order = order;
> > > > > > +		range->flags.kfree_mapping = kfree_mapping;
> > > > > > +		range->flags.has_dma_mapping = true;
> > > > > > +		range->dma_addr = dma_addr;
> > > > > > +		range->vram_allocation = NULL;
> > > > > > +		if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
> > > > > > +			err = -EAGAIN;
> > > > > > +			__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > > > > +		}
> > > > > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > > > > +	}
> > > > > > +
> > > > > > +	if (err == -EAGAIN)
> > > > > > +		goto retry;
> > > > > > +set_seqno:
> > > > > > +	range->notifier_seq = hmm_range.notifier_seq;
> > > > > > +
> > > > > > +	return 0;
> > > > > > +
> > > > > > +err_unmap:
> > > > > > +	for_each_dma_page(i, j, npages, order)
> > > > > > +		dma_unmap_page(gpusvm->drm->dev,
> > > > > > +			       (dma_addr_t)pfns[j],
> > > > > > +			       PAGE_SIZE << order,
> > > > > > DMA_BIDIRECTIONAL);
> > > > > > +err_free:
> > > > > > +	if (alloc_pfns)
> > > > > > +		kvfree(pfns);
> > > > > > +err_out:
> > > > > > +	return err;
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_range_unmap_pages - Unmap pages associated with a GPU SVM range
> > > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > > + * @range: Pointer to the GPU SVM range structure
> > > > > > + * @ctx: GPU SVM context
> > > > > > + *
> > > > > > + * This function unmaps pages associated with a GPU SVM range. If @in_notifier
> > > > > > + * is set, it is assumed that gpusvm->notifier_lock is held in write mode; if it
> > > > > > + * is clear, it acquires gpusvm->notifier_lock in read mode. Must be called on
> > > > > > + * each GPU SVM range attached to notifier in gpusvm->ops->invalidate for IOMMU
> > > > > > + * security model.
> > > > > > + */
> > > > > > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > > > > > +				  struct drm_gpusvm_range *range,
> > > > > > +				  const struct drm_gpusvm_ctx *ctx)
> > > > > > +{
> > > > > > +	if (ctx->in_notifier)
> > > > > > +		lockdep_assert_held_write(&gpusvm->notifier_lock);
> > > > > > +	else
> > > > > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > > > > +
> > > > > > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > > > > +
> > > > > > +	if (!ctx->in_notifier)
> > > > > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > > > > +}
> > > > > > +
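
For illustration, a minimal driver-side sketch of the two calling contexts
described above; the my_* names are hypothetical and not part of this patch:

	/* Outside the notifier: let the helper take the notifier lock in read mode. */
	static void my_unbind_range(struct drm_gpusvm *gpusvm,
				    struct drm_gpusvm_range *range)
	{
		struct drm_gpusvm_ctx ctx = {};	/* in_notifier == 0 */

		drm_gpusvm_range_unmap_pages(gpusvm, range, &ctx);
	}

	/* Inside ops->invalidate: the notifier lock is already held in write mode. */
	static void my_invalidate(struct drm_gpusvm *gpusvm,
				  struct drm_gpusvm_notifier *notifier,
				  const struct mmu_notifier_range *mmu_range)
	{
		struct drm_gpusvm_ctx ctx = { .in_notifier = true };
		struct drm_gpusvm_range *range = NULL;

		drm_gpusvm_for_each_range(range, notifier, mmu_range->start,
					  mmu_range->end)
			drm_gpusvm_range_unmap_pages(gpusvm, range, &ctx);
	}
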
> > > > > > +/**
> > > > > > + * drm_gpusvm_migration_put_page - Put a migration page
> > > > > > + * @page: Pointer to the page to put
> > > > > > + *
> > > > > > + * This function unlocks and puts a page.
> > > > > > + */
> > > > > > +static void drm_gpusvm_migration_put_page(struct page *page)
> > > > > > +{
> > > > > > +	unlock_page(page);
> > > > > > +	put_page(page);
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_migration_put_pages - Put migration pages
> > > > > > + * @npages: Number of pages
> > > > > > + * @migrate_pfn: Array of migrate page frame numbers
> > > > > > + *
> > > > > > + * This function puts an array of pages.
> > > > > > + */
> > > > > > +static void drm_gpusvm_migration_put_pages(unsigned long npages,
> > > > > > +					   unsigned long *migrate_pfn)
> > > > > > +{
> > > > > > +	unsigned long i;
> > > > > > +
> > > > > > +	for (i = 0; i < npages; ++i) {
> > > > > > +		if (!migrate_pfn[i])
> > > > > > +			continue;
> > > > > > +
> > > > > > +		drm_gpusvm_migration_put_page(migrate_pfn_to_page(migrate_pfn[i]));
> > > > > > +		migrate_pfn[i] = 0;
> > > > > > +	}
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_get_vram_page - Get a reference to a VRAM page
> > > > > > + * @page: Pointer to the page
> > > > > > + * @zdd: Pointer to the GPU SVM zone device data
> > > > > > + *
> > > > > > + * This function associates the given page with the
> > > > > > specified
> > > > > > GPU
> > > > > > SVM zone
> > > > > > + * device data and initializes it for zone device usage.
> > > > > > + */
> > > > > > +static void drm_gpusvm_get_vram_page(struct page *page,
> > > > > > +				     struct drm_gpusvm_zdd
> > > > > > *zdd)
> > > > > > +{
> > > > > > +	page->zone_device_data = drm_gpusvm_zdd_get(zdd);
> > > > > > +	zone_device_page_init(page);
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_migrate_map_pages() - Map migration pages for
> > > > > > GPU
> > > > > > SVM
> > > > > > migration
> > > > > > + * @dev: The device for which the pages are being mapped
> > > > > > + * @dma_addr: Array to store DMA addresses corresponding to
> > > > > > mapped
> > > > > > pages
> > > > > > + * @migrate_pfn: Array of migrate page frame numbers to map
> > > > > > + * @npages: Number of pages to map
> > > > > > + * @dir: Direction of data transfer (e.g.,
> > > > > > DMA_BIDIRECTIONAL)
> > > > > > + *
> > > > > > + * This function maps pages of memory for migration usage in
> > > > > > GPU
> > > > > > SVM. It
> > > > > > + * iterates over each page frame number provided in
> > > > > > @migrate_pfn,
> > > > > > maps the
> > > > > > + * corresponding page, and stores the DMA address in the
> > > > > > provided
> > > > > > @dma_addr
> > > > > > + * array.
> > > > > > + *
> > > > > > + * Return: 0 on success, -EFAULT if an error occurs during
> > > > > > mapping.
> > > > > > + */
> > > > > > +static int drm_gpusvm_migrate_map_pages(struct device *dev,
> > > > > > +					dma_addr_t *dma_addr,
> > > > > > +					unsigned long *migrate_pfn,
> > > > > > +					unsigned long npages,
> > > > > > +					enum dma_data_direction dir)
> > > > > > +{
> > > > > > +	unsigned long i;
> > > > > > +
> > > > > > +	for (i = 0; i < npages; ++i) {
> > > > > > +		struct page *page =
> > > > > > migrate_pfn_to_page(migrate_pfn[i]);
> > > > > > +
> > > > > > +		if (!page)
> > > > > > +			continue;
> > > > > > +
> > > > > > +		if (WARN_ON_ONCE(is_zone_device_page(page)))
> > > > > > +			return -EFAULT;
> > > > > > +
> > > > > > +		dma_addr[i] = dma_map_page(dev, page, 0,
> > > > > > PAGE_SIZE,
> > > > > > dir);
> > > > > > +		if (dma_mapping_error(dev, dma_addr[i]))
> > > > > > +			return -EFAULT;
> > > > > > +	}
> > > > > > +
> > > > > > +	return 0;
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously
> > > > > > mapped
> > > > > > for GPU SVM migration
> > > > > > + * @dev: The device for which the pages were mapped
> > > > > > + * @dma_addr: Array of DMA addresses corresponding to mapped
> > > > > > pages
> > > > > > + * @npages: Number of pages to unmap
> > > > > > + * @dir: Direction of data transfer (e.g.,
> > > > > > DMA_BIDIRECTIONAL)
> > > > > > + *
> > > > > > + * This function unmaps previously mapped pages of memory
> > > > > > for
> > > > > > GPU
> > > > > > Shared Virtual
> > > > > > + * Memory (SVM). It iterates over each DMA address provided
> > > > > > in
> > > > > > @dma_addr, checks
> > > > > > + * if it's valid and not already unmapped, and unmaps the
> > > > > > corresponding page.
> > > > > > + */
> > > > > > +static void drm_gpusvm_migrate_unmap_pages(struct device
> > > > > > *dev,
> > > > > > +					   dma_addr_t
> > > > > > *dma_addr,
> > > > > > +					   unsigned long
> > > > > > npages,
> > > > > > +					   enum
> > > > > > dma_data_direction
> > > > > > dir)
> > > > > > +{
> > > > > > +	unsigned long i;
> > > > > > +
> > > > > > +	for (i = 0; i < npages; ++i) {
> > > > > > +		if (!dma_addr[i] || dma_mapping_error(dev,
> > > > > > dma_addr[i]))
> > > > > > +			continue;
> > > > > > +
> > > > > > +		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE,
> > > > > > dir);
> > > > > > +	}
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_migrate_to_vram - Migrate GPU SVM range to VRAM
> > > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > > + * @range: Pointer to the GPU SVM range structure
> > > > > > + * @vram_allocation: Driver-private pointer to the VRAM allocation. The caller
> > > > > > + *                   should hold a reference to the VRAM allocation, which
> > > > > > + *                   should be dropped via ops->vram_release or upon the
> > > > > > + *                   failure of this function.
> > > > > > + * @ctx: GPU SVM context
> > > > > > + *
> > > > > > + * This function migrates the specified GPU SVM range to VRAM. It performs the
> > > > > > + * necessary setup and invokes the driver-specific operations for migration to
> > > > > > + * VRAM. Upon successful return, @vram_allocation can safely reference @range
> > > > > > + * until ops->vram_release is called, which only happens upon successful return.
> > > > > > + *
> > > > > > + * Returns:
> > > > > > + * 0 on success, negative error code on failure.
> > > > > > + */
> > > > > > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > > > > > +			       struct drm_gpusvm_range
> > > > > > *range,
> > > > > > +			       void *vram_allocation,
> > > > > > +			       const struct drm_gpusvm_ctx
> > > > > > *ctx)
> > > > > > +{
> > > > > > +	u64 start = range->va.start, end = range->va.end;
> > > > > > +	struct migrate_vma migrate = {
> > > > > > +		.start		= start,
> > > > > > +		.end		= end,
> > > > > > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > > > > > +		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
> > > > > > +	};
> > > > > > +	struct mm_struct *mm = gpusvm->mm;
> > > > > > +	unsigned long i, npages = npages_in_range(start, end);
> > > > > > +	struct vm_area_struct *vas;
> > > > > > +	struct drm_gpusvm_zdd *zdd = NULL;
> > > > > > +	struct page **pages;
> > > > > > +	dma_addr_t *dma_addr;
> > > > > > +	void *buf;
> > > > > > +	int err;
> > > > > > +
> > > > > > +	if (!range->flags.migrate_vram)
> > > > > > +		return -EINVAL;
> > > > > > +
> > > > > > +	if (!gpusvm->ops->populate_vram_pfn || !gpusvm->ops->copy_to_vram ||
> > > > > > +	    !gpusvm->ops->copy_to_sram)
> > > > > > +		return -EOPNOTSUPP;
> > > > > > +
> > > > > > +	if (!ctx->mmap_locked) {
> > > > > > +		if (!mmget_not_zero(mm)) {
> > > > > > +			err = -EFAULT;
> > > > > > +			goto err_out;
> > > > > > +		}
> > > > > > +		mmap_write_lock(mm);
> > > > > > +	}
> > > > > > +
> > > > > > +	mmap_assert_locked(mm);
> > > > > > +
> > > > > > +	vas = vma_lookup(mm, start);
> > > > > > +	if (!vas) {
> > > > > > +		err = -ENOENT;
> > > > > > +		goto err_mmunlock;
> > > > > > +	}
> > > > > > +
> > > > > > +	if (end > vas->vm_end || start < vas->vm_start) {
> > > > > > +		err = -EINVAL;
> > > > > > +		goto err_mmunlock;
> > > > > > +	}
> > > > > > +
> > > > > > +	if (!vma_is_anonymous(vas)) {
> > > > > > +		err = -EBUSY;
> > > > > > +		goto err_mmunlock;
> > > > > > +	}
> > > > > > +
> > > > > > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
> > > > > > +		       sizeof(*pages), GFP_KERNEL);
> > > > > > +	if (!buf) {
> > > > > > +		err = -ENOMEM;
> > > > > > +		goto err_mmunlock;
> > > > > > +	}
> > > > > > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > > > > > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
> > > > > > +
> > > > > > +	zdd = drm_gpusvm_zdd_alloc(range);
> > > > > > +	if (!zdd) {
> > > > > > +		err = -ENOMEM;
> > > > > > +		goto err_free;
> > > > > > +	}
> > > > > > +
> > > > > > +	migrate.vma = vas;
> > > > > > +	migrate.src = buf;
> > > > > > +	migrate.dst = migrate.src + npages;
> > > > > > +
> > > > > > +	err = migrate_vma_setup(&migrate);
> > > > > > +	if (err)
> > > > > > +		goto err_free;
> > > > > > +
> > > > > > +	/*
> > > > > > +	 * FIXME: Below cases, !migrate.cpages and migrate.cpages != npages, not
> > > > > > +	 * always an error. Need to revisit possible cases and how to handle. We
> > > > > > +	 * could prefault on migrate.cpages != npages via hmm_range_fault.
> > > > > > +	 */
> > > > > > +
> > > > > > +	if (!migrate.cpages) {
> > > > > > +		err = -EFAULT;
> > > > > > +		goto err_free;
> > > > > > +	}
> > > > > > +
> > > > > > +	if (migrate.cpages != npages) {
> > > > > > +		err = -EBUSY;
> > > > > > +		goto err_finalize;
> > > > > > +	}
> > > > > > +
> > > > > > +	err = gpusvm->ops->populate_vram_pfn(gpusvm, vram_allocation, npages,
> > > > > > +					     migrate.dst);
> > > > > > +	if (err)
> > > > > > +		goto err_finalize;
> > > > > > +
> > > > > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> > > > > > +					   migrate.src, npages, DMA_TO_DEVICE);
> > > > > > +	if (err)
> > > > > > +		goto err_finalize;
> > > > > > +
> > > > > > +	for (i = 0; i < npages; ++i) {
> > > > > > +		struct page *page = pfn_to_page(migrate.dst[i]);
> > > > > > +
> > > > > > +		pages[i] = page;
> > > > > > +		migrate.dst[i] = migrate_pfn(migrate.dst[i]);
> > > > > > +		drm_gpusvm_get_vram_page(page, zdd);
> > > > > > +	}
> > > > > > +
> > > > > > +	err = gpusvm->ops->copy_to_vram(gpusvm, pages, dma_addr, npages);
> > > > > > +	if (err)
> > > > > > +		goto err_finalize;
> > > > > > +
> > > > > > +	/* Upon success bind vram allocation to range and zdd */
> > > > > > +	range->vram_allocation = vram_allocation;
> > > > > > +	WRITE_ONCE(zdd->vram_allocation, vram_allocation);	/* Owns ref */
> > > > > > +
> > > > > > +err_finalize:
> > > > > > +	if (err)
> > > > > > +		drm_gpusvm_migration_put_pages(npages,
> > > > > > migrate.dst);
> > > > > > +	migrate_vma_pages(&migrate);
> > > > > > +	migrate_vma_finalize(&migrate);
> > > > > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev,
> > > > > > dma_addr,
> > > > > > npages,
> > > > > > +				       DMA_TO_DEVICE);
> > > > > > +err_free:
> > > > > > +	if (zdd)
> > > > > > +		drm_gpusvm_zdd_put(zdd);
> > > > > > +	kvfree(buf);
> > > > > > +err_mmunlock:
> > > > > > +	if (!ctx->mmap_locked) {
> > > > > > +		mmap_write_unlock(mm);
> > > > > > +		mmput(mm);
> > > > > > +	}
> > > > > > +err_out:
> > > > > > +	return err;
> > > > > > +}
> > > > > > +
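
For illustration, a hedged sketch of how a driver fault path might use this
together with drm_gpusvm_range_get_pages(); the my_* names are hypothetical
and the reference handling assumes the @vram_allocation rules documented in
the kernel-doc above:

	/* Hypothetical driver-side sketch (illustrative only). */
	static int my_bind_range_in_vram(struct drm_gpusvm *gpusvm,
					 struct drm_gpusvm_range *range,
					 void *my_vram_alloc)
	{
		const struct drm_gpusvm_ctx ctx = { .vram_possible = true };
		int err;

		/* Best effort: fall back to system pages if migration fails. */
		err = drm_gpusvm_migrate_to_vram(gpusvm, range, my_vram_alloc, &ctx);
		if (err)
			my_vram_alloc_put(my_vram_alloc);	/* hypothetical helper */

		/* Grab (and dma-map) the backing pages, wherever they ended up. */
		return drm_gpusvm_range_get_pages(gpusvm, range, &ctx);
	}
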
> > > > > > +/**
> > > > > > + * drm_gpusvm_migrate_populate_sram_pfn - Populate SRAM PFNs for a VM area
> > > > > > + * @vas: Pointer to the VM area structure, can be NULL
> > > > > > + * @npages: Number of pages to populate
> > > > > > + * @src_mpfn: Source array of migrate PFNs
> > > > > > + * @mpfn: Array of migrate PFNs to populate
> > > > > > + * @addr: Start address for PFN allocation
> > > > > > + *
> > > > > > + * This function populates the SRAM migrate page frame numbers (PFNs) for the
> > > > > > + * specified VM area structure. It allocates and locks pages in the VM area for
> > > > > > + * SRAM usage. If vas is non-NULL use alloc_page_vma for allocation, if NULL use
> > > > > > + * alloc_page for allocation.
> > > > > > + *
> > > > > > + * Returns:
> > > > > > + * 0 on success, negative error code on failure.
> > > > > > + */
> > > > > > +static int drm_gpusvm_migrate_populate_sram_pfn(struct vm_area_struct *vas,
> > > > > > +						unsigned long npages,
> > > > > > +						unsigned long *src_mpfn,
> > > > > > +						unsigned long *mpfn, u64 addr)
> > > > > > +{
> > > > > > +	unsigned long i;
> > > > > > +
> > > > > > +	for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
> > > > > > +		struct page *page;
> > > > > > +
> > > > > > +		if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
> > > > > > +			continue;
> > > > > > +
> > > > > > +		if (vas)
> > > > > > +			page = alloc_page_vma(GFP_HIGHUSER,
> > > > > > vas,
> > > > > > addr);
> > > > > > +		else
> > > > > > +			page = alloc_page(GFP_HIGHUSER);
> > > > > > +
> > > > > > +		if (!page)
> > > > > > +			return -ENOMEM;
> > > > > > +
> > > > > > +		lock_page(page);
> > > > > > +		mpfn[i] = migrate_pfn(page_to_pfn(page));
> > > > > > +	}
> > > > > > +
> > > > > > +	return 0;
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_evict_to_sram - Evict GPU SVM range to SRAM
> > > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > > + * @range: Pointer to the GPU SVM range structure
> > > > > > + *
> > > > > > + * Similar to __drm_gpusvm_migrate_to_sram but does not require the mmap
> > > > > > + * lock and migration is done via the migrate_device_* functions. Fallback
> > > > > > + * path as it is preferred to issue migrations with the mmap lock held.
> > > > > > + *
> > > > > > + * Returns:
> > > > > > + * 0 on success, negative error code on failure.
> > > > > > + */
> > > > > > +static int drm_gpusvm_evict_to_sram(struct drm_gpusvm *gpusvm,
> > > > > > +				    struct drm_gpusvm_range *range)
> > > > > > +{
> > > > > > +	unsigned long npages;
> > > > > > +	struct page **pages;
> > > > > > +	unsigned long *src, *dst;
> > > > > > +	dma_addr_t *dma_addr;
> > > > > > +	void *buf;
> > > > > > +	int i, err = 0;
> > > > > > +
> > > > > > +	npages = npages_in_range(range->va.start, range->va.end);
> > > > > > +
> > > > > > +	buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr) +
> > > > > > +		       sizeof(*pages), GFP_KERNEL);
> > > > > > +	if (!buf) {
> > > > > > +		err = -ENOMEM;
> > > > > > +		goto err_out;
> > > > > > +	}
> > > > > > +	src = buf;
> > > > > > +	dst = buf + (sizeof(*src) * npages);
> > > > > > +	dma_addr = buf + (2 * sizeof(*src) * npages);
> > > > > > +	pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr))
> > > > > > *
> > > > > > npages;
> > > > > > +
> > > > > > +	err = gpusvm->ops->populate_vram_pfn(gpusvm, range->vram_allocation,
> > > > > > +					     npages, src);
> > > > > > +	if (err)
> > > > > > +		goto err_free;
> > > > > > +
> > > > > > +	err = migrate_device_vma_range(gpusvm->mm,
> > > > > > +				       gpusvm->device_private_page_owner, src,
> > > > > > +				       npages, range->va.start);
> > > > > > +	if (err)
> > > > > > +		goto err_free;
> > > > > > +
> > > > > > +	err = drm_gpusvm_migrate_populate_sram_pfn(NULL,
> > > > > > npages,
> > > > > > src, dst, 0);
> > > > > > +	if (err)
> > > > > > +		goto err_finalize;
> > > > > > +
> > > > > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > > > > > dma_addr,
> > > > > > +					   dst, npages,
> > > > > > DMA_BIDIRECTIONAL);
> > > > > > +	if (err)
> > > > > > +		goto err_finalize;
> > > > > > +
> > > > > > +	for (i = 0; i < npages; ++i)
> > > > > > +		pages[i] = migrate_pfn_to_page(src[i]);
> > > > > > +
> > > > > > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages,
> > > > > > dma_addr,
> > > > > > npages);
> > > > > > +	if (err)
> > > > > > +		goto err_finalize;
> > > > > > +
> > > > > > +err_finalize:
> > > > > > +	if (err)
> > > > > > +		drm_gpusvm_migration_put_pages(npages, dst);
> > > > > > +	migrate_device_pages(src, dst, npages);
> > > > > > +	migrate_device_finalize(src, dst, npages);
> > > > > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev,
> > > > > > dma_addr,
> > > > > > npages,
> > > > > > +				       DMA_BIDIRECTIONAL);
> > > > > > +err_free:
> > > > > > +	kvfree(buf);
> > > > > > +err_out:
> > > > > > +
> > > > > > +	return err;
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * __drm_gpusvm_migrate_to_sram - Migrate GPU SVM range to
> > > > > > SRAM
> > > > > > (internal)
> > > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > > + * @vas: Pointer to the VM area structure
> > > > > > + * @page: Pointer to the page for fault handling (can be
> > > > > > NULL)
> > > > > > + * @start: Start address of the migration range
> > > > > > + * @end: End address of the migration range
> > > > > > + *
> > > > > > + * This internal function performs the migration of the
> > > > > > specified
> > > > > > GPU SVM range
> > > > > > + * to SRAM. It sets up the migration, populates + dma maps
> > > > > > SRAM
> > > > > > PFNs, and
> > > > > > + * invokes the driver-specific operations for migration to
> > > > > > SRAM.
> > > > > > + *
> > > > > > + * Returns:
> > > > > > + * 0 on success, negative error code on failure.
> > > > > > + */
> > > > > > +static int __drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > > > > +					struct vm_area_struct *vas,
> > > > > > +					struct page *page,
> > > > > > +					u64 start, u64 end)
> > > > > > +{
> > > > > > +	struct migrate_vma migrate = {
> > > > > > +		.vma		= vas,
> > > > > > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > > > > > +		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
> > > > > > +		.fault_page	= page,
> > > > > > +	};
> > > > > > +	unsigned long npages;
> > > > > > +	struct page **pages;
> > > > > > +	dma_addr_t *dma_addr;
> > > > > > +	void *buf;
> > > > > > +	int i, err = 0;
> > > > > > +
> > > > > > +	mmap_assert_locked(gpusvm->mm);
> > > > > > +
> > > > > > +	/* Corner where VMA area struct has been partially
> > > > > > unmapped
> > > > > > */
> > > > > > +	if (start < vas->vm_start)
> > > > > > +		start = vas->vm_start;
> > > > > > +	if (end > vas->vm_end)
> > > > > > +		end = vas->vm_end;
> > > > > > +
> > > > > > +	migrate.start = start;
> > > > > > +	migrate.end = end;
> > > > > > +	npages = npages_in_range(start, end);
> > > > > > +
> > > > > > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) +
> > > > > > sizeof(*dma_addr) +
> > > > > > +		       sizeof(*pages), GFP_KERNEL);
> > > > > > +	if (!buf) {
> > > > > > +		err = -ENOMEM;
> > > > > > +		goto err_out;
> > > > > > +	}
> > > > > > +	dma_addr = buf + (2 * sizeof(*migrate.src) *
> > > > > > npages);
> > > > > > +	pages = buf + (2 * sizeof(*migrate.src) +
> > > > > > sizeof(*dma_addr))
> > > > > > * npages;
> > > > > > +
> > > > > > +	migrate.vma = vas;
> > > > > > +	migrate.src = buf;
> > > > > > +	migrate.dst = migrate.src + npages;
> > > > > > +
> > > > > > +	err = migrate_vma_setup(&migrate);
> > > > > > +	if (err)
> > > > > > +		goto err_free;
> > > > > > +
> > > > > > +	/* Raced with another CPU fault, nothing to do */
> > > > > > +	if (!migrate.cpages)
> > > > > > +		goto err_free;
> > > > > > +
> > > > > > +	err = drm_gpusvm_migrate_populate_sram_pfn(vas,
> > > > > > npages,
> > > > > > +						  
> > > > > > migrate.src,
> > > > > > migrate.dst,
> > > > > > +						   start);
> > > > > > +	if (err)
> > > > > > +		goto err_finalize;
> > > > > > +
> > > > > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > > > > > dma_addr,
> > > > > > +					   migrate.dst,
> > > > > > npages,
> > > > > > +					  
> > > > > > DMA_BIDIRECTIONAL);
> > > > > > +	if (err)
> > > > > > +		goto err_finalize;
> > > > > > +
> > > > > > +	for (i = 0; i < npages; ++i)
> > > > > > +		pages[i] = migrate_pfn_to_page(migrate.src[i]);
> > > > > > +
> > > > > > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages,
> > > > > > dma_addr,
> > > > > > npages);
> > > > > > +	if (err)
> > > > > > +		goto err_finalize;
> > > > > > +
> > > > > > +err_finalize:
> > > > > > +	if (err)
> > > > > > +		drm_gpusvm_migration_put_pages(npages,
> > > > > > migrate.dst);
> > > > > > +	migrate_vma_pages(&migrate);
> > > > > > +	migrate_vma_finalize(&migrate);
> > > > > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev,
> > > > > > dma_addr,
> > > > > > npages,
> > > > > > +				       DMA_BIDIRECTIONAL);
> > > > > > +err_free:
> > > > > > +	kvfree(buf);
> > > > > > +err_out:
> > > > > > +	mmap_assert_locked(gpusvm->mm);
> > > > > > +
> > > > > > +	return err;
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_migrate_to_sram - Migrate (evict) GPU SVM
> > > > > > range to
> > > > > > SRAM
> > > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > > + * @range: Pointer to the GPU SVM range structure
> > > > > > + * @ctx: GPU SVM context
> > > > > > + *
> > > > > > + * This function initiates the migration of the specified
> > > > > > GPU
> > > > > > SVM
> > > > > > range to
> > > > > > + * SRAM. It performs necessary checks and invokes the
> > > > > > internal
> > > > > > migration
> > > > > > + * function for actual migration.
> > > > > > + *
> > > > > > + * Returns:
> > > > > > + * 0 on success, negative error code on failure.
> > > > > > + */
> > > > > > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > > > > +			       struct drm_gpusvm_range
> > > > > > *range,
> > > > > > +			       const struct drm_gpusvm_ctx
> > > > > > *ctx)
> > > > > > +{
> > > > > > +	u64 start = range->va.start, end = range->va.end;
> > > > > > +	struct mm_struct *mm = gpusvm->mm;
> > > > > > +	struct vm_area_struct *vas;
> > > > > > +	int err;
> > > > > > +	bool retry = false;
> > > > > > +
> > > > > > +	if (!ctx->mmap_locked) {
> > > > > > +		if (!mmget_not_zero(mm)) {
> > > > > > +			err = -EFAULT;
> > > > > > +			goto err_out;
> > > > > > +		}
> > > > > > +		if (ctx->trylock_mmap) {
> > > > > > +			if (!mmap_read_trylock(mm)) {
> > > > > > +				err = drm_gpusvm_evict_to_sram(gpusvm, range);
> > > > > > +				goto err_mmput;
> > > > > > +			}
> > > > > > +		} else {
> > > > > > +			mmap_read_lock(mm);
> > > > > > +		}
> > > > > > +	}
> > > > > > +
> > > > > > +	mmap_assert_locked(mm);
> > > > > > +
> > > > > > +	/*
> > > > > > +	 * Loop required to find all VMA area structs for the corner case when
> > > > > > +	 * VRAM backing has been partially unmapped from MM's address space.
> > > > > > +	 */
> > > > > > +again:
> > > > > > +	vas = find_vma(mm, start);
> > > > > > +	if (!vas) {
> > > > > > +		if (!retry)
> > > > > > +			err = -ENOENT;
> > > > > > +		goto err_mmunlock;
> > > > > > +	}
> > > > > > +
> > > > > > +	if (end <= vas->vm_start || start >= vas->vm_end) {
> > > > > > +		if (!retry)
> > > > > > +			err = -EINVAL;
> > > > > > +		goto err_mmunlock;
> > > > > > +	}
> > > > > > +
> > > > > > +	err = __drm_gpusvm_migrate_to_sram(gpusvm, vas, NULL, start, end);
> > > > > > +	if (err)
> > > > > > +		goto err_mmunlock;
> > > > > > +
> > > > > > +	if (vas->vm_end < end) {
> > > > > > +		retry = true;
> > > > > > +		start = vas->vm_end;
> > > > > > +		goto again;
> > > > > > +	}
> > > > > > +
> > > > > > +	if (!ctx->mmap_locked) {
> > > > > > +		mmap_read_unlock(mm);
> > > > > > +		/*
> > > > > > +		 * Using mmput_async as this function can be called while
> > > > > > +		 * holding a dma-resv lock, and a final put can grab the mmap
> > > > > > +		 * lock, causing a lock inversion.
> > > > > > +		 */
> > > > > > +		mmput_async(mm);
> > > > > > +	}
> > > > > > +
> > > > > > +	return 0;
> > > > > > +
> > > > > > +err_mmunlock:
> > > > > > +	if (!ctx->mmap_locked)
> > > > > > +		mmap_read_unlock(mm);
> > > > > > +err_mmput:
> > > > > > +	if (!ctx->mmap_locked)
> > > > > > +		mmput_async(mm);
> > > > > > +err_out:
> > > > > > +	return err;
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_page_free - Put GPU SVM zone device data
> > > > > > associated
> > > > > > with a page
> > > > > > + * @page: Pointer to the page
> > > > > > + *
> > > > > > + * This function is a callback used to put the GPU SVM zone
> > > > > > device
> > > > > > data
> > > > > > + * associated with a page when it is being released.
> > > > > > + */
> > > > > > +static void drm_gpusvm_page_free(struct page *page)
> > > > > > +{
> > > > > > +	drm_gpusvm_zdd_put(page->zone_device_data);
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_migrate_to_ram - Migrate GPU SVM range to RAM
> > > > > > (page
> > > > > > fault handler)
> > > > > > + * @vmf: Pointer to the fault information structure
> > > > > > + *
> > > > > > + * This function is a page fault handler used to migrate a
> > > > > > GPU
> > > > > > SVM
> > > > > > range to RAM.
> > > > > > + * It retrieves the GPU SVM range information from the
> > > > > > faulting
> > > > > > page
> > > > > > and invokes
> > > > > > + * the internal migration function to migrate the range back
> > > > > > to
> > > > > > RAM.
> > > > > > + *
> > > > > > + * Returns:
> > > > > > + * VM_FAULT_SIGBUS on failure, 0 on success.
> > > > > > + */
> > > > > > +static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault *vmf)
> > > > > > +{
> > > > > > +	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
> > > > > > +	int err;
> > > > > > +
> > > > > > +	err = __drm_gpusvm_migrate_to_sram(zdd->range->gpusvm,
> > > > > > +					   vmf->vma, vmf->page,
> > > > > > +					   zdd->range->va.start,
> > > > > > +					   zdd->range->va.end);
> > > > > > +
> > > > > > +	return err ? VM_FAULT_SIGBUS : 0;
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_pagemap_ops - Device page map operations for
> > > > > > GPU
> > > > > > SVM
> > > > > > + */
> > > > > > +static const struct dev_pagemap_ops drm_gpusvm_pagemap_ops =
> > > > > > {
> > > > > > +	.page_free = drm_gpusvm_page_free,
> > > > > > +	.migrate_to_ram = drm_gpusvm_migrate_to_ram,
> > > > > > +};
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_pagemap_ops_get - Retrieve GPU SVM device page
> > > > > > map
> > > > > > operations
> > > > > > + *
> > > > > > + * Returns:
> > > > > > + * Pointer to the GPU SVM device page map operations structure.
> > > > > > + */
> > > > > > +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void)
> > > > > > +{
> > > > > > +	return &drm_gpusvm_pagemap_ops;
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_has_mapping - Check if GPU SVM has mapping for
> > > > > > the
> > > > > > given address range
> > > > > > + * @gpusvm: Pointer to the GPU SVM structure.
> > > > > > + * @start: Start address
> > > > > > + * @end: End address
> > > > > > + *
> > > > > > + * Returns:
> > > > > > + * True if GPU SVM has mapping, False otherwise
> > > > > > + */
> > > > > > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64
> > > > > > start,
> > > > > > u64 end)
> > > > > > +{
> > > > > > +	struct drm_gpusvm_notifier *notifier;
> > > > > > +
> > > > > > +	drm_gpusvm_for_each_notifier(notifier, gpusvm,
> > > > > > start,
> > > > > > end) {
> > > > > > +		struct drm_gpusvm_range *range = NULL;
> > > > > > +
> > > > > > +		drm_gpusvm_for_each_range(range, notifier,
> > > > > > start,
> > > > > > end)
> > > > > > +			return true;
> > > > > > +	}
> > > > > > +
> > > > > > +	return false;
> > > > > > +}
> > > > > > diff --git a/drivers/gpu/drm/xe/drm_gpusvm.h
> > > > > > b/drivers/gpu/drm/xe/drm_gpusvm.h
> > > > > > new file mode 100644
> > > > > > index 000000000000..0ea70f8534a8
> > > > > > --- /dev/null
> > > > > > +++ b/drivers/gpu/drm/xe/drm_gpusvm.h
> > > > > > @@ -0,0 +1,415 @@
> > > > > > +/* SPDX-License-Identifier: MIT */
> > > > > > +/*
> > > > > > + * Copyright © 2024 Intel Corporation
> > > > > > + */
> > > > > > +
> > > > > > +#ifndef __DRM_GPUSVM_H__
> > > > > > +#define __DRM_GPUSVM_H__
> > > > > > +
> > > > > > +#include <linux/kref.h>
> > > > > > +#include <linux/mmu_notifier.h>
> > > > > > +#include <linux/workqueue.h>
> > > > > > +
> > > > > > +struct dev_pagemap_ops;
> > > > > > +struct drm_device;
> > > > > > +struct drm_gpusvm;
> > > > > > +struct drm_gpusvm_notifier;
> > > > > > +struct drm_gpusvm_ops;
> > > > > > +struct drm_gpusvm_range;
> > > > > > +
> > > > > > +/**
> > > > > > + * struct drm_gpusvm_ops - Operations structure for GPU SVM
> > > > > > + *
> > > > > > + * This structure defines the operations for GPU Shared
> > > > > > Virtual
> > > > > > Memory (SVM).
> > > > > > + * These operations are provided by the GPU driver to manage
> > > > > > SVM
> > > > > > ranges and
> > > > > > + * perform operations such as migration between VRAM and
> > > > > > system
> > > > > > RAM.
> > > > > > + */
> > > > > > +struct drm_gpusvm_ops {
> > > > > > +	/**
> > > > > > +	 * @notifier_alloc: Allocate a GPU SVM notifier
> > > > > > (optional)
> > > > > > +	 *
> > > > > > +	 * This function shall allocate a GPU SVM notifier.
> > > > > > +	 *
> > > > > > +	 * Returns:
> > > > > > +	 * Pointer to the allocated GPU SVM notifier on
> > > > > > success,
> > > > > > NULL on failure.
> > > > > > +	 */
> > > > > > +	struct drm_gpusvm_notifier *(*notifier_alloc)(void);
> > > > > > +
> > > > > > +	/**
> > > > > > +	 * @notifier_free: Free a GPU SVM notifier
> > > > > > (optional)
> > > > > > +	 * @notifier: Pointer to the GPU SVM notifier to be
> > > > > > freed
> > > > > > +	 *
> > > > > > +	 * This function shall free a GPU SVM notifier.
> > > > > > +	 */
> > > > > > +	void (*notifier_free)(struct drm_gpusvm_notifier
> > > > > > *notifier);
> > > > > > +
> > > > > > +	/**
> > > > > > +	 * @range_alloc: Allocate a GPU SVM range (optional)
> > > > > > +	 * @gpusvm: Pointer to the GPU SVM
> > > > > > +	 *
> > > > > > +	 * This function shall allocate a GPU SVM range.
> > > > > > +	 *
> > > > > > +	 * Returns:
> > > > > > +	 * Pointer to the allocated GPU SVM range on
> > > > > > success,
> > > > > > NULL
> > > > > > on failure.
> > > > > > +	 */
> > > > > > +	struct drm_gpusvm_range *(*range_alloc)(struct
> > > > > > drm_gpusvm
> > > > > > *gpusvm);
> > > > > > +
> > > > > > +	/**
> > > > > > +	 * @range_free: Free a GPU SVM range (optional)
> > > > > > +	 * @range: Pointer to the GPU SVM range to be freed
> > > > > > +	 *
> > > > > > +	 * This function shall free a GPU SVM range.
> > > > > > +	 */
> > > > > > +	void (*range_free)(struct drm_gpusvm_range *range);
> > > > > > +
> > > > > > +	/**
> > > > > > +	 * @vram_release: Release VRAM allocation (optional)
> > > > > > +	 * @vram_allocation: Driver-private pointer to the
> > > > > > VRAM
> > > > > > allocation
> > > > > > +	 *
> > > > > > +	 * This function shall release VRAM allocation and
> > > > > > expects
> > > > > > to drop a
> > > > > > +	 * reference to VRAM allocation.
> > > > > > +	 */
> > > > > > +	void (*vram_release)(void *vram_allocation);
> > > > > > +
> > > > > > +	/**
> > > > > > +	 * @populate_vram_pfn: Populate VRAM PFN (required
> > > > > > for
> > > > > > migration)
> > > > > > +	 * @gpusvm: Pointer to the GPU SVM
> > > > > > +	 * @vram_allocation: Driver-private pointer to the
> > > > > > VRAM
> > > > > > allocation
> > > > > > +	 * @npages: Number of pages to populate
> > > > > > +	 * @pfn: Array of page frame numbers to populate
> > > > > > +	 *
> > > > > > +	 * This function shall populate VRAM page frame
> > > > > > numbers
> > > > > > (PFN).
> > > > > > +	 *
> > > > > > +	 * Returns:
> > > > > > +	 * 0 on success, a negative error code on failure.
> > > > > > +	 */
> > > > > > +	int (*populate_vram_pfn)(struct drm_gpusvm *gpusvm,
> > > > > > +				 void *vram_allocation,
> > > > > > +				 unsigned long npages,
> > > > > > +				 unsigned long *pfn);
> > > > > > +
> > > > > > +	/**
> > > > > > +	 * @copy_to_vram: Copy to VRAM (required for
> > > > > > migration)
> > > > > > +	 * @gpusvm: Pointer to the GPU SVM
> > > > > > +	 * @pages: Pointer to array of VRAM pages
> > > > > > (destination)
> > > > > > +	 * @dma_addr: Pointer to array of DMA addresses
> > > > > > (source)
> > > > > > +	 * @npages: Number of pages to copy
> > > > > > +	 *
> > > > > > +	 * This function shall copy pages to VRAM.
> > > > > > +	 *
> > > > > > +	 * Returns:
> > > > > > +	 * 0 on success, a negative error code on failure.
> > > > > > +	 */
> > > > > > +	int (*copy_to_vram)(struct drm_gpusvm *gpusvm,
> > > > > > +			    struct page **pages,
> > > > > > +			    dma_addr_t *dma_addr,
> > > > > > +			    unsigned long npages);
> > > > > > +
> > > > > > +	/**
> > > > > > +	 * @copy_to_sram: Copy to system RAM (required for
> > > > > > migration)
> > > > > > +	 * @gpusvm: Pointer to the GPU SVM
> > > > > > +	 * @pages: Pointer to array of VRAM pages (source)
> > > > > > +	 * @dma_addr: Pointer to array of DMA addresses
> > > > > > (destination)
> > > > > > +	 * @npages: Number of pages to copy
> > > > > > +	 *
> > > > > > +	 * This function shall copy pages to system RAM.
> > > > > > +	 *
> > > > > > +	 * Returns:
> > > > > > +	 * 0 on success, a negative error code on failure.
> > > > > > +	 */
> > > > > > +	int (*copy_to_sram)(struct drm_gpusvm *gpusvm,
> > > > > > +			    struct page **pages,
> > > > > > +			    dma_addr_t *dma_addr,
> > > > > > +			    unsigned long npages);
> > > > > > +
> > > > > > +	/**
> > > > > > +	 * @invalidate: Invalidate GPU SVM notifier
> > > > > > (required)
> > > > > > +	 * @gpusvm: Pointer to the GPU SVM
> > > > > > +	 * @notifier: Pointer to the GPU SVM notifier
> > > > > > +	 * @mmu_range: Pointer to the mmu_notifier_range
> > > > > > structure
> > > > > > +	 *
> > > > > > +	 * This function shall invalidate the GPU page
> > > > > > tables.
> > > > > > It
> > > > > > can safely
> > > > > > +	 * walk the notifier range RB tree/list in this
> > > > > > function.
> > > > > > Called while
> > > > > > +	 * holding the notifier lock.
> > > > > > +	 */
> > > > > > +	void (*invalidate)(struct drm_gpusvm *gpusvm,
> > > > > > +			   struct drm_gpusvm_notifier
> > > > > > *notifier,
> > > > > > +			   const struct mmu_notifier_range
> > > > > > *mmu_range);
> > > > > > +};
> > > > > > +
> > > > > > +/**
> > > > > > + * struct drm_gpusvm_notifier - Structure representing a GPU
> > > > > > SVM
> > > > > > notifier
> > > > > > + *
> > > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > > + * @notifier: MMU interval notifier
> > > > > > + * @interval: Interval for the notifier
> > > > > > + * @rb: Red-black tree node for the parent GPU SVM structure
> > > > > > notifier tree
> > > > > > + * @root: Cached root node of the RB tree containing ranges
> > > > > > + * @range_list: List head containing of ranges in the same
> > > > > > order
> > > > > > they appear in
> > > > > > + *              interval tree. This is useful to keep
> > > > > > iterating
> > > > > > ranges while
> > > > > > + *              doing modifications to RB tree.
> > > > > > + * @flags.removed: Flag indicating whether the MMU interval
> > > > > > notifier
> > > > > > has been
> > > > > > + *                 removed
> > > > > > + *
> > > > > > + * This structure represents a GPU SVM notifier.
> > > > > > + */
> > > > > > +struct drm_gpusvm_notifier {
> > > > > > +	struct drm_gpusvm *gpusvm;
> > > > > > +	struct mmu_interval_notifier notifier;
> > > > > > +	struct {
> > > > > > +		u64 start;
> > > > > > +		u64 end;
> > > > > > +	} interval;
> > > > > > +	struct {
> > > > > > +		struct rb_node node;
> > > > > > +		struct list_head entry;
> > > > > > +		u64 __subtree_last;
> > > > > > +	} rb;
> > > > > > +	struct rb_root_cached root;
> > > > > > +	struct list_head range_list;
> > > > > > +	struct {
> > > > > > +		u32 removed : 1;
> > > > > > +	} flags;
> > > > > > +};
> > > > > > +
> > > > > > +/**
> > > > > > + * struct drm_gpusvm_range - Structure representing a GPU
> > > > > > SVM
> > > > > > range
> > > > > > + *
> > > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > > + * @notifier: Pointer to the GPU SVM notifier
> > > > > > + * @refcount: Reference count for the range
> > > > > > + * @rb: Red-black tree node for the parent GPU SVM notifier
> > > > > > structure range tree
> > > > > > + * @va: Virtual address range
> > > > > > + * @notifier_seq: Notifier sequence number of the range's
> > > > > > pages
> > > > > > + * @pages: Pointer to the array of pages (if backing store
> > > > > > is in
> > > > > > VRAM)
> > > > > > + * @dma_addr: DMA address array (if backing store is SRAM
> > > > > > and
> > > > > > DMA
> > > > > > mapped)
> > > > > > + * @vram_allocation: Driver-private pointer to the VRAM
> > > > > > allocation
> > > > > > + * @order: Order of dma mapping. i.e. PAGE_SIZE << order is
> > > > > > mapping
> > > > > > size
> > > > > > + * @flags.migrate_vram: Flag indicating whether the range
> > > > > > can be
> > > > > > migrated to VRAM
> > > > > > + * @flags.unmapped: Flag indicating if the range has been
> > > > > > unmapped
> > > > > > + * @flags.partial_unmap: Flag indicating if the range has
> > > > > > been
> > > > > > partially unmapped
> > > > > > + * @flags.has_vram_pages: Flag indicating if the range has
> > > > > > vram
> > > > > > pages
> > > > > > + * @flags.has_dma_mapping: Flag indicating if the range has
> > > > > > a
> > > > > > DMA
> > > > > > mapping
> > > > > > + * @flags.kfree_mapping: Flag indicating @dma_addr is a
> > > > > > compact
> > > > > > allocation based
> > > > > > + *                       on @order which releases via kfree
> > > > > > + *
> > > > > > + * This structure represents a GPU SVM range used for
> > > > > > tracking
> > > > > > memory ranges
> > > > > > + * mapped in a DRM device.
> > > > > > + */
> > > > > > +struct drm_gpusvm_range {
> > > > > > +	struct drm_gpusvm *gpusvm;
> > > > > > +	struct drm_gpusvm_notifier *notifier;
> > > > > > +	struct kref refcount;
> > > > > > +	struct {
> > > > > > +		struct rb_node node;
> > > > > > +		struct list_head entry;
> > > > > > +		u64 __subtree_last;
> > > > > > +	} rb;
> > > > > > +	struct {
> > > > > > +		u64 start;
> > > > > > +		u64 end;
> > > > > > +	} va;
> > > > > > +	unsigned long notifier_seq;
> > > > > > +	union {
> > > > > > +		struct page **pages;
> > > > > > +		dma_addr_t *dma_addr;
> > > > > > +	};
> > > > > > +	void *vram_allocation;
> > > > > > +	u16 order;
> > > > > > +	struct {
> > > > > > +		/* All flags below must be set upon creation
> > > > > > */
> > > > > > +		u16 migrate_vram : 1;
> > > > > > +		/* All flags below must be set / cleared
> > > > > > under
> > > > > > notifier lock */
> > > > > > +		u16 unmapped : 1;
> > > > > > +		u16 partial_unmap : 1;
> > > > > > +		u16 has_vram_pages : 1;
> > > > > > +		u16 has_dma_mapping : 1;
> > > > > > +		u16 kfree_mapping : 1;
> > > > > > +	} flags;
> > > > > > +};
> > > > > > +
> > > > > > +/**
> > > > > > + * struct drm_gpusvm - GPU SVM structure
> > > > > > + *
> > > > > > + * @name: Name of the GPU SVM
> > > > > > + * @drm: Pointer to the DRM device structure
> > > > > > + * @mm: Pointer to the mm_struct for the address space
> > > > > > + * @device_private_page_owner: Device private pages owner
> > > > > > + * @mm_start: Start address of GPU SVM
> > > > > > + * @mm_range: Range of the GPU SVM
> > > > > > + * @notifier_size: Size of individual notifiers
> > > > > > + * @ops: Pointer to the operations structure for GPU SVM
> > > > > > + * @chunk_sizes: Pointer to the array of chunk sizes used in
> > > > > > range
> > > > > > allocation.
> > > > > > + *               Entries should be powers of 2 in descending
> > > > > > order.
> > > > > > + * @num_chunks: Number of chunks
> > > > > > + * @notifier_lock: Read-write semaphore for protecting
> > > > > > notifier
> > > > > > operations
> > > > > > + * @zdd_wq: Workqueue for deferred work on zdd destruction
> > > > > > + * @root: Cached root node of the Red-Black tree containing
> > > > > > GPU
> > > > > > SVM
> > > > > > notifiers
> > > > > > + * @notifier_list: list head containing of notifiers in the
> > > > > > same
> > > > > > order they
> > > > > > + *                 appear in interval tree. This is useful
> > > > > > to
> > > > > > keep
> > > > > > iterating
> > > > > > + *                 notifiers while doing modifications to RB
> > > > > > tree.
> > > > > > + *
> > > > > > + * This structure represents a GPU SVM (Shared Virtual
> > > > > > Memory)
> > > > > > used
> > > > > > for tracking
> > > > > > + * memory ranges mapped in a DRM (Direct Rendering Manager)
> > > > > > device.
> > > > > > + *
> > > > > > + * No reference counting is provided, as this is expected to
> > > > > > be
> > > > > > embedded in the
> > > > > > + * driver VM structure along with the struct drm_gpuvm,
> > > > > > which
> > > > > > handles reference
> > > > > > + * counting.
> > > > > > + */
> > > > > > +struct drm_gpusvm {
> > > > > > +	const char *name;
> > > > > > +	struct drm_device *drm;
> > > > > > +	struct mm_struct *mm;
> > > > > > +	void *device_private_page_owner;
> > > > > > +	u64 mm_start;
> > > > > > +	u64 mm_range;
> > > > > > +	u64 notifier_size;
> > > > > > +	const struct drm_gpusvm_ops *ops;
> > > > > > +	const u64 *chunk_sizes;
> > > > > > +	int num_chunks;
> > > > > > +	struct rw_semaphore notifier_lock;
> > > > > > +	struct workqueue_struct *zdd_wq;
> > > > > > +	struct rb_root_cached root;
> > > > > > +	struct list_head notifier_list;
> > > > > > +};
> > > > > > +
> > > > > > +/**
> > > > > > + * struct drm_gpusvm_ctx - DRM GPU SVM context
> > > > > > + *
> > > > > > + * @mmap_locked: mmap lock is locked
> > > > > > + * @trylock_mmap: trylock mmap lock, used to avoid locking inversions
> > > > > > + *                (e.g. dma-resv -> mmap lock)
> > > > > > + * @in_notifier: entering from a MMU notifier
> > > > > > + * @read_only: operating on read-only memory
> > > > > > + * @vram_possible: possible to use VRAM
> > > > > > + * @prefault: prefault pages
> > > > > > + *
> > > > > > + * Context in which DRM GPUSVM is operating (i.e. user arguments).
> > > > > > + */
> > > > > > +struct drm_gpusvm_ctx {
> > > > > > +	u32 mmap_locked :1;
> > > > > > +	u32 trylock_mmap :1;
> > > > > > +	u32 in_notifier :1;
> > > > > > +	u32 read_only :1;
> > > > > > +	u32 vram_possible :1;
> > > > > > +	u32 prefault :1;
> > > > > > +};
> > > > > > +
> > > > > > +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> > > > > > +		    const char *name, struct drm_device
> > > > > > *drm,
> > > > > > +		    struct mm_struct *mm, void
> > > > > > *device_private_page_owner,
> > > > > > +		    u64 mm_start, u64 mm_range, u64
> > > > > > notifier_size,
> > > > > > +		    const struct drm_gpusvm_ops *ops,
> > > > > > +		    const u64 *chunk_sizes, int num_chunks);
> > > > > > +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm);
> > > > > > +void drm_gpusvm_free(struct drm_gpusvm *gpusvm);
> > > > > > +
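
For illustration, a minimal init-time sketch assuming a driver-private my_vm
structure that embeds the drm_gpusvm; the bounds, chunk sizes and ops table
below are invented for the example:

	/* Hypothetical driver init sketch (illustrative only). */
	static const u64 my_chunk_sizes[] = { SZ_2M, SZ_64K, SZ_4K };	/* descending */

	static int my_svm_init(struct my_vm *vm, struct drm_device *drm)
	{
		return drm_gpusvm_init(&vm->gpusvm, "my-gpusvm", drm, current->mm,
				       vm /* device_private_page_owner */,
				       0, 1ull << 47 /* mm_start, mm_range */,
				       SZ_512M /* notifier_size */,
				       &my_gpusvm_ops, my_chunk_sizes,
				       ARRAY_SIZE(my_chunk_sizes));
	}
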
> > > > > > +struct drm_gpusvm_range *
> > > > > > +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm,
> > > > > > u64
> > > > > > fault_addr,
> > > > > > +				u64 gpuva_start, u64
> > > > > > gpuva_end,
> > > > > > +				const struct drm_gpusvm_ctx
> > > > > > *ctx);
> > > > > > +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> > > > > > +			     struct drm_gpusvm_range
> > > > > > *range);
> > > > > > +
> > > > > > +struct drm_gpusvm_range *
> > > > > > +drm_gpusvm_range_get(struct drm_gpusvm_range *range);
> > > > > > +void drm_gpusvm_range_put(struct drm_gpusvm_range *range);
> > > > > > +
> > > > > > +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> > > > > > +				  struct drm_gpusvm_range
> > > > > > *range);
> > > > > > +
> > > > > > +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> > > > > > +			       struct drm_gpusvm_range
> > > > > > *range,
> > > > > > +			       const struct drm_gpusvm_ctx
> > > > > > *ctx);
> > > > > > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > > > > > +				  struct drm_gpusvm_range
> > > > > > *range,
> > > > > > +				  const struct
> > > > > > drm_gpusvm_ctx
> > > > > > *ctx);
> > > > > > +
> > > > > > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > > > > > +			       struct drm_gpusvm_range
> > > > > > *range,
> > > > > > +			       void *vram_allocation,
> > > > > > +			       const struct drm_gpusvm_ctx
> > > > > > *ctx);
> > > > > > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > > > > +			       struct drm_gpusvm_range
> > > > > > *range,
> > > > > > +			       const struct drm_gpusvm_ctx
> > > > > > *ctx);
> > > > > > +
> > > > > > +const struct dev_pagemap_ops
> > > > > > *drm_gpusvm_pagemap_ops_get(void);
> > > > > > +
> > > > > > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64
> > > > > > start,
> > > > > > u64 end);
> > > > > > +
> > > > > > +struct drm_gpusvm_range *
> > > > > > +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier,
> > > > > > u64
> > > > > > start, u64 end);
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_notifier_lock - Lock GPU SVM notifier
> > > > > > + * @gpusvm__: Pointer to the GPU SVM structure.
> > > > > > + *
> > > > > > + * Abstract client usage GPU SVM notifier lock, take lock
> > > > > > + */
> > > > > > +#define drm_gpusvm_notifier_lock(gpusvm__)	\
> > > > > > +	down_read(&(gpusvm__)->notifier_lock)
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_notifier_unlock - Unlock GPU SVM notifier
> > > > > > + * @gpusvm__: Pointer to the GPU SVM structure.
> > > > > > + *
> > > > > > + * Abstract client usage GPU SVM notifier lock, drop lock
> > > > > > + */
> > > > > > +#define drm_gpusvm_notifier_unlock(gpusvm__)	\
> > > > > > +	up_read(&(gpusvm__)->notifier_lock)
> > > > > > +
> > > > > > +/**
> > > > > > + * __drm_gpusvm_range_next - Get the next GPU SVM range in
> > > > > > the
> > > > > > list
> > > > > > + * @range: a pointer to the current GPU SVM range
> > > > > > + *
> > > > > > + * Return: A pointer to the next drm_gpusvm_range if
> > > > > > available,
> > > > > > or
> > > > > > NULL if the
> > > > > > + *         current range is the last one or if the input
> > > > > > range
> > > > > > is
> > > > > > NULL.
> > > > > > + */
> > > > > > +static inline struct drm_gpusvm_range *
> > > > > > +__drm_gpusvm_range_next(struct drm_gpusvm_range *range)
> > > > > > +{
> > > > > > +	if (range && !list_is_last(&range->rb.entry,
> > > > > > +				   &range->notifier->range_list))
> > > > > > +		return list_next_entry(range, rb.entry);
> > > > > > +
> > > > > > +	return NULL;
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_for_each_range - Iterate over GPU SVM ranges
> > > > > > in a
> > > > > > notifier
> > > > > > + * @range__: Iterator variable for the ranges. If set, it
> > > > > > indicates
> > > > > > the start of
> > > > > > + *	     the iterator. If NULL, call
> > > > > > drm_gpusvm_range_find()
> > > > > > to
> > > > > > get the range.
> > > > > > + * @notifier__: Pointer to the GPU SVM notifier
> > > > > > + * @start__: Start address of the range
> > > > > > + * @end__: End address of the range
> > > > > > + *
> > > > > > + * This macro is used to iterate over GPU SVM ranges in a
> > > > > > notifier.
> > > > > > It is safe
> > > > > > + * to use while holding the driver SVM lock or the notifier
> > > > > > lock.
> > > > > > + */
> > > > > > +#define drm_gpusvm_for_each_range(range__, notifier__, start__, end__)	\
> > > > > > +	for ((range__) = (range__) ?:					\
> > > > > > +	     drm_gpusvm_range_find((notifier__), (start__), (end__));	\
> > > > > > +	     (range__) && (range__->va.start < (end__));		\
> > > > > > +	     (range__) = __drm_gpusvm_range_next(range__))
> > > > > > +
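
For illustration, a typical traversal under the notifier lock; my_zap_range()
is a hypothetical driver helper, not part of this patch:

	/* Illustrative only: walk all ranges overlapping [start, end). */
	static void my_zap_ranges(struct drm_gpusvm *gpusvm,
				  struct drm_gpusvm_notifier *notifier,
				  u64 start, u64 end)
	{
		struct drm_gpusvm_range *range = NULL;

		drm_gpusvm_notifier_lock(gpusvm);
		drm_gpusvm_for_each_range(range, notifier, start, end)
			my_zap_range(range);	/* hypothetical driver helper */
		drm_gpusvm_notifier_unlock(gpusvm);
	}
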
> > > > > > +/**
> > > > > > + * drm_gpusvm_range_set_unmapped - Mark a GPU SVM range as
> > > > > > unmapped
> > > > > > + * @range: Pointer to the GPU SVM range structure.
> > > > > > + * @mmu_range: Pointer to the MMU notifier range structure.
> > > > > > + *
> > > > > > + * This function marks a GPU SVM range as unmapped and sets
> > > > > > the
> > > > > > partial_unmap flag
> > > > > > + * if the range partially falls within the provided MMU
> > > > > > notifier
> > > > > > range.
> > > > > > + */
> > > > > > +static inline void
> > > > > > +drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range *range,
> > > > > > +			      const struct mmu_notifier_range *mmu_range)
> > > > > > +{
> > > > > > +	lockdep_assert_held_write(&range->gpusvm->notifier_lock);
> > > > > > +
> > > > > > +	range->flags.unmapped = true;
> > > > > > +	if (range->va.start < mmu_range->start ||
> > > > > > +	    range->va.end > mmu_range->end)
> > > > > > +		range->flags.partial_unmap = true;
> > > > > > +}
> > > > > > +
> > > > > > +#endif /* __DRM_GPUSVM_H__ */
> > > > > 
> > > 
>
Thomas Hellström Sept. 2, 2024, 9:45 a.m. UTC | #23
Hi, Matt.

On Fri, 2024-08-30 at 13:47 +0000, Matthew Brost wrote:
> On Fri, Aug 30, 2024 at 11:57:33AM +0200, Thomas Hellström wrote:
> > Hi, Matthew,
> > 
> > Agreed the below might not be important just now, but some ideas:
> > 
> > On Thu, 2024-08-29 at 20:56 +0000, Matthew Brost wrote:
> > > Issues with removing a SVM range:
> > > 
> > > - Xe bind code stores invalidation / present state in VMA, this would
> > >   need to be moved to the radix tree. I have a Jira open for that work
> > >   which I believe other developers are going to own.
> > 
> > Yeah, although we shouldn't *design* around xe bind-code and page-
> > table
> > code shortcomings.
> > 
> 
> I'm thinking this one certainly should be fixed sooner rather than
> later which would be helpful.
> 
> But let's also consider the case where we get a bunch of individual page
> invalidates serially for an entire range (I can't remember when this
> happens but I have seen it in my testing, will look into this more to
> figure out exactly when). If we invalidate 1 page at a time in the radix
> tree, each invalidation could potentially result in a TLB invalidation
> interaction with the hardware in cases where larger GPU pages are not
> being used. The TLB invalidation is going to be vastly slower than any
> CPU operation (e.g. RB search, radix tree walk). If we key on a range and
> invalidate the entire range once on the first invalidation, this may end
> up being significantly faster.
> 
> Above is pure speculation though, a lot of what both of us are saying
> is... So another reason I'd like to get apps running to do profiling.
> It would be nice to make design decisions based on data, not speculation.

Well nothing would stop you from adding a configurable invalidation
granularity, even with a radix-tree based approach. You'd just pad the
invalidation range to match the granularity.
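
To make the padding idea concrete, a minimal sketch assuming a
driver-chosen, power-of-two invalidation granularity; the helper name is
hypothetical and not part of this series:

#include <linux/align.h>
#include <linux/types.h>

/*
 * Hypothetical sketch: widen an invalidation to a configurable,
 * power-of-two granularity before zapping GPU PTEs, so a burst of
 * single-page CPU invalidations collapses into one GPU TLB flush.
 */
static void example_pad_invalidation(u64 *start, u64 *end,
				     u64 invalidate_granularity)
{
	*start = ALIGN_DOWN(*start, invalidate_granularity);
	*end = ALIGN(*end, invalidate_granularity);
}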

> 
> > 
> > > - Where would the dma mapping / device pages be stored?
> > > 	- In the radix tree? What if ATS is enabled? We don't have a
> > > 	  driver owned radix tree. How do we reasonably connect a
> > > 	  driver owned radix tree to a common GPUSVM layer?
> > 
> > With ATS you mean IOMMU SVA, right? I think we could assume that
> > any
> > user of this code also has a gpu page-table since otherwise they
> > couldn't be using VRAM and a simpler solution would be in place. 
> > 
> 
> Fair point.
> 
> > But to that specific question, drm_gpusvm state would live in a
> > drm_gpusvm radix tree and driver-specific stuff in the driver tree.
> > A helper based approach would then call drm_gpusvm_unmap_dma(range),
> > whereas a middle layer would just traverse the tree and unmap.
> > 
> 
> Let me consider this. Open to all options.
> 
> > > 	- In the notifier? What if the notifier is sparsely
> > > 	  populated? We would be wasting huge amounts of memory.
> > > 	  What if the notifier is configured to span the entire
> > > 	  virtual address space?
> > 
> > Let's assume you use a fake page-table like in xe_pt_walk.c as your
> > "radix tree", adapted to relevant page-sizes, sparsity is not a
> > problem.
> > 
> 
> Ok, makes sense I think.
> 
> > > - How does the garbage collector work? We can't allocate memory in
> > >   the notifier, so we don't have anything to add to the garbage
> > >   collector. We can't directly modify page tables given the locks
> > >   you need are in the path of reclaim.
> > 
> > The garbage collector would operate on the whole invalidated range.
> > In the case of xe, upon zapping under reclaim you mark individual
> > page-table bos that are to be removed as "invalid", and the garbage
> > collector walks the range removing the "invalid" entries. Subsequent
> > (re-)binding avoids the "invalid" entries (perhaps even helps
> > removing them) and can thus race with the garbage collector. Hence,
> > any ranges implied by the page-table code are eliminated.
> > 
> 
> This is pretty much what I came up with too if we didn't have an SVM
> range.
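
A heavily hedged sketch of that scheme, with hypothetical names rather
than actual Xe code: the reclaim-safe invalidation path only flags
entries, and a later worker walks the range and frees whatever was
flagged.

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/types.h>

struct example_pt_entry {
	struct list_head link;
	bool invalid;			/* set under the notifier lock */
};

/* Called from the invalidation path: no allocation, no heavy locks. */
static void example_mark_invalid(struct example_pt_entry *e)
{
	e->invalid = true;		/* GPU PTEs zapped elsewhere */
}

/* Garbage collector / re-bind path: walk the range, drop flagged entries. */
static void example_garbage_collect(struct list_head *entries)
{
	struct example_pt_entry *e, *tmp;

	list_for_each_entry_safe(e, tmp, entries, link) {
		if (!e->invalid)
			continue;
		list_del(&e->link);
		kfree(e);
	}
}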
> 
> > > - How do we deal with fault storms (e.g. tons of faults hitting the
> > >   same SVM range in a row)? Without an SVM range there is no easy
> > >   way to know if the mapping is valid so that the GPU page fault
> > >   handler can be short-circuited.
> > 
> > Perhaps look at the page-table tree and check whether the gpu_pte
> > causing the fault is valid.
> > 
> 
> Came up with the same thing.
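
A minimal sketch of that short circuit, with hypothetical names standing
in for the driver's page-table walk and fault path:

#include <linux/types.h>

struct example_vm;	/* hypothetical driver VM */

/* hypothetical: walk the GPU page-table tree for @addr */
static bool example_gpu_pte_valid(struct example_vm *vm, u64 addr);
/* hypothetical: the full get-pages + bind path */
static int example_fault_and_bind(struct example_vm *vm, u64 addr);

static int example_handle_gpu_fault(struct example_vm *vm, u64 fault_addr)
{
	/*
	 * Fault-storm short circuit: if a racing fault already bound this
	 * address, the gpu_pte is valid and there is nothing left to do.
	 */
	if (example_gpu_pte_valid(vm, fault_addr))
		return 0;

	return example_fault_and_bind(vm, fault_addr);
}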
> 
> > > - Do we have notifier seqno for every PTE?
> > 
> > I'd say no. With this approach it makes sense to have a wide
> > notifier. The seqno now only affects binding of new gpu_ptes, so the
> > problem with a wide notifier becomes that if invalidation occurs to
> > *any* part of the notifier while we're in the read section during
> > binding, we need to
> 
> I have avoided this with drm_gpusvm_range_pages_valid. This isn't just
> an optimization; it is actually required for the 2-tile case to be
> able to safely know when dma pages can be unmapped (i.e. you can't dma
> unmap pages if either tile has a valid mapping).
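
For illustration only, a sketch of that multi-GPU rule with hypothetical
names (not the actual drm_gpusvm_range_pages_valid()): the dma mapping
may only be torn down once no tile still has a valid binding.

#include <linux/types.h>

struct example_range {
	bool tile_valid[2];	/* hypothetical per-tile binding state */
};

/* True while any tile still has a valid GPU binding of the range. */
static bool example_range_pages_valid(const struct example_range *range)
{
	return range->tile_valid[0] || range->tile_valid[1];
}

/* Callers would only dma-unmap once example_range_pages_valid() is false. */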

OK, I still need to read up on that..

Thanks,
Thomas


> 
> Matt
> 
> > rerun the binding. Adding more notifiers to mitigate that would be
> > to optimize faulting performance over core invalidation performance,
> > which Jason asked us to avoid.
> > 
> > /Thomas
> > 
> > 
> >
Thomas Hellström Sept. 2, 2024, 9:57 a.m. UTC | #24
Hi, Matt

On Fri, 2024-08-30 at 13:58 +0000, Matthew Brost wrote:
> > 
> > So I specifically asked Jason about the performance problem of using
> > many notifiers vs using a single one, and he responded that the
> > problem is slowing down the core mm on invalidations if the RB tree
> > gets too large to walk. He also mentioned that we should consider
> > core invalidation performance before faulting performance, because
> > the latter is so slow anyway that we must have the driver stack avoid
> > gpu faults using user-space prefetching and similar techniques.
> > 
> > In particular, inserting into and removing from the mmu_interval tree
> > is not costly in terms of locking, but because of correctness
> > requirements insertion might block on ongoing validations.
> > 
> > So basically what I'm trying to say is that as long as we're using
> > SVM ranges in the way we do (I'm not saying that is wrong at this
> > point,
> 
> If you have been following the mmap write discussions at all, one
> potential fix for removing that hack is a per-range migrate mutex [1].
> This also needs to be considered when / if we try to drop the range
> concept.

Still need to read up on that, and for migration I think the situation
is a bit different, pls see below.

> 
> [1]
> https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1111296
> 
> > and I agree that could be fine-tuned later), the benefit of an extra
> > notifier layer is questionable compared to directly inserting the
> > ranges into the mmu_interval_tree. So hence my question: given those
> > considerations, why this additional layer?
> > 
> 
> One thing we could do fairly easily, if you think this is
> questionable, is have an option to size the notifier to the range size
> and wire this to the notifier size modparam [2]. Again, once we have
> apps running it would be fairly easy to profile this and see if there
> is a benefit to this large notifier scheme. If there really is none,
> perhaps then we consider ripping this out.
> 
> [2]
> https://patchwork.freedesktop.org/patch/611007/?series=137870&rev=1
> 
> Matt
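
A rough sketch of what wiring that to a modparam could look like, with
hypothetical names (the actual knob is the one proposed in [2]); a value
of 0 here would mean "size the notifier to the SVM range".

#include <linux/module.h>
#include <linux/sizes.h>
#include <linux/types.h>

static unsigned int example_notifier_size_mb = 512;
module_param_named(notifier_size_mb, example_notifier_size_mb, uint, 0600);
MODULE_PARM_DESC(notifier_size_mb,
		 "Notifier granularity in MiB, 0 = match the SVM range size");

static u64 example_notifier_size(u64 range_size)
{
	if (!example_notifier_size_mb)
		return range_size;

	return (u64)example_notifier_size_mb * SZ_1M;
}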

At this point I'm mostly trying to understand the reasoning behind the
various design choices and why data structures look like they do.

But also considering that the page-table mapping and invalidation is
per (vm, gpu_vm) pair and migration is per (vm, device (device group))
pair, I have really been advocating for sorting out the page-table
mapping and invalidation first, ending up with something that is
lightweight and sufficient for igpu systems, and avoiding conflating
possible page-table range requirements with migration range
requirements, which might be completely different.

I think the former can be done completely without ranges, having
configurable prefaulting-, invalidation- and notifier granularity,
whereas the latter also introduces migration granularity.

/Thomas
Daniel Vetter Sept. 2, 2024, 11:29 a.m. UTC | #25
On Thu, Aug 29, 2024 at 04:40:47PM +0000, Matthew Brost wrote:
> On Wed, Aug 28, 2024 at 06:25:18PM +0200, Daniel Vetter wrote:
> > On Wed, Aug 28, 2024 at 03:43:48PM +0000, Matthew Brost wrote:
> > > On Wed, Aug 28, 2024 at 04:46:24PM +0200, Christian König wrote:
> > > > Am 28.08.24 um 16:31 schrieb Daniel Vetter:
> > > > > On Tue, Aug 27, 2024 at 07:48:38PM -0700, Matthew Brost wrote:
> > > > > > +		if (!ctx->mmap_locked) {
> > > > > > +			/*
> > > > > > +			 * XXX: HMM locking document indicates only a read-lock
> > > > > > +			 * is required but there apears to be a window between
> > > > > > +			 * the MMU_NOTIFY_MIGRATE event triggered in a CPU fault
> > > > > > +			 * via migrate_vma_setup and the pages actually moving
> > > > > > +			 * in migrate_vma_finalize in which this code can grab
> > > > > > +			 * garbage pages. Grabbing the write-lock if the range
> > > > > > +			 * is attached to vram appears to protect against this
> > > > > > +			 * race.
> > > > > > +			 */
> > > 
> > > Thanks the comments, replying to both of you inline.
> > > 
> > > > > This one is really scary, since it means the entire migrate pte trickery
> > > > > is essentially completely busted. Grabbing the mmap write lock just means
> > > > > you block out pretty much everything interesting from concurrently
> > > > > happening.
> > > > > 
> > > > > My gut feeling says we need to figure out what's happening here, because
> > > > > this looks a bit too fundamental to me.
> > > 
> > > I agree. I haven’t looked into this issue for a couple of months but
> > > really need to understand what is going on.
> > > 
> > > I should have mentioned this in the cover letter: the goal of this
> > > series was to produce something for review that is stable and supports
> > > UMDs/user applications. It was not intended to be presented as a final
> > > solution. This issue certainly falls into the category of "needs to be
> > > understood and requires a proper fix."
> > > 
> > > One open question I have is whether the test case that triggers this
> > > issue is even defined behavior. The test creates concurrent access
> > > between the GPU and CPU to the same memory address, resulting in GPU and
> > > CPU faults racing against each other. It’s possible that this is
> > > undefined behavior, so data corruption might be acceptable—i.e., the
> > > kernel can’t crash, but incorrect results might be permissible.
> > 
> > Yes this is supposed to be defined, at least from an hmm pov. And core mm/
> > is ridiculous in how many races it allows, especially around concurrent
> > fault handling.
> > 
> > It is ofc really slow if every fault results in a migration, but that's a
> > matter of the application setting stupid memory migration hints for the
> > gpu.
> > 
> > > e.g. This is the only defined usage model:
> > > 
> > > alloc_memory();
> > > start_compute_kernel();
> > > sync_on_compute_kernel_completion();
> > > read_memory();
> > > 
> > > Hopefully, in the next week or so, I'll be heavily engaging with the UMD
> > > teams. Development can then start, and applications will be running soon
> > > after. This will allow us to address issues like this, collect data on
> > > memory usage, and verify some of the assumptions I've made, such as
> > > optimizing for 2M+ allocations.
> > > 
> > > > 
> > > > I think I have at least a high level understanding what's going on here,
> > > > Felix and especially Philip should know more of the details.
> > > > 
> > > 
> > > I meant to reach out to AMD for issues like this. So, Felix
> > > (felix.kuehling@amd.com) and Philip (Philip.Yang@amd.com) would be good
> > > contacts?
> > > 
> > > > In general grabbing the mm_lock to protect PTEs from changing is completely
> > > > nonsense. The mm_lock is to protect the VMAs and *not* the PTEs!
> > > > 
> > > 
> > > Thanks for the hint. I believe that in the AMD implementation, I noticed
> > > some additional locks for migration, which might be how you mitigated
> > > this issue.
> > 
> > Yeah, so in general hold mmap_reading is indeed pure magic thinking for
> > preventing pte changes, like Christian points out. It doesn't stop
> > invalidates, and with the per vma locking it also doesn't stop new valid
> 
> Invalidations happening in parallel to migrations, get pages, or
> bindings should be fine. The notifier lock usage should make all of this
> safe.
> 
> > ptes from being inserted at least for anon memory.
> > 
> > Except migration pte entries that point at vram pages are special, and are
> > _only_ resolved while holding mmap_read. Which means holding mmap_write
> > for the case of looking up our own vram pages with hmm_range_fault
> > actually prevents issues. And so this duct-tape of holding mmap_write very
> > much looks like a working hack to plug any races against concurrently
> > ongoing migrations to system memory due to cpu faults.
> > 
> 
> Agree holding mmap_write is a hack. Looking at AMD's 'To serialize
> concurrent migrations or validations of the same range, the
> prange->migrate_mutex must be held.', seemingly I could drop the mmap
> write lock abuse and use something like this here. This would likely
> be an inner lock of the mmap lock.
> 
> Does this seem like a reasonable thing to explore?

Meta-comment: Since I've learned as I typed replies there's a bit a mess
in my suggestions/questions ...

See the other replies, I think prange->migrate_mutex doesn't work because
we need a lock on the physical storage, not on the virtual range. Because
thanks to mremap (if I understand that thing right) and fork those do not
need to line up, or be unique.

The other thing I only slowly realized is that the code in migrate.c
forces full page/folio lock semantics on use already, like a migration
done by core mm code between sram and sram. So I /think/ that if we follow
the rules correctly, the page/folio lock should be enough to sufficiently
serialize migrations.

But then there's amd's migration_mutex, and your migration vs fault
troubles, and I'm honestly not really understanding those races ..

So my current thinking is that page/folio lock should be enough, or we
have bugs or wrong understanding of how this should work.
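
If that reading of migrate.c is right, a minimal illustration of the
point, with the actual work elided (trylock_page()/unlock_page() are the
real mm APIs, the rest is a hypothetical sketch, not migrate.c):
whichever migration path locks the page first wins, and the loser either
blocks or skips the page.

#include <linux/mm.h>
#include <linux/pagemap.h>

/*
 * Hedged sketch only: the page lock, not a per-range mutex, is the
 * serialization point for migrating a given physical page.
 */
static bool example_try_claim_for_migration(struct page *page)
{
	if (!trylock_page(page))
		return false;	/* a concurrent migration owns this page */

	/* ... install migration entries, copy, remap (elided) ... */

	unlock_page(page);
	return true;
}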

> > An even more fun corner case is multiple concurrent cpu faults on the same
> > vram page. fork gets you that, or maybe a bit more reasonable mremap with
> 
> My understanding is memory shared between processes cannot be migrated
> due to current limitations of the migrate layer.
> 
> e.g. mmap called with MAP_SHARED is not eligible for migration.

Hm, where is that limitation? All the code I've seen suggests it all
should work, and the only memory not eligible for hmm/zone_device
migration is pagecache. But any kind of anon memory, whether private (i.e.
cow on fork) or shared should work?

If that's not the case then yeah a _lot_ of what I said is just plain
wrong I think ...

> Unsure what the behavior is if fork() is called on a process with memory
> in VRAM and the child tries to access it. Maybe fork() is different than
> MAP_SHARED in that parent / child processes can share memory in VRAM?
> 
> Also really I'm unsure what would happen if user space calls fork() and
> has an Xe VM open and tries to use it too. Before commenting more on
> this, I need to play around with test cases like this to educate myself.
> 
> My current test doesn't use mremap, agree that would be a good add.
> Again, before commenting more here, let me add more test cases to
> educate myself.

Yeah I definitely need more learning too.
> 
> > MREMAP_DONTUNMAP | MREMAP_MAYMOVE. I think just hammering the same va with
> > multiple threads alone isn't enough, it's better to have a private va for
> 
> I do have test cases where multiple CPU faults from threads hammer the
> same memory. Found some bugs in my initial code but as far as I can tell
> multiple CPU faults in parallel occur in my testing and do work.
> 
> > each thread pointing at the same anon memory page, so that you can get
> 
> You are losing me here - 'private va for each thread pointing at the
> same anon memory page'. This is a fork() case where the parent allocates
> memory and then all children try to read in parallel?

shared anon memory should be a thing, at least to my knowledge. Which
might be wrong ...

> > more parallel faults due to finely grained pte locking.
> > 
> > Would be a good testcase to add, if you don't have it yet.
> >
> 
> See above, agree these are good test cases which I haven't considered and
> will expand my suite to include these. Thanks for the tip - IMO testing
> is as important as or even more important than the KMD design, and I
> need to ensure I have all possible uses covered.
> 
> > > I must say it is a bit unfortunate that the HMM locking documentation
> > > doesn’t mention this. I believe the documentation needs additional
> > > information, which I can add once we finalize the solution.
> > 
> > Yeah, at least from my very cursory lock you don't have enough locking.
> > I've written an in-depth reply to patch 23 with the high-level summary of
> > my thoughts.
> >
> 
> Will look and reply there.

Cheers, Sima

> 
> Matt
> 
> > Cheers, Sima
> > 
> > > 
> > > Matt 
> > > 
> > > > Even with the write side of the mm_lock taken it is perfectly possible that
> > > > PTE change. It's just less likely.
> > > > 
> > > > We run into multiple issues before we figured out this important distinction
> > > > as well.
> > > > 
> > > > Christian.
> > > > 
> > > > > -Sima
> > > > > 
> > > > > 
> > > > > > +			if (vram_pages)
> > > > > > +				mmap_write_lock(mm);
> > > > > > +			else
> > > > > > +				mmap_read_lock(mm);
> > > > > > +		}
> > > > > > +		err = hmm_range_fault(&hmm_range);
> > > > > > +		if (!ctx->mmap_locked) {
> > > > > > +			if (vram_pages)
> > > > > > +				mmap_write_unlock(mm);
> > > > > > +			else
> > > > > > +				mmap_read_unlock(mm);
> > > > > > +		}
> > > > > > +
> > > > > > +		if (err == -EBUSY) {
> > > > > > +			if (time_after(jiffies, timeout))
> > > > > > +				break;
> > > > > > +
> > > > > > +			hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> > > > > > +			continue;
> > > > > > +		}
> > > > > > +		break;
> > > > > > +	}
> > > > > > +	if (!ctx->mmap_locked)
> > > > > > +		mmput(mm);
> > > > > > +	if (err)
> > > > > > +		goto err_free;
> > > > > > +
> > > > > > +	pages = (struct page **)pfns;
> > > > > > +
> > > > > > +	if (ctx->prefault) {
> > > > > > +		range->pages = pages;
> > > > > > +		goto set_seqno;
> > > > > > +	}
> > > > > > +
> > > > > > +map_pages:
> > > > > > +	if (is_device_private_page(hmm_pfn_to_page(pfns[0]))) {
> > > > > > +		WARN_ON_ONCE(!range->vram_allocation);
> > > > > > +
> > > > > > +		for (i = 0; i < npages; ++i) {
> > > > > > +			pages[i] = hmm_pfn_to_page(pfns[i]);
> > > > > > +
> > > > > > +			if (WARN_ON_ONCE(!is_device_private_page(pages[i]))) {
> > > > > > +				err = -EOPNOTSUPP;
> > > > > > +				goto err_free;
> > > > > > +			}
> > > > > > +		}
> > > > > > +
> > > > > > +		/* Do not race with notifier unmapping pages */
> > > > > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > > > > +		range->flags.has_vram_pages = true;
> > > > > > +		range->pages = pages;
> > > > > > +		if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
> > > > > > +			err = -EAGAIN;
> > > > > > +			__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > > > > +		}
> > > > > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > > > > +	} else {
> > > > > > +		dma_addr_t *dma_addr = (dma_addr_t *)pfns;
> > > > > > +
> > > > > > +		for_each_dma_page(i, j, npages, order) {
> > > > > > +			if (WARN_ON_ONCE(i && order !=
> > > > > > +					 hmm_pfn_to_map_order(pfns[i]))) {
> > > > > > +				err = -EOPNOTSUPP;
> > > > > > +				npages = i;
> > > > > > +				goto err_unmap;
> > > > > > +			}
> > > > > > +			order = hmm_pfn_to_map_order(pfns[i]);
> > > > > > +
> > > > > > +			pages[j] = hmm_pfn_to_page(pfns[i]);
> > > > > > +			if (WARN_ON_ONCE(is_zone_device_page(pages[j]))) {
> > > > > > +				err = -EOPNOTSUPP;
> > > > > > +				npages = i;
> > > > > > +				goto err_unmap;
> > > > > > +			}
> > > > > > +
> > > > > > +			set_page_dirty_lock(pages[j]);
> > > > > > +			mark_page_accessed(pages[j]);
> > > > > > +
> > > > > > +			dma_addr[j] = dma_map_page(gpusvm->drm->dev,
> > > > > > +						   pages[j], 0,
> > > > > > +						   PAGE_SIZE << order,
> > > > > > +						   DMA_BIDIRECTIONAL);
> > > > > > +			if (dma_mapping_error(gpusvm->drm->dev, dma_addr[j])) {
> > > > > > +				err = -EFAULT;
> > > > > > +				npages = i;
> > > > > > +				goto err_unmap;
> > > > > > +			}
> > > > > > +		}
> > > > > > +
> > > > > > +		/* Huge pages, reduce memory footprint */
> > > > > > +		if (order) {
> > > > > > +			dma_addr = kmalloc_array(j, sizeof(*dma_addr),
> > > > > > +						 GFP_KERNEL);
> > > > > > +			if (dma_addr) {
> > > > > > +				for (i = 0; i < j; ++i)
> > > > > > +					dma_addr[i] = (dma_addr_t)pfns[i];
> > > > > > +				kvfree(pfns);
> > > > > > +				kfree_mapping = true;
> > > > > > +			} else {
> > > > > > +				dma_addr = (dma_addr_t *)pfns;
> > > > > > +			}
> > > > > > +		}
> > > > > > +
> > > > > > +		/* Do not race with notifier unmapping pages */
> > > > > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > > > > +		range->order = order;
> > > > > > +		range->flags.kfree_mapping = kfree_mapping;
> > > > > > +		range->flags.has_dma_mapping = true;
> > > > > > +		range->dma_addr = dma_addr;
> > > > > > +		range->vram_allocation = NULL;
> > > > > > +		if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
> > > > > > +			err = -EAGAIN;
> > > > > > +			__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > > > > +		}
> > > > > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > > > > +	}
> > > > > > +
> > > > > > +	if (err == -EAGAIN)
> > > > > > +		goto retry;
> > > > > > +set_seqno:
> > > > > > +	range->notifier_seq = hmm_range.notifier_seq;
> > > > > > +
> > > > > > +	return 0;
> > > > > > +
> > > > > > +err_unmap:
> > > > > > +	for_each_dma_page(i, j, npages, order)
> > > > > > +		dma_unmap_page(gpusvm->drm->dev,
> > > > > > +			       (dma_addr_t)pfns[j],
> > > > > > +			       PAGE_SIZE << order, DMA_BIDIRECTIONAL);
> > > > > > +err_free:
> > > > > > +	if (alloc_pfns)
> > > > > > +		kvfree(pfns);
> > > > > > +err_out:
> > > > > > +	return err;
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_range_unmap_pages - Unmap pages associated with a GPU SVM range
> > > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > > + * @range: Pointer to the GPU SVM range structure
> > > > > > + * @ctx: GPU SVM context
> > > > > > + *
> > > > > > + * This function unmaps pages associated with a GPU SVM range. If @in_notifier
> > > > > > + * is set, it is assumed that gpusvm->notifier_lock is held in write mode; if it
> > > > > > + * is clear, it acquires gpusvm->notifier_lock in read mode. Must be called on
> > > > > > + * each GPU SVM range attached to notifier in gpusvm->ops->invalidate for IOMMU
> > > > > > + * security model.
> > > > > > + */
> > > > > > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > > > > > +				  struct drm_gpusvm_range *range,
> > > > > > +				  const struct drm_gpusvm_ctx *ctx)
> > > > > > +{
> > > > > > +	if (ctx->in_notifier)
> > > > > > +		lockdep_assert_held_write(&gpusvm->notifier_lock);
> > > > > > +	else
> > > > > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > > > > +
> > > > > > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > > > > +
> > > > > > +	if (!ctx->in_notifier)
> > > > > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_migration_put_page - Put a migration page
> > > > > > + * @page: Pointer to the page to put
> > > > > > + *
> > > > > > + * This function unlocks and puts a page.
> > > > > > + */
> > > > > > +static void drm_gpusvm_migration_put_page(struct page *page)
> > > > > > +{
> > > > > > +	unlock_page(page);
> > > > > > +	put_page(page);
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_migration_put_pages - Put migration pages
> > > > > > + * @npages: Number of pages
> > > > > > + * @migrate_pfn: Array of migrate page frame numbers
> > > > > > + *
> > > > > > + * This function puts an array of pages.
> > > > > > + */
> > > > > > +static void drm_gpusvm_migration_put_pages(unsigned long npages,
> > > > > > +					   unsigned long *migrate_pfn)
> > > > > > +{
> > > > > > +	unsigned long i;
> > > > > > +
> > > > > > +	for (i = 0; i < npages; ++i) {
> > > > > > +		if (!migrate_pfn[i])
> > > > > > +			continue;
> > > > > > +
> > > > > > +		drm_gpusvm_migration_put_page(migrate_pfn_to_page(migrate_pfn[i]));
> > > > > > +		migrate_pfn[i] = 0;
> > > > > > +	}
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_get_vram_page - Get a reference to a VRAM page
> > > > > > + * @page: Pointer to the page
> > > > > > + * @zdd: Pointer to the GPU SVM zone device data
> > > > > > + *
> > > > > > + * This function associates the given page with the specified GPU SVM zone
> > > > > > + * device data and initializes it for zone device usage.
> > > > > > + */
> > > > > > +static void drm_gpusvm_get_vram_page(struct page *page,
> > > > > > +				     struct drm_gpusvm_zdd *zdd)
> > > > > > +{
> > > > > > +	page->zone_device_data = drm_gpusvm_zdd_get(zdd);
> > > > > > +	zone_device_page_init(page);
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_migrate_map_pages() - Map migration pages for GPU SVM migration
> > > > > > + * @dev: The device for which the pages are being mapped
> > > > > > + * @dma_addr: Array to store DMA addresses corresponding to mapped pages
> > > > > > + * @migrate_pfn: Array of migrate page frame numbers to map
> > > > > > + * @npages: Number of pages to map
> > > > > > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > > > > > + *
> > > > > > + * This function maps pages of memory for migration usage in GPU SVM. It
> > > > > > + * iterates over each page frame number provided in @migrate_pfn, maps the
> > > > > > + * corresponding page, and stores the DMA address in the provided @dma_addr
> > > > > > + * array.
> > > > > > + *
> > > > > > + * Return: 0 on success, -EFAULT if an error occurs during mapping.
> > > > > > + */
> > > > > > +static int drm_gpusvm_migrate_map_pages(struct device *dev,
> > > > > > +					dma_addr_t *dma_addr,
> > > > > > +					long unsigned int *migrate_pfn,
> > > > > > +					unsigned long npages,
> > > > > > +					enum dma_data_direction dir)
> > > > > > +{
> > > > > > +	unsigned long i;
> > > > > > +
> > > > > > +	for (i = 0; i < npages; ++i) {
> > > > > > +		struct page *page = migrate_pfn_to_page(migrate_pfn[i]);
> > > > > > +
> > > > > > +		if (!page)
> > > > > > +			continue;
> > > > > > +
> > > > > > +		if (WARN_ON_ONCE(is_zone_device_page(page)))
> > > > > > +			return -EFAULT;
> > > > > > +
> > > > > > +		dma_addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir);
> > > > > > +		if (dma_mapping_error(dev, dma_addr[i]))
> > > > > > +			return -EFAULT;
> > > > > > +	}
> > > > > > +
> > > > > > +	return 0;
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously mapped for GPU SVM migration
> > > > > > + * @dev: The device for which the pages were mapped
> > > > > > + * @dma_addr: Array of DMA addresses corresponding to mapped pages
> > > > > > + * @npages: Number of pages to unmap
> > > > > > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > > > > > + *
> > > > > > + * This function unmaps previously mapped pages of memory for GPU Shared Virtual
> > > > > > + * Memory (SVM). It iterates over each DMA address provided in @dma_addr, checks
> > > > > > + * if it's valid and not already unmapped, and unmaps the corresponding page.
> > > > > > + */
> > > > > > +static void drm_gpusvm_migrate_unmap_pages(struct device *dev,
> > > > > > +					   dma_addr_t *dma_addr,
> > > > > > +					   unsigned long npages,
> > > > > > +					   enum dma_data_direction dir)
> > > > > > +{
> > > > > > +	unsigned long i;
> > > > > > +
> > > > > > +	for (i = 0; i < npages; ++i) {
> > > > > > +		if (!dma_addr[i] || dma_mapping_error(dev, dma_addr[i]))
> > > > > > +			continue;
> > > > > > +
> > > > > > +		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
> > > > > > +	}
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_migrate_to_vram - Migrate GPU SVM range to VRAM
> > > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > > + * @range: Pointer to the GPU SVM range structure
> > > > > > + * @vram_allocation: Driver-private pointer to the VRAM allocation. The caller
> > > > > > + *                   should hold a reference to the VRAM allocation, which
> > > > > > + *                   should be dropped via ops->vram_allocation or upon the
> > > > > > + *                   failure of this function.
> > > > > > + * @ctx: GPU SVM context
> > > > > > + *
> > > > > > + * This function migrates the specified GPU SVM range to VRAM. It performs the
> > > > > > + * necessary setup and invokes the driver-specific operations for migration to
> > > > > > + * VRAM. Upon successful return, @vram_allocation can safely reference @range
> > > > > > + * until ops->vram_release is called which only upon successful return.
> > > > > > + *
> > > > > > + * Returns:
> > > > > > + * 0 on success, negative error code on failure.
> > > > > > + */
> > > > > > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > > > > > +			       struct drm_gpusvm_range *range,
> > > > > > +			       void *vram_allocation,
> > > > > > +			       const struct drm_gpusvm_ctx *ctx)
> > > > > > +{
> > > > > > +	u64 start = range->va.start, end = range->va.end;
> > > > > > +	struct migrate_vma migrate = {
> > > > > > +		.start		= start,
> > > > > > +		.end		= end,
> > > > > > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > > > > > +		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
> > > > > > +	};
> > > > > > +	struct mm_struct *mm = gpusvm->mm;
> > > > > > +	unsigned long i, npages = npages_in_range(start, end);
> > > > > > +	struct vm_area_struct *vas;
> > > > > > +	struct drm_gpusvm_zdd *zdd = NULL;
> > > > > > +	struct page **pages;
> > > > > > +	dma_addr_t *dma_addr;
> > > > > > +	void *buf;
> > > > > > +	int err;
> > > > > > +
> > > > > > +	if (!range->flags.migrate_vram)
> > > > > > +		return -EINVAL;
> > > > > > +
> > > > > > +	if (!gpusvm->ops->populate_vram_pfn || !gpusvm->ops->copy_to_vram ||
> > > > > > +	    !gpusvm->ops->copy_to_sram)
> > > > > > +		return -EOPNOTSUPP;
> > > > > > +
> > > > > > +	if (!ctx->mmap_locked) {
> > > > > > +		if (!mmget_not_zero(mm)) {
> > > > > > +			err = -EFAULT;
> > > > > > +			goto err_out;
> > > > > > +		}
> > > > > > +		mmap_write_lock(mm);
> > > > > > +	}
> > > > > > +
> > > > > > +	mmap_assert_locked(mm);
> > > > > > +
> > > > > > +	vas = vma_lookup(mm, start);
> > > > > > +	if (!vas) {
> > > > > > +		err = -ENOENT;
> > > > > > +		goto err_mmunlock;
> > > > > > +	}
> > > > > > +
> > > > > > +	if (end > vas->vm_end || start < vas->vm_start) {
> > > > > > +		err = -EINVAL;
> > > > > > +		goto err_mmunlock;
> > > > > > +	}
> > > > > > +
> > > > > > +	if (!vma_is_anonymous(vas)) {
> > > > > > +		err = -EBUSY;
> > > > > > +		goto err_mmunlock;
> > > > > > +	}
> > > > > > +
> > > > > > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
> > > > > > +		       sizeof(*pages), GFP_KERNEL);
> > > > > > +	if (!buf) {
> > > > > > +		err = -ENOMEM;
> > > > > > +		goto err_mmunlock;
> > > > > > +	}
> > > > > > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > > > > > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
> > > > > > +
> > > > > > +	zdd = drm_gpusvm_zdd_alloc(range);
> > > > > > +	if (!zdd) {
> > > > > > +		err = -ENOMEM;
> > > > > > +		goto err_free;
> > > > > > +	}
> > > > > > +
> > > > > > +	migrate.vma = vas;
> > > > > > +	migrate.src = buf;
> > > > > > +	migrate.dst = migrate.src + npages;
> > > > > > +
> > > > > > +	err = migrate_vma_setup(&migrate);
> > > > > > +	if (err)
> > > > > > +		goto err_free;
> > > > > > +
> > > > > > +	/*
> > > > > > +	 * FIXME: Below cases, !migrate.cpages and migrate.cpages != npages, not
> > > > > > +	 * always an error. Need to revisit possible cases and how to handle. We
> > > > > > +	 * could prefault on migrate.cpages != npages via hmm_range_fault.
> > > > > > +	 */
> > > > > > +
> > > > > > +	if (!migrate.cpages) {
> > > > > > +		err = -EFAULT;
> > > > > > +		goto err_free;
> > > > > > +	}
> > > > > > +
> > > > > > +	if (migrate.cpages != npages) {
> > > > > > +		err = -EBUSY;
> > > > > > +		goto err_finalize;
> > > > > > +	}
> > > > > > +
> > > > > > +	err = gpusvm->ops->populate_vram_pfn(gpusvm, vram_allocation, npages,
> > > > > > +					     migrate.dst);
> > > > > > +	if (err)
> > > > > > +		goto err_finalize;
> > > > > > +
> > > > > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> > > > > > +					   migrate.src, npages, DMA_TO_DEVICE);
> > > > > > +	if (err)
> > > > > > +		goto err_finalize;
> > > > > > +
> > > > > > +	for (i = 0; i < npages; ++i) {
> > > > > > +		struct page *page = pfn_to_page(migrate.dst[i]);
> > > > > > +
> > > > > > +		pages[i] = page;
> > > > > > +		migrate.dst[i] = migrate_pfn(migrate.dst[i]);
> > > > > > +		drm_gpusvm_get_vram_page(page, zdd);
> > > > > > +	}
> > > > > > +
> > > > > > +	err = gpusvm->ops->copy_to_vram(gpusvm, pages, dma_addr, npages);
> > > > > > +	if (err)
> > > > > > +		goto err_finalize;
> > > > > > +
> > > > > > +	/* Upon success bind vram allocation to range and zdd */
> > > > > > +	range->vram_allocation = vram_allocation;
> > > > > > +	WRITE_ONCE(zdd->vram_allocation, vram_allocation);	/* Owns ref */
> > > > > > +
> > > > > > +err_finalize:
> > > > > > +	if (err)
> > > > > > +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> > > > > > +	migrate_vma_pages(&migrate);
> > > > > > +	migrate_vma_finalize(&migrate);
> > > > > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
> > > > > > +				       DMA_TO_DEVICE);
> > > > > > +err_free:
> > > > > > +	if (zdd)
> > > > > > +		drm_gpusvm_zdd_put(zdd);
> > > > > > +	kvfree(buf);
> > > > > > +err_mmunlock:
> > > > > > +	if (!ctx->mmap_locked) {
> > > > > > +		mmap_write_unlock(mm);
> > > > > > +		mmput(mm);
> > > > > > +	}
> > > > > > +err_out:
> > > > > > +	return err;
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_migrate_populate_sram_pfn - Populate SRAM PFNs for a VM area
> > > > > > + * @vas: Pointer to the VM area structure, can be NULL
> > > > > > + * @npages: Number of pages to populate
> > > > > > + * @src_mpfn: Source array of migrate PFNs
> > > > > > + * @mpfn: Array of migrate PFNs to populate
> > > > > > + * @addr: Start address for PFN allocation
> > > > > > + *
> > > > > > + * This function populates the SRAM migrate page frame numbers (PFNs) for the
> > > > > > + * specified VM area structure. It allocates and locks pages in the VM area for
> > > > > > + * SRAM usage. If vas is non-NULL use alloc_page_vma for allocation, if NULL use
> > > > > > + * alloc_page for allocation.
> > > > > > + *
> > > > > > + * Returns:
> > > > > > + * 0 on success, negative error code on failure.
> > > > > > + */
> > > > > > +static int drm_gpusvm_migrate_populate_sram_pfn(struct vm_area_struct *vas,
> > > > > > +						unsigned long npages,
> > > > > > +						unsigned long *src_mpfn,
> > > > > > +						unsigned long *mpfn, u64 addr)
> > > > > > +{
> > > > > > +	unsigned long i;
> > > > > > +
> > > > > > +	for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
> > > > > > +		struct page *page;
> > > > > > +
> > > > > > +		if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
> > > > > > +			continue;
> > > > > > +
> > > > > > +		if (vas)
> > > > > > +			page = alloc_page_vma(GFP_HIGHUSER, vas, addr);
> > > > > > +		else
> > > > > > +			page = alloc_page(GFP_HIGHUSER);
> > > > > > +
> > > > > > +		if (!page)
> > > > > > +			return -ENOMEM;
> > > > > > +
> > > > > > +		lock_page(page);
> > > > > > +		mpfn[i] = migrate_pfn(page_to_pfn(page));
> > > > > > +	}
> > > > > > +
> > > > > > +	return 0;
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_evict_to_sram - Evict GPU SVM range to SRAM
> > > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > > + * @range: Pointer to the GPU SVM range structure
> > > > > > + *
> > > > > > + * Similar to __drm_gpusvm_migrate_to_sram but does not require mmap lock and
> > > > > > + * migration done via migrate_device_* functions. Fallback path as it is
> > > > > > + * preferred to issue migrations with mmap lock.
> > > > > > + *
> > > > > > + * Returns:
> > > > > > + * 0 on success, negative error code on failure.
> > > > > > + */
> > > > > > +static int drm_gpusvm_evict_to_sram(struct drm_gpusvm *gpusvm,
> > > > > > +				    struct drm_gpusvm_range *range)
> > > > > > +{
> > > > > > +	unsigned long npages;
> > > > > > +	struct page **pages;
> > > > > > +	unsigned long *src, *dst;
> > > > > > +	dma_addr_t *dma_addr;
> > > > > > +	void *buf;
> > > > > > +	int i, err = 0;
> > > > > > +
> > > > > > +	npages = npages_in_range(range->va.start, range->va.end);
> > > > > > +
> > > > > > +	buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr) +
> > > > > > +		       sizeof(*pages), GFP_KERNEL);
> > > > > > +	if (!buf) {
> > > > > > +		err = -ENOMEM;
> > > > > > +		goto err_out;
> > > > > > +	}
> > > > > > +	src = buf;
> > > > > > +	dst = buf + (sizeof(*src) * npages);
> > > > > > +	dma_addr = buf + (2 * sizeof(*src) * npages);
> > > > > > +	pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) * npages;
> > > > > > +
> > > > > > +	err = gpusvm->ops->populate_vram_pfn(gpusvm, range->vram_allocation,
> > > > > > +					     npages, src);
> > > > > > +	if (err)
> > > > > > +		goto err_free;
> > > > > > +
> > > > > > +	err = migrate_device_vma_range(gpusvm->mm,
> > > > > > +				       gpusvm->device_private_page_owner, src,
> > > > > > +				       npages, range->va.start);
> > > > > > +	if (err)
> > > > > > +		goto err_free;
> > > > > > +
> > > > > > +	err = drm_gpusvm_migrate_populate_sram_pfn(NULL, npages, src, dst, 0);
> > > > > > +	if (err)
> > > > > > +		goto err_finalize;
> > > > > > +
> > > > > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> > > > > > +					   dst, npages, DMA_BIDIRECTIONAL);
> > > > > > +	if (err)
> > > > > > +		goto err_finalize;
> > > > > > +
> > > > > > +	for (i = 0; i < npages; ++i)
> > > > > > +		pages[i] = migrate_pfn_to_page(src[i]);
> > > > > > +
> > > > > > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr, npages);
> > > > > > +	if (err)
> > > > > > +		goto err_finalize;
> > > > > > +
> > > > > > +err_finalize:
> > > > > > +	if (err)
> > > > > > +		drm_gpusvm_migration_put_pages(npages, dst);
> > > > > > +	migrate_device_pages(src, dst, npages);
> > > > > > +	migrate_device_finalize(src, dst, npages);
> > > > > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
> > > > > > +				       DMA_BIDIRECTIONAL);
> > > > > > +err_free:
> > > > > > +	kvfree(buf);
> > > > > > +err_out:
> > > > > > +
> > > > > > +	return err;
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * __drm_gpusvm_migrate_to_sram - Migrate GPU SVM range to SRAM (internal)
> > > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > > + * @vas: Pointer to the VM area structure
> > > > > > + * @page: Pointer to the page for fault handling (can be NULL)
> > > > > > + * @start: Start address of the migration range
> > > > > > + * @end: End address of the migration range
> > > > > > + *
> > > > > > + * This internal function performs the migration of the specified GPU SVM range
> > > > > > + * to SRAM. It sets up the migration, populates + dma maps SRAM PFNs, and
> > > > > > + * invokes the driver-specific operations for migration to SRAM.
> > > > > > + *
> > > > > > + * Returns:
> > > > > > + * 0 on success, negative error code on failure.
> > > > > > + */
> > > > > > +static int __drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > > > > +					struct vm_area_struct *vas,
> > > > > > +					struct page *page,
> > > > > > +					u64 start, u64 end)
> > > > > > +{
> > > > > > +	struct migrate_vma migrate = {
> > > > > > +		.vma		= vas,
> > > > > > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > > > > > +		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
> > > > > > +		.fault_page	= page,
> > > > > > +	};
> > > > > > +	unsigned long npages;
> > > > > > +	struct page **pages;
> > > > > > +	dma_addr_t *dma_addr;
> > > > > > +	void *buf;
> > > > > > +	int i, err = 0;
> > > > > > +
> > > > > > +	mmap_assert_locked(gpusvm->mm);
> > > > > > +
> > > > > > +	/* Corner where VMA area struct has been partially unmapped */
> > > > > > +	if (start < vas->vm_start)
> > > > > > +		start = vas->vm_start;
> > > > > > +	if (end > vas->vm_end)
> > > > > > +		end = vas->vm_end;
> > > > > > +
> > > > > > +	migrate.start = start;
> > > > > > +	migrate.end = end;
> > > > > > +	npages = npages_in_range(start, end);
> > > > > > +
> > > > > > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
> > > > > > +		       sizeof(*pages), GFP_KERNEL);
> > > > > > +	if (!buf) {
> > > > > > +		err = -ENOMEM;
> > > > > > +		goto err_out;
> > > > > > +	}
> > > > > > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > > > > > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
> > > > > > +
> > > > > > +	migrate.vma = vas;
> > > > > > +	migrate.src = buf;
> > > > > > +	migrate.dst = migrate.src + npages;
> > > > > > +
> > > > > > +	err = migrate_vma_setup(&migrate);
> > > > > > +	if (err)
> > > > > > +		goto err_free;
> > > > > > +
> > > > > > +	/* Raced with another CPU fault, nothing to do */
> > > > > > +	if (!migrate.cpages)
> > > > > > +		goto err_free;
> > > > > > +
> > > > > > +	err = drm_gpusvm_migrate_populate_sram_pfn(vas, npages,
> > > > > > +						   migrate.src, migrate.dst,
> > > > > > +						   start);
> > > > > > +	if (err)
> > > > > > +		goto err_finalize;
> > > > > > +
> > > > > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> > > > > > +					   migrate.dst, npages,
> > > > > > +					   DMA_BIDIRECTIONAL);
> > > > > > +	if (err)
> > > > > > +		goto err_finalize;
> > > > > > +
> > > > > > +	for (i = 0; i < npages; ++i)
> > > > > > +		pages[i] = migrate_pfn_to_page(migrate.src[i]);
> > > > > > +
> > > > > > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr, npages);
> > > > > > +	if (err)
> > > > > > +		goto err_finalize;
> > > > > > +
> > > > > > +err_finalize:
> > > > > > +	if (err)
> > > > > > +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> > > > > > +	migrate_vma_pages(&migrate);
> > > > > > +	migrate_vma_finalize(&migrate);
> > > > > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
> > > > > > +				       DMA_BIDIRECTIONAL);
> > > > > > +err_free:
> > > > > > +	kvfree(buf);
> > > > > > +err_out:
> > > > > > +	mmap_assert_locked(gpusvm->mm);
> > > > > > +
> > > > > > +	return err;
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_migrate_to_sram - Migrate (evict) GPU SVM range to SRAM
> > > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > > + * @range: Pointer to the GPU SVM range structure
> > > > > > + * @ctx: GPU SVM context
> > > > > > + *
> > > > > > + * This function initiates the migration of the specified GPU SVM range to
> > > > > > + * SRAM. It performs necessary checks and invokes the internal migration
> > > > > > + * function for actual migration.
> > > > > > + *
> > > > > > + * Returns:
> > > > > > + * 0 on success, negative error code on failure.
> > > > > > + */
> > > > > > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > > > > +			       struct drm_gpusvm_range *range,
> > > > > > +			       const struct drm_gpusvm_ctx *ctx)
> > > > > > +{
> > > > > > +	u64 start = range->va.start, end = range->va.end;
> > > > > > +	struct mm_struct *mm = gpusvm->mm;
> > > > > > +	struct vm_area_struct *vas;
> > > > > > +	int err;
> > > > > > +	bool retry = false;
> > > > > > +
> > > > > > +	if (!ctx->mmap_locked) {
> > > > > > +		if (!mmget_not_zero(mm)) {
> > > > > > +			err = -EFAULT;
> > > > > > +			goto err_out;
> > > > > > +		}
> > > > > > +		if (ctx->trylock_mmap) {
> > > > > > +			if (!mmap_read_trylock(mm))  {
> > > > > > +				err = drm_gpusvm_evict_to_sram(gpusvm, range);
> > > > > > +				goto err_mmput;
> > > > > > +			}
> > > > > > +		} else {
> > > > > > +			mmap_read_lock(mm);
> > > > > > +		}
> > > > > > +	}
> > > > > > +
> > > > > > +	mmap_assert_locked(mm);
> > > > > > +
> > > > > > +	/*
> > > > > > +	 * Loop required to find all VMA area structs for the corner case when
> > > > > > +	 * VRAM backing has been partially unmapped from MM's address space.
> > > > > > +	 */
> > > > > > +again:
> > > > > > +	vas = find_vma(mm, start);
> > > > > > +	if (!vas) {
> > > > > > +		if (!retry)
> > > > > > +			err = -ENOENT;
> > > > > > +		goto err_mmunlock;
> > > > > > +	}
> > > > > > +
> > > > > > +	if (end <= vas->vm_start || start >= vas->vm_end) {
> > > > > > +		if (!retry)
> > > > > > +			err = -EINVAL;
> > > > > > +		goto err_mmunlock;
> > > > > > +	}
> > > > > > +
> > > > > > +	err = __drm_gpusvm_migrate_to_sram(gpusvm, vas, NULL, start, end);
> > > > > > +	if (err)
> > > > > > +		goto err_mmunlock;
> > > > > > +
> > > > > > +	if (vas->vm_end < end) {
> > > > > > +		retry = true;
> > > > > > +		start = vas->vm_end;
> > > > > > +		goto again;
> > > > > > +	}
> > > > > > +
> > > > > > +	if (!ctx->mmap_locked) {
> > > > > > +		mmap_read_unlock(mm);
> > > > > > +		/*
> > > > > > +		 * Using mmput_async as this function can be called while
> > > > > > +		 * holding a dma-resv lock, and a final put can grab the mmap
> > > > > > +		 * lock, causing a lock inversion.
> > > > > > +		 */
> > > > > > +		mmput_async(mm);
> > > > > > +	}
> > > > > > +
> > > > > > +	return 0;
> > > > > > +
> > > > > > +err_mmunlock:
> > > > > > +	if (!ctx->mmap_locked)
> > > > > > +		mmap_read_unlock(mm);
> > > > > > +err_mmput:
> > > > > > +	if (!ctx->mmap_locked)
> > > > > > +		mmput_async(mm);
> > > > > > +err_out:
> > > > > > +	return err;
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_page_free - Put GPU SVM zone device data associated with a page
> > > > > > + * @page: Pointer to the page
> > > > > > + *
> > > > > > + * This function is a callback used to put the GPU SVM zone device data
> > > > > > + * associated with a page when it is being released.
> > > > > > + */
> > > > > > +static void drm_gpusvm_page_free(struct page *page)
> > > > > > +{
> > > > > > +	drm_gpusvm_zdd_put(page->zone_device_data);
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_migrate_to_ram - Migrate GPU SVM range to RAM (page fault handler)
> > > > > > + * @vmf: Pointer to the fault information structure
> > > > > > + *
> > > > > > + * This function is a page fault handler used to migrate a GPU SVM range to RAM.
> > > > > > + * It retrieves the GPU SVM range information from the faulting page and invokes
> > > > > > + * the internal migration function to migrate the range back to RAM.
> > > > > > + *
> > > > > > + * Returns:
> > > > > > + * VM_FAULT_SIGBUS on failure, 0 on success.
> > > > > > + */
> > > > > > +static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault *vmf)
> > > > > > +{
> > > > > > +	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
> > > > > > +	int err;
> > > > > > +
> > > > > > +	err = __drm_gpusvm_migrate_to_sram(zdd->range->gpusvm,
> > > > > > +					   vmf->vma, vmf->page,
> > > > > > +					   zdd->range->va.start,
> > > > > > +					   zdd->range->va.end);
> > > > > > +
> > > > > > +	return err ? VM_FAULT_SIGBUS : 0;
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_pagemap_ops - Device page map operations for GPU SVM
> > > > > > + */
> > > > > > +static const struct dev_pagemap_ops drm_gpusvm_pagemap_ops = {
> > > > > > +	.page_free = drm_gpusvm_page_free,
> > > > > > +	.migrate_to_ram = drm_gpusvm_migrate_to_ram,
> > > > > > +};
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_pagemap_ops_get - Retrieve GPU SVM device page map operations
> > > > > > + *
> > > > > > + * Returns:
> > > > > > + * Pointer to the GPU SVM device page map operations structure.
> > > > > > + */
> > > > > > +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void)
> > > > > > +{
> > > > > > +	return &drm_gpusvm_pagemap_ops;
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_has_mapping - Check if GPU SVM has mapping for the given address range
> > > > > > + * @gpusvm: Pointer to the GPU SVM structure.
> > > > > > + * @start: Start address
> > > > > > + * @end: End address
> > > > > > + *
> > > > > > + * Returns:
> > > > > > + * True if GPU SVM has mapping, False otherwise
> > > > > > + */
> > > > > > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start, u64 end)
> > > > > > +{
> > > > > > +	struct drm_gpusvm_notifier *notifier;
> > > > > > +
> > > > > > +	drm_gpusvm_for_each_notifier(notifier, gpusvm, start, end) {
> > > > > > +		struct drm_gpusvm_range *range = NULL;
> > > > > > +
> > > > > > +		drm_gpusvm_for_each_range(range, notifier, start, end)
> > > > > > +			return true;
> > > > > > +	}
> > > > > > +
> > > > > > +	return false;
> > > > > > +}
> > > > > > diff --git a/drivers/gpu/drm/xe/drm_gpusvm.h b/drivers/gpu/drm/xe/drm_gpusvm.h
> > > > > > new file mode 100644
> > > > > > index 000000000000..0ea70f8534a8
> > > > > > --- /dev/null
> > > > > > +++ b/drivers/gpu/drm/xe/drm_gpusvm.h
> > > > > > @@ -0,0 +1,415 @@
> > > > > > +/* SPDX-License-Identifier: MIT */
> > > > > > +/*
> > > > > > + * Copyright © 2024 Intel Corporation
> > > > > > + */
> > > > > > +
> > > > > > +#ifndef __DRM_GPUSVM_H__
> > > > > > +#define __DRM_GPUSVM_H__
> > > > > > +
> > > > > > +#include <linux/kref.h>
> > > > > > +#include <linux/mmu_notifier.h>
> > > > > > +#include <linux/workqueue.h>
> > > > > > +
> > > > > > +struct dev_pagemap_ops;
> > > > > > +struct drm_device;
> > > > > > +struct drm_gpusvm;
> > > > > > +struct drm_gpusvm_notifier;
> > > > > > +struct drm_gpusvm_ops;
> > > > > > +struct drm_gpusvm_range;
> > > > > > +
> > > > > > +/**
> > > > > > + * struct drm_gpusvm_ops - Operations structure for GPU SVM
> > > > > > + *
> > > > > > + * This structure defines the operations for GPU Shared Virtual Memory (SVM).
> > > > > > + * These operations are provided by the GPU driver to manage SVM ranges and
> > > > > > + * perform operations such as migration between VRAM and system RAM.
> > > > > > + */
> > > > > > +struct drm_gpusvm_ops {
> > > > > > +	/**
> > > > > > +	 * @notifier_alloc: Allocate a GPU SVM notifier (optional)
> > > > > > +	 *
> > > > > > +	 * This function shall allocate a GPU SVM notifier.
> > > > > > +	 *
> > > > > > +	 * Returns:
> > > > > > +	 * Pointer to the allocated GPU SVM notifier on success, NULL on failure.
> > > > > > +	 */
> > > > > > +	struct drm_gpusvm_notifier *(*notifier_alloc)(void);
> > > > > > +
> > > > > > +	/**
> > > > > > +	 * @notifier_free: Free a GPU SVM notifier (optional)
> > > > > > +	 * @notifier: Pointer to the GPU SVM notifier to be freed
> > > > > > +	 *
> > > > > > +	 * This function shall free a GPU SVM notifier.
> > > > > > +	 */
> > > > > > +	void (*notifier_free)(struct drm_gpusvm_notifier *notifier);
> > > > > > +
> > > > > > +	/**
> > > > > > +	 * @range_alloc: Allocate a GPU SVM range (optional)
> > > > > > +	 * @gpusvm: Pointer to the GPU SVM
> > > > > > +	 *
> > > > > > +	 * This function shall allocate a GPU SVM range.
> > > > > > +	 *
> > > > > > +	 * Returns:
> > > > > > +	 * Pointer to the allocated GPU SVM range on success, NULL on failure.
> > > > > > +	 */
> > > > > > +	struct drm_gpusvm_range *(*range_alloc)(struct drm_gpusvm *gpusvm);
> > > > > > +
> > > > > > +	/**
> > > > > > +	 * @range_free: Free a GPU SVM range (optional)
> > > > > > +	 * @range: Pointer to the GPU SVM range to be freed
> > > > > > +	 *
> > > > > > +	 * This function shall free a GPU SVM range.
> > > > > > +	 */
> > > > > > +	void (*range_free)(struct drm_gpusvm_range *range);
> > > > > > +
> > > > > > +	/**
> > > > > > +	 * @vram_release: Release VRAM allocation (optional)
> > > > > > +	 * @vram_allocation: Driver-private pointer to the VRAM allocation
> > > > > > +	 *
> > > > > > +	 * This function shall release VRAM allocation and expects to drop a
> > > > > > +	 * reference to VRAM allocation.
> > > > > > +	 */
> > > > > > +	void (*vram_release)(void *vram_allocation);
> > > > > > +
> > > > > > +	/**
> > > > > > +	 * @populate_vram_pfn: Populate VRAM PFN (required for migration)
> > > > > > +	 * @gpusvm: Pointer to the GPU SVM
> > > > > > +	 * @vram_allocation: Driver-private pointer to the VRAM allocation
> > > > > > +	 * @npages: Number of pages to populate
> > > > > > +	 * @pfn: Array of page frame numbers to populate
> > > > > > +	 *
> > > > > > +	 * This function shall populate VRAM page frame numbers (PFN).
> > > > > > +	 *
> > > > > > +	 * Returns:
> > > > > > +	 * 0 on success, a negative error code on failure.
> > > > > > +	 */
> > > > > > +	int (*populate_vram_pfn)(struct drm_gpusvm *gpusvm,
> > > > > > +				 void *vram_allocation,
> > > > > > +				 unsigned long npages,
> > > > > > +				 unsigned long *pfn);
> > > > > > +
> > > > > > +	/**
> > > > > > +	 * @copy_to_vram: Copy to VRAM (required for migration)
> > > > > > +	 * @gpusvm: Pointer to the GPU SVM
> > > > > > +	 * @pages: Pointer to array of VRAM pages (destination)
> > > > > > +	 * @dma_addr: Pointer to array of DMA addresses (source)
> > > > > > +	 * @npages: Number of pages to copy
> > > > > > +	 *
> > > > > > +	 * This function shall copy pages to VRAM.
> > > > > > +	 *
> > > > > > +	 * Returns:
> > > > > > +	 * 0 on success, a negative error code on failure.
> > > > > > +	 */
> > > > > > +	int (*copy_to_vram)(struct drm_gpusvm *gpusvm,
> > > > > > +			    struct page **pages,
> > > > > > +			    dma_addr_t *dma_addr,
> > > > > > +			    unsigned long npages);
> > > > > > +
> > > > > > +	/**
> > > > > > +	 * @copy_to_sram: Copy to system RAM (required for migration)
> > > > > > +	 * @gpusvm: Pointer to the GPU SVM
> > > > > > +	 * @pages: Pointer to array of VRAM pages (source)
> > > > > > +	 * @dma_addr: Pointer to array of DMA addresses (destination)
> > > > > > +	 * @npages: Number of pages to copy
> > > > > > +	 *
> > > > > > +	 * This function shall copy pages to system RAM.
> > > > > > +	 *
> > > > > > +	 * Returns:
> > > > > > +	 * 0 on success, a negative error code on failure.
> > > > > > +	 */
> > > > > > +	int (*copy_to_sram)(struct drm_gpusvm *gpusvm,
> > > > > > +			    struct page **pages,
> > > > > > +			    dma_addr_t *dma_addr,
> > > > > > +			    unsigned long npages);
> > > > > > +
> > > > > > +	/**
> > > > > > +	 * @invalidate: Invalidate GPU SVM notifier (required)
> > > > > > +	 * @gpusvm: Pointer to the GPU SVM
> > > > > > +	 * @notifier: Pointer to the GPU SVM notifier
> > > > > > +	 * @mmu_range: Pointer to the mmu_notifier_range structure
> > > > > > +	 *
> > > > > > +	 * This function shall invalidate the GPU page tables. It can safely
> > > > > > +	 * walk the notifier range RB tree/list in this function. Called while
> > > > > > +	 * holding the notifier lock.
> > > > > > +	 */
> > > > > > +	void (*invalidate)(struct drm_gpusvm *gpusvm,
> > > > > > +			   struct drm_gpusvm_notifier *notifier,
> > > > > > +			   const struct mmu_notifier_range *mmu_range);
> > > > > > +};
> > > > > > +
> > > > > > +/**
> > > > > > + * struct drm_gpusvm_notifier - Structure representing a GPU SVM notifier
> > > > > > + *
> > > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > > + * @notifier: MMU interval notifier
> > > > > > + * @interval: Interval for the notifier
> > > > > > + * @rb: Red-black tree node for the parent GPU SVM structure notifier tree
> > > > > > + * @root: Cached root node of the RB tree containing ranges
> > > > > > + * @range_list: List head containing of ranges in the same order they appear in
> > > > > > + *              interval tree. This is useful to keep iterating ranges while
> > > > > > + *              doing modifications to RB tree.
> > > > > > + * @flags.removed: Flag indicating whether the MMU interval notifier has been
> > > > > > + *                 removed
> > > > > > + *
> > > > > > + * This structure represents a GPU SVM notifier.
> > > > > > + */
> > > > > > +struct drm_gpusvm_notifier {
> > > > > > +	struct drm_gpusvm *gpusvm;
> > > > > > +	struct mmu_interval_notifier notifier;
> > > > > > +	struct {
> > > > > > +		u64 start;
> > > > > > +		u64 end;
> > > > > > +	} interval;
> > > > > > +	struct {
> > > > > > +		struct rb_node node;
> > > > > > +		struct list_head entry;
> > > > > > +		u64 __subtree_last;
> > > > > > +	} rb;
> > > > > > +	struct rb_root_cached root;
> > > > > > +	struct list_head range_list;
> > > > > > +	struct {
> > > > > > +		u32 removed : 1;
> > > > > > +	} flags;
> > > > > > +};
> > > > > > +
> > > > > > +/**
> > > > > > + * struct drm_gpusvm_range - Structure representing a GPU SVM range
> > > > > > + *
> > > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > > + * @notifier: Pointer to the GPU SVM notifier
> > > > > > + * @refcount: Reference count for the range
> > > > > > + * @rb: Red-black tree node for the parent GPU SVM notifier structure range tree
> > > > > > + * @va: Virtual address range
> > > > > > + * @notifier_seq: Notifier sequence number of the range's pages
> > > > > > + * @pages: Pointer to the array of pages (if backing store is in VRAM)
> > > > > > + * @dma_addr: DMA address array (if backing store is SRAM and DMA mapped)
> > > > > > + * @vram_allocation: Driver-private pointer to the VRAM allocation
> > > > > > + * @order: Order of dma mapping. i.e. PAGE_SIZE << order is mapping size
> > > > > > + * @flags.migrate_vram: Flag indicating whether the range can be migrated to VRAM
> > > > > > + * @flags.unmapped: Flag indicating if the range has been unmapped
> > > > > > + * @flags.partial_unmap: Flag indicating if the range has been partially unmapped
> > > > > > + * @flags.has_vram_pages: Flag indicating if the range has vram pages
> > > > > > + * @flags.has_dma_mapping: Flag indicating if the range has a DMA mapping
> > > > > > + * @flags.kfree_mapping: Flag indicating @dma_addr is a compact allocation based
> > > > > > + *                       on @order which releases via kfree
> > > > > > + *
> > > > > > + * This structure represents a GPU SVM range used for tracking memory ranges
> > > > > > + * mapped in a DRM device.
> > > > > > + */
> > > > > > +struct drm_gpusvm_range {
> > > > > > +	struct drm_gpusvm *gpusvm;
> > > > > > +	struct drm_gpusvm_notifier *notifier;
> > > > > > +	struct kref refcount;
> > > > > > +	struct {
> > > > > > +		struct rb_node node;
> > > > > > +		struct list_head entry;
> > > > > > +		u64 __subtree_last;
> > > > > > +	} rb;
> > > > > > +	struct {
> > > > > > +		u64 start;
> > > > > > +		u64 end;
> > > > > > +	} va;
> > > > > > +	unsigned long notifier_seq;
> > > > > > +	union {
> > > > > > +		struct page **pages;
> > > > > > +		dma_addr_t *dma_addr;
> > > > > > +	};
> > > > > > +	void *vram_allocation;
> > > > > > +	u16 order;
> > > > > > +	struct {
> > > > > > +		/* All flags below must be set upon creation */
> > > > > > +		u16 migrate_vram : 1;
> > > > > > +		/* All flags below must be set / cleared under notifier lock */
> > > > > > +		u16 unmapped : 1;
> > > > > > +		u16 partial_unmap : 1;
> > > > > > +		u16 has_vram_pages : 1;
> > > > > > +		u16 has_dma_mapping : 1;
> > > > > > +		u16 kfree_mapping : 1;
> > > > > > +	} flags;
> > > > > > +};
> > > > > > +
> > > > > > +/**
> > > > > > + * struct drm_gpusvm - GPU SVM structure
> > > > > > + *
> > > > > > + * @name: Name of the GPU SVM
> > > > > > + * @drm: Pointer to the DRM device structure
> > > > > > + * @mm: Pointer to the mm_struct for the address space
> > > > > > + * @device_private_page_owner: Device private pages owner
> > > > > > + * @mm_start: Start address of GPU SVM
> > > > > > + * @mm_range: Range of the GPU SVM
> > > > > > + * @notifier_size: Size of individual notifiers
> > > > > > + * @ops: Pointer to the operations structure for GPU SVM
> > > > > > + * @chunk_sizes: Pointer to the array of chunk sizes used in range allocation.
> > > > > > + *               Entries should be powers of 2 in descending order.
> > > > > > + * @num_chunks: Number of chunks
> > > > > > + * @notifier_lock: Read-write semaphore for protecting notifier operations
> > > > > > + * @zdd_wq: Workqueue for deferred work on zdd destruction
> > > > > > + * @root: Cached root node of the Red-Black tree containing GPU SVM notifiers
> > > > > > + * @notifier_list: List head of notifiers in the same order they appear in
> > > > > > + *                 the interval tree. This is useful to keep iterating over
> > > > > > + *                 notifiers while doing modifications to the RB tree.
> > > > > > + *
> > > > > > + * This structure represents a GPU SVM (Shared Virtual Memory) used for tracking
> > > > > > + * memory ranges mapped in a DRM (Direct Rendering Manager) device.
> > > > > > + *
> > > > > > + * No reference counting is provided, as this is expected to be embedded in the
> > > > > > + * driver VM structure along with the struct drm_gpuvm, which handles reference
> > > > > > + * counting.
> > > > > > + */
> > > > > > +struct drm_gpusvm {
> > > > > > +	const char *name;
> > > > > > +	struct drm_device *drm;
> > > > > > +	struct mm_struct *mm;
> > > > > > +	void *device_private_page_owner;
> > > > > > +	u64 mm_start;
> > > > > > +	u64 mm_range;
> > > > > > +	u64 notifier_size;
> > > > > > +	const struct drm_gpusvm_ops *ops;
> > > > > > +	const u64 *chunk_sizes;
> > > > > > +	int num_chunks;
> > > > > > +	struct rw_semaphore notifier_lock;
> > > > > > +	struct workqueue_struct *zdd_wq;
> > > > > > +	struct rb_root_cached root;
> > > > > > +	struct list_head notifier_list;
> > > > > > +};
> > > > > > +
> > > > > > +/**
> > > > > > + * struct drm_gpusvm_ctx - DRM GPU SVM context
> > > > > > + *
> > > > > > + * @mmap_locked: mmap lock is locked
> > > > > > + * @trylock_mmap: trylock mmap lock, used to avoid locking inversions
> > > > > > + *                (e.g. dma-resv -> mmap lock)
> > > > > > + * @in_notifier: entering from a MMU notifier
> > > > > > + * @read_only: operating on read-only memory
> > > > > > + * @vram_possible: possible to use VRAM
> > > > > > + * @prefault: prefault pages
> > > > > > + *
> > > > > > + * Context that DRM GPU SVM is operating in (i.e. user arguments).
> > > > > > + */
> > > > > > +struct drm_gpusvm_ctx {
> > > > > > +	u32 mmap_locked :1;
> > > > > > +	u32 trylock_mmap :1;
> > > > > > +	u32 in_notifier :1;
> > > > > > +	u32 read_only :1;
> > > > > > +	u32 vram_possible :1;
> > > > > > +	u32 prefault :1;
> > > > > > +};
> > > > > > +
> > > > > > +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> > > > > > +		    const char *name, struct drm_device *drm,
> > > > > > +		    struct mm_struct *mm, void *device_private_page_owner,
> > > > > > +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> > > > > > +		    const struct drm_gpusvm_ops *ops,
> > > > > > +		    const u64 *chunk_sizes, int num_chunks);
> > > > > > +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm);
> > > > > > +void drm_gpusvm_free(struct drm_gpusvm *gpusvm);
> > > > > > +
> > > > > > +struct drm_gpusvm_range *
> > > > > > +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64 fault_addr,
> > > > > > +				u64 gpuva_start, u64 gpuva_end,
> > > > > > +				const struct drm_gpusvm_ctx *ctx);
> > > > > > +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> > > > > > +			     struct drm_gpusvm_range *range);
> > > > > > +
> > > > > > +struct drm_gpusvm_range *
> > > > > > +drm_gpusvm_range_get(struct drm_gpusvm_range *range);
> > > > > > +void drm_gpusvm_range_put(struct drm_gpusvm_range *range);
> > > > > > +
> > > > > > +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> > > > > > +				  struct drm_gpusvm_range *range);
> > > > > > +
> > > > > > +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> > > > > > +			       struct drm_gpusvm_range *range,
> > > > > > +			       const struct drm_gpusvm_ctx *ctx);
> > > > > > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > > > > > +				  struct drm_gpusvm_range *range,
> > > > > > +				  const struct drm_gpusvm_ctx *ctx);
> > > > > > +
> > > > > > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > > > > > +			       struct drm_gpusvm_range *range,
> > > > > > +			       void *vram_allocation,
> > > > > > +			       const struct drm_gpusvm_ctx *ctx);
> > > > > > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > > > > +			       struct drm_gpusvm_range *range,
> > > > > > +			       const struct drm_gpusvm_ctx *ctx);
> > > > > > +
> > > > > > +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void);
> > > > > > +
> > > > > > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start, u64 end);
> > > > > > +
> > > > > > +struct drm_gpusvm_range *
> > > > > > +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64 start, u64 end);
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_notifier_lock - Lock GPU SVM notifier
> > > > > > + * @gpusvm__: Pointer to the GPU SVM structure.
> > > > > > + *
> > > > > > + * Abstract client usage GPU SVM notifier lock, take lock
> > > > > > + */
> > > > > > +#define drm_gpusvm_notifier_lock(gpusvm__)	\
> > > > > > +	down_read(&(gpusvm__)->notifier_lock)
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_notifier_unlock - Unlock GPU SVM notifier
> > > > > > + * @gpusvm__: Pointer to the GPU SVM structure.
> > > > > > + *
> > > > > > + * Abstract client usage GPU SVM notifier lock, drop lock
> > > > > > + */
> > > > > > +#define drm_gpusvm_notifier_unlock(gpusvm__)	\
> > > > > > +	up_read(&(gpusvm__)->notifier_lock)
> > > > > > +
> > > > > > +/**
> > > > > > + * __drm_gpusvm_range_next - Get the next GPU SVM range in the list
> > > > > > + * @range: a pointer to the current GPU SVM range
> > > > > > + *
> > > > > > + * Return: A pointer to the next drm_gpusvm_range if available, or NULL if the
> > > > > > + *         current range is the last one or if the input range is NULL.
> > > > > > + */
> > > > > > +static inline struct drm_gpusvm_range *
> > > > > > +__drm_gpusvm_range_next(struct drm_gpusvm_range *range)
> > > > > > +{
> > > > > > +	if (range && !list_is_last(&range->rb.entry,
> > > > > > +				   &range->notifier->range_list))
> > > > > > +		return list_next_entry(range, rb.entry);
> > > > > > +
> > > > > > +	return NULL;
> > > > > > +}
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_for_each_range - Iterate over GPU SVM ranges in a notifier
> > > > > > + * @range__: Iterator variable for the ranges. If set, it indicates the start of
> > > > > > + *	     the iterator. If NULL, call drm_gpusvm_range_find() to get the range.
> > > > > > + * @notifier__: Pointer to the GPU SVM notifier
> > > > > > + * @start__: Start address of the range
> > > > > > + * @end__: End address of the range
> > > > > > + *
> > > > > > + * This macro is used to iterate over GPU SVM ranges in a notifier. It is safe
> > > > > > + * to use while holding the driver SVM lock or the notifier lock.
> > > > > > + */
> > > > > > +#define drm_gpusvm_for_each_range(range__, notifier__, start__, end__)	\
> > > > > > +	for ((range__) = (range__) ?:					\
> > > > > > +	     drm_gpusvm_range_find((notifier__), (start__), (end__));	\
> > > > > > +	     (range__) && (range__->va.start < (end__));		\
> > > > > > +	     (range__) = __drm_gpusvm_range_next(range__))
> > > > > > +
> > > > > > +/**
> > > > > > + * drm_gpusvm_range_set_unmapped - Mark a GPU SVM range as unmapped
> > > > > > + * @range: Pointer to the GPU SVM range structure.
> > > > > > + * @mmu_range: Pointer to the MMU notifier range structure.
> > > > > > + *
> > > > > > + * This function marks a GPU SVM range as unmapped and sets the partial_unmap flag
> > > > > > + * if the range partially falls within the provided MMU notifier range.
> > > > > > + */
> > > > > > +static inline void
> > > > > > +drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range *range,
> > > > > > +			      const struct mmu_notifier_range *mmu_range)
> > > > > > +{
> > > > > > +	lockdep_assert_held_write(&range->gpusvm->notifier_lock);
> > > > > > +
> > > > > > +	range->flags.unmapped = true;
> > > > > > +	if (range->va.start < mmu_range->start ||
> > > > > > +	    range->va.end > mmu_range->end)
> > > > > > +		range->flags.partial_unmap = true;
> > > > > > +}
> > > > > > +
> > > > > > +#endif /* __DRM_GPUSVM_H__ */
> > > > > > -- 
> > > > > > 2.34.1
> > > > > > 
> > > > 
> > 
> > -- 
> > Daniel Vetter
> > Software Engineer, Intel Corporation
> > http://blog.ffwll.ch
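
For readers skimming the interface quoted above, here is a minimal sketch of
what a driver-side ops->invalidate implementation could look like, pieced
together only from the kernel-doc in the quoted header (which states the
callback runs with the notifier lock held, in write mode per the
drm_gpusvm_range_unmap_pages / drm_gpusvm_range_set_unmapped documentation).
The example_* names are hypothetical and not part of the posted patch.

static void example_zap_gpu_ptes(struct drm_gpusvm *gpusvm,
				 struct drm_gpusvm_range *range)
{
	/* Hypothetical driver helper: invalidate GPU page tables for @range */
}

static void example_invalidate(struct drm_gpusvm *gpusvm,
			       struct drm_gpusvm_notifier *notifier,
			       const struct mmu_notifier_range *mmu_range)
{
	struct drm_gpusvm_ctx ctx = { .in_notifier = true, };
	struct drm_gpusvm_range *range = NULL;

	/*
	 * Walking the notifier's range list is safe here; the GPU SVM core
	 * holds the notifier lock around this callback.
	 */
	drm_gpusvm_for_each_range(range, notifier, mmu_range->start,
				  mmu_range->end) {
		example_zap_gpu_ptes(gpusvm, range);

		if (mmu_range->event == MMU_NOTIFY_UNMAP)
			drm_gpusvm_range_set_unmapped(range, mmu_range);

		/* Per the kernel-doc, required for the IOMMU security model */
		drm_gpusvm_range_unmap_pages(gpusvm, range, &ctx);
	}
}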
Daniel Vetter Sept. 2, 2024, 11:36 a.m. UTC | #26
On Fri, Aug 30, 2024 at 05:00:11AM +0000, Matthew Brost wrote:
> On Wed, Aug 28, 2024 at 04:31:19PM +0200, Daniel Vetter wrote:
> > On Tue, Aug 27, 2024 at 07:48:38PM -0700, Matthew Brost wrote:
> > > +		if (!ctx->mmap_locked) {
> > > +			/*
> > > +			 * XXX: HMM locking document indicates only a read-lock
> > > +			 * is required but there appears to be a window between
> > > +			 * the MMU_NOTIFY_MIGRATE event triggered in a CPU fault
> > > +			 * via migrate_vma_setup and the pages actually moving
> > > +			 * in migrate_vma_finalize in which this code can grab
> > > +			 * garbage pages. Grabbing the write-lock if the range
> > > +			 * is attached to vram appears to protect against this
> > > +			 * race.
> > > +			 */
> > 
> > This one is really scary, since it means the entire migrate pte trickery
> > is essentially completely busted. Grabbing the mmap write lock just means
> > you block out pretty much everything interesting from concurrently
> > happening.
> > 
> > My gut feeling says we need to figure out what's happening here, because
> > this looks a bit too fundamental to me.
> > -Sima
> > 
> 
> Sima,
> 
> I’ve already replied to this.
> 
> We’ve discussed the mmap write hack extensively, so I’m not quite sure
> where to put this. The reply chain is quickly becoming a mess. However,
> I’ve looked into this and collected some data points based on your
> feedback.
> 
> I’ve pushed a branch [1] with the updated code.
> 
> The first new commit [2] removes the mmap write lock hack and addresses
> an issue related to VRAM migrations, which couldn’t collect all VRAM
> pages without this hack.
> 
> With this commit [2], xe_exec_system_allocator --r twice*race* fails
> quite regularly, perhaps 25% of the time. This test is a
> single-thread/process test that races CPU and GPU faults with migration.
> 
> It fails with the following dmesg:
> 
> [   68.473007] WARNING: CPU: 12 PID: 1643 at drivers/gpu/drm/xe/drm_gpusvm.c:1407 drm_gpusvm_range_get_pages+0xbda/0x1480 [xe]
> ...
> [   68.473836] xe 0000:03:00.0: [drm:pf_queue_work_func [xe]] Fault response: Unsuccessful -95
> [   68.474024] xe 0000:03:00.0: [drm:xe_guc_exec_queue_memory_cat_error_handler [xe]] GT1: Engine memory cat error: engine_class=vecs, logical_mask: 0x2, guc_id=0
> [   68.474163] xe 0000:03:00.0: [drm] exec queue reset detected
> [   68.474696] xe 0000:03:00.0: [drm] GT1: Engine reset: engine_class=vecs, logical_mask: 0x2, guc_id=0
> 
> This means hmm_range_fault collects a mix of SRAM and VRAM pages, which
> my design aims to avoid. Perhaps allowing a mix of SRAM and VRAM pages
> in my design might work, but I highly doubt it based on AMD's
> range->migration_mutex and my inspection of the migration layer.
> Allowing mixed mappings would introduce significant complexity, so I’d
> prefer to avoid this if possible. Additionally, allowing mixed mappings
> would eliminate the use of huge GPU pages when a race like this occurs.

Ah, if the issue is just that you get a mix of sram and vram pages from
hmm_range_fault, then I think that answers all my questions. From the
discussion we had and your comment it sounded like you're getting complete
nonsense, or missing an invalidation, or something else equally scary.

Thanks a lot for these details, I'm a lot less worried now here.

> I also implemented a retry loop to see if the system stabilizes with
> either only SRAM or VRAM pages. Unfortunately, it results in a
> continuous loop of drm_gpusvm_range_get_pages / hmm_range_fault until
> the test case kills the MM due to a timeout.

Yeah, the core mm makes no guarantees about forward progress for groups of
pages/folios. So if we go with core mm locking rules, then you have to
deal with individual pages/folios and anything bigger is just a
performance optimisation that must fall back to a per-page/folio approach.
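
To make the failure mode described above concrete: the problematic condition
is hmm_range_fault() handing back a mix of device-private (VRAM) and system
pages for a single range. A rough, illustrative check for that condition (not
code from the patch) could look like the sketch below; a design that follows
the core-mm rules would treat the uniform result as a fast path and fall back
to per-page/folio handling when the check returns true.

/*
 * Sketch only: returns true when a hmm_range_fault() result for a range
 * contains both device-private (VRAM) and system pages. Assumes every
 * entry in @hmm_pfns is valid (HMM_PFN_VALID set).
 */
static bool example_range_has_mixed_pages(const unsigned long *hmm_pfns,
					  unsigned long npages)
{
	bool first_is_vram =
		is_device_private_page(hmm_pfn_to_page(hmm_pfns[0]));
	unsigned long i;

	for (i = 1; i < npages; ++i) {
		struct page *page = hmm_pfn_to_page(hmm_pfns[i]);

		if (is_device_private_page(page) != first_is_vram)
			return true;
	}

	return false;
}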

> Next, I added a lock similar to AMD's range->migration_lock, but using
> an rwsem [3]. The semantics are to allow read access for CPU access and
> write access for GPU access, thus enabling parallel CPU page faults for
> the same range, which matches existing core semantics. This provides
> finer granularity compared to using the mmap write lock; it only
> disallows CPU and GPU servicing in parallel for a given range, rather
> than the entire MM. It also aligns with AMD’s approach. I haven’t
> checked Nvidia’s approach wrt this locking but can do so if you think it
> would be helpful.

Yeah, minus the entire question whether VA-based locking is ok, something
like amd's migration_mutex should also close the race you're having here.

Cheers, Sima
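
For reference, a minimal sketch of the per-range migration lock semantics
described above: shared (read) access for the CPU fault path so CPU faults on
the same range can run in parallel, exclusive (write) access for the GPU fault
path, serialised per range instead of taking the mmap lock in write mode for
the whole mm. The example_* names are placeholders; the real implementation
lives in the linked branch [3], not in this posted patch.

/* Hypothetical driver-side range wrapper embedding the migration lock */
struct example_svm_range {
	struct drm_gpusvm_range base;
	struct rw_semaphore migration_lock;
};

static void example_svm_range_init(struct example_svm_range *range)
{
	init_rwsem(&range->migration_lock);
}

/* CPU fault / migrate_to_ram path: shared, parallel CPU faults allowed */
static void example_cpu_fault_lock(struct example_svm_range *range)
{
	down_read(&range->migration_lock);
}

static void example_cpu_fault_unlock(struct example_svm_range *range)
{
	up_read(&range->migration_lock);
}

/*
 * GPU fault servicing path (get_pages / migrate_to_vram): exclusive access,
 * blocking only CPU faults for this particular range.
 */
static void example_gpu_fault_lock(struct example_svm_range *range)
{
	down_write(&range->migration_lock);
}

static void example_gpu_fault_unlock(struct example_svm_range *range)
{
	up_write(&range->migration_lock);
}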

> 
> Matt
> 
> [1] https://gitlab.freedesktop.org/mbrost/xe-kernel-driver-svm-post-8-27-24/-/commits/mmap_write_lock
> [2] https://gitlab.freedesktop.org/mbrost/xe-kernel-driver-svm-post-8-27-24/-/commit/6cf67d98c719ffbb4ac6124a7cb81d797a5bad9f
> [3] https://gitlab.freedesktop.org/mbrost/xe-kernel-driver-svm-post-8-27-24/-/commit/2b62075d193265b2c1634ecfd0497dffd2e18c13
> 
> > 
> > > +			if (vram_pages)
> > > +				mmap_write_lock(mm);
> > > +			else
> > > +				mmap_read_lock(mm);
> > > +		}
> > > +		err = hmm_range_fault(&hmm_range);
> > > +		if (!ctx->mmap_locked) {
> > > +			if (vram_pages)
> > > +				mmap_write_unlock(mm);
> > > +			else
> > > +				mmap_read_unlock(mm);
> > > +		}
> > > +
> > > +		if (err == -EBUSY) {
> > > +			if (time_after(jiffies, timeout))
> > > +				break;
> > > +
> > > +			hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> > > +			continue;
> > > +		}
> > > +		break;
> > > +	}
> > > +	if (!ctx->mmap_locked)
> > > +		mmput(mm);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	pages = (struct page **)pfns;
> > > +
> > > +	if (ctx->prefault) {
> > > +		range->pages = pages;
> > > +		goto set_seqno;
> > > +	}
> > > +
> > > +map_pages:
> > > +	if (is_device_private_page(hmm_pfn_to_page(pfns[0]))) {
> > > +		WARN_ON_ONCE(!range->vram_allocation);
> > > +
> > > +		for (i = 0; i < npages; ++i) {
> > > +			pages[i] = hmm_pfn_to_page(pfns[i]);
> > > +
> > > +			if (WARN_ON_ONCE(!is_device_private_page(pages[i]))) {
> > > +				err = -EOPNOTSUPP;
> > > +				goto err_free;
> > > +			}
> > > +		}
> > > +
> > > +		/* Do not race with notifier unmapping pages */
> > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > +		range->flags.has_vram_pages = true;
> > > +		range->pages = pages;
> > > +		if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
> > > +			err = -EAGAIN;
> > > +			__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > +		}
> > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > +	} else {
> > > +		dma_addr_t *dma_addr = (dma_addr_t *)pfns;
> > > +
> > > +		for_each_dma_page(i, j, npages, order) {
> > > +			if (WARN_ON_ONCE(i && order !=
> > > +					 hmm_pfn_to_map_order(pfns[i]))) {
> > > +				err = -EOPNOTSUPP;
> > > +				npages = i;
> > > +				goto err_unmap;
> > > +			}
> > > +			order = hmm_pfn_to_map_order(pfns[i]);
> > > +
> > > +			pages[j] = hmm_pfn_to_page(pfns[i]);
> > > +			if (WARN_ON_ONCE(is_zone_device_page(pages[j]))) {
> > > +				err = -EOPNOTSUPP;
> > > +				npages = i;
> > > +				goto err_unmap;
> > > +			}
> > > +
> > > +			set_page_dirty_lock(pages[j]);
> > > +			mark_page_accessed(pages[j]);
> > > +
> > > +			dma_addr[j] = dma_map_page(gpusvm->drm->dev,
> > > +						   pages[j], 0,
> > > +						   PAGE_SIZE << order,
> > > +						   DMA_BIDIRECTIONAL);
> > > +			if (dma_mapping_error(gpusvm->drm->dev, dma_addr[j])) {
> > > +				err = -EFAULT;
> > > +				npages = i;
> > > +				goto err_unmap;
> > > +			}
> > > +		}
> > > +
> > > +		/* Huge pages, reduce memory footprint */
> > > +		if (order) {
> > > +			dma_addr = kmalloc_array(j, sizeof(*dma_addr),
> > > +						 GFP_KERNEL);
> > > +			if (dma_addr) {
> > > +				for (i = 0; i < j; ++i)
> > > +					dma_addr[i] = (dma_addr_t)pfns[i];
> > > +				kvfree(pfns);
> > > +				kfree_mapping = true;
> > > +			} else {
> > > +				dma_addr = (dma_addr_t *)pfns;
> > > +			}
> > > +		}
> > > +
> > > +		/* Do not race with notifier unmapping pages */
> > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > +		range->order = order;
> > > +		range->flags.kfree_mapping = kfree_mapping;
> > > +		range->flags.has_dma_mapping = true;
> > > +		range->dma_addr = dma_addr;
> > > +		range->vram_allocation = NULL;
> > > +		if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
> > > +			err = -EAGAIN;
> > > +			__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > +		}
> > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > +	}
> > > +
> > > +	if (err == -EAGAIN)
> > > +		goto retry;
> > > +set_seqno:
> > > +	range->notifier_seq = hmm_range.notifier_seq;
> > > +
> > > +	return 0;
> > > +
> > > +err_unmap:
> > > +	for_each_dma_page(i, j, npages, order)
> > > +		dma_unmap_page(gpusvm->drm->dev,
> > > +			       (dma_addr_t)pfns[j],
> > > +			       PAGE_SIZE << order, DMA_BIDIRECTIONAL);
> > > +err_free:
> > > +	if (alloc_pfns)
> > > +		kvfree(pfns);
> > > +err_out:
> > > +	return err;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_unmap_pages - Unmap pages associated with a GPU SVM range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + * @ctx: GPU SVM context
> > > + *
> > > + * This function unmaps pages associated with a GPU SVM range. If @in_notifier
> > > + * is set, it is assumed that gpusvm->notifier_lock is held in write mode; if it
> > > + * is clear, it acquires gpusvm->notifier_lock in read mode. Must be called on
> > > + * each GPU SVM range attached to notifier in gpusvm->ops->invalidate for IOMMU
> > > + * security model.
> > > + */
> > > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > > +				  struct drm_gpusvm_range *range,
> > > +				  const struct drm_gpusvm_ctx *ctx)
> > > +{
> > > +	if (ctx->in_notifier)
> > > +		lockdep_assert_held_write(&gpusvm->notifier_lock);
> > > +	else
> > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > +
> > > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > +
> > > +	if (!ctx->in_notifier)
> > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migration_put_page - Put a migration page
> > > + * @page: Pointer to the page to put
> > > + *
> > > + * This function unlocks and puts a page.
> > > + */
> > > +static void drm_gpusvm_migration_put_page(struct page *page)
> > > +{
> > > +	unlock_page(page);
> > > +	put_page(page);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migration_put_pages - Put migration pages
> > > + * @npages: Number of pages
> > > + * @migrate_pfn: Array of migrate page frame numbers
> > > + *
> > > + * This function puts an array of pages.
> > > + */
> > > +static void drm_gpusvm_migration_put_pages(unsigned long npages,
> > > +					   unsigned long *migrate_pfn)
> > > +{
> > > +	unsigned long i;
> > > +
> > > +	for (i = 0; i < npages; ++i) {
> > > +		if (!migrate_pfn[i])
> > > +			continue;
> > > +
> > > +		drm_gpusvm_migration_put_page(migrate_pfn_to_page(migrate_pfn[i]));
> > > +		migrate_pfn[i] = 0;
> > > +	}
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_get_vram_page - Get a reference to a VRAM page
> > > + * @page: Pointer to the page
> > > + * @zdd: Pointer to the GPU SVM zone device data
> > > + *
> > > + * This function associates the given page with the specified GPU SVM zone
> > > + * device data and initializes it for zone device usage.
> > > + */
> > > +static void drm_gpusvm_get_vram_page(struct page *page,
> > > +				     struct drm_gpusvm_zdd *zdd)
> > > +{
> > > +	page->zone_device_data = drm_gpusvm_zdd_get(zdd);
> > > +	zone_device_page_init(page);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_map_pages() - Map migration pages for GPU SVM migration
> > > + * @dev: The device for which the pages are being mapped
> > > + * @dma_addr: Array to store DMA addresses corresponding to mapped pages
> > > + * @migrate_pfn: Array of migrate page frame numbers to map
> > > + * @npages: Number of pages to map
> > > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > > + *
> > > + * This function maps pages of memory for migration usage in GPU SVM. It
> > > + * iterates over each page frame number provided in @migrate_pfn, maps the
> > > + * corresponding page, and stores the DMA address in the provided @dma_addr
> > > + * array.
> > > + *
> > > + * Return: 0 on success, -EFAULT if an error occurs during mapping.
> > > + */
> > > +static int drm_gpusvm_migrate_map_pages(struct device *dev,
> > > +					dma_addr_t *dma_addr,
> > > +					long unsigned int *migrate_pfn,
> > > +					unsigned long npages,
> > > +					enum dma_data_direction dir)
> > > +{
> > > +	unsigned long i;
> > > +
> > > +	for (i = 0; i < npages; ++i) {
> > > +		struct page *page = migrate_pfn_to_page(migrate_pfn[i]);
> > > +
> > > +		if (!page)
> > > +			continue;
> > > +
> > > +		if (WARN_ON_ONCE(is_zone_device_page(page)))
> > > +			return -EFAULT;
> > > +
> > > +		dma_addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir);
> > > +		if (dma_mapping_error(dev, dma_addr[i]))
> > > +			return -EFAULT;
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously mapped for GPU SVM migration
> > > + * @dev: The device for which the pages were mapped
> > > + * @dma_addr: Array of DMA addresses corresponding to mapped pages
> > > + * @npages: Number of pages to unmap
> > > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > > + *
> > > + * This function unmaps previously mapped pages of memory for GPU Shared Virtual
> > > + * Memory (SVM). It iterates over each DMA address provided in @dma_addr, checks
> > > + * if it's valid and not already unmapped, and unmaps the corresponding page.
> > > + */
> > > +static void drm_gpusvm_migrate_unmap_pages(struct device *dev,
> > > +					   dma_addr_t *dma_addr,
> > > +					   unsigned long npages,
> > > +					   enum dma_data_direction dir)
> > > +{
> > > +	unsigned long i;
> > > +
> > > +	for (i = 0; i < npages; ++i) {
> > > +		if (!dma_addr[i] || dma_mapping_error(dev, dma_addr[i]))
> > > +			continue;
> > > +
> > > +		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
> > > +	}
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_to_vram - Migrate GPU SVM range to VRAM
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + * @vram_allocation: Driver-private pointer to the VRAM allocation. The caller
> > > + *                   should hold a reference to the VRAM allocation, which
> > > + *                   should be dropped via ops->vram_release or upon the
> > > + *                   failure of this function.
> > > + * @ctx: GPU SVM context
> > > + *
> > > + * This function migrates the specified GPU SVM range to VRAM. It performs the
> > > + * necessary setup and invokes the driver-specific operations for migration to
> > > + * VRAM. Upon successful return, @vram_allocation can safely reference @range
> > > + * until ops->vram_release is called, which only happens upon successful
> > > + * return of this function.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       void *vram_allocation,
> > > +			       const struct drm_gpusvm_ctx *ctx)
> > > +{
> > > +	u64 start = range->va.start, end = range->va.end;
> > > +	struct migrate_vma migrate = {
> > > +		.start		= start,
> > > +		.end		= end,
> > > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > > +		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
> > > +	};
> > > +	struct mm_struct *mm = gpusvm->mm;
> > > +	unsigned long i, npages = npages_in_range(start, end);
> > > +	struct vm_area_struct *vas;
> > > +	struct drm_gpusvm_zdd *zdd = NULL;
> > > +	struct page **pages;
> > > +	dma_addr_t *dma_addr;
> > > +	void *buf;
> > > +	int err;
> > > +
> > > +	if (!range->flags.migrate_vram)
> > > +		return -EINVAL;
> > > +
> > > +	if (!gpusvm->ops->populate_vram_pfn || !gpusvm->ops->copy_to_vram ||
> > > +	    !gpusvm->ops->copy_to_sram)
> > > +		return -EOPNOTSUPP;
> > > +
> > > +	if (!ctx->mmap_locked) {
> > > +		if (!mmget_not_zero(mm)) {
> > > +			err = -EFAULT;
> > > +			goto err_out;
> > > +		}
> > > +		mmap_write_lock(mm);
> > > +	}
> > > +
> > > +	mmap_assert_locked(mm);
> > > +
> > > +	vas = vma_lookup(mm, start);
> > > +	if (!vas) {
> > > +		err = -ENOENT;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	if (end > vas->vm_end || start < vas->vm_start) {
> > > +		err = -EINVAL;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	if (!vma_is_anonymous(vas)) {
> > > +		err = -EBUSY;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
> > > +		       sizeof(*pages), GFP_KERNEL);
> > > +	if (!buf) {
> > > +		err = -ENOMEM;
> > > +		goto err_mmunlock;
> > > +	}
> > > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
> > > +
> > > +	zdd = drm_gpusvm_zdd_alloc(range);
> > > +	if (!zdd) {
> > > +		err = -ENOMEM;
> > > +		goto err_free;
> > > +	}
> > > +
> > > +	migrate.vma = vas;
> > > +	migrate.src = buf;
> > > +	migrate.dst = migrate.src + npages;
> > > +
> > > +	err = migrate_vma_setup(&migrate);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	/*
> > > +	 * FIXME: The cases below, !migrate.cpages and migrate.cpages != npages,
> > > +	 * are not always an error. Need to revisit possible cases and how to
> > > +	 * handle them. We could prefault on migrate.cpages != npages via
> > > +	 * hmm_range_fault.
> > > +	 */
> > > +
> > > +	if (!migrate.cpages) {
> > > +		err = -EFAULT;
> > > +		goto err_free;
> > > +	}
> > > +
> > > +	if (migrate.cpages != npages) {
> > > +		err = -EBUSY;
> > > +		goto err_finalize;
> > > +	}
> > > +
> > > +	err = gpusvm->ops->populate_vram_pfn(gpusvm, vram_allocation, npages,
> > > +					     migrate.dst);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> > > +					   migrate.src, npages, DMA_TO_DEVICE);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	for (i = 0; i < npages; ++i) {
> > > +		struct page *page = pfn_to_page(migrate.dst[i]);
> > > +
> > > +		pages[i] = page;
> > > +		migrate.dst[i] = migrate_pfn(migrate.dst[i]);
> > > +		drm_gpusvm_get_vram_page(page, zdd);
> > > +	}
> > > +
> > > +	err = gpusvm->ops->copy_to_vram(gpusvm, pages, dma_addr, npages);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	/* Upon success bind vram allocation to range and zdd */
> > > +	range->vram_allocation = vram_allocation;
> > > +	WRITE_ONCE(zdd->vram_allocation, vram_allocation);	/* Owns ref */
> > > +
> > > +err_finalize:
> > > +	if (err)
> > > +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> > > +	migrate_vma_pages(&migrate);
> > > +	migrate_vma_finalize(&migrate);
> > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
> > > +				       DMA_TO_DEVICE);
> > > +err_free:
> > > +	if (zdd)
> > > +		drm_gpusvm_zdd_put(zdd);
> > > +	kvfree(buf);
> > > +err_mmunlock:
> > > +	if (!ctx->mmap_locked) {
> > > +		mmap_write_unlock(mm);
> > > +		mmput(mm);
> > > +	}
> > > +err_out:
> > > +	return err;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_populate_sram_pfn - Populate SRAM PFNs for a VM area
> > > + * @vas: Pointer to the VM area structure, can be NULL
> > > + * @npages: Number of pages to populate
> > > + * @src_mpfn: Source array of migrate PFNs
> > > + * @mpfn: Array of migrate PFNs to populate
> > > + * @addr: Start address for PFN allocation
> > > + *
> > > + * This function populates the SRAM migrate page frame numbers (PFNs) for the
> > > + * specified VM area structure. It allocates and locks pages in the VM area for
> > > + * SRAM usage. If @vas is non-NULL, alloc_page_vma is used for allocation;
> > > + * if NULL, alloc_page is used.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +static int drm_gpusvm_migrate_populate_sram_pfn(struct vm_area_struct *vas,
> > > +						unsigned long npages,
> > > +						unsigned long *src_mpfn,
> > > +						unsigned long *mpfn, u64 addr)
> > > +{
> > > +	unsigned long i;
> > > +
> > > +	for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
> > > +		struct page *page;
> > > +
> > > +		if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
> > > +			continue;
> > > +
> > > +		if (vas)
> > > +			page = alloc_page_vma(GFP_HIGHUSER, vas, addr);
> > > +		else
> > > +			page = alloc_page(GFP_HIGHUSER);
> > > +
> > > +		if (!page)
> > > +			return -ENOMEM;
> > > +
> > > +		lock_page(page);
> > > +		mpfn[i] = migrate_pfn(page_to_pfn(page));
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_evict_to_sram - Evict GPU SVM range to SRAM
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + *
> > > + * Similar to __drm_gpusvm_migrate_to_sram but does not require the mmap lock;
> > > + * migration is done via the migrate_device_* functions. This is a fallback
> > > + * path, as it is preferred to issue migrations with the mmap lock held.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +static int drm_gpusvm_evict_to_sram(struct drm_gpusvm *gpusvm,
> > > +				    struct drm_gpusvm_range *range)
> > > +{
> > > +	unsigned long npages;
> > > +	struct page **pages;
> > > +	unsigned long *src, *dst;
> > > +	dma_addr_t *dma_addr;
> > > +	void *buf;
> > > +	int i, err = 0;
> > > +
> > > +	npages = npages_in_range(range->va.start, range->va.end);
> > > +
> > > +	buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr) +
> > > +		       sizeof(*pages), GFP_KERNEL);
> > > +	if (!buf) {
> > > +		err = -ENOMEM;
> > > +		goto err_out;
> > > +	}
> > > +	src = buf;
> > > +	dst = buf + (sizeof(*src) * npages);
> > > +	dma_addr = buf + (2 * sizeof(*src) * npages);
> > > +	pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) * npages;
> > > +
> > > +	err = gpusvm->ops->populate_vram_pfn(gpusvm, range->vram_allocation,
> > > +					     npages, src);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	err = migrate_device_vma_range(gpusvm->mm,
> > > +				       gpusvm->device_private_page_owner, src,
> > > +				       npages, range->va.start);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	err = drm_gpusvm_migrate_populate_sram_pfn(NULL, npages, src, dst, 0);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> > > +					   dst, npages, DMA_BIDIRECTIONAL);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	for (i = 0; i < npages; ++i)
> > > +		pages[i] = migrate_pfn_to_page(src[i]);
> > > +
> > > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr, npages);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +err_finalize:
> > > +	if (err)
> > > +		drm_gpusvm_migration_put_pages(npages, dst);
> > > +	migrate_device_pages(src, dst, npages);
> > > +	migrate_device_finalize(src, dst, npages);
> > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
> > > +				       DMA_BIDIRECTIONAL);
> > > +err_free:
> > > +	kvfree(buf);
> > > +err_out:
> > > +
> > > +	return err;
> > > +}
> > > +
> > > +/**
> > > + * __drm_gpusvm_migrate_to_sram - Migrate GPU SVM range to SRAM (internal)
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @vas: Pointer to the VM area structure
> > > + * @page: Pointer to the page for fault handling (can be NULL)
> > > + * @start: Start address of the migration range
> > > + * @end: End address of the migration range
> > > + *
> > > + * This internal function performs the migration of the specified GPU SVM range
> > > + * to SRAM. It sets up the migration, populates + dma maps SRAM PFNs, and
> > > + * invokes the driver-specific operations for migration to SRAM.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +static int __drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > +					struct vm_area_struct *vas,
> > > +					struct page *page,
> > > +					u64 start, u64 end)
> > > +{
> > > +	struct migrate_vma migrate = {
> > > +		.vma		= vas,
> > > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > > +		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
> > > +		.fault_page	= page,
> > > +	};
> > > +	unsigned long npages;
> > > +	struct page **pages;
> > > +	dma_addr_t *dma_addr;
> > > +	void *buf;
> > > +	int i, err = 0;
> > > +
> > > +	mmap_assert_locked(gpusvm->mm);
> > > +
> > > +	/* Corner case where the VM area struct has been partially unmapped */
> > > +	if (start < vas->vm_start)
> > > +		start = vas->vm_start;
> > > +	if (end > vas->vm_end)
> > > +		end = vas->vm_end;
> > > +
> > > +	migrate.start = start;
> > > +	migrate.end = end;
> > > +	npages = npages_in_range(start, end);
> > > +
> > > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
> > > +		       sizeof(*pages), GFP_KERNEL);
> > > +	if (!buf) {
> > > +		err = -ENOMEM;
> > > +		goto err_out;
> > > +	}
> > > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
> > > +
> > > +	migrate.vma = vas;
> > > +	migrate.src = buf;
> > > +	migrate.dst = migrate.src + npages;
> > > +
> > > +	err = migrate_vma_setup(&migrate);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	/* Raced with another CPU fault, nothing to do */
> > > +	if (!migrate.cpages)
> > > +		goto err_free;
> > > +
> > > +	err = drm_gpusvm_migrate_populate_sram_pfn(vas, npages,
> > > +						   migrate.src, migrate.dst,
> > > +						   start);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
> > > +					   migrate.dst, npages,
> > > +					   DMA_BIDIRECTIONAL);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	for (i = 0; i < npages; ++i)
> > > +		pages[i] = migrate_pfn_to_page(migrate.src[i]);
> > > +
> > > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr, npages);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +err_finalize:
> > > +	if (err)
> > > +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> > > +	migrate_vma_pages(&migrate);
> > > +	migrate_vma_finalize(&migrate);
> > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
> > > +				       DMA_BIDIRECTIONAL);
> > > +err_free:
> > > +	kvfree(buf);
> > > +err_out:
> > > +	mmap_assert_locked(gpusvm->mm);
> > > +
> > > +	return err;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_to_sram - Migrate (evict) GPU SVM range to SRAM
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + * @ctx: GPU SVM context
> > > + *
> > > + * This function initiates the migration of the specified GPU SVM range to
> > > + * SRAM. It performs necessary checks and invokes the internal migration
> > > + * function for actual migration.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       const struct drm_gpusvm_ctx *ctx)
> > > +{
> > > +	u64 start = range->va.start, end = range->va.end;
> > > +	struct mm_struct *mm = gpusvm->mm;
> > > +	struct vm_area_struct *vas;
> > > +	int err;
> > > +	bool retry = false;
> > > +
> > > +	if (!ctx->mmap_locked) {
> > > +		if (!mmget_not_zero(mm)) {
> > > +			err = -EFAULT;
> > > +			goto err_out;
> > > +		}
> > > +		if (ctx->trylock_mmap) {
> > > +			if (!mmap_read_trylock(mm))  {
> > > +				err = drm_gpusvm_evict_to_sram(gpusvm, range);
> > > +				goto err_mmput;
> > > +			}
> > > +		} else {
> > > +			mmap_read_lock(mm);
> > > +		}
> > > +	}
> > > +
> > > +	mmap_assert_locked(mm);
> > > +
> > > +	/*
> > > +	 * Loop required to find all VM area structs for the corner case when
> > > +	 * VRAM backing has been partially unmapped from MM's address space.
> > > +	 */
> > > +again:
> > > +	vas = find_vma(mm, start);
> > > +	if (!vas) {
> > > +		if (!retry)
> > > +			err = -ENOENT;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	if (end <= vas->vm_start || start >= vas->vm_end) {
> > > +		if (!retry)
> > > +			err = -EINVAL;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	err = __drm_gpusvm_migrate_to_sram(gpusvm, vas, NULL, start, end);
> > > +	if (err)
> > > +		goto err_mmunlock;
> > > +
> > > +	if (vas->vm_end < end) {
> > > +		retry = true;
> > > +		start = vas->vm_end;
> > > +		goto again;
> > > +	}
> > > +
> > > +	if (!ctx->mmap_locked) {
> > > +		mmap_read_unlock(mm);
> > > +		/*
> > > +		 * Using mmput_async as this function can be called while
> > > +		 * holding a dma-resv lock, and a final put can grab the mmap
> > > +		 * lock, causing a lock inversion.
> > > +		 */
> > > +		mmput_async(mm);
> > > +	}
> > > +
> > > +	return 0;
> > > +
> > > +err_mmunlock:
> > > +	if (!ctx->mmap_locked)
> > > +		mmap_read_unlock(mm);
> > > +err_mmput:
> > > +	if (!ctx->mmap_locked)
> > > +		mmput_async(mm);
> > > +err_out:
> > > +	return err;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_page_free - Put GPU SVM zone device data associated with a page
> > > + * @page: Pointer to the page
> > > + *
> > > + * This function is a callback used to put the GPU SVM zone device data
> > > + * associated with a page when it is being released.
> > > + */
> > > +static void drm_gpusvm_page_free(struct page *page)
> > > +{
> > > +	drm_gpusvm_zdd_put(page->zone_device_data);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_to_ram - Migrate GPU SVM range to RAM (page fault handler)
> > > + * @vmf: Pointer to the fault information structure
> > > + *
> > > + * This function is a page fault handler used to migrate a GPU SVM range to RAM.
> > > + * It retrieves the GPU SVM range information from the faulting page and invokes
> > > + * the internal migration function to migrate the range back to RAM.
> > > + *
> > > + * Returns:
> > > + * VM_FAULT_SIGBUS on failure, 0 on success.
> > > + */
> > > +static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault *vmf)
> > > +{
> > > +	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
> > > +	int err;
> > > +
> > > +	err = __drm_gpusvm_migrate_to_sram(zdd->range->gpusvm,
> > > +					   vmf->vma, vmf->page,
> > > +					   zdd->range->va.start,
> > > +					   zdd->range->va.end);
> > > +
> > > +	return err ? VM_FAULT_SIGBUS : 0;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_pagemap_ops - Device page map operations for GPU SVM
> > > + */
> > > +static const struct dev_pagemap_ops drm_gpusvm_pagemap_ops = {
> > > +	.page_free = drm_gpusvm_page_free,
> > > +	.migrate_to_ram = drm_gpusvm_migrate_to_ram,
> > > +};
> > > +
> > > +/**
> > > + * drm_gpusvm_pagemap_ops_get - Retrieve GPU SVM device page map operations
> > > + *
> > > + * Returns:
> > > + * Pointer to the GPU SVM device page map operations structure.
> > > + */
> > > +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void)
> > > +{
> > > +	return &drm_gpusvm_pagemap_ops;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_has_mapping - Check if GPU SVM has mapping for the given address range
> > > + * @gpusvm: Pointer to the GPU SVM structure.
> > > + * @start: Start address
> > > + * @end: End address
> > > + *
> > > + * Returns:
> > > + * True if GPU SVM has mapping, False otherwise
> > > + */
> > > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start, u64 end)
> > > +{
> > > +	struct drm_gpusvm_notifier *notifier;
> > > +
> > > +	drm_gpusvm_for_each_notifier(notifier, gpusvm, start, end) {
> > > +		struct drm_gpusvm_range *range = NULL;
> > > +
> > > +		drm_gpusvm_for_each_range(range, notifier, start, end)
> > > +			return true;
> > > +	}
> > > +
> > > +	return false;
> > > +}
> > > diff --git a/drivers/gpu/drm/xe/drm_gpusvm.h b/drivers/gpu/drm/xe/drm_gpusvm.h
> > > new file mode 100644
> > > index 000000000000..0ea70f8534a8
> > > --- /dev/null
> > > +++ b/drivers/gpu/drm/xe/drm_gpusvm.h
> > > @@ -0,0 +1,415 @@
> > > +/* SPDX-License-Identifier: MIT */
> > > +/*
> > > + * Copyright © 2024 Intel Corporation
> > > + */
> > > +
> > > +#ifndef __DRM_GPUSVM_H__
> > > +#define __DRM_GPUSVM_H__
> > > +
> > > +#include <linux/kref.h>
> > > +#include <linux/mmu_notifier.h>
> > > +#include <linux/workqueue.h>
> > > +
> > > +struct dev_pagemap_ops;
> > > +struct drm_device;
> > > +struct drm_gpusvm;
> > > +struct drm_gpusvm_notifier;
> > > +struct drm_gpusvm_ops;
> > > +struct drm_gpusvm_range;
> > > +
> > > +/**
> > > + * struct drm_gpusvm_ops - Operations structure for GPU SVM
> > > + *
> > > + * This structure defines the operations for GPU Shared Virtual Memory (SVM).
> > > + * These operations are provided by the GPU driver to manage SVM ranges and
> > > + * perform operations such as migration between VRAM and system RAM.
> > > + */
> > > +struct drm_gpusvm_ops {
> > > +	/**
> > > +	 * @notifier_alloc: Allocate a GPU SVM notifier (optional)
> > > +	 *
> > > +	 * This function shall allocate a GPU SVM notifier.
> > > +	 *
> > > +	 * Returns:
> > > +	 * Pointer to the allocated GPU SVM notifier on success, NULL on failure.
> > > +	 */
> > > +	struct drm_gpusvm_notifier *(*notifier_alloc)(void);
> > > +
> > > +	/**
> > > +	 * @notifier_free: Free a GPU SVM notifier (optional)
> > > +	 * @notifier: Pointer to the GPU SVM notifier to be freed
> > > +	 *
> > > +	 * This function shall free a GPU SVM notifier.
> > > +	 */
> > > +	void (*notifier_free)(struct drm_gpusvm_notifier *notifier);
> > > +
> > > +	/**
> > > +	 * @range_alloc: Allocate a GPU SVM range (optional)
> > > +	 * @gpusvm: Pointer to the GPU SVM
> > > +	 *
> > > +	 * This function shall allocate a GPU SVM range.
> > > +	 *
> > > +	 * Returns:
> > > +	 * Pointer to the allocated GPU SVM range on success, NULL on failure.
> > > +	 */
> > > +	struct drm_gpusvm_range *(*range_alloc)(struct drm_gpusvm *gpusvm);
> > > +
> > > +	/**
> > > +	 * @range_free: Free a GPU SVM range (optional)
> > > +	 * @range: Pointer to the GPU SVM range to be freed
> > > +	 *
> > > +	 * This function shall free a GPU SVM range.
> > > +	 */
> > > +	void (*range_free)(struct drm_gpusvm_range *range);
> > > +
> > > +	/**
> > > +	 * @vram_release: Release VRAM allocation (optional)
> > > +	 * @vram_allocation: Driver-private pointer to the VRAM allocation
> > > +	 *
> > > +	 * This function shall release VRAM allocation and expects to drop a
> > > +	 * reference to VRAM allocation.
> > > +	 */
> > > +	void (*vram_release)(void *vram_allocation);
> > > +
> > > +	/**
> > > +	 * @populate_vram_pfn: Populate VRAM PFN (required for migration)
> > > +	 * @gpusvm: Pointer to the GPU SVM
> > > +	 * @vram_allocation: Driver-private pointer to the VRAM allocation
> > > +	 * @npages: Number of pages to populate
> > > +	 * @pfn: Array of page frame numbers to populate
> > > +	 *
> > > +	 * This function shall populate VRAM page frame numbers (PFN).
> > > +	 *
> > > +	 * Returns:
> > > +	 * 0 on success, a negative error code on failure.
> > > +	 */
> > > +	int (*populate_vram_pfn)(struct drm_gpusvm *gpusvm,
> > > +				 void *vram_allocation,
> > > +				 unsigned long npages,
> > > +				 unsigned long *pfn);
> > > +
> > > +	/**
> > > +	 * @copy_to_vram: Copy to VRAM (required for migration)
> > > +	 * @gpusvm: Pointer to the GPU SVM
> > > +	 * @pages: Pointer to array of VRAM pages (destination)
> > > +	 * @dma_addr: Pointer to array of DMA addresses (source)
> > > +	 * @npages: Number of pages to copy
> > > +	 *
> > > +	 * This function shall copy pages to VRAM.
> > > +	 *
> > > +	 * Returns:
> > > +	 * 0 on success, a negative error code on failure.
> > > +	 */
> > > +	int (*copy_to_vram)(struct drm_gpusvm *gpusvm,
> > > +			    struct page **pages,
> > > +			    dma_addr_t *dma_addr,
> > > +			    unsigned long npages);
> > > +
> > > +	/**
> > > +	 * @copy_to_sram: Copy to system RAM (required for migration)
> > > +	 * @gpusvm: Pointer to the GPU SVM
> > > +	 * @pages: Pointer to array of VRAM pages (source)
> > > +	 * @dma_addr: Pointer to array of DMA addresses (destination)
> > > +	 * @npages: Number of pages to copy
> > > +	 *
> > > +	 * This function shall copy pages to system RAM.
> > > +	 *
> > > +	 * Returns:
> > > +	 * 0 on success, a negative error code on failure.
> > > +	 */
> > > +	int (*copy_to_sram)(struct drm_gpusvm *gpusvm,
> > > +			    struct page **pages,
> > > +			    dma_addr_t *dma_addr,
> > > +			    unsigned long npages);
> > > +
> > > +	/**
> > > +	 * @invalidate: Invalidate GPU SVM notifier (required)
> > > +	 * @gpusvm: Pointer to the GPU SVM
> > > +	 * @notifier: Pointer to the GPU SVM notifier
> > > +	 * @mmu_range: Pointer to the mmu_notifier_range structure
> > > +	 *
> > > +	 * This function shall invalidate the GPU page tables. It can safely
> > > +	 * walk the notifier range RB tree/list in this function. Called while
> > > +	 * holding the notifier lock.
> > > +	 */
> > > +	void (*invalidate)(struct drm_gpusvm *gpusvm,
> > > +			   struct drm_gpusvm_notifier *notifier,
> > > +			   const struct mmu_notifier_range *mmu_range);
> > > +};
> > > +
> > > +/**
> > > + * struct drm_gpusvm_notifier - Structure representing a GPU SVM notifier
> > > + *
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: MMU interval notifier
> > > + * @interval: Interval for the notifier
> > > + * @rb: Red-black tree node for the parent GPU SVM structure notifier tree
> > > + * @root: Cached root node of the RB tree containing ranges
> > > + * @range_list: List head of ranges in the same order they appear in the
> > > + *              interval tree. This is useful to keep iterating over ranges
> > > + *              while doing modifications to the RB tree.
> > > + * @flags.removed: Flag indicating whether the MMU interval notifier has been
> > > + *                 removed
> > > + *
> > > + * This structure represents a GPU SVM notifier.
> > > + */
> > > +struct drm_gpusvm_notifier {
> > > +	struct drm_gpusvm *gpusvm;
> > > +	struct mmu_interval_notifier notifier;
> > > +	struct {
> > > +		u64 start;
> > > +		u64 end;
> > > +	} interval;
> > > +	struct {
> > > +		struct rb_node node;
> > > +		struct list_head entry;
> > > +		u64 __subtree_last;
> > > +	} rb;
> > > +	struct rb_root_cached root;
> > > +	struct list_head range_list;
> > > +	struct {
> > > +		u32 removed : 1;
> > > +	} flags;
> > > +};
> > > +
> > > +/**
> > > + * struct drm_gpusvm_range - Structure representing a GPU SVM range
> > > + *
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: Pointer to the GPU SVM notifier
> > > + * @refcount: Reference count for the range
> > > + * @rb: Red-black tree node for the parent GPU SVM notifier structure range tree
> > > + * @va: Virtual address range
> > > + * @notifier_seq: Notifier sequence number of the range's pages
> > > + * @pages: Pointer to the array of pages (if backing store is in VRAM)
> > > + * @dma_addr: DMA address array (if backing store is SRAM and DMA mapped)
> > > + * @vram_allocation: Driver-private pointer to the VRAM allocation
> > > + * @order: Order of dma mapping. i.e. PAGE_SIZE << order is mapping size
> > > + * @flags.migrate_vram: Flag indicating whether the range can be migrated to VRAM
> > > + * @flags.unmapped: Flag indicating if the range has been unmapped
> > > + * @flags.partial_unmap: Flag indicating if the range has been partially unmapped
> > > + * @flags.has_vram_pages: Flag indicating if the range has vram pages
> > > + * @flags.has_dma_mapping: Flag indicating if the range has a DMA mapping
> > > + * @flags.kfree_mapping: Flag indicating @dma_addr is a compact allocation based
> > > + *                       on @order which releases via kfree
> > > + *
> > > + * This structure represents a GPU SVM range used for tracking memory ranges
> > > + * mapped in a DRM device.
> > > + */
> > > +struct drm_gpusvm_range {
> > > +	struct drm_gpusvm *gpusvm;
> > > +	struct drm_gpusvm_notifier *notifier;
> > > +	struct kref refcount;
> > > +	struct {
> > > +		struct rb_node node;
> > > +		struct list_head entry;
> > > +		u64 __subtree_last;
> > > +	} rb;
> > > +	struct {
> > > +		u64 start;
> > > +		u64 end;
> > > +	} va;
> > > +	unsigned long notifier_seq;
> > > +	union {
> > > +		struct page **pages;
> > > +		dma_addr_t *dma_addr;
> > > +	};
> > > +	void *vram_allocation;
> > > +	u16 order;
> > > +	struct {
> > > +		/* All flags below must be set upon creation */
> > > +		u16 migrate_vram : 1;
> > > +		/* All flags below must be set / cleared under notifier lock */
> > > +		u16 unmapped : 1;
> > > +		u16 partial_unmap : 1;
> > > +		u16 has_vram_pages : 1;
> > > +		u16 has_dma_mapping : 1;
> > > +		u16 kfree_mapping : 1;
> > > +	} flags;
> > > +};
> > > +
> > > +/**
> > > + * struct drm_gpusvm - GPU SVM structure
> > > + *
> > > + * @name: Name of the GPU SVM
> > > + * @drm: Pointer to the DRM device structure
> > > + * @mm: Pointer to the mm_struct for the address space
> > > + * @device_private_page_owner: Device private pages owner
> > > + * @mm_start: Start address of GPU SVM
> > > + * @mm_range: Range of the GPU SVM
> > > + * @notifier_size: Size of individual notifiers
> > > + * @ops: Pointer to the operations structure for GPU SVM
> > > + * @chunk_sizes: Pointer to the array of chunk sizes used in range allocation.
> > > + *               Entries should be powers of 2 in descending order.
> > > + * @num_chunks: Number of chunks
> > > + * @notifier_lock: Read-write semaphore for protecting notifier operations
> > > + * @zdd_wq: Workqueue for deferred work on zdd destruction
> > > + * @root: Cached root node of the Red-Black tree containing GPU SVM notifiers
> > > + * @notifier_list: List head of notifiers in the same order they appear in
> > > + *                 the interval tree. This is useful to keep iterating over
> > > + *                 notifiers while doing modifications to the RB tree.
> > > + *
> > > + * This structure represents a GPU SVM (Shared Virtual Memory) used for tracking
> > > + * memory ranges mapped in a DRM (Direct Rendering Manager) device.
> > > + *
> > > + * No reference counting is provided, as this is expected to be embedded in the
> > > + * driver VM structure along with the struct drm_gpuvm, which handles reference
> > > + * counting.
> > > + */
> > > +struct drm_gpusvm {
> > > +	const char *name;
> > > +	struct drm_device *drm;
> > > +	struct mm_struct *mm;
> > > +	void *device_private_page_owner;
> > > +	u64 mm_start;
> > > +	u64 mm_range;
> > > +	u64 notifier_size;
> > > +	const struct drm_gpusvm_ops *ops;
> > > +	const u64 *chunk_sizes;
> > > +	int num_chunks;
> > > +	struct rw_semaphore notifier_lock;
> > > +	struct workqueue_struct *zdd_wq;
> > > +	struct rb_root_cached root;
> > > +	struct list_head notifier_list;
> > > +};
> > > +
> > > +/**
> > > + * struct drm_gpusvm_ctx - DRM GPU SVM context
> > > + *
> > > + * @mmap_locked: mmap lock is locked
> > > + * @trylock_mmap: trylock mmap lock, used to avoid locking inversions
> > > + *                (e.g. dma-resv -> mmap lock)
> > > + * @in_notifier: entering from a MMU notifier
> > > + * @read_only: operating on read-only memory
> > > + * @vram_possible: possible to use VRAM
> > > + * @prefault: prefault pages
> > > + *
> > > + * Context that DRM GPU SVM is operating in (i.e. user arguments).
> > > + */
> > > +struct drm_gpusvm_ctx {
> > > +	u32 mmap_locked :1;
> > > +	u32 trylock_mmap :1;
> > > +	u32 in_notifier :1;
> > > +	u32 read_only :1;
> > > +	u32 vram_possible :1;
> > > +	u32 prefault :1;
> > > +};
> > > +
> > > +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> > > +		    const char *name, struct drm_device *drm,
> > > +		    struct mm_struct *mm, void *device_private_page_owner,
> > > +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> > > +		    const struct drm_gpusvm_ops *ops,
> > > +		    const u64 *chunk_sizes, int num_chunks);
> > > +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm);
> > > +void drm_gpusvm_free(struct drm_gpusvm *gpusvm);
> > > +
> > > +struct drm_gpusvm_range *
> > > +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64 fault_addr,
> > > +				u64 gpuva_start, u64 gpuva_end,
> > > +				const struct drm_gpusvm_ctx *ctx);
> > > +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> > > +			     struct drm_gpusvm_range *range);
> > > +
> > > +struct drm_gpusvm_range *
> > > +drm_gpusvm_range_get(struct drm_gpusvm_range *range);
> > > +void drm_gpusvm_range_put(struct drm_gpusvm_range *range);
> > > +
> > > +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> > > +				  struct drm_gpusvm_range *range);
> > > +
> > > +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       const struct drm_gpusvm_ctx *ctx);
> > > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > > +				  struct drm_gpusvm_range *range,
> > > +				  const struct drm_gpusvm_ctx *ctx);
> > > +
> > > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       void *vram_allocation,
> > > +			       const struct drm_gpusvm_ctx *ctx);
> > > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       const struct drm_gpusvm_ctx *ctx);
> > > +
> > > +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void);
> > > +
> > > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start, u64 end);
> > > +
> > > +struct drm_gpusvm_range *
> > > +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64 start, u64 end);
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_lock - Lock GPU SVM notifier
> > > + * @gpusvm__: Pointer to the GPU SVM structure.
> > > + *
> > > + * Abstract client usage GPU SVM notifier lock, take lock
> > > + */
> > > +#define drm_gpusvm_notifier_lock(gpusvm__)	\
> > > +	down_read(&(gpusvm__)->notifier_lock)
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_unlock - Unlock GPU SVM notifier
> > > + * @gpusvm__: Pointer to the GPU SVM structure.
> > > + *
> > > + * Abstract client usage GPU SVM notifier lock, drop lock
> > > + */
> > > +#define drm_gpusvm_notifier_unlock(gpusvm__)	\
> > > +	up_read(&(gpusvm__)->notifier_lock)
> > > +
> > > +/**
> > > + * __drm_gpusvm_range_next - Get the next GPU SVM range in the list
> > > + * @range: a pointer to the current GPU SVM range
> > > + *
> > > + * Return: A pointer to the next drm_gpusvm_range if available, or NULL if the
> > > + *         current range is the last one or if the input range is NULL.
> > > + */
> > > +static inline struct drm_gpusvm_range *
> > > +__drm_gpusvm_range_next(struct drm_gpusvm_range *range)
> > > +{
> > > +	if (range && !list_is_last(&range->rb.entry,
> > > +				   &range->notifier->range_list))
> > > +		return list_next_entry(range, rb.entry);
> > > +
> > > +	return NULL;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_for_each_range - Iterate over GPU SVM ranges in a notifier
> > > + * @range__: Iterator variable for the ranges. If set, it indicates the start of
> > > + *	     the iterator. If NULL, call drm_gpusvm_range_find() to get the range.
> > > + * @notifier__: Pointer to the GPU SVM notifier
> > > + * @start__: Start address of the range
> > > + * @end__: End address of the range
> > > + *
> > > + * This macro is used to iterate over GPU SVM ranges in a notifier. It is safe
> > > + * to use while holding the driver SVM lock or the notifier lock.
> > > + */
> > > +#define drm_gpusvm_for_each_range(range__, notifier__, start__, end__)	\
> > > +	for ((range__) = (range__) ?:					\
> > > +	     drm_gpusvm_range_find((notifier__), (start__), (end__));	\
> > > +	     (range__) && (range__->va.start < (end__));		\
> > > +	     (range__) = __drm_gpusvm_range_next(range__))
> > > +
> > > +/**
> > > + * drm_gpusvm_range_set_unmapped - Mark a GPU SVM range as unmapped
> > > + * @range: Pointer to the GPU SVM range structure.
> > > + * @mmu_range: Pointer to the MMU notifier range structure.
> > > + *
> > > + * This function marks a GPU SVM range as unmapped and sets the partial_unmap flag
> > > + * if the range partially falls within the provided MMU notifier range.
> > > + */
> > > +static inline void
> > > +drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range *range,
> > > +			      const struct mmu_notifier_range *mmu_range)
> > > +{
> > > +	lockdep_assert_held_write(&range->gpusvm->notifier_lock);
> > > +
> > > +	range->flags.unmapped = true;
> > > +	if (range->va.start < mmu_range->start ||
> > > +	    range->va.end > mmu_range->end)
> > > +		range->flags.partial_unmap = true;
> > > +}
> > > +
> > > +#endif /* __DRM_GPUSVM_H__ */
> > > -- 
> > > 2.34.1
> > > 
> > 
> > -- 
> > Daniel Vetter
> > Software Engineer, Intel Corporation
> > http://blog.ffwll.ch
Daniel Vetter Sept. 2, 2024, 11:40 a.m. UTC | #27
On Thu, Aug 29, 2024 at 04:49:15PM +0000, Matthew Brost wrote:
> On Wed, Aug 28, 2024 at 08:50:02PM +0200, Daniel Vetter wrote:
> > On Tue, Aug 27, 2024 at 07:48:38PM -0700, Matthew Brost wrote:
> > > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       const struct drm_gpusvm_ctx *ctx)
> > > +{
> > > +	u64 start = range->va.start, end = range->va.end;
> > > +	struct mm_struct *mm = gpusvm->mm;
> > > +	struct vm_area_struct *vas;
> > > +	int err;
> > > +	bool retry = false;
> > > +
> > > +	if (!ctx->mmap_locked) {
> > > +		if (!mmget_not_zero(mm)) {
> > > +			err = -EFAULT;
> > > +			goto err_out;
> > > +		}
> > > +		if (ctx->trylock_mmap) {
> > > +			if (!mmap_read_trylock(mm))  {
> > > +				err = drm_gpusvm_evict_to_sram(gpusvm, range);
> > > +				goto err_mmput;
> > > +			}
> > > +		} else {
> > > +			mmap_read_lock(mm);
> > > +		}
> > > +	}
> > > +
> > > +	mmap_assert_locked(mm);
> > > +
> > > +	/*
> > > +	 * Loop required to find all VMA area structs for the corner case when
> > > +	 * VRAM backing has been partially unmapped from MM's address space.
> > > +	 */
> > > +again:
> > > +	vas = find_vma(mm, start);
> > 
> > So a hilarious case that amdkfd gets a bit better but still not entirely
> > is that the original vma might be entirely gone. Even when you can still get
> > at the mm of that process. This happens with cow (or shared too I think)
> > mappings in forked child processes, or also if you play fun mremap games.
> > 
> > I think that outside of the ->migrate_to_ram callback migration/eviction
> > to sram cannot assume there's any reasonable vma around and has to
> > unconditionally go with the drm_gpusvm_evict_to_sram path.
> > 
> 
> See my response here [1]. Let me drop the whole trylock thing and
> convert to an 'evict' flag which calls drm_gpusvm_evict_to_sram in
> places where Xe needs to evict VRAM. Or maybe just export that function
> and call it directly. That way the only place the VMA is looked up for
> SRAM -> VRAM is upon CPU page fault.

Yeah I think a dedicated path for migrate_to_ram hook that goes directly
into your evict_to_sram path is the design-clean approach here imo.
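
Purely as a sketch of what I mean (function name made up, assuming the
eviction helper simply gets exported): the driver-internal eviction call
sites then never touch a vma or the mmap lock at all,

	/*
	 * Driver eviction path (e.g. under VRAM pressure): no mmap lock, no
	 * vma lookup, no trylock fallback - operate purely on the physical
	 * VRAM allocation and let gpusvm move it back to SRAM.
	 */
	static int driver_evict_vram_range(struct drm_gpusvm *gpusvm,
					   struct drm_gpusvm_range *range)
	{
		return drm_gpusvm_evict_to_sram(gpusvm, range);
	}

while the ->migrate_to_ram fault handler stays the only place that ever
looks at a vma, and only because migrate_vma_setup() wants one as an
argument.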

> [1] https://patchwork.freedesktop.org/patch/610955/?series=137870&rev=1#comment_1111164
> 
> > Also in the migrate_to_ram case the vma is essentially nothing more than
> > informational about which ranges we might need if we prefault a bit (in
> > case the child changed the vma compared to the original one). So it's good
> > as a parameter for migrate_vma_setup, but absolutely nothing else.
> > 
> > amdkfd almost gets this right by being entirely based on their svm_range
> > structures, except they still have the lingering check that the original mm
> > is still alive. Of course you cannot ever use that memory on the gpu
> > anymore, but the child process could get very pissed if their memory is
> > suddenly gone. Also the eviction code has the same issue as yours and
> > limits itself to vma that still exist in the original mm, leaving anything
> > that's orphaned in children or remaps stuck in vram. At least that's my
> > understanding, I might very well be wrong.
> > 
> > So probably want a bunch of these testcases too to make sure that all
> > works, and we're not stuck with memory allocations in vram that we can't
> > move out.
> 
> When writing some additional test cases, let me add hooks in my IGTs to
> be able to verify we are not orphaning VRAM too.

So maybe apply caution, I'm honestly not sure whether core mm makes any
guarantees about not orphaning stuff, at least for a little bit.

Over the w/e my brain tossed me the "so are we sure we can tear down our
zone_device data, the page array specifically" brain teaser. And I think
the answer is that we have to wait until all page references disappear,
which might take a long time. Core mm makes no guarantee about elevated
page references disappearing in a timely manner, at least as far as I
know. Which is also why migration is a best effort thing only.
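
If I'm reading mm/memremap.c right, memunmap_pages() is where that wait
ends up happening, so (sketch only) the pagemap teardown is the thing that
can stall:

	static void driver_vram_pagemap_fini(struct dev_pagemap *pagemap)
	{
		/*
		 * Blocks until every device-private page of this pagemap has
		 * been freed, i.e. until all elevated page references have
		 * been dropped - which can take an arbitrarily long time.
		 */
		memunmap_pages(pagemap);
	}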

Cheers, Sima
Daniel Vetter Sept. 2, 2024, 11:53 a.m. UTC | #28
On Thu, Aug 29, 2024 at 05:27:13PM +0000, Matthew Brost wrote:
> On Thu, Aug 29, 2024 at 11:45:08AM +0200, Daniel Vetter wrote:
> > On Tue, Aug 27, 2024 at 07:48:38PM -0700, Matthew Brost wrote:
> > > This patch introduces support for GPU Shared Virtual Memory (SVM) in the
> > > Direct Rendering Manager (DRM) subsystem. SVM allows for seamless
> > > sharing of memory between the CPU and GPU, enhancing performance and
> > > flexibility in GPU computing tasks.
> > > 
> > > The patch adds the necessary infrastructure for SVM, including data
> > > structures and functions for managing SVM ranges and notifiers. It also
> > > provides mechanisms for allocating, deallocating, and migrating memory
> > > regions between system RAM and GPU VRAM.
> > > 
> > > This mid-layer is largely inspired by GPUVM.
> > > 
> > > Cc: Dave Airlie <airlied@redhat.com>
> > > Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > > Cc: Christian König <christian.koenig@amd.com>
> > > Cc: <dri-devel@lists.freedesktop.org>
> > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > 
> > Still not sure I've got the right race that you paper over with
> > mmap_write_lock, but I spotted a few things, comments inline.
> > 
> 
> I've replied to this issue several times, let's table the
> mmap_write_lock issue in this reply - a lot of other things to get
> through. Current thinking is try to add a range->migrate_lock like AMD
> which I state here [1]. Let's continue discussing the mmap lock issue
> there if possible.

Yeah I wrote replies as I read code, so there's a bit of a mess from my side
here. Apologies for that.

> [1] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1111169

Some more replies below that I think we haven't covered anywhere else yet.

> > > + * 2) Garbage Collector.
> > > + *
> > > + *	void __driver_garbage_collector(struct drm_gpusvm *gpusvm,
> > > + *					struct drm_gpusvm_range *range)
> > > + *	{
> > > + *		struct drm_gpusvm_ctx ctx = {};
> > > + *
> > > + *		assert_driver_svm_locked(gpusvm);
> > > + *
> > > + *		// Partial unmap, migrate any remaining VRAM pages back to SRAM
> > > + *		if (range->flags.partial_unmap)
> > > + *			drm_gpusvm_migrate_to_sram(gpusvm, range, &ctx);
> > 
> > Note that the migration back to sram isn't guaranteed to succeed, so you
> > might be still stuck with partially migrated range. This might be a case
> > where hmm gives you vram pfns, but the range you have doesn't have any
> > vram allocation anymore because you droppped it here. Not sure tbh.
> >
> 
> HMM isn't in the picture here, nor will a VMA be, once the
> drm_gpusvm_evict_to_sram path is always taken, as discussed here [2]. I
> might have a corner case BO refcounting / TTM resource lookup bug in
> somewhere in here which needs to be resolved though (e.g. eviction
> racing with this code path), will try to close on that.
> 
> [2] https://patchwork.freedesktop.org/patch/610955/?series=137870&rev=1#comment_1111164

So maybe my understanding is wrong, but from my reading of the device
migration code the exact same non-guarantees as for the sram2sram
migration code apply:

- There's no guarantee the page/folio doesn't have an elevated refcount,
  which makes the migration fail (in try_to_migrate, where it checks for
  surplus refcounts).

- There's no guarantee you'll get the page/folio lock, which makes the
  migration fail. Worse the core mm seems to use a fallback to per-page
  locking as it's extremely crude "get out of deadlocks due to acquiring
  multiple page locks" card.
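
Which I think means the caller has to treat the per-page
MIGRATE_PFN_MIGRATE bit as authoritative and cope with any subset being
rejected, roughly like this (sketch; dst_pages and the copy helper are
placeholders):

	err = migrate_vma_setup(&migrate);
	if (err)
		goto out;

	/*
	 * Pages with an elevated refcount, or where the page lock couldn't
	 * be taken, simply don't get MIGRATE_PFN_MIGRATE set. That's not an
	 * error - those pages just stay where they are - so only set up
	 * destinations (and copy) for the pages that were accepted.
	 */
	for (i = 0; i < npages; ++i) {
		if (!(migrate.src[i] & MIGRATE_PFN_MIGRATE)) {
			migrate.dst[i] = 0;
			continue;
		}
		migrate.dst[i] = migrate_pfn(page_to_pfn(dst_pages[i]));
	}

	driver_copy_accepted_pages(&migrate, npages);	/* dma copy, placeholder */

	migrate_vma_pages(&migrate);
	migrate_vma_finalize(&migrate);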

> > > +map_pages:
> > > +	if (is_device_private_page(hmm_pfn_to_page(pfns[0]))) {
> > > +		WARN_ON_ONCE(!range->vram_allocation);
> > > +
> > > +		for (i = 0; i < npages; ++i) {
> > > +			pages[i] = hmm_pfn_to_page(pfns[i]);
> > > +
> > > +			if (WARN_ON_ONCE(!is_device_private_page(pages[i]))) {
> > > +				err = -EOPNOTSUPP;
> > > +				goto err_free;
> > > +			}
> > > +		}
> > 
> > You can't do the above, because the pfn you get from hmm come with zero
> > guarantees, you neither hold a page reference nor the page lock. The only
> > thing you can do is grab the pagetable lock (or mmu notifier locks) and
> > check it's still valid, before you can touch any state. I think the
> > range->vram_allocation is probably always valid since you clean that up
> > under the same lock/thread, but there's good chances the vram allocation
> > is otherwise already gone for good. Or you get an inconsistent snapshot.
> > 
> 
> I haven't seen this pop in my testing yet which is fairly thorough. My
> thinking was that with migration always enforced at range granularity we'd
> never get mixed mappings from the core as migration is completely under
> control of the driver. Maybe I'm not understanding what you are saying
> here...

So one scenario is that you race (without the mmap write lock or the
migration_mutex design ofc) with another invalidate, and get a partial
view here of mixed vram and sram pages. Until you acquire the mmu notifier
lock and have made sure your pages are still valid, there's essentially no
guarantee.
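
I.e. the only thing that makes the snapshot usable is the usual seqno
dance; roughly (sketch, reusing the names from this patch loosely):

	again:
		hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
		err = hmm_range_fault(&hmm_range);	/* unlocked snapshot */
		if (err == -EBUSY)
			goto again;
		if (err)
			return err;

		/* allocate, dma map, prepare the bind - all still unlocked */

		drm_gpusvm_notifier_lock(gpusvm);
		if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
			/* raced with an invalidation, the snapshot is garbage */
			drm_gpusvm_notifier_unlock(gpusvm);
			goto again;
		}
		driver_commit_bind(gpusvm, range);	/* pages are valid here */
		drm_gpusvm_notifier_unlock(gpusvm);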
> 
> > > +
> > > +		/* Do not race with notifier unmapping pages */
> > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > +		range->flags.has_vram_pages = true;
> > > +		range->pages = pages;
> > > +		if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
> > > +			err = -EAGAIN;
> > > +			__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > +		}
> > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > +	} else {
> > > +		dma_addr_t *dma_addr = (dma_addr_t *)pfns;
> > > +
> > > +		for_each_dma_page(i, j, npages, order) {
> > > +			if (WARN_ON_ONCE(i && order !=
> > > +					 hmm_pfn_to_map_order(pfns[i]))) {
> > > +				err = -EOPNOTSUPP;
> > > +				npages = i;
> > > +				goto err_unmap;
> > > +			}
> > > +			order = hmm_pfn_to_map_order(pfns[i]);
> > > +
> > > +			pages[j] = hmm_pfn_to_page(pfns[i]);
> > > +			if (WARN_ON_ONCE(is_zone_device_page(pages[j]))) {
> > > +				err = -EOPNOTSUPP;
> > > +				npages = i;
> > > +				goto err_unmap;
> > > +			}
> > > +
> > > +			set_page_dirty_lock(pages[j]);
> > > +			mark_page_accessed(pages[j]);
> > 
> > You can't do these, because you don't hold a page reference. They're also
> > not needed because hmm_range_fault goes thorugh the full mkwrite dance,
> > which takes care of these, unlike the gup family of functions.
> >
> 
> This is a leftover from our existing userptr code and it does appear to
> be incorrect. Let me remove this and fixup our userptr code while I'm at
> it.

Ack.

> > > +	vas = vma_lookup(mm, start);
> > > +	if (!vas) {
> > > +		err = -ENOENT;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	if (end > vas->vm_end || start < vas->vm_start) {
> > > +		err = -EINVAL;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	if (!vma_is_anonymous(vas)) {
> > > +		err = -EBUSY;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
> > > +		       sizeof(*pages), GFP_KERNEL);
> > > +	if (!buf) {
> > > +		err = -ENOMEM;
> > > +		goto err_mmunlock;
> > > +	}
> > > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
> > > +
> > > +	zdd = drm_gpusvm_zdd_alloc(range);
> > > +	if (!zdd) {
> > > +		err = -ENOMEM;
> > > +		goto err_free;
> > > +	}
> > > +
> > > +	migrate.vma = vas;
> > > +	migrate.src = buf;
> > > +	migrate.dst = migrate.src + npages;
> > > +
> > > +	err = migrate_vma_setup(&migrate);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	/*
> > > +	 * FIXME: Below cases, !migrate.cpages and migrate.cpages != npages, not
> > > +	 * always an error. Need to revisit possible cases and how to handle. We
> > > +	 * could prefault on migrate.cpages != npages via hmm_range_fault.
> 
> This is a bit stale, can update this comment.
> 
> > > +	 */
> > 
> > Yeah I think especially under contention partial migrations, at least back
> > to sram due to cpu faults, are pretty much expected. And you need to cope
> > somehow.
> > 
> 
> I have seen these pop if the IGT calls mlock on the memory. My thinking
> is migration to VRAM is basically optional and fallback to leaving range
> in SRAM if an error occurs rather than doing a partial migration. This
> is what currently happens so it is coped with.
> 
> If the memory is marked as must be in VRAM (NIY), well then the user
> program has done something wrong and can kill the app (akin to
> segfault).

Yeah SIGBUS for "must be in VRAM" sounds like ok semantics.

> > > +
> > > +	if (!migrate.cpages) {
> > > +		err = -EFAULT;
> > > +		goto err_free;
> > > +	}
> > > +
> > > +	if (migrate.cpages != npages) {
> > > +		err = -EBUSY;
> > > +		goto err_finalize;
> > > +	}

What I think is more fundamental is that I think this one here doesn't
work. For migrate_to_ram you cannot assume that you can always migrate the
entire block, I think to uphold the core mm forward progress rules we need
to allow partial migrations there. And I think your current code allows
that.

But that then means you also are stuck with partial migration state here.
That was the point I tried to make.

> > > +/**
> > > + * __drm_gpusvm_migrate_to_sram - Migrate GPU SVM range to SRAM (internal)
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @vas: Pointer to the VM area structure
> > > + * @page: Pointer to the page for fault handling (can be NULL)
> > > + * @start: Start address of the migration range
> > > + * @end: End address of the migration range
> > > + *
> > > + * This internal function performs the migration of the specified GPU SVM range
> > > + * to SRAM. It sets up the migration, populates + dma maps SRAM PFNs, and
> > > + * invokes the driver-specific operations for migration to SRAM.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +static int __drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > +					struct vm_area_struct *vas,
> > > +					struct page *page,
> > > +					u64 start, u64 end)
> > > +{
> > > +	struct migrate_vma migrate = {
> > > +		.vma		= vas,
> > > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > > +		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
> > > +		.fault_page	= page,
> > > +	};
> > > +	unsigned long npages;
> > > +	struct page **pages;
> > > +	dma_addr_t *dma_addr;
> > > +	void *buf;
> > > +	int i, err = 0;
> > > +
> > > +	mmap_assert_locked(gpusvm->mm);
> > 
> > That's the wrong mm, at least for the ->migrate_to_ram path. You might be
> > called on an anon mapping from a child process. That also means that the
> > vma you're looking at might have no relationship with anything you're
> > tracking in your gpusvm.
> >
> 
> Hmm, as discussed [3] I haven't added tests with child processes yet.
> Let me do that and update the design as needed. This likely isn't
> correct as you say.
> 
> [3] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1111169 

Ack. More tests should definitely help here to figure out what's up, and
what's just me being confused.

> > > +/**
> > > + * drm_gpusvm_migrate_to_ram - Migrate GPU SVM range to RAM (page fault handler)
> > > + * @vmf: Pointer to the fault information structure
> > > + *
> > > + * This function is a page fault handler used to migrate a GPU SVM range to RAM.
> > > + * It retrieves the GPU SVM range information from the faulting page and invokes
> > > + * the internal migration function to migrate the range back to RAM.
> > > + *
> > > + * Returns:
> > > + * VM_FAULT_SIGBUS on failure, 0 on success.
> > > + */
> > > +static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault *vmf)
> > > +{
> > > +	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
> > > +	int err;
> > > +
> > > +	err = __drm_gpusvm_migrate_to_sram(zdd->range->gpusvm,
> > 
> > So I think zdd->range doesn't work, because even within a single mm the
> > vma mapping a given piece of anon memory does not need to be unique, you
> > can duplicate them with mremap.
> > 
> 
> This is attached to a page, not a VMA. Both AMD and Nvidia drivers use a
> similar lookup mechanism.

Yeah the page->zone_device_data is fine. It's the zone_device_data->range
which I think isn't ok.

> > So all you have here is the physical memory and the vma, which might or
> > might not be from the same process as gpusvm->mm.
> > 
> > Also the child process scenario means you using mmap_write on the fault
> > side doesn't stop all cpu faults migrating stuff back.
> > 
> > Somewhat aside, but I think that means amdkfd's svm_range->migration_mutex
> > is busted, because it's va based and so misses concurrently ongoing
> > different mappings moving physical storage around underneath.
> >
> 
> I think all of the above which falls into the fork() + child process
> issues which you have raise. Until I test this out I can't speak to this
> any level of confidence so I won't. Thanks for raising this issue and
> let me write test cases as discussed and educate myself. Once I do that,
> we can engage in further discussions.

I think fork + children will still result in zdd->range being unique (albeit
confused about which mm). You need mremap of some of these mappings to
change the addresses and really cause confusion, which I /think/ (but
didn't test) is doable with a single process even and duplicating anon
memory mappings with mremap.

Cheers, Sima
Daniel Vetter Sept. 2, 2024, 12:20 p.m. UTC | #29
On Fri, Aug 30, 2024 at 11:16:53AM +0200, Thomas Hellström wrote:
> Hi, Matthew
> 
> On Tue, 2024-08-27 at 19:48 -0700, Matthew Brost wrote:
> > +/**
> > + * DOC: Overview
> > + *
> > + * GPU Shared Virtual Memory (GPU SVM) layer for the Direct
> > Rendering Manager (DRM)
> > + *
> > + * The GPU SVM layer is a component of the DRM framework designed to
> > manage shared
> > + * virtual memory between the CPU and GPU. It enables efficient data
> > exchange and
> > + * processing for GPU-accelerated applications by allowing memory
> > sharing and
> > + * synchronization between the CPU's and GPU's virtual address
> > spaces.
> > + *
> > + * Key GPU SVM Components:
> > + * - Notifiers: Used for tracking memory intervals and
> > notifying the
> > + *		GPU of changes, notifiers are sized based on a GPU
> > SVM
> > + *		initialization parameter, with a recommendation of
> > 512M or
> > + *		larger. They maintain a Red-BlacK tree and a list of
> > ranges that
> > + *		fall within the notifier interval. Notifiers are
> > tracked within
> > + *		a GPU SVM Red-BlacK tree and list and are
> > dynamically inserted
> > + *		or removed as ranges within the interval are created
> > or
> > + *		destroyed.
> > + * - Ranges: Represent memory ranges mapped in a DRM device and
> > managed
> > + *	     by GPU SVM. They are sized based on an array of chunk
> > sizes, which
> > + *	     is a GPU SVM initialization parameter, and the CPU
> > address space.
> > + *	     Upon GPU fault, the largest aligned chunk that fits
> > within the
> > + *	     faulting CPU address space is chosen for the range
> > size. Ranges are
> > + *	     expected to be dynamically allocated on GPU fault and
> > removed on an
> > + *	     MMU notifier UNMAP event. As mentioned above, ranges
> > are tracked in
> > + *	     a notifier's Red-Black tree.
> > + * - Operations: Define the interface for driver-specific SVM
> > operations such as
> > + *		 allocation, page collection, migration,
> > invalidations, and VRAM
> > + *		 release.
> > + *
> 
> Another question, since ranges, as I understand it, are per gpuvm and
> per cpu mm, whereas migration is per device and per cpu_mm, (we might
> have multiple gpuvms mapping the same cpu_mm), I figure the gpu_svm is
> per gpuvm, but that makes migration currently inconsistent, right?

I think anything that tracks va must be 1:1 tied to the single specific
cpu mm that we use for hmm/svm. So I think that's ok.

There's a pile of paths where that 1:1 mapping doesn't capture the entire
picture. but I think there the right choice is to just completely ignore
any cpu/gpu mm/vma stuff, and defacto rely on the core mm rmap
datastructure to make sure we find them all (e.g. to update/invalidate
ptes during migration).
-Sima
Daniel Vetter Sept. 2, 2024, 12:33 p.m. UTC | #30
Jumping in here in the middle, since I think it's a solid place to drop my
idea of "align with core mm" gpusvm locking ...

On Thu, Aug 29, 2024 at 08:56:23PM +0000, Matthew Brost wrote:
> On Thu, Aug 29, 2024 at 09:18:29PM +0200, Thomas Hellström wrote:
> Issues with removing a SVM range:
> 
> - Xe bind code stores invalidation / present state in VMA, this would
>   need to be moved to the radix tree. I have Jira open for that work
>   which I believe other developers are going to own.
> - Where would the dma mapping / device pages be stored?
> 	- In the radix tree? What if ATS is enabled? We don't have a
> 	  driver owned radix tree. How do we reasonably connect a driver
> 	  owned radix to a common GPUSVM layer?

Yeah this one is really annoying, because the core mm gets away with
nothing because it can just store the pfn in the pte. And it doesn't need
anything else. So we probably still need something unfortuantely ...

> 	- In the notifier? What is the notifier is sparsely populated?
> 	  We would be wasting huge amounts of memory. What is the
> 	  notifier is configured to span the entire virtual address
> 	  space?

So if we go with the radix idea, we could model the radix to exactly match
the gpu pagetables. That's essentially what the core mm does. Then each
pagetable at each level has a spinlock for essentially a range lock.
notifier seqno would be stored into each pagetable (not the endividual
entries, that's probably too much), which should allow us to very
effeciently check whether an entire arbitrary va range is still valid on
the fault side.

On the notifier side we can also very efficiently walk arbitrary ranges,
because the locking is really fine-grained and in an adaptive way.

> - How does the garbage collector work? We can't allocate memory in the
>   notifier so we don't anything to add to the garbage collector. We
>   can't directly modify page tables given you need lock in the path of
>   reclaim.

Probably no more garbage collector, you deal with pages/folios like the
core mm expects.

> - How do we deal with fault storms (e.g. tons of faults hitting the same
>   SVM range in a row)? Without a SVM range no every to know if mapping
>   is valid and GPU page handler can be short circuited.

So the core mm sorts this out by allowing faults to be handled in
parallel, without any lock. Essentially:
- you get a fault (or prefault)
- you hold just enough read locks to make sure stuff doesn't disappear.
  Currently that's mmap_read_lock, but strictly speaking we only need the
  new-ish per-vma lock.
- you allocate memory, dma_map, everything else you need
- you grab that very fine-grained radix tree lock (pagetable locks on the
  cpu side) and recheck whether you've raced: mmu notifier seqno and the
  pte must still be non-present. If that check fails, you bail out and
  release all the vram/dma_maps you've created.

> - Do we have notifier seqno for every PTE?

I think per-pagetable, so every node in the radix tree, would make sense.
If we go with also one lock per pagetable like the cpu mm then tracking
notifier seqno to match makes the most sense imo.
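
Something like this, purely as a strawman (struct and field names
invented):

	/*
	 * One node per GPU pagetable, mirroring the CPU side. The spinlock
	 * acts as a range lock for exactly the VA span the pagetable covers,
	 * and the seqno lets the fault side validate an arbitrary range by
	 * walking only the nodes that span it.
	 */
	struct driver_pt_node {
		spinlock_t		 lock;
		unsigned long		 notifier_seq;
		u64			 va_start, va_end;
		struct driver_pt_node	*entries[512];	/* next level, NULL if absent */
	};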

Again, this is entirely aside from the discussion in this subthread about
understanding the current design and tradeoffs/reasons. Just figured this
is a good spot to drop this.
-Sima
Matthew Brost Sept. 2, 2024, 5:03 p.m. UTC | #31
On Mon, Sep 02, 2024 at 01:53:14PM +0200, Daniel Vetter wrote:
> On Thu, Aug 29, 2024 at 05:27:13PM +0000, Matthew Brost wrote:
> > On Thu, Aug 29, 2024 at 11:45:08AM +0200, Daniel Vetter wrote:
> > > On Tue, Aug 27, 2024 at 07:48:38PM -0700, Matthew Brost wrote:
> > > > This patch introduces support for GPU Shared Virtual Memory (SVM) in the
> > > > Direct Rendering Manager (DRM) subsystem. SVM allows for seamless
> > > > sharing of memory between the CPU and GPU, enhancing performance and
> > > > flexibility in GPU computing tasks.
> > > > 
> > > > The patch adds the necessary infrastructure for SVM, including data
> > > > structures and functions for managing SVM ranges and notifiers. It also
> > > > provides mechanisms for allocating, deallocating, and migrating memory
> > > > regions between system RAM and GPU VRAM.
> > > > 
> > > > This mid-layer is largely inspired by GPUVM.
> > > > 
> > > > Cc: Dave Airlie <airlied@redhat.com>
> > > > Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > > > Cc: Christian König <christian.koenig@amd.com>
> > > > Cc: <dri-devel@lists.freedesktop.org>
> > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > 
> > > Still not sure I've got the right race that you paper over with
> > > mmap_write_lock, but I spotted a few things, comments inline.
> > > 
> > 
> > I've replied to this issue several times, let's table the
> > mmap_write_lock issue in this reply - a lot of other things to get
> > through. Current thinking is try to add a range->migrate_lock like AMD
> > which I state here [1]. Let's continue discussing the mmap lock issue
> > there if possible.
> 
> Yeah I wrote replies as I read code, so there's a bit of a mess from my side
> here. Apologies for that.
> 

All good, has been quite helpful thus far.

> > [1] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1111169
> 
> Some more replies below that I think we haven't covered anywhere else yet.
> 
> > > > + * 2) Garbage Collector.
> > > > + *
> > > > + *	void __driver_garbage_collector(struct drm_gpusvm *gpusvm,
> > > > + *					struct drm_gpusvm_range *range)
> > > > + *	{
> > > > + *		struct drm_gpusvm_ctx ctx = {};
> > > > + *
> > > > + *		assert_driver_svm_locked(gpusvm);
> > > > + *
> > > > + *		// Partial unmap, migrate any remaining VRAM pages back to SRAM
> > > > + *		if (range->flags.partial_unmap)
> > > > + *			drm_gpusvm_migrate_to_sram(gpusvm, range, &ctx);
> > > 
> > > Note that the migration back to sram isn't guaranteed to succeed, so you
> > > might be still stuck with partially migrated range. This might be a case
> > > where hmm gives you vram pfns, but the range you have doesn't have any
> > > vram allocation anymore because you dropped it here. Not sure tbh.
> > >
> > 
> > HMM isn't in the picture here, nor will a VMA be, once the
> > drm_gpusvm_evict_to_sram path is always taken, as discussed here [2]. I
> > might have a corner case BO refcounting / TTM resource lookup bug in
> > somewhere in here which needs to be resolved though (e.g. eviction
> > racing with this code path), will try to close on that.
> > 
> > [2] https://patchwork.freedesktop.org/patch/610955/?series=137870&rev=1#comment_1111164
> 
> So maybe my understanding is wrong, but from my reading of the device
> migration code the exact same non-guarantees as for the sram2sram
> migration code apply:
> 
> - There's no guarantee the page/folio doesn't have an elevated refcount,
>   which makes the migration fail (in try_to_migrate, where it checks for
>   surplus refcounts).
> 
> - There's no guarantee you'll get the page/folio lock, which makes the
>   migration fail. Worse the core mm seems to use a fallback to per-page
>   locking as it's extremely crude "get out of deadlocks due to acquiring
>   multiple page locks" card.
>

I think this circles back to the basic point that the design must be able to
move VRAM -> SRAM because the host can't access VRAM. Certainly in the CPU
page fault path this can't fail on the faulting page at least, or if it
does the app gets segfaulted. I'll investigate more here but that is
still my current thinking. If VRAM -> SRAM can fail / make partial
progress in eviction paths, then mixed mappings likely need to be
supported, which shouldn't be all that painful - basically we just need a
cursor in the bind code which can walk mixed mappings.

SRAM -> VRAM certainly can fail which is handled by just aborting the
migration.
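
FWIW the cursor doesn't look too bad on paper - something like this
(sketch, driver_bind_run() is made up), just emitting one bind per run of
identically placed pages:

	for (i = 0; i < npages; i = j) {
		bool vram = is_device_private_page(pages[i]);

		/* extend the run while the placement stays the same */
		for (j = i + 1; j < npages; ++j)
			if (is_device_private_page(pages[j]) != vram)
				break;

		driver_bind_run(range, i, j - i, vram);
	}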

> > > > +map_pages:
> > > > +	if (is_device_private_page(hmm_pfn_to_page(pfns[0]))) {
> > > > +		WARN_ON_ONCE(!range->vram_allocation);
> > > > +
> > > > +		for (i = 0; i < npages; ++i) {
> > > > +			pages[i] = hmm_pfn_to_page(pfns[i]);
> > > > +
> > > > +			if (WARN_ON_ONCE(!is_device_private_page(pages[i]))) {
> > > > +				err = -EOPNOTSUPP;
> > > > +				goto err_free;
> > > > +			}
> > > > +		}
> > > 
> > > You can't do the above, because the pfn you get from hmm come with zero
> > > guarantees, you neither hold a page reference nor the page lock. The only
> > > thing you can do is grab the pagetable lock (or mmu notifier locks) and
> > > check it's still valid, before you can touch any state. I think the
> > > range->vram_allocation is probably always valid since you clean that up
> > > under the same lock/thread, but there's good chances the vram allocation
> > > is otherwise already gone for good. Or you get an inconsistent snapshot.
> > > 
> > 
> > I haven't seen this pop in my testing yet which is fairly thorough. My
> > thinking was that with migration always enforced at range granularity we'd
> > never get mixed mappings from the core as migration is completely under
> > control of the driver. Maybe I'm not understanding what you are saying
> > here...
> 
> So one scenario is that you race (without the mmap write lock or the
> migration_mutex design ofc) with another invalidate, and get a partial
> view here of mixed vram and sram pages. Until you acquire the mmu notifier
> lock and have made sure your pages are still valid, there's essentially no
> guarantee.

The pages are collected in a notifier-stable state via the hmm locking +
seqno begin and recheck. Before they can be used (e.g. to program a bind), yes,
the notifier lock needs to be taken to ensure they haven't changed
between collection and use - at least this is my understanding.

> > 
> > > > +
> > > > +		/* Do not race with notifier unmapping pages */
> > > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > > +		range->flags.has_vram_pages = true;
> > > > +		range->pages = pages;
> > > > +		if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
> > > > +			err = -EAGAIN;
> > > > +			__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > > +		}
> > > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > > +	} else {
> > > > +		dma_addr_t *dma_addr = (dma_addr_t *)pfns;
> > > > +
> > > > +		for_each_dma_page(i, j, npages, order) {
> > > > +			if (WARN_ON_ONCE(i && order !=
> > > > +					 hmm_pfn_to_map_order(pfns[i]))) {
> > > > +				err = -EOPNOTSUPP;
> > > > +				npages = i;
> > > > +				goto err_unmap;
> > > > +			}
> > > > +			order = hmm_pfn_to_map_order(pfns[i]);
> > > > +
> > > > +			pages[j] = hmm_pfn_to_page(pfns[i]);
> > > > +			if (WARN_ON_ONCE(is_zone_device_page(pages[j]))) {
> > > > +				err = -EOPNOTSUPP;
> > > > +				npages = i;
> > > > +				goto err_unmap;
> > > > +			}
> > > > +
> > > > +			set_page_dirty_lock(pages[j]);
> > > > +			mark_page_accessed(pages[j]);
> > > 
> > > You can't do these, because you don't hold a page reference. They're also
> > > not needed because hmm_range_fault goes thorugh the full mkwrite dance,
> > > which takes care of these, unlike the gup family of functions.
> > >
> > 
> > This is a leftover from our existing userptr code and it does appear to
> > be incorrect. Let me remove this and fixup our userptr code while I'm at
> > it.
> 
> Ack.
> 
> > > > +	vas = vma_lookup(mm, start);
> > > > +	if (!vas) {
> > > > +		err = -ENOENT;
> > > > +		goto err_mmunlock;
> > > > +	}
> > > > +
> > > > +	if (end > vas->vm_end || start < vas->vm_start) {
> > > > +		err = -EINVAL;
> > > > +		goto err_mmunlock;
> > > > +	}
> > > > +
> > > > +	if (!vma_is_anonymous(vas)) {
> > > > +		err = -EBUSY;
> > > > +		goto err_mmunlock;
> > > > +	}
> > > > +
> > > > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
> > > > +		       sizeof(*pages), GFP_KERNEL);
> > > > +	if (!buf) {
> > > > +		err = -ENOMEM;
> > > > +		goto err_mmunlock;
> > > > +	}
> > > > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > > > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
> > > > +
> > > > +	zdd = drm_gpusvm_zdd_alloc(range);
> > > > +	if (!zdd) {
> > > > +		err = -ENOMEM;
> > > > +		goto err_free;
> > > > +	}
> > > > +
> > > > +	migrate.vma = vas;
> > > > +	migrate.src = buf;
> > > > +	migrate.dst = migrate.src + npages;
> > > > +
> > > > +	err = migrate_vma_setup(&migrate);
> > > > +	if (err)
> > > > +		goto err_free;
> > > > +
> > > > +	/*
> > > > +	 * FIXME: Below cases, !migrate.cpages and migrate.cpages != npages, not
> > > > +	 * always an error. Need to revisit possible cases and how to handle. We
> > > > +	 * could prefault on migrate.cpages != npages via hmm_range_fault.
> > 
> > This is a bit stale, can update this comment.
> > 
> > > > +	 */
> > > 
> > > Yeah I think especially under contention partial migrations, at least back
> > > to sram due to cpu faults, are pretty much expected. And you need to cope
> > > somehow.
> > > 
> > 
> > I have seen these pop if the IGT calls mlock on the memory. My thinking
> > is migration to VRAM is basically optional and fallback to leaving range
> > in SRAM if an error occurs rather than doing a partial migration. This
> > is what currently happens so it is coped with.
> > 
> > If the memory is marked as must be in VRAM (NIY), well then the user
> > program has done something wrong and can kill the app (akin to
> > segfault).
> 
> Yeah SIGBUS for "must be in VRAM" sounds like ok semantics.
> 
> > > > +
> > > > +	if (!migrate.cpages) {
> > > > +		err = -EFAULT;
> > > > +		goto err_free;
> > > > +	}
> > > > +
> > > > +	if (migrate.cpages != npages) {
> > > > +		err = -EBUSY;
> > > > +		goto err_finalize;
> > > > +	}
> 
> What I think is more fundamental is that I think this one here doesn't
> work. For migrate_to_ram you cannot assume that you can always migrate the
> entire block, I think to uphold the core mm forward progress rules we need
> to allow partial migrations there. And I think your current code allows
> that.
>

Yes. I had similar checks in migrate_to_ram at one point and that did
not work when multiple CPU faults from different threads occurred in
parallel. Each thread can grab a random set of VRAM pages to migrate I
think.
 
> But that then means you also are stuck with partial migration state here.
> That was the point I tried to make.
>

The error path with migrate_vma_pages/finalize safely unwinds the
migration in these cases leaving all pages in SRAM.
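
I.e. roughly this shape (paraphrasing the error path, not quoting it):

	err_finalize:
		/*
		 * On error drop all destination entries; with dst[i] == 0
		 * migrate_vma_pages() migrates nothing and
		 * migrate_vma_finalize() remaps the original SRAM pages, so
		 * the CPU side ends up untouched.
		 */
		if (err)
			memset(migrate.dst, 0, sizeof(*migrate.dst) * npages);
		migrate_vma_pages(&migrate);
		migrate_vma_finalize(&migrate);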

> > > > +/**
> > > > + * __drm_gpusvm_migrate_to_sram - Migrate GPU SVM range to SRAM (internal)
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @vas: Pointer to the VM area structure
> > > > + * @page: Pointer to the page for fault handling (can be NULL)
> > > > + * @start: Start address of the migration range
> > > > + * @end: End address of the migration range
> > > > + *
> > > > + * This internal function performs the migration of the specified GPU SVM range
> > > > + * to SRAM. It sets up the migration, populates + dma maps SRAM PFNs, and
> > > > + * invokes the driver-specific operations for migration to SRAM.
> > > > + *
> > > > + * Returns:
> > > > + * 0 on success, negative error code on failure.
> > > > + */
> > > > +static int __drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > > +					struct vm_area_struct *vas,
> > > > +					struct page *page,
> > > > +					u64 start, u64 end)
> > > > +{
> > > > +	struct migrate_vma migrate = {
> > > > +		.vma		= vas,
> > > > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > > > +		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
> > > > +		.fault_page	= page,
> > > > +	};
> > > > +	unsigned long npages;
> > > > +	struct page **pages;
> > > > +	dma_addr_t *dma_addr;
> > > > +	void *buf;
> > > > +	int i, err = 0;
> > > > +
> > > > +	mmap_assert_locked(gpusvm->mm);
> > > 
> > > That's the wrong mm, at least for the ->migrate_to_ram path. You might be
> > > called on an anon mapping from a child process. That also means that the
> > > vma you're looking at might have no relationship with anything you're
> > > tracking in your gpusvm.
> > >
> > 
> > Hmm, as discussed [3] I haven't added tests with child processes yet.
> > Let me do that and update the design as needed. This likely isn't
> > correct as you say.
> > 
> > [3] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1111169 
> 
> Ack. More tests should definitely help here to figure out what's up, and
> what's just me being confused.
> 

Starting to add tests - the fork() scenario appears to work after dropping these
asserts. More thorough testing is needed though.

> > > > +/**
> > > > + * drm_gpusvm_migrate_to_ram - Migrate GPU SVM range to RAM (page fault handler)
> > > > + * @vmf: Pointer to the fault information structure
> > > > + *
> > > > + * This function is a page fault handler used to migrate a GPU SVM range to RAM.
> > > > + * It retrieves the GPU SVM range information from the faulting page and invokes
> > > > + * the internal migration function to migrate the range back to RAM.
> > > > + *
> > > > + * Returns:
> > > > + * VM_FAULT_SIGBUS on failure, 0 on success.
> > > > + */
> > > > +static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault *vmf)
> > > > +{
> > > > +	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
> > > > +	int err;
> > > > +
> > > > +	err = __drm_gpusvm_migrate_to_sram(zdd->range->gpusvm,
> > > 
> > > So I think zdd->range doesn't work, because even within a single mm the
> > > vma mapping a given piece of anon memory does not need to be unique, you
> > > can duplicate them with mremap.
> > > 
> > 
> > This is attached to a page, not a VMA. Both AMD and Nvidia drivers use a
> > similar lookup mechanism.
> 
> Yeah the page->zone_device_data is fine. It's the zone_device_data->range
> which I think isn't ok.
> 

Yes, this gets a little confusing with fork() and mremap. The range's
start / end can be nonsense in the remap case. Also as you mention a
range->migrate_mutex doesn't seem correct either. I can make it work but
maybe not worth even typing out why here (I can provide a little more
detail in another reply). New thinking is that the zdd stores a size field and
has the locking - I think it is then akin to a VRAM folio.
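
Very roughly (a sketch only, nothing final):

	/*
	 * Sketch: zdd as a "VRAM folio" - it describes only the physical
	 * allocation (size + driver backing) plus the migration lock, and
	 * carries no virtual address range, so fork/mremap can't confuse it.
	 */
	struct drm_gpusvm_zdd {
		struct kref	refcount;
		struct mutex	migrate_lock;	/* serialises migration of this allocation */
		unsigned long	size;		/* bytes of VRAM described by this zdd */
		void		*vram_allocation;	/* driver-owned backing, e.g. a TTM BO */
	};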

> > > So all you have here is the physical memory and the vma, which might or
> > > might not be from the same process as gpusvm->mm.
> > > 
> > > Also the child process scenario means you using mmap_write on the fault
> > > side doesn't stop all cpu faults migrating stuff back.
> > > 
> > > Somewhat aside, but I think that means amdkfd's svm_range->migration_mutex
> > > is busted, because it's va based and so misses concurrently ongoing
> > > different mappings moving physical storage around underneath.
> > >
> > 
> > I think all of the above which falls into the fork() + child process
> > issues which you have raise. Until I test this out I can't speak to this
> > any level of confidence so I won't. Thanks for raising this issue and
> > let me write test cases as discussed and educate myself. Once I do that,
> > we can engage in further discussions.
> 
> I think fork + children will still result in zdd->range being unique (albeit
> confused about which mm). You need mremap of some of these mappings to

Agree for fork + child based on initial testing.

> change the addresses and really cause confusion, which I /think/ (but
> didn't test) is doable with a single process even and duplicating anon

Yep, mremap changes the address so the range is confusing, and really a size is
sufficient, aligning within the VMA's start / end upon CPU fault. AMD does
this but with a VMA search, which I think is a bit overkill.
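
I.e. on the CPU fault side something like (sketch, and assuming the zdd
carries a size as above):

	/*
	 * Derive the migration window purely from the faulting address, the
	 * zdd size and the (possibly remapped) vma bounds - no va stored in
	 * the zdd, no vma search.
	 */
	start = ALIGN_DOWN(vmf->address, zdd->size);
	end = start + zdd->size;
	start = max(start, vmf->vma->vm_start);
	end = min(end, vmf->vma->vm_end);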

Matt

> memory mappings with mremap.
> 
> Cheers, Sima
> -- 
> Simona Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
Thomas Hellström Sept. 4, 2024, 12:27 p.m. UTC | #32
Hi, Sima,

On Mon, 2024-09-02 at 14:33 +0200, Daniel Vetter wrote:
> Jumping in here in the middle, since I think it's a solid place to
> drop my
> idea of "align with core mm" gpusvm locking ...
> 
> On Thu, Aug 29, 2024 at 08:56:23PM +0000, Matthew Brost wrote:
> > On Thu, Aug 29, 2024 at 09:18:29PM +0200, Thomas Hellström wrote:
> > Issues with removing a SVM range:
> > 
> > - Xe bind code stores invalidation / present state in VMA, this
> > would
> >   need to be moved to the radix tree. I have Jira open for that
> > work
> >   which I believe other developers are going to own.
> > - Where would the dma mapping / device pages be stored?
> > 	- In the radix tree? What if ATS is enabled? We don't have
> > a
> > 	  driver owned radix tree. How do we reasonably connect a
> > driver
> > 	  owned radix to a common GPUSVM layer?
> 
> Yeah this one is really annoying, because the core mm gets away with
> nothing because it can just store the pfn in the pte. And it doesn't
> need
> anything else. So we probably still need something unfortuantely ...
> 
> > 	- In the notifier? What is the notifier is sparsely
> > populated?
> > 	  We would be wasting huge amounts of memory. What is the
> > 	  notifier is configured to span the entire virtual
> > address
> > 	  space?
> 
> So if we go with the radix idea, we could model the radix to exactly
> match
> the gpu pagetables. That's essentially what the core mm does. Then
> each
> pagetable at each level has a spinlock for essentially a range lock.
> notifier seqno would be stored into each pagetable (not the
> endividual
> entries, that's probably too much), which should allow us to very
> effeciently check whether an entire arbitrary va range is still valid
> on
> the fault side.

I still wonder whether this should be owned by the driver, though. And
if we were optimizing for multiple simultaneous fault processing with a
small granularity, I would agree, but given that gpu pagefaults are
considered so slow they should be avoided, I wonder whether xe's
current approach of a single page-table lock wouldn't suffice, in
addition to a semi-global seqno?

For invalidations, I think we actually currently allow simultaneous
overlapping invalidations that are only protected by the write-side of
the notifier seqno.

> 
> On the notifier side we can also very efficiently walk arbitrary
> ranges,
> because the locking is really fine-grained and in an adaptive way.
> 
> > - How does the garbage collector work? We can't allocate memory in
> > the
> >   notifier so we don't anything to add to the garbage collector. We
> >   can't directly modify page tables given you need lock in the path
> > of
> >   reclaim.
> 
> Probably no more garbage collector, you deal with pages/folios like
> the
> core mm expects.

Yeah, if the page-table locks are reclaim-safe, no more garbage
collector, but OTOH, IIRC even in core-mm, the invalidation
counterpart, unmap_mapping_range() can't and doesn't remove page-table
subtrees when called from the address-space side, whereas zapping when
called from the mm side, like madvise(MADV_DONTNEED), can.

/Thomas



> 
> > - How do we deal with fault storms (e.g. tons of faults hitting the
> > same
> >   SVM range in a row)? Without a SVM range no every to know if
> > mapping
> >   is valid and GPU page handler can be short circuited.
> 
> So the core mm sorts this out by allowing faults to be handled in
> parallel, without any lock. Essentially:
> - you get a fault (or prefault)
> - you hold just enough read locks to make sure stuff doesn't
> disappear.
>   Currently that's mmap_read_lock, but strictly speaking we only need
> the
>   new-ish per-vma lock.
> - you allocate memory, dma_map, everything else you need
> - you grab that very fine-grained radix tree lock (pagetable locks on
> the
>   cpu side) and recheck whether you've raced: mmu notifier seqno and
> the
>   pte must still be non-present. If that check fails, you bail out
> and
>   release all the vram/dma_maps you've created.
> 
> > - Do we have notifier seqno for every PTE?
> 
> I think per-pagetable, so every node in the radix tree, would make
> sense.
> If we go with also one lock per pagetable like the cpu mm then
> tracking
> notifier seqno to match makes the most sense imo.
> 
> Again, this is entirely aside from the discussion in this subthread
> about
> understanding the current design and tradeoffs/reasons. Just figured
> this
> is a good spot to drop this.
> -Sima
Zeng, Oak Sept. 6, 2024, 6:41 p.m. UTC | #33
There are fundamental design conflicts with what we have aligned on, see inline.

> -----Original Message-----
> From: Intel-xe <intel-xe-bounces@lists.freedesktop.org> On Behalf
> Of Matthew Brost
> Sent: Tuesday, August 27, 2024 10:49 PM
> To: intel-xe@lists.freedesktop.org; dri-devel@lists.freedesktop.org
> Cc: airlied@gmail.com; christian.koenig@amd.com;
> thomas.hellstrom@linux.intel.com; Auld, Matthew
> <matthew.auld@intel.com>; daniel@ffwll.ch
> Subject: [RFC PATCH 05/28] drm/gpusvm: Add support for GPU
> Shared Virtual Memory
> 
> This patch introduces support for GPU Shared Virtual Memory (SVM)
> in the
> Direct Rendering Manager (DRM) subsystem. SVM allows for
> seamless
> sharing of memory between the CPU and GPU, enhancing
> performance and
> flexibility in GPU computing tasks.
> 
> The patch adds the necessary infrastructure for SVM, including data
> structures and functions for managing SVM ranges and notifiers. It
> also
> provides mechanisms for allocating, deallocating, and migrating
> memory
> regions between system RAM and GPU VRAM.
> 
> This mid-layer is largely inspired by GPUVM.
> 
> Cc: Dave Airlie <airlied@redhat.com>
> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> Cc: Christian König <christian.koenig@amd.com>
> Cc: <dri-devel@lists.freedesktop.org>
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>  drivers/gpu/drm/xe/Makefile     |    3 +-
>  drivers/gpu/drm/xe/drm_gpusvm.c | 2174
> +++++++++++++++++++++++++++++++
>  drivers/gpu/drm/xe/drm_gpusvm.h |  415 ++++++
>  3 files changed, 2591 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.c
>  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.h
> 
> diff --git a/drivers/gpu/drm/xe/Makefile
> b/drivers/gpu/drm/xe/Makefile
> index b9670ae09a9e..b8fc2ee58f1a 100644
> --- a/drivers/gpu/drm/xe/Makefile
> +++ b/drivers/gpu/drm/xe/Makefile
> @@ -25,7 +25,8 @@ $(obj)/generated/%_wa_oob.c
> $(obj)/generated/%_wa_oob.h: $(obj)/xe_gen_wa_oob \
> 
>  # core driver code
> 
> -xe-y += xe_bb.o \
> +xe-y += drm_gpusvm.o \
> +	xe_bb.o \
>  	xe_bo.o \
>  	xe_bo_evict.o \
>  	xe_devcoredump.o \
> diff --git a/drivers/gpu/drm/xe/drm_gpusvm.c
> b/drivers/gpu/drm/xe/drm_gpusvm.c
> new file mode 100644
> index 000000000000..fc1e44e6ae72
> --- /dev/null
> +++ b/drivers/gpu/drm/xe/drm_gpusvm.c
> @@ -0,0 +1,2174 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2024 Intel Corporation
> + *
> + * Authors:
> + *     Matthew Brost <matthew.brost@intel.com>
> + */
> +
> +#include <linux/dma-mapping.h>
> +#include <linux/interval_tree_generic.h>
> +#include <linux/hmm.h>
> +#include <linux/memremap.h>
> +#include <linux/migrate.h>
> +#include <linux/mm_types.h>
> +#include <linux/pagemap.h>
> +#include <linux/slab.h>
> +
> +#include <drm/drm_device.h>
> +#include "drm_gpusvm.h"
> +
> +/**
> + * DOC: Overview
> + *
> + * GPU Shared Virtual Memory (GPU SVM) layer for the Direct
> Rendering Manager (DRM)
> + *
> + * The GPU SVM layer is a component of the DRM framework
> designed to manage shared
> + * virtual memory between the CPU and GPU. It enables efficient
> data exchange and
> + * processing for GPU-accelerated applications by allowing memory
> sharing and
> + * synchronization between the CPU's and GPU's virtual address
> spaces.
> + *
> + * Key GPU SVM Components:
> + * - Notifiers: Used for tracking memory intervals and
> notifying the
> + *		GPU of changes, notifiers are sized based on a GPU
> SVM
> + *		initialization parameter, with a recommendation of
> 512M or
> + *		larger. They maintain a Red-BlacK tree and a list of
> ranges that
> + *		fall within the notifier interval. Notifiers are tracked
> within
> + *		a GPU SVM Red-BlacK tree and list and are
> dynamically inserted
> + *		or removed as ranges within the interval are created
> or
> + *		destroyed.
> + * - Ranges: Represent memory ranges mapped in a DRM device and
> managed
> + *	     by GPU SVM. 


This svm_range concept has introduced a lot of code duplication in xekmd,
indicating that this is a wrong design. I think one of the design principles
is to reuse, not to duplicate.

Look at patches 9 and 11: a bunch of duplicated code for page table update,
invalidation, and the page fault handler.

I had this range concept in v1 [1], but after we agreed to unify the svm and
userptr code during review, I dropped this concept, and the xe_svm concept,
which ends up with much less duplicated code in v2 [2]. I will say more below
on why I think the svm concept can also be removed.

Conceptually a vma represents a range. Why duplicate?

[1] https://patchwork.freedesktop.org/patch/574898/?series=128910&rev=1
[2] https://patchwork.freedesktop.org/series/132229/


They are sized based on an array of chunk
> sizes, which
> + *	     is a GPU SVM initialization parameter, and the CPU address
> space.
> + *	     Upon GPU fault, the largest aligned chunk that fits within
> the
> + *	     faulting CPU address space is chosen for the range size.
> Ranges are
> + *	     expected to be dynamically allocated on GPU fault and
> removed on an
> + *	     MMU notifier UNMAP event. As mentioned above, ranges
> are tracked in
> + *	     a notifier's Red-Black tree.
> + * - Operations: Define the interface for driver-specific SVM
> operations such as
> + *		 allocation, page collection, migration, invalidations,
> and VRAM
> + *		 release.
> + *
> + * This layer provides interfaces for allocating, mapping, migrating,
> and
> + * releasing memory ranges between the CPU and GPU. It handles
> all core memory
> + * management interactions (DMA mapping, HMM, and migration)
> and provides
> + * driver-specific virtual functions (vfuncs). This infrastructure is
> sufficient
> + * to build the expected driver components for an SVM
> implementation as detailed
> + * below.
> + *
> + * Expected Driver Components:
> + * - GPU page fault handler: Used to create ranges and notifiers
> based on the
> + *			     fault address, optionally migrate the range
> to
> + *			     VRAM, and create GPU bindings.
> + * - Garbage collector: Used to destroy GPU bindings for ranges.
> Ranges are
> + *			expected to be added to the garbage collector
> upon
> + *			MMU_NOTIFY_UNMAP event.
> + */
> +
> +/**
> + * DOC: Locking
> + *
> + * GPU SVM handles locking for core MM interactions, i.e., it
> locks/unlocks the
> + * mmap lock as needed. Alternatively, if the driver prefers to
> handle the mmap
> + * lock itself, a 'locked' argument is provided to the functions that
> require
> + * the mmap lock. This option may be useful for drivers that need to
> call into
> + * GPU SVM while also holding a dma-resv lock, thus preventing
> locking
> + * inversions between the mmap and dma-resv locks.
> + *
> + * GPU SVM introduces a global notifier lock, which safeguards the
> notifier's
> + * range RB tree and list, as well as the range's DMA mappings and
> sequence
> + * number. GPU SVM manages all necessary locking and unlocking
> operations,
> + * except for the recheck of the range's sequence number
> + * (mmu_interval_read_retry) when the driver is committing GPU
> bindings. This
> + * lock corresponds to the 'driver->update' lock mentioned in the
> HMM
> + * documentation (TODO: Link). Future revisions may transition from
> a GPU SVM
> + * global lock to a per-notifier lock if finer-grained locking is deemed
> + * necessary.
> + *
> + * In addition to the locking mentioned above, the driver should
> implement a
> + * lock to safeguard core GPU SVM function calls that modify state,
> such as
> + * drm_gpusvm_range_find_or_insert and
> drm_gpusvm_range_remove. Alternatively,
> + * these core functions can be called within a single kernel thread,
> for
> + * instance, using an ordered work queue. This lock is denoted as
> + * 'driver_svm_lock' in code examples.
> + */
> +
> +/**
> + * DOC: Migration
> + *
> + * The migration support is quite simple, allowing migration between
> SRAM and
> + * VRAM at the range granularity. For example, GPU SVM currently
> does not
> + * support mixing SRAM and VRAM pages within a range. This means
> that upon GPU
> + * fault, the entire range can be migrated to VRAM, and upon CPU
> fault, the
> + * entire range is migrated to SRAM.
> + *
> + * The reasoning for only supporting range granularity is as follows: it
> + * simplifies the implementation, and range sizes are driver-defined
> and should
> + * be relatively small.

Migration at range granularity just couples the physical world with the virtual
world, which is against the fundamental page-centric design we aligned on before.

Looking at core mm behavior, shrinking/swapping doesn't operate at vma or any
virtual range granularity. This way we swap out the less frequently used pages
and keep the more frequently used pages in ram.

A similar thing should be done for vram to sram migration.

> + */
> +
> +/**
> + * DOC: Partial Unmapping of Ranges
> + *
> + * Partial unmapping of ranges (e.g., 1M out of 2M is unmapped by
> CPU resulting
> + * in MMU_NOTIFY_UNMAP event) presents several challenges,

As said above, the challenge comes from a design choice. In a
page-centric design, the challenges don't exist at all.



> + * with the main one being that a subset of the range still has CPU and GPU
> + * mappings. If the backing store for the range is in VRAM, a subset of the
> + * backing store has references. One option would be to split the range and
> + * VRAM backing store, but the implementation for this would be quite
> + * complicated. Given that partial unmappings are rare and driver-defined range
> + * sizes are relatively small, GPU SVM does not support splitting of ranges.
> + *
> + * With no support for range splitting, upon partial unmapping of a range, the
> + * driver is expected to invalidate and destroy the entire range. If the range
> + * has VRAM as its backing, the driver is also expected to migrate any remaining
> + * pages back to SRAM.
> + */
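
As a concrete illustration of the expected driver behaviour (sizes are just an
example): if a 2M range backed by VRAM has its first 1M munmap()'d, the notifier
fires MMU_NOTIFY_UNMAP for that 1M, the driver marks the whole 2M range as
unmapped and queues it for its garbage collector, which then migrates the
remaining 1M of VRAM backing to SRAM, unbinds the GPU mappings, and destroys the
range (see the garbage collector example below).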
> +
> +/**
> + * DOC: Examples
> + *
> + * This section provides two examples of how to build the expected driver
> + * components: the GPU page fault handler and the garbage collector. A third
> + * example demonstrates a sample invalidation driver vfunc.
> + *
> + * The generic code provided does not include logic for complex migration
> + * policies, optimized invalidations, or other potentially required driver
> + * locking (e.g., DMA-resv locks).
> + *
> + * 1) GPU page fault handler
> + *
> + *	int driver_bind_range(struct drm_gpusvm *gpusvm, struct drm_gpusvm_range *range)
> + *	{
> + *		int err = 0;
> + *
> + *		driver_alloc_and_setup_memory_for_bind(gpusvm, range);
> + *
> + *		drm_gpusvm_notifier_lock(gpusvm);
> + *		if (drm_gpusvm_range_pages_valid(range))
> + *			driver_commit_bind(gpusvm, range);
> + *		else
> + *			err = -EAGAIN;
> + *		drm_gpusvm_notifier_unlock(gpusvm);
> + *
> + *		return err;
> + *	}
> + *
> + *	int driver_gpu_fault(struct drm_gpusvm *gpusvm, u64 fault_addr,
> + *			     u64 gpuva_start, u64 gpuva_end)
> + *	{
> + *		struct drm_gpusvm_ctx ctx = {};
> + *		int err;
> + *
> + *		driver_svm_lock();
> + *	retry:
> + *		// Always process UNMAPs first so view of GPU SVM ranges is current
> + *		driver_garbage_collector(gpusvm);
> + *
> + *		range = drm_gpusvm_range_find_or_insert(gpusvm, fault_addr,
> + *							gpuva_start, gpuva_end,
> + *							&ctx);
> + *		if (IS_ERR(range)) {
> + *			err = PTR_ERR(range);
> + *			goto unlock;
> + *		}
> + *
> + *		if (driver_migration_policy(range)) {
> + *			bo = driver_alloc_bo();
> + *			err = drm_gpusvm_migrate_to_vram(gpusvm, range, bo, &ctx);
> + *			if (err)	// CPU mappings may have changed
> + *				goto retry;
> + *		}
> + *
> + *		err = drm_gpusvm_range_get_pages(gpusvm, range, &ctx);
> + *		if (err == -EFAULT || err == -EPERM)	// CPU mappings changed
> + *			goto retry;
> + *		else if (err)
> + *			goto unlock;
> + *
> + *		err = driver_bind_range(gpusvm, range);
> + *		if (err == -EAGAIN)	// CPU mappings changed
> + *			goto retry;
> + *
> + *	unlock:
> + *		driver_svm_unlock();
> + *		return err;
> + *	}
> + *
> + * 2) Garbage Collector.
> + *
> + *	void __driver_garbage_collector(struct drm_gpusvm *gpusvm,
> + *					struct drm_gpusvm_range *range)
> + *	{
> + *		struct drm_gpusvm_ctx ctx = {};
> + *
> + *		assert_driver_svm_locked(gpusvm);
> + *
> + *		// Partial unmap, migrate any remaining VRAM pages back to SRAM
> + *		if (range->flags.partial_unmap)
> + *			drm_gpusvm_migrate_to_sram(gpusvm, range, &ctx);
> + *
> + *		driver_unbind_range(range);
> + *		drm_gpusvm_range_remove(gpusvm, range);
> + *	}
> + *
> + *	void driver_garbage_collector(struct drm_gpusvm *gpusvm)
> + *	{
> + *		assert_driver_svm_locked(gpusvm);
> + *
> + *		for_each_range_in_garbage_collector(gpusvm, range)
> + *			__driver_garbage_collector(gpusvm, range);
> + *	}
> + *
> + * 3) Invalidation driver vfunc.
> + *
> + *	void driver_invalidation(struct drm_gpusvm *gpusvm,
> + *				 struct drm_gpusvm_notifier *notifier,
> + *				 const struct mmu_notifier_range *mmu_range)
> + *	{
> + *		struct drm_gpusvm_ctx ctx = { .in_notifier = true, };
> + *		struct drm_gpusvm_range *range = NULL;
> + *
> + *		driver_invalidate_device_tlb(gpusvm, mmu_range->start, mmu_range->end);
> + *
> + *		drm_gpusvm_for_each_range(range, notifier, mmu_range->start,
> + *					  mmu_range->end) {
> + *			drm_gpusvm_range_unmap_pages(gpusvm, range, &ctx);
> + *
> + *			if (mmu_range->event != MMU_NOTIFY_UNMAP)
> + *				continue;
> + *
> + *			drm_gpusvm_range_set_unmapped(range, mmu_range);
> + *			driver_garbage_collector_add(gpusvm, range);
> + *		}
> + *	}
> + */
> +
> +#define DRM_GPUSVM_RANGE_START(_range)	((_range)-
> >va.start)
> +#define DRM_GPUSVM_RANGE_END(_range)	((_range)-
> >va.end - 1)
> +INTERVAL_TREE_DEFINE(struct drm_gpusvm_range, rb.node, u64,
> rb.__subtree_last,
> +		     DRM_GPUSVM_RANGE_START,
> DRM_GPUSVM_RANGE_END,
> +		     static __maybe_unused, range);
> +
> +#define DRM_GPUSVM_NOTIFIER_START(_notifier)	((_notifier)-
> >interval.start)
> +#define DRM_GPUSVM_NOTIFIER_END(_notifier)	((_notifier)-
> >interval.end - 1)
> +INTERVAL_TREE_DEFINE(struct drm_gpusvm_notifier, rb.node, u64,
> +		     rb.__subtree_last,
> DRM_GPUSVM_NOTIFIER_START,
> +		     DRM_GPUSVM_NOTIFIER_END, static
> __maybe_unused, notifier);
> +
> +/**
> + * npages_in_range() - Calculate the number of pages in a given
> range
> + * @start__: The start address of the range
> + * @end__: The end address of the range
> + *
> + * This macro calculates the number of pages in a given memory
> range,
> + * specified by the start and end addresses. It divides the difference
> + * between the end and start addresses by the page size
> (PAGE_SIZE) to
> + * determine the number of pages in the range.
> + *
> + * Return: The number of pages in the specified range.
> + */
> +#define npages_in_range(start__, end__)	\
> +	(((end__) - (start__)) >> PAGE_SHIFT)
> +
> +/**
> + * struct drm_gpusvm_zdd - GPU SVM zone device data
> + *
> + * @refcount: Reference count for the zdd
> + * @destroy_work: Work structure for asynchronous zdd
> destruction
> + * @range: Pointer to the GPU SVM range
> + * @vram_allocation: Driver-private pointer to the VRAM allocation
> + *
> + * This structure serves as a generic wrapper installed in
> + * page->zone_device_data. It provides infrastructure for looking up
> a range
> + * upon CPU page fault and asynchronously releasing VRAM once
> the CPU has no
> + * page references. Asynchronous release is useful because CPU
> page references
> + * can be dropped in IRQ contexts, while releasing VRAM likely
> requires sleeping
> + * locks.
> + */
> +struct drm_gpusvm_zdd {
> +	struct kref refcount;
> +	struct work_struct destroy_work;
> +	struct drm_gpusvm_range *range;
> +	void *vram_allocation;
> +};
> +
> +/**
> + * drm_gpusvm_zdd_destroy_work_func - Work function for
> destroying a zdd
> + * @w: Pointer to the work_struct
> + *
> + * This function releases VRAM, puts GPU SVM range, and frees zdd.
> + */
> +static void drm_gpusvm_zdd_destroy_work_func(struct
> work_struct *w)
> +{
> +	struct drm_gpusvm_zdd *zdd =
> +		container_of(w, struct drm_gpusvm_zdd,
> destroy_work);
> +	struct drm_gpusvm_range *range = zdd->range;
> +	struct drm_gpusvm *gpusvm = range->gpusvm;
> +
> +	if (gpusvm->ops->vram_release && zdd->vram_allocation)
> +		gpusvm->ops->vram_release(zdd->vram_allocation);
> +	drm_gpusvm_range_put(range);
> +	kfree(zdd);
> +}
> +
> +/**
> + * drm_gpusvm_zdd_alloc - Allocate a zdd structure.
> + * @range: Pointer to the GPU SVM range.
> + *
> + * This function allocates and initializes a new zdd structure. It sets
> up the
> + * reference count, initializes the destroy work, and links the
> provided GPU SVM
> + * range.
> + *
> + * Returns:
> + * Pointer to the allocated zdd on success, ERR_PTR() on failure.
> + */
> +static struct drm_gpusvm_zdd *
> +drm_gpusvm_zdd_alloc(struct drm_gpusvm_range *range)
> +{
> +	struct drm_gpusvm_zdd *zdd;
> +
> +	zdd = kmalloc(sizeof(*zdd), GFP_KERNEL);
> +	if (!zdd)
> +		return NULL;
> +
> +	kref_init(&zdd->refcount);
> +	INIT_WORK(&zdd->destroy_work,
> drm_gpusvm_zdd_destroy_work_func);
> +	zdd->range = drm_gpusvm_range_get(range);
> +	zdd->vram_allocation = NULL;
> +
> +	return zdd;
> +}
> +
> +/**
> + * drm_gpusvm_zdd_get - Get a reference to a zdd structure.
> + * @zdd: Pointer to the zdd structure.
> + *
> + * This function increments the reference count of the provided zdd
> structure.
> + *
> + * Returns: Pointer to the zdd structure.
> + */
> +static struct drm_gpusvm_zdd *drm_gpusvm_zdd_get(struct
> drm_gpusvm_zdd *zdd)
> +{
> +	kref_get(&zdd->refcount);
> +	return zdd;
> +}
> +
> +/**
> + * drm_gpusvm_zdd_destroy - Destroy a zdd structure.
> + * @ref: Pointer to the reference count structure.
> + *
> + * This function queues the destroy_work of the zdd for
> asynchronous destruction.
> + */
> +static void drm_gpusvm_zdd_destroy(struct kref *ref)
> +{
> +	struct drm_gpusvm_zdd *zdd =
> +		container_of(ref, struct drm_gpusvm_zdd, refcount);
> +	struct drm_gpusvm *gpusvm = zdd->range->gpusvm;
> +
> +	queue_work(gpusvm->zdd_wq, &zdd->destroy_work);
> +}
> +
> +/**
> + * drm_gpusvm_zdd_put - Put a zdd reference.
> + * @zdd: Pointer to the zdd structure.
> + *
> + * This function decrements the reference count of the provided zdd
> structure
> + * and schedules its destruction if the count drops to zero.
> + */
> +static void drm_gpusvm_zdd_put(struct drm_gpusvm_zdd *zdd)
> +{
> +	kref_put(&zdd->refcount, drm_gpusvm_zdd_destroy);
> +}
> +
> +/**
> + * drm_gpusvm_range_find - Find GPU SVM range from GPU SVM
> notifier
> + * @notifier: Pointer to the GPU SVM notifier structure.
> + * @start: Start address of the range
> + * @end: End address of the range
> + *
> + * Return: A pointer to the drm_gpusvm_range if found or NULL
> + */
> +struct drm_gpusvm_range *
> +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64
> start, u64 end)
> +{
> +	return range_iter_first(&notifier->root, start, end - 1);
> +}
> +
> +/**
> + * drm_gpusvm_for_each_range_safe - Safely iterate over GPU SVM ranges in a notifier
> + * @range__: Iterator variable for the ranges
> + * @next__: Iterator variable for the ranges temporary storage
> + * @notifier__: Pointer to the GPU SVM notifier
> + * @start__: Start address of the range
> + * @end__: End address of the range
> + *
> + * This macro is used to iterate over GPU SVM ranges in a notifier
> while
> + * removing ranges from it.
> + */
> +#define drm_gpusvm_for_each_range_safe(range__, next__,
> notifier__, start__, end__)	\
> +	for ((range__) = drm_gpusvm_range_find((notifier__),
> (start__), (end__)),	\
> +	     (next__) = __drm_gpusvm_range_next(range__);
> 			\
> +	     (range__) && (range__->va.start < (end__));
> 			\
> +	     (range__) = (next__), (next__) =
> __drm_gpusvm_range_next(range__))
> +
> +/**
> + * __drm_gpusvm_notifier_next - get the next
> drm_gpusvm_notifier in the list
> + * @notifier: a pointer to the current drm_gpusvm_notifier
> + *
> + * Return: A pointer to the next drm_gpusvm_notifier if available, or
> NULL if
> + *         the current notifier is the last one or if the input notifier is
> + *         NULL.
> + */
> +static struct drm_gpusvm_notifier *
> +__drm_gpusvm_notifier_next(struct drm_gpusvm_notifier
> *notifier)
> +{
> +	if (notifier && !list_is_last(&notifier->rb.entry,
> +				      &notifier->gpusvm->notifier_list))
> +		return list_next_entry(notifier, rb.entry);
> +
> +	return NULL;
> +}
> +
> +/**
> + * drm_gpusvm_for_each_notifier - Iterate over GPU SVM notifiers in a gpusvm
> + * @notifier__: Iterator variable for the notifiers
> + * @gpusvm__: Pointer to the GPU SVM structure
> + * @start__: Start address of the notifier
> + * @end__: End address of the notifier
> + *
> + * This macro is used to iterate over GPU SVM notifiers in a gpusvm.
> + */
> +#define drm_gpusvm_for_each_notifier(notifier__, gpusvm__,
> start__, end__)		\
> +	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root,
> (start__), (end__) - 1);	\
> +	     (notifier__) && (notifier__->interval.start < (end__));
> 			\
> +	     (notifier__) = __drm_gpusvm_notifier_next(notifier__))
> +
> +/**
> + * drm_gpusvm_for_each_notifier_safe - Safely iterate over GPU SVM notifiers in a gpusvm
> + * @notifier__: Iterator variable for the notifiers
> + * @next__: Iterator variable for the notifiers temporary storage
> + * @gpusvm__: Pointer to the GPU SVM structure
> + * @start__: Start address of the notifier
> + * @end__: End address of the notifier
> + *
> + * This macro is used to iterate over GPU SVM notifiers in a gpusvm
> while
> + * removing notifiers from it.
> + */
> +#define drm_gpusvm_for_each_notifier_safe(notifier__, next__,
> gpusvm__, start__, end__)	\
> +	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root,
> (start__), (end__) - 1),	\
> +	     (next__) = __drm_gpusvm_notifier_next(notifier__);
> 				\
> +	     (notifier__) && (notifier__->interval.start < (end__));
> 			\
> +	     (notifier__) = (next__), (next__) =
> __drm_gpusvm_notifier_next(notifier__))
> +
> +/**
> + * drm_gpusvm_notifier_invalidate - Invalidate a GPU SVM notifier.
> + * @mni: Pointer to the mmu_interval_notifier structure.
> + * @mmu_range: Pointer to the mmu_notifier_range structure.
> + * @cur_seq: Current sequence number.
> + *
> + * This function serves as a generic MMU notifier for GPU SVM. It
> sets the MMU
> + * notifier sequence number and calls the driver invalidate vfunc
> under
> + * gpusvm->notifier_lock.
> + *
> + * Returns:
> + * true if the operation succeeds, false otherwise.
> + */
> +static bool
> +drm_gpusvm_notifier_invalidate(struct mmu_interval_notifier *mni,
> +			       const struct mmu_notifier_range
> *mmu_range,
> +			       unsigned long cur_seq)
> +{
> +	struct drm_gpusvm_notifier *notifier =
> +		container_of(mni, typeof(*notifier), notifier);
> +	struct drm_gpusvm *gpusvm = notifier->gpusvm;
> +
> +	if (!mmu_notifier_range_blockable(mmu_range))
> +		return false;
> +
> +	down_write(&gpusvm->notifier_lock);
> +	mmu_interval_set_seq(mni, cur_seq);
> +	gpusvm->ops->invalidate(gpusvm, notifier, mmu_range);
> +	up_write(&gpusvm->notifier_lock);
> +
> +	return true;
> +}
> +
> +/**
> + * drm_gpusvm_notifier_ops - MMU interval notifier operations for
> GPU SVM
> + */
> +static const struct mmu_interval_notifier_ops
> drm_gpusvm_notifier_ops = {
> +	.invalidate = drm_gpusvm_notifier_invalidate,
> +};
> +
> +/**
> + * drm_gpusvm_init - Initialize the GPU SVM.
> + * @gpusvm: Pointer to the GPU SVM structure.
> + * @name: Name of the GPU SVM.
> + * @drm: Pointer to the DRM device structure.
> + * @mm: Pointer to the mm_struct for the address space.
> + * @device_private_page_owner: Device private pages owner.
> + * @mm_start: Start address of GPU SVM.
> + * @mm_range: Range of the GPU SVM.
> + * @notifier_size: Size of individual notifiers.
> + * @ops: Pointer to the operations structure for GPU SVM.
> + * @chunk_sizes: Pointer to the array of chunk sizes used in range
> allocation.
> + *               Entries should be powers of 2 in descending order with last
> + *               entry being SZ_4K.
> + * @num_chunks: Number of chunks.
> + *
> + * This function initializes the GPU SVM.
> + *
> + * Returns:
> + * 0 on success, a negative error code on failure.
> + */
> +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> +		    const char *name, struct drm_device *drm,
> +		    struct mm_struct *mm, void
> *device_private_page_owner,
> +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> +		    const struct drm_gpusvm_ops *ops,
> +		    const u64 *chunk_sizes, int num_chunks)
> +{
> +	if (!ops->invalidate || !num_chunks)
> +		return -EINVAL;
> +
> +	gpusvm->name = name;
> +	gpusvm->drm = drm;
> +	gpusvm->mm = mm;
> +	gpusvm->device_private_page_owner =
> device_private_page_owner;
> +	gpusvm->mm_start = mm_start;
> +	gpusvm->mm_range = mm_range;
> +	gpusvm->notifier_size = notifier_size;
> +	gpusvm->ops = ops;
> +	gpusvm->chunk_sizes = chunk_sizes;
> +	gpusvm->num_chunks = num_chunks;
> +	gpusvm->zdd_wq = system_wq;
> +
> +	mmgrab(mm);
> +	gpusvm->root = RB_ROOT_CACHED;
> +	INIT_LIST_HEAD(&gpusvm->notifier_list);
> +
> +	init_rwsem(&gpusvm->notifier_lock);
> +
> +	fs_reclaim_acquire(GFP_KERNEL);
> +	might_lock(&gpusvm->notifier_lock);
> +	fs_reclaim_release(GFP_KERNEL);
> +
> +	return 0;
> +}
> +
> +/**
> + * drm_gpusvm_notifier_find - Find GPU SVM notifier
> + * @gpusvm__: Pointer to the GPU SVM structure
> + * @fault_addr__: Fault address
> + *
> + * This macro finds the GPU SVM notifier associated with the fault
> address.
> + *
> + * Returns:
> + * Pointer to the GPU SVM notifier on success, NULL otherwise.
> + */
> +#define drm_gpusvm_notifier_find(gpusvm__, fault_addr__)
> 	\
> +	notifier_iter_first(&(gpusvm__)->root, (fault_addr__),
> 	\
> +			    (fault_addr__ + 1))
> +
> +/**
> + * to_drm_gpusvm_notifier - retrieve the container struct for a
> given rbtree node
> + * @node__: a pointer to the rbtree node embedded within a
> drm_gpusvm_notifier struct
> + *
> + * Return: A pointer to the containing drm_gpusvm_notifier
> structure.
> + */
> +#define to_drm_gpusvm_notifier(__node)
> 	\
> +	container_of((__node), struct drm_gpusvm_notifier, rb.node)
> +
> +/**
> + * drm_gpusvm_notifier_insert - Insert GPU SVM notifier
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: Pointer to the GPU SVM notifier structure
> + *
> + * This function inserts the GPU SVM notifier into the GPU SVM RB
> tree and list.
> + */
> +static void drm_gpusvm_notifier_insert(struct drm_gpusvm
> *gpusvm,
> +				       struct drm_gpusvm_notifier
> *notifier)
> +{
> +	struct rb_node *node;
> +	struct list_head *head;
> +
> +	notifier_insert(notifier, &gpusvm->root);
> +
> +	node = rb_prev(&notifier->rb.node);
> +	if (node)
> +		head = &(to_drm_gpusvm_notifier(node))->rb.entry;
> +	else
> +		head = &gpusvm->notifier_list;
> +
> +	list_add(&notifier->rb.entry, head);
> +}
> +
> +/**
> + * drm_gpusvm_notifier_remove - Remove GPU SVM notifier
> + * @gpusvm__: Pointer to the GPU SVM structure
> + * @notifier__: Pointer to the GPU SVM notifier structure
> + *
> + * This macro removes the GPU SVM notifier from the GPU SVM RB
> tree and list.
> + */
> +#define drm_gpusvm_notifier_remove(gpusvm__, notifier__)
> 	\
> +	notifier_remove((notifier__), &(gpusvm__)->root);	\
> +	list_del(&(notifier__)->rb.entry)
> +
> +/**
> + * drm_gpusvm_fini - Finalize the GPU SVM.
> + * @gpusvm: Pointer to the GPU SVM structure.
> + *
> + * This function finalizes the GPU SVM by cleaning up any remaining
> ranges and
> + * notifiers, and dropping a reference to struct MM.
> + */
> +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm)
> +{
> +	struct drm_gpusvm_notifier *notifier, *next;
> +
> +	drm_gpusvm_for_each_notifier_safe(notifier, next, gpusvm,
> 0, LONG_MAX) {
> +		struct drm_gpusvm_range *range, *__next;
> +
> +		/*
> +		 * Remove notifier first to avoid racing with any
> invalidation
> +		 */
> +		mmu_interval_notifier_remove(&notifier->notifier);
> +		notifier->flags.removed = true;
> +
> +		drm_gpusvm_for_each_range_safe(range, __next,
> notifier, 0,
> +					       LONG_MAX)
> +			drm_gpusvm_range_remove(gpusvm, range);
> +	}
> +
> +	mmdrop(gpusvm->mm);
> +	WARN_ON(!RB_EMPTY_ROOT(&gpusvm->root.rb_root));
> +}
> +
> +/**
> + * drm_gpusvm_notifier_alloc - Allocate GPU SVM notifier
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @fault_addr: Fault address
> + *
> + * This function allocates and initializes the GPU SVM notifier
> structure.
> + *
> + * Returns:
> + * Pointer to the allocated GPU SVM notifier on success, ERR_PTR()
> on failure.
> + */
> +static struct drm_gpusvm_notifier *
> +drm_gpusvm_notifier_alloc(struct drm_gpusvm *gpusvm, u64
> fault_addr)
> +{
> +	struct drm_gpusvm_notifier *notifier;
> +
> +	if (gpusvm->ops->notifier_alloc)
> +		notifier = gpusvm->ops->notifier_alloc();
> +	else
> +		notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
> +
> +	if (!notifier)
> +		return ERR_PTR(-ENOMEM);
> +
> +	notifier->gpusvm = gpusvm;
> +	notifier->interval.start = ALIGN_DOWN(fault_addr, gpusvm-
> >notifier_size);
> +	notifier->interval.end = ALIGN(fault_addr + 1, gpusvm-
> >notifier_size);
> +	INIT_LIST_HEAD(&notifier->rb.entry);
> +	notifier->root = RB_ROOT_CACHED;
> +	INIT_LIST_HEAD(&notifier->range_list);
> +
> +	return notifier;
> +}
> +
> +/**
> + * drm_gpusvm_notifier_free - Free GPU SVM notifier
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: Pointer to the GPU SVM notifier structure
> + *
> + * This function frees the GPU SVM notifier structure.
> + */
> +static void drm_gpusvm_notifier_free(struct drm_gpusvm *gpusvm,
> +				     struct drm_gpusvm_notifier
> *notifier)
> +{
> +	WARN_ON(!RB_EMPTY_ROOT(&notifier->root.rb_root));
> +
> +	if (gpusvm->ops->notifier_free)
> +		gpusvm->ops->notifier_free(notifier);
> +	else
> +		kfree(notifier);
> +}
> +
> +/**
> + * to_drm_gpusvm_range - retrieve the container struct for a given
> rbtree node
> + * @node__: a pointer to the rbtree node embedded within a
> drm_gpusvm_range struct
> + *
> + * Return: A pointer to the containing drm_gpusvm_range structure.
> + */
> +#define to_drm_gpusvm_range(node__)	\
> +	container_of((node__), struct drm_gpusvm_range, rb.node)
> +
> +/**
> + * drm_gpusvm_range_insert - Insert GPU SVM range
> + * @notifier: Pointer to the GPU SVM notifier structure
> + * @range: Pointer to the GPU SVM range structure
> + *
> + * This function inserts the GPU SVM range into the notifier RB tree
> and list.
> + */
> +static void drm_gpusvm_range_insert(struct drm_gpusvm_notifier
> *notifier,
> +				    struct drm_gpusvm_range *range)
> +{
> +	struct rb_node *node;
> +	struct list_head *head;
> +
> +	drm_gpusvm_notifier_lock(notifier->gpusvm);
> +	range_insert(range, &notifier->root);
> +
> +	node = rb_prev(&range->rb.node);
> +	if (node)
> +		head = &(to_drm_gpusvm_range(node))->rb.entry;
> +	else
> +		head = &notifier->range_list;
> +
> +	list_add(&range->rb.entry, head);
> +	drm_gpusvm_notifier_unlock(notifier->gpusvm);
> +}
> +
> +/**
> + * __drm_gpusvm_range_remove - Remove GPU SVM range
> + * @notifier__: Pointer to the GPU SVM notifier structure
> + * @range__: Pointer to the GPU SVM range structure
> + *
> + * This macro removes the GPU SVM range from the notifier RB tree
> and list.
> + */
> +#define __drm_gpusvm_range_remove(notifier__, range__)
> 		\
> +	range_remove((range__), &(notifier__)->root);
> 	\
> +	list_del(&(range__)->rb.entry)
> +
> +/**
> + * drm_gpusvm_range_alloc - Allocate GPU SVM range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: Pointer to the GPU SVM notifier structure
> + * @fault_addr: Fault address
> + * @chunk_size: Chunk size
> + * @migrate_vram: Flag indicating whether to migrate VRAM
> + *
> + * This function allocates and initializes the GPU SVM range structure.
> + *
> + * Returns:
> + * Pointer to the allocated GPU SVM range on success, ERR_PTR() on
> failure.
> + */
> +static struct drm_gpusvm_range *
> +drm_gpusvm_range_alloc(struct drm_gpusvm *gpusvm,
> +		       struct drm_gpusvm_notifier *notifier,
> +		       u64 fault_addr, u64 chunk_size, bool
> migrate_vram)
> +{
> +	struct drm_gpusvm_range *range;
> +
> +	if (gpusvm->ops->range_alloc)
> +		range = gpusvm->ops->range_alloc(gpusvm);
> +	else
> +		range = kzalloc(sizeof(*range), GFP_KERNEL);
> +
> +	if (!range)
> +		return ERR_PTR(-ENOMEM);
> +
> +	kref_init(&range->refcount);
> +	range->gpusvm = gpusvm;
> +	range->notifier = notifier;
> +	range->va.start = ALIGN_DOWN(fault_addr, chunk_size);
> +	range->va.end = ALIGN(fault_addr + 1, chunk_size);
> +	INIT_LIST_HEAD(&range->rb.entry);
> +	range->notifier_seq = LONG_MAX;
> +	range->flags.migrate_vram = migrate_vram ? 1 : 0;
> +
> +	return range;
> +}
> +
> +/**
> + * drm_gpusvm_check_pages - Check pages
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: Pointer to the GPU SVM notifier structure
> + * @start: Start address
> + * @end: End address
> + *
> + * Check if pages between start and end have been faulted in on the CPU. Used
> + * to prevent migration of pages without CPU backing store.
> + *
> + * Returns:
> + * True if pages have been faulted into CPU, False otherwise
> + */
> +static bool drm_gpusvm_check_pages(struct drm_gpusvm *gpusvm,
> +				   struct drm_gpusvm_notifier
> *notifier,
> +				   u64 start, u64 end)
> +{
> +	struct hmm_range hmm_range = {
> +		.default_flags = 0,
> +		.notifier = &notifier->notifier,
> +		.start = start,
> +		.end = end,
> +		.dev_private_owner = gpusvm-
> >device_private_page_owner,
> +	};
> +	unsigned long timeout =
> +		jiffies +
> msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> +	unsigned long *pfns;
> +	unsigned long npages = npages_in_range(start, end);
> +	int err, i;
> +
> +	mmap_assert_locked(gpusvm->mm);
> +
> +	pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
> +	if (!pfns)
> +		return false;
> +
> +	hmm_range.notifier_seq =
> mmu_interval_read_begin(&notifier->notifier);
> +	hmm_range.hmm_pfns = pfns;
> +
> +	while (true) {
> +		err = hmm_range_fault(&hmm_range);
> +		if (err == -EBUSY) {
> +			if (time_after(jiffies, timeout))
> +				break;
> +
> +			hmm_range.notifier_seq =
> mmu_interval_read_begin(&notifier->notifier);
> +			continue;
> +		}
> +		break;
> +	}
> +	if (err)
> +		goto err_free;
> +
> +	for (i = 0; i < npages; ++i) {
> +		if (!(pfns[i] & HMM_PFN_VALID)) {
> +			err = -EFAULT;
> +			goto err_free;
> +		}
> +	}
> +
> +err_free:
> +	kvfree(pfns);
> +	return err ? false : true;
> +}
> +
> +/**
> + * drm_gpusvm_range_chunk_size - Determine chunk size for GPU
> SVM range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: Pointer to the GPU SVM notifier structure
> + * @vas: Pointer to the virtual memory area structure
> + * @fault_addr: Fault address
> + * @gpuva_start: Start address of GPUVA which mirrors CPU
> + * @gpuva_end: End address of GPUVA which mirrors CPU
> + * @check_pages: Flag indicating whether to check pages
> + *
> + * This function determines the chunk size for the GPU SVM range
> based on the
> + * fault address, GPU SVM chunk sizes, existing GPU SVM ranges,
> and the virtual
> + * memory area boundaries.
> + *
> + * Returns:
> + * Chunk size on success, LONG_MAX on failure.
> + */
> +static u64 drm_gpusvm_range_chunk_size(struct drm_gpusvm
> *gpusvm,
> +				       struct drm_gpusvm_notifier
> *notifier,
> +				       struct vm_area_struct *vas,
> +				       u64 fault_addr, u64 gpuva_start,
> +				       u64 gpuva_end, bool check_pages)
> +{
> +	u64 start, end;
> +	int i = 0;
> +
> +retry:
> +	for (; i < gpusvm->num_chunks; ++i) {
> +		start = ALIGN_DOWN(fault_addr, gpusvm-
> >chunk_sizes[i]);
> +		end = ALIGN(fault_addr + 1, gpusvm->chunk_sizes[i]);
> +
> +		if (start >= vas->vm_start && end <= vas->vm_end
> &&
> +		    start >= notifier->interval.start &&
> +		    end <= notifier->interval.end &&
> +		    start >= gpuva_start && end <= gpuva_end)
> +			break;
> +	}
> +
> +	if (i == gpusvm->num_chunks)
> +		return LONG_MAX;
> +
> +	/*
> +	 * If the allocation is more than a page, ensure it does not overlap
> +	 * with existing ranges.
> +	 */
> +	if (end - start != SZ_4K) {
> +		struct drm_gpusvm_range *range;
> +
> +		range = drm_gpusvm_range_find(notifier, start, end);
> +		if (range) {
> +			++i;
> +			goto retry;
> +		}
> +
> +		/*
> +		 * XXX: Only create range on pages CPU has faulted in.
> Without
> +		 * this check, or prefault, on BMG
> 'xe_exec_system_allocator --r
> +		 * process-many-malloc' fails. In the failure case, each
> process
> +		 * mallocs 16k but the CPU VMA is ~128k which results
> in 64k SVM
> +		 * ranges. When migrating the SVM ranges, some
> processes fail in
> +		 * drm_gpusvm_migrate_to_vram with
> 'migrate.cpages != npages'
> +		 * and then upon drm_gpusvm_range_get_pages
> device pages from
> +		 * other processes are collected + faulted in which
> creates all
> +		 * sorts of problems. Unsure exactly how this
> happening, also
> +		 * problem goes away if 'xe_exec_system_allocator --
> r
> +		 * process-many-malloc' mallocs at least 64k at a time.
> +		 */
> +		if (check_pages &&
> +		    !drm_gpusvm_check_pages(gpusvm, notifier, start,
> end)) {
> +			++i;
> +			goto retry;
> +		}
> +	}
> +
> +	return end - start;
> +}
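
As a concrete illustration of the selection (addresses and chunk sizes are
hypothetical): with chunk_sizes = {SZ_2M, SZ_64K, SZ_4K} and a fault inside a
128K anonymous VMA, the 2M chunk is rejected because it crosses the VMA
boundary; the 64K chunk is used if no existing range overlaps it and, when
check_pages is set, drm_gpusvm_check_pages() confirms the CPU has faulted those
64K in; otherwise the loop falls back to a 4K chunk.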
> +
> +/**
> + * drm_gpusvm_range_find_or_insert - Find or insert GPU SVM
> range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @fault_addr: Fault address
> + * @gpuva_start: Start address of GPUVA which mirrors CPU
> + * @gpuva_end: End address of GPUVA which mirrors CPU
> + * @ctx: GPU SVM context
> + *
> + * This function finds an existing GPU SVM range or inserts a newly allocated
> + * one based on the fault address. The caller must hold a lock to protect range
> + * lookup and insertion.
> + *
> + * Returns:
> + * Pointer to the GPU SVM range on success, ERR_PTR() on failure.
> + */
> +struct drm_gpusvm_range *
> +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm,
> u64 fault_addr,
> +				u64 gpuva_start, u64 gpuva_end,
> +				const struct drm_gpusvm_ctx *ctx)
> +{
> +	struct drm_gpusvm_notifier *notifier;
> +	struct drm_gpusvm_range *range;
> +	struct mm_struct *mm = gpusvm->mm;
> +	struct vm_area_struct *vas;
> +	bool notifier_alloc = false;
> +	u64 chunk_size;
> +	int err;
> +	bool migrate_vram;
> +
> +	if (fault_addr < gpusvm->mm_start ||
> +	    fault_addr > gpusvm->mm_start + gpusvm->mm_range) {
> +		err = -EINVAL;
> +		goto err_out;
> +	}
> +
> +	if (!ctx->mmap_locked) {
> +		if (!mmget_not_zero(mm)) {
> +			err = -EFAULT;
> +			goto err_out;
> +		}
> +		mmap_write_lock(mm);
> +	}
> +
> +	mmap_assert_write_locked(mm);
> +
> +	notifier = drm_gpusvm_notifier_find(gpusvm, fault_addr);
> +	if (!notifier) {
> +		notifier = drm_gpusvm_notifier_alloc(gpusvm,
> fault_addr);
> +		if (IS_ERR(notifier)) {
> +			err = PTR_ERR(notifier);
> +			goto err_mmunlock;
> +		}
> +		notifier_alloc = true;
> +		err = mmu_interval_notifier_insert_locked(&notifier-
> >notifier,
> +							  mm, notifier-
> >interval.start,
> +							  notifier-
> >interval.end -
> +							  notifier-
> >interval.start,
> +
> &drm_gpusvm_notifier_ops);
> +		if (err)
> +			goto err_notifier;
> +	}
> +
> +	vas = vma_lookup(mm, fault_addr);
> +	if (!vas) {
> +		err = -ENOENT;
> +		goto err_notifier_remove;
> +	}
> +
> +	if (!ctx->read_only && !(vas->vm_flags & VM_WRITE)) {
> +		err = -EPERM;
> +		goto err_notifier_remove;
> +	}
> +
> +	range = drm_gpusvm_range_find(notifier, fault_addr,
> fault_addr + 1);
> +	if (range)
> +		goto out_mmunlock;
> +	/*
> +	 * XXX: Short-circuiting migration based on migrate_vma_*
> current
> +	 * limitations. If/when migrate_vma_* add more support, this
> logic will
> +	 * have to change.
> +	 */
> +	migrate_vram = ctx->vram_possible &&
> +		vma_is_anonymous(vas)
> && !is_vm_hugetlb_page(vas);
> +
> +	chunk_size = drm_gpusvm_range_chunk_size(gpusvm,
> notifier, vas,
> +						 fault_addr,
> gpuva_start,
> +						 gpuva_end,
> migrate_vram &&
> +						 !ctx->prefault);
> +	if (chunk_size == LONG_MAX) {
> +		err = -EINVAL;
> +		goto err_notifier_remove;
> +	}
> +
> +	range = drm_gpusvm_range_alloc(gpusvm, notifier,
> fault_addr, chunk_size,
> +				       migrate_vram);
> +	if (IS_ERR(range)) {
> +		err = PTR_ERR(range);
> +		goto err_notifier_remove;
> +	}
> +
> +	drm_gpusvm_range_insert(notifier, range);
> +	if (notifier_alloc)
> +		drm_gpusvm_notifier_insert(gpusvm, notifier);
> +
> +	if (ctx->prefault) {
> +		struct drm_gpusvm_ctx __ctx = *ctx;
> +
> +		__ctx.mmap_locked = true;
> +		err = drm_gpusvm_range_get_pages(gpusvm, range,
> &__ctx);
> +		if (err)
> +			goto err_range_remove;
> +	}
> +
> +out_mmunlock:
> +	if (!ctx->mmap_locked) {
> +		mmap_write_unlock(mm);
> +		mmput(mm);
> +	}
> +
> +	return range;
> +
> +err_range_remove:
> +	__drm_gpusvm_range_remove(notifier, range);
> +err_notifier_remove:
> +	if (notifier_alloc)
> +		mmu_interval_notifier_remove(&notifier->notifier);
> +err_notifier:
> +	if (notifier_alloc)
> +		drm_gpusvm_notifier_free(gpusvm, notifier);
> +err_mmunlock:
> +	if (!ctx->mmap_locked) {
> +		mmap_write_unlock(mm);
> +		mmput(mm);
> +	}
> +err_out:
> +	return ERR_PTR(err);
> +}
> +
> +/**
> + * for_each_dma_page - iterate over pages in a DMA region
> + * @i__: the current page index in the iteration
> + * @j__: the current page index, log order, in the iteration
> + * @npages__: the total number of pages in the DMA region
> + * @order__: the order of the pages in the DMA region
> + *
> + * This macro iterates over each page in a DMA region. The DMA
> region
> + * is assumed to be composed of 2^@order__ pages, and the macro
> will
> + * step through the region one block of 2^@order__ pages at a time.
> + */
> +#define for_each_dma_page(i__, j__, npages__, order__)	\
> +	for ((i__) = 0, (j__) = 0; (i__) < (npages__);	\
> +	     (j__)++, (i__) += 0x1 << (order__))
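
To illustrate the stride (values hypothetical): with npages = 512 and order = 9
the loop body runs once with i = 0, j = 0 (one 2M block of 512 4K pages); with
order = 0 it runs 512 times, once per page, with i == j.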
> +
> +/**
> + * __drm_gpusvm_range_unmap_pages - Unmap pages associated
> with a GPU SVM range (internal)
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + *
> + * This function unmaps pages associated with a GPU SVM range. Assumes and
> + * asserts correct locking is in place when called.
> + */
> +static void __drm_gpusvm_range_unmap_pages(struct
> drm_gpusvm *gpusvm,
> +					   struct drm_gpusvm_range
> *range)
> +{
> +	lockdep_assert_held(&gpusvm->notifier_lock);
> +
> +	if (range->pages) {
> +		unsigned long i, j, npages = npages_in_range(range-
> >va.start,
> +							     range-
> >va.end);
> +
> +		if (range->flags.has_dma_mapping) {
> +			for_each_dma_page(i, j, npages, range-
> >order)
> +				dma_unmap_page(gpusvm->drm-
> >dev,
> +					       range->dma_addr[j],
> +					       PAGE_SIZE << range-
> >order,
> +					       DMA_BIDIRECTIONAL);
> +		}
> +
> +		range->flags.has_vram_pages = false;
> +		range->flags.has_dma_mapping = false;
> +	}
> +}
> +
> +/**
> + * drm_gpusvm_range_free_pages - Free pages associated with a
> GPU SVM range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + *
> + * This function frees pages associated with a
> + */
> +static void drm_gpusvm_range_free_pages(struct drm_gpusvm
> *gpusvm,
> +					struct drm_gpusvm_range
> *range)
> +{
> +	lockdep_assert_held(&gpusvm->notifier_lock);
> +
> +	if (range->pages) {
> +		if (range->flags.kfree_mapping) {
> +			kfree(range->dma_addr);
> +			range->flags.kfree_mapping = false;
> +			range->pages = NULL;
> +		} else {
> +			kvfree(range->pages);
> +			range->pages = NULL;
> +		}
> +	}
> +}
> +
> +/**
> + * drm_gpusvm_range_remove - Remove GPU SVM range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range to be removed
> + *
> + * This function removes the specified GPU SVM range and also
> removes the parent
> + * GPU SVM notifier if no more ranges remain in the notifier. The
> caller must
> + * hold a lock to protect range and notifier removal.
> + */
> +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> +			     struct drm_gpusvm_range *range)
> +{
> +	struct drm_gpusvm_notifier *notifier;
> +
> +	notifier = drm_gpusvm_notifier_find(gpusvm, range-
> >va.start);
> +	if (WARN_ON_ONCE(!notifier))
> +		return;
> +
> +	drm_gpusvm_notifier_lock(gpusvm);
> +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> +	drm_gpusvm_range_free_pages(gpusvm, range);
> +	__drm_gpusvm_range_remove(notifier, range);
> +	drm_gpusvm_notifier_unlock(gpusvm);
> +
> +	drm_gpusvm_range_put(range);
> +
> +	if (RB_EMPTY_ROOT(&notifier->root.rb_root)) {
> +		if (!notifier->flags.removed)
> +			mmu_interval_notifier_remove(&notifier-
> >notifier);
> +		drm_gpusvm_notifier_remove(gpusvm, notifier);
> +		drm_gpusvm_notifier_free(gpusvm, notifier);
> +	}
> +}
> +
> +/**
> + * drm_gpusvm_range_get - Get a reference to GPU SVM range
> + * @range: Pointer to the GPU SVM range
> + *
> + * This function increments the reference count of the specified
> GPU SVM range.
> + *
> + * Returns:
> + * Pointer to the GPU SVM range.
> + */
> +struct drm_gpusvm_range *
> +drm_gpusvm_range_get(struct drm_gpusvm_range *range)
> +{
> +	kref_get(&range->refcount);
> +
> +	return range;
> +}
> +
> +/**
> + * drm_gpusvm_range_destroy - Destroy GPU SVM range
> + * @refcount: Pointer to the reference counter embedded in the
> GPU SVM range
> + *
> + * This function destroys the specified GPU SVM range when its
> reference count
> + * reaches zero. If a custom range-free function is provided, it is
> invoked to
> + * free the range; otherwise, the range is deallocated using kfree().
> + */
> +static void drm_gpusvm_range_destroy(struct kref *refcount)
> +{
> +	struct drm_gpusvm_range *range =
> +		container_of(refcount, struct drm_gpusvm_range,
> refcount);
> +	struct drm_gpusvm *gpusvm = range->gpusvm;
> +
> +	if (gpusvm->ops->range_free)
> +		gpusvm->ops->range_free(range);
> +	else
> +		kfree(range);
> +}
> +
> +/**
> + * drm_gpusvm_range_put - Put a reference to GPU SVM range
> + * @range: Pointer to the GPU SVM range
> + *
> + * This function decrements the reference count of the specified
> GPU SVM range
> + * and frees it when the count reaches zero.
> + */
> +void drm_gpusvm_range_put(struct drm_gpusvm_range *range)
> +{
> +	kref_put(&range->refcount, drm_gpusvm_range_destroy);
> +}
> +
> +/**
> + * drm_gpusvm_range_pages_valid - GPU SVM range pages valid
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + *
> + * This function determines if a GPU SVM range's pages are valid. Expected to
> + * be called holding gpusvm->notifier_lock and as the last step before
> + * committing a GPU binding.
> + *
> + * Returns:
> + * True if GPU SVM range has valid pages, False otherwise
> + */
> +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> +				  struct drm_gpusvm_range *range)
> +{
> +	lockdep_assert_held(&gpusvm->notifier_lock);
> +
> +	return range->flags.has_vram_pages || range-
> >flags.has_dma_mapping;
> +}
> +
> +/**
> + * drm_gpusvm_range_pages_valid_unlocked - GPU SVM range
> pages valid unlocked
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + *
> + * This function determines if a GPU SVM range's pages are valid. Expected to
> + * be called without holding gpusvm->notifier_lock.
> + *
> + * Returns:
> + * True if GPU SVM range has valid pages, False otherwise
> + */
> +static bool
> +drm_gpusvm_range_pages_valid_unlocked(struct drm_gpusvm
> *gpusvm,
> +				      struct drm_gpusvm_range *range)
> +{
> +	bool pages_valid;
> +
> +	if (!range->pages)
> +		return false;
> +
> +	drm_gpusvm_notifier_lock(gpusvm);
> +	pages_valid = drm_gpusvm_range_pages_valid(gpusvm,
> range);
> +	if (!pages_valid && range->flags.kfree_mapping) {
> +		kfree(range->dma_addr);
> +		range->flags.kfree_mapping = false;
> +		range->pages = NULL;
> +	}
> +	drm_gpusvm_notifier_unlock(gpusvm);
> +
> +	return pages_valid;
> +}
> +
> +/**
> + * drm_gpusvm_range_get_pages - Get pages for a GPU SVM range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + * @ctx: GPU SVM context
> + *
> + * This function gets pages for a GPU SVM range and ensures they
> are mapped for
> + * DMA access.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> +			       struct drm_gpusvm_range *range,
> +			       const struct drm_gpusvm_ctx *ctx)
> +{
> +	struct mmu_interval_notifier *notifier = &range->notifier-
> >notifier;
> +	struct hmm_range hmm_range = {
> +		.default_flags = HMM_PFN_REQ_FAULT | (ctx-
> >read_only ? 0 :
> +			HMM_PFN_REQ_WRITE),
> +		.notifier = notifier,
> +		.start = range->va.start,
> +		.end = range->va.end,
> +		.dev_private_owner = gpusvm-
> >device_private_page_owner,
> +	};
> +	struct mm_struct *mm = gpusvm->mm;
> +	unsigned long timeout =
> +		jiffies +
> msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> +	unsigned long i, j;
> +	unsigned long npages = npages_in_range(range->va.start,
> range->va.end);
> +	unsigned int order = 0;
> +	unsigned long *pfns;
> +	struct page **pages;
> +	int err = 0;
> +	bool vram_pages = !!range->flags.migrate_vram;
> +	bool alloc_pfns = false, kfree_mapping;
> +
> +retry:
> +	kfree_mapping = false;
> +	hmm_range.notifier_seq =
> mmu_interval_read_begin(notifier);
> +	if (drm_gpusvm_range_pages_valid_unlocked(gpusvm,
> range))
> +		return 0;
> +
> +	if (range->notifier_seq == hmm_range.notifier_seq &&
> range->pages) {
> +		if (ctx->prefault)
> +			return 0;
> +
> +		pfns = (unsigned long *)range->pages;
> +		pages = range->pages;
> +		goto map_pages;
> +	}
> +
> +	if (!range->pages) {
> +		pfns = kvmalloc_array(npages, sizeof(*pfns),
> GFP_KERNEL);
> +		if (!pfns)
> +			return -ENOMEM;
> +		alloc_pfns = true;
> +	} else {
> +		pfns = (unsigned long *)range->pages;
> +	}
> +
> +	if (!ctx->mmap_locked) {
> +		if (!mmget_not_zero(mm)) {
> +			err = -EFAULT;
> +			goto err_out;
> +		}
> +	}
> +
> +	hmm_range.hmm_pfns = pfns;
> +	while (true) {
> +		/* Must be checked after mmu_interval_read_begin
> */
> +		if (range->flags.unmapped) {
> +			err = -EFAULT;
> +			break;
> +		}
> +
> +		if (!ctx->mmap_locked) {
> +			/*
> +			 * XXX: HMM locking document indicates only
> a read-lock
> +			 * is required but there apears to be a window
> between
> +			 * the MMU_NOTIFY_MIGRATE event
> triggered in a CPU fault
> +			 * via migrate_vma_setup and the pages
> actually moving
> +			 * in migrate_vma_finalize in which this code
> can grab
> +			 * garbage pages. Grabbing the write-lock if
> the range
> +			 * is attached to vram appears to protect
> against this
> +			 * race.
> +			 */
> +			if (vram_pages)
> +				mmap_write_lock(mm);
> +			else
> +				mmap_read_lock(mm);
> +		}
> +		err = hmm_range_fault(&hmm_range);
> +		if (!ctx->mmap_locked) {
> +			if (vram_pages)
> +				mmap_write_unlock(mm);
> +			else
> +				mmap_read_unlock(mm);
> +		}
> +
> +		if (err == -EBUSY) {
> +			if (time_after(jiffies, timeout))
> +				break;
> +
> +			hmm_range.notifier_seq =
> mmu_interval_read_begin(notifier);
> +			continue;
> +		}
> +		break;
> +	}
> +	if (!ctx->mmap_locked)
> +		mmput(mm);
> +	if (err)
> +		goto err_free;
> +
> +	pages = (struct page **)pfns;
> +
> +	if (ctx->prefault) {
> +		range->pages = pages;
> +		goto set_seqno;
> +	}
> +
> +map_pages:
> +	if (is_device_private_page(hmm_pfn_to_page(pfns[0]))) {
> +		WARN_ON_ONCE(!range->vram_allocation);
> +
> +		for (i = 0; i < npages; ++i) {
> +			pages[i] = hmm_pfn_to_page(pfns[i]);
> +
> +			if
> (WARN_ON_ONCE(!is_device_private_page(pages[i]))) {
> +				err = -EOPNOTSUPP;
> +				goto err_free;
> +			}
> +		}
> +
> +		/* Do not race with notifier unmapping pages */
> +		drm_gpusvm_notifier_lock(gpusvm);
> +		range->flags.has_vram_pages = true;
> +		range->pages = pages;
> +		if (mmu_interval_read_retry(notifier,
> hmm_range.notifier_seq)) {
> +			err = -EAGAIN;
> +
> 	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> +		}
> +		drm_gpusvm_notifier_unlock(gpusvm);
> +	} else {
> +		dma_addr_t *dma_addr = (dma_addr_t *)pfns;
> +
> +		for_each_dma_page(i, j, npages, order) {
> +			if (WARN_ON_ONCE(i && order !=
> +
> hmm_pfn_to_map_order(pfns[i]))) {
> +				err = -EOPNOTSUPP;
> +				npages = i;
> +				goto err_unmap;
> +			}
> +			order = hmm_pfn_to_map_order(pfns[i]);
> +
> +			pages[j] = hmm_pfn_to_page(pfns[i]);
> +			if
> (WARN_ON_ONCE(is_zone_device_page(pages[j]))) {
> +				err = -EOPNOTSUPP;
> +				npages = i;
> +				goto err_unmap;
> +			}
> +
> +			set_page_dirty_lock(pages[j]);
> +			mark_page_accessed(pages[j]);
> +
> +			dma_addr[j] = dma_map_page(gpusvm-
> >drm->dev,
> +						   pages[j], 0,
> +						   PAGE_SIZE << order,
> +
> DMA_BIDIRECTIONAL);
> +			if (dma_mapping_error(gpusvm->drm->dev,
> dma_addr[j])) {
> +				err = -EFAULT;
> +				npages = i;
> +				goto err_unmap;
> +			}
> +		}
> +
> +		/* Huge pages, reduce memory footprint */
> +		if (order) {
> +			dma_addr = kmalloc_array(j,
> sizeof(*dma_addr),
> +						 GFP_KERNEL);
> +			if (dma_addr) {
> +				for (i = 0; i < j; ++i)
> +					dma_addr[i] =
> (dma_addr_t)pfns[i];
> +				kvfree(pfns);
> +				kfree_mapping = true;
> +			} else {
> +				dma_addr = (dma_addr_t *)pfns;
> +			}
> +		}
> +
> +		/* Do not race with notifier unmapping pages */
> +		drm_gpusvm_notifier_lock(gpusvm);
> +		range->order = order;
> +		range->flags.kfree_mapping = kfree_mapping;
> +		range->flags.has_dma_mapping = true;
> +		range->dma_addr = dma_addr;
> +		range->vram_allocation = NULL;
> +		if (mmu_interval_read_retry(notifier,
> hmm_range.notifier_seq)) {
> +			err = -EAGAIN;
> +
> 	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> +		}
> +		drm_gpusvm_notifier_unlock(gpusvm);
> +	}
> +
> +	if (err == -EAGAIN)
> +		goto retry;
> +set_seqno:
> +	range->notifier_seq = hmm_range.notifier_seq;
> +
> +	return 0;
> +
> +err_unmap:
> +	for_each_dma_page(i, j, npages, order)
> +		dma_unmap_page(gpusvm->drm->dev,
> +			       (dma_addr_t)pfns[j],
> +			       PAGE_SIZE << order,
> DMA_BIDIRECTIONAL);
> +err_free:
> +	if (alloc_pfns)
> +		kvfree(pfns);
> +err_out:
> +	return err;
> +}
> +
> +/**
> + * drm_gpusvm_range_unmap_pages - Unmap pages associated
> with a GPU SVM range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + * @ctx: GPU SVM context
> + *
> + * This function unmaps pages associated with a GPU SVM range. If
> @in_notifier
> + * is set, it is assumed that gpusvm->notifier_lock is held in write
> mode; if it
> + * is clear, it acquires gpusvm->notifier_lock in read mode. Must be
> called on
> + * each GPU SVM range attached to notifier in gpusvm->ops-
> >invalidate for IOMMU
> + * security model.
> + */
> +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm
> *gpusvm,
> +				  struct drm_gpusvm_range *range,
> +				  const struct drm_gpusvm_ctx *ctx)
> +{
> +	if (ctx->in_notifier)
> +		lockdep_assert_held_write(&gpusvm->notifier_lock);
> +	else
> +		drm_gpusvm_notifier_lock(gpusvm);
> +
> +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> +
> +	if (!ctx->in_notifier)
> +		drm_gpusvm_notifier_unlock(gpusvm);
> +}
> +
> +/**
> + * drm_gpusvm_migration_put_page - Put a migration page
> + * @page: Pointer to the page to put
> + *
> + * This function unlocks and puts a page.
> + */
> +static void drm_gpusvm_migration_put_page(struct page *page)
> +{
> +	unlock_page(page);
> +	put_page(page);
> +}
> +
> +/**
> + * drm_gpusvm_migration_put_pages - Put migration pages
> + * @npages: Number of pages
> + * @migrate_pfn: Array of migrate page frame numbers
> + *
> + * This function puts an array of pages.
> + */
> +static void drm_gpusvm_migration_put_pages(unsigned long
> npages,
> +					   unsigned long *migrate_pfn)
> +{
> +	unsigned long i;
> +
> +	for (i = 0; i < npages; ++i) {
> +		if (!migrate_pfn[i])
> +			continue;
> +
> +
> 	drm_gpusvm_migration_put_page(migrate_pfn_to_page(mi
> grate_pfn[i]));
> +		migrate_pfn[i] = 0;
> +	}
> +}
> +
> +/**
> + * drm_gpusvm_get_vram_page - Get a reference to a VRAM page
> + * @page: Pointer to the page
> + * @zdd: Pointer to the GPU SVM zone device data
> + *
> + * This function associates the given page with the specified GPU
> SVM zone
> + * device data and initializes it for zone device usage.
> + */
> +static void drm_gpusvm_get_vram_page(struct page *page,
> +				     struct drm_gpusvm_zdd *zdd)
> +{
> +	page->zone_device_data = drm_gpusvm_zdd_get(zdd);
> +	zone_device_page_init(page);
> +}
> +
> +/**
> + * drm_gpusvm_migrate_map_pages() - Map migration pages for
> GPU SVM migration
> + * @dev: The device for which the pages are being mapped
> + * @dma_addr: Array to store DMA addresses corresponding to
> mapped pages
> + * @migrate_pfn: Array of migrate page frame numbers to map
> + * @npages: Number of pages to map
> + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> + *
> + * This function maps pages of memory for migration usage in GPU
> SVM. It
> + * iterates over each page frame number provided in @migrate_pfn,
> maps the
> + * corresponding page, and stores the DMA address in the provided
> @dma_addr
> + * array.
> + *
> + * Return: 0 on success, -EFAULT if an error occurs during mapping.
> + */
> +static int drm_gpusvm_migrate_map_pages(struct device *dev,
> +					dma_addr_t *dma_addr,
> +					long unsigned int
> *migrate_pfn,
> +					unsigned long npages,
> +					enum dma_data_direction dir)
> +{
> +	unsigned long i;
> +
> +	for (i = 0; i < npages; ++i) {
> +		struct page *page =
> migrate_pfn_to_page(migrate_pfn[i]);
> +
> +		if (!page)
> +			continue;
> +
> +		if (WARN_ON_ONCE(is_zone_device_page(page)))
> +			return -EFAULT;
> +
> +		dma_addr[i] = dma_map_page(dev, page, 0,
> PAGE_SIZE, dir);
> +		if (dma_mapping_error(dev, dma_addr[i]))
> +			return -EFAULT;
> +	}
> +
> +	return 0;
> +}
> +
> +/**
> + * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously
> mapped for GPU SVM migration
> + * @dev: The device for which the pages were mapped
> + * @dma_addr: Array of DMA addresses corresponding to mapped
> pages
> + * @npages: Number of pages to unmap
> + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> + *
> + * This function unmaps previously mapped pages of memory for
> GPU Shared Virtual
> + * Memory (SVM). It iterates over each DMA address provided in
> @dma_addr, checks
> + * if it's valid and not already unmapped, and unmaps the
> corresponding page.
> + */
> +static void drm_gpusvm_migrate_unmap_pages(struct device *dev,
> +					   dma_addr_t *dma_addr,
> +					   unsigned long npages,
> +					   enum dma_data_direction
> dir)
> +{
> +	unsigned long i;
> +
> +	for (i = 0; i < npages; ++i) {
> +		if (!dma_addr[i] || dma_mapping_error(dev,
> dma_addr[i]))
> +			continue;
> +
> +		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
> +	}
> +}
> +
> +/**
> + * drm_gpusvm_migrate_to_vram - Migrate GPU SVM range to
> VRAM
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + * @vram_allocation: Driver-private pointer to the VRAM allocation. The caller
> + *                   should hold a reference to the VRAM allocation, which
> + *                   should be dropped via ops->vram_release or upon the
> + *                   failure of this function.
> + * @ctx: GPU SVM context
> + *
> + * This function migrates the specified GPU SVM range to VRAM. It
> performs the
> + * necessary setup and invokes the driver-specific operations for
> migration to
> + * VRAM. Upon successful return, @vram_allocation can safely reference @range
> + * until ops->vram_release is called, which only happens upon successful return.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> +			       struct drm_gpusvm_range *range,
> +			       void *vram_allocation,
> +			       const struct drm_gpusvm_ctx *ctx)
> +{
> +	u64 start = range->va.start, end = range->va.end;
> +	struct migrate_vma migrate = {
> +		.start		= start,
> +		.end		= end,
> +		.pgmap_owner	= gpusvm-
> >device_private_page_owner,
> +		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
> +	};
> +	struct mm_struct *mm = gpusvm->mm;
> +	unsigned long i, npages = npages_in_range(start, end);
> +	struct vm_area_struct *vas;
> +	struct drm_gpusvm_zdd *zdd = NULL;
> +	struct page **pages;
> +	dma_addr_t *dma_addr;
> +	void *buf;
> +	int err;
> +
> +	if (!range->flags.migrate_vram)
> +		return -EINVAL;
> +
> +	if (!gpusvm->ops->populate_vram_pfn || !gpusvm->ops-
> >copy_to_vram ||
> +	    !gpusvm->ops->copy_to_sram)
> +		return -EOPNOTSUPP;
> +
> +	if (!ctx->mmap_locked) {
> +		if (!mmget_not_zero(mm)) {
> +			err = -EFAULT;
> +			goto err_out;
> +		}
> +		mmap_write_lock(mm);
> +	}
> +
> +	mmap_assert_locked(mm);
> +
> +	vas = vma_lookup(mm, start);
> +	if (!vas) {
> +		err = -ENOENT;
> +		goto err_mmunlock;
> +	}
> +
> +	if (end > vas->vm_end || start < vas->vm_start) {
> +		err = -EINVAL;
> +		goto err_mmunlock;
> +	}
> +
> +	if (!vma_is_anonymous(vas)) {
> +		err = -EBUSY;
> +		goto err_mmunlock;
> +	}
> +
> +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) +
> sizeof(*dma_addr) +
> +		       sizeof(*pages), GFP_KERNEL);
> +	if (!buf) {
> +		err = -ENOMEM;
> +		goto err_mmunlock;
> +	}
> +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr))
> * npages;
> +
> +	zdd = drm_gpusvm_zdd_alloc(range);
> +	if (!zdd) {
> +		err = -ENOMEM;
> +		goto err_free;
> +	}
> +
> +	migrate.vma = vas;
> +	migrate.src = buf;
> +	migrate.dst = migrate.src + npages;
> +
> +	err = migrate_vma_setup(&migrate);
> +	if (err)
> +		goto err_free;
> +
> +	/*
> +	 * FIXME: Below cases, !migrate.cpages and migrate.cpages !=
> npages, not
> +	 * always an error. Need to revisit possible cases and how to
> handle. We
> +	 * could prefault on migrate.cpages != npages via
> hmm_range_fault.
> +	 */
> +
> +	if (!migrate.cpages) {
> +		err = -EFAULT;
> +		goto err_free;
> +	}
> +
> +	if (migrate.cpages != npages) {
> +		err = -EBUSY;
> +		goto err_finalize;
> +	}
> +
> +	err = gpusvm->ops->populate_vram_pfn(gpusvm,
> vram_allocation, npages,
> +					     migrate.dst);
> +	if (err)
> +		goto err_finalize;
> +
> +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> dma_addr,
> +					   migrate.src, npages,
> DMA_TO_DEVICE);
> +	if (err)
> +		goto err_finalize;
> +
> +	for (i = 0; i < npages; ++i) {
> +		struct page *page = pfn_to_page(migrate.dst[i]);
> +
> +		pages[i] = page;
> +		migrate.dst[i] = migrate_pfn(migrate.dst[i]);
> +		drm_gpusvm_get_vram_page(page, zdd);
> +	}
> +
> +	err = gpusvm->ops->copy_to_vram(gpusvm, pages,
> dma_addr, npages);
> +	if (err)
> +		goto err_finalize;
> +
> +	/* Upon success bind vram allocation to range and zdd */
> +	range->vram_allocation = vram_allocation;
> +	WRITE_ONCE(zdd->vram_allocation, vram_allocation);	/*
> Owns ref */
> +
> +err_finalize:
> +	if (err)
> +		drm_gpusvm_migration_put_pages(npages,
> migrate.dst);
> +	migrate_vma_pages(&migrate);
> +	migrate_vma_finalize(&migrate);
> +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev,
> dma_addr, npages,
> +				       DMA_TO_DEVICE);
> +err_free:
> +	if (zdd)
> +		drm_gpusvm_zdd_put(zdd);
> +	kvfree(buf);
> +err_mmunlock:
> +	if (!ctx->mmap_locked) {
> +		mmap_write_unlock(mm);
> +		mmput(mm);
> +	}
> +err_out:
> +	return err;
> +}
> +
> +/**
> + * drm_gpusvm_migrate_populate_sram_pfn - Populate SRAM
> PFNs for a VM area
> + * @vas: Pointer to the VM area structure, can be NULL
> + * @npages: Number of pages to populate
> + * @src_mpfn: Source array of migrate PFNs
> + * @mpfn: Array of migrate PFNs to populate
> + * @addr: Start address for PFN allocation
> + *
> + * This function populates the SRAM migrate page frame numbers
> (PFNs) for the
> + * specified VM area structure. It allocates and locks pages in the VM
> area for
> + * SRAM usage. If vas is non-NULL use alloc_page_vma for allocation,
> if NULL use
> + * alloc_page for allocation.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +static int drm_gpusvm_migrate_populate_sram_pfn(struct
> vm_area_struct *vas,
> +						unsigned long npages,
> +						unsigned long
> *src_mpfn,
> +						unsigned long *mpfn,
> u64 addr)
> +{
> +	unsigned long i;
> +
> +	for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
> +		struct page *page;
> +
> +		if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
> +			continue;
> +
> +		if (vas)
> +			page = alloc_page_vma(GFP_HIGHUSER, vas,
> addr);
> +		else
> +			page = alloc_page(GFP_HIGHUSER);
> +
> +		if (!page)
> +			return -ENOMEM;
> +
> +		lock_page(page);
> +		mpfn[i] = migrate_pfn(page_to_pfn(page));
> +	}
> +
> +	return 0;
> +}
> +
> +/**
> + * drm_gpusvm_evict_to_sram - Evict GPU SVM range to SRAM
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + *
> + * Similar to __drm_gpusvm_migrate_to_sram but does not require the mmap lock;
> + * migration is done via the migrate_device_* functions. This is a fallback
> + * path, as it is preferred to issue migrations with the mmap lock held.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +static int drm_gpusvm_evict_to_sram(struct drm_gpusvm *gpusvm,
> +				    struct drm_gpusvm_range *range)
> +{
> +	unsigned long npages;
> +	struct page **pages;
> +	unsigned long *src, *dst;
> +	dma_addr_t *dma_addr;
> +	void *buf;
> +	int i, err = 0;
> +
> +	npages = npages_in_range(range->va.start, range->va.end);
> +
> +	buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr) +
> +		       sizeof(*pages), GFP_KERNEL);
> +	if (!buf) {
> +		err = -ENOMEM;
> +		goto err_out;
> +	}
> +	src = buf;
> +	dst = buf + (sizeof(*src) * npages);
> +	dma_addr = buf + (2 * sizeof(*src) * npages);
> +	pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) * npages;
> +
> +	err = gpusvm->ops->populate_vram_pfn(gpusvm, range-
> >vram_allocation,
> +					     npages, src);
> +	if (err)
> +		goto err_free;
> +
> +	err = migrate_device_vma_range(gpusvm->mm,
> +				       gpusvm-
> >device_private_page_owner, src,
> +				       npages, range->va.start);
> +	if (err)
> +		goto err_free;
> +
> +	err = drm_gpusvm_migrate_populate_sram_pfn(NULL,
> npages, src, dst, 0);
> +	if (err)
> +		goto err_finalize;
> +
> +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> dma_addr,
> +					   dst, npages,
> DMA_BIDIRECTIONAL);
> +	if (err)
> +		goto err_finalize;
> +
> +	for (i = 0; i < npages; ++i)
> +		pages[i] = migrate_pfn_to_page(src[i]);
> +
> +	err = gpusvm->ops->copy_to_sram(gpusvm, pages,
> dma_addr, npages);
> +	if (err)
> +		goto err_finalize;
> +
> +err_finalize:
> +	if (err)
> +		drm_gpusvm_migration_put_pages(npages, dst);
> +	migrate_device_pages(src, dst, npages);
> +	migrate_device_finalize(src, dst, npages);
> +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev,
> dma_addr, npages,
> +				       DMA_BIDIRECTIONAL);
> +err_free:
> +	kvfree(buf);
> +err_out:
> +
> +	return err;
> +}
> +
> +/**
> + * __drm_gpusvm_migrate_to_sram - Migrate GPU SVM range to
> SRAM (internal)
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @vas: Pointer to the VM area structure
> + * @page: Pointer to the page for fault handling (can be NULL)
> + * @start: Start address of the migration range
> + * @end: End address of the migration range
> + *
> + * This internal function performs the migration of the specified GPU
> SVM range
> + * to SRAM. It sets up the migration, populates + dma maps SRAM
> PFNs, and
> + * invokes the driver-specific operations for migration to SRAM.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +static int __drm_gpusvm_migrate_to_sram(struct drm_gpusvm
> *gpusvm,
> +					struct vm_area_struct *vas,
> +					struct page *page,
> +					u64 start, u64 end)
> +{
> +	struct migrate_vma migrate = {
> +		.vma		= vas,
> +		.pgmap_owner	= gpusvm-
> >device_private_page_owner,
> +		.flags		=
> MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
> +		.fault_page	= page,
> +	};
> +	unsigned long npages;
> +	struct page **pages;
> +	dma_addr_t *dma_addr;
> +	void *buf;
> +	int i, err = 0;
> +
> +	mmap_assert_locked(gpusvm->mm);
> +
> +	/* Corner where VMA area struct has been partially
> unmapped */
> +	if (start < vas->vm_start)
> +		start = vas->vm_start;
> +	if (end > vas->vm_end)
> +		end = vas->vm_end;
> +
> +	migrate.start = start;
> +	migrate.end = end;
> +	npages = npages_in_range(start, end);
> +
> +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) +
> sizeof(*dma_addr) +
> +		       sizeof(*pages), GFP_KERNEL);
> +	if (!buf) {
> +		err = -ENOMEM;
> +		goto err_out;
> +	}
> +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr))
> * npages;
> +
> +	migrate.vma = vas;
> +	migrate.src = buf;
> +	migrate.dst = migrate.src + npages;
> +
> +	err = migrate_vma_setup(&migrate);
> +	if (err)
> +		goto err_free;
> +
> +	/* Raced with another CPU fault, nothing to do */
> +	if (!migrate.cpages)
> +		goto err_free;
> +
> +	err = drm_gpusvm_migrate_populate_sram_pfn(vas, npages,
> +						   migrate.src,
> migrate.dst,
> +						   start);
> +	if (err)
> +		goto err_finalize;
> +
> +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> dma_addr,
> +					   migrate.dst, npages,
> +					   DMA_BIDIRECTIONAL);
> +	if (err)
> +		goto err_finalize;
> +
> +	for (i = 0; i < npages; ++i)
> +		pages[i] = migrate_pfn_to_page(migrate.src[i]);
> +
> +	err = gpusvm->ops->copy_to_sram(gpusvm, pages,
> dma_addr, npages);
> +	if (err)
> +		goto err_finalize;
> +
> +err_finalize:
> +	if (err)
> +		drm_gpusvm_migration_put_pages(npages,
> migrate.dst);
> +	migrate_vma_pages(&migrate);
> +	migrate_vma_finalize(&migrate);
> +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev,
> dma_addr, npages,
> +				       DMA_BIDIRECTIONAL);
> +err_free:
> +	kvfree(buf);
> +err_out:
> +	mmap_assert_locked(gpusvm->mm);
> +
> +	return err;
> +}
> +
> +/**
> + * drm_gpusvm_migrate_to_sram - Migrate (evict) GPU SVM range
> to SRAM
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + * @ctx: GPU SVM context
> + *
> + * This function initiates the migration of the specified GPU SVM
> range to
> + * SRAM. It performs necessary checks and invokes the internal
> migration
> + * function for actual migration.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> +			       struct drm_gpusvm_range *range,
> +			       const struct drm_gpusvm_ctx *ctx)
> +{
> +	u64 start = range->va.start, end = range->va.end;
> +	struct mm_struct *mm = gpusvm->mm;
> +	struct vm_area_struct *vas;
> +	int err;
> +	bool retry = false;
> +
> +	if (!ctx->mmap_locked) {
> +		if (!mmget_not_zero(mm)) {
> +			err = -EFAULT;
> +			goto err_out;
> +		}
> +		if (ctx->trylock_mmap) {
> +			if (!mmap_read_trylock(mm))  {
> +				err =
> drm_gpusvm_evict_to_sram(gpusvm, range);
> +				goto err_mmput;
> +			}
> +		} else {
> +			mmap_read_lock(mm);
> +		}
> +	}
> +
> +	mmap_assert_locked(mm);
> +
> +	/*
> +	 * Loop required to find all VMA area structs for the corner
> case when
> +	 * VRAM backing has been partially unmapped from MM's
> address space.
> +	 */
> +again:
> +	vas = find_vma(mm, start);
> +	if (!vas) {
> +		if (!retry)
> +			err = -ENOENT;
> +		goto err_mmunlock;
> +	}
> +
> +	if (end <= vas->vm_start || start >= vas->vm_end) {
> +		if (!retry)
> +			err = -EINVAL;
> +		goto err_mmunlock;
> +	}
> +
> +	err = __drm_gpusvm_migrate_to_sram(gpusvm, vas, NULL,
> start, end);
> +	if (err)
> +		goto err_mmunlock;
> +
> +	if (vas->vm_end < end) {
> +		retry = true;
> +		start = vas->vm_end;
> +		goto again;
> +	}
> +
> +	if (!ctx->mmap_locked) {
> +		mmap_read_unlock(mm);
> +		/*
> +		 * Using mmput_async as this function can be called
> while
> +		 * holding a dma-resv lock, and a final put can grab the
> mmap
> +		 * lock, causing a lock inversion.
> +		 */
> +		mmput_async(mm);
> +	}
> +
> +	return 0;
> +
> +err_mmunlock:
> +	if (!ctx->mmap_locked)
> +		mmap_read_unlock(mm);
> +err_mmput:
> +	if (!ctx->mmap_locked)
> +		mmput_async(mm);
> +err_out:
> +	return err;
> +}
> +
> +/**
> + * drm_gpusvm_page_free - Put GPU SVM zone device data
> associated with a page
> + * @page: Pointer to the page
> + *
> + * This function is a callback used to put the GPU SVM zone device
> data
> + * associated with a page when it is being released.
> + */
> +static void drm_gpusvm_page_free(struct page *page)
> +{
> +	drm_gpusvm_zdd_put(page->zone_device_data);
> +}
> +
> +/**
> + * drm_gpusvm_migrate_to_ram - Migrate GPU SVM range to RAM
> (page fault handler)
> + * @vmf: Pointer to the fault information structure
> + *
> + * This function is a page fault handler used to migrate a GPU SVM
> range to RAM.
> + * It retrieves the GPU SVM range information from the faulting
> page and invokes
> + * the internal migration function to migrate the range back to RAM.
> + *
> + * Returns:
> + * VM_FAULT_SIGBUS on failure, 0 on success.
> + */
> +static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault
> *vmf)
> +{
> +	struct drm_gpusvm_zdd *zdd = vmf->page-
> >zone_device_data;
> +	int err;
> +
> +	err = __drm_gpusvm_migrate_to_sram(zdd->range-
> >gpusvm,
> +					   vmf->vma, vmf->page,
> +					   zdd->range->va.start,
> +					   zdd->range->va.end);
> +
> +	return err ? VM_FAULT_SIGBUS : 0;
> +}
> +
> +/**
> + * drm_gpusvm_pagemap_ops - Device page map operations for
> GPU SVM
> + */
> +static const struct dev_pagemap_ops drm_gpusvm_pagemap_ops =
> {
> +	.page_free = drm_gpusvm_page_free,
> +	.migrate_to_ram = drm_gpusvm_migrate_to_ram,
> +};
> +
> +/**
> + * drm_gpusvm_pagemap_ops_get - Retrieve GPU SVM device
> page map operations
> + *
> + * Returns:
> + * Pointer to the GPU SVM device page map operations structure.
> + */
> +const struct dev_pagemap_ops
> *drm_gpusvm_pagemap_ops_get(void)
> +{
> +	return &drm_gpusvm_pagemap_ops;
> +}
> +
> +/**
> + * drm_gpusvm_has_mapping - Check if GPU SVM has mapping for
> the given address range
> + * @gpusvm: Pointer to the GPU SVM structure.
> + * @start: Start address
> + * @end: End address
> + *
> + * Returns:
> + * True if GPU SVM has mapping, False otherwise
> + */
> +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64
> start, u64 end)
> +{
> +	struct drm_gpusvm_notifier *notifier;
> +
> +	drm_gpusvm_for_each_notifier(notifier, gpusvm, start, end)
> {
> +		struct drm_gpusvm_range *range = NULL;
> +
> +		drm_gpusvm_for_each_range(range, notifier, start,
> end)
> +			return true;
> +	}
> +
> +	return false;
> +}
> diff --git a/drivers/gpu/drm/xe/drm_gpusvm.h
> b/drivers/gpu/drm/xe/drm_gpusvm.h
> new file mode 100644
> index 000000000000..0ea70f8534a8
> --- /dev/null
> +++ b/drivers/gpu/drm/xe/drm_gpusvm.h
> @@ -0,0 +1,415 @@
> +/* SPDX-License-Identifier: MIT */
> +/*
> + * Copyright © 2024 Intel Corporation
> + */
> +
> +#ifndef __DRM_GPUSVM_H__
> +#define __DRM_GPUSVM_H__
> +
> +#include <linux/kref.h>
> +#include <linux/mmu_notifier.h>
> +#include <linux/workqueue.h>
> +
> +struct dev_pagemap_ops;
> +struct drm_device;
> +struct drm_gpusvm;
> +struct drm_gpusvm_notifier;
> +struct drm_gpusvm_ops;
> +struct drm_gpusvm_range;
> +
> +/**
> + * struct drm_gpusvm_ops - Operations structure for GPU SVM
> + *
> + * This structure defines the operations for GPU Shared Virtual
> Memory (SVM).
> + * These operations are provided by the GPU driver to manage SVM
> ranges and
> + * perform operations such as migration between VRAM and system
> RAM.
> + */
> +struct drm_gpusvm_ops {
> +	/**
> +	 * @notifier_alloc: Allocate a GPU SVM notifier (optional)
> +	 *
> +	 * This function shall allocate a GPU SVM notifier.
> +	 *
> +	 * Returns:
> +	 * Pointer to the allocated GPU SVM notifier on success, NULL
> on failure.
> +	 */
> +	struct drm_gpusvm_notifier *(*notifier_alloc)(void);
> +
> +	/**
> +	 * @notifier_free: Free a GPU SVM notifier (optional)
> +	 * @notifier: Pointer to the GPU SVM notifier to be freed
> +	 *
> +	 * This function shall free a GPU SVM notifier.
> +	 */
> +	void (*notifier_free)(struct drm_gpusvm_notifier *notifier);
> +
> +	/**
> +	 * @range_alloc: Allocate a GPU SVM range (optional)
> +	 * @gpusvm: Pointer to the GPU SVM
> +	 *
> +	 * This function shall allocate a GPU SVM range.
> +	 *
> +	 * Returns:
> +	 * Pointer to the allocated GPU SVM range on success, NULL
> on failure.
> +	 */
> +	struct drm_gpusvm_range *(*range_alloc)(struct
> drm_gpusvm *gpusvm);
> +
> +	/**
> +	 * @range_free: Free a GPU SVM range (optional)
> +	 * @range: Pointer to the GPU SVM range to be freed
> +	 *
> +	 * This function shall free a GPU SVM range.
> +	 */
> +	void (*range_free)(struct drm_gpusvm_range *range);
> +
> +	/**
> +	 * @vram_release: Release VRAM allocation (optional)
> +	 * @vram_allocation: Driver-private pointer to the VRAM
> allocation
> +	 *
> +	 * This function shall release VRAM allocation and expects to
> drop a
> +	 * reference to VRAM allocation.
> +	 */
> +	void (*vram_release)(void *vram_allocation);
> +
> +	/**
> +	 * @populate_vram_pfn: Populate VRAM PFN (required for
> migration)
> +	 * @gpusvm: Pointer to the GPU SVM
> +	 * @vram_allocation: Driver-private pointer to the VRAM
> allocation
> +	 * @npages: Number of pages to populate
> +	 * @pfn: Array of page frame numbers to populate
> +	 *
> +	 * This function shall populate VRAM page frame numbers
> (PFN).
> +	 *
> +	 * Returns:
> +	 * 0 on success, a negative error code on failure.
> +	 */
> +	int (*populate_vram_pfn)(struct drm_gpusvm *gpusvm,
> +				 void *vram_allocation,
> +				 unsigned long npages,
> +				 unsigned long *pfn);
> +
> +	/**
> +	 * @copy_to_vram: Copy to VRAM (required for migration)
> +	 * @gpusvm: Pointer to the GPU SVM
> +	 * @pages: Pointer to array of VRAM pages (destination)
> +	 * @dma_addr: Pointer to array of DMA addresses (source)
> +	 * @npages: Number of pages to copy
> +	 *
> +	 * This function shall copy pages to VRAM.
> +	 *
> +	 * Returns:
> +	 * 0 on success, a negative error code on failure.
> +	 */
> +	int (*copy_to_vram)(struct drm_gpusvm *gpusvm,
> +			    struct page **pages,
> +			    dma_addr_t *dma_addr,
> +			    unsigned long npages);
> +
> +	/**
> +	 * @copy_to_sram: Copy to system RAM (required for
> migration)
> +	 * @gpusvm: Pointer to the GPU SVM
> +	 * @pages: Pointer to array of VRAM pages (source)
> +	 * @dma_addr: Pointer to array of DMA addresses
> (destination)
> +	 * @npages: Number of pages to copy
> +	 *
> +	 * This function shall copy pages to system RAM.
> +	 *
> +	 * Returns:
> +	 * 0 on success, a negative error code on failure.
> +	 */
> +	int (*copy_to_sram)(struct drm_gpusvm *gpusvm,
> +			    struct page **pages,
> +			    dma_addr_t *dma_addr,
> +			    unsigned long npages);
> +
> +	/**
> +	 * @invalidate: Invalidate GPU SVM notifier (required)
> +	 * @gpusvm: Pointer to the GPU SVM
> +	 * @notifier: Pointer to the GPU SVM notifier
> +	 * @mmu_range: Pointer to the mmu_notifier_range
> structure
> +	 *
> +	 * This function shall invalidate the GPU page tables. It can
> safely
> +	 * walk the notifier range RB tree/list in this function. Called
> while
> +	 * holding the notifier lock.
> +	 */
> +	void (*invalidate)(struct drm_gpusvm *gpusvm,
> +			   struct drm_gpusvm_notifier *notifier,
> +			   const struct mmu_notifier_range
> *mmu_range);
> +};
> +
> +/**
> + * struct drm_gpusvm_notifier - Structure representing a GPU SVM
> notifier
> + *
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: MMU interval notifier
> + * @interval: Interval for the notifier
> + * @rb: Red-black tree node for the parent GPU SVM structure
> notifier tree
> + * @root: Cached root node of the RB tree containing ranges
> + * @range_list: List head containing of ranges in the same order they
> appear in
> + *              interval tree. This is useful to keep iterating ranges while
> + *              doing modifications to RB tree.
> + * @flags.removed: Flag indicating whether the MMU interval
> notifier has been
> + *                 removed
> + *
> + * This structure represents a GPU SVM notifier.
> + */
> +struct drm_gpusvm_notifier {
> +	struct drm_gpusvm *gpusvm;
> +	struct mmu_interval_notifier notifier;
> +	struct {
> +		u64 start;
> +		u64 end;
> +	} interval;
> +	struct {
> +		struct rb_node node;
> +		struct list_head entry;
> +		u64 __subtree_last;
> +	} rb;
> +	struct rb_root_cached root;
> +	struct list_head range_list;
> +	struct {
> +		u32 removed : 1;
> +	} flags;
> +};
> +
> +/**
> + * struct drm_gpusvm_range - Structure representing a GPU SVM
> range
> + *
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: Pointer to the GPU SVM notifier
> + * @refcount: Reference count for the range
> + * @rb: Red-black tree node for the parent GPU SVM notifier
> structure range tree
> + * @va: Virtual address range
> + * @notifier_seq: Notifier sequence number of the range's pages
> + * @pages: Pointer to the array of pages (if backing store is in VRAM)
> + * @dma_addr: DMA address array (if backing store is SRAM and
> DMA mapped)
> + * @vram_allocation: Driver-private pointer to the VRAM allocation
> + * @order: Order of dma mapping. i.e. PAGE_SIZE << order is
> mapping size
> + * @flags.migrate_vram: Flag indicating whether the range can be
> migrated to VRAM
> + * @flags.unmapped: Flag indicating if the range has been
> unmapped
> + * @flags.partial_unmap: Flag indicating if the range has been
> partially unmapped
> + * @flags.has_vram_pages: Flag indicating if the range has vram
> pages
> + * @flags.has_dma_mapping: Flag indicating if the range has a DMA
> mapping
> + * @flags.kfree_mapping: Flag indicating @dma_addr is a compact
> allocation based
> + *                       on @order which releases via kfree
> + *
> + * This structure represents a GPU SVM range used for tracking
> memory ranges
> + * mapped in a DRM device.
> + */
> +struct drm_gpusvm_range {
> +	struct drm_gpusvm *gpusvm;
> +	struct drm_gpusvm_notifier *notifier;
> +	struct kref refcount;
> +	struct {
> +		struct rb_node node;
> +		struct list_head entry;
> +		u64 __subtree_last;
> +	} rb;
> +	struct {
> +		u64 start;
> +		u64 end;
> +	} va;
> +	unsigned long notifier_seq;
> +	union {
> +		struct page **pages;
> +		dma_addr_t *dma_addr;
> +	};
> +	void *vram_allocation;
> +	u16 order;
> +	struct {
> +		/* All flags below must be set upon creation */
> +		u16 migrate_vram : 1;
> +		/* All flags below must be set / cleared under notifier
> lock */
> +		u16 unmapped : 1;
> +		u16 partial_unmap : 1;
> +		u16 has_vram_pages : 1;
> +		u16 has_dma_mapping : 1;
> +		u16 kfree_mapping : 1;
> +	} flags;
> +};
> +
> +/**
> + * struct drm_gpusvm - GPU SVM structure
> + *
> + * @name: Name of the GPU SVM
> + * @drm: Pointer to the DRM device structure
> + * @mm: Pointer to the mm_struct for the address space
> + * @device_private_page_owner: Device private pages owner
> + * @mm_start: Start address of GPU SVM
> + * @mm_range: Range of the GPU SVM
> + * @notifier_size: Size of individual notifiers
> + * @ops: Pointer to the operations structure for GPU SVM
> + * @chunk_sizes: Pointer to the array of chunk sizes used in range
> allocation.
> + *               Entries should be powers of 2 in descending order.
> + * @num_chunks: Number of chunks
> + * @notifier_lock: Read-write semaphore for protecting notifier
> operations
> + * @zdd_wq: Workqueue for deferred work on zdd destruction
> + * @root: Cached root node of the Red-Black tree containing GPU
> SVM notifiers
> + * @notifier_list: list head containing of notifiers in the same order
> they
> + *                 appear in interval tree. This is useful to keep iterating
> + *                 notifiers while doing modifications to RB tree.
> + *
> + * This structure represents a GPU SVM (Shared Virtual Memory)
> used for tracking
> + * memory ranges mapped in a DRM (Direct Rendering Manager)
> device.
> + *
> + * No reference counting is provided, as this is expected to be
> embedded in the
> + * driver VM structure along with the struct drm_gpuvm, which
> handles reference
> + * counting.
> + */
> +struct drm_gpusvm {
> +	const char *name;
> +	struct drm_device *drm;
> +	struct mm_struct *mm;
> +	void *device_private_page_owner;
> +	u64 mm_start;
> +	u64 mm_range;
> +	u64 notifier_size;
> +	const struct drm_gpusvm_ops *ops;
> +	const u64 *chunk_sizes;
> +	int num_chunks;
> +	struct rw_semaphore notifier_lock;
> +	struct workqueue_struct *zdd_wq;
> +	struct rb_root_cached root;
> +	struct list_head notifier_list;
> +};

I also think the gpusvm concept is a duplication of drm_gpuvm.
Look at the members here: mm_start, mm_range, rb_tree...

Maintaining a list of notifiers at this layer is odd. Everybody else seems
to embed the notifier in a range...

The mm field is essential for SVM though. I think what we can do is introduce
a *mm field in drm_gpuvm and introduce uAPI to allow the user to say that a
gpuvm participates in SVM. If a gpuvm participates in SVM, we set the mm field
for that gpuvm.

Another benefit of the proposed way is that multiple gpuvms can share an
address space with a single CPU mm/process.
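
Roughly what I have in mind, as a sketch only (the flag and helper names
below are invented for illustration, not a concrete uAPI proposal):

	/* called from the VM create ioctl when userspace sets an SVM flag;
	 * drm_gpuvm would gain the proposed *mm field */
	int drm_gpuvm_enable_svm(struct drm_gpuvm *gpuvm)
	{
		if (gpuvm->mm)
			return -EALREADY;

		mmgrab(current->mm);	/* keep the mm_struct alive */
		gpuvm->mm = current->mm;
		return 0;
	}

Multiple gpuvms created by the same process would then naturally end up
sharing the same gpuvm->mm.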


Oak


> +
> +/**
> + * struct drm_gpusvm_ctx - DRM GPU SVM context
> + *
> + * @mmap_locked: mmap lock is locked
> + * @trylock_mmap: trylock mmap lock, used to avoid locking
> inversions
> + *                (e.g.dma-revs -> mmap lock)
> + * @in_notifier: entering from a MMU notifier
> + * @read_only: operating on read-only memory
> + * @vram_possible: possible to use VRAM
> + * @prefault: prefault pages
> + *
> + * Context that is DRM GPUSVM is operating in (i.e. user arguments).
> + */
> +struct drm_gpusvm_ctx {
> +	u32 mmap_locked :1;
> +	u32 trylock_mmap :1;
> +	u32 in_notifier :1;
> +	u32 read_only :1;
> +	u32 vram_possible :1;
> +	u32 prefault :1;
> +};
> +
> +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> +		    const char *name, struct drm_device *drm,
> +		    struct mm_struct *mm, void
> *device_private_page_owner,
> +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> +		    const struct drm_gpusvm_ops *ops,
> +		    const u64 *chunk_sizes, int num_chunks);
> +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm);
> +void drm_gpusvm_free(struct drm_gpusvm *gpusvm);
> +
> +struct drm_gpusvm_range *
> +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm,
> u64 fault_addr,
> +				u64 gpuva_start, u64 gpuva_end,
> +				const struct drm_gpusvm_ctx *ctx);
> +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> +			     struct drm_gpusvm_range *range);
> +
> +struct drm_gpusvm_range *
> +drm_gpusvm_range_get(struct drm_gpusvm_range *range);
> +void drm_gpusvm_range_put(struct drm_gpusvm_range *range);
> +
> +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> +				  struct drm_gpusvm_range *range);
> +
> +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> +			       struct drm_gpusvm_range *range,
> +			       const struct drm_gpusvm_ctx *ctx);
> +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm
> *gpusvm,
> +				  struct drm_gpusvm_range *range,
> +				  const struct drm_gpusvm_ctx *ctx);
> +
> +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> +			       struct drm_gpusvm_range *range,
> +			       void *vram_allocation,
> +			       const struct drm_gpusvm_ctx *ctx);
> +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> +			       struct drm_gpusvm_range *range,
> +			       const struct drm_gpusvm_ctx *ctx);
> +
> +const struct dev_pagemap_ops
> *drm_gpusvm_pagemap_ops_get(void);
> +
> +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64
> start, u64 end);
> +
> +struct drm_gpusvm_range *
> +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64
> start, u64 end);
> +
> +/**
> + * drm_gpusvm_notifier_lock - Lock GPU SVM notifier
> + * @gpusvm__: Pointer to the GPU SVM structure.
> + *
> + * Abstract client usage GPU SVM notifier lock, take lock
> + */
> +#define drm_gpusvm_notifier_lock(gpusvm__)	\
> +	down_read(&(gpusvm__)->notifier_lock)
> +
> +/**
> + * drm_gpusvm_notifier_unlock - Unlock GPU SVM notifier
> + * @gpusvm__: Pointer to the GPU SVM structure.
> + *
> + * Abstract client usage GPU SVM notifier lock, drop lock
> + */
> +#define drm_gpusvm_notifier_unlock(gpusvm__)	\
> +	up_read(&(gpusvm__)->notifier_lock)
> +
> +/**
> + * __drm_gpusvm_range_next - Get the next GPU SVM range in the
> list
> + * @range: a pointer to the current GPU SVM range
> + *
> + * Return: A pointer to the next drm_gpusvm_range if available, or
> NULL if the
> + *         current range is the last one or if the input range is NULL.
> + */
> +static inline struct drm_gpusvm_range *
> +__drm_gpusvm_range_next(struct drm_gpusvm_range *range)
> +{
> +	if (range && !list_is_last(&range->rb.entry,
> +				   &range->notifier->range_list))
> +		return list_next_entry(range, rb.entry);
> +
> +	return NULL;
> +}
> +
> +/**
> + * drm_gpusvm_for_each_range - Iterate over GPU SVM ranges in a
> notifier
> + * @range__: Iterator variable for the ranges. If set, it indicates the
> start of
> + *	     the iterator. If NULL, call drm_gpusvm_range_find() to get
> the range.
> + * @notifier__: Pointer to the GPU SVM notifier
> + * @start__: Start address of the range
> + * @end__: End address of the range
> + *
> + * This macro is used to iterate over GPU SVM ranges in a notifier. It
> is safe
> + * to use while holding the driver SVM lock or the notifier lock.
> + */
> +#define drm_gpusvm_for_each_range(range__, notifier__, start__,
> end__)	\
> +	for ((range__) = (range__) ?:
> 	\
> +	     drm_gpusvm_range_find((notifier__), (start__), (end__));
> 	\
> +	     (range__) && (range__->va.start < (end__));
> 	\
> +	     (range__) = __drm_gpusvm_range_next(range__))
> +
> +/**
> + * drm_gpusvm_range_set_unmapped - Mark a GPU SVM range as
> unmapped
> + * @range: Pointer to the GPU SVM range structure.
> + * @mmu_range: Pointer to the MMU notifier range structure.
> + *
> + * This function marks a GPU SVM range as unmapped and sets the
> partial_unmap flag
> + * if the range partially falls within the provided MMU notifier range.
> + */
> +static inline void
> +drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range
> *range,
> +			      const struct mmu_notifier_range
> *mmu_range)
> +{
> +	lockdep_assert_held_write(&range->gpusvm->notifier_lock);
> +
> +	range->flags.unmapped = true;
> +	if (range->va.start < mmu_range->start ||
> +	    range->va.end > mmu_range->end)
> +		range->flags.partial_unmap = true;
> +}
> +
> +#endif /* __DRM_GPUSVM_H__ */
> --
> 2.34.1
Matthew Brost Sept. 11, 2024, 4:06 p.m. UTC | #34
On Mon, Sep 02, 2024 at 05:03:20PM +0000, Matthew Brost wrote:
> On Mon, Sep 02, 2024 at 01:53:14PM +0200, Daniel Vetter wrote:
> > On Thu, Aug 29, 2024 at 05:27:13PM +0000, Matthew Brost wrote:
> > > On Thu, Aug 29, 2024 at 11:45:08AM +0200, Daniel Vetter wrote:
> > > > On Tue, Aug 27, 2024 at 07:48:38PM -0700, Matthew Brost wrote:
> > > > > This patch introduces support for GPU Shared Virtual Memory (SVM) in the
> > > > > Direct Rendering Manager (DRM) subsystem. SVM allows for seamless
> > > > > sharing of memory between the CPU and GPU, enhancing performance and
> > > > > flexibility in GPU computing tasks.
> > > > > 
> > > > > The patch adds the necessary infrastructure for SVM, including data
> > > > > structures and functions for managing SVM ranges and notifiers. It also
> > > > > provides mechanisms for allocating, deallocating, and migrating memory
> > > > > regions between system RAM and GPU VRAM.
> > > > > 
> > > > > This mid-layer is largely inspired by GPUVM.
> > > > > 
> > > > > Cc: Dave Airlie <airlied@redhat.com>
> > > > > Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > > > > Cc: Christian König <christian.koenig@amd.com>
> > > > > Cc: <dri-devel@lists.freedesktop.org>
> > > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > > 
> > > > Still not sure I've got the right race that you paper over with
> > > > mmap_write_lock, but I spotted a few things, commments inline.
> > > > 
> > > 
> > > I've replied to this issue several times, let's table the
> > > mmap_write_lock issue in this reply - a lot of other things to get
> > > through. Current thinking is try to add a range->migrate_lock like AMD
> > > which I state here [1]. Let's continue discussing the mmap lock issue
> > > there if possible.
> > 
> > Yeah I wrote replies as I read code, so there's a bit a mess from my side
> > here. Apologies for that.
> > 
> 
> All good, has been quite helpful thus far.
> 
> > > [1] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1111169
> > 
> > Some more replies below that I think we haven't covered anywhere else yet.
> > 
> > > > > + * 2) Garbage Collector.
> > > > > + *
> > > > > + *	void __driver_garbage_collector(struct drm_gpusvm *gpusvm,
> > > > > + *					struct drm_gpusvm_range *range)
> > > > > + *	{
> > > > > + *		struct drm_gpusvm_ctx ctx = {};
> > > > > + *
> > > > > + *		assert_driver_svm_locked(gpusvm);
> > > > > + *
> > > > > + *		// Partial unmap, migrate any remaining VRAM pages back to SRAM
> > > > > + *		if (range->flags.partial_unmap)
> > > > > + *			drm_gpusvm_migrate_to_sram(gpusvm, range, &ctx);
> > > > 
> > > > Note that the migration back to sram isn't guaranteed to succeed, so you
> > > > might be still stuck with partially migrated range. This might be a case
> > > > where hmm gives you vram pfns, but the range you have doesn't have any
> > > > vram allocation anymore because you droppped it here. Not sure tbh.
> > > >
> > > 
> > > Hmm isn't the picture here nor will a VMA once the
> > > drm_gpusvm_evict_to_sram path is always taken as discussed here [2]. I
> > > might have a corner case BO refcounting / TTM resource lookup bug in
> > > somewhere in here which needs to be resolved though (e.g. eviction
> > > racing with this code path), will try to close on that.
> > > 
> > > [2] https://patchwork.freedesktop.org/patch/610955/?series=137870&rev=1#comment_1111164
> > 
> > So maybe my understanding is wrong, but from my reading of the device
> > migration code the exact same non-guarantees as for the sram2sram
> > migration code apply:
> > 
> > - There's no guarantee the page/folio doesn't have an elevated refcount,
> >   which makes the migration fail (in try_to_migrate, where it checks for
> >   surplus refcounts).
> > 
> > - There's no guarantee you'll get the page/folio lock, which makes the
> >   migration fail. Worse the core mm seems to use a fallback to per-page
> >   locking as it's extremely crude "get out of deadlocks due to acquiring
> >   multiple page locks" card.
> >
> 
> I think this circles back to basically the design must be able to move
> VRAM -> SRAM because the host can't access VRAM. Certainly in the CPU
> page fault path this can't fail on the fauling page at least or if it
> does the app gets segfaulted. I'll investigate more here but that is
> still my current thinking. If VRAM -> SRAM can fail / make partial
> progress in eviction paths, then mixed mappings likely need to be
> supported which shouldn't be all that painful - basically just need
> cursor in the bind code which can walk mixed mappings.
> 
> SRAM -> VRAM certainly can fail which is handled by just aborting the
> migration.
> 
> > > > > +map_pages:
> > > > > +	if (is_device_private_page(hmm_pfn_to_page(pfns[0]))) {
> > > > > +		WARN_ON_ONCE(!range->vram_allocation);
> > > > > +
> > > > > +		for (i = 0; i < npages; ++i) {
> > > > > +			pages[i] = hmm_pfn_to_page(pfns[i]);
> > > > > +
> > > > > +			if (WARN_ON_ONCE(!is_device_private_page(pages[i]))) {
> > > > > +				err = -EOPNOTSUPP;
> > > > > +				goto err_free;
> > > > > +			}
> > > > > +		}
> > > > 
> > > > You can't do the above, because the pfn you get from hmm come with zero
> > > > guarantees, you neither hold a page reference nor the page lock. The only
> > > > thing you can do is grab the pagetable lock (or mmu notifier locks) and
> > > > check it's still valid, before you can touch any state. I think the
> > > > range->vram_allocation is probably always valid since you clean that up
> > > > under the same lock/thread, but there's good chances the vram allocation
> > > > is otherwise already gone for good. Or you get an inconsistent snapshot.
> > > > 
> > > 
> > > I haven't seen this pop in my testing yet which is fairly thorough. My
> > > thinking was migration always being enforced at range grainularity we'd
> > > never get mixed mappings from the core as migration is completely under
> > > control of the driver. Maybe I'm not understanding what you are saying
> > > here...
> > 
> > So one scenario is that you race (without the mmap write lock or the
> > migration_mutex design ofc) with another invalidate, and get a partial
> > view here of mixed vram and sram pages. Until you acquire the mmu notifier
> > lock and have made sure your pages are still valid, there's essentially no
> > guarantee.
> 
> The pages are collected in notifier stable state via the hmm locking +
> seqno begin and recheck. Before they can used (e.g. program a bind) yes
> the notifier lock needs to be taken to ensure they haven't changed
> between collection and used - at least this my understanding.
> 
> > > 
> > > > > +
> > > > > +		/* Do not race with notifier unmapping pages */
> > > > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > > > +		range->flags.has_vram_pages = true;
> > > > > +		range->pages = pages;
> > > > > +		if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
> > > > > +			err = -EAGAIN;
> > > > > +			__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > > > +		}
> > > > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > > > +	} else {
> > > > > +		dma_addr_t *dma_addr = (dma_addr_t *)pfns;
> > > > > +
> > > > > +		for_each_dma_page(i, j, npages, order) {
> > > > > +			if (WARN_ON_ONCE(i && order !=
> > > > > +					 hmm_pfn_to_map_order(pfns[i]))) {
> > > > > +				err = -EOPNOTSUPP;
> > > > > +				npages = i;
> > > > > +				goto err_unmap;
> > > > > +			}
> > > > > +			order = hmm_pfn_to_map_order(pfns[i]);
> > > > > +
> > > > > +			pages[j] = hmm_pfn_to_page(pfns[i]);
> > > > > +			if (WARN_ON_ONCE(is_zone_device_page(pages[j]))) {
> > > > > +				err = -EOPNOTSUPP;
> > > > > +				npages = i;
> > > > > +				goto err_unmap;
> > > > > +			}
> > > > > +
> > > > > +			set_page_dirty_lock(pages[j]);
> > > > > +			mark_page_accessed(pages[j]);
> > > > 
> > > > You can't do these, because you don't hold a page reference. They're also
> > > > not needed because hmm_range_fault goes thorugh the full mkwrite dance,
> > > > which takes care of these, unlike the gup family of functions.
> > > >
> > > 
> > > This is a left over from our existing userpte code and it does appear to
> > > be incorrect. Let me remove this and fixup our userptr code while I'm at
> > > it.
> > 
> > Ack.
> > 
> > > > > +	vas = vma_lookup(mm, start);
> > > > > +	if (!vas) {
> > > > > +		err = -ENOENT;
> > > > > +		goto err_mmunlock;
> > > > > +	}
> > > > > +
> > > > > +	if (end > vas->vm_end || start < vas->vm_start) {
> > > > > +		err = -EINVAL;
> > > > > +		goto err_mmunlock;
> > > > > +	}
> > > > > +
> > > > > +	if (!vma_is_anonymous(vas)) {
> > > > > +		err = -EBUSY;
> > > > > +		goto err_mmunlock;
> > > > > +	}
> > > > > +
> > > > > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
> > > > > +		       sizeof(*pages), GFP_KERNEL);
> > > > > +	if (!buf) {
> > > > > +		err = -ENOMEM;
> > > > > +		goto err_mmunlock;
> > > > > +	}
> > > > > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > > > > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
> > > > > +
> > > > > +	zdd = drm_gpusvm_zdd_alloc(range);
> > > > > +	if (!zdd) {
> > > > > +		err = -ENOMEM;
> > > > > +		goto err_free;
> > > > > +	}
> > > > > +
> > > > > +	migrate.vma = vas;
> > > > > +	migrate.src = buf;
> > > > > +	migrate.dst = migrate.src + npages;
> > > > > +
> > > > > +	err = migrate_vma_setup(&migrate);
> > > > > +	if (err)
> > > > > +		goto err_free;
> > > > > +
> > > > > +	/*
> > > > > +	 * FIXME: Below cases, !migrate.cpages and migrate.cpages != npages, not
> > > > > +	 * always an error. Need to revisit possible cases and how to handle. We
> > > > > +	 * could prefault on migrate.cpages != npages via hmm_range_fault.
> > > 
> > > This is a bit stale, can update this comment.
> > > 
> > > > > +	 */
> > > > 
> > > > Yeah I think especially under contention partial migrations, at least back
> > > > to sram due to cpu faults, are pretty much expected. And you need to cope
> > > > somehow.
> > > > 
> > > 
> > > I have seen these pop if the IGT calls mlock on the memory. My thinking
> > > is migration to VRAM is basically optional and fallback to leaving range
> > > in SRAM if an error occurs rather than doing a partial migration. This
> > > is what currently happens so it is coped with.
> > > 
> > > If the memory is marked as must be in VRAM (NIY), well then the user
> > > program has done something wrong and can kill the app (akin to
> > > segfault).
> > 
> > Yeah SIGBUS for "must be in VRAM" sounds like ok semantics.
> > 
> > > > > +
> > > > > +	if (!migrate.cpages) {
> > > > > +		err = -EFAULT;
> > > > > +		goto err_free;
> > > > > +	}
> > > > > +
> > > > > +	if (migrate.cpages != npages) {
> > > > > +		err = -EBUSY;
> > > > > +		goto err_finalize;
> > > > > +	}
> > 
> > What I think is more fundamental is that I think this one here doesn't
> > work. For migrate_to_ram you cannot assume that you can always migrate the
> > entire block, I think to uphold the core mm forward progress rules we need
> > to allow partial migrations there. And I think your current code allows
> > that.
> >
> 
> Yes. I had similar checks in migrate_to_ram at one point and that did
> not work when multiple CPU faults from different threads occured in
> parallel. Each thread can grab a random set of VRAM pages to migrate I
> think.
>  
> > But that then means you also are stuck with partial migration state here.
> > That was the point I tried to make.
> >
> 
> The error path with migrate_vma_pages/finalize safely unwinds the
> migration in these cases leaving all pages in SRAM.
> 
> > > > > +/**
> > > > > + * __drm_gpusvm_migrate_to_sram - Migrate GPU SVM range to SRAM (internal)
> > > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > > + * @vas: Pointer to the VM area structure
> > > > > + * @page: Pointer to the page for fault handling (can be NULL)
> > > > > + * @start: Start address of the migration range
> > > > > + * @end: End address of the migration range
> > > > > + *
> > > > > + * This internal function performs the migration of the specified GPU SVM range
> > > > > + * to SRAM. It sets up the migration, populates + dma maps SRAM PFNs, and
> > > > > + * invokes the driver-specific operations for migration to SRAM.
> > > > > + *
> > > > > + * Returns:
> > > > > + * 0 on success, negative error code on failure.
> > > > > + */
> > > > > +static int __drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > > > +					struct vm_area_struct *vas,
> > > > > +					struct page *page,
> > > > > +					u64 start, u64 end)
> > > > > +{
> > > > > +	struct migrate_vma migrate = {
> > > > > +		.vma		= vas,
> > > > > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > > > > +		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
> > > > > +		.fault_page	= page,
> > > > > +	};
> > > > > +	unsigned long npages;
> > > > > +	struct page **pages;
> > > > > +	dma_addr_t *dma_addr;
> > > > > +	void *buf;
> > > > > +	int i, err = 0;
> > > > > +
> > > > > +	mmap_assert_locked(gpusvm->mm);
> > > > 
> > > > That's the wrong mm, at least for the ->migrate_to_ram path. You might be
> > > > called on a anon mapping from a child process. That also means that the
> > > > vma you're looking at might have no relationship with anythign you're
> > > > tracking in your gpusvm.
> > > >
> > > 
> > > Hmm, as discussed [3] I haven't added tests with child processes yet.
> > > Let me do that and update the design as needed. This likely isn't
> > > correct as you say.
> > > 
> > > [3] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1111169 
> > 
> > Ack. More tests should definitely help here to figure out what's up, and
> > what's just me being confused.
> > 
> 
> Starting to add tests this fork() appears to work after dropping these
> asserts. More thorough testing is needed though.
> 
> > > > > +/**
> > > > > + * drm_gpusvm_migrate_to_ram - Migrate GPU SVM range to RAM (page fault handler)
> > > > > + * @vmf: Pointer to the fault information structure
> > > > > + *
> > > > > + * This function is a page fault handler used to migrate a GPU SVM range to RAM.
> > > > > + * It retrieves the GPU SVM range information from the faulting page and invokes
> > > > > + * the internal migration function to migrate the range back to RAM.
> > > > > + *
> > > > > + * Returns:
> > > > > + * VM_FAULT_SIGBUS on failure, 0 on success.
> > > > > + */
> > > > > +static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault *vmf)
> > > > > +{
> > > > > +	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
> > > > > +	int err;
> > > > > +
> > > > > +	err = __drm_gpusvm_migrate_to_sram(zdd->range->gpusvm,
> > > > 
> > > > So I think zdd->range doesn't work, because even within a single mm the
> > > > vma mapping a given piece of anon memory does not need to be unique, you
> > > > can duplicate them with mremap.
> > > > 
> > > 
> > > This is attached to a page, not a VMA. Both AMD and Nvidia drivers use a
> > > similar lookup mechanism.
> > 
> > Yeah the page->zone_device_data is fine. It's the zone_device_rage->range
> > which I think isn't ok.
> > 
> 
> Yes, this gets a little confusing with fork() and mremap. The range's
> start / end can be nonsense in the remap case. Also as you mention a
> range->migrate_mutex doesn't seem correct either. I can make it work but
> maybe not worth even typing out why here (I can provide a little more
> detail in another reply). New thinking is zdd stores a size field and
> has the locking - I think is akin to a VRAM folio then.
> 
> > > > So all you have here is the physical memory and the vma, which might or
> > > > might not be from the same process as gpusvm->mm.
> > > > 
> > > > Also the child process scenario means you using mmap_write on the fault
> > > > side doesn't stop all cpu faults migrating stuff back.
> > > > 
> > > > Somewhat aside, but I think that means amdkfd's svm_range->migration_mutex
> > > > is busted, because it's va based and so misses concurrently ongoing
> > > > different mappings moving physical storage around underneath.
> > > >
> > > 
> > > I think all of the above which falls into the fork() + child process
> > > issues which you have raise. Until I test this out I can't speak to this
> > > any level of confidence so I won't. Thanks for raising this issue and
> > > let me write test cases as discussed and educate myself. Once I do that,
> > > we can engage in further discussions.
> > 
> > I think fork + childs will still result in zdd->range being unique (albeit
> > confused about which mm). You need mremap of some of these mappings to
> 
> Agree for fork + child based on initial testing.
> 
> > change the addresses and really cause confusion, which I /think/ (but
> > didn't test) is doable with a single process even and duplicating anon
> 
> Yep, remap changes the address so range is confusing and really size is
> sufficient aligning within VMA's start / end upon CPU fault. AMD does
> this but with a VMA search which I think is a bit overkill.
> 

Sima gave me a few things to investigate over the past week or so and asked
me to write up my findings and share them with the list. I'm replying here
because this seems as good a place as any.

A. Investigate possible livelock with do_swap_page taking a device page
reference and folio_migrate_mapping aborting in migrate_vma_* if
multiple references are held.

	Sima was correct in identifying this livelock. I was able to reproduce a
	stable livelock with a test where multiple CPU threads faulted the same
	device page in parallel, and an exclusive lock was taken in
	migrate_to_ram. Without an exclusive lock, forward progress is made, but
	on average, there were ~32k calls to migrate_to_ram before a thread
	succeeded. This issue appears to affect all implementations that use
	device pages.

	I have posted a patch with Sima’s suggested core MM fix on the list [1]
	and verified in the local Xe branch that this patch resolves the
	livelock and reduces multiple calls to migrate_to_ram on the same
	faulting page. It would be helpful to get AMD's input and testing on
	this patch.
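
	The reproducer boils down to the shape below (userspace side only; the
	IGT plumbing and the GPU-side migration of buf to VRAM are omitted,
	and none of the names here are the actual IGT code):

		#include <pthread.h>
		#include <stdint.h>

		#define NR_THREADS 8

		static void *touch(void *arg)
		{
			/* CPU read fault on a page currently resident in VRAM */
			return (void *)(uintptr_t)*(volatile char *)arg;
		}

		static void fault_same_page_in_parallel(char *buf)
		{
			pthread_t threads[NR_THREADS];
			int i;

			for (i = 0; i < NR_THREADS; ++i)
				pthread_create(&threads[i], NULL, touch, buf);
			for (i = 0; i < NR_THREADS; ++i)
				pthread_join(threads[i], NULL);
		}

	Every thread ends up in do_swap_page() holding a reference on the same
	device page, so each migration attempt in migrate_to_ram sees the
	surplus references and aborts, which becomes the livelock above once
	the attempts are serialized by an exclusive lock.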

B. Test out fork

	I added a few test sections to my IGT.

	This basically worked due to the COW (Copy-On-Write) semantics of fork.
	Both the parent and child processes fault on their first CPU access,
	getting their own new copy of any memory allocated before the fork.

	I believe the only change needed was dropping a lockdep assert in
	migrate_to_ram, as the MM can change.

C. MMAP shared with anonymous memory

	I found that this is actually not anonymous memory but rather
	shmem-backed [2] [3]. Only anonymous memory is available for migration,
	so the corner cases discussed earlier around multiple CPU mappings of
	device pages do not exist here.
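
	For reference, the mapping type in question is the MAP_SHARED |
	MAP_ANONYMOUS flavour, which ends up shmem-backed rather than anon,
	roughly:

		#include <sys/mman.h>

		/* shmem-backed (shmem_zero_setup()), so vma_is_anonymous() is
		 * false and the migrate-to-VRAM path quoted earlier in this
		 * thread bails out with -EBUSY before any device pages are
		 * involved */
		static void *map_shared_anon(size_t size)
		{
			return mmap(NULL, size, PROT_READ | PROT_WRITE,
				    MAP_SHARED | MAP_ANONYMOUS, -1, 0);
		}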

D. MREMAP Behavior

	I added a few test sections to my IGT to explore possible MREMAP cases.

	In all cases (e.g., DONTUNMAP, DONTUNMAP with read-only...), the old
	memory generates an MMU_NOTIFY_UNMAP event. This aligns well with the
	design, as the old range is simply unmapped. Subsequent CPU or GPU
	access to the old memory has zero-fill semantics.

	The new memory can point to previously allocated device pages. With a
	simple update to the design, the next GPU fault can find these pages and
	map them.

	MREMAP did reveal some issues with zdd (device page zone_device_data)
	pointing to a range. It was pointed out that having something physical
	pointing to something virtual is nonsensical. I have fixed this locally
	and agree that all references from physical to virtual will be removed
	in the common layer.
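
	The sections boil down to variations of the following (illustrative
	only, not the exact IGT code):

		#define _GNU_SOURCE
		#include <sys/mman.h>

		/* The old range gets the MMU_NOTIFY_UNMAP event mentioned
		 * above; the new address can still be backed by the
		 * previously allocated device pages. */
		static void *remap_dontunmap(void *old, size_t size)
		{
			return mremap(old, size, size,
				      MREMAP_MAYMOVE | MREMAP_DONTUNMAP);
		}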

E. Locking issues

	Sima strongly suggested not inventing locks for migration to avoid
	races, but rather to accept the core MM races. I removed all locks
	except for the existing Xe locks and eliminated mmap write abuse. With a
	more robust retry loop in the GPU page fault handler, I was able to
	successfully avoid mixed mappings. Whether mixed mappings will be
	supported is a different topic, but in my opinion, this demonstrates
	that a design can work with minimal locking. This will be the design
	moving forward.
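
	For reference, the retry loop is shaped roughly like this (only the
	drm_gpusvm_* calls are from the series; the function name, the bind
	helper and the ERR_PTR return convention are placeholders, and error
	handling is trimmed):

		static int gpu_fault_handler(struct drm_gpusvm *gpusvm,
					     u64 fault_addr, u64 gpuva_start,
					     u64 gpuva_end,
					     const struct drm_gpusvm_ctx *ctx)
		{
			struct drm_gpusvm_range *range;
			int err;

		retry:
			range = drm_gpusvm_range_find_or_insert(gpusvm,
								fault_addr,
								gpuva_start,
								gpuva_end, ctx);
			if (IS_ERR(range))
				return PTR_ERR(range);

			/* Migration to VRAM is best effort; if it fails the
			 * range simply stays in SRAM, so no mixed mappings are
			 * created. */

			err = drm_gpusvm_range_get_pages(gpusvm, range, ctx);
			if (err == -EAGAIN)	/* raced with an invalidation */
				goto retry;
			if (err)
				return err;

			/* Bind rechecks the notifier seqno under the notifier
			 * lock and also returns -EAGAIN on a race. */
			err = bind_range_to_gpu(gpusvm, range);
			if (err == -EAGAIN)
				goto retry;

			return err;
		}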

Matt

[1] https://patchwork.freedesktop.org/series/138497/
[2] https://elixir.bootlin.com/linux/v6.10.7/source/mm/mmap.c#L2934
[3] https://elixir.bootlin.com/linux/v6.10.7/source/mm/shmem.c#L4941

> Matt
> 
> > memory mappings with mremap.
> > 
> > Cheers, Sima
> > -- 
> > Simona Vetter
> > Software Engineer, Intel Corporation
> > http://blog.ffwll.ch
Simona Vetter Sept. 24, 2024, 8:41 a.m. UTC | #35
On Wed, Sep 04, 2024 at 02:27:15PM +0200, Thomas Hellström wrote:
> Hi, Sima,
> 
> On Mon, 2024-09-02 at 14:33 +0200, Daniel Vetter wrote:
> > Jumping in here in the middle, since I think it's a solid place to
> > drop my
> > idea of "align with core mm" gpusvm locking ...
> > 
> > On Thu, Aug 29, 2024 at 08:56:23PM +0000, Matthew Brost wrote:
> > > On Thu, Aug 29, 2024 at 09:18:29PM +0200, Thomas Hellström wrote:
> > > Issues with removing a SVM range:
> > > 
> > > - Xe bind code stores invalidation / present state in VMA, this
> > > would
> > >   need to be moved to the radix tree. I have Jira open for that
> > > work
> > >   which I believe other developers are going to own.
> > > - Where would the dma mapping / device pages be stored?
> > > 	- In the radix tree? What if ATS is enabled? We don't have
> > > a
> > > 	  driver owned radix tree. How do we reasonably connect a
> > > driver
> > > 	  owned radix to a common GPUSVM layer?
> > 
> > Yeah this one is really annoying, because the core mm gets away with
> > nothing because it can just store the pfn in the pte. And it doesn't
> > need
> > anything else. So we probably still need something unfortuantely ...
> > 
> > > 	- In the notifier? What is the notifier is sparsely
> > > populated?
> > > 	  We would be wasting huge amounts of memory. What is the
> > > 	  notifier is configured to span the entire virtual
> > > address
> > > 	  space?
> > 
> > So if we go with the radix idea, we could model the radix to exactly
> > match
> > the gpu pagetables. That's essentially what the core mm does. Then
> > each
> > pagetable at each level has a spinlock for essentially a range lock.
> > notifier seqno would be stored into each pagetable (not the
> > endividual
> > entries, that's probably too much), which should allow us to very
> > effeciently check whether an entire arbitrary va range is still valid
> > on
> > the fault side.
> 
> I still wonder wether this should be owned by the driver, though. And
> if we were optimizing for multiple simultaneous fault processing with a
> small granularity, I would agree, but given that gpu pagefaults are
> considered so slow they should be avoided, I wonder whether xe's
> current approach of a single page-table lock wouldn't suffice, in
> addition to a semi-global seqno?
> 
> For invalidations, I think we actually currently allow simultaneous
> overlapping invalidations that are only protected by the write-side of
> the notifier seqno.

Yeah I think this is just a long-term design point: As long as the
pagetable locking is conceptually a range thing, I agree it doesn't matter
what we start out with, as long as it's somewhere on the line between a
global lock and the over-the-top scalable radix tree per-pagetable node
approach core mm has.

> > On the notifier side we can also very efficiently walk arbitrary
> > ranges,
> > because the locking is really fine-grained and in an adaptive way.
> > 
> > > - How does the garbage collector work? We can't allocate memory in
> > > the
> > >   notifier so we don't anything to add to the garbage collector. We
> > >   can't directly modify page tables given you need lock in the path
> > > of
> > >   reclaim.
> > 
> > Probably no more garbage collector, you deal with pages/folios like
> > the
> > core mm expects.
> 
> Yeah, if the page-table locks are reclaim-safe no more garbage
> collector, but OTOH, IIRC even in core-mm, the invalidation
> counterpart, unmap_mapping_range() can't and doesn't remove page-table
> subtrees when called from the address-space side, whereas zapping when
> called from the mm side, like madvise(WONTNEED), can.

Yeah we might need to mark up entirely empty pagetables and pass that up
the radix tree, so that on the next gpu bind we can zap those if needed.
Since we have the pagetables already it should be doable to add them to a
"needs garbage collecting" list of some sorts for entirely empty
pagetables, unlike the garbage collector that tosses out partial ranges
and so needs more stuff.

But also, future problem for post-merge I think.
-Sima
Simona Vetter Sept. 24, 2024, 9:25 a.m. UTC | #36
On Fri, Sep 06, 2024 at 06:41:18PM +0000, Zeng, Oak wrote:
> There are fundamental design conflicts with what we have aligned, see inline.
> 
> > -----Original Message-----
> > From: Intel-xe <intel-xe-bounces@lists.freedesktop.org> On Behalf
> > Of Matthew Brost
> > Sent: Tuesday, August 27, 2024 10:49 PM
> > To: intel-xe@lists.freedesktop.org; dri-devel@lists.freedesktop.org
> > Cc: airlied@gmail.com; christian.koenig@amd.com;
> > thomas.hellstrom@linux.intel.com; Auld, Matthew
> > <matthew.auld@intel.com>; daniel@ffwll.ch
> > Subject: [RFC PATCH 05/28] drm/gpusvm: Add support for GPU
> > Shared Virtual Memory
> > 
> > This patch introduces support for GPU Shared Virtual Memory (SVM)
> > in the
> > Direct Rendering Manager (DRM) subsystem. SVM allows for
> > seamless
> > sharing of memory between the CPU and GPU, enhancing
> > performance and
> > flexibility in GPU computing tasks.
> > 
> > The patch adds the necessary infrastructure for SVM, including data
> > structures and functions for managing SVM ranges and notifiers. It
> > also
> > provides mechanisms for allocating, deallocating, and migrating
> > memory
> > regions between system RAM and GPU VRAM.
> > 
> > This mid-layer is largely inspired by GPUVM.
> > 
> > Cc: Dave Airlie <airlied@redhat.com>
> > Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > Cc: Christian König <christian.koenig@amd.com>
> > Cc: <dri-devel@lists.freedesktop.org>
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> >  drivers/gpu/drm/xe/Makefile     |    3 +-
> >  drivers/gpu/drm/xe/drm_gpusvm.c | 2174
> > +++++++++++++++++++++++++++++++
> >  drivers/gpu/drm/xe/drm_gpusvm.h |  415 ++++++
> >  3 files changed, 2591 insertions(+), 1 deletion(-)
> >  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.c
> >  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.h
> > 
> > diff --git a/drivers/gpu/drm/xe/Makefile
> > b/drivers/gpu/drm/xe/Makefile
> > index b9670ae09a9e..b8fc2ee58f1a 100644
> > --- a/drivers/gpu/drm/xe/Makefile
> > +++ b/drivers/gpu/drm/xe/Makefile
> > @@ -25,7 +25,8 @@ $(obj)/generated/%_wa_oob.c
> > $(obj)/generated/%_wa_oob.h: $(obj)/xe_gen_wa_oob \
> > 
> >  # core driver code
> > 
> > -xe-y += xe_bb.o \
> > +xe-y += drm_gpusvm.o \
> > +	xe_bb.o \
> >  	xe_bo.o \
> >  	xe_bo_evict.o \
> >  	xe_devcoredump.o \
> > diff --git a/drivers/gpu/drm/xe/drm_gpusvm.c
> > b/drivers/gpu/drm/xe/drm_gpusvm.c
> > new file mode 100644
> > index 000000000000..fc1e44e6ae72
> > --- /dev/null
> > +++ b/drivers/gpu/drm/xe/drm_gpusvm.c
> > @@ -0,0 +1,2174 @@
> > +// SPDX-License-Identifier: MIT
> > +/*
> > + * Copyright © 2024 Intel Corporation
> > + *
> > + * Authors:
> > + *     Matthew Brost <matthew.brost@intel.com>
> > + */
> > +
> > +#include <linux/dma-mapping.h>
> > +#include <linux/interval_tree_generic.h>
> > +#include <linux/hmm.h>
> > +#include <linux/memremap.h>
> > +#include <linux/migrate.h>
> > +#include <linux/mm_types.h>
> > +#include <linux/pagemap.h>
> > +#include <linux/slab.h>
> > +
> > +#include <drm/drm_device.h>
> > +#include "drm_gpusvm.h"
> > +
> > +/**
> > + * DOC: Overview
> > + *
> > + * GPU Shared Virtual Memory (GPU SVM) layer for the Direct
> > Rendering Manager (DRM)
> > + *
> > + * The GPU SVM layer is a component of the DRM framework
> > designed to manage shared
> > + * virtual memory between the CPU and GPU. It enables efficient
> > data exchange and
> > + * processing for GPU-accelerated applications by allowing memory
> > sharing and
> > + * synchronization between the CPU's and GPU's virtual address
> > spaces.
> > + *
> > + * Key GPU SVM Components:
> > + * - Notifiers: Notifiers: Used for tracking memory intervals and
> > notifying the
> > + *		GPU of changes, notifiers are sized based on a GPU
> > SVM
> > + *		initialization parameter, with a recommendation of
> > 512M or
> > + *		larger. They maintain a Red-BlacK tree and a list of
> > ranges that
> > + *		fall within the notifier interval. Notifiers are tracked
> > within
> > + *		a GPU SVM Red-BlacK tree and list and are
> > dynamically inserted
> > + *		or removed as ranges within the interval are created
> > or
> > + *		destroyed.
> > + * - Ranges: Represent memory ranges mapped in a DRM device and
> > managed
> > + *	     by GPU SVM. 
> 
> 
> This svm_range concept has introduced a lot of code duplications in xekmd, 
> Indicating that this is a wrong design. I think one of the design principle is to
> Reuse, not to duplicate.
> 
> Look at patch 9, 11, bunch of duplicated codes to page table update, invalidate,
> And page fault handler. 
> 
> I had this range concept in v1 [1], but after we agreed to unify svm and userptr
> Codes during review, I dropped this concept, and the xe_svm concept, which ends
> Up much less duplicated codes in v2[2]. I will say more below why I thought the svm
> Concept can also be removed.
> 
> Conceptually vma represent a range. Why duplicate?

Because we cannot rely on mmap_read/write locks or vma_read/write locks
without causing headaches. They are core mm datastructures that the gpu
driver does not own, so for better or worse we have to do a bit of
duplication.

Duplication for no reason is bad, but trying to avoid necessary
duplication that's inherent to the design challenge we face is much worse.


> [1] https://patchwork.freedesktop.org/patch/574898/?series=128910&rev=1
> [2] https://patchwork.freedesktop.org/series/132229/
> 
> 
> > They are sized based on an array of chunk sizes, which
> > + *	     is a GPU SVM initialization parameter, and the CPU address space.
> > + *	     Upon GPU fault, the largest aligned chunk that fits within the
> > + *	     faulting CPU address space is chosen for the range size. Ranges are
> > + *	     expected to be dynamically allocated on GPU fault and removed on an
> > + *	     MMU notifier UNMAP event. As mentioned above, ranges are tracked in
> > + *	     a notifier's Red-Black tree.
> > + * - Operations: Define the interface for driver-specific SVM operations such as
> > + *		 allocation, page collection, migration, invalidations, and VRAM
> > + *		 release.
> > + *
> > + * This layer provides interfaces for allocating, mapping, migrating, and
> > + * releasing memory ranges between the CPU and GPU. It handles all core memory
> > + * management interactions (DMA mapping, HMM, and migration) and provides
> > + * driver-specific virtual functions (vfuncs). This infrastructure is sufficient
> > + * to build the expected driver components for an SVM implementation as detailed
> > + * below.
> > + *
> > + * Expected Driver Components:
> > + * - GPU page fault handler: Used to create ranges and notifiers based on the
> > + *			     fault address, optionally migrate the range to
> > + *			     VRAM, and create GPU bindings.
> > + * - Garbage collector: Used to destroy GPU bindings for ranges. Ranges are
> > + *			expected to be added to the garbage collector upon
> > + *			MMU_NOTIFY_UNMAP event.
> > + */
> > +
> > +/**
> > + * DOC: Locking
> > + *
> > + * GPU SVM handles locking for core MM interactions, i.e., it locks/unlocks the
> > + * mmap lock as needed. Alternatively, if the driver prefers to handle the mmap
> > + * lock itself, a 'locked' argument is provided to the functions that require
> > + * the mmap lock. This option may be useful for drivers that need to call into
> > + * GPU SVM while also holding a dma-resv lock, thus preventing locking
> > + * inversions between the mmap and dma-resv locks.
> > + *
> > + * GPU SVM introduces a global notifier lock, which safeguards the notifier's
> > + * range RB tree and list, as well as the range's DMA mappings and sequence
> > + * number. GPU SVM manages all necessary locking and unlocking operations,
> > + * except for the recheck of the range's sequence number
> > + * (mmu_interval_read_retry) when the driver is committing GPU bindings. This
> > + * lock corresponds to the 'driver->update' lock mentioned in the HMM
> > + * documentation (TODO: Link). Future revisions may transition from a GPU SVM
> > + * global lock to a per-notifier lock if finer-grained locking is deemed
> > + * necessary.
> > + *
> > + * In addition to the locking mentioned above, the driver should implement a
> > + * lock to safeguard core GPU SVM function calls that modify state, such as
> > + * drm_gpusvm_range_find_or_insert and drm_gpusvm_range_remove. Alternatively,
> > + * these core functions can be called within a single kernel thread, for
> > + * instance, using an ordered work queue. This lock is denoted as
> > + * 'driver_svm_lock' in code examples.
> > + */
> > +
> > +/**
> > + * DOC: Migration
> > + *
> > + * The migration support is quite simple, allowing migration between SRAM and
> > + * VRAM at the range granularity. For example, GPU SVM currently does not
> > + * support mixing SRAM and VRAM pages within a range. This means that upon GPU
> > + * fault, the entire range can be migrated to VRAM, and upon CPU fault, the
> > + * entire range is migrated to SRAM.
> > + *
> > + * The reasoning for only supporting range granularity is as follows: it
> > + * simplifies the implementation, and range sizes are driver-defined and should
> > + * be relatively small.
> 
> Migration at range granularity just couples the physical world with the
> virtual world, which is against the fundamental page-centric design we
> aligned on before.
> 
> Looking at core mm behavior, shrinking/swapping doesn't operate at vma or any
> virtual range granularity. This way we swap out the less frequently used
> pages and keep the more frequently used pages in ram.
> 
> The same should be done for vram to sram migration.
> 
> > + */
> > +
> > +/**
> > + * DOC: Partial Unmapping of Ranges
> > + *
> > + * Partial unmapping of ranges (e.g., 1M out of 2M is unmapped by CPU resulting
> > + * in MMU_NOTIFY_UNMAP event) presents several challenges,
> 
> As said above, the challenge comes from a design choice. In a
> page-centric design, these challenges don't exist at all.

See my other reply, as long as migrate_to_ram is entirely page centric
we're fine. And I think Matt fixed that now.

The other aspect of being page centric is gpu pagetable locking, and there
I also gained a lot of clarity on what exactly matters, and what doesn't.
The mmu_notifier -> range -> page design wouldn't be my personal first
choice, but it is a perfectly ok one I think, as long as we follow all the
other rules we need to follow about the page-centric locking/refcounting/pte
invalidation that migrate_to_ram requires.
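
For the archives, a rough sketch of what "entirely page centric" means for
the CPU fault side (hypothetical driver code, not what this patch posts;
only struct migrate_vma and the migrate_vma_* calls are the real core-mm
API, the driver_* helpers are made up and error unwinding is elided):

	static vm_fault_t driver_migrate_to_ram(struct vm_fault *vmf)
	{
		unsigned long src = 0, dst = 0;
		struct page *dpage;
		struct migrate_vma migrate = {
			.vma		= vmf->vma,
			/* scope the migration to the one faulting page */
			.start		= vmf->address & PAGE_MASK,
			.end		= (vmf->address & PAGE_MASK) + PAGE_SIZE,
			.src		= &src,
			.dst		= &dst,
			.pgmap_owner	= driver_page_owner(vmf->page),
			.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
			.fault_page	= vmf->page,
		};

		if (migrate_vma_setup(&migrate))
			return VM_FAULT_SIGBUS;

		if (src & MIGRATE_PFN_MIGRATE) {
			dpage = alloc_page(GFP_HIGHUSER);
			if (!dpage)
				return VM_FAULT_OOM;

			lock_page(dpage);
			/* copy only this page's contents back from VRAM */
			driver_copy_one_page_to_sram(vmf->page, dpage);
			dst = migrate_pfn(page_to_pfn(dpage));
		}

		migrate_vma_pages(&migrate);
		migrate_vma_finalize(&migrate);
		return 0;
	}

No range lookup, no gpu-side locks, and no assumption that the rest of the
range migrates along with the faulting page.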

Cheers, Sima


> > with the main one
> > + * being that a subset of the range still has CPU and GPU mappings. If the
> > + * backing store for the range is in VRAM, a subset of the backing store has
> > + * references. One option would be to split the range and VRAM backing store,
> > + * but the implementation for this would be quite complicated. Given that
> > + * partial unmappings are rare and driver-defined range sizes are relatively
> > + * small, GPU SVM does not support splitting of ranges.
> > + *
> > + * With no support for range splitting, upon partial unmapping of a range, the
> > + * driver is expected to invalidate and destroy the entire range. If the range
> > + * has VRAM as its backing, the driver is also expected to migrate any remaining
> > + * pages back to SRAM.
> > + */
> > +
> > +/**
> > + * DOC: Examples
> > + *
> > + * This section provides two examples of how to build the expected driver
> > + * components: the GPU page fault handler and the garbage collector. A third
> > + * example demonstrates a sample invalidation driver vfunc.
> > + *
> > + * The generic code provided does not include logic for complex migration
> > + * policies, optimized invalidations, or other potentially required driver
> > + * locking (e.g., DMA-resv locks).
> > + *
> > + * 1) GPU page fault handler
> > + *
> > + *	int driver_bind_range(struct drm_gpusvm *gpusvm, struct drm_gpusvm_range *range)
> > + *	{
> > + *		int err = 0;
> > + *
> > + *		driver_alloc_and_setup_memory_for_bind(gpusvm, range);
> > + *
> > + *		drm_gpusvm_notifier_lock(gpusvm);
> > + *		if (drm_gpusvm_range_pages_valid(range))
> > + *			driver_commit_bind(gpusvm, range);
> > + *		else
> > + *			err = -EAGAIN;
> > + *		drm_gpusvm_notifier_unlock(gpusvm);
> > + *
> > + *		return err;
> > + *	}
> > + *
> > + *	int driver_gpu_fault(struct drm_gpusvm *gpusvm, u64 fault_addr,
> > + *			     u64 gpuva_start, u64 gpuva_end)
> > + *	{
> > + *		struct drm_gpusvm_ctx ctx = {};
> > + *		int err;
> > + *
> > + *		driver_svm_lock();
> > + *	retry:
> > + *		// Always process UNMAPs first so view of GPU SVM ranges is current
> > + *		driver_garbage_collector(gpusvm);
> > + *
> > + *		range = drm_gpusvm_range_find_or_insert(gpusvm, fault_addr,
> > + *							gpuva_start, gpuva_end,
> > + *							&ctx);
> > + *		if (IS_ERR(range)) {
> > + *			err = PTR_ERR(range);
> > + *			goto unlock;
> > + *		}
> > + *
> > + *		if (driver_migration_policy(range)) {
> > + *			bo = driver_alloc_bo();
> > + *			err = drm_gpusvm_migrate_to_vram(gpusvm, range, bo, &ctx);
> > + *			if (err)	// CPU mappings may have changed
> > + *				goto retry;
> > + *		}
> > + *
> > + *		err = drm_gpusvm_range_get_pages(gpusvm, range, &ctx);
> > + *		if (err == -EFAULT || err == -EPERM)	// CPU mappings changed
> > + *			goto retry;
> > + *		else if (err)
> > + *			goto unlock;
> > + *
> > + *		err = driver_bind_range(gpusvm, range);
> > + *		if (err == -EAGAIN)	// CPU mappings changed
> > + *			goto retry;
> > + *
> > + *	unlock:
> > + *		driver_svm_unlock();
> > + *		return err;
> > + *	}
> > + *
> > + * 2) Garbage Collector.
> > + *
> > + *	void __driver_garbage_collector(struct drm_gpusvm *gpusvm,
> > + *					struct drm_gpusvm_range *range)
> > + *	{
> > + *		struct drm_gpusvm_ctx ctx = {};
> > + *
> > + *		assert_driver_svm_locked(gpusvm);
> > + *
> > + *		// Partial unmap, migrate any remaining VRAM pages back to SRAM
> > + *		if (range->flags.partial_unmap)
> > + *			drm_gpusvm_migrate_to_sram(gpusvm, range, &ctx);
> > + *
> > + *		driver_unbind_range(range);
> > + *		drm_gpusvm_range_remove(gpusvm, range);
> > + *	}
> > + *
> > + *	void driver_garbage_collector(struct drm_gpusvm *gpusvm)
> > + *	{
> > + *		assert_driver_svm_locked(gpusvm);
> > + *
> > + *		for_each_range_in_garbage_collector(gpusvm, range)
> > + *			__driver_garbage_collector(gpusvm, range);
> > + *	}
> > + *
> > + * 3) Invalidation driver vfunc.
> > + *
> > + *	void driver_invalidation(struct drm_gpusvm *gpusvm,
> > + *				 struct drm_gpusvm_notifier *notifier,
> > + *				 const struct mmu_notifier_range *mmu_range)
> > + *	{
> > + *		struct drm_gpusvm_ctx ctx = { .in_notifier = true, };
> > + *		struct drm_gpusvm_range *range = NULL;
> > + *
> > + *		driver_invalidate_device_tlb(gpusvm, mmu_range->start, mmu_range->end);
> > + *
> > + *		drm_gpusvm_for_each_range(range, notifier, mmu_range->start,
> > + *					  mmu_range->end) {
> > + *			drm_gpusvm_range_unmap_pages(gpusvm, range, &ctx);
> > + *
> > + *			if (mmu_range->event != MMU_NOTIFY_UNMAP)
> > + *				continue;
> > + *
> > + *			drm_gpusvm_range_set_unmapped(range, mmu_range);
> > + *			driver_garbage_collector_add(gpusvm, range);
> > + *		}
> > + *	}
> > + */
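
For completeness, a sketch of how a driver would wire the pieces above
together at init time (hypothetical driver_* names; drm_gpusvm_init() and
its parameters are as defined later in this patch, with chunk sizes in
descending powers of two ending at SZ_4K and a >= 512M notifier size as
recommended):

	static const struct drm_gpusvm_ops driver_gpusvm_ops = {
		.invalidate = driver_invalidation,
	};

	static const u64 driver_chunk_sizes[] = { SZ_2M, SZ_64K, SZ_4K };

	int driver_svm_init(struct driver_device *dev)
	{
		return drm_gpusvm_init(&dev->gpusvm, "driver-svm", &dev->drm,
				       current->mm, NULL,
				       0, 1ull << 47,	/* CPU VA span mirrored */
				       SZ_512M,		/* notifier size */
				       &driver_gpusvm_ops,
				       driver_chunk_sizes,
				       ARRAY_SIZE(driver_chunk_sizes));
	}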
> > +
> > +#define DRM_GPUSVM_RANGE_START(_range)	((_range)->va.start)
> > +#define DRM_GPUSVM_RANGE_END(_range)	((_range)->va.end - 1)
> > +INTERVAL_TREE_DEFINE(struct drm_gpusvm_range, rb.node, u64, rb.__subtree_last,
> > +		     DRM_GPUSVM_RANGE_START, DRM_GPUSVM_RANGE_END,
> > +		     static __maybe_unused, range);
> > +
> > +#define DRM_GPUSVM_NOTIFIER_START(_notifier)	((_notifier)->interval.start)
> > +#define DRM_GPUSVM_NOTIFIER_END(_notifier)	((_notifier)->interval.end - 1)
> > +INTERVAL_TREE_DEFINE(struct drm_gpusvm_notifier, rb.node, u64,
> > +		     rb.__subtree_last, DRM_GPUSVM_NOTIFIER_START,
> > +		     DRM_GPUSVM_NOTIFIER_END, static __maybe_unused, notifier);
> > +
> > +/**
> > + * npages_in_range() - Calculate the number of pages in a given range
> > + * @start__: The start address of the range
> > + * @end__: The end address of the range
> > + *
> > + * This macro calculates the number of pages in a given memory range,
> > + * specified by the start and end addresses. It divides the difference
> > + * between the end and start addresses by the page size (PAGE_SIZE) to
> > + * determine the number of pages in the range.
> > + *
> > + * Return: The number of pages in the specified range.
> > + */
> > +#define npages_in_range(start__, end__)	\
> > +	(((end__) - (start__)) >> PAGE_SHIFT)
> > +
> > +/**
> > + * struct drm_gpusvm_zdd - GPU SVM zone device data
> > + *
> > + * @refcount: Reference count for the zdd
> > + * @destroy_work: Work structure for asynchronous zdd destruction
> > + * @range: Pointer to the GPU SVM range
> > + * @vram_allocation: Driver-private pointer to the VRAM allocation
> > + *
> > + * This structure serves as a generic wrapper installed in
> > + * page->zone_device_data. It provides infrastructure for looking up a range
> > + * upon CPU page fault and asynchronously releasing VRAM once the CPU has no
> > + * page references. Asynchronous release is useful because CPU page references
> > + * can be dropped in IRQ contexts, while releasing VRAM likely requires sleeping
> > + * locks.
> > + */
> > +struct drm_gpusvm_zdd {
> > +	struct kref refcount;
> > +	struct work_struct destroy_work;
> > +	struct drm_gpusvm_range *range;
> > +	void *vram_allocation;
> > +};
> > +
> > +/**
> > + * drm_gpusvm_zdd_destroy_work_func - Work function for destroying a zdd
> > + * @w: Pointer to the work_struct
> > + *
> > + * This function releases VRAM, puts GPU SVM range, and frees zdd.
> > + */
> > +static void drm_gpusvm_zdd_destroy_work_func(struct work_struct *w)
> > +{
> > +	struct drm_gpusvm_zdd *zdd =
> > +		container_of(w, struct drm_gpusvm_zdd, destroy_work);
> > +	struct drm_gpusvm_range *range = zdd->range;
> > +	struct drm_gpusvm *gpusvm = range->gpusvm;
> > +
> > +	if (gpusvm->ops->vram_release && zdd->vram_allocation)
> > +		gpusvm->ops->vram_release(zdd->vram_allocation);
> > +	drm_gpusvm_range_put(range);
> > +	kfree(zdd);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_zdd_alloc - Allocate a zdd structure.
> > + * @range: Pointer to the GPU SVM range.
> > + *
> > + * This function allocates and initializes a new zdd structure. It sets up the
> > + * reference count, initializes the destroy work, and links the provided GPU SVM
> > + * range.
> > + *
> > + * Returns:
> > + * Pointer to the allocated zdd on success, NULL on failure.
> > + */
> > +static struct drm_gpusvm_zdd *
> > +drm_gpusvm_zdd_alloc(struct drm_gpusvm_range *range)
> > +{
> > +	struct drm_gpusvm_zdd *zdd;
> > +
> > +	zdd = kmalloc(sizeof(*zdd), GFP_KERNEL);
> > +	if (!zdd)
> > +		return NULL;
> > +
> > +	kref_init(&zdd->refcount);
> > +	INIT_WORK(&zdd->destroy_work, drm_gpusvm_zdd_destroy_work_func);
> > +	zdd->range = drm_gpusvm_range_get(range);
> > +	zdd->vram_allocation = NULL;
> > +
> > +	return zdd;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_zdd_get - Get a reference to a zdd structure.
> > + * @zdd: Pointer to the zdd structure.
> > + *
> > + * This function increments the reference count of the provided zdd structure.
> > + *
> > + * Returns: Pointer to the zdd structure.
> > + */
> > +static struct drm_gpusvm_zdd *drm_gpusvm_zdd_get(struct drm_gpusvm_zdd *zdd)
> > +{
> > +	kref_get(&zdd->refcount);
> > +	return zdd;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_zdd_destroy - Destroy a zdd structure.
> > + * @ref: Pointer to the reference count structure.
> > + *
> > + * This function queues the destroy_work of the zdd for asynchronous destruction.
> > + */
> > +static void drm_gpusvm_zdd_destroy(struct kref *ref)
> > +{
> > +	struct drm_gpusvm_zdd *zdd =
> > +		container_of(ref, struct drm_gpusvm_zdd, refcount);
> > +	struct drm_gpusvm *gpusvm = zdd->range->gpusvm;
> > +
> > +	queue_work(gpusvm->zdd_wq, &zdd->destroy_work);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_zdd_put - Put a zdd reference.
> > + * @zdd: Pointer to the zdd structure.
> > + *
> > + * This function decrements the reference count of the provided zdd structure
> > + * and schedules its destruction if the count drops to zero.
> > + */
> > +static void drm_gpusvm_zdd_put(struct drm_gpusvm_zdd *zdd)
> > +{
> > +	kref_put(&zdd->refcount, drm_gpusvm_zdd_destroy);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_find - Find GPU SVM range from GPU SVM notifier
> > + * @notifier: Pointer to the GPU SVM notifier structure.
> > + * @start: Start address of the range
> > + * @end: End address of the range
> > + *
> > + * Return: A pointer to the drm_gpusvm_range if found or NULL
> > + */
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64 start, u64 end)
> > +{
> > +	return range_iter_first(&notifier->root, start, end - 1);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_for_each_range_safe - Safely iterate over GPU SVM ranges in a notifier
> > + * @range__: Iterator variable for the ranges
> > + * @next__: Iterator variable for the ranges temporary storage
> > + * @notifier__: Pointer to the GPU SVM notifier
> > + * @start__: Start address of the range
> > + * @end__: End address of the range
> > + *
> > + * This macro is used to iterate over GPU SVM ranges in a notifier while
> > + * removing ranges from it.
> > + */
> > +#define drm_gpusvm_for_each_range_safe(range__, next__, notifier__, start__, end__)	\
> > +	for ((range__) = drm_gpusvm_range_find((notifier__), (start__), (end__)),	\
> > +	     (next__) = __drm_gpusvm_range_next(range__);				\
> > +	     (range__) && (range__->va.start < (end__));				\
> > +	     (range__) = (next__), (next__) = __drm_gpusvm_range_next(range__))
> > +
> > +/**
> > + * __drm_gpusvm_notifier_next - get the next drm_gpusvm_notifier in the list
> > + * @notifier: a pointer to the current drm_gpusvm_notifier
> > + *
> > + * Return: A pointer to the next drm_gpusvm_notifier if available, or NULL if
> > + *         the current notifier is the last one or if the input notifier is
> > + *         NULL.
> > + */
> > +static struct drm_gpusvm_notifier *
> > +__drm_gpusvm_notifier_next(struct drm_gpusvm_notifier *notifier)
> > +{
> > +	if (notifier && !list_is_last(&notifier->rb.entry,
> > +				      &notifier->gpusvm->notifier_list))
> > +		return list_next_entry(notifier, rb.entry);
> > +
> > +	return NULL;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_for_each_notifier - Iterate over GPU SVM notifiers in a gpusvm
> > + * @notifier__: Iterator variable for the notifiers
> > + * @gpusvm__: Pointer to the GPU SVM structure
> > + * @start__: Start address of the notifier
> > + * @end__: End address of the notifier
> > + *
> > + * This macro is used to iterate over GPU SVM notifiers in a gpusvm.
> > + */
> > +#define drm_gpusvm_for_each_notifier(notifier__, gpusvm__, start__, end__)		\
> > +	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root, (start__), (end__) - 1);	\
> > +	     (notifier__) && (notifier__->interval.start < (end__));			\
> > +	     (notifier__) = __drm_gpusvm_notifier_next(notifier__))
> > +
> > +/**
> > + * drm_gpusvm_for_each_notifier_safe - Safely iterate over GPU SVM notifiers in a gpusvm
> > + * @notifier__: Iterator variable for the notifiers
> > + * @next__: Iterator variable for the notifiers temporary storage
> > + * @gpusvm__: Pointer to the GPU SVM structure
> > + * @start__: Start address of the notifier
> > + * @end__: End address of the notifier
> > + *
> > + * This macro is used to iterate over GPU SVM notifiers in a gpusvm while
> > + * removing notifiers from it.
> > + */
> > +#define drm_gpusvm_for_each_notifier_safe(notifier__, next__, gpusvm__, start__, end__)	\
> > +	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root, (start__), (end__) - 1),	\
> > +	     (next__) = __drm_gpusvm_notifier_next(notifier__);				\
> > +	     (notifier__) && (notifier__->interval.start < (end__));			\
> > +	     (notifier__) = (next__), (next__) = __drm_gpusvm_notifier_next(notifier__))
> > +
> > +/**
> > + * drm_gpusvm_notifier_invalidate - Invalidate a GPU SVM notifier.
> > + * @mni: Pointer to the mmu_interval_notifier structure.
> > + * @mmu_range: Pointer to the mmu_notifier_range structure.
> > + * @cur_seq: Current sequence number.
> > + *
> > + * This function serves as a generic MMU notifier for GPU SVM. It sets the MMU
> > + * notifier sequence number and calls the driver invalidate vfunc under
> > + * gpusvm->notifier_lock.
> > + *
> > + * Returns:
> > + * true if the operation succeeds, false otherwise.
> > + */
> > +static bool
> > +drm_gpusvm_notifier_invalidate(struct mmu_interval_notifier *mni,
> > +			       const struct mmu_notifier_range *mmu_range,
> > +			       unsigned long cur_seq)
> > +{
> > +	struct drm_gpusvm_notifier *notifier =
> > +		container_of(mni, typeof(*notifier), notifier);
> > +	struct drm_gpusvm *gpusvm = notifier->gpusvm;
> > +
> > +	if (!mmu_notifier_range_blockable(mmu_range))
> > +		return false;
> > +
> > +	down_write(&gpusvm->notifier_lock);
> > +	mmu_interval_set_seq(mni, cur_seq);
> > +	gpusvm->ops->invalidate(gpusvm, notifier, mmu_range);
> > +	up_write(&gpusvm->notifier_lock);
> > +
> > +	return true;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_notifier_ops - MMU interval notifier operations for GPU SVM
> > + */
> > +static const struct mmu_interval_notifier_ops drm_gpusvm_notifier_ops = {
> > +	.invalidate = drm_gpusvm_notifier_invalidate,
> > +};
> > +
> > +/**
> > + * drm_gpusvm_init - Initialize the GPU SVM.
> > + * @gpusvm: Pointer to the GPU SVM structure.
> > + * @name: Name of the GPU SVM.
> > + * @drm: Pointer to the DRM device structure.
> > + * @mm: Pointer to the mm_struct for the address space.
> > + * @device_private_page_owner: Device private pages owner.
> > + * @mm_start: Start address of GPU SVM.
> > + * @mm_range: Range of the GPU SVM.
> > + * @notifier_size: Size of individual notifiers.
> > + * @ops: Pointer to the operations structure for GPU SVM.
> > + * @chunk_sizes: Pointer to the array of chunk sizes used in range allocation.
> > + *               Entries should be powers of 2 in descending order with last
> > + *               entry being SZ_4K.
> > + * @num_chunks: Number of chunks.
> > + *
> > + * This function initializes the GPU SVM.
> > + *
> > + * Returns:
> > + * 0 on success, a negative error code on failure.
> > + */
> > +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> > +		    const char *name, struct drm_device *drm,
> > +		    struct mm_struct *mm, void *device_private_page_owner,
> > +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> > +		    const struct drm_gpusvm_ops *ops,
> > +		    const u64 *chunk_sizes, int num_chunks)
> > +{
> > +	if (!ops->invalidate || !num_chunks)
> > +		return -EINVAL;
> > +
> > +	gpusvm->name = name;
> > +	gpusvm->drm = drm;
> > +	gpusvm->mm = mm;
> > +	gpusvm->device_private_page_owner = device_private_page_owner;
> > +	gpusvm->mm_start = mm_start;
> > +	gpusvm->mm_range = mm_range;
> > +	gpusvm->notifier_size = notifier_size;
> > +	gpusvm->ops = ops;
> > +	gpusvm->chunk_sizes = chunk_sizes;
> > +	gpusvm->num_chunks = num_chunks;
> > +	gpusvm->zdd_wq = system_wq;
> > +
> > +	mmgrab(mm);
> > +	gpusvm->root = RB_ROOT_CACHED;
> > +	INIT_LIST_HEAD(&gpusvm->notifier_list);
> > +
> > +	init_rwsem(&gpusvm->notifier_lock);
> > +
> > +	fs_reclaim_acquire(GFP_KERNEL);
> > +	might_lock(&gpusvm->notifier_lock);
> > +	fs_reclaim_release(GFP_KERNEL);
> > +
> > +	return 0;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_notifier_find - Find GPU SVM notifier
> > + * @gpusvm__: Pointer to the GPU SVM structure
> > + * @fault_addr__: Fault address
> > + *
> > + * This macro finds the GPU SVM notifier associated with the fault address.
> > + *
> > + * Returns:
> > + * Pointer to the GPU SVM notifier on success, NULL otherwise.
> > + */
> > +#define drm_gpusvm_notifier_find(gpusvm__, fault_addr__)	\
> > +	notifier_iter_first(&(gpusvm__)->root, (fault_addr__),	\
> > +			    (fault_addr__ + 1))
> > +
> > +/**
> > + * to_drm_gpusvm_notifier - retrieve the container struct for a given rbtree node
> > + * @node__: a pointer to the rbtree node embedded within a drm_gpusvm_notifier struct
> > + *
> > + * Return: A pointer to the containing drm_gpusvm_notifier structure.
> > + */
> > +#define to_drm_gpusvm_notifier(__node)	\
> > +	container_of((__node), struct drm_gpusvm_notifier, rb.node)
> > +
> > +/**
> > + * drm_gpusvm_notifier_insert - Insert GPU SVM notifier
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + *
> > + * This function inserts the GPU SVM notifier into the GPU SVM RB tree and list.
> > + */
> > +static void drm_gpusvm_notifier_insert(struct drm_gpusvm *gpusvm,
> > +				       struct drm_gpusvm_notifier *notifier)
> > +{
> > +	struct rb_node *node;
> > +	struct list_head *head;
> > +
> > +	notifier_insert(notifier, &gpusvm->root);
> > +
> > +	node = rb_prev(&notifier->rb.node);
> > +	if (node)
> > +		head = &(to_drm_gpusvm_notifier(node))->rb.entry;
> > +	else
> > +		head = &gpusvm->notifier_list;
> > +
> > +	list_add(&notifier->rb.entry, head);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_notifier_remove - Remove GPU SVM notifier
> > + * @gpusvm__: Pointer to the GPU SVM structure
> > + * @notifier__: Pointer to the GPU SVM notifier structure
> > + *
> > + * This macro removes the GPU SVM notifier from the GPU SVM RB tree and list.
> > + */
> > +#define drm_gpusvm_notifier_remove(gpusvm__, notifier__)	\
> > +	notifier_remove((notifier__), &(gpusvm__)->root);	\
> > +	list_del(&(notifier__)->rb.entry)
> > +
> > +/**
> > + * drm_gpusvm_fini - Finalize the GPU SVM.
> > + * @gpusvm: Pointer to the GPU SVM structure.
> > + *
> > + * This function finalizes the GPU SVM by cleaning up any remaining ranges and
> > + * notifiers, and dropping a reference to struct MM.
> > + */
> > +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm)
> > +{
> > +	struct drm_gpusvm_notifier *notifier, *next;
> > +
> > +	drm_gpusvm_for_each_notifier_safe(notifier, next, gpusvm, 0, LONG_MAX) {
> > +		struct drm_gpusvm_range *range, *__next;
> > +
> > +		/*
> > +		 * Remove notifier first to avoid racing with any invalidation
> > +		 */
> > +		mmu_interval_notifier_remove(&notifier->notifier);
> > +		notifier->flags.removed = true;
> > +
> > +		drm_gpusvm_for_each_range_safe(range, __next, notifier, 0,
> > +					       LONG_MAX)
> > +			drm_gpusvm_range_remove(gpusvm, range);
> > +	}
> > +
> > +	mmdrop(gpusvm->mm);
> > +	WARN_ON(!RB_EMPTY_ROOT(&gpusvm->root.rb_root));
> > +}
> > +
> > +/**
> > + * drm_gpusvm_notifier_alloc - Allocate GPU SVM notifier
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @fault_addr: Fault address
> > + *
> > + * This function allocates and initializes the GPU SVM notifier structure.
> > + *
> > + * Returns:
> > + * Pointer to the allocated GPU SVM notifier on success, ERR_PTR() on failure.
> > + */
> > +static struct drm_gpusvm_notifier *
> > +drm_gpusvm_notifier_alloc(struct drm_gpusvm *gpusvm, u64 fault_addr)
> > +{
> > +	struct drm_gpusvm_notifier *notifier;
> > +
> > +	if (gpusvm->ops->notifier_alloc)
> > +		notifier = gpusvm->ops->notifier_alloc();
> > +	else
> > +		notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
> > +
> > +	if (!notifier)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	notifier->gpusvm = gpusvm;
> > +	notifier->interval.start = ALIGN_DOWN(fault_addr, gpusvm->notifier_size);
> > +	notifier->interval.end = ALIGN(fault_addr + 1, gpusvm->notifier_size);
> > +	INIT_LIST_HEAD(&notifier->rb.entry);
> > +	notifier->root = RB_ROOT_CACHED;
> > +	INIT_LIST_HEAD(&notifier->range_list);
> > +
> > +	return notifier;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_notifier_free - Free GPU SVM notifier
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + *
> > + * This function frees the GPU SVM notifier structure.
> > + */
> > +static void drm_gpusvm_notifier_free(struct drm_gpusvm *gpusvm,
> > +				     struct drm_gpusvm_notifier *notifier)
> > +{
> > +	WARN_ON(!RB_EMPTY_ROOT(&notifier->root.rb_root));
> > +
> > +	if (gpusvm->ops->notifier_free)
> > +		gpusvm->ops->notifier_free(notifier);
> > +	else
> > +		kfree(notifier);
> > +}
> > +
> > +/**
> > + * to_drm_gpusvm_range - retrieve the container struct for a given rbtree node
> > + * @node__: a pointer to the rbtree node embedded within a drm_gpusvm_range struct
> > + *
> > + * Return: A pointer to the containing drm_gpusvm_range structure.
> > + */
> > +#define to_drm_gpusvm_range(node__)	\
> > +	container_of((node__), struct drm_gpusvm_range, rb.node)
> > +
> > +/**
> > + * drm_gpusvm_range_insert - Insert GPU SVM range
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * This function inserts the GPU SVM range into the notifier RB tree and list.
> > + */
> > +static void drm_gpusvm_range_insert(struct drm_gpusvm_notifier *notifier,
> > +				    struct drm_gpusvm_range *range)
> > +{
> > +	struct rb_node *node;
> > +	struct list_head *head;
> > +
> > +	drm_gpusvm_notifier_lock(notifier->gpusvm);
> > +	range_insert(range, &notifier->root);
> > +
> > +	node = rb_prev(&range->rb.node);
> > +	if (node)
> > +		head = &(to_drm_gpusvm_range(node))->rb.entry;
> > +	else
> > +		head = &notifier->range_list;
> > +
> > +	list_add(&range->rb.entry, head);
> > +	drm_gpusvm_notifier_unlock(notifier->gpusvm);
> > +}
> > +
> > +/**
> > + * __drm_gpusvm_range_remove - Remove GPU SVM range
> > + * @notifier__: Pointer to the GPU SVM notifier structure
> > + * @range__: Pointer to the GPU SVM range structure
> > + *
> > + * This macro removes the GPU SVM range from the notifier RB tree and list.
> > + */
> > +#define __drm_gpusvm_range_remove(notifier__, range__)		\
> > +	range_remove((range__), &(notifier__)->root);		\
> > +	list_del(&(range__)->rb.entry)
> > +
> > +/**
> > + * drm_gpusvm_range_alloc - Allocate GPU SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + * @fault_addr: Fault address
> > + * @chunk_size: Chunk size
> > + * @migrate_vram: Flag indicating whether to migrate VRAM
> > + *
> > + * This function allocates and initializes the GPU SVM range structure.
> > + *
> > + * Returns:
> > + * Pointer to the allocated GPU SVM range on success, ERR_PTR() on failure.
> > + */
> > +static struct drm_gpusvm_range *
> > +drm_gpusvm_range_alloc(struct drm_gpusvm *gpusvm,
> > +		       struct drm_gpusvm_notifier *notifier,
> > +		       u64 fault_addr, u64 chunk_size, bool migrate_vram)
> > +{
> > +	struct drm_gpusvm_range *range;
> > +
> > +	if (gpusvm->ops->range_alloc)
> > +		range = gpusvm->ops->range_alloc(gpusvm);
> > +	else
> > +		range = kzalloc(sizeof(*range), GFP_KERNEL);
> > +
> > +	if (!range)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	kref_init(&range->refcount);
> > +	range->gpusvm = gpusvm;
> > +	range->notifier = notifier;
> > +	range->va.start = ALIGN_DOWN(fault_addr, chunk_size);
> > +	range->va.end = ALIGN(fault_addr + 1, chunk_size);
> > +	INIT_LIST_HEAD(&range->rb.entry);
> > +	range->notifier_seq = LONG_MAX;
> > +	range->flags.migrate_vram = migrate_vram ? 1 : 0;
> > +
> > +	return range;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_check_pages - Check pages
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + * @start: Start address
> > + * @end: End address
> > + *
> > + * Check if pages between start and end have been faulted in on the CPU. Use to
> > + * prevent migration of pages without CPU backing store.
> > + *
> > + * Returns:
> > + * True if pages have been faulted into CPU, False otherwise
> > + */
> > +static bool drm_gpusvm_check_pages(struct drm_gpusvm *gpusvm,
> > +				   struct drm_gpusvm_notifier *notifier,
> > +				   u64 start, u64 end)
> > +{
> > +	struct hmm_range hmm_range = {
> > +		.default_flags = 0,
> > +		.notifier = &notifier->notifier,
> > +		.start = start,
> > +		.end = end,
> > +		.dev_private_owner = gpusvm->device_private_page_owner,
> > +	};
> > +	unsigned long timeout =
> > +		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> > +	unsigned long *pfns;
> > +	unsigned long npages = npages_in_range(start, end);
> > +	int err, i;
> > +
> > +	mmap_assert_locked(gpusvm->mm);
> > +
> > +	pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
> > +	if (!pfns)
> > +		return false;
> > +
> > +	hmm_range.notifier_seq = mmu_interval_read_begin(&notifier->notifier);
> > +	hmm_range.hmm_pfns = pfns;
> > +
> > +	while (true) {
> > +		err = hmm_range_fault(&hmm_range);
> > +		if (err == -EBUSY) {
> > +			if (time_after(jiffies, timeout))
> > +				break;
> > +
> > +			hmm_range.notifier_seq = mmu_interval_read_begin(&notifier->notifier);
> > +			continue;
> > +		}
> > +		break;
> > +	}
> > +	if (err)
> > +		goto err_free;
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		if (!(pfns[i] & HMM_PFN_VALID)) {
> > +			err = -EFAULT;
> > +			goto err_free;
> > +		}
> > +	}
> > +
> > +err_free:
> > +	kvfree(pfns);
> > +	return err ? false : true;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_chunk_size - Determine chunk size for GPU SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + * @vas: Pointer to the virtual memory area structure
> > + * @fault_addr: Fault address
> > + * @gpuva_start: Start address of GPUVA which mirrors CPU
> > + * @gpuva_end: End address of GPUVA which mirrors CPU
> > + * @check_pages: Flag indicating whether to check pages
> > + *
> > + * This function determines the chunk size for the GPU SVM range based on the
> > + * fault address, GPU SVM chunk sizes, existing GPU SVM ranges, and the virtual
> > + * memory area boundaries.
> > + *
> > + * Returns:
> > + * Chunk size on success, LONG_MAX on failure.
> > + */
> > +static u64 drm_gpusvm_range_chunk_size(struct drm_gpusvm *gpusvm,
> > +				       struct drm_gpusvm_notifier *notifier,
> > +				       struct vm_area_struct *vas,
> > +				       u64 fault_addr, u64 gpuva_start,
> > +				       u64 gpuva_end, bool check_pages)
> > +{
> > +	u64 start, end;
> > +	int i = 0;
> > +
> > +retry:
> > +	for (; i < gpusvm->num_chunks; ++i) {
> > +		start = ALIGN_DOWN(fault_addr, gpusvm->chunk_sizes[i]);
> > +		end = ALIGN(fault_addr + 1, gpusvm->chunk_sizes[i]);
> > +
> > +		if (start >= vas->vm_start && end <= vas->vm_end &&
> > +		    start >= notifier->interval.start &&
> > +		    end <= notifier->interval.end &&
> > +		    start >= gpuva_start && end <= gpuva_end)
> > +			break;
> > +	}
> > +
> > +	if (i == gpusvm->num_chunks)
> > +		return LONG_MAX;
> > +
> > +	/*
> > +	 * If the allocation is more than a page, ensure not to overlap with
> > +	 * existing ranges.
> > +	 */
> > +	if (end - start != SZ_4K) {
> > +		struct drm_gpusvm_range *range;
> > +
> > +		range = drm_gpusvm_range_find(notifier, start, end);
> > +		if (range) {
> > +			++i;
> > +			goto retry;
> > +		}
> > +
> > +		/*
> > +		 * XXX: Only create range on pages CPU has faulted in. Without
> > +		 * this check, or prefault, on BMG 'xe_exec_system_allocator --r
> > +		 * process-many-malloc' fails. In the failure case, each process
> > +		 * mallocs 16k but the CPU VMA is ~128k which results in 64k SVM
> > +		 * ranges. When migrating the SVM ranges, some processes fail in
> > +		 * drm_gpusvm_migrate_to_vram with 'migrate.cpages != npages'
> > +		 * and then upon drm_gpusvm_range_get_pages device pages from
> > +		 * other processes are collected + faulted in which creates all
> > +		 * sorts of problems. Unsure exactly how this is happening; the
> > +		 * problem also goes away if 'xe_exec_system_allocator --r
> > +		 * process-many-malloc' mallocs at least 64k at a time.
> > +		 */
> > +		if (check_pages &&
> > +		    !drm_gpusvm_check_pages(gpusvm, notifier, start, end)) {
> > +			++i;
> > +			goto retry;
> > +		}
> > +	}
> > +
> > +	return end - start;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_find_or_insert - Find or insert GPU SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @fault_addr: Fault address
> > + * @gpuva_start: Start address of GPUVA which mirrors CPU
> > + * @gpuva_end: End address of GPUVA which mirrors CPU
> > + * @ctx: GPU SVM context
> > + *
> > + * This function finds or inserts a newly allocated GPU SVM range based on the
> > + * fault address. Caller must hold a lock to protect range lookup and insertion.
> > + *
> > + * Returns:
> > + * Pointer to the GPU SVM range on success, ERR_PTR() on failure.
> > + */
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64 fault_addr,
> > +				u64 gpuva_start, u64 gpuva_end,
> > +				const struct drm_gpusvm_ctx *ctx)
> > +{
> > +	struct drm_gpusvm_notifier *notifier;
> > +	struct drm_gpusvm_range *range;
> > +	struct mm_struct *mm = gpusvm->mm;
> > +	struct vm_area_struct *vas;
> > +	bool notifier_alloc = false;
> > +	u64 chunk_size;
> > +	int err;
> > +	bool migrate_vram;
> > +
> > +	if (fault_addr < gpusvm->mm_start ||
> > +	    fault_addr > gpusvm->mm_start + gpusvm->mm_range) {
> > +		err = -EINVAL;
> > +		goto err_out;
> > +	}
> > +
> > +	if (!ctx->mmap_locked) {
> > +		if (!mmget_not_zero(mm)) {
> > +			err = -EFAULT;
> > +			goto err_out;
> > +		}
> > +		mmap_write_lock(mm);
> > +	}
> > +
> > +	mmap_assert_write_locked(mm);
> > +
> > +	notifier = drm_gpusvm_notifier_find(gpusvm, fault_addr);
> > +	if (!notifier) {
> > +		notifier = drm_gpusvm_notifier_alloc(gpusvm, fault_addr);
> > +		if (IS_ERR(notifier)) {
> > +			err = PTR_ERR(notifier);
> > +			goto err_mmunlock;
> > +		}
> > +		notifier_alloc = true;
> > +		err = mmu_interval_notifier_insert_locked(&notifier->notifier,
> > +							  mm, notifier->interval.start,
> > +							  notifier->interval.end -
> > +							  notifier->interval.start,
> > +							  &drm_gpusvm_notifier_ops);
> > +		if (err)
> > +			goto err_notifier;
> > +	}
> > +
> > +	vas = vma_lookup(mm, fault_addr);
> > +	if (!vas) {
> > +		err = -ENOENT;
> > +		goto err_notifier_remove;
> > +	}
> > +
> > +	if (!ctx->read_only && !(vas->vm_flags & VM_WRITE)) {
> > +		err = -EPERM;
> > +		goto err_notifier_remove;
> > +	}
> > +
> > +	range = drm_gpusvm_range_find(notifier, fault_addr, fault_addr + 1);
> > +	if (range)
> > +		goto out_mmunlock;
> > +	/*
> > +	 * XXX: Short-circuiting migration based on migrate_vma_* current
> > +	 * limitations. If/when migrate_vma_* add more support, this logic will
> > +	 * have to change.
> > +	 */
> > +	migrate_vram = ctx->vram_possible &&
> > +		vma_is_anonymous(vas) && !is_vm_hugetlb_page(vas);
> > +
> > +	chunk_size = drm_gpusvm_range_chunk_size(gpusvm, notifier, vas,
> > +						 fault_addr, gpuva_start,
> > +						 gpuva_end, migrate_vram &&
> > +						 !ctx->prefault);
> > +	if (chunk_size == LONG_MAX) {
> > +		err = -EINVAL;
> > +		goto err_notifier_remove;
> > +	}
> > +
> > +	range = drm_gpusvm_range_alloc(gpusvm, notifier, fault_addr, chunk_size,
> > +				       migrate_vram);
> > +	if (IS_ERR(range)) {
> > +		err = PTR_ERR(range);
> > +		goto err_notifier_remove;
> > +	}
> > +
> > +	drm_gpusvm_range_insert(notifier, range);
> > +	if (notifier_alloc)
> > +		drm_gpusvm_notifier_insert(gpusvm, notifier);
> > +
> > +	if (ctx->prefault) {
> > +		struct drm_gpusvm_ctx __ctx = *ctx;
> > +
> > +		__ctx.mmap_locked = true;
> > +		err = drm_gpusvm_range_get_pages(gpusvm, range, &__ctx);
> > +		if (err)
> > +			goto err_range_remove;
> > +	}
> > +
> > +out_mmunlock:
> > +	if (!ctx->mmap_locked) {
> > +		mmap_write_unlock(mm);
> > +		mmput(mm);
> > +	}
> > +
> > +	return range;
> > +
> > +err_range_remove:
> > +	__drm_gpusvm_range_remove(notifier, range);
> > +err_notifier_remove:
> > +	if (notifier_alloc)
> > +		mmu_interval_notifier_remove(&notifier->notifier);
> > +err_notifier:
> > +	if (notifier_alloc)
> > +		drm_gpusvm_notifier_free(gpusvm, notifier);
> > +err_mmunlock:
> > +	if (!ctx->mmap_locked) {
> > +		mmap_write_unlock(mm);
> > +		mmput(mm);
> > +	}
> > +err_out:
> > +	return ERR_PTR(err);
> > +}
> > +
> > +/**
> > + * for_each_dma_page - iterate over pages in a DMA region
> > + * @i__: the current page index in the iteration
> > + * @j__: the current page index, log order, in the iteration
> > + * @npages__: the total number of pages in the DMA region
> > + * @order__: the order of the pages in the DMA region
> > + *
> > + * This macro iterates over each page in a DMA region. The DMA region
> > + * is assumed to be composed of 2^@order__ pages, and the macro will
> > + * step through the region one block of 2^@order__ pages at a time.
> > + */
> > +#define for_each_dma_page(i__, j__, npages__, order__)	\
> > +	for ((i__) = 0, (j__) = 0; (i__) < (npages__);	\
> > +	     (j__)++, (i__) += 0x1 << (order__))
> > +
> > +/**
> > + * __drm_gpusvm_range_unmap_pages - Unmap pages associated with a GPU SVM range (internal)
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * This function unmaps pages associated with a GPU SVM range. Assumes and
> > + * asserts correct locking is in place when called.
> > + */
> > +static void __drm_gpusvm_range_unmap_pages(struct
> > drm_gpusvm *gpusvm,
> > +					   struct drm_gpusvm_range
> > *range)
> > +{
> > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > +
> > +	if (range->pages) {
> > +		unsigned long i, j, npages = npages_in_range(range-
> > >va.start,
> > +							     range-
> > >va.end);
> > +
> > +		if (range->flags.has_dma_mapping) {
> > +			for_each_dma_page(i, j, npages, range-
> > >order)
> > +				dma_unmap_page(gpusvm->drm-
> > >dev,
> > +					       range->dma_addr[j],
> > +					       PAGE_SIZE << range-
> > >order,
> > +					       DMA_BIDIRECTIONAL);
> > +		}
> > +
> > +		range->flags.has_vram_pages = false;
> > +		range->flags.has_dma_mapping = false;
> > +	}
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_free_pages - Free pages associated with a GPU SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * This function frees pages associated with a GPU SVM range.
> > + */
> > +static void drm_gpusvm_range_free_pages(struct drm_gpusvm
> > *gpusvm,
> > +					struct drm_gpusvm_range
> > *range)
> > +{
> > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > +
> > +	if (range->pages) {
> > +		if (range->flags.kfree_mapping) {
> > +			kfree(range->dma_addr);
> > +			range->flags.kfree_mapping = false;
> > +			range->pages = NULL;
> > +		} else {
> > +			kvfree(range->pages);
> > +			range->pages = NULL;
> > +		}
> > +	}
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_remove - Remove GPU SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range to be removed
> > + *
> > + * This function removes the specified GPU SVM range and also
> > removes the parent
> > + * GPU SVM notifier if no more ranges remain in the notifier. The
> > caller must
> > + * hold a lock to protect range and notifier removal.
> > + */
> > +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> > +			     struct drm_gpusvm_range *range)
> > +{
> > +	struct drm_gpusvm_notifier *notifier;
> > +
> > +	notifier = drm_gpusvm_notifier_find(gpusvm, range-
> > >va.start);
> > +	if (WARN_ON_ONCE(!notifier))
> > +		return;
> > +
> > +	drm_gpusvm_notifier_lock(gpusvm);
> > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > +	drm_gpusvm_range_free_pages(gpusvm, range);
> > +	__drm_gpusvm_range_remove(notifier, range);
> > +	drm_gpusvm_notifier_unlock(gpusvm);
> > +
> > +	drm_gpusvm_range_put(range);
> > +
> > +	if (RB_EMPTY_ROOT(&notifier->root.rb_root)) {
> > +		if (!notifier->flags.removed)
> > +			mmu_interval_notifier_remove(&notifier-
> > >notifier);
> > +		drm_gpusvm_notifier_remove(gpusvm, notifier);
> > +		drm_gpusvm_notifier_free(gpusvm, notifier);
> > +	}
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_get - Get a reference to GPU SVM range
> > + * @range: Pointer to the GPU SVM range
> > + *
> > + * This function increments the reference count of the specified
> > GPU SVM range.
> > + *
> > + * Returns:
> > + * Pointer to the GPU SVM range.
> > + */
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_get(struct drm_gpusvm_range *range)
> > +{
> > +	kref_get(&range->refcount);
> > +
> > +	return range;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_destroy - Destroy GPU SVM range
> > + * @refcount: Pointer to the reference counter embedded in the
> > GPU SVM range
> > + *
> > + * This function destroys the specified GPU SVM range when its
> > reference count
> > + * reaches zero. If a custom range-free function is provided, it is
> > invoked to
> > + * free the range; otherwise, the range is deallocated using kfree().
> > + */
> > +static void drm_gpusvm_range_destroy(struct kref *refcount)
> > +{
> > +	struct drm_gpusvm_range *range =
> > +		container_of(refcount, struct drm_gpusvm_range,
> > refcount);
> > +	struct drm_gpusvm *gpusvm = range->gpusvm;
> > +
> > +	if (gpusvm->ops->range_free)
> > +		gpusvm->ops->range_free(range);
> > +	else
> > +		kfree(range);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_put - Put a reference to GPU SVM range
> > + * @range: Pointer to the GPU SVM range
> > + *
> > + * This function decrements the reference count of the specified
> > GPU SVM range
> > + * and frees it when the count reaches zero.
> > + */
> > +void drm_gpusvm_range_put(struct drm_gpusvm_range *range)
> > +{
> > +	kref_put(&range->refcount, drm_gpusvm_range_destroy);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_pages_valid - GPU SVM range pages valid
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * This function determines if a GPU SVM range's pages are valid. It is
> > + * expected to be called while holding gpusvm->notifier_lock and as the last
> > + * step before committing a GPU binding.
> > + *
> > + * Returns:
> > + * True if GPU SVM range has valid pages, False otherwise
> > + */
> > +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> > +				  struct drm_gpusvm_range *range)
> > +{
> > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > +
> > +	return range->flags.has_vram_pages || range-
> > >flags.has_dma_mapping;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_pages_valid_unlocked - GPU SVM range
> > pages valid unlocked
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * This function determines if a GPU SVM range's pages are valid. It is
> > + * expected to be called without holding gpusvm->notifier_lock.
> > + *
> > + * Returns:
> > + * True if GPU SVM range has valid pages, False otherwise
> > + */
> > +static bool
> > +drm_gpusvm_range_pages_valid_unlocked(struct drm_gpusvm
> > *gpusvm,
> > +				      struct drm_gpusvm_range *range)
> > +{
> > +	bool pages_valid;
> > +
> > +	if (!range->pages)
> > +		return false;
> > +
> > +	drm_gpusvm_notifier_lock(gpusvm);
> > +	pages_valid = drm_gpusvm_range_pages_valid(gpusvm,
> > range);
> > +	if (!pages_valid && range->flags.kfree_mapping) {
> > +		kfree(range->dma_addr);
> > +		range->flags.kfree_mapping = false;
> > +		range->pages = NULL;
> > +	}
> > +	drm_gpusvm_notifier_unlock(gpusvm);
> > +
> > +	return pages_valid;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_get_pages - Get pages for a GPU SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + * @ctx: GPU SVM context
> > + *
> > + * This function gets pages for a GPU SVM range and ensures they
> > are mapped for
> > + * DMA access.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       const struct drm_gpusvm_ctx *ctx)
> > +{
> > +	struct mmu_interval_notifier *notifier = &range->notifier-
> > >notifier;
> > +	struct hmm_range hmm_range = {
> > +		.default_flags = HMM_PFN_REQ_FAULT | (ctx-
> > >read_only ? 0 :
> > +			HMM_PFN_REQ_WRITE),
> > +		.notifier = notifier,
> > +		.start = range->va.start,
> > +		.end = range->va.end,
> > +		.dev_private_owner = gpusvm-
> > >device_private_page_owner,
> > +	};
> > +	struct mm_struct *mm = gpusvm->mm;
> > +	unsigned long timeout =
> > +		jiffies +
> > msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> > +	unsigned long i, j;
> > +	unsigned long npages = npages_in_range(range->va.start,
> > range->va.end);
> > +	unsigned int order = 0;
> > +	unsigned long *pfns;
> > +	struct page **pages;
> > +	int err = 0;
> > +	bool vram_pages = !!range->flags.migrate_vram;
> > +	bool alloc_pfns = false, kfree_mapping;
> > +
> > +retry:
> > +	kfree_mapping = false;
> > +	hmm_range.notifier_seq =
> > mmu_interval_read_begin(notifier);
> > +	if (drm_gpusvm_range_pages_valid_unlocked(gpusvm,
> > range))
> > +		return 0;
> > +
> > +	if (range->notifier_seq == hmm_range.notifier_seq &&
> > range->pages) {
> > +		if (ctx->prefault)
> > +			return 0;
> > +
> > +		pfns = (unsigned long *)range->pages;
> > +		pages = range->pages;
> > +		goto map_pages;
> > +	}
> > +
> > +	if (!range->pages) {
> > +		pfns = kvmalloc_array(npages, sizeof(*pfns),
> > GFP_KERNEL);
> > +		if (!pfns)
> > +			return -ENOMEM;
> > +		alloc_pfns = true;
> > +	} else {
> > +		pfns = (unsigned long *)range->pages;
> > +	}
> > +
> > +	if (!ctx->mmap_locked) {
> > +		if (!mmget_not_zero(mm)) {
> > +			err = -EFAULT;
> > +			goto err_out;
> > +		}
> > +	}
> > +
> > +	hmm_range.hmm_pfns = pfns;
> > +	while (true) {
> > +		/* Must be checked after mmu_interval_read_begin
> > */
> > +		if (range->flags.unmapped) {
> > +			err = -EFAULT;
> > +			break;
> > +		}
> > +
> > +		if (!ctx->mmap_locked) {
> > +			/*
> > +			 * XXX: HMM locking document indicates only a read-lock
> > +			 * is required but there appears to be a window between
> > +			 * the MMU_NOTIFY_MIGRATE event triggered in a CPU fault
> > +			 * via migrate_vma_setup and the pages actually moving
> > +			 * in migrate_vma_finalize in which this code can grab
> > +			 * garbage pages. Grabbing the write-lock if the range
> > +			 * is attached to vram appears to protect against this
> > +			 * race.
> > +			 */
> > +			if (vram_pages)
> > +				mmap_write_lock(mm);
> > +			else
> > +				mmap_read_lock(mm);
> > +		}
> > +		err = hmm_range_fault(&hmm_range);
> > +		if (!ctx->mmap_locked) {
> > +			if (vram_pages)
> > +				mmap_write_unlock(mm);
> > +			else
> > +				mmap_read_unlock(mm);
> > +		}
> > +
> > +		if (err == -EBUSY) {
> > +			if (time_after(jiffies, timeout))
> > +				break;
> > +
> > +			hmm_range.notifier_seq =
> > mmu_interval_read_begin(notifier);
> > +			continue;
> > +		}
> > +		break;
> > +	}
> > +	if (!ctx->mmap_locked)
> > +		mmput(mm);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	pages = (struct page **)pfns;
> > +
> > +	if (ctx->prefault) {
> > +		range->pages = pages;
> > +		goto set_seqno;
> > +	}
> > +
> > +map_pages:
> > +	if (is_device_private_page(hmm_pfn_to_page(pfns[0]))) {
> > +		WARN_ON_ONCE(!range->vram_allocation);
> > +
> > +		for (i = 0; i < npages; ++i) {
> > +			pages[i] = hmm_pfn_to_page(pfns[i]);
> > +
> > +			if
> > (WARN_ON_ONCE(!is_device_private_page(pages[i]))) {
> > +				err = -EOPNOTSUPP;
> > +				goto err_free;
> > +			}
> > +		}
> > +
> > +		/* Do not race with notifier unmapping pages */
> > +		drm_gpusvm_notifier_lock(gpusvm);
> > +		range->flags.has_vram_pages = true;
> > +		range->pages = pages;
> > +		if (mmu_interval_read_retry(notifier,
> > hmm_range.notifier_seq)) {
> > +			err = -EAGAIN;
> > +
> > 	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > +		}
> > +		drm_gpusvm_notifier_unlock(gpusvm);
> > +	} else {
> > +		dma_addr_t *dma_addr = (dma_addr_t *)pfns;
> > +
> > +		for_each_dma_page(i, j, npages, order) {
> > +			if (WARN_ON_ONCE(i && order !=
> > +
> > hmm_pfn_to_map_order(pfns[i]))) {
> > +				err = -EOPNOTSUPP;
> > +				npages = i;
> > +				goto err_unmap;
> > +			}
> > +			order = hmm_pfn_to_map_order(pfns[i]);
> > +
> > +			pages[j] = hmm_pfn_to_page(pfns[i]);
> > +			if
> > (WARN_ON_ONCE(is_zone_device_page(pages[j]))) {
> > +				err = -EOPNOTSUPP;
> > +				npages = i;
> > +				goto err_unmap;
> > +			}
> > +
> > +			set_page_dirty_lock(pages[j]);
> > +			mark_page_accessed(pages[j]);
> > +
> > +			dma_addr[j] = dma_map_page(gpusvm-
> > >drm->dev,
> > +						   pages[j], 0,
> > +						   PAGE_SIZE << order,
> > +
> > DMA_BIDIRECTIONAL);
> > +			if (dma_mapping_error(gpusvm->drm->dev,
> > dma_addr[j])) {
> > +				err = -EFAULT;
> > +				npages = i;
> > +				goto err_unmap;
> > +			}
> > +		}
> > +
> > +		/* Huge pages, reduce memory footprint */
> > +		if (order) {
> > +			dma_addr = kmalloc_array(j,
> > sizeof(*dma_addr),
> > +						 GFP_KERNEL);
> > +			if (dma_addr) {
> > +				for (i = 0; i < j; ++i)
> > +					dma_addr[i] =
> > (dma_addr_t)pfns[i];
> > +				kvfree(pfns);
> > +				kfree_mapping = true;
> > +			} else {
> > +				dma_addr = (dma_addr_t *)pfns;
> > +			}
> > +		}
> > +
> > +		/* Do not race with notifier unmapping pages */
> > +		drm_gpusvm_notifier_lock(gpusvm);
> > +		range->order = order;
> > +		range->flags.kfree_mapping = kfree_mapping;
> > +		range->flags.has_dma_mapping = true;
> > +		range->dma_addr = dma_addr;
> > +		range->vram_allocation = NULL;
> > +		if (mmu_interval_read_retry(notifier,
> > hmm_range.notifier_seq)) {
> > +			err = -EAGAIN;
> > +
> > 	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > +		}
> > +		drm_gpusvm_notifier_unlock(gpusvm);
> > +	}
> > +
> > +	if (err == -EAGAIN)
> > +		goto retry;
> > +set_seqno:
> > +	range->notifier_seq = hmm_range.notifier_seq;
> > +
> > +	return 0;
> > +
> > +err_unmap:
> > +	for_each_dma_page(i, j, npages, order)
> > +		dma_unmap_page(gpusvm->drm->dev,
> > +			       (dma_addr_t)pfns[j],
> > +			       PAGE_SIZE << order,
> > DMA_BIDIRECTIONAL);
> > +err_free:
> > +	if (alloc_pfns)
> > +		kvfree(pfns);
> > +err_out:
> > +	return err;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_unmap_pages - Unmap pages associated
> > with a GPU SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + * @ctx: GPU SVM context
> > + *
> > + * This function unmaps pages associated with a GPU SVM range. If
> > @in_notifier
> > + * is set, it is assumed that gpusvm->notifier_lock is held in write
> > mode; if it
> > + * is clear, it acquires gpusvm->notifier_lock in read mode. Must be
> > called on
> > + * each GPU SVM range attached to notifier in gpusvm->ops-
> > >invalidate for IOMMU
> > + * security model.
> > + */
> > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm
> > *gpusvm,
> > +				  struct drm_gpusvm_range *range,
> > +				  const struct drm_gpusvm_ctx *ctx)
> > +{
> > +	if (ctx->in_notifier)
> > +		lockdep_assert_held_write(&gpusvm->notifier_lock);
> > +	else
> > +		drm_gpusvm_notifier_lock(gpusvm);
> > +
> > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > +
> > +	if (!ctx->in_notifier)
> > +		drm_gpusvm_notifier_unlock(gpusvm);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migration_put_page - Put a migration page
> > + * @page: Pointer to the page to put
> > + *
> > + * This function unlocks and puts a page.
> > + */
> > +static void drm_gpusvm_migration_put_page(struct page *page)
> > +{
> > +	unlock_page(page);
> > +	put_page(page);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migration_put_pages - Put migration pages
> > + * @npages: Number of pages
> > + * @migrate_pfn: Array of migrate page frame numbers
> > + *
> > + * This function puts an array of pages.
> > + */
> > +static void drm_gpusvm_migration_put_pages(unsigned long
> > npages,
> > +					   unsigned long *migrate_pfn)
> > +{
> > +	unsigned long i;
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		if (!migrate_pfn[i])
> > +			continue;
> > +
> > +
> > 	drm_gpusvm_migration_put_page(migrate_pfn_to_page(mi
> > grate_pfn[i]));
> > +		migrate_pfn[i] = 0;
> > +	}
> > +}
> > +
> > +/**
> > + * drm_gpusvm_get_vram_page - Get a reference to a VRAM page
> > + * @page: Pointer to the page
> > + * @zdd: Pointer to the GPU SVM zone device data
> > + *
> > + * This function associates the given page with the specified GPU
> > SVM zone
> > + * device data and initializes it for zone device usage.
> > + */
> > +static void drm_gpusvm_get_vram_page(struct page *page,
> > +				     struct drm_gpusvm_zdd *zdd)
> > +{
> > +	page->zone_device_data = drm_gpusvm_zdd_get(zdd);
> > +	zone_device_page_init(page);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_map_pages() - Map migration pages for
> > GPU SVM migration
> > + * @dev: The device for which the pages are being mapped
> > + * @dma_addr: Array to store DMA addresses corresponding to
> > mapped pages
> > + * @migrate_pfn: Array of migrate page frame numbers to map
> > + * @npages: Number of pages to map
> > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > + *
> > + * This function maps pages of memory for migration usage in GPU
> > SVM. It
> > + * iterates over each page frame number provided in @migrate_pfn,
> > maps the
> > + * corresponding page, and stores the DMA address in the provided
> > @dma_addr
> > + * array.
> > + *
> > + * Return: 0 on success, -EFAULT if an error occurs during mapping.
> > + */
> > +static int drm_gpusvm_migrate_map_pages(struct device *dev,
> > +					dma_addr_t *dma_addr,
> > +					long unsigned int
> > *migrate_pfn,
> > +					unsigned long npages,
> > +					enum dma_data_direction dir)
> > +{
> > +	unsigned long i;
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		struct page *page =
> > migrate_pfn_to_page(migrate_pfn[i]);
> > +
> > +		if (!page)
> > +			continue;
> > +
> > +		if (WARN_ON_ONCE(is_zone_device_page(page)))
> > +			return -EFAULT;
> > +
> > +		dma_addr[i] = dma_map_page(dev, page, 0,
> > PAGE_SIZE, dir);
> > +		if (dma_mapping_error(dev, dma_addr[i]))
> > +			return -EFAULT;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously
> > mapped for GPU SVM migration
> > + * @dev: The device for which the pages were mapped
> > + * @dma_addr: Array of DMA addresses corresponding to mapped
> > pages
> > + * @npages: Number of pages to unmap
> > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > + *
> > + * This function unmaps previously mapped pages of memory for
> > GPU Shared Virtual
> > + * Memory (SVM). It iterates over each DMA address provided in
> > @dma_addr, checks
> > + * if it's valid and not already unmapped, and unmaps the
> > corresponding page.
> > + */
> > +static void drm_gpusvm_migrate_unmap_pages(struct device *dev,
> > +					   dma_addr_t *dma_addr,
> > +					   unsigned long npages,
> > +					   enum dma_data_direction
> > dir)
> > +{
> > +	unsigned long i;
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		if (!dma_addr[i] || dma_mapping_error(dev,
> > dma_addr[i]))
> > +			continue;
> > +
> > +		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
> > +	}
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_to_vram - Migrate GPU SVM range to
> > VRAM
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + * @vram_allocation: Driver-private pointer to the VRAM allocation. The
> > + *                   caller should hold a reference to the VRAM allocation,
> > + *                   which should be dropped via ops->vram_release or upon
> > + *                   the failure of this function.
> > + * @ctx: GPU SVM context
> > + *
> > + * This function migrates the specified GPU SVM range to VRAM. It
> > performs the
> > + * necessary setup and invokes the driver-specific operations for
> > migration to
> > + * VRAM. Upon successful return, @vram_allocation can safely reference
> > + * @range until ops->vram_release is called, which only happens upon a
> > + * successful return.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       void *vram_allocation,
> > +			       const struct drm_gpusvm_ctx *ctx)
> > +{
> > +	u64 start = range->va.start, end = range->va.end;
> > +	struct migrate_vma migrate = {
> > +		.start		= start,
> > +		.end		= end,
> > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > +		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
> > +	};
> > +	struct mm_struct *mm = gpusvm->mm;
> > +	unsigned long i, npages = npages_in_range(start, end);
> > +	struct vm_area_struct *vas;
> > +	struct drm_gpusvm_zdd *zdd = NULL;
> > +	struct page **pages;
> > +	dma_addr_t *dma_addr;
> > +	void *buf;
> > +	int err;
> > +
> > +	if (!range->flags.migrate_vram)
> > +		return -EINVAL;
> > +
> > +	if (!gpusvm->ops->populate_vram_pfn || !gpusvm->ops->copy_to_vram ||
> > +	    !gpusvm->ops->copy_to_sram)
> > +		return -EOPNOTSUPP;
> > +
> > +	if (!ctx->mmap_locked) {
> > +		if (!mmget_not_zero(mm)) {
> > +			err = -EFAULT;
> > +			goto err_out;
> > +		}
> > +		mmap_write_lock(mm);
> > +	}
> > +
> > +	mmap_assert_locked(mm);
> > +
> > +	vas = vma_lookup(mm, start);
> > +	if (!vas) {
> > +		err = -ENOENT;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	if (end > vas->vm_end || start < vas->vm_start) {
> > +		err = -EINVAL;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	if (!vma_is_anonymous(vas)) {
> > +		err = -EBUSY;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) +
> > sizeof(*dma_addr) +
> > +		       sizeof(*pages), GFP_KERNEL);
> > +	if (!buf) {
> > +		err = -ENOMEM;
> > +		goto err_mmunlock;
> > +	}
> > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr))
> > * npages;
> > +
> > +	zdd = drm_gpusvm_zdd_alloc(range);
> > +	if (!zdd) {
> > +		err = -ENOMEM;
> > +		goto err_free;
> > +	}
> > +
> > +	migrate.vma = vas;
> > +	migrate.src = buf;
> > +	migrate.dst = migrate.src + npages;
> > +
> > +	err = migrate_vma_setup(&migrate);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	/*
> > +	 * FIXME: The cases below, !migrate.cpages and migrate.cpages !=
> > +	 * npages, are not always an error. Need to revisit possible cases
> > +	 * and how to handle them. We could prefault on migrate.cpages !=
> > +	 * npages via hmm_range_fault.
> > +	 */
> > +
> > +	if (!migrate.cpages) {
> > +		err = -EFAULT;
> > +		goto err_free;
> > +	}
> > +
> > +	if (migrate.cpages != npages) {
> > +		err = -EBUSY;
> > +		goto err_finalize;
> > +	}
> > +
> > +	err = gpusvm->ops->populate_vram_pfn(gpusvm,
> > vram_allocation, npages,
> > +					     migrate.dst);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > dma_addr,
> > +					   migrate.src, npages,
> > DMA_TO_DEVICE);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		struct page *page = pfn_to_page(migrate.dst[i]);
> > +
> > +		pages[i] = page;
> > +		migrate.dst[i] = migrate_pfn(migrate.dst[i]);
> > +		drm_gpusvm_get_vram_page(page, zdd);
> > +	}
> > +
> > +	err = gpusvm->ops->copy_to_vram(gpusvm, pages,
> > dma_addr, npages);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	/* Upon success bind vram allocation to range and zdd */
> > +	range->vram_allocation = vram_allocation;
> > +	WRITE_ONCE(zdd->vram_allocation, vram_allocation);	/* Owns ref */
> > +
> > +err_finalize:
> > +	if (err)
> > +		drm_gpusvm_migration_put_pages(npages,
> > migrate.dst);
> > +	migrate_vma_pages(&migrate);
> > +	migrate_vma_finalize(&migrate);
> > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev,
> > dma_addr, npages,
> > +				       DMA_TO_DEVICE);
> > +err_free:
> > +	if (zdd)
> > +		drm_gpusvm_zdd_put(zdd);
> > +	kvfree(buf);
> > +err_mmunlock:
> > +	if (!ctx->mmap_locked) {
> > +		mmap_write_unlock(mm);
> > +		mmput(mm);
> > +	}
> > +err_out:
> > +	return err;
> > +}
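
For reference, a minimal driver-side sketch of calling this (not part of the
patch; driver_vram_alloc()/driver_vram_put() are hypothetical helpers managing
a refcounted VRAM allocation). On failure the caller still owns the reference,
since ops->vram_release is only invoked after a successful migration:

	static int driver_try_migrate_to_vram(struct drm_gpusvm *gpusvm,
					      struct drm_gpusvm_range *range,
					      const struct drm_gpusvm_ctx *ctx)
	{
		/* Hypothetical allocator returning a refcounted object */
		void *vram = driver_vram_alloc(range->va.end - range->va.start);
		int err;

		if (!vram)
			return -ENOMEM;

		err = drm_gpusvm_migrate_to_vram(gpusvm, range, vram, ctx);
		if (err)
			driver_vram_put(vram);	/* reference is still ours */

		return err;
	}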
> > +
> > +/**
> > + * drm_gpusvm_migrate_populate_sram_pfn - Populate SRAM
> > PFNs for a VM area
> > + * @vas: Pointer to the VM area structure, can be NULL
> > + * @npages: Number of pages to populate
> > + * @src_mpfn: Source array of migrate PFNs
> > + * @mpfn: Array of migrate PFNs to populate
> > + * @addr: Start address for PFN allocation
> > + *
> > + * This function populates the SRAM migrate page frame numbers
> > (PFNs) for the
> > + * specified VM area structure. It allocates and locks pages in the VM
> > area for
> > + * SRAM usage. If @vas is non-NULL, alloc_page_vma() is used for the
> > + * allocation; otherwise alloc_page() is used.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +static int drm_gpusvm_migrate_populate_sram_pfn(struct
> > vm_area_struct *vas,
> > +						unsigned long npages,
> > +						unsigned long
> > *src_mpfn,
> > +						unsigned long *mpfn,
> > u64 addr)
> > +{
> > +	unsigned long i;
> > +
> > +	for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
> > +		struct page *page;
> > +
> > +		if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
> > +			continue;
> > +
> > +		if (vas)
> > +			page = alloc_page_vma(GFP_HIGHUSER, vas,
> > addr);
> > +		else
> > +			page = alloc_page(GFP_HIGHUSER);
> > +
> > +		if (!page)
> > +			return -ENOMEM;
> > +
> > +		lock_page(page);
> > +		mpfn[i] = migrate_pfn(page_to_pfn(page));
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_evict_to_sram - Evict GPU SVM range to SRAM
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * Similar to __drm_gpusvm_migrate_to_sram but does not require the mmap
> > + * lock; migration is done via the migrate_device_* functions. This is a
> > + * fallback path, as it is preferred to issue migrations with the mmap
> > + * lock held.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +static int drm_gpusvm_evict_to_sram(struct drm_gpusvm *gpusvm,
> > +				    struct drm_gpusvm_range *range)
> > +{
> > +	unsigned long npages;
> > +	struct page **pages;
> > +	unsigned long *src, *dst;
> > +	dma_addr_t *dma_addr;
> > +	void *buf;
> > +	int i, err = 0;
> > +
> > +	npages = npages_in_range(range->va.start, range->va.end);
> > +
> > +	buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr) +
> > +		       sizeof(*pages), GFP_KERNEL);
> > +	if (!buf) {
> > +		err = -ENOMEM;
> > +		goto err_out;
> > +	}
> > +	src = buf;
> > +	dst = buf + (sizeof(*src) * npages);
> > +	dma_addr = buf + (2 * sizeof(*src) * npages);
> > +	pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) * npages;
> > +
> > +	err = gpusvm->ops->populate_vram_pfn(gpusvm, range->vram_allocation,
> > +					     npages, src);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	err = migrate_device_vma_range(gpusvm->mm,
> > +				       gpusvm->device_private_page_owner, src,
> > +				       npages, range->va.start);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	err = drm_gpusvm_migrate_populate_sram_pfn(NULL,
> > npages, src, dst, 0);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > dma_addr,
> > +					   dst, npages,
> > DMA_BIDIRECTIONAL);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	for (i = 0; i < npages; ++i)
> > +		pages[i] = migrate_pfn_to_page(src[i]);
> > +
> > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages,
> > dma_addr, npages);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +err_finalize:
> > +	if (err)
> > +		drm_gpusvm_migration_put_pages(npages, dst);
> > +	migrate_device_pages(src, dst, npages);
> > +	migrate_device_finalize(src, dst, npages);
> > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev,
> > dma_addr, npages,
> > +				       DMA_BIDIRECTIONAL);
> > +err_free:
> > +	kvfree(buf);
> > +err_out:
> > +
> > +	return err;
> > +}
> > +
> > +/**
> > + * __drm_gpusvm_migrate_to_sram - Migrate GPU SVM range to
> > SRAM (internal)
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @vas: Pointer to the VM area structure
> > + * @page: Pointer to the page for fault handling (can be NULL)
> > + * @start: Start address of the migration range
> > + * @end: End address of the migration range
> > + *
> > + * This internal function performs the migration of the specified GPU
> > SVM range
> > + * to SRAM. It sets up the migration, populates + dma maps SRAM
> > PFNs, and
> > + * invokes the driver-specific operations for migration to SRAM.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +static int __drm_gpusvm_migrate_to_sram(struct drm_gpusvm
> > *gpusvm,
> > +					struct vm_area_struct *vas,
> > +					struct page *page,
> > +					u64 start, u64 end)
> > +{
> > +	struct migrate_vma migrate = {
> > +		.vma		= vas,
> > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > +		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
> > +		.fault_page	= page,
> > +	};
> > +	unsigned long npages;
> > +	struct page **pages;
> > +	dma_addr_t *dma_addr;
> > +	void *buf;
> > +	int i, err = 0;
> > +
> > +	mmap_assert_locked(gpusvm->mm);
> > +
> > +	/* Corner case where the VMA has been partially unmapped */
> > +	if (start < vas->vm_start)
> > +		start = vas->vm_start;
> > +	if (end > vas->vm_end)
> > +		end = vas->vm_end;
> > +
> > +	migrate.start = start;
> > +	migrate.end = end;
> > +	npages = npages_in_range(start, end);
> > +
> > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) +
> > sizeof(*dma_addr) +
> > +		       sizeof(*pages), GFP_KERNEL);
> > +	if (!buf) {
> > +		err = -ENOMEM;
> > +		goto err_out;
> > +	}
> > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr))
> > * npages;
> > +
> > +	migrate.vma = vas;
> > +	migrate.src = buf;
> > +	migrate.dst = migrate.src + npages;
> > +
> > +	err = migrate_vma_setup(&migrate);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	/* Raced with another CPU fault, nothing to do */
> > +	if (!migrate.cpages)
> > +		goto err_free;
> > +
> > +	err = drm_gpusvm_migrate_populate_sram_pfn(vas, npages,
> > +						   migrate.src,
> > migrate.dst,
> > +						   start);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > dma_addr,
> > +					   migrate.dst, npages,
> > +					   DMA_BIDIRECTIONAL);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	for (i = 0; i < npages; ++i)
> > +		pages[i] = migrate_pfn_to_page(migrate.src[i]);
> > +
> > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages,
> > dma_addr, npages);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +err_finalize:
> > +	if (err)
> > +		drm_gpusvm_migration_put_pages(npages,
> > migrate.dst);
> > +	migrate_vma_pages(&migrate);
> > +	migrate_vma_finalize(&migrate);
> > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev,
> > dma_addr, npages,
> > +				       DMA_BIDIRECTIONAL);
> > +err_free:
> > +	kvfree(buf);
> > +err_out:
> > +	mmap_assert_locked(gpusvm->mm);
> > +
> > +	return err;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_to_sram - Migrate (evict) GPU SVM range
> > to SRAM
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + * @ctx: GPU SVM context
> > + *
> > + * This function initiates the migration of the specified GPU SVM
> > range to
> > + * SRAM. It performs necessary checks and invokes the internal
> > migration
> > + * function for actual migration.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       const struct drm_gpusvm_ctx *ctx)
> > +{
> > +	u64 start = range->va.start, end = range->va.end;
> > +	struct mm_struct *mm = gpusvm->mm;
> > +	struct vm_area_struct *vas;
> > +	int err;
> > +	bool retry = false;
> > +
> > +	if (!ctx->mmap_locked) {
> > +		if (!mmget_not_zero(mm)) {
> > +			err = -EFAULT;
> > +			goto err_out;
> > +		}
> > +		if (ctx->trylock_mmap) {
> > +			if (!mmap_read_trylock(mm)) {
> > +				err = drm_gpusvm_evict_to_sram(gpusvm, range);
> > +				goto err_mmput;
> > +			}
> > +		} else {
> > +			mmap_read_lock(mm);
> > +		}
> > +	}
> > +
> > +	mmap_assert_locked(mm);
> > +
> > +	/*
> > +	 * Loop required to find all VMA area structs for the corner case
> > +	 * when VRAM backing has been partially unmapped from MM's
> > +	 * address space.
> > +	 */
> > +again:
> > +	vas = find_vma(mm, start);
> > +	if (!vas) {
> > +		if (!retry)
> > +			err = -ENOENT;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	if (end <= vas->vm_start || start >= vas->vm_end) {
> > +		if (!retry)
> > +			err = -EINVAL;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	err = __drm_gpusvm_migrate_to_sram(gpusvm, vas, NULL,
> > start, end);
> > +	if (err)
> > +		goto err_mmunlock;
> > +
> > +	if (vas->vm_end < end) {
> > +		retry = true;
> > +		start = vas->vm_end;
> > +		goto again;
> > +	}
> > +
> > +	if (!ctx->mmap_locked) {
> > +		mmap_read_unlock(mm);
> > +		/*
> > +		 * Using mmput_async as this function can be called while
> > +		 * holding a dma-resv lock, and a final put can grab the
> > +		 * mmap lock, causing a lock inversion.
> > +		 */
> > +		mmput_async(mm);
> > +	}
> > +
> > +	return 0;
> > +
> > +err_mmunlock:
> > +	if (!ctx->mmap_locked)
> > +		mmap_read_unlock(mm);
> > +err_mmput:
> > +	if (!ctx->mmap_locked)
> > +		mmput_async(mm);
> > +err_out:
> > +	return err;
> > +}
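
As an illustration (not in the patch), an eviction path that may already hold
a dma-resv lock would presumably set trylock_mmap so the function above can
fall back to drm_gpusvm_evict_to_sram() instead of blocking on the mmap lock:

	/* Hypothetical eviction-path caller; names are illustrative only. */
	static int driver_evict_range(struct drm_gpusvm *gpusvm,
				      struct drm_gpusvm_range *range)
	{
		struct drm_gpusvm_ctx ctx = {
			.trylock_mmap = true,	/* dma-resv may be held */
		};

		return drm_gpusvm_migrate_to_sram(gpusvm, range, &ctx);
	}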
> > +
> > +/**
> > + * drm_gpusvm_page_free - Put GPU SVM zone device data
> > associated with a page
> > + * @page: Pointer to the page
> > + *
> > + * This function is a callback used to put the GPU SVM zone device
> > data
> > + * associated with a page when it is being released.
> > + */
> > +static void drm_gpusvm_page_free(struct page *page)
> > +{
> > +	drm_gpusvm_zdd_put(page->zone_device_data);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_to_ram - Migrate GPU SVM range to RAM
> > (page fault handler)
> > + * @vmf: Pointer to the fault information structure
> > + *
> > + * This function is a page fault handler used to migrate a GPU SVM
> > range to RAM.
> > + * It retrieves the GPU SVM range information from the faulting
> > page and invokes
> > + * the internal migration function to migrate the range back to RAM.
> > + *
> > + * Returns:
> > + * VM_FAULT_SIGBUS on failure, 0 on success.
> > + */
> > +static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault
> > *vmf)
> > +{
> > +	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
> > +	int err;
> > +
> > +	err = __drm_gpusvm_migrate_to_sram(zdd->range->gpusvm,
> > +					   vmf->vma, vmf->page,
> > +					   zdd->range->va.start,
> > +					   zdd->range->va.end);
> > +
> > +	return err ? VM_FAULT_SIGBUS : 0;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_pagemap_ops - Device page map operations for
> > GPU SVM
> > + */
> > +static const struct dev_pagemap_ops drm_gpusvm_pagemap_ops =
> > {
> > +	.page_free = drm_gpusvm_page_free,
> > +	.migrate_to_ram = drm_gpusvm_migrate_to_ram,
> > +};
> > +
> > +/**
> > + * drm_gpusvm_pagemap_ops_get - Retrieve GPU SVM device
> > page map operations
> > + *
> > + * Returns:
> > + * Pointer to the GPU SVM device page map operations structure.
> > + */
> > +const struct dev_pagemap_ops
> > *drm_gpusvm_pagemap_ops_get(void)
> > +{
> > +	return &drm_gpusvm_pagemap_ops;
> > +}
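
A rough sketch of how a driver might wire these ops into its device-private
pagemap when onlining VRAM (the surrounding helper and resource handling are
hypothetical; the owner presumably has to match the device_private_page_owner
passed to drm_gpusvm_init()):

	static int driver_vram_pagemap_init(struct drm_device *drm,
					    struct dev_pagemap *pagemap,
					    struct resource *res)
	{
		void *addr;

		pagemap->type = MEMORY_DEVICE_PRIVATE;
		pagemap->range.start = res->start;
		pagemap->range.end = res->end;
		pagemap->nr_range = 1;
		pagemap->ops = drm_gpusvm_pagemap_ops_get();
		pagemap->owner = drm->dev;	/* must match the GPU SVM page owner */

		addr = devm_memremap_pages(drm->dev, pagemap);
		return PTR_ERR_OR_ZERO(addr);
	}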
> > +
> > +/**
> > + * drm_gpusvm_has_mapping - Check if GPU SVM has mapping for
> > the given address range
> > + * @gpusvm: Pointer to the GPU SVM structure.
> > + * @start: Start address
> > + * @end: End address
> > + *
> > + * Returns:
> > + * True if GPU SVM has mapping, False otherwise
> > + */
> > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64
> > start, u64 end)
> > +{
> > +	struct drm_gpusvm_notifier *notifier;
> > +
> > +	drm_gpusvm_for_each_notifier(notifier, gpusvm, start, end)
> > {
> > +		struct drm_gpusvm_range *range = NULL;
> > +
> > +		drm_gpusvm_for_each_range(range, notifier, start,
> > end)
> > +			return true;
> > +	}
> > +
> > +	return false;
> > +}
> > diff --git a/drivers/gpu/drm/xe/drm_gpusvm.h b/drivers/gpu/drm/xe/drm_gpusvm.h
> > new file mode 100644
> > index 000000000000..0ea70f8534a8
> > --- /dev/null
> > +++ b/drivers/gpu/drm/xe/drm_gpusvm.h
> > @@ -0,0 +1,415 @@
> > +/* SPDX-License-Identifier: MIT */
> > +/*
> > + * Copyright © 2024 Intel Corporation
> > + */
> > +
> > +#ifndef __DRM_GPUSVM_H__
> > +#define __DRM_GPUSVM_H__
> > +
> > +#include <linux/kref.h>
> > +#include <linux/mmu_notifier.h>
> > +#include <linux/workqueue.h>
> > +
> > +struct dev_pagemap_ops;
> > +struct drm_device;
> > +struct drm_gpusvm;
> > +struct drm_gpusvm_notifier;
> > +struct drm_gpusvm_ops;
> > +struct drm_gpusvm_range;
> > +
> > +/**
> > + * struct drm_gpusvm_ops - Operations structure for GPU SVM
> > + *
> > + * This structure defines the operations for GPU Shared Virtual
> > Memory (SVM).
> > + * These operations are provided by the GPU driver to manage SVM
> > ranges and
> > + * perform operations such as migration between VRAM and system
> > RAM.
> > + */
> > +struct drm_gpusvm_ops {
> > +	/**
> > +	 * @notifier_alloc: Allocate a GPU SVM notifier (optional)
> > +	 *
> > +	 * This function shall allocate a GPU SVM notifier.
> > +	 *
> > +	 * Returns:
> > +	 * Pointer to the allocated GPU SVM notifier on success, NULL
> > on failure.
> > +	 */
> > +	struct drm_gpusvm_notifier *(*notifier_alloc)(void);
> > +
> > +	/**
> > +	 * @notifier_free: Free a GPU SVM notifier (optional)
> > +	 * @notifier: Pointer to the GPU SVM notifier to be freed
> > +	 *
> > +	 * This function shall free a GPU SVM notifier.
> > +	 */
> > +	void (*notifier_free)(struct drm_gpusvm_notifier *notifier);
> > +
> > +	/**
> > +	 * @range_alloc: Allocate a GPU SVM range (optional)
> > +	 * @gpusvm: Pointer to the GPU SVM
> > +	 *
> > +	 * This function shall allocate a GPU SVM range.
> > +	 *
> > +	 * Returns:
> > +	 * Pointer to the allocated GPU SVM range on success, NULL
> > on failure.
> > +	 */
> > +	struct drm_gpusvm_range *(*range_alloc)(struct
> > drm_gpusvm *gpusvm);
> > +
> > +	/**
> > +	 * @range_free: Free a GPU SVM range (optional)
> > +	 * @range: Pointer to the GPU SVM range to be freed
> > +	 *
> > +	 * This function shall free a GPU SVM range.
> > +	 */
> > +	void (*range_free)(struct drm_gpusvm_range *range);
> > +
> > +	/**
> > +	 * @vram_release: Release VRAM allocation (optional)
> > +	 * @vram_allocation: Driver-private pointer to the VRAM
> > allocation
> > +	 *
> > +	 * This function shall release VRAM allocation and expects to
> > drop a
> > +	 * reference to VRAM allocation.
> > +	 */
> > +	void (*vram_release)(void *vram_allocation);
> > +
> > +	/**
> > +	 * @populate_vram_pfn: Populate VRAM PFN (required for
> > migration)
> > +	 * @gpusvm: Pointer to the GPU SVM
> > +	 * @vram_allocation: Driver-private pointer to the VRAM
> > allocation
> > +	 * @npages: Number of pages to populate
> > +	 * @pfn: Array of page frame numbers to populate
> > +	 *
> > +	 * This function shall populate VRAM page frame numbers
> > (PFN).
> > +	 *
> > +	 * Returns:
> > +	 * 0 on success, a negative error code on failure.
> > +	 */
> > +	int (*populate_vram_pfn)(struct drm_gpusvm *gpusvm,
> > +				 void *vram_allocation,
> > +				 unsigned long npages,
> > +				 unsigned long *pfn);
> > +
> > +	/**
> > +	 * @copy_to_vram: Copy to VRAM (required for migration)
> > +	 * @gpusvm: Pointer to the GPU SVM
> > +	 * @pages: Pointer to array of VRAM pages (destination)
> > +	 * @dma_addr: Pointer to array of DMA addresses (source)
> > +	 * @npages: Number of pages to copy
> > +	 *
> > +	 * This function shall copy pages to VRAM.
> > +	 *
> > +	 * Returns:
> > +	 * 0 on success, a negative error code on failure.
> > +	 */
> > +	int (*copy_to_vram)(struct drm_gpusvm *gpusvm,
> > +			    struct page **pages,
> > +			    dma_addr_t *dma_addr,
> > +			    unsigned long npages);
> > +
> > +	/**
> > +	 * @copy_to_sram: Copy to system RAM (required for
> > migration)
> > +	 * @gpusvm: Pointer to the GPU SVM
> > +	 * @pages: Pointer to array of VRAM pages (source)
> > +	 * @dma_addr: Pointer to array of DMA addresses
> > (destination)
> > +	 * @npages: Number of pages to copy
> > +	 *
> > +	 * This function shall copy pages to system RAM.
> > +	 *
> > +	 * Returns:
> > +	 * 0 on success, a negative error code on failure.
> > +	 */
> > +	int (*copy_to_sram)(struct drm_gpusvm *gpusvm,
> > +			    struct page **pages,
> > +			    dma_addr_t *dma_addr,
> > +			    unsigned long npages);
> > +
> > +	/**
> > +	 * @invalidate: Invalidate GPU SVM notifier (required)
> > +	 * @gpusvm: Pointer to the GPU SVM
> > +	 * @notifier: Pointer to the GPU SVM notifier
> > +	 * @mmu_range: Pointer to the mmu_notifier_range
> > structure
> > +	 *
> > +	 * This function shall invalidate the GPU page tables. It can
> > safely
> > +	 * walk the notifier range RB tree/list in this function. Called
> > while
> > +	 * holding the notifier lock.
> > +	 */
> > +	void (*invalidate)(struct drm_gpusvm *gpusvm,
> > +			   struct drm_gpusvm_notifier *notifier,
> > +			   const struct mmu_notifier_range
> > *mmu_range);
> > +};
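
To make the required/optional split concrete, a driver that only needs
invalidation could get away with a single hook, while migration support adds
the three migration hooks; the driver_* functions below are placeholders
(driver_invalidation as in the DOC example):

	/* Minimal sketch: only .invalidate is mandatory. */
	static const struct drm_gpusvm_ops driver_gpusvm_ops = {
		.invalidate = driver_invalidation,
	};

	/* With VRAM migration support: */
	static const struct drm_gpusvm_ops driver_gpusvm_migrate_ops = {
		.populate_vram_pfn = driver_populate_vram_pfn,
		.copy_to_vram = driver_copy_to_vram,
		.copy_to_sram = driver_copy_to_sram,
		.invalidate = driver_invalidation,
	};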
> > +
> > +/**
> > + * struct drm_gpusvm_notifier - Structure representing a GPU SVM
> > notifier
> > + *
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: MMU interval notifier
> > + * @interval: Interval for the notifier
> > + * @rb: Red-black tree node for the parent GPU SVM structure
> > notifier tree
> > + * @root: Cached root node of the RB tree containing ranges
> > + * @range_list: List head of ranges in the same order they appear in the
> > + *              interval tree. This is useful to keep iterating ranges while
> > + *              doing modifications to the RB tree.
> > + * @flags.removed: Flag indicating whether the MMU interval
> > notifier has been
> > + *                 removed
> > + *
> > + * This structure represents a GPU SVM notifier.
> > + */
> > +struct drm_gpusvm_notifier {
> > +	struct drm_gpusvm *gpusvm;
> > +	struct mmu_interval_notifier notifier;
> > +	struct {
> > +		u64 start;
> > +		u64 end;
> > +	} interval;
> > +	struct {
> > +		struct rb_node node;
> > +		struct list_head entry;
> > +		u64 __subtree_last;
> > +	} rb;
> > +	struct rb_root_cached root;
> > +	struct list_head range_list;
> > +	struct {
> > +		u32 removed : 1;
> > +	} flags;
> > +};
> > +
> > +/**
> > + * struct drm_gpusvm_range - Structure representing a GPU SVM
> > range
> > + *
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier
> > + * @refcount: Reference count for the range
> > + * @rb: Red-black tree node for the parent GPU SVM notifier
> > structure range tree
> > + * @va: Virtual address range
> > + * @notifier_seq: Notifier sequence number of the range's pages
> > + * @pages: Pointer to the array of pages (if backing store is in VRAM)
> > + * @dma_addr: DMA address array (if backing store is SRAM and
> > DMA mapped)
> > + * @vram_allocation: Driver-private pointer to the VRAM allocation
> > + * @order: Order of dma mapping. i.e. PAGE_SIZE << order is
> > mapping size
> > + * @flags.migrate_vram: Flag indicating whether the range can be
> > migrated to VRAM
> > + * @flags.unmapped: Flag indicating if the range has been
> > unmapped
> > + * @flags.partial_unmap: Flag indicating if the range has been
> > partially unmapped
> > + * @flags.has_vram_pages: Flag indicating if the range has vram
> > pages
> > + * @flags.has_dma_mapping: Flag indicating if the range has a DMA
> > mapping
> > + * @flags.kfree_mapping: Flag indicating @dma_addr is a compact
> > allocation based
> > + *                       on @order which releases via kfree
> > + *
> > + * This structure represents a GPU SVM range used for tracking
> > memory ranges
> > + * mapped in a DRM device.
> > + */
> > +struct drm_gpusvm_range {
> > +	struct drm_gpusvm *gpusvm;
> > +	struct drm_gpusvm_notifier *notifier;
> > +	struct kref refcount;
> > +	struct {
> > +		struct rb_node node;
> > +		struct list_head entry;
> > +		u64 __subtree_last;
> > +	} rb;
> > +	struct {
> > +		u64 start;
> > +		u64 end;
> > +	} va;
> > +	unsigned long notifier_seq;
> > +	union {
> > +		struct page **pages;
> > +		dma_addr_t *dma_addr;
> > +	};
> > +	void *vram_allocation;
> > +	u16 order;
> > +	struct {
> > +		/* All flags below must be set upon creation */
> > +		u16 migrate_vram : 1;
> > +		/* All flags below must be set / cleared under notifier
> > lock */
> > +		u16 unmapped : 1;
> > +		u16 partial_unmap : 1;
> > +		u16 has_vram_pages : 1;
> > +		u16 has_dma_mapping : 1;
> > +		u16 kfree_mapping : 1;
> > +	} flags;
> > +};
> > +
> > +/**
> > + * struct drm_gpusvm - GPU SVM structure
> > + *
> > + * @name: Name of the GPU SVM
> > + * @drm: Pointer to the DRM device structure
> > + * @mm: Pointer to the mm_struct for the address space
> > + * @device_private_page_owner: Device private pages owner
> > + * @mm_start: Start address of GPU SVM
> > + * @mm_range: Range of the GPU SVM
> > + * @notifier_size: Size of individual notifiers
> > + * @ops: Pointer to the operations structure for GPU SVM
> > + * @chunk_sizes: Pointer to the array of chunk sizes used in range
> > allocation.
> > + *               Entries should be powers of 2 in descending order.
> > + * @num_chunks: Number of chunks
> > + * @notifier_lock: Read-write semaphore for protecting notifier
> > operations
> > + * @zdd_wq: Workqueue for deferred work on zdd destruction
> > + * @root: Cached root node of the Red-Black tree containing GPU
> > SVM notifiers
> > + * @notifier_list: List head of notifiers in the same order they appear in
> > + *                 the interval tree. This is useful to keep iterating
> > + *                 notifiers while doing modifications to the RB tree.
> > + *
> > + * This structure represents a GPU SVM (Shared Virtual Memory)
> > used for tracking
> > + * memory ranges mapped in a DRM (Direct Rendering Manager)
> > device.
> > + *
> > + * No reference counting is provided, as this is expected to be
> > embedded in the
> > + * driver VM structure along with the struct drm_gpuvm, which
> > handles reference
> > + * counting.
> > + */
> > +struct drm_gpusvm {
> > +	const char *name;
> > +	struct drm_device *drm;
> > +	struct mm_struct *mm;
> > +	void *device_private_page_owner;
> > +	u64 mm_start;
> > +	u64 mm_range;
> > +	u64 notifier_size;
> > +	const struct drm_gpusvm_ops *ops;
> > +	const u64 *chunk_sizes;
> > +	int num_chunks;
> > +	struct rw_semaphore notifier_lock;
> > +	struct workqueue_struct *zdd_wq;
> > +	struct rb_root_cached root;
> > +	struct list_head notifier_list;
> > +};
> 
> I also think the gpusvm concept is a duplication of the drm_gpuvm.
> Look at the members here: mm_start, mm_range, rb_tree...
> 
> Maintaining a list of notifiers at this layer is odd. Everybody else
> seems to embed the notifier in a range...
> 
> The mm field is essential for SVM though. I think what we can do is
> introduce an *mm field in drm_gpuvm and introduce uAPI to allow the user
> to say that a gpuvm participates in SVM. If a gpuvm participates in SVM,
> we set the mm field for that gpuvm.
> 
> Another benefit of the proposed way is that multiple gpuvms can share an
> address space with a single CPU mm process.
> 
> 
> Oak
> 
> 
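Just to visualize the suggestion (a rough sketch only; the field and flag
names below are invented and nothing here is a concrete proposal):

	/* Hypothetical: drm_gpuvm gains an optional mm pointer for SVM. */
	struct drm_gpuvm {
		/* ...existing members... */
		struct mm_struct *mm;	/* non-NULL only if this VM participates in SVM */
	};

	/* Hypothetical uAPI flag set at VM creation time. */
	#define DRIVER_VM_CREATE_FLAG_SVM	(1u << 0)

With something like that, the SVM state could hang off the gpuvm rather than
a separate drm_gpusvm object.
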
> > +
> > +/**
> > + * struct drm_gpusvm_ctx - DRM GPU SVM context
> > + *
> > + * @mmap_locked: mmap lock is locked
> > + * @trylock_mmap: trylock mmap lock, used to avoid locking inversions
> > + *                (e.g. dma-resv -> mmap lock)
> > + * @in_notifier: entering from a MMU notifier
> > + * @read_only: operating on read-only memory
> > + * @vram_possible: possible to use VRAM
> > + * @prefault: prefault pages
> > + *
> > + * Context that DRM GPU SVM is operating in (i.e. user arguments).
> > + */
> > +struct drm_gpusvm_ctx {
> > +	u32 mmap_locked :1;
> > +	u32 trylock_mmap :1;
> > +	u32 in_notifier :1;
> > +	u32 read_only :1;
> > +	u32 vram_possible :1;
> > +	u32 prefault :1;
> > +};
> > +
> > +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> > +		    const char *name, struct drm_device *drm,
> > +		    struct mm_struct *mm, void
> > *device_private_page_owner,
> > +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> > +		    const struct drm_gpusvm_ops *ops,
> > +		    const u64 *chunk_sizes, int num_chunks);
> > +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm);
> > +void drm_gpusvm_free(struct drm_gpusvm *gpusvm);
> > +
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm,
> > u64 fault_addr,
> > +				u64 gpuva_start, u64 gpuva_end,
> > +				const struct drm_gpusvm_ctx *ctx);
> > +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> > +			     struct drm_gpusvm_range *range);
> > +
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_get(struct drm_gpusvm_range *range);
> > +void drm_gpusvm_range_put(struct drm_gpusvm_range *range);
> > +
> > +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> > +				  struct drm_gpusvm_range *range);
> > +
> > +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       const struct drm_gpusvm_ctx *ctx);
> > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm
> > *gpusvm,
> > +				  struct drm_gpusvm_range *range,
> > +				  const struct drm_gpusvm_ctx *ctx);
> > +
> > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       void *vram_allocation,
> > +			       const struct drm_gpusvm_ctx *ctx);
> > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       const struct drm_gpusvm_ctx *ctx);
> > +
> > +const struct dev_pagemap_ops
> > *drm_gpusvm_pagemap_ops_get(void);
> > +
> > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64
> > start, u64 end);
> > +
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64
> > start, u64 end);
> > +
> > +/**
> > + * drm_gpusvm_notifier_lock - Lock GPU SVM notifier
> > + * @gpusvm__: Pointer to the GPU SVM structure.
> > + *
> > + * Abstract client usage GPU SVM notifier lock, take lock
> > + */
> > +#define drm_gpusvm_notifier_lock(gpusvm__)	\
> > +	down_read(&(gpusvm__)->notifier_lock)
> > +
> > +/**
> > + * drm_gpusvm_notifier_unlock - Unlock GPU SVM notifier
> > + * @gpusvm__: Pointer to the GPU SVM structure.
> > + *
> > + * Abstract client usage GPU SVM notifier lock, drop lock
> > + */
> > +#define drm_gpusvm_notifier_unlock(gpusvm__)	\
> > +	up_read(&(gpusvm__)->notifier_lock)
> > +
> > +/**
> > + * __drm_gpusvm_range_next - Get the next GPU SVM range in the
> > list
> > + * @range: a pointer to the current GPU SVM range
> > + *
> > + * Return: A pointer to the next drm_gpusvm_range if available, or
> > NULL if the
> > + *         current range is the last one or if the input range is NULL.
> > + */
> > +static inline struct drm_gpusvm_range *
> > +__drm_gpusvm_range_next(struct drm_gpusvm_range *range)
> > +{
> > +	if (range && !list_is_last(&range->rb.entry,
> > +				   &range->notifier->range_list))
> > +		return list_next_entry(range, rb.entry);
> > +
> > +	return NULL;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_for_each_range - Iterate over GPU SVM ranges in a
> > notifier
> > + * @range__: Iterator variable for the ranges. If set, it indicates the
> > start of
> > + *	     the iterator. If NULL, call drm_gpusvm_range_find() to get
> > the range.
> > + * @notifier__: Pointer to the GPU SVM notifier
> > + * @start__: Start address of the range
> > + * @end__: End address of the range
> > + *
> > + * This macro is used to iterate over GPU SVM ranges in a notifier. It
> > is safe
> > + * to use while holding the driver SVM lock or the notifier lock.
> > + */
> > +#define drm_gpusvm_for_each_range(range__, notifier__, start__, end__)	\
> > +	for ((range__) = (range__) ?:					\
> > +	     drm_gpusvm_range_find((notifier__), (start__), (end__));	\
> > +	     (range__) && (range__->va.start < (end__));		\
> > +	     (range__) = __drm_gpusvm_range_next(range__))
> > +
> > +/**
> > + * drm_gpusvm_range_set_unmapped - Mark a GPU SVM range as
> > unmapped
> > + * @range: Pointer to the GPU SVM range structure.
> > + * @mmu_range: Pointer to the MMU notifier range structure.
> > + *
> > + * This function marks a GPU SVM range as unmapped and sets the
> > partial_unmap flag
> > + * if the range partially falls within the provided MMU notifier range.
> > + */
> > +static inline void
> > +drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range
> > *range,
> > +			      const struct mmu_notifier_range
> > *mmu_range)
> > +{
> > +	lockdep_assert_held_write(&range->gpusvm->notifier_lock);
> > +
> > +	range->flags.unmapped = true;
> > +	if (range->va.start < mmu_range->start ||
> > +	    range->va.end > mmu_range->end)
> > +		range->flags.partial_unmap = true;
> > +}
> > +
> > +#endif /* __DRM_GPUSVM_H__ */
> > --
> > 2.34.1
>
Thomas Hellström Sept. 24, 2024, 10:42 a.m. UTC | #37
Hi, Matt,

Some random review comments on this patch I came across while looking
at multi-device.

Thanks,
Thomas


On Tue, 2024-08-27 at 19:48 -0700, Matthew Brost wrote:
> This patch introduces support for GPU Shared Virtual Memory (SVM) in
> the
> Direct Rendering Manager (DRM) subsystem. SVM allows for seamless
> sharing of memory between the CPU and GPU, enhancing performance and
> flexibility in GPU computing tasks.
> 
> The patch adds the necessary infrastructure for SVM, including data
> structures and functions for managing SVM ranges and notifiers. It
> also
> provides mechanisms for allocating, deallocating, and migrating
> memory
> regions between system RAM and GPU VRAM.
> 
> This mid-layer is largely inspired by GPUVM.

NIT: Naming: should it be drm_svm rather than drm_gpusvm? For the
drm_gpuvm component, gpuvm clearly distinguished a gpu_vm from an
mm_struct, but here we don't have the same need.

> 
> Cc: Dave Airlie <airlied@redhat.com>
> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> Cc: Christian König <christian.koenig@amd.com>
> Cc: <dri-devel@lists.freedesktop.org>
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>  drivers/gpu/drm/xe/Makefile     |    3 +-
>  drivers/gpu/drm/xe/drm_gpusvm.c | 2174
> +++++++++++++++++++++++++++++++
>  drivers/gpu/drm/xe/drm_gpusvm.h |  415 ++++++
>  3 files changed, 2591 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.c
>  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.h
> 
> diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
> index b9670ae09a9e..b8fc2ee58f1a 100644
> --- a/drivers/gpu/drm/xe/Makefile
> +++ b/drivers/gpu/drm/xe/Makefile
> @@ -25,7 +25,8 @@ $(obj)/generated/%_wa_oob.c
> $(obj)/generated/%_wa_oob.h: $(obj)/xe_gen_wa_oob \
>  
>  # core driver code
>  
> -xe-y += xe_bb.o \
> +xe-y += drm_gpusvm.o \
> +	xe_bb.o \
>  	xe_bo.o \
>  	xe_bo_evict.o \
>  	xe_devcoredump.o \
> diff --git a/drivers/gpu/drm/xe/drm_gpusvm.c b/drivers/gpu/drm/xe/drm_gpusvm.c
> new file mode 100644
> index 000000000000..fc1e44e6ae72
> --- /dev/null
> +++ b/drivers/gpu/drm/xe/drm_gpusvm.c
> @@ -0,0 +1,2174 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2024 Intel Corporation
> + *
> + * Authors:
> + *     Matthew Brost <matthew.brost@intel.com>
> + */
> +
> +#include <linux/dma-mapping.h>
> +#include <linux/interval_tree_generic.h>
> +#include <linux/hmm.h>
> +#include <linux/memremap.h>
> +#include <linux/migrate.h>
> +#include <linux/mm_types.h>
> +#include <linux/pagemap.h>
> +#include <linux/slab.h>
> +
> +#include <drm/drm_device.h>
> +#include "drm_gpusvm.h"
> +
> +/**
> + * DOC: Overview
> + *
> + * GPU Shared Virtual Memory (GPU SVM) layer for the Direct
> Rendering Manager (DRM)
> + *
> + * The GPU SVM layer is a component of the DRM framework designed to
> manage shared
> + * virtual memory between the CPU and GPU. It enables efficient data
> exchange and
> + * processing for GPU-accelerated applications by allowing memory
> sharing and
> + * synchronization between the CPU's and GPU's virtual address
> spaces.
> + *
> + * Key GPU SVM Components:
> + * - Notifiers: Used for tracking memory intervals and notifying the
> + *		GPU of changes, notifiers are sized based on a GPU SVM
> + *		initialization parameter, with a recommendation of 512M or
> + *		larger. They maintain a Red-Black tree and a list of ranges
> + *		that fall within the notifier interval. Notifiers are tracked
> + *		within a GPU SVM Red-Black tree and list and are dynamically
> + *		inserted or removed as ranges within the interval are created
> + *		or destroyed.
> + * - Ranges: Represent memory ranges mapped in a DRM device and
> managed
> + *	     by GPU SVM. They are sized based on an array of chunk
> sizes, which
> + *	     is a GPU SVM initialization parameter, and the CPU
> address space.
> + *	     Upon GPU fault, the largest aligned chunk that fits
> within the
> + *	     faulting CPU address space is chosen for the range
> size. Ranges are
> + *	     expected to be dynamically allocated on GPU fault and
> removed on an
> + *	     MMU notifier UNMAP event. As mentioned above, ranges
> are tracked in
> + *	     a notifier's Red-Black tree.
> + * - Operations: Define the interface for driver-specific SVM
> operations such as
> + *		 allocation, page collection, migration,
> invalidations, and VRAM
> + *		 release.
> + *
> + * This layer provides interfaces for allocating, mapping,
> migrating, and
> + * releasing memory ranges between the CPU and GPU. It handles all
> core memory
> + * management interactions (DMA mapping, HMM, and migration) and
> provides
> + * driver-specific virtual functions (vfuncs). This infrastructure
> is sufficient
> + * to build the expected driver components for an SVM implementation
> as detailed
> + * below.
> + *
> + * Expected Driver Components:
> + * - GPU page fault handler: Used to create ranges and notifiers
> based on the
> + *			     fault address, optionally migrate the
> range to
> + *			     VRAM, and create GPU bindings.
> + * - Garbage collector: Used to destroy GPU bindings for ranges.
> Ranges are
> + *			expected to be added to the garbage
> collector upon
> + *			MMU_NOTIFY_UNMAP event.
> + */
> +
> +/**
> + * DOC: Locking
> + *
> + * GPU SVM handles locking for core MM interactions, i.e., it
> locks/unlocks the
> + * mmap lock as needed. Alternatively, if the driver prefers to
> handle the mmap
> + * lock itself, a 'locked' argument is provided to the functions
> that require
> + * the mmap lock. This option may be useful for drivers that need to
> call into
> + * GPU SVM while also holding a dma-resv lock, thus preventing
> locking
> + * inversions between the mmap and dma-resv locks.
> + *
> + * GPU SVM introduces a global notifier lock, which safeguards the
> notifier's
> + * range RB tree and list, as well as the range's DMA mappings and
> sequence
> + * number. GPU SVM manages all necessary locking and unlocking
> operations,
> + * except for the recheck of the range's sequence number
> + * (mmu_interval_read_retry) when the driver is committing GPU
> bindings. This
> + * lock corresponds to the 'driver->update' lock mentioned in the
> HMM
> + * documentation (TODO: Link). Future revisions may transition from
> a GPU SVM
> + * global lock to a per-notifier lock if finer-grained locking is
> deemed
> + * necessary.
> + *
> + * In addition to the locking mentioned above, the driver should
> implement a
> + * lock to safeguard core GPU SVM function calls that modify state,
> such as
> + * drm_gpusvm_range_find_or_insert and drm_gpusvm_range_remove.
> Alternatively,
> + * these core functions can be called within a single kernel thread,
> for
> + * instance, using an ordered work queue. This lock is denoted as
> + * 'driver_svm_lock' in code examples.
> + */
> +
> +/**
> + * DOC: Migration
> + *
> + * The migration support is quite simple, allowing migration between
> SRAM and
> + * VRAM at the range granularity. For example, GPU SVM currently
> does not
> + * support mixing SRAM and VRAM pages within a range. This means
> that upon GPU
> + * fault, the entire range can be migrated to VRAM, and upon CPU
> fault, the
> + * entire range is migrated to SRAM.
> + *
> + * The reasoning for only supporting range granularity is as
> follows: it
> + * simplifies the implementation, and range sizes are driver-defined
> and should
> + * be relatively small.
> + */
> +
> +/**
> + * DOC: Partial Unmapping of Ranges
> + *
> + * Partial unmapping of ranges (e.g., 1M out of 2M is unmapped by
> CPU resulting
> + * in MMU_NOTIFY_UNMAP event) presents several challenges, with the
> main one
> + * being that a subset of the range still has CPU and GPU mappings.
> If the
> + * backing store for the range is in VRAM, a subset of the backing
> store has
> + * references. One option would be to split the range and VRAM
> backing store,
> + * but the implementation for this would be quite complicated. Given
> that
> + * partial unmappings are rare and driver-defined range sizes are
> relatively
> + * small, GPU SVM does not support splitting of ranges.
> + *
> + * With no support for range splitting, upon partial unmapping of a
> range, the
> + * driver is expected to invalidate and destroy the entire range. If
> the range
> + * has VRAM as its backing, the driver is also expected to migrate
> any remaining
> + * pages back to SRAM.
> + */
> +
> +/**
> + * DOC: Examples
> + *
> + * This section provides two examples of how to build the expected
> driver
> + * components: the GPU page fault handler and the garbage collector.
> A third
> + * example demonstrates a sample invalidation driver vfunc.
> + *
> + * The generic code provided does not include logic for complex
> migration
> + * policies, optimized invalidations, or other potentially required
> driver
> + * locking (e.g., DMA-resv locks).
> + *
> + * 1) GPU page fault handler
> + *
> + *	int driver_bind_range(struct drm_gpusvm *gpusvm, struct
> drm_gpusvm_range *range)
> + *	{
> + *		int err = 0;
> + *
> + *		driver_alloc_and_setup_memory_for_bind(gpusvm,
> range);
> + *
> + *		drm_gpusvm_notifier_lock(gpusvm);
> + *		if (drm_gpusvm_range_pages_valid(range))
> + *			driver_commit_bind(gpusvm, range);
> + *		else
> + *			err = -EAGAIN;
> + *		drm_gpusvm_notifier_unlock(gpusvm);
> + *
> + *		return err;
> + *	}
> + *
> + *	int driver_gpu_fault(struct drm_gpusvm *gpusvm, u64
> fault_addr,
> + *			     u64 gpuva_start, u64 gpuva_end)
> + *	{
> + *		struct drm_gpusvm_ctx ctx = {};
> + *		int err;
> + *
> + *		driver_svm_lock();
> + *	retry:
> + *		// Always process UNMAPs first so view of GPU SVM
> ranges is current
> + *		driver_garbage_collector(gpusvm);
> + *
> + *		range = drm_gpusvm_range_find_or_insert(gpusvm,
> fault_addr,
> + *							gpuva_start,
> gpuva_end,
> + *						        &ctx);
> + *		if (IS_ERR(range)) {
> + *			err = PTR_ERR(range);
> + *			goto unlock;
> + *		}
> + *
> + *		if (driver_migration_policy(range)) {
> + *			bo = driver_alloc_bo();
> + *			err = drm_gpusvm_migrate_to_vram(gpusvm,
> range, bo, &ctx);
> + *			if (err)	// CPU mappings may have
> changed
> + *				goto retry;
> + *		}
> + *
> + *		err = drm_gpusvm_range_get_pages(gpusvm, range,
> &ctx);
> + *		if (err == -EFAULT || err == -EPERM)	// CPU
> mappings changed
> + *			goto retry;
> + *		else if (err)
> + *			goto unlock;
> + *
> + *		err = driver_bind_range(gpusvm, range);
> + *		if (err == -EAGAIN)	// CPU mappings changed
> + *			goto retry
> + *
> + *	unlock:
> + *		driver_svm_unlock();
> + *		return err;
> + *	}
> + *
> + * 2) Garbage Collector.
> + *
> + *	void __driver_garbage_collector(struct drm_gpusvm *gpusvm,
> + *					struct drm_gpusvm_range
> *range)
> + *	{
> + *		struct drm_gpusvm_ctx ctx = {};
> + *
> + *		assert_driver_svm_locked(gpusvm);
> + *
> + *		// Partial unmap, migrate any remaining VRAM pages
> back to SRAM
> + *		if (range->flags.partial_unmap)
> + *			drm_gpusvm_migrate_to_sram(gpusvm, range,
> &ctx);
> + *
> + *		driver_unbind_range(range);
> + *		drm_gpusvm_range_remove(gpusvm, range);
> + *	}
> + *
> + *	void driver_garbage_collector(struct drm_gpusvm *gpusvm)
> + *	{
> + *		assert_driver_svm_locked(gpusvm);
> + *
> + *		for_each_range_in_garbage_collector(gpusvm, range)
> + *			__driver_garbage_collector(gpusvm, range);
> + *	}
> + *
> + * 3) Invalidation driver vfunc.
> + *
> + *	void driver_invalidation(struct drm_gpusvm *gpusvm,
> + *				 struct drm_gpusvm_notifier
> *notifier,
> + *				 const struct mmu_notifier_range
> *mmu_range)
> + *	{
> + *		struct drm_gpusvm_ctx ctx = { .in_notifier = true,
> };
> + *		struct drm_gpusvm_range *range = NULL;
> + *
> + *		driver_invalidate_device_tlb(gpusvm, mmu_range->start, mmu_range->end);
> + *
> + *		drm_gpusvm_for_each_range(range, notifier,
> mmu_range->start,
> + *					  mmu_range->end) {
> + *			drm_gpusvm_range_unmap_pages(gpusvm, range,
> &ctx);
> + *
> + *			if (mmu_range->event != MMU_NOTIFY_UNMAP)
> + *				continue;
> + *
> + *			drm_gpusvm_range_set_unmapped(range,
> mmu_range);
> + *			driver_garbage_collector_add(gpusvm, range);
> + *		}
> + *	}
> + */
> +
> +#define DRM_GPUSVM_RANGE_START(_range)	((_range)->va.start)
> +#define DRM_GPUSVM_RANGE_END(_range)	((_range)->va.end - 1)
> +INTERVAL_TREE_DEFINE(struct drm_gpusvm_range, rb.node, u64,
> rb.__subtree_last,
> +		     DRM_GPUSVM_RANGE_START, DRM_GPUSVM_RANGE_END,
> +		     static __maybe_unused, range);
> +
> +#define DRM_GPUSVM_NOTIFIER_START(_notifier)	((_notifier)->interval.start)
> +#define DRM_GPUSVM_NOTIFIER_END(_notifier)	((_notifier)->interval.end - 1)
> +INTERVAL_TREE_DEFINE(struct drm_gpusvm_notifier, rb.node, u64,
> +		     rb.__subtree_last, DRM_GPUSVM_NOTIFIER_START,
> +		     DRM_GPUSVM_NOTIFIER_END, static __maybe_unused,
> notifier);
> +

Since these trees span struct mm_struct address space which should fit
in an unsigned long, can we use the generic version (interval_tree.h)
rather than instantiating two new versions? I figure both contain
overlapping ranges so we can't use maple trees?
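
For comparison, the generic version would look roughly like this (untested
sketch): embed a struct interval_tree_node instead of the open-coded rb
fields and use interval_tree_insert()/interval_tree_iter_first():

	#include <linux/interval_tree.h>

	struct drm_gpusvm_range {
		struct interval_tree_node itree;	/* .start/.last are inclusive */
		/* ... */
	};

	static void range_insert(struct drm_gpusvm_range *range,
				 struct rb_root_cached *root,
				 u64 start, u64 end)
	{
		range->itree.start = start;
		range->itree.last = end - 1;
		interval_tree_insert(&range->itree, root);
	}

	static struct drm_gpusvm_range *range_find(struct rb_root_cached *root,
						   u64 start, u64 end)
	{
		struct interval_tree_node *node;

		node = interval_tree_iter_first(root, start, end - 1);
		return node ? container_of(node, struct drm_gpusvm_range, itree) : NULL;
	}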

> +/**
> + * npages_in_range() - Calculate the number of pages in a given
> range
> + * @start__: The start address of the range
> + * @end__: The end address of the range
> + *
> + * This macro calculates the number of pages in a given memory
> range,
> + * specified by the start and end addresses. It divides the
> difference
> + * between the end and start addresses by the page size (PAGE_SIZE)
> to
> + * determine the number of pages in the range.
> + *
> + * Return: The number of pages in the specified range.
> + */
> +#define npages_in_range(start__, end__)	\
> +	(((end__) - (start__)) >> PAGE_SHIFT)
> +
> +/**
> + * struct drm_gpusvm_zdd - GPU SVM zone device data
> + *
> + * @refcount: Reference count for the zdd
> + * @destroy_work: Work structure for asynchronous zdd destruction
> + * @range: Pointer to the GPU SVM range
> + * @vram_allocation: Driver-private pointer to the VRAM allocation
> + *
> + * This structure serves as a generic wrapper installed in
> + * page->zone_device_data. It provides infrastructure for looking up
> a range
> + * upon CPU page fault and asynchronously releasing VRAM once the
> CPU has no
> + * page references. Asynchronous release is useful because CPU page
> references
> + * can be dropped in IRQ contexts, while releasing VRAM likely
> requires sleeping
> + * locks.
> + */
> +struct drm_gpusvm_zdd {
> +	struct kref refcount;
> +	struct work_struct destroy_work;
> +	struct drm_gpusvm_range *range;
 
I still believe previous review comments are valid here, considering we
do have multiple drm_gpusvm per struct mm_struct, potentially all
mapping the above page.

> +	void *vram_allocation;

NIT: Naming. The core is using device memory or devmem. Should we
follow?

Also, rather than using a void *, could we use an embeddable struct
with its own ops instead of using the gpusvm ops for this?
> +};
> +
> +/**
> + * drm_gpusvm_zdd_destroy_work_func - Work function for destroying a
> zdd
> + * @w: Pointer to the work_struct
> + *
> + * This function releases VRAM, puts GPU SVM range, and frees zdd.
> + */
> +static void drm_gpusvm_zdd_destroy_work_func(struct work_struct *w)
> +{
> +	struct drm_gpusvm_zdd *zdd =
> +		container_of(w, struct drm_gpusvm_zdd,
> destroy_work);
> +	struct drm_gpusvm_range *range = zdd->range;
> +	struct drm_gpusvm *gpusvm = range->gpusvm;
> +
> +	if (gpusvm->ops->vram_release && zdd->vram_allocation)
> +		gpusvm->ops->vram_release(zdd->vram_allocation);
> +	drm_gpusvm_range_put(range);
> +	kfree(zdd);
> +}
> +
> +/**
> + * drm_gpusvm_zdd_alloc - Allocate a zdd structure.
> + * @range: Pointer to the GPU SVM range.
> + *
> + * This function allocates and initializes a new zdd structure. It
> sets up the
> + * reference count, initializes the destroy work, and links the
> provided GPU SVM
> + * range.
> + *
> + * Returns:
> + * Pointer to the allocated zdd on success, ERR_PTR() on failure.
> + */
> +static struct drm_gpusvm_zdd *
> +drm_gpusvm_zdd_alloc(struct drm_gpusvm_range *range)
> +{
> +	struct drm_gpusvm_zdd *zdd;
> +
> +	zdd = kmalloc(sizeof(*zdd), GFP_KERNEL);
> +	if (!zdd)
> +		return NULL;
> +
> +	kref_init(&zdd->refcount);
> +	INIT_WORK(&zdd->destroy_work,
> drm_gpusvm_zdd_destroy_work_func);
> +	zdd->range = drm_gpusvm_range_get(range);
> +	zdd->vram_allocation = NULL;
> +
> +	return zdd;
> +}
> +
> +/**
> + * drm_gpusvm_zdd_get - Get a reference to a zdd structure.
> + * @zdd: Pointer to the zdd structure.
> + *
> + * This function increments the reference count of the provided zdd
> structure.
> + *
> + * Returns: Pointer to the zdd structure.
> + */
> +static struct drm_gpusvm_zdd *drm_gpusvm_zdd_get(struct
> drm_gpusvm_zdd *zdd)
> +{
> +	kref_get(&zdd->refcount);
> +	return zdd;
> +}
> +
> +/**
> + * drm_gpusvm_zdd_destroy - Destroy a zdd structure.
> + * @ref: Pointer to the reference count structure.
> + *
> + * This function queues the destroy_work of the zdd for asynchronous
> destruction.
> + */
> +static void drm_gpusvm_zdd_destroy(struct kref *ref)
> +{
> +	struct drm_gpusvm_zdd *zdd =
> +		container_of(ref, struct drm_gpusvm_zdd, refcount);
> +	struct drm_gpusvm *gpusvm = zdd->range->gpusvm;
> +
> +	queue_work(gpusvm->zdd_wq, &zdd->destroy_work);
> +}
> +
> +/**
> + * drm_gpusvm_zdd_put - Put a zdd reference.
> + * @zdd: Pointer to the zdd structure.
> + *
> + * This function decrements the reference count of the provided zdd
> structure
> + * and schedules its destruction if the count drops to zero.
> + */
> +static void drm_gpusvm_zdd_put(struct drm_gpusvm_zdd *zdd)
> +{
> +	kref_put(&zdd->refcount, drm_gpusvm_zdd_destroy);
> +}
> +
> +/**
> + * drm_gpusvm_range_find - Find GPU SVM range from GPU SVM notifier
> + * @notifier: Pointer to the GPU SVM notifier structure.
> + * @start: Start address of the range
> + * @end: End address of the range
> + *
> + * Return: A pointer to the drm_gpusvm_range if found or NULL
> + */
> +struct drm_gpusvm_range *
> +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64
> start, u64 end)
> +{
> +	return range_iter_first(&notifier->root, start, end - 1);
> +}
> +
> +/**
> + * drm_gpusvm_for_each_range_safe - Safely iterate over GPU SVM
> ranges in a notifier
> + * @range__: Iterator variable for the ranges
> + * @next__: Iterator variable for the ranges temporary storage
> + * @notifier__: Pointer to the GPU SVM notifier
> + * @start__: Start address of the range
> + * @end__: End address of the range
> + *
> + * This macro is used to iterate over GPU SVM ranges in a notifier
> while
> + * removing ranges from it.
> + */
> +#define drm_gpusvm_for_each_range_safe(range__, next__, notifier__,
> start__, end__)	\
> +	for ((range__) = drm_gpusvm_range_find((notifier__),
> (start__), (end__)),	\
> +	     (next__) =
> __drm_gpusvm_range_next(range__);				\
> +	     (range__) && (range__->va.start <
> (end__));				\
> +	     (range__) = (next__), (next__) =
> __drm_gpusvm_range_next(range__))
> +
> +/**
> + * __drm_gpusvm_notifier_next - get the next drm_gpusvm_notifier in
> the list
> + * @notifier: a pointer to the current drm_gpusvm_notifier
> + *
> + * Return: A pointer to the next drm_gpusvm_notifier if available,
> or NULL if
> + *         the current notifier is the last one or if the input
> notifier is
> + *         NULL.
> + */
> +static struct drm_gpusvm_notifier *
> +__drm_gpusvm_notifier_next(struct drm_gpusvm_notifier *notifier)
> +{
> +	if (notifier && !list_is_last(&notifier->rb.entry,
> +				      &notifier->gpusvm->notifier_list))
> +		return list_next_entry(notifier, rb.entry);
> +
> +	return NULL;
> +}
> +
> +/**
> + * drm_gpusvm_for_each_notifier - Iterate over GPU SVM notifiers in
> a gpusvm
> + * @notifier__: Iterator variable for the notifiers
> + * @gpusvm__: Pointer to the GPU SVM structure
> + * @start__: Start address of the notifier
> + * @end__: End address of the notifier
> + *
> + * This macro is used to iterate over GPU SVM notifiers in a gpusvm.
> + */
> +#define drm_gpusvm_for_each_notifier(notifier__, gpusvm__, start__,
> end__)		\
> +	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root,
> (start__), (end__) - 1);	\
> +	     (notifier__) && (notifier__->interval.start <
> (end__));			\
> +	     (notifier__) = __drm_gpusvm_notifier_next(notifier__))
> +
> +/**
> + * drm_gpusvm_for_each_notifier_safe - Safely iterate over GPU SVM
> notifiers in a gpusvm
> + * @notifier__: Iterator variable for the notifiers
> + * @next__: Iterator variable for the notifiers temporary storage
> + * @gpusvm__: Pointer to the GPU SVM structure
> + * @start__: Start address of the notifier
> + * @end__: End address of the notifier
> + *
> + * This macro is used to iterate over GPU SVM notifiers in a gpusvm
> while
> + * removing notifiers from it.
> + */
> +#define drm_gpusvm_for_each_notifier_safe(notifier__, next__,
> gpusvm__, start__, end__)	\
> +	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root,
> (start__), (end__) - 1),	\
> +	     (next__) =
> __drm_gpusvm_notifier_next(notifier__);				\
> +	     (notifier__) && (notifier__->interval.start <
> (end__));			\
> +	     (notifier__) = (next__), (next__) =
> __drm_gpusvm_notifier_next(notifier__))
> +
> +/**
> + * drm_gpusvm_notifier_invalidate - Invalidate a GPU SVM notifier.
> + * @mni: Pointer to the mmu_interval_notifier structure.
> + * @mmu_range: Pointer to the mmu_notifier_range structure.
> + * @cur_seq: Current sequence number.
> + *
> + * This function serves as a generic MMU notifier for GPU SVM. It
> sets the MMU
> + * notifier sequence number and calls the driver invalidate vfunc
> under
> + * gpusvm->notifier_lock.
> + *
> + * Returns:
> + * true if the operation succeeds, false otherwise.
> + */
> +static bool
> +drm_gpusvm_notifier_invalidate(struct mmu_interval_notifier *mni,
> +			       const struct mmu_notifier_range
> *mmu_range,
> +			       unsigned long cur_seq)
> +{
> +	struct drm_gpusvm_notifier *notifier =
> +		container_of(mni, typeof(*notifier), notifier);
> +	struct drm_gpusvm *gpusvm = notifier->gpusvm;
> +
> +	if (!mmu_notifier_range_blockable(mmu_range))
> +		return false;
> +
> +	down_write(&gpusvm->notifier_lock);
> +	mmu_interval_set_seq(mni, cur_seq);
> +	gpusvm->ops->invalidate(gpusvm, notifier, mmu_range);
> +	up_write(&gpusvm->notifier_lock);
> +
> +	return true;
> +}
> +
> +/**
> + * drm_gpusvm_notifier_ops - MMU interval notifier operations for
> GPU SVM
> + */
> +static const struct mmu_interval_notifier_ops
> drm_gpusvm_notifier_ops = {
> +	.invalidate = drm_gpusvm_notifier_invalidate,
> +};
> +
> +/**
> + * drm_gpusvm_init - Initialize the GPU SVM.
> + * @gpusvm: Pointer to the GPU SVM structure.
> + * @name: Name of the GPU SVM.
> + * @drm: Pointer to the DRM device structure.
> + * @mm: Pointer to the mm_struct for the address space.
> + * @device_private_page_owner: Device private pages owner.
> + * @mm_start: Start address of GPU SVM.
> + * @mm_range: Range of the GPU SVM.
> + * @notifier_size: Size of individual notifiers.
> + * @ops: Pointer to the operations structure for GPU SVM.
> + * @chunk_sizes: Pointer to the array of chunk sizes used in range
> allocation.
> + *               Entries should be powers of 2 in descending order
> with last
> + *               entry being SZ_4K.
> + * @num_chunks: Number of chunks.
> + *
> + * This function initializes the GPU SVM.
> + *
> + * Returns:
> + * 0 on success, a negative error code on failure.
> + */
> +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> +		    const char *name, struct drm_device *drm,
> +		    struct mm_struct *mm, void
> *device_private_page_owner,
> +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> +		    const struct drm_gpusvm_ops *ops,
> +		    const u64 *chunk_sizes, int num_chunks)
> +{
> +	if (!ops->invalidate || !num_chunks)
> +		return -EINVAL;
> +
> +	gpusvm->name = name;
> +	gpusvm->drm = drm;
> +	gpusvm->mm = mm;
> +	gpusvm->device_private_page_owner =
> device_private_page_owner;
> +	gpusvm->mm_start = mm_start;
> +	gpusvm->mm_range = mm_range;
> +	gpusvm->notifier_size = notifier_size;
> +	gpusvm->ops = ops;
> +	gpusvm->chunk_sizes = chunk_sizes;
> +	gpusvm->num_chunks = num_chunks;
> +	gpusvm->zdd_wq = system_wq;
> +
> +	mmgrab(mm);
> +	gpusvm->root = RB_ROOT_CACHED;
> +	INIT_LIST_HEAD(&gpusvm->notifier_list);
> +
> +	init_rwsem(&gpusvm->notifier_lock);
> +
> +	fs_reclaim_acquire(GFP_KERNEL);
> +	might_lock(&gpusvm->notifier_lock);
> +	fs_reclaim_release(GFP_KERNEL);
> +
> +	return 0;
> +}
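
A minimal usage sketch of the chunk_sizes contract might be worth
adding to the kerneldoc. Untested, the foo_* names and the
address-space numbers below are made up:

static const u64 foo_svm_chunk_sizes[] = { SZ_2M, SZ_64K, SZ_4K };

int foo_svm_init(struct foo_vm *vm)
{
	return drm_gpusvm_init(&vm->svm, "foo-svm", &vm->foo->drm,
			       current->mm, vm->foo /* page owner */,
			       0, 1ull << 47,	/* mm_start, mm_range */
			       SZ_512M,		/* notifier_size */
			       &foo_gpusvm_ops, foo_svm_chunk_sizes,
			       ARRAY_SIZE(foo_svm_chunk_sizes));
}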
> +
> +/**
> + * drm_gpusvm_notifier_find - Find GPU SVM notifier
> + * @gpusvm__: Pointer to the GPU SVM structure
> + * @fault_addr__: Fault address
> + *
> + * This macro finds the GPU SVM notifier associated with the fault
> address.
> + *
> + * Returns:
> + * Pointer to the GPU SVM notifier on success, NULL otherwise.
> + */
> +#define drm_gpusvm_notifier_find(gpusvm__, fault_addr__)	\
> +	notifier_iter_first(&(gpusvm__)->root, (fault_addr__),	\
> +			    ((fault_addr__) + 1))
> +
> +/**
> + * to_drm_gpusvm_notifier - retrieve the container struct for a
> given rbtree node
> + * @node__: a pointer to the rbtree node embedded within a
> drm_gpusvm_notifier struct
> + *
> + * Return: A pointer to the containing drm_gpusvm_notifier
> structure.
> + */
> +#define to_drm_gpusvm_notifier(node__)				\
> +	container_of((node__), struct drm_gpusvm_notifier, rb.node)
> +
> +/**
> + * drm_gpusvm_notifier_insert - Insert GPU SVM notifier
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: Pointer to the GPU SVM notifier structure
> + *
> + * This function inserts the GPU SVM notifier into the GPU SVM RB
> tree and list.
> + */
> +static void drm_gpusvm_notifier_insert(struct drm_gpusvm *gpusvm,
> +				       struct drm_gpusvm_notifier
> *notifier)
> +{
> +	struct rb_node *node;
> +	struct list_head *head;
> +
> +	notifier_insert(notifier, &gpusvm->root);
> +
> +	node = rb_prev(&notifier->rb.node);
> +	if (node)
> +		head = &(to_drm_gpusvm_notifier(node))->rb.entry;
> +	else
> +		head = &gpusvm->notifier_list;
> +
> +	list_add(&notifier->rb.entry, head);
> +}
> +
> +/**
> + * drm_gpusvm_notifier_remove - Remove GPU SVM notifier
> + * @gpusvm__: Pointer to the GPU SVM structure
> + * @notifier__: Pointer to the GPU SVM notifier structure
> + *
> + * This macro removes the GPU SVM notifier from the GPU SVM RB tree
> and list.
> + */
> +#define drm_gpusvm_notifier_remove(gpusvm__, notifier__)		\
> +	do {								\
> +		notifier_remove((notifier__), &(gpusvm__)->root);	\
> +		list_del(&(notifier__)->rb.entry);			\
> +	} while (0)
> +
> +/**
> + * drm_gpusvm_fini - Finalize the GPU SVM.
> + * @gpusvm: Pointer to the GPU SVM structure.
> + *
> + * This function finalizes the GPU SVM by cleaning up any remaining
> ranges and
> + * notifiers, and dropping a reference to struct MM.
> + */
> +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm)
> +{
> +	struct drm_gpusvm_notifier *notifier, *next;
> +
> +	drm_gpusvm_for_each_notifier_safe(notifier, next, gpusvm, 0,
> LONG_MAX) {
> +		struct drm_gpusvm_range *range, *__next;
> +
> +		/*
> +		 * Remove notifier first to avoid racing with any
> invalidation
> +		 */
> +		mmu_interval_notifier_remove(&notifier->notifier);
> +		notifier->flags.removed = true;
> +
> +		drm_gpusvm_for_each_range_safe(range, __next,
> notifier, 0,
> +					       LONG_MAX)
> +			drm_gpusvm_range_remove(gpusvm, range);
> +	}
> +
> +	mmdrop(gpusvm->mm);
> +	WARN_ON(!RB_EMPTY_ROOT(&gpusvm->root.rb_root));
> +}
> +
> +/**
> + * drm_gpusvm_notifier_alloc - Allocate GPU SVM notifier
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @fault_addr: Fault address
> + *
> + * This function allocates and initializes the GPU SVM notifier
> structure.
> + *
> + * Returns:
> + * Pointer to the allocated GPU SVM notifier on success, ERR_PTR()
> on failure.
> + */
> +static struct drm_gpusvm_notifier *
> +drm_gpusvm_notifier_alloc(struct drm_gpusvm *gpusvm, u64 fault_addr)
> +{
> +	struct drm_gpusvm_notifier *notifier;
> +
> +	if (gpusvm->ops->notifier_alloc)
> +		notifier = gpusvm->ops->notifier_alloc();
> +	else
> +		notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
> +
> +	if (!notifier)
> +		return ERR_PTR(-ENOMEM);
> +
> +	notifier->gpusvm = gpusvm;
> +	notifier->interval.start = ALIGN_DOWN(fault_addr, gpusvm-
> >notifier_size);
> +	notifier->interval.end = ALIGN(fault_addr + 1, gpusvm-
> >notifier_size);
> +	INIT_LIST_HEAD(&notifier->rb.entry);
> +	notifier->root = RB_ROOT_CACHED;
> +	INIT_LIST_HEAD(&notifier->range_list);
> +
> +	return notifier;
> +}
> +
> +/**
> + * drm_gpusvm_notifier_free - Free GPU SVM notifier
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: Pointer to the GPU SVM notifier structure
> + *
> + * This function frees the GPU SVM notifier structure.
> + */
> +static void drm_gpusvm_notifier_free(struct drm_gpusvm *gpusvm,
> +				     struct drm_gpusvm_notifier
> *notifier)
> +{
> +	WARN_ON(!RB_EMPTY_ROOT(&notifier->root.rb_root));
> +
> +	if (gpusvm->ops->notifier_free)
> +		gpusvm->ops->notifier_free(notifier);
> +	else
> +		kfree(notifier);
> +}
> +
> +/**
> + * to_drm_gpusvm_range - retrieve the container struct for a given
> rbtree node
> + * @node__: a pointer to the rbtree node embedded within a
> drm_gpusvm_range struct
> + *
> + * Return: A pointer to the containing drm_gpusvm_range structure.
> + */
> +#define to_drm_gpusvm_range(node__)	\
> +	container_of((node__), struct drm_gpusvm_range, rb.node)
> +
> +/**
> + * drm_gpusvm_range_insert - Insert GPU SVM range
> + * @notifier: Pointer to the GPU SVM notifier structure
> + * @range: Pointer to the GPU SVM range structure
> + *
> + * This function inserts the GPU SVM range into the notifier RB tree
> and list.
> + */
> +static void drm_gpusvm_range_insert(struct drm_gpusvm_notifier
> *notifier,
> +				    struct drm_gpusvm_range *range)
> +{
> +	struct rb_node *node;
> +	struct list_head *head;
> +
> +	drm_gpusvm_notifier_lock(notifier->gpusvm);
> +	range_insert(range, &notifier->root);
> +
> +	node = rb_prev(&range->rb.node);
> +	if (node)
> +		head = &(to_drm_gpusvm_range(node))->rb.entry;
> +	else
> +		head = &notifier->range_list;
> +
> +	list_add(&range->rb.entry, head);
> +	drm_gpusvm_notifier_unlock(notifier->gpusvm);
> +}
> +
> +/**
> + * __drm_gpusvm_range_remove - Remove GPU SVM range
> + * @notifier__: Pointer to the GPU SVM notifier structure
> + * @range__: Pointer to the GPU SVM range structure
> + *
> + * This macro removes the GPU SVM range from the notifier RB tree
> and list.
> + */
> +#define __drm_gpusvm_range_remove(notifier__, range__)		\
> +	do {							\
> +		range_remove((range__), &(notifier__)->root);	\
> +		list_del(&(range__)->rb.entry);			\
> +	} while (0)
> +
> +/**
> + * drm_gpusvm_range_alloc - Allocate GPU SVM range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: Pointer to the GPU SVM notifier structure
> + * @fault_addr: Fault address
> + * @chunk_size: Chunk size
> + * @migrate_vram: Flag indicating whether to migrate VRAM
> + *
> + * This function allocates and initializes the GPU SVM range
> structure.
> + *
> + * Returns:
> + * Pointer to the allocated GPU SVM range on success, ERR_PTR() on
> failure.
> + */
> +static struct drm_gpusvm_range *
> +drm_gpusvm_range_alloc(struct drm_gpusvm *gpusvm,
> +		       struct drm_gpusvm_notifier *notifier,
> +		       u64 fault_addr, u64 chunk_size, bool
> migrate_vram)
> +{
> +	struct drm_gpusvm_range *range;
> +
> +	if (gpusvm->ops->range_alloc)
> +		range = gpusvm->ops->range_alloc(gpusvm);
> +	else
> +		range = kzalloc(sizeof(*range), GFP_KERNEL);
> +
> +	if (!range)
> +		return ERR_PTR(-ENOMEM);
> +
> +	kref_init(&range->refcount);
> +	range->gpusvm = gpusvm;
> +	range->notifier = notifier;
> +	range->va.start = ALIGN_DOWN(fault_addr, chunk_size);
> +	range->va.end = ALIGN(fault_addr + 1, chunk_size);
> +	INIT_LIST_HEAD(&range->rb.entry);
> +	range->notifier_seq = LONG_MAX;
> +	range->flags.migrate_vram = migrate_vram ? 1 : 0;
> +
> +	return range;
> +}
> +
> +/**
> + * drm_gpusvm_check_pages - Check pages
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: Pointer to the GPU SVM notifier structure
> + * @start: Start address
> + * @end: End address
> + *
> + * Check if pages between start and end have been faulted in on the
> CPU. Used to
> + * prevent migration of pages without a CPU backing store.
> + *
> + * Returns:
> + * True if pages have been faulted into CPU, False otherwise
> + */
> +static bool drm_gpusvm_check_pages(struct drm_gpusvm *gpusvm,
> +				   struct drm_gpusvm_notifier
> *notifier,
> +				   u64 start, u64 end)
> +{
> +	struct hmm_range hmm_range = {
> +		.default_flags = 0,
> +		.notifier = &notifier->notifier,
> +		.start = start,
> +		.end = end,
> +		.dev_private_owner = gpusvm-
> >device_private_page_owner,
> +	};
> +	unsigned long timeout =
> +		jiffies +
> msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> +	unsigned long *pfns;
> +	unsigned long npages = npages_in_range(start, end);
> +	int err, i;
> +
> +	mmap_assert_locked(gpusvm->mm);
> +
> +	pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
> +	if (!pfns)
> +		return false;
> +
> +	hmm_range.notifier_seq = mmu_interval_read_begin(&notifier-
> >notifier);
> +	hmm_range.hmm_pfns = pfns;
> +
> +	while (true) {
> +		err = hmm_range_fault(&hmm_range);
> +		if (err == -EBUSY) {
> +			if (time_after(jiffies, timeout))
> +				break;
> +
> +			hmm_range.notifier_seq =
> mmu_interval_read_begin(&notifier->notifier);
> +			continue;
> +		}
> +		break;
> +	}
> +	if (err)
> +		goto err_free;
> +
> +	for (i = 0; i < npages; ++i) {
> +		if (!(pfns[i] & HMM_PFN_VALID)) {
> +			err = -EFAULT;
> +			goto err_free;
> +		}
> +	}
> +
> +err_free:
> +	kvfree(pfns);
> +	return err ? false : true;
> +}
> +
> +/**
> + * drm_gpusvm_range_chunk_size - Determine chunk size for GPU SVM
> range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: Pointer to the GPU SVM notifier structure
> + * @vas: Pointer to the virtual memory area structure
> + * @fault_addr: Fault address
> + * @gpuva_start: Start address of GPUVA which mirrors CPU
> + * @gpuva_end: End address of GPUVA which mirrors CPU
> + * @check_pages: Flag indicating whether to check pages
> + *
> + * This function determines the chunk size for the GPU SVM range
> based on the
> + * fault address, GPU SVM chunk sizes, existing GPU SVM ranges, and
> the virtual
> + * memory area boundaries.
> + *
> + * Returns:
> + * Chunk size on success, LONG_MAX on failure.
> + */
> +static u64 drm_gpusvm_range_chunk_size(struct drm_gpusvm *gpusvm,
> +				       struct drm_gpusvm_notifier
> *notifier,
> +				       struct vm_area_struct *vas,
> +				       u64 fault_addr, u64
> gpuva_start,
> +				       u64 gpuva_end, bool
> check_pages)
> +{
> +	u64 start, end;
> +	int i = 0;
> +
> +retry:
> +	for (; i < gpusvm->num_chunks; ++i) {
> +		start = ALIGN_DOWN(fault_addr, gpusvm-
> >chunk_sizes[i]);
> +		end = ALIGN(fault_addr + 1, gpusvm->chunk_sizes[i]);
> +
> +		if (start >= vas->vm_start && end <= vas->vm_end &&
> +		    start >= notifier->interval.start &&
> +		    end <= notifier->interval.end &&
> +		    start >= gpuva_start && end <= gpuva_end)
> +			break;
> +	}
> +
> +	if (i == gpusvm->num_chunks)
> +		return LONG_MAX;
> +
> +	/*
> +	 * If the allocation is more than a page, ensure it does not
> +	 * overlap with existing ranges.
> +	 */
> +	if (end - start != SZ_4K) {
> +		struct drm_gpusvm_range *range;
> +
> +		range = drm_gpusvm_range_find(notifier, start, end);
> +		if (range) {
> +			++i;
> +			goto retry;
> +		}
> +
> +		/*
> +		 * XXX: Only create range on pages CPU has faulted in.
> +		 * Without this check, or prefault, on BMG
> +		 * 'xe_exec_system_allocator --r process-many-malloc'
> +		 * fails. In the failure case, each process mallocs 16k
> +		 * but the CPU VMA is ~128k which results in 64k SVM
> +		 * ranges. When migrating the SVM ranges, some processes
> +		 * fail in drm_gpusvm_migrate_to_vram with
> +		 * 'migrate.cpages != npages' and then upon
> +		 * drm_gpusvm_range_get_pages device pages from other
> +		 * processes are collected + faulted in, which creates
> +		 * all sorts of problems. Unsure exactly how this is
> +		 * happening; the problem also goes away if
> +		 * 'xe_exec_system_allocator --r process-many-malloc'
> +		 * mallocs at least 64k at a time.
> +		 */
> +		if (check_pages &&
> +		    !drm_gpusvm_check_pages(gpusvm, notifier, start,
> end)) {
> +			++i;
> +			goto retry;
> +		}
> +	}
> +
> +	return end - start;
> +}
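
As a worked example of the selection above: with chunk_sizes =
{ SZ_2M, SZ_64K, SZ_4K }, a fault at 0x201000 and a VMA covering
[0x200000, 0x240000), the 2M candidate [0x200000, 0x400000) overshoots
the VMA, so the 64K candidate [0x200000, 0x210000) is chosen, assuming
the notifier interval and GPUVA bounds also cover it and no existing
range overlaps.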
> +
> +/**
> + * drm_gpusvm_range_find_or_insert - Find or insert GPU SVM range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @fault_addr: Fault address
> + * @gpuva_start: Start address of GPUVA which mirrors CPU
> + * @gpuva_end: End address of GPUVA which mirrors CPU
> + * @ctx: GPU SVM context
> + *
> + * This function finds or inserts a newly allocated GPU SVM range
> based on the
> + * fault address. Caller must hold a lock to protect range lookup
> and insertion.
> + *
> + * Returns:
> + * Pointer to the GPU SVM range on success, ERR_PTR() on failure.
> + */
> +struct drm_gpusvm_range *
> +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64
> fault_addr,
> +				u64 gpuva_start, u64 gpuva_end,
> +				const struct drm_gpusvm_ctx *ctx)
> +{
> +	struct drm_gpusvm_notifier *notifier;
> +	struct drm_gpusvm_range *range;
> +	struct mm_struct *mm = gpusvm->mm;
> +	struct vm_area_struct *vas;
> +	bool notifier_alloc = false;
> +	u64 chunk_size;
> +	int err;
> +	bool migrate_vram;
> +
> +	if (fault_addr < gpusvm->mm_start ||
> +	    fault_addr > gpusvm->mm_start + gpusvm->mm_range) {
> +		err = -EINVAL;
> +		goto err_out;
> +	}
> +
> +	if (!ctx->mmap_locked) {
> +		if (!mmget_not_zero(mm)) {
> +			err = -EFAULT;
> +			goto err_out;
> +		}
> +		mmap_write_lock(mm);
> +	}
> +
> +	mmap_assert_write_locked(mm);
> +
> +	notifier = drm_gpusvm_notifier_find(gpusvm, fault_addr);
> +	if (!notifier) {
> +		notifier = drm_gpusvm_notifier_alloc(gpusvm,
> fault_addr);
> +		if (IS_ERR(notifier)) {
> +			err = PTR_ERR(notifier);
> +			goto err_mmunlock;
> +		}
> +		notifier_alloc = true;
> +		err = mmu_interval_notifier_insert_locked(&notifier-
> >notifier,
> +							  mm,
> notifier->interval.start,
> +							  notifier-
> >interval.end -
> +							  notifier-
> >interval.start,
> +							 
> &drm_gpusvm_notifier_ops);
> +		if (err)
> +			goto err_notifier;
> +	}
> +
> +	vas = vma_lookup(mm, fault_addr);
> +	if (!vas) {
> +		err = -ENOENT;
> +		goto err_notifier_remove;
> +	}
> +
> +	if (!ctx->read_only && !(vas->vm_flags & VM_WRITE)) {
> +		err = -EPERM;
> +		goto err_notifier_remove;
> +	}
> +
> +	range = drm_gpusvm_range_find(notifier, fault_addr,
> fault_addr + 1);
> +	if (range)
> +		goto out_mmunlock;
> +	/*
> +	 * XXX: Short-circuiting migration based on migrate_vma_*
> current
> +	 * limitations. If/when migrate_vma_* add more support, this
> logic will
> +	 * have to change.
> +	 */
> +	migrate_vram = ctx->vram_possible &&
> +		vma_is_anonymous(vas) && !is_vm_hugetlb_page(vas);
> +
> +	chunk_size = drm_gpusvm_range_chunk_size(gpusvm, notifier,
> vas,
> +						 fault_addr,
> gpuva_start,
> +						 gpuva_end,
> migrate_vram &&
> +						 !ctx->prefault);
> +	if (chunk_size == LONG_MAX) {
> +		err = -EINVAL;
> +		goto err_notifier_remove;
> +	}
> +
> +	range = drm_gpusvm_range_alloc(gpusvm, notifier, fault_addr,
> chunk_size,
> +				       migrate_vram);
> +	if (IS_ERR(range)) {
> +		err = PTR_ERR(range);
> +		goto err_notifier_remove;
> +	}
> +
> +	drm_gpusvm_range_insert(notifier, range);
> +	if (notifier_alloc)
> +		drm_gpusvm_notifier_insert(gpusvm, notifier);
> +
> +	if (ctx->prefault) {
> +		struct drm_gpusvm_ctx __ctx = *ctx;
> +
> +		__ctx.mmap_locked = true;
> +		err = drm_gpusvm_range_get_pages(gpusvm, range,
> &__ctx);
> +		if (err)
> +			goto err_range_remove;
> +	}
> +
> +out_mmunlock:
> +	if (!ctx->mmap_locked) {
> +		mmap_write_unlock(mm);
> +		mmput(mm);
> +	}
> +
> +	return range;
> +
> +err_range_remove:
> +	__drm_gpusvm_range_remove(notifier, range);
> +err_notifier_remove:
> +	if (notifier_alloc)
> +		mmu_interval_notifier_remove(&notifier->notifier);
> +err_notifier:
> +	if (notifier_alloc)
> +		drm_gpusvm_notifier_free(gpusvm, notifier);
> +err_mmunlock:
> +	if (!ctx->mmap_locked) {
> +		mmap_write_unlock(mm);
> +		mmput(mm);
> +	}
> +err_out:
> +	return ERR_PTR(err);
> +}
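
It could also help to spell out the expected GPU fault handler flow in
the kerneldoc. Presumably it is roughly the following; untested sketch,
the foo_* names, vm->vram and foo_bind_range() are made up:

static int foo_gpu_fault(struct foo_vm *vm, u64 fault_addr,
			 u64 gpuva_start, u64 gpuva_end)
{
	struct drm_gpusvm_ctx ctx = { .vram_possible = true, };
	struct drm_gpusvm_range *range;
	int err;

	range = drm_gpusvm_range_find_or_insert(&vm->svm, fault_addr,
						gpuva_start, gpuva_end,
						&ctx);
	if (IS_ERR(range))
		return PTR_ERR(range);

	if (range->flags.migrate_vram)
		/* Best effort; fall back to SRAM pages on failure */
		drm_gpusvm_migrate_to_vram(&vm->svm, range, vm->vram,
					   &ctx);

	err = drm_gpusvm_range_get_pages(&vm->svm, range, &ctx);
	if (err)
		return err;

	return foo_bind_range(vm, range);	/* driver GPU bind */
}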
> +
> +/**
> + * for_each_dma_page - iterate over pages in a DMA region
> + * @i__: the current page index in the iteration
> + * @j__: the current page index, log order, in the iteration
> + * @npages__: the total number of pages in the DMA region
> + * @order__: the order of the pages in the DMA region
> + *
> + * This macro iterates over each page in a DMA region. The DMA
> region
> + * is assumed to be composed of 2^@order__ pages, and the macro will
> + * step through the region one block of 2^@order__ pages at a time.
> + */
> +#define for_each_dma_page(i__, j__, npages__, order__)	\
> +	for ((i__) = 0, (j__) = 0; (i__) < (npages__);	\
> +	     (j__)++, (i__) += 0x1 << (order__))
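
For example, with npages = 32 and order = 4 (64K blocks), the loop
visits (i__, j__) = (0, 0) and (16, 1), i.e. one array slot per 64K
block.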
> +
> +/**
> + * __drm_gpusvm_range_unmap_pages - Unmap pages associated with a
> GPU SVM range (internal)
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + *
> + * This function unmaps pages associated with a GPU SVM range.
> Assumes and
> + * asserts correct locking is in place when called.
> + */
> +static void __drm_gpusvm_range_unmap_pages(struct drm_gpusvm
> *gpusvm,
> +					   struct drm_gpusvm_range
> *range)
> +{
> +	lockdep_assert_held(&gpusvm->notifier_lock);
> +
> +	if (range->pages) {
> +		unsigned long i, j, npages = npages_in_range(range-
> >va.start,
> +							     range-
> >va.end);
> +
> +		if (range->flags.has_dma_mapping) {
> +			for_each_dma_page(i, j, npages, range-
> >order)
> +				dma_unmap_page(gpusvm->drm->dev,
> +					       range->dma_addr[j],
> +					       PAGE_SIZE << range-
> >order,
> +					       DMA_BIDIRECTIONAL);
> +		}
> +
> +		range->flags.has_vram_pages = false;
> +		range->flags.has_dma_mapping = false;
> +	}
> +}
> +
> +/**
> + * drm_gpusvm_range_free_pages - Free pages associated with a GPU
> SVM range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + *
> + * This function frees pages associated with a GPU SVM range.
> + */
> +static void drm_gpusvm_range_free_pages(struct drm_gpusvm *gpusvm,
> +					struct drm_gpusvm_range
> *range)
> +{
> +	lockdep_assert_held(&gpusvm->notifier_lock);
> +
> +	if (range->pages) {
> +		if (range->flags.kfree_mapping) {
> +			kfree(range->dma_addr);
> +			range->flags.kfree_mapping = false;
> +			range->pages = NULL;
> +		} else {
> +			kvfree(range->pages);
> +			range->pages = NULL;
> +		}
> +	}
> +}
> +
> +/**
> + * drm_gpusvm_range_remove - Remove GPU SVM range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range to be removed
> + *
> + * This function removes the specified GPU SVM range and also
> removes the parent
> + * GPU SVM notifier if no more ranges remain in the notifier. The
> caller must
> + * hold a lock to protect range and notifier removal.
> + */
> +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> +			     struct drm_gpusvm_range *range)
> +{
> +	struct drm_gpusvm_notifier *notifier;
> +
> +	notifier = drm_gpusvm_notifier_find(gpusvm, range-
> >va.start);
> +	if (WARN_ON_ONCE(!notifier))
> +		return;
> +
> +	drm_gpusvm_notifier_lock(gpusvm);
> +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> +	drm_gpusvm_range_free_pages(gpusvm, range);
> +	__drm_gpusvm_range_remove(notifier, range);
> +	drm_gpusvm_notifier_unlock(gpusvm);
> +
> +	drm_gpusvm_range_put(range);
> +
> +	if (RB_EMPTY_ROOT(&notifier->root.rb_root)) {
> +		if (!notifier->flags.removed)
> +			mmu_interval_notifier_remove(&notifier-
> >notifier);
> +		drm_gpusvm_notifier_remove(gpusvm, notifier);
> +		drm_gpusvm_notifier_free(gpusvm, notifier);
> +	}
> +}
> +
> +/**
> + * drm_gpusvm_range_get - Get a reference to GPU SVM range
> + * @range: Pointer to the GPU SVM range
> + *
> + * This function increments the reference count of the specified GPU
> SVM range.
> + *
> + * Returns:
> + * Pointer to the GPU SVM range.
> + */
> +struct drm_gpusvm_range *
> +drm_gpusvm_range_get(struct drm_gpusvm_range *range)
> +{
> +	kref_get(&range->refcount);
> +
> +	return range;
> +}
> +
> +/**
> + * drm_gpusvm_range_destroy - Destroy GPU SVM range
> + * @refcount: Pointer to the reference counter embedded in the GPU
> SVM range
> + *
> + * This function destroys the specified GPU SVM range when its
> reference count
> + * reaches zero. If a custom range-free function is provided, it is
> invoked to
> + * free the range; otherwise, the range is deallocated using
> kfree().
> + */
> +static void drm_gpusvm_range_destroy(struct kref *refcount)
> +{
> +	struct drm_gpusvm_range *range =
> +		container_of(refcount, struct drm_gpusvm_range,
> refcount);
> +	struct drm_gpusvm *gpusvm = range->gpusvm;
> +
> +	if (gpusvm->ops->range_free)
> +		gpusvm->ops->range_free(range);
> +	else
> +		kfree(range);
> +}
> +
> +/**
> + * drm_gpusvm_range_put - Put a reference to GPU SVM range
> + * @range: Pointer to the GPU SVM range
> + *
> + * This function decrements the reference count of the specified GPU
> SVM range
> + * and frees it when the count reaches zero.
> + */
> +void drm_gpusvm_range_put(struct drm_gpusvm_range *range)
> +{
> +	kref_put(&range->refcount, drm_gpusvm_range_destroy);
> +}
> +
> +/**
> + * drm_gpusvm_range_pages_valid - GPU SVM range pages valid
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + *
> + * This function determines if a GPU SVM range's pages are valid.
> + * Expected to be called holding gpusvm->notifier_lock and as the
> + * last step before committing a GPU binding.
> + *
> + * Returns:
> + * True if GPU SVM range has valid pages, False otherwise
> + */
> +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> +				  struct drm_gpusvm_range *range)
> +{
> +	lockdep_assert_held(&gpusvm->notifier_lock);
> +
> +	return range->flags.has_vram_pages || range-
> >flags.has_dma_mapping;
> +}
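
Might be worth showing the intended bind-commit pattern here as well;
roughly (sketch only, the page-table commit itself is driver specific):

	drm_gpusvm_notifier_lock(gpusvm);
	if (!drm_gpusvm_range_pages_valid(gpusvm, range)) {
		drm_gpusvm_notifier_unlock(gpusvm);
		return -EAGAIN;	/* redo drm_gpusvm_range_get_pages() */
	}
	/* ... commit the GPU page-table update for range->va ... */
	drm_gpusvm_notifier_unlock(gpusvm);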
> +
> +/**
> + * drm_gpusvm_range_pages_valid_unlocked - GPU SVM range pages valid
> unlocked
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + *
> + * This function determines if a GPU SVM range's pages are valid.
> + * Expected to be called without holding gpusvm->notifier_lock.
> + *
> + * Returns:
> + * True if GPU SVM range has valid pages, False otherwise
> + */
> +static bool
> +drm_gpusvm_range_pages_valid_unlocked(struct drm_gpusvm *gpusvm,
> +				      struct drm_gpusvm_range
> *range)
> +{
> +	bool pages_valid;
> +
> +	if (!range->pages)
> +		return false;
> +
> +	drm_gpusvm_notifier_lock(gpusvm);
> +	pages_valid = drm_gpusvm_range_pages_valid(gpusvm, range);
> +	if (!pages_valid && range->flags.kfree_mapping) {
> +		kfree(range->dma_addr);
> +		range->flags.kfree_mapping = false;
> +		range->pages = NULL;
> +	}
> +	drm_gpusvm_notifier_unlock(gpusvm);
> +
> +	return pages_valid;
> +}
> +
> +/**
> + * drm_gpusvm_range_get_pages - Get pages for a GPU SVM range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + * @ctx: GPU SVM context
> + *
> + * This function gets pages for a GPU SVM range and ensures they are
> mapped for
> + * DMA access.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> +			       struct drm_gpusvm_range *range,
> +			       const struct drm_gpusvm_ctx *ctx)
> +{

Is it possible to split this function up to make it neater?
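
For example, the hmm_range_fault() retry loop is nearly identical to
the one in drm_gpusvm_check_pages() and could become a small helper,
something like the untested sketch below (drm_gpusvm_range_get_pages()
would still need its mmap locking and flags.unmapped handling around
the hmm_range_fault() call, so this only covers the check_pages()
shape):

static int drm_gpusvm_hmm_range_fault(struct mmu_interval_notifier *notifier,
				      struct hmm_range *hmm_range)
{
	unsigned long timeout =
		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
	int err;

	while (true) {
		err = hmm_range_fault(hmm_range);
		if (err != -EBUSY)
			return err;

		if (time_after(jiffies, timeout))
			return -EBUSY;

		hmm_range->notifier_seq = mmu_interval_read_begin(notifier);
	}
}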


> +	struct mmu_interval_notifier *notifier = &range->notifier-
> >notifier;
> +	struct hmm_range hmm_range = {
> +		.default_flags = HMM_PFN_REQ_FAULT | (ctx->read_only
> ? 0 :
> +			HMM_PFN_REQ_WRITE),
> +		.notifier = notifier,
> +		.start = range->va.start,
> +		.end = range->va.end,
> +		.dev_private_owner = gpusvm-
> >device_private_page_owner,
> +	};
> +	struct mm_struct *mm = gpusvm->mm;
> +	unsigned long timeout =
> +		jiffies +
> msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> +	unsigned long i, j;
> +	unsigned long npages = npages_in_range(range->va.start,
> range->va.end);
> +	unsigned int order = 0;
> +	unsigned long *pfns;
> +	struct page **pages;
> +	int err = 0;
> +	bool vram_pages = !!range->flags.migrate_vram;
> +	bool alloc_pfns = false, kfree_mapping;
> +
> +retry:
> +	kfree_mapping = false;
> +	hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> +	if (drm_gpusvm_range_pages_valid_unlocked(gpusvm, range))
> +		return 0;
> +
> +	if (range->notifier_seq == hmm_range.notifier_seq && range-
> >pages) {
> +		if (ctx->prefault)
> +			return 0;
> +
> +		pfns = (unsigned long *)range->pages;
> +		pages = range->pages;
> +		goto map_pages;
> +	}
> +
> +	if (!range->pages) {
> +		pfns = kvmalloc_array(npages, sizeof(*pfns),
> GFP_KERNEL);
> +		if (!pfns)
> +			return -ENOMEM;
> +		alloc_pfns = true;
> +	} else {
> +		pfns = (unsigned long *)range->pages;
> +	}
> +
> +	if (!ctx->mmap_locked) {
> +		if (!mmget_not_zero(mm)) {
> +			err = -EFAULT;
> +			goto err_out;
> +		}
> +	}
> +
> +	hmm_range.hmm_pfns = pfns;
> +	while (true) {
> +		/* Must be checked after mmu_interval_read_begin */
> +		if (range->flags.unmapped) {
> +			err = -EFAULT;
> +			break;
> +		}
> +
> +		if (!ctx->mmap_locked) {
> +			/*
> +			 * XXX: HMM locking document indicates only
> a read-lock
> +			 * is required but there appears to be a
> window between
> +			 * the MMU_NOTIFY_MIGRATE event triggered in
> a CPU fault
> +			 * via migrate_vma_setup and the pages
> actually moving
> +			 * in migrate_vma_finalize in which this
> code can grab
> +			 * garbage pages. Grabbing the write-lock if
> the range
> +			 * is attached to vram appears to protect
> against this
> +			 * race.
> +			 */
> +			if (vram_pages)
> +				mmap_write_lock(mm);
> +			else
> +				mmap_read_lock(mm);
> +		}
> +		err = hmm_range_fault(&hmm_range);
> +		if (!ctx->mmap_locked) {
> +			if (vram_pages)
> +				mmap_write_unlock(mm);
> +			else
> +				mmap_read_unlock(mm);
> +		}
> +
> +		if (err == -EBUSY) {
> +			if (time_after(jiffies, timeout))
> +				break;
> +
> +			hmm_range.notifier_seq =
> mmu_interval_read_begin(notifier);
> +			continue;
> +		}
> +		break;
> +	}
> +	if (!ctx->mmap_locked)
> +		mmput(mm);
> +	if (err)
> +		goto err_free;
> +
> +	pages = (struct page **)pfns;
> +
> +	if (ctx->prefault) {
> +		range->pages = pages;
> +		goto set_seqno;
> +	}
> +
> +map_pages:
> +	if (is_device_private_page(hmm_pfn_to_page(pfns[0]))) {
> +		WARN_ON_ONCE(!range->vram_allocation);
> +
> +		for (i = 0; i < npages; ++i) {
> +			pages[i] = hmm_pfn_to_page(pfns[i]);
> +
> +			if
> (WARN_ON_ONCE(!is_device_private_page(pages[i]))) {
> +				err = -EOPNOTSUPP;
> +				goto err_free;
> +			}
> +		}
> +
> +		/* Do not race with notifier unmapping pages */
> +		drm_gpusvm_notifier_lock(gpusvm);
> +		range->flags.has_vram_pages = true;
> +		range->pages = pages;
> +		if (mmu_interval_read_retry(notifier,
> hmm_range.notifier_seq)) {
> +			err = -EAGAIN;
> +			__drm_gpusvm_range_unmap_pages(gpusvm,
> range);
> +		}
> +		drm_gpusvm_notifier_unlock(gpusvm);
> +	} else {
> +		dma_addr_t *dma_addr = (dma_addr_t *)pfns;
> +
> +		for_each_dma_page(i, j, npages, order) {

Here it looks like you're assuming that all pages are the same order?
With THP that's definitely not the case (unless hmm somehow thinks
they are 4K pages). This probably works because we only end up here in
the HugeTLB case where all pages are forced to the same order.
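
Making that assumption explicit before any DMA mappings are created
might help, e.g. something like the untested sketch below (name made
up) run on the pfns array ahead of the for_each_dma_page() loop, so
there is nothing to unwind when a mixed-order range shows up:

static bool drm_gpusvm_uniform_map_order(const unsigned long *pfns,
					 unsigned long npages)
{
	unsigned int order = hmm_pfn_to_map_order(pfns[0]);
	unsigned long i;

	/* Walk the range in blocks of the first page's map order */
	for (i = 0; i < npages; i += 0x1 << order)
		if (hmm_pfn_to_map_order(pfns[i]) != order)
			return false;

	return true;
}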

> +			if (WARN_ON_ONCE(i && order !=
> +					
> hmm_pfn_to_map_order(pfns[i]))) {
> +				err = -EOPNOTSUPP;
> +				npages = i;
> +				goto err_unmap;
> +			}
> +			order = hmm_pfn_to_map_order(pfns[i]);
> +
> +			pages[j] = hmm_pfn_to_page(pfns[i]);
> +			if
> (WARN_ON_ONCE(is_zone_device_page(pages[j]))) {
> +				err = -EOPNOTSUPP;
> +				npages = i;
> +				goto err_unmap;
> +			}
> +
> +			set_page_dirty_lock(pages[j]);
> +			mark_page_accessed(pages[j]);
> +
> +			dma_addr[j] = dma_map_page(gpusvm->drm->dev,
> +						   pages[j], 0,
> +						   PAGE_SIZE <<
> order,
> +						  
> DMA_BIDIRECTIONAL);
> +			if (dma_mapping_error(gpusvm->drm->dev,
> dma_addr[j])) {
> +				err = -EFAULT;
> +				npages = i;
> +				goto err_unmap;
> +			}
> +		}
> +
> +		/* Huge pages, reduce memory footprint */
> +		if (order) {
> +			dma_addr = kmalloc_array(j,
> sizeof(*dma_addr),
> +						 GFP_KERNEL);
> +			if (dma_addr) {
> +				for (i = 0; i < j; ++i)
> +					dma_addr[i] =
> (dma_addr_t)pfns[i];
> +				kvfree(pfns);
> +				kfree_mapping = true;
> +			} else {
> +				dma_addr = (dma_addr_t *)pfns;
> +			}
> +		}
> +
> +		/* Do not race with notifier unmapping pages */
> +		drm_gpusvm_notifier_lock(gpusvm);
> +		range->order = order;
> +		range->flags.kfree_mapping = kfree_mapping;
> +		range->flags.has_dma_mapping = true;
> +		range->dma_addr = dma_addr;
> +		range->vram_allocation = NULL;
> +		if (mmu_interval_read_retry(notifier,
> hmm_range.notifier_seq)) {
> +			err = -EAGAIN;
> +			__drm_gpusvm_range_unmap_pages(gpusvm,
> range);
> +		}
> +		drm_gpusvm_notifier_unlock(gpusvm);
> +	}
> +
> +	if (err == -EAGAIN)
> +		goto retry;
> +set_seqno:
> +	range->notifier_seq = hmm_range.notifier_seq;
> +
> +	return 0;
> +
> +err_unmap:
> +	for_each_dma_page(i, j, npages, order)
> +		dma_unmap_page(gpusvm->drm->dev,
> +			       (dma_addr_t)pfns[j],
> +			       PAGE_SIZE << order,
> DMA_BIDIRECTIONAL);
> +err_free:
> +	if (alloc_pfns)
> +		kvfree(pfns);
> +err_out:
> +	return err;
> +}
> +
> +/**
> + * drm_gpusvm_range_unmap_pages - Unmap pages associated with a GPU
> SVM range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + * @ctx: GPU SVM context
> + *
> + * This function unmaps pages associated with a GPU SVM range. If
> @in_notifier
> + * is set, it is assumed that gpusvm->notifier_lock is held in write
> mode; if it
> + * is clear, it acquires gpusvm->notifier_lock in read mode. Must be
> + * called on each GPU SVM range attached to the notifier in
> + * gpusvm->ops->invalidate to satisfy the IOMMU security model.
> + */
> +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> +				  struct drm_gpusvm_range *range,
> +				  const struct drm_gpusvm_ctx *ctx)
> +{
> +	if (ctx->in_notifier)
> +		lockdep_assert_held_write(&gpusvm->notifier_lock);
> +	else
> +		drm_gpusvm_notifier_lock(gpusvm);
> +
> +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> +
> +	if (!ctx->in_notifier)
> +		drm_gpusvm_notifier_unlock(gpusvm);
> +}
> +
> +/**
> + * drm_gpusvm_migration_put_page - Put a migration page
> + * @page: Pointer to the page to put
> + *
> + * This function unlocks and puts a page.
> + */
> +static void drm_gpusvm_migration_put_page(struct page *page)
> +{
> +	unlock_page(page);
> +	put_page(page);
> +}
> +
> +/**
> + * drm_gpusvm_migration_put_pages - Put migration pages
> + * @npages: Number of pages
> + * @migrate_pfn: Array of migrate page frame numbers
> + *
> + * This function puts an array of pages.
> + */
> +static void drm_gpusvm_migration_put_pages(unsigned long npages,
> +					   unsigned long
> *migrate_pfn)
> +{
> +	unsigned long i;
> +
> +	for (i = 0; i < npages; ++i) {
> +		if (!migrate_pfn[i])
> +			continue;
> +
> +		drm_gpusvm_migration_put_page(migrate_pfn_to_page(mi
> grate_pfn[i]));
> +		migrate_pfn[i] = 0;
> +	}
> +}
> +
> +/**
> + * drm_gpusvm_get_vram_page - Get a reference to a VRAM page
> + * @page: Pointer to the page
> + * @zdd: Pointer to the GPU SVM zone device data
> + *
> + * This function associates the given page with the specified GPU
> SVM zone
> + * device data and initializes it for zone device usage.
> + */
> +static void drm_gpusvm_get_vram_page(struct page *page,
> +				     struct drm_gpusvm_zdd *zdd)
> +{
> +	page->zone_device_data = drm_gpusvm_zdd_get(zdd);
> +	zone_device_page_init(page);
> +}
> +
> +/**
> + * drm_gpusvm_migrate_map_pages() - Map migration pages for GPU SVM
> migration
> + * @dev: The device for which the pages are being mapped
> + * @dma_addr: Array to store DMA addresses corresponding to mapped
> pages
> + * @migrate_pfn: Array of migrate page frame numbers to map
> + * @npages: Number of pages to map
> + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> + *
> + * This function maps pages of memory for migration usage in GPU
> SVM. It
> + * iterates over each page frame number provided in @migrate_pfn,
> maps the
> + * corresponding page, and stores the DMA address in the provided
> @dma_addr
> + * array.
> + *
> + * Return: 0 on success, -EFAULT if an error occurs during mapping.
> + */
> +static int drm_gpusvm_migrate_map_pages(struct device *dev,
> +					dma_addr_t *dma_addr,
> +					unsigned long *migrate_pfn,
> +					unsigned long npages,
> +					enum dma_data_direction dir)
> +{
> +	unsigned long i;
> +
> +	for (i = 0; i < npages; ++i) {
> +		struct page *page =
> migrate_pfn_to_page(migrate_pfn[i]);
> +
> +		if (!page)
> +			continue;
> +
> +		if (WARN_ON_ONCE(is_zone_device_page(page)))
> +			return -EFAULT;
> +
> +		dma_addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE,
> dir);
> +		if (dma_mapping_error(dev, dma_addr[i]))
> +			return -EFAULT;
> +	}
> +
> +	return 0;
> +}
> +
> +/**
> + * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously mapped
> for GPU SVM migration
> + * @dev: The device for which the pages were mapped
> + * @dma_addr: Array of DMA addresses corresponding to mapped pages
> + * @npages: Number of pages to unmap
> + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> + *
> + * This function unmaps previously mapped pages of memory for GPU
> Shared Virtual
> + * Memory (SVM). It iterates over each DMA address provided in
> @dma_addr, checks
> + * if it's valid and not already unmapped, and unmaps the
> corresponding page.
> + */
> +static void drm_gpusvm_migrate_unmap_pages(struct device *dev,
> +					   dma_addr_t *dma_addr,
> +					   unsigned long npages,
> +					   enum dma_data_direction
> dir)
> +{
> +	unsigned long i;
> +
> +	for (i = 0; i < npages; ++i) {
> +		if (!dma_addr[i] || dma_mapping_error(dev,
> dma_addr[i]))
> +			continue;
> +
> +		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
> +	}
> +}
> +
> +/**
> + * drm_gpusvm_migrate_to_vram - Migrate GPU SVM range to VRAM
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + * @vram_allocation: Driver-private pointer to the VRAM allocation.
> + *                   The caller should hold a reference to the VRAM
> + *                   allocation, which should be dropped via
> + *                   ops->vram_release or upon the failure of this
> + *                   function.
> + * @ctx: GPU SVM context
> + *
> + * This function migrates the specified GPU SVM range to VRAM. It
> performs the
> + * necessary setup and invokes the driver-specific operations for
> migration to
> + * VRAM. Upon successful return, @vram_allocation can safely
> + * reference @range until ops->vram_release is called, which only
> + * happens upon a successful return.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> +			       struct drm_gpusvm_range *range,
> +			       void *vram_allocation,
> +			       const struct drm_gpusvm_ctx *ctx)
> +{
> +	u64 start = range->va.start, end = range->va.end;
> +	struct migrate_vma migrate = {
> +		.start		= start,
> +		.end		= end,
> +		.pgmap_owner	= gpusvm->device_private_page_owner,
> +		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
> +	};
> +	struct mm_struct *mm = gpusvm->mm;
> +	unsigned long i, npages = npages_in_range(start, end);
> +	struct vm_area_struct *vas;
> +	struct drm_gpusvm_zdd *zdd = NULL;
> +	struct page **pages;
> +	dma_addr_t *dma_addr;
> +	void *buf;
> +	int err;
> +
> +	if (!range->flags.migrate_vram)
> +		return -EINVAL;
> +
> +	if (!gpusvm->ops->populate_vram_pfn || !gpusvm->ops-
> >copy_to_vram ||
> +	    !gpusvm->ops->copy_to_sram)
> +		return -EOPNOTSUPP;
> +
> +	if (!ctx->mmap_locked) {
> +		if (!mmget_not_zero(mm)) {
> +			err = -EFAULT;
> +			goto err_out;
> +		}
> +		mmap_write_lock(mm);
> +	}
> +
> +	mmap_assert_locked(mm);
> +
> +	vas = vma_lookup(mm, start);
> +	if (!vas) {
> +		err = -ENOENT;
> +		goto err_mmunlock;
> +	}
> +
> +	if (end > vas->vm_end || start < vas->vm_start) {
> +		err = -EINVAL;
> +		goto err_mmunlock;
> +	}
> +
> +	if (!vma_is_anonymous(vas)) {
> +		err = -EBUSY;
> +		goto err_mmunlock;
> +	}
> +
> +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) +
> sizeof(*dma_addr) +
> +		       sizeof(*pages), GFP_KERNEL);
> +	if (!buf) {
> +		err = -ENOMEM;
> +		goto err_mmunlock;
> +	}
> +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr))
> * npages;
> +
> +	zdd = drm_gpusvm_zdd_alloc(range);
> +	if (!zdd) {
> +		err = -ENOMEM;
> +		goto err_free;
> +	}
> +
> +	migrate.vma = vas;
> +	migrate.src = buf;
> +	migrate.dst = migrate.src + npages;
> +
> +	err = migrate_vma_setup(&migrate);
> +	if (err)
> +		goto err_free;
> +
> +	/*
> +	 * FIXME: The cases below, !migrate.cpages and
> +	 * migrate.cpages != npages, are not always an error. Need to
> +	 * revisit the possible cases and how to handle them. We could
> +	 * prefault on migrate.cpages != npages via hmm_range_fault.
> +	 */
> +
> +	if (!migrate.cpages) {
> +		err = -EFAULT;
> +		goto err_free;
> +	}
> +
> +	if (migrate.cpages != npages) {
> +		err = -EBUSY;
> +		goto err_finalize;
> +	}
> +
> +	err = gpusvm->ops->populate_vram_pfn(gpusvm,
> vram_allocation, npages,
> +					     migrate.dst);
> +	if (err)
> +		goto err_finalize;
> +
> +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> dma_addr,
> +					   migrate.src, npages,
> DMA_TO_DEVICE);
> +	if (err)
> +		goto err_finalize;
> +
> +	for (i = 0; i < npages; ++i) {
> +		struct page *page = pfn_to_page(migrate.dst[i]);
> +
> +		pages[i] = page;
> +		migrate.dst[i] = migrate_pfn(migrate.dst[i]);
> +		drm_gpusvm_get_vram_page(page, zdd);
> +	}
> +
> +	err = gpusvm->ops->copy_to_vram(gpusvm, pages, dma_addr,
> npages);
> +	if (err)
> +		goto err_finalize;
> +
> +	/* Upon success bind vram allocation to range and zdd */
> +	range->vram_allocation = vram_allocation;
> +	WRITE_ONCE(zdd->vram_allocation, vram_allocation);	/*
> Owns ref */
> +
> +err_finalize:
> +	if (err)
> +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> +	migrate_vma_pages(&migrate);
> +	migrate_vma_finalize(&migrate);
> +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr,
> npages,
> +				       DMA_TO_DEVICE);
> +err_free:
> +	if (zdd)
> +		drm_gpusvm_zdd_put(zdd);
> +	kvfree(buf);
> +err_mmunlock:
> +	if (!ctx->mmap_locked) {
> +		mmap_write_unlock(mm);
> +		mmput(mm);
> +	}
> +err_out:
> +	return err;
> +}
> +
> +/**
> + * drm_gpusvm_migrate_populate_sram_pfn - Populate SRAM PFNs for a
> VM area
> + * @vas: Pointer to the VM area structure, can be NULL
> + * @npages: Number of pages to populate
> + * @src_mpfn: Source array of migrate PFNs
> + * @mpfn: Array of migrate PFNs to populate
> + * @addr: Start address for PFN allocation
> + *
> + * This function populates the SRAM migrate page frame numbers
> (PFNs) for the
> + * specified VM area structure. It allocates and locks pages in the
> VM area for
> + * SRAM usage. If vas is non-NULL use alloc_page_vma for allocation,
> if NULL use
> + * alloc_page for allocation.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +static int drm_gpusvm_migrate_populate_sram_pfn(struct
> vm_area_struct *vas,
> +						unsigned long
> npages,
> +						unsigned long
> *src_mpfn,
> +						unsigned long *mpfn,
> u64 addr)
> +{
> +	unsigned long i;
> +
> +	for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
> +		struct page *page;
> +
> +		if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
> +			continue;
> +
> +		if (vas)
> +			page = alloc_page_vma(GFP_HIGHUSER, vas,
> addr);
> +		else
> +			page = alloc_page(GFP_HIGHUSER);
> +
> +		if (!page)
> +			return -ENOMEM;
> +
> +		lock_page(page);
> +		mpfn[i] = migrate_pfn(page_to_pfn(page));
> +	}
> +
> +	return 0;
> +}
> +
> +/**
> + * drm_gpusvm_evict_to_sram - Evict GPU SVM range to SRAM
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + *
> + * Similar to __drm_gpusvm_migrate_to_sram but does not require the
> + * mmap lock; migration is done via the migrate_device_* functions.
> + * This is a fallback path, as it is preferred to issue migrations
> + * with the mmap lock held.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +static int drm_gpusvm_evict_to_sram(struct drm_gpusvm *gpusvm,
> +				    struct drm_gpusvm_range *range)
> +{
> +	unsigned long npages;
> +	struct page **pages;
> +	unsigned long *src, *dst;
> +	dma_addr_t *dma_addr;
> +	void *buf;
> +	int i, err = 0;
> +
> +	npages = npages_in_range(range->va.start, range->va.end);
> +
> +	buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr)
> +
> +		       sizeof(*pages), GFP_KERNEL);
> +	if (!buf) {
> +		err = -ENOMEM;
> +		goto err_out;
> +	}
> +	src = buf;
> +	dst = buf + (sizeof(*src) * npages);
> +	dma_addr = buf + (2 * sizeof(*src) * npages);
> +	pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) *
> npages;
> +
> +	err = gpusvm->ops->populate_vram_pfn(gpusvm, range-
> >vram_allocation,
> +					     npages, src);
> +	if (err)
> +		goto err_free;
> +
> +	err = migrate_device_vma_range(gpusvm->mm,
> +				       gpusvm-
> >device_private_page_owner, src,
> +				       npages, range->va.start);
> +	if (err)
> +		goto err_free;
> +
> +	err = drm_gpusvm_migrate_populate_sram_pfn(NULL, npages,
> src, dst, 0);
> +	if (err)
> +		goto err_finalize;
> +
> +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> dma_addr,
> +					   dst, npages,
> DMA_BIDIRECTIONAL);
> +	if (err)
> +		goto err_finalize;
> +
> +	for (i = 0; i < npages; ++i)
> +		pages[i] = migrate_pfn_to_page(src[i]);
> +
> +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr,
> npages);
> +	if (err)
> +		goto err_finalize;
> +
> +err_finalize:
> +	if (err)
> +		drm_gpusvm_migration_put_pages(npages, dst);
> +	migrate_device_pages(src, dst, npages);
> +	migrate_device_finalize(src, dst, npages);
> +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr,
> npages,
> +				       DMA_BIDIRECTIONAL);
> +err_free:
> +	kvfree(buf);
> +err_out:
> +
> +	return err;
> +}
> +
> +/**
> + * __drm_gpusvm_migrate_to_sram - Migrate GPU SVM range to SRAM
> (internal)
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @vas: Pointer to the VM area structure
> + * @page: Pointer to the page for fault handling (can be NULL)
> + * @start: Start address of the migration range
> + * @end: End address of the migration range
> + *
> + * This internal function performs the migration of the specified
> GPU SVM range
> + * to SRAM. It sets up the migration, populates + dma maps SRAM
> PFNs, and
> + * invokes the driver-specific operations for migration to SRAM.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +static int __drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> +					struct vm_area_struct *vas,
> +					struct page *page,
> +					u64 start, u64 end)
> +{
> +	struct migrate_vma migrate = {
> +		.vma		= vas,
> +		.pgmap_owner	= gpusvm->device_private_page_owner,
> +		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
> +		.fault_page	= page,
> +	};
> +	unsigned long npages;
> +	struct page **pages;
> +	dma_addr_t *dma_addr;
> +	void *buf;
> +	int i, err = 0;
> +
> +	mmap_assert_locked(gpusvm->mm);
> +
> +	/* Corner case where the VM area struct has been partially unmapped */
> +	if (start < vas->vm_start)
> +		start = vas->vm_start;
> +	if (end > vas->vm_end)
> +		end = vas->vm_end;
> +
> +	migrate.start = start;
> +	migrate.end = end;
> +	npages = npages_in_range(start, end);
> +
> +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) +
> sizeof(*dma_addr) +
> +		       sizeof(*pages), GFP_KERNEL);
> +	if (!buf) {
> +		err = -ENOMEM;
> +		goto err_out;
> +	}
> +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr))
> * npages;
> +
> +	migrate.vma = vas;
> +	migrate.src = buf;
> +	migrate.dst = migrate.src + npages;
> +
> +	err = migrate_vma_setup(&migrate);
> +	if (err)
> +		goto err_free;
> +
> +	/* Raced with another CPU fault, nothing to do */
> +	if (!migrate.cpages)
> +		goto err_free;
> +
> +	err = drm_gpusvm_migrate_populate_sram_pfn(vas, npages,
> +						   migrate.src,
> migrate.dst,
> +						   start);
> +	if (err)
> +		goto err_finalize;
> +
> +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> dma_addr,
> +					   migrate.dst, npages,
> +					   DMA_BIDIRECTIONAL);
> +	if (err)
> +		goto err_finalize;
> +
> +	for (i = 0; i < npages; ++i)
> +		pages[i] = migrate_pfn_to_page(migrate.src[i]);
> +
> +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr,
> npages);
> +	if (err)
> +		goto err_finalize;
> +
> +err_finalize:
> +	if (err)
> +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> +	migrate_vma_pages(&migrate);
> +	migrate_vma_finalize(&migrate);
> +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr,
> npages,
> +				       DMA_BIDIRECTIONAL);
> +err_free:
> +	kvfree(buf);
> +err_out:
> +	mmap_assert_locked(gpusvm->mm);
> +
> +	return err;
> +}
> +
> +/**
> + * drm_gpusvm_migrate_to_sram - Migrate (evict) GPU SVM range to
> SRAM
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + * @ctx: GPU SVM context
> + *
> + * This function initiates the migration of the specified GPU SVM
> range to
> + * SRAM. It performs necessary checks and invokes the internal
> migration
> + * function for actual migration.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> +			       struct drm_gpusvm_range *range,
> +			       const struct drm_gpusvm_ctx *ctx)
> +{
> +	u64 start = range->va.start, end = range->va.end;
> +	struct mm_struct *mm = gpusvm->mm;
> +	struct vm_area_struct *vas;
> +	int err;
> +	bool retry = false;
> +
> +	if (!ctx->mmap_locked) {
> +		if (!mmget_not_zero(mm)) {
> +			err = -EFAULT;
> +			goto err_out;
> +		}
> +		if (ctx->trylock_mmap) {
> +			if (!mmap_read_trylock(mm))  {
> +				err =
> drm_gpusvm_evict_to_sram(gpusvm, range);
> +				goto err_mmput;
> +			}
> +		} else {
> +			mmap_read_lock(mm);
> +		}
> +	}
> +
> +	mmap_assert_locked(mm);
> +
> +	/*
> +	 * Loop required to find all VM area structs for the corner
> +	 * case when the VRAM backing has been partially unmapped from
> +	 * the MM's address space.
> +	 */
> +again:
> +	vas = find_vma(mm, start);
> +	if (!vas) {
> +		if (!retry)
> +			err = -ENOENT;
> +		goto err_mmunlock;
> +	}
> +
> +	if (end <= vas->vm_start || start >= vas->vm_end) {
> +		if (!retry)
> +			err = -EINVAL;
> +		goto err_mmunlock;
> +	}
> +
> +	err = __drm_gpusvm_migrate_to_sram(gpusvm, vas, NULL, start,
> end);
> +	if (err)
> +		goto err_mmunlock;
> +
> +	if (vas->vm_end < end) {
> +		retry = true;
> +		start = vas->vm_end;
> +		goto again;
> +	}
> +
> +	if (!ctx->mmap_locked) {
> +		mmap_read_unlock(mm);
> +		/*
> +		 * Using mmput_async as this function can be called
> while
> +		 * holding a dma-resv lock, and a final put can grab
> the mmap
> +		 * lock, causing a lock inversion.
> +		 */
> +		mmput_async(mm);
> +	}
> +
> +	return 0;
> +
> +err_mmunlock:
> +	if (!ctx->mmap_locked)
> +		mmap_read_unlock(mm);
> +err_mmput:
> +	if (!ctx->mmap_locked)
> +		mmput_async(mm);
> +err_out:
> +	return err;
> +}
> +
> +/**
> + * drm_gpusvm_page_free - Put GPU SVM zone device data associated
> with a page
> + * @page: Pointer to the page
> + *
> + * This function is a callback used to put the GPU SVM zone device
> data
> + * associated with a page when it is being released.
> + */
> +static void drm_gpusvm_page_free(struct page *page)
> +{
> +	drm_gpusvm_zdd_put(page->zone_device_data);
> +}
> +
> +/**
> + * drm_gpusvm_migrate_to_ram - Migrate GPU SVM range to RAM (page
> fault handler)
> + * @vmf: Pointer to the fault information structure
> + *
> + * This function is a page fault handler used to migrate a GPU SVM
> range to RAM.
> + * It retrieves the GPU SVM range information from the faulting page
> and invokes
> + * the internal migration function to migrate the range back to RAM.
> + *
> + * Returns:
> + * VM_FAULT_SIGBUS on failure, 0 on success.
> + */
> +static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault *vmf)
> +{
> +	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
> +	int err;
> +
> +	err = __drm_gpusvm_migrate_to_sram(zdd->range->gpusvm,
> +					   vmf->vma, vmf->page,
> +					   zdd->range->va.start,
> +					   zdd->range->va.end);
> +
> +	return err ? VM_FAULT_SIGBUS : 0;
> +}
> +
> +/**
> + * drm_gpusvm_pagemap_ops - Device page map operations for GPU SVM
> + */
> +static const struct dev_pagemap_ops drm_gpusvm_pagemap_ops = {
> +	.page_free = drm_gpusvm_page_free,
> +	.migrate_to_ram = drm_gpusvm_migrate_to_ram,
> +};
> +
> +/**
> + * drm_gpusvm_pagemap_ops_get - Retrieve GPU SVM device page map
> operations
> + *
> + * Returns:
> + * Pointer to the GPU SVM device page map operations structure.
> + */
> +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void)
> +{
> +	return &drm_gpusvm_pagemap_ops;
> +}
> +
> +/**
> + * drm_gpusvm_has_mapping - Check if GPU SVM has mapping for the
> given address range
> + * @gpusvm: Pointer to the GPU SVM structure.
> + * @start: Start address
> + * @end: End address
> + *
> + * Returns:
> + * True if GPU SVM has mapping, False otherwise
> + */
> +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start,
> u64 end)
> +{
> +	struct drm_gpusvm_notifier *notifier;
> +
> +	drm_gpusvm_for_each_notifier(notifier, gpusvm, start, end) {
> +		struct drm_gpusvm_range *range = NULL;
> +
> +		drm_gpusvm_for_each_range(range, notifier, start,
> end)
> +			return true;
> +	}
> +
> +	return false;
> +}
> diff --git a/drivers/gpu/drm/xe/drm_gpusvm.h
> b/drivers/gpu/drm/xe/drm_gpusvm.h
> new file mode 100644
> index 000000000000..0ea70f8534a8
> --- /dev/null
> +++ b/drivers/gpu/drm/xe/drm_gpusvm.h
> @@ -0,0 +1,415 @@
> +/* SPDX-License-Identifier: MIT */
> +/*
> + * Copyright © 2024 Intel Corporation
> + */
> +
> +#ifndef __DRM_GPUSVM_H__
> +#define __DRM_GPUSVM_H__
> +
> +#include <linux/kref.h>
> +#include <linux/mmu_notifier.h>
> +#include <linux/workqueue.h>
> +
> +struct dev_pagemap_ops;
> +struct drm_device;
> +struct drm_gpusvm;
> +struct drm_gpusvm_notifier;
> +struct drm_gpusvm_ops;
> +struct drm_gpusvm_range;
> +
> +/**
> + * struct drm_gpusvm_ops - Operations structure for GPU SVM
> + *
> + * This structure defines the operations for GPU Shared Virtual
> Memory (SVM).
> + * These operations are provided by the GPU driver to manage SVM
> ranges and
> + * perform operations such as migration between VRAM and system RAM.
> + */
> +struct drm_gpusvm_ops {
> +	/**
> +	 * @notifier_alloc: Allocate a GPU SVM notifier (optional)
> +	 *
> +	 * This function shall allocate a GPU SVM notifier.
> +	 *
> +	 * Returns:
> +	 * Pointer to the allocated GPU SVM notifier on success,
> NULL on failure.
> +	 */
> +	struct drm_gpusvm_notifier *(*notifier_alloc)(void);
> +
> +	/**
> +	 * @notifier_free: Free a GPU SVM notifier (optional)
> +	 * @notifier: Pointer to the GPU SVM notifier to be freed
> +	 *
> +	 * This function shall free a GPU SVM notifier.
> +	 */
> +	void (*notifier_free)(struct drm_gpusvm_notifier *notifier);
> +
> +	/**
> +	 * @range_alloc: Allocate a GPU SVM range (optional)
> +	 * @gpusvm: Pointer to the GPU SVM
> +	 *
> +	 * This function shall allocate a GPU SVM range.
> +	 *
> +	 * Returns:
> +	 * Pointer to the allocated GPU SVM range on success, NULL
> on failure.
> +	 */
> +	struct drm_gpusvm_range *(*range_alloc)(struct drm_gpusvm
> *gpusvm);
> +
> +	/**
> +	 * @range_free: Free a GPU SVM range (optional)
> +	 * @range: Pointer to the GPU SVM range to be freed
> +	 *
> +	 * This function shall free a GPU SVM range.
> +	 */
> +	void (*range_free)(struct drm_gpusvm_range *range);
> +
> +	/**
> +	 * @vram_release: Release VRAM allocation (optional)
> +	 * @vram_allocation: Driver-private pointer to the VRAM
> allocation
> +	 *
> +	 * This function shall release the VRAM allocation and is
> +	 * expected to drop a reference to the VRAM allocation.
> +	 */
> +	void (*vram_release)(void *vram_allocation);
> +
> +	/**
> +	 * @populate_vram_pfn: Populate VRAM PFN (required for
> migration)
> +	 * @gpusvm: Pointer to the GPU SVM
> +	 * @vram_allocation: Driver-private pointer to the VRAM
> allocation
> +	 * @npages: Number of pages to populate
> +	 * @pfn: Array of page frame numbers to populate
> +	 *
> +	 * This function shall populate VRAM page frame numbers
> (PFN).
> +	 *
> +	 * Returns:
> +	 * 0 on success, a negative error code on failure.
> +	 */
> +	int (*populate_vram_pfn)(struct drm_gpusvm *gpusvm,
> +				 void *vram_allocation,
> +				 unsigned long npages,
> +				 unsigned long *pfn);
> +
> +	/**
> +	 * @copy_to_vram: Copy to VRAM (required for migration)
> +	 * @gpusvm: Pointer to the GPU SVM
> +	 * @pages: Pointer to array of VRAM pages (destination)
> +	 * @dma_addr: Pointer to array of DMA addresses (source)
> +	 * @npages: Number of pages to copy
> +	 *
> +	 * This function shall copy pages to VRAM.
> +	 *
> +	 * Returns:
> +	 * 0 on success, a negative error code on failure.
> +	 */
> +	int (*copy_to_vram)(struct drm_gpusvm *gpusvm,
> +			    struct page **pages,
> +			    dma_addr_t *dma_addr,
> +			    unsigned long npages);
> +
> +	/**
> +	 * @copy_to_sram: Copy to system RAM (required for
> migration)
> +	 * @gpusvm: Pointer to the GPU SVM
> +	 * @pages: Pointer to array of VRAM pages (source)
> +	 * @dma_addr: Pointer to array of DMA addresses
> (destination)
> +	 * @npages: Number of pages to copy
> +	 *
> +	 * This function shall copy pages to system RAM.
> +	 *
> +	 * Returns:
> +	 * 0 on success, a negative error code on failure.
> +	 */
> +	int (*copy_to_sram)(struct drm_gpusvm *gpusvm,
> +			    struct page **pages,
> +			    dma_addr_t *dma_addr,
> +			    unsigned long npages);
> +
> +	/**
> +	 * @invalidate: Invalidate GPU SVM notifier (required)
> +	 * @gpusvm: Pointer to the GPU SVM
> +	 * @notifier: Pointer to the GPU SVM notifier
> +	 * @mmu_range: Pointer to the mmu_notifier_range structure
> +	 *
> +	 * This function shall invalidate the GPU page tables. It
> can safely
> +	 * walk the notifier range RB tree/list in this function.
> Called while
> +	 * holding the notifier lock.
> +	 */
> +	void (*invalidate)(struct drm_gpusvm *gpusvm,
> +			   struct drm_gpusvm_notifier *notifier,
> +			   const struct mmu_notifier_range
> *mmu_range);
> +};
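
A minimal example ops implementation in the kerneldoc would also help;
an SRAM-only user presumably needs little more than the following
untested sketch (the foo_* names and foo_zap_gpu_ptes() are made up):

static void foo_svm_invalidate(struct drm_gpusvm *gpusvm,
			       struct drm_gpusvm_notifier *notifier,
			       const struct mmu_notifier_range *mmu_range)
{
	struct drm_gpusvm_ctx ctx = { .in_notifier = true, };
	struct drm_gpusvm_range *range = NULL;

	drm_gpusvm_for_each_range(range, notifier, mmu_range->start,
				  mmu_range->end) {
		/* Driver specific: zap the GPU PTEs covering range->va */
		foo_zap_gpu_ptes(gpusvm, range);
		drm_gpusvm_range_unmap_pages(gpusvm, range, &ctx);
	}
}

static const struct drm_gpusvm_ops foo_gpusvm_ops = {
	.invalidate = foo_svm_invalidate,
};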
> +
> +/**
> + * struct drm_gpusvm_notifier - Structure representing a GPU SVM
> notifier
> + *
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: MMU interval notifier
> + * @interval: Interval for the notifier
> + * @rb: Red-black tree node for the parent GPU SVM structure
> notifier tree
> + * @root: Cached root node of the RB tree containing ranges
> + * @range_list: List head containing ranges in the same order they
> + *              appear in the interval tree. This is useful to keep
> + *              iterating over ranges while doing modifications to
> + *              the RB tree.
> + * @flags.removed: Flag indicating whether the MMU interval notifier
> has been
> + *                 removed
> + *
> + * This structure represents a GPU SVM notifier.
> + */
> +struct drm_gpusvm_notifier {
> +	struct drm_gpusvm *gpusvm;
> +	struct mmu_interval_notifier notifier;
> +	struct {
> +		u64 start;
> +		u64 end;
> +	} interval;
> +	struct {
> +		struct rb_node node;
> +		struct list_head entry;
> +		u64 __subtree_last;
> +	} rb;
> +	struct rb_root_cached root;
> +	struct list_head range_list;
> +	struct {
> +		u32 removed : 1;
> +	} flags;
> +};
> +
> +/**
> + * struct drm_gpusvm_range - Structure representing a GPU SVM range
> + *
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: Pointer to the GPU SVM notifier
> + * @refcount: Reference count for the range
> + * @rb: Red-black tree node for the parent GPU SVM notifier structure range tree
> + * @va: Virtual address range
> + * @notifier_seq: Notifier sequence number of the range's pages
> + * @pages: Pointer to the array of pages (if backing store is in VRAM)
> + * @dma_addr: DMA address array (if backing store is SRAM and DMA mapped)
> + * @vram_allocation: Driver-private pointer to the VRAM allocation
> + * @order: Order of dma mapping. i.e. PAGE_SIZE << order is mapping size
> + * @flags.migrate_vram: Flag indicating whether the range can be migrated to VRAM
> + * @flags.unmapped: Flag indicating if the range has been unmapped
> + * @flags.partial_unmap: Flag indicating if the range has been partially unmapped
> + * @flags.has_vram_pages: Flag indicating if the range has vram pages
> + * @flags.has_dma_mapping: Flag indicating if the range has a DMA mapping
> + * @flags.kfree_mapping: Flag indicating @dma_addr is a compact allocation based
> + *                       on @order which releases via kfree
> + *
> + * This structure represents a GPU SVM range used for tracking memory ranges
> + * mapped in a DRM device.
> + */
> +struct drm_gpusvm_range {
> +	struct drm_gpusvm *gpusvm;
> +	struct drm_gpusvm_notifier *notifier;
> +	struct kref refcount;
> +	struct {
> +		struct rb_node node;
> +		struct list_head entry;
> +		u64 __subtree_last;
> +	} rb;
> +	struct {
> +		u64 start;
> +		u64 end;
> +	} va;
> +	unsigned long notifier_seq;
> +	union {
> +		struct page **pages;
> +		dma_addr_t *dma_addr;
> +	};
> +	void *vram_allocation;
> +	u16 order;
> +	struct {
> +		/* All flags below must be set upon creation */
> +		u16 migrate_vram : 1;
> +		/* All flags below must be set / cleared under notifier lock */
> +		u16 unmapped : 1;
> +		u16 partial_unmap : 1;
> +		u16 has_vram_pages : 1;
> +		u16 has_dma_mapping : 1;
> +		u16 kfree_mapping : 1;
> +	} flags;
> +};
> +
> +/**
> + * struct drm_gpusvm - GPU SVM structure
> + *
> + * @name: Name of the GPU SVM
> + * @drm: Pointer to the DRM device structure
> + * @mm: Pointer to the mm_struct for the address space
> + * @device_private_page_owner: Device private pages owner
> + * @mm_start: Start address of GPU SVM
> + * @mm_range: Range of the GPU SVM
> + * @notifier_size: Size of individual notifiers
> + * @ops: Pointer to the operations structure for GPU SVM
> + * @chunk_sizes: Pointer to the array of chunk sizes used in range allocation.
> + *               Entries should be powers of 2 in descending order.
> + * @num_chunks: Number of chunks
> + * @notifier_lock: Read-write semaphore for protecting notifier operations
> + * @zdd_wq: Workqueue for deferred work on zdd destruction
> + * @root: Cached root node of the Red-Black tree containing GPU SVM notifiers
> + * @notifier_list: List head of notifiers in the same order they appear in the
> + *                 interval tree. This is useful to keep iterating notifiers
> + *                 while doing modifications to the RB tree.
> + *
> + * This structure represents a GPU SVM (Shared Virtual Memory) used for tracking
> + * memory ranges mapped in a DRM (Direct Rendering Manager) device.
> + *
> + * No reference counting is provided, as this is expected to be embedded in the
> + * driver VM structure along with the struct drm_gpuvm, which handles reference
> + * counting.
> + */
> +struct drm_gpusvm {
> +	const char *name;
> +	struct drm_device *drm;
> +	struct mm_struct *mm;
> +	void *device_private_page_owner;
> +	u64 mm_start;
> +	u64 mm_range;
> +	u64 notifier_size;
> +	const struct drm_gpusvm_ops *ops;
> +	const u64 *chunk_sizes;
> +	int num_chunks;
> +	struct rw_semaphore notifier_lock;
> +	struct workqueue_struct *zdd_wq;
> +	struct rb_root_cached root;
> +	struct list_head notifier_list;
> +};
> +
> +/**
> + * struct drm_gpusvm_ctx - DRM GPU SVM context
> + *
> + * @mmap_locked: mmap lock is locked
> + * @trylock_mmap: trylock mmap lock, used to avoid locking inversions
> + *                (e.g. dma-resv -> mmap lock)
> + * @in_notifier: entering from a MMU notifier
> + * @read_only: operating on read-only memory
> + * @vram_possible: possible to use VRAM
> + * @prefault: prefault pages
> + *
> + * Context that DRM GPUSVM is operating in (i.e. user arguments).
> + */
> +struct drm_gpusvm_ctx {
> +	u32 mmap_locked :1;
> +	u32 trylock_mmap :1;
> +	u32 in_notifier :1;
> +	u32 read_only :1;
> +	u32 vram_possible :1;
> +	u32 prefault :1;
> +};
> +
> +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> +		    const char *name, struct drm_device *drm,
> +		    struct mm_struct *mm, void *device_private_page_owner,
> +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> +		    const struct drm_gpusvm_ops *ops,
> +		    const u64 *chunk_sizes, int num_chunks);
> +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm);
> +void drm_gpusvm_free(struct drm_gpusvm *gpusvm);
> +
> +struct drm_gpusvm_range *
> +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64 fault_addr,
> +				u64 gpuva_start, u64 gpuva_end,
> +				const struct drm_gpusvm_ctx *ctx);
> +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> +			     struct drm_gpusvm_range *range);
> +
> +struct drm_gpusvm_range *
> +drm_gpusvm_range_get(struct drm_gpusvm_range *range);
> +void drm_gpusvm_range_put(struct drm_gpusvm_range *range);
> +
> +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> +				  struct drm_gpusvm_range *range);
> +
> +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> +			       struct drm_gpusvm_range *range,
> +			       const struct drm_gpusvm_ctx *ctx);
> +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> +				  struct drm_gpusvm_range *range,
> +				  const struct drm_gpusvm_ctx *ctx);
> +
> +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> +			       struct drm_gpusvm_range *range,
> +			       void *vram_allocation,
> +			       const struct drm_gpusvm_ctx *ctx);
> +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> +			       struct drm_gpusvm_range *range,
> +			       const struct drm_gpusvm_ctx *ctx);
> +
> +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void);
> +
> +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start, u64 end);
> +
> +struct drm_gpusvm_range *
> +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64 start, u64 end);
> +
> +/**
> + * drm_gpusvm_notifier_lock - Lock GPU SVM notifier
> + * @gpusvm__: Pointer to the GPU SVM structure.
> + *
> + * Abstract client usage GPU SVM notifier lock, take lock
> + */
> +#define drm_gpusvm_notifier_lock(gpusvm__)	\
> +	down_read(&(gpusvm__)->notifier_lock)
> +
> +/**
> + * drm_gpusvm_notifier_unlock - Unlock GPU SVM notifier
> + * @gpusvm__: Pointer to the GPU SVM structure.
> + *
> + * Abstract client usage GPU SVM notifier lock, drop lock
> + */
> +#define drm_gpusvm_notifier_unlock(gpusvm__)	\
> +	up_read(&(gpusvm__)->notifier_lock)
> +
> +/**
> + * __drm_gpusvm_range_next - Get the next GPU SVM range in the list
> + * @range: a pointer to the current GPU SVM range
> + *
> + * Return: A pointer to the next drm_gpusvm_range if available, or NULL if the
> + *         current range is the last one or if the input range is NULL.
> + */
> +static inline struct drm_gpusvm_range *
> +__drm_gpusvm_range_next(struct drm_gpusvm_range *range)
> +{
> +	if (range && !list_is_last(&range->rb.entry,
> +				   &range->notifier->range_list))
> +		return list_next_entry(range, rb.entry);
> +
> +	return NULL;
> +}
> +
> +/**
> + * drm_gpusvm_for_each_range - Iterate over GPU SVM ranges in a
> notifier
> + * @range__: Iterator variable for the ranges. If set, it indicates the start of
> + *	     the iterator. If NULL, call drm_gpusvm_range_find() to get the range.
> + * @notifier__: Pointer to the GPU SVM notifier
> + * @start__: Start address of the range
> + * @end__: End address of the range
> + *
> + * This macro is used to iterate over GPU SVM ranges in a notifier. It is safe
> + * to use while holding the driver SVM lock or the notifier lock.
> + */
> +#define drm_gpusvm_for_each_range(range__, notifier__, start__, end__)	\
> +	for ((range__) = (range__) ?:					\
> +	     drm_gpusvm_range_find((notifier__), (start__), (end__));	\
> +	     (range__) && (range__->va.start < (end__));		\
> +	     (range__) = __drm_gpusvm_range_next(range__))
> +
> +/**
> + * drm_gpusvm_range_set_unmapped - Mark a GPU SVM range as unmapped
> + * @range: Pointer to the GPU SVM range structure.
> + * @mmu_range: Pointer to the MMU notifier range structure.
> + *
> + * This function marks a GPU SVM range as unmapped and sets the partial_unmap
> + * flag if the range partially falls within the provided MMU notifier range.
> + */
> +static inline void
> +drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range *range,
> +			      const struct mmu_notifier_range *mmu_range)
> +{
> +	lockdep_assert_held_write(&range->gpusvm->notifier_lock);
> +
> +	range->flags.unmapped = true;
> +	if (range->va.start < mmu_range->start ||
> +	    range->va.end > mmu_range->end)
> +		range->flags.partial_unmap = true;
> +}
> +
> +#endif /* __DRM_GPUSVM_H__ */
Matthew Brost Sept. 24, 2024, 4:30 p.m. UTC | #38
On Tue, Sep 24, 2024 at 12:42:56PM +0200, Thomas Hellström wrote:
> Hi, Matt,
> 
> Some random review comments on this patch I came across while looking
> at multi-device.
> 
> Thanks,
> Thomas
> 
> 
> On Tue, 2024-08-27 at 19:48 -0700, Matthew Brost wrote:
> > This patch introduces support for GPU Shared Virtual Memory (SVM) in
> > the
> > Direct Rendering Manager (DRM) subsystem. SVM allows for seamless
> > sharing of memory between the CPU and GPU, enhancing performance and
> > flexibility in GPU computing tasks.
> > 
> > The patch adds the necessary infrastructure for SVM, including data
> > structures and functions for managing SVM ranges and notifiers. It
> > also
> > provides mechanisms for allocating, deallocating, and migrating
> > memory
> > regions between system RAM and GPU VRAM.
> > 
> > This mid-layer is largely inspired by GPUVM.
> 
> NIT: Naming: should it be drm_svm rather than drm_gpusvm? For the
> drm_gpuvm component, gpuvm clearly distinguished a gpu_vm from a
> mm_struct but here we don't have the same need.
> 

Can rename.

> > 
> > Cc: Dave Airlie <airlied@redhat.com>
> > Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > Cc: Christian König <christian.koenig@amd.com>
> > Cc: <dri-devel@lists.freedesktop.org>
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> >  drivers/gpu/drm/xe/Makefile     |    3 +-
> >  drivers/gpu/drm/xe/drm_gpusvm.c | 2174
> > +++++++++++++++++++++++++++++++
> >  drivers/gpu/drm/xe/drm_gpusvm.h |  415 ++++++
> >  3 files changed, 2591 insertions(+), 1 deletion(-)
> >  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.c
> >  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.h
> > 
> > diff --git a/drivers/gpu/drm/xe/Makefile
> > b/drivers/gpu/drm/xe/Makefile
> > index b9670ae09a9e..b8fc2ee58f1a 100644
> > --- a/drivers/gpu/drm/xe/Makefile
> > +++ b/drivers/gpu/drm/xe/Makefile
> > @@ -25,7 +25,8 @@ $(obj)/generated/%_wa_oob.c
> > $(obj)/generated/%_wa_oob.h: $(obj)/xe_gen_wa_oob \
> >  
> >  # core driver code
> >  
> > -xe-y += xe_bb.o \
> > +xe-y += drm_gpusvm.o \
> > +	xe_bb.o \
> >  	xe_bo.o \
> >  	xe_bo_evict.o \
> >  	xe_devcoredump.o \
> > diff --git a/drivers/gpu/drm/xe/drm_gpusvm.c
> > b/drivers/gpu/drm/xe/drm_gpusvm.c
> > new file mode 100644
> > index 000000000000..fc1e44e6ae72
> > --- /dev/null
> > +++ b/drivers/gpu/drm/xe/drm_gpusvm.c
> > @@ -0,0 +1,2174 @@
> > +// SPDX-License-Identifier: MIT
> > +/*
> > + * Copyright © 2024 Intel Corporation
> > + *
> > + * Authors:
> > + *     Matthew Brost <matthew.brost@intel.com>
> > + */
> > +
> > +#include <linux/dma-mapping.h>
> > +#include <linux/interval_tree_generic.h>
> > +#include <linux/hmm.h>
> > +#include <linux/memremap.h>
> > +#include <linux/migrate.h>
> > +#include <linux/mm_types.h>
> > +#include <linux/pagemap.h>
> > +#include <linux/slab.h>
> > +
> > +#include <drm/drm_device.h>
> > +#include "drm_gpusvm.h"
> > +
> > +/**
> > + * DOC: Overview
> > + *
> > + * GPU Shared Virtual Memory (GPU SVM) layer for the Direct
> > Rendering Manager (DRM)
> > + *
> > + * The GPU SVM layer is a component of the DRM framework designed to
> > manage shared
> > + * virtual memory between the CPU and GPU. It enables efficient data
> > exchange and
> > + * processing for GPU-accelerated applications by allowing memory
> > sharing and
> > + * synchronization between the CPU's and GPU's virtual address
> > spaces.
> > + *
> > + * Key GPU SVM Components:
> > + * - Notifiers: Notifiers: Used for tracking memory intervals and
> > notifying the
> > + *		GPU of changes, notifiers are sized based on a GPU
> > SVM
> > + *		initialization parameter, with a recommendation of
> > 512M or
> > + *		larger. They maintain a Red-BlacK tree and a list of
> > ranges that
> > + *		fall within the notifier interval. Notifiers are
> > tracked within
> > + *		a GPU SVM Red-BlacK tree and list and are
> > dynamically inserted
> > + *		or removed as ranges within the interval are created
> > or
> > + *		destroyed.
> > + * - Ranges: Represent memory ranges mapped in a DRM device and
> > managed
> > + *	     by GPU SVM. They are sized based on an array of chunk
> > sizes, which
> > + *	     is a GPU SVM initialization parameter, and the CPU
> > address space.
> > + *	     Upon GPU fault, the largest aligned chunk that fits
> > within the
> > + *	     faulting CPU address space is chosen for the range
> > size. Ranges are
> > + *	     expected to be dynamically allocated on GPU fault and
> > removed on an
> > + *	     MMU notifier UNMAP event. As mentioned above, ranges
> > are tracked in
> > + *	     a notifier's Red-Black tree.
> > + * - Operations: Define the interface for driver-specific SVM
> > operations such as
> > + *		 allocation, page collection, migration,
> > invalidations, and VRAM
> > + *		 release.
> > + *
> > + * This layer provides interfaces for allocating, mapping,
> > migrating, and
> > + * releasing memory ranges between the CPU and GPU. It handles all
> > core memory
> > + * management interactions (DMA mapping, HMM, and migration) and
> > provides
> > + * driver-specific virtual functions (vfuncs). This infrastructure
> > is sufficient
> > + * to build the expected driver components for an SVM implementation
> > as detailed
> > + * below.
> > + *
> > + * Expected Driver Components:
> > + * - GPU page fault handler: Used to create ranges and notifiers
> > based on the
> > + *			     fault address, optionally migrate the
> > range to
> > + *			     VRAM, and create GPU bindings.
> > + * - Garbage collector: Used to destroy GPU bindings for ranges.
> > Ranges are
> > + *			expected to be added to the garbage
> > collector upon
> > + *			MMU_NOTIFY_UNMAP event.
> > + */
> > +
> > +/**
> > + * DOC: Locking
> > + *
> > + * GPU SVM handles locking for core MM interactions, i.e., it
> > locks/unlocks the
> > + * mmap lock as needed. Alternatively, if the driver prefers to
> > handle the mmap
> > + * lock itself, a 'locked' argument is provided to the functions
> > that require
> > + * the mmap lock. This option may be useful for drivers that need to
> > call into
> > + * GPU SVM while also holding a dma-resv lock, thus preventing
> > locking
> > + * inversions between the mmap and dma-resv locks.
> > + *
> > + * GPU SVM introduces a global notifier lock, which safeguards the
> > notifier's
> > + * range RB tree and list, as well as the range's DMA mappings and
> > sequence
> > + * number. GPU SVM manages all necessary locking and unlocking
> > operations,
> > + * except for the recheck of the range's sequence number
> > + * (mmu_interval_read_retry) when the driver is committing GPU
> > bindings. This
> > + * lock corresponds to the 'driver->update' lock mentioned in the
> > HMM
> > + * documentation (TODO: Link). Future revisions may transition from
> > a GPU SVM
> > + * global lock to a per-notifier lock if finer-grained locking is
> > deemed
> > + * necessary.
> > + *
> > + * In addition to the locking mentioned above, the driver should
> > implement a
> > + * lock to safeguard core GPU SVM function calls that modify state,
> > such as
> > + * drm_gpusvm_range_find_or_insert and drm_gpusvm_range_remove.
> > Alternatively,
> > + * these core functions can be called within a single kernel thread,
> > for
> > + * instance, using an ordered work queue. This lock is denoted as
> > + * 'driver_svm_lock' in code examples.
> > + */
> > +
> > +/**
> > + * DOC: Migration
> > + *
> > + * The migration support is quite simple, allowing migration between
> > SRAM and
> > + * VRAM at the range granularity. For example, GPU SVM currently
> > does not
> > + * support mixing SRAM and VRAM pages within a range. This means
> > that upon GPU
> > + * fault, the entire range can be migrated to VRAM, and upon CPU
> > fault, the
> > + * entire range is migrated to SRAM.
> > + *
> > + * The reasoning for only supporting range granularity is as
> > follows: it
> > + * simplifies the implementation, and range sizes are driver-defined
> > and should
> > + * be relatively small.
> > + */
> > +
> > +/**
> > + * DOC: Partial Unmapping of Ranges
> > + *
> > + * Partial unmapping of ranges (e.g., 1M out of 2M is unmapped by
> > CPU resulting
> > + * in MMU_NOTIFY_UNMAP event) presents several challenges, with the
> > main one
> > + * being that a subset of the range still has CPU and GPU mappings.
> > If the
> > + * backing store for the range is in VRAM, a subset of the backing
> > store has
> > + * references. One option would be to split the range and VRAM
> > backing store,
> > + * but the implementation for this would be quite complicated. Given
> > that
> > + * partial unmappings are rare and driver-defined range sizes are
> > relatively
> > + * small, GPU SVM does not support splitting of ranges.
> > + *
> > + * With no support for range splitting, upon partial unmapping of a
> > range, the
> > + * driver is expected to invalidate and destroy the entire range. If
> > the range
> > + * has VRAM as its backing, the driver is also expected to migrate
> > any remaining
> > + * pages back to SRAM.
> > + */
> > +
> > +/**
> > + * DOC: Examples
> > + *
> > + * This section provides two examples of how to build the expected
> > driver
> > + * components: the GPU page fault handler and the garbage collector.
> > A third
> > + * example demonstrates a sample invalidation driver vfunc.
> > + *
> > + * The generic code provided does not include logic for complex
> > migration
> > + * policies, optimized invalidations, or other potentially required
> > driver
> > + * locking (e.g., DMA-resv locks).
> > + *
> > + * 1) GPU page fault handler
> > + *
> > + *	int driver_bind_range(struct drm_gpusvm *gpusvm, struct
> > drm_gpusvm_range *range)
> > + *	{
> > + *		int err = 0;
> > + *
> > + *		driver_alloc_and_setup_memory_for_bind(gpusvm,
> > range);
> > + *
> > + *		drm_gpusvm_notifier_lock(gpusvm);
> > + *		if (drm_gpusvm_range_pages_valid(range))
> > + *			driver_commit_bind(gpusvm, range);
> > + *		else
> > + *			err = -EAGAIN;
> > + *		drm_gpusvm_notifier_unlock(gpusvm);
> > + *
> > + *		return err;
> > + *	}
> > + *
> > + *	int driver_gpu_fault(struct drm_gpusvm *gpusvm, u64
> > fault_addr,
> > + *			     u64 gpuva_start, u64 gpuva_end)
> > + *	{
> > + *		struct drm_gpusvm_ctx ctx = {};
> > + *		int err;
> > + *
> > + *		driver_svm_lock();
> > + *	retry:
> > + *		// Always process UNMAPs first so view of GPU SVM
> > ranges is current
> > + *		driver_garbage_collector(gpusvm);
> > + *
> > + *		range = drm_gpusvm_range_find_or_insert(gpusvm,
> > fault_addr,
> > + *							gpuva_start,
> > gpuva_end,
> > + *						        &ctx);
> > + *		if (IS_ERR(range)) {
> > + *			err = PTR_ERR(range);
> > + *			goto unlock;
> > + *		}
> > + *
> > + *		if (driver_migration_policy(range)) {
> > + *			bo = driver_alloc_bo();
> > + *			err = drm_gpusvm_migrate_to_vram(gpusvm,
> > range, bo, &ctx);
> > + *			if (err)	// CPU mappings may have
> > changed
> > + *				goto retry;
> > + *		}
> > + *
> > + *		err = drm_gpusvm_range_get_pages(gpusvm, range,
> > &ctx);
> > + *		if (err == -EFAULT || err == -EPERM)	// CPU
> > mappings changed
> > + *			goto retry;
> > + *		else if (err)
> > + *			goto unlock;
> > + *
> > + *		err = driver_bind_range(gpusvm, range);
> > + *		if (err == -EAGAIN)	// CPU mappings changed
> > + *			goto retry
> > + *
> > + *	unlock:
> > + *		driver_svm_unlock();
> > + *		return err;
> > + *	}
> > + *
> > + * 2) Garbage Collector.
> > + *
> > + *	void __driver_garbage_collector(struct drm_gpusvm *gpusvm,
> > + *					struct drm_gpusvm_range
> > *range)
> > + *	{
> > + *		struct drm_gpusvm_ctx ctx = {};
> > + *
> > + *		assert_driver_svm_locked(gpusvm);
> > + *
> > + *		// Partial unmap, migrate any remaining VRAM pages
> > back to SRAM
> > + *		if (range->flags.partial_unmap)
> > + *			drm_gpusvm_migrate_to_sram(gpusvm, range,
> > &ctx);
> > + *
> > + *		driver_unbind_range(range);
> > + *		drm_gpusvm_range_remove(gpusvm, range);
> > + *	}
> > + *
> > + *	void driver_garbage_collector(struct drm_gpusvm *gpusvm)
> > + *	{
> > + *		assert_driver_svm_locked(gpusvm);
> > + *
> > + *		for_each_range_in_garbage_collector(gpusvm, range)
> > + *			__driver_garbage_collector(gpusvm, range);
> > + *	}
> > + *
> > + * 3) Invalidation driver vfunc.
> > + *
> > + *	void driver_invalidation(struct drm_gpusvm *gpusvm,
> > + *				 struct drm_gpusvm_notifier
> > *notifier,
> > + *				 const struct mmu_notifier_range
> > *mmu_range)
> > + *	{
> > + *		struct drm_gpusvm_ctx ctx = { .in_notifier = true,
> > };
> > + *		struct drm_gpusvm_range *range = NULL;
> > + *
> > + *		driver_invalidate_device_tlb(gpusvm, mmu_range->start, mmu_range->end);
> > + *
> > + *		drm_gpusvm_for_each_range(range, notifier,
> > mmu_range->start,
> > + *					  mmu_range->end) {
> > + *			drm_gpusvm_range_unmap_pages(gpusvm, range,
> > &ctx);
> > + *
> > + *			if (mmu_range->event != MMU_NOTIFY_UNMAP)
> > + *				continue;
> > + *
> > + *			drm_gpusvm_range_set_unmapped(range,
> > mmu_range);
> > + *			driver_garbage_collector_add(gpusvm, range);
> > + *		}
> > + *	}
> > + */
> > +
> > +#define DRM_GPUSVM_RANGE_START(_range)	((_range)->va.start)
> > +#define DRM_GPUSVM_RANGE_END(_range)	((_range)->va.end - 1)
> > +INTERVAL_TREE_DEFINE(struct drm_gpusvm_range, rb.node, u64,
> > rb.__subtree_last,
> > +		     DRM_GPUSVM_RANGE_START, DRM_GPUSVM_RANGE_END,
> > +		     static __maybe_unused, range);
> > +
> > +#define DRM_GPUSVM_NOTIFIER_START(_notifier)	((_notifier)->interval.start)
> > +#define DRM_GPUSVM_NOTIFIER_END(_notifier)	((_notifier)->interval.end - 1)
> > +INTERVAL_TREE_DEFINE(struct drm_gpusvm_notifier, rb.node, u64,
> > +		     rb.__subtree_last, DRM_GPUSVM_NOTIFIER_START,
> > +		     DRM_GPUSVM_NOTIFIER_END, static __maybe_unused,
> > notifier);
> > +
> 
> Since these trees span struct mm_struct address space which should fit
> in an unsigned long, can we use the generic version (interval_tree.h)
> rather than instantiating two new versions? I figure both contain
> overlapping ranges so we can't use maple trees?
> 

I can look into using a generic version, but actually I don't think we
allow overlapping, so a maple tree might work here too. I'll likely stick
a generic version in the next rev, but if the consensus is a maple tree we
can switch over to that fairly easily at any point in time, as the tree
interaction is completely encapsulated in the DRM SVM layer.
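
For reference, a rough sketch of what moving the notifier tracking onto the
generic helpers could look like (untested, just to illustrate the idea; the
embedded interval_tree_node and the helper names below are hypothetical):

	#include <linux/interval_tree.h>

	struct drm_gpusvm_notifier {
		struct drm_gpusvm *gpusvm;
		struct mmu_interval_notifier notifier;
		struct interval_tree_node itree;	/* replaces rb + interval */
		struct list_head entry;
		/* ... */
	};

	/* Insertion / lookup against gpusvm->root (struct rb_root_cached) */
	static void drm_gpusvm_notifier_itree_insert(struct drm_gpusvm *gpusvm,
						     struct drm_gpusvm_notifier *notifier,
						     unsigned long start,
						     unsigned long end)
	{
		notifier->itree.start = start;
		notifier->itree.last = end - 1;
		interval_tree_insert(&notifier->itree, &gpusvm->root);
	}

	static struct drm_gpusvm_notifier *
	drm_gpusvm_notifier_itree_find(struct drm_gpusvm *gpusvm,
				       unsigned long fault_addr)
	{
		struct interval_tree_node *node;

		node = interval_tree_iter_first(&gpusvm->root, fault_addr,
						fault_addr);
		return node ? container_of(node, struct drm_gpusvm_notifier,
					   itree) : NULL;
	}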

> > +/**
> > + * npages_in_range() - Calculate the number of pages in a given
> > range
> > + * @start__: The start address of the range
> > + * @end__: The end address of the range
> > + *
> > + * This macro calculates the number of pages in a given memory
> > range,
> > + * specified by the start and end addresses. It divides the
> > difference
> > + * between the end and start addresses by the page size (PAGE_SIZE)
> > to
> > + * determine the number of pages in the range.
> > + *
> > + * Return: The number of pages in the specified range.
> > + */
> > +#define npages_in_range(start__, end__)	\
> > +	(((end__) - (start__)) >> PAGE_SHIFT)
> > +
> > +/**
> > + * struct drm_gpusvm_zdd - GPU SVM zone device data
> > + *
> > + * @refcount: Reference count for the zdd
> > + * @destroy_work: Work structure for asynchronous zdd destruction
> > + * @range: Pointer to the GPU SVM range
> > + * @vram_allocation: Driver-private pointer to the VRAM allocation
> > + *
> > + * This structure serves as a generic wrapper installed in
> > + * page->zone_device_data. It provides infrastructure for looking up
> > a range
> > + * upon CPU page fault and asynchronously releasing VRAM once the
> > CPU has no
> > + * page references. Asynchronous release is useful because CPU page
> > references
> > + * can be dropped in IRQ contexts, while releasing VRAM likely
> > requires sleeping
> > + * locks.
> > + */
> > +struct drm_gpusvm_zdd {
> > +	struct kref refcount;
> > +	struct work_struct destroy_work;
> > +	struct drm_gpusvm_range *range;
>  
> I still believe previous review comments are valid here, considering we
> do have multiple drm_gpusvm per struct mm_struct, potentially all
> mapping the above page.
> 

Exactly which comments?

If it is related to the range pointer, that is going to be dropped. All
virtual references from zdd will be dropped (i.e. no pointer to even a
DRM SVM).
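
i.e. roughly something like this stripped-down zdd (rough sketch only,
field names are placeholders, not the final code):

	struct drm_gpusvm_zdd {
		struct kref refcount;
		struct work_struct destroy_work;
		void *devmem_allocation;	/* released via driver vfunc */
	};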

> > +	void *vram_allocation;
> 
> NIT: Naming. The core is using device memory or devmem. Should we
> follow.
>

I like devmem. Will change.
 
> Also could we, rather than using av void * use an embeddable struct
> with its own ops rather than using the gpusvm ops for this?
> 

Can you give me a code snippet example of what you think this should look
like? Not opposed to this.
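
To make sure we're talking about the same thing, is it roughly something
along these lines? (Strawman only, every name below is made up and not an
API proposal.)

	struct drm_gpusvm_devmem;

	struct drm_gpusvm_devmem_ops {
		void (*release)(struct drm_gpusvm_devmem *devmem);
		int (*populate_pfn)(struct drm_gpusvm_devmem *devmem,
				    unsigned long npages, unsigned long *pfn);
		int (*copy_to_devmem)(struct drm_gpusvm_devmem *devmem,
				      struct page **pages,
				      dma_addr_t *dma_addr,
				      unsigned long npages);
		int (*copy_to_ram)(struct drm_gpusvm_devmem *devmem,
				   struct page **pages,
				   dma_addr_t *dma_addr,
				   unsigned long npages);
	};

	struct drm_gpusvm_devmem {
		struct drm_device *drm;
		const struct drm_gpusvm_devmem_ops *ops;
	};

i.e. the driver would embed struct drm_gpusvm_devmem in its BO / allocation
and the zdd would point at the embedded struct instead of carrying a void *.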

> > +};
> > +
> > +/**
> > + * drm_gpusvm_zdd_destroy_work_func - Work function for destroying a
> > zdd
> > + * @w: Pointer to the work_struct
> > + *
> > + * This function releases VRAM, puts GPU SVM range, and frees zdd.
> > + */
> > +static void drm_gpusvm_zdd_destroy_work_func(struct work_struct *w)
> > +{
> > +	struct drm_gpusvm_zdd *zdd =
> > +		container_of(w, struct drm_gpusvm_zdd,
> > destroy_work);
> > +	struct drm_gpusvm_range *range = zdd->range;
> > +	struct drm_gpusvm *gpusvm = range->gpusvm;
> > +
> > +	if (gpusvm->ops->vram_release && zdd->vram_allocation)
> > +		gpusvm->ops->vram_release(zdd->vram_allocation);
> > +	drm_gpusvm_range_put(range);
> > +	kfree(zdd);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_zdd_alloc - Allocate a zdd structure.
> > + * @range: Pointer to the GPU SVM range.
> > + *
> > + * This function allocates and initializes a new zdd structure. It
> > sets up the
> > + * reference count, initializes the destroy work, and links the
> > provided GPU SVM
> > + * range.
> > + *
> > + * Returns:
> > + * Pointer to the allocated zdd on success, ERR_PTR() on failure.
> > + */
> > +static struct drm_gpusvm_zdd *
> > +drm_gpusvm_zdd_alloc(struct drm_gpusvm_range *range)
> > +{
> > +	struct drm_gpusvm_zdd *zdd;
> > +
> > +	zdd = kmalloc(sizeof(*zdd), GFP_KERNEL);
> > +	if (!zdd)
> > +		return NULL;
> > +
> > +	kref_init(&zdd->refcount);
> > +	INIT_WORK(&zdd->destroy_work,
> > drm_gpusvm_zdd_destroy_work_func);
> > +	zdd->range = drm_gpusvm_range_get(range);
> > +	zdd->vram_allocation = NULL;
> > +
> > +	return zdd;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_zdd_get - Get a reference to a zdd structure.
> > + * @zdd: Pointer to the zdd structure.
> > + *
> > + * This function increments the reference count of the provided zdd
> > structure.
> > + *
> > + * Returns: Pointer to the zdd structure.
> > + */
> > +static struct drm_gpusvm_zdd *drm_gpusvm_zdd_get(struct
> > drm_gpusvm_zdd *zdd)
> > +{
> > +	kref_get(&zdd->refcount);
> > +	return zdd;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_zdd_destroy - Destroy a zdd structure.
> > + * @ref: Pointer to the reference count structure.
> > + *
> > + * This function queues the destroy_work of the zdd for asynchronous
> > destruction.
> > + */
> > +static void drm_gpusvm_zdd_destroy(struct kref *ref)
> > +{
> > +	struct drm_gpusvm_zdd *zdd =
> > +		container_of(ref, struct drm_gpusvm_zdd, refcount);
> > +	struct drm_gpusvm *gpusvm = zdd->range->gpusvm;
> > +
> > +	queue_work(gpusvm->zdd_wq, &zdd->destroy_work);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_zdd_put - Put a zdd reference.
> > + * @zdd: Pointer to the zdd structure.
> > + *
> > + * This function decrements the reference count of the provided zdd
> > structure
> > + * and schedules its destruction if the count drops to zero.
> > + */
> > +static void drm_gpusvm_zdd_put(struct drm_gpusvm_zdd *zdd)
> > +{
> > +	kref_put(&zdd->refcount, drm_gpusvm_zdd_destroy);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_find - Find GPU SVM range from GPU SVM notifier
> > + * @notifier: Pointer to the GPU SVM notifier structure.
> > + * @start: Start address of the range
> > + * @end: End address of the range
> > + *
> > + * Return: A pointer to the drm_gpusvm_range if found or NULL
> > + */
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64
> > start, u64 end)
> > +{
> > +	return range_iter_first(&notifier->root, start, end - 1);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_for_each_range_safe - Safely iterate over GPU SVM
> > ranges in a notifier
> > + * @range__: Iterator variable for the ranges
> > + * @next__: Iterator variable for the ranges' temporary storage
> > + * @notifier__: Pointer to the GPU SVM notifier
> > + * @start__: Start address of the range
> > + * @end__: End address of the range
> > + *
> > + * This macro is used to iterate over GPU SVM ranges in a notifier
> > while
> > + * removing ranges from it.
> > + */
> > +#define drm_gpusvm_for_each_range_safe(range__, next__, notifier__, start__, end__)	\
> > +	for ((range__) = drm_gpusvm_range_find((notifier__), (start__), (end__)),	\
> > +	     (next__) = __drm_gpusvm_range_next(range__);				\
> > +	     (range__) && (range__->va.start < (end__));				\
> > +	     (range__) = (next__), (next__) = __drm_gpusvm_range_next(range__))
> > +
> > +/**
> > + * __drm_gpusvm_notifier_next - get the next drm_gpusvm_notifier in
> > the list
> > + * @notifier: a pointer to the current drm_gpusvm_notifier
> > + *
> > + * Return: A pointer to the next drm_gpusvm_notifier if available,
> > or NULL if
> > + *         the current notifier is the last one or if the input
> > notifier is
> > + *         NULL.
> > + */
> > +static struct drm_gpusvm_notifier *
> > +__drm_gpusvm_notifier_next(struct drm_gpusvm_notifier *notifier)
> > +{
> > +	if (notifier && !list_is_last(&notifier->rb.entry,
> > +				      &notifier->gpusvm->notifier_list))
> > +		return list_next_entry(notifier, rb.entry);
> > +
> > +	return NULL;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_for_each_notifier - Iterate over GPU SVM notifiers in
> > a gpusvm
> > + * @notifier__: Iterator variable for the notifiers
> > + * @gpusvm__: Pointer to the GPU SVM structure
> > + * @start__: Start address of the notifier
> > + * @end__: End address of the notifier
> > + *
> > + * This macro is used to iterate over GPU SVM notifiers in a gpusvm.
> > + */
> > +#define drm_gpusvm_for_each_notifier(notifier__, gpusvm__, start__, end__)		\
> > +	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root, (start__), (end__) - 1);	\
> > +	     (notifier__) && (notifier__->interval.start < (end__));			\
> > +	     (notifier__) = __drm_gpusvm_notifier_next(notifier__))
> > +
> > +/**
> > + * drm_gpusvm_for_each_notifier_safe - Safely iterate over GPU SVM
> > notifiers in a gpusvm
> > + * @notifier__: Iterator variable for the notifiers
> > + * @next__: Iterator variable for the notifiers' temporary storage
> > + * @gpusvm__: Pointer to the GPU SVM structure
> > + * @start__: Start address of the notifier
> > + * @end__: End address of the notifier
> > + *
> > + * This macro is used to iterate over GPU SVM notifiers in a gpusvm
> > while
> > + * removing notifiers from it.
> > + */
> > +#define drm_gpusvm_for_each_notifier_safe(notifier__, next__, gpusvm__, start__, end__)	\
> > +	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root, (start__), (end__) - 1),	\
> > +	     (next__) = __drm_gpusvm_notifier_next(notifier__);				\
> > +	     (notifier__) && (notifier__->interval.start < (end__));			\
> > +	     (notifier__) = (next__), (next__) = __drm_gpusvm_notifier_next(notifier__))
> > +
> > +/**
> > + * drm_gpusvm_notifier_invalidate - Invalidate a GPU SVM notifier.
> > + * @mni: Pointer to the mmu_interval_notifier structure.
> > + * @mmu_range: Pointer to the mmu_notifier_range structure.
> > + * @cur_seq: Current sequence number.
> > + *
> > + * This function serves as a generic MMU notifier for GPU SVM. It
> > sets the MMU
> > + * notifier sequence number and calls the driver invalidate vfunc
> > under
> > + * gpusvm->notifier_lock.
> > + *
> > + * Returns:
> > + * true if the operation succeeds, false otherwise.
> > + */
> > +static bool
> > +drm_gpusvm_notifier_invalidate(struct mmu_interval_notifier *mni,
> > +			       const struct mmu_notifier_range
> > *mmu_range,
> > +			       unsigned long cur_seq)
> > +{
> > +	struct drm_gpusvm_notifier *notifier =
> > +		container_of(mni, typeof(*notifier), notifier);
> > +	struct drm_gpusvm *gpusvm = notifier->gpusvm;
> > +
> > +	if (!mmu_notifier_range_blockable(mmu_range))
> > +		return false;
> > +
> > +	down_write(&gpusvm->notifier_lock);
> > +	mmu_interval_set_seq(mni, cur_seq);
> > +	gpusvm->ops->invalidate(gpusvm, notifier, mmu_range);
> > +	up_write(&gpusvm->notifier_lock);
> > +
> > +	return true;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_notifier_ops - MMU interval notifier operations for
> > GPU SVM
> > + */
> > +static const struct mmu_interval_notifier_ops
> > drm_gpusvm_notifier_ops = {
> > +	.invalidate = drm_gpusvm_notifier_invalidate,
> > +};
> > +
> > +/**
> > + * drm_gpusvm_init - Initialize the GPU SVM.
> > + * @gpusvm: Pointer to the GPU SVM structure.
> > + * @name: Name of the GPU SVM.
> > + * @drm: Pointer to the DRM device structure.
> > + * @mm: Pointer to the mm_struct for the address space.
> > + * @device_private_page_owner: Device private pages owner.
> > + * @mm_start: Start address of GPU SVM.
> > + * @mm_range: Range of the GPU SVM.
> > + * @notifier_size: Size of individual notifiers.
> > + * @ops: Pointer to the operations structure for GPU SVM.
> > + * @chunk_sizes: Pointer to the array of chunk sizes used in range
> > allocation.
> > + *               Entries should be powers of 2 in descending order
> > with last
> > + *               entry being SZ_4K.
> > + * @num_chunks: Number of chunks.
> > + *
> > + * This function initializes the GPU SVM.
> > + *
> > + * Returns:
> > + * 0 on success, a negative error code on failure.
> > + */
> > +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> > +		    const char *name, struct drm_device *drm,
> > +		    struct mm_struct *mm, void
> > *device_private_page_owner,
> > +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> > +		    const struct drm_gpusvm_ops *ops,
> > +		    const u64 *chunk_sizes, int num_chunks)
> > +{
> > +	if (!ops->invalidate || !num_chunks)
> > +		return -EINVAL;
> > +
> > +	gpusvm->name = name;
> > +	gpusvm->drm = drm;
> > +	gpusvm->mm = mm;
> > +	gpusvm->device_private_page_owner =
> > device_private_page_owner;
> > +	gpusvm->mm_start = mm_start;
> > +	gpusvm->mm_range = mm_range;
> > +	gpusvm->notifier_size = notifier_size;
> > +	gpusvm->ops = ops;
> > +	gpusvm->chunk_sizes = chunk_sizes;
> > +	gpusvm->num_chunks = num_chunks;
> > +	gpusvm->zdd_wq = system_wq;
> > +
> > +	mmgrab(mm);
> > +	gpusvm->root = RB_ROOT_CACHED;
> > +	INIT_LIST_HEAD(&gpusvm->notifier_list);
> > +
> > +	init_rwsem(&gpusvm->notifier_lock);
> > +
> > +	fs_reclaim_acquire(GFP_KERNEL);
> > +	might_lock(&gpusvm->notifier_lock);
> > +	fs_reclaim_release(GFP_KERNEL);
> > +
> > +	return 0;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_notifier_find - Find GPU SVM notifier
> > + * @gpusvm__: Pointer to the GPU SVM structure
> > + * @fault_addr__: Fault address
> > + *
> > + * This macro finds the GPU SVM notifier associated with the fault
> > address.
> > + *
> > + * Returns:
> > + * Pointer to the GPU SVM notifier on success, NULL otherwise.
> > + */
> > +#define drm_gpusvm_notifier_find(gpusvm__, fault_addr__)	\
> > +	notifier_iter_first(&(gpusvm__)->root, (fault_addr__),	\
> > +			    (fault_addr__ + 1))
> > +
> > +/**
> > + * to_drm_gpusvm_notifier - retrieve the container struct for a
> > given rbtree node
> > + * @node__: a pointer to the rbtree node embedded within a
> > drm_gpusvm_notifier struct
> > + *
> > + * Return: A pointer to the containing drm_gpusvm_notifier
> > structure.
> > + */
> > +#define to_drm_gpusvm_notifier(__node)				\
> > +	container_of((__node), struct drm_gpusvm_notifier, rb.node)
> > +
> > +/**
> > + * drm_gpusvm_notifier_insert - Insert GPU SVM notifier
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + *
> > + * This function inserts the GPU SVM notifier into the GPU SVM RB
> > tree and list.
> > + */
> > +static void drm_gpusvm_notifier_insert(struct drm_gpusvm *gpusvm,
> > +				       struct drm_gpusvm_notifier
> > *notifier)
> > +{
> > +	struct rb_node *node;
> > +	struct list_head *head;
> > +
> > +	notifier_insert(notifier, &gpusvm->root);
> > +
> > +	node = rb_prev(&notifier->rb.node);
> > +	if (node)
> > +		head = &(to_drm_gpusvm_notifier(node))->rb.entry;
> > +	else
> > +		head = &gpusvm->notifier_list;
> > +
> > +	list_add(&notifier->rb.entry, head);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_notifier_remove - Remove GPU SVM notifier
> > + * @gpusvm__: Pointer to the GPU SVM structure
> > + * @notifier__: Pointer to the GPU SVM notifier structure
> > + *
> > + * This macro removes the GPU SVM notifier from the GPU SVM RB tree
> > and list.
> > + */
> > +#define drm_gpusvm_notifier_remove(gpusvm__, notifier__)	\
> > +	notifier_remove((notifier__), &(gpusvm__)->root);	\
> > +	list_del(&(notifier__)->rb.entry)
> > +
> > +/**
> > + * drm_gpusvm_fini - Finalize the GPU SVM.
> > + * @gpusvm: Pointer to the GPU SVM structure.
> > + *
> > + * This function finalizes the GPU SVM by cleaning up any remaining
> > ranges and
> > + * notifiers, and dropping a reference to struct MM.
> > + */
> > +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm)
> > +{
> > +	struct drm_gpusvm_notifier *notifier, *next;
> > +
> > +	drm_gpusvm_for_each_notifier_safe(notifier, next, gpusvm, 0,
> > LONG_MAX) {
> > +		struct drm_gpusvm_range *range, *__next;
> > +
> > +		/*
> > +		 * Remove notifier first to avoid racing with any
> > invalidation
> > +		 */
> > +		mmu_interval_notifier_remove(&notifier->notifier);
> > +		notifier->flags.removed = true;
> > +
> > +		drm_gpusvm_for_each_range_safe(range, __next,
> > notifier, 0,
> > +					       LONG_MAX)
> > +			drm_gpusvm_range_remove(gpusvm, range);
> > +	}
> > +
> > +	mmdrop(gpusvm->mm);
> > +	WARN_ON(!RB_EMPTY_ROOT(&gpusvm->root.rb_root));
> > +}
> > +
> > +/**
> > + * drm_gpusvm_notifier_alloc - Allocate GPU SVM notifier
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @fault_addr: Fault address
> > + *
> > + * This function allocates and initializes the GPU SVM notifier
> > structure.
> > + *
> > + * Returns:
> > + * Pointer to the allocated GPU SVM notifier on success, ERR_PTR()
> > on failure.
> > + */
> > +static struct drm_gpusvm_notifier *
> > +drm_gpusvm_notifier_alloc(struct drm_gpusvm *gpusvm, u64 fault_addr)
> > +{
> > +	struct drm_gpusvm_notifier *notifier;
> > +
> > +	if (gpusvm->ops->notifier_alloc)
> > +		notifier = gpusvm->ops->notifier_alloc();
> > +	else
> > +		notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
> > +
> > +	if (!notifier)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	notifier->gpusvm = gpusvm;
> > +	notifier->interval.start = ALIGN_DOWN(fault_addr, gpusvm->notifier_size);
> > +	notifier->interval.end = ALIGN(fault_addr + 1, gpusvm->notifier_size);
> > +	INIT_LIST_HEAD(&notifier->rb.entry);
> > +	notifier->root = RB_ROOT_CACHED;
> > +	INIT_LIST_HEAD(&notifier->range_list);
> > +
> > +	return notifier;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_notifier_free - Free GPU SVM notifier
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + *
> > + * This function frees the GPU SVM notifier structure.
> > + */
> > +static void drm_gpusvm_notifier_free(struct drm_gpusvm *gpusvm,
> > +				     struct drm_gpusvm_notifier
> > *notifier)
> > +{
> > +	WARN_ON(!RB_EMPTY_ROOT(&notifier->root.rb_root));
> > +
> > +	if (gpusvm->ops->notifier_free)
> > +		gpusvm->ops->notifier_free(notifier);
> > +	else
> > +		kfree(notifier);
> > +}
> > +
> > +/**
> > + * to_drm_gpusvm_range - retrieve the container struct for a given
> > rbtree node
> > + * @node__: a pointer to the rbtree node embedded within a
> > drm_gpusvm_range struct
> > + *
> > + * Return: A pointer to the containing drm_gpusvm_range structure.
> > + */
> > +#define to_drm_gpusvm_range(node__)	\
> > +	container_of((node__), struct drm_gpusvm_range, rb.node)
> > +
> > +/**
> > + * drm_gpusvm_range_insert - Insert GPU SVM range
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * This function inserts the GPU SVM range into the notifier RB tree
> > and list.
> > + */
> > +static void drm_gpusvm_range_insert(struct drm_gpusvm_notifier
> > *notifier,
> > +				    struct drm_gpusvm_range *range)
> > +{
> > +	struct rb_node *node;
> > +	struct list_head *head;
> > +
> > +	drm_gpusvm_notifier_lock(notifier->gpusvm);
> > +	range_insert(range, &notifier->root);
> > +
> > +	node = rb_prev(&range->rb.node);
> > +	if (node)
> > +		head = &(to_drm_gpusvm_range(node))->rb.entry;
> > +	else
> > +		head = &notifier->range_list;
> > +
> > +	list_add(&range->rb.entry, head);
> > +	drm_gpusvm_notifier_unlock(notifier->gpusvm);
> > +}
> > +
> > +/**
> > + * __drm_gpusvm_range_remove - Remove GPU SVM range
> > + * @notifier__: Pointer to the GPU SVM notifier structure
> > + * @range__: Pointer to the GPU SVM range structure
> > + *
> > + * This macro removes the GPU SVM range from the notifier RB tree
> > and list.
> > + */
> > +#define __drm_gpusvm_range_remove(notifier__, range__)		\
> > +	range_remove((range__), &(notifier__)->root);		\
> > +	list_del(&(range__)->rb.entry)
> > +
> > +/**
> > + * drm_gpusvm_range_alloc - Allocate GPU SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + * @fault_addr: Fault address
> > + * @chunk_size: Chunk size
> > + * @migrate_vram: Flag indicating whether to migrate VRAM
> > + *
> > + * This function allocates and initializes the GPU SVM range
> > structure.
> > + *
> > + * Returns:
> > + * Pointer to the allocated GPU SVM range on success, ERR_PTR() on
> > failure.
> > + */
> > +static struct drm_gpusvm_range *
> > +drm_gpusvm_range_alloc(struct drm_gpusvm *gpusvm,
> > +		       struct drm_gpusvm_notifier *notifier,
> > +		       u64 fault_addr, u64 chunk_size, bool
> > migrate_vram)
> > +{
> > +	struct drm_gpusvm_range *range;
> > +
> > +	if (gpusvm->ops->range_alloc)
> > +		range = gpusvm->ops->range_alloc(gpusvm);
> > +	else
> > +		range = kzalloc(sizeof(*range), GFP_KERNEL);
> > +
> > +	if (!range)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	kref_init(&range->refcount);
> > +	range->gpusvm = gpusvm;
> > +	range->notifier = notifier;
> > +	range->va.start = ALIGN_DOWN(fault_addr, chunk_size);
> > +	range->va.end = ALIGN(fault_addr + 1, chunk_size);
> > +	INIT_LIST_HEAD(&range->rb.entry);
> > +	range->notifier_seq = LONG_MAX;
> > +	range->flags.migrate_vram = migrate_vram ? 1 : 0;
> > +
> > +	return range;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_check_pages - Check pages
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + * @start: Start address
> > + * @end: End address
> > + *
> > + * Check if pages between start and end have been faulted in on the
> > CPU. Use to
> > + * prevent migration of pages without CPU backing store.
> > + *
> > + * Returns:
> > + * True if pages have been faulted into CPU, False otherwise
> > + */
> > +static bool drm_gpusvm_check_pages(struct drm_gpusvm *gpusvm,
> > +				   struct drm_gpusvm_notifier
> > *notifier,
> > +				   u64 start, u64 end)
> > +{
> > +	struct hmm_range hmm_range = {
> > +		.default_flags = 0,
> > +		.notifier = &notifier->notifier,
> > +		.start = start,
> > +		.end = end,
> > +		.dev_private_owner = gpusvm->device_private_page_owner,
> > +	};
> > +	unsigned long timeout =
> > +		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> > +	unsigned long *pfns;
> > +	unsigned long npages = npages_in_range(start, end);
> > +	int err, i;
> > +
> > +	mmap_assert_locked(gpusvm->mm);
> > +
> > +	pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
> > +	if (!pfns)
> > +		return false;
> > +
> > +	hmm_range.notifier_seq = mmu_interval_read_begin(&notifier->notifier);
> > +	hmm_range.hmm_pfns = pfns;
> > +
> > +	while (true) {
> > +		err = hmm_range_fault(&hmm_range);
> > +		if (err == -EBUSY) {
> > +			if (time_after(jiffies, timeout))
> > +				break;
> > +
> > +			hmm_range.notifier_seq = mmu_interval_read_begin(&notifier->notifier);
> > +			continue;
> > +		}
> > +		break;
> > +	}
> > +	if (err)
> > +		goto err_free;
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		if (!(pfns[i] & HMM_PFN_VALID)) {
> > +			err = -EFAULT;
> > +			goto err_free;
> > +		}
> > +	}
> > +
> > +err_free:
> > +	kvfree(pfns);
> > +	return err ? false : true;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_chunk_size - Determine chunk size for GPU SVM
> > range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + * @vas: Pointer to the virtual memory area structure
> > + * @fault_addr: Fault address
> > + * @gpuva_start: Start address of GPUVA which mirrors CPU
> > + * @gpuva_end: End address of GPUVA which mirrors CPU
> > + * @check_pages: Flag indicating whether to check pages
> > + *
> > + * This function determines the chunk size for the GPU SVM range
> > based on the
> > + * fault address, GPU SVM chunk sizes, existing GPU SVM ranges, and
> > the virtual
> > + * memory area boundaries.
> > + *
> > + * Returns:
> > + * Chunk size on success, LONG_MAX on failure.
> > + */
> > +static u64 drm_gpusvm_range_chunk_size(struct drm_gpusvm *gpusvm,
> > +				       struct drm_gpusvm_notifier
> > *notifier,
> > +				       struct vm_area_struct *vas,
> > +				       u64 fault_addr, u64
> > gpuva_start,
> > +				       u64 gpuva_end, bool
> > check_pages)
> > +{
> > +	u64 start, end;
> > +	int i = 0;
> > +
> > +retry:
> > +	for (; i < gpusvm->num_chunks; ++i) {
> > +		start = ALIGN_DOWN(fault_addr, gpusvm->chunk_sizes[i]);
> > +		end = ALIGN(fault_addr + 1, gpusvm->chunk_sizes[i]);
> > +
> > +		if (start >= vas->vm_start && end <= vas->vm_end &&
> > +		    start >= notifier->interval.start &&
> > +		    end <= notifier->interval.end &&
> > +		    start >= gpuva_start && end <= gpuva_end)
> > +			break;
> > +	}
> > +
> > +	if (i == gpusvm->num_chunks)
> > +		return LONG_MAX;
> > +
> > +	/*
> > +	 * If the allocation is more than a page, ensure it does not overlap
> > +	 * with existing ranges.
> > +	 */
> > +	if (end - start != SZ_4K) {
> > +		struct drm_gpusvm_range *range;
> > +
> > +		range = drm_gpusvm_range_find(notifier, start, end);
> > +		if (range) {
> > +			++i;
> > +			goto retry;
> > +		}
> > +
> > +		/*
> > +		 * XXX: Only create range on pages CPU has faulted
> > in. Without
> > +		 * this check, or prefault, on BMG
> > 'xe_exec_system_allocator --r
> > +		 * process-many-malloc' fails. In the failure case,
> > each process
> > +		 * mallocs 16k but the CPU VMA is ~128k which
> > results in 64k SVM
> > +		 * ranges. When migrating the SVM ranges, some
> > processes fail in
> > +		 * drm_gpusvm_migrate_to_vram with 'migrate.cpages
> > != npages'
> > +		 * and then upon drm_gpusvm_range_get_pages device
> > pages from
> > +		 * other processes are collected + faulted in which
> > creates all
> > +		 * sorts of problems. Unsure exactly how this
> > happening, also
> > +		 * problem goes away if 'xe_exec_system_allocator --
> > r
> > +		 * process-many-malloc' mallocs at least 64k at a
> > time.
> > +		 */
> > +		if (check_pages &&
> > +		    !drm_gpusvm_check_pages(gpusvm, notifier, start,
> > end)) {
> > +			++i;
> > +			goto retry;
> > +		}
> > +	}
> > +
> > +	return end - start;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_find_or_insert - Find or insert GPU SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @fault_addr: Fault address
> > + * @gpuva_start: Start address of GPUVA which mirrors CPU
> > + * @gpuva_end: End address of GPUVA which mirrors CPU
> > + * @ctx: GPU SVM context
> > + *
> > + * This function finds or inserts a newly allocated GPU SVM range based on the
> > + * fault address. Caller must hold a lock to protect range lookup and insertion.
> > + *
> > + * Returns:
> > + * Pointer to the GPU SVM range on success, ERR_PTR() on failure.
> > + */
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64
> > fault_addr,
> > +				u64 gpuva_start, u64 gpuva_end,
> > +				const struct drm_gpusvm_ctx *ctx)
> > +{
> > +	struct drm_gpusvm_notifier *notifier;
> > +	struct drm_gpusvm_range *range;
> > +	struct mm_struct *mm = gpusvm->mm;
> > +	struct vm_area_struct *vas;
> > +	bool notifier_alloc = false;
> > +	u64 chunk_size;
> > +	int err;
> > +	bool migrate_vram;
> > +
> > +	if (fault_addr < gpusvm->mm_start ||
> > +	    fault_addr > gpusvm->mm_start + gpusvm->mm_range) {
> > +		err = -EINVAL;
> > +		goto err_out;
> > +	}
> > +
> > +	if (!ctx->mmap_locked) {
> > +		if (!mmget_not_zero(mm)) {
> > +			err = -EFAULT;
> > +			goto err_out;
> > +		}
> > +		mmap_write_lock(mm);
> > +	}
> > +
> > +	mmap_assert_write_locked(mm);
> > +
> > +	notifier = drm_gpusvm_notifier_find(gpusvm, fault_addr);
> > +	if (!notifier) {
> > +		notifier = drm_gpusvm_notifier_alloc(gpusvm,
> > fault_addr);
> > +		if (IS_ERR(notifier)) {
> > +			err = PTR_ERR(notifier);
> > +			goto err_mmunlock;
> > +		}
> > +		notifier_alloc = true;
> > +		err = mmu_interval_notifier_insert_locked(&notifier->notifier,
> > +							  mm, notifier->interval.start,
> > +							  notifier->interval.end -
> > +							  notifier->interval.start,
> > +							  &drm_gpusvm_notifier_ops);
> > +		if (err)
> > +			goto err_notifier;
> > +	}
> > +
> > +	vas = vma_lookup(mm, fault_addr);
> > +	if (!vas) {
> > +		err = -ENOENT;
> > +		goto err_notifier_remove;
> > +	}
> > +
> > +	if (!ctx->read_only && !(vas->vm_flags & VM_WRITE)) {
> > +		err = -EPERM;
> > +		goto err_notifier_remove;
> > +	}
> > +
> > +	range = drm_gpusvm_range_find(notifier, fault_addr,
> > fault_addr + 1);
> > +	if (range)
> > +		goto out_mmunlock;
> > +	/*
> > +	 * XXX: Short-circuiting migration based on migrate_vma_*
> > current
> > +	 * limitations. If/when migrate_vma_* add more support, this
> > logic will
> > +	 * have to change.
> > +	 */
> > +	migrate_vram = ctx->vram_possible &&
> > +		vma_is_anonymous(vas) && !is_vm_hugetlb_page(vas);
> > +
> > +	chunk_size = drm_gpusvm_range_chunk_size(gpusvm, notifier,
> > vas,
> > +						 fault_addr,
> > gpuva_start,
> > +						 gpuva_end,
> > migrate_vram &&
> > +						 !ctx->prefault);
> > +	if (chunk_size == LONG_MAX) {
> > +		err = -EINVAL;
> > +		goto err_notifier_remove;
> > +	}
> > +
> > +	range = drm_gpusvm_range_alloc(gpusvm, notifier, fault_addr,
> > chunk_size,
> > +				       migrate_vram);
> > +	if (IS_ERR(range)) {
> > +		err = PTR_ERR(range);
> > +		goto err_notifier_remove;
> > +	}
> > +
> > +	drm_gpusvm_range_insert(notifier, range);
> > +	if (notifier_alloc)
> > +		drm_gpusvm_notifier_insert(gpusvm, notifier);
> > +
> > +	if (ctx->prefault) {
> > +		struct drm_gpusvm_ctx __ctx = *ctx;
> > +
> > +		__ctx.mmap_locked = true;
> > +		err = drm_gpusvm_range_get_pages(gpusvm, range,
> > &__ctx);
> > +		if (err)
> > +			goto err_range_remove;
> > +	}
> > +
> > +out_mmunlock:
> > +	if (!ctx->mmap_locked) {
> > +		mmap_write_unlock(mm);
> > +		mmput(mm);
> > +	}
> > +
> > +	return range;
> > +
> > +err_range_remove:
> > +	__drm_gpusvm_range_remove(notifier, range);
> > +err_notifier_remove:
> > +	if (notifier_alloc)
> > +		mmu_interval_notifier_remove(&notifier->notifier);
> > +err_notifier:
> > +	if (notifier_alloc)
> > +		drm_gpusvm_notifier_free(gpusvm, notifier);
> > +err_mmunlock:
> > +	if (!ctx->mmap_locked) {
> > +		mmap_write_unlock(mm);
> > +		mmput(mm);
> > +	}
> > +err_out:
> > +	return ERR_PTR(err);
> > +}
> > +
> > +/**
> > + * for_each_dma_page - iterate over pages in a DMA region
> > + * @i__: the current page index in the iteration
> > + * @j__: the current page index, log order, in the iteration
> > + * @npages__: the total number of pages in the DMA region
> > + * @order__: the order of the pages in the DMA region
> > + *
> > + * This macro iterates over each page in a DMA region. The DMA
> > region
> > + * is assumed to be composed of 2^@order__ pages, and the macro will
> > + * step through the region one block of 2^@order__ pages at a time.
> > + */
> > +#define for_each_dma_page(i__, j__, npages__, order__)	\
> > +	for ((i__) = 0, (j__) = 0; (i__) < (npages__);	\
> > +	     (j__)++, (i__) += 0x1 << (order__))
> > +
> > +/**
> > + * __drm_gpusvm_range_unmap_pages - Unmap pages associated with a
> > GPU SVM range (internal)
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * This function unmaps pages associated with a GPU SVM range. It assumes and
> > + * asserts that correct locking is in place when called.
> > + */
> > +static void __drm_gpusvm_range_unmap_pages(struct drm_gpusvm
> > *gpusvm,
> > +					   struct drm_gpusvm_range
> > *range)
> > +{
> > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > +
> > +	if (range->pages) {
> > +		unsigned long i, j, npages = npages_in_range(range->va.start,
> > +							     range->va.end);
> > +
> > +		if (range->flags.has_dma_mapping) {
> > +			for_each_dma_page(i, j, npages, range->order)
> > +				dma_unmap_page(gpusvm->drm->dev,
> > +					       range->dma_addr[j],
> > +					       PAGE_SIZE << range->order,
> > +					       DMA_BIDIRECTIONAL);
> > +		}
> > +
> > +		range->flags.has_vram_pages = false;
> > +		range->flags.has_dma_mapping = false;
> > +	}
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_free_pages - Free pages associated with a GPU
> > SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * This function frees pages associated with a GPU SVM range.
> > + */
> > +static void drm_gpusvm_range_free_pages(struct drm_gpusvm *gpusvm,
> > +					struct drm_gpusvm_range
> > *range)
> > +{
> > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > +
> > +	if (range->pages) {
> > +		if (range->flags.kfree_mapping) {
> > +			kfree(range->dma_addr);
> > +			range->flags.kfree_mapping = false;
> > +			range->pages = NULL;
> > +		} else {
> > +			kvfree(range->pages);
> > +			range->pages = NULL;
> > +		}
> > +	}
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_remove - Remove GPU SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range to be removed
> > + *
> > + * This function removes the specified GPU SVM range and also
> > removes the parent
> > + * GPU SVM notifier if no more ranges remain in the notifier. The
> > caller must
> > + * hold a lock to protect range and notifier removal.
> > + */
> > +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> > +			     struct drm_gpusvm_range *range)
> > +{
> > +	struct drm_gpusvm_notifier *notifier;
> > +
> > +	notifier = drm_gpusvm_notifier_find(gpusvm, range-
> > >va.start);
> > +	if (WARN_ON_ONCE(!notifier))
> > +		return;
> > +
> > +	drm_gpusvm_notifier_lock(gpusvm);
> > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > +	drm_gpusvm_range_free_pages(gpusvm, range);
> > +	__drm_gpusvm_range_remove(notifier, range);
> > +	drm_gpusvm_notifier_unlock(gpusvm);
> > +
> > +	drm_gpusvm_range_put(range);
> > +
> > +	if (RB_EMPTY_ROOT(&notifier->root.rb_root)) {
> > +		if (!notifier->flags.removed)
> > +			mmu_interval_notifier_remove(&notifier-
> > >notifier);
> > +		drm_gpusvm_notifier_remove(gpusvm, notifier);
> > +		drm_gpusvm_notifier_free(gpusvm, notifier);
> > +	}
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_get - Get a reference to GPU SVM range
> > + * @range: Pointer to the GPU SVM range
> > + *
> > + * This function increments the reference count of the specified GPU
> > SVM range.
> > + *
> > + * Returns:
> > + * Pointer to the GPU SVM range.
> > + */
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_get(struct drm_gpusvm_range *range)
> > +{
> > +	kref_get(&range->refcount);
> > +
> > +	return range;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_destroy - Destroy GPU SVM range
> > + * @refcount: Pointer to the reference counter embedded in the GPU
> > SVM range
> > + *
> > + * This function destroys the specified GPU SVM range when its
> > reference count
> > + * reaches zero. If a custom range-free function is provided, it is
> > invoked to
> > + * free the range; otherwise, the range is deallocated using
> > kfree().
> > + */
> > +static void drm_gpusvm_range_destroy(struct kref *refcount)
> > +{
> > +	struct drm_gpusvm_range *range =
> > +		container_of(refcount, struct drm_gpusvm_range,
> > refcount);
> > +	struct drm_gpusvm *gpusvm = range->gpusvm;
> > +
> > +	if (gpusvm->ops->range_free)
> > +		gpusvm->ops->range_free(range);
> > +	else
> > +		kfree(range);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_put - Put a reference to GPU SVM range
> > + * @range: Pointer to the GPU SVM range
> > + *
> > + * This function decrements the reference count of the specified GPU
> > SVM range
> > + * and frees it when the count reaches zero.
> > + */
> > +void drm_gpusvm_range_put(struct drm_gpusvm_range *range)
> > +{
> > +	kref_put(&range->refcount, drm_gpusvm_range_destroy);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_pages_valid - GPU SVM range pages valid
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * This function determines if a GPU SVM range's pages are valid.
> > Expected to be
> > + * called holding gpusvm->notifier_lock and as the last step before
> > committing a
> > + * GPU binding.
> > + *
> > + * Returns:
> > + * True if GPU SVM range has valid pages, False otherwise
> > + */
> > +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> > +				  struct drm_gpusvm_range *range)
> > +{
> > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > +
> > +	return range->flags.has_vram_pages || range-
> > >flags.has_dma_mapping;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_pages_valid_unlocked - GPU SVM range pages valid
> > unlocked
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * This function determines if a GPU SVM range's pages are valid.
> > Expected to be
> > + * called without holding gpusvm->notifier_lock.
> > + *
> > + * Returns:
> > + * True if GPU SVM range has valid pages, False otherwise
> > + */
> > +static bool
> > +drm_gpusvm_range_pages_valid_unlocked(struct drm_gpusvm *gpusvm,
> > +				      struct drm_gpusvm_range
> > *range)
> > +{
> > +	bool pages_valid;
> > +
> > +	if (!range->pages)
> > +		return false;
> > +
> > +	drm_gpusvm_notifier_lock(gpusvm);
> > +	pages_valid = drm_gpusvm_range_pages_valid(gpusvm, range);
> > +	if (!pages_valid && range->flags.kfree_mapping) {
> > +		kfree(range->dma_addr);
> > +		range->flags.kfree_mapping = false;
> > +		range->pages = NULL;
> > +	}
> > +	drm_gpusvm_notifier_unlock(gpusvm);
> > +
> > +	return pages_valid;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_get_pages - Get pages for a GPU SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + * @ctx: GPU SVM context
> > + *
> > + * This function gets pages for a GPU SVM range and ensures they are
> > mapped for
> > + * DMA access.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       const struct drm_gpusvm_ctx *ctx)
> > +{
> 
> Is it possible to split this function up to make it look more neat?
> 
> 
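One possible split (a rough sketch only; the helper name is invented here
and the locking semantics are meant to match the loop below) would be to
pull the hmm_range_fault retry loop out into a helper:

static int drm_gpusvm_hmm_range_fault(struct drm_gpusvm *gpusvm,
				      struct drm_gpusvm_range *range,
				      struct hmm_range *hmm_range,
				      bool mmap_locked, bool vram_pages)
{
	unsigned long timeout =
		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
	struct mm_struct *mm = gpusvm->mm;
	int err;

	while (true) {
		/* Must be checked after mmu_interval_read_begin */
		if (range->flags.unmapped)
			return -EFAULT;

		if (!mmap_locked) {
			if (vram_pages)
				mmap_write_lock(mm);
			else
				mmap_read_lock(mm);
		}
		err = hmm_range_fault(hmm_range);
		if (!mmap_locked) {
			if (vram_pages)
				mmap_write_unlock(mm);
			else
				mmap_read_unlock(mm);
		}

		/* Retry on -EBUSY until the timeout expires */
		if (err != -EBUSY || time_after(jiffies, timeout))
			return err;

		hmm_range->notifier_seq =
			mmu_interval_read_begin(hmm_range->notifier);
	}
}

which would leave drm_gpusvm_range_get_pages() with just the pfn array
handling and the vram/sram mapping paths.
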
> > +	struct mmu_interval_notifier *notifier = &range->notifier-
> > >notifier;
> > +	struct hmm_range hmm_range = {
> > +		.default_flags = HMM_PFN_REQ_FAULT | (ctx->read_only
> > ? 0 :
> > +			HMM_PFN_REQ_WRITE),
> > +		.notifier = notifier,
> > +		.start = range->va.start,
> > +		.end = range->va.end,
> > +		.dev_private_owner = gpusvm-
> > >device_private_page_owner,
> > +	};
> > +	struct mm_struct *mm = gpusvm->mm;
> > +	unsigned long timeout =
> > +		jiffies +
> > msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> > +	unsigned long i, j;
> > +	unsigned long npages = npages_in_range(range->va.start,
> > range->va.end);
> > +	unsigned int order = 0;
> > +	unsigned long *pfns;
> > +	struct page **pages;
> > +	int err = 0;
> > +	bool vram_pages = !!range->flags.migrate_vram;
> > +	bool alloc_pfns = false, kfree_mapping;
> > +
> > +retry:
> > +	kfree_mapping = false;
> > +	hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> > +	if (drm_gpusvm_range_pages_valid_unlocked(gpusvm, range))
> > +		return 0;
> > +
> > +	if (range->notifier_seq == hmm_range.notifier_seq && range-
> > >pages) {
> > +		if (ctx->prefault)
> > +			return 0;
> > +
> > +		pfns = (unsigned long *)range->pages;
> > +		pages = range->pages;
> > +		goto map_pages;
> > +	}
> > +
> > +	if (!range->pages) {
> > +		pfns = kvmalloc_array(npages, sizeof(*pfns),
> > GFP_KERNEL);
> > +		if (!pfns)
> > +			return -ENOMEM;
> > +		alloc_pfns = true;
> > +	} else {
> > +		pfns = (unsigned long *)range->pages;
> > +	}
> > +
> > +	if (!ctx->mmap_locked) {
> > +		if (!mmget_not_zero(mm)) {
> > +			err = -EFAULT;
> > +			goto err_out;
> > +		}
> > +	}
> > +
> > +	hmm_range.hmm_pfns = pfns;
> > +	while (true) {
> > +		/* Must be checked after mmu_interval_read_begin */
> > +		if (range->flags.unmapped) {
> > +			err = -EFAULT;
> > +			break;
> > +		}
> > +
> > +		if (!ctx->mmap_locked) {
> > +			/*
> > +			 * XXX: HMM locking document indicates only
> > a read-lock
> > +			 * is required but there appears to be a
> > window between
> > +			 * the MMU_NOTIFY_MIGRATE event triggered in
> > a CPU fault
> > +			 * via migrate_vma_setup and the pages
> > actually moving
> > +			 * in migrate_vma_finalize in which this
> > code can grab
> > +			 * garbage pages. Grabbing the write-lock if
> > the range
> > +			 * is attached to vram appears to protect
> > against this
> > +			 * race.
> > +			 */
> > +			if (vram_pages)
> > +				mmap_write_lock(mm);
> > +			else
> > +				mmap_read_lock(mm);
> > +		}
> > +		err = hmm_range_fault(&hmm_range);
> > +		if (!ctx->mmap_locked) {
> > +			if (vram_pages)
> > +				mmap_write_unlock(mm);
> > +			else
> > +				mmap_read_unlock(mm);
> > +		}
> > +
> > +		if (err == -EBUSY) {
> > +			if (time_after(jiffies, timeout))
> > +				break;
> > +
> > +			hmm_range.notifier_seq =
> > mmu_interval_read_begin(notifier);
> > +			continue;
> > +		}
> > +		break;
> > +	}
> > +	if (!ctx->mmap_locked)
> > +		mmput(mm);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	pages = (struct page **)pfns;
> > +
> > +	if (ctx->prefault) {
> > +		range->pages = pages;
> > +		goto set_seqno;
> > +	}
> > +
> > +map_pages:
> > +	if (is_device_private_page(hmm_pfn_to_page(pfns[0]))) {
> > +		WARN_ON_ONCE(!range->vram_allocation);
> > +
> > +		for (i = 0; i < npages; ++i) {
> > +			pages[i] = hmm_pfn_to_page(pfns[i]);
> > +
> > +			if
> > (WARN_ON_ONCE(!is_device_private_page(pages[i]))) {
> > +				err = -EOPNOTSUPP;
> > +				goto err_free;
> > +			}
> > +		}
> > +
> > +		/* Do not race with notifier unmapping pages */
> > +		drm_gpusvm_notifier_lock(gpusvm);
> > +		range->flags.has_vram_pages = true;
> > +		range->pages = pages;
> > +		if (mmu_interval_read_retry(notifier,
> > hmm_range.notifier_seq)) {
> > +			err = -EAGAIN;
> > +			__drm_gpusvm_range_unmap_pages(gpusvm,
> > range);
> > +		}
> > +		drm_gpusvm_notifier_unlock(gpusvm);
> > +	} else {
> > +		dma_addr_t *dma_addr = (dma_addr_t *)pfns;
> > +
> > +		for_each_dma_page(i, j, npages, order) {
> 
> Here it looks like you're assuming that all pages are the same order?
> With THP that's definitely not the case (unless hmm somehow thinks
> they are 4K pages). This probably works because we only end up here in
> the HugeTLB case where all pages are forced to the same order.
> 

It assumes the order within a chunk (range size) is all the same. I
thought the THP page order would always be 9 (2M). THP tests
(*-large-malloc) seem to work on LNL.

This falls apart if chunks are larger than 2M, as the first 2M could be a
THP and the 2nd could not be. We discussed that you were changing the dma
addr to support mixed mappings and encode the order. That is likely correct
and would fix this limitation of only supporting one order size per chunk.

I may not get this into this rev but agree this should be fixed. Would
deferring this fix be OK with you?

FWIW I haven't seen any ROI on chunks larger than 2M, so Xe likely won't
have chunks larger than that, but I agree the design should support this.
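
A rough illustration of that direction (the struct below is made up for
discussion purposes, not the agreed-upon interface): carry the map order
next to each DMA address instead of a single range->order, so one chunk
can mix THP and 4K mappings:

struct drm_gpusvm_dma_entry {
	dma_addr_t addr;
	unsigned int order;	/* mapping size is PAGE_SIZE << order */
};

__drm_gpusvm_range_unmap_pages() would then unmap PAGE_SIZE << order bytes
per entry rather than assuming a uniform range->order.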

Matt

> > +			if (WARN_ON_ONCE(i && order !=
> > +					
> > hmm_pfn_to_map_order(pfns[i]))) {
> > +				err = -EOPNOTSUPP;
> > +				npages = i;
> > +				goto err_unmap;
> > +			}
> > +			order = hmm_pfn_to_map_order(pfns[i]);
> > +
> > +			pages[j] = hmm_pfn_to_page(pfns[i]);
> > +			if
> > (WARN_ON_ONCE(is_zone_device_page(pages[j]))) {
> > +				err = -EOPNOTSUPP;
> > +				npages = i;
> > +				goto err_unmap;
> > +			}
> > +
> > +			set_page_dirty_lock(pages[j]);
> > +			mark_page_accessed(pages[j]);
> > +
> > +			dma_addr[j] = dma_map_page(gpusvm->drm->dev,
> > +						   pages[j], 0,
> > +						   PAGE_SIZE <<
> > order,
> > +						  
> > DMA_BIDIRECTIONAL);
> > +			if (dma_mapping_error(gpusvm->drm->dev,
> > dma_addr[j])) {
> > +				err = -EFAULT;
> > +				npages = i;
> > +				goto err_unmap;
> > +			}
> > +		}
> > +
> > +		/* Huge pages, reduce memory footprint */
> > +		if (order) {
> > +			dma_addr = kmalloc_array(j,
> > sizeof(*dma_addr),
> > +						 GFP_KERNEL);
> > +			if (dma_addr) {
> > +				for (i = 0; i < j; ++i)
> > +					dma_addr[i] =
> > (dma_addr_t)pfns[i];
> > +				kvfree(pfns);
> > +				kfree_mapping = true;
> > +			} else {
> > +				dma_addr = (dma_addr_t *)pfns;
> > +			}
> > +		}
> > +
> > +		/* Do not race with notifier unmapping pages */
> > +		drm_gpusvm_notifier_lock(gpusvm);
> > +		range->order = order;
> > +		range->flags.kfree_mapping = kfree_mapping;
> > +		range->flags.has_dma_mapping = true;
> > +		range->dma_addr = dma_addr;
> > +		range->vram_allocation = NULL;
> > +		if (mmu_interval_read_retry(notifier,
> > hmm_range.notifier_seq)) {
> > +			err = -EAGAIN;
> > +			__drm_gpusvm_range_unmap_pages(gpusvm,
> > range);
> > +		}
> > +		drm_gpusvm_notifier_unlock(gpusvm);
> > +	}
> > +
> > +	if (err == -EAGAIN)
> > +		goto retry;
> > +set_seqno:
> > +	range->notifier_seq = hmm_range.notifier_seq;
> > +
> > +	return 0;
> > +
> > +err_unmap:
> > +	for_each_dma_page(i, j, npages, order)
> > +		dma_unmap_page(gpusvm->drm->dev,
> > +			       (dma_addr_t)pfns[j],
> > +			       PAGE_SIZE << order,
> > DMA_BIDIRECTIONAL);
> > +err_free:
> > +	if (alloc_pfns)
> > +		kvfree(pfns);
> > +err_out:
> > +	return err;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_unmap_pages - Unmap pages associated with a GPU
> > SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + * @ctx: GPU SVM context
> > + *
> > + * This function unmaps pages associated with a GPU SVM range. If
> > @in_notifier
> > + * is set, it is assumed that gpusvm->notifier_lock is held in write
> > mode; if it
> > + * is clear, it acquires gpusvm->notifier_lock in read mode. Must be
> > called on
> > + * each GPU SVM range attached to notifier in gpusvm->ops-
> > >invalidate for IOMMU
> > + * security model.
> > + */
> > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > +				  struct drm_gpusvm_range *range,
> > +				  const struct drm_gpusvm_ctx *ctx)
> > +{
> > +	if (ctx->in_notifier)
> > +		lockdep_assert_held_write(&gpusvm->notifier_lock);
> > +	else
> > +		drm_gpusvm_notifier_lock(gpusvm);
> > +
> > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > +
> > +	if (!ctx->in_notifier)
> > +		drm_gpusvm_notifier_unlock(gpusvm);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migration_put_page - Put a migration page
> > + * @page: Pointer to the page to put
> > + *
> > + * This function unlocks and puts a page.
> > + */
> > +static void drm_gpusvm_migration_put_page(struct page *page)
> > +{
> > +	unlock_page(page);
> > +	put_page(page);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migration_put_pages - Put migration pages
> > + * @npages: Number of pages
> > + * @migrate_pfn: Array of migrate page frame numbers
> > + *
> > + * This function puts an array of pages.
> > + */
> > +static void drm_gpusvm_migration_put_pages(unsigned long npages,
> > +					   unsigned long
> > *migrate_pfn)
> > +{
> > +	unsigned long i;
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		if (!migrate_pfn[i])
> > +			continue;
> > +
> > +		drm_gpusvm_migration_put_page(migrate_pfn_to_page(mi
> > grate_pfn[i]));
> > +		migrate_pfn[i] = 0;
> > +	}
> > +}
> > +
> > +/**
> > + * drm_gpusvm_get_vram_page - Get a reference to a VRAM page
> > + * @page: Pointer to the page
> > + * @zdd: Pointer to the GPU SVM zone device data
> > + *
> > + * This function associates the given page with the specified GPU
> > SVM zone
> > + * device data and initializes it for zone device usage.
> > + */
> > +static void drm_gpusvm_get_vram_page(struct page *page,
> > +				     struct drm_gpusvm_zdd *zdd)
> > +{
> > +	page->zone_device_data = drm_gpusvm_zdd_get(zdd);
> > +	zone_device_page_init(page);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_map_pages() - Map migration pages for GPU SVM
> > migration
> > + * @dev: The device for which the pages are being mapped
> > + * @dma_addr: Array to store DMA addresses corresponding to mapped
> > pages
> > + * @migrate_pfn: Array of migrate page frame numbers to map
> > + * @npages: Number of pages to map
> > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > + *
> > + * This function maps pages of memory for migration usage in GPU
> > SVM. It
> > + * iterates over each page frame number provided in @migrate_pfn,
> > maps the
> > + * corresponding page, and stores the DMA address in the provided
> > @dma_addr
> > + * array.
> > + *
> > + * Return: 0 on success, -EFAULT if an error occurs during mapping.
> > + */
> > +static int drm_gpusvm_migrate_map_pages(struct device *dev,
> > +					dma_addr_t *dma_addr,
> > +					long unsigned int
> > *migrate_pfn,
> > +					unsigned long npages,
> > +					enum dma_data_direction dir)
> > +{
> > +	unsigned long i;
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		struct page *page =
> > migrate_pfn_to_page(migrate_pfn[i]);
> > +
> > +		if (!page)
> > +			continue;
> > +
> > +		if (WARN_ON_ONCE(is_zone_device_page(page)))
> > +			return -EFAULT;
> > +
> > +		dma_addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE,
> > dir);
> > +		if (dma_mapping_error(dev, dma_addr[i]))
> > +			return -EFAULT;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously mapped
> > for GPU SVM migration
> > + * @dev: The device for which the pages were mapped
> > + * @dma_addr: Array of DMA addresses corresponding to mapped pages
> > + * @npages: Number of pages to unmap
> > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > + *
> > + * This function unmaps previously mapped pages of memory for GPU
> > Shared Virtual
> > + * Memory (SVM). It iterates over each DMA address provided in
> > @dma_addr, checks
> > + * if it's valid and not already unmapped, and unmaps the
> > corresponding page.
> > + */
> > +static void drm_gpusvm_migrate_unmap_pages(struct device *dev,
> > +					   dma_addr_t *dma_addr,
> > +					   unsigned long npages,
> > +					   enum dma_data_direction
> > dir)
> > +{
> > +	unsigned long i;
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		if (!dma_addr[i] || dma_mapping_error(dev,
> > dma_addr[i]))
> > +			continue;
> > +
> > +		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
> > +	}
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_to_vram - Migrate GPU SVM range to VRAM
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + * @vram_allocation: Driver-private pointer to the VRAM allocation.
> > The caller
> > + *                   should hold a reference to the VRAM allocation,
> > which
> > + *                   should be dropped via ops->vram_release or
> > upon the
> > + *                   failure of this function.
> > + * @ctx: GPU SVM context
> > + *
> > + * This function migrates the specified GPU SVM range to VRAM. It
> > performs the
> > + * necessary setup and invokes the driver-specific operations for
> > migration to
> > + * VRAM. Upon successful return, @vram_allocation can safely
> > reference @range
> > + * until ops->vram_release is called, which only happens upon
> > successful return.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       void *vram_allocation,
> > +			       const struct drm_gpusvm_ctx *ctx)
> > +{
> > +	u64 start = range->va.start, end = range->va.end;
> > +	struct migrate_vma migrate = {
> > +		.start		= start,
> > +		.end		= end,
> > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > +		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
> > +	};
> > +	struct mm_struct *mm = gpusvm->mm;
> > +	unsigned long i, npages = npages_in_range(start, end);
> > +	struct vm_area_struct *vas;
> > +	struct drm_gpusvm_zdd *zdd = NULL;
> > +	struct page **pages;
> > +	dma_addr_t *dma_addr;
> > +	void *buf;
> > +	int err;
> > +
> > +	if (!range->flags.migrate_vram)
> > +		return -EINVAL;
> > +
> > +	if (!gpusvm->ops->populate_vram_pfn || !gpusvm->ops-
> > >copy_to_vram ||
> > +	    !gpusvm->ops->copy_to_sram)
> > +		return -EOPNOTSUPP;
> > +
> > +	if (!ctx->mmap_locked) {
> > +		if (!mmget_not_zero(mm)) {
> > +			err = -EFAULT;
> > +			goto err_out;
> > +		}
> > +		mmap_write_lock(mm);
> > +	}
> > +
> > +	mmap_assert_locked(mm);
> > +
> > +	vas = vma_lookup(mm, start);
> > +	if (!vas) {
> > +		err = -ENOENT;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	if (end > vas->vm_end || start < vas->vm_start) {
> > +		err = -EINVAL;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	if (!vma_is_anonymous(vas)) {
> > +		err = -EBUSY;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) +
> > sizeof(*dma_addr) +
> > +		       sizeof(*pages), GFP_KERNEL);
> > +	if (!buf) {
> > +		err = -ENOMEM;
> > +		goto err_mmunlock;
> > +	}
> > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr))
> > * npages;
> > +
> > +	zdd = drm_gpusvm_zdd_alloc(range);
> > +	if (!zdd) {
> > +		err = -ENOMEM;
> > +		goto err_free;
> > +	}
> > +
> > +	migrate.vma = vas;
> > +	migrate.src = buf;
> > +	migrate.dst = migrate.src + npages;
> > +
> > +	err = migrate_vma_setup(&migrate);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	/*
> > +	 * FIXME: Below cases, !migrate.cpages and migrate.cpages !=
> > npages, not
> > +	 * always an error. Need to revisit possible cases and how
> > to handle. We
> > +	 * could prefault on migrate.cpages != npages via
> > hmm_range_fault.
> > +	 */
> > +
> > +	if (!migrate.cpages) {
> > +		err = -EFAULT;
> > +		goto err_free;
> > +	}
> > +
> > +	if (migrate.cpages != npages) {
> > +		err = -EBUSY;
> > +		goto err_finalize;
> > +	}
> > +
> > +	err = gpusvm->ops->populate_vram_pfn(gpusvm,
> > vram_allocation, npages,
> > +					     migrate.dst);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > dma_addr,
> > +					   migrate.src, npages,
> > DMA_TO_DEVICE);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		struct page *page = pfn_to_page(migrate.dst[i]);
> > +
> > +		pages[i] = page;
> > +		migrate.dst[i] = migrate_pfn(migrate.dst[i]);
> > +		drm_gpusvm_get_vram_page(page, zdd);
> > +	}
> > +
> > +	err = gpusvm->ops->copy_to_vram(gpusvm, pages, dma_addr,
> > npages);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	/* Upon success bind vram allocation to range and zdd */
> > +	range->vram_allocation = vram_allocation;
> > +	WRITE_ONCE(zdd->vram_allocation, vram_allocation);	/*
> > Owns ref */
> > +
> > +err_finalize:
> > +	if (err)
> > +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> > +	migrate_vma_pages(&migrate);
> > +	migrate_vma_finalize(&migrate);
> > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr,
> > npages,
> > +				       DMA_TO_DEVICE);
> > +err_free:
> > +	if (zdd)
> > +		drm_gpusvm_zdd_put(zdd);
> > +	kvfree(buf);
> > +err_mmunlock:
> > +	if (!ctx->mmap_locked) {
> > +		mmap_write_unlock(mm);
> > +		mmput(mm);
> > +	}
> > +err_out:
> > +	return err;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_populate_sram_pfn - Populate SRAM PFNs for a
> > VM area
> > + * @vas: Pointer to the VM area structure, can be NULL
> > + * @npages: Number of pages to populate
> > + * @src_mpfn: Source array of migrate PFNs
> > + * @mpfn: Array of migrate PFNs to populate
> > + * @addr: Start address for PFN allocation
> > + *
> > + * This function populates the SRAM migrate page frame numbers
> > (PFNs) for the
> > + * specified VM area structure. It allocates and locks pages in the
> > VM area for
> > + * SRAM usage. If vas is non-NULL use alloc_page_vma for allocation,
> > if NULL use
> > + * alloc_page for allocation.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +static int drm_gpusvm_migrate_populate_sram_pfn(struct
> > vm_area_struct *vas,
> > +						unsigned long
> > npages,
> > +						unsigned long
> > *src_mpfn,
> > +						unsigned long *mpfn,
> > u64 addr)
> > +{
> > +	unsigned long i;
> > +
> > +	for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
> > +		struct page *page;
> > +
> > +		if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
> > +			continue;
> > +
> > +		if (vas)
> > +			page = alloc_page_vma(GFP_HIGHUSER, vas,
> > addr);
> > +		else
> > +			page = alloc_page(GFP_HIGHUSER);
> > +
> > +		if (!page)
> > +			return -ENOMEM;
> > +
> > +		lock_page(page);
> > +		mpfn[i] = migrate_pfn(page_to_pfn(page));
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_evict_to_sram - Evict GPU SVM range to SRAM
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * Similar to __drm_gpusvm_migrate_to_sram but does not require mmap
> > lock and
> > + * migration is done via migrate_device_* functions. Fallback path as
> > it is
> > + * preferred to issue migrations with mmap lock.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +static int drm_gpusvm_evict_to_sram(struct drm_gpusvm *gpusvm,
> > +				    struct drm_gpusvm_range *range)
> > +{
> > +	unsigned long npages;
> > +	struct page **pages;
> > +	unsigned long *src, *dst;
> > +	dma_addr_t *dma_addr;
> > +	void *buf;
> > +	int i, err = 0;
> > +
> > +	npages = npages_in_range(range->va.start, range->va.end);
> > +
> > +	buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr)
> > +
> > +		       sizeof(*pages), GFP_KERNEL);
> > +	if (!buf) {
> > +		err = -ENOMEM;
> > +		goto err_out;
> > +	}
> > +	src = buf;
> > +	dst = buf + (sizeof(*src) * npages);
> > +	dma_addr = buf + (2 * sizeof(*src) * npages);
> > +	pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) *
> > npages;
> > +
> > +	err = gpusvm->ops->populate_vram_pfn(gpusvm, range-
> > >vram_allocation,
> > +					     npages, src);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	err = migrate_device_vma_range(gpusvm->mm,
> > +				       gpusvm-
> > >device_private_page_owner, src,
> > +				       npages, range->va.start);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	err = drm_gpusvm_migrate_populate_sram_pfn(NULL, npages,
> > src, dst, 0);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > dma_addr,
> > +					   dst, npages,
> > DMA_BIDIRECTIONAL);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	for (i = 0; i < npages; ++i)
> > +		pages[i] = migrate_pfn_to_page(src[i]);
> > +
> > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr,
> > npages);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +err_finalize:
> > +	if (err)
> > +		drm_gpusvm_migration_put_pages(npages, dst);
> > +	migrate_device_pages(src, dst, npages);
> > +	migrate_device_finalize(src, dst, npages);
> > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr,
> > npages,
> > +				       DMA_BIDIRECTIONAL);
> > +err_free:
> > +	kvfree(buf);
> > +err_out:
> > +
> > +	return err;
> > +}
> > +
> > +/**
> > + * __drm_gpusvm_migrate_to_sram - Migrate GPU SVM range to SRAM
> > (internal)
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @vas: Pointer to the VM area structure
> > + * @page: Pointer to the page for fault handling (can be NULL)
> > + * @start: Start address of the migration range
> > + * @end: End address of the migration range
> > + *
> > + * This internal function performs the migration of the specified
> > GPU SVM range
> > + * to SRAM. It sets up the migration, populates + dma maps SRAM
> > PFNs, and
> > + * invokes the driver-specific operations for migration to SRAM.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +static int __drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > +					struct vm_area_struct *vas,
> > +					struct page *page,
> > +					u64 start, u64 end)
> > +{
> > +	struct migrate_vma migrate = {
> > +		.vma		= vas,
> > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > +		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
> > +		.fault_page	= page,
> > +	};
> > +	unsigned long npages;
> > +	struct page **pages;
> > +	dma_addr_t *dma_addr;
> > +	void *buf;
> > +	int i, err = 0;
> > +
> > +	mmap_assert_locked(gpusvm->mm);
> > +
> > +	/* Corner where VMA area struct has been partially unmapped
> > */
> > +	if (start < vas->vm_start)
> > +		start = vas->vm_start;
> > +	if (end > vas->vm_end)
> > +		end = vas->vm_end;
> > +
> > +	migrate.start = start;
> > +	migrate.end = end;
> > +	npages = npages_in_range(start, end);
> > +
> > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) +
> > sizeof(*dma_addr) +
> > +		       sizeof(*pages), GFP_KERNEL);
> > +	if (!buf) {
> > +		err = -ENOMEM;
> > +		goto err_out;
> > +	}
> > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr))
> > * npages;
> > +
> > +	migrate.vma = vas;
> > +	migrate.src = buf;
> > +	migrate.dst = migrate.src + npages;
> > +
> > +	err = migrate_vma_setup(&migrate);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	/* Raced with another CPU fault, nothing to do */
> > +	if (!migrate.cpages)
> > +		goto err_free;
> > +
> > +	err = drm_gpusvm_migrate_populate_sram_pfn(vas, npages,
> > +						   migrate.src,
> > migrate.dst,
> > +						   start);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > dma_addr,
> > +					   migrate.dst, npages,
> > +					   DMA_BIDIRECTIONAL);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	for (i = 0; i < npages; ++i)
> > +		pages[i] = migrate_pfn_to_page(migrate.src[i]);
> > +
> > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr,
> > npages);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +err_finalize:
> > +	if (err)
> > +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> > +	migrate_vma_pages(&migrate);
> > +	migrate_vma_finalize(&migrate);
> > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr,
> > npages,
> > +				       DMA_BIDIRECTIONAL);
> > +err_free:
> > +	kvfree(buf);
> > +err_out:
> > +	mmap_assert_locked(gpusvm->mm);
> > +
> > +	return err;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_to_sram - Migrate (evict) GPU SVM range to
> > SRAM
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + * @ctx: GPU SVM context
> > + *
> > + * This function initiates the migration of the specified GPU SVM
> > range to
> > + * SRAM. It performs necessary checks and invokes the internal
> > migration
> > + * function for actual migration.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       const struct drm_gpusvm_ctx *ctx)
> > +{
> > +	u64 start = range->va.start, end = range->va.end;
> > +	struct mm_struct *mm = gpusvm->mm;
> > +	struct vm_area_struct *vas;
> > +	int err;
> > +	bool retry = false;
> > +
> > +	if (!ctx->mmap_locked) {
> > +		if (!mmget_not_zero(mm)) {
> > +			err = -EFAULT;
> > +			goto err_out;
> > +		}
> > +		if (ctx->trylock_mmap) {
> > +			if (!mmap_read_trylock(mm))  {
> > +				err =
> > drm_gpusvm_evict_to_sram(gpusvm, range);
> > +				goto err_mmput;
> > +			}
> > +		} else {
> > +			mmap_read_lock(mm);
> > +		}
> > +	}
> > +
> > +	mmap_assert_locked(mm);
> > +
> > +	/*
> > +	 * Loop required to find all VMA area structs for the corner
> > case when
> > +	 * VRAM backing has been partially unmapped from MM's
> > address space.
> > +	 */
> > +again:
> > +	vas = find_vma(mm, start);
> > +	if (!vas) {
> > +		if (!retry)
> > +			err = -ENOENT;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	if (end <= vas->vm_start || start >= vas->vm_end) {
> > +		if (!retry)
> > +			err = -EINVAL;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	err = __drm_gpusvm_migrate_to_sram(gpusvm, vas, NULL, start,
> > end);
> > +	if (err)
> > +		goto err_mmunlock;
> > +
> > +	if (vas->vm_end < end) {
> > +		retry = true;
> > +		start = vas->vm_end;
> > +		goto again;
> > +	}
> > +
> > +	if (!ctx->mmap_locked) {
> > +		mmap_read_unlock(mm);
> > +		/*
> > +		 * Using mmput_async as this function can be called
> > while
> > +		 * holding a dma-resv lock, and a final put can grab
> > the mmap
> > +		 * lock, causing a lock inversion.
> > +		 */
> > +		mmput_async(mm);
> > +	}
> > +
> > +	return 0;
> > +
> > +err_mmunlock:
> > +	if (!ctx->mmap_locked)
> > +		mmap_read_unlock(mm);
> > +err_mmput:
> > +	if (!ctx->mmap_locked)
> > +		mmput_async(mm);
> > +err_out:
> > +	return err;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_page_free - Put GPU SVM zone device data associated
> > with a page
> > + * @page: Pointer to the page
> > + *
> > + * This function is a callback used to put the GPU SVM zone device
> > data
> > + * associated with a page when it is being released.
> > + */
> > +static void drm_gpusvm_page_free(struct page *page)
> > +{
> > +	drm_gpusvm_zdd_put(page->zone_device_data);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_to_ram - Migrate GPU SVM range to RAM (page
> > fault handler)
> > + * @vmf: Pointer to the fault information structure
> > + *
> > + * This function is a page fault handler used to migrate a GPU SVM
> > range to RAM.
> > + * It retrieves the GPU SVM range information from the faulting page
> > and invokes
> > + * the internal migration function to migrate the range back to RAM.
> > + *
> > + * Returns:
> > + * VM_FAULT_SIGBUS on failure, 0 on success.
> > + */
> > +static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault *vmf)
> > +{
> > +	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
> > +	int err;
> > +
> > +	err = __drm_gpusvm_migrate_to_sram(zdd->range->gpusvm,
> > +					   vmf->vma, vmf->page,
> > +					   zdd->range->va.start,
> > +					   zdd->range->va.end);
> > +
> > +	return err ? VM_FAULT_SIGBUS : 0;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_pagemap_ops - Device page map operations for GPU SVM
> > + */
> > +static const struct dev_pagemap_ops drm_gpusvm_pagemap_ops = {
> > +	.page_free = drm_gpusvm_page_free,
> > +	.migrate_to_ram = drm_gpusvm_migrate_to_ram,
> > +};
> > +
> > +/**
> > + * drm_gpusvm_pagemap_ops_get - Retrieve GPU SVM device page map
> > operations
> > + *
> > + * Returns:
> > + * Pointer to the GPU SVM device page map operations structure.
> > + */
> > +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void)
> > +{
> > +	return &drm_gpusvm_pagemap_ops;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_has_mapping - Check if GPU SVM has mapping for the
> > given address range
> > + * @gpusvm: Pointer to the GPU SVM structure.
> > + * @start: Start address
> > + * @end: End address
> > + *
> > + * Returns:
> > + * True if GPU SVM has mapping, False otherwise
> > + */
> > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start,
> > u64 end)
> > +{
> > +	struct drm_gpusvm_notifier *notifier;
> > +
> > +	drm_gpusvm_for_each_notifier(notifier, gpusvm, start, end) {
> > +		struct drm_gpusvm_range *range = NULL;
> > +
> > +		drm_gpusvm_for_each_range(range, notifier, start,
> > end)
> > +			return true;
> > +	}
> > +
> > +	return false;
> > +}
> > diff --git a/drivers/gpu/drm/xe/drm_gpusvm.h
> > b/drivers/gpu/drm/xe/drm_gpusvm.h
> > new file mode 100644
> > index 000000000000..0ea70f8534a8
> > --- /dev/null
> > +++ b/drivers/gpu/drm/xe/drm_gpusvm.h
> > @@ -0,0 +1,415 @@
> > +/* SPDX-License-Identifier: MIT */
> > +/*
> > + * Copyright © 2024 Intel Corporation
> > + */
> > +
> > +#ifndef __DRM_GPUSVM_H__
> > +#define __DRM_GPUSVM_H__
> > +
> > +#include <linux/kref.h>
> > +#include <linux/mmu_notifier.h>
> > +#include <linux/workqueue.h>
> > +
> > +struct dev_pagemap_ops;
> > +struct drm_device;
> > +struct drm_gpusvm;
> > +struct drm_gpusvm_notifier;
> > +struct drm_gpusvm_ops;
> > +struct drm_gpusvm_range;
> > +
> > +/**
> > + * struct drm_gpusvm_ops - Operations structure for GPU SVM
> > + *
> > + * This structure defines the operations for GPU Shared Virtual
> > Memory (SVM).
> > + * These operations are provided by the GPU driver to manage SVM
> > ranges and
> > + * perform operations such as migration between VRAM and system RAM.
> > + */
> > +struct drm_gpusvm_ops {
> > +	/**
> > +	 * @notifier_alloc: Allocate a GPU SVM notifier (optional)
> > +	 *
> > +	 * This function shall allocate a GPU SVM notifier.
> > +	 *
> > +	 * Returns:
> > +	 * Pointer to the allocated GPU SVM notifier on success,
> > NULL on failure.
> > +	 */
> > +	struct drm_gpusvm_notifier *(*notifier_alloc)(void);
> > +
> > +	/**
> > +	 * @notifier_free: Free a GPU SVM notifier (optional)
> > +	 * @notifier: Pointer to the GPU SVM notifier to be freed
> > +	 *
> > +	 * This function shall free a GPU SVM notifier.
> > +	 */
> > +	void (*notifier_free)(struct drm_gpusvm_notifier *notifier);
> > +
> > +	/**
> > +	 * @range_alloc: Allocate a GPU SVM range (optional)
> > +	 * @gpusvm: Pointer to the GPU SVM
> > +	 *
> > +	 * This function shall allocate a GPU SVM range.
> > +	 *
> > +	 * Returns:
> > +	 * Pointer to the allocated GPU SVM range on success, NULL
> > on failure.
> > +	 */
> > +	struct drm_gpusvm_range *(*range_alloc)(struct drm_gpusvm
> > *gpusvm);
> > +
> > +	/**
> > +	 * @range_free: Free a GPU SVM range (optional)
> > +	 * @range: Pointer to the GPU SVM range to be freed
> > +	 *
> > +	 * This function shall free a GPU SVM range.
> > +	 */
> > +	void (*range_free)(struct drm_gpusvm_range *range);
> > +
> > +	/**
> > +	 * @vram_release: Release VRAM allocation (optional)
> > +	 * @vram_allocation: Driver-private pointer to the VRAM
> > allocation
> > +	 *
> > +	 * This function shall release VRAM allocation and expects
> > to drop a
> > +	 * reference to VRAM allocation.
> > +	 */
> > +	void (*vram_release)(void *vram_allocation);
> > +
> > +	/**
> > +	 * @populate_vram_pfn: Populate VRAM PFN (required for
> > migration)
> > +	 * @gpusvm: Pointer to the GPU SVM
> > +	 * @vram_allocation: Driver-private pointer to the VRAM
> > allocation
> > +	 * @npages: Number of pages to populate
> > +	 * @pfn: Array of page frame numbers to populate
> > +	 *
> > +	 * This function shall populate VRAM page frame numbers
> > (PFN).
> > +	 *
> > +	 * Returns:
> > +	 * 0 on success, a negative error code on failure.
> > +	 */
> > +	int (*populate_vram_pfn)(struct drm_gpusvm *gpusvm,
> > +				 void *vram_allocation,
> > +				 unsigned long npages,
> > +				 unsigned long *pfn);
> > +
> > +	/**
> > +	 * @copy_to_vram: Copy to VRAM (required for migration)
> > +	 * @gpusvm: Pointer to the GPU SVM
> > +	 * @pages: Pointer to array of VRAM pages (destination)
> > +	 * @dma_addr: Pointer to array of DMA addresses (source)
> > +	 * @npages: Number of pages to copy
> > +	 *
> > +	 * This function shall copy pages to VRAM.
> > +	 *
> > +	 * Returns:
> > +	 * 0 on success, a negative error code on failure.
> > +	 */
> > +	int (*copy_to_vram)(struct drm_gpusvm *gpusvm,
> > +			    struct page **pages,
> > +			    dma_addr_t *dma_addr,
> > +			    unsigned long npages);
> > +
> > +	/**
> > +	 * @copy_to_sram: Copy to system RAM (required for
> > migration)
> > +	 * @gpusvm: Pointer to the GPU SVM
> > +	 * @pages: Pointer to array of VRAM pages (source)
> > +	 * @dma_addr: Pointer to array of DMA addresses
> > (destination)
> > +	 * @npages: Number of pages to copy
> > +	 *
> > +	 * This function shall copy pages to system RAM.
> > +	 *
> > +	 * Returns:
> > +	 * 0 on success, a negative error code on failure.
> > +	 */
> > +	int (*copy_to_sram)(struct drm_gpusvm *gpusvm,
> > +			    struct page **pages,
> > +			    dma_addr_t *dma_addr,
> > +			    unsigned long npages);
> > +
> > +	/**
> > +	 * @invalidate: Invalidate GPU SVM notifier (required)
> > +	 * @gpusvm: Pointer to the GPU SVM
> > +	 * @notifier: Pointer to the GPU SVM notifier
> > +	 * @mmu_range: Pointer to the mmu_notifier_range structure
> > +	 *
> > +	 * This function shall invalidate the GPU page tables. It
> > can safely
> > +	 * walk the notifier range RB tree/list in this function.
> > Called while
> > +	 * holding the notifier lock.
> > +	 */
> > +	void (*invalidate)(struct drm_gpusvm *gpusvm,
> > +			   struct drm_gpusvm_notifier *notifier,
> > +			   const struct mmu_notifier_range
> > *mmu_range);
> > +};
> > +
> > +/**
> > + * struct drm_gpusvm_notifier - Structure representing a GPU SVM
> > notifier
> > + *
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: MMU interval notifier
> > + * @interval: Interval for the notifier
> > + * @rb: Red-black tree node for the parent GPU SVM structure
> > notifier tree
> > + * @root: Cached root node of the RB tree containing ranges
> > + * @range_list: List head of ranges in the same order
> > they appear in
> > + *              interval tree. This is useful to keep iterating
> > ranges while
> > + *              doing modifications to RB tree.
> > + * @flags.removed: Flag indicating whether the MMU interval notifier
> > has been
> > + *                 removed
> > + *
> > + * This structure represents a GPU SVM notifier.
> > + */
> > +struct drm_gpusvm_notifier {
> > +	struct drm_gpusvm *gpusvm;
> > +	struct mmu_interval_notifier notifier;
> > +	struct {
> > +		u64 start;
> > +		u64 end;
> > +	} interval;
> > +	struct {
> > +		struct rb_node node;
> > +		struct list_head entry;
> > +		u64 __subtree_last;
> > +	} rb;
> > +	struct rb_root_cached root;
> > +	struct list_head range_list;
> > +	struct {
> > +		u32 removed : 1;
> > +	} flags;
> > +};
> > +
> > +/**
> > + * struct drm_gpusvm_range - Structure representing a GPU SVM range
> > + *
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier
> > + * @refcount: Reference count for the range
> > + * @rb: Red-black tree node for the parent GPU SVM notifier
> > structure range tree
> > + * @va: Virtual address range
> > + * @notifier_seq: Notifier sequence number of the range's pages
> > + * @pages: Pointer to the array of pages (if backing store is in
> > VRAM)
> > + * @dma_addr: DMA address array (if backing store is SRAM and DMA
> > mapped)
> > + * @vram_allocation: Driver-private pointer to the VRAM allocation
> > + * @order: Order of dma mapping. i.e. PAGE_SIZE << order is mapping
> > size
> > + * @flags.migrate_vram: Flag indicating whether the range can be
> > migrated to VRAM
> > + * @flags.unmapped: Flag indicating if the range has been unmapped
> > + * @flags.partial_unmap: Flag indicating if the range has been
> > partially unmapped
> > + * @flags.has_vram_pages: Flag indicating if the range has vram
> > pages
> > + * @flags.has_dma_mapping: Flag indicating if the range has a DMA
> > mapping
> > + * @flags.kfree_mapping: Flag indicating @dma_addr is a compact
> > allocation based
> > + *                       on @order which releases via kfree
> > + *
> > + * This structure represents a GPU SVM range used for tracking
> > memory ranges
> > + * mapped in a DRM device.
> > + */
> > +struct drm_gpusvm_range {
> > +	struct drm_gpusvm *gpusvm;
> > +	struct drm_gpusvm_notifier *notifier;
> > +	struct kref refcount;
> > +	struct {
> > +		struct rb_node node;
> > +		struct list_head entry;
> > +		u64 __subtree_last;
> > +	} rb;
> > +	struct {
> > +		u64 start;
> > +		u64 end;
> > +	} va;
> > +	unsigned long notifier_seq;
> > +	union {
> > +		struct page **pages;
> > +		dma_addr_t *dma_addr;
> > +	};
> > +	void *vram_allocation;
> > +	u16 order;
> > +	struct {
> > +		/* All flags below must be set upon creation */
> > +		u16 migrate_vram : 1;
> > +		/* All flags below must be set / cleared under
> > notifier lock */
> > +		u16 unmapped : 1;
> > +		u16 partial_unmap : 1;
> > +		u16 has_vram_pages : 1;
> > +		u16 has_dma_mapping : 1;
> > +		u16 kfree_mapping : 1;
> > +	} flags;
> > +};
> > +
> > +/**
> > + * struct drm_gpusvm - GPU SVM structure
> > + *
> > + * @name: Name of the GPU SVM
> > + * @drm: Pointer to the DRM device structure
> > + * @mm: Pointer to the mm_struct for the address space
> > + * @device_private_page_owner: Device private pages owner
> > + * @mm_start: Start address of GPU SVM
> > + * @mm_range: Range of the GPU SVM
> > + * @notifier_size: Size of individual notifiers
> > + * @ops: Pointer to the operations structure for GPU SVM
> > + * @chunk_sizes: Pointer to the array of chunk sizes used in range
> > allocation.
> > + *               Entries should be powers of 2 in descending order.
> > + * @num_chunks: Number of chunks
> > + * @notifier_lock: Read-write semaphore for protecting notifier
> > operations
> > + * @zdd_wq: Workqueue for deferred work on zdd destruction
> > + * @root: Cached root node of the Red-Black tree containing GPU SVM
> > notifiers
> > + * @notifier_list: list head of notifiers in the same
> > order they
> > + *                 appear in interval tree. This is useful to keep
> > iterating
> > + *                 notifiers while doing modifications to RB tree.
> > + *
> > + * This structure represents a GPU SVM (Shared Virtual Memory) used
> > for tracking
> > + * memory ranges mapped in a DRM (Direct Rendering Manager) device.
> > + *
> > + * No reference counting is provided, as this is expected to be
> > embedded in the
> > + * driver VM structure along with the struct drm_gpuvm, which
> > handles reference
> > + * counting.
> > + */
> > +struct drm_gpusvm {
> > +	const char *name;
> > +	struct drm_device *drm;
> > +	struct mm_struct *mm;
> > +	void *device_private_page_owner;
> > +	u64 mm_start;
> > +	u64 mm_range;
> > +	u64 notifier_size;
> > +	const struct drm_gpusvm_ops *ops;
> > +	const u64 *chunk_sizes;
> > +	int num_chunks;
> > +	struct rw_semaphore notifier_lock;
> > +	struct workqueue_struct *zdd_wq;
> > +	struct rb_root_cached root;
> > +	struct list_head notifier_list;
> > +};
> > +
> > +/**
> > + * struct drm_gpusvm_ctx - DRM GPU SVM context
> > + *
> > + * @mmap_locked: mmap lock is locked
> > + * @trylock_mmap: trylock mmap lock, used to avoid locking
> > inversions
> > + *                (e.g. dma-resv -> mmap lock)
> > + * @in_notifier: entering from a MMU notifier
> > + * @read_only: operating on read-only memory
> > + * @vram_possible: possible to use VRAM
> > + * @prefault: prefault pages
> > + *
> > + * Context that DRM GPU SVM is operating in (i.e. user arguments).
> > + */
> > +struct drm_gpusvm_ctx {
> > +	u32 mmap_locked :1;
> > +	u32 trylock_mmap :1;
> > +	u32 in_notifier :1;
> > +	u32 read_only :1;
> > +	u32 vram_possible :1;
> > +	u32 prefault :1;
> > +};
> > +
> > +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> > +		    const char *name, struct drm_device *drm,
> > +		    struct mm_struct *mm, void
> > *device_private_page_owner,
> > +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> > +		    const struct drm_gpusvm_ops *ops,
> > +		    const u64 *chunk_sizes, int num_chunks);
> > +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm);
> > +void drm_gpusvm_free(struct drm_gpusvm *gpusvm);
> > +
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64
> > fault_addr,
> > +				u64 gpuva_start, u64 gpuva_end,
> > +				const struct drm_gpusvm_ctx *ctx);
> > +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> > +			     struct drm_gpusvm_range *range);
> > +
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_get(struct drm_gpusvm_range *range);
> > +void drm_gpusvm_range_put(struct drm_gpusvm_range *range);
> > +
> > +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> > +				  struct drm_gpusvm_range *range);
> > +
> > +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       const struct drm_gpusvm_ctx *ctx);
> > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > +				  struct drm_gpusvm_range *range,
> > +				  const struct drm_gpusvm_ctx *ctx);
> > +
> > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       void *vram_allocation,
> > +			       const struct drm_gpusvm_ctx *ctx);
> > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       const struct drm_gpusvm_ctx *ctx);
> > +
> > +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void);
> > +
> > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start,
> > u64 end);
> > +
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64
> > start, u64 end);
> > +
> > +/**
> > + * drm_gpusvm_notifier_lock - Lock GPU SVM notifier
> > + * @gpusvm__: Pointer to the GPU SVM structure.
> > + *
> > + * Abstract client usage GPU SVM notifier lock, take lock
> > + */
> > +#define drm_gpusvm_notifier_lock(gpusvm__)	\
> > +	down_read(&(gpusvm__)->notifier_lock)
> > +
> > +/**
> > + * drm_gpusvm_notifier_unlock - Unlock GPU SVM notifier
> > + * @gpusvm__: Pointer to the GPU SVM structure.
> > + *
> > + * Abstract client usage GPU SVM notifier lock, drop lock
> > + */
> > +#define drm_gpusvm_notifier_unlock(gpusvm__)	\
> > +	up_read(&(gpusvm__)->notifier_lock)
> > +
> > +/**
> > + * __drm_gpusvm_range_next - Get the next GPU SVM range in the list
> > + * @range: a pointer to the current GPU SVM range
> > + *
> > + * Return: A pointer to the next drm_gpusvm_range if available, or
> > NULL if the
> > + *         current range is the last one or if the input range is
> > NULL.
> > + */
> > +static inline struct drm_gpusvm_range *
> > +__drm_gpusvm_range_next(struct drm_gpusvm_range *range)
> > +{
> > +	if (range && !list_is_last(&range->rb.entry,
> > +				   &range->notifier->range_list))
> > +		return list_next_entry(range, rb.entry);
> > +
> > +	return NULL;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_for_each_range - Iterate over GPU SVM ranges in a
> > notifier
> > + * @range__: Iterator variable for the ranges. If set, it indicates
> > the start of
> > + *	     the iterator. If NULL, call drm_gpusvm_range_find() to
> > get the range.
> > + * @notifier__: Pointer to the GPU SVM notifier
> > + * @start__: Start address of the range
> > + * @end__: End address of the range
> > + *
> > + * This macro is used to iterate over GPU SVM ranges in a notifier.
> > It is safe
> > + * to use while holding the driver SVM lock or the notifier lock.
> > + */
> > +#define drm_gpusvm_for_each_range(range__, notifier__, start__,
> > end__)	\
> > +	for ((range__) = (range__)
> > ?:					\
> > +	     drm_gpusvm_range_find((notifier__), (start__),
> > (end__));	\
> > +	     (range__) && (range__->va.start <
> > (end__));		\
> > +	     (range__) = __drm_gpusvm_range_next(range__))
> > +
> > +/**
> > + * drm_gpusvm_range_set_unmapped - Mark a GPU SVM range as unmapped
> > + * @range: Pointer to the GPU SVM range structure.
> > + * @mmu_range: Pointer to the MMU notifier range structure.
> > + *
> > + * This function marks a GPU SVM range as unmapped and sets the
> > partial_unmap flag
> > + * if the range partially falls within the provided MMU notifier
> > range.
> > + */
> > +static inline void
> > +drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range *range,
> > +			      const struct mmu_notifier_range
> > *mmu_range)
> > +{
> > +	lockdep_assert_held_write(&range->gpusvm->notifier_lock);
> > +
> > +	range->flags.unmapped = true;
> > +	if (range->va.start < mmu_range->start ||
> > +	    range->va.end > mmu_range->end)
> > +		range->flags.partial_unmap = true;
> > +}
> > +
> > +#endif /* __DRM_GPUSVM_H__ */
>
Zeng, Oak Sept. 25, 2024, 4:34 p.m. UTC | #39
Hi Sima,

> -----Original Message-----
> From: Simona Vetter <simona.vetter@ffwll.ch>
> Sent: Tuesday, September 24, 2024 5:25 AM
> To: Zeng, Oak <oak.zeng@intel.com>
> Cc: Brost, Matthew <matthew.brost@intel.com>; intel-
> xe@lists.freedesktop.org; dri-devel@lists.freedesktop.org;
> thomas.hellstrom@linux.intel.com; Auld, Matthew
> <matthew.auld@intel.com>; daniel@ffwll.ch; airlied@gmail.com;
> christian.koenig@amd.com
> Subject: Re: [RFC PATCH 05/28] drm/gpusvm: Add support for GPU
> Shared Virtual Memory
> 
> On Fri, Sep 06, 2024 at 06:41:18PM +0000, Zeng, Oak wrote:
> > There are fundamental design conflicts with what we have aligned,
> see inline.
> >
> > > -----Original Message-----
> > > From: Intel-xe <intel-xe-bounces@lists.freedesktop.org> On
> Behalf
> > > Of Matthew Brost
> > > Sent: Tuesday, August 27, 2024 10:49 PM
> > > To: intel-xe@lists.freedesktop.org; dri-
> devel@lists.freedesktop.org
> > > Cc: airlied@gmail.com; christian.koenig@amd.com;
> > > thomas.hellstrom@linux.intel.com; Auld, Matthew
> > > <matthew.auld@intel.com>; daniel@ffwll.ch
> > > Subject: [RFC PATCH 05/28] drm/gpusvm: Add support for GPU
> > > Shared Virtual Memory
> > >
> > > This patch introduces support for GPU Shared Virtual Memory
> (SVM)
> > > in the
> > > Direct Rendering Manager (DRM) subsystem. SVM allows for
> > > seamless
> > > sharing of memory between the CPU and GPU, enhancing
> > > performance and
> > > flexibility in GPU computing tasks.
> > >
> > > The patch adds the necessary infrastructure for SVM, including
> data
> > > structures and functions for managing SVM ranges and notifiers. It
> > > also
> > > provides mechanisms for allocating, deallocating, and migrating
> > > memory
> > > regions between system RAM and GPU VRAM.
> > >
> > > This mid-layer is largely inspired by GPUVM.
> > >
> > > Cc: Dave Airlie <airlied@redhat.com>
> > > Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > > Cc: Christian König <christian.koenig@amd.com>
> > > Cc: <dri-devel@lists.freedesktop.org>
> > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > ---
> > >  drivers/gpu/drm/xe/Makefile     |    3 +-
> > >  drivers/gpu/drm/xe/drm_gpusvm.c | 2174
> > > +++++++++++++++++++++++++++++++
> > >  drivers/gpu/drm/xe/drm_gpusvm.h |  415 ++++++
> > >  3 files changed, 2591 insertions(+), 1 deletion(-)
> > >  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.c
> > >  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.h
> > >
> > > diff --git a/drivers/gpu/drm/xe/Makefile
> > > b/drivers/gpu/drm/xe/Makefile
> > > index b9670ae09a9e..b8fc2ee58f1a 100644
> > > --- a/drivers/gpu/drm/xe/Makefile
> > > +++ b/drivers/gpu/drm/xe/Makefile
> > > @@ -25,7 +25,8 @@ $(obj)/generated/%_wa_oob.c
> > > $(obj)/generated/%_wa_oob.h: $(obj)/xe_gen_wa_oob \
> > >
> > >  # core driver code
> > >
> > > -xe-y += xe_bb.o \
> > > +xe-y += drm_gpusvm.o \
> > > +	xe_bb.o \
> > >  	xe_bo.o \
> > >  	xe_bo_evict.o \
> > >  	xe_devcoredump.o \
> > > diff --git a/drivers/gpu/drm/xe/drm_gpusvm.c
> > > b/drivers/gpu/drm/xe/drm_gpusvm.c
> > > new file mode 100644
> > > index 000000000000..fc1e44e6ae72
> > > --- /dev/null
> > > +++ b/drivers/gpu/drm/xe/drm_gpusvm.c
> > > @@ -0,0 +1,2174 @@
> > > +// SPDX-License-Identifier: MIT
> > > +/*
> > > + * Copyright © 2024 Intel Corporation
> > > + *
> > > + * Authors:
> > > + *     Matthew Brost <matthew.brost@intel.com>
> > > + */
> > > +
> > > +#include <linux/dma-mapping.h>
> > > +#include <linux/interval_tree_generic.h>
> > > +#include <linux/hmm.h>
> > > +#include <linux/memremap.h>
> > > +#include <linux/migrate.h>
> > > +#include <linux/mm_types.h>
> > > +#include <linux/pagemap.h>
> > > +#include <linux/slab.h>
> > > +
> > > +#include <drm/drm_device.h>
> > > +#include "drm_gpusvm.h"
> > > +
> > > +/**
> > > + * DOC: Overview
> > > + *
> > > + * GPU Shared Virtual Memory (GPU SVM) layer for the Direct
> > > Rendering Manager (DRM)
> > > + *
> > > + * The GPU SVM layer is a component of the DRM framework
> > > designed to manage shared
> > > + * virtual memory between the CPU and GPU. It enables
> efficient
> > > data exchange and
> > > + * processing for GPU-accelerated applications by allowing
> memory
> > > sharing and
> > > + * synchronization between the CPU's and GPU's virtual address
> > > spaces.
> > > + *
> > > + * Key GPU SVM Components:
> > > + * - Notifiers: Used for tracking memory intervals and
> > > notifying the
> > > + *		GPU of changes, notifiers are sized based on a GPU
> > > SVM
> > > + *		initialization parameter, with a recommendation of
> > > 512M or
> > > + *		larger. They maintain a Red-Black tree and a list of
> > > ranges that
> > > + *		fall within the notifier interval. Notifiers are tracked
> > > within
> > > + *		a GPU SVM Red-Black tree and list and are
> > > dynamically inserted
> > > + *		or removed as ranges within the interval are created
> > > or
> > > + *		destroyed.
> > > + * - Ranges: Represent memory ranges mapped in a DRM device
> and
> > > managed
> > > + *	     by GPU SVM.
> >
> >
> > This svm_range concept has introduced a lot of code duplication in
> > xekmd, indicating that this is a wrong design. I think one of the
> > design principles is to reuse, not to duplicate.
> >
> > Look at patch 9, 11: a bunch of duplicated code for page table update,
> > invalidate, and the page fault handler.
> >
> > I had this range concept in v1 [1], but after we agreed to unify svm
> > and userptr code during review, I dropped this concept, and the xe_svm
> > concept, which ends up with much less duplicated code in v2 [2]. I
> > will say more below on why I thought the svm concept can also be
> > removed.
> >
> > Conceptually a vma represents a range. Why duplicate?
> 
> Because we cannot rely on mmap_read/write locks or vma_read/write locks
> without causing headaches. They are core mm data structures that the gpu
> driver does not own, so for better or worse we have to do a bit of
> duplication.

Seems there is a misunderstanding here. By vma I meant a data structure
in the driver representing a range, such as xe_vma, not the core mm vma
(struct vm_area_struct). Sorry, I should have been more clear.

The point I tried to make was that the svm_range concept is pretty much a
duplication of the xe_vma concept. If you look at the definitions of those
two data structures, they are very similar. This further ends up with code
duplication in the page table update code:

- xe_pt_zap_ptes_range duplicates xe_pt_zap_ptes
- xe_svm_invalidate duplicates xe_userptr_invalidate
- xe_vm_range_rebind/unbind and many other functions are all duplicated
- the rb-tree in drm_gpusvm duplicates the rb-tree in drm_gpuvm


> 
> Duplication for no reason is bad, but trying to avoid necessary
> duplication that's inherent to the design challenge we face is much
> worse.

I agree some duplication is necessary. But let's discuss whether we can
avoid the duplication in this design, and whether that is reasonable.

In some PoC code, I was able to avoid the duplication without breaking the
design. The idea is to unify the userptr code and the svm code:

Today userptr (xe_userptr_vma in xekmd) is a sub-class of xe_vma, which is
a sub-class of drm_gpuva. We just move the userptr concept up to the drm
layer and rename it to hmmptr.

This way we can reuse most of the xekmd userptr code, and reuse the
drm_gpuvm rb-tree.

Also there is no need for the drm_gpusvm concept, similar to the core mm
design: there is only mm_struct but there isn't any shared mm concept.

To mark a drm_gpuvm as participating in svm, we only need to introduce an
*mm member pointing to the core mm_struct that this gpuvm participates in.
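
A rough sketch of that layering, purely for illustration (the hmmptr field
and struct names below are made up, not taken from the PoC branch or from
this series; only the additions are shown):

/* Additions only; the existing drm_gpuvm fields stay as they are. */
struct drm_gpuvm {
	/* ... existing fields, including the GPUVA rb-tree ... */
	struct mm_struct *mm;	/* non-NULL when this GPU VM mirrors a CPU mm */
};

struct drm_hmmptr {		/* roughly today's xekmd userptr, lifted into DRM */
	struct mmu_interval_notifier notifier;
	unsigned long notifier_seq;
	dma_addr_t *dma_addr;	/* CPU pages collected via hmm_range_fault() */
};

struct xe_vma {
	struct drm_gpuva gpuva;		/* xe_vma already sub-classes drm_gpuva */
	struct drm_hmmptr hmmptr;	/* used when the VMA mirrors CPU memory */
};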

As said, those ideas didn't originate from me. In my v1 it was also
xe_svm/svm_range; I learned those ideas during review. Even today, I still
think this is a reasonable design. The key reason is that the svm design is
nothing more than userptr with the capability of migrating to vram. We
already have a working userptr built on top of drm_gpuvm, drm_gpuva and
xe_vma, so why should we re-invent all those concepts?

More details: https://gitlab.freedesktop.org/oak/xe-kernel-driver-svm/-/commits/drm-tip-svm-drm-generic-page-centric-Sep05

Btw, the above code not only supports page-centric migration to sram (such
as partially migrating a range to sram, worst case one page), it does the
same thing for migration to vram. The key idea is that it introduces a
migration vector concept: the migration vector collects all the migratable
pages (through migrate_vma_setup) and aggregates those pages into one
migration job, regardless of whether that is all pages in a range or only a
subset of them. The lock design there is also "coincidentally" aligned with
what you outlined in your previous email, see the "Doc: lock design" section
of https://gitlab.freedesktop.org/oak/xe-kernel-driver-svm/-/commit/10d1576533f549b0d521dfa997b7087d1926e6ed
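
For reference, a rough sketch of that migration-vector flow (it only uses
the core migrate_vma API; the drv_* names are hypothetical, the real code
is in the branch above):

#include <linux/migrate.h>
#include <linux/mm.h>

/* Collect whatever migrate_vma_setup() could isolate and migrate just that. */
static int drv_migrate_collected(struct vm_area_struct *vas,
				 unsigned long start, unsigned long end,
				 void *pgmap_owner)
{
	unsigned long npages = (end - start) >> PAGE_SHIFT;
	struct migrate_vma migrate = {
		.vma		= vas,
		.start		= start,
		.end		= end,
		.pgmap_owner	= pgmap_owner,
		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
	};
	unsigned long *buf;
	int err;

	/* Caller holds the mmap lock, as required by migrate_vma_setup(). */
	buf = kvcalloc(2 * npages, sizeof(*buf), GFP_KERNEL);
	if (!buf)
		return -ENOMEM;
	migrate.src = buf;
	migrate.dst = buf + npages;

	err = migrate_vma_setup(&migrate);
	if (err)
		goto out_free;

	/*
	 * Only migrate.cpages entries were isolated; allocate device pages
	 * for exactly those, fill migrate.dst, and build the copy job from
	 * them instead of requiring cpages == npages.
	 */
	drv_copy_migration_vector_to_vram(&migrate);	/* hypothetical helper */

	migrate_vma_pages(&migrate);
	migrate_vma_finalize(&migrate);

out_free:
	kvfree(buf);
	return err;
}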


Oak

> 
> 
> > [1] https://patchwork.freedesktop.org/patch/574898/?series=128910&rev=1
> > [2] https://patchwork.freedesktop.org/series/132229/
> >
> >
> > They are sized based on an array of chunk
> > > sizes, which
> > > + *	     is a GPU SVM initialization parameter, and the CPU address
> > > space.
> > > + *	     Upon GPU fault, the largest aligned chunk that fits within
> > > the
> > > + *	     faulting CPU address space is chosen for the range size.
> > > Ranges are
> > > + *	     expected to be dynamically allocated on GPU fault and
> > > removed on an
> > > + *	     MMU notifier UNMAP event. As mentioned above, ranges
> > > are tracked in
> > > + *	     a notifier's Red-Black tree.
> > > + * - Operations: Define the interface for driver-specific SVM
> > > operations such as
> > > + *		 allocation, page collection, migration, invalidations,
> > > and VRAM
> > > + *		 release.
> > > + *
> > > + * This layer provides interfaces for allocating, mapping, migrating,
> > > and
> > > + * releasing memory ranges between the CPU and GPU. It
> handles
> > > all core memory
> > > + * management interactions (DMA mapping, HMM, and
> migration)
> > > and provides
> > > + * driver-specific virtual functions (vfuncs). This infrastructure is
> > > sufficient
> > > + * to build the expected driver components for an SVM
> > > implementation as detailed
> > > + * below.
> > > + *
> > > + * Expected Driver Components:
> > > + * - GPU page fault handler: Used to create ranges and notifiers
> > > based on the
> > > + *			     fault address, optionally migrate the range
> > > to
> > > + *			     VRAM, and create GPU bindings.
> > > + * - Garbage collector: Used to destroy GPU bindings for ranges.
> > > Ranges are
> > > + *			expected to be added to the garbage collector
> > > upon
> > > + *			MMU_NOTIFY_UNMAP event.
> > > + */
> > > +
> > > +/**
> > > + * DOC: Locking
> > > + *
> > > + * GPU SVM handles locking for core MM interactions, i.e., it
> > > locks/unlocks the
> > > + * mmap lock as needed. Alternatively, if the driver prefers to
> > > handle the mmap
> > > + * lock itself, a 'locked' argument is provided to the functions that
> > > require
> > > + * the mmap lock. This option may be useful for drivers that need
> to
> > > call into
> > > + * GPU SVM while also holding a dma-resv lock, thus preventing
> > > locking
> > > + * inversions between the mmap and dma-resv locks.
> > > + *
> > > + * GPU SVM introduces a global notifier lock, which safeguards
> the
> > > notifier's
> > > + * range RB tree and list, as well as the range's DMA mappings
> and
> > > sequence
> > > + * number. GPU SVM manages all necessary locking and
> unlocking
> > > operations,
> > > + * except for the recheck of the range's sequence number
> > > + * (mmu_interval_read_retry) when the driver is committing
> GPU
> > > bindings. This
> > > + * lock corresponds to the 'driver->update' lock mentioned in the
> > > HMM
> > > + * documentation (TODO: Link). Future revisions may transition
> from
> > > a GPU SVM
> > > + * global lock to a per-notifier lock if finer-grained locking is
> deemed
> > > + * necessary.
> > > + *
> > > + * In addition to the locking mentioned above, the driver should
> > > implement a
> > > + * lock to safeguard core GPU SVM function calls that modify
> state,
> > > such as
> > > + * drm_gpusvm_range_find_or_insert and
> > > drm_gpusvm_range_remove. Alternatively,
> > > + * these core functions can be called within a single kernel thread,
> > > for
> > > + * instance, using an ordered work queue. This lock is denoted as
> > > + * 'driver_svm_lock' in code examples.
> > > + */
> > > +
> > > +/**
> > > + * DOC: Migration
> > > + *
> > > + * The migration support is quite simple, allowing migration
> between
> > > SRAM and
> > > + * VRAM at the range granularity. For example, GPU SVM
> currently
> > > does not
> > > + * support mixing SRAM and VRAM pages within a range. This
> means
> > > that upon GPU
> > > + * fault, the entire range can be migrated to VRAM, and upon
> CPU
> > > fault, the
> > > + * entire range is migrated to SRAM.
> > > + *
> > > + * The reasoning for only supporting range granularity is as
> follows: it
> > > + * simplifies the implementation, and range sizes are driver-
> defined
> > > and should
> > > + * be relatively small.
> >
> > Migration at range granularity just couples the physical world with the
> > virtual world, which is against the fundamental page-centric design we
> > aligned on before.
> >
> > Looking at core mm behavior, shrinking/swapping doesn't operate at vma
> > or any virtual range granularity. This way we swap out the less
> > frequently used pages and keep the more frequently used pages in ram.
> >
> > A similar thing should be done for vram migration to sram.
> >
> > > + */
> > > +
> > > +/**
> > > + * DOC: Partial Unmapping of Ranges
> > > + *
> > > + * Partial unmapping of ranges (e.g., 1M out of 2M is unmapped
> by
> > > CPU resulting
> > > + * in MMU_NOTIFY_UNMAP event) presents several challenges,
> >
> > As said above, the challenge is coming from a design choice. In a
> > Page centric design, the challenges don't exist at all.
> 
> See my other reply, as long as migrate_to_ram is entirely page centric
> we're fine. And I think Matt fixed that now.
> 
> The other aspect of being page centric is gpu pagetable locking, and
> there I also gained a lot of clarity on what exactly matters, and what
> doesn't. The mmu_notifier -> range -> page design wouldn't be my
> personal first choice, but it is a perfectly ok one I think. As long as
> we follow all the other rules we need to follow about page-centric
> locking/refcounting/pte invalidation that migrate_to_ram requires.
> 
> Cheers, Sima
> 
> 
> > > with the main one
> > > + * being that a subset of the range still has CPU and GPU
> mappings.
> > > If the
> > > + * backing store for the range is in VRAM, a subset of the backing
> > > store has
> > > + * references. One option would be to split the range and VRAM
> > > backing store,
> > > + * but the implementation for this would be quite complicated.
> > > Given that
> > > + * partial unmappings are rare and driver-defined range sizes are
> > > relatively
> > > + * small, GPU SVM does not support splitting of ranges.
> > > + *
> > > + * With no support for range splitting, upon partial unmapping of
> a
> > > range, the
> > > + * driver is expected to invalidate and destroy the entire range. If
> > > the range
> > > + * has VRAM as its backing, the driver is also expected to migrate
> any
> > > remaining
> > > + * pages back to SRAM.
> > > + */
> > > +
> > > +/**
> > > + * DOC: Examples
> > > + *
> > > + * This section provides two examples of how to build the
> expected
> > > driver
> > > + * components: the GPU page fault handler and the garbage
> > > collector. A third
> > > + * example demonstrates a sample invalidation driver vfunc.
> > > + *
> > > + * The generic code provided does not include logic for complex
> > > migration
> > > + * policies, optimized invalidations, or other potentially required
> > > driver
> > > + * locking (e.g., DMA-resv locks).
> > > + *
> > > + * 1) GPU page fault handler
> > > + *
> > > + *	int driver_bind_range(struct drm_gpusvm *gpusvm, struct
> > > drm_gpusvm_range *range)
> > > + *	{
> > > + *		int err = 0;
> > > + *
> > > + *		driver_alloc_and_setup_memory_for_bind(gpusvm,
> > > range);
> > > + *
> > > + *		drm_gpusvm_notifier_lock(gpusvm);
> > > + *		if (drm_gpusvm_range_pages_valid(range))
> > > + *			driver_commit_bind(gpusvm, range);
> > > + *		else
> > > + *			err = -EAGAIN;
> > > + *		drm_gpusvm_notifier_unlock(gpusvm);
> > > + *
> > > + *		return err;
> > > + *	}
> > > + *
> > > + *	int driver_gpu_fault(struct drm_gpusvm *gpusvm, u64
> > > fault_addr,
> > > + *			     u64 gpuva_start, u64 gpuva_end)
> > > + *	{
> > > + *		struct drm_gpusvm_ctx ctx = {};
> > > + *		int err;
> > > + *
> > > + *		driver_svm_lock();
> > > + *	retry:
> > > + *		// Always process UNMAPs first so view of GPU SVM
> > > ranges is current
> > > + *		driver_garbage_collector(gpusvm);
> > > + *
> > > + *		range = drm_gpusvm_range_find_or_insert(gpusvm,
> > > fault_addr,
> > > + *							gpuva_start,
> > > gpuva_end,
> > > + *						        &ctx);
> > > + *		if (IS_ERR(range)) {
> > > + *			err = PTR_ERR(range);
> > > + *			goto unlock;
> > > + *		}
> > > + *
> > > + *		if (driver_migration_policy(range)) {
> > > + *			bo = driver_alloc_bo();
> > > + *			err = drm_gpusvm_migrate_to_vram(gpusvm,
> > > range, bo, &ctx);
> > > + *			if (err)	// CPU mappings may have changed
> > > + *				goto retry;
> > > + *		}
> > > + *
> > > + *		err = drm_gpusvm_range_get_pages(gpusvm, range,
> > > &ctx);
> > > + *		if (err == -EFAULT || err == -EPERM)	// CPU
> > > mappings changed
> > > + *			goto retry;
> > > + *		else if (err)
> > > + *			goto unlock;
> > > + *
> > > + *		err = driver_bind_range(gpusvm, range);
> > > + *		if (err == -EAGAIN)	// CPU mappings changed
> > > + *			goto retry
> > > + *
> > > + *	unlock:
> > > + *		driver_svm_unlock();
> > > + *		return err;
> > > + *	}
> > > + *
> > > + * 2) Garbage Collector.
> > > + *
> > > + *	void __driver_garbage_collector(struct drm_gpusvm
> > > *gpusvm,
> > > + *					struct drm_gpusvm_range
> > > *range)
> > > + *	{
> > > + *		struct drm_gpusvm_ctx ctx = {};
> > > + *
> > > + *		assert_driver_svm_locked(gpusvm);
> > > + *
> > > + *		// Partial unmap, migrate any remaining VRAM pages
> > > back to SRAM
> > > + *		if (range->flags.partial_unmap)
> > > + *			drm_gpusvm_migrate_to_sram(gpusvm,
> > > range, &ctx);
> > > + *
> > > + *		driver_unbind_range(range);
> > > + *		drm_gpusvm_range_remove(gpusvm, range);
> > > + *	}
> > > + *
> > > + *	void driver_garbage_collector(struct drm_gpusvm *gpusvm)
> > > + *	{
> > > + *		assert_driver_svm_locked(gpusvm);
> > > + *
> > > + *		for_each_range_in_garbage_collector(gpusvm, range)
> > > + *			__driver_garbage_collector(gpusvm, range);
> > > + *	}
> > > + *
> > > + * 3) Invalidation driver vfunc.
> > > + *
> > > + *	void driver_invalidation(struct drm_gpusvm *gpusvm,
> > > + *				 struct drm_gpusvm_notifier *notifier,
> > > + *				 const struct mmu_notifier_range
> > > *mmu_range)
> > > + *	{
> > > + *		struct drm_gpusvm_ctx ctx = { .in_notifier = true, };
> > > + *		struct drm_gpusvm_range *range = NULL;
> > > + *
> > > + *		driver_invalidate_device_tlb(gpusvm, mmu_range-
> > > >start, mmu_range->end);
> > > + *
> > > + *		drm_gpusvm_for_each_range(range, notifier,
> > > mmu_range->start,
> > > + *					  mmu_range->end) {
> > > + *			drm_gpusvm_range_unmap_pages(gpusvm,
> > > range, &ctx);
> > > + *
> > > + *			if (mmu_range->event !=
> > > MMU_NOTIFY_UNMAP)
> > > + *				continue;
> > > + *
> > > + *			drm_gpusvm_range_set_unmapped(range,
> > > mmu_range);
> > > + *			driver_garbage_collector_add(gpusvm,
> > > range);
> > > + *		}
> > > + *	}
> > > + */
> > > +
> > > +#define DRM_GPUSVM_RANGE_START(_range)	((_range)-
> > > >va.start)
> > > +#define DRM_GPUSVM_RANGE_END(_range)	((_range)-
> > > >va.end - 1)
> > > +INTERVAL_TREE_DEFINE(struct drm_gpusvm_range, rb.node,
> u64,
> > > rb.__subtree_last,
> > > +		     DRM_GPUSVM_RANGE_START,
> > > DRM_GPUSVM_RANGE_END,
> > > +		     static __maybe_unused, range);
> > > +
> > > +#define DRM_GPUSVM_NOTIFIER_START(_notifier)
> 	((_notifier)-
> > > >interval.start)
> > > +#define DRM_GPUSVM_NOTIFIER_END(_notifier)
> 	((_notifier)-
> > > >interval.end - 1)
> > > +INTERVAL_TREE_DEFINE(struct drm_gpusvm_notifier, rb.node,
> u64,
> > > +		     rb.__subtree_last,
> > > DRM_GPUSVM_NOTIFIER_START,
> > > +		     DRM_GPUSVM_NOTIFIER_END, static
> > > __maybe_unused, notifier);
> > > +
> > > +/**
> > > + * npages_in_range() - Calculate the number of pages in a given
> > > range
> > > + * @start__: The start address of the range
> > > + * @end__: The end address of the range
> > > + *
> > > + * This macro calculates the number of pages in a given memory
> > > range,
> > > + * specified by the start and end addresses. It divides the
> difference
> > > + * between the end and start addresses by the page size
> > > (PAGE_SIZE) to
> > > + * determine the number of pages in the range.
> > > + *
> > > + * Return: The number of pages in the specified range.
> > > + */
> > > +#define npages_in_range(start__, end__)	\
> > > +	(((end__) - (start__)) >> PAGE_SHIFT)
> > > +
> > > +/**
> > > + * struct drm_gpusvm_zdd - GPU SVM zone device data
> > > + *
> > > + * @refcount: Reference count for the zdd
> > > + * @destroy_work: Work structure for asynchronous zdd
> > > destruction
> > > + * @range: Pointer to the GPU SVM range
> > > + * @vram_allocation: Driver-private pointer to the VRAM
> allocation
> > > + *
> > > + * This structure serves as a generic wrapper installed in
> > > + * page->zone_device_data. It provides infrastructure for
> looking up
> > > a range
> > > + * upon CPU page fault and asynchronously releasing VRAM once
> > > the CPU has no
> > > + * page references. Asynchronous release is useful because CPU
> > > page references
> > > + * can be dropped in IRQ contexts, while releasing VRAM likely
> > > requires sleeping
> > > + * locks.
> > > + */
> > > +struct drm_gpusvm_zdd {
> > > +	struct kref refcount;
> > > +	struct work_struct destroy_work;
> > > +	struct drm_gpusvm_range *range;
> > > +	void *vram_allocation;
> > > +};
> > > +
> > > +/**
> > > + * drm_gpusvm_zdd_destroy_work_func - Work function for
> > > destroying a zdd
> > > + * @w: Pointer to the work_struct
> > > + *
> > > + * This function releases VRAM, puts GPU SVM range, and frees
> zdd.
> > > + */
> > > +static void drm_gpusvm_zdd_destroy_work_func(struct
> > > work_struct *w)
> > > +{
> > > +	struct drm_gpusvm_zdd *zdd =
> > > +		container_of(w, struct drm_gpusvm_zdd,
> > > destroy_work);
> > > +	struct drm_gpusvm_range *range = zdd->range;
> > > +	struct drm_gpusvm *gpusvm = range->gpusvm;
> > > +
> > > +	if (gpusvm->ops->vram_release && zdd->vram_allocation)
> > > +		gpusvm->ops->vram_release(zdd->vram_allocation);
> > > +	drm_gpusvm_range_put(range);
> > > +	kfree(zdd);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_zdd_alloc - Allocate a zdd structure.
> > > + * @range: Pointer to the GPU SVM range.
> > > + *
> > > + * This function allocates and initializes a new zdd structure. It
> sets
> > > up the
> > > + * reference count, initializes the destroy work, and links the
> > > provided GPU SVM
> > > + * range.
> > > + *
> > > + * Returns:
> > > + * Pointer to the allocated zdd on success, ERR_PTR() on failure.
> > > + */
> > > +static struct drm_gpusvm_zdd *
> > > +drm_gpusvm_zdd_alloc(struct drm_gpusvm_range *range)
> > > +{
> > > +	struct drm_gpusvm_zdd *zdd;
> > > +
> > > +	zdd = kmalloc(sizeof(*zdd), GFP_KERNEL);
> > > +	if (!zdd)
> > > +		return NULL;
> > > +
> > > +	kref_init(&zdd->refcount);
> > > +	INIT_WORK(&zdd->destroy_work,
> > > drm_gpusvm_zdd_destroy_work_func);
> > > +	zdd->range = drm_gpusvm_range_get(range);
> > > +	zdd->vram_allocation = NULL;
> > > +
> > > +	return zdd;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_zdd_get - Get a reference to a zdd structure.
> > > + * @zdd: Pointer to the zdd structure.
> > > + *
> > > + * This function increments the reference count of the provided
> zdd
> > > structure.
> > > + *
> > > + * Returns: Pointer to the zdd structure.
> > > + */
> > > +static struct drm_gpusvm_zdd *drm_gpusvm_zdd_get(struct
> > > drm_gpusvm_zdd *zdd)
> > > +{
> > > +	kref_get(&zdd->refcount);
> > > +	return zdd;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_zdd_destroy - Destroy a zdd structure.
> > > + * @ref: Pointer to the reference count structure.
> > > + *
> > > + * This function queues the destroy_work of the zdd for
> > > asynchronous destruction.
> > > + */
> > > +static void drm_gpusvm_zdd_destroy(struct kref *ref)
> > > +{
> > > +	struct drm_gpusvm_zdd *zdd =
> > > +		container_of(ref, struct drm_gpusvm_zdd, refcount);
> > > +	struct drm_gpusvm *gpusvm = zdd->range->gpusvm;
> > > +
> > > +	queue_work(gpusvm->zdd_wq, &zdd->destroy_work);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_zdd_put - Put a zdd reference.
> > > + * @zdd: Pointer to the zdd structure.
> > > + *
> > > + * This function decrements the reference count of the provided
> zdd
> > > structure
> > > + * and schedules its destruction if the count drops to zero.
> > > + */
> > > +static void drm_gpusvm_zdd_put(struct drm_gpusvm_zdd *zdd)
> > > +{
> > > +	kref_put(&zdd->refcount, drm_gpusvm_zdd_destroy);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_find - Find GPU SVM range from GPU
> SVM
> > > notifier
> > > + * @notifier: Pointer to the GPU SVM notifier structure.
> > > + * @start: Start address of the range
> > > + * @end: End address of the range
> > > + *
> > > + * Return: A pointer to the drm_gpusvm_range if found or NULL
> > > + */
> > > +struct drm_gpusvm_range *
> > > +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier,
> u64
> > > start, u64 end)
> > > +{
> > > +	return range_iter_first(&notifier->root, start, end - 1);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_for_each_range_safe - Safely iterate over GPU
> > > SVM ranges in a notifier
> > > + * @range__: Iterator variable for the ranges
> > > + * @next__: Iterator variable for the ranges temporary storage
> > > + * @notifier__: Pointer to the GPU SVM notifier
> > > + * @start__: Start address of the range
> > > + * @end__: End address of the range
> > > + *
> > > + * This macro is used to iterate over GPU SVM ranges in a notifier
> > > while
> > > + * removing ranges from it.
> > > + */
> > > +#define drm_gpusvm_for_each_range_safe(range__, next__,
> > > notifier__, start__, end__)	\
> > > +	for ((range__) = drm_gpusvm_range_find((notifier__),
> > > (start__), (end__)),	\
> > > +	     (next__) = __drm_gpusvm_range_next(range__);
> > > 			\
> > > +	     (range__) && (range__->va.start < (end__));
> > > 			\
> > > +	     (range__) = (next__), (next__) =
> > > __drm_gpusvm_range_next(range__))
> > > +
> > > +/**
> > > + * __drm_gpusvm_notifier_next - get the next
> > > drm_gpusvm_notifier in the list
> > > + * @notifier: a pointer to the current drm_gpusvm_notifier
> > > + *
> > > + * Return: A pointer to the next drm_gpusvm_notifier if available,
> or
> > > NULL if
> > > + *         the current notifier is the last one or if the input notifier is
> > > + *         NULL.
> > > + */
> > > +static struct drm_gpusvm_notifier *
> > > +__drm_gpusvm_notifier_next(struct drm_gpusvm_notifier
> > > *notifier)
> > > +{
> > > +	if (notifier && !list_is_last(&notifier->rb.entry,
> > > +				      &notifier->gpusvm->notifier_list))
> > > +		return list_next_entry(notifier, rb.entry);
> > > +
> > > +	return NULL;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_for_each_notifier - Iterate over GPU SVM
> notifiers
> > > in a gpusvm
> > > + * @notifier__: Iterator variable for the notifiers
> > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > + * @start__: Start address of the notifier
> > > + * @end__: End address of the notifier
> > > + *
> > > + * This macro is used to iterate over GPU SVM notifiers in a
> gpusvm.
> > > + */
> > > +#define drm_gpusvm_for_each_notifier(notifier__, gpusvm__,
> > > start__, end__)		\
> > > +	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root,
> > > (start__), (end__) - 1);	\
> > > +	     (notifier__) && (notifier__->interval.start < (end__));
> > > 			\
> > > +	     (notifier__) = __drm_gpusvm_notifier_next(notifier__))
> > > +
> > > +/**
> > > + * drm_gpusvm_for_each_notifier_safe - Safely iterate over
> GPU
> > > SVM notifiers in a gpusvm
> > > + * @notifier__: Iterator variable for the notifiers
> > > + * @next__: Iterator variable for the notifiers temporary storage
> > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > + * @start__: Start address of the notifier
> > > + * @end__: End address of the notifier
> > > + *
> > > + * This macro is used to iterate over GPU SVM notifiers in a
> gpusvm
> > > while
> > > + * removing notifiers from it.
> > > + */
> > > +#define drm_gpusvm_for_each_notifier_safe(notifier__,
> next__,
> > > gpusvm__, start__, end__)	\
> > > +	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root,
> > > (start__), (end__) - 1),	\
> > > +	     (next__) = __drm_gpusvm_notifier_next(notifier__);
> > > 				\
> > > +	     (notifier__) && (notifier__->interval.start < (end__));
> > > 			\
> > > +	     (notifier__) = (next__), (next__) =
> > > __drm_gpusvm_notifier_next(notifier__))
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_invalidate - Invalidate a GPU SVM
> notifier.
> > > + * @mni: Pointer to the mmu_interval_notifier structure.
> > > + * @mmu_range: Pointer to the mmu_notifier_range structure.
> > > + * @cur_seq: Current sequence number.
> > > + *
> > > + * This function serves as a generic MMU notifier for GPU SVM. It
> > > sets the MMU
> > > + * notifier sequence number and calls the driver invalidate vfunc
> > > under
> > > + * gpusvm->notifier_lock.
> > > + *
> > > + * Returns:
> > > + * true if the operation succeeds, false otherwise.
> > > + */
> > > +static bool
> > > +drm_gpusvm_notifier_invalidate(struct mmu_interval_notifier
> *mni,
> > > +			       const struct mmu_notifier_range
> > > *mmu_range,
> > > +			       unsigned long cur_seq)
> > > +{
> > > +	struct drm_gpusvm_notifier *notifier =
> > > +		container_of(mni, typeof(*notifier), notifier);
> > > +	struct drm_gpusvm *gpusvm = notifier->gpusvm;
> > > +
> > > +	if (!mmu_notifier_range_blockable(mmu_range))
> > > +		return false;
> > > +
> > > +	down_write(&gpusvm->notifier_lock);
> > > +	mmu_interval_set_seq(mni, cur_seq);
> > > +	gpusvm->ops->invalidate(gpusvm, notifier, mmu_range);
> > > +	up_write(&gpusvm->notifier_lock);
> > > +
> > > +	return true;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_ops - MMU interval notifier operations
> for
> > > GPU SVM
> > > + */
> > > +static const struct mmu_interval_notifier_ops
> > > drm_gpusvm_notifier_ops = {
> > > +	.invalidate = drm_gpusvm_notifier_invalidate,
> > > +};
> > > +
> > > +/**
> > > + * drm_gpusvm_init - Initialize the GPU SVM.
> > > + * @gpusvm: Pointer to the GPU SVM structure.
> > > + * @name: Name of the GPU SVM.
> > > + * @drm: Pointer to the DRM device structure.
> > > + * @mm: Pointer to the mm_struct for the address space.
> > > + * @device_private_page_owner: Device private pages owner.
> > > + * @mm_start: Start address of GPU SVM.
> > > + * @mm_range: Range of the GPU SVM.
> > > + * @notifier_size: Size of individual notifiers.
> > > + * @ops: Pointer to the operations structure for GPU SVM.
> > > + * @chunk_sizes: Pointer to the array of chunk sizes used in
> range
> > > allocation.
> > > + *               Entries should be powers of 2 in descending order with
> last
> > > + *               entry being SZ_4K.
> > > + * @num_chunks: Number of chunks.
> > > + *
> > > + * This function initializes the GPU SVM.
> > > + *
> > > + * Returns:
> > > + * 0 on success, a negative error code on failure.
> > > + */
> > > +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> > > +		    const char *name, struct drm_device *drm,
> > > +		    struct mm_struct *mm, void
> > > *device_private_page_owner,
> > > +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> > > +		    const struct drm_gpusvm_ops *ops,
> > > +		    const u64 *chunk_sizes, int num_chunks)
> > > +{
> > > +	if (!ops->invalidate || !num_chunks)
> > > +		return -EINVAL;
> > > +
> > > +	gpusvm->name = name;
> > > +	gpusvm->drm = drm;
> > > +	gpusvm->mm = mm;
> > > +	gpusvm->device_private_page_owner =
> > > device_private_page_owner;
> > > +	gpusvm->mm_start = mm_start;
> > > +	gpusvm->mm_range = mm_range;
> > > +	gpusvm->notifier_size = notifier_size;
> > > +	gpusvm->ops = ops;
> > > +	gpusvm->chunk_sizes = chunk_sizes;
> > > +	gpusvm->num_chunks = num_chunks;
> > > +	gpusvm->zdd_wq = system_wq;
> > > +
> > > +	mmgrab(mm);
> > > +	gpusvm->root = RB_ROOT_CACHED;
> > > +	INIT_LIST_HEAD(&gpusvm->notifier_list);
> > > +
> > > +	init_rwsem(&gpusvm->notifier_lock);
> > > +
> > > +	fs_reclaim_acquire(GFP_KERNEL);
> > > +	might_lock(&gpusvm->notifier_lock);
> > > +	fs_reclaim_release(GFP_KERNEL);
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_find - Find GPU SVM notifier
> > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > + * @fault_addr__: Fault address
> > > + *
> > > + * This macro finds the GPU SVM notifier associated with the
> fault
> > > address.
> > > + *
> > > + * Returns:
> > > + * Pointer to the GPU SVM notifier on success, NULL otherwise.
> > > + */
> > > +#define drm_gpusvm_notifier_find(gpusvm__, fault_addr__)
> > > 	\
> > > +	notifier_iter_first(&(gpusvm__)->root, (fault_addr__),
> > > 	\
> > > +			    (fault_addr__ + 1))
> > > +
> > > +/**
> > > + * to_drm_gpusvm_notifier - retrieve the container struct for a
> > > given rbtree node
> > > + * @node__: a pointer to the rbtree node embedded within a
> > > drm_gpusvm_notifier struct
> > > + *
> > > + * Return: A pointer to the containing drm_gpusvm_notifier
> > > structure.
> > > + */
> > > +#define to_drm_gpusvm_notifier(__node)
> > > 	\
> > > +	container_of((__node), struct drm_gpusvm_notifier, rb.node)
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_insert - Insert GPU SVM notifier
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > + *
> > > + * This function inserts the GPU SVM notifier into the GPU SVM
> RB
> > > tree and list.
> > > + */
> > > +static void drm_gpusvm_notifier_insert(struct drm_gpusvm
> > > *gpusvm,
> > > +				       struct drm_gpusvm_notifier
> > > *notifier)
> > > +{
> > > +	struct rb_node *node;
> > > +	struct list_head *head;
> > > +
> > > +	notifier_insert(notifier, &gpusvm->root);
> > > +
> > > +	node = rb_prev(&notifier->rb.node);
> > > +	if (node)
> > > +		head = &(to_drm_gpusvm_notifier(node))->rb.entry;
> > > +	else
> > > +		head = &gpusvm->notifier_list;
> > > +
> > > +	list_add(&notifier->rb.entry, head);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_remove - Remove GPU SVM notifier
> > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > + * @notifier__: Pointer to the GPU SVM notifier structure
> > > + *
> > > + * This macro removes the GPU SVM notifier from the GPU SVM
> RB
> > > tree and list.
> > > + */
> > > +#define drm_gpusvm_notifier_remove(gpusvm__, notifier__)
> > > 	\
> > > +	notifier_remove((notifier__), &(gpusvm__)->root);	\
> > > +	list_del(&(notifier__)->rb.entry)
> > > +
> > > +/**
> > > + * drm_gpusvm_fini - Finalize the GPU SVM.
> > > + * @gpusvm: Pointer to the GPU SVM structure.
> > > + *
> > > + * This function finalizes the GPU SVM by cleaning up any
> remaining
> > > ranges and
> > > + * notifiers, and dropping a reference to struct MM.
> > > + */
> > > +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm)
> > > +{
> > > +	struct drm_gpusvm_notifier *notifier, *next;
> > > +
> > > +	drm_gpusvm_for_each_notifier_safe(notifier, next, gpusvm,
> > > 0, LONG_MAX) {
> > > +		struct drm_gpusvm_range *range, *__next;
> > > +
> > > +		/*
> > > +		 * Remove notifier first to avoid racing with any
> > > invalidation
> > > +		 */
> > > +		mmu_interval_notifier_remove(&notifier->notifier);
> > > +		notifier->flags.removed = true;
> > > +
> > > +		drm_gpusvm_for_each_range_safe(range, __next,
> > > notifier, 0,
> > > +					       LONG_MAX)
> > > +			drm_gpusvm_range_remove(gpusvm, range);
> > > +	}
> > > +
> > > +	mmdrop(gpusvm->mm);
> > > +	WARN_ON(!RB_EMPTY_ROOT(&gpusvm->root.rb_root));
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_alloc - Allocate GPU SVM notifier
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @fault_addr: Fault address
> > > + *
> > > + * This function allocates and initializes the GPU SVM notifier
> > > structure.
> > > + *
> > > + * Returns:
> > > + * Pointer to the allocated GPU SVM notifier on success,
> ERR_PTR()
> > > on failure.
> > > + */
> > > +static struct drm_gpusvm_notifier *
> > > +drm_gpusvm_notifier_alloc(struct drm_gpusvm *gpusvm, u64
> > > fault_addr)
> > > +{
> > > +	struct drm_gpusvm_notifier *notifier;
> > > +
> > > +	if (gpusvm->ops->notifier_alloc)
> > > +		notifier = gpusvm->ops->notifier_alloc();
> > > +	else
> > > +		notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
> > > +
> > > +	if (!notifier)
> > > +		return ERR_PTR(-ENOMEM);
> > > +
> > > +	notifier->gpusvm = gpusvm;
> > > +	notifier->interval.start = ALIGN_DOWN(fault_addr, gpusvm-
> > > >notifier_size);
> > > +	notifier->interval.end = ALIGN(fault_addr + 1, gpusvm-
> > > >notifier_size);
> > > +	INIT_LIST_HEAD(&notifier->rb.entry);
> > > +	notifier->root = RB_ROOT_CACHED;
> > > +	INIT_LIST_HEAD(&notifier->range_list);
> > > +
> > > +	return notifier;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_free - Free GPU SVM notifier
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > + *
> > > + * This function frees the GPU SVM notifier structure.
> > > + */
> > > +static void drm_gpusvm_notifier_free(struct drm_gpusvm
> *gpusvm,
> > > +				     struct drm_gpusvm_notifier
> > > *notifier)
> > > +{
> > > +	WARN_ON(!RB_EMPTY_ROOT(&notifier->root.rb_root));
> > > +
> > > +	if (gpusvm->ops->notifier_free)
> > > +		gpusvm->ops->notifier_free(notifier);
> > > +	else
> > > +		kfree(notifier);
> > > +}
> > > +
> > > +/**
> > > + * to_drm_gpusvm_range - retrieve the container struct for a
> given
> > > rbtree node
> > > + * @node__: a pointer to the rbtree node embedded within a
> > > drm_gpusvm_range struct
> > > + *
> > > + * Return: A pointer to the containing drm_gpusvm_range
> structure.
> > > + */
> > > +#define to_drm_gpusvm_range(node__)	\
> > > +	container_of((node__), struct drm_gpusvm_range, rb.node)
> > > +
> > > +/**
> > > + * drm_gpusvm_range_insert - Insert GPU SVM range
> > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + *
> > > + * This function inserts the GPU SVM range into the notifier RB
> tree
> > > and list.
> > > + */
> > > +static void drm_gpusvm_range_insert(struct
> drm_gpusvm_notifier
> > > *notifier,
> > > +				    struct drm_gpusvm_range *range)
> > > +{
> > > +	struct rb_node *node;
> > > +	struct list_head *head;
> > > +
> > > +	drm_gpusvm_notifier_lock(notifier->gpusvm);
> > > +	range_insert(range, &notifier->root);
> > > +
> > > +	node = rb_prev(&range->rb.node);
> > > +	if (node)
> > > +		head = &(to_drm_gpusvm_range(node))->rb.entry;
> > > +	else
> > > +		head = &notifier->range_list;
> > > +
> > > +	list_add(&range->rb.entry, head);
> > > +	drm_gpusvm_notifier_unlock(notifier->gpusvm);
> > > +}
> > > +
> > > +/**
> > > + * __drm_gpusvm_range_remove - Remove GPU SVM range
> > > + * @notifier__: Pointer to the GPU SVM notifier structure
> > > + * @range__: Pointer to the GPU SVM range structure
> > > + *
> > > + * This macro removes the GPU SVM range from the notifier RB
> tree
> > > and list.
> > > + */
> > > +#define __drm_gpusvm_range_remove(notifier__, range__)
> > > 		\
> > > +	range_remove((range__), &(notifier__)->root);
> > > 	\
> > > +	list_del(&(range__)->rb.entry)
> > > +
> > > +/**
> > > + * drm_gpusvm_range_alloc - Allocate GPU SVM range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > + * @fault_addr: Fault address
> > > + * @chunk_size: Chunk size
> > > + * @migrate_vram: Flag indicating whether to migrate VRAM
> > > + *
> > > + * This function allocates and initializes the GPU SVM range
> structure.
> > > + *
> > > + * Returns:
> > > + * Pointer to the allocated GPU SVM range on success, ERR_PTR()
> on
> > > failure.
> > > + */
> > > +static struct drm_gpusvm_range *
> > > +drm_gpusvm_range_alloc(struct drm_gpusvm *gpusvm,
> > > +		       struct drm_gpusvm_notifier *notifier,
> > > +		       u64 fault_addr, u64 chunk_size, bool
> > > migrate_vram)
> > > +{
> > > +	struct drm_gpusvm_range *range;
> > > +
> > > +	if (gpusvm->ops->range_alloc)
> > > +		range = gpusvm->ops->range_alloc(gpusvm);
> > > +	else
> > > +		range = kzalloc(sizeof(*range), GFP_KERNEL);
> > > +
> > > +	if (!range)
> > > +		return ERR_PTR(-ENOMEM);
> > > +
> > > +	kref_init(&range->refcount);
> > > +	range->gpusvm = gpusvm;
> > > +	range->notifier = notifier;
> > > +	range->va.start = ALIGN_DOWN(fault_addr, chunk_size);
> > > +	range->va.end = ALIGN(fault_addr + 1, chunk_size);
> > > +	INIT_LIST_HEAD(&range->rb.entry);
> > > +	range->notifier_seq = LONG_MAX;
> > > +	range->flags.migrate_vram = migrate_vram ? 1 : 0;
> > > +
> > > +	return range;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_check_pages - Check pages
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > + * @start: Start address
> > > + * @end: End address
> > > + *
> > > + * Check if pages between start and end have been faulted in on
> the
> > > CPU. Use to
> > > + * prevent migration of pages without CPU backing store.
> > > + *
> > > + * Returns:
> > > + * True if pages have been faulted into CPU, False otherwise
> > > + */
> > > +static bool drm_gpusvm_check_pages(struct drm_gpusvm
> *gpusvm,
> > > +				   struct drm_gpusvm_notifier
> > > *notifier,
> > > +				   u64 start, u64 end)
> > > +{
> > > +	struct hmm_range hmm_range = {
> > > +		.default_flags = 0,
> > > +		.notifier = &notifier->notifier,
> > > +		.start = start,
> > > +		.end = end,
> > > +		.dev_private_owner = gpusvm-
> > > >device_private_page_owner,
> > > +	};
> > > +	unsigned long timeout =
> > > +		jiffies +
> > > msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> > > +	unsigned long *pfns;
> > > +	unsigned long npages = npages_in_range(start, end);
> > > +	int err, i;
> > > +
> > > +	mmap_assert_locked(gpusvm->mm);
> > > +
> > > +	pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
> > > +	if (!pfns)
> > > +		return false;
> > > +
> > > +	hmm_range.notifier_seq =
> > > mmu_interval_read_begin(&notifier->notifier);
> > > +	hmm_range.hmm_pfns = pfns;
> > > +
> > > +	while (true) {
> > > +		err = hmm_range_fault(&hmm_range);
> > > +		if (err == -EBUSY) {
> > > +			if (time_after(jiffies, timeout))
> > > +				break;
> > > +
> > > +			hmm_range.notifier_seq =
> > > mmu_interval_read_begin(&notifier->notifier);
> > > +			continue;
> > > +		}
> > > +		break;
> > > +	}
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	for (i = 0; i < npages; ++i) {
> > > +		if (!(pfns[i] & HMM_PFN_VALID)) {
> > > +			err = -EFAULT;
> > > +			goto err_free;
> > > +		}
> > > +	}
> > > +
> > > +err_free:
> > > +	kvfree(pfns);
> > > +	return err ? false : true;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_chunk_size - Determine chunk size for
> GPU
> > > SVM range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > + * @vas: Pointer to the virtual memory area structure
> > > + * @fault_addr: Fault address
> > > + * @gpuva_start: Start address of GPUVA which mirrors CPU
> > > + * @gpuva_end: End address of GPUVA which mirrors CPU
> > > + * @check_pages: Flag indicating whether to check pages
> > > + *
> > > + * This function determines the chunk size for the GPU SVM
> range
> > > based on the
> > > + * fault address, GPU SVM chunk sizes, existing GPU SVM ranges,
> > > and the virtual
> > > + * memory area boundaries.
> > > + *
> > > + * Returns:
> > > + * Chunk size on success, LONG_MAX on failure.
> > > + */
> > > +static u64 drm_gpusvm_range_chunk_size(struct drm_gpusvm
> > > *gpusvm,
> > > +				       struct drm_gpusvm_notifier
> > > *notifier,
> > > +				       struct vm_area_struct *vas,
> > > +				       u64 fault_addr, u64 gpuva_start,
> > > +				       u64 gpuva_end, bool check_pages)
> > > +{
> > > +	u64 start, end;
> > > +	int i = 0;
> > > +
> > > +retry:
> > > +	for (; i < gpusvm->num_chunks; ++i) {
> > > +		start = ALIGN_DOWN(fault_addr, gpusvm-
> > > >chunk_sizes[i]);
> > > +		end = ALIGN(fault_addr + 1, gpusvm->chunk_sizes[i]);
> > > +
> > > +		if (start >= vas->vm_start && end <= vas->vm_end
> > > &&
> > > +		    start >= notifier->interval.start &&
> > > +		    end <= notifier->interval.end &&
> > > +		    start >= gpuva_start && end <= gpuva_end)
> > > +			break;
> > > +	}
> > > +
> > > +	if (i == gpusvm->num_chunks)
> > > +		return LONG_MAX;
> > > +
> > > +	/*
> > > +	 * If allocation more than page, ensure not to overlap with
> > > existing
> > > +	 * ranges.
> > > +	 */
> > > +	if (end - start != SZ_4K) {
> > > +		struct drm_gpusvm_range *range;
> > > +
> > > +		range = drm_gpusvm_range_find(notifier, start, end);
> > > +		if (range) {
> > > +			++i;
> > > +			goto retry;
> > > +		}
> > > +
> > > +		/*
> > > +		 * XXX: Only create range on pages CPU has faulted in.
> > > Without
> > > +		 * this check, or prefault, on BMG
> > > 'xe_exec_system_allocator --r
> > > +		 * process-many-malloc' fails. In the failure case, each
> > > process
> > > +		 * mallocs 16k but the CPU VMA is ~128k which results
> > > in 64k SVM
> > > +		 * ranges. When migrating the SVM ranges, some
> > > processes fail in
> > > +		 * drm_gpusvm_migrate_to_vram with
> > > 'migrate.cpages != npages'
> > > +		 * and then upon drm_gpusvm_range_get_pages
> > > device pages from
> > > +		 * other processes are collected + faulted in which
> > > creates all
> > > +		 * sorts of problems. Unsure exactly how this is happening;
> > > +		 * the problem also goes away if 'xe_exec_system_allocator
> > > +		 * --r process-many-malloc' mallocs at least 64k at a time.
> > > +		 */
> > > +		if (check_pages &&
> > > +		    !drm_gpusvm_check_pages(gpusvm, notifier, start,
> > > end)) {
> > > +			++i;
> > > +			goto retry;
> > > +		}
> > > +	}
> > > +
> > > +	return end - start;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_find_or_insert - Find or insert GPU SVM
> > > range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @fault_addr: Fault address
> > > + * @gpuva_start: Start address of GPUVA which mirrors CPU
> > > + * @gpuva_end: End address of GPUVA which mirrors CPU
> > > + * @ctx: GPU SVM context
> > > + *
> > > + * This function finds or inserts a newly allocated GPU SVM
> range
> > > based on the
> > > + * fault address. Caller must hold a lock to protect range lookup
> and
> > > insertion.
> > > + *
> > > + * Returns:
> > > + * Pointer to the GPU SVM range on success, ERR_PTR() on
> failure.
> > > + */
> > > +struct drm_gpusvm_range *
> > > +drm_gpusvm_range_find_or_insert(struct drm_gpusvm
> *gpusvm,
> > > u64 fault_addr,
> > > +				u64 gpuva_start, u64 gpuva_end,
> > > +				const struct drm_gpusvm_ctx *ctx)
> > > +{
> > > +	struct drm_gpusvm_notifier *notifier;
> > > +	struct drm_gpusvm_range *range;
> > > +	struct mm_struct *mm = gpusvm->mm;
> > > +	struct vm_area_struct *vas;
> > > +	bool notifier_alloc = false;
> > > +	u64 chunk_size;
> > > +	int err;
> > > +	bool migrate_vram;
> > > +
> > > +	if (fault_addr < gpusvm->mm_start ||
> > > +	    fault_addr > gpusvm->mm_start + gpusvm->mm_range) {
> > > +		err = -EINVAL;
> > > +		goto err_out;
> > > +	}
> > > +
> > > +	if (!ctx->mmap_locked) {
> > > +		if (!mmget_not_zero(mm)) {
> > > +			err = -EFAULT;
> > > +			goto err_out;
> > > +		}
> > > +		mmap_write_lock(mm);
> > > +	}
> > > +
> > > +	mmap_assert_write_locked(mm);
> > > +
> > > +	notifier = drm_gpusvm_notifier_find(gpusvm, fault_addr);
> > > +	if (!notifier) {
> > > +		notifier = drm_gpusvm_notifier_alloc(gpusvm,
> > > fault_addr);
> > > +		if (IS_ERR(notifier)) {
> > > +			err = PTR_ERR(notifier);
> > > +			goto err_mmunlock;
> > > +		}
> > > +		notifier_alloc = true;
> > > +		err = mmu_interval_notifier_insert_locked(&notifier-
> > > >notifier,
> > > +							  mm, notifier-
> > > >interval.start,
> > > +							  notifier-
> > > >interval.end -
> > > +							  notifier-
> > > >interval.start,
> > > +
> > > &drm_gpusvm_notifier_ops);
> > > +		if (err)
> > > +			goto err_notifier;
> > > +	}
> > > +
> > > +	vas = vma_lookup(mm, fault_addr);
> > > +	if (!vas) {
> > > +		err = -ENOENT;
> > > +		goto err_notifier_remove;
> > > +	}
> > > +
> > > +	if (!ctx->read_only && !(vas->vm_flags & VM_WRITE)) {
> > > +		err = -EPERM;
> > > +		goto err_notifier_remove;
> > > +	}
> > > +
> > > +	range = drm_gpusvm_range_find(notifier, fault_addr,
> > > fault_addr + 1);
> > > +	if (range)
> > > +		goto out_mmunlock;
> > > +	/*
> > > +	 * XXX: Short-circuiting migration based on migrate_vma_*
> > > current
> > > +	 * limitations. If/when migrate_vma_* add more support, this
> > > logic will
> > > +	 * have to change.
> > > +	 */
> > > +	migrate_vram = ctx->vram_possible &&
> > > +		vma_is_anonymous(vas)
> > > && !is_vm_hugetlb_page(vas);
> > > +
> > > +	chunk_size = drm_gpusvm_range_chunk_size(gpusvm,
> > > notifier, vas,
> > > +						 fault_addr,
> > > gpuva_start,
> > > +						 gpuva_end,
> > > migrate_vram &&
> > > +						 !ctx->prefault);
> > > +	if (chunk_size == LONG_MAX) {
> > > +		err = -EINVAL;
> > > +		goto err_notifier_remove;
> > > +	}
> > > +
> > > +	range = drm_gpusvm_range_alloc(gpusvm, notifier,
> > > fault_addr, chunk_size,
> > > +				       migrate_vram);
> > > +	if (IS_ERR(range)) {
> > > +		err = PTR_ERR(range);
> > > +		goto err_notifier_remove;
> > > +	}
> > > +
> > > +	drm_gpusvm_range_insert(notifier, range);
> > > +	if (notifier_alloc)
> > > +		drm_gpusvm_notifier_insert(gpusvm, notifier);
> > > +
> > > +	if (ctx->prefault) {
> > > +		struct drm_gpusvm_ctx __ctx = *ctx;
> > > +
> > > +		__ctx.mmap_locked = true;
> > > +		err = drm_gpusvm_range_get_pages(gpusvm, range,
> > > &__ctx);
> > > +		if (err)
> > > +			goto err_range_remove;
> > > +	}
> > > +
> > > +out_mmunlock:
> > > +	if (!ctx->mmap_locked) {
> > > +		mmap_write_unlock(mm);
> > > +		mmput(mm);
> > > +	}
> > > +
> > > +	return range;
> > > +
> > > +err_range_remove:
> > > +	__drm_gpusvm_range_remove(notifier, range);
> > > +err_notifier_remove:
> > > +	if (notifier_alloc)
> > > +		mmu_interval_notifier_remove(&notifier->notifier);
> > > +err_notifier:
> > > +	if (notifier_alloc)
> > > +		drm_gpusvm_notifier_free(gpusvm, notifier);
> > > +err_mmunlock:
> > > +	if (!ctx->mmap_locked) {
> > > +		mmap_write_unlock(mm);
> > > +		mmput(mm);
> > > +	}
> > > +err_out:
> > > +	return ERR_PTR(err);
> > > +}
> > > +
> > > +/**
> > > + * for_each_dma_page - iterate over pages in a DMA region
> > > + * @i__: the current page index in the iteration
> > > + * @j__: the current page index, log order, in the iteration
> > > + * @npages__: the total number of pages in the DMA region
> > > + * @order__: the order of the pages in the DMA region
> > > + *
> > > + * This macro iterates over each page in a DMA region. The DMA
> > > region
> > > + * is assumed to be composed of 2^@order__ pages, and the
> macro
> > > will
> > > + * step through the region one block of 2^@order__ pages at a
> time.
> > > + */
> > > +#define for_each_dma_page(i__, j__, npages__, order__)
> 	\
> > > +	for ((i__) = 0, (j__) = 0; (i__) < (npages__);	\
> > > +	     (j__)++, (i__) += 0x1 << (order__))
> > > +
> > > +/**
> > > + * __drm_gpusvm_range_unmap_pages - Unmap pages
> associated
> > > with a GPU SVM range (internal)
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + *
> > > + * This function unmaps pages associated with a GPU SVM range.
> > > Assumes and
> > > + * asserts correct locking is in place when called.
> > > + */
> > > +static void __drm_gpusvm_range_unmap_pages(struct
> > > drm_gpusvm *gpusvm,
> > > +					   struct drm_gpusvm_range
> > > *range)
> > > +{
> > > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > > +
> > > +	if (range->pages) {
> > > +		unsigned long i, j, npages = npages_in_range(range-
> > > >va.start,
> > > +							     range-
> > > >va.end);
> > > +
> > > +		if (range->flags.has_dma_mapping) {
> > > +			for_each_dma_page(i, j, npages, range-
> > > >order)
> > > +				dma_unmap_page(gpusvm->drm-
> > > >dev,
> > > +					       range->dma_addr[j],
> > > +					       PAGE_SIZE << range-
> > > >order,
> > > +					       DMA_BIDIRECTIONAL);
> > > +		}
> > > +
> > > +		range->flags.has_vram_pages = false;
> > > +		range->flags.has_dma_mapping = false;
> > > +	}
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_free_pages - Free pages associated with
> a
> > > GPU SVM range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + *
> > > + * This function frees pages associated with a GPU SVM range.
> > > + */
> > > +static void drm_gpusvm_range_free_pages(struct drm_gpusvm
> > > *gpusvm,
> > > +					struct drm_gpusvm_range
> > > *range)
> > > +{
> > > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > > +
> > > +	if (range->pages) {
> > > +		if (range->flags.kfree_mapping) {
> > > +			kfree(range->dma_addr);
> > > +			range->flags.kfree_mapping = false;
> > > +			range->pages = NULL;
> > > +		} else {
> > > +			kvfree(range->pages);
> > > +			range->pages = NULL;
> > > +		}
> > > +	}
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_remove - Remove GPU SVM range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range to be removed
> > > + *
> > > + * This function removes the specified GPU SVM range and also
> > > removes the parent
> > > + * GPU SVM notifier if no more ranges remain in the notifier. The
> > > caller must
> > > + * hold a lock to protect range and notifier removal.
> > > + */
> > > +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> > > +			     struct drm_gpusvm_range *range)
> > > +{
> > > +	struct drm_gpusvm_notifier *notifier;
> > > +
> > > +	notifier = drm_gpusvm_notifier_find(gpusvm, range-
> > > >va.start);
> > > +	if (WARN_ON_ONCE(!notifier))
> > > +		return;
> > > +
> > > +	drm_gpusvm_notifier_lock(gpusvm);
> > > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > +	drm_gpusvm_range_free_pages(gpusvm, range);
> > > +	__drm_gpusvm_range_remove(notifier, range);
> > > +	drm_gpusvm_notifier_unlock(gpusvm);
> > > +
> > > +	drm_gpusvm_range_put(range);
> > > +
> > > +	if (RB_EMPTY_ROOT(&notifier->root.rb_root)) {
> > > +		if (!notifier->flags.removed)
> > > +			mmu_interval_notifier_remove(&notifier-
> > > >notifier);
> > > +		drm_gpusvm_notifier_remove(gpusvm, notifier);
> > > +		drm_gpusvm_notifier_free(gpusvm, notifier);
> > > +	}
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_get - Get a reference to GPU SVM range
> > > + * @range: Pointer to the GPU SVM range
> > > + *
> > > + * This function increments the reference count of the specified
> > > GPU SVM range.
> > > + *
> > > + * Returns:
> > > + * Pointer to the GPU SVM range.
> > > + */
> > > +struct drm_gpusvm_range *
> > > +drm_gpusvm_range_get(struct drm_gpusvm_range *range)
> > > +{
> > > +	kref_get(&range->refcount);
> > > +
> > > +	return range;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_destroy - Destroy GPU SVM range
> > > + * @refcount: Pointer to the reference counter embedded in the
> > > GPU SVM range
> > > + *
> > > + * This function destroys the specified GPU SVM range when its
> > > reference count
> > > + * reaches zero. If a custom range-free function is provided, it is
> > > invoked to
> > > + * free the range; otherwise, the range is deallocated using
> kfree().
> > > + */
> > > +static void drm_gpusvm_range_destroy(struct kref *refcount)
> > > +{
> > > +	struct drm_gpusvm_range *range =
> > > +		container_of(refcount, struct drm_gpusvm_range,
> > > refcount);
> > > +	struct drm_gpusvm *gpusvm = range->gpusvm;
> > > +
> > > +	if (gpusvm->ops->range_free)
> > > +		gpusvm->ops->range_free(range);
> > > +	else
> > > +		kfree(range);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_put - Put a reference to GPU SVM range
> > > + * @range: Pointer to the GPU SVM range
> > > + *
> > > + * This function decrements the reference count of the specified
> > > GPU SVM range
> > > + * and frees it when the count reaches zero.
> > > + */
> > > +void drm_gpusvm_range_put(struct drm_gpusvm_range *range)
> > > +{
> > > +	kref_put(&range->refcount, drm_gpusvm_range_destroy);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_pages_valid - GPU SVM range pages
> valid
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + *
> > > + * This function determines if a GPU SVM range's pages are valid.
> > > + * Expected to be called holding gpusvm->notifier_lock and as the
> > > + * last step before committing a GPU binding.
> > > + *
> > > + * Returns:
> > > + * True if GPU SVM range has valid pages, False otherwise
> > > + */
> > > +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm
> *gpusvm,
> > > +				  struct drm_gpusvm_range *range)
> > > +{
> > > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > > +
> > > +	return range->flags.has_vram_pages || range-
> > > >flags.has_dma_mapping;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_pages_valid_unlocked - GPU SVM range
> > > pages valid unlocked
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + *
> > > + * This function determines if a GPU SVM range's pages are valid.
> > > + * Expected to be called without holding gpusvm->notifier_lock.
> > > + *
> > > + * Returns:
> > > + * True if GPU SVM range has valid pages, False otherwise
> > > + */
> > > +static bool
> > > +drm_gpusvm_range_pages_valid_unlocked(struct drm_gpusvm
> > > *gpusvm,
> > > +				      struct drm_gpusvm_range *range)
> > > +{
> > > +	bool pages_valid;
> > > +
> > > +	if (!range->pages)
> > > +		return false;
> > > +
> > > +	drm_gpusvm_notifier_lock(gpusvm);
> > > +	pages_valid = drm_gpusvm_range_pages_valid(gpusvm,
> > > range);
> > > +	if (!pages_valid && range->flags.kfree_mapping) {
> > > +		kfree(range->dma_addr);
> > > +		range->flags.kfree_mapping = false;
> > > +		range->pages = NULL;
> > > +	}
> > > +	drm_gpusvm_notifier_unlock(gpusvm);
> > > +
> > > +	return pages_valid;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_get_pages - Get pages for a GPU SVM
> range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + * @ctx: GPU SVM context
> > > + *
> > > + * This function gets pages for a GPU SVM range and ensures
> they
> > > are mapped for
> > > + * DMA access.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +int drm_gpusvm_range_get_pages(struct drm_gpusvm
> *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       const struct drm_gpusvm_ctx *ctx)
> > > +{
> > > +	struct mmu_interval_notifier *notifier = &range->notifier-
> > > >notifier;
> > > +	struct hmm_range hmm_range = {
> > > +		.default_flags = HMM_PFN_REQ_FAULT | (ctx-
> > > >read_only ? 0 :
> > > +			HMM_PFN_REQ_WRITE),
> > > +		.notifier = notifier,
> > > +		.start = range->va.start,
> > > +		.end = range->va.end,
> > > +		.dev_private_owner = gpusvm-
> > > >device_private_page_owner,
> > > +	};
> > > +	struct mm_struct *mm = gpusvm->mm;
> > > +	unsigned long timeout =
> > > +		jiffies +
> > > msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> > > +	unsigned long i, j;
> > > +	unsigned long npages = npages_in_range(range->va.start,
> > > range->va.end);
> > > +	unsigned int order = 0;
> > > +	unsigned long *pfns;
> > > +	struct page **pages;
> > > +	int err = 0;
> > > +	bool vram_pages = !!range->flags.migrate_vram;
> > > +	bool alloc_pfns = false, kfree_mapping;
> > > +
> > > +retry:
> > > +	kfree_mapping = false;
> > > +	hmm_range.notifier_seq =
> > > mmu_interval_read_begin(notifier);
> > > +	if (drm_gpusvm_range_pages_valid_unlocked(gpusvm,
> > > range))
> > > +		return 0;
> > > +
> > > +	if (range->notifier_seq == hmm_range.notifier_seq &&
> > > range->pages) {
> > > +		if (ctx->prefault)
> > > +			return 0;
> > > +
> > > +		pfns = (unsigned long *)range->pages;
> > > +		pages = range->pages;
> > > +		goto map_pages;
> > > +	}
> > > +
> > > +	if (!range->pages) {
> > > +		pfns = kvmalloc_array(npages, sizeof(*pfns),
> > > GFP_KERNEL);
> > > +		if (!pfns)
> > > +			return -ENOMEM;
> > > +		alloc_pfns = true;
> > > +	} else {
> > > +		pfns = (unsigned long *)range->pages;
> > > +	}
> > > +
> > > +	if (!ctx->mmap_locked) {
> > > +		if (!mmget_not_zero(mm)) {
> > > +			err = -EFAULT;
> > > +			goto err_out;
> > > +		}
> > > +	}
> > > +
> > > +	hmm_range.hmm_pfns = pfns;
> > > +	while (true) {
> > > +		/* Must be checked after mmu_interval_read_begin
> > > */
> > > +		if (range->flags.unmapped) {
> > > +			err = -EFAULT;
> > > +			break;
> > > +		}
> > > +
> > > +		if (!ctx->mmap_locked) {
> > > +			/*
> > > +			 * XXX: HMM locking document indicates only
> > > a read-lock
> > > +			 * is required but there apears to be a window
> > > between
> > > +			 * the MMU_NOTIFY_MIGRATE event
> > > triggered in a CPU fault
> > > +			 * via migrate_vma_setup and the pages
> > > actually moving
> > > +			 * in migrate_vma_finalize in which this code
> > > can grab
> > > +			 * garbage pages. Grabbing the write-lock if
> > > the range
> > > +			 * is attached to vram appears to protect
> > > against this
> > > +			 * race.
> > > +			 */
> > > +			if (vram_pages)
> > > +				mmap_write_lock(mm);
> > > +			else
> > > +				mmap_read_lock(mm);
> > > +		}
> > > +		err = hmm_range_fault(&hmm_range);
> > > +		if (!ctx->mmap_locked) {
> > > +			if (vram_pages)
> > > +				mmap_write_unlock(mm);
> > > +			else
> > > +				mmap_read_unlock(mm);
> > > +		}
> > > +
> > > +		if (err == -EBUSY) {
> > > +			if (time_after(jiffies, timeout))
> > > +				break;
> > > +
> > > +			hmm_range.notifier_seq =
> > > mmu_interval_read_begin(notifier);
> > > +			continue;
> > > +		}
> > > +		break;
> > > +	}
> > > +	if (!ctx->mmap_locked)
> > > +		mmput(mm);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	pages = (struct page **)pfns;
> > > +
> > > +	if (ctx->prefault) {
> > > +		range->pages = pages;
> > > +		goto set_seqno;
> > > +	}
> > > +
> > > +map_pages:
> > > +	if (is_device_private_page(hmm_pfn_to_page(pfns[0]))) {
> > > +		WARN_ON_ONCE(!range->vram_allocation);
> > > +
> > > +		for (i = 0; i < npages; ++i) {
> > > +			pages[i] = hmm_pfn_to_page(pfns[i]);
> > > +
> > > +			if
> > > (WARN_ON_ONCE(!is_device_private_page(pages[i]))) {
> > > +				err = -EOPNOTSUPP;
> > > +				goto err_free;
> > > +			}
> > > +		}
> > > +
> > > +		/* Do not race with notifier unmapping pages */
> > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > +		range->flags.has_vram_pages = true;
> > > +		range->pages = pages;
> > > +		if (mmu_interval_read_retry(notifier,
> > > hmm_range.notifier_seq)) {
> > > +			err = -EAGAIN;
> > > +			__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > +		}
> > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > +	} else {
> > > +		dma_addr_t *dma_addr = (dma_addr_t *)pfns;
> > > +
> > > +		for_each_dma_page(i, j, npages, order) {
> > > +			if (WARN_ON_ONCE(i && order !=
> > > +
> > > hmm_pfn_to_map_order(pfns[i]))) {
> > > +				err = -EOPNOTSUPP;
> > > +				npages = i;
> > > +				goto err_unmap;
> > > +			}
> > > +			order = hmm_pfn_to_map_order(pfns[i]);
> > > +
> > > +			pages[j] = hmm_pfn_to_page(pfns[i]);
> > > +			if
> > > (WARN_ON_ONCE(is_zone_device_page(pages[j]))) {
> > > +				err = -EOPNOTSUPP;
> > > +				npages = i;
> > > +				goto err_unmap;
> > > +			}
> > > +
> > > +			set_page_dirty_lock(pages[j]);
> > > +			mark_page_accessed(pages[j]);
> > > +
> > > +			dma_addr[j] = dma_map_page(gpusvm->drm->dev,
> > > +						   pages[j], 0,
> > > +						   PAGE_SIZE << order,
> > > +						   DMA_BIDIRECTIONAL);
> > > +			if (dma_mapping_error(gpusvm->drm->dev,
> > > dma_addr[j])) {
> > > +				err = -EFAULT;
> > > +				npages = i;
> > > +				goto err_unmap;
> > > +			}
> > > +		}
> > > +
> > > +		/* Huge pages, reduce memory footprint */
> > > +		if (order) {
> > > +			dma_addr = kmalloc_array(j,
> > > sizeof(*dma_addr),
> > > +						 GFP_KERNEL);
> > > +			if (dma_addr) {
> > > +				for (i = 0; i < j; ++i)
> > > +					dma_addr[i] =
> > > (dma_addr_t)pfns[i];
> > > +				kvfree(pfns);
> > > +				kfree_mapping = true;
> > > +			} else {
> > > +				dma_addr = (dma_addr_t *)pfns;
> > > +			}
> > > +		}
> > > +
> > > +		/* Do not race with notifier unmapping pages */
> > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > +		range->order = order;
> > > +		range->flags.kfree_mapping = kfree_mapping;
> > > +		range->flags.has_dma_mapping = true;
> > > +		range->dma_addr = dma_addr;
> > > +		range->vram_allocation = NULL;
> > > +		if (mmu_interval_read_retry(notifier,
> > > hmm_range.notifier_seq)) {
> > > +			err = -EAGAIN;
> > > +			__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > +		}
> > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > +	}
> > > +
> > > +	if (err == -EAGAIN)
> > > +		goto retry;
> > > +set_seqno:
> > > +	range->notifier_seq = hmm_range.notifier_seq;
> > > +
> > > +	return 0;
> > > +
> > > +err_unmap:
> > > +	for_each_dma_page(i, j, npages, order)
> > > +		dma_unmap_page(gpusvm->drm->dev,
> > > +			       (dma_addr_t)pfns[j],
> > > +			       PAGE_SIZE << order,
> > > DMA_BIDIRECTIONAL);
> > > +err_free:
> > > +	if (alloc_pfns)
> > > +		kvfree(pfns);
> > > +err_out:
> > > +	return err;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_unmap_pages - Unmap pages associated with a GPU SVM range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + * @ctx: GPU SVM context
> > > + *
> > > + * This function unmaps pages associated with a GPU SVM range. If @in_notifier
> > > + * is set, it is assumed that gpusvm->notifier_lock is held in write mode; if it
> > > + * is clear, it acquires gpusvm->notifier_lock in read mode. Must be called on
> > > + * each GPU SVM range attached to notifier in gpusvm->ops->invalidate for IOMMU
> > > + * security model.
> > > + */
> > > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm
> > > *gpusvm,
> > > +				  struct drm_gpusvm_range *range,
> > > +				  const struct drm_gpusvm_ctx *ctx)
> > > +{
> > > +	if (ctx->in_notifier)
> > > +		lockdep_assert_held_write(&gpusvm->notifier_lock);
> > > +	else
> > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > +
> > > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > +
> > > +	if (!ctx->in_notifier)
> > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migration_put_page - Put a migration page
> > > + * @page: Pointer to the page to put
> > > + *
> > > + * This function unlocks and puts a page.
> > > + */
> > > +static void drm_gpusvm_migration_put_page(struct page *page)
> > > +{
> > > +	unlock_page(page);
> > > +	put_page(page);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migration_put_pages - Put migration pages
> > > + * @npages: Number of pages
> > > + * @migrate_pfn: Array of migrate page frame numbers
> > > + *
> > > + * This function puts an array of pages.
> > > + */
> > > +static void drm_gpusvm_migration_put_pages(unsigned long
> > > npages,
> > > +					   unsigned long *migrate_pfn)
> > > +{
> > > +	unsigned long i;
> > > +
> > > +	for (i = 0; i < npages; ++i) {
> > > +		if (!migrate_pfn[i])
> > > +			continue;
> > > +
> > > +		drm_gpusvm_migration_put_page(migrate_pfn_to_page(migrate_pfn[i]));
> > > +		migrate_pfn[i] = 0;
> > > +	}
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_get_vram_page - Get a reference to a VRAM page
> > > + * @page: Pointer to the page
> > > + * @zdd: Pointer to the GPU SVM zone device data
> > > + *
> > > + * This function associates the given page with the specified GPU SVM zone
> > > + * device data and initializes it for zone device usage.
> > > + */
> > > +static void drm_gpusvm_get_vram_page(struct page *page,
> > > +				     struct drm_gpusvm_zdd *zdd)
> > > +{
> > > +	page->zone_device_data = drm_gpusvm_zdd_get(zdd);
> > > +	zone_device_page_init(page);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_map_pages() - Map migration pages for GPU SVM migration
> > > + * @dev: The device for which the pages are being mapped
> > > + * @dma_addr: Array to store DMA addresses corresponding to mapped pages
> > > + * @migrate_pfn: Array of migrate page frame numbers to map
> > > + * @npages: Number of pages to map
> > > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > > + *
> > > + * This function maps pages of memory for migration usage in GPU SVM. It
> > > + * iterates over each page frame number provided in @migrate_pfn, maps the
> > > + * corresponding page, and stores the DMA address in the provided @dma_addr
> > > + * array.
> > > + *
> > > + * Return: 0 on success, -EFAULT if an error occurs during mapping.
> > > + */
> > > +static int drm_gpusvm_migrate_map_pages(struct device *dev,
> > > +					dma_addr_t *dma_addr,
> > > +					long unsigned int
> > > *migrate_pfn,
> > > +					unsigned long npages,
> > > +					enum dma_data_direction dir)
> > > +{
> > > +	unsigned long i;
> > > +
> > > +	for (i = 0; i < npages; ++i) {
> > > +		struct page *page =
> > > migrate_pfn_to_page(migrate_pfn[i]);
> > > +
> > > +		if (!page)
> > > +			continue;
> > > +
> > > +		if (WARN_ON_ONCE(is_zone_device_page(page)))
> > > +			return -EFAULT;
> > > +
> > > +		dma_addr[i] = dma_map_page(dev, page, 0,
> > > PAGE_SIZE, dir);
> > > +		if (dma_mapping_error(dev, dma_addr[i]))
> > > +			return -EFAULT;
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously mapped for GPU SVM migration
> > > + * @dev: The device for which the pages were mapped
> > > + * @dma_addr: Array of DMA addresses corresponding to mapped pages
> > > + * @npages: Number of pages to unmap
> > > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > > + *
> > > + * This function unmaps previously mapped pages of memory for GPU Shared Virtual
> > > + * Memory (SVM). It iterates over each DMA address provided in @dma_addr, checks
> > > + * if it's valid and not already unmapped, and unmaps the corresponding page.
> > > + */
> > > +static void drm_gpusvm_migrate_unmap_pages(struct device
> *dev,
> > > +					   dma_addr_t *dma_addr,
> > > +					   unsigned long npages,
> > > +					   enum dma_data_direction
> > > dir)
> > > +{
> > > +	unsigned long i;
> > > +
> > > +	for (i = 0; i < npages; ++i) {
> > > +		if (!dma_addr[i] || dma_mapping_error(dev,
> > > dma_addr[i]))
> > > +			continue;
> > > +
> > > +		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
> > > +	}
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_to_vram - Migrate GPU SVM range to VRAM
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + * @vram_allocation: Driver-private pointer to the VRAM allocation. The caller
> > > + *                   should hold a reference to the VRAM allocation, which
> > > + *                   should be dropped via ops->vram_release or upon the
> > > + *                   failure of this function.
> > > + * @ctx: GPU SVM context
> > > + *
> > > + * This function migrates the specified GPU SVM range to VRAM. It performs the
> > > + * necessary setup and invokes the driver-specific operations for migration to
> > > + * VRAM. Upon successful return, @vram_allocation can safely reference @range
> > > + * until ops->vram_release is called, which only happens upon successful return.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       void *vram_allocation,
> > > +			       const struct drm_gpusvm_ctx *ctx)
> > > +{
> > > +	u64 start = range->va.start, end = range->va.end;
> > > +	struct migrate_vma migrate = {
> > > +		.start		= start,
> > > +		.end		= end,
> > > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > > +		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
> > > +	};
> > > +	struct mm_struct *mm = gpusvm->mm;
> > > +	unsigned long i, npages = npages_in_range(start, end);
> > > +	struct vm_area_struct *vas;
> > > +	struct drm_gpusvm_zdd *zdd = NULL;
> > > +	struct page **pages;
> > > +	dma_addr_t *dma_addr;
> > > +	void *buf;
> > > +	int err;
> > > +
> > > +	if (!range->flags.migrate_vram)
> > > +		return -EINVAL;
> > > +
> > > +	if (!gpusvm->ops->populate_vram_pfn || !gpusvm->ops->copy_to_vram ||
> > > +	    !gpusvm->ops->copy_to_sram)
> > > +		return -EOPNOTSUPP;
> > > +
> > > +	if (!ctx->mmap_locked) {
> > > +		if (!mmget_not_zero(mm)) {
> > > +			err = -EFAULT;
> > > +			goto err_out;
> > > +		}
> > > +		mmap_write_lock(mm);
> > > +	}
> > > +
> > > +	mmap_assert_locked(mm);
> > > +
> > > +	vas = vma_lookup(mm, start);
> > > +	if (!vas) {
> > > +		err = -ENOENT;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	if (end > vas->vm_end || start < vas->vm_start) {
> > > +		err = -EINVAL;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	if (!vma_is_anonymous(vas)) {
> > > +		err = -EBUSY;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) +
> > > sizeof(*dma_addr) +
> > > +		       sizeof(*pages), GFP_KERNEL);
> > > +	if (!buf) {
> > > +		err = -ENOMEM;
> > > +		goto err_mmunlock;
> > > +	}
> > > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr))
> > > * npages;
> > > +
> > > +	zdd = drm_gpusvm_zdd_alloc(range);
> > > +	if (!zdd) {
> > > +		err = -ENOMEM;
> > > +		goto err_free;
> > > +	}
> > > +
> > > +	migrate.vma = vas;
> > > +	migrate.src = buf;
> > > +	migrate.dst = migrate.src + npages;
> > > +
> > > +	err = migrate_vma_setup(&migrate);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	/*
> > > +	 * FIXME: Below cases, !migrate.cpages and migrate.cpages !=
> > > npages, not
> > > +	 * always an error. Need to revisit possible cases and how to
> > > handle. We
> > > +	 * could prefault on migrate.cpages != npages via
> > > hmm_range_fault.
> > > +	 */
> > > +
> > > +	if (!migrate.cpages) {
> > > +		err = -EFAULT;
> > > +		goto err_free;
> > > +	}
> > > +
> > > +	if (migrate.cpages != npages) {
> > > +		err = -EBUSY;
> > > +		goto err_finalize;
> > > +	}
> > > +
> > > +	err = gpusvm->ops->populate_vram_pfn(gpusvm,
> > > vram_allocation, npages,
> > > +					     migrate.dst);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > > dma_addr,
> > > +					   migrate.src, npages,
> > > DMA_TO_DEVICE);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	for (i = 0; i < npages; ++i) {
> > > +		struct page *page = pfn_to_page(migrate.dst[i]);
> > > +
> > > +		pages[i] = page;
> > > +		migrate.dst[i] = migrate_pfn(migrate.dst[i]);
> > > +		drm_gpusvm_get_vram_page(page, zdd);
> > > +	}
> > > +
> > > +	err = gpusvm->ops->copy_to_vram(gpusvm, pages,
> > > dma_addr, npages);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	/* Upon success bind vram allocation to range and zdd */
> > > +	range->vram_allocation = vram_allocation;
> > > +	WRITE_ONCE(zdd->vram_allocation, vram_allocation);	/*
> > > Owns ref */
> > > +
> > > +err_finalize:
> > > +	if (err)
> > > +		drm_gpusvm_migration_put_pages(npages,
> > > migrate.dst);
> > > +	migrate_vma_pages(&migrate);
> > > +	migrate_vma_finalize(&migrate);
> > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev,
> > > dma_addr, npages,
> > > +				       DMA_TO_DEVICE);
> > > +err_free:
> > > +	if (zdd)
> > > +		drm_gpusvm_zdd_put(zdd);
> > > +	kvfree(buf);
> > > +err_mmunlock:
> > > +	if (!ctx->mmap_locked) {
> > > +		mmap_write_unlock(mm);
> > > +		mmput(mm);
> > > +	}
> > > +err_out:
> > > +	return err;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_populate_sram_pfn - Populate SRAM PFNs for a VM area
> > > + * @vas: Pointer to the VM area structure, can be NULL
> > > + * @npages: Number of pages to populate
> > > + * @src_mpfn: Source array of migrate PFNs
> > > + * @mpfn: Array of migrate PFNs to populate
> > > + * @addr: Start address for PFN allocation
> > > + *
> > > + * This function populates the SRAM migrate page frame numbers (PFNs) for the
> > > + * specified VM area structure. It allocates and locks pages in the VM area for
> > > + * SRAM usage. If vas is non-NULL use alloc_page_vma for allocation, if NULL use
> > > + * alloc_page for allocation.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +static int drm_gpusvm_migrate_populate_sram_pfn(struct
> > > vm_area_struct *vas,
> > > +						unsigned long npages,
> > > +						unsigned long
> > > *src_mpfn,
> > > +						unsigned long *mpfn,
> > > u64 addr)
> > > +{
> > > +	unsigned long i;
> > > +
> > > +	for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
> > > +		struct page *page;
> > > +
> > > +		if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
> > > +			continue;
> > > +
> > > +		if (vas)
> > > +			page = alloc_page_vma(GFP_HIGHUSER, vas,
> > > addr);
> > > +		else
> > > +			page = alloc_page(GFP_HIGHUSER);
> > > +
> > > +		if (!page)
> > > +			return -ENOMEM;
> > > +
> > > +		lock_page(page);
> > > +		mpfn[i] = migrate_pfn(page_to_pfn(page));
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_evict_to_sram - Evict GPU SVM range to SRAM
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + *
> > > + * Similar to __drm_gpusvm_migrate_to_sram but does not
> require
> > > mmap lock and
> > > + * migration done via migrate_device_* functions. Fallback path
> as it
> > > is
> > > + * preferred to issue migrations with mmap lock.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +static int drm_gpusvm_evict_to_sram(struct drm_gpusvm
> *gpusvm,
> > > +				    struct drm_gpusvm_range *range)
> > > +{
> > > +	unsigned long npages;
> > > +	struct page **pages;
> > > +	unsigned long *src, *dst;
> > > +	dma_addr_t *dma_addr;
> > > +	void *buf;
> > > +	int i, err = 0;
> > > +
> > > +	npages = npages_in_range(range->va.start, range->va.end);
> > > +
> > > +	buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr) +
> > > +		       sizeof(*pages), GFP_KERNEL);
> > > +	if (!buf) {
> > > +		err = -ENOMEM;
> > > +		goto err_out;
> > > +	}
> > > +	src = buf;
> > > +	dst = buf + (sizeof(*src) * npages);
> > > +	dma_addr = buf + (2 * sizeof(*src) * npages);
> > > +	pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) * npages;
> > > +
> > > +	err = gpusvm->ops->populate_vram_pfn(gpusvm, range->vram_allocation,
> > > +					     npages, src);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	err = migrate_device_vma_range(gpusvm->mm,
> > > +				       gpusvm->device_private_page_owner, src,
> > > +				       npages, range->va.start);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	err = drm_gpusvm_migrate_populate_sram_pfn(NULL,
> > > npages, src, dst, 0);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > > dma_addr,
> > > +					   dst, npages,
> > > DMA_BIDIRECTIONAL);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	for (i = 0; i < npages; ++i)
> > > +		pages[i] = migrate_pfn_to_page(src[i]);
> > > +
> > > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages,
> > > dma_addr, npages);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +err_finalize:
> > > +	if (err)
> > > +		drm_gpusvm_migration_put_pages(npages, dst);
> > > +	migrate_device_pages(src, dst, npages);
> > > +	migrate_device_finalize(src, dst, npages);
> > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev,
> > > dma_addr, npages,
> > > +				       DMA_BIDIRECTIONAL);
> > > +err_free:
> > > +	kvfree(buf);
> > > +err_out:
> > > +
> > > +	return err;
> > > +}
> > > +
> > > +/**
> > > + * __drm_gpusvm_migrate_to_sram - Migrate GPU SVM range
> to
> > > SRAM (internal)
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @vas: Pointer to the VM area structure
> > > + * @page: Pointer to the page for fault handling (can be NULL)
> > > + * @start: Start address of the migration range
> > > + * @end: End address of the migration range
> > > + *
> > > + * This internal function performs the migration of the specified
> GPU
> > > SVM range
> > > + * to SRAM. It sets up the migration, populates + dma maps
> SRAM
> > > PFNs, and
> > > + * invokes the driver-specific operations for migration to SRAM.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +static int __drm_gpusvm_migrate_to_sram(struct drm_gpusvm
> > > *gpusvm,
> > > +					struct vm_area_struct *vas,
> > > +					struct page *page,
> > > +					u64 start, u64 end)
> > > +{
> > > +	struct migrate_vma migrate = {
> > > +		.vma		= vas,
> > > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > > +		.flags		=
> > > MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
> > > +		.fault_page	= page,
> > > +	};
> > > +	unsigned long npages;
> > > +	struct page **pages;
> > > +	dma_addr_t *dma_addr;
> > > +	void *buf;
> > > +	int i, err = 0;
> > > +
> > > +	mmap_assert_locked(gpusvm->mm);
> > > +
> > > +	/* Corner case where the VMA has been partially unmapped */
> > > +	if (start < vas->vm_start)
> > > +		start = vas->vm_start;
> > > +	if (end > vas->vm_end)
> > > +		end = vas->vm_end;
> > > +
> > > +	migrate.start = start;
> > > +	migrate.end = end;
> > > +	npages = npages_in_range(start, end);
> > > +
> > > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) +
> > > sizeof(*dma_addr) +
> > > +		       sizeof(*pages), GFP_KERNEL);
> > > +	if (!buf) {
> > > +		err = -ENOMEM;
> > > +		goto err_out;
> > > +	}
> > > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr))
> > > * npages;
> > > +
> > > +	migrate.vma = vas;
> > > +	migrate.src = buf;
> > > +	migrate.dst = migrate.src + npages;
> > > +
> > > +	err = migrate_vma_setup(&migrate);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	/* Raced with another CPU fault, nothing to do */
> > > +	if (!migrate.cpages)
> > > +		goto err_free;
> > > +
> > > +	err = drm_gpusvm_migrate_populate_sram_pfn(vas, npages,
> > > +						   migrate.src,
> > > migrate.dst,
> > > +						   start);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > > dma_addr,
> > > +					   migrate.dst, npages,
> > > +					   DMA_BIDIRECTIONAL);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	for (i = 0; i < npages; ++i)
> > > +		pages[i] = migrate_pfn_to_page(migrate.src[i]);
> > > +
> > > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages,
> > > dma_addr, npages);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +err_finalize:
> > > +	if (err)
> > > +		drm_gpusvm_migration_put_pages(npages,
> > > migrate.dst);
> > > +	migrate_vma_pages(&migrate);
> > > +	migrate_vma_finalize(&migrate);
> > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev,
> > > dma_addr, npages,
> > > +				       DMA_BIDIRECTIONAL);
> > > +err_free:
> > > +	kvfree(buf);
> > > +err_out:
> > > +	mmap_assert_locked(gpusvm->mm);
> > > +
> > > +	return err;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_to_sram - Migrate (evict) GPU SVM
> range
> > > to SRAM
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + * @ctx: GPU SVM context
> > > + *
> > > + * This function initiates the migration of the specified GPU SVM
> > > range to
> > > + * SRAM. It performs necessary checks and invokes the internal
> > > migration
> > > + * function for actual migration.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm
> *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       const struct drm_gpusvm_ctx *ctx)
> > > +{
> > > +	u64 start = range->va.start, end = range->va.end;
> > > +	struct mm_struct *mm = gpusvm->mm;
> > > +	struct vm_area_struct *vas;
> > > +	int err;
> > > +	bool retry = false;
> > > +
> > > +	if (!ctx->mmap_locked) {
> > > +		if (!mmget_not_zero(mm)) {
> > > +			err = -EFAULT;
> > > +			goto err_out;
> > > +		}
> > > +		if (ctx->trylock_mmap) {
> > > +			if (!mmap_read_trylock(mm))  {
> > > +				err =
> > > drm_gpusvm_evict_to_sram(gpusvm, range);
> > > +				goto err_mmput;
> > > +			}
> > > +		} else {
> > > +			mmap_read_lock(mm);
> > > +		}
> > > +	}
> > > +
> > > +	mmap_assert_locked(mm);
> > > +
> > > +	/*
> > > +	 * Loop required to find all VM area structs for the corner case when
> > > +	 * VRAM backing has been partially unmapped from the MM's address space.
> > > +	 */
> > > +again:
> > > +	vas = find_vma(mm, start);
> > > +	if (!vas) {
> > > +		if (!retry)
> > > +			err = -ENOENT;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	if (end <= vas->vm_start || start >= vas->vm_end) {
> > > +		if (!retry)
> > > +			err = -EINVAL;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	err = __drm_gpusvm_migrate_to_sram(gpusvm, vas, NULL,
> > > start, end);
> > > +	if (err)
> > > +		goto err_mmunlock;
> > > +
> > > +	if (vas->vm_end < end) {
> > > +		retry = true;
> > > +		start = vas->vm_end;
> > > +		goto again;
> > > +	}
> > > +
> > > +	if (!ctx->mmap_locked) {
> > > +		mmap_read_unlock(mm);
> > > +		/*
> > > +		 * Using mmput_async as this function can be called
> > > while
> > > +		 * holding a dma-resv lock, and a final put can grab the
> > > mmap
> > > +		 * lock, causing a lock inversion.
> > > +		 */
> > > +		mmput_async(mm);
> > > +	}
> > > +
> > > +	return 0;
> > > +
> > > +err_mmunlock:
> > > +	if (!ctx->mmap_locked)
> > > +		mmap_read_unlock(mm);
> > > +err_mmput:
> > > +	if (!ctx->mmap_locked)
> > > +		mmput_async(mm);
> > > +err_out:
> > > +	return err;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_page_free - Put GPU SVM zone device data associated with a page
> > > + * @page: Pointer to the page
> > > + *
> > > + * This function is a callback used to put the GPU SVM zone device data
> > > + * associated with a page when it is being released.
> > > + */
> > > +static void drm_gpusvm_page_free(struct page *page)
> > > +{
> > > +	drm_gpusvm_zdd_put(page->zone_device_data);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_to_ram - Migrate GPU SVM range to RAM (page fault handler)
> > > + * @vmf: Pointer to the fault information structure
> > > + *
> > > + * This function is a page fault handler used to migrate a GPU SVM range to RAM.
> > > + * It retrieves the GPU SVM range information from the faulting page and invokes
> > > + * the internal migration function to migrate the range back to RAM.
> > > + *
> > > + * Returns:
> > > + * VM_FAULT_SIGBUS on failure, 0 on success.
> > > + */
> > > +static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault
> > > *vmf)
> > > +{
> > > +	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
> > > +	int err;
> > > +
> > > +	err = __drm_gpusvm_migrate_to_sram(zdd->range->gpusvm,
> > > +					   vmf->vma, vmf->page,
> > > +					   zdd->range->va.start,
> > > +					   zdd->range->va.end);
> > > +
> > > +	return err ? VM_FAULT_SIGBUS : 0;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_pagemap_ops - Device page map operations for
> > > GPU SVM
> > > + */
> > > +static const struct dev_pagemap_ops
> drm_gpusvm_pagemap_ops =
> > > {
> > > +	.page_free = drm_gpusvm_page_free,
> > > +	.migrate_to_ram = drm_gpusvm_migrate_to_ram,
> > > +};
> > > +
> > > +/**
> > > + * drm_gpusvm_pagemap_ops_get - Retrieve GPU SVM device
> > > page map operations
> > > + *
> > > + * Returns:
> > > + * Pointer to the GPU SVM device page map operations structure.
> > > + */
> > > +const struct dev_pagemap_ops
> > > *drm_gpusvm_pagemap_ops_get(void)
> > > +{
> > > +	return &drm_gpusvm_pagemap_ops;
> > > +}
> > > +
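As an aside, a minimal sketch of how a driver might wire these pagemap ops into
its device-private memory setup, assuming a hypothetical driver-side helper and
VRAM resource (none of this is part of the patch):

/* Hypothetical driver code, not part of this patch. */
#include <linux/memremap.h>

static int driver_vram_pagemap_init(struct drm_device *drm,
				    struct dev_pagemap *pagemap,
				    struct resource *vram_res, void *owner)
{
	void *addr;

	pagemap->type = MEMORY_DEVICE_PRIVATE;
	pagemap->range.start = vram_res->start;
	pagemap->range.end = vram_res->end;
	pagemap->nr_range = 1;
	pagemap->ops = drm_gpusvm_pagemap_ops_get();
	/* Must match the device_private_page_owner passed to drm_gpusvm_init() */
	pagemap->owner = owner;

	addr = devm_memremap_pages(drm->dev, pagemap);
	return IS_ERR(addr) ? PTR_ERR(addr) : 0;
}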
> > > +/**
> > > + * drm_gpusvm_has_mapping - Check if GPU SVM has mapping
> for
> > > the given address range
> > > + * @gpusvm: Pointer to the GPU SVM structure.
> > > + * @start: Start address
> > > + * @end: End address
> > > + *
> > > + * Returns:
> > > + * True if GPU SVM has mapping, False otherwise
> > > + */
> > > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm,
> u64
> > > start, u64 end)
> > > +{
> > > +	struct drm_gpusvm_notifier *notifier;
> > > +
> > > +	drm_gpusvm_for_each_notifier(notifier, gpusvm, start, end)
> > > {
> > > +		struct drm_gpusvm_range *range = NULL;
> > > +
> > > +		drm_gpusvm_for_each_range(range, notifier, start,
> > > end)
> > > +			return true;
> > > +	}
> > > +
> > > +	return false;
> > > +}
> > > diff --git a/drivers/gpu/drm/xe/drm_gpusvm.h
> > > b/drivers/gpu/drm/xe/drm_gpusvm.h
> > > new file mode 100644
> > > index 000000000000..0ea70f8534a8
> > > --- /dev/null
> > > +++ b/drivers/gpu/drm/xe/drm_gpusvm.h
> > > @@ -0,0 +1,415 @@
> > > +/* SPDX-License-Identifier: MIT */
> > > +/*
> > > + * Copyright © 2024 Intel Corporation
> > > + */
> > > +
> > > +#ifndef __DRM_GPUSVM_H__
> > > +#define __DRM_GPUSVM_H__
> > > +
> > > +#include <linux/kref.h>
> > > +#include <linux/mmu_notifier.h>
> > > +#include <linux/workqueue.h>
> > > +
> > > +struct dev_pagemap_ops;
> > > +struct drm_device;
> > > +struct drm_gpusvm;
> > > +struct drm_gpusvm_notifier;
> > > +struct drm_gpusvm_ops;
> > > +struct drm_gpusvm_range;
> > > +
> > > +/**
> > > + * struct drm_gpusvm_ops - Operations structure for GPU SVM
> > > + *
> > > + * This structure defines the operations for GPU Shared Virtual
> > > Memory (SVM).
> > > + * These operations are provided by the GPU driver to manage
> SVM
> > > ranges and
> > > + * perform operations such as migration between VRAM and
> system
> > > RAM.
> > > + */
> > > +struct drm_gpusvm_ops {
> > > +	/**
> > > +	 * @notifier_alloc: Allocate a GPU SVM notifier (optional)
> > > +	 *
> > > +	 * This function shall allocate a GPU SVM notifier.
> > > +	 *
> > > +	 * Returns:
> > > +	 * Pointer to the allocated GPU SVM notifier on success, NULL
> > > on failure.
> > > +	 */
> > > +	struct drm_gpusvm_notifier *(*notifier_alloc)(void);
> > > +
> > > +	/**
> > > +	 * @notifier_free: Free a GPU SVM notifier (optional)
> > > +	 * @notifier: Pointer to the GPU SVM notifier to be freed
> > > +	 *
> > > +	 * This function shall free a GPU SVM notifier.
> > > +	 */
> > > +	void (*notifier_free)(struct drm_gpusvm_notifier *notifier);
> > > +
> > > +	/**
> > > +	 * @range_alloc: Allocate a GPU SVM range (optional)
> > > +	 * @gpusvm: Pointer to the GPU SVM
> > > +	 *
> > > +	 * This function shall allocate a GPU SVM range.
> > > +	 *
> > > +	 * Returns:
> > > +	 * Pointer to the allocated GPU SVM range on success, NULL
> > > on failure.
> > > +	 */
> > > +	struct drm_gpusvm_range *(*range_alloc)(struct
> > > drm_gpusvm *gpusvm);
> > > +
> > > +	/**
> > > +	 * @range_free: Free a GPU SVM range (optional)
> > > +	 * @range: Pointer to the GPU SVM range to be freed
> > > +	 *
> > > +	 * This function shall free a GPU SVM range.
> > > +	 */
> > > +	void (*range_free)(struct drm_gpusvm_range *range);
> > > +
> > > +	/**
> > > +	 * @vram_release: Release VRAM allocation (optional)
> > > +	 * @vram_allocation: Driver-private pointer to the VRAM
> > > allocation
> > > +	 *
> > > +	 * This function shall release VRAM allocation and expects to
> > > drop a
> > > +	 * reference to VRAM allocation.
> > > +	 */
> > > +	void (*vram_release)(void *vram_allocation);
> > > +
> > > +	/**
> > > +	 * @populate_vram_pfn: Populate VRAM PFN (required for
> > > migration)
> > > +	 * @gpusvm: Pointer to the GPU SVM
> > > +	 * @vram_allocation: Driver-private pointer to the VRAM
> > > allocation
> > > +	 * @npages: Number of pages to populate
> > > +	 * @pfn: Array of page frame numbers to populate
> > > +	 *
> > > +	 * This function shall populate VRAM page frame numbers
> > > (PFN).
> > > +	 *
> > > +	 * Returns:
> > > +	 * 0 on success, a negative error code on failure.
> > > +	 */
> > > +	int (*populate_vram_pfn)(struct drm_gpusvm *gpusvm,
> > > +				 void *vram_allocation,
> > > +				 unsigned long npages,
> > > +				 unsigned long *pfn);
> > > +
> > > +	/**
> > > +	 * @copy_to_vram: Copy to VRAM (required for migration)
> > > +	 * @gpusvm: Pointer to the GPU SVM
> > > +	 * @pages: Pointer to array of VRAM pages (destination)
> > > +	 * @dma_addr: Pointer to array of DMA addresses (source)
> > > +	 * @npages: Number of pages to copy
> > > +	 *
> > > +	 * This function shall copy pages to VRAM.
> > > +	 *
> > > +	 * Returns:
> > > +	 * 0 on success, a negative error code on failure.
> > > +	 */
> > > +	int (*copy_to_vram)(struct drm_gpusvm *gpusvm,
> > > +			    struct page **pages,
> > > +			    dma_addr_t *dma_addr,
> > > +			    unsigned long npages);
> > > +
> > > +	/**
> > > +	 * @copy_to_sram: Copy to system RAM (required for
> > > migration)
> > > +	 * @gpusvm: Pointer to the GPU SVM
> > > +	 * @pages: Pointer to array of VRAM pages (source)
> > > +	 * @dma_addr: Pointer to array of DMA addresses
> > > (destination)
> > > +	 * @npages: Number of pages to copy
> > > +	 *
> > > +	 * This function shall copy pages to system RAM.
> > > +	 *
> > > +	 * Returns:
> > > +	 * 0 on success, a negative error code on failure.
> > > +	 */
> > > +	int (*copy_to_sram)(struct drm_gpusvm *gpusvm,
> > > +			    struct page **pages,
> > > +			    dma_addr_t *dma_addr,
> > > +			    unsigned long npages);
> > > +
> > > +	/**
> > > +	 * @invalidate: Invalidate GPU SVM notifier (required)
> > > +	 * @gpusvm: Pointer to the GPU SVM
> > > +	 * @notifier: Pointer to the GPU SVM notifier
> > > +	 * @mmu_range: Pointer to the mmu_notifier_range
> > > structure
> > > +	 *
> > > +	 * This function shall invalidate the GPU page tables. It can
> > > safely
> > > +	 * walk the notifier range RB tree/list in this function. Called
> > > while
> > > +	 * holding the notifier lock.
> > > +	 */
> > > +	void (*invalidate)(struct drm_gpusvm *gpusvm,
> > > +			   struct drm_gpusvm_notifier *notifier,
> > > +			   const struct mmu_notifier_range
> > > *mmu_range);
> > > +};
> > > +
> > > +/**
> > > + * struct drm_gpusvm_notifier - Structure representing a GPU
> SVM
> > > notifier
> > > + *
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: MMU interval notifier
> > > + * @interval: Interval for the notifier
> > > + * @rb: Red-black tree node for the parent GPU SVM structure
> > > notifier tree
> > > + * @root: Cached root node of the RB tree containing ranges
> > > + * @range_list: List head of ranges in the same order they appear in the
> > > + *              interval tree. This is useful to keep iterating ranges while
> > > + *              doing modifications to the RB tree.
> > > + * @flags.removed: Flag indicating whether the MMU interval
> > > notifier has been
> > > + *                 removed
> > > + *
> > > + * This structure represents a GPU SVM notifier.
> > > + */
> > > +struct drm_gpusvm_notifier {
> > > +	struct drm_gpusvm *gpusvm;
> > > +	struct mmu_interval_notifier notifier;
> > > +	struct {
> > > +		u64 start;
> > > +		u64 end;
> > > +	} interval;
> > > +	struct {
> > > +		struct rb_node node;
> > > +		struct list_head entry;
> > > +		u64 __subtree_last;
> > > +	} rb;
> > > +	struct rb_root_cached root;
> > > +	struct list_head range_list;
> > > +	struct {
> > > +		u32 removed : 1;
> > > +	} flags;
> > > +};
> > > +
> > > +/**
> > > + * struct drm_gpusvm_range - Structure representing a GPU
> SVM
> > > range
> > > + *
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: Pointer to the GPU SVM notifier
> > > + * @refcount: Reference count for the range
> > > + * @rb: Red-black tree node for the parent GPU SVM notifier
> > > structure range tree
> > > + * @va: Virtual address range
> > > + * @notifier_seq: Notifier sequence number of the range's
> pages
> > > + * @pages: Pointer to the array of pages (if backing store is in
> VRAM)
> > > + * @dma_addr: DMA address array (if backing store is SRAM and
> > > DMA mapped)
> > > + * @vram_allocation: Driver-private pointer to the VRAM
> allocation
> > > + * @order: Order of dma mapping. i.e. PAGE_SIZE << order is
> > > mapping size
> > > + * @flags.migrate_vram: Flag indicating whether the range can
> be
> > > migrated to VRAM
> > > + * @flags.unmapped: Flag indicating if the range has been
> > > unmapped
> > > + * @flags.partial_unmap: Flag indicating if the range has been
> > > partially unmapped
> > > + * @flags.has_vram_pages: Flag indicating if the range has vram
> > > pages
> > > + * @flags.has_dma_mapping: Flag indicating if the range has a
> DMA
> > > mapping
> > > + * @flags.kfree_mapping: Flag indicating @dma_addr is a
> compact
> > > allocation based
> > > + *                       on @order which releases via kfree
> > > + *
> > > + * This structure represents a GPU SVM range used for tracking
> > > memory ranges
> > > + * mapped in a DRM device.
> > > + */
> > > +struct drm_gpusvm_range {
> > > +	struct drm_gpusvm *gpusvm;
> > > +	struct drm_gpusvm_notifier *notifier;
> > > +	struct kref refcount;
> > > +	struct {
> > > +		struct rb_node node;
> > > +		struct list_head entry;
> > > +		u64 __subtree_last;
> > > +	} rb;
> > > +	struct {
> > > +		u64 start;
> > > +		u64 end;
> > > +	} va;
> > > +	unsigned long notifier_seq;
> > > +	union {
> > > +		struct page **pages;
> > > +		dma_addr_t *dma_addr;
> > > +	};
> > > +	void *vram_allocation;
> > > +	u16 order;
> > > +	struct {
> > > +		/* All flags below must be set upon creation */
> > > +		u16 migrate_vram : 1;
> > > +		/* All flags below must be set / cleared under notifier
> > > lock */
> > > +		u16 unmapped : 1;
> > > +		u16 partial_unmap : 1;
> > > +		u16 has_vram_pages : 1;
> > > +		u16 has_dma_mapping : 1;
> > > +		u16 kfree_mapping : 1;
> > > +	} flags;
> > > +};
> > > +
> > > +/**
> > > + * struct drm_gpusvm - GPU SVM structure
> > > + *
> > > + * @name: Name of the GPU SVM
> > > + * @drm: Pointer to the DRM device structure
> > > + * @mm: Pointer to the mm_struct for the address space
> > > + * @device_private_page_owner: Device private pages owner
> > > + * @mm_start: Start address of GPU SVM
> > > + * @mm_range: Range of the GPU SVM
> > > + * @notifier_size: Size of individual notifiers
> > > + * @ops: Pointer to the operations structure for GPU SVM
> > > + * @chunk_sizes: Pointer to the array of chunk sizes used in
> range
> > > allocation.
> > > + *               Entries should be powers of 2 in descending order.
> > > + * @num_chunks: Number of chunks
> > > + * @notifier_lock: Read-write semaphore for protecting notifier
> > > operations
> > > + * @zdd_wq: Workqueue for deferred work on zdd destruction
> > > + * @root: Cached root node of the Red-Black tree containing
> GPU
> > > SVM notifiers
> > > + * @notifier_list: List head of notifiers in the same order they appear in the
> > > + *                 interval tree. This is useful to keep iterating
> > > + *                 notifiers while doing modifications to the RB tree.
> > > + *
> > > + * This structure represents a GPU SVM (Shared Virtual Memory)
> > > used for tracking
> > > + * memory ranges mapped in a DRM (Direct Rendering Manager)
> > > device.
> > > + *
> > > + * No reference counting is provided, as this is expected to be
> > > embedded in the
> > > + * driver VM structure along with the struct drm_gpuvm, which
> > > handles reference
> > > + * counting.
> > > + */
> > > +struct drm_gpusvm {
> > > +	const char *name;
> > > +	struct drm_device *drm;
> > > +	struct mm_struct *mm;
> > > +	void *device_private_page_owner;
> > > +	u64 mm_start;
> > > +	u64 mm_range;
> > > +	u64 notifier_size;
> > > +	const struct drm_gpusvm_ops *ops;
> > > +	const u64 *chunk_sizes;
> > > +	int num_chunks;
> > > +	struct rw_semaphore notifier_lock;
> > > +	struct workqueue_struct *zdd_wq;
> > > +	struct rb_root_cached root;
> > > +	struct list_head notifier_list;
> > > +};
> >
> > I also think the gpusvm concept is a duplication of the drm_gpuvm.
> > Look at the members here, mm_start, mm_range, rb_tree...
> >
> > Maintaining a list of notifiers at this layer is odd. Everybody else seems
> > to embed the notifier in a range...
> >
> > The mm field is essential for SVM though. I think what we can do is
> > introduce an *mm field in drm_gpuvm and introduce uAPI to allow the user
> > to say one gpuvm participates in SVM. If one gpuvm participates in SVM,
> > we set the mm field for this gpuvm.
> >
> > Another benefit of the proposed way is that multiple gpuvms can share an
> > address space with a single CPU mm process.
> >
> >
> > Oak
> >
> >
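For illustration, a very rough sketch of what the suggestion above could look
like; the field, helper, and uAPI hook named here are hypothetical and not
existing drm_gpuvm API:

/* Hypothetical sketch only -- not existing drm_gpuvm code. */
struct drm_gpuvm {
	/* ... existing drm_gpuvm members ... */
	struct mm_struct *mm;	/* non-NULL iff this GPU VM participates in SVM */
};

/* Called from a (hypothetical) uAPI path that opts the VM in to SVM. */
static void driver_gpuvm_enable_svm(struct drm_gpuvm *gpuvm)
{
	mmgrab(current->mm);		/* keep the mm_struct alive while referenced */
	gpuvm->mm = current->mm;	/* several gpuvms may point at the same mm */
}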
> > > +
> > > +/**
> > > + * struct drm_gpusvm_ctx - DRM GPU SVM context
> > > + *
> > > + * @mmap_locked: mmap lock is locked
> > > + * @trylock_mmap: trylock mmap lock, used to avoid locking
> > > inversions
> > > + *                (e.g.dma-revs -> mmap lock)
> > > + * @in_notifier: entering from a MMU notifier
> > > + * @read_only: operating on read-only memory
> > > + * @vram_possible: possible to use VRAM
> > > + * @prefault: prefault pages
> > > + *
> > > + * Context that is DRM GPUSVM is operating in (i.e. user
> arguments).
> > > + */
> > > +struct drm_gpusvm_ctx {
> > > +	u32 mmap_locked :1;
> > > +	u32 trylock_mmap :1;
> > > +	u32 in_notifier :1;
> > > +	u32 read_only :1;
> > > +	u32 vram_possible :1;
> > > +	u32 prefault :1;
> > > +};
> > > +
> > > +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> > > +		    const char *name, struct drm_device *drm,
> > > +		    struct mm_struct *mm, void
> > > *device_private_page_owner,
> > > +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> > > +		    const struct drm_gpusvm_ops *ops,
> > > +		    const u64 *chunk_sizes, int num_chunks);
> > > +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm);
> > > +void drm_gpusvm_free(struct drm_gpusvm *gpusvm);
> > > +
> > > +struct drm_gpusvm_range *
> > > +drm_gpusvm_range_find_or_insert(struct drm_gpusvm
> *gpusvm,
> > > u64 fault_addr,
> > > +				u64 gpuva_start, u64 gpuva_end,
> > > +				const struct drm_gpusvm_ctx *ctx);
> > > +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> > > +			     struct drm_gpusvm_range *range);
> > > +
> > > +struct drm_gpusvm_range *
> > > +drm_gpusvm_range_get(struct drm_gpusvm_range *range);
> > > +void drm_gpusvm_range_put(struct drm_gpusvm_range
> *range);
> > > +
> > > +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm
> *gpusvm,
> > > +				  struct drm_gpusvm_range *range);
> > > +
> > > +int drm_gpusvm_range_get_pages(struct drm_gpusvm
> *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       const struct drm_gpusvm_ctx *ctx);
> > > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm
> > > *gpusvm,
> > > +				  struct drm_gpusvm_range *range,
> > > +				  const struct drm_gpusvm_ctx *ctx);
> > > +
> > > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm
> *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       void *vram_allocation,
> > > +			       const struct drm_gpusvm_ctx *ctx);
> > > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm
> *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       const struct drm_gpusvm_ctx *ctx);
> > > +
> > > +const struct dev_pagemap_ops
> > > *drm_gpusvm_pagemap_ops_get(void);
> > > +
> > > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm,
> u64
> > > start, u64 end);
> > > +
> > > +struct drm_gpusvm_range *
> > > +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier,
> u64
> > > start, u64 end);
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_lock - Lock GPU SVM notifier
> > > + * @gpusvm__: Pointer to the GPU SVM structure.
> > > + *
> > > + * Abstracts client usage of the GPU SVM notifier lock; takes the lock.
> > > + */
> > > +#define drm_gpusvm_notifier_lock(gpusvm__)	\
> > > +	down_read(&(gpusvm__)->notifier_lock)
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_unlock - Unlock GPU SVM notifier
> > > + * @gpusvm__: Pointer to the GPU SVM structure.
> > > + *
> > > + * Abstracts client usage of the GPU SVM notifier lock; drops the lock.
> > > + */
> > > +#define drm_gpusvm_notifier_unlock(gpusvm__)	\
> > > +	up_read(&(gpusvm__)->notifier_lock)
> > > +
> > > +/**
> > > + * __drm_gpusvm_range_next - Get the next GPU SVM range in
> the
> > > list
> > > + * @range: a pointer to the current GPU SVM range
> > > + *
> > > + * Return: A pointer to the next drm_gpusvm_range if available,
> or
> > > NULL if the
> > > + *         current range is the last one or if the input range is NULL.
> > > + */
> > > +static inline struct drm_gpusvm_range *
> > > +__drm_gpusvm_range_next(struct drm_gpusvm_range *range)
> > > +{
> > > +	if (range && !list_is_last(&range->rb.entry,
> > > +				   &range->notifier->range_list))
> > > +		return list_next_entry(range, rb.entry);
> > > +
> > > +	return NULL;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_for_each_range - Iterate over GPU SVM ranges
> in a
> > > notifier
> > > + * @range__: Iterator variable for the ranges. If set, it indicates
> the
> > > start of
> > > + *	     the iterator. If NULL, call drm_gpusvm_range_find() to get
> > > the range.
> > > + * @notifier__: Pointer to the GPU SVM notifier
> > > + * @start__: Start address of the range
> > > + * @end__: End address of the range
> > > + *
> > > + * This macro is used to iterate over GPU SVM ranges in a notifier.
> It
> > > is safe
> > > + * to use while holding the driver SVM lock or the notifier lock.
> > > + */
> > > +#define drm_gpusvm_for_each_range(range__, notifier__,
> start__,
> > > end__)	\
> > > +	for ((range__) = (range__) ?:
> > > 	\
> > > +	     drm_gpusvm_range_find((notifier__), (start__), (end__));
> > > 	\
> > > +	     (range__) && (range__->va.start < (end__));
> > > 	\
> > > +	     (range__) = __drm_gpusvm_range_next(range__))
> > > +
> > > +/**
> > > + * drm_gpusvm_range_set_unmapped - Mark a GPU SVM range
> as
> > > unmapped
> > > + * @range: Pointer to the GPU SVM range structure.
> > > + * @mmu_range: Pointer to the MMU notifier range structure.
> > > + *
> > > + * This function marks a GPU SVM range as unmapped and sets
> the
> > > partial_unmap flag
> > > + * if the range partially falls within the provided MMU notifier
> range.
> > > + */
> > > +static inline void
> > > +drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range
> > > *range,
> > > +			      const struct mmu_notifier_range
> > > *mmu_range)
> > > +{
> > > +	lockdep_assert_held_write(&range->gpusvm->notifier_lock);
> > > +
> > > +	range->flags.unmapped = true;
> > > +	if (range->va.start < mmu_range->start ||
> > > +	    range->va.end > mmu_range->end)
> > > +		range->flags.partial_unmap = true;
> > > +}
> > > +
> > > +#endif /* __DRM_GPUSVM_H__ */
> > > --
> > > 2.34.1
> >
> 
> --
> Simona Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
Matthew Brost Sept. 25, 2024, 9:12 p.m. UTC | #40
On Tue, Sep 24, 2024 at 04:30:06PM +0000, Matthew Brost wrote:
> On Tue, Sep 24, 2024 at 12:42:56PM +0200, Thomas Hellström wrote:
> > Hi, Matt,
> > 
> > Some random review comments on this patch I came across while looking
> > at multi-device.
> > 
> > Thanks,
> > Thomas
> > 
> > 
> > On Tue, 2024-08-27 at 19:48 -0700, Matthew Brost wrote:
> > > This patch introduces support for GPU Shared Virtual Memory (SVM) in
> > > the
> > > Direct Rendering Manager (DRM) subsystem. SVM allows for seamless
> > > sharing of memory between the CPU and GPU, enhancing performance and
> > > flexibility in GPU computing tasks.
> > > 
> > > The patch adds the necessary infrastructure for SVM, including data
> > > structures and functions for managing SVM ranges and notifiers. It
> > > also
> > > provides mechanisms for allocating, deallocating, and migrating
> > > memory
> > > regions between system RAM and GPU VRAM.
> > > 
> > > This mid-layer is largely inspired by GPUVM.
> > 
> > NIT: Naming, Should it be drm_svm rather than drm_gpusvm? For the
> > drm_gpuvm component, gpuvm clearly distinguished a gpu_vm from a
> > mm_struct but here we don't have the same need.
> > 
> 
> Can rename.
> 
> > > 
> > > Cc: Dave Airlie <airlied@redhat.com>
> > > Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > > Cc: Christian König <christian.koenig@amd.com>
> > > Cc: <dri-devel@lists.freedesktop.org>
> > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > ---
> > >  drivers/gpu/drm/xe/Makefile     |    3 +-
> > >  drivers/gpu/drm/xe/drm_gpusvm.c | 2174
> > > +++++++++++++++++++++++++++++++
> > >  drivers/gpu/drm/xe/drm_gpusvm.h |  415 ++++++
> > >  3 files changed, 2591 insertions(+), 1 deletion(-)
> > >  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.c
> > >  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.h
> > > 
> > > diff --git a/drivers/gpu/drm/xe/Makefile
> > > b/drivers/gpu/drm/xe/Makefile
> > > index b9670ae09a9e..b8fc2ee58f1a 100644
> > > --- a/drivers/gpu/drm/xe/Makefile
> > > +++ b/drivers/gpu/drm/xe/Makefile
> > > @@ -25,7 +25,8 @@ $(obj)/generated/%_wa_oob.c
> > > $(obj)/generated/%_wa_oob.h: $(obj)/xe_gen_wa_oob \
> > >  
> > >  # core driver code
> > >  
> > > -xe-y += xe_bb.o \
> > > +xe-y += drm_gpusvm.o \
> > > +	xe_bb.o \
> > >  	xe_bo.o \
> > >  	xe_bo_evict.o \
> > >  	xe_devcoredump.o \
> > > diff --git a/drivers/gpu/drm/xe/drm_gpusvm.c
> > > b/drivers/gpu/drm/xe/drm_gpusvm.c
> > > new file mode 100644
> > > index 000000000000..fc1e44e6ae72
> > > --- /dev/null
> > > +++ b/drivers/gpu/drm/xe/drm_gpusvm.c
> > > @@ -0,0 +1,2174 @@
> > > +// SPDX-License-Identifier: MIT
> > > +/*
> > > + * Copyright © 2024 Intel Corporation
> > > + *
> > > + * Authors:
> > > + *     Matthew Brost <matthew.brost@intel.com>
> > > + */
> > > +
> > > +#include <linux/dma-mapping.h>
> > > +#include <linux/interval_tree_generic.h>
> > > +#include <linux/hmm.h>
> > > +#include <linux/memremap.h>
> > > +#include <linux/migrate.h>
> > > +#include <linux/mm_types.h>
> > > +#include <linux/pagemap.h>
> > > +#include <linux/slab.h>
> > > +
> > > +#include <drm/drm_device.h>
> > > +#include "drm_gpusvm.h"
> > > +
> > > +/**
> > > + * DOC: Overview
> > > + *
> > > + * GPU Shared Virtual Memory (GPU SVM) layer for the Direct
> > > Rendering Manager (DRM)
> > > + *
> > > + * The GPU SVM layer is a component of the DRM framework designed to
> > > manage shared
> > > + * virtual memory between the CPU and GPU. It enables efficient data
> > > exchange and
> > > + * processing for GPU-accelerated applications by allowing memory
> > > sharing and
> > > + * synchronization between the CPU's and GPU's virtual address
> > > spaces.
> > > + *
> > > + * Key GPU SVM Components:
> > > + * - Notifiers: Used for tracking memory intervals and
> > > notifying the
> > > + *		GPU of changes, notifiers are sized based on a GPU
> > > SVM
> > > + *		initialization parameter, with a recommendation of
> > > 512M or
> > > + *		larger. They maintain a Red-Black tree and a list of
> > > ranges that
> > > + *		fall within the notifier interval. Notifiers are
> > > tracked within
> > > + *		a GPU SVM Red-Black tree and list and are
> > > dynamically inserted
> > > + *		or removed as ranges within the interval are created
> > > or
> > > + *		destroyed.
> > > + * - Ranges: Represent memory ranges mapped in a DRM device and
> > > managed
> > > + *	     by GPU SVM. They are sized based on an array of chunk
> > > sizes, which
> > > + *	     is a GPU SVM initialization parameter, and the CPU
> > > address space.
> > > + *	     Upon GPU fault, the largest aligned chunk that fits
> > > within the
> > > + *	     faulting CPU address space is chosen for the range
> > > size. Ranges are
> > > + *	     expected to be dynamically allocated on GPU fault and
> > > removed on an
> > > + *	     MMU notifier UNMAP event. As mentioned above, ranges
> > > are tracked in
> > > + *	     a notifier's Red-Black tree.
> > > + * - Operations: Define the interface for driver-specific SVM
> > > operations such as
> > > + *		 allocation, page collection, migration,
> > > invalidations, and VRAM
> > > + *		 release.
> > > + *
> > > + * This layer provides interfaces for allocating, mapping,
> > > migrating, and
> > > + * releasing memory ranges between the CPU and GPU. It handles all
> > > core memory
> > > + * management interactions (DMA mapping, HMM, and migration) and
> > > provides
> > > + * driver-specific virtual functions (vfuncs). This infrastructure
> > > is sufficient
> > > + * to build the expected driver components for an SVM implementation
> > > as detailed
> > > + * below.
> > > + *
> > > + * Expected Driver Components:
> > > + * - GPU page fault handler: Used to create ranges and notifiers
> > > based on the
> > > + *			     fault address, optionally migrate the
> > > range to
> > > + *			     VRAM, and create GPU bindings.
> > > + * - Garbage collector: Used to destroy GPU bindings for ranges.
> > > Ranges are
> > > + *			expected to be added to the garbage
> > > collector upon
> > > + *			MMU_NOTIFY_UNMAP event.
> > > + */
> > > +
> > > +/**
> > > + * DOC: Locking
> > > + *
> > > + * GPU SVM handles locking for core MM interactions, i.e., it
> > > locks/unlocks the
> > > + * mmap lock as needed. Alternatively, if the driver prefers to
> > > handle the mmap
> > > + * lock itself, a 'locked' argument is provided to the functions
> > > that require
> > > + * the mmap lock. This option may be useful for drivers that need to
> > > call into
> > > + * GPU SVM while also holding a dma-resv lock, thus preventing
> > > locking
> > > + * inversions between the mmap and dma-resv locks.
> > > + *
> > > + * GPU SVM introduces a global notifier lock, which safeguards the
> > > notifier's
> > > + * range RB tree and list, as well as the range's DMA mappings and
> > > sequence
> > > + * number. GPU SVM manages all necessary locking and unlocking
> > > operations,
> > > + * except for the recheck of the range's sequence number
> > > + * (mmu_interval_read_retry) when the driver is committing GPU
> > > bindings. This
> > > + * lock corresponds to the 'driver->update' lock mentioned in the
> > > HMM
> > > + * documentation (TODO: Link). Future revisions may transition from
> > > a GPU SVM
> > > + * global lock to a per-notifier lock if finer-grained locking is
> > > deemed
> > > + * necessary.
> > > + *
> > > + * In addition to the locking mentioned above, the driver should
> > > implement a
> > > + * lock to safeguard core GPU SVM function calls that modify state,
> > > such as
> > > + * drm_gpusvm_range_find_or_insert and drm_gpusvm_range_remove.
> > > Alternatively,
> > > + * these core functions can be called within a single kernel thread,
> > > for
> > > + * instance, using an ordered work queue. This lock is denoted as
> > > + * 'driver_svm_lock' in code examples.
> > > + */
> > > +
> > > +/**
> > > + * DOC: Migration
> > > + *
> > > + * The migration support is quite simple, allowing migration between
> > > SRAM and
> > > + * VRAM at the range granularity. For example, GPU SVM currently
> > > does not
> > > + * support mixing SRAM and VRAM pages within a range. This means
> > > that upon GPU
> > > + * fault, the entire range can be migrated to VRAM, and upon CPU
> > > fault, the
> > > + * entire range is migrated to SRAM.
> > > + *
> > > + * The reasoning for only supporting range granularity is as
> > > follows: it
> > > + * simplifies the implementation, and range sizes are driver-defined
> > > and should
> > > + * be relatively small.
> > > + */
> > > +
> > > +/**
> > > + * DOC: Partial Unmapping of Ranges
> > > + *
> > > + * Partial unmapping of ranges (e.g., 1M out of 2M is unmapped by
> > > CPU resulting
> > > + * in MMU_NOTIFY_UNMAP event) presents several challenges, with the
> > > main one
> > > + * being that a subset of the range still has CPU and GPU mappings.
> > > If the
> > > + * backing store for the range is in VRAM, a subset of the backing
> > > store has
> > > + * references. One option would be to split the range and VRAM
> > > backing store,
> > > + * but the implementation for this would be quite complicated. Given
> > > that
> > > + * partial unmappings are rare and driver-defined range sizes are
> > > relatively
> > > + * small, GPU SVM does not support splitting of ranges.
> > > + *
> > > + * With no support for range splitting, upon partial unmapping of a
> > > range, the
> > > + * driver is expected to invalidate and destroy the entire range. If
> > > the range
> > > + * has VRAM as its backing, the driver is also expected to migrate
> > > any remaining
> > > + * pages back to SRAM.
> > > + */
> > > +
> > > +/**
> > > + * DOC: Examples
> > > + *
> > > + * This section provides two examples of how to build the expected
> > > driver
> > > + * components: the GPU page fault handler and the garbage collector.
> > > A third
> > > + * example demonstrates a sample invalidation driver vfunc.
> > > + *
> > > + * The generic code provided does not include logic for complex
> > > migration
> > > + * policies, optimized invalidations, or other potentially required
> > > driver
> > > + * locking (e.g., DMA-resv locks).
> > > + *
> > > + * 1) GPU page fault handler
> > > + *
> > > + *	int driver_bind_range(struct drm_gpusvm *gpusvm, struct
> > > drm_gpusvm_range *range)
> > > + *	{
> > > + *		int err = 0;
> > > + *
> > > + *		driver_alloc_and_setup_memory_for_bind(gpusvm,
> > > range);
> > > + *
> > > + *		drm_gpusvm_notifier_lock(gpusvm);
> > > + *		if (drm_gpusvm_range_pages_valid(range))
> > > + *			driver_commit_bind(gpusvm, range);
> > > + *		else
> > > + *			err = -EAGAIN;
> > > + *		drm_gpusvm_notifier_unlock(gpusvm);
> > > + *
> > > + *		return err;
> > > + *	}
> > > + *
> > > + *	int driver_gpu_fault(struct drm_gpusvm *gpusvm, u64
> > > fault_addr,
> > > + *			     u64 gpuva_start, u64 gpuva_end)
> > > + *	{
> > > + *		struct drm_gpusvm_ctx ctx = {};
> > > + *		int err;
> > > + *
> > > + *		driver_svm_lock();
> > > + *	retry:
> > > + *		// Always process UNMAPs first so view of GPU SVM
> > > ranges is current
> > > + *		driver_garbage_collector(gpusvm);
> > > + *
> > > + *		range = drm_gpusvm_range_find_or_insert(gpusvm,
> > > fault_addr,
> > > + *							gpuva_start,
> > > gpuva_end,
> > > + *						        &ctx);
> > > + *		if (IS_ERR(range)) {
> > > + *			err = PTR_ERR(range);
> > > + *			goto unlock;
> > > + *		}
> > > + *
> > > + *		if (driver_migration_policy(range)) {
> > > + *			bo = driver_alloc_bo();
> > > + *			err = drm_gpusvm_migrate_to_vram(gpusvm,
> > > range, bo, &ctx);
> > > + *			if (err)	// CPU mappings may have
> > > changed
> > > + *				goto retry;
> > > + *		}
> > > + *
> > > + *		err = drm_gpusvm_range_get_pages(gpusvm, range,
> > > &ctx);
> > > + *		if (err == -EFAULT || err == -EPERM)	// CPU
> > > mappings changed
> > > + *			goto retry;
> > > + *		else if (err)
> > > + *			goto unlock;
> > > + *
> > > + *		err = driver_bind_range(gpusvm, range);
> > > + *		if (err == -EAGAIN)	// CPU mappings changed
> > > + *			goto retry;
> > > + *
> > > + *	unlock:
> > > + *		driver_svm_unlock();
> > > + *		return err;
> > > + *	}
> > > + *
> > > + * 2) Garbage Collector.
> > > + *
> > > + *	void __driver_garbage_collector(struct drm_gpusvm *gpusvm,
> > > + *					struct drm_gpusvm_range
> > > *range)
> > > + *	{
> > > + *		struct drm_gpusvm_ctx ctx = {};
> > > + *
> > > + *		assert_driver_svm_locked(gpusvm);
> > > + *
> > > + *		// Partial unmap, migrate any remaining VRAM pages
> > > back to SRAM
> > > + *		if (range->flags.partial_unmap)
> > > + *			drm_gpusvm_migrate_to_sram(gpusvm, range,
> > > &ctx);
> > > + *
> > > + *		driver_unbind_range(range);
> > > + *		drm_gpusvm_range_remove(gpusvm, range);
> > > + *	}
> > > + *
> > > + *	void driver_garbage_collector(struct drm_gpusvm *gpusvm)
> > > + *	{
> > > + *		assert_driver_svm_locked(gpusvm);
> > > + *
> > > + *		for_each_range_in_garbage_collector(gpusvm, range)
> > > + *			__driver_garbage_collector(gpusvm, range);
> > > + *	}
> > > + *
> > > + * 3) Invalidation driver vfunc.
> > > + *
> > > + *	void driver_invalidation(struct drm_gpusvm *gpusvm,
> > > + *				 struct drm_gpusvm_notifier
> > > *notifier,
> > > + *				 const struct mmu_notifier_range
> > > *mmu_range)
> > > + *	{
> > > + *		struct drm_gpusvm_ctx ctx = { .in_notifier = true,
> > > };
> > > + *		struct drm_gpusvm_range *range = NULL;
> > > + *
> > > + *		driver_invalidate_device_tlb(gpusvm, mmu_range-
> > > >start, mmu_range->end);
> > > + *
> > > + *		drm_gpusvm_for_each_range(range, notifier,
> > > mmu_range->start,
> > > + *					  mmu_range->end) {
> > > + *			drm_gpusvm_range_unmap_pages(gpusvm, range,
> > > &ctx);
> > > + *
> > > + *			if (mmu_range->event != MMU_NOTIFY_UNMAP)
> > > + *				continue;
> > > + *
> > > + *			drm_gpusvm_range_set_unmapped(range,
> > > mmu_range);
> > > + *			driver_garbage_collector_add(gpusvm, range);
> > > + *		}
> > > + *	}
> > > + */
> > > +
> > > +#define DRM_GPUSVM_RANGE_START(_range)	((_range)->va.start)
> > > +#define DRM_GPUSVM_RANGE_END(_range)	((_range)->va.end - 1)
> > > +INTERVAL_TREE_DEFINE(struct drm_gpusvm_range, rb.node, u64,
> > > rb.__subtree_last,
> > > +		     DRM_GPUSVM_RANGE_START, DRM_GPUSVM_RANGE_END,
> > > +		     static __maybe_unused, range);
> > > +
> > > +#define DRM_GPUSVM_NOTIFIER_START(_notifier)	((_notifier)-
> > > >interval.start)
> > > +#define DRM_GPUSVM_NOTIFIER_END(_notifier)	((_notifier)-
> > > >interval.end - 1)
> > > +INTERVAL_TREE_DEFINE(struct drm_gpusvm_notifier, rb.node, u64,
> > > +		     rb.__subtree_last, DRM_GPUSVM_NOTIFIER_START,
> > > +		     DRM_GPUSVM_NOTIFIER_END, static __maybe_unused,
> > > notifier);
> > > +
> > 
> > Since these trees span struct mm_struct address space which should fit
> > in an unsigned long, can we use the generic version (interval_tree.h)
> > rather than instantiating two new versions? I figure both contain
> > overlapping ranges so we can't use maple trees?
> > 
> 
> I can look into using a generic version, but actually I don't think we
> allow overlapping, so a maple tree might work here too. I'll likely use
> the generic version in the next rev, but if the consensus is maple tree
> we can switch over to that fairly easily at any point in time, as the
> tree interaction is completely encapsulated in the DRM SVM layer.
> 
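For reference, switching over to the generic interval_tree.h helpers could
look roughly like this on the notifier side (untested sketch; it assumes
the address space fits in unsigned long, that an interval_tree_node named
itree is embedded in drm_gpusvm_notifier, and that gpusvm->root stays an
rb_root_cached; note the generic tree uses inclusive last addresses):

	#include <linux/interval_tree.h>

	static void drm_gpusvm_notifier_insert(struct drm_gpusvm *gpusvm,
					       struct drm_gpusvm_notifier *notifier)
	{
		/* generic interval tree takes [start, last] inclusive */
		notifier->itree.start = notifier->interval.start;
		notifier->itree.last = notifier->interval.end - 1;
		interval_tree_insert(&notifier->itree, &gpusvm->root);
	}

	static struct drm_gpusvm_notifier *
	drm_gpusvm_notifier_find(struct drm_gpusvm *gpusvm,
				 unsigned long fault_addr)
	{
		struct interval_tree_node *node;

		/* point query for the notifier covering fault_addr */
		node = interval_tree_iter_first(&gpusvm->root, fault_addr,
						fault_addr);

		return node ? container_of(node, struct drm_gpusvm_notifier,
					   itree) : NULL;
	}

The range side would be the same pattern against notifier->root.
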
> > > +/**
> > > + * npages_in_range() - Calculate the number of pages in a given
> > > range
> > > + * @start__: The start address of the range
> > > + * @end__: The end address of the range
> > > + *
> > > + * This macro calculates the number of pages in a given memory
> > > range,
> > > + * specified by the start and end addresses. It divides the
> > > difference
> > > + * between the end and start addresses by the page size (PAGE_SIZE)
> > > to
> > > + * determine the number of pages in the range.
> > > + *
> > > + * Return: The number of pages in the specified range.
> > > + */
> > > +#define npages_in_range(start__, end__)	\
> > > +	(((end__) - (start__)) >> PAGE_SHIFT)
> > > +
> > > +/**
> > > + * struct drm_gpusvm_zdd - GPU SVM zone device data
> > > + *
> > > + * @refcount: Reference count for the zdd
> > > + * @destroy_work: Work structure for asynchronous zdd destruction
> > > + * @range: Pointer to the GPU SVM range
> > > + * @vram_allocation: Driver-private pointer to the VRAM allocation
> > > + *
> > > + * This structure serves as a generic wrapper installed in
> > > + * page->zone_device_data. It provides infrastructure for looking up
> > > a range
> > > + * upon CPU page fault and asynchronously releasing VRAM once the
> > > CPU has no
> > > + * page references. Asynchronous release is useful because CPU page
> > > references
> > > + * can be dropped in IRQ contexts, while releasing VRAM likely
> > > requires sleeping
> > > + * locks.
> > > + */
> > > +struct drm_gpusvm_zdd {
> > > +	struct kref refcount;
> > > +	struct work_struct destroy_work;
> > > +	struct drm_gpusvm_range *range;
> >  
> > I still believe previous review comments are valid here, considering we
> > do have multiple drm_gpusvm per struct mm_struct, potentially all
> > mapping the above page.
> > 
> 
> Exactly which comments?
> 
> If it's related to the range pointer, that is going to be dropped. All
> virtual references from zdd will be dropped (i.e. no pointer to even a
> DRM SVM).
> 
> > > +	void *vram_allocation;
> > 
> > NIT: Naming. The core is using device memory or devmem. Should we
> > follow?
> >
> 
> I like devmem. Will change.
>  
> > Also, could we, rather than using a void *, use an embeddable struct
> > with its own ops instead of the gpusvm ops for this?
> > 
> 
> Can you give me a code snippet example of what you think this should look
> like? Not opposed to this.
> 

After reading your write-up on multi-device, yes, I think an embeddable
struct with its own ops makes sense.

I think the following ops should be moved to the embeddable struct:

vram_release
populate_vram_pfn
copy_to_vram
copy_to_sram

Also the local vram_attach, vram_detach, and vram_detached helpers in the
branch I shared.

We likely want the device which owns the vram allocation in the embedded
struct too, so that it can be used for DMA mapping when triggering
migration to a remote device or when a migrate_to_ram callback occurs.
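
Roughly something like this (untested sketch, all names and signatures are
strawmen):

	struct drm_gpusvm_devmem;

	struct drm_gpusvm_devmem_ops {
		/* drop the driver-side reference to the allocation */
		void (*devmem_release)(struct drm_gpusvm_devmem *devmem);
		/* fill @pfn with npages device-private PFNs */
		int (*populate_devmem_pfn)(struct drm_gpusvm_devmem *devmem,
					   unsigned long npages,
					   unsigned long *pfn);
		int (*copy_to_devmem)(struct page **pages,
				      dma_addr_t *dma_addr,
				      unsigned long npages);
		int (*copy_to_ram)(struct page **pages,
				   dma_addr_t *dma_addr,
				   unsigned long npages);
	};

	struct drm_gpusvm_devmem {
		/* device owning the allocation, used for dma mapping */
		struct device *dev;
		const struct drm_gpusvm_devmem_ops *ops;
	};

drm_gpusvm_migrate_to_vram (or rather _to_devmem at that point) would then
take a struct drm_gpusvm_devmem * instead of the void *vram_allocation and
call these ops rather than the gpusvm-wide ones.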

Matt

> > > +};
> > > +
> > > +/**
> > > + * drm_gpusvm_zdd_destroy_work_func - Work function for destroying a
> > > zdd
> > > + * @w: Pointer to the work_struct
> > > + *
> > > + * This function releases VRAM, puts GPU SVM range, and frees zdd.
> > > + */
> > > +static void drm_gpusvm_zdd_destroy_work_func(struct work_struct *w)
> > > +{
> > > +	struct drm_gpusvm_zdd *zdd =
> > > +		container_of(w, struct drm_gpusvm_zdd,
> > > destroy_work);
> > > +	struct drm_gpusvm_range *range = zdd->range;
> > > +	struct drm_gpusvm *gpusvm = range->gpusvm;
> > > +
> > > +	if (gpusvm->ops->vram_release && zdd->vram_allocation)
> > > +		gpusvm->ops->vram_release(zdd->vram_allocation);
> > > +	drm_gpusvm_range_put(range);
> > > +	kfree(zdd);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_zdd_alloc - Allocate a zdd structure.
> > > + * @range: Pointer to the GPU SVM range.
> > > + *
> > > + * This function allocates and initializes a new zdd structure. It
> > > sets up the
> > > + * reference count, initializes the destroy work, and links the
> > > provided GPU SVM
> > > + * range.
> > > + *
> > > + * Returns:
> > > + * Pointer to the allocated zdd on success, NULL on failure.
> > > + */
> > > +static struct drm_gpusvm_zdd *
> > > +drm_gpusvm_zdd_alloc(struct drm_gpusvm_range *range)
> > > +{
> > > +	struct drm_gpusvm_zdd *zdd;
> > > +
> > > +	zdd = kmalloc(sizeof(*zdd), GFP_KERNEL);
> > > +	if (!zdd)
> > > +		return NULL;
> > > +
> > > +	kref_init(&zdd->refcount);
> > > +	INIT_WORK(&zdd->destroy_work,
> > > drm_gpusvm_zdd_destroy_work_func);
> > > +	zdd->range = drm_gpusvm_range_get(range);
> > > +	zdd->vram_allocation = NULL;
> > > +
> > > +	return zdd;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_zdd_get - Get a reference to a zdd structure.
> > > + * @zdd: Pointer to the zdd structure.
> > > + *
> > > + * This function increments the reference count of the provided zdd
> > > structure.
> > > + *
> > > + * Returns: Pointer to the zdd structure.
> > > + */
> > > +static struct drm_gpusvm_zdd *drm_gpusvm_zdd_get(struct
> > > drm_gpusvm_zdd *zdd)
> > > +{
> > > +	kref_get(&zdd->refcount);
> > > +	return zdd;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_zdd_destroy - Destroy a zdd structure.
> > > + * @ref: Pointer to the reference count structure.
> > > + *
> > > + * This function queues the destroy_work of the zdd for asynchronous
> > > destruction.
> > > + */
> > > +static void drm_gpusvm_zdd_destroy(struct kref *ref)
> > > +{
> > > +	struct drm_gpusvm_zdd *zdd =
> > > +		container_of(ref, struct drm_gpusvm_zdd, refcount);
> > > +	struct drm_gpusvm *gpusvm = zdd->range->gpusvm;
> > > +
> > > +	queue_work(gpusvm->zdd_wq, &zdd->destroy_work);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_zdd_put - Put a zdd reference.
> > > + * @zdd: Pointer to the zdd structure.
> > > + *
> > > + * This function decrements the reference count of the provided zdd
> > > structure
> > > + * and schedules its destruction if the count drops to zero.
> > > + */
> > > +static void drm_gpusvm_zdd_put(struct drm_gpusvm_zdd *zdd)
> > > +{
> > > +	kref_put(&zdd->refcount, drm_gpusvm_zdd_destroy);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_find - Find GPU SVM range from GPU SVM notifier
> > > + * @notifier: Pointer to the GPU SVM notifier structure.
> > > + * @start: Start address of the range
> > > + * @end: End address of the range
> > > + *
> > > + * Return: A pointer to the drm_gpusvm_range if found or NULL
> > > + */
> > > +struct drm_gpusvm_range *
> > > +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64
> > > start, u64 end)
> > > +{
> > > +	return range_iter_first(&notifier->root, start, end - 1);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_for_each_range_safe - Safely iterate over GPU SVM
> > > ranges in a notifier
> > > + * @range__: Iterator variable for the ranges
> > > + * @next__: Iterator variable for the ranges' temporary storage
> > > + * @notifier__: Pointer to the GPU SVM notifier
> > > + * @start__: Start address of the range
> > > + * @end__: End address of the range
> > > + *
> > > + * This macro is used to iterate over GPU SVM ranges in a notifier
> > > while
> > > + * removing ranges from it.
> > > + */
> > > +#define drm_gpusvm_for_each_range_safe(range__, next__, notifier__,
> > > start__, end__)	\
> > > +	for ((range__) = drm_gpusvm_range_find((notifier__),
> > > (start__), (end__)),	\
> > > +	     (next__) =
> > > __drm_gpusvm_range_next(range__);				\
> > > +	     (range__) && (range__->va.start <
> > > (end__));				\
> > > +	     (range__) = (next__), (next__) =
> > > __drm_gpusvm_range_next(range__))
> > > +
> > > +/**
> > > + * __drm_gpusvm_notifier_next - get the next drm_gpusvm_notifier in
> > > the list
> > > + * @notifier: a pointer to the current drm_gpusvm_notifier
> > > + *
> > > + * Return: A pointer to the next drm_gpusvm_notifier if available,
> > > or NULL if
> > > + *         the current notifier is the last one or if the input
> > > notifier is
> > > + *         NULL.
> > > + */
> > > +static struct drm_gpusvm_notifier *
> > > +__drm_gpusvm_notifier_next(struct drm_gpusvm_notifier *notifier)
> > > +{
> > > +	if (notifier && !list_is_last(&notifier->rb.entry,
> > > +				      &notifier->gpusvm-
> > > >notifier_list))
> > > +		return list_next_entry(notifier, rb.entry);
> > > +
> > > +	return NULL;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_for_each_notifier - Iterate over GPU SVM notifiers in
> > > a gpusvm
> > > + * @notifier__: Iterator variable for the notifiers
> > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > + * @start__: Start address of the notifier
> > > + * @end__: End address of the notifier
> > > + *
> > > + * This macro is used to iterate over GPU SVM notifiers in a gpusvm.
> > > + */
> > > +#define drm_gpusvm_for_each_notifier(notifier__, gpusvm__, start__,
> > > end__)		\
> > > +	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root,
> > > (start__), (end__) - 1);	\
> > > +	     (notifier__) && (notifier__->interval.start <
> > > (end__));			\
> > > +	     (notifier__) = __drm_gpusvm_notifier_next(notifier__))
> > > +
> > > +/**
> > > + * drm_gpusvm_for_each_notifier_safe - Safely iterate over GPU SVM
> > > notifiers in a gpusvm
> > > + * @notifier__: Iterator variable for the notifiers
> > > + * @next__: Iterator variable for the notifiers' temporary storage
> > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > + * @start__: Start address of the notifier
> > > + * @end__: End address of the notifier
> > > + *
> > > + * This macro is used to iterate over GPU SVM notifiers in a gpusvm
> > > while
> > > + * removing notifiers from it.
> > > + */
> > > +#define drm_gpusvm_for_each_notifier_safe(notifier__, next__,
> > > gpusvm__, start__, end__)	\
> > > +	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root,
> > > (start__), (end__) - 1),	\
> > > +	     (next__) =
> > > __drm_gpusvm_notifier_next(notifier__);				\
> > > +	     (notifier__) && (notifier__->interval.start <
> > > (end__));			\
> > > +	     (notifier__) = (next__), (next__) =
> > > __drm_gpusvm_notifier_next(notifier__))
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_invalidate - Invalidate a GPU SVM notifier.
> > > + * @mni: Pointer to the mmu_interval_notifier structure.
> > > + * @mmu_range: Pointer to the mmu_notifier_range structure.
> > > + * @cur_seq: Current sequence number.
> > > + *
> > > + * This function serves as a generic MMU notifier for GPU SVM. It
> > > sets the MMU
> > > + * notifier sequence number and calls the driver invalidate vfunc
> > > under
> > > + * gpusvm->notifier_lock.
> > > + *
> > > + * Returns:
> > > + * true if the operation succeeds, false otherwise.
> > > + */
> > > +static bool
> > > +drm_gpusvm_notifier_invalidate(struct mmu_interval_notifier *mni,
> > > +			       const struct mmu_notifier_range
> > > *mmu_range,
> > > +			       unsigned long cur_seq)
> > > +{
> > > +	struct drm_gpusvm_notifier *notifier =
> > > +		container_of(mni, typeof(*notifier), notifier);
> > > +	struct drm_gpusvm *gpusvm = notifier->gpusvm;
> > > +
> > > +	if (!mmu_notifier_range_blockable(mmu_range))
> > > +		return false;
> > > +
> > > +	down_write(&gpusvm->notifier_lock);
> > > +	mmu_interval_set_seq(mni, cur_seq);
> > > +	gpusvm->ops->invalidate(gpusvm, notifier, mmu_range);
> > > +	up_write(&gpusvm->notifier_lock);
> > > +
> > > +	return true;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_ops - MMU interval notifier operations for
> > > GPU SVM
> > > + */
> > > +static const struct mmu_interval_notifier_ops
> > > drm_gpusvm_notifier_ops = {
> > > +	.invalidate = drm_gpusvm_notifier_invalidate,
> > > +};
> > > +
> > > +/**
> > > + * drm_gpusvm_init - Initialize the GPU SVM.
> > > + * @gpusvm: Pointer to the GPU SVM structure.
> > > + * @name: Name of the GPU SVM.
> > > + * @drm: Pointer to the DRM device structure.
> > > + * @mm: Pointer to the mm_struct for the address space.
> > > + * @device_private_page_owner: Device private pages owner.
> > > + * @mm_start: Start address of GPU SVM.
> > > + * @mm_range: Range of the GPU SVM.
> > > + * @notifier_size: Size of individual notifiers.
> > > + * @ops: Pointer to the operations structure for GPU SVM.
> > > + * @chunk_sizes: Pointer to the array of chunk sizes used in range
> > > allocation.
> > > + *               Entries should be powers of 2 in descending order
> > > with last
> > > + *               entry being SZ_4K.
> > > + * @num_chunks: Number of chunks.
> > > + *
> > > + * This function initializes the GPU SVM.
> > > + *
> > > + * Returns:
> > > + * 0 on success, a negative error code on failure.
> > > + */
> > > +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> > > +		    const char *name, struct drm_device *drm,
> > > +		    struct mm_struct *mm, void
> > > *device_private_page_owner,
> > > +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> > > +		    const struct drm_gpusvm_ops *ops,
> > > +		    const u64 *chunk_sizes, int num_chunks)
> > > +{
> > > +	if (!ops->invalidate || !num_chunks)
> > > +		return -EINVAL;
> > > +
> > > +	gpusvm->name = name;
> > > +	gpusvm->drm = drm;
> > > +	gpusvm->mm = mm;
> > > +	gpusvm->device_private_page_owner =
> > > device_private_page_owner;
> > > +	gpusvm->mm_start = mm_start;
> > > +	gpusvm->mm_range = mm_range;
> > > +	gpusvm->notifier_size = notifier_size;
> > > +	gpusvm->ops = ops;
> > > +	gpusvm->chunk_sizes = chunk_sizes;
> > > +	gpusvm->num_chunks = num_chunks;
> > > +	gpusvm->zdd_wq = system_wq;
> > > +
> > > +	mmgrab(mm);
> > > +	gpusvm->root = RB_ROOT_CACHED;
> > > +	INIT_LIST_HEAD(&gpusvm->notifier_list);
> > > +
> > > +	init_rwsem(&gpusvm->notifier_lock);
> > > +
> > > +	fs_reclaim_acquire(GFP_KERNEL);
> > > +	might_lock(&gpusvm->notifier_lock);
> > > +	fs_reclaim_release(GFP_KERNEL);
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_find - Find GPU SVM notifier
> > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > + * @fault_addr__: Fault address
> > > + *
> > > + * This macro finds the GPU SVM notifier associated with the fault
> > > address.
> > > + *
> > > + * Returns:
> > > + * Pointer to the GPU SVM notifier on success, NULL otherwise.
> > > + */
> > > +#define drm_gpusvm_notifier_find(gpusvm__, fault_addr__)	\
> > > +	notifier_iter_first(&(gpusvm__)->root, (fault_addr__),	\
> > > +			    (fault_addr__ + 1))
> > > +
> > > +/**
> > > + * to_drm_gpusvm_notifier - retrieve the container struct for a
> > > given rbtree node
> > > + * @node__: a pointer to the rbtree node embedded within a
> > > drm_gpusvm_notifier struct
> > > + *
> > > + * Return: A pointer to the containing drm_gpusvm_notifier
> > > structure.
> > > + */
> > > +#define to_drm_gpusvm_notifier(node__)				\
> > > +	container_of((node__), struct drm_gpusvm_notifier, rb.node)
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_insert - Insert GPU SVM notifier
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > + *
> > > + * This function inserts the GPU SVM notifier into the GPU SVM RB
> > > tree and list.
> > > + */
> > > +static void drm_gpusvm_notifier_insert(struct drm_gpusvm *gpusvm,
> > > +				       struct drm_gpusvm_notifier
> > > *notifier)
> > > +{
> > > +	struct rb_node *node;
> > > +	struct list_head *head;
> > > +
> > > +	notifier_insert(notifier, &gpusvm->root);
> > > +
> > > +	node = rb_prev(&notifier->rb.node);
> > > +	if (node)
> > > +		head = &(to_drm_gpusvm_notifier(node))->rb.entry;
> > > +	else
> > > +		head = &gpusvm->notifier_list;
> > > +
> > > +	list_add(&notifier->rb.entry, head);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_remove - Remove GPU SVM notifier
> > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > + * @notifier__: Pointer to the GPU SVM notifier structure
> > > + *
> > > + * This macro removes the GPU SVM notifier from the GPU SVM RB tree
> > > and list.
> > > + */
> > > +#define drm_gpusvm_notifier_remove(gpusvm__, notifier__)	\
> > > +	notifier_remove((notifier__), &(gpusvm__)->root);	\
> > > +	list_del(&(notifier__)->rb.entry)
> > > +
> > > +/**
> > > + * drm_gpusvm_fini - Finalize the GPU SVM.
> > > + * @gpusvm: Pointer to the GPU SVM structure.
> > > + *
> > > + * This function finalizes the GPU SVM by cleaning up any remaining
> > > ranges and
> > > + * notifiers, and dropping a reference to the mm_struct.
> > > + */
> > > +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm)
> > > +{
> > > +	struct drm_gpusvm_notifier *notifier, *next;
> > > +
> > > +	drm_gpusvm_for_each_notifier_safe(notifier, next, gpusvm, 0,
> > > LONG_MAX) {
> > > +		struct drm_gpusvm_range *range, *__next;
> > > +
> > > +		/*
> > > +		 * Remove notifier first to avoid racing with any
> > > invalidation
> > > +		 */
> > > +		mmu_interval_notifier_remove(&notifier->notifier);
> > > +		notifier->flags.removed = true;
> > > +
> > > +		drm_gpusvm_for_each_range_safe(range, __next,
> > > notifier, 0,
> > > +					       LONG_MAX)
> > > +			drm_gpusvm_range_remove(gpusvm, range);
> > > +	}
> > > +
> > > +	mmdrop(gpusvm->mm);
> > > +	WARN_ON(!RB_EMPTY_ROOT(&gpusvm->root.rb_root));
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_alloc - Allocate GPU SVM notifier
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @fault_addr: Fault address
> > > + *
> > > + * This function allocates and initializes the GPU SVM notifier
> > > structure.
> > > + *
> > > + * Returns:
> > > + * Pointer to the allocated GPU SVM notifier on success, ERR_PTR()
> > > on failure.
> > > + */
> > > +static struct drm_gpusvm_notifier *
> > > +drm_gpusvm_notifier_alloc(struct drm_gpusvm *gpusvm, u64 fault_addr)
> > > +{
> > > +	struct drm_gpusvm_notifier *notifier;
> > > +
> > > +	if (gpusvm->ops->notifier_alloc)
> > > +		notifier = gpusvm->ops->notifier_alloc();
> > > +	else
> > > +		notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
> > > +
> > > +	if (!notifier)
> > > +		return ERR_PTR(-ENOMEM);
> > > +
> > > +	notifier->gpusvm = gpusvm;
> > > +	notifier->interval.start = ALIGN_DOWN(fault_addr, gpusvm-
> > > >notifier_size);
> > > +	notifier->interval.end = ALIGN(fault_addr + 1, gpusvm-
> > > >notifier_size);
> > > +	INIT_LIST_HEAD(&notifier->rb.entry);
> > > +	notifier->root = RB_ROOT_CACHED;
> > > +	INIT_LIST_HEAD(&notifier->range_list);
> > > +
> > > +	return notifier;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_free - Free GPU SVM notifier
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > + *
> > > + * This function frees the GPU SVM notifier structure.
> > > + */
> > > +static void drm_gpusvm_notifier_free(struct drm_gpusvm *gpusvm,
> > > +				     struct drm_gpusvm_notifier
> > > *notifier)
> > > +{
> > > +	WARN_ON(!RB_EMPTY_ROOT(&notifier->root.rb_root));
> > > +
> > > +	if (gpusvm->ops->notifier_free)
> > > +		gpusvm->ops->notifier_free(notifier);
> > > +	else
> > > +		kfree(notifier);
> > > +}
> > > +
> > > +/**
> > > + * to_drm_gpusvm_range - retrieve the container struct for a given
> > > rbtree node
> > > + * @node__: a pointer to the rbtree node embedded within a
> > > drm_gpusvm_range struct
> > > + *
> > > + * Return: A pointer to the containing drm_gpusvm_range structure.
> > > + */
> > > +#define to_drm_gpusvm_range(node__)	\
> > > +	container_of((node__), struct drm_gpusvm_range, rb.node)
> > > +
> > > +/**
> > > + * drm_gpusvm_range_insert - Insert GPU SVM range
> > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + *
> > > + * This function inserts the GPU SVM range into the notifier RB tree
> > > and list.
> > > + */
> > > +static void drm_gpusvm_range_insert(struct drm_gpusvm_notifier
> > > *notifier,
> > > +				    struct drm_gpusvm_range *range)
> > > +{
> > > +	struct rb_node *node;
> > > +	struct list_head *head;
> > > +
> > > +	drm_gpusvm_notifier_lock(notifier->gpusvm);
> > > +	range_insert(range, &notifier->root);
> > > +
> > > +	node = rb_prev(&range->rb.node);
> > > +	if (node)
> > > +		head = &(to_drm_gpusvm_range(node))->rb.entry;
> > > +	else
> > > +		head = &notifier->range_list;
> > > +
> > > +	list_add(&range->rb.entry, head);
> > > +	drm_gpusvm_notifier_unlock(notifier->gpusvm);
> > > +}
> > > +
> > > +/**
> > > + * __drm_gpusvm_range_remove - Remove GPU SVM range
> > > + * @notifier__: Pointer to the GPU SVM notifier structure
> > > + * @range__: Pointer to the GPU SVM range structure
> > > + *
> > > + * This macro removes the GPU SVM range from the notifier RB tree
> > > and list.
> > > + */
> > > +#define __drm_gpusvm_range_remove(notifier__, range__)		\
> > > +	range_remove((range__), &(notifier__)->root);		\
> > > +	list_del(&(range__)->rb.entry)
> > > +
> > > +/**
> > > + * drm_gpusvm_range_alloc - Allocate GPU SVM range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > + * @fault_addr: Fault address
> > > + * @chunk_size: Chunk size
> > > + * @migrate_vram: Flag indicating whether to migrate VRAM
> > > + *
> > > + * This function allocates and initializes the GPU SVM range
> > > structure.
> > > + *
> > > + * Returns:
> > > + * Pointer to the allocated GPU SVM range on success, ERR_PTR() on
> > > failure.
> > > + */
> > > +static struct drm_gpusvm_range *
> > > +drm_gpusvm_range_alloc(struct drm_gpusvm *gpusvm,
> > > +		       struct drm_gpusvm_notifier *notifier,
> > > +		       u64 fault_addr, u64 chunk_size, bool
> > > migrate_vram)
> > > +{
> > > +	struct drm_gpusvm_range *range;
> > > +
> > > +	if (gpusvm->ops->range_alloc)
> > > +		range = gpusvm->ops->range_alloc(gpusvm);
> > > +	else
> > > +		range = kzalloc(sizeof(*range), GFP_KERNEL);
> > > +
> > > +	if (!range)
> > > +		return ERR_PTR(-ENOMEM);
> > > +
> > > +	kref_init(&range->refcount);
> > > +	range->gpusvm = gpusvm;
> > > +	range->notifier = notifier;
> > > +	range->va.start = ALIGN_DOWN(fault_addr, chunk_size);
> > > +	range->va.end = ALIGN(fault_addr + 1, chunk_size);
> > > +	INIT_LIST_HEAD(&range->rb.entry);
> > > +	range->notifier_seq = LONG_MAX;
> > > +	range->flags.migrate_vram = migrate_vram ? 1 : 0;
> > > +
> > > +	return range;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_check_pages - Check pages
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > + * @start: Start address
> > > + * @end: End address
> > > + *
> > > + * Check if pages between start and end have been faulted in on the
> > > CPU. Used to
> > > + * prevent migration of pages without CPU backing store.
> > > + *
> > > + * Returns:
> > > + * True if pages have been faulted into CPU, False otherwise
> > > + */
> > > +static bool drm_gpusvm_check_pages(struct drm_gpusvm *gpusvm,
> > > +				   struct drm_gpusvm_notifier
> > > *notifier,
> > > +				   u64 start, u64 end)
> > > +{
> > > +	struct hmm_range hmm_range = {
> > > +		.default_flags = 0,
> > > +		.notifier = &notifier->notifier,
> > > +		.start = start,
> > > +		.end = end,
> > > +		.dev_private_owner = gpusvm-
> > > >device_private_page_owner,
> > > +	};
> > > +	unsigned long timeout =
> > > +		jiffies +
> > > msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> > > +	unsigned long *pfns;
> > > +	unsigned long npages = npages_in_range(start, end);
> > > +	int err, i;
> > > +
> > > +	mmap_assert_locked(gpusvm->mm);
> > > +
> > > +	pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
> > > +	if (!pfns)
> > > +		return false;
> > > +
> > > +	hmm_range.notifier_seq = mmu_interval_read_begin(&notifier-
> > > >notifier);
> > > +	hmm_range.hmm_pfns = pfns;
> > > +
> > > +	while (true) {
> > > +		err = hmm_range_fault(&hmm_range);
> > > +		if (err == -EBUSY) {
> > > +			if (time_after(jiffies, timeout))
> > > +				break;
> > > +
> > > +			hmm_range.notifier_seq =
> > > mmu_interval_read_begin(&notifier->notifier);
> > > +			continue;
> > > +		}
> > > +		break;
> > > +	}
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	for (i = 0; i < npages; ++i) {
> > > +		if (!(pfns[i] & HMM_PFN_VALID)) {
> > > +			err = -EFAULT;
> > > +			goto err_free;
> > > +		}
> > > +	}
> > > +
> > > +err_free:
> > > +	kvfree(pfns);
> > > +	return err ? false : true;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_chunk_size - Determine chunk size for GPU SVM
> > > range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > + * @vas: Pointer to the virtual memory area structure
> > > + * @fault_addr: Fault address
> > > + * @gpuva_start: Start address of GPUVA which mirrors CPU
> > > + * @gpuva_end: End address of GPUVA which mirrors CPU
> > > + * @check_pages: Flag indicating whether to check pages
> > > + *
> > > + * This function determines the chunk size for the GPU SVM range
> > > based on the
> > > + * fault address, GPU SVM chunk sizes, existing GPU SVM ranges, and
> > > the virtual
> > > + * memory area boundaries.
> > > + *
> > > + * Returns:
> > > + * Chunk size on success, LONG_MAX on failure.
> > > + */
> > > +static u64 drm_gpusvm_range_chunk_size(struct drm_gpusvm *gpusvm,
> > > +				       struct drm_gpusvm_notifier
> > > *notifier,
> > > +				       struct vm_area_struct *vas,
> > > +				       u64 fault_addr, u64
> > > gpuva_start,
> > > +				       u64 gpuva_end, bool
> > > check_pages)
> > > +{
> > > +	u64 start, end;
> > > +	int i = 0;
> > > +
> > > +retry:
> > > +	for (; i < gpusvm->num_chunks; ++i) {
> > > +		start = ALIGN_DOWN(fault_addr, gpusvm-
> > > >chunk_sizes[i]);
> > > +		end = ALIGN(fault_addr + 1, gpusvm->chunk_sizes[i]);
> > > +
> > > +		if (start >= vas->vm_start && end <= vas->vm_end &&
> > > +		    start >= notifier->interval.start &&
> > > +		    end <= notifier->interval.end &&
> > > +		    start >= gpuva_start && end <= gpuva_end)
> > > +			break;
> > > +	}
> > > +
> > > +	if (i == gpusvm->num_chunks)
> > > +		return LONG_MAX;
> > > +
> > > +	/*
> > > +	 * If the allocation is more than a page, ensure it does not
> > > overlap with existing
> > > +	 * ranges.
> > > +	 */
> > > +	if (end - start != SZ_4K) {
> > > +		struct drm_gpusvm_range *range;
> > > +
> > > +		range = drm_gpusvm_range_find(notifier, start, end);
> > > +		if (range) {
> > > +			++i;
> > > +			goto retry;
> > > +		}
> > > +
> > > +		/*
> > > +		 * XXX: Only create range on pages CPU has faulted
> > > in. Without
> > > +		 * this check, or prefault, on BMG
> > > 'xe_exec_system_allocator --r
> > > +		 * process-many-malloc' fails. In the failure case,
> > > each process
> > > +		 * mallocs 16k but the CPU VMA is ~128k which
> > > results in 64k SVM
> > > +		 * ranges. When migrating the SVM ranges, some
> > > processes fail in
> > > +		 * drm_gpusvm_migrate_to_vram with 'migrate.cpages
> > > != npages'
> > > +		 * and then upon drm_gpusvm_range_get_pages device
> > > pages from
> > > +		 * other processes are collected + faulted in which
> > > creates all
> > > +		 * sorts of problems. Unsure exactly how this is
> > > happening; the problem also
> > > +		 * goes away if 'xe_exec_system_allocator --
> > > r
> > > +		 * process-many-malloc' mallocs at least 64k at a
> > > time.
> > > +		 */
> > > +		if (check_pages &&
> > > +		    !drm_gpusvm_check_pages(gpusvm, notifier, start,
> > > end)) {
> > > +			++i;
> > > +			goto retry;
> > > +		}
> > > +	}
> > > +
> > > +	return end - start;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_find_or_insert - Find or insert GPU SVM range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @fault_addr: Fault address
> > > + * @gpuva_start: Start address of GPUVA which mirrors CPU
> > > + * @gpuva_end: End address of GPUVA which mirrors CPU
> > > + * @ctx: GPU SVM context
> > > + *
> > > + * This function finds or inserts a newly allocated GPU SVM range
> > > based on the
> > > + * fault address. Caller must hold a lock to protect range lookup
> > > and insertion.
> > > + *
> > > + * Returns:
> > > + * Pointer to the GPU SVM range on success, ERR_PTR() on failure.
> > > + */
> > > +struct drm_gpusvm_range *
> > > +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64
> > > fault_addr,
> > > +				u64 gpuva_start, u64 gpuva_end,
> > > +				const struct drm_gpusvm_ctx *ctx)
> > > +{
> > > +	struct drm_gpusvm_notifier *notifier;
> > > +	struct drm_gpusvm_range *range;
> > > +	struct mm_struct *mm = gpusvm->mm;
> > > +	struct vm_area_struct *vas;
> > > +	bool notifier_alloc = false;
> > > +	u64 chunk_size;
> > > +	int err;
> > > +	bool migrate_vram;
> > > +
> > > +	if (fault_addr < gpusvm->mm_start ||
> > > +	    fault_addr > gpusvm->mm_start + gpusvm->mm_range) {
> > > +		err = -EINVAL;
> > > +		goto err_out;
> > > +	}
> > > +
> > > +	if (!ctx->mmap_locked) {
> > > +		if (!mmget_not_zero(mm)) {
> > > +			err = -EFAULT;
> > > +			goto err_out;
> > > +		}
> > > +		mmap_write_lock(mm);
> > > +	}
> > > +
> > > +	mmap_assert_write_locked(mm);
> > > +
> > > +	notifier = drm_gpusvm_notifier_find(gpusvm, fault_addr);
> > > +	if (!notifier) {
> > > +		notifier = drm_gpusvm_notifier_alloc(gpusvm,
> > > fault_addr);
> > > +		if (IS_ERR(notifier)) {
> > > +			err = PTR_ERR(notifier);
> > > +			goto err_mmunlock;
> > > +		}
> > > +		notifier_alloc = true;
> > > +		err = mmu_interval_notifier_insert_locked(&notifier-
> > > >notifier,
> > > +							  mm,
> > > notifier->interval.start,
> > > +							  notifier-
> > > >interval.end -
> > > +							  notifier-
> > > >interval.start,
> > > +							 
> > > &drm_gpusvm_notifier_ops);
> > > +		if (err)
> > > +			goto err_notifier;
> > > +	}
> > > +
> > > +	vas = vma_lookup(mm, fault_addr);
> > > +	if (!vas) {
> > > +		err = -ENOENT;
> > > +		goto err_notifier_remove;
> > > +	}
> > > +
> > > +	if (!ctx->read_only && !(vas->vm_flags & VM_WRITE)) {
> > > +		err = -EPERM;
> > > +		goto err_notifier_remove;
> > > +	}
> > > +
> > > +	range = drm_gpusvm_range_find(notifier, fault_addr,
> > > fault_addr + 1);
> > > +	if (range)
> > > +		goto out_mmunlock;
> > > +	/*
> > > +	 * XXX: Short-circuiting migration based on migrate_vma_*
> > > current
> > > +	 * limitations. If/when migrate_vma_* add more support, this
> > > logic will
> > > +	 * have to change.
> > > +	 */
> > > +	migrate_vram = ctx->vram_possible &&
> > > +		vma_is_anonymous(vas) && !is_vm_hugetlb_page(vas);
> > > +
> > > +	chunk_size = drm_gpusvm_range_chunk_size(gpusvm, notifier,
> > > vas,
> > > +						 fault_addr,
> > > gpuva_start,
> > > +						 gpuva_end,
> > > migrate_vram &&
> > > +						 !ctx->prefault);
> > > +	if (chunk_size == LONG_MAX) {
> > > +		err = -EINVAL;
> > > +		goto err_notifier_remove;
> > > +	}
> > > +
> > > +	range = drm_gpusvm_range_alloc(gpusvm, notifier, fault_addr,
> > > chunk_size,
> > > +				       migrate_vram);
> > > +	if (IS_ERR(range)) {
> > > +		err = PTR_ERR(range);
> > > +		goto err_notifier_remove;
> > > +	}
> > > +
> > > +	drm_gpusvm_range_insert(notifier, range);
> > > +	if (notifier_alloc)
> > > +		drm_gpusvm_notifier_insert(gpusvm, notifier);
> > > +
> > > +	if (ctx->prefault) {
> > > +		struct drm_gpusvm_ctx __ctx = *ctx;
> > > +
> > > +		__ctx.mmap_locked = true;
> > > +		err = drm_gpusvm_range_get_pages(gpusvm, range,
> > > &__ctx);
> > > +		if (err)
> > > +			goto err_range_remove;
> > > +	}
> > > +
> > > +out_mmunlock:
> > > +	if (!ctx->mmap_locked) {
> > > +		mmap_write_unlock(mm);
> > > +		mmput(mm);
> > > +	}
> > > +
> > > +	return range;
> > > +
> > > +err_range_remove:
> > > +	__drm_gpusvm_range_remove(notifier, range);
> > > +err_notifier_remove:
> > > +	if (notifier_alloc)
> > > +		mmu_interval_notifier_remove(&notifier->notifier);
> > > +err_notifier:
> > > +	if (notifier_alloc)
> > > +		drm_gpusvm_notifier_free(gpusvm, notifier);
> > > +err_mmunlock:
> > > +	if (!ctx->mmap_locked) {
> > > +		mmap_write_unlock(mm);
> > > +		mmput(mm);
> > > +	}
> > > +err_out:
> > > +	return ERR_PTR(err);
> > > +}
> > > +
> > > +/**
> > > + * for_each_dma_page - iterate over pages in a DMA region
> > > + * @i__: the current page index in the iteration
> > > + * @j__: the current block index (one step per 2^@order__ pages) in the iteration
> > > + * @npages__: the total number of pages in the DMA region
> > > + * @order__: the order of the pages in the DMA region
> > > + *
> > > + * This macro iterates over each page in a DMA region. The DMA
> > > region
> > > + * is assumed to be composed of 2^@order__ pages, and the macro will
> > > + * step through the region one block of 2^@order__ pages at a time.
> > > + */
> > > +#define for_each_dma_page(i__, j__, npages__, order__)	\
> > > +	for ((i__) = 0, (j__) = 0; (i__) < (npages__);	\
> > > +	     (j__)++, (i__) += 0x1 << (order__))
> > > +
> > > +/**
> > > + * __drm_gpusvm_range_unmap_pages - Unmap pages associated with a
> > > GPU SVM range (internal)
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + *
> > > + * This function unmaps pages associated with a GPU SVM range.
> > > Assumes and
> > > + * asserts correct locking is in place when called.
> > > + */
> > > +static void __drm_gpusvm_range_unmap_pages(struct drm_gpusvm
> > > *gpusvm,
> > > +					   struct drm_gpusvm_range
> > > *range)
> > > +{
> > > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > > +
> > > +	if (range->pages) {
> > > +		unsigned long i, j, npages = npages_in_range(range-
> > > >va.start,
> > > +							     range-
> > > >va.end);
> > > +
> > > +		if (range->flags.has_dma_mapping) {
> > > +			for_each_dma_page(i, j, npages, range-
> > > >order)
> > > +				dma_unmap_page(gpusvm->drm->dev,
> > > +					       range->dma_addr[j],
> > > +					       PAGE_SIZE << range-
> > > >order,
> > > +					       DMA_BIDIRECTIONAL);
> > > +		}
> > > +
> > > +		range->flags.has_vram_pages = false;
> > > +		range->flags.has_dma_mapping = false;
> > > +	}
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_free_pages - Free pages associated with a GPU
> > > SVM range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + *
> > > + * This function frees pages associated with a GPU SVM range.
> > > + */
> > > +static void drm_gpusvm_range_free_pages(struct drm_gpusvm *gpusvm,
> > > +					struct drm_gpusvm_range
> > > *range)
> > > +{
> > > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > > +
> > > +	if (range->pages) {
> > > +		if (range->flags.kfree_mapping) {
> > > +			kfree(range->dma_addr);
> > > +			range->flags.kfree_mapping = false;
> > > +			range->pages = NULL;
> > > +		} else {
> > > +			kvfree(range->pages);
> > > +			range->pages = NULL;
> > > +		}
> > > +	}
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_remove - Remove GPU SVM range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range to be removed
> > > + *
> > > + * This function removes the specified GPU SVM range and also
> > > removes the parent
> > > + * GPU SVM notifier if no more ranges remain in the notifier. The
> > > caller must
> > > + * hold a lock to protect range and notifier removal.
> > > + */
> > > +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> > > +			     struct drm_gpusvm_range *range)
> > > +{
> > > +	struct drm_gpusvm_notifier *notifier;
> > > +
> > > +	notifier = drm_gpusvm_notifier_find(gpusvm, range-
> > > >va.start);
> > > +	if (WARN_ON_ONCE(!notifier))
> > > +		return;
> > > +
> > > +	drm_gpusvm_notifier_lock(gpusvm);
> > > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > +	drm_gpusvm_range_free_pages(gpusvm, range);
> > > +	__drm_gpusvm_range_remove(notifier, range);
> > > +	drm_gpusvm_notifier_unlock(gpusvm);
> > > +
> > > +	drm_gpusvm_range_put(range);
> > > +
> > > +	if (RB_EMPTY_ROOT(&notifier->root.rb_root)) {
> > > +		if (!notifier->flags.removed)
> > > +			mmu_interval_notifier_remove(&notifier-
> > > >notifier);
> > > +		drm_gpusvm_notifier_remove(gpusvm, notifier);
> > > +		drm_gpusvm_notifier_free(gpusvm, notifier);
> > > +	}
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_get - Get a reference to GPU SVM range
> > > + * @range: Pointer to the GPU SVM range
> > > + *
> > > + * This function increments the reference count of the specified GPU
> > > SVM range.
> > > + *
> > > + * Returns:
> > > + * Pointer to the GPU SVM range.
> > > + */
> > > +struct drm_gpusvm_range *
> > > +drm_gpusvm_range_get(struct drm_gpusvm_range *range)
> > > +{
> > > +	kref_get(&range->refcount);
> > > +
> > > +	return range;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_destroy - Destroy GPU SVM range
> > > + * @refcount: Pointer to the reference counter embedded in the GPU
> > > SVM range
> > > + *
> > > + * This function destroys the specified GPU SVM range when its
> > > reference count
> > > + * reaches zero. If a custom range-free function is provided, it is
> > > invoked to
> > > + * free the range; otherwise, the range is deallocated using
> > > kfree().
> > > + */
> > > +static void drm_gpusvm_range_destroy(struct kref *refcount)
> > > +{
> > > +	struct drm_gpusvm_range *range =
> > > +		container_of(refcount, struct drm_gpusvm_range,
> > > refcount);
> > > +	struct drm_gpusvm *gpusvm = range->gpusvm;
> > > +
> > > +	if (gpusvm->ops->range_free)
> > > +		gpusvm->ops->range_free(range);
> > > +	else
> > > +		kfree(range);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_put - Put a reference to GPU SVM range
> > > + * @range: Pointer to the GPU SVM range
> > > + *
> > > + * This function decrements the reference count of the specified GPU
> > > SVM range
> > > + * and frees it when the count reaches zero.
> > > + */
> > > +void drm_gpusvm_range_put(struct drm_gpusvm_range *range)
> > > +{
> > > +	kref_put(&range->refcount, drm_gpusvm_range_destroy);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_pages_valid - GPU SVM range pages valid
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + *
> > > + * This function determines if a GPU SVM range's pages are valid.
> > > Expected to be
> > > + * called holding gpusvm->notifier_lock and as the last step before
> > > committing a
> > > + * GPU binding.
> > > + *
> > > + * Returns:
> > > + * True if GPU SVM range has valid pages, False otherwise
> > > + */
> > > +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> > > +				  struct drm_gpusvm_range *range)
> > > +{
> > > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > > +
> > > +	return range->flags.has_vram_pages || range-
> > > >flags.has_dma_mapping;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_pages_valid_unlocked - GPU SVM range pages valid
> > > unlocked
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + *
> > > + * This function determines if a GPU SVM range's pages are valid.
> > > Expected to be
> > > + * called without holding gpusvm->notifier_lock.
> > > + *
> > > + * Returns:
> > > + * True if GPU SVM range has valid pages, False otherwise
> > > + */
> > > +static bool
> > > +drm_gpusvm_range_pages_valid_unlocked(struct drm_gpusvm *gpusvm,
> > > +				      struct drm_gpusvm_range
> > > *range)
> > > +{
> > > +	bool pages_valid;
> > > +
> > > +	if (!range->pages)
> > > +		return false;
> > > +
> > > +	drm_gpusvm_notifier_lock(gpusvm);
> > > +	pages_valid = drm_gpusvm_range_pages_valid(gpusvm, range);
> > > +	if (!pages_valid && range->flags.kfree_mapping) {
> > > +		kfree(range->dma_addr);
> > > +		range->flags.kfree_mapping = false;
> > > +		range->pages = NULL;
> > > +	}
> > > +	drm_gpusvm_notifier_unlock(gpusvm);
> > > +
> > > +	return pages_valid;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_get_pages - Get pages for a GPU SVM range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + * @ctx: GPU SVM context
> > > + *
> > > + * This function gets pages for a GPU SVM range and ensures they are
> > > mapped for
> > > + * DMA access.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       const struct drm_gpusvm_ctx *ctx)
> > > +{
> > 
> > Is it possible to split this function up to make it look more neat?
> > 
> > 
> > > +	struct mmu_interval_notifier *notifier = &range->notifier-
> > > >notifier;
> > > +	struct hmm_range hmm_range = {
> > > +		.default_flags = HMM_PFN_REQ_FAULT | (ctx->read_only
> > > ? 0 :
> > > +			HMM_PFN_REQ_WRITE),
> > > +		.notifier = notifier,
> > > +		.start = range->va.start,
> > > +		.end = range->va.end,
> > > +		.dev_private_owner = gpusvm-
> > > >device_private_page_owner,
> > > +	};
> > > +	struct mm_struct *mm = gpusvm->mm;
> > > +	unsigned long timeout =
> > > +		jiffies +
> > > msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> > > +	unsigned long i, j;
> > > +	unsigned long npages = npages_in_range(range->va.start,
> > > range->va.end);
> > > +	unsigned int order = 0;
> > > +	unsigned long *pfns;
> > > +	struct page **pages;
> > > +	int err = 0;
> > > +	bool vram_pages = !!range->flags.migrate_vram;
> > > +	bool alloc_pfns = false, kfree_mapping;
> > > +
> > > +retry:
> > > +	kfree_mapping = false;
> > > +	hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> > > +	if (drm_gpusvm_range_pages_valid_unlocked(gpusvm, range))
> > > +		return 0;
> > > +
> > > +	if (range->notifier_seq == hmm_range.notifier_seq && range-
> > > >pages) {
> > > +		if (ctx->prefault)
> > > +			return 0;
> > > +
> > > +		pfns = (unsigned long *)range->pages;
> > > +		pages = range->pages;
> > > +		goto map_pages;
> > > +	}
> > > +
> > > +	if (!range->pages) {
> > > +		pfns = kvmalloc_array(npages, sizeof(*pfns),
> > > GFP_KERNEL);
> > > +		if (!pfns)
> > > +			return -ENOMEM;
> > > +		alloc_pfns = true;
> > > +	} else {
> > > +		pfns = (unsigned long *)range->pages;
> > > +	}
> > > +
> > > +	if (!ctx->mmap_locked) {
> > > +		if (!mmget_not_zero(mm)) {
> > > +			err = -EFAULT;
> > > +			goto err_out;
> > > +		}
> > > +	}
> > > +
> > > +	hmm_range.hmm_pfns = pfns;
> > > +	while (true) {
> > > +		/* Must be checked after mmu_interval_read_begin */
> > > +		if (range->flags.unmapped) {
> > > +			err = -EFAULT;
> > > +			break;
> > > +		}
> > > +
> > > +		if (!ctx->mmap_locked) {
> > > +			/*
> > > +			 * XXX: HMM locking document indicates only
> > > a read-lock
> > > +			 * is required but there appears to be a
> > > window between
> > > +			 * the MMU_NOTIFY_MIGRATE event triggered in
> > > a CPU fault
> > > +			 * via migrate_vma_setup and the pages
> > > actually moving
> > > +			 * in migrate_vma_finalize in which this
> > > code can grab
> > > +			 * garbage pages. Grabbing the write-lock if
> > > the range
> > > +			 * is attached to vram appears to protect
> > > against this
> > > +			 * race.
> > > +			 */
> > > +			if (vram_pages)
> > > +				mmap_write_lock(mm);
> > > +			else
> > > +				mmap_read_lock(mm);
> > > +		}
> > > +		err = hmm_range_fault(&hmm_range);
> > > +		if (!ctx->mmap_locked) {
> > > +			if (vram_pages)
> > > +				mmap_write_unlock(mm);
> > > +			else
> > > +				mmap_read_unlock(mm);
> > > +		}
> > > +
> > > +		if (err == -EBUSY) {
> > > +			if (time_after(jiffies, timeout))
> > > +				break;
> > > +
> > > +			hmm_range.notifier_seq =
> > > mmu_interval_read_begin(notifier);
> > > +			continue;
> > > +		}
> > > +		break;
> > > +	}
> > > +	if (!ctx->mmap_locked)
> > > +		mmput(mm);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	pages = (struct page **)pfns;
> > > +
> > > +	if (ctx->prefault) {
> > > +		range->pages = pages;
> > > +		goto set_seqno;
> > > +	}
> > > +
> > > +map_pages:
> > > +	if (is_device_private_page(hmm_pfn_to_page(pfns[0]))) {
> > > +		WARN_ON_ONCE(!range->vram_allocation);
> > > +
> > > +		for (i = 0; i < npages; ++i) {
> > > +			pages[i] = hmm_pfn_to_page(pfns[i]);
> > > +
> > > +			if
> > > (WARN_ON_ONCE(!is_device_private_page(pages[i]))) {
> > > +				err = -EOPNOTSUPP;
> > > +				goto err_free;
> > > +			}
> > > +		}
> > > +
> > > +		/* Do not race with notifier unmapping pages */
> > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > +		range->flags.has_vram_pages = true;
> > > +		range->pages = pages;
> > > +		if (mmu_interval_read_retry(notifier,
> > > hmm_range.notifier_seq)) {
> > > +			err = -EAGAIN;
> > > +			__drm_gpusvm_range_unmap_pages(gpusvm,
> > > range);
> > > +		}
> > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > +	} else {
> > > +		dma_addr_t *dma_addr = (dma_addr_t *)pfns;
> > > +
> > > +		for_each_dma_page(i, j, npages, order) {
> > 
> > Here it looks like you're assuming that all pages are the same order?
> > With THP that's definitely not the case (unless hmm somehow thinks
> > they are 4K pages). This probably works because we only end up here in
> > the HugeTLB case where all pages are forced to the same order.
> > 
> 
> It assumes the order within a chunk (range size) is all the same. I
> thought THP page order would always be 9 (2M). THP tests
> (*-large-malloc) seem to work on LNL.
> 
> This falls apart if chunks are larger than 2M, as the first 2M could be
> a THP and the second might not be. We discussed that you were changing
> the dma addr to support mixed mappings and encode the order. That is
> likely correct and would fix this limitation of only supporting one
> order size per chunk.
> 
> I may not get this into this rev but agree this should be fixed. Would
> deferring this fix be ok with you?
> 
> FWIW, I haven't seen any ROI on chunks larger than 2M, so Xe likely
> won't have chunks larger than that, but I agree the design should
> support this.
> 
> Matt
> 
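To make the deferred fix a bit more concrete, one possible way to handle
mixed orders (purely illustrative on my end, not necessarily the encoding
you have in flight) is to pack the per-entry map order into the low,
always-zero bits of each page-aligned dma_addr:

	#include <linux/bits.h>

	/* low bits of a page-aligned dma_addr are free to carry the order */
	#define DRM_GPUSVM_DMA_ORDER_MASK	GENMASK(PAGE_SHIFT - 1, 0)

	static inline dma_addr_t drm_gpusvm_dma_encode(dma_addr_t addr,
						       unsigned int order)
	{
		WARN_ON_ONCE(addr & DRM_GPUSVM_DMA_ORDER_MASK);
		return addr | order;
	}

	static inline unsigned int drm_gpusvm_dma_order(dma_addr_t entry)
	{
		return entry & DRM_GPUSVM_DMA_ORDER_MASK;
	}

	static inline dma_addr_t drm_gpusvm_dma_addr(dma_addr_t entry)
	{
		return entry & ~DRM_GPUSVM_DMA_ORDER_MASK;
	}

The unmap path would then read the order back per entry instead of relying
on a single range->order.
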
> > > +			if (WARN_ON_ONCE(i && order !=
> > > +					
> > > hmm_pfn_to_map_order(pfns[i]))) {
> > > +				err = -EOPNOTSUPP;
> > > +				npages = i;
> > > +				goto err_unmap;
> > > +			}
> > > +			order = hmm_pfn_to_map_order(pfns[i]);
> > > +
> > > +			pages[j] = hmm_pfn_to_page(pfns[i]);
> > > +			if
> > > (WARN_ON_ONCE(is_zone_device_page(pages[j]))) {
> > > +				err = -EOPNOTSUPP;
> > > +				npages = i;
> > > +				goto err_unmap;
> > > +			}
> > > +
> > > +			set_page_dirty_lock(pages[j]);
> > > +			mark_page_accessed(pages[j]);
> > > +
> > > +			dma_addr[j] = dma_map_page(gpusvm->drm->dev,
> > > +						   pages[j], 0,
> > > +						   PAGE_SIZE <<
> > > order,
> > > +						  
> > > DMA_BIDIRECTIONAL);
> > > +			if (dma_mapping_error(gpusvm->drm->dev,
> > > dma_addr[j])) {
> > > +				err = -EFAULT;
> > > +				npages = i;
> > > +				goto err_unmap;
> > > +			}
> > > +		}
> > > +
> > > +		/* Huge pages, reduce memory footprint */
> > > +		if (order) {
> > > +			dma_addr = kmalloc_array(j,
> > > sizeof(*dma_addr),
> > > +						 GFP_KERNEL);
> > > +			if (dma_addr) {
> > > +				for (i = 0; i < j; ++i)
> > > +					dma_addr[i] =
> > > (dma_addr_t)pfns[i];
> > > +				kvfree(pfns);
> > > +				kfree_mapping = true;
> > > +			} else {
> > > +				dma_addr = (dma_addr_t *)pfns;
> > > +			}
> > > +		}
> > > +
> > > +		/* Do not race with notifier unmapping pages */
> > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > +		range->order = order;
> > > +		range->flags.kfree_mapping = kfree_mapping;
> > > +		range->flags.has_dma_mapping = true;
> > > +		range->dma_addr = dma_addr;
> > > +		range->vram_allocation = NULL;
> > > +		if (mmu_interval_read_retry(notifier,
> > > hmm_range.notifier_seq)) {
> > > +			err = -EAGAIN;
> > > +			__drm_gpusvm_range_unmap_pages(gpusvm,
> > > range);
> > > +		}
> > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > +	}
> > > +
> > > +	if (err == -EAGAIN)
> > > +		goto retry;
> > > +set_seqno:
> > > +	range->notifier_seq = hmm_range.notifier_seq;
> > > +
> > > +	return 0;
> > > +
> > > +err_unmap:
> > > +	for_each_dma_page(i, j, npages, order)
> > > +		dma_unmap_page(gpusvm->drm->dev,
> > > +			       (dma_addr_t)pfns[j],
> > > +			       PAGE_SIZE << order,
> > > DMA_BIDIRECTIONAL);
> > > +err_free:
> > > +	if (alloc_pfns)
> > > +		kvfree(pfns);
> > > +err_out:
> > > +	return err;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_unmap_pages - Unmap pages associated with a GPU
> > > SVM range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + * @ctx: GPU SVM context
> > > + *
> > > + * This function unmaps pages associated with a GPU SVM range. If
> > > @in_notifier
> > > + * is set, it is assumed that gpusvm->notifier_lock is held in write
> > > mode; if it
> > > + * is clear, it acquires gpusvm->notifier_lock in read mode. Must be
> > > called on
> > > + * each GPU SVM range attached to the notifier in gpusvm->ops-
> > > >invalidate for the IOMMU
> > > + * security model.
> > > + */
> > > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > > +				  struct drm_gpusvm_range *range,
> > > +				  const struct drm_gpusvm_ctx *ctx)
> > > +{
> > > +	if (ctx->in_notifier)
> > > +		lockdep_assert_held_write(&gpusvm->notifier_lock);
> > > +	else
> > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > +
> > > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > +
> > > +	if (!ctx->in_notifier)
> > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migration_put_page - Put a migration page
> > > + * @page: Pointer to the page to put
> > > + *
> > > + * This function unlocks and puts a page.
> > > + */
> > > +static void drm_gpusvm_migration_put_page(struct page *page)
> > > +{
> > > +	unlock_page(page);
> > > +	put_page(page);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migration_put_pages - Put migration pages
> > > + * @npages: Number of pages
> > > + * @migrate_pfn: Array of migrate page frame numbers
> > > + *
> > > + * This function puts an array of pages.
> > > + */
> > > +static void drm_gpusvm_migration_put_pages(unsigned long npages,
> > > +					   unsigned long
> > > *migrate_pfn)
> > > +{
> > > +	unsigned long i;
> > > +
> > > +	for (i = 0; i < npages; ++i) {
> > > +		if (!migrate_pfn[i])
> > > +			continue;
> > > +
> > > +		drm_gpusvm_migration_put_page(migrate_pfn_to_page(mi
> > > grate_pfn[i]));
> > > +		migrate_pfn[i] = 0;
> > > +	}
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_get_vram_page - Get a reference to a VRAM page
> > > + * @page: Pointer to the page
> > > + * @zdd: Pointer to the GPU SVM zone device data
> > > + *
> > > + * This function associates the given page with the specified GPU
> > > SVM zone
> > > + * device data and initializes it for zone device usage.
> > > + */
> > > +static void drm_gpusvm_get_vram_page(struct page *page,
> > > +				     struct drm_gpusvm_zdd *zdd)
> > > +{
> > > +	page->zone_device_data = drm_gpusvm_zdd_get(zdd);
> > > +	zone_device_page_init(page);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_map_pages() - Map migration pages for GPU SVM
> > > migration
> > > + * @dev: The device for which the pages are being mapped
> > > + * @dma_addr: Array to store DMA addresses corresponding to mapped
> > > pages
> > > + * @migrate_pfn: Array of migrate page frame numbers to map
> > > + * @npages: Number of pages to map
> > > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > > + *
> > > + * This function maps pages of memory for migration usage in GPU
> > > SVM. It
> > > + * iterates over each page frame number provided in @migrate_pfn,
> > > maps the
> > > + * corresponding page, and stores the DMA address in the provided
> > > @dma_addr
> > > + * array.
> > > + *
> > > + * Return: 0 on success, -EFAULT if an error occurs during mapping.
> > > + */
> > > +static int drm_gpusvm_migrate_map_pages(struct device *dev,
> > > +					dma_addr_t *dma_addr,
> > > +					long unsigned int
> > > *migrate_pfn,
> > > +					unsigned long npages,
> > > +					enum dma_data_direction dir)
> > > +{
> > > +	unsigned long i;
> > > +
> > > +	for (i = 0; i < npages; ++i) {
> > > +		struct page *page =
> > > migrate_pfn_to_page(migrate_pfn[i]);
> > > +
> > > +		if (!page)
> > > +			continue;
> > > +
> > > +		if (WARN_ON_ONCE(is_zone_device_page(page)))
> > > +			return -EFAULT;
> > > +
> > > +		dma_addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE,
> > > dir);
> > > +		if (dma_mapping_error(dev, dma_addr[i]))
> > > +			return -EFAULT;
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously mapped
> > > for GPU SVM migration
> > > + * @dev: The device for which the pages were mapped
> > > + * @dma_addr: Array of DMA addresses corresponding to mapped pages
> > > + * @npages: Number of pages to unmap
> > > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > > + *
> > > + * This function unmaps previously mapped pages of memory for GPU
> > > Shared Virtual
> > > + * Memory (SVM). It iterates over each DMA address provided in
> > > @dma_addr, checks
> > > + * if it's valid and not already unmapped, and unmaps the
> > > corresponding page.
> > > + */
> > > +static void drm_gpusvm_migrate_unmap_pages(struct device *dev,
> > > +					   dma_addr_t *dma_addr,
> > > +					   unsigned long npages,
> > > +					   enum dma_data_direction
> > > dir)
> > > +{
> > > +	unsigned long i;
> > > +
> > > +	for (i = 0; i < npages; ++i) {
> > > +		if (!dma_addr[i] || dma_mapping_error(dev,
> > > dma_addr[i]))
> > > +			continue;
> > > +
> > > +		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
> > > +	}
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_to_vram - Migrate GPU SVM range to VRAM
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + * @vram_allocation: Driver-private pointer to the VRAM allocation.
> > > The caller
> > > + *                   should hold a reference to the VRAM allocation,
> > > which
> > > + *                   should be dropped via ops->vram_allocation or
> > > upon the
> > > + *                   failure of this function.
> > > + * @ctx: GPU SVM context
> > > + *
> > > + * This function migrates the specified GPU SVM range to VRAM. It
> > > performs the
> > > + * necessary setup and invokes the driver-specific operations for
> > > migration to
> > > + * VRAM. Upon successful return, @vram_allocation can safely
> > > reference @range
> > > + * until ops->vram_release is called which only upon successful
> > > return.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       void *vram_allocation,
> > > +			       const struct drm_gpusvm_ctx *ctx)
> > > +{
> > > +	u64 start = range->va.start, end = range->va.end;
> > > +	struct migrate_vma migrate = {
> > > +		.start		= start,
> > > +		.end		= end,
> > > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > > +		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
> > > +	};
> > > +	struct mm_struct *mm = gpusvm->mm;
> > > +	unsigned long i, npages = npages_in_range(start, end);
> > > +	struct vm_area_struct *vas;
> > > +	struct drm_gpusvm_zdd *zdd = NULL;
> > > +	struct page **pages;
> > > +	dma_addr_t *dma_addr;
> > > +	void *buf;
> > > +	int err;
> > > +
> > > +	if (!range->flags.migrate_vram)
> > > +		return -EINVAL;
> > > +
> > > +	if (!gpusvm->ops->populate_vram_pfn || !gpusvm->ops-
> > > >copy_to_vram ||
> > > +	    !gpusvm->ops->copy_to_sram)
> > > +		return -EOPNOTSUPP;
> > > +
> > > +	if (!ctx->mmap_locked) {
> > > +		if (!mmget_not_zero(mm)) {
> > > +			err = -EFAULT;
> > > +			goto err_out;
> > > +		}
> > > +		mmap_write_lock(mm);
> > > +	}
> > > +
> > > +	mmap_assert_locked(mm);
> > > +
> > > +	vas = vma_lookup(mm, start);
> > > +	if (!vas) {
> > > +		err = -ENOENT;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	if (end > vas->vm_end || start < vas->vm_start) {
> > > +		err = -EINVAL;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	if (!vma_is_anonymous(vas)) {
> > > +		err = -EBUSY;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) +
> > > sizeof(*dma_addr) +
> > > +		       sizeof(*pages), GFP_KERNEL);
> > > +	if (!buf) {
> > > +		err = -ENOMEM;
> > > +		goto err_mmunlock;
> > > +	}
> > > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr))
> > > * npages;
> > > +
> > > +	zdd = drm_gpusvm_zdd_alloc(range);
> > > +	if (!zdd) {
> > > +		err = -ENOMEM;
> > > +		goto err_free;
> > > +	}
> > > +
> > > +	migrate.vma = vas;
> > > +	migrate.src = buf;
> > > +	migrate.dst = migrate.src + npages;
> > > +
> > > +	err = migrate_vma_setup(&migrate);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	/*
> > > +	 * FIXME: The cases below, !migrate.cpages and migrate.cpages !=
> > > npages, are
> > > +	 * not always an error. Need to revisit possible cases and how to
> > > handle them.
> > > +	 * We could prefault on migrate.cpages != npages via
> > > hmm_range_fault.
> > > +	 */
> > > +
> > > +	if (!migrate.cpages) {
> > > +		err = -EFAULT;
> > > +		goto err_free;
> > > +	}
> > > +
> > > +	if (migrate.cpages != npages) {
> > > +		err = -EBUSY;
> > > +		goto err_finalize;
> > > +	}
> > > +
> > > +	err = gpusvm->ops->populate_vram_pfn(gpusvm,
> > > vram_allocation, npages,
> > > +					     migrate.dst);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > > dma_addr,
> > > +					   migrate.src, npages,
> > > DMA_TO_DEVICE);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	for (i = 0; i < npages; ++i) {
> > > +		struct page *page = pfn_to_page(migrate.dst[i]);
> > > +
> > > +		pages[i] = page;
> > > +		migrate.dst[i] = migrate_pfn(migrate.dst[i]);
> > > +		drm_gpusvm_get_vram_page(page, zdd);
> > > +	}
> > > +
> > > +	err = gpusvm->ops->copy_to_vram(gpusvm, pages, dma_addr,
> > > npages);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	/* Upon success bind vram allocation to range and zdd */
> > > +	range->vram_allocation = vram_allocation;
> > > +	WRITE_ONCE(zdd->vram_allocation, vram_allocation);	/*
> > > Owns ref */
> > > +
> > > +err_finalize:
> > > +	if (err)
> > > +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> > > +	migrate_vma_pages(&migrate);
> > > +	migrate_vma_finalize(&migrate);
> > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr,
> > > npages,
> > > +				       DMA_TO_DEVICE);
> > > +err_free:
> > > +	if (zdd)
> > > +		drm_gpusvm_zdd_put(zdd);
> > > +	kvfree(buf);
> > > +err_mmunlock:
> > > +	if (!ctx->mmap_locked) {
> > > +		mmap_write_unlock(mm);
> > > +		mmput(mm);
> > > +	}
> > > +err_out:
> > > +	return err;
> > > +}
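
For readers tracing the pointer arithmetic above: migrate.src, migrate.dst,
dma_addr and pages are all carved out of the single kvcalloc() buffer. A
worked layout, assuming a 64-bit build (8-byte unsigned long, dma_addr_t and
struct page *, an assumption for illustration only) and a 2M range with 4K
pages (npages = 512):

	buf: 512 * (2 * 8 + 8 + 8) = 16 KiB total
	  offset  0 KiB: migrate.src  (512 * sizeof(unsigned long))
	  offset  4 KiB: migrate.dst  (512 * sizeof(unsigned long))
	  offset  8 KiB: dma_addr     (512 * sizeof(dma_addr_t))
	  offset 12 KiB: pages        (512 * sizeof(struct page *))
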
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_populate_sram_pfn - Populate SRAM PFNs for a
> > > VM area
> > > + * @vas: Pointer to the VM area structure, can be NULL
> > > + * @npages: Number of pages to populate
> > > + * @src_mpfn: Source array of migrate PFNs
> > > + * @mpfn: Array of migrate PFNs to populate
> > > + * @addr: Start address for PFN allocation
> > > + *
> > > + * This function populates the SRAM migrate page frame numbers
> > > (PFNs) for the
> > > + * specified VM area structure. It allocates and locks pages in the
> > > VM area for
> > > + * SRAM usage. If @vas is non-NULL, alloc_page_vma() is used for
> > > allocation;
> > > + * otherwise alloc_page() is used.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +static int drm_gpusvm_migrate_populate_sram_pfn(struct
> > > vm_area_struct *vas,
> > > +						unsigned long
> > > npages,
> > > +						unsigned long
> > > *src_mpfn,
> > > +						unsigned long *mpfn,
> > > u64 addr)
> > > +{
> > > +	unsigned long i;
> > > +
> > > +	for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
> > > +		struct page *page;
> > > +
> > > +		if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
> > > +			continue;
> > > +
> > > +		if (vas)
> > > +			page = alloc_page_vma(GFP_HIGHUSER, vas,
> > > addr);
> > > +		else
> > > +			page = alloc_page(GFP_HIGHUSER);
> > > +
> > > +		if (!page)
> > > +			return -ENOMEM;
> > > +
> > > +		lock_page(page);
> > > +		mpfn[i] = migrate_pfn(page_to_pfn(page));
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_evict_to_sram - Evict GPU SVM range to SRAM
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + *
> > > + * Similar to __drm_gpusvm_migrate_to_sram but does not require the
> > > mmap lock;
> > > + * migration is done via the migrate_device_* functions. This is a
> > > fallback
> > > + * path, as it is preferred to issue migrations with the mmap lock
> > > held.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +static int drm_gpusvm_evict_to_sram(struct drm_gpusvm *gpusvm,
> > > +				    struct drm_gpusvm_range *range)
> > > +{
> > > +	unsigned long npages;
> > > +	struct page **pages;
> > > +	unsigned long *src, *dst;
> > > +	dma_addr_t *dma_addr;
> > > +	void *buf;
> > > +	int i, err = 0;
> > > +
> > > +	npages = npages_in_range(range->va.start, range->va.end);
> > > +
> > > +	buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr)
> > > +
> > > +		       sizeof(*pages), GFP_KERNEL);
> > > +	if (!buf) {
> > > +		err = -ENOMEM;
> > > +		goto err_out;
> > > +	}
> > > +	src = buf;
> > > +	dst = buf + (sizeof(*src) * npages);
> > > +	dma_addr = buf + (2 * sizeof(*src) * npages);
> > > +	pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) *
> > > npages;
> > > +
> > > +	err = gpusvm->ops->populate_vram_pfn(gpusvm, range-
> > > >vram_allocation,
> > > +					     npages, src);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	err = migrate_device_vma_range(gpusvm->mm,
> > > +				       gpusvm-
> > > >device_private_page_owner, src,
> > > +				       npages, range->va.start);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	err = drm_gpusvm_migrate_populate_sram_pfn(NULL, npages,
> > > src, dst, 0);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > > dma_addr,
> > > +					   dst, npages,
> > > DMA_BIDIRECTIONAL);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	for (i = 0; i < npages; ++i)
> > > +		pages[i] = migrate_pfn_to_page(src[i]);
> > > +
> > > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr,
> > > npages);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +err_finalize:
> > > +	if (err)
> > > +		drm_gpusvm_migration_put_pages(npages, dst);
> > > +	migrate_device_pages(src, dst, npages);
> > > +	migrate_device_finalize(src, dst, npages);
> > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr,
> > > npages,
> > > +				       DMA_BIDIRECTIONAL);
> > > +err_free:
> > > +	kvfree(buf);
> > > +err_out:
> > > +
> > > +	return err;
> > > +}
> > > +
> > > +/**
> > > + * __drm_gpusvm_migrate_to_sram - Migrate GPU SVM range to SRAM
> > > (internal)
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @vas: Pointer to the VM area structure
> > > + * @page: Pointer to the page for fault handling (can be NULL)
> > > + * @start: Start address of the migration range
> > > + * @end: End address of the migration range
> > > + *
> > > + * This internal function performs the migration of the specified
> > > GPU SVM range
> > > + * to SRAM. It sets up the migration, populates + dma maps SRAM
> > > PFNs, and
> > > + * invokes the driver-specific operations for migration to SRAM.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +static int __drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > +					struct vm_area_struct *vas,
> > > +					struct page *page,
> > > +					u64 start, u64 end)
> > > +{
> > > +	struct migrate_vma migrate = {
> > > +		.vma		= vas,
> > > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > > +		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
> > > +		.fault_page	= page,
> > > +	};
> > > +	unsigned long npages;
> > > +	struct page **pages;
> > > +	dma_addr_t *dma_addr;
> > > +	void *buf;
> > > +	int i, err = 0;
> > > +
> > > +	mmap_assert_locked(gpusvm->mm);
> > > +
> > > +	/* Corner case where the VMA has been partially unmapped */
> > > +	if (start < vas->vm_start)
> > > +		start = vas->vm_start;
> > > +	if (end > vas->vm_end)
> > > +		end = vas->vm_end;
> > > +
> > > +	migrate.start = start;
> > > +	migrate.end = end;
> > > +	npages = npages_in_range(start, end);
> > > +
> > > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) +
> > > sizeof(*dma_addr) +
> > > +		       sizeof(*pages), GFP_KERNEL);
> > > +	if (!buf) {
> > > +		err = -ENOMEM;
> > > +		goto err_out;
> > > +	}
> > > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr))
> > > * npages;
> > > +
> > > +	migrate.vma = vas;
> > > +	migrate.src = buf;
> > > +	migrate.dst = migrate.src + npages;
> > > +
> > > +	err = migrate_vma_setup(&migrate);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	/* Raced with another CPU fault, nothing to do */
> > > +	if (!migrate.cpages)
> > > +		goto err_free;
> > > +
> > > +	err = drm_gpusvm_migrate_populate_sram_pfn(vas, npages,
> > > +						   migrate.src,
> > > migrate.dst,
> > > +						   start);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > > dma_addr,
> > > +					   migrate.dst, npages,
> > > +					   DMA_BIDIRECTIONAL);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	for (i = 0; i < npages; ++i)
> > > +		pages[i] = migrate_pfn_to_page(migrate.src[i]);
> > > +
> > > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr,
> > > npages);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +err_finalize:
> > > +	if (err)
> > > +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> > > +	migrate_vma_pages(&migrate);
> > > +	migrate_vma_finalize(&migrate);
> > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr,
> > > npages,
> > > +				       DMA_BIDIRECTIONAL);
> > > +err_free:
> > > +	kvfree(buf);
> > > +err_out:
> > > +	mmap_assert_locked(gpusvm->mm);
> > > +
> > > +	return err;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_to_sram - Migrate (evict) GPU SVM range to
> > > SRAM
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + * @ctx: GPU SVM context
> > > + *
> > > + * This function initiates the migration of the specified GPU SVM
> > > range to
> > > + * SRAM. It performs necessary checks and invokes the internal
> > > migration
> > > + * function for actual migration.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       const struct drm_gpusvm_ctx *ctx)
> > > +{
> > > +	u64 start = range->va.start, end = range->va.end;
> > > +	struct mm_struct *mm = gpusvm->mm;
> > > +	struct vm_area_struct *vas;
> > > +	int err;
> > > +	bool retry = false;
> > > +
> > > +	if (!ctx->mmap_locked) {
> > > +		if (!mmget_not_zero(mm)) {
> > > +			err = -EFAULT;
> > > +			goto err_out;
> > > +		}
> > > +		if (ctx->trylock_mmap) {
> > > +			if (!mmap_read_trylock(mm))  {
> > > +				err =
> > > drm_gpusvm_evict_to_sram(gpusvm, range);
> > > +				goto err_mmput;
> > > +			}
> > > +		} else {
> > > +			mmap_read_lock(mm);
> > > +		}
> > > +	}
> > > +
> > > +	mmap_assert_locked(mm);
> > > +
> > > +	/*
> > > +	 * Loop required to find all VMAs for the corner case when the
> > > VRAM
> > > +	 * backing has been partially unmapped from the MM's address
> > > space.
> > > +	 */
> > > +again:
> > > +	vas = find_vma(mm, start);
> > > +	if (!vas) {
> > > +		if (!retry)
> > > +			err = -ENOENT;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	if (end <= vas->vm_start || start >= vas->vm_end) {
> > > +		if (!retry)
> > > +			err = -EINVAL;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	err = __drm_gpusvm_migrate_to_sram(gpusvm, vas, NULL, start,
> > > end);
> > > +	if (err)
> > > +		goto err_mmunlock;
> > > +
> > > +	if (vas->vm_end < end) {
> > > +		retry = true;
> > > +		start = vas->vm_end;
> > > +		goto again;
> > > +	}
> > > +
> > > +	if (!ctx->mmap_locked) {
> > > +		mmap_read_unlock(mm);
> > > +		/*
> > > +		 * Using mmput_async as this function can be called
> > > while
> > > +		 * holding a dma-resv lock, and a final put can grab
> > > the mmap
> > > +		 * lock, causing a lock inversion.
> > > +		 */
> > > +		mmput_async(mm);
> > > +	}
> > > +
> > > +	return 0;
> > > +
> > > +err_mmunlock:
> > > +	if (!ctx->mmap_locked)
> > > +		mmap_read_unlock(mm);
> > > +err_mmput:
> > > +	if (!ctx->mmap_locked)
> > > +		mmput_async(mm);
> > > +err_out:
> > > +	return err;
> > > +}
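
A minimal sketch of how a driver eviction path might call this while a
dma-resv lock is held, relying on the trylock/fallback behaviour above.
driver_evict_range() and its locking context are illustrative, not part of
this series:

	static int driver_evict_range(struct drm_gpusvm *gpusvm,
				      struct drm_gpusvm_range *range)
	{
		struct drm_gpusvm_ctx ctx = { .trylock_mmap = true, };

		/*
		 * dma-resv is held here, so taking the mmap lock
		 * unconditionally could invert the lock order;
		 * trylock_mmap lets GPU SVM fall back to the
		 * migrate_device_* eviction path instead.
		 */
		return drm_gpusvm_migrate_to_sram(gpusvm, range, &ctx);
	}
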
> > > +
> > > +/**
> > > + * drm_gpusvm_page_free - Put GPU SVM zone device data associated
> > > with a page
> > > + * @page: Pointer to the page
> > > + *
> > > + * This function is a callback used to put the GPU SVM zone device
> > > data
> > > + * associated with a page when it is being released.
> > > + */
> > > +static void drm_gpusvm_page_free(struct page *page)
> > > +{
> > > +	drm_gpusvm_zdd_put(page->zone_device_data);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_to_ram - Migrate GPU SVM range to RAM (page
> > > fault handler)
> > > + * @vmf: Pointer to the fault information structure
> > > + *
> > > + * This function is a page fault handler used to migrate a GPU SVM
> > > range to RAM.
> > > + * It retrieves the GPU SVM range information from the faulting page
> > > and invokes
> > > + * the internal migration function to migrate the range back to RAM.
> > > + *
> > > + * Returns:
> > > + * VM_FAULT_SIGBUS on failure, 0 on success.
> > > + */
> > > +static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault *vmf)
> > > +{
> > > +	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
> > > +	int err;
> > > +
> > > +	err = __drm_gpusvm_migrate_to_sram(zdd->range->gpusvm,
> > > +					   vmf->vma, vmf->page,
> > > +					   zdd->range->va.start,
> > > +					   zdd->range->va.end);
> > > +
> > > +	return err ? VM_FAULT_SIGBUS : 0;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_pagemap_ops - Device page map operations for GPU SVM
> > > + */
> > > +static const struct dev_pagemap_ops drm_gpusvm_pagemap_ops = {
> > > +	.page_free = drm_gpusvm_page_free,
> > > +	.migrate_to_ram = drm_gpusvm_migrate_to_ram,
> > > +};
> > > +
> > > +/**
> > > + * drm_gpusvm_pagemap_ops_get - Retrieve GPU SVM device page map
> > > operations
> > > + *
> > > + * Returns:
> > > + * Pointer to the GPU SVM device page map operations structure.
> > > + */
> > > +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void)
> > > +{
> > > +	return &drm_gpusvm_pagemap_ops;
> > > +}
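
As a reference for how these pagemap ops are expected to be consumed, a rough
sketch of a driver registering its VRAM as device-private memory follows. The
resource handling and the owner pointer are illustrative only and not taken
from this series:

	static int driver_vram_pagemap_init(struct drm_device *drm,
					    struct dev_pagemap *pagemap,
					    unsigned long size, void *owner)
	{
		struct resource *res;
		void *addr;

		res = devm_request_free_mem_region(drm->dev, &iomem_resource,
						   size);
		if (IS_ERR(res))
			return PTR_ERR(res);

		pagemap->type = MEMORY_DEVICE_PRIVATE;
		pagemap->range.start = res->start;
		pagemap->range.end = res->end;
		pagemap->nr_range = 1;
		pagemap->ops = drm_gpusvm_pagemap_ops_get();
		/* Must match device_private_page_owner given to drm_gpusvm_init() */
		pagemap->owner = owner;

		addr = devm_memremap_pages(drm->dev, pagemap);
		return PTR_ERR_OR_ZERO(addr);
	}
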
> > > +
> > > +/**
> > > + * drm_gpusvm_has_mapping - Check if GPU SVM has mapping for the
> > > given address range
> > > + * @gpusvm: Pointer to the GPU SVM structure.
> > > + * @start: Start address
> > > + * @end: End address
> > > + *
> > > + * Returns:
> > > + * True if GPU SVM has mapping, False otherwise
> > > + */
> > > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start,
> > > u64 end)
> > > +{
> > > +	struct drm_gpusvm_notifier *notifier;
> > > +
> > > +	drm_gpusvm_for_each_notifier(notifier, gpusvm, start, end) {
> > > +		struct drm_gpusvm_range *range = NULL;
> > > +
> > > +		drm_gpusvm_for_each_range(range, notifier, start,
> > > end)
> > > +			return true;
> > > +	}
> > > +
> > > +	return false;
> > > +}
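
A possible (purely hypothetical) use is rejecting a conventional GPU VA bind
that overlaps an existing SVM mapping:

	if (drm_gpusvm_has_mapping(gpusvm, bind_start, bind_end))
		return -EBUSY;	/* VA range already backed by SVM */
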
> > > diff --git a/drivers/gpu/drm/xe/drm_gpusvm.h
> > > b/drivers/gpu/drm/xe/drm_gpusvm.h
> > > new file mode 100644
> > > index 000000000000..0ea70f8534a8
> > > --- /dev/null
> > > +++ b/drivers/gpu/drm/xe/drm_gpusvm.h
> > > @@ -0,0 +1,415 @@
> > > +/* SPDX-License-Identifier: MIT */
> > > +/*
> > > + * Copyright © 2024 Intel Corporation
> > > + */
> > > +
> > > +#ifndef __DRM_GPUSVM_H__
> > > +#define __DRM_GPUSVM_H__
> > > +
> > > +#include <linux/kref.h>
> > > +#include <linux/mmu_notifier.h>
> > > +#include <linux/workqueue.h>
> > > +
> > > +struct dev_pagemap_ops;
> > > +struct drm_device;
> > > +struct drm_gpusvm;
> > > +struct drm_gpusvm_notifier;
> > > +struct drm_gpusvm_ops;
> > > +struct drm_gpusvm_range;
> > > +
> > > +/**
> > > + * struct drm_gpusvm_ops - Operations structure for GPU SVM
> > > + *
> > > + * This structure defines the operations for GPU Shared Virtual
> > > Memory (SVM).
> > > + * These operations are provided by the GPU driver to manage SVM
> > > ranges and
> > > + * perform operations such as migration between VRAM and system RAM.
> > > + */
> > > +struct drm_gpusvm_ops {
> > > +	/**
> > > +	 * @notifier_alloc: Allocate a GPU SVM notifier (optional)
> > > +	 *
> > > +	 * This function shall allocate a GPU SVM notifier.
> > > +	 *
> > > +	 * Returns:
> > > +	 * Pointer to the allocated GPU SVM notifier on success,
> > > NULL on failure.
> > > +	 */
> > > +	struct drm_gpusvm_notifier *(*notifier_alloc)(void);
> > > +
> > > +	/**
> > > +	 * @notifier_free: Free a GPU SVM notifier (optional)
> > > +	 * @notifier: Pointer to the GPU SVM notifier to be freed
> > > +	 *
> > > +	 * This function shall free a GPU SVM notifier.
> > > +	 */
> > > +	void (*notifier_free)(struct drm_gpusvm_notifier *notifier);
> > > +
> > > +	/**
> > > +	 * @range_alloc: Allocate a GPU SVM range (optional)
> > > +	 * @gpusvm: Pointer to the GPU SVM
> > > +	 *
> > > +	 * This function shall allocate a GPU SVM range.
> > > +	 *
> > > +	 * Returns:
> > > +	 * Pointer to the allocated GPU SVM range on success, NULL
> > > on failure.
> > > +	 */
> > > +	struct drm_gpusvm_range *(*range_alloc)(struct drm_gpusvm
> > > *gpusvm);
> > > +
> > > +	/**
> > > +	 * @range_free: Free a GPU SVM range (optional)
> > > +	 * @range: Pointer to the GPU SVM range to be freed
> > > +	 *
> > > +	 * This function shall free a GPU SVM range.
> > > +	 */
> > > +	void (*range_free)(struct drm_gpusvm_range *range);
> > > +
> > > +	/**
> > > +	 * @vram_release: Release VRAM allocation (optional)
> > > +	 * @vram_allocation: Driver-private pointer to the VRAM
> > > allocation
> > > +	 *
> > > +	 * This function shall release VRAM allocation and expects
> > > to drop a
> > > +	 * reference to VRAM allocation.
> > > +	 */
> > > +	void (*vram_release)(void *vram_allocation);
> > > +
> > > +	/**
> > > +	 * @populate_vram_pfn: Populate VRAM PFN (required for
> > > migration)
> > > +	 * @gpusvm: Pointer to the GPU SVM
> > > +	 * @vram_allocation: Driver-private pointer to the VRAM
> > > allocation
> > > +	 * @npages: Number of pages to populate
> > > +	 * @pfn: Array of page frame numbers to populate
> > > +	 *
> > > +	 * This function shall populate VRAM page frame numbers
> > > (PFN).
> > > +	 *
> > > +	 * Returns:
> > > +	 * 0 on success, a negative error code on failure.
> > > +	 */
> > > +	int (*populate_vram_pfn)(struct drm_gpusvm *gpusvm,
> > > +				 void *vram_allocation,
> > > +				 unsigned long npages,
> > > +				 unsigned long *pfn);
> > > +
> > > +	/**
> > > +	 * @copy_to_vram: Copy to VRAM (required for migration)
> > > +	 * @gpusvm: Pointer to the GPU SVM
> > > +	 * @pages: Pointer to array of VRAM pages (destination)
> > > +	 * @dma_addr: Pointer to array of DMA addresses (source)
> > > +	 * @npages: Number of pages to copy
> > > +	 *
> > > +	 * This function shall copy pages to VRAM.
> > > +	 *
> > > +	 * Returns:
> > > +	 * 0 on success, a negative error code on failure.
> > > +	 */
> > > +	int (*copy_to_vram)(struct drm_gpusvm *gpusvm,
> > > +			    struct page **pages,
> > > +			    dma_addr_t *dma_addr,
> > > +			    unsigned long npages);
> > > +
> > > +	/**
> > > +	 * @copy_to_sram: Copy to system RAM (required for
> > > migration)
> > > +	 * @gpusvm: Pointer to the GPU SVM
> > > +	 * @pages: Pointer to array of VRAM pages (source)
> > > +	 * @dma_addr: Pointer to array of DMA addresses
> > > (destination)
> > > +	 * @npages: Number of pages to copy
> > > +	 *
> > > +	 * This function shall copy pages to system RAM.
> > > +	 *
> > > +	 * Returns:
> > > +	 * 0 on success, a negative error code on failure.
> > > +	 */
> > > +	int (*copy_to_sram)(struct drm_gpusvm *gpusvm,
> > > +			    struct page **pages,
> > > +			    dma_addr_t *dma_addr,
> > > +			    unsigned long npages);
> > > +
> > > +	/**
> > > +	 * @invalidate: Invalidate GPU SVM notifier (required)
> > > +	 * @gpusvm: Pointer to the GPU SVM
> > > +	 * @notifier: Pointer to the GPU SVM notifier
> > > +	 * @mmu_range: Pointer to the mmu_notifier_range structure
> > > +	 *
> > > +	 * This function shall invalidate the GPU page tables. It
> > > can safely
> > > +	 * walk the notifier range RB tree/list in this function.
> > > Called while
> > > +	 * holding the notifier lock.
> > > +	 */
> > > +	void (*invalidate)(struct drm_gpusvm *gpusvm,
> > > +			   struct drm_gpusvm_notifier *notifier,
> > > +			   const struct mmu_notifier_range
> > > *mmu_range);
> > > +};
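
For illustration, a driver's ops table might look roughly like the following.
All driver_* callbacks are hypothetical; only .invalidate is strictly
required, while the populate/copy hooks become required once migration is
used:

	static const struct drm_gpusvm_ops driver_gpusvm_ops = {
		.vram_release		= driver_vram_release,
		.populate_vram_pfn	= driver_populate_vram_pfn,
		.copy_to_vram		= driver_copy_to_vram,
		.copy_to_sram		= driver_copy_to_sram,
		.invalidate		= driver_invalidation,
	};
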
> > > +
> > > +/**
> > > + * struct drm_gpusvm_notifier - Structure representing a GPU SVM
> > > notifier
> > > + *
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: MMU interval notifier
> > > + * @interval: Interval for the notifier
> > > + * @rb: Red-black tree node for the parent GPU SVM structure
> > > notifier tree
> > > + * @root: Cached root node of the RB tree containing ranges
> > > + * @range_list: List head containing ranges in the same order they
> > > appear in
> > > + *              the interval tree. This is useful to keep iterating
> > > over ranges
> > > + *              while modifying the RB tree.
> > > + * @flags.removed: Flag indicating whether the MMU interval notifier
> > > has been
> > > + *                 removed
> > > + *
> > > + * This structure represents a GPU SVM notifier.
> > > + */
> > > +struct drm_gpusvm_notifier {
> > > +	struct drm_gpusvm *gpusvm;
> > > +	struct mmu_interval_notifier notifier;
> > > +	struct {
> > > +		u64 start;
> > > +		u64 end;
> > > +	} interval;
> > > +	struct {
> > > +		struct rb_node node;
> > > +		struct list_head entry;
> > > +		u64 __subtree_last;
> > > +	} rb;
> > > +	struct rb_root_cached root;
> > > +	struct list_head range_list;
> > > +	struct {
> > > +		u32 removed : 1;
> > > +	} flags;
> > > +};
> > > +
> > > +/**
> > > + * struct drm_gpusvm_range - Structure representing a GPU SVM range
> > > + *
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: Pointer to the GPU SVM notifier
> > > + * @refcount: Reference count for the range
> > > + * @rb: Red-black tree node for the parent GPU SVM notifier
> > > structure range tree
> > > + * @va: Virtual address range
> > > + * @notifier_seq: Notifier sequence number of the range's pages
> > > + * @pages: Pointer to the array of pages (if backing store is in
> > > VRAM)
> > > + * @dma_addr: DMA address array (if backing store is SRAM and DMA
> > > mapped)
> > > + * @vram_allocation: Driver-private pointer to the VRAM allocation
> > > + * @order: Order of dma mapping. i.e. PAGE_SIZE << order is mapping
> > > size
> > > + * @flags.migrate_vram: Flag indicating whether the range can be
> > > migrated to VRAM
> > > + * @flags.unmapped: Flag indicating if the range has been unmapped
> > > + * @flags.partial_unmap: Flag indicating if the range has been
> > > partially unmapped
> > > + * @flags.has_vram_pages: Flag indicating if the range has vram
> > > pages
> > > + * @flags.has_dma_mapping: Flag indicating if the range has a DMA
> > > mapping
> > > + * @flags.kfree_mapping: Flag indicating @dma_addr is a compact
> > > allocation based
> > > + *                       on @order, which is released via kfree()
> > > + *
> > > + * This structure represents a GPU SVM range used for tracking
> > > memory ranges
> > > + * mapped in a DRM device.
> > > + */
> > > +struct drm_gpusvm_range {
> > > +	struct drm_gpusvm *gpusvm;
> > > +	struct drm_gpusvm_notifier *notifier;
> > > +	struct kref refcount;
> > > +	struct {
> > > +		struct rb_node node;
> > > +		struct list_head entry;
> > > +		u64 __subtree_last;
> > > +	} rb;
> > > +	struct {
> > > +		u64 start;
> > > +		u64 end;
> > > +	} va;
> > > +	unsigned long notifier_seq;
> > > +	union {
> > > +		struct page **pages;
> > > +		dma_addr_t *dma_addr;
> > > +	};
> > > +	void *vram_allocation;
> > > +	u16 order;
> > > +	struct {
> > > +		/* All flags below must be set upon creation */
> > > +		u16 migrate_vram : 1;
> > > +		/* All flags below must be set / cleared under
> > > notifier lock */
> > > +		u16 unmapped : 1;
> > > +		u16 partial_unmap : 1;
> > > +		u16 has_vram_pages : 1;
> > > +		u16 has_dma_mapping : 1;
> > > +		u16 kfree_mapping : 1;
> > > +	} flags;
> > > +};
> > > +
> > > +/**
> > > + * struct drm_gpusvm - GPU SVM structure
> > > + *
> > > + * @name: Name of the GPU SVM
> > > + * @drm: Pointer to the DRM device structure
> > > + * @mm: Pointer to the mm_struct for the address space
> > > + * @device_private_page_owner: Device private pages owner
> > > + * @mm_start: Start address of GPU SVM
> > > + * @mm_range: Range of the GPU SVM
> > > + * @notifier_size: Size of individual notifiers
> > > + * @ops: Pointer to the operations structure for GPU SVM
> > > + * @chunk_sizes: Pointer to the array of chunk sizes used in range
> > > allocation.
> > > + *               Entries should be powers of 2 in descending order.
> > > + * @num_chunks: Number of chunks
> > > + * @notifier_lock: Read-write semaphore for protecting notifier
> > > operations
> > > + * @zdd_wq: Workqueue for deferred work on zdd destruction
> > > + * @root: Cached root node of the Red-Black tree containing GPU SVM
> > > notifiers
> > > + * @notifier_list: List head containing notifiers in the same order
> > > they
> > > + *                 appear in the interval tree. This is useful to
> > > keep iterating
> > > + *                 over notifiers while modifying the RB tree.
> > > + *
> > > + * This structure represents a GPU SVM (Shared Virtual Memory) used
> > > for tracking
> > > + * memory ranges mapped in a DRM (Direct Rendering Manager) device.
> > > + *
> > > + * No reference counting is provided, as this is expected to be
> > > embedded in the
> > > + * driver VM structure along with the struct drm_gpuvm, which
> > > handles reference
> > > + * counting.
> > > + */
> > > +struct drm_gpusvm {
> > > +	const char *name;
> > > +	struct drm_device *drm;
> > > +	struct mm_struct *mm;
> > > +	void *device_private_page_owner;
> > > +	u64 mm_start;
> > > +	u64 mm_range;
> > > +	u64 notifier_size;
> > > +	const struct drm_gpusvm_ops *ops;
> > > +	const u64 *chunk_sizes;
> > > +	int num_chunks;
> > > +	struct rw_semaphore notifier_lock;
> > > +	struct workqueue_struct *zdd_wq;
> > > +	struct rb_root_cached root;
> > > +	struct list_head notifier_list;
> > > +};
> > > +
> > > +/**
> > > + * struct drm_gpusvm_ctx - DRM GPU SVM context
> > > + *
> > > + * @mmap_locked: mmap lock is locked
> > > + * @trylock_mmap: trylock mmap lock, used to avoid locking
> > > inversions
> > > + *                (e.g. dma-resv -> mmap lock)
> > > + * @in_notifier: entering from a MMU notifier
> > > + * @read_only: operating on read-only memory
> > > + * @vram_possible: possible to use VRAM
> > > + * @prefault: prefault pages
> > > + *
> > > + * Context in which DRM GPU SVM is operating (i.e. user arguments).
> > > + */
> > > +struct drm_gpusvm_ctx {
> > > +	u32 mmap_locked :1;
> > > +	u32 trylock_mmap :1;
> > > +	u32 in_notifier :1;
> > > +	u32 read_only :1;
> > > +	u32 vram_possible :1;
> > > +	u32 prefault :1;
> > > +};
> > > +
> > > +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> > > +		    const char *name, struct drm_device *drm,
> > > +		    struct mm_struct *mm, void
> > > *device_private_page_owner,
> > > +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> > > +		    const struct drm_gpusvm_ops *ops,
> > > +		    const u64 *chunk_sizes, int num_chunks);
> > > +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm);
> > > +void drm_gpusvm_free(struct drm_gpusvm *gpusvm);
> > > +
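
A hedged example of initialization, assuming a 48-bit user VA range, 512M
notifiers and 2M/64K/4K chunks; the concrete values and the driver-side names
(vm, owner, driver_gpusvm_ops) are illustrative, not mandated by the
interface:

	static const u64 driver_chunk_sizes[] = { SZ_2M, SZ_64K, SZ_4K };

	err = drm_gpusvm_init(&vm->gpusvm, "driver-svm", drm, current->mm,
			      owner /* device private page owner */,
			      0, BIT_ULL(48), SZ_512M, &driver_gpusvm_ops,
			      driver_chunk_sizes,
			      ARRAY_SIZE(driver_chunk_sizes));
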
> > > +struct drm_gpusvm_range *
> > > +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64
> > > fault_addr,
> > > +				u64 gpuva_start, u64 gpuva_end,
> > > +				const struct drm_gpusvm_ctx *ctx);
> > > +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> > > +			     struct drm_gpusvm_range *range);
> > > +
> > > +struct drm_gpusvm_range *
> > > +drm_gpusvm_range_get(struct drm_gpusvm_range *range);
> > > +void drm_gpusvm_range_put(struct drm_gpusvm_range *range);
> > > +
> > > +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> > > +				  struct drm_gpusvm_range *range);
> > > +
> > > +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       const struct drm_gpusvm_ctx *ctx);
> > > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > > +				  struct drm_gpusvm_range *range,
> > > +				  const struct drm_gpusvm_ctx *ctx);
> > > +
> > > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       void *vram_allocation,
> > > +			       const struct drm_gpusvm_ctx *ctx);
> > > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       const struct drm_gpusvm_ctx *ctx);
> > > +
> > > +const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void);
> > > +
> > > +bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start,
> > > u64 end);
> > > +
> > > +struct drm_gpusvm_range *
> > > +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64
> > > start, u64 end);
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_lock - Lock GPU SVM notifier
> > > + * @gpusvm__: Pointer to the GPU SVM structure.
> > > + *
> > > + * Abstraction for client usage of the GPU SVM notifier lock; takes the lock.
> > > + */
> > > +#define drm_gpusvm_notifier_lock(gpusvm__)	\
> > > +	down_read(&(gpusvm__)->notifier_lock)
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_unlock - Unlock GPU SVM notifier
> > > + * @gpusvm__: Pointer to the GPU SVM structure.
> > > + *
> > > + * Abstraction for client usage of the GPU SVM notifier lock; drops the lock.
> > > + */
> > > +#define drm_gpusvm_notifier_unlock(gpusvm__)	\
> > > +	up_read(&(gpusvm__)->notifier_lock)
> > > +
> > > +/**
> > > + * __drm_gpusvm_range_next - Get the next GPU SVM range in the list
> > > + * @range: a pointer to the current GPU SVM range
> > > + *
> > > + * Return: A pointer to the next drm_gpusvm_range if available, or
> > > NULL if the
> > > + *         current range is the last one or if the input range is
> > > NULL.
> > > + */
> > > +static inline struct drm_gpusvm_range *
> > > +__drm_gpusvm_range_next(struct drm_gpusvm_range *range)
> > > +{
> > > +	if (range && !list_is_last(&range->rb.entry,
> > > +				   &range->notifier->range_list))
> > > +		return list_next_entry(range, rb.entry);
> > > +
> > > +	return NULL;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_for_each_range - Iterate over GPU SVM ranges in a
> > > notifier
> > > + * @range__: Iterator variable for the ranges. If set, it indicates
> > > the start of
> > > + *	     the iterator. If NULL, call drm_gpusvm_range_find() to
> > > get the range.
> > > + * @notifier__: Pointer to the GPU SVM notifier
> > > + * @start__: Start address of the range
> > > + * @end__: End address of the range
> > > + *
> > > + * This macro is used to iterate over GPU SVM ranges in a notifier.
> > > It is safe
> > > + * to use while holding the driver SVM lock or the notifier lock.
> > > + */
> > > +#define drm_gpusvm_for_each_range(range__, notifier__, start__,
> > > end__)	\
> > > +	for ((range__) = (range__)
> > > ?:					\
> > > +	     drm_gpusvm_range_find((notifier__), (start__),
> > > (end__));	\
> > > +	     (range__) && (range__->va.start <
> > > (end__));		\
> > > +	     (range__) = __drm_gpusvm_range_next(range__))
> > > +
> > > +/**
> > > + * drm_gpusvm_range_set_unmapped - Mark a GPU SVM range as unmapped
> > > + * @range: Pointer to the GPU SVM range structure.
> > > + * @mmu_range: Pointer to the MMU notifier range structure.
> > > + *
> > > + * This function marks a GPU SVM range as unmapped and sets the
> > > partial_unmap flag
> > > + * if the range partially falls within the provided MMU notifier
> > > range.
> > > + */
> > > +static inline void
> > > +drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range *range,
> > > +			      const struct mmu_notifier_range
> > > *mmu_range)
> > > +{
> > > +	lockdep_assert_held_write(&range->gpusvm->notifier_lock);
> > > +
> > > +	range->flags.unmapped = true;
> > > +	if (range->va.start < mmu_range->start ||
> > > +	    range->va.end > mmu_range->end)
> > > +		range->flags.partial_unmap = true;
> > > +}
> > > +
> > > +#endif /* __DRM_GPUSVM_H__ */
> >
Thomas Hellström Oct. 9, 2024, 10:50 a.m. UTC | #41
Hi, Matthew.

Some comments below around migrating to SRAM.


On Tue, 2024-08-27 at 19:48 -0700, Matthew Brost wrote:
> This patch introduces support for GPU Shared Virtual Memory (SVM) in
> the
> Direct Rendering Manager (DRM) subsystem. SVM allows for seamless
> sharing of memory between the CPU and GPU, enhancing performance and
> flexibility in GPU computing tasks.
> 
> The patch adds the necessary infrastructure for SVM, including data
> structures and functions for managing SVM ranges and notifiers. It
> also
> provides mechanisms for allocating, deallocating, and migrating
> memory
> regions between system RAM and GPU VRAM.
> 
> This mid-layer is largely inspired by GPUVM.
> 
> Cc: Dave Airlie <airlied@redhat.com>
> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> Cc: Christian König <christian.koenig@amd.com>
> Cc: <dri-devel@lists.freedesktop.org>
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>  drivers/gpu/drm/xe/Makefile     |    3 +-
>  drivers/gpu/drm/xe/drm_gpusvm.c | 2174
> +++++++++++++++++++++++++++++++
>  drivers/gpu/drm/xe/drm_gpusvm.h |  415 ++++++
>  3 files changed, 2591 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.c
>  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.h
> 
> diff --git a/drivers/gpu/drm/xe/Makefile
> b/drivers/gpu/drm/xe/Makefile
> index b9670ae09a9e..b8fc2ee58f1a 100644
> --- a/drivers/gpu/drm/xe/Makefile
> +++ b/drivers/gpu/drm/xe/Makefile
> @@ -25,7 +25,8 @@ $(obj)/generated/%_wa_oob.c
> $(obj)/generated/%_wa_oob.h: $(obj)/xe_gen_wa_oob \
>  
>  # core driver code
>  
> -xe-y += xe_bb.o \
> +xe-y += drm_gpusvm.o \
> +	xe_bb.o \
>  	xe_bo.o \
>  	xe_bo_evict.o \
>  	xe_devcoredump.o \
> diff --git a/drivers/gpu/drm/xe/drm_gpusvm.c
> b/drivers/gpu/drm/xe/drm_gpusvm.c
> new file mode 100644
> index 000000000000..fc1e44e6ae72
> --- /dev/null
> +++ b/drivers/gpu/drm/xe/drm_gpusvm.c
> @@ -0,0 +1,2174 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2024 Intel Corporation
> + *
> + * Authors:
> + *     Matthew Brost <matthew.brost@intel.com>
> + */
> +
> +#include <linux/dma-mapping.h>
> +#include <linux/interval_tree_generic.h>
> +#include <linux/hmm.h>
> +#include <linux/memremap.h>
> +#include <linux/migrate.h>
> +#include <linux/mm_types.h>
> +#include <linux/pagemap.h>
> +#include <linux/slab.h>
> +
> +#include <drm/drm_device.h>
> +#include "drm_gpusvm.h"
> +
> +/**
> + * DOC: Overview
> + *
> + * GPU Shared Virtual Memory (GPU SVM) layer for the Direct
> Rendering Manager (DRM)
> + *
> + * The GPU SVM layer is a component of the DRM framework designed to
> manage shared
> + * virtual memory between the CPU and GPU. It enables efficient data
> exchange and
> + * processing for GPU-accelerated applications by allowing memory
> sharing and
> + * synchronization between the CPU's and GPU's virtual address
> spaces.
> + *
> + * Key GPU SVM Components:
> + * - Notifiers: Used for tracking memory intervals and
> notifying the
> + *		GPU of changes, notifiers are sized based on a GPU
> SVM
> + *		initialization parameter, with a recommendation of
> 512M or
> + *		larger. They maintain a Red-Black tree and a list of
> ranges that
> + *		fall within the notifier interval. Notifiers are
> tracked within
> + *		a GPU SVM Red-Black tree and list and are
> dynamically inserted
> + *		or removed as ranges within the interval are created
> or
> + *		destroyed.
> + * - Ranges: Represent memory ranges mapped in a DRM device and
> managed
> + *	     by GPU SVM. They are sized based on an array of chunk
> sizes, which
> + *	     is a GPU SVM initialization parameter, and the CPU
> address space.
> + *	     Upon GPU fault, the largest aligned chunk that fits
> within the
> + *	     faulting CPU address space is chosen for the range
> size. Ranges are
> + *	     expected to be dynamically allocated on GPU fault and
> removed on an
> + *	     MMU notifier UNMAP event. As mentioned above, ranges
> are tracked in
> + *	     a notifier's Red-Black tree.
> + * - Operations: Define the interface for driver-specific SVM
> operations such as
> + *		 allocation, page collection, migration,
> invalidations, and VRAM
> + *		 release.
> + *
> + * This layer provides interfaces for allocating, mapping,
> migrating, and
> + * releasing memory ranges between the CPU and GPU. It handles all
> core memory
> + * management interactions (DMA mapping, HMM, and migration) and
> provides
> + * driver-specific virtual functions (vfuncs). This infrastructure
> is sufficient
> + * to build the expected driver components for an SVM implementation
> as detailed
> + * below.
> + *
> + * Expected Driver Components:
> + * - GPU page fault handler: Used to create ranges and notifiers
> based on the
> + *			     fault address, optionally migrate the
> range to
> + *			     VRAM, and create GPU bindings.
> + * - Garbage collector: Used to destroy GPU bindings for ranges.
> Ranges are
> + *			expected to be added to the garbage
> collector upon
> + *			MMU_NOTIFY_UNMAP event.
> + */
> +
> +/**
> + * DOC: Locking
> + *
> + * GPU SVM handles locking for core MM interactions, i.e., it
> locks/unlocks the
> + * mmap lock as needed. Alternatively, if the driver prefers to
> handle the mmap
> + * lock itself, a 'locked' argument is provided to the functions
> that require
> + * the mmap lock. This option may be useful for drivers that need to
> call into
> + * GPU SVM while also holding a dma-resv lock, thus preventing
> locking
> + * inversions between the mmap and dma-resv locks.
> + *
> + * GPU SVM introduces a global notifier lock, which safeguards the
> notifier's
> + * range RB tree and list, as well as the range's DMA mappings and
> sequence
> + * number. GPU SVM manages all necessary locking and unlocking
> operations,
> + * except for the recheck of the range's sequence number
> + * (mmu_interval_read_retry) when the driver is committing GPU
> bindings. This
> + * lock corresponds to the 'driver->update' lock mentioned in the
> HMM
> + * documentation (TODO: Link). Future revisions may transition from
> a GPU SVM
> + * global lock to a per-notifier lock if finer-grained locking is
> deemed
> + * necessary.
> + *
> + * In addition to the locking mentioned above, the driver should
> implement a
> + * lock to safeguard core GPU SVM function calls that modify state,
> such as
> + * drm_gpusvm_range_find_or_insert and drm_gpusvm_range_remove.
> Alternatively,
> + * these core functions can be called within a single kernel thread,
> for
> + * instance, using an ordered work queue. This lock is denoted as
> + * 'driver_svm_lock' in code examples.
> + */
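
The 'driver_svm_lock' referred to here is deliberately left to the driver; a
minimal sketch of what it could be, e.g. a mutex embedded next to the gpusvm
in the driver's VM structure, with driver_svm_lock()/driver_svm_unlock()
wrapping mutex_lock()/mutex_unlock() (illustrative only):

	struct driver_vm {
		struct drm_gpusvm gpusvm;
		/* serializes drm_gpusvm_range_find_or_insert/_remove */
		struct mutex svm_lock;
	};
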
> +
> +/**
> + * DOC: Migration
> + *
> + * The migration support is quite simple, allowing migration between
> SRAM and
> + * VRAM at the range granularity. For example, GPU SVM currently
> does not
> + * support mixing SRAM and VRAM pages within a range. This means
> that upon GPU
> + * fault, the entire range can be migrated to VRAM, and upon CPU
> fault, the
> + * entire range is migrated to SRAM.
> + *
> + * The reasoning for only supporting range granularity is as
> follows: it
> + * simplifies the implementation, and range sizes are driver-defined
> and should
> + * be relatively small.
> + */
> +
> +/**
> + * DOC: Partial Unmapping of Ranges
> + *
> + * Partial unmapping of ranges (e.g., 1M out of 2M is unmapped by
> CPU resulting
> + * in MMU_NOTIFY_UNMAP event) presents several challenges, with the
> main one
> + * being that a subset of the range still has CPU and GPU mappings.
> If the
> + * backing store for the range is in VRAM, a subset of the backing
> store has
> + * references. One option would be to split the range and VRAM
> backing store,
> + * but the implementation for this would be quite complicated. Given
> that
> + * partial unmappings are rare and driver-defined range sizes are
> relatively
> + * small, GPU SVM does not support splitting of ranges.
> + *
> + * With no support for range splitting, upon partial unmapping of a
> range, the
> + * driver is expected to invalidate and destroy the entire range. If
> the range
> + * has VRAM as its backing, the driver is also expected to migrate
> any remaining
> + * pages back to SRAM.
> + */
> +
> +/**
> + * DOC: Examples
> + *
> + * This section provides two examples of how to build the expected
> driver
> + * components: the GPU page fault handler and the garbage collector.
> A third
> + * example demonstrates a sample invalidation driver vfunc.
> + *
> + * The generic code provided does not include logic for complex
> migration
> + * policies, optimized invalidations, or other potentially required
> driver
> + * locking (e.g., DMA-resv locks).
> + *
> + * 1) GPU page fault handler
> + *
> + *	int driver_bind_range(struct drm_gpusvm *gpusvm, struct
> drm_gpusvm_range *range)
> + *	{
> + *		int err = 0;
> + *
> + *		driver_alloc_and_setup_memory_for_bind(gpusvm,
> range);
> + *
> + *		drm_gpusvm_notifier_lock(gpusvm);
> + *		if (drm_gpusvm_range_pages_valid(gpusvm, range))
> + *			driver_commit_bind(gpusvm, range);
> + *		else
> + *			err = -EAGAIN;
> + *		drm_gpusvm_notifier_unlock(gpusvm);
> + *
> + *		return err;
> + *	}
> + *
> + *	int driver_gpu_fault(struct drm_gpusvm *gpusvm, u64
> fault_addr,
> + *			     u64 gpuva_start, u64 gpuva_end)
> + *	{
> + *		struct drm_gpusvm_ctx ctx = {};
> + *		int err;
> + *
> + *		driver_svm_lock();
> + *	retry:
> + *		// Always process UNMAPs first so view of GPU SVM
> ranges is current
> + *		driver_garbage_collector(gpusvm);
> + *
> + *		range = drm_gpusvm_range_find_or_insert(gpusvm,
> fault_addr,
> + *							gpuva_start,
> gpuva_end,
> + *						        &ctx);
> + *		if (IS_ERR(range)) {
> + *			err = PTR_ERR(range);
> + *			goto unlock;
> + *		}
> + *
> + *		if (driver_migration_policy(range)) {
> + *			bo = driver_alloc_bo();
> + *			err = drm_gpusvm_migrate_to_vram(gpusvm,
> range, bo, &ctx);
> + *			if (err)	// CPU mappings may have
> changed
> + *				goto retry;
> + *		}
> + *
> + *		err = drm_gpusvm_range_get_pages(gpusvm, range,
> &ctx);
> + *		if (err == -EFAULT || err == -EPERM)	// CPU
> mappings changed
> + *			goto retry;
> + *		else if (err)
> + *			goto unlock;
> + *
> + *		err = driver_bind_range(gpusvm, range);
> + *		if (err == -EAGAIN)	// CPU mappings changed
> + *			goto retry;
> + *
> + *	unlock:
> + *		driver_svm_unlock();
> + *		return err;
> + *	}
> + *
> + * 2) Garbage Collector.
> + *
> + *	void __driver_garbage_collector(struct drm_gpusvm *gpusvm,
> + *					struct drm_gpusvm_range
> *range)
> + *	{
> + *		struct drm_gpusvm_ctx ctx = {};
> + *
> + *		assert_driver_svm_locked(gpusvm);
> + *
> + *		// Partial unmap, migrate any remaining VRAM pages
> back to SRAM
> + *		if (range->flags.partial_unmap)
> + *			drm_gpusvm_migrate_to_sram(gpusvm, range,
> &ctx);
> + *
> + *		driver_unbind_range(range);
> + *		drm_gpusvm_range_remove(gpusvm, range);
> + *	}
> + *
> + *	void driver_garbage_collector(struct drm_gpusvm *gpusvm)
> + *	{
> + *		assert_driver_svm_locked(gpusvm);
> + *
> + *		for_each_range_in_garbage_collector(gpusvm, range)
> + *			__driver_garbage_collector(gpusvm, range);
> + *	}
> + *
> + * 3) Invalidation driver vfunc.
> + *
> + *	void driver_invalidation(struct drm_gpusvm *gpusvm,
> + *				 struct drm_gpusvm_notifier
> *notifier,
> + *				 const struct mmu_notifier_range
> *mmu_range)
> + *	{
> + *		struct drm_gpusvm_ctx ctx = { .in_notifier = true,
> };
> + *		struct drm_gpusvm_range *range = NULL;
> + *
> + *		driver_invalidate_device_tlb(gpusvm, mmu_range-
> >start, mmu_range->end);
> + *
> + *		drm_gpusvm_for_each_range(range, notifier,
> mmu_range->start,
> + *					  mmu_range->end) {
> + *			drm_gpusvm_range_unmap_pages(gpusvm, range,
> &ctx);
> + *
> + *			if (mmu_range->event != MMU_NOTIFY_UNMAP)
> + *				continue;
> + *
> + *			drm_gpusvm_range_set_unmapped(range,
> mmu_range);
> + *			driver_garbage_collector_add(gpusvm, range);
> + *		}
> + *	}
> + */
> +
> +#define DRM_GPUSVM_RANGE_START(_range)	((_range)->va.start)
> +#define DRM_GPUSVM_RANGE_END(_range)	((_range)->va.end - 1)
> +INTERVAL_TREE_DEFINE(struct drm_gpusvm_range, rb.node, u64,
> rb.__subtree_last,
> +		     DRM_GPUSVM_RANGE_START, DRM_GPUSVM_RANGE_END,
> +		     static __maybe_unused, range);
> +
> +#define DRM_GPUSVM_NOTIFIER_START(_notifier)	((_notifier)-
> >interval.start)
> +#define DRM_GPUSVM_NOTIFIER_END(_notifier)	((_notifier)-
> >interval.end - 1)
> +INTERVAL_TREE_DEFINE(struct drm_gpusvm_notifier, rb.node, u64,
> +		     rb.__subtree_last, DRM_GPUSVM_NOTIFIER_START,
> +		     DRM_GPUSVM_NOTIFIER_END, static __maybe_unused,
> notifier);
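
As a reading aid: INTERVAL_TREE_DEFINE() generates the static helpers used
throughout this file, so the range instantiation above roughly provides
(signatures abbreviated, see include/linux/interval_tree_generic.h):

	static void range_insert(struct drm_gpusvm_range *node,
				 struct rb_root_cached *root);
	static void range_remove(struct drm_gpusvm_range *node,
				 struct rb_root_cached *root);
	static struct drm_gpusvm_range *
	range_iter_first(struct rb_root_cached *root, u64 start, u64 last);
	static struct drm_gpusvm_range *
	range_iter_next(struct drm_gpusvm_range *node, u64 start, u64 last);

and likewise notifier_insert()/notifier_iter_first()/... for the notifier
tree.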
> +
> +/**
> + * npages_in_range() - Calculate the number of pages in a given
> range
> + * @start__: The start address of the range
> + * @end__: The end address of the range
> + *
> + * This macro calculates the number of pages in a given memory
> range,
> + * specified by the start and end addresses. It divides the
> difference
> + * between the end and start addresses by the page size (PAGE_SIZE)
> to
> + * determine the number of pages in the range.
> + *
> + * Return: The number of pages in the specified range.
> + */
> +#define npages_in_range(start__, end__)	\
> +	(((end__) - (start__)) >> PAGE_SHIFT)
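
For example, with 4K pages (PAGE_SHIFT == 12):

	npages_in_range(0x7f0000200000, 0x7f0000400000)
		== 0x200000 >> 12 == 512 pages
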
> +
> +/**
> + * struct drm_gpusvm_zdd - GPU SVM zone device data
> + *
> + * @refcount: Reference count for the zdd
> + * @destroy_work: Work structure for asynchronous zdd destruction
> + * @range: Pointer to the GPU SVM range
> + * @vram_allocation: Driver-private pointer to the VRAM allocation
> + *
> + * This structure serves as a generic wrapper installed in
> + * page->zone_device_data. It provides infrastructure for looking up
> a range
> + * upon CPU page fault and asynchronously releasing VRAM once the
> CPU has no
> + * page references. Asynchronous release is useful because CPU page
> references
> + * can be dropped in IRQ contexts, while releasing VRAM likely
> requires sleeping
> + * locks.
> + */
> +struct drm_gpusvm_zdd {
> +	struct kref refcount;
> +	struct work_struct destroy_work;
> +	struct drm_gpusvm_range *range;
> +	void *vram_allocation;
> +};
> +
> +/**
> + * drm_gpusvm_zdd_destroy_work_func - Work function for destroying a
> zdd
> + * @w: Pointer to the work_struct
> + *
> + * This function releases VRAM, puts GPU SVM range, and frees zdd.
> + */
> +static void drm_gpusvm_zdd_destroy_work_func(struct work_struct *w)
> +{
> +	struct drm_gpusvm_zdd *zdd =
> +		container_of(w, struct drm_gpusvm_zdd,
> destroy_work);
> +	struct drm_gpusvm_range *range = zdd->range;
> +	struct drm_gpusvm *gpusvm = range->gpusvm;
> +
> +	if (gpusvm->ops->vram_release && zdd->vram_allocation)
> +		gpusvm->ops->vram_release(zdd->vram_allocation);
> +	drm_gpusvm_range_put(range);
> +	kfree(zdd);
> +}
> +
> +/**
> + * drm_gpusvm_zdd_alloc - Allocate a zdd structure.
> + * @range: Pointer to the GPU SVM range.
> + *
> + * This function allocates and initializes a new zdd structure. It
> sets up the
> + * reference count, initializes the destroy work, and links the
> provided GPU SVM
> + * range.
> + *
> + * Returns:
> + * Pointer to the allocated zdd on success, ERR_PTR() on failure.
> + */
> +static struct drm_gpusvm_zdd *
> +drm_gpusvm_zdd_alloc(struct drm_gpusvm_range *range)
> +{
> +	struct drm_gpusvm_zdd *zdd;
> +
> +	zdd = kmalloc(sizeof(*zdd), GFP_KERNEL);
> +	if (!zdd)
> +		return NULL;
> +
> +	kref_init(&zdd->refcount);
> +	INIT_WORK(&zdd->destroy_work,
> drm_gpusvm_zdd_destroy_work_func);
> +	zdd->range = drm_gpusvm_range_get(range);
> +	zdd->vram_allocation = NULL;
> +
> +	return zdd;
> +}
> +
> +/**
> + * drm_gpusvm_zdd_get - Get a reference to a zdd structure.
> + * @zdd: Pointer to the zdd structure.
> + *
> + * This function increments the reference count of the provided zdd
> structure.
> + *
> + * Returns: Pointer to the zdd structure.
> + */
> +static struct drm_gpusvm_zdd *drm_gpusvm_zdd_get(struct
> drm_gpusvm_zdd *zdd)
> +{
> +	kref_get(&zdd->refcount);
> +	return zdd;
> +}
> +
> +/**
> + * drm_gpusvm_zdd_destroy - Destroy a zdd structure.
> + * @ref: Pointer to the reference count structure.
> + *
> + * This function queues the destroy_work of the zdd for asynchronous
> destruction.
> + */
> +static void drm_gpusvm_zdd_destroy(struct kref *ref)
> +{
> +	struct drm_gpusvm_zdd *zdd =
> +		container_of(ref, struct drm_gpusvm_zdd, refcount);
> +	struct drm_gpusvm *gpusvm = zdd->range->gpusvm;
> +
> +	queue_work(gpusvm->zdd_wq, &zdd->destroy_work);
> +}
> +
> +/**
> + * drm_gpusvm_zdd_put - Put a zdd reference.
> + * @zdd: Pointer to the zdd structure.
> + *
> + * This function decrements the reference count of the provided zdd
> structure
> + * and schedules its destruction if the count drops to zero.
> + */
> +static void drm_gpusvm_zdd_put(struct drm_gpusvm_zdd *zdd)
> +{
> +	kref_put(&zdd->refcount, drm_gpusvm_zdd_destroy);
> +}
> +
> +/**
> + * drm_gpusvm_range_find - Find GPU SVM range from GPU SVM notifier
> + * @notifier: Pointer to the GPU SVM notifier structure.
> + * @start: Start address of the range
> + * @end: End address of the range
> + *
> + * Return: A pointer to the drm_gpusvm_range if found or NULL
> + */
> +struct drm_gpusvm_range *
> +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64
> start, u64 end)
> +{
> +	return range_iter_first(&notifier->root, start, end - 1);
> +}
> +
> +/**
> + * drm_gpusvm_for_each_range_safe - Safely iterate over GPU SVM
> ranges in a notifier
> + * @range__: Iterator variable for the ranges
> + * @next__: Iterator variable for temporary storage of the next range
> + * @notifier__: Pointer to the GPU SVM notifier
> + * @start__: Start address of the range
> + * @end__: End address of the range
> + *
> + * This macro is used to iterate over GPU SVM ranges in a notifier
> while
> + * removing ranges from it.
> + */
> +#define drm_gpusvm_for_each_range_safe(range__, next__, notifier__,
> start__, end__)	\
> +	for ((range__) = drm_gpusvm_range_find((notifier__),
> (start__), (end__)),	\
> +	     (next__) =
> __drm_gpusvm_range_next(range__);				\
> +	     (range__) && (range__->va.start <
> (end__));				\
> +	     (range__) = (next__), (next__) =
> __drm_gpusvm_range_next(range__))
> +
> +/**
> + * __drm_gpusvm_notifier_next - get the next drm_gpusvm_notifier in
> the list
> + * @notifier: a pointer to the current drm_gpusvm_notifier
> + *
> + * Return: A pointer to the next drm_gpusvm_notifier if available,
> or NULL if
> + *         the current notifier is the last one or if the input
> notifier is
> + *         NULL.
> + */
> +static struct drm_gpusvm_notifier *
> +__drm_gpusvm_notifier_next(struct drm_gpusvm_notifier *notifier)
> +{
> +	if (notifier && !list_is_last(&notifier->rb.entry,
> +				      &notifier->gpusvm-
> >notifier_list))
> +		return list_next_entry(notifier, rb.entry);
> +
> +	return NULL;
> +}
> +
> +/**
> + * drm_gpusvm_for_each_notifier - Iterate over GPU SVM notifiers in
> a gpusvm
> + * @notifier__: Iterator variable for the notifiers
> + * @gpusvm__: Pointer to the GPU SVM structure
> + * @start__: Start address of the notifier
> + * @end__: End address of the notifier
> + *
> + * This macro is used to iterate over GPU SVM notifiers in a gpusvm.
> + */
> +#define drm_gpusvm_for_each_notifier(notifier__, gpusvm__, start__,
> end__)		\
> +	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root,
> (start__), (end__) - 1);	\
> +	     (notifier__) && (notifier__->interval.start <
> (end__));			\
> +	     (notifier__) = __drm_gpusvm_notifier_next(notifier__))
> +
> +/**
> + * drm_gpusvm_for_each_notifier_safe - Safely iterate over GPU SVM
> notifiers in a gpusvm
> + * @notifier__: Iterator variable for the notifiers
> + * @next__: Iterator variable for temporary storage of the next notifier
> + * @gpusvm__: Pointer to the GPU SVM structure
> + * @start__: Start address of the notifier
> + * @end__: End address of the notifier
> + *
> + * This macro is used to iterate over GPU SVM notifiers in a gpusvm
> while
> + * removing notifiers from it.
> + */
> +#define drm_gpusvm_for_each_notifier_safe(notifier__, next__,
> gpusvm__, start__, end__)	\
> +	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root,
> (start__), (end__) - 1),	\
> +	     (next__) =
> __drm_gpusvm_notifier_next(notifier__);				\
> +	     (notifier__) && (notifier__->interval.start <
> (end__));			\
> +	     (notifier__) = (next__), (next__) =
> __drm_gpusvm_notifier_next(notifier__))
> +
> +/**
> + * drm_gpusvm_notifier_invalidate - Invalidate a GPU SVM notifier.
> + * @mni: Pointer to the mmu_interval_notifier structure.
> + * @mmu_range: Pointer to the mmu_notifier_range structure.
> + * @cur_seq: Current sequence number.
> + *
> + * This function serves as a generic MMU notifier for GPU SVM. It
> sets the MMU
> + * notifier sequence number and calls the driver invalidate vfunc
> under
> + * gpusvm->notifier_lock.
> + *
> + * Returns:
> + * true if the operation succeeds, false otherwise.
> + */
> +static bool
> +drm_gpusvm_notifier_invalidate(struct mmu_interval_notifier *mni,
> +			       const struct mmu_notifier_range
> *mmu_range,
> +			       unsigned long cur_seq)
> +{
> +	struct drm_gpusvm_notifier *notifier =
> +		container_of(mni, typeof(*notifier), notifier);
> +	struct drm_gpusvm *gpusvm = notifier->gpusvm;
> +
> +	if (!mmu_notifier_range_blockable(mmu_range))
> +		return false;
> +
> +	down_write(&gpusvm->notifier_lock);
> +	mmu_interval_set_seq(mni, cur_seq);
> +	gpusvm->ops->invalidate(gpusvm, notifier, mmu_range);
> +	up_write(&gpusvm->notifier_lock);
> +
> +	return true;
> +}
> +
> +/**
> + * drm_gpusvm_notifier_ops - MMU interval notifier operations for
> GPU SVM
> + */
> +static const struct mmu_interval_notifier_ops
> drm_gpusvm_notifier_ops = {
> +	.invalidate = drm_gpusvm_notifier_invalidate,
> +};
> +
> +/**
> + * drm_gpusvm_init - Initialize the GPU SVM.
> + * @gpusvm: Pointer to the GPU SVM structure.
> + * @name: Name of the GPU SVM.
> + * @drm: Pointer to the DRM device structure.
> + * @mm: Pointer to the mm_struct for the address space.
> + * @device_private_page_owner: Device private pages owner.
> + * @mm_start: Start address of GPU SVM.
> + * @mm_range: Range of the GPU SVM.
> + * @notifier_size: Size of individual notifiers.
> + * @ops: Pointer to the operations structure for GPU SVM.
> + * @chunk_sizes: Pointer to the array of chunk sizes used in range
> allocation.
> + *               Entries should be powers of 2 in descending order
> with last
> + *               entry being SZ_4K.
> + * @num_chunks: Number of chunks.
> + *
> + * This function initializes the GPU SVM.
> + *
> + * Returns:
> + * 0 on success, a negative error code on failure.
> + */
> +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> +		    const char *name, struct drm_device *drm,
> +		    struct mm_struct *mm, void
> *device_private_page_owner,
> +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> +		    const struct drm_gpusvm_ops *ops,
> +		    const u64 *chunk_sizes, int num_chunks)
> +{
> +	if (!ops->invalidate || !num_chunks)
> +		return -EINVAL;
> +
> +	gpusvm->name = name;
> +	gpusvm->drm = drm;
> +	gpusvm->mm = mm;
> +	gpusvm->device_private_page_owner =
> device_private_page_owner;
> +	gpusvm->mm_start = mm_start;
> +	gpusvm->mm_range = mm_range;
> +	gpusvm->notifier_size = notifier_size;
> +	gpusvm->ops = ops;
> +	gpusvm->chunk_sizes = chunk_sizes;
> +	gpusvm->num_chunks = num_chunks;
> +	gpusvm->zdd_wq = system_wq;
> +
> +	mmgrab(mm);
> +	gpusvm->root = RB_ROOT_CACHED;
> +	INIT_LIST_HEAD(&gpusvm->notifier_list);
> +
> +	init_rwsem(&gpusvm->notifier_lock);
> +
> +	fs_reclaim_acquire(GFP_KERNEL);
> +	might_lock(&gpusvm->notifier_lock);
> +	fs_reclaim_release(GFP_KERNEL);
> +
> +	return 0;
> +}
> +
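
As a rough illustration of how a driver might wire this up (this is not taken
from the series; the driver_* names, struct driver_vm, the 47-bit VA bound and
the dev_private_owner pointer are all assumptions made for the sketch),
initialization could look roughly like:

	/* Chunk sizes: powers of 2, descending, last entry SZ_4K as required */
	static const u64 driver_chunk_sizes[] = { SZ_2M, SZ_64K, SZ_4K };

	static const struct drm_gpusvm_ops driver_gpusvm_ops = {
		/* Only ->invalidate is mandatory for drm_gpusvm_init() */
		.invalidate		= driver_invalidate,
		/* These three are needed if VRAM migration is used */
		.populate_vram_pfn	= driver_populate_vram_pfn,
		.copy_to_vram		= driver_copy_to_vram,
		.copy_to_sram		= driver_copy_to_sram,
		.vram_release		= driver_vram_release,
	};

	static int driver_svm_init(struct driver_vm *vm)
	{
		/* 512M notifiers per the recommendation in DOC: Overview */
		return drm_gpusvm_init(&vm->gpusvm, "driver-svm", vm->drm,
				       current->mm, vm->dev_private_owner,
				       0, 1ull << 47 /* assumed CPU VA size */,
				       SZ_512M, &driver_gpusvm_ops,
				       driver_chunk_sizes,
				       ARRAY_SIZE(driver_chunk_sizes));
	}
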
> +/**
> + * drm_gpusvm_notifier_find - Find GPU SVM notifier
> + * @gpusvm__: Pointer to the GPU SVM structure
> + * @fault_addr__: Fault address
> + *
> + * This macro finds the GPU SVM notifier associated with the fault
> address.
> + *
> + * Returns:
> + * Pointer to the GPU SVM notifier on success, NULL otherwise.
> + */
> +#define drm_gpusvm_notifier_find(gpusvm__, fault_addr__)	\
> +	notifier_iter_first(&(gpusvm__)->root, (fault_addr__),	\
> +			    (fault_addr__ + 1))
> +
> +/**
> + * to_drm_gpusvm_notifier - retrieve the container struct for a
> given rbtree node
> + * @node__: a pointer to the rbtree node embedded within a
> drm_gpusvm_notifier struct
> + *
> + * Return: A pointer to the containing drm_gpusvm_notifier
> structure.
> + */
> +#define to_drm_gpusvm_notifier(__node)				\
> +	container_of((__node), struct drm_gpusvm_notifier, rb.node)
> +
> +/**
> + * drm_gpusvm_notifier_insert - Insert GPU SVM notifier
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: Pointer to the GPU SVM notifier structure
> + *
> + * This function inserts the GPU SVM notifier into the GPU SVM RB
> tree and list.
> + */
> +static void drm_gpusvm_notifier_insert(struct drm_gpusvm *gpusvm,
> +				       struct drm_gpusvm_notifier
> *notifier)
> +{
> +	struct rb_node *node;
> +	struct list_head *head;
> +
> +	notifier_insert(notifier, &gpusvm->root);
> +
> +	node = rb_prev(&notifier->rb.node);
> +	if (node)
> +		head = &(to_drm_gpusvm_notifier(node))->rb.entry;
> +	else
> +		head = &gpusvm->notifier_list;
> +
> +	list_add(&notifier->rb.entry, head);
> +}
> +
> +/**
> + * drm_gpusvm_notifier_remove - Remove GPU SVM notifier
> + * @gpusvm__: Pointer to the GPU SVM structure
> + * @notifier__: Pointer to the GPU SVM notifier structure
> + *
> + * This macro removes the GPU SVM notifier from the GPU SVM RB tree
> and list.
> + */
> +#define drm_gpusvm_notifier_remove(gpusvm__, notifier__)	\
> +	notifier_remove((notifier__), &(gpusvm__)->root);	\
> +	list_del(&(notifier__)->rb.entry)
> +
> +/**
> + * drm_gpusvm_fini - Finalize the GPU SVM.
> + * @gpusvm: Pointer to the GPU SVM structure.
> + *
> + * This function finalizes the GPU SVM by cleaning up any remaining
> ranges and
> + * notifiers, and dropping a reference to struct MM.
> + */
> +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm)
> +{
> +	struct drm_gpusvm_notifier *notifier, *next;
> +
> +	drm_gpusvm_for_each_notifier_safe(notifier, next, gpusvm, 0,
> LONG_MAX) {
> +		struct drm_gpusvm_range *range, *__next;
> +
> +		/*
> +		 * Remove notifier first to avoid racing with any
> invalidation
> +		 */
> +		mmu_interval_notifier_remove(&notifier->notifier);
> +		notifier->flags.removed = true;
> +
> +		drm_gpusvm_for_each_range_safe(range, __next,
> notifier, 0,
> +					       LONG_MAX)
> +			drm_gpusvm_range_remove(gpusvm, range);
> +	}
> +
> +	mmdrop(gpusvm->mm);
> +	WARN_ON(!RB_EMPTY_ROOT(&gpusvm->root.rb_root));
> +}
> +
> +/**
> + * drm_gpusvm_notifier_alloc - Allocate GPU SVM notifier
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @fault_addr: Fault address
> + *
> + * This function allocates and initializes the GPU SVM notifier
> structure.
> + *
> + * Returns:
> + * Pointer to the allocated GPU SVM notifier on success, ERR_PTR()
> on failure.
> + */
> +static struct drm_gpusvm_notifier *
> +drm_gpusvm_notifier_alloc(struct drm_gpusvm *gpusvm, u64 fault_addr)
> +{
> +	struct drm_gpusvm_notifier *notifier;
> +
> +	if (gpusvm->ops->notifier_alloc)
> +		notifier = gpusvm->ops->notifier_alloc();
> +	else
> +		notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
> +
> +	if (!notifier)
> +		return ERR_PTR(-ENOMEM);
> +
> +	notifier->gpusvm = gpusvm;
> +	notifier->interval.start = ALIGN_DOWN(fault_addr, gpusvm-
> >notifier_size);
> +	notifier->interval.end = ALIGN(fault_addr + 1, gpusvm-
> >notifier_size);
> +	INIT_LIST_HEAD(&notifier->rb.entry);
> +	notifier->root = RB_ROOT_CACHED;
> +	INIT_LIST_HEAD(&notifier->range_list);
> +
> +	return notifier;
> +}
> +
> +/**
> + * drm_gpusvm_notifier_free - Free GPU SVM notifier
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: Pointer to the GPU SVM notifier structure
> + *
> + * This function frees the GPU SVM notifier structure.
> + */
> +static void drm_gpusvm_notifier_free(struct drm_gpusvm *gpusvm,
> +				     struct drm_gpusvm_notifier
> *notifier)
> +{
> +	WARN_ON(!RB_EMPTY_ROOT(&notifier->root.rb_root));
> +
> +	if (gpusvm->ops->notifier_free)
> +		gpusvm->ops->notifier_free(notifier);
> +	else
> +		kfree(notifier);
> +}
> +
> +/**
> + * to_drm_gpusvm_range - retrieve the container struct for a given
> rbtree node
> + * @node__: a pointer to the rbtree node embedded within a
> drm_gpusvm_range struct
> + *
> + * Return: A pointer to the containing drm_gpusvm_range structure.
> + */
> +#define to_drm_gpusvm_range(node__)	\
> +	container_of((node__), struct drm_gpusvm_range, rb.node)
> +
> +/**
> + * drm_gpusvm_range_insert - Insert GPU SVM range
> + * @notifier: Pointer to the GPU SVM notifier structure
> + * @range: Pointer to the GPU SVM range structure
> + *
> + * This function inserts the GPU SVM range into the notifier RB tree
> and list.
> + */
> +static void drm_gpusvm_range_insert(struct drm_gpusvm_notifier
> *notifier,
> +				    struct drm_gpusvm_range *range)
> +{
> +	struct rb_node *node;
> +	struct list_head *head;
> +
> +	drm_gpusvm_notifier_lock(notifier->gpusvm);
> +	range_insert(range, &notifier->root);
> +
> +	node = rb_prev(&range->rb.node);
> +	if (node)
> +		head = &(to_drm_gpusvm_range(node))->rb.entry;
> +	else
> +		head = &notifier->range_list;
> +
> +	list_add(&range->rb.entry, head);
> +	drm_gpusvm_notifier_unlock(notifier->gpusvm);
> +}
> +
> +/**
> + * __drm_gpusvm_range_remove - Remove GPU SVM range
> + * @notifier__: Pointer to the GPU SVM notifier structure
> + * @range__: Pointer to the GPU SVM range structure
> + *
> + * This macro removes the GPU SVM range from the notifier RB tree
> and list.
> + */
> +#define __drm_gpusvm_range_remove(notifier__, range__)		\
> +	range_remove((range__), &(notifier__)->root);		\
> +	list_del(&(range__)->rb.entry)
> +
> +/**
> + * drm_gpusvm_range_alloc - Allocate GPU SVM range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: Pointer to the GPU SVM notifier structure
> + * @fault_addr: Fault address
> + * @chunk_size: Chunk size
> + * @migrate_vram: Flag indicating whether to migrate VRAM
> + *
> + * This function allocates and initializes the GPU SVM range
> structure.
> + *
> + * Returns:
> + * Pointer to the allocated GPU SVM range on success, ERR_PTR() on
> failure.
> + */
> +static struct drm_gpusvm_range *
> +drm_gpusvm_range_alloc(struct drm_gpusvm *gpusvm,
> +		       struct drm_gpusvm_notifier *notifier,
> +		       u64 fault_addr, u64 chunk_size, bool
> migrate_vram)
> +{
> +	struct drm_gpusvm_range *range;
> +
> +	if (gpusvm->ops->range_alloc)
> +		range = gpusvm->ops->range_alloc(gpusvm);
> +	else
> +		range = kzalloc(sizeof(*range), GFP_KERNEL);
> +
> +	if (!range)
> +		return ERR_PTR(-ENOMEM);
> +
> +	kref_init(&range->refcount);
> +	range->gpusvm = gpusvm;
> +	range->notifier = notifier;
> +	range->va.start = ALIGN_DOWN(fault_addr, chunk_size);
> +	range->va.end = ALIGN(fault_addr + 1, chunk_size);
> +	INIT_LIST_HEAD(&range->rb.entry);
> +	range->notifier_seq = LONG_MAX;
> +	range->flags.migrate_vram = migrate_vram ? 1 : 0;
> +
> +	return range;
> +}
> +
> +/**
> + * drm_gpusvm_check_pages - Check pages
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: Pointer to the GPU SVM notifier structure
> + * @start: Start address
> + * @end: End address
> + *
> + * Check if pages between start and end have been faulted in on the CPU. Used
> + * to prevent migration of pages without CPU backing store.
> + *
> + * Returns:
> + * True if pages have been faulted into CPU, False otherwise
> + */
> +static bool drm_gpusvm_check_pages(struct drm_gpusvm *gpusvm,
> +				   struct drm_gpusvm_notifier
> *notifier,
> +				   u64 start, u64 end)
> +{
> +	struct hmm_range hmm_range = {
> +		.default_flags = 0,
> +		.notifier = &notifier->notifier,
> +		.start = start,
> +		.end = end,
> +		.dev_private_owner = gpusvm-
> >device_private_page_owner,
> +	};
> +	unsigned long timeout =
> +		jiffies +
> msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> +	unsigned long *pfns;
> +	unsigned long npages = npages_in_range(start, end);
> +	int err, i;
> +
> +	mmap_assert_locked(gpusvm->mm);
> +
> +	pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
> +	if (!pfns)
> +		return false;
> +
> +	hmm_range.notifier_seq = mmu_interval_read_begin(&notifier-
> >notifier);
> +	hmm_range.hmm_pfns = pfns;
> +
> +	while (true) {
> +		err = hmm_range_fault(&hmm_range);
> +		if (err == -EBUSY) {
> +			if (time_after(jiffies, timeout))
> +				break;
> +
> +			hmm_range.notifier_seq =
> mmu_interval_read_begin(&notifier->notifier);
> +			continue;
> +		}
> +		break;
> +	}
> +	if (err)
> +		goto err_free;
> +
> +	for (i = 0; i < npages; ++i) {
> +		if (!(pfns[i] & HMM_PFN_VALID)) {
> +			err = -EFAULT;
> +			goto err_free;
> +		}
> +	}
> +
> +err_free:
> +	kvfree(pfns);
> +	return err ? false : true;
> +}
> +
> +/**
> + * drm_gpusvm_range_chunk_size - Determine chunk size for GPU SVM
> range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @notifier: Pointer to the GPU SVM notifier structure
> + * @vas: Pointer to the virtual memory area structure
> + * @fault_addr: Fault address
> + * @gpuva_start: Start address of GPUVA which mirrors CPU
> + * @gpuva_end: End address of GPUVA which mirrors CPU
> + * @check_pages: Flag indicating whether to check pages
> + *
> + * This function determines the chunk size for the GPU SVM range
> based on the
> + * fault address, GPU SVM chunk sizes, existing GPU SVM ranges, and
> the virtual
> + * memory area boundaries.
> + *
> + * Returns:
> + * Chunk size on success, LONG_MAX on failure.
> + */
> +static u64 drm_gpusvm_range_chunk_size(struct drm_gpusvm *gpusvm,
> +				       struct drm_gpusvm_notifier
> *notifier,
> +				       struct vm_area_struct *vas,
> +				       u64 fault_addr, u64
> gpuva_start,
> +				       u64 gpuva_end, bool
> check_pages)
> +{
> +	u64 start, end;
> +	int i = 0;
> +
> +retry:
> +	for (; i < gpusvm->num_chunks; ++i) {
> +		start = ALIGN_DOWN(fault_addr, gpusvm-
> >chunk_sizes[i]);
> +		end = ALIGN(fault_addr + 1, gpusvm->chunk_sizes[i]);
> +
> +		if (start >= vas->vm_start && end <= vas->vm_end &&
> +		    start >= notifier->interval.start &&
> +		    end <= notifier->interval.end &&
> +		    start >= gpuva_start && end <= gpuva_end)
> +			break;
> +	}
> +
> +	if (i == gpusvm->num_chunks)
> +		return LONG_MAX;
> +
> +	/*
> +	 * If the allocation is larger than a page, ensure it does not overlap
> +	 * with existing ranges.
> +	 */
> +	if (end - start != SZ_4K) {
> +		struct drm_gpusvm_range *range;
> +
> +		range = drm_gpusvm_range_find(notifier, start, end);
> +		if (range) {
> +			++i;
> +			goto retry;
> +		}
> +
> +		/*
> +		 * XXX: Only create range on pages CPU has faulted
> in. Without
> +		 * this check, or prefault, on BMG
> 'xe_exec_system_allocator --r
> +		 * process-many-malloc' fails. In the failure case,
> each process
> +		 * mallocs 16k but the CPU VMA is ~128k which
> results in 64k SVM
> +		 * ranges. When migrating the SVM ranges, some
> processes fail in
> +		 * drm_gpusvm_migrate_to_vram with 'migrate.cpages
> != npages'
> +		 * and then upon drm_gpusvm_range_get_pages device
> pages from
> +		 * other processes are collected + faulted in which
> creates all
> +		 * sorts of problems. Unsure exactly how this is happening; the
> +		 * problem also goes away if 'xe_exec_system_allocator --r
> +		 * process-many-malloc' mallocs at least 64k at a time.
> +		 */
> +		if (check_pages &&
> +		    !drm_gpusvm_check_pages(gpusvm, notifier, start,
> end)) {
> +			++i;
> +			goto retry;
> +		}
> +	}
> +
> +	return end - start;
> +}
> +
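
To make the selection above concrete (assuming the hypothetical chunk array
{ SZ_2M, SZ_64K, SZ_4K }): for a fault at 0x15f000 inside a 1M VMA spanning
[0x100000, 0x200000), the 2M chunk aligns to [0x0, 0x200000) and is rejected
because it starts below vm_start; the 64K chunk aligns to [0x150000, 0x160000),
which lies entirely inside the VMA, so a 64K range is created -- provided it
also fits the notifier interval and GPUVA bounds and does not overlap an
existing range or fail the check_pages test.
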
> +/**
> + * drm_gpusvm_range_find_or_insert - Find or insert GPU SVM range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @fault_addr: Fault address
> + * @gpuva_start: Start address of GPUVA which mirrors CPU
> + * @gpuva_end: End address of GPUVA which mirrors CPU
> + * @ctx: GPU SVM context
> + *
> + * This function finds or inserts a newly allocated GPU SVM range based on the
> + * fault address. The caller must hold a lock to protect range lookup and
> + * insertion.
> + *
> + * Returns:
> + * Pointer to the GPU SVM range on success, ERR_PTR() on failure.
> + */
> +struct drm_gpusvm_range *
> +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64
> fault_addr,
> +				u64 gpuva_start, u64 gpuva_end,
> +				const struct drm_gpusvm_ctx *ctx)
> +{
> +	struct drm_gpusvm_notifier *notifier;
> +	struct drm_gpusvm_range *range;
> +	struct mm_struct *mm = gpusvm->mm;
> +	struct vm_area_struct *vas;
> +	bool notifier_alloc = false;
> +	u64 chunk_size;
> +	int err;
> +	bool migrate_vram;
> +
> +	if (fault_addr < gpusvm->mm_start ||
> +	    fault_addr > gpusvm->mm_start + gpusvm->mm_range) {
> +		err = -EINVAL;
> +		goto err_out;
> +	}
> +
> +	if (!ctx->mmap_locked) {
> +		if (!mmget_not_zero(mm)) {
> +			err = -EFAULT;
> +			goto err_out;
> +		}
> +		mmap_write_lock(mm);
> +	}
> +
> +	mmap_assert_write_locked(mm);
> +
> +	notifier = drm_gpusvm_notifier_find(gpusvm, fault_addr);
> +	if (!notifier) {
> +		notifier = drm_gpusvm_notifier_alloc(gpusvm,
> fault_addr);
> +		if (IS_ERR(notifier)) {
> +			err = PTR_ERR(notifier);
> +			goto err_mmunlock;
> +		}
> +		notifier_alloc = true;
> +		err = mmu_interval_notifier_insert_locked(&notifier-
> >notifier,
> +							  mm,
> notifier->interval.start,
> +							  notifier-
> >interval.end -
> +							  notifier-
> >interval.start,
> +							 
> &drm_gpusvm_notifier_ops);
> +		if (err)
> +			goto err_notifier;
> +	}
> +
> +	vas = vma_lookup(mm, fault_addr);
> +	if (!vas) {
> +		err = -ENOENT;
> +		goto err_notifier_remove;
> +	}
> +
> +	if (!ctx->read_only && !(vas->vm_flags & VM_WRITE)) {
> +		err = -EPERM;
> +		goto err_notifier_remove;
> +	}
> +
> +	range = drm_gpusvm_range_find(notifier, fault_addr,
> fault_addr + 1);
> +	if (range)
> +		goto out_mmunlock;
> +	/*
> +	 * XXX: Short-circuiting migration based on migrate_vma_*
> current
> +	 * limitations. If/when migrate_vma_* add more support, this
> logic will
> +	 * have to change.
> +	 */
> +	migrate_vram = ctx->vram_possible &&
> +		vma_is_anonymous(vas) && !is_vm_hugetlb_page(vas);
> +
> +	chunk_size = drm_gpusvm_range_chunk_size(gpusvm, notifier,
> vas,
> +						 fault_addr,
> gpuva_start,
> +						 gpuva_end,
> migrate_vram &&
> +						 !ctx->prefault);
> +	if (chunk_size == LONG_MAX) {
> +		err = -EINVAL;
> +		goto err_notifier_remove;
> +	}
> +
> +	range = drm_gpusvm_range_alloc(gpusvm, notifier, fault_addr,
> chunk_size,
> +				       migrate_vram);
> +	if (IS_ERR(range)) {
> +		err = PTR_ERR(range);
> +		goto err_notifier_remove;
> +	}
> +
> +	drm_gpusvm_range_insert(notifier, range);
> +	if (notifier_alloc)
> +		drm_gpusvm_notifier_insert(gpusvm, notifier);
> +
> +	if (ctx->prefault) {
> +		struct drm_gpusvm_ctx __ctx = *ctx;
> +
> +		__ctx.mmap_locked = true;
> +		err = drm_gpusvm_range_get_pages(gpusvm, range,
> &__ctx);
> +		if (err)
> +			goto err_range_remove;
> +	}
> +
> +out_mmunlock:
> +	if (!ctx->mmap_locked) {
> +		mmap_write_unlock(mm);
> +		mmput(mm);
> +	}
> +
> +	return range;
> +
> +err_range_remove:
> +	__drm_gpusvm_range_remove(notifier, range);
> +err_notifier_remove:
> +	if (notifier_alloc)
> +		mmu_interval_notifier_remove(&notifier->notifier);
> +err_notifier:
> +	if (notifier_alloc)
> +		drm_gpusvm_notifier_free(gpusvm, notifier);
> +err_mmunlock:
> +	if (!ctx->mmap_locked) {
> +		mmap_write_unlock(mm);
> +		mmput(mm);
> +	}
> +err_out:
> +	return ERR_PTR(err);
> +}
> +
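
A minimal, hypothetical call site, just to illustrate the locking requirement
from the kernel-doc; driver_svm_lock/unlock stand in for the driver-side lock
described in DOC: Locking:

	driver_svm_lock();
	range = drm_gpusvm_range_find_or_insert(gpusvm, fault_addr,
						gpuva_start, gpuva_end, &ctx);
	if (IS_ERR(range))
		err = PTR_ERR(range);
	driver_svm_unlock();
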
> +/**
> + * for_each_dma_page - iterate over pages in a DMA region
> + * @i__: the current page index in the iteration
> + * @j__: the current page index, log order, in the iteration
> + * @npages__: the total number of pages in the DMA region
> + * @order__: the order of the pages in the DMA region
> + *
> + * This macro iterates over each page in a DMA region. The DMA
> region
> + * is assumed to be composed of 2^@order__ pages, and the macro will
> + * step through the region one block of 2^@order__ pages at a time.
> + */
> +#define for_each_dma_page(i__, j__, npages__, order__)	\
> +	for ((i__) = 0, (j__) = 0; (i__) < (npages__);	\
> +	     (j__)++, (i__) += 0x1 << (order__))
> +
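
For example, with npages = 512 and order = 9 (a 2M range backed by a single
2M THP), the loop body runs exactly once: i steps from 0 to 512 in one
increment of 1 << 9 while j goes from 0 to 1, so only one dma_addr[] entry of
size PAGE_SIZE << 9 is touched. With order = 0 it degenerates to a plain
per-page loop over all 512 pages.
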
> +/**
> + * __drm_gpusvm_range_unmap_pages - Unmap pages associated with a
> GPU SVM range (internal)
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + *
> + * This function unmaps pages associated with a GPU SVM range. It assumes and
> + * asserts that correct locking is in place when called.
> + */
> +static void __drm_gpusvm_range_unmap_pages(struct drm_gpusvm
> *gpusvm,
> +					   struct drm_gpusvm_range
> *range)
> +{
> +	lockdep_assert_held(&gpusvm->notifier_lock);
> +
> +	if (range->pages) {
> +		unsigned long i, j, npages = npages_in_range(range-
> >va.start,
> +							     range-
> >va.end);
> +
> +		if (range->flags.has_dma_mapping) {
> +			for_each_dma_page(i, j, npages, range-
> >order)
> +				dma_unmap_page(gpusvm->drm->dev,
> +					       range->dma_addr[j],
> +					       PAGE_SIZE << range-
> >order,
> +					       DMA_BIDIRECTIONAL);
> +		}
> +
> +		range->flags.has_vram_pages = false;
> +		range->flags.has_dma_mapping = false;
> +	}
> +}
> +
> +/**
> + * drm_gpusvm_range_free_pages - Free pages associated with a GPU
> SVM range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + *
> + * This function frees pages associated with a GPU SVM range.
> + */
> +static void drm_gpusvm_range_free_pages(struct drm_gpusvm *gpusvm,
> +					struct drm_gpusvm_range
> *range)
> +{
> +	lockdep_assert_held(&gpusvm->notifier_lock);
> +
> +	if (range->pages) {
> +		if (range->flags.kfree_mapping) {
> +			kfree(range->dma_addr);
> +			range->flags.kfree_mapping = false;
> +			range->pages = NULL;
> +		} else {
> +			kvfree(range->pages);
> +			range->pages = NULL;
> +		}
> +	}
> +}
> +
> +/**
> + * drm_gpusvm_range_remove - Remove GPU SVM range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range to be removed
> + *
> + * This function removes the specified GPU SVM range and also
> removes the parent
> + * GPU SVM notifier if no more ranges remain in the notifier. The
> caller must
> + * hold a lock to protect range and notifier removal.
> + */
> +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> +			     struct drm_gpusvm_range *range)
> +{
> +	struct drm_gpusvm_notifier *notifier;
> +
> +	notifier = drm_gpusvm_notifier_find(gpusvm, range-
> >va.start);
> +	if (WARN_ON_ONCE(!notifier))
> +		return;
> +
> +	drm_gpusvm_notifier_lock(gpusvm);
> +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> +	drm_gpusvm_range_free_pages(gpusvm, range);
> +	__drm_gpusvm_range_remove(notifier, range);
> +	drm_gpusvm_notifier_unlock(gpusvm);
> +
> +	drm_gpusvm_range_put(range);
> +
> +	if (RB_EMPTY_ROOT(&notifier->root.rb_root)) {
> +		if (!notifier->flags.removed)
> +			mmu_interval_notifier_remove(&notifier-
> >notifier);
> +		drm_gpusvm_notifier_remove(gpusvm, notifier);
> +		drm_gpusvm_notifier_free(gpusvm, notifier);
> +	}
> +}
> +
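
For reference, a stripped-down sketch of the garbage-collector component
mentioned in DOC: Overview, which would be the expected caller of this
function on MMU_NOTIFY_UNMAP. All driver_* helpers here are hypothetical; the
point is that both the unbind and the removal happen under the driver-side
lock that protects range/notifier removal:

	static void driver_garbage_collector(struct drm_gpusvm *gpusvm)
	{
		struct drm_gpusvm_range *range;

		driver_svm_lock();	/* protects range/notifier removal */
		while ((range = driver_gc_pop_range(gpusvm))) {
			driver_unbind_range(gpusvm, range);	/* destroy GPU bindings */
			drm_gpusvm_range_remove(gpusvm, range);
		}
		driver_svm_unlock();
	}
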
> +/**
> + * drm_gpusvm_range_get - Get a reference to GPU SVM range
> + * @range: Pointer to the GPU SVM range
> + *
> + * This function increments the reference count of the specified GPU
> SVM range.
> + *
> + * Returns:
> + * Pointer to the GPU SVM range.
> + */
> +struct drm_gpusvm_range *
> +drm_gpusvm_range_get(struct drm_gpusvm_range *range)
> +{
> +	kref_get(&range->refcount);
> +
> +	return range;
> +}
> +
> +/**
> + * drm_gpusvm_range_destroy - Destroy GPU SVM range
> + * @refcount: Pointer to the reference counter embedded in the GPU
> SVM range
> + *
> + * This function destroys the specified GPU SVM range when its
> reference count
> + * reaches zero. If a custom range-free function is provided, it is
> invoked to
> + * free the range; otherwise, the range is deallocated using
> kfree().
> + */
> +static void drm_gpusvm_range_destroy(struct kref *refcount)
> +{
> +	struct drm_gpusvm_range *range =
> +		container_of(refcount, struct drm_gpusvm_range,
> refcount);
> +	struct drm_gpusvm *gpusvm = range->gpusvm;
> +
> +	if (gpusvm->ops->range_free)
> +		gpusvm->ops->range_free(range);
> +	else
> +		kfree(range);
> +}
> +
> +/**
> + * drm_gpusvm_range_put - Put a reference to GPU SVM range
> + * @range: Pointer to the GPU SVM range
> + *
> + * This function decrements the reference count of the specified GPU
> SVM range
> + * and frees it when the count reaches zero.
> + */
> +void drm_gpusvm_range_put(struct drm_gpusvm_range *range)
> +{
> +	kref_put(&range->refcount, drm_gpusvm_range_destroy);
> +}
> +
> +/**
> + * drm_gpusvm_range_pages_valid - GPU SVM range pages valid
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + *
> + * This function determines if a GPU SVM range's pages are valid. It is
> + * expected to be called while holding gpusvm->notifier_lock and as the last
> + * step before committing a GPU binding.
> + *
> + * Returns:
> + * True if GPU SVM range has valid pages, False otherwise
> + */
> +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> +				  struct drm_gpusvm_range *range)
> +{
> +	lockdep_assert_held(&gpusvm->notifier_lock);
> +
> +	return range->flags.has_vram_pages || range-
> >flags.has_dma_mapping;
> +}
> +
> +/**
> + * drm_gpusvm_range_pages_valid_unlocked - GPU SVM range pages valid
> unlocked
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + *
> + * This function determines if a GPU SVM range's pages are valid. It is
> + * expected to be called without holding gpusvm->notifier_lock.
> + *
> + * Returns:
> + * True if GPU SVM range has valid pages, False otherwise
> + */
> +static bool
> +drm_gpusvm_range_pages_valid_unlocked(struct drm_gpusvm *gpusvm,
> +				      struct drm_gpusvm_range
> *range)
> +{
> +	bool pages_valid;
> +
> +	if (!range->pages)
> +		return false;
> +
> +	drm_gpusvm_notifier_lock(gpusvm);
> +	pages_valid = drm_gpusvm_range_pages_valid(gpusvm, range);
> +	if (!pages_valid && range->flags.kfree_mapping) {
> +		kfree(range->dma_addr);
> +		range->flags.kfree_mapping = false;
> +		range->pages = NULL;
> +	}
> +	drm_gpusvm_notifier_unlock(gpusvm);
> +
> +	return pages_valid;
> +}
> +
> +/**
> + * drm_gpusvm_range_get_pages - Get pages for a GPU SVM range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + * @ctx: GPU SVM context
> + *
> + * This function gets pages for a GPU SVM range and ensures they are
> mapped for
> + * DMA access.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> +			       struct drm_gpusvm_range *range,
> +			       const struct drm_gpusvm_ctx *ctx)
> +{
> +	struct mmu_interval_notifier *notifier = &range->notifier-
> >notifier;
> +	struct hmm_range hmm_range = {
> +		.default_flags = HMM_PFN_REQ_FAULT | (ctx->read_only
> ? 0 :
> +			HMM_PFN_REQ_WRITE),
> +		.notifier = notifier,
> +		.start = range->va.start,
> +		.end = range->va.end,
> +		.dev_private_owner = gpusvm-
> >device_private_page_owner,
> +	};
> +	struct mm_struct *mm = gpusvm->mm;
> +	unsigned long timeout =
> +		jiffies +
> msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> +	unsigned long i, j;
> +	unsigned long npages = npages_in_range(range->va.start,
> range->va.end);
> +	unsigned int order = 0;
> +	unsigned long *pfns;
> +	struct page **pages;
> +	int err = 0;
> +	bool vram_pages = !!range->flags.migrate_vram;
> +	bool alloc_pfns = false, kfree_mapping;
> +
> +retry:
> +	kfree_mapping = false;
> +	hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> +	if (drm_gpusvm_range_pages_valid_unlocked(gpusvm, range))
> +		return 0;
> +
> +	if (range->notifier_seq == hmm_range.notifier_seq && range-
> >pages) {
> +		if (ctx->prefault)
> +			return 0;
> +
> +		pfns = (unsigned long *)range->pages;
> +		pages = range->pages;
> +		goto map_pages;
> +	}
> +
> +	if (!range->pages) {
> +		pfns = kvmalloc_array(npages, sizeof(*pfns),
> GFP_KERNEL);
> +		if (!pfns)
> +			return -ENOMEM;
> +		alloc_pfns = true;
> +	} else {
> +		pfns = (unsigned long *)range->pages;
> +	}
> +
> +	if (!ctx->mmap_locked) {
> +		if (!mmget_not_zero(mm)) {
> +			err = -EFAULT;
> +			goto err_out;
> +		}
> +	}
> +
> +	hmm_range.hmm_pfns = pfns;
> +	while (true) {
> +		/* Must be checked after mmu_interval_read_begin */
> +		if (range->flags.unmapped) {
> +			err = -EFAULT;
> +			break;
> +		}
> +
> +		if (!ctx->mmap_locked) {
> +			/*
> +			 * XXX: HMM locking document indicates only a read-lock
> +			 * is required but there appears to be a window between
> +			 * the MMU_NOTIFY_MIGRATE event triggered in a CPU fault
> +			 * via migrate_vma_setup and the pages actually moving
> +			 * in migrate_vma_finalize in which this code can grab
> +			 * garbage pages. Grabbing the write-lock if the range
> +			 * is attached to vram appears to protect against this
> +			 * race.
> +			 */
> +			if (vram_pages)
> +				mmap_write_lock(mm);
> +			else
> +				mmap_read_lock(mm);
> +		}
> +		err = hmm_range_fault(&hmm_range);
> +		if (!ctx->mmap_locked) {
> +			if (vram_pages)
> +				mmap_write_unlock(mm);
> +			else
> +				mmap_read_unlock(mm);
> +		}
> +
> +		if (err == -EBUSY) {
> +			if (time_after(jiffies, timeout))
> +				break;
> +
> +			hmm_range.notifier_seq =
> mmu_interval_read_begin(notifier);
> +			continue;
> +		}
> +		break;
> +	}
> +	if (!ctx->mmap_locked)
> +		mmput(mm);
> +	if (err)
> +		goto err_free;
> +
> +	pages = (struct page **)pfns;
> +
> +	if (ctx->prefault) {
> +		range->pages = pages;
> +		goto set_seqno;
> +	}
> +
> +map_pages:
> +	if (is_device_private_page(hmm_pfn_to_page(pfns[0]))) {
> +		WARN_ON_ONCE(!range->vram_allocation);
> +
> +		for (i = 0; i < npages; ++i) {
> +			pages[i] = hmm_pfn_to_page(pfns[i]);
> +
> +			if
> (WARN_ON_ONCE(!is_device_private_page(pages[i]))) {
> +				err = -EOPNOTSUPP;
> +				goto err_free;
> +			}
> +		}
> +
> +		/* Do not race with notifier unmapping pages */
> +		drm_gpusvm_notifier_lock(gpusvm);
> +		range->flags.has_vram_pages = true;
> +		range->pages = pages;
> +		if (mmu_interval_read_retry(notifier,
> hmm_range.notifier_seq)) {
> +			err = -EAGAIN;
> +			__drm_gpusvm_range_unmap_pages(gpusvm,
> range);
> +		}
> +		drm_gpusvm_notifier_unlock(gpusvm);
> +	} else {
> +		dma_addr_t *dma_addr = (dma_addr_t *)pfns;
> +
> +		for_each_dma_page(i, j, npages, order) {
> +			if (WARN_ON_ONCE(i && order !=
> +					
> hmm_pfn_to_map_order(pfns[i]))) {
> +				err = -EOPNOTSUPP;
> +				npages = i;
> +				goto err_unmap;
> +			}
> +			order = hmm_pfn_to_map_order(pfns[i]);
> +
> +			pages[j] = hmm_pfn_to_page(pfns[i]);
> +			if
> (WARN_ON_ONCE(is_zone_device_page(pages[j]))) {
> +				err = -EOPNOTSUPP;
> +				npages = i;
> +				goto err_unmap;
> +			}
> +
> +			set_page_dirty_lock(pages[j]);
> +			mark_page_accessed(pages[j]);
> +
> +			dma_addr[j] = dma_map_page(gpusvm->drm->dev,
> +						   pages[j], 0,
> +						   PAGE_SIZE <<
> order,
> +						  
> DMA_BIDIRECTIONAL);
> +			if (dma_mapping_error(gpusvm->drm->dev,
> dma_addr[j])) {
> +				err = -EFAULT;
> +				npages = i;
> +				goto err_unmap;
> +			}
> +		}
> +
> +		/* Huge pages, reduce memory footprint */
> +		if (order) {
> +			dma_addr = kmalloc_array(j,
> sizeof(*dma_addr),
> +						 GFP_KERNEL);
> +			if (dma_addr) {
> +				for (i = 0; i < j; ++i)
> +					dma_addr[i] =
> (dma_addr_t)pfns[i];
> +				kvfree(pfns);
> +				kfree_mapping = true;
> +			} else {
> +				dma_addr = (dma_addr_t *)pfns;
> +			}
> +		}
> +
> +		/* Do not race with notifier unmapping pages */
> +		drm_gpusvm_notifier_lock(gpusvm);
> +		range->order = order;
> +		range->flags.kfree_mapping = kfree_mapping;
> +		range->flags.has_dma_mapping = true;
> +		range->dma_addr = dma_addr;
> +		range->vram_allocation = NULL;
> +		if (mmu_interval_read_retry(notifier,
> hmm_range.notifier_seq)) {
> +			err = -EAGAIN;
> +			__drm_gpusvm_range_unmap_pages(gpusvm,
> range);
> +		}
> +		drm_gpusvm_notifier_unlock(gpusvm);
> +	}
> +
> +	if (err == -EAGAIN)
> +		goto retry;
> +set_seqno:
> +	range->notifier_seq = hmm_range.notifier_seq;
> +
> +	return 0;
> +
> +err_unmap:
> +	for_each_dma_page(i, j, npages, order)
> +		dma_unmap_page(gpusvm->drm->dev,
> +			       (dma_addr_t)pfns[j],
> +			       PAGE_SIZE << order,
> DMA_BIDIRECTIONAL);
> +err_free:
> +	if (alloc_pfns)
> +		kvfree(pfns);
> +err_out:
> +	return err;
> +}
> +
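
A fragment showing how a driver might then consume the mapping when building
GPU PTEs (driver_emit_pte and the surrounding declarations are hypothetical);
the stride mirrors for_each_dma_page above:

	for (i = 0, j = 0; i < npages; ++j, i += 1UL << range->order)
		driver_emit_pte(vm, range->va.start + i * PAGE_SIZE,
				range->dma_addr[j], PAGE_SIZE << range->order);

As the kernel-doc for drm_gpusvm_range_pages_valid() above notes, the validity
of the pages still has to be rechecked under gpusvm->notifier_lock before the
binding is actually committed.
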
> +/**
> + * drm_gpusvm_range_unmap_pages - Unmap pages associated with a GPU
> SVM range
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + * @ctx: GPU SVM context
> + *
> + * This function unmaps pages associated with a GPU SVM range. If
> @in_notifier
> + * is set, it is assumed that gpusvm->notifier_lock is held in write
> mode; if it
> + * is clear, it acquires gpusvm->notifier_lock in read mode. Must be
> called on
> + * each GPU SVM range attached to notifier in gpusvm->ops-
> >invalidate for IOMMU
> + * security model.
> + */
> +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> +				  struct drm_gpusvm_range *range,
> +				  const struct drm_gpusvm_ctx *ctx)
> +{
> +	if (ctx->in_notifier)
> +		lockdep_assert_held_write(&gpusvm->notifier_lock);
> +	else
> +		drm_gpusvm_notifier_lock(gpusvm);
> +
> +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> +
> +	if (!ctx->in_notifier)
> +		drm_gpusvm_notifier_unlock(gpusvm);
> +}
> +
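
To illustrate the "must be called on each range in gpusvm->ops->invalidate"
requirement, a minimal invalidate vfunc might look like the sketch below. This
is only an assumption of what a driver would do: driver_invalidate_device_pages
is made up, and drm_gpusvm_for_each_range is assumed to be the range iterator
exposed by drm_gpusvm.h.

	static void driver_invalidate(struct drm_gpusvm *gpusvm,
				      struct drm_gpusvm_notifier *notifier,
				      const struct mmu_notifier_range *mmu_range)
	{
		struct drm_gpusvm_ctx ctx = { .in_notifier = true, };
		struct drm_gpusvm_range *range = NULL;

		/* Hypothetical: zap GPU PTEs / TLBs for the invalidated interval */
		driver_invalidate_device_pages(gpusvm, mmu_range->start,
					       mmu_range->end);

		drm_gpusvm_for_each_range(range, notifier, mmu_range->start,
					  mmu_range->end)
			drm_gpusvm_range_unmap_pages(gpusvm, range, &ctx);
	}
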
> +/**
> + * drm_gpusvm_migration_put_page - Put a migration page
> + * @page: Pointer to the page to put
> + *
> + * This function unlocks and puts a page.
> + */
> +static void drm_gpusvm_migration_put_page(struct page *page)
> +{
> +	unlock_page(page);
> +	put_page(page);
> +}
> +
> +/**
> + * drm_gpusvm_migration_put_pages - Put migration pages
> + * @npages: Number of pages
> + * @migrate_pfn: Array of migrate page frame numbers
> + *
> + * This function puts an array of pages.
> + */
> +static void drm_gpusvm_migration_put_pages(unsigned long npages,
> +					   unsigned long
> *migrate_pfn)
> +{
> +	unsigned long i;
> +
> +	for (i = 0; i < npages; ++i) {
> +		if (!migrate_pfn[i])
> +			continue;
> +
> +		drm_gpusvm_migration_put_page(migrate_pfn_to_page(migrate_pfn[i]));
> +		migrate_pfn[i] = 0;
> +	}
> +}
> +
> +/**
> + * drm_gpusvm_get_vram_page - Get a reference to a VRAM page
> + * @page: Pointer to the page
> + * @zdd: Pointer to the GPU SVM zone device data
> + *
> + * This function associates the given page with the specified GPU
> SVM zone
> + * device data and initializes it for zone device usage.
> + */
> +static void drm_gpusvm_get_vram_page(struct page *page,
> +				     struct drm_gpusvm_zdd *zdd)
> +{
> +	page->zone_device_data = drm_gpusvm_zdd_get(zdd);
> +	zone_device_page_init(page);
> +}
> +
> +/**
> + * drm_gpusvm_migrate_map_pages() - Map migration pages for GPU SVM
> migration
> + * @dev: The device for which the pages are being mapped
> + * @dma_addr: Array to store DMA addresses corresponding to mapped
> pages
> + * @migrate_pfn: Array of migrate page frame numbers to map
> + * @npages: Number of pages to map
> + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> + *
> + * This function maps pages of memory for migration usage in GPU
> SVM. It
> + * iterates over each page frame number provided in @migrate_pfn,
> maps the
> + * corresponding page, and stores the DMA address in the provided
> @dma_addr
> + * array.
> + *
> + * Return: 0 on success, -EFAULT if an error occurs during mapping.
> + */
> +static int drm_gpusvm_migrate_map_pages(struct device *dev,
> +					dma_addr_t *dma_addr,
> +					long unsigned int
> *migrate_pfn,
> +					unsigned long npages,
> +					enum dma_data_direction dir)
> +{
> +	unsigned long i;
> +
> +	for (i = 0; i < npages; ++i) {
> +		struct page *page =
> migrate_pfn_to_page(migrate_pfn[i]);
> +
> +		if (!page)
> +			continue;
> +
> +		if (WARN_ON_ONCE(is_zone_device_page(page)))
> +			return -EFAULT;
> +
> +		dma_addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE,
> dir);
> +		if (dma_mapping_error(dev, dma_addr[i]))
> +			return -EFAULT;
> +	}
> +
> +	return 0;
> +}
> +
> +/**
> + * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously mapped
> for GPU SVM migration
> + * @dev: The device for which the pages were mapped
> + * @dma_addr: Array of DMA addresses corresponding to mapped pages
> + * @npages: Number of pages to unmap
> + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> + *
> + * This function unmaps previously mapped pages of memory for GPU
> Shared Virtual
> + * Memory (SVM). It iterates over each DMA address provided in
> @dma_addr, checks
> + * if it's valid and not already unmapped, and unmaps the
> corresponding page.
> + */
> +static void drm_gpusvm_migrate_unmap_pages(struct device *dev,
> +					   dma_addr_t *dma_addr,
> +					   unsigned long npages,
> +					   enum dma_data_direction
> dir)
> +{
> +	unsigned long i;
> +
> +	for (i = 0; i < npages; ++i) {
> +		if (!dma_addr[i] || dma_mapping_error(dev,
> dma_addr[i]))
> +			continue;
> +
> +		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
> +	}
> +}
> +
> +/**
> + * drm_gpusvm_migrate_to_vram - Migrate GPU SVM range to VRAM
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + * @vram_allocation: Driver-private pointer to the VRAM allocation. The caller
> + *                   should hold a reference to the VRAM allocation, which
> + *                   should be dropped via ops->vram_release or upon the
> + *                   failure of this function.
> + * @ctx: GPU SVM context
> + *
> + * This function migrates the specified GPU SVM range to VRAM. It performs the
> + * necessary setup and invokes the driver-specific operations for migration to
> + * VRAM. Upon successful return, @vram_allocation can safely reference @range
> + * until ops->vram_release is called, which happens only after a successful
> + * return of this function.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> +			       struct drm_gpusvm_range *range,
> +			       void *vram_allocation,
> +			       const struct drm_gpusvm_ctx *ctx)
> +{
> +	u64 start = range->va.start, end = range->va.end;
> +	struct migrate_vma migrate = {
> +		.start		= start,
> +		.end		= end,
> +		.pgmap_owner	= gpusvm->device_private_page_owner,
> +		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
> +	};
> +	struct mm_struct *mm = gpusvm->mm;
> +	unsigned long i, npages = npages_in_range(start, end);
> +	struct vm_area_struct *vas;
> +	struct drm_gpusvm_zdd *zdd = NULL;
> +	struct page **pages;
> +	dma_addr_t *dma_addr;
> +	void *buf;
> +	int err;
> +
> +	if (!range->flags.migrate_vram)
> +		return -EINVAL;
> +
> +	if (!gpusvm->ops->populate_vram_pfn || !gpusvm->ops-
> >copy_to_vram ||
> +	    !gpusvm->ops->copy_to_sram)
> +		return -EOPNOTSUPP;
> +
> +	if (!ctx->mmap_locked) {
> +		if (!mmget_not_zero(mm)) {
> +			err = -EFAULT;
> +			goto err_out;
> +		}
> +		mmap_write_lock(mm);
> +	}
> +
> +	mmap_assert_locked(mm);
> +
> +	vas = vma_lookup(mm, start);
> +	if (!vas) {
> +		err = -ENOENT;
> +		goto err_mmunlock;
> +	}
> +
> +	if (end > vas->vm_end || start < vas->vm_start) {
> +		err = -EINVAL;
> +		goto err_mmunlock;
> +	}
> +
> +	if (!vma_is_anonymous(vas)) {
> +		err = -EBUSY;
> +		goto err_mmunlock;
> +	}
> +
> +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) +
> sizeof(*dma_addr) +
> +		       sizeof(*pages), GFP_KERNEL);
> +	if (!buf) {
> +		err = -ENOMEM;
> +		goto err_mmunlock;
> +	}
> +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr))
> * npages;
> +
> +	zdd = drm_gpusvm_zdd_alloc(range);
> +	if (!zdd) {
> +		err = -ENOMEM;
> +		goto err_free;
> +	}
> +
> +	migrate.vma = vas;
> +	migrate.src = buf;
> +	migrate.dst = migrate.src + npages;
> +
> +	err = migrate_vma_setup(&migrate);
> +	if (err)
> +		goto err_free;
> +
> +	/*
> +	 * FIXME: The cases below, !migrate.cpages and migrate.cpages != npages,
> +	 * are not always an error. Need to revisit possible cases and how to
> +	 * handle them. We could prefault on migrate.cpages != npages via
> +	 * hmm_range_fault.
> +	 */
> +
> +	if (!migrate.cpages) {
> +		err = -EFAULT;
> +		goto err_free;
> +	}
> +
> +	if (migrate.cpages != npages) {
> +		err = -EBUSY;
> +		goto err_finalize;
> +	}
> +
> +	err = gpusvm->ops->populate_vram_pfn(gpusvm,
> vram_allocation, npages,
> +					     migrate.dst);
> +	if (err)
> +		goto err_finalize;
> +
> +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> dma_addr,
> +					   migrate.src, npages,
> DMA_TO_DEVICE);
> +	if (err)
> +		goto err_finalize;
> +
> +	for (i = 0; i < npages; ++i) {
> +		struct page *page = pfn_to_page(migrate.dst[i]);
> +
> +		pages[i] = page;
> +		migrate.dst[i] = migrate_pfn(migrate.dst[i]);
> +		drm_gpusvm_get_vram_page(page, zdd);
> +	}
> +
> +	err = gpusvm->ops->copy_to_vram(gpusvm, pages, dma_addr,
> npages);
> +	if (err)
> +		goto err_finalize;
> +
> +	/* Upon success bind vram allocation to range and zdd */
> +	range->vram_allocation = vram_allocation;
> +	WRITE_ONCE(zdd->vram_allocation, vram_allocation);	/*
> Owns ref */
> +
> +err_finalize:
> +	if (err)
> +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> +	migrate_vma_pages(&migrate);
> +	migrate_vma_finalize(&migrate);
> +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr,
> npages,
> +				       DMA_TO_DEVICE);
> +err_free:
> +	if (zdd)
> +		drm_gpusvm_zdd_put(zdd);
> +	kvfree(buf);
> +err_mmunlock:
> +	if (!ctx->mmap_locked) {
> +		mmap_write_unlock(mm);
> +		mmput(mm);
> +	}
> +err_out:
> +	return err;
> +}
> +
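
A hedged sketch of a call site honoring the reference-counting contract in the
kernel-doc above; driver_vram_alloc/driver_vram_put are hypothetical and stand
in for however the driver manages its VRAM allocation object:

	static int driver_try_migrate_to_vram(struct drm_gpusvm *gpusvm,
					      struct drm_gpusvm_range *range,
					      const struct drm_gpusvm_ctx *ctx)
	{
		void *vram = driver_vram_alloc(gpusvm, range);	/* caller holds a ref */
		int err;

		if (IS_ERR(vram))
			return PTR_ERR(vram);

		err = drm_gpusvm_migrate_to_vram(gpusvm, range, vram, ctx);
		if (err)
			driver_vram_put(vram);	/* on failure, drop the caller's ref */
		return err;
	}
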
> +/**
> + * drm_gpusvm_migrate_populate_sram_pfn - Populate SRAM PFNs for a
> VM area
> + * @vas: Pointer to the VM area structure, can be NULL
> + * @npages: Number of pages to populate
> + * @src_mpfn: Source array of migrate PFNs
> + * @mpfn: Array of migrate PFNs to populate
> + * @addr: Start address for PFN allocation
> + *
> + * This function populates the SRAM migrate page frame numbers (PFNs) for the
> + * specified VM area structure. It allocates and locks pages in the VM area
> + * for SRAM usage. If @vas is non-NULL, alloc_page_vma is used for the
> + * allocation; if it is NULL, alloc_page is used.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +static int drm_gpusvm_migrate_populate_sram_pfn(struct
> vm_area_struct *vas,
> +						unsigned long
> npages,
> +						unsigned long
> *src_mpfn,
> +						unsigned long *mpfn,
> u64 addr)
> +{
> +	unsigned long i;
> +
> +	for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
> +		struct page *page;
> +
> +		if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
> +			continue;
> +
> +		if (vas)
> +			page = alloc_page_vma(GFP_HIGHUSER, vas,
> addr);
> +		else
> +			page = alloc_page(GFP_HIGHUSER);
> +
> +		if (!page)
> +			return -ENOMEM;
> +
> +		lock_page(page);
> +		mpfn[i] = migrate_pfn(page_to_pfn(page));
> +	}
> +
> +	return 0;
> +}
> +
> +/**
> + * drm_gpusvm_evict_to_sram - Evict GPU SVM range to SRAM
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + *
> + * Similar to __drm_gpusvm_migrate_to_sram but does not require the mmap lock;
> + * migration is done via the migrate_device_* functions. This is a fallback
> + * path, as it is preferred to issue migrations with the mmap lock held.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +static int drm_gpusvm_evict_to_sram(struct drm_gpusvm *gpusvm,
> +				    struct drm_gpusvm_range *range)
> +{
> +	unsigned long npages;
> +	struct page **pages;
> +	unsigned long *src, *dst;
> +	dma_addr_t *dma_addr;
> +	void *buf;
> +	int i, err = 0;
> +
> +	npages = npages_in_range(range->va.start, range->va.end);
> +
> +	buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr)
> +
> +		       sizeof(*pages), GFP_KERNEL);
> +	if (!buf) {
> +		err = -ENOMEM;
> +		goto err_out;
> +	}
> +	src = buf;
> +	dst = buf + (sizeof(*src) * npages);
> +	dma_addr = buf + (2 * sizeof(*src) * npages);
> +	pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) *
> npages;
> +
> +	err = gpusvm->ops->populate_vram_pfn(gpusvm, range-
> >vram_allocation,
> +					     npages, src);
> +	if (err)
> +		goto err_free;
> +
> +	err = migrate_device_vma_range(gpusvm->mm,
> +				       gpusvm-
> >device_private_page_owner, src,
> +				       npages, range->va.start);
> +	if (err)
> +		goto err_free;
> +
> +	err = drm_gpusvm_migrate_populate_sram_pfn(NULL, npages,
> src, dst, 0);
> +	if (err)
> +		goto err_finalize;
> +
> +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> dma_addr,
> +					   dst, npages,
> DMA_BIDIRECTIONAL);
> +	if (err)
> +		goto err_finalize;
> +
> +	for (i = 0; i < npages; ++i)
> +		pages[i] = migrate_pfn_to_page(src[i]);
> +
> +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr,
> npages);
> +	if (err)
> +		goto err_finalize;
> +
> +err_finalize:
> +	if (err)
> +		drm_gpusvm_migration_put_pages(npages, dst);
> +	migrate_device_pages(src, dst, npages);
> +	migrate_device_finalize(src, dst, npages);
> +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr,
> npages,
> +				       DMA_BIDIRECTIONAL);
> +err_free:
> +	kvfree(buf);
> +err_out:
> +
> +	return err;
> +}
> +
> +/**
> + * __drm_gpusvm_migrate_to_sram - Migrate GPU SVM range to SRAM
> (internal)
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @vas: Pointer to the VM area structure
> + * @page: Pointer to the page for fault handling (can be NULL)
> + * @start: Start address of the migration range
> + * @end: End address of the migration range
> + *
> + * This internal function performs the migration of the specified
> GPU SVM range
> + * to SRAM. It sets up the migration, populates + dma maps SRAM
> PFNs, and
> + * invokes the driver-specific operations for migration to SRAM.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +static int __drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> +					struct vm_area_struct *vas,
> +					struct page *page,
> +					u64 start, u64 end)
> +{
> +	struct migrate_vma migrate = {
> +		.vma		= vas,
> +		.pgmap_owner	= gpusvm->device_private_page_owner,
> +		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
> +		.fault_page	= page,
> +	};
> +	unsigned long npages;
> +	struct page **pages;
> +	dma_addr_t *dma_addr;
> +	void *buf;
> +	int i, err = 0;
> +
> +	mmap_assert_locked(gpusvm->mm);
> +
> +	/* Corner case where the VMA area struct has been partially unmapped */
> +	if (start < vas->vm_start)
> +		start = vas->vm_start;
> +	if (end > vas->vm_end)
> +		end = vas->vm_end;
> +
> +	migrate.start = start;
> +	migrate.end = end;
> +	npages = npages_in_range(start, end);
> +
> +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) +
> sizeof(*dma_addr) +
> +		       sizeof(*pages), GFP_KERNEL);
> +	if (!buf) {
> +		err = -ENOMEM;
> +		goto err_out;
> +	}
> +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr))
> * npages;
> +
> +	migrate.vma = vas;
> +	migrate.src = buf;
> +	migrate.dst = migrate.src + npages;
> +
> +	err = migrate_vma_setup(&migrate);
> +	if (err)
> +		goto err_free;
> +
> +	/* Raced with another CPU fault, nothing to do */
> +	if (!migrate.cpages)
> +		goto err_free;
> +
> +	err = drm_gpusvm_migrate_populate_sram_pfn(vas, npages,
> +						   migrate.src,
> migrate.dst,
> +						   start);
> +	if (err)
> +		goto err_finalize;
> +
> +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> dma_addr,
> +					   migrate.dst, npages,
> +					   DMA_BIDIRECTIONAL);
> +	if (err)
> +		goto err_finalize;
> +
> +	for (i = 0; i < npages; ++i)
> +		pages[i] = migrate_pfn_to_page(migrate.src[i]);

See comments below regarding which pages we actually want to migrate.


> +
> +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr,
> npages);
> +	if (err)
> +		goto err_finalize;
> +
> +err_finalize:
> +	if (err)
> +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> +	migrate_vma_pages(&migrate);
> +	migrate_vma_finalize(&migrate);
> +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr,
> npages,
> +				       DMA_BIDIRECTIONAL);
> +err_free:
> +	kvfree(buf);
> +err_out:
> +	mmap_assert_locked(gpusvm->mm);
> +
> +	return err;
> +}
> +
> +/**
> + * drm_gpusvm_migrate_to_sram - Migrate (evict) GPU SVM range to
> SRAM
> + * @gpusvm: Pointer to the GPU SVM structure
> + * @range: Pointer to the GPU SVM range structure
> + * @ctx: GPU SVM context
> + *
> + * This function initiates the migration of the specified GPU SVM
> range to
> + * SRAM. It performs necessary checks and invokes the internal
> migration
> + * function for actual migration.
> + *
> + * Returns:
> + * 0 on success, negative error code on failure.
> + */
> +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> +			       struct drm_gpusvm_range *range,
> +			       const struct drm_gpusvm_ctx *ctx)
> +{
> +	u64 start = range->va.start, end = range->va.end;
> +	struct mm_struct *mm = gpusvm->mm;
> +	struct vm_area_struct *vas;
> +	int err;
> +	bool retry = false;
> +
> +	if (!ctx->mmap_locked) {
> +		if (!mmget_not_zero(mm)) {
> +			err = -EFAULT;
> +			goto err_out;
> +		}
> +		if (ctx->trylock_mmap) {
> +			if (!mmap_read_trylock(mm))  {
> +				err =
> drm_gpusvm_evict_to_sram(gpusvm, range);
> +				goto err_mmput;
> +			}
> +		} else {
> +			mmap_read_lock(mm);
> +		}
> +	}
> +
> +	mmap_assert_locked(mm);
> +
> +	/*
> +	 * Loop required to find all VMA area structs for the corner
> case when
> +	 * VRAM backing has been partially unmapped from MM's
> address space.
> +	 */
> +again:
> +	vas = find_vma(mm, start);
> +	if (!vas) {
> +		if (!retry)
> +			err = -ENOENT;
> +		goto err_mmunlock;
> +	}
> +
> +	if (end <= vas->vm_start || start >= vas->vm_end) {
> +		if (!retry)
> +			err = -EINVAL;
> +		goto err_mmunlock;
> +	}
> +
> +	err = __drm_gpusvm_migrate_to_sram(gpusvm, vas, NULL, start,
> end);

This function is typically called from the VM side to get a clean mm as
a last resort after get_pages() fails. As such, should we have it evict
*everything*, even foreign device memory and mismatching local device
pages? If so, we could use hmm_range_fault() with a NULL page owner +
faulting to do that.

> +	if (err)
> +		goto err_mmunlock;
> +
> +	if (vas->vm_end < end) {
> +		retry = true;
> +		start = vas->vm_end;
> +		goto again;
> +	}
> +
> +	if (!ctx->mmap_locked) {
> +		mmap_read_unlock(mm);
> +		/*
> +		 * Using mmput_async as this function can be called
> while
> +		 * holding a dma-resv lock, and a final put can grab
> the mmap
> +		 * lock, causing a lock inversion.
> +		 */
> +		mmput_async(mm);
> +	}
> +
> +	return 0;
> +
> +err_mmunlock:
> +	if (!ctx->mmap_locked)
> +		mmap_read_unlock(mm);
> +err_mmput:
> +	if (!ctx->mmap_locked)
> +		mmput_async(mm);
> +err_out:
> +	return err;
> +}
> +
> +/**
> + * drm_gpusvm_page_free - Put GPU SVM zone device data associated
> with a page
> + * @page: Pointer to the page
> + *
> + * This function is a callback used to put the GPU SVM zone device
> data
> + * associated with a page when it is being released.
> + */
> +static void drm_gpusvm_page_free(struct page *page)
> +{
> +	drm_gpusvm_zdd_put(page->zone_device_data);
> +}
> +
> +/**
> + * drm_gpusvm_migrate_to_ram - Migrate GPU SVM range to RAM (page
> fault handler)
> + * @vmf: Pointer to the fault information structure
> + *
> + * This function is a page fault handler used to migrate a GPU SVM
> range to RAM.
> + * It retrieves the GPU SVM range information from the faulting page
> and invokes
> + * the internal migration function to migrate the range back to RAM.
> + *
> + * Returns:
> + * VM_FAULT_SIGBUS on failure, 0 on success.
> + */
> +static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault *vmf)
> +{
> +	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
> +	int err;
> +
> +	err = __drm_gpusvm_migrate_to_sram(zdd->range->gpusvm,
> +					   vmf->vma, vmf->page,
> +					   zdd->range->va.start,
> +					   zdd->range->va.end);

When called from here, since this is a pagemap op, shouldn't we ensure we
only migrate our own pagemap to RAM?

/Thanks,
Thomas
Matthew Brost Oct. 16, 2024, 3:18 a.m. UTC | #42
On Wed, Oct 09, 2024 at 12:50:42PM +0200, Thomas Hellström wrote:
> Hi, Matthew.
> 
> Some comments below around migrating to SRAM.
> 
> 
> On Tue, 2024-08-27 at 19:48 -0700, Matthew Brost wrote:
> > This patch introduces support for GPU Shared Virtual Memory (SVM) in
> > the
> > Direct Rendering Manager (DRM) subsystem. SVM allows for seamless
> > sharing of memory between the CPU and GPU, enhancing performance and
> > flexibility in GPU computing tasks.
> > 
> > The patch adds the necessary infrastructure for SVM, including data
> > structures and functions for managing SVM ranges and notifiers. It
> > also
> > provides mechanisms for allocating, deallocating, and migrating
> > memory
> > regions between system RAM and GPU VRAM.
> > 
> > This mid-layer is largely inspired by GPUVM.
> > 
> > Cc: Dave Airlie <airlied@redhat.com>
> > Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > Cc: Christian König <christian.koenig@amd.com>
> > Cc: <dri-devel@lists.freedesktop.org>
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> >  drivers/gpu/drm/xe/Makefile     |    3 +-
> >  drivers/gpu/drm/xe/drm_gpusvm.c | 2174
> > +++++++++++++++++++++++++++++++
> >  drivers/gpu/drm/xe/drm_gpusvm.h |  415 ++++++
> >  3 files changed, 2591 insertions(+), 1 deletion(-)
> >  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.c
> >  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.h
> > 
> > diff --git a/drivers/gpu/drm/xe/Makefile
> > b/drivers/gpu/drm/xe/Makefile
> > index b9670ae09a9e..b8fc2ee58f1a 100644
> > --- a/drivers/gpu/drm/xe/Makefile
> > +++ b/drivers/gpu/drm/xe/Makefile
> > @@ -25,7 +25,8 @@ $(obj)/generated/%_wa_oob.c
> > $(obj)/generated/%_wa_oob.h: $(obj)/xe_gen_wa_oob \
> >  
> >  # core driver code
> >  
> > -xe-y += xe_bb.o \
> > +xe-y += drm_gpusvm.o \
> > +	xe_bb.o \
> >  	xe_bo.o \
> >  	xe_bo_evict.o \
> >  	xe_devcoredump.o \
> > diff --git a/drivers/gpu/drm/xe/drm_gpusvm.c
> > b/drivers/gpu/drm/xe/drm_gpusvm.c
> > new file mode 100644
> > index 000000000000..fc1e44e6ae72
> > --- /dev/null
> > +++ b/drivers/gpu/drm/xe/drm_gpusvm.c
> > @@ -0,0 +1,2174 @@
> > +// SPDX-License-Identifier: MIT
> > +/*
> > + * Copyright © 2024 Intel Corporation
> > + *
> > + * Authors:
> > + *     Matthew Brost <matthew.brost@intel.com>
> > + */
> > +
> > +#include <linux/dma-mapping.h>
> > +#include <linux/interval_tree_generic.h>
> > +#include <linux/hmm.h>
> > +#include <linux/memremap.h>
> > +#include <linux/migrate.h>
> > +#include <linux/mm_types.h>
> > +#include <linux/pagemap.h>
> > +#include <linux/slab.h>
> > +
> > +#include <drm/drm_device.h>
> > +#include "drm_gpusvm.h"
> > +
> > +/**
> > + * DOC: Overview
> > + *
> > + * GPU Shared Virtual Memory (GPU SVM) layer for the Direct
> > Rendering Manager (DRM)
> > + *
> > + * The GPU SVM layer is a component of the DRM framework designed to
> > manage shared
> > + * virtual memory between the CPU and GPU. It enables efficient data
> > exchange and
> > + * processing for GPU-accelerated applications by allowing memory
> > sharing and
> > + * synchronization between the CPU's and GPU's virtual address
> > spaces.
> > + *
> > + * Key GPU SVM Components:
> > + * - Notifiers: Notifiers: Used for tracking memory intervals and
> > notifying the
> > + *		GPU of changes, notifiers are sized based on a GPU
> > SVM
> > + *		initialization parameter, with a recommendation of
> > 512M or
> > + *		larger. They maintain a Red-BlacK tree and a list of
> > ranges that
> > + *		fall within the notifier interval. Notifiers are
> > tracked within
> > + *		a GPU SVM Red-BlacK tree and list and are
> > dynamically inserted
> > + *		or removed as ranges within the interval are created
> > or
> > + *		destroyed.
> > + * - Ranges: Represent memory ranges mapped in a DRM device and
> > managed
> > + *	     by GPU SVM. They are sized based on an array of chunk
> > sizes, which
> > + *	     is a GPU SVM initialization parameter, and the CPU
> > address space.
> > + *	     Upon GPU fault, the largest aligned chunk that fits
> > within the
> > + *	     faulting CPU address space is chosen for the range
> > size. Ranges are
> > + *	     expected to be dynamically allocated on GPU fault and
> > removed on an
> > + *	     MMU notifier UNMAP event. As mentioned above, ranges
> > are tracked in
> > + *	     a notifier's Red-Black tree.
> > + * - Operations: Define the interface for driver-specific SVM
> > operations such as
> > + *		 allocation, page collection, migration,
> > invalidations, and VRAM
> > + *		 release.
> > + *
> > + * This layer provides interfaces for allocating, mapping,
> > migrating, and
> > + * releasing memory ranges between the CPU and GPU. It handles all
> > core memory
> > + * management interactions (DMA mapping, HMM, and migration) and
> > provides
> > + * driver-specific virtual functions (vfuncs). This infrastructure
> > is sufficient
> > + * to build the expected driver components for an SVM implementation
> > as detailed
> > + * below.
> > + *
> > + * Expected Driver Components:
> > + * - GPU page fault handler: Used to create ranges and notifiers
> > based on the
> > + *			     fault address, optionally migrate the
> > range to
> > + *			     VRAM, and create GPU bindings.
> > + * - Garbage collector: Used to destroy GPU bindings for ranges.
> > Ranges are
> > + *			expected to be added to the garbage
> > collector upon
> > + *			MMU_NOTIFY_UNMAP event.
> > + */
> > +
> > +/**
> > + * DOC: Locking
> > + *
> > + * GPU SVM handles locking for core MM interactions, i.e., it locks/unlocks the
> > + * mmap lock as needed. Alternatively, if the driver prefers to handle the mmap
> > + * lock itself, a 'locked' argument is provided to the functions that require
> > + * the mmap lock. This option may be useful for drivers that need to call into
> > + * GPU SVM while also holding a dma-resv lock, thus preventing locking
> > + * inversions between the mmap and dma-resv locks.
> > + *
> > + * GPU SVM introduces a global notifier lock, which safeguards the notifier's
> > + * range RB tree and list, as well as the range's DMA mappings and sequence
> > + * number. GPU SVM manages all necessary locking and unlocking operations,
> > + * except for the recheck of the range's sequence number
> > + * (mmu_interval_read_retry) when the driver is committing GPU bindings. This
> > + * lock corresponds to the 'driver->update' lock mentioned in the HMM
> > + * documentation (TODO: Link). Future revisions may transition from a GPU SVM
> > + * global lock to a per-notifier lock if finer-grained locking is deemed
> > + * necessary.
> > + *
> > + * In addition to the locking mentioned above, the driver should implement a
> > + * lock to safeguard core GPU SVM function calls that modify state, such as
> > + * drm_gpusvm_range_find_or_insert and drm_gpusvm_range_remove. Alternatively,
> > + * these core functions can be called within a single kernel thread, for
> > + * instance, using an ordered work queue. This lock is denoted as
> > + * 'driver_svm_lock' in code examples.
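> > + *
> > + * A minimal sketch of that driver-side lock, assuming the driver embeds
> > + * struct drm_gpusvm in a driver VM object (names are hypothetical; any
> > + * serialization with equivalent coverage, e.g. an ordered workqueue, works):
> > + *
> > + *	struct driver_vm {
> > + *		struct drm_gpusvm gpusvm;
> > + *		struct mutex svm_lock;
> > + *	};
> > + *
> > + *	// driver_svm_lock()/driver_svm_unlock() in the examples below simply
> > + *	// take and release vm->svm_lock around drm_gpusvm_range_find_or_insert()
> > + *	// and drm_gpusvm_range_remove().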
> > + */
> > +
> > +/**
> > + * DOC: Migration
> > + *
> > + * The migration support is quite simple, allowing migration between SRAM and
> > + * VRAM at the range granularity. For example, GPU SVM currently does not
> > + * support mixing SRAM and VRAM pages within a range. This means that upon GPU
> > + * fault, the entire range can be migrated to VRAM, and upon CPU fault, the
> > + * entire range is migrated to SRAM.
> > + *
> > + * The reasoning for only supporting range granularity is as follows: it
> > + * simplifies the implementation, and range sizes are driver-defined and should
> > + * be relatively small.
> > + */
> > +
> > +/**
> > + * DOC: Partial Unmapping of Ranges
> > + *
> > + * Partial unmapping of ranges (e.g., 1M out of 2M is unmapped by CPU resulting
> > + * in MMU_NOTIFY_UNMAP event) presents several challenges, with the main one
> > + * being that a subset of the range still has CPU and GPU mappings. If the
> > + * backing store for the range is in VRAM, a subset of the backing store has
> > + * references. One option would be to split the range and VRAM backing store,
> > + * but the implementation for this would be quite complicated. Given that
> > + * partial unmappings are rare and driver-defined range sizes are relatively
> > + * small, GPU SVM does not support splitting of ranges.
> > + *
> > + * With no support for range splitting, upon partial unmapping of a range, the
> > + * driver is expected to invalidate and destroy the entire range. If the range
> > + * has VRAM as its backing, the driver is also expected to migrate any remaining
> > + * pages back to SRAM.
> > + */
> > +
> > +/**
> > + * DOC: Examples
> > + *
> > + * This section provides two examples of how to build the expected driver
> > + * components: the GPU page fault handler and the garbage collector. A third
> > + * example demonstrates a sample invalidation driver vfunc.
> > + *
> > + * The generic code provided does not include logic for complex migration
> > + * policies, optimized invalidations, or other potentially required driver
> > + * locking (e.g., DMA-resv locks).
> > + *
> > + * 1) GPU page fault handler
> > + *
> > + *	int driver_bind_range(struct drm_gpusvm *gpusvm, struct drm_gpusvm_range *range)
> > + *	{
> > + *		int err = 0;
> > + *
> > + *		driver_alloc_and_setup_memory_for_bind(gpusvm, range);
> > + *
> > + *		drm_gpusvm_notifier_lock(gpusvm);
> > + *		if (drm_gpusvm_range_pages_valid(range))
> > + *			driver_commit_bind(gpusvm, range);
> > + *		else
> > + *			err = -EAGAIN;
> > + *		drm_gpusvm_notifier_unlock(gpusvm);
> > + *
> > + *		return err;
> > + *	}
> > + *
> > + *	int driver_gpu_fault(struct drm_gpusvm *gpusvm, u64 fault_addr,
> > + *			     u64 gpuva_start, u64 gpuva_end)
> > + *	{
> > + *		struct drm_gpusvm_ctx ctx = {};
> > + *		struct drm_gpusvm_range *range;
> > + *		void *bo;
> > + *		int err;
> > + *
> > + *		driver_svm_lock();
> > + *	retry:
> > + *		// Always process UNMAPs first so view of GPU SVM ranges is current
> > + *		driver_garbage_collector(gpusvm);
> > + *
> > + *		range = drm_gpusvm_range_find_or_insert(gpusvm, fault_addr,
> > + *							gpuva_start, gpuva_end,
> > + *							&ctx);
> > + *		if (IS_ERR(range)) {
> > + *			err = PTR_ERR(range);
> > + *			goto unlock;
> > + *		}
> > + *
> > + *		if (driver_migration_policy(range)) {
> > + *			bo = driver_alloc_bo();
> > + *			err = drm_gpusvm_migrate_to_vram(gpusvm, range, bo, &ctx);
> > + *			if (err)	// CPU mappings may have changed
> > + *				goto retry;
> > + *		}
> > + *
> > + *		err = drm_gpusvm_range_get_pages(gpusvm, range, &ctx);
> > + *		if (err == -EFAULT || err == -EPERM)	// CPU mappings changed
> > + *			goto retry;
> > + *		else if (err)
> > + *			goto unlock;
> > + *
> > + *		err = driver_bind_range(gpusvm, range);
> > + *		if (err == -EAGAIN)	// CPU mappings changed
> > + *			goto retry;
> > + *
> > + *	unlock:
> > + *		driver_svm_unlock();
> > + *		return err;
> > + *	}
> > + *
> > + * 2) Garbage Collector.
> > + *
> > + *	void __driver_garbage_collector(struct drm_gpusvm *gpusvm,
> > + *					struct drm_gpusvm_range *range)
> > + *	{
> > + *		struct drm_gpusvm_ctx ctx = {};
> > + *
> > + *		assert_driver_svm_locked(gpusvm);
> > + *
> > + *		// Partial unmap, migrate any remaining VRAM pages back to SRAM
> > + *		if (range->flags.partial_unmap)
> > + *			drm_gpusvm_migrate_to_sram(gpusvm, range, &ctx);
> > + *
> > + *		driver_unbind_range(range);
> > + *		drm_gpusvm_range_remove(gpusvm, range);
> > + *	}
> > + *
> > + *	void driver_garbage_collector(struct drm_gpusvm *gpusvm)
> > + *	{
> > + *		assert_driver_svm_locked(gpusvm);
> > + *
> > + *		for_each_range_in_garbage_collector(gpusvm, range)
> > + *			__driver_garbage_collector(gpusvm, range);
> > + *	}
> > + *
> > + * 3) Invalidation driver vfunc.
> > + *
> > + *	void driver_invalidation(struct drm_gpusvm *gpusvm,
> > + *				 struct drm_gpusvm_notifier *notifier,
> > + *				 const struct mmu_notifier_range *mmu_range)
> > + *	{
> > + *		struct drm_gpusvm_ctx ctx = { .in_notifier = true, };
> > + *		struct drm_gpusvm_range *range = NULL;
> > + *
> > + *		driver_invalidate_device_tlb(gpusvm, mmu_range->start, mmu_range->end);
> > + *
> > + *		drm_gpusvm_for_each_range(range, notifier, mmu_range->start,
> > + *					  mmu_range->end) {
> > + *			drm_gpusvm_range_unmap_pages(gpusvm, range, &ctx);
> > + *
> > + *			if (mmu_range->event != MMU_NOTIFY_UNMAP)
> > + *				continue;
> > + *
> > + *			drm_gpusvm_range_set_unmapped(range, mmu_range);
> > + *			driver_garbage_collector_add(gpusvm, range);
> > + *		}
> > + *	}
> > + */
> > +
> > +#define DRM_GPUSVM_RANGE_START(_range)	((_range)->va.start)
> > +#define DRM_GPUSVM_RANGE_END(_range)	((_range)->va.end - 1)
> > +INTERVAL_TREE_DEFINE(struct drm_gpusvm_range, rb.node, u64,
> > rb.__subtree_last,
> > +		     DRM_GPUSVM_RANGE_START, DRM_GPUSVM_RANGE_END,
> > +		     static __maybe_unused, range);
> > +
> > +#define DRM_GPUSVM_NOTIFIER_START(_notifier)	((_notifier)-
> > >interval.start)
> > +#define DRM_GPUSVM_NOTIFIER_END(_notifier)	((_notifier)-
> > >interval.end - 1)
> > +INTERVAL_TREE_DEFINE(struct drm_gpusvm_notifier, rb.node, u64,
> > +		     rb.__subtree_last, DRM_GPUSVM_NOTIFIER_START,
> > +		     DRM_GPUSVM_NOTIFIER_END, static __maybe_unused,
> > notifier);
> > +
> > +/**
> > + * npages_in_range() - Calculate the number of pages in a given
> > range
> > + * @start__: The start address of the range
> > + * @end__: The end address of the range
> > + *
> > + * This macro calculates the number of pages in a given memory
> > range,
> > + * specified by the start and end addresses. It divides the
> > difference
> > + * between the end and start addresses by the page size (PAGE_SIZE)
> > to
> > + * determine the number of pages in the range.
> > + *
> > + * Return: The number of pages in the specified range.
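> > + *
> > + * For example, npages_in_range(0x200000, 0x220000) evaluates to 32 for a 4K
> > + * PAGE_SIZE (the range spans 128K).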
> > + */
> > +#define npages_in_range(start__, end__)	\
> > +	(((end__) - (start__)) >> PAGE_SHIFT)
> > +
> > +/**
> > + * struct drm_gpusvm_zdd - GPU SVM zone device data
> > + *
> > + * @refcount: Reference count for the zdd
> > + * @destroy_work: Work structure for asynchronous zdd destruction
> > + * @range: Pointer to the GPU SVM range
> > + * @vram_allocation: Driver-private pointer to the VRAM allocation
> > + *
> > + * This structure serves as a generic wrapper installed in
> > + * page->zone_device_data. It provides infrastructure for looking up a range
> > + * upon CPU page fault and asynchronously releasing VRAM once the CPU has no
> > + * page references. Asynchronous release is useful because CPU page references
> > + * can be dropped in IRQ contexts, while releasing VRAM likely requires sleeping
> > + * locks.
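> > + *
> > + * A sketch of the lookup chain this enables on CPU fault (internal to GPU SVM,
> > + * shown here only to illustrate the indirection):
> > + *
> > + *	struct drm_gpusvm_zdd *zdd = page->zone_device_data;
> > + *	struct drm_gpusvm_range *range = zdd->range;
> > + *	struct drm_gpusvm *gpusvm = range->gpusvm;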
> > + */
> > +struct drm_gpusvm_zdd {
> > +	struct kref refcount;
> > +	struct work_struct destroy_work;
> > +	struct drm_gpusvm_range *range;
> > +	void *vram_allocation;
> > +};
> > +
> > +/**
> > + * drm_gpusvm_zdd_destroy_work_func - Work function for destroying a
> > zdd
> > + * @w: Pointer to the work_struct
> > + *
> > + * This function releases VRAM, puts GPU SVM range, and frees zdd.
> > + */
> > +static void drm_gpusvm_zdd_destroy_work_func(struct work_struct *w)
> > +{
> > +	struct drm_gpusvm_zdd *zdd =
> > +		container_of(w, struct drm_gpusvm_zdd,
> > destroy_work);
> > +	struct drm_gpusvm_range *range = zdd->range;
> > +	struct drm_gpusvm *gpusvm = range->gpusvm;
> > +
> > +	if (gpusvm->ops->vram_release && zdd->vram_allocation)
> > +		gpusvm->ops->vram_release(zdd->vram_allocation);
> > +	drm_gpusvm_range_put(range);
> > +	kfree(zdd);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_zdd_alloc - Allocate a zdd structure.
> > + * @range: Pointer to the GPU SVM range.
> > + *
> > + * This function allocates and initializes a new zdd structure. It
> > sets up the
> > + * reference count, initializes the destroy work, and links the
> > provided GPU SVM
> > + * range.
> > + *
> > + * Returns:
> > + * Pointer to the allocated zdd on success, NULL on failure.
> > + */
> > +static struct drm_gpusvm_zdd *
> > +drm_gpusvm_zdd_alloc(struct drm_gpusvm_range *range)
> > +{
> > +	struct drm_gpusvm_zdd *zdd;
> > +
> > +	zdd = kmalloc(sizeof(*zdd), GFP_KERNEL);
> > +	if (!zdd)
> > +		return NULL;
> > +
> > +	kref_init(&zdd->refcount);
> > +	INIT_WORK(&zdd->destroy_work,
> > drm_gpusvm_zdd_destroy_work_func);
> > +	zdd->range = drm_gpusvm_range_get(range);
> > +	zdd->vram_allocation = NULL;
> > +
> > +	return zdd;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_zdd_get - Get a reference to a zdd structure.
> > + * @zdd: Pointer to the zdd structure.
> > + *
> > + * This function increments the reference count of the provided zdd
> > structure.
> > + *
> > + * Returns: Pointer to the zdd structure.
> > + */
> > +static struct drm_gpusvm_zdd *drm_gpusvm_zdd_get(struct
> > drm_gpusvm_zdd *zdd)
> > +{
> > +	kref_get(&zdd->refcount);
> > +	return zdd;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_zdd_destroy - Destroy a zdd structure.
> > + * @ref: Pointer to the reference count structure.
> > + *
> > + * This function queues the destroy_work of the zdd for asynchronous
> > destruction.
> > + */
> > +static void drm_gpusvm_zdd_destroy(struct kref *ref)
> > +{
> > +	struct drm_gpusvm_zdd *zdd =
> > +		container_of(ref, struct drm_gpusvm_zdd, refcount);
> > +	struct drm_gpusvm *gpusvm = zdd->range->gpusvm;
> > +
> > +	queue_work(gpusvm->zdd_wq, &zdd->destroy_work);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_zdd_put - Put a zdd reference.
> > + * @zdd: Pointer to the zdd structure.
> > + *
> > + * This function decrements the reference count of the provided zdd
> > structure
> > + * and schedules its destruction if the count drops to zero.
> > + */
> > +static void drm_gpusvm_zdd_put(struct drm_gpusvm_zdd *zdd)
> > +{
> > +	kref_put(&zdd->refcount, drm_gpusvm_zdd_destroy);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_find - Find GPU SVM range from GPU SVM notifier
> > + * @notifier: Pointer to the GPU SVM notifier structure.
> > + * @start: Start address of the range
> > + * @end: End address of the range
> > + *
> > + * Return: A pointer to the drm_gpusvm_range if found or NULL
> > + */
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64
> > start, u64 end)
> > +{
> > +	return range_iter_first(&notifier->root, start, end - 1);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_for_each_range_safe - Safely iterate over GPU SVM
> > ranges in a notifier
> > + * @range__: Iterator variable for the ranges
> > + * @next__: Iterator variable for the ranges temporary storage
> > + * @notifier__: Pointer to the GPU SVM notifier
> > + * @start__: Start address of the range
> > + * @end__: End address of the range
> > + *
> > + * This macro is used to iterate over GPU SVM ranges in a notifier
> > while
> > + * removing ranges from it.
> > + */
> > +#define drm_gpusvm_for_each_range_safe(range__, next__, notifier__,
> > start__, end__)	\
> > +	for ((range__) = drm_gpusvm_range_find((notifier__),
> > (start__), (end__)),	\
> > +	     (next__) =
> > __drm_gpusvm_range_next(range__);				\
> > +	     (range__) && (range__->va.start <
> > (end__));				\
> > +	     (range__) = (next__), (next__) =
> > __drm_gpusvm_range_next(range__))
> > +
> > +/**
> > + * __drm_gpusvm_notifier_next - get the next drm_gpusvm_notifier in
> > the list
> > + * @notifier: a pointer to the current drm_gpusvm_notifier
> > + *
> > + * Return: A pointer to the next drm_gpusvm_notifier if available,
> > or NULL if
> > + *         the current notifier is the last one or if the input
> > notifier is
> > + *         NULL.
> > + */
> > +static struct drm_gpusvm_notifier *
> > +__drm_gpusvm_notifier_next(struct drm_gpusvm_notifier *notifier)
> > +{
> > +	if (notifier && !list_is_last(&notifier->rb.entry,
> > +				      &notifier->gpusvm-
> > >notifier_list))
> > +		return list_next_entry(notifier, rb.entry);
> > +
> > +	return NULL;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_for_each_notifier - Iterate over GPU SVM notifiers in
> > a gpusvm
> > + * @notifier__: Iterator variable for the notifiers
> > + * @gpusvm__: Pointer to the GPU SVM structure
> > + * @start__: Start address of the notifier
> > + * @end__: End address of the notifier
> > + *
> > + * This macro is used to iterate over GPU SVM notifiers in a gpusvm.
> > + */
> > +#define drm_gpusvm_for_each_notifier(notifier__, gpusvm__, start__,
> > end__)		\
> > +	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root,
> > (start__), (end__) - 1);	\
> > +	     (notifier__) && (notifier__->interval.start <
> > (end__));			\
> > +	     (notifier__) = __drm_gpusvm_notifier_next(notifier__))
> > +
> > +/**
> > + * drm_gpusvm_for_each_notifier_safe - Safely iterate over GPU SVM
> > notifiers in a gpusvm
> > + * @notifier__: Iterator variable for the notifiers
> > + * @next__: Iterator variable for the notifiers temporary storage
> > + * @gpusvm__: Pointer to the GPU SVM structure
> > + * @start__: Start address of the notifier
> > + * @end__: End address of the notifier
> > + *
> > + * This macro is used to iterate over GPU SVM notifiers in a gpusvm
> > while
> > + * removing notifiers from it.
> > + */
> > +#define drm_gpusvm_for_each_notifier_safe(notifier__, next__,
> > gpusvm__, start__, end__)	\
> > +	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root,
> > (start__), (end__) - 1),	\
> > +	     (next__) =
> > __drm_gpusvm_notifier_next(notifier__);				\
> > +	     (notifier__) && (notifier__->interval.start <
> > (end__));			\
> > +	     (notifier__) = (next__), (next__) =
> > __drm_gpusvm_notifier_next(notifier__))
> > +
> > +/**
> > + * drm_gpusvm_notifier_invalidate - Invalidate a GPU SVM notifier.
> > + * @mni: Pointer to the mmu_interval_notifier structure.
> > + * @mmu_range: Pointer to the mmu_notifier_range structure.
> > + * @cur_seq: Current sequence number.
> > + *
> > + * This function serves as a generic MMU notifier for GPU SVM. It
> > sets the MMU
> > + * notifier sequence number and calls the driver invalidate vfunc
> > under
> > + * gpusvm->notifier_lock.
> > + *
> > + * Returns:
> > + * true if the operation succeeds, false otherwise.
> > + */
> > +static bool
> > +drm_gpusvm_notifier_invalidate(struct mmu_interval_notifier *mni,
> > +			       const struct mmu_notifier_range
> > *mmu_range,
> > +			       unsigned long cur_seq)
> > +{
> > +	struct drm_gpusvm_notifier *notifier =
> > +		container_of(mni, typeof(*notifier), notifier);
> > +	struct drm_gpusvm *gpusvm = notifier->gpusvm;
> > +
> > +	if (!mmu_notifier_range_blockable(mmu_range))
> > +		return false;
> > +
> > +	down_write(&gpusvm->notifier_lock);
> > +	mmu_interval_set_seq(mni, cur_seq);
> > +	gpusvm->ops->invalidate(gpusvm, notifier, mmu_range);
> > +	up_write(&gpusvm->notifier_lock);
> > +
> > +	return true;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_notifier_ops - MMU interval notifier operations for
> > GPU SVM
> > + */
> > +static const struct mmu_interval_notifier_ops
> > drm_gpusvm_notifier_ops = {
> > +	.invalidate = drm_gpusvm_notifier_invalidate,
> > +};
> > +
> > +/**
> > + * drm_gpusvm_init - Initialize the GPU SVM.
> > + * @gpusvm: Pointer to the GPU SVM structure.
> > + * @name: Name of the GPU SVM.
> > + * @drm: Pointer to the DRM device structure.
> > + * @mm: Pointer to the mm_struct for the address space.
> > + * @device_private_page_owner: Device private pages owner.
> > + * @mm_start: Start address of GPU SVM.
> > + * @mm_range: Range of the GPU SVM.
> > + * @notifier_size: Size of individual notifiers.
> > + * @ops: Pointer to the operations structure for GPU SVM.
> > + * @chunk_sizes: Pointer to the array of chunk sizes used in range
> > allocation.
> > + *               Entries should be powers of 2 in descending order
> > with last
> > + *               entry being SZ_4K.
> > + * @num_chunks: Number of chunks.
> > + *
> > + * This function initializes the GPU SVM.
> > + *
> > + * Returns:
> > + * 0 on success, a negative error code on failure.
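> > + *
> > + * A minimal initialization sketch, assuming a driver VM object embedding
> > + * struct drm_gpusvm and the hypothetical driver_gpusvm_ops above (no
> > + * device-private page owner here, i.e. no VRAM migration in this example):
> > + *
> > + *	static const u64 driver_chunk_sizes[] = { SZ_2M, SZ_64K, SZ_4K };
> > + *
> > + *	err = drm_gpusvm_init(&vm->gpusvm, "driver-svm", drm, current->mm,
> > + *			      NULL, 0, 1ull << 47, SZ_512M,
> > + *			      &driver_gpusvm_ops, driver_chunk_sizes,
> > + *			      ARRAY_SIZE(driver_chunk_sizes));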
> > + */
> > +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> > +		    const char *name, struct drm_device *drm,
> > +		    struct mm_struct *mm, void
> > *device_private_page_owner,
> > +		    u64 mm_start, u64 mm_range, u64 notifier_size,
> > +		    const struct drm_gpusvm_ops *ops,
> > +		    const u64 *chunk_sizes, int num_chunks)
> > +{
> > +	if (!ops->invalidate || !num_chunks)
> > +		return -EINVAL;
> > +
> > +	gpusvm->name = name;
> > +	gpusvm->drm = drm;
> > +	gpusvm->mm = mm;
> > +	gpusvm->device_private_page_owner =
> > device_private_page_owner;
> > +	gpusvm->mm_start = mm_start;
> > +	gpusvm->mm_range = mm_range;
> > +	gpusvm->notifier_size = notifier_size;
> > +	gpusvm->ops = ops;
> > +	gpusvm->chunk_sizes = chunk_sizes;
> > +	gpusvm->num_chunks = num_chunks;
> > +	gpusvm->zdd_wq = system_wq;
> > +
> > +	mmgrab(mm);
> > +	gpusvm->root = RB_ROOT_CACHED;
> > +	INIT_LIST_HEAD(&gpusvm->notifier_list);
> > +
> > +	init_rwsem(&gpusvm->notifier_lock);
> > +
> > +	fs_reclaim_acquire(GFP_KERNEL);
> > +	might_lock(&gpusvm->notifier_lock);
> > +	fs_reclaim_release(GFP_KERNEL);
> > +
> > +	return 0;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_notifier_find - Find GPU SVM notifier
> > + * @gpusvm__: Pointer to the GPU SVM structure
> > + * @fault_addr__: Fault address
> > + *
> > + * This macro finds the GPU SVM notifier associated with the fault
> > address.
> > + *
> > + * Returns:
> > + * Pointer to the GPU SVM notifier on success, NULL otherwise.
> > + */
> > +#define drm_gpusvm_notifier_find(gpusvm__, fault_addr__)	\
> > +	notifier_iter_first(&(gpusvm__)->root, (fault_addr__),	\
> > +			    (fault_addr__ + 1))
> > +
> > +/**
> > + * to_drm_gpusvm_notifier - retrieve the container struct for a
> > given rbtree node
> > + * @node__: a pointer to the rbtree node embedded within a
> > drm_gpusvm_notifier struct
> > + *
> > + * Return: A pointer to the containing drm_gpusvm_notifier
> > structure.
> > + */
> > +#define to_drm_gpusvm_notifier(node__)				\
> > +	container_of((node__), struct drm_gpusvm_notifier, rb.node)
> > +
> > +/**
> > + * drm_gpusvm_notifier_insert - Insert GPU SVM notifier
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + *
> > + * This function inserts the GPU SVM notifier into the GPU SVM RB
> > tree and list.
> > + */
> > +static void drm_gpusvm_notifier_insert(struct drm_gpusvm *gpusvm,
> > +				       struct drm_gpusvm_notifier
> > *notifier)
> > +{
> > +	struct rb_node *node;
> > +	struct list_head *head;
> > +
> > +	notifier_insert(notifier, &gpusvm->root);
> > +
> > +	node = rb_prev(&notifier->rb.node);
> > +	if (node)
> > +		head = &(to_drm_gpusvm_notifier(node))->rb.entry;
> > +	else
> > +		head = &gpusvm->notifier_list;
> > +
> > +	list_add(&notifier->rb.entry, head);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_notifier_remove - Remove GPU SVM notifier
> > + * @gpusvm__: Pointer to the GPU SVM structure
> > + * @notifier__: Pointer to the GPU SVM notifier structure
> > + *
> > + * This macro removes the GPU SVM notifier from the GPU SVM RB tree
> > and list.
> > + */
> > +#define drm_gpusvm_notifier_remove(gpusvm__, notifier__)	\
> > +	notifier_remove((notifier__), &(gpusvm__)->root);	\
> > +	list_del(&(notifier__)->rb.entry)
> > +
> > +/**
> > + * drm_gpusvm_fini - Finalize the GPU SVM.
> > + * @gpusvm: Pointer to the GPU SVM structure.
> > + *
> > + * This function finalizes the GPU SVM by cleaning up any remaining
> > ranges and
> > + * notifiers, and dropping a reference to struct MM.
> > + */
> > +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm)
> > +{
> > +	struct drm_gpusvm_notifier *notifier, *next;
> > +
> > +	drm_gpusvm_for_each_notifier_safe(notifier, next, gpusvm, 0,
> > LONG_MAX) {
> > +		struct drm_gpusvm_range *range, *__next;
> > +
> > +		/*
> > +		 * Remove notifier first to avoid racing with any
> > invalidation
> > +		 */
> > +		mmu_interval_notifier_remove(&notifier->notifier);
> > +		notifier->flags.removed = true;
> > +
> > +		drm_gpusvm_for_each_range_safe(range, __next,
> > notifier, 0,
> > +					       LONG_MAX)
> > +			drm_gpusvm_range_remove(gpusvm, range);
> > +	}
> > +
> > +	mmdrop(gpusvm->mm);
> > +	WARN_ON(!RB_EMPTY_ROOT(&gpusvm->root.rb_root));
> > +}
> > +
> > +/**
> > + * drm_gpusvm_notifier_alloc - Allocate GPU SVM notifier
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @fault_addr: Fault address
> > + *
> > + * This function allocates and initializes the GPU SVM notifier
> > structure.
> > + *
> > + * Returns:
> > + * Pointer to the allocated GPU SVM notifier on success, ERR_PTR()
> > on failure.
> > + */
> > +static struct drm_gpusvm_notifier *
> > +drm_gpusvm_notifier_alloc(struct drm_gpusvm *gpusvm, u64 fault_addr)
> > +{
> > +	struct drm_gpusvm_notifier *notifier;
> > +
> > +	if (gpusvm->ops->notifier_alloc)
> > +		notifier = gpusvm->ops->notifier_alloc();
> > +	else
> > +		notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
> > +
> > +	if (!notifier)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	notifier->gpusvm = gpusvm;
> > +	notifier->interval.start = ALIGN_DOWN(fault_addr, gpusvm-
> > >notifier_size);
> > +	notifier->interval.end = ALIGN(fault_addr + 1, gpusvm-
> > >notifier_size);
> > +	INIT_LIST_HEAD(&notifier->rb.entry);
> > +	notifier->root = RB_ROOT_CACHED;
> > +	INIT_LIST_HEAD(&notifier->range_list);
> > +
> > +	return notifier;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_notifier_free - Free GPU SVM notifier
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + *
> > + * This function frees the GPU SVM notifier structure.
> > + */
> > +static void drm_gpusvm_notifier_free(struct drm_gpusvm *gpusvm,
> > +				     struct drm_gpusvm_notifier
> > *notifier)
> > +{
> > +	WARN_ON(!RB_EMPTY_ROOT(&notifier->root.rb_root));
> > +
> > +	if (gpusvm->ops->notifier_free)
> > +		gpusvm->ops->notifier_free(notifier);
> > +	else
> > +		kfree(notifier);
> > +}
> > +
> > +/**
> > + * to_drm_gpusvm_range - retrieve the container struct for a given
> > rbtree node
> > + * @node__: a pointer to the rbtree node embedded within a
> > drm_gpusvm_range struct
> > + *
> > + * Return: A pointer to the containing drm_gpusvm_range structure.
> > + */
> > +#define to_drm_gpusvm_range(node__)	\
> > +	container_of((node__), struct drm_gpusvm_range, rb.node)
> > +
> > +/**
> > + * drm_gpusvm_range_insert - Insert GPU SVM range
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * This function inserts the GPU SVM range into the notifier RB tree
> > and list.
> > + */
> > +static void drm_gpusvm_range_insert(struct drm_gpusvm_notifier
> > *notifier,
> > +				    struct drm_gpusvm_range *range)
> > +{
> > +	struct rb_node *node;
> > +	struct list_head *head;
> > +
> > +	drm_gpusvm_notifier_lock(notifier->gpusvm);
> > +	range_insert(range, &notifier->root);
> > +
> > +	node = rb_prev(&range->rb.node);
> > +	if (node)
> > +		head = &(to_drm_gpusvm_range(node))->rb.entry;
> > +	else
> > +		head = &notifier->range_list;
> > +
> > +	list_add(&range->rb.entry, head);
> > +	drm_gpusvm_notifier_unlock(notifier->gpusvm);
> > +}
> > +
> > +/**
> > + * __drm_gpusvm_range_remove - Remove GPU SVM range
> > + * @notifier__: Pointer to the GPU SVM notifier structure
> > + * @range__: Pointer to the GPU SVM range structure
> > + *
> > + * This macro removes the GPU SVM range from the notifier RB tree
> > and list.
> > + */
> > +#define __drm_gpusvm_range_remove(notifier__, range__)		\
> > +	range_remove((range__), &(notifier__)->root);		\
> > +	list_del(&(range__)->rb.entry)
> > +
> > +/**
> > + * drm_gpusvm_range_alloc - Allocate GPU SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + * @fault_addr: Fault address
> > + * @chunk_size: Chunk size
> > + * @migrate_vram: Flag indicating whether to migrate VRAM
> > + *
> > + * This function allocates and initializes the GPU SVM range
> > structure.
> > + *
> > + * Returns:
> > + * Pointer to the allocated GPU SVM range on success, ERR_PTR() on
> > failure.
> > + */
> > +static struct drm_gpusvm_range *
> > +drm_gpusvm_range_alloc(struct drm_gpusvm *gpusvm,
> > +		       struct drm_gpusvm_notifier *notifier,
> > +		       u64 fault_addr, u64 chunk_size, bool
> > migrate_vram)
> > +{
> > +	struct drm_gpusvm_range *range;
> > +
> > +	if (gpusvm->ops->range_alloc)
> > +		range = gpusvm->ops->range_alloc(gpusvm);
> > +	else
> > +		range = kzalloc(sizeof(*range), GFP_KERNEL);
> > +
> > +	if (!range)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	kref_init(&range->refcount);
> > +	range->gpusvm = gpusvm;
> > +	range->notifier = notifier;
> > +	range->va.start = ALIGN_DOWN(fault_addr, chunk_size);
> > +	range->va.end = ALIGN(fault_addr + 1, chunk_size);
> > +	INIT_LIST_HEAD(&range->rb.entry);
> > +	range->notifier_seq = LONG_MAX;
> > +	range->flags.migrate_vram = migrate_vram ? 1 : 0;
> > +
> > +	return range;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_check_pages - Check pages
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + * @start: Start address
> > + * @end: End address
> > + *
> > + * Check if pages between start and end have been faulted in on the CPU. Used
> > + * to prevent migration of pages without a CPU backing store.
> > + *
> > + * Returns:
> > + * True if pages have been faulted into CPU, False otherwise
> > + */
> > +static bool drm_gpusvm_check_pages(struct drm_gpusvm *gpusvm,
> > +				   struct drm_gpusvm_notifier
> > *notifier,
> > +				   u64 start, u64 end)
> > +{
> > +	struct hmm_range hmm_range = {
> > +		.default_flags = 0,
> > +		.notifier = &notifier->notifier,
> > +		.start = start,
> > +		.end = end,
> > +		.dev_private_owner = gpusvm-
> > >device_private_page_owner,
> > +	};
> > +	unsigned long timeout =
> > +		jiffies +
> > msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> > +	unsigned long *pfns;
> > +	unsigned long npages = npages_in_range(start, end);
> > +	int err, i;
> > +
> > +	mmap_assert_locked(gpusvm->mm);
> > +
> > +	pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
> > +	if (!pfns)
> > +		return false;
> > +
> > +	hmm_range.notifier_seq = mmu_interval_read_begin(&notifier-
> > >notifier);
> > +	hmm_range.hmm_pfns = pfns;
> > +
> > +	while (true) {
> > +		err = hmm_range_fault(&hmm_range);
> > +		if (err == -EBUSY) {
> > +			if (time_after(jiffies, timeout))
> > +				break;
> > +
> > +			hmm_range.notifier_seq =
> > mmu_interval_read_begin(&notifier->notifier);
> > +			continue;
> > +		}
> > +		break;
> > +	}
> > +	if (err)
> > +		goto err_free;
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		if (!(pfns[i] & HMM_PFN_VALID)) {
> > +			err = -EFAULT;
> > +			goto err_free;
> > +		}
> > +	}
> > +
> > +err_free:
> > +	kvfree(pfns);
> > +	return err ? false : true;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_chunk_size - Determine chunk size for GPU SVM
> > range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @notifier: Pointer to the GPU SVM notifier structure
> > + * @vas: Pointer to the virtual memory area structure
> > + * @fault_addr: Fault address
> > + * @gpuva_start: Start address of GPUVA which mirrors CPU
> > + * @gpuva_end: End address of GPUVA which mirrors CPU
> > + * @check_pages: Flag indicating whether to check pages
> > + *
> > + * This function determines the chunk size for the GPU SVM range
> > based on the
> > + * fault address, GPU SVM chunk sizes, existing GPU SVM ranges, and
> > the virtual
> > + * memory area boundaries.
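> > + *
> > + * For example, with chunk sizes of {SZ_2M, SZ_64K, SZ_4K} (a hypothetical
> > + * configuration) and a fault inside a 128K VMA, SZ_2M cannot fit, so the
> > + * largest chunk whose aligned start and end stay within the VMA, the notifier
> > + * interval, and the GPUVA range, and which does not overlap an existing range,
> > + * is selected; typically SZ_64K here.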
> > + *
> > + * Returns:
> > + * Chunk size on success, LONG_MAX on failure.
> > + */
> > +static u64 drm_gpusvm_range_chunk_size(struct drm_gpusvm *gpusvm,
> > +				       struct drm_gpusvm_notifier
> > *notifier,
> > +				       struct vm_area_struct *vas,
> > +				       u64 fault_addr, u64
> > gpuva_start,
> > +				       u64 gpuva_end, bool
> > check_pages)
> > +{
> > +	u64 start, end;
> > +	int i = 0;
> > +
> > +retry:
> > +	for (; i < gpusvm->num_chunks; ++i) {
> > +		start = ALIGN_DOWN(fault_addr, gpusvm-
> > >chunk_sizes[i]);
> > +		end = ALIGN(fault_addr + 1, gpusvm->chunk_sizes[i]);
> > +
> > +		if (start >= vas->vm_start && end <= vas->vm_end &&
> > +		    start >= notifier->interval.start &&
> > +		    end <= notifier->interval.end &&
> > +		    start >= gpuva_start && end <= gpuva_end)
> > +			break;
> > +	}
> > +
> > +	if (i == gpusvm->num_chunks)
> > +		return LONG_MAX;
> > +
> > +	/*
> > +	 * If allocation more than page, ensure not to overlap with
> > existing
> > +	 * ranges.
> > +	 */
> > +	if (end - start != SZ_4K) {
> > +		struct drm_gpusvm_range *range;
> > +
> > +		range = drm_gpusvm_range_find(notifier, start, end);
> > +		if (range) {
> > +			++i;
> > +			goto retry;
> > +		}
> > +
> > +		/*
> > +		 * XXX: Only create range on pages CPU has faulted in. Without
> > +		 * this check, or prefault, on BMG 'xe_exec_system_allocator --r
> > +		 * process-many-malloc' fails. In the failure case, each process
> > +		 * mallocs 16k but the CPU VMA is ~128k which results in 64k SVM
> > +		 * ranges. When migrating the SVM ranges, some processes fail in
> > +		 * drm_gpusvm_migrate_to_vram with 'migrate.cpages != npages'
> > +		 * and then upon drm_gpusvm_range_get_pages device pages from
> > +		 * other processes are collected + faulted in, which creates all
> > +		 * sorts of problems. Unsure exactly how this is happening; the
> > +		 * problem goes away if 'xe_exec_system_allocator --r
> > +		 * process-many-malloc' mallocs at least 64k at a time.
> > +		 */
> > +		if (check_pages &&
> > +		    !drm_gpusvm_check_pages(gpusvm, notifier, start,
> > end)) {
> > +			++i;
> > +			goto retry;
> > +		}
> > +	}
> > +
> > +	return end - start;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_find_or_insert - Find or insert GPU SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @fault_addr: Fault address
> > + * @gpuva_start: Start address of GPUVA which mirrors CPU
> > + * @gpuva_end: End address of GPUVA which mirrors CPU
> > + * @ctx: GPU SVM context
> > + *
> > + * This function finds or inserts a newly allocated GPU SVM range based on the
> > + * fault address. The caller must hold a lock to protect range lookup and
> > + * insertion.
> > + *
> > + * Returns:
> > + * Pointer to the GPU SVM range on success, ERR_PTR() on failure.
> > + */
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64
> > fault_addr,
> > +				u64 gpuva_start, u64 gpuva_end,
> > +				const struct drm_gpusvm_ctx *ctx)
> > +{
> > +	struct drm_gpusvm_notifier *notifier;
> > +	struct drm_gpusvm_range *range;
> > +	struct mm_struct *mm = gpusvm->mm;
> > +	struct vm_area_struct *vas;
> > +	bool notifier_alloc = false;
> > +	u64 chunk_size;
> > +	int err;
> > +	bool migrate_vram;
> > +
> > +	if (fault_addr < gpusvm->mm_start ||
> > +	    fault_addr > gpusvm->mm_start + gpusvm->mm_range) {
> > +		err = -EINVAL;
> > +		goto err_out;
> > +	}
> > +
> > +	if (!ctx->mmap_locked) {
> > +		if (!mmget_not_zero(mm)) {
> > +			err = -EFAULT;
> > +			goto err_out;
> > +		}
> > +		mmap_write_lock(mm);
> > +	}
> > +
> > +	mmap_assert_write_locked(mm);
> > +
> > +	notifier = drm_gpusvm_notifier_find(gpusvm, fault_addr);
> > +	if (!notifier) {
> > +		notifier = drm_gpusvm_notifier_alloc(gpusvm,
> > fault_addr);
> > +		if (IS_ERR(notifier)) {
> > +			err = PTR_ERR(notifier);
> > +			goto err_mmunlock;
> > +		}
> > +		notifier_alloc = true;
> > +		err = mmu_interval_notifier_insert_locked(&notifier-
> > >notifier,
> > +							  mm,
> > notifier->interval.start,
> > +							  notifier-
> > >interval.end -
> > +							  notifier-
> > >interval.start,
> > +							 
> > &drm_gpusvm_notifier_ops);
> > +		if (err)
> > +			goto err_notifier;
> > +	}
> > +
> > +	vas = vma_lookup(mm, fault_addr);
> > +	if (!vas) {
> > +		err = -ENOENT;
> > +		goto err_notifier_remove;
> > +	}
> > +
> > +	if (!ctx->read_only && !(vas->vm_flags & VM_WRITE)) {
> > +		err = -EPERM;
> > +		goto err_notifier_remove;
> > +	}
> > +
> > +	range = drm_gpusvm_range_find(notifier, fault_addr,
> > fault_addr + 1);
> > +	if (range)
> > +		goto out_mmunlock;
> > +	/*
> > +	 * XXX: Short-circuiting migration based on migrate_vma_*
> > current
> > +	 * limitations. If/when migrate_vma_* add more support, this
> > logic will
> > +	 * have to change.
> > +	 */
> > +	migrate_vram = ctx->vram_possible &&
> > +		vma_is_anonymous(vas) && !is_vm_hugetlb_page(vas);
> > +
> > +	chunk_size = drm_gpusvm_range_chunk_size(gpusvm, notifier,
> > vas,
> > +						 fault_addr,
> > gpuva_start,
> > +						 gpuva_end,
> > migrate_vram &&
> > +						 !ctx->prefault);
> > +	if (chunk_size == LONG_MAX) {
> > +		err = -EINVAL;
> > +		goto err_notifier_remove;
> > +	}
> > +
> > +	range = drm_gpusvm_range_alloc(gpusvm, notifier, fault_addr,
> > chunk_size,
> > +				       migrate_vram);
> > +	if (IS_ERR(range)) {
> > +		err = PTR_ERR(range);
> > +		goto err_notifier_remove;
> > +	}
> > +
> > +	drm_gpusvm_range_insert(notifier, range);
> > +	if (notifier_alloc)
> > +		drm_gpusvm_notifier_insert(gpusvm, notifier);
> > +
> > +	if (ctx->prefault) {
> > +		struct drm_gpusvm_ctx __ctx = *ctx;
> > +
> > +		__ctx.mmap_locked = true;
> > +		err = drm_gpusvm_range_get_pages(gpusvm, range,
> > &__ctx);
> > +		if (err)
> > +			goto err_range_remove;
> > +	}
> > +
> > +out_mmunlock:
> > +	if (!ctx->mmap_locked) {
> > +		mmap_write_unlock(mm);
> > +		mmput(mm);
> > +	}
> > +
> > +	return range;
> > +
> > +err_range_remove:
> > +	__drm_gpusvm_range_remove(notifier, range);
> > +err_notifier_remove:
> > +	if (notifier_alloc)
> > +		mmu_interval_notifier_remove(&notifier->notifier);
> > +err_notifier:
> > +	if (notifier_alloc)
> > +		drm_gpusvm_notifier_free(gpusvm, notifier);
> > +err_mmunlock:
> > +	if (!ctx->mmap_locked) {
> > +		mmap_write_unlock(mm);
> > +		mmput(mm);
> > +	}
> > +err_out:
> > +	return ERR_PTR(err);
> > +}
> > +
> > +/**
> > + * for_each_dma_page - iterate over pages in a DMA region
> > + * @i__: the current page index in the iteration
> > + * @j__: the current page index, log order, in the iteration
> > + * @npages__: the total number of pages in the DMA region
> > + * @order__: the order of the pages in the DMA region
> > + *
> > + * This macro iterates over each page in a DMA region. The DMA
> > region
> > + * is assumed to be composed of 2^@order__ pages, and the macro will
> > + * step through the region one block of 2^@order__ pages at a time.
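> > + *
> > + * For example, with @npages__ = 512 and @order__ = 9 (2M of 4K pages mapped
> > + * as a single huge page) the body executes once with i__ == 0 and j__ == 0,
> > + * whereas with @order__ = 0 it executes 512 times.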
> > + */
> > +#define for_each_dma_page(i__, j__, npages__, order__)	\
> > +	for ((i__) = 0, (j__) = 0; (i__) < (npages__);	\
> > +	     (j__)++, (i__) += 0x1 << (order__))
> > +
> > +/**
> > + * __drm_gpusvm_range_unmap_pages - Unmap pages associated with a
> > GPU SVM range (internal)
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * This function unmaps pages associated with a GPU SVM range. It assumes and
> > + * asserts that correct locking is in place when called.
> > + */
> > +static void __drm_gpusvm_range_unmap_pages(struct drm_gpusvm
> > *gpusvm,
> > +					   struct drm_gpusvm_range
> > *range)
> > +{
> > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > +
> > +	if (range->pages) {
> > +		unsigned long i, j, npages = npages_in_range(range-
> > >va.start,
> > +							     range-
> > >va.end);
> > +
> > +		if (range->flags.has_dma_mapping) {
> > +			for_each_dma_page(i, j, npages, range-
> > >order)
> > +				dma_unmap_page(gpusvm->drm->dev,
> > +					       range->dma_addr[j],
> > +					       PAGE_SIZE << range-
> > >order,
> > +					       DMA_BIDIRECTIONAL);
> > +		}
> > +
> > +		range->flags.has_vram_pages = false;
> > +		range->flags.has_dma_mapping = false;
> > +	}
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_free_pages - Free pages associated with a GPU
> > SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * This function frees pages associated with a GPU SVM range.
> > + */
> > +static void drm_gpusvm_range_free_pages(struct drm_gpusvm *gpusvm,
> > +					struct drm_gpusvm_range
> > *range)
> > +{
> > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > +
> > +	if (range->pages) {
> > +		if (range->flags.kfree_mapping) {
> > +			kfree(range->dma_addr);
> > +			range->flags.kfree_mapping = false;
> > +			range->pages = NULL;
> > +		} else {
> > +			kvfree(range->pages);
> > +			range->pages = NULL;
> > +		}
> > +	}
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_remove - Remove GPU SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range to be removed
> > + *
> > + * This function removes the specified GPU SVM range and also
> > removes the parent
> > + * GPU SVM notifier if no more ranges remain in the notifier. The
> > caller must
> > + * hold a lock to protect range and notifier removal.
> > + */
> > +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> > +			     struct drm_gpusvm_range *range)
> > +{
> > +	struct drm_gpusvm_notifier *notifier;
> > +
> > +	notifier = drm_gpusvm_notifier_find(gpusvm, range-
> > >va.start);
> > +	if (WARN_ON_ONCE(!notifier))
> > +		return;
> > +
> > +	drm_gpusvm_notifier_lock(gpusvm);
> > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > +	drm_gpusvm_range_free_pages(gpusvm, range);
> > +	__drm_gpusvm_range_remove(notifier, range);
> > +	drm_gpusvm_notifier_unlock(gpusvm);
> > +
> > +	drm_gpusvm_range_put(range);
> > +
> > +	if (RB_EMPTY_ROOT(&notifier->root.rb_root)) {
> > +		if (!notifier->flags.removed)
> > +			mmu_interval_notifier_remove(&notifier-
> > >notifier);
> > +		drm_gpusvm_notifier_remove(gpusvm, notifier);
> > +		drm_gpusvm_notifier_free(gpusvm, notifier);
> > +	}
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_get - Get a reference to GPU SVM range
> > + * @range: Pointer to the GPU SVM range
> > + *
> > + * This function increments the reference count of the specified GPU
> > SVM range.
> > + *
> > + * Returns:
> > + * Pointer to the GPU SVM range.
> > + */
> > +struct drm_gpusvm_range *
> > +drm_gpusvm_range_get(struct drm_gpusvm_range *range)
> > +{
> > +	kref_get(&range->refcount);
> > +
> > +	return range;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_destroy - Destroy GPU SVM range
> > + * @refcount: Pointer to the reference counter embedded in the GPU
> > SVM range
> > + *
> > + * This function destroys the specified GPU SVM range when its
> > reference count
> > + * reaches zero. If a custom range-free function is provided, it is
> > invoked to
> > + * free the range; otherwise, the range is deallocated using
> > kfree().
> > + */
> > +static void drm_gpusvm_range_destroy(struct kref *refcount)
> > +{
> > +	struct drm_gpusvm_range *range =
> > +		container_of(refcount, struct drm_gpusvm_range,
> > refcount);
> > +	struct drm_gpusvm *gpusvm = range->gpusvm;
> > +
> > +	if (gpusvm->ops->range_free)
> > +		gpusvm->ops->range_free(range);
> > +	else
> > +		kfree(range);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_put - Put a reference to GPU SVM range
> > + * @range: Pointer to the GPU SVM range
> > + *
> > + * This function decrements the reference count of the specified GPU
> > SVM range
> > + * and frees it when the count reaches zero.
> > + */
> > +void drm_gpusvm_range_put(struct drm_gpusvm_range *range)
> > +{
> > +	kref_put(&range->refcount, drm_gpusvm_range_destroy);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_pages_valid - GPU SVM range pages valid
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * This function determines if a GPU SVM range's pages are valid. It is
> > + * expected to be called while holding gpusvm->notifier_lock and as the last
> > + * step before committing a GPU binding.
> > + *
> > + * Returns:
> > + * True if GPU SVM range has valid pages, False otherwise
> > + */
> > +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> > +				  struct drm_gpusvm_range *range)
> > +{
> > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > +
> > +	return range->flags.has_vram_pages || range-
> > >flags.has_dma_mapping;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_pages_valid_unlocked - GPU SVM range pages valid
> > unlocked
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * This function determines if a GPU SVM range's pages are valid. It is
> > + * expected to be called without holding gpusvm->notifier_lock.
> > + *
> > + * Returns:
> > + * True if GPU SVM range has valid pages, False otherwise
> > + */
> > +static bool
> > +drm_gpusvm_range_pages_valid_unlocked(struct drm_gpusvm *gpusvm,
> > +				      struct drm_gpusvm_range
> > *range)
> > +{
> > +	bool pages_valid;
> > +
> > +	if (!range->pages)
> > +		return false;
> > +
> > +	drm_gpusvm_notifier_lock(gpusvm);
> > +	pages_valid = drm_gpusvm_range_pages_valid(gpusvm, range);
> > +	if (!pages_valid && range->flags.kfree_mapping) {
> > +		kfree(range->dma_addr);
> > +		range->flags.kfree_mapping = false;
> > +		range->pages = NULL;
> > +	}
> > +	drm_gpusvm_notifier_unlock(gpusvm);
> > +
> > +	return pages_valid;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_get_pages - Get pages for a GPU SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + * @ctx: GPU SVM context
> > + *
> > + * This function gets pages for a GPU SVM range and ensures they are
> > mapped for
> > + * DMA access.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       const struct drm_gpusvm_ctx *ctx)
> > +{
> > +	struct mmu_interval_notifier *notifier = &range->notifier-
> > >notifier;
> > +	struct hmm_range hmm_range = {
> > +		.default_flags = HMM_PFN_REQ_FAULT | (ctx->read_only
> > ? 0 :
> > +			HMM_PFN_REQ_WRITE),
> > +		.notifier = notifier,
> > +		.start = range->va.start,
> > +		.end = range->va.end,
> > +		.dev_private_owner = gpusvm-
> > >device_private_page_owner,
> > +	};
> > +	struct mm_struct *mm = gpusvm->mm;
> > +	unsigned long timeout =
> > +		jiffies +
> > msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> > +	unsigned long i, j;
> > +	unsigned long npages = npages_in_range(range->va.start,
> > range->va.end);
> > +	unsigned int order = 0;
> > +	unsigned long *pfns;
> > +	struct page **pages;
> > +	int err = 0;
> > +	bool vram_pages = !!range->flags.migrate_vram;
> > +	bool alloc_pfns = false, kfree_mapping;
> > +
> > +retry:
> > +	kfree_mapping = false;
> > +	hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> > +	if (drm_gpusvm_range_pages_valid_unlocked(gpusvm, range))
> > +		return 0;
> > +
> > +	if (range->notifier_seq == hmm_range.notifier_seq && range-
> > >pages) {
> > +		if (ctx->prefault)
> > +			return 0;
> > +
> > +		pfns = (unsigned long *)range->pages;
> > +		pages = range->pages;
> > +		goto map_pages;
> > +	}
> > +
> > +	if (!range->pages) {
> > +		pfns = kvmalloc_array(npages, sizeof(*pfns),
> > GFP_KERNEL);
> > +		if (!pfns)
> > +			return -ENOMEM;
> > +		alloc_pfns = true;
> > +	} else {
> > +		pfns = (unsigned long *)range->pages;
> > +	}
> > +
> > +	if (!ctx->mmap_locked) {
> > +		if (!mmget_not_zero(mm)) {
> > +			err = -EFAULT;
> > +			goto err_out;
> > +		}
> > +	}
> > +
> > +	hmm_range.hmm_pfns = pfns;
> > +	while (true) {
> > +		/* Must be checked after mmu_interval_read_begin */
> > +		if (range->flags.unmapped) {
> > +			err = -EFAULT;
> > +			break;
> > +		}
> > +
> > +		if (!ctx->mmap_locked) {
> > +			/*
> > +			 * XXX: HMM locking document indicates only a read-lock
> > +			 * is required but there appears to be a window between
> > +			 * the MMU_NOTIFY_MIGRATE event triggered in a CPU fault
> > +			 * via migrate_vma_setup and the pages actually moving
> > +			 * in migrate_vma_finalize in which this code can grab
> > +			 * garbage pages. Grabbing the write-lock if the range
> > +			 * is attached to vram appears to protect against this
> > +			 * race.
> > +			 */
> > +			if (vram_pages)
> > +				mmap_write_lock(mm);
> > +			else
> > +				mmap_read_lock(mm);
> > +		}
> > +		err = hmm_range_fault(&hmm_range);
> > +		if (!ctx->mmap_locked) {
> > +			if (vram_pages)
> > +				mmap_write_unlock(mm);
> > +			else
> > +				mmap_read_unlock(mm);
> > +		}
> > +
> > +		if (err == -EBUSY) {
> > +			if (time_after(jiffies, timeout))
> > +				break;
> > +
> > +			hmm_range.notifier_seq =
> > mmu_interval_read_begin(notifier);
> > +			continue;
> > +		}
> > +		break;
> > +	}
> > +	if (!ctx->mmap_locked)
> > +		mmput(mm);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	pages = (struct page **)pfns;
> > +
> > +	if (ctx->prefault) {
> > +		range->pages = pages;
> > +		goto set_seqno;
> > +	}
> > +
> > +map_pages:
> > +	if (is_device_private_page(hmm_pfn_to_page(pfns[0]))) {
> > +		WARN_ON_ONCE(!range->vram_allocation);
> > +
> > +		for (i = 0; i < npages; ++i) {
> > +			pages[i] = hmm_pfn_to_page(pfns[i]);
> > +
> > +			if
> > (WARN_ON_ONCE(!is_device_private_page(pages[i]))) {
> > +				err = -EOPNOTSUPP;
> > +				goto err_free;
> > +			}
> > +		}
> > +
> > +		/* Do not race with notifier unmapping pages */
> > +		drm_gpusvm_notifier_lock(gpusvm);
> > +		range->flags.has_vram_pages = true;
> > +		range->pages = pages;
> > +		if (mmu_interval_read_retry(notifier,
> > hmm_range.notifier_seq)) {
> > +			err = -EAGAIN;
> > +			__drm_gpusvm_range_unmap_pages(gpusvm,
> > range);
> > +		}
> > +		drm_gpusvm_notifier_unlock(gpusvm);
> > +	} else {
> > +		dma_addr_t *dma_addr = (dma_addr_t *)pfns;
> > +
> > +		for_each_dma_page(i, j, npages, order) {
> > +			if (WARN_ON_ONCE(i && order !=
> > +					
> > hmm_pfn_to_map_order(pfns[i]))) {
> > +				err = -EOPNOTSUPP;
> > +				npages = i;
> > +				goto err_unmap;
> > +			}
> > +			order = hmm_pfn_to_map_order(pfns[i]);
> > +
> > +			pages[j] = hmm_pfn_to_page(pfns[i]);
> > +			if
> > (WARN_ON_ONCE(is_zone_device_page(pages[j]))) {
> > +				err = -EOPNOTSUPP;
> > +				npages = i;
> > +				goto err_unmap;
> > +			}
> > +
> > +			set_page_dirty_lock(pages[j]);
> > +			mark_page_accessed(pages[j]);
> > +
> > +			dma_addr[j] = dma_map_page(gpusvm->drm->dev,
> > +						   pages[j], 0,
> > +						   PAGE_SIZE <<
> > order,
> > +						  
> > DMA_BIDIRECTIONAL);
> > +			if (dma_mapping_error(gpusvm->drm->dev,
> > dma_addr[j])) {
> > +				err = -EFAULT;
> > +				npages = i;
> > +				goto err_unmap;
> > +			}
> > +		}
> > +
> > +		/* Huge pages, reduce memory footprint */
> > +		if (order) {
> > +			dma_addr = kmalloc_array(j,
> > sizeof(*dma_addr),
> > +						 GFP_KERNEL);
> > +			if (dma_addr) {
> > +				for (i = 0; i < j; ++i)
> > +					dma_addr[i] =
> > (dma_addr_t)pfns[i];
> > +				kvfree(pfns);
> > +				kfree_mapping = true;
> > +			} else {
> > +				dma_addr = (dma_addr_t *)pfns;
> > +			}
> > +		}
> > +
> > +		/* Do not race with notifier unmapping pages */
> > +		drm_gpusvm_notifier_lock(gpusvm);
> > +		range->order = order;
> > +		range->flags.kfree_mapping = kfree_mapping;
> > +		range->flags.has_dma_mapping = true;
> > +		range->dma_addr = dma_addr;
> > +		range->vram_allocation = NULL;
> > +		if (mmu_interval_read_retry(notifier,
> > hmm_range.notifier_seq)) {
> > +			err = -EAGAIN;
> > +			__drm_gpusvm_range_unmap_pages(gpusvm,
> > range);
> > +		}
> > +		drm_gpusvm_notifier_unlock(gpusvm);
> > +	}
> > +
> > +	if (err == -EAGAIN)
> > +		goto retry;
> > +set_seqno:
> > +	range->notifier_seq = hmm_range.notifier_seq;
> > +
> > +	return 0;
> > +
> > +err_unmap:
> > +	for_each_dma_page(i, j, npages, order)
> > +		dma_unmap_page(gpusvm->drm->dev,
> > +			       (dma_addr_t)pfns[j],
> > +			       PAGE_SIZE << order,
> > DMA_BIDIRECTIONAL);
> > +err_free:
> > +	if (alloc_pfns)
> > +		kvfree(pfns);
> > +err_out:
> > +	return err;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_range_unmap_pages - Unmap pages associated with a GPU
> > SVM range
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + * @ctx: GPU SVM context
> > + *
> > + * This function unmaps pages associated with a GPU SVM range. If
> > @in_notifier
> > + * is set, it is assumed that gpusvm->notifier_lock is held in write
> > mode; if it
> > + * is clear, it acquires gpusvm->notifier_lock in read mode. Must be
> > called on
> > + * each GPU SVM range attached to notifier in gpusvm->ops-
> > >invalidate for IOMMU
> > + * security model.
> > + */
> > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > +				  struct drm_gpusvm_range *range,
> > +				  const struct drm_gpusvm_ctx *ctx)
> > +{
> > +	if (ctx->in_notifier)
> > +		lockdep_assert_held_write(&gpusvm->notifier_lock);
> > +	else
> > +		drm_gpusvm_notifier_lock(gpusvm);
> > +
> > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > +
> > +	if (!ctx->in_notifier)
> > +		drm_gpusvm_notifier_unlock(gpusvm);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migration_put_page - Put a migration page
> > + * @page: Pointer to the page to put
> > + *
> > + * This function unlocks and puts a page.
> > + */
> > +static void drm_gpusvm_migration_put_page(struct page *page)
> > +{
> > +	unlock_page(page);
> > +	put_page(page);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migration_put_pages - Put migration pages
> > + * @npages: Number of pages
> > + * @migrate_pfn: Array of migrate page frame numbers
> > + *
> > + * This function puts an array of pages.
> > + */
> > +static void drm_gpusvm_migration_put_pages(unsigned long npages,
> > +					   unsigned long
> > *migrate_pfn)
> > +{
> > +	unsigned long i;
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		if (!migrate_pfn[i])
> > +			continue;
> > +
> > +		drm_gpusvm_migration_put_page(migrate_pfn_to_page(mi
> > grate_pfn[i]));
> > +		migrate_pfn[i] = 0;
> > +	}
> > +}
> > +
> > +/**
> > + * drm_gpusvm_get_vram_page - Get a reference to a VRAM page
> > + * @page: Pointer to the page
> > + * @zdd: Pointer to the GPU SVM zone device data
> > + *
> > + * This function associates the given page with the specified GPU
> > SVM zone
> > + * device data and initializes it for zone device usage.
> > + */
> > +static void drm_gpusvm_get_vram_page(struct page *page,
> > +				     struct drm_gpusvm_zdd *zdd)
> > +{
> > +	page->zone_device_data = drm_gpusvm_zdd_get(zdd);
> > +	zone_device_page_init(page);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_map_pages() - Map migration pages for GPU SVM
> > migration
> > + * @dev: The device for which the pages are being mapped
> > + * @dma_addr: Array to store DMA addresses corresponding to mapped
> > pages
> > + * @migrate_pfn: Array of migrate page frame numbers to map
> > + * @npages: Number of pages to map
> > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > + *
> > + * This function maps pages of memory for migration usage in GPU
> > SVM. It
> > + * iterates over each page frame number provided in @migrate_pfn,
> > maps the
> > + * corresponding page, and stores the DMA address in the provided
> > @dma_addr
> > + * array.
> > + *
> > + * Return: 0 on success, -EFAULT if an error occurs during mapping.
> > + */
> > +static int drm_gpusvm_migrate_map_pages(struct device *dev,
> > +					dma_addr_t *dma_addr,
> > +					long unsigned int
> > *migrate_pfn,
> > +					unsigned long npages,
> > +					enum dma_data_direction dir)
> > +{
> > +	unsigned long i;
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		struct page *page =
> > migrate_pfn_to_page(migrate_pfn[i]);
> > +
> > +		if (!page)
> > +			continue;
> > +
> > +		if (WARN_ON_ONCE(is_zone_device_page(page)))
> > +			return -EFAULT;
> > +
> > +		dma_addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE,
> > dir);
> > +		if (dma_mapping_error(dev, dma_addr[i]))
> > +			return -EFAULT;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously mapped
> > for GPU SVM migration
> > + * @dev: The device for which the pages were mapped
> > + * @dma_addr: Array of DMA addresses corresponding to mapped pages
> > + * @npages: Number of pages to unmap
> > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > + *
> > + * This function unmaps previously mapped pages of memory for GPU
> > Shared Virtual
> > + * Memory (SVM). It iterates over each DMA address provided in
> > @dma_addr, checks
> > + * if it's valid and not already unmapped, and unmaps the
> > corresponding page.
> > + */
> > +static void drm_gpusvm_migrate_unmap_pages(struct device *dev,
> > +					   dma_addr_t *dma_addr,
> > +					   unsigned long npages,
> > +					   enum dma_data_direction
> > dir)
> > +{
> > +	unsigned long i;
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		if (!dma_addr[i] || dma_mapping_error(dev,
> > dma_addr[i]))
> > +			continue;
> > +
> > +		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
> > +	}
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_to_vram - Migrate GPU SVM range to VRAM
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + * @vram_allocation: Driver-private pointer to the VRAM allocation. The caller
> > + *                   should hold a reference to the VRAM allocation, which
> > + *                   should be dropped via ops->vram_release or upon the
> > + *                   failure of this function.
> > + * @ctx: GPU SVM context
> > + *
> > + * This function migrates the specified GPU SVM range to VRAM. It performs the
> > + * necessary setup and invokes the driver-specific operations for migration to
> > + * VRAM. Upon successful return, @vram_allocation can safely reference @range
> > + * until ops->vram_release is called, which only happens after a successful
> > + * return of this function.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       void *vram_allocation,
> > +			       const struct drm_gpusvm_ctx *ctx)
> > +{
> > +	u64 start = range->va.start, end = range->va.end;
> > +	struct migrate_vma migrate = {
> > +		.start		= start,
> > +		.end		= end,
> > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > +		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
> > +	};
> > +	struct mm_struct *mm = gpusvm->mm;
> > +	unsigned long i, npages = npages_in_range(start, end);
> > +	struct vm_area_struct *vas;
> > +	struct drm_gpusvm_zdd *zdd = NULL;
> > +	struct page **pages;
> > +	dma_addr_t *dma_addr;
> > +	void *buf;
> > +	int err;
> > +
> > +	if (!range->flags.migrate_vram)
> > +		return -EINVAL;
> > +
> > +	if (!gpusvm->ops->populate_vram_pfn || !gpusvm->ops-
> > >copy_to_vram ||
> > +	    !gpusvm->ops->copy_to_sram)
> > +		return -EOPNOTSUPP;
> > +
> > +	if (!ctx->mmap_locked) {
> > +		if (!mmget_not_zero(mm)) {
> > +			err = -EFAULT;
> > +			goto err_out;
> > +		}
> > +		mmap_write_lock(mm);
> > +	}
> > +
> > +	mmap_assert_locked(mm);
> > +
> > +	vas = vma_lookup(mm, start);
> > +	if (!vas) {
> > +		err = -ENOENT;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	if (end > vas->vm_end || start < vas->vm_start) {
> > +		err = -EINVAL;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	if (!vma_is_anonymous(vas)) {
> > +		err = -EBUSY;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) +
> > sizeof(*dma_addr) +
> > +		       sizeof(*pages), GFP_KERNEL);
> > +	if (!buf) {
> > +		err = -ENOMEM;
> > +		goto err_mmunlock;
> > +	}
> > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr))
> > * npages;
> > +
> > +	zdd = drm_gpusvm_zdd_alloc(range);
> > +	if (!zdd) {
> > +		err = -ENOMEM;
> > +		goto err_free;
> > +	}
> > +
> > +	migrate.vma = vas;
> > +	migrate.src = buf;
> > +	migrate.dst = migrate.src + npages;
> > +
> > +	err = migrate_vma_setup(&migrate);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	/*
> > +	 * FIXME: Below cases, !migrate.cpages and migrate.cpages !=
> > npages, not
> > +	 * always an error. Need to revisit possible cases and how
> > to handle. We
> > +	 * could prefault on migrate.cpages != npages via
> > hmm_range_fault.
> > +	 */
> > +
> > +	if (!migrate.cpages) {
> > +		err = -EFAULT;
> > +		goto err_free;
> > +	}
> > +
> > +	if (migrate.cpages != npages) {
> > +		err = -EBUSY;
> > +		goto err_finalize;
> > +	}
> > +
> > +	err = gpusvm->ops->populate_vram_pfn(gpusvm,
> > vram_allocation, npages,
> > +					     migrate.dst);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > dma_addr,
> > +					   migrate.src, npages,
> > DMA_TO_DEVICE);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	for (i = 0; i < npages; ++i) {
> > +		struct page *page = pfn_to_page(migrate.dst[i]);
> > +
> > +		pages[i] = page;
> > +		migrate.dst[i] = migrate_pfn(migrate.dst[i]);
> > +		drm_gpusvm_get_vram_page(page, zdd);
> > +	}
> > +
> > +	err = gpusvm->ops->copy_to_vram(gpusvm, pages, dma_addr,
> > npages);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	/* Upon success bind vram allocation to range and zdd */
> > +	range->vram_allocation = vram_allocation;
> > +	WRITE_ONCE(zdd->vram_allocation, vram_allocation);	/*
> > Owns ref */
> > +
> > +err_finalize:
> > +	if (err)
> > +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> > +	migrate_vma_pages(&migrate);
> > +	migrate_vma_finalize(&migrate);
> > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr,
> > npages,
> > +				       DMA_TO_DEVICE);
> > +err_free:
> > +	if (zdd)
> > +		drm_gpusvm_zdd_put(zdd);
> > +	kvfree(buf);
> > +err_mmunlock:
> > +	if (!ctx->mmap_locked) {
> > +		mmap_write_unlock(mm);
> > +		mmput(mm);
> > +	}
> > +err_out:
> > +	return err;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_populate_sram_pfn - Populate SRAM PFNs for a
> > VM area
> > + * @vas: Pointer to the VM area structure, can be NULL
> > + * @npages: Number of pages to populate
> > + * @src_mpfn: Source array of migrate PFNs
> > + * @mpfn: Array of migrate PFNs to populate
> > + * @addr: Start address for PFN allocation
> > + *
> > + * This function populates the SRAM migrate page frame numbers
> > (PFNs) for the
> > + * specified VM area structure. It allocates and locks pages in the
> > VM area for
> > + * SRAM usage. If vas is non-NULL, alloc_page_vma() is used for
> > + * allocation; otherwise, alloc_page() is used.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +static int drm_gpusvm_migrate_populate_sram_pfn(struct
> > vm_area_struct *vas,
> > +						unsigned long
> > npages,
> > +						unsigned long
> > *src_mpfn,
> > +						unsigned long *mpfn,
> > u64 addr)
> > +{
> > +	unsigned long i;
> > +
> > +	for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
> > +		struct page *page;
> > +
> > +		if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
> > +			continue;
> > +
> > +		if (vas)
> > +			page = alloc_page_vma(GFP_HIGHUSER, vas,
> > addr);
> > +		else
> > +			page = alloc_page(GFP_HIGHUSER);
> > +
> > +		if (!page)
> > +			return -ENOMEM;
> > +
> > +		lock_page(page);
> > +		mpfn[i] = migrate_pfn(page_to_pfn(page));
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_evict_to_sram - Evict GPU SVM range to SRAM
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + *
> > + * Similar to __drm_gpusvm_migrate_to_sram but does not require the mmap
> > + * lock; migration is done via the migrate_device_* functions. This is a
> > + * fallback path, as it is preferred to issue migrations with the mmap
> > + * lock held.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +static int drm_gpusvm_evict_to_sram(struct drm_gpusvm *gpusvm,
> > +				    struct drm_gpusvm_range *range)
> > +{
> > +	unsigned long npages;
> > +	struct page **pages;
> > +	unsigned long *src, *dst;
> > +	dma_addr_t *dma_addr;
> > +	void *buf;
> > +	int i, err = 0;
> > +
> > +	npages = npages_in_range(range->va.start, range->va.end);
> > +
> > +	buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr)
> > +
> > +		       sizeof(*pages), GFP_KERNEL);
> > +	if (!buf) {
> > +		err = -ENOMEM;
> > +		goto err_out;
> > +	}
> > +	src = buf;
> > +	dst = buf + (sizeof(*src) * npages);
> > +	dma_addr = buf + (2 * sizeof(*src) * npages);
> > +	pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) *
> > npages;
> > +
> > +	err = gpusvm->ops->populate_vram_pfn(gpusvm, range-
> > >vram_allocation,
> > +					     npages, src);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	err = migrate_device_vma_range(gpusvm->mm,
> > +				       gpusvm-
> > >device_private_page_owner, src,
> > +				       npages, range->va.start);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	err = drm_gpusvm_migrate_populate_sram_pfn(NULL, npages,
> > src, dst, 0);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > dma_addr,
> > +					   dst, npages,
> > DMA_BIDIRECTIONAL);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	for (i = 0; i < npages; ++i)
> > +		pages[i] = migrate_pfn_to_page(src[i]);
> > +
> > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr,
> > npages);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +err_finalize:
> > +	if (err)
> > +		drm_gpusvm_migration_put_pages(npages, dst);
> > +	migrate_device_pages(src, dst, npages);
> > +	migrate_device_finalize(src, dst, npages);
> > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr,
> > npages,
> > +				       DMA_BIDIRECTIONAL);
> > +err_free:
> > +	kvfree(buf);
> > +err_out:
> > +
> > +	return err;
> > +}
> > +
> > +/**
> > + * __drm_gpusvm_migrate_to_sram - Migrate GPU SVM range to SRAM
> > (internal)
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @vas: Pointer to the VM area structure
> > + * @page: Pointer to the page for fault handling (can be NULL)
> > + * @start: Start address of the migration range
> > + * @end: End address of the migration range
> > + *
> > + * This internal function performs the migration of the specified
> > GPU SVM range
> > + * to SRAM. It sets up the migration, populates + dma maps SRAM
> > PFNs, and
> > + * invokes the driver-specific operations for migration to SRAM.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +static int __drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > +					struct vm_area_struct *vas,
> > +					struct page *page,
> > +					u64 start, u64 end)
> > +{
> > +	struct migrate_vma migrate = {
> > +		.vma		= vas,
> > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > +		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
> > +		.fault_page	= page,
> > +	};
> > +	unsigned long npages;
> > +	struct page **pages;
> > +	dma_addr_t *dma_addr;
> > +	void *buf;
> > +	int i, err = 0;
> > +
> > +	mmap_assert_locked(gpusvm->mm);
> > +
> > +	/* Corner case where the VM area struct has been partially unmapped */
> > +	if (start < vas->vm_start)
> > +		start = vas->vm_start;
> > +	if (end > vas->vm_end)
> > +		end = vas->vm_end;
> > +
> > +	migrate.start = start;
> > +	migrate.end = end;
> > +	npages = npages_in_range(start, end);
> > +
> > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) +
> > sizeof(*dma_addr) +
> > +		       sizeof(*pages), GFP_KERNEL);
> > +	if (!buf) {
> > +		err = -ENOMEM;
> > +		goto err_out;
> > +	}
> > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > +	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr))
> > * npages;
> > +
> > +	migrate.vma = vas;
> > +	migrate.src = buf;
> > +	migrate.dst = migrate.src + npages;
> > +
> > +	err = migrate_vma_setup(&migrate);
> > +	if (err)
> > +		goto err_free;
> > +
> > +	/* Raced with another CPU fault, nothing to do */
> > +	if (!migrate.cpages)
> > +		goto err_free;
> > +
> > +	err = drm_gpusvm_migrate_populate_sram_pfn(vas, npages,
> > +						   migrate.src,
> > migrate.dst,
> > +						   start);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > dma_addr,
> > +					   migrate.dst, npages,
> > +					   DMA_BIDIRECTIONAL);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +	for (i = 0; i < npages; ++i)
> > +		pages[i] = migrate_pfn_to_page(migrate.src[i]);
> 
> See comments below about which pages we actually want to migrate.
> 
> 
> > +
> > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr,
> > npages);
> > +	if (err)
> > +		goto err_finalize;
> > +
> > +err_finalize:
> > +	if (err)
> > +		drm_gpusvm_migration_put_pages(npages, migrate.dst);
> > +	migrate_vma_pages(&migrate);
> > +	migrate_vma_finalize(&migrate);
> > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr,
> > npages,
> > +				       DMA_BIDIRECTIONAL);
> > +err_free:
> > +	kvfree(buf);
> > +err_out:
> > +	mmap_assert_locked(gpusvm->mm);
> > +
> > +	return err;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_to_sram - Migrate (evict) GPU SVM range to
> > SRAM
> > + * @gpusvm: Pointer to the GPU SVM structure
> > + * @range: Pointer to the GPU SVM range structure
> > + * @ctx: GPU SVM context
> > + *
> > + * This function initiates the migration of the specified GPU SVM
> > range to
> > + * SRAM. It performs necessary checks and invokes the internal
> > migration
> > + * function for actual migration.
> > + *
> > + * Returns:
> > + * 0 on success, negative error code on failure.
> > + */
> > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > +			       struct drm_gpusvm_range *range,
> > +			       const struct drm_gpusvm_ctx *ctx)
> > +{
> > +	u64 start = range->va.start, end = range->va.end;
> > +	struct mm_struct *mm = gpusvm->mm;
> > +	struct vm_area_struct *vas;
> > +	int err;
> > +	bool retry = false;
> > +
> > +	if (!ctx->mmap_locked) {
> > +		if (!mmget_not_zero(mm)) {
> > +			err = -EFAULT;
> > +			goto err_out;
> > +		}
> > +		if (ctx->trylock_mmap) {
> > +			if (!mmap_read_trylock(mm))  {
> > +				err =
> > drm_gpusvm_evict_to_sram(gpusvm, range);
> > +				goto err_mmput;
> > +			}
> > +		} else {
> > +			mmap_read_lock(mm);
> > +		}
> > +	}
> > +
> > +	mmap_assert_locked(mm);
> > +
> > +	/*
> > +	 * Loop required to find all VMA area structs for the corner
> > case when
> > +	 * VRAM backing has been partially unmapped from MM's
> > address space.
> > +	 */
> > +again:
> > +	vas = find_vma(mm, start);
> > +	if (!vas) {
> > +		if (!retry)
> > +			err = -ENOENT;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	if (end <= vas->vm_start || start >= vas->vm_end) {
> > +		if (!retry)
> > +			err = -EINVAL;
> > +		goto err_mmunlock;
> > +	}
> > +
> > +	err = __drm_gpusvm_migrate_to_sram(gpusvm, vas, NULL, start,
> > end);
> 
> This function is typically called from the vm side to get a clean mm as
> a last resort after get_pages() fail. As such should we have it evict
> *everything*, even foreign device memory, and mismatching local device
> pages. If so, we could use hmm_range_fault() with a NULL page owner +
> faulting to do that.
> 

I've actually tried that and it seemed to mostly work well; it would also
be my preference as it avoids a VMA lookup in GPU SVM.

I think it is a problem, though, if some of the pages are partially
unmapped, as hmm_range_fault will abort if the fault cannot be resolved.
Maybe I'm mistaken on this. I won't get this into rev2 but will put it on
my list to continue to play around with.
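
Roughly what I was playing with looked something like the below (rough,
untested sketch typed from memory, so the details are likely off; the
function name is made up and the mmget / -EBUSY retry handling is
hand-waved):

static int evict_range_any_owner(struct drm_gpusvm *gpusvm,
				 struct drm_gpusvm_range *range)
{
	unsigned long npages = npages_in_range(range->va.start, range->va.end);
	struct hmm_range hmm_range = {
		.notifier = &range->notifier->notifier,
		.start = range->va.start,
		.end = range->va.end,
		.default_flags = HMM_PFN_REQ_FAULT,
		/* NULL owner: device-private pages of any device fault back */
		.dev_private_owner = NULL,
	};
	unsigned long *pfns;
	int err;

	pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
	if (!pfns)
		return -ENOMEM;
	hmm_range.hmm_pfns = pfns;

	mmap_read_lock(gpusvm->mm);
	hmm_range.notifier_seq = mmu_interval_read_begin(hmm_range.notifier);
	err = hmm_range_fault(&hmm_range);	/* plus the usual -EBUSY loop */
	mmap_read_unlock(gpusvm->mm);

	kvfree(pfns);
	return err;
}

i.e. just fault the whole range back to SRAM with a NULL owner instead of
walking VMAs in GPU SVM, and let the core migrate_to_ram paths do the
heavy lifting.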

> > +	if (err)
> > +		goto err_mmunlock;
> > +
> > +	if (vas->vm_end < end) {
> > +		retry = true;
> > +		start = vas->vm_end;
> > +		goto again;
> > +	}
> > +
> > +	if (!ctx->mmap_locked) {
> > +		mmap_read_unlock(mm);
> > +		/*
> > +		 * Using mmput_async as this function can be called
> > while
> > +		 * holding a dma-resv lock, and a final put can grab
> > the mmap
> > +		 * lock, causing a lock inversion.
> > +		 */
> > +		mmput_async(mm);
> > +	}
> > +
> > +	return 0;
> > +
> > +err_mmunlock:
> > +	if (!ctx->mmap_locked)
> > +		mmap_read_unlock(mm);
> > +err_mmput:
> > +	if (!ctx->mmap_locked)
> > +		mmput_async(mm);
> > +err_out:
> > +	return err;
> > +}
> > +
> > +/**
> > + * drm_gpusvm_page_free - Put GPU SVM zone device data associated
> > with a page
> > + * @page: Pointer to the page
> > + *
> > + * This function is a callback used to put the GPU SVM zone device
> > data
> > + * associated with a page when it is being released.
> > + */
> > +static void drm_gpusvm_page_free(struct page *page)
> > +{
> > +	drm_gpusvm_zdd_put(page->zone_device_data);
> > +}
> > +
> > +/**
> > + * drm_gpusvm_migrate_to_ram - Migrate GPU SVM range to RAM (page
> > fault handler)
> > + * @vmf: Pointer to the fault information structure
> > + *
> > + * This function is a page fault handler used to migrate a GPU SVM
> > range to RAM.
> > + * It retrieves the GPU SVM range information from the faulting page
> > and invokes
> > + * the internal migration function to migrate the range back to RAM.
> > + *
> > + * Returns:
> > + * VM_FAULT_SIGBUS on failure, 0 on success.
> > + */
> > +static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault *vmf)
> > +{
> > +	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
> > +	int err;
> > +
> > +	err = __drm_gpusvm_migrate_to_sram(zdd->range->gpusvm,
> > +					   vmf->vma, vmf->page,
> > +					   zdd->range->va.start,
> > +					   zdd->range->va.end);
> 
> When called from here, since this is a pagemap op, we should ensure we
> only migrate our own pagemap to RAM?
> 

I think you resolve this with the following patch [1], right? I think I
agree.
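
FWIW, the sort of thing I have in mind is below (hand-wavy sketch against
the code in this patch using its existing locals, not the actual change in
[1]): after migrate_vma_setup() and before populating the SRAM PFNs in
__drm_gpusvm_migrate_to_sram, skip device pages whose pagemap does not
match the faulting page.

	for (i = 0; i < npages; ++i) {
		struct page *spage = migrate_pfn_to_page(migrate.src[i]);

		/*
		 * Hypothetical: on a CPU fault (page != NULL), leave device
		 * pages belonging to a different pagemap than the faulting
		 * page in place. With MIGRATE_PFN_MIGRATE cleared, no
		 * destination page is allocated for the entry and
		 * migrate_vma_finalize() restores the original mapping.
		 */
		if (page && spage && is_zone_device_page(spage) &&
		    spage->pgmap != page->pgmap)
			migrate.src[i] &= ~MIGRATE_PFN_MIGRATE;
	}

But again, [1] handles this more generally, so the above is really just to
illustrate the idea.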

Matt

[1] https://patchwork.freedesktop.org/series/139994/

> /Thanks,
> Thomas
>
Thomas Hellström Oct. 16, 2024, 6:27 a.m. UTC | #43
On Wed, 2024-10-16 at 03:18 +0000, Matthew Brost wrote:
> On Wed, Oct 09, 2024 at 12:50:42PM +0200, Thomas Hellström wrote:
> > Hi, Matthew.
> > 
> > Some comments below around migrating to SRAM.
> > 
> > 
> > On Tue, 2024-08-27 at 19:48 -0700, Matthew Brost wrote:
> > > This patch introduces support for GPU Shared Virtual Memory (SVM)
> > > in
> > > the
> > > Direct Rendering Manager (DRM) subsystem. SVM allows for seamless
> > > sharing of memory between the CPU and GPU, enhancing performance
> > > and
> > > flexibility in GPU computing tasks.
> > > 
> > > The patch adds the necessary infrastructure for SVM, including
> > > data
> > > structures and functions for managing SVM ranges and notifiers.
> > > It
> > > also
> > > provides mechanisms for allocating, deallocating, and migrating
> > > memory
> > > regions between system RAM and GPU VRAM.
> > > 
> > > This mid-layer is largely inspired by GPUVM.
> > > 
> > > Cc: Dave Airlie <airlied@redhat.com>
> > > Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > > Cc: Christian König <christian.koenig@amd.com>
> > > Cc: <dri-devel@lists.freedesktop.org>
> > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > ---
> > >  drivers/gpu/drm/xe/Makefile     |    3 +-
> > >  drivers/gpu/drm/xe/drm_gpusvm.c | 2174
> > > +++++++++++++++++++++++++++++++
> > >  drivers/gpu/drm/xe/drm_gpusvm.h |  415 ++++++
> > >  3 files changed, 2591 insertions(+), 1 deletion(-)
> > >  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.c
> > >  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.h
> > > 
> > > diff --git a/drivers/gpu/drm/xe/Makefile
> > > b/drivers/gpu/drm/xe/Makefile
> > > index b9670ae09a9e..b8fc2ee58f1a 100644
> > > --- a/drivers/gpu/drm/xe/Makefile
> > > +++ b/drivers/gpu/drm/xe/Makefile
> > > @@ -25,7 +25,8 @@ $(obj)/generated/%_wa_oob.c
> > > $(obj)/generated/%_wa_oob.h: $(obj)/xe_gen_wa_oob \
> > >  
> > >  # core driver code
> > >  
> > > -xe-y += xe_bb.o \
> > > +xe-y += drm_gpusvm.o \
> > > +	xe_bb.o \
> > >  	xe_bo.o \
> > >  	xe_bo_evict.o \
> > >  	xe_devcoredump.o \
> > > diff --git a/drivers/gpu/drm/xe/drm_gpusvm.c
> > > b/drivers/gpu/drm/xe/drm_gpusvm.c
> > > new file mode 100644
> > > index 000000000000..fc1e44e6ae72
> > > --- /dev/null
> > > +++ b/drivers/gpu/drm/xe/drm_gpusvm.c
> > > @@ -0,0 +1,2174 @@
> > > +// SPDX-License-Identifier: MIT
> > > +/*
> > > + * Copyright © 2024 Intel Corporation
> > > + *
> > > + * Authors:
> > > + *     Matthew Brost <matthew.brost@intel.com>
> > > + */
> > > +
> > > +#include <linux/dma-mapping.h>
> > > +#include <linux/interval_tree_generic.h>
> > > +#include <linux/hmm.h>
> > > +#include <linux/memremap.h>
> > > +#include <linux/migrate.h>
> > > +#include <linux/mm_types.h>
> > > +#include <linux/pagemap.h>
> > > +#include <linux/slab.h>
> > > +
> > > +#include <drm/drm_device.h>
> > > +#include "drm_gpusvm.h"
> > > +
> > > +/**
> > > + * DOC: Overview
> > > + *
> > > + * GPU Shared Virtual Memory (GPU SVM) layer for the Direct
> > > Rendering Manager (DRM)
> > > + *
> > > + * The GPU SVM layer is a component of the DRM framework
> > > designed to
> > > manage shared
> > > + * virtual memory between the CPU and GPU. It enables efficient
> > > data
> > > exchange and
> > > + * processing for GPU-accelerated applications by allowing
> > > memory
> > > sharing and
> > > + * synchronization between the CPU's and GPU's virtual address
> > > spaces.
> > > + *
> > > + * Key GPU SVM Components:
> > > + * - Notifiers: Used for tracking memory intervals and notifying the
> > > + *		GPU of changes. Notifiers are sized based on a GPU SVM
> > > + *		initialization parameter, with a recommendation of 512M
> > > + *		or larger. They maintain a Red-Black tree and a list of
> > > + *		ranges that fall within the notifier interval. Notifiers
> > > + *		are tracked within a GPU SVM Red-Black tree and list and
> > > + *		are dynamically inserted or removed as ranges within the
> > > + *		interval are created or destroyed.
> > > + * - Ranges: Represent memory ranges mapped in a DRM device and
> > > managed
> > > + *	     by GPU SVM. They are sized based on an array of
> > > chunk
> > > sizes, which
> > > + *	     is a GPU SVM initialization parameter, and the CPU
> > > address space.
> > > + *	     Upon GPU fault, the largest aligned chunk that fits
> > > within the
> > > + *	     faulting CPU address space is chosen for the range
> > > size. Ranges are
> > > + *	     expected to be dynamically allocated on GPU fault
> > > and
> > > removed on an
> > > + *	     MMU notifier UNMAP event. As mentioned above,
> > > ranges
> > > are tracked in
> > > + *	     a notifier's Red-Black tree.
> > > + * - Operations: Define the interface for driver-specific SVM
> > > operations such as
> > > + *		 allocation, page collection, migration,
> > > invalidations, and VRAM
> > > + *		 release.
> > > + *
> > > + * This layer provides interfaces for allocating, mapping,
> > > migrating, and
> > > + * releasing memory ranges between the CPU and GPU. It handles
> > > all
> > > core memory
> > > + * management interactions (DMA mapping, HMM, and migration) and
> > > provides
> > > + * driver-specific virtual functions (vfuncs). This
> > > infrastructure
> > > is sufficient
> > > + * to build the expected driver components for an SVM
> > > implementation
> > > as detailed
> > > + * below.
> > > + *
> > > + * Expected Driver Components:
> > > + * - GPU page fault handler: Used to create ranges and notifiers
> > > based on the
> > > + *			     fault address, optionally migrate
> > > the
> > > range to
> > > + *			     VRAM, and create GPU bindings.
> > > + * - Garbage collector: Used to destroy GPU bindings for ranges.
> > > Ranges are
> > > + *			expected to be added to the garbage
> > > collector upon
> > > + *			MMU_NOTIFY_UNMAP event.
> > > + */
> > > +
> > > +/**
> > > + * DOC: Locking
> > > + *
> > > + * GPU SVM handles locking for core MM interactions, i.e., it
> > > locks/unlocks the
> > > + * mmap lock as needed. Alternatively, if the driver prefers to
> > > handle the mmap
> > > + * lock itself, a 'locked' argument is provided to the functions
> > > that require
> > > + * the mmap lock. This option may be useful for drivers that
> > > need to
> > > call into
> > > + * GPU SVM while also holding a dma-resv lock, thus preventing
> > > locking
> > > + * inversions between the mmap and dma-resv locks.
> > > + *
> > > + * GPU SVM introduces a global notifier lock, which safeguards
> > > the
> > > notifier's
> > > + * range RB tree and list, as well as the range's DMA mappings
> > > and
> > > sequence
> > > + * number. GPU SVM manages all necessary locking and unlocking
> > > operations,
> > > + * except for the recheck of the range's sequence number
> > > + * (mmu_interval_read_retry) when the driver is committing GPU
> > > bindings. This
> > > + * lock corresponds to the 'driver->update' lock mentioned in
> > > the
> > > HMM
> > > + * documentation (TODO: Link). Future revisions may transition
> > > from
> > > a GPU SVM
> > > + * global lock to a per-notifier lock if finer-grained locking
> > > is
> > > deemed
> > > + * necessary.
> > > + *
> > > + * In addition to the locking mentioned above, the driver should
> > > implement a
> > > + * lock to safeguard core GPU SVM function calls that modify
> > > state,
> > > such as
> > > + * drm_gpusvm_range_find_or_insert and drm_gpusvm_range_remove.
> > > Alternatively,
> > > + * these core functions can be called within a single kernel
> > > thread,
> > > for
> > > + * instance, using an ordered work queue. This lock is denoted
> > > as
> > > + * 'driver_svm_lock' in code examples.
> > > + */
> > > +
> > > +/**
> > > + * DOC: Migration
> > > + *
> > > + * The migration support is quite simple, allowing migration
> > > between
> > > SRAM and
> > > + * VRAM at the range granularity. For example, GPU SVM currently
> > > does not
> > > + * support mixing SRAM and VRAM pages within a range. This means
> > > that upon GPU
> > > + * fault, the entire range can be migrated to VRAM, and upon CPU
> > > fault, the
> > > + * entire range is migrated to SRAM.
> > > + *
> > > + * The reasoning for only supporting range granularity is as
> > > follows: it
> > > + * simplifies the implementation, and range sizes are driver-
> > > defined
> > > and should
> > > + * be relatively small.
> > > + */
> > > +
> > > +/**
> > > + * DOC: Partial Unmapping of Ranges
> > > + *
> > > + * Partial unmapping of ranges (e.g., 1M out of 2M is unmapped
> > > by
> > > CPU resulting
> > > + * in MMU_NOTIFY_UNMAP event) presents several challenges, with
> > > the
> > > main one
> > > + * being that a subset of the range still has CPU and GPU
> > > mappings.
> > > If the
> > > + * backing store for the range is in VRAM, a subset of the
> > > backing
> > > store has
> > > + * references. One option would be to split the range and VRAM
> > > backing store,
> > > + * but the implementation for this would be quite complicated.
> > > Given
> > > that
> > > + * partial unmappings are rare and driver-defined range sizes
> > > are
> > > relatively
> > > + * small, GPU SVM does not support splitting of ranges.
> > > + *
> > > + * With no support for range splitting, upon partial unmapping
> > > of a
> > > range, the
> > > + * driver is expected to invalidate and destroy the entire
> > > range. If
> > > the range
> > > + * has VRAM as its backing, the driver is also expected to
> > > migrate
> > > any remaining
> > > + * pages back to SRAM.
> > > + */
> > > +
> > > +/**
> > > + * DOC: Examples
> > > + *
> > > + * This section provides two examples of how to build the
> > > expected
> > > driver
> > > + * components: the GPU page fault handler and the garbage
> > > collector.
> > > A third
> > > + * example demonstrates a sample invalidation driver vfunc.
> > > + *
> > > + * The generic code provided does not include logic for complex
> > > migration
> > > + * policies, optimized invalidations, or other potentially
> > > required
> > > driver
> > > + * locking (e.g., DMA-resv locks).
> > > + *
> > > + * 1) GPU page fault handler
> > > + *
> > > + *	int driver_bind_range(struct drm_gpusvm *gpusvm, struct
> > > drm_gpusvm_range *range)
> > > + *	{
> > > + *		int err = 0;
> > > + *
> > > + *		driver_alloc_and_setup_memory_for_bind(gpusvm,
> > > range);
> > > + *
> > > + *		drm_gpusvm_notifier_lock(gpusvm);
> > > + *		if (drm_gpusvm_range_pages_valid(range))
> > > + *			driver_commit_bind(gpusvm, range);
> > > + *		else
> > > + *			err = -EAGAIN;
> > > + *		drm_gpusvm_notifier_unlock(gpusvm);
> > > + *
> > > + *		return err;
> > > + *	}
> > > + *
> > > + *	int driver_gpu_fault(struct drm_gpusvm *gpusvm, u64
> > > fault_addr,
> > > + *			     u64 gpuva_start, u64 gpuva_end)
> > > + *	{
> > > + *		struct drm_gpusvm_ctx ctx = {};
> > > + *		int err;
> > > + *
> > > + *		driver_svm_lock();
> > > + *	retry:
> > > + *		// Always process UNMAPs first so view of GPU
> > > SVM
> > > ranges is current
> > > + *		driver_garbage_collector(gpusvm);
> > > + *
> > > + *		range = drm_gpusvm_range_find_or_insert(gpusvm, fault_addr,
> > > + *							gpuva_start, gpuva_end,
> > > + *							&ctx);
> > > + *		if (IS_ERR(range)) {
> > > + *			err = PTR_ERR(range);
> > > + *			goto unlock;
> > > + *		}
> > > + *
> > > + *		if (driver_migration_policy(range)) {
> > > + *			bo = driver_alloc_bo();
> > > + *			err = drm_gpusvm_migrate_to_vram(gpusvm,
> > > range, bo, &ctx);
> > > + *			if (err)	// CPU mappings may have
> > > changed
> > > + *				goto retry;
> > > + *		}
> > > + *
> > > + *		err = drm_gpusvm_range_get_pages(gpusvm, range,
> > > &ctx);
> > > + *		if (err == -EFAULT || err == -EPERM)	// CPU
> > > mappings changed
> > > + *			goto retry;
> > > + *		else if (err)
> > > + *			goto unlock;
> > > + *
> > > + *		err = driver_bind_range(gpusvm, range);
> > > + *		if (err == -EAGAIN)	// CPU mappings changed
> > > + *			goto retry
> > > + *
> > > + *	unlock:
> > > + *		driver_svm_unlock();
> > > + *		return err;
> > > + *	}
> > > + *
> > > + * 2) Garbage Collector.
> > > + *
> > > + *	void __driver_garbage_collector(struct drm_gpusvm
> > > *gpusvm,
> > > + *					struct drm_gpusvm_range
> > > *range)
> > > + *	{
> > > + *		struct drm_gpusvm_ctx ctx = {};
> > > + *
> > > + *		assert_driver_svm_locked(gpusvm);
> > > + *
> > > + *		// Partial unmap, migrate any remaining VRAM
> > > pages
> > > back to SRAM
> > > + *		if (range->flags.partial_unmap)
> > > + *			drm_gpusvm_migrate_to_sram(gpusvm,
> > > range,
> > > &ctx);
> > > + *
> > > + *		driver_unbind_range(range);
> > > + *		drm_gpusvm_range_remove(gpusvm, range);
> > > + *	}
> > > + *
> > > + *	void driver_garbage_collector(struct drm_gpusvm *gpusvm)
> > > + *	{
> > > + *		assert_driver_svm_locked(gpusvm);
> > > + *
> > > + *		for_each_range_in_garbage_collector(gpusvm,
> > > range)
> > > + *			__driver_garbage_collector(gpusvm,
> > > range);
> > > + *	}
> > > + *
> > > + * 3) Invalidation driver vfunc.
> > > + *
> > > + *	void driver_invalidation(struct drm_gpusvm *gpusvm,
> > > + *				 struct drm_gpusvm_notifier
> > > *notifier,
> > > + *				 const struct mmu_notifier_range
> > > *mmu_range)
> > > + *	{
> > > + *		struct drm_gpusvm_ctx ctx = { .in_notifier =
> > > true,
> > > };
> > > + *		struct drm_gpusvm_range *range = NULL;
> > > + *
> > > + *		driver_invalidate_device_tlb(gpusvm, mmu_range->start,
> > > + *					     mmu_range->end);
> > > + *
> > > + *		drm_gpusvm_for_each_range(range, notifier,
> > > mmu_range->start,
> > > + *					  mmu_range->end) {
> > > + *			drm_gpusvm_range_unmap_pages(gpusvm,
> > > range,
> > > &ctx);
> > > + *
> > > + *			if (mmu_range->event !=
> > > MMU_NOTIFY_UNMAP)
> > > + *				continue;
> > > + *
> > > + *			drm_gpusvm_range_set_unmapped(range,
> > > mmu_range);
> > > + *			driver_garbage_collector_add(gpusvm,
> > > range);
> > > + *		}
> > > + *	}
> > > + */
> > > +
> > > +#define DRM_GPUSVM_RANGE_START(_range)	((_range)->va.start)
> > > +#define DRM_GPUSVM_RANGE_END(_range)	((_range)->va.end - 1)
> > > +INTERVAL_TREE_DEFINE(struct drm_gpusvm_range, rb.node, u64,
> > > rb.__subtree_last,
> > > +		     DRM_GPUSVM_RANGE_START,
> > > DRM_GPUSVM_RANGE_END,
> > > +		     static __maybe_unused, range);
> > > +
> > > +#define DRM_GPUSVM_NOTIFIER_START(_notifier)	((_notifier)-
> > > > interval.start)
> > > +#define DRM_GPUSVM_NOTIFIER_END(_notifier)	((_notifier)-
> > > > interval.end - 1)
> > > +INTERVAL_TREE_DEFINE(struct drm_gpusvm_notifier, rb.node, u64,
> > > +		     rb.__subtree_last,
> > > DRM_GPUSVM_NOTIFIER_START,
> > > +		     DRM_GPUSVM_NOTIFIER_END, static
> > > __maybe_unused,
> > > notifier);
> > > +
> > > +/**
> > > + * npages_in_range() - Calculate the number of pages in a given
> > > range
> > > + * @start__: The start address of the range
> > > + * @end__: The end address of the range
> > > + *
> > > + * This macro calculates the number of pages in a given memory
> > > range,
> > > + * specified by the start and end addresses. It divides the
> > > difference
> > > + * between the end and start addresses by the page size
> > > (PAGE_SIZE)
> > > to
> > > + * determine the number of pages in the range.
> > > + *
> > > + * Return: The number of pages in the specified range.
> > > + */
> > > +#define npages_in_range(start__, end__)	\
> > > +	(((end__) - (start__)) >> PAGE_SHIFT)
> > > +
> > > +/**
> > > + * struct drm_gpusvm_zdd - GPU SVM zone device data
> > > + *
> > > + * @refcount: Reference count for the zdd
> > > + * @destroy_work: Work structure for asynchronous zdd
> > > destruction
> > > + * @range: Pointer to the GPU SVM range
> > > + * @vram_allocation: Driver-private pointer to the VRAM
> > > allocation
> > > + *
> > > + * This structure serves as a generic wrapper installed in
> > > + * page->zone_device_data. It provides infrastructure for
> > > looking up
> > > a range
> > > + * upon CPU page fault and asynchronously releasing VRAM once
> > > the
> > > CPU has no
> > > + * page references. Asynchronous release is useful because CPU
> > > page
> > > references
> > > + * can be dropped in IRQ contexts, while releasing VRAM likely
> > > requires sleeping
> > > + * locks.
> > > + */
> > > +struct drm_gpusvm_zdd {
> > > +	struct kref refcount;
> > > +	struct work_struct destroy_work;
> > > +	struct drm_gpusvm_range *range;
> > > +	void *vram_allocation;
> > > +};
> > > +
> > > +/**
> > > + * drm_gpusvm_zdd_destroy_work_func - Work function for
> > > destroying a
> > > zdd
> > > + * @w: Pointer to the work_struct
> > > + *
> > > + * This function releases VRAM, puts GPU SVM range, and frees
> > > zdd.
> > > + */
> > > +static void drm_gpusvm_zdd_destroy_work_func(struct work_struct
> > > *w)
> > > +{
> > > +	struct drm_gpusvm_zdd *zdd =
> > > +		container_of(w, struct drm_gpusvm_zdd,
> > > destroy_work);
> > > +	struct drm_gpusvm_range *range = zdd->range;
> > > +	struct drm_gpusvm *gpusvm = range->gpusvm;
> > > +
> > > +	if (gpusvm->ops->vram_release && zdd->vram_allocation)
> > > +		gpusvm->ops->vram_release(zdd->vram_allocation);
> > > +	drm_gpusvm_range_put(range);
> > > +	kfree(zdd);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_zdd_alloc - Allocate a zdd structure.
> > > + * @range: Pointer to the GPU SVM range.
> > > + *
> > > + * This function allocates and initializes a new zdd structure.
> > > It
> > > sets up the
> > > + * reference count, initializes the destroy work, and links the
> > > provided GPU SVM
> > > + * range.
> > > + *
> > > + * Returns:
> > > + * Pointer to the allocated zdd on success, ERR_PTR() on
> > > failure.
> > > + */
> > > +static struct drm_gpusvm_zdd *
> > > +drm_gpusvm_zdd_alloc(struct drm_gpusvm_range *range)
> > > +{
> > > +	struct drm_gpusvm_zdd *zdd;
> > > +
> > > +	zdd = kmalloc(sizeof(*zdd), GFP_KERNEL);
> > > +	if (!zdd)
> > > +		return NULL;
> > > +
> > > +	kref_init(&zdd->refcount);
> > > +	INIT_WORK(&zdd->destroy_work,
> > > drm_gpusvm_zdd_destroy_work_func);
> > > +	zdd->range = drm_gpusvm_range_get(range);
> > > +	zdd->vram_allocation = NULL;
> > > +
> > > +	return zdd;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_zdd_get - Get a reference to a zdd structure.
> > > + * @zdd: Pointer to the zdd structure.
> > > + *
> > > + * This function increments the reference count of the provided
> > > zdd
> > > structure.
> > > + *
> > > + * Returns: Pointer to the zdd structure.
> > > + */
> > > +static struct drm_gpusvm_zdd *drm_gpusvm_zdd_get(struct
> > > drm_gpusvm_zdd *zdd)
> > > +{
> > > +	kref_get(&zdd->refcount);
> > > +	return zdd;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_zdd_destroy - Destroy a zdd structure.
> > > + * @ref: Pointer to the reference count structure.
> > > + *
> > > + * This function queues the destroy_work of the zdd for
> > > asynchronous
> > > destruction.
> > > + */
> > > +static void drm_gpusvm_zdd_destroy(struct kref *ref)
> > > +{
> > > +	struct drm_gpusvm_zdd *zdd =
> > > +		container_of(ref, struct drm_gpusvm_zdd,
> > > refcount);
> > > +	struct drm_gpusvm *gpusvm = zdd->range->gpusvm;
> > > +
> > > +	queue_work(gpusvm->zdd_wq, &zdd->destroy_work);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_zdd_put - Put a zdd reference.
> > > + * @zdd: Pointer to the zdd structure.
> > > + *
> > > + * This function decrements the reference count of the provided
> > > zdd
> > > structure
> > > + * and schedules its destruction if the count drops to zero.
> > > + */
> > > +static void drm_gpusvm_zdd_put(struct drm_gpusvm_zdd *zdd)
> > > +{
> > > +	kref_put(&zdd->refcount, drm_gpusvm_zdd_destroy);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_find - Find GPU SVM range from GPU SVM
> > > notifier
> > > + * @notifier: Pointer to the GPU SVM notifier structure.
> > > + * @start: Start address of the range
> > > + * @end: End address of the range
> > > + *
> > > + * Return: A pointer to the drm_gpusvm_range if found or NULL
> > > + */
> > > +struct drm_gpusvm_range *
> > > +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64
> > > start, u64 end)
> > > +{
> > > +	return range_iter_first(&notifier->root, start, end -
> > > 1);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_for_each_range_safe - Safely iterate over GPU SVM
> > > ranges in a notifier
> > > + * @range__: Iterator variable for the ranges
> > > + * @next__: Iterator variable for the ranges temporary storage
> > > + * @notifier__: Pointer to the GPU SVM notifier
> > > + * @start__: Start address of the range
> > > + * @end__: End address of the range
> > > + *
> > > + * This macro is used to iterate over GPU SVM ranges in a
> > > notifier
> > > while
> > > + * removing ranges from it.
> > > + */
> > > +#define drm_gpusvm_for_each_range_safe(range__, next__,
> > > notifier__,
> > > start__, end__)	\
> > > +	for ((range__) = drm_gpusvm_range_find((notifier__),
> > > (start__), (end__)),	\
> > > +	     (next__) =
> > > __drm_gpusvm_range_next(range__);				\
> > > +	     (range__) && (range__->va.start <
> > > (end__));				\
> > > +	     (range__) = (next__), (next__) =
> > > __drm_gpusvm_range_next(range__))
> > > +
> > > +/**
> > > + * __drm_gpusvm_notifier_next - get the next drm_gpusvm_notifier
> > > in
> > > the list
> > > + * @notifier: a pointer to the current drm_gpusvm_notifier
> > > + *
> > > + * Return: A pointer to the next drm_gpusvm_notifier if
> > > available,
> > > or NULL if
> > > + *         the current notifier is the last one or if the input
> > > notifier is
> > > + *         NULL.
> > > + */
> > > +static struct drm_gpusvm_notifier *
> > > +__drm_gpusvm_notifier_next(struct drm_gpusvm_notifier *notifier)
> > > +{
> > > +	if (notifier && !list_is_last(&notifier->rb.entry,
> > > +				      &notifier->gpusvm-
> > > > notifier_list))
> > > +		return list_next_entry(notifier, rb.entry);
> > > +
> > > +	return NULL;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_for_each_notifier - Iterate over GPU SVM notifiers
> > > in
> > > a gpusvm
> > > + * @notifier__: Iterator variable for the notifiers
> > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > + * @start__: Start address of the notifier
> > > + * @end__: End address of the notifier
> > > + *
> > > + * This macro is used to iterate over GPU SVM notifiers in a
> > > gpusvm.
> > > + */
> > > +#define drm_gpusvm_for_each_notifier(notifier__, gpusvm__,
> > > start__,
> > > end__)		\
> > > +	for ((notifier__) = notifier_iter_first(&(gpusvm__)-
> > > >root,
> > > (start__), (end__) - 1);	\
> > > +	     (notifier__) && (notifier__->interval.start <
> > > (end__));			\
> > > +	     (notifier__) =
> > > __drm_gpusvm_notifier_next(notifier__))
> > > +
> > > +/**
> > > + * drm_gpusvm_for_each_notifier_safe - Safely iterate over GPU
> > > SVM
> > > notifiers in a gpusvm
> > > + * @notifier__: Iterator variable for the notifiers
> > > + * @next__: Iterator variable for the notifiers temporary storage
> > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > + * @start__: Start address of the notifier
> > > + * @end__: End address of the notifier
> > > + *
> > > + * This macro is used to iterate over GPU SVM notifiers in a
> > > gpusvm
> > > while
> > > + * removing notifiers from it.
> > > + */
> > > +#define drm_gpusvm_for_each_notifier_safe(notifier__, next__,
> > > gpusvm__, start__, end__)	\
> > > +	for ((notifier__) = notifier_iter_first(&(gpusvm__)-
> > > >root,
> > > (start__), (end__) - 1),	\
> > > +	     (next__) =
> > > __drm_gpusvm_notifier_next(notifier__);				\
> > > +	     (notifier__) && (notifier__->interval.start <
> > > (end__));			\
> > > +	     (notifier__) = (next__), (next__) =
> > > __drm_gpusvm_notifier_next(notifier__))
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_invalidate - Invalidate a GPU SVM
> > > notifier.
> > > + * @mni: Pointer to the mmu_interval_notifier structure.
> > > + * @mmu_range: Pointer to the mmu_notifier_range structure.
> > > + * @cur_seq: Current sequence number.
> > > + *
> > > + * This function serves as a generic MMU notifier for GPU SVM.
> > > It
> > > sets the MMU
> > > + * notifier sequence number and calls the driver invalidate
> > > vfunc
> > > under
> > > + * gpusvm->notifier_lock.
> > > + *
> > > + * Returns:
> > > + * true if the operation succeeds, false otherwise.
> > > + */
> > > +static bool
> > > +drm_gpusvm_notifier_invalidate(struct mmu_interval_notifier
> > > *mni,
> > > +			       const struct mmu_notifier_range
> > > *mmu_range,
> > > +			       unsigned long cur_seq)
> > > +{
> > > +	struct drm_gpusvm_notifier *notifier =
> > > +		container_of(mni, typeof(*notifier), notifier);
> > > +	struct drm_gpusvm *gpusvm = notifier->gpusvm;
> > > +
> > > +	if (!mmu_notifier_range_blockable(mmu_range))
> > > +		return false;
> > > +
> > > +	down_write(&gpusvm->notifier_lock);
> > > +	mmu_interval_set_seq(mni, cur_seq);
> > > +	gpusvm->ops->invalidate(gpusvm, notifier, mmu_range);
> > > +	up_write(&gpusvm->notifier_lock);
> > > +
> > > +	return true;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_ops - MMU interval notifier operations
> > > for
> > > GPU SVM
> > > + */
> > > +static const struct mmu_interval_notifier_ops
> > > drm_gpusvm_notifier_ops = {
> > > +	.invalidate = drm_gpusvm_notifier_invalidate,
> > > +};
> > > +
> > > +/**
> > > + * drm_gpusvm_init - Initialize the GPU SVM.
> > > + * @gpusvm: Pointer to the GPU SVM structure.
> > > + * @name: Name of the GPU SVM.
> > > + * @drm: Pointer to the DRM device structure.
> > > + * @mm: Pointer to the mm_struct for the address space.
> > > + * @device_private_page_owner: Device private pages owner.
> > > + * @mm_start: Start address of GPU SVM.
> > > + * @mm_range: Range of the GPU SVM.
> > > + * @notifier_size: Size of individual notifiers.
> > > + * @ops: Pointer to the operations structure for GPU SVM.
> > > + * @chunk_sizes: Pointer to the array of chunk sizes used in
> > > range
> > > allocation.
> > > + *               Entries should be powers of 2 in descending
> > > order
> > > with last
> > > + *               entry being SZ_4K.
> > > + * @num_chunks: Number of chunks.
> > > + *
> > > + * This function initializes the GPU SVM.
> > > + *
> > > + * Returns:
> > > + * 0 on success, a negative error code on failure.
> > > + */
> > > +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> > > +		    const char *name, struct drm_device *drm,
> > > +		    struct mm_struct *mm, void
> > > *device_private_page_owner,
> > > +		    u64 mm_start, u64 mm_range, u64
> > > notifier_size,
> > > +		    const struct drm_gpusvm_ops *ops,
> > > +		    const u64 *chunk_sizes, int num_chunks)
> > > +{
> > > +	if (!ops->invalidate || !num_chunks)
> > > +		return -EINVAL;
> > > +
> > > +	gpusvm->name = name;
> > > +	gpusvm->drm = drm;
> > > +	gpusvm->mm = mm;
> > > +	gpusvm->device_private_page_owner =
> > > device_private_page_owner;
> > > +	gpusvm->mm_start = mm_start;
> > > +	gpusvm->mm_range = mm_range;
> > > +	gpusvm->notifier_size = notifier_size;
> > > +	gpusvm->ops = ops;
> > > +	gpusvm->chunk_sizes = chunk_sizes;
> > > +	gpusvm->num_chunks = num_chunks;
> > > +	gpusvm->zdd_wq = system_wq;
> > > +
> > > +	mmgrab(mm);
> > > +	gpusvm->root = RB_ROOT_CACHED;
> > > +	INIT_LIST_HEAD(&gpusvm->notifier_list);
> > > +
> > > +	init_rwsem(&gpusvm->notifier_lock);
> > > +
> > > +	fs_reclaim_acquire(GFP_KERNEL);
> > > +	might_lock(&gpusvm->notifier_lock);
> > > +	fs_reclaim_release(GFP_KERNEL);
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_find - Find GPU SVM notifier
> > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > + * @fault_addr__: Fault address
> > > + *
> > > + * This macro finds the GPU SVM notifier associated with the
> > > fault
> > > address.
> > > + *
> > > + * Returns:
> > > + * Pointer to the GPU SVM notifier on success, NULL otherwise.
> > > + */
> > > +#define drm_gpusvm_notifier_find(gpusvm__,
> > > fault_addr__)	\
> > > +	notifier_iter_first(&(gpusvm__)->root,
> > > (fault_addr__),	\
> > > +			    (fault_addr__ + 1))
> > > +
> > > +/**
> > > + * to_drm_gpusvm_notifier - retrieve the container struct for a
> > > given rbtree node
> > > + * @node__: a pointer to the rbtree node embedded within a
> > > drm_gpusvm_notifier struct
> > > + *
> > > + * Return: A pointer to the containing drm_gpusvm_notifier
> > > structure.
> > > + */
> > > +#define
> > > to_drm_gpusvm_notifier(__node)				\
> > > +	container_of((__node), struct drm_gpusvm_notifier,
> > > rb.node)
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_insert - Insert GPU SVM notifier
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > + *
> > > + * This function inserts the GPU SVM notifier into the GPU SVM
> > > RB
> > > tree and list.
> > > + */
> > > +static void drm_gpusvm_notifier_insert(struct drm_gpusvm
> > > *gpusvm,
> > > +				       struct
> > > drm_gpusvm_notifier
> > > *notifier)
> > > +{
> > > +	struct rb_node *node;
> > > +	struct list_head *head;
> > > +
> > > +	notifier_insert(notifier, &gpusvm->root);
> > > +
> > > +	node = rb_prev(&notifier->rb.node);
> > > +	if (node)
> > > +		head = &(to_drm_gpusvm_notifier(node))-
> > > >rb.entry;
> > > +	else
> > > +		head = &gpusvm->notifier_list;
> > > +
> > > +	list_add(&notifier->rb.entry, head);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_remove - Remove GPU SVM notifier
> > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > + * @notifier__: Pointer to the GPU SVM notifier structure
> > > + *
> > > + * This macro removes the GPU SVM notifier from the GPU SVM RB
> > > tree
> > > and list.
> > > + */
> > > +#define drm_gpusvm_notifier_remove(gpusvm__,
> > > notifier__)	\
> > > +	notifier_remove((notifier__), &(gpusvm__)-
> > > >root);	\
> > > +	list_del(&(notifier__)->rb.entry)
> > > +
> > > +/**
> > > + * drm_gpusvm_fini - Finalize the GPU SVM.
> > > + * @gpusvm: Pointer to the GPU SVM structure.
> > > + *
> > > + * This function finalizes the GPU SVM by cleaning up any
> > > remaining
> > > ranges and
> > > + * notifiers, and dropping a reference to struct MM.
> > > + */
> > > +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm)
> > > +{
> > > +	struct drm_gpusvm_notifier *notifier, *next;
> > > +
> > > +	drm_gpusvm_for_each_notifier_safe(notifier, next,
> > > gpusvm, 0,
> > > LONG_MAX) {
> > > +		struct drm_gpusvm_range *range, *__next;
> > > +
> > > +		/*
> > > +		 * Remove notifier first to avoid racing with
> > > any
> > > invalidation
> > > +		 */
> > > +		mmu_interval_notifier_remove(&notifier-
> > > >notifier);
> > > +		notifier->flags.removed = true;
> > > +
> > > +		drm_gpusvm_for_each_range_safe(range, __next,
> > > notifier, 0,
> > > +					       LONG_MAX)
> > > +			drm_gpusvm_range_remove(gpusvm, range);
> > > +	}
> > > +
> > > +	mmdrop(gpusvm->mm);
> > > +	WARN_ON(!RB_EMPTY_ROOT(&gpusvm->root.rb_root));
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_alloc - Allocate GPU SVM notifier
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @fault_addr: Fault address
> > > + *
> > > + * This function allocates and initializes the GPU SVM notifier
> > > structure.
> > > + *
> > > + * Returns:
> > > + * Pointer to the allocated GPU SVM notifier on success,
> > > ERR_PTR()
> > > on failure.
> > > + */
> > > +static struct drm_gpusvm_notifier *
> > > +drm_gpusvm_notifier_alloc(struct drm_gpusvm *gpusvm, u64
> > > fault_addr)
> > > +{
> > > +	struct drm_gpusvm_notifier *notifier;
> > > +
> > > +	if (gpusvm->ops->notifier_alloc)
> > > +		notifier = gpusvm->ops->notifier_alloc();
> > > +	else
> > > +		notifier = kzalloc(sizeof(*notifier),
> > > GFP_KERNEL);
> > > +
> > > +	if (!notifier)
> > > +		return ERR_PTR(-ENOMEM);
> > > +
> > > +	notifier->gpusvm = gpusvm;
> > > +	notifier->interval.start = ALIGN_DOWN(fault_addr,
> > > gpusvm-
> > > > notifier_size);
> > > +	notifier->interval.end = ALIGN(fault_addr + 1, gpusvm-
> > > > notifier_size);
> > > +	INIT_LIST_HEAD(&notifier->rb.entry);
> > > +	notifier->root = RB_ROOT_CACHED;
> > > +	INIT_LIST_HEAD(&notifier->range_list);
> > > +
> > > +	return notifier;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_notifier_free - Free GPU SVM notifier
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > + *
> > > + * This function frees the GPU SVM notifier structure.
> > > + */
> > > +static void drm_gpusvm_notifier_free(struct drm_gpusvm *gpusvm,
> > > +				     struct drm_gpusvm_notifier
> > > *notifier)
> > > +{
> > > +	WARN_ON(!RB_EMPTY_ROOT(&notifier->root.rb_root));
> > > +
> > > +	if (gpusvm->ops->notifier_free)
> > > +		gpusvm->ops->notifier_free(notifier);
> > > +	else
> > > +		kfree(notifier);
> > > +}
> > > +
> > > +/**
> > > + * to_drm_gpusvm_range - retrieve the container struct for a
> > > given
> > > rbtree node
> > > + * @node__: a pointer to the rbtree node embedded within a
> > > drm_gpusvm_range struct
> > > + *
> > > + * Return: A pointer to the containing drm_gpusvm_range
> > > structure.
> > > + */
> > > +#define to_drm_gpusvm_range(node__)	\
> > > +	container_of((node__), struct drm_gpusvm_range, rb.node)
> > > +
> > > +/**
> > > + * drm_gpusvm_range_insert - Insert GPU SVM range
> > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + *
> > > + * This function inserts the GPU SVM range into the notifier RB
> > > tree
> > > and list.
> > > + */
> > > +static void drm_gpusvm_range_insert(struct drm_gpusvm_notifier
> > > *notifier,
> > > +				    struct drm_gpusvm_range
> > > *range)
> > > +{
> > > +	struct rb_node *node;
> > > +	struct list_head *head;
> > > +
> > > +	drm_gpusvm_notifier_lock(notifier->gpusvm);
> > > +	range_insert(range, &notifier->root);
> > > +
> > > +	node = rb_prev(&range->rb.node);
> > > +	if (node)
> > > +		head = &(to_drm_gpusvm_range(node))->rb.entry;
> > > +	else
> > > +		head = &notifier->range_list;
> > > +
> > > +	list_add(&range->rb.entry, head);
> > > +	drm_gpusvm_notifier_unlock(notifier->gpusvm);
> > > +}
> > > +
> > > +/**
> > > + * __drm_gpusvm_range_remove - Remove GPU SVM range
> > > + * @notifier__: Pointer to the GPU SVM notifier structure
> > > + * @range__: Pointer to the GPU SVM range structure
> > > + *
> > > + * This macro removes the GPU SVM range from the notifier RB
> > > tree
> > > and list.
> > > + */
> > > +#define __drm_gpusvm_range_remove(notifier__,
> > > range__)		\
> > > +	range_remove((range__), &(notifier__)-
> > > >root);		\
> > > +	list_del(&(range__)->rb.entry)
> > > +
> > > +/**
> > > + * drm_gpusvm_range_alloc - Allocate GPU SVM range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > + * @fault_addr: Fault address
> > > + * @chunk_size: Chunk size
> > > + * @migrate_vram: Flag indicating whether to migrate VRAM
> > > + *
> > > + * This function allocates and initializes the GPU SVM range
> > > structure.
> > > + *
> > > + * Returns:
> > > + * Pointer to the allocated GPU SVM range on success, ERR_PTR()
> > > on
> > > failure.
> > > + */
> > > +static struct drm_gpusvm_range *
> > > +drm_gpusvm_range_alloc(struct drm_gpusvm *gpusvm,
> > > +		       struct drm_gpusvm_notifier *notifier,
> > > +		       u64 fault_addr, u64 chunk_size, bool
> > > migrate_vram)
> > > +{
> > > +	struct drm_gpusvm_range *range;
> > > +
> > > +	if (gpusvm->ops->range_alloc)
> > > +		range = gpusvm->ops->range_alloc(gpusvm);
> > > +	else
> > > +		range = kzalloc(sizeof(*range), GFP_KERNEL);
> > > +
> > > +	if (!range)
> > > +		return ERR_PTR(-ENOMEM);
> > > +
> > > +	kref_init(&range->refcount);
> > > +	range->gpusvm = gpusvm;
> > > +	range->notifier = notifier;
> > > +	range->va.start = ALIGN_DOWN(fault_addr, chunk_size);
> > > +	range->va.end = ALIGN(fault_addr + 1, chunk_size);
> > > +	INIT_LIST_HEAD(&range->rb.entry);
> > > +	range->notifier_seq = LONG_MAX;
> > > +	range->flags.migrate_vram = migrate_vram ? 1 : 0;
> > > +
> > > +	return range;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_check_pages - Check pages
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > + * @start: Start address
> > > + * @end: End address
> > > + *
> > > + * Check if pages between start and end have been faulted in on
> > > the
> > > CPU. Use to
> > > + * prevent migration of pages without CPU backing store.
> > > + *
> > > + * Returns:
> > > + * True if pages have been faulted into CPU, False otherwise
> > > + */
> > > +static bool drm_gpusvm_check_pages(struct drm_gpusvm *gpusvm,
> > > +				   struct drm_gpusvm_notifier
> > > *notifier,
> > > +				   u64 start, u64 end)
> > > +{
> > > +	struct hmm_range hmm_range = {
> > > +		.default_flags = 0,
> > > +		.notifier = &notifier->notifier,
> > > +		.start = start,
> > > +		.end = end,
> > > +		.dev_private_owner = gpusvm-
> > > > device_private_page_owner,
> > > +	};
> > > +	unsigned long timeout =
> > > +		jiffies +
> > > msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> > > +	unsigned long *pfns;
> > > +	unsigned long npages = npages_in_range(start, end);
> > > +	int err, i;
> > > +
> > > +	mmap_assert_locked(gpusvm->mm);
> > > +
> > > +	pfns = kvmalloc_array(npages, sizeof(*pfns),
> > > GFP_KERNEL);
> > > +	if (!pfns)
> > > +		return false;
> > > +
> > > +	hmm_range.notifier_seq =
> > > mmu_interval_read_begin(&notifier-
> > > > notifier);
> > > +	hmm_range.hmm_pfns = pfns;
> > > +
> > > +	while (true) {
> > > +		err = hmm_range_fault(&hmm_range);
> > > +		if (err == -EBUSY) {
> > > +			if (time_after(jiffies, timeout))
> > > +				break;
> > > +
> > > +			hmm_range.notifier_seq =
> > > mmu_interval_read_begin(&notifier->notifier);
> > > +			continue;
> > > +		}
> > > +		break;
> > > +	}
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	for (i = 0; i < npages; ++i) {
> > > +		if (!(pfns[i] & HMM_PFN_VALID)) {
> > > +			err = -EFAULT;
> > > +			goto err_free;
> > > +		}
> > > +	}
> > > +
> > > +err_free:
> > > +	kvfree(pfns);
> > > +	return err ? false : true;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_chunk_size - Determine chunk size for GPU
> > > SVM
> > > range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > + * @vas: Pointer to the virtual memory area structure
> > > + * @fault_addr: Fault address
> > > + * @gpuva_start: Start address of GPUVA which mirrors CPU
> > > + * @gpuva_end: End address of GPUVA which mirrors CPU
> > > + * @check_pages: Flag indicating whether to check pages
> > > + *
> > > + * This function determines the chunk size for the GPU SVM range
> > > based on the
> > > + * fault address, GPU SVM chunk sizes, existing GPU SVM ranges,
> > > and
> > > the virtual
> > > + * memory area boundaries.
> > > + *
> > > + * Returns:
> > > + * Chunk size on success, LONG_MAX on failure.
> > > + */
> > > +static u64 drm_gpusvm_range_chunk_size(struct drm_gpusvm
> > > *gpusvm,
> > > +				       struct
> > > drm_gpusvm_notifier
> > > *notifier,
> > > +				       struct vm_area_struct
> > > *vas,
> > > +				       u64 fault_addr, u64
> > > gpuva_start,
> > > +				       u64 gpuva_end, bool
> > > check_pages)
> > > +{
> > > +	u64 start, end;
> > > +	int i = 0;
> > > +
> > > +retry:
> > > +	for (; i < gpusvm->num_chunks; ++i) {
> > > +		start = ALIGN_DOWN(fault_addr, gpusvm-
> > > > chunk_sizes[i]);
> > > +		end = ALIGN(fault_addr + 1, gpusvm-
> > > >chunk_sizes[i]);
> > > +
> > > +		if (start >= vas->vm_start && end <= vas->vm_end
> > > &&
> > > +		    start >= notifier->interval.start &&
> > > +		    end <= notifier->interval.end &&
> > > +		    start >= gpuva_start && end <= gpuva_end)
> > > +			break;
> > > +	}
> > > +
> > > +	if (i == gpusvm->num_chunks)
> > > +		return LONG_MAX;
> > > +
> > > +	/*
> > > +	 * If allocating more than a page, ensure not to overlap with
> > > +	 * existing ranges.
> > > +	 */
> > > +	if (end - start != SZ_4K) {
> > > +		struct drm_gpusvm_range *range;
> > > +
> > > +		range = drm_gpusvm_range_find(notifier, start,
> > > end);
> > > +		if (range) {
> > > +			++i;
> > > +			goto retry;
> > > +		}
> > > +
> > > +		/*
> > > +		 * XXX: Only create range on pages CPU has
> > > faulted
> > > in. Without
> > > +		 * this check, or prefault, on BMG
> > > 'xe_exec_system_allocator --r
> > > +		 * process-many-malloc' fails. In the failure
> > > case,
> > > each process
> > > +		 * mallocs 16k but the CPU VMA is ~128k which
> > > results in 64k SVM
> > > +		 * ranges. When migrating the SVM ranges, some
> > > processes fail in
> > > +		 * drm_gpusvm_migrate_to_vram with
> > > 'migrate.cpages
> > > != npages'
> > > +		 * and then upon drm_gpusvm_range_get_pages
> > > device
> > > pages from
> > > +		 * other processes are collected + faulted in
> > > which
> > > creates all
> > > +		 * sorts of problems. Unsure exactly how this
> > > is happening; also the
> > > +		 * problem goes away if 'xe_exec_system_allocator --r
> > > +		 * process-many-malloc' mallocs at least 64k at a time.
> > > +		 */
> > > +		if (check_pages &&
> > > +		    !drm_gpusvm_check_pages(gpusvm, notifier,
> > > start,
> > > end)) {
> > > +			++i;
> > > +			goto retry;
> > > +		}
> > > +	}
> > > +
> > > +	return end - start;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_find_or_insert - Find or insert GPU SVM
> > > range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @fault_addr: Fault address
> > > + * @gpuva_start: Start address of GPUVA which mirrors CPU
> > > + * @gpuva_end: End address of GPUVA which mirrors CPU
> > > + * @ctx: GPU SVM context
> > > + *
> > > + * This function finds or inserts a newly allocated GPU SVM
> > > range
> > > based on the
> > > + * fault address. Caller must hold a lock to protect range
> > > lookup
> > > and insertion.
> > > + *
> > > + * Returns:
> > > + * Pointer to the GPU SVM range on success, ERR_PTR() on
> > > failure.
> > > + */
> > > +struct drm_gpusvm_range *
> > > +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64
> > > fault_addr,
> > > +				u64 gpuva_start, u64 gpuva_end,
> > > +				const struct drm_gpusvm_ctx
> > > *ctx)
> > > +{
> > > +	struct drm_gpusvm_notifier *notifier;
> > > +	struct drm_gpusvm_range *range;
> > > +	struct mm_struct *mm = gpusvm->mm;
> > > +	struct vm_area_struct *vas;
> > > +	bool notifier_alloc = false;
> > > +	u64 chunk_size;
> > > +	int err;
> > > +	bool migrate_vram;
> > > +
> > > +	if (fault_addr < gpusvm->mm_start ||
> > > +	    fault_addr > gpusvm->mm_start + gpusvm->mm_range) {
> > > +		err = -EINVAL;
> > > +		goto err_out;
> > > +	}
> > > +
> > > +	if (!ctx->mmap_locked) {
> > > +		if (!mmget_not_zero(mm)) {
> > > +			err = -EFAULT;
> > > +			goto err_out;
> > > +		}
> > > +		mmap_write_lock(mm);
> > > +	}
> > > +
> > > +	mmap_assert_write_locked(mm);
> > > +
> > > +	notifier = drm_gpusvm_notifier_find(gpusvm, fault_addr);
> > > +	if (!notifier) {
> > > +		notifier = drm_gpusvm_notifier_alloc(gpusvm,
> > > fault_addr);
> > > +		if (IS_ERR(notifier)) {
> > > +			err = PTR_ERR(notifier);
> > > +			goto err_mmunlock;
> > > +		}
> > > +		notifier_alloc = true;
> > > +		err = mmu_interval_notifier_insert_locked(&notifier->notifier,
> > > +							  mm, notifier->interval.start,
> > > +							  notifier->interval.end -
> > > +							  notifier->interval.start,
> > > +							  &drm_gpusvm_notifier_ops);
> > > +		if (err)
> > > +			goto err_notifier;
> > > +	}
> > > +
> > > +	vas = vma_lookup(mm, fault_addr);
> > > +	if (!vas) {
> > > +		err = -ENOENT;
> > > +		goto err_notifier_remove;
> > > +	}
> > > +
> > > +	if (!ctx->read_only && !(vas->vm_flags & VM_WRITE)) {
> > > +		err = -EPERM;
> > > +		goto err_notifier_remove;
> > > +	}
> > > +
> > > +	range = drm_gpusvm_range_find(notifier, fault_addr,
> > > fault_addr + 1);
> > > +	if (range)
> > > +		goto out_mmunlock;
> > > +	/*
> > > +	 * XXX: Short-circuiting migration based on
> > > migrate_vma_*
> > > current
> > > +	 * limitations. If/when migrate_vma_* add more support,
> > > this
> > > logic will
> > > +	 * have to change.
> > > +	 */
> > > +	migrate_vram = ctx->vram_possible &&
> > > +		vma_is_anonymous(vas) &&
> > > !is_vm_hugetlb_page(vas);
> > > +
> > > +	chunk_size = drm_gpusvm_range_chunk_size(gpusvm,
> > > notifier,
> > > vas,
> > > +						 fault_addr,
> > > gpuva_start,
> > > +						 gpuva_end,
> > > migrate_vram &&
> > > +						 !ctx->prefault);
> > > +	if (chunk_size == LONG_MAX) {
> > > +		err = -EINVAL;
> > > +		goto err_notifier_remove;
> > > +	}
> > > +
> > > +	range = drm_gpusvm_range_alloc(gpusvm, notifier,
> > > fault_addr,
> > > chunk_size,
> > > +				       migrate_vram);
> > > +	if (IS_ERR(range)) {
> > > +		err = PTR_ERR(range);
> > > +		goto err_notifier_remove;
> > > +	}
> > > +
> > > +	drm_gpusvm_range_insert(notifier, range);
> > > +	if (notifier_alloc)
> > > +		drm_gpusvm_notifier_insert(gpusvm, notifier);
> > > +
> > > +	if (ctx->prefault) {
> > > +		struct drm_gpusvm_ctx __ctx = *ctx;
> > > +
> > > +		__ctx.mmap_locked = true;
> > > +		err = drm_gpusvm_range_get_pages(gpusvm, range,
> > > &__ctx);
> > > +		if (err)
> > > +			goto err_range_remove;
> > > +	}
> > > +
> > > +out_mmunlock:
> > > +	if (!ctx->mmap_locked) {
> > > +		mmap_write_unlock(mm);
> > > +		mmput(mm);
> > > +	}
> > > +
> > > +	return range;
> > > +
> > > +err_range_remove:
> > > +	__drm_gpusvm_range_remove(notifier, range);
> > > +err_notifier_remove:
> > > +	if (notifier_alloc)
> > > +		mmu_interval_notifier_remove(&notifier->notifier);
> > > +err_notifier:
> > > +	if (notifier_alloc)
> > > +		drm_gpusvm_notifier_free(gpusvm, notifier);
> > > +err_mmunlock:
> > > +	if (!ctx->mmap_locked) {
> > > +		mmap_write_unlock(mm);
> > > +		mmput(mm);
> > > +	}
> > > +err_out:
> > > +	return ERR_PTR(err);
> > > +}
> > > +
> > > +/**
> > > + * for_each_dma_page - iterate over pages in a DMA region
> > > + * @i__: the current page index in the iteration
> > > + * @j__: the current page index, log order, in the iteration
> > > + * @npages__: the total number of pages in the DMA region
> > > + * @order__: the order of the pages in the DMA region
> > > + *
> > > + * This macro iterates over each page in a DMA region. The DMA
> > > region
> > > + * is assumed to be composed of 2^@order__ pages, and the macro
> > > will
> > > + * step through the region one block of 2^@order__ pages at a
> > > time.
> > > + */
> > > +#define for_each_dma_page(i__, j__, npages__, order__)	\
> > > +	for ((i__) = 0, (j__) = 0; (i__) < (npages__);	\
> > > +	     (j__)++, (i__) += 0x1 << (order__))
> > > +
> > > +/**
> > > + * __drm_gpusvm_range_unmap_pages - Unmap pages associated with
> > > a
> > > GPU SVM range (internal)
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + *
> > > + * This function unmaps pages associated with a GPU SVM range. Assumes and
> > > + * asserts correct locking is in place when called.
> > > + */
> > > +static void __drm_gpusvm_range_unmap_pages(struct drm_gpusvm
> > > *gpusvm,
> > > +					   struct
> > > drm_gpusvm_range
> > > *range)
> > > +{
> > > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > > +
> > > +	if (range->pages) {
> > > +		unsigned long i, j, npages = npages_in_range(range->va.start,
> > > +							     range->va.end);
> > > +
> > > +		if (range->flags.has_dma_mapping) {
> > > +			for_each_dma_page(i, j, npages, range->order)
> > > +				dma_unmap_page(gpusvm->drm->dev,
> > > +					       range->dma_addr[j],
> > > +					       PAGE_SIZE << range->order,
> > > +					       DMA_BIDIRECTIONAL);
> > > +		}
> > > +
> > > +		range->flags.has_vram_pages = false;
> > > +		range->flags.has_dma_mapping = false;
> > > +	}
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_free_pages - Free pages associated with a
> > > GPU
> > > SVM range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + *
> > > + * This function frees pages associated with a GPU SVM range.
> > > + */
> > > +static void drm_gpusvm_range_free_pages(struct drm_gpusvm
> > > *gpusvm,
> > > +					struct drm_gpusvm_range
> > > *range)
> > > +{
> > > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > > +
> > > +	if (range->pages) {
> > > +		if (range->flags.kfree_mapping) {
> > > +			kfree(range->dma_addr);
> > > +			range->flags.kfree_mapping = false;
> > > +			range->pages = NULL;
> > > +		} else {
> > > +			kvfree(range->pages);
> > > +			range->pages = NULL;
> > > +		}
> > > +	}
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_remove - Remove GPU SVM range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range to be removed
> > > + *
> > > + * This function removes the specified GPU SVM range and also
> > > removes the parent
> > > + * GPU SVM notifier if no more ranges remain in the notifier.
> > > The
> > > caller must
> > > + * hold a lock to protect range and notifier removal.
> > > + */
> > > +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> > > +			     struct drm_gpusvm_range *range)
> > > +{
> > > +	struct drm_gpusvm_notifier *notifier;
> > > +
> > > +	notifier = drm_gpusvm_notifier_find(gpusvm, range->va.start);
> > > +	if (WARN_ON_ONCE(!notifier))
> > > +		return;
> > > +
> > > +	drm_gpusvm_notifier_lock(gpusvm);
> > > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > +	drm_gpusvm_range_free_pages(gpusvm, range);
> > > +	__drm_gpusvm_range_remove(notifier, range);
> > > +	drm_gpusvm_notifier_unlock(gpusvm);
> > > +
> > > +	drm_gpusvm_range_put(range);
> > > +
> > > +	if (RB_EMPTY_ROOT(&notifier->root.rb_root)) {
> > > +		if (!notifier->flags.removed)
> > > +			mmu_interval_notifier_remove(&notifier->notifier);
> > > +		drm_gpusvm_notifier_remove(gpusvm, notifier);
> > > +		drm_gpusvm_notifier_free(gpusvm, notifier);
> > > +	}
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_get - Get a reference to GPU SVM range
> > > + * @range: Pointer to the GPU SVM range
> > > + *
> > > + * This function increments the reference count of the specified
> > > GPU
> > > SVM range.
> > > + *
> > > + * Returns:
> > > + * Pointer to the GPU SVM range.
> > > + */
> > > +struct drm_gpusvm_range *
> > > +drm_gpusvm_range_get(struct drm_gpusvm_range *range)
> > > +{
> > > +	kref_get(&range->refcount);
> > > +
> > > +	return range;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_destroy - Destroy GPU SVM range
> > > + * @refcount: Pointer to the reference counter embedded in the
> > > GPU
> > > SVM range
> > > + *
> > > + * This function destroys the specified GPU SVM range when its
> > > reference count
> > > + * reaches zero. If a custom range-free function is provided, it
> > > is
> > > invoked to
> > > + * free the range; otherwise, the range is deallocated using
> > > kfree().
> > > + */
> > > +static void drm_gpusvm_range_destroy(struct kref *refcount)
> > > +{
> > > +	struct drm_gpusvm_range *range =
> > > +		container_of(refcount, struct drm_gpusvm_range,
> > > refcount);
> > > +	struct drm_gpusvm *gpusvm = range->gpusvm;
> > > +
> > > +	if (gpusvm->ops->range_free)
> > > +		gpusvm->ops->range_free(range);
> > > +	else
> > > +		kfree(range);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_put - Put a reference to GPU SVM range
> > > + * @range: Pointer to the GPU SVM range
> > > + *
> > > + * This function decrements the reference count of the specified
> > > GPU
> > > SVM range
> > > + * and frees it when the count reaches zero.
> > > + */
> > > +void drm_gpusvm_range_put(struct drm_gpusvm_range *range)
> > > +{
> > > +	kref_put(&range->refcount, drm_gpusvm_range_destroy);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_pages_valid - GPU SVM range pages valid
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + *
> > > + * This function determines if a GPU SVM range's pages are valid. Expected to
> > > + * be called holding gpusvm->notifier_lock and as the last step before
> > > + * committing a GPU binding.
> > > + *
> > > + * Returns:
> > > + * True if GPU SVM range has valid pages, False otherwise
> > > + */
> > > +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> > > +				  struct drm_gpusvm_range
> > > *range)
> > > +{
> > > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > > +
> > > +	return range->flags.has_vram_pages || range->flags.has_dma_mapping;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_pages_valid_unlocked - GPU SVM range pages
> > > valid
> > > unlocked
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + *
> > > + * This function determines if a GPU SVM range's pages are valid. Expected to
> > > + * be called without holding gpusvm->notifier_lock.
> > > + *
> > > + * Returns:
> > > + * True if GPU SVM range has valid pages, False otherwise
> > > + */
> > > +static bool
> > > +drm_gpusvm_range_pages_valid_unlocked(struct drm_gpusvm *gpusvm,
> > > +				      struct drm_gpusvm_range
> > > *range)
> > > +{
> > > +	bool pages_valid;
> > > +
> > > +	if (!range->pages)
> > > +		return false;
> > > +
> > > +	drm_gpusvm_notifier_lock(gpusvm);
> > > +	pages_valid = drm_gpusvm_range_pages_valid(gpusvm,
> > > range);
> > > +	if (!pages_valid && range->flags.kfree_mapping) {
> > > +		kfree(range->dma_addr);
> > > +		range->flags.kfree_mapping = false;
> > > +		range->pages = NULL;
> > > +	}
> > > +	drm_gpusvm_notifier_unlock(gpusvm);
> > > +
> > > +	return pages_valid;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_get_pages - Get pages for a GPU SVM range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + * @ctx: GPU SVM context
> > > + *
> > > + * This function gets pages for a GPU SVM range and ensures they
> > > are
> > > mapped for
> > > + * DMA access.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       const struct drm_gpusvm_ctx *ctx)
> > > +{
> > > +	struct mmu_interval_notifier *notifier = &range->notifier->notifier;
> > > +	struct hmm_range hmm_range = {
> > > +		.default_flags = HMM_PFN_REQ_FAULT | (ctx->read_only ? 0 :
> > > +			HMM_PFN_REQ_WRITE),
> > > +		.notifier = notifier,
> > > +		.start = range->va.start,
> > > +		.end = range->va.end,
> > > +		.dev_private_owner = gpusvm->device_private_page_owner,
> > > +	};
> > > +	struct mm_struct *mm = gpusvm->mm;
> > > +	unsigned long timeout =
> > > +		jiffies +
> > > msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> > > +	unsigned long i, j;
> > > +	unsigned long npages = npages_in_range(range->va.start,
> > > range->va.end);
> > > +	unsigned int order = 0;
> > > +	unsigned long *pfns;
> > > +	struct page **pages;
> > > +	int err = 0;
> > > +	bool vram_pages = !!range->flags.migrate_vram;
> > > +	bool alloc_pfns = false, kfree_mapping;
> > > +
> > > +retry:
> > > +	kfree_mapping = false;
> > > +	hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> > > +	if (drm_gpusvm_range_pages_valid_unlocked(gpusvm, range))
> > > +		return 0;
> > > +
> > > +	if (range->notifier_seq == hmm_range.notifier_seq && range->pages) {
> > > +		if (ctx->prefault)
> > > +			return 0;
> > > +
> > > +		pfns = (unsigned long *)range->pages;
> > > +		pages = range->pages;
> > > +		goto map_pages;
> > > +	}
> > > +
> > > +	if (!range->pages) {
> > > +		pfns = kvmalloc_array(npages, sizeof(*pfns),
> > > GFP_KERNEL);
> > > +		if (!pfns)
> > > +			return -ENOMEM;
> > > +		alloc_pfns = true;
> > > +	} else {
> > > +		pfns = (unsigned long *)range->pages;
> > > +	}
> > > +
> > > +	if (!ctx->mmap_locked) {
> > > +		if (!mmget_not_zero(mm)) {
> > > +			err = -EFAULT;
> > > +			goto err_out;
> > > +		}
> > > +	}
> > > +
> > > +	hmm_range.hmm_pfns = pfns;
> > > +	while (true) {
> > > +		/* Must be checked after mmu_interval_read_begin */
> > > +		if (range->flags.unmapped) {
> > > +			err = -EFAULT;
> > > +			break;
> > > +		}
> > > +
> > > +		if (!ctx->mmap_locked) {
> > > +			/*
> > > +			 * XXX: HMM locking document indicates
> > > only
> > > a read-lock
> > > +			 * is required but there appears to be a
> > > window between
> > > +			 * the MMU_NOTIFY_MIGRATE event
> > > triggered in
> > > a CPU fault
> > > +			 * via migrate_vma_setup and the pages
> > > actually moving
> > > +			 * in migrate_vma_finalize in which this
> > > code can grab
> > > +			 * garbage pages. Grabbing the write-
> > > lock if
> > > the range
> > > +			 * is attached to vram appears to
> > > protect
> > > against this
> > > +			 * race.
> > > +			 */
> > > +			if (vram_pages)
> > > +				mmap_write_lock(mm);
> > > +			else
> > > +				mmap_read_lock(mm);
> > > +		}
> > > +		err = hmm_range_fault(&hmm_range);
> > > +		if (!ctx->mmap_locked) {
> > > +			if (vram_pages)
> > > +				mmap_write_unlock(mm);
> > > +			else
> > > +				mmap_read_unlock(mm);
> > > +		}
> > > +
> > > +		if (err == -EBUSY) {
> > > +			if (time_after(jiffies, timeout))
> > > +				break;
> > > +
> > > +			hmm_range.notifier_seq =
> > > mmu_interval_read_begin(notifier);
> > > +			continue;
> > > +		}
> > > +		break;
> > > +	}
> > > +	if (!ctx->mmap_locked)
> > > +		mmput(mm);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	pages = (struct page **)pfns;
> > > +
> > > +	if (ctx->prefault) {
> > > +		range->pages = pages;
> > > +		goto set_seqno;
> > > +	}
> > > +
> > > +map_pages:
> > > +	if (is_device_private_page(hmm_pfn_to_page(pfns[0]))) {
> > > +		WARN_ON_ONCE(!range->vram_allocation);
> > > +
> > > +		for (i = 0; i < npages; ++i) {
> > > +			pages[i] = hmm_pfn_to_page(pfns[i]);
> > > +
> > > +			if
> > > (WARN_ON_ONCE(!is_device_private_page(pages[i]))) {
> > > +				err = -EOPNOTSUPP;
> > > +				goto err_free;
> > > +			}
> > > +		}
> > > +
> > > +		/* Do not race with notifier unmapping pages */
> > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > +		range->flags.has_vram_pages = true;
> > > +		range->pages = pages;
> > > +		if (mmu_interval_read_retry(notifier,
> > > hmm_range.notifier_seq)) {
> > > +			err = -EAGAIN;
> > > +			__drm_gpusvm_range_unmap_pages(gpusvm,
> > > range);
> > > +		}
> > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > +	} else {
> > > +		dma_addr_t *dma_addr = (dma_addr_t *)pfns;
> > > +
> > > +		for_each_dma_page(i, j, npages, order) {
> > > +			if (WARN_ON_ONCE(i && order !=
> > > +					
> > > hmm_pfn_to_map_order(pfns[i]))) {
> > > +				err = -EOPNOTSUPP;
> > > +				npages = i;
> > > +				goto err_unmap;
> > > +			}
> > > +			order = hmm_pfn_to_map_order(pfns[i]);
> > > +
> > > +			pages[j] = hmm_pfn_to_page(pfns[i]);
> > > +			if
> > > (WARN_ON_ONCE(is_zone_device_page(pages[j]))) {
> > > +				err = -EOPNOTSUPP;
> > > +				npages = i;
> > > +				goto err_unmap;
> > > +			}
> > > +
> > > +			set_page_dirty_lock(pages[j]);
> > > +			mark_page_accessed(pages[j]);
> > > +
> > > +			dma_addr[j] = dma_map_page(gpusvm->drm->dev,
> > > +						   pages[j], 0,
> > > +						   PAGE_SIZE << order,
> > > +						   DMA_BIDIRECTIONAL);
> > > +			if (dma_mapping_error(gpusvm->drm->dev,
> > > dma_addr[j])) {
> > > +				err = -EFAULT;
> > > +				npages = i;
> > > +				goto err_unmap;
> > > +			}
> > > +		}
> > > +
> > > +		/* Huge pages, reduce memory footprint */
> > > +		if (order) {
> > > +			dma_addr = kmalloc_array(j,
> > > sizeof(*dma_addr),
> > > +						 GFP_KERNEL);
> > > +			if (dma_addr) {
> > > +				for (i = 0; i < j; ++i)
> > > +					dma_addr[i] =
> > > (dma_addr_t)pfns[i];
> > > +				kvfree(pfns);
> > > +				kfree_mapping = true;
> > > +			} else {
> > > +				dma_addr = (dma_addr_t *)pfns;
> > > +			}
> > > +		}
> > > +
> > > +		/* Do not race with notifier unmapping pages */
> > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > +		range->order = order;
> > > +		range->flags.kfree_mapping = kfree_mapping;
> > > +		range->flags.has_dma_mapping = true;
> > > +		range->dma_addr = dma_addr;
> > > +		range->vram_allocation = NULL;
> > > +		if (mmu_interval_read_retry(notifier,
> > > hmm_range.notifier_seq)) {
> > > +			err = -EAGAIN;
> > > +			__drm_gpusvm_range_unmap_pages(gpusvm,
> > > range);
> > > +		}
> > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > +	}
> > > +
> > > +	if (err == -EAGAIN)
> > > +		goto retry;
> > > +set_seqno:
> > > +	range->notifier_seq = hmm_range.notifier_seq;
> > > +
> > > +	return 0;
> > > +
> > > +err_unmap:
> > > +	for_each_dma_page(i, j, npages, order)
> > > +		dma_unmap_page(gpusvm->drm->dev,
> > > +			       (dma_addr_t)pfns[j],
> > > +			       PAGE_SIZE << order,
> > > DMA_BIDIRECTIONAL);
> > > +err_free:
> > > +	if (alloc_pfns)
> > > +		kvfree(pfns);
> > > +err_out:
> > > +	return err;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_range_unmap_pages - Unmap pages associated with a
> > > GPU
> > > SVM range
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + * @ctx: GPU SVM context
> > > + *
> > > + * This function unmaps pages associated with a GPU SVM range.
> > > If
> > > @in_notifier
> > > + * is set, it is assumed that gpusvm->notifier_lock is held in
> > > write
> > > mode; if it
> > > + * is clear, it acquires gpusvm->notifier_lock in read mode.
> > > Must be
> > > called on
> > > + * each GPU SVM range attached to notifier in gpusvm->ops-
> > > > invalidate for IOMMU
> > > + * security model.
> > > + */
> > > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > > +				  struct drm_gpusvm_range
> > > *range,
> > > +				  const struct drm_gpusvm_ctx
> > > *ctx)
> > > +{
> > > +	if (ctx->in_notifier)
> > > +		lockdep_assert_held_write(&gpusvm-
> > > >notifier_lock);
> > > +	else
> > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > +
> > > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > +
> > > +	if (!ctx->in_notifier)
> > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migration_put_page - Put a migration page
> > > + * @page: Pointer to the page to put
> > > + *
> > > + * This function unlocks and puts a page.
> > > + */
> > > +static void drm_gpusvm_migration_put_page(struct page *page)
> > > +{
> > > +	unlock_page(page);
> > > +	put_page(page);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migration_put_pages - Put migration pages
> > > + * @npages: Number of pages
> > > + * @migrate_pfn: Array of migrate page frame numbers
> > > + *
> > > + * This function puts an array of pages.
> > > + */
> > > +static void drm_gpusvm_migration_put_pages(unsigned long npages,
> > > +					   unsigned long
> > > *migrate_pfn)
> > > +{
> > > +	unsigned long i;
> > > +
> > > +	for (i = 0; i < npages; ++i) {
> > > +		if (!migrate_pfn[i])
> > > +			continue;
> > > +
> > > +		drm_gpusvm_migration_put_page(migrate_pfn_to_page(migrate_pfn[i]));
> > > +		migrate_pfn[i] = 0;
> > > +	}
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_get_vram_page - Get a reference to a VRAM page
> > > + * @page: Pointer to the page
> > > + * @zdd: Pointer to the GPU SVM zone device data
> > > + *
> > > + * This function associates the given page with the specified
> > > GPU
> > > SVM zone
> > > + * device data and initializes it for zone device usage.
> > > + */
> > > +static void drm_gpusvm_get_vram_page(struct page *page,
> > > +				     struct drm_gpusvm_zdd *zdd)
> > > +{
> > > +	page->zone_device_data = drm_gpusvm_zdd_get(zdd);
> > > +	zone_device_page_init(page);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_map_pages() - Map migration pages for GPU
> > > SVM
> > > migration
> > > + * @dev: The device for which the pages are being mapped
> > > + * @dma_addr: Array to store DMA addresses corresponding to
> > > mapped
> > > pages
> > > + * @migrate_pfn: Array of migrate page frame numbers to map
> > > + * @npages: Number of pages to map
> > > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > > + *
> > > + * This function maps pages of memory for migration usage in GPU
> > > SVM. It
> > > + * iterates over each page frame number provided in
> > > @migrate_pfn,
> > > maps the
> > > + * corresponding page, and stores the DMA address in the
> > > provided
> > > @dma_addr
> > > + * array.
> > > + *
> > > + * Return: 0 on success, -EFAULT if an error occurs during
> > > mapping.
> > > + */
> > > +static int drm_gpusvm_migrate_map_pages(struct device *dev,
> > > +					dma_addr_t *dma_addr,
> > > +					long unsigned int
> > > *migrate_pfn,
> > > +					unsigned long npages,
> > > +					enum dma_data_direction
> > > dir)
> > > +{
> > > +	unsigned long i;
> > > +
> > > +	for (i = 0; i < npages; ++i) {
> > > +		struct page *page =
> > > migrate_pfn_to_page(migrate_pfn[i]);
> > > +
> > > +		if (!page)
> > > +			continue;
> > > +
> > > +		if (WARN_ON_ONCE(is_zone_device_page(page)))
> > > +			return -EFAULT;
> > > +
> > > +		dma_addr[i] = dma_map_page(dev, page, 0,
> > > PAGE_SIZE,
> > > dir);
> > > +		if (dma_mapping_error(dev, dma_addr[i]))
> > > +			return -EFAULT;
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously
> > > mapped
> > > for GPU SVM migration
> > > + * @dev: The device for which the pages were mapped
> > > + * @dma_addr: Array of DMA addresses corresponding to mapped
> > > pages
> > > + * @npages: Number of pages to unmap
> > > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > > + *
> > > + * This function unmaps previously mapped pages of memory for
> > > GPU
> > > Shared Virtual
> > > + * Memory (SVM). It iterates over each DMA address provided in
> > > @dma_addr, checks
> > > + * if it's valid and not already unmapped, and unmaps the
> > > corresponding page.
> > > + */
> > > +static void drm_gpusvm_migrate_unmap_pages(struct device *dev,
> > > +					   dma_addr_t *dma_addr,
> > > +					   unsigned long npages,
> > > +					   enum
> > > dma_data_direction
> > > dir)
> > > +{
> > > +	unsigned long i;
> > > +
> > > +	for (i = 0; i < npages; ++i) {
> > > +		if (!dma_addr[i] || dma_mapping_error(dev,
> > > dma_addr[i]))
> > > +			continue;
> > > +
> > > +		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE,
> > > dir);
> > > +	}
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_to_vram - Migrate GPU SVM range to VRAM
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + * @vram_allocation: Driver-private pointer to the VRAM allocation. The caller
> > > + *                   should hold a reference to the VRAM allocation, which
> > > + *                   should be dropped via ops->vram_release or upon the
> > > + *                   failure of this function.
> > > + * @ctx: GPU SVM context
> > > + *
> > > + * This function migrates the specified GPU SVM range to VRAM. It performs the
> > > + * necessary setup and invokes the driver-specific operations for migration to
> > > + * VRAM. Upon successful return, @vram_allocation can safely reference @range
> > > + * until ops->vram_release is called.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       void *vram_allocation,
> > > +			       const struct drm_gpusvm_ctx *ctx)
> > > +{
> > > +	u64 start = range->va.start, end = range->va.end;
> > > +	struct migrate_vma migrate = {
> > > +		.start		= start,
> > > +		.end		= end,
> > > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > > +		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
> > > +	};
> > > +	struct mm_struct *mm = gpusvm->mm;
> > > +	unsigned long i, npages = npages_in_range(start, end);
> > > +	struct vm_area_struct *vas;
> > > +	struct drm_gpusvm_zdd *zdd = NULL;
> > > +	struct page **pages;
> > > +	dma_addr_t *dma_addr;
> > > +	void *buf;
> > > +	int err;
> > > +
> > > +	if (!range->flags.migrate_vram)
> > > +		return -EINVAL;
> > > +
> > > +	if (!gpusvm->ops->populate_vram_pfn || !gpusvm->ops->copy_to_vram ||
> > > +	    !gpusvm->ops->copy_to_sram)
> > > +		return -EOPNOTSUPP;
> > > +
> > > +	if (!ctx->mmap_locked) {
> > > +		if (!mmget_not_zero(mm)) {
> > > +			err = -EFAULT;
> > > +			goto err_out;
> > > +		}
> > > +		mmap_write_lock(mm);
> > > +	}
> > > +
> > > +	mmap_assert_locked(mm);
> > > +
> > > +	vas = vma_lookup(mm, start);
> > > +	if (!vas) {
> > > +		err = -ENOENT;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	if (end > vas->vm_end || start < vas->vm_start) {
> > > +		err = -EINVAL;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	if (!vma_is_anonymous(vas)) {
> > > +		err = -EBUSY;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) +
> > > sizeof(*dma_addr) +
> > > +		       sizeof(*pages), GFP_KERNEL);
> > > +	if (!buf) {
> > > +		err = -ENOMEM;
> > > +		goto err_mmunlock;
> > > +	}
> > > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > > +	pages = buf + (2 * sizeof(*migrate.src) +
> > > sizeof(*dma_addr))
> > > * npages;
> > > +
> > > +	zdd = drm_gpusvm_zdd_alloc(range);
> > > +	if (!zdd) {
> > > +		err = -ENOMEM;
> > > +		goto err_free;
> > > +	}
> > > +
> > > +	migrate.vma = vas;
> > > +	migrate.src = buf;
> > > +	migrate.dst = migrate.src + npages;
> > > +
> > > +	err = migrate_vma_setup(&migrate);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	/*
> > > +	 * FIXME: The cases below, !migrate.cpages and migrate.cpages != npages,
> > > +	 * are not always errors. Need to revisit the possible cases and how to
> > > +	 * handle them. We could prefault on migrate.cpages != npages via
> > > +	 * hmm_range_fault.
> > > +	 */
> > > +
> > > +	if (!migrate.cpages) {
> > > +		err = -EFAULT;
> > > +		goto err_free;
> > > +	}
> > > +
> > > +	if (migrate.cpages != npages) {
> > > +		err = -EBUSY;
> > > +		goto err_finalize;
> > > +	}
> > > +
> > > +	err = gpusvm->ops->populate_vram_pfn(gpusvm,
> > > vram_allocation, npages,
> > > +					     migrate.dst);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > > dma_addr,
> > > +					   migrate.src, npages,
> > > DMA_TO_DEVICE);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	for (i = 0; i < npages; ++i) {
> > > +		struct page *page = pfn_to_page(migrate.dst[i]);
> > > +
> > > +		pages[i] = page;
> > > +		migrate.dst[i] = migrate_pfn(migrate.dst[i]);
> > > +		drm_gpusvm_get_vram_page(page, zdd);
> > > +	}
> > > +
> > > +	err = gpusvm->ops->copy_to_vram(gpusvm, pages, dma_addr,
> > > npages);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	/* Upon success bind vram allocation to range and zdd */
> > > +	range->vram_allocation = vram_allocation;
> > > +	WRITE_ONCE(zdd->vram_allocation, vram_allocation);	/* Owns ref */
> > > +
> > > +err_finalize:
> > > +	if (err)
> > > +		drm_gpusvm_migration_put_pages(npages,
> > > migrate.dst);
> > > +	migrate_vma_pages(&migrate);
> > > +	migrate_vma_finalize(&migrate);
> > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev,
> > > dma_addr,
> > > npages,
> > > +				       DMA_TO_DEVICE);
> > > +err_free:
> > > +	if (zdd)
> > > +		drm_gpusvm_zdd_put(zdd);
> > > +	kvfree(buf);
> > > +err_mmunlock:
> > > +	if (!ctx->mmap_locked) {
> > > +		mmap_write_unlock(mm);
> > > +		mmput(mm);
> > > +	}
> > > +err_out:
> > > +	return err;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_populate_sram_pfn - Populate SRAM PFNs for
> > > a
> > > VM area
> > > + * @vas: Pointer to the VM area structure, can be NULL
> > > + * @npages: Number of pages to populate
> > > + * @src_mpfn: Source array of migrate PFNs
> > > + * @mpfn: Array of migrate PFNs to populate
> > > + * @addr: Start address for PFN allocation
> > > + *
> > > + * This function populates the SRAM migrate page frame numbers (PFNs) for the
> > > + * specified VM area structure. It allocates and locks pages in the VM area for
> > > + * SRAM usage. If @vas is non-NULL, alloc_page_vma() is used for allocation;
> > > + * otherwise alloc_page() is used.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +static int drm_gpusvm_migrate_populate_sram_pfn(struct
> > > vm_area_struct *vas,
> > > +						unsigned long
> > > npages,
> > > +						unsigned long
> > > *src_mpfn,
> > > +						unsigned long
> > > *mpfn,
> > > u64 addr)
> > > +{
> > > +	unsigned long i;
> > > +
> > > +	for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
> > > +		struct page *page;
> > > +
> > > +		if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
> > > +			continue;
> > > +
> > > +		if (vas)
> > > +			page = alloc_page_vma(GFP_HIGHUSER, vas,
> > > addr);
> > > +		else
> > > +			page = alloc_page(GFP_HIGHUSER);
> > > +
> > > +		if (!page)
> > > +			return -ENOMEM;
> > > +
> > > +		lock_page(page);
> > > +		mpfn[i] = migrate_pfn(page_to_pfn(page));
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_evict_to_sram - Evict GPU SVM range to SRAM
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + *
> > > + * Similar to __drm_gpusvm_migrate_to_sram but does not require the mmap lock;
> > > + * migration is done via the migrate_device_* functions. Fallback path, as it
> > > + * is preferred to issue migrations with the mmap lock held.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +static int drm_gpusvm_evict_to_sram(struct drm_gpusvm *gpusvm,
> > > +				    struct drm_gpusvm_range
> > > *range)
> > > +{
> > > +	unsigned long npages;
> > > +	struct page **pages;
> > > +	unsigned long *src, *dst;
> > > +	dma_addr_t *dma_addr;
> > > +	void *buf;
> > > +	int i, err = 0;
> > > +
> > > +	npages = npages_in_range(range->va.start, range->va.end);
> > > +
> > > +	buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr) +
> > > +		       sizeof(*pages), GFP_KERNEL);
> > > +	if (!buf) {
> > > +		err = -ENOMEM;
> > > +		goto err_out;
> > > +	}
> > > +	src = buf;
> > > +	dst = buf + (sizeof(*src) * npages);
> > > +	dma_addr = buf + (2 * sizeof(*src) * npages);
> > > +	pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) *
> > > npages;
> > > +
> > > +	err = gpusvm->ops->populate_vram_pfn(gpusvm, range->vram_allocation,
> > > +					     npages, src);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	err = migrate_device_vma_range(gpusvm->mm,
> > > +				       gpusvm->device_private_page_owner, src,
> > > +				       npages, range->va.start);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	err = drm_gpusvm_migrate_populate_sram_pfn(NULL, npages,
> > > src, dst, 0);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > > dma_addr,
> > > +					   dst, npages,
> > > DMA_BIDIRECTIONAL);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	for (i = 0; i < npages; ++i)
> > > +		pages[i] = migrate_pfn_to_page(src[i]);
> > > +
> > > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr,
> > > npages);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +err_finalize:
> > > +	if (err)
> > > +		drm_gpusvm_migration_put_pages(npages, dst);
> > > +	migrate_device_pages(src, dst, npages);
> > > +	migrate_device_finalize(src, dst, npages);
> > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev,
> > > dma_addr,
> > > npages,
> > > +				       DMA_BIDIRECTIONAL);
> > > +err_free:
> > > +	kvfree(buf);
> > > +err_out:
> > > +
> > > +	return err;
> > > +}
> > > +
> > > +/**
> > > + * __drm_gpusvm_migrate_to_sram - Migrate GPU SVM range to SRAM
> > > (internal)
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @vas: Pointer to the VM area structure
> > > + * @page: Pointer to the page for fault handling (can be NULL)
> > > + * @start: Start address of the migration range
> > > + * @end: End address of the migration range
> > > + *
> > > + * This internal function performs the migration of the
> > > specified
> > > GPU SVM range
> > > + * to SRAM. It sets up the migration, populates + dma maps SRAM
> > > PFNs, and
> > > + * invokes the driver-specific operations for migration to SRAM.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +static int __drm_gpusvm_migrate_to_sram(struct drm_gpusvm
> > > *gpusvm,
> > > +					struct vm_area_struct
> > > *vas,
> > > +					struct page *page,
> > > +					u64 start, u64 end)
> > > +{
> > > +	struct migrate_vma migrate = {
> > > +		.vma		= vas,
> > > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > > +		.flags		=
> > > MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
> > > +		.fault_page	= page,
> > > +	};
> > > +	unsigned long npages;
> > > +	struct page **pages;
> > > +	dma_addr_t *dma_addr;
> > > +	void *buf;
> > > +	int i, err = 0;
> > > +
> > > +	mmap_assert_locked(gpusvm->mm);
> > > +
> > > +	/* Corner case where the VMA has been partially unmapped */
> > > +	if (start < vas->vm_start)
> > > +		start = vas->vm_start;
> > > +	if (end > vas->vm_end)
> > > +		end = vas->vm_end;
> > > +
> > > +	migrate.start = start;
> > > +	migrate.end = end;
> > > +	npages = npages_in_range(start, end);
> > > +
> > > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) +
> > > sizeof(*dma_addr) +
> > > +		       sizeof(*pages), GFP_KERNEL);
> > > +	if (!buf) {
> > > +		err = -ENOMEM;
> > > +		goto err_out;
> > > +	}
> > > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > > +	pages = buf + (2 * sizeof(*migrate.src) +
> > > sizeof(*dma_addr))
> > > * npages;
> > > +
> > > +	migrate.vma = vas;
> > > +	migrate.src = buf;
> > > +	migrate.dst = migrate.src + npages;
> > > +
> > > +	err = migrate_vma_setup(&migrate);
> > > +	if (err)
> > > +		goto err_free;
> > > +
> > > +	/* Raced with another CPU fault, nothing to do */
> > > +	if (!migrate.cpages)
> > > +		goto err_free;
> > > +
> > > +	err = drm_gpusvm_migrate_populate_sram_pfn(vas, npages,
> > > +						   migrate.src,
> > > migrate.dst,
> > > +						   start);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > > dma_addr,
> > > +					   migrate.dst, npages,
> > > +					   DMA_BIDIRECTIONAL);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +	for (i = 0; i < npages; ++i)
> > > +		pages[i] = migrate_pfn_to_page(migrate.src[i]);
> > 
> > See comments below about which pages we actually want to migrate.
> > 
> > 
> > > +
> > > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr,
> > > npages);
> > > +	if (err)
> > > +		goto err_finalize;
> > > +
> > > +err_finalize:
> > > +	if (err)
> > > +		drm_gpusvm_migration_put_pages(npages,
> > > migrate.dst);
> > > +	migrate_vma_pages(&migrate);
> > > +	migrate_vma_finalize(&migrate);
> > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev,
> > > dma_addr,
> > > npages,
> > > +				       DMA_BIDIRECTIONAL);
> > > +err_free:
> > > +	kvfree(buf);
> > > +err_out:
> > > +	mmap_assert_locked(gpusvm->mm);
> > > +
> > > +	return err;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_to_sram - Migrate (evict) GPU SVM range to
> > > SRAM
> > > + * @gpusvm: Pointer to the GPU SVM structure
> > > + * @range: Pointer to the GPU SVM range structure
> > > + * @ctx: GPU SVM context
> > > + *
> > > + * This function initiates the migration of the specified GPU
> > > SVM
> > > range to
> > > + * SRAM. It performs necessary checks and invokes the internal
> > > migration
> > > + * function for actual migration.
> > > + *
> > > + * Returns:
> > > + * 0 on success, negative error code on failure.
> > > + */
> > > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > +			       struct drm_gpusvm_range *range,
> > > +			       const struct drm_gpusvm_ctx *ctx)
> > > +{
> > > +	u64 start = range->va.start, end = range->va.end;
> > > +	struct mm_struct *mm = gpusvm->mm;
> > > +	struct vm_area_struct *vas;
> > > +	int err;
> > > +	bool retry = false;
> > > +
> > > +	if (!ctx->mmap_locked) {
> > > +		if (!mmget_not_zero(mm)) {
> > > +			err = -EFAULT;
> > > +			goto err_out;
> > > +		}
> > > +		if (ctx->trylock_mmap) {
> > > +			if (!mmap_read_trylock(mm))  {
> > > +				err =
> > > drm_gpusvm_evict_to_sram(gpusvm, range);
> > > +				goto err_mmput;
> > > +			}
> > > +		} else {
> > > +			mmap_read_lock(mm);
> > > +		}
> > > +	}
> > > +
> > > +	mmap_assert_locked(mm);
> > > +
> > > +	/*
> > > +	 * Loop required to find all VMA area structs for the
> > > corner
> > > case when
> > > +	 * VRAM backing has been partially unmapped from MM's
> > > address space.
> > > +	 */
> > > +again:
> > > +	vas = find_vma(mm, start);
> > > +	if (!vas) {
> > > +		if (!retry)
> > > +			err = -ENOENT;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	if (end <= vas->vm_start || start >= vas->vm_end) {
> > > +		if (!retry)
> > > +			err = -EINVAL;
> > > +		goto err_mmunlock;
> > > +	}
> > > +
> > > +	err = __drm_gpusvm_migrate_to_sram(gpusvm, vas, NULL,
> > > start,
> > > end);
> > 
> > This function is typically called from the vm side to get a clean
> > mm as a last resort after get_pages() fails. As such, should we have
> > it evict *everything*, even foreign device memory and mismatching
> > local device pages? If so, we could use hmm_range_fault() with a
> > NULL page owner + faulting to do that.
> > 
> 
> I've actually tried that and it seemed to mostly work well; it would
> be my preference as this avoids a VMA lookup in GPU SVM.
> 
> I think it is a problem though if some of the pages are partially
> unmapped, as hmm_range_fault will abort if a fault cannot be resolved.
> Maybe
> I'm mistaken on this. I won't get this in rev2 but will put this on
> my
> list to continue to play around with.

OK. Presumably if faulting fails we should try a narrower range unless
the page actually hitting the gpu pagefault is unmapped, to ensure we
make progress rather than aborting?
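
Roughly what I had in mind, as a sketch only (driver_hmm_fault_range()
is a hypothetical helper wrapping hmm_range_fault() with a NULL
dev_private_owner; it is not part of this patch): walk successively
smaller aligned windows around the faulting address so we always make
some progress, bottoming out at the single page that triggered the GPU
fault:

static int driver_evict_around_fault(struct drm_gpusvm *gpusvm,
				     struct drm_gpusvm_range *range,
				     u64 fault_addr)
{
	u64 size = range->va.end - range->va.start;
	u64 start, end;
	int err = -EFAULT;

	for (; size >= PAGE_SIZE; size >>= 1) {
		/* Aligned window of 'size' bytes containing fault_addr */
		start = max(range->va.start, ALIGN_DOWN(fault_addr, size));
		end = min(range->va.end, start + size);

		/* Assumed helper: hmm_range_fault() with a NULL owner */
		err = driver_hmm_fault_range(gpusvm, start, end);
		if (err != -EFAULT)
			break;
	}

	return err;
}

If even the single-page window still faults, the page hitting the GPU
fault is itself unmapped and the error has to be reported back to the
fault handler.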


> 
> > > +	if (err)
> > > +		goto err_mmunlock;
> > > +
> > > +	if (vas->vm_end < end) {
> > > +		retry = true;
> > > +		start = vas->vm_end;
> > > +		goto again;
> > > +	}
> > > +
> > > +	if (!ctx->mmap_locked) {
> > > +		mmap_read_unlock(mm);
> > > +		/*
> > > +		 * Using mmput_async as this function can be
> > > called
> > > while
> > > +		 * holding a dma-resv lock, and a final put can
> > > grab
> > > the mmap
> > > +		 * lock, causing a lock inversion.
> > > +		 */
> > > +		mmput_async(mm);
> > > +	}
> > > +
> > > +	return 0;
> > > +
> > > +err_mmunlock:
> > > +	if (!ctx->mmap_locked)
> > > +		mmap_read_unlock(mm);
> > > +err_mmput:
> > > +	if (!ctx->mmap_locked)
> > > +		mmput_async(mm);
> > > +err_out:
> > > +	return err;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_page_free - Put GPU SVM zone device data
> > > associated
> > > with a page
> > > + * @page: Pointer to the page
> > > + *
> > > + * This function is a callback used to put the GPU SVM zone
> > > device
> > > data
> > > + * associated with a page when it is being released.
> > > + */
> > > +static void drm_gpusvm_page_free(struct page *page)
> > > +{
> > > +	drm_gpusvm_zdd_put(page->zone_device_data);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpusvm_migrate_to_ram - Migrate GPU SVM range to RAM
> > > (page
> > > fault handler)
> > > + * @vmf: Pointer to the fault information structure
> > > + *
> > > + * This function is a page fault handler used to migrate a GPU
> > > SVM
> > > range to RAM.
> > > + * It retrieves the GPU SVM range information from the faulting
> > > page
> > > and invokes
> > > + * the internal migration function to migrate the range back to
> > > RAM.
> > > + *
> > > + * Returns:
> > > + * VM_FAULT_SIGBUS on failure, 0 on success.
> > > + */
> > > +static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault
> > > *vmf)
> > > +{
> > > +	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
> > > +	int err;
> > > +
> > > +	err = __drm_gpusvm_migrate_to_sram(zdd->range->gpusvm,
> > > +					   vmf->vma, vmf->page,
> > > +					   zdd->range->va.start,
> > > +					   zdd->range->va.end);
> > 
> > When called from here, since this is a pagemap op, we should ensure
> > we
> > only migrate our own pagemap to RAM?
> > 
> 
> I think you resolve this with the following patch [1], right? I
> think I agree.

It doesn't fully resolve it, but adds the capability to do more
specific filtering. Another option would be to use the pagemap ptr
rather than the device ptr as device_private owner, but that would OTOH
require a wider filtering in hmm_range_fault() so that (or a similar)
patch would be needed anyway.
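
For the migrate_to_ram path specifically, the filtering could be as
simple as the sketch below (hypothetical; it assumes the driver can get
at the struct dev_pagemap it owns, e.g. via the zdd or the gpusvm,
which the current patch doesn't expose). Run after migrate_vma_setup()
and before the SRAM destination pages are populated, it clears
MIGRATE_PFN_MIGRATE for device-private pages belonging to a foreign
pagemap so they are neither copied nor migrated and
migrate_vma_finalize() restores them untouched:

static void driver_skip_foreign_pages(struct dev_pagemap *our_pagemap,
				      unsigned long *src_mpfn,
				      unsigned long npages)
{
	unsigned long i;

	for (i = 0; i < npages; ++i) {
		struct page *page = migrate_pfn_to_page(src_mpfn[i]);

		if (!page || !is_device_private_page(page))
			continue;

		/* Foreign device-private page: leave it where it is */
		if (page->pgmap != our_pagemap)
			src_mpfn[i] &= ~MIGRATE_PFN_MIGRATE;
	}
}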

Thanks,
Thomas

> 
> Matt
> 
> [1] https://patchwork.freedesktop.org/series/139994/
> 
> > /Thanks,
> > Thomas
> >
Matthew Brost Oct. 16, 2024, 8:24 a.m. UTC | #44
On Wed, Oct 16, 2024 at 08:27:51AM +0200, Thomas Hellström wrote:
> On Wed, 2024-10-16 at 03:18 +0000, Matthew Brost wrote:
> > On Wed, Oct 09, 2024 at 12:50:42PM +0200, Thomas Hellström wrote:
> > > Hi, Matthew.
> > > 
> > > Some comments below around migrating to SRAM.
> > > 
> > > 
> > > On Tue, 2024-08-27 at 19:48 -0700, Matthew Brost wrote:
> > > > This patch introduces support for GPU Shared Virtual Memory (SVM)
> > > > in
> > > > the
> > > > Direct Rendering Manager (DRM) subsystem. SVM allows for seamless
> > > > sharing of memory between the CPU and GPU, enhancing performance
> > > > and
> > > > flexibility in GPU computing tasks.
> > > > 
> > > > The patch adds the necessary infrastructure for SVM, including
> > > > data
> > > > structures and functions for managing SVM ranges and notifiers.
> > > > It
> > > > also
> > > > provides mechanisms for allocating, deallocating, and migrating
> > > > memory
> > > > regions between system RAM and GPU VRAM.
> > > > 
> > > > This mid-layer is largely inspired by GPUVM.
> > > > 
> > > > Cc: Dave Airlie <airlied@redhat.com>
> > > > Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > > > Cc: Christian König <christian.koenig@amd.com>
> > > > Cc: <dri-devel@lists.freedesktop.org>
> > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > > ---
> > > >  drivers/gpu/drm/xe/Makefile     |    3 +-
> > > >  drivers/gpu/drm/xe/drm_gpusvm.c | 2174
> > > > +++++++++++++++++++++++++++++++
> > > >  drivers/gpu/drm/xe/drm_gpusvm.h |  415 ++++++
> > > >  3 files changed, 2591 insertions(+), 1 deletion(-)
> > > >  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.c
> > > >  create mode 100644 drivers/gpu/drm/xe/drm_gpusvm.h
> > > > 
> > > > diff --git a/drivers/gpu/drm/xe/Makefile
> > > > b/drivers/gpu/drm/xe/Makefile
> > > > index b9670ae09a9e..b8fc2ee58f1a 100644
> > > > --- a/drivers/gpu/drm/xe/Makefile
> > > > +++ b/drivers/gpu/drm/xe/Makefile
> > > > @@ -25,7 +25,8 @@ $(obj)/generated/%_wa_oob.c
> > > > $(obj)/generated/%_wa_oob.h: $(obj)/xe_gen_wa_oob \
> > > >  
> > > >  # core driver code
> > > >  
> > > > -xe-y += xe_bb.o \
> > > > +xe-y += drm_gpusvm.o \
> > > > +	xe_bb.o \
> > > >  	xe_bo.o \
> > > >  	xe_bo_evict.o \
> > > >  	xe_devcoredump.o \
> > > > diff --git a/drivers/gpu/drm/xe/drm_gpusvm.c
> > > > b/drivers/gpu/drm/xe/drm_gpusvm.c
> > > > new file mode 100644
> > > > index 000000000000..fc1e44e6ae72
> > > > --- /dev/null
> > > > +++ b/drivers/gpu/drm/xe/drm_gpusvm.c
> > > > @@ -0,0 +1,2174 @@
> > > > +// SPDX-License-Identifier: MIT
> > > > +/*
> > > > + * Copyright © 2024 Intel Corporation
> > > > + *
> > > > + * Authors:
> > > > + *     Matthew Brost <matthew.brost@intel.com>
> > > > + */
> > > > +
> > > > +#include <linux/dma-mapping.h>
> > > > +#include <linux/interval_tree_generic.h>
> > > > +#include <linux/hmm.h>
> > > > +#include <linux/memremap.h>
> > > > +#include <linux/migrate.h>
> > > > +#include <linux/mm_types.h>
> > > > +#include <linux/pagemap.h>
> > > > +#include <linux/slab.h>
> > > > +
> > > > +#include <drm/drm_device.h>
> > > > +#include "drm_gpusvm.h"
> > > > +
> > > > +/**
> > > > + * DOC: Overview
> > > > + *
> > > > + * GPU Shared Virtual Memory (GPU SVM) layer for the Direct
> > > > Rendering Manager (DRM)
> > > > + *
> > > > + * The GPU SVM layer is a component of the DRM framework
> > > > designed to
> > > > manage shared
> > > > + * virtual memory between the CPU and GPU. It enables efficient
> > > > data
> > > > exchange and
> > > > + * processing for GPU-accelerated applications by allowing
> > > > memory
> > > > sharing and
> > > > + * synchronization between the CPU's and GPU's virtual address
> > > > spaces.
> > > > + *
> > > > + * Key GPU SVM Components:
> > > > + * - Notifiers: Used for tracking memory intervals and notifying the
> > > > + *		GPU of changes, notifiers are sized based on a
> > > > GPU
> > > > SVM
> > > > + *		initialization parameter, with a recommendation
> > > > of
> > > > 512M or
> > > > + *		larger. They maintain a Red-Black tree and a list of ranges that
> > > > + *		fall within the notifier interval. Notifiers are
> > > > tracked within
> > > > + *		a GPU SVM Red-Black tree and list and are
> > > > dynamically inserted
> > > > + *		or removed as ranges within the interval are
> > > > created
> > > > or
> > > > + *		destroyed.
> > > > + * - Ranges: Represent memory ranges mapped in a DRM device and
> > > > managed
> > > > + *	     by GPU SVM. They are sized based on an array of
> > > > chunk
> > > > sizes, which
> > > > + *	     is a GPU SVM initialization parameter, and the CPU
> > > > address space.
> > > > + *	     Upon GPU fault, the largest aligned chunk that fits
> > > > within the
> > > > + *	     faulting CPU address space is chosen for the range
> > > > size. Ranges are
> > > > + *	     expected to be dynamically allocated on GPU fault
> > > > and
> > > > removed on an
> > > > + *	     MMU notifier UNMAP event. As mentioned above,
> > > > ranges
> > > > are tracked in
> > > > + *	     a notifier's Red-Black tree.
> > > > + * - Operations: Define the interface for driver-specific SVM
> > > > operations such as
> > > > + *		 allocation, page collection, migration,
> > > > invalidations, and VRAM
> > > > + *		 release.
> > > > + *
> > > > + * This layer provides interfaces for allocating, mapping,
> > > > migrating, and
> > > > + * releasing memory ranges between the CPU and GPU. It handles
> > > > all
> > > > core memory
> > > > + * management interactions (DMA mapping, HMM, and migration) and
> > > > provides
> > > > + * driver-specific virtual functions (vfuncs). This
> > > > infrastructure
> > > > is sufficient
> > > > + * to build the expected driver components for an SVM
> > > > implementation
> > > > as detailed
> > > > + * below.
> > > > + *
> > > > + * Expected Driver Components:
> > > > + * - GPU page fault handler: Used to create ranges and notifiers
> > > > based on the
> > > > + *			     fault address, optionally migrate
> > > > the
> > > > range to
> > > > + *			     VRAM, and create GPU bindings.
> > > > + * - Garbage collector: Used to destroy GPU bindings for ranges.
> > > > Ranges are
> > > > + *			expected to be added to the garbage
> > > > collector upon
> > > > + *			MMU_NOTIFY_UNMAP event.
> > > > + */
> > > > +
> > > > +/**
> > > > + * DOC: Locking
> > > > + *
> > > > + * GPU SVM handles locking for core MM interactions, i.e., it
> > > > locks/unlocks the
> > > > + * mmap lock as needed. Alternatively, if the driver prefers to
> > > > handle the mmap
> > > > + * lock itself, a 'locked' argument is provided to the functions
> > > > that require
> > > > + * the mmap lock. This option may be useful for drivers that
> > > > need to
> > > > call into
> > > > + * GPU SVM while also holding a dma-resv lock, thus preventing
> > > > locking
> > > > + * inversions between the mmap and dma-resv locks.
> > > > + *
> > > > + * GPU SVM introduces a global notifier lock, which safeguards
> > > > the
> > > > notifier's
> > > > + * range RB tree and list, as well as the range's DMA mappings
> > > > and
> > > > sequence
> > > > + * number. GPU SVM manages all necessary locking and unlocking
> > > > operations,
> > > > + * except for the recheck of the range's sequence number
> > > > + * (mmu_interval_read_retry) when the driver is committing GPU
> > > > bindings. This
> > > > + * lock corresponds to the 'driver->update' lock mentioned in
> > > > the
> > > > HMM
> > > > + * documentation (TODO: Link). Future revisions may transition
> > > > from
> > > > a GPU SVM
> > > > + * global lock to a per-notifier lock if finer-grained locking
> > > > is
> > > > deemed
> > > > + * necessary.
> > > > + *
> > > > + * In addition to the locking mentioned above, the driver should
> > > > implement a
> > > > + * lock to safeguard core GPU SVM function calls that modify
> > > > state,
> > > > such as
> > > > + * drm_gpusvm_range_find_or_insert and drm_gpusvm_range_remove.
> > > > Alternatively,
> > > > + * these core functions can be called within a single kernel
> > > > thread,
> > > > for
> > > > + * instance, using an ordered work queue. This lock is denoted
> > > > as
> > > > + * 'driver_svm_lock' in code examples.
> > > > + */
> > > > +
> > > > +/**
> > > > + * DOC: Migration
> > > > + *
> > > > + * The migration support is quite simple, allowing migration
> > > > between
> > > > SRAM and
> > > > + * VRAM at the range granularity. For example, GPU SVM currently
> > > > does not
> > > > + * support mixing SRAM and VRAM pages within a range. This means
> > > > that upon GPU
> > > > + * fault, the entire range can be migrated to VRAM, and upon CPU
> > > > fault, the
> > > > + * entire range is migrated to SRAM.
> > > > + *
> > > > + * The reasoning for only supporting range granularity is as
> > > > follows: it
> > > > + * simplifies the implementation, and range sizes are driver-
> > > > defined
> > > > and should
> > > > + * be relatively small.
> > > > + */
> > > > +
> > > > +/**
> > > > + * DOC: Partial Unmapping of Ranges
> > > > + *
> > > > + * Partial unmapping of ranges (e.g., 1M out of 2M is unmapped
> > > > by
> > > > CPU resulting
> > > > + * in MMU_NOTIFY_UNMAP event) presents several challenges, with
> > > > the
> > > > main one
> > > > + * being that a subset of the range still has CPU and GPU
> > > > mappings.
> > > > If the
> > > > + * backing store for the range is in VRAM, a subset of the
> > > > backing
> > > > store has
> > > > + * references. One option would be to split the range and VRAM
> > > > backing store,
> > > > + * but the implementation for this would be quite complicated.
> > > > Given
> > > > that
> > > > + * partial unmappings are rare and driver-defined range sizes
> > > > are
> > > > relatively
> > > > + * small, GPU SVM does not support splitting of ranges.
> > > > + *
> > > > + * With no support for range splitting, upon partial unmapping
> > > > of a
> > > > range, the
> > > > + * driver is expected to invalidate and destroy the entire
> > > > range. If
> > > > the range
> > > > + * has VRAM as its backing, the driver is also expected to
> > > > migrate
> > > > any remaining
> > > > + * pages back to SRAM.
> > > > + */
> > > > +
> > > > +/**
> > > > + * DOC: Examples
> > > > + *
> > > > + * This section provides two examples of how to build the
> > > > expected
> > > > driver
> > > > + * components: the GPU page fault handler and the garbage
> > > > collector.
> > > > A third
> > > > + * example demonstrates a sample invalidation driver vfunc.
> > > > + *
> > > > + * The generic code provided does not include logic for complex
> > > > migration
> > > > + * policies, optimized invalidations, or other potentially
> > > > required
> > > > driver
> > > > + * locking (e.g., DMA-resv locks).
> > > > + *
> > > > + * 1) GPU page fault handler
> > > > + *
> > > > + *	int driver_bind_range(struct drm_gpusvm *gpusvm, struct
> > > > drm_gpusvm_range *range)
> > > > + *	{
> > > > + *		int err = 0;
> > > > + *
> > > > + *		driver_alloc_and_setup_memory_for_bind(gpusvm,
> > > > range);
> > > > + *
> > > > + *		drm_gpusvm_notifier_lock(gpusvm);
> > > > + *		if (drm_gpusvm_range_pages_valid(range))
> > > > + *			driver_commit_bind(gpusvm, range);
> > > > + *		else
> > > > + *			err = -EAGAIN;
> > > > + *		drm_gpusvm_notifier_unlock(gpusvm);
> > > > + *
> > > > + *		return err;
> > > > + *	}
> > > > + *
> > > > + *	int driver_gpu_fault(struct drm_gpusvm *gpusvm, u64
> > > > fault_addr,
> > > > + *			     u64 gpuva_start, u64 gpuva_end)
> > > > + *	{
> > > > + *		struct drm_gpusvm_ctx ctx = {};
> > > > + *		int err;
> > > > + *
> > > > + *		driver_svm_lock();
> > > > + *	retry:
> > > > + *		// Always process UNMAPs first so view of GPU
> > > > SVM
> > > > ranges is current
> > > > + *		driver_garbage_collector(gpusvm);
> > > > + *
> > > > + *		range = drm_gpusvm_range_find_or_insert(gpusvm, fault_addr,
> > > > + *							gpuva_start, gpuva_end,
> > > > + *							&ctx);
> > > > + *		if (IS_ERR(range)) {
> > > > + *			err = PTR_ERR(range);
> > > > + *			goto unlock;
> > > > + *		}
> > > > + *
> > > > + *		if (driver_migration_policy(range)) {
> > > > + *			bo = driver_alloc_bo();
> > > > + *			err = drm_gpusvm_migrate_to_vram(gpusvm,
> > > > range, bo, &ctx);
> > > > + *			if (err)	// CPU mappings may have
> > > > changed
> > > > + *				goto retry;
> > > > + *		}
> > > > + *
> > > > + *		err = drm_gpusvm_range_get_pages(gpusvm, range,
> > > > &ctx);
> > > > + *		if (err == -EFAULT || err == -EPERM)	// CPU
> > > > mappings changed
> > > > + *			goto retry;
> > > > + *		else if (err)
> > > > + *			goto unlock;
> > > > + *
> > > > + *		err = driver_bind_range(gpusvm, range);
> > > > + *		if (err == -EAGAIN)	// CPU mappings changed
> > > > + *			goto retry;
> > > > + *
> > > > + *	unlock:
> > > > + *		driver_svm_unlock();
> > > > + *		return err;
> > > > + *	}
> > > > + *
> > > > + * 2) Garbage Collector.
> > > > + *
> > > > + *	void __driver_garbage_collector(struct drm_gpusvm *gpusvm,
> > > > + *					struct drm_gpusvm_range *range)
> > > > + *	{
> > > > + *		struct drm_gpusvm_ctx ctx = {};
> > > > + *
> > > > + *		assert_driver_svm_locked(gpusvm);
> > > > + *
> > > > + *		// Partial unmap, migrate any remaining VRAM pages back to SRAM
> > > > + *		if (range->flags.partial_unmap)
> > > > + *			drm_gpusvm_migrate_to_sram(gpusvm, range, &ctx);
> > > > + *
> > > > + *		driver_unbind_range(range);
> > > > + *		drm_gpusvm_range_remove(gpusvm, range);
> > > > + *	}
> > > > + *
> > > > + *	void driver_garbage_collector(struct drm_gpusvm *gpusvm)
> > > > + *	{
> > > > + *		assert_driver_svm_locked(gpusvm);
> > > > + *
> > > > + *		for_each_range_in_garbage_collector(gpusvm, range)
> > > > + *			__driver_garbage_collector(gpusvm, range);
> > > > + *	}
> > > > + *
> > > > + * 3) Invalidation driver vfunc.
> > > > + *
> > > > + *	void driver_invalidation(struct drm_gpusvm *gpusvm,
> > > > + *				 struct drm_gpusvm_notifier *notifier,
> > > > + *				 const struct mmu_notifier_range *mmu_range)
> > > > + *	{
> > > > + *		struct drm_gpusvm_ctx ctx = { .in_notifier = true, };
> > > > + *		struct drm_gpusvm_range *range = NULL;
> > > > + *
> > > > + *		driver_invalidate_device_tlb(gpusvm, mmu_range->start, mmu_range->end);
> > > > + *
> > > > + *		drm_gpusvm_for_each_range(range, notifier, mmu_range->start,
> > > > + *					  mmu_range->end) {
> > > > + *			drm_gpusvm_range_unmap_pages(gpusvm, range, &ctx);
> > > > + *
> > > > + *			if (mmu_range->event != MMU_NOTIFY_UNMAP)
> > > > + *				continue;
> > > > + *
> > > > + *			drm_gpusvm_range_set_unmapped(range, mmu_range);
> > > > + *			driver_garbage_collector_add(gpusvm, range);
> > > > + *		}
> > > > + *	}
> > > > + */
> > > > +
> > > > +#define DRM_GPUSVM_RANGE_START(_range)	((_range)->va.start)
> > > > +#define DRM_GPUSVM_RANGE_END(_range)	((_range)->va.end - 1)
> > > > +INTERVAL_TREE_DEFINE(struct drm_gpusvm_range, rb.node, u64, rb.__subtree_last,
> > > > +		     DRM_GPUSVM_RANGE_START, DRM_GPUSVM_RANGE_END,
> > > > +		     static __maybe_unused, range);
> > > > +
> > > > +#define DRM_GPUSVM_NOTIFIER_START(_notifier)	((_notifier)->interval.start)
> > > > +#define DRM_GPUSVM_NOTIFIER_END(_notifier)	((_notifier)->interval.end - 1)
> > > > +INTERVAL_TREE_DEFINE(struct drm_gpusvm_notifier, rb.node, u64,
> > > > +		     rb.__subtree_last, DRM_GPUSVM_NOTIFIER_START,
> > > > +		     DRM_GPUSVM_NOTIFIER_END, static __maybe_unused, notifier);
> > > > +
> > > > +/**
> > > > + * npages_in_range() - Calculate the number of pages in a given range
> > > > + * @start__: The start address of the range
> > > > + * @end__: The end address of the range
> > > > + *
> > > > + * This macro calculates the number of pages in a given memory range,
> > > > + * specified by the start and end addresses. It divides the difference
> > > > + * between the end and start addresses by the page size (PAGE_SIZE) to
> > > > + * determine the number of pages in the range.
> > > > + *
> > > > + * Return: The number of pages in the specified range.
> > > > + */
> > > > +#define npages_in_range(start__, end__)	\
> > > > +	(((end__) - (start__)) >> PAGE_SHIFT)
> > > > +
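
A quick sanity check of the arithmetic here, assuming start/end are
PAGE_SIZE aligned as the rest of the code expects (values are hypothetical):

	/* With 4 KiB pages (PAGE_SHIFT == 12): (0x5000 - 0x1000) >> 12 == 4 */
	unsigned long npages = npages_in_range(0x1000, 0x5000);
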
> > > > +/**
> > > > + * struct drm_gpusvm_zdd - GPU SVM zone device data
> > > > + *
> > > > + * @refcount: Reference count for the zdd
> > > > + * @destroy_work: Work structure for asynchronous zdd destruction
> > > > + * @range: Pointer to the GPU SVM range
> > > > + * @vram_allocation: Driver-private pointer to the VRAM allocation
> > > > + *
> > > > + * This structure serves as a generic wrapper installed in
> > > > + * page->zone_device_data. It provides infrastructure for looking up a range
> > > > + * upon CPU page fault and asynchronously releasing VRAM once the CPU has no
> > > > + * page references. Asynchronous release is useful because CPU page references
> > > > + * can be dropped in IRQ contexts, while releasing VRAM likely requires sleeping
> > > > + * locks.
> > > > + */
> > > > +struct drm_gpusvm_zdd {
> > > > +	struct kref refcount;
> > > > +	struct work_struct destroy_work;
> > > > +	struct drm_gpusvm_range *range;
> > > > +	void *vram_allocation;
> > > > +};
> > > > +
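
For readers following the lookup path this wrapper enables: the CPU fault
handling later in this file amounts to the following, sketched here as a
hypothetical helper (driver_page_to_range() is not part of the patch):

	static struct drm_gpusvm_range *
	driver_page_to_range(struct page *page)
	{
		/* zdd was installed in page->zone_device_data at migration time */
		struct drm_gpusvm_zdd *zdd = page->zone_device_data;

		return zdd->range;
	}
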
> > > > +/**
> > > > + * drm_gpusvm_zdd_destroy_work_func - Work function for
> > > > destroying a
> > > > zdd
> > > > + * @w: Pointer to the work_struct
> > > > + *
> > > > + * This function releases VRAM, puts GPU SVM range, and frees
> > > > zdd.
> > > > + */
> > > > +static void drm_gpusvm_zdd_destroy_work_func(struct work_struct
> > > > *w)
> > > > +{
> > > > +	struct drm_gpusvm_zdd *zdd =
> > > > +		container_of(w, struct drm_gpusvm_zdd,
> > > > destroy_work);
> > > > +	struct drm_gpusvm_range *range = zdd->range;
> > > > +	struct drm_gpusvm *gpusvm = range->gpusvm;
> > > > +
> > > > +	if (gpusvm->ops->vram_release && zdd->vram_allocation)
> > > > +		gpusvm->ops->vram_release(zdd->vram_allocation);
> > > > +	drm_gpusvm_range_put(range);
> > > > +	kfree(zdd);
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_zdd_alloc - Allocate a zdd structure.
> > > > + * @range: Pointer to the GPU SVM range.
> > > > + *
> > > > + * This function allocates and initializes a new zdd structure.
> > > > It
> > > > sets up the
> > > > + * reference count, initializes the destroy work, and links the
> > > > provided GPU SVM
> > > > + * range.
> > > > + *
> > > > + * Returns:
> > > > + * Pointer to the allocated zdd on success, NULL on failure.
> > > > + */
> > > > +static struct drm_gpusvm_zdd *
> > > > +drm_gpusvm_zdd_alloc(struct drm_gpusvm_range *range)
> > > > +{
> > > > +	struct drm_gpusvm_zdd *zdd;
> > > > +
> > > > +	zdd = kmalloc(sizeof(*zdd), GFP_KERNEL);
> > > > +	if (!zdd)
> > > > +		return NULL;
> > > > +
> > > > +	kref_init(&zdd->refcount);
> > > > +	INIT_WORK(&zdd->destroy_work,
> > > > drm_gpusvm_zdd_destroy_work_func);
> > > > +	zdd->range = drm_gpusvm_range_get(range);
> > > > +	zdd->vram_allocation = NULL;
> > > > +
> > > > +	return zdd;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_zdd_get - Get a reference to a zdd structure.
> > > > + * @zdd: Pointer to the zdd structure.
> > > > + *
> > > > + * This function increments the reference count of the provided
> > > > zdd
> > > > structure.
> > > > + *
> > > > + * Returns: Pointer to the zdd structure.
> > > > + */
> > > > +static struct drm_gpusvm_zdd *drm_gpusvm_zdd_get(struct
> > > > drm_gpusvm_zdd *zdd)
> > > > +{
> > > > +	kref_get(&zdd->refcount);
> > > > +	return zdd;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_zdd_destroy - Destroy a zdd structure.
> > > > + * @ref: Pointer to the reference count structure.
> > > > + *
> > > > + * This function queues the destroy_work of the zdd for
> > > > asynchronous
> > > > destruction.
> > > > + */
> > > > +static void drm_gpusvm_zdd_destroy(struct kref *ref)
> > > > +{
> > > > +	struct drm_gpusvm_zdd *zdd =
> > > > +		container_of(ref, struct drm_gpusvm_zdd,
> > > > refcount);
> > > > +	struct drm_gpusvm *gpusvm = zdd->range->gpusvm;
> > > > +
> > > > +	queue_work(gpusvm->zdd_wq, &zdd->destroy_work);
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_zdd_put - Put a zdd reference.
> > > > + * @zdd: Pointer to the zdd structure.
> > > > + *
> > > > + * This function decrements the reference count of the provided
> > > > zdd
> > > > structure
> > > > + * and schedules its destruction if the count drops to zero.
> > > > + */
> > > > +static void drm_gpusvm_zdd_put(struct drm_gpusvm_zdd *zdd)
> > > > +{
> > > > +	kref_put(&zdd->refcount, drm_gpusvm_zdd_destroy);
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_range_find - Find GPU SVM range from GPU SVM
> > > > notifier
> > > > + * @notifier: Pointer to the GPU SVM notifier structure.
> > > > + * @start: Start address of the range
> > > > + * @end: End address of the range
> > > > + *
> > > > + * Return: A pointer to the drm_gpusvm_range if found or NULL
> > > > + */
> > > > +struct drm_gpusvm_range *
> > > > +drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64
> > > > start, u64 end)
> > > > +{
> > > > +	return range_iter_first(&notifier->root, start, end -
> > > > 1);
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_for_each_range_safe - Safely iterate over GPU SVM
> > > > ranges in a notifier
> > > > + * @range__: Iterator variable for the ranges
> > > > + * @next__: Iterator variable for the ranges temporary storage
> > > > + * @notifier__: Pointer to the GPU SVM notifier
> > > > + * @start__: Start address of the range
> > > > + * @end__: End address of the range
> > > > + *
> > > > + * This macro is used to iterate over GPU SVM ranges in a
> > > > notifier
> > > > while
> > > > + * removing ranges from it.
> > > > + */
> > > > +#define drm_gpusvm_for_each_range_safe(range__, next__,
> > > > notifier__,
> > > > start__, end__)	\
> > > > +	for ((range__) = drm_gpusvm_range_find((notifier__),
> > > > (start__), (end__)),	\
> > > > +	     (next__) =
> > > > __drm_gpusvm_range_next(range__);				\
> > > > +	     (range__) && (range__->va.start <
> > > > (end__));				\
> > > > +	     (range__) = (next__), (next__) =
> > > > __drm_gpusvm_range_next(range__))
> > > > +
> > > > +/**
> > > > + * __drm_gpusvm_notifier_next - get the next drm_gpusvm_notifier
> > > > in
> > > > the list
> > > > + * @notifier: a pointer to the current drm_gpusvm_notifier
> > > > + *
> > > > + * Return: A pointer to the next drm_gpusvm_notifier if
> > > > available,
> > > > or NULL if
> > > > + *         the current notifier is the last one or if the input
> > > > notifier is
> > > > + *         NULL.
> > > > + */
> > > > +static struct drm_gpusvm_notifier *
> > > > +__drm_gpusvm_notifier_next(struct drm_gpusvm_notifier *notifier)
> > > > +{
> > > > +	if (notifier && !list_is_last(&notifier->rb.entry,
> > > > +				      &notifier->gpusvm-
> > > > > notifier_list))
> > > > +		return list_next_entry(notifier, rb.entry);
> > > > +
> > > > +	return NULL;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_for_each_notifier - Iterate over GPU SVM notifiers
> > > > in
> > > > a gpusvm
> > > > + * @notifier__: Iterator variable for the notifiers
> > > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > > + * @start__: Start address of the notifier
> > > > + * @end__: End address of the notifier
> > > > + *
> > > > + * This macro is used to iterate over GPU SVM notifiers in a
> > > > gpusvm.
> > > > + */
> > > > +#define drm_gpusvm_for_each_notifier(notifier__, gpusvm__,
> > > > start__,
> > > > end__)		\
> > > > +	for ((notifier__) = notifier_iter_first(&(gpusvm__)-
> > > > >root,
> > > > (start__), (end__) - 1);	\
> > > > +	     (notifier__) && (notifier__->interval.start <
> > > > (end__));			\
> > > > +	     (notifier__) =
> > > > __drm_gpusvm_notifier_next(notifier__))
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_for_each_notifier_safe - Safely iterate over GPU
> > > > SVM
> > > > notifiers in a gpusvm
> > > > + * @notifier__: Iterator variable for the notifiers
> > > > + * @next__: Iterator variable for the notifiers temporary storage
> > > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > > + * @start__: Start address of the notifier
> > > > + * @end__: End address of the notifier
> > > > + *
> > > > + * This macro is used to iterate over GPU SVM notifiers in a
> > > > gpusvm
> > > > while
> > > > + * removing notifiers from it.
> > > > + */
> > > > +#define drm_gpusvm_for_each_notifier_safe(notifier__, next__,
> > > > gpusvm__, start__, end__)	\
> > > > +	for ((notifier__) = notifier_iter_first(&(gpusvm__)-
> > > > >root,
> > > > (start__), (end__) - 1),	\
> > > > +	     (next__) =
> > > > __drm_gpusvm_notifier_next(notifier__);				\
> > > > +	     (notifier__) && (notifier__->interval.start <
> > > > (end__));			\
> > > > +	     (notifier__) = (next__), (next__) =
> > > > __drm_gpusvm_notifier_next(notifier__))
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_notifier_invalidate - Invalidate a GPU SVM
> > > > notifier.
> > > > + * @mni: Pointer to the mmu_interval_notifier structure.
> > > > + * @mmu_range: Pointer to the mmu_notifier_range structure.
> > > > + * @cur_seq: Current sequence number.
> > > > + *
> > > > + * This function serves as a generic MMU notifier for GPU SVM.
> > > > It
> > > > sets the MMU
> > > > + * notifier sequence number and calls the driver invalidate
> > > > vfunc
> > > > under
> > > > + * gpusvm->notifier_lock.
> > > > + *
> > > > + * Returns:
> > > > + * true if the operation succeeds, false otherwise.
> > > > + */
> > > > +static bool
> > > > +drm_gpusvm_notifier_invalidate(struct mmu_interval_notifier
> > > > *mni,
> > > > +			       const struct mmu_notifier_range
> > > > *mmu_range,
> > > > +			       unsigned long cur_seq)
> > > > +{
> > > > +	struct drm_gpusvm_notifier *notifier =
> > > > +		container_of(mni, typeof(*notifier), notifier);
> > > > +	struct drm_gpusvm *gpusvm = notifier->gpusvm;
> > > > +
> > > > +	if (!mmu_notifier_range_blockable(mmu_range))
> > > > +		return false;
> > > > +
> > > > +	down_write(&gpusvm->notifier_lock);
> > > > +	mmu_interval_set_seq(mni, cur_seq);
> > > > +	gpusvm->ops->invalidate(gpusvm, notifier, mmu_range);
> > > > +	up_write(&gpusvm->notifier_lock);
> > > > +
> > > > +	return true;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_notifier_ops - MMU interval notifier operations
> > > > for
> > > > GPU SVM
> > > > + */
> > > > +static const struct mmu_interval_notifier_ops
> > > > drm_gpusvm_notifier_ops = {
> > > > +	.invalidate = drm_gpusvm_notifier_invalidate,
> > > > +};
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_init - Initialize the GPU SVM.
> > > > + * @gpusvm: Pointer to the GPU SVM structure.
> > > > + * @name: Name of the GPU SVM.
> > > > + * @drm: Pointer to the DRM device structure.
> > > > + * @mm: Pointer to the mm_struct for the address space.
> > > > + * @device_private_page_owner: Device private pages owner.
> > > > + * @mm_start: Start address of GPU SVM.
> > > > + * @mm_range: Range of the GPU SVM.
> > > > + * @notifier_size: Size of individual notifiers.
> > > > + * @ops: Pointer to the operations structure for GPU SVM.
> > > > + * @chunk_sizes: Pointer to the array of chunk sizes used in range allocation.
> > > > + *               Entries should be powers of 2 in descending order with last
> > > > + *               entry being SZ_4K.
> > > > + * @num_chunks: Number of chunks.
> > > > + *
> > > > + * This function initializes the GPU SVM.
> > > > + *
> > > > + * Returns:
> > > > + * 0 on success, a negative error code on failure.
> > > > + */
> > > > +int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
> > > > +		    const char *name, struct drm_device *drm,
> > > > +		    struct mm_struct *mm, void
> > > > *device_private_page_owner,
> > > > +		    u64 mm_start, u64 mm_range, u64
> > > > notifier_size,
> > > > +		    const struct drm_gpusvm_ops *ops,
> > > > +		    const u64 *chunk_sizes, int num_chunks)
> > > > +{
> > > > +	if (!ops->invalidate || !num_chunks)
> > > > +		return -EINVAL;
> > > > +
> > > > +	gpusvm->name = name;
> > > > +	gpusvm->drm = drm;
> > > > +	gpusvm->mm = mm;
> > > > +	gpusvm->device_private_page_owner =
> > > > device_private_page_owner;
> > > > +	gpusvm->mm_start = mm_start;
> > > > +	gpusvm->mm_range = mm_range;
> > > > +	gpusvm->notifier_size = notifier_size;
> > > > +	gpusvm->ops = ops;
> > > > +	gpusvm->chunk_sizes = chunk_sizes;
> > > > +	gpusvm->num_chunks = num_chunks;
> > > > +	gpusvm->zdd_wq = system_wq;
> > > > +
> > > > +	mmgrab(mm);
> > > > +	gpusvm->root = RB_ROOT_CACHED;
> > > > +	INIT_LIST_HEAD(&gpusvm->notifier_list);
> > > > +
> > > > +	init_rwsem(&gpusvm->notifier_lock);
> > > > +
> > > > +	fs_reclaim_acquire(GFP_KERNEL);
> > > > +	might_lock(&gpusvm->notifier_lock);
> > > > +	fs_reclaim_release(GFP_KERNEL);
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
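
As a rough sketch of a driver-side call under these rules (the ops table,
owner pointer, address range, and notifier size below are made-up
placeholders, not values from this series):

	static const u64 driver_chunk_sizes[] = { SZ_2M, SZ_64K, SZ_4K };

	err = drm_gpusvm_init(&vm->svm, "driver-svm", drm, current->mm,
			      driver_owner, 0, 1ull << 47 /* mm_range */,
			      SZ_512M /* notifier_size */, &driver_gpusvm_ops,
			      driver_chunk_sizes,
			      ARRAY_SIZE(driver_chunk_sizes));
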
> > > > +/**
> > > > + * drm_gpusvm_notifier_find - Find GPU SVM notifier
> > > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > > + * @fault_addr__: Fault address
> > > > + *
> > > > + * This macro finds the GPU SVM notifier associated with the
> > > > fault
> > > > address.
> > > > + *
> > > > + * Returns:
> > > > + * Pointer to the GPU SVM notifier on success, NULL otherwise.
> > > > + */
> > > > +#define drm_gpusvm_notifier_find(gpusvm__,
> > > > fault_addr__)	\
> > > > +	notifier_iter_first(&(gpusvm__)->root,
> > > > (fault_addr__),	\
> > > > +			    (fault_addr__ + 1))
> > > > +
> > > > +/**
> > > > + * to_drm_gpusvm_notifier - retrieve the container struct for a
> > > > given rbtree node
> > > > + * @node__: a pointer to the rbtree node embedded within a
> > > > drm_gpusvm_notifier struct
> > > > + *
> > > > + * Return: A pointer to the containing drm_gpusvm_notifier
> > > > structure.
> > > > + */
> > > > +#define
> > > > to_drm_gpusvm_notifier(__node)				\
> > > > +	container_of((__node), struct drm_gpusvm_notifier,
> > > > rb.node)
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_notifier_insert - Insert GPU SVM notifier
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > > + *
> > > > + * This function inserts the GPU SVM notifier into the GPU SVM
> > > > RB
> > > > tree and list.
> > > > + */
> > > > +static void drm_gpusvm_notifier_insert(struct drm_gpusvm
> > > > *gpusvm,
> > > > +				       struct
> > > > drm_gpusvm_notifier
> > > > *notifier)
> > > > +{
> > > > +	struct rb_node *node;
> > > > +	struct list_head *head;
> > > > +
> > > > +	notifier_insert(notifier, &gpusvm->root);
> > > > +
> > > > +	node = rb_prev(&notifier->rb.node);
> > > > +	if (node)
> > > > +		head = &(to_drm_gpusvm_notifier(node))-
> > > > >rb.entry;
> > > > +	else
> > > > +		head = &gpusvm->notifier_list;
> > > > +
> > > > +	list_add(&notifier->rb.entry, head);
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_notifier_remove - Remove GPU SVM notifier
> > > > + * @gpusvm__: Pointer to the GPU SVM structure
> > > > + * @notifier__: Pointer to the GPU SVM notifier structure
> > > > + *
> > > > + * This macro removes the GPU SVM notifier from the GPU SVM RB
> > > > tree
> > > > and list.
> > > > + */
> > > > +#define drm_gpusvm_notifier_remove(gpusvm__,
> > > > notifier__)	\
> > > > +	notifier_remove((notifier__), &(gpusvm__)-
> > > > >root);	\
> > > > +	list_del(&(notifier__)->rb.entry)
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_fini - Finalize the GPU SVM.
> > > > + * @gpusvm: Pointer to the GPU SVM structure.
> > > > + *
> > > > + * This function finalizes the GPU SVM by cleaning up any
> > > > remaining
> > > > ranges and
> > > > + * notifiers, and dropping a reference to struct MM.
> > > > + */
> > > > +void drm_gpusvm_fini(struct drm_gpusvm *gpusvm)
> > > > +{
> > > > +	struct drm_gpusvm_notifier *notifier, *next;
> > > > +
> > > > +	drm_gpusvm_for_each_notifier_safe(notifier, next,
> > > > gpusvm, 0,
> > > > LONG_MAX) {
> > > > +		struct drm_gpusvm_range *range, *__next;
> > > > +
> > > > +		/*
> > > > +		 * Remove notifier first to avoid racing with
> > > > any
> > > > invalidation
> > > > +		 */
> > > > +		mmu_interval_notifier_remove(&notifier-
> > > > >notifier);
> > > > +		notifier->flags.removed = true;
> > > > +
> > > > +		drm_gpusvm_for_each_range_safe(range, __next,
> > > > notifier, 0,
> > > > +					       LONG_MAX)
> > > > +			drm_gpusvm_range_remove(gpusvm, range);
> > > > +	}
> > > > +
> > > > +	mmdrop(gpusvm->mm);
> > > > +	WARN_ON(!RB_EMPTY_ROOT(&gpusvm->root.rb_root));
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_notifier_alloc - Allocate GPU SVM notifier
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @fault_addr: Fault address
> > > > + *
> > > > + * This function allocates and initializes the GPU SVM notifier
> > > > structure.
> > > > + *
> > > > + * Returns:
> > > > + * Pointer to the allocated GPU SVM notifier on success,
> > > > ERR_PTR()
> > > > on failure.
> > > > + */
> > > > +static struct drm_gpusvm_notifier *
> > > > +drm_gpusvm_notifier_alloc(struct drm_gpusvm *gpusvm, u64
> > > > fault_addr)
> > > > +{
> > > > +	struct drm_gpusvm_notifier *notifier;
> > > > +
> > > > +	if (gpusvm->ops->notifier_alloc)
> > > > +		notifier = gpusvm->ops->notifier_alloc();
> > > > +	else
> > > > +		notifier = kzalloc(sizeof(*notifier),
> > > > GFP_KERNEL);
> > > > +
> > > > +	if (!notifier)
> > > > +		return ERR_PTR(-ENOMEM);
> > > > +
> > > > +	notifier->gpusvm = gpusvm;
> > > > +	notifier->interval.start = ALIGN_DOWN(fault_addr,
> > > > gpusvm-
> > > > > notifier_size);
> > > > +	notifier->interval.end = ALIGN(fault_addr + 1, gpusvm-
> > > > > notifier_size);
> > > > +	INIT_LIST_HEAD(&notifier->rb.entry);
> > > > +	notifier->root = RB_ROOT_CACHED;
> > > > +	INIT_LIST_HEAD(&notifier->range_list);
> > > > +
> > > > +	return notifier;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_notifier_free - Free GPU SVM notifier
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > > + *
> > > > + * This function frees the GPU SVM notifier structure.
> > > > + */
> > > > +static void drm_gpusvm_notifier_free(struct drm_gpusvm *gpusvm,
> > > > +				     struct drm_gpusvm_notifier
> > > > *notifier)
> > > > +{
> > > > +	WARN_ON(!RB_EMPTY_ROOT(&notifier->root.rb_root));
> > > > +
> > > > +	if (gpusvm->ops->notifier_free)
> > > > +		gpusvm->ops->notifier_free(notifier);
> > > > +	else
> > > > +		kfree(notifier);
> > > > +}
> > > > +
> > > > +/**
> > > > + * to_drm_gpusvm_range - retrieve the container struct for a
> > > > given
> > > > rbtree node
> > > > + * @node__: a pointer to the rbtree node embedded within a
> > > > drm_gpusvm_range struct
> > > > + *
> > > > + * Return: A pointer to the containing drm_gpusvm_range
> > > > structure.
> > > > + */
> > > > +#define to_drm_gpusvm_range(node__)	\
> > > > +	container_of((node__), struct drm_gpusvm_range, rb.node)
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_range_insert - Insert GPU SVM range
> > > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > > + * @range: Pointer to the GPU SVM range structure
> > > > + *
> > > > + * This function inserts the GPU SVM range into the notifier RB
> > > > tree
> > > > and list.
> > > > + */
> > > > +static void drm_gpusvm_range_insert(struct drm_gpusvm_notifier
> > > > *notifier,
> > > > +				    struct drm_gpusvm_range
> > > > *range)
> > > > +{
> > > > +	struct rb_node *node;
> > > > +	struct list_head *head;
> > > > +
> > > > +	drm_gpusvm_notifier_lock(notifier->gpusvm);
> > > > +	range_insert(range, &notifier->root);
> > > > +
> > > > +	node = rb_prev(&range->rb.node);
> > > > +	if (node)
> > > > +		head = &(to_drm_gpusvm_range(node))->rb.entry;
> > > > +	else
> > > > +		head = &notifier->range_list;
> > > > +
> > > > +	list_add(&range->rb.entry, head);
> > > > +	drm_gpusvm_notifier_unlock(notifier->gpusvm);
> > > > +}
> > > > +
> > > > +/**
> > > > + * __drm_gpusvm_range_remove - Remove GPU SVM range
> > > > + * @notifier__: Pointer to the GPU SVM notifier structure
> > > > + * @range__: Pointer to the GPU SVM range structure
> > > > + *
> > > > + * This macro removes the GPU SVM range from the notifier RB
> > > > tree
> > > > and list.
> > > > + */
> > > > +#define __drm_gpusvm_range_remove(notifier__,
> > > > range__)		\
> > > > +	range_remove((range__), &(notifier__)-
> > > > >root);		\
> > > > +	list_del(&(range__)->rb.entry)
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_range_alloc - Allocate GPU SVM range
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > > + * @fault_addr: Fault address
> > > > + * @chunk_size: Chunk size
> > > > + * @migrate_vram: Flag indicating whether to migrate VRAM
> > > > + *
> > > > + * This function allocates and initializes the GPU SVM range
> > > > structure.
> > > > + *
> > > > + * Returns:
> > > > + * Pointer to the allocated GPU SVM range on success, ERR_PTR()
> > > > on
> > > > failure.
> > > > + */
> > > > +static struct drm_gpusvm_range *
> > > > +drm_gpusvm_range_alloc(struct drm_gpusvm *gpusvm,
> > > > +		       struct drm_gpusvm_notifier *notifier,
> > > > +		       u64 fault_addr, u64 chunk_size, bool
> > > > migrate_vram)
> > > > +{
> > > > +	struct drm_gpusvm_range *range;
> > > > +
> > > > +	if (gpusvm->ops->range_alloc)
> > > > +		range = gpusvm->ops->range_alloc(gpusvm);
> > > > +	else
> > > > +		range = kzalloc(sizeof(*range), GFP_KERNEL);
> > > > +
> > > > +	if (!range)
> > > > +		return ERR_PTR(-ENOMEM);
> > > > +
> > > > +	kref_init(&range->refcount);
> > > > +	range->gpusvm = gpusvm;
> > > > +	range->notifier = notifier;
> > > > +	range->va.start = ALIGN_DOWN(fault_addr, chunk_size);
> > > > +	range->va.end = ALIGN(fault_addr + 1, chunk_size);
> > > > +	INIT_LIST_HEAD(&range->rb.entry);
> > > > +	range->notifier_seq = LONG_MAX;
> > > > +	range->flags.migrate_vram = migrate_vram ? 1 : 0;
> > > > +
> > > > +	return range;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_check_pages - Check pages
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > > + * @start: Start address
> > > > + * @end: End address
> > > > + *
> > > > + * Check if pages between start and end have been faulted in on the CPU. Used to
> > > > + * prevent migration of pages without a CPU backing store.
> > > > + *
> > > > + * Returns:
> > > > + * True if pages have been faulted into CPU, False otherwise
> > > > + */
> > > > +static bool drm_gpusvm_check_pages(struct drm_gpusvm *gpusvm,
> > > > +				   struct drm_gpusvm_notifier
> > > > *notifier,
> > > > +				   u64 start, u64 end)
> > > > +{
> > > > +	struct hmm_range hmm_range = {
> > > > +		.default_flags = 0,
> > > > +		.notifier = &notifier->notifier,
> > > > +		.start = start,
> > > > +		.end = end,
> > > > +		.dev_private_owner = gpusvm-
> > > > > device_private_page_owner,
> > > > +	};
> > > > +	unsigned long timeout =
> > > > +		jiffies +
> > > > msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> > > > +	unsigned long *pfns;
> > > > +	unsigned long npages = npages_in_range(start, end);
> > > > +	int err, i;
> > > > +
> > > > +	mmap_assert_locked(gpusvm->mm);
> > > > +
> > > > +	pfns = kvmalloc_array(npages, sizeof(*pfns),
> > > > GFP_KERNEL);
> > > > +	if (!pfns)
> > > > +		return false;
> > > > +
> > > > +	hmm_range.notifier_seq =
> > > > mmu_interval_read_begin(&notifier-
> > > > > notifier);
> > > > +	hmm_range.hmm_pfns = pfns;
> > > > +
> > > > +	while (true) {
> > > > +		err = hmm_range_fault(&hmm_range);
> > > > +		if (err == -EBUSY) {
> > > > +			if (time_after(jiffies, timeout))
> > > > +				break;
> > > > +
> > > > +			hmm_range.notifier_seq =
> > > > mmu_interval_read_begin(&notifier->notifier);
> > > > +			continue;
> > > > +		}
> > > > +		break;
> > > > +	}
> > > > +	if (err)
> > > > +		goto err_free;
> > > > +
> > > > +	for (i = 0; i < npages; ++i) {
> > > > +		if (!(pfns[i] & HMM_PFN_VALID)) {
> > > > +			err = -EFAULT;
> > > > +			goto err_free;
> > > > +		}
> > > > +	}
> > > > +
> > > > +err_free:
> > > > +	kvfree(pfns);
> > > > +	return err ? false : true;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_range_chunk_size - Determine chunk size for GPU SVM range
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @notifier: Pointer to the GPU SVM notifier structure
> > > > + * @vas: Pointer to the virtual memory area structure
> > > > + * @fault_addr: Fault address
> > > > + * @gpuva_start: Start address of GPUVA which mirrors CPU
> > > > + * @gpuva_end: End address of GPUVA which mirrors CPU
> > > > + * @check_pages: Flag indicating whether to check pages
> > > > + *
> > > > + * This function determines the chunk size for the GPU SVM range based on the
> > > > + * fault address, GPU SVM chunk sizes, existing GPU SVM ranges, and the virtual
> > > > + * memory area boundaries.
> > > > + *
> > > > + * Returns:
> > > > + * Chunk size on success, LONG_MAX on failure.
> > > > + */
> > > > +static u64 drm_gpusvm_range_chunk_size(struct drm_gpusvm
> > > > *gpusvm,
> > > > +				       struct
> > > > drm_gpusvm_notifier
> > > > *notifier,
> > > > +				       struct vm_area_struct
> > > > *vas,
> > > > +				       u64 fault_addr, u64
> > > > gpuva_start,
> > > > +				       u64 gpuva_end, bool
> > > > check_pages)
> > > > +{
> > > > +	u64 start, end;
> > > > +	int i = 0;
> > > > +
> > > > +retry:
> > > > +	for (; i < gpusvm->num_chunks; ++i) {
> > > > +		start = ALIGN_DOWN(fault_addr, gpusvm-
> > > > > chunk_sizes[i]);
> > > > +		end = ALIGN(fault_addr + 1, gpusvm-
> > > > >chunk_sizes[i]);
> > > > +
> > > > +		if (start >= vas->vm_start && end <= vas->vm_end
> > > > &&
> > > > +		    start >= notifier->interval.start &&
> > > > +		    end <= notifier->interval.end &&
> > > > +		    start >= gpuva_start && end <= gpuva_end)
> > > > +			break;
> > > > +	}
> > > > +
> > > > +	if (i == gpusvm->num_chunks)
> > > > +		return LONG_MAX;
> > > > +
> > > > +	/*
> > > > +	 * If allocating more than a page, ensure the allocation does not
> > > > +	 * overlap with existing ranges.
> > > > +	 */
> > > > +	if (end - start != SZ_4K) {
> > > > +		struct drm_gpusvm_range *range;
> > > > +
> > > > +		range = drm_gpusvm_range_find(notifier, start,
> > > > end);
> > > > +		if (range) {
> > > > +			++i;
> > > > +			goto retry;
> > > > +		}
> > > > +
> > > > +		/*
> > > > +		 * XXX: Only create range on pages CPU has faulted in. Without
> > > > +		 * this check, or prefault, on BMG 'xe_exec_system_allocator --r
> > > > +		 * process-many-malloc' fails. In the failure case, each process
> > > > +		 * mallocs 16k but the CPU VMA is ~128k which results in 64k SVM
> > > > +		 * ranges. When migrating the SVM ranges, some processes fail in
> > > > +		 * drm_gpusvm_migrate_to_vram with 'migrate.cpages != npages'
> > > > +		 * and then upon drm_gpusvm_range_get_pages device pages from
> > > > +		 * other processes are collected + faulted in which creates all
> > > > +		 * sorts of problems. Unsure exactly how this is happening; the
> > > > +		 * problem also goes away if 'xe_exec_system_allocator --r
> > > > +		 * process-many-malloc' mallocs at least 64k at a time.
> > > > +		 */
> > > > +		if (check_pages &&
> > > > +		    !drm_gpusvm_check_pages(gpusvm, notifier,
> > > > start,
> > > > end)) {
> > > > +			++i;
> > > > +			goto retry;
> > > > +		}
> > > > +	}
> > > > +
> > > > +	return end - start;
> > > > +}
> > > > +
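
To make the selection above concrete with assumed inputs: with chunk_sizes =
{ SZ_2M, SZ_64K, SZ_4K } and fault_addr = 0x201000, the 2M candidate spans
[0x200000, 0x400000) and is only chosen if that whole span fits within the
VMA, the notifier interval, and the GPUVA window; otherwise the loop falls
through to the 64K candidate [0x200000, 0x210000) and finally to 4K. The
retry label additionally skips any multi-page candidate that overlaps an
existing range or, when check_pages is set, covers pages the CPU has not
faulted in.
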
> > > > +/**
> > > > + * drm_gpusvm_range_find_or_insert - Find or insert GPU SVM
> > > > range
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @fault_addr: Fault address
> > > > + * @gpuva_start: Start address of GPUVA which mirrors CPU
> > > > + * @gpuva_end: End address of GPUVA which mirrors CPU
> > > > + * @ctx: GPU SVM context
> > > > + *
> > > > + * This function finds an existing GPU SVM range, or inserts a newly allocated
> > > > + * one, based on the fault address. The caller must hold a lock to protect range
> > > > + * lookup and insertion.
> > > > + *
> > > > + * Returns:
> > > > + * Pointer to the GPU SVM range on success, ERR_PTR() on
> > > > failure.
> > > > + */
> > > > +struct drm_gpusvm_range *
> > > > +drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64
> > > > fault_addr,
> > > > +				u64 gpuva_start, u64 gpuva_end,
> > > > +				const struct drm_gpusvm_ctx
> > > > *ctx)
> > > > +{
> > > > +	struct drm_gpusvm_notifier *notifier;
> > > > +	struct drm_gpusvm_range *range;
> > > > +	struct mm_struct *mm = gpusvm->mm;
> > > > +	struct vm_area_struct *vas;
> > > > +	bool notifier_alloc = false;
> > > > +	u64 chunk_size;
> > > > +	int err;
> > > > +	bool migrate_vram;
> > > > +
> > > > +	if (fault_addr < gpusvm->mm_start ||
> > > > +	    fault_addr > gpusvm->mm_start + gpusvm->mm_range) {
> > > > +		err = -EINVAL;
> > > > +		goto err_out;
> > > > +	}
> > > > +
> > > > +	if (!ctx->mmap_locked) {
> > > > +		if (!mmget_not_zero(mm)) {
> > > > +			err = -EFAULT;
> > > > +			goto err_out;
> > > > +		}
> > > > +		mmap_write_lock(mm);
> > > > +	}
> > > > +
> > > > +	mmap_assert_write_locked(mm);
> > > > +
> > > > +	notifier = drm_gpusvm_notifier_find(gpusvm, fault_addr);
> > > > +	if (!notifier) {
> > > > +		notifier = drm_gpusvm_notifier_alloc(gpusvm,
> > > > fault_addr);
> > > > +		if (IS_ERR(notifier)) {
> > > > +			err = PTR_ERR(notifier);
> > > > +			goto err_mmunlock;
> > > > +		}
> > > > +		notifier_alloc = true;
> > > > +		err =
> > > > mmu_interval_notifier_insert_locked(&notifier-
> > > > > notifier,
> > > > +							  mm,
> > > > notifier->interval.start,
> > > > +							 
> > > > notifier-
> > > > > interval.end -
> > > > +							 
> > > > notifier-
> > > > > interval.start,
> > > > +							 
> > > > &drm_gpusvm_notifier_ops);
> > > > +		if (err)
> > > > +			goto err_notifier;
> > > > +	}
> > > > +
> > > > +	vas = vma_lookup(mm, fault_addr);
> > > > +	if (!vas) {
> > > > +		err = -ENOENT;
> > > > +		goto err_notifier_remove;
> > > > +	}
> > > > +
> > > > +	if (!ctx->read_only && !(vas->vm_flags & VM_WRITE)) {
> > > > +		err = -EPERM;
> > > > +		goto err_notifier_remove;
> > > > +	}
> > > > +
> > > > +	range = drm_gpusvm_range_find(notifier, fault_addr,
> > > > fault_addr + 1);
> > > > +	if (range)
> > > > +		goto out_mmunlock;
> > > > +	/*
> > > > +	 * XXX: Short-circuiting migration based on
> > > > migrate_vma_*
> > > > current
> > > > +	 * limitations. If/when migrate_vma_* add more support,
> > > > this
> > > > logic will
> > > > +	 * have to change.
> > > > +	 */
> > > > +	migrate_vram = ctx->vram_possible &&
> > > > +		vma_is_anonymous(vas) &&
> > > > !is_vm_hugetlb_page(vas);
> > > > +
> > > > +	chunk_size = drm_gpusvm_range_chunk_size(gpusvm,
> > > > notifier,
> > > > vas,
> > > > +						 fault_addr,
> > > > gpuva_start,
> > > > +						 gpuva_end,
> > > > migrate_vram &&
> > > > +						 !ctx-
> > > > >prefault);
> > > > +	if (chunk_size == LONG_MAX) {
> > > > +		err = -EINVAL;
> > > > +		goto err_notifier_remove;
> > > > +	}
> > > > +
> > > > +	range = drm_gpusvm_range_alloc(gpusvm, notifier,
> > > > fault_addr,
> > > > chunk_size,
> > > > +				       migrate_vram);
> > > > +	if (IS_ERR(range)) {
> > > > +		err = PTR_ERR(range);
> > > > +		goto err_notifier_remove;
> > > > +	}
> > > > +
> > > > +	drm_gpusvm_range_insert(notifier, range);
> > > > +	if (notifier_alloc)
> > > > +		drm_gpusvm_notifier_insert(gpusvm, notifier);
> > > > +
> > > > +	if (ctx->prefault) {
> > > > +		struct drm_gpusvm_ctx __ctx = *ctx;
> > > > +
> > > > +		__ctx.mmap_locked = true;
> > > > +		err = drm_gpusvm_range_get_pages(gpusvm, range,
> > > > &__ctx);
> > > > +		if (err)
> > > > +			goto err_range_remove;
> > > > +	}
> > > > +
> > > > +out_mmunlock:
> > > > +	if (!ctx->mmap_locked) {
> > > > +		mmap_write_unlock(mm);
> > > > +		mmput(mm);
> > > > +	}
> > > > +
> > > > +	return range;
> > > > +
> > > > +err_range_remove:
> > > > +	__drm_gpusvm_range_remove(notifier, range);
> > > > +err_notifier_remove:
> > > > +	if (notifier_alloc)
> > > > +		mmu_interval_notifier_remove(&notifier-
> > > > >notifier);
> > > > +err_notifier:
> > > > +	if (notifier_alloc)
> > > > +		drm_gpusvm_notifier_free(gpusvm, notifier);
> > > > +err_mmunlock:
> > > > +	if (!ctx->mmap_locked) {
> > > > +		mmap_write_unlock(mm);
> > > > +		mmput(mm);
> > > > +	}
> > > > +err_out:
> > > > +	return ERR_PTR(err);
> > > > +}
> > > > +
> > > > +/**
> > > > + * for_each_dma_page - iterate over pages in a DMA region
> > > > + * @i__: the current page index in the iteration
> > > > + * @j__: the current page index, log order, in the iteration
> > > > + * @npages__: the total number of pages in the DMA region
> > > > + * @order__: the order of the pages in the DMA region
> > > > + *
> > > > + * This macro iterates over each page in a DMA region. The DMA region is
> > > > + * assumed to be composed of blocks of 2^@order__ pages, and the macro steps
> > > > + * through the region one block of 2^@order__ pages at a time.
> > > > + */
> > > > +#define for_each_dma_page(i__, j__, npages__, order__)	\
> > > > +	for ((i__) = 0, (j__) = 0; (i__) < (npages__);	\
> > > > +	     (j__)++, (i__) += 0x1 << (order__))
> > > > +
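
A small usage sketch of the stride this macro implements (values are
hypothetical):

	unsigned long i, j;

	/* npages = 16, order = 2: visits i = 0, 4, 8, 12 with j = 0, 1, 2, 3 */
	for_each_dma_page(i, j, 16, 2)
		pr_debug("block %lu starts at page index %lu\n", j, i);
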
> > > > +/**
> > > > + * __drm_gpusvm_range_unmap_pages - Unmap pages associated with
> > > > a
> > > > GPU SVM range (internal)
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @range: Pointer to the GPU SVM range structure
> > > > + *
> > > > + * This function unmaps pages associated with a GPU SVM range. Assumes and
> > > > + * asserts correct locking is in place when called.
> > > > + */
> > > > +static void __drm_gpusvm_range_unmap_pages(struct drm_gpusvm
> > > > *gpusvm,
> > > > +					   struct
> > > > drm_gpusvm_range
> > > > *range)
> > > > +{
> > > > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > > > +
> > > > +	if (range->pages) {
> > > > +		unsigned long i, j, npages =
> > > > npages_in_range(range-
> > > > > va.start,
> > > > +							    
> > > > range-
> > > > > va.end);
> > > > +
> > > > +		if (range->flags.has_dma_mapping) {
> > > > +			for_each_dma_page(i, j, npages, range-
> > > > > order)
> > > > +				dma_unmap_page(gpusvm->drm->dev,
> > > > +					       range-
> > > > >dma_addr[j],
> > > > +					       PAGE_SIZE <<
> > > > range-
> > > > > order,
> > > > +					      
> > > > DMA_BIDIRECTIONAL);
> > > > +		}
> > > > +
> > > > +		range->flags.has_vram_pages = false;
> > > > +		range->flags.has_dma_mapping = false;
> > > > +	}
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_range_free_pages - Free pages associated with a
> > > > GPU
> > > > SVM range
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @range: Pointer to the GPU SVM range structure
> > > > + *
> > > > + * This function frees pages associated with a GPU SVM range.
> > > > + */
> > > > +static void drm_gpusvm_range_free_pages(struct drm_gpusvm
> > > > *gpusvm,
> > > > +					struct drm_gpusvm_range
> > > > *range)
> > > > +{
> > > > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > > > +
> > > > +	if (range->pages) {
> > > > +		if (range->flags.kfree_mapping) {
> > > > +			kfree(range->dma_addr);
> > > > +			range->flags.kfree_mapping = false;
> > > > +			range->pages = NULL;
> > > > +		} else {
> > > > +			kvfree(range->pages);
> > > > +			range->pages = NULL;
> > > > +		}
> > > > +	}
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_range_remove - Remove GPU SVM range
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @range: Pointer to the GPU SVM range to be removed
> > > > + *
> > > > + * This function removes the specified GPU SVM range and also
> > > > removes the parent
> > > > + * GPU SVM notifier if no more ranges remain in the notifier.
> > > > The
> > > > caller must
> > > > + * hold a lock to protect range and notifier removal.
> > > > + */
> > > > +void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
> > > > +			     struct drm_gpusvm_range *range)
> > > > +{
> > > > +	struct drm_gpusvm_notifier *notifier;
> > > > +
> > > > +	notifier = drm_gpusvm_notifier_find(gpusvm, range-
> > > > > va.start);
> > > > +	if (WARN_ON_ONCE(!notifier))
> > > > +		return;
> > > > +
> > > > +	drm_gpusvm_notifier_lock(gpusvm);
> > > > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > > +	drm_gpusvm_range_free_pages(gpusvm, range);
> > > > +	__drm_gpusvm_range_remove(notifier, range);
> > > > +	drm_gpusvm_notifier_unlock(gpusvm);
> > > > +
> > > > +	drm_gpusvm_range_put(range);
> > > > +
> > > > +	if (RB_EMPTY_ROOT(&notifier->root.rb_root)) {
> > > > +		if (!notifier->flags.removed)
> > > > +			mmu_interval_notifier_remove(&notifier-
> > > > > notifier);
> > > > +		drm_gpusvm_notifier_remove(gpusvm, notifier);
> > > > +		drm_gpusvm_notifier_free(gpusvm, notifier);
> > > > +	}
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_range_get - Get a reference to GPU SVM range
> > > > + * @range: Pointer to the GPU SVM range
> > > > + *
> > > > + * This function increments the reference count of the specified
> > > > GPU
> > > > SVM range.
> > > > + *
> > > > + * Returns:
> > > > + * Pointer to the GPU SVM range.
> > > > + */
> > > > +struct drm_gpusvm_range *
> > > > +drm_gpusvm_range_get(struct drm_gpusvm_range *range)
> > > > +{
> > > > +	kref_get(&range->refcount);
> > > > +
> > > > +	return range;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_range_destroy - Destroy GPU SVM range
> > > > + * @refcount: Pointer to the reference counter embedded in the
> > > > GPU
> > > > SVM range
> > > > + *
> > > > + * This function destroys the specified GPU SVM range when its
> > > > reference count
> > > > + * reaches zero. If a custom range-free function is provided, it
> > > > is
> > > > invoked to
> > > > + * free the range; otherwise, the range is deallocated using
> > > > kfree().
> > > > + */
> > > > +static void drm_gpusvm_range_destroy(struct kref *refcount)
> > > > +{
> > > > +	struct drm_gpusvm_range *range =
> > > > +		container_of(refcount, struct drm_gpusvm_range,
> > > > refcount);
> > > > +	struct drm_gpusvm *gpusvm = range->gpusvm;
> > > > +
> > > > +	if (gpusvm->ops->range_free)
> > > > +		gpusvm->ops->range_free(range);
> > > > +	else
> > > > +		kfree(range);
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_range_put - Put a reference to GPU SVM range
> > > > + * @range: Pointer to the GPU SVM range
> > > > + *
> > > > + * This function decrements the reference count of the specified
> > > > GPU
> > > > SVM range
> > > > + * and frees it when the count reaches zero.
> > > > + */
> > > > +void drm_gpusvm_range_put(struct drm_gpusvm_range *range)
> > > > +{
> > > > +	kref_put(&range->refcount, drm_gpusvm_range_destroy);
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_range_pages_valid - GPU SVM range pages valid
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @range: Pointer to the GPU SVM range structure
> > > > + *
> > > > + * This function determines if a GPU SVM range's pages are valid. It is expected
> > > > + * to be called while holding gpusvm->notifier_lock and as the last step before
> > > > + * committing a GPU binding.
> > > > + *
> > > > + * Returns:
> > > > + * True if GPU SVM range has valid pages, False otherwise
> > > > + */
> > > > +bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
> > > > +				  struct drm_gpusvm_range
> > > > *range)
> > > > +{
> > > > +	lockdep_assert_held(&gpusvm->notifier_lock);
> > > > +
> > > > +	return range->flags.has_vram_pages || range-
> > > > > flags.has_dma_mapping;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_range_pages_valid_unlocked - GPU SVM range pages
> > > > valid
> > > > unlocked
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @range: Pointer to the GPU SVM range structure
> > > > + *
> > > > + * This function determines if a GPU SVM range's pages are valid. It is expected
> > > > + * to be called without holding gpusvm->notifier_lock.
> > > > + *
> > > > + * Returns:
> > > > + * True if GPU SVM range has valid pages, False otherwise
> > > > + */
> > > > +static bool
> > > > +drm_gpusvm_range_pages_valid_unlocked(struct drm_gpusvm *gpusvm,
> > > > +				      struct drm_gpusvm_range
> > > > *range)
> > > > +{
> > > > +	bool pages_valid;
> > > > +
> > > > +	if (!range->pages)
> > > > +		return false;
> > > > +
> > > > +	drm_gpusvm_notifier_lock(gpusvm);
> > > > +	pages_valid = drm_gpusvm_range_pages_valid(gpusvm,
> > > > range);
> > > > +	if (!pages_valid && range->flags.kfree_mapping) {
> > > > +		kfree(range->dma_addr);
> > > > +		range->flags.kfree_mapping = false;
> > > > +		range->pages = NULL;
> > > > +	}
> > > > +	drm_gpusvm_notifier_unlock(gpusvm);
> > > > +
> > > > +	return pages_valid;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_range_get_pages - Get pages for a GPU SVM range
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @range: Pointer to the GPU SVM range structure
> > > > + * @ctx: GPU SVM context
> > > > + *
> > > > + * This function gets pages for a GPU SVM range and ensures they are mapped for
> > > > + * DMA access.
> > > > + *
> > > > + * Returns:
> > > > + * 0 on success, negative error code on failure.
> > > > + */
> > > > +int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
> > > > +			       struct drm_gpusvm_range *range,
> > > > +			       const struct drm_gpusvm_ctx *ctx)
> > > > +{
> > > > +	struct mmu_interval_notifier *notifier = &range-
> > > > >notifier-
> > > > > notifier;
> > > > +	struct hmm_range hmm_range = {
> > > > +		.default_flags = HMM_PFN_REQ_FAULT | (ctx-
> > > > >read_only
> > > > ? 0 :
> > > > +			HMM_PFN_REQ_WRITE),
> > > > +		.notifier = notifier,
> > > > +		.start = range->va.start,
> > > > +		.end = range->va.end,
> > > > +		.dev_private_owner = gpusvm-
> > > > > device_private_page_owner,
> > > > +	};
> > > > +	struct mm_struct *mm = gpusvm->mm;
> > > > +	unsigned long timeout =
> > > > +		jiffies +
> > > > msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> > > > +	unsigned long i, j;
> > > > +	unsigned long npages = npages_in_range(range->va.start,
> > > > range->va.end);
> > > > +	unsigned int order = 0;
> > > > +	unsigned long *pfns;
> > > > +	struct page **pages;
> > > > +	int err = 0;
> > > > +	bool vram_pages = !!range->flags.migrate_vram;
> > > > +	bool alloc_pfns = false, kfree_mapping;
> > > > +
> > > > +retry:
> > > > +	kfree_mapping = false;
> > > > +	hmm_range.notifier_seq =
> > > > mmu_interval_read_begin(notifier);
> > > > +	if (drm_gpusvm_range_pages_valid_unlocked(gpusvm,
> > > > range))
> > > > +		return 0;
> > > > +
> > > > +	if (range->notifier_seq == hmm_range.notifier_seq &&
> > > > range-
> > > > > pages) {
> > > > +		if (ctx->prefault)
> > > > +			return 0;
> > > > +
> > > > +		pfns = (unsigned long *)range->pages;
> > > > +		pages = range->pages;
> > > > +		goto map_pages;
> > > > +	}
> > > > +
> > > > +	if (!range->pages) {
> > > > +		pfns = kvmalloc_array(npages, sizeof(*pfns),
> > > > GFP_KERNEL);
> > > > +		if (!pfns)
> > > > +			return -ENOMEM;
> > > > +		alloc_pfns = true;
> > > > +	} else {
> > > > +		pfns = (unsigned long *)range->pages;
> > > > +	}
> > > > +
> > > > +	if (!ctx->mmap_locked) {
> > > > +		if (!mmget_not_zero(mm)) {
> > > > +			err = -EFAULT;
> > > > +			goto err_out;
> > > > +		}
> > > > +	}
> > > > +
> > > > +	hmm_range.hmm_pfns = pfns;
> > > > +	while (true) {
> > > > +		/* Must be checked after mmu_interval_read_begin
> > > > */
> > > > +		if (range->flags.unmapped) {
> > > > +			err = -EFAULT;
> > > > +			break;
> > > > +		}
> > > > +
> > > > +		if (!ctx->mmap_locked) {
> > > > +			/*
> > > > +			 * XXX: HMM locking document indicates
> > > > only
> > > > a read-lock
> > > > +			 * is required but there appears to be a
> > > > window between
> > > > +			 * the MMU_NOTIFY_MIGRATE event
> > > > triggered in
> > > > a CPU fault
> > > > +			 * via migrate_vma_setup and the pages
> > > > actually moving
> > > > +			 * in migrate_vma_finalize in which this
> > > > code can grab
> > > > +			 * garbage pages. Grabbing the write-
> > > > lock if
> > > > the range
> > > > +			 * is attached to vram appears to
> > > > protect
> > > > against this
> > > > +			 * race.
> > > > +			 */
> > > > +			if (vram_pages)
> > > > +				mmap_write_lock(mm);
> > > > +			else
> > > > +				mmap_read_lock(mm);
> > > > +		}
> > > > +		err = hmm_range_fault(&hmm_range);
> > > > +		if (!ctx->mmap_locked) {
> > > > +			if (vram_pages)
> > > > +				mmap_write_unlock(mm);
> > > > +			else
> > > > +				mmap_read_unlock(mm);
> > > > +		}
> > > > +
> > > > +		if (err == -EBUSY) {
> > > > +			if (time_after(jiffies, timeout))
> > > > +				break;
> > > > +
> > > > +			hmm_range.notifier_seq =
> > > > mmu_interval_read_begin(notifier);
> > > > +			continue;
> > > > +		}
> > > > +		break;
> > > > +	}
> > > > +	if (!ctx->mmap_locked)
> > > > +		mmput(mm);
> > > > +	if (err)
> > > > +		goto err_free;
> > > > +
> > > > +	pages = (struct page **)pfns;
> > > > +
> > > > +	if (ctx->prefault) {
> > > > +		range->pages = pages;
> > > > +		goto set_seqno;
> > > > +	}
> > > > +
> > > > +map_pages:
> > > > +	if (is_device_private_page(hmm_pfn_to_page(pfns[0]))) {
> > > > +		WARN_ON_ONCE(!range->vram_allocation);
> > > > +
> > > > +		for (i = 0; i < npages; ++i) {
> > > > +			pages[i] = hmm_pfn_to_page(pfns[i]);
> > > > +
> > > > +			if
> > > > (WARN_ON_ONCE(!is_device_private_page(pages[i]))) {
> > > > +				err = -EOPNOTSUPP;
> > > > +				goto err_free;
> > > > +			}
> > > > +		}
> > > > +
> > > > +		/* Do not race with notifier unmapping pages */
> > > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > > +		range->flags.has_vram_pages = true;
> > > > +		range->pages = pages;
> > > > +		if (mmu_interval_read_retry(notifier,
> > > > hmm_range.notifier_seq)) {
> > > > +			err = -EAGAIN;
> > > > +			__drm_gpusvm_range_unmap_pages(gpusvm,
> > > > range);
> > > > +		}
> > > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > > +	} else {
> > > > +		dma_addr_t *dma_addr = (dma_addr_t *)pfns;
> > > > +
> > > > +		for_each_dma_page(i, j, npages, order) {
> > > > +			if (WARN_ON_ONCE(i && order !=
> > > > +					
> > > > hmm_pfn_to_map_order(pfns[i]))) {
> > > > +				err = -EOPNOTSUPP;
> > > > +				npages = i;
> > > > +				goto err_unmap;
> > > > +			}
> > > > +			order = hmm_pfn_to_map_order(pfns[i]);
> > > > +
> > > > +			pages[j] = hmm_pfn_to_page(pfns[i]);
> > > > +			if
> > > > (WARN_ON_ONCE(is_zone_device_page(pages[j]))) {
> > > > +				err = -EOPNOTSUPP;
> > > > +				npages = i;
> > > > +				goto err_unmap;
> > > > +			}
> > > > +
> > > > +			set_page_dirty_lock(pages[j]);
> > > > +			mark_page_accessed(pages[j]);
> > > > +
> > > > +			dma_addr[j] = dma_map_page(gpusvm->drm-
> > > > >dev,
> > > > +						   pages[j], 0,
> > > > +						   PAGE_SIZE <<
> > > > order,
> > > > +						  
> > > > DMA_BIDIRECTIONAL);
> > > > +			if (dma_mapping_error(gpusvm->drm->dev,
> > > > dma_addr[j])) {
> > > > +				err = -EFAULT;
> > > > +				npages = i;
> > > > +				goto err_unmap;
> > > > +			}
> > > > +		}
> > > > +
> > > > +		/* Huge pages, reduce memory footprint */
> > > > +		if (order) {
> > > > +			dma_addr = kmalloc_array(j,
> > > > sizeof(*dma_addr),
> > > > +						 GFP_KERNEL);
> > > > +			if (dma_addr) {
> > > > +				for (i = 0; i < j; ++i)
> > > > +					dma_addr[i] =
> > > > (dma_addr_t)pfns[i];
> > > > +				kvfree(pfns);
> > > > +				kfree_mapping = true;
> > > > +			} else {
> > > > +				dma_addr = (dma_addr_t *)pfns;
> > > > +			}
> > > > +		}
> > > > +
> > > > +		/* Do not race with notifier unmapping pages */
> > > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > > +		range->order = order;
> > > > +		range->flags.kfree_mapping = kfree_mapping;
> > > > +		range->flags.has_dma_mapping = true;
> > > > +		range->dma_addr = dma_addr;
> > > > +		range->vram_allocation = NULL;
> > > > +		if (mmu_interval_read_retry(notifier,
> > > > hmm_range.notifier_seq)) {
> > > > +			err = -EAGAIN;
> > > > +			__drm_gpusvm_range_unmap_pages(gpusvm,
> > > > range);
> > > > +		}
> > > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > > +	}
> > > > +
> > > > +	if (err == -EAGAIN)
> > > > +		goto retry;
> > > > +set_seqno:
> > > > +	range->notifier_seq = hmm_range.notifier_seq;
> > > > +
> > > > +	return 0;
> > > > +
> > > > +err_unmap:
> > > > +	for_each_dma_page(i, j, npages, order)
> > > > +		dma_unmap_page(gpusvm->drm->dev,
> > > > +			       (dma_addr_t)pfns[j],
> > > > +			       PAGE_SIZE << order,
> > > > DMA_BIDIRECTIONAL);
> > > > +err_free:
> > > > +	if (alloc_pfns)
> > > > +		kvfree(pfns);
> > > > +err_out:
> > > > +	return err;
> > > > +}
> > > > +
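
For the DMA-mapped (SRAM) case, a rough sketch of how a caller might consume
the resulting mapping while building PTEs; driver_write_pte() and vm are
hypothetical, and a real bind would also recheck
drm_gpusvm_range_pages_valid() as in the DOC examples:

	unsigned long i, j;
	unsigned long npages = npages_in_range(range->va.start, range->va.end);

	drm_gpusvm_notifier_lock(gpusvm);
	if (range->flags.has_dma_mapping) {
		for_each_dma_page(i, j, npages, range->order)
			driver_write_pte(vm, range->va.start + i * PAGE_SIZE,
					 range->dma_addr[j],
					 PAGE_SIZE << range->order);
	}
	drm_gpusvm_notifier_unlock(gpusvm);
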
> > > > +/**
> > > > + * drm_gpusvm_range_unmap_pages - Unmap pages associated with a
> > > > GPU
> > > > SVM range
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @range: Pointer to the GPU SVM range structure
> > > > + * @ctx: GPU SVM context
> > > > + *
> > > > + * This function unmaps pages associated with a GPU SVM range. If @in_notifier
> > > > + * is set, it is assumed that gpusvm->notifier_lock is held in write mode; if it
> > > > + * is clear, it acquires gpusvm->notifier_lock in read mode. Must be called on
> > > > + * each GPU SVM range attached to the notifier in gpusvm->ops->invalidate for the
> > > > + * IOMMU security model.
> > > > + */
> > > > +void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
> > > > +				  struct drm_gpusvm_range
> > > > *range,
> > > > +				  const struct drm_gpusvm_ctx
> > > > *ctx)
> > > > +{
> > > > +	if (ctx->in_notifier)
> > > > +		lockdep_assert_held_write(&gpusvm-
> > > > >notifier_lock);
> > > > +	else
> > > > +		drm_gpusvm_notifier_lock(gpusvm);
> > > > +
> > > > +	__drm_gpusvm_range_unmap_pages(gpusvm, range);
> > > > +
> > > > +	if (!ctx->in_notifier)
> > > > +		drm_gpusvm_notifier_unlock(gpusvm);
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_migration_put_page - Put a migration page
> > > > + * @page: Pointer to the page to put
> > > > + *
> > > > + * This function unlocks and puts a page.
> > > > + */
> > > > +static void drm_gpusvm_migration_put_page(struct page *page)
> > > > +{
> > > > +	unlock_page(page);
> > > > +	put_page(page);
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_migration_put_pages - Put migration pages
> > > > + * @npages: Number of pages
> > > > + * @migrate_pfn: Array of migrate page frame numbers
> > > > + *
> > > > + * This function puts an array of pages.
> > > > + */
> > > > +static void drm_gpusvm_migration_put_pages(unsigned long npages,
> > > > +					   unsigned long
> > > > *migrate_pfn)
> > > > +{
> > > > +	unsigned long i;
> > > > +
> > > > +	for (i = 0; i < npages; ++i) {
> > > > +		if (!migrate_pfn[i])
> > > > +			continue;
> > > > +
> > > > +		drm_gpusvm_migration_put_page(migrate_pfn_to_pag
> > > > e(mi
> > > > grate_pfn[i]));
> > > > +		migrate_pfn[i] = 0;
> > > > +	}
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_get_vram_page - Get a reference to a VRAM page
> > > > + * @page: Pointer to the page
> > > > + * @zdd: Pointer to the GPU SVM zone device data
> > > > + *
> > > > + * This function associates the given page with the specified
> > > > GPU
> > > > SVM zone
> > > > + * device data and initializes it for zone device usage.
> > > > + */
> > > > +static void drm_gpusvm_get_vram_page(struct page *page,
> > > > +				     struct drm_gpusvm_zdd *zdd)
> > > > +{
> > > > +	page->zone_device_data = drm_gpusvm_zdd_get(zdd);
> > > > +	zone_device_page_init(page);
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_migrate_map_pages() - Map migration pages for GPU
> > > > SVM
> > > > migration
> > > > + * @dev: The device for which the pages are being mapped
> > > > + * @dma_addr: Array to store DMA addresses corresponding to
> > > > mapped
> > > > pages
> > > > + * @migrate_pfn: Array of migrate page frame numbers to map
> > > > + * @npages: Number of pages to map
> > > > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > > > + *
> > > > + * This function maps pages of memory for migration usage in GPU SVM. It
> > > > + * iterates over each page frame number provided in @migrate_pfn, maps the
> > > > + * corresponding page, and stores the DMA address in the provided @dma_addr
> > > > + * array.
> > > > + *
> > > > + * Return: 0 on success, -EFAULT if an error occurs during
> > > > mapping.
> > > > + */
> > > > +static int drm_gpusvm_migrate_map_pages(struct device *dev,
> > > > +					dma_addr_t *dma_addr,
> > > > +					long unsigned int
> > > > *migrate_pfn,
> > > > +					unsigned long npages,
> > > > +					enum dma_data_direction
> > > > dir)
> > > > +{
> > > > +	unsigned long i;
> > > > +
> > > > +	for (i = 0; i < npages; ++i) {
> > > > +		struct page *page =
> > > > migrate_pfn_to_page(migrate_pfn[i]);
> > > > +
> > > > +		if (!page)
> > > > +			continue;
> > > > +
> > > > +		if (WARN_ON_ONCE(is_zone_device_page(page)))
> > > > +			return -EFAULT;
> > > > +
> > > > +		dma_addr[i] = dma_map_page(dev, page, 0,
> > > > PAGE_SIZE,
> > > > dir);
> > > > +		if (dma_mapping_error(dev, dma_addr[i]))
> > > > +			return -EFAULT;
> > > > +	}
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously
> > > > mapped
> > > > for GPU SVM migration
> > > > + * @dev: The device for which the pages were mapped
> > > > + * @dma_addr: Array of DMA addresses corresponding to mapped
> > > > pages
> > > > + * @npages: Number of pages to unmap
> > > > + * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > > > + *
> > > > + * This function unmaps previously mapped pages of memory for GPU Shared Virtual
> > > > + * Memory (SVM). It iterates over each DMA address provided in @dma_addr, checks
> > > > + * if it's valid and not already unmapped, and unmaps the corresponding page.
> > > > + */
> > > > +static void drm_gpusvm_migrate_unmap_pages(struct device *dev,
> > > > +					   dma_addr_t *dma_addr,
> > > > +					   unsigned long npages,
> > > > +					   enum dma_data_direction dir)
> > > > +{
> > > > +	unsigned long i;
> > > > +
> > > > +	for (i = 0; i < npages; ++i) {
> > > > +		if (!dma_addr[i] || dma_mapping_error(dev,
> > > > dma_addr[i]))
> > > > +			continue;
> > > > +
> > > > +		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE,
> > > > dir);
> > > > +	}
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_migrate_to_vram - Migrate GPU SVM range to VRAM
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @range: Pointer to the GPU SVM range structure
> > > > + * @vram_allocation: Driver-private pointer to the VRAM allocation.
> > > > + *                   The caller should hold a reference to the VRAM
> > > > + *                   allocation, which should be dropped via
> > > > + *                   ops->vram_allocation or upon the failure of this
> > > > + *                   function.
> > > > + * @ctx: GPU SVM context
> > > > + *
> > > > + * This function migrates the specified GPU SVM range to VRAM.
> > > > It
> > > > performs the
> > > > + * necessary setup and invokes the driver-specific operations
> > > > for
> > > > migration to
> > > > + * VRAM. Upon successful return, @vram_allocation can safely
> > > > reference @range
> > > > + * until ops->vram_release is called which only upon successful
> > > > return.
> > > > + *
> > > > + * Returns:
> > > > + * 0 on success, negative error code on failure.
> > > > + */
> > > > +int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
> > > > +			       struct drm_gpusvm_range *range,
> > > > +			       void *vram_allocation,
> > > > +			       const struct drm_gpusvm_ctx *ctx)
> > > > +{
> > > > +	u64 start = range->va.start, end = range->va.end;
> > > > +	struct migrate_vma migrate = {
> > > > +		.start		= start,
> > > > +		.end		= end,
> > > > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > > > +		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
> > > > +	};
> > > > +	struct mm_struct *mm = gpusvm->mm;
> > > > +	unsigned long i, npages = npages_in_range(start, end);
> > > > +	struct vm_area_struct *vas;
> > > > +	struct drm_gpusvm_zdd *zdd = NULL;
> > > > +	struct page **pages;
> > > > +	dma_addr_t *dma_addr;
> > > > +	void *buf;
> > > > +	int err;
> > > > +
> > > > +	if (!range->flags.migrate_vram)
> > > > +		return -EINVAL;
> > > > +
> > > > +	if (!gpusvm->ops->populate_vram_pfn || !gpusvm->ops->copy_to_vram ||
> > > > +	    !gpusvm->ops->copy_to_sram)
> > > > +		return -EOPNOTSUPP;
> > > > +
> > > > +	if (!ctx->mmap_locked) {
> > > > +		if (!mmget_not_zero(mm)) {
> > > > +			err = -EFAULT;
> > > > +			goto err_out;
> > > > +		}
> > > > +		mmap_write_lock(mm);
> > > > +	}
> > > > +
> > > > +	mmap_assert_locked(mm);
> > > > +
> > > > +	vas = vma_lookup(mm, start);
> > > > +	if (!vas) {
> > > > +		err = -ENOENT;
> > > > +		goto err_mmunlock;
> > > > +	}
> > > > +
> > > > +	if (end > vas->vm_end || start < vas->vm_start) {
> > > > +		err = -EINVAL;
> > > > +		goto err_mmunlock;
> > > > +	}
> > > > +
> > > > +	if (!vma_is_anonymous(vas)) {
> > > > +		err = -EBUSY;
> > > > +		goto err_mmunlock;
> > > > +	}
> > > > +
> > > > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) +
> > > > sizeof(*dma_addr) +
> > > > +		       sizeof(*pages), GFP_KERNEL);
> > > > +	if (!buf) {
> > > > +		err = -ENOMEM;
> > > > +		goto err_mmunlock;
> > > > +	}
> > > > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > > > +	pages = buf + (2 * sizeof(*migrate.src) +
> > > > sizeof(*dma_addr))
> > > > * npages;
> > > > +
> > > > +	zdd = drm_gpusvm_zdd_alloc(range);
> > > > +	if (!zdd) {
> > > > +		err = -ENOMEM;
> > > > +		goto err_free;
> > > > +	}
> > > > +
> > > > +	migrate.vma = vas;
> > > > +	migrate.src = buf;
> > > > +	migrate.dst = migrate.src + npages;
> > > > +
> > > > +	err = migrate_vma_setup(&migrate);
> > > > +	if (err)
> > > > +		goto err_free;
> > > > +
> > > > +	/*
> > > > +	 * FIXME: Below cases, !migrate.cpages and migrate.cpages != npages,
> > > > +	 * not always an error. Need to revisit possible cases and how to
> > > > +	 * handle. We could prefault on migrate.cpages != npages via
> > > > +	 * hmm_range_fault.
> > > > +	 */
> > > > +
> > > > +	if (!migrate.cpages) {
> > > > +		err = -EFAULT;
> > > > +		goto err_free;
> > > > +	}
> > > > +
> > > > +	if (migrate.cpages != npages) {
> > > > +		err = -EBUSY;
> > > > +		goto err_finalize;
> > > > +	}
> > > > +
> > > > +	err = gpusvm->ops->populate_vram_pfn(gpusvm,
> > > > vram_allocation, npages,
> > > > +					     migrate.dst);
> > > > +	if (err)
> > > > +		goto err_finalize;
> > > > +
> > > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > > > dma_addr,
> > > > +					   migrate.src, npages,
> > > > DMA_TO_DEVICE);
> > > > +	if (err)
> > > > +		goto err_finalize;
> > > > +
> > > > +	for (i = 0; i < npages; ++i) {
> > > > +		struct page *page = pfn_to_page(migrate.dst[i]);
> > > > +
> > > > +		pages[i] = page;
> > > > +		migrate.dst[i] = migrate_pfn(migrate.dst[i]);
> > > > +		drm_gpusvm_get_vram_page(page, zdd);
> > > > +	}
> > > > +
> > > > +	err = gpusvm->ops->copy_to_vram(gpusvm, pages, dma_addr,
> > > > npages);
> > > > +	if (err)
> > > > +		goto err_finalize;
> > > > +
> > > > +	/* Upon success bind vram allocation to range and zdd */
> > > > +	range->vram_allocation = vram_allocation;
> > > > +	WRITE_ONCE(zdd->vram_allocation, vram_allocation);	/* Owns ref */
> > > > +
> > > > +err_finalize:
> > > > +	if (err)
> > > > +		drm_gpusvm_migration_put_pages(npages,
> > > > migrate.dst);
> > > > +	migrate_vma_pages(&migrate);
> > > > +	migrate_vma_finalize(&migrate);
> > > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev,
> > > > dma_addr,
> > > > npages,
> > > > +				       DMA_TO_DEVICE);
> > > > +err_free:
> > > > +	if (zdd)
> > > > +		drm_gpusvm_zdd_put(zdd);
> > > > +	kvfree(buf);
> > > > +err_mmunlock:
> > > > +	if (!ctx->mmap_locked) {
> > > > +		mmap_write_unlock(mm);
> > > > +		mmput(mm);
> > > > +	}
> > > > +err_out:
> > > > +	return err;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_migrate_populate_sram_pfn - Populate SRAM PFNs for
> > > > a
> > > > VM area
> > > > + * @vas: Pointer to the VM area structure, can be NULL
> > > > + * @npages: Number of pages to populate
> > > > + * @src_mpfn: Source array of migrate PFNs
> > > > + * @mpfn: Array of migrate PFNs to populate
> > > > + * @addr: Start address for PFN allocation
> > > > + *
> > > > + * This function populates the SRAM migrate page frame numbers
> > > > (PFNs) for the
> > > > + * specified VM area structure. It allocates and locks pages in
> > > > the
> > > > VM area for
> > > > + * SRAM usage. If vas is non-NULL use alloc_page_vma for
> > > > allocation,
> > > > if NULL use
> > > > + * alloc_page for allocation.
> > > > + *
> > > > + * Returns:
> > > > + * 0 on success, negative error code on failure.
> > > > + */
> > > > +static int drm_gpusvm_migrate_populate_sram_pfn(struct vm_area_struct *vas,
> > > > +						unsigned long npages,
> > > > +						unsigned long *src_mpfn,
> > > > +						unsigned long *mpfn, u64 addr)
> > > > +{
> > > > +	unsigned long i;
> > > > +
> > > > +	for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
> > > > +		struct page *page;
> > > > +
> > > > +		if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
> > > > +			continue;
> > > > +
> > > > +		if (vas)
> > > > +			page = alloc_page_vma(GFP_HIGHUSER, vas,
> > > > addr);
> > > > +		else
> > > > +			page = alloc_page(GFP_HIGHUSER);
> > > > +
> > > > +		if (!page)
> > > > +			return -ENOMEM;
> > > > +
> > > > +		lock_page(page);
> > > > +		mpfn[i] = migrate_pfn(page_to_pfn(page));
> > > > +	}
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_evict_to_sram - Evict GPU SVM range to SRAM
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @range: Pointer to the GPU SVM range structure
> > > > + *
> > > > + * Similar to __drm_gpusvm_migrate_to_sram but does not require
> > > > mmap
> > > > lock and
> > > > + * migration done via migrate_device_* functions. Fallback path
> > > > as
> > > > it is
> > > > + * preferred to issue migrations with mmap lock.
> > > > + *
> > > > + * Returns:
> > > > + * 0 on success, negative error code on failure.
> > > > + */
> > > > +static int drm_gpusvm_evict_to_sram(struct drm_gpusvm *gpusvm,
> > > > +				    struct drm_gpusvm_range
> > > > *range)
> > > > +{
> > > > +	unsigned long npages;
> > > > +	struct page **pages;
> > > > +	unsigned long *src, *dst;
> > > > +	dma_addr_t *dma_addr;
> > > > +	void *buf;
> > > > +	int i, err = 0;
> > > > +
> > > > +	npages = npages_in_range(range->va.start, range->va.end);
> > > > +
> > > > +	buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr) +
> > > > +		       sizeof(*pages), GFP_KERNEL);
> > > > +	if (!buf) {
> > > > +		err = -ENOMEM;
> > > > +		goto err_out;
> > > > +	}
> > > > +	src = buf;
> > > > +	dst = buf + (sizeof(*src) * npages);
> > > > +	dma_addr = buf + (2 * sizeof(*src) * npages);
> > > > +	pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) *
> > > > npages;
> > > > +
> > > > +	err = gpusvm->ops->populate_vram_pfn(gpusvm, range->vram_allocation,
> > > > +					     npages, src);
> > > > +	if (err)
> > > > +		goto err_free;
> > > > +
> > > > +	err = migrate_device_vma_range(gpusvm->mm,
> > > > +				       gpusvm->device_private_page_owner, src,
> > > > +				       npages, range->va.start);
> > > > +	if (err)
> > > > +		goto err_free;
> > > > +
> > > > +	err = drm_gpusvm_migrate_populate_sram_pfn(NULL, npages,
> > > > src, dst, 0);
> > > > +	if (err)
> > > > +		goto err_finalize;
> > > > +
> > > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > > > dma_addr,
> > > > +					   dst, npages,
> > > > DMA_BIDIRECTIONAL);
> > > > +	if (err)
> > > > +		goto err_finalize;
> > > > +
> > > > +	for (i = 0; i < npages; ++i)
> > > > +		pages[i] = migrate_pfn_to_page(src[i]);
> > > > +
> > > > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr,
> > > > npages);
> > > > +	if (err)
> > > > +		goto err_finalize;
> > > > +
> > > > +err_finalize:
> > > > +	if (err)
> > > > +		drm_gpusvm_migration_put_pages(npages, dst);
> > > > +	migrate_device_pages(src, dst, npages);
> > > > +	migrate_device_finalize(src, dst, npages);
> > > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev,
> > > > dma_addr,
> > > > npages,
> > > > +				       DMA_BIDIRECTIONAL);
> > > > +err_free:
> > > > +	kvfree(buf);
> > > > +err_out:
> > > > +
> > > > +	return err;
> > > > +}
> > > > +
> > > > +/**
> > > > + * __drm_gpusvm_migrate_to_sram - Migrate GPU SVM range to SRAM
> > > > (internal)
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @vas: Pointer to the VM area structure
> > > > + * @page: Pointer to the page for fault handling (can be NULL)
> > > > + * @start: Start address of the migration range
> > > > + * @end: End address of the migration range
> > > > + *
> > > > + * This internal function performs the migration of the
> > > > specified
> > > > GPU SVM range
> > > > + * to SRAM. It sets up the migration, populates + dma maps SRAM
> > > > PFNs, and
> > > > + * invokes the driver-specific operations for migration to SRAM.
> > > > + *
> > > > + * Returns:
> > > > + * 0 on success, negative error code on failure.
> > > > + */
> > > > +static int __drm_gpusvm_migrate_to_sram(struct drm_gpusvm
> > > > *gpusvm,
> > > > +					struct vm_area_struct
> > > > *vas,
> > > > +					struct page *page,
> > > > +					u64 start, u64 end)
> > > > +{
> > > > +	struct migrate_vma migrate = {
> > > > +		.vma		= vas,
> > > > +		.pgmap_owner	= gpusvm->device_private_page_owner,
> > > > +		.flags		=
> > > > MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
> > > > +		.fault_page	= page,
> > > > +	};
> > > > +	unsigned long npages;
> > > > +	struct page **pages;
> > > > +	dma_addr_t *dma_addr;
> > > > +	void *buf;
> > > > +	int i, err = 0;
> > > > +
> > > > +	mmap_assert_locked(gpusvm->mm);
> > > > +
> > > > +	/* Corner where VMA area struct has been partially
> > > > unmapped
> > > > */
> > > > +	if (start < vas->vm_start)
> > > > +		start = vas->vm_start;
> > > > +	if (end > vas->vm_end)
> > > > +		end = vas->vm_end;
> > > > +
> > > > +	migrate.start = start;
> > > > +	migrate.end = end;
> > > > +	npages = npages_in_range(start, end);
> > > > +
> > > > +	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) +
> > > > sizeof(*dma_addr) +
> > > > +		       sizeof(*pages), GFP_KERNEL);
> > > > +	if (!buf) {
> > > > +		err = -ENOMEM;
> > > > +		goto err_out;
> > > > +	}
> > > > +	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > > > +	pages = buf + (2 * sizeof(*migrate.src) +
> > > > sizeof(*dma_addr))
> > > > * npages;
> > > > +
> > > > +	migrate.vma = vas;
> > > > +	migrate.src = buf;
> > > > +	migrate.dst = migrate.src + npages;
> > > > +
> > > > +	err = migrate_vma_setup(&migrate);
> > > > +	if (err)
> > > > +		goto err_free;
> > > > +
> > > > +	/* Raced with another CPU fault, nothing to do */
> > > > +	if (!migrate.cpages)
> > > > +		goto err_free;
> > > > +
> > > > +	err = drm_gpusvm_migrate_populate_sram_pfn(vas, npages,
> > > > +						   migrate.src,
> > > > migrate.dst,
> > > > +						   start);
> > > > +	if (err)
> > > > +		goto err_finalize;
> > > > +
> > > > +	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev,
> > > > dma_addr,
> > > > +					   migrate.dst, npages,
> > > > +					   DMA_BIDIRECTIONAL);
> > > > +	if (err)
> > > > +		goto err_finalize;
> > > > +
> > > > +	for (i = 0; i < npages; ++i)
> > > > +		pages[i] = migrate_pfn_to_page(migrate.src[i]);
> > > 
> > > See comments below which pages we actually want to migrate.
> > > 
> > > 
> > > > +
> > > > +	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr,
> > > > npages);
> > > > +	if (err)
> > > > +		goto err_finalize;
> > > > +
> > > > +err_finalize:
> > > > +	if (err)
> > > > +		drm_gpusvm_migration_put_pages(npages,
> > > > migrate.dst);
> > > > +	migrate_vma_pages(&migrate);
> > > > +	migrate_vma_finalize(&migrate);
> > > > +	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev,
> > > > dma_addr,
> > > > npages,
> > > > +				       DMA_BIDIRECTIONAL);
> > > > +err_free:
> > > > +	kvfree(buf);
> > > > +err_out:
> > > > +	mmap_assert_locked(gpusvm->mm);
> > > > +
> > > > +	return err;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_migrate_to_sram - Migrate (evict) GPU SVM range to
> > > > SRAM
> > > > + * @gpusvm: Pointer to the GPU SVM structure
> > > > + * @range: Pointer to the GPU SVM range structure
> > > > + * @ctx: GPU SVM context
> > > > + *
> > > > + * This function initiates the migration of the specified GPU
> > > > SVM
> > > > range to
> > > > + * SRAM. It performs necessary checks and invokes the internal
> > > > migration
> > > > + * function for actual migration.
> > > > + *
> > > > + * Returns:
> > > > + * 0 on success, negative error code on failure.
> > > > + */
> > > > +int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
> > > > +			       struct drm_gpusvm_range *range,
> > > > +			       const struct drm_gpusvm_ctx *ctx)
> > > > +{
> > > > +	u64 start = range->va.start, end = range->va.end;
> > > > +	struct mm_struct *mm = gpusvm->mm;
> > > > +	struct vm_area_struct *vas;
> > > > +	int err;
> > > > +	bool retry = false;
> > > > +
> > > > +	if (!ctx->mmap_locked) {
> > > > +		if (!mmget_not_zero(mm)) {
> > > > +			err = -EFAULT;
> > > > +			goto err_out;
> > > > +		}
> > > > +		if (ctx->trylock_mmap) {
> > > > +			if (!mmap_read_trylock(mm))  {
> > > > +				err = drm_gpusvm_evict_to_sram(gpusvm, range);
> > > > +				goto err_mmput;
> > > > +			}
> > > > +		} else {
> > > > +			mmap_read_lock(mm);
> > > > +		}
> > > > +	}
> > > > +
> > > > +	mmap_assert_locked(mm);
> > > > +
> > > > +	/*
> > > > +	 * Loop required to find all VMA area structs for the
> > > > corner
> > > > case when
> > > > +	 * VRAM backing has been partially unmapped from MM's
> > > > address space.
> > > > +	 */
> > > > +again:
> > > > +	vas = find_vma(mm, start);
> > > > +	if (!vas) {
> > > > +		if (!retry)
> > > > +			err = -ENOENT;
> > > > +		goto err_mmunlock;
> > > > +	}
> > > > +
> > > > +	if (end <= vas->vm_start || start >= vas->vm_end) {
> > > > +		if (!retry)
> > > > +			err = -EINVAL;
> > > > +		goto err_mmunlock;
> > > > +	}
> > > > +
> > > > +	err = __drm_gpusvm_migrate_to_sram(gpusvm, vas, NULL,
> > > > start,
> > > > end);
> > > 
> > > This function is typically called from the vm side to get a clean
> > > mm as
> > > a last resort after get_pages() fail. As such should we have it
> > > evict
> > > *everything*, even foreign device memory, and mismatching local
> > > device
> > > pages. If so, we could use hmm_range_fault() with a NULL page owner
> > > +
> > > faulting to do that.
> > > 
> > 
> > I've actually tried that and it seemed to mostly work well and
> > actually
> > would be my preference as this avoids a VMA lookup in GPU SVM.
> > 
> > I think it is problem though if some of the pages are partially
> > unmapped
> > though as hmm_range_fault will abort if fault cannot be resolved.
> > Maybe
> > I'm mistaken on this. I won't get this in rev2 but will put this on
> > my
> > list to continue to play around with.
> 
> OK. Presumably if faulting fails we should try a narrower range unless
> the page actually hitting the gpu pagefault is unmapped, to ensure we
> make progress rather than aborting?
> 

I think the easiest thing would be to add a flag to HMM that says continue
on fault failure. Now I remember another issue: hmm_range_fault doesn't
work for coherent pages if we ever decide to use them.

Maybe we can do something like hmm_range_fault without the fault bit set to
collect device pages and then use migrate_device_* functions to evict.
Think drm_gpusvm_evict_to_ram in v2 (just posted) with
populate_devmem_pfn replaced with hmm_range_fault. It seems like that
would work. Maybe I'm missing a race here though, and it likely gets racier
with multi-GPU too, but it seems workable.

> 
> > 
> > > > +	if (err)
> > > > +		goto err_mmunlock;
> > > > +
> > > > +	if (vas->vm_end < end) {
> > > > +		retry = true;
> > > > +		start = vas->vm_end;
> > > > +		goto again;
> > > > +	}
> > > > +
> > > > +	if (!ctx->mmap_locked) {
> > > > +		mmap_read_unlock(mm);
> > > > +		/*
> > > > +		 * Using mmput_async as this function can be
> > > > called
> > > > while
> > > > +		 * holding a dma-resv lock, and a final put can
> > > > grab
> > > > the mmap
> > > > +		 * lock, causing a lock inversion.
> > > > +		 */
> > > > +		mmput_async(mm);
> > > > +	}
> > > > +
> > > > +	return 0;
> > > > +
> > > > +err_mmunlock:
> > > > +	if (!ctx->mmap_locked)
> > > > +		mmap_read_unlock(mm);
> > > > +err_mmput:
> > > > +	if (!ctx->mmap_locked)
> > > > +		mmput_async(mm);
> > > > +err_out:
> > > > +	return err;
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_page_free - Put GPU SVM zone device data
> > > > associated
> > > > with a page
> > > > + * @page: Pointer to the page
> > > > + *
> > > > + * This function is a callback used to put the GPU SVM zone
> > > > device
> > > > data
> > > > + * associated with a page when it is being released.
> > > > + */
> > > > +static void drm_gpusvm_page_free(struct page *page)
> > > > +{
> > > > +	drm_gpusvm_zdd_put(page->zone_device_data);
> > > > +}
> > > > +
> > > > +/**
> > > > + * drm_gpusvm_migrate_to_ram - Migrate GPU SVM range to RAM
> > > > (page
> > > > fault handler)
> > > > + * @vmf: Pointer to the fault information structure
> > > > + *
> > > > + * This function is a page fault handler used to migrate a GPU
> > > > SVM
> > > > range to RAM.
> > > > + * It retrieves the GPU SVM range information from the faulting
> > > > page
> > > > and invokes
> > > > + * the internal migration function to migrate the range back to
> > > > RAM.
> > > > + *
> > > > + * Returns:
> > > > + * VM_FAULT_SIGBUS on failure, 0 on success.
> > > > + */
> > > > +static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault
> > > > *vmf)
> > > > +{
> > > > +	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
> > > > +	int err;
> > > > +
> > > > +	err = __drm_gpusvm_migrate_to_sram(zdd->range->gpusvm,
> > > > +					   vmf->vma, vmf->page,
> > > > +					   zdd->range->va.start,
> > > > +					   zdd->range->va.end);
> > > 
> > > When called from here, since this is a pagemap op, we should ensure
> > > we
> > > only migrate our own pagemap to RAM?
> > > 
> > 
> > I think you resolve this with the following the patch [1], right? I
> > think I agree.
> 
> It doesn't fully resolve it, but adds the capability to do more
> specified filtering. Another option would be to use the pagemap ptr
> rather than the device ptr as device_private owner, but that would OTOH
> require a wider filtering in hmm_range_fault() so that (or a similar)
> patch would be needed anyway.
>

Yea, pagemap group is likely a better device_private_owner. Then I think
we'd drop the gpusvm->device_private_owner pointer too, which is likely a
good idea anyways.

Matt
 
> Thanks,
> Thomas
> 
> > 
> > Matt
> > 
> > [1] https://patchwork.freedesktop.org/series/139994/
> > 
> > > /Thanks,
> > > Thomas
> > > 
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
index b9670ae09a9e..b8fc2ee58f1a 100644
--- a/drivers/gpu/drm/xe/Makefile
+++ b/drivers/gpu/drm/xe/Makefile
@@ -25,7 +25,8 @@  $(obj)/generated/%_wa_oob.c $(obj)/generated/%_wa_oob.h: $(obj)/xe_gen_wa_oob \
 
 # core driver code
 
-xe-y += xe_bb.o \
+xe-y += drm_gpusvm.o \
+	xe_bb.o \
 	xe_bo.o \
 	xe_bo_evict.o \
 	xe_devcoredump.o \
diff --git a/drivers/gpu/drm/xe/drm_gpusvm.c b/drivers/gpu/drm/xe/drm_gpusvm.c
new file mode 100644
index 000000000000..fc1e44e6ae72
--- /dev/null
+++ b/drivers/gpu/drm/xe/drm_gpusvm.c
@@ -0,0 +1,2174 @@ 
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2024 Intel Corporation
+ *
+ * Authors:
+ *     Matthew Brost <matthew.brost@intel.com>
+ */
+
+#include <linux/dma-mapping.h>
+#include <linux/interval_tree_generic.h>
+#include <linux/hmm.h>
+#include <linux/memremap.h>
+#include <linux/migrate.h>
+#include <linux/mm_types.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+
+#include <drm/drm_device.h>
+#include "drm_gpusvm.h"
+
+/**
+ * DOC: Overview
+ *
+ * GPU Shared Virtual Memory (GPU SVM) layer for the Direct Rendering Manager (DRM)
+ *
+ * The GPU SVM layer is a component of the DRM framework designed to manage shared
+ * virtual memory between the CPU and GPU. It enables efficient data exchange and
+ * processing for GPU-accelerated applications by allowing memory sharing and
+ * synchronization between the CPU's and GPU's virtual address spaces.
+ *
+ * Key GPU SVM Components:
+ * - Notifiers: Used for tracking memory intervals and notifying the
+ *		GPU of changes. Notifiers are sized based on a GPU SVM
+ *		initialization parameter, with a recommendation of 512M or
+ *		larger. They maintain a Red-Black tree and a list of ranges that
+ *		fall within the notifier interval. Notifiers are tracked within
+ *		a GPU SVM Red-Black tree and list and are dynamically inserted
+ *		or removed as ranges within the interval are created or
+ *		destroyed.
+ * - Ranges: Represent memory ranges mapped in a DRM device and managed
+ *	     by GPU SVM. They are sized based on an array of chunk sizes, which
+ *	     is a GPU SVM initialization parameter, and the CPU address space.
+ *	     Upon GPU fault, the largest aligned chunk that fits within the
+ *	     faulting CPU address space is chosen for the range size. Ranges are
+ *	     expected to be dynamically allocated on GPU fault and removed on an
+ *	     MMU notifier UNMAP event. As mentioned above, ranges are tracked in
+ *	     a notifier's Red-Black tree.
+ * - Operations: Define the interface for driver-specific SVM operations such as
+ *		 allocation, page collection, migration, invalidations, and VRAM
+ *		 release.
+ *
+ * This layer provides interfaces for allocating, mapping, migrating, and
+ * releasing memory ranges between the CPU and GPU. It handles all core memory
+ * management interactions (DMA mapping, HMM, and migration) and provides
+ * driver-specific virtual functions (vfuncs). This infrastructure is sufficient
+ * to build the expected driver components for an SVM implementation as detailed
+ * below.
+ *
+ * Expected Driver Components:
+ * - GPU page fault handler: Used to create ranges and notifiers based on the
+ *			     fault address, optionally migrate the range to
+ *			     VRAM, and create GPU bindings.
+ * - Garbage collector: Used to destroy GPU bindings for ranges. Ranges are
+ *			expected to be added to the garbage collector upon
+ *			MMU_NOTIFY_UNMAP event.
+ */
+
+/**
+ * DOC: Locking
+ *
+ * GPU SVM handles locking for core MM interactions, i.e., it locks/unlocks the
+ * mmap lock as needed. Alternatively, if the driver prefers to handle the mmap
+ * lock itself, a 'locked' argument is provided to the functions that require
+ * the mmap lock. This option may be useful for drivers that need to call into
+ * GPU SVM while also holding a dma-resv lock, thus preventing locking
+ * inversions between the mmap and dma-resv locks.
+ *
+ * GPU SVM introduces a global notifier lock, which safeguards the notifier's
+ * range RB tree and list, as well as the range's DMA mappings and sequence
+ * number. GPU SVM manages all necessary locking and unlocking operations,
+ * except for the recheck of the range's sequence number
+ * (mmu_interval_read_retry) when the driver is committing GPU bindings. This
+ * lock corresponds to the 'driver->update' lock mentioned in the HMM
+ * documentation (TODO: Link). Future revisions may transition from a GPU SVM
+ * global lock to a per-notifier lock if finer-grained locking is deemed
+ * necessary.
+ *
+ * In addition to the locking mentioned above, the driver should implement a
+ * lock to safeguard core GPU SVM function calls that modify state, such as
+ * drm_gpusvm_range_find_or_insert and drm_gpusvm_range_remove. Alternatively,
+ * these core functions can be called within a single kernel thread, for
+ * instance, using an ordered work queue. This lock is denoted as
+ * 'driver_svm_lock' in code examples.
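+ *
+ * A rough sketch of the ordered work queue alternative is shown below; all
+ * driver_* names here are hypothetical and not part of this API:
+ *
+ *	struct driver_svm_op {
+ *		struct work_struct work;
+ *		struct drm_gpusvm *gpusvm;
+ *		u64 fault_addr;
+ *		u64 gpuva_start, gpuva_end;
+ *	};
+ *
+ *	void driver_svm_op_func(struct work_struct *w)
+ *	{
+ *		struct driver_svm_op *op = container_of(w, typeof(*op), work);
+ *
+ *		// Ops on an ordered workqueue run strictly in submission
+ *		// order, so no extra lock is needed around
+ *		// drm_gpusvm_range_find_or_insert / drm_gpusvm_range_remove.
+ *		driver_handle_gpu_fault(op->gpusvm, op->fault_addr,
+ *					op->gpuva_start, op->gpuva_end);
+ *		kfree(op);
+ *	}
+ *
+ *	// At init:       wq = alloc_ordered_workqueue("driver-svm", 0);
+ *	// On GPU fault:  INIT_WORK(&op->work, driver_svm_op_func);
+ *	//                queue_work(wq, &op->work);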
+ */
+
+/**
+ * DOC: Migration
+ *
+ * The migration support is quite simple, allowing migration between SRAM and
+ * VRAM at the range granularity. For example, GPU SVM currently does not
+ * support mixing SRAM and VRAM pages within a range. This means that upon GPU
+ * fault, the entire range can be migrated to VRAM, and upon CPU fault, the
+ * entire range is migrated to SRAM.
+ *
+ * The reasoning for only supporting range granularity is as follows: it
+ * simplifies the implementation, and range sizes are driver-defined and should
+ * be relatively small.
+ */
+
+/**
+ * DOC: Partial Unmapping of Ranges
+ *
+ * Partial unmapping of ranges (e.g., 1M out of 2M is unmapped by CPU resulting
+ * in MMU_NOTIFY_UNMAP event) presents several challenges, with the main one
+ * being that a subset of the range still has CPU and GPU mappings. If the
+ * backing store for the range is in VRAM, a subset of the backing store has
+ * references. One option would be to split the range and VRAM backing store,
+ * but the implementation for this would be quite complicated. Given that
+ * partial unmappings are rare and driver-defined range sizes are relatively
+ * small, GPU SVM does not support splitting of ranges.
+ *
+ * With no support for range splitting, upon partial unmapping of a range, the
+ * driver is expected to invalidate and destroy the entire range. If the range
+ * has VRAM as its backing, the driver is also expected to migrate any remaining
+ * pages back to SRAM.
+ */
+
+/**
+ * DOC: Examples
+ *
+ * This section provides two examples of how to build the expected driver
+ * components: the GPU page fault handler and the garbage collector. A third
+ * example demonstrates a sample invalidation driver vfunc.
+ *
+ * The generic code provided does not include logic for complex migration
+ * policies, optimized invalidations, or other potentially required driver
+ * locking (e.g., DMA-resv locks).
+ *
+ * 1) GPU page fault handler
+ *
+ *	int driver_bind_range(struct drm_gpusvm *gpusvm, struct drm_gpusvm_range *range)
+ *	{
+ *		int err = 0;
+ *
+ *		driver_alloc_and_setup_memory_for_bind(gpusvm, range);
+ *
+ *		drm_gpusvm_notifier_lock(gpusvm);
+ *		if (drm_gpusvm_range_pages_valid(range))
+ *			driver_commit_bind(gpusvm, range);
+ *		else
+ *			err = -EAGAIN;
+ *		drm_gpusvm_notifier_unlock(gpusvm);
+ *
+ *		return err;
+ *	}
+ *
+ *	int driver_gpu_fault(struct drm_gpusvm *gpusvm, u64 fault_addr,
+ *			     u64 gpuva_start, u64 gpuva_end)
+ *	{
+ *		struct drm_gpusvm_ctx ctx = {};
+ *		int err;
+ *
+ *		driver_svm_lock();
+ *	retry:
+ *		// Always process UNMAPs first so view of GPU SVM ranges is current
+ *		driver_garbage_collector(gpusvm);
+ *
+ *		range = drm_gpusvm_range_find_or_insert(gpusvm, fault_addr,
+ *							gpuva_start, gpuva_end,
+ *						        &ctx);
+ *		if (IS_ERR(range)) {
+ *			err = PTR_ERR(range);
+ *			goto unlock;
+ *		}
+ *
+ *		if (driver_migration_policy(range)) {
+ *			bo = driver_alloc_bo();
+ *			err = drm_gpusvm_migrate_to_vram(gpusvm, range, bo, &ctx);
+ *			if (err)	// CPU mappings may have changed
+ *				goto retry;
+ *		}
+ *
+ *		err = drm_gpusvm_range_get_pages(gpusvm, range, &ctx);
+ *		if (err == -EFAULT || err == -EPERM)	// CPU mappings changed
+ *			goto retry;
+ *		else if (err)
+ *			goto unlock;
+ *
+ *		err = driver_bind_range(gpusvm, range);
+ *		if (err == -EAGAIN)	// CPU mappings changed
+ *			goto retry;
+ *
+ *	unlock:
+ *		driver_svm_unlock();
+ *		return err;
+ *	}
+ *
+ * 2) Garbage Collector.
+ *
+ *	void __driver_garbage_collector(struct drm_gpusvm *gpusvm,
+ *					struct drm_gpusvm_range *range)
+ *	{
+ *		struct drm_gpusvm_ctx ctx = {};
+ *
+ *		assert_driver_svm_locked(gpusvm);
+ *
+ *		// Partial unmap, migrate any remaining VRAM pages back to SRAM
+ *		if (range->flags.partial_unmap)
+ *			drm_gpusvm_migrate_to_sram(gpusvm, range, &ctx);
+ *
+ *		driver_unbind_range(range);
+ *		drm_gpusvm_range_remove(gpusvm, range);
+ *	}
+ *
+ *	void driver_garbage_collector(struct drm_gpusvm *gpusvm)
+ *	{
+ *		assert_driver_svm_locked(gpusvm);
+ *
+ *		for_each_range_in_garbage_collector(gpusvm, range)
+ *			__driver_garbage_collector(gpusvm, range);
+ *	}
+ *
+ * 3) Invalidation driver vfunc.
+ *
+ *	void driver_invalidation(struct drm_gpusvm *gpusvm,
+ *				 struct drm_gpusvm_notifier *notifier,
+ *				 const struct mmu_notifier_range *mmu_range)
+ *	{
+ *		struct drm_gpusvm_ctx ctx = { .in_notifier = true, };
+ *		struct drm_gpusvm_range *range = NULL;
+ *
+ *		driver_invalidate_device_tlb(gpusvm, mmu_range->start, mmu_range->end);
+ *
+ *		drm_gpusvm_for_each_range(range, notifier, mmu_range->start,
+ *					  mmu_range->end) {
+ *			drm_gpusvm_range_unmap_pages(gpusvm, range, &ctx);
+ *
+ *			if (mmu_range->event != MMU_NOTIFY_UNMAP)
+ *				continue;
+ *
+ *			drm_gpusvm_range_set_unmapped(range, mmu_range);
+ *			driver_garbage_collector_add(gpusvm, range);
+ *		}
+ *	}
+ */
+
+#define DRM_GPUSVM_RANGE_START(_range)	((_range)->va.start)
+#define DRM_GPUSVM_RANGE_END(_range)	((_range)->va.end - 1)
+INTERVAL_TREE_DEFINE(struct drm_gpusvm_range, rb.node, u64, rb.__subtree_last,
+		     DRM_GPUSVM_RANGE_START, DRM_GPUSVM_RANGE_END,
+		     static __maybe_unused, range);
+
+#define DRM_GPUSVM_NOTIFIER_START(_notifier)	((_notifier)->interval.start)
+#define DRM_GPUSVM_NOTIFIER_END(_notifier)	((_notifier)->interval.end - 1)
+INTERVAL_TREE_DEFINE(struct drm_gpusvm_notifier, rb.node, u64,
+		     rb.__subtree_last, DRM_GPUSVM_NOTIFIER_START,
+		     DRM_GPUSVM_NOTIFIER_END, static __maybe_unused, notifier);
+
+/**
+ * npages_in_range() - Calculate the number of pages in a given range
+ * @start__: The start address of the range
+ * @end__: The end address of the range
+ *
+ * This macro calculates the number of pages in a given memory range,
+ * specified by the start and end addresses. It divides the difference
+ * between the end and start addresses by the page size (PAGE_SIZE) to
+ * determine the number of pages in the range.
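+ *
+ * For example, with 4K pages (PAGE_SHIFT == 12), npages_in_range(0x1000,
+ * 0x5000) evaluates to (0x5000 - 0x1000) >> 12 = 4.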
+ *
+ * Return: The number of pages in the specified range.
+ */
+#define npages_in_range(start__, end__)	\
+	(((end__) - (start__)) >> PAGE_SHIFT)
+
+/**
+ * struct drm_gpusvm_zdd - GPU SVM zone device data
+ *
+ * @refcount: Reference count for the zdd
+ * @destroy_work: Work structure for asynchronous zdd destruction
+ * @range: Pointer to the GPU SVM range
+ * @vram_allocation: Driver-private pointer to the VRAM allocation
+ *
+ * This structure serves as a generic wrapper installed in
+ * page->zone_device_data. It provides infrastructure for looking up a range
+ * upon CPU page fault and asynchronously releasing VRAM once the CPU has no
+ * page references. Asynchronous release is useful because CPU page references
+ * can be dropped in IRQ contexts, while releasing VRAM likely requires sleeping
+ * locks.
+ */
+struct drm_gpusvm_zdd {
+	struct kref refcount;
+	struct work_struct destroy_work;
+	struct drm_gpusvm_range *range;
+	void *vram_allocation;
+};
+
+/**
+ * drm_gpusvm_zdd_destroy_work_func - Work function for destroying a zdd
+ * @w: Pointer to the work_struct
+ *
+ * This function releases VRAM, puts GPU SVM range, and frees zdd.
+ */
+static void drm_gpusvm_zdd_destroy_work_func(struct work_struct *w)
+{
+	struct drm_gpusvm_zdd *zdd =
+		container_of(w, struct drm_gpusvm_zdd, destroy_work);
+	struct drm_gpusvm_range *range = zdd->range;
+	struct drm_gpusvm *gpusvm = range->gpusvm;
+
+	if (gpusvm->ops->vram_release && zdd->vram_allocation)
+		gpusvm->ops->vram_release(zdd->vram_allocation);
+	drm_gpusvm_range_put(range);
+	kfree(zdd);
+}
+
+/**
+ * drm_gpusvm_zdd_alloc - Allocate a zdd structure.
+ * @range: Pointer to the GPU SVM range.
+ *
+ * This function allocates and initializes a new zdd structure. It sets up the
+ * reference count, initializes the destroy work, and links the provided GPU SVM
+ * range.
+ *
+ * Returns:
+ * Pointer to the allocated zdd on success, ERR_PTR() on failure.
+ */
+static struct drm_gpusvm_zdd *
+drm_gpusvm_zdd_alloc(struct drm_gpusvm_range *range)
+{
+	struct drm_gpusvm_zdd *zdd;
+
+	zdd = kmalloc(sizeof(*zdd), GFP_KERNEL);
+	if (!zdd)
+		return NULL;
+
+	kref_init(&zdd->refcount);
+	INIT_WORK(&zdd->destroy_work, drm_gpusvm_zdd_destroy_work_func);
+	zdd->range = drm_gpusvm_range_get(range);
+	zdd->vram_allocation = NULL;
+
+	return zdd;
+}
+
+/**
+ * drm_gpusvm_zdd_get - Get a reference to a zdd structure.
+ * @zdd: Pointer to the zdd structure.
+ *
+ * This function increments the reference count of the provided zdd structure.
+ *
+ * Returns: Pointer to the zdd structure.
+ */
+static struct drm_gpusvm_zdd *drm_gpusvm_zdd_get(struct drm_gpusvm_zdd *zdd)
+{
+	kref_get(&zdd->refcount);
+	return zdd;
+}
+
+/**
+ * drm_gpusvm_zdd_destroy - Destroy a zdd structure.
+ * @ref: Pointer to the reference count structure.
+ *
+ * This function queues the destroy_work of the zdd for asynchronous destruction.
+ */
+static void drm_gpusvm_zdd_destroy(struct kref *ref)
+{
+	struct drm_gpusvm_zdd *zdd =
+		container_of(ref, struct drm_gpusvm_zdd, refcount);
+	struct drm_gpusvm *gpusvm = zdd->range->gpusvm;
+
+	queue_work(gpusvm->zdd_wq, &zdd->destroy_work);
+}
+
+/**
+ * drm_gpusvm_zdd_put - Put a zdd reference.
+ * @zdd: Pointer to the zdd structure.
+ *
+ * This function decrements the reference count of the provided zdd structure
+ * and schedules its destruction if the count drops to zero.
+ */
+static void drm_gpusvm_zdd_put(struct drm_gpusvm_zdd *zdd)
+{
+	kref_put(&zdd->refcount, drm_gpusvm_zdd_destroy);
+}
+
+/**
+ * drm_gpusvm_range_find - Find GPU SVM range from GPU SVM notifier
+ * @notifier: Pointer to the GPU SVM notifier structure.
+ * @start: Start address of the range
+ * @end: End address of the range
+ *
+ * Return: A pointer to the drm_gpusvm_range if found or NULL
+ */
+struct drm_gpusvm_range *
+drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64 start, u64 end)
+{
+	return range_iter_first(&notifier->root, start, end - 1);
+}
+
+/**
+ * drm_gpusvm_for_each_range_safe - Safely iterate over GPU SVM ranges in a notifier
+ * @range__: Iterator variable for the ranges
+ * @next__: Iterator variable for the ranges temporary storage
+ * @notifier__: Pointer to the GPU SVM notifier
+ * @start__: Start address of the range
+ * @end__: End address of the range
+ *
+ * This macro is used to iterate over GPU SVM ranges in a notifier while
+ * removing ranges from it.
+ */
+#define drm_gpusvm_for_each_range_safe(range__, next__, notifier__, start__, end__)	\
+	for ((range__) = drm_gpusvm_range_find((notifier__), (start__), (end__)),	\
+	     (next__) = __drm_gpusvm_range_next(range__);				\
+	     (range__) && (range__->va.start < (end__));				\
+	     (range__) = (next__), (next__) = __drm_gpusvm_range_next(range__))
+
+/**
+ * __drm_gpusvm_notifier_next - get the next drm_gpusvm_notifier in the list
+ * @notifier: a pointer to the current drm_gpusvm_notifier
+ *
+ * Return: A pointer to the next drm_gpusvm_notifier if available, or NULL if
+ *         the current notifier is the last one or if the input notifier is
+ *         NULL.
+ */
+static struct drm_gpusvm_notifier *
+__drm_gpusvm_notifier_next(struct drm_gpusvm_notifier *notifier)
+{
+	if (notifier && !list_is_last(&notifier->rb.entry,
+				      &notifier->gpusvm->notifier_list))
+		return list_next_entry(notifier, rb.entry);
+
+	return NULL;
+}
+
+/**
+ * drm_gpusvm_for_each_notifier - Iterate over GPU SVM notifiers in a gpusvm
+ * @notifier__: Iterator variable for the notifiers
+ * @gpusvm__: Pointer to the GPU SVM structure
+ * @start__: Start address of the notifier
+ * @end__: End address of the notifier
+ *
+ * This macro is used to iterate over GPU SVM notifiers in a gpusvm.
+ */
+#define drm_gpusvm_for_each_notifier(notifier__, gpusvm__, start__, end__)		\
+	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root, (start__), (end__) - 1);	\
+	     (notifier__) && (notifier__->interval.start < (end__));			\
+	     (notifier__) = __drm_gpusvm_notifier_next(notifier__))
+
+/**
+ * drm_gpusvm_for_each_notifier_safe - Safely iterate over GPU SVM notifiers in a gpusvm
+ * @notifier__: Iterator variable for the notifiers
+ * @next__: Iterator variable for the notifiers temporary storage
+ * @gpusvm__: Pointer to the GPU SVM structure
+ * @start__: Start address of the notifier
+ * @end__: End address of the notifier
+ *
+ * This macro is used to iterate over GPU SVM notifiers in a gpusvm while
+ * removing notifiers from it.
+ */
+#define drm_gpusvm_for_each_notifier_safe(notifier__, next__, gpusvm__, start__, end__)	\
+	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root, (start__), (end__) - 1),	\
+	     (next__) = __drm_gpusvm_notifier_next(notifier__);				\
+	     (notifier__) && (notifier__->interval.start < (end__));			\
+	     (notifier__) = (next__), (next__) = __drm_gpusvm_notifier_next(notifier__))
+
+/**
+ * drm_gpusvm_notifier_invalidate - Invalidate a GPU SVM notifier.
+ * @mni: Pointer to the mmu_interval_notifier structure.
+ * @mmu_range: Pointer to the mmu_notifier_range structure.
+ * @cur_seq: Current sequence number.
+ *
+ * This function serves as a generic MMU notifier for GPU SVM. It sets the MMU
+ * notifier sequence number and calls the driver invalidate vfunc under
+ * gpusvm->notifier_lock.
+ *
+ * Returns:
+ * true if the operation succeeds, false otherwise.
+ */
+static bool
+drm_gpusvm_notifier_invalidate(struct mmu_interval_notifier *mni,
+			       const struct mmu_notifier_range *mmu_range,
+			       unsigned long cur_seq)
+{
+	struct drm_gpusvm_notifier *notifier =
+		container_of(mni, typeof(*notifier), notifier);
+	struct drm_gpusvm *gpusvm = notifier->gpusvm;
+
+	if (!mmu_notifier_range_blockable(mmu_range))
+		return false;
+
+	down_write(&gpusvm->notifier_lock);
+	mmu_interval_set_seq(mni, cur_seq);
+	gpusvm->ops->invalidate(gpusvm, notifier, mmu_range);
+	up_write(&gpusvm->notifier_lock);
+
+	return true;
+}
+
+/**
+ * drm_gpusvm_notifier_ops - MMU interval notifier operations for GPU SVM
+ */
+static const struct mmu_interval_notifier_ops drm_gpusvm_notifier_ops = {
+	.invalidate = drm_gpusvm_notifier_invalidate,
+};
+
+/**
+ * drm_gpusvm_init - Initialize the GPU SVM.
+ * @gpusvm: Pointer to the GPU SVM structure.
+ * @name: Name of the GPU SVM.
+ * @drm: Pointer to the DRM device structure.
+ * @mm: Pointer to the mm_struct for the address space.
+ * @device_private_page_owner: Device private pages owner.
+ * @mm_start: Start address of GPU SVM.
+ * @mm_range: Range of the GPU SVM.
+ * @notifier_size: Size of individual notifiers.
+ * @ops: Pointer to the operations structure for GPU SVM.
+ * @chunk_sizes: Pointer to the array of chunk sizes used in range allocation.
+ *               Entries should be powers of 2 in descending order with last
+ *               entry being SZ_4K.
+ * @num_chunks: Number of chunks.
+ *
+ * This function initializes the GPU SVM.
+ *
+ * Returns:
+ * 0 on success, a negative error code on failure.
+ */
+int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
+		    const char *name, struct drm_device *drm,
+		    struct mm_struct *mm, void *device_private_page_owner,
+		    u64 mm_start, u64 mm_range, u64 notifier_size,
+		    const struct drm_gpusvm_ops *ops,
+		    const u64 *chunk_sizes, int num_chunks)
+{
+	if (!ops->invalidate || !num_chunks)
+		return -EINVAL;
+
+	gpusvm->name = name;
+	gpusvm->drm = drm;
+	gpusvm->mm = mm;
+	gpusvm->device_private_page_owner = device_private_page_owner;
+	gpusvm->mm_start = mm_start;
+	gpusvm->mm_range = mm_range;
+	gpusvm->notifier_size = notifier_size;
+	gpusvm->ops = ops;
+	gpusvm->chunk_sizes = chunk_sizes;
+	gpusvm->num_chunks = num_chunks;
+	gpusvm->zdd_wq = system_wq;
+
+	mmgrab(mm);
+	gpusvm->root = RB_ROOT_CACHED;
+	INIT_LIST_HEAD(&gpusvm->notifier_list);
+
+	init_rwsem(&gpusvm->notifier_lock);
+
+	fs_reclaim_acquire(GFP_KERNEL);
+	might_lock(&gpusvm->notifier_lock);
+	fs_reclaim_release(GFP_KERNEL);
+
+	return 0;
+}
+
+/**
+ * drm_gpusvm_notifier_find - Find GPU SVM notifier
+ * @gpusvm__: Pointer to the GPU SVM structure
+ * @fault_addr__: Fault address
+ *
+ * This macro finds the GPU SVM notifier associated with the fault address.
+ *
+ * Returns:
+ * Pointer to the GPU SVM notifier on success, NULL otherwise.
+ */
+#define drm_gpusvm_notifier_find(gpusvm__, fault_addr__)	\
+	notifier_iter_first(&(gpusvm__)->root, (fault_addr__),	\
+			    (fault_addr__ + 1))
+
+/**
+ * to_drm_gpusvm_notifier - retrieve the container struct for a given rbtree node
+ * @node__: a pointer to the rbtree node embedded within a drm_gpusvm_notifier struct
+ *
+ * Return: A pointer to the containing drm_gpusvm_notifier structure.
+ */
+#define to_drm_gpusvm_notifier(node__)				\
+	container_of((node__), struct drm_gpusvm_notifier, rb.node)
+
+/**
+ * drm_gpusvm_notifier_insert - Insert GPU SVM notifier
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @notifier: Pointer to the GPU SVM notifier structure
+ *
+ * This function inserts the GPU SVM notifier into the GPU SVM RB tree and list.
+ */
+static void drm_gpusvm_notifier_insert(struct drm_gpusvm *gpusvm,
+				       struct drm_gpusvm_notifier *notifier)
+{
+	struct rb_node *node;
+	struct list_head *head;
+
+	notifier_insert(notifier, &gpusvm->root);
+
+	node = rb_prev(&notifier->rb.node);
+	if (node)
+		head = &(to_drm_gpusvm_notifier(node))->rb.entry;
+	else
+		head = &gpusvm->notifier_list;
+
+	list_add(&notifier->rb.entry, head);
+}
+
+/**
+ * drm_gpusvm_notifier_remove - Remove GPU SVM notifier
+ * @gpusvm__: Pointer to the GPU SVM structure
+ * @notifier__: Pointer to the GPU SVM notifier structure
+ *
+ * This macro removes the GPU SVM notifier from the GPU SVM RB tree and list.
+ */
+#define drm_gpusvm_notifier_remove(gpusvm__, notifier__)	\
+	notifier_remove((notifier__), &(gpusvm__)->root);	\
+	list_del(&(notifier__)->rb.entry)
+
+/**
+ * drm_gpusvm_fini - Finalize the GPU SVM.
+ * @gpusvm: Pointer to the GPU SVM structure.
+ *
+ * This function finalizes the GPU SVM by cleaning up any remaining ranges and
+ * notifiers, and dropping a reference to struct MM.
+ */
+void drm_gpusvm_fini(struct drm_gpusvm *gpusvm)
+{
+	struct drm_gpusvm_notifier *notifier, *next;
+
+	drm_gpusvm_for_each_notifier_safe(notifier, next, gpusvm, 0, LONG_MAX) {
+		struct drm_gpusvm_range *range, *__next;
+
+		/*
+		 * Remove notifier first to avoid racing with any invalidation
+		 */
+		mmu_interval_notifier_remove(&notifier->notifier);
+		notifier->flags.removed = true;
+
+		drm_gpusvm_for_each_range_safe(range, __next, notifier, 0,
+					       LONG_MAX)
+			drm_gpusvm_range_remove(gpusvm, range);
+	}
+
+	mmdrop(gpusvm->mm);
+	WARN_ON(!RB_EMPTY_ROOT(&gpusvm->root.rb_root));
+}
+
+/**
+ * drm_gpusvm_notifier_alloc - Allocate GPU SVM notifier
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @fault_addr: Fault address
+ *
+ * This function allocates and initializes the GPU SVM notifier structure.
+ *
+ * Returns:
+ * Pointer to the allocated GPU SVM notifier on success, ERR_PTR() on failure.
+ */
+static struct drm_gpusvm_notifier *
+drm_gpusvm_notifier_alloc(struct drm_gpusvm *gpusvm, u64 fault_addr)
+{
+	struct drm_gpusvm_notifier *notifier;
+
+	if (gpusvm->ops->notifier_alloc)
+		notifier = gpusvm->ops->notifier_alloc();
+	else
+		notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
+
+	if (!notifier)
+		return ERR_PTR(-ENOMEM);
+
+	notifier->gpusvm = gpusvm;
+	notifier->interval.start = ALIGN_DOWN(fault_addr, gpusvm->notifier_size);
+	notifier->interval.end = ALIGN(fault_addr + 1, gpusvm->notifier_size);
+	INIT_LIST_HEAD(&notifier->rb.entry);
+	notifier->root = RB_ROOT_CACHED;
+	INIT_LIST_HEAD(&notifier->range_list);
+
+	return notifier;
+}
+
+/**
+ * drm_gpusvm_notifier_free - Free GPU SVM notifier
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @notifier: Pointer to the GPU SVM notifier structure
+ *
+ * This function frees the GPU SVM notifier structure.
+ */
+static void drm_gpusvm_notifier_free(struct drm_gpusvm *gpusvm,
+				     struct drm_gpusvm_notifier *notifier)
+{
+	WARN_ON(!RB_EMPTY_ROOT(&notifier->root.rb_root));
+
+	if (gpusvm->ops->notifier_free)
+		gpusvm->ops->notifier_free(notifier);
+	else
+		kfree(notifier);
+}
+
+/**
+ * to_drm_gpusvm_range - retrieve the container struct for a given rbtree node
+ * @node__: a pointer to the rbtree node embedded within a drm_gpusvm_range struct
+ *
+ * Return: A pointer to the containing drm_gpusvm_range structure.
+ */
+#define to_drm_gpusvm_range(node__)	\
+	container_of((node__), struct drm_gpusvm_range, rb.node)
+
+/**
+ * drm_gpusvm_range_insert - Insert GPU SVM range
+ * @notifier: Pointer to the GPU SVM notifier structure
+ * @range: Pointer to the GPU SVM range structure
+ *
+ * This function inserts the GPU SVM range into the notifier RB tree and list.
+ */
+static void drm_gpusvm_range_insert(struct drm_gpusvm_notifier *notifier,
+				    struct drm_gpusvm_range *range)
+{
+	struct rb_node *node;
+	struct list_head *head;
+
+	drm_gpusvm_notifier_lock(notifier->gpusvm);
+	range_insert(range, &notifier->root);
+
+	node = rb_prev(&range->rb.node);
+	if (node)
+		head = &(to_drm_gpusvm_range(node))->rb.entry;
+	else
+		head = &notifier->range_list;
+
+	list_add(&range->rb.entry, head);
+	drm_gpusvm_notifier_unlock(notifier->gpusvm);
+}
+
+/**
+ * __drm_gpusvm_range_remove - Remove GPU SVM range
+ * @notifier__: Pointer to the GPU SVM notifier structure
+ * @range__: Pointer to the GPU SVM range structure
+ *
+ * This macro removes the GPU SVM range from the notifier RB tree and list.
+ */
+#define __drm_gpusvm_range_remove(notifier__, range__)		\
+	range_remove((range__), &(notifier__)->root);		\
+	list_del(&(range__)->rb.entry)
+
+/**
+ * drm_gpusvm_range_alloc - Allocate GPU SVM range
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @notifier: Pointer to the GPU SVM notifier structure
+ * @fault_addr: Fault address
+ * @chunk_size: Chunk size
+ * @migrate_vram: Flag indicating whether to migrate VRAM
+ *
+ * This function allocates and initializes the GPU SVM range structure.
+ *
+ * Returns:
+ * Pointer to the allocated GPU SVM range on success, ERR_PTR() on failure.
+ */
+static struct drm_gpusvm_range *
+drm_gpusvm_range_alloc(struct drm_gpusvm *gpusvm,
+		       struct drm_gpusvm_notifier *notifier,
+		       u64 fault_addr, u64 chunk_size, bool migrate_vram)
+{
+	struct drm_gpusvm_range *range;
+
+	if (gpusvm->ops->range_alloc)
+		range = gpusvm->ops->range_alloc(gpusvm);
+	else
+		range = kzalloc(sizeof(*range), GFP_KERNEL);
+
+	if (!range)
+		return ERR_PTR(-ENOMEM);
+
+	kref_init(&range->refcount);
+	range->gpusvm = gpusvm;
+	range->notifier = notifier;
+	range->va.start = ALIGN_DOWN(fault_addr, chunk_size);
+	range->va.end = ALIGN(fault_addr + 1, chunk_size);
+	INIT_LIST_HEAD(&range->rb.entry);
+	range->notifier_seq = LONG_MAX;
+	range->flags.migrate_vram = migrate_vram ? 1 : 0;
+
+	return range;
+}
+
+/**
+ * drm_gpusvm_check_pages - Check pages
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @notifier: Pointer to the GPU SVM notifier structure
+ * @start: Start address
+ * @end: End address
+ *
+ * Check if pages between start and end have been faulted in on the CPU. Used to
+ * prevent migration of pages without CPU backing store.
+ *
+ * Returns:
+ * True if pages have been faulted into CPU, False otherwise
+ */
+static bool drm_gpusvm_check_pages(struct drm_gpusvm *gpusvm,
+				   struct drm_gpusvm_notifier *notifier,
+				   u64 start, u64 end)
+{
+	struct hmm_range hmm_range = {
+		.default_flags = 0,
+		.notifier = &notifier->notifier,
+		.start = start,
+		.end = end,
+		.dev_private_owner = gpusvm->device_private_page_owner,
+	};
+	unsigned long timeout =
+		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
+	unsigned long *pfns;
+	unsigned long npages = npages_in_range(start, end);
+	int err, i;
+
+	mmap_assert_locked(gpusvm->mm);
+
+	pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
+	if (!pfns)
+		return false;
+
+	hmm_range.notifier_seq = mmu_interval_read_begin(&notifier->notifier);
+	hmm_range.hmm_pfns = pfns;
+
+	while (true) {
+		err = hmm_range_fault(&hmm_range);
+		if (err == -EBUSY) {
+			if (time_after(jiffies, timeout))
+				break;
+
+			hmm_range.notifier_seq = mmu_interval_read_begin(&notifier->notifier);
+			continue;
+		}
+		break;
+	}
+	if (err)
+		goto err_free;
+
+	for (i = 0; i < npages; ++i) {
+		if (!(pfns[i] & HMM_PFN_VALID)) {
+			err = -EFAULT;
+			goto err_free;
+		}
+	}
+
+err_free:
+	kvfree(pfns);
+	return err ? false : true;
+}
+
+/**
+ * drm_gpusvm_range_chunk_size - Determine chunk size for GPU SVM range
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @notifier: Pointer to the GPU SVM notifier structure
+ * @vas: Pointer to the virtual memory area structure
+ * @fault_addr: Fault address
+ * @gpuva_start: Start address of GPUVA which mirrors CPU
+ * @gpuva_end: End address of GPUVA which mirrors CPU
+ * @check_pages: Flag indicating whether to check pages
+ *
+ * This function determines the chunk size for the GPU SVM range based on the
+ * fault address, GPU SVM chunk sizes, existing GPU SVM ranges, and the virtual
+ * memory area boundaries.
+ *
+ * Returns:
+ * Chunk size on success, LONG_MAX on failure.
+ */
+static u64 drm_gpusvm_range_chunk_size(struct drm_gpusvm *gpusvm,
+				       struct drm_gpusvm_notifier *notifier,
+				       struct vm_area_struct *vas,
+				       u64 fault_addr, u64 gpuva_start,
+				       u64 gpuva_end, bool check_pages)
+{
+	u64 start, end;
+	int i = 0;
+
+retry:
+	for (; i < gpusvm->num_chunks; ++i) {
+		start = ALIGN_DOWN(fault_addr, gpusvm->chunk_sizes[i]);
+		end = ALIGN(fault_addr + 1, gpusvm->chunk_sizes[i]);
+
+		if (start >= vas->vm_start && end <= vas->vm_end &&
+		    start >= notifier->interval.start &&
+		    end <= notifier->interval.end &&
+		    start >= gpuva_start && end <= gpuva_end)
+			break;
+	}
+
+	if (i == gpusvm->num_chunks)
+		return LONG_MAX;
+
+	/*
+	 * If allocating more than a page, ensure not to overlap with existing
+	 * ranges.
+	 */
+	if (end - start != SZ_4K) {
+		struct drm_gpusvm_range *range;
+
+		range = drm_gpusvm_range_find(notifier, start, end);
+		if (range) {
+			++i;
+			goto retry;
+		}
+
+		/*
+		 * XXX: Only create range on pages CPU has faulted in. Without
+		 * this check, or prefault, on BMG 'xe_exec_system_allocator --r
+		 * process-many-malloc' fails. In the failure case, each process
+		 * mallocs 16k but the CPU VMA is ~128k which results in 64k SVM
+		 * ranges. When migrating the SVM ranges, some processes fail in
+		 * drm_gpusvm_migrate_to_vram with 'migrate.cpages != npages'
+		 * and then upon drm_gpusvm_range_get_pages device pages from
+		 * other processes are collected + faulted in which creates all
+		 * sorts of problems. Unsure exactly how this happening, also
+		 * sorts of problems. Unsure exactly how this is happening; the
+		 * problem also goes away if 'xe_exec_system_allocator --r
+		 */
+		if (check_pages &&
+		    !drm_gpusvm_check_pages(gpusvm, notifier, start, end)) {
+			++i;
+			goto retry;
+		}
+	}
+
+	return end - start;
+}
+
+/**
+ * drm_gpusvm_range_find_or_insert - Find or insert GPU SVM range
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @fault_addr: Fault address
+ * @gpuva_start: Start address of GPUVA which mirrors CPU
+ * @gpuva_end: End address of GPUVA which mirrors CPU
+ * @ctx: GPU SVM context
+ *
+ * This function finds or inserts a newly allocated GPU SVM range based on the
+ * fault address. Caller must hold a lock to protect range lookup and insertion.
+ *
+ * Returns:
+ * Pointer to the GPU SVM range on success, ERR_PTR() on failure.
+ */
+struct drm_gpusvm_range *
+drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64 fault_addr,
+				u64 gpuva_start, u64 gpuva_end,
+				const struct drm_gpusvm_ctx *ctx)
+{
+	struct drm_gpusvm_notifier *notifier;
+	struct drm_gpusvm_range *range;
+	struct mm_struct *mm = gpusvm->mm;
+	struct vm_area_struct *vas;
+	bool notifier_alloc = false;
+	u64 chunk_size;
+	int err;
+	bool migrate_vram;
+
+	if (fault_addr < gpusvm->mm_start ||
+	    fault_addr > gpusvm->mm_start + gpusvm->mm_range) {
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	if (!ctx->mmap_locked) {
+		if (!mmget_not_zero(mm)) {
+			err = -EFAULT;
+			goto err_out;
+		}
+		mmap_write_lock(mm);
+	}
+
+	mmap_assert_write_locked(mm);
+
+	notifier = drm_gpusvm_notifier_find(gpusvm, fault_addr);
+	if (!notifier) {
+		notifier = drm_gpusvm_notifier_alloc(gpusvm, fault_addr);
+		if (IS_ERR(notifier)) {
+			err = PTR_ERR(notifier);
+			goto err_mmunlock;
+		}
+		notifier_alloc = true;
+		err = mmu_interval_notifier_insert_locked(&notifier->notifier,
+							  mm, notifier->interval.start,
+							  notifier->interval.end -
+							  notifier->interval.start,
+							  &drm_gpusvm_notifier_ops);
+		if (err)
+			goto err_notifier;
+	}
+
+	vas = vma_lookup(mm, fault_addr);
+	if (!vas) {
+		err = -ENOENT;
+		goto err_notifier_remove;
+	}
+
+	if (!ctx->read_only && !(vas->vm_flags & VM_WRITE)) {
+		err = -EPERM;
+		goto err_notifier_remove;
+	}
+
+	range = drm_gpusvm_range_find(notifier, fault_addr, fault_addr + 1);
+	if (range)
+		goto out_mmunlock;
+	/*
+	 * XXX: Short-circuiting migration based on migrate_vma_* current
+	 * limitations. If/when migrate_vma_* add more support, this logic will
+	 * have to change.
+	 */
+	migrate_vram = ctx->vram_possible &&
+		vma_is_anonymous(vas) && !is_vm_hugetlb_page(vas);
+
+	chunk_size = drm_gpusvm_range_chunk_size(gpusvm, notifier, vas,
+						 fault_addr, gpuva_start,
+						 gpuva_end, migrate_vram &&
+						 !ctx->prefault);
+	if (chunk_size == LONG_MAX) {
+		err = -EINVAL;
+		goto err_notifier_remove;
+	}
+
+	range = drm_gpusvm_range_alloc(gpusvm, notifier, fault_addr, chunk_size,
+				       migrate_vram);
+	if (IS_ERR(range)) {
+		err = PTR_ERR(range);
+		goto err_notifier_remove;
+	}
+
+	drm_gpusvm_range_insert(notifier, range);
+	if (notifier_alloc)
+		drm_gpusvm_notifier_insert(gpusvm, notifier);
+
+	if (ctx->prefault) {
+		struct drm_gpusvm_ctx __ctx = *ctx;
+
+		__ctx.mmap_locked = true;
+		err = drm_gpusvm_range_get_pages(gpusvm, range, &__ctx);
+		if (err)
+			goto err_range_remove;
+	}
+
+out_mmunlock:
+	if (!ctx->mmap_locked) {
+		mmap_write_unlock(mm);
+		mmput(mm);
+	}
+
+	return range;
+
+err_range_remove:
+	__drm_gpusvm_range_remove(notifier, range);
+err_notifier_remove:
+	if (notifier_alloc)
+		mmu_interval_notifier_remove(&notifier->notifier);
+err_notifier:
+	if (notifier_alloc)
+		drm_gpusvm_notifier_free(gpusvm, notifier);
+err_mmunlock:
+	if (!ctx->mmap_locked) {
+		mmap_write_unlock(mm);
+		mmput(mm);
+	}
+err_out:
+	return ERR_PTR(err);
+}
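+
+/*
+ * Illustrative sketch, not part of the patch proper: how a driver GPU
+ * page-fault handler might use drm_gpusvm_range_find_or_insert() together
+ * with drm_gpusvm_migrate_to_vram() and drm_gpusvm_range_get_pages(). The
+ * helpers driver_vram_alloc(), driver_vram_free() and driver_bind_range()
+ * (sketched after drm_gpusvm_range_get_pages() further below) are
+ * hypothetical; VRAM migration is treated as best-effort, falling back to
+ * system RAM pages on failure.
+ *
+ *	int driver_gpu_fault(struct drm_gpusvm *gpusvm, u64 fault_addr,
+ *			     u64 gpuva_start, u64 gpuva_end)
+ *	{
+ *		struct drm_gpusvm_ctx ctx = { .vram_possible = true, };
+ *		struct drm_gpusvm_range *range;
+ *		int err;
+ *
+ *		range = drm_gpusvm_range_find_or_insert(gpusvm, fault_addr,
+ *							gpuva_start, gpuva_end,
+ *							&ctx);
+ *		if (IS_ERR(range))
+ *			return PTR_ERR(range);
+ *
+ *		if (range->flags.migrate_vram) {
+ *			void *vram = driver_vram_alloc(gpusvm, range);
+ *
+ *			if (vram &&
+ *			    drm_gpusvm_migrate_to_vram(gpusvm, range, vram,
+ *						       &ctx))
+ *				driver_vram_free(vram);
+ *		}
+ *
+ *		err = drm_gpusvm_range_get_pages(gpusvm, range, &ctx);
+ *		if (!err)
+ *			err = driver_bind_range(gpusvm, range, &ctx);
+ *
+ *		return err;
+ *	}
+ */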
+
+/**
+ * for_each_dma_page - iterate over pages in a DMA region
+ * @i__: the current page index in the iteration
+ * @j__: the current page index, log order, in the iteration
+ * @npages__: the total number of pages in the DMA region
+ * @order__: the order of the pages in the DMA region
+ *
+ * This macro iterates over each page in a DMA region. The DMA region
+ * is assumed to be composed of 2^@order__ pages, and the macro will
+ * step through the region one block of 2^@order__ pages at a time.
+ */
+#define for_each_dma_page(i__, j__, npages__, order__)	\
+	for ((i__) = 0, (j__) = 0; (i__) < (npages__);	\
+	     (j__)++, (i__) += 0x1 << (order__))
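+
+/*
+ * For example, a 16-page range that was DMA mapped as order-2 (4 page) blocks
+ * is walked as i = 0, 4, 8, 12 with j = 0, 1, 2, 3, i.e. @i__ indexes the
+ * PFN/page array while @j__ indexes the compact DMA address array (dev and
+ * dma_addr below stand in for the device and DMA address array):
+ *
+ *	for_each_dma_page(i, j, 16, 2)
+ *		dma_unmap_page(dev, dma_addr[j], PAGE_SIZE << 2,
+ *			       DMA_BIDIRECTIONAL);
+ */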
+
+/**
+ * __drm_gpusvm_range_unmap_pages - Unmap pages associated with a GPU SVM range (internal)
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @range: Pointer to the GPU SVM range structure
+ *
+ * This function unmaps pages associated with a GPU SVM range. Assumes and
+ * asserts correct locking is in place when called.
+ */
+static void __drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
+					   struct drm_gpusvm_range *range)
+{
+	lockdep_assert_held(&gpusvm->notifier_lock);
+
+	if (range->pages) {
+		unsigned long i, j, npages = npages_in_range(range->va.start,
+							     range->va.end);
+
+		if (range->flags.has_dma_mapping) {
+			for_each_dma_page(i, j, npages, range->order)
+				dma_unmap_page(gpusvm->drm->dev,
+					       range->dma_addr[j],
+					       PAGE_SIZE << range->order,
+					       DMA_BIDIRECTIONAL);
+		}
+
+		range->flags.has_vram_pages = false;
+		range->flags.has_dma_mapping = false;
+	}
+}
+
+/**
+ * drm_gpusvm_range_free_pages - Free pages associated with a GPU SVM range
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @range: Pointer to the GPU SVM range structure
+ *
+ * This function frees pages associated with a GPU SVM range.
+ */
+static void drm_gpusvm_range_free_pages(struct drm_gpusvm *gpusvm,
+					struct drm_gpusvm_range *range)
+{
+	lockdep_assert_held(&gpusvm->notifier_lock);
+
+	if (range->pages) {
+		if (range->flags.kfree_mapping) {
+			kfree(range->dma_addr);
+			range->flags.kfree_mapping = false;
+			range->pages = NULL;
+		} else {
+			kvfree(range->pages);
+			range->pages = NULL;
+		}
+	}
+}
+
+/**
+ * drm_gpusvm_range_remove - Remove GPU SVM range
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @range: Pointer to the GPU SVM range to be removed
+ *
+ * This function removes the specified GPU SVM range and also removes the parent
+ * GPU SVM notifier if no more ranges remain in the notifier. The caller must
+ * hold a lock to protect range and notifier removal.
+ */
+void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
+			     struct drm_gpusvm_range *range)
+{
+	struct drm_gpusvm_notifier *notifier;
+
+	notifier = drm_gpusvm_notifier_find(gpusvm, range->va.start);
+	if (WARN_ON_ONCE(!notifier))
+		return;
+
+	drm_gpusvm_notifier_lock(gpusvm);
+	__drm_gpusvm_range_unmap_pages(gpusvm, range);
+	drm_gpusvm_range_free_pages(gpusvm, range);
+	__drm_gpusvm_range_remove(notifier, range);
+	drm_gpusvm_notifier_unlock(gpusvm);
+
+	drm_gpusvm_range_put(range);
+
+	if (RB_EMPTY_ROOT(&notifier->root.rb_root)) {
+		if (!notifier->flags.removed)
+			mmu_interval_notifier_remove(&notifier->notifier);
+		drm_gpusvm_notifier_remove(gpusvm, notifier);
+		drm_gpusvm_notifier_free(gpusvm, notifier);
+	}
+}
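+
+/*
+ * Illustrative sketch, not part of the patch proper: a driver-side garbage
+ * collection step removing a range that the invalidation callback marked
+ * unmapped. The lock assertion, driver_unbind_range() and the driver lock
+ * itself are hypothetical; the driver lock is whatever lock the driver uses
+ * to protect range insertion/removal (see the function documentation above).
+ *
+ *	static void driver_garbage_collect(struct drm_gpusvm *gpusvm,
+ *					   struct drm_gpusvm_range *range)
+ *	{
+ *		assert_driver_svm_locked(gpusvm);
+ *
+ *		driver_unbind_range(range);
+ *		drm_gpusvm_range_remove(gpusvm, range);
+ *	}
+ */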
+
+/**
+ * drm_gpusvm_range_get - Get a reference to GPU SVM range
+ * @range: Pointer to the GPU SVM range
+ *
+ * This function increments the reference count of the specified GPU SVM range.
+ *
+ * Returns:
+ * Pointer to the GPU SVM range.
+ */
+struct drm_gpusvm_range *
+drm_gpusvm_range_get(struct drm_gpusvm_range *range)
+{
+	kref_get(&range->refcount);
+
+	return range;
+}
+
+/**
+ * drm_gpusvm_range_destroy - Destroy GPU SVM range
+ * @refcount: Pointer to the reference counter embedded in the GPU SVM range
+ *
+ * This function destroys the specified GPU SVM range when its reference count
+ * reaches zero. If a custom range-free function is provided, it is invoked to
+ * free the range; otherwise, the range is deallocated using kfree().
+ */
+static void drm_gpusvm_range_destroy(struct kref *refcount)
+{
+	struct drm_gpusvm_range *range =
+		container_of(refcount, struct drm_gpusvm_range, refcount);
+	struct drm_gpusvm *gpusvm = range->gpusvm;
+
+	if (gpusvm->ops->range_free)
+		gpusvm->ops->range_free(range);
+	else
+		kfree(range);
+}
+
+/**
+ * drm_gpusvm_range_put - Put a reference to GPU SVM range
+ * @range: Pointer to the GPU SVM range
+ *
+ * This function decrements the reference count of the specified GPU SVM range
+ * and frees it when the count reaches zero.
+ */
+void drm_gpusvm_range_put(struct drm_gpusvm_range *range)
+{
+	kref_put(&range->refcount, drm_gpusvm_range_destroy);
+}
+
+/**
+ * drm_gpusvm_range_pages_valid - GPU SVM range pages valid
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @range: Pointer to the GPU SVM range structure
+ *
+ * This function determines if a GPU SVM range's pages are valid. It is
+ * expected to be called while holding gpusvm->notifier_lock, as the last step
+ * before committing a GPU binding.
+ *
+ * Returns:
+ * True if GPU SVM range has valid pages, False otherwise
+ */
+bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
+				  struct drm_gpusvm_range *range)
+{
+	lockdep_assert_held(&gpusvm->notifier_lock);
+
+	return range->flags.has_vram_pages || range->flags.has_dma_mapping;
+}
+
+/**
+ * drm_gpusvm_range_pages_valid_unlocked - GPU SVM range pages valid unlocked
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @range: Pointer to the GPU SVM range structure
+ *
+ * This function determines if a GPU SVM range's pages are valid. It is
+ * expected to be called without holding gpusvm->notifier_lock.
+ *
+ * Returns:
+ * True if GPU SVM range has valid pages, False otherwise
+ */
+static bool
+drm_gpusvm_range_pages_valid_unlocked(struct drm_gpusvm *gpusvm,
+				      struct drm_gpusvm_range *range)
+{
+	bool pages_valid;
+
+	if (!range->pages)
+		return false;
+
+	drm_gpusvm_notifier_lock(gpusvm);
+	pages_valid = drm_gpusvm_range_pages_valid(gpusvm, range);
+	if (!pages_valid && range->flags.kfree_mapping) {
+		kfree(range->dma_addr);
+		range->flags.kfree_mapping = false;
+		range->pages = NULL;
+	}
+	drm_gpusvm_notifier_unlock(gpusvm);
+
+	return pages_valid;
+}
+
+/**
+ * drm_gpusvm_range_get_pages - Get pages for a GPU SVM range
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @range: Pointer to the GPU SVM range structure
+ * @ctx: GPU SVM context
+ *
+ * This function gets pages for a GPU SVM range and ensures they are mapped for
+ * DMA access.
+ *
+ * Returns:
+ * 0 on success, negative error code on failure.
+ */
+int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
+			       struct drm_gpusvm_range *range,
+			       const struct drm_gpusvm_ctx *ctx)
+{
+	struct mmu_interval_notifier *notifier = &range->notifier->notifier;
+	struct hmm_range hmm_range = {
+		.default_flags = HMM_PFN_REQ_FAULT | (ctx->read_only ? 0 :
+			HMM_PFN_REQ_WRITE),
+		.notifier = notifier,
+		.start = range->va.start,
+		.end = range->va.end,
+		.dev_private_owner = gpusvm->device_private_page_owner,
+	};
+	struct mm_struct *mm = gpusvm->mm;
+	unsigned long timeout =
+		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
+	unsigned long i, j;
+	unsigned long npages = npages_in_range(range->va.start, range->va.end);
+	unsigned int order = 0;
+	unsigned long *pfns;
+	struct page **pages;
+	int err = 0;
+	bool vram_pages = !!range->flags.migrate_vram;
+	bool alloc_pfns = false, kfree_mapping;
+
+retry:
+	kfree_mapping = false;
+	hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
+	if (drm_gpusvm_range_pages_valid_unlocked(gpusvm, range))
+		return 0;
+
+	if (range->notifier_seq == hmm_range.notifier_seq && range->pages) {
+		if (ctx->prefault)
+			return 0;
+
+		pfns = (unsigned long *)range->pages;
+		pages = range->pages;
+		goto map_pages;
+	}
+
+	if (!range->pages) {
+		pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
+		if (!pfns)
+			return -ENOMEM;
+		alloc_pfns = true;
+	} else {
+		pfns = (unsigned long *)range->pages;
+	}
+
+	if (!ctx->mmap_locked) {
+		if (!mmget_not_zero(mm)) {
+			err = -EFAULT;
+			goto err_out;
+		}
+	}
+
+	hmm_range.hmm_pfns = pfns;
+	while (true) {
+		/* Must be checked after mmu_interval_read_begin */
+		if (range->flags.unmapped) {
+			err = -EFAULT;
+			break;
+		}
+
+		if (!ctx->mmap_locked) {
+			/*
+			 * XXX: HMM locking document indicates only a read-lock
+			 * is required but there appears to be a window between
+			 * the MMU_NOTIFY_MIGRATE event triggered in a CPU fault
+			 * via migrate_vma_setup and the pages actually moving
+			 * in migrate_vma_finalize in which this code can grab
+			 * garbage pages. Grabbing the write-lock if the range
+			 * is attached to vram appears to protect against this
+			 * race.
+			 */
+			if (vram_pages)
+				mmap_write_lock(mm);
+			else
+				mmap_read_lock(mm);
+		}
+		err = hmm_range_fault(&hmm_range);
+		if (!ctx->mmap_locked) {
+			if (vram_pages)
+				mmap_write_unlock(mm);
+			else
+				mmap_read_unlock(mm);
+		}
+
+		if (err == -EBUSY) {
+			if (time_after(jiffies, timeout))
+				break;
+
+			hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
+			continue;
+		}
+		break;
+	}
+	if (!ctx->mmap_locked)
+		mmput(mm);
+	if (err)
+		goto err_free;
+
+	pages = (struct page **)pfns;
+
+	if (ctx->prefault) {
+		range->pages = pages;
+		goto set_seqno;
+	}
+
+map_pages:
+	if (is_device_private_page(hmm_pfn_to_page(pfns[0]))) {
+		WARN_ON_ONCE(!range->vram_allocation);
+
+		for (i = 0; i < npages; ++i) {
+			pages[i] = hmm_pfn_to_page(pfns[i]);
+
+			if (WARN_ON_ONCE(!is_device_private_page(pages[i]))) {
+				err = -EOPNOTSUPP;
+				goto err_free;
+			}
+		}
+
+		/* Do not race with notifier unmapping pages */
+		drm_gpusvm_notifier_lock(gpusvm);
+		range->flags.has_vram_pages = true;
+		range->pages = pages;
+		if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
+			err = -EAGAIN;
+			__drm_gpusvm_range_unmap_pages(gpusvm, range);
+		}
+		drm_gpusvm_notifier_unlock(gpusvm);
+	} else {
+		dma_addr_t *dma_addr = (dma_addr_t *)pfns;
+
+		for_each_dma_page(i, j, npages, order) {
+			if (WARN_ON_ONCE(i && order !=
+					 hmm_pfn_to_map_order(pfns[i]))) {
+				err = -EOPNOTSUPP;
+				npages = i;
+				goto err_unmap;
+			}
+			order = hmm_pfn_to_map_order(pfns[i]);
+
+			pages[j] = hmm_pfn_to_page(pfns[i]);
+			if (WARN_ON_ONCE(is_zone_device_page(pages[j]))) {
+				err = -EOPNOTSUPP;
+				npages = i;
+				goto err_unmap;
+			}
+
+			set_page_dirty_lock(pages[j]);
+			mark_page_accessed(pages[j]);
+
+			dma_addr[j] = dma_map_page(gpusvm->drm->dev,
+						   pages[j], 0,
+						   PAGE_SIZE << order,
+						   DMA_BIDIRECTIONAL);
+			if (dma_mapping_error(gpusvm->drm->dev, dma_addr[j])) {
+				err = -EFAULT;
+				npages = i;
+				goto err_unmap;
+			}
+		}
+
+		/* Huge pages, reduce memory footprint */
+		if (order) {
+			dma_addr = kmalloc_array(j, sizeof(*dma_addr),
+						 GFP_KERNEL);
+			if (dma_addr) {
+				for (i = 0; i < j; ++i)
+					dma_addr[i] = (dma_addr_t)pfns[i];
+				kvfree(pfns);
+				kfree_mapping = true;
+			} else {
+				dma_addr = (dma_addr_t *)pfns;
+			}
+		}
+
+		/* Do not race with notifier unmapping pages */
+		drm_gpusvm_notifier_lock(gpusvm);
+		range->order = order;
+		range->flags.kfree_mapping = kfree_mapping;
+		range->flags.has_dma_mapping = true;
+		range->dma_addr = dma_addr;
+		range->vram_allocation = NULL;
+		if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
+			err = -EAGAIN;
+			__drm_gpusvm_range_unmap_pages(gpusvm, range);
+		}
+		drm_gpusvm_notifier_unlock(gpusvm);
+	}
+
+	if (err == -EAGAIN)
+		goto retry;
+set_seqno:
+	range->notifier_seq = hmm_range.notifier_seq;
+
+	return 0;
+
+err_unmap:
+	for_each_dma_page(i, j, npages, order)
+		dma_unmap_page(gpusvm->drm->dev,
+			       (dma_addr_t)pfns[j],
+			       PAGE_SIZE << order, DMA_BIDIRECTIONAL);
+err_free:
+	if (alloc_pfns)
+		kvfree(pfns);
+err_out:
+	return err;
+}
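+
+/*
+ * Illustrative sketch, not part of the patch proper: the retry loop a driver
+ * is expected to wrap around drm_gpusvm_range_get_pages() and its GPU bind.
+ * drm_gpusvm_range_pages_valid() is re-checked under the notifier lock as the
+ * last step before committing the bind; if an invalidation raced with getting
+ * the pages, the loop simply re-acquires them. driver_commit_bind() is
+ * hypothetical and stands for writing the GPU page tables.
+ *
+ *	int driver_bind_range(struct drm_gpusvm *gpusvm,
+ *			      struct drm_gpusvm_range *range,
+ *			      const struct drm_gpusvm_ctx *ctx)
+ *	{
+ *		int err;
+ *
+ *	again:
+ *		err = drm_gpusvm_range_get_pages(gpusvm, range, ctx);
+ *		if (err)
+ *			return err;
+ *
+ *		drm_gpusvm_notifier_lock(gpusvm);
+ *		if (!drm_gpusvm_range_pages_valid(gpusvm, range)) {
+ *			drm_gpusvm_notifier_unlock(gpusvm);
+ *			goto again;
+ *		}
+ *		err = driver_commit_bind(range);
+ *		drm_gpusvm_notifier_unlock(gpusvm);
+ *
+ *		return err;
+ *	}
+ */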
+
+/**
+ * drm_gpusvm_range_unmap_pages - Unmap pages associated with a GPU SVM range
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @range: Pointer to the GPU SVM range structure
+ * @ctx: GPU SVM context
+ *
+ * This function unmaps pages associated with a GPU SVM range. If @in_notifier
+ * is set, it is assumed that gpusvm->notifier_lock is held in write mode; if it
+ * is clear, it acquires gpusvm->notifier_lock in read mode. Must be called on
+ * each GPU SVM range attached to the notifier in gpusvm->ops->invalidate for
+ * the IOMMU security model.
+ */
+void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
+				  struct drm_gpusvm_range *range,
+				  const struct drm_gpusvm_ctx *ctx)
+{
+	if (ctx->in_notifier)
+		lockdep_assert_held_write(&gpusvm->notifier_lock);
+	else
+		drm_gpusvm_notifier_lock(gpusvm);
+
+	__drm_gpusvm_range_unmap_pages(gpusvm, range);
+
+	if (!ctx->in_notifier)
+		drm_gpusvm_notifier_unlock(gpusvm);
+}
+
+/**
+ * drm_gpusvm_migration_put_page - Put a migration page
+ * @page: Pointer to the page to put
+ *
+ * This function unlocks and puts a page.
+ */
+static void drm_gpusvm_migration_put_page(struct page *page)
+{
+	unlock_page(page);
+	put_page(page);
+}
+
+/**
+ * drm_gpusvm_migration_put_pages - Put migration pages
+ * @npages: Number of pages
+ * @migrate_pfn: Array of migrate page frame numbers
+ *
+ * This function puts an array of pages.
+ */
+static void drm_gpusvm_migration_put_pages(unsigned long npages,
+					   unsigned long *migrate_pfn)
+{
+	unsigned long i;
+
+	for (i = 0; i < npages; ++i) {
+		if (!migrate_pfn[i])
+			continue;
+
+		drm_gpusvm_migration_put_page(migrate_pfn_to_page(migrate_pfn[i]));
+		migrate_pfn[i] = 0;
+	}
+}
+
+/**
+ * drm_gpusvm_get_vram_page - Get a reference to a VRAM page
+ * @page: Pointer to the page
+ * @zdd: Pointer to the GPU SVM zone device data
+ *
+ * This function associates the given page with the specified GPU SVM zone
+ * device data and initializes it for zone device usage.
+ */
+static void drm_gpusvm_get_vram_page(struct page *page,
+				     struct drm_gpusvm_zdd *zdd)
+{
+	page->zone_device_data = drm_gpusvm_zdd_get(zdd);
+	zone_device_page_init(page);
+}
+
+/**
+ * drm_gpusvm_migrate_map_pages() - Map migration pages for GPU SVM migration
+ * @dev: The device for which the pages are being mapped
+ * @dma_addr: Array to store DMA addresses corresponding to mapped pages
+ * @migrate_pfn: Array of migrate page frame numbers to map
+ * @npages: Number of pages to map
+ * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
+ *
+ * This function maps pages of memory for migration usage in GPU SVM. It
+ * iterates over each page frame number provided in @migrate_pfn, maps the
+ * corresponding page, and stores the DMA address in the provided @dma_addr
+ * array.
+ *
+ * Return: 0 on success, -EFAULT if an error occurs during mapping.
+ */
+static int drm_gpusvm_migrate_map_pages(struct device *dev,
+					dma_addr_t *dma_addr,
+					long unsigned int *migrate_pfn,
+					unsigned long npages,
+					enum dma_data_direction dir)
+{
+	unsigned long i;
+
+	for (i = 0; i < npages; ++i) {
+		struct page *page = migrate_pfn_to_page(migrate_pfn[i]);
+
+		if (!page)
+			continue;
+
+		if (WARN_ON_ONCE(is_zone_device_page(page)))
+			return -EFAULT;
+
+		dma_addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir);
+		if (dma_mapping_error(dev, dma_addr[i]))
+			return -EFAULT;
+	}
+
+	return 0;
+}
+
+/**
+ * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously mapped for GPU SVM migration
+ * @dev: The device for which the pages were mapped
+ * @dma_addr: Array of DMA addresses corresponding to mapped pages
+ * @npages: Number of pages to unmap
+ * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
+ *
+ * This function unmaps previously mapped pages of memory for GPU Shared Virtual
+ * Memory (SVM). It iterates over each DMA address provided in @dma_addr, checks
+ * if it's valid and not already unmapped, and unmaps the corresponding page.
+ */
+static void drm_gpusvm_migrate_unmap_pages(struct device *dev,
+					   dma_addr_t *dma_addr,
+					   unsigned long npages,
+					   enum dma_data_direction dir)
+{
+	unsigned long i;
+
+	for (i = 0; i < npages; ++i) {
+		if (!dma_addr[i] || dma_mapping_error(dev, dma_addr[i]))
+			continue;
+
+		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
+	}
+}
+
+/**
+ * drm_gpusvm_migrate_to_vram - Migrate GPU SVM range to VRAM
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @range: Pointer to the GPU SVM range structure
+ * @vram_allocation: Driver-private pointer to the VRAM allocation. The caller
+ *                   should hold a reference to the VRAM allocation, which
+ *                   should be dropped via ops->vram_release or upon the
+ *                   failure of this function.
+ * @ctx: GPU SVM context
+ *
+ * This function migrates the specified GPU SVM range to VRAM. It performs the
+ * necessary setup and invokes the driver-specific operations for migration to
+ * VRAM. Upon successful return, @vram_allocation can safely reference @range
+ * until ops->vram_release is called, which only happens upon successful
+ * return.
+ *
+ * Returns:
+ * 0 on success, negative error code on failure.
+ */
+int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
+			       struct drm_gpusvm_range *range,
+			       void *vram_allocation,
+			       const struct drm_gpusvm_ctx *ctx)
+{
+	u64 start = range->va.start, end = range->va.end;
+	struct migrate_vma migrate = {
+		.start		= start,
+		.end		= end,
+		.pgmap_owner	= gpusvm->device_private_page_owner,
+		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
+	};
+	struct mm_struct *mm = gpusvm->mm;
+	unsigned long i, npages = npages_in_range(start, end);
+	struct vm_area_struct *vas;
+	struct drm_gpusvm_zdd *zdd = NULL;
+	struct page **pages;
+	dma_addr_t *dma_addr;
+	void *buf;
+	int err;
+
+	if (!range->flags.migrate_vram)
+		return -EINVAL;
+
+	if (!gpusvm->ops->populate_vram_pfn || !gpusvm->ops->copy_to_vram ||
+	    !gpusvm->ops->copy_to_sram)
+		return -EOPNOTSUPP;
+
+	if (!ctx->mmap_locked) {
+		if (!mmget_not_zero(mm)) {
+			err = -EFAULT;
+			goto err_out;
+		}
+		mmap_write_lock(mm);
+	}
+
+	mmap_assert_locked(mm);
+
+	vas = vma_lookup(mm, start);
+	if (!vas) {
+		err = -ENOENT;
+		goto err_mmunlock;
+	}
+
+	if (end > vas->vm_end || start < vas->vm_start) {
+		err = -EINVAL;
+		goto err_mmunlock;
+	}
+
+	if (!vma_is_anonymous(vas)) {
+		err = -EBUSY;
+		goto err_mmunlock;
+	}
+
+	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
+		       sizeof(*pages), GFP_KERNEL);
+	if (!buf) {
+		err = -ENOMEM;
+		goto err_mmunlock;
+	}
+	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
+	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
+
+	zdd = drm_gpusvm_zdd_alloc(range);
+	if (!zdd) {
+		err = -ENOMEM;
+		goto err_free;
+	}
+
+	migrate.vma = vas;
+	migrate.src = buf;
+	migrate.dst = migrate.src + npages;
+
+	err = migrate_vma_setup(&migrate);
+	if (err)
+		goto err_free;
+
+	/*
+	 * FIXME: The cases below, !migrate.cpages and migrate.cpages != npages,
+	 * are not always errors. The possible cases and how to handle them need
+	 * to be revisited; we could prefault on migrate.cpages != npages via
+	 * hmm_range_fault.
+	 */
+
+	if (!migrate.cpages) {
+		err = -EFAULT;
+		goto err_free;
+	}
+
+	if (migrate.cpages != npages) {
+		err = -EBUSY;
+		goto err_finalize;
+	}
+
+	err = gpusvm->ops->populate_vram_pfn(gpusvm, vram_allocation, npages,
+					     migrate.dst);
+	if (err)
+		goto err_finalize;
+
+	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
+					   migrate.src, npages, DMA_TO_DEVICE);
+	if (err)
+		goto err_finalize;
+
+	for (i = 0; i < npages; ++i) {
+		struct page *page = pfn_to_page(migrate.dst[i]);
+
+		pages[i] = page;
+		migrate.dst[i] = migrate_pfn(migrate.dst[i]);
+		drm_gpusvm_get_vram_page(page, zdd);
+	}
+
+	err = gpusvm->ops->copy_to_vram(gpusvm, pages, dma_addr, npages);
+	if (err)
+		goto err_finalize;
+
+	/* Upon success bind vram allocation to range and zdd */
+	range->vram_allocation = vram_allocation;
+	WRITE_ONCE(zdd->vram_allocation, vram_allocation);	/* Owns ref */
+
+err_finalize:
+	if (err)
+		drm_gpusvm_migration_put_pages(npages, migrate.dst);
+	migrate_vma_pages(&migrate);
+	migrate_vma_finalize(&migrate);
+	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
+				       DMA_TO_DEVICE);
+err_free:
+	if (zdd)
+		drm_gpusvm_zdd_put(zdd);
+	kvfree(buf);
+err_mmunlock:
+	if (!ctx->mmap_locked) {
+		mmap_write_unlock(mm);
+		mmput(mm);
+	}
+err_out:
+	return err;
+}
+
+/**
+ * drm_gpusvm_migrate_populate_sram_pfn - Populate SRAM PFNs for a VM area
+ * @vas: Pointer to the VM area structure, can be NULL
+ * @npages: Number of pages to populate
+ * @src_mpfn: Source array of migrate PFNs
+ * @mpfn: Array of migrate PFNs to populate
+ * @addr: Start address for PFN allocation
+ *
+ * This function populates the SRAM migrate page frame numbers (PFNs) for the
+ * specified VM area structure. It allocates and locks pages in the VM area for
+ * SRAM usage. If @vas is non-NULL, alloc_page_vma() is used for allocation;
+ * otherwise alloc_page() is used.
+ *
+ * Returns:
+ * 0 on success, negative error code on failure.
+ */
+static int drm_gpusvm_migrate_populate_sram_pfn(struct vm_area_struct *vas,
+						unsigned long npages,
+						unsigned long *src_mpfn,
+						unsigned long *mpfn, u64 addr)
+{
+	unsigned long i;
+
+	for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
+		struct page *page;
+
+		if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
+			continue;
+
+		if (vas)
+			page = alloc_page_vma(GFP_HIGHUSER, vas, addr);
+		else
+			page = alloc_page(GFP_HIGHUSER);
+
+		if (!page)
+			return -ENOMEM;
+
+		lock_page(page);
+		mpfn[i] = migrate_pfn(page_to_pfn(page));
+	}
+
+	return 0;
+}
+
+/**
+ * drm_gpusvm_evict_to_sram - Evict GPU SVM range to SRAM
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @range: Pointer to the GPU SVM range structure
+ *
+ * Similar to __drm_gpusvm_migrate_to_sram but does not require the mmap lock;
+ * migration is done via the migrate_device_* functions. This is a fallback
+ * path, as it is preferred to issue migrations with the mmap lock held.
+ *
+ * Returns:
+ * 0 on success, negative error code on failure.
+ */
+static int drm_gpusvm_evict_to_sram(struct drm_gpusvm *gpusvm,
+				    struct drm_gpusvm_range *range)
+{
+	unsigned long npages;
+	struct page **pages;
+	unsigned long *src, *dst;
+	dma_addr_t *dma_addr;
+	void *buf;
+	int i, err = 0;
+
+	npages = npages_in_range(range->va.start, range->va.end);
+
+	buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr) +
+		       sizeof(*pages), GFP_KERNEL);
+	if (!buf) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	src = buf;
+	dst = buf + (sizeof(*src) * npages);
+	dma_addr = buf + (2 * sizeof(*src) * npages);
+	pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) * npages;
+
+	err = gpusvm->ops->populate_vram_pfn(gpusvm, range->vram_allocation,
+					     npages, src);
+	if (err)
+		goto err_free;
+
+	err = migrate_device_vma_range(gpusvm->mm,
+				       gpusvm->device_private_page_owner, src,
+				       npages, range->va.start);
+	if (err)
+		goto err_free;
+
+	err = drm_gpusvm_migrate_populate_sram_pfn(NULL, npages, src, dst, 0);
+	if (err)
+		goto err_finalize;
+
+	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
+					   dst, npages, DMA_BIDIRECTIONAL);
+	if (err)
+		goto err_finalize;
+
+	for (i = 0; i < npages; ++i)
+		pages[i] = migrate_pfn_to_page(src[i]);
+
+	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr, npages);
+	if (err)
+		goto err_finalize;
+
+err_finalize:
+	if (err)
+		drm_gpusvm_migration_put_pages(npages, dst);
+	migrate_device_pages(src, dst, npages);
+	migrate_device_finalize(src, dst, npages);
+	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
+				       DMA_BIDIRECTIONAL);
+err_free:
+	kvfree(buf);
+err_out:
+
+	return err;
+}
+
+/**
+ * __drm_gpusvm_migrate_to_sram - Migrate GPU SVM range to SRAM (internal)
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @vas: Pointer to the VM area structure
+ * @page: Pointer to the page for fault handling (can be NULL)
+ * @start: Start address of the migration range
+ * @end: End address of the migration range
+ *
+ * This internal function performs the migration of the specified GPU SVM range
+ * to SRAM. It sets up the migration, populates + dma maps SRAM PFNs, and
+ * invokes the driver-specific operations for migration to SRAM.
+ *
+ * Returns:
+ * 0 on success, negative error code on failure.
+ */
+static int __drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
+					struct vm_area_struct *vas,
+					struct page *page,
+					u64 start, u64 end)
+{
+	struct migrate_vma migrate = {
+		.vma		= vas,
+		.pgmap_owner	= gpusvm->device_private_page_owner,
+		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
+		.fault_page	= page,
+	};
+	unsigned long npages;
+	struct page **pages;
+	dma_addr_t *dma_addr;
+	void *buf;
+	int i, err = 0;
+
+	mmap_assert_locked(gpusvm->mm);
+
+	/* Corner case where the VM area struct has been partially unmapped */
+	if (start < vas->vm_start)
+		start = vas->vm_start;
+	if (end > vas->vm_end)
+		end = vas->vm_end;
+
+	migrate.start = start;
+	migrate.end = end;
+	npages = npages_in_range(start, end);
+
+	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
+		       sizeof(*pages), GFP_KERNEL);
+	if (!buf) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
+	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
+
+	migrate.vma = vas;
+	migrate.src = buf;
+	migrate.dst = migrate.src + npages;
+
+	err = migrate_vma_setup(&migrate);
+	if (err)
+		goto err_free;
+
+	/* Raced with another CPU fault, nothing to do */
+	if (!migrate.cpages)
+		goto err_free;
+
+	err = drm_gpusvm_migrate_populate_sram_pfn(vas, npages,
+						   migrate.src, migrate.dst,
+						   start);
+	if (err)
+		goto err_finalize;
+
+	err = drm_gpusvm_migrate_map_pages(gpusvm->drm->dev, dma_addr,
+					   migrate.dst, npages,
+					   DMA_BIDIRECTIONAL);
+	if (err)
+		goto err_finalize;
+
+	for (i = 0; i < npages; ++i)
+		pages[i] = migrate_pfn_to_page(migrate.src[i]);
+
+	err = gpusvm->ops->copy_to_sram(gpusvm, pages, dma_addr, npages);
+	if (err)
+		goto err_finalize;
+
+err_finalize:
+	if (err)
+		drm_gpusvm_migration_put_pages(npages, migrate.dst);
+	migrate_vma_pages(&migrate);
+	migrate_vma_finalize(&migrate);
+	drm_gpusvm_migrate_unmap_pages(gpusvm->drm->dev, dma_addr, npages,
+				       DMA_BIDIRECTIONAL);
+err_free:
+	kvfree(buf);
+err_out:
+	mmap_assert_locked(gpusvm->mm);
+
+	return err;
+}
+
+/**
+ * drm_gpusvm_migrate_to_sram - Migrate (evict) GPU SVM range to SRAM
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @range: Pointer to the GPU SVM range structure
+ * @ctx: GPU SVM context
+ *
+ * This function initiates the migration of the specified GPU SVM range to
+ * SRAM. It performs necessary checks and invokes the internal migration
+ * function for actual migration.
+ *
+ * Returns:
+ * 0 on success, negative error code on failure.
+ */
+int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
+			       struct drm_gpusvm_range *range,
+			       const struct drm_gpusvm_ctx *ctx)
+{
+	u64 start = range->va.start, end = range->va.end;
+	struct mm_struct *mm = gpusvm->mm;
+	struct vm_area_struct *vas;
+	int err;
+	bool retry = false;
+
+	if (!ctx->mmap_locked) {
+		if (!mmget_not_zero(mm)) {
+			err = -EFAULT;
+			goto err_out;
+		}
+		if (ctx->trylock_mmap) {
+			if (!mmap_read_trylock(mm))  {
+				err = drm_gpusvm_evict_to_sram(gpusvm, range);
+				goto err_mmput;
+			}
+		} else {
+			mmap_read_lock(mm);
+		}
+	}
+
+	mmap_assert_locked(mm);
+
+	/*
+	 * Loop required to find all VM area structs for the corner case when
+	 * VRAM backing has been partially unmapped from the MM's address space.
+	 */
+again:
+	vas = find_vma(mm, start);
+	if (!vas) {
+		if (!retry)
+			err = -ENOENT;
+		goto err_mmunlock;
+	}
+
+	if (end <= vas->vm_start || start >= vas->vm_end) {
+		if (!retry)
+			err = -EINVAL;
+		goto err_mmunlock;
+	}
+
+	err = __drm_gpusvm_migrate_to_sram(gpusvm, vas, NULL, start, end);
+	if (err)
+		goto err_mmunlock;
+
+	if (vas->vm_end < end) {
+		retry = true;
+		start = vas->vm_end;
+		goto again;
+	}
+
+	if (!ctx->mmap_locked) {
+		mmap_read_unlock(mm);
+		/*
+		 * Using mmput_async as this function can be called while
+		 * holding a dma-resv lock, and a final put can grab the mmap
+		 * lock, causing a lock inversion.
+		 */
+		mmput_async(mm);
+	}
+
+	return 0;
+
+err_mmunlock:
+	if (!ctx->mmap_locked)
+		mmap_read_unlock(mm);
+err_mmput:
+	if (!ctx->mmap_locked)
+		mmput_async(mm);
+err_out:
+	return err;
+}
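+
+/*
+ * Illustrative sketch, not part of the patch proper: evicting a range back to
+ * system RAM from a path that may already hold a dma-resv lock (e.g. a BO
+ * eviction callback). Setting trylock_mmap lets the helper fall back to
+ * drm_gpusvm_evict_to_sram() instead of risking a lock inversion on the mmap
+ * lock; error handling is elided and driver_evict_range() is hypothetical.
+ *
+ *	void driver_evict_range(struct drm_gpusvm *gpusvm,
+ *				struct drm_gpusvm_range *range)
+ *	{
+ *		struct drm_gpusvm_ctx ctx = { .trylock_mmap = true, };
+ *
+ *		drm_gpusvm_migrate_to_sram(gpusvm, range, &ctx);
+ *	}
+ */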
+
+/**
+ * drm_gpusvm_page_free - Put GPU SVM zone device data associated with a page
+ * @page: Pointer to the page
+ *
+ * This function is a callback used to put the GPU SVM zone device data
+ * associated with a page when it is being released.
+ */
+static void drm_gpusvm_page_free(struct page *page)
+{
+	drm_gpusvm_zdd_put(page->zone_device_data);
+}
+
+/**
+ * drm_gpusvm_migrate_to_ram - Migrate GPU SVM range to RAM (page fault handler)
+ * @vmf: Pointer to the fault information structure
+ *
+ * This function is a page fault handler used to migrate a GPU SVM range to RAM.
+ * It retrieves the GPU SVM range information from the faulting page and invokes
+ * the internal migration function to migrate the range back to RAM.
+ *
+ * Returns:
+ * VM_FAULT_SIGBUS on failure, 0 on success.
+ */
+static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault *vmf)
+{
+	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
+	int err;
+
+	err = __drm_gpusvm_migrate_to_sram(zdd->range->gpusvm,
+					   vmf->vma, vmf->page,
+					   zdd->range->va.start,
+					   zdd->range->va.end);
+
+	return err ? VM_FAULT_SIGBUS : 0;
+}
+
+/**
+ * drm_gpusvm_pagemap_ops - Device page map operations for GPU SVM
+ */
+static const struct dev_pagemap_ops drm_gpusvm_pagemap_ops = {
+	.page_free = drm_gpusvm_page_free,
+	.migrate_to_ram = drm_gpusvm_migrate_to_ram,
+};
+
+/**
+ * drm_gpusvm_pagemap_ops_get - Retrieve GPU SVM device page map operations
+ *
+ * Returns:
+ * Pointer to the GPU SVM device page map operations structure.
+ */
+const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void)
+{
+	return &drm_gpusvm_pagemap_ops;
+}
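+
+/*
+ * Illustrative sketch, not part of the patch proper: wiring these operations
+ * into the dev_pagemap backing the device's VRAM. It assumes a device-private
+ * physical range has already been reserved (e.g. via
+ * request_free_mem_region()) and that pgmap->owner matches the
+ * device_private_page_owner passed to drm_gpusvm_init(); res, pgmap and
+ * vram_base are hypothetical driver variables.
+ *
+ *	pgmap->type = MEMORY_DEVICE_PRIVATE;
+ *	pgmap->range.start = res->start;
+ *	pgmap->range.end = res->end;
+ *	pgmap->nr_range = 1;
+ *	pgmap->ops = drm_gpusvm_pagemap_ops_get();
+ *	pgmap->owner = device_private_page_owner;
+ *	vram_base = devm_memremap_pages(drm->dev, pgmap);
+ */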
+
+/**
+ * drm_gpusvm_has_mapping - Check if GPU SVM has mapping for the given address range
+ * @gpusvm: Pointer to the GPU SVM structure.
+ * @start: Start address
+ * @end: End address
+ *
+ * Returns:
+ * True if GPU SVM has mapping, False otherwise
+ */
+bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start, u64 end)
+{
+	struct drm_gpusvm_notifier *notifier;
+
+	drm_gpusvm_for_each_notifier(notifier, gpusvm, start, end) {
+		struct drm_gpusvm_range *range = NULL;
+
+		drm_gpusvm_for_each_range(range, notifier, start, end)
+			return true;
+	}
+
+	return false;
+}
diff --git a/drivers/gpu/drm/xe/drm_gpusvm.h b/drivers/gpu/drm/xe/drm_gpusvm.h
new file mode 100644
index 000000000000..0ea70f8534a8
--- /dev/null
+++ b/drivers/gpu/drm/xe/drm_gpusvm.h
@@ -0,0 +1,415 @@ 
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2024 Intel Corporation
+ */
+
+#ifndef __DRM_GPUSVM_H__
+#define __DRM_GPUSVM_H__
+
+#include <linux/kref.h>
+#include <linux/mmu_notifier.h>
+#include <linux/workqueue.h>
+
+struct dev_pagemap_ops;
+struct drm_device;
+struct drm_gpusvm;
+struct drm_gpusvm_notifier;
+struct drm_gpusvm_ops;
+struct drm_gpusvm_range;
+
+/**
+ * struct drm_gpusvm_ops - Operations structure for GPU SVM
+ *
+ * This structure defines the operations for GPU Shared Virtual Memory (SVM).
+ * These operations are provided by the GPU driver to manage SVM ranges and
+ * perform operations such as migration between VRAM and system RAM.
+ */
+struct drm_gpusvm_ops {
+	/**
+	 * @notifier_alloc: Allocate a GPU SVM notifier (optional)
+	 *
+	 * This function shall allocate a GPU SVM notifier.
+	 *
+	 * Returns:
+	 * Pointer to the allocated GPU SVM notifier on success, NULL on failure.
+	 */
+	struct drm_gpusvm_notifier *(*notifier_alloc)(void);
+
+	/**
+	 * @notifier_free: Free a GPU SVM notifier (optional)
+	 * @notifier: Pointer to the GPU SVM notifier to be freed
+	 *
+	 * This function shall free a GPU SVM notifier.
+	 */
+	void (*notifier_free)(struct drm_gpusvm_notifier *notifier);
+
+	/**
+	 * @range_alloc: Allocate a GPU SVM range (optional)
+	 * @gpusvm: Pointer to the GPU SVM
+	 *
+	 * This function shall allocate a GPU SVM range.
+	 *
+	 * Returns:
+	 * Pointer to the allocated GPU SVM range on success, NULL on failure.
+	 */
+	struct drm_gpusvm_range *(*range_alloc)(struct drm_gpusvm *gpusvm);
+
+	/**
+	 * @range_free: Free a GPU SVM range (optional)
+	 * @range: Pointer to the GPU SVM range to be freed
+	 *
+	 * This function shall free a GPU SVM range.
+	 */
+	void (*range_free)(struct drm_gpusvm_range *range);
+
+	/**
+	 * @vram_release: Release VRAM allocation (optional)
+	 * @vram_allocation: Driver-private pointer to the VRAM allocation
+	 *
+	 * This function shall release VRAM allocation and expects to drop a
+	 * reference to VRAM allocation.
+	 */
+	void (*vram_release)(void *vram_allocation);
+
+	/**
+	 * @populate_vram_pfn: Populate VRAM PFN (required for migration)
+	 * @gpusvm: Pointer to the GPU SVM
+	 * @vram_allocation: Driver-private pointer to the VRAM allocation
+	 * @npages: Number of pages to populate
+	 * @pfn: Array of page frame numbers to populate
+	 *
+	 * This function shall populate VRAM page frame numbers (PFN).
+	 *
+	 * Returns:
+	 * 0 on success, a negative error code on failure.
+	 */
+	int (*populate_vram_pfn)(struct drm_gpusvm *gpusvm,
+				 void *vram_allocation,
+				 unsigned long npages,
+				 unsigned long *pfn);
+
+	/**
+	 * @copy_to_vram: Copy to VRAM (required for migration)
+	 * @gpusvm: Pointer to the GPU SVM
+	 * @pages: Pointer to array of VRAM pages (destination)
+	 * @dma_addr: Pointer to array of DMA addresses (source)
+	 * @npages: Number of pages to copy
+	 *
+	 * This function shall copy pages to VRAM.
+	 *
+	 * Returns:
+	 * 0 on success, a negative error code on failure.
+	 */
+	int (*copy_to_vram)(struct drm_gpusvm *gpusvm,
+			    struct page **pages,
+			    dma_addr_t *dma_addr,
+			    unsigned long npages);
+
+	/**
+	 * @copy_to_sram: Copy to system RAM (required for migration)
+	 * @gpusvm: Pointer to the GPU SVM
+	 * @pages: Pointer to array of VRAM pages (source)
+	 * @dma_addr: Pointer to array of DMA addresses (destination)
+	 * @npages: Number of pages to copy
+	 *
+	 * This function shall copy pages to system RAM.
+	 *
+	 * Returns:
+	 * 0 on success, a negative error code on failure.
+	 */
+	int (*copy_to_sram)(struct drm_gpusvm *gpusvm,
+			    struct page **pages,
+			    dma_addr_t *dma_addr,
+			    unsigned long npages);
+
+	/**
+	 * @invalidate: Invalidate GPU SVM notifier (required)
+	 * @gpusvm: Pointer to the GPU SVM
+	 * @notifier: Pointer to the GPU SVM notifier
+	 * @mmu_range: Pointer to the mmu_notifier_range structure
+	 *
+	 * This function shall invalidate the GPU page tables. It can safely
+	 * walk the notifier range RB tree/list in this function. Called while
+	 * holding the notifier lock.
+	 */
+	void (*invalidate)(struct drm_gpusvm *gpusvm,
+			   struct drm_gpusvm_notifier *notifier,
+			   const struct mmu_notifier_range *mmu_range);
+};
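+
+/*
+ * Illustrative sketch, not part of the patch proper: a minimal operations
+ * structure for a driver that does not migrate to VRAM. Only the required
+ * @invalidate hook is populated; the migration hooks may presumably be left
+ * NULL as long as vram_possible is never set and drm_gpusvm_migrate_to_vram()
+ * is never called. driver_invalidate() is hypothetical; an example
+ * implementation is sketched near drm_gpusvm_for_each_range() below.
+ *
+ *	static const struct drm_gpusvm_ops driver_gpusvm_ops = {
+ *		.invalidate = driver_invalidate,
+ *	};
+ */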
+
+/**
+ * struct drm_gpusvm_notifier - Structure representing a GPU SVM notifier
+ *
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @notifier: MMU interval notifier
+ * @interval: Interval for the notifier
+ * @rb: Red-black tree node for the parent GPU SVM structure notifier tree
+ * @root: Cached root node of the RB tree containing ranges
+ * @range_list: List head containing ranges in the same order they appear in
+ *              the interval tree. This is useful to keep iterating over ranges
+ *              while doing modifications to the RB tree.
+ * @flags.removed: Flag indicating whether the MMU interval notifier has been
+ *                 removed
+ *
+ * This structure represents a GPU SVM notifier.
+ */
+struct drm_gpusvm_notifier {
+	struct drm_gpusvm *gpusvm;
+	struct mmu_interval_notifier notifier;
+	struct {
+		u64 start;
+		u64 end;
+	} interval;
+	struct {
+		struct rb_node node;
+		struct list_head entry;
+		u64 __subtree_last;
+	} rb;
+	struct rb_root_cached root;
+	struct list_head range_list;
+	struct {
+		u32 removed : 1;
+	} flags;
+};
+
+/**
+ * struct drm_gpusvm_range - Structure representing a GPU SVM range
+ *
+ * @gpusvm: Pointer to the GPU SVM structure
+ * @notifier: Pointer to the GPU SVM notifier
+ * @refcount: Reference count for the range
+ * @rb: Red-black tree node for the parent GPU SVM notifier structure range tree
+ * @va: Virtual address range
+ * @notifier_seq: Notifier sequence number of the range's pages
+ * @pages: Pointer to the array of pages (if backing store is in VRAM)
+ * @dma_addr: DMA address array (if backing store is SRAM and DMA mapped)
+ * @vram_allocation: Driver-private pointer to the VRAM allocation
+ * @order: Order of the DMA mapping, i.e. PAGE_SIZE << order is the mapping size
+ * @flags.migrate_vram: Flag indicating whether the range can be migrated to VRAM
+ * @flags.unmapped: Flag indicating if the range has been unmapped
+ * @flags.partial_unmap: Flag indicating if the range has been partially unmapped
+ * @flags.has_vram_pages: Flag indicating if the range has vram pages
+ * @flags.has_dma_mapping: Flag indicating if the range has a DMA mapping
+ * @flags.kfree_mapping: Flag indicating @dma_addr is a compact allocation based
+ *                       on @order which is released via kfree()
+ *
+ * This structure represents a GPU SVM range used for tracking memory ranges
+ * mapped in a DRM device.
+ */
+struct drm_gpusvm_range {
+	struct drm_gpusvm *gpusvm;
+	struct drm_gpusvm_notifier *notifier;
+	struct kref refcount;
+	struct {
+		struct rb_node node;
+		struct list_head entry;
+		u64 __subtree_last;
+	} rb;
+	struct {
+		u64 start;
+		u64 end;
+	} va;
+	unsigned long notifier_seq;
+	union {
+		struct page **pages;
+		dma_addr_t *dma_addr;
+	};
+	void *vram_allocation;
+	u16 order;
+	struct {
+		/* All flags below must be set upon creation */
+		u16 migrate_vram : 1;
+		/* All flags below must be set / cleared under notifier lock */
+		u16 unmapped : 1;
+		u16 partial_unmap : 1;
+		u16 has_vram_pages : 1;
+		u16 has_dma_mapping : 1;
+		u16 kfree_mapping : 1;
+	} flags;
+};
+
+/**
+ * struct drm_gpusvm - GPU SVM structure
+ *
+ * @name: Name of the GPU SVM
+ * @drm: Pointer to the DRM device structure
+ * @mm: Pointer to the mm_struct for the address space
+ * @device_private_page_owner: Device private pages owner
+ * @mm_start: Start address of GPU SVM
+ * @mm_range: Range of the GPU SVM
+ * @notifier_size: Size of individual notifiers
+ * @ops: Pointer to the operations structure for GPU SVM
+ * @chunk_sizes: Pointer to the array of chunk sizes used in range allocation.
+ *               Entries should be powers of 2 in descending order.
+ * @num_chunks: Number of chunks
+ * @notifier_lock: Read-write semaphore for protecting notifier operations
+ * @zdd_wq: Workqueue for deferred work on zdd destruction
+ * @root: Cached root node of the Red-Black tree containing GPU SVM notifiers
+ * @notifier_list: List head containing notifiers in the same order they
+ *                 appear in the interval tree. This is useful to keep
+ *                 iterating over notifiers while doing modifications to the
+ *                 RB tree.
+ *
+ * This structure represents a GPU SVM (Shared Virtual Memory) used for tracking
+ * memory ranges mapped in a DRM (Direct Rendering Manager) device.
+ *
+ * No reference counting is provided, as this is expected to be embedded in the
+ * driver VM structure along with the struct drm_gpuvm, which handles reference
+ * counting.
+ */
+struct drm_gpusvm {
+	const char *name;
+	struct drm_device *drm;
+	struct mm_struct *mm;
+	void *device_private_page_owner;
+	u64 mm_start;
+	u64 mm_range;
+	u64 notifier_size;
+	const struct drm_gpusvm_ops *ops;
+	const u64 *chunk_sizes;
+	int num_chunks;
+	struct rw_semaphore notifier_lock;
+	struct workqueue_struct *zdd_wq;
+	struct rb_root_cached root;
+	struct list_head notifier_list;
+};
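+
+/*
+ * Illustrative sketch, not part of the patch proper: embedding the structure
+ * in a driver VM object and initializing it. The chunk sizes, the 1 << 48
+ * address range and the SZ_256M notifier size are arbitrary example values,
+ * not recommendations; vm, driver_gpusvm_ops and pgmap_owner are
+ * hypothetical.
+ *
+ *	static const u64 driver_chunk_sizes[] = { SZ_2M, SZ_64K, SZ_4K };
+ *
+ *	err = drm_gpusvm_init(&vm->gpusvm, "driver-svm", drm, current->mm,
+ *			      pgmap_owner, 0, 1ull << 48, SZ_256M,
+ *			      &driver_gpusvm_ops, driver_chunk_sizes,
+ *			      ARRAY_SIZE(driver_chunk_sizes));
+ */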
+
+/**
+ * struct drm_gpusvm_ctx - DRM GPU SVM context
+ *
+ * @mmap_locked: mmap lock is locked
+ * @trylock_mmap: trylock the mmap lock, used to avoid locking inversions
+ *                (e.g. dma-resv -> mmap lock)
+ * @in_notifier: entering from a MMU notifier
+ * @read_only: operating on read-only memory
+ * @vram_possible: possible to use VRAM
+ * @prefault: prefault pages
+ *
+ * Context that DRM GPU SVM is operating in (i.e. user arguments).
+ */
+struct drm_gpusvm_ctx {
+	u32 mmap_locked :1;
+	u32 trylock_mmap :1;
+	u32 in_notifier :1;
+	u32 read_only :1;
+	u32 vram_possible :1;
+	u32 prefault :1;
+};
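+
+/*
+ * Illustrative example contexts (hypothetical): a GPU page-fault handler that
+ * does not hold the mmap lock and may migrate to VRAM, versus the
+ * invalidation path which already holds the notifier lock in write mode.
+ *
+ *	const struct drm_gpusvm_ctx fault_ctx = { .vram_possible = true, };
+ *	const struct drm_gpusvm_ctx notifier_ctx = { .in_notifier = true, };
+ */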
+
+int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
+		    const char *name, struct drm_device *drm,
+		    struct mm_struct *mm, void *device_private_page_owner,
+		    u64 mm_start, u64 mm_range, u64 notifier_size,
+		    const struct drm_gpusvm_ops *ops,
+		    const u64 *chunk_sizes, int num_chunks);
+void drm_gpusvm_fini(struct drm_gpusvm *gpusvm);
+void drm_gpusvm_free(struct drm_gpusvm *gpusvm);
+
+struct drm_gpusvm_range *
+drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm, u64 fault_addr,
+				u64 gpuva_start, u64 gpuva_end,
+				const struct drm_gpusvm_ctx *ctx);
+void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
+			     struct drm_gpusvm_range *range);
+
+struct drm_gpusvm_range *
+drm_gpusvm_range_get(struct drm_gpusvm_range *range);
+void drm_gpusvm_range_put(struct drm_gpusvm_range *range);
+
+bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
+				  struct drm_gpusvm_range *range);
+
+int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
+			       struct drm_gpusvm_range *range,
+			       const struct drm_gpusvm_ctx *ctx);
+void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
+				  struct drm_gpusvm_range *range,
+				  const struct drm_gpusvm_ctx *ctx);
+
+int drm_gpusvm_migrate_to_vram(struct drm_gpusvm *gpusvm,
+			       struct drm_gpusvm_range *range,
+			       void *vram_allocation,
+			       const struct drm_gpusvm_ctx *ctx);
+int drm_gpusvm_migrate_to_sram(struct drm_gpusvm *gpusvm,
+			       struct drm_gpusvm_range *range,
+			       const struct drm_gpusvm_ctx *ctx);
+
+const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void);
+
+bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, u64 start, u64 end);
+
+struct drm_gpusvm_range *
+drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, u64 start, u64 end);
+
+/**
+ * drm_gpusvm_notifier_lock - Lock GPU SVM notifier
+ * @gpusvm__: Pointer to the GPU SVM structure.
+ *
+ * Abstracts client usage of the GPU SVM notifier lock; takes the lock.
+ */
+#define drm_gpusvm_notifier_lock(gpusvm__)	\
+	down_read(&(gpusvm__)->notifier_lock)
+
+/**
+ * drm_gpusvm_notifier_unlock - Unlock GPU SVM notifier
+ * @gpusvm__: Pointer to the GPU SVM structure.
+ *
+ * Abstracts client usage of the GPU SVM notifier lock; drops the lock.
+ */
+#define drm_gpusvm_notifier_unlock(gpusvm__)	\
+	up_read(&(gpusvm__)->notifier_lock)
+
+/**
+ * __drm_gpusvm_range_next - Get the next GPU SVM range in the list
+ * @range: a pointer to the current GPU SVM range
+ *
+ * Return: A pointer to the next drm_gpusvm_range if available, or NULL if the
+ *         current range is the last one or if the input range is NULL.
+ */
+static inline struct drm_gpusvm_range *
+__drm_gpusvm_range_next(struct drm_gpusvm_range *range)
+{
+	if (range && !list_is_last(&range->rb.entry,
+				   &range->notifier->range_list))
+		return list_next_entry(range, rb.entry);
+
+	return NULL;
+}
+
+/**
+ * drm_gpusvm_for_each_range - Iterate over GPU SVM ranges in a notifier
+ * @range__: Iterator variable for the ranges. If set, it indicates the start of
+ *	     the iteration; if NULL, drm_gpusvm_range_find() is used to get the
+ *	     range.
+ * @notifier__: Pointer to the GPU SVM notifier
+ * @start__: Start address of the range
+ * @end__: End address of the range
+ *
+ * This macro is used to iterate over GPU SVM ranges in a notifier. It is safe
+ * to use while holding the driver SVM lock or the notifier lock.
+ */
+#define drm_gpusvm_for_each_range(range__, notifier__, start__, end__)	\
+	for ((range__) = (range__) ?:					\
+	     drm_gpusvm_range_find((notifier__), (start__), (end__));	\
+	     (range__) && (range__->va.start < (end__));		\
+	     (range__) = __drm_gpusvm_range_next(range__))
+
+/**
+ * drm_gpusvm_range_set_unmapped - Mark a GPU SVM range as unmapped
+ * @range: Pointer to the GPU SVM range structure.
+ * @mmu_range: Pointer to the MMU notifier range structure.
+ *
+ * This function marks a GPU SVM range as unmapped and sets the partial_unmap flag
+ * if the range partially falls within the provided MMU notifier range.
+ */
+static inline void
+drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range *range,
+			      const struct mmu_notifier_range *mmu_range)
+{
+	lockdep_assert_held_write(&range->gpusvm->notifier_lock);
+
+	range->flags.unmapped = true;
+	if (range->va.start < mmu_range->start ||
+	    range->va.end > mmu_range->end)
+		range->flags.partial_unmap = true;
+}
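+
+/*
+ * Illustrative sketch, not part of the patch proper: an @invalidate
+ * implementation built on drm_gpusvm_for_each_range() and
+ * drm_gpusvm_range_set_unmapped(). It zaps the GPU TLB/page tables for the
+ * invalidated interval, unmaps the DMA mappings of each overlapping range
+ * with in_notifier set (the notifier lock is already held in write mode), and
+ * queues fully unmapped ranges for driver-side garbage collection.
+ * driver_invalidate_device_tlb() and driver_garbage_collector_add() are
+ * hypothetical.
+ *
+ *	static void driver_invalidate(struct drm_gpusvm *gpusvm,
+ *				      struct drm_gpusvm_notifier *notifier,
+ *				      const struct mmu_notifier_range *mmu_range)
+ *	{
+ *		struct drm_gpusvm_ctx ctx = { .in_notifier = true, };
+ *		struct drm_gpusvm_range *range = NULL;
+ *
+ *		driver_invalidate_device_tlb(gpusvm, mmu_range->start,
+ *					     mmu_range->end);
+ *
+ *		drm_gpusvm_for_each_range(range, notifier, mmu_range->start,
+ *					  mmu_range->end) {
+ *			drm_gpusvm_range_unmap_pages(gpusvm, range, &ctx);
+ *
+ *			if (mmu_range->event != MMU_NOTIFY_UNMAP)
+ *				continue;
+ *
+ *			drm_gpusvm_range_set_unmapped(range, mmu_range);
+ *			driver_garbage_collector_add(gpusvm, range);
+ *		}
+ *	}
+ */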
+
+#endif /* __DRM_GPUSVM_H__ */