
[RFC,06/13] drm/i915/svm: Page table mirroring support

Message ID 20191122205734.15925-7-niranjana.vishwanathapura@intel.com (mailing list archive)
State New, archived
Series drm/i915/svm: Add SVM support

Commit Message

Niranjana Vishwanathapura Nov. 22, 2019, 8:57 p.m. UTC
Use HMM page table mirroring support to build device page table.
Implement the bind ioctl and bind the process address range in the
specified context's ppgtt.
Handle invalidation notifications by unbinding the address range.

Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Jon Bloomfield <jon.bloomfield@intel.com>
Cc: Daniel Vetter <daniel.vetter@intel.com>
Cc: Sudeep Dutt <sudeep.dutt@intel.com>
Signed-off-by: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>
---
 drivers/gpu/drm/i915/Kconfig        |   3 +
 drivers/gpu/drm/i915/Makefile       |   3 +-
 drivers/gpu/drm/i915/i915_drv.c     |   3 +
 drivers/gpu/drm/i915/i915_gem_gtt.c |   5 +
 drivers/gpu/drm/i915/i915_gem_gtt.h |   4 +
 drivers/gpu/drm/i915/i915_svm.c     | 296 ++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/i915_svm.h     |  50 +++++
 7 files changed, 363 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/i915/i915_svm.c
 create mode 100644 drivers/gpu/drm/i915/i915_svm.h

Comments

Jason Gunthorpe Nov. 22, 2019, 11:33 p.m. UTC | #1
On Fri, Nov 22, 2019 at 12:57:27PM -0800, Niranjana Vishwanathapura wrote:

> +static inline bool
> +i915_range_done(struct hmm_range *range)
> +{
> +	bool ret = hmm_range_valid(range);
> +
> +	hmm_range_unregister(range);
> +	return ret;
> +}

This needs to be updated to follow the new API, but this pattern is
generally wrong, failure if hmm_range_valid should retry the
range_fault and so forth. See the hmm.rst.

> +static int
> +i915_range_fault(struct i915_svm *svm, struct hmm_range *range)
> +{
> +	long ret;
> +
> +	range->default_flags = 0;
> +	range->pfn_flags_mask = -1UL;
> +
> +	ret = hmm_range_register(range, &svm->mirror);
> +	if (ret) {
> +		up_read(&svm->mm->mmap_sem);
> +		return (int)ret;
> +	}


Using a temporary range is the pattern from nouveau, is it really
necessary in this driver?

> +static u32 i915_svm_build_sg(struct i915_address_space *vm,
> +			     struct hmm_range *range,
> +			     struct sg_table *st)
> +{
> +	struct scatterlist *sg;
> +	u32 sg_page_sizes = 0;
> +	u64 i, npages;
> +
> +	sg = NULL;
> +	st->nents = 0;
> +	npages = (range->end - range->start) / PAGE_SIZE;
> +
> +	/*
> +	 * No needd to dma map the host pages and later unmap it, as
> +	 * GPU is not allowed to access it with SVM. Hence, no need
> +	 * of any intermediate data strucutre to hold the mappings.
> +	 */
> +	for (i = 0; i < npages; i++) {
> +		u64 addr = range->pfns[i] & ~((1UL << range->pfn_shift) - 1);
> +
> +		if (sg && (addr == (sg_dma_address(sg) + sg->length))) {
> +			sg->length += PAGE_SIZE;
> +			sg_dma_len(sg) += PAGE_SIZE;
> +			continue;
> +		}
> +
> +		if (sg)
> +			sg_page_sizes |= sg->length;
> +
> +		sg =  sg ? __sg_next(sg) : st->sgl;
> +		sg_dma_address(sg) = addr;
> +		sg_dma_len(sg) = PAGE_SIZE;
> +		sg->length = PAGE_SIZE;
> +		st->nents++;

It is odd to build the range into a sgl.

IMHO it is not a good idea to use the sg_dma_address like this, that
should only be filled in by a dma map. Where does it end up being
used?

> +	range.pfn_shift = PAGE_SHIFT;
> +	range.start = args->start;
> +	range.flags = i915_range_flags;
> +	range.values = i915_range_values;
> +	range.end = range.start + args->length;
> +	for (i = 0; i < npages; ++i) {
> +		range.pfns[i] = range.flags[HMM_PFN_VALID];
> +		if (!(args->flags & I915_BIND_READONLY))
> +			range.pfns[i] |= range.flags[HMM_PFN_WRITE];
> +	}
> +
> +	down_read(&svm->mm->mmap_sem);
> +	vma = find_vma(svm->mm, range.start);

Why is find_vma needed?

> +	if (unlikely(!vma || vma->vm_end < range.end)) {
> +		ret = -EFAULT;
> +		goto vma_done;
> +	}
> +again:
> +	ret = i915_range_fault(svm, &range);
> +	if (unlikely(ret))
> +		goto vma_done;
> +
> +	sg_page_sizes = i915_svm_build_sg(vm, &range, &st);
>

Keep in mind what can be done with the range values is limited until
the lock is obtained. Reading the pfns and flags is OK though.

> +int i915_svm_bind_mm(struct i915_address_space *vm)
> +{
> +	struct i915_svm *svm;
> +	struct mm_struct *mm;
> +	int ret = 0;
> +
> +	mm = get_task_mm(current);

I don't think the get_task_mm(current) is needed, the mmget is already
done for current->mm ?

Jason
Niranjana Vishwanathapura Nov. 23, 2019, 4:44 a.m. UTC | #2
On Fri, Nov 22, 2019 at 11:33:12PM +0000, Jason Gunthorpe wrote:
>On Fri, Nov 22, 2019 at 12:57:27PM -0800, Niranjana Vishwanathapura wrote:
>
>> +static inline bool
>> +i915_range_done(struct hmm_range *range)
>> +{
>> +	bool ret = hmm_range_valid(range);
>> +
>> +	hmm_range_unregister(range);
>> +	return ret;
>> +}
>
>This needs to be updated to follow the new API, but this pattern is
>generally wrong, failure if hmm_range_valid should retry the
>range_fault and so forth. See the hmm.rst.
>

Thanks Jason for the feedback.
Yah, will update as per new API in the next revision.
The caller of this function is indeed retrying if the range is not valid.
But I got the point. I made changes in my local build as per hmm.rst
and it is working fine. Will include the change in next revision.

Noticed that as hmm_range_fault() returns the number of valid pages, the
hmm.rst example probably should be updated to check for that instead of a
non-zero return value.
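
i.e. something along these lines (sketch only):

	long ret;

	ret = hmm_range_fault(range, 0);
	if (ret <= 0)
		return ret ? ret : -EBUSY;

	/* success: ret is the number of valid pages, not zero */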

>> +static int
>> +i915_range_fault(struct i915_svm *svm, struct hmm_range *range)
>> +{
>> +	long ret;
>> +
>> +	range->default_flags = 0;
>> +	range->pfn_flags_mask = -1UL;
>> +
>> +	ret = hmm_range_register(range, &svm->mirror);
>> +	if (ret) {
>> +		up_read(&svm->mm->mmap_sem);
>> +		return (int)ret;
>> +	}
>
>
>Using a temporary range is the pattern from nouveau, is it really
>necessary in this driver?

Yah, not required. In my local build I tried with proper default_flags
and set pfn_flags_mask to 0 and it is working fine.

>
>> +static u32 i915_svm_build_sg(struct i915_address_space *vm,
>> +			     struct hmm_range *range,
>> +			     struct sg_table *st)
>> +{
>> +	struct scatterlist *sg;
>> +	u32 sg_page_sizes = 0;
>> +	u64 i, npages;
>> +
>> +	sg = NULL;
>> +	st->nents = 0;
>> +	npages = (range->end - range->start) / PAGE_SIZE;
>> +
>> +	/*
>> +	 * No needd to dma map the host pages and later unmap it, as
>> +	 * GPU is not allowed to access it with SVM. Hence, no need
>> +	 * of any intermediate data strucutre to hold the mappings.
>> +	 */
>> +	for (i = 0; i < npages; i++) {
>> +		u64 addr = range->pfns[i] & ~((1UL << range->pfn_shift) - 1);
>> +
>> +		if (sg && (addr == (sg_dma_address(sg) + sg->length))) {
>> +			sg->length += PAGE_SIZE;
>> +			sg_dma_len(sg) += PAGE_SIZE;
>> +			continue;
>> +		}
>> +
>> +		if (sg)
>> +			sg_page_sizes |= sg->length;
>> +
>> +		sg =  sg ? __sg_next(sg) : st->sgl;
>> +		sg_dma_address(sg) = addr;
>> +		sg_dma_len(sg) = PAGE_SIZE;
>> +		sg->length = PAGE_SIZE;
>> +		st->nents++;
>
>It is odd to build the range into a sgl.
>
>IMHO it is not a good idea to use the sg_dma_address like this, that
>should only be filled in by a dma map. Where does it end up being
>used?

The sgl is used to plug into the page table update function in i915.

For the device memory in discrete card, we don't need dma map which
is the case here. Here we don't expect any access to host memory
as mentioned in the comment above.
Well, for integrated gfx, if we need to use SVM (with the iommu enabled),
we need to dma map. That requires maintaining the dma address for each page
in some intermediate data structure. Currently, this is TBD.
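
If/when that is enabled, it would presumably look something like the sketch
below (illustrative only, not part of this series): build the sgl from the
struct pages behind the hmm pfn entries and do a real dma map, instead of
writing CPU PFNs into sg_dma_address() directly.

	struct scatterlist *sg;
	unsigned int i;

	for_each_sg(st->sgl, sg, st->nents, i) {
		/* host page behind the hmm pfn entry */
		struct page *page = hmm_device_entry_to_page(range,
							     range->pfns[i]);

		sg_set_page(sg, page, PAGE_SIZE, 0);
	}

	if (!dma_map_sg(vm->i915->drm.dev, st->sgl, st->nents,
			DMA_BIDIRECTIONAL))
		return -ENOMEM;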

>
>> +	range.pfn_shift = PAGE_SHIFT;
>> +	range.start = args->start;
>> +	range.flags = i915_range_flags;
>> +	range.values = i915_range_values;
>> +	range.end = range.start + args->length;
>> +	for (i = 0; i < npages; ++i) {
>> +		range.pfns[i] = range.flags[HMM_PFN_VALID];
>> +		if (!(args->flags & I915_BIND_READONLY))
>> +			range.pfns[i] |= range.flags[HMM_PFN_WRITE];
>> +	}
>> +
>> +	down_read(&svm->mm->mmap_sem);
>> +	vma = find_vma(svm->mm, range.start);
>
>Why is find_vma needed?

Ok, looks like not needed, will remove it.

>
>> +	if (unlikely(!vma || vma->vm_end < range.end)) {
>> +		ret = -EFAULT;
>> +		goto vma_done;
>> +	}
>> +again:
>> +	ret = i915_range_fault(svm, &range);
>> +	if (unlikely(ret))
>> +		goto vma_done;
>> +
>> +	sg_page_sizes = i915_svm_build_sg(vm, &range, &st);
>>
>
>Keep in mind what can be done with the range values is limited until
>the lock is obtained. Reading the pfns and flags is OK though.
>

Yah, I either have to build the sgl here now and discard it later if the range
is not valid, or do it while holding the lock, which increases the contention
period. So I settled on doing it here as it seemed the better choice to me.

>> +int i915_svm_bind_mm(struct i915_address_space *vm)
>> +{
>> +	struct i915_svm *svm;
>> +	struct mm_struct *mm;
>> +	int ret = 0;
>> +
>> +	mm = get_task_mm(current);
>
>I don't think the get_task_mm(current) is needed, the mmget is already
>done for current->mm ?

No, I don't think mmget is already done for current->mm here.

>
>Jason
Jason Gunthorpe Nov. 23, 2019, 11:53 p.m. UTC | #3
On Fri, Nov 22, 2019 at 08:44:18PM -0800, Niranjan Vishwanathapura wrote:
> On Fri, Nov 22, 2019 at 11:33:12PM +0000, Jason Gunthorpe wrote:
> > On Fri, Nov 22, 2019 at 12:57:27PM -0800, Niranjana Vishwanathapura wrote:
> > 
> > > +static inline bool
> > > +i915_range_done(struct hmm_range *range)
> > > +{
> > > +	bool ret = hmm_range_valid(range);
> > > +
> > > +	hmm_range_unregister(range);
> > > +	return ret;
> > > +}
> > 
> > This needs to be updated to follow the new API, but this pattern is
> > generally wrong, failure if hmm_range_valid should retry the
> > range_fault and so forth. See the hmm.rst.
> > 
> 
> Thanks Jason for the feedback.
> Yah, will update as per new API in the next revision.
> The caller of this function is indeed retrying if the range is not valid.
> But I got the point. I made changes in my local build as per hmm.rst
> and it is working fine. Will include the change in next revision.

Generally speaking this locking approach is a live-lockable
collision-retry scheme. 

For maintainability it is best if the retry loop is explicit and
doesn't unregister the range during the retry. I also encourage adding
an absolute time bound to the retry as it *could* live lock.
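
Roughly something like this (an illustrative sketch only, written against the
same old hmm_range_register()/hmm_range_fault() API this patch uses and the
svm->mutex driver lock from the patch):

	unsigned long timeout =
		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
	long ret;

	ret = hmm_range_register(&range, &svm->mirror);
	if (ret)
		return ret;
again:
	if (time_after(jiffies, timeout)) {
		ret = -EBUSY;
		goto out_unregister;
	}

	down_read(&svm->mm->mmap_sem);
	ret = hmm_range_fault(&range, 0);
	if (ret <= 0) {
		up_read(&svm->mm->mmap_sem);
		if (ret == 0 || ret == -EBUSY) {
			/* collided with an invalidation, wait and retry */
			hmm_range_wait_until_valid(&range,
						   HMM_RANGE_DEFAULT_TIMEOUT);
			goto again;
		}
		goto out_unregister;
	}

	mutex_lock(&svm->mutex);
	if (!hmm_range_valid(&range)) {
		/* invalidated while faulting: retry without unregistering */
		mutex_unlock(&svm->mutex);
		up_read(&svm->mm->mmap_sem);
		goto again;
	}

	/* range.pfns[] is stable here, update the device page table */
	ret = 0;
	mutex_unlock(&svm->mutex);
	up_read(&svm->mm->mmap_sem);
out_unregister:
	hmm_range_unregister(&range);
	return ret;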

> > > +static int
> > > +i915_range_fault(struct i915_svm *svm, struct hmm_range *range)
> > > +{
> > > +	long ret;
> > > +
> > > +	range->default_flags = 0;
> > > +	range->pfn_flags_mask = -1UL;
> > > +
> > > +	ret = hmm_range_register(range, &svm->mirror);
> > > +	if (ret) {
> > > +		up_read(&svm->mm->mmap_sem);
> > > +		return (int)ret;
> > > +	}
> > 
> > 
> > Using a temporary range is the pattern from nouveau, is it really
> > necessary in this driver?
> 
> Yah, not required. In my local build I tried with proper default_flags
> and set pfn_flags_mask to 0 and it is working fine.

Sorry, I meant calling hmm_range_register during fault processing.

If your driver works around user space objects that cover a VA then
the range should be created when the object is created.

> > > +	/*
> > > +	 * No needd to dma map the host pages and later unmap it, as
> > > +	 * GPU is not allowed to access it with SVM. Hence, no need
> > > +	 * of any intermediate data strucutre to hold the mappings.
> > > +	 */
> > > +	for (i = 0; i < npages; i++) {
> > > +		u64 addr = range->pfns[i] & ~((1UL << range->pfn_shift) - 1);
> > > +
> > > +		if (sg && (addr == (sg_dma_address(sg) + sg->length))) {
> > > +			sg->length += PAGE_SIZE;
> > > +			sg_dma_len(sg) += PAGE_SIZE;
> > > +			continue;
> > > +		}
> > > +
> > > +		if (sg)
> > > +			sg_page_sizes |= sg->length;
> > > +
> > > +		sg =  sg ? __sg_next(sg) : st->sgl;
> > > +		sg_dma_address(sg) = addr;
> > > +		sg_dma_len(sg) = PAGE_SIZE;
> > > +		sg->length = PAGE_SIZE;
> > > +		st->nents++;
> > 
> > It is odd to build the range into a sgl.
> > 
> > IMHO it is not a good idea to use the sg_dma_address like this, that
> > should only be filled in by a dma map. Where does it end up being
> > used?
> 
> The sgl is used to plug into the page table update function in i915.
> 
> For the device memory in discrete card, we don't need dma map which
> is the case here.

How did we get to device memory on a card? Isn't range->pfns a CPU PFN
at this point?

I'm confused.

> > > +int i915_svm_bind_mm(struct i915_address_space *vm)
> > > +{
> > > +	struct i915_svm *svm;
> > > +	struct mm_struct *mm;
> > > +	int ret = 0;
> > > +
> > > +	mm = get_task_mm(current);
> > 
> > I don't think the get_task_mm(current) is needed, the mmget is already
> > done for current->mm ?
> 
> No, I don't think mmget is already done for current->mm here.

I'm not certain, but I thought it is already done because it is
'current' and current cannot begin to destroy the mm while a syscall
is currently running.

Jason
Niranjana Vishwanathapura Nov. 24, 2019, 9:12 p.m. UTC | #4
On Sat, Nov 23, 2019 at 11:53:52PM +0000, Jason Gunthorpe wrote:
>On Fri, Nov 22, 2019 at 08:44:18PM -0800, Niranjan Vishwanathapura wrote:
>> On Fri, Nov 22, 2019 at 11:33:12PM +0000, Jason Gunthorpe wrote:
>> > On Fri, Nov 22, 2019 at 12:57:27PM -0800, Niranjana Vishwanathapura wrote:
>> >
>> > > +static inline bool
>> > > +i915_range_done(struct hmm_range *range)
>> > > +{
>> > > +	bool ret = hmm_range_valid(range);
>> > > +
>> > > +	hmm_range_unregister(range);
>> > > +	return ret;
>> > > +}
>> >
>> > This needs to be updated to follow the new API, but this pattern is
>> > generally wrong, failure if hmm_range_valid should retry the
>> > range_fault and so forth. See the hmm.rst.
>> >
>>
>> Thanks Jason for the feedback.
>> Yah, will update as per new API in the next revision.
>> The caller of this function is indeed retrying if the range is not valid.
>> But I got the point. I made changes in my local build as per hmm.rst
>> and it is working fine. Will include the change in next revision.
>
>Generally speaking this locking approach is a live-lockable
>collision-retry scheme.
>
>For maintainability it is best if the retry loop is explicit and
>doesn't unregister the range during the retry. I also encourage adding
>an absolute time bound to the retry as it *could* live lock.
>

Ok, thanks for the suggestion, will do.

>> > > +static int
>> > > +i915_range_fault(struct i915_svm *svm, struct hmm_range *range)
>> > > +{
>> > > +	long ret;
>> > > +
>> > > +	range->default_flags = 0;
>> > > +	range->pfn_flags_mask = -1UL;
>> > > +
>> > > +	ret = hmm_range_register(range, &svm->mirror);
>> > > +	if (ret) {
>> > > +		up_read(&svm->mm->mmap_sem);
>> > > +		return (int)ret;
>> > > +	}
>> >
>> >
>> > Using a temporary range is the pattern from nouveau, is it really
>> > necessary in this driver?
>>
>> Yah, not required. In my local build I tried with proper default_flags
>> and set pfn_flags_mask to 0 and it is working fine.
>
>Sorry, I ment calling hmm_range_register during fault processing.
>
>If your driver works around user space objects that cover a VA then
>the range should be created when the object is created.
>

Oh ok. No, there is no user space object here.
Binding the user space object to device page table is handled in
patch 03 of this series (no HMM there).
This is for binding a system allocated (malloc) memory. User calls
the bind ioctl with the VA range.

>> > > +	/*
>> > > +	 * No needd to dma map the host pages and later unmap it, as
>> > > +	 * GPU is not allowed to access it with SVM. Hence, no need
>> > > +	 * of any intermediate data strucutre to hold the mappings.
>> > > +	 */
>> > > +	for (i = 0; i < npages; i++) {
>> > > +		u64 addr = range->pfns[i] & ~((1UL << range->pfn_shift) - 1);
>> > > +
>> > > +		if (sg && (addr == (sg_dma_address(sg) + sg->length))) {
>> > > +			sg->length += PAGE_SIZE;
>> > > +			sg_dma_len(sg) += PAGE_SIZE;
>> > > +			continue;
>> > > +		}
>> > > +
>> > > +		if (sg)
>> > > +			sg_page_sizes |= sg->length;
>> > > +
>> > > +		sg =  sg ? __sg_next(sg) : st->sgl;
>> > > +		sg_dma_address(sg) = addr;
>> > > +		sg_dma_len(sg) = PAGE_SIZE;
>> > > +		sg->length = PAGE_SIZE;
>> > > +		st->nents++;
>> >
>> > It is odd to build the range into a sgl.
>> >
>> > IMHO it is not a good idea to use the sg_dma_address like this, that
>> > should only be filled in by a dma map. Where does it end up being
>> > used?
>>
>> The sgl is used to plug into the page table update function in i915.
>>
>> For the device memory in discrete card, we don't need dma map which
>> is the case here.
>
>How did we get to device memory on a card? Isn't range->pfns a CPU PFN
>at this point?
>
>I'm confused.
>

Device memory plugin is done through devm_memremap_pages() in patch 07 of
this series. In that patch, we convert the CPU PFN to device PFN before
building the sgl (this is similar to the nouveau driver).

>> > > +int i915_svm_bind_mm(struct i915_address_space *vm)
>> > > +{
>> > > +	struct i915_svm *svm;
>> > > +	struct mm_struct *mm;
>> > > +	int ret = 0;
>> > > +
>> > > +	mm = get_task_mm(current);
>> >
>> > I don't think the get_task_mm(current) is needed, the mmget is already
>> > done for current->mm ?
>>
>> No, I don't think mmget is already done for current->mm here.
>
>I'm not certain, but I thought it is already done because it is
>'current' and current cannot begin to destroy the mm while a syscall
>is currently running.
>

Ok, I don't know, at least the driver is not calling it.
But it makes sense to me. I will remove get_task_mm and directly use current->mm.
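
Roughly (sketch only; the mirror setup itself stays as in the patch):

	/* current->mm stays valid for the duration of the ioctl */
	struct mm_struct *mm = current->mm;

	down_write(&mm->mmap_sem);
	mutex_lock(&vm->svm_mutex);
	/* allocate the i915_svm and hmm_mirror_register() as before */
	mutex_unlock(&vm->svm_mutex);
	up_write(&mm->mmap_sem);
	/* no mmput() needed since no extra reference was taken */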

Niranjana

>Jason
Jason Gunthorpe Nov. 25, 2019, 1:24 p.m. UTC | #5
On Sun, Nov 24, 2019 at 01:12:47PM -0800, Niranjan Vishwanathapura wrote:

> > > > Using a temporary range is the pattern from nouveau, is it really
> > > > necessary in this driver?
> > > 
> > > Yah, not required. In my local build I tried with proper default_flags
> > > and set pfn_flags_mask to 0 and it is working fine.
> > 
> > Sorry, I ment calling hmm_range_register during fault processing.
> > 
> > If your driver works around user space objects that cover a VA then
> > the range should be created when the object is created.
> > 
> 
> Oh ok. No, there is no user space object here.
> Binding the user space object to device page table is handled in
> patch 03 of this series (no HMM there).
> This is for binding a system allocated (malloc) memory. User calls
> the bind ioctl with the VA range.
> 
> > > > > +	/*
> > > > > +	 * No needd to dma map the host pages and later unmap it, as
> > > > > +	 * GPU is not allowed to access it with SVM. Hence, no need
> > > > > +	 * of any intermediate data strucutre to hold the mappings.
> > > > > +	 */
> > > > > +	for (i = 0; i < npages; i++) {
> > > > > +		u64 addr = range->pfns[i] & ~((1UL << range->pfn_shift) - 1);
> > > > > +
> > > > > +		if (sg && (addr == (sg_dma_address(sg) + sg->length))) {
> > > > > +			sg->length += PAGE_SIZE;
> > > > > +			sg_dma_len(sg) += PAGE_SIZE;
> > > > > +			continue;
> > > > > +		}
> > > > > +
> > > > > +		if (sg)
> > > > > +			sg_page_sizes |= sg->length;
> > > > > +
> > > > > +		sg =  sg ? __sg_next(sg) : st->sgl;
> > > > > +		sg_dma_address(sg) = addr;
> > > > > +		sg_dma_len(sg) = PAGE_SIZE;
> > > > > +		sg->length = PAGE_SIZE;
> > > > > +		st->nents++;
> > > >
> > > > It is odd to build the range into a sgl.
> > > >
> > > > IMHO it is not a good idea to use the sg_dma_address like this, that
> > > > should only be filled in by a dma map. Where does it end up being
> > > > used?
> > > 
> > > The sgl is used to plug into the page table update function in i915.
> > > 
> > > For the device memory in discrete card, we don't need dma map which
> > > is the case here.
> > 
> > How did we get to device memory on a card? Isn't range->pfns a CPU PFN
> > at this point?
> > 
> > I'm confused.
> 
> Device memory plugin is done through devm_memremap_pages() in patch 07 of
> this series. In that patch, we convert the CPU PFN to device PFN before
> building the sgl (this is similar to the nouveau driver).

But earlier just called hmm_range_fault(), it can return all kinds of
pages. If these are only allowed to be device pages here then that
must be checked (under lock)

And putting the cpu PFN of a ZONE_DEVICE device page into
sg_dma_address still looks very wrong to me

Jason
Niranjana Vishwanathapura Nov. 25, 2019, 4:32 p.m. UTC | #6
On Mon, Nov 25, 2019 at 01:24:18PM +0000, Jason Gunthorpe wrote:
>On Sun, Nov 24, 2019 at 01:12:47PM -0800, Niranjan Vishwanathapura wrote:
>
>> > > > Using a temporary range is the pattern from nouveau, is it really
>> > > > necessary in this driver?
>> > >
>> > > Yah, not required. In my local build I tried with proper default_flags
>> > > and set pfn_flags_mask to 0 and it is working fine.
>> >
>> > Sorry, I ment calling hmm_range_register during fault processing.
>> >
>> > If your driver works around user space objects that cover a VA then
>> > the range should be created when the object is created.
>> >
>>
>> Oh ok. No, there is no user space object here.
>> Binding the user space object to device page table is handled in
>> patch 03 of this series (no HMM there).
>> This is for binding a system allocated (malloc) memory. User calls
>> the bind ioctl with the VA range.
>>
>> > > > > +	/*
>> > > > > +	 * No needd to dma map the host pages and later unmap it, as
>> > > > > +	 * GPU is not allowed to access it with SVM. Hence, no need
>> > > > > +	 * of any intermediate data strucutre to hold the mappings.
>> > > > > +	 */
>> > > > > +	for (i = 0; i < npages; i++) {
>> > > > > +		u64 addr = range->pfns[i] & ~((1UL << range->pfn_shift) - 1);
>> > > > > +
>> > > > > +		if (sg && (addr == (sg_dma_address(sg) + sg->length))) {
>> > > > > +			sg->length += PAGE_SIZE;
>> > > > > +			sg_dma_len(sg) += PAGE_SIZE;
>> > > > > +			continue;
>> > > > > +		}
>> > > > > +
>> > > > > +		if (sg)
>> > > > > +			sg_page_sizes |= sg->length;
>> > > > > +
>> > > > > +		sg =  sg ? __sg_next(sg) : st->sgl;
>> > > > > +		sg_dma_address(sg) = addr;
>> > > > > +		sg_dma_len(sg) = PAGE_SIZE;
>> > > > > +		sg->length = PAGE_SIZE;
>> > > > > +		st->nents++;
>> > > >
>> > > > It is odd to build the range into a sgl.
>> > > >
>> > > > IMHO it is not a good idea to use the sg_dma_address like this, that
>> > > > should only be filled in by a dma map. Where does it end up being
>> > > > used?
>> > >
>> > > The sgl is used to plug into the page table update function in i915.
>> > >
>> > > For the device memory in discrete card, we don't need dma map which
>> > > is the case here.
>> >
>> > How did we get to device memory on a card? Isn't range->pfns a CPU PFN
>> > at this point?
>> >
>> > I'm confused.
>>
>> Device memory plugin is done through devm_memremap_pages() in patch 07 of
>> this series. In that patch, we convert the CPU PFN to device PFN before
>> building the sgl (this is similar to the nouveau driver).
>
>But earlier just called hmm_range_fault(), it can return all kinds of
>pages. If these are only allowed to be device pages here then that
>must be checked (under lock)
>

Yah, let me add the check.
(I kept it unchecked for debug purposes and forgot to add the check before
sending the patches.)
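
Something along these lines (sketch only), under svm->mutex once the range is
known to still be valid:

	for (i = 0; i < npages; i++) {
		struct page *page = hmm_device_entry_to_page(&range,
							     range.pfns[i]);

		if (!page || !is_device_private_page(page)) {
			ret = -EFAULT;
			break;
		}
	}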

>And putting the cpu PFN of a ZONE_DEVICE device page into
>sg_dma_address still looks very wrong to me
>

The below call in patch 7 does convert any cpu PFN to device address.
So, it won't be CPU PFN.
i915_dmem_convert_pfn(vm->i915, &range);

Also, only reason to use sgl list is because i915 driver page table update
functions takes an sgl, otherwise we can directly deal with range.pfns array.

Niranjana

>Jason
Jerome Glisse Nov. 25, 2019, 4:33 p.m. UTC | #7
On Fri, Nov 22, 2019 at 11:33:12PM +0000, Jason Gunthorpe wrote:
> On Fri, Nov 22, 2019 at 12:57:27PM -0800, Niranjana Vishwanathapura wrote:

[...]

> > +static int
> > +i915_range_fault(struct i915_svm *svm, struct hmm_range *range)
> > +{
> > +	long ret;
> > +
> > +	range->default_flags = 0;
> > +	range->pfn_flags_mask = -1UL;
> > +
> > +	ret = hmm_range_register(range, &svm->mirror);
> > +	if (ret) {
> > +		up_read(&svm->mm->mmap_sem);
> > +		return (int)ret;
> > +	}
> 
> 
> Using a temporary range is the pattern from nouveau, is it really
> necessary in this driver?

Just to comment on this, for GPU the usage model is not application
register range of virtual address it wants to use. It is GPU can
access _any_ CPU valid address just like the CPU would (modulo mmap
of device file).

This is because the API you want in userspace is application passing
random pointer to the GPU and GPU being able to chase down any kind
of random pointer chain (assuming all valid ie pointing to valid
virtual address for the process).

This is unlike the RDMA case.


That being said, for best performance we still expect well behaving
application to provide hint to kernel so that we know if a range of
virtual address is likely to be use by the GPU or not. But this is
not, and should not be a requirement.


I posted patchset and given talks about this, but long term i believe
we want a common API to manage hint provided by userspace (see my
talk at LPC this year about new syscall to bind memory to device).
With such thing in place we could hang mmu notifier range to it. But
the driver will still need to be able to handle the case where there
is no hint provided by userspace and thus no before knowledge of what
VA might be accessed.

Cheers,
Jérôme
Jerome Glisse Nov. 25, 2019, 4:34 p.m. UTC | #8
On Mon, Nov 25, 2019 at 01:24:18PM +0000, Jason Gunthorpe wrote:
> On Sun, Nov 24, 2019 at 01:12:47PM -0800, Niranjan Vishwanathapura wrote:
> 
> > > > > Using a temporary range is the pattern from nouveau, is it really
> > > > > necessary in this driver?
> > > > 
> > > > Yah, not required. In my local build I tried with proper default_flags
> > > > and set pfn_flags_mask to 0 and it is working fine.
> > > 
> > > Sorry, I ment calling hmm_range_register during fault processing.
> > > 
> > > If your driver works around user space objects that cover a VA then
> > > the range should be created when the object is created.
> > > 
> > 
> > Oh ok. No, there is no user space object here.
> > Binding the user space object to device page table is handled in
> > patch 03 of this series (no HMM there).
> > This is for binding a system allocated (malloc) memory. User calls
> > the bind ioctl with the VA range.
> > 
> > > > > > +	/*
> > > > > > +	 * No needd to dma map the host pages and later unmap it, as
> > > > > > +	 * GPU is not allowed to access it with SVM. Hence, no need
> > > > > > +	 * of any intermediate data strucutre to hold the mappings.
> > > > > > +	 */
> > > > > > +	for (i = 0; i < npages; i++) {
> > > > > > +		u64 addr = range->pfns[i] & ~((1UL << range->pfn_shift) - 1);
> > > > > > +
> > > > > > +		if (sg && (addr == (sg_dma_address(sg) + sg->length))) {
> > > > > > +			sg->length += PAGE_SIZE;
> > > > > > +			sg_dma_len(sg) += PAGE_SIZE;
> > > > > > +			continue;
> > > > > > +		}
> > > > > > +
> > > > > > +		if (sg)
> > > > > > +			sg_page_sizes |= sg->length;
> > > > > > +
> > > > > > +		sg =  sg ? __sg_next(sg) : st->sgl;
> > > > > > +		sg_dma_address(sg) = addr;
> > > > > > +		sg_dma_len(sg) = PAGE_SIZE;
> > > > > > +		sg->length = PAGE_SIZE;
> > > > > > +		st->nents++;
> > > > >
> > > > > It is odd to build the range into a sgl.
> > > > >
> > > > > IMHO it is not a good idea to use the sg_dma_address like this, that
> > > > > should only be filled in by a dma map. Where does it end up being
> > > > > used?
> > > > 
> > > > The sgl is used to plug into the page table update function in i915.
> > > > 
> > > > For the device memory in discrete card, we don't need dma map which
> > > > is the case here.
> > > 
> > > How did we get to device memory on a card? Isn't range->pfns a CPU PFN
> > > at this point?
> > > 
> > > I'm confused.
> > 
> > Device memory plugin is done through devm_memremap_pages() in patch 07 of
> > this series. In that patch, we convert the CPU PFN to device PFN before
> > building the sgl (this is similar to the nouveau driver).
> 
> But earlier just called hmm_range_fault(), it can return all kinds of
> pages. If these are only allowed to be device pages here then that
> must be checked (under lock)
> 
> And putting the cpu PFN of a ZONE_DEVICE device page into
> sg_dma_address still looks very wrong to me

Yeah, nouveau has a different code path, but that is because the nouveau
driver architecture allows it; I do not see any easy way to hammer this
into the current i915 architecture. I will ponder on this a bit more.

Cheers,
Jérôme
Niranjana Vishwanathapura Nov. 25, 2019, 9:29 p.m. UTC | #9
On Mon, Nov 25, 2019 at 11:33:27AM -0500, Jerome Glisse wrote:
>On Fri, Nov 22, 2019 at 11:33:12PM +0000, Jason Gunthorpe wrote:
>> On Fri, Nov 22, 2019 at 12:57:27PM -0800, Niranjana Vishwanathapura wrote:
>
>[...]
>
>> > +static int
>> > +i915_range_fault(struct i915_svm *svm, struct hmm_range *range)
>> > +{
>> > +	long ret;
>> > +
>> > +	range->default_flags = 0;
>> > +	range->pfn_flags_mask = -1UL;
>> > +
>> > +	ret = hmm_range_register(range, &svm->mirror);
>> > +	if (ret) {
>> > +		up_read(&svm->mm->mmap_sem);
>> > +		return (int)ret;
>> > +	}
>>
>>
>> Using a temporary range is the pattern from nouveau, is it really
>> necessary in this driver?
>
>Just to comment on this, for GPU the usage model is not application
>register range of virtual address it wants to use. It is GPU can
>access _any_ CPU valid address just like the CPU would (modulo mmap
>of device file).
>
>This is because the API you want in userspace is application passing
>random pointer to the GPU and GPU being able to chase down any kind
>of random pointer chain (assuming all valid ie pointing to valid
>virtual address for the process).
>
>This is unlike the RDMA case.
>
>
>That being said, for best performance we still expect well behaving
>application to provide hint to kernel so that we know if a range of
>virtual address is likely to be use by the GPU or not. But this is
>not, and should not be a requirement.
>
>
>I posted patchset and given talks about this, but long term i believe
>we want a common API to manage hint provided by userspace (see my
>talk at LPC this year about new syscall to bind memory to device).
>With such thing in place we could hang mmu notifier range to it. But
>the driver will still need to be able to handle the case where there
>is no hint provided by userspace and thus no before knowledge of what
>VA might be accessed.
>

Thanks Jerome for the explanation. Will check out your LPC talk.
Yes, I agree. When GPU faulting support is available, the driver will handle
the fault, migrate the page if needed, and bind the page using HMM.
This patch series adds support for prefetch and bind hints (via explicit ioctls).
Also, patch 12 of the series provides the ability to enable/disable SVM on a
per-VM basis for the user, and by default SVM is disabled.

Niranjana

>Cheers,
>Jérôme
>
Jason Gunthorpe Nov. 26, 2019, 6:32 p.m. UTC | #10
On Mon, Nov 25, 2019 at 11:33:27AM -0500, Jerome Glisse wrote:
> On Fri, Nov 22, 2019 at 11:33:12PM +0000, Jason Gunthorpe wrote:
> > On Fri, Nov 22, 2019 at 12:57:27PM -0800, Niranjana Vishwanathapura wrote:
> 
> [...]
> 
> > > +static int
> > > +i915_range_fault(struct i915_svm *svm, struct hmm_range *range)
> > > +{
> > > +	long ret;
> > > +
> > > +	range->default_flags = 0;
> > > +	range->pfn_flags_mask = -1UL;
> > > +
> > > +	ret = hmm_range_register(range, &svm->mirror);
> > > +	if (ret) {
> > > +		up_read(&svm->mm->mmap_sem);
> > > +		return (int)ret;
> > > +	}
> > 
> > 
> > Using a temporary range is the pattern from nouveau, is it really
> > necessary in this driver?
> 
> Just to comment on this, for GPU the usage model is not application
> register range of virtual address it wants to use. It is GPU can
> access _any_ CPU valid address just like the CPU would (modulo mmap
> of device file).
> 
> This is because the API you want in userspace is application passing
> random pointer to the GPU and GPU being able to chase down any kind
> of random pointer chain (assuming all valid ie pointing to valid
> virtual address for the process).
> 
> This is unlike the RDMA case.

No, RDMA has the full address space option as well, it is called
'implicit ODP'

This is implemented by registering ranges at a level in our page
table (currently 512G) whenever that level has populated pages below
it.

I think is a better strategy than temporary ranges.

But other GPU drivers like AMD are using BO and TTM objects with fixed
VA ranges and the range is tied to the BO/TTM.

I'm not sure if this i915 usage is closer to AMD or closer to nouveau.

Jason
Jason Gunthorpe Nov. 26, 2019, 6:45 p.m. UTC | #11
On Mon, Nov 25, 2019 at 08:32:58AM -0800, Niranjan Vishwanathapura wrote:
> > And putting the cpu PFN of a ZONE_DEVICE device page into
> > sg_dma_address still looks very wrong to me
> 
> The below call in patch 7 does convert any cpu PFN to device address.
> So, it won't be CPU PFN.
> i915_dmem_convert_pfn(vm->i915, &range);

Well, then it should be done here in patch 6, surely?

> Also, only reason to use sgl list is because i915 driver page table update
> functions takes an sgl, otherwise we can directly deal with range.pfns array.

Maybe that should be fixed instead of abusing sgl

Jason
Niranjana Vishwanathapura Dec. 3, 2019, 7:07 p.m. UTC | #12
On Tue, Nov 26, 2019 at 06:45:14PM +0000, Jason Gunthorpe wrote:
>On Mon, Nov 25, 2019 at 08:32:58AM -0800, Niranjan Vishwanathapura wrote:
>> > And putting the cpu PFN of a ZONE_DEVICE device page into
>> > sg_dma_address still looks very wrong to me
>>
>> The below call in patch 7 does convert any cpu PFN to device address.
>> So, it won't be CPU PFN.
>> i915_dmem_convert_pfn(vm->i915, &range);
>
>Well, then it should be done here in patch 6, surely?
>
>> Also, only reason to use sgl list is because i915 driver page table update
>> functions takes an sgl, otherwise we can directly deal with range.pfns array.
>
>Maybe that should be fixed instead of abusing sgl
>

Sorry, missed these comments.

Well, this i915 SVM support can also be extended to work on Intel integrated gfx
with host memory (no ZONE_DEVICE), though enabling that is not the primary focus
of this patch series.
There we need to dma map these CPU PFNs and assign them to sg_dma_address.

The device memory plugin for discrete graphics is added in patch 7 of this series,
hence the convert_pfn function is added there.

Niranjana

>Jason
Niranjana Vishwanathapura Dec. 3, 2019, 7:19 p.m. UTC | #13
On Tue, Nov 26, 2019 at 06:32:52PM +0000, Jason Gunthorpe wrote:
>On Mon, Nov 25, 2019 at 11:33:27AM -0500, Jerome Glisse wrote:
>> On Fri, Nov 22, 2019 at 11:33:12PM +0000, Jason Gunthorpe wrote:
>> > On Fri, Nov 22, 2019 at 12:57:27PM -0800, Niranjana Vishwanathapura wrote:
>>
>> [...]
>>
>> > > +static int
>> > > +i915_range_fault(struct i915_svm *svm, struct hmm_range *range)
>> > > +{
>> > > +	long ret;
>> > > +
>> > > +	range->default_flags = 0;
>> > > +	range->pfn_flags_mask = -1UL;
>> > > +
>> > > +	ret = hmm_range_register(range, &svm->mirror);
>> > > +	if (ret) {
>> > > +		up_read(&svm->mm->mmap_sem);
>> > > +		return (int)ret;
>> > > +	}
>> >
>> >
>> > Using a temporary range is the pattern from nouveau, is it really
>> > necessary in this driver?
>>
>> Just to comment on this, for GPU the usage model is not application
>> register range of virtual address it wants to use. It is GPU can
>> access _any_ CPU valid address just like the CPU would (modulo mmap
>> of device file).
>>
>> This is because the API you want in userspace is application passing
>> random pointer to the GPU and GPU being able to chase down any kind
>> of random pointer chain (assuming all valid ie pointing to valid
>> virtual address for the process).
>>
>> This is unlike the RDMA case.
>
>No, RDMA has the full address space option as well, it is called
>'implicit ODP'
>
>This is implemented by registering ranges at a level in our page
>table (currently 512G) whenever that level has populated pages below
>it.
>
>I think is a better strategy than temporary ranges.
>
>But other GPU drivers like AMD are using BO and TTM objects with fixed
>VA ranges and the range is tied to the BO/TTM.
>
>I'm not sure if this i915 usage is closer to AMD or closer to nouveau.
>

I don't know the full details of the HMM usecases in amd and nouveau.
AMD seems to be using it for usrptr objects which is tied to a BO.
I am not sure if nouveau has any BO tied to these address ranges.

Niranjana

>Jason
Jerome Glisse Dec. 4, 2019, 9:51 p.m. UTC | #14
On Tue, Dec 03, 2019 at 11:19:43AM -0800, Niranjan Vishwanathapura wrote:
> On Tue, Nov 26, 2019 at 06:32:52PM +0000, Jason Gunthorpe wrote:
> > On Mon, Nov 25, 2019 at 11:33:27AM -0500, Jerome Glisse wrote:
> > > On Fri, Nov 22, 2019 at 11:33:12PM +0000, Jason Gunthorpe wrote:
> > > > On Fri, Nov 22, 2019 at 12:57:27PM -0800, Niranjana Vishwanathapura wrote:
> > > 
> > > [...]
> > > 
> > > > > +static int
> > > > > +i915_range_fault(struct i915_svm *svm, struct hmm_range *range)
> > > > > +{
> > > > > +	long ret;
> > > > > +
> > > > > +	range->default_flags = 0;
> > > > > +	range->pfn_flags_mask = -1UL;
> > > > > +
> > > > > +	ret = hmm_range_register(range, &svm->mirror);
> > > > > +	if (ret) {
> > > > > +		up_read(&svm->mm->mmap_sem);
> > > > > +		return (int)ret;
> > > > > +	}
> > > >
> > > >
> > > > Using a temporary range is the pattern from nouveau, is it really
> > > > necessary in this driver?
> > > 
> > > Just to comment on this, for GPU the usage model is not application
> > > register range of virtual address it wants to use. It is GPU can
> > > access _any_ CPU valid address just like the CPU would (modulo mmap
> > > of device file).
> > > 
> > > This is because the API you want in userspace is application passing
> > > random pointer to the GPU and GPU being able to chase down any kind
> > > of random pointer chain (assuming all valid ie pointing to valid
> > > virtual address for the process).
> > > 
> > > This is unlike the RDMA case.
> > 
> > No, RDMA has the full address space option as well, it is called
> > 'implicit ODP'
> > 
> > This is implemented by registering ranges at a level in our page
> > table (currently 512G) whenever that level has populated pages below
> > it.
> > 
> > I think is a better strategy than temporary ranges.

HMM original design did not have range and was well suited to nouveau.
Recent changes make it more tie to the range and less suited to nouveau.
I would not consider 512GB implicit range as good thing. Plan i have is
to create implicit range and align them on vma. I do not know when i will
have time to get to that.

> > 
> > But other GPU drivers like AMD are using BO and TTM objects with fixed
> > VA ranges and the range is tied to the BO/TTM.
> > 
> > I'm not sure if this i915 usage is closer to AMD or closer to nouveau.
> > 
> 
> I don't know the full details of the HMM usecases in amd and nouveau.
> AMD seems to be using it for usrptr objects which is tied to a BO.
> I am not sure if nouveau has any BO tied to these address ranges.

It is closer to nouveau. The AMD usage is the old userptr usecase where you have
a BO tied to a range, while SVM means any valid CPU address and thus implies
that there is no BO tied to it (there is still a BO usecase that must keep
working here).

Cheers,
Jérôme
Jason Gunthorpe Dec. 8, 2019, 6:23 p.m. UTC | #15
On Wed, Dec 04, 2019 at 04:51:36PM -0500, Jerome Glisse wrote:
> On Tue, Dec 03, 2019 at 11:19:43AM -0800, Niranjan Vishwanathapura wrote:
> > On Tue, Nov 26, 2019 at 06:32:52PM +0000, Jason Gunthorpe wrote:
> > > On Mon, Nov 25, 2019 at 11:33:27AM -0500, Jerome Glisse wrote:
> > > > On Fri, Nov 22, 2019 at 11:33:12PM +0000, Jason Gunthorpe wrote:
> > > > > On Fri, Nov 22, 2019 at 12:57:27PM -0800, Niranjana Vishwanathapura wrote:
> > > > 
> > > > [...]
> > > > 
> > > > > > +static int
> > > > > > +i915_range_fault(struct i915_svm *svm, struct hmm_range *range)
> > > > > > +{
> > > > > > +	long ret;
> > > > > > +
> > > > > > +	range->default_flags = 0;
> > > > > > +	range->pfn_flags_mask = -1UL;
> > > > > > +
> > > > > > +	ret = hmm_range_register(range, &svm->mirror);
> > > > > > +	if (ret) {
> > > > > > +		up_read(&svm->mm->mmap_sem);
> > > > > > +		return (int)ret;
> > > > > > +	}
> > > > >
> > > > >
> > > > > Using a temporary range is the pattern from nouveau, is it really
> > > > > necessary in this driver?
> > > > 
> > > > Just to comment on this, for GPU the usage model is not application
> > > > register range of virtual address it wants to use. It is GPU can
> > > > access _any_ CPU valid address just like the CPU would (modulo mmap
> > > > of device file).
> > > > 
> > > > This is because the API you want in userspace is application passing
> > > > random pointer to the GPU and GPU being able to chase down any kind
> > > > of random pointer chain (assuming all valid ie pointing to valid
> > > > virtual address for the process).
> > > > 
> > > > This is unlike the RDMA case.
> > > 
> > > No, RDMA has the full address space option as well, it is called
> > > 'implicit ODP'
> > > 
> > > This is implemented by registering ranges at a level in our page
> > > table (currently 512G) whenever that level has populated pages below
> > > it.
> > > 
> > > I think is a better strategy than temporary ranges.
> 
> HMM original design did not have range and was well suited to nouveau.
> Recent changes make it more tie to the range and less suited to nouveau.
> I would not consider 512GB implicit range as good thing. Plan i have is
> to create implicit range and align them on vma. I do not know when i will
> have time to get to that.

For mlx5 the 512G is aligned to the page table levels, so it is a
reasonable approximation. GPU could do the same.

Not sure VMA-related ranges are really any better.

Jason

Patch

diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
index c2e48710eec8..689e57fe3973 100644
--- a/drivers/gpu/drm/i915/Kconfig
+++ b/drivers/gpu/drm/i915/Kconfig
@@ -141,6 +141,9 @@  config DRM_I915_SVM
 	bool "Enable Shared Virtual Memory support in i915"
 	depends on STAGING
 	depends on DRM_I915
+	depends on MMU
+	select HMM_MIRROR
+	select MMU_NOTIFIER
 	default n
 	help
 	  Choose this option if you want Shared Virtual Memory (SVM)
diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 75fe45633779..7d4cd9eefd12 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -154,7 +154,8 @@  i915-y += \
 	  intel_wopcm.o
 
 # SVM code
-i915-$(CONFIG_DRM_I915_SVM) += gem/i915_gem_svm.o
+i915-$(CONFIG_DRM_I915_SVM) += gem/i915_gem_svm.o \
+			       i915_svm.o
 
 # general-purpose microcontroller (GuC) support
 obj-y += gt/uc/
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 9c525d3f694c..c190df614c48 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -73,6 +73,7 @@ 
 #include "i915_perf.h"
 #include "i915_query.h"
 #include "i915_suspend.h"
+#include "i915_svm.h"
 #include "i915_switcheroo.h"
 #include "i915_sysfs.h"
 #include "i915_trace.h"
@@ -2700,6 +2701,8 @@  static int i915_bind_ioctl(struct drm_device *dev, void *data,
 		return -ENOENT;
 
 	switch (args->type) {
+	case I915_BIND_SVM_BUFFER:
+		return i915_svm_bind(vm, args);
 	case I915_BIND_SVM_GEM_OBJ:
 		ret = i915_gem_svm_bind(vm, args, file);
 	}
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index d9e95229ba1d..08cbbb4d91cb 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -42,6 +42,7 @@ 
 
 #include "i915_drv.h"
 #include "i915_scatterlist.h"
+#include "i915_svm.h"
 #include "i915_trace.h"
 #include "i915_vgpu.h"
 
@@ -550,6 +551,7 @@  static void i915_address_space_fini(struct i915_address_space *vm)
 	drm_mm_takedown(&vm->mm);
 
 	mutex_destroy(&vm->mutex);
+	mutex_destroy(&vm->svm_mutex);
 }
 
 void __i915_vm_close(struct i915_address_space *vm)
@@ -593,6 +595,7 @@  void i915_vm_release(struct kref *kref)
 	GEM_BUG_ON(i915_is_ggtt(vm));
 	trace_i915_ppgtt_release(vm);
 
+	i915_svm_unbind_mm(vm);
 	queue_rcu_work(vm->i915->wq, &vm->rcu);
 }
 
@@ -608,6 +611,7 @@  static void i915_address_space_init(struct i915_address_space *vm, int subclass)
 	 * attempt holding the lock is immediately reported by lockdep.
 	 */
 	mutex_init(&vm->mutex);
+	mutex_init(&vm->svm_mutex);
 	lockdep_set_subclass(&vm->mutex, subclass);
 	i915_gem_shrinker_taints_mutex(vm->i915, &vm->mutex);
 
@@ -619,6 +623,7 @@  static void i915_address_space_init(struct i915_address_space *vm, int subclass)
 
 	INIT_LIST_HEAD(&vm->bound_list);
 	INIT_LIST_HEAD(&vm->svm_list);
+	RCU_INIT_POINTER(vm->svm, NULL);
 }
 
 static int __setup_page_dma(struct i915_address_space *vm,
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index 6a8d55490e39..722b1e7ce291 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -293,6 +293,8 @@  struct i915_svm_obj {
 	u64 offset;
 };
 
+struct i915_svm;
+
 struct i915_address_space {
 	struct kref ref;
 	struct rcu_work rcu;
@@ -342,6 +344,8 @@  struct i915_address_space {
 	 */
 	struct list_head svm_list;
 	unsigned int svm_count;
+	struct i915_svm *svm;
+	struct mutex svm_mutex; /* protects svm enabling */
 
 	struct pagestash free_pages;
 
diff --git a/drivers/gpu/drm/i915/i915_svm.c b/drivers/gpu/drm/i915/i915_svm.c
new file mode 100644
index 000000000000..4e414f257121
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_svm.c
@@ -0,0 +1,296 @@ 
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2019 Intel Corporation
+ */
+
+#include <linux/mm_types.h>
+#include <linux/sched/mm.h>
+
+#include "i915_svm.h"
+#include "intel_memory_region.h"
+#include "gem/i915_gem_context.h"
+
+static const u64 i915_range_flags[HMM_PFN_FLAG_MAX] = {
+	[HMM_PFN_VALID]          = (1 << 0), /* HMM_PFN_VALID */
+	[HMM_PFN_WRITE]          = (1 << 1), /* HMM_PFN_WRITE */
+	[HMM_PFN_DEVICE_PRIVATE] = (1 << 2) /* HMM_PFN_DEVICE_PRIVATE */
+};
+
+static const u64 i915_range_values[HMM_PFN_VALUE_MAX] = {
+	[HMM_PFN_ERROR]   = 0xfffffffffffffffeUL, /* HMM_PFN_ERROR */
+	[HMM_PFN_NONE]    = 0, /* HMM_PFN_NONE */
+	[HMM_PFN_SPECIAL] = 0xfffffffffffffffcUL /* HMM_PFN_SPECIAL */
+};
+
+static inline bool
+i915_range_done(struct hmm_range *range)
+{
+	bool ret = hmm_range_valid(range);
+
+	hmm_range_unregister(range);
+	return ret;
+}
+
+static int
+i915_range_fault(struct i915_svm *svm, struct hmm_range *range)
+{
+	long ret;
+
+	range->default_flags = 0;
+	range->pfn_flags_mask = -1UL;
+
+	ret = hmm_range_register(range, &svm->mirror);
+	if (ret) {
+		up_read(&svm->mm->mmap_sem);
+		return (int)ret;
+	}
+
+	if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) {
+		up_read(&svm->mm->mmap_sem);
+		return -EBUSY;
+	}
+
+	ret = hmm_range_fault(range, 0);
+	if (ret <= 0) {
+		if (ret == 0)
+			ret = -EBUSY;
+		up_read(&svm->mm->mmap_sem);
+		hmm_range_unregister(range);
+		return ret;
+	}
+	return 0;
+}
+
+static struct i915_svm *vm_get_svm(struct i915_address_space *vm)
+{
+	struct i915_svm *svm = vm->svm;
+
+	mutex_lock(&vm->svm_mutex);
+	if (svm && !kref_get_unless_zero(&svm->ref))
+		svm = NULL;
+
+	mutex_unlock(&vm->svm_mutex);
+	return svm;
+}
+
+static void release_svm(struct kref *ref)
+{
+	struct i915_svm *svm = container_of(ref, typeof(*svm), ref);
+	struct i915_address_space *vm = svm->vm;
+
+	hmm_mirror_unregister(&svm->mirror);
+	mutex_destroy(&svm->mutex);
+	vm->svm = NULL;
+	kfree(svm);
+}
+
+static void vm_put_svm(struct i915_address_space *vm)
+{
+	mutex_lock(&vm->svm_mutex);
+	if (vm->svm)
+		kref_put(&vm->svm->ref, release_svm);
+	mutex_unlock(&vm->svm_mutex);
+}
+
+static u32 i915_svm_build_sg(struct i915_address_space *vm,
+			     struct hmm_range *range,
+			     struct sg_table *st)
+{
+	struct scatterlist *sg;
+	u32 sg_page_sizes = 0;
+	u64 i, npages;
+
+	sg = NULL;
+	st->nents = 0;
+	npages = (range->end - range->start) / PAGE_SIZE;
+
+	/*
+	 * No need to dma map the host pages and later unmap them, as
+	 * the GPU is not allowed to access them with SVM. Hence, no need
+	 * for any intermediate data structure to hold the mappings.
+	 */
+	for (i = 0; i < npages; i++) {
+		u64 addr = range->pfns[i] & ~((1UL << range->pfn_shift) - 1);
+
+		if (sg && (addr == (sg_dma_address(sg) + sg->length))) {
+			sg->length += PAGE_SIZE;
+			sg_dma_len(sg) += PAGE_SIZE;
+			continue;
+		}
+
+		if (sg)
+			sg_page_sizes |= sg->length;
+
+		sg =  sg ? __sg_next(sg) : st->sgl;
+		sg_dma_address(sg) = addr;
+		sg_dma_len(sg) = PAGE_SIZE;
+		sg->length = PAGE_SIZE;
+		st->nents++;
+	}
+
+	sg_page_sizes |= sg->length;
+	sg_mark_end(sg);
+	return sg_page_sizes;
+}
+
+static void i915_svm_unbind_addr(struct i915_address_space *vm,
+				 u64 start, u64 length)
+{
+	struct i915_svm *svm = vm->svm;
+
+	mutex_lock(&svm->mutex);
+	/* FIXME: Need to flush the TLB */
+	svm_unbind_addr(vm, start, length);
+	mutex_unlock(&svm->mutex);
+}
+
+int i915_svm_bind(struct i915_address_space *vm, struct drm_i915_bind *args)
+{
+	struct vm_area_struct *vma;
+	struct hmm_range range;
+	struct i915_svm *svm;
+	u64 i, flags, npages;
+	struct sg_table st;
+	u32 sg_page_sizes;
+	int ret = 0;
+
+	svm = vm_get_svm(vm);
+	if (!svm)
+		return -EINVAL;
+
+	args->length += (args->start & ~PAGE_MASK);
+	args->start &= PAGE_MASK;
+	DRM_DEBUG_DRIVER("%sing start 0x%llx length 0x%llx vm_id 0x%x\n",
+			 (args->flags & I915_BIND_UNBIND) ? "Unbind" : "Bind",
+			 args->start, args->length, args->vm_id);
+	if (args->flags & I915_BIND_UNBIND) {
+		i915_svm_unbind_addr(vm, args->start, args->length);
+		goto bind_done;
+	}
+
+	npages = args->length / PAGE_SIZE;
+	if (unlikely(sg_alloc_table(&st, npages, GFP_KERNEL))) {
+		ret = -ENOMEM;
+		goto bind_done;
+	}
+
+	range.pfns = kvmalloc_array(npages, sizeof(uint64_t), GFP_KERNEL);
+	if (unlikely(!range.pfns)) {
+		ret = -ENOMEM;
+		goto range_done;
+	}
+
+	ret = svm_bind_addr_prepare(vm, args->start, args->length);
+	if (unlikely(ret))
+		goto prepare_done;
+
+	range.pfn_shift = PAGE_SHIFT;
+	range.start = args->start;
+	range.flags = i915_range_flags;
+	range.values = i915_range_values;
+	range.end = range.start + args->length;
+	for (i = 0; i < npages; ++i) {
+		range.pfns[i] = range.flags[HMM_PFN_VALID];
+		if (!(args->flags & I915_BIND_READONLY))
+			range.pfns[i] |= range.flags[HMM_PFN_WRITE];
+	}
+
+	down_read(&svm->mm->mmap_sem);
+	vma = find_vma(svm->mm, range.start);
+	if (unlikely(!vma || vma->vm_end < range.end)) {
+		ret = -EFAULT;
+		goto vma_done;
+	}
+again:
+	ret = i915_range_fault(svm, &range);
+	if (unlikely(ret))
+		goto vma_done;
+
+	sg_page_sizes = i915_svm_build_sg(vm, &range, &st);
+
+	mutex_lock(&svm->mutex);
+	if (!i915_range_done(&range)) {
+		mutex_unlock(&svm->mutex);
+		goto again;
+	}
+
+	flags = (args->flags & I915_BIND_READONLY) ? I915_GTT_SVM_READONLY : 0;
+	ret = svm_bind_addr_commit(vm, args->start, args->length,
+				   flags, &st, sg_page_sizes);
+	mutex_unlock(&svm->mutex);
+vma_done:
+	up_read(&svm->mm->mmap_sem);
+	if (unlikely(ret))
+		i915_svm_unbind_addr(vm, args->start, args->length);
+prepare_done:
+	kvfree(range.pfns);
+range_done:
+	sg_free_table(&st);
+bind_done:
+	vm_put_svm(vm);
+	return ret;
+}
+
+static int
+i915_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
+				const struct mmu_notifier_range *update)
+{
+	struct i915_svm *svm = container_of(mirror, typeof(*svm), mirror);
+	unsigned long length = update->end - update->start;
+
+	DRM_DEBUG_DRIVER("start 0x%lx length 0x%lx\n", update->start, length);
+	if (!mmu_notifier_range_blockable(update))
+		return -EAGAIN;
+
+	i915_svm_unbind_addr(svm->vm, update->start, length);
+	return 0;
+}
+
+static void i915_mirror_release(struct hmm_mirror *mirror)
+{
+}
+
+static const struct hmm_mirror_ops i915_mirror_ops = {
+	.sync_cpu_device_pagetables = &i915_sync_cpu_device_pagetables,
+	.release = &i915_mirror_release,
+};
+
+void i915_svm_unbind_mm(struct i915_address_space *vm)
+{
+	vm_put_svm(vm);
+}
+
+int i915_svm_bind_mm(struct i915_address_space *vm)
+{
+	struct i915_svm *svm;
+	struct mm_struct *mm;
+	int ret = 0;
+
+	mm = get_task_mm(current);
+	down_write(&mm->mmap_sem);
+	mutex_lock(&vm->svm_mutex);
+	if (vm->svm)
+		goto bind_out;
+
+	svm = kzalloc(sizeof(*svm), GFP_KERNEL);
+	if (!svm) {
+		ret = -ENOMEM;
+		goto bind_out;
+	}
+	svm->mirror.ops = &i915_mirror_ops;
+	mutex_init(&svm->mutex);
+	kref_init(&svm->ref);
+	svm->mm = mm;
+	svm->vm = vm;
+
+	ret = hmm_mirror_register(&svm->mirror, mm);
+	if (ret)
+		goto bind_out;
+
+	vm->svm = svm;
+bind_out:
+	mutex_unlock(&vm->svm_mutex);
+	up_write(&mm->mmap_sem);
+	mmput(mm);
+	return ret;
+}
diff --git a/drivers/gpu/drm/i915/i915_svm.h b/drivers/gpu/drm/i915/i915_svm.h
new file mode 100644
index 000000000000..f176f1dc493f
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_svm.h
@@ -0,0 +1,50 @@ 
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2019 Intel Corporation
+ */
+
+#ifndef __I915_SVM_H
+#define __I915_SVM_H
+
+#include <linux/hmm.h>
+
+#include "i915_drv.h"
+
+#if defined(CONFIG_DRM_I915_SVM)
+struct i915_svm {
+	/* i915 address space */
+	struct i915_address_space *vm;
+
+	/* Associated process mm */
+	struct mm_struct *mm;
+
+	/* hmm_mirror to track memory invalidations for a process */
+	struct hmm_mirror mirror;
+
+	struct mutex mutex; /* protects svm operations */
+	struct kref ref;
+};
+
+int i915_svm_bind(struct i915_address_space *vm, struct drm_i915_bind *args);
+void i915_svm_unbind_mm(struct i915_address_space *vm);
+int i915_svm_bind_mm(struct i915_address_space *vm);
+static inline bool i915_vm_is_svm_enabled(struct i915_address_space *vm)
+{
+	return vm->svm;
+}
+
+#else
+
+struct i915_svm { };
+static inline int i915_svm_bind(struct i915_address_space *vm,
+				struct drm_i915_bind *args)
+{ return -ENOTSUPP; }
+static inline void i915_svm_unbind_mm(struct i915_address_space *vm) { }
+static inline int i915_svm_bind_mm(struct i915_address_space *vm)
+{ return -ENOTSUPP; }
+static inline bool i915_vm_is_svm_enabled(struct i915_address_space *vm)
+{ return false; }
+
+#endif
+
+#endif /* __I915_SVM_H */