diff mbox series

[v2,02/29] mm/migrate: Add migrate_device_prepopulated_range

Message ID 20241016032518.539495-3-matthew.brost@intel.com (mailing list archive)
State New, archived
Headers show
Series Introduce GPU SVM and Xe SVM implementation | expand

Commit Message

Matthew Brost Oct. 16, 2024, 3:24 a.m. UTC
Add migrate_device_prepoluated_range which prepares an array of
pre-populated device pages for migration.

v2:
 - s/migrate_device_vma_range/migrate_device_prepopulated_range
 - Drop extra mmu invalidation (Vetter)

Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 include/linux/migrate.h |  1 +
 mm/migrate_device.c     | 35 +++++++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+)

Comments

Alistair Popple Oct. 16, 2024, 4:04 a.m. UTC | #1
Matthew Brost <matthew.brost@intel.com> writes:

> Add migrate_device_prepoluated_range which prepares an array of
> pre-populated device pages for migration.

It would be nice if the commit message could also include an explanation
of why the existing migrate_device_range() is inadequate for your needs.

> v2:
>  - s/migrate_device_vma_range/migrate_device_prepopulated_range
>  - Drop extra mmu invalidation (Vetter)
>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>  include/linux/migrate.h |  1 +
>  mm/migrate_device.c     | 35 +++++++++++++++++++++++++++++++++++
>  2 files changed, 36 insertions(+)
>
> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> index 002e49b2ebd9..9146ed39a2a3 100644
> --- a/include/linux/migrate.h
> +++ b/include/linux/migrate.h
> @@ -229,6 +229,7 @@ void migrate_vma_pages(struct migrate_vma *migrate);
>  void migrate_vma_finalize(struct migrate_vma *migrate);
>  int migrate_device_range(unsigned long *src_pfns, unsigned long start,
>  			unsigned long npages);
> +int migrate_device_prepopulated_range(unsigned long *src_pfns, unsigned long npages);
>  void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
>  			unsigned long npages);
>  void migrate_device_finalize(unsigned long *src_pfns,
> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> index 9cf26592ac93..f163c2131022 100644
> --- a/mm/migrate_device.c
> +++ b/mm/migrate_device.c
> @@ -924,6 +924,41 @@ int migrate_device_range(unsigned long *src_pfns, unsigned long start,
>  }
>  EXPORT_SYMBOL(migrate_device_range);
>  
> +/**
> + * migrate_device_prepopulated_range() - migrate device private pfns to normal memory.
> + * @src_pfns: pre-popluated array of source device private pfns to migrate.
> + * @npages: number of pages to migrate.
> + *
> + * Similar to migrate_device_range() but supports non-contiguous pre-popluated
> + * array of device pages to migrate.
> + */
> +int migrate_device_prepopulated_range(unsigned long *src_pfns, unsigned long npages)

I don't love the name, I think because it is quite long and makes me
think of something complicated like prefaulting. Perhaps
migrate_device_pfns()?

> +{
> +	unsigned long i;
> +
> +	for (i = 0; i < npages; i++) {
> +		struct page *page = pfn_to_page(src_pfns[i]);
> +
> +		if (!get_page_unless_zero(page)) {
> +			src_pfns[i] = 0;
> +			continue;
> +		}
> +
> +		if (!trylock_page(page)) {
> +			src_pfns[i] = 0;
> +			put_page(page);
> +			continue;
> +		}
> +
> +		src_pfns[i] = migrate_pfn(src_pfns[i]) | MIGRATE_PFN_MIGRATE;

This needs to be converted to use a folio like
migrate_device_range(). But more importantly this should be split out as
a function that both migrate_device_range() and this function can call
given this bit is identical.

> +	}
> +
> +	migrate_device_unmap(src_pfns, npages, NULL);
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL(migrate_device_prepopulated_range);
> +
>  /*
>   * Migrate a device coherent folio back to normal memory. The caller should have
>   * a reference on folio which will be copied to the new folio if migration is
Matthew Brost Oct. 16, 2024, 4:46 a.m. UTC | #2
On Wed, Oct 16, 2024 at 03:04:06PM +1100, Alistair Popple wrote:
> 
> Matthew Brost <matthew.brost@intel.com> writes:
> 
> > Add migrate_device_prepoluated_range which prepares an array of
> > pre-populated device pages for migration.
> 
> It would be nice if the commit message could also include an explanation
> of why the existing migrate_device_range() is inadequate for your needs.
> 

Yea, my bad. It should explain this is required for non-contiguous
device pages. I suppose I could always iterate for contiguous regions
with migrate_device_range too if you think that is better.

> > v2:
> >  - s/migrate_device_vma_range/migrate_device_prepopulated_range
> >  - Drop extra mmu invalidation (Vetter)
> >
> > Cc: Andrew Morton <akpm@linux-foundation.org>
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> >  include/linux/migrate.h |  1 +
> >  mm/migrate_device.c     | 35 +++++++++++++++++++++++++++++++++++
> >  2 files changed, 36 insertions(+)
> >
> > diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> > index 002e49b2ebd9..9146ed39a2a3 100644
> > --- a/include/linux/migrate.h
> > +++ b/include/linux/migrate.h
> > @@ -229,6 +229,7 @@ void migrate_vma_pages(struct migrate_vma *migrate);
> >  void migrate_vma_finalize(struct migrate_vma *migrate);
> >  int migrate_device_range(unsigned long *src_pfns, unsigned long start,
> >  			unsigned long npages);
> > +int migrate_device_prepopulated_range(unsigned long *src_pfns, unsigned long npages);
> >  void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
> >  			unsigned long npages);
> >  void migrate_device_finalize(unsigned long *src_pfns,
> > diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> > index 9cf26592ac93..f163c2131022 100644
> > --- a/mm/migrate_device.c
> > +++ b/mm/migrate_device.c
> > @@ -924,6 +924,41 @@ int migrate_device_range(unsigned long *src_pfns, unsigned long start,
> >  }
> >  EXPORT_SYMBOL(migrate_device_range);
> >  
> > +/**
> > + * migrate_device_prepopulated_range() - migrate device private pfns to normal memory.
> > + * @src_pfns: pre-popluated array of source device private pfns to migrate.
> > + * @npages: number of pages to migrate.
> > + *
> > + * Similar to migrate_device_range() but supports non-contiguous pre-popluated
> > + * array of device pages to migrate.
> > + */
> > +int migrate_device_prepopulated_range(unsigned long *src_pfns, unsigned long npages)
> 
> I don't love the name, I think because it is quite long and makes me
> think of something complicated like prefaulting. Perhaps
> migrate_device_pfns()?
> 

Sure.

> > +{
> > +	unsigned long i;
> > +
> > +	for (i = 0; i < npages; i++) {
> > +		struct page *page = pfn_to_page(src_pfns[i]);
> > +
> > +		if (!get_page_unless_zero(page)) {
> > +			src_pfns[i] = 0;
> > +			continue;
> > +		}
> > +
> > +		if (!trylock_page(page)) {
> > +			src_pfns[i] = 0;
> > +			put_page(page);
> > +			continue;
> > +		}
> > +
> > +		src_pfns[i] = migrate_pfn(src_pfns[i]) | MIGRATE_PFN_MIGRATE;
> 
> This needs to be converted to use a folio like
> migrate_device_range(). But more importantly this should be split out as
> a function that both migrate_device_range() and this function can call
> given this bit is identical.
> 

Missed the folio conversion and agree a helper shared between this
function and migrate_device_range would be a good idea. Let add that.

Matt

> > +	}
> > +
> > +	migrate_device_unmap(src_pfns, npages, NULL);
> > +
> > +	return 0;
> > +}
> > +EXPORT_SYMBOL(migrate_device_prepopulated_range);
> > +
> >  /*
> >   * Migrate a device coherent folio back to normal memory. The caller should have
> >   * a reference on folio which will be copied to the new folio if migration is
>
Matthew Brost Oct. 17, 2024, 12:56 a.m. UTC | #3
On Wed, Oct 16, 2024 at 04:46:52AM +0000, Matthew Brost wrote:
> On Wed, Oct 16, 2024 at 03:04:06PM +1100, Alistair Popple wrote:
> > 
> > Matthew Brost <matthew.brost@intel.com> writes:
> > 
> > > Add migrate_device_prepoluated_range which prepares an array of
> > > pre-populated device pages for migration.
> > 
> > It would be nice if the commit message could also include an explanation
> > of why the existing migrate_device_range() is inadequate for your needs.
> > 
> 
> Yea, my bad. It should explain this is required for non-contiguous
> device pages. I suppose I could always iterate for contiguous regions
> with migrate_device_range too if you think that is better.
> 
> > > v2:
> > >  - s/migrate_device_vma_range/migrate_device_prepopulated_range
> > >  - Drop extra mmu invalidation (Vetter)
> > >
> > > Cc: Andrew Morton <akpm@linux-foundation.org>
> > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > ---
> > >  include/linux/migrate.h |  1 +
> > >  mm/migrate_device.c     | 35 +++++++++++++++++++++++++++++++++++
> > >  2 files changed, 36 insertions(+)
> > >
> > > diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> > > index 002e49b2ebd9..9146ed39a2a3 100644
> > > --- a/include/linux/migrate.h
> > > +++ b/include/linux/migrate.h
> > > @@ -229,6 +229,7 @@ void migrate_vma_pages(struct migrate_vma *migrate);
> > >  void migrate_vma_finalize(struct migrate_vma *migrate);
> > >  int migrate_device_range(unsigned long *src_pfns, unsigned long start,
> > >  			unsigned long npages);
> > > +int migrate_device_prepopulated_range(unsigned long *src_pfns, unsigned long npages);
> > >  void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
> > >  			unsigned long npages);
> > >  void migrate_device_finalize(unsigned long *src_pfns,
> > > diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> > > index 9cf26592ac93..f163c2131022 100644
> > > --- a/mm/migrate_device.c
> > > +++ b/mm/migrate_device.c
> > > @@ -924,6 +924,41 @@ int migrate_device_range(unsigned long *src_pfns, unsigned long start,
> > >  }
> > >  EXPORT_SYMBOL(migrate_device_range);
> > >  
> > > +/**
> > > + * migrate_device_prepopulated_range() - migrate device private pfns to normal memory.
> > > + * @src_pfns: pre-popluated array of source device private pfns to migrate.
> > > + * @npages: number of pages to migrate.
> > > + *
> > > + * Similar to migrate_device_range() but supports non-contiguous pre-popluated
> > > + * array of device pages to migrate.
> > > + */
> > > +int migrate_device_prepopulated_range(unsigned long *src_pfns, unsigned long npages)
> > 
> > I don't love the name, I think because it is quite long and makes me
> > think of something complicated like prefaulting. Perhaps
> > migrate_device_pfns()?
> > 
> 
> Sure.
> 
> > > +{
> > > +	unsigned long i;
> > > +
> > > +	for (i = 0; i < npages; i++) {
> > > +		struct page *page = pfn_to_page(src_pfns[i]);
> > > +
> > > +		if (!get_page_unless_zero(page)) {
> > > +			src_pfns[i] = 0;
> > > +			continue;
> > > +		}
> > > +
> > > +		if (!trylock_page(page)) {
> > > +			src_pfns[i] = 0;
> > > +			put_page(page);
> > > +			continue;
> > > +		}
> > > +
> > > +		src_pfns[i] = migrate_pfn(src_pfns[i]) | MIGRATE_PFN_MIGRATE;
> > 
> > This needs to be converted to use a folio like
> > migrate_device_range(). But more importantly this should be split out as
> > a function that both migrate_device_range() and this function can call
> > given this bit is identical.
> > 
> 
> Missed the folio conversion and agree a helper shared between this
> function and migrate_device_range would be a good idea. Let add that.
> 

Alistair,

Ok, I think now I want to go slightly different direction here to give
GPUSVM a bit more control over several eviction scenarios.

What if I exported the helper discussed above, e.g.,

 905 unsigned long migrate_device_pfn_lock(unsigned long pfn)
 906 {
 907         struct folio *folio;
 908
 909         folio = folio_get_nontail_page(pfn_to_page(pfn));
 910         if (!folio)
 911                 return 0;
 912
 913         if (!folio_trylock(folio)) {
 914                 folio_put(folio);
 915                 return 0;
 916         }
 917
 918         return migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
 919 }
 920 EXPORT_SYMBOL(migrate_device_pfn_lock);

And then also export migrate_device_unmap.

The usage here would be let a driver collect the device pages in virtual
address range via hmm_range_fault, lock device pages under notifier
lock ensuring device pages are valid, drop the notifier lock and call
migrate_device_unmap. Sima has strongly suggested avoiding a CPUVMA
lookup during eviction cases and this would let me fixup
drm_gpusvm_range_evict in [1] to avoid this.

It would also make the function exported in this patch unnecessary too
as non-contiguous pfns can be setup on driver side via
migrate_device_pfn_lock and then migrate_device_unmap can be called.
This also another eviction usage in GPUSVM, see drm_gpusvm_evict_to_ram
in [1].

Do you see an issue exporting migrate_device_pfn_lock,
migrate_device_unmap?

Matt

[1] https://patchwork.freedesktop.org/patch/619809/?series=137870&rev=2

> Matt
> 
> > > +	}
> > > +
> > > +	migrate_device_unmap(src_pfns, npages, NULL);
> > > +
> > > +	return 0;
> > > +}
> > > +EXPORT_SYMBOL(migrate_device_prepopulated_range);
> > > +
> > >  /*
> > >   * Migrate a device coherent folio back to normal memory. The caller should have
> > >   * a reference on folio which will be copied to the new folio if migration is
> >
Alistair Popple Oct. 17, 2024, 1:49 a.m. UTC | #4
Matthew Brost <matthew.brost@intel.com> writes:

> On Wed, Oct 16, 2024 at 04:46:52AM +0000, Matthew Brost wrote:
>> On Wed, Oct 16, 2024 at 03:04:06PM +1100, Alistair Popple wrote:

[...]

>> > > +{
>> > > +	unsigned long i;
>> > > +
>> > > +	for (i = 0; i < npages; i++) {
>> > > +		struct page *page = pfn_to_page(src_pfns[i]);
>> > > +
>> > > +		if (!get_page_unless_zero(page)) {
>> > > +			src_pfns[i] = 0;
>> > > +			continue;
>> > > +		}
>> > > +
>> > > +		if (!trylock_page(page)) {
>> > > +			src_pfns[i] = 0;
>> > > +			put_page(page);
>> > > +			continue;
>> > > +		}
>> > > +
>> > > +		src_pfns[i] = migrate_pfn(src_pfns[i]) | MIGRATE_PFN_MIGRATE;
>> > 
>> > This needs to be converted to use a folio like
>> > migrate_device_range(). But more importantly this should be split out as
>> > a function that both migrate_device_range() and this function can call
>> > given this bit is identical.
>> > 
>> 
>> Missed the folio conversion and agree a helper shared between this
>> function and migrate_device_range would be a good idea. Let add that.
>> 
>
> Alistair,
>
> Ok, I think now I want to go slightly different direction here to give
> GPUSVM a bit more control over several eviction scenarios.
>
> What if I exported the helper discussed above, e.g.,
>
>  905 unsigned long migrate_device_pfn_lock(unsigned long pfn)
>  906 {
>  907         struct folio *folio;
>  908
>  909         folio = folio_get_nontail_page(pfn_to_page(pfn));
>  910         if (!folio)
>  911                 return 0;
>  912
>  913         if (!folio_trylock(folio)) {
>  914                 folio_put(folio);
>  915                 return 0;
>  916         }
>  917
>  918         return migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
>  919 }
>  920 EXPORT_SYMBOL(migrate_device_pfn_lock);
>
> And then also export migrate_device_unmap.
>
> The usage here would be let a driver collect the device pages in virtual
> address range via hmm_range_fault, lock device pages under notifier
> lock ensuring device pages are valid, drop the notifier lock and call
> migrate_device_unmap.

I'm still working through this series but that seems a bit dubious, the
locking here is pretty subtle and easy to get wrong so seeing some code
would help me a lot in understanding what you're suggesting.

> Sima has strongly suggested avoiding a CPUVMA
> lookup during eviction cases and this would let me fixup
> drm_gpusvm_range_evict in [1] to avoid this.

That sounds reasonable but for context do you have a link to the
comments/discussion on this? I couldn't readily find it, but I may have
just missed it.

> It would also make the function exported in this patch unnecessary too
> as non-contiguous pfns can be setup on driver side via
> migrate_device_pfn_lock and then migrate_device_unmap can be called.
> This also another eviction usage in GPUSVM, see drm_gpusvm_evict_to_ram
> in [1].
>
> Do you see an issue exporting migrate_device_pfn_lock,
> migrate_device_unmap?

If there is a good justification for it I can't see a problem with
exporting it. That said I don't really understand why you would
want/need to split those steps up but I'll wait to see the code.

 - Alistair

> Matt
>
> [1] https://patchwork.freedesktop.org/patch/619809/?series=137870&rev=2
>
>> Matt
>> 
>> > > +	}
>> > > +
>> > > +	migrate_device_unmap(src_pfns, npages, NULL);
>> > > +
>> > > +	return 0;
>> > > +}
>> > > +EXPORT_SYMBOL(migrate_device_prepopulated_range);
>> > > +
>> > >  /*
>> > >   * Migrate a device coherent folio back to normal memory. The caller should have
>> > >   * a reference on folio which will be copied to the new folio if migration is
>> >
Matthew Brost Oct. 17, 2024, 2:45 a.m. UTC | #5
On Thu, Oct 17, 2024 at 12:49:55PM +1100, Alistair Popple wrote:
> 
> Matthew Brost <matthew.brost@intel.com> writes:
> 
> > On Wed, Oct 16, 2024 at 04:46:52AM +0000, Matthew Brost wrote:
> >> On Wed, Oct 16, 2024 at 03:04:06PM +1100, Alistair Popple wrote:
> 
> [...]
> 
> >> > > +{
> >> > > +	unsigned long i;
> >> > > +
> >> > > +	for (i = 0; i < npages; i++) {
> >> > > +		struct page *page = pfn_to_page(src_pfns[i]);
> >> > > +
> >> > > +		if (!get_page_unless_zero(page)) {
> >> > > +			src_pfns[i] = 0;
> >> > > +			continue;
> >> > > +		}
> >> > > +
> >> > > +		if (!trylock_page(page)) {
> >> > > +			src_pfns[i] = 0;
> >> > > +			put_page(page);
> >> > > +			continue;
> >> > > +		}
> >> > > +
> >> > > +		src_pfns[i] = migrate_pfn(src_pfns[i]) | MIGRATE_PFN_MIGRATE;
> >> > 
> >> > This needs to be converted to use a folio like
> >> > migrate_device_range(). But more importantly this should be split out as
> >> > a function that both migrate_device_range() and this function can call
> >> > given this bit is identical.
> >> > 
> >> 
> >> Missed the folio conversion and agree a helper shared between this
> >> function and migrate_device_range would be a good idea. Let add that.
> >> 
> >
> > Alistair,
> >
> > Ok, I think now I want to go slightly different direction here to give
> > GPUSVM a bit more control over several eviction scenarios.
> >
> > What if I exported the helper discussed above, e.g.,
> >
> >  905 unsigned long migrate_device_pfn_lock(unsigned long pfn)
> >  906 {
> >  907         struct folio *folio;
> >  908
> >  909         folio = folio_get_nontail_page(pfn_to_page(pfn));
> >  910         if (!folio)
> >  911                 return 0;
> >  912
> >  913         if (!folio_trylock(folio)) {
> >  914                 folio_put(folio);
> >  915                 return 0;
> >  916         }
> >  917
> >  918         return migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
> >  919 }
> >  920 EXPORT_SYMBOL(migrate_device_pfn_lock);
> >
> > And then also export migrate_device_unmap.
> >
> > The usage here would be let a driver collect the device pages in virtual
> > address range via hmm_range_fault, lock device pages under notifier
> > lock ensuring device pages are valid, drop the notifier lock and call
> > migrate_device_unmap.
> 
> I'm still working through this series but that seems a bit dubious, the
> locking here is pretty subtle and easy to get wrong so seeing some code
> would help me a lot in understanding what you're suggesting.
>

For sure locking in tricky, my mistake on not working through this
before sending out the next rev but it came to mind after sending +
regarding some late feedback from Thomas about using hmm for eviction
[2]. His suggestion of using hmm_range_fault to trigger migration
doesn't work for coherent pages, while something like below does.

[2] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1125461

Here is a snippet I have locally which seems to work.

2024 retry:
2025         hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
2026         hmm_range.hmm_pfns = src;
2027
2028         while (true) {
2029                 mmap_read_lock(mm);
2030                 err = hmm_range_fault(&hmm_range);
2031                 mmap_read_unlock(mm);
2032                 if (err == -EBUSY) {
2033                         if (time_after(jiffies, timeout))
2034                                 break;
2035
2036                         hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
2037                         continue;
2038                 }
2039                 break;
2040         }
2041         if (err)
2042                 goto err_put;
2043
2044         drm_gpusvm_notifier_lock(gpusvm);
2045         if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
2046                 drm_gpusvm_notifier_unlock(gpusvm);
2047                 memset(src, 0, sizeof(*src) * npages);
2048                 goto retry;
2049         }
2050         for (i = 0; i < npages; ++i) {
2051                 struct page *page = hmm_pfn_to_page(src[i]);
2052
2053                 if (page && (is_device_private_page(page) ||
2054                     is_device_coherent_page(page)) && page->zone_device_data)
2055                         src[i] = src[i] & ~HMM_PFN_FLAGS;
2056                 else
2057                         src[i] = 0;
2058                 if (src[i])
2059                         src[i] = migrate_device_pfn_lock(src[i]);
2060         }
2061         drm_gpusvm_notifier_unlock(gpusvm);
2062
2063         migrate_device_unmap(src, npages, NULL);
...
2101         migrate_device_pages(src, dst, npages);
2102         migrate_device_finalize(src, dst, npages);


> > Sima has strongly suggested avoiding a CPUVMA
> > lookup during eviction cases and this would let me fixup
> > drm_gpusvm_range_evict in [1] to avoid this.
> 
> That sounds reasonable but for context do you have a link to the
> comments/discussion on this? I couldn't readily find it, but I may have
> just missed it.
> 

See in [4], search for '2. eviction' comment from sima.

[3] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1110726
[4] https://lore.kernel.org/all/BYAPR11MB3159A304925168D8B6B4671292692@BYAPR11MB3159.namprd11.prod.outlook.com/T/#m89cd6a37778ba5271d5381ebeb03e1f963856a78

> > It would also make the function exported in this patch unnecessary too
> > as non-contiguous pfns can be setup on driver side via
> > migrate_device_pfn_lock and then migrate_device_unmap can be called.
> > This also another eviction usage in GPUSVM, see drm_gpusvm_evict_to_ram
> > in [1].
> >
> > Do you see an issue exporting migrate_device_pfn_lock,
> > migrate_device_unmap?
> 
> If there is a good justification for it I can't see a problem with
> exporting it. That said I don't really understand why you would
> want/need to split those steps up but I'll wait to see the code.
>

It is so the device pages returned from hmm_range_fault, which are only
guaranteed to be valid under the notifier lock + a seqno check, to be
locked and ref taken for migration. migrate_device_unmap() can trigger a
MMU invalidation which takes the notifier lock thus calling the function
which combines migrate_device_pfn_lock + migrate_device_unmap deadlocks.

I think this flow makes sense and agree in general this likely better
than looking at a CPUVMA.

Matt
 
>  - Alistair
> 
> > Matt
> >
> > [1] https://patchwork.freedesktop.org/patch/619809/?series=137870&rev=2
> >
> >> Matt
> >> 
> >> > > +	}
> >> > > +
> >> > > +	migrate_device_unmap(src_pfns, npages, NULL);
> >> > > +
> >> > > +	return 0;
> >> > > +}
> >> > > +EXPORT_SYMBOL(migrate_device_prepopulated_range);
> >> > > +
> >> > >  /*
> >> > >   * Migrate a device coherent folio back to normal memory. The caller should have
> >> > >   * a reference on folio which will be copied to the new folio if migration is
> >> > 
>
Alistair Popple Oct. 17, 2024, 3:21 a.m. UTC | #6
Matthew Brost <matthew.brost@intel.com> writes:

> On Thu, Oct 17, 2024 at 12:49:55PM +1100, Alistair Popple wrote:
>> 
>> Matthew Brost <matthew.brost@intel.com> writes:
>> 
>> > On Wed, Oct 16, 2024 at 04:46:52AM +0000, Matthew Brost wrote:
>> >> On Wed, Oct 16, 2024 at 03:04:06PM +1100, Alistair Popple wrote:
>> 
>> [...]
>> 
>> >> > > +{
>> >> > > +	unsigned long i;
>> >> > > +
>> >> > > +	for (i = 0; i < npages; i++) {
>> >> > > +		struct page *page = pfn_to_page(src_pfns[i]);
>> >> > > +
>> >> > > +		if (!get_page_unless_zero(page)) {
>> >> > > +			src_pfns[i] = 0;
>> >> > > +			continue;
>> >> > > +		}
>> >> > > +
>> >> > > +		if (!trylock_page(page)) {
>> >> > > +			src_pfns[i] = 0;
>> >> > > +			put_page(page);
>> >> > > +			continue;
>> >> > > +		}
>> >> > > +
>> >> > > +		src_pfns[i] = migrate_pfn(src_pfns[i]) | MIGRATE_PFN_MIGRATE;
>> >> > 
>> >> > This needs to be converted to use a folio like
>> >> > migrate_device_range(). But more importantly this should be split out as
>> >> > a function that both migrate_device_range() and this function can call
>> >> > given this bit is identical.
>> >> > 
>> >> 
>> >> Missed the folio conversion and agree a helper shared between this
>> >> function and migrate_device_range would be a good idea. Let add that.
>> >> 
>> >
>> > Alistair,
>> >
>> > Ok, I think now I want to go slightly different direction here to give
>> > GPUSVM a bit more control over several eviction scenarios.
>> >
>> > What if I exported the helper discussed above, e.g.,
>> >
>> >  905 unsigned long migrate_device_pfn_lock(unsigned long pfn)
>> >  906 {
>> >  907         struct folio *folio;
>> >  908
>> >  909         folio = folio_get_nontail_page(pfn_to_page(pfn));
>> >  910         if (!folio)
>> >  911                 return 0;
>> >  912
>> >  913         if (!folio_trylock(folio)) {
>> >  914                 folio_put(folio);
>> >  915                 return 0;
>> >  916         }
>> >  917
>> >  918         return migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
>> >  919 }
>> >  920 EXPORT_SYMBOL(migrate_device_pfn_lock);
>> >
>> > And then also export migrate_device_unmap.
>> >
>> > The usage here would be let a driver collect the device pages in virtual
>> > address range via hmm_range_fault, lock device pages under notifier
>> > lock ensuring device pages are valid, drop the notifier lock and call
>> > migrate_device_unmap.
>> 
>> I'm still working through this series but that seems a bit dubious, the
>> locking here is pretty subtle and easy to get wrong so seeing some code
>> would help me a lot in understanding what you're suggesting.
>>
>
> For sure locking in tricky, my mistake on not working through this
> before sending out the next rev but it came to mind after sending +
> regarding some late feedback from Thomas about using hmm for eviction
> [2]. His suggestion of using hmm_range_fault to trigger migration
> doesn't work for coherent pages, while something like below does.
>
> [2] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1125461
>
> Here is a snippet I have locally which seems to work.
>
> 2024 retry:
> 2025         hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> 2026         hmm_range.hmm_pfns = src;
> 2027
> 2028         while (true) {
> 2029                 mmap_read_lock(mm);
> 2030                 err = hmm_range_fault(&hmm_range);
> 2031                 mmap_read_unlock(mm);
> 2032                 if (err == -EBUSY) {
> 2033                         if (time_after(jiffies, timeout))
> 2034                                 break;
> 2035
> 2036                         hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> 2037                         continue;
> 2038                 }
> 2039                 break;
> 2040         }
> 2041         if (err)
> 2042                 goto err_put;
> 2043
> 2044         drm_gpusvm_notifier_lock(gpusvm);
> 2045         if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
> 2046                 drm_gpusvm_notifier_unlock(gpusvm);
> 2047                 memset(src, 0, sizeof(*src) * npages);
> 2048                 goto retry;
> 2049         }
> 2050         for (i = 0; i < npages; ++i) {
> 2051                 struct page *page = hmm_pfn_to_page(src[i]);
> 2052
> 2053                 if (page && (is_device_private_page(page) ||
> 2054                     is_device_coherent_page(page)) && page->zone_device_data)
> 2055                         src[i] = src[i] & ~HMM_PFN_FLAGS;
> 2056                 else
> 2057                         src[i] = 0;
> 2058                 if (src[i])
> 2059                         src[i] = migrate_device_pfn_lock(src[i]);
> 2060         }
> 2061         drm_gpusvm_notifier_unlock(gpusvm);

Practically for eviction isn't this much the same as calling
migrate_vma_setup()? And also for eviction as Sima mentioned you
probably shouldn't be looking at mm/vma structs.

> 2063         migrate_device_unmap(src, npages, NULL);
> ...
> 2101         migrate_device_pages(src, dst, npages);
> 2102         migrate_device_finalize(src, dst, npages);
>
>
>> > Sima has strongly suggested avoiding a CPUVMA
>> > lookup during eviction cases and this would let me fixup
>> > drm_gpusvm_range_evict in [1] to avoid this.
>> 
>> That sounds reasonable but for context do you have a link to the
>> comments/discussion on this? I couldn't readily find it, but I may have
>> just missed it.
>> 
>
> See in [4], search for '2. eviction' comment from sima.

Thanks for pointing that out. For reference here's Sima's comment:

> 2. eviction
> 
> Requirements much like migrate_to_ram, because otherwise we break the
> migration gurantee:
> 
> - Only looking at physical memory datastructures and locks, no looking at
>   mm/vma structs or relying on those being locked. We rely entirely on
>   reverse maps from try_to_migrate to find all the mappings on both cpu
>   and gpu side (cpu only zone device swap or migration pte entries ofc).

I also very much agree with this. That's basically why I added
migrate_device_range(), so that we can forcibly evict pages when the
driver needs them freed (eg. driver unload, low memory, etc.). In
general it is impossible to guarantee eviction og all pages using just
hmm_range_fault().

> [3] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1110726
> [4] https://lore.kernel.org/all/BYAPR11MB3159A304925168D8B6B4671292692@BYAPR11MB3159.namprd11.prod.outlook.com/T/#m89cd6a37778ba5271d5381ebeb03e1f963856a78
>
>> > It would also make the function exported in this patch unnecessary too
>> > as non-contiguous pfns can be setup on driver side via
>> > migrate_device_pfn_lock and then migrate_device_unmap can be called.
>> > This also another eviction usage in GPUSVM, see drm_gpusvm_evict_to_ram
>> > in [1].
>> >
>> > Do you see an issue exporting migrate_device_pfn_lock,
>> > migrate_device_unmap?
>> 
>> If there is a good justification for it I can't see a problem with
>> exporting it. That said I don't really understand why you would
>> want/need to split those steps up but I'll wait to see the code.
>>
>
> It is so the device pages returned from hmm_range_fault, which are only
> guaranteed to be valid under the notifier lock + a seqno check, to be
> locked and ref taken for migration. migrate_device_unmap() can trigger a
> MMU invalidation which takes the notifier lock thus calling the function
> which combines migrate_device_pfn_lock + migrate_device_unmap deadlocks.
>
> I think this flow makes sense and agree in general this likely better
> than looking at a CPUVMA.

I'm still a bit confused about what is better with this flow if you are
still calling hmm_range_fault(). How is it better than just calling
migrate_vma_setup()? Obviously it will fault the pages in, but it seems
we're talking about eviction here so I don't understand why that would
be relevant. And hmm_range_fault() still requires the VMA, although I
need to look at the patches more closely, probably CPUVMA is a DRM
specific concept?

Thanks.

 - Alistair

> Matt
>  
>>  - Alistair
>> 
>> > Matt
>> >
>> > [1] https://patchwork.freedesktop.org/patch/619809/?series=137870&rev=2
>> >
>> >> Matt
>> >> 
>> >> > > +	}
>> >> > > +
>> >> > > +	migrate_device_unmap(src_pfns, npages, NULL);
>> >> > > +
>> >> > > +	return 0;
>> >> > > +}
>> >> > > +EXPORT_SYMBOL(migrate_device_prepopulated_range);
>> >> > > +
>> >> > >  /*
>> >> > >   * Migrate a device coherent folio back to normal memory. The caller should have
>> >> > >   * a reference on folio which will be copied to the new folio if migration is
>> >> > 
>>
Matthew Brost Oct. 17, 2024, 4:07 a.m. UTC | #7
On Thu, Oct 17, 2024 at 02:21:13PM +1100, Alistair Popple wrote:
> 
> Matthew Brost <matthew.brost@intel.com> writes:
> 
> > On Thu, Oct 17, 2024 at 12:49:55PM +1100, Alistair Popple wrote:
> >> 
> >> Matthew Brost <matthew.brost@intel.com> writes:
> >> 
> >> > On Wed, Oct 16, 2024 at 04:46:52AM +0000, Matthew Brost wrote:
> >> >> On Wed, Oct 16, 2024 at 03:04:06PM +1100, Alistair Popple wrote:
> >> 
> >> [...]
> >> 
> >> >> > > +{
> >> >> > > +	unsigned long i;
> >> >> > > +
> >> >> > > +	for (i = 0; i < npages; i++) {
> >> >> > > +		struct page *page = pfn_to_page(src_pfns[i]);
> >> >> > > +
> >> >> > > +		if (!get_page_unless_zero(page)) {
> >> >> > > +			src_pfns[i] = 0;
> >> >> > > +			continue;
> >> >> > > +		}
> >> >> > > +
> >> >> > > +		if (!trylock_page(page)) {
> >> >> > > +			src_pfns[i] = 0;
> >> >> > > +			put_page(page);
> >> >> > > +			continue;
> >> >> > > +		}
> >> >> > > +
> >> >> > > +		src_pfns[i] = migrate_pfn(src_pfns[i]) | MIGRATE_PFN_MIGRATE;
> >> >> > 
> >> >> > This needs to be converted to use a folio like
> >> >> > migrate_device_range(). But more importantly this should be split out as
> >> >> > a function that both migrate_device_range() and this function can call
> >> >> > given this bit is identical.
> >> >> > 
> >> >> 
> >> >> Missed the folio conversion and agree a helper shared between this
> >> >> function and migrate_device_range would be a good idea. Let add that.
> >> >> 
> >> >
> >> > Alistair,
> >> >
> >> > Ok, I think now I want to go slightly different direction here to give
> >> > GPUSVM a bit more control over several eviction scenarios.
> >> >
> >> > What if I exported the helper discussed above, e.g.,
> >> >
> >> >  905 unsigned long migrate_device_pfn_lock(unsigned long pfn)
> >> >  906 {
> >> >  907         struct folio *folio;
> >> >  908
> >> >  909         folio = folio_get_nontail_page(pfn_to_page(pfn));
> >> >  910         if (!folio)
> >> >  911                 return 0;
> >> >  912
> >> >  913         if (!folio_trylock(folio)) {
> >> >  914                 folio_put(folio);
> >> >  915                 return 0;
> >> >  916         }
> >> >  917
> >> >  918         return migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
> >> >  919 }
> >> >  920 EXPORT_SYMBOL(migrate_device_pfn_lock);
> >> >
> >> > And then also export migrate_device_unmap.
> >> >
> >> > The usage here would be let a driver collect the device pages in virtual
> >> > address range via hmm_range_fault, lock device pages under notifier
> >> > lock ensuring device pages are valid, drop the notifier lock and call
> >> > migrate_device_unmap.
> >> 
> >> I'm still working through this series but that seems a bit dubious, the
> >> locking here is pretty subtle and easy to get wrong so seeing some code
> >> would help me a lot in understanding what you're suggesting.
> >>
> >
> > For sure locking in tricky, my mistake on not working through this
> > before sending out the next rev but it came to mind after sending +
> > regarding some late feedback from Thomas about using hmm for eviction
> > [2]. His suggestion of using hmm_range_fault to trigger migration
> > doesn't work for coherent pages, while something like below does.
> >
> > [2] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1125461
> >
> > Here is a snippet I have locally which seems to work.
> >
> > 2024 retry:
> > 2025         hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> > 2026         hmm_range.hmm_pfns = src;
> > 2027
> > 2028         while (true) {
> > 2029                 mmap_read_lock(mm);
> > 2030                 err = hmm_range_fault(&hmm_range);
> > 2031                 mmap_read_unlock(mm);
> > 2032                 if (err == -EBUSY) {
> > 2033                         if (time_after(jiffies, timeout))
> > 2034                                 break;
> > 2035
> > 2036                         hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> > 2037                         continue;
> > 2038                 }
> > 2039                 break;
> > 2040         }
> > 2041         if (err)
> > 2042                 goto err_put;
> > 2043
> > 2044         drm_gpusvm_notifier_lock(gpusvm);
> > 2045         if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
> > 2046                 drm_gpusvm_notifier_unlock(gpusvm);
> > 2047                 memset(src, 0, sizeof(*src) * npages);
> > 2048                 goto retry;
> > 2049         }
> > 2050         for (i = 0; i < npages; ++i) {
> > 2051                 struct page *page = hmm_pfn_to_page(src[i]);
> > 2052
> > 2053                 if (page && (is_device_private_page(page) ||
> > 2054                     is_device_coherent_page(page)) && page->zone_device_data)
> > 2055                         src[i] = src[i] & ~HMM_PFN_FLAGS;
> > 2056                 else
> > 2057                         src[i] = 0;
> > 2058                 if (src[i])
> > 2059                         src[i] = migrate_device_pfn_lock(src[i]);
> > 2060         }
> > 2061         drm_gpusvm_notifier_unlock(gpusvm);
> 
> Practically for eviction isn't this much the same as calling
> migrate_vma_setup()? And also for eviction as Sima mentioned you
> probably shouldn't be looking at mm/vma structs.
> 

hmm_range_fault is just collecting the pages, internally I suppose it
does look at a VMA (struct vm_area_struct) but I think the point is
drivers should not be looking at VMA here.

> > 2063         migrate_device_unmap(src, npages, NULL);
> > ...
> > 2101         migrate_device_pages(src, dst, npages);
> > 2102         migrate_device_finalize(src, dst, npages);
> >
> >
> >> > Sima has strongly suggested avoiding a CPUVMA
> >> > lookup during eviction cases and this would let me fixup
> >> > drm_gpusvm_range_evict in [1] to avoid this.
> >> 
> >> That sounds reasonable but for context do you have a link to the
> >> comments/discussion on this? I couldn't readily find it, but I may have
> >> just missed it.
> >> 
> >
> > See in [4], search for '2. eviction' comment from sima.
> 
> Thanks for pointing that out. For reference here's Sima's comment:
> 
> > 2. eviction
> > 
> > Requirements much like migrate_to_ram, because otherwise we break the
> > migration gurantee:
> > 
> > - Only looking at physical memory datastructures and locks, no looking at
> >   mm/vma structs or relying on those being locked. We rely entirely on
> >   reverse maps from try_to_migrate to find all the mappings on both cpu
> >   and gpu side (cpu only zone device swap or migration pte entries ofc).
> 
> I also very much agree with this. That's basically why I added
> migrate_device_range(), so that we can forcibly evict pages when the
> driver needs them freed (eg. driver unload, low memory, etc.). In
> general it is impossible to guarantee eviction og all pages using just
> hmm_range_fault().
> 

In this code path we don't have device pages available, hence the
purposed collection via hmm_range_fault.

> > [3] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1110726
> > [4] https://lore.kernel.org/all/BYAPR11MB3159A304925168D8B6B4671292692@BYAPR11MB3159.namprd11.prod.outlook.com/T/#m89cd6a37778ba5271d5381ebeb03e1f963856a78
> >
> >> > It would also make the function exported in this patch unnecessary too
> >> > as non-contiguous pfns can be setup on driver side via
> >> > migrate_device_pfn_lock and then migrate_device_unmap can be called.
> >> > This also another eviction usage in GPUSVM, see drm_gpusvm_evict_to_ram
> >> > in [1].
> >> >
> >> > Do you see an issue exporting migrate_device_pfn_lock,
> >> > migrate_device_unmap?
> >> 
> >> If there is a good justification for it I can't see a problem with
> >> exporting it. That said I don't really understand why you would
> >> want/need to split those steps up but I'll wait to see the code.
> >>
> >
> > It is so the device pages returned from hmm_range_fault, which are only
> > guaranteed to be valid under the notifier lock + a seqno check, to be
> > locked and ref taken for migration. migrate_device_unmap() can trigger a
> > MMU invalidation which takes the notifier lock thus calling the function
> > which combines migrate_device_pfn_lock + migrate_device_unmap deadlocks.
> >
> > I think this flow makes sense and agree in general this likely better
> > than looking at a CPUVMA.
> 
> I'm still a bit confused about what is better with this flow if you are
> still calling hmm_range_fault(). How is it better than just calling
> migrate_vma_setup()? Obviously it will fault the pages in, but it seems

The code in rev2 calls migrate_vma_setup but the requires a struct
vm_area_struct argument whereas hmm_range_fault does not.

> we're talking about eviction here so I don't understand why that would
> be relevant. And hmm_range_fault() still requires the VMA, although I
> need to look at the patches more closely, probably CPUVMA is a DRM

'hmm_range_fault() still requires the VMA' internal yes, but again not
as argument. This is about avoiding a driver side lookup of the VMA.

CPUVMA == struct vm_area_struct in this email.

Matt

> specific concept?
> 
> Thanks.
> 
>  - Alistair
> 
> > Matt
> >  
> >>  - Alistair
> >> 
> >> > Matt
> >> >
> >> > [1] https://patchwork.freedesktop.org/patch/619809/?series=137870&rev=2
> >> >
> >> >> Matt
> >> >> 
> >> >> > > +	}
> >> >> > > +
> >> >> > > +	migrate_device_unmap(src_pfns, npages, NULL);
> >> >> > > +
> >> >> > > +	return 0;
> >> >> > > +}
> >> >> > > +EXPORT_SYMBOL(migrate_device_prepopulated_range);
> >> >> > > +
> >> >> > >  /*
> >> >> > >   * Migrate a device coherent folio back to normal memory. The caller should have
> >> >> > >   * a reference on folio which will be copied to the new folio if migration is
> >> >> > 
> >> 
>
Alistair Popple Oct. 17, 2024, 5:49 a.m. UTC | #8
Matthew Brost <matthew.brost@intel.com> writes:

> On Thu, Oct 17, 2024 at 02:21:13PM +1100, Alistair Popple wrote:
>> 
>> Matthew Brost <matthew.brost@intel.com> writes:
>> 
>> > On Thu, Oct 17, 2024 at 12:49:55PM +1100, Alistair Popple wrote:
>> >> 
>> >> Matthew Brost <matthew.brost@intel.com> writes:
>> >> 
>> >> > On Wed, Oct 16, 2024 at 04:46:52AM +0000, Matthew Brost wrote:
>> >> >> On Wed, Oct 16, 2024 at 03:04:06PM +1100, Alistair Popple wrote:
>> >> 
>> >> [...]
>> >> 
>> >> >> > > +{
>> >> >> > > +	unsigned long i;
>> >> >> > > +
>> >> >> > > +	for (i = 0; i < npages; i++) {
>> >> >> > > +		struct page *page = pfn_to_page(src_pfns[i]);
>> >> >> > > +
>> >> >> > > +		if (!get_page_unless_zero(page)) {
>> >> >> > > +			src_pfns[i] = 0;
>> >> >> > > +			continue;
>> >> >> > > +		}
>> >> >> > > +
>> >> >> > > +		if (!trylock_page(page)) {
>> >> >> > > +			src_pfns[i] = 0;
>> >> >> > > +			put_page(page);
>> >> >> > > +			continue;
>> >> >> > > +		}
>> >> >> > > +
>> >> >> > > +		src_pfns[i] = migrate_pfn(src_pfns[i]) | MIGRATE_PFN_MIGRATE;
>> >> >> > 
>> >> >> > This needs to be converted to use a folio like
>> >> >> > migrate_device_range(). But more importantly this should be split out as
>> >> >> > a function that both migrate_device_range() and this function can call
>> >> >> > given this bit is identical.
>> >> >> > 
>> >> >> 
>> >> >> Missed the folio conversion and agree a helper shared between this
>> >> >> function and migrate_device_range would be a good idea. Let add that.
>> >> >> 
>> >> >
>> >> > Alistair,
>> >> >
>> >> > Ok, I think now I want to go slightly different direction here to give
>> >> > GPUSVM a bit more control over several eviction scenarios.
>> >> >
>> >> > What if I exported the helper discussed above, e.g.,
>> >> >
>> >> >  905 unsigned long migrate_device_pfn_lock(unsigned long pfn)
>> >> >  906 {
>> >> >  907         struct folio *folio;
>> >> >  908
>> >> >  909         folio = folio_get_nontail_page(pfn_to_page(pfn));
>> >> >  910         if (!folio)
>> >> >  911                 return 0;
>> >> >  912
>> >> >  913         if (!folio_trylock(folio)) {
>> >> >  914                 folio_put(folio);
>> >> >  915                 return 0;
>> >> >  916         }
>> >> >  917
>> >> >  918         return migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
>> >> >  919 }
>> >> >  920 EXPORT_SYMBOL(migrate_device_pfn_lock);
>> >> >
>> >> > And then also export migrate_device_unmap.
>> >> >
>> >> > The usage here would be let a driver collect the device pages in virtual
>> >> > address range via hmm_range_fault, lock device pages under notifier
>> >> > lock ensuring device pages are valid, drop the notifier lock and call
>> >> > migrate_device_unmap.
>> >> 
>> >> I'm still working through this series but that seems a bit dubious, the
>> >> locking here is pretty subtle and easy to get wrong so seeing some code
>> >> would help me a lot in understanding what you're suggesting.
>> >>
>> >
>> > For sure locking in tricky, my mistake on not working through this
>> > before sending out the next rev but it came to mind after sending +
>> > regarding some late feedback from Thomas about using hmm for eviction
>> > [2]. His suggestion of using hmm_range_fault to trigger migration
>> > doesn't work for coherent pages, while something like below does.
>> >
>> > [2] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1125461
>> >
>> > Here is a snippet I have locally which seems to work.
>> >
>> > 2024 retry:
>> > 2025         hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
>> > 2026         hmm_range.hmm_pfns = src;
>> > 2027
>> > 2028         while (true) {
>> > 2029                 mmap_read_lock(mm);
>> > 2030                 err = hmm_range_fault(&hmm_range);
>> > 2031                 mmap_read_unlock(mm);
>> > 2032                 if (err == -EBUSY) {
>> > 2033                         if (time_after(jiffies, timeout))
>> > 2034                                 break;
>> > 2035
>> > 2036                         hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
>> > 2037                         continue;
>> > 2038                 }
>> > 2039                 break;
>> > 2040         }
>> > 2041         if (err)
>> > 2042                 goto err_put;
>> > 2043
>> > 2044         drm_gpusvm_notifier_lock(gpusvm);
>> > 2045         if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
>> > 2046                 drm_gpusvm_notifier_unlock(gpusvm);
>> > 2047                 memset(src, 0, sizeof(*src) * npages);
>> > 2048                 goto retry;
>> > 2049         }
>> > 2050         for (i = 0; i < npages; ++i) {
>> > 2051                 struct page *page = hmm_pfn_to_page(src[i]);
>> > 2052
>> > 2053                 if (page && (is_device_private_page(page) ||
>> > 2054                     is_device_coherent_page(page)) && page->zone_device_data)
>> > 2055                         src[i] = src[i] & ~HMM_PFN_FLAGS;
>> > 2056                 else
>> > 2057                         src[i] = 0;
>> > 2058                 if (src[i])
>> > 2059                         src[i] = migrate_device_pfn_lock(src[i]);
>> > 2060         }
>> > 2061         drm_gpusvm_notifier_unlock(gpusvm);
>> 
>> Practically for eviction isn't this much the same as calling
>> migrate_vma_setup()? And also for eviction as Sima mentioned you
>> probably shouldn't be looking at mm/vma structs.
>> 
>
> hmm_range_fault is just collecting the pages, internally I suppose it
> does look at a VMA (struct vm_area_struct) but I think the point is
> drivers should not be looking at VMA here.

migrate_vma_setup() is designed to be called by drivers and needs a vma,
so in general I don't see a problem with drivers looking up vma's. The
problem arises specifically for eviction and whether or not that happens
in the driver or hmm_range_fault() is pretty irrelevant IMHO for the
issues there (see below).

>> > 2063         migrate_device_unmap(src, npages, NULL);
>> > ...
>> > 2101         migrate_device_pages(src, dst, npages);
>> > 2102         migrate_device_finalize(src, dst, npages);
>> >
>> >
>> >> > Sima has strongly suggested avoiding a CPUVMA
>> >> > lookup during eviction cases and this would let me fixup
>> >> > drm_gpusvm_range_evict in [1] to avoid this.
>> >> 
>> >> That sounds reasonable but for context do you have a link to the
>> >> comments/discussion on this? I couldn't readily find it, but I may have
>> >> just missed it.
>> >> 
>> >
>> > See in [4], search for '2. eviction' comment from sima.
>> 
>> Thanks for pointing that out. For reference here's Sima's comment:
>> 
>> > 2. eviction
>> > 
>> > Requirements much like migrate_to_ram, because otherwise we break the
>> > migration gurantee:
>> > 
>> > - Only looking at physical memory datastructures and locks, no looking at
>> >   mm/vma structs or relying on those being locked. We rely entirely on
>> >   reverse maps from try_to_migrate to find all the mappings on both cpu
>> >   and gpu side (cpu only zone device swap or migration pte entries ofc).
>>
>> I also very much agree with this. That's basically why I added
>> migrate_device_range(), so that we can forcibly evict pages when the
>> driver needs them freed (eg. driver unload, low memory, etc.). In
>> general it is impossible to guarantee eviction og all pages using just
>> hmm_range_fault().
>> 
>
> In this code path we don't have device pages available, hence the
> purposed collection via hmm_range_fault.

Why don't you have the pfns requiring eviction available? I need to read
this series in more depth, but generally hmm_range_fault() can't
gurantee you will find every device page.

>> > [3] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1110726
>> > [4] https://lore.kernel.org/all/BYAPR11MB3159A304925168D8B6B4671292692@BYAPR11MB3159.namprd11.prod.outlook.com/T/#m89cd6a37778ba5271d5381ebeb03e1f963856a78
>> >
>> >> > It would also make the function exported in this patch unnecessary too
>> >> > as non-contiguous pfns can be setup on driver side via
>> >> > migrate_device_pfn_lock and then migrate_device_unmap can be called.
>> >> > This also another eviction usage in GPUSVM, see drm_gpusvm_evict_to_ram
>> >> > in [1].
>> >> >
>> >> > Do you see an issue exporting migrate_device_pfn_lock,
>> >> > migrate_device_unmap?
>> >> 
>> >> If there is a good justification for it I can't see a problem with
>> >> exporting it. That said I don't really understand why you would
>> >> want/need to split those steps up but I'll wait to see the code.
>> >>
>> >
>> > It is so the device pages returned from hmm_range_fault, which are only
>> > guaranteed to be valid under the notifier lock + a seqno check, to be
>> > locked and ref taken for migration. migrate_device_unmap() can trigger a
>> > MMU invalidation which takes the notifier lock thus calling the function
>> > which combines migrate_device_pfn_lock + migrate_device_unmap deadlocks.
>> >
>> > I think this flow makes sense and agree in general this likely better
>> > than looking at a CPUVMA.
>> 
>> I'm still a bit confused about what is better with this flow if you are
>> still calling hmm_range_fault(). How is it better than just calling
>> migrate_vma_setup()? Obviously it will fault the pages in, but it seems
>
> The code in rev2 calls migrate_vma_setup but the requires a struct
> vm_area_struct argument whereas hmm_range_fault does not.

I'm not sure that's a good enough justfication because the problem isn't
whether you're looking up vma's in driver code or mm code. The problem
is you are looking up vma's at all and all that goes with that (mainly
taking mmap lock, etc.)

And for eviction hmm_range_fault() won't even find all the pages because
their virtual address may have changed - consider what happens in cases
of mremap(), fork(), etc. So eviction really needs physical pages
(pfn's), not virtual addresses.

>> we're talking about eviction here so I don't understand why that would
>> be relevant. And hmm_range_fault() still requires the VMA, although I
>> need to look at the patches more closely, probably CPUVMA is a DRM
>
> 'hmm_range_fault() still requires the VMA' internal yes, but again not
> as argument. This is about avoiding a driver side lookup of the VMA.
>
> CPUVMA == struct vm_area_struct in this email.

Thanks for the clarification.

 - Alistair

> Matt
>
>> specific concept?
>> 
>> Thanks.
>> 
>>  - Alistair
>> 
>> > Matt
>> >  
>> >>  - Alistair
>> >> 
>> >> > Matt
>> >> >
>> >> > [1] https://patchwork.freedesktop.org/patch/619809/?series=137870&rev=2
>> >> >
>> >> >> Matt
>> >> >> 
>> >> >> > > +	}
>> >> >> > > +
>> >> >> > > +	migrate_device_unmap(src_pfns, npages, NULL);
>> >> >> > > +
>> >> >> > > +	return 0;
>> >> >> > > +}
>> >> >> > > +EXPORT_SYMBOL(migrate_device_prepopulated_range);
>> >> >> > > +
>> >> >> > >  /*
>> >> >> > >   * Migrate a device coherent folio back to normal memory. The caller should have
>> >> >> > >   * a reference on folio which will be copied to the new folio if migration is
>> >> >> > 
>> >> 
>>
Matthew Brost Oct. 17, 2024, 3:40 p.m. UTC | #9
On Thu, Oct 17, 2024 at 04:49:11PM +1100, Alistair Popple wrote:
> 
> Matthew Brost <matthew.brost@intel.com> writes:
> 
> > On Thu, Oct 17, 2024 at 02:21:13PM +1100, Alistair Popple wrote:
> >> 
> >> Matthew Brost <matthew.brost@intel.com> writes:
> >> 
> >> > On Thu, Oct 17, 2024 at 12:49:55PM +1100, Alistair Popple wrote:
> >> >> 
> >> >> Matthew Brost <matthew.brost@intel.com> writes:
> >> >> 
> >> >> > On Wed, Oct 16, 2024 at 04:46:52AM +0000, Matthew Brost wrote:
> >> >> >> On Wed, Oct 16, 2024 at 03:04:06PM +1100, Alistair Popple wrote:
> >> >> 
> >> >> [...]
> >> >> 
> >> >> >> > > +{
> >> >> >> > > +	unsigned long i;
> >> >> >> > > +
> >> >> >> > > +	for (i = 0; i < npages; i++) {
> >> >> >> > > +		struct page *page = pfn_to_page(src_pfns[i]);
> >> >> >> > > +
> >> >> >> > > +		if (!get_page_unless_zero(page)) {
> >> >> >> > > +			src_pfns[i] = 0;
> >> >> >> > > +			continue;
> >> >> >> > > +		}
> >> >> >> > > +
> >> >> >> > > +		if (!trylock_page(page)) {
> >> >> >> > > +			src_pfns[i] = 0;
> >> >> >> > > +			put_page(page);
> >> >> >> > > +			continue;
> >> >> >> > > +		}
> >> >> >> > > +
> >> >> >> > > +		src_pfns[i] = migrate_pfn(src_pfns[i]) | MIGRATE_PFN_MIGRATE;
> >> >> >> > 
> >> >> >> > This needs to be converted to use a folio like
> >> >> >> > migrate_device_range(). But more importantly this should be split out as
> >> >> >> > a function that both migrate_device_range() and this function can call
> >> >> >> > given this bit is identical.
> >> >> >> > 
> >> >> >> 
> >> >> >> Missed the folio conversion and agree a helper shared between this
> >> >> >> function and migrate_device_range would be a good idea. Let add that.
> >> >> >> 
> >> >> >
> >> >> > Alistair,
> >> >> >
> >> >> > Ok, I think now I want to go slightly different direction here to give
> >> >> > GPUSVM a bit more control over several eviction scenarios.
> >> >> >
> >> >> > What if I exported the helper discussed above, e.g.,
> >> >> >
> >> >> >  905 unsigned long migrate_device_pfn_lock(unsigned long pfn)
> >> >> >  906 {
> >> >> >  907         struct folio *folio;
> >> >> >  908
> >> >> >  909         folio = folio_get_nontail_page(pfn_to_page(pfn));
> >> >> >  910         if (!folio)
> >> >> >  911                 return 0;
> >> >> >  912
> >> >> >  913         if (!folio_trylock(folio)) {
> >> >> >  914                 folio_put(folio);
> >> >> >  915                 return 0;
> >> >> >  916         }
> >> >> >  917
> >> >> >  918         return migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
> >> >> >  919 }
> >> >> >  920 EXPORT_SYMBOL(migrate_device_pfn_lock);
> >> >> >
> >> >> > And then also export migrate_device_unmap.
> >> >> >
> >> >> > The usage here would be let a driver collect the device pages in virtual
> >> >> > address range via hmm_range_fault, lock device pages under notifier
> >> >> > lock ensuring device pages are valid, drop the notifier lock and call
> >> >> > migrate_device_unmap.
> >> >> 
> >> >> I'm still working through this series but that seems a bit dubious, the
> >> >> locking here is pretty subtle and easy to get wrong so seeing some code
> >> >> would help me a lot in understanding what you're suggesting.
> >> >>
> >> >
> >> > For sure locking in tricky, my mistake on not working through this
> >> > before sending out the next rev but it came to mind after sending +
> >> > regarding some late feedback from Thomas about using hmm for eviction
> >> > [2]. His suggestion of using hmm_range_fault to trigger migration
> >> > doesn't work for coherent pages, while something like below does.
> >> >
> >> > [2] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1125461
> >> >
> >> > Here is a snippet I have locally which seems to work.
> >> >
> >> > 2024 retry:
> >> > 2025         hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> >> > 2026         hmm_range.hmm_pfns = src;
> >> > 2027
> >> > 2028         while (true) {
> >> > 2029                 mmap_read_lock(mm);
> >> > 2030                 err = hmm_range_fault(&hmm_range);
> >> > 2031                 mmap_read_unlock(mm);
> >> > 2032                 if (err == -EBUSY) {
> >> > 2033                         if (time_after(jiffies, timeout))
> >> > 2034                                 break;
> >> > 2035
> >> > 2036                         hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> >> > 2037                         continue;
> >> > 2038                 }
> >> > 2039                 break;
> >> > 2040         }
> >> > 2041         if (err)
> >> > 2042                 goto err_put;
> >> > 2043
> >> > 2044         drm_gpusvm_notifier_lock(gpusvm);
> >> > 2045         if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
> >> > 2046                 drm_gpusvm_notifier_unlock(gpusvm);
> >> > 2047                 memset(src, 0, sizeof(*src) * npages);
> >> > 2048                 goto retry;
> >> > 2049         }
> >> > 2050         for (i = 0; i < npages; ++i) {
> >> > 2051                 struct page *page = hmm_pfn_to_page(src[i]);
> >> > 2052
> >> > 2053                 if (page && (is_device_private_page(page) ||
> >> > 2054                     is_device_coherent_page(page)) && page->zone_device_data)
> >> > 2055                         src[i] = src[i] & ~HMM_PFN_FLAGS;
> >> > 2056                 else
> >> > 2057                         src[i] = 0;
> >> > 2058                 if (src[i])
> >> > 2059                         src[i] = migrate_device_pfn_lock(src[i]);
> >> > 2060         }
> >> > 2061         drm_gpusvm_notifier_unlock(gpusvm);
> >> 
> >> Practically for eviction isn't this much the same as calling
> >> migrate_vma_setup()? And also for eviction as Sima mentioned you
> >> probably shouldn't be looking at mm/vma structs.
> >> 
> >
> > hmm_range_fault is just collecting the pages, internally I suppose it
> > does look at a VMA (struct vm_area_struct) but I think the point is
> > drivers should not be looking at VMA here.
> 
> migrate_vma_setup() is designed to be called by drivers and needs a vma,
> so in general I don't see a problem with drivers looking up vma's. The
> problem arises specifically for eviction and whether or not that happens
> in the driver or hmm_range_fault() is pretty irrelevant IMHO for the
> issues there (see below).
> 

Ok, if you think it ok for drivers to lookup the VMA then purposed
exporting of migrate_device_pfn_lock & migrate_device_unmap is not
needed, rather just the original function exported in the this patch.

More below too.

> >> > 2063         migrate_device_unmap(src, npages, NULL);
> >> > ...
> >> > 2101         migrate_device_pages(src, dst, npages);
> >> > 2102         migrate_device_finalize(src, dst, npages);
> >> >
> >> >
> >> >> > Sima has strongly suggested avoiding a CPUVMA
> >> >> > lookup during eviction cases and this would let me fixup
> >> >> > drm_gpusvm_range_evict in [1] to avoid this.
> >> >> 
> >> >> That sounds reasonable but for context do you have a link to the
> >> >> comments/discussion on this? I couldn't readily find it, but I may have
> >> >> just missed it.
> >> >> 
> >> >
> >> > See in [4], search for '2. eviction' comment from sima.
> >> 
> >> Thanks for pointing that out. For reference here's Sima's comment:
> >> 
> >> > 2. eviction
> >> > 
> >> > Requirements much like migrate_to_ram, because otherwise we break the
> >> > migration gurantee:
> >> > 
> >> > - Only looking at physical memory datastructures and locks, no looking at
> >> >   mm/vma structs or relying on those being locked. We rely entirely on
> >> >   reverse maps from try_to_migrate to find all the mappings on both cpu
> >> >   and gpu side (cpu only zone device swap or migration pte entries ofc).
> >>
> >> I also very much agree with this. That's basically why I added
> >> migrate_device_range(), so that we can forcibly evict pages when the
> >> driver needs them freed (eg. driver unload, low memory, etc.). In
> >> general it is impossible to guarantee eviction og all pages using just
> >> hmm_range_fault().
> >> 
> >
> > In this code path we don't have device pages available, hence the
> > purposed collection via hmm_range_fault.
> 
> Why don't you have the pfns requiring eviction available? I need to read
> this series in more depth, but generally hmm_range_fault() can't
> gurantee you will find every device page.
> 

There are two cases for eviction in my series:

1. TTM decides it needs to move memory. This calls
drm_gpusvm_evict_to_ram. In this case the device pfns are available
directly from drm_gpusvm_devmem so the migrate_device_* calls be used
here albiet with the new function added in this patch as device pfns may
be non-contiguous.

2. An inconsistent state for VA range occurs (mixed system and device pages,
partial unmap of a range, etc...). Here we want to evict the range ram
to make the state consistent. No device pages are available due to an
intentional disconnect between a virtual range and physical
drm_gpusvm_devmem, thus the device pages have to be looked up. This the
function drm_gpusvm_range_evict. Based on what you tell me, it likely is
fine the way originally coded in v2 (vma lookup + migrate_vma_*) vs
using hmm_range_fault like I have suggested here.

Note #2 may be removed or unnecessary at some point if we decide to add
support for ininconsistent state in GPU SVM and Xe. Keeping it simple for
now though. See 'Ranges with mixed system and device pages' in [5].

[5] https://patchwork.freedesktop.org/patch/619819/?series=137870&rev=2

> >> > [3] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1110726
> >> > [4] https://lore.kernel.org/all/BYAPR11MB3159A304925168D8B6B4671292692@BYAPR11MB3159.namprd11.prod.outlook.com/T/#m89cd6a37778ba5271d5381ebeb03e1f963856a78
> >> >
> >> >> > It would also make the function exported in this patch unnecessary too
> >> >> > as non-contiguous pfns can be setup on driver side via
> >> >> > migrate_device_pfn_lock and then migrate_device_unmap can be called.
> >> >> > This also another eviction usage in GPUSVM, see drm_gpusvm_evict_to_ram
> >> >> > in [1].
> >> >> >
> >> >> > Do you see an issue exporting migrate_device_pfn_lock,
> >> >> > migrate_device_unmap?
> >> >> 
> >> >> If there is a good justification for it I can't see a problem with
> >> >> exporting it. That said I don't really understand why you would
> >> >> want/need to split those steps up but I'll wait to see the code.
> >> >>
> >> >
> >> > It is so the device pages returned from hmm_range_fault, which are only
> >> > guaranteed to be valid under the notifier lock + a seqno check, to be
> >> > locked and ref taken for migration. migrate_device_unmap() can trigger a
> >> > MMU invalidation which takes the notifier lock thus calling the function
> >> > which combines migrate_device_pfn_lock + migrate_device_unmap deadlocks.
> >> >
> >> > I think this flow makes sense and agree in general this likely better
> >> > than looking at a CPUVMA.
> >> 
> >> I'm still a bit confused about what is better with this flow if you are
> >> still calling hmm_range_fault(). How is it better than just calling
> >> migrate_vma_setup()? Obviously it will fault the pages in, but it seems
> >
> > The code in rev2 calls migrate_vma_setup but the requires a struct
> > vm_area_struct argument whereas hmm_range_fault does not.
> 
> I'm not sure that's a good enough justfication because the problem isn't
> whether you're looking up vma's in driver code or mm code. The problem
> is you are looking up vma's at all and all that goes with that (mainly
> taking mmap lock, etc.)
> 
> And for eviction hmm_range_fault() won't even find all the pages because
> their virtual address may have changed - consider what happens in cases
> of mremap(), fork(), etc. So eviction really needs physical pages
> (pfn's), not virtual addresses.
>

See above, #1 yes we use a physical pages. For #2 it is about making the
state consistent within a virtual address range.

Matt
 
> >> we're talking about eviction here so I don't understand why that would
> >> be relevant. And hmm_range_fault() still requires the VMA, although I
> >> need to look at the patches more closely, probably CPUVMA is a DRM
> >
> > 'hmm_range_fault() still requires the VMA' internal yes, but again not
> > as argument. This is about avoiding a driver side lookup of the VMA.
> >
> > CPUVMA == struct vm_area_struct in this email.
> 
> Thanks for the clarification.
> 
>  - Alistair
> 
> > Matt
> >
> >> specific concept?
> >> 
> >> Thanks.
> >> 
> >>  - Alistair
> >> 
> >> > Matt
> >> >  
> >> >>  - Alistair
> >> >> 
> >> >> > Matt
> >> >> >
> >> >> > [1] https://patchwork.freedesktop.org/patch/619809/?series=137870&rev=2
> >> >> >
> >> >> >> Matt
> >> >> >> 
> >> >> >> > > +	}
> >> >> >> > > +
> >> >> >> > > +	migrate_device_unmap(src_pfns, npages, NULL);
> >> >> >> > > +
> >> >> >> > > +	return 0;
> >> >> >> > > +}
> >> >> >> > > +EXPORT_SYMBOL(migrate_device_prepopulated_range);
> >> >> >> > > +
> >> >> >> > >  /*
> >> >> >> > >   * Migrate a device coherent folio back to normal memory. The caller should have
> >> >> >> > >   * a reference on folio which will be copied to the new folio if migration is
> >> >> >> > 
> >> >> 
> >> 
>
Alistair Popple Oct. 17, 2024, 9:58 p.m. UTC | #10
Matthew Brost <matthew.brost@intel.com> writes:

> On Thu, Oct 17, 2024 at 04:49:11PM +1100, Alistair Popple wrote:
>> 
>> Matthew Brost <matthew.brost@intel.com> writes:
>> 
>> > On Thu, Oct 17, 2024 at 02:21:13PM +1100, Alistair Popple wrote:
>> >> 
>> >> Matthew Brost <matthew.brost@intel.com> writes:
>> >> 
>> >> > On Thu, Oct 17, 2024 at 12:49:55PM +1100, Alistair Popple wrote:
>> >> >> 
>> >> >> Matthew Brost <matthew.brost@intel.com> writes:
>> >> >> 
>> >> >> > On Wed, Oct 16, 2024 at 04:46:52AM +0000, Matthew Brost wrote:
>> >> >> >> On Wed, Oct 16, 2024 at 03:04:06PM +1100, Alistair Popple wrote:
>> >> >> 
>> >> >> [...]
>> >> >> 
>> >> >> >> > > +{
>> >> >> >> > > +	unsigned long i;
>> >> >> >> > > +
>> >> >> >> > > +	for (i = 0; i < npages; i++) {
>> >> >> >> > > +		struct page *page = pfn_to_page(src_pfns[i]);
>> >> >> >> > > +
>> >> >> >> > > +		if (!get_page_unless_zero(page)) {
>> >> >> >> > > +			src_pfns[i] = 0;
>> >> >> >> > > +			continue;
>> >> >> >> > > +		}
>> >> >> >> > > +
>> >> >> >> > > +		if (!trylock_page(page)) {
>> >> >> >> > > +			src_pfns[i] = 0;
>> >> >> >> > > +			put_page(page);
>> >> >> >> > > +			continue;
>> >> >> >> > > +		}
>> >> >> >> > > +
>> >> >> >> > > +		src_pfns[i] = migrate_pfn(src_pfns[i]) | MIGRATE_PFN_MIGRATE;
>> >> >> >> > 
>> >> >> >> > This needs to be converted to use a folio like
>> >> >> >> > migrate_device_range(). But more importantly this should be split out as
>> >> >> >> > a function that both migrate_device_range() and this function can call
>> >> >> >> > given this bit is identical.
>> >> >> >> > 
>> >> >> >> 
>> >> >> >> Missed the folio conversion and agree a helper shared between this
>> >> >> >> function and migrate_device_range would be a good idea. Let add that.
>> >> >> >> 
>> >> >> >
>> >> >> > Alistair,
>> >> >> >
>> >> >> > Ok, I think now I want to go slightly different direction here to give
>> >> >> > GPUSVM a bit more control over several eviction scenarios.
>> >> >> >
>> >> >> > What if I exported the helper discussed above, e.g.,
>> >> >> >
>> >> >> >  905 unsigned long migrate_device_pfn_lock(unsigned long pfn)
>> >> >> >  906 {
>> >> >> >  907         struct folio *folio;
>> >> >> >  908
>> >> >> >  909         folio = folio_get_nontail_page(pfn_to_page(pfn));
>> >> >> >  910         if (!folio)
>> >> >> >  911                 return 0;
>> >> >> >  912
>> >> >> >  913         if (!folio_trylock(folio)) {
>> >> >> >  914                 folio_put(folio);
>> >> >> >  915                 return 0;
>> >> >> >  916         }
>> >> >> >  917
>> >> >> >  918         return migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
>> >> >> >  919 }
>> >> >> >  920 EXPORT_SYMBOL(migrate_device_pfn_lock);
>> >> >> >
>> >> >> > And then also export migrate_device_unmap.
>> >> >> >
>> >> >> > The usage here would be let a driver collect the device pages in virtual
>> >> >> > address range via hmm_range_fault, lock device pages under notifier
>> >> >> > lock ensuring device pages are valid, drop the notifier lock and call
>> >> >> > migrate_device_unmap.
>> >> >> 
>> >> >> I'm still working through this series but that seems a bit dubious, the
>> >> >> locking here is pretty subtle and easy to get wrong so seeing some code
>> >> >> would help me a lot in understanding what you're suggesting.
>> >> >>
>> >> >
>> >> > For sure locking in tricky, my mistake on not working through this
>> >> > before sending out the next rev but it came to mind after sending +
>> >> > regarding some late feedback from Thomas about using hmm for eviction
>> >> > [2]. His suggestion of using hmm_range_fault to trigger migration
>> >> > doesn't work for coherent pages, while something like below does.
>> >> >
>> >> > [2] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1125461
>> >> >
>> >> > Here is a snippet I have locally which seems to work.
>> >> >
>> >> > 2024 retry:
>> >> > 2025         hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
>> >> > 2026         hmm_range.hmm_pfns = src;
>> >> > 2027
>> >> > 2028         while (true) {
>> >> > 2029                 mmap_read_lock(mm);
>> >> > 2030                 err = hmm_range_fault(&hmm_range);
>> >> > 2031                 mmap_read_unlock(mm);
>> >> > 2032                 if (err == -EBUSY) {
>> >> > 2033                         if (time_after(jiffies, timeout))
>> >> > 2034                                 break;
>> >> > 2035
>> >> > 2036                         hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
>> >> > 2037                         continue;
>> >> > 2038                 }
>> >> > 2039                 break;
>> >> > 2040         }
>> >> > 2041         if (err)
>> >> > 2042                 goto err_put;
>> >> > 2043
>> >> > 2044         drm_gpusvm_notifier_lock(gpusvm);
>> >> > 2045         if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
>> >> > 2046                 drm_gpusvm_notifier_unlock(gpusvm);
>> >> > 2047                 memset(src, 0, sizeof(*src) * npages);
>> >> > 2048                 goto retry;
>> >> > 2049         }
>> >> > 2050         for (i = 0; i < npages; ++i) {
>> >> > 2051                 struct page *page = hmm_pfn_to_page(src[i]);
>> >> > 2052
>> >> > 2053                 if (page && (is_device_private_page(page) ||
>> >> > 2054                     is_device_coherent_page(page)) && page->zone_device_data)
>> >> > 2055                         src[i] = src[i] & ~HMM_PFN_FLAGS;
>> >> > 2056                 else
>> >> > 2057                         src[i] = 0;
>> >> > 2058                 if (src[i])
>> >> > 2059                         src[i] = migrate_device_pfn_lock(src[i]);
>> >> > 2060         }
>> >> > 2061         drm_gpusvm_notifier_unlock(gpusvm);
>> >> 
>> >> Practically for eviction isn't this much the same as calling
>> >> migrate_vma_setup()? And also for eviction as Sima mentioned you
>> >> probably shouldn't be looking at mm/vma structs.
>> >> 
>> >
>> > hmm_range_fault is just collecting the pages, internally I suppose it
>> > does look at a VMA (struct vm_area_struct) but I think the point is
>> > drivers should not be looking at VMA here.
>> 
>> migrate_vma_setup() is designed to be called by drivers and needs a vma,
>> so in general I don't see a problem with drivers looking up vma's. The
>> problem arises specifically for eviction and whether or not that happens
>> in the driver or hmm_range_fault() is pretty irrelevant IMHO for the
>> issues there (see below).
>> 
>
> Ok, if you think it ok for drivers to lookup the VMA then purposed
> exporting of migrate_device_pfn_lock & migrate_device_unmap is not
> needed, rather just the original function exported in the this patch.
>
> More below too.
>
>> >> > 2063         migrate_device_unmap(src, npages, NULL);
>> >> > ...
>> >> > 2101         migrate_device_pages(src, dst, npages);
>> >> > 2102         migrate_device_finalize(src, dst, npages);
>> >> >
>> >> >
>> >> >> > Sima has strongly suggested avoiding a CPUVMA
>> >> >> > lookup during eviction cases and this would let me fixup
>> >> >> > drm_gpusvm_range_evict in [1] to avoid this.
>> >> >> 
>> >> >> That sounds reasonable but for context do you have a link to the
>> >> >> comments/discussion on this? I couldn't readily find it, but I may have
>> >> >> just missed it.
>> >> >> 
>> >> >
>> >> > See in [4], search for '2. eviction' comment from sima.
>> >> 
>> >> Thanks for pointing that out. For reference here's Sima's comment:
>> >> 
>> >> > 2. eviction
>> >> > 
>> >> > Requirements much like migrate_to_ram, because otherwise we break the
>> >> > migration gurantee:
>> >> > 
>> >> > - Only looking at physical memory datastructures and locks, no looking at
>> >> >   mm/vma structs or relying on those being locked. We rely entirely on
>> >> >   reverse maps from try_to_migrate to find all the mappings on both cpu
>> >> >   and gpu side (cpu only zone device swap or migration pte entries ofc).
>> >>
>> >> I also very much agree with this. That's basically why I added
>> >> migrate_device_range(), so that we can forcibly evict pages when the
>> >> driver needs them freed (eg. driver unload, low memory, etc.). In
>> >> general it is impossible to guarantee eviction og all pages using just
>> >> hmm_range_fault().
>> >> 
>> >
>> > In this code path we don't have device pages available, hence the
>> > purposed collection via hmm_range_fault.
>> 
>> Why don't you have the pfns requiring eviction available? I need to read
>> this series in more depth, but generally hmm_range_fault() can't
>> gurantee you will find every device page.
>> 
>
> There are two cases for eviction in my series:
>
> 1. TTM decides it needs to move memory. This calls
> drm_gpusvm_evict_to_ram. In this case the device pfns are available
> directly from drm_gpusvm_devmem so the migrate_device_* calls be used
> here albiet with the new function added in this patch as device pfns may
> be non-contiguous.

That makes sense and is generally what I think of when I'm thinking of
eviction. The new function makes sense too - migrate_device_range() was
primarily introduced to allow a driver to evict all device-private pages
from a GPU so didn't consider non-contiguous cases, etc.

> 2. An inconsistent state for VA range occurs (mixed system and device pages,
> partial unmap of a range, etc...). Here we want to evict the range ram
> to make the state consistent. No device pages are available due to an
> intentional disconnect between a virtual range and physical
> drm_gpusvm_devmem, thus the device pages have to be looked up. This the
> function drm_gpusvm_range_evict. Based on what you tell me, it likely is
> fine the way originally coded in v2 (vma lookup + migrate_vma_*) vs
> using hmm_range_fault like I have suggested here.

Thanks for the explanation. I think vma lookup + migrate_vma_setup() is
fine for this usage and is exactly what you want - it was designed to
either select all the system memory pages or device-private pages within
a VA range and migrate them.

FWIW I have toyed with the idea of a combined
hmm_range_fault()/migrate_vma_setup() front-end to the rest of the
migrate_vma_*() process but haven't come up with something nice as
yet. I don't think mixing the two in an open-coded fashion is a good
idea though, I'd rather we come up with a new API that addresses the
short-comings of migrate_vma_setup().

> Note #2 may be removed or unnecessary at some point if we decide to add
> support for ininconsistent state in GPU SVM and Xe. Keeping it simple for
> now though. See 'Ranges with mixed system and device pages' in [5].
>
> [5] https://patchwork.freedesktop.org/patch/619819/?series=137870&rev=2
>
>> >> > [3] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1110726
>> >> > [4] https://lore.kernel.org/all/BYAPR11MB3159A304925168D8B6B4671292692@BYAPR11MB3159.namprd11.prod.outlook.com/T/#m89cd6a37778ba5271d5381ebeb03e1f963856a78
>> >> >
>> >> >> > It would also make the function exported in this patch unnecessary too
>> >> >> > as non-contiguous pfns can be setup on driver side via
>> >> >> > migrate_device_pfn_lock and then migrate_device_unmap can be called.
>> >> >> > This also another eviction usage in GPUSVM, see drm_gpusvm_evict_to_ram
>> >> >> > in [1].
>> >> >> >
>> >> >> > Do you see an issue exporting migrate_device_pfn_lock,
>> >> >> > migrate_device_unmap?
>> >> >> 
>> >> >> If there is a good justification for it I can't see a problem with
>> >> >> exporting it. That said I don't really understand why you would
>> >> >> want/need to split those steps up but I'll wait to see the code.
>> >> >>
>> >> >
>> >> > It is so the device pages returned from hmm_range_fault, which are only
>> >> > guaranteed to be valid under the notifier lock + a seqno check, to be
>> >> > locked and ref taken for migration. migrate_device_unmap() can trigger a
>> >> > MMU invalidation which takes the notifier lock thus calling the function
>> >> > which combines migrate_device_pfn_lock + migrate_device_unmap deadlocks.
>> >> >
>> >> > I think this flow makes sense and agree in general this likely better
>> >> > than looking at a CPUVMA.
>> >> 
>> >> I'm still a bit confused about what is better with this flow if you are
>> >> still calling hmm_range_fault(). How is it better than just calling
>> >> migrate_vma_setup()? Obviously it will fault the pages in, but it seems
>> >
>> > The code in rev2 calls migrate_vma_setup but the requires a struct
>> > vm_area_struct argument whereas hmm_range_fault does not.
>> 
>> I'm not sure that's a good enough justfication because the problem isn't
>> whether you're looking up vma's in driver code or mm code. The problem
>> is you are looking up vma's at all and all that goes with that (mainly
>> taking mmap lock, etc.)
>> 
>> And for eviction hmm_range_fault() won't even find all the pages because
>> their virtual address may have changed - consider what happens in cases
>> of mremap(), fork(), etc. So eviction really needs physical pages
>> (pfn's), not virtual addresses.
>>
>
> See above, #1 yes we use a physical pages. For #2 it is about making the
> state consistent within a virtual address range.

Yep, makes sense now. For migration of physical pages you want
migrate_device_*, virtual address ranges want migrate_vma_*

 - Alistair

> Matt
>  
>> >> we're talking about eviction here so I don't understand why that would
>> >> be relevant. And hmm_range_fault() still requires the VMA, although I
>> >> need to look at the patches more closely, probably CPUVMA is a DRM
>> >
>> > 'hmm_range_fault() still requires the VMA' internal yes, but again not
>> > as argument. This is about avoiding a driver side lookup of the VMA.
>> >
>> > CPUVMA == struct vm_area_struct in this email.
>> 
>> Thanks for the clarification.
>> 
>>  - Alistair
>> 
>> > Matt
>> >
>> >> specific concept?
>> >> 
>> >> Thanks.
>> >> 
>> >>  - Alistair
>> >> 
>> >> > Matt
>> >> >  
>> >> >>  - Alistair
>> >> >> 
>> >> >> > Matt
>> >> >> >
>> >> >> > [1] https://patchwork.freedesktop.org/patch/619809/?series=137870&rev=2
>> >> >> >
>> >> >> >> Matt
>> >> >> >> 
>> >> >> >> > > +	}
>> >> >> >> > > +
>> >> >> >> > > +	migrate_device_unmap(src_pfns, npages, NULL);
>> >> >> >> > > +
>> >> >> >> > > +	return 0;
>> >> >> >> > > +}
>> >> >> >> > > +EXPORT_SYMBOL(migrate_device_prepopulated_range);
>> >> >> >> > > +
>> >> >> >> > >  /*
>> >> >> >> > >   * Migrate a device coherent folio back to normal memory. The caller should have
>> >> >> >> > >   * a reference on folio which will be copied to the new folio if migration is
>> >> >> >> > 
>> >> >> 
>> >> 
>>
Matthew Brost Oct. 18, 2024, 12:54 a.m. UTC | #11
On Fri, Oct 18, 2024 at 08:58:02AM +1100, Alistair Popple wrote:
> 
> Matthew Brost <matthew.brost@intel.com> writes:
> 
> > On Thu, Oct 17, 2024 at 04:49:11PM +1100, Alistair Popple wrote:
> >> 
> >> Matthew Brost <matthew.brost@intel.com> writes:
> >> 
> >> > On Thu, Oct 17, 2024 at 02:21:13PM +1100, Alistair Popple wrote:
> >> >> 
> >> >> Matthew Brost <matthew.brost@intel.com> writes:
> >> >> 
> >> >> > On Thu, Oct 17, 2024 at 12:49:55PM +1100, Alistair Popple wrote:
> >> >> >> 
> >> >> >> Matthew Brost <matthew.brost@intel.com> writes:
> >> >> >> 
> >> >> >> > On Wed, Oct 16, 2024 at 04:46:52AM +0000, Matthew Brost wrote:
> >> >> >> >> On Wed, Oct 16, 2024 at 03:04:06PM +1100, Alistair Popple wrote:
> >> >> >> 
> >> >> >> [...]
> >> >> >> 
> >> >> >> >> > > +{
> >> >> >> >> > > +	unsigned long i;
> >> >> >> >> > > +
> >> >> >> >> > > +	for (i = 0; i < npages; i++) {
> >> >> >> >> > > +		struct page *page = pfn_to_page(src_pfns[i]);
> >> >> >> >> > > +
> >> >> >> >> > > +		if (!get_page_unless_zero(page)) {
> >> >> >> >> > > +			src_pfns[i] = 0;
> >> >> >> >> > > +			continue;
> >> >> >> >> > > +		}
> >> >> >> >> > > +
> >> >> >> >> > > +		if (!trylock_page(page)) {
> >> >> >> >> > > +			src_pfns[i] = 0;
> >> >> >> >> > > +			put_page(page);
> >> >> >> >> > > +			continue;
> >> >> >> >> > > +		}
> >> >> >> >> > > +
> >> >> >> >> > > +		src_pfns[i] = migrate_pfn(src_pfns[i]) | MIGRATE_PFN_MIGRATE;
> >> >> >> >> > 
> >> >> >> >> > This needs to be converted to use a folio like
> >> >> >> >> > migrate_device_range(). But more importantly this should be split out as
> >> >> >> >> > a function that both migrate_device_range() and this function can call
> >> >> >> >> > given this bit is identical.
> >> >> >> >> > 
> >> >> >> >> 
> >> >> >> >> Missed the folio conversion and agree a helper shared between this
> >> >> >> >> function and migrate_device_range would be a good idea. Let add that.
> >> >> >> >> 
> >> >> >> >
> >> >> >> > Alistair,
> >> >> >> >
> >> >> >> > Ok, I think now I want to go slightly different direction here to give
> >> >> >> > GPUSVM a bit more control over several eviction scenarios.
> >> >> >> >
> >> >> >> > What if I exported the helper discussed above, e.g.,
> >> >> >> >
> >> >> >> >  905 unsigned long migrate_device_pfn_lock(unsigned long pfn)
> >> >> >> >  906 {
> >> >> >> >  907         struct folio *folio;
> >> >> >> >  908
> >> >> >> >  909         folio = folio_get_nontail_page(pfn_to_page(pfn));
> >> >> >> >  910         if (!folio)
> >> >> >> >  911                 return 0;
> >> >> >> >  912
> >> >> >> >  913         if (!folio_trylock(folio)) {
> >> >> >> >  914                 folio_put(folio);
> >> >> >> >  915                 return 0;
> >> >> >> >  916         }
> >> >> >> >  917
> >> >> >> >  918         return migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
> >> >> >> >  919 }
> >> >> >> >  920 EXPORT_SYMBOL(migrate_device_pfn_lock);
> >> >> >> >
> >> >> >> > And then also export migrate_device_unmap.
> >> >> >> >
> >> >> >> > The usage here would be let a driver collect the device pages in virtual
> >> >> >> > address range via hmm_range_fault, lock device pages under notifier
> >> >> >> > lock ensuring device pages are valid, drop the notifier lock and call
> >> >> >> > migrate_device_unmap.
> >> >> >> 
> >> >> >> I'm still working through this series but that seems a bit dubious, the
> >> >> >> locking here is pretty subtle and easy to get wrong so seeing some code
> >> >> >> would help me a lot in understanding what you're suggesting.
> >> >> >>
> >> >> >
> >> >> > For sure locking in tricky, my mistake on not working through this
> >> >> > before sending out the next rev but it came to mind after sending +
> >> >> > regarding some late feedback from Thomas about using hmm for eviction
> >> >> > [2]. His suggestion of using hmm_range_fault to trigger migration
> >> >> > doesn't work for coherent pages, while something like below does.
> >> >> >
> >> >> > [2] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1125461
> >> >> >
> >> >> > Here is a snippet I have locally which seems to work.
> >> >> >
> >> >> > 2024 retry:
> >> >> > 2025         hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> >> >> > 2026         hmm_range.hmm_pfns = src;
> >> >> > 2027
> >> >> > 2028         while (true) {
> >> >> > 2029                 mmap_read_lock(mm);
> >> >> > 2030                 err = hmm_range_fault(&hmm_range);
> >> >> > 2031                 mmap_read_unlock(mm);
> >> >> > 2032                 if (err == -EBUSY) {
> >> >> > 2033                         if (time_after(jiffies, timeout))
> >> >> > 2034                                 break;
> >> >> > 2035
> >> >> > 2036                         hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> >> >> > 2037                         continue;
> >> >> > 2038                 }
> >> >> > 2039                 break;
> >> >> > 2040         }
> >> >> > 2041         if (err)
> >> >> > 2042                 goto err_put;
> >> >> > 2043
> >> >> > 2044         drm_gpusvm_notifier_lock(gpusvm);
> >> >> > 2045         if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
> >> >> > 2046                 drm_gpusvm_notifier_unlock(gpusvm);
> >> >> > 2047                 memset(src, 0, sizeof(*src) * npages);
> >> >> > 2048                 goto retry;
> >> >> > 2049         }
> >> >> > 2050         for (i = 0; i < npages; ++i) {
> >> >> > 2051                 struct page *page = hmm_pfn_to_page(src[i]);
> >> >> > 2052
> >> >> > 2053                 if (page && (is_device_private_page(page) ||
> >> >> > 2054                     is_device_coherent_page(page)) && page->zone_device_data)
> >> >> > 2055                         src[i] = src[i] & ~HMM_PFN_FLAGS;
> >> >> > 2056                 else
> >> >> > 2057                         src[i] = 0;
> >> >> > 2058                 if (src[i])
> >> >> > 2059                         src[i] = migrate_device_pfn_lock(src[i]);
> >> >> > 2060         }
> >> >> > 2061         drm_gpusvm_notifier_unlock(gpusvm);
> >> >> 
> >> >> Practically for eviction isn't this much the same as calling
> >> >> migrate_vma_setup()? And also for eviction as Sima mentioned you
> >> >> probably shouldn't be looking at mm/vma structs.
> >> >> 
> >> >
> >> > hmm_range_fault is just collecting the pages, internally I suppose it
> >> > does look at a VMA (struct vm_area_struct) but I think the point is
> >> > drivers should not be looking at VMA here.
> >> 
> >> migrate_vma_setup() is designed to be called by drivers and needs a vma,
> >> so in general I don't see a problem with drivers looking up vma's. The
> >> problem arises specifically for eviction and whether or not that happens
> >> in the driver or hmm_range_fault() is pretty irrelevant IMHO for the
> >> issues there (see below).
> >> 
> >
> > Ok, if you think it ok for drivers to lookup the VMA then purposed
> > exporting of migrate_device_pfn_lock & migrate_device_unmap is not
> > needed, rather just the original function exported in the this patch.
> >
> > More below too.
> >
> >> >> > 2063         migrate_device_unmap(src, npages, NULL);
> >> >> > ...
> >> >> > 2101         migrate_device_pages(src, dst, npages);
> >> >> > 2102         migrate_device_finalize(src, dst, npages);
> >> >> >
> >> >> >
> >> >> >> > Sima has strongly suggested avoiding a CPUVMA
> >> >> >> > lookup during eviction cases and this would let me fixup
> >> >> >> > drm_gpusvm_range_evict in [1] to avoid this.
> >> >> >> 
> >> >> >> That sounds reasonable but for context do you have a link to the
> >> >> >> comments/discussion on this? I couldn't readily find it, but I may have
> >> >> >> just missed it.
> >> >> >> 
> >> >> >
> >> >> > See in [4], search for '2. eviction' comment from sima.
> >> >> 
> >> >> Thanks for pointing that out. For reference here's Sima's comment:
> >> >> 
> >> >> > 2. eviction
> >> >> > 
> >> >> > Requirements much like migrate_to_ram, because otherwise we break the
> >> >> > migration gurantee:
> >> >> > 
> >> >> > - Only looking at physical memory datastructures and locks, no looking at
> >> >> >   mm/vma structs or relying on those being locked. We rely entirely on
> >> >> >   reverse maps from try_to_migrate to find all the mappings on both cpu
> >> >> >   and gpu side (cpu only zone device swap or migration pte entries ofc).
> >> >>
> >> >> I also very much agree with this. That's basically why I added
> >> >> migrate_device_range(), so that we can forcibly evict pages when the
> >> >> driver needs them freed (eg. driver unload, low memory, etc.). In
> >> >> general it is impossible to guarantee eviction og all pages using just
> >> >> hmm_range_fault().
> >> >> 
> >> >
> >> > In this code path we don't have device pages available, hence the
> >> > purposed collection via hmm_range_fault.
> >> 
> >> Why don't you have the pfns requiring eviction available? I need to read
> >> this series in more depth, but generally hmm_range_fault() can't
> >> gurantee you will find every device page.
> >> 
> >
> > There are two cases for eviction in my series:
> >
> > 1. TTM decides it needs to move memory. This calls
> > drm_gpusvm_evict_to_ram. In this case the device pfns are available
> > directly from drm_gpusvm_devmem so the migrate_device_* calls be used
> > here albiet with the new function added in this patch as device pfns may
> > be non-contiguous.
> 
> That makes sense and is generally what I think of when I'm thinking of
> eviction. The new function makes sense too - migrate_device_range() was
> primarily introduced to allow a driver to evict all device-private pages
> from a GPU so didn't consider non-contiguous cases, etc.
> 
> > 2. An inconsistent state for VA range occurs (mixed system and device pages,
> > partial unmap of a range, etc...). Here we want to evict the range ram
> > to make the state consistent. No device pages are available due to an
> > intentional disconnect between a virtual range and physical
> > drm_gpusvm_devmem, thus the device pages have to be looked up. This the
> > function drm_gpusvm_range_evict. Based on what you tell me, it likely is
> > fine the way originally coded in v2 (vma lookup + migrate_vma_*) vs
> > using hmm_range_fault like I have suggested here.
> 
> Thanks for the explanation. I think vma lookup + migrate_vma_setup() is
> fine for this usage and is exactly what you want - it was designed to
> either select all the system memory pages or device-private pages within
> a VA range and migrate them.
> 
> FWIW I have toyed with the idea of a combined
> hmm_range_fault()/migrate_vma_setup() front-end to the rest of the
> migrate_vma_*() process but haven't come up with something nice as
> yet. I don't think mixing the two in an open-coded fashion is a good
> idea though, I'd rather we come up with a new API that addresses the
> short-comings of migrate_vma_setup().
> 

I think that would good. Here we actually need to lookup multiple VMAs
and have a sequence of migrate_vma_* calls as it possible for VMAs to
have changed after the driver range was created. It might be nice to
hide the VMA lookup from the drivers with an API saying collect and
migrate all pages of a type in a VA range much like hmm_range_fault. If
the range spans multiple VMAs that would be hidden from the caller.

Matt

> > Note #2 may be removed or unnecessary at some point if we decide to add
> > support for ininconsistent state in GPU SVM and Xe. Keeping it simple for
> > now though. See 'Ranges with mixed system and device pages' in [5].
> >
> > [5] https://patchwork.freedesktop.org/patch/619819/?series=137870&rev=2
> >
> >> >> > [3] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1110726
> >> >> > [4] https://lore.kernel.org/all/BYAPR11MB3159A304925168D8B6B4671292692@BYAPR11MB3159.namprd11.prod.outlook.com/T/#m89cd6a37778ba5271d5381ebeb03e1f963856a78
> >> >> >
> >> >> >> > It would also make the function exported in this patch unnecessary too
> >> >> >> > as non-contiguous pfns can be setup on driver side via
> >> >> >> > migrate_device_pfn_lock and then migrate_device_unmap can be called.
> >> >> >> > This also another eviction usage in GPUSVM, see drm_gpusvm_evict_to_ram
> >> >> >> > in [1].
> >> >> >> >
> >> >> >> > Do you see an issue exporting migrate_device_pfn_lock,
> >> >> >> > migrate_device_unmap?
> >> >> >> 
> >> >> >> If there is a good justification for it I can't see a problem with
> >> >> >> exporting it. That said I don't really understand why you would
> >> >> >> want/need to split those steps up but I'll wait to see the code.
> >> >> >>
> >> >> >
> >> >> > It is so the device pages returned from hmm_range_fault, which are only
> >> >> > guaranteed to be valid under the notifier lock + a seqno check, to be
> >> >> > locked and ref taken for migration. migrate_device_unmap() can trigger a
> >> >> > MMU invalidation which takes the notifier lock thus calling the function
> >> >> > which combines migrate_device_pfn_lock + migrate_device_unmap deadlocks.
> >> >> >
> >> >> > I think this flow makes sense and agree in general this likely better
> >> >> > than looking at a CPUVMA.
> >> >> 
> >> >> I'm still a bit confused about what is better with this flow if you are
> >> >> still calling hmm_range_fault(). How is it better than just calling
> >> >> migrate_vma_setup()? Obviously it will fault the pages in, but it seems
> >> >
> >> > The code in rev2 calls migrate_vma_setup but the requires a struct
> >> > vm_area_struct argument whereas hmm_range_fault does not.
> >> 
> >> I'm not sure that's a good enough justfication because the problem isn't
> >> whether you're looking up vma's in driver code or mm code. The problem
> >> is you are looking up vma's at all and all that goes with that (mainly
> >> taking mmap lock, etc.)
> >> 
> >> And for eviction hmm_range_fault() won't even find all the pages because
> >> their virtual address may have changed - consider what happens in cases
> >> of mremap(), fork(), etc. So eviction really needs physical pages
> >> (pfn's), not virtual addresses.
> >>
> >
> > See above, #1 yes we use a physical pages. For #2 it is about making the
> > state consistent within a virtual address range.
> 
> Yep, makes sense now. For migration of physical pages you want
> migrate_device_*, virtual address ranges want migrate_vma_*
> 
>  - Alistair
> 
> > Matt
> >  
> >> >> we're talking about eviction here so I don't understand why that would
> >> >> be relevant. And hmm_range_fault() still requires the VMA, although I
> >> >> need to look at the patches more closely, probably CPUVMA is a DRM
> >> >
> >> > 'hmm_range_fault() still requires the VMA' internal yes, but again not
> >> > as argument. This is about avoiding a driver side lookup of the VMA.
> >> >
> >> > CPUVMA == struct vm_area_struct in this email.
> >> 
> >> Thanks for the clarification.
> >> 
> >>  - Alistair
> >> 
> >> > Matt
> >> >
> >> >> specific concept?
> >> >> 
> >> >> Thanks.
> >> >> 
> >> >>  - Alistair
> >> >> 
> >> >> > Matt
> >> >> >  
> >> >> >>  - Alistair
> >> >> >> 
> >> >> >> > Matt
> >> >> >> >
> >> >> >> > [1] https://patchwork.freedesktop.org/patch/619809/?series=137870&rev=2
> >> >> >> >
> >> >> >> >> Matt
> >> >> >> >> 
> >> >> >> >> > > +	}
> >> >> >> >> > > +
> >> >> >> >> > > +	migrate_device_unmap(src_pfns, npages, NULL);
> >> >> >> >> > > +
> >> >> >> >> > > +	return 0;
> >> >> >> >> > > +}
> >> >> >> >> > > +EXPORT_SYMBOL(migrate_device_prepopulated_range);
> >> >> >> >> > > +
> >> >> >> >> > >  /*
> >> >> >> >> > >   * Migrate a device coherent folio back to normal memory. The caller should have
> >> >> >> >> > >   * a reference on folio which will be copied to the new folio if migration is
> >> >> >> >> > 
> >> >> >> 
> >> >> 
> >> 
>
Mika Penttilä Oct. 18, 2024, 4:02 a.m. UTC | #12
Hi,

On 10/18/24 00:58, Alistair Popple wrote:
> Matthew Brost <matthew.brost@intel.com> writes:
>
>> On Thu, Oct 17, 2024 at 04:49:11PM +1100, Alistair Popple wrote:
>>> Matthew Brost <matthew.brost@intel.com> writes:
>>>
>>>> On Thu, Oct 17, 2024 at 02:21:13PM +1100, Alistair Popple wrote:
>>>>> Matthew Brost <matthew.brost@intel.com> writes:
>>>>>
>>>>>> On Thu, Oct 17, 2024 at 12:49:55PM +1100, Alistair Popple wrote:
>>>>>>> Matthew Brost <matthew.brost@intel.com> writes:
>>>>>>>
>>>>>>>> On Wed, Oct 16, 2024 at 04:46:52AM +0000, Matthew Brost wrote:
>>>>>>>>> On Wed, Oct 16, 2024 at 03:04:06PM +1100, Alistair Popple wrote:
>>>>>>> [...]
>>>>>>>
>>>>>>>>>>> +{
>>>>>>>>>>> +	unsigned long i;
>>>>>>>>>>> +
>>>>>>>>>>> +	for (i = 0; i < npages; i++) {
>>>>>>>>>>> +		struct page *page = pfn_to_page(src_pfns[i]);
>>>>>>>>>>> +
>>>>>>>>>>> +		if (!get_page_unless_zero(page)) {
>>>>>>>>>>> +			src_pfns[i] = 0;
>>>>>>>>>>> +			continue;
>>>>>>>>>>> +		}
>>>>>>>>>>> +
>>>>>>>>>>> +		if (!trylock_page(page)) {
>>>>>>>>>>> +			src_pfns[i] = 0;
>>>>>>>>>>> +			put_page(page);
>>>>>>>>>>> +			continue;
>>>>>>>>>>> +		}
>>>>>>>>>>> +
>>>>>>>>>>> +		src_pfns[i] = migrate_pfn(src_pfns[i]) | MIGRATE_PFN_MIGRATE;
>>>>>>>>>> This needs to be converted to use a folio like
>>>>>>>>>> migrate_device_range(). But more importantly this should be split out as
>>>>>>>>>> a function that both migrate_device_range() and this function can call
>>>>>>>>>> given this bit is identical.
>>>>>>>>>>
>>>>>>>>> Missed the folio conversion and agree a helper shared between this
>>>>>>>>> function and migrate_device_range would be a good idea. Let add that.
>>>>>>>>>
>>>>>>>> Alistair,
>>>>>>>>
>>>>>>>> Ok, I think now I want to go slightly different direction here to give
>>>>>>>> GPUSVM a bit more control over several eviction scenarios.
>>>>>>>>
>>>>>>>> What if I exported the helper discussed above, e.g.,
>>>>>>>>
>>>>>>>>  905 unsigned long migrate_device_pfn_lock(unsigned long pfn)
>>>>>>>>  906 {
>>>>>>>>  907         struct folio *folio;
>>>>>>>>  908
>>>>>>>>  909         folio = folio_get_nontail_page(pfn_to_page(pfn));
>>>>>>>>  910         if (!folio)
>>>>>>>>  911                 return 0;
>>>>>>>>  912
>>>>>>>>  913         if (!folio_trylock(folio)) {
>>>>>>>>  914                 folio_put(folio);
>>>>>>>>  915                 return 0;
>>>>>>>>  916         }
>>>>>>>>  917
>>>>>>>>  918         return migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
>>>>>>>>  919 }
>>>>>>>>  920 EXPORT_SYMBOL(migrate_device_pfn_lock);
>>>>>>>>
>>>>>>>> And then also export migrate_device_unmap.
>>>>>>>>
>>>>>>>> The usage here would be let a driver collect the device pages in virtual
>>>>>>>> address range via hmm_range_fault, lock device pages under notifier
>>>>>>>> lock ensuring device pages are valid, drop the notifier lock and call
>>>>>>>> migrate_device_unmap.
>>>>>>> I'm still working through this series but that seems a bit dubious, the
>>>>>>> locking here is pretty subtle and easy to get wrong so seeing some code
>>>>>>> would help me a lot in understanding what you're suggesting.
>>>>>>>
>>>>>> For sure locking in tricky, my mistake on not working through this
>>>>>> before sending out the next rev but it came to mind after sending +
>>>>>> regarding some late feedback from Thomas about using hmm for eviction
>>>>>> [2]. His suggestion of using hmm_range_fault to trigger migration
>>>>>> doesn't work for coherent pages, while something like below does.
>>>>>>
>>>>>> [2] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1125461
>>>>>>
>>>>>> Here is a snippet I have locally which seems to work.
>>>>>>
>>>>>> 2024 retry:
>>>>>> 2025         hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
>>>>>> 2026         hmm_range.hmm_pfns = src;
>>>>>> 2027
>>>>>> 2028         while (true) {
>>>>>> 2029                 mmap_read_lock(mm);
>>>>>> 2030                 err = hmm_range_fault(&hmm_range);
>>>>>> 2031                 mmap_read_unlock(mm);
>>>>>> 2032                 if (err == -EBUSY) {
>>>>>> 2033                         if (time_after(jiffies, timeout))
>>>>>> 2034                                 break;
>>>>>> 2035
>>>>>> 2036                         hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
>>>>>> 2037                         continue;
>>>>>> 2038                 }
>>>>>> 2039                 break;
>>>>>> 2040         }
>>>>>> 2041         if (err)
>>>>>> 2042                 goto err_put;
>>>>>> 2043
>>>>>> 2044         drm_gpusvm_notifier_lock(gpusvm);
>>>>>> 2045         if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
>>>>>> 2046                 drm_gpusvm_notifier_unlock(gpusvm);
>>>>>> 2047                 memset(src, 0, sizeof(*src) * npages);
>>>>>> 2048                 goto retry;
>>>>>> 2049         }
>>>>>> 2050         for (i = 0; i < npages; ++i) {
>>>>>> 2051                 struct page *page = hmm_pfn_to_page(src[i]);
>>>>>> 2052
>>>>>> 2053                 if (page && (is_device_private_page(page) ||
>>>>>> 2054                     is_device_coherent_page(page)) && page->zone_device_data)
>>>>>> 2055                         src[i] = src[i] & ~HMM_PFN_FLAGS;
>>>>>> 2056                 else
>>>>>> 2057                         src[i] = 0;
>>>>>> 2058                 if (src[i])
>>>>>> 2059                         src[i] = migrate_device_pfn_lock(src[i]);
>>>>>> 2060         }
>>>>>> 2061         drm_gpusvm_notifier_unlock(gpusvm);
>>>>> Practically for eviction isn't this much the same as calling
>>>>> migrate_vma_setup()? And also for eviction as Sima mentioned you
>>>>> probably shouldn't be looking at mm/vma structs.
>>>>>
>>>> hmm_range_fault is just collecting the pages, internally I suppose it
>>>> does look at a VMA (struct vm_area_struct) but I think the point is
>>>> drivers should not be looking at VMA here.
>>> migrate_vma_setup() is designed to be called by drivers and needs a vma,
>>> so in general I don't see a problem with drivers looking up vma's. The
>>> problem arises specifically for eviction and whether or not that happens
>>> in the driver or hmm_range_fault() is pretty irrelevant IMHO for the
>>> issues there (see below).
>>>
>> Ok, if you think it ok for drivers to lookup the VMA then purposed
>> exporting of migrate_device_pfn_lock & migrate_device_unmap is not
>> needed, rather just the original function exported in the this patch.
>>
>> More below too.
>>
>>>>>> 2063         migrate_device_unmap(src, npages, NULL);
>>>>>> ...
>>>>>> 2101         migrate_device_pages(src, dst, npages);
>>>>>> 2102         migrate_device_finalize(src, dst, npages);
>>>>>>
>>>>>>
>>>>>>>> Sima has strongly suggested avoiding a CPUVMA
>>>>>>>> lookup during eviction cases and this would let me fixup
>>>>>>>> drm_gpusvm_range_evict in [1] to avoid this.
>>>>>>> That sounds reasonable but for context do you have a link to the
>>>>>>> comments/discussion on this? I couldn't readily find it, but I may have
>>>>>>> just missed it.
>>>>>>>
>>>>>> See in [4], search for '2. eviction' comment from sima.
>>>>> Thanks for pointing that out. For reference here's Sima's comment:
>>>>>
>>>>>> 2. eviction
>>>>>>
>>>>>> Requirements much like migrate_to_ram, because otherwise we break the
>>>>>> migration gurantee:
>>>>>>
>>>>>> - Only looking at physical memory datastructures and locks, no looking at
>>>>>>   mm/vma structs or relying on those being locked. We rely entirely on
>>>>>>   reverse maps from try_to_migrate to find all the mappings on both cpu
>>>>>>   and gpu side (cpu only zone device swap or migration pte entries ofc).
>>>>> I also very much agree with this. That's basically why I added
>>>>> migrate_device_range(), so that we can forcibly evict pages when the
>>>>> driver needs them freed (eg. driver unload, low memory, etc.). In
>>>>> general it is impossible to guarantee eviction og all pages using just
>>>>> hmm_range_fault().
>>>>>
>>>> In this code path we don't have device pages available, hence the
>>>> purposed collection via hmm_range_fault.
>>> Why don't you have the pfns requiring eviction available? I need to read
>>> this series in more depth, but generally hmm_range_fault() can't
>>> gurantee you will find every device page.
>>>
>> There are two cases for eviction in my series:
>>
>> 1. TTM decides it needs to move memory. This calls
>> drm_gpusvm_evict_to_ram. In this case the device pfns are available
>> directly from drm_gpusvm_devmem so the migrate_device_* calls be used
>> here albiet with the new function added in this patch as device pfns may
>> be non-contiguous.
> That makes sense and is generally what I think of when I'm thinking of
> eviction. The new function makes sense too - migrate_device_range() was
> primarily introduced to allow a driver to evict all device-private pages
> from a GPU so didn't consider non-contiguous cases, etc.
>
>> 2. An inconsistent state for VA range occurs (mixed system and device pages,
>> partial unmap of a range, etc...). Here we want to evict the range ram
>> to make the state consistent. No device pages are available due to an
>> intentional disconnect between a virtual range and physical
>> drm_gpusvm_devmem, thus the device pages have to be looked up. This the
>> function drm_gpusvm_range_evict. Based on what you tell me, it likely is
>> fine the way originally coded in v2 (vma lookup + migrate_vma_*) vs
>> using hmm_range_fault like I have suggested here.
> Thanks for the explanation. I think vma lookup + migrate_vma_setup() is
> fine for this usage and is exactly what you want - it was designed to
> either select all the system memory pages or device-private pages within
> a VA range and migrate them.
>
> FWIW I have toyed with the idea of a combined
> hmm_range_fault()/migrate_vma_setup() front-end to the rest of the
> migrate_vma_*() process but haven't come up with something nice as
> yet. I don't think mixing the two in an open-coded fashion is a good
> idea though, I'd rather we come up with a new API that addresses the
> short-comings of migrate_vma_setup().

This is what I have been implementing and have a WIP version now, will
cleanup, test and send soon.

It does the migration entry installing while faulting pages, and you
continue migrate with normal migrate_vma_() flow.


>> Note #2 may be removed or unnecessary at some point if we decide to add
>> support for ininconsistent state in GPU SVM and Xe. Keeping it simple for
>> now though. See 'Ranges with mixed system and device pages' in [5].
>>
>> [5] https://patchwork.freedesktop.org/patch/619819/?series=137870&rev=2
>>
>>>>>> [3] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1110726
>>>>>> [4] https://lore.kernel.org/all/BYAPR11MB3159A304925168D8B6B4671292692@BYAPR11MB3159.namprd11.prod.outlook.com/T/#m89cd6a37778ba5271d5381ebeb03e1f963856a78
>>>>>>
>>>>>>>> It would also make the function exported in this patch unnecessary too
>>>>>>>> as non-contiguous pfns can be setup on driver side via
>>>>>>>> migrate_device_pfn_lock and then migrate_device_unmap can be called.
>>>>>>>> This also another eviction usage in GPUSVM, see drm_gpusvm_evict_to_ram
>>>>>>>> in [1].
>>>>>>>>
>>>>>>>> Do you see an issue exporting migrate_device_pfn_lock,
>>>>>>>> migrate_device_unmap?
>>>>>>> If there is a good justification for it I can't see a problem with
>>>>>>> exporting it. That said I don't really understand why you would
>>>>>>> want/need to split those steps up but I'll wait to see the code.
>>>>>>>
>>>>>> It is so the device pages returned from hmm_range_fault, which are only
>>>>>> guaranteed to be valid under the notifier lock + a seqno check, to be
>>>>>> locked and ref taken for migration. migrate_device_unmap() can trigger a
>>>>>> MMU invalidation which takes the notifier lock thus calling the function
>>>>>> which combines migrate_device_pfn_lock + migrate_device_unmap deadlocks.
>>>>>>
>>>>>> I think this flow makes sense and agree in general this likely better
>>>>>> than looking at a CPUVMA.
>>>>> I'm still a bit confused about what is better with this flow if you are
>>>>> still calling hmm_range_fault(). How is it better than just calling
>>>>> migrate_vma_setup()? Obviously it will fault the pages in, but it seems
>>>> The code in rev2 calls migrate_vma_setup but the requires a struct
>>>> vm_area_struct argument whereas hmm_range_fault does not.
>>> I'm not sure that's a good enough justfication because the problem isn't
>>> whether you're looking up vma's in driver code or mm code. The problem
>>> is you are looking up vma's at all and all that goes with that (mainly
>>> taking mmap lock, etc.)
>>>
>>> And for eviction hmm_range_fault() won't even find all the pages because
>>> their virtual address may have changed - consider what happens in cases
>>> of mremap(), fork(), etc. So eviction really needs physical pages
>>> (pfn's), not virtual addresses.
>>>
>> See above, #1 yes we use a physical pages. For #2 it is about making the
>> state consistent within a virtual address range.
> Yep, makes sense now. For migration of physical pages you want
> migrate_device_*, virtual address ranges want migrate_vma_*
>
>  - Alistair
>
>> Matt
>>  
>>>>> we're talking about eviction here so I don't understand why that would
>>>>> be relevant. And hmm_range_fault() still requires the VMA, although I
>>>>> need to look at the patches more closely, probably CPUVMA is a DRM
>>>> 'hmm_range_fault() still requires the VMA' internal yes, but again not
>>>> as argument. This is about avoiding a driver side lookup of the VMA.
>>>>
>>>> CPUVMA == struct vm_area_struct in this email.
>>> Thanks for the clarification.
>>>
>>>  - Alistair
>>>
>>>> Matt
>>>>
>>>>> specific concept?
>>>>>
>>>>> Thanks.
>>>>>
>>>>>  - Alistair
>>>>>
>>>>>> Matt
>>>>>>  
>>>>>>>  - Alistair
>>>>>>>
>>>>>>>> Matt
>>>>>>>>
>>>>>>>> [1] https://patchwork.freedesktop.org/patch/619809/?series=137870&rev=2
>>>>>>>>
>>>>>>>>> Matt
>>>>>>>>>
>>>>>>>>>>> +	}
>>>>>>>>>>> +
>>>>>>>>>>> +	migrate_device_unmap(src_pfns, npages, NULL);
>>>>>>>>>>> +
>>>>>>>>>>> +	return 0;
>>>>>>>>>>> +}
>>>>>>>>>>> +EXPORT_SYMBOL(migrate_device_prepopulated_range);
>>>>>>>>>>> +
>>>>>>>>>>>  /*
>>>>>>>>>>>   * Migrate a device coherent folio back to normal memory. The caller should have
>>>>>>>>>>>   * a reference on folio which will be copied to the new folio if migration is

--Mika
Alistair Popple Oct. 18, 2024, 5:55 a.m. UTC | #13
Mika Penttilä <mpenttil@redhat.com> writes:

> Hi,
>
> On 10/18/24 00:58, Alistair Popple wrote:
>> Matthew Brost <matthew.brost@intel.com> writes:
>>
>>> On Thu, Oct 17, 2024 at 04:49:11PM +1100, Alistair Popple wrote:
>>>> Matthew Brost <matthew.brost@intel.com> writes:
>>>>
>>>>> On Thu, Oct 17, 2024 at 02:21:13PM +1100, Alistair Popple wrote:
>>>>>> Matthew Brost <matthew.brost@intel.com> writes:
>>>>>>
>>>>>>> On Thu, Oct 17, 2024 at 12:49:55PM +1100, Alistair Popple wrote:
>>>>>>>> Matthew Brost <matthew.brost@intel.com> writes:
>>>>>>>>
>>>>>>>>> On Wed, Oct 16, 2024 at 04:46:52AM +0000, Matthew Brost wrote:
>>>>>>>>>> On Wed, Oct 16, 2024 at 03:04:06PM +1100, Alistair Popple wrote:
>>>>>>>> [...]
>>>>>>>>
>>>>>>>>>>>> +{
>>>>>>>>>>>> +	unsigned long i;
>>>>>>>>>>>> +
>>>>>>>>>>>> +	for (i = 0; i < npages; i++) {
>>>>>>>>>>>> +		struct page *page = pfn_to_page(src_pfns[i]);
>>>>>>>>>>>> +
>>>>>>>>>>>> +		if (!get_page_unless_zero(page)) {
>>>>>>>>>>>> +			src_pfns[i] = 0;
>>>>>>>>>>>> +			continue;
>>>>>>>>>>>> +		}
>>>>>>>>>>>> +
>>>>>>>>>>>> +		if (!trylock_page(page)) {
>>>>>>>>>>>> +			src_pfns[i] = 0;
>>>>>>>>>>>> +			put_page(page);
>>>>>>>>>>>> +			continue;
>>>>>>>>>>>> +		}
>>>>>>>>>>>> +
>>>>>>>>>>>> +		src_pfns[i] = migrate_pfn(src_pfns[i]) | MIGRATE_PFN_MIGRATE;
>>>>>>>>>>> This needs to be converted to use a folio like
>>>>>>>>>>> migrate_device_range(). But more importantly this should be split out as
>>>>>>>>>>> a function that both migrate_device_range() and this function can call
>>>>>>>>>>> given this bit is identical.
>>>>>>>>>>>
>>>>>>>>>> Missed the folio conversion and agree a helper shared between this
>>>>>>>>>> function and migrate_device_range would be a good idea. Let add that.
>>>>>>>>>>
>>>>>>>>> Alistair,
>>>>>>>>>
>>>>>>>>> Ok, I think now I want to go slightly different direction here to give
>>>>>>>>> GPUSVM a bit more control over several eviction scenarios.
>>>>>>>>>
>>>>>>>>> What if I exported the helper discussed above, e.g.,
>>>>>>>>>
>>>>>>>>>  905 unsigned long migrate_device_pfn_lock(unsigned long pfn)
>>>>>>>>>  906 {
>>>>>>>>>  907         struct folio *folio;
>>>>>>>>>  908
>>>>>>>>>  909         folio = folio_get_nontail_page(pfn_to_page(pfn));
>>>>>>>>>  910         if (!folio)
>>>>>>>>>  911                 return 0;
>>>>>>>>>  912
>>>>>>>>>  913         if (!folio_trylock(folio)) {
>>>>>>>>>  914                 folio_put(folio);
>>>>>>>>>  915                 return 0;
>>>>>>>>>  916         }
>>>>>>>>>  917
>>>>>>>>>  918         return migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
>>>>>>>>>  919 }
>>>>>>>>>  920 EXPORT_SYMBOL(migrate_device_pfn_lock);
>>>>>>>>>
>>>>>>>>> And then also export migrate_device_unmap.
>>>>>>>>>
>>>>>>>>> The usage here would be let a driver collect the device pages in virtual
>>>>>>>>> address range via hmm_range_fault, lock device pages under notifier
>>>>>>>>> lock ensuring device pages are valid, drop the notifier lock and call
>>>>>>>>> migrate_device_unmap.
>>>>>>>> I'm still working through this series but that seems a bit dubious, the
>>>>>>>> locking here is pretty subtle and easy to get wrong so seeing some code
>>>>>>>> would help me a lot in understanding what you're suggesting.
>>>>>>>>
>>>>>>> For sure locking in tricky, my mistake on not working through this
>>>>>>> before sending out the next rev but it came to mind after sending +
>>>>>>> regarding some late feedback from Thomas about using hmm for eviction
>>>>>>> [2]. His suggestion of using hmm_range_fault to trigger migration
>>>>>>> doesn't work for coherent pages, while something like below does.
>>>>>>>
>>>>>>> [2] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1125461
>>>>>>>
>>>>>>> Here is a snippet I have locally which seems to work.
>>>>>>>
>>>>>>> 2024 retry:
>>>>>>> 2025         hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
>>>>>>> 2026         hmm_range.hmm_pfns = src;
>>>>>>> 2027
>>>>>>> 2028         while (true) {
>>>>>>> 2029                 mmap_read_lock(mm);
>>>>>>> 2030                 err = hmm_range_fault(&hmm_range);
>>>>>>> 2031                 mmap_read_unlock(mm);
>>>>>>> 2032                 if (err == -EBUSY) {
>>>>>>> 2033                         if (time_after(jiffies, timeout))
>>>>>>> 2034                                 break;
>>>>>>> 2035
>>>>>>> 2036                         hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
>>>>>>> 2037                         continue;
>>>>>>> 2038                 }
>>>>>>> 2039                 break;
>>>>>>> 2040         }
>>>>>>> 2041         if (err)
>>>>>>> 2042                 goto err_put;
>>>>>>> 2043
>>>>>>> 2044         drm_gpusvm_notifier_lock(gpusvm);
>>>>>>> 2045         if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
>>>>>>> 2046                 drm_gpusvm_notifier_unlock(gpusvm);
>>>>>>> 2047                 memset(src, 0, sizeof(*src) * npages);
>>>>>>> 2048                 goto retry;
>>>>>>> 2049         }
>>>>>>> 2050         for (i = 0; i < npages; ++i) {
>>>>>>> 2051                 struct page *page = hmm_pfn_to_page(src[i]);
>>>>>>> 2052
>>>>>>> 2053                 if (page && (is_device_private_page(page) ||
>>>>>>> 2054                     is_device_coherent_page(page)) && page->zone_device_data)
>>>>>>> 2055                         src[i] = src[i] & ~HMM_PFN_FLAGS;
>>>>>>> 2056                 else
>>>>>>> 2057                         src[i] = 0;
>>>>>>> 2058                 if (src[i])
>>>>>>> 2059                         src[i] = migrate_device_pfn_lock(src[i]);
>>>>>>> 2060         }
>>>>>>> 2061         drm_gpusvm_notifier_unlock(gpusvm);
>>>>>> Practically for eviction isn't this much the same as calling
>>>>>> migrate_vma_setup()? And also for eviction as Sima mentioned you
>>>>>> probably shouldn't be looking at mm/vma structs.
>>>>>>
>>>>> hmm_range_fault is just collecting the pages, internally I suppose it
>>>>> does look at a VMA (struct vm_area_struct) but I think the point is
>>>>> drivers should not be looking at VMA here.
>>>> migrate_vma_setup() is designed to be called by drivers and needs a vma,
>>>> so in general I don't see a problem with drivers looking up vma's. The
>>>> problem arises specifically for eviction and whether or not that happens
>>>> in the driver or hmm_range_fault() is pretty irrelevant IMHO for the
>>>> issues there (see below).
>>>>
>>> Ok, if you think it ok for drivers to lookup the VMA then purposed
>>> exporting of migrate_device_pfn_lock & migrate_device_unmap is not
>>> needed, rather just the original function exported in the this patch.
>>>
>>> More below too.
>>>
>>>>>>> 2063         migrate_device_unmap(src, npages, NULL);
>>>>>>> ...
>>>>>>> 2101         migrate_device_pages(src, dst, npages);
>>>>>>> 2102         migrate_device_finalize(src, dst, npages);
>>>>>>>
>>>>>>>
>>>>>>>>> Sima has strongly suggested avoiding a CPUVMA
>>>>>>>>> lookup during eviction cases and this would let me fixup
>>>>>>>>> drm_gpusvm_range_evict in [1] to avoid this.
>>>>>>>> That sounds reasonable but for context do you have a link to the
>>>>>>>> comments/discussion on this? I couldn't readily find it, but I may have
>>>>>>>> just missed it.
>>>>>>>>
>>>>>>> See in [4], search for '2. eviction' comment from sima.
>>>>>> Thanks for pointing that out. For reference here's Sima's comment:
>>>>>>
>>>>>>> 2. eviction
>>>>>>>
>>>>>>> Requirements much like migrate_to_ram, because otherwise we break the
>>>>>>> migration gurantee:
>>>>>>>
>>>>>>> - Only looking at physical memory datastructures and locks, no looking at
>>>>>>>   mm/vma structs or relying on those being locked. We rely entirely on
>>>>>>>   reverse maps from try_to_migrate to find all the mappings on both cpu
>>>>>>>   and gpu side (cpu only zone device swap or migration pte entries ofc).
>>>>>> I also very much agree with this. That's basically why I added
>>>>>> migrate_device_range(), so that we can forcibly evict pages when the
>>>>>> driver needs them freed (eg. driver unload, low memory, etc.). In
>>>>>> general it is impossible to guarantee eviction og all pages using just
>>>>>> hmm_range_fault().
>>>>>>
>>>>> In this code path we don't have device pages available, hence the
>>>>> purposed collection via hmm_range_fault.
>>>> Why don't you have the pfns requiring eviction available? I need to read
>>>> this series in more depth, but generally hmm_range_fault() can't
>>>> gurantee you will find every device page.
>>>>
>>> There are two cases for eviction in my series:
>>>
>>> 1. TTM decides it needs to move memory. This calls
>>> drm_gpusvm_evict_to_ram. In this case the device pfns are available
>>> directly from drm_gpusvm_devmem so the migrate_device_* calls be used
>>> here albiet with the new function added in this patch as device pfns may
>>> be non-contiguous.
>> That makes sense and is generally what I think of when I'm thinking of
>> eviction. The new function makes sense too - migrate_device_range() was
>> primarily introduced to allow a driver to evict all device-private pages
>> from a GPU so didn't consider non-contiguous cases, etc.
>>
>>> 2. An inconsistent state for VA range occurs (mixed system and device pages,
>>> partial unmap of a range, etc...). Here we want to evict the range ram
>>> to make the state consistent. No device pages are available due to an
>>> intentional disconnect between a virtual range and physical
>>> drm_gpusvm_devmem, thus the device pages have to be looked up. This the
>>> function drm_gpusvm_range_evict. Based on what you tell me, it likely is
>>> fine the way originally coded in v2 (vma lookup + migrate_vma_*) vs
>>> using hmm_range_fault like I have suggested here.
>> Thanks for the explanation. I think vma lookup + migrate_vma_setup() is
>> fine for this usage and is exactly what you want - it was designed to
>> either select all the system memory pages or device-private pages within
>> a VA range and migrate them.
>>
>> FWIW I have toyed with the idea of a combined
>> hmm_range_fault()/migrate_vma_setup() front-end to the rest of the
>> migrate_vma_*() process but haven't come up with something nice as
>> yet. I don't think mixing the two in an open-coded fashion is a good
>> idea though, I'd rather we come up with a new API that addresses the
>> short-comings of migrate_vma_setup().
>
> This is what I have been implementing and have a WIP version now, will
> cleanup, test and send soon.
>
> It does the migration entry installing while faulting pages, and you
> continue migrate with normal migrate_vma_() flow.

Oh nice! Thanks for looking further into that idea, I'm looking forward
to seeing the results. For background Mika and I had an offline
discussion about this a little while back but I wasn't sure if it had
gone anywhere.

>>> Note #2 may be removed or unnecessary at some point if we decide to add
>>> support for ininconsistent state in GPU SVM and Xe. Keeping it simple for
>>> now though. See 'Ranges with mixed system and device pages' in [5].
>>>
>>> [5] https://patchwork.freedesktop.org/patch/619819/?series=137870&rev=2
>>>
>>>>>>> [3] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1110726
>>>>>>> [4] https://lore.kernel.org/all/BYAPR11MB3159A304925168D8B6B4671292692@BYAPR11MB3159.namprd11.prod.outlook.com/T/#m89cd6a37778ba5271d5381ebeb03e1f963856a78
>>>>>>>
>>>>>>>>> It would also make the function exported in this patch unnecessary too
>>>>>>>>> as non-contiguous pfns can be setup on driver side via
>>>>>>>>> migrate_device_pfn_lock and then migrate_device_unmap can be called.
>>>>>>>>> This also another eviction usage in GPUSVM, see drm_gpusvm_evict_to_ram
>>>>>>>>> in [1].
>>>>>>>>>
>>>>>>>>> Do you see an issue exporting migrate_device_pfn_lock,
>>>>>>>>> migrate_device_unmap?
>>>>>>>> If there is a good justification for it I can't see a problem with
>>>>>>>> exporting it. That said I don't really understand why you would
>>>>>>>> want/need to split those steps up but I'll wait to see the code.
>>>>>>>>
>>>>>>> It is so the device pages returned from hmm_range_fault, which are only
>>>>>>> guaranteed to be valid under the notifier lock + a seqno check, to be
>>>>>>> locked and ref taken for migration. migrate_device_unmap() can trigger a
>>>>>>> MMU invalidation which takes the notifier lock thus calling the function
>>>>>>> which combines migrate_device_pfn_lock + migrate_device_unmap deadlocks.
>>>>>>>
>>>>>>> I think this flow makes sense and agree in general this likely better
>>>>>>> than looking at a CPUVMA.
>>>>>> I'm still a bit confused about what is better with this flow if you are
>>>>>> still calling hmm_range_fault(). How is it better than just calling
>>>>>> migrate_vma_setup()? Obviously it will fault the pages in, but it seems
>>>>> The code in rev2 calls migrate_vma_setup but the requires a struct
>>>>> vm_area_struct argument whereas hmm_range_fault does not.
>>>> I'm not sure that's a good enough justfication because the problem isn't
>>>> whether you're looking up vma's in driver code or mm code. The problem
>>>> is you are looking up vma's at all and all that goes with that (mainly
>>>> taking mmap lock, etc.)
>>>>
>>>> And for eviction hmm_range_fault() won't even find all the pages because
>>>> their virtual address may have changed - consider what happens in cases
>>>> of mremap(), fork(), etc. So eviction really needs physical pages
>>>> (pfn's), not virtual addresses.
>>>>
>>> See above, #1 yes we use a physical pages. For #2 it is about making the
>>> state consistent within a virtual address range.
>> Yep, makes sense now. For migration of physical pages you want
>> migrate_device_*, virtual address ranges want migrate_vma_*
>>
>>  - Alistair
>>
>>> Matt
>>>  
>>>>>> we're talking about eviction here so I don't understand why that would
>>>>>> be relevant. And hmm_range_fault() still requires the VMA, although I
>>>>>> need to look at the patches more closely, probably CPUVMA is a DRM
>>>>> 'hmm_range_fault() still requires the VMA' internal yes, but again not
>>>>> as argument. This is about avoiding a driver side lookup of the VMA.
>>>>>
>>>>> CPUVMA == struct vm_area_struct in this email.
>>>> Thanks for the clarification.
>>>>
>>>>  - Alistair
>>>>
>>>>> Matt
>>>>>
>>>>>> specific concept?
>>>>>>
>>>>>> Thanks.
>>>>>>
>>>>>>  - Alistair
>>>>>>
>>>>>>> Matt
>>>>>>>  
>>>>>>>>  - Alistair
>>>>>>>>
>>>>>>>>> Matt
>>>>>>>>>
>>>>>>>>> [1] https://patchwork.freedesktop.org/patch/619809/?series=137870&rev=2
>>>>>>>>>
>>>>>>>>>> Matt
>>>>>>>>>>
>>>>>>>>>>>> +	}
>>>>>>>>>>>> +
>>>>>>>>>>>> +	migrate_device_unmap(src_pfns, npages, NULL);
>>>>>>>>>>>> +
>>>>>>>>>>>> +	return 0;
>>>>>>>>>>>> +}
>>>>>>>>>>>> +EXPORT_SYMBOL(migrate_device_prepopulated_range);
>>>>>>>>>>>> +
>>>>>>>>>>>>  /*
>>>>>>>>>>>>   * Migrate a device coherent folio back to normal memory. The caller should have
>>>>>>>>>>>>   * a reference on folio which will be copied to the new folio if migration is
>
> --Mika
Alistair Popple Oct. 18, 2024, 5:59 a.m. UTC | #14
Matthew Brost <matthew.brost@intel.com> writes:

> On Fri, Oct 18, 2024 at 08:58:02AM +1100, Alistair Popple wrote:
>> 
>> Matthew Brost <matthew.brost@intel.com> writes:
>> 
>> > On Thu, Oct 17, 2024 at 04:49:11PM +1100, Alistair Popple wrote:
>> >> 
>> >> Matthew Brost <matthew.brost@intel.com> writes:
>> >> 
>> >> > On Thu, Oct 17, 2024 at 02:21:13PM +1100, Alistair Popple wrote:
>> >> >> 
>> >> >> Matthew Brost <matthew.brost@intel.com> writes:
>> >> >> 
>> >> >> > On Thu, Oct 17, 2024 at 12:49:55PM +1100, Alistair Popple wrote:
>> >> >> >> 
>> >> >> >> Matthew Brost <matthew.brost@intel.com> writes:
>> >> >> >> 
>> >> >> >> > On Wed, Oct 16, 2024 at 04:46:52AM +0000, Matthew Brost wrote:
>> >> >> >> >> On Wed, Oct 16, 2024 at 03:04:06PM +1100, Alistair Popple wrote:
>> >> >> >> 
>> >> >> >> [...]
>> >> >> >> 
>> >> >> >> >> > > +{
>> >> >> >> >> > > +	unsigned long i;
>> >> >> >> >> > > +
>> >> >> >> >> > > +	for (i = 0; i < npages; i++) {
>> >> >> >> >> > > +		struct page *page = pfn_to_page(src_pfns[i]);
>> >> >> >> >> > > +
>> >> >> >> >> > > +		if (!get_page_unless_zero(page)) {
>> >> >> >> >> > > +			src_pfns[i] = 0;
>> >> >> >> >> > > +			continue;
>> >> >> >> >> > > +		}
>> >> >> >> >> > > +
>> >> >> >> >> > > +		if (!trylock_page(page)) {
>> >> >> >> >> > > +			src_pfns[i] = 0;
>> >> >> >> >> > > +			put_page(page);
>> >> >> >> >> > > +			continue;
>> >> >> >> >> > > +		}
>> >> >> >> >> > > +
>> >> >> >> >> > > +		src_pfns[i] = migrate_pfn(src_pfns[i]) | MIGRATE_PFN_MIGRATE;
>> >> >> >> >> > 
>> >> >> >> >> > This needs to be converted to use a folio like
>> >> >> >> >> > migrate_device_range(). But more importantly this should be split out as
>> >> >> >> >> > a function that both migrate_device_range() and this function can call
>> >> >> >> >> > given this bit is identical.
>> >> >> >> >> > 
>> >> >> >> >> 
>> >> >> >> >> Missed the folio conversion and agree a helper shared between this
>> >> >> >> >> function and migrate_device_range would be a good idea. Let add that.
>> >> >> >> >> 
>> >> >> >> >
>> >> >> >> > Alistair,
>> >> >> >> >
>> >> >> >> > Ok, I think now I want to go slightly different direction here to give
>> >> >> >> > GPUSVM a bit more control over several eviction scenarios.
>> >> >> >> >
>> >> >> >> > What if I exported the helper discussed above, e.g.,
>> >> >> >> >
>> >> >> >> >  905 unsigned long migrate_device_pfn_lock(unsigned long pfn)
>> >> >> >> >  906 {
>> >> >> >> >  907         struct folio *folio;
>> >> >> >> >  908
>> >> >> >> >  909         folio = folio_get_nontail_page(pfn_to_page(pfn));
>> >> >> >> >  910         if (!folio)
>> >> >> >> >  911                 return 0;
>> >> >> >> >  912
>> >> >> >> >  913         if (!folio_trylock(folio)) {
>> >> >> >> >  914                 folio_put(folio);
>> >> >> >> >  915                 return 0;
>> >> >> >> >  916         }
>> >> >> >> >  917
>> >> >> >> >  918         return migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
>> >> >> >> >  919 }
>> >> >> >> >  920 EXPORT_SYMBOL(migrate_device_pfn_lock);
>> >> >> >> >
>> >> >> >> > And then also export migrate_device_unmap.
>> >> >> >> >
>> >> >> >> > The usage here would be let a driver collect the device pages in virtual
>> >> >> >> > address range via hmm_range_fault, lock device pages under notifier
>> >> >> >> > lock ensuring device pages are valid, drop the notifier lock and call
>> >> >> >> > migrate_device_unmap.
>> >> >> >> 
>> >> >> >> I'm still working through this series but that seems a bit dubious, the
>> >> >> >> locking here is pretty subtle and easy to get wrong so seeing some code
>> >> >> >> would help me a lot in understanding what you're suggesting.
>> >> >> >>
>> >> >> >
>> >> >> > For sure locking in tricky, my mistake on not working through this
>> >> >> > before sending out the next rev but it came to mind after sending +
>> >> >> > regarding some late feedback from Thomas about using hmm for eviction
>> >> >> > [2]. His suggestion of using hmm_range_fault to trigger migration
>> >> >> > doesn't work for coherent pages, while something like below does.
>> >> >> >
>> >> >> > [2] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1125461
>> >> >> >
>> >> >> > Here is a snippet I have locally which seems to work.
>> >> >> >
>> >> >> > 2024 retry:
>> >> >> > 2025         hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
>> >> >> > 2026         hmm_range.hmm_pfns = src;
>> >> >> > 2027
>> >> >> > 2028         while (true) {
>> >> >> > 2029                 mmap_read_lock(mm);
>> >> >> > 2030                 err = hmm_range_fault(&hmm_range);
>> >> >> > 2031                 mmap_read_unlock(mm);
>> >> >> > 2032                 if (err == -EBUSY) {
>> >> >> > 2033                         if (time_after(jiffies, timeout))
>> >> >> > 2034                                 break;
>> >> >> > 2035
>> >> >> > 2036                         hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
>> >> >> > 2037                         continue;
>> >> >> > 2038                 }
>> >> >> > 2039                 break;
>> >> >> > 2040         }
>> >> >> > 2041         if (err)
>> >> >> > 2042                 goto err_put;
>> >> >> > 2043
>> >> >> > 2044         drm_gpusvm_notifier_lock(gpusvm);
>> >> >> > 2045         if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
>> >> >> > 2046                 drm_gpusvm_notifier_unlock(gpusvm);
>> >> >> > 2047                 memset(src, 0, sizeof(*src) * npages);
>> >> >> > 2048                 goto retry;
>> >> >> > 2049         }
>> >> >> > 2050         for (i = 0; i < npages; ++i) {
>> >> >> > 2051                 struct page *page = hmm_pfn_to_page(src[i]);
>> >> >> > 2052
>> >> >> > 2053                 if (page && (is_device_private_page(page) ||
>> >> >> > 2054                     is_device_coherent_page(page)) && page->zone_device_data)
>> >> >> > 2055                         src[i] = src[i] & ~HMM_PFN_FLAGS;
>> >> >> > 2056                 else
>> >> >> > 2057                         src[i] = 0;
>> >> >> > 2058                 if (src[i])
>> >> >> > 2059                         src[i] = migrate_device_pfn_lock(src[i]);
>> >> >> > 2060         }
>> >> >> > 2061         drm_gpusvm_notifier_unlock(gpusvm);
>> >> >> 
>> >> >> Practically for eviction isn't this much the same as calling
>> >> >> migrate_vma_setup()? And also for eviction as Sima mentioned you
>> >> >> probably shouldn't be looking at mm/vma structs.
>> >> >> 
>> >> >
>> >> > hmm_range_fault is just collecting the pages, internally I suppose it
>> >> > does look at a VMA (struct vm_area_struct) but I think the point is
>> >> > drivers should not be looking at VMA here.
>> >> 
>> >> migrate_vma_setup() is designed to be called by drivers and needs a vma,
>> >> so in general I don't see a problem with drivers looking up vma's. The
>> >> problem arises specifically for eviction and whether or not that happens
>> >> in the driver or hmm_range_fault() is pretty irrelevant IMHO for the
>> >> issues there (see below).
>> >> 
>> >
>> > Ok, if you think it ok for drivers to lookup the VMA then purposed
>> > exporting of migrate_device_pfn_lock & migrate_device_unmap is not
>> > needed, rather just the original function exported in the this patch.
>> >
>> > More below too.
>> >
>> >> >> > 2063         migrate_device_unmap(src, npages, NULL);
>> >> >> > ...
>> >> >> > 2101         migrate_device_pages(src, dst, npages);
>> >> >> > 2102         migrate_device_finalize(src, dst, npages);
>> >> >> >
>> >> >> >
>> >> >> >> > Sima has strongly suggested avoiding a CPUVMA
>> >> >> >> > lookup during eviction cases and this would let me fixup
>> >> >> >> > drm_gpusvm_range_evict in [1] to avoid this.
>> >> >> >> 
>> >> >> >> That sounds reasonable but for context do you have a link to the
>> >> >> >> comments/discussion on this? I couldn't readily find it, but I may have
>> >> >> >> just missed it.
>> >> >> >> 
>> >> >> >
>> >> >> > See in [4], search for '2. eviction' comment from sima.
>> >> >> 
>> >> >> Thanks for pointing that out. For reference here's Sima's comment:
>> >> >> 
>> >> >> > 2. eviction
>> >> >> > 
>> >> >> > Requirements much like migrate_to_ram, because otherwise we break the
>> >> >> > migration gurantee:
>> >> >> > 
>> >> >> > - Only looking at physical memory datastructures and locks, no looking at
>> >> >> >   mm/vma structs or relying on those being locked. We rely entirely on
>> >> >> >   reverse maps from try_to_migrate to find all the mappings on both cpu
>> >> >> >   and gpu side (cpu only zone device swap or migration pte entries ofc).
>> >> >>
>> >> >> I also very much agree with this. That's basically why I added
>> >> >> migrate_device_range(), so that we can forcibly evict pages when the
>> >> >> driver needs them freed (eg. driver unload, low memory, etc.). In
>> >> >> general it is impossible to guarantee eviction og all pages using just
>> >> >> hmm_range_fault().
>> >> >> 
>> >> >
>> >> > In this code path we don't have device pages available, hence the
>> >> > purposed collection via hmm_range_fault.
>> >> 
>> >> Why don't you have the pfns requiring eviction available? I need to read
>> >> this series in more depth, but generally hmm_range_fault() can't
>> >> gurantee you will find every device page.
>> >> 
>> >
>> > There are two cases for eviction in my series:
>> >
>> > 1. TTM decides it needs to move memory. This calls
>> > drm_gpusvm_evict_to_ram. In this case the device pfns are available
>> > directly from drm_gpusvm_devmem so the migrate_device_* calls be used
>> > here albiet with the new function added in this patch as device pfns may
>> > be non-contiguous.
>> 
>> That makes sense and is generally what I think of when I'm thinking of
>> eviction. The new function makes sense too - migrate_device_range() was
>> primarily introduced to allow a driver to evict all device-private pages
>> from a GPU so didn't consider non-contiguous cases, etc.
>> 
>> > 2. An inconsistent state for VA range occurs (mixed system and device pages,
>> > partial unmap of a range, etc...). Here we want to evict the range ram
>> > to make the state consistent. No device pages are available due to an
>> > intentional disconnect between a virtual range and physical
>> > drm_gpusvm_devmem, thus the device pages have to be looked up. This the
>> > function drm_gpusvm_range_evict. Based on what you tell me, it likely is
>> > fine the way originally coded in v2 (vma lookup + migrate_vma_*) vs
>> > using hmm_range_fault like I have suggested here.
>> 
>> Thanks for the explanation. I think vma lookup + migrate_vma_setup() is
>> fine for this usage and is exactly what you want - it was designed to
>> either select all the system memory pages or device-private pages within
>> a VA range and migrate them.
>> 
>> FWIW I have toyed with the idea of a combined
>> hmm_range_fault()/migrate_vma_setup() front-end to the rest of the
>> migrate_vma_*() process but haven't come up with something nice as
>> yet. I don't think mixing the two in an open-coded fashion is a good
>> idea though, I'd rather we come up with a new API that addresses the
>> short-comings of migrate_vma_setup().
>> 
>
> I think that would good. Here we actually need to lookup multiple VMAs
> and have a sequence of migrate_vma_* calls as it possible for VMAs to
> have changed after the driver range was created. It might be nice to
> hide the VMA lookup from the drivers with an API saying collect and
> migrate all pages of a type in a VA range much like hmm_range_fault. If
> the range spans multiple VMAs that would be hidden from the caller.

Ok. I wasn't really considering multiple VMAs. UVM and Nouveau don't
really have a requirement to migrate across multiple VMAs but if that's
neccessary I think an API that hides that specifically for working with
migrate_vma_*() might make sense.

> Matt
>
>> > Note #2 may be removed or unnecessary at some point if we decide to add
>> > support for ininconsistent state in GPU SVM and Xe. Keeping it simple for
>> > now though. See 'Ranges with mixed system and device pages' in [5].

As someone not very familiar with some of the DRM layers can I ask why
having virtual address ranges with a mix of system and device pages is
hard to support? It seems to me that in practice it might be quite
difficult to keep a VMA range as exclusively all in system memory or all
in device memory.

>> > [5] https://patchwork.freedesktop.org/patch/619819/?series=137870&rev=2
>> >
>> >> >> > [3] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1110726
>> >> >> > [4] https://lore.kernel.org/all/BYAPR11MB3159A304925168D8B6B4671292692@BYAPR11MB3159.namprd11.prod.outlook.com/T/#m89cd6a37778ba5271d5381ebeb03e1f963856a78
>> >> >> >
>> >> >> >> > It would also make the function exported in this patch unnecessary too
>> >> >> >> > as non-contiguous pfns can be setup on driver side via
>> >> >> >> > migrate_device_pfn_lock and then migrate_device_unmap can be called.
>> >> >> >> > This also another eviction usage in GPUSVM, see drm_gpusvm_evict_to_ram
>> >> >> >> > in [1].
>> >> >> >> >
>> >> >> >> > Do you see an issue exporting migrate_device_pfn_lock,
>> >> >> >> > migrate_device_unmap?
>> >> >> >> 
>> >> >> >> If there is a good justification for it I can't see a problem with
>> >> >> >> exporting it. That said I don't really understand why you would
>> >> >> >> want/need to split those steps up but I'll wait to see the code.
>> >> >> >>
>> >> >> >
>> >> >> > It is so the device pages returned from hmm_range_fault, which are only
>> >> >> > guaranteed to be valid under the notifier lock + a seqno check, to be
>> >> >> > locked and ref taken for migration. migrate_device_unmap() can trigger a
>> >> >> > MMU invalidation which takes the notifier lock thus calling the function
>> >> >> > which combines migrate_device_pfn_lock + migrate_device_unmap deadlocks.
>> >> >> >
>> >> >> > I think this flow makes sense and agree in general this likely better
>> >> >> > than looking at a CPUVMA.
>> >> >> 
>> >> >> I'm still a bit confused about what is better with this flow if you are
>> >> >> still calling hmm_range_fault(). How is it better than just calling
>> >> >> migrate_vma_setup()? Obviously it will fault the pages in, but it seems
>> >> >
>> >> > The code in rev2 calls migrate_vma_setup but the requires a struct
>> >> > vm_area_struct argument whereas hmm_range_fault does not.
>> >> 
>> >> I'm not sure that's a good enough justfication because the problem isn't
>> >> whether you're looking up vma's in driver code or mm code. The problem
>> >> is you are looking up vma's at all and all that goes with that (mainly
>> >> taking mmap lock, etc.)
>> >> 
>> >> And for eviction hmm_range_fault() won't even find all the pages because
>> >> their virtual address may have changed - consider what happens in cases
>> >> of mremap(), fork(), etc. So eviction really needs physical pages
>> >> (pfn's), not virtual addresses.
>> >>
>> >
>> > See above, #1 yes we use a physical pages. For #2 it is about making the
>> > state consistent within a virtual address range.
>> 
>> Yep, makes sense now. For migration of physical pages you want
>> migrate_device_*, virtual address ranges want migrate_vma_*
>> 
>>  - Alistair
>> 
>> > Matt
>> >  
>> >> >> we're talking about eviction here so I don't understand why that would
>> >> >> be relevant. And hmm_range_fault() still requires the VMA, although I
>> >> >> need to look at the patches more closely, probably CPUVMA is a DRM
>> >> >
>> >> > 'hmm_range_fault() still requires the VMA' internal yes, but again not
>> >> > as argument. This is about avoiding a driver side lookup of the VMA.
>> >> >
>> >> > CPUVMA == struct vm_area_struct in this email.
>> >> 
>> >> Thanks for the clarification.
>> >> 
>> >>  - Alistair
>> >> 
>> >> > Matt
>> >> >
>> >> >> specific concept?
>> >> >> 
>> >> >> Thanks.
>> >> >> 
>> >> >>  - Alistair
>> >> >> 
>> >> >> > Matt
>> >> >> >  
>> >> >> >>  - Alistair
>> >> >> >> 
>> >> >> >> > Matt
>> >> >> >> >
>> >> >> >> > [1] https://patchwork.freedesktop.org/patch/619809/?series=137870&rev=2
>> >> >> >> >
>> >> >> >> >> Matt
>> >> >> >> >> 
>> >> >> >> >> > > +	}
>> >> >> >> >> > > +
>> >> >> >> >> > > +	migrate_device_unmap(src_pfns, npages, NULL);
>> >> >> >> >> > > +
>> >> >> >> >> > > +	return 0;
>> >> >> >> >> > > +}
>> >> >> >> >> > > +EXPORT_SYMBOL(migrate_device_prepopulated_range);
>> >> >> >> >> > > +
>> >> >> >> >> > >  /*
>> >> >> >> >> > >   * Migrate a device coherent folio back to normal memory. The caller should have
>> >> >> >> >> > >   * a reference on folio which will be copied to the new folio if migration is
>> >> >> >> >> > 
>> >> >> >> 
>> >> >> 
>> >> 
>>
Mika Penttilä Oct. 18, 2024, 6:39 a.m. UTC | #15
On 10/18/24 08:59, Alistair Popple wrote:
> Matthew Brost <matthew.brost@intel.com> writes:
>
>> On Fri, Oct 18, 2024 at 08:58:02AM +1100, Alistair Popple wrote:
>>> Matthew Brost <matthew.brost@intel.com> writes:
>>>
>>>> On Thu, Oct 17, 2024 at 04:49:11PM +1100, Alistair Popple wrote:
>>>>> Matthew Brost <matthew.brost@intel.com> writes:
>>>>>
>>>>>> On Thu, Oct 17, 2024 at 02:21:13PM +1100, Alistair Popple wrote:
>>>>>>> Matthew Brost <matthew.brost@intel.com> writes:
>>>>>>>
>>>>>>>> On Thu, Oct 17, 2024 at 12:49:55PM +1100, Alistair Popple wrote:
>>>>>>>>> Matthew Brost <matthew.brost@intel.com> writes:
>>>>>>>>>
>>>>>>>>>> On Wed, Oct 16, 2024 at 04:46:52AM +0000, Matthew Brost wrote:
>>>>>>>>>>> On Wed, Oct 16, 2024 at 03:04:06PM +1100, Alistair Popple wrote:
>>>>>>>>> [...]
>>>>>>>>>
>>>>>>>>>>>>> +{
>>>>>>>>>>>>> +	unsigned long i;
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +	for (i = 0; i < npages; i++) {
>>>>>>>>>>>>> +		struct page *page = pfn_to_page(src_pfns[i]);
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +		if (!get_page_unless_zero(page)) {
>>>>>>>>>>>>> +			src_pfns[i] = 0;
>>>>>>>>>>>>> +			continue;
>>>>>>>>>>>>> +		}
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +		if (!trylock_page(page)) {
>>>>>>>>>>>>> +			src_pfns[i] = 0;
>>>>>>>>>>>>> +			put_page(page);
>>>>>>>>>>>>> +			continue;
>>>>>>>>>>>>> +		}
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +		src_pfns[i] = migrate_pfn(src_pfns[i]) | MIGRATE_PFN_MIGRATE;
>>>>>>>>>>>> This needs to be converted to use a folio like
>>>>>>>>>>>> migrate_device_range(). But more importantly this should be split out as
>>>>>>>>>>>> a function that both migrate_device_range() and this function can call
>>>>>>>>>>>> given this bit is identical.
>>>>>>>>>>>>
>>>>>>>>>>> Missed the folio conversion and agree a helper shared between this
>>>>>>>>>>> function and migrate_device_range would be a good idea. Let add that.
>>>>>>>>>>>
>>>>>>>>>> Alistair,
>>>>>>>>>>
>>>>>>>>>> Ok, I think now I want to go slightly different direction here to give
>>>>>>>>>> GPUSVM a bit more control over several eviction scenarios.
>>>>>>>>>>
>>>>>>>>>> What if I exported the helper discussed above, e.g.,
>>>>>>>>>>
>>>>>>>>>>  905 unsigned long migrate_device_pfn_lock(unsigned long pfn)
>>>>>>>>>>  906 {
>>>>>>>>>>  907         struct folio *folio;
>>>>>>>>>>  908
>>>>>>>>>>  909         folio = folio_get_nontail_page(pfn_to_page(pfn));
>>>>>>>>>>  910         if (!folio)
>>>>>>>>>>  911                 return 0;
>>>>>>>>>>  912
>>>>>>>>>>  913         if (!folio_trylock(folio)) {
>>>>>>>>>>  914                 folio_put(folio);
>>>>>>>>>>  915                 return 0;
>>>>>>>>>>  916         }
>>>>>>>>>>  917
>>>>>>>>>>  918         return migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
>>>>>>>>>>  919 }
>>>>>>>>>>  920 EXPORT_SYMBOL(migrate_device_pfn_lock);
>>>>>>>>>>
>>>>>>>>>> And then also export migrate_device_unmap.
>>>>>>>>>>
>>>>>>>>>> The usage here would be let a driver collect the device pages in virtual
>>>>>>>>>> address range via hmm_range_fault, lock device pages under notifier
>>>>>>>>>> lock ensuring device pages are valid, drop the notifier lock and call
>>>>>>>>>> migrate_device_unmap.
>>>>>>>>> I'm still working through this series but that seems a bit dubious, the
>>>>>>>>> locking here is pretty subtle and easy to get wrong so seeing some code
>>>>>>>>> would help me a lot in understanding what you're suggesting.
>>>>>>>>>
>>>>>>>> For sure locking in tricky, my mistake on not working through this
>>>>>>>> before sending out the next rev but it came to mind after sending +
>>>>>>>> regarding some late feedback from Thomas about using hmm for eviction
>>>>>>>> [2]. His suggestion of using hmm_range_fault to trigger migration
>>>>>>>> doesn't work for coherent pages, while something like below does.
>>>>>>>>
>>>>>>>> [2] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1125461
>>>>>>>>
>>>>>>>> Here is a snippet I have locally which seems to work.
>>>>>>>>
>>>>>>>> 2024 retry:
>>>>>>>> 2025         hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
>>>>>>>> 2026         hmm_range.hmm_pfns = src;
>>>>>>>> 2027
>>>>>>>> 2028         while (true) {
>>>>>>>> 2029                 mmap_read_lock(mm);
>>>>>>>> 2030                 err = hmm_range_fault(&hmm_range);
>>>>>>>> 2031                 mmap_read_unlock(mm);
>>>>>>>> 2032                 if (err == -EBUSY) {
>>>>>>>> 2033                         if (time_after(jiffies, timeout))
>>>>>>>> 2034                                 break;
>>>>>>>> 2035
>>>>>>>> 2036                         hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
>>>>>>>> 2037                         continue;
>>>>>>>> 2038                 }
>>>>>>>> 2039                 break;
>>>>>>>> 2040         }
>>>>>>>> 2041         if (err)
>>>>>>>> 2042                 goto err_put;
>>>>>>>> 2043
>>>>>>>> 2044         drm_gpusvm_notifier_lock(gpusvm);
>>>>>>>> 2045         if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
>>>>>>>> 2046                 drm_gpusvm_notifier_unlock(gpusvm);
>>>>>>>> 2047                 memset(src, 0, sizeof(*src) * npages);
>>>>>>>> 2048                 goto retry;
>>>>>>>> 2049         }
>>>>>>>> 2050         for (i = 0; i < npages; ++i) {
>>>>>>>> 2051                 struct page *page = hmm_pfn_to_page(src[i]);
>>>>>>>> 2052
>>>>>>>> 2053                 if (page && (is_device_private_page(page) ||
>>>>>>>> 2054                     is_device_coherent_page(page)) && page->zone_device_data)
>>>>>>>> 2055                         src[i] = src[i] & ~HMM_PFN_FLAGS;
>>>>>>>> 2056                 else
>>>>>>>> 2057                         src[i] = 0;
>>>>>>>> 2058                 if (src[i])
>>>>>>>> 2059                         src[i] = migrate_device_pfn_lock(src[i]);
>>>>>>>> 2060         }
>>>>>>>> 2061         drm_gpusvm_notifier_unlock(gpusvm);
>>>>>>> Practically for eviction isn't this much the same as calling
>>>>>>> migrate_vma_setup()? And also for eviction as Sima mentioned you
>>>>>>> probably shouldn't be looking at mm/vma structs.
>>>>>>>
>>>>>> hmm_range_fault is just collecting the pages, internally I suppose it
>>>>>> does look at a VMA (struct vm_area_struct) but I think the point is
>>>>>> drivers should not be looking at VMA here.
>>>>> migrate_vma_setup() is designed to be called by drivers and needs a vma,
>>>>> so in general I don't see a problem with drivers looking up vma's. The
>>>>> problem arises specifically for eviction and whether or not that happens
>>>>> in the driver or hmm_range_fault() is pretty irrelevant IMHO for the
>>>>> issues there (see below).
>>>>>
>>>> Ok, if you think it ok for drivers to lookup the VMA then purposed
>>>> exporting of migrate_device_pfn_lock & migrate_device_unmap is not
>>>> needed, rather just the original function exported in the this patch.
>>>>
>>>> More below too.
>>>>
>>>>>>>> 2063         migrate_device_unmap(src, npages, NULL);
>>>>>>>> ...
>>>>>>>> 2101         migrate_device_pages(src, dst, npages);
>>>>>>>> 2102         migrate_device_finalize(src, dst, npages);
>>>>>>>>
>>>>>>>>
>>>>>>>>>> Sima has strongly suggested avoiding a CPUVMA
>>>>>>>>>> lookup during eviction cases and this would let me fixup
>>>>>>>>>> drm_gpusvm_range_evict in [1] to avoid this.
>>>>>>>>> That sounds reasonable but for context do you have a link to the
>>>>>>>>> comments/discussion on this? I couldn't readily find it, but I may have
>>>>>>>>> just missed it.
>>>>>>>>>
>>>>>>>> See in [4], search for '2. eviction' comment from sima.
>>>>>>> Thanks for pointing that out. For reference here's Sima's comment:
>>>>>>>
>>>>>>>> 2. eviction
>>>>>>>>
>>>>>>>> Requirements much like migrate_to_ram, because otherwise we break the
>>>>>>>> migration gurantee:
>>>>>>>>
>>>>>>>> - Only looking at physical memory datastructures and locks, no looking at
>>>>>>>>   mm/vma structs or relying on those being locked. We rely entirely on
>>>>>>>>   reverse maps from try_to_migrate to find all the mappings on both cpu
>>>>>>>>   and gpu side (cpu only zone device swap or migration pte entries ofc).
>>>>>>> I also very much agree with this. That's basically why I added
>>>>>>> migrate_device_range(), so that we can forcibly evict pages when the
>>>>>>> driver needs them freed (eg. driver unload, low memory, etc.). In
>>>>>>> general it is impossible to guarantee eviction og all pages using just
>>>>>>> hmm_range_fault().
>>>>>>>
>>>>>> In this code path we don't have device pages available, hence the
>>>>>> purposed collection via hmm_range_fault.
>>>>> Why don't you have the pfns requiring eviction available? I need to read
>>>>> this series in more depth, but generally hmm_range_fault() can't
>>>>> gurantee you will find every device page.
>>>>>
>>>> There are two cases for eviction in my series:
>>>>
>>>> 1. TTM decides it needs to move memory. This calls
>>>> drm_gpusvm_evict_to_ram. In this case the device pfns are available
>>>> directly from drm_gpusvm_devmem so the migrate_device_* calls be used
>>>> here albiet with the new function added in this patch as device pfns may
>>>> be non-contiguous.
>>> That makes sense and is generally what I think of when I'm thinking of
>>> eviction. The new function makes sense too - migrate_device_range() was
>>> primarily introduced to allow a driver to evict all device-private pages
>>> from a GPU so didn't consider non-contiguous cases, etc.
>>>
>>>> 2. An inconsistent state for VA range occurs (mixed system and device pages,
>>>> partial unmap of a range, etc...). Here we want to evict the range ram
>>>> to make the state consistent. No device pages are available due to an
>>>> intentional disconnect between a virtual range and physical
>>>> drm_gpusvm_devmem, thus the device pages have to be looked up. This the
>>>> function drm_gpusvm_range_evict. Based on what you tell me, it likely is
>>>> fine the way originally coded in v2 (vma lookup + migrate_vma_*) vs
>>>> using hmm_range_fault like I have suggested here.
>>> Thanks for the explanation. I think vma lookup + migrate_vma_setup() is
>>> fine for this usage and is exactly what you want - it was designed to
>>> either select all the system memory pages or device-private pages within
>>> a VA range and migrate them.
>>>
>>> FWIW I have toyed with the idea of a combined
>>> hmm_range_fault()/migrate_vma_setup() front-end to the rest of the
>>> migrate_vma_*() process but haven't come up with something nice as
>>> yet. I don't think mixing the two in an open-coded fashion is a good
>>> idea though, I'd rather we come up with a new API that addresses the
>>> short-comings of migrate_vma_setup().
>>>
>> I think that would good. Here we actually need to lookup multiple VMAs
>> and have a sequence of migrate_vma_* calls as it possible for VMAs to
>> have changed after the driver range was created. It might be nice to
>> hide the VMA lookup from the drivers with an API saying collect and
>> migrate all pages of a type in a VA range much like hmm_range_fault. If
>> the range spans multiple VMAs that would be hidden from the caller.
> Ok. I wasn't really considering multiple VMAs. UVM and Nouveau don't
> really have a requirement to migrate across multiple VMAs but if that's
> neccessary I think an API that hides that specifically for working with
> migrate_vma_*() might make sense.

Yes that's what I'm currently doing. You call it in a loop, the
fault+migrate prepare part chunks the calls to vma boundaries and you do
the migrations for each vma and loop until the whole range done.


>
>> Matt
>>
>>>> Note #2 may be removed or unnecessary at some point if we decide to add
>>>> support for ininconsistent state in GPU SVM and Xe. Keeping it simple for
>>>> now though. See 'Ranges with mixed system and device pages' in [5].
> As someone not very familiar with some of the DRM layers can I ask why
> having virtual address ranges with a mix of system and device pages is
> hard to support? It seems to me that in practice it might be quite
> difficult to keep a VMA range as exclusively all in system memory or all
> in device memory.
>
>>>> [5] https://patchwork.freedesktop.org/patch/619819/?series=137870&rev=2
>>>>
>>>>>>>> [3] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1110726
>>>>>>>> [4] https://lore.kernel.org/all/BYAPR11MB3159A304925168D8B6B4671292692@BYAPR11MB3159.namprd11.prod.outlook.com/T/#m89cd6a37778ba5271d5381ebeb03e1f963856a78
>>>>>>>>
>>>>>>>>>> It would also make the function exported in this patch unnecessary too
>>>>>>>>>> as non-contiguous pfns can be setup on driver side via
>>>>>>>>>> migrate_device_pfn_lock and then migrate_device_unmap can be called.
>>>>>>>>>> This also another eviction usage in GPUSVM, see drm_gpusvm_evict_to_ram
>>>>>>>>>> in [1].
>>>>>>>>>>
>>>>>>>>>> Do you see an issue exporting migrate_device_pfn_lock,
>>>>>>>>>> migrate_device_unmap?
>>>>>>>>> If there is a good justification for it I can't see a problem with
>>>>>>>>> exporting it. That said I don't really understand why you would
>>>>>>>>> want/need to split those steps up but I'll wait to see the code.
>>>>>>>>>
>>>>>>>> It is so the device pages returned from hmm_range_fault, which are only
>>>>>>>> guaranteed to be valid under the notifier lock + a seqno check, to be
>>>>>>>> locked and ref taken for migration. migrate_device_unmap() can trigger a
>>>>>>>> MMU invalidation which takes the notifier lock thus calling the function
>>>>>>>> which combines migrate_device_pfn_lock + migrate_device_unmap deadlocks.
>>>>>>>>
>>>>>>>> I think this flow makes sense and agree in general this likely better
>>>>>>>> than looking at a CPUVMA.
>>>>>>> I'm still a bit confused about what is better with this flow if you are
>>>>>>> still calling hmm_range_fault(). How is it better than just calling
>>>>>>> migrate_vma_setup()? Obviously it will fault the pages in, but it seems
>>>>>> The code in rev2 calls migrate_vma_setup but the requires a struct
>>>>>> vm_area_struct argument whereas hmm_range_fault does not.
>>>>> I'm not sure that's a good enough justfication because the problem isn't
>>>>> whether you're looking up vma's in driver code or mm code. The problem
>>>>> is you are looking up vma's at all and all that goes with that (mainly
>>>>> taking mmap lock, etc.)
>>>>>
>>>>> And for eviction hmm_range_fault() won't even find all the pages because
>>>>> their virtual address may have changed - consider what happens in cases
>>>>> of mremap(), fork(), etc. So eviction really needs physical pages
>>>>> (pfn's), not virtual addresses.
>>>>>
>>>> See above, #1 yes we use a physical pages. For #2 it is about making the
>>>> state consistent within a virtual address range.
>>> Yep, makes sense now. For migration of physical pages you want
>>> migrate_device_*, virtual address ranges want migrate_vma_*
>>>
>>>  - Alistair
>>>
>>>> Matt
>>>>  
>>>>>>> we're talking about eviction here so I don't understand why that would
>>>>>>> be relevant. And hmm_range_fault() still requires the VMA, although I
>>>>>>> need to look at the patches more closely, probably CPUVMA is a DRM
>>>>>> 'hmm_range_fault() still requires the VMA' internal yes, but again not
>>>>>> as argument. This is about avoiding a driver side lookup of the VMA.
>>>>>>
>>>>>> CPUVMA == struct vm_area_struct in this email.
>>>>> Thanks for the clarification.
>>>>>
>>>>>  - Alistair
>>>>>
>>>>>> Matt
>>>>>>
>>>>>>> specific concept?
>>>>>>>
>>>>>>> Thanks.
>>>>>>>
>>>>>>>  - Alistair
>>>>>>>
>>>>>>>> Matt
>>>>>>>>  
>>>>>>>>>  - Alistair
>>>>>>>>>
>>>>>>>>>> Matt
>>>>>>>>>>
>>>>>>>>>> [1] https://patchwork.freedesktop.org/patch/619809/?series=137870&rev=2
>>>>>>>>>>
>>>>>>>>>>> Matt
>>>>>>>>>>>
>>>>>>>>>>>>> +	}
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +	migrate_device_unmap(src_pfns, npages, NULL);
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +	return 0;
>>>>>>>>>>>>> +}
>>>>>>>>>>>>> +EXPORT_SYMBOL(migrate_device_prepopulated_range);
>>>>>>>>>>>>> +
>>>>>>>>>>>>>  /*
>>>>>>>>>>>>>   * Migrate a device coherent folio back to normal memory. The caller should have
>>>>>>>>>>>>>   * a reference on folio which will be copied to the new folio if migration is
Matthew Brost Oct. 18, 2024, 7:16 a.m. UTC | #16
On Fri, Oct 18, 2024 at 04:59:05PM +1100, Alistair Popple wrote:
> 
> Matthew Brost <matthew.brost@intel.com> writes:
> 
> > On Fri, Oct 18, 2024 at 08:58:02AM +1100, Alistair Popple wrote:
> >> 
> >> Matthew Brost <matthew.brost@intel.com> writes:
> >> 
> >> > On Thu, Oct 17, 2024 at 04:49:11PM +1100, Alistair Popple wrote:
> >> >> 
> >> >> Matthew Brost <matthew.brost@intel.com> writes:
> >> >> 
> >> >> > On Thu, Oct 17, 2024 at 02:21:13PM +1100, Alistair Popple wrote:
> >> >> >> 
> >> >> >> Matthew Brost <matthew.brost@intel.com> writes:
> >> >> >> 
> >> >> >> > On Thu, Oct 17, 2024 at 12:49:55PM +1100, Alistair Popple wrote:
> >> >> >> >> 
> >> >> >> >> Matthew Brost <matthew.brost@intel.com> writes:
> >> >> >> >> 
> >> >> >> >> > On Wed, Oct 16, 2024 at 04:46:52AM +0000, Matthew Brost wrote:
> >> >> >> >> >> On Wed, Oct 16, 2024 at 03:04:06PM +1100, Alistair Popple wrote:
> >> >> >> >> 
> >> >> >> >> [...]
> >> >> >> >> 
> >> >> >> >> >> > > +{
> >> >> >> >> >> > > +	unsigned long i;
> >> >> >> >> >> > > +
> >> >> >> >> >> > > +	for (i = 0; i < npages; i++) {
> >> >> >> >> >> > > +		struct page *page = pfn_to_page(src_pfns[i]);
> >> >> >> >> >> > > +
> >> >> >> >> >> > > +		if (!get_page_unless_zero(page)) {
> >> >> >> >> >> > > +			src_pfns[i] = 0;
> >> >> >> >> >> > > +			continue;
> >> >> >> >> >> > > +		}
> >> >> >> >> >> > > +
> >> >> >> >> >> > > +		if (!trylock_page(page)) {
> >> >> >> >> >> > > +			src_pfns[i] = 0;
> >> >> >> >> >> > > +			put_page(page);
> >> >> >> >> >> > > +			continue;
> >> >> >> >> >> > > +		}
> >> >> >> >> >> > > +
> >> >> >> >> >> > > +		src_pfns[i] = migrate_pfn(src_pfns[i]) | MIGRATE_PFN_MIGRATE;
> >> >> >> >> >> > 
> >> >> >> >> >> > This needs to be converted to use a folio like
> >> >> >> >> >> > migrate_device_range(). But more importantly this should be split out as
> >> >> >> >> >> > a function that both migrate_device_range() and this function can call
> >> >> >> >> >> > given this bit is identical.
> >> >> >> >> >> > 
> >> >> >> >> >> 
> >> >> >> >> >> Missed the folio conversion and agree a helper shared between this
> >> >> >> >> >> function and migrate_device_range would be a good idea. Let add that.
> >> >> >> >> >> 
> >> >> >> >> >
> >> >> >> >> > Alistair,
> >> >> >> >> >
> >> >> >> >> > Ok, I think now I want to go slightly different direction here to give
> >> >> >> >> > GPUSVM a bit more control over several eviction scenarios.
> >> >> >> >> >
> >> >> >> >> > What if I exported the helper discussed above, e.g.,
> >> >> >> >> >
> >> >> >> >> >  905 unsigned long migrate_device_pfn_lock(unsigned long pfn)
> >> >> >> >> >  906 {
> >> >> >> >> >  907         struct folio *folio;
> >> >> >> >> >  908
> >> >> >> >> >  909         folio = folio_get_nontail_page(pfn_to_page(pfn));
> >> >> >> >> >  910         if (!folio)
> >> >> >> >> >  911                 return 0;
> >> >> >> >> >  912
> >> >> >> >> >  913         if (!folio_trylock(folio)) {
> >> >> >> >> >  914                 folio_put(folio);
> >> >> >> >> >  915                 return 0;
> >> >> >> >> >  916         }
> >> >> >> >> >  917
> >> >> >> >> >  918         return migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
> >> >> >> >> >  919 }
> >> >> >> >> >  920 EXPORT_SYMBOL(migrate_device_pfn_lock);
> >> >> >> >> >
> >> >> >> >> > And then also export migrate_device_unmap.
> >> >> >> >> >
> >> >> >> >> > The usage here would be let a driver collect the device pages in virtual
> >> >> >> >> > address range via hmm_range_fault, lock device pages under notifier
> >> >> >> >> > lock ensuring device pages are valid, drop the notifier lock and call
> >> >> >> >> > migrate_device_unmap.
> >> >> >> >> 
> >> >> >> >> I'm still working through this series but that seems a bit dubious, the
> >> >> >> >> locking here is pretty subtle and easy to get wrong so seeing some code
> >> >> >> >> would help me a lot in understanding what you're suggesting.
> >> >> >> >>
> >> >> >> >
> >> >> >> > For sure locking in tricky, my mistake on not working through this
> >> >> >> > before sending out the next rev but it came to mind after sending +
> >> >> >> > regarding some late feedback from Thomas about using hmm for eviction
> >> >> >> > [2]. His suggestion of using hmm_range_fault to trigger migration
> >> >> >> > doesn't work for coherent pages, while something like below does.
> >> >> >> >
> >> >> >> > [2] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1125461
> >> >> >> >
> >> >> >> > Here is a snippet I have locally which seems to work.
> >> >> >> >
> >> >> >> > 2024 retry:
> >> >> >> > 2025         hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> >> >> >> > 2026         hmm_range.hmm_pfns = src;
> >> >> >> > 2027
> >> >> >> > 2028         while (true) {
> >> >> >> > 2029                 mmap_read_lock(mm);
> >> >> >> > 2030                 err = hmm_range_fault(&hmm_range);
> >> >> >> > 2031                 mmap_read_unlock(mm);
> >> >> >> > 2032                 if (err == -EBUSY) {
> >> >> >> > 2033                         if (time_after(jiffies, timeout))
> >> >> >> > 2034                                 break;
> >> >> >> > 2035
> >> >> >> > 2036                         hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> >> >> >> > 2037                         continue;
> >> >> >> > 2038                 }
> >> >> >> > 2039                 break;
> >> >> >> > 2040         }
> >> >> >> > 2041         if (err)
> >> >> >> > 2042                 goto err_put;
> >> >> >> > 2043
> >> >> >> > 2044         drm_gpusvm_notifier_lock(gpusvm);
> >> >> >> > 2045         if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
> >> >> >> > 2046                 drm_gpusvm_notifier_unlock(gpusvm);
> >> >> >> > 2047                 memset(src, 0, sizeof(*src) * npages);
> >> >> >> > 2048                 goto retry;
> >> >> >> > 2049         }
> >> >> >> > 2050         for (i = 0; i < npages; ++i) {
> >> >> >> > 2051                 struct page *page = hmm_pfn_to_page(src[i]);
> >> >> >> > 2052
> >> >> >> > 2053                 if (page && (is_device_private_page(page) ||
> >> >> >> > 2054                     is_device_coherent_page(page)) && page->zone_device_data)
> >> >> >> > 2055                         src[i] = src[i] & ~HMM_PFN_FLAGS;
> >> >> >> > 2056                 else
> >> >> >> > 2057                         src[i] = 0;
> >> >> >> > 2058                 if (src[i])
> >> >> >> > 2059                         src[i] = migrate_device_pfn_lock(src[i]);
> >> >> >> > 2060         }
> >> >> >> > 2061         drm_gpusvm_notifier_unlock(gpusvm);
> >> >> >> 
> >> >> >> Practically for eviction isn't this much the same as calling
> >> >> >> migrate_vma_setup()? And also for eviction as Sima mentioned you
> >> >> >> probably shouldn't be looking at mm/vma structs.
> >> >> >> 
> >> >> >
> >> >> > hmm_range_fault is just collecting the pages, internally I suppose it
> >> >> > does look at a VMA (struct vm_area_struct) but I think the point is
> >> >> > drivers should not be looking at VMA here.
> >> >> 
> >> >> migrate_vma_setup() is designed to be called by drivers and needs a vma,
> >> >> so in general I don't see a problem with drivers looking up vma's. The
> >> >> problem arises specifically for eviction and whether or not that happens
> >> >> in the driver or hmm_range_fault() is pretty irrelevant IMHO for the
> >> >> issues there (see below).
> >> >> 
> >> >
> >> > Ok, if you think it ok for drivers to lookup the VMA then purposed
> >> > exporting of migrate_device_pfn_lock & migrate_device_unmap is not
> >> > needed, rather just the original function exported in the this patch.
> >> >
> >> > More below too.
> >> >
> >> >> >> > 2063         migrate_device_unmap(src, npages, NULL);
> >> >> >> > ...
> >> >> >> > 2101         migrate_device_pages(src, dst, npages);
> >> >> >> > 2102         migrate_device_finalize(src, dst, npages);
> >> >> >> >
> >> >> >> >
> >> >> >> >> > Sima has strongly suggested avoiding a CPUVMA
> >> >> >> >> > lookup during eviction cases and this would let me fixup
> >> >> >> >> > drm_gpusvm_range_evict in [1] to avoid this.
> >> >> >> >> 
> >> >> >> >> That sounds reasonable but for context do you have a link to the
> >> >> >> >> comments/discussion on this? I couldn't readily find it, but I may have
> >> >> >> >> just missed it.
> >> >> >> >> 
> >> >> >> >
> >> >> >> > See in [4], search for '2. eviction' comment from sima.
> >> >> >> 
> >> >> >> Thanks for pointing that out. For reference here's Sima's comment:
> >> >> >> 
> >> >> >> > 2. eviction
> >> >> >> > 
> >> >> >> > Requirements much like migrate_to_ram, because otherwise we break the
> >> >> >> > migration gurantee:
> >> >> >> > 
> >> >> >> > - Only looking at physical memory datastructures and locks, no looking at
> >> >> >> >   mm/vma structs or relying on those being locked. We rely entirely on
> >> >> >> >   reverse maps from try_to_migrate to find all the mappings on both cpu
> >> >> >> >   and gpu side (cpu only zone device swap or migration pte entries ofc).
> >> >> >>
> >> >> >> I also very much agree with this. That's basically why I added
> >> >> >> migrate_device_range(), so that we can forcibly evict pages when the
> >> >> >> driver needs them freed (eg. driver unload, low memory, etc.). In
> >> >> >> general it is impossible to guarantee eviction og all pages using just
> >> >> >> hmm_range_fault().
> >> >> >> 
> >> >> >
> >> >> > In this code path we don't have device pages available, hence the
> >> >> > purposed collection via hmm_range_fault.
> >> >> 
> >> >> Why don't you have the pfns requiring eviction available? I need to read
> >> >> this series in more depth, but generally hmm_range_fault() can't
> >> >> gurantee you will find every device page.
> >> >> 
> >> >
> >> > There are two cases for eviction in my series:
> >> >
> >> > 1. TTM decides it needs to move memory. This calls
> >> > drm_gpusvm_evict_to_ram. In this case the device pfns are available
> >> > directly from drm_gpusvm_devmem so the migrate_device_* calls be used
> >> > here albiet with the new function added in this patch as device pfns may
> >> > be non-contiguous.
> >> 
> >> That makes sense and is generally what I think of when I'm thinking of
> >> eviction. The new function makes sense too - migrate_device_range() was
> >> primarily introduced to allow a driver to evict all device-private pages
> >> from a GPU so didn't consider non-contiguous cases, etc.
> >> 
> >> > 2. An inconsistent state for VA range occurs (mixed system and device pages,
> >> > partial unmap of a range, etc...). Here we want to evict the range ram
> >> > to make the state consistent. No device pages are available due to an
> >> > intentional disconnect between a virtual range and physical
> >> > drm_gpusvm_devmem, thus the device pages have to be looked up. This the
> >> > function drm_gpusvm_range_evict. Based on what you tell me, it likely is
> >> > fine the way originally coded in v2 (vma lookup + migrate_vma_*) vs
> >> > using hmm_range_fault like I have suggested here.
> >> 
> >> Thanks for the explanation. I think vma lookup + migrate_vma_setup() is
> >> fine for this usage and is exactly what you want - it was designed to
> >> either select all the system memory pages or device-private pages within
> >> a VA range and migrate them.
> >> 
> >> FWIW I have toyed with the idea of a combined
> >> hmm_range_fault()/migrate_vma_setup() front-end to the rest of the
> >> migrate_vma_*() process but haven't come up with something nice as
> >> yet. I don't think mixing the two in an open-coded fashion is a good
> >> idea though, I'd rather we come up with a new API that addresses the
> >> short-comings of migrate_vma_setup().
> >> 
> >
> > I think that would good. Here we actually need to lookup multiple VMAs
> > and have a sequence of migrate_vma_* calls as it possible for VMAs to
> > have changed after the driver range was created. It might be nice to
> > hide the VMA lookup from the drivers with an API saying collect and
> > migrate all pages of a type in a VA range much like hmm_range_fault. If
> > the range spans multiple VMAs that would be hidden from the caller.
> 
> Ok. I wasn't really considering multiple VMAs. UVM and Nouveau don't
> really have a requirement to migrate across multiple VMAs but if that's
> neccessary I think an API that hides that specifically for working with
> migrate_vma_*() might make sense.
> 

We can run into multiple VMA scenarios if a user does something rude
like this:

mmap	0x000000...0x1fffff -> fault migrates 2M to VRAM and creates an internal range to track
munmap	0x080000...0x17ffff -> now we have two VMAs instead of one and the range has a hole in it

In this scenario, which we believe to rare / unsual, we just evict
remaining VRAM pages to SRAM, destroy range, and fixup on next GPU
fault.

> > Matt
> >
> >> > Note #2 may be removed or unnecessary at some point if we decide to add
> >> > support for ininconsistent state in GPU SVM and Xe. Keeping it simple for
> >> > now though. See 'Ranges with mixed system and device pages' in [5].
> 
> As someone not very familiar with some of the DRM layers can I ask why
> having virtual address ranges with a mix of system and device pages is
> hard to support? It seems to me that in practice it might be quite
> difficult to keep a VMA range as exclusively all in system memory or all
> in device memory.
>

A few things that make this difficult are:

- Our (Xe) bind code would need to be updated to support this
- TTM / DRM buddy allocator doesn't support freeing / reallocation of
  individual pages rather aligned chunks of initial allocation size
  (e.g., 2M would be common allocation size).
- Spliting ranges would add complications

All workable problems but since we are writing a new common
implementation trying to keep it as simple as possible for initial merge
of the design. Almost certainly at some point we will add support for
mixed ranges to the common GPU SVM layer with a driver choosing if it
wants mixed or non-mixed ranges via a flag to function calls.

wrt to being difficult keeping exclusively in system or vram, in
addition to the above case the only other case I have found in which
this occurs is CPU and GPU faults to same address range racing. This can
cause hmm_range_fault to grab a set mixed pages. In this case again we
do an eviction remaining page and restart the GPU fault.

I don't have real workloads yet but I do have a very aggressive test
case that intentionally does things which could break the design in a
highly parallel manner and the design as work. Is it ideal? Maybe not.
But getting in a simple design which we can build upon is the current
goal.

Matt

> >> > [5] https://patchwork.freedesktop.org/patch/619819/?series=137870&rev=2
> >> >
> >> >> >> > [3] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1110726
> >> >> >> > [4] https://lore.kernel.org/all/BYAPR11MB3159A304925168D8B6B4671292692@BYAPR11MB3159.namprd11.prod.outlook.com/T/#m89cd6a37778ba5271d5381ebeb03e1f963856a78
> >> >> >> >
> >> >> >> >> > It would also make the function exported in this patch unnecessary too
> >> >> >> >> > as non-contiguous pfns can be setup on driver side via
> >> >> >> >> > migrate_device_pfn_lock and then migrate_device_unmap can be called.
> >> >> >> >> > This also another eviction usage in GPUSVM, see drm_gpusvm_evict_to_ram
> >> >> >> >> > in [1].
> >> >> >> >> >
> >> >> >> >> > Do you see an issue exporting migrate_device_pfn_lock,
> >> >> >> >> > migrate_device_unmap?
> >> >> >> >> 
> >> >> >> >> If there is a good justification for it I can't see a problem with
> >> >> >> >> exporting it. That said I don't really understand why you would
> >> >> >> >> want/need to split those steps up but I'll wait to see the code.
> >> >> >> >>
> >> >> >> >
> >> >> >> > It is so the device pages returned from hmm_range_fault, which are only
> >> >> >> > guaranteed to be valid under the notifier lock + a seqno check, to be
> >> >> >> > locked and ref taken for migration. migrate_device_unmap() can trigger a
> >> >> >> > MMU invalidation which takes the notifier lock thus calling the function
> >> >> >> > which combines migrate_device_pfn_lock + migrate_device_unmap deadlocks.
> >> >> >> >
> >> >> >> > I think this flow makes sense and agree in general this likely better
> >> >> >> > than looking at a CPUVMA.
> >> >> >> 
> >> >> >> I'm still a bit confused about what is better with this flow if you are
> >> >> >> still calling hmm_range_fault(). How is it better than just calling
> >> >> >> migrate_vma_setup()? Obviously it will fault the pages in, but it seems
> >> >> >
> >> >> > The code in rev2 calls migrate_vma_setup but the requires a struct
> >> >> > vm_area_struct argument whereas hmm_range_fault does not.
> >> >> 
> >> >> I'm not sure that's a good enough justfication because the problem isn't
> >> >> whether you're looking up vma's in driver code or mm code. The problem
> >> >> is you are looking up vma's at all and all that goes with that (mainly
> >> >> taking mmap lock, etc.)
> >> >> 
> >> >> And for eviction hmm_range_fault() won't even find all the pages because
> >> >> their virtual address may have changed - consider what happens in cases
> >> >> of mremap(), fork(), etc. So eviction really needs physical pages
> >> >> (pfn's), not virtual addresses.
> >> >>
> >> >
> >> > See above, #1 yes we use a physical pages. For #2 it is about making the
> >> > state consistent within a virtual address range.
> >> 
> >> Yep, makes sense now. For migration of physical pages you want
> >> migrate_device_*, virtual address ranges want migrate_vma_*
> >> 
> >>  - Alistair
> >> 
> >> > Matt
> >> >  
> >> >> >> we're talking about eviction here so I don't understand why that would
> >> >> >> be relevant. And hmm_range_fault() still requires the VMA, although I
> >> >> >> need to look at the patches more closely, probably CPUVMA is a DRM
> >> >> >
> >> >> > 'hmm_range_fault() still requires the VMA' internal yes, but again not
> >> >> > as argument. This is about avoiding a driver side lookup of the VMA.
> >> >> >
> >> >> > CPUVMA == struct vm_area_struct in this email.
> >> >> 
> >> >> Thanks for the clarification.
> >> >> 
> >> >>  - Alistair
> >> >> 
> >> >> > Matt
> >> >> >
> >> >> >> specific concept?
> >> >> >> 
> >> >> >> Thanks.
> >> >> >> 
> >> >> >>  - Alistair
> >> >> >> 
> >> >> >> > Matt
> >> >> >> >  
> >> >> >> >>  - Alistair
> >> >> >> >> 
> >> >> >> >> > Matt
> >> >> >> >> >
> >> >> >> >> > [1] https://patchwork.freedesktop.org/patch/619809/?series=137870&rev=2
> >> >> >> >> >
> >> >> >> >> >> Matt
> >> >> >> >> >> 
> >> >> >> >> >> > > +	}
> >> >> >> >> >> > > +
> >> >> >> >> >> > > +	migrate_device_unmap(src_pfns, npages, NULL);
> >> >> >> >> >> > > +
> >> >> >> >> >> > > +	return 0;
> >> >> >> >> >> > > +}
> >> >> >> >> >> > > +EXPORT_SYMBOL(migrate_device_prepopulated_range);
> >> >> >> >> >> > > +
> >> >> >> >> >> > >  /*
> >> >> >> >> >> > >   * Migrate a device coherent folio back to normal memory. The caller should have
> >> >> >> >> >> > >   * a reference on folio which will be copied to the new folio if migration is
> >> >> >> >> >> > 
> >> >> >> >> 
> >> >> >> 
> >> >> 
> >> 
>
Matthew Brost Oct. 18, 2024, 7:33 a.m. UTC | #17
On Fri, Oct 18, 2024 at 07:16:15AM +0000, Matthew Brost wrote:
> On Fri, Oct 18, 2024 at 04:59:05PM +1100, Alistair Popple wrote:
> > 
> > Matthew Brost <matthew.brost@intel.com> writes:
> > 
> > > On Fri, Oct 18, 2024 at 08:58:02AM +1100, Alistair Popple wrote:
> > >> 
> > >> Matthew Brost <matthew.brost@intel.com> writes:
> > >> 
> > >> > On Thu, Oct 17, 2024 at 04:49:11PM +1100, Alistair Popple wrote:
> > >> >> 
> > >> >> Matthew Brost <matthew.brost@intel.com> writes:
> > >> >> 
> > >> >> > On Thu, Oct 17, 2024 at 02:21:13PM +1100, Alistair Popple wrote:
> > >> >> >> 
> > >> >> >> Matthew Brost <matthew.brost@intel.com> writes:
> > >> >> >> 
> > >> >> >> > On Thu, Oct 17, 2024 at 12:49:55PM +1100, Alistair Popple wrote:
> > >> >> >> >> 
> > >> >> >> >> Matthew Brost <matthew.brost@intel.com> writes:
> > >> >> >> >> 
> > >> >> >> >> > On Wed, Oct 16, 2024 at 04:46:52AM +0000, Matthew Brost wrote:
> > >> >> >> >> >> On Wed, Oct 16, 2024 at 03:04:06PM +1100, Alistair Popple wrote:
> > >> >> >> >> 
> > >> >> >> >> [...]
> > >> >> >> >> 
> > >> >> >> >> >> > > +{
> > >> >> >> >> >> > > +	unsigned long i;
> > >> >> >> >> >> > > +
> > >> >> >> >> >> > > +	for (i = 0; i < npages; i++) {
> > >> >> >> >> >> > > +		struct page *page = pfn_to_page(src_pfns[i]);
> > >> >> >> >> >> > > +
> > >> >> >> >> >> > > +		if (!get_page_unless_zero(page)) {
> > >> >> >> >> >> > > +			src_pfns[i] = 0;
> > >> >> >> >> >> > > +			continue;
> > >> >> >> >> >> > > +		}
> > >> >> >> >> >> > > +
> > >> >> >> >> >> > > +		if (!trylock_page(page)) {
> > >> >> >> >> >> > > +			src_pfns[i] = 0;
> > >> >> >> >> >> > > +			put_page(page);
> > >> >> >> >> >> > > +			continue;
> > >> >> >> >> >> > > +		}
> > >> >> >> >> >> > > +
> > >> >> >> >> >> > > +		src_pfns[i] = migrate_pfn(src_pfns[i]) | MIGRATE_PFN_MIGRATE;
> > >> >> >> >> >> > 
> > >> >> >> >> >> > This needs to be converted to use a folio like
> > >> >> >> >> >> > migrate_device_range(). But more importantly this should be split out as
> > >> >> >> >> >> > a function that both migrate_device_range() and this function can call
> > >> >> >> >> >> > given this bit is identical.
> > >> >> >> >> >> > 
> > >> >> >> >> >> 
> > >> >> >> >> >> Missed the folio conversion and agree a helper shared between this
> > >> >> >> >> >> function and migrate_device_range would be a good idea. Let add that.
> > >> >> >> >> >> 
> > >> >> >> >> >
> > >> >> >> >> > Alistair,
> > >> >> >> >> >
> > >> >> >> >> > Ok, I think now I want to go slightly different direction here to give
> > >> >> >> >> > GPUSVM a bit more control over several eviction scenarios.
> > >> >> >> >> >
> > >> >> >> >> > What if I exported the helper discussed above, e.g.,
> > >> >> >> >> >
> > >> >> >> >> >  905 unsigned long migrate_device_pfn_lock(unsigned long pfn)
> > >> >> >> >> >  906 {
> > >> >> >> >> >  907         struct folio *folio;
> > >> >> >> >> >  908
> > >> >> >> >> >  909         folio = folio_get_nontail_page(pfn_to_page(pfn));
> > >> >> >> >> >  910         if (!folio)
> > >> >> >> >> >  911                 return 0;
> > >> >> >> >> >  912
> > >> >> >> >> >  913         if (!folio_trylock(folio)) {
> > >> >> >> >> >  914                 folio_put(folio);
> > >> >> >> >> >  915                 return 0;
> > >> >> >> >> >  916         }
> > >> >> >> >> >  917
> > >> >> >> >> >  918         return migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
> > >> >> >> >> >  919 }
> > >> >> >> >> >  920 EXPORT_SYMBOL(migrate_device_pfn_lock);
> > >> >> >> >> >
> > >> >> >> >> > And then also export migrate_device_unmap.
> > >> >> >> >> >
> > >> >> >> >> > The usage here would be let a driver collect the device pages in virtual
> > >> >> >> >> > address range via hmm_range_fault, lock device pages under notifier
> > >> >> >> >> > lock ensuring device pages are valid, drop the notifier lock and call
> > >> >> >> >> > migrate_device_unmap.
> > >> >> >> >> 
> > >> >> >> >> I'm still working through this series but that seems a bit dubious, the
> > >> >> >> >> locking here is pretty subtle and easy to get wrong so seeing some code
> > >> >> >> >> would help me a lot in understanding what you're suggesting.
> > >> >> >> >>
> > >> >> >> >
> > >> >> >> > For sure locking in tricky, my mistake on not working through this
> > >> >> >> > before sending out the next rev but it came to mind after sending +
> > >> >> >> > regarding some late feedback from Thomas about using hmm for eviction
> > >> >> >> > [2]. His suggestion of using hmm_range_fault to trigger migration
> > >> >> >> > doesn't work for coherent pages, while something like below does.
> > >> >> >> >
> > >> >> >> > [2] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1125461
> > >> >> >> >
> > >> >> >> > Here is a snippet I have locally which seems to work.
> > >> >> >> >
> > >> >> >> > 2024 retry:
> > >> >> >> > 2025         hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> > >> >> >> > 2026         hmm_range.hmm_pfns = src;
> > >> >> >> > 2027
> > >> >> >> > 2028         while (true) {
> > >> >> >> > 2029                 mmap_read_lock(mm);
> > >> >> >> > 2030                 err = hmm_range_fault(&hmm_range);
> > >> >> >> > 2031                 mmap_read_unlock(mm);
> > >> >> >> > 2032                 if (err == -EBUSY) {
> > >> >> >> > 2033                         if (time_after(jiffies, timeout))
> > >> >> >> > 2034                                 break;
> > >> >> >> > 2035
> > >> >> >> > 2036                         hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> > >> >> >> > 2037                         continue;
> > >> >> >> > 2038                 }
> > >> >> >> > 2039                 break;
> > >> >> >> > 2040         }
> > >> >> >> > 2041         if (err)
> > >> >> >> > 2042                 goto err_put;
> > >> >> >> > 2043
> > >> >> >> > 2044         drm_gpusvm_notifier_lock(gpusvm);
> > >> >> >> > 2045         if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
> > >> >> >> > 2046                 drm_gpusvm_notifier_unlock(gpusvm);
> > >> >> >> > 2047                 memset(src, 0, sizeof(*src) * npages);
> > >> >> >> > 2048                 goto retry;
> > >> >> >> > 2049         }
> > >> >> >> > 2050         for (i = 0; i < npages; ++i) {
> > >> >> >> > 2051                 struct page *page = hmm_pfn_to_page(src[i]);
> > >> >> >> > 2052
> > >> >> >> > 2053                 if (page && (is_device_private_page(page) ||
> > >> >> >> > 2054                     is_device_coherent_page(page)) && page->zone_device_data)
> > >> >> >> > 2055                         src[i] = src[i] & ~HMM_PFN_FLAGS;
> > >> >> >> > 2056                 else
> > >> >> >> > 2057                         src[i] = 0;
> > >> >> >> > 2058                 if (src[i])
> > >> >> >> > 2059                         src[i] = migrate_device_pfn_lock(src[i]);
> > >> >> >> > 2060         }
> > >> >> >> > 2061         drm_gpusvm_notifier_unlock(gpusvm);
> > >> >> >> 
> > >> >> >> Practically for eviction isn't this much the same as calling
> > >> >> >> migrate_vma_setup()? And also for eviction as Sima mentioned you
> > >> >> >> probably shouldn't be looking at mm/vma structs.
> > >> >> >> 
> > >> >> >
> > >> >> > hmm_range_fault is just collecting the pages, internally I suppose it
> > >> >> > does look at a VMA (struct vm_area_struct) but I think the point is
> > >> >> > drivers should not be looking at VMA here.
> > >> >> 
> > >> >> migrate_vma_setup() is designed to be called by drivers and needs a vma,
> > >> >> so in general I don't see a problem with drivers looking up vma's. The
> > >> >> problem arises specifically for eviction and whether or not that happens
> > >> >> in the driver or hmm_range_fault() is pretty irrelevant IMHO for the
> > >> >> issues there (see below).
> > >> >> 
> > >> >
> > >> > Ok, if you think it ok for drivers to lookup the VMA then purposed
> > >> > exporting of migrate_device_pfn_lock & migrate_device_unmap is not
> > >> > needed, rather just the original function exported in the this patch.
> > >> >
> > >> > More below too.
> > >> >
> > >> >> >> > 2063         migrate_device_unmap(src, npages, NULL);
> > >> >> >> > ...
> > >> >> >> > 2101         migrate_device_pages(src, dst, npages);
> > >> >> >> > 2102         migrate_device_finalize(src, dst, npages);
> > >> >> >> >
> > >> >> >> >
> > >> >> >> >> > Sima has strongly suggested avoiding a CPUVMA
> > >> >> >> >> > lookup during eviction cases and this would let me fixup
> > >> >> >> >> > drm_gpusvm_range_evict in [1] to avoid this.
> > >> >> >> >> 
> > >> >> >> >> That sounds reasonable but for context do you have a link to the
> > >> >> >> >> comments/discussion on this? I couldn't readily find it, but I may have
> > >> >> >> >> just missed it.
> > >> >> >> >> 
> > >> >> >> >
> > >> >> >> > See in [4], search for '2. eviction' comment from sima.
> > >> >> >> 
> > >> >> >> Thanks for pointing that out. For reference here's Sima's comment:
> > >> >> >> 
> > >> >> >> > 2. eviction
> > >> >> >> > 
> > >> >> >> > Requirements much like migrate_to_ram, because otherwise we break the
> > >> >> >> > migration gurantee:
> > >> >> >> > 
> > >> >> >> > - Only looking at physical memory datastructures and locks, no looking at
> > >> >> >> >   mm/vma structs or relying on those being locked. We rely entirely on
> > >> >> >> >   reverse maps from try_to_migrate to find all the mappings on both cpu
> > >> >> >> >   and gpu side (cpu only zone device swap or migration pte entries ofc).
> > >> >> >>
> > >> >> >> I also very much agree with this. That's basically why I added
> > >> >> >> migrate_device_range(), so that we can forcibly evict pages when the
> > >> >> >> driver needs them freed (eg. driver unload, low memory, etc.). In
> > >> >> >> general it is impossible to guarantee eviction og all pages using just
> > >> >> >> hmm_range_fault().
> > >> >> >> 
> > >> >> >
> > >> >> > In this code path we don't have device pages available, hence the
> > >> >> > purposed collection via hmm_range_fault.
> > >> >> 
> > >> >> Why don't you have the pfns requiring eviction available? I need to read
> > >> >> this series in more depth, but generally hmm_range_fault() can't
> > >> >> gurantee you will find every device page.
> > >> >> 
> > >> >
> > >> > There are two cases for eviction in my series:
> > >> >
> > >> > 1. TTM decides it needs to move memory. This calls
> > >> > drm_gpusvm_evict_to_ram. In this case the device pfns are available
> > >> > directly from drm_gpusvm_devmem so the migrate_device_* calls be used
> > >> > here albiet with the new function added in this patch as device pfns may
> > >> > be non-contiguous.
> > >> 
> > >> That makes sense and is generally what I think of when I'm thinking of
> > >> eviction. The new function makes sense too - migrate_device_range() was
> > >> primarily introduced to allow a driver to evict all device-private pages
> > >> from a GPU so didn't consider non-contiguous cases, etc.
> > >> 
> > >> > 2. An inconsistent state for VA range occurs (mixed system and device pages,
> > >> > partial unmap of a range, etc...). Here we want to evict the range ram
> > >> > to make the state consistent. No device pages are available due to an
> > >> > intentional disconnect between a virtual range and physical
> > >> > drm_gpusvm_devmem, thus the device pages have to be looked up. This the
> > >> > function drm_gpusvm_range_evict. Based on what you tell me, it likely is
> > >> > fine the way originally coded in v2 (vma lookup + migrate_vma_*) vs
> > >> > using hmm_range_fault like I have suggested here.
> > >> 
> > >> Thanks for the explanation. I think vma lookup + migrate_vma_setup() is
> > >> fine for this usage and is exactly what you want - it was designed to
> > >> either select all the system memory pages or device-private pages within
> > >> a VA range and migrate them.
> > >> 
> > >> FWIW I have toyed with the idea of a combined
> > >> hmm_range_fault()/migrate_vma_setup() front-end to the rest of the
> > >> migrate_vma_*() process but haven't come up with something nice as
> > >> yet. I don't think mixing the two in an open-coded fashion is a good
> > >> idea though, I'd rather we come up with a new API that addresses the
> > >> short-comings of migrate_vma_setup().
> > >> 
> > >
> > > I think that would good. Here we actually need to lookup multiple VMAs
> > > and have a sequence of migrate_vma_* calls as it possible for VMAs to
> > > have changed after the driver range was created. It might be nice to
> > > hide the VMA lookup from the drivers with an API saying collect and
> > > migrate all pages of a type in a VA range much like hmm_range_fault. If
> > > the range spans multiple VMAs that would be hidden from the caller.
> > 
> > Ok. I wasn't really considering multiple VMAs. UVM and Nouveau don't
> > really have a requirement to migrate across multiple VMAs but if that's
> > neccessary I think an API that hides that specifically for working with
> > migrate_vma_*() might make sense.
> > 
> 
> We can run into multiple VMA scenarios if a user does something rude
> like this:
> 
> mmap	0x000000...0x1fffff -> fault migrates 2M to VRAM and creates an internal range to track
> munmap	0x080000...0x17ffff -> now we have two VMAs instead of one and the range has a hole in it
> 
> In this scenario, which we believe to rare / unsual, we just evict
> remaining VRAM pages to SRAM, destroy range, and fixup on next GPU
> fault.
> 
> > > Matt
> > >
> > >> > Note #2 may be removed or unnecessary at some point if we decide to add
> > >> > support for ininconsistent state in GPU SVM and Xe. Keeping it simple for
> > >> > now though. See 'Ranges with mixed system and device pages' in [5].
> > 
> > As someone not very familiar with some of the DRM layers can I ask why
> > having virtual address ranges with a mix of system and device pages is
> > hard to support? It seems to me that in practice it might be quite
> > difficult to keep a VMA range as exclusively all in system memory or all
> > in device memory.
> >

Sorry double reply - missed 'VMA range as exclusively' comment.

I'm refering to DRM GPU SVM range being exclusive in system or device
memory not a VMA. A DRM GPU SVM range is relatively small and controled
by the driver, via a table, and VMA boundaries. In Xe, we current
support 4k, 64k, or 2M DRM GPU SVM ranges. So say if a VMA is 1 GB, we
can have individual 2M ranges in either system or device memory based on
access patterns. What range allocation size, that is migration
grainularity which all pages in the range being exclusively in system or
device memory.

Matt

> 
> A few things that make this difficult are:
> 
> - Our (Xe) bind code would need to be updated to support this
> - TTM / DRM buddy allocator doesn't support freeing / reallocation of
>   individual pages rather aligned chunks of initial allocation size
>   (e.g., 2M would be common allocation size).
> - Spliting ranges would add complications
> 
> All workable problems but since we are writing a new common
> implementation trying to keep it as simple as possible for initial merge
> of the design. Almost certainly at some point we will add support for
> mixed ranges to the common GPU SVM layer with a driver choosing if it
> wants mixed or non-mixed ranges via a flag to function calls.
> 
> wrt to being difficult keeping exclusively in system or vram, in
> addition to the above case the only other case I have found in which
> this occurs is CPU and GPU faults to same address range racing. This can
> cause hmm_range_fault to grab a set mixed pages. In this case again we
> do an eviction remaining page and restart the GPU fault.
> 
> I don't have real workloads yet but I do have a very aggressive test
> case that intentionally does things which could break the design in a
> highly parallel manner and the design as work. Is it ideal? Maybe not.
> But getting in a simple design which we can build upon is the current
> goal.
> 
> Matt
> 
> > >> > [5] https://patchwork.freedesktop.org/patch/619819/?series=137870&rev=2
> > >> >
> > >> >> >> > [3] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1110726
> > >> >> >> > [4] https://lore.kernel.org/all/BYAPR11MB3159A304925168D8B6B4671292692@BYAPR11MB3159.namprd11.prod.outlook.com/T/#m89cd6a37778ba5271d5381ebeb03e1f963856a78
> > >> >> >> >
> > >> >> >> >> > It would also make the function exported in this patch unnecessary too
> > >> >> >> >> > as non-contiguous pfns can be setup on driver side via
> > >> >> >> >> > migrate_device_pfn_lock and then migrate_device_unmap can be called.
> > >> >> >> >> > This also another eviction usage in GPUSVM, see drm_gpusvm_evict_to_ram
> > >> >> >> >> > in [1].
> > >> >> >> >> >
> > >> >> >> >> > Do you see an issue exporting migrate_device_pfn_lock,
> > >> >> >> >> > migrate_device_unmap?
> > >> >> >> >> 
> > >> >> >> >> If there is a good justification for it I can't see a problem with
> > >> >> >> >> exporting it. That said I don't really understand why you would
> > >> >> >> >> want/need to split those steps up but I'll wait to see the code.
> > >> >> >> >>
> > >> >> >> >
> > >> >> >> > It is so the device pages returned from hmm_range_fault, which are only
> > >> >> >> > guaranteed to be valid under the notifier lock + a seqno check, to be
> > >> >> >> > locked and ref taken for migration. migrate_device_unmap() can trigger a
> > >> >> >> > MMU invalidation which takes the notifier lock thus calling the function
> > >> >> >> > which combines migrate_device_pfn_lock + migrate_device_unmap deadlocks.
> > >> >> >> >
> > >> >> >> > I think this flow makes sense and agree in general this likely better
> > >> >> >> > than looking at a CPUVMA.
> > >> >> >> 
> > >> >> >> I'm still a bit confused about what is better with this flow if you are
> > >> >> >> still calling hmm_range_fault(). How is it better than just calling
> > >> >> >> migrate_vma_setup()? Obviously it will fault the pages in, but it seems
> > >> >> >
> > >> >> > The code in rev2 calls migrate_vma_setup but the requires a struct
> > >> >> > vm_area_struct argument whereas hmm_range_fault does not.
> > >> >> 
> > >> >> I'm not sure that's a good enough justfication because the problem isn't
> > >> >> whether you're looking up vma's in driver code or mm code. The problem
> > >> >> is you are looking up vma's at all and all that goes with that (mainly
> > >> >> taking mmap lock, etc.)
> > >> >> 
> > >> >> And for eviction hmm_range_fault() won't even find all the pages because
> > >> >> their virtual address may have changed - consider what happens in cases
> > >> >> of mremap(), fork(), etc. So eviction really needs physical pages
> > >> >> (pfn's), not virtual addresses.
> > >> >>
> > >> >
> > >> > See above, #1 yes we use a physical pages. For #2 it is about making the
> > >> > state consistent within a virtual address range.
> > >> 
> > >> Yep, makes sense now. For migration of physical pages you want
> > >> migrate_device_*, virtual address ranges want migrate_vma_*
> > >> 
> > >>  - Alistair
> > >> 
> > >> > Matt
> > >> >  
> > >> >> >> we're talking about eviction here so I don't understand why that would
> > >> >> >> be relevant. And hmm_range_fault() still requires the VMA, although I
> > >> >> >> need to look at the patches more closely, probably CPUVMA is a DRM
> > >> >> >
> > >> >> > 'hmm_range_fault() still requires the VMA' internal yes, but again not
> > >> >> > as argument. This is about avoiding a driver side lookup of the VMA.
> > >> >> >
> > >> >> > CPUVMA == struct vm_area_struct in this email.
> > >> >> 
> > >> >> Thanks for the clarification.
> > >> >> 
> > >> >>  - Alistair
> > >> >> 
> > >> >> > Matt
> > >> >> >
> > >> >> >> specific concept?
> > >> >> >> 
> > >> >> >> Thanks.
> > >> >> >> 
> > >> >> >>  - Alistair
> > >> >> >> 
> > >> >> >> > Matt
> > >> >> >> >  
> > >> >> >> >>  - Alistair
> > >> >> >> >> 
> > >> >> >> >> > Matt
> > >> >> >> >> >
> > >> >> >> >> > [1] https://patchwork.freedesktop.org/patch/619809/?series=137870&rev=2
> > >> >> >> >> >
> > >> >> >> >> >> Matt
> > >> >> >> >> >> 
> > >> >> >> >> >> > > +	}
> > >> >> >> >> >> > > +
> > >> >> >> >> >> > > +	migrate_device_unmap(src_pfns, npages, NULL);
> > >> >> >> >> >> > > +
> > >> >> >> >> >> > > +	return 0;
> > >> >> >> >> >> > > +}
> > >> >> >> >> >> > > +EXPORT_SYMBOL(migrate_device_prepopulated_range);
> > >> >> >> >> >> > > +
> > >> >> >> >> >> > >  /*
> > >> >> >> >> >> > >   * Migrate a device coherent folio back to normal memory. The caller should have
> > >> >> >> >> >> > >   * a reference on folio which will be copied to the new folio if migration is
> > >> >> >> >> >> > 
> > >> >> >> >> 
> > >> >> >> 
> > >> >> 
> > >> 
> >
Alistair Popple Oct. 18, 2024, 7:34 a.m. UTC | #18
Matthew Brost <matthew.brost@intel.com> writes:

> On Fri, Oct 18, 2024 at 04:59:05PM +1100, Alistair Popple wrote:
>> 
>> Matthew Brost <matthew.brost@intel.com> writes:
>> 
>> > On Fri, Oct 18, 2024 at 08:58:02AM +1100, Alistair Popple wrote:
>> >> 
>> >> Matthew Brost <matthew.brost@intel.com> writes:
>> >> 
>> >> > On Thu, Oct 17, 2024 at 04:49:11PM +1100, Alistair Popple wrote:
>> >> >> 
>> >> >> Matthew Brost <matthew.brost@intel.com> writes:
>> >> >> 
>> >> >> > On Thu, Oct 17, 2024 at 02:21:13PM +1100, Alistair Popple wrote:
>> >> >> >> 
>> >> >> >> Matthew Brost <matthew.brost@intel.com> writes:
>> >> >> >> 
>> >> >> >> > On Thu, Oct 17, 2024 at 12:49:55PM +1100, Alistair Popple wrote:
>> >> >> >> >> 
>> >> >> >> >> Matthew Brost <matthew.brost@intel.com> writes:
>> >> >> >> >> 
>> >> >> >> >> > On Wed, Oct 16, 2024 at 04:46:52AM +0000, Matthew Brost wrote:
>> >> >> >> >> >> On Wed, Oct 16, 2024 at 03:04:06PM +1100, Alistair Popple wrote:
>> >> >> >> >> 
>> >> >> >> >> [...]
>> >> >> >> >> 
>> >> >> >> >> >> > > +{
>> >> >> >> >> >> > > +	unsigned long i;
>> >> >> >> >> >> > > +
>> >> >> >> >> >> > > +	for (i = 0; i < npages; i++) {
>> >> >> >> >> >> > > +		struct page *page = pfn_to_page(src_pfns[i]);
>> >> >> >> >> >> > > +
>> >> >> >> >> >> > > +		if (!get_page_unless_zero(page)) {
>> >> >> >> >> >> > > +			src_pfns[i] = 0;
>> >> >> >> >> >> > > +			continue;
>> >> >> >> >> >> > > +		}
>> >> >> >> >> >> > > +
>> >> >> >> >> >> > > +		if (!trylock_page(page)) {
>> >> >> >> >> >> > > +			src_pfns[i] = 0;
>> >> >> >> >> >> > > +			put_page(page);
>> >> >> >> >> >> > > +			continue;
>> >> >> >> >> >> > > +		}
>> >> >> >> >> >> > > +
>> >> >> >> >> >> > > +		src_pfns[i] = migrate_pfn(src_pfns[i]) | MIGRATE_PFN_MIGRATE;
>> >> >> >> >> >> > 
>> >> >> >> >> >> > This needs to be converted to use a folio like
>> >> >> >> >> >> > migrate_device_range(). But more importantly this should be split out as
>> >> >> >> >> >> > a function that both migrate_device_range() and this function can call
>> >> >> >> >> >> > given this bit is identical.
>> >> >> >> >> >> > 
>> >> >> >> >> >> 
>> >> >> >> >> >> Missed the folio conversion and agree a helper shared between this
>> >> >> >> >> >> function and migrate_device_range would be a good idea. Let add that.
>> >> >> >> >> >> 
>> >> >> >> >> >
>> >> >> >> >> > Alistair,
>> >> >> >> >> >
>> >> >> >> >> > Ok, I think now I want to go slightly different direction here to give
>> >> >> >> >> > GPUSVM a bit more control over several eviction scenarios.
>> >> >> >> >> >
>> >> >> >> >> > What if I exported the helper discussed above, e.g.,
>> >> >> >> >> >
>> >> >> >> >> >  905 unsigned long migrate_device_pfn_lock(unsigned long pfn)
>> >> >> >> >> >  906 {
>> >> >> >> >> >  907         struct folio *folio;
>> >> >> >> >> >  908
>> >> >> >> >> >  909         folio = folio_get_nontail_page(pfn_to_page(pfn));
>> >> >> >> >> >  910         if (!folio)
>> >> >> >> >> >  911                 return 0;
>> >> >> >> >> >  912
>> >> >> >> >> >  913         if (!folio_trylock(folio)) {
>> >> >> >> >> >  914                 folio_put(folio);
>> >> >> >> >> >  915                 return 0;
>> >> >> >> >> >  916         }
>> >> >> >> >> >  917
>> >> >> >> >> >  918         return migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
>> >> >> >> >> >  919 }
>> >> >> >> >> >  920 EXPORT_SYMBOL(migrate_device_pfn_lock);
>> >> >> >> >> >
>> >> >> >> >> > And then also export migrate_device_unmap.
>> >> >> >> >> >
>> >> >> >> >> > The usage here would be let a driver collect the device pages in virtual
>> >> >> >> >> > address range via hmm_range_fault, lock device pages under notifier
>> >> >> >> >> > lock ensuring device pages are valid, drop the notifier lock and call
>> >> >> >> >> > migrate_device_unmap.
>> >> >> >> >> 
>> >> >> >> >> I'm still working through this series but that seems a bit dubious, the
>> >> >> >> >> locking here is pretty subtle and easy to get wrong so seeing some code
>> >> >> >> >> would help me a lot in understanding what you're suggesting.
>> >> >> >> >>
>> >> >> >> >
>> >> >> >> > For sure locking in tricky, my mistake on not working through this
>> >> >> >> > before sending out the next rev but it came to mind after sending +
>> >> >> >> > regarding some late feedback from Thomas about using hmm for eviction
>> >> >> >> > [2]. His suggestion of using hmm_range_fault to trigger migration
>> >> >> >> > doesn't work for coherent pages, while something like below does.
>> >> >> >> >
>> >> >> >> > [2] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1125461
>> >> >> >> >
>> >> >> >> > Here is a snippet I have locally which seems to work.
>> >> >> >> >
>> >> >> >> > 2024 retry:
>> >> >> >> > 2025         hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
>> >> >> >> > 2026         hmm_range.hmm_pfns = src;
>> >> >> >> > 2027
>> >> >> >> > 2028         while (true) {
>> >> >> >> > 2029                 mmap_read_lock(mm);
>> >> >> >> > 2030                 err = hmm_range_fault(&hmm_range);
>> >> >> >> > 2031                 mmap_read_unlock(mm);
>> >> >> >> > 2032                 if (err == -EBUSY) {
>> >> >> >> > 2033                         if (time_after(jiffies, timeout))
>> >> >> >> > 2034                                 break;
>> >> >> >> > 2035
>> >> >> >> > 2036                         hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
>> >> >> >> > 2037                         continue;
>> >> >> >> > 2038                 }
>> >> >> >> > 2039                 break;
>> >> >> >> > 2040         }
>> >> >> >> > 2041         if (err)
>> >> >> >> > 2042                 goto err_put;
>> >> >> >> > 2043
>> >> >> >> > 2044         drm_gpusvm_notifier_lock(gpusvm);
>> >> >> >> > 2045         if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
>> >> >> >> > 2046                 drm_gpusvm_notifier_unlock(gpusvm);
>> >> >> >> > 2047                 memset(src, 0, sizeof(*src) * npages);
>> >> >> >> > 2048                 goto retry;
>> >> >> >> > 2049         }
>> >> >> >> > 2050         for (i = 0; i < npages; ++i) {
>> >> >> >> > 2051                 struct page *page = hmm_pfn_to_page(src[i]);
>> >> >> >> > 2052
>> >> >> >> > 2053                 if (page && (is_device_private_page(page) ||
>> >> >> >> > 2054                     is_device_coherent_page(page)) && page->zone_device_data)
>> >> >> >> > 2055                         src[i] = src[i] & ~HMM_PFN_FLAGS;
>> >> >> >> > 2056                 else
>> >> >> >> > 2057                         src[i] = 0;
>> >> >> >> > 2058                 if (src[i])
>> >> >> >> > 2059                         src[i] = migrate_device_pfn_lock(src[i]);
>> >> >> >> > 2060         }
>> >> >> >> > 2061         drm_gpusvm_notifier_unlock(gpusvm);
>> >> >> >> 
>> >> >> >> Practically for eviction isn't this much the same as calling
>> >> >> >> migrate_vma_setup()? And also for eviction as Sima mentioned you
>> >> >> >> probably shouldn't be looking at mm/vma structs.
>> >> >> >> 
>> >> >> >
>> >> >> > hmm_range_fault is just collecting the pages, internally I suppose it
>> >> >> > does look at a VMA (struct vm_area_struct) but I think the point is
>> >> >> > drivers should not be looking at VMA here.
>> >> >> 
>> >> >> migrate_vma_setup() is designed to be called by drivers and needs a vma,
>> >> >> so in general I don't see a problem with drivers looking up vma's. The
>> >> >> problem arises specifically for eviction and whether or not that happens
>> >> >> in the driver or hmm_range_fault() is pretty irrelevant IMHO for the
>> >> >> issues there (see below).
>> >> >> 
>> >> >
>> >> > Ok, if you think it ok for drivers to lookup the VMA then purposed
>> >> > exporting of migrate_device_pfn_lock & migrate_device_unmap is not
>> >> > needed, rather just the original function exported in the this patch.
>> >> >
>> >> > More below too.
>> >> >
>> >> >> >> > 2063         migrate_device_unmap(src, npages, NULL);
>> >> >> >> > ...
>> >> >> >> > 2101         migrate_device_pages(src, dst, npages);
>> >> >> >> > 2102         migrate_device_finalize(src, dst, npages);
>> >> >> >> >
>> >> >> >> >
>> >> >> >> >> > Sima has strongly suggested avoiding a CPUVMA
>> >> >> >> >> > lookup during eviction cases and this would let me fixup
>> >> >> >> >> > drm_gpusvm_range_evict in [1] to avoid this.
>> >> >> >> >> 
>> >> >> >> >> That sounds reasonable but for context do you have a link to the
>> >> >> >> >> comments/discussion on this? I couldn't readily find it, but I may have
>> >> >> >> >> just missed it.
>> >> >> >> >> 
>> >> >> >> >
>> >> >> >> > See in [4], search for '2. eviction' comment from sima.
>> >> >> >> 
>> >> >> >> Thanks for pointing that out. For reference here's Sima's comment:
>> >> >> >> 
>> >> >> >> > 2. eviction
>> >> >> >> > 
>> >> >> >> > Requirements much like migrate_to_ram, because otherwise we break the
>> >> >> >> > migration gurantee:
>> >> >> >> > 
>> >> >> >> > - Only looking at physical memory datastructures and locks, no looking at
>> >> >> >> >   mm/vma structs or relying on those being locked. We rely entirely on
>> >> >> >> >   reverse maps from try_to_migrate to find all the mappings on both cpu
>> >> >> >> >   and gpu side (cpu only zone device swap or migration pte entries ofc).
>> >> >> >>
>> >> >> >> I also very much agree with this. That's basically why I added
>> >> >> >> migrate_device_range(), so that we can forcibly evict pages when the
>> >> >> >> driver needs them freed (eg. driver unload, low memory, etc.). In
>> >> >> >> general it is impossible to guarantee eviction og all pages using just
>> >> >> >> hmm_range_fault().
>> >> >> >> 
>> >> >> >
>> >> >> > In this code path we don't have device pages available, hence the
>> >> >> > purposed collection via hmm_range_fault.
>> >> >> 
>> >> >> Why don't you have the pfns requiring eviction available? I need to read
>> >> >> this series in more depth, but generally hmm_range_fault() can't
>> >> >> gurantee you will find every device page.
>> >> >> 
>> >> >
>> >> > There are two cases for eviction in my series:
>> >> >
>> >> > 1. TTM decides it needs to move memory. This calls
>> >> > drm_gpusvm_evict_to_ram. In this case the device pfns are available
>> >> > directly from drm_gpusvm_devmem so the migrate_device_* calls be used
>> >> > here albiet with the new function added in this patch as device pfns may
>> >> > be non-contiguous.
>> >> 
>> >> That makes sense and is generally what I think of when I'm thinking of
>> >> eviction. The new function makes sense too - migrate_device_range() was
>> >> primarily introduced to allow a driver to evict all device-private pages
>> >> from a GPU so didn't consider non-contiguous cases, etc.
>> >> 
>> >> > 2. An inconsistent state for VA range occurs (mixed system and device pages,
>> >> > partial unmap of a range, etc...). Here we want to evict the range ram
>> >> > to make the state consistent. No device pages are available due to an
>> >> > intentional disconnect between a virtual range and physical
>> >> > drm_gpusvm_devmem, thus the device pages have to be looked up. This the
>> >> > function drm_gpusvm_range_evict. Based on what you tell me, it likely is
>> >> > fine the way originally coded in v2 (vma lookup + migrate_vma_*) vs
>> >> > using hmm_range_fault like I have suggested here.
>> >> 
>> >> Thanks for the explanation. I think vma lookup + migrate_vma_setup() is
>> >> fine for this usage and is exactly what you want - it was designed to
>> >> either select all the system memory pages or device-private pages within
>> >> a VA range and migrate them.
>> >> 
>> >> FWIW I have toyed with the idea of a combined
>> >> hmm_range_fault()/migrate_vma_setup() front-end to the rest of the
>> >> migrate_vma_*() process but haven't come up with something nice as
>> >> yet. I don't think mixing the two in an open-coded fashion is a good
>> >> idea though, I'd rather we come up with a new API that addresses the
>> >> short-comings of migrate_vma_setup().
>> >> 
>> >
>> > I think that would good. Here we actually need to lookup multiple VMAs
>> > and have a sequence of migrate_vma_* calls as it possible for VMAs to
>> > have changed after the driver range was created. It might be nice to
>> > hide the VMA lookup from the drivers with an API saying collect and
>> > migrate all pages of a type in a VA range much like hmm_range_fault. If
>> > the range spans multiple VMAs that would be hidden from the caller.
>> 
>> Ok. I wasn't really considering multiple VMAs. UVM and Nouveau don't
>> really have a requirement to migrate across multiple VMAs but if that's
>> neccessary I think an API that hides that specifically for working with
>> migrate_vma_*() might make sense.
>> 
>
> We can run into multiple VMA scenarios if a user does something rude
> like this:

fork and mremap were the other "rude" things we've had fun with. They
basically mean you can get references to device pages which a driver
can't track with virtual addresses.

> mmap	0x000000...0x1fffff -> fault migrates 2M to VRAM and creates an internal range to track
> munmap	0x080000...0x17ffff -> now we have two VMAs instead of one and the range has a hole in it
>
> In this scenario, which we believe to rare / unsual, we just evict
> remaining VRAM pages to SRAM, destroy range, and fixup on next GPU
> fault.
>
>> > Matt
>> >
>> >> > Note #2 may be removed or unnecessary at some point if we decide to add
>> >> > support for ininconsistent state in GPU SVM and Xe. Keeping it simple for
>> >> > now though. See 'Ranges with mixed system and device pages' in [5].
>> 
>> As someone not very familiar with some of the DRM layers can I ask why
>> having virtual address ranges with a mix of system and device pages is
>> hard to support? It seems to me that in practice it might be quite
>> difficult to keep a VMA range as exclusively all in system memory or all
>> in device memory.
>>
>
> A few things that make this difficult are:
>
> - Our (Xe) bind code would need to be updated to support this
> - TTM / DRM buddy allocator doesn't support freeing / reallocation of
>   individual pages rather aligned chunks of initial allocation size
>   (e.g., 2M would be common allocation size).
> - Spliting ranges would add complications
>
> All workable problems but since we are writing a new common
> implementation trying to keep it as simple as possible for initial merge
> of the design. Almost certainly at some point we will add support for
> mixed ranges to the common GPU SVM layer with a driver choosing if it
> wants mixed or non-mixed ranges via a flag to function calls.
>
> wrt to being difficult keeping exclusively in system or vram, in
> addition to the above case the only other case I have found in which
> this occurs is CPU and GPU faults to same address range racing. This can
> cause hmm_range_fault to grab a set mixed pages. In this case again we
> do an eviction remaining page and restart the GPU fault.
>
> I don't have real workloads yet but I do have a very aggressive test
> case that intentionally does things which could break the design in a
> highly parallel manner and the design as work. Is it ideal? Maybe not.
> But getting in a simple design which we can build upon is the current
> goal.

Taking a simple approach first definitely sounds lie the right approach
to me. I was just interested in the background because it wasn't
something I'd run into (though we built on top of something quite
different to the DRM layer). But I have often thought that the
interfaces we have between core mm and GPU drivers is still a bit too
low level at the moment and is calling out for a slightly higher level
common implementation in the middle so am very interested to see where
this all goes. Thanks.

 - Alistair

> Matt
>
>> >> > [5] https://patchwork.freedesktop.org/patch/619819/?series=137870&rev=2
>> >> >
>> >> >> >> > [3] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1110726
>> >> >> >> > [4] https://lore.kernel.org/all/BYAPR11MB3159A304925168D8B6B4671292692@BYAPR11MB3159.namprd11.prod.outlook.com/T/#m89cd6a37778ba5271d5381ebeb03e1f963856a78
>> >> >> >> >
>> >> >> >> >> > It would also make the function exported in this patch unnecessary too
>> >> >> >> >> > as non-contiguous pfns can be setup on driver side via
>> >> >> >> >> > migrate_device_pfn_lock and then migrate_device_unmap can be called.
>> >> >> >> >> > This also another eviction usage in GPUSVM, see drm_gpusvm_evict_to_ram
>> >> >> >> >> > in [1].
>> >> >> >> >> >
>> >> >> >> >> > Do you see an issue exporting migrate_device_pfn_lock,
>> >> >> >> >> > migrate_device_unmap?
>> >> >> >> >> 
>> >> >> >> >> If there is a good justification for it I can't see a problem with
>> >> >> >> >> exporting it. That said I don't really understand why you would
>> >> >> >> >> want/need to split those steps up but I'll wait to see the code.
>> >> >> >> >>
>> >> >> >> >
>> >> >> >> > It is so the device pages returned from hmm_range_fault, which are only
>> >> >> >> > guaranteed to be valid under the notifier lock + a seqno check, to be
>> >> >> >> > locked and ref taken for migration. migrate_device_unmap() can trigger a
>> >> >> >> > MMU invalidation which takes the notifier lock thus calling the function
>> >> >> >> > which combines migrate_device_pfn_lock + migrate_device_unmap deadlocks.
>> >> >> >> >
>> >> >> >> > I think this flow makes sense and agree in general this likely better
>> >> >> >> > than looking at a CPUVMA.
>> >> >> >> 
>> >> >> >> I'm still a bit confused about what is better with this flow if you are
>> >> >> >> still calling hmm_range_fault(). How is it better than just calling
>> >> >> >> migrate_vma_setup()? Obviously it will fault the pages in, but it seems
>> >> >> >
>> >> >> > The code in rev2 calls migrate_vma_setup but the requires a struct
>> >> >> > vm_area_struct argument whereas hmm_range_fault does not.
>> >> >> 
>> >> >> I'm not sure that's a good enough justfication because the problem isn't
>> >> >> whether you're looking up vma's in driver code or mm code. The problem
>> >> >> is you are looking up vma's at all and all that goes with that (mainly
>> >> >> taking mmap lock, etc.)
>> >> >> 
>> >> >> And for eviction hmm_range_fault() won't even find all the pages because
>> >> >> their virtual address may have changed - consider what happens in cases
>> >> >> of mremap(), fork(), etc. So eviction really needs physical pages
>> >> >> (pfn's), not virtual addresses.
>> >> >>
>> >> >
>> >> > See above, #1 yes we use a physical pages. For #2 it is about making the
>> >> > state consistent within a virtual address range.
>> >> 
>> >> Yep, makes sense now. For migration of physical pages you want
>> >> migrate_device_*, virtual address ranges want migrate_vma_*
>> >> 
>> >>  - Alistair
>> >> 
>> >> > Matt
>> >> >  
>> >> >> >> we're talking about eviction here so I don't understand why that would
>> >> >> >> be relevant. And hmm_range_fault() still requires the VMA, although I
>> >> >> >> need to look at the patches more closely, probably CPUVMA is a DRM
>> >> >> >
>> >> >> > 'hmm_range_fault() still requires the VMA' internal yes, but again not
>> >> >> > as argument. This is about avoiding a driver side lookup of the VMA.
>> >> >> >
>> >> >> > CPUVMA == struct vm_area_struct in this email.
>> >> >> 
>> >> >> Thanks for the clarification.
>> >> >> 
>> >> >>  - Alistair
>> >> >> 
>> >> >> > Matt
>> >> >> >
>> >> >> >> specific concept?
>> >> >> >> 
>> >> >> >> Thanks.
>> >> >> >> 
>> >> >> >>  - Alistair
>> >> >> >> 
>> >> >> >> > Matt
>> >> >> >> >  
>> >> >> >> >>  - Alistair
>> >> >> >> >> 
>> >> >> >> >> > Matt
>> >> >> >> >> >
>> >> >> >> >> > [1] https://patchwork.freedesktop.org/patch/619809/?series=137870&rev=2
>> >> >> >> >> >
>> >> >> >> >> >> Matt
>> >> >> >> >> >> 
>> >> >> >> >> >> > > +	}
>> >> >> >> >> >> > > +
>> >> >> >> >> >> > > +	migrate_device_unmap(src_pfns, npages, NULL);
>> >> >> >> >> >> > > +
>> >> >> >> >> >> > > +	return 0;
>> >> >> >> >> >> > > +}
>> >> >> >> >> >> > > +EXPORT_SYMBOL(migrate_device_prepopulated_range);
>> >> >> >> >> >> > > +
>> >> >> >> >> >> > >  /*
>> >> >> >> >> >> > >   * Migrate a device coherent folio back to normal memory. The caller should have
>> >> >> >> >> >> > >   * a reference on folio which will be copied to the new folio if migration is
>> >> >> >> >> >> > 
>> >> >> >> >> 
>> >> >> >> 
>> >> >> 
>> >> 
>>
Matthew Brost Oct. 18, 2024, 7:57 a.m. UTC | #19
On Fri, Oct 18, 2024 at 06:34:13PM +1100, Alistair Popple wrote:
> 
> Matthew Brost <matthew.brost@intel.com> writes:
> 
> > On Fri, Oct 18, 2024 at 04:59:05PM +1100, Alistair Popple wrote:
> >> 
> >> Matthew Brost <matthew.brost@intel.com> writes:
> >> 
> >> > On Fri, Oct 18, 2024 at 08:58:02AM +1100, Alistair Popple wrote:
> >> >> 
> >> >> Matthew Brost <matthew.brost@intel.com> writes:
> >> >> 
> >> >> > On Thu, Oct 17, 2024 at 04:49:11PM +1100, Alistair Popple wrote:
> >> >> >> 
> >> >> >> Matthew Brost <matthew.brost@intel.com> writes:
> >> >> >> 
> >> >> >> > On Thu, Oct 17, 2024 at 02:21:13PM +1100, Alistair Popple wrote:
> >> >> >> >> 
> >> >> >> >> Matthew Brost <matthew.brost@intel.com> writes:
> >> >> >> >> 
> >> >> >> >> > On Thu, Oct 17, 2024 at 12:49:55PM +1100, Alistair Popple wrote:
> >> >> >> >> >> 
> >> >> >> >> >> Matthew Brost <matthew.brost@intel.com> writes:
> >> >> >> >> >> 
> >> >> >> >> >> > On Wed, Oct 16, 2024 at 04:46:52AM +0000, Matthew Brost wrote:
> >> >> >> >> >> >> On Wed, Oct 16, 2024 at 03:04:06PM +1100, Alistair Popple wrote:
> >> >> >> >> >> 
> >> >> >> >> >> [...]
> >> >> >> >> >> 
> >> >> >> >> >> >> > > +{
> >> >> >> >> >> >> > > +	unsigned long i;
> >> >> >> >> >> >> > > +
> >> >> >> >> >> >> > > +	for (i = 0; i < npages; i++) {
> >> >> >> >> >> >> > > +		struct page *page = pfn_to_page(src_pfns[i]);
> >> >> >> >> >> >> > > +
> >> >> >> >> >> >> > > +		if (!get_page_unless_zero(page)) {
> >> >> >> >> >> >> > > +			src_pfns[i] = 0;
> >> >> >> >> >> >> > > +			continue;
> >> >> >> >> >> >> > > +		}
> >> >> >> >> >> >> > > +
> >> >> >> >> >> >> > > +		if (!trylock_page(page)) {
> >> >> >> >> >> >> > > +			src_pfns[i] = 0;
> >> >> >> >> >> >> > > +			put_page(page);
> >> >> >> >> >> >> > > +			continue;
> >> >> >> >> >> >> > > +		}
> >> >> >> >> >> >> > > +
> >> >> >> >> >> >> > > +		src_pfns[i] = migrate_pfn(src_pfns[i]) | MIGRATE_PFN_MIGRATE;
> >> >> >> >> >> >> > 
> >> >> >> >> >> >> > This needs to be converted to use a folio like
> >> >> >> >> >> >> > migrate_device_range(). But more importantly this should be split out as
> >> >> >> >> >> >> > a function that both migrate_device_range() and this function can call
> >> >> >> >> >> >> > given this bit is identical.
> >> >> >> >> >> >> > 
> >> >> >> >> >> >> 
> >> >> >> >> >> >> Missed the folio conversion and agree a helper shared between this
> >> >> >> >> >> >> function and migrate_device_range would be a good idea. Let add that.
> >> >> >> >> >> >> 
> >> >> >> >> >> >
> >> >> >> >> >> > Alistair,
> >> >> >> >> >> >
> >> >> >> >> >> > Ok, I think now I want to go slightly different direction here to give
> >> >> >> >> >> > GPUSVM a bit more control over several eviction scenarios.
> >> >> >> >> >> >
> >> >> >> >> >> > What if I exported the helper discussed above, e.g.,
> >> >> >> >> >> >
> >> >> >> >> >> >  905 unsigned long migrate_device_pfn_lock(unsigned long pfn)
> >> >> >> >> >> >  906 {
> >> >> >> >> >> >  907         struct folio *folio;
> >> >> >> >> >> >  908
> >> >> >> >> >> >  909         folio = folio_get_nontail_page(pfn_to_page(pfn));
> >> >> >> >> >> >  910         if (!folio)
> >> >> >> >> >> >  911                 return 0;
> >> >> >> >> >> >  912
> >> >> >> >> >> >  913         if (!folio_trylock(folio)) {
> >> >> >> >> >> >  914                 folio_put(folio);
> >> >> >> >> >> >  915                 return 0;
> >> >> >> >> >> >  916         }
> >> >> >> >> >> >  917
> >> >> >> >> >> >  918         return migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
> >> >> >> >> >> >  919 }
> >> >> >> >> >> >  920 EXPORT_SYMBOL(migrate_device_pfn_lock);
> >> >> >> >> >> >
> >> >> >> >> >> > And then also export migrate_device_unmap.
> >> >> >> >> >> >
> >> >> >> >> >> > The usage here would be let a driver collect the device pages in virtual
> >> >> >> >> >> > address range via hmm_range_fault, lock device pages under notifier
> >> >> >> >> >> > lock ensuring device pages are valid, drop the notifier lock and call
> >> >> >> >> >> > migrate_device_unmap.
> >> >> >> >> >> 
> >> >> >> >> >> I'm still working through this series but that seems a bit dubious, the
> >> >> >> >> >> locking here is pretty subtle and easy to get wrong so seeing some code
> >> >> >> >> >> would help me a lot in understanding what you're suggesting.
> >> >> >> >> >>
> >> >> >> >> >
> >> >> >> >> > For sure locking in tricky, my mistake on not working through this
> >> >> >> >> > before sending out the next rev but it came to mind after sending +
> >> >> >> >> > regarding some late feedback from Thomas about using hmm for eviction
> >> >> >> >> > [2]. His suggestion of using hmm_range_fault to trigger migration
> >> >> >> >> > doesn't work for coherent pages, while something like below does.
> >> >> >> >> >
> >> >> >> >> > [2] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1125461
> >> >> >> >> >
> >> >> >> >> > Here is a snippet I have locally which seems to work.
> >> >> >> >> >
> >> >> >> >> > 2024 retry:
> >> >> >> >> > 2025         hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> >> >> >> >> > 2026         hmm_range.hmm_pfns = src;
> >> >> >> >> > 2027
> >> >> >> >> > 2028         while (true) {
> >> >> >> >> > 2029                 mmap_read_lock(mm);
> >> >> >> >> > 2030                 err = hmm_range_fault(&hmm_range);
> >> >> >> >> > 2031                 mmap_read_unlock(mm);
> >> >> >> >> > 2032                 if (err == -EBUSY) {
> >> >> >> >> > 2033                         if (time_after(jiffies, timeout))
> >> >> >> >> > 2034                                 break;
> >> >> >> >> > 2035
> >> >> >> >> > 2036                         hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
> >> >> >> >> > 2037                         continue;
> >> >> >> >> > 2038                 }
> >> >> >> >> > 2039                 break;
> >> >> >> >> > 2040         }
> >> >> >> >> > 2041         if (err)
> >> >> >> >> > 2042                 goto err_put;
> >> >> >> >> > 2043
> >> >> >> >> > 2044         drm_gpusvm_notifier_lock(gpusvm);
> >> >> >> >> > 2045         if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
> >> >> >> >> > 2046                 drm_gpusvm_notifier_unlock(gpusvm);
> >> >> >> >> > 2047                 memset(src, 0, sizeof(*src) * npages);
> >> >> >> >> > 2048                 goto retry;
> >> >> >> >> > 2049         }
> >> >> >> >> > 2050         for (i = 0; i < npages; ++i) {
> >> >> >> >> > 2051                 struct page *page = hmm_pfn_to_page(src[i]);
> >> >> >> >> > 2052
> >> >> >> >> > 2053                 if (page && (is_device_private_page(page) ||
> >> >> >> >> > 2054                     is_device_coherent_page(page)) && page->zone_device_data)
> >> >> >> >> > 2055                         src[i] = src[i] & ~HMM_PFN_FLAGS;
> >> >> >> >> > 2056                 else
> >> >> >> >> > 2057                         src[i] = 0;
> >> >> >> >> > 2058                 if (src[i])
> >> >> >> >> > 2059                         src[i] = migrate_device_pfn_lock(src[i]);
> >> >> >> >> > 2060         }
> >> >> >> >> > 2061         drm_gpusvm_notifier_unlock(gpusvm);
> >> >> >> >> 
> >> >> >> >> Practically for eviction isn't this much the same as calling
> >> >> >> >> migrate_vma_setup()? And also for eviction as Sima mentioned you
> >> >> >> >> probably shouldn't be looking at mm/vma structs.
> >> >> >> >> 
> >> >> >> >
> >> >> >> > hmm_range_fault is just collecting the pages, internally I suppose it
> >> >> >> > does look at a VMA (struct vm_area_struct) but I think the point is
> >> >> >> > drivers should not be looking at VMA here.
> >> >> >> 
> >> >> >> migrate_vma_setup() is designed to be called by drivers and needs a vma,
> >> >> >> so in general I don't see a problem with drivers looking up vma's. The
> >> >> >> problem arises specifically for eviction and whether or not that happens
> >> >> >> in the driver or hmm_range_fault() is pretty irrelevant IMHO for the
> >> >> >> issues there (see below).
> >> >> >> 
> >> >> >
> >> >> > Ok, if you think it ok for drivers to lookup the VMA then purposed
> >> >> > exporting of migrate_device_pfn_lock & migrate_device_unmap is not
> >> >> > needed, rather just the original function exported in the this patch.
> >> >> >
> >> >> > More below too.
> >> >> >
> >> >> >> >> > 2063         migrate_device_unmap(src, npages, NULL);
> >> >> >> >> > ...
> >> >> >> >> > 2101         migrate_device_pages(src, dst, npages);
> >> >> >> >> > 2102         migrate_device_finalize(src, dst, npages);
> >> >> >> >> >
> >> >> >> >> >
> >> >> >> >> >> > Sima has strongly suggested avoiding a CPUVMA
> >> >> >> >> >> > lookup during eviction cases and this would let me fixup
> >> >> >> >> >> > drm_gpusvm_range_evict in [1] to avoid this.
> >> >> >> >> >> 
> >> >> >> >> >> That sounds reasonable but for context do you have a link to the
> >> >> >> >> >> comments/discussion on this? I couldn't readily find it, but I may have
> >> >> >> >> >> just missed it.
> >> >> >> >> >> 
> >> >> >> >> >
> >> >> >> >> > See in [4], search for '2. eviction' comment from sima.
> >> >> >> >> 
> >> >> >> >> Thanks for pointing that out. For reference here's Sima's comment:
> >> >> >> >> 
> >> >> >> >> > 2. eviction
> >> >> >> >> > 
> >> >> >> >> > Requirements much like migrate_to_ram, because otherwise we break the
> >> >> >> >> > migration gurantee:
> >> >> >> >> > 
> >> >> >> >> > - Only looking at physical memory datastructures and locks, no looking at
> >> >> >> >> >   mm/vma structs or relying on those being locked. We rely entirely on
> >> >> >> >> >   reverse maps from try_to_migrate to find all the mappings on both cpu
> >> >> >> >> >   and gpu side (cpu only zone device swap or migration pte entries ofc).
> >> >> >> >>
> >> >> >> >> I also very much agree with this. That's basically why I added
> >> >> >> >> migrate_device_range(), so that we can forcibly evict pages when the
> >> >> >> >> driver needs them freed (eg. driver unload, low memory, etc.). In
> >> >> >> >> general it is impossible to guarantee eviction og all pages using just
> >> >> >> >> hmm_range_fault().
> >> >> >> >> 
> >> >> >> >
> >> >> >> > In this code path we don't have device pages available, hence the
> >> >> >> > purposed collection via hmm_range_fault.
> >> >> >> 
> >> >> >> Why don't you have the pfns requiring eviction available? I need to read
> >> >> >> this series in more depth, but generally hmm_range_fault() can't
> >> >> >> gurantee you will find every device page.
> >> >> >> 
> >> >> >
> >> >> > There are two cases for eviction in my series:
> >> >> >
> >> >> > 1. TTM decides it needs to move memory. This calls
> >> >> > drm_gpusvm_evict_to_ram. In this case the device pfns are available
> >> >> > directly from drm_gpusvm_devmem so the migrate_device_* calls be used
> >> >> > here albiet with the new function added in this patch as device pfns may
> >> >> > be non-contiguous.
> >> >> 
> >> >> That makes sense and is generally what I think of when I'm thinking of
> >> >> eviction. The new function makes sense too - migrate_device_range() was
> >> >> primarily introduced to allow a driver to evict all device-private pages
> >> >> from a GPU so didn't consider non-contiguous cases, etc.
> >> >> 
> >> >> > 2. An inconsistent state for VA range occurs (mixed system and device pages,
> >> >> > partial unmap of a range, etc...). Here we want to evict the range ram
> >> >> > to make the state consistent. No device pages are available due to an
> >> >> > intentional disconnect between a virtual range and physical
> >> >> > drm_gpusvm_devmem, thus the device pages have to be looked up. This the
> >> >> > function drm_gpusvm_range_evict. Based on what you tell me, it likely is
> >> >> > fine the way originally coded in v2 (vma lookup + migrate_vma_*) vs
> >> >> > using hmm_range_fault like I have suggested here.
> >> >> 
> >> >> Thanks for the explanation. I think vma lookup + migrate_vma_setup() is
> >> >> fine for this usage and is exactly what you want - it was designed to
> >> >> either select all the system memory pages or device-private pages within
> >> >> a VA range and migrate them.
> >> >> 
> >> >> FWIW I have toyed with the idea of a combined
> >> >> hmm_range_fault()/migrate_vma_setup() front-end to the rest of the
> >> >> migrate_vma_*() process but haven't come up with something nice as
> >> >> yet. I don't think mixing the two in an open-coded fashion is a good
> >> >> idea though, I'd rather we come up with a new API that addresses the
> >> >> short-comings of migrate_vma_setup().
> >> >> 
> >> >
> >> > I think that would good. Here we actually need to lookup multiple VMAs
> >> > and have a sequence of migrate_vma_* calls as it possible for VMAs to
> >> > have changed after the driver range was created. It might be nice to
> >> > hide the VMA lookup from the drivers with an API saying collect and
> >> > migrate all pages of a type in a VA range much like hmm_range_fault. If
> >> > the range spans multiple VMAs that would be hidden from the caller.
> >> 
> >> Ok. I wasn't really considering multiple VMAs. UVM and Nouveau don't
> >> really have a requirement to migrate across multiple VMAs but if that's
> >> neccessary I think an API that hides that specifically for working with
> >> migrate_vma_*() might make sense.
> >> 
> >
> > We can run into multiple VMA scenarios if a user does something rude
> > like this:
> 
> fork and mremap were the other "rude" things we've had fun with. They
> basically mean you can get references to device pages which a driver
> can't track with virtual addresses.
> 

Yes, I've tested those two and are fun as well. But both are COW which
hasn't turned out to be too difficult.

> > mmap	0x000000...0x1fffff -> fault migrates 2M to VRAM and creates an internal range to track
> > munmap	0x080000...0x17ffff -> now we have two VMAs instead of one and the range has a hole in it
> >
> > In this scenario, which we believe to rare / unsual, we just evict
> > remaining VRAM pages to SRAM, destroy range, and fixup on next GPU
> > fault.
> >
> >> > Matt
> >> >
> >> >> > Note #2 may be removed or unnecessary at some point if we decide to add
> >> >> > support for ininconsistent state in GPU SVM and Xe. Keeping it simple for
> >> >> > now though. See 'Ranges with mixed system and device pages' in [5].
> >> 
> >> As someone not very familiar with some of the DRM layers can I ask why
> >> having virtual address ranges with a mix of system and device pages is
> >> hard to support? It seems to me that in practice it might be quite
> >> difficult to keep a VMA range as exclusively all in system memory or all
> >> in device memory.
> >>
> >
> > A few things that make this difficult are:
> >
> > - Our (Xe) bind code would need to be updated to support this
> > - TTM / DRM buddy allocator doesn't support freeing / reallocation of
> >   individual pages rather aligned chunks of initial allocation size
> >   (e.g., 2M would be common allocation size).
> > - Spliting ranges would add complications
> >
> > All workable problems but since we are writing a new common
> > implementation trying to keep it as simple as possible for initial merge
> > of the design. Almost certainly at some point we will add support for
> > mixed ranges to the common GPU SVM layer with a driver choosing if it
> > wants mixed or non-mixed ranges via a flag to function calls.
> >
> > wrt to being difficult keeping exclusively in system or vram, in
> > addition to the above case the only other case I have found in which
> > this occurs is CPU and GPU faults to same address range racing. This can
> > cause hmm_range_fault to grab a set mixed pages. In this case again we
> > do an eviction remaining page and restart the GPU fault.
> >
> > I don't have real workloads yet but I do have a very aggressive test
> > case that intentionally does things which could break the design in a
> > highly parallel manner and the design as work. Is it ideal? Maybe not.
> > But getting in a simple design which we can build upon is the current
> > goal.
> 
> Taking a simple approach first definitely sounds lie the right approach
> to me. I was just interested in the background because it wasn't
> something I'd run into (though we built on top of something quite
> different to the DRM layer). But I have often thought that the
> interfaces we have between core mm and GPU drivers is still a bit too
> low level at the moment and is calling out for a slightly higher level
> common implementation in the middle so am very interested to see where
> this all goes. Thanks.

The idea is build something common which other DRM drivers can use or
perhaps even pull out of the DRM layer eventually into a common device
layer for SVM.

Matt

> 
>  - Alistair
> 
> > Matt
> >
> >> >> > [5] https://patchwork.freedesktop.org/patch/619819/?series=137870&rev=2
> >> >> >
> >> >> >> >> > [3] https://patchwork.freedesktop.org/patch/610957/?series=137870&rev=1#comment_1110726
> >> >> >> >> > [4] https://lore.kernel.org/all/BYAPR11MB3159A304925168D8B6B4671292692@BYAPR11MB3159.namprd11.prod.outlook.com/T/#m89cd6a37778ba5271d5381ebeb03e1f963856a78
> >> >> >> >> >
> >> >> >> >> >> > It would also make the function exported in this patch unnecessary too
> >> >> >> >> >> > as non-contiguous pfns can be setup on driver side via
> >> >> >> >> >> > migrate_device_pfn_lock and then migrate_device_unmap can be called.
> >> >> >> >> >> > This also another eviction usage in GPUSVM, see drm_gpusvm_evict_to_ram
> >> >> >> >> >> > in [1].
> >> >> >> >> >> >
> >> >> >> >> >> > Do you see an issue exporting migrate_device_pfn_lock,
> >> >> >> >> >> > migrate_device_unmap?
> >> >> >> >> >> 
> >> >> >> >> >> If there is a good justification for it I can't see a problem with
> >> >> >> >> >> exporting it. That said I don't really understand why you would
> >> >> >> >> >> want/need to split those steps up but I'll wait to see the code.
> >> >> >> >> >>
> >> >> >> >> >
> >> >> >> >> > It is so the device pages returned from hmm_range_fault, which are only
> >> >> >> >> > guaranteed to be valid under the notifier lock + a seqno check, to be
> >> >> >> >> > locked and ref taken for migration. migrate_device_unmap() can trigger a
> >> >> >> >> > MMU invalidation which takes the notifier lock thus calling the function
> >> >> >> >> > which combines migrate_device_pfn_lock + migrate_device_unmap deadlocks.
> >> >> >> >> >
> >> >> >> >> > I think this flow makes sense and agree in general this likely better
> >> >> >> >> > than looking at a CPUVMA.
> >> >> >> >> 
> >> >> >> >> I'm still a bit confused about what is better with this flow if you are
> >> >> >> >> still calling hmm_range_fault(). How is it better than just calling
> >> >> >> >> migrate_vma_setup()? Obviously it will fault the pages in, but it seems
> >> >> >> >
> >> >> >> > The code in rev2 calls migrate_vma_setup but the requires a struct
> >> >> >> > vm_area_struct argument whereas hmm_range_fault does not.
> >> >> >> 
> >> >> >> I'm not sure that's a good enough justfication because the problem isn't
> >> >> >> whether you're looking up vma's in driver code or mm code. The problem
> >> >> >> is you are looking up vma's at all and all that goes with that (mainly
> >> >> >> taking mmap lock, etc.)
> >> >> >> 
> >> >> >> And for eviction hmm_range_fault() won't even find all the pages because
> >> >> >> their virtual address may have changed - consider what happens in cases
> >> >> >> of mremap(), fork(), etc. So eviction really needs physical pages
> >> >> >> (pfn's), not virtual addresses.
> >> >> >>
> >> >> >
> >> >> > See above, #1 yes we use a physical pages. For #2 it is about making the
> >> >> > state consistent within a virtual address range.
> >> >> 
> >> >> Yep, makes sense now. For migration of physical pages you want
> >> >> migrate_device_*, virtual address ranges want migrate_vma_*
> >> >> 
> >> >>  - Alistair
> >> >> 
> >> >> > Matt
> >> >> >  
> >> >> >> >> we're talking about eviction here so I don't understand why that would
> >> >> >> >> be relevant. And hmm_range_fault() still requires the VMA, although I
> >> >> >> >> need to look at the patches more closely, probably CPUVMA is a DRM
> >> >> >> >
> >> >> >> > 'hmm_range_fault() still requires the VMA' internal yes, but again not
> >> >> >> > as argument. This is about avoiding a driver side lookup of the VMA.
> >> >> >> >
> >> >> >> > CPUVMA == struct vm_area_struct in this email.
> >> >> >> 
> >> >> >> Thanks for the clarification.
> >> >> >> 
> >> >> >>  - Alistair
> >> >> >> 
> >> >> >> > Matt
> >> >> >> >
> >> >> >> >> specific concept?
> >> >> >> >> 
> >> >> >> >> Thanks.
> >> >> >> >> 
> >> >> >> >>  - Alistair
> >> >> >> >> 
> >> >> >> >> > Matt
> >> >> >> >> >  
> >> >> >> >> >>  - Alistair
> >> >> >> >> >> 
> >> >> >> >> >> > Matt
> >> >> >> >> >> >
> >> >> >> >> >> > [1] https://patchwork.freedesktop.org/patch/619809/?series=137870&rev=2
> >> >> >> >> >> >
> >> >> >> >> >> >> Matt
> >> >> >> >> >> >> 
> >> >> >> >> >> >> > > +	}
> >> >> >> >> >> >> > > +
> >> >> >> >> >> >> > > +	migrate_device_unmap(src_pfns, npages, NULL);
> >> >> >> >> >> >> > > +
> >> >> >> >> >> >> > > +	return 0;
> >> >> >> >> >> >> > > +}
> >> >> >> >> >> >> > > +EXPORT_SYMBOL(migrate_device_prepopulated_range);
> >> >> >> >> >> >> > > +
> >> >> >> >> >> >> > >  /*
> >> >> >> >> >> >> > >   * Migrate a device coherent folio back to normal memory. The caller should have
> >> >> >> >> >> >> > >   * a reference on folio which will be copied to the new folio if migration is
> >> >> >> >> >> >> > 
> >> >> >> >> >> 
> >> >> >> >> 
> >> >> >> 
> >> >> 
> >> 
>
diff mbox series

Patch

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 002e49b2ebd9..9146ed39a2a3 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -229,6 +229,7 @@  void migrate_vma_pages(struct migrate_vma *migrate);
 void migrate_vma_finalize(struct migrate_vma *migrate);
 int migrate_device_range(unsigned long *src_pfns, unsigned long start,
 			unsigned long npages);
+int migrate_device_prepopulated_range(unsigned long *src_pfns, unsigned long npages);
 void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
 			unsigned long npages);
 void migrate_device_finalize(unsigned long *src_pfns,
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 9cf26592ac93..f163c2131022 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -924,6 +924,41 @@  int migrate_device_range(unsigned long *src_pfns, unsigned long start,
 }
 EXPORT_SYMBOL(migrate_device_range);
 
+/**
+ * migrate_device_prepopulated_range() - migrate device private pfns to normal memory.
+ * @src_pfns: pre-popluated array of source device private pfns to migrate.
+ * @npages: number of pages to migrate.
+ *
+ * Similar to migrate_device_range() but supports non-contiguous pre-popluated
+ * array of device pages to migrate.
+ */
+int migrate_device_prepopulated_range(unsigned long *src_pfns, unsigned long npages)
+{
+	unsigned long i;
+
+	for (i = 0; i < npages; i++) {
+		struct page *page = pfn_to_page(src_pfns[i]);
+
+		if (!get_page_unless_zero(page)) {
+			src_pfns[i] = 0;
+			continue;
+		}
+
+		if (!trylock_page(page)) {
+			src_pfns[i] = 0;
+			put_page(page);
+			continue;
+		}
+
+		src_pfns[i] = migrate_pfn(src_pfns[i]) | MIGRATE_PFN_MIGRATE;
+	}
+
+	migrate_device_unmap(src_pfns, npages, NULL);
+
+	return 0;
+}
+EXPORT_SYMBOL(migrate_device_prepopulated_range);
+
 /*
  * Migrate a device coherent folio back to normal memory. The caller should have
  * a reference on folio which will be copied to the new folio if migration is