diff mbox series

[v2,03/29] mm/migrate: Trylock device page in do_swap_page

Message ID 20241016032518.539495-4-matthew.brost@intel.com (mailing list archive)
State New, archived
Headers show
Series Introduce GPU SVM and Xe SVM implementation | expand

Commit Message

Matthew Brost Oct. 16, 2024, 3:24 a.m. UTC
Avoid races between multiple CPU page faults to the same device page by
trying to lock the page in do_swap_page before taking an extra reference
to it. This prevents scenarios where multiple CPU page faults each take
an extra reference to a device page, which could abort migration in
folio_migrate_mapping. With the device page locked in do_swap_page, the
migrate_vma_* functions need to be updated to avoid locking the
fault_page argument.

Prior to this change, a livelock scenario could occur in Xe's (Intel GPU
DRM driver) SVM implementation if enough threads faulted the same device
page.

Cc: Philip Yang <Philip.Yang@amd.com>
Cc: Felix Kuehling <felix.kuehling@amd.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Suggested-by: Simona Vetter <simona.vetter@ffwll.ch>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 mm/memory.c         | 13 ++++++---
 mm/migrate_device.c | 69 ++++++++++++++++++++++++++++++---------------
 2 files changed, 56 insertions(+), 26 deletions(-)
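
For context on the failure mode named in the commit message:
folio_migrate_mapping() bails out when the folio holds more references
than the migration code expects, which is how the extra get_page() calls
from racing faults abort the migration. A simplified sketch of that
check follows; the real code in mm/migrate.c also handles mapped and
file-backed folios and differs in detail between kernel versions:

	/*
	 * Simplified sketch only: migration is abandoned with -EAGAIN
	 * when a racing CPU fault holds an extra folio reference.
	 */
	static int refcount_check_sketch(struct folio *folio, int expected_count)
	{
		if (folio_ref_count(folio) != expected_count)
			return -EAGAIN;	/* surprise reference: abort */
		return 0;		/* refcount as expected: migrate */
	}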

Comments

Alistair Popple Oct. 16, 2024, 4 a.m. UTC | #1
Matthew Brost <matthew.brost@intel.com> writes:

> Avoid multiple CPU page faults to the same device page racing by trying
> to lock the page in do_swap_page before taking an extra reference to the
> page. This prevents scenarios where multiple CPU page faults each take
> an extra reference to a device page, which could abort migration in
> folio_migrate_mapping. With the device page being locked in
> do_swap_page, the migrate_vma_* functions need to be updated to avoid
> locking the fault_page argument.
>
> Prior to this change, a livelock scenario could occur in Xe's (Intel GPU
> DRM driver) SVM implementation if enough threads faulted the same device
> page.
>
> Cc: Philip Yang <Philip.Yang@amd.com>
> Cc: Felix Kuehling <felix.kuehling@amd.com>
> Cc: Christian König <christian.koenig@amd.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Suggessted-by: Simona Vetter <simona.vetter@ffwll.ch>
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>  mm/memory.c         | 13 ++++++---
>  mm/migrate_device.c | 69 ++++++++++++++++++++++++++++++---------------
>  2 files changed, 56 insertions(+), 26 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 2366578015ad..b72bde782611 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4252,10 +4252,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  			 * Get a page reference while we know the page can't be
>  			 * freed.
>  			 */
> -			get_page(vmf->page);
> -			pte_unmap_unlock(vmf->pte, vmf->ptl);
> -			ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
> -			put_page(vmf->page);
> +			if (trylock_page(vmf->page)) {
> +				get_page(vmf->page);
> +				pte_unmap_unlock(vmf->pte, vmf->ptl);
> +				ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
> +				put_page(vmf->page);
> +				unlock_page(vmf->page);

I don't think my previous review of this change has really been
addressed. Why don't we just install the migration entry here? Seems
like it would be a much simpler way of solving this.
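
For reference, do_swap_page() already serializes concurrent faults on
migration entries a few lines above this hunk, which is presumably what
would make installing one here a simpler fix. Paraphrased from the
existing mm/memory.c code, that branch looks roughly like:

	if (is_migration_entry(entry)) {
		/* concurrent faulters sleep until the entry is removed */
		migration_entry_wait(vma->vm_mm, vmf->pmd, vmf->address);
	} else if (is_device_private_entry(entry)) {
		/* ... the hunk under review above ... */
	}

Whether installing such an entry from do_swap_page() is actually simpler
is what the rest of this thread debates.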

> +			} else {
> +				pte_unmap_unlock(vmf->pte, vmf->ptl);
> +			}
>  		} else if (is_hwpoison_entry(entry)) {
>  			ret = VM_FAULT_HWPOISON;
>  		} else if (is_pte_marker_entry(entry)) {
> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> index f163c2131022..2477d39f57be 100644
> --- a/mm/migrate_device.c
> +++ b/mm/migrate_device.c
> @@ -60,6 +60,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>  				   struct mm_walk *walk)
>  {
>  	struct migrate_vma *migrate = walk->private;
> +	struct folio *fault_folio = migrate->fault_page ?
> +		page_folio(migrate->fault_page) : NULL;
>  	struct vm_area_struct *vma = walk->vma;
>  	struct mm_struct *mm = vma->vm_mm;
>  	unsigned long addr = start, unmapped = 0;
> @@ -88,11 +90,13 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>  
>  			folio_get(folio);
>  			spin_unlock(ptl);
> -			if (unlikely(!folio_trylock(folio)))
> +			if (unlikely(fault_folio != folio &&
> +				     !folio_trylock(folio)))
>  				return migrate_vma_collect_skip(start, end,
>  								walk);
>  			ret = split_folio(folio);
> -			folio_unlock(folio);
> +			if (fault_folio != folio)
> +				folio_unlock(folio);
>  			folio_put(folio);
>  			if (ret)
>  				return migrate_vma_collect_skip(start, end,
> @@ -192,7 +196,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>  		 * optimisation to avoid walking the rmap later with
>  		 * try_to_migrate().
>  		 */
> -		if (folio_trylock(folio)) {
> +		if (fault_folio == folio || folio_trylock(folio)) {
>  			bool anon_exclusive;
>  			pte_t swp_pte;
>  
> @@ -204,7 +208,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>  
>  				if (folio_try_share_anon_rmap_pte(folio, page)) {
>  					set_pte_at(mm, addr, ptep, pte);
> -					folio_unlock(folio);
> +					if (fault_folio != folio)
> +						folio_unlock(folio);
>  					folio_put(folio);
>  					mpfn = 0;
>  					goto next;
> @@ -363,6 +368,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
>  					  unsigned long npages,
>  					  struct page *fault_page)
>  {
> +	struct folio *fault_folio = fault_page ?
> +		page_folio(fault_page) : NULL;
>  	unsigned long i, restore = 0;
>  	bool allow_drain = true;
>  	unsigned long unmapped = 0;
> @@ -427,7 +434,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
>  		remove_migration_ptes(folio, folio, 0);
>  
>  		src_pfns[i] = 0;
> -		folio_unlock(folio);
> +		if (fault_folio != folio)
> +			folio_unlock(folio);
>  		folio_put(folio);
>  		restore--;
>  	}
> @@ -536,6 +544,8 @@ int migrate_vma_setup(struct migrate_vma *args)
>  		return -EINVAL;
>  	if (args->fault_page && !is_device_private_page(args->fault_page))
>  		return -EINVAL;
> +	if (args->fault_page && !PageLocked(args->fault_page))
> +		return -EINVAL;
>  
>  	memset(args->src, 0, sizeof(*args->src) * nr_pages);
>  	args->cpages = 0;
> @@ -799,19 +809,13 @@ void migrate_vma_pages(struct migrate_vma *migrate)
>  }
>  EXPORT_SYMBOL(migrate_vma_pages);
>  
> -/*
> - * migrate_device_finalize() - complete page migration
> - * @src_pfns: src_pfns returned from migrate_device_range()
> - * @dst_pfns: array of pfns allocated by the driver to migrate memory to
> - * @npages: number of pages in the range
> - *
> - * Completes migration of the page by removing special migration entries.
> - * Drivers must ensure copying of page data is complete and visible to the CPU
> - * before calling this.
> - */
> -void migrate_device_finalize(unsigned long *src_pfns,
> -			unsigned long *dst_pfns, unsigned long npages)
> +static void __migrate_device_finalize(unsigned long *src_pfns,
> +				      unsigned long *dst_pfns,
> +				      unsigned long npages,
> +				      struct page *fault_page)
>  {
> +	struct folio *fault_folio = fault_page ?
> +		page_folio(fault_page) : NULL;
>  	unsigned long i;
>  
>  	for (i = 0; i < npages; i++) {
> @@ -824,7 +828,8 @@ void migrate_device_finalize(unsigned long *src_pfns,
>  
>  		if (!page) {
>  			if (dst) {
> -				folio_unlock(dst);
> +				if (fault_folio != dst)
> +					folio_unlock(dst);
>  				folio_put(dst);
>  			}
>  			continue;
> @@ -834,14 +839,16 @@ void migrate_device_finalize(unsigned long *src_pfns,
>  
>  		if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !dst) {
>  			if (dst) {
> -				folio_unlock(dst);
> +				if (fault_folio != dst)
> +					folio_unlock(dst);
>  				folio_put(dst);
>  			}
>  			dst = src;
>  		}
>  
>  		remove_migration_ptes(src, dst, 0);
> -		folio_unlock(src);
> +		if (fault_folio != src)
> +			folio_unlock(src);
>  
>  		if (folio_is_zone_device(src))
>  			folio_put(src);
> @@ -849,7 +856,8 @@ void migrate_device_finalize(unsigned long *src_pfns,
>  			folio_putback_lru(src);
>  
>  		if (dst != src) {
> -			folio_unlock(dst);
> +			if (fault_folio != dst)
> +				folio_unlock(dst);
>  			if (folio_is_zone_device(dst))
>  				folio_put(dst);
>  			else
> @@ -857,6 +865,22 @@ void migrate_device_finalize(unsigned long *src_pfns,
>  		}
>  	}
>  }
> +
> +/*
> + * migrate_device_finalize() - complete page migration
> + * @src_pfns: src_pfns returned from migrate_device_range()
> + * @dst_pfns: array of pfns allocated by the driver to migrate memory to
> + * @npages: number of pages in the range
> + *
> + * Completes migration of the page by removing special migration entries.
> + * Drivers must ensure copying of page data is complete and visible to the CPU
> + * before calling this.
> + */
> +void migrate_device_finalize(unsigned long *src_pfns,
> +			unsigned long *dst_pfns, unsigned long npages)
> +{
> +	return __migrate_device_finalize(src_pfns, dst_pfns, npages, NULL);
> +}
>  EXPORT_SYMBOL(migrate_device_finalize);
>  
>  /**
> @@ -872,7 +896,8 @@ EXPORT_SYMBOL(migrate_device_finalize);
>   */
>  void migrate_vma_finalize(struct migrate_vma *migrate)
>  {
> -	migrate_device_finalize(migrate->src, migrate->dst, migrate->npages);
> +	__migrate_device_finalize(migrate->src, migrate->dst, migrate->npages,
> +				  migrate->fault_page);
>  }
>  EXPORT_SYMBOL(migrate_vma_finalize);
Matthew Brost Oct. 16, 2024, 4:41 a.m. UTC | #2
On Wed, Oct 16, 2024 at 03:00:08PM +1100, Alistair Popple wrote:
> 
> Matthew Brost <matthew.brost@intel.com> writes:
> 
> > Avoid multiple CPU page faults to the same device page racing by trying
> > to lock the page in do_swap_page before taking an extra reference to the
> > page. This prevents scenarios where multiple CPU page faults each take
> > an extra reference to a device page, which could abort migration in
> > folio_migrate_mapping. With the device page being locked in
> > do_swap_page, the migrate_vma_* functions need to be updated to avoid
> > locking the fault_page argument.
> >
> > Prior to this change, a livelock scenario could occur in Xe's (Intel GPU
> > DRM driver) SVM implementation if enough threads faulted the same device
> > page.
> >
> > Cc: Philip Yang <Philip.Yang@amd.com>
> > Cc: Felix Kuehling <felix.kuehling@amd.com>
> > Cc: Christian König <christian.koenig@amd.com>
> > Cc: Andrew Morton <akpm@linux-foundation.org>
> > Suggessted-by: Simona Vetter <simona.vetter@ffwll.ch>
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> >  mm/memory.c         | 13 ++++++---
> >  mm/migrate_device.c | 69 ++++++++++++++++++++++++++++++---------------
> >  2 files changed, 56 insertions(+), 26 deletions(-)
> >
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 2366578015ad..b72bde782611 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -4252,10 +4252,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >  			 * Get a page reference while we know the page can't be
> >  			 * freed.
> >  			 */
> > -			get_page(vmf->page);
> > -			pte_unmap_unlock(vmf->pte, vmf->ptl);
> > -			ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
> > -			put_page(vmf->page);
> > +			if (trylock_page(vmf->page)) {
> > +				get_page(vmf->page);
> > +				pte_unmap_unlock(vmf->pte, vmf->ptl);
> > +				ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
> > +				put_page(vmf->page);
> > +				unlock_page(vmf->page);
> 
> I don't think my previous review of this change has really been
> addressed. Why don't we just install the migration entry here? Seems
> like it would be a much simpler way of solving this.
> 

I should have mentioned this in the cover letter: I haven't gotten
around to trying that out yet. I included this existing version for
correctness, but I also think it is not strictly required to merge this
series, since our locking in migrate_to_ram only relies on the core MM
locks, so some thread would eventually win the race and make forward
progress.

So I guess just ignore this patch; I will send an updated version
individually that installs a migration entry in do_swap_page. If for
some reason that doesn't work, I'll respond here explaining why.

Matt

> > +			} else {
> > +				pte_unmap_unlock(vmf->pte, vmf->ptl);
> > +			}
> >  		} else if (is_hwpoison_entry(entry)) {
> >  			ret = VM_FAULT_HWPOISON;
> >  		} else if (is_pte_marker_entry(entry)) {
> > diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> > index f163c2131022..2477d39f57be 100644
> > --- a/mm/migrate_device.c
> > +++ b/mm/migrate_device.c
> > @@ -60,6 +60,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> >  				   struct mm_walk *walk)
> >  {
> >  	struct migrate_vma *migrate = walk->private;
> > +	struct folio *fault_folio = migrate->fault_page ?
> > +		page_folio(migrate->fault_page) : NULL;
> >  	struct vm_area_struct *vma = walk->vma;
> >  	struct mm_struct *mm = vma->vm_mm;
> >  	unsigned long addr = start, unmapped = 0;
> > @@ -88,11 +90,13 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> >  
> >  			folio_get(folio);
> >  			spin_unlock(ptl);
> > -			if (unlikely(!folio_trylock(folio)))
> > +			if (unlikely(fault_folio != folio &&
> > +				     !folio_trylock(folio)))
> >  				return migrate_vma_collect_skip(start, end,
> >  								walk);
> >  			ret = split_folio(folio);
> > -			folio_unlock(folio);
> > +			if (fault_folio != folio)
> > +				folio_unlock(folio);
> >  			folio_put(folio);
> >  			if (ret)
> >  				return migrate_vma_collect_skip(start, end,
> > @@ -192,7 +196,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> >  		 * optimisation to avoid walking the rmap later with
> >  		 * try_to_migrate().
> >  		 */
> > -		if (folio_trylock(folio)) {
> > +		if (fault_folio == folio || folio_trylock(folio)) {
> >  			bool anon_exclusive;
> >  			pte_t swp_pte;
> >  
> > @@ -204,7 +208,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> >  
> >  				if (folio_try_share_anon_rmap_pte(folio, page)) {
> >  					set_pte_at(mm, addr, ptep, pte);
> > -					folio_unlock(folio);
> > +					if (fault_folio != folio)
> > +						folio_unlock(folio);
> >  					folio_put(folio);
> >  					mpfn = 0;
> >  					goto next;
> > @@ -363,6 +368,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
> >  					  unsigned long npages,
> >  					  struct page *fault_page)
> >  {
> > +	struct folio *fault_folio = fault_page ?
> > +		page_folio(fault_page) : NULL;
> >  	unsigned long i, restore = 0;
> >  	bool allow_drain = true;
> >  	unsigned long unmapped = 0;
> > @@ -427,7 +434,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
> >  		remove_migration_ptes(folio, folio, 0);
> >  
> >  		src_pfns[i] = 0;
> > -		folio_unlock(folio);
> > +		if (fault_folio != folio)
> > +			folio_unlock(folio);
> >  		folio_put(folio);
> >  		restore--;
> >  	}
> > @@ -536,6 +544,8 @@ int migrate_vma_setup(struct migrate_vma *args)
> >  		return -EINVAL;
> >  	if (args->fault_page && !is_device_private_page(args->fault_page))
> >  		return -EINVAL;
> > +	if (args->fault_page && !PageLocked(args->fault_page))
> > +		return -EINVAL;
> >  
> >  	memset(args->src, 0, sizeof(*args->src) * nr_pages);
> >  	args->cpages = 0;
> > @@ -799,19 +809,13 @@ void migrate_vma_pages(struct migrate_vma *migrate)
> >  }
> >  EXPORT_SYMBOL(migrate_vma_pages);
> >  
> > -/*
> > - * migrate_device_finalize() - complete page migration
> > - * @src_pfns: src_pfns returned from migrate_device_range()
> > - * @dst_pfns: array of pfns allocated by the driver to migrate memory to
> > - * @npages: number of pages in the range
> > - *
> > - * Completes migration of the page by removing special migration entries.
> > - * Drivers must ensure copying of page data is complete and visible to the CPU
> > - * before calling this.
> > - */
> > -void migrate_device_finalize(unsigned long *src_pfns,
> > -			unsigned long *dst_pfns, unsigned long npages)
> > +static void __migrate_device_finalize(unsigned long *src_pfns,
> > +				      unsigned long *dst_pfns,
> > +				      unsigned long npages,
> > +				      struct page *fault_page)
> >  {
> > +	struct folio *fault_folio = fault_page ?
> > +		page_folio(fault_page) : NULL;
> >  	unsigned long i;
> >  
> >  	for (i = 0; i < npages; i++) {
> > @@ -824,7 +828,8 @@ void migrate_device_finalize(unsigned long *src_pfns,
> >  
> >  		if (!page) {
> >  			if (dst) {
> > -				folio_unlock(dst);
> > +				if (fault_folio != dst)
> > +					folio_unlock(dst);
> >  				folio_put(dst);
> >  			}
> >  			continue;
> > @@ -834,14 +839,16 @@ void migrate_device_finalize(unsigned long *src_pfns,
> >  
> >  		if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !dst) {
> >  			if (dst) {
> > -				folio_unlock(dst);
> > +				if (fault_folio != dst)
> > +					folio_unlock(dst);
> >  				folio_put(dst);
> >  			}
> >  			dst = src;
> >  		}
> >  
> >  		remove_migration_ptes(src, dst, 0);
> > -		folio_unlock(src);
> > +		if (fault_folio != src)
> > +			folio_unlock(src);
> >  
> >  		if (folio_is_zone_device(src))
> >  			folio_put(src);
> > @@ -849,7 +856,8 @@ void migrate_device_finalize(unsigned long *src_pfns,
> >  			folio_putback_lru(src);
> >  
> >  		if (dst != src) {
> > -			folio_unlock(dst);
> > +			if (fault_folio != dst)
> > +				folio_unlock(dst);
> >  			if (folio_is_zone_device(dst))
> >  				folio_put(dst);
> >  			else
> > @@ -857,6 +865,22 @@ void migrate_device_finalize(unsigned long *src_pfns,
> >  		}
> >  	}
> >  }
> > +
> > +/*
> > + * migrate_device_finalize() - complete page migration
> > + * @src_pfns: src_pfns returned from migrate_device_range()
> > + * @dst_pfns: array of pfns allocated by the driver to migrate memory to
> > + * @npages: number of pages in the range
> > + *
> > + * Completes migration of the page by removing special migration entries.
> > + * Drivers must ensure copying of page data is complete and visible to the CPU
> > + * before calling this.
> > + */
> > +void migrate_device_finalize(unsigned long *src_pfns,
> > +			unsigned long *dst_pfns, unsigned long npages)
> > +{
> > +	return __migrate_device_finalize(src_pfns, dst_pfns, npages, NULL);
> > +}
> >  EXPORT_SYMBOL(migrate_device_finalize);
> >  
> >  /**
> > @@ -872,7 +896,8 @@ EXPORT_SYMBOL(migrate_device_finalize);
> >   */
> >  void migrate_vma_finalize(struct migrate_vma *migrate)
> >  {
> > -	migrate_device_finalize(migrate->src, migrate->dst, migrate->npages);
> > +	__migrate_device_finalize(migrate->src, migrate->dst, migrate->npages,
> > +				  migrate->fault_page);
> >  }
> >  EXPORT_SYMBOL(migrate_vma_finalize);
>
Alistair Popple Oct. 17, 2024, 1:51 a.m. UTC | #3
Matthew Brost <matthew.brost@intel.com> writes:

> On Wed, Oct 16, 2024 at 03:00:08PM +1100, Alistair Popple wrote:
>> 
>> Matthew Brost <matthew.brost@intel.com> writes:
>> 
>> > Avoid multiple CPU page faults to the same device page racing by trying
>> > to lock the page in do_swap_page before taking an extra reference to the
>> > page. This prevents scenarios where multiple CPU page faults each take
>> > an extra reference to a device page, which could abort migration in
>> > folio_migrate_mapping. With the device page being locked in
>> > do_swap_page, the migrate_vma_* functions need to be updated to avoid
>> > locking the fault_page argument.
>> >
>> > Prior to this change, a livelock scenario could occur in Xe's (Intel GPU
>> > DRM driver) SVM implementation if enough threads faulted the same device
>> > page.
>> >
>> > Cc: Philip Yang <Philip.Yang@amd.com>
>> > Cc: Felix Kuehling <felix.kuehling@amd.com>
>> > Cc: Christian König <christian.koenig@amd.com>
>> > Cc: Andrew Morton <akpm@linux-foundation.org>
>> > Suggessted-by: Simona Vetter <simona.vetter@ffwll.ch>
>> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>> > ---
>> >  mm/memory.c         | 13 ++++++---
>> >  mm/migrate_device.c | 69 ++++++++++++++++++++++++++++++---------------
>> >  2 files changed, 56 insertions(+), 26 deletions(-)
>> >
>> > diff --git a/mm/memory.c b/mm/memory.c
>> > index 2366578015ad..b72bde782611 100644
>> > --- a/mm/memory.c
>> > +++ b/mm/memory.c
>> > @@ -4252,10 +4252,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>> >  			 * Get a page reference while we know the page can't be
>> >  			 * freed.
>> >  			 */
>> > -			get_page(vmf->page);
>> > -			pte_unmap_unlock(vmf->pte, vmf->ptl);
>> > -			ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
>> > -			put_page(vmf->page);
>> > +			if (trylock_page(vmf->page)) {
>> > +				get_page(vmf->page);
>> > +				pte_unmap_unlock(vmf->pte, vmf->ptl);
>> > +				ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
>> > +				put_page(vmf->page);
>> > +				unlock_page(vmf->page);
>> 
>> I don't think my previous review of this change has really been
>> addressed. Why don't we just install the migration entry here? Seems
>> like it would be a much simpler way of solving this.
>> 
>
> I should have mentioned this in the cover-letter, I haven't got around
> to trying that out yet. Included this existing version for correctness
> but I also think this is not strickly required to merge this series as
> our locking in migrate_to_ram only relies on the core MM locks so
> some thread would eventually win the race and make forward progress.
>
> So I guess just ignore this patch and will send an updated version
> individually with installing a migration entry in do_swap_page. If for
> some reason that doesn't work, I'll respond here explaining why.

That would be great. I have a fairly strong preference for doing that
instead of adding more special cases for the fault page in the migration
code. And if we can't do that it would be good to understand
why. Thanks.

 - Alistair

> Matt
>
>> > +			} else {
>> > +				pte_unmap_unlock(vmf->pte, vmf->ptl);
>> > +			}
>> >  		} else if (is_hwpoison_entry(entry)) {
>> >  			ret = VM_FAULT_HWPOISON;
>> >  		} else if (is_pte_marker_entry(entry)) {
>> > diff --git a/mm/migrate_device.c b/mm/migrate_device.c
>> > index f163c2131022..2477d39f57be 100644
>> > --- a/mm/migrate_device.c
>> > +++ b/mm/migrate_device.c
>> > @@ -60,6 +60,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>> >  				   struct mm_walk *walk)
>> >  {
>> >  	struct migrate_vma *migrate = walk->private;
>> > +	struct folio *fault_folio = migrate->fault_page ?
>> > +		page_folio(migrate->fault_page) : NULL;
>> >  	struct vm_area_struct *vma = walk->vma;
>> >  	struct mm_struct *mm = vma->vm_mm;
>> >  	unsigned long addr = start, unmapped = 0;
>> > @@ -88,11 +90,13 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>> >  
>> >  			folio_get(folio);
>> >  			spin_unlock(ptl);
>> > -			if (unlikely(!folio_trylock(folio)))
>> > +			if (unlikely(fault_folio != folio &&
>> > +				     !folio_trylock(folio)))
>> >  				return migrate_vma_collect_skip(start, end,
>> >  								walk);
>> >  			ret = split_folio(folio);
>> > -			folio_unlock(folio);
>> > +			if (fault_folio != folio)
>> > +				folio_unlock(folio);
>> >  			folio_put(folio);
>> >  			if (ret)
>> >  				return migrate_vma_collect_skip(start, end,
>> > @@ -192,7 +196,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>> >  		 * optimisation to avoid walking the rmap later with
>> >  		 * try_to_migrate().
>> >  		 */
>> > -		if (folio_trylock(folio)) {
>> > +		if (fault_folio == folio || folio_trylock(folio)) {
>> >  			bool anon_exclusive;
>> >  			pte_t swp_pte;
>> >  
>> > @@ -204,7 +208,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>> >  
>> >  				if (folio_try_share_anon_rmap_pte(folio, page)) {
>> >  					set_pte_at(mm, addr, ptep, pte);
>> > -					folio_unlock(folio);
>> > +					if (fault_folio != folio)
>> > +						folio_unlock(folio);
>> >  					folio_put(folio);
>> >  					mpfn = 0;
>> >  					goto next;
>> > @@ -363,6 +368,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
>> >  					  unsigned long npages,
>> >  					  struct page *fault_page)
>> >  {
>> > +	struct folio *fault_folio = fault_page ?
>> > +		page_folio(fault_page) : NULL;
>> >  	unsigned long i, restore = 0;
>> >  	bool allow_drain = true;
>> >  	unsigned long unmapped = 0;
>> > @@ -427,7 +434,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
>> >  		remove_migration_ptes(folio, folio, 0);
>> >  
>> >  		src_pfns[i] = 0;
>> > -		folio_unlock(folio);
>> > +		if (fault_folio != folio)
>> > +			folio_unlock(folio);
>> >  		folio_put(folio);
>> >  		restore--;
>> >  	}
>> > @@ -536,6 +544,8 @@ int migrate_vma_setup(struct migrate_vma *args)
>> >  		return -EINVAL;
>> >  	if (args->fault_page && !is_device_private_page(args->fault_page))
>> >  		return -EINVAL;
>> > +	if (args->fault_page && !PageLocked(args->fault_page))
>> > +		return -EINVAL;
>> >  
>> >  	memset(args->src, 0, sizeof(*args->src) * nr_pages);
>> >  	args->cpages = 0;
>> > @@ -799,19 +809,13 @@ void migrate_vma_pages(struct migrate_vma *migrate)
>> >  }
>> >  EXPORT_SYMBOL(migrate_vma_pages);
>> >  
>> > -/*
>> > - * migrate_device_finalize() - complete page migration
>> > - * @src_pfns: src_pfns returned from migrate_device_range()
>> > - * @dst_pfns: array of pfns allocated by the driver to migrate memory to
>> > - * @npages: number of pages in the range
>> > - *
>> > - * Completes migration of the page by removing special migration entries.
>> > - * Drivers must ensure copying of page data is complete and visible to the CPU
>> > - * before calling this.
>> > - */
>> > -void migrate_device_finalize(unsigned long *src_pfns,
>> > -			unsigned long *dst_pfns, unsigned long npages)
>> > +static void __migrate_device_finalize(unsigned long *src_pfns,
>> > +				      unsigned long *dst_pfns,
>> > +				      unsigned long npages,
>> > +				      struct page *fault_page)
>> >  {
>> > +	struct folio *fault_folio = fault_page ?
>> > +		page_folio(fault_page) : NULL;
>> >  	unsigned long i;
>> >  
>> >  	for (i = 0; i < npages; i++) {
>> > @@ -824,7 +828,8 @@ void migrate_device_finalize(unsigned long *src_pfns,
>> >  
>> >  		if (!page) {
>> >  			if (dst) {
>> > -				folio_unlock(dst);
>> > +				if (fault_folio != dst)
>> > +					folio_unlock(dst);
>> >  				folio_put(dst);
>> >  			}
>> >  			continue;
>> > @@ -834,14 +839,16 @@ void migrate_device_finalize(unsigned long *src_pfns,
>> >  
>> >  		if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !dst) {
>> >  			if (dst) {
>> > -				folio_unlock(dst);
>> > +				if (fault_folio != dst)
>> > +					folio_unlock(dst);
>> >  				folio_put(dst);
>> >  			}
>> >  			dst = src;
>> >  		}
>> >  
>> >  		remove_migration_ptes(src, dst, 0);
>> > -		folio_unlock(src);
>> > +		if (fault_folio != src)
>> > +			folio_unlock(src);
>> >  
>> >  		if (folio_is_zone_device(src))
>> >  			folio_put(src);
>> > @@ -849,7 +856,8 @@ void migrate_device_finalize(unsigned long *src_pfns,
>> >  			folio_putback_lru(src);
>> >  
>> >  		if (dst != src) {
>> > -			folio_unlock(dst);
>> > +			if (fault_folio != dst)
>> > +				folio_unlock(dst);
>> >  			if (folio_is_zone_device(dst))
>> >  				folio_put(dst);
>> >  			else
>> > @@ -857,6 +865,22 @@ void migrate_device_finalize(unsigned long *src_pfns,
>> >  		}
>> >  	}
>> >  }
>> > +
>> > +/*
>> > + * migrate_device_finalize() - complete page migration
>> > + * @src_pfns: src_pfns returned from migrate_device_range()
>> > + * @dst_pfns: array of pfns allocated by the driver to migrate memory to
>> > + * @npages: number of pages in the range
>> > + *
>> > + * Completes migration of the page by removing special migration entries.
>> > + * Drivers must ensure copying of page data is complete and visible to the CPU
>> > + * before calling this.
>> > + */
>> > +void migrate_device_finalize(unsigned long *src_pfns,
>> > +			unsigned long *dst_pfns, unsigned long npages)
>> > +{
>> > +	return __migrate_device_finalize(src_pfns, dst_pfns, npages, NULL);
>> > +}
>> >  EXPORT_SYMBOL(migrate_device_finalize);
>> >  
>> >  /**
>> > @@ -872,7 +896,8 @@ EXPORT_SYMBOL(migrate_device_finalize);
>> >   */
>> >  void migrate_vma_finalize(struct migrate_vma *migrate)
>> >  {
>> > -	migrate_device_finalize(migrate->src, migrate->dst, migrate->npages);
>> > +	__migrate_device_finalize(migrate->src, migrate->dst, migrate->npages,
>> > +				  migrate->fault_page);
>> >  }
>> >  EXPORT_SYMBOL(migrate_vma_finalize);
>>
Matthew Brost Oct. 25, 2024, 12:31 a.m. UTC | #4
On Thu, Oct 17, 2024 at 12:51:08PM +1100, Alistair Popple wrote:
> 
> Matthew Brost <matthew.brost@intel.com> writes:
> 
> > On Wed, Oct 16, 2024 at 03:00:08PM +1100, Alistair Popple wrote:
> >> 
> >> Matthew Brost <matthew.brost@intel.com> writes:
> >> 
> >> > Avoid multiple CPU page faults to the same device page racing by trying
> >> > to lock the page in do_swap_page before taking an extra reference to the
> >> > page. This prevents scenarios where multiple CPU page faults each take
> >> > an extra reference to a device page, which could abort migration in
> >> > folio_migrate_mapping. With the device page being locked in
> >> > do_swap_page, the migrate_vma_* functions need to be updated to avoid
> >> > locking the fault_page argument.
> >> >
> >> > Prior to this change, a livelock scenario could occur in Xe's (Intel GPU
> >> > DRM driver) SVM implementation if enough threads faulted the same device
> >> > page.
> >> >
> >> > Cc: Philip Yang <Philip.Yang@amd.com>
> >> > Cc: Felix Kuehling <felix.kuehling@amd.com>
> >> > Cc: Christian König <christian.koenig@amd.com>
> >> > Cc: Andrew Morton <akpm@linux-foundation.org>
> >> > Suggessted-by: Simona Vetter <simona.vetter@ffwll.ch>
> >> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> >> > ---
> >> >  mm/memory.c         | 13 ++++++---
> >> >  mm/migrate_device.c | 69 ++++++++++++++++++++++++++++++---------------
> >> >  2 files changed, 56 insertions(+), 26 deletions(-)
> >> >
> >> > diff --git a/mm/memory.c b/mm/memory.c
> >> > index 2366578015ad..b72bde782611 100644
> >> > --- a/mm/memory.c
> >> > +++ b/mm/memory.c
> >> > @@ -4252,10 +4252,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >> >  			 * Get a page reference while we know the page can't be
> >> >  			 * freed.
> >> >  			 */
> >> > -			get_page(vmf->page);
> >> > -			pte_unmap_unlock(vmf->pte, vmf->ptl);
> >> > -			ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
> >> > -			put_page(vmf->page);
> >> > +			if (trylock_page(vmf->page)) {
> >> > +				get_page(vmf->page);
> >> > +				pte_unmap_unlock(vmf->pte, vmf->ptl);
> >> > +				ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
> >> > +				put_page(vmf->page);
> >> > +				unlock_page(vmf->page);
> >> 
> >> I don't think my previous review of this change has really been
> >> addressed. Why don't we just install the migration entry here? Seems
> >> like it would be a much simpler way of solving this.
> >> 
> >
> > I should have mentioned this in the cover-letter, I haven't got around
> > to trying that out yet. Included this existing version for correctness
> > but I also think this is not strickly required to merge this series as
> > our locking in migrate_to_ram only relies on the core MM locks so
> > some thread would eventually win the race and make forward progress.
> >
> > So I guess just ignore this patch and will send an updated version
> > individually with installing a migration entry in do_swap_page. If for
> > some reason that doesn't work, I'll respond here explaining why.
> 
> That would be great. I have a fairly strong preference for doing that
> instead of adding more special cases for the fault page in the migration
> code. And if we can't do that it would be good to understand
> why. Thanks.
> 

I've looked into this and actually prefer the approach in this patch.

Consider the scenario where we install a migration entry, but
migrate_to_ram fails. How do we handle this?

We don't know where migrate_to_ram failed. Was migrate_device_finalize
called, removing the migration PTE? Do we need to special-case failures
in migrate_to_ram to prevent migrate_device_finalize from removing the
faulting page's migration entry? Should we check for a migration entry
after migrate_to_ram and remove it if it exists?

Now, if migrate_to_ram succeeds, it seems the migration entry should be
removed in migrate_device_finalize since the new page is only available
there. We could return the new page in migrate_to_ram, but that feels
messy.

Additionally, the page lock needs to be held across migrate_to_ram, as
this patch does, so we'll require some special handling in
migrate_device_finalize to avoid unlocking the faulting page.

Finally, installing a migration entry is non-trivial, while taking a
page reference under a lock is straightforward.

Given all this, I prefer to keep this patch as it is.

Matt

>  - Alistair
> 
> > Matt
> >
> >> > +			} else {
> >> > +				pte_unmap_unlock(vmf->pte, vmf->ptl);
> >> > +			}
> >> >  		} else if (is_hwpoison_entry(entry)) {
> >> >  			ret = VM_FAULT_HWPOISON;
> >> >  		} else if (is_pte_marker_entry(entry)) {
> >> > diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> >> > index f163c2131022..2477d39f57be 100644
> >> > --- a/mm/migrate_device.c
> >> > +++ b/mm/migrate_device.c
> >> > @@ -60,6 +60,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> >> >  				   struct mm_walk *walk)
> >> >  {
> >> >  	struct migrate_vma *migrate = walk->private;
> >> > +	struct folio *fault_folio = migrate->fault_page ?
> >> > +		page_folio(migrate->fault_page) : NULL;
> >> >  	struct vm_area_struct *vma = walk->vma;
> >> >  	struct mm_struct *mm = vma->vm_mm;
> >> >  	unsigned long addr = start, unmapped = 0;
> >> > @@ -88,11 +90,13 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> >> >  
> >> >  			folio_get(folio);
> >> >  			spin_unlock(ptl);
> >> > -			if (unlikely(!folio_trylock(folio)))
> >> > +			if (unlikely(fault_folio != folio &&
> >> > +				     !folio_trylock(folio)))
> >> >  				return migrate_vma_collect_skip(start, end,
> >> >  								walk);
> >> >  			ret = split_folio(folio);
> >> > -			folio_unlock(folio);
> >> > +			if (fault_folio != folio)
> >> > +				folio_unlock(folio);
> >> >  			folio_put(folio);
> >> >  			if (ret)
> >> >  				return migrate_vma_collect_skip(start, end,
> >> > @@ -192,7 +196,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> >> >  		 * optimisation to avoid walking the rmap later with
> >> >  		 * try_to_migrate().
> >> >  		 */
> >> > -		if (folio_trylock(folio)) {
> >> > +		if (fault_folio == folio || folio_trylock(folio)) {
> >> >  			bool anon_exclusive;
> >> >  			pte_t swp_pte;
> >> >  
> >> > @@ -204,7 +208,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> >> >  
> >> >  				if (folio_try_share_anon_rmap_pte(folio, page)) {
> >> >  					set_pte_at(mm, addr, ptep, pte);
> >> > -					folio_unlock(folio);
> >> > +					if (fault_folio != folio)
> >> > +						folio_unlock(folio);
> >> >  					folio_put(folio);
> >> >  					mpfn = 0;
> >> >  					goto next;
> >> > @@ -363,6 +368,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
> >> >  					  unsigned long npages,
> >> >  					  struct page *fault_page)
> >> >  {
> >> > +	struct folio *fault_folio = fault_page ?
> >> > +		page_folio(fault_page) : NULL;
> >> >  	unsigned long i, restore = 0;
> >> >  	bool allow_drain = true;
> >> >  	unsigned long unmapped = 0;
> >> > @@ -427,7 +434,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
> >> >  		remove_migration_ptes(folio, folio, 0);
> >> >  
> >> >  		src_pfns[i] = 0;
> >> > -		folio_unlock(folio);
> >> > +		if (fault_folio != folio)
> >> > +			folio_unlock(folio);
> >> >  		folio_put(folio);
> >> >  		restore--;
> >> >  	}
> >> > @@ -536,6 +544,8 @@ int migrate_vma_setup(struct migrate_vma *args)
> >> >  		return -EINVAL;
> >> >  	if (args->fault_page && !is_device_private_page(args->fault_page))
> >> >  		return -EINVAL;
> >> > +	if (args->fault_page && !PageLocked(args->fault_page))
> >> > +		return -EINVAL;
> >> >  
> >> >  	memset(args->src, 0, sizeof(*args->src) * nr_pages);
> >> >  	args->cpages = 0;
> >> > @@ -799,19 +809,13 @@ void migrate_vma_pages(struct migrate_vma *migrate)
> >> >  }
> >> >  EXPORT_SYMBOL(migrate_vma_pages);
> >> >  
> >> > -/*
> >> > - * migrate_device_finalize() - complete page migration
> >> > - * @src_pfns: src_pfns returned from migrate_device_range()
> >> > - * @dst_pfns: array of pfns allocated by the driver to migrate memory to
> >> > - * @npages: number of pages in the range
> >> > - *
> >> > - * Completes migration of the page by removing special migration entries.
> >> > - * Drivers must ensure copying of page data is complete and visible to the CPU
> >> > - * before calling this.
> >> > - */
> >> > -void migrate_device_finalize(unsigned long *src_pfns,
> >> > -			unsigned long *dst_pfns, unsigned long npages)
> >> > +static void __migrate_device_finalize(unsigned long *src_pfns,
> >> > +				      unsigned long *dst_pfns,
> >> > +				      unsigned long npages,
> >> > +				      struct page *fault_page)
> >> >  {
> >> > +	struct folio *fault_folio = fault_page ?
> >> > +		page_folio(fault_page) : NULL;
> >> >  	unsigned long i;
> >> >  
> >> >  	for (i = 0; i < npages; i++) {
> >> > @@ -824,7 +828,8 @@ void migrate_device_finalize(unsigned long *src_pfns,
> >> >  
> >> >  		if (!page) {
> >> >  			if (dst) {
> >> > -				folio_unlock(dst);
> >> > +				if (fault_folio != dst)
> >> > +					folio_unlock(dst);
> >> >  				folio_put(dst);
> >> >  			}
> >> >  			continue;
> >> > @@ -834,14 +839,16 @@ void migrate_device_finalize(unsigned long *src_pfns,
> >> >  
> >> >  		if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !dst) {
> >> >  			if (dst) {
> >> > -				folio_unlock(dst);
> >> > +				if (fault_folio != dst)
> >> > +					folio_unlock(dst);
> >> >  				folio_put(dst);
> >> >  			}
> >> >  			dst = src;
> >> >  		}
> >> >  
> >> >  		remove_migration_ptes(src, dst, 0);
> >> > -		folio_unlock(src);
> >> > +		if (fault_folio != src)
> >> > +			folio_unlock(src);
> >> >  
> >> >  		if (folio_is_zone_device(src))
> >> >  			folio_put(src);
> >> > @@ -849,7 +856,8 @@ void migrate_device_finalize(unsigned long *src_pfns,
> >> >  			folio_putback_lru(src);
> >> >  
> >> >  		if (dst != src) {
> >> > -			folio_unlock(dst);
> >> > +			if (fault_folio != dst)
> >> > +				folio_unlock(dst);
> >> >  			if (folio_is_zone_device(dst))
> >> >  				folio_put(dst);
> >> >  			else
> >> > @@ -857,6 +865,22 @@ void migrate_device_finalize(unsigned long *src_pfns,
> >> >  		}
> >> >  	}
> >> >  }
> >> > +
> >> > +/*
> >> > + * migrate_device_finalize() - complete page migration
> >> > + * @src_pfns: src_pfns returned from migrate_device_range()
> >> > + * @dst_pfns: array of pfns allocated by the driver to migrate memory to
> >> > + * @npages: number of pages in the range
> >> > + *
> >> > + * Completes migration of the page by removing special migration entries.
> >> > + * Drivers must ensure copying of page data is complete and visible to the CPU
> >> > + * before calling this.
> >> > + */
> >> > +void migrate_device_finalize(unsigned long *src_pfns,
> >> > +			unsigned long *dst_pfns, unsigned long npages)
> >> > +{
> >> > +	return __migrate_device_finalize(src_pfns, dst_pfns, npages, NULL);
> >> > +}
> >> >  EXPORT_SYMBOL(migrate_device_finalize);
> >> >  
> >> >  /**
> >> > @@ -872,7 +896,8 @@ EXPORT_SYMBOL(migrate_device_finalize);
> >> >   */
> >> >  void migrate_vma_finalize(struct migrate_vma *migrate)
> >> >  {
> >> > -	migrate_device_finalize(migrate->src, migrate->dst, migrate->npages);
> >> > +	__migrate_device_finalize(migrate->src, migrate->dst, migrate->npages,
> >> > +				  migrate->fault_page);
> >> >  }
> >> >  EXPORT_SYMBOL(migrate_vma_finalize);
> >> 
>
Alistair Popple Oct. 29, 2024, 6:37 a.m. UTC | #5
Matthew Brost <matthew.brost@intel.com> writes:

> On Thu, Oct 17, 2024 at 12:51:08PM +1100, Alistair Popple wrote:
>> 
>> Matthew Brost <matthew.brost@intel.com> writes:
>> 
>> > On Wed, Oct 16, 2024 at 03:00:08PM +1100, Alistair Popple wrote:
>> >> 
>> >> Matthew Brost <matthew.brost@intel.com> writes:
>> >> 
>> >> > Avoid multiple CPU page faults to the same device page racing by trying
>> >> > to lock the page in do_swap_page before taking an extra reference to the
>> >> > page. This prevents scenarios where multiple CPU page faults each take
>> >> > an extra reference to a device page, which could abort migration in
>> >> > folio_migrate_mapping. With the device page being locked in
>> >> > do_swap_page, the migrate_vma_* functions need to be updated to avoid
>> >> > locking the fault_page argument.
>> >> >
>> >> > Prior to this change, a livelock scenario could occur in Xe's (Intel GPU
>> >> > DRM driver) SVM implementation if enough threads faulted the same device
>> >> > page.
>> >> >
>> >> > Cc: Philip Yang <Philip.Yang@amd.com>
>> >> > Cc: Felix Kuehling <felix.kuehling@amd.com>
>> >> > Cc: Christian König <christian.koenig@amd.com>
>> >> > Cc: Andrew Morton <akpm@linux-foundation.org>
>> >> > Suggessted-by: Simona Vetter <simona.vetter@ffwll.ch>
>> >> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>> >> > ---
>> >> >  mm/memory.c         | 13 ++++++---
>> >> >  mm/migrate_device.c | 69 ++++++++++++++++++++++++++++++---------------
>> >> >  2 files changed, 56 insertions(+), 26 deletions(-)
>> >> >
>> >> > diff --git a/mm/memory.c b/mm/memory.c
>> >> > index 2366578015ad..b72bde782611 100644
>> >> > --- a/mm/memory.c
>> >> > +++ b/mm/memory.c
>> >> > @@ -4252,10 +4252,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>> >> >  			 * Get a page reference while we know the page can't be
>> >> >  			 * freed.
>> >> >  			 */
>> >> > -			get_page(vmf->page);
>> >> > -			pte_unmap_unlock(vmf->pte, vmf->ptl);
>> >> > -			ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
>> >> > -			put_page(vmf->page);
>> >> > +			if (trylock_page(vmf->page)) {
>> >> > +				get_page(vmf->page);
>> >> > +				pte_unmap_unlock(vmf->pte, vmf->ptl);
>> >> > +				ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
>> >> > +				put_page(vmf->page);
>> >> > +				unlock_page(vmf->page);
>> >> 
>> >> I don't think my previous review of this change has really been
>> >> addressed. Why don't we just install the migration entry here? Seems
>> >> like it would be a much simpler way of solving this.
>> >> 
>> >
>> > I should have mentioned this in the cover-letter, I haven't got around
>> > to trying that out yet. Included this existing version for correctness
>> > but I also think this is not strickly required to merge this series as
>> > our locking in migrate_to_ram only relies on the core MM locks so
>> > some thread would eventually win the race and make forward progress.
>> >
>> > So I guess just ignore this patch and will send an updated version
>> > individually with installing a migration entry in do_swap_page. If for
>> > some reason that doesn't work, I'll respond here explaining why.
>> 
>> That would be great. I have a fairly strong preference for doing that
>> instead of adding more special cases for the fault page in the migration
>> code. And if we can't do that it would be good to understand
>> why. Thanks.
>> 
>
> I've looked into this and actually prefer the approach in this patch.

Thanks for looking into this.

> Consider the scenario where we install a migration entry, but
> migrate_to_ram fails. How do we handle this?
>
> We don't know where migrate_to_ram failed. Was migrate_device_finalize
> called, removing the migration PTE? Do we need to special-case failures
> in migrate_to_ram to prevent migrate_device_finalize from removing the
> faulting page's migration entry? Should we check for a migration entry
> after migrate_to_ram and remove it if it exists?

The driver should always call migrate_device_finalize(). On failure it
will remove the migration entry and remap the original device-private
page. That obviously doesn't handle the fault, but the process is about
to die with SIGBUS anyway, because migrate_to_ram() has no way to fail
gracefully.
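
To make that expectation concrete, below is a minimal sketch of a driver
migrate_to_ram() callback in which migrate_vma_pages() and
migrate_vma_finalize() run on both the success and the failure path.
drv_copy_to_host() and drv_pgmap_owner are hypothetical placeholders;
real drivers (e.g. lib/test_hmm.c or nouveau) follow the same shape. With
this patch applied, the fault page arrives already locked and
migrate_vma_setup() verifies PageLocked(fault_page):

static vm_fault_t drv_migrate_to_ram(struct vm_fault *vmf)
{
	unsigned long src = 0, dst = 0;
	vm_fault_t ret = 0;
	struct migrate_vma args = {
		.vma		= vmf->vma,
		.start		= vmf->address,
		.end		= vmf->address + PAGE_SIZE,
		.src		= &src,
		.dst		= &dst,
		.pgmap_owner	= drv_pgmap_owner,	/* hypothetical */
		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
		.fault_page	= vmf->page,
	};

	if (migrate_vma_setup(&args))
		return VM_FAULT_SIGBUS;

	if (src & MIGRATE_PFN_MIGRATE) {
		struct page *dpage = alloc_page_vma(GFP_HIGHUSER, vmf->vma,
						    vmf->address);

		if (dpage) {
			lock_page(dpage);
			/* hypothetical device-to-host copy */
			drv_copy_to_host(dpage, migrate_pfn_to_page(src));
			dst = migrate_pfn(page_to_pfn(dpage));
		} else {
			ret = VM_FAULT_SIGBUS;
		}
	}

	/*
	 * Finalize runs on success and on failure: with no dst page it
	 * restores the original device-private PTE, and with this patch
	 * it leaves the fault page locked for do_swap_page() to unlock.
	 */
	migrate_vma_pages(&args);
	migrate_vma_finalize(&args);

	return ret;
}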

> Now, if migrate_to_ram succeeds, it seems the migration entry should be
> removed in migrate_device_finalize since the new page is only available
> there. We could return the new page in migrate_to_ram, but that feels
> messy.

Agreed - I would expect migrate_device_finalize() to always be called
and remove the migration entry.

> Additionally, the page lock needs to be held across migrate_to_ram, as
> this patch does, so we'll require some special handling in
> migrate_device_finalize to avoid unlocking the faulting page.

Or just unlock it in migrate_device_finalize(). I agree that locking it
in one place and unlocking it in another is a bit ugly, though.

> Finally, installing a migration entry is non-trivial, while taking a
> page reference under a lock is straightforward.

I didn't think it was that hard once you have the PTL - although there
is a bit of the same bookkeeping as in migrate_vma_collect_pmd(), that
could be abstracted into a common function. The advantage is that it
saves a page table walk, but I suppose that isn't very relevant if
you're migrating a group of pages.

> Given all this, I prefer to keep this patch as it is.

Ok, I will take a closer look at it. Thanks.

 - Alistair

> Matt
>
>>  - Alistair
>> 
>> > Matt
>> >
>> >> > +			} else {
>> >> > +				pte_unmap_unlock(vmf->pte, vmf->ptl);
>> >> > +			}
>> >> >  		} else if (is_hwpoison_entry(entry)) {
>> >> >  			ret = VM_FAULT_HWPOISON;
>> >> >  		} else if (is_pte_marker_entry(entry)) {
>> >> > diff --git a/mm/migrate_device.c b/mm/migrate_device.c
>> >> > index f163c2131022..2477d39f57be 100644
>> >> > --- a/mm/migrate_device.c
>> >> > +++ b/mm/migrate_device.c
>> >> > @@ -60,6 +60,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>> >> >  				   struct mm_walk *walk)
>> >> >  {
>> >> >  	struct migrate_vma *migrate = walk->private;
>> >> > +	struct folio *fault_folio = migrate->fault_page ?
>> >> > +		page_folio(migrate->fault_page) : NULL;
>> >> >  	struct vm_area_struct *vma = walk->vma;
>> >> >  	struct mm_struct *mm = vma->vm_mm;
>> >> >  	unsigned long addr = start, unmapped = 0;
>> >> > @@ -88,11 +90,13 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>> >> >  
>> >> >  			folio_get(folio);
>> >> >  			spin_unlock(ptl);
>> >> > -			if (unlikely(!folio_trylock(folio)))
>> >> > +			if (unlikely(fault_folio != folio &&
>> >> > +				     !folio_trylock(folio)))
>> >> >  				return migrate_vma_collect_skip(start, end,
>> >> >  								walk);
>> >> >  			ret = split_folio(folio);
>> >> > -			folio_unlock(folio);
>> >> > +			if (fault_folio != folio)
>> >> > +				folio_unlock(folio);
>> >> >  			folio_put(folio);
>> >> >  			if (ret)
>> >> >  				return migrate_vma_collect_skip(start, end,
>> >> > @@ -192,7 +196,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>> >> >  		 * optimisation to avoid walking the rmap later with
>> >> >  		 * try_to_migrate().
>> >> >  		 */
>> >> > -		if (folio_trylock(folio)) {
>> >> > +		if (fault_folio == folio || folio_trylock(folio)) {
>> >> >  			bool anon_exclusive;
>> >> >  			pte_t swp_pte;
>> >> >  
>> >> > @@ -204,7 +208,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>> >> >  
>> >> >  				if (folio_try_share_anon_rmap_pte(folio, page)) {
>> >> >  					set_pte_at(mm, addr, ptep, pte);
>> >> > -					folio_unlock(folio);
>> >> > +					if (fault_folio != folio)
>> >> > +						folio_unlock(folio);
>> >> >  					folio_put(folio);
>> >> >  					mpfn = 0;
>> >> >  					goto next;
>> >> > @@ -363,6 +368,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
>> >> >  					  unsigned long npages,
>> >> >  					  struct page *fault_page)
>> >> >  {
>> >> > +	struct folio *fault_folio = fault_page ?
>> >> > +		page_folio(fault_page) : NULL;
>> >> >  	unsigned long i, restore = 0;
>> >> >  	bool allow_drain = true;
>> >> >  	unsigned long unmapped = 0;
>> >> > @@ -427,7 +434,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
>> >> >  		remove_migration_ptes(folio, folio, 0);
>> >> >  
>> >> >  		src_pfns[i] = 0;
>> >> > -		folio_unlock(folio);
>> >> > +		if (fault_folio != folio)
>> >> > +			folio_unlock(folio);
>> >> >  		folio_put(folio);
>> >> >  		restore--;
>> >> >  	}
>> >> > @@ -536,6 +544,8 @@ int migrate_vma_setup(struct migrate_vma *args)
>> >> >  		return -EINVAL;
>> >> >  	if (args->fault_page && !is_device_private_page(args->fault_page))
>> >> >  		return -EINVAL;
>> >> > +	if (args->fault_page && !PageLocked(args->fault_page))
>> >> > +		return -EINVAL;
>> >> >  
>> >> >  	memset(args->src, 0, sizeof(*args->src) * nr_pages);
>> >> >  	args->cpages = 0;
>> >> > @@ -799,19 +809,13 @@ void migrate_vma_pages(struct migrate_vma *migrate)
>> >> >  }
>> >> >  EXPORT_SYMBOL(migrate_vma_pages);
>> >> >  
>> >> > -/*
>> >> > - * migrate_device_finalize() - complete page migration
>> >> > - * @src_pfns: src_pfns returned from migrate_device_range()
>> >> > - * @dst_pfns: array of pfns allocated by the driver to migrate memory to
>> >> > - * @npages: number of pages in the range
>> >> > - *
>> >> > - * Completes migration of the page by removing special migration entries.
>> >> > - * Drivers must ensure copying of page data is complete and visible to the CPU
>> >> > - * before calling this.
>> >> > - */
>> >> > -void migrate_device_finalize(unsigned long *src_pfns,
>> >> > -			unsigned long *dst_pfns, unsigned long npages)
>> >> > +static void __migrate_device_finalize(unsigned long *src_pfns,
>> >> > +				      unsigned long *dst_pfns,
>> >> > +				      unsigned long npages,
>> >> > +				      struct page *fault_page)
>> >> >  {
>> >> > +	struct folio *fault_folio = fault_page ?
>> >> > +		page_folio(fault_page) : NULL;
>> >> >  	unsigned long i;
>> >> >  
>> >> >  	for (i = 0; i < npages; i++) {
>> >> > @@ -824,7 +828,8 @@ void migrate_device_finalize(unsigned long *src_pfns,
>> >> >  
>> >> >  		if (!page) {
>> >> >  			if (dst) {
>> >> > -				folio_unlock(dst);
>> >> > +				if (fault_folio != dst)
>> >> > +					folio_unlock(dst);
>> >> >  				folio_put(dst);
>> >> >  			}
>> >> >  			continue;
>> >> > @@ -834,14 +839,16 @@ void migrate_device_finalize(unsigned long *src_pfns,
>> >> >  
>> >> >  		if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !dst) {
>> >> >  			if (dst) {
>> >> > -				folio_unlock(dst);
>> >> > +				if (fault_folio != dst)
>> >> > +					folio_unlock(dst);
>> >> >  				folio_put(dst);
>> >> >  			}
>> >> >  			dst = src;
>> >> >  		}
>> >> >  
>> >> >  		remove_migration_ptes(src, dst, 0);
>> >> > -		folio_unlock(src);
>> >> > +		if (fault_folio != src)
>> >> > +			folio_unlock(src);
>> >> >  
>> >> >  		if (folio_is_zone_device(src))
>> >> >  			folio_put(src);
>> >> > @@ -849,7 +856,8 @@ void migrate_device_finalize(unsigned long *src_pfns,
>> >> >  			folio_putback_lru(src);
>> >> >  
>> >> >  		if (dst != src) {
>> >> > -			folio_unlock(dst);
>> >> > +			if (fault_folio != dst)
>> >> > +				folio_unlock(dst);
>> >> >  			if (folio_is_zone_device(dst))
>> >> >  				folio_put(dst);
>> >> >  			else
>> >> > @@ -857,6 +865,22 @@ void migrate_device_finalize(unsigned long *src_pfns,
>> >> >  		}
>> >> >  	}
>> >> >  }
>> >> > +
>> >> > +/*
>> >> > + * migrate_device_finalize() - complete page migration
>> >> > + * @src_pfns: src_pfns returned from migrate_device_range()
>> >> > + * @dst_pfns: array of pfns allocated by the driver to migrate memory to
>> >> > + * @npages: number of pages in the range
>> >> > + *
>> >> > + * Completes migration of the page by removing special migration entries.
>> >> > + * Drivers must ensure copying of page data is complete and visible to the CPU
>> >> > + * before calling this.
>> >> > + */
>> >> > +void migrate_device_finalize(unsigned long *src_pfns,
>> >> > +			unsigned long *dst_pfns, unsigned long npages)
>> >> > +{
>> >> > +	return __migrate_device_finalize(src_pfns, dst_pfns, npages, NULL);
>> >> > +}
>> >> >  EXPORT_SYMBOL(migrate_device_finalize);
>> >> >  
>> >> >  /**
>> >> > @@ -872,7 +896,8 @@ EXPORT_SYMBOL(migrate_device_finalize);
>> >> >   */
>> >> >  void migrate_vma_finalize(struct migrate_vma *migrate)
>> >> >  {
>> >> > -	migrate_device_finalize(migrate->src, migrate->dst, migrate->npages);
>> >> > +	__migrate_device_finalize(migrate->src, migrate->dst, migrate->npages,
>> >> > +				  migrate->fault_page);
>> >> >  }
>> >> >  EXPORT_SYMBOL(migrate_vma_finalize);
>> >> 
>>
Matthew Brost Nov. 1, 2024, 5:19 p.m. UTC | #6
On Tue, Oct 29, 2024 at 05:37:45PM +1100, Alistair Popple wrote:
> 
> Matthew Brost <matthew.brost@intel.com> writes:
> 
> > On Thu, Oct 17, 2024 at 12:51:08PM +1100, Alistair Popple wrote:
> >> 
> >> Matthew Brost <matthew.brost@intel.com> writes:
> >> 
> >> > On Wed, Oct 16, 2024 at 03:00:08PM +1100, Alistair Popple wrote:
> >> >> 
> >> >> Matthew Brost <matthew.brost@intel.com> writes:
> >> >> 
> >> >> > Avoid multiple CPU page faults to the same device page racing by trying
> >> >> > to lock the page in do_swap_page before taking an extra reference to the
> >> >> > page. This prevents scenarios where multiple CPU page faults each take
> >> >> > an extra reference to a device page, which could abort migration in
> >> >> > folio_migrate_mapping. With the device page being locked in
> >> >> > do_swap_page, the migrate_vma_* functions need to be updated to avoid
> >> >> > locking the fault_page argument.
> >> >> >
> >> >> > Prior to this change, a livelock scenario could occur in Xe's (Intel GPU
> >> >> > DRM driver) SVM implementation if enough threads faulted the same device
> >> >> > page.
> >> >> >
> >> >> > Cc: Philip Yang <Philip.Yang@amd.com>
> >> >> > Cc: Felix Kuehling <felix.kuehling@amd.com>
> >> >> > Cc: Christian König <christian.koenig@amd.com>
> >> >> > Cc: Andrew Morton <akpm@linux-foundation.org>
> >> >> > Suggessted-by: Simona Vetter <simona.vetter@ffwll.ch>
> >> >> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> >> >> > ---
> >> >> >  mm/memory.c         | 13 ++++++---
> >> >> >  mm/migrate_device.c | 69 ++++++++++++++++++++++++++++++---------------
> >> >> >  2 files changed, 56 insertions(+), 26 deletions(-)
> >> >> >
> >> >> > diff --git a/mm/memory.c b/mm/memory.c
> >> >> > index 2366578015ad..b72bde782611 100644
> >> >> > --- a/mm/memory.c
> >> >> > +++ b/mm/memory.c
> >> >> > @@ -4252,10 +4252,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >> >> >  			 * Get a page reference while we know the page can't be
> >> >> >  			 * freed.
> >> >> >  			 */
> >> >> > -			get_page(vmf->page);
> >> >> > -			pte_unmap_unlock(vmf->pte, vmf->ptl);
> >> >> > -			ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
> >> >> > -			put_page(vmf->page);
> >> >> > +			if (trylock_page(vmf->page)) {
> >> >> > +				get_page(vmf->page);
> >> >> > +				pte_unmap_unlock(vmf->pte, vmf->ptl);
> >> >> > +				ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
> >> >> > +				put_page(vmf->page);
> >> >> > +				unlock_page(vmf->page);
> >> >> 
> >> >> I don't think my previous review of this change has really been
> >> >> addressed. Why don't we just install the migration entry here? Seems
> >> >> like it would be a much simpler way of solving this.
> >> >> 
> >> >
> >> > I should have mentioned this in the cover-letter, I haven't got around
> >> > to trying that out yet. Included this existing version for correctness
> >> > but I also think this is not strickly required to merge this series as
> >> > our locking in migrate_to_ram only relies on the core MM locks so
> >> > some thread would eventually win the race and make forward progress.
> >> >
> >> > So I guess just ignore this patch and will send an updated version
> >> > individually with installing a migration entry in do_swap_page. If for
> >> > some reason that doesn't work, I'll respond here explaining why.
> >> 
> >> That would be great. I have a fairly strong preference for doing that
> >> instead of adding more special cases for the fault page in the migration
> >> code. And if we can't do that it would be good to understand
> >> why. Thanks.
> >> 
> >
> > I've looked into this and actually prefer the approach in this patch.
> 
> Thanks for looking into this.
> 
> > Consider the scenario where we install a migration entry, but
> > migrate_to_ram fails. How do we handle this?
> >
> > We don't know where migrate_to_ram failed. Was migrate_device_finalize
> > called, removing the migration PTE? Do we need to special-case failures
> > in migrate_to_ram to prevent migrate_device_finalize from removing the
> > faulting page's migration entry? Should we check for a migration entry
> > after migrate_to_ram and remove it if it exists?
> 
> The driver should always call migrate_device_finalize(). On failure it
> will remove the migration entry and remap the original device private
> page. That obviously doesn't handle the fault but the process is about
> to die anyway with a SIGBUS because migrate_to_ram() can't fail.
> 

What if migrate_to_ram fails before calling migrate_vma_setup - e.g. a
kmalloc of the arguments fails? That is a very awkward situation.

> > Now, if migrate_to_ram succeeds, it seems the migration entry should be
> > removed in migrate_device_finalize since the new page is only available
> > there. We could return the new page in migrate_to_ram, but that feels
> > messy.
> 
> Agreed - I would expect migrate_device_finalize() to always be called
> and remove the migration entry.
> 
> > Additionally, the page lock needs to be held across migrate_to_ram, as is
> > done in this patch, so we'll require some special handling in
> > migrate_device_finalize to avoid unlocking the faulting page.
> 
> Or just unlock it in migrate_device_finalize(). I agree locking it in one
> place and unlocking it in another is a bit ugly though.

Agreed.

> 
> > Finally, installing a migration entry is non-trivial, while taking a
> > page reference under a lock is straightforward.
> 
> I didn't think it was that hard once you have the PTL. There is a bit of
> accounting, the same as in migrate_vma_collect_pmd(), but that could be
> abstracted into a common function. The advantage is that it saves a page
> table walk, but I suppose that isn't that relevant if you're migrating a
> group of pages.
>

A helper would definitely be required if we do this.
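
Roughly what I'd expect such a helper to look like, just to show the
accounting involved. This is only an untested sketch of the
migrate_vma_collect_pmd() logic specialised to a single device-private PTE,
with the dirty/young propagation and cpages accounting left out:

	/*
	 * Sketch only: convert the faulting device-private swap PTE into a
	 * migration entry. Caller holds the PTL and the folio lock and has
	 * taken a folio reference. Returns false if the caller should back
	 * off and retry the fault.
	 */
	static bool install_migration_pte(struct vm_fault *vmf, struct folio *folio)
	{
		struct page *page = vmf->page;
		bool anon_exclusive = folio_test_anon(folio) &&
				      PageAnonExclusive(page);
		swp_entry_t old_entry = pte_to_swp_entry(vmf->orig_pte);
		swp_entry_t entry;
		pte_t old, swp_pte;

		if (anon_exclusive) {
			old = ptep_clear_flush(vmf->vma, vmf->address, vmf->pte);
			if (folio_try_share_anon_rmap_pte(folio, page)) {
				/* Restore the PTE and let the caller bail out. */
				set_pte_at(vmf->vma->vm_mm, vmf->address,
					   vmf->pte, old);
				return false;
			}
		} else {
			old = ptep_get_and_clear(vmf->vma->vm_mm, vmf->address,
						 vmf->pte);
		}

		if (is_writable_device_private_entry(old_entry))
			entry = make_writable_migration_entry(page_to_pfn(page));
		else if (anon_exclusive)
			entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
		else
			entry = make_readable_migration_entry(page_to_pfn(page));

		swp_pte = swp_entry_to_pte(entry);
		if (pte_swp_soft_dirty(old))
			swp_pte = pte_swp_mksoft_dirty(swp_pte);
		if (pte_swp_uffd_wp(old))
			swp_pte = pte_swp_mkuffd_wp(swp_pte);
		set_pte_at(vmf->vma->vm_mm, vmf->address, vmf->pte, swp_pte);

		/* Like a regular unmap: drop the rmap and the map reference. */
		folio_remove_rmap_pte(folio, page, vmf->vma);
		folio_put(folio);

		return true;
	}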
 
> > Given all this, I prefer to keep this patch as it is.
> 
> Ok, I will take a closer look at it. Thanks.
> 

+1. Let me know what you come up with. This patch doesn't strictly block
my work, but it would be good to have something merged to fix this problem soon.

Matt

>  - Alistair
> 
> > Matt
> >
> >>  - Alistair
> >> 
> >> > Matt
> >> >
> >> >> > +			} else {
> >> >> > +				pte_unmap_unlock(vmf->pte, vmf->ptl);
> >> >> > +			}
> >> >> >  		} else if (is_hwpoison_entry(entry)) {
> >> >> >  			ret = VM_FAULT_HWPOISON;
> >> >> >  		} else if (is_pte_marker_entry(entry)) {
> >> >> > diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> >> >> > index f163c2131022..2477d39f57be 100644
> >> >> > --- a/mm/migrate_device.c
> >> >> > +++ b/mm/migrate_device.c
> >> >> > @@ -60,6 +60,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> >> >> >  				   struct mm_walk *walk)
> >> >> >  {
> >> >> >  	struct migrate_vma *migrate = walk->private;
> >> >> > +	struct folio *fault_folio = migrate->fault_page ?
> >> >> > +		page_folio(migrate->fault_page) : NULL;
> >> >> >  	struct vm_area_struct *vma = walk->vma;
> >> >> >  	struct mm_struct *mm = vma->vm_mm;
> >> >> >  	unsigned long addr = start, unmapped = 0;
> >> >> > @@ -88,11 +90,13 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> >> >> >  
> >> >> >  			folio_get(folio);
> >> >> >  			spin_unlock(ptl);
> >> >> > -			if (unlikely(!folio_trylock(folio)))
> >> >> > +			if (unlikely(fault_folio != folio &&
> >> >> > +				     !folio_trylock(folio)))
> >> >> >  				return migrate_vma_collect_skip(start, end,
> >> >> >  								walk);
> >> >> >  			ret = split_folio(folio);
> >> >> > -			folio_unlock(folio);
> >> >> > +			if (fault_folio != folio)
> >> >> > +				folio_unlock(folio);
> >> >> >  			folio_put(folio);
> >> >> >  			if (ret)
> >> >> >  				return migrate_vma_collect_skip(start, end,
> >> >> > @@ -192,7 +196,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> >> >> >  		 * optimisation to avoid walking the rmap later with
> >> >> >  		 * try_to_migrate().
> >> >> >  		 */
> >> >> > -		if (folio_trylock(folio)) {
> >> >> > +		if (fault_folio == folio || folio_trylock(folio)) {
> >> >> >  			bool anon_exclusive;
> >> >> >  			pte_t swp_pte;
> >> >> >  
> >> >> > @@ -204,7 +208,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> >> >> >  
> >> >> >  				if (folio_try_share_anon_rmap_pte(folio, page)) {
> >> >> >  					set_pte_at(mm, addr, ptep, pte);
> >> >> > -					folio_unlock(folio);
> >> >> > +					if (fault_folio != folio)
> >> >> > +						folio_unlock(folio);
> >> >> >  					folio_put(folio);
> >> >> >  					mpfn = 0;
> >> >> >  					goto next;
> >> >> > @@ -363,6 +368,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
> >> >> >  					  unsigned long npages,
> >> >> >  					  struct page *fault_page)
> >> >> >  {
> >> >> > +	struct folio *fault_folio = fault_page ?
> >> >> > +		page_folio(fault_page) : NULL;
> >> >> >  	unsigned long i, restore = 0;
> >> >> >  	bool allow_drain = true;
> >> >> >  	unsigned long unmapped = 0;
> >> >> > @@ -427,7 +434,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
> >> >> >  		remove_migration_ptes(folio, folio, 0);
> >> >> >  
> >> >> >  		src_pfns[i] = 0;
> >> >> > -		folio_unlock(folio);
> >> >> > +		if (fault_folio != folio)
> >> >> > +			folio_unlock(folio);
> >> >> >  		folio_put(folio);
> >> >> >  		restore--;
> >> >> >  	}
> >> >> > @@ -536,6 +544,8 @@ int migrate_vma_setup(struct migrate_vma *args)
> >> >> >  		return -EINVAL;
> >> >> >  	if (args->fault_page && !is_device_private_page(args->fault_page))
> >> >> >  		return -EINVAL;
> >> >> > +	if (args->fault_page && !PageLocked(args->fault_page))
> >> >> > +		return -EINVAL;
> >> >> >  
> >> >> >  	memset(args->src, 0, sizeof(*args->src) * nr_pages);
> >> >> >  	args->cpages = 0;
> >> >> > @@ -799,19 +809,13 @@ void migrate_vma_pages(struct migrate_vma *migrate)
> >> >> >  }
> >> >> >  EXPORT_SYMBOL(migrate_vma_pages);
> >> >> >  
> >> >> > -/*
> >> >> > - * migrate_device_finalize() - complete page migration
> >> >> > - * @src_pfns: src_pfns returned from migrate_device_range()
> >> >> > - * @dst_pfns: array of pfns allocated by the driver to migrate memory to
> >> >> > - * @npages: number of pages in the range
> >> >> > - *
> >> >> > - * Completes migration of the page by removing special migration entries.
> >> >> > - * Drivers must ensure copying of page data is complete and visible to the CPU
> >> >> > - * before calling this.
> >> >> > - */
> >> >> > -void migrate_device_finalize(unsigned long *src_pfns,
> >> >> > -			unsigned long *dst_pfns, unsigned long npages)
> >> >> > +static void __migrate_device_finalize(unsigned long *src_pfns,
> >> >> > +				      unsigned long *dst_pfns,
> >> >> > +				      unsigned long npages,
> >> >> > +				      struct page *fault_page)
> >> >> >  {
> >> >> > +	struct folio *fault_folio = fault_page ?
> >> >> > +		page_folio(fault_page) : NULL;
> >> >> >  	unsigned long i;
> >> >> >  
> >> >> >  	for (i = 0; i < npages; i++) {
> >> >> > @@ -824,7 +828,8 @@ void migrate_device_finalize(unsigned long *src_pfns,
> >> >> >  
> >> >> >  		if (!page) {
> >> >> >  			if (dst) {
> >> >> > -				folio_unlock(dst);
> >> >> > +				if (fault_folio != dst)
> >> >> > +					folio_unlock(dst);
> >> >> >  				folio_put(dst);
> >> >> >  			}
> >> >> >  			continue;
> >> >> > @@ -834,14 +839,16 @@ void migrate_device_finalize(unsigned long *src_pfns,
> >> >> >  
> >> >> >  		if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !dst) {
> >> >> >  			if (dst) {
> >> >> > -				folio_unlock(dst);
> >> >> > +				if (fault_folio != dst)
> >> >> > +					folio_unlock(dst);
> >> >> >  				folio_put(dst);
> >> >> >  			}
> >> >> >  			dst = src;
> >> >> >  		}
> >> >> >  
> >> >> >  		remove_migration_ptes(src, dst, 0);
> >> >> > -		folio_unlock(src);
> >> >> > +		if (fault_folio != src)
> >> >> > +			folio_unlock(src);
> >> >> >  
> >> >> >  		if (folio_is_zone_device(src))
> >> >> >  			folio_put(src);
> >> >> > @@ -849,7 +856,8 @@ void migrate_device_finalize(unsigned long *src_pfns,
> >> >> >  			folio_putback_lru(src);
> >> >> >  
> >> >> >  		if (dst != src) {
> >> >> > -			folio_unlock(dst);
> >> >> > +			if (fault_folio != dst)
> >> >> > +				folio_unlock(dst);
> >> >> >  			if (folio_is_zone_device(dst))
> >> >> >  				folio_put(dst);
> >> >> >  			else
> >> >> > @@ -857,6 +865,22 @@ void migrate_device_finalize(unsigned long *src_pfns,
> >> >> >  		}
> >> >> >  	}
> >> >> >  }
> >> >> > +
> >> >> > +/*
> >> >> > + * migrate_device_finalize() - complete page migration
> >> >> > + * @src_pfns: src_pfns returned from migrate_device_range()
> >> >> > + * @dst_pfns: array of pfns allocated by the driver to migrate memory to
> >> >> > + * @npages: number of pages in the range
> >> >> > + *
> >> >> > + * Completes migration of the page by removing special migration entries.
> >> >> > + * Drivers must ensure copying of page data is complete and visible to the CPU
> >> >> > + * before calling this.
> >> >> > + */
> >> >> > +void migrate_device_finalize(unsigned long *src_pfns,
> >> >> > +			unsigned long *dst_pfns, unsigned long npages)
> >> >> > +{
> >> >> > +	return __migrate_device_finalize(src_pfns, dst_pfns, npages, NULL);
> >> >> > +}
> >> >> >  EXPORT_SYMBOL(migrate_device_finalize);
> >> >> >  
> >> >> >  /**
> >> >> > @@ -872,7 +896,8 @@ EXPORT_SYMBOL(migrate_device_finalize);
> >> >> >   */
> >> >> >  void migrate_vma_finalize(struct migrate_vma *migrate)
> >> >> >  {
> >> >> > -	migrate_device_finalize(migrate->src, migrate->dst, migrate->npages);
> >> >> > +	__migrate_device_finalize(migrate->src, migrate->dst, migrate->npages,
> >> >> > +				  migrate->fault_page);
> >> >> >  }
> >> >> >  EXPORT_SYMBOL(migrate_vma_finalize);
> >> >> 
> >> 
>

Patch

diff --git a/mm/memory.c b/mm/memory.c
index 2366578015ad..b72bde782611 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4252,10 +4252,15 @@  vm_fault_t do_swap_page(struct vm_fault *vmf)
 			 * Get a page reference while we know the page can't be
 			 * freed.
 			 */
-			get_page(vmf->page);
-			pte_unmap_unlock(vmf->pte, vmf->ptl);
-			ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
-			put_page(vmf->page);
+			if (trylock_page(vmf->page)) {
+				get_page(vmf->page);
+				pte_unmap_unlock(vmf->pte, vmf->ptl);
+				ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
+				put_page(vmf->page);
+				unlock_page(vmf->page);
+			} else {
+				pte_unmap_unlock(vmf->pte, vmf->ptl);
+			}
 		} else if (is_hwpoison_entry(entry)) {
 			ret = VM_FAULT_HWPOISON;
 		} else if (is_pte_marker_entry(entry)) {
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index f163c2131022..2477d39f57be 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -60,6 +60,8 @@  static int migrate_vma_collect_pmd(pmd_t *pmdp,
 				   struct mm_walk *walk)
 {
 	struct migrate_vma *migrate = walk->private;
+	struct folio *fault_folio = migrate->fault_page ?
+		page_folio(migrate->fault_page) : NULL;
 	struct vm_area_struct *vma = walk->vma;
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long addr = start, unmapped = 0;
@@ -88,11 +90,13 @@  static int migrate_vma_collect_pmd(pmd_t *pmdp,
 
 			folio_get(folio);
 			spin_unlock(ptl);
-			if (unlikely(!folio_trylock(folio)))
+			if (unlikely(fault_folio != folio &&
+				     !folio_trylock(folio)))
 				return migrate_vma_collect_skip(start, end,
 								walk);
 			ret = split_folio(folio);
-			folio_unlock(folio);
+			if (fault_folio != folio)
+				folio_unlock(folio);
 			folio_put(folio);
 			if (ret)
 				return migrate_vma_collect_skip(start, end,
@@ -192,7 +196,7 @@  static int migrate_vma_collect_pmd(pmd_t *pmdp,
 		 * optimisation to avoid walking the rmap later with
 		 * try_to_migrate().
 		 */
-		if (folio_trylock(folio)) {
+		if (fault_folio == folio || folio_trylock(folio)) {
 			bool anon_exclusive;
 			pte_t swp_pte;
 
@@ -204,7 +208,8 @@  static int migrate_vma_collect_pmd(pmd_t *pmdp,
 
 				if (folio_try_share_anon_rmap_pte(folio, page)) {
 					set_pte_at(mm, addr, ptep, pte);
-					folio_unlock(folio);
+					if (fault_folio != folio)
+						folio_unlock(folio);
 					folio_put(folio);
 					mpfn = 0;
 					goto next;
@@ -363,6 +368,8 @@  static unsigned long migrate_device_unmap(unsigned long *src_pfns,
 					  unsigned long npages,
 					  struct page *fault_page)
 {
+	struct folio *fault_folio = fault_page ?
+		page_folio(fault_page) : NULL;
 	unsigned long i, restore = 0;
 	bool allow_drain = true;
 	unsigned long unmapped = 0;
@@ -427,7 +434,8 @@  static unsigned long migrate_device_unmap(unsigned long *src_pfns,
 		remove_migration_ptes(folio, folio, 0);
 
 		src_pfns[i] = 0;
-		folio_unlock(folio);
+		if (fault_folio != folio)
+			folio_unlock(folio);
 		folio_put(folio);
 		restore--;
 	}
@@ -536,6 +544,8 @@  int migrate_vma_setup(struct migrate_vma *args)
 		return -EINVAL;
 	if (args->fault_page && !is_device_private_page(args->fault_page))
 		return -EINVAL;
+	if (args->fault_page && !PageLocked(args->fault_page))
+		return -EINVAL;
 
 	memset(args->src, 0, sizeof(*args->src) * nr_pages);
 	args->cpages = 0;
@@ -799,19 +809,13 @@  void migrate_vma_pages(struct migrate_vma *migrate)
 }
 EXPORT_SYMBOL(migrate_vma_pages);
 
-/*
- * migrate_device_finalize() - complete page migration
- * @src_pfns: src_pfns returned from migrate_device_range()
- * @dst_pfns: array of pfns allocated by the driver to migrate memory to
- * @npages: number of pages in the range
- *
- * Completes migration of the page by removing special migration entries.
- * Drivers must ensure copying of page data is complete and visible to the CPU
- * before calling this.
- */
-void migrate_device_finalize(unsigned long *src_pfns,
-			unsigned long *dst_pfns, unsigned long npages)
+static void __migrate_device_finalize(unsigned long *src_pfns,
+				      unsigned long *dst_pfns,
+				      unsigned long npages,
+				      struct page *fault_page)
 {
+	struct folio *fault_folio = fault_page ?
+		page_folio(fault_page) : NULL;
 	unsigned long i;
 
 	for (i = 0; i < npages; i++) {
@@ -824,7 +828,8 @@  void migrate_device_finalize(unsigned long *src_pfns,
 
 		if (!page) {
 			if (dst) {
-				folio_unlock(dst);
+				if (fault_folio != dst)
+					folio_unlock(dst);
 				folio_put(dst);
 			}
 			continue;
@@ -834,14 +839,16 @@  void migrate_device_finalize(unsigned long *src_pfns,
 
 		if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !dst) {
 			if (dst) {
-				folio_unlock(dst);
+				if (fault_folio != dst)
+					folio_unlock(dst);
 				folio_put(dst);
 			}
 			dst = src;
 		}
 
 		remove_migration_ptes(src, dst, 0);
-		folio_unlock(src);
+		if (fault_folio != src)
+			folio_unlock(src);
 
 		if (folio_is_zone_device(src))
 			folio_put(src);
@@ -849,7 +856,8 @@  void migrate_device_finalize(unsigned long *src_pfns,
 			folio_putback_lru(src);
 
 		if (dst != src) {
-			folio_unlock(dst);
+			if (fault_folio != dst)
+				folio_unlock(dst);
 			if (folio_is_zone_device(dst))
 				folio_put(dst);
 			else
@@ -857,6 +865,22 @@  void migrate_device_finalize(unsigned long *src_pfns,
 		}
 	}
 }
+
+/*
+ * migrate_device_finalize() - complete page migration
+ * @src_pfns: src_pfns returned from migrate_device_range()
+ * @dst_pfns: array of pfns allocated by the driver to migrate memory to
+ * @npages: number of pages in the range
+ *
+ * Completes migration of the page by removing special migration entries.
+ * Drivers must ensure copying of page data is complete and visible to the CPU
+ * before calling this.
+ */
+void migrate_device_finalize(unsigned long *src_pfns,
+			unsigned long *dst_pfns, unsigned long npages)
+{
+	return __migrate_device_finalize(src_pfns, dst_pfns, npages, NULL);
+}
 EXPORT_SYMBOL(migrate_device_finalize);
 
 /**
@@ -872,7 +896,8 @@  EXPORT_SYMBOL(migrate_device_finalize);
  */
 void migrate_vma_finalize(struct migrate_vma *migrate)
 {
-	migrate_device_finalize(migrate->src, migrate->dst, migrate->npages);
+	__migrate_device_finalize(migrate->src, migrate->dst, migrate->npages,
+				  migrate->fault_page);
 }
 EXPORT_SYMBOL(migrate_vma_finalize);
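
On the driver side of this contract: with this patch, the fault page arrives
in ->migrate_to_ram() already locked by do_swap_page(), migrate_vma_setup()
now rejects an unlocked fault_page, and neither migrate_vma_setup() nor
migrate_vma_finalize() will unlock it; do_swap_page() drops the lock after
the callback returns. A minimal hypothetical callback (not Xe's actual code)
would look roughly like:

	static vm_fault_t example_migrate_to_ram(struct vm_fault *vmf)
	{
		struct migrate_vma args = {
			.vma		= vmf->vma,
			.start		= vmf->address,
			.end		= vmf->address + PAGE_SIZE,
			/* Already locked by do_swap_page(); leave it that way. */
			.fault_page	= vmf->page,
			.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
			.pgmap_owner	= vmf->page->pgmap->owner,
		};
		unsigned long src = 0, dst = 0;

		args.src = &src;
		args.dst = &dst;

		if (migrate_vma_setup(&args) || !args.cpages)
			return 0;	/* nothing collected; the fault is retried */

		/*
		 * ... allocate a system page, copy the device page's contents
		 * into it and fill dst with its migrate_pfn() ...
		 */

		migrate_vma_pages(&args);
		/* Skips unlocking the fault page; do_swap_page() unlocks it. */
		migrate_vma_finalize(&args);
		return 0;
	}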