Message ID | 20200713172149.2310-3-rcampbell@nvidia.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | mm/migrate: avoid device private invalidations | expand |
On Mon, Jul 13, 2020 at 10:21:46AM -0700, Ralph Campbell wrote: > The src_owner field in struct migrate_vma is being used for two purposes, > it implies the direction of the migration and it identifies device private > pages owned by the caller. Split this into separate parameters so the > src_owner field can be used just to identify device private pages owned > by the caller of migrate_vma_setup(). > > Signed-off-by: Ralph Campbell <rcampbell@nvidia.com> > Reviewed-by: Bharata B Rao <bharata@linux.ibm.com> > arch/powerpc/kvm/book3s_hv_uvmem.c | 2 ++ > drivers/gpu/drm/nouveau/nouveau_dmem.c | 2 ++ > include/linux/migrate.h | 12 +++++++++--- > lib/test_hmm.c | 2 ++ > mm/migrate.c | 5 +++-- > 5 files changed, 18 insertions(+), 5 deletions(-) > > diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c > index 09d8119024db..acbf14cd2d72 100644 > +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c > @@ -400,6 +400,7 @@ kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start, > mig.end = end; > mig.src = &src_pfn; > mig.dst = &dst_pfn; > + mig.dir = MIGRATE_VMA_FROM_SYSTEM; > > /* > * We come here with mmap_lock write lock held just for > @@ -578,6 +579,7 @@ kvmppc_svm_page_out(struct vm_area_struct *vma, unsigned long start, > mig.src = &src_pfn; > mig.dst = &dst_pfn; > mig.src_owner = &kvmppc_uvmem_pgmap; > + mig.dir = MIGRATE_VMA_FROM_DEVICE_PRIVATE; > > mutex_lock(&kvm->arch.uvmem_lock); > /* The requested page is already paged-out, nothing to do */ > diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c > index e5c230d9ae24..e5c83b8ee82e 100644 > +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c > @@ -183,6 +183,7 @@ static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf) > .src = &src, > .dst = &dst, > .src_owner = drm->dev, > + .dir = MIGRATE_VMA_FROM_DEVICE_PRIVATE, > }; > > /* > @@ -615,6 +616,7 @@ nouveau_dmem_migrate_vma(struct nouveau_drm *drm, > struct migrate_vma args = { > .vma = 
vma, > .start = start, > + .dir = MIGRATE_VMA_FROM_SYSTEM, > }; > unsigned long i; > u64 *pfns; > diff --git a/include/linux/migrate.h b/include/linux/migrate.h > index 3e546cbf03dd..620f2235d7d4 100644 > +++ b/include/linux/migrate.h > @@ -180,6 +180,11 @@ static inline unsigned long migrate_pfn(unsigned long pfn) > return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID; > } > > +enum migrate_vma_direction { > + MIGRATE_VMA_FROM_SYSTEM, > + MIGRATE_VMA_FROM_DEVICE_PRIVATE, > +}; I would have guessed this is more natural as _FROM_DEVICE_ and TO_DEVICE_ ? All the callers of this API are device drivers managing their DEVICE_PRIVATE, right? Jason
On 7/20/20 11:36 AM, Jason Gunthorpe wrote: > On Mon, Jul 13, 2020 at 10:21:46AM -0700, Ralph Campbell wrote: >> The src_owner field in struct migrate_vma is being used for two purposes, >> it implies the direction of the migration and it identifies device private >> pages owned by the caller. Split this into separate parameters so the >> src_owner field can be used just to identify device private pages owned >> by the caller of migrate_vma_setup(). >> >> Signed-off-by: Ralph Campbell <rcampbell@nvidia.com> >> Reviewed-by: Bharata B Rao <bharata@linux.ibm.com> >> arch/powerpc/kvm/book3s_hv_uvmem.c | 2 ++ >> drivers/gpu/drm/nouveau/nouveau_dmem.c | 2 ++ >> include/linux/migrate.h | 12 +++++++++--- >> lib/test_hmm.c | 2 ++ >> mm/migrate.c | 5 +++-- >> 5 files changed, 18 insertions(+), 5 deletions(-) >> >> diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c >> index 09d8119024db..acbf14cd2d72 100644 >> +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c >> @@ -400,6 +400,7 @@ kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start, >> mig.end = end; >> mig.src = &src_pfn; >> mig.dst = &dst_pfn; >> + mig.dir = MIGRATE_VMA_FROM_SYSTEM; >> >> /* >> * We come here with mmap_lock write lock held just for >> @@ -578,6 +579,7 @@ kvmppc_svm_page_out(struct vm_area_struct *vma, unsigned long start, >> mig.src = &src_pfn; >> mig.dst = &dst_pfn; >> mig.src_owner = &kvmppc_uvmem_pgmap; >> + mig.dir = MIGRATE_VMA_FROM_DEVICE_PRIVATE; >> >> mutex_lock(&kvm->arch.uvmem_lock); >> /* The requested page is already paged-out, nothing to do */ >> diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c >> index e5c230d9ae24..e5c83b8ee82e 100644 >> +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c >> @@ -183,6 +183,7 @@ static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf) >> .src = &src, >> .dst = &dst, >> .src_owner = drm->dev, >> + .dir = MIGRATE_VMA_FROM_DEVICE_PRIVATE, >> }; >> >> /* >> @@ -615,6 +616,7 @@ 
nouveau_dmem_migrate_vma(struct nouveau_drm *drm, >> struct migrate_vma args = { >> .vma = vma, >> .start = start, >> + .dir = MIGRATE_VMA_FROM_SYSTEM, >> }; >> unsigned long i; >> u64 *pfns; >> diff --git a/include/linux/migrate.h b/include/linux/migrate.h >> index 3e546cbf03dd..620f2235d7d4 100644 >> +++ b/include/linux/migrate.h >> @@ -180,6 +180,11 @@ static inline unsigned long migrate_pfn(unsigned long pfn) >> return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID; >> } >> >> +enum migrate_vma_direction { >> + MIGRATE_VMA_FROM_SYSTEM, >> + MIGRATE_VMA_FROM_DEVICE_PRIVATE, >> +}; > > I would have guessed this is more natural as _FROM_DEVICE_ and > TO_DEVICE_ ? The caller controls where the destination memory is allocated so it isn't necessarily device private memory, it could be from system to system. The use case for system to system memory migration is for hardware like ARM SMMU or PCIe ATS where a single set of page tables is shared by the device and a CPU process over a coherent system memory bus. Also many integrated GPUs in SOCs fall into this category too. So to me, it makes more sense to specify the direction based on the source location. > All the callers of this API are device drivers managing their > DEVICE_PRIVATE, right? True for now, yes. > Jason >
On Mon, Jul 20, 2020 at 12:54:53PM -0700, Ralph Campbell wrote: > > > diff --git a/include/linux/migrate.h b/include/linux/migrate.h > > > index 3e546cbf03dd..620f2235d7d4 100644 > > > +++ b/include/linux/migrate.h > > > @@ -180,6 +180,11 @@ static inline unsigned long migrate_pfn(unsigned long pfn) > > > return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID; > > > } > > > +enum migrate_vma_direction { > > > + MIGRATE_VMA_FROM_SYSTEM, > > > + MIGRATE_VMA_FROM_DEVICE_PRIVATE, > > > +}; > > > > I would have guessed this is more natural as _FROM_DEVICE_ and > > TO_DEVICE_ ? > > The caller controls where the destination memory is allocated so it isn't > necessarily device private memory, it could be from system to system. > The use case for system to system memory migration is for hardware > like ARM SMMU or PCIe ATS where a single set of page tables is shared by > the device and a CPU process over a coherent system memory bus. > Also many integrated GPUs in SOCs fall into this category too. Maybe just TO/FROM_DEVICE then? Even though the memory is not DEVICE_PRIVATE it is still device-owned pages, right? > So to me, it makes more sense to specify the direction based on the > source location. It feels strange because the driver doesn't always know or control the source? Jason
On 7/20/20 12:59 PM, Jason Gunthorpe wrote: > On Mon, Jul 20, 2020 at 12:54:53PM -0700, Ralph Campbell wrote: >>>> diff --git a/include/linux/migrate.h b/include/linux/migrate.h >>>> index 3e546cbf03dd..620f2235d7d4 100644 >>>> +++ b/include/linux/migrate.h >>>> @@ -180,6 +180,11 @@ static inline unsigned long migrate_pfn(unsigned long pfn) >>>> return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID; >>>> } >>>> +enum migrate_vma_direction { >>>> + MIGRATE_VMA_FROM_SYSTEM, >>>> + MIGRATE_VMA_FROM_DEVICE_PRIVATE, >>>> +}; >>> >>> I would have guessed this is more natural as _FROM_DEVICE_ and >>> TO_DEVICE_ ? >> >> The caller controls where the destination memory is allocated so it isn't >> necessarily device private memory, it could be from system to system. >> The use case for system to system memory migration is for hardware >> like ARM SMMU or PCIe ATS where a single set of page tables is shared by >> the device and a CPU process over a coherent system memory bus. >> Also many integrated GPUs in SOCs fall into this category too. > > Maybe just TO/FROM_DEIVCE then? Even though the memory is not > DEVICE_PRIVATE it is still device owned pages right? > >> So to me, it makes more sense to specify the direction based on the >> source location. > > It feels strange because the driver doesn't always know or control the > source? > > Jason > The driver can't really know where the source is currently located because the API is designed to not initially hold the page locks, migrate_vma_setup() only knows the source once it holds the page table locks and isolates/locks the pages being migrated. The direction and pgmap_owner are supposed to filter which pages the caller is interested in migrating. Perhaps the direction should instead be a flags field with separate bits for system memory and device private memory selecting source candidates for migration. I can imagine use cases for all 4 combinations of d->d, d->s, s->d, and s->s being valid. 
I didn't really think a direction was needed; this was something that Christoph Hellwig seemed to think made the API safer.
On Mon, Jul 20, 2020 at 01:49:09PM -0700, Ralph Campbell wrote: > > On 7/20/20 12:59 PM, Jason Gunthorpe wrote: > > On Mon, Jul 20, 2020 at 12:54:53PM -0700, Ralph Campbell wrote: > > > > > diff --git a/include/linux/migrate.h b/include/linux/migrate.h > > > > > index 3e546cbf03dd..620f2235d7d4 100644 > > > > > +++ b/include/linux/migrate.h > > > > > @@ -180,6 +180,11 @@ static inline unsigned long migrate_pfn(unsigned long pfn) > > > > > return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID; > > > > > } > > > > > +enum migrate_vma_direction { > > > > > + MIGRATE_VMA_FROM_SYSTEM, > > > > > + MIGRATE_VMA_FROM_DEVICE_PRIVATE, > > > > > +}; > > > > > > > > I would have guessed this is more natural as _FROM_DEVICE_ and > > > > TO_DEVICE_ ? > > > > > > The caller controls where the destination memory is allocated so it isn't > > > necessarily device private memory, it could be from system to system. > > > The use case for system to system memory migration is for hardware > > > like ARM SMMU or PCIe ATS where a single set of page tables is shared by > > > the device and a CPU process over a coherent system memory bus. > > > Also many integrated GPUs in SOCs fall into this category too. > > > > Maybe just TO/FROM_DEIVCE then? Even though the memory is not > > DEVICE_PRIVATE it is still device owned pages right? > > > > > So to me, it makes more sense to specify the direction based on the > > > source location. > > > > It feels strange because the driver doesn't always know or control the > > source? > > The driver can't really know where the source is currently located because the > API is designed to not initially hold the page locks, migrate_vma_setup() only knows > the source once it holds the page table locks and isolates/locks the pages being > migrated. The direction and pgmap_owner are supposed to filter which pages > the caller is interested in migrating. 
> Perhaps the direction should instead be a flags field with separate bits for > system memory and device private memory selecting source candidates for > migration. I can imagine use cases for all 4 combinations of > d->d, d->s, s->d, and s->s being valid. > > I didn't really think a direction was needed, this was something that > Christoph Hellwig seemed to think made the API safer. If it is a filter then just using those names would make sense MIGRATE_VMA_SELECT_SYSTEM MIGRATE_VMA_SELECT_DEVICE_PRIVATE SYSTEM feels like the wrong name too, doesn't linux have a formal name for RAM struct pages? In your future coherent design how would the migrate select 'device' pages that are fully coherent? Are they still zone something pages that are OK for CPU usage? Jason
On 7/20/20 4:16 PM, Jason Gunthorpe wrote: > On Mon, Jul 20, 2020 at 01:49:09PM -0700, Ralph Campbell wrote: >> >> On 7/20/20 12:59 PM, Jason Gunthorpe wrote: >>> On Mon, Jul 20, 2020 at 12:54:53PM -0700, Ralph Campbell wrote: >>>>>> diff --git a/include/linux/migrate.h b/include/linux/migrate.h >>>>>> index 3e546cbf03dd..620f2235d7d4 100644 >>>>>> +++ b/include/linux/migrate.h >>>>>> @@ -180,6 +180,11 @@ static inline unsigned long migrate_pfn(unsigned long pfn) >>>>>> return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID; >>>>>> } >>>>>> +enum migrate_vma_direction { >>>>>> + MIGRATE_VMA_FROM_SYSTEM, >>>>>> + MIGRATE_VMA_FROM_DEVICE_PRIVATE, >>>>>> +}; >>>>> >>>>> I would have guessed this is more natural as _FROM_DEVICE_ and >>>>> TO_DEVICE_ ? >>>> >>>> The caller controls where the destination memory is allocated so it isn't >>>> necessarily device private memory, it could be from system to system. >>>> The use case for system to system memory migration is for hardware >>>> like ARM SMMU or PCIe ATS where a single set of page tables is shared by >>>> the device and a CPU process over a coherent system memory bus. >>>> Also many integrated GPUs in SOCs fall into this category too. >>> >>> Maybe just TO/FROM_DEIVCE then? Even though the memory is not >>> DEVICE_PRIVATE it is still device owned pages right? >>> >>>> So to me, it makes more sense to specify the direction based on the >>>> source location. >>> >>> It feels strange because the driver doesn't always know or control the >>> source? >> >> The driver can't really know where the source is currently located because the >> API is designed to not initially hold the page locks, migrate_vma_setup() only knows >> the source once it holds the page table locks and isolates/locks the pages being >> migrated. The direction and pgmap_owner are supposed to filter which pages >> the caller is interested in migrating. 
>> Perhaps the direction should instead be a flags field with separate bits for >> system memory and device private memory selecting source candidates for >> migration. I can imagine use cases for all 4 combinations of >> d->d, d->s, s->d, and s->s being valid. >> >> I didn't really think a direction was needed, this was something that >> Christoph Hellwig seemed to think made the API safer. > > If it is a filter then just using those names would make sense > > MIGRATE_VMA_SELECT_SYSTEM > MIGRATE_VMA_SELECT_DEVICE_PRIVATE > > SYSTEM feels like the wrong name too, doesn't linux have a formal name > for RAM struct pages? Highmem? Movable? Zone normal? There are quite a few :-) At the moment, only anonymous pages are being migrated but I expect file backed pages to be supported at some point (but not DAX). VM_PFNMAP and VM_MIXEDMAP might make sense some day with peer-to-peer copies. So MIGRATE_VMA_SELECT_SYSTEM seems OK to me. > In your future coherent design how would the migrate select 'device' > pages that are fully coherent? Are they still zone something pages > that are OK for CPU usage? > > Jason > For pages that are device private, the pgmap_owner selects them (plus the MIGRATE_VMA_SELECT_DEVICE_PRIVATE flag). For pages that are migrating from system memory to system memory, I expect the pages to be in different NUMA zones. Otherwise, there wouldn't be much point in migrating them. And yes, the CPU can access them. It might be useful to have a filter saying "migrate system memory not already in NUMA zone X" if the MIGRATE_VMA_SELECT_SYSTEM flag is set. Also, in support of the flags field, I'm looking at THP migration and I can picture defining some request flags like hmm_range_fault() to say "migrate THPs if they exist, otherwise split THPs". A default_flags MIGRATE_PFN_REQ_FAULT would be useful if the source page is swapped out. 
Currently, migrate_vma_setup() just skips these pages without any indication to the caller of why the page isn't being migrated or whether retrying is worth attempting.
diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 09d8119024db..acbf14cd2d72 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -400,6 +400,7 @@ kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start, mig.end = end; mig.src = &src_pfn; mig.dst = &dst_pfn; + mig.dir = MIGRATE_VMA_FROM_SYSTEM; /* * We come here with mmap_lock write lock held just for @@ -578,6 +579,7 @@ kvmppc_svm_page_out(struct vm_area_struct *vma, unsigned long start, mig.src = &src_pfn; mig.dst = &dst_pfn; mig.src_owner = &kvmppc_uvmem_pgmap; + mig.dir = MIGRATE_VMA_FROM_DEVICE_PRIVATE; mutex_lock(&kvm->arch.uvmem_lock); /* The requested page is already paged-out, nothing to do */ diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index e5c230d9ae24..e5c83b8ee82e 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -183,6 +183,7 @@ static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf) .src = &src, .dst = &dst, .src_owner = drm->dev, + .dir = MIGRATE_VMA_FROM_DEVICE_PRIVATE, }; /* @@ -615,6 +616,7 @@ nouveau_dmem_migrate_vma(struct nouveau_drm *drm, struct migrate_vma args = { .vma = vma, .start = start, + .dir = MIGRATE_VMA_FROM_SYSTEM, }; unsigned long i; u64 *pfns; diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 3e546cbf03dd..620f2235d7d4 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -180,6 +180,11 @@ static inline unsigned long migrate_pfn(unsigned long pfn) return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID; } +enum migrate_vma_direction { + MIGRATE_VMA_FROM_SYSTEM, + MIGRATE_VMA_FROM_DEVICE_PRIVATE, +}; + struct migrate_vma { struct vm_area_struct *vma; /* @@ -199,11 +204,12 @@ struct migrate_vma { /* * Set to the owner value also stored in page->pgmap->owner for - * migrating out of device private memory. 
If set only device - * private pages with this owner are migrated. If not set - * device private pages are not migrated at all. + * migrating device private memory. The direction also needs to + * be set to MIGRATE_VMA_FROM_DEVICE_PRIVATE. */ void *src_owner; + + enum migrate_vma_direction dir; }; int migrate_vma_setup(struct migrate_vma *args); diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 9aa577afc269..1bd60cfb5a25 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -703,6 +703,7 @@ static int dmirror_migrate(struct dmirror *dmirror, args.start = addr; args.end = next; args.src_owner = NULL; + args.dir = MIGRATE_VMA_FROM_SYSTEM; ret = migrate_vma_setup(&args); if (ret) goto out; @@ -1054,6 +1055,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) args.src = &src_pfns; args.dst = &dst_pfns; args.src_owner = dmirror->mdevice; + args.dir = MIGRATE_VMA_FROM_DEVICE_PRIVATE; if (migrate_vma_setup(&args)) return VM_FAULT_SIGBUS; diff --git a/mm/migrate.c b/mm/migrate.c index f37729673558..2bbc5c4c672e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2287,7 +2287,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, goto next; page = device_private_entry_to_page(entry); - if (page->pgmap->owner != migrate->src_owner) + if (migrate->dir != MIGRATE_VMA_FROM_DEVICE_PRIVATE || + page->pgmap->owner != migrate->src_owner) goto next; mpfn = migrate_pfn(page_to_pfn(page)) | @@ -2295,7 +2296,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, if (is_write_device_private_entry(entry)) mpfn |= MIGRATE_PFN_WRITE; } else { - if (migrate->src_owner) + if (migrate->dir != MIGRATE_VMA_FROM_SYSTEM) goto next; pfn = pte_pfn(pte); if (is_zero_pfn(pfn)) {