@@ -119,6 +119,7 @@ struct dmirror_device {
unsigned long calloc;
unsigned long cfree;
struct page *free_pages;
+ struct folio *free_folios;
spinlock_t lock; /* protects the above */
};
@@ -492,7 +493,7 @@ static int dmirror_write(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd)
}
static int dmirror_allocate_chunk(struct dmirror_device *mdevice,
- struct page **ppage)
+ struct page **ppage, bool is_large)
{
struct dmirror_chunk *devmem;
struct resource *res = NULL;
@@ -572,20 +573,45 @@ static int dmirror_allocate_chunk(struct dmirror_device *mdevice,
pfn_first, pfn_last);
spin_lock(&mdevice->lock);
- for (pfn = pfn_first; pfn < pfn_last; pfn++) {
+ for (pfn = pfn_first; pfn < pfn_last; ) {
struct page *page = pfn_to_page(pfn);
+ if (is_large && IS_ALIGNED(pfn, HPAGE_PMD_NR) &&
+     (pfn + HPAGE_PMD_NR <= pfn_last)) {
+ page->zone_device_data = mdevice->free_folios;
+ mdevice->free_folios = page_folio(page);
+ pfn += HPAGE_PMD_NR;
+ continue;
+ }
+
page->zone_device_data = mdevice->free_pages;
mdevice->free_pages = page;
+ pfn++;
}
+
+ ret = 0;
if (ppage) {
- *ppage = mdevice->free_pages;
- mdevice->free_pages = (*ppage)->zone_device_data;
- mdevice->calloc++;
+ if (is_large) {
+ if (!mdevice->free_folios) {
+ ret = -ENOMEM;
+ goto err_unlock;
+ }
+ *ppage = folio_page(mdevice->free_folios, 0);
+ mdevice->free_folios = (*ppage)->zone_device_data;
+ mdevice->calloc += HPAGE_PMD_NR;
+ } else if (mdevice->free_pages) {
+ *ppage = mdevice->free_pages;
+ mdevice->free_pages = (*ppage)->zone_device_data;
+ mdevice->calloc++;
+ } else {
+ ret = -ENOMEM;
+ goto err_unlock;
+ }
}
+err_unlock:
spin_unlock(&mdevice->lock);
- return 0;
+ return ret;
err_release:
mutex_unlock(&mdevice->devmem_lock);
@@ -598,10 +624,13 @@ static int dmirror_allocate_chunk(struct dmirror_device *mdevice,
return ret;
}
-static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
+static struct page *dmirror_devmem_alloc_page(struct dmirror *dmirror,
+ bool is_large)
{
struct page *dpage = NULL;
struct page *rpage = NULL;
+ unsigned int order = is_large ? HPAGE_PMD_ORDER : 0;
+ struct dmirror_device *mdevice = dmirror->mdevice;
/*
* For ZONE_DEVICE private type, this is a fake device so we allocate
@@ -610,49 +639,55 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
* data and ignore rpage.
*/
if (dmirror_is_private_zone(mdevice)) {
- rpage = alloc_page(GFP_HIGHUSER);
+ rpage = folio_page(folio_alloc(GFP_HIGHUSER, order), 0);
if (!rpage)
return NULL;
}
spin_lock(&mdevice->lock);
- if (mdevice->free_pages) {
+ if (is_large && mdevice->free_folios) {
+ dpage = folio_page(mdevice->free_folios, 0);
+ mdevice->free_folios = dpage->zone_device_data;
+ mdevice->calloc += 1 << order;
+ spin_unlock(&mdevice->lock);
+ } else if (!is_large && mdevice->free_pages) {
dpage = mdevice->free_pages;
mdevice->free_pages = dpage->zone_device_data;
mdevice->calloc++;
spin_unlock(&mdevice->lock);
} else {
spin_unlock(&mdevice->lock);
- if (dmirror_allocate_chunk(mdevice, &dpage))
+ if (dmirror_allocate_chunk(mdevice, &dpage, is_large))
goto error;
}
- zone_device_page_init(dpage);
+ init_zone_device_folio(page_folio(dpage), order);
dpage->zone_device_data = rpage;
return dpage;
error:
if (rpage)
- __free_page(rpage);
+ __free_pages(rpage, order);
return NULL;
}
static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
struct dmirror *dmirror)
{
- struct dmirror_device *mdevice = dmirror->mdevice;
const unsigned long *src = args->src;
unsigned long *dst = args->dst;
unsigned long addr;
- for (addr = args->start; addr < args->end; addr += PAGE_SIZE,
- src++, dst++) {
+ for (addr = args->start; addr < args->end; ) {
struct page *spage;
struct page *dpage;
struct page *rpage;
+ bool is_large = *src & MIGRATE_PFN_COMPOUND;
+ int write = (*src & MIGRATE_PFN_WRITE) ? MIGRATE_PFN_WRITE : 0;
+ unsigned long nr = 1;
if (!(*src & MIGRATE_PFN_MIGRATE))
- continue;
+ goto next;
/*
* Note that spage might be NULL which is OK since it is an
@@ -662,17 +697,45 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
if (WARN(spage && is_zone_device_page(spage),
"page already in device spage pfn: 0x%lx\n",
page_to_pfn(spage)))
+ goto next;
+
+ dpage = dmirror_devmem_alloc_page(dmirror, is_large);
+ if (!dpage) {
+ struct folio *folio;
+ unsigned long i;
+ unsigned long spfn = *src >> MIGRATE_PFN_SHIFT;
+ struct page *src_page;
+
+ if (!is_large)
+ goto next;
+
+ if (!spage) {
+ nr = HPAGE_PMD_NR;
+ } else {
+ folio = page_folio(spage);
+ nr = folio_nr_pages(folio);
+ }
+
+ for (i = 0; i < nr && addr < args->end; i++) {
+ dpage = dmirror_devmem_alloc_page(dmirror, false);
+ if (!dpage)
+ goto next;
+
+ rpage = BACKING_PAGE(dpage);
+ rpage->zone_device_data = dmirror;
+
+ *dst = migrate_pfn(page_to_pfn(dpage)) | write;
+ src_page = pfn_to_page(spfn + i);
+
+ if (spage)
+ copy_highpage(rpage, src_page);
+ else
+ clear_highpage(rpage);
+ src++;
+ dst++;
+ addr += PAGE_SIZE;
+ }
continue;
-
- dpage = dmirror_devmem_alloc_page(mdevice);
- if (!dpage)
- continue;
+ }
rpage = BACKING_PAGE(dpage);
- if (spage)
- copy_highpage(rpage, spage);
- else
- clear_highpage(rpage);
/*
* Normally, a device would use the page->zone_device_data to
@@ -684,10 +747,42 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
pr_debug("migrating from sys to dev pfn src: 0x%lx pfn dst: 0x%lx\n",
page_to_pfn(spage), page_to_pfn(dpage));
- *dst = migrate_pfn(page_to_pfn(dpage));
- if ((*src & MIGRATE_PFN_WRITE) ||
- (!spage && args->vma->vm_flags & VM_WRITE))
- *dst |= MIGRATE_PFN_WRITE;
+
+ *dst = migrate_pfn(page_to_pfn(dpage)) | write;
+
+ if (is_large) {
+ int i;
+ struct folio *folio = page_folio(dpage);
+ *dst |= MIGRATE_PFN_COMPOUND;
+
+ if (folio_test_large(folio)) {
+ for (i = 0; i < folio_nr_pages(folio); i++) {
+ struct page *dst_page =
+ pfn_to_page(page_to_pfn(rpage) + i);
+ struct page *src_page =
+ pfn_to_page(page_to_pfn(spage) + i);
+
+ if (spage)
+ copy_highpage(dst_page, src_page);
+ else
+ clear_highpage(dst_page);
+ src++;
+ dst++;
+ addr += PAGE_SIZE;
+ }
+ continue;
+ }
+ }
+
+ if (spage)
+ copy_highpage(rpage, spage);
+ else
+ clear_highpage(rpage);
+
+next:
+ src++;
+ dst++;
+ addr += PAGE_SIZE;
}
}
@@ -734,14 +829,17 @@ static int dmirror_migrate_finalize_and_map(struct migrate_vma *args,
const unsigned long *src = args->src;
const unsigned long *dst = args->dst;
unsigned long pfn;
+ const unsigned long start_pfn = start >> PAGE_SHIFT;
+ const unsigned long end_pfn = end >> PAGE_SHIFT;
/* Map the migrated pages into the device's page tables. */
mutex_lock(&dmirror->mutex);
- for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++,
- src++, dst++) {
+ for (pfn = start_pfn; pfn < end_pfn; pfn++, src++, dst++) {
struct page *dpage;
void *entry;
+ int nr, i;
+ struct page *rpage;
if (!(*src & MIGRATE_PFN_MIGRATE))
continue;
@@ -750,13 +848,25 @@ static int dmirror_migrate_finalize_and_map(struct migrate_vma *args,
if (!dpage)
continue;
- entry = BACKING_PAGE(dpage);
- if (*dst & MIGRATE_PFN_WRITE)
- entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
- entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
- if (xa_is_err(entry)) {
- mutex_unlock(&dmirror->mutex);
- return xa_err(entry);
+ if (*dst & MIGRATE_PFN_COMPOUND)
+ nr = folio_nr_pages(page_folio(dpage));
+ else
+ nr = 1;
+
+ WARN_ON_ONCE(pfn + nr > end_pfn);
+
+ rpage = BACKING_PAGE(dpage);
+ VM_BUG_ON(folio_nr_pages(page_folio(rpage)) != nr);
+
+ for (i = 0; i < nr; i++) {
+ entry = folio_page(page_folio(rpage), i);
+ if (*dst & MIGRATE_PFN_WRITE)
+ entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
+ entry = xa_store(&dmirror->pt, pfn + i, entry, GFP_ATOMIC);
+ if (xa_is_err(entry)) {
+ mutex_unlock(&dmirror->mutex);
+ return xa_err(entry);
+ }
}
}
@@ -829,31 +939,61 @@ static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
unsigned long start = args->start;
unsigned long end = args->end;
unsigned long addr;
+ unsigned int order = 0;
+ int i;
- for (addr = start; addr < end; addr += PAGE_SIZE,
- src++, dst++) {
+ for (addr = start; addr < end; ) {
struct page *dpage, *spage;
+ order = 0;
spage = migrate_pfn_to_page(*src);
if (!spage || !(*src & MIGRATE_PFN_MIGRATE))
- continue;
+ goto next;
if (WARN_ON(!is_device_private_page(spage) &&
!is_device_coherent_page(spage)))
- continue;
+ goto next;
spage = BACKING_PAGE(spage);
- dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
- if (!dpage)
- continue;
- pr_debug("migrating from dev to sys pfn src: 0x%lx pfn dst: 0x%lx\n",
- page_to_pfn(spage), page_to_pfn(dpage));
+ order = folio_order(page_folio(spage));
+ if (order)
+ dpage = folio_page(vma_alloc_folio(GFP_HIGHUSER_MOVABLE,
+ order, args->vma, addr), 0);
+ else
+ dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
+
+ /* Fall back to a small page if the large allocation fails */
+ if (!dpage && order) {
+ dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
+ order = 0;
+ }
+ if (!dpage)
+ return VM_FAULT_OOM;
+
+ pr_debug("migrating from sys to dev pfn src: 0x%lx pfn dst: 0x%lx\n",
+ page_to_pfn(spage), page_to_pfn(dpage));
lock_page(dpage);
xa_erase(&dmirror->pt, addr >> PAGE_SHIFT);
copy_highpage(dpage, spage);
*dst = migrate_pfn(page_to_pfn(dpage));
if (*src & MIGRATE_PFN_WRITE)
*dst |= MIGRATE_PFN_WRITE;
+ if (order)
+ *dst |= MIGRATE_PFN_COMPOUND;
+
+ for (i = 0; i < (1 << order); i++) {
+ struct page *src_page;
+ struct page *dst_page;
+
+ src_page = pfn_to_page(page_to_pfn(spage) + i);
+ dst_page = pfn_to_page(page_to_pfn(dpage) + i);
+
+ xa_erase(&dmirror->pt, (addr >> PAGE_SHIFT) + i);
+ copy_highpage(dst_page, src_page);
+ }
+next:
+ addr += PAGE_SIZE << order;
+ src += 1 << order;
+ dst += 1 << order;
}
return 0;
}
@@ -939,8 +1079,8 @@ static int dmirror_migrate_to_device(struct dmirror *dmirror,
unsigned long size = cmd->npages << PAGE_SHIFT;
struct mm_struct *mm = dmirror->notifier.mm;
struct vm_area_struct *vma;
- unsigned long src_pfns[64] = { 0 };
- unsigned long dst_pfns[64] = { 0 };
+ unsigned long *src_pfns = NULL;
+ unsigned long *dst_pfns = NULL;
struct dmirror_bounce bounce;
struct migrate_vma args = { 0 };
unsigned long next;
@@ -955,6 +1095,18 @@ static int dmirror_migrate_to_device(struct dmirror *dmirror,
if (!mmget_not_zero(mm))
return -EINVAL;
+ ret = -ENOMEM;
+ src_pfns = kmalloc_array(PTRS_PER_PTE, sizeof(*src_pfns),
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ if (!src_pfns)
+ goto free_mem;
+
+ dst_pfns = kmalloc_array(PTRS_PER_PTE, sizeof(*dst_pfns),
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ if (!dst_pfns)
+ goto free_mem;
+
+ ret = 0;
mmap_read_lock(mm);
for (addr = start; addr < end; addr = next) {
vma = vma_lookup(mm, addr);
@@ -962,7 +1114,7 @@ static int dmirror_migrate_to_device(struct dmirror *dmirror,
ret = -EINVAL;
goto out;
}
- next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT));
+ next = min(end, addr + (PTRS_PER_PTE << PAGE_SHIFT));
if (next > vma->vm_end)
next = vma->vm_end;
@@ -972,7 +1124,8 @@ static int dmirror_migrate_to_device(struct dmirror *dmirror,
args.start = addr;
args.end = next;
args.pgmap_owner = dmirror->mdevice;
- args.flags = MIGRATE_VMA_SELECT_SYSTEM;
+ args.flags = MIGRATE_VMA_SELECT_SYSTEM |
+ MIGRATE_VMA_SELECT_COMPOUND;
ret = migrate_vma_setup(&args);
if (ret)
goto out;
@@ -992,7 +1145,7 @@ static int dmirror_migrate_to_device(struct dmirror *dmirror,
*/
ret = dmirror_bounce_init(&bounce, start, size);
if (ret)
- return ret;
+ goto free_mem;
mutex_lock(&dmirror->mutex);
ret = dmirror_do_read(dmirror, start, end, &bounce);
mutex_unlock(&dmirror->mutex);
@@ -1003,11 +1156,14 @@ static int dmirror_migrate_to_device(struct dmirror *dmirror,
}
cmd->cpages = bounce.cpages;
dmirror_bounce_fini(&bounce);
- return ret;
+ goto free_mem;
out:
mmap_read_unlock(mm);
mmput(mm);
+free_mem:
+ kfree(src_pfns);
+ kfree(dst_pfns);
return ret;
}
@@ -1200,6 +1356,7 @@ static void dmirror_device_evict_chunk(struct dmirror_chunk *chunk)
unsigned long i;
unsigned long *src_pfns;
unsigned long *dst_pfns;
+ unsigned int order = 0;
src_pfns = kvcalloc(npages, sizeof(*src_pfns), GFP_KERNEL | __GFP_NOFAIL);
dst_pfns = kvcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL | __GFP_NOFAIL);
@@ -1215,13 +1372,25 @@ static void dmirror_device_evict_chunk(struct dmirror_chunk *chunk)
if (WARN_ON(!is_device_private_page(spage) &&
!is_device_coherent_page(spage)))
continue;
+
+ order = folio_order(page_folio(spage));
spage = BACKING_PAGE(spage);
- dpage = alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_NOFAIL);
+ if (src_pfns[i] & MIGRATE_PFN_COMPOUND) {
+ dpage = folio_page(folio_alloc(GFP_HIGHUSER_MOVABLE,
+ order), 0);
+ } else {
+ dpage = alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_NOFAIL);
+ order = 0;
+ }
+
+ /* TODO Support splitting here */
+ if (WARN_ON_ONCE(!dpage))
+ continue;
lock_page(dpage);
- copy_highpage(dpage, spage);
dst_pfns[i] = migrate_pfn(page_to_pfn(dpage));
if (src_pfns[i] & MIGRATE_PFN_WRITE)
dst_pfns[i] |= MIGRATE_PFN_WRITE;
+ if (order)
+ dst_pfns[i] |= MIGRATE_PFN_COMPOUND;
+ folio_copy(page_folio(dpage), page_folio(spage));
}
migrate_device_pages(src_pfns, dst_pfns, npages);
migrate_device_finalize(src_pfns, dst_pfns, npages);
@@ -1234,7 +1403,12 @@ static void dmirror_remove_free_pages(struct dmirror_chunk *devmem)
{
struct dmirror_device *mdevice = devmem->mdevice;
struct page *page;
+ struct folio *folio;
+
+ for (folio = mdevice->free_folios; folio; folio = folio_zone_device_data(folio))
+ if (dmirror_page_to_chunk(folio_page(folio, 0)) == devmem)
+ mdevice->free_folios = folio_zone_device_data(folio);
for (page = mdevice->free_pages; page; page = page->zone_device_data)
if (dmirror_page_to_chunk(page) == devmem)
mdevice->free_pages = page->zone_device_data;
@@ -1265,6 +1439,7 @@ static void dmirror_device_remove_chunks(struct dmirror_device *mdevice)
mdevice->devmem_count = 0;
mdevice->devmem_capacity = 0;
mdevice->free_pages = NULL;
+ mdevice->free_folios = NULL;
kfree(mdevice->devmem_chunks);
mdevice->devmem_chunks = NULL;
}
@@ -1378,18 +1553,29 @@ static void dmirror_devmem_free(struct page *page)
{
struct page *rpage = BACKING_PAGE(page);
struct dmirror_device *mdevice;
+ struct folio *folio = page_folio(page);
+ unsigned int order = folio_order(folio);
- if (rpage != page)
- __free_page(rpage);
+ if (rpage != page) {
+ if (order)
+ __free_pages(rpage, order);
+ else
+ __free_page(rpage);
+ }
mdevice = dmirror_page_to_device(page);
spin_lock(&mdevice->lock);
/* Return page to our allocator if not freeing the chunk */
if (!dmirror_page_to_chunk(page)->remove) {
- mdevice->cfree++;
- page->zone_device_data = mdevice->free_pages;
- mdevice->free_pages = page;
+ mdevice->cfree += 1 << order;
+ if (order) {
+ page->zone_device_data = mdevice->free_folios;
+ mdevice->free_folios = folio;
+ } else {
+ page->zone_device_data = mdevice->free_pages;
+ mdevice->free_pages = page;
+ }
}
spin_unlock(&mdevice->lock);
}
@@ -1397,11 +1583,10 @@ static void dmirror_devmem_free(struct page *page)
static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
{
struct migrate_vma args = { 0 };
- unsigned long src_pfns = 0;
- unsigned long dst_pfns = 0;
struct page *rpage;
struct dmirror *dmirror;
- vm_fault_t ret;
+ vm_fault_t ret = 0;
+ unsigned int order, nr;
/*
* Normally, a device would use the page->zone_device_data to point to
@@ -1412,21 +1597,36 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
dmirror = rpage->zone_device_data;
/* FIXME demonstrate how we can adjust migrate range */
+ order = folio_order(page_folio(vmf->page));
+ nr = 1 << order;
+
+ /*
+ * A per-CPU cache of src and dst pfns could avoid these allocations,
+ * but it may not scale well with a large number of CPUs.
+ */
+ args.start = ALIGN_DOWN(vmf->address, (1 << (PAGE_SHIFT + order)));
args.vma = vmf->vma;
- args.start = vmf->address;
- args.end = args.start + PAGE_SIZE;
- args.src = &src_pfns;
- args.dst = &dst_pfns;
+ args.end = args.start + (PAGE_SIZE << order);
+ args.src = kcalloc(nr, sizeof(*args.src), GFP_KERNEL);
+ args.dst = kcalloc(nr, sizeof(*args.dst), GFP_KERNEL);
args.pgmap_owner = dmirror->mdevice;
args.flags = dmirror_select_device(dmirror);
args.fault_page = vmf->page;
+ if (!args.src || !args.dst) {
+ ret = VM_FAULT_OOM;
+ goto err;
+ }
+
+ if (order)
+ args.flags |= MIGRATE_VMA_SELECT_COMPOUND;
+
- if (migrate_vma_setup(&args))
- return VM_FAULT_SIGBUS;
+ if (migrate_vma_setup(&args)) {
+ ret = VM_FAULT_SIGBUS;
+ goto err;
+ }
ret = dmirror_devmem_fault_alloc_and_copy(&args, dmirror);
if (ret)
- return ret;
+ goto err;
migrate_vma_pages(&args);
/*
* No device finalize step is needed since
@@ -1434,12 +1634,16 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
* invalidated the device page table.
*/
migrate_vma_finalize(&args);
- return 0;
+err:
+ kfree(args.src);
+ kfree(args.dst);
+ return ret;
}
static const struct dev_pagemap_ops dmirror_devmem_ops = {
.page_free = dmirror_devmem_free,
.migrate_to_ram = dmirror_devmem_fault,
};
static int dmirror_device_init(struct dmirror_device *mdevice, int id)
@@ -1465,7 +1669,7 @@ static int dmirror_device_init(struct dmirror_device *mdevice, int id)
return ret;
/* Build a list of free ZONE_DEVICE struct pages */
- return dmirror_allocate_chunk(mdevice, NULL);
+ return dmirror_allocate_chunk(mdevice, NULL, false);
}
static void dmirror_device_remove(struct dmirror_device *mdevice)
Enhance the hmm test driver (lib/test_hmm) with support for THP pages.

A new pool of free folios (free_folios) has been added to the dmirror
device, from which a folio is allocated when a THP zone device private
page is requested. Add compound page awareness to the allocation
function for both normal and fault based migration. These routines now
copy folio_nr_pages() pages when moving data between system memory and
device memory.

args.src and args.dst, which hold the migration entries, are now
dynamically allocated, since they need to hold HPAGE_PMD_NR entries or
more.

Split and migrate support will be added in future patches in this
series.

Signed-off-by: Balbir Singh <balbirs@nvidia.com>
---
 lib/test_hmm.c | 342 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 273 insertions(+), 69 deletions(-)
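
For reviewers who want to see the free-list convention in isolation: the
sketch below is not part of the patch, and the helper names
(demo_push_folio, demo_pop_folio) are invented for illustration only. It
shows how the driver threads the new free_folios list through
page->zone_device_data, with the head page of a PMD-sized folio acting as
the list node; callers are assumed to hold mdevice->lock.

/* Illustration only; not part of this patch. Assumes mdevice->lock is held. */
static void demo_push_folio(struct dmirror_device *mdevice, struct folio *folio)
{
	/* The head page's zone_device_data links to the previous list head. */
	folio_page(folio, 0)->zone_device_data = mdevice->free_folios;
	mdevice->free_folios = folio;
}

static struct folio *demo_pop_folio(struct dmirror_device *mdevice)
{
	struct folio *folio = mdevice->free_folios;

	if (folio)
		mdevice->free_folios = folio_page(folio, 0)->zone_device_data;
	return folio;
}

The existing order-0 free_pages list follows the same pattern, which is
why dmirror_devmem_free() and dmirror_remove_free_pages() can treat the
two lists symmetrically.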