Message ID | 20211112150824.11028-9-joao.m.martins@oracle.com (mailing list archive)
---|---
State | New
Series | mm, dax: Introduce compound pages in devmap
On Fri, Nov 12, 2021 at 04:08:24PM +0100, Joao Martins wrote:
> diff --git a/drivers/dax/device.c b/drivers/dax/device.c
> index a65c67ab5ee0..0c2ac97d397d 100644
> +++ b/drivers/dax/device.c
> @@ -192,6 +192,42 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
>  }
>  #endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
>
> +static void set_page_mapping(struct vm_fault *vmf, pfn_t pfn,
> +			     unsigned long fault_size,
> +			     struct address_space *f_mapping)
> +{
> +	unsigned long i;
> +	pgoff_t pgoff;
> +
> +	pgoff = linear_page_index(vmf->vma, ALIGN(vmf->address, fault_size));
> +
> +	for (i = 0; i < fault_size / PAGE_SIZE; i++) {
> +		struct page *page;
> +
> +		page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
> +		if (page->mapping)
> +			continue;
> +		page->mapping = f_mapping;
> +		page->index = pgoff + i;
> +	}
> +}
> +
> +static void set_compound_mapping(struct vm_fault *vmf, pfn_t pfn,
> +				 unsigned long fault_size,
> +				 struct address_space *f_mapping)
> +{
> +	struct page *head;
> +
> +	head = pfn_to_page(pfn_t_to_pfn(pfn));
> +	head = compound_head(head);
> +	if (head->mapping)
> +		return;
> +
> +	head->mapping = f_mapping;
> +	head->index = linear_page_index(vmf->vma,
> +			ALIGN(vmf->address, fault_size));
> +}

Should this stuff be set up before doing vmf_insert_pfn_XX?

In normal cases the page should be returned in the vmf and populated
to the page tables by the core code after all this is done.

dax can't do that because of the refcount mess, but I would think the
basic ordering should be the same.

Jason
On 11/12/21 16:34, Jason Gunthorpe wrote:
> On Fri, Nov 12, 2021 at 04:08:24PM +0100, Joao Martins wrote:
>
>> diff --git a/drivers/dax/device.c b/drivers/dax/device.c
>> index a65c67ab5ee0..0c2ac97d397d 100644
>> +++ b/drivers/dax/device.c
>> @@ -192,6 +192,42 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
>>  }
>>  #endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
>>
>> +static void set_page_mapping(struct vm_fault *vmf, pfn_t pfn,
>> +			     unsigned long fault_size,
>> +			     struct address_space *f_mapping)
>> +{
>> +	unsigned long i;
>> +	pgoff_t pgoff;
>> +
>> +	pgoff = linear_page_index(vmf->vma, ALIGN(vmf->address, fault_size));
>> +
>> +	for (i = 0; i < fault_size / PAGE_SIZE; i++) {
>> +		struct page *page;
>> +
>> +		page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
>> +		if (page->mapping)
>> +			continue;
>> +		page->mapping = f_mapping;
>> +		page->index = pgoff + i;
>> +	}
>> +}
>> +
>> +static void set_compound_mapping(struct vm_fault *vmf, pfn_t pfn,
>> +				 unsigned long fault_size,
>> +				 struct address_space *f_mapping)
>> +{
>> +	struct page *head;
>> +
>> +	head = pfn_to_page(pfn_t_to_pfn(pfn));
>> +	head = compound_head(head);
>> +	if (head->mapping)
>> +		return;
>> +
>> +	head->mapping = f_mapping;
>> +	head->index = linear_page_index(vmf->vma,
>> +			ALIGN(vmf->address, fault_size));
>> +}
>
> Should this stuff be set up before doing vmf_insert_pfn_XX?
>
Interestingly, filesystem-dax does this, but not device-dax.

set_page_mapping/set_compound_mapping() could be moved to before and
then torn down on @rc != VM_FAULT_NOPAGE (failure). I am not sure
what the benefit is in this series, besides the ordering (that you
hinted at below)?

> In normal cases the page should be returned in the vmf and populated
> to the page tables by the core code after all this is done.
>
So I suppose, by call-site examples, 'core code' is either the hugetlbfs
call to __filemap_add_folio() (in the hugetlbfs fault handler),
shmem_add_to_page_cache(), or the anon equivalent.
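(For context: the filesystem-dax ordering referred to above looks roughly
like the sketch below, paraphrased from the fs/dax.c dax_insert_entry()
flow of this era -- a simplification for illustration, not a quote from
the thread.)

	/* fs/dax.c, simplified: the pages backing the entry get their
	 * page->mapping/page->index set first, via dax_associate_entry() */
	dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);

	/* ... and only afterwards does the fault path install the actual
	 * page table entry, e.g.: */
	ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);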
On Mon, Nov 15, 2021 at 01:11:32PM +0100, Joao Martins wrote:
> On 11/12/21 16:34, Jason Gunthorpe wrote:
> > On Fri, Nov 12, 2021 at 04:08:24PM +0100, Joao Martins wrote:
> >
> >> diff --git a/drivers/dax/device.c b/drivers/dax/device.c
> >> index a65c67ab5ee0..0c2ac97d397d 100644
> >> +++ b/drivers/dax/device.c
> >> @@ -192,6 +192,42 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
> >>  }
> >>  #endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
> >>
> >> +static void set_page_mapping(struct vm_fault *vmf, pfn_t pfn,
> >> +			     unsigned long fault_size,
> >> +			     struct address_space *f_mapping)
> >> +{
> >> +	unsigned long i;
> >> +	pgoff_t pgoff;
> >> +
> >> +	pgoff = linear_page_index(vmf->vma, ALIGN(vmf->address, fault_size));
> >> +
> >> +	for (i = 0; i < fault_size / PAGE_SIZE; i++) {
> >> +		struct page *page;
> >> +
> >> +		page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
> >> +		if (page->mapping)
> >> +			continue;
> >> +		page->mapping = f_mapping;
> >> +		page->index = pgoff + i;
> >> +	}
> >> +}
> >> +
> >> +static void set_compound_mapping(struct vm_fault *vmf, pfn_t pfn,
> >> +				 unsigned long fault_size,
> >> +				 struct address_space *f_mapping)
> >> +{
> >> +	struct page *head;
> >> +
> >> +	head = pfn_to_page(pfn_t_to_pfn(pfn));
> >> +	head = compound_head(head);
> >> +	if (head->mapping)
> >> +		return;
> >> +
> >> +	head->mapping = f_mapping;
> >> +	head->index = linear_page_index(vmf->vma,
> >> +			ALIGN(vmf->address, fault_size));
> >> +}
> >
> > Should this stuff be set up before doing vmf_insert_pfn_XX?
> >
>
> Interestingly, filesystem-dax does this, but not device-dax.

I think it may be a bug?

> set_page_mapping/set_compound_mapping() could be moved to before and
> then torn down on @rc != VM_FAULT_NOPAGE (failure). I am not sure
> what the benefit is in this series, besides the ordering (that you
> hinted at below)?

Well, it should probably be fixed in a precursor patch.

I think the general idea is that page->mapping/index are stable once
the page is published in a PTE?

> > In normal cases the page should be returned in the vmf and populated
> > to the page tables by the core code after all this is done.
>
> So I suppose, by call-site examples, 'core code' is either the hugetlbfs
> call to __filemap_add_folio() (in the hugetlbfs fault handler),
> shmem_add_to_page_cache(), or the anon equivalent.

I was talking more about the normal page insertion flow, which is done
by setting vmf->page and then returning. finish_fault() will install
the PTE.

If this is the best way then I would expect some future where maybe
there is a vmf->folio and finish_fault() will install a PUD/PMD/PTE
and we don't call vmf_insert_pfn_XX in DAX.

Jason
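(To illustrate the "normal" flow Jason describes, here is a minimal,
hypothetical ->fault handler -- not code from this series; my_lookup_page()
is an invented placeholder, and page locking is elided:)

static vm_fault_t my_fault(struct vm_fault *vmf)
{
	/* Driver-specific lookup; page->mapping/page->index are already
	 * set and stable before the page is handed to the core mm. */
	struct page *page = my_lookup_page(vmf->vma, vmf->pgoff);

	if (!page)
		return VM_FAULT_SIGBUS;

	/* Return the page in the vmf; the core mm then installs the
	 * PTE via finish_fault() after this handler returns. */
	vmf->page = page;
	return 0;
}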
On 11/15/21 17:49, Jason Gunthorpe wrote:
> On Mon, Nov 15, 2021 at 01:11:32PM +0100, Joao Martins wrote:
>> On 11/12/21 16:34, Jason Gunthorpe wrote:
>>> On Fri, Nov 12, 2021 at 04:08:24PM +0100, Joao Martins wrote:
>>>> diff --git a/drivers/dax/device.c b/drivers/dax/device.c
>>>> index a65c67ab5ee0..0c2ac97d397d 100644
>>>> +++ b/drivers/dax/device.c
>>>> @@ -192,6 +192,42 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
>>>>  }
>>>>  #endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
>>>>
>>>> +static void set_page_mapping(struct vm_fault *vmf, pfn_t pfn,
>>>> +			     unsigned long fault_size,
>>>> +			     struct address_space *f_mapping)
>>>> +{
>>>> +	unsigned long i;
>>>> +	pgoff_t pgoff;
>>>> +
>>>> +	pgoff = linear_page_index(vmf->vma, ALIGN(vmf->address, fault_size));
>>>> +
>>>> +	for (i = 0; i < fault_size / PAGE_SIZE; i++) {
>>>> +		struct page *page;
>>>> +
>>>> +		page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
>>>> +		if (page->mapping)
>>>> +			continue;
>>>> +		page->mapping = f_mapping;
>>>> +		page->index = pgoff + i;
>>>> +	}
>>>> +}
>>>> +
>>>> +static void set_compound_mapping(struct vm_fault *vmf, pfn_t pfn,
>>>> +				 unsigned long fault_size,
>>>> +				 struct address_space *f_mapping)
>>>> +{
>>>> +	struct page *head;
>>>> +
>>>> +	head = pfn_to_page(pfn_t_to_pfn(pfn));
>>>> +	head = compound_head(head);
>>>> +	if (head->mapping)
>>>> +		return;
>>>> +
>>>> +	head->mapping = f_mapping;
>>>> +	head->index = linear_page_index(vmf->vma,
>>>> +			ALIGN(vmf->address, fault_size));
>>>> +}
>>>
>>> Should this stuff be set up before doing vmf_insert_pfn_XX?
>>>
>>
>> Interestingly, filesystem-dax does this, but not device-dax.
>
> I think it may be a bug?
>
Possibly.

Dan, any thoughts (see also below)? You probably hold all that
history since its inception on commit 2232c6382a4 ("device-dax: Enable page_mapping()")
and commit 35de299547d1 ("device-dax: Set page->index").

>> set_page_mapping/set_compound_mapping() could be moved to before and
>> then torn down on @rc != VM_FAULT_NOPAGE (failure). I am not sure
>> what the benefit is in this series, besides the ordering (that you
>> hinted at below)?
>
> Well, it should probably be fixed in a precursor patch.
>
Yeap -- I would move the page_mapping change prior to introducing
set_compound_mapping(). Now that I think about it again, with that
logic it makes more sense to amend it inside set_page_mapping() --
that should mean less nesting in the fault handler.

> I think the general idea is that page->mapping/index are stable once
> the page is published in a PTE?
>
/me nods

>>> In normal cases the page should be returned in the vmf and populated
>>> to the page tables by the core code after all this is done.
>>
>> So I suppose, by call-site examples, 'core code' is either the hugetlbfs
>> call to __filemap_add_folio() (in the hugetlbfs fault handler),
>> shmem_add_to_page_cache(), or the anon equivalent.
>
> I was talking more about the normal page insertion flow, which is done
> by setting vmf->page and then returning. finish_fault() will install
> the PTE.
>
I misunderstood you earlier -- I thought you were suggesting I look at
how mapping/index is set (in the context of the flow you just described).

Joao
On Fri, Nov 12, 2021 at 04:08:24PM +0100, Joao Martins wrote:
> Use the newly added compound devmap facility which maps the assigned dax
> ranges as compound pages at a page size of @align.
>
> dax devices are created with a fixed @align (huge page size) which is
> also enforced at mmap() of the device. Faults, consequently, also happen
> at the @align specified at creation, and that doesn't change throughout
> the dax device's lifetime. MCEs unmap a whole dax huge page, as do
> splits occurring at the configured page size.
>
> Performance measured by gup_test improves considerably for
> unpin_user_pages() and altmap with NVDIMMs:
>
> $ gup_test -f /dev/dax1.0 -m 16384 -r 10 -S -a -n 512 -w
> (pin_user_pages_fast 2M pages) put:~71 ms -> put:~22 ms
> [altmap]
> (pin_user_pages_fast 2M pages) get:~524ms put:~525 ms -> get: ~127ms put:~71ms
>
> $ gup_test -f /dev/dax1.0 -m 129022 -r 10 -S -a -n 512 -w
> (pin_user_pages_fast 2M pages) put:~513 ms -> put:~188 ms
> [altmap with -m 127004]
> (pin_user_pages_fast 2M pages) get:~4.1 secs put:~4.12 secs -> get:~1sec put:~563ms
>
> .. as well as unpin_user_page_range_dirty_lock() being just as effective
> as THP/hugetlb[0] pages.
>
> [0] https://lore.kernel.org/linux-mm/20210212130843.13865-5-joao.m.martins@oracle.com/
>
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> Reviewed-by: Dan Williams <dan.j.williams@intel.com>
> ---
>  drivers/dax/device.c | 57 ++++++++++++++++++++++++++++++++++----------
>  1 file changed, 44 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/dax/device.c b/drivers/dax/device.c
> index a65c67ab5ee0..0c2ac97d397d 100644
> --- a/drivers/dax/device.c
> +++ b/drivers/dax/device.c
> @@ -192,6 +192,42 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
>  }
>  #endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
>
> +static void set_page_mapping(struct vm_fault *vmf, pfn_t pfn,
> +			     unsigned long fault_size,
> +			     struct address_space *f_mapping)
> +{
> +	unsigned long i;
> +	pgoff_t pgoff;
> +
> +	pgoff = linear_page_index(vmf->vma, ALIGN(vmf->address, fault_size));
> +
> +	for (i = 0; i < fault_size / PAGE_SIZE; i++) {
> +		struct page *page;
> +
> +		page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
> +		if (page->mapping)
> +			continue;
> +		page->mapping = f_mapping;
> +		page->index = pgoff + i;
> +	}
> +}

No need to pass f_mapping here, it must be vmf->vma->vm_file->f_mapping.

> +static void set_compound_mapping(struct vm_fault *vmf, pfn_t pfn,
> +				 unsigned long fault_size,
> +				 struct address_space *f_mapping)
> +{
> +	struct page *head;
> +
> +	head = pfn_to_page(pfn_t_to_pfn(pfn));
> +	head = compound_head(head);
> +	if (head->mapping)
> +		return;
> +
> +	head->mapping = f_mapping;
> +	head->index = linear_page_index(vmf->vma,
> +			ALIGN(vmf->address, fault_size));
> +}

Same here.

>  	if (rc == VM_FAULT_NOPAGE) {
> -		unsigned long i;
> -		pgoff_t pgoff;
> +		struct dev_pagemap *pgmap = dev_dax->pgmap;

If you're touching this anyway: why not do an early return here for
the error case to simplify the flow?
On 11/17/21 10:43, Christoph Hellwig wrote:
> On Fri, Nov 12, 2021 at 04:08:24PM +0100, Joao Martins wrote:
>> Use the newly added compound devmap facility which maps the assigned dax
>> ranges as compound pages at a page size of @align.
>>
>> dax devices are created with a fixed @align (huge page size) which is
>> also enforced at mmap() of the device. Faults, consequently, also happen
>> at the @align specified at creation, and that doesn't change throughout
>> the dax device's lifetime. MCEs unmap a whole dax huge page, as do
>> splits occurring at the configured page size.
>>
>> Performance measured by gup_test improves considerably for
>> unpin_user_pages() and altmap with NVDIMMs:
>>
>> $ gup_test -f /dev/dax1.0 -m 16384 -r 10 -S -a -n 512 -w
>> (pin_user_pages_fast 2M pages) put:~71 ms -> put:~22 ms
>> [altmap]
>> (pin_user_pages_fast 2M pages) get:~524ms put:~525 ms -> get: ~127ms put:~71ms
>>
>> $ gup_test -f /dev/dax1.0 -m 129022 -r 10 -S -a -n 512 -w
>> (pin_user_pages_fast 2M pages) put:~513 ms -> put:~188 ms
>> [altmap with -m 127004]
>> (pin_user_pages_fast 2M pages) get:~4.1 secs put:~4.12 secs -> get:~1sec put:~563ms
>>
>> .. as well as unpin_user_page_range_dirty_lock() being just as effective
>> as THP/hugetlb[0] pages.
>>
>> [0] https://lore.kernel.org/linux-mm/20210212130843.13865-5-joao.m.martins@oracle.com/
>>
>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>> Reviewed-by: Dan Williams <dan.j.williams@intel.com>
>> ---
>>  drivers/dax/device.c | 57 ++++++++++++++++++++++++++++++++++----------
>>  1 file changed, 44 insertions(+), 13 deletions(-)
>>
>> diff --git a/drivers/dax/device.c b/drivers/dax/device.c
>> index a65c67ab5ee0..0c2ac97d397d 100644
>> --- a/drivers/dax/device.c
>> +++ b/drivers/dax/device.c
>> @@ -192,6 +192,42 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
>>  }
>>  #endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
>>
>> +static void set_page_mapping(struct vm_fault *vmf, pfn_t pfn,
>> +			     unsigned long fault_size,
>> +			     struct address_space *f_mapping)
>> +{
>> +	unsigned long i;
>> +	pgoff_t pgoff;
>> +
>> +	pgoff = linear_page_index(vmf->vma, ALIGN(vmf->address, fault_size));
>> +
>> +	for (i = 0; i < fault_size / PAGE_SIZE; i++) {
>> +		struct page *page;
>> +
>> +		page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
>> +		if (page->mapping)
>> +			continue;
>> +		page->mapping = f_mapping;
>> +		page->index = pgoff + i;
>> +	}
>> +}
>
> No need to pass f_mapping here, it must be vmf->vma->vm_file->f_mapping.
>
Hmmm, good point -- if I keep this structure, then yeah, I will nuke
@f_mapping. Should I move the @mapping setting to before
vmf_insert_pfn*() (as Jason suggests), then the @f_mapping argument
might be useful for clearing it on @rc != VM_FAULT_NOPAGE.

>> +static void set_compound_mapping(struct vm_fault *vmf, pfn_t pfn,
>> +				 unsigned long fault_size,
>> +				 struct address_space *f_mapping)
>> +{
>> +	struct page *head;
>> +
>> +	head = pfn_to_page(pfn_t_to_pfn(pfn));
>> +	head = compound_head(head);
>> +	if (head->mapping)
>> +		return;
>> +
>> +	head->mapping = f_mapping;
>> +	head->index = linear_page_index(vmf->vma,
>> +			ALIGN(vmf->address, fault_size));
>> +}
>
> Same here.
>
/me nods

>>  	if (rc == VM_FAULT_NOPAGE) {
>> -		unsigned long i;
>> -		pgoff_t pgoff;
>> +		struct dev_pagemap *pgmap = dev_dax->pgmap;
>
> If you're touching this anyway: why not do an early return here for
> the error case to simplify the flow?
>
Yeah, I was thinking of doing that, i.e. calling set_compound_mapping()
earlier, or even inside set_page_mapping(). Albeit this would need a new
argument (the dev_dax).
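(A rough sketch of what both suggestions combined could look like --
deriving the mapping internally plus an early return for errors;
illustrative only, not the code that was eventually applied:)

static void set_page_mapping(struct vm_fault *vmf, pfn_t pfn,
			     unsigned long fault_size)
{
	/* Derive the mapping instead of taking it as an argument. */
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	unsigned long i;
	pgoff_t pgoff;

	pgoff = linear_page_index(vmf->vma, ALIGN(vmf->address, fault_size));

	for (i = 0; i < fault_size / PAGE_SIZE; i++) {
		struct page *page = pfn_to_page(pfn_t_to_pfn(pfn) + i);

		if (page->mapping)
			continue;
		page->mapping = mapping;
		page->index = pgoff + i;
	}
}

	/* ... while dev_dax_huge_fault() bails out early on error: */
	if (rc != VM_FAULT_NOPAGE) {
		dax_read_unlock(id);
		return rc;
	}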
On 11/16/21 16:38, Joao Martins wrote:
> On 11/15/21 17:49, Jason Gunthorpe wrote:
>> On Mon, Nov 15, 2021 at 01:11:32PM +0100, Joao Martins wrote:
>>> On 11/12/21 16:34, Jason Gunthorpe wrote:
>>>> On Fri, Nov 12, 2021 at 04:08:24PM +0100, Joao Martins wrote:
>>>>> diff --git a/drivers/dax/device.c b/drivers/dax/device.c
>>>>> index a65c67ab5ee0..0c2ac97d397d 100644
>>>>> +++ b/drivers/dax/device.c
>>>>> @@ -192,6 +192,42 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
>>>>>  }
>>>>>  #endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
>>>>>
>>>>> +static void set_page_mapping(struct vm_fault *vmf, pfn_t pfn,
>>>>> +			     unsigned long fault_size,
>>>>> +			     struct address_space *f_mapping)
>>>>> +{
>>>>> +	unsigned long i;
>>>>> +	pgoff_t pgoff;
>>>>> +
>>>>> +	pgoff = linear_page_index(vmf->vma, ALIGN(vmf->address, fault_size));
>>>>> +
>>>>> +	for (i = 0; i < fault_size / PAGE_SIZE; i++) {
>>>>> +		struct page *page;
>>>>> +
>>>>> +		page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
>>>>> +		if (page->mapping)
>>>>> +			continue;
>>>>> +		page->mapping = f_mapping;
>>>>> +		page->index = pgoff + i;
>>>>> +	}
>>>>> +}
>>>>> +
>>>>> +static void set_compound_mapping(struct vm_fault *vmf, pfn_t pfn,
>>>>> +				 unsigned long fault_size,
>>>>> +				 struct address_space *f_mapping)
>>>>> +{
>>>>> +	struct page *head;
>>>>> +
>>>>> +	head = pfn_to_page(pfn_t_to_pfn(pfn));
>>>>> +	head = compound_head(head);
>>>>> +	if (head->mapping)
>>>>> +		return;
>>>>> +
>>>>> +	head->mapping = f_mapping;
>>>>> +	head->index = linear_page_index(vmf->vma,
>>>>> +			ALIGN(vmf->address, fault_size));
>>>>> +}
>>>>
>>>> Should this stuff be set up before doing vmf_insert_pfn_XX?
>>>>
>>>
>>> Interestingly, filesystem-dax does this, but not device-dax.
>>
>> I think it may be a bug?
>>
> Possibly.
>
> Dan, any thoughts (see also below)? You probably hold all that
> history since its inception on commit 2232c6382a4 ("device-dax: Enable page_mapping()")
> and commit 35de299547d1 ("device-dax: Set page->index").
>
Below is what I have staged so far as a precursor patch (see below the
scissors mark).

It also lets me simplify the compound page case for __dax_set_mapping()
in this patch, like the diff below.

But I still wonder whether this ordering adjustment of the @mapping
setting is best placed as a precursor patch whenever pgmap/page refcount
changes happen. Anyways, it's just a thought.
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 80824e460fbf..35706214778e 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -78,15 +78,21 @@ static void __dax_set_mapping(struct vm_fault *vmf, pfn_t pfn,
 			      struct address_space *f_mapping)
 {
 	struct address_space *c_mapping = vmf->vma->vm_file->f_mapping;
+	struct dev_dax *dev_dax = vmf->vma->vm_file->private_data;
 	unsigned long i, nr_pages = fault_size / PAGE_SIZE;
 	pgoff_t pgoff;
 
+	/* mapping is only set on the head */
+	if (dev_dax->pgmap->vmemmap_shift)
+		nr_pages = 1;
+
 	pgoff = linear_page_index(vmf->vma,
 			ALIGN(vmf->address, fault_size));
 
 	for (i = 0; i < nr_pages; i++) {
 		struct page *page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
 
+		page = compound_head(page);
 		if (page->mapping &&
 		    (!f_mapping && page->mapping != c_mapping))
 			continue;
@@ -473,6 +479,9 @@ int dev_dax_probe(struct dev_dax *dev_dax)
 	}
 
 	pgmap->type = MEMORY_DEVICE_GENERIC;
+	if (dev_dax->align > PAGE_SIZE)
+		pgmap->vmemmap_shift =
+			order_base_2(dev_dax->align >> PAGE_SHIFT);
 	addr = devm_memremap_pages(dev, pgmap);
 	if (IS_ERR(addr))
 		return PTR_ERR(addr);

----------------------------------->8-------------------------------------

From: Joao Martins <joao.m.martins@oracle.com>
Subject: [PATCH] device-dax: set mapping prior to vmf_insert_pfn{,_pmd,pud}()

Normally, the @page mapping is set prior to inserting the page into a
page table entry. Make device-dax adhere to the same ordering, rather
than setting the mapping after the PTE is inserted.

Care is taken to clear the mapping on a vmf_insert_pfn* failure
(rc != VM_FAULT_NOPAGE). Thus the mapping is only cleared when we have
a valid @pfn -- which is set right before we call vmf_insert_pfn*() --
and only if the mapping set on the page is the one recorded in the
fault handler data (@vmf).
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
---
 drivers/dax/device.c | 79 +++++++++++++++++++++++++++++++-------------
 1 file changed, 56 insertions(+), 23 deletions(-)

diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 630de5a795b0..80824e460fbf 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -73,6 +73,43 @@ __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
 	return -1;
 }
 
+static void __dax_set_mapping(struct vm_fault *vmf, pfn_t pfn,
+			      unsigned long fault_size,
+			      struct address_space *f_mapping)
+{
+	struct address_space *c_mapping = vmf->vma->vm_file->f_mapping;
+	unsigned long i, nr_pages = fault_size / PAGE_SIZE;
+	pgoff_t pgoff;
+
+	pgoff = linear_page_index(vmf->vma,
+			ALIGN(vmf->address, fault_size));
+
+	for (i = 0; i < nr_pages; i++) {
+		struct page *page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
+
+		if (page->mapping &&
+		    (!f_mapping && page->mapping != c_mapping))
+			continue;
+
+		page->mapping = f_mapping;
+		page->index = pgoff + i;
+	}
+}
+
+static void dax_set_mapping(struct vm_fault *vmf, pfn_t pfn,
+			    unsigned long fault_size)
+{
+	struct address_space *c_mapping = vmf->vma->vm_file->f_mapping;
+
+	__dax_set_mapping(vmf, pfn, fault_size, c_mapping);
+}
+
+static void dax_clear_mapping(struct vm_fault *vmf, pfn_t pfn,
+			      unsigned long fault_size)
+{
+	__dax_set_mapping(vmf, pfn, fault_size, NULL);
+}
+
 static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
 				struct vm_fault *vmf, pfn_t *pfn)
 {
@@ -100,6 +137,8 @@ static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
 
 	*pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
 
+	dax_set_mapping(vmf, *pfn, fault_size);
+
 	return vmf_insert_mixed(vmf->vma, vmf->address, *pfn);
 }
 
@@ -140,6 +179,8 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
 
 	*pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
 
+	dax_set_mapping(vmf, *pfn, fault_size);
+
 	return vmf_insert_pfn_pmd(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE);
 }
 
@@ -182,6 +223,8 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
 
 	*pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
 
+	dax_set_mapping(vmf, *pfn, fault_size);
+
 	return vmf_insert_pfn_pud(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE);
 }
 #else
@@ -199,7 +242,7 @@ static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf,
 	unsigned long fault_size;
 	vm_fault_t rc = VM_FAULT_SIGBUS;
 	int id;
-	pfn_t pfn;
+	pfn_t pfn = { .val = 0 };
 	struct dev_dax *dev_dax = filp->private_data;
 
 	dev_dbg(&dev_dax->dev, "%s: %s (%#lx - %#lx) size = %d\n", current->comm,
@@ -224,28 +267,18 @@ static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf,
 		rc = VM_FAULT_SIGBUS;
 	}
 
-	if (rc == VM_FAULT_NOPAGE) {
-		unsigned long i;
-		pgoff_t pgoff;
-
-		/*
-		 * In the device-dax case the only possibility for a
-		 * VM_FAULT_NOPAGE result is when device-dax capacity is
-		 * mapped. No need to consider the zero page, or racing
-		 * conflicting mappings.
-		 */
-		pgoff = linear_page_index(vmf->vma,
-				ALIGN(vmf->address, fault_size));
-		for (i = 0; i < fault_size / PAGE_SIZE; i++) {
-			struct page *page;
-
-			page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
-			if (page->mapping)
-				continue;
-			page->mapping = filp->f_mapping;
-			page->index = pgoff + i;
-		}
-	}
+	/*
+	 * In the device-dax case the only possibility for a
+	 * VM_FAULT_NOPAGE result is when device-dax capacity is
+	 * mapped. No need to consider the zero page, or racing
+	 * conflicting mappings.
+	 * We could get VM_FAULT_FALLBACK without even attempting
+	 * to insert the page table entry. So make sure we test
+	 * for the error code with a devmap @pfn value which is
+	 * set right before vmf_insert_pfn*().
+	 */
+	if (rc != VM_FAULT_NOPAGE && pfn_t_devmap(pfn))
+		dax_clear_mapping(vmf, pfn, fault_size);
 
 	dax_read_unlock(id);
 
 	return rc;
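(Condensed, the ordering the staged precursor patch establishes looks
like the following -- distilled from the diff above; steps 2 and 3
actually straddle __dev_dax_pmd_fault() and dev_dax_huge_fault():)

	*pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
	dax_set_mapping(vmf, *pfn, fault_size);		/* 1. publish mapping/index */
	rc = vmf_insert_pfn_pmd(vmf, *pfn,		/* 2. install the entry */
				vmf->flags & FAULT_FLAG_WRITE);
	...
	if (rc != VM_FAULT_NOPAGE && pfn_t_devmap(pfn))
		dax_clear_mapping(vmf, pfn, fault_size);	/* 3. undo on failure */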
On Fri, Nov 19, 2021 at 04:12:18PM +0000, Joao Martins wrote:
> > Dan, any thoughts (see also below)? You probably hold all that
> > history since its inception on commit 2232c6382a4 ("device-dax: Enable page_mapping()")
> > and commit 35de299547d1 ("device-dax: Set page->index").
> >
> Below is what I have staged so far as a precursor patch (see below the
> scissors mark).
>
> It also lets me simplify the compound page case for __dax_set_mapping()
> in this patch, like the diff below.
>
> But I still wonder whether this ordering adjustment of the @mapping
> setting is best placed as a precursor patch whenever pgmap/page refcount
> changes happen. Anyways, it's just a thought.

Naively I would have thought you'd set the mapping on all pages when
you create the address_space and allocate pages into it. AFAIK devmap
assigns all pages to a single address_space, so shouldn't the mapping
just be done once?

Jason
On 11/19/21 16:55, Jason Gunthorpe wrote:
> On Fri, Nov 19, 2021 at 04:12:18PM +0000, Joao Martins wrote:
>
>>> Dan, any thoughts (see also below)? You probably hold all that
>>> history since its inception on commit 2232c6382a4 ("device-dax: Enable page_mapping()")
>>> and commit 35de299547d1 ("device-dax: Set page->index").
>>>
>> Below is what I have staged so far as a precursor patch (see below the
>> scissors mark).
>>
>> It also lets me simplify the compound page case for __dax_set_mapping()
>> in this patch, like the diff below.
>>
>> But I still wonder whether this ordering adjustment of the @mapping
>> setting is best placed as a precursor patch whenever pgmap/page refcount
>> changes happen. Anyways, it's just a thought.
>
> Naively I would have thought you'd set the mapping on all pages when
> you create the address_space and allocate pages into it.

Today in fsdax/device-dax (hugetlb too) this is set on fault and set once
only (as you say) on the mapped pages. fsdax WARN_ON()s when you clear
a page mapping that was not set to the expected address_space (similar to
what I did here).

Similar to what I do in the previous snippet I pasted -- except that maybe
I should just clear the mapping rather than clearing only if f_mapping is
the expected one.

> AFAIK devmap
> assigns all pages to a single address_space, so shouldn't the mapping
> just be done once?

Isn't it a bit more efficient to set it only when you try to map a page?
I take it that's why it is being done this way.

Joao
On Fri, Nov 19, 2021 at 07:26:44PM +0000, Joao Martins wrote:
> On 11/19/21 16:55, Jason Gunthorpe wrote:
> > On Fri, Nov 19, 2021 at 04:12:18PM +0000, Joao Martins wrote:
> >
> >>> Dan, any thoughts (see also below)? You probably hold all that
> >>> history since its inception on commit 2232c6382a4 ("device-dax: Enable page_mapping()")
> >>> and commit 35de299547d1 ("device-dax: Set page->index").
> >>>
> >> Below is what I have staged so far as a precursor patch (see below the
> >> scissors mark).
> >>
> >> It also lets me simplify the compound page case for __dax_set_mapping()
> >> in this patch, like the diff below.
> >>
> >> But I still wonder whether this ordering adjustment of the @mapping
> >> setting is best placed as a precursor patch whenever pgmap/page refcount
> >> changes happen. Anyways, it's just a thought.
> >
> > Naively I would have thought you'd set the mapping on all pages when
> > you create the address_space and allocate pages into it.
>
> Today in fsdax/device-dax (hugetlb too) this is set on fault and set once
> only (as you say) on the mapped pages. fsdax WARN_ON()s when you clear
> a page mapping that was not set to the expected address_space (similar to
> what I did here).

I would imagine that a normal FS case is to allocate some new memory
and then join it to the address_space and set mapping, so that makes
sense.

For fsdax, logically the DAX pages on the medium with struct pages
could be in the address_space as soon as the inode is created. That
would improve fault performance at the cost of making address_space
creation a lot slower, so I can see why not to do that.

> > AFAIK devmap
> > assigns all pages to a single address_space, so shouldn't the mapping
> > just be done once?
>
> Isn't it a bit more efficient to set it only when you try to map a page?

For devdax, if you can set the address space as part of initializing
each struct page and setting up the compounds, it would probably be a
net win?

Anyhow, I think what you did here is OK? Maybe I don't understand the
question 'whenever pgmap/page refcount changes happen'.

Jason
On 11/19/21 19:53, Jason Gunthorpe wrote:
> On Fri, Nov 19, 2021 at 07:26:44PM +0000, Joao Martins wrote:
>> On 11/19/21 16:55, Jason Gunthorpe wrote:
>>> On Fri, Nov 19, 2021 at 04:12:18PM +0000, Joao Martins wrote:
>>>
>>>>> Dan, any thoughts (see also below)? You probably hold all that
>>>>> history since its inception on commit 2232c6382a4 ("device-dax: Enable page_mapping()")
>>>>> and commit 35de299547d1 ("device-dax: Set page->index").
>>>>>
>>>> Below is what I have staged so far as a precursor patch (see below the
>>>> scissors mark).
>>>>
>>>> It also lets me simplify the compound page case for __dax_set_mapping()
>>>> in this patch, like the diff below.
>>>>
>>>> But I still wonder whether this ordering adjustment of the @mapping
>>>> setting is best placed as a precursor patch whenever pgmap/page refcount
>>>> changes happen. Anyways, it's just a thought.
>>>
>>> Naively I would have thought you'd set the mapping on all pages when
>>> you create the address_space and allocate pages into it.
>>
>> Today in fsdax/device-dax (hugetlb too) this is set on fault and set once
>> only (as you say) on the mapped pages. fsdax WARN_ON()s when you clear
>> a page mapping that was not set to the expected address_space (similar to
>> what I did here).
>
> I would imagine that a normal FS case is to allocate some new memory
> and then join it to the address_space and set mapping, so that makes
> sense.
>
> For fsdax, logically the DAX pages on the medium with struct pages
> could be in the address_space as soon as the inode is created. That
> would improve fault performance at the cost of making address_space
> creation a lot slower, so I can see why not to do that.
>
>>> AFAIK devmap
>>> assigns all pages to a single address_space, so shouldn't the mapping
>>> just be done once?
>>
>> Isn't it a bit more efficient to set it only when you try to map a page?
>
> For devdax, if you can set the address space as part of initializing
> each struct page and setting up the compounds, it would probably be a
> net win?
>
Provided that we only set it on the head, yes -- it would have a
negligible cost over region bringup as it only touches the head
mapping. With base pages on device-dax, though, the zone init cost
would probably jump considerably.

> Anyhow, I think what you did here is OK?

Yeah, I wanted to hear Dan's thoughts -- or maybe I should just respin
the series with the added cleanup.

Joao
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index a65c67ab5ee0..0c2ac97d397d 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -192,6 +192,42 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
 }
 #endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 
+static void set_page_mapping(struct vm_fault *vmf, pfn_t pfn,
+			     unsigned long fault_size,
+			     struct address_space *f_mapping)
+{
+	unsigned long i;
+	pgoff_t pgoff;
+
+	pgoff = linear_page_index(vmf->vma, ALIGN(vmf->address, fault_size));
+
+	for (i = 0; i < fault_size / PAGE_SIZE; i++) {
+		struct page *page;
+
+		page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
+		if (page->mapping)
+			continue;
+		page->mapping = f_mapping;
+		page->index = pgoff + i;
+	}
+}
+
+static void set_compound_mapping(struct vm_fault *vmf, pfn_t pfn,
+				 unsigned long fault_size,
+				 struct address_space *f_mapping)
+{
+	struct page *head;
+
+	head = pfn_to_page(pfn_t_to_pfn(pfn));
+	head = compound_head(head);
+	if (head->mapping)
+		return;
+
+	head->mapping = f_mapping;
+	head->index = linear_page_index(vmf->vma,
+			ALIGN(vmf->address, fault_size));
+}
+
 static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf,
 		enum page_entry_size pe_size)
 {
@@ -225,8 +261,7 @@ static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf,
 	}
 
 	if (rc == VM_FAULT_NOPAGE) {
-		unsigned long i;
-		pgoff_t pgoff;
+		struct dev_pagemap *pgmap = dev_dax->pgmap;
 
 		/*
 		 * In the device-dax case the only possibility for a
@@ -234,17 +269,10 @@ static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf,
 		 * mapped. No need to consider the zero page, or racing
 		 * conflicting mappings.
 		 */
-		pgoff = linear_page_index(vmf->vma,
-				ALIGN(vmf->address, fault_size));
-		for (i = 0; i < fault_size / PAGE_SIZE; i++) {
-			struct page *page;
-
-			page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
-			if (page->mapping)
-				continue;
-			page->mapping = filp->f_mapping;
-			page->index = pgoff + i;
-		}
+		if (pgmap->vmemmap_shift)
+			set_compound_mapping(vmf, pfn, fault_size, filp->f_mapping);
+		else
+			set_page_mapping(vmf, pfn, fault_size, filp->f_mapping);
 	}
 
 	dax_read_unlock(id);
@@ -439,6 +467,9 @@ int dev_dax_probe(struct dev_dax *dev_dax)
 	}
 
 	pgmap->type = MEMORY_DEVICE_GENERIC;
+	if (dev_dax->align > PAGE_SIZE)
+		pgmap->vmemmap_shift =
+			order_base_2(dev_dax->align >> PAGE_SHIFT);
 	addr = devm_memremap_pages(dev, pgmap);
 	if (IS_ERR(addr))
 		return PTR_ERR(addr);
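(A worked example of the final hunk, assuming 4 KiB base pages; the
numbers follow from the code and are not stated in the thread:)

	/*
	 * pgmap->vmemmap_shift = order_base_2(align >> PAGE_SHIFT)
	 *
	 *   align = 2M: order_base_2(2M >> 12)  = order_base_2(512)    = 9
	 *   align = 1G: order_base_2(1G >> 12)  = order_base_2(262144) = 18
	 *
	 * Each fault then maps one compound page of 2^vmemmap_shift base
	 * pages, and set_compound_mapping() touches only the head page
	 * instead of fault_size / PAGE_SIZE individual pages.
	 */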