Message ID: 20210827145819.16471-9-joao.m.martins@oracle.com (mailing list archive)
State: New, archived
Series: mm, sparse-vmemmap: Introduce compound devmaps for device-dax
On Fri, Aug 27, 2021 at 03:58:13PM +0100, Joao Martins wrote: > #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) > static int __gup_device_huge(unsigned long pfn, unsigned long addr, > unsigned long end, unsigned int flags, > struct page **pages, int *nr) > { > - int nr_start = *nr; > + int refs, nr_start = *nr; > struct dev_pagemap *pgmap = NULL; > int ret = 1; > > do { > - struct page *page = pfn_to_page(pfn); > + struct page *head, *page = pfn_to_page(pfn); > + unsigned long next = addr + PAGE_SIZE; > > pgmap = get_dev_pagemap(pfn, pgmap); > if (unlikely(!pgmap)) { > @@ -2252,16 +2265,25 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, > ret = 0; > break; > } > - SetPageReferenced(page); > - pages[*nr] = page; > - if (unlikely(!try_grab_page(page, flags))) { > - undo_dev_pagemap(nr, nr_start, flags, pages); > + > + head = compound_head(page); > + /* @end is assumed to be limited at most one compound page */ > + if (PageHead(head)) > + next = end; > + refs = record_subpages(page, addr, next, pages + *nr); > + > + SetPageReferenced(head); > + if (unlikely(!try_grab_compound_head(head, refs, flags))) { > + if (PageHead(head)) > + ClearPageReferenced(head); > + else > + undo_dev_pagemap(nr, nr_start, flags, pages); > ret = 0; > break; Why is this special cased for devmap? Shouldn't everything processing pud/pmds/etc use the same basic loop that is similar in idea to the 'for_each_compound_head' scheme in unpin_user_pages_dirty_lock()? Doesn't that work for all the special page type cases here? Jason
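For context, the unpin-side scheme referenced above walks the pages[] array in (head, ntails) chunks so that the refcount work is done once per compound page rather than once per tail. A simplified paraphrase of the mm/gup.c iterator of that era (from memory, not a verbatim copy) looks roughly like this:

	/*
	 * Find the compound head covering list[i] and count how many of the
	 * following entries share it, so the caller can do the refcount
	 * update once per compound page instead of once per tail page.
	 */
	static inline void compound_next(unsigned long i, unsigned long npages,
					 struct page **list, struct page **head,
					 unsigned int *ntails)
	{
		struct page *page;
		unsigned int nr;

		if (i >= npages)
			return;

		page = compound_head(list[i]);
		for (nr = i + 1; nr < npages; nr++) {
			if (compound_head(list[nr]) != page)
				break;
		}

		*head = page;
		*ntails = nr - i;
	}

	#define for_each_compound_head(__i, __list, __npages, __head, __ntails) \
		for (__i = 0, \
		     compound_next(__i, __npages, __list, &(__head), &(__ntails)); \
		     __i < __npages; __i += __ntails, \
		     compound_next(__i, __npages, __list, &(__head), &(__ntails)))

unpin_user_pages_dirty_lock() then does its dirty/put work once per (head, ntails) pair produced by this loop.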
On 8/27/21 5:25 PM, Jason Gunthorpe wrote: > On Fri, Aug 27, 2021 at 03:58:13PM +0100, Joao Martins wrote: > >> #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) >> static int __gup_device_huge(unsigned long pfn, unsigned long addr, >> unsigned long end, unsigned int flags, >> struct page **pages, int *nr) >> { >> - int nr_start = *nr; >> + int refs, nr_start = *nr; >> struct dev_pagemap *pgmap = NULL; >> int ret = 1; >> >> do { >> - struct page *page = pfn_to_page(pfn); >> + struct page *head, *page = pfn_to_page(pfn); >> + unsigned long next = addr + PAGE_SIZE; >> >> pgmap = get_dev_pagemap(pfn, pgmap); >> if (unlikely(!pgmap)) { >> @@ -2252,16 +2265,25 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, >> ret = 0; >> break; >> } >> - SetPageReferenced(page); >> - pages[*nr] = page; >> - if (unlikely(!try_grab_page(page, flags))) { >> - undo_dev_pagemap(nr, nr_start, flags, pages); >> + >> + head = compound_head(page); >> + /* @end is assumed to be limited at most one compound page */ >> + if (PageHead(head)) >> + next = end; >> + refs = record_subpages(page, addr, next, pages + *nr); >> + >> + SetPageReferenced(head); >> + if (unlikely(!try_grab_compound_head(head, refs, flags))) { >> + if (PageHead(head)) >> + ClearPageReferenced(head); >> + else >> + undo_dev_pagemap(nr, nr_start, flags, pages); >> ret = 0; >> break; > > Why is this special cased for devmap? > > Shouldn't everything processing pud/pmds/etc use the same basic loop > that is similar in idea to the 'for_each_compound_head' scheme in > unpin_user_pages_dirty_lock()? > > Doesn't that work for all the special page type cases here? We are iterating over PFNs to create an array of base pages (regardless of page table type), rather than iterating over an array of pages to work on. Given that all these gup functions already give you the boundary (end of pmd or end of pud, etc) then we just need to grab the ref to pgmap and head page and save the tails. But sadly we need to handle the base page case which is why there's this outer loop exists sadly. If it was just head pages we wouldn't need the outer loop (and hence no for_each_compound_head, like the hugetlb variant). But maybe I am being dense and you just mean to replace the outer loop with for_each_compound_range(). I am a little stuck on the part that I anyways need to record back the tail pages when iterating over the (single) head page. So I feel that it wouldn't bring that much improvement, unless I missed your point. Joao
On Fri, Aug 27, 2021 at 07:34:54PM +0100, Joao Martins wrote: > On 8/27/21 5:25 PM, Jason Gunthorpe wrote: > > On Fri, Aug 27, 2021 at 03:58:13PM +0100, Joao Martins wrote: > > > >> #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) > >> static int __gup_device_huge(unsigned long pfn, unsigned long addr, > >> unsigned long end, unsigned int flags, > >> struct page **pages, int *nr) > >> { > >> - int nr_start = *nr; > >> + int refs, nr_start = *nr; > >> struct dev_pagemap *pgmap = NULL; > >> int ret = 1; > >> > >> do { > >> - struct page *page = pfn_to_page(pfn); > >> + struct page *head, *page = pfn_to_page(pfn); > >> + unsigned long next = addr + PAGE_SIZE; > >> > >> pgmap = get_dev_pagemap(pfn, pgmap); > >> if (unlikely(!pgmap)) { > >> @@ -2252,16 +2265,25 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, > >> ret = 0; > >> break; > >> } > >> - SetPageReferenced(page); > >> - pages[*nr] = page; > >> - if (unlikely(!try_grab_page(page, flags))) { > >> - undo_dev_pagemap(nr, nr_start, flags, pages); > >> + > >> + head = compound_head(page); > >> + /* @end is assumed to be limited at most one compound page */ > >> + if (PageHead(head)) > >> + next = end; > >> + refs = record_subpages(page, addr, next, pages + *nr); > >> + > >> + SetPageReferenced(head); > >> + if (unlikely(!try_grab_compound_head(head, refs, flags))) { > >> + if (PageHead(head)) > >> + ClearPageReferenced(head); > >> + else > >> + undo_dev_pagemap(nr, nr_start, flags, pages); > >> ret = 0; > >> break; > > > > Why is this special cased for devmap? > > > > Shouldn't everything processing pud/pmds/etc use the same basic loop > > that is similar in idea to the 'for_each_compound_head' scheme in > > unpin_user_pages_dirty_lock()? > > > > Doesn't that work for all the special page type cases here? > > We are iterating over PFNs to create an array of base pages (regardless of page table > type), rather than iterating over an array of pages to work on. That is part of it, yes, but the slow bit here is to minimally find the head pages and do the atomics on them, much like the unpin_user_pages_dirty_lock() I would think this should be designed similar to how things work on the unpin side. Sweep the page tables to find a proper start/end - eg even if a compound is spread across multiple pte/pmd/pud/etc we should find a linear range of starting PFN (ie starting page*) and npages across as much of the page tables as we can manage. This is the same as where things end up in the unpin case where all the contiguous PFNs are grouped togeher into a range. Then 'assign' that range to the output array which requires walking over each compount_head in the range and pinning it, then writing out the tail pages to the output struct page array. And this approach should apply universally no matter what is under the pte's - ie huge pages, THPs and devmaps should all be treated the same way. Currently each case is different, like above which is unique to device_huge. The more we can process groups of pages in bulks the faster the whole thing will be. Jason Given that all these gup > functions already give you the boundary (end of pmd or end of pud, etc) then we just need > to grab the ref to pgmap and head page and save the tails. But sadly we need to handle the > base page case which is why there's this outer loop exists sadly. If it was just head > pages we wouldn't need the outer loop (and hence no for_each_compound_head, like the > hugetlb variant). 
> > But maybe I am being dense and you just mean to replace the outer loop with > for_each_compound_range(). I am a little stuck on the part that I anyways need to record > back the tail pages when iterating over the (single) head page. So I feel that it wouldn't > bring that much improvement, unless I missed your point. > > Joao >
On 8/30/21 2:07 PM, Jason Gunthorpe wrote: > On Fri, Aug 27, 2021 at 07:34:54PM +0100, Joao Martins wrote: >> On 8/27/21 5:25 PM, Jason Gunthorpe wrote: >>> On Fri, Aug 27, 2021 at 03:58:13PM +0100, Joao Martins wrote: >>> >>>> #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) >>>> static int __gup_device_huge(unsigned long pfn, unsigned long addr, >>>> unsigned long end, unsigned int flags, >>>> struct page **pages, int *nr) >>>> { >>>> - int nr_start = *nr; >>>> + int refs, nr_start = *nr; >>>> struct dev_pagemap *pgmap = NULL; >>>> int ret = 1; >>>> >>>> do { >>>> - struct page *page = pfn_to_page(pfn); >>>> + struct page *head, *page = pfn_to_page(pfn); >>>> + unsigned long next = addr + PAGE_SIZE; >>>> >>>> pgmap = get_dev_pagemap(pfn, pgmap); >>>> if (unlikely(!pgmap)) { >>>> @@ -2252,16 +2265,25 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, >>>> ret = 0; >>>> break; >>>> } >>>> - SetPageReferenced(page); >>>> - pages[*nr] = page; >>>> - if (unlikely(!try_grab_page(page, flags))) { >>>> - undo_dev_pagemap(nr, nr_start, flags, pages); >>>> + >>>> + head = compound_head(page); >>>> + /* @end is assumed to be limited at most one compound page */ >>>> + if (PageHead(head)) >>>> + next = end; >>>> + refs = record_subpages(page, addr, next, pages + *nr); >>>> + >>>> + SetPageReferenced(head); >>>> + if (unlikely(!try_grab_compound_head(head, refs, flags))) { >>>> + if (PageHead(head)) >>>> + ClearPageReferenced(head); >>>> + else >>>> + undo_dev_pagemap(nr, nr_start, flags, pages); >>>> ret = 0; >>>> break; >>> >>> Why is this special cased for devmap? >>> >>> Shouldn't everything processing pud/pmds/etc use the same basic loop >>> that is similar in idea to the 'for_each_compound_head' scheme in >>> unpin_user_pages_dirty_lock()? >>> >>> Doesn't that work for all the special page type cases here? >> >> We are iterating over PFNs to create an array of base pages (regardless of page table >> type), rather than iterating over an array of pages to work on. > > That is part of it, yes, but the slow bit here is to minimally find > the head pages and do the atomics on them, much like the > unpin_user_pages_dirty_lock() > > I would think this should be designed similar to how things work on > the unpin side. > I don't think it's the same thing. The bit you say 'minimally find the head pages' carries a visible overhead in unpin_user_pages() as we are checking each of the pages belongs to the same head page -- because you can pass an arbritary set of pages. This does have a cost which is not in gup-fast right now AIUI. Whereas in our gup-fast 'handler' you already know that you are processing a contiguous chunk of pages. If anything, we are closer to unpin_user_page_range*() than unpin_user_pages(). > Sweep the page tables to find a proper start/end - eg even if a > compound is spread across multiple pte/pmd/pud/etc we should find a > linear range of starting PFN (ie starting page*) and npages across as > much of the page tables as we can manage. This is the same as where > things end up in the unpin case where all the contiguous PFNs are > grouped togeher into a range. > > Then 'assign' that range to the output array which requires walking > over each compount_head in the range and pinning it, then writing out > the tail pages to the output struct page array. > > And this approach should apply universally no matter what is under the > pte's - ie huge pages, THPs and devmaps should all be treated the same > way. 
> Currently each case is different, like above which is unique to
> device_huge.
>
Only devmap gup-fast is different IIUC.

Switching to similar iteration logic to unpin would look something like
this (still untested):

	for_each_compound_range(index, &page, npages, head, refs) {
		pgmap = get_dev_pagemap(pfn + *nr, pgmap);
		if (unlikely(!pgmap)) {
			undo_dev_pagemap(nr, nr_start, flags, pages);
			ret = 0;
			break;
		}

		SetPageReferenced(head);
		if (unlikely(!try_grab_compound_head(head, refs, flags))) {
			if (PageHead(head))
				ClearPageReferenced(head);
			else
				undo_dev_pagemap(nr, nr_start, flags, pages);
			ret = 0;
			break;
		}

		record_subpages(page + *nr, addr,
				addr + (refs << PAGE_SHIFT), pages + *nr);
		*(nr) += refs;
		addr += (refs << PAGE_SHIFT);
	}

But it looks to be a bit more complex and does not really align with the
rest of gup-fast.

All in all, I am dealing with the fact that 1) devmap pmds/puds may not be
represented with compound pages and 2) we temporarily grab a dev_pagemap
reference prior to pinning the page. Those two items are what make this
different from THPs/HugeTLB (which do share the same logic). And that is
what led me to *slightly* improve gup_device_huge().
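For reference, the range iterator used in the sketch above is the one behind unpin_user_page_range_dirty_lock(); a paraphrase (not verbatim) of the mm/gup.c helper of that time:

	/*
	 * Given a physically contiguous range starting at *list + i, report
	 * the compound head that page falls in and how many pages of the
	 * range are covered by that same compound page.
	 */
	static inline void compound_range_next(unsigned long i, unsigned long npages,
					       struct page **list, struct page **head,
					       unsigned int *ntails)
	{
		struct page *next, *page;
		unsigned int nr = 1;

		if (i >= npages)
			return;

		next = *list + i;
		page = compound_head(next);
		if (PageCompound(page) && compound_order(page) >= 1)
			nr = min_t(unsigned int,
				   page + compound_nr(page) - next, npages - i);

		*head = page;
		*ntails = nr;
	}

	#define for_each_compound_range(__i, __list, __npages, __head, __ntails) \
		for (__i = 0, \
		     compound_range_next(__i, __npages, __list, &(__head), &(__ntails)); \
		     __i < __npages; __i += __ntails, \
		     compound_range_next(__i, __npages, __list, &(__head), &(__ntails)))

Unlike for_each_compound_head() it never has to compare heads page by page, because the caller guarantees the range is physically contiguous.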
On Tue, Aug 31, 2021 at 01:34:04PM +0100, Joao Martins wrote: > On 8/30/21 2:07 PM, Jason Gunthorpe wrote: > > On Fri, Aug 27, 2021 at 07:34:54PM +0100, Joao Martins wrote: > >> On 8/27/21 5:25 PM, Jason Gunthorpe wrote: > >>> On Fri, Aug 27, 2021 at 03:58:13PM +0100, Joao Martins wrote: > >>> > >>>> #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) > >>>> static int __gup_device_huge(unsigned long pfn, unsigned long addr, > >>>> unsigned long end, unsigned int flags, > >>>> struct page **pages, int *nr) > >>>> { > >>>> - int nr_start = *nr; > >>>> + int refs, nr_start = *nr; > >>>> struct dev_pagemap *pgmap = NULL; > >>>> int ret = 1; > >>>> > >>>> do { > >>>> - struct page *page = pfn_to_page(pfn); > >>>> + struct page *head, *page = pfn_to_page(pfn); > >>>> + unsigned long next = addr + PAGE_SIZE; > >>>> > >>>> pgmap = get_dev_pagemap(pfn, pgmap); > >>>> if (unlikely(!pgmap)) { > >>>> @@ -2252,16 +2265,25 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, > >>>> ret = 0; > >>>> break; > >>>> } > >>>> - SetPageReferenced(page); > >>>> - pages[*nr] = page; > >>>> - if (unlikely(!try_grab_page(page, flags))) { > >>>> - undo_dev_pagemap(nr, nr_start, flags, pages); > >>>> + > >>>> + head = compound_head(page); > >>>> + /* @end is assumed to be limited at most one compound page */ > >>>> + if (PageHead(head)) > >>>> + next = end; > >>>> + refs = record_subpages(page, addr, next, pages + *nr); > >>>> + > >>>> + SetPageReferenced(head); > >>>> + if (unlikely(!try_grab_compound_head(head, refs, flags))) { > >>>> + if (PageHead(head)) > >>>> + ClearPageReferenced(head); > >>>> + else > >>>> + undo_dev_pagemap(nr, nr_start, flags, pages); > >>>> ret = 0; > >>>> break; > >>> > >>> Why is this special cased for devmap? > >>> > >>> Shouldn't everything processing pud/pmds/etc use the same basic loop > >>> that is similar in idea to the 'for_each_compound_head' scheme in > >>> unpin_user_pages_dirty_lock()? > >>> > >>> Doesn't that work for all the special page type cases here? > >> > >> We are iterating over PFNs to create an array of base pages (regardless of page table > >> type), rather than iterating over an array of pages to work on. > > > > That is part of it, yes, but the slow bit here is to minimally find > > the head pages and do the atomics on them, much like the > > unpin_user_pages_dirty_lock() > > > > I would think this should be designed similar to how things work on > > the unpin side. > > > I don't think it's the same thing. The bit you say 'minimally find the > head pages' carries a visible overhead in unpin_user_pages() as we are > checking each of the pages belongs to the same head page -- because you > can pass an arbritary set of pages. This does have a cost which is not > in gup-fast right now AIUI. Whereas in our gup-fast 'handler' you > already know that you are processing a contiguous chunk of pages. > If anything, we are closer to unpin_user_page_range*() > than unpin_user_pages(). Yes, that is what I mean, it is very similar to the range case as we don't even know that a single compound spans a pud/pmd. So you end up doing the same loop to find the compound boundaries. Under GUP slow we can also aggregate multiple page table entires, eg a split huge page could be procesed as a single 2M range operation even if it is broken to 4K PTEs. 
>
> Switching to similar iteration logic to unpin would look something like
> this (still untested):
>
> 	for_each_compound_range(index, &page, npages, head, refs) {
> 		pgmap = get_dev_pagemap(pfn + *nr, pgmap);

I recall talking to DanW about this and we agreed it was unnecessary
here to hold the pgmap and should be deleted.

Jason
On 8/31/21 6:05 PM, Jason Gunthorpe wrote: > On Tue, Aug 31, 2021 at 01:34:04PM +0100, Joao Martins wrote: >> On 8/30/21 2:07 PM, Jason Gunthorpe wrote: >>> On Fri, Aug 27, 2021 at 07:34:54PM +0100, Joao Martins wrote: >>>> On 8/27/21 5:25 PM, Jason Gunthorpe wrote: >>>>> On Fri, Aug 27, 2021 at 03:58:13PM +0100, Joao Martins wrote: >>>>> >>>>>> #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) >>>>>> static int __gup_device_huge(unsigned long pfn, unsigned long addr, >>>>>> unsigned long end, unsigned int flags, >>>>>> struct page **pages, int *nr) >>>>>> { >>>>>> - int nr_start = *nr; >>>>>> + int refs, nr_start = *nr; >>>>>> struct dev_pagemap *pgmap = NULL; >>>>>> int ret = 1; >>>>>> >>>>>> do { >>>>>> - struct page *page = pfn_to_page(pfn); >>>>>> + struct page *head, *page = pfn_to_page(pfn); >>>>>> + unsigned long next = addr + PAGE_SIZE; >>>>>> >>>>>> pgmap = get_dev_pagemap(pfn, pgmap); >>>>>> if (unlikely(!pgmap)) { >>>>>> @@ -2252,16 +2265,25 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, >>>>>> ret = 0; >>>>>> break; >>>>>> } >>>>>> - SetPageReferenced(page); >>>>>> - pages[*nr] = page; >>>>>> - if (unlikely(!try_grab_page(page, flags))) { >>>>>> - undo_dev_pagemap(nr, nr_start, flags, pages); >>>>>> + >>>>>> + head = compound_head(page); >>>>>> + /* @end is assumed to be limited at most one compound page */ >>>>>> + if (PageHead(head)) >>>>>> + next = end; >>>>>> + refs = record_subpages(page, addr, next, pages + *nr); >>>>>> + >>>>>> + SetPageReferenced(head); >>>>>> + if (unlikely(!try_grab_compound_head(head, refs, flags))) { >>>>>> + if (PageHead(head)) >>>>>> + ClearPageReferenced(head); >>>>>> + else >>>>>> + undo_dev_pagemap(nr, nr_start, flags, pages); >>>>>> ret = 0; >>>>>> break; >>>>> >>>>> Why is this special cased for devmap? >>>>> >>>>> Shouldn't everything processing pud/pmds/etc use the same basic loop >>>>> that is similar in idea to the 'for_each_compound_head' scheme in >>>>> unpin_user_pages_dirty_lock()? >>>>> >>>>> Doesn't that work for all the special page type cases here? >>>> >>>> We are iterating over PFNs to create an array of base pages (regardless of page table >>>> type), rather than iterating over an array of pages to work on. >>> >>> That is part of it, yes, but the slow bit here is to minimally find >>> the head pages and do the atomics on them, much like the >>> unpin_user_pages_dirty_lock() >>> >>> I would think this should be designed similar to how things work on >>> the unpin side. >>> >> I don't think it's the same thing. The bit you say 'minimally find the >> head pages' carries a visible overhead in unpin_user_pages() as we are >> checking each of the pages belongs to the same head page -- because you >> can pass an arbritary set of pages. This does have a cost which is not >> in gup-fast right now AIUI. Whereas in our gup-fast 'handler' you >> already know that you are processing a contiguous chunk of pages. >> If anything, we are closer to unpin_user_page_range*() >> than unpin_user_pages(). > > Yes, that is what I mean, it is very similar to the range case as we > don't even know that a single compound spans a pud/pmd. So you end up > doing the same loop to find the compound boundaries. > > Under GUP slow we can also aggregate multiple page table entires, eg a > split huge page could be procesed as a single 2M range operation even > if it is broken to 4K PTEs. 
/me nods

FWIW, I have a follow-up patch pursuing a similar optimization (to fix the
gup-slow case) that I need to put in better shape -- I probably won't wait
until this series is done, contrary to what the cover letter says.

>> Switching to similar iteration logic to unpin would look something like
>> this (still untested):
>>
>> 	for_each_compound_range(index, &page, npages, head, refs) {
>> 		pgmap = get_dev_pagemap(pfn + *nr, pgmap);
>
> I recall talking to DanW about this and we agreed it was unnecessary
> here to hold the pgmap and should be deleted.

Yeap, I remember that conversation[0]. It was a long time ago, and I am
not sure what progress was made there since the last posting? Dan, any
thoughts there?

[0] https://lore.kernel.org/linux-mm/161604050866.1463742.7759521510383551055.stgit@dwillia2-desk3.amr.corp.intel.com/

So ... if pgmap accounting was removed from gup-fast then this patch would
be a lot simpler and we could perhaps just fall back to the regular
hugepage case (THP, HugeTLB), like your suggestion at the top. See the
ballpark of changes below the scissors mark at the end.

So far my options seem to be: 1) this patch, which leverages the existing
iteration logic, or 2) switching to for_each_compound_range() -- see my
previous reply, or 3) waiting for Dan to remove @pgmap accounting in
gup-fast and using something similar to what is below the scissors mark.

What do you think would be the best course of action?

--->8---

++static int __gup_device_compound(unsigned long addr, unsigned long pfn,
++				 unsigned long mask)
++{
++	pfn += ((addr & ~mask) >> PAGE_SHIFT);
++
++	return PageCompound(pfn_to_page(pfn));
++}
++
  static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
  				 unsigned long end, unsigned int flags,
  				 struct page **pages, int *nr)
@@@ -2428,8 -2428,8 +2433,10 @@@ static int gup_huge_pmd(pmd_t orig, pmd
  	if (pmd_devmap(orig)) {
  		if (unlikely(flags & FOLL_LONGTERM))
  			return 0;
--		return __gup_device_huge_pmd(orig, pmdp, addr, end, flags,
--					     pages, nr);
++
++		if (!__gup_device_compound(addr, pmd_pfn(orig), PMD_MASK))
++			return __gup_device_huge_pmd(orig, pmdp, addr, end,
++						     flags, pages, nr);
  	}
  	page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
@@@ -2462,8 -2462,8 +2469,10 @@@ static int gup_huge_pud(pud_t orig, pud
  	if (pud_devmap(orig)) {
  		if (unlikely(flags & FOLL_LONGTERM))
  			return 0;
--		return __gup_device_huge_pud(orig, pudp, addr, end, flags,
--					     pages, nr);
++
++		if (!__gup_device_compound(addr, pud_pfn(orig), PUD_MASK))
++			return __gup_device_huge_pud(orig, pudp, addr, end,
++						     flags, pages, nr);
  	}
  	page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
On Thu, Sep 23, 2021 at 05:51:04PM +0100, Joao Martins wrote: > On 8/31/21 6:05 PM, Jason Gunthorpe wrote: > >> Switching to similar iteration logic to unpin would look something like > >> this (still untested): > >> > >> for_each_compound_range(index, &page, npages, head, refs) { > >> pgmap = get_dev_pagemap(pfn + *nr, pgmap); > > > > I recall talking to DanW about this and we agreed it was unnecessary > > here to hold the pgmap and should be deleted. > > Yeap, I remember that conversation[0]. It was a long time ago, and I am > not sure what progress was made there since the last posting? Dan, any > thoughts there? > > [0] > https://lore.kernel.org/linux-mm/161604050866.1463742.7759521510383551055.stgit@dwillia2-desk3.amr.corp.intel.com/ I would really like to see that finished :\ > So ... if pgmap accounting was removed from gup-fast then this patch > would be a lot simpler and we could perhaps just fallback to the regular > hugepage case (THP, HugeTLB) like your suggestion at the top. See at the > end below scissors mark as the ballpark of changes. > > So far my options seem to be: 1) this patch which leverages the existing > iteration logic or 2) switching to for_each_compound_range() -- see my previous > reply 3) waiting for Dan to remove @pgmap accounting in gup-fast and use > something similar to below scissors mark. > > What do you think would be the best course of action? I still think the basic algorithm should be to accumulate physicaly contiguous addresses when walking the page table and then flush them back to struct pages once we can't accumulate any more. That works for both the walkers and all the page types? If the get_dev_pagemap has to remain then it just means we have to flush before changing pagemap pointers Jason
On 9/28/21 19:01, Jason Gunthorpe wrote: > On Thu, Sep 23, 2021 at 05:51:04PM +0100, Joao Martins wrote: >> So ... if pgmap accounting was removed from gup-fast then this patch >> would be a lot simpler and we could perhaps just fallback to the regular >> hugepage case (THP, HugeTLB) like your suggestion at the top. See at the >> end below scissors mark as the ballpark of changes. >> >> So far my options seem to be: 1) this patch which leverages the existing >> iteration logic or 2) switching to for_each_compound_range() -- see my previous >> reply 3) waiting for Dan to remove @pgmap accounting in gup-fast and use >> something similar to below scissors mark. >> >> What do you think would be the best course of action? > > I still think the basic algorithm should be to accumulate physicaly > contiguous addresses when walking the page table and then flush them > back to struct pages once we can't accumulate any more. > > That works for both the walkers and all the page types? > The logic already handles all page types -- I was trying to avoid the extra complexity in regular hugetlb/THP path by not merging the handling of the oddball case that is devmap (or fundamentally devmap non-compound case in the future). In the context of this patch I am think your suggestion that you wrote above to ... instead of changing __gup_device_huge() we uplevel/merge it all in gup_huge_{pud,pmd}() to cover the devmap? static int __gup_huge_range(orig_head, ...) { ... page = orig_head + ((addr & ~mask) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); head = try_grab_compound_head(orig_head, refs, flags); if (!head) return 0; if (unlikely(pud_val(orig) != pud_val(*pudp))) { put_compound_head(head, refs, flags); return 0; } SetPageReferenced(head); return 1; } static int gup_huge_pmd(...) { ... for_each_compound_range(index, page, npages, head, refs) { if (pud_devmap(orig)) pgmap = get_dev_pagemap(pmd_pfn(orig), pgmap); if (!__gup_huge_page_range(pmd_page(orig), refs)) { undo_dev_pagemap(...); return 0; } } put_dev_pagemap(pgmap); } But all this gup_huge_{pmd,pud}() rework is all just for the trouble of trying to merge the basepage-on-PMD/PUD case of devmap. It feels more complex (and affecting other page types) compared to leave the devmap odity siloed like option 1. If the pgmap refcount wasn't there and there was no users of basepages-on-PMD/PUD but just compound pages on PMDs/PUDs ... then we would be talking about code removal rather than added complexity. But I don't know how realistic that is for other devmap users (beside device-dax). > If the get_dev_pagemap has to remain then it just means we have to > flush before changing pagemap pointers Right -- I don't think we should need it as that discussion on the other thread goes. OTOH, using @pgmap might be useful to unblock gup-fast FOLL_LONGTERM for certain devmap types[0] (like MEMORY_DEVICE_GENERIC [device-dax] can support it but not MEMORY_DEVICE_FSDAX [fsdax]). [0] https://lore.kernel.org/linux-mm/6a18179e-65f7-367d-89a9-d5162f10fef0@oracle.com/
On Wed, Sep 29, 2021 at 12:50:15PM +0100, Joao Martins wrote: > > If the get_dev_pagemap has to remain then it just means we have to > > flush before changing pagemap pointers > Right -- I don't think we should need it as that discussion on the other > thread goes. > > OTOH, using @pgmap might be useful to unblock gup-fast FOLL_LONGTERM > for certain devmap types[0] (like MEMORY_DEVICE_GENERIC [device-dax] > can support it but not MEMORY_DEVICE_FSDAX [fsdax]). When looking at Logan's patches I think it is pretty clear to me that page->pgmap must never be a dangling pointer if the caller has a legitimate refcount on the page. For instance the migrate and stuff all blindly calls is_device_private_page() on the struct page expecting a valid page->pgmap. This also looks like it is happening, ie void __put_page(struct page *page) { if (is_zone_device_page(page)) { put_dev_pagemap(page->pgmap); Is indeed putting the pgmap ref back when the page becomes ungettable. This properly happens when the page refcount goes to zero and so it should fully interlock with __page_cache_add_speculative(): if (unlikely(!page_ref_add_unless(page, count, 0))) { Thus, in gup.c, if we succeed at try_grab_compound_head() then page->pgmap is a stable pointer with a valid refcount. So, all the external pgmap stuff in gup.c is completely pointless. try_grab_compound_head() provides us with an equivalent protection at lower cost. Remember gup.c doesn't deref the pgmap at all. Dan/Alistair/Felix do you see any hole in that argument?? So lets just delete it! Jason
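For reference, the gup-fast helper that this argument leans on looked roughly like the following at the time (paraphrased from mm/gup.c, not verbatim): the reference is taken with an "add unless zero" operation, so it can only succeed while the page -- and therefore, per the reasoning above, page->pgmap -- is still live.

	static inline struct page *try_get_compound_head(struct page *page, int refs)
	{
		struct page *head = compound_head(page);

		if (WARN_ON_ONCE(page_ref_count(head) < 0))
			return NULL;
		/* ultimately does page_ref_add_unless(head, refs, 0) */
		if (unlikely(!page_cache_add_speculative(head, refs)))
			return NULL;
		return head;
	}

try_grab_compound_head() wraps this for both FOLL_GET and FOLL_PIN, so a successful grab in gup-fast implies the speculative reference above succeeded.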
On Thursday, 30 September 2021 5:34:05 AM AEST Jason Gunthorpe wrote: > On Wed, Sep 29, 2021 at 12:50:15PM +0100, Joao Martins wrote: > > > > If the get_dev_pagemap has to remain then it just means we have to > > > flush before changing pagemap pointers > > Right -- I don't think we should need it as that discussion on the other > > thread goes. > > > > OTOH, using @pgmap might be useful to unblock gup-fast FOLL_LONGTERM > > for certain devmap types[0] (like MEMORY_DEVICE_GENERIC [device-dax] > > can support it but not MEMORY_DEVICE_FSDAX [fsdax]). > > When looking at Logan's patches I think it is pretty clear to me that > page->pgmap must never be a dangling pointer if the caller has a > legitimate refcount on the page. > > For instance the migrate and stuff all blindly calls > is_device_private_page() on the struct page expecting a valid > page->pgmap. > > This also looks like it is happening, ie > > void __put_page(struct page *page) > { > if (is_zone_device_page(page)) { > put_dev_pagemap(page->pgmap); > > Is indeed putting the pgmap ref back when the page becomes ungettable. > > This properly happens when the page refcount goes to zero and so it > should fully interlock with __page_cache_add_speculative(): > > if (unlikely(!page_ref_add_unless(page, count, 0))) { > > Thus, in gup.c, if we succeed at try_grab_compound_head() then > page->pgmap is a stable pointer with a valid refcount. > > So, all the external pgmap stuff in gup.c is completely pointless. > try_grab_compound_head() provides us with an equivalent protection at > lower cost. Remember gup.c doesn't deref the pgmap at all. > > Dan/Alistair/Felix do you see any hole in that argument?? As background note that device pages are currently considered free when refcount == 1 but the pgmap reference is dropped when the refcount transitions 1->0. The final pgmap reference is typically dropped when a driver calls memunmap_pages() and put_page() drops the last page reference: void memunmap_pages(struct dev_pagemap *pgmap) { unsigned long pfn; int i; dev_pagemap_kill(pgmap); for (i = 0; i < pgmap->nr_range; i++) for_each_device_pfn(pfn, pgmap, i) put_page(pfn_to_page(pfn)); dev_pagemap_cleanup(pgmap); If there are still pgmap references dev_pagemap_cleanup(pgmap) will block until the final reference is dropped. So I think your argument holds at least for DEVICE_PRIVATE and DEVICE_GENERIC. DEVICE_FS_DAX defines it's own pagemap cleanup but I can't see why the same argument wouldn't hold there - if a page has a valid refcount it must have a reference on the pagemap too. - Alistair > So lets just delete it! > > Jason >
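For completeness, the generic teardown path being described is roughly the following (paraphrased from mm/memremap.c of that era, not verbatim); cleanup cannot finish until the percpu ref that every live device page holds has drained:

	static void dev_pagemap_kill(struct dev_pagemap *pgmap)
	{
		if (pgmap->ops && pgmap->ops->kill)
			pgmap->ops->kill(pgmap);
		else
			percpu_ref_kill(pgmap->ref);
	}

	static void dev_pagemap_cleanup(struct dev_pagemap *pgmap)
	{
		if (pgmap->ops && pgmap->ops->cleanup) {
			pgmap->ops->cleanup(pgmap);
		} else {
			wait_for_completion(&pgmap->done);
			percpu_ref_exit(pgmap->ref);
		}
		/* allow the same pgmap to be re-enabled later */
		if (pgmap->ref == &pgmap->internal_ref)
			pgmap->ref = NULL;
	}

The fsdax/pmem case overrides ->kill/->cleanup, which is what the discussion below turns to.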
On 9/30/21 04:01, Alistair Popple wrote: > On Thursday, 30 September 2021 5:34:05 AM AEST Jason Gunthorpe wrote: >> On Wed, Sep 29, 2021 at 12:50:15PM +0100, Joao Martins wrote: >> >>>> If the get_dev_pagemap has to remain then it just means we have to >>>> flush before changing pagemap pointers >>> Right -- I don't think we should need it as that discussion on the other >>> thread goes. >>> >>> OTOH, using @pgmap might be useful to unblock gup-fast FOLL_LONGTERM >>> for certain devmap types[0] (like MEMORY_DEVICE_GENERIC [device-dax] >>> can support it but not MEMORY_DEVICE_FSDAX [fsdax]). >> >> When looking at Logan's patches I think it is pretty clear to me that >> page->pgmap must never be a dangling pointer if the caller has a >> legitimate refcount on the page. >> >> For instance the migrate and stuff all blindly calls >> is_device_private_page() on the struct page expecting a valid >> page->pgmap. >> >> This also looks like it is happening, ie >> >> void __put_page(struct page *page) >> { >> if (is_zone_device_page(page)) { >> put_dev_pagemap(page->pgmap); >> >> Is indeed putting the pgmap ref back when the page becomes ungettable. >> >> This properly happens when the page refcount goes to zero and so it >> should fully interlock with __page_cache_add_speculative(): >> >> if (unlikely(!page_ref_add_unless(page, count, 0))) { >> >> Thus, in gup.c, if we succeed at try_grab_compound_head() then >> page->pgmap is a stable pointer with a valid refcount. >> >> So, all the external pgmap stuff in gup.c is completely pointless. >> try_grab_compound_head() provides us with an equivalent protection at >> lower cost. Remember gup.c doesn't deref the pgmap at all. >> >> Dan/Alistair/Felix do you see any hole in that argument?? > > As background note that device pages are currently considered free when > refcount == 1 but the pgmap reference is dropped when the refcount transitions > 1->0. The final pgmap reference is typically dropped when a driver calls > memunmap_pages() and put_page() drops the last page reference: > > void memunmap_pages(struct dev_pagemap *pgmap) > { > unsigned long pfn; > int i; > > dev_pagemap_kill(pgmap); > for (i = 0; i < pgmap->nr_range; i++) > for_each_device_pfn(pfn, pgmap, i) > put_page(pfn_to_page(pfn)); > dev_pagemap_cleanup(pgmap); > > If there are still pgmap references dev_pagemap_cleanup(pgmap) will block until > the final reference is dropped. So I think your argument holds at least for > DEVICE_PRIVATE and DEVICE_GENERIC. DEVICE_FS_DAX defines it's own pagemap > cleanup but I can't see why the same argument wouldn't hold there - if a page > has a valid refcount it must have a reference on the pagemap too. IIUC Dan's reasoning was that fsdax wasn't able to deal with surprise removal [1] so his patches were to ensure fsdax (or the pmem block device) poisons/kills the pages as a way to notify filesystem/dm that the page was to be kept unmapped: [1] https://lore.kernel.org/linux-mm/161604050314.1463742.14151665140035795571.stgit@dwillia2-desk3.amr.corp.intel.com/ But if fsdax doesn't wait for all the pgmap references[*] on its pagemap cleanup callback then what's the pgmap ref in __gup_device_huge() pairs/protects us up against that is specific to fsdax? I am not sure I follow how both the fsdax specific issue ties in with this pgmap ref being there above. Joao [*] or at least fsdax_pagemap_ops doesn't suggest the contrary ... compared to dev_pagemap_{kill,cleanup}
On Thu, Sep 30, 2021 at 06:54:05PM +0100, Joao Martins wrote: > On 9/30/21 04:01, Alistair Popple wrote: > > On Thursday, 30 September 2021 5:34:05 AM AEST Jason Gunthorpe wrote: > >> On Wed, Sep 29, 2021 at 12:50:15PM +0100, Joao Martins wrote: > >> > >>>> If the get_dev_pagemap has to remain then it just means we have to > >>>> flush before changing pagemap pointers > >>> Right -- I don't think we should need it as that discussion on the other > >>> thread goes. > >>> > >>> OTOH, using @pgmap might be useful to unblock gup-fast FOLL_LONGTERM > >>> for certain devmap types[0] (like MEMORY_DEVICE_GENERIC [device-dax] > >>> can support it but not MEMORY_DEVICE_FSDAX [fsdax]). > >> > >> When looking at Logan's patches I think it is pretty clear to me that > >> page->pgmap must never be a dangling pointer if the caller has a > >> legitimate refcount on the page. > >> > >> For instance the migrate and stuff all blindly calls > >> is_device_private_page() on the struct page expecting a valid > >> page->pgmap. > >> > >> This also looks like it is happening, ie > >> > >> void __put_page(struct page *page) > >> { > >> if (is_zone_device_page(page)) { > >> put_dev_pagemap(page->pgmap); > >> > >> Is indeed putting the pgmap ref back when the page becomes ungettable. > >> > >> This properly happens when the page refcount goes to zero and so it > >> should fully interlock with __page_cache_add_speculative(): > >> > >> if (unlikely(!page_ref_add_unless(page, count, 0))) { > >> > >> Thus, in gup.c, if we succeed at try_grab_compound_head() then > >> page->pgmap is a stable pointer with a valid refcount. > >> > >> So, all the external pgmap stuff in gup.c is completely pointless. > >> try_grab_compound_head() provides us with an equivalent protection at > >> lower cost. Remember gup.c doesn't deref the pgmap at all. > >> > >> Dan/Alistair/Felix do you see any hole in that argument?? > > > > As background note that device pages are currently considered free when > > refcount == 1 but the pgmap reference is dropped when the refcount transitions > > 1->0. The final pgmap reference is typically dropped when a driver calls > > memunmap_pages() and put_page() drops the last page reference: > > > > void memunmap_pages(struct dev_pagemap *pgmap) > > { > > unsigned long pfn; > > int i; > > > > dev_pagemap_kill(pgmap); > > for (i = 0; i < pgmap->nr_range; i++) > > for_each_device_pfn(pfn, pgmap, i) > > put_page(pfn_to_page(pfn)); > > dev_pagemap_cleanup(pgmap); > > > > If there are still pgmap references dev_pagemap_cleanup(pgmap) will block until > > the final reference is dropped. So I think your argument holds at least for > > DEVICE_PRIVATE and DEVICE_GENERIC. DEVICE_FS_DAX defines it's own pagemap > > cleanup but I can't see why the same argument wouldn't hold there - if a page > > has a valid refcount it must have a reference on the pagemap too. > > IIUC Dan's reasoning was that fsdax wasn't able to deal with > surprise removal [1] so his patches were to ensure fsdax (or the > pmem block device) poisons/kills the pages as a way to notify > filesystem/dm that the page was to be kept unmapped: Sure, but that has nothing to do with GUP, that is between the filesytem and fsdax > But if fsdax doesn't wait for all the pgmap references[*] on its > pagemap cleanup callback then what's the pgmap ref in > __gup_device_huge() pairs/protects us up against that is specific to > fsdax? 
It does wait for refs. It sets the pgmap.ref to:

	pmem->pgmap.ref = &q->q_usage_counter;

And that ref is incr'd by the struct page lifetime - the decrement is in
__put_page() above.

fsdax_pagemap_ops does pmem_pagemap_kill(), which calls
blk_freeze_queue_start(), which does percpu_ref_kill(). Then
pmem_pagemap_cleanup() eventually does blk_mq_freeze_queue_wait(), which
will sleep until the percpu ref reaches zero.

In other words, fsdax cannot pass cleanup while a struct page exists with
a non-zero refcount, which answers Alistair's question about how fsdax's
cleanup works.

Jason
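Roughly, the fsdax ops being referred to (paraphrased from drivers/nvdimm/pmem.c of that era, not verbatim):

	/*
	 * fsdax ties the pgmap percpu ref to the pmem request queue's
	 * q_usage_counter, so pagemap cleanup cannot complete while any
	 * device page still holds a reference.
	 */
	static void pmem_pagemap_kill(struct dev_pagemap *pgmap)
	{
		struct request_queue *q =
			container_of(pgmap->ref, struct request_queue, q_usage_counter);

		blk_freeze_queue_start(q);
	}

	static void pmem_pagemap_cleanup(struct dev_pagemap *pgmap)
	{
		struct request_queue *q =
			container_of(pgmap->ref, struct request_queue, q_usage_counter);

		blk_mq_freeze_queue_wait(q);
	}

	static const struct dev_pagemap_ops fsdax_pagemap_ops = {
		.kill		= pmem_pagemap_kill,
		.cleanup	= pmem_pagemap_cleanup,
	};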
On Fri, Aug 27, 2021 at 03:58:13PM +0100, Joao Martins wrote: > @@ -2252,16 +2265,25 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, > ret = 0; > break; > } > - SetPageReferenced(page); > - pages[*nr] = page; > - if (unlikely(!try_grab_page(page, flags))) { > - undo_dev_pagemap(nr, nr_start, flags, pages); > + > + head = compound_head(page); > + /* @end is assumed to be limited at most one compound page */ > + if (PageHead(head)) > + next = end; > + refs = record_subpages(page, addr, next, pages + *nr); > + > + SetPageReferenced(head); > + if (unlikely(!try_grab_compound_head(head, refs, flags))) { I was thinking about this some more, and this ordering doesn't seem like a good idea. We shouldn't be looking at any part of the struct page without holding the refcount, certainly not the compound_head() The only optimization that might work here is to grab the head, then compute the extent of tail pages and amalgamate them. Holding a ref on the head also secures the tails. Which also means most of what I was suggesting isn't going to work anyhow. Jason
On 10/8/21 12:54, Jason Gunthorpe wrote: > On Fri, Aug 27, 2021 at 03:58:13PM +0100, Joao Martins wrote: >> @@ -2252,16 +2265,25 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, >> ret = 0; >> break; >> } >> - SetPageReferenced(page); >> - pages[*nr] = page; >> - if (unlikely(!try_grab_page(page, flags))) { >> - undo_dev_pagemap(nr, nr_start, flags, pages); >> + >> + head = compound_head(page); >> + /* @end is assumed to be limited at most one compound page */ >> + if (PageHead(head)) >> + next = end; >> + refs = record_subpages(page, addr, next, pages + *nr); >> + >> + SetPageReferenced(head); >> + if (unlikely(!try_grab_compound_head(head, refs, flags))) { > > I was thinking about this some more, and this ordering doesn't seem > like a good idea. We shouldn't be looking at any part of the struct > page without holding the refcount, certainly not the compound_head() > > The only optimization that might work here is to grab the head, then > compute the extent of tail pages and amalgamate them. Holding a ref on > the head also secures the tails. > How about pmd_page(orig) / pud_page(orig) like what the rest of hugetlb/thp checks do? i.e. we would pass pmd_page(orig)/pud_page(orig) to __gup_device_huge() as an added @head argument. While keeping the same structure of counting tail pages between @addr .. @end if we have a head page. Albeit this lingers on whether it's OK to call PageHead() .. The PageHead policy is for any page (PF_ANY) so no hidden calls to compound_head() when testing that page flag. but in the end it accesses struct page flags which is well, still struct page data. We could also check pgmap for a non-zero geometry (which tells us that pmd_page(orig) does represent a head page). And that would save us from looking at struct page data today, but it would introduce problems later whenever we remove the pgmap ref grab in gup_device_huge(). So the only viable might be to do the grab, count tails and fixup-ref like you suggest above, and take the perf hit of one extra atomic op :( It's interesting how THP (in gup_huge_pmd()) unilaterally computes tails assuming pmd_page(orig) is the head page.
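For comparison, the THP path being referenced (mm/gup.c's gup_huge_pmd() of that era, lightly paraphrased): the head is unconditionally pmd_page(orig), the tails are recorded up front, and a single try_grab_compound_head() call takes all the references:

	static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
				unsigned long end, unsigned int flags,
				struct page **pages, int *nr)
	{
		struct page *head, *page;
		int refs;

		if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
			return 0;

		if (pmd_devmap(orig)) {
			if (unlikely(flags & FOLL_LONGTERM))
				return 0;
			return __gup_device_huge_pmd(orig, pmdp, addr, end,
						     flags, pages, nr);
		}

		page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
		refs = record_subpages(page, addr, end, pages + *nr);

		head = try_grab_compound_head(pmd_page(orig), refs, flags);
		if (!head)
			return 0;

		if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
			put_compound_head(head, refs, flags);
			return 0;
		}

		*nr += refs;
		SetPageReferenced(head);
		return 1;
	}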
On Mon, Oct 11, 2021 at 04:53:29PM +0100, Joao Martins wrote: > On 10/8/21 12:54, Jason Gunthorpe wrote: > > The only optimization that might work here is to grab the head, then > > compute the extent of tail pages and amalgamate them. Holding a ref on > > the head also secures the tails. > > How about pmd_page(orig) / pud_page(orig) like what the rest of hugetlb/thp > checks do? i.e. we would pass pmd_page(orig)/pud_page(orig) to __gup_device_huge() > as an added @head argument. While keeping the same structure of counting tail pages > between @addr .. @end if we have a head page. The right logic is what everything else does: page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); head = try_grab_compound_head(pud_page(orig), refs, flags); If you can use this, or not, depends entirely on answering the question of why does __gup_device_huge() exist at all. This I don't fully know: 1) As discussed quite a few times now, the entire get_dev_pagemap stuff looks usless and should just be deleted. If you care about optimizing this I would persue doing that as it will give the biggest single win. 2) It breaks up the PUD/PMD into tail pages and scans them all Why? Can devmap combine multiple compound_head's into the same PTE? Does devmap guarentee that the PUD/PMD points to the head page? (I assume no) But I'm looking at this some more and I see try_get_compound_head() is reading the compound_head with no locking, just READ_ONCE, so that must be OK under GUP. It still seems to me the same generic algorithm should work everywhere, once we get rid of the get_dev_pagemap start_pfn = pud/pmd_pfn() + pud/pmd_page_offset(addr) end_pfn = start_pfn + (end - addr) // fixme if (THP) refs = end_pfn - start_pfn if (devmap) refs = 1 do { page = pfn_to_page(start_pfn) head_page = try_grab_compound_head(page, refs, flags) if (pud/pmd_val() != orig) err npages = 1 << compound_order(head_page) npages = min(npages, end_pfn - start_pfn) for (i = 0, iter = page; i != npages) { *pages++ = iter; mem_map_next(iter, page, i) } if (devmap && npages > 2) grab_compound_head(head_page, npages - 1, flags) start_pfn += npages; } while (start_pfn != end_pfn) Above needs to be cleaned up quite a bit, but is the general idea. There is an further optimization we can put in where we can know that 'page' is still in a currently grab'd compound (eg last_page+1 = page, not past compound_order) and defer the refcount work. > It's interesting how THP (in gup_huge_pmd()) unilaterally computes > tails assuming pmd_page(orig) is the head page. I think this is an integral property of THP, probably not devmap/dax though? Jason
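To make the shape of that loop concrete, here is a hedged, illustrative-only tightening of the devmap leg of the sketch above. The function name (gup_fast_devmap_range) and its parameters are hypothetical; try_grab_compound_head() and put_compound_head() are the real mm/gup.c helpers; the "pte changed underneath us" recheck and the get_dev_pagemap() handling are deliberately omitted, and a contiguous (vmemmap) memmap is assumed so that plain struct page pointer arithmetic works:

	static int gup_fast_devmap_range(unsigned long start_pfn,
					 unsigned long end_pfn,
					 unsigned int flags,
					 struct page **pages, int *nr)
	{
		while (start_pfn < end_pfn) {
			struct page *page = pfn_to_page(start_pfn);
			struct page *head;
			unsigned long npages, i;

			/* One ref first; only then is compound_order() stable. */
			head = try_grab_compound_head(page, 1, flags);
			if (!head)
				return 0;

			/* Pages left in this compound, clamped to the range. */
			npages = compound_nr(head) - (page - head);
			npages = min(npages, end_pfn - start_pfn);

			/* Take the remaining references in one go. */
			if (npages > 1 &&
			    !try_grab_compound_head(head, npages - 1, flags)) {
				put_compound_head(head, 1, flags);
				return 0;
			}

			for (i = 0; i < npages; i++)
				pages[(*nr)++] = page + i;

			start_pfn += npages;
		}
		return 1;
	}

For a non-compound (base page) devmap range this degenerates to one grab per page, while a fully compound range does two refcount operations per compound page, which is the optimization being discussed.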
On 10/13/21 18:41, Jason Gunthorpe wrote: > On Mon, Oct 11, 2021 at 04:53:29PM +0100, Joao Martins wrote: >> On 10/8/21 12:54, Jason Gunthorpe wrote: > >>> The only optimization that might work here is to grab the head, then >>> compute the extent of tail pages and amalgamate them. Holding a ref on >>> the head also secures the tails. >> >> How about pmd_page(orig) / pud_page(orig) like what the rest of hugetlb/thp >> checks do? i.e. we would pass pmd_page(orig)/pud_page(orig) to __gup_device_huge() >> as an added @head argument. While keeping the same structure of counting tail pages >> between @addr .. @end if we have a head page. > > The right logic is what everything else does: > > page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); > refs = record_subpages(page, addr, end, pages + *nr); > head = try_grab_compound_head(pud_page(orig), refs, flags); > > If you can use this, or not, depends entirely on answering the > question of why does __gup_device_huge() exist at all. > So for device-dax it seems to be an untackled oversight[*], probably inherited from when fsdax devmap was introduced. What I don't know is the other devmap users :( [*] it has all the same properties as hugetlbfs AFAIU (after this series) Certainly if any devmap PMD/PUD was represented in a single compound page like THP/hugetlbfs then this patch would be a matter of removing pgmap ref grab (and nuke the __gup_device_huge function existence as I suggested earlier). > This I don't fully know: > > 1) As discussed quite a few times now, the entire get_dev_pagemap > stuff looks usless and should just be deleted. If you care about > optimizing this I would persue doing that as it will give the > biggest single win. > I am not questioning the well-deserved improvement -- but from a pure optimization perspective the get_dev_pagemap() cost is not visible and quite negligeble. It is done once and only once and subsequent calls to get_dev_pagemap with a non-NULL pgmap don't alter the refcount and just return the pgmap object. And the xarray storing the ranges -> pgmap won't be that big ... perhaps maybe 12 pgmaps on a large >1T pmem system depending on your DIMM size. The refcount update of the individual 4K page is what introduces a seriously prohibite cost: I am seeing 10x the cost with DRAM located struct pages (pmem located struct pages is even more ludicrous). > 2) It breaks up the PUD/PMD into tail pages and scans them all > Why? Can devmap combine multiple compound_head's into the same PTE? I am not aware of any other usage of compound pages for devmap struct pages than this series. At least I haven't seen device-dax or fsdax using this. Unless HMM does this stuff, or some sort of devmap page migration? P2PDMA doesn't seem to be (yet?) caught by any of the GUP path at least before Logan's series lands. Or am I misunderstanding things here? > Does devmap guarentee that the PUD/PMD points to the head page? (I > assume no) > For device-dax yes. > But I'm looking at this some more and I see try_get_compound_head() is > reading the compound_head with no locking, just READ_ONCE, so that > must be OK under GUP. > I suppose one other way to get around the double atomic op would be to fail the try_get_compound_head() by comparing the first tail page compound_head() after grabbing the head with @refs. That is instead of comparing against grabbed head page and the passed page argument. 
> It still seems to me the same generic algorithm should work > everywhere, once we get rid of the get_dev_pagemap > > start_pfn = pud/pmd_pfn() + pud/pmd_page_offset(addr) > end_pfn = start_pfn + (end - addr) // fixme > if (THP) > refs = end_pfn - start_pfn > if (devmap) > refs = 1 > > do { > page = pfn_to_page(start_pfn) > head_page = try_grab_compound_head(page, refs, flags) > if (pud/pmd_val() != orig) > err > > npages = 1 << compound_order(head_page) > npages = min(npages, end_pfn - start_pfn) > for (i = 0, iter = page; i != npages) { > *pages++ = iter; > mem_map_next(iter, page, i) > } > > if (devmap && npages > 2) > grab_compound_head(head_page, npages - 1, flags) > start_pfn += npages; > } while (start_pfn != end_pfn) > > Above needs to be cleaned up quite a bit, but is the general idea. > > There is an further optimization we can put in where we can know that > 'page' is still in a currently grab'd compound (eg last_page+1 = page, > not past compound_order) and defer the refcount work. > I was changing __gup_device_huge() with similar to the above, but yeah it follows that algorithm as inside your do { } while() (thanks!). I can turn __gup_device_huge() into another (renamed to like try_grab_pages()) helper and replace the callsites of gup_huge_{pud,pmd} for the THP/hugetlbfs equivalent handling. >> It's interesting how THP (in gup_huge_pmd()) unilaterally computes >> tails assuming pmd_page(orig) is the head page. > > I think this is an integral property of THP, probably not devmap/dax > though? I think the right answer is "depends on the devmap" type. device-dax with PMD/PUDs (i.e. 2m pagesize PMEM or 1G pagesize pmem) works with the same rules as hugetlbfs. fsdax not so much (as you say above) but it would follow up changes to perhaps adhere to similar scheme (not exactly sure how do deal with holes). HMM I am not sure what the rules are there. P2PDMA seems not applicable?
On Wed, Oct 13, 2021 at 08:18:08PM +0100, Joao Martins wrote: > On 10/13/21 18:41, Jason Gunthorpe wrote: > > On Mon, Oct 11, 2021 at 04:53:29PM +0100, Joao Martins wrote: > >> On 10/8/21 12:54, Jason Gunthorpe wrote: > > > >>> The only optimization that might work here is to grab the head, then > >>> compute the extent of tail pages and amalgamate them. Holding a ref on > >>> the head also secures the tails. > >> > >> How about pmd_page(orig) / pud_page(orig) like what the rest of hugetlb/thp > >> checks do? i.e. we would pass pmd_page(orig)/pud_page(orig) to __gup_device_huge() > >> as an added @head argument. While keeping the same structure of counting tail pages > >> between @addr .. @end if we have a head page. > > > > The right logic is what everything else does: > > > > page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); > > refs = record_subpages(page, addr, end, pages + *nr); > > head = try_grab_compound_head(pud_page(orig), refs, flags); > > > > If you can use this, or not, depends entirely on answering the > > question of why does __gup_device_huge() exist at all. > > > So for device-dax it seems to be an untackled oversight[*], probably > inherited from when fsdax devmap was introduced. What I don't know > is the other devmap users :( devmap generic infrastructure waits until all page refcounts go to zero, and it should wait until any fast GUP is serialized as part of the TLB shootdown - otherwise it is leaking access to the memory it controls beyond it's shutdown So, I don't think the different devmap users can break this? > > This I don't fully know: > > > > 1) As discussed quite a few times now, the entire get_dev_pagemap > > stuff looks usless and should just be deleted. If you care about > > optimizing this I would persue doing that as it will give the > > biggest single win. > > I am not questioning the well-deserved improvement -- but from a pure > optimization perspective the get_dev_pagemap() cost is not > visible and quite negligeble. You are doing large enough GUPs then that the expensive xarray seach is small compared to the rest? > > 2) It breaks up the PUD/PMD into tail pages and scans them all > > Why? Can devmap combine multiple compound_head's into the same PTE? > > I am not aware of any other usage of compound pages for devmap struct pages > than this series. At least I haven't seen device-dax or fsdax using > this. Let me ask this question differently, is this assertion OK? --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -808,8 +808,13 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, } entry = pmd_mkhuge(pfn_t_pmd(pfn, prot)); - if (pfn_t_devmap(pfn)) + if (pfn_t_devmap(pfn)) { + struct page *pfn_to_page(pfn); + + WARN_ON(compound_head(page) != page); + WARN_ON(compound_order(page) != PMD_SHIFT); entry = pmd_mkdevmap(entry); + } if (write) { entry = pmd_mkyoung(pmd_mkdirty(entry)); entry = maybe_pmd_mkwrite(entry, vma); (and same for insert_pfn_pud) You said it is OK for device/dax/device.c? And not for fs/dax.c? > Unless HMM does this stuff, or some sort of devmap page migration? P2PDMA > doesn't seem to be (yet?) caught by any of the GUP path at least before > Logan's series lands. Or am I misunderstanding things here? Of the places that call the insert_pfn_pmd/pud call chains I only see device/dax/device.c and fs/dax.c as being linked to devmap. So other devmap users don't use this stuff. 
> I was changing __gup_device_huge() with similar to the above, but yeah
> it follows that algorithm as inside your do { } while() (thanks!). I can
> turn __gup_device_huge() into another (renamed to like try_grab_pages())
> helper and replace the callsites of gup_huge_{pud,pmd} for the THP/hugetlbfs
> equivalent handling.

I suppose it should be some #define because the (pmd_val != orig) logic is
not sharable. But, yes, a general call that the walker should make at any
level to record a pfn -> npages range efficiently.

> I think the right answer is "depends on the devmap" type. device-dax with
> PMD/PUDs (i.e. 2m pagesize PMEM or 1G pagesize pmem) works with the same
> rules as hugetlbfs. fsdax not so much (as you say above) but it would
> follow up changes to perhaps adhere to similar scheme (not exactly sure
> how do deal with holes). HMM I am not sure what the rules are there.
> P2PDMA seems not applicable?

I would say, not "depends on the devmap", but what are the rules for
calling insert_pfn_pmd in the first place?

If users are allowed to create pmds that span many compound_head's then we
need logic as I showed. Otherwise we do not.

And I would document this relationship on the GUP side: "This do/while is
required because insert_pfn_pmd/pud() is used with compound pages smaller
than the PUD/PMD size", so it isn't confused with just "devmap".

Jason
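Spelled out so it would actually compile, the assertion being discussed above looks something like the following (hedged: it assumes pfn_t_to_page() to go from the pfn_t argument of insert_pfn_pmd() to a struct page, and that the intent is to compare the compound order against the PMD order, i.e. PMD_SHIFT - PAGE_SHIFT):

	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
	if (pfn_t_devmap(pfn)) {
		struct page *page = pfn_t_to_page(pfn);

		/*
		 * A devmap PMD is expected to map exactly one compound page:
		 * head at the first pfn, order matching the PMD size.
		 */
		WARN_ON(compound_head(page) != page);
		WARN_ON(compound_order(page) != PMD_SHIFT - PAGE_SHIFT);
		entry = pmd_mkdevmap(entry);
	}

As the rest of the thread spells out, these warnings would hold for device-dax after this series, but not for fsdax, which still maps base pages.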
On 10/13/21 20:43, Jason Gunthorpe wrote: > On Wed, Oct 13, 2021 at 08:18:08PM +0100, Joao Martins wrote: >> On 10/13/21 18:41, Jason Gunthorpe wrote: >>> On Mon, Oct 11, 2021 at 04:53:29PM +0100, Joao Martins wrote: >>>> On 10/8/21 12:54, Jason Gunthorpe wrote: >>> >>>>> The only optimization that might work here is to grab the head, then >>>>> compute the extent of tail pages and amalgamate them. Holding a ref on >>>>> the head also secures the tails. >>>> >>>> How about pmd_page(orig) / pud_page(orig) like what the rest of hugetlb/thp >>>> checks do? i.e. we would pass pmd_page(orig)/pud_page(orig) to __gup_device_huge() >>>> as an added @head argument. While keeping the same structure of counting tail pages >>>> between @addr .. @end if we have a head page. >>> >>> The right logic is what everything else does: >>> >>> page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); >>> refs = record_subpages(page, addr, end, pages + *nr); >>> head = try_grab_compound_head(pud_page(orig), refs, flags); >>> >>> If you can use this, or not, depends entirely on answering the >>> question of why does __gup_device_huge() exist at all. >>> >> So for device-dax it seems to be an untackled oversight[*], probably >> inherited from when fsdax devmap was introduced. What I don't know >> is the other devmap users :( > > devmap generic infrastructure waits until all page refcounts go to > zero, and it should wait until any fast GUP is serialized as part of > the TLB shootdown - otherwise it is leaking access to the memory it > controls beyond it's shutdown > > So, I don't think the different devmap users can break this? > maybe fsdax may not honor that after removing the pgmap ref count as we talked earlier in the thread without the memory failures stuff. But my point earlier on "oversight" was about the fact that we describe PMD/PUDs with base pages. And hence why we can't do this: page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); head = try_grab_compound_head(pud_page(orig), refs, flags); ... For devmap. >>> This I don't fully know: >>> >>> 1) As discussed quite a few times now, the entire get_dev_pagemap >>> stuff looks usless and should just be deleted. If you care about >>> optimizing this I would persue doing that as it will give the >>> biggest single win. >> >> I am not questioning the well-deserved improvement -- but from a pure >> optimization perspective the get_dev_pagemap() cost is not >> visible and quite negligeble. > > You are doing large enough GUPs then that the expensive xarray seach > is small compared to the rest? > The tests I showed are on 16G and 128G (I usually test with > 1Tb). But we are comparing 1 pgmap lookup, and 512 refcount atomic updates for a PMD sized pin (2M). It depends on what you think small is. Maybe for a 4K-16K of memory pinned, probably the pgmap lookup is more expensive. A couple months ago I remember measuring xarray lookups and that would be around ~150ns per lookup with few entries excluding the ref update. I can be pedantic and give you a more concrete measurement for get_dev_pagemap() but quite honestly don't think that it will be close with we pin a hugepage. But this is orthogonal to the pgmap ref existence, I was merely commenting on the performance aspect. >>> 2) It breaks up the PUD/PMD into tail pages and scans them all >>> Why? Can devmap combine multiple compound_head's into the same PTE? 
>>
>> I am not aware of any other usage of compound pages for devmap struct pages
>> than this series. At least I haven't seen device-dax or fsdax using
>> this.
>
> Let me ask this question differently, is this assertion OK?
>
I understood the question. I was more trying to learn from you on the
HMM/P2PDMA use-cases, as it sounded like compound pages exist elsewhere in
devmap :)

> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -808,8 +808,13 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
>  	}
>
>  	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
> -	if (pfn_t_devmap(pfn))
> +	if (pfn_t_devmap(pfn)) {
> +		struct page *page = pfn_to_page(pfn);
> +
> +		WARN_ON(compound_head(page) != page);
> +		WARN_ON(compound_order(page) != PMD_SHIFT - PAGE_SHIFT);
>  		entry = pmd_mkdevmap(entry);
> +	}
>  	if (write) {
>  		entry = pmd_mkyoung(pmd_mkdirty(entry));
>  		entry = maybe_pmd_mkwrite(entry, vma);
>
> (and same for insert_pfn_pud)
>
> You said it is OK for device/dax/device.c?
>
Yes, correct. (I also verified with a similar snippet to the above, to be
extra pedantic.)

I would like to emphasize that it is only OK *after this series*. Without
this series there is neither a compound order nor a head, nor any compound
page whatsoever.

> And not for fs/dax.c?
>
IIUC, correct. The fs/dax code is a little fuzzy on the sector-to-PFN
translation, but still: there are no compound pages in fsdax (nor do I add
them). So compound_order() doesn't exist (hence order 0) and there's no
head, which also means that compound_head(@page) returns @page (a tad
misleading as a "head" definition, since a subpage of a real PMD-sized
compound page would not return @page).

>
>> Unless HMM does this stuff, or some sort of devmap page migration? P2PDMA
>> doesn't seem to be (yet?) caught by any of the GUP path at least before
>> Logan's series lands. Or am I misunderstanding things here?
>
> Of the places that call the insert_pfn_pmd/pud call chains I only see
> device/dax/device.c and fs/dax.c as being linked to devmap. So other
> devmap users don't use this stuff.
>
>> I was changing __gup_device_huge() with something similar to the above, but yeah
>> it follows that algorithm as inside your do { } while() (thanks!). I can
>> turn __gup_device_huge() into another (renamed to something like try_grab_pages())
>> helper and replace the callsites of gup_huge_{pud,pmd} for the THP/hugetlbfs
>> equivalent handling.
>
> I suppose it should be some #define because the (pmd_val != orig) logic
> is not shareable
>
I wasn't going to have that pmd_val() in there. Just this refcount part:

	page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
	refs = record_subpages(page, addr, end, pages + *nr);
	head = try_grab_compound_head(pud_page(orig), refs, flags);

and __gup_device_huge() melded in.

> But, yes, a general call that the walker should make at any level to
> record a pfn -> npages range efficiently.
>
>> I think the right answer is "depends on the devmap" type. device-dax with
>> PMD/PUDs (i.e. 2m pagesize PMEM or 1G pagesize pmem) works with the same
>> rules as hugetlbfs. fsdax not so much (as you say above) but it would
>> need follow-up changes to perhaps adhere to a similar scheme (not exactly sure
>> how to deal with holes). HMM I am not sure what the rules are there.
>> P2PDMA seems not applicable?
>
> I would say, not "depends on the devmap", but what are the rules for
> calling insert_pfn_pmd in the first place.
>
I am going to translate that into: HMM and P2PDMA are not affected so far,
as you also said earlier above. Even after Logan's P2PDMA NVMe series it
appears it is always PTEs.
So fsdax and device-dax are the only pmd_devmap users we should care about,
and for pud_devmap() only device-dax.

> If users are allowed to create pmds that span many compound_head's
> then we need logic as I showed. Otherwise we do not.
>
/me nods.

> And I would document this relationship on the GUP side "This do/while
> is required because insert_pfn_pmd/pud() is used with compound pages
> smaller than the PUD/PMD size" so it isn't confused with just
> "devmap"

Also, it's not that PMDs span compound heads, it's that PMDs/PUDs use
just base pages. Compound pages/head in devmap are only introduced by
this series and for device-dax only.
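As a rough sketch of the documentation being asked for (the wording below is
illustrative only, not from any posted patch), the comment placed above the
devmap do/while in __gup_device_huge() could read something like:

	/*
	 * This do/while exists because insert_pfn_pmd()/insert_pfn_pud() can
	 * be used with pages smaller than the PMD/PUD size: without compound
	 * devmaps (e.g. fsdax today) the entry is backed by order-0 base
	 * pages, so a single entry spans many compound_head()s and each one
	 * needs its own reference. With a compound devmap the loop body runs
	 * exactly once.
	 */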
On Thu, Oct 14, 2021 at 06:56:51PM +0100, Joao Martins wrote:
> > And I would document this relationship on the GUP side "This do/while
> > is required because insert_pfn_pmd/pud() is used with compound pages
> > smaller than the PUD/PMD size" so it isn't confused with just
> > "devmap"
>
> Also, it's not that PMDs span compound heads, it's that PMDs/PUDs use
> just base pages. Compound pages/head in devmap are only introduced by
> this series and for device-dax only.

I think it all makes sense, I just want to clarify what I mean by
compound_head:

  compound_head(base_page) == base_page

Ie a PMD consisting only of order-0 pages would have N 'compound_heads'
within it, even though nothing is a compound page.

Which is relevant because the GUP algorithms act on the compound_head.

Jason
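To make that distinction concrete, here is an illustrative-only loop (the
function name is made up, and it assumes @pfn is aligned to any compound
boundary): with order-0 pages compound_head() returns the page itself and the
loop advances one page at a time, while with a compound devmap it advances by
the whole compound size in a single step.

/* Illustrative only: visit every compound_head backing one page table entry. */
static void visit_heads_in_entry(unsigned long pfn, unsigned long nr_pages)
{
	unsigned long i = 0;

	while (i < nr_pages) {
		struct page *head = compound_head(pfn_to_page(pfn + i));

		/* grab/record @head here */
		i += compound_nr(head);	/* 1 for an order-0 base page */
	}
}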
On Thu, Sep 30, 2021 at 01:01:14PM +1000, Alistair Popple wrote:
> On Thursday, 30 September 2021 5:34:05 AM AEST Jason Gunthorpe wrote:
> > On Wed, Sep 29, 2021 at 12:50:15PM +0100, Joao Martins wrote:
> > >
> > > > If the get_dev_pagemap has to remain then it just means we have to
> > > > flush before changing pagemap pointers
> > > Right -- I don't think we should need it as that discussion on the other
> > > thread goes.
> > >
> > > OTOH, using @pgmap might be useful to unblock gup-fast FOLL_LONGTERM
> > > for certain devmap types[0] (like MEMORY_DEVICE_GENERIC [device-dax]
> > > can support it but not MEMORY_DEVICE_FSDAX [fsdax]).
> >
> > When looking at Logan's patches I think it is pretty clear to me that
> > page->pgmap must never be a dangling pointer if the caller has a
> > legitimate refcount on the page.
> >
> > For instance the migrate code and such all blindly call
> > is_device_private_page() on the struct page expecting a valid
> > page->pgmap.
> >
> > This also looks like it is happening, ie
> >
> > void __put_page(struct page *page)
> > {
> > 	if (is_zone_device_page(page)) {
> > 		put_dev_pagemap(page->pgmap);
> >
> > Is indeed putting the pgmap ref back when the page becomes ungettable.
> >
> > This properly happens when the page refcount goes to zero and so it
> > should fully interlock with __page_cache_add_speculative():
> >
> > 	if (unlikely(!page_ref_add_unless(page, count, 0))) {
> >
> > Thus, in gup.c, if we succeed at try_grab_compound_head() then
> > page->pgmap is a stable pointer with a valid refcount.
> >
> > So, all the external pgmap stuff in gup.c is completely pointless.
> > try_grab_compound_head() provides us with an equivalent protection at
> > lower cost. Remember gup.c doesn't deref the pgmap at all.
> >
> > Dan/Alistair/Felix do you see any hole in that argument??
>
> As background, note that device pages are currently considered free when
> refcount == 1 but the pgmap reference is dropped when the refcount transitions
> 1->0. The final pgmap reference is typically dropped when a driver calls
> memunmap_pages() and put_page() drops the last page reference:
>
> void memunmap_pages(struct dev_pagemap *pgmap)
> {
> 	unsigned long pfn;
> 	int i;
>
> 	dev_pagemap_kill(pgmap);
> 	for (i = 0; i < pgmap->nr_range; i++)
> 		for_each_device_pfn(pfn, pgmap, i)
> 			put_page(pfn_to_page(pfn));
> 	dev_pagemap_cleanup(pgmap);
>
> If there are still pgmap references dev_pagemap_cleanup(pgmap) will block until
> the final reference is dropped. So I think your argument holds at least for
> DEVICE_PRIVATE and DEVICE_GENERIC. DEVICE_FS_DAX defines its own pagemap
> cleanup but I can't see why the same argument wouldn't hold there - if a page
> has a valid refcount it must have a reference on the pagemap too.

To close this circle - the issue is use after free on the struct page *
entry while it has a zero ref.

memunmap_pages() does wait for the refcount to go to zero, but it then
goes on to free the memory under the struct pages. However, there are
possibly still untracked references to this memory in the page tables.

This is the bug Dan has been working on - to shoot down page table
mappings before getting to memunmap_pages().

Getting the pgmap ref will make the use-after-free never crash, just be
a silent security problem. :(

Jason
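A condensed, illustrative-only restatement of the interlock being described
(the function name below is made up; the real logic lives in
try_grab_compound_head() and __put_page()):

/*
 * Illustrative sketch: why a successful speculative grab implies a live
 * page->pgmap. The grab only succeeds while the refcount is non-zero, and
 * the pgmap reference is only put back on the final refcount transition,
 * so any page we managed to pin still holds its pgmap alive.
 */
static inline bool sketch_try_get_page(struct page *page, int refs)
{
	/* Fails once the page has been returned to its dev_pagemap */
	return page_ref_add_unless(page, refs, 0);
}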
On Wed, Sep 29, 2021 at 12:50:15PM +0100, Joao Martins wrote:
> On 9/28/21 19:01, Jason Gunthorpe wrote:
> > On Thu, Sep 23, 2021 at 05:51:04PM +0100, Joao Martins wrote:
> >> So ... if pgmap accounting was removed from gup-fast then this patch
> >> would be a lot simpler and we could perhaps just fall back to the regular
> >> hugepage case (THP, HugeTLB) like your suggestion at the top. See at the
> >> end, below the scissors mark, for the ballpark of changes.
> >>
> >> So far my options seem to be: 1) this patch, which leverages the existing
> >> iteration logic, or 2) switching to for_each_compound_range() -- see my previous
> >> reply -- or 3) waiting for Dan to remove @pgmap accounting in gup-fast and using
> >> something similar to what is below the scissors mark.
> >>
> >> What do you think would be the best course of action?
> >
> > I still think the basic algorithm should be to accumulate physically
> > contiguous addresses when walking the page table and then flush them
> > back to struct pages once we can't accumulate any more.
> >
> > That works for both the walkers and all the page types?
> >
>
> The logic already handles all page types -- I was trying to avoid the extra
> complexity in the regular hugetlb/THP path by not merging the handling of the
> oddball case that is devmap (or fundamentally the devmap
> non-compound case in the future).

FYI, this untested thing is what I came to when I tried to make
something like this:

/*
 * A large page entry such as PUD/PMD can point to a struct page. In cases like
 * THP this struct page will be a compound page of the same order as the page
 * table level. However, in cases like DAX or more generally pgmap ZONE_DEVICE,
 * the PUD/PMD may point at the first pfn in a string of pages.
 *
 * This helper iterates over all head pages or all the non-compound base pages.
 */
struct pt_entry_iter_state {
	struct page *head;
	unsigned long compound_nr;
	unsigned long pfn;
	unsigned long end_pfn;
};

static inline struct page *__pt_start_iter(struct pt_entry_iter_state *state,
					   struct page *page, unsigned long pfn,
					   unsigned int entry_size)
{
	state->head = compound_head(page);
	state->compound_nr = compound_nr(page);
	state->pfn = pfn & (~(state->compound_nr - 1));
	state->end_pfn = pfn + entry_size / PAGE_SIZE;
	return state->head;
}

static inline struct page *__pt_next_page(struct pt_entry_iter_state *state)
{
	state->pfn += state->compound_nr;
	if (state->end_pfn <= state->pfn)
		return NULL;
	state->head = pfn_to_page(state->pfn);
	state->compound_nr = compound_nr(state->head);
	return state->head;
}

#define for_each_page_in_pt_entry(state, page, pfn, entry_size) \
	for (page = __pt_start_iter(state, page, pfn, entry_size); page; \
	     page = __pt_next_page(state))

static void remove_pages_from_page_table(struct vm_area_struct *vma,
					 struct page *page, unsigned long pfn,
					 unsigned int entry_size, bool is_dirty,
					 bool is_young)
{
	struct pt_entry_iter_state state;

	for_each_page_in_pt_entry(&state, page, pfn, entry_size)
		remove_page_from_page_table(vma, page, is_dirty, is_young);
}

Jason
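As a usage sketch only: remove_pages_from_page_table() and
remove_page_from_page_table() are names from the untested snippet above (the
latter is never defined there), and example_zap_pmd() below is made up. A
PMD-level walker on a THP-capable arch might drive the iterator like this:

static void example_zap_pmd(struct vm_area_struct *vma, pmd_t orig)
{
	/*
	 * One call covers the whole PMD entry; the iterator visits each
	 * compound head, or every order-0 base page when the devmap is
	 * not compound.
	 */
	remove_pages_from_page_table(vma, pmd_page(orig), pmd_pfn(orig),
				     PMD_SIZE, pmd_dirty(orig),
				     pmd_young(orig));
}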
diff --git a/mm/gup.c b/mm/gup.c
index 7a406d79bd2e..0741d2c0ba5e 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2234,17 +2234,30 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 }
 #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
 
+
+static int record_subpages(struct page *page, unsigned long addr,
+			   unsigned long end, struct page **pages)
+{
+	int nr;
+
+	for (nr = 0; addr != end; addr += PAGE_SIZE)
+		pages[nr++] = page++;
+
+	return nr;
+}
+
 #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
 static int __gup_device_huge(unsigned long pfn, unsigned long addr,
 			     unsigned long end, unsigned int flags,
 			     struct page **pages, int *nr)
 {
-	int nr_start = *nr;
+	int refs, nr_start = *nr;
 	struct dev_pagemap *pgmap = NULL;
 	int ret = 1;
 
 	do {
-		struct page *page = pfn_to_page(pfn);
+		struct page *head, *page = pfn_to_page(pfn);
+		unsigned long next = addr + PAGE_SIZE;
 
 		pgmap = get_dev_pagemap(pfn, pgmap);
 		if (unlikely(!pgmap)) {
@@ -2252,16 +2265,25 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
 			ret = 0;
 			break;
 		}
-		SetPageReferenced(page);
-		pages[*nr] = page;
-		if (unlikely(!try_grab_page(page, flags))) {
-			undo_dev_pagemap(nr, nr_start, flags, pages);
+
+		head = compound_head(page);
+		/* @end is assumed to be limited at most one compound page */
+		if (PageHead(head))
+			next = end;
+		refs = record_subpages(page, addr, next, pages + *nr);
+
+		SetPageReferenced(head);
+		if (unlikely(!try_grab_compound_head(head, refs, flags))) {
+			if (PageHead(head))
+				ClearPageReferenced(head);
+			else
+				undo_dev_pagemap(nr, nr_start, flags, pages);
 			ret = 0;
 			break;
 		}
-		(*nr)++;
-		pfn++;
-	} while (addr += PAGE_SIZE, addr != end);
+		*nr += refs;
+		pfn += refs;
+	} while (addr += (refs << PAGE_SHIFT), addr != end);
 
 	put_dev_pagemap(pgmap);
 	return ret;
@@ -2320,17 +2342,6 @@ static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
 }
 #endif
 
-static int record_subpages(struct page *page, unsigned long addr,
-			   unsigned long end, struct page **pages)
-{
-	int nr;
-
-	for (nr = 0; addr != end; addr += PAGE_SIZE)
-		pages[nr++] = page++;
-
-	return nr;
-}
-
 #ifdef CONFIG_ARCH_HAS_HUGEPD
 static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
 				      unsigned long sz)