Message ID | 20190612220320.2223898-4-songliubraving@fb.com (mailing list archive)
---|---
State | New, archived
Series | THP aware uprobe
On Wed, Jun 12, 2019 at 03:03:17PM -0700, Song Liu wrote:
> This patches introduces a new foll_flag: FOLL_SPLIT_PMD. As the name says
> FOLL_SPLIT_PMD splits huge pmd for given mm_struct, the underlining huge
> page stays as-is.
>
> FOLL_SPLIT_PMD is useful for cases where we need to use regular pages,
> but would switch back to huge page and huge pmd on. One of such example
> is uprobe. The following patches use FOLL_SPLIT_PMD in uprobe.
>
> Signed-off-by: Song Liu <songliubraving@fb.com>
>
> [...]
>
> +		} else {	/* flags & FOLL_SPLIT_PMD */
> +			unsigned long addr;
> +			pgprot_t prot;
> +			pte_t *pte;
> +			int i;
> +
> +			spin_unlock(ptl);
> +			split_huge_pmd(vma, pmd, address);

All the code below is only relevant for file-backed THP. It will break for
anon-THP.

And I'm not convinced that it belongs here at all. The user requested a PMD
split, and that is done once split_huge_pmd() returns. The rest can be
handled by the caller as needed.

> +			lock_page(page);
> +			pte = get_locked_pte(mm, address, &ptl);
> +			if (!pte) {
> +				unlock_page(page);
> +				return no_page_table(vma, flags);

Or should it be -ENOMEM?

> +			}
> [...]
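To make the distinction in this review concrete: FOLL_SPLIT splits the
compound page itself via split_huge_page() and hands back an independent 4K
page, while FOLL_SPLIT_PMD only rewrites the PMD mapping as PTEs via
split_huge_pmd(), leaving the page compound so the mapping can later be
collapsed back to huge. A minimal caller-side sketch, assuming the
~v5.2-era GUP API; probe_one_page() is a hypothetical helper, not code from
this series:

	/*
	 * Hypothetical helper: look up the 4K page backing addr, forcing a
	 * huge PMD mapping (if any) to be split into PTEs while keeping the
	 * underlying compound page intact. As with any follow_page() call,
	 * this may return NULL for a non-present page; callers that need
	 * fault-in should use get_user_pages*() instead.
	 */
	static struct page *probe_one_page(struct vm_area_struct *vma,
					   unsigned long addr)
	{
		return follow_page(vma, addr, FOLL_GET | FOLL_SPLIT_PMD);
	}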
> On Jun 13, 2019, at 5:57 AM, Kirill A. Shutemov <kirill@shutemov.name> wrote:
>
> [...]
>
>> +			spin_unlock(ptl);
>> +			split_huge_pmd(vma, pmd, address);
>
> All the code below is only relevant for file-backed THP. It will break for
> anon-THP.

Oh, yes, that makes sense.

> And I'm not convinced that it belongs here at all. The user requested a PMD
> split, and that is done once split_huge_pmd() returns. The rest can be
> handled by the caller as needed.

I put this part here because the split of a file-backed THP is not really
complete when split_huge_pmd() returns: the PTEs are not populated yet. And
I would like that done before calling follow_page_pte() below. Maybe we can
still do it here, just for file-backed THPs?

If we were to move it, shall we move it to the callers of
follow_page_mask()? In that case, we will probably end up with similar code
in two places: __get_user_pages() and follow_page().

Did I get this right?

>> +			lock_page(page);
>> +			pte = get_locked_pte(mm, address, &ptl);
>> +			if (!pte) {
>> +				unlock_page(page);
>> +				return no_page_table(vma, flags);
>
> Or should it be -ENOMEM?

Yeah, -ENOMEM is more accurate.

Thanks,
Song
On Thu, Jun 13, 2019 at 01:57:30PM +0000, Song Liu wrote:
> > And I'm not convinced that it belongs here at all. The user requested a PMD
> > split, and that is done once split_huge_pmd() returns. The rest can be
> > handled by the caller as needed.
>
> I put this part here because the split of a file-backed THP is not really
> complete when split_huge_pmd() returns: the PTEs are not populated yet. And
> I would like that done before calling follow_page_pte() below. Maybe we can
> still do it here, just for file-backed THPs?
>
> If we were to move it, shall we move it to the callers of
> follow_page_mask()? In that case, we will probably end up with similar code
> in two places: __get_user_pages() and follow_page().
>
> Did I get this right?

Would it be enough to replace pte_offset_map_lock() in follow_page_pte()
with pte_alloc_map_lock()?

This will leave a bunch of unpopulated PTE entries, but that is fine: they
will be populated on the next access to them.
> On Jun 13, 2019, at 7:16 AM, Kirill A. Shutemov <kirill@shutemov.name> wrote:
>
> [...]
>
> Would it be enough to replace pte_offset_map_lock() in follow_page_pte()
> with pte_alloc_map_lock()?

This is similar to my previous version:

+	} else {	/* flags & FOLL_SPLIT_PMD */
+		pte_t *pte;
+		spin_unlock(ptl);
+		split_huge_pmd(vma, pmd, address);
+		pte = get_locked_pte(mm, address, &ptl);
+		if (!pte)
+			return no_page_table(vma, flags);
+		spin_unlock(ptl);
+		ret = 0;
+	}

I think this is cleaner than using pte_alloc_map_lock() in
follow_page_pte(). What's your thought on these two versions (^^^ vs.
pte_alloc_map_lock)?

> This will leave a bunch of unpopulated PTE entries, but that is fine: they
> will be populated on the next access to them.

We need to handle a page fault on the next access, right? Since we have
already allocated everything, we can just populate the PTE entries and save
a lot of page faults (assuming we will access them later).

Thanks,
Song
On Thu, Jun 13, 2019 at 03:03:01PM +0000, Song Liu wrote:
> [...]
>
> I think this is cleaner than using pte_alloc_map_lock() in
> follow_page_pte(). What's your thought on these two versions (^^^ vs.
> pte_alloc_map_lock)?

It's an additional lock-unlock cycle and a few more lines of code...

> > This will leave a bunch of unpopulated PTE entries, but that is fine: they
> > will be populated on the next access to them.
>
> We need to handle a page fault on the next access, right? Since we have
> already allocated everything, we can just populate the PTE entries and save
> a lot of page faults (assuming we will access them later).

Not a lot, due to faultaround, and they may never happen; but you need to
tear down the mapping anyway.
> On Jun 13, 2019, at 8:14 AM, Kirill A. Shutemov <kirill.shutemov@linux.intel.com> wrote:
>
> [...]
>
> Not a lot, due to faultaround, and they may never happen; but you need to
> tear down the mapping anyway.

I see. Let me try this way.

Thanks,
Song
Hi Kirill,

> On Jun 13, 2019, at 8:24 AM, Song Liu <songliubraving@fb.com> wrote:
>
> [...]
>
> I see. Let me try this way.

To make sure I understand your suggestion, here is what I got:

diff --git c/mm/gup.c w/mm/gup.c
index ddde097cf9e4..85e6f46fd925 100644
--- c/mm/gup.c
+++ w/mm/gup.c
@@ -197,7 +197,10 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
 	if (unlikely(pmd_bad(*pmd)))
 		return no_page_table(vma, flags);
 
-	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+	ptep = pte_alloc_map_lock(mm, pmd, address, &ptl);
+	if (!ptep)
+		return ERR_PTR(-ENOMEM);
+
 	pte = *ptep;
 	if (!pte_present(pte)) {
 		swp_entry_t entry;
@@ -398,7 +401,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
 		spin_unlock(ptl);
 		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
 	}
-	if (flags & FOLL_SPLIT) {
+	if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
 		int ret;
 		page = pmd_page(*pmd);
 		if (is_huge_zero_page(page)) {
@@ -407,7 +410,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
 			split_huge_pmd(vma, pmd, address);
 			if (pmd_trans_unstable(pmd))
 				ret = -EBUSY;
-		} else {
+		} else if (flags & FOLL_SPLIT) {
 			if (unlikely(!try_get_page(page))) {
 				spin_unlock(ptl);
 				return ERR_PTR(-ENOMEM);
@@ -419,6 +422,10 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
 			put_page(page);
 			if (pmd_none(*pmd))
 				return no_page_table(vma, flags);
+		} else {	/* flags & FOLL_SPLIT_PMD */
+			spin_unlock(ptl);
+			split_huge_pmd(vma, pmd, address);
+			ret = 0;
 		}
 
 		return ret ? ERR_PTR(ret) :
 			follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);

This version doesn't work as-is, because it returns at the first check:

	if (unlikely(pmd_bad(*pmd)))
		return no_page_table(vma, flags);

Did I misunderstand anything here?

Thanks,
Song
> On Jun 13, 2019, at 9:47 AM, Song Liu <songliubraving@fb.com> wrote:
>
> [...]
>
> This version doesn't work as-is, because it returns at the first check:
>
> 	if (unlikely(pmd_bad(*pmd)))
> 		return no_page_table(vma, flags);
>
> Did I misunderstand anything here?

I guess this would be the best. It _is_ a lot simpler:

diff --git c/mm/gup.c w/mm/gup.c
index ddde097cf9e4..0cd3ce599f41 100644
--- c/mm/gup.c
+++ w/mm/gup.c
@@ -398,7 +398,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
 		spin_unlock(ptl);
 		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
 	}
-	if (flags & FOLL_SPLIT) {
+	if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
 		int ret;
 		page = pmd_page(*pmd);
 		if (is_huge_zero_page(page)) {
@@ -407,7 +407,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
 			split_huge_pmd(vma, pmd, address);
 			if (pmd_trans_unstable(pmd))
 				ret = -EBUSY;
-		} else {
+		} else if (flags & FOLL_SPLIT) {
 			if (unlikely(!try_get_page(page))) {
 				spin_unlock(ptl);
 				return ERR_PTR(-ENOMEM);
@@ -419,6 +419,11 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
 			put_page(page);
 			if (pmd_none(*pmd))
 				return no_page_table(vma, flags);
+		} else {	/* flags & FOLL_SPLIT_PMD */
+			spin_unlock(ptl);
+			ret = 0;
+			split_huge_pmd(vma, pmd, address);
+			pte_alloc(mm, pmd);
 		}

Thanks again for the suggestions. I will send v4 soon.

Song
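For reference, pte_alloc() in the snippet above only installs a PTE page
table under the PMD when none is there yet; the individual PTEs stay empty
and are filled on fault, which is exactly the faultaround behavior
discussed earlier in the thread. Paraphrasing the ~v5.2 definition from
include/linux/mm.h (quoted from memory, so check the tree you are reading
against):

	/* Allocate a PTE table for this PMD only when none is installed;
	 * evaluates non-zero when the allocation fails. */
	#define pte_alloc(mm, pmd) \
		(unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd))

Note that the macro evaluates non-zero on allocation failure, so a caller
that wants to report -ENOMEM has to check its result.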
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0ab8c7d84cd0..e605acc4fc81 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2642,6 +2642,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 #define FOLL_COW	0x4000	/* internal GUP flag */
 #define FOLL_ANON	0x8000	/* don't do file mappings */
 #define FOLL_LONGTERM	0x10000	/* mapping lifetime is indefinite: see below */
+#define FOLL_SPLIT_PMD	0x20000	/* split huge pmd before returning */
 
 /*
  * NOTE on FOLL_LONGTERM:
diff --git a/mm/gup.c b/mm/gup.c
index ddde097cf9e4..3d05bddb56c9 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -398,7 +398,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
 		spin_unlock(ptl);
 		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
 	}
-	if (flags & FOLL_SPLIT) {
+	if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
 		int ret;
 		page = pmd_page(*pmd);
 		if (is_huge_zero_page(page)) {
@@ -407,7 +407,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
 			split_huge_pmd(vma, pmd, address);
 			if (pmd_trans_unstable(pmd))
 				ret = -EBUSY;
-		} else {
+		} else if (flags & FOLL_SPLIT) {
 			if (unlikely(!try_get_page(page))) {
 				spin_unlock(ptl);
 				return ERR_PTR(-ENOMEM);
@@ -419,8 +419,40 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
 			put_page(page);
 			if (pmd_none(*pmd))
 				return no_page_table(vma, flags);
-		}
+		} else {	/* flags & FOLL_SPLIT_PMD */
+			unsigned long addr;
+			pgprot_t prot;
+			pte_t *pte;
+			int i;
+
+			spin_unlock(ptl);
+			split_huge_pmd(vma, pmd, address);
+			lock_page(page);
+			pte = get_locked_pte(mm, address, &ptl);
+			if (!pte) {
+				unlock_page(page);
+				return no_page_table(vma, flags);
+			}
+
+			/* get refcount for every small page */
+			page_ref_add(page, HPAGE_PMD_NR);
+
+			prot = READ_ONCE(vma->vm_page_prot);
+			for (i = 0, addr = address & PMD_MASK;
+			     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
+				struct page *p = page + i;
+
+				pte = pte_offset_map(pmd, addr);
+				VM_BUG_ON(!pte_none(*pte));
+				set_pte_at(mm, addr, pte, mk_pte(p, prot));
+				page_add_file_rmap(p, false);
+			}
+
+			spin_unlock(ptl);
+			unlock_page(page);
+			add_mm_counter(mm, mm_counter_file(page), HPAGE_PMD_NR);
+			ret = 0;
+		}
 
 		return ret ? ERR_PTR(ret) :
 			follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
 	}
This patch introduces a new foll_flag: FOLL_SPLIT_PMD. As the name says,
FOLL_SPLIT_PMD splits the huge pmd for the given mm_struct; the underlying
huge page stays as-is.

FOLL_SPLIT_PMD is useful for cases where we need to use regular pages for
now, but would like to switch back to the huge page and huge pmd later.
One such example is uprobe. The following patches use FOLL_SPLIT_PMD in
uprobe.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 include/linux/mm.h |  1 +
 mm/gup.c           | 38 +++++++++++++++++++++++++++++++++++---
 2 files changed, 36 insertions(+), 3 deletions(-)
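Since the commit message names uprobe as the consumer, here is a rough
sketch of the kind of call site the follow-up patches would add. This is a
hypothetical reconstruction against the ~v5.2 get_user_pages_remote()
signature, not the actual uprobe patch:

	/*
	 * Sketch: pin the page at vaddr in the target mm, splitting a huge
	 * PMD mapping into PTEs on the way; the huge page itself stays a
	 * compound page, so the mapping can be re-collapsed afterwards.
	 */
	struct page *old_page;
	struct vm_area_struct *vma;
	long nr;

	nr = get_user_pages_remote(NULL /* tsk */, mm, vaddr, 1,
				   FOLL_FORCE | FOLL_SPLIT_PMD,
				   &old_page, &vma, NULL);
	if (nr != 1)
		return nr < 0 ? (int)nr : -EFAULT;
	/* ... patch in the probed instruction, then put_page(old_page) ... */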