Message ID | 20241216165105.56185-9-dev.jain@arm.com (mailing list archive) |
---|---|
State | New |
Series | khugepaged: Asynchronous mTHP collapse |
On 16/12/2024 16:51, Dev Jain wrote:
> Abstract away taking the mmap_lock exclusively, copying page contents, and
> setting the PMD, into vma_collapse_anon_folio_pmd().
>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> ---
>  mm/khugepaged.c | 119 +++++++++++++++++++++++++++---------------------
>  1 file changed, 66 insertions(+), 53 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 078794aa3335..88beebef773e 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1111,58 +1111,17 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
>  	return SCAN_SUCCEED;
>  }
>
> -static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> -			      int referenced, int unmapped, int order,
> -			      struct collapse_control *cc)
> +static int vma_collapse_anon_folio_pmd(struct mm_struct *mm, unsigned long address,
> +		struct vm_area_struct *vma, struct collapse_control *cc, pmd_t *pmd,
> +		struct folio *folio)
>  {
> +	struct mmu_notifier_range range;
> +	spinlock_t *pmd_ptl, *pte_ptl;
>  	LIST_HEAD(compound_pagelist);
> -	pmd_t *pmd, _pmd;
> -	pte_t *pte;
>  	pgtable_t pgtable;
> -	struct folio *folio;
> -	spinlock_t *pmd_ptl, *pte_ptl;
> -	int result = SCAN_FAIL;
> -	struct vm_area_struct *vma;
> -	struct mmu_notifier_range range;
> -
> -	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> -
> -	/*
> -	 * Before allocating the hugepage, release the mmap_lock read lock.
> -	 * The allocation can take potentially a long time if it involves
> -	 * sync compaction, and we do not need to hold the mmap_lock during
> -	 * that. We will recheck the vma after taking it again in write mode.
> -	 */
> -	mmap_read_unlock(mm);
> -
> -	result = alloc_charge_folio(&folio, mm, order, cc);
> -	if (result != SCAN_SUCCEED)
> -		goto out_nolock;
> -
> -	mmap_read_lock(mm);
> -	result = hugepage_vma_revalidate(mm, address, true, &vma, order, cc);
> -	if (result != SCAN_SUCCEED) {
> -		mmap_read_unlock(mm);
> -		goto out_nolock;
> -	}
> -
> -	result = find_pmd_or_thp_or_none(mm, address, &pmd);
> -	if (result != SCAN_SUCCEED) {
> -		mmap_read_unlock(mm);
> -		goto out_nolock;
> -	}
> -
> -	if (unmapped) {
> -		/*
> -		 * __collapse_huge_page_swapin will return with mmap_lock
> -		 * released when it fails. So we jump out_nolock directly in
> -		 * that case. Continuing to collapse causes inconsistency.
> -		 */
> -		result = __collapse_huge_page_swapin(mm, vma, address, pmd,
> -						     referenced, order);
> -		if (result != SCAN_SUCCEED)
> -			goto out_nolock;
> -	}
> +	int result;
> +	pmd_t _pmd;
> +	pte_t *pte;
>
>  	mmap_read_unlock(mm);
>  	/*
> @@ -1174,7 +1133,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>  	 * mmap_lock.
>  	 */
>  	mmap_write_lock(mm);
> -	result = hugepage_vma_revalidate(mm, address, true, &vma, order, cc);
> +
> +	result = hugepage_vma_revalidate(mm, address, true, &vma, HPAGE_PMD_ORDER, cc);
>  	if (result != SCAN_SUCCEED)
>  		goto out_up_write;
>  	/* check if the pmd is still valid */
> @@ -1206,7 +1166,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>  	pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
>  	if (pte) {
>  		result = __collapse_huge_page_isolate(vma, address, pte, cc,
> -						      &compound_pagelist, order);
> +						      &compound_pagelist, HPAGE_PMD_ORDER);
>  		spin_unlock(pte_ptl);
>  	} else {
>  		result = SCAN_PMD_NULL;
> @@ -1262,11 +1222,64 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>  	deferred_split_folio(folio, false);
>  	spin_unlock(pmd_ptl);
>
> -	folio = NULL;
> -
>  	result = SCAN_SUCCEED;
>  out_up_write:
>  	mmap_write_unlock(mm);
> +	return result;
> +}
> +
> +static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> +			      int referenced, int unmapped, int order,
> +			      struct collapse_control *cc)
> +{
> +	struct vm_area_struct *vma;
> +	int result = SCAN_FAIL;
> +	struct folio *folio;
> +	pmd_t *pmd;
> +
> +	/*
> +	 * Before allocating the hugepage, release the mmap_lock read lock.
> +	 * The allocation can take potentially a long time if it involves
> +	 * sync compaction, and we do not need to hold the mmap_lock during
> +	 * that. We will recheck the vma after taking it again in write mode.
> +	 */
> +	mmap_read_unlock(mm);
> +
> +	result = alloc_charge_folio(&folio, mm, order, cc);
> +	if (result != SCAN_SUCCEED)
> +		goto out_nolock;
> +
> +	mmap_read_lock(mm);
> +	result = hugepage_vma_revalidate(mm, address, true, &vma, order, cc);
> +	if (result != SCAN_SUCCEED) {
> +		mmap_read_unlock(mm);
> +		goto out_nolock;
> +	}
> +
> +	result = find_pmd_or_thp_or_none(mm, address, &pmd);
> +	if (result != SCAN_SUCCEED) {
> +		mmap_read_unlock(mm);
> +		goto out_nolock;
> +	}
> +
> +	if (unmapped) {
> +		/*
> +		 * __collapse_huge_page_swapin will return with mmap_lock
> +		 * released when it fails. So we jump out_nolock directly in
> +		 * that case. Continuing to collapse causes inconsistency.
> +		 */
> +		result = __collapse_huge_page_swapin(mm, vma, address, pmd,
> +						     referenced, order);
> +		if (result != SCAN_SUCCEED)
> +			goto out_nolock;
> +	}
> +
> +	if (order == HPAGE_PMD_ORDER)
> +		result = vma_collapse_anon_folio_pmd(mm, address, vma, cc, pmd, folio);

I think the locking is broken here? collapse_huge_page() used to enter with the
mmap read lock and exit without the lock held at all. After the change, this is
only true for order == HPAGE_PMD_ORDER. For other orders, you exit with the mmap
read lock still held. Perhaps:

else
	mmap_read_unlock(mm);

> +
> +	if (result == SCAN_SUCCEED)
> +		folio = NULL;
> +
>  out_nolock:
>  	if (folio)
>  		folio_put(folio);
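To make the suggested change concrete, here is a minimal sketch of the tail of collapse_huge_page() with the proposed else branch folded in. Everything except the else arm comes from the patch above; the else arm is Ryan's suggestion, not code that has been posted or tested:

```c
	if (order == HPAGE_PMD_ORDER)
		/* Helper drops the read lock, retakes mmap_lock for write, releases it. */
		result = vma_collapse_anon_folio_pmd(mm, address, vma, cc, pmd, folio);
	else
		/* Suggested fix: other orders must not return with the read lock held. */
		mmap_read_unlock(mm);

	if (result == SCAN_SUCCEED)
		folio = NULL;

out_nolock:
	if (folio)
		folio_put(folio);
```

With the else arm, every path out of collapse_huge_page() leaves the mmap_lock released, matching the pre-refactor behaviour Ryan describes.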
On 18/12/24 12:54 am, Ryan Roberts wrote:
> On 16/12/2024 16:51, Dev Jain wrote:
>> Abstract away taking the mmap_lock exclusively, copying page contents, and
>> setting the PMD, into vma_collapse_anon_folio_pmd().
>>
>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>> ---
>>  mm/khugepaged.c | 119 +++++++++++++++++++++++++++---------------------
>>  1 file changed, 66 insertions(+), 53 deletions(-)
>>
>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> index 078794aa3335..88beebef773e 100644
>> --- a/mm/khugepaged.c
>> +++ b/mm/khugepaged.c
>> @@ -1111,58 +1111,17 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
>>  	return SCAN_SUCCEED;
>>  }
>>
>> -static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>> -			      int referenced, int unmapped, int order,
>> -			      struct collapse_control *cc)
>> +static int vma_collapse_anon_folio_pmd(struct mm_struct *mm, unsigned long address,
>> +		struct vm_area_struct *vma, struct collapse_control *cc, pmd_t *pmd,
>> +		struct folio *folio)
>>  {
>> +	struct mmu_notifier_range range;
>> +	spinlock_t *pmd_ptl, *pte_ptl;
>>  	LIST_HEAD(compound_pagelist);
>> -	pmd_t *pmd, _pmd;
>> -	pte_t *pte;
>>  	pgtable_t pgtable;
>> -	struct folio *folio;
>> -	spinlock_t *pmd_ptl, *pte_ptl;
>> -	int result = SCAN_FAIL;
>> -	struct vm_area_struct *vma;
>> -	struct mmu_notifier_range range;
>> -
>> -	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
>> -
>> -	/*
>> -	 * Before allocating the hugepage, release the mmap_lock read lock.
>> -	 * The allocation can take potentially a long time if it involves
>> -	 * sync compaction, and we do not need to hold the mmap_lock during
>> -	 * that. We will recheck the vma after taking it again in write mode.
>> -	 */
>> -	mmap_read_unlock(mm);
>> -
>> -	result = alloc_charge_folio(&folio, mm, order, cc);
>> -	if (result != SCAN_SUCCEED)
>> -		goto out_nolock;
>> -
>> -	mmap_read_lock(mm);
>> -	result = hugepage_vma_revalidate(mm, address, true, &vma, order, cc);
>> -	if (result != SCAN_SUCCEED) {
>> -		mmap_read_unlock(mm);
>> -		goto out_nolock;
>> -	}
>> -
>> -	result = find_pmd_or_thp_or_none(mm, address, &pmd);
>> -	if (result != SCAN_SUCCEED) {
>> -		mmap_read_unlock(mm);
>> -		goto out_nolock;
>> -	}
>> -
>> -	if (unmapped) {
>> -		/*
>> -		 * __collapse_huge_page_swapin will return with mmap_lock
>> -		 * released when it fails. So we jump out_nolock directly in
>> -		 * that case. Continuing to collapse causes inconsistency.
>> -		 */
>> -		result = __collapse_huge_page_swapin(mm, vma, address, pmd,
>> -						     referenced, order);
>> -		if (result != SCAN_SUCCEED)
>> -			goto out_nolock;
>> -	}
>> +	int result;
>> +	pmd_t _pmd;
>> +	pte_t *pte;
>>
>>  	mmap_read_unlock(mm);
>>  	/*
>> @@ -1174,7 +1133,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>>  	 * mmap_lock.
>>  	 */
>>  	mmap_write_lock(mm);
>> -	result = hugepage_vma_revalidate(mm, address, true, &vma, order, cc);
>> +
>> +	result = hugepage_vma_revalidate(mm, address, true, &vma, HPAGE_PMD_ORDER, cc);
>>  	if (result != SCAN_SUCCEED)
>>  		goto out_up_write;
>>  	/* check if the pmd is still valid */
>> @@ -1206,7 +1166,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>>  	pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
>>  	if (pte) {
>>  		result = __collapse_huge_page_isolate(vma, address, pte, cc,
>> -						      &compound_pagelist, order);
>> +						      &compound_pagelist, HPAGE_PMD_ORDER);
>>  		spin_unlock(pte_ptl);
>>  	} else {
>>  		result = SCAN_PMD_NULL;
>> @@ -1262,11 +1222,64 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>>  	deferred_split_folio(folio, false);
>>  	spin_unlock(pmd_ptl);
>>
>> -	folio = NULL;
>> -
>>  	result = SCAN_SUCCEED;
>>  out_up_write:
>>  	mmap_write_unlock(mm);
>> +	return result;
>> +}
>> +
>> +static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>> +			      int referenced, int unmapped, int order,
>> +			      struct collapse_control *cc)
>> +{
>> +	struct vm_area_struct *vma;
>> +	int result = SCAN_FAIL;
>> +	struct folio *folio;
>> +	pmd_t *pmd;
>> +
>> +	/*
>> +	 * Before allocating the hugepage, release the mmap_lock read lock.
>> +	 * The allocation can take potentially a long time if it involves
>> +	 * sync compaction, and we do not need to hold the mmap_lock during
>> +	 * that. We will recheck the vma after taking it again in write mode.
>> +	 */
>> +	mmap_read_unlock(mm);
>> +
>> +	result = alloc_charge_folio(&folio, mm, order, cc);
>> +	if (result != SCAN_SUCCEED)
>> +		goto out_nolock;
>> +
>> +	mmap_read_lock(mm);
>> +	result = hugepage_vma_revalidate(mm, address, true, &vma, order, cc);
>> +	if (result != SCAN_SUCCEED) {
>> +		mmap_read_unlock(mm);
>> +		goto out_nolock;
>> +	}
>> +
>> +	result = find_pmd_or_thp_or_none(mm, address, &pmd);
>> +	if (result != SCAN_SUCCEED) {
>> +		mmap_read_unlock(mm);
>> +		goto out_nolock;
>> +	}
>> +
>> +	if (unmapped) {
>> +		/*
>> +		 * __collapse_huge_page_swapin will return with mmap_lock
>> +		 * released when it fails. So we jump out_nolock directly in
>> +		 * that case. Continuing to collapse causes inconsistency.
>> +		 */
>> +		result = __collapse_huge_page_swapin(mm, vma, address, pmd,
>> +						     referenced, order);
>> +		if (result != SCAN_SUCCEED)
>> +			goto out_nolock;
>> +	}
>> +
>> +	if (order == HPAGE_PMD_ORDER)
>> +		result = vma_collapse_anon_folio_pmd(mm, address, vma, cc, pmd, folio);
> I think the locking is broken here? collapse_huge_page() used to enter with the
> mmap read lock and exit without the lock held at all. After the change, this is
> only true for order == HPAGE_PMD_ORDER. For other orders, you exit with the mmap
> read lock still held. Perhaps:
>
> else
> 	mmap_read_unlock(mm);

I have completely messed up sequential building of patches :) I will take care next time.

>
>> +
>> +	if (result == SCAN_SUCCEED)
>> +		folio = NULL;
>> +
>>  out_nolock:
>>  	if (folio)
>>  		folio_put(folio);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 078794aa3335..88beebef773e 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1111,58 +1111,17 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
 	return SCAN_SUCCEED;
 }
 
-static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
-			      int referenced, int unmapped, int order,
-			      struct collapse_control *cc)
+static int vma_collapse_anon_folio_pmd(struct mm_struct *mm, unsigned long address,
+		struct vm_area_struct *vma, struct collapse_control *cc, pmd_t *pmd,
+		struct folio *folio)
 {
+	struct mmu_notifier_range range;
+	spinlock_t *pmd_ptl, *pte_ptl;
 	LIST_HEAD(compound_pagelist);
-	pmd_t *pmd, _pmd;
-	pte_t *pte;
 	pgtable_t pgtable;
-	struct folio *folio;
-	spinlock_t *pmd_ptl, *pte_ptl;
-	int result = SCAN_FAIL;
-	struct vm_area_struct *vma;
-	struct mmu_notifier_range range;
-
-	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-
-	/*
-	 * Before allocating the hugepage, release the mmap_lock read lock.
-	 * The allocation can take potentially a long time if it involves
-	 * sync compaction, and we do not need to hold the mmap_lock during
-	 * that. We will recheck the vma after taking it again in write mode.
-	 */
-	mmap_read_unlock(mm);
-
-	result = alloc_charge_folio(&folio, mm, order, cc);
-	if (result != SCAN_SUCCEED)
-		goto out_nolock;
-
-	mmap_read_lock(mm);
-	result = hugepage_vma_revalidate(mm, address, true, &vma, order, cc);
-	if (result != SCAN_SUCCEED) {
-		mmap_read_unlock(mm);
-		goto out_nolock;
-	}
-
-	result = find_pmd_or_thp_or_none(mm, address, &pmd);
-	if (result != SCAN_SUCCEED) {
-		mmap_read_unlock(mm);
-		goto out_nolock;
-	}
-
-	if (unmapped) {
-		/*
-		 * __collapse_huge_page_swapin will return with mmap_lock
-		 * released when it fails. So we jump out_nolock directly in
-		 * that case. Continuing to collapse causes inconsistency.
-		 */
-		result = __collapse_huge_page_swapin(mm, vma, address, pmd,
-						     referenced, order);
-		if (result != SCAN_SUCCEED)
-			goto out_nolock;
-	}
+	int result;
+	pmd_t _pmd;
+	pte_t *pte;
 
 	mmap_read_unlock(mm);
 	/*
@@ -1174,7 +1133,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	 * mmap_lock.
 	 */
 	mmap_write_lock(mm);
-	result = hugepage_vma_revalidate(mm, address, true, &vma, order, cc);
+
+	result = hugepage_vma_revalidate(mm, address, true, &vma, HPAGE_PMD_ORDER, cc);
 	if (result != SCAN_SUCCEED)
 		goto out_up_write;
 	/* check if the pmd is still valid */
@@ -1206,7 +1166,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
 	if (pte) {
 		result = __collapse_huge_page_isolate(vma, address, pte, cc,
-						      &compound_pagelist, order);
+						      &compound_pagelist, HPAGE_PMD_ORDER);
 		spin_unlock(pte_ptl);
 	} else {
 		result = SCAN_PMD_NULL;
@@ -1262,11 +1222,64 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	deferred_split_folio(folio, false);
 	spin_unlock(pmd_ptl);
 
-	folio = NULL;
-
 	result = SCAN_SUCCEED;
 out_up_write:
 	mmap_write_unlock(mm);
+	return result;
+}
+
+static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
+			      int referenced, int unmapped, int order,
+			      struct collapse_control *cc)
+{
+	struct vm_area_struct *vma;
+	int result = SCAN_FAIL;
+	struct folio *folio;
+	pmd_t *pmd;
+
+	/*
+	 * Before allocating the hugepage, release the mmap_lock read lock.
+	 * The allocation can take potentially a long time if it involves
+	 * sync compaction, and we do not need to hold the mmap_lock during
+	 * that. We will recheck the vma after taking it again in write mode.
+	 */
+	mmap_read_unlock(mm);
+
+	result = alloc_charge_folio(&folio, mm, order, cc);
+	if (result != SCAN_SUCCEED)
+		goto out_nolock;
+
+	mmap_read_lock(mm);
+	result = hugepage_vma_revalidate(mm, address, true, &vma, order, cc);
+	if (result != SCAN_SUCCEED) {
+		mmap_read_unlock(mm);
+		goto out_nolock;
+	}
+
+	result = find_pmd_or_thp_or_none(mm, address, &pmd);
+	if (result != SCAN_SUCCEED) {
+		mmap_read_unlock(mm);
+		goto out_nolock;
+	}
+
+	if (unmapped) {
+		/*
+		 * __collapse_huge_page_swapin will return with mmap_lock
+		 * released when it fails. So we jump out_nolock directly in
+		 * that case. Continuing to collapse causes inconsistency.
+		 */
+		result = __collapse_huge_page_swapin(mm, vma, address, pmd,
+						     referenced, order);
+		if (result != SCAN_SUCCEED)
+			goto out_nolock;
+	}
+
+	if (order == HPAGE_PMD_ORDER)
+		result = vma_collapse_anon_folio_pmd(mm, address, vma, cc, pmd, folio);
+
+	if (result == SCAN_SUCCEED)
+		folio = NULL;
+
 out_nolock:
 	if (folio)
 		folio_put(folio);
Abstract away taking the mmap_lock exclusively, copying page contents, and
setting the PMD, into vma_collapse_anon_folio_pmd().

Signed-off-by: Dev Jain <dev.jain@arm.com>
---
 mm/khugepaged.c | 119 +++++++++++++++++++++++++++---------------------
 1 file changed, 66 insertions(+), 53 deletions(-)
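For orientation, a rough summary sketch of the division of labour this patch introduces and the locking transitions visible in the diff above; this is a reading aid, not code from the patch, and the handling of orders other than HPAGE_PMD_ORDER is outside this patch:

```c
/*
 * collapse_huge_page(mm, address, referenced, unmapped, order, cc)
 *   - entered with mmap_lock held for read;
 *   - drops the read lock, allocates and charges the folio, retakes the
 *     read lock, revalidates the VMA, locates the PMD, swaps in pages if
 *     needed;
 *   - for order == HPAGE_PMD_ORDER it hands over to the helper below.
 *
 * vma_collapse_anon_folio_pmd(mm, address, vma, cc, pmd, folio)
 *   - entered with mmap_lock held for read;
 *   - drops the read lock, takes mmap_lock for write, revalidates the VMA
 *     and PMD, isolates and copies the base pages, installs the PMD mapping;
 *   - returns with mmap_lock fully released (out_up_write:).
 */
```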