Message ID | 20220429133552.33768-14-zhengqi.arch@bytedance.com (mailing list archive) |
---|---|
State | New |
Series | Try to free user PTE page table pages |
On 2022/4/29 9:35 PM, Qi Zheng wrote:
> Normally, the percpu_ref of the user PTE page table page is in
> percpu mode. This patch adds try_to_free_user_pte() to switch
> the percpu_ref to atomic mode and check if it is 0. If the
> percpu_ref is 0, which means that no one is using the user PTE
> page table page, then we can safely reclaim it.
>
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> ---
>  include/linux/pte_ref.h |  7 +++
>  mm/pte_ref.c            | 99 ++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 104 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/pte_ref.h b/include/linux/pte_ref.h
> index bfe620038699..379c3b45a6ab 100644
> --- a/include/linux/pte_ref.h
> +++ b/include/linux/pte_ref.h
> @@ -16,6 +16,8 @@ void free_user_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr);
>  bool pte_tryget(struct mm_struct *mm, pmd_t *pmd, unsigned long addr);
>  void __pte_put(pgtable_t page);
>  void pte_put(pte_t *ptep);
> +void try_to_free_user_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr,
> +			  bool switch_back);
>
>  #else /* !CONFIG_FREE_USER_PTE */
>
> @@ -47,6 +49,11 @@ static inline void pte_put(pte_t *ptep)
>  {
>  }
>
> +static inline void try_to_free_user_pte(struct mm_struct *mm, pmd_t *pmd,
> +					unsigned long addr, bool switch_back)
> +{
> +}
> +
>  #endif /* CONFIG_FREE_USER_PTE */
>
>  #endif /* _LINUX_PTE_REF_H */
> diff --git a/mm/pte_ref.c b/mm/pte_ref.c
> index 5b382445561e..bf9629272c71 100644
> --- a/mm/pte_ref.c
> +++ b/mm/pte_ref.c
> @@ -8,6 +8,9 @@
>  #include <linux/pte_ref.h>
>  #include <linux/percpu-refcount.h>
>  #include <linux/slab.h>
> +#include <linux/hugetlb.h>
> +#include <asm/tlbflush.h>
> +#include <asm/pgalloc.h>
>
>  #ifdef CONFIG_FREE_USER_PTE
>
> @@ -44,8 +47,6 @@ void pte_ref_free(pgtable_t pte)
>  	kfree(ref);
>  }
>
> -void free_user_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) {}
> -
>  /*
>   * pte_tryget - try to get the pte_ref of the user PTE page table page
>   * @mm: pointer the target address space
> @@ -102,4 +103,98 @@ void pte_put(pte_t *ptep)
>  }
>  EXPORT_SYMBOL(pte_put);
>
> +#ifdef CONFIG_DEBUG_VM
> +void pte_free_debug(pmd_t pmd)
> +{
> +	pte_t *ptep = (pte_t *)pmd_page_vaddr(pmd);
> +	int i = 0;
> +
> +	for (i = 0; i < PTRS_PER_PTE; i++)
> +		BUG_ON(!pte_none(*ptep++));
> +}
> +#else
> +static inline void pte_free_debug(pmd_t pmd)
> +{
> +}
> +#endif
> +
> +static inline void pte_free_rcu(struct rcu_head *rcu)
> +{
> +	struct page *page = container_of(rcu, struct page, rcu_head);
> +
> +	pgtable_pte_page_dtor(page);
> +	__free_page(page);
> +}
> +
> +/*
> + * free_user_pte - free the user PTE page table page
> + * @mm: pointer the target address space
> + * @pmd: pointer to a PMD
> + * @addr: start address of the tlb range to be flushed
> + *
> + * Context: The pmd range has been unmapped and TLB purged. And the user PTE
> + * page table page will be freed by rcu handler.
> + */
> +void free_user_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
> +{
> +	struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
> +	spinlock_t *ptl;
> +	pmd_t pmdval;
> +
> +	ptl = pmd_lock(mm, pmd);
> +	pmdval = *pmd;
> +	if (pmd_none(pmdval) || pmd_leaf(pmdval)) {
> +		spin_unlock(ptl);
> +		return;
> +	}
> +	pmd_clear(pmd);
> +	flush_tlb_range(&vma, addr, addr + PMD_SIZE);
> +	spin_unlock(ptl);
> +
> +	pte_free_debug(pmdval);
> +	mm_dec_nr_ptes(mm);
> +	call_rcu(&pmd_pgtable(pmdval)->rcu_head, pte_free_rcu);
> +}
> +
> +/*
> + * try_to_free_user_pte - try to free the user PTE page table page
> + * @mm: pointer the target address space
> + * @pmd: pointer to a PMD
> + * @addr: virtual address associated with pmd
> + * @switch_back: indicates if switching back to percpu mode is required
> + */
> +void try_to_free_user_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr,
> +			  bool switch_back)
> +{
> +	pgtable_t pte;
> +
> +	if (&init_mm == mm)
> +		return;
> +
> +	if (!pte_tryget(mm, pmd, addr))
> +		return;
> +	pte = pmd_pgtable(*pmd);
> +	percpu_ref_switch_to_atomic_sync(pte->pte_ref);
> +	rcu_read_lock();
> +	/*
> +	 * Here we can safely put the pte_ref because we already hold the rcu
> +	 * lock, which guarantees that the user PTE page table page will not
> +	 * be released.
> +	 */
> +	__pte_put(pte);
> +	if (percpu_ref_is_zero(pte->pte_ref)) {
> +		rcu_read_unlock();
> +		free_user_pte(mm, pmd, addr & PMD_MASK);
> +		return;
> +	}
> +	rcu_read_unlock();
> +
> +	if (switch_back) {
> +		if (pte_tryget(mm, pmd, addr)) {
> +			percpu_ref_switch_to_percpu(pte->pte_ref);
> +			__pte_put(pte);
> +		}
> +	}

We shouldn't switch back to percpu mode here, as it will drastically
reduce performance.

> +}
> +
>  #endif /* CONFIG_FREE_USER_PTE */
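As background for the performance concern above: a percpu_ref is cheap to get and put while in percpu mode, but the mode switches themselves are heavyweight. Below is a minimal, self-contained sketch of the switching API in a hypothetical demo module; the demo_* names are invented for illustration and nothing here is from the patch.

#include <linux/module.h>
#include <linux/percpu-refcount.h>

static struct percpu_ref demo_ref;

/* Called when the last reference is dropped while in atomic mode. */
static void demo_ref_release(struct percpu_ref *ref)
{
	pr_info("demo_ref reached zero\n");
}

static int __init demo_init(void)
{
	/* Starts in percpu mode: get/put are cheap per-cpu increments. */
	int ret = percpu_ref_init(&demo_ref, demo_ref_release, 0, GFP_KERNEL);

	if (ret)
		return ret;

	/*
	 * Folding the per-cpu counts into one atomic counter blocks for
	 * an RCU grace period; only afterwards is percpu_ref_is_zero()
	 * an exact check.
	 */
	percpu_ref_switch_to_atomic_sync(&demo_ref);
	if (!percpu_ref_is_zero(&demo_ref))
		pr_info("demo_ref still has users\n");

	/*
	 * Re-enables the per-cpu fast path; the switch itself is another
	 * heavyweight operation, which is the cost the comment above
	 * objects to paying on every failed reclaim attempt.
	 */
	percpu_ref_switch_to_percpu(&demo_ref);
	return 0;
}

static void __exit demo_exit(void)
{
	percpu_ref_switch_to_atomic_sync(&demo_ref);
	percpu_ref_kill(&demo_ref);	/* drops the initial ref, release runs */
	percpu_ref_exit(&demo_ref);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

Since percpu_ref_switch_to_atomic_sync() has to wait out an RCU grace period before the per-cpu counts are folded together, a round trip to atomic mode and back on every reclaim attempt adds up quickly.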
diff --git a/include/linux/pte_ref.h b/include/linux/pte_ref.h
index bfe620038699..379c3b45a6ab 100644
--- a/include/linux/pte_ref.h
+++ b/include/linux/pte_ref.h
@@ -16,6 +16,8 @@ void free_user_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr);
 bool pte_tryget(struct mm_struct *mm, pmd_t *pmd, unsigned long addr);
 void __pte_put(pgtable_t page);
 void pte_put(pte_t *ptep);
+void try_to_free_user_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr,
+			  bool switch_back);

 #else /* !CONFIG_FREE_USER_PTE */

@@ -47,6 +49,11 @@ static inline void pte_put(pte_t *ptep)
 {
 }

+static inline void try_to_free_user_pte(struct mm_struct *mm, pmd_t *pmd,
+					unsigned long addr, bool switch_back)
+{
+}
+
 #endif /* CONFIG_FREE_USER_PTE */

 #endif /* _LINUX_PTE_REF_H */
diff --git a/mm/pte_ref.c b/mm/pte_ref.c
index 5b382445561e..bf9629272c71 100644
--- a/mm/pte_ref.c
+++ b/mm/pte_ref.c
@@ -8,6 +8,9 @@
 #include <linux/pte_ref.h>
 #include <linux/percpu-refcount.h>
 #include <linux/slab.h>
+#include <linux/hugetlb.h>
+#include <asm/tlbflush.h>
+#include <asm/pgalloc.h>

 #ifdef CONFIG_FREE_USER_PTE

@@ -44,8 +47,6 @@ void pte_ref_free(pgtable_t pte)
 	kfree(ref);
 }

-void free_user_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) {}
-
 /*
  * pte_tryget - try to get the pte_ref of the user PTE page table page
  * @mm: pointer the target address space
@@ -102,4 +103,98 @@ void pte_put(pte_t *ptep)
 }
 EXPORT_SYMBOL(pte_put);

+#ifdef CONFIG_DEBUG_VM
+void pte_free_debug(pmd_t pmd)
+{
+	pte_t *ptep = (pte_t *)pmd_page_vaddr(pmd);
+	int i = 0;
+
+	for (i = 0; i < PTRS_PER_PTE; i++)
+		BUG_ON(!pte_none(*ptep++));
+}
+#else
+static inline void pte_free_debug(pmd_t pmd)
+{
+}
+#endif
+
+static inline void pte_free_rcu(struct rcu_head *rcu)
+{
+	struct page *page = container_of(rcu, struct page, rcu_head);
+
+	pgtable_pte_page_dtor(page);
+	__free_page(page);
+}
+
+/*
+ * free_user_pte - free the user PTE page table page
+ * @mm: pointer the target address space
+ * @pmd: pointer to a PMD
+ * @addr: start address of the tlb range to be flushed
+ *
+ * Context: The pmd range has been unmapped and TLB purged. And the user PTE
+ * page table page will be freed by rcu handler.
+ */
+void free_user_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
+{
+	struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
+	spinlock_t *ptl;
+	pmd_t pmdval;
+
+	ptl = pmd_lock(mm, pmd);
+	pmdval = *pmd;
+	if (pmd_none(pmdval) || pmd_leaf(pmdval)) {
+		spin_unlock(ptl);
+		return;
+	}
+	pmd_clear(pmd);
+	flush_tlb_range(&vma, addr, addr + PMD_SIZE);
+	spin_unlock(ptl);
+
+	pte_free_debug(pmdval);
+	mm_dec_nr_ptes(mm);
+	call_rcu(&pmd_pgtable(pmdval)->rcu_head, pte_free_rcu);
+}
+
+/*
+ * try_to_free_user_pte - try to free the user PTE page table page
+ * @mm: pointer the target address space
+ * @pmd: pointer to a PMD
+ * @addr: virtual address associated with pmd
+ * @switch_back: indicates if switching back to percpu mode is required
+ */
+void try_to_free_user_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr,
+			  bool switch_back)
+{
+	pgtable_t pte;
+
+	if (&init_mm == mm)
+		return;
+
+	if (!pte_tryget(mm, pmd, addr))
+		return;
+	pte = pmd_pgtable(*pmd);
+	percpu_ref_switch_to_atomic_sync(pte->pte_ref);
+	rcu_read_lock();
+	/*
+	 * Here we can safely put the pte_ref because we already hold the rcu
+	 * lock, which guarantees that the user PTE page table page will not
+	 * be released.
+	 */
+	__pte_put(pte);
+	if (percpu_ref_is_zero(pte->pte_ref)) {
+		rcu_read_unlock();
+		free_user_pte(mm, pmd, addr & PMD_MASK);
+		return;
+	}
+	rcu_read_unlock();
+
+	if (switch_back) {
+		if (pte_tryget(mm, pmd, addr)) {
+			percpu_ref_switch_to_percpu(pte->pte_ref);
+			__pte_put(pte);
+		}
+	}
+}
+
 #endif /* CONFIG_FREE_USER_PTE */
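For illustration, here is a hypothetical caller of the new helper. It is not part of this patch: the function name demo_reclaim_pte_tables is invented, the sketch assumes the range lies within a single PUD entry so that the PMD entries are contiguous, and it passes switch_back = false in line with the review comment above.

#include <linux/mm.h>
#include <linux/pte_ref.h>

/* After [start, end) has been unmapped, probe each covered PMD entry
 * to see whether its PTE page table can be reclaimed. */
static void demo_reclaim_pte_tables(struct mm_struct *mm, pud_t *pud,
				    unsigned long start, unsigned long end)
{
	unsigned long addr = start & PMD_MASK;
	pmd_t *pmd = pmd_offset(pud, addr);

	for (; addr < end; addr += PMD_SIZE, pmd++)
		/*
		 * switch_back == false leaves the pte_ref in atomic mode
		 * instead of paying for yet another mode switch.
		 */
		try_to_free_user_pte(mm, pmd, addr, false);
}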
Normally, the percpu_ref of the user PTE page table page is in
percpu mode. This patch adds try_to_free_user_pte() to switch
the percpu_ref to atomic mode and check if it is 0. If the
percpu_ref is 0, which means that no one is using the user PTE
page table page, then we can safely reclaim it.

Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
 include/linux/pte_ref.h |  7 +++
 mm/pte_ref.c            | 99 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 104 insertions(+), 2 deletions(-)
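The "safely reclaim" step relies on a standard RCU deferred-free pattern: free_user_pte() above first clears the PMD entry and flushes the TLB under the PMD lock, and only then hands the page to RCU. A minimal sketch of just that deferral step follows; the demo_* names are invented, and the patch's real callback additionally runs pgtable_pte_page_dtor().

#include <linux/mm.h>
#include <linux/rcupdate.h>

/*
 * Runs after a grace period, once no lockless walker that started
 * before the PMD entry was cleared can still reference the page.
 */
static void demo_pte_free_rcu(struct rcu_head *rcu)
{
	struct page *page = container_of(rcu, struct page, rcu_head);

	__free_page(page);
}

static void demo_defer_free(struct page *page)
{
	/* The caller must already have unhooked the page (pmd_clear +
	 * TLB flush); call_rcu() only delays the actual free. */
	call_rcu(&page->rcu_head, demo_pte_free_rcu);
}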