[RFC,13/18] mm: add try_to_free_user_pte() helper

Message ID: 20220429133552.33768-14-zhengqi.arch@bytedance.com
State: New
Series: Try to free user PTE page table pages

Commit Message

Qi Zheng April 29, 2022, 1:35 p.m. UTC
Normally, the percpu_ref of a user PTE page table page is in
percpu mode. This patch adds try_to_free_user_pte(), which
switches the percpu_ref to atomic mode and checks whether the
count has dropped to zero. If it has, no one is using the user
PTE page table page, so we can safely reclaim it.

Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
 include/linux/pte_ref.h |  7 +++
 mm/pte_ref.c            | 99 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 104 insertions(+), 2 deletions(-)
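
For readers new to percpu_ref, below is a minimal, self-contained sketch
of the switch-and-check pattern the patch relies on. demo_release() and
demo() are hypothetical names used only for illustration; the
percpu_ref_*() calls are the real <linux/percpu-refcount.h> API.

#include <linux/percpu-refcount.h>

/* Hypothetical release callback: invoked once the count reaches zero
 * while the ref is in atomic mode. */
static void demo_release(struct percpu_ref *ref)
{
	/* reclaim the object that embeds @ref */
}

static int demo(void)
{
	struct percpu_ref ref;
	int err;

	/* Starts with a count of 1, in percpu mode: get/put only touch
	 * per-CPU counters, so the exact count cannot be observed. */
	err = percpu_ref_init(&ref, demo_release, 0, GFP_KERNEL);
	if (err)
		return err;

	/* Collapse the per-CPU counters into a single atomic count;
	 * blocks (including an RCU grace period) until the switch is
	 * complete, after which the count is exact. */
	percpu_ref_switch_to_atomic_sync(&ref);

	if (percpu_ref_is_zero(&ref)) {
		/* No users left; safe to reclaim. (Not reachable in this
		 * toy: the initial reference from percpu_ref_init() is
		 * still held.) */
	} else {
		/* Still in use: optionally resume the percpu fast path. */
		percpu_ref_switch_to_percpu(&ref);
	}

	percpu_ref_exit(&ref);
	return 0;
}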

Comments

Qi Zheng April 30, 2022, 1:35 p.m. UTC | #1
On 2022/4/29 9:35 PM, Qi Zheng wrote:
[...]
> +	if (switch_back) {
> +		if (pte_tryget(mm, pmd, addr)) {
> +			percpu_ref_switch_to_percpu(pte->pte_ref);
> +			__pte_put(pte);
> +		}
> +	}

We shouldn't switch back to percpu mode here; it will drastically reduce
performance.


Patch

diff --git a/include/linux/pte_ref.h b/include/linux/pte_ref.h
index bfe620038699..379c3b45a6ab 100644
--- a/include/linux/pte_ref.h
+++ b/include/linux/pte_ref.h
@@ -16,6 +16,8 @@ void free_user_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr);
 bool pte_tryget(struct mm_struct *mm, pmd_t *pmd, unsigned long addr);
 void __pte_put(pgtable_t page);
 void pte_put(pte_t *ptep);
+void try_to_free_user_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr,
+			  bool switch_back);
 
 #else /* !CONFIG_FREE_USER_PTE */
 
@@ -47,6 +49,11 @@ static inline void pte_put(pte_t *ptep)
 {
 }
 
+static inline void try_to_free_user_pte(struct mm_struct *mm, pmd_t *pmd,
+					unsigned long addr, bool switch_back)
+{
+}
+
 #endif /* CONFIG_FREE_USER_PTE */
 
 #endif /* _LINUX_PTE_REF_H */
diff --git a/mm/pte_ref.c b/mm/pte_ref.c
index 5b382445561e..bf9629272c71 100644
--- a/mm/pte_ref.c
+++ b/mm/pte_ref.c
@@ -8,6 +8,9 @@ 
 #include <linux/pte_ref.h>
 #include <linux/percpu-refcount.h>
 #include <linux/slab.h>
+#include <linux/hugetlb.h>
+#include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
 
 #ifdef CONFIG_FREE_USER_PTE
 
@@ -44,8 +47,6 @@ void pte_ref_free(pgtable_t pte)
 	kfree(ref);
 }
 
-void free_user_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) {}
-
 /*
  * pte_tryget - try to get the pte_ref of the user PTE page table page
  * @mm: pointer to the target address space
@@ -102,4 +103,98 @@ void pte_put(pte_t *ptep)
 }
 EXPORT_SYMBOL(pte_put);
 
+#ifdef CONFIG_DEBUG_VM
+static void pte_free_debug(pmd_t pmd)
+{
+	pte_t *ptep = (pte_t *)pmd_page_vaddr(pmd);
+	int i;
+
+	for (i = 0; i < PTRS_PER_PTE; i++)
+		BUG_ON(!pte_none(*ptep++));
+}
+#else
+static inline void pte_free_debug(pmd_t pmd)
+{
+}
+#endif
+
+static inline void pte_free_rcu(struct rcu_head *rcu)
+{
+	struct page *page = container_of(rcu, struct page, rcu_head);
+
+	pgtable_pte_page_dtor(page);
+	__free_page(page);
+}
+
+/*
+ * free_user_pte - free the user PTE page table page
+ * @mm: pointer to the target address space
+ * @pmd: pointer to a PMD
+ * @addr: start address of the TLB range to be flushed
+ *
+ * Context: The pmd range has been unmapped and the TLB purged. The user
+ *	    PTE page table page will be freed by an RCU handler.
+ */
+void free_user_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
+{
+	struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
+	spinlock_t *ptl;
+	pmd_t pmdval;
+
+	ptl = pmd_lock(mm, pmd);
+	pmdval = *pmd;
+	if (pmd_none(pmdval) || pmd_leaf(pmdval)) {
+		spin_unlock(ptl);
+		return;
+	}
+	pmd_clear(pmd);
+	flush_tlb_range(&vma, addr, addr + PMD_SIZE);
+	spin_unlock(ptl);
+
+	pte_free_debug(pmdval);
+	mm_dec_nr_ptes(mm);
+	call_rcu(&pmd_pgtable(pmdval)->rcu_head, pte_free_rcu);
+}
+
+/*
+ * try_to_free_user_pte - try to free the user PTE page table page
+ * @mm: pointer to the target address space
+ * @pmd: pointer to a PMD
+ * @addr: virtual address associated with pmd
+ * @switch_back: indicates if switching back to percpu mode is required
+ */
+void try_to_free_user_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr,
+			  bool switch_back)
+{
+	pgtable_t pte;
+
+	if (&init_mm == mm)
+		return;
+
+	if (!pte_tryget(mm, pmd, addr))
+		return;
+	pte = pmd_pgtable(*pmd);
+	percpu_ref_switch_to_atomic_sync(pte->pte_ref);
+	rcu_read_lock();
+	/*
+	 * Here we can safely put the pte_ref because we already hold the RCU
+	 * read lock, which guarantees that the user PTE page table page will
+	 * not be freed.
+	 */
+	__pte_put(pte);
+	if (percpu_ref_is_zero(pte->pte_ref)) {
+		rcu_read_unlock();
+		free_user_pte(mm, pmd, addr & PMD_MASK);
+		return;
+	}
+	rcu_read_unlock();
+
+	if (switch_back) {
+		if (pte_tryget(mm, pmd, addr)) {
+			percpu_ref_switch_to_percpu(pte->pte_ref);
+			__pte_put(pte);
+		}
+	}
+}
+
 #endif /* CONFIG_FREE_USER_PTE */
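
The deferred free in pte_free_rcu() is an instance of the standard RCU
pattern sketched below. demo_obj and the demo_obj_*() functions are
hypothetical names invented for illustration; the patch itself applies
the same pattern using the rcu_head already embedded in struct page.

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_obj {
	struct rcu_head rcu;
	unsigned long payload;
};

static void demo_obj_free_rcu(struct rcu_head *rcu)
{
	struct demo_obj *obj = container_of(rcu, struct demo_obj, rcu);

	kfree(obj);
}

static void demo_obj_free(struct demo_obj *obj)
{
	/*
	 * Readers that called rcu_read_lock() before this point may
	 * still dereference @obj; the actual kfree() only happens
	 * after they have all dropped the RCU read lock.
	 */
	call_rcu(&obj->rcu, demo_obj_free_rcu);
}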
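
Finally, a hypothetical call site, to show how the new helper is meant
to be used. reclaim_pte_range() is an invented name and is not part of
this patch; passing switch_back=false follows the author's follow-up
comment above.

/*
 * Hypothetical caller, for illustration only: after a PMD-sized range
 * has been zapped, give the now-possibly-empty PTE page table page a
 * chance to be freed.
 */
static void reclaim_pte_range(struct mm_struct *mm, pmd_t *pmd,
			      unsigned long addr)
{
	/*
	 * switch_back == false: leave the pte_ref in atomic mode if the
	 * page turns out to still be in use, so the next attempt does
	 * not pay for another percpu_ref_switch_to_atomic_sync().
	 */
	try_to_free_user_pte(mm, pmd, addr, false);
}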