
[uprobe,thp,3/4] uprobe: support huge page by only splitting the pmd

Message ID: 20190529212049.2413886-4-songliubraving@fb.com (mailing list archive)
State: New, archived
Series: THP aware uprobe

Commit Message

Song Liu May 29, 2019, 9:20 p.m. UTC
Instead of splitting the compound page with FOLL_SPLIT, this patch allows
uprobe to split only the PMD for huge pages.

A helper function, mm_address_trans_huge(mm, address), is introduced to
test whether the address in mm points to a THP.
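
The resulting flow in uprobe_write_opcode() is, roughly, the following
(a condensed sketch of the patch at the bottom of this page, with the
retry label and error handling abbreviated):

	/* Sketch of the new uprobe_write_opcode() flow. */
	ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE,
				    &old_page, &vma, NULL);
	if (ret <= 0)
		return ret;

	if (mm_address_trans_huge(mm, vaddr)) {
		pgtable_t prealloc_pgtable = pte_alloc_one(mm);

		if (prealloc_pgtable) {
			/* Split only the PMD; the compound page is kept. */
			split_huge_pmd_address(vma, vaddr, false, NULL,
					       prealloc_pgtable);
		} else {
			/* No page table available: fall back to splitting
			 * the whole compound page with FOLL_SPLIT. */
			put_page(old_page);
			ret = get_user_pages_remote(NULL, mm, vaddr, 1,
						    FOLL_FORCE | FOLL_SPLIT,
						    &old_page, &vma, NULL);
			if (ret <= 0)
				return ret;
		}
	}
	/* verify_opcode() and __replace_page() proceed as before. */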

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 include/linux/huge_mm.h |  8 ++++++++
 kernel/events/uprobes.c | 38 ++++++++++++++++++++++++++++++++------
 mm/huge_memory.c        | 24 ++++++++++++++++++++++++
 3 files changed, 64 insertions(+), 6 deletions(-)

Comments

William Kucharski May 30, 2019, 11:08 a.m. UTC | #1
Is there any reason to worry about supporting PUD-sized uprobe pages if
CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD is defined? I would prefer
not to bake in the assumption that "huge" means PMD-sized any more than
it already is.

For example, if CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD is configured,
mm_address_trans_huge() should either make the call to pud_trans_huge()
where appropriate, or a VM_BUG_ON_PAGE should be added in case the routine
is ever called on a PUD-sized mapping.

Otherwise it looks pretty reasonable to me.

    -- Bill
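
For illustration, a PUD-aware variant of the helper could look like the
following. This is only a sketch of the suggestion above, not code from
the series; it leans on the fact that pud_trans_huge() is a stub
returning 0 when CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD is not set:

	/* Sketch: mm_address_trans_huge() with the PUD case covered. */
	bool mm_address_trans_huge(struct mm_struct *mm, unsigned long address)
	{
		pgd_t *pgd = pgd_offset(mm, address);
		p4d_t *p4d;
		pud_t *pud;
		pmd_t *pmd;

		if (!pgd_present(*pgd))
			return false;

		p4d = p4d_offset(pgd, address);
		if (!p4d_present(*p4d))
			return false;

		pud = pud_offset(p4d, address);
		/* Either report a PUD-sized THP explicitly ... */
		if (pud_trans_huge(*pud))
			return true;
		/* ... or, if uprobes must never see one, assert instead:
		 * VM_BUG_ON(pud_trans_huge(*pud));
		 */
		if (!pud_present(*pud))
			return false;

		pmd = pmd_offset(pud, address);
		return pmd_trans_huge(*pmd);
	}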
Kirill A. Shutemov May 30, 2019, 12:14 p.m. UTC | #2
On Wed, May 29, 2019 at 02:20:48PM -0700, Song Liu wrote:
> Instead of splitting the compound page with FOLL_SPLIT, this patch allows
> uprobe to split only the PMD for huge pages.
> 
> A helper function, mm_address_trans_huge(mm, address), is introduced to
> test whether the address in mm points to a THP.

Maybe it would be cleaner to have a FOLL_SPLIT_PMD flag which would strip
the trans_huge PMD, if any, and then set the pte using get_locked_pte()?

This way you won't need any changes in the split_huge_pmd() path. Clearing
the PMD will be fine.
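
One way such a flag could be wired into the gup path, sketched only
(FOLL_SPLIT_PMD does not exist at this point, so the flag name and its
placement in follow_pmd_mask() are assumptions):

	/* Hypothetical FOLL_SPLIT_PMD handling in follow_pmd_mask(). */
	if (flags & FOLL_SPLIT_PMD) {
		spin_unlock(ptl);
		/* Clear the trans_huge PMD without splitting the
		 * compound page itself, unlike FOLL_SPLIT. */
		split_huge_pmd(vma, pmd, address);
		/* Repopulate a page table behind the now-empty PMD. */
		if (pte_alloc(mm, pmd))
			return ERR_PTR(-ENOMEM);
		/* ... then take the normal follow_page_pte() path ... */
	}

The caller (uprobes) would then pass FOLL_FORCE | FOLL_SPLIT_PMD
unconditionally and could drop both mm_address_trans_huge() and the
split_huge_pmd_address() changes.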
Song Liu May 30, 2019, 5:24 p.m. UTC | #3
> On May 30, 2019, at 4:08 AM, William Kucharski <william.kucharski@oracle.com> wrote:
> 
> 
> Is there any reason to worry about supporting PUD-sized uprobe pages if
> CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD is defined? I would prefer
> not to bake in the assumption that "huge" means PMD-sized any more than
> it already is.
> 
> For example, if CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD is configured,
> mm_address_trans_huge() should either make the call to pud_trans_huge()
> where appropriate, or a VM_BUG_ON_PAGE should be added in case the routine
> is ever called on a PUD-sized mapping.
> 
> Otherwise it looks pretty reasonable to me.
> 
>    -- Bill
> 

I will try that in v2. 

Thanks,
Song
Song Liu May 30, 2019, 5:37 p.m. UTC | #4
> On May 30, 2019, at 5:14 AM, Kirill A. Shutemov <kirill@shutemov.name> wrote:
> 
> On Wed, May 29, 2019 at 02:20:48PM -0700, Song Liu wrote:
>> Instead of splitting the compound page with FOLL_SPLIT, this patch allows
>> uprobe to split only the PMD for huge pages.
>> 
>> A helper function, mm_address_trans_huge(mm, address), is introduced to
>> test whether the address in mm points to a THP.
> 
> Maybe it would be cleaner to have a FOLL_SPLIT_PMD flag which would strip
> the trans_huge PMD, if any, and then set the pte using get_locked_pte()?

FOLL_SPLIT_PMD sounds like a great idea! Let me try it. 

Thanks,
Song

> 
> This way you won't need any changes in the split_huge_pmd() path. Clearing
> the PMD will be fine.
> 
> -- 
> Kirill A. Shutemov

Patch

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 2d8a40fd06e4..4832d6580969 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -163,6 +163,8 @@  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
 		bool freeze, struct page *page, pgtable_t prealloc_pgtable);
 
+bool mm_address_trans_huge(struct mm_struct *mm, unsigned long address);
+
 void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
 		unsigned long address);
 
@@ -302,6 +304,12 @@  static inline void split_huge_pmd_address(struct vm_area_struct *vma,
 		unsigned long address, bool freeze, struct page *page,
 		pgtable_t prealloc_pgtable) {}
 
+static inline bool mm_address_trans_huge(struct mm_struct *mm,
+					 unsigned long address)
+{
+	return false;
+}
+
 #define split_huge_pud(__vma, __pmd, __address)	\
 	do { } while (0)
 
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index ba49da99d2a2..56eeccc2f7a2 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -26,6 +26,7 @@ 
 #include <linux/percpu-rwsem.h>
 #include <linux/task_work.h>
 #include <linux/shmem_fs.h>
+#include <asm/pgalloc.h>
 
 #include <linux/uprobes.h>
 
@@ -153,7 +154,7 @@  static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct page_vma_mapped_walk pvmw = {
-		.page = old_page,
+		.page = compound_head(old_page),
 		.vma = vma,
 		.address = addr,
 	};
@@ -165,8 +166,6 @@  static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
 				addr + PAGE_SIZE);
 
-	VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page);
-
 	if (!orig) {
 		err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL,
 					    &memcg, false);
@@ -188,7 +187,8 @@  static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 
 	get_page(new_page);
 	if (orig) {
-		page_add_file_rmap(new_page, false);
+		page_add_file_rmap(compound_head(new_page),
+				   PageTransHuge(compound_head(new_page)));
 		inc_mm_counter(mm, mm_counter_file(new_page));
 		dec_mm_counter(mm, MM_ANONPAGES);
 	} else {
@@ -207,7 +207,8 @@  static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 	set_pte_at_notify(mm, addr, pvmw.pte,
 			mk_pte(new_page, vma->vm_page_prot));
 
-	page_remove_rmap(old_page, false);
+	page_remove_rmap(compound_head(old_page),
+			 PageTransHuge(compound_head(old_page)));
 	if (!page_mapped(old_page))
 		try_to_free_swap(old_page);
 	page_vma_mapped_walk_done(&pvmw);
@@ -475,17 +476,42 @@  int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
 	struct vm_area_struct *vma;
 	int ret, is_register, ref_ctr_updated = 0;
 	pgoff_t index;
+	pgtable_t prealloc_pgtable = NULL;
+	unsigned long foll_flags = FOLL_FORCE;
 
 	is_register = is_swbp_insn(&opcode);
 	uprobe = container_of(auprobe, struct uprobe, arch);
 
+	/* do not FOLL_SPLIT yet */
+	ret = get_user_pages_remote(NULL, mm, vaddr, 1,
+			foll_flags, &old_page, &vma, NULL);
+
+	if (ret <= 0)
+		return ret;
+
+	if (mm_address_trans_huge(mm, vaddr)) {
+		prealloc_pgtable = pte_alloc_one(mm);
+		if (likely(prealloc_pgtable)) {
+			split_huge_pmd_address(vma, vaddr, false, NULL,
+					       prealloc_pgtable);
+			goto verify;
+		} else {
+			/* fallback to FOLL_SPLIT */
+			foll_flags |= FOLL_SPLIT;
+			put_page(old_page);
+		}
+	} else {
+		goto verify;
+	}
+
 retry:
 	/* Read the page with vaddr into memory */
 	ret = get_user_pages_remote(NULL, mm, vaddr, 1,
-			FOLL_FORCE | FOLL_SPLIT, &old_page, &vma, NULL);
+			foll_flags, &old_page, &vma, NULL);
 	if (ret <= 0)
 		return ret;
 
+verify:
 	ret = verify_opcode(old_page, vaddr, &opcode);
 	if (ret <= 0)
 		goto put_old;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index dcb0e30213af..4714871353c0 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2360,6 +2360,30 @@  void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
 	____split_huge_pmd(vma, pmd, address, freeze, page, prealloc_pgtable);
 }
 
+bool mm_address_trans_huge(struct mm_struct *mm, unsigned long address)
+{
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+
+	pgd = pgd_offset(mm, address);
+	if (!pgd_present(*pgd))
+		return false;
+
+	p4d = p4d_offset(pgd, address);
+	if (!p4d_present(*p4d))
+		return false;
+
+	pud = pud_offset(p4d, address);
+	if (!pud_present(*pud))
+		return false;
+
+	pmd = pmd_offset(pud, address);
+
+	return pmd_trans_huge(*pmd);
+}
+
 void vma_adjust_trans_huge(struct vm_area_struct *vma,
 			     unsigned long start,
 			     unsigned long end,