diff mbox series

mm: huge_memory: a new debugfs interface for splitting THP tests.

Message ID 20210308152221.28555-1-zi.yan@sent.com (mailing list archive)
State New, archived
Headers show
Series mm: huge_memory: a new debugfs interface for splitting THP tests. | expand

Commit Message

Zi Yan March 8, 2021, 3:22 p.m. UTC
From: Zi Yan <ziy@nvidia.com>

By writing "<pid>,<vaddr_start>,<vaddr_end>" to
<debugfs>/split_huge_pages_in_range_pid, THPs in the process with the
given pid and virtual address range are split. It is used to test
split_huge_page function. In addition, a selftest program is added to
tools/testing/selftests/vm to utilize the interface by splitting
PMD THPs and PTE-mapped THPs.

Signed-off-by: Zi Yan <ziy@nvidia.com>
---
 mm/huge_memory.c                              |  98 ++++++
 mm/internal.h                                 |   1 +
 mm/migrate.c                                  |   2 +-
 tools/testing/selftests/vm/.gitignore         |   1 +
 tools/testing/selftests/vm/Makefile           |   1 +
 .../selftests/vm/split_huge_page_test.c       | 318 ++++++++++++++++++
 6 files changed, 420 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/vm/split_huge_page_test.c

Comments

David Hildenbrand March 8, 2021, 4:17 p.m. UTC | #1
On 08.03.21 16:22, Zi Yan wrote:
> From: Zi Yan <ziy@nvidia.com>
> 
> By writing "<pid>,<vaddr_start>,<vaddr_end>" to
> <debugfs>/split_huge_pages_in_range_pid, THPs in the process with the
> given pid and virtual address range are split. It is used to test
> split_huge_page function. In addition, a selftest program is added to
> tools/testing/selftests/vm to utilize the interface by splitting
> PMD THPs and PTE-mapped THPs.

Won't something like

1. MADV_HUGEPAGE

2. Access memory

3. MADV_NOHUGEPAGE

Have a similar effect? What's the benefit of this?

> 
> Signed-off-by: Zi Yan <ziy@nvidia.com>
> ---
>   mm/huge_memory.c                              |  98 ++++++
>   mm/internal.h                                 |   1 +
>   mm/migrate.c                                  |   2 +-
>   tools/testing/selftests/vm/.gitignore         |   1 +
>   tools/testing/selftests/vm/Makefile           |   1 +
>   .../selftests/vm/split_huge_page_test.c       | 318 ++++++++++++++++++
>   6 files changed, 420 insertions(+), 1 deletion(-)
>   create mode 100644 tools/testing/selftests/vm/split_huge_page_test.c
> 
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 395c75111d33..818172f887bf 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -7,6 +7,7 @@
>   
>   #include <linux/mm.h>
>   #include <linux/sched.h>
> +#include <linux/sched/mm.h>
>   #include <linux/sched/coredump.h>
>   #include <linux/sched/numa_balancing.h>
>   #include <linux/highmem.h>
> @@ -2971,10 +2972,107 @@ static int split_huge_pages_set(void *data, u64 val)
>   DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
>   		"%llu\n");
>   
> +static ssize_t split_huge_pages_in_range_pid_write(struct file *file,
> +		const char __user *buf, size_t count, loff_t *ppops)
> +{
> +	static DEFINE_MUTEX(mutex);
> +	ssize_t ret;
> +	char input_buf[80]; /* hold pid, start_vaddr, end_vaddr */
> +	int pid;
> +	unsigned long vaddr_start, vaddr_end, addr;
> +	nodemask_t task_nodes;
> +	struct mm_struct *mm;
> +	unsigned long total = 0, split = 0;
> +
> +	ret = mutex_lock_interruptible(&mutex);
> +	if (ret)
> +		return ret;
> +
> +	ret = -EFAULT;
> +
> +	memset(input_buf, 0, 80);
> +	if (copy_from_user(input_buf, buf, min_t(size_t, count, 80)))
> +		goto out;
> +
> +	input_buf[79] = '\0';
> +	ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end);
> +	if (ret != 3) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +	vaddr_start &= PAGE_MASK;
> +	vaddr_end &= PAGE_MASK;
> +
> +	ret = strlen(input_buf);
> +	pr_debug("split huge pages in pid: %d, vaddr: [%lx - %lx]\n",
> +		 pid, vaddr_start, vaddr_end);
> +
> +	mm = find_mm_struct(pid, &task_nodes);
> +	if (IS_ERR(mm)) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	mmap_read_lock(mm);
> +	/*
> +	 * always increase addr by PAGE_SIZE, since we could have a PTE page
> +	 * table filled with PTE-mapped THPs, each of which is distinct.
> +	 */
> +	for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
> +		struct vm_area_struct *vma = find_vma(mm, addr);
> +		unsigned int follflags;
> +		struct page *page;
> +
> +		if (!vma || addr < vma->vm_start || !vma_migratable(vma))
> +			break;
> +
> +		/* FOLL_DUMP to ignore special (like zero) pages */
> +		follflags = FOLL_GET | FOLL_DUMP;
> +		page = follow_page(vma, addr, follflags);
> +
> +		if (IS_ERR(page))
> +			break;
> +		if (!page)
> +			break;
> +
> +		if (!is_transparent_hugepage(page))
> +			continue;
> +
> +		total++;
> +		if (!can_split_huge_page(compound_head(page), NULL))
> +			continue;
> +
> +		if (!trylock_page(page))
> +			continue;
> +
> +		if (!split_huge_page(page))
> +			split++;
> +
> +		unlock_page(page);
> +		put_page(page);
> +	}
> +	mmap_read_unlock(mm);
> +	mmput(mm);
> +
> +	pr_debug("%lu of %lu THP split\n", split, total);
> +out:
> +	mutex_unlock(&mutex);
> +	return ret;
> +
> +}
> +
> +static const struct file_operations split_huge_pages_in_range_pid_fops = {
> +	.owner	 = THIS_MODULE,
> +	.write	 = split_huge_pages_in_range_pid_write,
> +	.llseek  = no_llseek,
> +};
> +
>   static int __init split_huge_pages_debugfs(void)
>   {
>   	debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
>   			    &split_huge_pages_fops);
> +	debugfs_create_file("split_huge_pages_in_range_pid", 0200, NULL, NULL,
> +			    &split_huge_pages_in_range_pid_fops);
>   	return 0;
>   }
>   late_initcall(split_huge_pages_debugfs);
> diff --git a/mm/internal.h b/mm/internal.h
> index 9902648f2206..1659d00100ef 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -623,4 +623,5 @@ struct migration_target_control {
>   	gfp_t gfp_mask;
>   };
>   
> +struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes);
>   #endif	/* __MM_INTERNAL_H */
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 62b81d5257aa..ce5f213debb2 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -1913,7 +1913,7 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
>   	return nr_pages ? -EFAULT : 0;
>   }
>   
> -static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
> +struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
>   {
>   	struct task_struct *task;
>   	struct mm_struct *mm;
> diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
> index 9a35c3f6a557..1f651e85ed60 100644
> --- a/tools/testing/selftests/vm/.gitignore
> +++ b/tools/testing/selftests/vm/.gitignore
> @@ -22,3 +22,4 @@ map_fixed_noreplace
>   write_to_hugetlbfs
>   hmm-tests
>   local_config.*
> +split_huge_page_test
> diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
> index d42115e4284d..4cbc91d6869f 100644
> --- a/tools/testing/selftests/vm/Makefile
> +++ b/tools/testing/selftests/vm/Makefile
> @@ -42,6 +42,7 @@ TEST_GEN_FILES += on-fault-limit
>   TEST_GEN_FILES += thuge-gen
>   TEST_GEN_FILES += transhuge-stress
>   TEST_GEN_FILES += userfaultfd
> +TEST_GEN_FILES += split_huge_page_test
>   
>   ifeq ($(MACHINE),x86_64)
>   CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_32bit_program.c -m32)
> diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c
> new file mode 100644
> index 000000000000..8ea8000fda62
> --- /dev/null
> +++ b/tools/testing/selftests/vm/split_huge_page_test.c
> @@ -0,0 +1,318 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * A test of splitting PMD THPs and PTE-mapped THPs from a specified virtual
> + * address range in a process via <debugfs>/split_huge_pages_in_range_pid
> + * interface.
> + */
> +
> +#define _GNU_SOURCE
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include "numa.h"
> +#include <unistd.h>
> +#include <errno.h>
> +#include <inttypes.h>
> +#include <string.h>
> +#include <sys/types.h>
> +#include <sys/stat.h>
> +#include <fcntl.h>
> +#include <sys/mman.h>
> +#include <sys/time.h>
> +#include <sys/wait.h>
> +#include <malloc.h>
> +#include <stdbool.h>
> +
> +uint64_t pagesize;
> +unsigned int pageshift;
> +uint64_t pmd_pagesize;
> +
> +#define PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
> +#define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages_in_range_pid"
> +#define SMAP_PATH "/proc/self/smaps"
> +#define INPUT_MAX 80
> +
> +#define PFN_MASK     ((1UL<<55)-1)
> +#define KPF_THP      (1UL<<22)
> +
> +int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file)
> +{
> +	uint64_t paddr;
> +	uint64_t page_flags;
> +
> +	if (pagemap_file) {
> +		pread(pagemap_file, &paddr, sizeof(paddr),
> +			((long)vaddr >> pageshift) * sizeof(paddr));
> +
> +		if (kpageflags_file) {
> +			pread(kpageflags_file, &page_flags, sizeof(page_flags),
> +				(paddr & PFN_MASK) * sizeof(page_flags));
> +
> +			return !!(page_flags & KPF_THP);
> +		}
> +	}
> +	return 0;
> +}
> +
> +
> +static uint64_t read_pmd_pagesize(void)
> +{
> +	int fd;
> +	char buf[20];
> +	ssize_t num_read;
> +
> +	fd = open(PMD_SIZE_PATH, O_RDONLY);
> +	if (fd == -1) {
> +		perror("Open hpage_pmd_size failed");
> +		exit(EXIT_FAILURE);
> +	}
> +	num_read = read(fd, buf, 19);
> +	if (num_read < 1) {
> +		close(fd);
> +		perror("Read hpage_pmd_size failed");
> +		exit(EXIT_FAILURE);
> +	}
> +	buf[num_read] = '\0';
> +	close(fd);
> +
> +	return strtoul(buf, NULL, 10);
> +}
> +
> +static int write_file(const char *path, const char *buf, size_t buflen)
> +{
> +	int fd;
> +	ssize_t numwritten;
> +
> +	fd = open(path, O_WRONLY);
> +	if (fd == -1)
> +		return 0;
> +
> +	numwritten = write(fd, buf, buflen - 1);
> +	close(fd);
> +	if (numwritten < 1)
> +		return 0;
> +
> +	return (unsigned int) numwritten;
> +}
> +
> +static void write_debugfs(int pid, uint64_t vaddr_start, uint64_t vaddr_end)
> +{
> +	char input[INPUT_MAX];
> +	int ret;
> +
> +	ret = snprintf(input, INPUT_MAX, "%d,0x%lx,0x%lx", pid, vaddr_start,
> +			vaddr_end);
> +	if (ret >= INPUT_MAX) {
> +		printf("%s: Debugfs input is too long\n", __func__);
> +		exit(EXIT_FAILURE);
> +	}
> +
> +	if (!write_file(SPLIT_DEBUGFS, input, ret + 1)) {
> +		perror(SPLIT_DEBUGFS);
> +		exit(EXIT_FAILURE);
> +	}
> +}
> +
> +#define MAX_LINE_LENGTH 500
> +
> +static bool check_for_pattern(FILE *fp, const char *pattern, char *buf)
> +{
> +	while (fgets(buf, MAX_LINE_LENGTH, fp) != NULL) {
> +		if (!strncmp(buf, pattern, strlen(pattern)))
> +			return true;
> +	}
> +	return false;
> +}
> +
> +static uint64_t check_huge(void *addr)
> +{
> +	uint64_t thp = 0;
> +	int ret;
> +	FILE *fp;
> +	char buffer[MAX_LINE_LENGTH];
> +	char addr_pattern[MAX_LINE_LENGTH];
> +
> +	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
> +		       (unsigned long) addr);
> +	if (ret >= MAX_LINE_LENGTH) {
> +		printf("%s: Pattern is too long\n", __func__);
> +		exit(EXIT_FAILURE);
> +	}
> +
> +
> +	fp = fopen(SMAP_PATH, "r");
> +	if (!fp) {
> +		printf("%s: Failed to open file %s\n", __func__, SMAP_PATH);
> +		exit(EXIT_FAILURE);
> +	}
> +	if (!check_for_pattern(fp, addr_pattern, buffer))
> +		goto err_out;
> +
> +	/*
> +	 * Fetch the AnonHugePages: in the same block and check the number of
> +	 * hugepages.
> +	 */
> +	if (!check_for_pattern(fp, "AnonHugePages:", buffer))
> +		goto err_out;
> +
> +	if (sscanf(buffer, "AnonHugePages:%10ld kB", &thp) != 1) {
> +		printf("Reading smap error\n");
> +		exit(EXIT_FAILURE);
> +	}
> +
> +err_out:
> +	fclose(fp);
> +	return thp;
> +}
> +
> +void split_pmd_thp(void)
> +{
> +	char *one_page;
> +	size_t len = 4 * pmd_pagesize;
> +	uint64_t thp_size;
> +	size_t i;
> +
> +	one_page = memalign(pmd_pagesize, len);
> +
> +	madvise(one_page, len, MADV_HUGEPAGE);
> +
> +	for (i = 0; i < len; i++)
> +		one_page[i] = (char)i;
> +
> +	thp_size = check_huge(one_page);
> +	if (!thp_size) {
> +		printf("No THP is allocatd");
> +		exit(EXIT_FAILURE);
> +	}
> +
> +	/* split all possible huge pages */
> +	write_debugfs(getpid(), (uint64_t)one_page, (uint64_t)one_page + len);
> +
> +	for (i = 0; i < len; i++)
> +		if (one_page[i] != (char)i) {
> +			printf("%ld byte corrupted\n", i);
> +			exit(EXIT_FAILURE);
> +		}
> +
> +
> +	thp_size = check_huge(one_page);
> +	if (thp_size) {
> +		printf("Still %ld kB AnonHugePages not split\n", thp_size);
> +		exit(EXIT_FAILURE);
> +	}
> +
> +	printf("Split huge pages successful\n");
> +	free(one_page);
> +}
> +
> +void split_pte_mapped_thp(void)
> +{
> +	char *one_page, *pte_mapped, *pte_mapped2;
> +	size_t len = 4 * pmd_pagesize;
> +	uint64_t thp_size;
> +	size_t i;
> +	const char *pagemap_template = "/proc/%d/pagemap";
> +	const char *kpageflags_proc = "/proc/kpageflags";
> +	char pagemap_proc[255];
> +	int pagemap_fd;
> +	int kpageflags_fd;
> +
> +	if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0) {
> +		perror("get pagemap proc error");
> +		exit(EXIT_FAILURE);
> +	}
> +	pagemap_fd = open(pagemap_proc, O_RDONLY);
> +
> +	if (pagemap_fd == -1) {
> +		perror("read pagemap:");
> +		exit(EXIT_FAILURE);
> +	}
> +
> +	kpageflags_fd = open(kpageflags_proc, O_RDONLY);
> +
> +	if (kpageflags_fd == -1) {
> +		perror("read kpageflags:");
> +		exit(EXIT_FAILURE);
> +	}
> +
> +	one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE,
> +			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
> +
> +	madvise(one_page, len, MADV_HUGEPAGE);
> +
> +	for (i = 0; i < len; i++)
> +		one_page[i] = (char)i;
> +
> +	thp_size = check_huge(one_page);
> +	if (!thp_size) {
> +		printf("No THP is allocatd");
> +		exit(EXIT_FAILURE);
> +	}
> +
> +	pte_mapped = mremap(one_page, pagesize, pagesize, MREMAP_MAYMOVE);
> +
> +	for (i = 1; i < 4; i++) {
> +		pte_mapped2 = mremap(one_page + pmd_pagesize * i + pagesize * i,
> +				     pagesize, pagesize,
> +				     MREMAP_MAYMOVE|MREMAP_FIXED,
> +				     pte_mapped + pagesize * i);
> +		if (pte_mapped2 == (char *)-1) {
> +			perror("mremap failed");
> +			exit(EXIT_FAILURE);
> +		}
> +	}
> +
> +	/* smap does not show THPs after mremap, use kpageflags instead */
> +	thp_size = 0;
> +	for (i = 0; i < pagesize * 4; i++)
> +		if (i % pagesize == 0 &&
> +		    is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
> +			thp_size++;
> +
> +	if (thp_size != 4) {
> +		printf("Some THPs are missing during mremap\n");
> +		exit(EXIT_FAILURE);
> +	}
> +
> +	/* split all possible huge pages */
> +	write_debugfs(getpid(), (uint64_t)pte_mapped,
> +		      (uint64_t)pte_mapped + pagesize * 4);
> +
> +	/* smap does not show THPs after mremap, use kpageflags instead */
> +	thp_size = 0;
> +	for (i = 0; i < pagesize * 4; i++) {
> +		if (pte_mapped[i] != (char)i) {
> +			printf("%ld byte corrupted\n", i);
> +			exit(EXIT_FAILURE);
> +		}
> +		if (i % pagesize == 0 &&
> +		    is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
> +			thp_size++;
> +	}
> +
> +	if (thp_size) {
> +		printf("Still %ld THPs not split\n", thp_size);
> +		exit(EXIT_FAILURE);
> +	}
> +
> +	printf("Split PTE-mapped huge pages successful\n");
> +	munmap(one_page, len);
> +	close(pagemap_fd);
> +	close(kpageflags_fd);
> +}
> +
> +int main(int argc, char **argv)
> +{
> +	if (geteuid() != 0) {
> +		printf("Please run the benchmark as root\n");
> +		exit(EXIT_FAILURE);
> +	}
> +
> +	pagesize = getpagesize();
> +	pageshift = ffs(pagesize) - 1;
> +	pmd_pagesize = read_pmd_pagesize();
> +
> +	split_pmd_thp();
> +	split_pte_mapped_thp();
> +
> +	return 0;
> +}
>
Zi Yan March 8, 2021, 5:49 p.m. UTC | #2
On 8 Mar 2021, at 11:17, David Hildenbrand wrote:

> On 08.03.21 16:22, Zi Yan wrote:
>> From: Zi Yan <ziy@nvidia.com>
>>
>> By writing "<pid>,<vaddr_start>,<vaddr_end>" to
>> <debugfs>/split_huge_pages_in_range_pid, THPs in the process with the
>> given pid and virtual address range are split. It is used to test
>> split_huge_page function. In addition, a selftest program is added to
>> tools/testing/selftests/vm to utilize the interface by splitting
>> PMD THPs and PTE-mapped THPs.
>
> Won't something like
>
> 1. MADV_HUGEPAGE
>
> 2. Access memory
>
> 3. MADV_NOHUGEPAGE
>
> Have a similar effect? What's the benefit of this?

Thanks for checking the patch.

No, MADV_NOHUGEPAGE just replaces VM_HUGEPAGE with VM_NOHUGEPAGE,
nothing else will be done.

Without this, we do not have a way of splitting a specific THP
(the compound page) via any user interface for debugging.
We can only write to <debugfs>/split_huge_pages to split all THPs
in the system, which has an unwanted effect on the whole system
and can overwhelm us with a lot of information. This new debugfs
interface provides a more precise method.

>>
>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>> ---
>>   mm/huge_memory.c                              |  98 ++++++
>>   mm/internal.h                                 |   1 +
>>   mm/migrate.c                                  |   2 +-
>>   tools/testing/selftests/vm/.gitignore         |   1 +
>>   tools/testing/selftests/vm/Makefile           |   1 +
>>   .../selftests/vm/split_huge_page_test.c       | 318 ++++++++++++++++++
>>   6 files changed, 420 insertions(+), 1 deletion(-)
>>   create mode 100644 tools/testing/selftests/vm/split_huge_page_test.c
>>
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 395c75111d33..818172f887bf 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -7,6 +7,7 @@
>>    #include <linux/mm.h>
>>   #include <linux/sched.h>
>> +#include <linux/sched/mm.h>
>>   #include <linux/sched/coredump.h>
>>   #include <linux/sched/numa_balancing.h>
>>   #include <linux/highmem.h>
>> @@ -2971,10 +2972,107 @@ static int split_huge_pages_set(void *data, u64 val)
>>   DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
>>   		"%llu\n");
>>  +static ssize_t split_huge_pages_in_range_pid_write(struct file *file,
>> +		const char __user *buf, size_t count, loff_t *ppops)
>> +{
>> +	static DEFINE_MUTEX(mutex);
>> +	ssize_t ret;
>> +	char input_buf[80]; /* hold pid, start_vaddr, end_vaddr */
>> +	int pid;
>> +	unsigned long vaddr_start, vaddr_end, addr;
>> +	nodemask_t task_nodes;
>> +	struct mm_struct *mm;
>> +	unsigned long total = 0, split = 0;
>> +
>> +	ret = mutex_lock_interruptible(&mutex);
>> +	if (ret)
>> +		return ret;
>> +
>> +	ret = -EFAULT;
>> +
>> +	memset(input_buf, 0, 80);
>> +	if (copy_from_user(input_buf, buf, min_t(size_t, count, 80)))
>> +		goto out;
>> +
>> +	input_buf[79] = '\0';
>> +	ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end);
>> +	if (ret != 3) {
>> +		ret = -EINVAL;
>> +		goto out;
>> +	}
>> +	vaddr_start &= PAGE_MASK;
>> +	vaddr_end &= PAGE_MASK;
>> +
>> +	ret = strlen(input_buf);
>> +	pr_debug("split huge pages in pid: %d, vaddr: [%lx - %lx]\n",
>> +		 pid, vaddr_start, vaddr_end);
>> +
>> +	mm = find_mm_struct(pid, &task_nodes);
>> +	if (IS_ERR(mm)) {
>> +		ret = -EINVAL;
>> +		goto out;
>> +	}
>> +
>> +	mmap_read_lock(mm);
>> +	/*
>> +	 * always increase addr by PAGE_SIZE, since we could have a PTE page
>> +	 * table filled with PTE-mapped THPs, each of which is distinct.
>> +	 */
>> +	for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
>> +		struct vm_area_struct *vma = find_vma(mm, addr);
>> +		unsigned int follflags;
>> +		struct page *page;
>> +
>> +		if (!vma || addr < vma->vm_start || !vma_migratable(vma))
>> +			break;
>> +
>> +		/* FOLL_DUMP to ignore special (like zero) pages */
>> +		follflags = FOLL_GET | FOLL_DUMP;
>> +		page = follow_page(vma, addr, follflags);
>> +
>> +		if (IS_ERR(page))
>> +			break;
>> +		if (!page)
>> +			break;
>> +
>> +		if (!is_transparent_hugepage(page))
>> +			continue;
>> +
>> +		total++;
>> +		if (!can_split_huge_page(compound_head(page), NULL))
>> +			continue;
>> +
>> +		if (!trylock_page(page))
>> +			continue;
>> +
>> +		if (!split_huge_page(page))
>> +			split++;
>> +
>> +		unlock_page(page);
>> +		put_page(page);
>> +	}
>> +	mmap_read_unlock(mm);
>> +	mmput(mm);
>> +
>> +	pr_debug("%lu of %lu THP split\n", split, total);
>> +out:
>> +	mutex_unlock(&mutex);
>> +	return ret;
>> +
>> +}
>> +
>> +static const struct file_operations split_huge_pages_in_range_pid_fops = {
>> +	.owner	 = THIS_MODULE,
>> +	.write	 = split_huge_pages_in_range_pid_write,
>> +	.llseek  = no_llseek,
>> +};
>> +
>>   static int __init split_huge_pages_debugfs(void)
>>   {
>>   	debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
>>   			    &split_huge_pages_fops);
>> +	debugfs_create_file("split_huge_pages_in_range_pid", 0200, NULL, NULL,
>> +			    &split_huge_pages_in_range_pid_fops);
>>   	return 0;
>>   }
>>   late_initcall(split_huge_pages_debugfs);
>> diff --git a/mm/internal.h b/mm/internal.h
>> index 9902648f2206..1659d00100ef 100644
>> --- a/mm/internal.h
>> +++ b/mm/internal.h
>> @@ -623,4 +623,5 @@ struct migration_target_control {
>>   	gfp_t gfp_mask;
>>   };
>>  +struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes);
>>   #endif	/* __MM_INTERNAL_H */
>> diff --git a/mm/migrate.c b/mm/migrate.c
>> index 62b81d5257aa..ce5f213debb2 100644
>> --- a/mm/migrate.c
>> +++ b/mm/migrate.c
>> @@ -1913,7 +1913,7 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
>>   	return nr_pages ? -EFAULT : 0;
>>   }
>>  -static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
>> +struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
>>   {
>>   	struct task_struct *task;
>>   	struct mm_struct *mm;
>> diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
>> index 9a35c3f6a557..1f651e85ed60 100644
>> --- a/tools/testing/selftests/vm/.gitignore
>> +++ b/tools/testing/selftests/vm/.gitignore
>> @@ -22,3 +22,4 @@ map_fixed_noreplace
>>   write_to_hugetlbfs
>>   hmm-tests
>>   local_config.*
>> +split_huge_page_test
>> diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
>> index d42115e4284d..4cbc91d6869f 100644
>> --- a/tools/testing/selftests/vm/Makefile
>> +++ b/tools/testing/selftests/vm/Makefile
>> @@ -42,6 +42,7 @@ TEST_GEN_FILES += on-fault-limit
>>   TEST_GEN_FILES += thuge-gen
>>   TEST_GEN_FILES += transhuge-stress
>>   TEST_GEN_FILES += userfaultfd
>> +TEST_GEN_FILES += split_huge_page_test
>>    ifeq ($(MACHINE),x86_64)
>>   CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_32bit_program.c -m32)
>> diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c
>> new file mode 100644
>> index 000000000000..8ea8000fda62
>> --- /dev/null
>> +++ b/tools/testing/selftests/vm/split_huge_page_test.c
>> @@ -0,0 +1,318 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * A test of splitting PMD THPs and PTE-mapped THPs from a specified virtual
>> + * address range in a process via <debugfs>/split_huge_pages_in_range_pid
>> + * interface.
>> + */
>> +
>> +#define _GNU_SOURCE
>> +#include <stdio.h>
>> +#include <stdlib.h>
>> +#include "numa.h"
>> +#include <unistd.h>
>> +#include <errno.h>
>> +#include <inttypes.h>
>> +#include <string.h>
>> +#include <sys/types.h>
>> +#include <sys/stat.h>
>> +#include <fcntl.h>
>> +#include <sys/mman.h>
>> +#include <sys/time.h>
>> +#include <sys/wait.h>
>> +#include <malloc.h>
>> +#include <stdbool.h>
>> +
>> +uint64_t pagesize;
>> +unsigned int pageshift;
>> +uint64_t pmd_pagesize;
>> +
>> +#define PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
>> +#define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages_in_range_pid"
>> +#define SMAP_PATH "/proc/self/smaps"
>> +#define INPUT_MAX 80
>> +
>> +#define PFN_MASK     ((1UL<<55)-1)
>> +#define KPF_THP      (1UL<<22)
>> +
>> +int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file)
>> +{
>> +	uint64_t paddr;
>> +	uint64_t page_flags;
>> +
>> +	if (pagemap_file) {
>> +		pread(pagemap_file, &paddr, sizeof(paddr),
>> +			((long)vaddr >> pageshift) * sizeof(paddr));
>> +
>> +		if (kpageflags_file) {
>> +			pread(kpageflags_file, &page_flags, sizeof(page_flags),
>> +				(paddr & PFN_MASK) * sizeof(page_flags));
>> +
>> +			return !!(page_flags & KPF_THP);
>> +		}
>> +	}
>> +	return 0;
>> +}
>> +
>> +
>> +static uint64_t read_pmd_pagesize(void)
>> +{
>> +	int fd;
>> +	char buf[20];
>> +	ssize_t num_read;
>> +
>> +	fd = open(PMD_SIZE_PATH, O_RDONLY);
>> +	if (fd == -1) {
>> +		perror("Open hpage_pmd_size failed");
>> +		exit(EXIT_FAILURE);
>> +	}
>> +	num_read = read(fd, buf, 19);
>> +	if (num_read < 1) {
>> +		close(fd);
>> +		perror("Read hpage_pmd_size failed");
>> +		exit(EXIT_FAILURE);
>> +	}
>> +	buf[num_read] = '\0';
>> +	close(fd);
>> +
>> +	return strtoul(buf, NULL, 10);
>> +}
>> +
>> +static int write_file(const char *path, const char *buf, size_t buflen)
>> +{
>> +	int fd;
>> +	ssize_t numwritten;
>> +
>> +	fd = open(path, O_WRONLY);
>> +	if (fd == -1)
>> +		return 0;
>> +
>> +	numwritten = write(fd, buf, buflen - 1);
>> +	close(fd);
>> +	if (numwritten < 1)
>> +		return 0;
>> +
>> +	return (unsigned int) numwritten;
>> +}
>> +
>> +static void write_debugfs(int pid, uint64_t vaddr_start, uint64_t vaddr_end)
>> +{
>> +	char input[INPUT_MAX];
>> +	int ret;
>> +
>> +	ret = snprintf(input, INPUT_MAX, "%d,0x%lx,0x%lx", pid, vaddr_start,
>> +			vaddr_end);
>> +	if (ret >= INPUT_MAX) {
>> +		printf("%s: Debugfs input is too long\n", __func__);
>> +		exit(EXIT_FAILURE);
>> +	}
>> +
>> +	if (!write_file(SPLIT_DEBUGFS, input, ret + 1)) {
>> +		perror(SPLIT_DEBUGFS);
>> +		exit(EXIT_FAILURE);
>> +	}
>> +}
>> +
>> +#define MAX_LINE_LENGTH 500
>> +
>> +static bool check_for_pattern(FILE *fp, const char *pattern, char *buf)
>> +{
>> +	while (fgets(buf, MAX_LINE_LENGTH, fp) != NULL) {
>> +		if (!strncmp(buf, pattern, strlen(pattern)))
>> +			return true;
>> +	}
>> +	return false;
>> +}
>> +
>> +static uint64_t check_huge(void *addr)
>> +{
>> +	uint64_t thp = 0;
>> +	int ret;
>> +	FILE *fp;
>> +	char buffer[MAX_LINE_LENGTH];
>> +	char addr_pattern[MAX_LINE_LENGTH];
>> +
>> +	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
>> +		       (unsigned long) addr);
>> +	if (ret >= MAX_LINE_LENGTH) {
>> +		printf("%s: Pattern is too long\n", __func__);
>> +		exit(EXIT_FAILURE);
>> +	}
>> +
>> +
>> +	fp = fopen(SMAP_PATH, "r");
>> +	if (!fp) {
>> +		printf("%s: Failed to open file %s\n", __func__, SMAP_PATH);
>> +		exit(EXIT_FAILURE);
>> +	}
>> +	if (!check_for_pattern(fp, addr_pattern, buffer))
>> +		goto err_out;
>> +
>> +	/*
>> +	 * Fetch the AnonHugePages: in the same block and check the number of
>> +	 * hugepages.
>> +	 */
>> +	if (!check_for_pattern(fp, "AnonHugePages:", buffer))
>> +		goto err_out;
>> +
>> +	if (sscanf(buffer, "AnonHugePages:%10ld kB", &thp) != 1) {
>> +		printf("Reading smap error\n");
>> +		exit(EXIT_FAILURE);
>> +	}
>> +
>> +err_out:
>> +	fclose(fp);
>> +	return thp;
>> +}
>> +
>> +void split_pmd_thp(void)
>> +{
>> +	char *one_page;
>> +	size_t len = 4 * pmd_pagesize;
>> +	uint64_t thp_size;
>> +	size_t i;
>> +
>> +	one_page = memalign(pmd_pagesize, len);
>> +
>> +	madvise(one_page, len, MADV_HUGEPAGE);
>> +
>> +	for (i = 0; i < len; i++)
>> +		one_page[i] = (char)i;
>> +
>> +	thp_size = check_huge(one_page);
>> +	if (!thp_size) {
>> +		printf("No THP is allocatd");
>> +		exit(EXIT_FAILURE);
>> +	}
>> +
>> +	/* split all possible huge pages */
>> +	write_debugfs(getpid(), (uint64_t)one_page, (uint64_t)one_page + len);
>> +
>> +	for (i = 0; i < len; i++)
>> +		if (one_page[i] != (char)i) {
>> +			printf("%ld byte corrupted\n", i);
>> +			exit(EXIT_FAILURE);
>> +		}
>> +
>> +
>> +	thp_size = check_huge(one_page);
>> +	if (thp_size) {
>> +		printf("Still %ld kB AnonHugePages not split\n", thp_size);
>> +		exit(EXIT_FAILURE);
>> +	}
>> +
>> +	printf("Split huge pages successful\n");
>> +	free(one_page);
>> +}
>> +
>> +void split_pte_mapped_thp(void)
>> +{
>> +	char *one_page, *pte_mapped, *pte_mapped2;
>> +	size_t len = 4 * pmd_pagesize;
>> +	uint64_t thp_size;
>> +	size_t i;
>> +	const char *pagemap_template = "/proc/%d/pagemap";
>> +	const char *kpageflags_proc = "/proc/kpageflags";
>> +	char pagemap_proc[255];
>> +	int pagemap_fd;
>> +	int kpageflags_fd;
>> +
>> +	if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0) {
>> +		perror("get pagemap proc error");
>> +		exit(EXIT_FAILURE);
>> +	}
>> +	pagemap_fd = open(pagemap_proc, O_RDONLY);
>> +
>> +	if (pagemap_fd == -1) {
>> +		perror("read pagemap:");
>> +		exit(EXIT_FAILURE);
>> +	}
>> +
>> +	kpageflags_fd = open(kpageflags_proc, O_RDONLY);
>> +
>> +	if (kpageflags_fd == -1) {
>> +		perror("read kpageflags:");
>> +		exit(EXIT_FAILURE);
>> +	}
>> +
>> +	one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE,
>> +			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
>> +
>> +	madvise(one_page, len, MADV_HUGEPAGE);
>> +
>> +	for (i = 0; i < len; i++)
>> +		one_page[i] = (char)i;
>> +
>> +	thp_size = check_huge(one_page);
>> +	if (!thp_size) {
>> +		printf("No THP is allocatd");
>> +		exit(EXIT_FAILURE);
>> +	}
>> +
>> +	pte_mapped = mremap(one_page, pagesize, pagesize, MREMAP_MAYMOVE);
>> +
>> +	for (i = 1; i < 4; i++) {
>> +		pte_mapped2 = mremap(one_page + pmd_pagesize * i + pagesize * i,
>> +				     pagesize, pagesize,
>> +				     MREMAP_MAYMOVE|MREMAP_FIXED,
>> +				     pte_mapped + pagesize * i);
>> +		if (pte_mapped2 == (char *)-1) {
>> +			perror("mremap failed");
>> +			exit(EXIT_FAILURE);
>> +		}
>> +	}
>> +
>> +	/* smap does not show THPs after mremap, use kpageflags instead */
>> +	thp_size = 0;
>> +	for (i = 0; i < pagesize * 4; i++)
>> +		if (i % pagesize == 0 &&
>> +		    is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
>> +			thp_size++;
>> +
>> +	if (thp_size != 4) {
>> +		printf("Some THPs are missing during mremap\n");
>> +		exit(EXIT_FAILURE);
>> +	}
>> +
>> +	/* split all possible huge pages */
>> +	write_debugfs(getpid(), (uint64_t)pte_mapped,
>> +		      (uint64_t)pte_mapped + pagesize * 4);
>> +
>> +	/* smap does not show THPs after mremap, use kpageflags instead */
>> +	thp_size = 0;
>> +	for (i = 0; i < pagesize * 4; i++) {
>> +		if (pte_mapped[i] != (char)i) {
>> +			printf("%ld byte corrupted\n", i);
>> +			exit(EXIT_FAILURE);
>> +		}
>> +		if (i % pagesize == 0 &&
>> +		    is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
>> +			thp_size++;
>> +	}
>> +
>> +	if (thp_size) {
>> +		printf("Still %ld THPs not split\n", thp_size);
>> +		exit(EXIT_FAILURE);
>> +	}
>> +
>> +	printf("Split PTE-mapped huge pages successful\n");
>> +	munmap(one_page, len);
>> +	close(pagemap_fd);
>> +	close(kpageflags_fd);
>> +}
>> +
>> +int main(int argc, char **argv)
>> +{
>> +	if (geteuid() != 0) {
>> +		printf("Please run the benchmark as root\n");
>> +		exit(EXIT_FAILURE);
>> +	}
>> +
>> +	pagesize = getpagesize();
>> +	pageshift = ffs(pagesize) - 1;
>> +	pmd_pagesize = read_pmd_pagesize();
>> +
>> +	split_pmd_thp();
>> +	split_pte_mapped_thp();
>> +
>> +	return 0;
>> +}
>>
>
>
> -- 
> Thanks,
>
> David / dhildenb


—
Best Regards,
Yan Zi
David Hildenbrand March 8, 2021, 6:11 p.m. UTC | #3
On 08.03.21 18:49, Zi Yan wrote:
> On 8 Mar 2021, at 11:17, David Hildenbrand wrote:
> 
>> On 08.03.21 16:22, Zi Yan wrote:
>>> From: Zi Yan <ziy@nvidia.com>
>>>
>>> By writing "<pid>,<vaddr_start>,<vaddr_end>" to
>>> <debugfs>/split_huge_pages_in_range_pid, THPs in the process with the
>>> given pid and virtual address range are split. It is used to test
>>> split_huge_page function. In addition, a selftest program is added to
>>> tools/testing/selftests/vm to utilize the interface by splitting
>>> PMD THPs and PTE-mapped THPs.
>>
>> Won't something like
>>
>> 1. MADV_HUGEPAGE
>>
>> 2. Access memory
>>
>> 3. MADV_NOHUGEPAGE
>>
>> Have a similar effect? What's the benefit of this?
> 
> Thanks for checking the patch.
> 
> No, MADV_NOHUGEPAGE just replaces VM_HUGEPAGE with VM_NOHUGEPAGE,
> nothing else will be done.

Ah, okay - maybe my memory was tricking me. There is some s390x KVM code 
that forces MADV_NOHUGEPAGE and force-splits everything.

I do wonder, though, if this functionality would be worth a proper user 
interface (e.g., madvise), though. There might be actual benefit in 
having this as a !debug interface.

I think you aware of the discussion in 
https://lkml.kernel.org/r/d098c392-273a-36a4-1a29-59731cdf5d3d@google.com


If there will be an interface to collapse a THP -- "this memory area is 
worth extra performance now by collapsing a THP if possible" -- it might 
also be helpful to have the opposite functionality -- "this memory area 
is not worth a THP, rather use that somehwere else".

MADV_HUGE_COLLAPSE vs. MADV_HUGE_SPLIT

Just a thought.

> 
> Without this, we do not have a way of splitting a specific THP
> (the compound page) via any user interface for debugging.
> We can only write to <debugfs>/split_huge_pages to split all THPs
> in the system, which has an unwanted effect on the whole system
> and can overwhelm us with a lot of information. This new debugfs
> interface provides a more precise method.
> 
>>>
>>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>>> ---
>>>    mm/huge_memory.c                              |  98 ++++++
>>>    mm/internal.h                                 |   1 +
>>>    mm/migrate.c                                  |   2 +-
>>>    tools/testing/selftests/vm/.gitignore         |   1 +
>>>    tools/testing/selftests/vm/Makefile           |   1 +
>>>    .../selftests/vm/split_huge_page_test.c       | 318 ++++++++++++++++++
>>>    6 files changed, 420 insertions(+), 1 deletion(-)
>>>    create mode 100644 tools/testing/selftests/vm/split_huge_page_test.c
>>>
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index 395c75111d33..818172f887bf 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -7,6 +7,7 @@
>>>     #include <linux/mm.h>
>>>    #include <linux/sched.h>
>>> +#include <linux/sched/mm.h>
>>>    #include <linux/sched/coredump.h>
>>>    #include <linux/sched/numa_balancing.h>
>>>    #include <linux/highmem.h>
>>> @@ -2971,10 +2972,107 @@ static int split_huge_pages_set(void *data, u64 val)
>>>    DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
>>>    		"%llu\n");
>>>   +static ssize_t split_huge_pages_in_range_pid_write(struct file *file,
>>> +		const char __user *buf, size_t count, loff_t *ppops)
>>> +{
>>> +	static DEFINE_MUTEX(mutex);
>>> +	ssize_t ret;
>>> +	char input_buf[80]; /* hold pid, start_vaddr, end_vaddr */
>>> +	int pid;
>>> +	unsigned long vaddr_start, vaddr_end, addr;
>>> +	nodemask_t task_nodes;
>>> +	struct mm_struct *mm;
>>> +	unsigned long total = 0, split = 0;
>>> +
>>> +	ret = mutex_lock_interruptible(&mutex);
>>> +	if (ret)
>>> +		return ret;
>>> +
>>> +	ret = -EFAULT;
>>> +
>>> +	memset(input_buf, 0, 80);
>>> +	if (copy_from_user(input_buf, buf, min_t(size_t, count, 80)))
>>> +		goto out;
>>> +
>>> +	input_buf[79] = '\0';
>>> +	ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end);
>>> +	if (ret != 3) {
>>> +		ret = -EINVAL;
>>> +		goto out;
>>> +	}
>>> +	vaddr_start &= PAGE_MASK;
>>> +	vaddr_end &= PAGE_MASK;
>>> +
>>> +	ret = strlen(input_buf);
>>> +	pr_debug("split huge pages in pid: %d, vaddr: [%lx - %lx]\n",
>>> +		 pid, vaddr_start, vaddr_end);
>>> +
>>> +	mm = find_mm_struct(pid, &task_nodes);
>>> +	if (IS_ERR(mm)) {
>>> +		ret = -EINVAL;
>>> +		goto out;
>>> +	}
>>> +
>>> +	mmap_read_lock(mm);
>>> +	/*
>>> +	 * always increase addr by PAGE_SIZE, since we could have a PTE page
>>> +	 * table filled with PTE-mapped THPs, each of which is distinct.
>>> +	 */
>>> +	for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
>>> +		struct vm_area_struct *vma = find_vma(mm, addr);
>>> +		unsigned int follflags;
>>> +		struct page *page;
>>> +
>>> +		if (!vma || addr < vma->vm_start || !vma_migratable(vma))
>>> +			break;
>>> +
>>> +		/* FOLL_DUMP to ignore special (like zero) pages */
>>> +		follflags = FOLL_GET | FOLL_DUMP;
>>> +		page = follow_page(vma, addr, follflags);
>>> +
>>> +		if (IS_ERR(page))
>>> +			break;
>>> +		if (!page)
>>> +			break;
>>> +
>>> +		if (!is_transparent_hugepage(page))
>>> +			continue;
>>> +
>>> +		total++;
>>> +		if (!can_split_huge_page(compound_head(page), NULL))
>>> +			continue;
>>> +
>>> +		if (!trylock_page(page))
>>> +			continue;
>>> +
>>> +		if (!split_huge_page(page))
>>> +			split++;
>>> +
>>> +		unlock_page(page);
>>> +		put_page(page);
>>> +	}
>>> +	mmap_read_unlock(mm);
>>> +	mmput(mm);
>>> +
>>> +	pr_debug("%lu of %lu THP split\n", split, total);
>>> +out:
>>> +	mutex_unlock(&mutex);
>>> +	return ret;
>>> +
>>> +}
>>> +
>>> +static const struct file_operations split_huge_pages_in_range_pid_fops = {
>>> +	.owner	 = THIS_MODULE,
>>> +	.write	 = split_huge_pages_in_range_pid_write,
>>> +	.llseek  = no_llseek,
>>> +};
>>> +
>>>    static int __init split_huge_pages_debugfs(void)
>>>    {
>>>    	debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
>>>    			    &split_huge_pages_fops);
>>> +	debugfs_create_file("split_huge_pages_in_range_pid", 0200, NULL, NULL,
>>> +			    &split_huge_pages_in_range_pid_fops);
>>>    	return 0;
>>>    }
>>>    late_initcall(split_huge_pages_debugfs);
>>> diff --git a/mm/internal.h b/mm/internal.h
>>> index 9902648f2206..1659d00100ef 100644
>>> --- a/mm/internal.h
>>> +++ b/mm/internal.h
>>> @@ -623,4 +623,5 @@ struct migration_target_control {
>>>    	gfp_t gfp_mask;
>>>    };
>>>   +struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes);
>>>    #endif	/* __MM_INTERNAL_H */
>>> diff --git a/mm/migrate.c b/mm/migrate.c
>>> index 62b81d5257aa..ce5f213debb2 100644
>>> --- a/mm/migrate.c
>>> +++ b/mm/migrate.c
>>> @@ -1913,7 +1913,7 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
>>>    	return nr_pages ? -EFAULT : 0;
>>>    }
>>>   -static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
>>> +struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
>>>    {
>>>    	struct task_struct *task;
>>>    	struct mm_struct *mm;
>>> diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
>>> index 9a35c3f6a557..1f651e85ed60 100644
>>> --- a/tools/testing/selftests/vm/.gitignore
>>> +++ b/tools/testing/selftests/vm/.gitignore
>>> @@ -22,3 +22,4 @@ map_fixed_noreplace
>>>    write_to_hugetlbfs
>>>    hmm-tests
>>>    local_config.*
>>> +split_huge_page_test
>>> diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
>>> index d42115e4284d..4cbc91d6869f 100644
>>> --- a/tools/testing/selftests/vm/Makefile
>>> +++ b/tools/testing/selftests/vm/Makefile
>>> @@ -42,6 +42,7 @@ TEST_GEN_FILES += on-fault-limit
>>>    TEST_GEN_FILES += thuge-gen
>>>    TEST_GEN_FILES += transhuge-stress
>>>    TEST_GEN_FILES += userfaultfd
>>> +TEST_GEN_FILES += split_huge_page_test
>>>     ifeq ($(MACHINE),x86_64)
>>>    CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_32bit_program.c -m32)
>>> diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c
>>> new file mode 100644
>>> index 000000000000..8ea8000fda62
>>> --- /dev/null
>>> +++ b/tools/testing/selftests/vm/split_huge_page_test.c
>>> @@ -0,0 +1,318 @@
>>> +// SPDX-License-Identifier: GPL-2.0
>>> +/*
>>> + * A test of splitting PMD THPs and PTE-mapped THPs from a specified virtual
>>> + * address range in a process via <debugfs>/split_huge_pages_in_range_pid
>>> + * interface.
>>> + */
>>> +
>>> +#define _GNU_SOURCE
>>> +#include <stdio.h>
>>> +#include <stdlib.h>
>>> +#include "numa.h"
>>> +#include <unistd.h>
>>> +#include <errno.h>
>>> +#include <inttypes.h>
>>> +#include <string.h>
>>> +#include <sys/types.h>
>>> +#include <sys/stat.h>
>>> +#include <fcntl.h>
>>> +#include <sys/mman.h>
>>> +#include <sys/time.h>
>>> +#include <sys/wait.h>
>>> +#include <malloc.h>
>>> +#include <stdbool.h>
>>> +
>>> +uint64_t pagesize;
>>> +unsigned int pageshift;
>>> +uint64_t pmd_pagesize;
>>> +
>>> +#define PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
>>> +#define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages_in_range_pid"
>>> +#define SMAP_PATH "/proc/self/smaps"
>>> +#define INPUT_MAX 80
>>> +
>>> +#define PFN_MASK     ((1UL<<55)-1)
>>> +#define KPF_THP      (1UL<<22)
>>> +
>>> +int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file)
>>> +{
>>> +	uint64_t paddr;
>>> +	uint64_t page_flags;
>>> +
>>> +	if (pagemap_file) {
>>> +		pread(pagemap_file, &paddr, sizeof(paddr),
>>> +			((long)vaddr >> pageshift) * sizeof(paddr));
>>> +
>>> +		if (kpageflags_file) {
>>> +			pread(kpageflags_file, &page_flags, sizeof(page_flags),
>>> +				(paddr & PFN_MASK) * sizeof(page_flags));
>>> +
>>> +			return !!(page_flags & KPF_THP);
>>> +		}
>>> +	}
>>> +	return 0;
>>> +}
>>> +
>>> +
>>> +static uint64_t read_pmd_pagesize(void)
>>> +{
>>> +	int fd;
>>> +	char buf[20];
>>> +	ssize_t num_read;
>>> +
>>> +	fd = open(PMD_SIZE_PATH, O_RDONLY);
>>> +	if (fd == -1) {
>>> +		perror("Open hpage_pmd_size failed");
>>> +		exit(EXIT_FAILURE);
>>> +	}
>>> +	num_read = read(fd, buf, 19);
>>> +	if (num_read < 1) {
>>> +		close(fd);
>>> +		perror("Read hpage_pmd_size failed");
>>> +		exit(EXIT_FAILURE);
>>> +	}
>>> +	buf[num_read] = '\0';
>>> +	close(fd);
>>> +
>>> +	return strtoul(buf, NULL, 10);
>>> +}
>>> +
>>> +static int write_file(const char *path, const char *buf, size_t buflen)
>>> +{
>>> +	int fd;
>>> +	ssize_t numwritten;
>>> +
>>> +	fd = open(path, O_WRONLY);
>>> +	if (fd == -1)
>>> +		return 0;
>>> +
>>> +	numwritten = write(fd, buf, buflen - 1);
>>> +	close(fd);
>>> +	if (numwritten < 1)
>>> +		return 0;
>>> +
>>> +	return (unsigned int) numwritten;
>>> +}
>>> +
>>> +static void write_debugfs(int pid, uint64_t vaddr_start, uint64_t vaddr_end)
>>> +{
>>> +	char input[INPUT_MAX];
>>> +	int ret;
>>> +
>>> +	ret = snprintf(input, INPUT_MAX, "%d,0x%lx,0x%lx", pid, vaddr_start,
>>> +			vaddr_end);
>>> +	if (ret >= INPUT_MAX) {
>>> +		printf("%s: Debugfs input is too long\n", __func__);
>>> +		exit(EXIT_FAILURE);
>>> +	}
>>> +
>>> +	if (!write_file(SPLIT_DEBUGFS, input, ret + 1)) {
>>> +		perror(SPLIT_DEBUGFS);
>>> +		exit(EXIT_FAILURE);
>>> +	}
>>> +}
>>> +
>>> +#define MAX_LINE_LENGTH 500
>>> +
>>> +static bool check_for_pattern(FILE *fp, const char *pattern, char *buf)
>>> +{
>>> +	while (fgets(buf, MAX_LINE_LENGTH, fp) != NULL) {
>>> +		if (!strncmp(buf, pattern, strlen(pattern)))
>>> +			return true;
>>> +	}
>>> +	return false;
>>> +}
>>> +
>>> +static uint64_t check_huge(void *addr)
>>> +{
>>> +	uint64_t thp = 0;
>>> +	int ret;
>>> +	FILE *fp;
>>> +	char buffer[MAX_LINE_LENGTH];
>>> +	char addr_pattern[MAX_LINE_LENGTH];
>>> +
>>> +	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
>>> +		       (unsigned long) addr);
>>> +	if (ret >= MAX_LINE_LENGTH) {
>>> +		printf("%s: Pattern is too long\n", __func__);
>>> +		exit(EXIT_FAILURE);
>>> +	}
>>> +
>>> +
>>> +	fp = fopen(SMAP_PATH, "r");
>>> +	if (!fp) {
>>> +		printf("%s: Failed to open file %s\n", __func__, SMAP_PATH);
>>> +		exit(EXIT_FAILURE);
>>> +	}
>>> +	if (!check_for_pattern(fp, addr_pattern, buffer))
>>> +		goto err_out;
>>> +
>>> +	/*
>>> +	 * Fetch the AnonHugePages: in the same block and check the number of
>>> +	 * hugepages.
>>> +	 */
>>> +	if (!check_for_pattern(fp, "AnonHugePages:", buffer))
>>> +		goto err_out;
>>> +
>>> +	if (sscanf(buffer, "AnonHugePages:%10ld kB", &thp) != 1) {
>>> +		printf("Reading smap error\n");
>>> +		exit(EXIT_FAILURE);
>>> +	}
>>> +
>>> +err_out:
>>> +	fclose(fp);
>>> +	return thp;
>>> +}
>>> +
>>> +void split_pmd_thp(void)
>>> +{
>>> +	char *one_page;
>>> +	size_t len = 4 * pmd_pagesize;
>>> +	uint64_t thp_size;
>>> +	size_t i;
>>> +
>>> +	one_page = memalign(pmd_pagesize, len);
>>> +
>>> +	madvise(one_page, len, MADV_HUGEPAGE);
>>> +
>>> +	for (i = 0; i < len; i++)
>>> +		one_page[i] = (char)i;
>>> +
>>> +	thp_size = check_huge(one_page);
>>> +	if (!thp_size) {
>>> +		printf("No THP is allocatd");
>>> +		exit(EXIT_FAILURE);
>>> +	}
>>> +
>>> +	/* split all possible huge pages */
>>> +	write_debugfs(getpid(), (uint64_t)one_page, (uint64_t)one_page + len);
>>> +
>>> +	for (i = 0; i < len; i++)
>>> +		if (one_page[i] != (char)i) {
>>> +			printf("%ld byte corrupted\n", i);
>>> +			exit(EXIT_FAILURE);
>>> +		}
>>> +
>>> +
>>> +	thp_size = check_huge(one_page);
>>> +	if (thp_size) {
>>> +		printf("Still %ld kB AnonHugePages not split\n", thp_size);
>>> +		exit(EXIT_FAILURE);
>>> +	}
>>> +
>>> +	printf("Split huge pages successful\n");
>>> +	free(one_page);
>>> +}
>>> +
>>> +void split_pte_mapped_thp(void)
>>> +{
>>> +	char *one_page, *pte_mapped, *pte_mapped2;
>>> +	size_t len = 4 * pmd_pagesize;
>>> +	uint64_t thp_size;
>>> +	size_t i;
>>> +	const char *pagemap_template = "/proc/%d/pagemap";
>>> +	const char *kpageflags_proc = "/proc/kpageflags";
>>> +	char pagemap_proc[255];
>>> +	int pagemap_fd;
>>> +	int kpageflags_fd;
>>> +
>>> +	if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0) {
>>> +		perror("get pagemap proc error");
>>> +		exit(EXIT_FAILURE);
>>> +	}
>>> +	pagemap_fd = open(pagemap_proc, O_RDONLY);
>>> +
>>> +	if (pagemap_fd == -1) {
>>> +		perror("read pagemap:");
>>> +		exit(EXIT_FAILURE);
>>> +	}
>>> +
>>> +	kpageflags_fd = open(kpageflags_proc, O_RDONLY);
>>> +
>>> +	if (kpageflags_fd == -1) {
>>> +		perror("read kpageflags:");
>>> +		exit(EXIT_FAILURE);
>>> +	}
>>> +
>>> +	one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE,
>>> +			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
>>> +
>>> +	madvise(one_page, len, MADV_HUGEPAGE);
>>> +
>>> +	for (i = 0; i < len; i++)
>>> +		one_page[i] = (char)i;
>>> +
>>> +	thp_size = check_huge(one_page);
>>> +	if (!thp_size) {
>>> +		printf("No THP is allocatd");
>>> +		exit(EXIT_FAILURE);
>>> +	}
>>> +
>>> +	pte_mapped = mremap(one_page, pagesize, pagesize, MREMAP_MAYMOVE);
>>> +
>>> +	for (i = 1; i < 4; i++) {
>>> +		pte_mapped2 = mremap(one_page + pmd_pagesize * i + pagesize * i,
>>> +				     pagesize, pagesize,
>>> +				     MREMAP_MAYMOVE|MREMAP_FIXED,
>>> +				     pte_mapped + pagesize * i);
>>> +		if (pte_mapped2 == (char *)-1) {
>>> +			perror("mremap failed");
>>> +			exit(EXIT_FAILURE);
>>> +		}
>>> +	}
>>> +
>>> +	/* smap does not show THPs after mremap, use kpageflags instead */
>>> +	thp_size = 0;
>>> +	for (i = 0; i < pagesize * 4; i++)
>>> +		if (i % pagesize == 0 &&
>>> +		    is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
>>> +			thp_size++;
>>> +
>>> +	if (thp_size != 4) {
>>> +		printf("Some THPs are missing during mremap\n");
>>> +		exit(EXIT_FAILURE);
>>> +	}
>>> +
>>> +	/* split all possible huge pages */
>>> +	write_debugfs(getpid(), (uint64_t)pte_mapped,
>>> +		      (uint64_t)pte_mapped + pagesize * 4);
>>> +
>>> +	/* smap does not show THPs after mremap, use kpageflags instead */
>>> +	thp_size = 0;
>>> +	for (i = 0; i < pagesize * 4; i++) {
>>> +		if (pte_mapped[i] != (char)i) {
>>> +			printf("%ld byte corrupted\n", i);
>>> +			exit(EXIT_FAILURE);
>>> +		}
>>> +		if (i % pagesize == 0 &&
>>> +		    is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
>>> +			thp_size++;
>>> +	}
>>> +
>>> +	if (thp_size) {
>>> +		printf("Still %ld THPs not split\n", thp_size);
>>> +		exit(EXIT_FAILURE);
>>> +	}
>>> +
>>> +	printf("Split PTE-mapped huge pages successful\n");
>>> +	munmap(one_page, len);
>>> +	close(pagemap_fd);
>>> +	close(kpageflags_fd);
>>> +}
>>> +
>>> +int main(int argc, char **argv)
>>> +{
>>> +	if (geteuid() != 0) {
>>> +		printf("Please run the benchmark as root\n");
>>> +		exit(EXIT_FAILURE);
>>> +	}
>>> +
>>> +	pagesize = getpagesize();
>>> +	pageshift = ffs(pagesize) - 1;
>>> +	pmd_pagesize = read_pmd_pagesize();
>>> +
>>> +	split_pmd_thp();
>>> +	split_pte_mapped_thp();
>>> +
>>> +	return 0;
>>> +}
>>>
>>
>>
>> -- 
>> Thanks,
>>
>> David / dhildenb
> 
> 
> —
> Best Regards,
> Yan Zi
>
Zi Yan March 8, 2021, 6:46 p.m. UTC | #4
+ the rest of cc back and move your reply inline.

On 8 Mar 2021, at 12:47, Mika Penttilä wrote:
>>
>>
>> On 8.3.2021 17.22, Zi Yan wrote:
>>> From: Zi Yan <ziy@nvidia.com>
>>>
>>> By writing "<pid>,<vaddr_start>,<vaddr_end>" to
>>> <debugfs>/split_huge_pages_in_range_pid, THPs in the process with the
>>> given pid and virtual address range are split. It is used to test
>>> split_huge_page function. In addition, a selftest program is added to
>>> tools/testing/selftests/vm to utilize the interface by splitting
>>> PMD THPs and PTE-mapped THPs.
>>>
>>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>>
>> Hi!
>>
>> I think your test program is not correct. The mremaps shrink to one page, after the first mremap the pointers are bogus.
>> Also, mremap splits pmds with split_huge_pmd().. And those you can't split  with split_huge_page because it is a normal pmd.
>> Maybe you didn't indent to shrink to page size?
>>
>>
>> --Mika
> Hi,
>
> Sorry, wrote too fast.. the splits are okay of course from pte mapped thp to plain pages (mremap -> split pmd -> debugfs write ->split pages).
> But the remap offsets are I think maybe not you wanted.


You mean I mremap the first PAGESIZE from first THP, second PAGESIZE from second THP and so on to
create PTE-mapped THPs? I did it on purpose so split_huge_page can work on different part of THPs.


>>
>>
>>> ---
>>>   mm/huge_memory.c                              |  98 ++++++
>>>   mm/internal.h                                 |   1 +
>>>   mm/migrate.c                                  |   2 +-
>>>   tools/testing/selftests/vm/.gitignore         |   1 +
>>>   tools/testing/selftests/vm/Makefile           |   1 +
>>>   .../selftests/vm/split_huge_page_test.c       | 318 ++++++++++++++++++
>>>   6 files changed, 420 insertions(+), 1 deletion(-)
>>>   create mode 100644 tools/testing/selftests/vm/split_huge_page_test.c
>>>
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index 395c75111d33..818172f887bf 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -7,6 +7,7 @@
>>>     #include <linux/mm.h>
>>>   #include <linux/sched.h>
>>> +#include <linux/sched/mm.h>
>>>   #include <linux/sched/coredump.h>
>>>   #include <linux/sched/numa_balancing.h>
>>>   #include <linux/highmem.h>
>>> @@ -2971,10 +2972,107 @@ static int split_huge_pages_set(void *data, u64 val)
>>>   DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
>>>           "%llu\n");
>>>   +static ssize_t split_huge_pages_in_range_pid_write(struct file *file,
>>> +        const char __user *buf, size_t count, loff_t *ppops)
>>> +{
>>> +    static DEFINE_MUTEX(mutex);
>>> +    ssize_t ret;
>>> +    char input_buf[80]; /* hold pid, start_vaddr, end_vaddr */
>>> +    int pid;
>>> +    unsigned long vaddr_start, vaddr_end, addr;
>>> +    nodemask_t task_nodes;
>>> +    struct mm_struct *mm;
>>> +    unsigned long total = 0, split = 0;
>>> +
>>> +    ret = mutex_lock_interruptible(&mutex);
>>> +    if (ret)
>>> +        return ret;
>>> +
>>> +    ret = -EFAULT;
>>> +
>>> +    memset(input_buf, 0, 80);
>>> +    if (copy_from_user(input_buf, buf, min_t(size_t, count, 80)))
>>> +        goto out;
>>> +
>>> +    input_buf[79] = '\0';
>>> +    ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end);
>>> +    if (ret != 3) {
>>> +        ret = -EINVAL;
>>> +        goto out;
>>> +    }
>>> +    vaddr_start &= PAGE_MASK;
>>> +    vaddr_end &= PAGE_MASK;
>>> +
>>> +    ret = strlen(input_buf);
>>> +    pr_debug("split huge pages in pid: %d, vaddr: [%lx - %lx]\n",
>>> +         pid, vaddr_start, vaddr_end);
>>> +
>>> +    mm = find_mm_struct(pid, &task_nodes);
>>> +    if (IS_ERR(mm)) {
>>> +        ret = -EINVAL;
>>> +        goto out;
>>> +    }
>>> +
>>> +    mmap_read_lock(mm);
>>> +    /*
>>> +     * always increase addr by PAGE_SIZE, since we could have a PTE page
>>> +     * table filled with PTE-mapped THPs, each of which is distinct.
>>> +     */
>>> +    for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
>>> +        struct vm_area_struct *vma = find_vma(mm, addr);
>>> +        unsigned int follflags;
>>> +        struct page *page;
>>> +
>>> +        if (!vma || addr < vma->vm_start || !vma_migratable(vma))
>>> +            break;
>>> +
>>> +        /* FOLL_DUMP to ignore special (like zero) pages */
>>> +        follflags = FOLL_GET | FOLL_DUMP;
>>> +        page = follow_page(vma, addr, follflags);
>>> +
>>> +        if (IS_ERR(page))
>>> +            break;
>>> +        if (!page)
>>> +            break;
>>> +
>>> +        if (!is_transparent_hugepage(page))
>>> +            continue;
>>> +
>>> +        total++;
>>> +        if (!can_split_huge_page(compound_head(page), NULL))
>>> +            continue;
>>> +
>>> +        if (!trylock_page(page))
>>> +            continue;
>>> +
>>> +        if (!split_huge_page(page))
>>> +            split++;
>>> +
>>> +        unlock_page(page);
>>> +        put_page(page);
>>> +    }
>>> +    mmap_read_unlock(mm);
>>> +    mmput(mm);
>>> +
>>> +    pr_debug("%lu of %lu THP split\n", split, total);
>>> +out:
>>> +    mutex_unlock(&mutex);
>>> +    return ret;
>>> +
>>> +}
>>> +
>>> +static const struct file_operations split_huge_pages_in_range_pid_fops = {
>>> +    .owner     = THIS_MODULE,
>>> +    .write     = split_huge_pages_in_range_pid_write,
>>> +    .llseek  = no_llseek,
>>> +};
>>> +
>>>   static int __init split_huge_pages_debugfs(void)
>>>   {
>>>       debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
>>>                   &split_huge_pages_fops);
>>> +    debugfs_create_file("split_huge_pages_in_range_pid", 0200, NULL, NULL,
>>> +                &split_huge_pages_in_range_pid_fops);
>>>       return 0;
>>>   }
>>>   late_initcall(split_huge_pages_debugfs);
>>> diff --git a/mm/internal.h b/mm/internal.h
>>> index 9902648f2206..1659d00100ef 100644
>>> --- a/mm/internal.h
>>> +++ b/mm/internal.h
>>> @@ -623,4 +623,5 @@ struct migration_target_control {
>>>       gfp_t gfp_mask;
>>>   };
>>>   +struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes);
>>>   #endif    /* __MM_INTERNAL_H */
>>> diff --git a/mm/migrate.c b/mm/migrate.c
>>> index 62b81d5257aa..ce5f213debb2 100644
>>> --- a/mm/migrate.c
>>> +++ b/mm/migrate.c
>>> @@ -1913,7 +1913,7 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
>>>       return nr_pages ? -EFAULT : 0;
>>>   }
>>>   -static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
>>> +struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
>>>   {
>>>       struct task_struct *task;
>>>       struct mm_struct *mm;
>>> diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
>>> index 9a35c3f6a557..1f651e85ed60 100644
>>> --- a/tools/testing/selftests/vm/.gitignore
>>> +++ b/tools/testing/selftests/vm/.gitignore
>>> @@ -22,3 +22,4 @@ map_fixed_noreplace
>>>   write_to_hugetlbfs
>>>   hmm-tests
>>>   local_config.*
>>> +split_huge_page_test
>>> diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
>>> index d42115e4284d..4cbc91d6869f 100644
>>> --- a/tools/testing/selftests/vm/Makefile
>>> +++ b/tools/testing/selftests/vm/Makefile
>>> @@ -42,6 +42,7 @@ TEST_GEN_FILES += on-fault-limit
>>>   TEST_GEN_FILES += thuge-gen
>>>   TEST_GEN_FILES += transhuge-stress
>>>   TEST_GEN_FILES += userfaultfd
>>> +TEST_GEN_FILES += split_huge_page_test
>>>     ifeq ($(MACHINE),x86_64)
>>>   CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_32bit_program.c -m32)
>>> diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c
>>> new file mode 100644
>>> index 000000000000..8ea8000fda62
>>> --- /dev/null
>>> +++ b/tools/testing/selftests/vm/split_huge_page_test.c
>>> @@ -0,0 +1,318 @@
>>> +// SPDX-License-Identifier: GPL-2.0
>>> +/*
>>> + * A test of splitting PMD THPs and PTE-mapped THPs from a specified virtual
>>> + * address range in a process via <debugfs>/split_huge_pages_in_range_pid
>>> + * interface.
>>> + */
>>> +
>>> +#define _GNU_SOURCE
>>> +#include <stdio.h>
>>> +#include <stdlib.h>
>>> +#include "numa.h"
>>> +#include <unistd.h>
>>> +#include <errno.h>
>>> +#include <inttypes.h>
>>> +#include <string.h>
>>> +#include <sys/types.h>
>>> +#include <sys/stat.h>
>>> +#include <fcntl.h>
>>> +#include <sys/mman.h>
>>> +#include <sys/time.h>
>>> +#include <sys/wait.h>
>>> +#include <malloc.h>
>>> +#include <stdbool.h>
>>> +
>>> +uint64_t pagesize;
>>> +unsigned int pageshift;
>>> +uint64_t pmd_pagesize;
>>> +
>>> +#define PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
>>> +#define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages_in_range_pid"
>>> +#define SMAP_PATH "/proc/self/smaps"
>>> +#define INPUT_MAX 80
>>> +
>>> +#define PFN_MASK     ((1UL<<55)-1)
>>> +#define KPF_THP      (1UL<<22)
>>> +
>>> +int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file)
>>> +{
>>> +    uint64_t paddr;
>>> +    uint64_t page_flags;
>>> +
>>> +    if (pagemap_file) {
>>> +        pread(pagemap_file, &paddr, sizeof(paddr),
>>> +            ((long)vaddr >> pageshift) * sizeof(paddr));
>>> +
>>> +        if (kpageflags_file) {
>>> +            pread(kpageflags_file, &page_flags, sizeof(page_flags),
>>> +                (paddr & PFN_MASK) * sizeof(page_flags));
>>> +
>>> +            return !!(page_flags & KPF_THP);
>>> +        }
>>> +    }
>>> +    return 0;
>>> +}
>>> +
>>> +
>>> +static uint64_t read_pmd_pagesize(void)
>>> +{
>>> +    int fd;
>>> +    char buf[20];
>>> +    ssize_t num_read;
>>> +
>>> +    fd = open(PMD_SIZE_PATH, O_RDONLY);
>>> +    if (fd == -1) {
>>> +        perror("Open hpage_pmd_size failed");
>>> +        exit(EXIT_FAILURE);
>>> +    }
>>> +    num_read = read(fd, buf, 19);
>>> +    if (num_read < 1) {
>>> +        close(fd);
>>> +        perror("Read hpage_pmd_size failed");
>>> +        exit(EXIT_FAILURE);
>>> +    }
>>> +    buf[num_read] = '\0';
>>> +    close(fd);
>>> +
>>> +    return strtoul(buf, NULL, 10);
>>> +}
>>> +
>>> +static int write_file(const char *path, const char *buf, size_t buflen)
>>> +{
>>> +    int fd;
>>> +    ssize_t numwritten;
>>> +
>>> +    fd = open(path, O_WRONLY);
>>> +    if (fd == -1)
>>> +        return 0;
>>> +
>>> +    numwritten = write(fd, buf, buflen - 1);
>>> +    close(fd);
>>> +    if (numwritten < 1)
>>> +        return 0;
>>> +
>>> +    return (unsigned int) numwritten;
>>> +}
>>> +
>>> +static void write_debugfs(int pid, uint64_t vaddr_start, uint64_t vaddr_end)
>>> +{
>>> +    char input[INPUT_MAX];
>>> +    int ret;
>>> +
>>> +    ret = snprintf(input, INPUT_MAX, "%d,0x%lx,0x%lx", pid, vaddr_start,
>>> +            vaddr_end);
>>> +    if (ret >= INPUT_MAX) {
>>> +        printf("%s: Debugfs input is too long\n", __func__);
>>> +        exit(EXIT_FAILURE);
>>> +    }
>>> +
>>> +    if (!write_file(SPLIT_DEBUGFS, input, ret + 1)) {
>>> +        perror(SPLIT_DEBUGFS);
>>> +        exit(EXIT_FAILURE);
>>> +    }
>>> +}
>>> +
>>> +#define MAX_LINE_LENGTH 500
>>> +
>>> +static bool check_for_pattern(FILE *fp, const char *pattern, char *buf)
>>> +{
>>> +    while (fgets(buf, MAX_LINE_LENGTH, fp) != NULL) {
>>> +        if (!strncmp(buf, pattern, strlen(pattern)))
>>> +            return true;
>>> +    }
>>> +    return false;
>>> +}
>>> +
>>> +static uint64_t check_huge(void *addr)
>>> +{
>>> +    uint64_t thp = 0;
>>> +    int ret;
>>> +    FILE *fp;
>>> +    char buffer[MAX_LINE_LENGTH];
>>> +    char addr_pattern[MAX_LINE_LENGTH];
>>> +
>>> +    ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
>>> +               (unsigned long) addr);
>>> +    if (ret >= MAX_LINE_LENGTH) {
>>> +        printf("%s: Pattern is too long\n", __func__);
>>> +        exit(EXIT_FAILURE);
>>> +    }
>>> +
>>> +
>>> +    fp = fopen(SMAP_PATH, "r");
>>> +    if (!fp) {
>>> +        printf("%s: Failed to open file %s\n", __func__, SMAP_PATH);
>>> +        exit(EXIT_FAILURE);
>>> +    }
>>> +    if (!check_for_pattern(fp, addr_pattern, buffer))
>>> +        goto err_out;
>>> +
>>> +    /*
>>> +     * Fetch the AnonHugePages: in the same block and check the number of
>>> +     * hugepages.
>>> +     */
>>> +    if (!check_for_pattern(fp, "AnonHugePages:", buffer))
>>> +        goto err_out;
>>> +
>>> +    if (sscanf(buffer, "AnonHugePages:%10ld kB", &thp) != 1) {
>>> +        printf("Reading smap error\n");
>>> +        exit(EXIT_FAILURE);
>>> +    }
>>> +
>>> +err_out:
>>> +    fclose(fp);
>>> +    return thp;
>>> +}
>>> +
>>> +void split_pmd_thp(void)
>>> +{
>>> +    char *one_page;
>>> +    size_t len = 4 * pmd_pagesize;
>>> +    uint64_t thp_size;
>>> +    size_t i;
>>> +
>>> +    one_page = memalign(pmd_pagesize, len);
>>> +
>>> +    madvise(one_page, len, MADV_HUGEPAGE);
>>> +
>>> +    for (i = 0; i < len; i++)
>>> +        one_page[i] = (char)i;
>>> +
>>> +    thp_size = check_huge(one_page);
>>> +    if (!thp_size) {
>>> +        printf("No THP is allocatd");
>>> +        exit(EXIT_FAILURE);
>>> +    }
>>> +
>>> +    /* split all possible huge pages */
>>> +    write_debugfs(getpid(), (uint64_t)one_page, (uint64_t)one_page + len);
>>> +
>>> +    for (i = 0; i < len; i++)
>>> +        if (one_page[i] != (char)i) {
>>> +            printf("%ld byte corrupted\n", i);
>>> +            exit(EXIT_FAILURE);
>>> +        }
>>> +
>>> +
>>> +    thp_size = check_huge(one_page);
>>> +    if (thp_size) {
>>> +        printf("Still %ld kB AnonHugePages not split\n", thp_size);
>>> +        exit(EXIT_FAILURE);
>>> +    }
>>> +
>>> +    printf("Split huge pages successful\n");
>>> +    free(one_page);
>>> +}
>>> +
>>> +void split_pte_mapped_thp(void)
>>> +{
>>> +    char *one_page, *pte_mapped, *pte_mapped2;
>>> +    size_t len = 4 * pmd_pagesize;
>>> +    uint64_t thp_size;
>>> +    size_t i;
>>> +    const char *pagemap_template = "/proc/%d/pagemap";
>>> +    const char *kpageflags_proc = "/proc/kpageflags";
>>> +    char pagemap_proc[255];
>>> +    int pagemap_fd;
>>> +    int kpageflags_fd;
>>> +
>>> +    if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0) {
>>> +        perror("get pagemap proc error");
>>> +        exit(EXIT_FAILURE);
>>> +    }
>>> +    pagemap_fd = open(pagemap_proc, O_RDONLY);
>>> +
>>> +    if (pagemap_fd == -1) {
>>> +        perror("read pagemap:");
>>> +        exit(EXIT_FAILURE);
>>> +    }
>>> +
>>> +    kpageflags_fd = open(kpageflags_proc, O_RDONLY);
>>> +
>>> +    if (kpageflags_fd == -1) {
>>> +        perror("read kpageflags:");
>>> +        exit(EXIT_FAILURE);
>>> +    }
>>> +
>>> +    one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE,
>>> +            MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
>>> +
>>> +    madvise(one_page, len, MADV_HUGEPAGE);
>>> +
>>> +    for (i = 0; i < len; i++)
>>> +        one_page[i] = (char)i;
>>> +
>>> +    thp_size = check_huge(one_page);
>>> +    if (!thp_size) {
>>> +        printf("No THP is allocatd");
>>> +        exit(EXIT_FAILURE);
>>> +    }
>>> +
>>> +    pte_mapped = mremap(one_page, pagesize, pagesize, MREMAP_MAYMOVE);
>>> +
>>> +    for (i = 1; i < 4; i++) {
>>> +        pte_mapped2 = mremap(one_page + pmd_pagesize * i + pagesize * i,
>>> +                     pagesize, pagesize,
>>> +                     MREMAP_MAYMOVE|MREMAP_FIXED,
>>> +                     pte_mapped + pagesize * i);
>>> +        if (pte_mapped2 == (char *)-1) {
>>> +            perror("mremap failed");
>>> +            exit(EXIT_FAILURE);
>>> +        }
>>> +    }
>>> +
>>> +    /* smap does not show THPs after mremap, use kpageflags instead */
>>> +    thp_size = 0;
>>> +    for (i = 0; i < pagesize * 4; i++)
>>> +        if (i % pagesize == 0 &&
>>> +            is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
>>> +            thp_size++;
>>> +
>>> +    if (thp_size != 4) {
>>> +        printf("Some THPs are missing during mremap\n");
>>> +        exit(EXIT_FAILURE);
>>> +    }
>>> +
>>> +    /* split all possible huge pages */
>>> +    write_debugfs(getpid(), (uint64_t)pte_mapped,
>>> +              (uint64_t)pte_mapped + pagesize * 4);
>>> +
>>> +    /* smap does not show THPs after mremap, use kpageflags instead */
>>> +    thp_size = 0;
>>> +    for (i = 0; i < pagesize * 4; i++) {
>>> +        if (pte_mapped[i] != (char)i) {
>>> +            printf("%ld byte corrupted\n", i);
>>> +            exit(EXIT_FAILURE);
>>> +        }
>>> +        if (i % pagesize == 0 &&
>>> +            is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
>>> +            thp_size++;
>>> +    }
>>> +
>>> +    if (thp_size) {
>>> +        printf("Still %ld THPs not split\n", thp_size);
>>> +        exit(EXIT_FAILURE);
>>> +    }
>>> +
>>> +    printf("Split PTE-mapped huge pages successful\n");
>>> +    munmap(one_page, len);
>>> +    close(pagemap_fd);
>>> +    close(kpageflags_fd);
>>> +}
>>> +
>>> +int main(int argc, char **argv)
>>> +{
>>> +    if (geteuid() != 0) {
>>> +        printf("Please run the benchmark as root\n");
>>> +        exit(EXIT_FAILURE);
>>> +    }
>>> +
>>> +    pagesize = getpagesize();
>>> +    pageshift = ffs(pagesize) - 1;
>>> +    pmd_pagesize = read_pmd_pagesize();
>>> +
>>> +    split_pmd_thp();
>>> +    split_pte_mapped_thp();
>>> +
>>> +    return 0;
>>> +}
>>


—
Best Regards,
Yan Zi
Zi Yan March 8, 2021, 7:01 p.m. UTC | #5
On 8 Mar 2021, at 13:11, David Hildenbrand wrote:

> On 08.03.21 18:49, Zi Yan wrote:
>> On 8 Mar 2021, at 11:17, David Hildenbrand wrote:
>>
>>> On 08.03.21 16:22, Zi Yan wrote:
>>>> From: Zi Yan <ziy@nvidia.com>
>>>>
>>>> By writing "<pid>,<vaddr_start>,<vaddr_end>" to
>>>> <debugfs>/split_huge_pages_in_range_pid, THPs in the process with the
>>>> given pid and virtual address range are split. It is used to test
>>>> split_huge_page function. In addition, a selftest program is added to
>>>> tools/testing/selftests/vm to utilize the interface by splitting
>>>> PMD THPs and PTE-mapped THPs.
>>>
>>> Won't something like
>>>
>>> 1. MADV_HUGEPAGE
>>>
>>> 2. Access memory
>>>
>>> 3. MADV_NOHUGEPAGE
>>>
>>> Have a similar effect? What's the benefit of this?
>>
>> Thanks for checking the patch.
>>
>> No, MADV_NOHUGEPAGE just replaces VM_HUGEPAGE with VM_NOHUGEPAGE,
>> nothing else will be done.
>
> Ah, okay - maybe my memory was tricking me. There is some s390x KVM code that forces MADV_NOHUGEPAGE and force-splits everything.
>
> I do wonder, though, if this functionality would be worth a proper user interface (e.g., madvise), though. There might be actual benefit in having this as a !debug interface.
>
> I think you aware of the discussion in https://lkml.kernel.org/r/d098c392-273a-36a4-1a29-59731cdf5d3d@google.com

Yes. Thanks for bringing this up.

>
> If there will be an interface to collapse a THP -- "this memory area is worth extra performance now by collapsing a THP if possible" -- it might also be helpful to have the opposite functionality -- "this memory area is not worth a THP, rather use that somehwere else".
>
> MADV_HUGE_COLLAPSE vs. MADV_HUGE_SPLIT

I agree that MADV_HUGE_SPLIT would be useful as the opposite of COLLAPSE when user might just want PAGESIZE mappings.
Right now, HUGE_SPLIT is implicit from mapping changes like mprotect or MADV_DONTNEED.

My debugfs interface is a little different here, since it splits the compound pages mapped by the PMD mapping
(of course the PMD mapping is split too), whereas madvise only splits the PMD. I did not put it in a !debug
interface because I do not think we want to expose the kernel mechanism (the compound page) to the user and
let them decide when to split the compound page or not. MADV_HUGE_COLLAPSE is different, because we need
to form a compound THP to be able to get PMD mappings. But I am happy to change my mind if we find usefulness
in letting user split compound THPs via !debug interface.


>
> Just a thought.
>
>>
>> Without this, we do not have a way of splitting a specific THP
>> (the compound page) via any user interface for debugging.
>> We can only write to <debugfs>/split_huge_pages to split all THPs
>> in the system, which has an unwanted effect on the whole system
>> and can overwhelm us with a lot of information. This new debugfs
>> interface provides a more precise method.
>>
>>>>
>>>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>>>> ---
>>>>    mm/huge_memory.c                              |  98 ++++++
>>>>    mm/internal.h                                 |   1 +
>>>>    mm/migrate.c                                  |   2 +-
>>>>    tools/testing/selftests/vm/.gitignore         |   1 +
>>>>    tools/testing/selftests/vm/Makefile           |   1 +
>>>>    .../selftests/vm/split_huge_page_test.c       | 318 ++++++++++++++++++
>>>>    6 files changed, 420 insertions(+), 1 deletion(-)
>>>>    create mode 100644 tools/testing/selftests/vm/split_huge_page_test.c
>>>>
>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>> index 395c75111d33..818172f887bf 100644
>>>> --- a/mm/huge_memory.c
>>>> +++ b/mm/huge_memory.c
>>>> @@ -7,6 +7,7 @@
>>>>     #include <linux/mm.h>
>>>>    #include <linux/sched.h>
>>>> +#include <linux/sched/mm.h>
>>>>    #include <linux/sched/coredump.h>
>>>>    #include <linux/sched/numa_balancing.h>
>>>>    #include <linux/highmem.h>
>>>> @@ -2971,10 +2972,107 @@ static int split_huge_pages_set(void *data, u64 val)
>>>>    DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
>>>>    		"%llu\n");
>>>>   +static ssize_t split_huge_pages_in_range_pid_write(struct file *file,
>>>> +		const char __user *buf, size_t count, loff_t *ppops)
>>>> +{
>>>> +	static DEFINE_MUTEX(mutex);
>>>> +	ssize_t ret;
>>>> +	char input_buf[80]; /* hold pid, start_vaddr, end_vaddr */
>>>> +	int pid;
>>>> +	unsigned long vaddr_start, vaddr_end, addr;
>>>> +	nodemask_t task_nodes;
>>>> +	struct mm_struct *mm;
>>>> +	unsigned long total = 0, split = 0;
>>>> +
>>>> +	ret = mutex_lock_interruptible(&mutex);
>>>> +	if (ret)
>>>> +		return ret;
>>>> +
>>>> +	ret = -EFAULT;
>>>> +
>>>> +	memset(input_buf, 0, 80);
>>>> +	if (copy_from_user(input_buf, buf, min_t(size_t, count, 80)))
>>>> +		goto out;
>>>> +
>>>> +	input_buf[79] = '\0';
>>>> +	ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end);
>>>> +	if (ret != 3) {
>>>> +		ret = -EINVAL;
>>>> +		goto out;
>>>> +	}
>>>> +	vaddr_start &= PAGE_MASK;
>>>> +	vaddr_end &= PAGE_MASK;
>>>> +
>>>> +	ret = strlen(input_buf);
>>>> +	pr_debug("split huge pages in pid: %d, vaddr: [%lx - %lx]\n",
>>>> +		 pid, vaddr_start, vaddr_end);
>>>> +
>>>> +	mm = find_mm_struct(pid, &task_nodes);
>>>> +	if (IS_ERR(mm)) {
>>>> +		ret = -EINVAL;
>>>> +		goto out;
>>>> +	}
>>>> +
>>>> +	mmap_read_lock(mm);
>>>> +	/*
>>>> +	 * always increase addr by PAGE_SIZE, since we could have a PTE page
>>>> +	 * table filled with PTE-mapped THPs, each of which is distinct.
>>>> +	 */
>>>> +	for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
>>>> +		struct vm_area_struct *vma = find_vma(mm, addr);
>>>> +		unsigned int follflags;
>>>> +		struct page *page;
>>>> +
>>>> +		if (!vma || addr < vma->vm_start || !vma_migratable(vma))
>>>> +			break;
>>>> +
>>>> +		/* FOLL_DUMP to ignore special (like zero) pages */
>>>> +		follflags = FOLL_GET | FOLL_DUMP;
>>>> +		page = follow_page(vma, addr, follflags);
>>>> +
>>>> +		if (IS_ERR(page))
>>>> +			break;
>>>> +		if (!page)
>>>> +			break;
>>>> +
>>>> +		if (!is_transparent_hugepage(page))
>>>> +			continue;
>>>> +
>>>> +		total++;
>>>> +		if (!can_split_huge_page(compound_head(page), NULL))
>>>> +			continue;
>>>> +
>>>> +		if (!trylock_page(page))
>>>> +			continue;
>>>> +
>>>> +		if (!split_huge_page(page))
>>>> +			split++;
>>>> +
>>>> +		unlock_page(page);
>>>> +		put_page(page);
>>>> +	}
>>>> +	mmap_read_unlock(mm);
>>>> +	mmput(mm);
>>>> +
>>>> +	pr_debug("%lu of %lu THP split\n", split, total);
>>>> +out:
>>>> +	mutex_unlock(&mutex);
>>>> +	return ret;
>>>> +
>>>> +}
>>>> +
>>>> +static const struct file_operations split_huge_pages_in_range_pid_fops = {
>>>> +	.owner	 = THIS_MODULE,
>>>> +	.write	 = split_huge_pages_in_range_pid_write,
>>>> +	.llseek  = no_llseek,
>>>> +};
>>>> +
>>>>    static int __init split_huge_pages_debugfs(void)
>>>>    {
>>>>    	debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
>>>>    			    &split_huge_pages_fops);
>>>> +	debugfs_create_file("split_huge_pages_in_range_pid", 0200, NULL, NULL,
>>>> +			    &split_huge_pages_in_range_pid_fops);
>>>>    	return 0;
>>>>    }
>>>>    late_initcall(split_huge_pages_debugfs);
>>>> diff --git a/mm/internal.h b/mm/internal.h
>>>> index 9902648f2206..1659d00100ef 100644
>>>> --- a/mm/internal.h
>>>> +++ b/mm/internal.h
>>>> @@ -623,4 +623,5 @@ struct migration_target_control {
>>>>    	gfp_t gfp_mask;
>>>>    };
>>>>   +struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes);
>>>>    #endif	/* __MM_INTERNAL_H */
>>>> diff --git a/mm/migrate.c b/mm/migrate.c
>>>> index 62b81d5257aa..ce5f213debb2 100644
>>>> --- a/mm/migrate.c
>>>> +++ b/mm/migrate.c
>>>> @@ -1913,7 +1913,7 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
>>>>    	return nr_pages ? -EFAULT : 0;
>>>>    }
>>>>   -static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
>>>> +struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
>>>>    {
>>>>    	struct task_struct *task;
>>>>    	struct mm_struct *mm;
>>>> diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
>>>> index 9a35c3f6a557..1f651e85ed60 100644
>>>> --- a/tools/testing/selftests/vm/.gitignore
>>>> +++ b/tools/testing/selftests/vm/.gitignore
>>>> @@ -22,3 +22,4 @@ map_fixed_noreplace
>>>>    write_to_hugetlbfs
>>>>    hmm-tests
>>>>    local_config.*
>>>> +split_huge_page_test
>>>> diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
>>>> index d42115e4284d..4cbc91d6869f 100644
>>>> --- a/tools/testing/selftests/vm/Makefile
>>>> +++ b/tools/testing/selftests/vm/Makefile
>>>> @@ -42,6 +42,7 @@ TEST_GEN_FILES += on-fault-limit
>>>>    TEST_GEN_FILES += thuge-gen
>>>>    TEST_GEN_FILES += transhuge-stress
>>>>    TEST_GEN_FILES += userfaultfd
>>>> +TEST_GEN_FILES += split_huge_page_test
>>>>     ifeq ($(MACHINE),x86_64)
>>>>    CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_32bit_program.c -m32)
>>>> diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c
>>>> new file mode 100644
>>>> index 000000000000..8ea8000fda62
>>>> --- /dev/null
>>>> +++ b/tools/testing/selftests/vm/split_huge_page_test.c
>>>> @@ -0,0 +1,318 @@
>>>> +// SPDX-License-Identifier: GPL-2.0
>>>> +/*
>>>> + * A test of splitting PMD THPs and PTE-mapped THPs from a specified virtual
>>>> + * address range in a process via <debugfs>/split_huge_pages_in_range_pid
>>>> + * interface.
>>>> + */
>>>> +
>>>> +#define _GNU_SOURCE
>>>> +#include <stdio.h>
>>>> +#include <stdlib.h>
>>>> +#include "numa.h"
>>>> +#include <unistd.h>
>>>> +#include <errno.h>
>>>> +#include <inttypes.h>
>>>> +#include <string.h>
>>>> +#include <sys/types.h>
>>>> +#include <sys/stat.h>
>>>> +#include <fcntl.h>
>>>> +#include <sys/mman.h>
>>>> +#include <sys/time.h>
>>>> +#include <sys/wait.h>
>>>> +#include <malloc.h>
>>>> +#include <stdbool.h>
>>>> +
>>>> +uint64_t pagesize;
>>>> +unsigned int pageshift;
>>>> +uint64_t pmd_pagesize;
>>>> +
>>>> +#define PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
>>>> +#define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages_in_range_pid"
>>>> +#define SMAP_PATH "/proc/self/smaps"
>>>> +#define INPUT_MAX 80
>>>> +
>>>> +#define PFN_MASK     ((1UL<<55)-1)
>>>> +#define KPF_THP      (1UL<<22)
>>>> +
>>>> +int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file)
>>>> +{
>>>> +	uint64_t paddr;
>>>> +	uint64_t page_flags;
>>>> +
>>>> +	if (pagemap_file) {
>>>> +		pread(pagemap_file, &paddr, sizeof(paddr),
>>>> +			((long)vaddr >> pageshift) * sizeof(paddr));
>>>> +
>>>> +		if (kpageflags_file) {
>>>> +			pread(kpageflags_file, &page_flags, sizeof(page_flags),
>>>> +				(paddr & PFN_MASK) * sizeof(page_flags));
>>>> +
>>>> +			return !!(page_flags & KPF_THP);
>>>> +		}
>>>> +	}
>>>> +	return 0;
>>>> +}
>>>> +
>>>> +
>>>> +static uint64_t read_pmd_pagesize(void)
>>>> +{
>>>> +	int fd;
>>>> +	char buf[20];
>>>> +	ssize_t num_read;
>>>> +
>>>> +	fd = open(PMD_SIZE_PATH, O_RDONLY);
>>>> +	if (fd == -1) {
>>>> +		perror("Open hpage_pmd_size failed");
>>>> +		exit(EXIT_FAILURE);
>>>> +	}
>>>> +	num_read = read(fd, buf, 19);
>>>> +	if (num_read < 1) {
>>>> +		close(fd);
>>>> +		perror("Read hpage_pmd_size failed");
>>>> +		exit(EXIT_FAILURE);
>>>> +	}
>>>> +	buf[num_read] = '\0';
>>>> +	close(fd);
>>>> +
>>>> +	return strtoul(buf, NULL, 10);
>>>> +}
>>>> +
>>>> +static int write_file(const char *path, const char *buf, size_t buflen)
>>>> +{
>>>> +	int fd;
>>>> +	ssize_t numwritten;
>>>> +
>>>> +	fd = open(path, O_WRONLY);
>>>> +	if (fd == -1)
>>>> +		return 0;
>>>> +
>>>> +	numwritten = write(fd, buf, buflen - 1);
>>>> +	close(fd);
>>>> +	if (numwritten < 1)
>>>> +		return 0;
>>>> +
>>>> +	return (unsigned int) numwritten;
>>>> +}
>>>> +
>>>> +static void write_debugfs(int pid, uint64_t vaddr_start, uint64_t vaddr_end)
>>>> +{
>>>> +	char input[INPUT_MAX];
>>>> +	int ret;
>>>> +
>>>> +	ret = snprintf(input, INPUT_MAX, "%d,0x%lx,0x%lx", pid, vaddr_start,
>>>> +			vaddr_end);
>>>> +	if (ret >= INPUT_MAX) {
>>>> +		printf("%s: Debugfs input is too long\n", __func__);
>>>> +		exit(EXIT_FAILURE);
>>>> +	}
>>>> +
>>>> +	if (!write_file(SPLIT_DEBUGFS, input, ret + 1)) {
>>>> +		perror(SPLIT_DEBUGFS);
>>>> +		exit(EXIT_FAILURE);
>>>> +	}
>>>> +}
>>>> +
>>>> +#define MAX_LINE_LENGTH 500
>>>> +
>>>> +static bool check_for_pattern(FILE *fp, const char *pattern, char *buf)
>>>> +{
>>>> +	while (fgets(buf, MAX_LINE_LENGTH, fp) != NULL) {
>>>> +		if (!strncmp(buf, pattern, strlen(pattern)))
>>>> +			return true;
>>>> +	}
>>>> +	return false;
>>>> +}
>>>> +
>>>> +static uint64_t check_huge(void *addr)
>>>> +{
>>>> +	uint64_t thp = 0;
>>>> +	int ret;
>>>> +	FILE *fp;
>>>> +	char buffer[MAX_LINE_LENGTH];
>>>> +	char addr_pattern[MAX_LINE_LENGTH];
>>>> +
>>>> +	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
>>>> +		       (unsigned long) addr);
>>>> +	if (ret >= MAX_LINE_LENGTH) {
>>>> +		printf("%s: Pattern is too long\n", __func__);
>>>> +		exit(EXIT_FAILURE);
>>>> +	}
>>>> +
>>>> +
>>>> +	fp = fopen(SMAP_PATH, "r");
>>>> +	if (!fp) {
>>>> +		printf("%s: Failed to open file %s\n", __func__, SMAP_PATH);
>>>> +		exit(EXIT_FAILURE);
>>>> +	}
>>>> +	if (!check_for_pattern(fp, addr_pattern, buffer))
>>>> +		goto err_out;
>>>> +
>>>> +	/*
>>>> +	 * Fetch the AnonHugePages: in the same block and check the number of
>>>> +	 * hugepages.
>>>> +	 */
>>>> +	if (!check_for_pattern(fp, "AnonHugePages:", buffer))
>>>> +		goto err_out;
>>>> +
>>>> +	if (sscanf(buffer, "AnonHugePages:%10ld kB", &thp) != 1) {
>>>> +		printf("Reading smap error\n");
>>>> +		exit(EXIT_FAILURE);
>>>> +	}
>>>> +
>>>> +err_out:
>>>> +	fclose(fp);
>>>> +	return thp;
>>>> +}
>>>> +
>>>> +void split_pmd_thp(void)
>>>> +{
>>>> +	char *one_page;
>>>> +	size_t len = 4 * pmd_pagesize;
>>>> +	uint64_t thp_size;
>>>> +	size_t i;
>>>> +
>>>> +	one_page = memalign(pmd_pagesize, len);
>>>> +
>>>> +	madvise(one_page, len, MADV_HUGEPAGE);
>>>> +
>>>> +	for (i = 0; i < len; i++)
>>>> +		one_page[i] = (char)i;
>>>> +
>>>> +	thp_size = check_huge(one_page);
>>>> +	if (!thp_size) {
>>>> +		printf("No THP is allocatd");
>>>> +		exit(EXIT_FAILURE);
>>>> +	}
>>>> +
>>>> +	/* split all possible huge pages */
>>>> +	write_debugfs(getpid(), (uint64_t)one_page, (uint64_t)one_page + len);
>>>> +
>>>> +	for (i = 0; i < len; i++)
>>>> +		if (one_page[i] != (char)i) {
>>>> +			printf("%ld byte corrupted\n", i);
>>>> +			exit(EXIT_FAILURE);
>>>> +		}
>>>> +
>>>> +
>>>> +	thp_size = check_huge(one_page);
>>>> +	if (thp_size) {
>>>> +		printf("Still %ld kB AnonHugePages not split\n", thp_size);
>>>> +		exit(EXIT_FAILURE);
>>>> +	}
>>>> +
>>>> +	printf("Split huge pages successful\n");
>>>> +	free(one_page);
>>>> +}
>>>> +
>>>> +void split_pte_mapped_thp(void)
>>>> +{
>>>> +	char *one_page, *pte_mapped, *pte_mapped2;
>>>> +	size_t len = 4 * pmd_pagesize;
>>>> +	uint64_t thp_size;
>>>> +	size_t i;
>>>> +	const char *pagemap_template = "/proc/%d/pagemap";
>>>> +	const char *kpageflags_proc = "/proc/kpageflags";
>>>> +	char pagemap_proc[255];
>>>> +	int pagemap_fd;
>>>> +	int kpageflags_fd;
>>>> +
>>>> +	if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0) {
>>>> +		perror("get pagemap proc error");
>>>> +		exit(EXIT_FAILURE);
>>>> +	}
>>>> +	pagemap_fd = open(pagemap_proc, O_RDONLY);
>>>> +
>>>> +	if (pagemap_fd == -1) {
>>>> +		perror("read pagemap:");
>>>> +		exit(EXIT_FAILURE);
>>>> +	}
>>>> +
>>>> +	kpageflags_fd = open(kpageflags_proc, O_RDONLY);
>>>> +
>>>> +	if (kpageflags_fd == -1) {
>>>> +		perror("read kpageflags:");
>>>> +		exit(EXIT_FAILURE);
>>>> +	}
>>>> +
>>>> +	one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE,
>>>> +			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
>>>> +
>>>> +	madvise(one_page, len, MADV_HUGEPAGE);
>>>> +
>>>> +	for (i = 0; i < len; i++)
>>>> +		one_page[i] = (char)i;
>>>> +
>>>> +	thp_size = check_huge(one_page);
>>>> +	if (!thp_size) {
>>>> +		printf("No THP is allocatd");
>>>> +		exit(EXIT_FAILURE);
>>>> +	}
>>>> +
>>>> +	pte_mapped = mremap(one_page, pagesize, pagesize, MREMAP_MAYMOVE);
>>>> +
>>>> +	for (i = 1; i < 4; i++) {
>>>> +		pte_mapped2 = mremap(one_page + pmd_pagesize * i + pagesize * i,
>>>> +				     pagesize, pagesize,
>>>> +				     MREMAP_MAYMOVE|MREMAP_FIXED,
>>>> +				     pte_mapped + pagesize * i);
>>>> +		if (pte_mapped2 == (char *)-1) {
>>>> +			perror("mremap failed");
>>>> +			exit(EXIT_FAILURE);
>>>> +		}
>>>> +	}
>>>> +
>>>> +	/* smap does not show THPs after mremap, use kpageflags instead */
>>>> +	thp_size = 0;
>>>> +	for (i = 0; i < pagesize * 4; i++)
>>>> +		if (i % pagesize == 0 &&
>>>> +		    is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
>>>> +			thp_size++;
>>>> +
>>>> +	if (thp_size != 4) {
>>>> +		printf("Some THPs are missing during mremap\n");
>>>> +		exit(EXIT_FAILURE);
>>>> +	}
>>>> +
>>>> +	/* split all possible huge pages */
>>>> +	write_debugfs(getpid(), (uint64_t)pte_mapped,
>>>> +		      (uint64_t)pte_mapped + pagesize * 4);
>>>> +
>>>> +	/* smap does not show THPs after mremap, use kpageflags instead */
>>>> +	thp_size = 0;
>>>> +	for (i = 0; i < pagesize * 4; i++) {
>>>> +		if (pte_mapped[i] != (char)i) {
>>>> +			printf("%ld byte corrupted\n", i);
>>>> +			exit(EXIT_FAILURE);
>>>> +		}
>>>> +		if (i % pagesize == 0 &&
>>>> +		    is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
>>>> +			thp_size++;
>>>> +	}
>>>> +
>>>> +	if (thp_size) {
>>>> +		printf("Still %ld THPs not split\n", thp_size);
>>>> +		exit(EXIT_FAILURE);
>>>> +	}
>>>> +
>>>> +	printf("Split PTE-mapped huge pages successful\n");
>>>> +	munmap(one_page, len);
>>>> +	close(pagemap_fd);
>>>> +	close(kpageflags_fd);
>>>> +}
>>>> +
>>>> +int main(int argc, char **argv)
>>>> +{
>>>> +	if (geteuid() != 0) {
>>>> +		printf("Please run the benchmark as root\n");
>>>> +		exit(EXIT_FAILURE);
>>>> +	}
>>>> +
>>>> +	pagesize = getpagesize();
>>>> +	pageshift = ffs(pagesize) - 1;
>>>> +	pmd_pagesize = read_pmd_pagesize();
>>>> +
>>>> +	split_pmd_thp();
>>>> +	split_pte_mapped_thp();
>>>> +
>>>> +	return 0;
>>>> +}
>>>>
>>>
>>>
>>> -- 
>>> Thanks,
>>>
>>> David / dhildenb
>>
>>
>> —
>> Best Regards,
>> Yan Zi
>>
>
>
> -- 
> Thanks,
>
> David / dhildenb


—
Best Regards,
Yan Zi
Yang Shi March 8, 2021, 7:11 p.m. UTC | #6
On Mon, Mar 8, 2021 at 11:01 AM Zi Yan <ziy@nvidia.com> wrote:
>
> On 8 Mar 2021, at 13:11, David Hildenbrand wrote:
>
> > On 08.03.21 18:49, Zi Yan wrote:
> >> On 8 Mar 2021, at 11:17, David Hildenbrand wrote:
> >>
> >>> On 08.03.21 16:22, Zi Yan wrote:
> >>>> From: Zi Yan <ziy@nvidia.com>
> >>>>
> >>>> By writing "<pid>,<vaddr_start>,<vaddr_end>" to
> >>>> <debugfs>/split_huge_pages_in_range_pid, THPs in the process with the
> >>>> given pid and virtual address range are split. It is used to test
> >>>> split_huge_page function. In addition, a selftest program is added to
> >>>> tools/testing/selftests/vm to utilize the interface by splitting
> >>>> PMD THPs and PTE-mapped THPs.
> >>>
> >>> Won't something like
> >>>
> >>> 1. MADV_HUGEPAGE
> >>>
> >>> 2. Access memory
> >>>
> >>> 3. MADV_NOHUGEPAGE
> >>>
> >>> Have a similar effect? What's the benefit of this?
> >>
> >> Thanks for checking the patch.
> >>
> >> No, MADV_NOHUGEPAGE just replaces VM_HUGEPAGE with VM_NOHUGEPAGE,
> >> nothing else will be done.
> >
> > Ah, okay - maybe my memory was tricking me. There is some s390x KVM code that forces MADV_NOHUGEPAGE and force-splits everything.
> >
> > I do wonder, though, if this functionality would be worth a proper user interface (e.g., madvise), though. There might be actual benefit in having this as a !debug interface.
> >
> > I think you aware of the discussion in https://lkml.kernel.org/r/d098c392-273a-36a4-1a29-59731cdf5d3d@google.com
>
> Yes. Thanks for bringing this up.
>
> >
> > If there will be an interface to collapse a THP -- "this memory area is worth extra performance now by collapsing a THP if possible" -- it might also be helpful to have the opposite functionality -- "this memory area is not worth a THP, rather use that somehwere else".
> >
> > MADV_HUGE_COLLAPSE vs. MADV_HUGE_SPLIT
>
> I agree that MADV_HUGE_SPLIT would be useful as the opposite of COLLAPSE when user might just want PAGESIZE mappings.
> Right now, HUGE_SPLIT is implicit from mapping changes like mprotect or MADV_DONTNEED.

IMHO, it sounds not very useful. MADV_DONTNEED would split PMD for any
partial THP. If the range covers the whole THP, the whole THP is going
to be freed anyway. All other places in kernel which need split THP
have been covered. So I didn't realize any usecase from userspace for
just splitting PMD to PTEs.

>
> My debugfs interface is a little different here, since it splits the compound pages mapped by the PMD mapping
> (of course the PMD mapping is split too), whereas madvise only splits the PMD. I did not put it in a !debug
> interface because I do not think we want to expose the kernel mechanism (the compound page) to the user and
> let them decide when to split the compound page or not. MADV_HUGE_COLLAPSE is different, because we need
> to form a compound THP to be able to get PMD mappings. But I am happy to change my mind if we find usefulness
> in letting user split compound THPs via !debug interface.

There might be a usecase for splitting THP earlier. MADV_DONTNEED
would put the THP on deferred split queue, then the THPs would get
split when memory pressure is hit. So someone might want to get the
free memory back sooner rather than waiting for reclaimer.

>
>
> >
> > Just a thought.
> >
> >>
> >> Without this, we do not have a way of splitting a specific THP
> >> (the compound page) via any user interface for debugging.
> >> We can only write to <debugfs>/split_huge_pages to split all THPs
> >> in the system, which has an unwanted effect on the whole system
> >> and can overwhelm us with a lot of information. This new debugfs
> >> interface provides a more precise method.
> >>
> >>>>
> >>>> Signed-off-by: Zi Yan <ziy@nvidia.com>
> >>>> ---
> >>>>    mm/huge_memory.c                              |  98 ++++++
> >>>>    mm/internal.h                                 |   1 +
> >>>>    mm/migrate.c                                  |   2 +-
> >>>>    tools/testing/selftests/vm/.gitignore         |   1 +
> >>>>    tools/testing/selftests/vm/Makefile           |   1 +
> >>>>    .../selftests/vm/split_huge_page_test.c       | 318 ++++++++++++++++++
> >>>>    6 files changed, 420 insertions(+), 1 deletion(-)
> >>>>    create mode 100644 tools/testing/selftests/vm/split_huge_page_test.c
> >>>>
> >>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> >>>> index 395c75111d33..818172f887bf 100644
> >>>> --- a/mm/huge_memory.c
> >>>> +++ b/mm/huge_memory.c
> >>>> @@ -7,6 +7,7 @@
> >>>>     #include <linux/mm.h>
> >>>>    #include <linux/sched.h>
> >>>> +#include <linux/sched/mm.h>
> >>>>    #include <linux/sched/coredump.h>
> >>>>    #include <linux/sched/numa_balancing.h>
> >>>>    #include <linux/highmem.h>
> >>>> @@ -2971,10 +2972,107 @@ static int split_huge_pages_set(void *data, u64 val)
> >>>>    DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
> >>>>                    "%llu\n");
> >>>>   +static ssize_t split_huge_pages_in_range_pid_write(struct file *file,
> >>>> +          const char __user *buf, size_t count, loff_t *ppops)
> >>>> +{
> >>>> +  static DEFINE_MUTEX(mutex);
> >>>> +  ssize_t ret;
> >>>> +  char input_buf[80]; /* hold pid, start_vaddr, end_vaddr */
> >>>> +  int pid;
> >>>> +  unsigned long vaddr_start, vaddr_end, addr;
> >>>> +  nodemask_t task_nodes;
> >>>> +  struct mm_struct *mm;
> >>>> +  unsigned long total = 0, split = 0;
> >>>> +
> >>>> +  ret = mutex_lock_interruptible(&mutex);
> >>>> +  if (ret)
> >>>> +          return ret;
> >>>> +
> >>>> +  ret = -EFAULT;
> >>>> +
> >>>> +  memset(input_buf, 0, 80);
> >>>> +  if (copy_from_user(input_buf, buf, min_t(size_t, count, 80)))
> >>>> +          goto out;
> >>>> +
> >>>> +  input_buf[79] = '\0';
> >>>> +  ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end);
> >>>> +  if (ret != 3) {
> >>>> +          ret = -EINVAL;
> >>>> +          goto out;
> >>>> +  }
> >>>> +  vaddr_start &= PAGE_MASK;
> >>>> +  vaddr_end &= PAGE_MASK;
> >>>> +
> >>>> +  ret = strlen(input_buf);
> >>>> +  pr_debug("split huge pages in pid: %d, vaddr: [%lx - %lx]\n",
> >>>> +           pid, vaddr_start, vaddr_end);
> >>>> +
> >>>> +  mm = find_mm_struct(pid, &task_nodes);
> >>>> +  if (IS_ERR(mm)) {
> >>>> +          ret = -EINVAL;
> >>>> +          goto out;
> >>>> +  }
> >>>> +
> >>>> +  mmap_read_lock(mm);
> >>>> +  /*
> >>>> +   * always increase addr by PAGE_SIZE, since we could have a PTE page
> >>>> +   * table filled with PTE-mapped THPs, each of which is distinct.
> >>>> +   */
> >>>> +  for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
> >>>> +          struct vm_area_struct *vma = find_vma(mm, addr);
> >>>> +          unsigned int follflags;
> >>>> +          struct page *page;
> >>>> +
> >>>> +          if (!vma || addr < vma->vm_start || !vma_migratable(vma))
> >>>> +                  break;
> >>>> +
> >>>> +          /* FOLL_DUMP to ignore special (like zero) pages */
> >>>> +          follflags = FOLL_GET | FOLL_DUMP;
> >>>> +          page = follow_page(vma, addr, follflags);
> >>>> +
> >>>> +          if (IS_ERR(page))
> >>>> +                  break;
> >>>> +          if (!page)
> >>>> +                  break;
> >>>> +
> >>>> +          if (!is_transparent_hugepage(page))
> >>>> +                  continue;
> >>>> +
> >>>> +          total++;
> >>>> +          if (!can_split_huge_page(compound_head(page), NULL))
> >>>> +                  continue;
> >>>> +
> >>>> +          if (!trylock_page(page))
> >>>> +                  continue;
> >>>> +
> >>>> +          if (!split_huge_page(page))
> >>>> +                  split++;
> >>>> +
> >>>> +          unlock_page(page);
> >>>> +          put_page(page);
> >>>> +  }
> >>>> +  mmap_read_unlock(mm);
> >>>> +  mmput(mm);
> >>>> +
> >>>> +  pr_debug("%lu of %lu THP split\n", split, total);
> >>>> +out:
> >>>> +  mutex_unlock(&mutex);
> >>>> +  return ret;
> >>>> +
> >>>> +}
> >>>> +
> >>>> +static const struct file_operations split_huge_pages_in_range_pid_fops = {
> >>>> +  .owner   = THIS_MODULE,
> >>>> +  .write   = split_huge_pages_in_range_pid_write,
> >>>> +  .llseek  = no_llseek,
> >>>> +};
> >>>> +
> >>>>    static int __init split_huge_pages_debugfs(void)
> >>>>    {
> >>>>            debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
> >>>>                                &split_huge_pages_fops);
> >>>> +  debugfs_create_file("split_huge_pages_in_range_pid", 0200, NULL, NULL,
> >>>> +                      &split_huge_pages_in_range_pid_fops);
> >>>>            return 0;
> >>>>    }
> >>>>    late_initcall(split_huge_pages_debugfs);
> >>>> diff --git a/mm/internal.h b/mm/internal.h
> >>>> index 9902648f2206..1659d00100ef 100644
> >>>> --- a/mm/internal.h
> >>>> +++ b/mm/internal.h
> >>>> @@ -623,4 +623,5 @@ struct migration_target_control {
> >>>>            gfp_t gfp_mask;
> >>>>    };
> >>>>   +struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes);
> >>>>    #endif  /* __MM_INTERNAL_H */
> >>>> diff --git a/mm/migrate.c b/mm/migrate.c
> >>>> index 62b81d5257aa..ce5f213debb2 100644
> >>>> --- a/mm/migrate.c
> >>>> +++ b/mm/migrate.c
> >>>> @@ -1913,7 +1913,7 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
> >>>>            return nr_pages ? -EFAULT : 0;
> >>>>    }
> >>>>   -static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
> >>>> +struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
> >>>>    {
> >>>>            struct task_struct *task;
> >>>>            struct mm_struct *mm;
> >>>> diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
> >>>> index 9a35c3f6a557..1f651e85ed60 100644
> >>>> --- a/tools/testing/selftests/vm/.gitignore
> >>>> +++ b/tools/testing/selftests/vm/.gitignore
> >>>> @@ -22,3 +22,4 @@ map_fixed_noreplace
> >>>>    write_to_hugetlbfs
> >>>>    hmm-tests
> >>>>    local_config.*
> >>>> +split_huge_page_test
> >>>> diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
> >>>> index d42115e4284d..4cbc91d6869f 100644
> >>>> --- a/tools/testing/selftests/vm/Makefile
> >>>> +++ b/tools/testing/selftests/vm/Makefile
> >>>> @@ -42,6 +42,7 @@ TEST_GEN_FILES += on-fault-limit
> >>>>    TEST_GEN_FILES += thuge-gen
> >>>>    TEST_GEN_FILES += transhuge-stress
> >>>>    TEST_GEN_FILES += userfaultfd
> >>>> +TEST_GEN_FILES += split_huge_page_test
> >>>>     ifeq ($(MACHINE),x86_64)
> >>>>    CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_32bit_program.c -m32)
> >>>> diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c
> >>>> new file mode 100644
> >>>> index 000000000000..8ea8000fda62
> >>>> --- /dev/null
> >>>> +++ b/tools/testing/selftests/vm/split_huge_page_test.c
> >>>> @@ -0,0 +1,318 @@
> >>>> +// SPDX-License-Identifier: GPL-2.0
> >>>> +/*
> >>>> + * A test of splitting PMD THPs and PTE-mapped THPs from a specified virtual
> >>>> + * address range in a process via <debugfs>/split_huge_pages_in_range_pid
> >>>> + * interface.
> >>>> + */
> >>>> +
> >>>> +#define _GNU_SOURCE
> >>>> +#include <stdio.h>
> >>>> +#include <stdlib.h>
> >>>> +#include "numa.h"
> >>>> +#include <unistd.h>
> >>>> +#include <errno.h>
> >>>> +#include <inttypes.h>
> >>>> +#include <string.h>
> >>>> +#include <sys/types.h>
> >>>> +#include <sys/stat.h>
> >>>> +#include <fcntl.h>
> >>>> +#include <sys/mman.h>
> >>>> +#include <sys/time.h>
> >>>> +#include <sys/wait.h>
> >>>> +#include <malloc.h>
> >>>> +#include <stdbool.h>
> >>>> +
> >>>> +uint64_t pagesize;
> >>>> +unsigned int pageshift;
> >>>> +uint64_t pmd_pagesize;
> >>>> +
> >>>> +#define PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
> >>>> +#define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages_in_range_pid"
> >>>> +#define SMAP_PATH "/proc/self/smaps"
> >>>> +#define INPUT_MAX 80
> >>>> +
> >>>> +#define PFN_MASK     ((1UL<<55)-1)
> >>>> +#define KPF_THP      (1UL<<22)
> >>>> +
> >>>> +int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file)
> >>>> +{
> >>>> +  uint64_t paddr;
> >>>> +  uint64_t page_flags;
> >>>> +
> >>>> +  if (pagemap_file) {
> >>>> +          pread(pagemap_file, &paddr, sizeof(paddr),
> >>>> +                  ((long)vaddr >> pageshift) * sizeof(paddr));
> >>>> +
> >>>> +          if (kpageflags_file) {
> >>>> +                  pread(kpageflags_file, &page_flags, sizeof(page_flags),
> >>>> +                          (paddr & PFN_MASK) * sizeof(page_flags));
> >>>> +
> >>>> +                  return !!(page_flags & KPF_THP);
> >>>> +          }
> >>>> +  }
> >>>> +  return 0;
> >>>> +}
> >>>> +
> >>>> +
> >>>> +static uint64_t read_pmd_pagesize(void)
> >>>> +{
> >>>> +  int fd;
> >>>> +  char buf[20];
> >>>> +  ssize_t num_read;
> >>>> +
> >>>> +  fd = open(PMD_SIZE_PATH, O_RDONLY);
> >>>> +  if (fd == -1) {
> >>>> +          perror("Open hpage_pmd_size failed");
> >>>> +          exit(EXIT_FAILURE);
> >>>> +  }
> >>>> +  num_read = read(fd, buf, 19);
> >>>> +  if (num_read < 1) {
> >>>> +          close(fd);
> >>>> +          perror("Read hpage_pmd_size failed");
> >>>> +          exit(EXIT_FAILURE);
> >>>> +  }
> >>>> +  buf[num_read] = '\0';
> >>>> +  close(fd);
> >>>> +
> >>>> +  return strtoul(buf, NULL, 10);
> >>>> +}
> >>>> +
> >>>> +static int write_file(const char *path, const char *buf, size_t buflen)
> >>>> +{
> >>>> +  int fd;
> >>>> +  ssize_t numwritten;
> >>>> +
> >>>> +  fd = open(path, O_WRONLY);
> >>>> +  if (fd == -1)
> >>>> +          return 0;
> >>>> +
> >>>> +  numwritten = write(fd, buf, buflen - 1);
> >>>> +  close(fd);
> >>>> +  if (numwritten < 1)
> >>>> +          return 0;
> >>>> +
> >>>> +  return (unsigned int) numwritten;
> >>>> +}
> >>>> +
> >>>> +static void write_debugfs(int pid, uint64_t vaddr_start, uint64_t vaddr_end)
> >>>> +{
> >>>> +  char input[INPUT_MAX];
> >>>> +  int ret;
> >>>> +
> >>>> +  ret = snprintf(input, INPUT_MAX, "%d,0x%lx,0x%lx", pid, vaddr_start,
> >>>> +                  vaddr_end);
> >>>> +  if (ret >= INPUT_MAX) {
> >>>> +          printf("%s: Debugfs input is too long\n", __func__);
> >>>> +          exit(EXIT_FAILURE);
> >>>> +  }
> >>>> +
> >>>> +  if (!write_file(SPLIT_DEBUGFS, input, ret + 1)) {
> >>>> +          perror(SPLIT_DEBUGFS);
> >>>> +          exit(EXIT_FAILURE);
> >>>> +  }
> >>>> +}
> >>>> +
> >>>> +#define MAX_LINE_LENGTH 500
> >>>> +
> >>>> +static bool check_for_pattern(FILE *fp, const char *pattern, char *buf)
> >>>> +{
> >>>> +  while (fgets(buf, MAX_LINE_LENGTH, fp) != NULL) {
> >>>> +          if (!strncmp(buf, pattern, strlen(pattern)))
> >>>> +                  return true;
> >>>> +  }
> >>>> +  return false;
> >>>> +}
> >>>> +
> >>>> +static uint64_t check_huge(void *addr)
> >>>> +{
> >>>> +  uint64_t thp = 0;
> >>>> +  int ret;
> >>>> +  FILE *fp;
> >>>> +  char buffer[MAX_LINE_LENGTH];
> >>>> +  char addr_pattern[MAX_LINE_LENGTH];
> >>>> +
> >>>> +  ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
> >>>> +                 (unsigned long) addr);
> >>>> +  if (ret >= MAX_LINE_LENGTH) {
> >>>> +          printf("%s: Pattern is too long\n", __func__);
> >>>> +          exit(EXIT_FAILURE);
> >>>> +  }
> >>>> +
> >>>> +
> >>>> +  fp = fopen(SMAP_PATH, "r");
> >>>> +  if (!fp) {
> >>>> +          printf("%s: Failed to open file %s\n", __func__, SMAP_PATH);
> >>>> +          exit(EXIT_FAILURE);
> >>>> +  }
> >>>> +  if (!check_for_pattern(fp, addr_pattern, buffer))
> >>>> +          goto err_out;
> >>>> +
> >>>> +  /*
> >>>> +   * Fetch the AnonHugePages: in the same block and check the number of
> >>>> +   * hugepages.
> >>>> +   */
> >>>> +  if (!check_for_pattern(fp, "AnonHugePages:", buffer))
> >>>> +          goto err_out;
> >>>> +
> >>>> +  if (sscanf(buffer, "AnonHugePages:%10ld kB", &thp) != 1) {
> >>>> +          printf("Reading smap error\n");
> >>>> +          exit(EXIT_FAILURE);
> >>>> +  }
> >>>> +
> >>>> +err_out:
> >>>> +  fclose(fp);
> >>>> +  return thp;
> >>>> +}
> >>>> +
> >>>> +void split_pmd_thp(void)
> >>>> +{
> >>>> +  char *one_page;
> >>>> +  size_t len = 4 * pmd_pagesize;
> >>>> +  uint64_t thp_size;
> >>>> +  size_t i;
> >>>> +
> >>>> +  one_page = memalign(pmd_pagesize, len);
> >>>> +
> >>>> +  madvise(one_page, len, MADV_HUGEPAGE);
> >>>> +
> >>>> +  for (i = 0; i < len; i++)
> >>>> +          one_page[i] = (char)i;
> >>>> +
> >>>> +  thp_size = check_huge(one_page);
> >>>> +  if (!thp_size) {
> >>>> +          printf("No THP is allocatd");
> >>>> +          exit(EXIT_FAILURE);
> >>>> +  }
> >>>> +
> >>>> +  /* split all possible huge pages */
> >>>> +  write_debugfs(getpid(), (uint64_t)one_page, (uint64_t)one_page + len);
> >>>> +
> >>>> +  for (i = 0; i < len; i++)
> >>>> +          if (one_page[i] != (char)i) {
> >>>> +                  printf("%ld byte corrupted\n", i);
> >>>> +                  exit(EXIT_FAILURE);
> >>>> +          }
> >>>> +
> >>>> +
> >>>> +  thp_size = check_huge(one_page);
> >>>> +  if (thp_size) {
> >>>> +          printf("Still %ld kB AnonHugePages not split\n", thp_size);
> >>>> +          exit(EXIT_FAILURE);
> >>>> +  }
> >>>> +
> >>>> +  printf("Split huge pages successful\n");
> >>>> +  free(one_page);
> >>>> +}
> >>>> +
> >>>> +void split_pte_mapped_thp(void)
> >>>> +{
> >>>> +  char *one_page, *pte_mapped, *pte_mapped2;
> >>>> +  size_t len = 4 * pmd_pagesize;
> >>>> +  uint64_t thp_size;
> >>>> +  size_t i;
> >>>> +  const char *pagemap_template = "/proc/%d/pagemap";
> >>>> +  const char *kpageflags_proc = "/proc/kpageflags";
> >>>> +  char pagemap_proc[255];
> >>>> +  int pagemap_fd;
> >>>> +  int kpageflags_fd;
> >>>> +
> >>>> +  if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0) {
> >>>> +          perror("get pagemap proc error");
> >>>> +          exit(EXIT_FAILURE);
> >>>> +  }
> >>>> +  pagemap_fd = open(pagemap_proc, O_RDONLY);
> >>>> +
> >>>> +  if (pagemap_fd == -1) {
> >>>> +          perror("read pagemap:");
> >>>> +          exit(EXIT_FAILURE);
> >>>> +  }
> >>>> +
> >>>> +  kpageflags_fd = open(kpageflags_proc, O_RDONLY);
> >>>> +
> >>>> +  if (kpageflags_fd == -1) {
> >>>> +          perror("read kpageflags:");
> >>>> +          exit(EXIT_FAILURE);
> >>>> +  }
> >>>> +
> >>>> +  one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE,
> >>>> +                  MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
> >>>> +
> >>>> +  madvise(one_page, len, MADV_HUGEPAGE);
> >>>> +
> >>>> +  for (i = 0; i < len; i++)
> >>>> +          one_page[i] = (char)i;
> >>>> +
> >>>> +  thp_size = check_huge(one_page);
> >>>> +  if (!thp_size) {
> >>>> +          printf("No THP is allocatd");
> >>>> +          exit(EXIT_FAILURE);
> >>>> +  }
> >>>> +
> >>>> +  pte_mapped = mremap(one_page, pagesize, pagesize, MREMAP_MAYMOVE);
> >>>> +
> >>>> +  for (i = 1; i < 4; i++) {
> >>>> +          pte_mapped2 = mremap(one_page + pmd_pagesize * i + pagesize * i,
> >>>> +                               pagesize, pagesize,
> >>>> +                               MREMAP_MAYMOVE|MREMAP_FIXED,
> >>>> +                               pte_mapped + pagesize * i);
> >>>> +          if (pte_mapped2 == (char *)-1) {
> >>>> +                  perror("mremap failed");
> >>>> +                  exit(EXIT_FAILURE);
> >>>> +          }
> >>>> +  }
> >>>> +
> >>>> +  /* smap does not show THPs after mremap, use kpageflags instead */
> >>>> +  thp_size = 0;
> >>>> +  for (i = 0; i < pagesize * 4; i++)
> >>>> +          if (i % pagesize == 0 &&
> >>>> +              is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
> >>>> +                  thp_size++;
> >>>> +
> >>>> +  if (thp_size != 4) {
> >>>> +          printf("Some THPs are missing during mremap\n");
> >>>> +          exit(EXIT_FAILURE);
> >>>> +  }
> >>>> +
> >>>> +  /* split all possible huge pages */
> >>>> +  write_debugfs(getpid(), (uint64_t)pte_mapped,
> >>>> +                (uint64_t)pte_mapped + pagesize * 4);
> >>>> +
> >>>> +  /* smap does not show THPs after mremap, use kpageflags instead */
> >>>> +  thp_size = 0;
> >>>> +  for (i = 0; i < pagesize * 4; i++) {
> >>>> +          if (pte_mapped[i] != (char)i) {
> >>>> +                  printf("%ld byte corrupted\n", i);
> >>>> +                  exit(EXIT_FAILURE);
> >>>> +          }
> >>>> +          if (i % pagesize == 0 &&
> >>>> +              is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
> >>>> +                  thp_size++;
> >>>> +  }
> >>>> +
> >>>> +  if (thp_size) {
> >>>> +          printf("Still %ld THPs not split\n", thp_size);
> >>>> +          exit(EXIT_FAILURE);
> >>>> +  }
> >>>> +
> >>>> +  printf("Split PTE-mapped huge pages successful\n");
> >>>> +  munmap(one_page, len);
> >>>> +  close(pagemap_fd);
> >>>> +  close(kpageflags_fd);
> >>>> +}
> >>>> +
> >>>> +int main(int argc, char **argv)
> >>>> +{
> >>>> +  if (geteuid() != 0) {
> >>>> +          printf("Please run the benchmark as root\n");
> >>>> +          exit(EXIT_FAILURE);
> >>>> +  }
> >>>> +
> >>>> +  pagesize = getpagesize();
> >>>> +  pageshift = ffs(pagesize) - 1;
> >>>> +  pmd_pagesize = read_pmd_pagesize();
> >>>> +
> >>>> +  split_pmd_thp();
> >>>> +  split_pte_mapped_thp();
> >>>> +
> >>>> +  return 0;
> >>>> +}
> >>>>
> >>>
> >>>
> >>> --
> >>> Thanks,
> >>>
> >>> David / dhildenb
> >>
> >>
> >> —
> >> Best Regards,
> >> Yan Zi
> >>
> >
> >
> > --
> > Thanks,
> >
> > David / dhildenb
>
>
> —
> Best Regards,
> Yan Zi
kernel test robot March 8, 2021, 7:23 p.m. UTC | #7
Hi Zi,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on kselftest/next]
[also build test ERROR on linux/master linus/master v5.12-rc2 next-20210305]
[cannot apply to hnaz-linux-mm/master]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Zi-Yan/mm-huge_memory-a-new-debugfs-interface-for-splitting-THP-tests/20210308-232339
base:   https://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git next
config: x86_64-randconfig-a015-20210308 (attached as .config)
compiler: clang version 13.0.0 (https://github.com/llvm/llvm-project 3a11a41795bec548e91621caaa4cc00fc31b2212)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install x86_64 cross compiling tool for clang build
        # apt-get install binutils-x86-64-linux-gnu
        # https://github.com/0day-ci/linux/commit/961321af55684845ebc1e13e4c4e7c0da14a476a
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Zi-Yan/mm-huge_memory-a-new-debugfs-interface-for-splitting-THP-tests/20210308-232339
        git checkout 961321af55684845ebc1e13e4c4e7c0da14a476a
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=x86_64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

>> mm/huge_memory.c:3026:40: error: implicit declaration of function 'vma_migratable' [-Werror,-Wimplicit-function-declaration]
                   if (!vma || addr < vma->vm_start || !vma_migratable(vma))
                                                        ^
   1 error generated.


vim +/vma_migratable +3026 mm/huge_memory.c

  2930	
  2931	#ifdef CONFIG_DEBUG_FS
  2932	static int split_huge_pages_set(void *data, u64 val)
  2933	{
  2934		struct zone *zone;
  2935		struct page *page;
  2936		unsigned long pfn, max_zone_pfn;
  2937		unsigned long total = 0, split = 0;
  2938	
  2939		if (val != 1)
  2940			return -EINVAL;
  2941	
  2942		for_each_populated_zone(zone) {
  2943			max_zone_pfn = zone_end_pfn(zone);
  2944			for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
  2945				if (!pfn_valid(pfn))
  2946					continue;
  2947	
  2948				page = pfn_to_page(pfn);
  2949				if (!get_page_unless_zero(page))
  2950					continue;
  2951	
  2952				if (zone != page_zone(page))
  2953					goto next;
  2954	
  2955				if (!PageHead(page) || PageHuge(page) || !PageLRU(page))
  2956					goto next;
  2957	
  2958				total++;
  2959				lock_page(page);
  2960				if (!split_huge_page(page))
  2961					split++;
  2962				unlock_page(page);
  2963	next:
  2964				put_page(page);
  2965			}
  2966		}
  2967	
  2968		pr_info("%lu of %lu THP split\n", split, total);
  2969	
  2970		return 0;
  2971	}
  2972	DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
  2973			"%llu\n");
  2974	
  2975	static ssize_t split_huge_pages_in_range_pid_write(struct file *file,
  2976			const char __user *buf, size_t count, loff_t *ppops)
  2977	{
  2978		static DEFINE_MUTEX(mutex);
  2979		ssize_t ret;
  2980		char input_buf[80]; /* hold pid, start_vaddr, end_vaddr */
  2981		int pid;
  2982		unsigned long vaddr_start, vaddr_end, addr;
  2983		nodemask_t task_nodes;
  2984		struct mm_struct *mm;
  2985		unsigned long total = 0, split = 0;
  2986	
  2987		ret = mutex_lock_interruptible(&mutex);
  2988		if (ret)
  2989			return ret;
  2990	
  2991		ret = -EFAULT;
  2992	
  2993		memset(input_buf, 0, 80);
  2994		if (copy_from_user(input_buf, buf, min_t(size_t, count, 80)))
  2995			goto out;
  2996	
  2997		input_buf[79] = '\0';
  2998		ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end);
  2999		if (ret != 3) {
  3000			ret = -EINVAL;
  3001			goto out;
  3002		}
  3003		vaddr_start &= PAGE_MASK;
  3004		vaddr_end &= PAGE_MASK;
  3005	
  3006		ret = strlen(input_buf);
  3007		pr_debug("split huge pages in pid: %d, vaddr: [%lx - %lx]\n",
  3008			 pid, vaddr_start, vaddr_end);
  3009	
  3010		mm = find_mm_struct(pid, &task_nodes);
  3011		if (IS_ERR(mm)) {
  3012			ret = -EINVAL;
  3013			goto out;
  3014		}
  3015	
  3016		mmap_read_lock(mm);
  3017		/*
  3018		 * always increase addr by PAGE_SIZE, since we could have a PTE page
  3019		 * table filled with PTE-mapped THPs, each of which is distinct.
  3020		 */
  3021		for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
  3022			struct vm_area_struct *vma = find_vma(mm, addr);
  3023			unsigned int follflags;
  3024			struct page *page;
  3025	
> 3026			if (!vma || addr < vma->vm_start || !vma_migratable(vma))
  3027				break;
  3028	
  3029			/* FOLL_DUMP to ignore special (like zero) pages */
  3030			follflags = FOLL_GET | FOLL_DUMP;
  3031			page = follow_page(vma, addr, follflags);
  3032	
  3033			if (IS_ERR(page))
  3034				break;
  3035			if (!page)
  3036				break;
  3037	
  3038			if (!is_transparent_hugepage(page))
  3039				continue;
  3040	
  3041			total++;
  3042			if (!can_split_huge_page(compound_head(page), NULL))
  3043				continue;
  3044	
  3045			if (!trylock_page(page))
  3046				continue;
  3047	
  3048			if (!split_huge_page(page))
  3049				split++;
  3050	
  3051			unlock_page(page);
  3052			put_page(page);
  3053		}
  3054		mmap_read_unlock(mm);
  3055		mmput(mm);
  3056	
  3057		pr_debug("%lu of %lu THP split\n", split, total);
  3058	out:
  3059		mutex_unlock(&mutex);
  3060		return ret;
  3061	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
David Hildenbrand March 8, 2021, 7:30 p.m. UTC | #8
On 08.03.21 20:11, Yang Shi wrote:
> On Mon, Mar 8, 2021 at 11:01 AM Zi Yan <ziy@nvidia.com> wrote:
>>
>> On 8 Mar 2021, at 13:11, David Hildenbrand wrote:
>>
>>> On 08.03.21 18:49, Zi Yan wrote:
>>>> On 8 Mar 2021, at 11:17, David Hildenbrand wrote:
>>>>
>>>>> On 08.03.21 16:22, Zi Yan wrote:
>>>>>> From: Zi Yan <ziy@nvidia.com>
>>>>>>
>>>>>> By writing "<pid>,<vaddr_start>,<vaddr_end>" to
>>>>>> <debugfs>/split_huge_pages_in_range_pid, THPs in the process with the
>>>>>> given pid and virtual address range are split. It is used to test
>>>>>> split_huge_page function. In addition, a selftest program is added to
>>>>>> tools/testing/selftests/vm to utilize the interface by splitting
>>>>>> PMD THPs and PTE-mapped THPs.
>>>>>
>>>>> Won't something like
>>>>>
>>>>> 1. MADV_HUGEPAGE
>>>>>
>>>>> 2. Access memory
>>>>>
>>>>> 3. MADV_NOHUGEPAGE
>>>>>
>>>>> Have a similar effect? What's the benefit of this?
>>>>
>>>> Thanks for checking the patch.
>>>>
>>>> No, MADV_NOHUGEPAGE just replaces VM_HUGEPAGE with VM_NOHUGEPAGE,
>>>> nothing else will be done.
>>>
>>> Ah, okay - maybe my memory was tricking me. There is some s390x KVM code that forces MADV_NOHUGEPAGE and force-splits everything.
>>>
>>> I do wonder, though, if this functionality would be worth a proper user interface (e.g., madvise), though. There might be actual benefit in having this as a !debug interface.
>>>
>>> I think you aware of the discussion in https://lkml.kernel.org/r/d098c392-273a-36a4-1a29-59731cdf5d3d@google.com
>>
>> Yes. Thanks for bringing this up.
>>
>>>
>>> If there will be an interface to collapse a THP -- "this memory area is worth extra performance now by collapsing a THP if possible" -- it might also be helpful to have the opposite functionality -- "this memory area is not worth a THP, rather use that somehwere else".
>>>
>>> MADV_HUGE_COLLAPSE vs. MADV_HUGE_SPLIT
>>
>> I agree that MADV_HUGE_SPLIT would be useful as the opposite of COLLAPSE when user might just want PAGESIZE mappings.
>> Right now, HUGE_SPLIT is implicit from mapping changes like mprotect or MADV_DONTNEED.
> 
> IMHO, it sounds not very useful. MADV_DONTNEED would split PMD for any
> partial THP. If the range covers the whole THP, the whole THP is going
> to be freed anyway. All other places in kernel which need split THP
> have been covered. So I didn't realize any usecase from userspace for
> just splitting PMD to PTEs.

THP are a limited resource. So indicating which virtual memory regions 
are not performance sensitive right now (e.g., cold pages in a databse) 
and not worth a THP might be quite valuable, no?
Zi Yan March 8, 2021, 7:36 p.m. UTC | #9
On 8 Mar 2021, at 14:23, kernel test robot wrote:

> Hi Zi,
>
> Thank you for the patch! Yet something to improve:
>
> [auto build test ERROR on kselftest/next]
> [also build test ERROR on linux/master linus/master v5.12-rc2 next-20210305]
> [cannot apply to hnaz-linux-mm/master]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch]
>
> url:    https://github.com/0day-ci/linux/commits/Zi-Yan/mm-huge_memory-a-new-debugfs-interface-for-splitting-THP-tests/20210308-232339
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git next
> config: x86_64-randconfig-a015-20210308 (attached as .config)
> compiler: clang version 13.0.0 (https://github.com/llvm/llvm-project 3a11a41795bec548e91621caaa4cc00fc31b2212)
> reproduce (this is a W=1 build):
>         wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
>         chmod +x ~/bin/make.cross
>         # install x86_64 cross compiling tool for clang build
>         # apt-get install binutils-x86-64-linux-gnu
>         # https://github.com/0day-ci/linux/commit/961321af55684845ebc1e13e4c4e7c0da14a476a
>         git remote add linux-review https://github.com/0day-ci/linux
>         git fetch --no-tags linux-review Zi-Yan/mm-huge_memory-a-new-debugfs-interface-for-splitting-THP-tests/20210308-232339
>         git checkout 961321af55684845ebc1e13e4c4e7c0da14a476a
>         # save the attached .config to linux build tree
>         COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=x86_64
>
> If you fix the issue, kindly add following tag as appropriate
> Reported-by: kernel test robot <lkp@intel.com>
>
> All errors (new ones prefixed by >>):
>
>>> mm/huge_memory.c:3026:40: error: implicit declaration of function 'vma_migratable' [-Werror,-Wimplicit-function-declaration]
>                    if (!vma || addr < vma->vm_start || !vma_migratable(vma))
>                                                         ^
>    1 error generated.

There is no need to call vma_migratable() here. Will remove it.

Thanks for catching it.


—
Best Regards,
Yan Zi
Yang Shi March 8, 2021, 8:18 p.m. UTC | #10
On Mon, Mar 8, 2021 at 11:30 AM David Hildenbrand <david@redhat.com> wrote:
>
> On 08.03.21 20:11, Yang Shi wrote:
> > On Mon, Mar 8, 2021 at 11:01 AM Zi Yan <ziy@nvidia.com> wrote:
> >>
> >> On 8 Mar 2021, at 13:11, David Hildenbrand wrote:
> >>
> >>> On 08.03.21 18:49, Zi Yan wrote:
> >>>> On 8 Mar 2021, at 11:17, David Hildenbrand wrote:
> >>>>
> >>>>> On 08.03.21 16:22, Zi Yan wrote:
> >>>>>> From: Zi Yan <ziy@nvidia.com>
> >>>>>>
> >>>>>> By writing "<pid>,<vaddr_start>,<vaddr_end>" to
> >>>>>> <debugfs>/split_huge_pages_in_range_pid, THPs in the process with the
> >>>>>> given pid and virtual address range are split. It is used to test
> >>>>>> split_huge_page function. In addition, a selftest program is added to
> >>>>>> tools/testing/selftests/vm to utilize the interface by splitting
> >>>>>> PMD THPs and PTE-mapped THPs.
> >>>>>
> >>>>> Won't something like
> >>>>>
> >>>>> 1. MADV_HUGEPAGE
> >>>>>
> >>>>> 2. Access memory
> >>>>>
> >>>>> 3. MADV_NOHUGEPAGE
> >>>>>
> >>>>> Have a similar effect? What's the benefit of this?
> >>>>
> >>>> Thanks for checking the patch.
> >>>>
> >>>> No, MADV_NOHUGEPAGE just replaces VM_HUGEPAGE with VM_NOHUGEPAGE,
> >>>> nothing else will be done.
> >>>
> >>> Ah, okay - maybe my memory was tricking me. There is some s390x KVM code that forces MADV_NOHUGEPAGE and force-splits everything.
> >>>
> >>> I do wonder, though, if this functionality would be worth a proper user interface (e.g., madvise), though. There might be actual benefit in having this as a !debug interface.
> >>>
> >>> I think you aware of the discussion in https://lkml.kernel.org/r/d098c392-273a-36a4-1a29-59731cdf5d3d@google.com
> >>
> >> Yes. Thanks for bringing this up.
> >>
> >>>
> >>> If there will be an interface to collapse a THP -- "this memory area is worth extra performance now by collapsing a THP if possible" -- it might also be helpful to have the opposite functionality -- "this memory area is not worth a THP, rather use that somehwere else".
> >>>
> >>> MADV_HUGE_COLLAPSE vs. MADV_HUGE_SPLIT
> >>
> >> I agree that MADV_HUGE_SPLIT would be useful as the opposite of COLLAPSE when user might just want PAGESIZE mappings.
> >> Right now, HUGE_SPLIT is implicit from mapping changes like mprotect or MADV_DONTNEED.
> >
> > IMHO, it sounds not very useful. MADV_DONTNEED would split PMD for any
> > partial THP. If the range covers the whole THP, the whole THP is going
> > to be freed anyway. All other places in kernel which need split THP
> > have been covered. So I didn't realize any usecase from userspace for
> > just splitting PMD to PTEs.
>
> THP are a limited resource. So indicating which virtual memory regions
> are not performance sensitive right now (e.g., cold pages in a databse)
> and not worth a THP might be quite valuable, no?

Such functionality could be achieved by MADV_COLD or MADV_PAGEOUT,
right? Then a subsequent call to MADV_NOHUGEPAGE would prevent from
collapsing or allocating THP for that area.

>
> --
> Thanks,
>
> David / dhildenb
>
David Hildenbrand March 8, 2021, 8:35 p.m. UTC | #11
> Am 08.03.2021 um 21:18 schrieb Yang Shi <shy828301@gmail.com>:
> 
> On Mon, Mar 8, 2021 at 11:30 AM David Hildenbrand <david@redhat.com> wrote:
>> 
>>> On 08.03.21 20:11, Yang Shi wrote:
>>> On Mon, Mar 8, 2021 at 11:01 AM Zi Yan <ziy@nvidia.com> wrote:
>>>> 
>>>> On 8 Mar 2021, at 13:11, David Hildenbrand wrote:
>>>> 
>>>>> On 08.03.21 18:49, Zi Yan wrote:
>>>>>> On 8 Mar 2021, at 11:17, David Hildenbrand wrote:
>>>>>> 
>>>>>>> On 08.03.21 16:22, Zi Yan wrote:
>>>>>>>> From: Zi Yan <ziy@nvidia.com>
>>>>>>>> 
>>>>>>>> By writing "<pid>,<vaddr_start>,<vaddr_end>" to
>>>>>>>> <debugfs>/split_huge_pages_in_range_pid, THPs in the process with the
>>>>>>>> given pid and virtual address range are split. It is used to test
>>>>>>>> split_huge_page function. In addition, a selftest program is added to
>>>>>>>> tools/testing/selftests/vm to utilize the interface by splitting
>>>>>>>> PMD THPs and PTE-mapped THPs.
>>>>>>> 
>>>>>>> Won't something like
>>>>>>> 
>>>>>>> 1. MADV_HUGEPAGE
>>>>>>> 
>>>>>>> 2. Access memory
>>>>>>> 
>>>>>>> 3. MADV_NOHUGEPAGE
>>>>>>> 
>>>>>>> Have a similar effect? What's the benefit of this?
>>>>>> 
>>>>>> Thanks for checking the patch.
>>>>>> 
>>>>>> No, MADV_NOHUGEPAGE just replaces VM_HUGEPAGE with VM_NOHUGEPAGE,
>>>>>> nothing else will be done.
>>>>> 
>>>>> Ah, okay - maybe my memory was tricking me. There is some s390x KVM code that forces MADV_NOHUGEPAGE and force-splits everything.
>>>>> 
>>>>> I do wonder, though, if this functionality would be worth a proper user interface (e.g., madvise), though. There might be actual benefit in having this as a !debug interface.
>>>>> 
>>>>> I think you aware of the discussion in https://lkml.kernel.org/r/d098c392-273a-36a4-1a29-59731cdf5d3d@google.com
>>>> 
>>>> Yes. Thanks for bringing this up.
>>>> 
>>>>> 
>>>>> If there will be an interface to collapse a THP -- "this memory area is worth extra performance now by collapsing a THP if possible" -- it might also be helpful to have the opposite functionality -- "this memory area is not worth a THP, rather use that somehwere else".
>>>>> 
>>>>> MADV_HUGE_COLLAPSE vs. MADV_HUGE_SPLIT
>>>> 
>>>> I agree that MADV_HUGE_SPLIT would be useful as the opposite of COLLAPSE when user might just want PAGESIZE mappings.
>>>> Right now, HUGE_SPLIT is implicit from mapping changes like mprotect or MADV_DONTNEED.
>>> 
>>> IMHO, it sounds not very useful. MADV_DONTNEED would split PMD for any
>>> partial THP. If the range covers the whole THP, the whole THP is going
>>> to be freed anyway. All other places in kernel which need split THP
>>> have been covered. So I didn't realize any usecase from userspace for
>>> just splitting PMD to PTEs.
>> 
>> THP are a limited resource. So indicating which virtual memory regions
>> are not performance sensitive right now (e.g., cold pages in a databse)
>> and not worth a THP might be quite valuable, no?
> 
> Such functionality could be achieved by MADV_COLD or MADV_PAGEOUT,
> right? Then a subsequent call to MADV_NOHUGEPAGE would prevent from
> collapsing or allocating THP for that area.
> 

I remember these deal with optimizing swapping. Not sure how they interact with THP, especially on systems without swap - I would guess they don‘t as of now.

>> 
>> --
>> Thanks,
>> 
>> David / dhildenb
>> 
>
Yang Shi March 8, 2021, 9:25 p.m. UTC | #12
On Mon, Mar 8, 2021 at 12:36 PM David Hildenbrand <david@redhat.com> wrote:
>
>
> > Am 08.03.2021 um 21:18 schrieb Yang Shi <shy828301@gmail.com>:
> >
> > On Mon, Mar 8, 2021 at 11:30 AM David Hildenbrand <david@redhat.com> wrote:
> >>
> >>> On 08.03.21 20:11, Yang Shi wrote:
> >>> On Mon, Mar 8, 2021 at 11:01 AM Zi Yan <ziy@nvidia.com> wrote:
> >>>>
> >>>> On 8 Mar 2021, at 13:11, David Hildenbrand wrote:
> >>>>
> >>>>> On 08.03.21 18:49, Zi Yan wrote:
> >>>>>> On 8 Mar 2021, at 11:17, David Hildenbrand wrote:
> >>>>>>
> >>>>>>> On 08.03.21 16:22, Zi Yan wrote:
> >>>>>>>> From: Zi Yan <ziy@nvidia.com>
> >>>>>>>>
> >>>>>>>> By writing "<pid>,<vaddr_start>,<vaddr_end>" to
> >>>>>>>> <debugfs>/split_huge_pages_in_range_pid, THPs in the process with the
> >>>>>>>> given pid and virtual address range are split. It is used to test
> >>>>>>>> split_huge_page function. In addition, a selftest program is added to
> >>>>>>>> tools/testing/selftests/vm to utilize the interface by splitting
> >>>>>>>> PMD THPs and PTE-mapped THPs.
> >>>>>>>
> >>>>>>> Won't something like
> >>>>>>>
> >>>>>>> 1. MADV_HUGEPAGE
> >>>>>>>
> >>>>>>> 2. Access memory
> >>>>>>>
> >>>>>>> 3. MADV_NOHUGEPAGE
> >>>>>>>
> >>>>>>> Have a similar effect? What's the benefit of this?
> >>>>>>
> >>>>>> Thanks for checking the patch.
> >>>>>>
> >>>>>> No, MADV_NOHUGEPAGE just replaces VM_HUGEPAGE with VM_NOHUGEPAGE,
> >>>>>> nothing else will be done.
> >>>>>
> >>>>> Ah, okay - maybe my memory was tricking me. There is some s390x KVM code that forces MADV_NOHUGEPAGE and force-splits everything.
> >>>>>
> >>>>> I do wonder, though, if this functionality would be worth a proper user interface (e.g., madvise), though. There might be actual benefit in having this as a !debug interface.
> >>>>>
> >>>>> I think you aware of the discussion in https://lkml.kernel.org/r/d098c392-273a-36a4-1a29-59731cdf5d3d@google.com
> >>>>
> >>>> Yes. Thanks for bringing this up.
> >>>>
> >>>>>
> >>>>> If there will be an interface to collapse a THP -- "this memory area is worth extra performance now by collapsing a THP if possible" -- it might also be helpful to have the opposite functionality -- "this memory area is not worth a THP, rather use that somehwere else".
> >>>>>
> >>>>> MADV_HUGE_COLLAPSE vs. MADV_HUGE_SPLIT
> >>>>
> >>>> I agree that MADV_HUGE_SPLIT would be useful as the opposite of COLLAPSE when user might just want PAGESIZE mappings.
> >>>> Right now, HUGE_SPLIT is implicit from mapping changes like mprotect or MADV_DONTNEED.
> >>>
> >>> IMHO, it sounds not very useful. MADV_DONTNEED would split PMD for any
> >>> partial THP. If the range covers the whole THP, the whole THP is going
> >>> to be freed anyway. All other places in kernel which need split THP
> >>> have been covered. So I didn't realize any usecase from userspace for
> >>> just splitting PMD to PTEs.
> >>
> >> THP are a limited resource. So indicating which virtual memory regions
> >> are not performance sensitive right now (e.g., cold pages in a databse)
> >> and not worth a THP might be quite valuable, no?
> >
> > Such functionality could be achieved by MADV_COLD or MADV_PAGEOUT,
> > right? Then a subsequent call to MADV_NOHUGEPAGE would prevent from
> > collapsing or allocating THP for that area.
> >
>
> I remember these deal with optimizing swapping. Not sure how they interact with THP, especially on systems without swap - I would guess they don‘t as of now.

Yes, MADV_PAGEOUT would just swap the THP or sub pages out. I think I
just forgot to mention MADV_FREE which would be more suitable for this
usecase.

>
> >>
> >> --
> >> Thanks,
> >>
> >> David / dhildenb
> >>
> >
>
David Hildenbrand March 8, 2021, 9:58 p.m. UTC | #13
> Am 08.03.2021 um 22:25 schrieb Yang Shi <shy828301@gmail.com>:
> 
> On Mon, Mar 8, 2021 at 12:36 PM David Hildenbrand <david@redhat.com> wrote:
>> 
>> 
>>>> Am 08.03.2021 um 21:18 schrieb Yang Shi <shy828301@gmail.com>:
>>> 
>>> On Mon, Mar 8, 2021 at 11:30 AM David Hildenbrand <david@redhat.com> wrote:
>>>> 
>>>>> On 08.03.21 20:11, Yang Shi wrote:
>>>>> On Mon, Mar 8, 2021 at 11:01 AM Zi Yan <ziy@nvidia.com> wrote:
>>>>>> 
>>>>>> On 8 Mar 2021, at 13:11, David Hildenbrand wrote:
>>>>>> 
>>>>>>> On 08.03.21 18:49, Zi Yan wrote:
>>>>>>>> On 8 Mar 2021, at 11:17, David Hildenbrand wrote:
>>>>>>>> 
>>>>>>>>> On 08.03.21 16:22, Zi Yan wrote:
>>>>>>>>>> From: Zi Yan <ziy@nvidia.com>
>>>>>>>>>> 
>>>>>>>>>> By writing "<pid>,<vaddr_start>,<vaddr_end>" to
>>>>>>>>>> <debugfs>/split_huge_pages_in_range_pid, THPs in the process with the
>>>>>>>>>> given pid and virtual address range are split. It is used to test
>>>>>>>>>> split_huge_page function. In addition, a selftest program is added to
>>>>>>>>>> tools/testing/selftests/vm to utilize the interface by splitting
>>>>>>>>>> PMD THPs and PTE-mapped THPs.
>>>>>>>>> 
>>>>>>>>> Won't something like
>>>>>>>>> 
>>>>>>>>> 1. MADV_HUGEPAGE
>>>>>>>>> 
>>>>>>>>> 2. Access memory
>>>>>>>>> 
>>>>>>>>> 3. MADV_NOHUGEPAGE
>>>>>>>>> 
>>>>>>>>> Have a similar effect? What's the benefit of this?
>>>>>>>> 
>>>>>>>> Thanks for checking the patch.
>>>>>>>> 
>>>>>>>> No, MADV_NOHUGEPAGE just replaces VM_HUGEPAGE with VM_NOHUGEPAGE,
>>>>>>>> nothing else will be done.
>>>>>>> 
>>>>>>> Ah, okay - maybe my memory was tricking me. There is some s390x KVM code that forces MADV_NOHUGEPAGE and force-splits everything.
>>>>>>> 
>>>>>>> I do wonder, though, if this functionality would be worth a proper user interface (e.g., madvise), though. There might be actual benefit in having this as a !debug interface.
>>>>>>> 
>>>>>>> I think you aware of the discussion in https://lkml.kernel.org/r/d098c392-273a-36a4-1a29-59731cdf5d3d@google.com
>>>>>> 
>>>>>> Yes. Thanks for bringing this up.
>>>>>> 
>>>>>>> 
>>>>>>> If there will be an interface to collapse a THP -- "this memory area is worth extra performance now by collapsing a THP if possible" -- it might also be helpful to have the opposite functionality -- "this memory area is not worth a THP, rather use that somehwere else".
>>>>>>> 
>>>>>>> MADV_HUGE_COLLAPSE vs. MADV_HUGE_SPLIT
>>>>>> 
>>>>>> I agree that MADV_HUGE_SPLIT would be useful as the opposite of COLLAPSE when user might just want PAGESIZE mappings.
>>>>>> Right now, HUGE_SPLIT is implicit from mapping changes like mprotect or MADV_DONTNEED.
>>>>> 
>>>>> IMHO, it sounds not very useful. MADV_DONTNEED would split PMD for any
>>>>> partial THP. If the range covers the whole THP, the whole THP is going
>>>>> to be freed anyway. All other places in kernel which need split THP
>>>>> have been covered. So I didn't realize any usecase from userspace for
>>>>> just splitting PMD to PTEs.
>>>> 
>>>> THP are a limited resource. So indicating which virtual memory regions
>>>> are not performance sensitive right now (e.g., cold pages in a databse)
>>>> and not worth a THP might be quite valuable, no?
>>> 
>>> Such functionality could be achieved by MADV_COLD or MADV_PAGEOUT,
>>> right? Then a subsequent call to MADV_NOHUGEPAGE would prevent from
>>> collapsing or allocating THP for that area.
>>> 
>> 
>> I remember these deal with optimizing swapping. Not sure how they interact with THP, especially on systems without swap - I would guess they don‘t as of now.
> 
> Yes, MADV_PAGEOUT would just swap the THP or sub pages out. I think I
> just forgot to mention MADV_FREE which would be more suitable for this
> usecase.
> 
>> 

Can you elaborate? MADV_FREE is destructive, just like a delayed MADV_DONTNEED. How would that help here?

>>>> 
>>>> --
>>>> Thanks,
>>>> 
>>>> David / dhildenb
>>>> 
>>> 
>> 
>
Yang Shi March 8, 2021, 10:20 p.m. UTC | #14
On Mon, Mar 8, 2021 at 1:59 PM David Hildenbrand <david@redhat.com> wrote:
>
>
> > Am 08.03.2021 um 22:25 schrieb Yang Shi <shy828301@gmail.com>:
> >
> > On Mon, Mar 8, 2021 at 12:36 PM David Hildenbrand <david@redhat.com> wrote:
> >>
> >>
> >>>> Am 08.03.2021 um 21:18 schrieb Yang Shi <shy828301@gmail.com>:
> >>>
> >>> On Mon, Mar 8, 2021 at 11:30 AM David Hildenbrand <david@redhat.com> wrote:
> >>>>
> >>>>> On 08.03.21 20:11, Yang Shi wrote:
> >>>>> On Mon, Mar 8, 2021 at 11:01 AM Zi Yan <ziy@nvidia.com> wrote:
> >>>>>>
> >>>>>> On 8 Mar 2021, at 13:11, David Hildenbrand wrote:
> >>>>>>
> >>>>>>> On 08.03.21 18:49, Zi Yan wrote:
> >>>>>>>> On 8 Mar 2021, at 11:17, David Hildenbrand wrote:
> >>>>>>>>
> >>>>>>>>> On 08.03.21 16:22, Zi Yan wrote:
> >>>>>>>>>> From: Zi Yan <ziy@nvidia.com>
> >>>>>>>>>>
> >>>>>>>>>> By writing "<pid>,<vaddr_start>,<vaddr_end>" to
> >>>>>>>>>> <debugfs>/split_huge_pages_in_range_pid, THPs in the process with the
> >>>>>>>>>> given pid and virtual address range are split. It is used to test
> >>>>>>>>>> split_huge_page function. In addition, a selftest program is added to
> >>>>>>>>>> tools/testing/selftests/vm to utilize the interface by splitting
> >>>>>>>>>> PMD THPs and PTE-mapped THPs.
> >>>>>>>>>
> >>>>>>>>> Won't something like
> >>>>>>>>>
> >>>>>>>>> 1. MADV_HUGEPAGE
> >>>>>>>>>
> >>>>>>>>> 2. Access memory
> >>>>>>>>>
> >>>>>>>>> 3. MADV_NOHUGEPAGE
> >>>>>>>>>
> >>>>>>>>> Have a similar effect? What's the benefit of this?
> >>>>>>>>
> >>>>>>>> Thanks for checking the patch.
> >>>>>>>>
> >>>>>>>> No, MADV_NOHUGEPAGE just replaces VM_HUGEPAGE with VM_NOHUGEPAGE,
> >>>>>>>> nothing else will be done.
> >>>>>>>
> >>>>>>> Ah, okay - maybe my memory was tricking me. There is some s390x KVM code that forces MADV_NOHUGEPAGE and force-splits everything.
> >>>>>>>
> >>>>>>> I do wonder, though, if this functionality would be worth a proper user interface (e.g., madvise), though. There might be actual benefit in having this as a !debug interface.
> >>>>>>>
> >>>>>>> I think you aware of the discussion in https://lkml.kernel.org/r/d098c392-273a-36a4-1a29-59731cdf5d3d@google.com
> >>>>>>
> >>>>>> Yes. Thanks for bringing this up.
> >>>>>>
> >>>>>>>
> >>>>>>> If there will be an interface to collapse a THP -- "this memory area is worth extra performance now by collapsing a THP if possible" -- it might also be helpful to have the opposite functionality -- "this memory area is not worth a THP, rather use that somehwere else".
> >>>>>>>
> >>>>>>> MADV_HUGE_COLLAPSE vs. MADV_HUGE_SPLIT
> >>>>>>
> >>>>>> I agree that MADV_HUGE_SPLIT would be useful as the opposite of COLLAPSE when user might just want PAGESIZE mappings.
> >>>>>> Right now, HUGE_SPLIT is implicit from mapping changes like mprotect or MADV_DONTNEED.
> >>>>>
> >>>>> IMHO, it sounds not very useful. MADV_DONTNEED would split PMD for any
> >>>>> partial THP. If the range covers the whole THP, the whole THP is going
> >>>>> to be freed anyway. All other places in kernel which need split THP
> >>>>> have been covered. So I didn't realize any usecase from userspace for
> >>>>> just splitting PMD to PTEs.
> >>>>
> >>>> THP are a limited resource. So indicating which virtual memory regions
> >>>> are not performance sensitive right now (e.g., cold pages in a databse)
> >>>> and not worth a THP might be quite valuable, no?
> >>>
> >>> Such functionality could be achieved by MADV_COLD or MADV_PAGEOUT,
> >>> right? Then a subsequent call to MADV_NOHUGEPAGE would prevent from
> >>> collapsing or allocating THP for that area.
> >>>
> >>
> >> I remember these deal with optimizing swapping. Not sure how they interact with THP, especially on systems without swap - I would guess they don‘t as of now.
> >
> > Yes, MADV_PAGEOUT would just swap the THP or sub pages out. I think I
> > just forgot to mention MADV_FREE which would be more suitable for this
> > usecase.
> >
> >>
>
> Can you elaborate? MADV_FREE is destructive, just like a delayed MADV_DONTNEED. How would that help here?

Split THP and reclaim the memory. Then not allocate or collapse THP
for this area anymore (need subsequent MADV_NOHUGEPAGE call). I'm
supposed this is the main purpose of splitting a THP. And we don't
have to introduce a new advise flag.

Just splitting PMD to PTEs sounds less useful to me IMHO except for
vma changes (i.e. mprotect, mlock, etc).

>
> >>>>
> >>>> --
> >>>> Thanks,
> >>>>
> >>>> David / dhildenb
> >>>>
> >>>
> >>
> >
>
diff mbox series

Patch

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 395c75111d33..818172f887bf 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -7,6 +7,7 @@ 
 
 #include <linux/mm.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/sched/coredump.h>
 #include <linux/sched/numa_balancing.h>
 #include <linux/highmem.h>
@@ -2971,10 +2972,107 @@  static int split_huge_pages_set(void *data, u64 val)
 DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
 		"%llu\n");
 
+static ssize_t split_huge_pages_in_range_pid_write(struct file *file,
+		const char __user *buf, size_t count, loff_t *ppops)
+{
+	static DEFINE_MUTEX(mutex);
+	ssize_t ret;
+	char input_buf[80]; /* hold pid, start_vaddr, end_vaddr */
+	int pid;
+	unsigned long vaddr_start, vaddr_end, addr;
+	nodemask_t task_nodes;
+	struct mm_struct *mm;
+	unsigned long total = 0, split = 0;
+
+	ret = mutex_lock_interruptible(&mutex);
+	if (ret)
+		return ret;
+
+	ret = -EFAULT;
+
+	memset(input_buf, 0, 80);
+	if (copy_from_user(input_buf, buf, min_t(size_t, count, 80)))
+		goto out;
+
+	input_buf[79] = '\0';
+	ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end);
+	if (ret != 3) {
+		ret = -EINVAL;
+		goto out;
+	}
+	vaddr_start &= PAGE_MASK;
+	vaddr_end &= PAGE_MASK;
+
+	ret = strlen(input_buf);
+	pr_debug("split huge pages in pid: %d, vaddr: [%lx - %lx]\n",
+		 pid, vaddr_start, vaddr_end);
+
+	mm = find_mm_struct(pid, &task_nodes);
+	if (IS_ERR(mm)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	mmap_read_lock(mm);
+	/*
+	 * always increase addr by PAGE_SIZE, since we could have a PTE page
+	 * table filled with PTE-mapped THPs, each of which is distinct.
+	 */
+	for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
+		struct vm_area_struct *vma = find_vma(mm, addr);
+		unsigned int follflags;
+		struct page *page;
+
+		if (!vma || addr < vma->vm_start || !vma_migratable(vma))
+			break;
+
+		/* FOLL_DUMP to ignore special (like zero) pages */
+		follflags = FOLL_GET | FOLL_DUMP;
+		page = follow_page(vma, addr, follflags);
+
+		if (IS_ERR(page))
+			break;
+		if (!page)
+			break;
+
+		if (!is_transparent_hugepage(page))
+			continue;
+
+		total++;
+		if (!can_split_huge_page(compound_head(page), NULL))
+			continue;
+
+		if (!trylock_page(page))
+			continue;
+
+		if (!split_huge_page(page))
+			split++;
+
+		unlock_page(page);
+		put_page(page);
+	}
+	mmap_read_unlock(mm);
+	mmput(mm);
+
+	pr_debug("%lu of %lu THP split\n", split, total);
+out:
+	mutex_unlock(&mutex);
+	return ret;
+
+}
+
+static const struct file_operations split_huge_pages_in_range_pid_fops = {
+	.owner	 = THIS_MODULE,
+	.write	 = split_huge_pages_in_range_pid_write,
+	.llseek  = no_llseek,
+};
+
 static int __init split_huge_pages_debugfs(void)
 {
 	debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
 			    &split_huge_pages_fops);
+	debugfs_create_file("split_huge_pages_in_range_pid", 0200, NULL, NULL,
+			    &split_huge_pages_in_range_pid_fops);
 	return 0;
 }
 late_initcall(split_huge_pages_debugfs);
diff --git a/mm/internal.h b/mm/internal.h
index 9902648f2206..1659d00100ef 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -623,4 +623,5 @@  struct migration_target_control {
 	gfp_t gfp_mask;
 };
 
+struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes);
 #endif	/* __MM_INTERNAL_H */
diff --git a/mm/migrate.c b/mm/migrate.c
index 62b81d5257aa..ce5f213debb2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1913,7 +1913,7 @@  static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
 	return nr_pages ? -EFAULT : 0;
 }
 
-static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
+struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
 {
 	struct task_struct *task;
 	struct mm_struct *mm;
diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
index 9a35c3f6a557..1f651e85ed60 100644
--- a/tools/testing/selftests/vm/.gitignore
+++ b/tools/testing/selftests/vm/.gitignore
@@ -22,3 +22,4 @@  map_fixed_noreplace
 write_to_hugetlbfs
 hmm-tests
 local_config.*
+split_huge_page_test
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index d42115e4284d..4cbc91d6869f 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -42,6 +42,7 @@  TEST_GEN_FILES += on-fault-limit
 TEST_GEN_FILES += thuge-gen
 TEST_GEN_FILES += transhuge-stress
 TEST_GEN_FILES += userfaultfd
+TEST_GEN_FILES += split_huge_page_test
 
 ifeq ($(MACHINE),x86_64)
 CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_32bit_program.c -m32)
diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c
new file mode 100644
index 000000000000..8ea8000fda62
--- /dev/null
+++ b/tools/testing/selftests/vm/split_huge_page_test.c
@@ -0,0 +1,318 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A test of splitting PMD THPs and PTE-mapped THPs from a specified virtual
+ * address range in a process via <debugfs>/split_huge_pages_in_range_pid
+ * interface.
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include "numa.h"
+#include <unistd.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <malloc.h>
+#include <stdbool.h>
+
+uint64_t pagesize;
+unsigned int pageshift;
+uint64_t pmd_pagesize;
+
+#define PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
+#define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages_in_range_pid"
+#define SMAP_PATH "/proc/self/smaps"
+#define INPUT_MAX 80
+
+#define PFN_MASK     ((1UL<<55)-1)
+#define KPF_THP      (1UL<<22)
+
+int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file)
+{
+	uint64_t paddr;
+	uint64_t page_flags;
+
+	if (pagemap_file) {
+		pread(pagemap_file, &paddr, sizeof(paddr),
+			((long)vaddr >> pageshift) * sizeof(paddr));
+
+		if (kpageflags_file) {
+			pread(kpageflags_file, &page_flags, sizeof(page_flags),
+				(paddr & PFN_MASK) * sizeof(page_flags));
+
+			return !!(page_flags & KPF_THP);
+		}
+	}
+	return 0;
+}
+
+
+static uint64_t read_pmd_pagesize(void)
+{
+	int fd;
+	char buf[20];
+	ssize_t num_read;
+
+	fd = open(PMD_SIZE_PATH, O_RDONLY);
+	if (fd == -1) {
+		perror("Open hpage_pmd_size failed");
+		exit(EXIT_FAILURE);
+	}
+	num_read = read(fd, buf, 19);
+	if (num_read < 1) {
+		close(fd);
+		perror("Read hpage_pmd_size failed");
+		exit(EXIT_FAILURE);
+	}
+	buf[num_read] = '\0';
+	close(fd);
+
+	return strtoul(buf, NULL, 10);
+}
+
+static int write_file(const char *path, const char *buf, size_t buflen)
+{
+	int fd;
+	ssize_t numwritten;
+
+	fd = open(path, O_WRONLY);
+	if (fd == -1)
+		return 0;
+
+	numwritten = write(fd, buf, buflen - 1);
+	close(fd);
+	if (numwritten < 1)
+		return 0;
+
+	return (unsigned int) numwritten;
+}
+
+static void write_debugfs(int pid, uint64_t vaddr_start, uint64_t vaddr_end)
+{
+	char input[INPUT_MAX];
+	int ret;
+
+	ret = snprintf(input, INPUT_MAX, "%d,0x%lx,0x%lx", pid, vaddr_start,
+			vaddr_end);
+	if (ret >= INPUT_MAX) {
+		printf("%s: Debugfs input is too long\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+
+	if (!write_file(SPLIT_DEBUGFS, input, ret + 1)) {
+		perror(SPLIT_DEBUGFS);
+		exit(EXIT_FAILURE);
+	}
+}
+
+#define MAX_LINE_LENGTH 500
+
+static bool check_for_pattern(FILE *fp, const char *pattern, char *buf)
+{
+	while (fgets(buf, MAX_LINE_LENGTH, fp) != NULL) {
+		if (!strncmp(buf, pattern, strlen(pattern)))
+			return true;
+	}
+	return false;
+}
+
+static uint64_t check_huge(void *addr)
+{
+	uint64_t thp = 0;
+	int ret;
+	FILE *fp;
+	char buffer[MAX_LINE_LENGTH];
+	char addr_pattern[MAX_LINE_LENGTH];
+
+	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
+		       (unsigned long) addr);
+	if (ret >= MAX_LINE_LENGTH) {
+		printf("%s: Pattern is too long\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+
+
+	fp = fopen(SMAP_PATH, "r");
+	if (!fp) {
+		printf("%s: Failed to open file %s\n", __func__, SMAP_PATH);
+		exit(EXIT_FAILURE);
+	}
+	if (!check_for_pattern(fp, addr_pattern, buffer))
+		goto err_out;
+
+	/*
+	 * Fetch the AnonHugePages: in the same block and check the number of
+	 * hugepages.
+	 */
+	if (!check_for_pattern(fp, "AnonHugePages:", buffer))
+		goto err_out;
+
+	if (sscanf(buffer, "AnonHugePages:%10ld kB", &thp) != 1) {
+		printf("Reading smap error\n");
+		exit(EXIT_FAILURE);
+	}
+
+err_out:
+	fclose(fp);
+	return thp;
+}
+
+void split_pmd_thp(void)
+{
+	char *one_page;
+	size_t len = 4 * pmd_pagesize;
+	uint64_t thp_size;
+	size_t i;
+
+	one_page = memalign(pmd_pagesize, len);
+
+	madvise(one_page, len, MADV_HUGEPAGE);
+
+	for (i = 0; i < len; i++)
+		one_page[i] = (char)i;
+
+	thp_size = check_huge(one_page);
+	if (!thp_size) {
+		printf("No THP is allocatd");
+		exit(EXIT_FAILURE);
+	}
+
+	/* split all possible huge pages */
+	write_debugfs(getpid(), (uint64_t)one_page, (uint64_t)one_page + len);
+
+	for (i = 0; i < len; i++)
+		if (one_page[i] != (char)i) {
+			printf("%ld byte corrupted\n", i);
+			exit(EXIT_FAILURE);
+		}
+
+
+	thp_size = check_huge(one_page);
+	if (thp_size) {
+		printf("Still %ld kB AnonHugePages not split\n", thp_size);
+		exit(EXIT_FAILURE);
+	}
+
+	printf("Split huge pages successful\n");
+	free(one_page);
+}
+
+void split_pte_mapped_thp(void)
+{
+	char *one_page, *pte_mapped, *pte_mapped2;
+	size_t len = 4 * pmd_pagesize;
+	uint64_t thp_size;
+	size_t i;
+	const char *pagemap_template = "/proc/%d/pagemap";
+	const char *kpageflags_proc = "/proc/kpageflags";
+	char pagemap_proc[255];
+	int pagemap_fd;
+	int kpageflags_fd;
+
+	if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0) {
+		perror("get pagemap proc error");
+		exit(EXIT_FAILURE);
+	}
+	pagemap_fd = open(pagemap_proc, O_RDONLY);
+
+	if (pagemap_fd == -1) {
+		perror("read pagemap:");
+		exit(EXIT_FAILURE);
+	}
+
+	kpageflags_fd = open(kpageflags_proc, O_RDONLY);
+
+	if (kpageflags_fd == -1) {
+		perror("read kpageflags:");
+		exit(EXIT_FAILURE);
+	}
+
+	one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE,
+			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+
+	madvise(one_page, len, MADV_HUGEPAGE);
+
+	for (i = 0; i < len; i++)
+		one_page[i] = (char)i;
+
+	thp_size = check_huge(one_page);
+	if (!thp_size) {
+		printf("No THP is allocatd");
+		exit(EXIT_FAILURE);
+	}
+
+	pte_mapped = mremap(one_page, pagesize, pagesize, MREMAP_MAYMOVE);
+
+	for (i = 1; i < 4; i++) {
+		pte_mapped2 = mremap(one_page + pmd_pagesize * i + pagesize * i,
+				     pagesize, pagesize,
+				     MREMAP_MAYMOVE|MREMAP_FIXED,
+				     pte_mapped + pagesize * i);
+		if (pte_mapped2 == (char *)-1) {
+			perror("mremap failed");
+			exit(EXIT_FAILURE);
+		}
+	}
+
+	/* smap does not show THPs after mremap, use kpageflags instead */
+	thp_size = 0;
+	for (i = 0; i < pagesize * 4; i++)
+		if (i % pagesize == 0 &&
+		    is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
+			thp_size++;
+
+	if (thp_size != 4) {
+		printf("Some THPs are missing during mremap\n");
+		exit(EXIT_FAILURE);
+	}
+
+	/* split all possible huge pages */
+	write_debugfs(getpid(), (uint64_t)pte_mapped,
+		      (uint64_t)pte_mapped + pagesize * 4);
+
+	/* smap does not show THPs after mremap, use kpageflags instead */
+	thp_size = 0;
+	for (i = 0; i < pagesize * 4; i++) {
+		if (pte_mapped[i] != (char)i) {
+			printf("%ld byte corrupted\n", i);
+			exit(EXIT_FAILURE);
+		}
+		if (i % pagesize == 0 &&
+		    is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
+			thp_size++;
+	}
+
+	if (thp_size) {
+		printf("Still %ld THPs not split\n", thp_size);
+		exit(EXIT_FAILURE);
+	}
+
+	printf("Split PTE-mapped huge pages successful\n");
+	munmap(one_page, len);
+	close(pagemap_fd);
+	close(kpageflags_fd);
+}
+
+int main(int argc, char **argv)
+{
+	if (geteuid() != 0) {
+		printf("Please run the benchmark as root\n");
+		exit(EXIT_FAILURE);
+	}
+
+	pagesize = getpagesize();
+	pageshift = ffs(pagesize) - 1;
+	pmd_pagesize = read_pmd_pagesize();
+
+	split_pmd_thp();
+	split_pte_mapped_thp();
+
+	return 0;
+}