From patchwork Thu Dec 28 13:10:45 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Artem Kuzin X-Patchwork-Id: 13506066 Received: from frasgout.his.huawei.com (frasgout.his.huawei.com [185.176.79.56]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id A63C28465 for ; Thu, 28 Dec 2023 13:28:22 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=huawei.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=huawei.com Received: from mail.maildlp.com (unknown [172.18.186.216]) by frasgout.his.huawei.com (SkyGuard) with ESMTP id 4T184x2vMbz6K8j1; Thu, 28 Dec 2023 21:10:25 +0800 (CST) Received: from lhrpeml500001.china.huawei.com (unknown [7.191.163.213]) by mail.maildlp.com (Postfix) with ESMTPS id 3A851140DD5; Thu, 28 Dec 2023 21:12:42 +0800 (CST) Received: from mscphis00060.huawei.com (10.123.65.147) by lhrpeml500001.china.huawei.com (7.191.163.213) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2507.35; Thu, 28 Dec 2023 13:12:39 +0000 From: To: , , , , , , , , , , , , , CC: , , , , , , , , , , Subject: [PATCH RFC 01/12] mm: allow per-NUMA node local PUD/PMD allocation Date: Thu, 28 Dec 2023 21:10:45 +0800 Message-ID: <20231228131056.602411-2-artem.kuzin@huawei.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20231228131056.602411-1-artem.kuzin@huawei.com> References: <20231228131056.602411-1-artem.kuzin@huawei.com> Precedence: bulk X-Mailing-List: linux-modules@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-ClientProxiedBy: mscpeml500004.china.huawei.com (7.188.26.250) To lhrpeml500001.china.huawei.com (7.191.163.213) From: Artem Kuzin Co-developed-by: Nikita Panov Signed-off-by: Nikita Panov Co-developed-by: Alexander Grubnikov Signed-off-by: Alexander Grubnikov Signed-off-by: Artem Kuzin --- include/asm-generic/pgalloc.h | 34 ++++++++++++++++++++++++++++++++++ include/linux/gfp.h | 2 ++ mm/page_alloc.c | 18 ++++++++++++++++++ 3 files changed, 54 insertions(+) diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index a7cf825befae..6364375388bf 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -132,6 +132,24 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) } return (pmd_t *)page_address(page); } + +static inline pmd_t *pmd_alloc_one_node(unsigned int nid, + struct mm_struct *mm, unsigned long addr) +{ + struct page *page; + gfp_t gfp = GFP_PGTABLE_USER; + + if (mm == &init_mm) + gfp = GFP_PGTABLE_KERNEL; + page = alloc_pages_node(nid, gfp, 0); + if (!page) + return NULL; + if (!pgtable_pmd_page_ctor(page)) { + __free_pages(page, 0); + return NULL; + } + return (pmd_t *)page_address(page); +} #endif #ifndef __HAVE_ARCH_PMD_FREE @@ -156,6 +174,16 @@ static inline pud_t *__pud_alloc_one(struct mm_struct *mm, unsigned long addr) return (pud_t *)get_zeroed_page(gfp); } +static inline pud_t *__pud_alloc_one_node(unsigned int nid, + struct mm_struct *mm, unsigned long addr) +{ + gfp_t gfp = GFP_PGTABLE_USER; + + if (mm == &init_mm) + gfp = GFP_PGTABLE_KERNEL; + return (pud_t *)get_zeroed_page_node(nid, gfp); +} + #ifndef __HAVE_ARCH_PUD_ALLOC_ONE /** * pud_alloc_one - allocate a page for PUD-level page table @@ -170,6 +198,12 @@ static inline pud_t *pud_alloc_one(struct 
mm_struct *mm, unsigned long addr) { return __pud_alloc_one(mm, addr); } + +static inline pud_t *pud_alloc_one_node(unsigned int nid, + struct mm_struct *mm, unsigned long addr) +{ + return __pud_alloc_one_node(nid, mm, addr); +} #endif static inline void __pud_free(struct mm_struct *mm, pud_t *pud) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 665f06675c83..6ee0004b9774 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -288,6 +288,8 @@ static inline struct page *alloc_page_vma(gfp_t gfp, extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); +extern unsigned long __get_free_pages_node(unsigned int nid, gfp_t gfp_mask, unsigned int order); +extern unsigned long get_zeroed_page_node(unsigned int nid, gfp_t gfp_mask); void *alloc_pages_exact(size_t size, gfp_t gfp_mask) __alloc_size(1); void free_pages_exact(void *virt, size_t size); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7d3460c7a480..dc8f4a57d8b1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4537,6 +4537,24 @@ unsigned long get_zeroed_page(gfp_t gfp_mask) } EXPORT_SYMBOL(get_zeroed_page); +unsigned long __get_free_pages_node(unsigned int nid, gfp_t gfp_mask, + unsigned int order) +{ + struct page *page; + + page = alloc_pages_node(nid, gfp_mask & ~__GFP_HIGHMEM, order); + if (!page) + return 0; + return (unsigned long) page_address(page); +} +EXPORT_SYMBOL(__get_free_pages_node); + +unsigned long get_zeroed_page_node(unsigned int nid, gfp_t gfp_mask) +{ + return __get_free_pages_node(nid, gfp_mask | __GFP_ZERO, 0); +} +EXPORT_SYMBOL(get_zeroed_page_node); + /** * __free_pages - Free pages allocated with alloc_pages(). * @page: The page pointer returned from alloc_pages(). From patchwork Thu Dec 28 13:10:46 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Artem Kuzin X-Patchwork-Id: 13506067 Received: from frasgout.his.huawei.com (frasgout.his.huawei.com [185.176.79.56]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 1ED978464 for ; Thu, 28 Dec 2023 13:32:13 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=huawei.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=huawei.com Received: from mail.maildlp.com (unknown [172.18.186.31]) by frasgout.his.huawei.com (SkyGuard) with ESMTP id 4T186K0xDXz67K2K; Thu, 28 Dec 2023 21:11:37 +0800 (CST) Received: from lhrpeml500001.china.huawei.com (unknown [7.191.163.213]) by mail.maildlp.com (Postfix) with ESMTPS id 3E2DD1400D2; Thu, 28 Dec 2023 21:12:44 +0800 (CST) Received: from mscphis00060.huawei.com (10.123.65.147) by lhrpeml500001.china.huawei.com (7.191.163.213) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2507.35; Thu, 28 Dec 2023 13:12:42 +0000 From: To: , , , , , , , , , , , , , CC: , , , , , , , , , , Subject: [PATCH RFC 02/12] mm: add config option and per-NUMA node VMS support Date: Thu, 28 Dec 2023 21:10:46 +0800 Message-ID: <20231228131056.602411-3-artem.kuzin@huawei.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20231228131056.602411-1-artem.kuzin@huawei.com> References: <20231228131056.602411-1-artem.kuzin@huawei.com> Precedence: bulk X-Mailing-List: linux-modules@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 
X-ClientProxiedBy: mscpeml500004.china.huawei.com (7.188.26.250) To lhrpeml500001.china.huawei.com (7.191.163.213) From: Artem Kuzin Co-developed-by: Nikita Panov Signed-off-by: Nikita Panov Co-developed-by: Alexander Grubnikov Signed-off-by: Alexander Grubnikov Signed-off-by: Artem Kuzin --- include/linux/mm_types.h | 11 ++++++++++- mm/Kconfig | 10 ++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7d30dc4ff0ff..1fafb8425994 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -22,6 +22,8 @@ #include +#include + #ifndef AT_VECTOR_SIZE_ARCH #define AT_VECTOR_SIZE_ARCH 0 #endif @@ -626,7 +628,14 @@ struct mm_struct { unsigned long mmap_compat_legacy_base; #endif unsigned long task_size; /* size of task vm space */ - pgd_t * pgd; +#ifndef CONFIG_KERNEL_REPLICATION + pgd_t *pgd; +#else + union { + pgd_t *pgd; + pgd_t *pgd_numa[MAX_NUMNODES]; + }; +#endif #ifdef CONFIG_MEMBARRIER /** diff --git a/mm/Kconfig b/mm/Kconfig index 09130434e30d..5fe5b3ba7f99 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1236,6 +1236,16 @@ config LOCK_MM_AND_FIND_VMA bool depends on !STACK_GROWSUP +config KERNEL_REPLICATION + bool "Enable kernel text and ro-data replication between NUMA nodes" + default n + depends on (X86_64 && !(KASAN && X86_5LEVEL)) && MMU && NUMA && !MAXSMP + + help + Creates per-NUMA node replicas of kernel text and rodata sections. + Page tables are replicated partially, according to replicated kernel memory range. + If unsure, say "n". + source "mm/damon/Kconfig" endmenu From patchwork Thu Dec 28 13:10:47 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Artem Kuzin X-Patchwork-Id: 13506068 Received: from frasgout.his.huawei.com (frasgout.his.huawei.com [185.176.79.56]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 1EDC08465 for ; Thu, 28 Dec 2023 13:32:13 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=huawei.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=huawei.com Received: from mail.maildlp.com (unknown [172.18.186.231]) by frasgout.his.huawei.com (SkyGuard) with ESMTP id 4T186L6mJxz6K5xf; Thu, 28 Dec 2023 21:11:38 +0800 (CST) Received: from lhrpeml500001.china.huawei.com (unknown [7.191.163.213]) by mail.maildlp.com (Postfix) with ESMTPS id F0A241408F9; Thu, 28 Dec 2023 21:12:45 +0800 (CST) Received: from mscphis00060.huawei.com (10.123.65.147) by lhrpeml500001.china.huawei.com (7.191.163.213) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2507.35; Thu, 28 Dec 2023 13:12:44 +0000 From: To: , , , , , , , , , , , , , CC: , , , , , , , , , , Subject: [PATCH RFC 03/12] mm: per-NUMA node replication core infrastructure Date: Thu, 28 Dec 2023 21:10:47 +0800 Message-ID: <20231228131056.602411-4-artem.kuzin@huawei.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20231228131056.602411-1-artem.kuzin@huawei.com> References: <20231228131056.602411-1-artem.kuzin@huawei.com> Precedence: bulk X-Mailing-List: linux-modules@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-ClientProxiedBy: mscpeml500004.china.huawei.com (7.188.26.250) To lhrpeml500001.china.huawei.com (7.191.163.213) From: Artem Kuzin Co-developed-by: Nikita Panov Signed-off-by: Nikita Panov 
Co-developed-by: Alexander Grubnikov Signed-off-by: Alexander Grubnikov Signed-off-by: Artem Kuzin --- arch/x86/include/asm/numa_replication.h | 42 ++ arch/x86/include/asm/pgalloc.h | 10 + include/linux/mm.h | 79 +++- include/linux/numa_replication.h | 85 ++++ mm/Makefile | 1 + mm/memory.c | 251 ++++++++++- mm/numa_replication.c | 564 ++++++++++++++++++++++++ 7 files changed, 1021 insertions(+), 11 deletions(-) create mode 100644 arch/x86/include/asm/numa_replication.h create mode 100644 include/linux/numa_replication.h create mode 100644 mm/numa_replication.c diff --git a/arch/x86/include/asm/numa_replication.h b/arch/x86/include/asm/numa_replication.h new file mode 100644 index 000000000000..ba1b5bc7f6f1 --- /dev/null +++ b/arch/x86/include/asm/numa_replication.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _ASM_X86_NUMA_REPLICATION_H +#define _ASM_X86_NUMA_REPLICATION_H + +#include +#include +#include + +/* Replicated region of kernel space */ +#define PAGE_TABLE_REPLICATION_LEFT (0xffffffffffffffff - (SZ_2G - 1)) +#define PAGE_TABLE_REPLICATION_RIGHT (0xffffffffffffffff) + +static inline pgd_t *numa_replicate_pgt_pgd(int nid) +{ + pgd_t *new_pgd; + + struct page *pgd_page; + + pgd_page = alloc_pages_node(nid, GFP_PGTABLE_KERNEL, PGD_ALLOCATION_ORDER); + BUG_ON(pgd_page == NULL); + + new_pgd = (pgd_t *)page_address(pgd_page); + clone_pgd_range(new_pgd + KERNEL_PGD_BOUNDARY, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); + + return new_pgd; +} + +static inline void load_replicated_pgd(pgd_t *pgd) +{ + load_cr3(pgd); + flush_tlb_local(); +} + +static inline ssize_t str_cpu_dump(char *buf) +{ + return sprintf(buf, "NODE: #%02d, CPU: #%04d, cr3: 0x%p\n", numa_node_id(), + smp_processor_id(), (void *)__native_read_cr3()); +} + +#endif /* _ASM_X86_NUMA_REPLICATION_H */ diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index c7ec5bb88334..d9f921b6d65a 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -156,6 +156,16 @@ static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr) return (p4d_t *)get_zeroed_page(gfp); } +static inline p4d_t *p4d_alloc_one_node(unsigned int nid, + struct mm_struct *mm, unsigned long addr) +{ + gfp_t gfp = GFP_KERNEL_ACCOUNT; + + if (mm == &init_mm) + gfp &= ~__GFP_ACCOUNT; + return (p4d_t *)get_zeroed_page_node(nid, gfp); +} + static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) { if (!pgtable_l5_enabled()) diff --git a/include/linux/mm.h b/include/linux/mm.h index 34f9dba17c1a..c61852c8f6a0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1066,6 +1066,8 @@ int region_intersects(resource_size_t offset, size_t size, unsigned long flags, struct page *vmalloc_to_page(const void *addr); unsigned long vmalloc_to_pfn(const void *addr); +struct page *walk_to_page_node(int nid, const void *addr); + /* * Determine if an address is within the vmalloc range * @@ -2645,25 +2647,46 @@ static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, #ifdef __PAGETABLE_P4D_FOLDED static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, - unsigned long address) + unsigned long address) +{ + return 0; +} + +static inline int __p4d_alloc_node(unsigned int nid, + struct mm_struct *mm, + pgd_t *pgd, unsigned long address) { return 0; } #else int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); +int __p4d_alloc_node(unsigned int nid, struct mm_struct *mm, + pgd_t *pgd, unsigned long address); #endif #if 
defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU) -static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, - unsigned long address) +static inline int __pud_alloc(struct mm_struct *mm, + p4d_t *p4d, unsigned long address) { return 0; } + +static inline int __pud_alloc_node(unsigned int nid, + struct mm_struct *mm, + p4d_t *p4d, unsigned long address) +{ + return 0; +} + static inline void mm_inc_nr_puds(struct mm_struct *mm) {} static inline void mm_dec_nr_puds(struct mm_struct *mm) {} #else -int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address); +int __pud_alloc(struct mm_struct *mm, + p4d_t *p4d, unsigned long address); +int __pud_alloc_node(unsigned int nid, + struct mm_struct *mm, + p4d_t *p4d, unsigned long address); static inline void mm_inc_nr_puds(struct mm_struct *mm) { @@ -2681,8 +2704,15 @@ static inline void mm_dec_nr_puds(struct mm_struct *mm) #endif #if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU) -static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, - unsigned long address) +static inline int __pmd_alloc(struct mm_struct *mm, + pud_t *pud, unsigned long address) +{ + return 0; +} + +static inline int __pmd_alloc_node(unsigned int nid, + struct mm_struct *mm, + pud_t *pud, unsigned long address) { return 0; } @@ -2692,6 +2722,9 @@ static inline void mm_dec_nr_pmds(struct mm_struct *mm) {} #else int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); +int __pmd_alloc_node(unsigned int nid, + struct mm_struct *mm, + pud_t *pud, unsigned long address); static inline void mm_inc_nr_pmds(struct mm_struct *mm) { @@ -2744,7 +2777,6 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd); int __pte_alloc_kernel(pmd_t *pmd); #if defined(CONFIG_MMU) - static inline p4d_t *p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { @@ -2759,11 +2791,36 @@ static inline pud_t *pud_alloc(struct mm_struct *mm, p4d_t *p4d, NULL : pud_offset(p4d, address); } -static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) +static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, + unsigned long address) { return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))? NULL: pmd_offset(pud, address); } + +static inline p4d_t *p4d_alloc_node(unsigned int nid, + struct mm_struct *mm, + pgd_t *pgd, unsigned long address) +{ + return (unlikely(pgd_none(*pgd)) && __p4d_alloc_node(nid, mm, pgd, address)) ? + NULL : p4d_offset(pgd, address); +} + +static inline pud_t *pud_alloc_node(unsigned int nid, + struct mm_struct *mm, + p4d_t *p4d, unsigned long address) +{ + return (unlikely(p4d_none(*p4d)) && __pud_alloc_node(nid, mm, p4d, address)) ? + NULL : pud_offset(p4d, address); +} + +static inline pmd_t *pmd_alloc_node(unsigned int nid, + struct mm_struct *mm, + pud_t *pud, unsigned long address) +{ + return (unlikely(pud_none(*pud)) && __pmd_alloc_node(nid, mm, pud, address)) ? 
+ NULL : pmd_offset(pud, address); +} #endif /* CONFIG_MMU */ #if USE_SPLIT_PTE_PTLOCKS @@ -3444,6 +3501,12 @@ static inline bool gup_can_follow_protnone(struct vm_area_struct *vma, typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data); extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); +#ifdef CONFIG_KERNEL_REPLICATION +extern int numa_apply_to_page_range(struct mm_struct *mm, unsigned long address, + unsigned long size, pte_fn_t fn, void *data); +#else +#define numa_apply_to_page_range apply_to_page_range +#endif extern int apply_to_existing_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); diff --git a/include/linux/numa_replication.h b/include/linux/numa_replication.h new file mode 100644 index 000000000000..f53c35c28d35 --- /dev/null +++ b/include/linux/numa_replication.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _LINUX_NUMA_REPLICATION_H +#define _LINUX_NUMA_REPLICATION_H + +#include +#include +#include +#include +#include + +#ifdef CONFIG_KERNEL_REPLICATION + +#include + +extern int closest_memory_node[MAX_NUMNODES]; + +#define per_numa_pgd(mm, nid) ((mm)->pgd_numa[nid]) +#define for_each_replica(nid) for_each_node_state(nid, N_MEMORY) + +static inline bool numa_addr_has_replica(const void *addr) +{ + return ((unsigned long)addr >= PAGE_TABLE_REPLICATION_LEFT) && + ((unsigned long)addr <= PAGE_TABLE_REPLICATION_RIGHT); +} + +bool is_text_replicated(void); +void numa_replicate_kernel_rodata(void); +void numa_setup_pgd(void); +void numa_clear_linear_addresses(void); +void __init numa_reserve_memory(void); +void __init numa_replicate_kernel(void); +void __init_or_module *numa_addr_in_replica(void *vaddr, int nid); +void numa_dump_mm_tables(struct mm_struct *mm, unsigned long start, unsigned long end); +static inline int numa_closest_memory_node(int nid) +{ + return closest_memory_node[nid]; +} + +#else + +#define per_numa_pgd(mm, nid) ((mm)->pgd) +#define for_each_replica(nid) for (nid = 0; nid < 1; nid++) + +static inline bool numa_addr_has_replica(const void *addr) +{ + return false; +} + +static inline bool is_text_replicated(void) +{ + return false; +} + +static inline void numa_replicate_kernel_rodata(void) +{ +} + +static inline void numa_setup_pgd(void) +{ +} + +static inline void numa_clear_linear_addresses(void) +{ +} + +static inline void __init numa_reserve_memory(void) +{ +} + +static inline void __init numa_replicate_kernel(void) +{ +} + +static inline void __init_or_module *numa_addr_in_replica(void *vaddr, int nid) +{ + return lm_alias(vaddr); +} + +static inline void numa_dump_mm_tables(struct mm_struct *mm, unsigned long start, unsigned long end) +{ +} + + +#endif /*CONFIG_KERNEL_REPLICATION*/ +#endif /*_LINUX_NUMA_REPLICATION_H*/ diff --git a/mm/Makefile b/mm/Makefile index 678530a07326..3ea7e3c144fd 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -139,3 +139,4 @@ obj-$(CONFIG_IO_MAPPING) += io-mapping.o obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o +obj-$(CONFIG_KERNEL_REPLICATION) += numa_replication.o diff --git a/mm/memory.c b/mm/memory.c index cdc4d4c1c858..22630bcdb0b9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -41,6 +41,7 @@ #include #include +#include #include #include #include @@ -236,7 +237,10 @@ static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, pud_t *pud; unsigned long 
next; unsigned long start; - +#ifdef CONFIG_KERNEL_REPLICATION + int nid; + int offset; +#endif start = addr; pud = pud_offset(p4d, addr); do { @@ -258,7 +262,18 @@ static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, return; pud = pud_offset(p4d, start); +#ifdef CONFIG_KERNEL_REPLICATION + if (mm_p4d_folded(tlb->mm)) { + offset = p4d - (p4d_t *)tlb->mm->pgd; + for_each_replica(nid) { + p4d_clear((p4d_t *)tlb->mm->pgd_numa[nid] + offset); + } + } else { + p4d_clear(p4d); + } +#else p4d_clear(p4d); +#endif pud_free_tlb(tlb, pud, start); mm_dec_nr_puds(tlb->mm); } @@ -270,7 +285,10 @@ static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, p4d_t *p4d; unsigned long next; unsigned long start; - +#ifdef CONFIG_KERNEL_REPLICATION + int nid; + int offset; +#endif start = addr; p4d = p4d_offset(pgd, addr); do { @@ -292,7 +310,16 @@ static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, return; p4d = p4d_offset(pgd, start); +#ifdef CONFIG_KERNEL_REPLICATION + if (!mm_p4d_folded(tlb->mm)) { + offset = pgd - (pgd_t *)tlb->mm->pgd; + for_each_replica(nid) { + pgd_clear(tlb->mm->pgd_numa[nid] + offset); + } + } +#else pgd_clear(pgd); +#endif p4d_free_tlb(tlb, p4d, start); } @@ -2766,6 +2793,60 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL_GPL(apply_to_page_range); +#ifdef CONFIG_KERNEL_REPLICATION +static int numa_apply_to_page_range_pgd(struct mm_struct *mm, + pgd_t *pgtable, unsigned long addr, + unsigned long size, pte_fn_t fn, + void *data, bool create) +{ + pgd_t *pgd; + unsigned long start = addr, next; + unsigned long end = addr + size; + pgtbl_mod_mask mask = 0; + int err = 0; + + if (WARN_ON(addr >= end)) + return -EINVAL; + + pgd = pgd_offset_pgd(pgtable, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none(*pgd) && !create) + continue; + if (WARN_ON_ONCE(pgd_leaf(*pgd))) + return -EINVAL; + if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) { + if (!create) + continue; + pgd_clear_bad(pgd); + } + err = apply_to_p4d_range(mm, pgd, addr, next, + fn, data, create, &mask); + if (err) + break; + } while (pgd++, addr = next, addr != end); + + if (mask & ARCH_PAGE_TABLE_SYNC_MASK) + arch_sync_kernel_mappings(start, start + size); + + return err; +} + +int numa_apply_to_page_range(struct mm_struct *mm, unsigned long addr, + unsigned long size, pte_fn_t fn, void *data) +{ + int nid; + int ret = 0; + + for_each_replica(nid) + ret |= numa_apply_to_page_range_pgd(mm, per_numa_pgd(mm, nid), + addr, size, fn, data, true); + + return ret; +} +EXPORT_SYMBOL_GPL(numa_apply_to_page_range); +#endif + /* * Scan a region of virtual memory, calling a provided function on * each leaf page table where it exists. 
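
For illustration only: a minimal, hypothetical caller of numa_apply_to_page_range() as added above. The helper names my_clear_rw_pte() and my_write_protect() are invented for this sketch and are not part of the series; when CONFIG_KERNEL_REPLICATION is off, the wrapper is #defined back to apply_to_page_range() in linux/mm.h, so such a caller needs no #ifdef.

	/* Callback run for every PTE in the range, in every replica (TLB
	 * flushing and error handling are omitted in this sketch). */
	static int my_clear_rw_pte(pte_t *pte, unsigned long addr, void *data)
	{
		set_pte(pte, pte_wrprotect(ptep_get(pte)));
		return 0;
	}

	static int my_write_protect(struct mm_struct *mm, unsigned long addr,
				    unsigned long size)
	{
		/* Walks per_numa_pgd(mm, nid) for every memory node, so the
		 * update is visible through each replicated page table. */
		return numa_apply_to_page_range(mm, addr, size,
						my_clear_rw_pte, NULL);
	}
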
@@ -5440,6 +5521,10 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, */ int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { +#ifdef CONFIG_KERNEL_REPLICATION + int nid; + pgd_t *target; +#endif p4d_t *new = p4d_alloc_one(mm, address); if (!new) return -ENOMEM; @@ -5447,6 +5532,42 @@ int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) spin_lock(&mm->page_table_lock); if (pgd_present(*pgd)) { /* Another has populated it */ p4d_free(mm, new); + spin_unlock(&mm->page_table_lock); + return 0; + } + smp_wmb(); /* See comment in pmd_install() */ + pgd_populate(mm, pgd, new); + +#ifdef CONFIG_KERNEL_REPLICATION + if (mm_p4d_folded(mm) || !is_text_replicated()) { + spin_unlock(&mm->page_table_lock); + return 0; + } + for_each_replica(nid) { + target = pgd_offset_pgd(mm->pgd_numa[nid], address); + if (pgd_present(*target)) + continue; + pgd_populate(mm, target, new); + } +#endif + spin_unlock(&mm->page_table_lock); + + return 0; +} + +int __p4d_alloc_node(unsigned int nid, + struct mm_struct *mm, + pgd_t *pgd, unsigned long address) +{ + p4d_t *new = p4d_alloc_one_node(nid, mm, address); + if (!new) + return -ENOMEM; + + spin_lock(&mm->page_table_lock); + if (pgd_present(*pgd)) { /* Another has populated it */ + p4d_free(mm, new); + spin_unlock(&mm->page_table_lock); + return 0; } else { smp_wmb(); /* See comment in pmd_install() */ pgd_populate(mm, pgd, new); @@ -5463,6 +5584,10 @@ int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) */ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) { +#ifdef CONFIG_KERNEL_REPLICATION + int nid; + p4d_t *target; +#endif pud_t *new = pud_alloc_one(mm, address); if (!new) return -ENOMEM; @@ -5472,9 +5597,48 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) mm_inc_nr_puds(mm); smp_wmb(); /* See comment in pmd_install() */ p4d_populate(mm, p4d, new); - } else /* Another has populated it */ + } else {/* Another has populated it */ pud_free(mm, new); + spin_unlock(&mm->page_table_lock); + return 0; + } +#ifdef CONFIG_KERNEL_REPLICATION + if (!mm_p4d_folded(mm) || !is_text_replicated()) { + spin_unlock(&mm->page_table_lock); + return 0; + } + for_each_online_node(nid) { + target = (p4d_t *)pgd_offset_pgd(mm->pgd_numa[nid], address); + if (p4d_present(*target)) + continue; + p4d_populate(mm, target, new); + } +#endif spin_unlock(&mm->page_table_lock); + + return 0; +} + +int __pud_alloc_node(unsigned int nid, + struct mm_struct *mm, + p4d_t *p4d, unsigned long address) +{ + pud_t *new = pud_alloc_one_node(nid, mm, address); + if (!new) + return -ENOMEM; + + spin_lock(&mm->page_table_lock); + if (!p4d_present(*p4d)) { + mm_inc_nr_puds(mm); + smp_wmb(); /* See comment in pmd_install() */ + p4d_populate(mm, p4d, new); + } else {/* Another has populated it */ + pud_free(mm, new); + spin_unlock(&mm->page_table_lock); + return 0; + } + spin_unlock(&mm->page_table_lock); + return 0; } #endif /* __PAGETABLE_PUD_FOLDED */ @@ -5502,6 +5666,28 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) spin_unlock(ptl); return 0; } + +int __pmd_alloc_node(unsigned int nid, + struct mm_struct *mm, + pud_t *pud, unsigned long address) +{ + spinlock_t *ptl; + pmd_t *new = pmd_alloc_one_node(nid, mm, address); + if (!new) + return -ENOMEM; + + ptl = pud_lock(mm, pud); + if (!pud_present(*pud)) { + mm_inc_nr_pmds(mm); + smp_wmb(); /* See comment in pmd_install() */ + pud_populate(mm, pud, new); + } else { /* Another has populated it */ + 
pmd_free(mm, new); + } + spin_unlock(ptl); + return 0; +} + #endif /* __PAGETABLE_PMD_FOLDED */ /** @@ -6075,3 +6261,62 @@ void ptlock_free(struct page *page) kmem_cache_free(page_ptl_cachep, page->ptl); } #endif + +/** + * Walk in replicated tranlation table specified by nid. + * If kernel replication is disabled or text is not replicated yet, + * value of nid is not used + */ +struct page *walk_to_page_node(int nid, const void *vmalloc_addr) +{ + unsigned long addr = (unsigned long)vmalloc_addr; + struct page *page = NULL; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + + if (!is_text_replicated()) + nid = 0; + + pgd = pgd_offset_pgd(per_numa_pgd(&init_mm, nid), addr); + if (pgd_none(*pgd)) + return NULL; + if (WARN_ON_ONCE(pgd_leaf(*pgd))) + return NULL; /* XXX: no allowance for huge pgd */ + if (WARN_ON_ONCE(pgd_bad(*pgd))) + return NULL; + + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) + return NULL; + if (p4d_leaf(*p4d)) + return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT); + if (WARN_ON_ONCE(p4d_bad(*p4d))) + return NULL; + + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) + return NULL; + if (pud_leaf(*pud)) + return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + if (WARN_ON_ONCE(pud_bad(*pud))) + return NULL; + + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + return NULL; + if (pmd_leaf(*pmd)) + return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + if (WARN_ON_ONCE(pmd_bad(*pmd))) + return NULL; + + ptep = pte_offset_map(pmd, addr); + pte = *ptep; + if (pte_present(pte)) + page = pte_page(pte); + pte_unmap(ptep); + + return page; +} diff --git a/mm/numa_replication.c b/mm/numa_replication.c new file mode 100644 index 000000000000..8042a9ef9781 --- /dev/null +++ b/mm/numa_replication.c @@ -0,0 +1,564 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define KERNEL_TEXT_START ((unsigned long)&_stext) +#define KERNEL_TEXT_END ((unsigned long)&_etext) + +#define KERNEL_RODATA_START ((unsigned long)&__start_rodata) +#define KERNEL_RODATA_END ((unsigned long)&__end_rodata) + +#define PMD_ALLOC_ORDER (PMD_SHIFT-PAGE_SHIFT) +#define PAGES_PER_PMD (1 << PMD_ALLOC_ORDER) + +struct numa_node_pgt { + pgd_t *pgd; + void *text_vaddr; + void *rodata_vaddr; +}; + +static struct numa_node_pgt __initdata_or_module numa_pgt[MAX_NUMNODES]; + +unsigned int master_node = -1; + +int closest_memory_node[MAX_NUMNODES]; + +struct tt_dump_config { + int pgd_extra_info:1; + int p4d_extra_info:1; + int pud_extra_info:1; + int pmd_extra_info:1; + int pte_extra_info:1; +}; + +static bool text_replicated; + +bool is_text_replicated(void) +{ + return text_replicated; +} + +static void binary_dump(unsigned long value) +{ + int i; + + for (i = BITS_PER_LONG - 1; i >= 0; i--) { + if ((BITS_PER_LONG - 1 - i) % BITS_PER_BYTE == 0) + pr_info("%-9d", i); + } + pr_info("%d\n", 0); + + for (i = BITS_PER_LONG - 1; i >= 0; i--) { + if ((BITS_PER_LONG - 1 - i) % BITS_PER_BYTE == 0) + pr_info("|"); + + pr_info("%d", (1UL << i) & value ? 
1 : 0); + } + pr_info("|"); +} + +static int pgd_callback(pgd_t *pgd, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + unsigned long val = pgd_val(*pgd); + struct tt_dump_config *c = (struct tt_dump_config *)walk->private; + + if (!val) + return 0; + + addr = addr & PGDIR_MASK; + next = (addr & PGDIR_MASK) - 1 + PGDIR_SIZE; + + pr_info("PGD ADDR: 0x%p PGD VAL: 0x%016lx [%p --- %p]\n", + pgd, val, (void *)addr, (void *)next); + + if (c->pgd_extra_info) + binary_dump(val); + + return 0; +} + +static int p4d_callback(p4d_t *p4d, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + unsigned long val = p4d_val(*p4d); + struct tt_dump_config *c = (struct tt_dump_config *)walk->private; + + if (!val) + return 0; + + addr = addr & P4D_MASK; + next = (addr & P4D_MASK) - 1 + P4D_SIZE; + + pr_info("P4D ADDR: 0x%p P4D VAL: 0x%016lx [%p --- %p]\n", + p4d, val, (void *)addr, (void *)next); + + if (c->p4d_extra_info) + binary_dump(val); + + return 0; +} + +static int pud_callback(pud_t *pud, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + unsigned long val = pud_val(*pud); + struct tt_dump_config *c = (struct tt_dump_config *)walk->private; + + if (!val) + return 0; + + addr = addr & PUD_MASK; + next = (addr & PUD_MASK) - 1 + PUD_SIZE; + + pr_info("PUD ADDR: 0x%p PUD VAL: 0x%016lx huge(%d) [%p --- %p]\n", + pud, val, pud_huge(*pud), (void *)addr, (void *)next); + + if (c->pud_extra_info) + binary_dump(val); + + return 0; +} + +static int pmd_callback(pmd_t *pmd, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + unsigned long val = pmd_val(*pmd); + unsigned long paddr = pmd_pfn(*pmd) << PAGE_SHIFT; + struct tt_dump_config *c = (struct tt_dump_config *)walk->private; + + if (!val) + return 0; + + addr = addr & PMD_MASK; + next = (addr & PMD_MASK) - 1 + PMD_SIZE; + + pr_info("PMD ADDR: 0x%p PMD VAL: 0x%016lx huge(%d) [%p --- %p] to %p\n", + pmd, val, pmd_huge(*pmd), (void *)addr, (void *)next, (void *)paddr); + + if (c->pmd_extra_info) + binary_dump(val); + + return 0; +} + +static int pte_callback(pte_t *pte, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + unsigned long val = pte_val(*pte); + unsigned long paddr = pte_pfn(*pte) << PAGE_SHIFT; + struct tt_dump_config *c = (struct tt_dump_config *)walk->private; + + if (!val) + return 0; + + addr = addr & PAGE_MASK; + next = (addr & PAGE_MASK) - 1 + PAGE_SIZE; + + pr_info("PTE ADDR: 0x%p PTE VAL: 0x%016lx [%p --- %p] to %p\n", + pte, val, (void *)addr, (void *)next, (void *)paddr); + + if (c->pte_extra_info) + binary_dump(val); + + return 0; +} + +static int pte_hole_callback(unsigned long addr, unsigned long next, + int depth, struct mm_walk *walk) +{ + pr_info("%*chole\n", depth * 2, ' '); + + return 0; +} + +void numa_dump_mm_tables(struct mm_struct *mm, unsigned long start, unsigned long end) +{ + int nid = 0; + struct tt_dump_config conf = { + .pgd_extra_info = 0, + .p4d_extra_info = 0, + .pud_extra_info = 0, + .pmd_extra_info = 0, + .pte_extra_info = 0, + }; + + const struct mm_walk_ops ops = { + .pgd_entry = pgd_callback, + .p4d_entry = p4d_callback, + .pud_entry = pud_callback, + .pmd_entry = pmd_callback, + .pte_entry = pte_callback, + .pte_hole = pte_hole_callback + }; + + start = start & PAGE_MASK; + end = (end & PAGE_MASK) - 1 + PAGE_SIZE; + + pr_info("----------PER-NUMA NODE KERNEL REPLICATION ENABLED----------\n"); + mmap_read_lock(mm); + for_each_replica(nid) { + pr_info("NUMA node id #%d\n", nid); + pr_info("PGD: %p PGD phys: %p\n", + 
mm->pgd_numa[nid], (void *)virt_to_phys(mm->pgd_numa[nid])); + walk_page_range_novma(mm, start, end, &ops, mm->pgd_numa[nid], &conf); + } + mmap_read_unlock(mm); + pr_info("----------PER-NUMA NODE KERNEL REPLICATION ENABLED----------\n"); +} + +static void numa_dump_tt(unsigned long start, unsigned long end) +{ + numa_dump_mm_tables(&init_mm, start, end); +} + +DEFINE_SPINLOCK(numa_sysfs_lock); +struct dump_data { + char *buf; + ssize_t offset; +}; + +static void cpu_dump(void *info) +{ + struct dump_data *data = (struct dump_data *) info; + ssize_t offset; + + spin_lock(&numa_sysfs_lock); + + offset = READ_ONCE(data->offset); + + offset += str_cpu_dump(data->buf + offset); + + WRITE_ONCE(data->offset, offset); + + spin_unlock(&numa_sysfs_lock); +} + +static ssize_t sysfs_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct dump_data data = { + .buf = buf, + .offset = 0 + }; + numa_dump_tt(KERNEL_TEXT_START, KERNEL_RODATA_END - 1); + on_each_cpu(cpu_dump, &data, 1); + + return data.offset; +} + +static struct kobj_attribute etx_attr = __ATTR(numa_replication_dump, 0440, sysfs_show, NULL); + +static void numa_replication_sysfs_init(void) +{ + if (sysfs_create_file(mm_kobj, &etx_attr.attr)) + pr_info("Unable to create sysfs entry for numa replication\n"); +} + + +static void copy_pages_and_flush(struct page *to, struct page *from, size_t nr_pages) +{ + while (nr_pages--) { + copy_page(page_address(to), page_address(from)); + flush_dcache_page(to); + to++; + from++; + } +} + +static void replicate_pages(struct page *pages, int nid, + unsigned long start, unsigned long end, + unsigned long nr_pages) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pgprot_t prot; + unsigned int nr_pmd = 0; + + copy_pages_and_flush(pages, virt_to_page(lm_alias(start)), nr_pages); + + for (unsigned long vaddr = start; vaddr < end; vaddr += PMD_SIZE, nr_pmd++) { + + pgd = pgd_offset_pgd(numa_pgt[nid].pgd, vaddr); + p4d = p4d_offset(pgd, vaddr); + pud = pud_offset(p4d, vaddr); + pmd = pmd_offset(pud, vaddr); + + prot = pmd_pgprot(*pmd); + + set_pmd(pmd, pfn_pmd(page_to_pfn(pages) + nr_pmd * PAGES_PER_PMD, prot)); + } +} + +static void __init replicate_kernel_text(int nid) +{ + unsigned long nr_pages = (KERNEL_TEXT_END - KERNEL_TEXT_START) / PAGE_SIZE; + + replicate_pages(virt_to_page(numa_pgt[nid].text_vaddr), nid, + KERNEL_TEXT_START, KERNEL_TEXT_END, nr_pages); +} + +static void replicate_kernel_rodata(int nid) +{ + unsigned long nr_pages = (KERNEL_RODATA_END - KERNEL_RODATA_START) / PAGE_SIZE; + + replicate_pages(virt_to_page(numa_pgt[nid].rodata_vaddr), nid, + KERNEL_RODATA_START, KERNEL_RODATA_END, nr_pages); +} + +//'-1' in next functions have only one purpose - prevent unsgined long overflow +static void replicate_pgt_pmd(p4d_t *dst, p4d_t *src, + unsigned long start, unsigned long end, + unsigned int nid) +{ + unsigned long left = start & PUD_MASK; + unsigned long right = (end & PUD_MASK) - 1 + PUD_SIZE; + + pud_t *clone_pud = pud_offset(dst, left); + pud_t *orig_pud = pud_offset(src, left); + + for (unsigned long addr = left; (addr >= left && addr < right); addr += PUD_SIZE) { + pmd_t *new_pmd; + + if (pud_none(*orig_pud) || pud_huge(*orig_pud)) { + clone_pud++; + orig_pud++; + continue; + } + + pud_clear(clone_pud); + new_pmd = pmd_alloc_node(nid, &init_mm, clone_pud, addr); + BUG_ON(new_pmd == NULL); + + copy_page(pud_pgtable(*clone_pud), pud_pgtable(*orig_pud)); + + clone_pud++; + orig_pud++; + } +} + +static void replicate_pgt_pud(pgd_t *dst, pgd_t *src, + 
unsigned long start, unsigned long end, + unsigned int nid) +{ + unsigned long left = start & P4D_MASK; + unsigned long right = (end & P4D_MASK) - 1 + P4D_SIZE; + + p4d_t *clone_p4d = p4d_offset(dst, left); + p4d_t *orig_p4d = p4d_offset(src, left); + + for (unsigned long addr = left; (addr >= left && addr < right); addr += P4D_SIZE) { + pud_t *new_pud; + + if (p4d_none(*orig_p4d) || p4d_huge(*orig_p4d)) { + clone_p4d++; + orig_p4d++; + continue; + } + + p4d_clear(clone_p4d); + new_pud = pud_alloc_node(nid, &init_mm, clone_p4d, addr); + BUG_ON(new_pud == NULL); + + copy_page(p4d_pgtable(*clone_p4d), p4d_pgtable(*orig_p4d)); + /* start and end passed to the next function must be in range of p4ds, + * so min and max are used here + */ + replicate_pgt_pmd(clone_p4d, orig_p4d, max(addr, start), + min(addr - 1 + P4D_SIZE, end), nid); + + clone_p4d++; + orig_p4d++; + } +} + +static void replicate_pgt_p4d(pgd_t *dst, pgd_t *src, + unsigned long start, unsigned long end, + unsigned int nid) +{ + unsigned long left = start & PGDIR_MASK; + unsigned long right = (end & PGDIR_MASK) - 1 + PGDIR_SIZE; + + pgd_t *clone_pgd = pgd_offset_pgd(dst, left); + pgd_t *orig_pgd = pgd_offset_pgd(src, left); + + for (unsigned long addr = left; (addr >= left && addr < right); addr += PGDIR_SIZE) { + p4d_t *new_p4d; + + /* TODO: remove last condition and do something better + * In the case of a folded P4D level, pgd_none and pgd_huge + * always return 0, so we might start to replicate empty entries. + * We obviously want to avoid this, so the last check is performed here. + */ + if (pgd_none(*orig_pgd) || pgd_huge(*orig_pgd) || + (unsigned long)(orig_pgd->pgd) == 0) { + clone_pgd++; + orig_pgd++; + continue; + } + + pgd_clear(clone_pgd); + new_p4d = p4d_alloc_node(nid, &init_mm, clone_pgd, addr); + BUG_ON(new_p4d == NULL); + + copy_page((void *)pgd_page_vaddr(*clone_pgd), (void *)pgd_page_vaddr(*orig_pgd)); + replicate_pgt_pud(clone_pgd, orig_pgd, max(addr, start), + min(addr - 1 + PGDIR_SIZE, end), nid); + + clone_pgd++; + orig_pgd++; + } +} + +static void replicate_pgt(int nid, unsigned long start, unsigned long end) +{ + replicate_pgt_p4d(numa_pgt[nid].pgd, init_mm.pgd, start, end, nid); +} + +static void replicate_pagetables(void) +{ + int nid; + + for_each_replica(nid) { + numa_pgt[nid].pgd = numa_replicate_pgt_pgd(nid); + + replicate_pgt(nid, PAGE_TABLE_REPLICATION_LEFT, + PAGE_TABLE_REPLICATION_RIGHT); + + } + + for_each_online_node(nid) { + init_mm.pgd_numa[nid] = numa_pgt[closest_memory_node[nid]].pgd; + } +} + +void __init numa_replicate_kernel(void) +{ + int nid; + + replicate_pagetables(); + + for_each_replica(nid) { + if (nid == master_node) + continue; + replicate_kernel_text(nid); + } + + text_replicated = true; + numa_setup_pgd(); +} + +void numa_replicate_kernel_rodata(void) +{ + int nid; + + for_each_replica(nid) { + if (nid == master_node) + continue; + replicate_kernel_rodata(nid); + } + + flush_tlb_all(); + pr_info("Replicated page table : [%p --- %p]\n", (void *)PAGE_TABLE_REPLICATION_LEFT, + (void *)PAGE_TABLE_REPLICATION_RIGHT); + + numa_replication_sysfs_init(); + numa_dump_tt(KERNEL_TEXT_START, KERNEL_RODATA_END - 1); +} + +void numa_setup_pgd(void) +{ + /* switch away from the initial page table */ + load_replicated_pgd(init_mm.pgd_numa[numa_node_id()]); +} + +void __init_or_module *numa_addr_in_replica(void *vaddr, int nid) +{ + unsigned long addr = (unsigned long)vaddr; + unsigned long offset = addr - KERNEL_TEXT_START; + + BUG_ON(addr < KERNEL_TEXT_START || addr >= KERNEL_TEXT_END); + 
BUG_ON(numa_pgt[nid].text_vaddr == NULL); + BUG_ON(closest_memory_node[nid] != nid); + + return numa_pgt[nid].text_vaddr + offset; +} + +void numa_clear_linear_addresses(void) +{ + int nid; + + for_each_replica(nid) { + numa_pgt[nid].text_vaddr = NULL; + numa_pgt[nid].rodata_vaddr = NULL; + } +} + +static void numa_find_closest_memory_nodes(void) +{ + int nid; + + for_each_online_node(nid) { + int new_node; + int min_dist = INT_MAX; + int found_node = nid; + + for_each_node_state(new_node, N_MEMORY) { + int new_dist = node_distance(nid, new_node); + + if (new_dist < min_dist) { + found_node = new_node; + min_dist = new_dist; + } + } + closest_memory_node[nid] = found_node; + + pr_info("For node %d closest - %d\n", nid, found_node); + } +} + +void __init numa_reserve_memory(void) +{ + int nid; + + for_each_replica(nid) + pr_info("Memory node: %d\n", nid); + + numa_find_closest_memory_nodes(); + master_node = page_to_nid(virt_to_page(lm_alias((void *)KERNEL_TEXT_START))); + + pr_info("Master Node: #%d\n", master_node); + for_each_replica(nid) { + if (nid == master_node) { + numa_pgt[nid].text_vaddr = lm_alias((void *)KERNEL_TEXT_START); + numa_pgt[nid].rodata_vaddr = lm_alias((void *)KERNEL_RODATA_START); + } else { + numa_pgt[nid].text_vaddr = memblock_alloc_try_nid( + (KERNEL_TEXT_END - KERNEL_TEXT_START), + HPAGE_SIZE, 0, MEMBLOCK_ALLOC_ANYWHERE, nid); + + numa_pgt[nid].rodata_vaddr = memblock_alloc_try_nid( + (KERNEL_RODATA_END - KERNEL_RODATA_START), + HPAGE_SIZE, 0, MEMBLOCK_ALLOC_ANYWHERE, nid); + } + + BUG_ON(numa_pgt[nid].text_vaddr == NULL); + BUG_ON(numa_pgt[nid].rodata_vaddr == NULL); + } +} + From patchwork Thu Dec 28 13:10:48 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Artem Kuzin X-Patchwork-Id: 13506070 Received: from frasgout.his.huawei.com (frasgout.his.huawei.com [185.176.79.56]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 142B58479 for ; Thu, 28 Dec 2023 13:32:48 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=huawei.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=huawei.com Received: from mail.maildlp.com (unknown [172.18.186.216]) by frasgout.his.huawei.com (SkyGuard) with ESMTP id 4T185h4Fgwz6J9yq; Thu, 28 Dec 2023 21:11:04 +0800 (CST) Received: from lhrpeml500001.china.huawei.com (unknown [7.191.163.213]) by mail.maildlp.com (Postfix) with ESMTPS id AA1D2140DD5; Thu, 28 Dec 2023 21:12:47 +0800 (CST) Received: from mscphis00060.huawei.com (10.123.65.147) by lhrpeml500001.china.huawei.com (7.191.163.213) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2507.35; Thu, 28 Dec 2023 13:12:45 +0000 From: To: , , , , , , , , , , , , , CC: , , , , , , , , , , Subject: [PATCH RFC 04/12] x86: add support of memory protection for NUMA replicas Date: Thu, 28 Dec 2023 21:10:48 +0800 Message-ID: <20231228131056.602411-5-artem.kuzin@huawei.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20231228131056.602411-1-artem.kuzin@huawei.com> References: <20231228131056.602411-1-artem.kuzin@huawei.com> Precedence: bulk X-Mailing-List: linux-modules@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-ClientProxiedBy: mscpeml500004.china.huawei.com (7.188.26.250) To lhrpeml500001.china.huawei.com (7.191.163.213) From: Artem Kuzin 
Co-developed-by: Nikita Panov Signed-off-by: Nikita Panov Co-developed-by: Alexander Grubnikov Signed-off-by: Alexander Grubnikov Signed-off-by: Artem Kuzin --- arch/x86/include/asm/set_memory.h | 14 +++ arch/x86/mm/pat/set_memory.c | 150 +++++++++++++++++++++++++++++- include/asm-generic/set_memory.h | 12 +++ include/linux/set_memory.h | 10 ++ 4 files changed, 185 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index a5e89641bd2d..1efa15a08ef0 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -7,7 +7,9 @@ #include #define set_memory_rox set_memory_rox +#define numa_set_memory_rox numa_set_memory_rox int set_memory_rox(unsigned long addr, int numpages); +int numa_set_memory_rox(unsigned long addr, int numpages); /* * The set_memory_* API can be used to change various attributes of a virtual @@ -58,6 +60,18 @@ int set_pages_array_uc(struct page **pages, int addrinarray); int set_pages_array_wc(struct page **pages, int addrinarray); int set_pages_array_wb(struct page **pages, int addrinarray); +#ifdef CONFIG_KERNEL_REPLICATION +int numa_set_memory_np(unsigned long addr, int numpages); +int numa_set_memory_np_noalias(unsigned long addr, int numpages); +int numa_set_memory_global(unsigned long addr, int numpages); +int numa_set_memory_nonglobal(unsigned long addr, int numpages); +#else +#define numa_set_memory_np set_memory_np +#define numa_set_memory_np_noalias set_memory_np_noalias +#define numa_set_memory_global set_memory_global +#define numa_set_memory_nonglobal set_memory_nonglobal +#endif /* CONFIG_KERNEL_REPLICATION */ + /* * For legacy compatibility with the old APIs, a few functions * are provided that work on a "struct page". diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index df4182b6449f..ceba209ee653 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -1790,7 +1791,7 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary) return ret; } -static int change_page_attr_set_clr(unsigned long *addr, int numpages, +static int change_page_attr_set_clr_pgd(pgd_t *pgd, unsigned long *addr, int numpages, pgprot_t mask_set, pgprot_t mask_clr, int force_split, int in_flag, struct page **pages) @@ -1845,6 +1846,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, cpa.flags = in_flag; cpa.curpage = 0; cpa.force_split = force_split; + cpa.pgd = pgd; ret = __change_page_attr_set_clr(&cpa, 1); @@ -1873,6 +1875,15 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, return ret; } +static int change_page_attr_set_clr(unsigned long *addr, int numpages, + pgprot_t mask_set, pgprot_t mask_clr, + int force_split, int in_flag, + struct page **pages) +{ + return change_page_attr_set_clr_pgd(NULL, addr, numpages, mask_set, + mask_clr, force_split, in_flag, pages); +} + static inline int change_page_attr_set(unsigned long *addr, int numpages, pgprot_t mask, int array) { @@ -1880,6 +1891,13 @@ static inline int change_page_attr_set(unsigned long *addr, int numpages, (array ? CPA_ARRAY : 0), NULL); } +static inline int change_page_attr_set_pgd(pgd_t *pgd, unsigned long *addr, int numpages, + pgprot_t mask, int array) +{ + return change_page_attr_set_clr_pgd(pgd, addr, numpages, mask, __pgprot(0), 0, + (array ? 
CPA_ARRAY : 0), NULL); +} + static inline int change_page_attr_clear(unsigned long *addr, int numpages, pgprot_t mask, int array) { @@ -1887,6 +1905,13 @@ static inline int change_page_attr_clear(unsigned long *addr, int numpages, (array ? CPA_ARRAY : 0), NULL); } +static inline int change_page_attr_clear_pgd(pgd_t *pgd, unsigned long *addr, int numpages, + pgprot_t mask, int array) +{ + return change_page_attr_set_clr_pgd(pgd, addr, numpages, __pgprot(0), mask, 0, + (array ? CPA_ARRAY : 0), NULL); +} + static inline int cpa_set_pages_array(struct page **pages, int numpages, pgprot_t mask) { @@ -2122,6 +2147,129 @@ int set_memory_global(unsigned long addr, int numpages) __pgprot(_PAGE_GLOBAL), 0); } +#ifdef CONFIG_KERNEL_REPLICATION +int numa_set_memory_x(unsigned long addr, int numpages) +{ + int ret = 0; + int nid; + + if (!(__supported_pte_mask & _PAGE_NX)) + return 0; + for_each_replica(nid) + ret |= change_page_attr_clear_pgd(init_mm.pgd_numa[nid], &addr, numpages, + __pgprot(_PAGE_NX), 0); + + return ret; +} + +int numa_set_memory_nx(unsigned long addr, int numpages) +{ + int ret = 0; + int nid; + + if (!(__supported_pte_mask & _PAGE_NX)) + return 0; + for_each_replica(nid) + ret |= change_page_attr_set_pgd(init_mm.pgd_numa[nid], &addr, numpages, + __pgprot(_PAGE_NX), 0); + + return ret; +} + +int numa_set_memory_ro(unsigned long addr, int numpages) +{ + int ret = 0; + int nid; + + for_each_replica(nid) + ret |= change_page_attr_clear_pgd(init_mm.pgd_numa[nid], &addr, numpages, + __pgprot(_PAGE_RW), 0); + + return ret; +} + +int numa_set_memory_rox(unsigned long addr, int numpages) +{ + int nid; + + int ret = 0; + pgprot_t clr = __pgprot(_PAGE_RW); + + if (__supported_pte_mask & _PAGE_NX) + clr.pgprot |= _PAGE_NX; + + for_each_online_node(nid) { + ret |= change_page_attr_clear_pgd(init_mm.pgd_numa[nid], &addr, numpages, clr, 0); + if (!is_text_replicated()) + break; + } + return ret; +} + +int numa_set_memory_rw(unsigned long addr, int numpages) +{ + int ret = 0; + int nid; + + for_each_replica(nid) + ret |= change_page_attr_set_pgd(init_mm.pgd_numa[nid], &addr, numpages, + __pgprot(_PAGE_RW), 0); + + return ret; +} + +int numa_set_memory_np(unsigned long addr, int numpages) +{ + int ret = 0; + int nid; + + for_each_replica(nid) + ret |= change_page_attr_clear_pgd(init_mm.pgd_numa[nid], &addr, numpages, + __pgprot(_PAGE_PRESENT), 0); + + return ret; +} + +int numa_set_memory_np_noalias(unsigned long addr, int numpages) +{ + int ret = 0; + int nid; + int cpa_flags = CPA_NO_CHECK_ALIAS; + + for_each_replica(nid) + ret |= change_page_attr_set_clr_pgd(init_mm.pgd_numa[nid], &addr, numpages, + __pgprot(0), + __pgprot(_PAGE_PRESENT), 0, + cpa_flags, NULL); + + return ret; +} + +int numa_set_memory_global(unsigned long addr, int numpages) +{ + int ret = 0; + int nid; + + for_each_replica(nid) + ret |= change_page_attr_set_pgd(init_mm.pgd_numa[nid], &addr, numpages, + __pgprot(_PAGE_GLOBAL), 0); + + return ret; +} + +int numa_set_memory_nonglobal(unsigned long addr, int numpages) +{ + int ret = 0; + int nid; + + for_each_replica(nid) + ret |= change_page_attr_clear_pgd(init_mm.pgd_numa[nid], &addr, numpages, + __pgprot(_PAGE_GLOBAL), 0); + + return ret; +} +#endif + /* * __set_memory_enc_pgtable() is used for the hypervisors that get * informed about "encryption" status via page tables. 
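
A minimal, hypothetical user of the numa_set_memory_*() family added above (my_protect_rodata() and its arguments are made up for illustration; the in-tree conversions are done in patch 05). When CONFIG_KERNEL_REPLICATION is disabled, the header hunks in this patch map these names back to the plain set_memory_*() helpers.

	/* Hypothetical caller: write-protect a replicated kernel range. */
	static int my_protect_rodata(unsigned long start, unsigned long end)
	{
		int numpages = (end - start) >> PAGE_SHIFT;

		/* Updates the mapping in every per-node PGD
		 * (init_mm.pgd_numa[nid]), not only the primary one. */
		return numa_set_memory_ro(start, numpages);
	}
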
diff --git a/include/asm-generic/set_memory.h b/include/asm-generic/set_memory.h index c86abf6bc7ba..886639600e64 100644 --- a/include/asm-generic/set_memory.h +++ b/include/asm-generic/set_memory.h @@ -10,4 +10,16 @@ int set_memory_rw(unsigned long addr, int numpages); int set_memory_x(unsigned long addr, int numpages); int set_memory_nx(unsigned long addr, int numpages); +#ifdef CONFIG_KERNEL_REPLICATION +int numa_set_memory_ro(unsigned long addr, int numpages); +int numa_set_memory_rw(unsigned long addr, int numpages); +int numa_set_memory_x(unsigned long addr, int numpages); +int numa_set_memory_nx(unsigned long addr, int numpages); +#else +#define numa_set_memory_ro set_memory_ro +#define numa_set_memory_rw set_memory_rw +#define numa_set_memory_x set_memory_x +#define numa_set_memory_nx set_memory_nx +#endif /* CONFIG_KERNEL_REPLICATION */ + #endif diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h index 95ac8398ee72..3213bfd335dd 100644 --- a/include/linux/set_memory.h +++ b/include/linux/set_memory.h @@ -24,6 +24,16 @@ static inline int set_memory_rox(unsigned long addr, int numpages) } #endif +#ifndef numa_set_memory_rox +static inline int numa_set_memory_rox(unsigned long addr, int numpages) +{ + int ret = numa_set_memory_ro(addr, numpages); + if (ret) + return ret; + return numa_set_memory_x(addr, numpages); +} +#endif + #ifndef CONFIG_ARCH_HAS_SET_DIRECT_MAP static inline int set_direct_map_invalid_noflush(struct page *page) { From patchwork Thu Dec 28 13:10:49 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Artem Kuzin X-Patchwork-Id: 13506046 Received: from frasgout.his.huawei.com (frasgout.his.huawei.com [185.176.79.56]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 82F4E79C1 for ; Thu, 28 Dec 2023 13:12:51 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=huawei.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=huawei.com Received: from mail.maildlp.com (unknown [172.18.186.31]) by frasgout.his.huawei.com (SkyGuard) with ESMTP id 4T186Q1JlJz6K5yJ; Thu, 28 Dec 2023 21:11:42 +0800 (CST) Received: from lhrpeml500001.china.huawei.com (unknown [7.191.163.213]) by mail.maildlp.com (Postfix) with ESMTPS id 4B6ED1408FF; Thu, 28 Dec 2023 21:12:49 +0800 (CST) Received: from mscphis00060.huawei.com (10.123.65.147) by lhrpeml500001.china.huawei.com (7.191.163.213) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2507.35; Thu, 28 Dec 2023 13:12:47 +0000 From: To: , , , , , , , , , , , , , CC: , , , , , , , , , , Subject: [PATCH RFC 05/12] x86: enable memory protection for replicated memory Date: Thu, 28 Dec 2023 21:10:49 +0800 Message-ID: <20231228131056.602411-6-artem.kuzin@huawei.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20231228131056.602411-1-artem.kuzin@huawei.com> References: <20231228131056.602411-1-artem.kuzin@huawei.com> Precedence: bulk X-Mailing-List: linux-modules@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-ClientProxiedBy: mscpeml500004.china.huawei.com (7.188.26.250) To lhrpeml500001.china.huawei.com (7.191.163.213) From: Artem Kuzin Co-developed-by: Nikita Panov Signed-off-by: Nikita Panov Co-developed-by: Alexander Grubnikov Signed-off-by: Alexander Grubnikov Signed-off-by: Artem Kuzin --- 
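
The commit message carries no body text; the change is mechanical, so here is the pattern it applies (taken from the alloc_insn_page() hunk in the diff below; the other three files are converted the same way):

	/* before: attribute change applied only to the primary kernel
	 * page table */
	set_memory_rox((unsigned long)page, 1);

	/* after: attribute change applied to every per-NUMA-node replica,
	 * using the helpers introduced in the previous patch */
	numa_set_memory_rox((unsigned long)page, 1);
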
arch/x86/kernel/kprobes/core.c | 2 +- arch/x86/mm/init.c | 8 ++++---- arch/x86/mm/init_64.c | 4 ++-- arch/x86/mm/pti.c | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index f7f6042eb7e6..0fb29a4855fe 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -422,7 +422,7 @@ void *alloc_insn_page(void) * TODO: Once additional kernel code protection mechanisms are set, ensure * that the page was not maliciously altered and it is still zeroed. */ - set_memory_rox((unsigned long)page, 1); + numa_set_memory_rox((unsigned long)page, 1); return page; } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 8192452d1d2d..f797e194bfb0 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -927,15 +927,15 @@ void free_init_pages(const char *what, unsigned long begin, unsigned long end) * corresponding pages will be unmapped. */ kmemleak_free_part((void *)begin, end - begin); - set_memory_np(begin, (end - begin) >> PAGE_SHIFT); + numa_set_memory_np(begin, (end - begin) >> PAGE_SHIFT); } else { /* * We just marked the kernel text read only above, now that * we are going to free part of that, we need to make that * writeable and non-executable first. */ - set_memory_nx(begin, (end - begin) >> PAGE_SHIFT); - set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); + numa_set_memory_nx(begin, (end - begin) >> PAGE_SHIFT); + numa_set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); free_reserved_area((void *)begin, (void *)end, POISON_FREE_INITMEM, what); @@ -971,7 +971,7 @@ void free_kernel_image_pages(const char *what, void *begin, void *end) * which can't be treated in this way for obvious reasons. */ if (IS_ENABLED(CONFIG_X86_64) && cpu_feature_enabled(X86_FEATURE_PTI)) - set_memory_np_noalias(begin_ul, len_pages); + numa_set_memory_np_noalias(begin_ul, len_pages); } void __ref free_initmem(void) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index a190aae8ceaf..98cb7f5f2863 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1379,7 +1379,7 @@ void mark_rodata_ro(void) printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); - set_memory_ro(start, (end - start) >> PAGE_SHIFT); + numa_set_memory_ro(start, (end - start) >> PAGE_SHIFT); kernel_set_to_readonly = 1; @@ -1396,7 +1396,7 @@ void mark_rodata_ro(void) * has been zapped already via cleanup_highmem(). 
*/ all_end = roundup((unsigned long)_brk_end, PMD_SIZE); - set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT); + numa_set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT); set_ftrace_ops_ro(); diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 78414c6d1b5e..23f30edf71b3 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -580,7 +580,7 @@ static void pti_clone_kernel_text(void) */ /* Set the global bit for normal non-__init kernel text: */ - set_memory_global(start, (end_global - start) >> PAGE_SHIFT); + numa_set_memory_global(start, (end_global - start) >> PAGE_SHIFT); } static void pti_set_kernel_image_nonglobal(void) From patchwork Thu Dec 28 13:10:50 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Artem Kuzin X-Patchwork-Id: 13506047 Received: from frasgout.his.huawei.com (frasgout.his.huawei.com [185.176.79.56]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 7537379C6 for ; Thu, 28 Dec 2023 13:12:53 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=huawei.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=huawei.com Received: from mail.maildlp.com (unknown [172.18.186.231]) by frasgout.his.huawei.com (SkyGuard) with ESMTP id 4T186R6nnjz6K5lK; Thu, 28 Dec 2023 21:11:43 +0800 (CST) Received: from lhrpeml500001.china.huawei.com (unknown [7.191.163.213]) by mail.maildlp.com (Postfix) with ESMTPS id 07A2C1400F4; Thu, 28 Dec 2023 21:12:51 +0800 (CST) Received: from mscphis00060.huawei.com (10.123.65.147) by lhrpeml500001.china.huawei.com (7.191.163.213) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2507.35; Thu, 28 Dec 2023 13:12:49 +0000 From: To: , , , , , , , , , , , , , CC: , , , , , , , , , , Subject: [PATCH RFC 06/12] x86: align kernel text and rodata using HUGE_PAGE boundary Date: Thu, 28 Dec 2023 21:10:50 +0800 Message-ID: <20231228131056.602411-7-artem.kuzin@huawei.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20231228131056.602411-1-artem.kuzin@huawei.com> References: <20231228131056.602411-1-artem.kuzin@huawei.com> Precedence: bulk X-Mailing-List: linux-modules@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-ClientProxiedBy: mscpeml500004.china.huawei.com (7.188.26.250) To lhrpeml500001.china.huawei.com (7.191.163.213) From: Artem Kuzin Co-developed-by: Nikita Panov Signed-off-by: Nikita Panov Co-developed-by: Alexander Grubnikov Signed-off-by: Alexander Grubnikov Signed-off-by: Artem Kuzin --- arch/x86/kernel/vmlinux.lds.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index f15fb71f280e..3841293e7aad 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -159,11 +159,11 @@ SECTIONS } :text = 0xcccccccc /* End of text section, which should occupy whole number of pages */ + . = ALIGN(HPAGE_SIZE); //For kernel replication _etext = .; - . 
= ALIGN(PAGE_SIZE); X86_ALIGN_RODATA_BEGIN - RO_DATA(PAGE_SIZE) + RO_DATA(HPAGE_SIZE) X86_ALIGN_RODATA_END /* Data */ From patchwork Thu Dec 28 13:10:51 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Artem Kuzin X-Patchwork-Id: 13506048 Received: from frasgout.his.huawei.com (frasgout.his.huawei.com [185.176.79.56]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 2081279C8 for ; Thu, 28 Dec 2023 13:12:54 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=huawei.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=huawei.com Received: from mail.maildlp.com (unknown [172.18.186.231]) by frasgout.his.huawei.com (SkyGuard) with ESMTP id 4T185805Mhz67cTF; Thu, 28 Dec 2023 21:10:36 +0800 (CST) Received: from lhrpeml500001.china.huawei.com (unknown [7.191.163.213]) by mail.maildlp.com (Postfix) with ESMTPS id D47711408F9; Thu, 28 Dec 2023 21:12:52 +0800 (CST) Received: from mscphis00060.huawei.com (10.123.65.147) by lhrpeml500001.china.huawei.com (7.191.163.213) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2507.35; Thu, 28 Dec 2023 13:12:50 +0000 From: To: , , , , , , , , , , , , , CC: , , , , , , , , , , Subject: [PATCH RFC 07/12] x86: enable per-NUMA node kernel text and rodata replication Date: Thu, 28 Dec 2023 21:10:51 +0800 Message-ID: <20231228131056.602411-8-artem.kuzin@huawei.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20231228131056.602411-1-artem.kuzin@huawei.com> References: <20231228131056.602411-1-artem.kuzin@huawei.com> Precedence: bulk X-Mailing-List: linux-modules@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-ClientProxiedBy: mscpeml500004.china.huawei.com (7.188.26.250) To lhrpeml500001.china.huawei.com (7.191.163.213) From: Artem Kuzin Co-developed-by: Nikita Panov Signed-off-by: Nikita Panov Co-developed-by: Alexander Grubnikov Signed-off-by: Alexander Grubnikov Signed-off-by: Artem Kuzin --- arch/x86/kernel/smpboot.c | 2 + arch/x86/mm/dump_pagetables.c | 9 +++++ arch/x86/mm/fault.c | 4 +- arch/x86/mm/pgtable.c | 76 ++++++++++++++++++++++++----------- arch/x86/mm/tlb.c | 30 +++++++++++--- init/main.c | 5 +++ 6 files changed, 97 insertions(+), 29 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 747b83a373a2..d2a852ba1bcf 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include @@ -244,6 +245,7 @@ static void notrace start_secondary(void *unused) * limit the things done here to the most necessary things. */ cr4_init(); + numa_setup_pgd(); /* * 32-bit specific. 
64-bit reaches this code with the correct page diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index e1b599ecbbc2..5a2e36c9468a 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -433,7 +434,15 @@ void ptdump_walk_user_pgd_level_checkwx(void) void ptdump_walk_pgd_level_checkwx(void) { +#ifdef CONFIG_KERNEL_REPLICATION + int node; + + for_each_replica(node) + ptdump_walk_pgd_level_core(NULL, &init_mm, + per_numa_pgd(&init_mm, node), true, false); +#else ptdump_walk_pgd_level_core(NULL, &init_mm, INIT_PGD, true, false); +#endif } static int __init pt_dump_init(void) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e8711b2cafaf..d76e072dd028 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -20,6 +20,7 @@ #include /* efi_crash_gracefully_on_page_fault()*/ #include #include /* find_and_lock_vma() */ +#include #include /* boot_cpu_has, ... */ #include /* dotraplinkage, ... */ @@ -1031,7 +1032,8 @@ spurious_kernel_fault(unsigned long error_code, unsigned long address) error_code != (X86_PF_INSTR | X86_PF_PROT)) return 0; - pgd = init_mm.pgd + pgd_index(address); + pgd = per_numa_pgd(&init_mm, numa_node_id()); + if (!pgd_present(*pgd)) return 0; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 15a8009a4480..4c905fe0b84f 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -120,23 +121,25 @@ struct mm_struct *pgd_page_get_mm(struct page *page) return page->pt_mm; } -static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) +static void pgd_ctor(struct mm_struct *mm, int nid) { + pgd_t *dst_pgd = per_numa_pgd(mm, nid); + pgd_t *src_pgd = per_numa_pgd(&init_mm, nid); /* If the pgd points to a shared pagetable level (either the ptes in non-PAE, or shared PMD in PAE), then just copy the references from swapper_pg_dir. 
*/ if (CONFIG_PGTABLE_LEVELS == 2 || (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || CONFIG_PGTABLE_LEVELS >= 4) { - clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, - swapper_pg_dir + KERNEL_PGD_BOUNDARY, + clone_pgd_range(dst_pgd + KERNEL_PGD_BOUNDARY, + src_pgd + KERNEL_PGD_BOUNDARY, KERNEL_PGD_PTRS); } /* list required to sync kernel mapping updates */ if (!SHARED_KERNEL_PMD) { - pgd_set_mm(pgd, mm); - pgd_list_add(pgd); + pgd_set_mm(dst_pgd, mm); + pgd_list_add(dst_pgd); } } @@ -416,20 +419,33 @@ static inline void _pgd_free(pgd_t *pgd) { free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); } + +#ifdef CONFIG_KERNEL_REPLICATION +static inline pgd_t *_pgd_alloc_node(int nid) +{ + struct page *pages; + + pages = __alloc_pages_node(nid, GFP_PGTABLE_USER, + PGD_ALLOCATION_ORDER); + return (pgd_t *)page_address(pages); +} + +#else +#define _pgd_alloc_node(nid) _pgd_alloc() +#endif /* CONFIG_KERNEL_REPLICATION */ #endif /* CONFIG_X86_PAE */ pgd_t *pgd_alloc(struct mm_struct *mm) { - pgd_t *pgd; + int nid; pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS]; pmd_t *pmds[MAX_PREALLOCATED_PMDS]; - pgd = _pgd_alloc(); - - if (pgd == NULL) - goto out; - - mm->pgd = pgd; + for_each_replica(nid) { + per_numa_pgd(mm, nid) = _pgd_alloc_node(nid); + if (per_numa_pgd(mm, nid) == NULL) + goto out_free_pgd; + } if (sizeof(pmds) != 0 && preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0) @@ -449,16 +465,22 @@ pgd_t *pgd_alloc(struct mm_struct *mm) */ spin_lock(&pgd_lock); - pgd_ctor(mm, pgd); - if (sizeof(pmds) != 0) - pgd_prepopulate_pmd(mm, pgd, pmds); + for_each_replica(nid) { + pgd_ctor(mm, nid); + if (sizeof(pmds) != 0) + pgd_prepopulate_pmd(mm, per_numa_pgd(mm, nid), pmds); - if (sizeof(u_pmds) != 0) - pgd_prepopulate_user_pmd(mm, pgd, u_pmds); + if (sizeof(u_pmds) != 0) + pgd_prepopulate_user_pmd(mm, per_numa_pgd(mm, nid), u_pmds); + } + + for_each_online_node(nid) { + per_numa_pgd(mm, nid) = per_numa_pgd(mm, numa_closest_memory_node(nid)); + } spin_unlock(&pgd_lock); - return pgd; + return mm->pgd; out_free_user_pmds: if (sizeof(u_pmds) != 0) @@ -467,17 +489,25 @@ pgd_t *pgd_alloc(struct mm_struct *mm) if (sizeof(pmds) != 0) free_pmds(mm, pmds, PREALLOCATED_PMDS); out_free_pgd: - _pgd_free(pgd); -out: + for_each_replica(nid) { + if (per_numa_pgd(mm, nid) != NULL) + _pgd_free(per_numa_pgd(mm, nid)); + } return NULL; } void pgd_free(struct mm_struct *mm, pgd_t *pgd) { + int nid; + pgd_mop_up_pmds(mm, pgd); - pgd_dtor(pgd); - paravirt_pgd_free(mm, pgd); - _pgd_free(pgd); + for_each_replica(nid) { + pgd_t *pgd_numa = per_numa_pgd(mm, nid); + + pgd_dtor(pgd_numa); + paravirt_pgd_free(mm, pgd_numa); + _pgd_free(pgd_numa); + } } /* diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 267acf27480a..de0e57827f98 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -491,6 +492,22 @@ void cr4_update_pce(void *ignored) static inline void cr4_update_pce_mm(struct mm_struct *mm) { } #endif +#ifdef CONFIG_KERNEL_REPLICATION +extern struct mm_struct *poking_mm; +static pgd_t *get_next_pgd(struct mm_struct *next) +{ + if (next == poking_mm) + return next->pgd; + else + return next->pgd_numa[numa_node_id()]; +} +#else +static pgd_t *get_next_pgd(struct mm_struct *next) +{ + return next->pgd; +} +#endif /*CONFIG_KERNEL_REPLICATION*/ + void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { @@ -502,6 +519,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, u64 next_tlb_gen; bool 
need_flush; u16 new_asid; + pgd_t *next_pgd; /* * NB: The scheduler will call us with prev == next when switching @@ -636,15 +654,17 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, } set_tlbstate_lam_mode(next); + + next_pgd = get_next_pgd(next); if (need_flush) { this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); - load_new_mm_cr3(next->pgd, new_asid, new_lam, true); + load_new_mm_cr3(next_pgd, new_asid, new_lam, true); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } else { /* The new ASID is already up to date. */ - load_new_mm_cr3(next->pgd, new_asid, new_lam, false); + load_new_mm_cr3(next_pgd, new_asid, new_lam, false); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0); } @@ -703,7 +723,7 @@ void initialize_tlbstate_and_flush(void) unsigned long cr3 = __read_cr3(); /* Assert that CR3 already references the right mm. */ - WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd)); + WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(per_numa_pgd(mm, numa_node_id()))); /* LAM expected to be disabled */ WARN_ON(cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57)); @@ -718,7 +738,7 @@ void initialize_tlbstate_and_flush(void) !(cr4_read_shadow() & X86_CR4_PCIDE)); /* Disable LAM, force ASID 0 and force a TLB flush. */ - write_cr3(build_cr3(mm->pgd, 0, 0)); + write_cr3(build_cr3(per_numa_pgd(mm, numa_node_id()), 0, 0)); /* Reinitialize tlbstate. */ this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_INIT); @@ -1091,7 +1111,7 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end) unsigned long __get_current_cr3_fast(void) { unsigned long cr3 = - build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd, + build_cr3(per_numa_pgd(this_cpu_read(cpu_tlbstate.loaded_mm), numa_node_id()), this_cpu_read(cpu_tlbstate.loaded_mm_asid), tlbstate_lam_cr3_mask()); diff --git a/init/main.c b/init/main.c index ad920fac325c..98c4a908ac13 100644 --- a/init/main.c +++ b/init/main.c @@ -99,6 +99,7 @@ #include #include #include +#include #include #include @@ -921,11 +922,13 @@ void start_kernel(void) * These use large bootmem allocations and must precede * initalization of page allocator */ + numa_reserve_memory(); setup_log_buf(0); vfs_caches_init_early(); sort_main_extable(); trap_init(); mm_core_init(); + numa_replicate_kernel(); poking_init(); ftrace_init(); @@ -1446,6 +1449,8 @@ static int __ref kernel_init(void *unused) free_initmem(); mark_readonly(); + numa_replicate_kernel_rodata(); + numa_clear_linear_addresses(); /* * Kernel mappings are now finalized - update the userspace page-table * to finalize PTI. 
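For readability, here is a rough sketch of the two helpers the patch above leans on. Their real definitions come from the earlier mm patches of this series, so everything below is an assumption inferred from how they are used: per_numa_pgd() must expand to an lvalue, since pgd_alloc() assigns through it, and an mm_struct pgd_numa[] array is visible in the arch/x86/mm/tlb.c hunk above (next->pgd_numa[numa_node_id()]).

/* Assumed shape, illustration only -- not the series' actual definitions. */
#define per_numa_pgd(mm, nid)	((mm)->pgd_numa[nid])

/*
 * Presumably walks the NUMA nodes that own a kernel replica; shown here
 * as the memory-bearing nodes, the real iterator may be narrower.
 */
#define for_each_replica(nid)	for_each_node_state(nid, N_MEMORY)

Under that reading, get_next_pgd() in switch_mm_irqs_off() simply picks the replica belonging to the current CPU's node before it is loaded into CR3, and numa_setup_pgd() in start_secondary() presumably does the same for a freshly booted CPU.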
From patchwork Thu Dec 28 13:10:52 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Artem Kuzin X-Patchwork-Id: 13506049 Received: from frasgout.his.huawei.com (frasgout.his.huawei.com [185.176.79.56]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 86A0679C8 for ; Thu, 28 Dec 2023 13:12:57 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=huawei.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=huawei.com Received: from mail.maildlp.com (unknown [172.18.186.231]) by frasgout.his.huawei.com (SkyGuard) with ESMTP id 4T185B2nk5z67ncD; Thu, 28 Dec 2023 21:10:38 +0800 (CST) Received: from lhrpeml500001.china.huawei.com (unknown [7.191.163.213]) by mail.maildlp.com (Postfix) with ESMTPS id 3BF711400F4; Thu, 28 Dec 2023 21:12:55 +0800 (CST) Received: from mscphis00060.huawei.com (10.123.65.147) by lhrpeml500001.china.huawei.com (7.191.163.213) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2507.35; Thu, 28 Dec 2023 13:12:52 +0000 From: To: , , , , , , , , , , , , , CC: , , , , , , , , , , Subject: [PATCH RFC 08/12] x86: make kernel text patching aware about replicas Date: Thu, 28 Dec 2023 21:10:52 +0800 Message-ID: <20231228131056.602411-9-artem.kuzin@huawei.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20231228131056.602411-1-artem.kuzin@huawei.com> References: <20231228131056.602411-1-artem.kuzin@huawei.com> Precedence: bulk X-Mailing-List: linux-modules@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-ClientProxiedBy: mscpeml500004.china.huawei.com (7.188.26.250) To lhrpeml500001.china.huawei.com (7.191.163.213) From: Artem Kuzin Co-developed-by: Nikita Panov Signed-off-by: Nikita Panov Co-developed-by: Alexander Grubnikov Signed-off-by: Alexander Grubnikov Signed-off-by: Artem Kuzin --- arch/x86/kernel/alternative.c | 116 ++++++++++++++++++---------------- 1 file changed, 62 insertions(+), 54 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 44843a492e69..b0abd60bcafe 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -1659,6 +1660,7 @@ void __init_or_module text_poke_early(void *addr, const void *opcode, size_t len) { unsigned long flags; + int nid; if (boot_cpu_has(X86_FEATURE_NX) && is_module_text_address((unsigned long)addr)) { @@ -1669,8 +1671,18 @@ void __init_or_module text_poke_early(void *addr, const void *opcode, */ memcpy(addr, opcode, len); } else { + unsigned long iaddr = (unsigned long)addr; + local_irq_save(flags); - memcpy(addr, opcode, len); + if (is_text_replicated() && is_kernel_text(iaddr)) { + for_each_replica(nid) { + void *vaddr = numa_addr_in_replica(addr, nid); + + memcpy(vaddr, opcode, len); + } + } else { + memcpy(addr, opcode, len); + } local_irq_restore(flags); sync_core(); @@ -1764,36 +1776,21 @@ typedef void text_poke_f(void *dst, const void *src, size_t len); static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len) { + int nid; bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE; struct page *pages[2] = {NULL}; temp_mm_state_t prev; unsigned long flags; + int size_in_poking_mm = PAGE_SIZE; pte_t pte, *ptep; spinlock_t 
*ptl; pgprot_t pgprot; - + bool has_replica = numa_addr_has_replica(addr); /* * While boot memory allocator is running we cannot use struct pages as * they are not yet initialized. There is no way to recover. */ BUG_ON(!after_bootmem); - - if (!core_kernel_text((unsigned long)addr)) { - pages[0] = vmalloc_to_page(addr); - if (cross_page_boundary) - pages[1] = vmalloc_to_page(addr + PAGE_SIZE); - } else { - pages[0] = virt_to_page(addr); - WARN_ON(!PageReserved(pages[0])); - if (cross_page_boundary) - pages[1] = virt_to_page(addr + PAGE_SIZE); - } - /* - * If something went wrong, crash and burn since recovery paths are not - * implemented. - */ - BUG_ON(!pages[0] || (cross_page_boundary && !pages[1])); - /* * Map the page without the global bit, as TLB flushing is done with * flush_tlb_mm_range(), which is intended for non-global PTEs. @@ -1812,48 +1809,59 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l local_irq_save(flags); - pte = mk_pte(pages[0], pgprot); - set_pte_at(poking_mm, poking_addr, ptep, pte); + for_each_replica(nid) { + prev = use_temporary_mm(poking_mm); - if (cross_page_boundary) { - pte = mk_pte(pages[1], pgprot); - set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte); - } + pages[0] = walk_to_page_node(nid, addr); + if (cross_page_boundary) + pages[1] = walk_to_page_node(nid, addr + PAGE_SIZE); - /* - * Loading the temporary mm behaves as a compiler barrier, which - * guarantees that the PTE will be set at the time memcpy() is done. - */ - prev = use_temporary_mm(poking_mm); + BUG_ON(!pages[0] || (cross_page_boundary && !pages[1])); - kasan_disable_current(); - func((u8 *)poking_addr + offset_in_page(addr), src, len); - kasan_enable_current(); + pte = mk_pte(pages[0], pgprot); + set_pte_at(poking_mm, poking_addr, ptep, pte); - /* - * Ensure that the PTE is only cleared after the instructions of memcpy - * were issued by using a compiler barrier. - */ - barrier(); + if (cross_page_boundary) { + pte = mk_pte(pages[1], pgprot); + set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte); + } + /* + * Compiler barrier to ensure that PTE is set before func() + */ + barrier(); - pte_clear(poking_mm, poking_addr, ptep); - if (cross_page_boundary) - pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1); + kasan_disable_current(); + func((u8 *)poking_addr + offset_in_page(addr), src, len); + kasan_enable_current(); - /* - * Loading the previous page-table hierarchy requires a serializing - * instruction that already allows the core to see the updated version. - * Xen-PV is assumed to serialize execution in a similar manner. - */ - unuse_temporary_mm(prev); + /* + * Ensure that the PTE is only cleared after the instructions of memcpy + * were issued by using a compiler barrier. + */ + barrier(); - /* - * Flushing the TLB might involve IPIs, which would require enabled - * IRQs, but not if the mm is not used, as it is in this point. - */ - flush_tlb_mm_range(poking_mm, poking_addr, poking_addr + - (cross_page_boundary ? 2 : 1) * PAGE_SIZE, - PAGE_SHIFT, false); + pte_clear(poking_mm, poking_addr, ptep); + if (cross_page_boundary) + pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1); + + /* + * Loading the previous page-table hierarchy requires a serializing + * instruction that already allows the core to see the updated version. + * Xen-PV is assumed to serialize execution in a similar manner. 
+ */ + unuse_temporary_mm(prev); + + /* + * Flushing the TLB might involve IPIs, which would require enabled + * IRQs, but not if the mm is not used, as it is in this point. + */ + + flush_tlb_mm_range(poking_mm, poking_addr, poking_addr + + (cross_page_boundary ? 2 : 1) * size_in_poking_mm, + PAGE_SHIFT, false); + if (!has_replica) + break; + } if (func == text_poke_memcpy) { /* From patchwork Thu Dec 28 13:10:53 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Artem Kuzin X-Patchwork-Id: 13506050 Received: from frasgout.his.huawei.com (frasgout.his.huawei.com [185.176.79.56]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 85A2479C8 for ; Thu, 28 Dec 2023 13:12:59 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=huawei.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=huawei.com Received: from mail.maildlp.com (unknown [172.18.186.216]) by frasgout.his.huawei.com (SkyGuard) with ESMTP id 4T185D2r4Pz67kr9; Thu, 28 Dec 2023 21:10:40 +0800 (CST) Received: from lhrpeml500001.china.huawei.com (unknown [7.191.163.213]) by mail.maildlp.com (Postfix) with ESMTPS id 39BB2140DD5; Thu, 28 Dec 2023 21:12:57 +0800 (CST) Received: from mscphis00060.huawei.com (10.123.65.147) by lhrpeml500001.china.huawei.com (7.191.163.213) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2507.35; Thu, 28 Dec 2023 13:12:55 +0000 From: To: , , , , , , , , , , , , , CC: , , , , , , , , , , Subject: [PATCH RFC 09/12] x86: add support of NUMA replication for efi page tables Date: Thu, 28 Dec 2023 21:10:53 +0800 Message-ID: <20231228131056.602411-10-artem.kuzin@huawei.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20231228131056.602411-1-artem.kuzin@huawei.com> References: <20231228131056.602411-1-artem.kuzin@huawei.com> Precedence: bulk X-Mailing-List: linux-modules@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-ClientProxiedBy: mscpeml500004.china.huawei.com (7.188.26.250) To lhrpeml500001.china.huawei.com (7.191.163.213) From: Artem Kuzin Co-developed-by: Nikita Panov Signed-off-by: Nikita Panov Co-developed-by: Alexander Grubnikov Signed-off-by: Alexander Grubnikov Signed-off-by: Artem Kuzin --- arch/x86/platform/efi/efi_64.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 77f7ac3668cb..986d2dddef7a 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -71,6 +72,9 @@ int __init efi_alloc_page_tables(void) p4d_t *p4d; pud_t *pud; gfp_t gfp_mask; +#ifdef CONFIG_KERNEL_REPLICATION + int nid; +#endif gfp_mask = GFP_KERNEL | __GFP_ZERO; efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER); @@ -86,7 +90,12 @@ int __init efi_alloc_page_tables(void) if (!pud) goto free_p4d; +#ifdef CONFIG_KERNEL_REPLICATION + for_each_online_node(nid) + per_numa_pgd(&efi_mm, nid) = efi_pgd; +#else efi_mm.pgd = efi_pgd; +#endif mm_init_cpumask(&efi_mm); init_new_context(NULL, &efi_mm); From patchwork Thu Dec 28 13:10:54 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Artem Kuzin X-Patchwork-Id: 13506051 Received: from 
frasgout.his.huawei.com (frasgout.his.huawei.com [185.176.79.56]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 4547379FE for ; Thu, 28 Dec 2023 13:13:01 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=huawei.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=huawei.com Received: from mail.maildlp.com (unknown [172.18.186.31]) by frasgout.his.huawei.com (SkyGuard) with ESMTP id 4T186b6pLdz6K5sM; Thu, 28 Dec 2023 21:11:51 +0800 (CST) Received: from lhrpeml500001.china.huawei.com (unknown [7.191.163.213]) by mail.maildlp.com (Postfix) with ESMTPS id 0531A141928; Thu, 28 Dec 2023 21:12:59 +0800 (CST) Received: from mscphis00060.huawei.com (10.123.65.147) by lhrpeml500001.china.huawei.com (7.191.163.213) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2507.35; Thu, 28 Dec 2023 13:12:57 +0000 From: To: , , , , , , , , , , , , , CC: , , , , , , , , , , Subject: [PATCH RFC 10/12] mm: add replicas allocation support for vmalloc Date: Thu, 28 Dec 2023 21:10:54 +0800 Message-ID: <20231228131056.602411-11-artem.kuzin@huawei.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20231228131056.602411-1-artem.kuzin@huawei.com> References: <20231228131056.602411-1-artem.kuzin@huawei.com> Precedence: bulk X-Mailing-List: linux-modules@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-ClientProxiedBy: mscpeml500004.china.huawei.com (7.188.26.250) To lhrpeml500001.china.huawei.com (7.191.163.213) From: Artem Kuzin Co-developed-by: Nikita Panov Signed-off-by: Nikita Panov Co-developed-by: Alexander Grubnikov Signed-off-by: Alexander Grubnikov Signed-off-by: Artem Kuzin --- include/linux/vmalloc.h | 24 ++ mm/vmalloc.c | 469 +++++++++++++++++++++++++++++++--------- 2 files changed, 396 insertions(+), 97 deletions(-) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index c720be70c8dd..61496ac316e0 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -9,6 +9,7 @@ #include /* pgprot_t */ #include #include +#include #include @@ -29,6 +30,10 @@ struct iov_iter; /* in uio.h */ #define VM_MAP_PUT_PAGES 0x00000200 /* put pages and free array in vfree */ #define VM_ALLOW_HUGE_VMAP 0x00000400 /* Allow for huge pages on archs with HAVE_ARCH_HUGE_VMALLOC */ +#ifdef CONFIG_KERNEL_REPLICATION +#define VM_NUMA_SHARED 0x00000800 /* Pages shared between per-NUMA node TT*/ +#endif + #if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \ !defined(CONFIG_KASAN_VMALLOC) #define VM_DEFER_KMEMLEAK 0x00000800 /* defer kmemleak object creation */ @@ -54,6 +59,10 @@ struct vm_struct { struct page **pages; #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC unsigned int page_order; +#endif +#ifdef CONFIG_KERNEL_REPLICATION + int node; + bool replicated; #endif unsigned int nr_pages; phys_addr_t phys_addr; @@ -149,6 +158,16 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller) __alloc_size(1); + +#ifdef CONFIG_KERNEL_REPLICATION + /* + * DO NOT USE this function if you don't understand what it is doing + * Use only in pair with __vmalloc_numa_shared_replicated_start() + */ +int __vmalloc_node_replicate_range(const void *addr, gfp_t gfp_mask, + pgprot_t prot, unsigned long vm_flags); 
+#endif + void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, int node, const void *caller) __alloc_size(1); void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __alloc_size(1); @@ -233,6 +252,11 @@ static inline bool is_vm_area_hugepages(const void *addr) #ifdef CONFIG_MMU void vunmap_range(unsigned long addr, unsigned long end); +#ifdef CONFIG_KERNEL_REPLICATION +void numa_vunmap_range(unsigned long addr, unsigned long end); +#else +#define numa_vunmap_range vunmap_range +#endif static inline void set_vm_flush_reset_perms(void *addr) { struct vm_struct *vm = find_vm_area(addr); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ef8599d394fd..3f57137c4173 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -439,6 +440,29 @@ void vunmap_range_noflush(unsigned long start, unsigned long end) __vunmap_range_noflush(start, end); } +void vunmap_range_noflush_pgd(unsigned long start, unsigned long end, + pgd_t *pagetable) +{ + unsigned long next; + pgd_t *pgd; + unsigned long addr = start; + pgtbl_mod_mask mask = 0; + + BUG_ON(addr >= end); + pgd = pgd_offset_pgd(pagetable, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_bad(*pgd)) + mask |= PGTBL_PGD_MODIFIED; + if (pgd_none_or_clear_bad(pgd)) + continue; + vunmap_p4d_range(pgd, addr, next, &mask); + } while (pgd++, addr = next, addr != end); + + if (mask & ARCH_PAGE_TABLE_SYNC_MASK) + arch_sync_kernel_mappings(start, end); +} + /** * vunmap_range - unmap kernel virtual addresses * @addr: start of the VM area to unmap @@ -455,6 +479,18 @@ void vunmap_range(unsigned long addr, unsigned long end) flush_tlb_kernel_range(addr, end); } +#ifdef CONFIG_KERNEL_REPLICATION +void numa_vunmap_range(unsigned long addr, unsigned long end) +{ + int node; + + flush_cache_vunmap(addr, end); + for_each_replica(node) + vunmap_range_noflush_pgd(addr, end, init_mm.pgd_numa[node]); + flush_tlb_kernel_range(addr, end); +} +#endif /*CONFIG_KERNEL_REPLICATION*/ + static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) @@ -540,7 +576,8 @@ static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr, return 0; } -static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, +static int vmap_small_pages_range_noflush_pgd(pgd_t *pagetable, + unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages) { unsigned long start = addr; @@ -551,7 +588,7 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, pgtbl_mod_mask mask = 0; BUG_ON(addr >= end); - pgd = pgd_offset_k(addr); + pgd = pgd_offset_pgd(pagetable, addr); do { next = pgd_addr_end(addr, end); if (pgd_bad(*pgd)) @@ -567,17 +604,40 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, return 0; } -/* - * vmap_pages_range_noflush is similar to vmap_pages_range, but does not - * flush caches. - * - * The caller is responsible for calling flush_cache_vmap() after this - * function returns successfully and before the addresses are accessed. - * - * This is an internal function only. Do not use outside mm/. 
- */ -int __vmap_pages_range_noflush(unsigned long addr, unsigned long end, - pgprot_t prot, struct page **pages, unsigned int page_shift) +static int vmap_range_noflush_pgd(pgd_t *pagetable, + unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) +{ + pgd_t *pgd; + unsigned long start; + unsigned long next; + int err; + pgtbl_mod_mask mask = 0; + + might_sleep(); + BUG_ON(addr >= end); + + start = addr; + pgd = pgd_offset_pgd(pagetable, addr); + do { + next = pgd_addr_end(addr, end); + err = vmap_p4d_range(pgd, addr, next, phys_addr, prot, + max_page_shift, &mask); + if (err) + break; + } while (pgd++, phys_addr += (next - addr), addr = next, addr != end); + + if (mask & ARCH_PAGE_TABLE_SYNC_MASK) + arch_sync_kernel_mappings(start, end); + + return err; +} + +static int vmap_pages_range_noflush_pgd(pgd_t *pagetable, + unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, + unsigned int page_shift) { unsigned int i, nr = (end - addr) >> PAGE_SHIFT; @@ -585,12 +645,13 @@ int __vmap_pages_range_noflush(unsigned long addr, unsigned long end, if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) || page_shift == PAGE_SHIFT) - return vmap_small_pages_range_noflush(addr, end, prot, pages); + return vmap_small_pages_range_noflush_pgd(pagetable, addr, end, + prot, pages); for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) { int err; - err = vmap_range_noflush(addr, addr + (1UL << page_shift), + err = vmap_range_noflush_pgd(pagetable, addr, addr + (1UL << page_shift), page_to_phys(pages[i]), prot, page_shift); if (err) @@ -610,7 +671,7 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end, if (ret) return ret; - return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift); + return vmap_pages_range_noflush_pgd(init_mm.pgd, addr, end, prot, pages, page_shift); } /** @@ -658,57 +719,12 @@ EXPORT_SYMBOL_GPL(is_vmalloc_or_module_addr); */ struct page *vmalloc_to_page(const void *vmalloc_addr) { - unsigned long addr = (unsigned long) vmalloc_addr; - struct page *page = NULL; - pgd_t *pgd = pgd_offset_k(addr); - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *ptep, pte; - /* * XXX we might need to change this if we add VIRTUAL_BUG_ON for * architectures that do not vmalloc module space */ VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr)); - - if (pgd_none(*pgd)) - return NULL; - if (WARN_ON_ONCE(pgd_leaf(*pgd))) - return NULL; /* XXX: no allowance for huge pgd */ - if (WARN_ON_ONCE(pgd_bad(*pgd))) - return NULL; - - p4d = p4d_offset(pgd, addr); - if (p4d_none(*p4d)) - return NULL; - if (p4d_leaf(*p4d)) - return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT); - if (WARN_ON_ONCE(p4d_bad(*p4d))) - return NULL; - - pud = pud_offset(p4d, addr); - if (pud_none(*pud)) - return NULL; - if (pud_leaf(*pud)) - return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - if (WARN_ON_ONCE(pud_bad(*pud))) - return NULL; - - pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) - return NULL; - if (pmd_leaf(*pmd)) - return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - if (WARN_ON_ONCE(pmd_bad(*pmd))) - return NULL; - - ptep = pte_offset_kernel(pmd, addr); - pte = ptep_get(ptep); - if (pte_present(pte)) - page = pte_page(pte); - - return page; + return walk_to_page_node(0, vmalloc_addr); } EXPORT_SYMBOL(vmalloc_to_page); @@ -1841,6 +1857,30 @@ static void free_vmap_area_noflush(struct vmap_area *va) schedule_work(&drain_vmap_work); } +/* + * Free and unmap a vmap area for every NUMA node + */ 
+#ifdef CONFIG_KERNEL_REPLICATION +static void free_unmap_vmap_area_per_pgd(struct vmap_area *va) +{ + int node; + + flush_cache_vunmap(va->va_start, va->va_end); + /** + * In some scenarios we might clear + * empty entries here, which is totally fine + */ + for_each_replica(node) + vunmap_range_noflush_pgd(va->va_start, va->va_end, + init_mm.pgd_numa[node]); + + if (debug_pagealloc_enabled_static()) + flush_tlb_kernel_range(va->va_start, va->va_end); + + free_vmap_area_noflush(va); +} +#endif /*CONFIG_KERNEL_REPLICATION*/ + /* * Free and unmap a vmap area */ @@ -2700,9 +2740,19 @@ struct vm_struct *remove_vm_area(const void *addr) debug_check_no_locks_freed(vm->addr, get_vm_area_size(vm)); debug_check_no_obj_freed(vm->addr, get_vm_area_size(vm)); kasan_free_module_shadow(vm); +#ifdef CONFIG_KERNEL_REPLICATION + if (numa_addr_has_replica(addr)) { + /* TODO kasan_poison_vmalloc_numa */ + free_unmap_vmap_area_per_pgd(va); + } else { + kasan_poison_vmalloc(vm->addr, get_vm_area_size(vm)); + free_unmap_vmap_area(va); + } +#else kasan_poison_vmalloc(vm->addr, get_vm_area_size(vm)); free_unmap_vmap_area(va); +#endif return vm; } @@ -2712,9 +2762,20 @@ static inline void set_area_direct_map(const struct vm_struct *area, int i; /* HUGE_VMALLOC passes small pages to set_direct_map */ - for (i = 0; i < area->nr_pages; i++) + for (i = 0; i < area->nr_pages; i++) { if (page_address(area->pages[i])) set_direct_map(area->pages[i]); +#ifdef CONFIG_KERNEL_REPLICATION + if (area->replicated) { + struct page *page; + + list_for_each_entry(page, &area->pages[i]->lru, lru) { + if (page_address(page)) + set_direct_map(page); + } + } +#endif + } } /* @@ -2742,8 +2803,24 @@ static void vm_reset_perms(struct vm_struct *area) end = max(addr + page_size, end); flush_dmap = 1; } +#ifdef CONFIG_KERNEL_REPLICATION + if (area->replicated) { + struct page *page; + + list_for_each_entry(page, &area->pages[i]->lru, lru) { + unsigned long addr = (unsigned long)page_address(area->pages[i]); + if (addr) { + unsigned long page_size; + + page_size = PAGE_SIZE << page_order; + start = min(addr, start); + end = max(addr + page_size, end); + flush_dmap = 1; + } + } + } +#endif } - /* * Set direct map to something invalid so that it won't be cached if * there are any accesses after the TLB flush, then flush the TLB and @@ -2832,8 +2909,36 @@ void vfree(const void *addr) vm_reset_perms(vm); for (i = 0; i < vm->nr_pages; i++) { struct page *page = vm->pages[i]; +#ifdef CONFIG_KERNEL_REPLICATION + if (vm->replicated) { + struct page *cursor, *tmp; + + BUG_ON(!page); + list_for_each_entry_safe(cursor, tmp, &vm->pages[i]->lru, lru) { + BUG_ON(!cursor); + + list_del(&cursor->lru); + mod_memcg_page_state(cursor, MEMCG_VMALLOC, -1); + /* + * High-order allocs for huge vmallocs are split, so + * can be freed as an array of order-0 allocations + */ + __free_pages(cursor, 0); + cond_resched(); + } + } +#endif + BUG_ON(!page); + + /* + * Cleanup lru entries used for replicas. During page allocation + * clear_page_pfmemalloc() is called, but just to be sure let's + * clear page->lru here. 
+ */ + page->lru.next = LIST_POISON1; + page->lru.prev = LIST_POISON2; + - BUG_ON(!page); mod_memcg_page_state(page, MEMCG_VMALLOC, -1); /* * High-order allocs for huge vmallocs are split, so @@ -3098,26 +3203,90 @@ vm_area_alloc_pages(gfp_t gfp, int nid, return nr_allocated; } +static int vmalloc_map_area_pages_node(unsigned long addr, + struct page **pages, unsigned long size, + gfp_t gfp_mask, pgprot_t prot, + unsigned int page_shift, int node) +{ + int ret = 0; + unsigned int flags; + bool nofail = gfp_mask & __GFP_NOFAIL; + + /* + * page tables allocations ignore external gfp mask, enforce it + * by the scope API + */ + if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) + flags = memalloc_nofs_save(); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) + flags = memalloc_noio_save(); + + do { + pgd_t *pgd = per_numa_pgd(&init_mm, node); + ret = vmap_pages_range_noflush_pgd(pgd, addr, addr + size, + prot, pages, page_shift); + if (nofail && (ret < 0)) + schedule_timeout_uninterruptible(1); + } while (nofail && (ret < 0)); + + if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) + memalloc_nofs_restore(flags); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) + memalloc_noio_restore(flags); + + if (ret < 0) { + warn_alloc(gfp_mask, NULL, + "vmalloc error: size %lu, failed to map pages", + size); + } + + return ret; +} + +static int vmalloc_map_area_pages(unsigned long addr, unsigned long size, + struct vm_struct *area, + gfp_t gfp_mask, pgprot_t prot, + unsigned int page_shift) +{ + int ret; + int node = 0; + +#ifdef CONFIG_KERNEL_REPLICATION + if (area->flags & VM_NUMA_SHARED) { + for_each_replica(node) { + ret = vmalloc_map_area_pages_node(addr, area->pages, size, + gfp_mask, prot, page_shift, node); + if (ret) + return ret; + } + } else { + ret = vmalloc_map_area_pages_node(addr, area->pages, size, + gfp_mask, prot, page_shift, node); + } +#else + ret = vmalloc_map_area_pages_node(addr, area->pages, size, + gfp_mask, prot, page_shift, node); +#endif + return ret; +} + static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, unsigned int page_shift, int node) { const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; - bool nofail = gfp_mask & __GFP_NOFAIL; unsigned long addr = (unsigned long)area->addr; unsigned long size = get_vm_area_size(area); unsigned long array_size; unsigned int nr_small_pages = size >> PAGE_SHIFT; unsigned int page_order; - unsigned int flags; - int ret; + int ret = 0; array_size = (unsigned long)nr_small_pages * sizeof(struct page *); if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) gfp_mask |= __GFP_HIGHMEM; - /* Please note that the recursion is strictly bounded. 
*/ if (array_size > PAGE_SIZE) { area->pages = __vmalloc_node(array_size, 1, nested_gfp, node, area->caller); @@ -3129,8 +3298,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, failed to allocated page array size %lu", nr_small_pages * PAGE_SIZE, array_size); - free_vm_area(area); - return NULL; + goto fail; } set_vm_area_page_order(area, page_shift - PAGE_SHIFT); @@ -3169,38 +3337,17 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, goto fail; } - /* - * page tables allocations ignore external gfp mask, enforce it - * by the scope API - */ - if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) - flags = memalloc_nofs_save(); - else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) - flags = memalloc_noio_save(); - - do { - ret = vmap_pages_range(addr, addr + size, prot, area->pages, - page_shift); - if (nofail && (ret < 0)) - schedule_timeout_uninterruptible(1); - } while (nofail && (ret < 0)); - - if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) - memalloc_nofs_restore(flags); - else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) - memalloc_noio_restore(flags); - - if (ret < 0) { - warn_alloc(gfp_mask, NULL, - "vmalloc error: size %lu, failed to map pages", - area->nr_pages * PAGE_SIZE); + ret = vmalloc_map_area_pages(addr, size, area, gfp_mask, prot, page_shift); + if (ret) goto fail; - } + + flush_cache_vmap(addr, addr + size); return area->addr; fail: vfree(area->addr); + return NULL; } @@ -3292,6 +3439,11 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, goto fail; } +#ifdef CONFIG_KERNEL_REPLICATION + if (numa_addr_has_replica(area->addr)) + vm_flags |= VM_NUMA_SHARED; + area->node = node; +#endif /* * Prepare arguments for __vmalloc_area_node() and * kasan_unpoison_vmalloc(). 
@@ -3385,6 +3537,129 @@ void *__vmalloc_node(unsigned long size, unsigned long align, return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, gfp_mask, PAGE_KERNEL, 0, node, caller); } + +#ifdef CONFIG_KERNEL_REPLICATION +static void numa_replicated_page_range(struct page **src, struct page **dst, int nr_pages) +{ + int i; + void *from, *to; + + for (i = 0; i < nr_pages; i++) { + from = kmap(src[i]); + to = kmap(dst[i]); + + copy_page(to, from); + + kunmap(src[i]); + kunmap(dst[i]); + } +} + +int __vmalloc_node_replicate_range(const void *addr, gfp_t gfp_mask, + pgprot_t prot, unsigned long vm_flags) +{ + int i, ret, node = 0; + struct vm_struct *area; + unsigned int page_order; + unsigned int nr_allocated; + struct page **pages; + unsigned long area_start, area_end; + const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; + unsigned long array_size; + + gfp_mask |= __GFP_NOWARN; + if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) + gfp_mask |= __GFP_HIGHMEM; + + if (unlikely(!numa_addr_has_replica(addr))) + return -EINVAL; + + area = find_vm_area(addr); + if (unlikely(!area)) + return -ENOENT; + + if (area->node == NUMA_NO_NODE) + return -EINVAL; + + array_size = sizeof(struct page *) * area->nr_pages; + if (array_size > PAGE_SIZE) + pages = __vmalloc(array_size, nested_gfp); + else + pages = kmalloc(array_size, nested_gfp); + + if (!pages) + return -ENOMEM; + + page_order = vm_area_page_order(area); + for (i = 0; i < area->nr_pages; i++) + INIT_LIST_HEAD(&area->pages[i]->lru); + + area_start = (unsigned long)area->addr; + area_end = (unsigned long)(area->addr + area->nr_pages * PAGE_SIZE); + + for_each_replica(node) { + if (area->node == node) + continue; + + nr_allocated = vm_area_alloc_pages(gfp_mask | __GFP_NOWARN, + node, page_order, area->nr_pages, pages); + if (nr_allocated != area->nr_pages) + goto fail_alloc_pages; + + for (i = 0; i < area->nr_pages; i++) + list_add(&pages[i]->lru, &area->pages[i]->lru); + + vunmap_range_noflush_pgd(area_start, area_end, + init_mm.pgd_numa[node]); + + /* + * We can't fail here (hopefully) + * Possible errors: not enough memory for tables and not empty entries. + * Both unrealistic because we just cleared entries in existed tables. + */ + ret = vmalloc_map_area_pages_node(area_start, pages, + nr_allocated * PAGE_SIZE, + gfp_mask, prot, PAGE_SHIFT, + node); + if (ret != 0) + goto fail_map_pages; + + atomic_long_add(nr_allocated, &nr_vmalloc_pages); + if (gfp_mask & __GFP_ACCOUNT) { + for (i = 0; i < nr_allocated; i++) + mod_memcg_page_state(pages[i], MEMCG_VMALLOC, 1); + } + numa_replicated_page_range(area->pages, pages, area->nr_pages); + + for (i = 0; i < area->nr_pages; i++) + pages[i] = NULL; + } + kvfree(pages); + + flush_tlb_kernel_range(area_start, area_end); + area->replicated = true; + + return 0; +fail_alloc_pages: + for (i = 0; i < nr_allocated; i++) + __free_pages(pages[i], 0); + +fail_map_pages: + kfree(pages); + for (i = 0; i < area->nr_pages; i++) { + struct page *page, *tmp; + + list_for_each_entry_safe(page, tmp, &area->pages[i]->lru, lru) { + list_del(&page->lru); + mod_memcg_page_state(page, MEMCG_VMALLOC, -1); + __free_pages(page, 0); + } + } + + return ret; +} +#endif /*CONFIG_KERNEL_REPLICATION*/ + /* * This is only for performance analysis of vmalloc and stress purpose. 
* It is required by vmalloc test module, therefore do not use it other From patchwork Thu Dec 28 13:10:55 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Artem Kuzin X-Patchwork-Id: 13506052 Received: from frasgout.his.huawei.com (frasgout.his.huawei.com [185.176.79.56]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 5F0738480 for ; Thu, 28 Dec 2023 13:13:02 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=huawei.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=huawei.com Received: from mail.maildlp.com (unknown [172.18.186.231]) by frasgout.his.huawei.com (SkyGuard) with ESMTP id 4T185x5mvkz6J9Zb; Thu, 28 Dec 2023 21:11:17 +0800 (CST) Received: from lhrpeml500001.china.huawei.com (unknown [7.191.163.213]) by mail.maildlp.com (Postfix) with ESMTPS id 83DD7140B67; Thu, 28 Dec 2023 21:13:00 +0800 (CST) Received: from mscphis00060.huawei.com (10.123.65.147) by lhrpeml500001.china.huawei.com (7.191.163.213) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2507.35; Thu, 28 Dec 2023 13:12:58 +0000 From: To: , , , , , , , , , , , , , CC: , , , , , , , , , , Subject: [PATCH RFC 11/12] x86: add kernel modules text and rodata replication support Date: Thu, 28 Dec 2023 21:10:55 +0800 Message-ID: <20231228131056.602411-12-artem.kuzin@huawei.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20231228131056.602411-1-artem.kuzin@huawei.com> References: <20231228131056.602411-1-artem.kuzin@huawei.com> Precedence: bulk X-Mailing-List: linux-modules@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-ClientProxiedBy: mscpeml500004.china.huawei.com (7.188.26.250) To lhrpeml500001.china.huawei.com (7.191.163.213) From: Artem Kuzin Co-developed-by: Nikita Panov Signed-off-by: Nikita Panov Co-developed-by: Alexander Grubnikov Signed-off-by: Alexander Grubnikov Signed-off-by: Artem Kuzin --- arch/x86/kernel/module.c | 35 ++++++++++++++++++++++++++++++----- include/linux/moduleloader.h | 10 ++++++++++ kernel/module/main.c | 8 ++++++++ kernel/module/strict_rwx.c | 14 +++++++------- 4 files changed, 55 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 5f71a0cf4399..6d74d8c33293 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -65,28 +65,53 @@ static unsigned long int get_module_load_offset(void) } #endif -void *module_alloc(unsigned long size) +static void *__module_alloc(unsigned long size, unsigned long vm_flags, int nid) { gfp_t gfp_mask = GFP_KERNEL; void *p; if (PAGE_ALIGN(size) > MODULES_LEN) return NULL; - + /* + * In case of replicas, vmalloc should be able to unmap/reclaim them + * somehow. Due to this fact it is necessary to account such pages + * separately. The __vmalloc_not_replicated_per_pgd_range() function + * performs this accounting using an internal vmalloc buffer with size + * equal to nr_pages * nr_online_nodes.
+ */ p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR + get_module_load_offset(), MODULES_END, gfp_mask, PAGE_KERNEL, - VM_FLUSH_RESET_PERMS | VM_DEFER_KMEMLEAK, - NUMA_NO_NODE, __builtin_return_address(0)); + VM_FLUSH_RESET_PERMS | VM_DEFER_KMEMLEAK | vm_flags, + nid, __builtin_return_address(0)); if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { vfree(p); return NULL; } - return p; } +#ifdef CONFIG_KERNEL_REPLICATION +void *module_alloc(unsigned long size) +{ + return __module_alloc(size, VM_NUMA_SHARED, 0); +} + +void module_replicate_numa(void *ptr) +{ + gfp_t gfp_mask = GFP_KERNEL; + + __vmalloc_node_replicate_range(ptr, gfp_mask, + PAGE_KERNEL, VM_DEFER_KMEMLEAK); +} +#else +void *module_alloc(unsigned long size) +{ + return __module_alloc(size, 0, NUMA_NO_NODE); +} +#endif /*CONFIG_KERNEL_REPLICATION*/ + #ifdef CONFIG_X86_32 int apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index 001b2ce83832..722016d36bda 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -29,6 +29,16 @@ unsigned int arch_mod_section_prepend(struct module *mod, unsigned int section); sections. Returns NULL on failure. */ void *module_alloc(unsigned long size); +#ifndef CONFIG_KERNEL_REPLICATION +static inline void module_replicate_numa(void *ptr) +{ + (void) ptr; +} +#else +/* Replicate memory allocated in previous function*/ +void module_replicate_numa(void *ptr); +#endif /* CONFIG_KERNEL_REPLICATION */ + /* Free memory returned from module_alloc. */ void module_memfree(void *module_region); diff --git a/kernel/module/main.c b/kernel/module/main.c index 98fedfdb8db5..2ece8a7743de 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -2718,6 +2718,12 @@ static int add_unformed_module(struct module *mod) return err; } +static void module_replicate_rodata(struct module *mod) +{ + module_replicate_numa(mod->mem[MOD_TEXT].base); + module_replicate_numa(mod->mem[MOD_RODATA].base); +} + static int complete_formation(struct module *mod, struct load_info *info) { int err; @@ -2733,6 +2739,8 @@ static int complete_formation(struct module *mod, struct load_info *info) module_bug_finalize(info->hdr, info->sechdrs, mod); module_cfi_finalize(info->hdr, info->sechdrs, mod); + module_replicate_rodata(mod); + module_enable_ro(mod, false); module_enable_nx(mod); module_enable_x(mod); diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c index a2b656b4e3d2..23abb3b0520b 100644 --- a/kernel/module/strict_rwx.c +++ b/kernel/module/strict_rwx.c @@ -29,7 +29,7 @@ static void module_set_memory(const struct module *mod, enum mod_mem_type type, void module_enable_x(const struct module *mod) { for_class_mod_mem_type(type, text) - module_set_memory(mod, type, set_memory_x); + module_set_memory(mod, type, numa_set_memory_x); } void module_enable_ro(const struct module *mod, bool after_init) @@ -41,13 +41,13 @@ void module_enable_ro(const struct module *mod, bool after_init) return; #endif - module_set_memory(mod, MOD_TEXT, set_memory_ro); - module_set_memory(mod, MOD_INIT_TEXT, set_memory_ro); - module_set_memory(mod, MOD_RODATA, set_memory_ro); - module_set_memory(mod, MOD_INIT_RODATA, set_memory_ro); + module_set_memory(mod, MOD_TEXT, numa_set_memory_ro); + module_set_memory(mod, MOD_INIT_TEXT, numa_set_memory_ro); + module_set_memory(mod, MOD_RODATA, numa_set_memory_ro); + module_set_memory(mod, MOD_INIT_RODATA, numa_set_memory_ro); if (after_init) - module_set_memory(mod, 
MOD_RO_AFTER_INIT, set_memory_ro); + module_set_memory(mod, MOD_RO_AFTER_INIT, numa_set_memory_ro); } void module_enable_nx(const struct module *mod) @@ -56,7 +56,7 @@ void module_enable_nx(const struct module *mod) return; for_class_mod_mem_type(type, data) - module_set_memory(mod, type, set_memory_nx); + module_set_memory(mod, type, numa_set_memory_nx); } int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, From patchwork Thu Dec 28 13:10:56 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Artem Kuzin X-Patchwork-Id: 13506053 Received: from frasgout.his.huawei.com (frasgout.his.huawei.com [185.176.79.56]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 42FC879EE for ; Thu, 28 Dec 2023 13:13:04 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=huawei.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=huawei.com Received: from mail.maildlp.com (unknown [172.18.186.31]) by frasgout.his.huawei.com (SkyGuard) with ESMTP id 4T185z3ybJz6J9xD; Thu, 28 Dec 2023 21:11:19 +0800 (CST) Received: from lhrpeml500001.china.huawei.com (unknown [7.191.163.213]) by mail.maildlp.com (Postfix) with ESMTPS id 4259E1400D2; Thu, 28 Dec 2023 21:13:02 +0800 (CST) Received: from mscphis00060.huawei.com (10.123.65.147) by lhrpeml500001.china.huawei.com (7.191.163.213) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2507.35; Thu, 28 Dec 2023 13:13:00 +0000 From: To: , , , , , , , , , , , , , CC: , , , , , , , , , , Subject: [PATCH RFC 12/12] mm: set memory permissions for BPF handlers replicas Date: Thu, 28 Dec 2023 21:10:56 +0800 Message-ID: <20231228131056.602411-13-artem.kuzin@huawei.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20231228131056.602411-1-artem.kuzin@huawei.com> References: <20231228131056.602411-1-artem.kuzin@huawei.com> Precedence: bulk X-Mailing-List: linux-modules@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-ClientProxiedBy: mscpeml500004.china.huawei.com (7.188.26.250) To lhrpeml500001.china.huawei.com (7.191.163.213) From: Artem Kuzin Co-developed-by: Nikita Panov Signed-off-by: Nikita Panov Co-developed-by: Alexander Grubnikov Signed-off-by: Alexander Grubnikov Signed-off-by: Artem Kuzin --- kernel/bpf/bpf_struct_ops.c | 8 ++++---- kernel/bpf/core.c | 4 ++-- kernel/bpf/trampoline.c | 6 +++--- net/bpf/bpf_dummy_struct_ops.c | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 116a0ce378ec..9fb1dc5fbd5c 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -512,7 +512,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, err = st_ops->validate(kdata); if (err) goto reset_unlock; - set_memory_rox((long)st_map->image, 1); + numa_set_memory_rox((long)st_map->image, 1); /* Let bpf_link handle registration & unregistration. * * Pair with smp_load_acquire() during lookup_elem(). 
@@ -521,7 +521,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, goto unlock; } - set_memory_rox((long)st_map->image, 1); + numa_set_memory_rox((long)st_map->image, 1); err = st_ops->reg(kdata); if (likely(!err)) { /* This refcnt increment on the map here after @@ -544,8 +544,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, * there was a race in registering the struct_ops (under the same name) to * a sub-system through different struct_ops's maps. */ - set_memory_nx((long)st_map->image, 1); - set_memory_rw((long)st_map->image, 1); + numa_set_memory_nx((long)st_map->image, 1); + numa_set_memory_rw((long)st_map->image, 1); reset_unlock: bpf_struct_ops_map_put_progs(st_map); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index e3e45b651cd4..73ebda57c0f5 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -870,7 +870,7 @@ static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_ins list_add_tail(&pack->list, &pack_list); set_vm_flush_reset_perms(pack->ptr); - set_memory_rox((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); + numa_set_memory_rox((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); return pack; } @@ -888,7 +888,7 @@ void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns) if (ptr) { bpf_fill_ill_insns(ptr, size); set_vm_flush_reset_perms(ptr); - set_memory_rox((unsigned long)ptr, size / PAGE_SIZE); + numa_set_memory_rox((unsigned long)ptr, size / PAGE_SIZE); } goto out; } diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 53ff50cac61e..964ae6128ef7 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -444,7 +444,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut if (err < 0) goto out_free; - set_memory_rox((long)im->image, 1); + numa_set_memory_rox((long)im->image, 1); WARN_ON(tr->cur_image && total == 0); if (tr->cur_image) @@ -465,8 +465,8 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut tr->fops->trampoline = 0; /* reset im->image memory attr for arch_prepare_bpf_trampoline */ - set_memory_nx((long)im->image, 1); - set_memory_rw((long)im->image, 1); + numa_set_memory_nx((long)im->image, 1); + numa_set_memory_rw((long)im->image, 1); goto again; } #endif diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c index 5918d1b32e19..45a5dbd379ac 100644 --- a/net/bpf/bpf_dummy_struct_ops.c +++ b/net/bpf/bpf_dummy_struct_ops.c @@ -124,7 +124,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, if (err < 0) goto out; - set_memory_rox((long)image, 1); + numa_set_memory_rox((long)image, 1); prog_ret = dummy_ops_call_op(image, args); err = dummy_ops_copy_args(args);