--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -22,6 +22,18 @@ struct page;
extern struct range pfn_mapped[];
extern int nr_pfn_mapped;
+#ifdef __HAVE_ARCH_CLEAR_USER_PAGES /* x86_64 */
+
+#define clear_page(page) clear_pages(page, 1)
+
+static inline void clear_user_pages(void *page, unsigned long vaddr,
+				    struct page *pg, unsigned int npages)
+{
+ clear_pages(page, npages);
+}
+
+#endif /* __HAVE_ARCH_CLEAR_USER_PAGES */
+
static inline void clear_user_page(void *page, unsigned long vaddr,
struct page *pg)
{
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -41,16 +41,28 @@ extern unsigned long __phys_addr_symbol(unsigned long);
#define pfn_valid(pfn) ((pfn) < max_pfn)
#endif
-void clear_page_orig(void *page);
-void clear_page_rep(void *page);
-void clear_page_erms(void *page);
+/*
+ * Clear in chunks of 256 pages/1024KB.
+ *
+ * Assuming a clearing BW of 3b/cyc (recent generation processors have
+ * more), this amounts to around 400K cycles for each chunk.
+ *
+ * With a cpufreq of ~2.5GHz, this amounts to ~160us for each chunk
+ * (which would also be the interval between calls to cond_resched().)
+ */
+#define ARCH_MAX_CLEAR_PAGES_ORDER 8
-static inline void clear_page(void *page)
+void clear_pages_orig(void *page, unsigned long npages);
+void clear_pages_rep(void *page, unsigned long npages);
+void clear_pages_erms(void *page, unsigned long npages);
+
+#define __HAVE_ARCH_CLEAR_USER_PAGES
+static inline void clear_pages(void *page, unsigned int npages)
{
- alternative_call_2(clear_page_orig,
- clear_page_rep, X86_FEATURE_REP_GOOD,
- clear_page_erms, X86_FEATURE_ERMS,
- "=D" (page),
+ alternative_call_2(clear_pages_orig,
+ clear_pages_rep, X86_FEATURE_REP_GOOD,
+ clear_pages_erms, X86_FEATURE_ERMS,
+ "=D" (page), "S" ((unsigned long) npages),
"0" (page)
: "cc", "memory", "rax", "rcx");
}
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0-only */
#include <linux/linkage.h>
#include <asm/export.h>
+#include <asm/page_types.h>
/*
* Most CPUs support enhanced REP MOVSB/STOSB instructions. It is
@@ -10,23 +11,29 @@
*/
/*
- * Zero a page.
- * %rdi - page
+ * Zero pages.
+ * %rdi - base page
+ * %rsi - number of pages
+ *
+ * Note: clear_pages_*() have differing alignment restrictions,
+ * but callers are always expected to page-align.
*/
-SYM_FUNC_START(clear_page_rep)
- movl $4096/8,%ecx
+SYM_FUNC_START(clear_pages_rep)
+ movq %rsi,%rcx
+ shlq $(PAGE_SHIFT - 3),%rcx
xorl %eax,%eax
rep stosq
RET
-SYM_FUNC_END(clear_page_rep)
-EXPORT_SYMBOL_GPL(clear_page_rep)
+SYM_FUNC_END(clear_pages_rep)
+EXPORT_SYMBOL_GPL(clear_pages_rep)
-SYM_FUNC_START(clear_page_orig)
+SYM_FUNC_START(clear_pages_orig)
xorl %eax,%eax
- movl $4096/64,%ecx
+ movq %rsi,%rcx
+ shlq $(PAGE_SHIFT - 6),%rcx
.p2align 4
.Lloop:
- decl %ecx
+ decq %rcx
#define PUT(x) movq %rax,x*8(%rdi)
movq %rax,(%rdi)
PUT(1)
@@ -40,13 +47,14 @@ SYM_FUNC_START(clear_page_orig)
jnz .Lloop
nop
RET
-SYM_FUNC_END(clear_page_orig)
-EXPORT_SYMBOL_GPL(clear_page_orig)
+SYM_FUNC_END(clear_pages_orig)
+EXPORT_SYMBOL_GPL(clear_pages_orig)
-SYM_FUNC_START(clear_page_erms)
- movl $4096,%ecx
+SYM_FUNC_START(clear_pages_erms)
+ movq %rsi,%rcx
+ shlq $PAGE_SHIFT, %rcx
xorl %eax,%eax
rep stosb
RET
-SYM_FUNC_END(clear_page_erms)
-EXPORT_SYMBOL_GPL(clear_page_erms)
+SYM_FUNC_END(clear_pages_erms)
+EXPORT_SYMBOL_GPL(clear_pages_erms)
Add clear_pages(), with ARCH_MAX_CLEAR_PAGES_ORDER=8, so we can clear
in chunks of up to 1024KB. The case for doing this is to expose huge or
gigantic page clearing as a few long strings of zeroes instead of many
PAGE_SIZE'd operations.

Processors could take advantage of this hint by foregoing cacheline
allocation. Unfortunately, current generation CPUs generally do not do
this optimization: among the CPUs tested, Intel Skylake and Icelakex
don't do it at all; AMD Milan does it for extents > ~LLC-size. (Note,
however, that the numbers below do show a ~25% increase in clearing BW
-- just that the gains aren't due to foregoing cacheline allocation.)

One hope for this change is that it might provide enough of a hint for
future uarchs to optimize for.

A minor downside of this change is that calls to clear_page() (which
now calls clear_pages()) clobber an additional register.

Performance
===

 System:    Oracle X9-2c (2 nodes * 32 cores * 2 threads)
 Processor: Intel Xeon(R) Platinum 8358 CPU @ 2.60GHz (Icelakex, 6:106:6)
 Memory:    1024 GB evenly split between nodes
 LLC-size:  48MB for each node (32-cores * 2-threads)
            no_turbo: 1, Microcode: 0xd0002c1, scaling-governor: performance

 System:    Oracle E4-2c (2 nodes * 8 CCXs * 8 cores * 2 threads)
 Processor: AMD EPYC 7J13 64-Core Processor (Milan, 25:1:1)
 Memory:    512 GB evenly split between nodes
 LLC-size:  32MB for each CCX (8-cores * 2-threads)
            boost: 1, Microcode: 0xa00115d, scaling-governor: performance

Workload: create a 192GB qemu-VM (backed by preallocated 2MB pages on
the local node)
==

Icelakex
--
                            Time (s)             Delta (%)
  clear_page_erms()         22.37 ( +- 0.14s )               #  9.21 bytes/ns
  clear_pages_erms()        16.49 ( +- 0.06s )    -26.28%    # 12.50 bytes/ns

Looking at the perf stats [1] [2], it's not obvious where the
improvement is coming from. For clear_pages_erms(), we do execute fewer
instructions and branches (multiple pages per call to
clear_pages_erms(), and fewer cond_resched() calls), but since this
code isn't frontend bound (though there is a marginal improvement in
topdown-fe-bound), it's not clear if that's the cause of the ~25%
improvement.

The topdown-be-bound numbers are significantly better, but they are in
a similar proportion to the total slots in both cases.

Milan
--
                            Time (s)             Delta (%)
  clear_page_erms()         16.49 ( +- 0.06s )               # 12.50 bytes/ns
  clear_pages_erms()        11.82 ( +- 0.06s )    -28.32%    # 17.44 bytes/ns

Similar to the Icelakex case above, from the perf stats [3], [4] it's
unclear where the improvement is coming from. We do somewhat better for
L1-dcache-loads and marginally better for stalled-cycles-backend, but
nothing obvious stands out.

Workload: vm-scalability hugetlb tests (on Icelakex)
==

For case-anon-w-seq-hugetlb, there is a ~19.49% improvement in
cpu-cycles expended. As above, from the perf stats there isn't a clear
reason why. No significant differences in user/kernel cache misses.

case-anon-w-seq-hugetlb:
 -     2,632,688,342,385      cpu-cycles      #    2.301 GHz    ( +-  6.76% )  (33.29%)
 +     2,119,058,504,338      cpu-cycles      #    1.654 GHz    ( +-  4.63% )  (33.37%)

Other hugetlb tests are flat.
case-anon-w-rand-hugetlb:
 -    14,423,774,217,911      cpu-cycles      #    2.452 GHz    ( +-  0.55% )  (33.30%)
 +    14,009,785,056,082      cpu-cycles      #    2.428 GHz    ( +-  3.11% )  (33.32%)

case-anon-cow-seq-hugetlb:
 -     2,689,994,027,601      cpu-cycles      #    2.220 GHz    ( +-  1.91% )  (33.27%)
 +     2,735,414,889,894      cpu-cycles      #    2.262 GHz    ( +-  1.82% )  (27.73%)

case-anon-cow-rand-hugetlb:
 -    16,130,147,328,192      cpu-cycles      #    2.482 GHz    ( +-  1.07% )  (33.30%)
 +    15,815,163,909,204      cpu-cycles      #    2.432 GHz    ( +-  0.64% )  (33.32%)

cache-references, cache-misses are within margin of error across all
the tests.

[1] Icelakex, create 192GB qemu-VM, clear_page_erms()

 # perf stat -r 5 --all-kernel -ddd ./qemu.sh

 Performance counter stats for './qemu.sh' (5 runs):

          22,378.31 msec task-clock                #    1.000 CPUs utilized            ( +-  0.67% )
                153      context-switches          #    6.844 /sec                     ( +-  0.57% )
                  8      cpu-migrations            #    0.358 /sec                     ( +- 16.49% )
                116      page-faults               #    5.189 /sec                     ( +-  0.17% )
     57,290,131,280      cycles                    #    2.563 GHz                      ( +-  0.66% )  (38.46%)
      3,077,416,348      instructions              #    0.05  insn per cycle           ( +-  0.30% )  (46.14%)
        631,473,780      branches                  #   28.246 M/sec                    ( +-  0.18% )  (53.83%)
          1,167,792      branch-misses             #    0.19% of all branches          ( +-  0.79% )  (61.52%)
    286,600,215,705      slots                     #   12.820 G/sec                    ( +-  0.66% )  (69.20%)
     11,435,999,662      topdown-retiring          #     3.9% retiring                 ( +-  1.56% )  (69.20%)
     19,428,489,213      topdown-bad-spec          #     6.2% bad speculation          ( +-  3.23% )  (69.20%)
      3,504,763,769      topdown-fe-bound          #     1.2% frontend bound           ( +-  0.67% )  (69.20%)
    258,517,960,428      topdown-be-bound          #    88.7% backend bound            ( +-  0.58% )  (69.20%)
        749,211,322      L1-dcache-loads           #   33.513 M/sec                    ( +-  0.13% )  (69.18%)
      3,244,380,956      L1-dcache-load-misses     #  433.32% of all L1-dcache accesses  ( +-  0.00% )  (69.20%)
         11,441,841      LLC-loads                 #  511.805 K/sec                    ( +-  0.30% )  (69.23%)
            839,878      LLC-load-misses           #    7.32% of all LL-cache accesses ( +-  1.28% )  (69.24%)
    <not supported>      L1-icache-loads
         23,091,397      L1-icache-load-misses                                         ( +-  0.72% )  (30.82%)
        772,619,434      dTLB-loads                #   34.560 M/sec                    ( +-  0.31% )  (30.82%)
             49,750      dTLB-load-misses          #    0.01% of all dTLB cache accesses  ( +-  3.21% )  (30.80%)
    <not supported>      iTLB-loads
            503,570      iTLB-load-misses                                              ( +-  0.44% )  (30.78%)
    <not supported>      L1-dcache-prefetches
    <not supported>      L1-dcache-prefetch-misses

             22.374 +- 0.149 seconds time elapsed  ( +-  0.66% )

[2] Icelakex, create 192GB qemu-VM, clear_pages_erms()

 # perf stat -r 5 --all-kernel -ddd ./qemu.sh

 Performance counter stats for './qemu.sh' (5 runs):

          16,329.41 msec task-clock                #    0.990 CPUs utilized            ( +-  0.42% )
                143      context-switches          #    8.681 /sec                     ( +-  0.93% )
                  1      cpu-migrations            #    0.061 /sec                     ( +- 63.25% )
                118      page-faults               #    7.164 /sec                     ( +-  0.27% )
     41,735,523,673      cycles                    #    2.534 GHz                      ( +-  0.42% )  (38.46%)
      1,454,116,543      instructions              #    0.03  insn per cycle           ( +-  0.49% )  (46.16%)
        266,749,920      branches                  #   16.194 M/sec                    ( +-  0.41% )  (53.86%)
            928,726      branch-misses             #    0.35% of all branches          ( +-  0.38% )  (61.54%)
    208,805,754,709      slots                     #   12.676 G/sec                    ( +-  0.41% )  (69.23%)
      5,355,889,366      topdown-retiring          #     2.5% retiring                 ( +-  0.50% )  (69.23%)
     12,720,749,784      topdown-bad-spec          #     6.1% bad speculation          ( +-  1.38% )  (69.23%)
        998,710,552      topdown-fe-bound          #     0.5% frontend bound           ( +-  0.85% )  (69.23%)
    192,653,197,875      topdown-be-bound          #    90.9% backend bound            ( +-  0.38% )  (69.23%)
        407,619,058      L1-dcache-loads           #   24.746 M/sec                    ( +-  0.17% )  (69.20%)
      3,245,399,461      L1-dcache-load-misses     #  801.49% of all L1-dcache accesses  ( +-  0.01% )  (69.22%)
         10,805,747      LLC-loads                 #  656.009 K/sec                    ( +-  0.37% )  (69.25%)
            804,475      LLC-load-misses           #    7.44% of all LL-cache accesses ( +-  2.73% )  (69.26%)
    <not supported>      L1-icache-loads
         18,134,527      L1-icache-load-misses                                         ( +-  1.24% )  (30.80%)
        435,474,462      dTLB-loads                #   26.437 M/sec                    ( +-  0.28% )  (30.80%)
             41,187      dTLB-load-misses          #    0.01% of all dTLB cache accesses  ( +-  4.06% )  (30.79%)
    <not supported>      iTLB-loads
            440,135      iTLB-load-misses                                              ( +-  1.07% )  (30.78%)
    <not supported>      L1-dcache-prefetches
    <not supported>      L1-dcache-prefetch-misses

            16.4906 +- 0.0676 seconds time elapsed  ( +-  0.41% )

[3] Milan, create 192GB qemu-VM, clear_page_erms()

 # perf stat -r 5 --all-kernel -ddd ./qemu.sh

 Performance counter stats for './qemu.sh' (5 runs):

          16,321.98 msec task-clock                #    0.989 CPUs utilized            ( +-  0.42% )
                104      context-switches          #    6.312 /sec                     ( +-  0.47% )
                  0      cpu-migrations            #    0.000 /sec
                109      page-faults               #    6.616 /sec                     ( +-  0.41% )
     39,430,057,963      cycles                    #    2.393 GHz                      ( +-  0.42% )  (33.33%)
        252,874,009      stalled-cycles-frontend   #    0.64% frontend cycles idle     ( +- 17.81% )  (33.34%)
          7,240,041      stalled-cycles-backend    #    0.02% backend cycles idle      ( +-245.73% )  (33.34%)
      3,031,754,124      instructions              #    0.08  insn per cycle
                                                   #    0.11  stalled cycles per insn  ( +-  0.41% )  (33.35%)
        711,675,976      branches                  #   43.197 M/sec                    ( +-  0.15% )  (33.34%)
         52,470,018      branch-misses             #    7.38% of all branches          ( +-  0.21% )  (33.36%)
      7,744,057,748      L1-dcache-loads           #  470.041 M/sec                    ( +-  0.05% )  (33.36%)
      3,241,880,079      L1-dcache-load-misses     #   41.92% of all L1-dcache accesses  ( +-  0.01% )  (33.35%)
    <not supported>      LLC-loads
    <not supported>      LLC-load-misses
        155,312,115      L1-icache-loads           #    9.427 M/sec                    ( +-  0.23% )  (33.34%)
          1,573,793      L1-icache-load-misses     #    1.01% of all L1-icache accesses  ( +-  3.74% )  (33.36%)
          3,521,392      dTLB-loads                #  213.738 K/sec                    ( +-  4.97% )  (33.35%)
            346,337      dTLB-load-misses          #    9.31% of all dTLB cache accesses  ( +-  5.54% )  (33.35%)
                725      iTLB-loads                #   44.005 /sec                     ( +-  8.75% )  (33.34%)
            115,723      iTLB-load-misses          # 19261.48% of all iTLB cache accesses  ( +-  1.20% )  (33.34%)
        139,229,403      L1-dcache-prefetches      #    8.451 M/sec                    ( +- 10.97% )  (33.34%)
    <not supported>      L1-dcache-prefetch-misses

            16.4962 +- 0.0665 seconds time elapsed  ( +-  0.40% )

[4] Milan, create 192GB qemu-VM, clear_pages_erms()

 # perf stat -r 5 --all-kernel -ddd ./qemu.sh

 Performance counter stats for './qemu.sh' (5 runs):

          11,676.79 msec task-clock                #    0.987 CPUs utilized            ( +-  0.68% )
                 96      context-switches          #    8.131 /sec                     ( +-  0.78% )
                  2      cpu-migrations            #    0.169 /sec                     ( +- 18.71% )
                106      page-faults               #    8.978 /sec                     ( +-  0.23% )
     28,161,726,414      cycles                    #    2.385 GHz                      ( +-  0.69% )  (33.33%)
        141,032,827      stalled-cycles-frontend   #    0.50% frontend cycles idle     ( +- 52.44% )  (33.35%)
        796,792,139      stalled-cycles-backend    #    2.80% backend cycles idle      ( +- 23.73% )  (33.35%)
      1,140,172,646      instructions              #    0.04  insn per cycle
                                                   #    0.50  stalled cycles per insn  ( +-  0.89% )  (33.35%)
        219,864,061      branches                  #   18.622 M/sec                    ( +-  1.06% )  (33.36%)
          1,407,446      branch-misses             #    0.63% of all branches          ( +- 10.66% )  (33.40%)
      6,882,968,897      L1-dcache-loads           #  582.960 M/sec                    ( +-  0.03% )  (33.38%)
      3,267,546,914      L1-dcache-load-misses     #   47.45% of all L1-dcache accesses  ( +-  0.02% )  (33.37%)
    <not supported>      LLC-loads
    <not supported>      LLC-load-misses
        146,901,513      L1-icache-loads           #   12.442 M/sec                    ( +-  0.78% )  (33.36%)
          1,462,155      L1-icache-load-misses     #    0.99% of all L1-icache accesses  ( +-  0.83% )  (33.34%)
          2,055,805      dTLB-loads                #  174.118 K/sec                    ( +- 22.56% )  (33.33%)
            136,260      dTLB-load-misses          #    4.69% of all dTLB cache accesses  ( +- 23.13% )  (33.35%)
                941      iTLB-loads                #   79.699 /sec                     ( +-  5.54% )  (33.35%)
            115,444      iTLB-load-misses          # 14051.12% of all iTLB cache accesses  ( +- 21.17% )  (33.34%)
         95,438,373      L1-dcache-prefetches      #    8.083 M/sec                    ( +- 19.99% )  (33.34%)
    <not supported>      L1-dcache-prefetch-misses
             11.8296 +- 0.0805 seconds time elapsed  ( +-  0.68% )

Signed-off-by: Ankur Arora <ankur.a.arora@oracle.com>
---
 arch/x86/include/asm/page.h    | 12 +++++++++++
 arch/x86/include/asm/page_64.h | 28 ++++++++++++++++++-------
 arch/x86/lib/clear_page_64.S   | 38 ++++++++++++++++++++--------------
 3 files changed, 55 insertions(+), 23 deletions(-)
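
For illustration only (not part of the patch): the intended calling
pattern is to walk a large physically contiguous region in chunks of at
most 1 << ARCH_MAX_CLEAR_PAGES_ORDER base pages, with a cond_resched()
between chunks so each burst stays near the ~160us budget described in
the page_64.h comment. In the sketch below, only clear_pages() and
ARCH_MAX_CLEAR_PAGES_ORDER come from this series; clear_contig_region()
and its chunking policy are assumptions made up for the example.

/*
 * Illustrative sketch only: clear a physically contiguous, page-aligned
 * region npages long, in chunks of at most
 * (1 << ARCH_MAX_CLEAR_PAGES_ORDER) base pages, rescheduling between
 * chunks. Assumes the usual headers for PAGE_SIZE, min() and
 * cond_resched().
 */
static void clear_contig_region(void *addr, unsigned long npages)
{
	while (npages) {
		unsigned int n = min(npages,
				     1UL << ARCH_MAX_CLEAR_PAGES_ORDER);

		clear_pages(addr, n);	/* one long rep;stos extent */
		addr += n * PAGE_SIZE;
		npages -= n;
		cond_resched();		/* bound the uninterrupted stretch */
	}
}

With ARCH_MAX_CLEAR_PAGES_ORDER=8 each clear_pages() call covers 256
pages (1024KB), which at the ~3 bytes/cycle assumed in the page_64.h
comment works out to roughly 400K cycles, i.e. ~160us at ~2.5GHz,
between cond_resched() calls.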