Message ID | 1344524583-1096-5-git-send-email-kirill.shutemov@linux.intel.com
---|---
State | Superseded
>>> On 09.08.12 at 17:03, "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> wrote:
> From: Andi Kleen <ak@linux.intel.com>
>
> Add a cache-avoiding version of clear_page. Straightforward integer
> variant of the existing 64-bit clear_page, for both 32-bit and 64-bit.

While on 64-bit this is fine, I fail to see how you avoid using the SSE2
instruction on non-SSE2 systems.

> Also add the necessary glue for highmem, including a layer that
> non-cache-coherent architectures that use the virtual address for
> flushing can hook in. This is not needed on x86, of course.
>
> If an architecture wants to provide a cache-avoiding version of
> clear_page, it should define ARCH_HAS_USER_NOCACHE to 1 and implement
> clear_page_nocache() and clear_user_highpage_nocache().
>
> Signed-off-by: Andi Kleen <ak@linux.intel.com>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>  arch/x86/include/asm/page.h          |  2 ++
>  arch/x86/include/asm/string_32.h     |  5 +++++
>  arch/x86/include/asm/string_64.h     |  5 +++++
>  arch/x86/lib/Makefile                |  1 +
>  arch/x86/lib/clear_page_nocache_32.S | 30 ++++++++++++++++++++++++++++++
>  arch/x86/lib/clear_page_nocache_64.S | 29 +++++++++++++++++++++++++++++

Couldn't this more reasonably go into clear_page_{32,64}.S?

>  arch/x86/mm/fault.c                  |  7 +++++++
>  7 files changed, 79 insertions(+), 0 deletions(-)
>  create mode 100644 arch/x86/lib/clear_page_nocache_32.S
>  create mode 100644 arch/x86/lib/clear_page_nocache_64.S
>...
>--- /dev/null
>+++ b/arch/x86/lib/clear_page_nocache_32.S
>@@ -0,0 +1,30 @@
>+#include <linux/linkage.h>
>+#include <asm/dwarf2.h>
>+
>+/*
>+ * Zero a page avoiding the caches
>+ * rdi	page

Wrong comment.

>+ */
>+ENTRY(clear_page_nocache)
>+	CFI_STARTPROC
>+	mov    %eax,%edi

You need to pick a different register here (e.g. %edx), since %edi
has to be preserved by all functions called from C.

>+	xorl   %eax,%eax
>+	movl   $4096/64,%ecx
>+	.p2align 4
>+.Lloop:
>+	decl	%ecx
>+#define PUT(x) movnti %eax,x*8(%edi) ; movnti %eax,x*8+4(%edi)

Is doing twice as much unrolling as on 64-bit really worth it?

Jan
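Concretely, Jan's register fix would presumably have the entry sequence
copy the page pointer from %eax into caller-clobbered %edx, leaving
callee-saved %edi untouched. A minimal sketch of the corrected 32-bit
routine (my reconstruction, not code posted in the thread):

	ENTRY(clear_page_nocache)
		CFI_STARTPROC
		mov    %eax,%edx	/* %edx is caller-saved; %edi must be preserved */
		xorl   %eax,%eax
		movl   $4096/64,%ecx
		.p2align 4
	.Lloop:
		decl   %ecx
	#define PUT(x) movnti %eax,x*8(%edx) ; movnti %eax,x*8+4(%edx)
		PUT(0)
		PUT(1)
		PUT(2)
		PUT(3)
		PUT(4)
		PUT(5)
		PUT(6)
		PUT(7)
	#undef PUT
		lea    64(%edx),%edx
		jnz    .Lloop
		ret
		CFI_ENDPROC
	ENDPROC(clear_page_nocache)

This matches the register choice Kirill later uses in his userspace test
harness further down the thread.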
On 08/09/2012 08:03 AM, Kirill A. Shutemov wrote:
> From: Andi Kleen <ak@linux.intel.com>
>
> Add a cache-avoiding version of clear_page. Straightforward integer
> variant of the existing 64-bit clear_page, for both 32-bit and 64-bit.
>
> Also add the necessary glue for highmem, including a layer that
> non-cache-coherent architectures that use the virtual address for
> flushing can hook in. This is not needed on x86, of course.
>
> If an architecture wants to provide a cache-avoiding version of
> clear_page, it should define ARCH_HAS_USER_NOCACHE to 1 and implement
> clear_page_nocache() and clear_user_highpage_nocache().

Compile failure:

/home/hpa/kernel/tip.x86-mm/arch/x86/mm/fault.c: In function ‘clear_user_highpage_nocache’:
/home/hpa/kernel/tip.x86-mm/arch/x86/mm/fault.c:1215:30: error: ‘KM_USER0’ undeclared (first use in this function)
/home/hpa/kernel/tip.x86-mm/arch/x86/mm/fault.c:1215:30: note: each undeclared identifier is reported only once for each function it appears in
/home/hpa/kernel/tip.x86-mm/arch/x86/mm/fault.c:1215:2: error: too many arguments to function ‘kmap_atomic’
In file included from /home/hpa/kernel/tip.x86-mm/include/linux/pagemap.h:10:0,
                 from /home/hpa/kernel/tip.x86-mm/include/linux/mempolicy.h:70,
                 from /home/hpa/kernel/tip.x86-mm/include/linux/hugetlb.h:15,
                 from /home/hpa/kernel/tip.x86-mm/arch/x86/mm/fault.c:14:
/home/hpa/kernel/tip.x86-mm/include/linux/highmem.h:66:21: note: declared here
make[4]: *** [arch/x86/mm/fault.o] Error 1
make[3]: *** [arch/x86/mm] Error 2
make[2]: *** [arch/x86] Error 2
make[1]: *** [sub-make] Error 2
make[1]: Leaving directory `/home/hpa/kernel/tip.x86-mm'

This happens on *all* my test configurations, including both x86-64 and
i386 allyesconfig. I suspect your patchset base is stale.

	-hpa
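The errors stem from the kmap_atomic() API change: the km_type slot
argument (KM_USER0) was removed, leaving a one-argument kmap_atomic(page).
Rebased onto such a tree, the helper added to fault.c would presumably
read (a minimal sketch assuming the one-argument API):

	void clear_user_highpage_nocache(struct page *page, unsigned long vaddr)
	{
		void *p = kmap_atomic(page);	/* KM_USER0 slot argument no longer exists */

		clear_page_nocache(p);
		kunmap_atomic(p);
	}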
On Thu, Aug 09, 2012 at 04:22:04PM +0100, Jan Beulich wrote:
> >>> On 09.08.12 at 17:03, "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> wrote:

...

> > ---
> >  arch/x86/include/asm/page.h          |  2 ++
> >  arch/x86/include/asm/string_32.h     |  5 +++++
> >  arch/x86/include/asm/string_64.h     |  5 +++++
> >  arch/x86/lib/Makefile                |  1 +
> >  arch/x86/lib/clear_page_nocache_32.S | 30 ++++++++++++++++++++++++++++++
> >  arch/x86/lib/clear_page_nocache_64.S | 29 +++++++++++++++++++++++++++++
>
> Couldn't this more reasonably go into clear_page_{32,64}.S?

We don't have clear_page_32.S.

> >+	xorl   %eax,%eax
> >+	movl   $4096/64,%ecx
> >+	.p2align 4
> >+.Lloop:
> >+	decl	%ecx
> >+#define PUT(x) movnti %eax,x*8(%edi) ; movnti %eax,x*8+4(%edi)
>
> Is doing twice as much unrolling as on 64-bit really worth it?

Moving 64 bytes per loop iteration is faster on Sandy Bridge, but slower
on Westmere. Any preference? ;)

Westmere:

 Performance counter stats for './test_unroll32' (20 runs):

      31498.420608 task-clock                #    0.998 CPUs utilized            ( +-  0.25% )
                40 context-switches          #    0.001 K/sec                    ( +-  1.40% )
                 0 CPU-migrations            #    0.000 K/sec                    ( +-100.00% )
                89 page-faults               #    0.003 K/sec                    ( +-  0.13% )
    74,728,231,935 cycles                    #    2.372 GHz                      ( +-  0.25% ) [83.34%]
    53,789,969,009 stalled-cycles-frontend   #   71.98% frontend cycles idle     ( +-  0.35% ) [83.33%]
    41,681,014,054 stalled-cycles-backend    #   55.78% backend cycles idle      ( +-  0.43% ) [66.67%]
    37,992,733,278 instructions              #    0.51  insns per cycle
                                             #    1.42  stalled cycles per insn  ( +-  0.05% ) [83.33%]
     3,561,376,245 branches                  #  113.065 M/sec                    ( +-  0.05% ) [83.33%]
        27,182,795 branch-misses             #    0.76% of all branches          ( +-  0.06% ) [83.33%]

      31.558545812 seconds time elapsed                                          ( +-  0.25% )

 Performance counter stats for './test_unroll64' (20 runs):

      31564.753623 task-clock                #    0.998 CPUs utilized            ( +-  0.19% )
                39 context-switches          #    0.001 K/sec                    ( +-  0.40% )
                 0 CPU-migrations            #    0.000 K/sec
                90 page-faults               #    0.003 K/sec                    ( +-  0.12% )
    74,886,045,192 cycles                    #    2.372 GHz                      ( +-  0.19% ) [83.33%]
    57,477,323,995 stalled-cycles-frontend   #   76.75% frontend cycles idle     ( +-  0.26% ) [83.34%]
    44,548,142,150 stalled-cycles-backend    #   59.49% backend cycles idle      ( +-  0.31% ) [66.67%]
    32,940,027,099 instructions              #    0.44  insns per cycle
                                             #    1.74  stalled cycles per insn  ( +-  0.05% ) [83.34%]
     1,884,944,093 branches                  #   59.717 M/sec                    ( +-  0.05% ) [83.32%]
         1,027,135 branch-misses             #    0.05% of all branches          ( +-  0.56% ) [83.34%]

      31.621001407 seconds time elapsed                                          ( +-  0.19% )

Sandy Bridge:

 Performance counter stats for './test_unroll32' (20 runs):

       8578.382891 task-clock                #    0.997 CPUs utilized            ( +-  0.08% )
                15 context-switches          #    0.000 M/sec                    ( +-  2.97% )
                 0 CPU-migrations            #    0.000 M/sec
                84 page-faults               #    0.000 M/sec                    ( +-  0.13% )
    29,154,476,597 cycles                    #    3.399 GHz                      ( +-  0.08% ) [83.33%]
    11,851,215,147 stalled-cycles-frontend   #   40.65% frontend cycles idle     ( +-  0.20% ) [83.33%]
     1,530,172,593 stalled-cycles-backend    #    5.25% backend cycles idle      ( +-  1.44% ) [66.67%]
    37,915,778,094 instructions              #    1.30  insns per cycle
                                             #    0.31  stalled cycles per insn  ( +-  0.00% ) [83.34%]
     3,590,533,447 branches                  #  418.556 M/sec                    ( +-  0.01% ) [83.35%]
        26,500,765 branch-misses             #    0.74% of all branches          ( +-  0.01% ) [83.34%]

       8.604638449 seconds time elapsed                                          ( +-  0.08% )

 Performance counter stats for './test_unroll64' (20 runs):

       8463.789963 task-clock                #    0.997 CPUs utilized            ( +-  0.07% )
                14 context-switches          #    0.000 M/sec                    ( +-  1.70% )
                 0 CPU-migrations            #    0.000 M/sec                    ( +-100.00% )
                85 page-faults               #    0.000 M/sec                    ( +-  0.12% )
    28,763,328,688 cycles                    #    3.398 GHz                      ( +-  0.07% ) [83.32%]
    13,517,462,952 stalled-cycles-frontend   #   47.00% frontend cycles idle     ( +-  0.14% ) [83.33%]
     1,356,208,859 stalled-cycles-backend    #    4.72% backend cycles idle      ( +-  1.42% ) [66.68%]
    32,885,492,141 instructions              #    1.14  insns per cycle
                                             #    0.41  stalled cycles per insn  ( +-  0.00% ) [83.34%]
     1,912,094,072 branches                  #  225.915 M/sec                    ( +-  0.02% ) [83.34%]
           305,896 branch-misses             #    0.02% of all branches          ( +-  1.05% ) [83.33%]

       8.488304839 seconds time elapsed                                          ( +-  0.07% )

$ cat test.c
#include <stdio.h>
#include <sys/mman.h>

#define SIZE 1024*1024*1024

void clear_page_nocache_sse2(void *page) __attribute__((regparm(1)));

int main(int argc, char** argv)
{
	char *p;
	unsigned long i, j;

	p = mmap(NULL, SIZE, PROT_WRITE|PROT_READ,
			MAP_PRIVATE|MAP_ANONYMOUS|MAP_POPULATE, -1, 0);
	for(j = 0; j < 100; j++) {
		for(i = 0; i < SIZE; i += 4096) {
			clear_page_nocache_sse2(p + i);
		}
	}

	return 0;
}
$ cat clear_page_nocache_unroll32.S
.globl clear_page_nocache_sse2
.align 4,0x90
clear_page_nocache_sse2:
	.cfi_startproc
	mov    %eax,%edx
	xorl   %eax,%eax
	movl   $4096/32,%ecx
	.p2align 4
.Lloop_sse2:
	decl   %ecx
#define PUT(x) movnti %eax,x*4(%edx)
	PUT(0)
	PUT(1)
	PUT(2)
	PUT(3)
	PUT(4)
	PUT(5)
	PUT(6)
	PUT(7)
#undef PUT
	lea    32(%edx),%edx
	jnz    .Lloop_sse2
	nop
	ret
	.cfi_endproc
	.type clear_page_nocache_sse2, @function
	.size clear_page_nocache_sse2, .-clear_page_nocache_sse2
$ cat clear_page_nocache_unroll64.S
.globl clear_page_nocache_sse2
.align 4,0x90
clear_page_nocache_sse2:
	.cfi_startproc
	mov    %eax,%edx
	xorl   %eax,%eax
	movl   $4096/64,%ecx
	.p2align 4
.Lloop_sse2:
	decl   %ecx
#define PUT(x) movnti %eax,x*8(%edx) ; movnti %eax,x*8+4(%edx)
	PUT(0)
	PUT(1)
	PUT(2)
	PUT(3)
	PUT(4)
	PUT(5)
	PUT(6)
	PUT(7)
#undef PUT
	lea    64(%edx),%edx
	jnz    .Lloop_sse2
	nop
	ret
	.cfi_endproc
	.type clear_page_nocache_sse2, @function
	.size clear_page_nocache_sse2, .-clear_page_nocache_sse2
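For reference, the harness above would presumably be built as a 32-bit
binary, since regparm(1) only takes effect on i386 and the routine
expects its argument in %eax; the exact commands are my assumption, not
part of the thread:

	$ gcc -m32 -O2 -o test_unroll32 test.c clear_page_nocache_unroll32.S
	$ gcc -m32 -O2 -o test_unroll64 test.c clear_page_nocache_unroll64.S
	$ perf stat -r 20 ./test_unroll32
	$ perf stat -r 20 ./test_unroll64

The "-r 20" flag matches the "(20 runs)" in the perf output above.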
>>> On 13.08.12 at 13:43, "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> wrote:
> On Thu, Aug 09, 2012 at 04:22:04PM +0100, Jan Beulich wrote:
>> >>> On 09.08.12 at 17:03, "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> wrote:
>
> ...
>
>> > ---
>> >  arch/x86/include/asm/page.h          |  2 ++
>> >  arch/x86/include/asm/string_32.h     |  5 +++++
>> >  arch/x86/include/asm/string_64.h     |  5 +++++
>> >  arch/x86/lib/Makefile                |  1 +
>> >  arch/x86/lib/clear_page_nocache_32.S | 30 ++++++++++++++++++++++++++++++
>> >  arch/x86/lib/clear_page_nocache_64.S | 29 +++++++++++++++++++++++++++++
>>
>> Couldn't this more reasonably go into clear_page_{32,64}.S?
>
> We don't have clear_page_32.S.

Sure, but you're introducing a file anyway. Fold the new code into the
existing file for 64-bit, and create a new, similarly named one for
32-bit.

>> >+	xorl   %eax,%eax
>> >+	movl   $4096/64,%ecx
>> >+	.p2align 4
>> >+.Lloop:
>> >+	decl	%ecx
>> >+#define PUT(x) movnti %eax,x*8(%edi) ; movnti %eax,x*8+4(%edi)
>>
>> Is doing twice as much unrolling as on 64-bit really worth it?
>
> Moving 64 bytes per loop iteration is faster on Sandy Bridge, but slower
> on Westmere. Any preference? ;)

If it's not a clear win, I'd favor the 8-stores-per-iteration variant,
matching x86-64.

Jan
> Moving 64 bytes per loop iteration is faster on Sandy Bridge, but slower
> on Westmere. Any preference? ;)

You have to be careful with these benchmarks:

- You need to make sure the data is cache cold; cache-hot numbers are
  misleading.
- The numbers can change if you have multiple CPUs doing this in
  parallel.

-Andi
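One way to address the cache-cold point is to explicitly evict each page
between timing runs. A minimal sketch (my addition, not from the thread;
flush_page() is a hypothetical helper and the 64-byte line size is an
assumption):

	#include <emmintrin.h>	/* _mm_clflush, _mm_mfence -- SSE2 intrinsics */

	/* hypothetical helper: evict one 4 KiB page from the cache hierarchy */
	static void flush_page(void *page)
	{
		char *p = page;
		int i;

		for (i = 0; i < 4096; i += 64)	/* assumed 64-byte cache lines */
			_mm_clflush(p + i);
		_mm_mfence();			/* wait for the flushes to complete */
	}

Calling this on each page inside the benchmark loop keeps the target
data out of the caches, at the cost of also timing the flush overhead.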
On Mon, Aug 13, 2012 at 02:43:34PM +0300, Kirill A. Shutemov wrote:
> $ cat test.c
> #include <stdio.h>
> #include <sys/mman.h>
>
> #define SIZE 1024*1024*1024
>
> void clear_page_nocache_sse2(void *page) __attribute__((regparm(1)));
>
> int main(int argc, char** argv)
> {
> 	char *p;
> 	unsigned long i, j;
>
> 	p = mmap(NULL, SIZE, PROT_WRITE|PROT_READ,
> 			MAP_PRIVATE|MAP_ANONYMOUS|MAP_POPULATE, -1, 0);
> 	for(j = 0; j < 100; j++) {
> 		for(i = 0; i < SIZE; i += 4096) {
> 			clear_page_nocache_sse2(p + i);
> 		}
> 	}
>
> 	return 0;
> }
> $ cat clear_page_nocache_unroll32.S
> .globl clear_page_nocache_sse2
> .align 4,0x90
> clear_page_nocache_sse2:
> 	.cfi_startproc
> 	mov    %eax,%edx
> 	xorl   %eax,%eax
> 	movl   $4096/32,%ecx
> 	.p2align 4
> .Lloop_sse2:
> 	decl   %ecx
> #define PUT(x) movnti %eax,x*4(%edx)
> 	PUT(0)
> 	PUT(1)
> 	PUT(2)
> 	PUT(3)
> 	PUT(4)
> 	PUT(5)
> 	PUT(6)
> 	PUT(7)
> #undef PUT
> 	lea    32(%edx),%edx
> 	jnz    .Lloop_sse2
> 	nop
> 	ret
> 	.cfi_endproc
> 	.type clear_page_nocache_sse2, @function
> 	.size clear_page_nocache_sse2, .-clear_page_nocache_sse2
> $ cat clear_page_nocache_unroll64.S
> .globl clear_page_nocache_sse2
> .align 4,0x90
> clear_page_nocache_sse2:
> 	.cfi_startproc
> 	mov    %eax,%edx

This must still be the 32-bit version, because it segfaults here. Here's
why: mmap above gives a ptr which, on 64-bit, is larger than 32 bits,
i.e. it looks like 0x7fffxxxxx000, i.e. starting from the top of
userspace. Now, the mov above truncates that ptr and the thing segfaults.

Doing s/edx/rdx/g fixes it though.

Thanks.
On Mon, Aug 13, 2012 at 07:04:02PM +0200, Borislav Petkov wrote:
> On Mon, Aug 13, 2012 at 02:43:34PM +0300, Kirill A. Shutemov wrote:
> > $ cat test.c
> > #include <stdio.h>
> > #include <sys/mman.h>
> >
> > #define SIZE 1024*1024*1024
> >
> > void clear_page_nocache_sse2(void *page) __attribute__((regparm(1)));
> >
> > int main(int argc, char** argv)
> > {
> > 	char *p;
> > 	unsigned long i, j;
> >
> > 	p = mmap(NULL, SIZE, PROT_WRITE|PROT_READ,
> > 			MAP_PRIVATE|MAP_ANONYMOUS|MAP_POPULATE, -1, 0);
> > 	for(j = 0; j < 100; j++) {
> > 		for(i = 0; i < SIZE; i += 4096) {
> > 			clear_page_nocache_sse2(p + i);
> > 		}
> > 	}
> >
> > 	return 0;
> > }
> > $ cat clear_page_nocache_unroll32.S
> > .globl clear_page_nocache_sse2
> > .align 4,0x90
> > clear_page_nocache_sse2:
> > 	.cfi_startproc
> > 	mov    %eax,%edx
> > 	xorl   %eax,%eax
> > 	movl   $4096/32,%ecx
> > 	.p2align 4
> > .Lloop_sse2:
> > 	decl   %ecx
> > #define PUT(x) movnti %eax,x*4(%edx)
> > 	PUT(0)
> > 	PUT(1)
> > 	PUT(2)
> > 	PUT(3)
> > 	PUT(4)
> > 	PUT(5)
> > 	PUT(6)
> > 	PUT(7)
> > #undef PUT
> > 	lea    32(%edx),%edx
> > 	jnz    .Lloop_sse2
> > 	nop
> > 	ret
> > 	.cfi_endproc
> > 	.type clear_page_nocache_sse2, @function
> > 	.size clear_page_nocache_sse2, .-clear_page_nocache_sse2
> > $ cat clear_page_nocache_unroll64.S
> > .globl clear_page_nocache_sse2
> > .align 4,0x90
> > clear_page_nocache_sse2:
> > 	.cfi_startproc
> > 	mov    %eax,%edx
>
> This must still be the 32-bit version, because it segfaults here.

Yes, it's the test for the 32-bit version.
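For completeness, a 64-bit-clean variant of the harness routine along
the lines of Borislav's s/edx/rdx/ fix might look like this (my
reconstruction, not posted in the thread; on x86-64 the regparm
attribute is ignored and the argument arrives in %rdi per the SysV ABI):

	.globl clear_page_nocache_sse2
	.align 4,0x90
	clear_page_nocache_sse2:
		.cfi_startproc
		mov    %rdi,%rdx	/* SysV AMD64 ABI: first argument in %rdi */
		xorl   %eax,%eax	/* zeroing %eax clears all of %rax */
		movl   $4096/64,%ecx
		.p2align 4
	.Lloop_sse2:
		decl   %ecx
	#define PUT(x) movnti %rax,x*8(%rdx)
		PUT(0)
		PUT(1)
		PUT(2)
		PUT(3)
		PUT(4)
		PUT(5)
		PUT(6)
		PUT(7)
	#undef PUT
		lea    64(%rdx),%rdx
		jnz    .Lloop_sse2
		ret
		.cfi_endproc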
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 8ca8283..aa83a1b 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -29,6 +29,8 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
 	copy_page(to, from);
 }
 
+void clear_user_highpage_nocache(struct page *page, unsigned long vaddr);
+
 #define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
 	alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
 #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index 3d3e835..3f2fbcf 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -3,6 +3,8 @@
 
 #ifdef __KERNEL__
 
+#include <linux/linkage.h>
+
 /* Let gcc decide whether to inline or use the out of line functions */
 
 #define __HAVE_ARCH_STRCPY
@@ -337,6 +339,9 @@ void *__constant_c_and_count_memset(void *s, unsigned long pattern,
 #define __HAVE_ARCH_MEMSCAN
 extern void *memscan(void *addr, int c, size_t size);
 
+#define ARCH_HAS_USER_NOCACHE 1
+asmlinkage void clear_page_nocache(void *page);
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_X86_STRING_32_H */
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 19e2c46..ca23d1d 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -3,6 +3,8 @@
 
 #ifdef __KERNEL__
 
+#include <linux/linkage.h>
+
 /* Written 2002 by Andi Kleen */
 
 /* Only used for special circumstances. Stolen from i386/string.h */
@@ -63,6 +65,9 @@ char *strcpy(char *dest, const char *src);
 char *strcat(char *dest, const char *src);
 int strcmp(const char *cs, const char *ct);
 
+#define ARCH_HAS_USER_NOCACHE 1
+asmlinkage void clear_page_nocache(void *page);
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_X86_STRING_64_H */
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index b00f678..a8ad6dd 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -23,6 +23,7 @@ lib-y += memcpy_$(BITS).o
 lib-$(CONFIG_SMP) += rwlock.o
 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
 lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
+lib-y += clear_page_nocache_$(BITS).o
 
 obj-y += msr.o msr-reg.o msr-reg-export.o
diff --git a/arch/x86/lib/clear_page_nocache_32.S b/arch/x86/lib/clear_page_nocache_32.S
new file mode 100644
index 0000000..2394e0c
--- /dev/null
+++ b/arch/x86/lib/clear_page_nocache_32.S
@@ -0,0 +1,30 @@
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+/*
+ * Zero a page avoiding the caches
+ * rdi	page
+ */
+ENTRY(clear_page_nocache)
+	CFI_STARTPROC
+	mov    %eax,%edi
+	xorl   %eax,%eax
+	movl   $4096/64,%ecx
+	.p2align 4
+.Lloop:
+	decl	%ecx
+#define PUT(x) movnti %eax,x*8(%edi) ; movnti %eax,x*8+4(%edi)
+	PUT(0)
+	PUT(1)
+	PUT(2)
+	PUT(3)
+	PUT(4)
+	PUT(5)
+	PUT(6)
+	PUT(7)
+	lea	64(%edi),%edi
+	jnz	.Lloop
+	nop
+	ret
+	CFI_ENDPROC
+ENDPROC(clear_page_nocache)
diff --git a/arch/x86/lib/clear_page_nocache_64.S b/arch/x86/lib/clear_page_nocache_64.S
new file mode 100644
index 0000000..ee16d15
--- /dev/null
+++ b/arch/x86/lib/clear_page_nocache_64.S
@@ -0,0 +1,29 @@
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+/*
+ * Zero a page avoiding the caches
+ * rdi	page
+ */
+ENTRY(clear_page_nocache)
+	CFI_STARTPROC
+	xorl	%eax,%eax
+	movl	$4096/64,%ecx
+	.p2align 4
+.Lloop:
+	decl	%ecx
+#define PUT(x) movnti %rax,x*8(%rdi)
+	movnti	%rax,(%rdi)
+	PUT(1)
+	PUT(2)
+	PUT(3)
+	PUT(4)
+	PUT(5)
+	PUT(6)
+	PUT(7)
+	leaq	64(%rdi),%rdi
+	jnz	.Lloop
+	nop
+	ret
+	CFI_ENDPROC
+ENDPROC(clear_page_nocache)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 76dcd9d..20888b4 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1209,3 +1209,10 @@ good_area:
 
 	up_read(&mm->mmap_sem);
 }
+
+void clear_user_highpage_nocache(struct page *page, unsigned long vaddr)
+{
+	void *p = kmap_atomic(page, KM_USER0);
+	clear_page_nocache(p);
+	kunmap_atomic(p);
+}
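For context, the "glue" mentioned in the commit message would presumably
provide a generic fallback so architectures that don't define
ARCH_HAS_USER_NOCACHE keep building; a sketch of how that layer in
include/linux/highmem.h might look (my reconstruction -- the actual glue
lives in another patch of this series):

	/* include/linux/highmem.h (sketch) */
	#ifndef ARCH_HAS_USER_NOCACHE
	#define ARCH_HAS_USER_NOCACHE 0
	#endif

	#if !ARCH_HAS_USER_NOCACHE
	static inline void clear_user_highpage_nocache(struct page *page,
						       unsigned long vaddr)
	{
		/* no arch support: fall back to the ordinary cached clear */
		clear_user_highpage(page, vaddr);
	}
	#endif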