@@ -73,30 +73,79 @@ ENDPROC(clear_page)
.Lclear_page_end-clear_page,3b-2b
.previous
+#define SSE_UNROLL 128
+
/*
* Zero a page avoiding the caches
* rdi page
*/
ENTRY(clear_page_nocache)
CFI_STARTPROC
- xorl %eax,%eax
- movl $4096/64,%ecx
+ pushq_cfi %rdi
+ call kernel_fpu_begin
+ popq_cfi %rdi
+ sub $16,%rsp
+ CFI_ADJUST_CFA_OFFSET 16
+ movdqu %xmm0,(%rsp)
+ xorpd %xmm0,%xmm0
+ movl $4096/SSE_UNROLL,%ecx
.p2align 4
.Lloop_nocache:
decl %ecx
-#define PUT(x) movnti %rax,x*8(%rdi)
- movnti %rax,(%rdi)
- PUT(1)
- PUT(2)
- PUT(3)
- PUT(4)
- PUT(5)
- PUT(6)
- PUT(7)
-#undef PUT
- leaq 64(%rdi),%rdi
+ .set x,0
+ .rept SSE_UNROLL/16
+ movntdq %xmm0,x(%rdi)
+ .set x,x+16
+ .endr
+ leaq SSE_UNROLL(%rdi),%rdi
jnz .Lloop_nocache
- nop
- ret
+ movdqu (%rsp),%xmm0
+ addq $16,%rsp
+ CFI_ADJUST_CFA_OFFSET -16
+ jmp kernel_fpu_end
CFI_ENDPROC
ENDPROC(clear_page_nocache)
+
+#ifdef CONFIG_AS_AVX
+
+ .section .altinstr_replacement,"ax"
+1: .byte 0xeb /* jmp <disp8> */
+ .byte (clear_page_nocache_avx - clear_page_nocache) - (2f - 1b)
+ /* offset */
+2:
+ .previous
+ .section .altinstructions,"a"
+ altinstruction_entry clear_page_nocache,1b,X86_FEATURE_AVX,\
+ 16, 2b-1b
+ .previous
+
+#define AVX_UNROLL 256 /* TUNE ME */
+
+ENTRY(clear_page_nocache_avx)
+ CFI_STARTPROC
+ pushq_cfi %rdi
+ call kernel_fpu_begin
+ popq_cfi %rdi
+ sub $32,%rsp
+ CFI_ADJUST_CFA_OFFSET 32
+ vmovdqu %ymm0,(%rsp)
+ vxorpd %ymm0,%ymm0,%ymm0
+ movl $4096/AVX_UNROLL,%ecx
+ .p2align 4
+.Lloop_avx:
+ decl %ecx
+ .set x,0
+ .rept AVX_UNROLL/32
+ vmovntdq %ymm0,x(%rdi)
+ .set x,x+32
+ .endr
+ leaq AVX_UNROLL(%rdi),%rdi
+ jnz .Lloop_avx
+ vmovdqu (%rsp),%ymm0
+ addq $32,%rsp
+ CFI_ADJUST_CFA_OFFSET -32
+ jmp kernel_fpu_end
+ CFI_ENDPROC
+ENDPROC(clear_page_nocache_avx)
+
+#endif