
[v3,09/21] x86/asm: add clear_pages_movnt()

Message ID 20220606203725.1313715-5-ankur.a.arora@oracle.com (mailing list archive)
State: New
Series: huge page clearing optimizations

Commit Message

Ankur Arora June 6, 2022, 8:37 p.m. UTC
Add clear_pages_movnt(), which uses MOVNTI as the underlying primitive.
With this, page-clearing can bypass the cache hierarchy, providing a
non-cache-polluting implementation of clear_pages().

MOVNTI, from the Intel SDM, Volume 2B, 4-101:
 "The non-temporal hint is implemented by using a write combining (WC)
  memory type protocol when writing the data to memory. Using this
  protocol, the processor does not write the data into the cache
  hierarchy, nor does it fetch the corresponding cache line from memory
  into the cache hierarchy."

The AMD Architecture Programmer's Manual makes a similar statement.

One use-case is zeroing large extents without bringing in cachelines
that will never be accessed. In addition, clear_pages_movnt() based
clearing is often faster once extent sizes are O(LLC-size).

As the excerpt notes, MOVNTI is weakly ordered with respect to other
instructions operating on the memory hierarchy. The caller needs to
handle this by executing an SFENCE when done.
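
A minimal sketch of the resulting caller-side pattern (illustrative
only, not part of this patch; wmb() expands to SFENCE on x86):

  /* Hedged sketch: the wrapper name clear_pages_nt() is hypothetical. */
  static inline void clear_pages_nt(void *page, unsigned long npages)
  {
          clear_pages_movnt(page, npages);
          /*
           * The MOVNTI stores above are weakly ordered; fence before
           * anyone can rely on the pages being zeroed.
           */
          wmb();          /* SFENCE on x86 */
  }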

The implementation is straightforward: unroll the inner loop to keep
the code similar to memset_movnti(), so that we can gauge
clear_pages_movnt() performance via 'perf bench mem memset'.
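
The numbers below correspond to invocations of roughly this form (the
x86-64-movnt memset variant is added to perf by an earlier patch in
this series; the size flag varies per row):

  perf bench mem memset -f x86-64-stosb -s 128MB -l 1
  perf bench mem memset -f x86-64-movnt -s 128MB -l 1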

 # Intel Icelakex
 # Performance comparison of 'perf bench mem memset -l 1' for x86-64-stosb
 # (X86_FEATURE_ERMS) and x86-64-movnt:

 System:      Oracle X9-2 (2 nodes * 32 cores * 2 threads)
 Processor:   Intel Xeon(R) Platinum 8358 CPU @ 2.60GHz (Icelakex, 6:106:6)
 Memory:      512 GB evenly split between nodes
 LLC-size:    48MB for each node (32-cores * 2-threads)
 no_turbo: 1, Microcode: 0xd0001e0, scaling-governor: performance

              x86-64-stosb (5 runs)     x86-64-movnt (5 runs)    Delta(%)
              ----------------------    ---------------------    --------
     size            BW   (   stdev)          BW    (   stdev)

      2MB      14.37 GB/s ( +- 1.55)     12.59 GB/s ( +- 1.20)   -12.38%
     16MB      16.93 GB/s ( +- 2.61)     15.91 GB/s ( +- 2.74)    -6.02%
    128MB      12.12 GB/s ( +- 1.06)     22.33 GB/s ( +- 1.84)   +84.24%
   1024MB      12.12 GB/s ( +- 0.02)     23.92 GB/s ( +- 0.14)   +97.35%
   4096MB      12.08 GB/s ( +- 0.02)     23.98 GB/s ( +- 0.18)   +98.50%

Signed-off-by: Ankur Arora <ankur.a.arora@oracle.com>
---
 arch/x86/include/asm/page_64.h |  1 +
 arch/x86/lib/clear_page_64.S   | 21 +++++++++++++++++++++
 2 files changed, 22 insertions(+)

Comments

Noah Goldstein June 10, 2022, 10:11 p.m. UTC | #1
On Mon, Jun 6, 2022 at 11:39 PM Ankur Arora <ankur.a.arora@oracle.com> wrote:
>
> [snip commit message]
>
>  # Intel Icelakex
>  # Performance comparison of 'perf bench mem memset -l 1' for x86-64-stosb
>  # (X86_FEATURE_ERMS) and x86-64-movnt:
>
>  System:      Oracle X9-2 (2 nodes * 32 cores * 2 threads)
>  Processor:   Intel Xeon(R) Platinum 8358 CPU @ 2.60GHz (Icelakex, 6:106:6)
>  Memory:      512 GB evenly split between nodes
>  LLC-size:    48MB for each node (32-cores * 2-threads)
>  no_turbo: 1, Microcode: 0xd0001e0, scaling-governor: performance
>
>               x86-64-stosb (5 runs)     x86-64-movnt (5 runs)    Delta(%)
>               ----------------------    ---------------------    --------
>      size            BW   (   stdev)          BW    (   stdev)
>
>       2MB      14.37 GB/s ( +- 1.55)     12.59 GB/s ( +- 1.20)   -12.38%
>      16MB      16.93 GB/s ( +- 2.61)     15.91 GB/s ( +- 2.74)    -6.02%
>     128MB      12.12 GB/s ( +- 1.06)     22.33 GB/s ( +- 1.84)   +84.24%
>    1024MB      12.12 GB/s ( +- 0.02)     23.92 GB/s ( +- 0.14)   +97.35%
>    4096MB      12.08 GB/s ( +- 0.02)     23.98 GB/s ( +- 0.18)   +98.50%

For these sizes it may be worth it to save/restore an xmm register to do
the memset:

Just on my Tigerlake laptop:
model name : 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz

                movntdq xmm (5 runs)        movnti GPR (5 runs)       Delta(%)
                -----------------------     -----------------------   --------
       size        BW GB/s ( +-  stdev)        BW GB/s ( +-  stdev)          %

       2 MB     35.71 GB/s ( +-   1.02)     34.62 GB/s ( +-   0.77)     -3.15%
      16 MB     36.43 GB/s ( +-   0.35)     31.30 GB/s ( +-   0.10)    -16.39%
     128 MB     35.60 GB/s ( +-   0.83)     30.82 GB/s ( +-   0.08)    -15.50%
    1024 MB     36.85 GB/s ( +-   0.26)     30.71 GB/s ( +-   0.20)    -20.00%
>
> [snip patch]
Noah Goldstein June 10, 2022, 10:15 p.m. UTC | #2
On Fri, Jun 10, 2022 at 3:11 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Mon, Jun 6, 2022 at 11:39 PM Ankur Arora <ankur.a.arora@oracle.com> wrote:
> >
> > [snip commit message and benchmark numbers]
>
> For these sizes it may be worth it to save/restore an xmm register to do
> the memset:
>
> Just on my Tigerlake laptop:
> model name : 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
>
>                 movntdq xmm (5 runs)        movnti GPR (5 runs)       Delta(%)
>                 -----------------------     -----------------------   --------
>        size        BW GB/s ( +-  stdev)        BW GB/s ( +-  stdev)          %
>
>        2 MB     35.71 GB/s ( +-   1.02)     34.62 GB/s ( +-   0.77)     -3.15%
>       16 MB     36.43 GB/s ( +-   0.35)     31.30 GB/s ( +-   0.10)    -16.39%
>      128 MB     35.60 GB/s ( +-   0.83)     30.82 GB/s ( +-   0.08)    -15.50%
>     1024 MB     36.85 GB/s ( +-   0.26)     30.71 GB/s ( +-   0.20)    -20.00%


Also (again just from the Tigerlake laptop) I found that the trend
favors `rep stosb` more (as opposed to non-cacheable writes) when
there are multiple threads competing for BW:

https://docs.google.com/spreadsheets/d/1f6N9EVqHg71cDIR-RALLR76F_ovW5gzwIWr26yLCmS0/edit?usp=sharing
> > [snip patch]
Ankur Arora June 12, 2022, 11:18 a.m. UTC | #3
Noah Goldstein <goldstein.w.n@gmail.com> writes:

> On Fri, Jun 10, 2022 at 3:11 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>>
>> On Mon, Jun 6, 2022 at 11:39 PM Ankur Arora <ankur.a.arora@oracle.com> wrote:
>> > [snip commit message and benchmark numbers]
>>
>> For these sizes it may be worth it to save/restore an xmm register to do
>> the memset:
>>
>> Just on my Tigerlake laptop:
>> model name : 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
>>
>>                 movntdq xmm (5 runs)        movnti GPR (5 runs)       Delta(%)
>>                 -----------------------     -----------------------   --------
>>        size        BW GB/s ( +-  stdev)        BW GB/s ( +-  stdev)          %
>>
>>        2 MB     35.71 GB/s ( +-   1.02)     34.62 GB/s ( +-   0.77)     -3.15%
>>       16 MB     36.43 GB/s ( +-   0.35)     31.30 GB/s ( +-   0.10)    -16.39%
>>      128 MB     35.60 GB/s ( +-   0.83)     30.82 GB/s ( +-   0.08)    -15.50%
>>     1024 MB     36.85 GB/s ( +-   0.26)     30.71 GB/s ( +-   0.20)    -20.00%

Thanks, this looks interesting. Any thoughts on what causes the drop-off
for the movnti loop as the region size increases?

I can see the usual two problems with using the XMM registers:

 - the kernel_fpu_begin()/_end() overhead
 - kernel_fpu regions need preemption disabled, which limits the
   extent that can be cleared in a single operation

And given how close movntdq and movnti are for size=2MB, I'm not
sure movntdq would even come out ahead if we include the XMM
save/restore overhead?
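
For illustration, the shape such a variant would need (a hedged sketch:
clear_pages_movntdq() is a hypothetical MOVNTDQ-based primitive, and
CHUNK_PAGES is an arbitrary bound on the non-preemptible window):

  #include <asm/fpu/api.h>        /* kernel_fpu_begin()/kernel_fpu_end() */
  #include <linux/minmax.h>       /* min() */

  #define CHUNK_PAGES     16      /* illustrative chunk size */

  static void clear_pages_xmm(void *page, unsigned long npages)
  {
          while (npages) {
                  unsigned long chunk = min(npages, (unsigned long)CHUNK_PAGES);

                  kernel_fpu_begin();     /* saves XMM state; disables preemption */
                  clear_pages_movntdq(page, chunk);       /* hypothetical primitive */
                  kernel_fpu_end();       /* restores state; preemption point */

                  page += chunk << PAGE_SHIFT;
                  npages -= chunk;
          }
  }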

> Also (again just from the Tigerlake laptop) I found that the trend
> favors `rep stosb` more (as opposed to non-cacheable writes) when
> there are multiple threads competing for BW:

I notice in your spreadsheet that you ran the tests only up to ~32MB.
How does the performance on Tigerlake change as you go up to, say,
512MB? Also, it's a little unexpected that the cacheable SIMD variant
pretty much always performs worst.

In general, I wouldn't expect NT writes to perform better below
O(LLC-size). That's why this series avoids using NT writes for sizes
smaller than that (see patch 19).

The argument is: the larger the region being cleared, the less the
caller cares about having its contents in the cache, and thus we can
avoid using the cache. The other part, of course, is that NT writes
don't perform as well at small sizes, so using them there would
regress performance for some users.
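
Expressed as a sketch (llc_size() is a placeholder helper and the
cutoff is illustrative; the actual policy lives in patch 19):

  static void clear_pages_policy(void *page, unsigned long npages)
  {
          if ((npages << PAGE_SHIFT) >= llc_size())
                  clear_pages_movnt(page, npages);  /* large: bypass cache */
          else
                  clear_pages_erms(page, npages);   /* small: cacheable path */
  }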


Ankur



--
ankur

Patch

diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index a88a3508888a..3affc4ecb8da 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -55,6 +55,7 @@  extern unsigned long __phys_addr_symbol(unsigned long);
 void clear_pages_orig(void *page, unsigned long npages);
 void clear_pages_rep(void *page, unsigned long npages);
 void clear_pages_erms(void *page, unsigned long npages);
+void clear_pages_movnt(void *page, unsigned long npages);
 
 #define __HAVE_ARCH_CLEAR_USER_PAGES
 static inline void clear_pages(void *page, unsigned int npages)
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index 2cc3b681734a..83d14f1c9f57 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -58,3 +58,24 @@  SYM_FUNC_START(clear_pages_erms)
 	RET
 SYM_FUNC_END(clear_pages_erms)
 EXPORT_SYMBOL_GPL(clear_pages_erms)
+
+SYM_FUNC_START(clear_pages_movnt)
+	xorl	%eax,%eax
+	movq	%rsi,%rcx
+	shlq    $PAGE_SHIFT, %rcx
+
+	.p2align 4
+.Lstart:
+	movnti  %rax, 0x00(%rdi)
+	movnti  %rax, 0x08(%rdi)
+	movnti  %rax, 0x10(%rdi)
+	movnti  %rax, 0x18(%rdi)
+	movnti  %rax, 0x20(%rdi)
+	movnti  %rax, 0x28(%rdi)
+	movnti  %rax, 0x30(%rdi)
+	movnti  %rax, 0x38(%rdi)
+	addq    $0x40, %rdi
+	subl    $0x40, %ecx
+	ja      .Lstart
+	RET
+SYM_FUNC_END(clear_pages_movnt)
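
For readers who prefer C, the loop above corresponds roughly to the
following sketch (illustrative only, not part of the patch; the
movnti() helper is hand-rolled here):

  /* 8-byte non-temporal store: bypasses the cache hierarchy. */
  static inline void movnti(unsigned long *dst, unsigned long val)
  {
          asm volatile("movnti %1, %0" : "=m" (*dst) : "r" (val));
  }

  void clear_pages_movnt_c(void *page, unsigned long npages)
  {
          unsigned long *p = page;
          unsigned long bytes = npages << PAGE_SHIFT;

          /* Eight 8-byte stores per iteration: one 64-byte cacheline. */
          while (bytes) {
                  for (int i = 0; i < 8; i++)
                          movnti(p + i, 0);
                  p += 8;
                  bytes -= 64;
          }
  }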