
[1/3] x86: avoid flush IPI when possible

Message ID 56BB419B02000078000D08A9@prv-mh.provo.novell.com (mailing list archive)
State New, archived

Commit Message

Jan Beulich Feb. 10, 2016, 12:56 p.m. UTC
Since CLFLUSH, other than WBINVD, is a coherency domain wide flush,
there's no need to IPI other CPUs if this is the only flushing being
requested. (As a secondary change, move a local variable into the scope
where it's actually needed.)

Signed-off-by: Jan Beulich <jbeulich@suse.com>
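
In interface terms, flush_area_local() now returns the subset of
FLUSH_CACHE that it satisfied coherency-domain-wide (via CLFLUSH), so
the caller can drop that work before deciding whether remote CPUs
still need an IPI. A minimal caller-side sketch, following the
flush_area_mask() change below (the IPI step is abbreviated):

    unsigned int remaining = flags;

    /* Perform the local flush; the return value is the portion of
     * FLUSH_CACHE that CLFLUSH already made visible domain-wide. */
    remaining &= ~flush_area_local(va, flags);

    /* IPI other CPUs only if work beyond the order bits remains,
     * i.e. TLB flushing or a WBINVD-style cache flush. */
    if ( remaining & ~FLUSH_ORDER_MASK )
        send_IPI_mask(&flush_cpumask, INVALIDATE_TLB_VECTOR);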

--- a/xen/arch/x86/flushtlb.c
+++ b/xen/arch/x86/flushtlb.c
@@ -91,9 +91,8 @@ void write_cr3(unsigned long cr3)
     local_irq_restore(flags);
 }
 
-void flush_area_local(const void *va, unsigned int flags)
+unsigned int flush_area_local(const void *va, unsigned int flags)
 {
-    const struct cpuinfo_x86 *c = &current_cpu_data;
     unsigned int order = (flags - 1) & FLUSH_ORDER_MASK;
     unsigned long irqfl;
 
@@ -130,6 +129,7 @@ void flush_area_local(const void *va, un
 
     if ( flags & FLUSH_CACHE )
     {
+        const struct cpuinfo_x86 *c = &current_cpu_data;
         unsigned long i, sz = 0;
 
         if ( order < (BITS_PER_LONG - PAGE_SHIFT) )
@@ -147,8 +147,11 @@ void flush_area_local(const void *va, un
         else
         {
             wbinvd();
+            flags &= ~FLUSH_CACHE;
         }
     }
 
     local_irq_restore(irqfl);
+
+    return flags & FLUSH_CACHE;
 }
--- a/xen/arch/x86/smp.c
+++ b/xen/arch/x86/smp.c
@@ -205,26 +205,30 @@ static unsigned int flush_flags;
 
 void invalidate_interrupt(struct cpu_user_regs *regs)
 {
+    unsigned int flags = flush_flags;
     ack_APIC_irq();
     perfc_incr(ipis);
-    if ( !__sync_local_execstate() ||
-         (flush_flags & (FLUSH_TLB_GLOBAL | FLUSH_CACHE)) )
-        flush_area_local(flush_va, flush_flags);
+    if ( __sync_local_execstate() )
+        flags &= ~FLUSH_TLB;
+    flush_area_local(flush_va, flags);
     cpumask_clear_cpu(smp_processor_id(), &flush_cpumask);
 }
 
 void flush_area_mask(const cpumask_t *mask, const void *va, unsigned int flags)
 {
+    unsigned int cpu = smp_processor_id();
+
     ASSERT(local_irq_is_enabled());
 
-    if ( cpumask_test_cpu(smp_processor_id(), mask) )
-        flush_area_local(va, flags);
+    if ( cpumask_test_cpu(cpu, mask) )
+        flags &= ~flush_area_local(va, flags);
 
-    if ( !cpumask_subset(mask, cpumask_of(smp_processor_id())) )
+    if ( (flags & ~FLUSH_ORDER_MASK) &&
+         !cpumask_subset(mask, cpumask_of(cpu)) )
     {
         spin_lock(&flush_lock);
         cpumask_and(&flush_cpumask, mask, &cpu_online_map);
-        cpumask_clear_cpu(smp_processor_id(), &flush_cpumask);
+        cpumask_clear_cpu(cpu, &flush_cpumask);
         flush_va      = va;
         flush_flags   = flags;
         send_IPI_mask(&flush_cpumask, INVALIDATE_TLB_VECTOR);
--- a/xen/include/asm-x86/flushtlb.h
+++ b/xen/include/asm-x86/flushtlb.h
@@ -87,7 +87,7 @@ void write_cr3(unsigned long cr3);
 #define FLUSH_CACHE      0x400
 
 /* Flush local TLBs/caches. */
-void flush_area_local(const void *va, unsigned int flags);
+unsigned int flush_area_local(const void *va, unsigned int flags);
 #define flush_local(flags) flush_area_local(NULL, flags)
 
 /* Flush specified CPUs' TLBs/caches */
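
For reference, the first hunk above elides the body of the FLUSH_CACHE
handling. Reconstructed from the hunk's context (a sketch, not part of
the patch; c, sz, i and the cpuinfo_x86 fields are as in flushtlb.c),
the choice between the CLFLUSH loop and WBINVD looks roughly like:

    if ( order < (BITS_PER_LONG - PAGE_SHIFT) )
        sz = 1UL << (order + PAGE_SHIFT);

    /* CLFLUSH the region if it is bounded and smaller than the cache;
     * otherwise fall back to a full WBINVD. */
    if ( c->x86_clflush_size && c->x86_cache_size && sz &&
         ((sz >> 10) < c->x86_cache_size) )
    {
        va = (const void *)((unsigned long)va & ~(sz - 1));
        for ( i = 0; i < sz; i += c->x86_clflush_size )
            asm volatile ( "clflush %0"
                           : : "m" (((const char *)va)[i]) );
    }
    else
    {
        wbinvd();
        flags &= ~FLUSH_CACHE; /* this patch: WBINVD is CPU-local */
    }

The smp.c change is analogous on the TLB side: invalidate_interrupt()
drops FLUSH_TLB when __sync_local_execstate() has switched page
tables, since the CR3 reload already flushed the non-global TLB
entries.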

Comments

Andrew Cooper Feb. 10, 2016, 3 p.m. UTC | #1
On 10/02/16 12:56, Jan Beulich wrote:
> Since CLFLUSH, other than WBINVD, is a coherency domain wide flush,

I can't parse this sentence.

CLFLUSH states "Invalidates from every level of the cache hierarchy in
the cache coherence domain"

WBINVD however states "The instruction then issues a special-function
bus cycle that directs external caches to also write back modified data
and another bus cycle to indicate that the external caches should be
invalidated."

I think we need input from Intel and AMD here as to the behaviour and
terminology here, and in particular, where the coherency domain
boundaries are.  All CPUs, even across multiple sockets, see coherent
caching, but it is unclear whether this qualifies them to be in the same
cache coherency domain per the instruction spec.

In particular, given the architecture of 8-socket systems and 45MB of
RAM in L3 caches, does wbinvd seriously drain all caches everywhere? 
Causing 45MB of data to move to remote memory controllers all at once
would cause a massive system stall.

~Andrew
Jan Beulich Feb. 10, 2016, 3:37 p.m. UTC | #2
>>> On 10.02.16 at 16:00, <andrew.cooper3@citrix.com> wrote:
> On 10/02/16 12:56, Jan Beulich wrote:
>> Since CLFLUSH, other than WBINVD, is a coherency domain wide flush,
> 
> I can't parse this sentence.

Should have been "..., is a cache coherency domain wide flush, ..." -
does it read any better then?

> CLFLUSH states "Invalidates from every level of the cache hierarchy in
> the cache coherence domain"
> 
> WBINVD however states "The instruction then issues a special-function
> bus cycle that directs external caches to also write back modified data
> and another bus cycle to indicate that the external caches should be
> invalidated."
> 
> I think we need input from Intel and AMD here as to the behaviour and
> terminology here, and in particular, where the coherency domain
> boundaries are.  All CPUs, even across multiple sockets, see coherent
> caching, but it is unclear whether this qualifies them to be in the same
> cache coherency domain per the instruction spec.

With Linux already doing what this patch switches us to, I'm not sure
we need much extra input.

> In particular, given the architecture of 8-socket systems and 45MB of
> RAM in L3 caches, does wbinvd seriously drain all caches everywhere? 

Not everywhere, just on the local socket (assuming there's no external
cache).

> Causing 45MB of data to move to remote memory controllers all at once
> would cause a massive system stall.

That's why it takes (as we know) so long. See the figure in SDM Vol 3
section "Invalidating Caches and TLBs".

Jan
Andrew Cooper Feb. 10, 2016, 5:51 p.m. UTC | #3
On 10/02/16 15:37, Jan Beulich wrote:
>>>> On 10.02.16 at 16:00, <andrew.cooper3@citrix.com> wrote:
>> On 10/02/16 12:56, Jan Beulich wrote:
>>> Since CLFLUSH, other than WBINVD, is a coherency domain wide flush,
>> I can't parse this sentence.
> Should have been "..., is a cache coherency domain wide flush, ..." -
> does it read any better then?

I believe, given the code in the patch, your intent is "if we WBINVD, we
don't need to IPI other cores for cache flushing reasons".

However, given your comment below...

>
>> CLFLUSH states "Invalidates from every level of the cache hierarchy in
>> the cache coherence domain"
>>
>> WBINVD however states "The instruction then issues a special-function
>> bus cycle that directs external caches to also write back modified data
>> and another bus cycle to indicate that the external caches should be
>> invalidated."
>>
>> I think we need input from Intel and AMD here as to the behaviour and
>> terminology here, and in particular, where the coherency domain
>> boundaries are.  All CPUs, even across multiple sockets, see coherent
>> caching, but it is unclear whether this qualifies them to be in the same
>> cache coherency domain per the instruction spec.
> With Linux already doing what this patch switches us to, I'm not sure
> we need much extra input.
>
>> In particular, given the architecture of 8-socket systems and 45MB of
>> RAM in L3 caches, does wbinvd seriously drain all caches everywhere? 
> Not everywhere, just on the local socket (assuming there's no external
> cache).

If this is true, then it is clearly not safe to omit the IPIs.

>
>> Causing 45MB of data to move to remote memory controllers all at once
>> would cause a massive system stall.
> That's why it takes (as we know) so long. See the figure in SDM Vol 3
> section "Invalidating Caches and TLBs".

I presume you mean Figure 2-10. WBINVD Invalidation of Shared and
Non-Shared Cache Hierarchy?

This quite clearly shows that WBINVD will not invalidate or write back
the L1 caches for other cores in the same processor.

Have I misunderstood the logic for choosing when to omit the IPIs?

~Andrew
Jan Beulich Feb. 11, 2016, 10:48 a.m. UTC | #4
>>> On 10.02.16 at 18:51, <andrew.cooper3@citrix.com> wrote:
> On 10/02/16 15:37, Jan Beulich wrote:
>>>>> On 10.02.16 at 16:00, <andrew.cooper3@citrix.com> wrote:
>>> On 10/02/16 12:56, Jan Beulich wrote:
>>>> Since CLFLUSH, other than WBINVD, is a coherency domain wide flush,
>>> I can't parse this sentence.
>> Should have been "..., is a cache coherency domain wide flush, ..." -
>> does it read any better then?
> 
> I believe, given the code in the patch, your intent is "if we WBINVD, we
> don't need to IPI other cores for cache flushing reasons".

I don't see how this can be read from the sentence. The primary
statement it makes is that "CLFLUSH is a cache coherency domain wide
flush". A secondary statement is that this is different from WBINVD.

> However, given your comment below...
> 
>>
>>> CLFLUSH states "Invalidates from every level of the cache hierarchy in
>>> the cache coherence domain"
>>>
>>> WBINVD however states "The instruction then issues a special-function
>>> bus cycle that directs external caches to also write back modified data
>>> and another bus cycle to indicate that the external caches should be
>>> invalidated."
>>>
>>> I think we need input from Intel and AMD here as to the behaviour and
>>> terminology here, and in particular, where the coherency domain
>>> boundaries are.  All CPUs, even across multiple sockets, see coherent
>>> caching, but it is unclear whether this qualifies them to be in the same
>>> cache coherency domain per the instruction spec.
>> With Linux already doing what this patch switches us to, I'm not sure
>> we need much extra input.
>>
>>> In particular, given the architecture of 8-socket systems and 45MB of
>>> RAM in L3 caches, does wbinvd seriously drain all caches everywhere? 
>> Not everywhere, just on the local socket (assuming there's no external
>> cache).
> 
> If this is true, then it is clearly not safe to omit the IPIs.

When using CLFLUSH it is safe, while when using WBINVD it's not.

>>> Causing 45MB of data to move to remote memory controllers all at once
>>> would cause a massive system stall.
>> That's why it takes (as we know) so long. See the figure in SDM Vol 3
>> section "Invalidating Caches and TLBs".
> 
> I presume you mean Figure 2-10. WBINVD Invalidation of Shared and
> Non-Shared Cache Hierarchy?
> 
> This quite clearly shows that WBINVD will not invalidate or write back
> the L1 caches for other cores in the same processor.
> 
> Have I misunderstood the logic for choosing when to omit the IPIs?

I'm afraid you did, or else I must have introduced a (latent,
because I didn't notice any issues so far) bug.

Jan
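
Tying the exchange together: the IPI can only be omitted on the
CLFLUSH path. A hypothetical helper (illustrative only, not in the
patch) states the distinction drawn above:

    /* FLUSH_CACHE survives in the caller's flags exactly when
     * flush_area_local() fell back to WBINVD, whose write-back and
     * invalidate cover only the local cache hierarchy (plus external
     * caches), so remote CPUs must still be IPIed.  A CLFLUSH loop
     * acts cache-coherency-domain-wide, so flush_area_local() returns
     * the bit and the caller clears it for the remote CPUs. */
    static bool need_cache_ipi(unsigned int remaining_flags)
    {
        return remaining_flags & FLUSH_CACHE;
    }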