diff mbox series

[12/18] arch/tlb: Clean up simple architectures

Message ID 20180926114801.146189550@infradead.org (mailing list archive)
State New, archived
Headers show
Series my generic mmu_gather patches | expand

Commit Message

Peter Zijlstra Sept. 26, 2018, 11:36 a.m. UTC
There are generally two cases:

 1) either the platform has an efficient flush_tlb_range() and
    asm-generic/tlb.h doesn't need any overrides at all.

 2) or an architecture lacks an efficient flush_tlb_range() and
    we override tlb_end_vma() and tlb_flush().

Convert all 'simple' architectures to one of these two forms.

alpha:	    has no range invalidate -> 2
arc:	    already used flush_tlb_range() -> 1
c6x:	    has no range invalidate -> 2
h8300:	    has no mmu
hexagon:    has an efficient flush_tlb_range() -> 1
            (flush_tlb_mm() is in fact a full range invalidate,
	     so no need to shoot down everything)
m68k:	    has inefficient flush_tlb_range() -> 2
microblaze: has no flush_tlb_range() -> 2
mips:	    has efficient flush_tlb_range() -> 1
	    (even though it currently seems to use flush_tlb_mm())
nds32:	    already uses flush_tlb_range() -> 1
nios2:	    has inefficient flush_tlb_range() -> 2
	    (no limit on range iteration)
openrisc:   has inefficient flush_tlb_range() -> 2
	    (no limit on range iteration)
parisc:	    already uses flush_tlb_range() -> 1
sparc32:    already uses flush_tlb_range() -> 1
unicore32:  has inefficient flush_tlb_range() -> 2
	    (no limit on range iteration)
xtensa:	    has efficient flush_tlb_range() -> 1

Cc: Richard Henderson <rth@twiddle.net>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Mark Salter <msalter@redhat.com>
Cc: Richard Kuo <rkuo@codeaurora.org>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Paul Burton <paul.burton@mips.com>
Cc: Greentime Hu <green.hu@gmail.com>
Cc: Ley Foon Tan <lftan@altera.com>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Helge Deller <deller@gmx.de>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Guan Xuetao <gxt@pku.edu.cn>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Nick Piggin <npiggin@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/alpha/include/asm/tlb.h      |    2 --
 arch/arc/include/asm/tlb.h        |   23 -----------------------
 arch/c6x/include/asm/tlb.h        |    1 +
 arch/h8300/include/asm/tlb.h      |    2 --
 arch/hexagon/include/asm/tlb.h    |   12 ------------
 arch/m68k/include/asm/tlb.h       |    1 -
 arch/microblaze/include/asm/tlb.h |    4 +---
 arch/mips/include/asm/tlb.h       |    8 --------
 arch/nds32/include/asm/tlb.h      |   10 ----------
 arch/nios2/include/asm/tlb.h      |    8 +++++---
 arch/openrisc/include/asm/tlb.h   |    6 ++++--
 arch/parisc/include/asm/tlb.h     |   13 -------------
 arch/powerpc/include/asm/tlb.h    |    1 -
 arch/sparc/include/asm/tlb_32.h   |   13 -------------
 arch/unicore32/include/asm/tlb.h  |   10 ++++++----
 arch/xtensa/include/asm/tlb.h     |   17 -----------------
 16 files changed, 17 insertions(+), 114 deletions(-)

Comments

Vineet Gupta Oct. 3, 2018, 5:03 p.m. UTC | #1
On 09/26/2018 04:56 AM, Peter Zijlstra wrote:
> There are generally two cases:
>
>  1) either the platform has an efficient flush_tlb_range() and
>     asm-generic/tlb.h doesn't need any overrides at all.
>
>  2) or an architecture lacks an efficient flush_tlb_range() and
>     we override tlb_end_vma() and tlb_flush().
>
> Convert all 'simple' architectures to one of these two forms.
>
> alpha:	    has no range invalidate -> 2
> arc:	    already used flush_tlb_range() -> 1
> c6x:	    has no range invalidate -> 2
> h8300:	    has no mmu
> hexagon:    has an efficient flush_tlb_range() -> 1
>             (flush_tlb_mm() is in fact a full range invalidate,
> 	     so no need to shoot down everything)
> m68k:	    has inefficient flush_tlb_range() -> 2
> microblaze: has no flush_tlb_range() -> 2
> mips:	    has efficient flush_tlb_range() -> 1
> 	    (even though it currently seems to use flush_tlb_mm())
> nds32:	    already uses flush_tlb_range() -> 1
> nios2:	    has inefficient flush_tlb_range() -> 2
> 	    (no limit on range iteration)
> openrisc:   has inefficient flush_tlb_range() -> 2
> 	    (no limit on range iteration)
> parisc:	    already uses flush_tlb_range() -> 1
> sparc32:    already uses flush_tlb_range() -> 1
> unicore32:  has inefficient flush_tlb_range() -> 2
> 	    (no limit on range iteration)
> xtensa:	    has efficient flush_tlb_range() -> 1
>
> Cc: Richard Henderson <rth@twiddle.net>
> Cc: Vineet Gupta <vgupta@synopsys.com>
> Cc: Mark Salter <msalter@redhat.com>
> Cc: Richard Kuo <rkuo@codeaurora.org>
> Cc: Michal Simek <monstr@monstr.eu>
> Cc: Paul Burton <paul.burton@mips.com>
> Cc: Greentime Hu <green.hu@gmail.com>
> Cc: Ley Foon Tan <lftan@altera.com>
> Cc: Jonas Bonn <jonas@southpole.se>
> Cc: Helge Deller <deller@gmx.de>
> Cc: "David S. Miller" <davem@davemloft.net>
> Cc: Guan Xuetao <gxt@pku.edu.cn>
> Cc: Max Filippov <jcmvbkbc@gmail.com>
> Cc: Will Deacon <will.deacon@arm.com>
> Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Nick Piggin <npiggin@gmail.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  arch/alpha/include/asm/tlb.h      |    2 --
>  arch/arc/include/asm/tlb.h        |   23 -----------------------
>  arch/c6x/include/asm/tlb.h        |    1 +
>  arch/h8300/include/asm/tlb.h      |    2 --
>  arch/hexagon/include/asm/tlb.h    |   12 ------------
>  arch/m68k/include/asm/tlb.h       |    1 -
>  arch/microblaze/include/asm/tlb.h |    4 +---
>  arch/mips/include/asm/tlb.h       |    8 --------
>  arch/nds32/include/asm/tlb.h      |   10 ----------
>  arch/nios2/include/asm/tlb.h      |    8 +++++---
>  arch/openrisc/include/asm/tlb.h   |    6 ++++--
>  arch/parisc/include/asm/tlb.h     |   13 -------------
>  arch/powerpc/include/asm/tlb.h    |    1 -
>  arch/sparc/include/asm/tlb_32.h   |   13 -------------
>  arch/unicore32/include/asm/tlb.h  |   10 ++++++----
>  arch/xtensa/include/asm/tlb.h     |   17 -----------------
>  16 files changed, 17 insertions(+), 114 deletions(-)
>
> --- a/arch/alpha/include/asm/tlb.h
> +++ b/arch/alpha/include/asm/tlb.h
> @@ -4,8 +4,6 @@
>  
>  #define tlb_start_vma(tlb, vma)			do { } while (0)
>  #define tlb_end_vma(tlb, vma)			do { } while (0)
> -#define __tlb_remove_tlb_entry(tlb, pte, addr)	do { } while (0)
> -
>  #define tlb_flush(tlb)				flush_tlb_mm((tlb)->mm)
>  
>  #include <asm-generic/tlb.h>
> --- a/arch/arc/include/asm/tlb.h
> +++ b/arch/arc/include/asm/tlb.h
> @@ -9,29 +9,6 @@
>  #ifndef _ASM_ARC_TLB_H
>  #define _ASM_ARC_TLB_H
>  
> -#define tlb_flush(tlb)				\
> -do {						\
> -	if (tlb->fullmm)			\
> -		flush_tlb_mm((tlb)->mm);	\
> -} while (0)
> -
> -/*
> - * This pair is called at time of munmap/exit to flush cache and TLB entries
> - * for mappings being torn down.
> - * 1) cache-flush part -implemented via tlb_start_vma( ) for VIPT aliasing D$
> - * 2) tlb-flush part - implemted via tlb_end_vma( ) flushes the TLB range
> - *
> - * Note, read http://lkml.org/lkml/2004/1/15/6
> - */
> -
> -#define tlb_end_vma(tlb, vma)						\
> -do {									\
> -	if (!tlb->fullmm)						\
> -		flush_tlb_range(vma, vma->vm_start, vma->vm_end);	\
> -} while (0)
> -
> -#define __tlb_remove_tlb_entry(tlb, ptep, address)
> -
>  #include <linux/pagemap.h>
>  #include <asm-generic/tlb.h>

LGTM per discussion in an earlier thread. However given that for "simpler" arches
the whole series doesn't apply can you please beef up the changelog so I don't go
scratching my head 2 years down the line. It currently describes the hows of
things but not exactly whys: shift_arg_pages missing tlb_start_vma,
move_page_tables look dodgy, yady yadda ?

Thx,
-Vineet
Peter Zijlstra Oct. 11, 2018, 3:04 p.m. UTC | #2
On Wed, Oct 03, 2018 at 05:03:50PM +0000, Vineet Gupta wrote:
> On 09/26/2018 04:56 AM, Peter Zijlstra wrote:
> > There are generally two cases:
> >
> >  1) either the platform has an efficient flush_tlb_range() and
> >     asm-generic/tlb.h doesn't need any overrides at all.
> >
> >  2) or an architecture lacks an efficient flush_tlb_range() and
> >     we override tlb_end_vma() and tlb_flush().
> >
> > Convert all 'simple' architectures to one of these two forms.
> >

> > --- a/arch/arc/include/asm/tlb.h
> > +++ b/arch/arc/include/asm/tlb.h
> > @@ -9,29 +9,6 @@
> >  #ifndef _ASM_ARC_TLB_H
> >  #define _ASM_ARC_TLB_H
> >  
> > -#define tlb_flush(tlb)				\
> > -do {						\
> > -	if (tlb->fullmm)			\
> > -		flush_tlb_mm((tlb)->mm);	\
> > -} while (0)
> > -
> > -/*
> > - * This pair is called at time of munmap/exit to flush cache and TLB entries
> > - * for mappings being torn down.
> > - * 1) cache-flush part -implemented via tlb_start_vma( ) for VIPT aliasing D$
> > - * 2) tlb-flush part - implemted via tlb_end_vma( ) flushes the TLB range
> > - *
> > - * Note, read http://lkml.org/lkml/2004/1/15/6
> > - */
> > -
> > -#define tlb_end_vma(tlb, vma)						\
> > -do {									\
> > -	if (!tlb->fullmm)						\
> > -		flush_tlb_range(vma, vma->vm_start, vma->vm_end);	\
> > -} while (0)
> > -
> > -#define __tlb_remove_tlb_entry(tlb, ptep, address)
> > -
> >  #include <linux/pagemap.h>
> >  #include <asm-generic/tlb.h>
> 
> LGTM per discussion in an earlier thread. However given that for "simpler" arches
> the whole series doesn't apply can you please beef up the changelog so I don't go
> scratching my head 2 years down the line. It currently describes the hows of
> things but not exactly whys: shift_arg_pages missing tlb_start_vma,
> move_page_tables look dodgy, yady yadda ?

Right you are. Thanks for pointing out the somewhat sparse Changelog;
typically I end up kicking myself a few years down the line.

I think I will in fact change the implementation a little and provide a
symbol/Kconfig to switch the default implementation between
flush_tlb_vma() and flush_tlb_mm().

That avoids some of the repetition. But see here a preview of the new
Changelog, does that clarify things enough?

---
Subject: arch/tlb: Clean up simple architectures
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue Sep 4 17:04:07 CEST 2018

The generic mmu_gather implementation is geared towards range tracking
and provided the architecture provides a fairly efficient
flush_tlb_range() implementation (or provides a custom tlb_flush()
implementation) things will work well.

The one case this doesn't cover well is where there is no (efficient)
range invalidate at all. In this case we can select
MMU_GATHER_NO_RANGE.

So this reduces to two cases:

 1) either the platform has an efficient flush_tlb_range() and
    asm-generic/tlb.h doesn't need any overrides at all.

 2) or an architecture lacks an efficient flush_tlb_range() and
    we need to select MMU_GATHER_NO_RANGE.

Convert all 'simple' architectures to one of these two forms.

alpha:	    has no range invalidate -> 2
arc:	    already used flush_tlb_range() -> 1
c6x:	    has no range invalidate -> 2
hexagon:    has an efficient flush_tlb_range() -> 1
            (flush_tlb_mm() is in fact a full range invalidate,
	     so no need to shoot down everything)
m68k:	    has inefficient flush_tlb_range() -> 2
microblaze: has no flush_tlb_range() -> 2
mips:	    has efficient flush_tlb_range() -> 1
	    (even though it currently seems to use flush_tlb_mm())
nds32:	    already uses flush_tlb_range() -> 1
nios2:	    has inefficient flush_tlb_range() -> 2
	    (no limit on range iteration)
openrisc:   has inefficient flush_tlb_range() -> 2
	    (no limit on range iteration)
parisc:	    already uses flush_tlb_range() -> 1
sparc32:    already uses flush_tlb_range() -> 1
unicore32:  has inefficient flush_tlb_range() -> 2
	    (no limit on range iteration)
xtensa:	    has efficient flush_tlb_range() -> 1

Note this also fixes a bug in the existing code for a number of
platforms. Those platforms that did:

  tlb_end_vma() -> if (!fullmm) flush_tlb_*()
  tlb_flush() -> if (fullmm) flush_tlb_mm()

missed the case of shift_arg_pages(), which doesn't have @fullmm set,
nor calls into tlb_*vma(), but still frees page-tables and thus needs
an invalidate. The new code handles this by detecting a non-empty
range, and either issuing the matching range invalidate or a full
invalidate, depending on the capabilities.

Cc: Nick Piggin <npiggin@gmail.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Helge Deller <deller@gmx.de>
Cc: Greentime Hu <green.hu@gmail.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Ley Foon Tan <lftan@altera.com>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Mark Salter <msalter@redhat.com>
Cc: Richard Kuo <rkuo@codeaurora.org>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Paul Burton <paul.burton@mips.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Guan Xuetao <gxt@pku.edu.cn>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Vineet Gupta Oct. 12, 2018, 7:40 p.m. UTC | #3
On 10/11/2018 08:06 AM, Peter Zijlstra wrote:
> On Wed, Oct 03, 2018 at 05:03:50PM +0000, Vineet Gupta wrote:
>> On 09/26/2018 04:56 AM, Peter Zijlstra wrote:
>>> There are generally two cases:
>>>
>>>  1) either the platform has an efficient flush_tlb_range() and
>>>     asm-generic/tlb.h doesn't need any overrides at all.
>>>
>>>  2) or an architecture lacks an efficient flush_tlb_range() and
>>>     we override tlb_end_vma() and tlb_flush().
>>>
>>> Convert all 'simple' architectures to one of these two forms.
>>>
>>> --- a/arch/arc/include/asm/tlb.h
>>> +++ b/arch/arc/include/asm/tlb.h
>>> @@ -9,29 +9,6 @@
>>>  #ifndef _ASM_ARC_TLB_H
>>>  #define _ASM_ARC_TLB_H
>>>  
>>> -#define tlb_flush(tlb)				\
>>> -do {						\
>>> -	if (tlb->fullmm)			\
>>> -		flush_tlb_mm((tlb)->mm);	\
>>> -} while (0)
>>> -
>>> -/*
>>> - * This pair is called at time of munmap/exit to flush cache and TLB entries
>>> - * for mappings being torn down.
>>> - * 1) cache-flush part -implemented via tlb_start_vma( ) for VIPT aliasing D$
>>> - * 2) tlb-flush part - implemted via tlb_end_vma( ) flushes the TLB range
>>> - *
>>> - * Note, read http://lkml.org/lkml/2004/1/15/6
>>> - */
>>> -
>>> -#define tlb_end_vma(tlb, vma)						\
>>> -do {									\
>>> -	if (!tlb->fullmm)						\
>>> -		flush_tlb_range(vma, vma->vm_start, vma->vm_end);	\
>>> -} while (0)
>>> -
>>> -#define __tlb_remove_tlb_entry(tlb, ptep, address)
>>> -
>>>  #include <linux/pagemap.h>
>>>  #include <asm-generic/tlb.h>
>> LGTM per discussion in an earlier thread. However given that for "simpler" arches
>> the whole series doesn't apply can you please beef up the changelog so I don't go
>> scratching my head 2 years down the line. It currently describes the hows of
>> things but not exactly whys: shift_arg_pages missing tlb_start_vma,
>> move_page_tables look dodgy, yady yadda ?
> Right you are. Thanks for pointing out the somewhat sparse Changelog;
> typically I end up kicking myself a few years down the line.
>
> I think I will in fact change the implementation a little and provide a
> symbol/Kconfig to switch the default implementation between
> flush_tlb_vma() and flush_tlb_mm().
>
> That avoids some of the repetition. But see here a preview of the new
> Changelog, does that clarify things enough?
>
> ---
> Subject: arch/tlb: Clean up simple architectures
> From: Peter Zijlstra <peterz@infradead.org>
> Date: Tue Sep 4 17:04:07 CEST 2018
>
> The generic mmu_gather implementation is geared towards range tracking
> and provided the architecture provides a fairly efficient
> flush_tlb_range() implementation (or provides a custom tlb_flush()
> implementation) things will work well.
>
> The one case this doesn't cover well is where there is no (efficient)
> range invalidate at all. In this case we can select
> MMU_GATHER_NO_RANGE.
>
> So this reduces to two cases:
>
>  1) either the platform has an efficient flush_tlb_range() and
>     asm-generic/tlb.h doesn't need any overrides at all.
>
>  2) or an architecture lacks an efficient flush_tlb_range() and
>     we need to select MMU_GATHER_NO_RANGE.
>
> Convert all 'simple' architectures to one of these two forms.
>
> alpha:	    has no range invalidate -> 2
> arc:	    already used flush_tlb_range() -> 1
> c6x:	    has no range invalidate -> 2
> hexagon:    has an efficient flush_tlb_range() -> 1
>             (flush_tlb_mm() is in fact a full range invalidate,
> 	     so no need to shoot down everything)
> m68k:	    has inefficient flush_tlb_range() -> 2
> microblaze: has no flush_tlb_range() -> 2
> mips:	    has efficient flush_tlb_range() -> 1
> 	    (even though it currently seems to use flush_tlb_mm())
> nds32:	    already uses flush_tlb_range() -> 1
> nios2:	    has inefficient flush_tlb_range() -> 2
> 	    (no limit on range iteration)
> openrisc:   has inefficient flush_tlb_range() -> 2
> 	    (no limit on range iteration)
> parisc:	    already uses flush_tlb_range() -> 1
> sparc32:    already uses flush_tlb_range() -> 1
> unicore32:  has inefficient flush_tlb_range() -> 2
> 	    (no limit on range iteration)
> xtensa:	    has efficient flush_tlb_range() -> 1
>
> Note this also fixes a bug in the existing code for a number
> platforms. Those platforms that did:
>
>   tlb_end_vma() -> if (!fullmm) flush_tlb_*()
>   tlb_flush -> if (full_mm) flush_tlb_mm()
>
> missed the case of shift_arg_pages(), which doesn't have @fullmm set,
> nor calls into tlb_*vma(), but still frees page-tables and thus needs
> an invalidate. The new code handles this by detecting a non-empty
> range, and either issuing the matching range invalidate or a full
> invalidate, depending on the capabilities.
>
> Cc: Nick Piggin <npiggin@gmail.com>
> Cc: "David S. Miller" <davem@davemloft.net>
> Cc: Michal Simek <monstr@monstr.eu>
> Cc: Helge Deller <deller@gmx.de>
> Cc: Greentime Hu <green.hu@gmail.com>
> Cc: Richard Henderson <rth@twiddle.net>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
> Cc: Will Deacon <will.deacon@arm.com>
> Cc: Ley Foon Tan <lftan@altera.com>
> Cc: Jonas Bonn <jonas@southpole.se>
> Cc: Mark Salter <msalter@redhat.com>
> Cc: Richard Kuo <rkuo@codeaurora.org
> Cc: Vineet Gupta <vgupta@synopsys.com>
> Cc: Paul Burton <paul.burton@mips.com>
> Cc: Max Filippov <jcmvbkbc@gmail.com>
> Cc: Guan Xuetao <gxt@pku.edu.cn>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

Very nice. Thx for doing this.

Once you have redone this, please point me to a branch so I can give this a spin.
I've always been interested in tracking down / optimizing the full TLB flushes -
which ARC implements by simply moving the MMU/process to a new ASID (TLB entries
tagged with an 8 bit value - unique per process). When I started looking into this,
a simple ls (fork+execve) would increment the ASID by 13 which I'd optimized to
a reasonable 4. Haven't checked that in recent times though so would be fun to
revive that measurement.

-Vineet
Peter Zijlstra Oct. 15, 2018, 2:14 p.m. UTC | #4
On Fri, Oct 12, 2018 at 07:40:04PM +0000, Vineet Gupta wrote:
> Very nice. Thx for doing this.
> 
> Once you have redone this, please point me to a branch so I can give this a spin.
> I've always been interested in tracking down / optimizing the full TLB flushes -
> which ARC implements by simply moving the MMU/process to a new ASID (TLB entries
> tagged with an 8 bit value - unique per process). When I started looking into this
> , a simple ls (fork+execve) would increment the ASID by 13 which I'd optimized to
> a reasonable 4. Haven't checked that in recent times though so would be fun to
> revive that measurement.

I just pushed out the latest version to:

  git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git mm/tlb

(mandatory caution: that tree is unstable / throw-away)

I'll wait a few days to see what, if anything, comes back from 0day
before posting again.
diff mbox series

Patch

--- a/arch/alpha/include/asm/tlb.h
+++ b/arch/alpha/include/asm/tlb.h
@@ -4,8 +4,6 @@ 
 
 #define tlb_start_vma(tlb, vma)			do { } while (0)
 #define tlb_end_vma(tlb, vma)			do { } while (0)
-#define __tlb_remove_tlb_entry(tlb, pte, addr)	do { } while (0)
-
 #define tlb_flush(tlb)				flush_tlb_mm((tlb)->mm)
 
 #include <asm-generic/tlb.h>
--- a/arch/arc/include/asm/tlb.h
+++ b/arch/arc/include/asm/tlb.h
@@ -9,29 +9,6 @@ 
 #ifndef _ASM_ARC_TLB_H
 #define _ASM_ARC_TLB_H
 
-#define tlb_flush(tlb)				\
-do {						\
-	if (tlb->fullmm)			\
-		flush_tlb_mm((tlb)->mm);	\
-} while (0)
-
-/*
- * This pair is called at time of munmap/exit to flush cache and TLB entries
- * for mappings being torn down.
- * 1) cache-flush part -implemented via tlb_start_vma( ) for VIPT aliasing D$
- * 2) tlb-flush part - implemted via tlb_end_vma( ) flushes the TLB range
- *
- * Note, read http://lkml.org/lkml/2004/1/15/6
- */
-
-#define tlb_end_vma(tlb, vma)						\
-do {									\
-	if (!tlb->fullmm)						\
-		flush_tlb_range(vma, vma->vm_start, vma->vm_end);	\
-} while (0)
-
-#define __tlb_remove_tlb_entry(tlb, ptep, address)
-
 #include <linux/pagemap.h>
 #include <asm-generic/tlb.h>
 
--- a/arch/c6x/include/asm/tlb.h
+++ b/arch/c6x/include/asm/tlb.h
@@ -2,6 +2,7 @@ 
 #ifndef _ASM_C6X_TLB_H
 #define _ASM_C6X_TLB_H
 
+#define tlb_end_vma(tlb,vma) do { } while (0)
 #define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
 
 #include <asm-generic/tlb.h>
--- a/arch/h8300/include/asm/tlb.h
+++ b/arch/h8300/include/asm/tlb.h
@@ -2,8 +2,6 @@ 
 #ifndef __H8300_TLB_H__
 #define __H8300_TLB_H__
 
-#define tlb_flush(tlb)	do { } while (0)
-
 #include <asm-generic/tlb.h>
 
 #endif
--- a/arch/hexagon/include/asm/tlb.h
+++ b/arch/hexagon/include/asm/tlb.h
@@ -22,18 +22,6 @@ 
 #include <linux/pagemap.h>
 #include <asm/tlbflush.h>
 
-/*
- * We don't need any special per-pte or per-vma handling...
- */
-#define tlb_start_vma(tlb, vma)				do { } while (0)
-#define tlb_end_vma(tlb, vma)				do { } while (0)
-#define __tlb_remove_tlb_entry(tlb, ptep, address)	do { } while (0)
-
-/*
- * .. because we flush the whole mm when it fills up
- */
-#define tlb_flush(tlb)		flush_tlb_mm((tlb)->mm)
-
 #include <asm-generic/tlb.h>
 
 #endif
--- a/arch/m68k/include/asm/tlb.h
+++ b/arch/m68k/include/asm/tlb.h
@@ -8,7 +8,6 @@ 
  */
 #define tlb_start_vma(tlb, vma)	do { } while (0)
 #define tlb_end_vma(tlb, vma)	do { } while (0)
-#define __tlb_remove_tlb_entry(tlb, ptep, address)	do { } while (0)
 
 /*
  * .. because we flush the whole mm when it
--- a/arch/microblaze/include/asm/tlb.h
+++ b/arch/microblaze/include/asm/tlb.h
@@ -11,14 +11,12 @@ 
 #ifndef _ASM_MICROBLAZE_TLB_H
 #define _ASM_MICROBLAZE_TLB_H
 
-#define tlb_flush(tlb)	flush_tlb_mm((tlb)->mm)
-
 #include <linux/pagemap.h>
 
 #ifdef CONFIG_MMU
 #define tlb_start_vma(tlb, vma)		do { } while (0)
 #define tlb_end_vma(tlb, vma)		do { } while (0)
-#define __tlb_remove_tlb_entry(tlb, pte, address) do { } while (0)
+#define tlb_flush(tlb)			flush_tlb_mm((tlb)->mm)
 #endif
 
 #include <asm-generic/tlb.h>
--- a/arch/mips/include/asm/tlb.h
+++ b/arch/mips/include/asm/tlb.h
@@ -5,14 +5,6 @@ 
 #include <asm/cpu-features.h>
 #include <asm/mipsregs.h>
 
-#define tlb_end_vma(tlb, vma) do { } while (0)
-#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
-
-/*
- * .. because we flush the whole mm when it fills up.
- */
-#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
-
 #define _UNIQUE_ENTRYHI(base, idx)					\
 		(((base) + ((idx) << (PAGE_SHIFT + 1))) |		\
 		 (cpu_has_tlbinv ? MIPS_ENTRYHI_EHINV : 0))
--- a/arch/nds32/include/asm/tlb.h
+++ b/arch/nds32/include/asm/tlb.h
@@ -4,16 +4,6 @@ 
 #ifndef __ASMNDS32_TLB_H
 #define __ASMNDS32_TLB_H
 
-#define tlb_end_vma(tlb,vma)				\
-	do { 						\
-		if(!tlb->fullmm)			\
-			flush_tlb_range(vma, vma->vm_start, vma->vm_end); \
-	} while (0)
-
-#define __tlb_remove_tlb_entry(tlb, pte, addr) do { } while (0)
-
-#define tlb_flush(tlb)	flush_tlb_mm((tlb)->mm)
-
 #include <asm-generic/tlb.h>
 
 #define __pte_free_tlb(tlb, pte, addr)	pte_free((tlb)->mm, pte)
--- a/arch/nios2/include/asm/tlb.h
+++ b/arch/nios2/include/asm/tlb.h
@@ -11,12 +11,14 @@ 
 #ifndef _ASM_NIOS2_TLB_H
 #define _ASM_NIOS2_TLB_H
 
-#define tlb_flush(tlb)	flush_tlb_mm((tlb)->mm)
-
 extern void set_mmu_pid(unsigned long pid);
 
+/*
+ * Nios II does have flush_tlb_range(), but it lacks a limit and fallback to
+ * full mm invalidation. So use flush_tlb_mm() for everything.
+ */
 #define tlb_end_vma(tlb, vma)	do { } while (0)
-#define __tlb_remove_tlb_entry(tlb, ptep, address)	do { } while (0)
+#define tlb_flush(tlb)	flush_tlb_mm((tlb)->mm)
 
 #include <linux/pagemap.h>
 #include <asm-generic/tlb.h>
--- a/arch/openrisc/include/asm/tlb.h
+++ b/arch/openrisc/include/asm/tlb.h
@@ -22,12 +22,14 @@ 
 /*
  * or32 doesn't need any special per-pte or
  * per-vma handling..
+ *
+ * OpenRISC doesn't have an efficient flush_tlb_range() so use flush_tlb_mm()
+ * for everything.
  */
 #define tlb_start_vma(tlb, vma) do { } while (0)
 #define tlb_end_vma(tlb, vma) do { } while (0)
-#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
-
 #define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
+
 #include <linux/pagemap.h>
 #include <asm-generic/tlb.h>
 
--- a/arch/parisc/include/asm/tlb.h
+++ b/arch/parisc/include/asm/tlb.h
@@ -2,19 +2,6 @@ 
 #ifndef _PARISC_TLB_H
 #define _PARISC_TLB_H
 
-#define tlb_flush(tlb)			\
-do {	if ((tlb)->fullmm)		\
-		flush_tlb_mm((tlb)->mm);\
-} while (0)
-
-#define tlb_end_vma(tlb, vma)	\
-do {	if (!(tlb)->fullmm)	\
-		flush_tlb_range(vma, vma->vm_start, vma->vm_end); \
-} while (0)
-
-#define __tlb_remove_tlb_entry(tlb, pte, address) \
-	do { } while (0)
-
 #include <asm-generic/tlb.h>
 
 #define __pmd_free_tlb(tlb, pmd, addr)	pmd_free((tlb)->mm, pmd)
--- a/arch/sparc/include/asm/tlb_32.h
+++ b/arch/sparc/include/asm/tlb_32.h
@@ -2,19 +2,6 @@ 
 #ifndef _SPARC_TLB_H
 #define _SPARC_TLB_H
 
-#define tlb_end_vma(tlb, vma) \
-do {								\
-	flush_tlb_range(vma, vma->vm_start, vma->vm_end);	\
-} while (0)
-
-#define __tlb_remove_tlb_entry(tlb, pte, address) \
-	do { } while (0)
-
-#define tlb_flush(tlb) \
-do {								\
-	flush_tlb_mm((tlb)->mm);				\
-} while (0)
-
 #include <asm-generic/tlb.h>
 
 #endif /* _SPARC_TLB_H */
--- a/arch/unicore32/include/asm/tlb.h
+++ b/arch/unicore32/include/asm/tlb.h
@@ -12,10 +12,12 @@ 
 #ifndef __UNICORE_TLB_H__
 #define __UNICORE_TLB_H__
 
-#define tlb_start_vma(tlb, vma)				do { } while (0)
-#define tlb_end_vma(tlb, vma)				do { } while (0)
-#define __tlb_remove_tlb_entry(tlb, ptep, address)	do { } while (0)
-#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
+/*
+ * unicore32 lacks an efficient flush_tlb_range(), use flush_tlb_mm().
+ */
+#define tlb_start_vma(tlb, vma)		do { } while (0)
+#define tlb_end_vma(tlb, vma)		do { } while (0)
+#define tlb_flush(tlb)			flush_tlb_mm((tlb)->mm)
 
 #define __pte_free_tlb(tlb, pte, addr)				\
 	do {							\
--- a/arch/xtensa/include/asm/tlb.h
+++ b/arch/xtensa/include/asm/tlb.h
@@ -14,23 +14,6 @@ 
 #include <asm/cache.h>
 #include <asm/page.h>
 
-#if (DCACHE_WAY_SIZE <= PAGE_SIZE)
-
-# define tlb_end_vma(tlb,vma)			do { } while (0)
-
-#else
-
-# define tlb_end_vma(tlb, vma)						      \
-	do {								      \
-		if (!tlb->fullmm)					      \
-			flush_tlb_range(vma, vma->vm_start, vma->vm_end);     \
-	} while(0)
-
-#endif
-
-#define __tlb_remove_tlb_entry(tlb,pte,addr)	do { } while (0)
-#define tlb_flush(tlb)				flush_tlb_mm((tlb)->mm)
-
 #include <asm-generic/tlb.h>
 
 #define __pte_free_tlb(tlb, pte, address)	pte_free((tlb)->mm, pte)