
arm64/mm: save memory access in check_and_switch_context() fast switch path

Message ID 1593755079-2160-1-git-send-email-kernelfans@gmail.com (mailing list archive)
State Mainlined
Commit c4885bbb3afee80f41d39a33e49881a18e500f47
Series arm64/mm: save memory access in check_and_switch_context() fast switch path

Commit Message

Pingfan Liu July 3, 2020, 5:44 a.m. UTC
The cpu_number and __per_cpu_offset occupy two different cache lines, and may
no longer be in the cache after a heavy user space load.

By replacing per_cpu(active_asids, cpu) with this_cpu_ptr(&active_asids) in the
fast path, a register is used and these memory accesses are avoided.

Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Steve Capper <steve.capper@arm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Vladimir Murzin <vladimir.murzin@arm.com>
Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
To: linux-arm-kernel@lists.infradead.org
---
 arch/arm64/include/asm/mmu_context.h |  6 ++----
 arch/arm64/mm/context.c              | 10 ++++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

Comments

Mark Rutland July 3, 2020, 10:13 a.m. UTC | #1
On Fri, Jul 03, 2020 at 01:44:39PM +0800, Pingfan Liu wrote:
> The cpu_number and __per_cpu_offset cost two different cache lines, and may
> not exist after a heavy user space load.
> 
> By replacing per_cpu(active_asids, cpu) with this_cpu_ptr(&active_asids) in
> fast path, register is used and these memory access are avoided.

How about:

| On arm64, smp_processor_id() reads a per-cpu `cpu_number` variable,
| using the per-cpu offset stored in the tpidr_el1 system register. In
| some cases we generate a per-cpu address with a sequence like:
|
| | cpu_ptr = &per_cpu(ptr, smp_processor_id());
|
| Which potentially incurs a cache miss for both `cpu_number` and the
| in-memory `__per_cpu_offset` array. This can be written more optimally
| as:
|
| | cpu_ptr = this_cpu_ptr(ptr);
|
| ... which only needs the offset from tpidr_el1, and does not need to
| load from memory.

> By replacing per_cpu(active_asids, cpu) with this_cpu_ptr(&active_asids) in
> fast path, register is used and these memory access are avoided.

Do you have any numbers that show benefit here? It's not clear to me how
often the above case would apply where the caches would also be hot for
everything else we need, and numbers would help to justify that.

> Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: Steve Capper <steve.capper@arm.com>
> Cc: Mark Rutland <mark.rutland@arm.com>
> Cc: Vladimir Murzin <vladimir.murzin@arm.com>
> Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
> To: linux-arm-kernel@lists.infradead.org
> ---
>  arch/arm64/include/asm/mmu_context.h |  6 ++----
>  arch/arm64/mm/context.c              | 10 ++++++----
>  2 files changed, 8 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
> index ab46187..808c3be 100644
> --- a/arch/arm64/include/asm/mmu_context.h
> +++ b/arch/arm64/include/asm/mmu_context.h
> @@ -175,7 +175,7 @@ static inline void cpu_replace_ttbr1(pgd_t *pgdp)
>   * take CPU migration into account.
>   */
>  #define destroy_context(mm)		do { } while(0)
> -void check_and_switch_context(struct mm_struct *mm, unsigned int cpu);
> +void check_and_switch_context(struct mm_struct *mm);
>  
>  #define init_new_context(tsk,mm)	({ atomic64_set(&(mm)->context.id, 0); 0; })
>  
> @@ -214,8 +214,6 @@ enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
>  
>  static inline void __switch_mm(struct mm_struct *next)
>  {
> -	unsigned int cpu = smp_processor_id();
> -
>  	/*
>  	 * init_mm.pgd does not contain any user mappings and it is always
>  	 * active for kernel addresses in TTBR1. Just set the reserved TTBR0.
> @@ -225,7 +223,7 @@ static inline void __switch_mm(struct mm_struct *next)
>  		return;
>  	}
>  
> -	check_and_switch_context(next, cpu);
> +	check_and_switch_context(next);
>  }
>  
>  static inline void
> diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
> index d702d60..a206655 100644
> --- a/arch/arm64/mm/context.c
> +++ b/arch/arm64/mm/context.c
> @@ -198,9 +198,10 @@ static u64 new_context(struct mm_struct *mm)
>  	return idx2asid(asid) | generation;
>  }
>  
> -void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
> +void check_and_switch_context(struct mm_struct *mm)
>  {
>  	unsigned long flags;
> +	unsigned int cpu;
>  	u64 asid, old_active_asid;
>  
>  	if (system_supports_cnp())
> @@ -222,9 +223,9 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
>  	 *   relaxed xchg in flush_context will treat us as reserved
>  	 *   because atomic RmWs are totally ordered for a given location.
>  	 */
> -	old_active_asid = atomic64_read(&per_cpu(active_asids, cpu));
> +	old_active_asid = atomic64_read(this_cpu_ptr(&active_asids));
>  	if (old_active_asid && asid_gen_match(asid) &&
> -	    atomic64_cmpxchg_relaxed(&per_cpu(active_asids, cpu),
> +	    atomic64_cmpxchg_relaxed(this_cpu_ptr(&active_asids),
>  				     old_active_asid, asid))
>  		goto switch_mm_fastpath;
>  
> @@ -236,10 +237,11 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
>  		atomic64_set(&mm->context.id, asid);
>  	}
>  
> +	cpu = smp_processor_id();
>  	if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending))
>  		local_flush_tlb_all();
>  
> -	atomic64_set(&per_cpu(active_asids, cpu), asid);
> +	atomic64_set(this_cpu_ptr(&active_asids), asid);
>  	raw_spin_unlock_irqrestore(&cpu_asid_lock, flags);

FWIW, this looks sound to me.

Mark.
Pingfan Liu July 6, 2020, 8:10 a.m. UTC | #2
On Fri, Jul 3, 2020 at 6:13 PM Mark Rutland <mark.rutland@arm.com> wrote:
>
> On Fri, Jul 03, 2020 at 01:44:39PM +0800, Pingfan Liu wrote:
> > The cpu_number and __per_cpu_offset cost two different cache lines, and may
> > not exist after a heavy user space load.
> >
> > By replacing per_cpu(active_asids, cpu) with this_cpu_ptr(&active_asids) in
> > fast path, register is used and these memory access are avoided.
>
> How about:
>
> | On arm64, smp_processor_id() reads a per-cpu `cpu_number` variable,
> | using the per-cpu offset stored in the tpidr_el1 system register. In
> | some cases we generate a per-cpu address with a sequence like:
> |
> | | cpu_ptr = &per_cpu(ptr, smp_processor_id());
> |
> | Which potentially incurs a cache miss for both `cpu_number` and the
> | in-memory `__per_cpu_offset` array. This can be written more optimally
> | as:
> |
> | | cpu_ptr = this_cpu_ptr(ptr);
> |
> | ... which only needs the offset from tpidr_el1, and does not need to
> | load from memory.
I appreciate your clear write-up.
>
> > By replacing per_cpu(active_asids, cpu) with this_cpu_ptr(&active_asids) in
> > fast path, register is used and these memory access are avoided.
>
> Do you have any numbers that show benefit here? It's not clear to me how
> often the above case would apply where the caches would also be hot for
> everything else we need, and numbers would help to justify that.
Initially, I was just attracted by the implementation of the
__my_cpu_offset macro, and that led me to this question. But following
your reasoning, I realized data is needed to make things clear.

I have finished a test with the 5.8.0-rc4 kernel on a 46-CPU Qualcomm machine.
command: time -p make all -j138

Before this patch:
real 291.86
user 11050.18
sys 362.91

After this patch
real 291.11
user 11055.62
sys 363.39

As the data shows, there is a very small improvement.

Thanks,
Pingfan
Pingfan Liu July 7, 2020, 1:50 a.m. UTC | #3
On Mon, Jul 6, 2020 at 4:10 PM Pingfan Liu <kernelfans@gmail.com> wrote:
>
> On Fri, Jul 3, 2020 at 6:13 PM Mark Rutland <mark.rutland@arm.com> wrote:
> >
> > On Fri, Jul 03, 2020 at 01:44:39PM +0800, Pingfan Liu wrote:
> > > The cpu_number and __per_cpu_offset cost two different cache lines, and may
> > > not exist after a heavy user space load.
> > >
> > > By replacing per_cpu(active_asids, cpu) with this_cpu_ptr(&active_asids) in
> > > fast path, register is used and these memory access are avoided.
> >
> > How about:
> >
> > | On arm64, smp_processor_id() reads a per-cpu `cpu_number` variable,
> > | using the per-cpu offset stored in the tpidr_el1 system register. In
> > | some cases we generate a per-cpu address with a sequence like:
> > |
> > | | cpu_ptr = &per_cpu(ptr, smp_processor_id());
> > |
> > | Which potentially incurs a cache miss for both `cpu_number` and the
> > | in-memory `__per_cpu_offset` array. This can be written more optimally
> > | as:
> > |
> > | | cpu_ptr = this_cpu_ptr(ptr);
> > |
> > | ... which only needs the offset from tpidr_el1, and does not need to
> > | load from memory.
> Appreciate for your clear document.
> >
> > > By replacing per_cpu(active_asids, cpu) with this_cpu_ptr(&active_asids) in
> > > fast path, register is used and these memory access are avoided.
> >
> > Do you have any numbers that show benefit here? It's not clear to me how
> > often the above case would apply where the caches would also be hot for
> > everything else we need, and numbers would help to justify that.
> Initially, I was just abstracted by the macro __my_cpu_offset
> implement, and came to this question. But following your thinking, I
> realized data is needed to make things clear.
>
> I have finished a test with 5.8.0-rc4 kernel on a 46 cpus qualcomm machine.
> command: time -p make all -j138
>
> Before this patch:
> real 291.86
> user 11050.18
> sys 362.91
>
> After this patch
> real 291.11
> user 11055.62
> sys 363.39
>
> As the data, it shows a very small improvement.
The data may be affected by random factors, and so is less persuasive.
So I did some repeated tests with perf-stat.
#cat b.sh
make clean && make all -j138

#perf stat --repeat 10 --null --sync sh b.sh

- before this patch
 Performance counter stats for 'sh b.sh' (10 runs):

            298.62 +- 1.86 seconds time elapsed  ( +-  0.62% )


- after this patch
 Performance counter stats for 'sh b.sh' (10 runs):

           297.734 +- 0.954 seconds time elapsed  ( +-  0.32% )


As the mean values, 298.62 vs 297.734, show, this trivial change does
bring a small improvement in performance.
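That is (298.62 - 297.734) / 298.62 = ~0.3% on the mean elapsed time.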
>
> Thanks,
> Pingfan
Mark Rutland July 9, 2020, 11:48 a.m. UTC | #4
On Tue, Jul 07, 2020 at 09:50:58AM +0800, Pingfan Liu wrote:
> On Mon, Jul 6, 2020 at 4:10 PM Pingfan Liu <kernelfans@gmail.com> wrote:
> >
> > On Fri, Jul 3, 2020 at 6:13 PM Mark Rutland <mark.rutland@arm.com> wrote:
> > >
> > > On Fri, Jul 03, 2020 at 01:44:39PM +0800, Pingfan Liu wrote:
> > > > The cpu_number and __per_cpu_offset cost two different cache lines, and may
> > > > not exist after a heavy user space load.
> > > >
> > > > By replacing per_cpu(active_asids, cpu) with this_cpu_ptr(&active_asids) in
> > > > fast path, register is used and these memory access are avoided.
> > >
> > > How about:
> > >
> > > | On arm64, smp_processor_id() reads a per-cpu `cpu_number` variable,
> > > | using the per-cpu offset stored in the tpidr_el1 system register. In
> > > | some cases we generate a per-cpu address with a sequence like:
> > > |
> > > | | cpu_ptr = &per_cpu(ptr, smp_processor_id());
> > > |
> > > | Which potentially incurs a cache miss for both `cpu_number` and the
> > > | in-memory `__per_cpu_offset` array. This can be written more optimally
> > > | as:
> > > |
> > > | | cpu_ptr = this_cpu_ptr(ptr);
> > > |
> > > | ... which only needs the offset from tpidr_el1, and does not need to
> > > | load from memory.
> > Appreciate for your clear document.
> > >
> > > > By replacing per_cpu(active_asids, cpu) with this_cpu_ptr(&active_asids) in
> > > > fast path, register is used and these memory access are avoided.
> > >
> > > Do you have any numbers that show benefit here? It's not clear to me how
> > > often the above case would apply where the caches would also be hot for
> > > everything else we need, and numbers would help to justify that.
> > Initially, I was just abstracted by the macro __my_cpu_offset
> > implement, and came to this question. But following your thinking, I
> > realized data is needed to make things clear.
> >
> > I have finished a test with 5.8.0-rc4 kernel on a 46 cpus qualcomm machine.
> > command: time -p make all -j138
> >
> > Before this patch:
> > real 291.86
> > user 11050.18
> > sys 362.91
> >
> > After this patch
> > real 291.11
> > user 11055.62
> > sys 363.39
> >
> > As the data, it shows a very small improvement.
> The data may be affected by random factors, and less persuasive. And I
> tried to do some repeated tests with perf-stat.
> #cat b.sh
> make clean && make all -j138
> 
> #perf stat --repeat 10 --null --sync sh b.sh
> 
> - before this patch
>  Performance counter stats for 'sh b.sh' (10 runs):
> 
>             298.62 +- 1.86 seconds time elapsed  ( +-  0.62% )
> 
> 
> - after this patch
>  Performance counter stats for 'sh b.sh' (10 runs):
> 
>            297.734 +- 0.954 seconds time elapsed  ( +-  0.32% )
> 

IIUC that's a 0.3% improvement. It'd be worth putting these results in
the commit message.

Could you also try that with "perf bench sched messaging" as the
workload? As a microbenchmark, that might show the highest potential
benefit, and it'd be nice to have those figures too if possible.

Thanks,
Mark.
Pingfan Liu July 10, 2020, 8:03 a.m. UTC | #5
On Thu, Jul 9, 2020 at 7:48 PM Mark Rutland <mark.rutland@arm.com> wrote:
[...]
>
> IIUC that's a 0.3% improvement. It'd be worth putting these results in
> the commit message.
Sure, I will.
>
> Could you also try that with "perf bench sched messaging" as the
> workload? As a microbenchmark, that might show the highest potential
> benefit, and it'd be nice to have those figures too if possible.
I have run this test 10 times, and will put the results in the commit
log too. In summary, this microbenchmark shows about a 1.69%
improvement with this patch.
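That works out to roughly (0.707 - 0.695) / 0.707 = ~1.7% on the summed
Total time figures below.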

Test data:

1. without this patch, total 0.707 sec for 10 times

# perf stat -r 10 perf bench sched messaging
# Running 'sched/messaging' benchmark:
# 20 sender and receiver processes per group
# 10 groups == 400 processes run

     Total time: 0.074 [sec]
# Running 'sched/messaging' benchmark:
# 20 sender and receiver processes per group
# 10 groups == 400 processes run

     Total time: 0.071 [sec]
# Running 'sched/messaging' benchmark:
# 20 sender and receiver processes per group
# 10 groups == 400 processes run

     Total time: 0.068 [sec]
# Running 'sched/messaging' benchmark:
# 20 sender and receiver processes per group
# 10 groups == 400 processes run

     Total time: 0.072 [sec]
# Running 'sched/messaging' benchmark:
# 20 sender and receiver processes per group
# 10 groups == 400 processes run

     Total time: 0.070 [sec]
# Running 'sched/messaging' benchmark:
# 20 sender and receiver processes per group
# 10 groups == 400 processes run

     Total time: 0.070 [sec]
# Running 'sched/messaging' benchmark:
# 20 sender and receiver processes per group
# 10 groups == 400 processes run

     Total time: 0.072 [sec]
# Running 'sched/messaging' benchmark:
# Running 'sched/messaging' benchmark:
# 20 sender and receiver processes per group
# 10 groups == 400 processes run

     Total time: 0.072 [sec]
# Running 'sched/messaging' benchmark:
# 20 sender and receiver processes per group
# 10 groups == 400 processes run

     Total time: 0.068 [sec]
# Running 'sched/messaging' benchmark:
# 20 sender and receiver processes per group
# 10 groups == 400 processes run

     Total time: 0.070 [sec]

 Performance counter stats for 'perf bench sched messaging' (10 runs):

          3,102.15 msec task-clock                #   11.018 CPUs utilized            ( +-  0.47% )
            16,468      context-switches          #    0.005 M/sec                    ( +-  2.56% )
             6,877      cpu-migrations            #    0.002 M/sec                    ( +-  3.44% )
            83,645      page-faults               #    0.027 M/sec                    ( +-  0.05% )
     6,440,897,966      cycles                    #    2.076 GHz                      ( +-  0.37% )
     3,620,264,483      instructions              #    0.56  insn per cycle           ( +-  0.11% )
   <not supported>      branches
        11,187,394      branch-misses                                                 ( +-  0.73% )

           0.28155 +- 0.00166 seconds time elapsed  ( +-  0.59% )

2. with this patch, total 0.695 sec for 10 times
# perf stat -r 10 perf bench sched messaging
# Running 'sched/messaging' benchmark:
# 20 sender and receiver processes per group
# 10 groups == 400 processes run

     Total time: 0.069 [sec]
# Running 'sched/messaging' benchmark:
# 20 sender and receiver processes per group
# 10 groups == 400 processes run

     Total time: 0.070 [sec]
# Running 'sched/messaging' benchmark:
# 20 sender and receiver processes per group
# 10 groups == 400 processes run

     Total time: 0.070 [sec]
# Running 'sched/messaging' benchmark:
# 20 sender and receiver processes per group
# 10 groups == 400 processes run

     Total time: 0.070 [sec]
# Running 'sched/messaging' benchmark:
# 20 sender and receiver processes per group
# 10 groups == 400 processes run

     Total time: 0.071 [sec]
# Running 'sched/messaging' benchmark:
# 20 sender and receiver processes per group
# 10 groups == 400 processes run

     Total time: 0.069 [sec]
# Running 'sched/messaging' benchmark:
# 20 sender and receiver processes per group
# 10 groups == 400 processes run

     Total time: 0.072 [sec]
# Running 'sched/messaging' benchmark:
# 20 sender and receiver processes per group
# 10 groups == 400 processes run

     Total time: 0.066 [sec]
# Running 'sched/messaging' benchmark:
# 20 sender and receiver processes per group
# 10 groups == 400 processes run

     Total time: 0.069 [sec]
# Running 'sched/messaging' benchmark:
# 20 sender and receiver processes per group
# 10 groups == 400 processes run

     Total time: 0.069 [sec]

 Performance counter stats for 'perf bench sched messaging' (10 runs):

          3,098.48 msec task-clock                #   11.182 CPUs utilized            ( +-  0.38% )
            15,485      context-switches          #    0.005 M/sec                    ( +-  2.28% )
             6,707      cpu-migrations            #    0.002 M/sec                    ( +-  2.80% )
            83,606      page-faults               #    0.027 M/sec                    ( +-  0.00% )
     6,435,068,186      cycles                    #    2.077 GHz                      ( +-  0.26% )
     3,611,197,297      instructions              #    0.56  insn per cycle           ( +-  0.08% )
   <not supported>      branches
        11,323,244      branch-misses                                                 ( +-  0.51% )

          0.277087 +- 0.000625 seconds time elapsed  ( +-  0.23% )


Thanks,
Pingfan
Mark Rutland July 10, 2020, 9:35 a.m. UTC | #6
On Fri, Jul 10, 2020 at 04:03:39PM +0800, Pingfan Liu wrote:
> On Thu, Jul 9, 2020 at 7:48 PM Mark Rutland <mark.rutland@arm.com> wrote:
> [...]
> >
> > IIUC that's a 0.3% improvement. It'd be worth putting these results in
> > the commit message.
> Sure, I will.
> >
> > Could you also try that with "perf bench sched messaging" as the
> > workload? As a microbenchmark, that might show the highest potential
> > benefit, and it'd be nice to have those figures too if possible.
> I have finished 10 times of this test, and will put the results in the
> commit log too. In summary, this microbenchmark has about 1.69%
> improvement after this patch.

Great; thanks for gathering this data!

Mark.

> 
> Test data:
> 
> 1. without this patch, total 0.707 sec for 10 times
> 
> # perf stat -r 10 perf bench sched messaging
> # Running 'sched/messaging' benchmark:
> # 20 sender and receiver processes per group
> # 10 groups == 400 processes run
> 
>      Total time: 0.074 [sec]
> # Running 'sched/messaging' benchmark:
> # 20 sender and receiver processes per group
> # 10 groups == 400 processes run
> 
>      Total time: 0.071 [sec]
> # Running 'sched/messaging' benchmark:
> # 20 sender and receiver processes per group
> # 10 groups == 400 processes run
> 
>      Total time: 0.068 [sec]
> # Running 'sched/messaging' benchmark:
> # 20 sender and receiver processes per group
> # 10 groups == 400 processes run
> 
>      Total time: 0.072 [sec]
> # Running 'sched/messaging' benchmark:
> # 20 sender and receiver processes per group
> # 10 groups == 400 processes run
> 
>      Total time: 0.070 [sec]
> # Running 'sched/messaging' benchmark:
> # 20 sender and receiver processes per group
> # 10 groups == 400 processes run
> 
>      Total time: 0.070 [sec]
> # Running 'sched/messaging' benchmark:
> # 20 sender and receiver processes per group
> # 10 groups == 400 processes run
> 
>      Total time: 0.072 [sec]
> # Running 'sched/messaging' benchmark:
> # Running 'sched/messaging' benchmark:
> # 20 sender and receiver processes per group
> # 10 groups == 400 processes run
> 
>      Total time: 0.072 [sec]
> # Running 'sched/messaging' benchmark:
> # 20 sender and receiver processes per group
> # 10 groups == 400 processes run
> 
>      Total time: 0.068 [sec]
> # Running 'sched/messaging' benchmark:
> # 20 sender and receiver processes per group
> # 10 groups == 400 processes run
> 
>      Total time: 0.070 [sec]
> 
>  Performance counter stats for 'perf bench sched messaging' (10 runs):
> 
>           3,102.15 msec task-clock                #   11.018 CPUs
> utilized            ( +-  0.47% )
>             16,468      context-switches          #    0.005 M/sec
>                ( +-  2.56% )
>              6,877      cpu-migrations            #    0.002 M/sec
>                ( +-  3.44% )
>             83,645      page-faults               #    0.027 M/sec
>                ( +-  0.05% )
>      6,440,897,966      cycles                    #    2.076 GHz
>                ( +-  0.37% )
>      3,620,264,483      instructions              #    0.56  insn per
> cycle           ( +-  0.11% )
>    <not supported>      branches
>         11,187,394      branch-misses
>                ( +-  0.73% )
> 
>            0.28155 +- 0.00166 seconds time elapsed  ( +-  0.59% )
> 
> 2. with this patch, total 0.695 sec for 10 times
> perf stat -r 10 perf bench sched messaging
> # Running 'sched/messaging' benchmark:
> # 20 sender and receiver processes per group
> # 10 groups == 400 processes run
> 
>      Total time: 0.069 [sec]
> # Running 'sched/messaging' benchmark:
> # 20 sender and receiver processes per group
> # 10 groups == 400 processes run
> 
>      Total time: 0.070 [sec]
> # Running 'sched/messaging' benchmark:
> # 20 sender and receiver processes per group
> # 10 groups == 400 processes run
> 
>      Total time: 0.070 [sec]
> # Running 'sched/messaging' benchmark:
> # 20 sender and receiver processes per group
> # 10 groups == 400 processes run
> 
>      Total time: 0.070 [sec]
> # Running 'sched/messaging' benchmark:
> # 20 sender and receiver processes per group
> # 10 groups == 400 processes run
> 
>      Total time: 0.071 [sec]
> # Running 'sched/messaging' benchmark:
> # 20 sender and receiver processes per group
> # 10 groups == 400 processes run
> 
>      Total time: 0.069 [sec]
> # Running 'sched/messaging' benchmark:
> # 20 sender and receiver processes per group
> # 10 groups == 400 processes run
> 
>      Total time: 0.072 [sec]
> # Running 'sched/messaging' benchmark:
> # 20 sender and receiver processes per group
> # 10 groups == 400 processes run
> 
>      Total time: 0.066 [sec]
> # Running 'sched/messaging' benchmark:
> # 20 sender and receiver processes per group
> # 10 groups == 400 processes run
> 
>      Total time: 0.069 [sec]
> # Running 'sched/messaging' benchmark:
> # 20 sender and receiver processes per group
> # 10 groups == 400 processes run
> 
>      Total time: 0.069 [sec]
> 
>  Performance counter stats for 'perf bench sched messaging' (10 runs):
> 
>           3,098.48 msec task-clock                #   11.182 CPUs
> utilized            ( +-  0.38% )
>             15,485      context-switches          #    0.005 M/sec
>                ( +-  2.28% )
>              6,707      cpu-migrations            #    0.002 M/sec
>                ( +-  2.80% )
>             83,606      page-faults               #    0.027 M/sec
>                ( +-  0.00% )
>      6,435,068,186      cycles                    #    2.077 GHz
>                ( +-  0.26% )
>      3,611,197,297      instructions              #    0.56  insn per
> cycle           ( +-  0.08% )
>    <not supported>      branches
>         11,323,244      branch-misses
>                ( +-  0.51% )
> 
>           0.277087 +- 0.000625 seconds time elapsed  ( +-  0.23% )
> 
> 
> Thanks,
> Pingfan

Patch

diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index ab46187..808c3be 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -175,7 +175,7 @@  static inline void cpu_replace_ttbr1(pgd_t *pgdp)
  * take CPU migration into account.
  */
 #define destroy_context(mm)		do { } while(0)
-void check_and_switch_context(struct mm_struct *mm, unsigned int cpu);
+void check_and_switch_context(struct mm_struct *mm);
 
 #define init_new_context(tsk,mm)	({ atomic64_set(&(mm)->context.id, 0); 0; })
 
@@ -214,8 +214,6 @@  enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 
 static inline void __switch_mm(struct mm_struct *next)
 {
-	unsigned int cpu = smp_processor_id();
-
 	/*
 	 * init_mm.pgd does not contain any user mappings and it is always
 	 * active for kernel addresses in TTBR1. Just set the reserved TTBR0.
@@ -225,7 +223,7 @@  static inline void __switch_mm(struct mm_struct *next)
 		return;
 	}
 
-	check_and_switch_context(next, cpu);
+	check_and_switch_context(next);
 }
 
 static inline void
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index d702d60..a206655 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -198,9 +198,10 @@  static u64 new_context(struct mm_struct *mm)
 	return idx2asid(asid) | generation;
 }
 
-void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
+void check_and_switch_context(struct mm_struct *mm)
 {
 	unsigned long flags;
+	unsigned int cpu;
 	u64 asid, old_active_asid;
 
 	if (system_supports_cnp())
@@ -222,9 +223,9 @@  void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
 	 *   relaxed xchg in flush_context will treat us as reserved
 	 *   because atomic RmWs are totally ordered for a given location.
 	 */
-	old_active_asid = atomic64_read(&per_cpu(active_asids, cpu));
+	old_active_asid = atomic64_read(this_cpu_ptr(&active_asids));
 	if (old_active_asid && asid_gen_match(asid) &&
-	    atomic64_cmpxchg_relaxed(&per_cpu(active_asids, cpu),
+	    atomic64_cmpxchg_relaxed(this_cpu_ptr(&active_asids),
 				     old_active_asid, asid))
 		goto switch_mm_fastpath;
 
@@ -236,10 +237,11 @@  void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
 		atomic64_set(&mm->context.id, asid);
 	}
 
+	cpu = smp_processor_id();
 	if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending))
 		local_flush_tlb_all();
 
-	atomic64_set(&per_cpu(active_asids, cpu), asid);
+	atomic64_set(this_cpu_ptr(&active_asids), asid);
 	raw_spin_unlock_irqrestore(&cpu_asid_lock, flags);
 
 switch_mm_fastpath: