[RFC] avoid refcounting the lazy tlb mm struct

Message ID: 1594019787.286knc5cet.astroid@bobo.none (mailing list archive)
State: New, archived
Series: [RFC] avoid refcounting the lazy tlb mm struct

Commit Message

Nicholas Piggin July 6, 2020, 7:23 a.m. UTC
On big systems, the mm refcount can become highly contended when doing
a lot of context switching with threaded applications (particularly
switching between the idle thread and an application thread).
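
For context, the refcount in question is mm->mm_count: every lazy tlb
reference is taken and dropped with mmgrab()/mmdrop(), i.e. an atomic RMW
on one shared counter. Their definitions in include/linux/sched/mm.h are
roughly:

static inline void mmgrab(struct mm_struct *mm)
{
	atomic_inc(&mm->mm_count);
}

static inline void mmdrop(struct mm_struct *mm)
{
	if (unlikely(atomic_dec_and_test(&mm->mm_count)))
		__mmdrop(mm);
}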

Not doing lazy tlb at all slows switching down quite a bit, so I wonder
if we can avoid the refcount for the lazy tlb, but have __mmdrop() IPI
all CPUs that might be using this mm lazily.
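
Condensed from the include/linux/sched/mm.h and kernel/fork.c hunks below:
the lazy tlb grab/drop become no-ops when the architecture selects
CONFIG_MMU_LAZY_TLB_SHOOTDOWN, and the final __mmdrop() instead IPIs the
mm_cpumask to push any remaining lazy users over to init_mm:

static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
{
	if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN))
		mmgrab(mm);
}

static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
{
	if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN))
		mmdrop(mm);
}

/* IPI handler: a CPU still using the dying mm lazily switches to init_mm */
static void do_shoot_lazy_tlb(void *arg)
{
	struct mm_struct *mm = arg;

	if (current->active_mm == mm) {
		BUG_ON(current->mm);
		switch_mm(mm, &init_mm, current);
		current->active_mm = &init_mm;
	}
}

/* Called from __mmdrop(), i.e. once the last real reference is gone */
void shoot_lazy_tlbs(struct mm_struct *mm)
{
	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
		smp_call_function_many(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
		do_shoot_lazy_tlb(mm);
	}
	/* debug pass (do_check_lazy_tlb) omitted here; see the full patch */
}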

This patch has only had light testing so far, but seems to work okay.

Thanks,
Nick

--

Comments

Anton Blanchard July 10, 2020, 12:45 a.m. UTC | #1
Hi Nick,

> On big systems, the mm refcount can become highly contended when doing
> a lot of context switching with threaded applications (particularly
> switching between the idle thread and an application thread).
> 
> Not doing lazy tlb at all slows switching down quite a bit, so I
> wonder if we can avoid the refcount for the lazy tlb, but have
> __mmdrop() IPI all CPUs that might be using this mm lazily.
> 
> This patch has only had light testing so far, but seems to work okay.

I tested this patch on a large POWER8 system with 1536 hardware threads.
I can create a worst-case situation for mm refcounting by using
the threaded context switch test in will-it-scale set to half the
number of available CPUs (768).

With that workload the patch improves the context switch rate by 118x!

Tested-by: Anton Blanchard <anton@ozlabs.org>

Thanks,
Anton

Patch

diff --git a/arch/Kconfig b/arch/Kconfig
index 8cc35dc556c7..69ea7172db3d 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -411,6 +411,16 @@  config MMU_GATHER_NO_GATHER
 	bool
 	depends on MMU_GATHER_TABLE_FREE
 
+config MMU_LAZY_TLB_SHOOTDOWN
+	bool
+	help
+	  Instead of refcounting the "lazy tlb" mm struct, which can cause
+	  contention with multi-threaded apps on large multiprocessor systems,
+	  this option causes __mmdrop to IPI all CPUs in the mm_cpumask and
+	  switch to init_mm if they were using the to-be-freed mm as the lazy
+	  tlb. Architectures which do not track all possible lazy tlb CPUs in
+	  mm_cpumask can not use this (without modification).
+
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
 	bool
 
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 920c4e3ca4ef..24ac85c868db 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -225,6 +225,7 @@  config PPC
 	select HAVE_PERF_USER_STACK_DUMP
 	select MMU_GATHER_RCU_TABLE_FREE
 	select MMU_GATHER_PAGE_SIZE
+	select MMU_LAZY_TLB_SHOOTDOWN
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RELIABLE_STACKTRACE		if PPC_BOOK3S_64 && CPU_LITTLE_ENDIAN
 	select HAVE_SYSCALL_TRACEPOINTS
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index b5cc9b23cf02..52730629b3eb 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -652,10 +652,10 @@  static void do_exit_flush_lazy_tlb(void *arg)
 		 * Must be a kernel thread because sender is single-threaded.
 		 */
 		BUG_ON(current->mm);
-		mmgrab(&init_mm);
+		mmgrab_lazy_tlb(&init_mm);
 		switch_mm(mm, &init_mm, current);
 		current->active_mm = &init_mm;
-		mmdrop(mm);
+		mmdrop_lazy_tlb(mm);
 	}
 	_tlbiel_pid(pid, RIC_FLUSH_ALL);
 }
diff --git a/fs/exec.c b/fs/exec.c
index e6e8a9a70327..6c96c8feba1f 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1119,7 +1119,7 @@  static int exec_mmap(struct mm_struct *mm)
 		mmput(old_mm);
 		return 0;
 	}
-	mmdrop(active_mm);
+	mmdrop_lazy_tlb(active_mm);
 	return 0;
 }
 
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 480a4d1b7dd8..ef28059086a1 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -51,6 +51,25 @@  static inline void mmdrop(struct mm_struct *mm)
 
 void mmdrop(struct mm_struct *mm);
 
+static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
+{
+	if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN))
+		mmgrab(mm);
+}
+
+static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
+{
+	if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN))
+		mmdrop(mm);
+}
+
+static inline void mmdrop_lazy_tlb_smp_mb(struct mm_struct *mm)
+{
+	mmdrop_lazy_tlb(mm);
+	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN))
+		smp_mb();
+}
+
 /*
  * This has to be called after a get_task_mm()/mmget_not_zero()
  * followed by taking the mmap_lock for writing before modifying the
diff --git a/kernel/fork.c b/kernel/fork.c
index 142b23645d82..e3f1039cee9f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -685,6 +685,34 @@  static void check_mm(struct mm_struct *mm)
 #define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
 #define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
 
+static void do_shoot_lazy_tlb(void *arg)
+{
+	struct mm_struct *mm = arg;
+
+	if (current->active_mm == mm) {
+		BUG_ON(current->mm);
+		switch_mm(mm, &init_mm, current);
+		current->active_mm = &init_mm;
+	}
+}
+
+static void do_check_lazy_tlb(void *arg)
+{
+	struct mm_struct *mm = arg;
+
+	BUG_ON(current->active_mm == mm);
+}
+
+void shoot_lazy_tlbs(struct mm_struct *mm)
+{
+	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
+		smp_call_function_many(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
+		do_shoot_lazy_tlb(mm);
+	}
+	smp_call_function(do_check_lazy_tlb, (void *)mm, 1);
+	do_check_lazy_tlb(mm);
+}
+
 /*
  * Called when the last reference to the mm
  * is dropped: either by a lazy thread or by
@@ -692,6 +720,7 @@  static void check_mm(struct mm_struct *mm)
  */
 void __mmdrop(struct mm_struct *mm)
 {
+	shoot_lazy_tlbs(mm);
 	BUG_ON(mm == &init_mm);
 	WARN_ON_ONCE(mm == current->mm);
 	WARN_ON_ONCE(mm == current->active_mm);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ca5db40392d4..4d615e0be9e0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3308,7 +3308,7 @@  static struct rq *finish_task_switch(struct task_struct *prev)
 	 */
 	if (mm) {
 		membarrier_mm_sync_core_before_usermode(mm);
-		mmdrop(mm);
+		mmdrop_lazy_tlb_smp_mb(mm);
 	}
 	if (unlikely(prev_state == TASK_DEAD)) {
 		if (prev->sched_class->task_dead)
@@ -3413,9 +3413,9 @@  context_switch(struct rq *rq, struct task_struct *prev,
 
 	/*
 	 * kernel -> kernel   lazy + transfer active
-	 *   user -> kernel   lazy + mmgrab() active
+	 *   user -> kernel   lazy + mmgrab_lazy_tlb() active
 	 *
-	 * kernel ->   user   switch + mmdrop() active
+	 * kernel ->   user   switch + mmdrop_lazy_tlb() active
 	 *   user ->   user   switch
 	 */
 	if (!next->mm) {                                // to kernel
@@ -3423,7 +3423,7 @@  context_switch(struct rq *rq, struct task_struct *prev,
 
 		next->active_mm = prev->active_mm;
 		if (prev->mm)                           // from user
-			mmgrab(prev->active_mm);
+			mmgrab_lazy_tlb(prev->active_mm);
 		else
 			prev->active_mm = NULL;
 	} else {                                        // to user
@@ -3439,7 +3439,7 @@  context_switch(struct rq *rq, struct task_struct *prev,
 		switch_mm_irqs_off(prev->active_mm, next->mm, next);
 
 		if (!prev->mm) {                        // from kernel
-			/* will mmdrop() in finish_task_switch(). */
+			/* will mmdrop_lazy_tlb() in finish_task_switch(). */
 			rq->prev_mm = prev->active_mm;
 			prev->active_mm = NULL;
 		}