
[RFC,43/86] sched: enable PREEMPT_COUNT, PREEMPTION for all preemption models

Message ID: 20231107215742.363031-44-ankur.a.arora@oracle.com
State: New
Series: Make the kernel preemptible

Commit Message

Ankur Arora Nov. 7, 2023, 9:57 p.m. UTC
The scheduler uses PREEMPT_COUNT and PREEMPTION to drive
preemption: the first to demarcate non-preemptible sections and
the second for the actual mechanics of preemption.

Enable both for the voluntary preemption models (PREEMPT_NONE and
PREEMPT_VOLUNTARY).

In addition, define a new scheduler feature FORCE_PREEMPT which
can now be used to distinguish between voluntary and full
preemption models at runtime.
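
As a minimal sketch of how the two options divide the work (only the
stock preempt_count() API appears below; nothing in the sketch is
added by this patch):

	preempt_disable();	/* preempt_count()++: begin a
				 * non-preemptible section */
	/* ... per-CPU or lock-protected work ... */
	preempt_enable();	/* preempt_count()--: with PREEMPTION
				 * enabled, this can reschedule via
				 * preempt_schedule() if a resched
				 * request arrived meanwhile */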

Originally-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ankur Arora <ankur.a.arora@oracle.com>
---
 init/Makefile           |  2 +-
 kernel/Kconfig.preempt  | 12 ++++++++----
 kernel/entry/common.c   |  3 +--
 kernel/sched/core.c     | 26 +++++++++++---------------
 kernel/sched/features.h |  6 ++++++
 5 files changed, 27 insertions(+), 22 deletions(-)

Comments

Peter Zijlstra Nov. 8, 2023, 9:58 a.m. UTC | #1
On Tue, Nov 07, 2023 at 01:57:29PM -0800, Ankur Arora wrote:
> The scheduler uses PREEMPT_COUNT and PREEMPTION to drive
> preemption: the first to demarcate non-preemptible sections and
> the second for the actual mechanics of preemption.
> 
> Enable both for voluntary preemption models.
> 
> In addition, define a new scheduler feature FORCE_PREEMPT which
> can now be used to distinguish between voluntary and full
> preemption models at runtime.
> 
> Originally-by: Thomas Gleixner <tglx@linutronix.de>
> Signed-off-by: Ankur Arora <ankur.a.arora@oracle.com>
> ---
>  init/Makefile           |  2 +-
>  kernel/Kconfig.preempt  | 12 ++++++++----
>  kernel/entry/common.c   |  3 +--
>  kernel/sched/core.c     | 26 +++++++++++---------------
>  kernel/sched/features.h |  6 ++++++
>  5 files changed, 27 insertions(+), 22 deletions(-)
> 
> diff --git a/init/Makefile b/init/Makefile
> index 385fd80fa2ef..99e480f24cf3 100644
> --- a/init/Makefile
> +++ b/init/Makefile
> @@ -24,7 +24,7 @@ mounts-$(CONFIG_BLK_DEV_INITRD)	+= do_mounts_initrd.o
>  #
>  
>  smp-flag-$(CONFIG_SMP)			:= SMP
> -preempt-flag-$(CONFIG_PREEMPT)          := PREEMPT
> +preempt-flag-$(CONFIG_PREEMPTION)       := PREEMPT_DYNAMIC
>  preempt-flag-$(CONFIG_PREEMPT_RT)	:= PREEMPT_RT
>  
>  build-version = $(or $(KBUILD_BUILD_VERSION), $(build-version-auto))
> diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
> index aa87b5cd3ecc..074fe5e253b5 100644
> --- a/kernel/Kconfig.preempt
> +++ b/kernel/Kconfig.preempt
> @@ -6,20 +6,23 @@ choice
>  
>  config PREEMPT_NONE
>  	bool "No Forced Preemption (Server)"
> +	select PREEMPTION
>  	help
>  	  This is the traditional Linux preemption model, geared towards
>  	  throughput. It will still provide good latencies most of the
> -	  time, but there are no guarantees and occasional longer delays
> -	  are possible.
> +	  time, but occasional delays are possible.
>  
>  	  Select this option if you are building a kernel for a server or
>  	  scientific/computation system, or if you want to maximize the
>  	  raw processing power of the kernel, irrespective of scheduling
> -	  latencies.
> +	  latencies. Unless your architecture actively disables preemption,
> +	  you can always switch to one of the other preemption models
> +	  at runtime.


> diff --git a/kernel/entry/common.c b/kernel/entry/common.c
> index 6433e6c77185..f7f2efabb5b5 100644
> --- a/kernel/entry/common.c
> +++ b/kernel/entry/common.c
> @@ -422,8 +422,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
>  		}
>  
>  		instrumentation_begin();
> -		if (IS_ENABLED(CONFIG_PREEMPTION))
> -			irqentry_exit_cond_resched();
> +		irqentry_exit_cond_resched();
>  		/* Covers both tracing and lockdep */
>  		trace_hardirqs_on();
>  		instrumentation_end();

I'm totally confused by the PREEMPT_NONE changes here. How does that
make sense?

Patch

diff --git a/init/Makefile b/init/Makefile
index 385fd80fa2ef..99e480f24cf3 100644
--- a/init/Makefile
+++ b/init/Makefile
@@ -24,7 +24,7 @@  mounts-$(CONFIG_BLK_DEV_INITRD)	+= do_mounts_initrd.o
 #
 
 smp-flag-$(CONFIG_SMP)			:= SMP
-preempt-flag-$(CONFIG_PREEMPT)          := PREEMPT
+preempt-flag-$(CONFIG_PREEMPTION)       := PREEMPT_DYNAMIC
 preempt-flag-$(CONFIG_PREEMPT_RT)	:= PREEMPT_RT
 
 build-version = $(or $(KBUILD_BUILD_VERSION), $(build-version-auto))
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index aa87b5cd3ecc..074fe5e253b5 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -6,20 +6,23 @@  choice
 
 config PREEMPT_NONE
 	bool "No Forced Preemption (Server)"
+	select PREEMPTION
 	help
 	  This is the traditional Linux preemption model, geared towards
 	  throughput. It will still provide good latencies most of the
-	  time, but there are no guarantees and occasional longer delays
-	  are possible.
+	  time, but occasional delays are possible.
 
 	  Select this option if you are building a kernel for a server or
 	  scientific/computation system, or if you want to maximize the
 	  raw processing power of the kernel, irrespective of scheduling
-	  latencies.
+	  latencies. Unless your architecture actively disables preemption,
+	  you can always switch to one of the other preemption models
+	  at runtime.
 
 config PREEMPT_VOLUNTARY
 	bool "Voluntary Kernel Preemption (Desktop)"
 	depends on !ARCH_NO_PREEMPT
+	select PREEMPTION
 	help
 	  This option reduces the latency of the kernel by adding more
 	  "explicit preemption points" to the kernel code. These new
@@ -53,7 +56,8 @@  config PREEMPT
 
 	  Select this if you are building a kernel for a desktop or
 	  embedded system with latency requirements in the milliseconds
-	  range.
+	  range. You can always switch to one of the lower preemption
+	  options at runtime.
 
 config PREEMPT_RT
 	bool "Fully Preemptible Kernel (Real-Time)"
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 6433e6c77185..f7f2efabb5b5 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -422,8 +422,7 @@  noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
 		}
 
 		instrumentation_begin();
-		if (IS_ENABLED(CONFIG_PREEMPTION))
-			irqentry_exit_cond_resched();
+		irqentry_exit_cond_resched();
 		/* Covers both tracing and lockdep */
 		trace_hardirqs_on();
 		instrumentation_end();
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f65bf3ce0e9d..2a50a64255c6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1065,7 +1065,7 @@  void __resched_curr(struct rq *rq, resched_t rs)
  *
  * Always schedule eagerly, if:
  *
- *  - running under full preemption
+ *  - running under full preemption (sched_feat(FORCE_PREEMPT))
  *
  *  - idle: when not polling (or if we don't have TIF_POLLING_NRFLAG)
  *    force TIF_NEED_RESCHED to be set and send a resched IPI.
@@ -1081,7 +1081,7 @@  void resched_curr(struct rq *rq)
 	resched_t rs = RESCHED_lazy;
 	int context;
 
-	if (IS_ENABLED(CONFIG_PREEMPT) ||
+	if (sched_feat(FORCE_PREEMPT) ||
 	    (rq->curr->sched_class == &idle_sched_class)) {
 		rs = RESCHED_eager;
 		goto resched;
@@ -1108,7 +1108,6 @@  void resched_curr(struct rq *rq)
 	context = ct_state_cpu(cpu_of(rq));
 	if ((context == CONTEXT_USER) ||
 	    (context == CONTEXT_GUEST)) {
-
 		rs = RESCHED_eager;
 		goto resched;
 	}
@@ -6597,20 +6596,18 @@  pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  *
  *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
  *
- *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
- *      paths. For example, see arch/x86/entry_64.S.
+ *   2. TIF_NEED_RESCHED flag is checked on interrupt and TIF_NEED_RESCHED[_LAZY]
+ *      flags on userspace return paths. For example, see arch/x86/entry_64.S.
  *
- *      To drive preemption between tasks, the scheduler sets the flag in timer
- *      interrupt handler scheduler_tick().
+ *      To drive preemption between tasks, the scheduler sets one of these
+ *      flags in timer interrupt handler scheduler_tick().
  *
  *   3. Wakeups don't really cause entry into schedule(). They add a
  *      task to the run-queue and that's it.
  *
- *      Now, if the new task added to the run-queue preempts the current
- *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
- *      called on the nearest possible occasion:
- *
- *       - If the kernel is preemptible (CONFIG_PREEMPTION=y):
+ *      - Now, if the new task added to the run-queue preempts the current
+ *        task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
+ *        called on the nearest possible occasion:
  *
  *         - in syscall or exception context, at the next outmost
  *           preempt_enable(). (this might be as soon as the wake_up()'s
@@ -6619,10 +6616,9 @@  pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  *         - in IRQ context, return from interrupt-handler to
  *           preemptible context
  *
- *       - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
- *         then at the next:
+ *      - If the new task preempts the current task, but the scheduling
+ *        policy only preempts voluntarily, then at the next:
  *
- *          - cond_resched() call
  *          - explicit schedule() call
  *          - return from syscall or exception to user-space
  *          - return from interrupt-handler to user-space
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index f770168230ae..9b4c2967b2b7 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -89,3 +89,9 @@  SCHED_FEAT(UTIL_EST_FASTUP, true)
 SCHED_FEAT(LATENCY_WARN, false)
 
 SCHED_FEAT(HZ_BW, true)
+
+#if defined(CONFIG_PREEMPT)
+SCHED_FEAT(FORCE_PREEMPT, true)
+#else
+SCHED_FEAT(FORCE_PREEMPT, false)
+#endif
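
For reference, the new feature bit is read with the existing
sched_feat() accessor; a hedged sketch of the runtime check that
resched_curr() above implements (RESCHED_eager and RESCHED_lazy are
introduced by earlier patches in this series):

	/* Choose how urgently to preempt the current task. */
	if (sched_feat(FORCE_PREEMPT))
		rs = RESCHED_eager;	/* full preemption: set
					 * TIF_NEED_RESCHED immediately */
	else
		rs = RESCHED_lazy;	/* none/voluntary: mark
					 * TIF_NEED_RESCHED_LAZY and let
					 * exit-to-user or an explicit
					 * scheduling point handle it */

Like any other scheduler feature, FORCE_PREEMPT should then be
togglable at runtime by writing FORCE_PREEMPT or NO_FORCE_PREEMPT to
the sched features debugfs file (/sys/kernel/debug/sched/features on
recent kernels), assuming the usual SCHED_FEAT plumbing.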