diff mbox series

[RFC,6/8] cpufreq: intel_pstate: Remove iowait boost

Message ID 20240905092645.2885200-7-christian.loehle@arm.com (mailing list archive)
State RFC, archived
Headers show
Series cpufreq: cpuidle: Remove iowait behaviour | expand

Commit Message

Christian Loehle Sept. 5, 2024, 9:26 a.m. UTC
Analogous to schedutil, remove iowait boost for the same reasons.

Signed-off-by: Christian Loehle <christian.loehle@arm.com>
---
 drivers/cpufreq/intel_pstate.c | 50 ++--------------------------------
 1 file changed, 3 insertions(+), 47 deletions(-)

Comments

Rafael J. Wysocki Sept. 30, 2024, 6:03 p.m. UTC | #1
+Srinivas who can say more about the reasons why iowait boosting makes
a difference for intel_pstate than I do.

On Thu, Sep 5, 2024 at 11:27 AM Christian Loehle
<christian.loehle@arm.com> wrote:
>
> Analogous to schedutil, remove iowait boost for the same reasons.

Well, first of all, iowait boosting was added to intel_pstate to help
some workloads that otherwise were underperforming.  I'm not sure if
you can simply remove it without introducing performance regressions
in those workloads.

While you can argue that it is not useful in schedutil any more due to
the improved scheduler input for it, you can hardly extend that
argument to intel_pstate because it doesn't use all of the scheduler
input used by schedutil.

Also, the EAS and UCLAMP_MAX arguments are not applicable to
intel_pstate because it doesn't support any of them.

This applies to the ondemand cpufreq governor either.


> Signed-off-by: Christian Loehle <christian.loehle@arm.com>
> ---
>  drivers/cpufreq/intel_pstate.c | 50 ++--------------------------------
>  1 file changed, 3 insertions(+), 47 deletions(-)
>
> diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
> index c0278d023cfc..7f30b2569bb3 100644
> --- a/drivers/cpufreq/intel_pstate.c
> +++ b/drivers/cpufreq/intel_pstate.c
> @@ -191,7 +191,6 @@ struct global_params {
>   * @policy:            CPUFreq policy value
>   * @update_util:       CPUFreq utility callback information
>   * @update_util_set:   CPUFreq utility callback is set
> - * @iowait_boost:      iowait-related boost fraction
>   * @last_update:       Time of the last update.
>   * @pstate:            Stores P state limits for this CPU
>   * @vid:               Stores VID limits for this CPU
> @@ -245,7 +244,6 @@ struct cpudata {
>         struct acpi_processor_performance acpi_perf_data;
>         bool valid_pss_table;
>  #endif
> -       unsigned int iowait_boost;
>         s16 epp_powersave;
>         s16 epp_policy;
>         s16 epp_default;
> @@ -2136,28 +2134,7 @@ static inline void intel_pstate_update_util_hwp_local(struct cpudata *cpu,
>  {
>         cpu->sample.time = time;
>
> -       if (cpu->sched_flags & SCHED_CPUFREQ_IOWAIT) {
> -               bool do_io = false;
> -
> -               cpu->sched_flags = 0;
> -               /*
> -                * Set iowait_boost flag and update time. Since IO WAIT flag
> -                * is set all the time, we can't just conclude that there is
> -                * some IO bound activity is scheduled on this CPU with just
> -                * one occurrence. If we receive at least two in two
> -                * consecutive ticks, then we treat as boost candidate.
> -                */
> -               if (time_before64(time, cpu->last_io_update + 2 * TICK_NSEC))
> -                       do_io = true;
> -
> -               cpu->last_io_update = time;
> -
> -               if (do_io)
> -                       intel_pstate_hwp_boost_up(cpu);
> -
> -       } else {
> -               intel_pstate_hwp_boost_down(cpu);
> -       }
> +       intel_pstate_hwp_boost_down(cpu);
>  }
>
>  static inline void intel_pstate_update_util_hwp(struct update_util_data *data,
> @@ -2240,9 +2217,6 @@ static inline int32_t get_target_pstate(struct cpudata *cpu)
>         busy_frac = div_fp(sample->mperf << cpu->aperf_mperf_shift,
>                            sample->tsc);
>
> -       if (busy_frac < cpu->iowait_boost)
> -               busy_frac = cpu->iowait_boost;
> -
>         sample->busy_scaled = busy_frac * 100;
>
>         target = READ_ONCE(global.no_turbo) ?
> @@ -2303,7 +2277,7 @@ static void intel_pstate_adjust_pstate(struct cpudata *cpu)
>                 sample->aperf,
>                 sample->tsc,
>                 get_avg_frequency(cpu),
> -               fp_toint(cpu->iowait_boost * 100));
> +               0);
>  }
>
>  static void intel_pstate_update_util(struct update_util_data *data, u64 time,
> @@ -2317,24 +2291,6 @@ static void intel_pstate_update_util(struct update_util_data *data, u64 time,
>                 return;
>
>         delta_ns = time - cpu->last_update;
> -       if (flags & SCHED_CPUFREQ_IOWAIT) {
> -               /* Start over if the CPU may have been idle. */
> -               if (delta_ns > TICK_NSEC) {
> -                       cpu->iowait_boost = ONE_EIGHTH_FP;
> -               } else if (cpu->iowait_boost >= ONE_EIGHTH_FP) {
> -                       cpu->iowait_boost <<= 1;
> -                       if (cpu->iowait_boost > int_tofp(1))
> -                               cpu->iowait_boost = int_tofp(1);
> -               } else {
> -                       cpu->iowait_boost = ONE_EIGHTH_FP;
> -               }
> -       } else if (cpu->iowait_boost) {
> -               /* Clear iowait_boost if the CPU may have been idle. */
> -               if (delta_ns > TICK_NSEC)
> -                       cpu->iowait_boost = 0;
> -               else
> -                       cpu->iowait_boost >>= 1;
> -       }
>         cpu->last_update = time;
>         delta_ns = time - cpu->sample.time;
>         if ((s64)delta_ns < INTEL_PSTATE_SAMPLING_INTERVAL)
> @@ -2832,7 +2788,7 @@ static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type, in
>                 sample->aperf,
>                 sample->tsc,
>                 get_avg_frequency(cpu),
> -               fp_toint(cpu->iowait_boost * 100));
> +               0);
>  }
>
>  static void intel_cpufreq_hwp_update(struct cpudata *cpu, u32 min, u32 max,
> --
> 2.34.1
>
srinivas pandruvada Sept. 30, 2024, 8:35 p.m. UTC | #2
On Mon, 2024-09-30 at 20:03 +0200, Rafael J. Wysocki wrote:
> +Srinivas who can say more about the reasons why iowait boosting
> makes
> a difference for intel_pstate than I do.
> 
It makes difference on Xeons and also GFX performance.
The actual gains will be model specific as it will be dependent on
hardware algorithms and EPP.

It was introduced to solve regression in Skylake xeons. But even in the
recent servers there are gains.
Refer to
https://lkml.iu.edu/hypermail/linux/kernel/1806.0/03574.html

Thanks,
Srinivas


> On Thu, Sep 5, 2024 at 11:27 AM Christian Loehle
> <christian.loehle@arm.com> wrote:
> > 
> > Analogous to schedutil, remove iowait boost for the same reasons.
> 
> Well, first of all, iowait boosting was added to intel_pstate to help
> some workloads that otherwise were underperforming.  I'm not sure if
> you can simply remove it without introducing performance regressions
> in those workloads.
> 
> While you can argue that it is not useful in schedutil any more due
> to
> the improved scheduler input for it, you can hardly extend that
> argument to intel_pstate because it doesn't use all of the scheduler
> input used by schedutil.
> 
> Also, the EAS and UCLAMP_MAX arguments are not applicable to
> intel_pstate because it doesn't support any of them.
> 
> This applies to the ondemand cpufreq governor either.
> 
> 
> > Signed-off-by: Christian Loehle <christian.loehle@arm.com>
> > ---
> >  drivers/cpufreq/intel_pstate.c | 50 ++----------------------------
> > ----
> >  1 file changed, 3 insertions(+), 47 deletions(-)
> > 
> > diff --git a/drivers/cpufreq/intel_pstate.c
> > b/drivers/cpufreq/intel_pstate.c
> > index c0278d023cfc..7f30b2569bb3 100644
> > --- a/drivers/cpufreq/intel_pstate.c
> > +++ b/drivers/cpufreq/intel_pstate.c
> > @@ -191,7 +191,6 @@ struct global_params {
> >   * @policy:            CPUFreq policy value
> >   * @update_util:       CPUFreq utility callback information
> >   * @update_util_set:   CPUFreq utility callback is set
> > - * @iowait_boost:      iowait-related boost fraction
> >   * @last_update:       Time of the last update.
> >   * @pstate:            Stores P state limits for this CPU
> >   * @vid:               Stores VID limits for this CPU
> > @@ -245,7 +244,6 @@ struct cpudata {
> >         struct acpi_processor_performance acpi_perf_data;
> >         bool valid_pss_table;
> >  #endif
> > -       unsigned int iowait_boost;
> >         s16 epp_powersave;
> >         s16 epp_policy;
> >         s16 epp_default;
> > @@ -2136,28 +2134,7 @@ static inline void
> > intel_pstate_update_util_hwp_local(struct cpudata *cpu,
> >  {
> >         cpu->sample.time = time;
> > 
> > -       if (cpu->sched_flags & SCHED_CPUFREQ_IOWAIT) {
> > -               bool do_io = false;
> > -
> > -               cpu->sched_flags = 0;
> > -               /*
> > -                * Set iowait_boost flag and update time. Since IO
> > WAIT flag
> > -                * is set all the time, we can't just conclude that
> > there is
> > -                * some IO bound activity is scheduled on this CPU
> > with just
> > -                * one occurrence. If we receive at least two in
> > two
> > -                * consecutive ticks, then we treat as boost
> > candidate.
> > -                */
> > -               if (time_before64(time, cpu->last_io_update + 2 *
> > TICK_NSEC))
> > -                       do_io = true;
> > -
> > -               cpu->last_io_update = time;
> > -
> > -               if (do_io)
> > -                       intel_pstate_hwp_boost_up(cpu);
> > -
> > -       } else {
> > -               intel_pstate_hwp_boost_down(cpu);
> > -       }
> > +       intel_pstate_hwp_boost_down(cpu);
> >  }
> > 
> >  static inline void intel_pstate_update_util_hwp(struct
> > update_util_data *data,
> > @@ -2240,9 +2217,6 @@ static inline int32_t
> > get_target_pstate(struct cpudata *cpu)
> >         busy_frac = div_fp(sample->mperf << cpu->aperf_mperf_shift,
> >                            sample->tsc);
> > 
> > -       if (busy_frac < cpu->iowait_boost)
> > -               busy_frac = cpu->iowait_boost;
> > -
> >         sample->busy_scaled = busy_frac * 100;
> > 
> >         target = READ_ONCE(global.no_turbo) ?
> > @@ -2303,7 +2277,7 @@ static void intel_pstate_adjust_pstate(struct
> > cpudata *cpu)
> >                 sample->aperf,
> >                 sample->tsc,
> >                 get_avg_frequency(cpu),
> > -               fp_toint(cpu->iowait_boost * 100));
> > +               0);
> >  }
> > 
> >  static void intel_pstate_update_util(struct update_util_data
> > *data, u64 time,
> > @@ -2317,24 +2291,6 @@ static void intel_pstate_update_util(struct
> > update_util_data *data, u64 time,
> >                 return;
> > 
> >         delta_ns = time - cpu->last_update;
> > -       if (flags & SCHED_CPUFREQ_IOWAIT) {
> > -               /* Start over if the CPU may have been idle. */
> > -               if (delta_ns > TICK_NSEC) {
> > -                       cpu->iowait_boost = ONE_EIGHTH_FP;
> > -               } else if (cpu->iowait_boost >= ONE_EIGHTH_FP) {
> > -                       cpu->iowait_boost <<= 1;
> > -                       if (cpu->iowait_boost > int_tofp(1))
> > -                               cpu->iowait_boost = int_tofp(1);
> > -               } else {
> > -                       cpu->iowait_boost = ONE_EIGHTH_FP;
> > -               }
> > -       } else if (cpu->iowait_boost) {
> > -               /* Clear iowait_boost if the CPU may have been
> > idle. */
> > -               if (delta_ns > TICK_NSEC)
> > -                       cpu->iowait_boost = 0;
> > -               else
> > -                       cpu->iowait_boost >>= 1;
> > -       }
> >         cpu->last_update = time;
> >         delta_ns = time - cpu->sample.time;
> >         if ((s64)delta_ns < INTEL_PSTATE_SAMPLING_INTERVAL)
> > @@ -2832,7 +2788,7 @@ static void intel_cpufreq_trace(struct
> > cpudata *cpu, unsigned int trace_type, in
> >                 sample->aperf,
> >                 sample->tsc,
> >                 get_avg_frequency(cpu),
> > -               fp_toint(cpu->iowait_boost * 100));
> > +               0);
> >  }
> > 
> >  static void intel_cpufreq_hwp_update(struct cpudata *cpu, u32 min,
> > u32 max,
> > --
> > 2.34.1
> >
Christian Loehle Oct. 1, 2024, 9:57 a.m. UTC | #3
On 9/30/24 21:35, srinivas pandruvada wrote:
> On Mon, 2024-09-30 at 20:03 +0200, Rafael J. Wysocki wrote:
>> +Srinivas who can say more about the reasons why iowait boosting
>> makes
>> a difference for intel_pstate than I do.
>>

Hi Srinivas,

> It makes difference on Xeons and also GFX performance.

AFAIU the GFX performance with iowait boost is a regression though,
because it cuts into the system power budget (CPU+GPU), especially
on desktop and mobile chips (but also some servers), no?
https://lore.kernel.org/lkml/20180730220029.81983-1-srinivas.pandruvada@linux.intel.com/
https://lore.kernel.org/lkml/e7388bf4-deb1-34b6-97d7-89ced8e78ef1@intel.com/
Or is there a reported case where iowait boosting helps
graphics workloads?

> The actual gains will be model specific as it will be dependent on
> hardware algorithms and EPP.
> 
> It was introduced to solve regression in Skylake xeons. But even in the
> recent servers there are gains.
> Refer to
> https://lkml.iu.edu/hypermail/linux/kernel/1806.0/03574.html

Did you look into PELT utilization values at that time?
I see why intel_pstate might be worse off than schedutil wrt removing
iowait boosting and do see two remedies essentially:
1. Boost after all sleeps (less aggressively), although I'm not a huge fan of
this.
2. If the gap between util_est and HWP-determined frequency is too large
then apply some boost. A sort of fallback on a schedutil strategy.
That would of course require util_est to be significantly large in those
scenarios.

I might try to propose something for 2, although as you can probably
guess, playing with HWP is somewhat uncharted waters for me.

Since intel_pstate will actually boost into unsustainable P-states,
there should be workloads that regress with iowait boosting. I'll
go looking for those.
srinivas pandruvada Oct. 1, 2024, 2:46 p.m. UTC | #4
Hi Christian,

On Tue, 2024-10-01 at 10:57 +0100, Christian Loehle wrote:
> On 9/30/24 21:35, srinivas pandruvada wrote:
> > On Mon, 2024-09-30 at 20:03 +0200, Rafael J. Wysocki wrote:
> > > +Srinivas who can say more about the reasons why iowait boosting
> > > makes
> > > a difference for intel_pstate than I do.
> > > 
> 
> Hi Srinivas,
> 
> > It makes difference on Xeons and also GFX performance.
> 
> AFAIU the GFX performance with iowait boost is a regression though,
> because it cuts into the system power budget (CPU+GPU), especially
> on desktop and mobile chips (but also some servers), no?
> https://lore.kernel.org/lkml/20180730220029.81983-1-srinivas.pandruvada@linux.intel.com/
> https://lore.kernel.org/lkml/e7388bf4-deb1-34b6-97d7-89ced8e78ef1@intel.com/
> Or is there a reported case where iowait boosting helps
> graphics workloads?
> 
GFX is complex as you have both cases depending on the generation. We
don't enable the control by default. There is a user space control, so
that it can be selected when it helps.


> > The actual gains will be model specific as it will be dependent on
> > hardware algorithms and EPP.
> > 
> > It was introduced to solve regression in Skylake xeons. But even in
> > the
> > recent servers there are gains.
> > Refer to
> > https://lkml.iu.edu/hypermail/linux/kernel/1806.0/03574.html
> 
> Did you look into PELT utilization values at that time?
No. But boost is needed for idle or semi-idle CPUs, otherwise HWP would
have already running at higher frequency. But we could avoid boot if
util is above a threshold.


> I see why intel_pstate might be worse off than schedutil wrt removing
> iowait boosting and do see two remedies essentially:
> 1. Boost after all sleeps (less aggressively), although I'm not a
> huge fan of
> this.
> 2. If the gap between util_est and HWP-determined frequency is too
> large
> then apply some boost. A sort of fallback on a schedutil strategy.
> That would of course require util_est to be significantly large in
> those
> scenarios.
> 
> I might try to propose something for 2, although as you can probably
> guess, playing with HWP is somewhat uncharted waters for me.
> 
Now we sample the last HWP determined frequency at every tick and can
use to avoid boost. So need some experiments.

> Since intel_pstate will actually boost into unsustainable P-states,
> there should be workloads that regress with iowait boosting. I'll
> go looking for those.

Xeons power limit is in order of 100s of Watts. So boost doesn't
generally to unsustainable state. Even if all cores boost at the same
time, the worst case we still get all core turbo.

Thanks,
Srinivas




> 
>
diff mbox series

Patch

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index c0278d023cfc..7f30b2569bb3 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -191,7 +191,6 @@  struct global_params {
  * @policy:		CPUFreq policy value
  * @update_util:	CPUFreq utility callback information
  * @update_util_set:	CPUFreq utility callback is set
- * @iowait_boost:	iowait-related boost fraction
  * @last_update:	Time of the last update.
  * @pstate:		Stores P state limits for this CPU
  * @vid:		Stores VID limits for this CPU
@@ -245,7 +244,6 @@  struct cpudata {
 	struct acpi_processor_performance acpi_perf_data;
 	bool valid_pss_table;
 #endif
-	unsigned int iowait_boost;
 	s16 epp_powersave;
 	s16 epp_policy;
 	s16 epp_default;
@@ -2136,28 +2134,7 @@  static inline void intel_pstate_update_util_hwp_local(struct cpudata *cpu,
 {
 	cpu->sample.time = time;
 
-	if (cpu->sched_flags & SCHED_CPUFREQ_IOWAIT) {
-		bool do_io = false;
-
-		cpu->sched_flags = 0;
-		/*
-		 * Set iowait_boost flag and update time. Since IO WAIT flag
-		 * is set all the time, we can't just conclude that there is
-		 * some IO bound activity is scheduled on this CPU with just
-		 * one occurrence. If we receive at least two in two
-		 * consecutive ticks, then we treat as boost candidate.
-		 */
-		if (time_before64(time, cpu->last_io_update + 2 * TICK_NSEC))
-			do_io = true;
-
-		cpu->last_io_update = time;
-
-		if (do_io)
-			intel_pstate_hwp_boost_up(cpu);
-
-	} else {
-		intel_pstate_hwp_boost_down(cpu);
-	}
+	intel_pstate_hwp_boost_down(cpu);
 }
 
 static inline void intel_pstate_update_util_hwp(struct update_util_data *data,
@@ -2240,9 +2217,6 @@  static inline int32_t get_target_pstate(struct cpudata *cpu)
 	busy_frac = div_fp(sample->mperf << cpu->aperf_mperf_shift,
 			   sample->tsc);
 
-	if (busy_frac < cpu->iowait_boost)
-		busy_frac = cpu->iowait_boost;
-
 	sample->busy_scaled = busy_frac * 100;
 
 	target = READ_ONCE(global.no_turbo) ?
@@ -2303,7 +2277,7 @@  static void intel_pstate_adjust_pstate(struct cpudata *cpu)
 		sample->aperf,
 		sample->tsc,
 		get_avg_frequency(cpu),
-		fp_toint(cpu->iowait_boost * 100));
+		0);
 }
 
 static void intel_pstate_update_util(struct update_util_data *data, u64 time,
@@ -2317,24 +2291,6 @@  static void intel_pstate_update_util(struct update_util_data *data, u64 time,
 		return;
 
 	delta_ns = time - cpu->last_update;
-	if (flags & SCHED_CPUFREQ_IOWAIT) {
-		/* Start over if the CPU may have been idle. */
-		if (delta_ns > TICK_NSEC) {
-			cpu->iowait_boost = ONE_EIGHTH_FP;
-		} else if (cpu->iowait_boost >= ONE_EIGHTH_FP) {
-			cpu->iowait_boost <<= 1;
-			if (cpu->iowait_boost > int_tofp(1))
-				cpu->iowait_boost = int_tofp(1);
-		} else {
-			cpu->iowait_boost = ONE_EIGHTH_FP;
-		}
-	} else if (cpu->iowait_boost) {
-		/* Clear iowait_boost if the CPU may have been idle. */
-		if (delta_ns > TICK_NSEC)
-			cpu->iowait_boost = 0;
-		else
-			cpu->iowait_boost >>= 1;
-	}
 	cpu->last_update = time;
 	delta_ns = time - cpu->sample.time;
 	if ((s64)delta_ns < INTEL_PSTATE_SAMPLING_INTERVAL)
@@ -2832,7 +2788,7 @@  static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type, in
 		sample->aperf,
 		sample->tsc,
 		get_avg_frequency(cpu),
-		fp_toint(cpu->iowait_boost * 100));
+		0);
 }
 
 static void intel_cpufreq_hwp_update(struct cpudata *cpu, u32 min, u32 max,