diff mbox

[RFC,07/16,v3] Init Workload Consolidation flags in sched_domain

Message ID 1401431772-14320-8-git-send-email-yuyang.du@intel.com (mailing list archive)
State RFC, archived
Headers show

Commit Message

Yuyang Du May 30, 2014, 6:36 a.m. UTC
Workload Consolidation can be enabled/disabled on the fly. This patchset
enables MC and CPU domain WC by default.

To enable CPU WC (SD_WORKLOAD_CONSOLIDATION=0x8000):

sysctl -w kernel.sched_domain.cpuX.domainY.flags += 0x8000

To disable CPU WC:

sysctl -w kernel.sched_domain.cpuX.domainY.flags -= 0x8000

Signed-off-by: Yuyang Du <yuyang.du@intel.com>
---
 include/linux/topology.h |    6 ++++++
 1 file changed, 6 insertions(+)

Comments

Peter Zijlstra June 3, 2014, 12:14 p.m. UTC | #1
On Fri, May 30, 2014 at 02:36:03PM +0800, Yuyang Du wrote:
> Workload Consolidation can be enabled/disabled on the fly. This patchset
> enables MC and CPU domain WC by default.
> 
> To enable CPU WC (SD_WORKLOAD_CONSOLIDATION=0x8000):
> 
> sysctl -w kernel.sched_domain.cpuX.domainY.flags += 0x8000
> 
> To disable CPU WC:
> 
> sysctl -w kernel.sched_domain.cpuX.domainY.flags -= 0x8000
> 
> Signed-off-by: Yuyang Du <yuyang.du@intel.com>
> ---
>  include/linux/topology.h |    6 ++++++
>  1 file changed, 6 insertions(+)
> 
> diff --git a/include/linux/topology.h b/include/linux/topology.h
> index 7062330..ebc339c3 100644
> --- a/include/linux/topology.h
> +++ b/include/linux/topology.h
> @@ -102,12 +102,14 @@ int arch_update_cpu_topology(void);
>  				| 0*SD_SERIALIZE			\
>  				| 0*SD_PREFER_SIBLING			\
>  				| arch_sd_sibling_asym_packing()	\
> +				| 0*SD_WORKLOAD_CONSOLIDATION	\
>  				,					\
>  	.last_balance		= jiffies,				\
>  	.balance_interval	= 1,					\
>  	.smt_gain		= 1178,	/* 15% */			\
>  	.max_newidle_lb_cost	= 0,					\
>  	.next_decay_max_lb_cost	= jiffies,				\
> +	.consolidating_coeff = 0,					\
>  }
>  #endif
>  #endif /* CONFIG_SCHED_SMT */
> @@ -134,11 +136,13 @@ int arch_update_cpu_topology(void);
>  				| 0*SD_SHARE_CPUPOWER			\
>  				| 1*SD_SHARE_PKG_RESOURCES		\
>  				| 0*SD_SERIALIZE			\
> +				| 1*SD_WORKLOAD_CONSOLIDATION	\
>  				,					\
>  	.last_balance		= jiffies,				\
>  	.balance_interval	= 1,					\
>  	.max_newidle_lb_cost	= 0,					\
>  	.next_decay_max_lb_cost	= jiffies,				\
> +	.consolidating_coeff = 180,					\
>  }
>  #endif
>  #endif /* CONFIG_SCHED_MC */
> @@ -167,11 +171,13 @@ int arch_update_cpu_topology(void);
>  				| 0*SD_SHARE_PKG_RESOURCES		\
>  				| 0*SD_SERIALIZE			\
>  				| 1*SD_PREFER_SIBLING			\
> +				| 1*SD_WORKLOAD_CONSOLIDATION	\
>  				,					\
>  	.last_balance		= jiffies,				\
>  	.balance_interval	= 1,					\
>  	.max_newidle_lb_cost	= 0,					\
>  	.next_decay_max_lb_cost	= jiffies,				\
> +	.consolidating_coeff = 180,					\
>  }
>  #endif

What tree are you working against? None of that exists anymore. Also, you
cannot unconditionally set this.
Dietmar Eggemann June 9, 2014, 5:56 p.m. UTC | #2
... turned out that probably the cc list was too big for lkml. Dropping
all the individual email addresses on CC.

... it seems that this message hasn't made it to the list. Apologies to
everyone on To: and Cc: receiving it again.

On 03/06/14 13:14, Peter Zijlstra wrote:
> On Fri, May 30, 2014 at 02:36:03PM +0800, Yuyang Du wrote:
>> Workload Consolidation can be enabled/disabled on the fly. This patchset
>> enables MC and CPU domain WC by default.
>>
>> To enable CPU WC (SD_WORKLOAD_CONSOLIDATION=0x8000):
>>
>> sysctl -w kernel.sched_domain.cpuX.domainY.flags += 0x8000
>>
>> To disable CPU WC:
>>
>> sysctl -w kernel.sched_domain.cpuX.domainY.flags -= 0x8000
>>
>> Signed-off-by: Yuyang Du <yuyang.du@intel.com>
>> ---
>>  include/linux/topology.h |    6 ++++++
>>  1 file changed, 6 insertions(+)
>>
>> diff --git a/include/linux/topology.h b/include/linux/topology.h
>> index 7062330..ebc339c3 100644
>> --- a/include/linux/topology.h
>> +++ b/include/linux/topology.h
>> @@ -102,12 +102,14 @@ int arch_update_cpu_topology(void);
>>  				| 0*SD_SERIALIZE			\
>>  				| 0*SD_PREFER_SIBLING			\
>>  				| arch_sd_sibling_asym_packing()	\
>> +				| 0*SD_WORKLOAD_CONSOLIDATION	\
>>  				,					\
>>  	.last_balance		= jiffies,				\
>>  	.balance_interval	= 1,					\
>>  	.smt_gain		= 1178,	/* 15% */			\
>>  	.max_newidle_lb_cost	= 0,					\
>>  	.next_decay_max_lb_cost	= jiffies,				\
>> +	.consolidating_coeff = 0,					\
>>  }
>>  #endif
>>  #endif /* CONFIG_SCHED_SMT */
>> @@ -134,11 +136,13 @@ int arch_update_cpu_topology(void);
>>  				| 0*SD_SHARE_CPUPOWER			\
>>  				| 1*SD_SHARE_PKG_RESOURCES		\
>>  				| 0*SD_SERIALIZE			\
>> +				| 1*SD_WORKLOAD_CONSOLIDATION	\
>>  				,					\
>>  	.last_balance		= jiffies,				\
>>  	.balance_interval	= 1,					\
>>  	.max_newidle_lb_cost	= 0,					\
>>  	.next_decay_max_lb_cost	= jiffies,				\
>> +	.consolidating_coeff = 180,					\
>>  }
>>  #endif
>>  #endif /* CONFIG_SCHED_MC */
>> @@ -167,11 +171,13 @@ int arch_update_cpu_topology(void);
>>  				| 0*SD_SHARE_PKG_RESOURCES		\
>>  				| 0*SD_SERIALIZE			\
>>  				| 1*SD_PREFER_SIBLING			\
>> +				| 1*SD_WORKLOAD_CONSOLIDATION	\
>>  				,					\
>>  	.last_balance		= jiffies,				\
>>  	.balance_interval	= 1,					\
>>  	.max_newidle_lb_cost	= 0,					\
>>  	.next_decay_max_lb_cost	= jiffies,				\
>> +	.consolidating_coeff = 180,					\
>>  }
>>  #endif
> 
> What tree are you working against? None of that exists anymore. Also, you
> cannot unconditionally set this.
> 

Hi Yuyang,

I'm running these patches on my ARM TC2 on top of
kernel/git/torvalds/linux.git (v3.15-rc7-79-gfe45736f4134). There're
considerable changes in the area of sched domain setup since Vincent's
patchset 'rework sched_domain topology description' (destined for v3.16)
which you can find on kernel/git/tip/tip.git .

Why did you make SD_WORKLOAD_CONSOLIDATION controllable via sysctl? All
the other SD flags are set during setup. Your top_flag_domain() function
takes care of figuring out what is the highest sd level this is set on
during load-balance but I can't find any good reason to do it this way
other than for testing purposes?

Setting SD_WORKLOAD_CONSOLIDATION  (which is probably a behavioural flag
rather than a topology description related one) on a certain sd level
requires you to also think about its implications in sd_init() and in sd
degenerate functionality.

-- Dietmar



--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yuyang Du June 9, 2014, 9:18 p.m. UTC | #3
On Mon, Jun 09, 2014 at 06:56:17PM +0100, Dietmar Eggemann wrote:

Thanks, Dietmar.

> I'm running these patches on my ARM TC2 on top of
> kernel/git/torvalds/linux.git (v3.15-rc7-79-gfe45736f4134). There're
> considerable changes in the area of sched domain setup since Vincent's
> patchset 'rework sched_domain topology description' (destined for v3.16)
> which you can find on kernel/git/tip/tip.git .
> 

Yeah, PeterZ pointed it out. It was on top of mainline not tip.

> Why did you make SD_WORKLOAD_CONSOLIDATION controllable via sysctl? All
> the other SD flags are set during setup.
> 

I don't understand. Any flag or parameter in sched_domain can be modified
on-the-fly after booting via sysctl. The SD_XXX_INIT is a template to make
the sched_domain initialization easier, IIUC.

Yes, I should not unconditionally enable SD_WORKLOAD_CONSOLIDATION in MC
and CPU domain (pointed out by PeterZ), but I did so for the purpose of
testing this patchset at this moment. Eventually, this flag should not be
turned on for any domain by default for many reasons, not to mention CPU
topology is getting more diverse/complex.

I just checked Vincent's "rework sched_domain topology description". The
general picture for init sched_domain does not change. If you work on top
of tip tree, you can simply skip this patch (0007), and after booting
enable SD_WORKLOAD_CONSOLIDATION by:

sysctl -w kernel.sched_domain.cpuX.domainY.flags += 0x8000
sysctl -w kernel.sched_domain.cpu0.domain1.consolidating_coeff=180
sysctl -w kernel.sched_cc_wakeup_threshold=80

> Your top_flag_domain() function
> takes care of figuring out what is the highest sd level this is set on
> during load-balance but I can't find any good reason to do it this way
> other than for testing purposes?

Any flag is used for testing whether it is set or not when encountering
it, including the flags in sched_domain for load balancing; this is why a flag
is called a flag. Is my flag any exception?

Thanks,
Yuyang
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dietmar Eggemann June 10, 2014, 11:52 a.m. UTC | #4
On 09/06/14 22:18, Yuyang Du wrote:
> On Mon, Jun 09, 2014 at 06:56:17PM +0100, Dietmar Eggemann wrote:
> 
> Thanks, Dietmar.
> 
>> I'm running these patches on my ARM TC2 on top of
>> kernel/git/torvalds/linux.git (v3.15-rc7-79-gfe45736f4134). There're
>> considerable changes in the area of sched domain setup since Vincent's
>> patchset 'rework sched_domain topology description' (destined for v3.16)
>> which you can find on kernel/git/tip/tip.git .
>>
> 
> Yeah, PeterZ pointed it out. It was on top of mainline not tip.
> 
>> Why did you make SD_WORKLOAD_CONSOLIDATION controllable via sysctl? All
>> the other SD flags are set during setup.
>>
> 
> I don't understand. Any flag or parameter in sched_domain can be modified
> on-the-fly after booting via sysctl. The SD_XXX_INIT is a template to make
> the sched_domain initialization easier, IIUC.

Technically true, but since the sysctl stuff is per-cpu and you want to
change per-domain data, you have to be extremely careful that each cpu
still sees the same data.

Another counter example, if I delete the SD_SHARE_PKG_RESOURCES flag on
my ARM TC2 system for all CPU's on domain0 (MC level) via sysctl, the
scheduler still has sd_llc assigned to the struct sched_domain for the
MC level of the CPU.

> 
> Yes, I should not unconditionally enable SD_WORKLOAD_CONSOLIDATION in MC
> and CPU domain (pointed out by PeterZ), but I did so for the purpose of
> testing this patchset at this moment. Eventually, this flag should not be
> turned on for any domain by default for many reasons, not to mention CPU
> topology is getting more diverse/complex.

But isn't this the point to show how and under which conditions you
would set this flag in the existing code? Since I guess it's a scheduler
behavioural (not a topology related one) flag, it has to be integrated
nicely into sd_init() etc.

> 
> I just checked Vincent's "rework sched_domain topology description". The
> general picture for init sched_domain does not change. If you work on top
> of tip tree, you can simply skip this patch (0007), and after booting
> enable SD_WORKLOAD_CONSOLIDATION by:
> 
> sysctl -w kernel.sched_domain.cpuX.domainY.flags += 0x8000
> sysctl -w kernel.sched_domain.cpu0.domain1.consolidating_coeff=180
> sysctl -w kernel.sched_cc_wakeup_threshold=80
> 
>> Your top_flag_domain() function
>> takes care of figuring out what is the highest sd level this is set on
>> during load-balance but I can't find any good reason to do it this way
>> other than for testing purposes?
> 
> Any flag is used for testing whether it is set or not when encountering
> it, including the flags in sched_domain for load balancing; this is why a flag
> is called a flag. Is my flag any exception?

Not in this sense but there is no functionality in the scheduler right
now to check constantly if an sd flag has been set/unset via sysctl.
IMHO, there's only sd_init and (highest/lowest)_flag_domain to cache
pointers to special sd's and both are called during start-up or cpu
hotplug
(init/partition_sched_domains()->build_sched_domains()->{build_sched_domain()->sd_init(),
cpu_attach_domain()->update_top_cache_domain()}).

-- Dietmar

> 
> Thanks,
> Yuyang
> 


--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yuyang Du June 10, 2014, 6:09 p.m. UTC | #5
On Tue, Jun 10, 2014 at 12:52:06PM +0100, Dietmar Eggemann wrote:

Hi Dietmar,

> Not in this sense but there is no functionality in the scheduler right
> now to check constantly if an sd flag has been set/unset via sysctl.

Sorry, I still don't understand. There are many "if (sd->flags & SD_XXX)"
in fair.c. What does it mean to you?

Probably you mean the SD_XX should be fixed in init and never changed via sysctl
thereafter. Ah... I don't know about this...

Overall, I think I should come up with a better way to implement the SD_WORKLOAD_CONSOLIDATION
policy (enabled or disabled) in load balancing (as is also pointed out by PeterZ).
But I just don't see how the current implementation is any different from
any other SD_XX's.

Have you tried it on your platform?

Thanks a lot,
Yuyang
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dietmar Eggemann June 11, 2014, 9:27 a.m. UTC | #6
On 10/06/14 19:09, Yuyang Du wrote:
> On Tue, Jun 10, 2014 at 12:52:06PM +0100, Dietmar Eggemann wrote:
> 
> Hi Dietmar,
> 
>> Not in this sense but there is no functionality in the scheduler right
>> now to check constantly if an sd flag has been set/unset via sysctl.
> 
> Sorry, I still don't understand. There are many "if (sd->flags & SD_XXX)"
> in fair.c. What does it mean to you?
> 
> Probably you mean the SD_XX should be fixed in init and never changed via sysctl
> thereafter. Ah... I don't know about this...

yes :-) I'm referring to your top_flag_domain() function which you need
to check what the highest sd level is where your flag is set. Existing
code only relies on flag setup during startup and after cpu hotplug or
on cached per-cpu sd pointers like sd_llc .

> 
> Overall, I think I should come up with a better way to implement the SD_WORKLOAD_CONSOLIDATION
> policy (enabled or disabled) in load balancing (as is also pointed out by PeterZ).
> But I just don't see how the current implementation is any different from
> any other SD_XX's.
> 
> Have you tried it on your platform?

I'm running these patches on my ARM TC2 (2 clusters (2 CPUs, 3 CPUs)) on
top of kernel/git/torvalds/linux.git (v3.15-rc7-79-gfe45736f4134). By
default, on this platform CC is enabled on MC and CPU level. Overall
workloads show very different behaviour (CC enabled on MC and CPU level
as well as only enabled on MC level) compared to test runs w/o CC but I
do not have the time to analyse it further. BTW, I hot-plugged out the
3rd CPU on the 2nd cluster (there is this comment on top of
__nonshielded_groups() 'every sched_group has the same weight').

-- Dietmar

> 
> Thanks a lot,
> Yuyang
> 


--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/linux/topology.h b/include/linux/topology.h
index 7062330..ebc339c3 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -102,12 +102,14 @@  int arch_update_cpu_topology(void);
 				| 0*SD_SERIALIZE			\
 				| 0*SD_PREFER_SIBLING			\
 				| arch_sd_sibling_asym_packing()	\
+				| 0*SD_WORKLOAD_CONSOLIDATION	\
 				,					\
 	.last_balance		= jiffies,				\
 	.balance_interval	= 1,					\
 	.smt_gain		= 1178,	/* 15% */			\
 	.max_newidle_lb_cost	= 0,					\
 	.next_decay_max_lb_cost	= jiffies,				\
+	.consolidating_coeff = 0,					\
 }
 #endif
 #endif /* CONFIG_SCHED_SMT */
@@ -134,11 +136,13 @@  int arch_update_cpu_topology(void);
 				| 0*SD_SHARE_CPUPOWER			\
 				| 1*SD_SHARE_PKG_RESOURCES		\
 				| 0*SD_SERIALIZE			\
+				| 1*SD_WORKLOAD_CONSOLIDATION	\
 				,					\
 	.last_balance		= jiffies,				\
 	.balance_interval	= 1,					\
 	.max_newidle_lb_cost	= 0,					\
 	.next_decay_max_lb_cost	= jiffies,				\
+	.consolidating_coeff = 180,					\
 }
 #endif
 #endif /* CONFIG_SCHED_MC */
@@ -167,11 +171,13 @@  int arch_update_cpu_topology(void);
 				| 0*SD_SHARE_PKG_RESOURCES		\
 				| 0*SD_SERIALIZE			\
 				| 1*SD_PREFER_SIBLING			\
+				| 1*SD_WORKLOAD_CONSOLIDATION	\
 				,					\
 	.last_balance		= jiffies,				\
 	.balance_interval	= 1,					\
 	.max_newidle_lb_cost	= 0,					\
 	.next_decay_max_lb_cost	= jiffies,				\
+	.consolidating_coeff = 180,					\
 }
 #endif