diff mbox series

[v2,3/3] lib/group_cpus.c: honor housekeeping config when grouping CPUs

Message ID 20240627-isolcpus-io-queues-v2-3-26a32e3c4f75@suse.de (mailing list archive)
State New, archived
Headers show
Series nvme-pci: honor isolcpus configuration | expand

Commit Message

Daniel Wagner June 27, 2024, 2:10 p.m. UTC
group_cpus_evenly distributes all present CPUs into groups. This ignores
the isolcpus configuration and assigns isolated CPUs into the groups.

Make group_cpus_evenly aware of isolcpus configuration and use the
housekeeping CPU mask as base for distributing the available CPUs into
groups.

Fixes: 11ea68f553e2 ("genirq, sched/isolation: Isolate from handling managed interrupts")
Signed-off-by: Daniel Wagner <dwagner@suse.de>
---
 lib/group_cpus.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 73 insertions(+), 2 deletions(-)

Comments

Christoph Hellwig June 28, 2024, 6:03 a.m. UTC | #1
Looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>
Hannes Reinecke June 28, 2024, 6:24 a.m. UTC | #2
On 6/27/24 16:10, Daniel Wagner wrote:
> group_cpus_evenly distributes all present CPUs into groups. This ignores
> the isolcpus configuration and assigns isolated CPUs into the groups.
> 
> Make group_cpus_evenly aware of isolcpus configuration and use the
> housekeeping CPU mask as base for distributing the available CPUs into
> groups.
> 
> Fixes: 11ea68f553e2 ("genirq, sched/isolation: Isolate from handling managed interrupts")
> Signed-off-by: Daniel Wagner <dwagner@suse.de>
> ---
>   lib/group_cpus.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--
>   1 file changed, 73 insertions(+), 2 deletions(-)
> 
Reviewed-by: Hannes Reinecke <hare@suse.de>

Cheers,

Hannes
Sagi Grimberg June 30, 2024, 8:25 a.m. UTC | #3
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Ming Lei June 30, 2024, 1:39 p.m. UTC | #4
On Thu, Jun 27, 2024 at 04:10:53PM +0200, Daniel Wagner wrote:
> group_cpus_evenly distributes all present CPUs into groups. This ignores

The above isn't true; it is really cpu_possible_mask that is
distributed, not all present CPUs.

> the isolcpus configuration and assigns isolated CPUs into the groups.
> 
> Make group_cpus_evenly aware of isolcpus configuration and use the
> housekeeping CPU mask as base for distributing the available CPUs into
> groups.
> 
> Fixes: 11ea68f553e2 ("genirq, sched/isolation: Isolate from handling managed interrupts")

Isolated CPUs are actually handled when figuring out the irq effective mask,
so I'm not sure how commit 11ea68f553e2 is wrong, or what this patch fixes
from the user's viewpoint.


Thanks, 
Ming
Ming Lei July 1, 2024, 2:09 a.m. UTC | #5
On Thu, Jun 27, 2024 at 04:10:53PM +0200, Daniel Wagner wrote:
> group_cpus_evenly distributes all present CPUs into groups. This ignores
> the isolcpus configuration and assigns isolated CPUs into the groups.
> 
> Make group_cpus_evenly aware of isolcpus configuration and use the
> housekeeping CPU mask as base for distributing the available CPUs into
> groups.
> 
> Fixes: 11ea68f553e2 ("genirq, sched/isolation: Isolate from handling managed interrupts")
> Signed-off-by: Daniel Wagner <dwagner@suse.de>
> ---
>  lib/group_cpus.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 73 insertions(+), 2 deletions(-)
> 
> diff --git a/lib/group_cpus.c b/lib/group_cpus.c
> index ee272c4cefcc..19fb7186f9d4 100644
> --- a/lib/group_cpus.c
> +++ b/lib/group_cpus.c
> @@ -8,6 +8,7 @@
>  #include <linux/cpu.h>
>  #include <linux/sort.h>
>  #include <linux/group_cpus.h>
> +#include <linux/sched/isolation.h>
>  
>  #ifdef CONFIG_SMP
>  
> @@ -330,7 +331,7 @@ static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
>  }
>  
>  /**
> - * group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality
> + * group_possible_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality
>   * @numgrps: number of groups
>   *
>   * Return: cpumask array if successful, NULL otherwise. And each element
> @@ -344,7 +345,7 @@ static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
>   * We guarantee in the resulted grouping that all CPUs are covered, and
>   * no same CPU is assigned to multiple groups
>   */
> -struct cpumask *group_cpus_evenly(unsigned int numgrps)
> +static struct cpumask *group_possible_cpus_evenly(unsigned int numgrps)
>  {
>  	unsigned int curgrp = 0, nr_present = 0, nr_others = 0;
>  	cpumask_var_t *node_to_cpumask;
> @@ -423,6 +424,76 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps)
>  	}
>  	return masks;
>  }
> +
> +/**
> + * group_mask_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality
> + * @numgrps: number of groups
> + * @cpu_mask: CPU to consider for the grouping
> + *
> + * Return: cpumask array if successful, NULL otherwise. And each element
> + * includes CPUs assigned to this group.
> + *
> + * Try to put close CPUs from viewpoint of CPU and NUMA locality into
> + * same group. Allocate present CPUs on these groups evenly.
> + */
> +static struct cpumask *group_mask_cpus_evenly(unsigned int numgrps,
> +					      const struct cpumask *cpu_mask)
> +{
> +	cpumask_var_t *node_to_cpumask;
> +	cpumask_var_t nmsk;
> +	int ret = -ENOMEM;
> +	struct cpumask *masks = NULL;
> +
> +	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
> +		return NULL;
> +
> +	node_to_cpumask = alloc_node_to_cpumask();
> +	if (!node_to_cpumask)
> +		goto fail_nmsk;
> +
> +	masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);
> +	if (!masks)
> +		goto fail_node_to_cpumask;
> +
> +	build_node_to_cpumask(node_to_cpumask);
> +
> +	ret = __group_cpus_evenly(0, numgrps, node_to_cpumask, cpu_mask, nmsk,
> +				  masks);
> +
> +fail_node_to_cpumask:
> +	free_node_to_cpumask(node_to_cpumask);
> +
> +fail_nmsk:
> +	free_cpumask_var(nmsk);
> +	if (ret < 0) {
> +		kfree(masks);
> +		return NULL;
> +	}
> +	return masks;
> +}
> +
> +/**
> + * group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality
> + * @numgrps: number of groups
> + *
> + * Return: cpumask array if successful, NULL otherwise.
> + *
> + * group_possible_cpus_evently() is used for distributing the cpus on all
> + * possible cpus in absence of isolcpus command line argument.
> + * group_mask_cpu_evenly() is used when the isolcpus command line
> + * argument is used with managed_irq option. In this case only the
> + * housekeeping CPUs are considered.
> + */
> +struct cpumask *group_cpus_evenly(unsigned int numgrps)
> +{
> +	const struct cpumask *hk_mask;
> +
> +	hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ);
> +	if (!cpumask_empty(hk_mask))
> +		return group_mask_cpus_evenly(numgrps, hk_mask);
> +
> +	return group_possible_cpus_evenly(numgrps);

Since this patch, some isolated CPUs may not be covered in the
blk-mq queue mapping.

Meanwhile, people may still submit IO workloads from isolated CPUs,
for example via 'taskset -c', and blk-mq may not work well in this
situation; for example, an IO hang may be caused during CPU hotplug.

I did see this kind of usage in some RH Openshift workloads.

If the blk-mq problem can be solved, I am fine with this kind of
change.


Thanks,
Ming
Hannes Reinecke July 1, 2024, 6:43 a.m. UTC | #6
On 7/1/24 04:09, Ming Lei wrote:
> On Thu, Jun 27, 2024 at 04:10:53PM +0200, Daniel Wagner wrote:
>> group_cpus_evenly distributes all present CPUs into groups. This ignores
>> the isolcpus configuration and assigns isolated CPUs into the groups.
>>
>> Make group_cpus_evenly aware of isolcpus configuration and use the
>> housekeeping CPU mask as base for distributing the available CPUs into
>> groups.
>>
>> Fixes: 11ea68f553e2 ("genirq, sched/isolation: Isolate from handling managed interrupts")
>> Signed-off-by: Daniel Wagner <dwagner@suse.de>
>> ---
>>   lib/group_cpus.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--
>>   1 file changed, 73 insertions(+), 2 deletions(-)
>>
>> diff --git a/lib/group_cpus.c b/lib/group_cpus.c
>> index ee272c4cefcc..19fb7186f9d4 100644
>> --- a/lib/group_cpus.c
>> +++ b/lib/group_cpus.c
>> @@ -8,6 +8,7 @@
>>   #include <linux/cpu.h>
>>   #include <linux/sort.h>
>>   #include <linux/group_cpus.h>
>> +#include <linux/sched/isolation.h>
>>   
>>   #ifdef CONFIG_SMP
>>   
>> @@ -330,7 +331,7 @@ static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
>>   }
>>   
>>   /**
>> - * group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality
>> + * group_possible_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality
>>    * @numgrps: number of groups
>>    *
>>    * Return: cpumask array if successful, NULL otherwise. And each element
>> @@ -344,7 +345,7 @@ static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
>>    * We guarantee in the resulted grouping that all CPUs are covered, and
>>    * no same CPU is assigned to multiple groups
>>    */
>> -struct cpumask *group_cpus_evenly(unsigned int numgrps)
>> +static struct cpumask *group_possible_cpus_evenly(unsigned int numgrps)
>>   {
>>   	unsigned int curgrp = 0, nr_present = 0, nr_others = 0;
>>   	cpumask_var_t *node_to_cpumask;
>> @@ -423,6 +424,76 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps)
>>   	}
>>   	return masks;
>>   }
>> +
>> +/**
>> + * group_mask_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality
>> + * @numgrps: number of groups
>> + * @cpu_mask: CPU to consider for the grouping
>> + *
>> + * Return: cpumask array if successful, NULL otherwise. And each element
>> + * includes CPUs assigned to this group.
>> + *
>> + * Try to put close CPUs from viewpoint of CPU and NUMA locality into
>> + * same group. Allocate present CPUs on these groups evenly.
>> + */
>> +static struct cpumask *group_mask_cpus_evenly(unsigned int numgrps,
>> +					      const struct cpumask *cpu_mask)
>> +{
>> +	cpumask_var_t *node_to_cpumask;
>> +	cpumask_var_t nmsk;
>> +	int ret = -ENOMEM;
>> +	struct cpumask *masks = NULL;
>> +
>> +	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
>> +		return NULL;
>> +
>> +	node_to_cpumask = alloc_node_to_cpumask();
>> +	if (!node_to_cpumask)
>> +		goto fail_nmsk;
>> +
>> +	masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);
>> +	if (!masks)
>> +		goto fail_node_to_cpumask;
>> +
>> +	build_node_to_cpumask(node_to_cpumask);
>> +
>> +	ret = __group_cpus_evenly(0, numgrps, node_to_cpumask, cpu_mask, nmsk,
>> +				  masks);
>> +
>> +fail_node_to_cpumask:
>> +	free_node_to_cpumask(node_to_cpumask);
>> +
>> +fail_nmsk:
>> +	free_cpumask_var(nmsk);
>> +	if (ret < 0) {
>> +		kfree(masks);
>> +		return NULL;
>> +	}
>> +	return masks;
>> +}
>> +
>> +/**
>> + * group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality
>> + * @numgrps: number of groups
>> + *
>> + * Return: cpumask array if successful, NULL otherwise.
>> + *
>> + * group_possible_cpus_evently() is used for distributing the cpus on all
>> + * possible cpus in absence of isolcpus command line argument.
>> + * group_mask_cpu_evenly() is used when the isolcpus command line
>> + * argument is used with managed_irq option. In this case only the
>> + * housekeeping CPUs are considered.
>> + */
>> +struct cpumask *group_cpus_evenly(unsigned int numgrps)
>> +{
>> +	const struct cpumask *hk_mask;
>> +
>> +	hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ);
>> +	if (!cpumask_empty(hk_mask))
>> +		return group_mask_cpus_evenly(numgrps, hk_mask);
>> +
>> +	return group_possible_cpus_evenly(numgrps);
> 
> Since this patch, some isolated CPUs may not be covered in
> blk-mq queue mapping.
> 
> Meantime people still may submit IO workload from isolated CPUs
> such as by 'taskset -c', blk-mq may not work well for this situation,
> for example, IO hang may be caused during cpu hotplug.
> 
> I did see this kind of usage in some RH Openshift workloads.
> 
> If blk-mq problem can be solved, I am fine with this kind of
> change.
> 
That was kinda the idea of this patchset; when 'isolcpus' is active any 
in-kernel driver can only run on the housekeeping CPUs, and I/O from the 
isolcpus is impossible.
(Otherwise they won't be isolated anymore, and the whole concept
becomes ever so shaky.)
Consequently we should not spread blk-mq onto the isolcpus (which is 
what this patchset attempts). We do need to check how we could inhibit 
I/O from the isolcpus, though; not sure if we do that now.
Something we need to check.

Cheers,

Hannes
Daniel Wagner July 1, 2024, 7:08 a.m. UTC | #7
On Sun, Jun 30, 2024 at 09:39:59PM GMT, Ming Lei wrote:
> > Make group_cpus_evenly aware of isolcpus configuration and use the
> > housekeeping CPU mask as base for distributing the available CPUs into
> > groups.
> > 
> > Fixes: 11ea68f553e2 ("genirq, sched/isolation: Isolate from handling managed interrupts")
> 
> isolated CPUs are actually handled when figuring out irq effective mask,
> so not sure how commit 11ea68f553e2 is wrong, and what is fixed in this
> patch from user viewpoint?

IO queues are allocated/spread on the isolated CPUs, and if there is a
thread submitting IOs from an isolated CPU it will cause noise on the
isolated CPUs. The question is: is this a use case you need/want to support?
We have customers who are complaining that even with isolcpus provided
they still see IO noise on the isolated CPUs.
Ming Lei July 1, 2024, 7:10 a.m. UTC | #8
On Mon, Jul 01, 2024 at 08:43:34AM +0200, Hannes Reinecke wrote:
> On 7/1/24 04:09, Ming Lei wrote:
> > On Thu, Jun 27, 2024 at 04:10:53PM +0200, Daniel Wagner wrote:
> > > group_cpus_evenly distributes all present CPUs into groups. This ignores
> > > the isolcpus configuration and assigns isolated CPUs into the groups.
> > > 
> > > Make group_cpus_evenly aware of isolcpus configuration and use the
> > > housekeeping CPU mask as base for distributing the available CPUs into
> > > groups.
> > > 
> > > Fixes: 11ea68f553e2 ("genirq, sched/isolation: Isolate from handling managed interrupts")
> > > Signed-off-by: Daniel Wagner <dwagner@suse.de>
> > > ---
> > >   lib/group_cpus.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--
> > >   1 file changed, 73 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/lib/group_cpus.c b/lib/group_cpus.c
> > > index ee272c4cefcc..19fb7186f9d4 100644
> > > --- a/lib/group_cpus.c
> > > +++ b/lib/group_cpus.c
> > > @@ -8,6 +8,7 @@
> > >   #include <linux/cpu.h>
> > >   #include <linux/sort.h>
> > >   #include <linux/group_cpus.h>
> > > +#include <linux/sched/isolation.h>
> > >   #ifdef CONFIG_SMP
> > > @@ -330,7 +331,7 @@ static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
> > >   }
> > >   /**
> > > - * group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality
> > > + * group_possible_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality
> > >    * @numgrps: number of groups
> > >    *
> > >    * Return: cpumask array if successful, NULL otherwise. And each element
> > > @@ -344,7 +345,7 @@ static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
> > >    * We guarantee in the resulted grouping that all CPUs are covered, and
> > >    * no same CPU is assigned to multiple groups
> > >    */
> > > -struct cpumask *group_cpus_evenly(unsigned int numgrps)
> > > +static struct cpumask *group_possible_cpus_evenly(unsigned int numgrps)
> > >   {
> > >   	unsigned int curgrp = 0, nr_present = 0, nr_others = 0;
> > >   	cpumask_var_t *node_to_cpumask;
> > > @@ -423,6 +424,76 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps)
> > >   	}
> > >   	return masks;
> > >   }
> > > +
> > > +/**
> > > + * group_mask_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality
> > > + * @numgrps: number of groups
> > > + * @cpu_mask: CPU to consider for the grouping
> > > + *
> > > + * Return: cpumask array if successful, NULL otherwise. And each element
> > > + * includes CPUs assigned to this group.
> > > + *
> > > + * Try to put close CPUs from viewpoint of CPU and NUMA locality into
> > > + * same group. Allocate present CPUs on these groups evenly.
> > > + */
> > > +static struct cpumask *group_mask_cpus_evenly(unsigned int numgrps,
> > > +					      const struct cpumask *cpu_mask)
> > > +{
> > > +	cpumask_var_t *node_to_cpumask;
> > > +	cpumask_var_t nmsk;
> > > +	int ret = -ENOMEM;
> > > +	struct cpumask *masks = NULL;
> > > +
> > > +	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
> > > +		return NULL;
> > > +
> > > +	node_to_cpumask = alloc_node_to_cpumask();
> > > +	if (!node_to_cpumask)
> > > +		goto fail_nmsk;
> > > +
> > > +	masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);
> > > +	if (!masks)
> > > +		goto fail_node_to_cpumask;
> > > +
> > > +	build_node_to_cpumask(node_to_cpumask);
> > > +
> > > +	ret = __group_cpus_evenly(0, numgrps, node_to_cpumask, cpu_mask, nmsk,
> > > +				  masks);
> > > +
> > > +fail_node_to_cpumask:
> > > +	free_node_to_cpumask(node_to_cpumask);
> > > +
> > > +fail_nmsk:
> > > +	free_cpumask_var(nmsk);
> > > +	if (ret < 0) {
> > > +		kfree(masks);
> > > +		return NULL;
> > > +	}
> > > +	return masks;
> > > +}
> > > +
> > > +/**
> > > + * group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality
> > > + * @numgrps: number of groups
> > > + *
> > > + * Return: cpumask array if successful, NULL otherwise.
> > > + *
> > > + * group_possible_cpus_evently() is used for distributing the cpus on all
> > > + * possible cpus in absence of isolcpus command line argument.
> > > + * group_mask_cpu_evenly() is used when the isolcpus command line
> > > + * argument is used with managed_irq option. In this case only the
> > > + * housekeeping CPUs are considered.
> > > + */
> > > +struct cpumask *group_cpus_evenly(unsigned int numgrps)
> > > +{
> > > +	const struct cpumask *hk_mask;
> > > +
> > > +	hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ);
> > > +	if (!cpumask_empty(hk_mask))
> > > +		return group_mask_cpus_evenly(numgrps, hk_mask);
> > > +
> > > +	return group_possible_cpus_evenly(numgrps);
> > 
> > Since this patch, some isolated CPUs may not be covered in
> > blk-mq queue mapping.
> > 
> > Meantime people still may submit IO workload from isolated CPUs
> > such as by 'taskset -c', blk-mq may not work well for this situation,
> > for example, IO hang may be caused during cpu hotplug.
> > 
> > I did see this kind of usage in some RH Openshift workloads.
> > 
> > If blk-mq problem can be solved, I am fine with this kind of
> > change.
> > 
> That was kinda the idea of this patchset; when 'isolcpus' is active any
> in-kernel driver can only run on the housekeeping CPUs, and I/O from the
> isolcpus is impossible.
> (Otherwise they won't be isolated anymore, and the whole concepts becomes
> ever so shaky.).

Userspace may still force an IO workload to run on isolated CPUs when it
does not care about CPU isolation, and the kernel should still complete IO
from isolated CPUs without running into a hang or panic.

We do support this kind of usage now, so this patch causes a
regression.

Thanks, 
Ming
Ming Lei July 1, 2024, 7:21 a.m. UTC | #9
On Mon, Jul 01, 2024 at 09:08:32AM +0200, Daniel Wagner wrote:
> On Sun, Jun 30, 2024 at 09:39:59PM GMT, Ming Lei wrote:
> > > Make group_cpus_evenly aware of isolcpus configuration and use the
> > > housekeeping CPU mask as base for distributing the available CPUs into
> > > groups.
> > > 
> > > Fixes: 11ea68f553e2 ("genirq, sched/isolation: Isolate from handling managed interrupts")
> > 
> > isolated CPUs are actually handled when figuring out irq effective mask,
> > so not sure how commit 11ea68f553e2 is wrong, and what is fixed in this
> > patch from user viewpoint?
> 
> IO queues are allocated/spread on the isolated CPUs and if there is an
> thread submitting IOs from an isolated CPU it will cause noise on the
> isolated CPUs. The question is this a use case you need/want to support?

I talked to the RH OpenShift team weeks ago and they have such usage.

userspace is free to run any application from isolated CPUs via 'taskset
-c' even though 'isolcpus=' is passed from command line.

Kernel can not add such new constraint on userspace.

> We have customers who are complaining that even with isolcpus provided
> they still see IO noise on the isolated CPUs.

That is another issue, which has been fixed by the following patch:

a46c27026da1 blk-mq: don't schedule block kworker on isolated CPUs



Thanks,
Ming
Daniel Wagner July 1, 2024, 8:19 a.m. UTC | #10
On Mon, Jul 01, 2024 at 03:21:13PM GMT, Ming Lei wrote:
> On Mon, Jul 01, 2024 at 09:08:32AM +0200, Daniel Wagner wrote:
> > On Sun, Jun 30, 2024 at 09:39:59PM GMT, Ming Lei wrote:
> > > > Make group_cpus_evenly aware of isolcpus configuration and use the
> > > > housekeeping CPU mask as base for distributing the available CPUs into
> > > > groups.
> > > > 
> > > > Fixes: 11ea68f553e2 ("genirq, sched/isolation: Isolate from handling managed interrupts")
> > > 
> > > isolated CPUs are actually handled when figuring out irq effective mask,
> > > so not sure how commit 11ea68f553e2 is wrong, and what is fixed in this
> > > patch from user viewpoint?
> > 
> > IO queues are allocated/spread on the isolated CPUs and if there is an
> > thread submitting IOs from an isolated CPU it will cause noise on the
> > isolated CPUs. The question is this a use case you need/want to support?
> 
> I have talked RH Openshift team weeks ago and they have such usage.
> 
> userspace is free to run any application from isolated CPUs via 'taskset
> -c' even though 'isolcpus=' is passed from command line.
>
> Kernel can not add such new constraint on userspace.

Okay, that is why I asked if we need an additional HK type.

> > We have customers who are complaining that even with isolcpus provided
> > they still see IO noise on the isolated CPUs.
> 
> That is another issue, which has been fixed by the following patch:
> 
> a46c27026da1 blk-mq: don't schedule block kworker on isolated CPUs

I've checked our downstream kernels and we don't have this one yet. I'll
ask our customer to test if this patch addressed their issue.

Thanks!
Daniel
Hannes Reinecke July 1, 2024, 8:37 a.m. UTC | #11
On 7/1/24 09:10, Ming Lei wrote:
> On Mon, Jul 01, 2024 at 08:43:34AM +0200, Hannes Reinecke wrote:
>> On 7/1/24 04:09, Ming Lei wrote:
[ .. ]
>>>
>>> Since this patch, some isolated CPUs may not be covered in
>>> blk-mq queue mapping.
>>>
>>> Meantime people still may submit IO workload from isolated CPUs
>>> such as by 'taskset -c', blk-mq may not work well for this situation,
>>> for example, IO hang may be caused during cpu hotplug.
>>>
>>> I did see this kind of usage in some RH Openshift workloads.
>>>
>>> If blk-mq problem can be solved, I am fine with this kind of
>>> change.
>>>
>> That was kinda the idea of this patchset; when 'isolcpus' is active any
>> in-kernel driver can only run on the housekeeping CPUs, and I/O from the
>> isolcpus is impossible.
>> (Otherwise they won't be isolated anymore, and the whole concepts becomes
>> ever so shaky.).
> 
> Userspace may still force to run IO workload from isolated CPUs when they do
> not care CPU isolation, and kernel still should complete IO from isolated CPUs,
> and can't run into hang or panic meantime.
> 
> And we do support this kind of usage now, then regression is caused by
> this patch.
> 
Hmm. Guess we need to modify the grouping algorithm to group across all 
cpus, but ensure that each group consists either of all housekeeping 
CPUs or all isolated cpus.
Daniel?

Cheers,

Hannes
Hannes Reinecke July 1, 2024, 8:43 a.m. UTC | #12
On 7/1/24 09:21, Ming Lei wrote:
> On Mon, Jul 01, 2024 at 09:08:32AM +0200, Daniel Wagner wrote:
>> On Sun, Jun 30, 2024 at 09:39:59PM GMT, Ming Lei wrote:
>>>> Make group_cpus_evenly aware of isolcpus configuration and use the
>>>> housekeeping CPU mask as base for distributing the available CPUs into
>>>> groups.
>>>>
>>>> Fixes: 11ea68f553e2 ("genirq, sched/isolation: Isolate from handling managed interrupts")
>>>
>>> isolated CPUs are actually handled when figuring out irq effective mask,
>>> so not sure how commit 11ea68f553e2 is wrong, and what is fixed in this
>>> patch from user viewpoint?
>>
>> IO queues are allocated/spread on the isolated CPUs and if there is an
>> thread submitting IOs from an isolated CPU it will cause noise on the
>> isolated CPUs. The question is this a use case you need/want to support?
> 
> I have talked RH Openshift team weeks ago and they have such usage.
> 
> userspace is free to run any application from isolated CPUs via 'taskset
> -c' even though 'isolcpus=' is passed from command line.
> 
> Kernel can not add such new constraint on userspace.
> 
>> We have customers who are complaining that even with isolcpus provided
>> they still see IO noise on the isolated CPUs.
> 
> That is another issue, which has been fixed by the following patch:
> 
> a46c27026da1 blk-mq: don't schedule block kworker on isolated CPUs
> 
Hmm. Just when I thought I understood the issue ...

How is this supposed to work, then, given that I/O can be initiated
from the isolated CPUs?
I would have accepted that we have two scheduling domains, blk-mq is
spread across all cpus, and the blk-mq cpusets are arranged according
to the isolcpu settings.
Then we can initiate I/O from the isolated cpus, and the scheduler
would 'magically' ensure that everything is only run on isolated cpus.

But that patch would completely counteract such a setup, as during
I/O we more often than not will invoke kblockd, which then would cause
cross-talk on non-isolated cpus.

What is the idea here?

Confused,

Hannes
Ming Lei July 1, 2024, 8:47 a.m. UTC | #13
On Mon, Jul 01, 2024 at 10:19:25AM +0200, Daniel Wagner wrote:
> On Mon, Jul 01, 2024 at 03:21:13PM GMT, Ming Lei wrote:
> > On Mon, Jul 01, 2024 at 09:08:32AM +0200, Daniel Wagner wrote:
> > > On Sun, Jun 30, 2024 at 09:39:59PM GMT, Ming Lei wrote:
> > > > > Make group_cpus_evenly aware of isolcpus configuration and use the
> > > > > housekeeping CPU mask as base for distributing the available CPUs into
> > > > > groups.
> > > > > 
> > > > > Fixes: 11ea68f553e2 ("genirq, sched/isolation: Isolate from handling managed interrupts")
> > > > 
> > > > isolated CPUs are actually handled when figuring out irq effective mask,
> > > > so not sure how commit 11ea68f553e2 is wrong, and what is fixed in this
> > > > patch from user viewpoint?
> > > 
> > > IO queues are allocated/spread on the isolated CPUs and if there is an
> > > thread submitting IOs from an isolated CPU it will cause noise on the
> > > isolated CPUs. The question is this a use case you need/want to support?
> > 
> > I have talked RH Openshift team weeks ago and they have such usage.
> > 
> > userspace is free to run any application from isolated CPUs via 'taskset
> > -c' even though 'isolcpus=' is passed from command line.
> >
> > Kernel can not add such new constraint on userspace.
> 
> Okay, that is why I asked if we need an additional HK type.
> 
> > > We have customers who are complaining that even with isolcpus provided
> > > they still see IO noise on the isolated CPUs.
> > 
> > That is another issue, which has been fixed by the following patch:
> > 
> > a46c27026da1 blk-mq: don't schedule block kworker on isolated CPUs
> 
> I've checked our downstream kernels and we don't have this one yet. I'll
> ask our customer to test if this patch addressed their issue.

BTW, you need the following one too:

7b815817aa58 blk-mq: add helper for checking if one CPU is mapped to specified hctx

Thanks,
Ming
Ming Lei July 1, 2024, 9:16 a.m. UTC | #14
On Mon, Jul 01, 2024 at 10:43:14AM +0200, Hannes Reinecke wrote:
> On 7/1/24 09:21, Ming Lei wrote:
> > On Mon, Jul 01, 2024 at 09:08:32AM +0200, Daniel Wagner wrote:
> > > On Sun, Jun 30, 2024 at 09:39:59PM GMT, Ming Lei wrote:
> > > > > Make group_cpus_evenly aware of isolcpus configuration and use the
> > > > > housekeeping CPU mask as base for distributing the available CPUs into
> > > > > groups.
> > > > > 
> > > > > Fixes: 11ea68f553e2 ("genirq, sched/isolation: Isolate from handling managed interrupts")
> > > > 
> > > > isolated CPUs are actually handled when figuring out irq effective mask,
> > > > so not sure how commit 11ea68f553e2 is wrong, and what is fixed in this
> > > > patch from user viewpoint?
> > > 
> > > IO queues are allocated/spread on the isolated CPUs and if there is an
> > > thread submitting IOs from an isolated CPU it will cause noise on the
> > > isolated CPUs. The question is this a use case you need/want to support?
> > 
> > I have talked RH Openshift team weeks ago and they have such usage.
> > 
> > userspace is free to run any application from isolated CPUs via 'taskset
> > -c' even though 'isolcpus=' is passed from command line.
> > 
> > Kernel can not add such new constraint on userspace.
> > 
> > > We have customers who are complaining that even with isolcpus provided
> > > they still see IO noise on the isolated CPUs.
> > 
> > That is another issue, which has been fixed by the following patch:
> > 
> > a46c27026da1 blk-mq: don't schedule block kworker on isolated CPUs
> > 
> Hmm. Just when I thought I understood the issue ...
> 
> How is this supposed to work, then, given that I/O can be initiated
> from the isolated CPUs?
> I would have accepted that we have two scheduling domains, blk-mq is
> spread across all cpus, and the blk-mq cpusets are arranged according
> to the isolcpu settings.
> Then we can initiate I/O from the isolated cpus, and the scheduler
> would 'magically' ensure that everything is only run on isolated cpus.

blk-mq issues IO either from current context or kblockd context.

> 
> But that patch would completely counteract such a setup, as during
> I/O we more often than not will invoke kblockd, which then would cause
> cross-talk on non-isolated cpus.

If IO is submitted from isolated CPU, blk-mq will issue this IO via
unbound kblockd WQ, which is guaranteed to not run on isolated CPUs.


Thanks,
Ming
Daniel Wagner July 2, 2024, 7:25 a.m. UTC | #15
On Mon, Jul 01, 2024 at 10:37:46AM GMT, Hannes Reinecke wrote:
> Hmm. Guess we need to modify the grouping algorithm to group across all
> cpus, but ensure that each group consists either of all housekeeping CPUs or
> all isolated cpus.

This is what this series does, though just for the housekeeping CPUs. v1
introduced the io_queue option for isolcpus, which made sure the
managed_irq behavior doesn't change.
diff mbox series

Patch

diff --git a/lib/group_cpus.c b/lib/group_cpus.c
index ee272c4cefcc..19fb7186f9d4 100644
--- a/lib/group_cpus.c
+++ b/lib/group_cpus.c
@@ -8,6 +8,7 @@ 
 #include <linux/cpu.h>
 #include <linux/sort.h>
 #include <linux/group_cpus.h>
+#include <linux/sched/isolation.h>
 
 #ifdef CONFIG_SMP
 
@@ -330,7 +331,7 @@  static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
 }
 
 /**
- * group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality
+ * group_possible_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality
  * @numgrps: number of groups
  *
  * Return: cpumask array if successful, NULL otherwise. And each element
@@ -344,7 +345,7 @@  static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
  * We guarantee in the resulted grouping that all CPUs are covered, and
  * no same CPU is assigned to multiple groups
  */
-struct cpumask *group_cpus_evenly(unsigned int numgrps)
+static struct cpumask *group_possible_cpus_evenly(unsigned int numgrps)
 {
 	unsigned int curgrp = 0, nr_present = 0, nr_others = 0;
 	cpumask_var_t *node_to_cpumask;
@@ -423,6 +424,76 @@  struct cpumask *group_cpus_evenly(unsigned int numgrps)
 	}
 	return masks;
 }
+
+/**
+ * group_mask_cpus_evenly - Group the CPUs of a mask evenly per NUMA/CPU locality
+ * @numgrps: number of groups
+ * @cpu_mask: CPUs to consider for the grouping
+ *
+ * Return: cpumask array if successful, NULL otherwise. And each element
+ * includes CPUs assigned to this group.
+ *
+ * Try to put close CPUs from viewpoint of CPU and NUMA locality into
+ * same group. Allocate the CPUs in @cpu_mask to these groups evenly.
+ */
+static struct cpumask *group_mask_cpus_evenly(unsigned int numgrps,
+					      const struct cpumask *cpu_mask)
+{
+	cpumask_var_t *node_to_cpumask;
+	cpumask_var_t nmsk;
+	int ret = -ENOMEM;
+	struct cpumask *masks = NULL;
+
+	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
+		return NULL;
+
+	node_to_cpumask = alloc_node_to_cpumask();
+	if (!node_to_cpumask)
+		goto fail_nmsk;
+
+	masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);
+	if (!masks)
+		goto fail_node_to_cpumask;
+
+	build_node_to_cpumask(node_to_cpumask);
+
+	ret = __group_cpus_evenly(0, numgrps, node_to_cpumask, cpu_mask, nmsk,
+				  masks);
+
+fail_node_to_cpumask:
+	free_node_to_cpumask(node_to_cpumask);
+
+fail_nmsk:
+	free_cpumask_var(nmsk);
+	if (ret < 0) {
+		kfree(masks);
+		return NULL;
+	}
+	return masks;
+}
+
+/**
+ * group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality
+ * @numgrps: number of groups
+ *
+ * Return: cpumask array if successful, NULL otherwise.
+ *
+ * group_possible_cpus_evenly() is used for distributing the cpus on all
+ * possible cpus in absence of isolcpus command line argument.
+ * group_mask_cpus_evenly() is used when the isolcpus command line
+ * argument is used with managed_irq option. In this case only the
+ * housekeeping CPUs are considered.
+ */
+struct cpumask *group_cpus_evenly(unsigned int numgrps)
+{
+	const struct cpumask *hk_mask;
+
+	hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ);
+	if (!cpumask_empty(hk_mask))
+		return group_mask_cpus_evenly(numgrps, hk_mask);
+
+	return group_possible_cpus_evenly(numgrps);
+}
 #else /* CONFIG_SMP */
 struct cpumask *group_cpus_evenly(unsigned int numgrps)
 {