diff mbox

[06/13] irq: add a helper spread an affinity mask for MSI/MSI-X vectors

Message ID 1465934346-20648-7-git-send-email-hch@lst.de (mailing list archive)
State New, archived
Delegated to: Bjorn Helgaas
Headers show

Commit Message

Christoph Hellwig June 14, 2016, 7:58 p.m. UTC
This is lifted from the blk-mq code and adopted to use the affinity mask
concept just intruced in the irq handling code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/interrupt.h | 11 +++++++++
 kernel/irq/Makefile       |  1 +
 kernel/irq/affinity.c     | 60 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 72 insertions(+)
 create mode 100644 kernel/irq/affinity.c

Comments

Guilherme G. Piccoli June 14, 2016, 9:54 p.m. UTC | #1
On 06/14/2016 04:58 PM, Christoph Hellwig wrote:
> This is lifted from the blk-mq code and adopted to use the affinity mask
> concept just intruced in the irq handling code.

Very nice patch Christoph, thanks. There's a little typo above, on 
"intruced".

>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>   include/linux/interrupt.h | 11 +++++++++
>   kernel/irq/Makefile       |  1 +
>   kernel/irq/affinity.c     | 60 +++++++++++++++++++++++++++++++++++++++++++++++
>   3 files changed, 72 insertions(+)
>   create mode 100644 kernel/irq/affinity.c
>
> diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
> index 9fcabeb..12003c0 100644
> --- a/include/linux/interrupt.h
> +++ b/include/linux/interrupt.h
> @@ -278,6 +278,9 @@ extern int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m);
>   extern int
>   irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify);
>
> +int irq_create_affinity_mask(struct cpumask **affinity_mask,
> +		unsigned int *nr_vecs);
> +
>   #else /* CONFIG_SMP */
>
>   static inline int irq_set_affinity(unsigned int irq, const struct cpumask *m)
> @@ -308,6 +311,14 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
>   {
>   	return 0;
>   }
> +
> +static inline int irq_create_affinity_mask(struct cpumask **affinity_mask,
> +		unsigned int *nr_vecs)
> +{
> +	*affinity_mask = NULL;
> +	*nr_vecs = 1;
> +	return 0;
> +}
>   #endif /* CONFIG_SMP */
>
>   /*
> diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
> index 2ee42e9..1d3ee31 100644
> --- a/kernel/irq/Makefile
> +++ b/kernel/irq/Makefile
> @@ -9,3 +9,4 @@ obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
>   obj-$(CONFIG_PM_SLEEP) += pm.o
>   obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
>   obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
> +obj-$(CONFIG_SMP) += affinity.o
> diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
> new file mode 100644
> index 0000000..1daf8fb
> --- /dev/null
> +++ b/kernel/irq/affinity.c
> @@ -0,0 +1,60 @@
> +
> +#include <linux/interrupt.h>
> +#include <linux/kernel.h>
> +#include <linux/slab.h>
> +#include <linux/cpu.h>
> +
> +static int get_first_sibling(unsigned int cpu)
> +{
> +	unsigned int ret;
> +
> +	ret = cpumask_first(topology_sibling_cpumask(cpu));
> +	if (ret < nr_cpu_ids)
> +		return ret;
> +	return cpu;
> +}
> +
> +/*
> + * Take a map of online CPUs and the number of available interrupt vectors
> + * and generate an output cpumask suitable for spreading MSI/MSI-X vectors
> + * so that they are distributed as good as possible around the CPUs.  If
> + * more vectors than CPUs are available we'll map one to each CPU,
> + * otherwise we map one to the first sibling of each socket.
> + *
> + * If there are more vectors than CPUs we will still only have one bit
> + * set per CPU, but interrupt code will keep on assining the vectors from
> + * the start of the bitmap until we run out of vectors.
> + */

Another little typo above in "assining".

I take this opportunity to ask you something, since I'm working in a 
related code in a specific driver - sorry in advance if my question is 
silly or if I misunderstood your code.

The function irq_create_affinity_mask() below deals with the case in 
which we have nr_vecs < num_online_cpus(); in this case, wouldn't be a 
good idea to trying distribute the vecs among cores?

Example: if we have 128 online cpus, 8 per core (meaning 16 cores) and 
64 vecs, I guess would be ideal to distribute 4 vecs _per core_, leaving 
4 CPUs in each core without vecs.

Makes sense for you?
Thanks,


Guilherme


> +int irq_create_affinity_mask(struct cpumask **affinity_mask,
> +		unsigned int *nr_vecs)
> +{
> +	unsigned int vecs = 0;
> +
> +	if (*nr_vecs == 1) {
> +		*affinity_mask = NULL;
> +		return 0;
> +	}
> +
> +	*affinity_mask = kzalloc(cpumask_size(), GFP_KERNEL);
> +	if (!*affinity_mask)
> +		return -ENOMEM;
> +
> +	if (*nr_vecs >= num_online_cpus()) {
> +		cpumask_copy(*affinity_mask, cpu_online_mask);
> +	} else {
> +		unsigned int cpu;
> +
> +		for_each_online_cpu(cpu) {
> +			if (cpu == get_first_sibling(cpu)) {
> +				cpumask_set_cpu(cpu, *affinity_mask);
> +				vecs++;
> +			}
> +
> +			if (--(*nr_vecs) == 0)
> +				break;
> +		}
> +	}
> +
> +	*nr_vecs = vecs;
> +	return 0;
> +}
>

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bart Van Assche June 15, 2016, 8:35 a.m. UTC | #2
On 06/14/2016 11:54 PM, Guilherme G. Piccoli wrote:
> On 06/14/2016 04:58 PM, Christoph Hellwig wrote:
> I take this opportunity to ask you something, since I'm working in a
> related code in a specific driver - sorry in advance if my question is
> silly or if I misunderstood your code.
>
> The function irq_create_affinity_mask() below deals with the case in
> which we have nr_vecs < num_online_cpus(); in this case, wouldn't be a
> good idea to trying distribute the vecs among cores?
>
> Example: if we have 128 online cpus, 8 per core (meaning 16 cores) and
> 64 vecs, I guess would be ideal to distribute 4 vecs _per core_, leaving
> 4 CPUs in each core without vecs.

Hello Christoph and Guilherme,

I would also like to see irq_create_affinity_mask() modified such that 
it implements Guilherme's algorithm. I think blk-mq requests should be 
processed by a CPU core from the NUMA node from which the request has 
been submitted. With the proposed algorithm, if the number of MSI-X 
vectors is less than or equal to the number of CPU cores of a single 
node, all interrupt vectors will be assigned to the first NUMA node.

Bart.
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig June 15, 2016, 10:10 a.m. UTC | #3
On Tue, Jun 14, 2016 at 06:54:22PM -0300, Guilherme G. Piccoli wrote:
> On 06/14/2016 04:58 PM, Christoph Hellwig wrote:
>> This is lifted from the blk-mq code and adopted to use the affinity mask
>> concept just intruced in the irq handling code.
>
> Very nice patch Christoph, thanks. There's a little typo above, on 
> "intruced".

fixed.

> Another little typo above in "assining".

fixed a swell.

> I take this opportunity to ask you something, since I'm working in a 
> related code in a specific driver

Which driver?  One of the points here is to get this sort of code out
of drivers and into common code..

> - sorry in advance if my question is 
> silly or if I misunderstood your code.
>
> The function irq_create_affinity_mask() below deals with the case in which 
> we have nr_vecs < num_online_cpus(); in this case, wouldn't be a good idea 
> to trying distribute the vecs among cores?
>
> Example: if we have 128 online cpus, 8 per core (meaning 16 cores) and 64 
> vecs, I guess would be ideal to distribute 4 vecs _per core_, leaving 4 
> CPUs in each core without vecs.

There have been some reports about the blk-mq IRQ distribution being
suboptimal, but no one sent patches so far.  This patch just moves the
existing algorithm into the core code to be better bisectable.

I think an algorithm that takes cores into account instead of just SMT
sibling would be very useful.  So if you have a case where this helps
for you an incremental patch (or even one against the current blk-mq
code for now) would be appreciated.
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Guilherme G. Piccoli June 15, 2016, 1:09 p.m. UTC | #4
Thanks for the responses Bart and Christoph.


On 06/15/2016 07:10 AM, Christoph Hellwig wrote:
> On Tue, Jun 14, 2016 at 06:54:22PM -0300, Guilherme G. Piccoli wrote:
>> On 06/14/2016 04:58 PM, Christoph Hellwig wrote:
>>> This is lifted from the blk-mq code and adopted to use the affinity mask
>>> concept just intruced in the irq handling code.
>>
>> Very nice patch Christoph, thanks. There's a little typo above, on
>> "intruced".
>
> fixed.
>
>> Another little typo above in "assining".
>
> fixed a swell.
>
>> I take this opportunity to ask you something, since I'm working in a
>> related code in a specific driver
>
> Which driver?  One of the points here is to get this sort of code out
> of drivers and into common code..

A network driver, i40e. I'd be glad to implement/see some common code to 
raise the topology information I need, but I was implementing on i40e 
more as a test case/toy example heheh...


>> - sorry in advance if my question is
>> silly or if I misunderstood your code.
>>
>> The function irq_create_affinity_mask() below deals with the case in which
>> we have nr_vecs < num_online_cpus(); in this case, wouldn't be a good idea
>> to trying distribute the vecs among cores?
>>
>> Example: if we have 128 online cpus, 8 per core (meaning 16 cores) and 64
>> vecs, I guess would be ideal to distribute 4 vecs _per core_, leaving 4
>> CPUs in each core without vecs.
>
> There have been some reports about the blk-mq IRQ distribution being
> suboptimal, but no one sent patches so far.  This patch just moves the
> existing algorithm into the core code to be better bisectable.
>
> I think an algorithm that takes cores into account instead of just SMT
> sibling would be very useful.  So if you have a case where this helps
> for you an incremental patch (or even one against the current blk-mq
> code for now) would be appreciated.

...but now I'll focus on the common/general case! Thanks for the 
suggestion Christoph. I guess would be even better to have a generic 
function that retrieves an optimal mask, something like 
topology_get_optimal_mask(n, *cpumask), in which we get the best 
distribution of n CPUs among all cores and return such a mask - 
interesting case is when n < num_online_cpus. So, this function could be 
used inside your irq_create_affinity_mask() and maybe in other places it 
is needed.

I was planning to use topology_core_id() to retrieve the core of a CPU, 
if anybody has a better idea, I'd be glad to hear it.

Cheers,


Guilherme


>
> _______________________________________________
> Linux-nvme mailing list
> Linux-nvme@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-nvme
>

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig June 16, 2016, 3:16 p.m. UTC | #5
>
> ...but now I'll focus on the common/general case! Thanks for the suggestion 
> Christoph. I guess would be even better to have a generic function that 
> retrieves an optimal mask, something like topology_get_optimal_mask(n, 
> *cpumask), in which we get the best distribution of n CPUs among all cores 
> and return such a mask - interesting case is when n < num_online_cpus. So, 
> this function could be used inside your irq_create_affinity_mask() and 
> maybe in other places it is needed.

Yes, we should probably just plug this in where we're using the current
routines.  Block very much optimizes for the cases of either 1 queue
or enough queues for all cpus at the moments.  It would be good to check
what the network drivers currently do.
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Gordeev June 25, 2016, 8:05 p.m. UTC | #6
On Tue, Jun 14, 2016 at 09:58:59PM +0200, Christoph Hellwig wrote:
> This is lifted from the blk-mq code and adopted to use the affinity mask
> concept just intruced in the irq handling code.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  include/linux/interrupt.h | 11 +++++++++
>  kernel/irq/Makefile       |  1 +
>  kernel/irq/affinity.c     | 60 +++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 72 insertions(+)
>  create mode 100644 kernel/irq/affinity.c
> 
> diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
> index 9fcabeb..12003c0 100644
> --- a/include/linux/interrupt.h
> +++ b/include/linux/interrupt.h
> @@ -278,6 +278,9 @@ extern int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m);
>  extern int
>  irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify);
>  
> +int irq_create_affinity_mask(struct cpumask **affinity_mask,
> +		unsigned int *nr_vecs);
> +
>  #else /* CONFIG_SMP */
>  
>  static inline int irq_set_affinity(unsigned int irq, const struct cpumask *m)
> @@ -308,6 +311,14 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
>  {
>  	return 0;
>  }
> +
> +static inline int irq_create_affinity_mask(struct cpumask **affinity_mask,
> +		unsigned int *nr_vecs)
> +{
> +	*affinity_mask = NULL;
> +	*nr_vecs = 1;
> +	return 0;
> +}
>  #endif /* CONFIG_SMP */
>  
>  /*
> diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
> index 2ee42e9..1d3ee31 100644
> --- a/kernel/irq/Makefile
> +++ b/kernel/irq/Makefile
> @@ -9,3 +9,4 @@ obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
>  obj-$(CONFIG_PM_SLEEP) += pm.o
>  obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
>  obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
> +obj-$(CONFIG_SMP) += affinity.o
> diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
> new file mode 100644
> index 0000000..1daf8fb
> --- /dev/null
> +++ b/kernel/irq/affinity.c
> @@ -0,0 +1,60 @@
> +
> +#include <linux/interrupt.h>
> +#include <linux/kernel.h>
> +#include <linux/slab.h>
> +#include <linux/cpu.h>
> +
> +static int get_first_sibling(unsigned int cpu)
> +{
> +	unsigned int ret;
> +
> +	ret = cpumask_first(topology_sibling_cpumask(cpu));
> +	if (ret < nr_cpu_ids)
> +		return ret;
> +	return cpu;
> +}
> +
> +/*
> + * Take a map of online CPUs and the number of available interrupt vectors
> + * and generate an output cpumask suitable for spreading MSI/MSI-X vectors
> + * so that they are distributed as good as possible around the CPUs.  If
> + * more vectors than CPUs are available we'll map one to each CPU,

Unless I do not misinterpret a loop from msix_setup_entries() (patch 08/13),
the above is incorrect:

	for (i = 0; i < nvec; i++) {
		if (dev->irq_affinity) {
			cpu = cpumask_next(cpu, dev->irq_affinity);
			if (cpu >= nr_cpu_ids)
				cpu = cpumask_first(dev->irq_affinity);
			mask = cpumask_of(cpu);
		}

		...

		entry->affinity			= mask;
	}

> + * otherwise we map one to the first sibling of each socket.

(*) I guess, in some topology configurations a total number of all
first siblings may be less than the number of vectors.

> + * If there are more vectors than CPUs we will still only have one bit
> + * set per CPU, but interrupt code will keep on assining the vectors from
> + * the start of the bitmap until we run out of vectors.
> + */
> +int irq_create_affinity_mask(struct cpumask **affinity_mask,
> +		unsigned int *nr_vecs)

Both the callers of this function and the function itself IMHO would
read better if it simply returned the affinity mask. Or passed the 
affinity mask pointer.

> +{
> +	unsigned int vecs = 0;

In case (*nr_vecs >= num_online_cpus()) the contents of *nr_vecs
will be overwritten with 0.

> +	if (*nr_vecs == 1) {
> +		*affinity_mask = NULL;
> +		return 0;
> +	}
> +
> +	*affinity_mask = kzalloc(cpumask_size(), GFP_KERNEL);
> +	if (!*affinity_mask)
> +		return -ENOMEM;
> +
> +	if (*nr_vecs >= num_online_cpus()) {
> +		cpumask_copy(*affinity_mask, cpu_online_mask);
> +	} else {
> +		unsigned int cpu;
> +
> +		for_each_online_cpu(cpu) {
> +			if (cpu == get_first_sibling(cpu)) {
> +				cpumask_set_cpu(cpu, *affinity_mask);
> +				vecs++;
> +			}
> +
> +			if (--(*nr_vecs) == 0)
> +				break;
> +		}
> +	}
> +
> +	*nr_vecs = vecs;

So considering (*) comment above the number of available vectors
might be unnecessarily shrunken here.

I think nr_vecs need not be an out-parameter since we always can
assign multiple vectors to a CPU. It is better than limiting number
of available vectors AFAIKT. Or you could pass one-per-cpu flag
explicitly.

> +	return 0;
> +}
> -- 
> 2.1.4
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig June 30, 2016, 5:48 p.m. UTC | #7
On Sat, Jun 25, 2016 at 10:05:19PM +0200, Alexander Gordeev wrote:
> > + * and generate an output cpumask suitable for spreading MSI/MSI-X vectors
> > + * so that they are distributed as good as possible around the CPUs.  If
> > + * more vectors than CPUs are available we'll map one to each CPU,
> 
> Unless I do not misinterpret a loop from msix_setup_entries() (patch 08/13),
> the above is incorrect:

What part do you think is incorrect?

> > + * otherwise we map one to the first sibling of each socket.
> 
> (*) I guess, in some topology configurations a total number of all
> first siblings may be less than the number of vectors.

Yes, in that case we'll assign imcompetely.  I've already heard people
complaining about that at LSF/MM, but no one volunteered patches.
I only have devices with 1 or enough vectores to test, so I don't
really dare to touch the algorithm.  Either way the algorithm
change should probably be a different patch than refactoring it and
moving it around.

> > + * If there are more vectors than CPUs we will still only have one bit
> > + * set per CPU, but interrupt code will keep on assining the vectors from
> > + * the start of the bitmap until we run out of vectors.
> > + */
> > +int irq_create_affinity_mask(struct cpumask **affinity_mask,
> > +		unsigned int *nr_vecs)
> 
> Both the callers of this function and the function itself IMHO would
> read better if it simply returned the affinity mask. Or passed the 
> affinity mask pointer.

We can't just return the pointer as NULL is a valid and common return
value.  If we pass the pointer we'd then also need to allocate one for
the (common) nvec = 1 case.

> 
> > +{
> > +	unsigned int vecs = 0;
> 
> In case (*nr_vecs >= num_online_cpus()) the contents of *nr_vecs
> will be overwritten with 0.

Thanks, fixed.

> So considering (*) comment above the number of available vectors
> might be unnecessarily shrunken here.
> 
> I think nr_vecs need not be an out-parameter since we always can
> assign multiple vectors to a CPU. It is better than limiting number
> of available vectors AFAIKT. Or you could pass one-per-cpu flag
> explicitly.

The function is intended to replicate the blk-mq algorithm.  I don't
think it's optimal, but I really want to avoid dragging the discussion
about the optimal algorithm into this patchset.  We should at least
move to a vector per node/socket model instead of just the siblings,
and be able to use all vectors (at least optionally).
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Gordeev July 1, 2016, 7:25 a.m. UTC | #8
On Thu, Jun 30, 2016 at 07:48:54PM +0200, Christoph Hellwig wrote:
> On Sat, Jun 25, 2016 at 10:05:19PM +0200, Alexander Gordeev wrote:
> > > + * and generate an output cpumask suitable for spreading MSI/MSI-X vectors
> > > + * so that they are distributed as good as possible around the CPUs.  If
> > > + * more vectors than CPUs are available we'll map one to each CPU,
> > 
> > Unless I do not misinterpret a loop from msix_setup_entries() (patch 08/13),
> > the above is incorrect:
> 
> What part do you think is incorrect?

With your explanations below and no immediate intention to fix the
algorithm it is correct.

> > (*) I guess, in some topology configurations a total number of all
> > first siblings may be less than the number of vectors.
> 
> Yes, in that case we'll assign imcompetely.  I've already heard people
> complaining about that at LSF/MM, but no one volunteered patches.
> I only have devices with 1 or enough vectores to test, so I don't
> really dare to touch the algorithm.  Either way the algorithm
> change should probably be a different patch than refactoring it and
> moving it around.

I see your approach now. Thanks!
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 9fcabeb..12003c0 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -278,6 +278,9 @@  extern int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m);
 extern int
 irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify);
 
+int irq_create_affinity_mask(struct cpumask **affinity_mask,
+		unsigned int *nr_vecs);
+
 #else /* CONFIG_SMP */
 
 static inline int irq_set_affinity(unsigned int irq, const struct cpumask *m)
@@ -308,6 +311,14 @@  irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
 {
 	return 0;
 }
+
+static inline int irq_create_affinity_mask(struct cpumask **affinity_mask,
+		unsigned int *nr_vecs)
+{
+	*affinity_mask = NULL;
+	*nr_vecs = 1;
+	return 0;
+}
 #endif /* CONFIG_SMP */
 
 /*
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 2ee42e9..1d3ee31 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -9,3 +9,4 @@  obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
 obj-$(CONFIG_PM_SLEEP) += pm.o
 obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
 obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
+obj-$(CONFIG_SMP) += affinity.o
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
new file mode 100644
index 0000000..1daf8fb
--- /dev/null
+++ b/kernel/irq/affinity.c
@@ -0,0 +1,60 @@ 
+
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
+
+static int get_first_sibling(unsigned int cpu)
+{
+	unsigned int ret;
+
+	ret = cpumask_first(topology_sibling_cpumask(cpu));
+	if (ret < nr_cpu_ids)
+		return ret;
+	return cpu;
+}
+
+/*
+ * Take a map of online CPUs and the number of available interrupt vectors
+ * and generate an output cpumask suitable for spreading MSI/MSI-X vectors
+ * so that they are distributed as good as possible around the CPUs.  If
+ * more vectors than CPUs are available we'll map one to each CPU,
+ * otherwise we map one to the first sibling of each socket.
+ *
+ * If there are more vectors than CPUs we will still only have one bit
+ * set per CPU, but interrupt code will keep on assining the vectors from
+ * the start of the bitmap until we run out of vectors.
+ */
+int irq_create_affinity_mask(struct cpumask **affinity_mask,
+		unsigned int *nr_vecs)
+{
+	unsigned int vecs = 0;
+
+	if (*nr_vecs == 1) {
+		*affinity_mask = NULL;
+		return 0;
+	}
+
+	*affinity_mask = kzalloc(cpumask_size(), GFP_KERNEL);
+	if (!*affinity_mask)
+		return -ENOMEM;
+
+	if (*nr_vecs >= num_online_cpus()) {
+		cpumask_copy(*affinity_mask, cpu_online_mask);
+	} else {
+		unsigned int cpu;
+
+		for_each_online_cpu(cpu) {
+			if (cpu == get_first_sibling(cpu)) {
+				cpumask_set_cpu(cpu, *affinity_mask);
+				vecs++;
+			}
+
+			if (--(*nr_vecs) == 0)
+				break;
+		}
+	}
+
+	*nr_vecs = vecs;
+	return 0;
+}