diff mbox

[v2,2/2] irqchip: dw-apb-ictl: add irq_set_affinity support

Message ID 1435987170-3962-3-git-send-email-jszhang@marvell.com (mailing list archive)
State New, archived
Headers show

Commit Message

Jisheng Zhang July 4, 2015, 5:19 a.m. UTC
On Marvell Berlin SoCs, the cpu's local timer is shutdown when the cpu
goes to a deep idle state, then the timer framework will be notified to
use a broadcast timer instead. The broadcast timer uses dw-apb-ictl as
interrupt chip, this patch adds irq_set_affinity support so that the
going to deep idle state cpu can set the interrupt affinity of the
broadcast interrupt to avoid unnecessary wakeups and IPIs.

A simple test:
~ # rm /tmp/test.sh
~ # cat > /tmp/test.sh
cat /proc/interrupts
for i in `seq 10` ; do sleep $i; done
cat /proc/interrupts
~ # chmod +x /tmp/test.sh
~ # taskset 0x2 /tmp/test.sh

without the patch:

           CPU0       CPU1
 27:        115         36       GIC  27  arch_timer
 45:         62          0       GIC  45  mmc0
160:         88          0  interrupt-controller   8  timer
227:          0          0  interrupt-controller   4  f7e81400.i2c
228:          0          0  interrupt-controller   5  f7e81800.i2c
229:          0          0  interrupt-controller   7  dw_spi65535
230:          0          0  interrupt-controller  21  f7e84000.i2c
231:          0          0  interrupt-controller  20  f7e84800.i2c
265:        445          0  interrupt-controller   8  serial
IPI0:          0          0  CPU wakeup interrupts
IPI1:          0         11  Timer broadcast interrupts
IPI2:         56        104  Rescheduling interrupts
IPI3:          0          0  Function call interrupts
IPI4:          0          4  Single function call interrupts
IPI5:          0          0  CPU stop interrupts
IPI6:         25         27  IRQ work interrupts
IPI7:          0          0  completion interrupts
IPI8:          0          0  CPU backtrace
Err:          0
           CPU0       CPU1
 27:        115         38       GIC  27  arch_timer
 45:         62          0       GIC  45  mmc0
160:        160          0  interrupt-controller   8  timer
227:          0          0  interrupt-controller   4  f7e81400.i2c
228:          0          0  interrupt-controller   5  f7e81800.i2c
229:          0          0  interrupt-controller   7  dw_spi65535
230:          0          0  interrupt-controller  21  f7e84000.i2c
231:          0          0  interrupt-controller  20  f7e84800.i2c
265:        514          0  interrupt-controller   8  serial
IPI0:          0          0  CPU wakeup interrupts
IPI1:          0         83  Timer broadcast interrupts
IPI2:         56        104  Rescheduling interrupts
IPI3:          0          0  Function call interrupts
IPI4:          0          4  Single function call interrupts
IPI5:          0          0  CPU stop interrupts
IPI6:         25         46  IRQ work interrupts
IPI7:          0          0  completion interrupts
IPI8:          0          0  CPU backtrace
Err:          0

cpu0 get 160-88=72 timer interrupts, CPU1 got 83-11=72 broadcast timer
IPIs. So, overall system got 72+72=144 wake ups and 72 broadcast timer IPIs

With the patch:
           CPU0       CPU1
 27:        107         37       GIC  27  arch_timer
 45:         62          0       GIC  45  mmc0
160:         66          7  interrupt-controller   8  timer
227:          0          0  interrupt-controller   4  f7e81400.i2c
228:          0          0  interrupt-controller   5  f7e81800.i2c
229:          0          0  interrupt-controller   7  dw_spi65535
230:          0          0  interrupt-controller  21  f7e84000.i2c
231:          0          0  interrupt-controller  20  f7e84800.i2c
265:        311          0  interrupt-controller   8  serial
IPI0:          0          0  CPU wakeup interrupts
IPI1:          2          4  Timer broadcast interrupts
IPI2:         58        100  Rescheduling interrupts
IPI3:          0          0  Function call interrupts
IPI4:          0          4  Single function call interrupts
IPI5:          0          0  CPU stop interrupts
IPI6:         21         24  IRQ work interrupts
IPI7:          0          0  completion interrupts
IPI8:          0          0  CPU backtrace
Err:          0
           CPU0       CPU1
 27:        107         39       GIC  27  arch_timer
 45:         62          0       GIC  45  mmc0
160:         69         75  interrupt-controller   8  timer
227:          0          0  interrupt-controller   4  f7e81400.i2c
228:          0          0  interrupt-controller   5  f7e81800.i2c
229:          0          0  interrupt-controller   7  dw_spi65535
230:          0          0  interrupt-controller  21  f7e84000.i2c
231:          0          0  interrupt-controller  20  f7e84800.i2c
265:        380          0  interrupt-controller   8  serial
IPI0:          0          0  CPU wakeup interrupts
IPI1:          3          6  Timer broadcast interrupts
IPI2:         60        100  Rescheduling interrupts
IPI3:          0          0  Function call interrupts
IPI4:          0          4  Single function call interrupts
IPI5:          0          0  CPU stop interrupts
IPI6:         21         45  IRQ work interrupts
IPI7:          0          0  completion interrupts
IPI8:          0          0  CPU backtrace
Err:          0

cpu0 got 69-66=3, cpu1 got 75-7=68 timer interrupts. cpu0 got 3-2=1
broadcast timer IPIs, cpu1 got 6-4=2 broadcast timer IPIs. So, overall
system got 3+68+1+2=74 wakeups and 1+2=3 broadcast timer IPIs.

This patch removes 50% wakeups and almost 100% broadcast timer IPIs!
I believe this patch will also benefit other dw-apb-ictl users.

Signed-off-by: Jisheng Zhang <jszhang@marvell.com>
---
 drivers/irqchip/irq-dw-apb-ictl.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

Comments

Russell King - ARM Linux July 4, 2015, 8:26 a.m. UTC | #1
On Sat, Jul 04, 2015 at 01:19:30PM +0800, Jisheng Zhang wrote:
> On Marvell Berlin SoCs, the cpu's local timer is shutdown when the cpu
> goes to a deep idle state, then the timer framework will be notified to
> use a broadcast timer instead. The broadcast timer uses dw-apb-ictl as
> interrupt chip, this patch adds irq_set_affinity support so that the
> going to deep idle state cpu can set the interrupt affinity of the
> broadcast interrupt to avoid unnecessary wakeups and IPIs.

NAK to this patch.

The real question is - if CPU0 is the CPU going offline, why is it
still receiving _any_ interrupts - all interrupts should be migrated
off it, including the chained interrupts.

Sounds like there's a bug in the migration code which needs further
investigation, rather than hacking around the problem by introducing
lots of driver code.
Jisheng Zhang July 4, 2015, 8:50 a.m. UTC | #2
Dear Russell,

On Sat, 4 Jul 2015 09:26:23 +0100
Russell King - ARM Linux <linux@arm.linux.org.uk> wrote:

> On Sat, Jul 04, 2015 at 01:19:30PM +0800, Jisheng Zhang wrote:
> > On Marvell Berlin SoCs, the cpu's local timer is shutdown when the cpu
> > goes to a deep idle state, then the timer framework will be notified to
> > use a broadcast timer instead. The broadcast timer uses dw-apb-ictl as
> > interrupt chip, this patch adds irq_set_affinity support so that the
> > going to deep idle state cpu can set the interrupt affinity of the
> > broadcast interrupt to avoid unnecessary wakeups and IPIs.
> 
> NAK to this patch.
> 
> The real question is - if CPU0 is the CPU going offline, why is it
> still receiving _any_ interrupts - all interrupts should be migrated
> off it, including the chained interrupts.

I think it's due to broadcast timer interrupt. Let me describe the situation:

1. cpu1 is going offline
2. cpuidle notify timer framework to use a broadcast timer instead due to localtimer
is CLOCK_EVT_FEAT_C3STOP
3. when timer is expired, CPU0 will be waken up by the timer interrupt if it has
gone offline
4. CPU0 sends broadcast timer IPI to CPU1

As can be seen, both cpu0 waken up and the broadcast timer IPI are unnecessary.

This patch tries to improve such situation. Here I copied my simple test result

A simple test:
~ # rm /tmp/test.sh
~ # cat > /tmp/test.sh
cat /proc/interrupts
for i in `seq 10` ; do sleep $i; done
cat /proc/interrupts
~ # chmod +x /tmp/test.sh
~ # taskset 0x2 /tmp/test.sh

without the patch:

           CPU0       CPU1
 27:        115         36       GIC  27  arch_timer
 45:         62          0       GIC  45  mmc0
160:         88          0  interrupt-controller   8  timer
227:          0          0  interrupt-controller   4  f7e81400.i2c
228:          0          0  interrupt-controller   5  f7e81800.i2c
229:          0          0  interrupt-controller   7  dw_spi65535
230:          0          0  interrupt-controller  21  f7e84000.i2c
231:          0          0  interrupt-controller  20  f7e84800.i2c
265:        445          0  interrupt-controller   8  serial
IPI0:          0          0  CPU wakeup interrupts
IPI1:          0         11  Timer broadcast interrupts
IPI2:         56        104  Rescheduling interrupts
IPI3:          0          0  Function call interrupts
IPI4:          0          4  Single function call interrupts
IPI5:          0          0  CPU stop interrupts
IPI6:         25         27  IRQ work interrupts
IPI7:          0          0  completion interrupts
IPI8:          0          0  CPU backtrace
Err:          0
           CPU0       CPU1
 27:        115         38       GIC  27  arch_timer
 45:         62          0       GIC  45  mmc0
160:        160          0  interrupt-controller   8  timer
227:          0          0  interrupt-controller   4  f7e81400.i2c
228:          0          0  interrupt-controller   5  f7e81800.i2c
229:          0          0  interrupt-controller   7  dw_spi65535
230:          0          0  interrupt-controller  21  f7e84000.i2c
231:          0          0  interrupt-controller  20  f7e84800.i2c
265:        514          0  interrupt-controller   8  serial
IPI0:          0          0  CPU wakeup interrupts
IPI1:          0         83  Timer broadcast interrupts
IPI2:         56        104  Rescheduling interrupts
IPI3:          0          0  Function call interrupts
IPI4:          0          4  Single function call interrupts
IPI5:          0          0  CPU stop interrupts
IPI6:         25         46  IRQ work interrupts
IPI7:          0          0  completion interrupts
IPI8:          0          0  CPU backtrace
Err:          0

cpu0 get 160-88=72 timer interrupts, CPU1 got 83-11=72 broadcast timer
IPIs. So, overall system got 72+72=144 wake ups and 72 broadcast timer IPIs

With the patch:
           CPU0       CPU1
 27:        107         37       GIC  27  arch_timer
 45:         62          0       GIC  45  mmc0
160:         66          7  interrupt-controller   8  timer
227:          0          0  interrupt-controller   4  f7e81400.i2c
228:          0          0  interrupt-controller   5  f7e81800.i2c
229:          0          0  interrupt-controller   7  dw_spi65535
230:          0          0  interrupt-controller  21  f7e84000.i2c
231:          0          0  interrupt-controller  20  f7e84800.i2c
265:        311          0  interrupt-controller   8  serial
IPI0:          0          0  CPU wakeup interrupts
IPI1:          2          4  Timer broadcast interrupts
IPI2:         58        100  Rescheduling interrupts
IPI3:          0          0  Function call interrupts
IPI4:          0          4  Single function call interrupts
IPI5:          0          0  CPU stop interrupts
IPI6:         21         24  IRQ work interrupts
IPI7:          0          0  completion interrupts
IPI8:          0          0  CPU backtrace
Err:          0
           CPU0       CPU1
 27:        107         39       GIC  27  arch_timer
 45:         62          0       GIC  45  mmc0
160:         69         75  interrupt-controller   8  timer
227:          0          0  interrupt-controller   4  f7e81400.i2c
228:          0          0  interrupt-controller   5  f7e81800.i2c
229:          0          0  interrupt-controller   7  dw_spi65535
230:          0          0  interrupt-controller  21  f7e84000.i2c
231:          0          0  interrupt-controller  20  f7e84800.i2c
265:        380          0  interrupt-controller   8  serial
IPI0:          0          0  CPU wakeup interrupts
IPI1:          3          6  Timer broadcast interrupts
IPI2:         60        100  Rescheduling interrupts
IPI3:          0          0  Function call interrupts
IPI4:          0          4  Single function call interrupts
IPI5:          0          0  CPU stop interrupts
IPI6:         21         45  IRQ work interrupts
IPI7:          0          0  completion interrupts
IPI8:          0          0  CPU backtrace
Err:          0

cpu0 got 69-66=3, cpu1 got 75-7=68 timer interrupts. cpu0 got 3-2=1
broadcast timer IPIs, cpu1 got 6-4=2 broadcast timer IPIs. So, overall
system got 3+68+1+2=74 wakeups and 1+2=3 broadcast timer IPIs.

This patch removes 50% wakeups and almost 100% broadcast timer IPIs!

What do you think?

Thanks,
Jisheng

> 
> Sounds like there's a bug in the migration code which needs further
> investigation, rather than hacking around the problem by introducing
> lots of driver code.
>
Russell King - ARM Linux July 4, 2015, 9:25 a.m. UTC | #3
On Sat, Jul 04, 2015 at 04:50:07PM +0800, Jisheng Zhang wrote:
> Dear Russell,
> 
> On Sat, 4 Jul 2015 09:26:23 +0100
> Russell King - ARM Linux <linux@arm.linux.org.uk> wrote:
> 
> > On Sat, Jul 04, 2015 at 01:19:30PM +0800, Jisheng Zhang wrote:
> > > On Marvell Berlin SoCs, the cpu's local timer is shutdown when the cpu
> > > goes to a deep idle state, then the timer framework will be notified to
> > > use a broadcast timer instead. The broadcast timer uses dw-apb-ictl as
> > > interrupt chip, this patch adds irq_set_affinity support so that the
> > > going to deep idle state cpu can set the interrupt affinity of the
> > > broadcast interrupt to avoid unnecessary wakeups and IPIs.
> > 
> > NAK to this patch.
> > 
> > The real question is - if CPU0 is the CPU going offline, why is it
> > still receiving _any_ interrupts - all interrupts should be migrated
> > off it, including the chained interrupts.
> 
> I think it's due to broadcast timer interrupt. Let me describe the situation:
> 
> 1. cpu1 is going offline
> 2. cpuidle notify timer framework to use a broadcast timer instead due to localtimer
> is CLOCK_EVT_FEAT_C3STOP
> 3. when timer is expired, CPU0 will be waken up by the timer interrupt if it has
> gone offline
> 4. CPU0 sends broadcast timer IPI to CPU1

If CPU1 is going offline, then CPU1 should have no interrupts delivered to
it.  However, this is not the situation you're testing - in your results
below, and your "simple test" you never take CPU1 offline.
Jisheng Zhang July 4, 2015, 9:35 a.m. UTC | #4
Dear Russell,

On Sat, 4 Jul 2015 10:25:46 +0100
Russell King - ARM Linux <linux@arm.linux.org.uk> wrote:

> On Sat, Jul 04, 2015 at 04:50:07PM +0800, Jisheng Zhang wrote:
> > Dear Russell,
> > 
> > On Sat, 4 Jul 2015 09:26:23 +0100
> > Russell King - ARM Linux <linux@arm.linux.org.uk> wrote:
> > 
> > > On Sat, Jul 04, 2015 at 01:19:30PM +0800, Jisheng Zhang wrote:
> > > > On Marvell Berlin SoCs, the cpu's local timer is shutdown when the cpu
> > > > goes to a deep idle state, then the timer framework will be notified to
> > > > use a broadcast timer instead. The broadcast timer uses dw-apb-ictl as
> > > > interrupt chip, this patch adds irq_set_affinity support so that the
> > > > going to deep idle state cpu can set the interrupt affinity of the
> > > > broadcast interrupt to avoid unnecessary wakeups and IPIs.
> > > 
> > > NAK to this patch.
> > > 
> > > The real question is - if CPU0 is the CPU going offline, why is it
> > > still receiving _any_ interrupts - all interrupts should be migrated
> > > off it, including the chained interrupts.
> > 
> > I think it's due to broadcast timer interrupt. Let me describe the situation:
> > 
> > 1. cpu1 is going offline
> > 2. cpuidle notify timer framework to use a broadcast timer instead due to localtimer
> > is CLOCK_EVT_FEAT_C3STOP
> > 3. when timer is expired, CPU0 will be waken up by the timer interrupt if it has
> > gone offline
> > 4. CPU0 sends broadcast timer IPI to CPU1
> 
> If CPU1 is going offline, then CPU1 should have no interrupts delivered to

"sleep $i" on CPU1 will make the timer framework to wake up it at current_time+$i
in the future then go offline, currently this is achieved by programming one
broadcast timer. In our case, the broadcast timer interrupt will be routed to
CPU0 by default, so CPU0 has to send broadcast timer IPI to CPU1.

> it.  However, this is not the situation you're testing - in your results
> below, and your "simple test" you never take CPU1 offline.
> 

when cpu1 executes "sleep $i", cpu1 will go to deepest cpuidle level, in our
case it will go offline.

Thanks a lot for your review,
Jisheng
Jisheng Zhang July 4, 2015, 9:43 a.m. UTC | #5
Dear Russell,

On Sat, 4 Jul 2015 17:35:33 +0800
Jisheng Zhang <jszhang@marvell.com> wrote:

> Dear Russell,
> 
> On Sat, 4 Jul 2015 10:25:46 +0100
> Russell King - ARM Linux <linux@arm.linux.org.uk> wrote:
> 
> > On Sat, Jul 04, 2015 at 04:50:07PM +0800, Jisheng Zhang wrote:
> > > Dear Russell,
> > > 
> > > On Sat, 4 Jul 2015 09:26:23 +0100
> > > Russell King - ARM Linux <linux@arm.linux.org.uk> wrote:
> > > 
> > > > On Sat, Jul 04, 2015 at 01:19:30PM +0800, Jisheng Zhang wrote:
> > > > > On Marvell Berlin SoCs, the cpu's local timer is shutdown when the cpu
> > > > > goes to a deep idle state, then the timer framework will be notified to
> > > > > use a broadcast timer instead. The broadcast timer uses dw-apb-ictl as
> > > > > interrupt chip, this patch adds irq_set_affinity support so that the
> > > > > going to deep idle state cpu can set the interrupt affinity of the
> > > > > broadcast interrupt to avoid unnecessary wakeups and IPIs.
> > > > 
> > > > NAK to this patch.
> > > > 
> > > > The real question is - if CPU0 is the CPU going offline, why is it
> > > > still receiving _any_ interrupts - all interrupts should be migrated
> > > > off it, including the chained interrupts.
> > > 
> > > I think it's due to broadcast timer interrupt. Let me describe the situation:
> > > 
> > > 1. cpu1 is going offline
> > > 2. cpuidle notify timer framework to use a broadcast timer instead due to localtimer
> > > is CLOCK_EVT_FEAT_C3STOP
> > > 3. when timer is expired, CPU0 will be waken up by the timer interrupt if it has
> > > gone offline
> > > 4. CPU0 sends broadcast timer IPI to CPU1
> > 
> > If CPU1 is going offline, then CPU1 should have no interrupts delivered to
> 
> "sleep $i" on CPU1 will make the timer framework to wake up it at current_time+$i
> in the future then go offline, currently this is achieved by programming one
> broadcast timer. In our case, the broadcast timer interrupt will be routed to
> CPU0 by default, so CPU0 has to send broadcast timer IPI to CPU1.
> 
> > it.  However, this is not the situation you're testing - in your results
> > below, and your "simple test" you never take CPU1 offline.
> > 
> 
> when cpu1 executes "sleep $i", cpu1 will go to deepest cpuidle level, in our
> case it will go offline.
> 

I may misread your emails. I guess the "offline" means "cpu hot unplug"? Sorry
for misunderstanding.

This patch doesn't try to improve anything related with "hog unplug", it tries to
improve the following situation instead:

1. cpu1 is entering deepest cpuidle level, shutdown in our case.
2. cpuidle notify timer framework to use a broadcast timer instead due to localtimer
is CLOCK_EVT_FEAT_C3STOP
3. when timer is expired, CPU0 will be waken up by the timer interrupt if it has
 been shutdown
4. CPU0 sends broadcast timer IPI to CPU1

Thanks,
Jisheng
Thomas Gleixner July 4, 2015, 9:53 a.m. UTC | #6
On Sat, 4 Jul 2015, Russell King - ARM Linux wrote:

> On Sat, Jul 04, 2015 at 01:19:30PM +0800, Jisheng Zhang wrote:
> > On Marvell Berlin SoCs, the cpu's local timer is shutdown when the cpu
> > goes to a deep idle state, then the timer framework will be notified to
> > use a broadcast timer instead. The broadcast timer uses dw-apb-ictl as
> > interrupt chip, this patch adds irq_set_affinity support so that the
> > going to deep idle state cpu can set the interrupt affinity of the
> > broadcast interrupt to avoid unnecessary wakeups and IPIs.
> 
> NAK to this patch.
> 
> The real question is - if CPU0 is the CPU going offline, why is it
> still receiving _any_ interrupts - all interrupts should be migrated
> off it, including the chained interrupts.
> 
> Sounds like there's a bug in the migration code which needs further
> investigation, rather than hacking around the problem by introducing
> lots of driver code.

I think you misunderstood the changelog, which is horrible btw.

So the real reason to do this is to steer the broadcast interrupt to
the CPU which has the earliest expiry time. This avoids that another
cpu is woken from idle just to deliver the broadcast IPI to the other
cpu.

Thanks,

	tglx
Russell King - ARM Linux July 4, 2015, 10:08 a.m. UTC | #7
On Sat, Jul 04, 2015 at 11:53:57AM +0200, Thomas Gleixner wrote:
> On Sat, 4 Jul 2015, Russell King - ARM Linux wrote:
> 
> > On Sat, Jul 04, 2015 at 01:19:30PM +0800, Jisheng Zhang wrote:
> > > On Marvell Berlin SoCs, the cpu's local timer is shutdown when the cpu
> > > goes to a deep idle state, then the timer framework will be notified to
> > > use a broadcast timer instead. The broadcast timer uses dw-apb-ictl as
> > > interrupt chip, this patch adds irq_set_affinity support so that the
> > > going to deep idle state cpu can set the interrupt affinity of the
> > > broadcast interrupt to avoid unnecessary wakeups and IPIs.
> > 
> > NAK to this patch.
> > 
> > The real question is - if CPU0 is the CPU going offline, why is it
> > still receiving _any_ interrupts - all interrupts should be migrated
> > off it, including the chained interrupts.
> > 
> > Sounds like there's a bug in the migration code which needs further
> > investigation, rather than hacking around the problem by introducing
> > lots of driver code.
> 
> I think you misunderstood the changelog, which is horrible btw.
> 
> So the real reason to do this is to steer the broadcast interrupt to
> the CPU which has the earliest expiry time. This avoids that another
> cpu is woken from idle just to deliver the broadcast IPI to the other
> cpu.

Unless I'm mistaken, the code does this by messing around with the parent
interrupt affinity of a chained interrupt, which really isn't a good thing
to do, because it migrates every interrupt on the child interrupt
controller.
Jisheng Zhang July 4, 2015, 10:33 a.m. UTC | #8
Dear Russell, Thomas,

On Sat, 4 Jul 2015 11:08:27 +0100
Russell King - ARM Linux <linux@arm.linux.org.uk> wrote:

> On Sat, Jul 04, 2015 at 11:53:57AM +0200, Thomas Gleixner wrote:
> > On Sat, 4 Jul 2015, Russell King - ARM Linux wrote:
> > 
> > > On Sat, Jul 04, 2015 at 01:19:30PM +0800, Jisheng Zhang wrote:
> > > > On Marvell Berlin SoCs, the cpu's local timer is shutdown when the cpu
> > > > goes to a deep idle state, then the timer framework will be notified to
> > > > use a broadcast timer instead. The broadcast timer uses dw-apb-ictl as
> > > > interrupt chip, this patch adds irq_set_affinity support so that the
> > > > going to deep idle state cpu can set the interrupt affinity of the
> > > > broadcast interrupt to avoid unnecessary wakeups and IPIs.
> > > 
> > > NAK to this patch.
> > > 
> > > The real question is - if CPU0 is the CPU going offline, why is it
> > > still receiving _any_ interrupts - all interrupts should be migrated
> > > off it, including the chained interrupts.
> > > 
> > > Sounds like there's a bug in the migration code which needs further
> > > investigation, rather than hacking around the problem by introducing
> > > lots of driver code.
> > 
> > I think you misunderstood the changelog, which is horrible btw.
> > 
> > So the real reason to do this is to steer the broadcast interrupt to
> > the CPU which has the earliest expiry time. This avoids that another
> > cpu is woken from idle just to deliver the broadcast IPI to the other
> > cpu.
> 
> Unless I'm mistaken, the code does this by messing around with the parent
> interrupt affinity of a chained interrupt, which really isn't a good thing
> to do, because it migrates every interrupt on the child interrupt
> controller.
> 

Yep, that's the problem although it doesn't matter in our case for other 
interrupts don't care the affinity at all.
I'm requesting our HW people to connect timer interrupt to GIC directly in
future chips. But for the existing chips, this patch does really reduce
power consumption, is there any elegant solution?

PS: I noticed that exynos-combiner.c also migrated every interrupt on the
child irq controller.

Any suggestions are appreciated.

Thanks,
Jisheng
Thomas Gleixner July 4, 2015, 12:49 p.m. UTC | #9
On Sat, 4 Jul 2015, Russell King - ARM Linux wrote:
> On Sat, Jul 04, 2015 at 11:53:57AM +0200, Thomas Gleixner wrote:
> > On Sat, 4 Jul 2015, Russell King - ARM Linux wrote:
> > 
> > > On Sat, Jul 04, 2015 at 01:19:30PM +0800, Jisheng Zhang wrote:
> > > > On Marvell Berlin SoCs, the cpu's local timer is shutdown when the cpu
> > > > goes to a deep idle state, then the timer framework will be notified to
> > > > use a broadcast timer instead. The broadcast timer uses dw-apb-ictl as
> > > > interrupt chip, this patch adds irq_set_affinity support so that the
> > > > going to deep idle state cpu can set the interrupt affinity of the
> > > > broadcast interrupt to avoid unnecessary wakeups and IPIs.
> > > 
> > > NAK to this patch.
> > > 
> > > The real question is - if CPU0 is the CPU going offline, why is it
> > > still receiving _any_ interrupts - all interrupts should be migrated
> > > off it, including the chained interrupts.
> > > 
> > > Sounds like there's a bug in the migration code which needs further
> > > investigation, rather than hacking around the problem by introducing
> > > lots of driver code.
> > 
> > I think you misunderstood the changelog, which is horrible btw.
> > 
> > So the real reason to do this is to steer the broadcast interrupt to
> > the CPU which has the earliest expiry time. This avoids that another
> > cpu is woken from idle just to deliver the broadcast IPI to the other
> > cpu.
> 
> Unless I'm mistaken, the code does this by messing around with the parent
> interrupt affinity of a chained interrupt, which really isn't a good thing
> to do, because it migrates every interrupt on the child interrupt
> controller.

Fair enough, I missed that chained hackery.

For that powersaving scenario it's probably ok to move the all child
irqs around, but we should at least make that an opt-in behaviour and
not enabled by default.

Thanks,

	tglx
Jisheng Zhang July 4, 2015, 1:16 p.m. UTC | #10
Dear Thomas,

On Sat, 4 Jul 2015 14:49:31 +0200
Thomas Gleixner <tglx@linutronix.de> wrote:

> On Sat, 4 Jul 2015, Russell King - ARM Linux wrote:
> > On Sat, Jul 04, 2015 at 11:53:57AM +0200, Thomas Gleixner wrote:
> > > On Sat, 4 Jul 2015, Russell King - ARM Linux wrote:
> > > 
> > > > On Sat, Jul 04, 2015 at 01:19:30PM +0800, Jisheng Zhang wrote:
> > > > > On Marvell Berlin SoCs, the cpu's local timer is shutdown when the cpu
> > > > > goes to a deep idle state, then the timer framework will be notified to
> > > > > use a broadcast timer instead. The broadcast timer uses dw-apb-ictl as
> > > > > interrupt chip, this patch adds irq_set_affinity support so that the
> > > > > going to deep idle state cpu can set the interrupt affinity of the
> > > > > broadcast interrupt to avoid unnecessary wakeups and IPIs.
> > > > 
> > > > NAK to this patch.
> > > > 
> > > > The real question is - if CPU0 is the CPU going offline, why is it
> > > > still receiving _any_ interrupts - all interrupts should be migrated
> > > > off it, including the chained interrupts.
> > > > 
> > > > Sounds like there's a bug in the migration code which needs further
> > > > investigation, rather than hacking around the problem by introducing
> > > > lots of driver code.
> > > 
> > > I think you misunderstood the changelog, which is horrible btw.
> > > 
> > > So the real reason to do this is to steer the broadcast interrupt to
> > > the CPU which has the earliest expiry time. This avoids that another
> > > cpu is woken from idle just to deliver the broadcast IPI to the other
> > > cpu.
> > 
> > Unless I'm mistaken, the code does this by messing around with the parent
> > interrupt affinity of a chained interrupt, which really isn't a good thing
> > to do, because it migrates every interrupt on the child interrupt
> > controller.
> 
> Fair enough, I missed that chained hackery.
> 
> For that powersaving scenario it's probably ok to move the all child
> irqs around, but we should at least make that an opt-in behaviour and
> not enabled by default.

Thank you for your suggestion. Is is acceptable to make an config option
such as DW_APB_ICTL_SET_AFFINITY, and warn enabled this would migrates every
interrupt, and disable it by default?

Thanks a lot,
Jisheng
Thomas Gleixner July 4, 2015, 1:42 p.m. UTC | #11
On Sat, 4 Jul 2015, Jisheng Zhang wrote:
> Thank you for your suggestion. Is is acceptable to make an config option
> such as DW_APB_ICTL_SET_AFFINITY, and warn enabled this would migrates every
> interrupt, and disable it by default?

No, this wants to be a runtime/boottime configuration (device tree
might be a possible solution).

Thanks,

	tglx
diff mbox

Patch

diff --git a/drivers/irqchip/irq-dw-apb-ictl.c b/drivers/irqchip/irq-dw-apb-ictl.c
index 8bef7f7..efc0aec 100644
--- a/drivers/irqchip/irq-dw-apb-ictl.c
+++ b/drivers/irqchip/irq-dw-apb-ictl.c
@@ -29,6 +29,7 @@ 
 
 struct dw_apb_ictl_priv {
 	struct irq_domain *domain;
+	unsigned int parent_irq;
 };
 
 static void dw_apb_ictl_handler(unsigned int irq, struct irq_desc *desc)
@@ -56,6 +57,21 @@  static void dw_apb_ictl_handler(unsigned int irq, struct irq_desc *desc)
 	chained_irq_exit(chip, desc);
 }
 
+static int dw_apb_ictl_set_affinity(struct irq_data *d,
+				    const struct cpumask *mask_val,
+				    bool force)
+{
+	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
+	struct dw_apb_ictl_priv *priv = gc->private;
+	struct irq_chip *chip = irq_get_chip(priv->parent_irq);
+	struct irq_data *data = irq_get_irq_data(priv->parent_irq);
+
+	if (chip && chip->irq_set_affinity)
+		return chip->irq_set_affinity(data, mask_val, force);
+	else
+		return -EINVAL;
+}
+
 #ifdef CONFIG_PM
 static void dw_apb_ictl_resume(struct irq_data *d)
 {
@@ -95,6 +111,8 @@  static int __init dw_apb_ictl_init(struct device_node *np,
 		goto err_free;
 	}
 
+	priv->parent_irq = irq;
+
 	ret = of_address_to_resource(np, 0, &r);
 	if (ret) {
 		pr_err("%s: unable to get resource\n", np->full_name);
@@ -160,6 +178,7 @@  static int __init dw_apb_ictl_init(struct device_node *np,
 	gc->chip_types[0].chip.irq_mask = irq_gc_mask_set_bit;
 	gc->chip_types[0].chip.irq_unmask = irq_gc_mask_clr_bit;
 	gc->chip_types[0].chip.irq_resume = dw_apb_ictl_resume;
+	gc->chip_types[0].chip.irq_set_affinity = dw_apb_ictl_set_affinity;
 
 	if (nrirqs > 32) {
 		gc->chip_types[1].regs.mask = APB_INT_MASK_H;
@@ -167,6 +186,8 @@  static int __init dw_apb_ictl_init(struct device_node *np,
 		gc->chip_types[1].chip.irq_mask = irq_gc_mask_set_bit;
 		gc->chip_types[1].chip.irq_unmask = irq_gc_mask_clr_bit;
 		gc->chip_types[1].chip.irq_resume = dw_apb_ictl_resume;
+		gc->chip_types[1].chip.irq_set_affinity =
+						dw_apb_ictl_set_affinity;
 	}
 
 	irq_set_handler_data(irq, gc);