diff mbox

[v2] ARM: Don't use complete() during __cpu_die

Message ID 20150205105035.GL8656@n2100.arm.linux.org.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Russell King - ARM Linux Feb. 5, 2015, 10:50 a.m. UTC
On Thu, Feb 05, 2015 at 11:14:30AM +0100, Krzysztof Kozlowski wrote:
> The complete() should not be used on offlined CPU. Rewrite the
> wait-complete mechanism with wait_on_bit_timeout().

Yuck.

I think that the IPI idea would be far better, and a much smaller patch.
We can continue using the completions, but instead of running the
completion on the dying CPU, the dying CPU triggers an IPI which does
the completion on the requesting CPU.

You're modifying arch/arm/kernel/smp.c, so you just hook it directly
into the IPI mechanism without any registration required.

We can also kill the second cache flush by the dying CPU - as we're
not writing to memory anymore by calling complete() after the first
cache flush, so this will probably make CPU hotplug fractionally faster
too.

(You'll get some fuzz with this patch as I have the NMI backtrace stuff
in my kernel.)

Something like this - only build tested so far (waiting for the compile
to finish...):

 arch/arm/kernel/smp.c | 43 ++++++++++++++++++++++++-------------------
 1 file changed, 24 insertions(+), 19 deletions(-)

Comments

Krzysztof Kozlowski Feb. 5, 2015, 11 a.m. UTC | #1
On czw, 2015-02-05 at 10:50 +0000, Russell King - ARM Linux wrote:
> On Thu, Feb 05, 2015 at 11:14:30AM +0100, Krzysztof Kozlowski wrote:
> > The complete() should not be used on offlined CPU. Rewrite the
> > wait-complete mechanism with wait_on_bit_timeout().
> 
> Yuck.
> 
> I think that the IPI idea would be far better, and a much smaller patch.
> We can continue using the completions, but instead of running the
> completion on the dying CPU, the dying CPU triggers an IPI which does
> the completion on the requesting CPU.
> 
> You're modifying arch/arm/kernel/smp.c, so you just hook it directly
> into the IPI mechanism without any registration required.
> 
> We can also kill the second cache flush by the dying CPU - as we're
> not writing to memory anymore by calling complete() after the first
> cache flush, so this will probably make CPU hotplug fractionally faster
> too.
> 
> (You'll get some fuzz with this patch as I have the NMI backtrace stuff
> in my kernel.)
> 
> Something like this - only build tested so far (waiting for the compile
> to finish...):

I am looking into IPI also. Maybe just smp_call_function_any() would be
enough?

Do you want to continue with the IPI version patch?

Best regards,
Krzysztof

> 
>  arch/arm/kernel/smp.c | 43 ++++++++++++++++++++++++-------------------
>  1 file changed, 24 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
> index 194df2f1aa87..c623e27a9c85 100644
> --- a/arch/arm/kernel/smp.c
> +++ b/arch/arm/kernel/smp.c
> @@ -73,6 +73,9 @@ enum ipi_msg_type {
>  	IPI_IRQ_WORK,
>  	IPI_COMPLETION,
>  	IPI_CPU_BACKTRACE,
> +#ifdef CONFIG_HOTPLUG_CPU
> +	IPI_CPU_DEAD,
> +#endif
>  };
>  
>  /* For reliability, we're prepared to waste bits here. */
> @@ -88,6 +91,14 @@ void __init smp_set_ops(struct smp_operations *ops)
>  		smp_ops = *ops;
>  };
>  
> +static void (*__smp_cross_call)(const struct cpumask *, unsigned int);
> +
> +void __init set_smp_cross_call(void (*fn)(const struct cpumask *, unsigned int))
> +{
> +	if (!__smp_cross_call)
> +		__smp_cross_call = fn;
> +}
> +
>  static unsigned long get_arch_pgd(pgd_t *pgd)
>  {
>  	phys_addr_t pgdir = virt_to_idmap(pgd);
> @@ -267,19 +278,13 @@ void __ref cpu_die(void)
>  	flush_cache_louis();
>  
>  	/*
> -	 * Tell __cpu_die() that this CPU is now safe to dispose of.  Once
> -	 * this returns, power and/or clocks can be removed at any point
> +	 * Tell __cpu_die() that this CPU is now safe to dispose of.  We
> +	 * do this via an IPI to any online CPU - it doesn't matter, we
> +	 * just need another CPU to run the completion.  Once this IPI
> +	 * has been sent, power and/or clocks can be removed at any point
>  	 * from this CPU and its cache by platform_cpu_kill().
>  	 */
> -	complete(&cpu_died);
> -
> -	/*
> -	 * Ensure that the cache lines associated with that completion are
> -	 * written out.  This covers the case where _this_ CPU is doing the
> -	 * powering down, to ensure that the completion is visible to the
> -	 * CPU waiting for this one.
> -	 */
> -	flush_cache_louis();
> +	__smp_cross_call(cpumask_of(cpumask_any(cpu_online_mask)), IPI_CPU_DEAD);
>  
>  	/*
>  	 * The actual CPU shutdown procedure is at least platform (if not
> @@ -442,14 +447,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
>  	}
>  }
>  
> -static void (*__smp_cross_call)(const struct cpumask *, unsigned int);
> -
> -void __init set_smp_cross_call(void (*fn)(const struct cpumask *, unsigned int))
> -{
> -	if (!__smp_cross_call)
> -		__smp_cross_call = fn;
> -}
> -
>  static const char *ipi_types[NR_IPI] __tracepoint_string = {
>  #define S(x,s)	[x] = s
>  	S(IPI_WAKEUP, "CPU wakeup interrupts"),
> @@ -648,6 +645,14 @@ void handle_IPI(int ipinr, struct pt_regs *regs)
>  		irq_exit();
>  		break;
>  
> +#ifdef CONFIG_HOTPLUG_CPU
> +	case IPI_CPU_DEAD:
> +		irq_enter();
> +		complete(&cpu_died);
> +		irq_exit();
> +		break;
> +#endif
> +
>  	default:
>  		pr_crit("CPU%u: Unknown IPI message 0x%x\n",
>  		        cpu, ipinr);
>
Russell King - ARM Linux Feb. 5, 2015, 11:08 a.m. UTC | #2
On Thu, Feb 05, 2015 at 12:00:58PM +0100, Krzysztof Kozlowski wrote:
> On czw, 2015-02-05 at 10:50 +0000, Russell King - ARM Linux wrote:
> > On Thu, Feb 05, 2015 at 11:14:30AM +0100, Krzysztof Kozlowski wrote:
> > > The complete() should not be used on offlined CPU. Rewrite the
> > > wait-complete mechanism with wait_on_bit_timeout().
> > 
> > Yuck.
> > 
> > I think that the IPI idea would be far better, and a much smaller patch.
> > We can continue using the completions, but instead of running the
> > completion on the dying CPU, the dying CPU triggers an IPI which does
> > the completion on the requesting CPU.
> > 
> > You're modifying arch/arm/kernel/smp.c, so you just hook it directly
> > into the IPI mechanism without any registration required.
> > 
> > We can also kill the second cache flush by the dying CPU - as we're
> > not writing to memory anymore by calling complete() after the first
> > cache flush, so this will probably make CPU hotplug fractionally faster
> > too.
> > 
> > (You'll get some fuzz with this patch as I have the NMI backtrace stuff
> > in my kernel.)
> > 
> > Something like this - only build tested so far (waiting for the compile
> > to finish...):
> 
> I am looking into IPI also. Maybe just smp_call_function_any() would be
> enough?

I really don't like that idea.  Let's keep it simple, and avoid calling
code unnecessarily which could end up with RCU issues - and avoid the
need for a second L1 cache flush.

This does seem to work on iMX6 (if you ignore the lockdep splat caused
by that frigging CMA lock - yes, that issue I reported earlier last
year still exists and is still unsolved, which is really disgusting.)
Mark Rutland Feb. 5, 2015, 11:28 a.m. UTC | #3
Hi Russell,

On Thu, Feb 05, 2015 at 10:50:35AM +0000, Russell King - ARM Linux wrote:
> On Thu, Feb 05, 2015 at 11:14:30AM +0100, Krzysztof Kozlowski wrote:
> > The complete() should not be used on offlined CPU. Rewrite the
> > wait-complete mechanism with wait_on_bit_timeout().
> 
> Yuck.
> 
> I think that the IPI idea would be far better, and a much smaller patch.
> We can continue using the completions, but instead of running the
> completion on the dying CPU, the dying CPU triggers an IPI which does
> the completion on the requesting CPU.

This does look _much_ nicer than the bitmask approach.

[...]

> diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
> index 194df2f1aa87..c623e27a9c85 100644
> --- a/arch/arm/kernel/smp.c
> +++ b/arch/arm/kernel/smp.c
> @@ -73,6 +73,9 @@ enum ipi_msg_type {
>  	IPI_IRQ_WORK,
>  	IPI_COMPLETION,
>  	IPI_CPU_BACKTRACE,
> +#ifdef CONFIG_HOTPLUG_CPU
> +	IPI_CPU_DEAD,
> +#endif
>  };

[...]

>  static const char *ipi_types[NR_IPI] __tracepoint_string = {
>  #define S(x,s)	[x] = s
>  	S(IPI_WAKEUP, "CPU wakeup interrupts"),

We'll probably want to add an entry here ("CPU teardown interrupts"?),
and bump NR_IPI in asm/hardirq.h.

Thanks,
Mark.
Russell King - ARM Linux Feb. 5, 2015, 11:30 a.m. UTC | #4
On Thu, Feb 05, 2015 at 11:28:05AM +0000, Mark Rutland wrote:
> Hi Russell,
> 
> On Thu, Feb 05, 2015 at 10:50:35AM +0000, Russell King - ARM Linux wrote:
> > On Thu, Feb 05, 2015 at 11:14:30AM +0100, Krzysztof Kozlowski wrote:
> > > The complete() should not be used on offlined CPU. Rewrite the
> > > wait-complete mechanism with wait_on_bit_timeout().
> > 
> > Yuck.
> > 
> > I think that the IPI idea would be far better, and a much smaller patch.
> > We can continue using the completions, but instead of running the
> > completion on the dying CPU, the dying CPU triggers an IPI which does
> > the completion on the requesting CPU.
> 
> This does look _much_ nicer than the bitmask approach.
> 
> [...]
> 
> > diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
> > index 194df2f1aa87..c623e27a9c85 100644
> > --- a/arch/arm/kernel/smp.c
> > +++ b/arch/arm/kernel/smp.c
> > @@ -73,6 +73,9 @@ enum ipi_msg_type {
> >  	IPI_IRQ_WORK,
> >  	IPI_COMPLETION,
> >  	IPI_CPU_BACKTRACE,
> > +#ifdef CONFIG_HOTPLUG_CPU
> > +	IPI_CPU_DEAD,
> > +#endif
> >  };
> 
> [...]
> 
> >  static const char *ipi_types[NR_IPI] __tracepoint_string = {
> >  #define S(x,s)	[x] = s
> >  	S(IPI_WAKEUP, "CPU wakeup interrupts"),
> 
> We'll probably want to add an entry here ("CPU teardown interrupts"?),
> and bump NR_IPI in asm/hardirq.h.

I'd need to move IPI_CPU_BACKTRACE out of the way then - that'll mostly
always be zero (even if the NMI IPI happens.)  I'll sort that when I
backport the patch to mainline kernels. :)
Paul E. McKenney Feb. 5, 2015, 2:29 p.m. UTC | #5
On Thu, Feb 05, 2015 at 10:50:35AM +0000, Russell King - ARM Linux wrote:
> On Thu, Feb 05, 2015 at 11:14:30AM +0100, Krzysztof Kozlowski wrote:
> > The complete() should not be used on offlined CPU. Rewrite the
> > wait-complete mechanism with wait_on_bit_timeout().
> 
> Yuck.
> 
> I think that the IPI idea would be far better, and a much smaller patch.
> We can continue using the completions, but instead of running the
> completion on the dying CPU, the dying CPU triggers an IPI which does
> the completion on the requesting CPU.
> 
> You're modifying arch/arm/kernel/smp.c, so you just hook it directly
> into the IPI mechanism without any registration required.
> 
> We can also kill the second cache flush by the dying CPU - as we're
> not writing to memory anymore by calling complete() after the first
> cache flush, so this will probably make CPU hotplug fractionally faster
> too.
> 
> (You'll get some fuzz with this patch as I have the NMI backtrace stuff
> in my kernel.)
> 
> Something like this - only build tested so far (waiting for the compile
> to finish...):

Works for me, assuming no hidden uses of RCU in the IPI code.  ;-)

							Thanx, Paul

>  arch/arm/kernel/smp.c | 43 ++++++++++++++++++++++++-------------------
>  1 file changed, 24 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
> index 194df2f1aa87..c623e27a9c85 100644
> --- a/arch/arm/kernel/smp.c
> +++ b/arch/arm/kernel/smp.c
> @@ -73,6 +73,9 @@ enum ipi_msg_type {
>  	IPI_IRQ_WORK,
>  	IPI_COMPLETION,
>  	IPI_CPU_BACKTRACE,
> +#ifdef CONFIG_HOTPLUG_CPU
> +	IPI_CPU_DEAD,
> +#endif
>  };
> 
>  /* For reliability, we're prepared to waste bits here. */
> @@ -88,6 +91,14 @@ void __init smp_set_ops(struct smp_operations *ops)
>  		smp_ops = *ops;
>  };
> 
> +static void (*__smp_cross_call)(const struct cpumask *, unsigned int);
> +
> +void __init set_smp_cross_call(void (*fn)(const struct cpumask *, unsigned int))
> +{
> +	if (!__smp_cross_call)
> +		__smp_cross_call = fn;
> +}
> +
>  static unsigned long get_arch_pgd(pgd_t *pgd)
>  {
>  	phys_addr_t pgdir = virt_to_idmap(pgd);
> @@ -267,19 +278,13 @@ void __ref cpu_die(void)
>  	flush_cache_louis();
> 
>  	/*
> -	 * Tell __cpu_die() that this CPU is now safe to dispose of.  Once
> -	 * this returns, power and/or clocks can be removed at any point
> +	 * Tell __cpu_die() that this CPU is now safe to dispose of.  We
> +	 * do this via an IPI to any online CPU - it doesn't matter, we
> +	 * just need another CPU to run the completion.  Once this IPI
> +	 * has been sent, power and/or clocks can be removed at any point
>  	 * from this CPU and its cache by platform_cpu_kill().
>  	 */
> -	complete(&cpu_died);
> -
> -	/*
> -	 * Ensure that the cache lines associated with that completion are
> -	 * written out.  This covers the case where _this_ CPU is doing the
> -	 * powering down, to ensure that the completion is visible to the
> -	 * CPU waiting for this one.
> -	 */
> -	flush_cache_louis();
> +	__smp_cross_call(cpumask_of(cpumask_any(cpu_online_mask)), IPI_CPU_DEAD);
> 
>  	/*
>  	 * The actual CPU shutdown procedure is at least platform (if not
> @@ -442,14 +447,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
>  	}
>  }
> 
> -static void (*__smp_cross_call)(const struct cpumask *, unsigned int);
> -
> -void __init set_smp_cross_call(void (*fn)(const struct cpumask *, unsigned int))
> -{
> -	if (!__smp_cross_call)
> -		__smp_cross_call = fn;
> -}
> -
>  static const char *ipi_types[NR_IPI] __tracepoint_string = {
>  #define S(x,s)	[x] = s
>  	S(IPI_WAKEUP, "CPU wakeup interrupts"),
> @@ -648,6 +645,14 @@ void handle_IPI(int ipinr, struct pt_regs *regs)
>  		irq_exit();
>  		break;
> 
> +#ifdef CONFIG_HOTPLUG_CPU
> +	case IPI_CPU_DEAD:
> +		irq_enter();
> +		complete(&cpu_died);
> +		irq_exit();
> +		break;
> +#endif
> +
>  	default:
>  		pr_crit("CPU%u: Unknown IPI message 0x%x\n",
>  		        cpu, ipinr);
> 
> -- 
> FTTC broadband for 0.8mile line: currently at 10.5Mbps down 400kbps up
> according to speedtest.net.
>
Russell King - ARM Linux Feb. 5, 2015, 4:11 p.m. UTC | #6
On Thu, Feb 05, 2015 at 06:29:18AM -0800, Paul E. McKenney wrote:
> Works for me, assuming no hidden uses of RCU in the IPI code.  ;-)

Sigh... I kind'a new it wouldn't be this simple.  The gic code which
actually raises the IPI takes a raw spinlock, so it's not going to be
this simple - there's a small theoretical window where we have taken
this lock, written the register to send the IPI, and then dropped the
lock - the update to the lock to release it could get lost if the
CPU power is quickly cut at that point.

Also, we _do_ need the second cache flush in place to ensure that the
unlock is seen to other CPUs.

We could work around that by taking and releasing the lock in the IPI
processing function... but this is starting to look less attractive
as the lock is private to irq-gic.c.

Well, we're very close to 3.19, we're too close to be trying to sort
this out, so I'm hoping that your changes which cause this RCU error
are *not* going in during this merge window, because we seem to have
something of a problem right now which needs more time to resolve.
Paul E. McKenney Feb. 5, 2015, 5:02 p.m. UTC | #7
On Thu, Feb 05, 2015 at 04:11:00PM +0000, Russell King - ARM Linux wrote:
> On Thu, Feb 05, 2015 at 06:29:18AM -0800, Paul E. McKenney wrote:
> > Works for me, assuming no hidden uses of RCU in the IPI code.  ;-)
> 
> Sigh... I kind'a new it wouldn't be this simple.  The gic code which
> actually raises the IPI takes a raw spinlock, so it's not going to be
> this simple - there's a small theoretical window where we have taken
> this lock, written the register to send the IPI, and then dropped the
> lock - the update to the lock to release it could get lost if the
> CPU power is quickly cut at that point.
> 
> Also, we _do_ need the second cache flush in place to ensure that the
> unlock is seen to other CPUs.
> 
> We could work around that by taking and releasing the lock in the IPI
> processing function... but this is starting to look less attractive
> as the lock is private to irq-gic.c.
> 
> Well, we're very close to 3.19, we're too close to be trying to sort
> this out, so I'm hoping that your changes which cause this RCU error
> are *not* going in during this merge window, because we seem to have
> something of a problem right now which needs more time to resolve.

Most likely into the 3.20 merge window.  But please keep in mind that
RCU is just the messenger here -- the current code will break if any
CPU for whatever reason takes more than a jiffy to get from its
_stop_machine() handler to the end of its last RCU read-side critical
section on its way out.  A jiffy may sound like a lot, but it is not
hard to exceed this limit, especially in virtualized environments.

So not like to go into v3.19, but it does need to be resolved.

							Thanx, Paul
Russell King - ARM Linux Feb. 5, 2015, 5:34 p.m. UTC | #8
On Thu, Feb 05, 2015 at 09:02:28AM -0800, Paul E. McKenney wrote:
> On Thu, Feb 05, 2015 at 04:11:00PM +0000, Russell King - ARM Linux wrote:
> > On Thu, Feb 05, 2015 at 06:29:18AM -0800, Paul E. McKenney wrote:
> > > Works for me, assuming no hidden uses of RCU in the IPI code.  ;-)
> > 
> > Sigh... I kind'a new it wouldn't be this simple.  The gic code which
> > actually raises the IPI takes a raw spinlock, so it's not going to be
> > this simple - there's a small theoretical window where we have taken
> > this lock, written the register to send the IPI, and then dropped the
> > lock - the update to the lock to release it could get lost if the
> > CPU power is quickly cut at that point.
> > 
> > Also, we _do_ need the second cache flush in place to ensure that the
> > unlock is seen to other CPUs.
> > 
> > We could work around that by taking and releasing the lock in the IPI
> > processing function... but this is starting to look less attractive
> > as the lock is private to irq-gic.c.
> > 
> > Well, we're very close to 3.19, we're too close to be trying to sort
> > this out, so I'm hoping that your changes which cause this RCU error
> > are *not* going in during this merge window, because we seem to have
> > something of a problem right now which needs more time to resolve.
> 
> Most likely into the 3.20 merge window.  But please keep in mind that
> RCU is just the messenger here -- the current code will break if any
> CPU for whatever reason takes more than a jiffy to get from its
> _stop_machine() handler to the end of its last RCU read-side critical
> section on its way out.  A jiffy may sound like a lot, but it is not
> hard to exceed this limit, especially in virtualized environments.

What I'm saying is that we can't likely get a good fix prepared before
the 3.20 merge window opens.

I don't term the set_bit/clear_bit solution a "good fix" because it is
far too complex - I've not done a thorough review on it, but the idea
of setting and clearing a couple of bits in unison, making sure that
their state is set appropriately through multiple different code paths
does not strike me as a provably correct replacement for this completion.
The reason for that complexity is because there is no pre-notification
to arch code that a CPU might be going down, so there's no way for the
"CPU is dead" flag to be properly reset (which is why there's all the
manipulation in lots of possible failure paths.)

The idea that we could reset it in the CPU up code doesn't fly - that
would only work if we had one secondary CPU (which would guarantee a
strict up/down/up ordering on it) but as soon as you have more than one
CPU, that doesn't hold true.

We could hook into the CPU hotplug notifiers - which would be quite a
lot of additional code to achieve the reset early enough in the hot
unplug path, though it would probably be the most reliable solution to
the wait-for-bit solution.

However, any of those solutions needs writing and thorough testing,
which, if Linus opens the merge window on Sunday, isn't going to
happen before hand (and we know Linus doesn't like extra development
appearing which wasn't in -next prior to the merge window - he's taken
snapshots of -next to check during the merge window in the past - so
it's not something I'm going to be adding during that time, not even
as a "fix" because we know about the problem right now, before the
merge window.  To me, to treat this as a "fix" would be wilfully
deceitful.)

I don't think the existing code is a big problem at the moment - it's
been like this for about 10 years, and no one has ever reported an
issue with it, although there have been changes over that time:

aa033810461ee56abbef6cef10aabd6b97f5caee
ARM: smp: Drop RCU_NONIDLE usage in cpu_die()

	This removed the RCU_NONIDLE() from the completion() call.

ff081e05bfba3461119cd280201d163b6858eda2
ARM: 7457/1: smp: Fix suspicious RCU originating from cpu_die()

	This added the RCU_NONIDLE() to the completion() call.

3c030beabf937b1d3b4ecaedfd1fb2f1e2aa0c70
ARM: CPU hotplug: move cpu_killed completion to core code

	This moved the completion code from Realview (and other ARM
	platforms) into core ARM code.

and 97a63ecff4bd06da5d8feb8c0394a4d020f2d34d
[ARM SMP] Add CPU hotplug support for Realview MPcore

	The earliest current introduction of CPU hotplug in 2005.
Paul E. McKenney Feb. 5, 2015, 5:54 p.m. UTC | #9
On Thu, Feb 05, 2015 at 05:34:40PM +0000, Russell King - ARM Linux wrote:
> On Thu, Feb 05, 2015 at 09:02:28AM -0800, Paul E. McKenney wrote:
> > On Thu, Feb 05, 2015 at 04:11:00PM +0000, Russell King - ARM Linux wrote:
> > > On Thu, Feb 05, 2015 at 06:29:18AM -0800, Paul E. McKenney wrote:
> > > > Works for me, assuming no hidden uses of RCU in the IPI code.  ;-)
> > > 
> > > Sigh... I kind'a new it wouldn't be this simple.  The gic code which
> > > actually raises the IPI takes a raw spinlock, so it's not going to be
> > > this simple - there's a small theoretical window where we have taken
> > > this lock, written the register to send the IPI, and then dropped the
> > > lock - the update to the lock to release it could get lost if the
> > > CPU power is quickly cut at that point.
> > > 
> > > Also, we _do_ need the second cache flush in place to ensure that the
> > > unlock is seen to other CPUs.
> > > 
> > > We could work around that by taking and releasing the lock in the IPI
> > > processing function... but this is starting to look less attractive
> > > as the lock is private to irq-gic.c.
> > > 
> > > Well, we're very close to 3.19, we're too close to be trying to sort
> > > this out, so I'm hoping that your changes which cause this RCU error
> > > are *not* going in during this merge window, because we seem to have
> > > something of a problem right now which needs more time to resolve.
> > 
> > Most likely into the 3.20 merge window.  But please keep in mind that
> > RCU is just the messenger here -- the current code will break if any
> > CPU for whatever reason takes more than a jiffy to get from its
> > _stop_machine() handler to the end of its last RCU read-side critical
> > section on its way out.  A jiffy may sound like a lot, but it is not
> > hard to exceed this limit, especially in virtualized environments.
> 
> What I'm saying is that we can't likely get a good fix prepared before
> the 3.20 merge window opens.

And I cannot count.  Or cannot type.  Or something.

I meant do say "Most likely into the 3.21 merge window."  I agree that
this stuff is not ready for next week's merge window.  For one thing,
there are similar issues with a number of other architectures as well.

> I don't term the set_bit/clear_bit solution a "good fix" because it is
> far too complex - I've not done a thorough review on it, but the idea
> of setting and clearing a couple of bits in unison, making sure that
> their state is set appropriately through multiple different code paths
> does not strike me as a provably correct replacement for this completion.
> The reason for that complexity is because there is no pre-notification
> to arch code that a CPU might be going down, so there's no way for the
> "CPU is dead" flag to be properly reset (which is why there's all the
> manipulation in lots of possible failure paths.)
> 
> The idea that we could reset it in the CPU up code doesn't fly - that
> would only work if we had one secondary CPU (which would guarantee a
> strict up/down/up ordering on it) but as soon as you have more than one
> CPU, that doesn't hold true.
> 
> We could hook into the CPU hotplug notifiers - which would be quite a
> lot of additional code to achieve the reset early enough in the hot
> unplug path, though it would probably be the most reliable solution to
> the wait-for-bit solution.
> 
> However, any of those solutions needs writing and thorough testing,
> which, if Linus opens the merge window on Sunday, isn't going to
> happen before hand (and we know Linus doesn't like extra development
> appearing which wasn't in -next prior to the merge window - he's taken
> snapshots of -next to check during the merge window in the past - so
> it's not something I'm going to be adding during that time, not even
> as a "fix" because we know about the problem right now, before the
> merge window.  To me, to treat this as a "fix" would be wilfully
> deceitful.)
> 
> I don't think the existing code is a big problem at the moment - it's
> been like this for about 10 years, and no one has ever reported an
> issue with it, although there have been changes over that time:
> 
> aa033810461ee56abbef6cef10aabd6b97f5caee
> ARM: smp: Drop RCU_NONIDLE usage in cpu_die()
> 
> 	This removed the RCU_NONIDLE() from the completion() call.
> 
> ff081e05bfba3461119cd280201d163b6858eda2
> ARM: 7457/1: smp: Fix suspicious RCU originating from cpu_die()
> 
> 	This added the RCU_NONIDLE() to the completion() call.
> 
> 3c030beabf937b1d3b4ecaedfd1fb2f1e2aa0c70
> ARM: CPU hotplug: move cpu_killed completion to core code
> 
> 	This moved the completion code from Realview (and other ARM
> 	platforms) into core ARM code.
> 
> and 97a63ecff4bd06da5d8feb8c0394a4d020f2d34d
> [ARM SMP] Add CPU hotplug support for Realview MPcore
> 
> 	The earliest current introduction of CPU hotplug in 2005.

Agreed.  It does need to be fixed, but it makes sense to take a few
weeks and get a fix that is more likely to be correct.

							Thanx, Paul
Stephen Boyd Feb. 10, 2015, 1:24 a.m. UTC | #10
On 02/05/15 08:11, Russell King - ARM Linux wrote:
> On Thu, Feb 05, 2015 at 06:29:18AM -0800, Paul E. McKenney wrote:
>> Works for me, assuming no hidden uses of RCU in the IPI code.  ;-)
> Sigh... I kind'a new it wouldn't be this simple.  The gic code which
> actually raises the IPI takes a raw spinlock, so it's not going to be
> this simple - there's a small theoretical window where we have taken
> this lock, written the register to send the IPI, and then dropped the
> lock - the update to the lock to release it could get lost if the
> CPU power is quickly cut at that point.

Hm.. at first glance it would seem like a similar problem exists with
the completion variable. But it seems that we rely on the call to
complete() fom the dying CPU to synchronize with wait_for_completion()
on the killing CPU via the completion's wait.lock.

void complete(struct completion *x)
{
        unsigned long flags;

        spin_lock_irqsave(&x->wait.lock, flags);
        x->done++;
        __wake_up_locked(&x->wait, TASK_NORMAL, 1);
        spin_unlock_irqrestore(&x->wait.lock, flags);
}

and

static inline long __sched
do_wait_for_common(struct completion *x,
                  long (*action)(long), long timeout, int state)
                        ...
			spin_unlock_irq(&x->wait.lock);
			timeout = action(timeout);
			spin_lock_irq(&x->wait.lock);


so the power can't really be cut until the killing CPU sees the lock
released either explicitly via the second cache flush in cpu_die() or
implicitly via hardware. Maybe we can do the same thing here by using a
spinlock for synchronization between the IPI handler and the dying CPU?
So lock/unlock around the IPI sending from the dying CPU and then do a
lock/unlock on the killing CPU before continuing.

It would be nice if we didn't have to do anything at all though so
perhaps we can make it a nop on configs where there isn't a big little
switcher. Yeah it's some ugly coupling between these two pieces of code,
but I'm not sure how we can do better.

>
> Also, we _do_ need the second cache flush in place to ensure that the
> unlock is seen to other CPUs.
>
> We could work around that by taking and releasing the lock in the IPI
> processing function... but this is starting to look less attractive
> as the lock is private to irq-gic.c.

With Daniel Thompson's NMI fiq patches at least the lock would almost
always be gone, except for the bL switcher users. Another solution might
be to put a hotplug lock around the bL switcher code and then skip
taking the lock in gic_raise_softirq() if the IPI is our special hotplug
one. Conditional locking is pretty ugly though, so perhaps this isn't
such a great idea.
Paul E. McKenney Feb. 10, 2015, 1:37 a.m. UTC | #11
On Mon, Feb 09, 2015 at 05:24:08PM -0800, Stephen Boyd wrote:
> On 02/05/15 08:11, Russell King - ARM Linux wrote:
> > On Thu, Feb 05, 2015 at 06:29:18AM -0800, Paul E. McKenney wrote:
> >> Works for me, assuming no hidden uses of RCU in the IPI code.  ;-)
> > Sigh... I kind'a new it wouldn't be this simple.  The gic code which
> > actually raises the IPI takes a raw spinlock, so it's not going to be
> > this simple - there's a small theoretical window where we have taken
> > this lock, written the register to send the IPI, and then dropped the
> > lock - the update to the lock to release it could get lost if the
> > CPU power is quickly cut at that point.
> 
> Hm.. at first glance it would seem like a similar problem exists with
> the completion variable. But it seems that we rely on the call to
> complete() fom the dying CPU to synchronize with wait_for_completion()
> on the killing CPU via the completion's wait.lock.
> 
> void complete(struct completion *x)
> {
>         unsigned long flags;
> 
>         spin_lock_irqsave(&x->wait.lock, flags);
>         x->done++;
>         __wake_up_locked(&x->wait, TASK_NORMAL, 1);
>         spin_unlock_irqrestore(&x->wait.lock, flags);
> }
> 
> and
> 
> static inline long __sched
> do_wait_for_common(struct completion *x,
>                   long (*action)(long), long timeout, int state)
>                         ...
> 			spin_unlock_irq(&x->wait.lock);
> 			timeout = action(timeout);
> 			spin_lock_irq(&x->wait.lock);
> 
> 
> so the power can't really be cut until the killing CPU sees the lock
> released either explicitly via the second cache flush in cpu_die() or
> implicitly via hardware. Maybe we can do the same thing here by using a
> spinlock for synchronization between the IPI handler and the dying CPU?
> So lock/unlock around the IPI sending from the dying CPU and then do a
> lock/unlock on the killing CPU before continuing.
> 
> It would be nice if we didn't have to do anything at all though so
> perhaps we can make it a nop on configs where there isn't a big little
> switcher. Yeah it's some ugly coupling between these two pieces of code,
> but I'm not sure how we can do better.

The default ugly-but-known-to-work approach is to set a variable in
the dying CPU that the surviving CPU periodically polls.  If all else
fails and all that.

> > Also, we _do_ need the second cache flush in place to ensure that the
> > unlock is seen to other CPUs.
> >
> > We could work around that by taking and releasing the lock in the IPI
> > processing function... but this is starting to look less attractive
> > as the lock is private to irq-gic.c.
> 
> With Daniel Thompson's NMI fiq patches at least the lock would almost
> always be gone, except for the bL switcher users. Another solution might
> be to put a hotplug lock around the bL switcher code and then skip
> taking the lock in gic_raise_softirq() if the IPI is our special hotplug
> one. Conditional locking is pretty ugly though, so perhaps this isn't
> such a great idea.

Which hotplug lock are you suggesting?  We cannot use sleeplocks, because
releasing them can go through the scheduler, which is not legal at this
point.

							Thanx, Paul
Stephen Boyd Feb. 10, 2015, 2:05 a.m. UTC | #12
On 02/09/15 17:37, Paul E. McKenney wrote:
> On Mon, Feb 09, 2015 at 05:24:08PM -0800, Stephen Boyd wrote:
>> On 02/05/15 08:11, Russell King - ARM Linux wrote:
>>> On Thu, Feb 05, 2015 at 06:29:18AM -0800, Paul E. McKenney wrote:
>>>> Works for me, assuming no hidden uses of RCU in the IPI code.  ;-)
>>> Sigh... I kind'a new it wouldn't be this simple.  The gic code which
>>> actually raises the IPI takes a raw spinlock, so it's not going to be
>>> this simple - there's a small theoretical window where we have taken
>>> this lock, written the register to send the IPI, and then dropped the
>>> lock - the update to the lock to release it could get lost if the
>>> CPU power is quickly cut at that point.
>> Hm.. at first glance it would seem like a similar problem exists with
>> the completion variable. But it seems that we rely on the call to
>> complete() fom the dying CPU to synchronize with wait_for_completion()
>> on the killing CPU via the completion's wait.lock.
>>
>> void complete(struct completion *x)
>> {
>>         unsigned long flags;
>>
>>         spin_lock_irqsave(&x->wait.lock, flags);
>>         x->done++;
>>         __wake_up_locked(&x->wait, TASK_NORMAL, 1);
>>         spin_unlock_irqrestore(&x->wait.lock, flags);
>> }
>>
>> and
>>
>> static inline long __sched
>> do_wait_for_common(struct completion *x,
>>                   long (*action)(long), long timeout, int state)
>>                         ...
>> 			spin_unlock_irq(&x->wait.lock);
>> 			timeout = action(timeout);
>> 			spin_lock_irq(&x->wait.lock);
>>
>>
>> so the power can't really be cut until the killing CPU sees the lock
>> released either explicitly via the second cache flush in cpu_die() or
>> implicitly via hardware. Maybe we can do the same thing here by using a
>> spinlock for synchronization between the IPI handler and the dying CPU?
>> So lock/unlock around the IPI sending from the dying CPU and then do a
>> lock/unlock on the killing CPU before continuing.
>>
>> It would be nice if we didn't have to do anything at all though so
>> perhaps we can make it a nop on configs where there isn't a big little
>> switcher. Yeah it's some ugly coupling between these two pieces of code,
>> but I'm not sure how we can do better.
> The default ugly-but-known-to-work approach is to set a variable in
> the dying CPU that the surviving CPU periodically polls.  If all else
> fails and all that.

So it isn't the ugliest. Good.

>
>>> Also, we _do_ need the second cache flush in place to ensure that the
>>> unlock is seen to other CPUs.
>>>
>>> We could work around that by taking and releasing the lock in the IPI
>>> processing function... but this is starting to look less attractive
>>> as the lock is private to irq-gic.c.
>> With Daniel Thompson's NMI fiq patches at least the lock would almost
>> always be gone, except for the bL switcher users. Another solution might
>> be to put a hotplug lock around the bL switcher code and then skip
>> taking the lock in gic_raise_softirq() if the IPI is our special hotplug
>> one. Conditional locking is pretty ugly though, so perhaps this isn't
>> such a great idea.
> Which hotplug lock are you suggesting?  We cannot use sleeplocks, because
> releasing them can go through the scheduler, which is not legal at this
> point.
>

I'm not suggesting we take any hotplug locks here in the cpu_die() path.
I'm thinking we make the bL switcher code hold a hotplug lock or at
least prevent hotplug from happening while it's moving IPIs from the
outgoing CPU to the incoming CPU (see code in
arch/arm/common/bL_switcher.c). Actually, I seem to recall that hotplug
can't happen if preemption/irqs are disabled, so maybe nothing needs to
change there and we can just assume that if we're sending the hotplug
IPI we don't need to worry about taking the spinlock in
gic_raise_softirq()? We still have conditional locking so it's still
fragile.
Paul E. McKenney Feb. 10, 2015, 3:05 a.m. UTC | #13
On Mon, Feb 09, 2015 at 06:05:55PM -0800, Stephen Boyd wrote:
> On 02/09/15 17:37, Paul E. McKenney wrote:
> > On Mon, Feb 09, 2015 at 05:24:08PM -0800, Stephen Boyd wrote:
> >> On 02/05/15 08:11, Russell King - ARM Linux wrote:
> >>> On Thu, Feb 05, 2015 at 06:29:18AM -0800, Paul E. McKenney wrote:
> >>>> Works for me, assuming no hidden uses of RCU in the IPI code.  ;-)
> >>> Sigh... I kind'a new it wouldn't be this simple.  The gic code which
> >>> actually raises the IPI takes a raw spinlock, so it's not going to be
> >>> this simple - there's a small theoretical window where we have taken
> >>> this lock, written the register to send the IPI, and then dropped the
> >>> lock - the update to the lock to release it could get lost if the
> >>> CPU power is quickly cut at that point.
> >> Hm.. at first glance it would seem like a similar problem exists with
> >> the completion variable. But it seems that we rely on the call to
> >> complete() fom the dying CPU to synchronize with wait_for_completion()
> >> on the killing CPU via the completion's wait.lock.
> >>
> >> void complete(struct completion *x)
> >> {
> >>         unsigned long flags;
> >>
> >>         spin_lock_irqsave(&x->wait.lock, flags);
> >>         x->done++;
> >>         __wake_up_locked(&x->wait, TASK_NORMAL, 1);
> >>         spin_unlock_irqrestore(&x->wait.lock, flags);
> >> }
> >>
> >> and
> >>
> >> static inline long __sched
> >> do_wait_for_common(struct completion *x,
> >>                   long (*action)(long), long timeout, int state)
> >>                         ...
> >> 			spin_unlock_irq(&x->wait.lock);
> >> 			timeout = action(timeout);
> >> 			spin_lock_irq(&x->wait.lock);
> >>
> >>
> >> so the power can't really be cut until the killing CPU sees the lock
> >> released either explicitly via the second cache flush in cpu_die() or
> >> implicitly via hardware. Maybe we can do the same thing here by using a
> >> spinlock for synchronization between the IPI handler and the dying CPU?
> >> So lock/unlock around the IPI sending from the dying CPU and then do a
> >> lock/unlock on the killing CPU before continuing.
> >>
> >> It would be nice if we didn't have to do anything at all though so
> >> perhaps we can make it a nop on configs where there isn't a big little
> >> switcher. Yeah it's some ugly coupling between these two pieces of code,
> >> but I'm not sure how we can do better.
> > The default ugly-but-known-to-work approach is to set a variable in
> > the dying CPU that the surviving CPU periodically polls.  If all else
> > fails and all that.
> 
> So it isn't the ugliest. Good.

Woo-hoo!!!  Something to aspire to!  ;-)

> >>> Also, we _do_ need the second cache flush in place to ensure that the
> >>> unlock is seen to other CPUs.
> >>>
> >>> We could work around that by taking and releasing the lock in the IPI
> >>> processing function... but this is starting to look less attractive
> >>> as the lock is private to irq-gic.c.
> >> With Daniel Thompson's NMI fiq patches at least the lock would almost
> >> always be gone, except for the bL switcher users. Another solution might
> >> be to put a hotplug lock around the bL switcher code and then skip
> >> taking the lock in gic_raise_softirq() if the IPI is our special hotplug
> >> one. Conditional locking is pretty ugly though, so perhaps this isn't
> >> such a great idea.
> > Which hotplug lock are you suggesting?  We cannot use sleeplocks, because
> > releasing them can go through the scheduler, which is not legal at this
> > point.
> >
> 
> I'm not suggesting we take any hotplug locks here in the cpu_die() path.
> I'm thinking we make the bL switcher code hold a hotplug lock or at
> least prevent hotplug from happening while it's moving IPIs from the
> outgoing CPU to the incoming CPU (see code in
> arch/arm/common/bL_switcher.c). Actually, I seem to recall that hotplug
> can't happen if preemption/irqs are disabled, so maybe nothing needs to
> change there and we can just assume that if we're sending the hotplug
> IPI we don't need to worry about taking the spinlock in
> gic_raise_softirq()? We still have conditional locking so it's still
> fragile.

More precisely, if you are running on a given CPU with preemption disabled
(and disabling irqs disables preemption), then that CPU cannot go offline.
On the other hand, some other CPU may have already been partway offline,
and if it is far enough along in that process, it might well go the rest
of the way offline during the time your CPU is running with preemption
disabled.

Does that help?

							Thanx, Paul
Mark Rutland Feb. 10, 2015, 3:14 p.m. UTC | #14
On Tue, Feb 10, 2015 at 01:24:08AM +0000, Stephen Boyd wrote:
> On 02/05/15 08:11, Russell King - ARM Linux wrote:
> > On Thu, Feb 05, 2015 at 06:29:18AM -0800, Paul E. McKenney wrote:
> >> Works for me, assuming no hidden uses of RCU in the IPI code.  ;-)
> > Sigh... I kind'a new it wouldn't be this simple.  The gic code which
> > actually raises the IPI takes a raw spinlock, so it's not going to be
> > this simple - there's a small theoretical window where we have taken
> > this lock, written the register to send the IPI, and then dropped the
> > lock - the update to the lock to release it could get lost if the
> > CPU power is quickly cut at that point.
> 
> Hm.. at first glance it would seem like a similar problem exists with
> the completion variable. But it seems that we rely on the call to
> complete() fom the dying CPU to synchronize with wait_for_completion()
> on the killing CPU via the completion's wait.lock.
> 
> void complete(struct completion *x)
> {
>         unsigned long flags;
> 
>         spin_lock_irqsave(&x->wait.lock, flags);
>         x->done++;
>         __wake_up_locked(&x->wait, TASK_NORMAL, 1);
>         spin_unlock_irqrestore(&x->wait.lock, flags);
> }
> 
> and
> 
> static inline long __sched
> do_wait_for_common(struct completion *x,
>                   long (*action)(long), long timeout, int state)
>                         ...
> 			spin_unlock_irq(&x->wait.lock);
> 			timeout = action(timeout);
> 			spin_lock_irq(&x->wait.lock);
> 
> 
> so the power can't really be cut until the killing CPU sees the lock
> released either explicitly via the second cache flush in cpu_die() or
> implicitly via hardware.

That sounds about right, though surely cache flush is irrelevant w.r.t.
publishing of the unlock? The dsb(ishst) in the unlock path will ensure
that the write is visibile prior to the second flush_cache_louis().

That said, we _do_ need to flush the cache prior to the CPU being
killed, or we can lose any (shared) dirty cache lines the CPU owns. In
the presence of dirty cacheline migration we need to be sure the CPU to
be killed doesn't acquire any lines prior to being killed (i.e. its
caches need to be off and flushed). Given that I don't think it's
feasible to perform an IPI.

I think we need to move the synchronisation down into the
cpu_ops::{cpu_die,cpu_kill} implementations, so that we can have the
dying CPU signal readiness after it has disabled and flushed its caches.

If the CPU can kill itself and we can query the state of the CPU, then
the dying CPU needs to do nothing, and cpu_kill can just poll until it
is dead. If the CPU needs to be killed from another CPU, it can update a
(cacheline-padded) percpu variable that cpu_kill can poll (cleaning
before each read).

> Maybe we can do the same thing here by using a
> spinlock for synchronization between the IPI handler and the dying CPU?
> So lock/unlock around the IPI sending from the dying CPU and then do a
> lock/unlock on the killing CPU before continuing.
> 
> It would be nice if we didn't have to do anything at all though so
> perhaps we can make it a nop on configs where there isn't a big little
> switcher. Yeah it's some ugly coupling between these two pieces of code,
> but I'm not sure how we can do better.

I'm missing something here. What does the switcher have to do with this?

> > Also, we _do_ need the second cache flush in place to ensure that the
> > unlock is seen to other CPUs.
> >
> > We could work around that by taking and releasing the lock in the IPI
> > processing function... but this is starting to look less attractive
> > as the lock is private to irq-gic.c.
> 
> With Daniel Thompson's NMI fiq patches at least the lock would almost
> always be gone, except for the bL switcher users. Another solution might
> be to put a hotplug lock around the bL switcher code and then skip
> taking the lock in gic_raise_softirq() if the IPI is our special hotplug
> one. Conditional locking is pretty ugly though, so perhaps this isn't
> such a great idea.

There are also SMP platforms without a GIC (e.g. hip04) that would need
similar modifications to their interrupt controller drivers, which is
going to be painful.

Thanks,
Mark.
Russell King - ARM Linux Feb. 10, 2015, 3:41 p.m. UTC | #15
On Mon, Feb 09, 2015 at 05:24:08PM -0800, Stephen Boyd wrote:
> On 02/05/15 08:11, Russell King - ARM Linux wrote:
> > On Thu, Feb 05, 2015 at 06:29:18AM -0800, Paul E. McKenney wrote:
> >> Works for me, assuming no hidden uses of RCU in the IPI code.  ;-)
> > Sigh... I kind'a new it wouldn't be this simple.  The gic code which
> > actually raises the IPI takes a raw spinlock, so it's not going to be
> > this simple - there's a small theoretical window where we have taken
> > this lock, written the register to send the IPI, and then dropped the
> > lock - the update to the lock to release it could get lost if the
> > CPU power is quickly cut at that point.
> 
> Hm.. at first glance it would seem like a similar problem exists with
> the completion variable. But it seems that we rely on the call to
> complete() fom the dying CPU to synchronize with wait_for_completion()
> on the killing CPU via the completion's wait.lock.
> 
> void complete(struct completion *x)
> {
>         unsigned long flags;
> 
>         spin_lock_irqsave(&x->wait.lock, flags);
>         x->done++;
>         __wake_up_locked(&x->wait, TASK_NORMAL, 1);
>         spin_unlock_irqrestore(&x->wait.lock, flags);
> }
> 
> and
> 
> static inline long __sched
> do_wait_for_common(struct completion *x,
>                   long (*action)(long), long timeout, int state)
>                         ...
> 			spin_unlock_irq(&x->wait.lock);
> 			timeout = action(timeout);
> 			spin_lock_irq(&x->wait.lock);
> 
> 
> so the power can't really be cut until the killing CPU sees the lock
> released either explicitly via the second cache flush in cpu_die() or
> implicitly via hardware.

Correct - so the caller of wait_for_completion_timeout() needs to
re-acquire the cache line after the complete() in order to return
successfully - which means that the spin_unlock_irqrestore() on the
dying CPU _must_ have become visible to other observers for the
requesting CPU to proceed.

> Maybe we can do the same thing here by using a
> spinlock for synchronization between the IPI handler and the dying CPU?
> So lock/unlock around the IPI sending from the dying CPU and then do a
> lock/unlock on the killing CPU before continuing.

It would be nice, but it means exporting irq_controller_lock from irq_gic.c.
It's doable, but it's really not nice - it creates a layering issue, buy
making arch/arm/kernel/smp.c depend on symbols exported from
drivers/irqchip/irq-gic.c.

> > Also, we _do_ need the second cache flush in place to ensure that the
> > unlock is seen to other CPUs.
> >
> > We could work around that by taking and releasing the lock in the IPI
> > processing function... but this is starting to look less attractive
> > as the lock is private to irq-gic.c.
> 
> With Daniel Thompson's NMI fiq patches at least the lock would almost
> always be gone, except for the bL switcher users. Another solution might
> be to put a hotplug lock around the bL switcher code and then skip
> taking the lock in gic_raise_softirq() if the IPI is our special hotplug
> one. Conditional locking is pretty ugly though, so perhaps this isn't
> such a great idea.

I haven't even thought about the implications of that yet. :)  We need to
fix the already existing in-kernel code before we consider not-yet-merged
code.
Stephen Boyd Feb. 10, 2015, 8:48 p.m. UTC | #16
On 02/10/15 07:14, Mark Rutland wrote:
> On Tue, Feb 10, 2015 at 01:24:08AM +0000, Stephen Boyd wrote:
>> On 02/05/15 08:11, Russell King - ARM Linux wrote:
>>> On Thu, Feb 05, 2015 at 06:29:18AM -0800, Paul E. McKenney wrote:
>>>> Works for me, assuming no hidden uses of RCU in the IPI code.  ;-)
>>> Sigh... I kind'a new it wouldn't be this simple.  The gic code which
>>> actually raises the IPI takes a raw spinlock, so it's not going to be
>>> this simple - there's a small theoretical window where we have taken
>>> this lock, written the register to send the IPI, and then dropped the
>>> lock - the update to the lock to release it could get lost if the
>>> CPU power is quickly cut at that point.
>> Hm.. at first glance it would seem like a similar problem exists with
>> the completion variable. But it seems that we rely on the call to
>> complete() fom the dying CPU to synchronize with wait_for_completion()
>> on the killing CPU via the completion's wait.lock.
>>
>> void complete(struct completion *x)
>> {
>>         unsigned long flags;
>>
>>         spin_lock_irqsave(&x->wait.lock, flags);
>>         x->done++;
>>         __wake_up_locked(&x->wait, TASK_NORMAL, 1);
>>         spin_unlock_irqrestore(&x->wait.lock, flags);
>> }
>>
>> and
>>
>> static inline long __sched
>> do_wait_for_common(struct completion *x,
>>                   long (*action)(long), long timeout, int state)
>>                         ...
>> 			spin_unlock_irq(&x->wait.lock);
>> 			timeout = action(timeout);
>> 			spin_lock_irq(&x->wait.lock);
>>
>>
>> so the power can't really be cut until the killing CPU sees the lock
>> released either explicitly via the second cache flush in cpu_die() or
>> implicitly via hardware.
> That sounds about right, though surely cache flush is irrelevant w.r.t.
> publishing of the unlock? The dsb(ishst) in the unlock path will ensure
> that the write is visibile prior to the second flush_cache_louis().

Ah right. I was incorrectly thinking that the CPU had already disabled
coherency at this point.

>
> That said, we _do_ need to flush the cache prior to the CPU being
> killed, or we can lose any (shared) dirty cache lines the CPU owns. In
> the presence of dirty cacheline migration we need to be sure the CPU to
> be killed doesn't acquire any lines prior to being killed (i.e. its
> caches need to be off and flushed). Given that I don't think it's
> feasible to perform an IPI.

The IPI/completion sounds nice because it allows the killing CPU to
schedule and do other work until the dying CPU notifies that it's almost
dead.

>
> I think we need to move the synchronisation down into the
> cpu_ops::{cpu_die,cpu_kill} implementations, so that we can have the
> dying CPU signal readiness after it has disabled and flushed its caches.
>
> If the CPU can kill itself and we can query the state of the CPU, then
> the dying CPU needs to do nothing, and cpu_kill can just poll until it
> is dead. If the CPU needs to be killed from another CPU, it can update a
> (cacheline-padded) percpu variable that cpu_kill can poll (cleaning
> before each read).

How about a hybrid approach where we send the IPI from generic cpu_die()
and then do the cacheline-padded bit poll + invalidate and bit set? That
way we get the benefit of not doing that poll until we really need to
and if we need to do it at all.

cpu_kill | cpu_die | IPI | bit poll 
---------+---------+-----+----------
    Y    |    Y    |  Y  |   N
    N    |    Y    |  Y  |   Y
    Y    |    N    |  ?  |   ?    <-- Is this a valid configuration?
    N    |    N    |  N  |   N    <-- Hotplug should be disabled


If the hardware doesn't have a synchronization method in row 1 we can
expose the bit polling functionality to the ops so that they can set and
poll the bit. It looks like rockchip would need this because we just
pull the power in cpu_kill without any synchronization. Unfortunately
this is starting to sound like a fairly large patch to backport.

Aside: For that last row we really should be setting cpu->hotpluggable
in struct cpu based on what cpu_ops::cpu_disable returns (from what I
can tell we use that op to indicate if a CPU can be hotplugged).

Double Aside: It seems that exynos has a bug with coherency.
exynos_cpu_die() calls v7_exit_coherency_flush() and then calls
exynos_cpu_power_down() which may call of_machine_is_compatible() and
that function will grab and release a kref which uses ldrex/strex for
atomics after we've disabled coherency in v7_exit_coherency_flush().

>> Maybe we can do the same thing here by using a
>> spinlock for synchronization between the IPI handler and the dying CPU?
>> So lock/unlock around the IPI sending from the dying CPU and then do a
>> lock/unlock on the killing CPU before continuing.
>>
>> It would be nice if we didn't have to do anything at all though so
>> perhaps we can make it a nop on configs where there isn't a big little
>> switcher. Yeah it's some ugly coupling between these two pieces of code,
>> but I'm not sure how we can do better.
> I'm missing something here. What does the switcher have to do with this?

The switcher is the reason we have a spinlock in gic_raise_softirq().
That's the problem that Russell was mentioning.
Russell King - ARM Linux Feb. 10, 2015, 8:58 p.m. UTC | #17
On Tue, Feb 10, 2015 at 03:14:16PM +0000, Mark Rutland wrote:
> That sounds about right, though surely cache flush is irrelevant w.r.t.
> publishing of the unlock? The dsb(ishst) in the unlock path will ensure
> that the write is visibile prior to the second flush_cache_louis().
> 
> That said, we _do_ need to flush the cache prior to the CPU being
> killed, or we can lose any (shared) dirty cache lines the CPU owns. In
> the presence of dirty cacheline migration we need to be sure the CPU to
> be killed doesn't acquire any lines prior to being killed (i.e. its
> caches need to be off and flushed). Given that I don't think it's
> feasible to perform an IPI.
> 
> I think we need to move the synchronisation down into the
> cpu_ops::{cpu_die,cpu_kill} implementations, so that we can have the
> dying CPU signal readiness after it has disabled and flushed its caches.

Absolutely not - we're not having lots of implementations of the same
fscking thing.  We need this basic stuff to be in the generic code, not
repeated some hundreds of times across every single SMP implementation
that's out there.
Mark Rutland Feb. 13, 2015, 3:38 p.m. UTC | #18
On Tue, Feb 10, 2015 at 08:48:18PM +0000, Stephen Boyd wrote:
> On 02/10/15 07:14, Mark Rutland wrote:
> > On Tue, Feb 10, 2015 at 01:24:08AM +0000, Stephen Boyd wrote:
> >> On 02/05/15 08:11, Russell King - ARM Linux wrote:
> >>> On Thu, Feb 05, 2015 at 06:29:18AM -0800, Paul E. McKenney wrote:
> >>>> Works for me, assuming no hidden uses of RCU in the IPI code.  ;-)
> >>> Sigh... I kind'a new it wouldn't be this simple.  The gic code which
> >>> actually raises the IPI takes a raw spinlock, so it's not going to be
> >>> this simple - there's a small theoretical window where we have taken
> >>> this lock, written the register to send the IPI, and then dropped the
> >>> lock - the update to the lock to release it could get lost if the
> >>> CPU power is quickly cut at that point.
> >> Hm.. at first glance it would seem like a similar problem exists with
> >> the completion variable. But it seems that we rely on the call to
> >> complete() fom the dying CPU to synchronize with wait_for_completion()
> >> on the killing CPU via the completion's wait.lock.
> >>
> >> void complete(struct completion *x)
> >> {
> >>         unsigned long flags;
> >>
> >>         spin_lock_irqsave(&x->wait.lock, flags);
> >>         x->done++;
> >>         __wake_up_locked(&x->wait, TASK_NORMAL, 1);
> >>         spin_unlock_irqrestore(&x->wait.lock, flags);
> >> }
> >>
> >> and
> >>
> >> static inline long __sched
> >> do_wait_for_common(struct completion *x,
> >>                   long (*action)(long), long timeout, int state)
> >>                         ...
> >> 			spin_unlock_irq(&x->wait.lock);
> >> 			timeout = action(timeout);
> >> 			spin_lock_irq(&x->wait.lock);
> >>
> >>
> >> so the power can't really be cut until the killing CPU sees the lock
> >> released either explicitly via the second cache flush in cpu_die() or
> >> implicitly via hardware.
> > That sounds about right, though surely cache flush is irrelevant w.r.t.
> > publishing of the unlock? The dsb(ishst) in the unlock path will ensure
> > that the write is visibile prior to the second flush_cache_louis().
> 
> Ah right. I was incorrectly thinking that the CPU had already disabled
> coherency at this point.
> 
> >
> > That said, we _do_ need to flush the cache prior to the CPU being
> > killed, or we can lose any (shared) dirty cache lines the CPU owns. In
> > the presence of dirty cacheline migration we need to be sure the CPU to
> > be killed doesn't acquire any lines prior to being killed (i.e. its
> > caches need to be off and flushed). Given that I don't think it's
> > feasible to perform an IPI.
> 
> The IPI/completion sounds nice because it allows the killing CPU to
> schedule and do other work until the dying CPU notifies that it's almost
> dead.

Ok. My main concern was that it's not sufficient to know that a CPU is
ready to die, but I guess it may signal that it is close.

> > I think we need to move the synchronisation down into the
> > cpu_ops::{cpu_die,cpu_kill} implementations, so that we can have the
> > dying CPU signal readiness after it has disabled and flushed its caches.
> >
> > If the CPU can kill itself and we can query the state of the CPU, then
> > the dying CPU needs to do nothing, and cpu_kill can just poll until it
> > is dead. If the CPU needs to be killed from another CPU, it can update a
> > (cacheline-padded) percpu variable that cpu_kill can poll (cleaning
> > before each read).
> 
> How about a hybrid approach where we send the IPI from generic cpu_die()
> and then do the cacheline-padded bit poll + invalidate and bit set? That
> way we get the benefit of not doing that poll until we really need to
> and if we need to do it at all.

That sounds sane to me.

> cpu_kill | cpu_die | IPI | bit poll 
> ---------+---------+-----+----------
>     Y    |    Y    |  Y  |   N
>     N    |    Y    |  Y  |   Y
>     Y    |    N    |  ?  |   ?    <-- Is this a valid configuration?
>     N    |    N    |  N  |   N    <-- Hotplug should be disabled
> 
> 
> If the hardware doesn't have a synchronization method in row 1 we can
> expose the bit polling functionality to the ops so that they can set and
> poll the bit. It looks like rockchip would need this because we just
> pull the power in cpu_kill without any synchronization. Unfortunately
> this is starting to sound like a fairly large patch to backport.

Oh, fun.

> Aside: For that last row we really should be setting cpu->hotpluggable
> in struct cpu based on what cpu_ops::cpu_disable returns (from what I
> can tell we use that op to indicate if a CPU can be hotplugged).
> 
> Double Aside: It seems that exynos has a bug with coherency.
> exynos_cpu_die() calls v7_exit_coherency_flush() and then calls
> exynos_cpu_power_down() which may call of_machine_is_compatible() and
> that function will grab and release a kref which uses ldrex/strex for
> atomics after we've disabled coherency in v7_exit_coherency_flush().
> 
> >> Maybe we can do the same thing here by using a
> >> spinlock for synchronization between the IPI handler and the dying CPU?
> >> So lock/unlock around the IPI sending from the dying CPU and then do a
> >> lock/unlock on the killing CPU before continuing.
> >>
> >> It would be nice if we didn't have to do anything at all though so
> >> perhaps we can make it a nop on configs where there isn't a big little
> >> switcher. Yeah it's some ugly coupling between these two pieces of code,
> >> but I'm not sure how we can do better.
> > I'm missing something here. What does the switcher have to do with this?
> 
> The switcher is the reason we have a spinlock in gic_raise_softirq().
> That's the problem that Russell was mentioning.

Ah. Thanks for the pointer.

Thanks,
Mark.
Russell King - ARM Linux Feb. 25, 2015, 12:56 p.m. UTC | #19
On Thu, Feb 05, 2015 at 04:11:00PM +0000, Russell King - ARM Linux wrote:
> On Thu, Feb 05, 2015 at 06:29:18AM -0800, Paul E. McKenney wrote:
> > Works for me, assuming no hidden uses of RCU in the IPI code.  ;-)
> 
> Sigh... I kind'a new it wouldn't be this simple.  The gic code which
> actually raises the IPI takes a raw spinlock, so it's not going to be
> this simple - there's a small theoretical window where we have taken
> this lock, written the register to send the IPI, and then dropped the
> lock - the update to the lock to release it could get lost if the
> CPU power is quickly cut at that point.
> 
> Also, we _do_ need the second cache flush in place to ensure that the
> unlock is seen to other CPUs.

It's time to start discussing this problem again now that we're the
other side of the merge window.

I've been thinking about the lock in the GIC code.  Do we actually need
this lock in gic_raise_softirq(), or could we move this lock into the
higher level code?

Let's consider the bL switcher.

I think the bL switcher is racy wrt CPU hotplug at the moment.  What
happens if we're sleeping in wait_for_completion(&inbound_alive) and
CPU hotplug unplugs the CPU outgoing CPU?  What protects us against
this scenario?  I can't see anything in bL_switch_to() which ensures
that CPU hotplug can't run.

Let's assume that this rather glaring bug has been fixed, and that CPU
hotplug can't run in parallel with the bL switcher (and hence
gic_migrate_target() can't run concurrently with a CPU being taken
offline.)

If we have that guarantee, then we don't need to take a lock when sending
the completion IPI - we would know that while a CPU is being taken down,
the bL switcher could not run.  What we now need is a lock-less way to
raise an IPI.

Now, is the locking between the normal IPI paths and the bL switcher
something that is specific to the interrupt controller, or should generic
code care about it?  I think it's something generic code should care about
- and I believe that would make life a little easier.

The current bL switcher idea is to bring the new CPU up, disable IRQs
and FIQs on the outgoing CPU, change the IRQ/IPI targets, then read
any pending SGIs and raise them on the new CPU.  But what about any
pending SPIs?  These look like they could be lost.

How about this for an idea instead - the bL switcher code:

- brings the new CPU online.
- disables IRQs and FIQs.
- takes the IPI lock, which prevents new IPIs being raised.
- re-targets IRQs and IPIs onto the new CPU.
- releases the IPI lock.
- re-enables IRQs and FIQs.
- polls the IRQ controller to wait for any remaining IRQs and IPIs
  to be delivered.
- re-disables IRQs and FIQs (which shouldn't be received anyway since
  they're now targetting the new CPU.)
- shuts down the tick device.
- completes the switch

What this means is that we're not needing to have complex code in the
interrupt controllers to re-raise interrupts on other CPUs, and we
don't need a lock in the interrupt controller code synchronising IPI
raising with the bL switcher.

I'd also suggest is that this IPI lock should _not_ be a spinlock - it
should be a read/write spin lock - it's perfectly acceptable to have
multiple CPUs raising IPIs to each other, but it is not acceptable to
have any CPU raising an IPI when the bL switcher is modifying the IRQ
targets.  That fits the rwlock semantics.

What this means is that gic_raise_softirq() should again become a lock-
less function, which opens the door to using an IPI to complete the
CPU hot-unplug operation cleanly.

Thoughts (especially from Nico)?
diff mbox

Patch

diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 194df2f1aa87..c623e27a9c85 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -73,6 +73,9 @@  enum ipi_msg_type {
 	IPI_IRQ_WORK,
 	IPI_COMPLETION,
 	IPI_CPU_BACKTRACE,
+#ifdef CONFIG_HOTPLUG_CPU
+	IPI_CPU_DEAD,
+#endif
 };
 
 /* For reliability, we're prepared to waste bits here. */
@@ -88,6 +91,14 @@  void __init smp_set_ops(struct smp_operations *ops)
 		smp_ops = *ops;
 };
 
+static void (*__smp_cross_call)(const struct cpumask *, unsigned int);
+
+void __init set_smp_cross_call(void (*fn)(const struct cpumask *, unsigned int))
+{
+	if (!__smp_cross_call)
+		__smp_cross_call = fn;
+}
+
 static unsigned long get_arch_pgd(pgd_t *pgd)
 {
 	phys_addr_t pgdir = virt_to_idmap(pgd);
@@ -267,19 +278,13 @@  void __ref cpu_die(void)
 	flush_cache_louis();
 
 	/*
-	 * Tell __cpu_die() that this CPU is now safe to dispose of.  Once
-	 * this returns, power and/or clocks can be removed at any point
+	 * Tell __cpu_die() that this CPU is now safe to dispose of.  We
+	 * do this via an IPI to any online CPU - it doesn't matter, we
+	 * just need another CPU to run the completion.  Once this IPI
+	 * has been sent, power and/or clocks can be removed at any point
 	 * from this CPU and its cache by platform_cpu_kill().
 	 */
-	complete(&cpu_died);
-
-	/*
-	 * Ensure that the cache lines associated with that completion are
-	 * written out.  This covers the case where _this_ CPU is doing the
-	 * powering down, to ensure that the completion is visible to the
-	 * CPU waiting for this one.
-	 */
-	flush_cache_louis();
+	__smp_cross_call(cpumask_of(cpumask_any(cpu_online_mask)), IPI_CPU_DEAD);
 
 	/*
 	 * The actual CPU shutdown procedure is at least platform (if not
@@ -442,14 +447,6 @@  void __init smp_prepare_cpus(unsigned int max_cpus)
 	}
 }
 
-static void (*__smp_cross_call)(const struct cpumask *, unsigned int);
-
-void __init set_smp_cross_call(void (*fn)(const struct cpumask *, unsigned int))
-{
-	if (!__smp_cross_call)
-		__smp_cross_call = fn;
-}
-
 static const char *ipi_types[NR_IPI] __tracepoint_string = {
 #define S(x,s)	[x] = s
 	S(IPI_WAKEUP, "CPU wakeup interrupts"),
@@ -648,6 +645,14 @@  void handle_IPI(int ipinr, struct pt_regs *regs)
 		irq_exit();
 		break;
 
+#ifdef CONFIG_HOTPLUG_CPU
+	case IPI_CPU_DEAD:
+		irq_enter();
+		complete(&cpu_died);
+		irq_exit();
+		break;
+#endif
+
 	default:
 		pr_crit("CPU%u: Unknown IPI message 0x%x\n",
 		        cpu, ipinr);