diff mbox

[5/5] xen: RCU: avoid busy waiting until the end of grace period.

Message ID 150114249858.22910.4601418126082976816.stgit@Solace (mailing list archive)
State New, archived
Headers show

Commit Message

Dario Faggioli July 27, 2017, 8:01 a.m. UTC
Instead of having the CPU where a callback is queued, busy
looping on rcu_pending(), use a timer.

In fact, we let the CPU go idle, but we program a timer
that will periodically wake it up, for checking whether the
grace period has actually ended.

It is kind of similar to introducing a periodic tick, but
with a much more limited scope, and a lot less overhead. In
fact, this timer is:
- only active for the CPU(s) that have callbacks queued,
  waiting for the end of a grace period;
- only active when those CPU(s) are idle (and stopped as
  soon as they resume execution).

Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
---
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Julien Grall <julien.grall@arm.com>
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
---
 xen/arch/arm/domain.c         |    4 ++-
 xen/arch/x86/acpi/cpu_idle.c  |    6 +++--
 xen/arch/x86/cpu/mwait-idle.c |    6 +++--
 xen/common/rcupdate.c         |   52 ++++++++++++++++++++++++++++++++++++++++-
 xen/include/xen/rcupdate.h    |    3 ++
 5 files changed, 65 insertions(+), 6 deletions(-)

Comments

Stefano Stabellini July 31, 2017, 9:20 p.m. UTC | #1
On Thu, 27 Jul 2017, Dario Faggioli wrote:
> Instead of having the CPU where a callback is queued, busy
> looping on rcu_pending(), use a timer.
> 
> In fact, we let the CPU go idla,e but we program a timer
                              ^ idle,


> that will periodically wake it up, for checking whether the
> grace period has actually ended.
> 
> It is kind of similar to introducing a periodic tick, but
> with a much more limited scope, and a lot less overhead. In
> fact, this timer is:
> - only active for the CPU(s) that have callbacks queued,
>   waiting for the end of a grace period;
> - only active when those CPU(s) are idle (and stopped as
>   soon as they resume execution).
> 
> Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
> ---
> Cc: Stefano Stabellini <sstabellini@kernel.org>
> Cc: Julien Grall <julien.grall@arm.com>
> Cc: Jan Beulich <jbeulich@suse.com>
> Cc: Andrew Cooper <andrew.cooper3@citrix.com>
> ---
>  xen/arch/arm/domain.c         |    4 ++-
>  xen/arch/x86/acpi/cpu_idle.c  |    6 +++--
>  xen/arch/x86/cpu/mwait-idle.c |    6 +++--
>  xen/common/rcupdate.c         |   52 ++++++++++++++++++++++++++++++++++++++++-
>  xen/include/xen/rcupdate.h    |    3 ++
>  5 files changed, 65 insertions(+), 6 deletions(-)
> 
> diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
> index 666b7ef..01da96e 100644
> --- a/xen/arch/arm/domain.c
> +++ b/xen/arch/arm/domain.c
> @@ -43,8 +43,9 @@ static void do_idle(void)
>  {
>      unsigned int cpu = smp_processor_id();
>  
> +    rcu_idle_timer_start();
>      sched_tick_suspend();
> -    /* sched_tick_suspend() can raise TIMER_SOFTIRQ. Process it now. */
> +    /* Timer related operations can raise TIMER_SOFTIRQ. Process it now. */
>      process_pending_softirqs();
>  
>      local_irq_disable();
> @@ -58,6 +59,7 @@ static void do_idle(void)
>      local_irq_enable();
>  
>      sched_tick_resume();
> +    rcu_idle_timer_stop();
>  }
>  
>  void idle_loop(void)
> diff --git a/xen/arch/x86/acpi/cpu_idle.c b/xen/arch/x86/acpi/cpu_idle.c
> index 04c52e8..b97986f 100644
> --- a/xen/arch/x86/acpi/cpu_idle.c
> +++ b/xen/arch/x86/acpi/cpu_idle.c
> @@ -576,10 +576,10 @@ static void acpi_processor_idle(void)
>          return;
>      }
>  
> +    rcu_idle_timer_start();
>      cpufreq_dbs_timer_suspend();
> -
>      sched_tick_suspend();
> -    /* sched_tick_suspend() can raise TIMER_SOFTIRQ. Process it now. */
> +    /* Timer related operations can raise TIMER_SOFTIRQ. Process it now. */
>      process_pending_softirqs();
>  
>      /*
> @@ -593,6 +593,7 @@ static void acpi_processor_idle(void)
>          local_irq_enable();
>          sched_tick_resume();
>          cpufreq_dbs_timer_resume();
> +        rcu_idle_timer_stop();
>          return;
>      }
>  
> @@ -726,6 +727,7 @@ static void acpi_processor_idle(void)
>  
>      sched_tick_resume();
>      cpufreq_dbs_timer_resume();
> +    rcu_idle_timer_stop();
>  
>      if ( cpuidle_current_governor->reflect )
>          cpuidle_current_governor->reflect(power);
> diff --git a/xen/arch/x86/cpu/mwait-idle.c b/xen/arch/x86/cpu/mwait-idle.c
> index ae9e92b..c426e41 100644
> --- a/xen/arch/x86/cpu/mwait-idle.c
> +++ b/xen/arch/x86/cpu/mwait-idle.c
> @@ -743,10 +743,10 @@ static void mwait_idle(void)
>  		return;
>  	}
>  
> +	rcu_idle_timer_start();
>  	cpufreq_dbs_timer_suspend();
> -
>  	sched_tick_suspend();
> -	/* sched_tick_suspend() can raise TIMER_SOFTIRQ. Process it now. */
> +	/* Timer related operations can raise TIMER_SOFTIRQ. Process it now. */
>  	process_pending_softirqs();
>  
>  	/* Interrupts must be disabled for C2 and higher transitions. */
> @@ -756,6 +756,7 @@ static void mwait_idle(void)
>  		local_irq_enable();
>  		sched_tick_resume();
>  		cpufreq_dbs_timer_resume();
> +                rcu_idle_timer_stop();
>  		return;
>  	}
>  
> @@ -802,6 +803,7 @@ static void mwait_idle(void)
>  
>  	sched_tick_resume();
>  	cpufreq_dbs_timer_resume();
> +	rcu_idle_timer_stop();
>  
>  	if ( cpuidle_current_governor->reflect )
>  		cpuidle_current_governor->reflect(power);
> diff --git a/xen/common/rcupdate.c b/xen/common/rcupdate.c
> index f0fdc87..4586f2a 100644
> --- a/xen/common/rcupdate.c
> +++ b/xen/common/rcupdate.c
> @@ -84,8 +84,14 @@ struct rcu_data {
>      int cpu;
>      struct rcu_head barrier;
>      long            last_rs_qlen;     /* qlen during the last resched */
> +
> +    /* 3) idle CPUs handling */
> +    struct timer idle_timer;
> +    bool idle_timer_active;
>  };
>  
> +#define RCU_IDLE_TIMER_PERIOD MILLISECS(10)

Isn't this a bit too short? How is it chosen?


>  static DEFINE_PER_CPU(struct rcu_data, rcu_data);
>  
>  static int blimit = 10;
> @@ -402,7 +408,48 @@ int rcu_needs_cpu(int cpu)
>  {
>      struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
>  
> -    return (!!rdp->curlist || rcu_pending(cpu));
> +    return (!!rdp->curlist || rcu_pending(cpu)) && !rdp->idle_timer_active;
> +}
> +
> +/*
> + * Timer for making sure the CPU where a callback is queued does
> + * periodically poke rcu_pedning(), so that it will invoke the callback
> + * not too late after the end of the grace period.
> + */
> +void rcu_idle_timer_start()
> +{
> +    struct rcu_data *rdp = &this_cpu(rcu_data);
> +
> +    if (likely(!rdp->curlist))
> +        return;
> +
> +    set_timer(&rdp->idle_timer, NOW() + RCU_IDLE_TIMER_PERIOD);
> +    rdp->idle_timer_active = true;
> +}
> +
> +void rcu_idle_timer_stop()
> +{
> +    struct rcu_data *rdp = &this_cpu(rcu_data);
> +
> +    if (likely(!rdp->idle_timer_active))
> +        return;
> +
> +    rdp->idle_timer_active = false;
> +    stop_timer(&rdp->idle_timer);
> +}
> +
> +static void rcu_idle_timer_handler(void* data)
> +{
> +    /*
> +     * Nothing, really... And in fact, we don't expect to ever get in here,
> +     * as rcu_idle_timer_stop(), called while waking from idle, prevent that
> +     * to happen by stopping the timer before the TIMER_SOFTIRQ handler has
> +     * a chance to run.
> +     *
> +     * But that's fine, because all we want is the CPU that needs to execute
> +     * the callback to be periodically woken up and check rcu_pending().
> +     */
> +    ASSERT_UNREACHABLE();
>  }
>  
>  void rcu_check_callbacks(int cpu)
> @@ -423,6 +470,8 @@ static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
>  static void rcu_offline_cpu(struct rcu_data *this_rdp,
>                              struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
>  {
> +    kill_timer(&rdp->idle_timer);
> +
>      /* If the cpu going offline owns the grace period we can block
>       * indefinitely waiting for it, so flush it here.
>       */
> @@ -451,6 +500,7 @@ static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
>      rdp->qs_pending = 0;
>      rdp->cpu = cpu;
>      rdp->blimit = blimit;
> +    init_timer(&rdp->idle_timer, rcu_idle_timer_handler, (void*) rdp, cpu);
>  }
>  
>  static int cpu_callback(
> diff --git a/xen/include/xen/rcupdate.h b/xen/include/xen/rcupdate.h
> index 561ac43..3402eb5 100644
> --- a/xen/include/xen/rcupdate.h
> +++ b/xen/include/xen/rcupdate.h
> @@ -149,4 +149,7 @@ int rcu_barrier(void);
>  void rcu_idle_enter(unsigned int cpu);
>  void rcu_idle_exit(unsigned int cpu);
>  
> +void rcu_idle_timer_start(void);
> +void rcu_idle_timer_stop(void);
> +
>  #endif /* __XEN_RCUPDATE_H */
>
Dario Faggioli July 31, 2017, 10:03 p.m. UTC | #2
On Mon, 2017-07-31 at 14:20 -0700, Stefano Stabellini wrote:
> On Thu, 27 Jul 2017, Dario Faggioli wrote:
> > 
> > diff --git a/xen/common/rcupdate.c b/xen/common/rcupdate.c
> > index f0fdc87..4586f2a 100644
> > --- a/xen/common/rcupdate.c
> > +++ b/xen/common/rcupdate.c
> > @@ -84,8 +84,14 @@ struct rcu_data {
> >      int cpu;
> >      struct rcu_head barrier;
> >      long            last_rs_qlen;     /* qlen during the last
> > resched */
> > +
> > +    /* 3) idle CPUs handling */
> > +    struct timer idle_timer;
> > +    bool idle_timer_active;
> >  };
> >  
> > +#define RCU_IDLE_TIMER_PERIOD MILLISECS(10)
> 
> Isn't this a bit too short? How is it chosen?
> 
It's totally arbitrary (and that would be the case for whatever value
we choose).

Basically, it's how long, at worst, after the actual end of a grace
period, a (batch of) callback(s) will be invoked. Currently, on Credit1
on ARM (without my patch, from this series, that suspends the tick)
that's (by chance) 30 ms (or whatever value is chosen for Credit1
timeslice). On Credit2 (on both ARM and x86), it's never, but on x86 it
(apparently) is 'however frequently time sync rendezvouses happen' (which
I don't recall, but it's longer), while on ARM is (potentially) never.

I accept suggestions about alternatives values, and I'm certainly fine
with adding a comment, containing something along the lines of the
explanation above, but I fear it's going to be hard to figure out what
value is actually the "absolute best".

In Linux (which is where the same 'callback book-keeping' happens for
them), a tick with a frequency of 1000Hz (== 1ms) is considered 'low-
latency/Desktop/real-time'. For us, as said above, tick --when it's
there-- would be 30ms by default.

I just went with something in the middle.

Also, it's not that we'll have a 10ms periodic timer going on for
significant amount of time. In fact we expect it to actually fire just
once (for each grace period). It's not 100% guaranteed that it won't be
reprogrammed and fire a couple of times, but it should not, in the vast
majority of cases.

What makes you think it's short?

Regards,
Dario
Stefano Stabellini July 31, 2017, 11:58 p.m. UTC | #3
On Tue, 1 Aug 2017, Dario Faggioli wrote:
> On Mon, 2017-07-31 at 14:20 -0700, Stefano Stabellini wrote:
> > On Thu, 27 Jul 2017, Dario Faggioli wrote:
> > > 
> > > diff --git a/xen/common/rcupdate.c b/xen/common/rcupdate.c
> > > index f0fdc87..4586f2a 100644
> > > --- a/xen/common/rcupdate.c
> > > +++ b/xen/common/rcupdate.c
> > > @@ -84,8 +84,14 @@ struct rcu_data {
> > >      int cpu;
> > >      struct rcu_head barrier;
> > >      long            last_rs_qlen;     /* qlen during the last
> > > resched */
> > > +
> > > +    /* 3) idle CPUs handling */
> > > +    struct timer idle_timer;
> > > +    bool idle_timer_active;
> > >  };
> > >  
> > > +#define RCU_IDLE_TIMER_PERIOD MILLISECS(10)
> > 
> > Isn't this a bit too short? How is it chosen?
> > 
> It's totally arbitrary (and that would be the case for whatever value
> we choose).
> 
> Basically, it's how long, at worst, after the actual end of a grace
> period, a (batch of) callback(s) will be invoked. Currently, on Credit1
> on ARM (without my patch, from this series, that suspends the tick)
> that's (by chance) 30 ms (or whatever value is chosen for Credit1
> timeslice). On Credit2 (on both ARM and x86), it's never, but on x86 it
> (apparently) is 'however frequent time sync rendezvouses happs' (which
> I don't recall, but it's longer), while on ARM is (potentially) never.
> 
> I accept suggestions about alternatives values, and I'm certainly fine
> with adding a comment, containing something along the lines of the
> explanation above, but I fear it's going to be hard to figure out what
> value is actually the "absolute best".
> 
> In Linux (which is where the same 'callback book-keeping' happens for
> them), a tick with a frequency of 1000Hz (== 1ms) is considered 'low-
> latency/Deskop/real-time'. For us, as said above, tick --when it's
> there-- would be 30ms by default.
> 
> I just went with something in the middle.
> 
> Also, it's not that we'll have a 10ms periodic timer going on for
> significant amount of time. In fact we expect it to actually fire just
> once (for each grace period). It's not 100% guaranteed that it won't be
> reprogrammed and fire a couple of times, but it should not, in the vast
> majority of cases.
> 
> What makes you think it's short?

In terms of power saving and CPU sleep states, 10ms is not much to sleep
for. I wonder if there are any power saving benefits in sleeping for
only 10ms (especially on x86 where entering and exiting CPU sleep states
takes longer, to be confirmed).  We might as well do the thing we need
to do immediately? I guess we cannot do that?
Dario Faggioli Aug. 1, 2017, 12:47 a.m. UTC | #4
On Mon, 2017-07-31 at 16:58 -0700, Stefano Stabellini wrote:
> On Tue, 1 Aug 2017, Dario Faggioli wrote:
> > On Mon, 2017-07-31 at 14:20 -0700, Stefano Stabellini wrote:
> > > On Thu, 27 Jul 2017, Dario Faggioli wrote:
> > > > 
> > > > diff --git a/xen/common/rcupdate.c b/xen/common/rcupdate.c
> > > > index f0fdc87..4586f2a 100644
> > > > --- a/xen/common/rcupdate.c
> > > > +++ b/xen/common/rcupdate.c
> > > > @@ -84,8 +84,14 @@ struct rcu_data {
> > > >      int cpu;
> > > >      struct rcu_head barrier;
> > > >      long            last_rs_qlen;     /* qlen during the last
> > > > resched */
> > > > +
> > > > +    /* 3) idle CPUs handling */
> > > > +    struct timer idle_timer;
> > > > +    bool idle_timer_active;
> > > >  };
> > > >  
> > > > +#define RCU_IDLE_TIMER_PERIOD MILLISECS(10)
> > > 
> > > Isn't this a bit too short? How is it chosen?
> > > 
> > 
> > What makes you think it's short?
> 
> In terms of power saving and CPU sleep states, 10ms is not much to
> sleep
> for. I wonder if there are any power saving benefits in sleeping for
> only 10ms (especially on x86 where entering and exiting CPU sleep
> states
> takes longer, to be confirmed).
>
I *think* we should be fine with, say, 100ms. But that's again,
guess/rule of thumb, nothing more than that. And, TBH, I'm not even
sure what a good experiment/benchmark would be, to assess whether a
particular value is good or bad. :-/

>   We might as well do the thing we need
> to do immediately? I guess we cannot do that?
>
You're guessing correctly, we can't. It's indeed a bit tricky, and it
took me a little while as well to figure all of it out properly.

Basically, let's say that, at time t1, on CPU1, someone calls
call_rcu(). The situation on the other CPUs is: CPU0 busy; CPU2 idle
(no timer pending); CPU3 busy.

So, a new grace period starts, and its exact end will be when CPU0,
CPU1 and CPU3 have quiesced once (in Xen, quiescence means: "going
through do_softirq()").

But RCU it's a passive mechanism, i.e., we rely on each CPU coming to
the RCU core logic, and tell <<hey, btw, I've quiesced>>.
Let's say that CPU0 quiesces at time t2 > t1. CPU1 quiesces at time
t3 > t2, and goes fully idle (no timer pending). CPU3 quiesces at time
t4 > t3. Now, at time t4, CPU1 can actually invoke the callback, queued
at time t1, from within call_rcu().

This patch series solves two problems, of our current RCU
implementation:

1) right now, we don't only wait for CPU0, CPU1 and CPU3, we also wait 
   for CPU2. But since CPU2 is fully idle, it just won't bother 
   telling RCU that it has quiesced (well, on x86, that would actually 
   happen at some point, while on ARM, it is really possible that this 
   never happens!). This is solved in patch 3, by introducing the 
   cpumask;

2) now (after patch 3) we know that we just can avoid waiting for 
   CPU2. Good. But at time t4, when CPU3 quiesces, marking the end of
   the grace period, how would CPU1 --which is fully idle-- know that
   it can now safely invoke the callback? Again, RCU is passive, so it
   relies on CPU1 to figure that out on its own, next time it wakes
   up, e.g., because of the periodic tick timer. But we don't have a
   periodic tick timer! Well, this again means that, on x86, CPU1 will
   actually figure that out at some (unpredictable) point in future.
   On ARM, not so much. The purpose of the timer in this patch is to
   make sure it always will.
   In fact, with patches 4 and 5 applied, at time t3, we let CPU1 go 
   idle, but we program the timer. It will fire at t3+T (with T=10ms, 
   right now). When this happens, if t3+T > t4, CPU1 invokes the
   callback, and we're done. If not (and CPU1 is still idle) we retry
   in another T.

So, when you say "immediately", *when* do you actually mean?

We can't invoke the callback at t3 (i.e., immediately after CPU1
quiesces), because we need to wait for CPU3 to do the same.

We can't invoke the callback when CPU3 quiesces, at t4 (i.e.,
immediately after the grace period ends) either, because the callback
it's on CPU1, not on CPU3.

Linux's solution is not to stop the tick for CPU1 at time t3. It will
be stopped only after the first firing of the tick itself at time
t > t4 (if CPU1 is still idle, of course). This timer thing is, IMO,
pretty similar. The only difference is that we don't have a tick to not
stop... So I had to make up a very special one. :-D

TL;DR, yes, I also would have loved the world (or even just this
problem) to be simple. It's not! ;-P

Thanks and Regards,
Dario
Julien Grall Aug. 1, 2017, 8:45 a.m. UTC | #5
On 01/08/2017 00:58, Stefano Stabellini wrote:
> On Tue, 1 Aug 2017, Dario Faggioli wrote:
>> On Mon, 2017-07-31 at 14:20 -0700, Stefano Stabellini wrote:
>>> On Thu, 27 Jul 2017, Dario Faggioli wrote:
>>>>
>>>> diff --git a/xen/common/rcupdate.c b/xen/common/rcupdate.c
>>>> index f0fdc87..4586f2a 100644
>>>> --- a/xen/common/rcupdate.c
>>>> +++ b/xen/common/rcupdate.c
>>>> @@ -84,8 +84,14 @@ struct rcu_data {
>>>>      int cpu;
>>>>      struct rcu_head barrier;
>>>>      long            last_rs_qlen;     /* qlen during the last
>>>> resched */
>>>> +
>>>> +    /* 3) idle CPUs handling */
>>>> +    struct timer idle_timer;
>>>> +    bool idle_timer_active;
>>>>  };
>>>>
>>>> +#define RCU_IDLE_TIMER_PERIOD MILLISECS(10)
>>>
>>> Isn't this a bit too short? How is it chosen?
>>>
>> It's totally arbitrary (and that would be the case for whatever value
>> we choose).
>>
>> Basically, it's how long, at worst, after the actual end of a grace
>> period, a (batch of) callback(s) will be invoked. Currently, on Credit1
>> on ARM (without my patch, from this series, that suspends the tick)
>> that's (by chance) 30 ms (or whatever value is chosen for Credit1
>> timeslice). On Credit2 (on both ARM and x86), it's never, but on x86 it
>> (apparently) is 'however frequent time sync rendezvouses happs' (which
>> I don't recall, but it's longer), while on ARM is (potentially) never.
>>
>> I accept suggestions about alternatives values, and I'm certainly fine
>> with adding a comment, containing something along the lines of the
>> explanation above, but I fear it's going to be hard to figure out what
>> value is actually the "absolute best".
>>
>> In Linux (which is where the same 'callback book-keeping' happens for
>> them), a tick with a frequency of 1000Hz (== 1ms) is considered 'low-
>> latency/Deskop/real-time'. For us, as said above, tick --when it's
>> there-- would be 30ms by default.
>>
>> I just went with something in the middle.
>>
>> Also, it's not that we'll have a 10ms periodic timer going on for
>> significant amount of time. In fact we expect it to actually fire just
>> once (for each grace period). It's not 100% guaranteed that it won't be
>> reprogrammed and fire a couple of times, but it should not, in the vast
>> majority of cases.
>>
>> What makes you think it's short?
>
> In terms of power saving and CPU sleep states, 10ms is not much to sleep
> for. I wonder if there are any power saving benefits in sleeping for
> only 10ms (especially on x86 where entering and exiting CPU sleep states
> takes longer, to be confirmed).  We might as well do the thing we need
> to do immediately? I guess we cannot do that?

Given that on ARM we use WFI to sleep today, I think we can already 
benefit some power saving even for 10ms. Although, it will depend on the 
processor.

Cheers,
Julien Grall Aug. 1, 2017, 8:54 a.m. UTC | #6
Hi Dario,

On 27/07/2017 09:01, Dario Faggioli wrote:
> Instead of having the CPU where a callback is queued, busy
> looping on rcu_pending(), use a timer.
>
> In fact, we let the CPU go idla,e but we program a timer
> that will periodically wake it up, for checking whether the
> grace period has actually ended.
>
> It is kind of similar to introducing a periodic tick, but
> with a much more limited scope, and a lot less overhead. In
> fact, this timer is:
> - only active for the CPU(s) that have callbacks queued,
>   waiting for the end of a grace period;
> - only active when those CPU(s) are idle (and stopped as
>   soon as they resume execution).

If I read this correctly, it means on ARM the idling will now get 
interrupted periodically. This is a bit unfortunate, given that if you 
have a CPU doing nothing, you would still interrupt it intermittently.

I was expected that we could remove the CPU from the RCU whilst it is 
idle. Is there any reason for not doing that?

Cheers,
Dario Faggioli Aug. 1, 2017, 9:17 a.m. UTC | #7
On Tue, 2017-08-01 at 09:54 +0100, Julien Grall wrote:
> Hi Dario,
> 
> On 27/07/2017 09:01, Dario Faggioli wrote:
> > Instead of having the CPU where a callback is queued, busy
> > looping on rcu_pending(), use a timer.
> > 
> > In fact, we let the CPU go idla,e but we program a timer
> > that will periodically wake it up, for checking whether the
> > grace period has actually ended.
> > 
> > It is kind of similar to introducing a periodic tick, but
> > with a much more limited scope, and a lot less overhead. In
> > fact, this timer is:
> > - only active for the CPU(s) that have callbacks queued,
> >   waiting for the end of a grace period;
> > - only active when those CPU(s) are idle (and stopped as
> >   soon as they resume execution).
> 
> If I read this correctly, it means on ARM the idling will now get 
> interrupted periodically. This is a bit unfortunate, given that if
> you 
> have a CPU doing nothing, you would still interrupt it
> intermittently.
> 
Not really periodically, not always, at least. What this really means
is that a CPU that is idle, *but* have pending RCU callbacks, will be
interrupted periodically to see if the grace period ended, so it can
invoke the callbacks.

As soon as this (callbacks being invoked) will have happened, we won't
interrupt it any longer.

And idle CPUs _without_ queued RCU callbacks, won't be interrupted at
all.

> I was expected that we could remove the CPU from the RCU whilst it
> is 
> idle. Is there any reason for not doing that?
> 
I'm not sure I understand what you mean here. I tried to explain as
good as I could how this works, and why I think it can't work in other
ways, in this reply to Stefano: <1501548445.30551.5.camel@citrix.com>

Every CPU that participates in the grace period, and has already
quiesced, is "removed from RCU", and hence, when it becomes idle, it is
never interrupted (by this timer). With the only exception of the
CPU(s) that has queued callbacks.

We simply can't forget about these CPUs, even if they go idle. If we
do, the callbacks won't be invoked never (or will only be invoked when
the CPU becomes active again, which may happen really really late,
which is part of the reason why we're seeing the bug we're seeing).

Linux does this, in the *exact* same way (well, actually, in a totally
different way, from an implementation point of view, but the effect is
indeed exactly the same):

See here:

http://elixir.free-electrons.com/linux/v2.6.21/source/kernel/time/tick-sched.c#L198

We're in tick_nohz_stop_sched_tick(), i.e., the CPU is going idle, and
the periodic timer tick is being stopped (no interruptions). But

	if (rcu_needs_cpu(cpu))
		delta_jiffies = 1;

Where, rcu_needs_cpu() means:

/*
 * Check to see if any future RCU-related work will need to be done
 * by the current CPU, even if none need be done immediately, returning
 * 1 if so.
 */
int rcu_needs_cpu(int cpu)

Then, again in tick_nohz_stop_sched_tick()

	/*
	 * Do not stop the tick, if we are only one off
	 * or if the cpu is required for rcu
	 */
	if (!ts->tick_stopped && delta_jiffies == 1)
		goto out;

And in fact, in my testing, without patches 3 and 5 applied, the bug is
still there.

Regards,
Dario
Julien Grall Aug. 1, 2017, 10:22 a.m. UTC | #8
Hi Dario,

On 01/08/17 10:17, Dario Faggioli wrote:
> On Tue, 2017-08-01 at 09:54 +0100, Julien Grall wrote:
>> Hi Dario,
>>
>> On 27/07/2017 09:01, Dario Faggioli wrote:
>>> Instead of having the CPU where a callback is queued, busy
>>> looping on rcu_pending(), use a timer.
>>>
>>> In fact, we let the CPU go idla,e but we program a timer
>>> that will periodically wake it up, for checking whether the
>>> grace period has actually ended.
>>>
>>> It is kind of similar to introducing a periodic tick, but
>>> with a much more limited scope, and a lot less overhead. In
>>> fact, this timer is:
>>> - only active for the CPU(s) that have callbacks queued,
>>>   waiting for the end of a grace period;
>>> - only active when those CPU(s) are idle (and stopped as
>>>   soon as they resume execution).
>>
>> If I read this correctly, it means on ARM the idling will now get
>> interrupted periodically. This is a bit unfortunate, given that if
>> you
>> have a CPU doing nothing, you would still interrupt it
>> intermittently.
>>
> Not really periodically, not always, at least. What this really means
> is that a CPU that is idle, *but* have pending RCU callbacks, will be
> interrupted periodically to see if the grace period ended, so it can
> invoke the callbacks.
>
> As soon as this (callbacks being invoked) will have happened, we won't
> interrupt it any longer.
>
> And idle CPUs _without_ queued RCU callbacks, won't be interrupted at
> all.

Oh, the commit message is not clear about it. The wording gives the 
impression the timer will always be here on every idle CPU(s). In the 
case on active CPU(s) you specific mention the end of the grace period.

Thank you for the clarification and please disregard the rest of my 
e-mail then.

Cheers,
Dario Faggioli Aug. 1, 2017, 10:33 a.m. UTC | #9
On Tue, 2017-08-01 at 11:22 +0100, Julien Grall wrote:
> On 01/08/17 10:17, Dario Faggioli wrote:
> > As soon as this (callbacks being invoked) will have happened, we
> > won't
> > interrupt it any longer.
> > 
> > And idle CPUs _without_ queued RCU callbacks, won't be interrupted
> > at
> > all.
> 
> Oh, the commit message is not clear about it. The wording gives the 
> impression the timer will always be here on every idle CPU(s). In
> the 
> case on active CPU(s) you specific mention the end of the grace
> period.
> 
Ok, I'll try to improve the changelog and resend.

Thanks and Regards,
Dario
Stefano Stabellini Aug. 1, 2017, 7:13 p.m. UTC | #10
On Tue, 1 Aug 2017, Dario Faggioli wrote:
> On Mon, 2017-07-31 at 16:58 -0700, Stefano Stabellini wrote:
> > On Tue, 1 Aug 2017, Dario Faggioli wrote:
> > > On Mon, 2017-07-31 at 14:20 -0700, Stefano Stabellini wrote:
> > > > On Thu, 27 Jul 2017, Dario Faggioli wrote:
> > > > > 
> > > > > diff --git a/xen/common/rcupdate.c b/xen/common/rcupdate.c
> > > > > index f0fdc87..4586f2a 100644
> > > > > --- a/xen/common/rcupdate.c
> > > > > +++ b/xen/common/rcupdate.c
> > > > > @@ -84,8 +84,14 @@ struct rcu_data {
> > > > >      int cpu;
> > > > >      struct rcu_head barrier;
> > > > >      long            last_rs_qlen;     /* qlen during the last
> > > > > resched */
> > > > > +
> > > > > +    /* 3) idle CPUs handling */
> > > > > +    struct timer idle_timer;
> > > > > +    bool idle_timer_active;
> > > > >  };
> > > > >  
> > > > > +#define RCU_IDLE_TIMER_PERIOD MILLISECS(10)
> > > > 
> > > > Isn't this a bit too short? How is it chosen?
> > > > 
> > > 
> > > What makes you think it's short?
> > 
> > In terms of power saving and CPU sleep states, 10ms is not much to
> > sleep
> > for. I wonder if there are any power saving benefits in sleeping for
> > only 10ms (especially on x86 where entering and exiting CPU sleep
> > states
> > takes longer, to be confirmed).
> >
> I *think* we should be fine with, say, 100ms. But that's again,
> guess/rule of thumb, nothing more than that. And, TBH, I'm not even
> sure what a good experiment/benchmark would be, to assess whether a
> particular value is good or bad. :-/

Given the description below, it's possible that the new timer has to
fire several times before the callback can be finally invoked, right?

I would make some measurements to check how many times the timer has to
fire (how long we actually need to wait before calling the callback) in
various scenarios. Then I would choose a value to minimize the number of
unnecessary wake-ups.


> >   We might as well do the thing we need
> > to do immediately? I guess we cannot do that?
> >
> You're guessing correct, we can't. It's indeed a bit tricky, and it
> took it a little bit to me as well to figure all of it out properly.
> 
> Basically, let's say that, at time t1, on CPU1, someone calls
> call_rcu(). The situation on the other CPUs is: CPU0 busy; CPU2 idle
> (no timer pending); CPU3 busy.
> 
> So, a new grace period starts, and its exact end will be when CPU0,
> CPU1 and CPU3 have quiesced once (in Xen, quiescence means: "going
> through do_softirq()").
> 
> But RCU it's a passive mechanism, i.e., we rely on each CPU coming to
> the RCU core logic, and tell <<hey, btw, I've quiesced>>.
> Let's say that CPU0 quiesces at time t2 > t1. CPU1 quiesces at time
> t3 > t2, and goes fully idle (no timer pending). CPU3 quiesces at time
> t4 > t3. Now, at time t4, CPU1 can actually invoke the callbeck, queued
> at time t1, from within call_rcu().
> 
> This patch series solves two problems, of our current RCU
> implementation:
> 
> 1) right now, we don't only wait for CPU0, CPU1 and CPU3, we also wait 
>    for CPU2. But since CPU2 is fully idle, it just won't bother 
>    telling RCU that it has quiesced (well, on x86, that would actually 
>    happen at some point, while on ARM, it is really possible that this 
>    never happens!). This is solved in patch 3, by introducing the 
>    cpumask;
> 
> 2) now (after patch 3) we know that we just can avoid waiting for 
>    CPU2. Good. But at time t4, when CPU3 quiesces, marking the end of
>    the grace period, how would CPU1 --which is fully idle-- know that
>    it can now safely invoke the callback? Again, RCU is passive, so it
>    relies on CPU1 to figure that out on its own, next time it wakes
>    up, e.g., because of the periodic tick timer. But we don't have a
>    periodic tick timer! Well, this again means that, on x86, CPU1 will
>    actually figure that out at some (unpredictable) point in future.
>    On ARM, not so much. The purpose of the timer in this patch is to
>    make sure it always will.
>    In fact, with patches 4 and 5 applied, at time t3, we let CPU1 go 
>    idle, but we program the timer. It will fire at t3+T (with T=10ms, 
>    right now). When this happens, if t3+T > t4, CPU1 invokes the
>    callback, and we're done. If not (and CPU1 is still idle) we retry
>    in another T.
> 
> So, when you say "immediately", *when* do you actually mean?
> 
> We can't invoke the callback at t3 (i.e., immediately after CPU1
> quiesces), because we need to wait for CPU3 to do the same.
> 
> We can't invoke the callback when CPU3 quiesces, at t4 (i.e.,
> immediately after the grace period ends) either, because the callback
> it's on CPU1, not on CPU3.
> 
> Linux's solution is not to stop the tick for CPU1 at time t3. It will
> be stopped only after the first firing of the tick itself at time
> t > t4 (if CPU1 is still idle, of course). This timer thing is, IMO,
> pretty similar. The only difference is that we don't have a tick to not
> stop... So I had to make up a very special one. :-D
> 
> TL;DR, yes, I also would have loved the world (or even just this
> problem) to be simple. It's not! ;-P
Dario Faggioli Aug. 2, 2017, 10:14 a.m. UTC | #11
Hey, apologies in advance, for the very long email! :-O

On Tue, 2017-08-01 at 12:13 -0700, Stefano Stabellini wrote:
> Given the description below, it's possible that the new timer has to
> fire several times before the callback can be finally invoked, right?
> 
> I would make some measurements to check how many times the timer has
> to
> fire (how long we actually need to wait before calling the callback)
> in
> various scenarios. Then I would choose a value to minimize the number
> of
> unnecessary wake-ups.
> 
You seem to be thinking of this timer as something that disturbs the
idleness of a CPU. It's not, because the CPU is not idle! It has a
callback waiting to be invoked, and it better do that as soon as
possible, in order to:

- freeing the memory/the resources as soon as practically possible. 
  E.g., if we wait for, say, 1 sec, destroying a VM with a "passed-
  through" I/O range, and then trying to create another one, with the 
  same resource assigned, would fail until that 1 sec expires. True,
  we save more power on that CPU, but it looks neither fair
  nor correct to me to strive toward achieving that, in this situation.
  Power should indeed be saved, and unnecessary wake-ups prevented, but
  when there isn't anything to do! If there is stuff to do, and this is
  the case, our goal should be to get it done as soon as possible, so
  that, afterwards, we can go into a (hopefully) long sleep.

- by delaying detection of a finished grace period, we're also 
  delaying the beginning of a new one. This means, in case there are
  RCU operation waiting for it (because they arrived too late for the
  current one, and have been queued), we're potentially significantly
  delaying them too. So, basically, this again looks like the same
  kind of paradox to me, where we have stuff to do, but we decide to
  try to save power.

RCU is not a mechanism for allowing the CPUs to stay idle longer; it's
a lockless and asymmetric serialization primitive. And as any
serialization primitive --either lock-based or lockless-- keeping the
duration of critical sections small is rather important.

So, I think the goal here is not minimizing the wakeups; it much rather
is completing the grace period sooner rather than later. In theory, if
rcu_needs_cpu() returns true, we may very well not even let the CPU go
idle, and have it spin (which indeed is what happens without this
patch).
That would guarantee a prompt detection of CPU quiescence, and of the
ending of a grace period. Then, since we know that spinning consumes a
lot of power, and generates overhead that, potentially, may even affect
and slow down activities going on other CPUs, we let it go idle with a
timer armed. But that's a compromise, and can't be confused with the
primary goal. And any nanosecond that we spend, on that CPU, consuming
less power than what we'd have consumed running a tight loop around
rcu_pending(), we should be thankful for it, instead of regretting that
there could have been more. :-)

In Linux, as said a few times, they do the same, and the role of this
new timer I'm introducing here, is played by the tick. The tick period
is controlled by the HZ config variable, possible values for which (in
2.6.21, where CONFIG_NO_HZ was introduced for first time for all
arches) are 100Hz, 250Hz, 300Hz and 1000Hz, which translates into 10ms,
 4ms, 3.33ms or 1ms. Default is 250HZ==4ms. The periodic tick is the
minimum time granularity of the kernel for a lot of subsystem
(especially in that kernel), so it's something that can be considered
pretty frequently running. As also said many times, we don't have any
such thing, but still I think we should use something with a similar
order of magnitude.

[NOTE that I'm not using Linux as an example because what is done in
Linux is always correct and is always what we also should do. However:
(1) RCU has been basically invented for Linux, or at least Linux is
where they've seen the widest adoption and development, and (2) our RCU
code has been (not in the best possible way, allow me to say) imported
from there. Therefore, looking at what's done in Linux seems reasonable
to me, in this case.]

About the kind of measurements you're suggesting doing. I think I
understand the rationale behind it, but at the same time, I'm quite
sure that what we'd get would be pseudo-random numbers.

In fact, the length of this "wait time", although it has a relationship
with the workload being run, it also depends on a lot of other things,
within the *system* as a whole. It would be basically impossible to run
any meaningful statistical analysis on the results and come to
something that is good enough in general.

Think, for instance, at how this is not at all architecture specific
code, but the behavior we're seeing is so different between x86 and 
ARM. Think at how this doesn't really have to do anything with the
scheduler, but the behavior we observe varies, depending on some
internal implementation details of a scheduler!

What follows is a more detailed explanation of how things works, in a
simple enough scenario.

+ == CPU is busy
. == CPU is idle
R == when call_rcu() is invoked (i.e., beginning of new grace
          period)
Q == CPU quiesces (when all have done this, grace period ends).
     This happens when do_softirq() is called on the CPU
C == pending callbacks are invoked

                      t1
                      |
                      |
CPU1 +++R+++++++++++++Q..............
CPU2 ++++++++++++++++++++++++++Q++...
                               |
                               |
                               t2

If CPU1 is where call_rcu() happens, and CPU2 is the only other busy
CPU in the system at the time, and T is the period of our timer, the
following cases are possible:

x) t2 < t1

   In this case, the timer never fires:

                         t1
                         |
                         |
   CPU1 +++R+++++++++++++QC.........
   CPU2 ++++++++++++++Q.............
                      |
                      |
                      t2

x) t2 - t1 < T

   In this case, the timer only fires once

                    t1     t1+T
                    |       |
                    |       |
   CPU1 +++R++++++++Q.......C......
   CPU2 +++++++++++++++++Q.........
                         |
                         |
                         t2

x) t2 - t1 > (n-1)*T

                t1     t1+T   t1+2T   t1+3T   t1+4T
                |       |       |       |       |
                |       |       |       |       |
   CPU1 +R++++++Q...............................C..
   CPU2 ++++++++++++++++++++++++++++++++++++Q......
                                            |
                                            |
                                            t2

   In this case, the timer will fire n times (with n=4 in the example).

So, you're suggesting to measure, under different workloads, and then
do some kind of averaging/statistical analysis, t2-t1. What I'm saying
is that the occurrence of those events marked 'Q', on the appropriate
CPU, is really hard to predict.

In fact, t2-t1 is, basically, the difference between the time when the
two CPUs call do_softirq(). How much the frequency of invocation of
do_softirq() is related to the actual characteristics of the workload
we're running? well, a relationship sure exists, but not one that I
think is easy (possible?) to generalize... And here we're also talking
about considering *different* workloads! :-O

For instance, because of either hard-affinity, soft-affinity, or
scheduling weights, there may be more or less preemptions, on the CPUs
in question (and preemptions means going through do_softirq())... And
that's not related to system load a particular workload produces, as
it's in total control of the user. So we'd have not only to measure
with different workloads, but for each workload, we'd have to test a
certain number of variations of the scheduling parameters of the vCPUs!
:-O

On x86, there's an upper bound to the time CPUs will stay idle, because
there's always (something like) a timer running (for time sync
rendezvous, said Andrew, and I haven't checked myself, but have no
reason to doubt that to be the case), while on ARM there's no such
thing (or we weren't here! :-D). So we'd have to measure each workload,
with each combination of parameters, on each (present and future)
architecture we support! :-O

CPU onlining and offlining, or CPUs moving around cpupools, can cause
timers (and RCU callbacks even, in the online/offline case!) to be
moved around, which means they'd be firing (and a timer firing means
going through do_softirq()) on different CPUs from where we expected
them to. So, we'd have to measure each workload, with each combination
of parameters, on each architecture, while doing CPU online/offline,
and while moving CPUs around pools! :-O

And this was just to mention the few examples I could fish from the top
of my head...

No, IMO, there are too many variables and too many unrelated factors
involved, to even hope that any set of measurements we actually manage
to put together, can be representative of the general case.


*PERHAPS* we can think of an heuristic... Provided it's not too
complex, or at least, that it does not try to capture all this
variability (because that'd be equally impossible).

Something like this: when a CPU with queued callbacks quiesces and then
goes idle, we know how many other CPUs, among the ones that are
participating in the grace period, have not quiesced yet. Instead of
using, say, always 10ms, we can program the timer to fire at something
like 1ms*cpumask_weight(not_yet_quiesced_cpus).

In fact, I think it makes sense to assume that, if there are only few
CPUs involved in the grace period, that still has not quiesced, then
the grace period itself is about to finish, and we should check
frequently whether or not that has happened.
if, OTOH, there are a lot of CPUs involved in the grace period, that
still haven't quiesced, we can check whether the grace period has ended
in a less eager way.

This will make the period of the timer vary, i.e., it may be different
between two subsequent timer_set(), but that is not a problem at all.
Basically, we're making the timer kind of adaptive.

This does not work well in the case where there is, e.g., only 1 CPU
that has not quiesced yet (so we'd set the timer to fire very
frequently), but it takes a lot of time for such CPU to do so. Well,
it's a heuristic, so it can't always work well... but I think this
isn't a common situation.

Would you be happy with something like this?

Regards,
Dario
Jan Beulich Aug. 7, 2017, 8:54 a.m. UTC | #12
>>> Dario Faggioli <dario.faggioli@citrix.com> 07/27/17 10:01 AM >>>
>Instead of having the CPU where a callback is queued, busy
>looping on rcu_pending(), use a timer.

Isn't this rcu_needs_cpu(), according to patch 4?

>--- a/xen/arch/x86/acpi/cpu_idle.c
>+++ b/xen/arch/x86/acpi/cpu_idle.c
>@@ -576,10 +576,10 @@ static void acpi_processor_idle(void)
>return;
>}
 >
>+    rcu_idle_timer_start();
>cpufreq_dbs_timer_suspend();

Is the ordering wrt cpufreq handling here arbitrary? If so, wouldn't it be
better to suspend cpufreq handling before starting the idle timer (and
also invert the ordering when going back to normal operation below)?

>-
>sched_tick_suspend();

At which point I'd then wonder whether this couldn't be integrated into
sched_tick_{suspend,resume}(), avoiding arch-specific code to be
altered.

>@@ -756,6 +756,7 @@ static void mwait_idle(void)
>        local_irq_enable();
>        sched_tick_resume();
>        cpufreq_dbs_timer_resume();
>+                rcu_idle_timer_stop();

Indentation looks odd here, but I can't exclude this being an effect produced
by my mail web frontend.

>--- a/xen/common/rcupdate.c
>+++ b/xen/common/rcupdate.c
>@@ -84,8 +84,14 @@ struct rcu_data {
>int cpu;
     >struct rcu_head barrier;
>long            last_rs_qlen;     /* qlen during the last resched */
>+
>+    /* 3) idle CPUs handling */
>+    struct timer idle_timer;
>+    bool idle_timer_active;
>};
 >
>+#define RCU_IDLE_TIMER_PERIOD MILLISECS(10)

If I'm not mistaken someone else had already commented on this: If this
is (mostly) arbitrary, please say so in a comment.

>@@ -402,7 +408,48 @@ int rcu_needs_cpu(int cpu)
>{
>struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
 >
>-    return (!!rdp->curlist || rcu_pending(cpu));
>+    return (!!rdp->curlist || rcu_pending(cpu)) && !rdp->idle_timer_active;

Please take the opportunity and drop the pointless !! here (unless it's needed
for better matching up with the Linux original).

>+}
>+
>+/*
>+ * Timer for making sure the CPU where a callback is queued does
>+ * periodically poke rcu_pending(), so that it will invoke the callback
>+ * not too late after the end of the grace period.
>+ */
>+void rcu_idle_timer_start()
>+{
>+    struct rcu_data *rdp = &this_cpu(rcu_data);
>+
>+    if (likely(!rdp->curlist))
>+        return;

I would have expected this to be the inverse of the original condition in
rcu_needs_cpu() - why is there no rcu_pending() invocation here?

>@@ -451,6 +500,7 @@ static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
>rdp->qs_pending = 0;
>rdp->cpu = cpu;
>rdp->blimit = blimit;
>+    init_timer(&rdp->idle_timer, rcu_idle_timer_handler, (void*) rdp, cpu);

Again, unless it is this bogus way in the Linux original, please drop the
pointless cast, or at least correct its style.

Jan
Dario Faggioli Aug. 9, 2017, 5:34 p.m. UTC | #13
On Mon, 2017-08-07 at 02:54 -0600, Jan Beulich wrote:
> > > > Dario Faggioli <dario.faggioli@citrix.com> 07/27/17 10:01 AM
> > > > >>>
> > 
> > Instead of having the CPU where a callback is queued, busy
> > looping on rcu_pending(), use a timer.
> 
> Isn't this rcu_needs_cpu(), according to patch 4?
> 
I was referring to the rcu_pending() in do_softirq(). In fact, if this
(roughly) is idle_loop:

    for ( ; ; )
    {
        if ( unlikely(tasklet_work_to_do(cpu)) )
            do_tasklet();
        else
            pm_idle();
        do_softirq();

        check_for_livepatch_work();
    }

we don't have tasklet work to do, so we call pm_idle(). However, if we
have a callback queued, rcu_needs_cpu() returns true (without calling
rcu_pending()), which means cpu_is_haltable() returns false, and hence
we exit pm_idle() without actually going idle, and we call
do_softirq(), which does:

    if ( rcu_pending(cpu) )
        rcu_check_callbacks(cpu);

in a rather tight loop.

IAC, I certainly can rephrase the sentence above, and make all this
more clear.

> > --- a/xen/arch/x86/acpi/cpu_idle.c
> > +++ b/xen/arch/x86/acpi/cpu_idle.c
> > @@ -576,10 +576,10 @@ static void acpi_processor_idle(void)
> > return;
> > }
> 
>  >
> > +    rcu_idle_timer_start();
> > cpufreq_dbs_timer_suspend();
> 
> Is the ordering wrt cpufreq handling here arbitrary? 
>
Yes.

> If so, wouldn't it be
> better to suspend cpufreq handling before starting the idle timer
> (and
> also invert the ordering when going back to normal operation below)?
> 
Well, as said above, it's arbitrary. Therefore, yes, I'm ok with doing
things like you suggest.

> > -
> > sched_tick_suspend();
> 
> At which point I'd then wonder whether this couldn't be integrated
> into
> sched_tick_{suspend,resume}(), avoiding arch-specific code to be
> altered.
> 
Sure, I'm all for it. It's easier, cleaner, and makes a lot of sense!

Only problem may be if some new arch decide not/forget to call
sched_tick_suspend() and resume().

Perhaps I can add some checking in place, to make sure that this can't
happen (in debug builds only, of course).

> > @@ -756,6 +756,7 @@ static void mwait_idle(void)
> >        local_irq_enable();
> >        sched_tick_resume();
> >        cpufreq_dbs_timer_resume();
> > +                rcu_idle_timer_stop();
> 
> Indentation looks odd here, but I can't exclude this being an effect
> produced
> by my mail web frontend.
> 
Yep, it was indeed wrong. Fixed. Sorry. Thanks. :-)

> > --- a/xen/common/rcupdate.c
> > +++ b/xen/common/rcupdate.c
> > @@ -84,8 +84,14 @@ struct rcu_data {
> > 
> > +#define RCU_IDLE_TIMER_PERIOD MILLISECS(10)
> 
> If I'm not mistaken someone else had already commented on this: If
> this
> is (mostly) arbitrary, please say so in a comment.
> 
Indeed there is being a long 'debate'. :-)

For v2, I'm changing this to something more dynamic. And I'll sure add
a comment explaining the adopted solution.

> > @@ -402,7 +408,48 @@ int rcu_needs_cpu(int cpu)
> > {
> > struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
> 
>  >
> > -    return (!!rdp->curlist || rcu_pending(cpu));
> > +    return (!!rdp->curlist || rcu_pending(cpu)) && !rdp-
> > >idle_timer_active;
> 
> Please take the opportunity and drop the pointless !! here (unless
> it's needed
> for better matching up with the Linux original).
> 
It's in the Linux code, indeed:
http://elixir.free-electrons.com/linux/v2.6.21/source/kernel/rcupdate.c#L517

However, this very line is already different (because Linux has
rdp_bh->curlist, which we don't, and we have rdp->idle_timer_active).
So I guess I can drop '!!'.

Any patch touching this line coming from Linux will need manual
adjustment anyway.

> > +/*
> > + * Timer for making sure the CPU where a callback is queued does
> > + * periodically poke rcu_pedning(), so that it will invoke the
> > callback
> > + * not too late after the end of the grace period.
> > + */
> > +void rcu_idle_timer_start()
> > +{
> > +    struct rcu_data *rdp = &this_cpu(rcu_data);
> > +
> > +    if (likely(!rdp->curlist))
> > +        return;
> 
> I would have expected this to be the inverse of the original
> condition in
> rcu_needs_cpu() - why is there no rcu_pending() invocation here?
> 
The original rcu_needs_cpu() condition is:

 rcu->curlist || rcu_pending(cpu)

So, you're saying we need something like this:

 if (!rdp->curlist && !rcu_pending(cpu))
   return;

Well, if I do this, what happens is that the if evaluate to false (and
hence we don't exit the function, and we arm the timer), on CPUs that:
 - does not have a calback queued (curlist == NULL)
 - are in the process of quiescing, by going idle.

In fact, in that case, here's what happens (I compacted the trace, and
slightly edited some of the lines, for better readability, and for
keeping the highlighting the characteristics of the situation under
investigation):

* complete_domain_destroy callback is queued on CPU 0. The CPU quiesces
  (goes through softirq), but stays busy, so no timer is armed (that's
  not super relevant, but is what happens in this case):

x-|- d0v12 rcu_call fn=complete_domain_destroy
x-|- d0v12 rcu_pending? yes (ret=2): no pending entries but new entries
x-|- d0v12 raise_softirq nr 3
x-|- d0v12 rcu_process_callbacks, rdp_curlist: null, rdp_nxtlist: yes
x-|- d0v12 do_softirq
x-|- d0v12 rcu_pending? yes (ret=5): waiting for CPU to quiesce (rdp_qs_pending=1)
x-|- d0v12 raise_softirq nr 3
x-|- d0v12 rcu_process_callbacks, rdp_curlist: yes, rdp_nxtlist: null
x-|- d0v12 rcu_check_quiesc_state, rdp_qs_pending: yes
x-|- d0v12 rcu_cpu_quiet, rcp_cpumask=0x00004100
x-|- d0v12 rcu_pending? no

* the vCPU running on CPU 2, which participates in the grace period, 
  blocks, and CPU 2 goes idle. That means that CPU 2 quiesces (goes 
  through softirq), and we can forget about it. Surely we don't need 
  the timer to fire on it, as the callback is not queued there...

|-x- d0v7 vcpu_block d0v7
|-x- d0v7 raise_softirq nr 1
|-x- d0v7 do_softirq
|-x- d0v7 rcu_pending? yes (ret=4): waiting for CPU to quiesce
(rdp_qs_pending=0)
|-x- d0v7 raise_softirq nr 3
|-x- d0v7 softirq_handler nr 1
|-x- d0v7 runstate_change d0v7 running->blocked
|-x- d?v? runstate_change d32767v14 runnable->running

  Now, this is cpufreq_dbs_timer_suspend():

|-x- d32767v14 timer_stop t=do_dbs_timer

  And this is sched_tick_suspend(), which now calls 
  rcu_idle_timer_start(), gated by the condition suggested above.

|-x- d32767v14 rcu_pending? yes (ret=4): waiting for CPU to quiesce (rdp_qs_pending=0)
|-x- d32767v14 timer_set t=rcu_idle_timer_handler, expires_in=9787us

  So, the CPU is actually quiescing, and since it has no callback
  queued, as we said, we wouldn't have needed the timer. But the
  condition is false, because, at this stage, rcu_pending() was still
  true. So, we don't bail early from rcu_idle_timer_start(), and we 
  actually have armed the timer.

|-x- d32767v14 rcu_pending? yes (ret=4): waiting for CPU to quiesce (rdp_qs_pending=0)
|-x- d32767v14 raise_softirq nr 3
|-x- d32767v14 rcu_process_callbacks, rdp_curlist: null, rdp_nxtlist: null
|-x- d32767v14 rcu_check_quiesc_state, rdp_qs_pending: no
|-x- d32767v14 rcu_process_callbacks, rdp_curlist: null, rdp_nxtlist: null
|-x- d32767v14 rcu_check_quiesc_state, rdp_qs_pending: yes
|-x- d32767v14 rcu_cpu_quiet, rcp_cpumask=0x00000100
|-x- d32767v14 pm_idle_start c2

* The timer fires, for no useful reason, and cause a spurious wakeup:

|-x- d32767v14 raise_softirq nr 0
|-x- d32767v14 pm_idle_end c2
|-x- d32767v14 irq_enter
|-x- d32767v14 irq_direct, vec=0xfa, handler=apic_timer_interrupt
|-x- d32767v14 raise_softirq nr 0
|-x- d32767v14 irq_exit, in_irq = 0
|-x- d32767v14 softirq_handler nr 0
|-x- d32767v14 timer_exec t=rcu_idle_timer_handler
|-x- d32767v14 timer_reprogr deadline=954.566us
|-x- d32767v14 rcu_pending? no
|-x- d32767v14 pm_idle_start c3

So, this is why I don't want rcu_pending() in that if. If we don't like
this, I can see about moving around a bit the timer starting and
stopping helpers (I've a couple of ideas in mind already, but I need to
try).

Actually, it's entirely possible that having rcu_pending(cpu) in
rcu_needs_cpu() is, for us, redundant. In fact, although it does make
sense in Linux, both code inspection and some investigation I've just
done, makes me believe that there won't be cases where a CPU is denied
going offline because it sees rcu_pending() returning 1.

In fact, when we call rcu_pending(), within cpu_is_haltable(), we have
already gone through it before. And if there were pending work, we've
raised the softirq and dealt with it. If there weren't, neither there
is now.

I'm therefore leaning toward removing rcu_pending() from the
rcu_needs_cpu() check as well. At that point, we'll indeed have the
check inside rcu_start_idle_timer(), be the opposite of the original
check in rcu_needs_cpu(). :-)

> > @@ -451,6 +500,7 @@ static void rcu_init_percpu_data(int cpu,
> > struct rcu_ctrlblk *rcp,
> > rdp->qs_pending = 0;
> > rdp->cpu = cpu;
> > rdp->blimit = blimit;
> > +    init_timer(&rdp->idle_timer, rcu_idle_timer_handler, (void*)
> > rdp, cpu);
> 
> Again, unless it is this bogus way in the Linux original, please drop
> the
> pointless cast, or at least correct its style.
> 
Ah, no, this one, I can kill it.

Thanks and Regards,
Dario
Dario Faggioli Aug. 10, 2017, 1:55 p.m. UTC | #14
On Wed, 2017-08-09 at 19:34 +0200, Dario Faggioli wrote:
> On Mon, 2017-08-07 at 02:54 -0600, Jan Beulich wrote:
> > > > > Dario Faggioli <dario.faggioli@citrix.com> 07/27/17 10:01 AM
> > > +/*
> > > + * Timer for making sure the CPU where a callback is queued does
> > > + * periodically poke rcu_pedning(), so that it will invoke the
> > > callback
> > > + * not too late after the end of the grace period.
> > > + */
> > > +void rcu_idle_timer_start()
> > > +{
> > > +    struct rcu_data *rdp = &this_cpu(rcu_data);
> > > +
> > > +    if (likely(!rdp->curlist))
> > > +        return;
> > 
> > I would have expected this to be the inverse of the original
> > condition in
> > rcu_needs_cpu() - why is there no rcu_pending() invocation here?
> > 
> 
> [...]
>
> Actually, it's entirely possible that it is having rcu_pending(cpu)
> in
> rcu_needs_cpu() is, for us, redundant. In fact, although it does make
> sense in Linux, both code inspection and some investigation I've just
> done, makes me believe that there won't be cases where a CPU is
> denied
> going offline because it sees rcu_pending() returning 1.
> 
> In fact, when we call rcu_pending(), within cpu_is_haltable(), we
> have
> already gone through it before. And if there were pending work, we've
> raised the softirq and dealt with it. If there weren't, neither there
> is now.
> 
> I'm therefore leaning toward removing rcu_pending() from the
> rcu_needs_cpu() check as well. At that point, we'll indeed have the
> check inside rcu_start_idle_timer(), be the opposite of the original
> check in rcu_needs_cpu(). :-)
> 
FTR, I'm not so sure of this last thing any longer. I mean, the
analysis I provided is still correct, but I'm investigating the other
possible race existing in the code that Tim has hinted at in his mail,
and I think it could be useful to have rcu_pending() checked in here,
to solve/avoid that one.

It's also possible that I'll actually remove it from rcu_needs_cpu(),
but to move it somewhere else... As I said, I'm still looking into the
problem.

Regards,
Dario
diff mbox

Patch

diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
index 666b7ef..01da96e 100644
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -43,8 +43,9 @@  static void do_idle(void)
 {
     unsigned int cpu = smp_processor_id();
 
+    rcu_idle_timer_start();
     sched_tick_suspend();
-    /* sched_tick_suspend() can raise TIMER_SOFTIRQ. Process it now. */
+    /* Timer related operations can raise TIMER_SOFTIRQ. Process it now. */
     process_pending_softirqs();
 
     local_irq_disable();
@@ -58,6 +59,7 @@  static void do_idle(void)
     local_irq_enable();
 
     sched_tick_resume();
+    rcu_idle_timer_stop();
 }
 
 void idle_loop(void)
diff --git a/xen/arch/x86/acpi/cpu_idle.c b/xen/arch/x86/acpi/cpu_idle.c
index 04c52e8..b97986f 100644
--- a/xen/arch/x86/acpi/cpu_idle.c
+++ b/xen/arch/x86/acpi/cpu_idle.c
@@ -576,10 +576,10 @@  static void acpi_processor_idle(void)
         return;
     }
 
+    rcu_idle_timer_start();
     cpufreq_dbs_timer_suspend();
-
     sched_tick_suspend();
-    /* sched_tick_suspend() can raise TIMER_SOFTIRQ. Process it now. */
+    /* Timer related operations can raise TIMER_SOFTIRQ. Process it now. */
     process_pending_softirqs();
 
     /*
@@ -593,6 +593,7 @@  static void acpi_processor_idle(void)
         local_irq_enable();
         sched_tick_resume();
         cpufreq_dbs_timer_resume();
+        rcu_idle_timer_stop();
         return;
     }
 
@@ -726,6 +727,7 @@  static void acpi_processor_idle(void)
 
     sched_tick_resume();
     cpufreq_dbs_timer_resume();
+    rcu_idle_timer_stop();
 
     if ( cpuidle_current_governor->reflect )
         cpuidle_current_governor->reflect(power);
diff --git a/xen/arch/x86/cpu/mwait-idle.c b/xen/arch/x86/cpu/mwait-idle.c
index ae9e92b..c426e41 100644
--- a/xen/arch/x86/cpu/mwait-idle.c
+++ b/xen/arch/x86/cpu/mwait-idle.c
@@ -743,10 +743,10 @@  static void mwait_idle(void)
 		return;
 	}
 
+	rcu_idle_timer_start();
 	cpufreq_dbs_timer_suspend();
-
 	sched_tick_suspend();
-	/* sched_tick_suspend() can raise TIMER_SOFTIRQ. Process it now. */
+	/* Timer related operations can raise TIMER_SOFTIRQ. Process it now. */
 	process_pending_softirqs();
 
 	/* Interrupts must be disabled for C2 and higher transitions. */
@@ -756,6 +756,7 @@  static void mwait_idle(void)
 		local_irq_enable();
 		sched_tick_resume();
 		cpufreq_dbs_timer_resume();
+                rcu_idle_timer_stop();
 		return;
 	}
 
@@ -802,6 +803,7 @@  static void mwait_idle(void)
 
 	sched_tick_resume();
 	cpufreq_dbs_timer_resume();
+	rcu_idle_timer_stop();
 
 	if ( cpuidle_current_governor->reflect )
 		cpuidle_current_governor->reflect(power);
diff --git a/xen/common/rcupdate.c b/xen/common/rcupdate.c
index f0fdc87..4586f2a 100644
--- a/xen/common/rcupdate.c
+++ b/xen/common/rcupdate.c
@@ -84,8 +84,14 @@  struct rcu_data {
     int cpu;
     struct rcu_head barrier;
     long            last_rs_qlen;     /* qlen during the last resched */
+
+    /* 3) idle CPUs handling */
+    struct timer idle_timer;
+    bool idle_timer_active;
 };
 
+#define RCU_IDLE_TIMER_PERIOD MILLISECS(10)
+
 static DEFINE_PER_CPU(struct rcu_data, rcu_data);
 
 static int blimit = 10;
@@ -402,7 +408,48 @@  int rcu_needs_cpu(int cpu)
 {
     struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
 
-    return (!!rdp->curlist || rcu_pending(cpu));
+    return (!!rdp->curlist || rcu_pending(cpu)) && !rdp->idle_timer_active;
+}
+
+/*
+ * Timer for making sure the CPU where a callback is queued does
+ * periodically poke rcu_pending(), so that it will invoke the callback
+ * not too late after the end of the grace period.
+ */
+void rcu_idle_timer_start()
+{
+    struct rcu_data *rdp = &this_cpu(rcu_data);
+
+    if (likely(!rdp->curlist))
+        return;
+
+    set_timer(&rdp->idle_timer, NOW() + RCU_IDLE_TIMER_PERIOD);
+    rdp->idle_timer_active = true;
+}
+
+void rcu_idle_timer_stop()
+{
+    struct rcu_data *rdp = &this_cpu(rcu_data);
+
+    if (likely(!rdp->idle_timer_active))
+        return;
+
+    rdp->idle_timer_active = false;
+    stop_timer(&rdp->idle_timer);
+}
+
+static void rcu_idle_timer_handler(void* data)
+{
+    /*
+     * Nothing, really... And in fact, we don't expect to ever get in here,
+     * as rcu_idle_timer_stop(), called while waking from idle, prevents that
+     * from happening by stopping the timer before the TIMER_SOFTIRQ handler has
+     * a chance to run.
+     *
+     * But that's fine, because all we want is the CPU that needs to execute
+     * the callback to be periodically woken up and check rcu_pending().
+     */
+    ASSERT_UNREACHABLE();
 }
 
 void rcu_check_callbacks(int cpu)
@@ -423,6 +470,8 @@  static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
 static void rcu_offline_cpu(struct rcu_data *this_rdp,
                             struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
 {
+    kill_timer(&rdp->idle_timer);
+
     /* If the cpu going offline owns the grace period we can block
      * indefinitely waiting for it, so flush it here.
      */
@@ -451,6 +500,7 @@  static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
     rdp->qs_pending = 0;
     rdp->cpu = cpu;
     rdp->blimit = blimit;
+    init_timer(&rdp->idle_timer, rcu_idle_timer_handler, (void*) rdp, cpu);
 }
 
 static int cpu_callback(
diff --git a/xen/include/xen/rcupdate.h b/xen/include/xen/rcupdate.h
index 561ac43..3402eb5 100644
--- a/xen/include/xen/rcupdate.h
+++ b/xen/include/xen/rcupdate.h
@@ -149,4 +149,7 @@  int rcu_barrier(void);
 void rcu_idle_enter(unsigned int cpu);
 void rcu_idle_exit(unsigned int cpu);
 
+void rcu_idle_timer_start(void);
+void rcu_idle_timer_stop(void);
+
 #endif /* __XEN_RCUPDATE_H */