Message ID | 150114249858.22910.4601418126082976816.stgit@Solace (mailing list archive) |
---|---|
State | New, archived |
On Thu, 27 Jul 2017, Dario Faggioli wrote: > Instead of having the CPU where a callback is queued, busy > looping on rcu_pending(), use a timer. > > In fact, we let the CPU go idla,e but we program a timer ^ idle, > that will periodically wake it up, for checking whether the > grace period has actually ended. > > It is kind of similar to introducing a periodic tick, but > with a much more limited scope, and a lot less overhead. In > fact, this timer is: > - only active for the CPU(s) that have callbacks queued, > waiting for the end of a grace period; > - only active when those CPU(s) are idle (and stopped as > soon as they resume execution). > > Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com> > --- > Cc: Stefano Stabellini <sstabellini@kernel.org> > Cc: Julien Grall <julien.grall@arm.com> > Cc: Jan Beulich <jbeulich@suse.com> > Cc: Andrew Cooper <andrew.cooper3@citrix.com> > --- > xen/arch/arm/domain.c | 4 ++- > xen/arch/x86/acpi/cpu_idle.c | 6 +++-- > xen/arch/x86/cpu/mwait-idle.c | 6 +++-- > xen/common/rcupdate.c | 52 ++++++++++++++++++++++++++++++++++++++++- > xen/include/xen/rcupdate.h | 3 ++ > 5 files changed, 65 insertions(+), 6 deletions(-) > > diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c > index 666b7ef..01da96e 100644 > --- a/xen/arch/arm/domain.c > +++ b/xen/arch/arm/domain.c > @@ -43,8 +43,9 @@ static void do_idle(void) > { > unsigned int cpu = smp_processor_id(); > > + rcu_idle_timer_start(); > sched_tick_suspend(); > - /* sched_tick_suspend() can raise TIMER_SOFTIRQ. Process it now. */ > + /* Timer related operations can raise TIMER_SOFTIRQ. Process it now. */ > process_pending_softirqs(); > > local_irq_disable(); > @@ -58,6 +59,7 @@ static void do_idle(void) > local_irq_enable(); > > sched_tick_resume(); > + rcu_idle_timer_stop(); > } > > void idle_loop(void) > diff --git a/xen/arch/x86/acpi/cpu_idle.c b/xen/arch/x86/acpi/cpu_idle.c > index 04c52e8..b97986f 100644 > --- a/xen/arch/x86/acpi/cpu_idle.c > +++ b/xen/arch/x86/acpi/cpu_idle.c > @@ -576,10 +576,10 @@ static void acpi_processor_idle(void) > return; > } > > + rcu_idle_timer_start(); > cpufreq_dbs_timer_suspend(); > - > sched_tick_suspend(); > - /* sched_tick_suspend() can raise TIMER_SOFTIRQ. Process it now. */ > + /* Timer related operations can raise TIMER_SOFTIRQ. Process it now. */ > process_pending_softirqs(); > > /* > @@ -593,6 +593,7 @@ static void acpi_processor_idle(void) > local_irq_enable(); > sched_tick_resume(); > cpufreq_dbs_timer_resume(); > + rcu_idle_timer_stop(); > return; > } > > @@ -726,6 +727,7 @@ static void acpi_processor_idle(void) > > sched_tick_resume(); > cpufreq_dbs_timer_resume(); > + rcu_idle_timer_stop(); > > if ( cpuidle_current_governor->reflect ) > cpuidle_current_governor->reflect(power); > diff --git a/xen/arch/x86/cpu/mwait-idle.c b/xen/arch/x86/cpu/mwait-idle.c > index ae9e92b..c426e41 100644 > --- a/xen/arch/x86/cpu/mwait-idle.c > +++ b/xen/arch/x86/cpu/mwait-idle.c > @@ -743,10 +743,10 @@ static void mwait_idle(void) > return; > } > > + rcu_idle_timer_start(); > cpufreq_dbs_timer_suspend(); > - > sched_tick_suspend(); > - /* sched_tick_suspend() can raise TIMER_SOFTIRQ. Process it now. */ > + /* Timer related operations can raise TIMER_SOFTIRQ. Process it now. */ > process_pending_softirqs(); > > /* Interrupts must be disabled for C2 and higher transitions. 
*/ > @@ -756,6 +756,7 @@ static void mwait_idle(void) > local_irq_enable(); > sched_tick_resume(); > cpufreq_dbs_timer_resume(); > + rcu_idle_timer_stop(); > return; > } > > @@ -802,6 +803,7 @@ static void mwait_idle(void) > > sched_tick_resume(); > cpufreq_dbs_timer_resume(); > + rcu_idle_timer_stop(); > > if ( cpuidle_current_governor->reflect ) > cpuidle_current_governor->reflect(power); > diff --git a/xen/common/rcupdate.c b/xen/common/rcupdate.c > index f0fdc87..4586f2a 100644 > --- a/xen/common/rcupdate.c > +++ b/xen/common/rcupdate.c > @@ -84,8 +84,14 @@ struct rcu_data { > int cpu; > struct rcu_head barrier; > long last_rs_qlen; /* qlen during the last resched */ > + > + /* 3) idle CPUs handling */ > + struct timer idle_timer; > + bool idle_timer_active; > }; > > +#define RCU_IDLE_TIMER_PERIOD MILLISECS(10) Isn't this a bit too short? How is it chosen? > static DEFINE_PER_CPU(struct rcu_data, rcu_data); > > static int blimit = 10; > @@ -402,7 +408,48 @@ int rcu_needs_cpu(int cpu) > { > struct rcu_data *rdp = &per_cpu(rcu_data, cpu); > > - return (!!rdp->curlist || rcu_pending(cpu)); > + return (!!rdp->curlist || rcu_pending(cpu)) && !rdp->idle_timer_active; > +} > + > +/* > + * Timer for making sure the CPU where a callback is queued does > + * periodically poke rcu_pedning(), so that it will invoke the callback > + * not too late after the end of the grace period. > + */ > +void rcu_idle_timer_start() > +{ > + struct rcu_data *rdp = &this_cpu(rcu_data); > + > + if (likely(!rdp->curlist)) > + return; > + > + set_timer(&rdp->idle_timer, NOW() + RCU_IDLE_TIMER_PERIOD); > + rdp->idle_timer_active = true; > +} > + > +void rcu_idle_timer_stop() > +{ > + struct rcu_data *rdp = &this_cpu(rcu_data); > + > + if (likely(!rdp->idle_timer_active)) > + return; > + > + rdp->idle_timer_active = false; > + stop_timer(&rdp->idle_timer); > +} > + > +static void rcu_idle_timer_handler(void* data) > +{ > + /* > + * Nothing, really... And in fact, we don't expect to ever get in here, > + * as rcu_idle_timer_stop(), called while waking from idle, prevent that > + * to happen by stopping the timer before the TIMER_SOFTIRQ handler has > + * a chance to run. > + * > + * But that's fine, because all we want is the CPU that needs to execute > + * the callback to be periodically woken up and check rcu_pending(). > + */ > + ASSERT_UNREACHABLE(); > } > > void rcu_check_callbacks(int cpu) > @@ -423,6 +470,8 @@ static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, > static void rcu_offline_cpu(struct rcu_data *this_rdp, > struct rcu_ctrlblk *rcp, struct rcu_data *rdp) > { > + kill_timer(&rdp->idle_timer); > + > /* If the cpu going offline owns the grace period we can block > * indefinitely waiting for it, so flush it here. > */ > @@ -451,6 +500,7 @@ static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, > rdp->qs_pending = 0; > rdp->cpu = cpu; > rdp->blimit = blimit; > + init_timer(&rdp->idle_timer, rcu_idle_timer_handler, (void*) rdp, cpu); > } > > static int cpu_callback( > diff --git a/xen/include/xen/rcupdate.h b/xen/include/xen/rcupdate.h > index 561ac43..3402eb5 100644 > --- a/xen/include/xen/rcupdate.h > +++ b/xen/include/xen/rcupdate.h > @@ -149,4 +149,7 @@ int rcu_barrier(void); > void rcu_idle_enter(unsigned int cpu); > void rcu_idle_exit(unsigned int cpu); > > +void rcu_idle_timer_start(void); > +void rcu_idle_timer_stop(void); > + > #endif /* __XEN_RCUPDATE_H */ >
On Mon, 2017-07-31 at 14:20 -0700, Stefano Stabellini wrote: > On Thu, 27 Jul 2017, Dario Faggioli wrote: > > > > diff --git a/xen/common/rcupdate.c b/xen/common/rcupdate.c > > index f0fdc87..4586f2a 100644 > > --- a/xen/common/rcupdate.c > > +++ b/xen/common/rcupdate.c > > @@ -84,8 +84,14 @@ struct rcu_data { > > int cpu; > > struct rcu_head barrier; > > long last_rs_qlen; /* qlen during the last > > resched */ > > + > > + /* 3) idle CPUs handling */ > > + struct timer idle_timer; > > + bool idle_timer_active; > > }; > > > > +#define RCU_IDLE_TIMER_PERIOD MILLISECS(10) > > Isn't this a bit too short? How is it chosen? >

It's totally arbitrary (and that would be the case for whatever value we choose).

Basically, it's how long, at worst, after the actual end of a grace period, a (batch of) callback(s) will be invoked. Currently, on Credit1 on ARM (without my patch, from this series, that suspends the tick) that's (by chance) 30 ms (or whatever value is chosen for the Credit1 timeslice). On Credit2 (on both ARM and x86) it's never, but on x86 it (apparently) is 'however frequently time sync rendezvouses happen' (which I don't recall, but it's longer), while on ARM it is (potentially) never.

I accept suggestions about alternative values, and I'm certainly fine with adding a comment containing something along the lines of the explanation above, but I fear it's going to be hard to figure out what value is actually the "absolute best".

In Linux (which is where the same 'callback book-keeping' happens for them), a tick with a frequency of 1000Hz (== 1ms) is considered 'low-latency/Desktop/real-time'. For us, as said above, the tick --when it's there-- would be 30ms by default. I just went with something in the middle.

Also, it's not that we'll have a 10ms periodic timer going on for a significant amount of time. In fact we expect it to actually fire just once (for each grace period). It's not 100% guaranteed that it won't be reprogrammed and fire a couple of times, but it should not, in the vast majority of cases.

What makes you think it's short?

Regards, Dario
On Tue, 1 Aug 2017, Dario Faggioli wrote: > On Mon, 2017-07-31 at 14:20 -0700, Stefano Stabellini wrote: > > On Thu, 27 Jul 2017, Dario Faggioli wrote: > > > > > > diff --git a/xen/common/rcupdate.c b/xen/common/rcupdate.c > > > index f0fdc87..4586f2a 100644 > > > --- a/xen/common/rcupdate.c > > > +++ b/xen/common/rcupdate.c > > > @@ -84,8 +84,14 @@ struct rcu_data { > > > int cpu; > > > struct rcu_head barrier; > > > long last_rs_qlen; /* qlen during the last > > > resched */ > > > + > > > + /* 3) idle CPUs handling */ > > > + struct timer idle_timer; > > > + bool idle_timer_active; > > > }; > > > > > > +#define RCU_IDLE_TIMER_PERIOD MILLISECS(10) > > > > Isn't this a bit too short? How is it chosen? > > > It's totally arbitrary (and that would be the case for whatever value > we choose). > > Basically, it's how long, at worst, after the actual end of a grace > period, a (batch of) callback(s) will be invoked. Currently, on Credit1 > on ARM (without my patch, from this series, that suspends the tick) > that's (by chance) 30 ms (or whatever value is chosen for Credit1 > timeslice). On Credit2 (on both ARM and x86), it's never, but on x86 it > (apparently) is 'however frequent time sync rendezvouses happs' (which > I don't recall, but it's longer), while on ARM is (potentially) never. > > I accept suggestions about alternatives values, and I'm certainly fine > with adding a comment, containing something along the lines of the > explanation above, but I fear it's going to be hard to figure out what > value is actually the "absolute best". > > In Linux (which is where the same 'callback book-keeping' happens for > them), a tick with a frequency of 1000Hz (== 1ms) is considered 'low- > latency/Deskop/real-time'. For us, as said above, tick --when it's > there-- would be 30ms by default. > > I just went with something in the middle. > > Also, it's not that we'll have a 10ms periodic timer going on for > significant amount of time. In fact we expect it to actually fire just > once (for each grace period). It's not 100% guaranteed that it won't be > reprogrammed and fire a couple of times, but it should not, in the vast > majority of cases. > > What makes you think it's short? In terms of power saving and CPU sleep states, 10ms is not much to sleep for. I wonder if there are any power saving benefits in sleeping for only 10ms (especially on x86 where entering and exiting CPU sleep states takes longer, to be confirmed). We might as well do the thing we need to do immediately? I guess we cannot do that?
On Mon, 2017-07-31 at 16:58 -0700, Stefano Stabellini wrote: > On Tue, 1 Aug 2017, Dario Faggioli wrote: > > On Mon, 2017-07-31 at 14:20 -0700, Stefano Stabellini wrote: > > > On Thu, 27 Jul 2017, Dario Faggioli wrote: > > > > > > > > diff --git a/xen/common/rcupdate.c b/xen/common/rcupdate.c > > > > index f0fdc87..4586f2a 100644 > > > > --- a/xen/common/rcupdate.c > > > > +++ b/xen/common/rcupdate.c > > > > @@ -84,8 +84,14 @@ struct rcu_data { > > > > int cpu; > > > > struct rcu_head barrier; > > > > long last_rs_qlen; /* qlen during the last > > > > resched */ > > > > + > > > > + /* 3) idle CPUs handling */ > > > > + struct timer idle_timer; > > > > + bool idle_timer_active; > > > > }; > > > > > > > > +#define RCU_IDLE_TIMER_PERIOD MILLISECS(10) > > > > > > Isn't this a bit too short? How is it chosen? > > > > > > > What makes you think it's short? > > In terms of power saving and CPU sleep states, 10ms is not much to > sleep > for. I wonder if there are any power saving benefits in sleeping for > only 10ms (especially on x86 where entering and exiting CPU sleep > states > takes longer, to be confirmed). >

I *think* we should be fine with, say, 100ms. But that's again, guess/rule of thumb, nothing more than that. And, TBH, I'm not even sure what a good experiment/benchmark would be, to assess whether a particular value is good or bad. :-/

> We might as well do the thing we need > to do immediately? I guess we cannot do that? >

You're guessing correctly, we can't. It's indeed a bit tricky, and it took me a little while as well to figure it all out properly.

Basically, let's say that, at time t1, on CPU1, someone calls call_rcu(). The situation on the other CPUs is: CPU0 busy; CPU2 idle (no timer pending); CPU3 busy.

So, a new grace period starts, and its exact end will be when CPU0, CPU1 and CPU3 have quiesced once (in Xen, quiescence means: "going through do_softirq()").

But RCU is a passive mechanism, i.e., we rely on each CPU coming to the RCU core logic and telling it <<hey, btw, I've quiesced>>. Let's say that CPU0 quiesces at time t2 > t1. CPU1 quiesces at time t3 > t2, and goes fully idle (no timer pending). CPU3 quiesces at time t4 > t3. Now, at time t4, CPU1 can actually invoke the callback, queued at time t1, from within call_rcu().

This patch series solves two problems of our current RCU implementation:

1) right now, we don't only wait for CPU0, CPU1 and CPU3, we also wait for CPU2. But since CPU2 is fully idle, it just won't bother telling RCU that it has quiesced (well, on x86, that would actually happen at some point, while on ARM, it is really possible that this never happens!). This is solved in patch 3, by introducing the cpumask;

2) now (after patch 3) we know that we can just avoid waiting for CPU2. Good. But at time t4, when CPU3 quiesces, marking the end of the grace period, how would CPU1 --which is fully idle-- know that it can now safely invoke the callback? Again, RCU is passive, so it relies on CPU1 to figure that out on its own, next time it wakes up, e.g., because of the periodic tick timer. But we don't have a periodic tick timer! Well, this again means that, on x86, CPU1 will actually figure that out at some (unpredictable) point in the future. On ARM, not so much. The purpose of the timer in this patch is to make sure it always will. In fact, with patches 4 and 5 applied, at time t3, we let CPU1 go idle, but we program the timer. It will fire at t3+T (with T=10ms, right now). When this happens, if t3+T > t4, CPU1 invokes the callback, and we're done.
If not (and CPU1 is still idle) we retry in another T.

So, when you say "immediately", *when* do you actually mean?

We can't invoke the callback at t3 (i.e., immediately after CPU1 quiesces), because we need to wait for CPU3 to do the same.

We can't invoke the callback when CPU3 quiesces, at t4 (i.e., immediately after the grace period ends) either, because the callback is on CPU1, not on CPU3.

Linux's solution is not to stop the tick for CPU1 at time t3. It will be stopped only after the first firing of the tick itself at time t > t4 (if CPU1 is still idle, of course). This timer thing is, IMO, pretty similar. The only difference is that we don't have a tick to not stop... So I had to make up a very special one. :-D

TL;DR, yes, I also would have loved the world (or even just this problem) to be simple. It's not! ;-P

Thanks and Regards, Dario
On 01/08/2017 00:58, Stefano Stabellini wrote: > On Tue, 1 Aug 2017, Dario Faggioli wrote: >> On Mon, 2017-07-31 at 14:20 -0700, Stefano Stabellini wrote: >>> On Thu, 27 Jul 2017, Dario Faggioli wrote: >>>> >>>> diff --git a/xen/common/rcupdate.c b/xen/common/rcupdate.c >>>> index f0fdc87..4586f2a 100644 >>>> --- a/xen/common/rcupdate.c >>>> +++ b/xen/common/rcupdate.c >>>> @@ -84,8 +84,14 @@ struct rcu_data { >>>> int cpu; >>>> struct rcu_head barrier; >>>> long last_rs_qlen; /* qlen during the last >>>> resched */ >>>> + >>>> + /* 3) idle CPUs handling */ >>>> + struct timer idle_timer; >>>> + bool idle_timer_active; >>>> }; >>>> >>>> +#define RCU_IDLE_TIMER_PERIOD MILLISECS(10) >>> >>> Isn't this a bit too short? How is it chosen? >>> >> It's totally arbitrary (and that would be the case for whatever value >> we choose). >> >> Basically, it's how long, at worst, after the actual end of a grace >> period, a (batch of) callback(s) will be invoked. Currently, on Credit1 >> on ARM (without my patch, from this series, that suspends the tick) >> that's (by chance) 30 ms (or whatever value is chosen for Credit1 >> timeslice). On Credit2 (on both ARM and x86), it's never, but on x86 it >> (apparently) is 'however frequent time sync rendezvouses happs' (which >> I don't recall, but it's longer), while on ARM is (potentially) never. >> >> I accept suggestions about alternatives values, and I'm certainly fine >> with adding a comment, containing something along the lines of the >> explanation above, but I fear it's going to be hard to figure out what >> value is actually the "absolute best". >> >> In Linux (which is where the same 'callback book-keeping' happens for >> them), a tick with a frequency of 1000Hz (== 1ms) is considered 'low- >> latency/Deskop/real-time'. For us, as said above, tick --when it's >> there-- would be 30ms by default. >> >> I just went with something in the middle. >> >> Also, it's not that we'll have a 10ms periodic timer going on for >> significant amount of time. In fact we expect it to actually fire just >> once (for each grace period). It's not 100% guaranteed that it won't be >> reprogrammed and fire a couple of times, but it should not, in the vast >> majority of cases. >> >> What makes you think it's short? > > In terms of power saving and CPU sleep states, 10ms is not much to sleep > for. I wonder if there are any power saving benefits in sleeping for > only 10ms (especially on x86 where entering and exiting CPU sleep states > takes longer, to be confirmed). We might as well do the thing we need > to do immediately? I guess we cannot do that? Given that on ARM we use WFI to sleep today, I think we can already benefit some power saving even for 10ms. Although, it will depend on the processor. Cheers,
Hi Dario,

On 27/07/2017 09:01, Dario Faggioli wrote: > Instead of having the CPU where a callback is queued, busy > looping on rcu_pending(), use a timer. > > In fact, we let the CPU go idla,e but we program a timer > that will periodically wake it up, for checking whether the > grace period has actually ended. > > It is kind of similar to introducing a periodic tick, but > with a much more limited scope, and a lot less overhead. In > fact, this timer is: > - only active for the CPU(s) that have callbacks queued, > waiting for the end of a grace period; > - only active when those CPU(s) are idle (and stopped as > soon as they resume execution).

If I read this correctly, it means on ARM the idling will now get interrupted periodically. This is a bit unfortunate, given that if you have a CPU doing nothing, you would still interrupt it intermittently.

I was expecting that we could remove the CPU from the RCU whilst it is idle. Is there any reason for not doing that?

Cheers,
On Tue, 2017-08-01 at 09:54 +0100, Julien Grall wrote: > Hi Dario, > > On 27/07/2017 09:01, Dario Faggioli wrote: > > Instead of having the CPU where a callback is queued, busy > > looping on rcu_pending(), use a timer. > > > > In fact, we let the CPU go idla,e but we program a timer > > that will periodically wake it up, for checking whether the > > grace period has actually ended. > > > > It is kind of similar to introducing a periodic tick, but > > with a much more limited scope, and a lot less overhead. In > > fact, this timer is: > > - only active for the CPU(s) that have callbacks queued, > > waiting for the end of a grace period; > > - only active when those CPU(s) are idle (and stopped as > > soon as they resume execution). > > If I read this correctly, it means on ARM the idling will now get > interrupted periodically. This is a bit unfortunate, given that if > you > have a CPU doing nothing, you would still interrupt it > intermittently. > Not really periodically, not always, at least. What this really means is that a CPU that is idle, *but* have pending RCU callbacks, will be interrupted periodically to see if the grace period ended, so it can invoke the callbacks. As soon as this (callbacks being invoked) will have happened, we won't interrupt it any longer. And idle CPUs _without_ queued RCU callbacks, won't be interrupted at all. > I was expected that we could remove the CPU from the RCU whilst it > is > idle. Is there any reason for not doing that? > I'm not sure I understand what you mean here. I tried to explain as good as I could how this works, and why I think it can't work in other ways, in this reply to Stefano: <1501548445.30551.5.camel@citrix.com> Every CPU that participates in the grace period, and has already quiesced, is "removed from RCU", and hence, when it becomes idle, it is never interrupted (by this timer). With the only exception of the CPU(s) that has queued callbacks. We simply can't forget about these CPUs, even if they go idle. If we do, the callbacks won't be invoked never (or will only be invoked when the CPU becomes active again, which may happen really really late, which is part of the reason why we're seeing the bug we're seeing). Linux does this, in the *exact* same way (well, actually, in a totally different way, from an implementation point of view, but the effect is indeed exactly the same): See here: http://elixir.free-electrons.com/linux/v2.6.21/source/kernel/time/tick-sched.c#L198 We're in tick_nohz_stop_sched_tick(), i.e., the CPU is going idle, and the periodic timer tick is being stopped (no interruptions). But if (rcu_needs_cpu(cpu)) delta_jiffies = 1; Where, rcu_needs_cpu() means: /* * Check to see if any future RCU-related work will need to be done * by the current CPU, even if none need be done immediately, returning * 1 if so. */ int rcu_needs_cpu(int cpu) Then, again in tick_nohz_stop_sched_tick() /* * Do not stop the tick, if we are only one off * or if the cpu is required for rcu */ if (!ts->tick_stopped && delta_jiffies == 1) goto out; And in fact, in my testing, without patches 3 and 5 applied, the bug is still there. Regards, Dario
Hi Dario, On 01/08/17 10:17, Dario Faggioli wrote: > On Tue, 2017-08-01 at 09:54 +0100, Julien Grall wrote: >> Hi Dario, >> >> On 27/07/2017 09:01, Dario Faggioli wrote: >>> Instead of having the CPU where a callback is queued, busy >>> looping on rcu_pending(), use a timer. >>> >>> In fact, we let the CPU go idla,e but we program a timer >>> that will periodically wake it up, for checking whether the >>> grace period has actually ended. >>> >>> It is kind of similar to introducing a periodic tick, but >>> with a much more limited scope, and a lot less overhead. In >>> fact, this timer is: >>> - only active for the CPU(s) that have callbacks queued, >>> waiting for the end of a grace period; >>> - only active when those CPU(s) are idle (and stopped as >>> soon as they resume execution). >> >> If I read this correctly, it means on ARM the idling will now get >> interrupted periodically. This is a bit unfortunate, given that if >> you >> have a CPU doing nothing, you would still interrupt it >> intermittently. >> > Not really periodically, not always, at least. What this really means > is that a CPU that is idle, *but* have pending RCU callbacks, will be > interrupted periodically to see if the grace period ended, so it can > invoke the callbacks. > > As soon as this (callbacks being invoked) will have happened, we won't > interrupt it any longer. > > And idle CPUs _without_ queued RCU callbacks, won't be interrupted at > all. Oh, the commit message is not clear about it. The wording gives the impression the timer will always be here on every idle CPU(s). In the case on active CPU(s) you specific mention the end of the grace period. Thank you for the clarification and please disregard the rest of my e-mail then. Cheers,
On Tue, 2017-08-01 at 11:22 +0100, Julien Grall wrote: > On 01/08/17 10:17, Dario Faggioli wrote: > > As soon as this (callbacks being invoked) will have happened, we > > won't > > interrupt it any longer. > > > > And idle CPUs _without_ queued RCU callbacks, won't be interrupted > > at > > all. > > Oh, the commit message is not clear about it. The wording gives the > impression the timer will always be here on every idle CPU(s). In > the > case on active CPU(s) you specific mention the end of the grace > period. > Ok, I'll try to improve the changelog and resend. Thanks and Regards, Dario
On Tue, 1 Aug 2017, Dario Faggioli wrote: > On Mon, 2017-07-31 at 16:58 -0700, Stefano Stabellini wrote: > > On Tue, 1 Aug 2017, Dario Faggioli wrote: > > > On Mon, 2017-07-31 at 14:20 -0700, Stefano Stabellini wrote: > > > > On Thu, 27 Jul 2017, Dario Faggioli wrote: > > > > > > > > > > diff --git a/xen/common/rcupdate.c b/xen/common/rcupdate.c > > > > > index f0fdc87..4586f2a 100644 > > > > > --- a/xen/common/rcupdate.c > > > > > +++ b/xen/common/rcupdate.c > > > > > @@ -84,8 +84,14 @@ struct rcu_data { > > > > > int cpu; > > > > > struct rcu_head barrier; > > > > > long last_rs_qlen; /* qlen during the last > > > > > resched */ > > > > > + > > > > > + /* 3) idle CPUs handling */ > > > > > + struct timer idle_timer; > > > > > + bool idle_timer_active; > > > > > }; > > > > > > > > > > +#define RCU_IDLE_TIMER_PERIOD MILLISECS(10) > > > > > > > > Isn't this a bit too short? How is it chosen? > > > > > > > > > > What makes you think it's short? > > > > In terms of power saving and CPU sleep states, 10ms is not much to > > sleep > > for. I wonder if there are any power saving benefits in sleeping for > > only 10ms (especially on x86 where entering and exiting CPU sleep > > states > > takes longer, to be confirmed). > > > I *think* we should be fine with, say, 100ms. But that's again, > guess/rule of thumb, nothing more than that. And, TBH, I'm not even > sure what a good experiment/benchmark would be, to assess whether a > particular value is good or bad. :-/ Given the description below, it's possible that the new timer has to fire several times before the callback can be finally invoked, right? I would make some measurements to check how many times the timer has to fire (how long we actually need to wait before calling the callback) in various scenarios. Then I would choose a value to minimize the number of unnecessary wake-ups. > > We might as well do the thing we need > > to do immediately? I guess we cannot do that? > > > You're guessing correct, we can't. It's indeed a bit tricky, and it > took it a little bit to me as well to figure all of it out properly. > > Basically, let's say that, at time t1, on CPU1, someone calls > call_rcu(). The situation on the other CPUs is: CPU0 busy; CPU2 idle > (no timer pending); CPU3 busy. > > So, a new grace period starts, and its exact end will be when CPU0, > CPU1 and CPU3 have quiesced once (in Xen, quiescence means: "going > through do_softirq()"). > > But RCU it's a passive mechanism, i.e., we rely on each CPU coming to > the RCU core logic, and tell <<hey, btw, I've quiesced>>. > Let's say that CPU0 quiesces at time t2 > t1. CPU1 quiesces at time > t3 > t2, and goes fully idle (no timer pending). CPU3 quiesces at time > t4 > t3. Now, at time t4, CPU1 can actually invoke the callbeck, queued > at time t1, from within call_rcu(). > > This patch series solves two problems, of our current RCU > implementation: > > 1) right now, we don't only wait for CPU0, CPU1 and CPU3, we also wait > for CPU2. But since CPU2 is fully idle, it just won't bother > telling RCU that it has quiesced (well, on x86, that would actually > happen at some point, while on ARM, it is really possible that this > never happens!). This is solved in patch 3, by introducing the > cpumask; > > 2) now (after patch 3) we know that we just can avoid waiting for > CPU2. Good. But at time t4, when CPU3 quiesces, marking the end of > the grace period, how would CPU1 --which is fully idle-- know that > it can now safely invoke the callback? 
Again, RCU is passive, so it > relies on CPU1 to figure that out on its own, next time it wakes > up, e.g., because of the periodic tick timer. But we don't have a > periodic tick timer! Well, this again means that, on x86, CPU1 will > actually figure that out at some (unpredictable) point in future. > On ARM, not so much. The purpose of the timer in this patch is to > make sure it always will. > In fact, with patches 4 and 5 applied, at time t3, we let CPU1 go > idle, but we program the timer. It will fire at t3+T (with T=10ms, > right now). When this happens, if t3+T > t4, CPU1 invokes the > callback, and we're done. If not (and CPU1 is still idle) we retry > in another T. > > So, when you say "immediately", *when* do you actually mean? > > We can't invoke the callback at t3 (i.e., immediately after CPU1 > quiesces), because we need to wait for CPU3 to do the same. > > We can't invoke the callback when CPU3 quiesces, at t4 (i.e., > immediately after the grace period ends) either, because the callback > it's on CPU1, not on CPU3. > > Linux's solution is not to stop the tick for CPU1 at time t3. It will > be stopped only after the first firing of the tick itself at time > t > t4 (if CPU1 is still idle, of course). This timer thing is, IMO, > pretty similar. The only difference is that we don't have a tick to not > stop... So I had to make up a very special one. :-D > > TL;DR, yes, I also would have loved the world (or even just this > problem) to be simple. It's not! ;-P
Hey, apologies in advance for the very long email! :-O

On Tue, 2017-08-01 at 12:13 -0700, Stefano Stabellini wrote: > Given the description below, it's possible that the new timer has to > fire several times before the callback can be finally invoked, right? > > I would make some measurements to check how many times the timer has > to > fire (how long we actually need to wait before calling the callback) > in > various scenarios. Then I would choose a value to minimize the number > of > unnecessary wake-ups. >

You seem to be thinking of this timer as something that disturbs the idleness of a CPU. It's not, because the CPU is not idle! It has a callback waiting to be invoked, and it had better do that as soon as possible, in order to:

- free the memory/the resources as soon as practically possible. E.g., if we wait for, say, 1 sec, destroying a VM with a "passed-through" I/O range, and then trying to create another one, with the same resource assigned, would fail until that 1 sec expires. True, we save more power on that CPU, but it looks neither fair nor correct to me to strive toward achieving that, in this situation. Power should indeed be saved, and unnecessary wake-ups prevented, but when there isn't anything to do! If there is stuff to do, and this is the case, our goal should be to get it done as soon as possible, so that, afterwards, we can go into a (hopefully) long sleep.

- by delaying detection of a finished grace period, we're also delaying the beginning of a new one. This means, in case there are RCU operations waiting for it (because they arrived too late for the current one, and have been queued), we're potentially significantly delaying them too. So, basically, this again looks like the same kind of paradox to me, where we have stuff to do, but we decide to try to save power.

RCU is not a mechanism for allowing the CPUs to stay idle longer; it's a lockless and asymmetric serialization primitive. And as with any serialization primitive --either lock-based or lockless-- keeping the duration of critical sections small is rather important.

So, I think the goal here is not minimizing the wakeups; it much rather is completing the grace period sooner rather than later. In theory, if rcu_needs_cpu() returns true, we may very well not even let the CPU go idle, and have it spin (which indeed is what happens without this patch). That would guarantee a prompt detection of CPU quiescence, and of the ending of a grace period. Then, since we know that spinning consumes a lot of power, and generates overhead that, potentially, may even affect and slow down activities going on on other CPUs, we let it go idle with a timer armed. But that's a compromise, and can't be confused with the primary goal. And any nanosecond that we spend, on that CPU, consuming less power than what we'd have consumed running a tight loop around rcu_pending(), we should be thankful for, instead of regretting that there could have been more. :-)

In Linux, as said a few times, they do the same, and the role of this new timer I'm introducing here is played by the tick. The tick period is controlled by the HZ config variable, possible values for which (in 2.6.21, where CONFIG_NO_HZ was introduced for the first time for all arches) are 100Hz, 250Hz, 300Hz and 1000Hz, which translate into 10ms, 4ms, 3.33ms or 1ms. Default is 250Hz == 4ms. The periodic tick is the minimum time granularity of the kernel for a lot of subsystems (especially in that kernel), so it's something that can be considered to be running pretty frequently.
As also said many times, we don't have any such thing, but still I think we should use something with a similar order of magnitude.

[NOTE that I'm not using Linux as an example on the grounds that what is done in Linux is always correct, or always what we should also do. However: (1) RCU has been basically invented for Linux, or at least Linux is where they've seen the widest adoption and development, and (2) our RCU code has been (not in the best possible way, allow me to say) imported from there. Therefore, looking at what's done in Linux seems reasonable to me, in this case.]

About the kind of measurements you're suggesting doing: I think I understand the rationale behind it, but at the same time, I'm quite sure that what we'd get would be pseudo-random numbers. In fact, the length of this "wait time", although it has a relationship with the workload being run, also depends on a lot of other things within the *system* as a whole. It would be basically impossible to run any meaningful statistical analysis on the results and come to something that is good enough in general. Think, for instance, about how this is not at all architecture-specific code, but the behavior we're seeing is so different between x86 and ARM. Think about how this doesn't really have anything to do with the scheduler, but the behavior we observe varies depending on some internal implementation details of a scheduler!

What follows is a more detailed explanation of how things work, in a simple enough scenario.

+ == CPU is busy
. == CPU is idle
R == when call_rcu() is invoked (i.e., beginning of new grace period)
Q == CPU quiesces (when all have done this, grace period ends).
     This happens when do_softirq() is called on the CPU
C == pending callbacks are invoked

                      t1
                      |
                      |
CPU1 +++R+++++++++++++Q..............
CPU2 ++++++++++++++++++++++++++Q++...
                               |
                               |
                               t2

If CPU1 is where call_rcu() happens, and CPU2 is the only other busy CPU in the system at the time, and T is the period of our timer, the following cases are possible:

x) t2 < t1
   In this case, the timer never fires:

                      t1
                      |
                      |
CPU1 +++R+++++++++++++QC.........
CPU2 ++++++++++++++Q.............
                   |
                   |
                   t2

x) t2 - t1 < T
   In this case, the timer only fires once

                 t1      t1+T
                 |       |
                 |       |
CPU1 +++R++++++++Q.......C......
CPU2 +++++++++++++++++Q.........
                      |
                      |
                      t2

x) t2 - t1 > (n-1)*T

             t1      t1+T    t1+2T   t1+3T   t1+4T
             |       |       |       |       |
             |       |       |       |       |
CPU1 +R++++++Q...............................C..
CPU2 ++++++++++++++++++++++++++++++++++++Q......
                                         |
                                         |
                                         t2

   In this case, the timer will fire n times (with n=4 in the example).

So, you're suggesting to measure, under different workloads, and then do some kind of averaging/statistical analysis on, t2-t1. What I'm saying is that the occurrence of those events marked 'Q', on the appropriate CPU, is really hard to predict. In fact, t2-t1 is, basically, the difference between the times when the two CPUs call do_softirq(). How much is the frequency of invocation of do_softirq() related to the actual characteristics of the workload we're running? Well, a relationship sure exists, but not one that I think is easy (possible?) to generalize... And here we're also talking about considering *different* workloads! :-O

For instance, because of either hard-affinity, soft-affinity, or scheduling weights, there may be more or fewer preemptions on the CPUs in question (and a preemption means going through do_softirq())... And that's not related to the system load a particular workload produces, as it's totally under the control of the user.
So we'd have not only to measure with different workloads, but for each workload, we'd have to test a certain number of variations of the scheduling parameters of the vCPUs! :-O On x86, there's an upper bound to the time CPUs will stay idle, because there's always (something like) a timer running (for time sync rendezvous, said Andrew, and I haven't checked myself, but have no reason to doubt that to be the case), while on ARM there's no such thing (or we weren't here! :-D). So we'd have to measure each workload, with each combination of parameters, on each (present and future) architecture we support! :-O CPU onlining and offlining, or CPUs moving around cpupools, can cause timers (and RCU callbacks even, in the online/offline case!) to be moved around, which means they'd be firing (and a timer firing means going through do_softirq()) on different CPUs from where we expected them to. So, we'd have to measure each workload, with each combination of parameters, on each architecture, while doing CPU online/offline, and while moving CPUs around pools! :-O And this was just to mention the few examples I could fish from the top of my head... No, IMO, there are too many variables and too many unrelated factors involved, to even hope that any set of measurements we actually manage to put together, can be representative of the general case. *PERHAPS* we can think of an heuristic... Provided it's not too complex, or at least, that it does not try to capture all this variability (because that'd be equally impossible). Something like this: when a CPU with queued callbacks quiesces and then goes idle, we know how many other CPUs, among the ones that are participating in the grace period, have not quiesced yet. Instead of using, say, always 10ms, we can program the timer to fire at something like 1ms*cpumask_weight(not_yet_quiesced_cpus). In fact, I think it makes sense to assume that, if there are only few CPUs involved in the grace period, that still has not quiesced, then the grace period itself is about to finish, and we should check frequently whether or not that has happened. if, OTOH, there are a lot of CPUs involved in the grace period, that still haven't quiesced, we can check whether the grace period has ended in a less eager way. This will make the period of the timer vary, i.e., it may be different between two subsequent timer_set(), but that is not a problem at all. Basically, we're making the timer kind of adaptive. This does not work well in the case where there is, e.g., only 1 CPU that has not quiesced yet (so we'd set the timer to fire very frequently), but it takes a lot of time for such CPU to do so. Well, it's an heuristic, so it can't always work well.. but I think this isn't a common situation. Would you be happy with something like this? Regards, Dario
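For illustration only, here is a minimal sketch of the adaptive-period idea floated in the message above, written as if it lived in xen/common/rcupdate.c next to the posted patch. The helper name rcu_idle_timer_period() and the 1ms step are invented for this sketch, and the mask of not-yet-quiesced CPUs is assumed to be the rcu_ctrlblk cpumask the series already relies on; none of this is part of the posted patch.

/*
 * Sketch (not the actual patch): scale the idle timer period with the
 * number of CPUs that still have to quiesce in the current grace period.
 * Few CPUs left ==> the grace period is probably about to end, so poll
 * more often; many CPUs left ==> poll less eagerly.
 */
#define RCU_IDLE_TIMER_PERIOD_STEP MILLISECS(1)

static s_time_t rcu_idle_timer_period(void)
{
    /* Lock-free read of the mask: good enough for a heuristic. */
    unsigned int pending = cpumask_weight(&rcu_ctrlblk.cpumask);

    /* Never return 0, even if everyone else has quiesced already. */
    return RCU_IDLE_TIMER_PERIOD_STEP * (pending ? pending : 1);
}

void rcu_idle_timer_start(void)
{
    struct rcu_data *rdp = &this_cpu(rcu_data);

    if ( likely(!rdp->curlist) )
        return;

    set_timer(&rdp->idle_timer, NOW() + rcu_idle_timer_period());
    rdp->idle_timer_active = true;
}

Whether 1ms per pending CPU is a sensible step, and whether the lock-free read of the mask is acceptable, is exactly the kind of guesswork discussed above; the v2 series may well end up doing something different.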
>>> Dario Faggioli <dario.faggioli@citrix.com> 07/27/17 10:01 AM >>> >Instead of having the CPU where a callback is queued, busy >looping on rcu_pending(), use a timer. Isn't this rcu_needs_cpu(), according to patch 4? >--- a/xen/arch/x86/acpi/cpu_idle.c >+++ b/xen/arch/x86/acpi/cpu_idle.c >@@ -576,10 +576,10 @@ static void acpi_processor_idle(void) >return; >} > >+ rcu_idle_timer_start(); >cpufreq_dbs_timer_suspend(); Is the ordering wrt cpufreq handling here arbitrary? If so, wouldn't it be better to suspend cpufreq handling before starting the idle timer (and also invert the ordering when going back to normal operation below)? >- >sched_tick_suspend(); At which point I'd then wonder whether this couldn't be integrated into sched_tick_{suspend,resume}(), avoiding arch-specific code to be altered. >@@ -756,6 +756,7 @@ static void mwait_idle(void) > local_irq_enable(); > sched_tick_resume(); > cpufreq_dbs_timer_resume(); >+ rcu_idle_timer_stop(); Indentation looks odd here, but I can't exclude this being an effect produced by my mail web frontend. >--- a/xen/common/rcupdate.c >+++ b/xen/common/rcupdate.c >@@ -84,8 +84,14 @@ struct rcu_data { >int cpu; >struct rcu_head barrier; >long last_rs_qlen; /* qlen during the last resched */ >+ >+ /* 3) idle CPUs handling */ >+ struct timer idle_timer; >+ bool idle_timer_active; >}; > >+#define RCU_IDLE_TIMER_PERIOD MILLISECS(10) If I'm not mistaken someone else had already commented on this: If this is (mostly) arbitrary, please say so in a comment. >@@ -402,7 +408,48 @@ int rcu_needs_cpu(int cpu) >{ >struct rcu_data *rdp = &per_cpu(rcu_data, cpu); > >- return (!!rdp->curlist || rcu_pending(cpu)); >+ return (!!rdp->curlist || rcu_pending(cpu)) && !rdp->idle_timer_active; Please take the opportunity and drop the pointless !! here (unless it's needed for better matching up with the Linux original). >+} >+ >+/* >+ * Timer for making sure the CPU where a callback is queued does >+ * periodically poke rcu_pedning(), so that it will invoke the callback >+ * not too late after the end of the grace period. >+ */ >+void rcu_idle_timer_start() >+{ >+ struct rcu_data *rdp = &this_cpu(rcu_data); >+ >+ if (likely(!rdp->curlist)) >+ return; I would have expected this to be the inverse of the original condition in rcu_needs_cpu() - why is there no rcu_pending() invocation here? >@@ -451,6 +500,7 @@ static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, >rdp->qs_pending = 0; >rdp->cpu = cpu; >rdp->blimit = blimit; >+ init_timer(&rdp->idle_timer, rcu_idle_timer_handler, (void*) rdp, cpu); Again, unless it is this bogus way in the Linux original, please drop the pointless cast, or at least correct its style. Jan
On Mon, 2017-08-07 at 02:54 -0600, Jan Beulich wrote: > > > > Dario Faggioli <dario.faggioli@citrix.com> 07/27/17 10:01 AM > > > > >>> > > > > Instead of having the CPU where a callback is queued, busy > > looping on rcu_pending(), use a timer. > > Isn't this rcu_needs_cpu(), according to patch 4? > I was referring to the rcu_pending() in do_softirq(). In fact, if this (roughly) is idle_loop: for ( ; ; ) { if ( unlikely(tasklet_work_to_do(cpu)) ) do_tasklet(); else pm_idle(); do_softirq(); check_for_livepatch_work(); } we don't have tasklet work to do, so we call pm_idle(). However, if we have a callback queued, rcu_needs_cpu() returns true (without calling rcu_pending()), which means cpu_is_haltable() returns false, and hence we exit pm_idle() without actually going idle, and we call do_softirq(), which does: if ( rcu_pending(cpu) ) rcu_check_callbacks(cpu); in a rather tight loop. IAC, I certainly can rephrase the sentence above, and make all this more clear. > > --- a/xen/arch/x86/acpi/cpu_idle.c > > +++ b/xen/arch/x86/acpi/cpu_idle.c > > @@ -576,10 +576,10 @@ static void acpi_processor_idle(void) > > return; > > } > > > > > + rcu_idle_timer_start(); > > cpufreq_dbs_timer_suspend(); > > Is the ordering wrt cpufreq handling here arbitrary? > Yes. > If so, wouldn't it be > better to suspend cpufreq handling before starting the idle timer > (and > also invert the ordering when going back to normal operation below)? > Well, as said above, it's arbitrary. Therefore, yes, I'm ok with doing things like you suggest. > > - > > sched_tick_suspend(); > > At which point I'd then wonder whether this couldn't be integrated > into > sched_tick_{suspend,resume}(), avoiding arch-specific code to be > altered. > Sure, I'm all for it. It's easier, cleaner, and makes a lot of sense! Only problem may be if some new arch decide not/forget to call sched_tick_suspend() and resume(). Perhaps I can add some checking in place, to make sure that this can't happen (in debug builds only, of course). > > @@ -756,6 +756,7 @@ static void mwait_idle(void) > > local_irq_enable(); > > sched_tick_resume(); > > cpufreq_dbs_timer_resume(); > > + rcu_idle_timer_stop(); > > Indentation looks odd here, but I can't exclude this being an effect > produced > by my mail web frontend. > Yep, it was indeed wrong. Fixed. Sorry. Thanks. :-) > > --- a/xen/common/rcupdate.c > > +++ b/xen/common/rcupdate.c > > @@ -84,8 +84,14 @@ struct rcu_data { > > > > +#define RCU_IDLE_TIMER_PERIOD MILLISECS(10) > > If I'm not mistaken someone else had already commented on this: If > this > is (mostly) arbitrary, please say so in a comment. > Indeed there is being a long 'debate'. :-) For v2, I'm changing this to something more dynamic. And I'll sure add a comment explaining the adopted solution. > > @@ -402,7 +408,48 @@ int rcu_needs_cpu(int cpu) > > { > > struct rcu_data *rdp = &per_cpu(rcu_data, cpu); > > > > > - return (!!rdp->curlist || rcu_pending(cpu)); > > + return (!!rdp->curlist || rcu_pending(cpu)) && !rdp- > > >idle_timer_active; > > Please take the opportunity and drop the pointless !! here (unless > it's needed > for better matching up with the Linux original). > It's in the Linux code, indeed: http://elixir.free-electrons.com/linux/v2.6.21/source/kernel/rcupdate.c#L517 However, this very line is already different (because Linux has rdp_bh->curlist, which we don't, and we have rdp->idle_timer_active). So I guess I can drop '!!'. Any patch touching this line coming from Linux will need manual adjustment anyway. 
> > +/* > > + * Timer for making sure the CPU where a callback is queued does > > + * periodically poke rcu_pedning(), so that it will invoke the > > callback > > + * not too late after the end of the grace period. > > + */ > > +void rcu_idle_timer_start() > > +{ > > + struct rcu_data *rdp = &this_cpu(rcu_data); > > + > > + if (likely(!rdp->curlist)) > > + return; > > I would have expected this to be the inverse of the original > condition in > rcu_needs_cpu() - why is there no rcu_pending() invocation here? > The original rcu_needs_cpu() condition is: rcu->curlist || rcu_pending(cpu) So, you're saying we need something like this: if (!rdp->curlist && !rcu_pending(cpu)) return; Well, if I do this, what happens is that the if evaluate to false (and hence we don't exit the function, and we arm the timer), on CPUs that: - does not have a calback queued (curlist == NULL) - are in the process of quiescing, by going idle. In fact, in that case, here's what happens (I compacted the trace, and slightly edited some of the lines, for better readability, and for keeping the highlighting the characteristics of the situation under investigation): * complete_domain_destroy callback is queued on CPU 0. The CPU quiesces (goes through softirq), but stay budy, so no timer is armed (that's not super relevant, but is what happens in this case): x-|- d0v12 rcu_call fn=complete_domain_destroy x-|- d0v12 rcu_pending? yes (ret=2): no pending entries but new entries x-|- d0v12 raise_softirq nr 3 x-|- d0v12 rcu_process_callbacks, rdp_curlist: null, rdp_nxtlist: yes x-|- d0v12 do_softirq x-|- d0v12 rcu_pending? yes (ret=5): waiting for CPU to quiesce (rdp_qs_pending=1) x-|- d0v12 raise_softirq nr 3 x-|- d0v12 rcu_process_callbacks, rdp_curlist: yes, rdp_nxtlist: null x-|- d0v12 rcu_check_quiesc_state, rdp_qs_pending: yes x-|- d0v12 rcu_cpu_quiet, rcp_cpumask=0x00004100 x-|- d0v12 rcu_pending? no * the vCPU running on CPU 2, which participates in the grace period, blocks, and CPU 2 goes idle. That means that CPU 2 quiesces (goes through softirq), and we can forget about it. Surely we don't need the timer to fire on it, as the callback is not queued there... |-x- d0v7 vcpu_block d0v7 |-x- d0v7 raise_softirq nr 1 |-x- d0v7 do_softirq |-x- d0v7 rcu_pending? yes (ret=4): waiting for CPU to quiesce (rdp_qs_pending=0) |-x- d0v7 raise_softirq nr 3 |-x- d0v7 softirq_handler nr 1 |-x- d0v7 runstate_change d0v7 running->blocked |-x- d?v? runstate_change d32767v14 runnable->running Now, this is cpufreq_dbs_timer_suspend(): |-x- d32767v14 timer_stop t=do_dbs_timer And this is sched_tick_suspend(), which now calls rcu_idle_timer_start(), gated by the condition suggested above. |-x- d32767v14 rcu_pending? yes (ret=4): waiting for CPU to quiesce (rdp_qs_pending=0) |-x- d32767v14 timer_set t=rcu_idle_timer_handler, expires_in=9787us So, the CPU is actually quiescing, and since it has no callback queues, as we said, we wouldn't have needed the timer. But the condition is false, because, at this stage, rcu_pending() was still true. So, we don't bail early from rcu_idle_timer_start(), and we actually have armed the timer. |-x- d32767v14 rcu_pending? 
yes (ret=4): waiting for CPU to quiesce (rdp_qs_pending=0) |-x- d32767v14 raise_softirq nr 3 |-x- d32767v14 rcu_process_callbacks, rdp_curlist: null, rdp_nxtlist: null |-x- d32767v14 rcu_check_quiesc_state, rdp_qs_pending: no |-x- d32767v14 rcu_process_callbacks, rdp_curlist: null, rdp_nxtlist: null |-x- d32767v14 rcu_check_quiesc_state, rdp_qs_pending: yes |-x- d32767v14 rcu_cpu_quiet, rcp_cpumask=0x00000100 |-x- d32767v14 pm_idle_start c2 * The timer fires, for no useful reason, and cause a spurious wakeup: |-x- d32767v14 raise_softirq nr 0 |-x- d32767v14 pm_idle_end c2 |-x- d32767v14 irq_enter |-x- d32767v14 irq_direct, vec=0xfa, handler=apic_timer_interrupt |-x- d32767v14 raise_softirq nr 0 |-x- d32767v14 irq_exit, in_irq = 0 |-x- d32767v14 softirq_handler nr 0 |-x- d32767v14 timer_exec t=rcu_idle_timer_handler |-x- d32767v14 timer_reprogr deadline=954.566us |-x- d32767v14 rcu_pending? no |-x- d32767v14 pm_idle_start c3 So, this is why I don't want rcu_pending() in that if. If we don't like this, I can see about moving around a bit the timer starting and stopping helpers (I've a couple of ideas in mind already, but I need to try). Actually, it's entirely possible that it is having rcu_pending(cpu) in rcu_needs_cpu() is, for us, redundant. In fact, although it does make sense in Linux, both code inspection and some investigation I've just done, makes me believe that there won't be cases where a CPU is denied going offline because it sees rcu_pending() returning 1. In fact, when we call rcu_pending(), within cpu_is_haltable(), we have already gone through it before. And if there were pending work, we've raised the softirq and dealt with it. If there weren't, neither there is now. I'm therefore leaning toward removing rcu_pending() from the rcu_needs_cpu() check as well. At that point, we'll indeed have the check inside rcu_start_idle_timer(), be the opposite of the original check in rcu_needs_cpu(). :-) > > @@ -451,6 +500,7 @@ static void rcu_init_percpu_data(int cpu, > > struct rcu_ctrlblk *rcp, > > rdp->qs_pending = 0; > > rdp->cpu = cpu; > > rdp->blimit = blimit; > > + init_timer(&rdp->idle_timer, rcu_idle_timer_handler, (void*) > > rdp, cpu); > > Again, unless it is this bogus way in the Linux original, please drop > the > pointless cast, or at least correct its style. > Ah, no, this one, I can kill it. Thanks and Regards, Dario
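For reference, a rough sketch of the sched_tick_{suspend,resume}() integration suggested by Jan and agreed to above, i.e. moving the new calls out of the arch-specific idle loops and into the common helpers in xen/common/schedule.c. The existing per-scheduler work is elided and only the placement of the RCU calls is shown; this is a guess at the direction of v2, not the actual follow-up patch.

void sched_tick_suspend(void)
{
    /* ... existing per-scheduler tick_suspend handling ... */

    /* Arm the idle timer only if this CPU has callbacks queued. */
    rcu_idle_timer_start();
}

void sched_tick_resume(void)
{
    /* Cancel the timer as soon as the CPU leaves the idle loop. */
    rcu_idle_timer_stop();

    /* ... existing per-scheduler tick_resume handling ... */
}

With this shape, the ordering with respect to cpufreq_dbs_timer_suspend()/resume() in the x86 idle drivers changes slightly compared to the posted patch, which is fine given that the ordering was said above to be arbitrary.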
On Wed, 2017-08-09 at 19:34 +0200, Dario Faggioli wrote: > On Mon, 2017-08-07 at 02:54 -0600, Jan Beulich wrote: > > > > > Dario Faggioli <dario.faggioli@citrix.com> 07/27/17 10:01 AM > > > +/* > > > + * Timer for making sure the CPU where a callback is queued does > > > + * periodically poke rcu_pedning(), so that it will invoke the > > > callback > > > + * not too late after the end of the grace period. > > > + */ > > > +void rcu_idle_timer_start() > > > +{ > > > + struct rcu_data *rdp = &this_cpu(rcu_data); > > > + > > > + if (likely(!rdp->curlist)) > > > + return; > > > > I would have expected this to be the inverse of the original > > condition in > > rcu_needs_cpu() - why is there no rcu_pending() invocation here? > > > > [...] > > Actually, it's entirely possible that it is having rcu_pending(cpu) > in > rcu_needs_cpu() is, for us, redundant. In fact, although it does make > sense in Linux, both code inspection and some investigation I've just > done, makes me believe that there won't be cases where a CPU is > denied > going offline because it sees rcu_pending() returning 1. > > In fact, when we call rcu_pending(), within cpu_is_haltable(), we > have > already gone through it before. And if there were pending work, we've > raised the softirq and dealt with it. If there weren't, neither there > is now. > > I'm therefore leaning toward removing rcu_pending() from the > rcu_needs_cpu() check as well. At that point, we'll indeed have the > check inside rcu_start_idle_timer(), be the opposite of the original > check in rcu_needs_cpu(). :-) > FTR, I'm not so sure of this last thing any longer. I mean, the analysis I provided is still correct, but I'm investigating the other possible race existing in the code that Tim has hinted at in his mail, and I think it could be useful to have rcu_pending() checked in here, to solve/avoid that one. It's also possible that I'll actually remove it from rcu_needs_cpu(), but to move it somewhere else... As I said, I'm still looking into the problem. Regards, Dario
diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c index 666b7ef..01da96e 100644 --- a/xen/arch/arm/domain.c +++ b/xen/arch/arm/domain.c @@ -43,8 +43,9 @@ static void do_idle(void) { unsigned int cpu = smp_processor_id(); + rcu_idle_timer_start(); sched_tick_suspend(); - /* sched_tick_suspend() can raise TIMER_SOFTIRQ. Process it now. */ + /* Timer related operations can raise TIMER_SOFTIRQ. Process it now. */ process_pending_softirqs(); local_irq_disable(); @@ -58,6 +59,7 @@ static void do_idle(void) local_irq_enable(); sched_tick_resume(); + rcu_idle_timer_stop(); } void idle_loop(void) diff --git a/xen/arch/x86/acpi/cpu_idle.c b/xen/arch/x86/acpi/cpu_idle.c index 04c52e8..b97986f 100644 --- a/xen/arch/x86/acpi/cpu_idle.c +++ b/xen/arch/x86/acpi/cpu_idle.c @@ -576,10 +576,10 @@ static void acpi_processor_idle(void) return; } + rcu_idle_timer_start(); cpufreq_dbs_timer_suspend(); - sched_tick_suspend(); - /* sched_tick_suspend() can raise TIMER_SOFTIRQ. Process it now. */ + /* Timer related operations can raise TIMER_SOFTIRQ. Process it now. */ process_pending_softirqs(); /* @@ -593,6 +593,7 @@ static void acpi_processor_idle(void) local_irq_enable(); sched_tick_resume(); cpufreq_dbs_timer_resume(); + rcu_idle_timer_stop(); return; } @@ -726,6 +727,7 @@ static void acpi_processor_idle(void) sched_tick_resume(); cpufreq_dbs_timer_resume(); + rcu_idle_timer_stop(); if ( cpuidle_current_governor->reflect ) cpuidle_current_governor->reflect(power); diff --git a/xen/arch/x86/cpu/mwait-idle.c b/xen/arch/x86/cpu/mwait-idle.c index ae9e92b..c426e41 100644 --- a/xen/arch/x86/cpu/mwait-idle.c +++ b/xen/arch/x86/cpu/mwait-idle.c @@ -743,10 +743,10 @@ static void mwait_idle(void) return; } + rcu_idle_timer_start(); cpufreq_dbs_timer_suspend(); - sched_tick_suspend(); - /* sched_tick_suspend() can raise TIMER_SOFTIRQ. Process it now. */ + /* Timer related operations can raise TIMER_SOFTIRQ. Process it now. */ process_pending_softirqs(); /* Interrupts must be disabled for C2 and higher transitions. */ @@ -756,6 +756,7 @@ static void mwait_idle(void) local_irq_enable(); sched_tick_resume(); cpufreq_dbs_timer_resume(); + rcu_idle_timer_stop(); return; } @@ -802,6 +803,7 @@ static void mwait_idle(void) sched_tick_resume(); cpufreq_dbs_timer_resume(); + rcu_idle_timer_stop(); if ( cpuidle_current_governor->reflect ) cpuidle_current_governor->reflect(power); diff --git a/xen/common/rcupdate.c b/xen/common/rcupdate.c index f0fdc87..4586f2a 100644 --- a/xen/common/rcupdate.c +++ b/xen/common/rcupdate.c @@ -84,8 +84,14 @@ struct rcu_data { int cpu; struct rcu_head barrier; long last_rs_qlen; /* qlen during the last resched */ + + /* 3) idle CPUs handling */ + struct timer idle_timer; + bool idle_timer_active; }; +#define RCU_IDLE_TIMER_PERIOD MILLISECS(10) + static DEFINE_PER_CPU(struct rcu_data, rcu_data); static int blimit = 10; @@ -402,7 +408,48 @@ int rcu_needs_cpu(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - return (!!rdp->curlist || rcu_pending(cpu)); + return (!!rdp->curlist || rcu_pending(cpu)) && !rdp->idle_timer_active; +} + +/* + * Timer for making sure the CPU where a callback is queued does + * periodically poke rcu_pedning(), so that it will invoke the callback + * not too late after the end of the grace period. 
+ */ +void rcu_idle_timer_start() +{ + struct rcu_data *rdp = &this_cpu(rcu_data); + + if (likely(!rdp->curlist)) + return; + + set_timer(&rdp->idle_timer, NOW() + RCU_IDLE_TIMER_PERIOD); + rdp->idle_timer_active = true; +} + +void rcu_idle_timer_stop() +{ + struct rcu_data *rdp = &this_cpu(rcu_data); + + if (likely(!rdp->idle_timer_active)) + return; + + rdp->idle_timer_active = false; + stop_timer(&rdp->idle_timer); +} + +static void rcu_idle_timer_handler(void* data) +{ + /* + * Nothing, really... And in fact, we don't expect to ever get in here, + * as rcu_idle_timer_stop(), called while waking from idle, prevent that + * to happen by stopping the timer before the TIMER_SOFTIRQ handler has + * a chance to run. + * + * But that's fine, because all we want is the CPU that needs to execute + * the callback to be periodically woken up and check rcu_pending(). + */ + ASSERT_UNREACHABLE(); } void rcu_check_callbacks(int cpu) @@ -423,6 +470,8 @@ static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, static void rcu_offline_cpu(struct rcu_data *this_rdp, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { + kill_timer(&rdp->idle_timer); + /* If the cpu going offline owns the grace period we can block * indefinitely waiting for it, so flush it here. */ @@ -451,6 +500,7 @@ static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, rdp->qs_pending = 0; rdp->cpu = cpu; rdp->blimit = blimit; + init_timer(&rdp->idle_timer, rcu_idle_timer_handler, (void*) rdp, cpu); } static int cpu_callback( diff --git a/xen/include/xen/rcupdate.h b/xen/include/xen/rcupdate.h index 561ac43..3402eb5 100644 --- a/xen/include/xen/rcupdate.h +++ b/xen/include/xen/rcupdate.h @@ -149,4 +149,7 @@ int rcu_barrier(void); void rcu_idle_enter(unsigned int cpu); void rcu_idle_exit(unsigned int cpu); +void rcu_idle_timer_start(void); +void rcu_idle_timer_stop(void); + #endif /* __XEN_RCUPDATE_H */
Instead of having the CPU where a callback is queued, busy looping on rcu_pending(), use a timer. In fact, we let the CPU go idla,e but we program a timer that will periodically wake it up, for checking whether the grace period has actually ended. It is kind of similar to introducing a periodic tick, but with a much more limited scope, and a lot less overhead. In fact, this timer is: - only active for the CPU(s) that have callbacks queued, waiting for the end of a grace period; - only active when those CPU(s) are idle (and stopped as soon as they resume execution). Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com> --- Cc: Stefano Stabellini <sstabellini@kernel.org> Cc: Julien Grall <julien.grall@arm.com> Cc: Jan Beulich <jbeulich@suse.com> Cc: Andrew Cooper <andrew.cooper3@citrix.com> --- xen/arch/arm/domain.c | 4 ++- xen/arch/x86/acpi/cpu_idle.c | 6 +++-- xen/arch/x86/cpu/mwait-idle.c | 6 +++-- xen/common/rcupdate.c | 52 ++++++++++++++++++++++++++++++++++++++++- xen/include/xen/rcupdate.h | 3 ++ 5 files changed, 65 insertions(+), 6 deletions(-)