diff mbox

NOHZ: WARNING: at arch/x86/kernel/smp.c:123 native_smp_send_reschedule, round 2

Message ID 20130520132355.GF12690@pd.tnic (mailing list archive)
State Not Applicable, archived
Headers show

Commit Message

Borislav Petkov May 20, 2013, 1:23 p.m. UTC
On Mon, May 20, 2013 at 05:24:05PM +0800, Michael Wang wrote:
> >> diff --git a/drivers/cpufreq/cpufreq_governor.c
> >> b/drivers/cpufreq/cpufreq_governor.c
> >> index 443442d..449be88 100644
> >> --- a/drivers/cpufreq/cpufreq_governor.c
> >> +++ b/drivers/cpufreq/cpufreq_governor.c
> >> @@ -26,6 +26,7 @@
> >>  #include <linux/tick.h>
> >>  #include <linux/types.h>
> >>  #include <linux/workqueue.h>
> >> +#include <linux/cpu.h>
> >>
> >>  #include "cpufreq_governor.h"
> >>
> >> @@ -180,8 +181,10 @@ void gov_queue_work(struct dbs_data *dbs_data,
> >> struct cpufreq_policy *policy,
> >>         if (!all_cpus) {
> >>                 __gov_queue_work(smp_processor_id(), dbs_data, delay);
> >>         } else {
> >> +               get_online_cpus();
> >>                 for_each_cpu(i, policy->cpus)
> >>                         __gov_queue_work(i, dbs_data, delay);
> >> +               put_online_cpus();
> >>         }
> >>  }
> >>  EXPORT_SYMBOL_GPL(gov_queue_work);
> >>
> >> This is supposed to make WARN disappear, if it works, then BINGO :)
> > 
> > Let people test it and then we can talk :)
> 
> Agree :)
> 
> Borislav, would you like to take a try?
> 
> If this fix cause other troubles, you could try get_cpu() or disable irq
> also.

I just confirmed that policy->cpus contains offlined cores with this:

--


[   94.386340] EXT4-fs (sda7): re-mounted. Opts: (null)
[   96.520362] kvm: exiting hardware virtualization
[   96.637687] ACPI: Preparing to enter system sleep state S5
[   96.643506] Disabling non-boot CPUs ...
[   96.855499] ------------[ cut here ]------------
[   96.860172] WARNING: at drivers/cpufreq/cpufreq_governor.c:172 gov_queue_work+0xf0/0x110()
[   96.868501] Modules linked in: ext2 vfat fat loop usbhid snd_hda_codec_hdmi coretemp kvm_intel kvm snd_hda_codec_realtek snd_hda_intel snd_hda_codec ehci_pci xhci_hcd ehci_hcd usbcore crc32_pclmul crc32c_intel snd_hwdep snd_pcm snd_page_alloc snd_timer ghash_clmulni_intel snd aesni_intel aes_x86_64 glue_helper sb_edac edac_core acpi_cpufreq mperf pcspkr lrw gf128mul ablk_helper cryptd iTCO_wdt iTCO_vendor_support evdev soundcore lpc_ich mfd_core processor dcdbas i2c_i801 usb_common button microcode
[   96.914238] CPU: 0 PID: 315 Comm: kworker/1:2 Tainted: G        W    3.10.0-rc1+ #2
[   96.921969] Hardware name: Dell Inc. Precision T3600/0PTTT9, BIOS A08 01/24/2013
[   96.929424] Workqueue: events od_dbs_timer
[   96.933574]  0000000000000009 ffff88043a08bc78 ffffffff8161445c ffff88043a08bcb8
[   96.941085]  ffffffff8103e540 ffff88043b712a80 0000000000000001 ffff88043a296400
[   96.948602]  ffff88043b712a80 ffffffff81cdc910 0000000000000001 ffff88043a08bcc8
[   96.956123] Call Trace:
[   96.958602]  [<ffffffff8161445c>] dump_stack+0x19/0x1b
[   96.963801]  [<ffffffff8103e540>] warn_slowpath_common+0x70/0xa0
[   96.969858]  [<ffffffff8103e58a>] warn_slowpath_null+0x1a/0x20
[   96.975735]  [<ffffffff814f6bf0>] gov_queue_work+0xf0/0x110
[   96.981359]  [<ffffffff814f60bb>] od_dbs_timer+0xcb/0x170
[   96.986808]  [<ffffffff8105e75d>] process_one_work+0x1fd/0x540
[   96.992691]  [<ffffffff8105e6f2>] ? process_one_work+0x192/0x540
[   96.998756]  [<ffffffff8105ef22>] worker_thread+0x122/0x380
[   97.004371]  [<ffffffff8105ee00>] ? rescuer_thread+0x320/0x320
[   97.010256]  [<ffffffff8106634a>] kthread+0xea/0xf0
[   97.015185]  [<ffffffff81066260>] ? flush_kthread_worker+0x150/0x150
[   97.021605]  [<ffffffff81623d9c>] ret_from_fork+0x7c/0xb0
[   97.027049]  [<ffffffff81066260>] ? flush_kthread_worker+0x150/0x150
[   97.033457] ---[ end trace d36d91c626ac81a0 ]---
[   97.039221] ------------[ cut here ]------------
[   97.039227] ------------[ cut here ]------------
[   97.039229] WARNING: at drivers/cpufreq/cpufreq_governor.c:172 gov_queue_work+0xf0/0x110()
[   97.039243] Modules linked in: ext2 vfat fat loop usbhid snd_hda_codec_hdmi coretemp kvm_intel kvm snd_hda_codec_realtek snd_hda_intel snd_hda_codec ehci_pci xhci_hcd ehci_hcd usbcore crc32_pclmul crc32c_intel snd_hwdep snd_pcm snd_page_alloc snd_timer ghash_clmulni_intel snd aesni_intel aes_x86_64 glue_helper sb_edac edac_core acpi_cpufreq mperf pcspkr lrw gf128mul ablk_helper cryptd iTCO_wdt iTCO_vendor_support evdev soundcore lpc_ich mfd_core processor dcdbas i2c_i801 usb_common button microcode
[   97.039245] CPU: 4 PID: 82 Comm: kworker/2:1 Tainted: G        W    3.10.0-rc1+ #2
[   97.039245] Hardware name: Dell Inc. Precision T3600/0PTTT9, BIOS A08 01/24/2013
[   97.039248] Workqueue: events od_dbs_timer
[   97.039250]  0000000000000009 ffff88043b5cfc78 ffffffff8161445c ffff88043b5cfcb8
[   97.039251]  ffffffff8103e540 ffff88043b712a80 0000000000000002 ffff88043a295e00
[   97.039253]  ffff88043b712a80 ffffffff81cdc910 0000000000000002 ffff88043b5cfcc8
[   97.039253] Call Trace:
[   97.039255]  [<ffffffff8161445c>] dump_stack+0x19/0x1b
[   97.039257]  [<ffffffff8103e540>] warn_slowpath_common+0x70/0xa0
[   97.039258]  [<ffffffff8103e58a>] warn_slowpath_null+0x1a/0x20
[   97.039259]  [<ffffffff814f6bf0>] gov_queue_work+0xf0/0x110
[   97.039261]  [<ffffffff814f60bb>] od_dbs_timer+0xcb/0x170
[   97.039263]  [<ffffffff8105e75d>] process_one_work+0x1fd/0x540
[   97.039264]  [<ffffffff8105e6f2>] ? process_one_work+0x192/0x540
[   97.039265]  [<ffffffff8105ef22>] worker_thread+0x122/0x380
[   97.039267]  [<ffffffff8105ee00>] ? rescuer_thread+0x320/0x320
[   97.039268]  [<ffffffff8106634a>] kthread+0xea/0xf0
[   97.039269]  [<ffffffff81066260>] ? flush_kthread_worker+0x150/0x150
[   97.039270]  [<ffffffff81623d9c>] ret_from_fork+0x7c/0xb0
[   97.039272]  [<ffffffff81066260>] ? flush_kthread_worker+0x150/0x150
[   97.039272] ---[ end trace d36d91c626ac81a1 ]---
[   97.143214] nouveau E[     DRM] GPU lockup - switching to software fbcon
[   97.318430] WARNING: at drivers/cpufreq/cpufreq_governor.c:172 gov_queue_work+0xf0/0x110()
[   97.326804] Modules linked in: ext2 vfat fat loop usbhid snd_hda_codec_hdmi coretemp kvm_intel kvm snd_hda_codec_realtek snd_hda_intel snd_hda_codec ehci_pci xhci_hcd ehci_hcd usbcore crc32_pclmul crc32c_intel snd_hwdep snd_pcm snd_page_alloc snd_timer ghash_clmulni_intel snd aesni_intel aes_x86_64 glue_helper sb_edac edac_core acpi_cpufreq mperf pcspkr lrw gf128mul ablk_helper cryptd iTCO_wdt iTCO_vendor_support evdev soundcore lpc_ich mfd_core processor dcdbas i2c_i801 usb_common button microcode
[   97.374578] CPU: 0 PID: 98 Comm: kworker/3:1 Tainted: G        W    3.10.0-rc1+ #2
[   97.384154] Hardware name: Dell Inc. Precision T3600/0PTTT9, BIOS A08 01/24/2013
[   97.393566] Workqueue: events od_dbs_timer
[   97.399675]  0000000000000009 ffff88043b179c78 ffffffff8161445c ffff88043b179cb8
[   97.409153]  ffffffff8103e540 ffff88043b712a80 0000000000000003 ffff88043a295a00
[   97.418623]  ffff88043b712a80 ffffffff81cdc910 0000000000000003 ffff88043b179cc8
[   97.428103] Call Trace:
[   97.432520]  [<ffffffff8161445c>] dump_stack+0x19/0x1b
[   97.439678]  [<ffffffff8103e540>] warn_slowpath_common+0x70/0xa0
[   97.447694]  [<ffffffff8103e58a>] warn_slowpath_null+0x1a/0x20
[   97.455512]  [<ffffffff814f6bf0>] gov_queue_work+0xf0/0x110
[   97.462993]  [<ffffffff814f60bb>] od_dbs_timer+0xcb/0x170
[   97.470259]  [<ffffffff8105e75d>] process_one_work+0x1fd/0x540
[   97.477878]  [<ffffffff8105e6f2>] ? process_one_work+0x192/0x540
[   97.485652]  [<ffffffff8105ef22>] worker_thread+0x122/0x380
[   97.492969]  [<ffffffff8105ee00>] ? rescuer_thread+0x320/0x320
[   97.500565]  [<ffffffff8106634a>] kthread+0xea/0xf0
[   97.507167]  [<ffffffff81066260>] ? flush_kthread_worker+0x150/0x150
[   97.515255]  [<ffffffff81623d9c>] ret_from_fork+0x7c/0xb0
[   97.522389]  [<ffffffff81066260>] ? flush_kthread_worker+0x150/0x150
[   97.530472] ---[ end trace d36d91c626ac81a2 ]---
[   97.543176] ------------[ cut here ]------------
[   97.547172] ------------[ cut here ]------------
[   97.547178] WARNING: at drivers/cpufreq/cpufreq_governor.c:172 gov_queue_work+0xf0/0x110()
[   97.547197] Modules linked in: ext2 vfat fat loop usbhid snd_hda_codec_hdmi coretemp kvm_intel kvm snd_hda_codec_realtek snd_hda_intel snd_hda_codec ehci_pci xhci_hcd ehci_hcd usbcore crc32_pclmul crc32c_intel snd_hwdep snd_pcm snd_page_alloc snd_timer ghash_clmulni_intel snd aesni_intel aes_x86_64 glue_helper sb_edac edac_core acpi_cpufreq mperf pcspkr lrw gf128mul ablk_helper cryptd iTCO_wdt iTCO_vendor_support evdev soundcore lpc_ich mfd_core processor dcdbas i2c_i801 usb_common button microcode
[   97.547199] CPU: 7 PID: 316 Comm: kworker/5:1 Tainted: G        W    3.10.0-rc1+ #2
[   97.547200] Hardware name: Dell Inc. Precision T3600/0PTTT9, BIOS A08 01/24/2013
[   97.547202] Workqueue: events od_dbs_timer
[   97.547204]  0000000000000009 ffff88043905dc78 ffffffff8161445c ffff88043905dcb8
[   97.547205]  ffffffff8103e540 ffff88043b712a80 0000000000000005 ffff88043a295800
[   97.547206]  ffff88043b712a80 ffffffff81cdc910 0000000000000005 ffff88043905dcc8
[   97.547207] Call Trace:
[   97.547211]  [<ffffffff8161445c>] dump_stack+0x19/0x1b
[   97.547214]  [<ffffffff8103e540>] warn_slowpath_common+0x70/0xa0
[   97.547215]  [<ffffffff8103e58a>] warn_slowpath_null+0x1a/0x20
[   97.547216]  [<ffffffff814f6bf0>] gov_queue_work+0xf0/0x110
[   97.547218]  [<ffffffff814f60bb>] od_dbs_timer+0xcb/0x170
[   97.547220]  [<ffffffff8105e75d>] process_one_work+0x1fd/0x540
[   97.547221]  [<ffffffff8105e6f2>] ? process_one_work+0x192/0x540
[   97.547222]  [<ffffffff8105ef22>] worker_thread+0x122/0x380
[   97.547224]  [<ffffffff8105ee00>] ? rescuer_thread+0x320/0x320
[   97.547225]  [<ffffffff8106634a>] kthread+0xea/0xf0
[   97.547226]  [<ffffffff81066260>] ? flush_kthread_worker+0x150/0x150
[   97.547228]  [<ffffffff81623d9c>] ret_from_fork+0x7c/0xb0
[   97.547229]  [<ffffffff81066260>] ? flush_kthread_worker+0x150/0x150
[   97.547230] ---[ end trace d36d91c626ac81a3 ]---
[   97.761326] WARNING: at drivers/cpufreq/cpufreq_governor.c:172 gov_queue_work+0xf0/0x110()
[   97.770798] Modules linked in: ext2 vfat fat loop usbhid snd_hda_codec_hdmi coretemp kvm_intel kvm snd_hda_codec_realtek snd_hda_intel snd_hda_codec ehci_pci xhci_hcd ehci_hcd usbcore crc32_pclmul crc32c_intel snd_hwdep snd_pcm snd_page_alloc snd_timer ghash_clmulni_intel snd aesni_intel aes_x86_64 glue_helper sb_edac edac_core acpi_cpufreq mperf pcspkr lrw gf128mul ablk_helper cryptd iTCO_wdt iTCO_vendor_support evdev soundcore lpc_ich mfd_core processor dcdbas i2c_i801 usb_common button microcode
[   97.819617] CPU: 0 PID: 253 Comm: kworker/4:1 Tainted: G        W    3.10.0-rc1+ #2
[   97.828623] Hardware name: Dell Inc. Precision T3600/0PTTT9, BIOS A08 01/24/2013
[   97.837372] Workqueue: events od_dbs_timer
[   97.842805]  0000000000000009 ffff880439529c78 ffffffff8161445c ffff880439529cb8
[   97.851628]  ffffffff8103e540 ffff88043b712a80 0000000000000004 ffff88043a295c00
[   97.860445]  ffff88043b712a80 ffffffff81cdc910 0000000000000004 ffff880439529cc8
[   97.869249] Call Trace:
[   97.873041]  [<ffffffff8161445c>] dump_stack+0x19/0x1b
[   97.879533]  [<ffffffff8103e540>] warn_slowpath_common+0x70/0xa0
[   97.886912]  [<ffffffff8103e58a>] warn_slowpath_null+0x1a/0x20
[   97.894100]  [<ffffffff814f6bf0>] gov_queue_work+0xf0/0x110
[   97.901002]  [<ffffffff814f60bb>] od_dbs_timer+0xcb/0x170
[   97.907706]  [<ffffffff8105e75d>] process_one_work+0x1fd/0x540
[   97.914797]  [<ffffffff8105e6f2>] ? process_one_work+0x192/0x540
[   97.922016]  [<ffffffff8105ef22>] worker_thread+0x122/0x380
[   97.928803]  [<ffffffff8105ee00>] ? rescuer_thread+0x320/0x320
[   97.935837]  [<ffffffff8106634a>] kthread+0xea/0xf0
[   97.941900]  [<ffffffff81066260>] ? flush_kthread_worker+0x150/0x150
[   97.949443]  [<ffffffff81623d9c>] ret_from_fork+0x7c/0xb0
[   97.956027]  [<ffffffff81066260>] ? flush_kthread_worker+0x150/0x150
[   97.963563] ---[ end trace d36d91c626ac81a4 ]---
[   97.970449] ------------[ cut here ]------------
[   97.976277] WARNING: at drivers/cpufreq/cpufreq_governor.c:172 gov_queue_work+0xf0/0x110()
[   97.985762] Modules linked in: ext2 vfat fat loop usbhid snd_hda_codec_hdmi coretemp kvm_intel kvm snd_hda_codec_realtek snd_hda_intel snd_hda_codec ehci_pci xhci_hcd ehci_hcd usbcore crc32_pclmul crc32c_intel snd_hwdep snd_pcm snd_page_alloc snd_timer ghash_clmulni_intel snd aesni_intel aes_x86_64 glue_helper sb_edac edac_core acpi_cpufreq mperf pcspkr lrw gf128mul ablk_helper cryptd iTCO_wdt iTCO_vendor_support evdev soundcore lpc_ich mfd_core processor dcdbas i2c_i801 usb_common button microcode
[   98.035051] CPU: 0 PID: 102 Comm: kworker/6:1 Tainted: G        W    3.10.0-rc1+ #2
[   98.044067] Hardware name: Dell Inc. Precision T3600/0PTTT9, BIOS A08 01/24/2013
[   98.052834] Workqueue: events od_dbs_timer
[   98.058285]  0000000000000009 ffff88043b6f3c78 ffffffff8161445c ffff88043b6f3cb8
[   98.067114]  ffffffff8103e540 ffff88043b712a80 0000000000000006 ffff88043a295600
[   98.075924]  ffff88043b712a80 ffffffff81cdc910 0000000000000006 ffff88043b6f3cc8
[   98.084735] Call Trace:
[   98.088518]  [<ffffffff8161445c>] dump_stack+0x19/0x1b
[   98.095024]  [<ffffffff8103e540>] warn_slowpath_common+0x70/0xa0
[   98.102386]  [<ffffffff8103e58a>] warn_slowpath_null+0x1a/0x20
[   98.109565]  [<ffffffff814f6bf0>] gov_queue_work+0xf0/0x110
[   98.116502]  [<ffffffff814f60bb>] od_dbs_timer+0xcb/0x170
[   98.123253]  [<ffffffff8105e75d>] process_one_work+0x1fd/0x540
[   98.130394]  [<ffffffff8105e6f2>] ? process_one_work+0x192/0x540
[   98.137667]  [<ffffffff8105ef22>] worker_thread+0x122/0x380
[   98.144456]  [<ffffffff8105ee00>] ? rescuer_thread+0x320/0x320
[   98.151510]  [<ffffffff8106634a>] kthread+0xea/0xf0
[   98.157583]  [<ffffffff81066260>] ? flush_kthread_worker+0x150/0x150
[   98.165143]  [<ffffffff81623d9c>] ret_from_fork+0x7c/0xb0
[   98.171730]  [<ffffffff81066260>] ? flush_kthread_worker+0x150/0x150
[   98.179282] ---[ end trace d36d91c626ac81a5 ]---
[   98.185098] ------------[ cut here ]------------
[   98.190903] WARNING: at drivers/cpufreq/cpufreq_governor.c:172 gov_queue_work+0xf0/0x110()
[   98.200387] Modules linked in: ext2 vfat fat loop
[   98.205029] nouveau W[   PFIFO][0000:03:00.0] unknown intr 0x00400000, ch 1
[   98.214563]  usbhid snd_hda_codec_hdmi coretemp kvm_intel kvm snd_hda_codec_realtek snd_hda_intel snd_hda_codec ehci_pci xhci_hcd ehci_hcd usbcore crc32_pclmul crc32c_intel snd_hwdep snd_pcm snd_page_alloc snd_timer ghash_clmulni_intel snd aesni_intel aes_x86_64 glue_helper sb_edac edac_core acpi_cpufreq mperf pcspkr lrw gf128mul ablk_helper cryptd iTCO_wdt iTCO_vendor_support evdev soundcore lpc_ich mfd_core processor dcdbas i2c_i801 usb_common button microcode
[   98.258886] CPU: 0 PID: 318 Comm: kworker/7:1 Tainted: G        W    3.10.0-rc1+ #2
[   98.267919] Hardware name: Dell Inc. Precision T3600/0PTTT9, BIOS A08 01/24/2013
[   98.276689] Workqueue: events od_dbs_timer
[   98.282147]  0000000000000009 ffff88043969dc78 ffffffff8161445c ffff88043969dcb8
[   98.290991]  ffffffff8103e540 ffff88043b712a80 0000000000000007 ffff88043a295200
[   98.299832]  ffff88043b712a80 ffffffff81cdc910 0000000000000007 ffff88043969dcc8
[   98.308671] Call Trace:
[   98.312471]  [<ffffffff8161445c>] dump_stack+0x19/0x1b
[   98.318982]  [<ffffffff8103e540>] warn_slowpath_common+0x70/0xa0
[   98.326376]  [<ffffffff8103e58a>] warn_slowpath_null+0x1a/0x20
[   98.333577]  [<ffffffff814f6bf0>] gov_queue_work+0xf0/0x110
[   98.340482]  [<ffffffff814f60bb>] od_dbs_timer+0xcb/0x170
[   98.347160]  [<ffffffff8105e75d>] process_one_work+0x1fd/0x540
[   98.354232]  [<ffffffff8105e6f2>] ? process_one_work+0x192/0x540
[   98.361471]  [<ffffffff8105ef22>] worker_thread+0x122/0x380
[   98.368260]  [<ffffffff8105ee00>] ? rescuer_thread+0x320/0x320
[   98.375309]  [<ffffffff8106634a>] kthread+0xea/0xf0
[   98.381385]  [<ffffffff81066260>] ? flush_kthread_worker+0x150/0x150
[   98.388951]  [<ffffffff81623d9c>] ret_from_fork+0x7c/0xb0
[   98.395546]  [<ffffffff81066260>] ? flush_kthread_worker+0x150/0x150
[   98.403097] ---[ end trace d36d91c626ac81a6 ]---
[   98.409180] Power down.
[   98.413109] acpi_power_off called

Comments

Viresh Kumar May 20, 2013, 1:43 p.m. UTC | #1
On 20 May 2013 18:53, Borislav Petkov <bp@alien8.de> wrote:
> I just confirmed that policy->cpus contains offlined cores with this:
>
> diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
> index 5af40ad82d23..e8c25f71e9b6 100644
> --- a/drivers/cpufreq/cpufreq_governor.c
> +++ b/drivers/cpufreq/cpufreq_governor.c
> @@ -169,6 +169,9 @@ static inline void __gov_queue_work(int cpu, struct dbs_data *dbs_data,
>  {
>         struct cpu_dbs_common_info *cdbs = dbs_data->cdata->get_cpu_cdbs(cpu);
>
> +       if (WARN_ON(!cpu_online(cpu)))
> +               return;
> +
>         mod_delayed_work_on(cpu, system_wq, &cdbs->work, delay);
>  }

Hmm, so for sure there is some locking issue there.
Have you tried my patch? I am not sure if it will fix everything but may
fix it.

> see splats collection below.
>
> And I don't think your fix above addresses the issue for the simple
> reason that if cpus go offline *before* you do get_online_cpus(), then
> policy->cpus will already contain offlined cpus.
>
> Rather, a better fix would be, IMHO, to do this (it works here, of course):
>
> ---
> diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
> index 5af40ad82d23..58541b164494 100644
> --- a/drivers/cpufreq/cpufreq_governor.c
> +++ b/drivers/cpufreq/cpufreq_governor.c
> @@ -17,6 +17,7 @@
>  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
>
>  #include <asm/cputime.h>
> +#include <linux/cpu.h>
>  #include <linux/cpufreq.h>
>  #include <linux/cpumask.h>
>  #include <linux/export.h>
> @@ -169,7 +170,15 @@ static inline void __gov_queue_work(int cpu, struct dbs_data *dbs_data,
>  {
>         struct cpu_dbs_common_info *cdbs = dbs_data->cdata->get_cpu_cdbs(cpu);
>
> +       get_online_cpus();
> +
> +       if (!cpu_online(cpu))
> +               goto out;
> +
>         mod_delayed_work_on(cpu, system_wq, &cdbs->work, delay);
> +
> + out:
> +       put_online_cpus();
>  }
>
>  void gov_queue_work(struct dbs_data *dbs_data, struct cpufreq_policy *policy,

This looks fine, but I want to fix the locking rather than just
hiding the issue. :)
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Borislav Petkov May 20, 2013, 3:08 p.m. UTC | #2
On Mon, May 20, 2013 at 07:13:08PM +0530, Viresh Kumar wrote:
> Hmm, so for sure there is some locking issue there. Have you tried my
> patch?

No, not yet. Pretty busy ATM. Btw, you could try reproducing it too, in
the meantime - simply enable

CONFIG_NO_HZ_COMMON=y
# CONFIG_NO_HZ_IDLE is not set
CONFIG_NO_HZ_FULL=y
CONFIG_NO_HZ_FULL_ALL=y
CONFIG_NO_HZ=y
CONFIG_RCU_FAST_NO_HZ=y

and halt the box. I don't know whether !x86 boxen support NO_HZ_FULL yet
though.
Michael Wang May 21, 2013, 2:20 a.m. UTC | #3
On 05/20/2013 09:23 PM, Borislav Petkov wrote:
> On Mon, May 20, 2013 at 05:24:05PM +0800, Michael Wang wrote:
>>>> diff --git a/drivers/cpufreq/cpufreq_governor.c
>>>> b/drivers/cpufreq/cpufreq_governor.c
>>>> index 443442d..449be88 100644
>>>> --- a/drivers/cpufreq/cpufreq_governor.c
>>>> +++ b/drivers/cpufreq/cpufreq_governor.c
>>>> @@ -26,6 +26,7 @@
>>>>  #include <linux/tick.h>
>>>>  #include <linux/types.h>
>>>>  #include <linux/workqueue.h>
>>>> +#include <linux/cpu.h>
>>>>
>>>>  #include "cpufreq_governor.h"
>>>>
>>>> @@ -180,8 +181,10 @@ void gov_queue_work(struct dbs_data *dbs_data,
>>>> struct cpufreq_policy *policy,
>>>>         if (!all_cpus) {
>>>>                 __gov_queue_work(smp_processor_id(), dbs_data, delay);
>>>>         } else {
>>>> +               get_online_cpus();
>>>>                 for_each_cpu(i, policy->cpus)
>>>>                         __gov_queue_work(i, dbs_data, delay);
>>>> +               put_online_cpus();
>>>>         }
>>>>  }
>>>>  EXPORT_SYMBOL_GPL(gov_queue_work);
>>>>
>>>> This is supposed to make WARN disappear, if it works, then BINGO :)
>>>
>>> Let people test it and then we can talk :)
>>
>> Agree :)
>>
>> Borislav, would you like to take a try?
>>
>> If this fix cause other troubles, you could try get_cpu() or disable irq
>> also.
> 
> I just confirmed that policy->cpus contains offlined cores with this:
> 
> diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
> index 5af40ad82d23..e8c25f71e9b6 100644
> --- a/drivers/cpufreq/cpufreq_governor.c
> +++ b/drivers/cpufreq/cpufreq_governor.c
> @@ -169,6 +169,9 @@ static inline void __gov_queue_work(int cpu, struct dbs_data *dbs_data,
>  {
>         struct cpu_dbs_common_info *cdbs = dbs_data->cdata->get_cpu_cdbs(cpu);
> 
> +       if (WARN_ON(!cpu_online(cpu)))
> +               return;
> +

This is not enough to prove that policy->cpus is wrong: the cpu could be
online when it is read from policy->cpus but offline by the time it is
checked here, since hotplug can happen in between.

>         mod_delayed_work_on(cpu, system_wq, &cdbs->work, delay);
>  }
> 
> see splats collection below.
> 
> And I don't think your fix above addresses the issue for the simple
> reason that if cpus go offline *before* you do get_online_cpus(), then
> policy->cpus will already contain offlined cpus.

I don't get it...

get_online_cpus() just stops hotplug from happening after it is invoked,
so unless policy->cpus is really wrong, none of the cpus in its mask
will go offline any more.

> 
> Rather, a better fix would be, IMHO, to do this (it works here, of course):
> 
> ---
> diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
> index 5af40ad82d23..58541b164494 100644
> --- a/drivers/cpufreq/cpufreq_governor.c
> +++ b/drivers/cpufreq/cpufreq_governor.c
> @@ -17,6 +17,7 @@
>  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> 
>  #include <asm/cputime.h>
> +#include <linux/cpu.h>
>  #include <linux/cpufreq.h>
>  #include <linux/cpumask.h>
>  #include <linux/export.h>
> @@ -169,7 +170,15 @@ static inline void __gov_queue_work(int cpu, struct dbs_data *dbs_data,
>  {
>         struct cpu_dbs_common_info *cdbs = dbs_data->cdata->get_cpu_cdbs(cpu);
> 
> +       get_online_cpus();

This protects nothing... before we get here, the cpu could already be
offline, so nothing has changed...

If you really want to confirm that policy->cpus is wrong, the way to do
it is to apply the fix I suggested, then check cpu_online() here.

If hotplug cannot happen and we still get an offline cpu from
policy->cpus, then we can say it's wrong; otherwise we have proved nothing...

Regards,
Michael Wang

> +
> +       if (!cpu_online(cpu))
> +               goto out;
> +
>         mod_delayed_work_on(cpu, system_wq, &cdbs->work, delay);
> +
> + out:
> +       put_online_cpus();
>  }
> 
>  void gov_queue_work(struct dbs_data *dbs_data, struct cpufreq_policy *policy,
> --
> 
> 
> [   94.386340] EXT4-fs (sda7): re-mounted. Opts: (null)
> [   96.520362] kvm: exiting hardware virtualization
> [   96.637687] ACPI: Preparing to enter system sleep state S5
> [   96.643506] Disabling non-boot CPUs ...
> [   96.855499] ------------[ cut here ]------------
> [   96.860172] WARNING: at drivers/cpufreq/cpufreq_governor.c:172 gov_queue_work+0xf0/0x110()
> [   96.868501] Modules linked in: ext2 vfat fat loop usbhid snd_hda_codec_hdmi coretemp kvm_intel kvm snd_hda_codec_realtek snd_hda_intel snd_hda_codec ehci_pci xhci_hcd ehci_hcd usbcore crc32_pclmul crc32c_intel snd_hwdep snd_pcm snd_page_alloc snd_timer ghash_clmulni_intel snd aesni_intel aes_x86_64 glue_helper sb_edac edac_core acpi_cpufreq mperf pcspkr lrw gf128mul ablk_helper cryptd iTCO_wdt iTCO_vendor_support evdev soundcore lpc_ich mfd_core processor dcdbas i2c_i801 usb_common button microcode
> [   96.914238] CPU: 0 PID: 315 Comm: kworker/1:2 Tainted: G        W    3.10.0-rc1+ #2
> [   96.921969] Hardware name: Dell Inc. Precision T3600/0PTTT9, BIOS A08 01/24/2013
> [   96.929424] Workqueue: events od_dbs_timer
> [   96.933574]  0000000000000009 ffff88043a08bc78 ffffffff8161445c ffff88043a08bcb8
> [   96.941085]  ffffffff8103e540 ffff88043b712a80 0000000000000001 ffff88043a296400
> [   96.948602]  ffff88043b712a80 ffffffff81cdc910 0000000000000001 ffff88043a08bcc8
> [   96.956123] Call Trace:
> [   96.958602]  [<ffffffff8161445c>] dump_stack+0x19/0x1b
> [   96.963801]  [<ffffffff8103e540>] warn_slowpath_common+0x70/0xa0
> [   96.969858]  [<ffffffff8103e58a>] warn_slowpath_null+0x1a/0x20
> [   96.975735]  [<ffffffff814f6bf0>] gov_queue_work+0xf0/0x110
> [   96.981359]  [<ffffffff814f60bb>] od_dbs_timer+0xcb/0x170
> [   96.986808]  [<ffffffff8105e75d>] process_one_work+0x1fd/0x540
> [   96.992691]  [<ffffffff8105e6f2>] ? process_one_work+0x192/0x540
> [   96.998756]  [<ffffffff8105ef22>] worker_thread+0x122/0x380
> [   97.004371]  [<ffffffff8105ee00>] ? rescuer_thread+0x320/0x320
> [   97.010256]  [<ffffffff8106634a>] kthread+0xea/0xf0
> [   97.015185]  [<ffffffff81066260>] ? flush_kthread_worker+0x150/0x150
> [   97.021605]  [<ffffffff81623d9c>] ret_from_fork+0x7c/0xb0
> [   97.027049]  [<ffffffff81066260>] ? flush_kthread_worker+0x150/0x150
> [   97.033457] ---[ end trace d36d91c626ac81a0 ]---
> [   97.039221] ------------[ cut here ]------------
> [   97.039227] ------------[ cut here ]------------
> [   97.039229] WARNING: at drivers/cpufreq/cpufreq_governor.c:172 gov_queue_work+0xf0/0x110()
> [   97.039243] Modules linked in: ext2 vfat fat loop usbhid snd_hda_codec_hdmi coretemp kvm_intel kvm snd_hda_codec_realtek snd_hda_intel snd_hda_codec ehci_pci xhci_hcd ehci_hcd usbcore crc32_pclmul crc32c_intel snd_hwdep snd_pcm snd_page_alloc snd_timer ghash_clmulni_intel snd aesni_intel aes_x86_64 glue_helper sb_edac edac_core acpi_cpufreq mperf pcspkr lrw gf128mul ablk_helper cryptd iTCO_wdt iTCO_vendor_support evdev soundcore lpc_ich mfd_core processor dcdbas i2c_i801 usb_common button microcode
> [   97.039245] CPU: 4 PID: 82 Comm: kworker/2:1 Tainted: G        W    3.10.0-rc1+ #2
> [   97.039245] Hardware name: Dell Inc. Precision T3600/0PTTT9, BIOS A08 01/24/2013
> [   97.039248] Workqueue: events od_dbs_timer
> [   97.039250]  0000000000000009 ffff88043b5cfc78 ffffffff8161445c ffff88043b5cfcb8
> [   97.039251]  ffffffff8103e540 ffff88043b712a80 0000000000000002 ffff88043a295e00
> [   97.039253]  ffff88043b712a80 ffffffff81cdc910 0000000000000002 ffff88043b5cfcc8
> [   97.039253] Call Trace:
> [   97.039255]  [<ffffffff8161445c>] dump_stack+0x19/0x1b
> [   97.039257]  [<ffffffff8103e540>] warn_slowpath_common+0x70/0xa0
> [   97.039258]  [<ffffffff8103e58a>] warn_slowpath_null+0x1a/0x20
> [   97.039259]  [<ffffffff814f6bf0>] gov_queue_work+0xf0/0x110
> [   97.039261]  [<ffffffff814f60bb>] od_dbs_timer+0xcb/0x170
> [   97.039263]  [<ffffffff8105e75d>] process_one_work+0x1fd/0x540
> [   97.039264]  [<ffffffff8105e6f2>] ? process_one_work+0x192/0x540
> [   97.039265]  [<ffffffff8105ef22>] worker_thread+0x122/0x380
> [   97.039267]  [<ffffffff8105ee00>] ? rescuer_thread+0x320/0x320
> [   97.039268]  [<ffffffff8106634a>] kthread+0xea/0xf0
> [   97.039269]  [<ffffffff81066260>] ? flush_kthread_worker+0x150/0x150
> [   97.039270]  [<ffffffff81623d9c>] ret_from_fork+0x7c/0xb0
> [   97.039272]  [<ffffffff81066260>] ? flush_kthread_worker+0x150/0x150
> [   97.039272] ---[ end trace d36d91c626ac81a1 ]---
> [   97.143214] nouveau E[     DRM] GPU lockup - switching to software fbcon
> [   97.318430] WARNING: at drivers/cpufreq/cpufreq_governor.c:172 gov_queue_work+0xf0/0x110()
> [   97.326804] Modules linked in: ext2 vfat fat loop usbhid snd_hda_codec_hdmi coretemp kvm_intel kvm snd_hda_codec_realtek snd_hda_intel snd_hda_codec ehci_pci xhci_hcd ehci_hcd usbcore crc32_pclmul crc32c_intel snd_hwdep snd_pcm snd_page_alloc snd_timer ghash_clmulni_intel snd aesni_intel aes_x86_64 glue_helper sb_edac edac_core acpi_cpufreq mperf pcspkr lrw gf128mul ablk_helper cryptd iTCO_wdt iTCO_vendor_support evdev soundcore lpc_ich mfd_core processor dcdbas i2c_i801 usb_common button microcode
> [   97.374578] CPU: 0 PID: 98 Comm: kworker/3:1 Tainted: G        W    3.10.0-rc1+ #2
> [   97.384154] Hardware name: Dell Inc. Precision T3600/0PTTT9, BIOS A08 01/24/2013
> [   97.393566] Workqueue: events od_dbs_timer
> [   97.399675]  0000000000000009 ffff88043b179c78 ffffffff8161445c ffff88043b179cb8
> [   97.409153]  ffffffff8103e540 ffff88043b712a80 0000000000000003 ffff88043a295a00
> [   97.418623]  ffff88043b712a80 ffffffff81cdc910 0000000000000003 ffff88043b179cc8
> [   97.428103] Call Trace:
> [   97.432520]  [<ffffffff8161445c>] dump_stack+0x19/0x1b
> [   97.439678]  [<ffffffff8103e540>] warn_slowpath_common+0x70/0xa0
> [   97.447694]  [<ffffffff8103e58a>] warn_slowpath_null+0x1a/0x20
> [   97.455512]  [<ffffffff814f6bf0>] gov_queue_work+0xf0/0x110
> [   97.462993]  [<ffffffff814f60bb>] od_dbs_timer+0xcb/0x170
> [   97.470259]  [<ffffffff8105e75d>] process_one_work+0x1fd/0x540
> [   97.477878]  [<ffffffff8105e6f2>] ? process_one_work+0x192/0x540
> [   97.485652]  [<ffffffff8105ef22>] worker_thread+0x122/0x380
> [   97.492969]  [<ffffffff8105ee00>] ? rescuer_thread+0x320/0x320
> [   97.500565]  [<ffffffff8106634a>] kthread+0xea/0xf0
> [   97.507167]  [<ffffffff81066260>] ? flush_kthread_worker+0x150/0x150
> [   97.515255]  [<ffffffff81623d9c>] ret_from_fork+0x7c/0xb0
> [   97.522389]  [<ffffffff81066260>] ? flush_kthread_worker+0x150/0x150
> [   97.530472] ---[ end trace d36d91c626ac81a2 ]---
> [   97.543176] ------------[ cut here ]------------
> [   97.547172] ------------[ cut here ]------------
> [   97.547178] WARNING: at drivers/cpufreq/cpufreq_governor.c:172 gov_queue_work+0xf0/0x110()
> [   97.547197] Modules linked in: ext2 vfat fat loop usbhid snd_hda_codec_hdmi coretemp kvm_intel kvm snd_hda_codec_realtek snd_hda_intel snd_hda_codec ehci_pci xhci_hcd ehci_hcd usbcore crc32_pclmul crc32c_intel snd_hwdep snd_pcm snd_page_alloc snd_timer ghash_clmulni_intel snd aesni_intel aes_x86_64 glue_helper sb_edac edac_core acpi_cpufreq mperf pcspkr lrw gf128mul ablk_helper cryptd iTCO_wdt iTCO_vendor_support evdev soundcore lpc_ich mfd_core processor dcdbas i2c_i801 usb_common button microcode
> [   97.547199] CPU: 7 PID: 316 Comm: kworker/5:1 Tainted: G        W    3.10.0-rc1+ #2
> [   97.547200] Hardware name: Dell Inc. Precision T3600/0PTTT9, BIOS A08 01/24/2013
> [   97.547202] Workqueue: events od_dbs_timer
> [   97.547204]  0000000000000009 ffff88043905dc78 ffffffff8161445c ffff88043905dcb8
> [   97.547205]  ffffffff8103e540 ffff88043b712a80 0000000000000005 ffff88043a295800
> [   97.547206]  ffff88043b712a80 ffffffff81cdc910 0000000000000005 ffff88043905dcc8
> [   97.547207] Call Trace:
> [   97.547211]  [<ffffffff8161445c>] dump_stack+0x19/0x1b
> [   97.547214]  [<ffffffff8103e540>] warn_slowpath_common+0x70/0xa0
> [   97.547215]  [<ffffffff8103e58a>] warn_slowpath_null+0x1a/0x20
> [   97.547216]  [<ffffffff814f6bf0>] gov_queue_work+0xf0/0x110
> [   97.547218]  [<ffffffff814f60bb>] od_dbs_timer+0xcb/0x170
> [   97.547220]  [<ffffffff8105e75d>] process_one_work+0x1fd/0x540
> [   97.547221]  [<ffffffff8105e6f2>] ? process_one_work+0x192/0x540
> [   97.547222]  [<ffffffff8105ef22>] worker_thread+0x122/0x380
> [   97.547224]  [<ffffffff8105ee00>] ? rescuer_thread+0x320/0x320
> [   97.547225]  [<ffffffff8106634a>] kthread+0xea/0xf0
> [   97.547226]  [<ffffffff81066260>] ? flush_kthread_worker+0x150/0x150
> [   97.547228]  [<ffffffff81623d9c>] ret_from_fork+0x7c/0xb0
> [   97.547229]  [<ffffffff81066260>] ? flush_kthread_worker+0x150/0x150
> [   97.547230] ---[ end trace d36d91c626ac81a3 ]---
> [   97.761326] WARNING: at drivers/cpufreq/cpufreq_governor.c:172 gov_queue_work+0xf0/0x110()
> [   97.770798] Modules linked in: ext2 vfat fat loop usbhid snd_hda_codec_hdmi coretemp kvm_intel kvm snd_hda_codec_realtek snd_hda_intel snd_hda_codec ehci_pci xhci_hcd ehci_hcd usbcore crc32_pclmul crc32c_intel snd_hwdep snd_pcm snd_page_alloc snd_timer ghash_clmulni_intel snd aesni_intel aes_x86_64 glue_helper sb_edac edac_core acpi_cpufreq mperf pcspkr lrw gf128mul ablk_helper cryptd iTCO_wdt iTCO_vendor_support evdev soundcore lpc_ich mfd_core processor dcdbas i2c_i801 usb_common button microcode
> [   97.819617] CPU: 0 PID: 253 Comm: kworker/4:1 Tainted: G        W    3.10.0-rc1+ #2
> [   97.828623] Hardware name: Dell Inc. Precision T3600/0PTTT9, BIOS A08 01/24/2013
> [   97.837372] Workqueue: events od_dbs_timer
> [   97.842805]  0000000000000009 ffff880439529c78 ffffffff8161445c ffff880439529cb8
> [   97.851628]  ffffffff8103e540 ffff88043b712a80 0000000000000004 ffff88043a295c00
> [   97.860445]  ffff88043b712a80 ffffffff81cdc910 0000000000000004 ffff880439529cc8
> [   97.869249] Call Trace:
> [   97.873041]  [<ffffffff8161445c>] dump_stack+0x19/0x1b
> [   97.879533]  [<ffffffff8103e540>] warn_slowpath_common+0x70/0xa0
> [   97.886912]  [<ffffffff8103e58a>] warn_slowpath_null+0x1a/0x20
> [   97.894100]  [<ffffffff814f6bf0>] gov_queue_work+0xf0/0x110
> [   97.901002]  [<ffffffff814f60bb>] od_dbs_timer+0xcb/0x170
> [   97.907706]  [<ffffffff8105e75d>] process_one_work+0x1fd/0x540
> [   97.914797]  [<ffffffff8105e6f2>] ? process_one_work+0x192/0x540
> [   97.922016]  [<ffffffff8105ef22>] worker_thread+0x122/0x380
> [   97.928803]  [<ffffffff8105ee00>] ? rescuer_thread+0x320/0x320
> [   97.935837]  [<ffffffff8106634a>] kthread+0xea/0xf0
> [   97.941900]  [<ffffffff81066260>] ? flush_kthread_worker+0x150/0x150
> [   97.949443]  [<ffffffff81623d9c>] ret_from_fork+0x7c/0xb0
> [   97.956027]  [<ffffffff81066260>] ? flush_kthread_worker+0x150/0x150
> [   97.963563] ---[ end trace d36d91c626ac81a4 ]---
> [   97.970449] ------------[ cut here ]------------
> [   97.976277] WARNING: at drivers/cpufreq/cpufreq_governor.c:172 gov_queue_work+0xf0/0x110()
> [   97.985762] Modules linked in: ext2 vfat fat loop usbhid snd_hda_codec_hdmi coretemp kvm_intel kvm snd_hda_codec_realtek snd_hda_intel snd_hda_codec ehci_pci xhci_hcd ehci_hcd usbcore crc32_pclmul crc32c_intel snd_hwdep snd_pcm snd_page_alloc snd_timer ghash_clmulni_intel snd aesni_intel aes_x86_64 glue_helper sb_edac edac_core acpi_cpufreq mperf pcspkr lrw gf128mul ablk_helper cryptd iTCO_wdt iTCO_vendor_support evdev soundcore lpc_ich mfd_core processor dcdbas i2c_i801 usb_common button microcode
> [   98.035051] CPU: 0 PID: 102 Comm: kworker/6:1 Tainted: G        W    3.10.0-rc1+ #2
> [   98.044067] Hardware name: Dell Inc. Precision T3600/0PTTT9, BIOS A08 01/24/2013
> [   98.052834] Workqueue: events od_dbs_timer
> [   98.058285]  0000000000000009 ffff88043b6f3c78 ffffffff8161445c ffff88043b6f3cb8
> [   98.067114]  ffffffff8103e540 ffff88043b712a80 0000000000000006 ffff88043a295600
> [   98.075924]  ffff88043b712a80 ffffffff81cdc910 0000000000000006 ffff88043b6f3cc8
> [   98.084735] Call Trace:
> [   98.088518]  [<ffffffff8161445c>] dump_stack+0x19/0x1b
> [   98.095024]  [<ffffffff8103e540>] warn_slowpath_common+0x70/0xa0
> [   98.102386]  [<ffffffff8103e58a>] warn_slowpath_null+0x1a/0x20
> [   98.109565]  [<ffffffff814f6bf0>] gov_queue_work+0xf0/0x110
> [   98.116502]  [<ffffffff814f60bb>] od_dbs_timer+0xcb/0x170
> [   98.123253]  [<ffffffff8105e75d>] process_one_work+0x1fd/0x540
> [   98.130394]  [<ffffffff8105e6f2>] ? process_one_work+0x192/0x540
> [   98.137667]  [<ffffffff8105ef22>] worker_thread+0x122/0x380
> [   98.144456]  [<ffffffff8105ee00>] ? rescuer_thread+0x320/0x320
> [   98.151510]  [<ffffffff8106634a>] kthread+0xea/0xf0
> [   98.157583]  [<ffffffff81066260>] ? flush_kthread_worker+0x150/0x150
> [   98.165143]  [<ffffffff81623d9c>] ret_from_fork+0x7c/0xb0
> [   98.171730]  [<ffffffff81066260>] ? flush_kthread_worker+0x150/0x150
> [   98.179282] ---[ end trace d36d91c626ac81a5 ]---
> [   98.185098] ------------[ cut here ]------------
> [   98.190903] WARNING: at drivers/cpufreq/cpufreq_governor.c:172 gov_queue_work+0xf0/0x110()
> [   98.200387] Modules linked in: ext2 vfat fat loop
> [   98.205029] nouveau W[   PFIFO][0000:03:00.0] unknown intr 0x00400000, ch 1
> [   98.214563]  usbhid snd_hda_codec_hdmi coretemp kvm_intel kvm snd_hda_codec_realtek snd_hda_intel snd_hda_codec ehci_pci xhci_hcd ehci_hcd usbcore crc32_pclmul crc32c_intel snd_hwdep snd_pcm snd_page_alloc snd_timer ghash_clmulni_intel snd aesni_intel aes_x86_64 glue_helper sb_edac edac_core acpi_cpufreq mperf pcspkr lrw gf128mul ablk_helper cryptd iTCO_wdt iTCO_vendor_support evdev soundcore lpc_ich mfd_core processor dcdbas i2c_i801 usb_common button microcode
> [   98.258886] CPU: 0 PID: 318 Comm: kworker/7:1 Tainted: G        W    3.10.0-rc1+ #2
> [   98.267919] Hardware name: Dell Inc. Precision T3600/0PTTT9, BIOS A08 01/24/2013
> [   98.276689] Workqueue: events od_dbs_timer
> [   98.282147]  0000000000000009 ffff88043969dc78 ffffffff8161445c ffff88043969dcb8
> [   98.290991]  ffffffff8103e540 ffff88043b712a80 0000000000000007 ffff88043a295200
> [   98.299832]  ffff88043b712a80 ffffffff81cdc910 0000000000000007 ffff88043969dcc8
> [   98.308671] Call Trace:
> [   98.312471]  [<ffffffff8161445c>] dump_stack+0x19/0x1b
> [   98.318982]  [<ffffffff8103e540>] warn_slowpath_common+0x70/0xa0
> [   98.326376]  [<ffffffff8103e58a>] warn_slowpath_null+0x1a/0x20
> [   98.333577]  [<ffffffff814f6bf0>] gov_queue_work+0xf0/0x110
> [   98.340482]  [<ffffffff814f60bb>] od_dbs_timer+0xcb/0x170
> [   98.347160]  [<ffffffff8105e75d>] process_one_work+0x1fd/0x540
> [   98.354232]  [<ffffffff8105e6f2>] ? process_one_work+0x192/0x540
> [   98.361471]  [<ffffffff8105ef22>] worker_thread+0x122/0x380
> [   98.368260]  [<ffffffff8105ee00>] ? rescuer_thread+0x320/0x320
> [   98.375309]  [<ffffffff8106634a>] kthread+0xea/0xf0
> [   98.381385]  [<ffffffff81066260>] ? flush_kthread_worker+0x150/0x150
> [   98.388951]  [<ffffffff81623d9c>] ret_from_fork+0x7c/0xb0
> [   98.395546]  [<ffffffff81066260>] ? flush_kthread_worker+0x150/0x150
> [   98.403097] ---[ end trace d36d91c626ac81a6 ]---
> [   98.409180] Power down.
> [   98.413109] acpi_power_off called
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Borislav Petkov May 21, 2013, 7:21 a.m. UTC | #4
On Tue, May 21, 2013 at 10:20:51AM +0800, Michael Wang wrote:
> This is not enough to prove that policy->cpus is wrong, the cpu could
> be online when get from policy->cpus, but offline when checked here,
> since hotplug is able to happen during the period.

Strictly speaking you're correct but I don't do any hotplug besides the
one-time thing which is part of halting the box.

> I don't get it...
> 
> get_online_cpus() is just stop hotplug happen after it was invoked, so
> unless policy->cpus is really wrong, otherwise all the cpu it masked
> won't go offline any more.

Yes, that's my impression too - at the point we do gov_queue_work,
policy->cpus already contains offline cpus.

> This protect nothing...before we go here, the cpu could already
> offline, nothing changed...

Yes, but I don't want to schedule work on an offlined cpu and that is
ensured here.

> If you really want to confirm that policy->cpus was wrong, the way
> should be to apply the fix I suggested, then check online in here.

Sure, feel free to get a box, enable NO_HZ_FULL and do all the
experimentations you desire. I surely cannot be the only one who
triggers this.
Michael Wang May 21, 2013, 7:58 a.m. UTC | #5
On 05/21/2013 03:21 PM, Borislav Petkov wrote:
> On Tue, May 21, 2013 at 10:20:51AM +0800, Michael Wang wrote:
>> This is not enough to prove that policy->cpus is wrong, the cpu could
>> be online when get from policy->cpus, but offline when checked here,
>> since hotplug is able to happen during the period.
> 
> Strictly speaking you're correct but I don't do any hotplug besides the
> one-time thing which is part of halting the box.

Well, they share the same cpu_down() I suppose...

> 
>> I don't get it...
>>
>> get_online_cpus() is just stop hotplug happen after it was invoked, so
>> unless policy->cpus is really wrong, otherwise all the cpu it masked
>> won't go offline any more.
> 
> Yes, that's my impression too - at the point we do gov_queue_work,
> policy->cpus already contains offline cpus.
> 
>> This protect nothing...before we go here, the cpu could already
>> offline, nothing changed...
> 
> Yes, but I don't want to schedule work on an offlined cpu and that is
> ensured here.

IMHO, the problem looks mostly like wrong usage of policy->cpus: it
provides the right info, but only for that moment. We don't need to
worry about work on an offlined cpu if we don't allow cpus to disappear.

Your approach could be good with respect to performance, but if we could
first prove that policy->cpus is correct, then we could fix the
problem without any concern, couldn't we?

> 
>> If you really want to confirm that policy->cpus was wrong, the way
>> should be to apply the fix I suggested, then check online in here.
> 
> Sure, feel free to get a box, enable NO_HZ_FULL and do all the
> experimentations you desire. I surely cannot be the only one who
> triggers this.

I'm fine as long as the problem gets solved, i.e. your box doesn't show
the WARN any more :)

Regards,
Michael Wang

> 

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index 5af40ad82d23..e8c25f71e9b6 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -169,6 +169,9 @@  static inline void __gov_queue_work(int cpu, struct dbs_data *dbs_data,
 {
        struct cpu_dbs_common_info *cdbs = dbs_data->cdata->get_cpu_cdbs(cpu);
 
+       if (WARN_ON(!cpu_online(cpu)))
+               return;
+
        mod_delayed_work_on(cpu, system_wq, &cdbs->work, delay);
 }

see splats collection below.

And I don't think your fix above addresses the issue for the simple
reason that if cpus go offline *before* you do get_online_cpus(), then
policy->cpus will already contain offlined cpus.

Rather, a better fix would be, IMHO, to do this (it works here, of course):

---
diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index 5af40ad82d23..58541b164494 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -17,6 +17,7 @@ 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <asm/cputime.h>
+#include <linux/cpu.h>
 #include <linux/cpufreq.h>
 #include <linux/cpumask.h>
 #include <linux/export.h>
@@ -169,7 +170,15 @@  static inline void __gov_queue_work(int cpu, struct dbs_data *dbs_data,
 {
        struct cpu_dbs_common_info *cdbs = dbs_data->cdata->get_cpu_cdbs(cpu);
 
+       get_online_cpus();
+
+       if (!cpu_online(cpu))
+               goto out;
+
        mod_delayed_work_on(cpu, system_wq, &cdbs->work, delay);
+
+ out:
+       put_online_cpus();
 }
 
 void gov_queue_work(struct dbs_data *dbs_data, struct cpufreq_policy *policy,