Message ID | 20211202123553.96412-3-david@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | s390x: firq: floating interrupt test | expand |
On Thu, 2 Dec 2021 13:35:53 +0100 David Hildenbrand <david@redhat.com> wrote: > We had a KVM BUG fixed by kernel commit a3e03bc1368c ("KVM: s390: index > kvm->arch.idle_mask by vcpu_idx"), whereby a floating interrupt might get > stuck forever because a CPU in the wait state would not get woken up. > > The issue can be triggered when CPUs are created in a nonlinear fashion, > such that the CPU address ("core-id") and the KVM cpu id don't match. > > So let's start with a floating interrupt test that will trigger a > floating interrupt (via SCLP) to be delivered to a CPU in the wait state. > > Signed-off-by: David Hildenbrand <david@redhat.com> Reviewed-by: Claudio Imbrenda <imbrenda@linux.ibm.com> > --- > lib/s390x/sclp.c | 11 ++-- > lib/s390x/sclp.h | 1 + > s390x/Makefile | 1 + > s390x/firq.c | 122 ++++++++++++++++++++++++++++++++++++++++++++ > s390x/unittests.cfg | 10 ++++ > 5 files changed, 142 insertions(+), 3 deletions(-) > create mode 100644 s390x/firq.c > > diff --git a/lib/s390x/sclp.c b/lib/s390x/sclp.c > index 0272249..33985eb 100644 > --- a/lib/s390x/sclp.c > +++ b/lib/s390x/sclp.c > @@ -60,9 +60,7 @@ void sclp_setup_int(void) > void sclp_handle_ext(void) > { > ctl_clear_bit(0, CTL0_SERVICE_SIGNAL); > - spin_lock(&sclp_lock); > - sclp_busy = false; > - spin_unlock(&sclp_lock); > + sclp_clear_busy(); > } > > void sclp_wait_busy(void) > @@ -89,6 +87,13 @@ void sclp_mark_busy(void) > } > } > > +void sclp_clear_busy(void) > +{ > + spin_lock(&sclp_lock); > + sclp_busy = false; > + spin_unlock(&sclp_lock); > +} > + > static void sclp_read_scp_info(ReadInfo *ri, int length) > { > unsigned int commands[] = { SCLP_CMDW_READ_SCP_INFO_FORCED, > diff --git a/lib/s390x/sclp.h b/lib/s390x/sclp.h > index 61e9cf5..fead007 100644 > --- a/lib/s390x/sclp.h > +++ b/lib/s390x/sclp.h > @@ -318,6 +318,7 @@ void sclp_setup_int(void); > void sclp_handle_ext(void); > void sclp_wait_busy(void); > void sclp_mark_busy(void); > +void sclp_clear_busy(void); > void sclp_console_setup(void); > void sclp_print(const char *str); > void sclp_read_info(void); > diff --git a/s390x/Makefile b/s390x/Makefile > index f95f2e6..1e567c1 100644 > --- a/s390x/Makefile > +++ b/s390x/Makefile > @@ -25,6 +25,7 @@ tests += $(TEST_DIR)/uv-host.elf > tests += $(TEST_DIR)/edat.elf > tests += $(TEST_DIR)/mvpg-sie.elf > tests += $(TEST_DIR)/spec_ex-sie.elf > +tests += $(TEST_DIR)/firq.elf > > tests_binary = $(patsubst %.elf,%.bin,$(tests)) > ifneq ($(HOST_KEY_DOCUMENT),) > diff --git a/s390x/firq.c b/s390x/firq.c > new file mode 100644 > index 0000000..1f87718 > --- /dev/null > +++ b/s390x/firq.c > @@ -0,0 +1,122 @@ > +/* SPDX-License-Identifier: GPL-2.0-only */ > +/* > + * Floating interrupt tests. > + * > + * Copyright 2021 Red Hat Inc > + * > + * Authors: > + * David Hildenbrand <david@redhat.com> > + */ > +#include <libcflat.h> > +#include <asm/asm-offsets.h> > +#include <asm/interrupt.h> > +#include <asm/page.h> > +#include <asm-generic/barrier.h> > + > +#include <sclp.h> > +#include <smp.h> > +#include <alloc_page.h> > + > +static void wait_for_sclp_int(void) > +{ > + /* Enable SCLP interrupts on this CPU only. */ > + ctl_set_bit(0, CTL0_SERVICE_SIGNAL); > + > + /* Enable external interrupts and go to the wait state. */ > + wait_for_interrupt(PSW_MASK_EXT); > +} > + > +/* > + * Some KVM versions might mix CPUs when looking for a floating IRQ target, > + * accidentially detecting a stopped CPU as waiting and resulting in the actually > + * waiting CPU not getting woken up for the interrupt. > + */ > +static void test_wait_state_delivery(void) > +{ > + struct psw psw; > + SCCBHeader *h; > + int ret; > + > + report_prefix_push("wait state delivery"); > + > + if (smp_query_num_cpus() < 3) { > + report_skip("need at least 3 CPUs for this test"); > + goto out; > + } > + > + if (stap()) { > + report_skip("need to start on CPU #0"); > + goto out; > + } > + > + /* > + * We want CPU #2 to be stopped. This should be the case at this > + * point, however, we want to sense if it even exists as well. > + */ > + ret = smp_cpu_stop(2); > + if (ret) { > + report_skip("CPU #2 not found"); > + goto out; > + } > + > + /* > + * We're going to perform an SCLP service call but expect > + * the interrupt on CPU #1 while it is in the wait state. > + */ > + sclp_mark_busy(); > + > + /* Start CPU #1 and let it wait for the interrupt. */ > + psw.mask = extract_psw_mask(); > + psw.addr = (unsigned long)wait_for_sclp_int; > + ret = smp_cpu_setup(1, psw); > + if (ret) { > + sclp_clear_busy(); > + report_skip("cpu #1 not found"); > + goto out; > + } > + > + /* > + * We'd have to jump trough some hoops to sense e.g., via SIGP > + * CONDITIONAL EMERGENCY SIGNAL if CPU #1 is already in the > + * wait state. > + * > + * Although not completely reliable, use SIGP SENSE RUNNING STATUS > + * until not reported as running -- after all, our SCLP processing > + * will take some time as well and smp_cpu_setup() returns when we're > + * either already in wait_for_sclp_int() or just about to execute it. > + */ > + while(smp_sense_running_status(1)); > + > + h = alloc_page(); > + h->length = 4096; > + ret = servc(SCLP_CMDW_READ_CPU_INFO, __pa(h)); > + if (ret) { > + sclp_clear_busy(); > + report_fail("SCLP_CMDW_READ_CPU_INFO failed"); > + goto out_destroy; > + } > + > + /* > + * Wait until the interrupt gets delivered on CPU #1, marking the > + * SCLP requests as done. > + */ > + sclp_wait_busy(); > + > + report(true, "sclp interrupt delivered"); > + > +out_destroy: > + free_page(h); > + smp_cpu_destroy(1); > +out: > + report_prefix_pop(); > +} > + > +int main(void) > +{ > + report_prefix_push("firq"); > + > + test_wait_state_delivery(); > + > + report_prefix_pop(); > + return report_summary(); > +} > diff --git a/s390x/unittests.cfg b/s390x/unittests.cfg > index 3b454b7..054560c 100644 > --- a/s390x/unittests.cfg > +++ b/s390x/unittests.cfg > @@ -112,3 +112,13 @@ file = mvpg-sie.elf > > [spec_ex-sie] > file = spec_ex-sie.elf > + > +[firq-linear-cpu-ids] > +file = firq.elf > +timeout = 20 > +extra_params = -smp 1,maxcpus=3 -cpu qemu -device qemu-s390x-cpu,core-id=1 -device qemu-s390x-cpu,core-id=2 > + > +[firq-nonlinear-cpu-ids] > +file = firq.elf > +timeout = 20 > +extra_params = -smp 1,maxcpus=3 -cpu qemu -device qemu-s390x-cpu,core-id=2 -device qemu-s390x-cpu,core-id=1
On 02/12/2021 13.35, David Hildenbrand wrote: > We had a KVM BUG fixed by kernel commit a3e03bc1368c ("KVM: s390: index > kvm->arch.idle_mask by vcpu_idx"), whereby a floating interrupt might get > stuck forever because a CPU in the wait state would not get woken up. > > The issue can be triggered when CPUs are created in a nonlinear fashion, > such that the CPU address ("core-id") and the KVM cpu id don't match. > > So let's start with a floating interrupt test that will trigger a > floating interrupt (via SCLP) to be delivered to a CPU in the wait state. Thank you very much for tackling this! Some remarks below... > Signed-off-by: David Hildenbrand <david@redhat.com> > --- > lib/s390x/sclp.c | 11 ++-- > lib/s390x/sclp.h | 1 + > s390x/Makefile | 1 + > s390x/firq.c | 122 ++++++++++++++++++++++++++++++++++++++++++++ > s390x/unittests.cfg | 10 ++++ > 5 files changed, 142 insertions(+), 3 deletions(-) > create mode 100644 s390x/firq.c > > diff --git a/lib/s390x/sclp.c b/lib/s390x/sclp.c > index 0272249..33985eb 100644 > --- a/lib/s390x/sclp.c > +++ b/lib/s390x/sclp.c > @@ -60,9 +60,7 @@ void sclp_setup_int(void) > void sclp_handle_ext(void) > { > ctl_clear_bit(0, CTL0_SERVICE_SIGNAL); > - spin_lock(&sclp_lock); > - sclp_busy = false; > - spin_unlock(&sclp_lock); > + sclp_clear_busy(); > } > > void sclp_wait_busy(void) > @@ -89,6 +87,13 @@ void sclp_mark_busy(void) > } > } > > +void sclp_clear_busy(void) > +{ > + spin_lock(&sclp_lock); > + sclp_busy = false; > + spin_unlock(&sclp_lock); > +} > + > static void sclp_read_scp_info(ReadInfo *ri, int length) > { > unsigned int commands[] = { SCLP_CMDW_READ_SCP_INFO_FORCED, > diff --git a/lib/s390x/sclp.h b/lib/s390x/sclp.h > index 61e9cf5..fead007 100644 > --- a/lib/s390x/sclp.h > +++ b/lib/s390x/sclp.h > @@ -318,6 +318,7 @@ void sclp_setup_int(void); > void sclp_handle_ext(void); > void sclp_wait_busy(void); > void sclp_mark_busy(void); > +void sclp_clear_busy(void); > void sclp_console_setup(void); > void sclp_print(const char *str); > void sclp_read_info(void); > diff --git a/s390x/Makefile b/s390x/Makefile > index f95f2e6..1e567c1 100644 > --- a/s390x/Makefile > +++ b/s390x/Makefile > @@ -25,6 +25,7 @@ tests += $(TEST_DIR)/uv-host.elf > tests += $(TEST_DIR)/edat.elf > tests += $(TEST_DIR)/mvpg-sie.elf > tests += $(TEST_DIR)/spec_ex-sie.elf > +tests += $(TEST_DIR)/firq.elf > > tests_binary = $(patsubst %.elf,%.bin,$(tests)) > ifneq ($(HOST_KEY_DOCUMENT),) > diff --git a/s390x/firq.c b/s390x/firq.c > new file mode 100644 > index 0000000..1f87718 > --- /dev/null > +++ b/s390x/firq.c > @@ -0,0 +1,122 @@ > +/* SPDX-License-Identifier: GPL-2.0-only */ > +/* > + * Floating interrupt tests. > + * > + * Copyright 2021 Red Hat Inc > + * > + * Authors: > + * David Hildenbrand <david@redhat.com> > + */ > +#include <libcflat.h> > +#include <asm/asm-offsets.h> > +#include <asm/interrupt.h> > +#include <asm/page.h> > +#include <asm-generic/barrier.h> > + > +#include <sclp.h> > +#include <smp.h> > +#include <alloc_page.h> > + > +static void wait_for_sclp_int(void) > +{ > + /* Enable SCLP interrupts on this CPU only. */ > + ctl_set_bit(0, CTL0_SERVICE_SIGNAL); > + > + /* Enable external interrupts and go to the wait state. */ > + wait_for_interrupt(PSW_MASK_EXT); > +} What happens if the CPU got an interrupt? Should there be a "while (true)" at the end of the function to avoid that the CPU ends up crashing at the end of the function? > +/* > + * Some KVM versions might mix CPUs when looking for a floating IRQ target, > + * accidentially detecting a stopped CPU as waiting and resulting in the actually > + * waiting CPU not getting woken up for the interrupt. > + */ > +static void test_wait_state_delivery(void) > +{ > + struct psw psw; > + SCCBHeader *h; > + int ret; > + > + report_prefix_push("wait state delivery"); > + > + if (smp_query_num_cpus() < 3) { > + report_skip("need at least 3 CPUs for this test"); > + goto out; > + } > + > + if (stap()) { > + report_skip("need to start on CPU #0"); > + goto out; > + } I think I'd rather turn this into an assert() instead ... no strong opinion about it, though. > + > + /* > + * We want CPU #2 to be stopped. This should be the case at this > + * point, however, we want to sense if it even exists as well. > + */ > + ret = smp_cpu_stop(2); > + if (ret) { > + report_skip("CPU #2 not found"); Since you already queried for the availablity of at least 3 CPUs above, I think you could turn this into a report_fail() instead? > + goto out; > + } > + > + /* > + * We're going to perform an SCLP service call but expect > + * the interrupt on CPU #1 while it is in the wait state. > + */ > + sclp_mark_busy(); > + > + /* Start CPU #1 and let it wait for the interrupt. */ > + psw.mask = extract_psw_mask(); > + psw.addr = (unsigned long)wait_for_sclp_int; > + ret = smp_cpu_setup(1, psw); > + if (ret) { > + sclp_clear_busy(); > + report_skip("cpu #1 not found"); > + goto out; > + } > + > + /* > + * We'd have to jump trough some hoops to sense e.g., via SIGP > + * CONDITIONAL EMERGENCY SIGNAL if CPU #1 is already in the > + * wait state. > + * > + * Although not completely reliable, use SIGP SENSE RUNNING STATUS > + * until not reported as running -- after all, our SCLP processing > + * will take some time as well and smp_cpu_setup() returns when we're > + * either already in wait_for_sclp_int() or just about to execute it. > + */ > + while(smp_sense_running_status(1)); > + > + h = alloc_page(); > + h->length = 4096; > + ret = servc(SCLP_CMDW_READ_CPU_INFO, __pa(h)); > + if (ret) { > + sclp_clear_busy(); > + report_fail("SCLP_CMDW_READ_CPU_INFO failed"); > + goto out_destroy; > + } > + > + /* > + * Wait until the interrupt gets delivered on CPU #1, marking the > + * SCLP requests as done. > + */ > + sclp_wait_busy(); > + > + report(true, "sclp interrupt delivered"); > + > +out_destroy: > + free_page(h); > + smp_cpu_destroy(1); > +out: > + report_prefix_pop(); > +} Anyway, code looks fine for me, either with my comments addressed or not: Reviewed-by: Thomas Huth <thuth@redhat.com>
On Fri, 3 Dec 2021 11:55:31 +0100 Thomas Huth <thuth@redhat.com> wrote: > On 02/12/2021 13.35, David Hildenbrand wrote: > > We had a KVM BUG fixed by kernel commit a3e03bc1368c ("KVM: s390: index > > kvm->arch.idle_mask by vcpu_idx"), whereby a floating interrupt might get > > stuck forever because a CPU in the wait state would not get woken up. > > > > The issue can be triggered when CPUs are created in a nonlinear fashion, > > such that the CPU address ("core-id") and the KVM cpu id don't match. > > > > So let's start with a floating interrupt test that will trigger a > > floating interrupt (via SCLP) to be delivered to a CPU in the wait state. > > Thank you very much for tackling this! Some remarks below... > > > Signed-off-by: David Hildenbrand <david@redhat.com> > > --- > > lib/s390x/sclp.c | 11 ++-- > > lib/s390x/sclp.h | 1 + > > s390x/Makefile | 1 + > > s390x/firq.c | 122 ++++++++++++++++++++++++++++++++++++++++++++ > > s390x/unittests.cfg | 10 ++++ > > 5 files changed, 142 insertions(+), 3 deletions(-) > > create mode 100644 s390x/firq.c > > > > diff --git a/lib/s390x/sclp.c b/lib/s390x/sclp.c > > index 0272249..33985eb 100644 > > --- a/lib/s390x/sclp.c > > +++ b/lib/s390x/sclp.c > > @@ -60,9 +60,7 @@ void sclp_setup_int(void) > > void sclp_handle_ext(void) > > { > > ctl_clear_bit(0, CTL0_SERVICE_SIGNAL); > > - spin_lock(&sclp_lock); > > - sclp_busy = false; > > - spin_unlock(&sclp_lock); > > + sclp_clear_busy(); > > } > > > > void sclp_wait_busy(void) > > @@ -89,6 +87,13 @@ void sclp_mark_busy(void) > > } > > } > > > > +void sclp_clear_busy(void) > > +{ > > + spin_lock(&sclp_lock); > > + sclp_busy = false; > > + spin_unlock(&sclp_lock); > > +} > > + > > static void sclp_read_scp_info(ReadInfo *ri, int length) > > { > > unsigned int commands[] = { SCLP_CMDW_READ_SCP_INFO_FORCED, > > diff --git a/lib/s390x/sclp.h b/lib/s390x/sclp.h > > index 61e9cf5..fead007 100644 > > --- a/lib/s390x/sclp.h > > +++ b/lib/s390x/sclp.h > > @@ -318,6 +318,7 @@ void sclp_setup_int(void); > > void sclp_handle_ext(void); > > void sclp_wait_busy(void); > > void sclp_mark_busy(void); > > +void sclp_clear_busy(void); > > void sclp_console_setup(void); > > void sclp_print(const char *str); > > void sclp_read_info(void); > > diff --git a/s390x/Makefile b/s390x/Makefile > > index f95f2e6..1e567c1 100644 > > --- a/s390x/Makefile > > +++ b/s390x/Makefile > > @@ -25,6 +25,7 @@ tests += $(TEST_DIR)/uv-host.elf > > tests += $(TEST_DIR)/edat.elf > > tests += $(TEST_DIR)/mvpg-sie.elf > > tests += $(TEST_DIR)/spec_ex-sie.elf > > +tests += $(TEST_DIR)/firq.elf > > > > tests_binary = $(patsubst %.elf,%.bin,$(tests)) > > ifneq ($(HOST_KEY_DOCUMENT),) > > diff --git a/s390x/firq.c b/s390x/firq.c > > new file mode 100644 > > index 0000000..1f87718 > > --- /dev/null > > +++ b/s390x/firq.c > > @@ -0,0 +1,122 @@ > > +/* SPDX-License-Identifier: GPL-2.0-only */ > > +/* > > + * Floating interrupt tests. > > + * > > + * Copyright 2021 Red Hat Inc > > + * > > + * Authors: > > + * David Hildenbrand <david@redhat.com> > > + */ > > +#include <libcflat.h> > > +#include <asm/asm-offsets.h> > > +#include <asm/interrupt.h> > > +#include <asm/page.h> > > +#include <asm-generic/barrier.h> > > + > > +#include <sclp.h> > > +#include <smp.h> > > +#include <alloc_page.h> > > + > > +static void wait_for_sclp_int(void) > > +{ > > + /* Enable SCLP interrupts on this CPU only. */ > > + ctl_set_bit(0, CTL0_SERVICE_SIGNAL); > > + > > + /* Enable external interrupts and go to the wait state. */ > > + wait_for_interrupt(PSW_MASK_EXT); > > +} > > What happens if the CPU got an interrupt? Should there be a "while (true)" it should not get any interrupts, but if it does anyway... > at the end of the function to avoid that the CPU ends up crashing at the end > of the function? ... we have this in smp_cpu_setup_state, after the call to the actual function body: /* If the function returns, just loop here */ 0: j 0 so if the function returns, it will hang in there anyway > > > +/* > > + * Some KVM versions might mix CPUs when looking for a floating IRQ target, > > + * accidentially detecting a stopped CPU as waiting and resulting in the actually > > + * waiting CPU not getting woken up for the interrupt. > > + */ > > +static void test_wait_state_delivery(void) > > +{ > > + struct psw psw; > > + SCCBHeader *h; > > + int ret; > > + > > + report_prefix_push("wait state delivery"); > > + > > + if (smp_query_num_cpus() < 3) { > > + report_skip("need at least 3 CPUs for this test"); > > + goto out; > > + } > > + > > + if (stap()) { > > + report_skip("need to start on CPU #0"); > > + goto out; > > + } > > I think I'd rather turn this into an assert() instead ... no strong opinion > about it, though. I agree, including the part about no strong opinions (which is why I did not comment on it before) > > > + > > + /* > > + * We want CPU #2 to be stopped. This should be the case at this > > + * point, however, we want to sense if it even exists as well. > > + */ > > + ret = smp_cpu_stop(2); > > + if (ret) { > > + report_skip("CPU #2 not found"); > > Since you already queried for the availablity of at least 3 CPUs above, I > think you could turn this into a report_fail() instead? either that or an assert, but again, no strong opinions > > > + goto out; > > + } > > + > > + /* > > + * We're going to perform an SCLP service call but expect > > + * the interrupt on CPU #1 while it is in the wait state. > > + */ > > + sclp_mark_busy(); > > + > > + /* Start CPU #1 and let it wait for the interrupt. */ > > + psw.mask = extract_psw_mask(); > > + psw.addr = (unsigned long)wait_for_sclp_int; > > + ret = smp_cpu_setup(1, psw); > > + if (ret) { > > + sclp_clear_busy(); > > + report_skip("cpu #1 not found"); > > + goto out; > > + } > > + > > + /* > > + * We'd have to jump trough some hoops to sense e.g., via SIGP > > + * CONDITIONAL EMERGENCY SIGNAL if CPU #1 is already in the > > + * wait state. > > + * > > + * Although not completely reliable, use SIGP SENSE RUNNING STATUS > > + * until not reported as running -- after all, our SCLP processing > > + * will take some time as well and smp_cpu_setup() returns when we're > > + * either already in wait_for_sclp_int() or just about to execute it. > > + */ > > + while(smp_sense_running_status(1)); > > + > > + h = alloc_page(); > > + h->length = 4096; > > + ret = servc(SCLP_CMDW_READ_CPU_INFO, __pa(h)); > > + if (ret) { > > + sclp_clear_busy(); > > + report_fail("SCLP_CMDW_READ_CPU_INFO failed"); > > + goto out_destroy; > > + } > > + > > + /* > > + * Wait until the interrupt gets delivered on CPU #1, marking the > > + * SCLP requests as done. > > + */ > > + sclp_wait_busy(); > > + > > + report(true, "sclp interrupt delivered"); > > + > > +out_destroy: > > + free_page(h); > > + smp_cpu_destroy(1); > > +out: > > + report_prefix_pop(); > > +} > > Anyway, code looks fine for me, either with my comments addressed or not: > > Reviewed-by: Thomas Huth <thuth@redhat.com> >
On 03/12/2021 12.18, Claudio Imbrenda wrote: > On Fri, 3 Dec 2021 11:55:31 +0100 > Thomas Huth <thuth@redhat.com> wrote: > >> On 02/12/2021 13.35, David Hildenbrand wrote: >>> We had a KVM BUG fixed by kernel commit a3e03bc1368c ("KVM: s390: index >>> kvm->arch.idle_mask by vcpu_idx"), whereby a floating interrupt might get >>> stuck forever because a CPU in the wait state would not get woken up. >>> >>> The issue can be triggered when CPUs are created in a nonlinear fashion, >>> such that the CPU address ("core-id") and the KVM cpu id don't match. >>> >>> So let's start with a floating interrupt test that will trigger a >>> floating interrupt (via SCLP) to be delivered to a CPU in the wait state. >> >> Thank you very much for tackling this! Some remarks below... >> >>> Signed-off-by: David Hildenbrand <david@redhat.com> >>> --- >>> lib/s390x/sclp.c | 11 ++-- >>> lib/s390x/sclp.h | 1 + >>> s390x/Makefile | 1 + >>> s390x/firq.c | 122 ++++++++++++++++++++++++++++++++++++++++++++ >>> s390x/unittests.cfg | 10 ++++ >>> 5 files changed, 142 insertions(+), 3 deletions(-) >>> create mode 100644 s390x/firq.c >>> >>> diff --git a/lib/s390x/sclp.c b/lib/s390x/sclp.c >>> index 0272249..33985eb 100644 >>> --- a/lib/s390x/sclp.c >>> +++ b/lib/s390x/sclp.c >>> @@ -60,9 +60,7 @@ void sclp_setup_int(void) >>> void sclp_handle_ext(void) >>> { >>> ctl_clear_bit(0, CTL0_SERVICE_SIGNAL); >>> - spin_lock(&sclp_lock); >>> - sclp_busy = false; >>> - spin_unlock(&sclp_lock); >>> + sclp_clear_busy(); >>> } >>> >>> void sclp_wait_busy(void) >>> @@ -89,6 +87,13 @@ void sclp_mark_busy(void) >>> } >>> } >>> >>> +void sclp_clear_busy(void) >>> +{ >>> + spin_lock(&sclp_lock); >>> + sclp_busy = false; >>> + spin_unlock(&sclp_lock); >>> +} >>> + >>> static void sclp_read_scp_info(ReadInfo *ri, int length) >>> { >>> unsigned int commands[] = { SCLP_CMDW_READ_SCP_INFO_FORCED, >>> diff --git a/lib/s390x/sclp.h b/lib/s390x/sclp.h >>> index 61e9cf5..fead007 100644 >>> --- a/lib/s390x/sclp.h >>> +++ b/lib/s390x/sclp.h >>> @@ -318,6 +318,7 @@ void sclp_setup_int(void); >>> void sclp_handle_ext(void); >>> void sclp_wait_busy(void); >>> void sclp_mark_busy(void); >>> +void sclp_clear_busy(void); >>> void sclp_console_setup(void); >>> void sclp_print(const char *str); >>> void sclp_read_info(void); >>> diff --git a/s390x/Makefile b/s390x/Makefile >>> index f95f2e6..1e567c1 100644 >>> --- a/s390x/Makefile >>> +++ b/s390x/Makefile >>> @@ -25,6 +25,7 @@ tests += $(TEST_DIR)/uv-host.elf >>> tests += $(TEST_DIR)/edat.elf >>> tests += $(TEST_DIR)/mvpg-sie.elf >>> tests += $(TEST_DIR)/spec_ex-sie.elf >>> +tests += $(TEST_DIR)/firq.elf >>> >>> tests_binary = $(patsubst %.elf,%.bin,$(tests)) >>> ifneq ($(HOST_KEY_DOCUMENT),) >>> diff --git a/s390x/firq.c b/s390x/firq.c >>> new file mode 100644 >>> index 0000000..1f87718 >>> --- /dev/null >>> +++ b/s390x/firq.c >>> @@ -0,0 +1,122 @@ >>> +/* SPDX-License-Identifier: GPL-2.0-only */ >>> +/* >>> + * Floating interrupt tests. >>> + * >>> + * Copyright 2021 Red Hat Inc >>> + * >>> + * Authors: >>> + * David Hildenbrand <david@redhat.com> >>> + */ >>> +#include <libcflat.h> >>> +#include <asm/asm-offsets.h> >>> +#include <asm/interrupt.h> >>> +#include <asm/page.h> >>> +#include <asm-generic/barrier.h> >>> + >>> +#include <sclp.h> >>> +#include <smp.h> >>> +#include <alloc_page.h> >>> + >>> +static void wait_for_sclp_int(void) >>> +{ >>> + /* Enable SCLP interrupts on this CPU only. */ >>> + ctl_set_bit(0, CTL0_SERVICE_SIGNAL); >>> + >>> + /* Enable external interrupts and go to the wait state. */ >>> + wait_for_interrupt(PSW_MASK_EXT); >>> +} >> >> What happens if the CPU got an interrupt? Should there be a "while (true)" > > it should not get any interrupts, but if it does anyway... > >> at the end of the function to avoid that the CPU ends up crashing at the end >> of the function? > > ... we have this in smp_cpu_setup_state, after the call to the actual > function body: > > /* If the function returns, just loop here */ > 0: j 0 > > so if the function returns, it will hang in there anyway Ah, great, so we're fine indeed! Thomas
>>> + if (smp_query_num_cpus() < 3) { >>> + report_skip("need at least 3 CPUs for this test"); >>> + goto out; >>> + } >>> + >>> + if (stap()) { >>> + report_skip("need to start on CPU #0"); >>> + goto out; >>> + } >> >> I think I'd rather turn this into an assert() instead ... no strong opinion >> about it, though. > > I agree, including the part about no strong opinions (which is why I > did not comment on it before) Would it be the case on any system we might end up running, even under LPAR ... and whoever could run these tests ? > >> >>> + >>> + /* >>> + * We want CPU #2 to be stopped. This should be the case at this >>> + * point, however, we want to sense if it even exists as well. >>> + */ >>> + ret = smp_cpu_stop(2); >>> + if (ret) { >>> + report_skip("CPU #2 not found"); >> >> Since you already queried for the availablity of at least 3 CPUs above, I >> think you could turn this into a report_fail() instead? > > either that or an assert, but again, no strong opinions > Just because there are >= 3 CPUs doesn't imply that CPU #2 is around. What we could remove is the "if (smp_query_num_cpus() < 3) {" check, though! Thanks!
On 03/12/2021 19.23, David Hildenbrand wrote: > >>>> + if (smp_query_num_cpus() < 3) { >>>> + report_skip("need at least 3 CPUs for this test"); >>>> + goto out; >>>> + } >>>> + >>>> + if (stap()) { >>>> + report_skip("need to start on CPU #0"); >>>> + goto out; >>>> + } >>> >>> I think I'd rather turn this into an assert() instead ... no strong opinion >>> about it, though. >> >> I agree, including the part about no strong opinions (which is why I >> did not comment on it before) > > Would it be the case on any system we might end up running, even under > LPAR ... and whoever could run these tests ? Well, ok, since it likely doesn't happen in real life anyway, simply keep the report_skip(). >> >>> >>>> + >>>> + /* >>>> + * We want CPU #2 to be stopped. This should be the case at this >>>> + * point, however, we want to sense if it even exists as well. >>>> + */ >>>> + ret = smp_cpu_stop(2); >>>> + if (ret) { >>>> + report_skip("CPU #2 not found"); >>> >>> Since you already queried for the availablity of at least 3 CPUs above, I >>> think you could turn this into a report_fail() instead? >> >> either that or an assert, but again, no strong opinions >> > > Just because there are >= 3 CPUs doesn't imply that CPU #2 is around. Ok, fair point. But if #2 is not around, it means that the test has been run in the wrong way by the user... I wonder what's better in that case - to skip this test or to go out with a bang. Skipping the test has the advantage of looking a little bit more "polite", but it has the disadvantage that it might get lost in automation, e.g. if somebody enabled the test in their CI, but did something wrong in the settings, they might not notice that the test is not run at all... > What we could remove is the "if (smp_query_num_cpus() < 3) {" check, though! Yes, that seems to be redundant, indeed. Thomas
>>> >>>> >>>>> + >>>>> + /* >>>>> + * We want CPU #2 to be stopped. This should be the case at this >>>>> + * point, however, we want to sense if it even exists as well. >>>>> + */ >>>>> + ret = smp_cpu_stop(2); >>>>> + if (ret) { >>>>> + report_skip("CPU #2 not found"); >>>> >>>> Since you already queried for the availablity of at least 3 CPUs above, I >>>> think you could turn this into a report_fail() instead? >>> >>> either that or an assert, but again, no strong opinions >>> >> >> Just because there are >= 3 CPUs doesn't imply that CPU #2 is around. > > Ok, fair point. But if #2 is not around, it means that the test has been run > in the wrong way by the user... I wonder what's better in that case - to > skip this test or to go out with a bang. Skipping the test has the advantage > of looking a little bit more "polite", but it has the disadvantage that it > might get lost in automation, e.g. if somebody enabled the test in their CI, > but did something wrong in the settings, they might not notice that the test > is not run at all... I sticked to what we have in s390x/smp.c, where we fail if we only have a single CPU. But I don't particularly care (and have to move on doing other stuff), so I'll do whatever maintainers want and resend :)
On Mon, 6 Dec 2021 09:15:00 +0100 David Hildenbrand <david@redhat.com> wrote: > >>> > >>>> > >>>>> + > >>>>> + /* > >>>>> + * We want CPU #2 to be stopped. This should be the case at this > >>>>> + * point, however, we want to sense if it even exists as well. > >>>>> + */ > >>>>> + ret = smp_cpu_stop(2); > >>>>> + if (ret) { > >>>>> + report_skip("CPU #2 not found"); > >>>> > >>>> Since you already queried for the availablity of at least 3 CPUs above, I > >>>> think you could turn this into a report_fail() instead? > >>> > >>> either that or an assert, but again, no strong opinions > >>> > >> > >> Just because there are >= 3 CPUs doesn't imply that CPU #2 is around. > > > > Ok, fair point. But if #2 is not around, it means that the test has been run > > in the wrong way by the user... I wonder what's better in that case - to > > skip this test or to go out with a bang. Skipping the test has the advantage > > of looking a little bit more "polite", but it has the disadvantage that it > > might get lost in automation, e.g. if somebody enabled the test in their CI, > > but did something wrong in the settings, they might not notice that the test > > is not run at all... > > I sticked to what we have in s390x/smp.c, where we fail if we only have > a single CPU. > > But I don't particularly care (and have to move on doing other stuff), > so I'll do whatever maintainers want and resend :) > a better solution for number != ID is needed (aka: I'll try to fix it when I have the time), for now it works, so leave it as it is.
diff --git a/lib/s390x/sclp.c b/lib/s390x/sclp.c index 0272249..33985eb 100644 --- a/lib/s390x/sclp.c +++ b/lib/s390x/sclp.c @@ -60,9 +60,7 @@ void sclp_setup_int(void) void sclp_handle_ext(void) { ctl_clear_bit(0, CTL0_SERVICE_SIGNAL); - spin_lock(&sclp_lock); - sclp_busy = false; - spin_unlock(&sclp_lock); + sclp_clear_busy(); } void sclp_wait_busy(void) @@ -89,6 +87,13 @@ void sclp_mark_busy(void) } } +void sclp_clear_busy(void) +{ + spin_lock(&sclp_lock); + sclp_busy = false; + spin_unlock(&sclp_lock); +} + static void sclp_read_scp_info(ReadInfo *ri, int length) { unsigned int commands[] = { SCLP_CMDW_READ_SCP_INFO_FORCED, diff --git a/lib/s390x/sclp.h b/lib/s390x/sclp.h index 61e9cf5..fead007 100644 --- a/lib/s390x/sclp.h +++ b/lib/s390x/sclp.h @@ -318,6 +318,7 @@ void sclp_setup_int(void); void sclp_handle_ext(void); void sclp_wait_busy(void); void sclp_mark_busy(void); +void sclp_clear_busy(void); void sclp_console_setup(void); void sclp_print(const char *str); void sclp_read_info(void); diff --git a/s390x/Makefile b/s390x/Makefile index f95f2e6..1e567c1 100644 --- a/s390x/Makefile +++ b/s390x/Makefile @@ -25,6 +25,7 @@ tests += $(TEST_DIR)/uv-host.elf tests += $(TEST_DIR)/edat.elf tests += $(TEST_DIR)/mvpg-sie.elf tests += $(TEST_DIR)/spec_ex-sie.elf +tests += $(TEST_DIR)/firq.elf tests_binary = $(patsubst %.elf,%.bin,$(tests)) ifneq ($(HOST_KEY_DOCUMENT),) diff --git a/s390x/firq.c b/s390x/firq.c new file mode 100644 index 0000000..1f87718 --- /dev/null +++ b/s390x/firq.c @@ -0,0 +1,122 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Floating interrupt tests. + * + * Copyright 2021 Red Hat Inc + * + * Authors: + * David Hildenbrand <david@redhat.com> + */ +#include <libcflat.h> +#include <asm/asm-offsets.h> +#include <asm/interrupt.h> +#include <asm/page.h> +#include <asm-generic/barrier.h> + +#include <sclp.h> +#include <smp.h> +#include <alloc_page.h> + +static void wait_for_sclp_int(void) +{ + /* Enable SCLP interrupts on this CPU only. */ + ctl_set_bit(0, CTL0_SERVICE_SIGNAL); + + /* Enable external interrupts and go to the wait state. */ + wait_for_interrupt(PSW_MASK_EXT); +} + +/* + * Some KVM versions might mix CPUs when looking for a floating IRQ target, + * accidentially detecting a stopped CPU as waiting and resulting in the actually + * waiting CPU not getting woken up for the interrupt. + */ +static void test_wait_state_delivery(void) +{ + struct psw psw; + SCCBHeader *h; + int ret; + + report_prefix_push("wait state delivery"); + + if (smp_query_num_cpus() < 3) { + report_skip("need at least 3 CPUs for this test"); + goto out; + } + + if (stap()) { + report_skip("need to start on CPU #0"); + goto out; + } + + /* + * We want CPU #2 to be stopped. This should be the case at this + * point, however, we want to sense if it even exists as well. + */ + ret = smp_cpu_stop(2); + if (ret) { + report_skip("CPU #2 not found"); + goto out; + } + + /* + * We're going to perform an SCLP service call but expect + * the interrupt on CPU #1 while it is in the wait state. + */ + sclp_mark_busy(); + + /* Start CPU #1 and let it wait for the interrupt. */ + psw.mask = extract_psw_mask(); + psw.addr = (unsigned long)wait_for_sclp_int; + ret = smp_cpu_setup(1, psw); + if (ret) { + sclp_clear_busy(); + report_skip("cpu #1 not found"); + goto out; + } + + /* + * We'd have to jump trough some hoops to sense e.g., via SIGP + * CONDITIONAL EMERGENCY SIGNAL if CPU #1 is already in the + * wait state. + * + * Although not completely reliable, use SIGP SENSE RUNNING STATUS + * until not reported as running -- after all, our SCLP processing + * will take some time as well and smp_cpu_setup() returns when we're + * either already in wait_for_sclp_int() or just about to execute it. + */ + while(smp_sense_running_status(1)); + + h = alloc_page(); + h->length = 4096; + ret = servc(SCLP_CMDW_READ_CPU_INFO, __pa(h)); + if (ret) { + sclp_clear_busy(); + report_fail("SCLP_CMDW_READ_CPU_INFO failed"); + goto out_destroy; + } + + /* + * Wait until the interrupt gets delivered on CPU #1, marking the + * SCLP requests as done. + */ + sclp_wait_busy(); + + report(true, "sclp interrupt delivered"); + +out_destroy: + free_page(h); + smp_cpu_destroy(1); +out: + report_prefix_pop(); +} + +int main(void) +{ + report_prefix_push("firq"); + + test_wait_state_delivery(); + + report_prefix_pop(); + return report_summary(); +} diff --git a/s390x/unittests.cfg b/s390x/unittests.cfg index 3b454b7..054560c 100644 --- a/s390x/unittests.cfg +++ b/s390x/unittests.cfg @@ -112,3 +112,13 @@ file = mvpg-sie.elf [spec_ex-sie] file = spec_ex-sie.elf + +[firq-linear-cpu-ids] +file = firq.elf +timeout = 20 +extra_params = -smp 1,maxcpus=3 -cpu qemu -device qemu-s390x-cpu,core-id=1 -device qemu-s390x-cpu,core-id=2 + +[firq-nonlinear-cpu-ids] +file = firq.elf +timeout = 20 +extra_params = -smp 1,maxcpus=3 -cpu qemu -device qemu-s390x-cpu,core-id=2 -device qemu-s390x-cpu,core-id=1
We had a KVM BUG fixed by kernel commit a3e03bc1368c ("KVM: s390: index kvm->arch.idle_mask by vcpu_idx"), whereby a floating interrupt might get stuck forever because a CPU in the wait state would not get woken up. The issue can be triggered when CPUs are created in a nonlinear fashion, such that the CPU address ("core-id") and the KVM cpu id don't match. So let's start with a floating interrupt test that will trigger a floating interrupt (via SCLP) to be delivered to a CPU in the wait state. Signed-off-by: David Hildenbrand <david@redhat.com> --- lib/s390x/sclp.c | 11 ++-- lib/s390x/sclp.h | 1 + s390x/Makefile | 1 + s390x/firq.c | 122 ++++++++++++++++++++++++++++++++++++++++++++ s390x/unittests.cfg | 10 ++++ 5 files changed, 142 insertions(+), 3 deletions(-) create mode 100644 s390x/firq.c